// Parameter-list tail and loop body of ell_cpu_multiply_kernel; the complete
// signature (including alpha, beta and n) is quoted in the reference lines at
// the end of this listing.
// NOTE(review): this text looks like a lossy export of a source listing - the
// leading integers appear to be original line numbers, and brace, preprocessor
// and some declaration lines are absent. Confirm everything against the real header.
18 const real_type * RESTRICT data,
const int * RESTRICT cols_idx,
19 const int * RESTRICT data_idx,
20 const int num_rows,
const int num_cols,
const int blocks_per_line,
22 const int left_size,
const int right_size,
23 const int * RESTRICT right_range,
24 const value_type * RESTRICT x, value_type * RESTRICT
y
// One flattened loop over si in [0, left_size*num_rows); si is split into
// s (quotient) and i (remainder) with respect to num_rows.
27 for(
int si = 0; si<left_size*num_rows; si++)
29 int s = si / num_rows;
30 int i = si % num_rows;
// J is declared twice below (alloca-based and as a variable-length array);
// presumably these are alternatives chosen by a preprocessor conditional that
// is not visible here - TODO confirm.
32 int* J = (
int*)alloca(blocks_per_line *
sizeof(
int));
34 int J[blocks_per_line];
// For each block d of row i: read the column entry C from cols_idx and store
// the x offset (s*num_cols+C)*n in J[d]; a C of -1 is propagated as J[d] = -1.
36 for(
int d=0; d<blocks_per_line; d++)
38 int C = cols_idx[i*blocks_per_line+d];
39 J[d] = ( C == -1 ? -1 : (s*num_cols+C)*n);
// Loop over k in [0, n).
41 for(
int k=0; k<n; k++)
// B is likewise declared twice (alloca and variable-length array) - presumably
// the same preprocessor alternative as for J; TODO confirm.
44 int* B = (
int*)alloca(blocks_per_line *
sizeof(
int));
46 int B[blocks_per_line];
// B[d] is the offset into data for block d and index k: (data_idx entry * n + k) * n.
48 for(
int d=0; d<blocks_per_line; d++)
49 B[d] = (data_idx[i*blocks_per_line+d]*n+k)*n;
// Loop over j in [right_range[0], right_range[1]).
50 for(
int j=right_range[0]; j<right_range[1]; j++)
// I is the index into y built from (s, i, k, j).
52 int I = ((s*num_rows + i)*n+k)*right_size+j;
// Scale y[I] by beta; when beta == 0 a literal zero is assigned instead of
// multiplying the old value.
54 y[I] = beta == 0 ? (value_type)0 :
y[I]*beta;
// For each block d, temp is updated over q with DG_FMA of data[B[d]+q] and
// x[(J[d]+q)*right_size+j], then folded into y[I] via DG_FMA(alpha, temp, y[I]).
// The declaration of temp, the third DG_FMA argument and any handling of
// J[d] == -1 are not visible in this listing.
// NOTE(review): DG_FMA is presumably a fused multiply-add a*b+c - confirm its definition.
55 for(
int d=0; d<blocks_per_line; d++)
60 for(
int q=0; q<n; q++)
61 temp = DG_FMA(data[ B[d]+q],
62 x[(J[d]+q)*right_size+j],
64 y[I] = DG_FMA(alpha, temp,
y[I]);
// Parameter-list tail and body of a second multiply kernel. Its name line is
// not visible; n and blocks_per_line are used below but are not in this
// parameter list, so they are presumably template parameters - TODO confirm.
// NOTE(review): lossy listing - leading integers appear to be original line
// numbers; brace, branch-condition and some declaration lines are absent.
73 const real_type * RESTRICT data,
const int * RESTRICT cols_idx,
74 const int * RESTRICT data_idx,
75 const int num_rows,
const int num_cols,
76 const int left_size,
const int right_size,
77 const int * RESTRICT right_range,
78 const value_type * RESTRICT x, value_type * RESTRICT
y
// Clear `trivial` if any row i in [2, num_rows-1) has a data_idx entry that
// differs from the matching entry of row 1 (data_idx[blocks_per_line+d]).
// The declaration/initial value of `trivial` is not visible here.
89 for(
int i=2; i<num_rows-1; i++)
90 for(
int d=0; d<blocks_per_line; d++)
92 if( data_idx[i*blocks_per_line+d]
93 != data_idx[blocks_per_line+d]) trivial =
false;
// --- First variant (presumably taken when `trivial` holds - condition not visible) ---
// xprivate: scratch copy of x values for one row; dprivate: copy of row 1's
// data blocks, stored at index (k*blocks_per_line+d)*n+q.
97 value_type xprivate[blocks_per_line*n];
98 real_type dprivate[blocks_per_line*n*n];
99 for(
int d=0; d<blocks_per_line; d++)
100 for(
int k=0; k<n; k++)
101 for(
int q=0; q<n; q++)
103 int B = data_idx[blocks_per_line+d];
104 dprivate[(k*blocks_per_line+d)*n+q] = data[(B*n+k)*n+q];
// Loop over s in [0, left_size).
106 for(
int s=0; s<left_size; s++)
// First row only (i = 0): gather x into xprivate, substituting 0 where the
// column entry C is -1.
108 for(
int i=0; i<1; i++)
110 for(
int d=0; d<blocks_per_line; d++)
112 int C = cols_idx[i*blocks_per_line+d];
113 int J = (s*num_cols+C)*n;
114 for(
int q=0; q<n; q++)
115 xprivate[d*n+q] = (C == -1 ? 0 :
x[J+q]);
// For each k: one partial result temp[d] per block, accumulated over q from
// data (indexed through data_idx) and xprivate.
117 for(
int k=0; k<n; k++)
119 value_type temp[blocks_per_line] = {0};
120 for(
int d=0; d<blocks_per_line; d++)
122 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
123 for(
int q=0; q<n; q++)
124 temp[d] = DG_FMA(data[B+q], xprivate[d*n+q], temp[d]);
// y index has no j / right_size term in this variant.
126 int I = ((s*num_rows + i)*n+k);
// Scale y[I] by beta (literal zero when beta == 0), then add the per-block
// partials through DG_FMA(alpha, temp[d], y[I]).
128 y[I] = beta == 0 ? (value_type)0 :
y[I]*beta;
129 for(
int d=0; d<blocks_per_line; d++)
130 y[I] = DG_FMA(alpha, temp[d],
y[I]);
// Rows i in [1, num_rows-1): read matrix entries from the dprivate copy
// instead of going through data_idx.
133 for(
int i=1; i<num_rows-1; i++)
135 for(
int k=0; k<n; k++)
137 int I = ((s*num_rows + i)*n+k);
139 y[I] = beta == 0 ? (value_type)0 :
y[I]*beta;
// B = n*blocks_per_line*k, so dprivate[B+d*n+q] addresses the same element
// that was stored at (k*blocks_per_line+d)*n+q above.
140 int B = n*blocks_per_line*k;
141 for(
int d=0; d<blocks_per_line; d++)
// No check of C against -1 is visible on this path; the declaration of temp
// is also not visible.
144 int C = cols_idx[i*blocks_per_line+d];
147 for(
int q=0; q<n; q++)
149 int J = (s*num_cols+C)*n+q;
150 temp = DG_FMA( dprivate[B+d*n+q],
x[J], temp);
152 y[I] = DG_FMA(alpha, temp,
y[I]);
// Last row only (i = num_rows-1): same gather-then-multiply steps as for the
// first row.
156 for(
int i=num_rows-1; i<num_rows; i++)
158 for(
int d=0; d<blocks_per_line; d++)
160 int C = cols_idx[i*blocks_per_line+d];
161 int J = (s*num_cols+C)*n;
162 for(
int q=0; q<n; q++)
163 xprivate[d*n+q] = (C == -1 ? 0 :
x[J+q]);
165 for(
int k=0; k<n; k++)
167 value_type temp[blocks_per_line] = {0};
168 for(
int d=0; d<blocks_per_line; d++)
170 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
171 for(
int q=0; q<n; q++)
172 temp[d] = DG_FMA( data[B+q], xprivate[d*n+q], temp[d]);
174 int I = ((s*num_rows + i)*n+k);
176 y[I] = beta == 0 ? (value_type)0 :
y[I]*beta;
177 for(
int d=0; d<blocks_per_line; d++)
178 y[I] = DG_FMA(alpha, temp[d],
y[I]);
// --- Second variant (selecting condition not visible; presumably the
// non-trivial case) --- every row i in [0, num_rows) uses the
// gather-then-multiply steps, reading data through data_idx.
185 value_type xprivate[blocks_per_line*n];
186 for(
int s=0; s<left_size; s++)
187 for(
int i=0; i<num_rows; i++)
189 for(
int d=0; d<blocks_per_line; d++)
191 int C = cols_idx[i*blocks_per_line+d];
192 int J = (s*num_cols+C)*n;
193 for(
int q=0; q<n; q++)
194 xprivate[d*n+q] = (C == -1 ? 0 :
x[J+q]);
196 for(
int k=0; k<n; k++)
198 value_type temp[blocks_per_line] = {0};
199 for(
int d=0; d<blocks_per_line; d++)
201 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
202 for(
int q=0; q<n; q++)
203 temp[d] = DG_FMA( data[B+q], xprivate[d*n+q], temp[d]);
205 int I = ((s*num_rows + i)*n+k);
207 y[I] = beta == 0 ? (value_type)0 :
y[I]*beta;
208 for(
int d=0; d<blocks_per_line; d++)
209 y[I] = DG_FMA(alpha, temp[d],
y[I]);
// --- Third variant: uses right_range / right_size and indexes y with j ---
// dprivate holds the data entries for one (row, k) pair; J holds the x
// offsets per block.
216 real_type dprivate[blocks_per_line*n];
217 int J[blocks_per_line];
// Taken when the width of right_range is NOT greater than
// 100*left_size*num_rows*n.
218 if( !( (right_range[1]-right_range[0]) > 100*left_size*num_rows*n ))
// Flattened loop over sik in [0, left_size*num_rows*n), decomposed into s, i, k.
220 for (
int sik = 0; sik < left_size*num_rows*n; sik++)
222 int s = sik / (num_rows*n);
223 int i = (sik % (num_rows*n)) / n;
224 int k = (sik % (num_rows*n)) % n;
// Per block d: record the x offset (or -1 when C is -1) and copy the n data
// entries for this k into dprivate.
226 for(
int d=0; d<blocks_per_line; d++)
228 int C = cols_idx[i*blocks_per_line+d];
229 J[d] = ( C == -1 ? -1 :(s*num_cols+C)*n );
230 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
231 for(
int q=0; q<n; q++)
232 dprivate[d*n+q] = data[B+q];
// For each j in [right_range[0], right_range[1]): scale y[I] by beta, then
// accumulate over blocks and q.
234 for(
int j=right_range[0]; j<right_range[1]; j++)
236 int I = ((s*num_rows + i)*n+k)*right_size+j;
238 y[I] = beta == 0 ? (value_type)0 :
y[I]*beta;
// temp and Jd are used below but their declarations (and any test of Jd
// against -1) are not visible; Jd is presumably J[d] - TODO confirm.
239 for(
int d=0; d<blocks_per_line; d++)
245 for(
int q=0; q<n; q++)
246 temp = DG_FMA( dprivate[ d*n+q],
247 x[(Jd+q)*right_size+j],
249 y[I] = DG_FMA(alpha, temp,
y[I]);
// Second copy of the same loop nest (presumably the else branch of the width
// test above). The visible statements are textually identical to the first
// copy; any difference must be in lines not visible here (e.g. pragmas) -
// TODO confirm.
257 for (
int sik = 0; sik < left_size*num_rows*n; sik++)
259 int s = sik / (num_rows*n);
260 int i = (sik % (num_rows*n)) / n;
261 int k = (sik % (num_rows*n)) % n;
263 for(
int d=0; d<blocks_per_line; d++)
265 int C = cols_idx[i*blocks_per_line+d];
266 J[d] = ( C == -1 ? -1 :(s*num_cols+C)*n );
267 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
268 for(
int q=0; q<n; q++)
269 dprivate[d*n+q] = data[B+q];
271 for(
int j=right_range[0]; j<right_range[1]; j++)
273 int I = ((s*num_rows + i)*n+k)*right_size+j;
275 y[I] = beta == 0 ? (value_type)0 :
y[I]*beta;
276 for(
int d=0; d<blocks_per_line; d++)
282 for(
int q=0; q<n; q++)
283 temp = DG_FMA( dprivate[ d*n+q],
284 x[(Jd+q)*right_size+j],
286 y[I] = DG_FMA(alpha, temp,
y[I]);
// Parameter-list tail and body of call_ell_cpu_multiply_kernel (full signature
// is quoted in the reference lines at the end of this listing).
// NOTE(review): lossy listing - the first line of every call below (callee
// name and leading arguments) is not visible.
296 const real_type * RESTRICT data_ptr,
const int * RESTRICT cols_ptr,
297 const int * RESTRICT block_ptr,
298 const int num_rows,
const int num_cols,
const int blocks_per_line,
299 const int left_size,
const int right_size,
300 const int * RESTRICT right_range_ptr,
301 const value_type * RESTRICT x_ptr, value_type * RESTRICT y_ptr)
// Dispatch on the runtime value of blocks_per_line. For the values 1 to 4 the
// visible argument tails pass neither blocks_per_line nor n, so those calls
// presumably target a kernel that takes them as compile-time parameters -
// TODO confirm against the missing callee lines.
303 if( blocks_per_line == 1)
305 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
306 right_range_ptr, x_ptr,y_ptr);
307 else if (blocks_per_line == 2)
309 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
310 right_range_ptr, x_ptr,y_ptr);
311 else if (blocks_per_line == 3)
313 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
314 right_range_ptr, x_ptr,y_ptr);
315 else if (blocks_per_line == 4)
317 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
318 right_range_ptr, x_ptr,y_ptr);
// Remaining case (its `else` line is not visible): blocks_per_line and n are
// passed as ordinary arguments. n is not declared in the visible lines.
321 block_ptr, num_rows, num_cols, blocks_per_line, n, left_size,
322 right_size, right_range_ptr, x_ptr,y_ptr);
// Fragment of a launch routine whose name and signature are not visible in
// this listing.
// Obtain raw pointers to the first element of data, cols_idx, data_idx and
// right_range via thrust::raw_pointer_cast.
330 const real_type* data_ptr = thrust::raw_pointer_cast( &data[0]);
331 const int* cols_ptr = thrust::raw_pointer_cast( &cols_idx[0]);
332 const int* block_ptr = thrust::raw_pointer_cast( &data_idx[0]);
333 const int* right_range_ptr = thrust::raw_pointer_cast( &right_range[0]);
// Six call-argument tails follow; the callee names and the conditions that
// select between them are not visible. The first five pass blocks_per_line
// but not n - presumably calls to a helper that takes n as a compile-time
// parameter, chosen by the value of n; TODO confirm.
// x_ptr and y_ptr are used here but their definitions are not visible.
336 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
337 right_size, right_range_ptr, x_ptr,y_ptr);
341 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
342 right_size, right_range_ptr, x_ptr,y_ptr);
345 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
346 right_size, right_range_ptr, x_ptr,y_ptr);
349 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
350 right_size, right_range_ptr, x_ptr,y_ptr);
353 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
354 right_size, right_range_ptr, x_ptr,y_ptr);
// Last tail passes both blocks_per_line and n as ordinary arguments.
357 block_ptr, num_rows, num_cols, blocks_per_line, n, left_size,
358 right_size, right_range_ptr, x_ptr,y_ptr);
void ell_cpu_multiply_kernel(value_type alpha, value_type beta, const real_type *RESTRICT data, const int *RESTRICT cols_idx, const int *RESTRICT data_idx, const int num_rows, const int num_cols, const int blocks_per_line, const int n, const int left_size, const int right_size, const int *RESTRICT right_range, const value_type *RESTRICT x, value_type *RESTRICT y)
Definition sparseblockmat_cpu_kernels.h:17
void call_ell_cpu_multiply_kernel(value_type alpha, value_type beta, const real_type *RESTRICT data_ptr, const int *RESTRICT cols_ptr, const int *RESTRICT block_ptr, const int num_rows, const int num_cols, const int blocks_per_line, const int left_size, const int right_size, const int *RESTRICT right_range_ptr, const value_type *RESTRICT x_ptr, value_type *RESTRICT y_ptr)
Definition sparseblockmat_cpu_kernels.h:295