18 const real_type * RESTRICT data,
const int * RESTRICT cols_idx,
19 const int * RESTRICT data_idx,
20 const int num_rows,
const int num_cols,
const int blocks_per_line,
22 const int left_size,
const int right_size,
23 const int * RESTRICT right_range,
27 std::vector<int> J(blocks_per_line), B(blocks_per_line);
28 for(
int si = 0; si<left_size*num_rows; si++)
30 int s = si / num_rows;
31 int i = si % num_rows;
32 for(
int d=0; d<blocks_per_line; d++)
34 int C = cols_idx[i*blocks_per_line+d];
35 J[d] = ( C == -1 ? -1 : (s*num_cols+C)*
n);
37 for(
int k=0; k<
n; k++)
39 for(
int d=0; d<blocks_per_line; d++)
40 B[d] = (data_idx[i*blocks_per_line+d]*
n+k)*
n;
41 for(
int j=right_range[0]; j<right_range[1]; j++)
43 int I = ((s*num_rows + i)*
n+k)*right_size+j;
46 for(
int d=0; d<blocks_per_line; d++)
51 for(
int q=0; q<
n; q++)
52 temp = DG_FMA(data[ B[d]+q],
53 x[(J[d]+q)*right_size+j],
55 y[I] = DG_FMA(
alpha, temp,
y[I]);
64 const real_type * RESTRICT data,
const int * RESTRICT cols_idx,
65 const int * RESTRICT data_idx,
66 const int num_rows,
const int num_cols,
67 const int left_size,
const int right_size,
68 const int * RESTRICT right_range,
80 for(
int i=2; i<num_rows-1; i++)
81 for(
int d=0; d<blocks_per_line; d++)
83 if( data_idx[i*blocks_per_line+d]
84 != data_idx[blocks_per_line+d]) trivial =
false;
89 real_type dprivate[blocks_per_line*
n*
n];
90 for(
int d=0; d<blocks_per_line; d++)
91 for(
int k=0; k<
n; k++)
92 for(
int q=0; q<
n; q++)
94 int B = data_idx[blocks_per_line+d];
95 dprivate[(k*blocks_per_line+d)*
n+q] = data[(B*
n+k)*
n+q];
97 for(
int s=0; s<left_size; s++)
99 for(
int i=0; i<1; i++)
101 for(
int d=0; d<blocks_per_line; d++)
103 int C = cols_idx[i*blocks_per_line+d];
104 int J = (s*num_cols+C)*
n;
105 for(
int q=0; q<
n; q++)
106 xprivate[d*
n+q] = (C == -1 ? 0 :
x[J+q]);
108 for(
int k=0; k<
n; k++)
111 for(
int d=0; d<blocks_per_line; d++)
113 int B = (data_idx[i*blocks_per_line+d]*
n+k)*
n;
114 for(
int q=0; q<
n; q++)
115 temp[d] = DG_FMA(data[B+q], xprivate[d*
n+q], temp[d]);
117 int I = ((s*num_rows + i)*
n+k);
120 for(
int d=0; d<blocks_per_line; d++)
121 y[I] = DG_FMA(
alpha, temp[d],
y[I]);
124 for(
int i=1; i<num_rows-1; i++)
126 for(
int k=0; k<
n; k++)
128 int I = ((s*num_rows + i)*
n+k);
131 int B =
n*blocks_per_line*k;
132 for(
int d=0; d<blocks_per_line; d++)
135 int C = cols_idx[i*blocks_per_line+d];
138 for(
int q=0; q<
n; q++)
140 int J = (s*num_cols+C)*
n+q;
141 temp = DG_FMA( dprivate[B+d*
n+q],
x[J], temp);
143 y[I] = DG_FMA(
alpha, temp,
y[I]);
147 for(
int i=num_rows-1; i<num_rows; i++)
149 for(
int d=0; d<blocks_per_line; d++)
151 int C = cols_idx[i*blocks_per_line+d];
152 int J = (s*num_cols+C)*
n;
153 for(
int q=0; q<
n; q++)
154 xprivate[d*
n+q] = (C == -1 ? 0 :
x[J+q]);
156 for(
int k=0; k<
n; k++)
159 for(
int d=0; d<blocks_per_line; d++)
161 int B = (data_idx[i*blocks_per_line+d]*
n+k)*
n;
162 for(
int q=0; q<
n; q++)
163 temp[d] = DG_FMA( data[B+q], xprivate[d*
n+q], temp[d]);
165 int I = ((s*num_rows + i)*
n+k);
168 for(
int d=0; d<blocks_per_line; d++)
169 y[I] = DG_FMA(
alpha, temp[d],
y[I]);
177 for(
int s=0; s<left_size; s++)
178 for(
int i=0; i<num_rows; i++)
180 for(
int d=0; d<blocks_per_line; d++)
182 int C = cols_idx[i*blocks_per_line+d];
183 int J = (s*num_cols+C)*
n;
184 for(
int q=0; q<
n; q++)
185 xprivate[d*
n+q] = (C == -1 ? 0 :
x[J+q]);
187 for(
int k=0; k<
n; k++)
190 for(
int d=0; d<blocks_per_line; d++)
192 int B = (data_idx[i*blocks_per_line+d]*
n+k)*
n;
193 for(
int q=0; q<
n; q++)
194 temp[d] = DG_FMA( data[B+q], xprivate[d*
n+q], temp[d]);
196 int I = ((s*num_rows + i)*
n+k);
199 for(
int d=0; d<blocks_per_line; d++)
200 y[I] = DG_FMA(
alpha, temp[d],
y[I]);
207 real_type dprivate[blocks_per_line*
n];
208 int J[blocks_per_line];
209 if( !( (right_range[1]-right_range[0]) > 100*left_size*num_rows*
n ))
211 for (
int sik = 0; sik < left_size*num_rows*
n; sik++)
213 int s = sik / (num_rows*
n);
214 int i = (sik % (num_rows*
n)) /
n;
215 int k = (sik % (num_rows*
n)) %
n;
217 for(
int d=0; d<blocks_per_line; d++)
219 int C = cols_idx[i*blocks_per_line+d];
220 J[d] = ( C == -1 ? -1 :(s*num_cols+C)*
n );
221 int B = (data_idx[i*blocks_per_line+d]*
n+k)*
n;
222 for(
int q=0; q<
n; q++)
223 dprivate[d*
n+q] = data[B+q];
225 for(
int j=right_range[0]; j<right_range[1]; j++)
227 int I = ((s*num_rows + i)*
n+k)*right_size+j;
230 for(
int d=0; d<blocks_per_line; d++)
236 for(
int q=0; q<
n; q++)
237 temp = DG_FMA( dprivate[ d*
n+q],
238 x[(Jd+q)*right_size+j],
240 y[I] = DG_FMA(
alpha, temp,
y[I]);
248 for (
int sik = 0; sik < left_size*num_rows*
n; sik++)
250 int s = sik / (num_rows*
n);
251 int i = (sik % (num_rows*
n)) /
n;
252 int k = (sik % (num_rows*
n)) %
n;
254 for(
int d=0; d<blocks_per_line; d++)
256 int C = cols_idx[i*blocks_per_line+d];
257 J[d] = ( C == -1 ? -1 :(s*num_cols+C)*
n );
258 int B = (data_idx[i*blocks_per_line+d]*
n+k)*
n;
259 for(
int q=0; q<
n; q++)
260 dprivate[d*
n+q] = data[B+q];
262 for(
int j=right_range[0]; j<right_range[1]; j++)
264 int I = ((s*num_rows + i)*
n+k)*right_size+j;
267 for(
int d=0; d<blocks_per_line; d++)
273 for(
int q=0; q<
n; q++)
274 temp = DG_FMA( dprivate[ d*
n+q],
275 x[(Jd+q)*right_size+j],
277 y[I] = DG_FMA(
alpha, temp,
y[I]);
287 const real_type * RESTRICT data_ptr,
const int * RESTRICT cols_ptr,
288 const int * RESTRICT block_ptr,
289 const int num_rows,
const int num_cols,
const int blocks_per_line,
290 const int left_size,
const int right_size,
291 const int * RESTRICT right_range_ptr,
294 if( blocks_per_line == 1)
296 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
297 right_range_ptr, x_ptr,y_ptr);
298 else if (blocks_per_line == 2)
300 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
301 right_range_ptr, x_ptr,y_ptr);
302 else if (blocks_per_line == 3)
304 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
305 right_range_ptr, x_ptr,y_ptr);
306 else if (blocks_per_line == 4)
308 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
309 right_range_ptr, x_ptr,y_ptr);
312 block_ptr, num_rows, num_cols, blocks_per_line,
n, left_size,
313 right_size, right_range_ptr, x_ptr,y_ptr);
321 const real_type* data_ptr = thrust::raw_pointer_cast( &data[0]);
322 const int* cols_ptr = thrust::raw_pointer_cast( &cols_idx[0]);
323 const int* block_ptr = thrust::raw_pointer_cast( &data_idx[0]);
324 const int* right_range_ptr = thrust::raw_pointer_cast( &right_range[0]);
327 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
328 right_size, right_range_ptr, x_ptr,y_ptr);
332 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
333 right_size, right_range_ptr, x_ptr,y_ptr);
336 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
337 right_size, right_range_ptr, x_ptr,y_ptr);
340 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
341 right_size, right_range_ptr, x_ptr,y_ptr);
344 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
345 right_size, right_range_ptr, x_ptr,y_ptr);
348 block_ptr, num_rows, num_cols, blocks_per_line,
n, left_size,
349 right_size, right_range_ptr, x_ptr,y_ptr);
void coo_cpu_multiply_kernel(value_type alpha, const value_type **x, value_type, value_type *RESTRICT y, const CooSparseBlockMat< real_type, Vector > &m)
Definition sparseblockmat_cpu_kernels.h:353
void ell_cpu_multiply_kernel(value_type alpha, value_type beta, const real_type *RESTRICT data, const int *RESTRICT cols_idx, const int *RESTRICT data_idx, const int num_rows, const int num_cols, const int blocks_per_line, const int n, const int left_size, const int right_size, const int *RESTRICT right_range, const value_type *RESTRICT x, value_type *RESTRICT y)
Definition sparseblockmat_cpu_kernels.h:17
void call_ell_cpu_multiply_kernel(value_type alpha, value_type beta, const real_type *RESTRICT data_ptr, const int *RESTRICT cols_ptr, const int *RESTRICT block_ptr, const int num_rows, const int num_cols, const int blocks_per_line, const int left_size, const int right_size, const int *RESTRICT right_range_ptr, const value_type *RESTRICT x_ptr, value_type *RESTRICT y_ptr)
Definition sparseblockmat_cpu_kernels.h:286