Discontinuous Galerkin Library
#include "dg/algorithm.h"
Loading...
Searching...
No Matches
sparseblockmat_cpu_kernels.h
Go to the documentation of this file.
1// DO NOT MANUALLY EDIT THIS FILE
2// Instead:
3// 0. Keep this notice
4// 1. copy sparseblockmat_omp_kernels.h (without include <omp.h>)
5// 2. Remove all lines with #pragma
6// 3. Replace OmpTag with SerialTag
7// 4. Replace omp_ with cpu_
8#include "config.h"
9#include "fma.h"
10
11//for the fmas it is important to activate -mfma compiler flag
12
13namespace dg{
14
15// general multiply kernel
16template<class real_type, class value_type>
17void ell_cpu_multiply_kernel( value_type alpha, value_type beta,
18 const real_type * RESTRICT data, const int * RESTRICT cols_idx,
19 const int * RESTRICT data_idx,
20 const int num_rows, const int num_cols, const int blocks_per_line,
21 const int n,
22 const int left_size, const int right_size,
23 const int * RESTRICT right_range,
24 const value_type * RESTRICT x, value_type * RESTRICT y
25 )
26{
27 for( int si = 0; si<left_size*num_rows; si++)
28 {
29 int s = si / num_rows;
30 int i = si % num_rows;
31#ifdef _MSC_VER //MSVC does not support variable lenght arrays...
32 int* J = (int*)alloca(blocks_per_line * sizeof(int));
33#else
34 int J[blocks_per_line];
35#endif
36 for( int d=0; d<blocks_per_line; d++)
37 {
38 int C = cols_idx[i*blocks_per_line+d];
39 J[d] = ( C == -1 ? -1 : (s*num_cols+C)*n);
40 }
41 for( int k=0; k<n; k++)
42 {
43#ifdef _MSC_VER
44 int* B = (int*)alloca(blocks_per_line * sizeof(int));
45#else
46 int B[blocks_per_line];
47#endif
48 for( int d=0; d<blocks_per_line; d++)
49 B[d] = (data_idx[i*blocks_per_line+d]*n+k)*n;
50 for( int j=right_range[0]; j<right_range[1]; j++)
51 {
52 int I = ((s*num_rows + i)*n+k)*right_size+j;
53 // if y[I] isnan then even beta = 0 does not make it 0
54 y[I] = beta == 0 ? (value_type)0 : y[I]*beta;
55 for( int d=0; d<blocks_per_line; d++)
56 {
57 value_type temp = 0;
58 if( J[d] == -1)
59 continue;
60 for( int q=0; q<n; q++) //multiplication-loop
61 temp = DG_FMA(data[ B[d]+q],
62 x[(J[d]+q)*right_size+j],
63 temp);
64 y[I] = DG_FMA(alpha, temp, y[I]);
65 }
66 }
67 }
68 }
69}
70//specialized multiply kernel
71template<class real_type, class value_type, int n, int blocks_per_line>
72void ell_cpu_multiply_kernel( value_type alpha, value_type beta,
73 const real_type * RESTRICT data, const int * RESTRICT cols_idx,
74 const int * RESTRICT data_idx,
75 const int num_rows, const int num_cols,
76 const int left_size, const int right_size,
77 const int * RESTRICT right_range,
78 const value_type * RESTRICT x, value_type * RESTRICT y
79 )
80{
81 //basically we check which direction is the largest and parallelize that one
82 if(right_size==1)
83 {
84 // trivial means that the data blocks do not change among rows
85 // that are not the first or the last row (where BCs usually live)
86 bool trivial = true;
87 if( num_rows < 4) // need at least 3 rows for this to make sense
88 trivial = false;
89 for( int i=2; i<num_rows-1; i++)
90 for( int d=0; d<blocks_per_line; d++)
91 {
92 if( data_idx[i*blocks_per_line+d]
93 != data_idx[blocks_per_line+d]) trivial = false;
94 }
95 if(trivial)
96 {
97 value_type xprivate[blocks_per_line*n];
98 real_type dprivate[blocks_per_line*n*n];
99 for( int d=0; d<blocks_per_line; d++)
100 for( int k=0; k<n; k++)
101 for( int q=0; q<n; q++)
102 {
103 int B = data_idx[blocks_per_line+d];
104 dprivate[(k*blocks_per_line+d)*n+q] = data[(B*n+k)*n+q];
105 }
106 for( int s=0; s<left_size; s++)
107 {
108 for( int i=0; i<1; i++)
109 {
110 for( int d=0; d<blocks_per_line; d++)
111 {
112 int C = cols_idx[i*blocks_per_line+d];
113 int J = (s*num_cols+C)*n;
114 for(int q=0; q<n; q++)
115 xprivate[d*n+q] = (C == -1 ? 0 : x[J+q]);
116 }
117 for( int k=0; k<n; k++)
118 {
119 value_type temp[blocks_per_line] = {0};
120 for( int d=0; d<blocks_per_line; d++)
121 {
122 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
123 for( int q=0; q<n; q++) //multiplication-loop
124 temp[d] = DG_FMA(data[B+q], xprivate[d*n+q], temp[d]);
125 }
126 int I = ((s*num_rows + i)*n+k);
127 // if y[I] isnan then even beta = 0 does not make it 0
128 y[I] = beta == 0 ? (value_type)0 : y[I]*beta;
129 for( int d=0; d<blocks_per_line; d++)
130 y[I] = DG_FMA(alpha, temp[d], y[I]);
131 }
132 }
133 for( int i=1; i<num_rows-1; i++)
134 {
135 for( int k=0; k<n; k++)
136 {
137 int I = ((s*num_rows + i)*n+k);
138 // if y[I] isnan then even beta = 0 does not make it 0
139 y[I] = beta == 0 ? (value_type)0 : y[I]*beta;
140 int B = n*blocks_per_line*k;
141 for( int d=0; d<blocks_per_line; d++)
142 {
143 value_type temp = 0;
144 int C = cols_idx[i*blocks_per_line+d];
145 if( C == -1)
146 continue;
147 for( int q=0; q<n; q++)
148 {
149 int J = (s*num_cols+C)*n+q;
150 temp = DG_FMA( dprivate[B+d*n+q], x[J], temp);
151 }
152 y[I] = DG_FMA(alpha, temp, y[I]);
153 }
154 }
155 }
156 for( int i=num_rows-1; i<num_rows; i++)
157 {
158 for( int d=0; d<blocks_per_line; d++)
159 {
160 int C = cols_idx[i*blocks_per_line+d];
161 int J = (s*num_cols+C)*n;
162 for(int q=0; q<n; q++)
163 xprivate[d*n+q] = (C == -1 ? 0 : x[J+q]);
164 }
165 for( int k=0; k<n; k++)
166 {
167 value_type temp[blocks_per_line] = {0};
168 for( int d=0; d<blocks_per_line; d++)
169 {
170 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
171 for( int q=0; q<n; q++) //multiplication-loop
172 temp[d] = DG_FMA( data[B+q], xprivate[d*n+q], temp[d]);
173 }
174 int I = ((s*num_rows + i)*n+k);
175 // if y[I] isnan then even beta = 0 does not make it 0
176 y[I] = beta == 0 ? (value_type)0 : y[I]*beta;
177 for( int d=0; d<blocks_per_line; d++)
178 y[I] = DG_FMA(alpha, temp[d], y[I]);
179 }
180 }
181 }
182 } //trivial
183 else // not trivial
184 {
185 value_type xprivate[blocks_per_line*n];
186 for( int s=0; s<left_size; s++)
187 for( int i=0; i<num_rows; i++)
188 {
189 for( int d=0; d<blocks_per_line; d++)
190 {
191 int C = cols_idx[i*blocks_per_line+d];
192 int J = (s*num_cols+C)*n;
193 for(int q=0; q<n; q++)
194 xprivate[d*n+q] = (C == -1 ? 0 : x[J+q]);
195 }
196 for( int k=0; k<n; k++)
197 {
198 value_type temp[blocks_per_line] = {0};
199 for( int d=0; d<blocks_per_line; d++)
200 {
201 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
202 for( int q=0; q<n; q++) //multiplication-loop
203 temp[d] = DG_FMA( data[B+q], xprivate[d*n+q], temp[d]);
204 }
205 int I = ((s*num_rows + i)*n+k);
206 // if y[I] isnan then even beta = 0 does not make it 0
207 y[I] = beta == 0 ? (value_type)0 : y[I]*beta;
208 for( int d=0; d<blocks_per_line; d++)
209 y[I] = DG_FMA(alpha, temp[d], y[I]);
210 }
211 }
212 }// trivial
213 }// right_size==1
214 else // right_size != 1
215 {
216 real_type dprivate[blocks_per_line*n];
217 int J[blocks_per_line];
218 if( !( (right_range[1]-right_range[0]) > 100*left_size*num_rows*n )) //typically a derivative in y ( Ny*Nz >~ Nx)
219 {
220 for (int sik = 0; sik < left_size*num_rows*n; sik++)
221 {
222 int s = sik / (num_rows*n);
223 int i = (sik % (num_rows*n)) / n;
224 int k = (sik % (num_rows*n)) % n;
225
226 for( int d=0; d<blocks_per_line; d++)
227 {
228 int C = cols_idx[i*blocks_per_line+d];
229 J[d] = ( C == -1 ? -1 :(s*num_cols+C)*n );
230 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
231 for(int q=0; q<n; q++)
232 dprivate[d*n+q] = data[B+q];
233 }
234 for( int j=right_range[0]; j<right_range[1]; j++)
235 {
236 int I = ((s*num_rows + i)*n+k)*right_size+j;
237 // if y[I] isnan then even beta = 0 does not make it 0
238 y[I] = beta == 0 ? (value_type)0 : y[I]*beta;
239 for( int d=0; d<blocks_per_line; d++)
240 {
241 int Jd = J[d];
242 if ( Jd == -1)
243 continue;
244 value_type temp = 0;
245 for( int q=0; q<n; q++) //multiplication-loop
246 temp = DG_FMA( dprivate[ d*n+q],
247 x[(Jd+q)*right_size+j],
248 temp);
249 y[I] = DG_FMA(alpha, temp, y[I]);
250 }
251 }
252 }
253 }
254 else //typically a derivative in z (since n*n*Nx*Ny > 100*Nz)
255 {
256
257 for (int sik = 0; sik < left_size*num_rows*n; sik++)
258 {
259 int s = sik / (num_rows*n);
260 int i = (sik % (num_rows*n)) / n;
261 int k = (sik % (num_rows*n)) % n;
262
263 for( int d=0; d<blocks_per_line; d++)
264 {
265 int C = cols_idx[i*blocks_per_line+d];
266 J[d] = ( C == -1 ? -1 :(s*num_cols+C)*n );
267 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
268 for(int q=0; q<n; q++)
269 dprivate[d*n+q] = data[B+q];
270 }
271 for( int j=right_range[0]; j<right_range[1]; j++)
272 {
273 int I = ((s*num_rows + i)*n+k)*right_size+j;
274 // if y[I] isnan then even beta = 0 does not make it 0
275 y[I] = beta == 0 ? (value_type)0 : y[I]*beta;
276 for( int d=0; d<blocks_per_line; d++)
277 {
278 int Jd = J[d];
279 if( Jd == -1)
280 continue;
281 value_type temp = 0;
282 for( int q=0; q<n; q++) //multiplication-loop
283 temp = DG_FMA( dprivate[ d*n+q],
284 x[(Jd+q)*right_size+j],
285 temp);
286 y[I] = DG_FMA(alpha, temp, y[I]);
287 }
288 }
289 }
290 }
291 }
292}
293
294template<class real_type, class value_type, int n>
295void call_ell_cpu_multiply_kernel( value_type alpha, value_type beta,
296 const real_type * RESTRICT data_ptr, const int * RESTRICT cols_ptr,
297 const int * RESTRICT block_ptr,
298 const int num_rows, const int num_cols, const int blocks_per_line,
299 const int left_size, const int right_size,
300 const int * RESTRICT right_range_ptr,
301 const value_type * RESTRICT x_ptr, value_type * RESTRICT y_ptr)
302{
303 if( blocks_per_line == 1)
305 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
306 right_range_ptr, x_ptr,y_ptr);
307 else if (blocks_per_line == 2)
309 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
310 right_range_ptr, x_ptr,y_ptr);
311 else if (blocks_per_line == 3)
313 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
314 right_range_ptr, x_ptr,y_ptr);
315 else if (blocks_per_line == 4)
317 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
318 right_range_ptr, x_ptr,y_ptr);
319 else
320 ell_cpu_multiply_kernel<real_type, value_type> (alpha, beta, data_ptr, cols_ptr,
321 block_ptr, num_rows, num_cols, blocks_per_line, n, left_size,
322 right_size, right_range_ptr, x_ptr,y_ptr);
323}
324
325
326template<class real_type, template<class> class Vector>
327template<class value_type>
328void EllSparseBlockMat<real_type, Vector>::launch_multiply_kernel( SerialTag, value_type alpha, const value_type* x_ptr, value_type beta, value_type* y_ptr) const
329{
330 const real_type* data_ptr = thrust::raw_pointer_cast( &data[0]);
331 const int* cols_ptr = thrust::raw_pointer_cast( &cols_idx[0]);
332 const int* block_ptr = thrust::raw_pointer_cast( &data_idx[0]);
333 const int* right_range_ptr = thrust::raw_pointer_cast( &right_range[0]);
334 if( n == 1)
336 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
337 right_size, right_range_ptr, x_ptr,y_ptr);
338
339 else if( n == 2)
341 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
342 right_size, right_range_ptr, x_ptr,y_ptr);
343 else if( n == 3)
345 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
346 right_size, right_range_ptr, x_ptr,y_ptr);
347 else if( n == 4)
349 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
350 right_size, right_range_ptr, x_ptr,y_ptr);
351 else if( n == 5)
353 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
354 right_size, right_range_ptr, x_ptr,y_ptr);
355 else
356 ell_cpu_multiply_kernel<real_type, value_type> ( alpha, beta, data_ptr, cols_ptr,
357 block_ptr, num_rows, num_cols, blocks_per_line, n, left_size,
358 right_size, right_range_ptr, x_ptr,y_ptr);
359}
360
361template<class real_type, class value_type, template<class> class Vector>
362void coo_cpu_multiply_kernel( value_type alpha, const value_type** x, value_type beta, value_type* RESTRICT y, const CooSparseBlockMat<real_type, Vector>& m )
363{
364 for (int skj = 0; skj < m.left_size*m.n*m.right_size; skj++)
365 {
366 int s = skj / (m.n*m.right_size);
367 int k = (skj % (m.n*m.right_size)) / m.right_size;
368 int j = (skj % (m.n*m.right_size)) % m.right_size;
369 for (int i = 0; i < m.num_entries; i++)
370 {
371 int I = ((s*m.num_rows + m.rows_idx[i])*m.n + k)*m.right_size + j;
372 value_type temp = 0;
373 for (int q = 0; q < m.n; q++) //multiplication-loop
374 temp = DG_FMA(m.data[(m.data_idx[i] * m.n + k)*m.n + q],
375 //x[((s*m.num_cols + m.cols_idx[i])*m.n+q)*m.right_size+j],
376 x[m.cols_idx[i]][(q*m.left_size +s )*m.right_size+j],
377 temp);
378 y[I] = DG_FMA(alpha, temp, y[I]);
379 }
380 }
381}
382template<class real_type, class value_type, int n, template<class > class Vector>
383void coo_cpu_multiply_kernel( value_type alpha, const value_type** x, value_type beta, value_type* RESTRICT y, const CooSparseBlockMat<real_type, Vector>& m )
384{
385 bool trivial = true;
386 int CC = m.cols_idx[0], DD = m.data_idx[0];
387 for( int i=0; i<m.num_entries; i++)
388 if( CC+i != m.cols_idx[i] || DD+i != m.data_idx[i])
389 trivial=false;
390 if( trivial)
391 {
392 for (int sj = 0; sj < m.left_size*m.right_size; sj++)
393 {
394 int s = sj / m.right_size;
395 int j = (sj % m.right_size) % m.right_size;
396 for( int k=0; k<n; k++)
397 {
398 for (int i = 0; i < m.num_entries; i++)
399 {
400 int I = ((s*m.num_rows + m.rows_idx[i])*n + k)*m.right_size + j;
401 int DDD = ((DD +i)*n+k)*n, CCC = CC+i;
402 value_type temp = 0;
403 for (int q = 0; q < n; q++) //multiplication-loop
404 temp = DG_FMA(m.data[DDD + q],
405 //x[((s*m.num_cols + CCC)*n+q)*m.right_size+sj],
406 x[CCC][q*m.left_size*m.right_size +sj],
407 temp);
408 y[I] = DG_FMA(alpha, temp, y[I]);
409 }
410 }
411 }
412 }
413 else
414 {
415 for (int sj = 0; sj < m.left_size*m.right_size; sj++)
416 {
417 int s = sj / m.right_size;
418 int j = (sj % m.right_size) % m.right_size;
419 for( int k=0; k<n; k++)
420 {
421 for (int i = 0; i < m.num_entries; i++)
422 {
423 int I = ((s*m.num_rows + m.rows_idx[i])*n + k)*m.right_size + j;
424 value_type temp = 0;
425 for (int q = 0; q < n; q++) //multiplication-loop
426 temp = DG_FMA(m.data[(m.data_idx[i] * n + k)*n + q],
427 //x[((s*m.num_cols + m.cols_idx[i])*n+q)*m.right_size+j],
428 x[m.cols_idx[i]][q*m.left_size*m.right_size +sj],
429 temp);
430 y[I] = DG_FMA(alpha, temp, y[I]);
431 }
432 }
433 }
434 }
435}
436template<class real_type, template<class> class Vector>
437template<class value_type>
438void CooSparseBlockMat<real_type, Vector>::launch_multiply_kernel( SerialTag, value_type alpha, const value_type** x, value_type beta, value_type* RESTRICT y) const
439{
440 if( num_entries==0)
441 return;
442 assert( beta == 1 && "Beta != 1 yields wrong results in CooSparseBlockMat!!");
443 if( n == 1)
445 else if( n == 2)
447 else if( n == 3)
449 else if( n == 4)
451 else
452 coo_cpu_multiply_kernel<real_type, value_type>( alpha, x, beta, y, *this);
453}
454
455}//namespace dg
@ y
y direction
@ x
x direction
This is the namespace for all functions and classes defined and used by the discontinuous Galerkin li...
void coo_cpu_multiply_kernel(value_type alpha, const value_type **x, value_type beta, value_type *RESTRICT y, const CooSparseBlockMat< real_type, Vector > &m)
Definition sparseblockmat_cpu_kernels.h:362
void ell_cpu_multiply_kernel(value_type alpha, value_type beta, const real_type *RESTRICT data, const int *RESTRICT cols_idx, const int *RESTRICT data_idx, const int num_rows, const int num_cols, const int blocks_per_line, const int n, const int left_size, const int right_size, const int *RESTRICT right_range, const value_type *RESTRICT x, value_type *RESTRICT y)
Definition sparseblockmat_cpu_kernels.h:17
void call_ell_cpu_multiply_kernel(value_type alpha, value_type beta, const real_type *RESTRICT data_ptr, const int *RESTRICT cols_ptr, const int *RESTRICT block_ptr, const int num_rows, const int num_cols, const int blocks_per_line, const int left_size, const int right_size, const int *RESTRICT right_range_ptr, const value_type *RESTRICT x_ptr, value_type *RESTRICT y_ptr)
Definition sparseblockmat_cpu_kernels.h:295
Coo Sparse Block Matrix format.
Definition sparseblockmat.h:228
int num_entries
number of entries in the matrix
Definition sparseblockmat.h:345
int left_size
size of the left Kronecker delta
Definition sparseblockmat.h:347
Vector< int > rows_idx
is of size num_entries and contains the row indices
Definition sparseblockmat.h:341
Vector< real_type > data
The data array is of size n*n*num_different_blocks and contains the blocks.
Definition sparseblockmat.h:339
int right_size
size of the right Kronecker delta (is e.g 1 for a x - derivative)
Definition sparseblockmat.h:348
Vector< int > data_idx
is of size num_entries and contains indices into the data array
Definition sparseblockmat.h:342
int n
each block has size n*n
Definition sparseblockmat.h:346
int num_rows
number of rows
Definition sparseblockmat.h:343
Vector< int > cols_idx
is of size num_entries and contains the column indices
Definition sparseblockmat.h:340
Ell Sparse Block Matrix format.
Definition sparseblockmat.h:46
Indicate sequential execution.
Definition execution_policy.h:26