Discontinuous Galerkin Library
#include "dg/algorithm.h"
Loading...
Searching...
No Matches
sparseblockmat_cpu_kernels.h
Go to the documentation of this file.
1// DO NOT MANUALLY EDIT THIS FILE
2// Instead:
3// 0. Keep this notice
4// 1. copy sparseblockmat_omp_kernels.h (without include <omp.h>)
5// 2. Remove all lines with #pragma
6// 3. Replace OmpTag with SerialTag
7// 4. Replace omp_ with cpu_
8#include "config.h"
9#include "fma.h"
10
11//for the fmas it is important to activate -mfma compiler flag
12
13namespace dg{
14
15// general multiply kernel
16template<class real_type, class value_type>
18 const real_type * RESTRICT data, const int * RESTRICT cols_idx,
19 const int * RESTRICT data_idx,
20 const int num_rows, const int num_cols, const int blocks_per_line,
21 const int n,
22 const int left_size, const int right_size,
23 const int * RESTRICT right_range,
24 const value_type * RESTRICT x, value_type * RESTRICT y
25 )
26{
27 std::vector<int> J(blocks_per_line), B(blocks_per_line);
28 for( int si = 0; si<left_size*num_rows; si++)
29 {
30 int s = si / num_rows;
31 int i = si % num_rows;
32 for( int d=0; d<blocks_per_line; d++)
33 {
34 int C = cols_idx[i*blocks_per_line+d];
35 J[d] = ( C == -1 ? -1 : (s*num_cols+C)*n);
36 }
37 for( int k=0; k<n; k++)
38 {
39 for( int d=0; d<blocks_per_line; d++)
40 B[d] = (data_idx[i*blocks_per_line+d]*n+k)*n;
41 for( int j=right_range[0]; j<right_range[1]; j++)
42 {
43 int I = ((s*num_rows + i)*n+k)*right_size+j;
44 // if y[I] isnan then even beta = 0 does not make it 0
45 y[I] = beta == 0 ? (value_type)0 : y[I]*beta;
46 for( int d=0; d<blocks_per_line; d++)
47 {
48 value_type temp = 0;
49 if( J[d] == -1)
50 continue;
51 for( int q=0; q<n; q++) //multiplication-loop
52 temp = DG_FMA(data[ B[d]+q],
53 x[(J[d]+q)*right_size+j],
54 temp);
55 y[I] = DG_FMA(alpha, temp, y[I]);
56 }
57 }
58 }
59 }
60}
61//specialized multiply kernel
62template<class real_type, class value_type, int n, int blocks_per_line>
64 const real_type * RESTRICT data, const int * RESTRICT cols_idx,
65 const int * RESTRICT data_idx,
66 const int num_rows, const int num_cols,
67 const int left_size, const int right_size,
68 const int * RESTRICT right_range,
69 const value_type * RESTRICT x, value_type * RESTRICT y
70 )
71{
72 //basically we check which direction is the largest and parallelize that one
73 if(right_size==1)
74 {
75 // trivial means that the data blocks do not change among rows
76 // that are not the first or the last row (where BCs usually live)
77 bool trivial = true;
78 if( num_rows < 4) // need at least 3 rows for this to make sense
79 trivial = false;
80 for( int i=2; i<num_rows-1; i++)
81 for( int d=0; d<blocks_per_line; d++)
82 {
83 if( data_idx[i*blocks_per_line+d]
84 != data_idx[blocks_per_line+d]) trivial = false;
85 }
86 if(trivial)
87 {
88 value_type xprivate[blocks_per_line*n];
89 real_type dprivate[blocks_per_line*n*n];
90 for( int d=0; d<blocks_per_line; d++)
91 for( int k=0; k<n; k++)
92 for( int q=0; q<n; q++)
93 {
94 int B = data_idx[blocks_per_line+d];
95 dprivate[(k*blocks_per_line+d)*n+q] = data[(B*n+k)*n+q];
96 }
97 for( int s=0; s<left_size; s++)
98 {
99 for( int i=0; i<1; i++)
100 {
101 for( int d=0; d<blocks_per_line; d++)
102 {
103 int C = cols_idx[i*blocks_per_line+d];
104 int J = (s*num_cols+C)*n;
105 for(int q=0; q<n; q++)
106 xprivate[d*n+q] = (C == -1 ? 0 : x[J+q]);
107 }
108 for( int k=0; k<n; k++)
109 {
110 value_type temp[blocks_per_line] = {0};
111 for( int d=0; d<blocks_per_line; d++)
112 {
113 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
114 for( int q=0; q<n; q++) //multiplication-loop
115 temp[d] = DG_FMA(data[B+q], xprivate[d*n+q], temp[d]);
116 }
117 int I = ((s*num_rows + i)*n+k);
118 // if y[I] isnan then even beta = 0 does not make it 0
119 y[I] = beta == 0 ? (value_type)0 : y[I]*beta;
120 for( int d=0; d<blocks_per_line; d++)
121 y[I] = DG_FMA(alpha, temp[d], y[I]);
122 }
123 }
124 for( int i=1; i<num_rows-1; i++)
125 {
126 for( int k=0; k<n; k++)
127 {
128 int I = ((s*num_rows + i)*n+k);
129 // if y[I] isnan then even beta = 0 does not make it 0
130 y[I] = beta == 0 ? (value_type)0 : y[I]*beta;
131 int B = n*blocks_per_line*k;
132 for( int d=0; d<blocks_per_line; d++)
133 {
134 value_type temp = 0;
135 int C = cols_idx[i*blocks_per_line+d];
136 if( C == -1)
137 continue;
138 for( int q=0; q<n; q++)
139 {
140 int J = (s*num_cols+C)*n+q;
141 temp = DG_FMA( dprivate[B+d*n+q], x[J], temp);
142 }
143 y[I] = DG_FMA(alpha, temp, y[I]);
144 }
145 }
146 }
147 for( int i=num_rows-1; i<num_rows; i++)
148 {
149 for( int d=0; d<blocks_per_line; d++)
150 {
151 int C = cols_idx[i*blocks_per_line+d];
152 int J = (s*num_cols+C)*n;
153 for(int q=0; q<n; q++)
154 xprivate[d*n+q] = (C == -1 ? 0 : x[J+q]);
155 }
156 for( int k=0; k<n; k++)
157 {
158 value_type temp[blocks_per_line] = {0};
159 for( int d=0; d<blocks_per_line; d++)
160 {
161 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
162 for( int q=0; q<n; q++) //multiplication-loop
163 temp[d] = DG_FMA( data[B+q], xprivate[d*n+q], temp[d]);
164 }
165 int I = ((s*num_rows + i)*n+k);
166 // if y[I] isnan then even beta = 0 does not make it 0
167 y[I] = beta == 0 ? (value_type)0 : y[I]*beta;
168 for( int d=0; d<blocks_per_line; d++)
169 y[I] = DG_FMA(alpha, temp[d], y[I]);
170 }
171 }
172 }
173 } //trivial
174 else // not trivial
175 {
176 value_type xprivate[blocks_per_line*n];
177 for( int s=0; s<left_size; s++)
178 for( int i=0; i<num_rows; i++)
179 {
180 for( int d=0; d<blocks_per_line; d++)
181 {
182 int C = cols_idx[i*blocks_per_line+d];
183 int J = (s*num_cols+C)*n;
184 for(int q=0; q<n; q++)
185 xprivate[d*n+q] = (C == -1 ? 0 : x[J+q]);
186 }
187 for( int k=0; k<n; k++)
188 {
189 value_type temp[blocks_per_line] = {0};
190 for( int d=0; d<blocks_per_line; d++)
191 {
192 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
193 for( int q=0; q<n; q++) //multiplication-loop
194 temp[d] = DG_FMA( data[B+q], xprivate[d*n+q], temp[d]);
195 }
196 int I = ((s*num_rows + i)*n+k);
197 // if y[I] isnan then even beta = 0 does not make it 0
198 y[I] = beta == 0 ? (value_type)0 : y[I]*beta;
199 for( int d=0; d<blocks_per_line; d++)
200 y[I] = DG_FMA(alpha, temp[d], y[I]);
201 }
202 }
203 }// trivial
204 }// right_size==1
205 else // right_size != 1
206 {
207 real_type dprivate[blocks_per_line*n];
208 int J[blocks_per_line];
209 if( !( (right_range[1]-right_range[0]) > 100*left_size*num_rows*n )) //typically a derivative in y ( Ny*Nz >~ Nx)
210 {
211 for (int sik = 0; sik < left_size*num_rows*n; sik++)
212 {
213 int s = sik / (num_rows*n);
214 int i = (sik % (num_rows*n)) / n;
215 int k = (sik % (num_rows*n)) % n;
216
217 for( int d=0; d<blocks_per_line; d++)
218 {
219 int C = cols_idx[i*blocks_per_line+d];
220 J[d] = ( C == -1 ? -1 :(s*num_cols+C)*n );
221 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
222 for(int q=0; q<n; q++)
223 dprivate[d*n+q] = data[B+q];
224 }
225 for( int j=right_range[0]; j<right_range[1]; j++)
226 {
227 int I = ((s*num_rows + i)*n+k)*right_size+j;
228 // if y[I] isnan then even beta = 0 does not make it 0
229 y[I] = beta == 0 ? (value_type)0 : y[I]*beta;
230 for( int d=0; d<blocks_per_line; d++)
231 {
232 int Jd = J[d];
233 if ( Jd == -1)
234 continue;
235 value_type temp = 0;
236 for( int q=0; q<n; q++) //multiplication-loop
237 temp = DG_FMA( dprivate[ d*n+q],
238 x[(Jd+q)*right_size+j],
239 temp);
240 y[I] = DG_FMA(alpha, temp, y[I]);
241 }
242 }
243 }
244 }
245 else //typically a derivative in z (since n*n*Nx*Ny > 100*Nz)
246 {
247
248 for (int sik = 0; sik < left_size*num_rows*n; sik++)
249 {
250 int s = sik / (num_rows*n);
251 int i = (sik % (num_rows*n)) / n;
252 int k = (sik % (num_rows*n)) % n;
253
254 for( int d=0; d<blocks_per_line; d++)
255 {
256 int C = cols_idx[i*blocks_per_line+d];
257 J[d] = ( C == -1 ? -1 :(s*num_cols+C)*n );
258 int B = (data_idx[i*blocks_per_line+d]*n+k)*n;
259 for(int q=0; q<n; q++)
260 dprivate[d*n+q] = data[B+q];
261 }
262 for( int j=right_range[0]; j<right_range[1]; j++)
263 {
264 int I = ((s*num_rows + i)*n+k)*right_size+j;
265 // if y[I] isnan then even beta = 0 does not make it 0
266 y[I] = beta == 0 ? (value_type)0 : y[I]*beta;
267 for( int d=0; d<blocks_per_line; d++)
268 {
269 int Jd = J[d];
270 if( Jd == -1)
271 continue;
272 value_type temp = 0;
273 for( int q=0; q<n; q++) //multiplication-loop
274 temp = DG_FMA( dprivate[ d*n+q],
275 x[(Jd+q)*right_size+j],
276 temp);
277 y[I] = DG_FMA(alpha, temp, y[I]);
278 }
279 }
280 }
281 }
282 }
283}
284
285template<class real_type, class value_type, int n>
287 const real_type * RESTRICT data_ptr, const int * RESTRICT cols_ptr,
288 const int * RESTRICT block_ptr,
289 const int num_rows, const int num_cols, const int blocks_per_line,
290 const int left_size, const int right_size,
291 const int * RESTRICT right_range_ptr,
292 const value_type * RESTRICT x_ptr, value_type * RESTRICT y_ptr)
293{
294 if( blocks_per_line == 1)
296 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
297 right_range_ptr, x_ptr,y_ptr);
298 else if (blocks_per_line == 2)
300 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
301 right_range_ptr, x_ptr,y_ptr);
302 else if (blocks_per_line == 3)
304 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
305 right_range_ptr, x_ptr,y_ptr);
306 else if (blocks_per_line == 4)
308 cols_ptr, block_ptr, num_rows, num_cols, left_size, right_size,
309 right_range_ptr, x_ptr,y_ptr);
310 else
312 block_ptr, num_rows, num_cols, blocks_per_line, n, left_size,
313 right_size, right_range_ptr, x_ptr,y_ptr);
314}
315
316
317template<class real_type, template<class> class Vector>
318template<class value_type>
320{
321 const real_type* data_ptr = thrust::raw_pointer_cast( &data[0]);
322 const int* cols_ptr = thrust::raw_pointer_cast( &cols_idx[0]);
323 const int* block_ptr = thrust::raw_pointer_cast( &data_idx[0]);
324 const int* right_range_ptr = thrust::raw_pointer_cast( &right_range[0]);
325 if( n == 1)
327 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
328 right_size, right_range_ptr, x_ptr,y_ptr);
329
330 else if( n == 2)
332 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
333 right_size, right_range_ptr, x_ptr,y_ptr);
334 else if( n == 3)
336 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
337 right_size, right_range_ptr, x_ptr,y_ptr);
338 else if( n == 4)
340 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
341 right_size, right_range_ptr, x_ptr,y_ptr);
342 else if( n == 5)
344 cols_ptr, block_ptr, num_rows, num_cols, blocks_per_line, left_size,
345 right_size, right_range_ptr, x_ptr,y_ptr);
346 else
348 block_ptr, num_rows, num_cols, blocks_per_line, n, left_size,
349 right_size, right_range_ptr, x_ptr,y_ptr);
350}
351
352template<class real_type, class value_type, template<class> class Vector>
354{
355 for (int skj = 0; skj < m.left_size*m.n*m.right_size; skj++)
356 {
357 int s = skj / (m.n*m.right_size);
358 int k = (skj % (m.n*m.right_size)) / m.right_size;
359 int j = (skj % (m.n*m.right_size)) % m.right_size;
360 for (int i = 0; i < m.num_entries; i++)
361 {
362 int I = ((s*m.num_rows + m.rows_idx[i])*m.n + k)*m.right_size + j;
363 value_type temp = 0;
364 for (int q = 0; q < m.n; q++) //multiplication-loop
365 temp = DG_FMA(m.data[(m.data_idx[i] * m.n + k)*m.n + q],
366 //x[((s*m.num_cols + m.cols_idx[i])*m.n+q)*m.right_size+j],
367 x[m.cols_idx[i]][(q*m.left_size +s )*m.right_size+j],
368 temp);
369 y[I] = DG_FMA(alpha, temp, y[I]);
370 }
371 }
372}
373template<class real_type, class value_type, int n, template<class > class Vector>
375{
376 bool trivial = true;
377 int CC = m.cols_idx[0], DD = m.data_idx[0];
378 for( int i=0; i<m.num_entries; i++)
379 if( CC+i != m.cols_idx[i] || DD+i != m.data_idx[i])
380 trivial=false;
381 if( trivial)
382 {
383 for (int sj = 0; sj < m.left_size*m.right_size; sj++)
384 {
385 int s = sj / m.right_size;
386 int j = (sj % m.right_size) % m.right_size;
387 for( int k=0; k<n; k++)
388 {
389 for (int i = 0; i < m.num_entries; i++)
390 {
391 int I = ((s*m.num_rows + m.rows_idx[i])*n + k)*m.right_size + j;
392 int DDD = ((DD +i)*n+k)*n, CCC = CC+i;
393 value_type temp = 0;
394 for (int q = 0; q < n; q++) //multiplication-loop
395 temp = DG_FMA(m.data[DDD + q],
396 //x[((s*m.num_cols + CCC)*n+q)*m.right_size+sj],
397 x[CCC][q*m.left_size*m.right_size +sj],
398 temp);
399 y[I] = DG_FMA(alpha, temp, y[I]);
400 }
401 }
402 }
403 }
404 else
405 {
406 for (int sj = 0; sj < m.left_size*m.right_size; sj++)
407 {
408 int s = sj / m.right_size;
409 int j = (sj % m.right_size) % m.right_size;
410 for( int k=0; k<n; k++)
411 {
412 for (int i = 0; i < m.num_entries; i++)
413 {
414 int I = ((s*m.num_rows + m.rows_idx[i])*n + k)*m.right_size + j;
415 value_type temp = 0;
416 for (int q = 0; q < n; q++) //multiplication-loop
417 temp = DG_FMA(m.data[(m.data_idx[i] * n + k)*n + q],
418 //x[((s*m.num_cols + m.cols_idx[i])*n+q)*m.right_size+j],
419 x[m.cols_idx[i]][q*m.left_size*m.right_size +sj],
420 temp);
421 y[I] = DG_FMA(alpha, temp, y[I]);
422 }
423 }
424 }
425 }
426}
427template<class real_type, template<class> class Vector>
428template<class value_type>
429void CooSparseBlockMat<real_type, Vector>::launch_multiply_kernel( SerialTag, value_type alpha, const value_type** x, value_type beta, value_type* RESTRICT y) const
430{
431 if( num_entries==0)
432 return;
433 assert( beta == 1 && "Beta != 1 yields wrong results in CooSparseBlockMat!!");
434 if( n == 1)
436 else if( n == 2)
438 else if( n == 3)
440 else if( n == 4)
442 else
443 coo_cpu_multiply_kernel<real_type, value_type>( alpha, x, beta, y, *this);
444}
445
446}//namespace dg
@ y
y direction
@ x
x direction
const double m
const double alpha
const double n
This is the namespace for all functions and classes defined and used by the discontinuous Galerkin li...
void coo_cpu_multiply_kernel(value_type alpha, const value_type **x, value_type, value_type *RESTRICT y, const CooSparseBlockMat< real_type, Vector > &m)
Definition sparseblockmat_cpu_kernels.h:353
void ell_cpu_multiply_kernel(value_type alpha, value_type beta, const real_type *RESTRICT data, const int *RESTRICT cols_idx, const int *RESTRICT data_idx, const int num_rows, const int num_cols, const int blocks_per_line, const int n, const int left_size, const int right_size, const int *RESTRICT right_range, const value_type *RESTRICT x, value_type *RESTRICT y)
Definition sparseblockmat_cpu_kernels.h:17
void call_ell_cpu_multiply_kernel(value_type alpha, value_type beta, const real_type *RESTRICT data_ptr, const int *RESTRICT cols_ptr, const int *RESTRICT block_ptr, const int num_rows, const int num_cols, const int blocks_per_line, const int left_size, const int right_size, const int *RESTRICT right_range_ptr, const value_type *RESTRICT x_ptr, value_type *RESTRICT y_ptr)
Definition sparseblockmat_cpu_kernels.h:286
const double beta
Coo Sparse Block Matrix format.
Definition sparseblockmat.h:228
Ell Sparse Block Matrix format.
Definition sparseblockmat.h:46
Indicate sequential execution.
Definition execution_policy.h:26
double value_type