Discontinuous Galerkin Library
#include "dg/algorithm.h"
sparsematrix_cpu.h
1#pragma once
2
3#include <vector>
4#include <algorithm>
5#include <numeric>
6
7namespace dg
8{
9namespace detail
10{
11
12
13// MW: if ever needed, the assembly of the sparsity structure of A and the computation of A's values can be separated
14// TODO Make gemm pointers restrict!?
15
16// A = B*C
17// We do not filter out explicit zeros in A (entries that compute to exactly zero are kept)
18// Entries in A are sorted (even if B and/or C are not)
19template<class I, class V>
20void spgemm_cpu_kernel(
21 size_t B_num_rows, size_t B_num_cols, size_t C_num_cols,
22 const I& B_pos , const I& B_idx, const V& B_val,
23 const I& C_pos , const I& C_idx, const V& C_val,
24 I& A_pos , I& A_idx, V& A_val
25)
26{
27 // We follow
28 // https://commit.csail.mit.edu/papers/2018/kjolstad-18-workspaces.pdf
29 // Sparse data structures do not support fast random inserts ...
30 // That is why a dense workspace is inserted
31 // ... a dense workspace turns sparse-sparse into sparse-dense iterations (fewer conditional checks)
32 // Workspaces make the merge easier, at the cost of managing the workspace and of reduced temporal locality
33 //
34 // 1. Figure 10: assemble sparsity structure of A
35 // (Theoretically this could be cached if we did the same multiplication multiple times)
36 A_pos.resize( B_num_rows+1);
37 A_pos[0] = 0;
38 std::vector<bool> w( C_num_cols, false); // A dense work array
39 // A_ij = B_ik C_kj
40 I wlist; // one row of A
41 for( int i = 0; i<(int)B_num_rows; i++)
42 {
43 for( int pB = B_pos[i]; pB < B_pos[i+1]; pB++)
44 {
45 int k = B_idx[pB];
46 for( int pC = C_pos[k]; pC < C_pos[k+1]; pC++)
47 {
48 int j = C_idx[pC];
49 if( not w[j])
50 {
51 wlist.push_back( j);
52 w[j] = true;
53 }
54 }
55 }
56 // Sort entries of A in each row
57 std::sort( wlist.begin(), wlist.end());
58
59 // Append wlist to A_idx
60 A_idx.resize( A_pos[i] + wlist.size());
61 A_pos[i+1] = A_pos[i] + wlist.size();
62 for( int pwlist = 0; pwlist < (int)wlist.size(); pwlist ++)
63 {
64 int j = wlist[pwlist];
65 A_idx[ A_pos[i] + pwlist ] = j;
66 w[j] = false;
67 }
68 wlist.resize( 0);
69 }
70 A_val.resize( A_pos[B_num_rows]);
71 // 2. Figure 1d) Do the actual multiplication
72
73 // A dense workspace array
74 V workspace( C_num_cols, 0);
75 for (int i = 0; i < (int)B_num_rows; i++)
76 {
77 // Dense Workspace array
78 for (int pB = B_pos[i]; pB < B_pos[i+1]; pB++)
79 {
80 int k = B_idx[pB];
81 for (int pC = C_pos[k]; pC < C_pos[k+1]; pC++)
82 {
83 int j = C_idx[pC];
84 workspace[j] += B_val[pB] * C_val[pC];
85 }
86 }
87 // We need to know the sparsity structure of A assembled above
88 for (int pA = A_pos[i]; pA < A_pos[i+1]; pA++)
89 {
90 int j = A_idx[pA];
91 A_val[pA] = workspace[j];
92 workspace[j] = 0;
93 }
94 }
95}
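A minimal usage sketch of the kernel above (not part of the header): it multiplies two 2x2 CSR matrices, B = [[1,2],[0,3]] and C = [[4,0],[0,5]], into A = B*C = [[4,10],[0,15]]. It calls the internal dg::detail function directly and assumes the header is reachable through the library's main include dg/algorithm.h shown at the top of this page; all data values are made up for illustration.

#include <vector>
#include <iostream>
#include "dg/algorithm.h"

int main()
{
    // B = [[1,2],[0,3]] and C = [[4,0],[0,5]] in CSR form (row pointers, column indices, values)
    std::vector<int>    B_pos = {0,2,3}, B_idx = {0,1,1};
    std::vector<double> B_val = {1.,2.,3.};
    std::vector<int>    C_pos = {0,1,2}, C_idx = {0,1};
    std::vector<double> C_val = {4.,5.};
    std::vector<int>    A_pos, A_idx;
    std::vector<double> A_val;
    // A = B*C; the kernel first assembles A's sparsity pattern, then fills its values
    dg::detail::spgemm_cpu_kernel( 2, 2, 2,
        B_pos, B_idx, B_val, C_pos, C_idx, C_val, A_pos, A_idx, A_val);
    // Expected result: A_pos = {0,2,3}, A_idx = {0,1,1}, A_val = {4,10,15}
    for( double v : A_val)
        std::cout << v << " ";
    std::cout << "\n";
}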
96
97// A = B + C
98// We do not filter out explicit zeros in A (entries that compute to exactly zero are kept)
99// Entries in A are sorted
100template<class I, class V>
101void spadd_cpu_kernel(
102 size_t B_num_rows, size_t B_num_cols,
103 const I& B_pos , const I& B_idx, const V& B_val,
104 const I& C_pos , const I& C_idx, const V& C_val,
105 I& A_pos , I& A_idx, V& A_val // restrict !
106)
107{
108 // We follow
109 // https://commit.csail.mit.edu/papers/2018/kjolstad-18-workspaces.pdf
110 //
111 // 1. (No Figure) assemble sparsity structure of A
112 // (Theoretically this could be cached if we did the same multiplication multiple times)
113 A_pos.resize( B_num_rows+1);
114 A_pos[0] = 0;
115 std::vector<bool> w( B_num_cols, false); // A dense work array
116 // A_ij = B_ij + C_ij
117 I wlist; // one row of A
118 for( int i = 0; i<(int)B_num_rows; i++)
119 {
120 // Mark the columns present in row i of B
121 for (int pB = B_pos[i]; pB < B_pos[i+1]; pB++)
122 {
123 int ib = B_idx[pB];
124 wlist.push_back(ib);
125 w[ib] = true;
126 }
127 for (int pC = C_pos[i]; pC < C_pos[i+1]; pC++)
128 {
129 int ic = C_idx[pC];
130 if( not w[ic])
131 {
132 wlist.push_back(ic);
133 w[ic] = true;
134 }
135 }
136 // Sort entries of A in each row
137 std::sort( wlist.begin(), wlist.end());
138 // Append wlist to A_idx
139 A_idx.resize( A_pos[i] + wlist.size());
140 A_pos[i+1] = A_pos[i] + wlist.size();
141 for( int pwlist = 0; pwlist < (int)wlist.size(); pwlist ++)
142 {
143 int j = wlist[pwlist];
144 A_idx[ A_pos[i] + pwlist ] = j;
145 w[j] = false;
146 }
147 wlist.resize( 0);
148 }
149 A_val.resize( A_pos[B_num_rows]);
150 //
151 // 2. Figure 6) Sparse vector addition
152
153 // A dense workspace array
154 V workspace( B_num_cols, 0);
155 for (int i = 0; i < (int)B_num_rows; i++)
156 {
157 // Dense Workspace array
158 for (int pB = B_pos[i]; pB < B_pos[i+1]; pB++)
159 {
160 int ib = B_idx[pB];
161 workspace[ib] = B_val[pB];
162 }
163 for (int pC = C_pos[i]; pC < C_pos[i+1]; pC++)
164 {
165 int ic = C_idx[pC];
166 workspace[ic] += C_val[pC];
167 }
168 // We need to know the sparsity structure of A assembled above
169 for (int pA = A_pos[i]; pA < A_pos[i+1]; pA++)
170 {
171 int ia = A_idx[pA];
172 A_val[pA] = workspace[ia];
173 workspace[ia] = 0;
174 }
175 }
176}
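A similar sketch for the addition kernel above, again with made-up data and a direct call into dg::detail: A = B + C for B = [[1,0],[0,2]] and C = [[0,3],[4,0]] gives A = [[1,3],[4,2]].

#include <vector>
#include "dg/algorithm.h"

int main()
{
    // B = [[1,0],[0,2]] and C = [[0,3],[4,0]] in CSR form
    std::vector<int>    B_pos = {0,1,2}, B_idx = {0,1};
    std::vector<double> B_val = {1.,2.};
    std::vector<int>    C_pos = {0,1,2}, C_idx = {1,0};
    std::vector<double> C_val = {3.,4.};
    std::vector<int>    A_pos, A_idx;
    std::vector<double> A_val;
    // A = B + C; the entries of each row of A come out sorted by column index
    dg::detail::spadd_cpu_kernel( 2, 2,
        B_pos, B_idx, B_val, C_pos, C_idx, C_val, A_pos, A_idx, A_val);
    // Expected result: A_pos = {0,2,4}, A_idx = {0,1,0,1}, A_val = {1,3,4,2}
}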
177
178struct CSRCache_cpu
179{
180 void forget(){}
181};
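Note on the struct above: CSRCache_cpu is intentionally empty and forget() is a no-op; the spmv kernel below accepts a cache argument but never reads or writes it. Presumably this keeps the CPU kernel's calling interface uniform with backends that do cache data, though that is only an inference from this file.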
182// y = alpha*A*x + beta*y
183template<class I, class V, class value_type, class C1, class C2>
184void spmv_cpu_kernel(
185 CSRCache_cpu& cache,
186 size_t A_num_rows, size_t A_num_cols, size_t A_nnz,
187 const I* RESTRICT A_pos , const I* RESTRICT A_idx, const V* RESTRICT A_val,
188 value_type alpha, value_type beta, const C1* RESTRICT x_ptr, C2* RESTRICT y_ptr
189)
190{
191 if( beta == value_type(1))
192 {
193// #pragma omp for nowait
194 for(int i = 0; i < (int)A_num_rows; i++)
195 {
196 for (int jj = A_pos[i]; jj < A_pos[i+1]; jj++)
197 {
198 int j = A_idx[jj];
199 y_ptr[i] = DG_FMA( alpha*A_val[jj], x_ptr[j], y_ptr[i]);
200 }
201 }
202 }
203 else
204 {
205// #pragma omp for nowait
206 for(int i = 0; i < (int)A_num_rows; i++)
207 {
208 value_type temp = 0;
209 for (int jj = A_pos[i]; jj < A_pos[i+1]; jj++)
210 {
211 int j = A_idx[jj];
212 temp = DG_FMA( alpha*A_val[jj], x_ptr[j], temp);
213 }
214
215 y_ptr[i] = DG_FMA( beta, y_ptr[i], temp);
216 }
217 }
218}
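Finally, a sketch of the sparse matrix-vector product y = alpha*A*x + beta*y using the kernel above. The RESTRICT and DG_FMA macros are internal to the library and do not appear on the caller's side; as in the sketches above, the data and the direct dg::detail call are for illustration only.

#include <vector>
#include <iostream>
#include "dg/algorithm.h"

int main()
{
    // A = [[1,2],[0,3]] in CSR form, x = (1,1), y = (10,10)
    std::vector<int>    A_pos = {0,2,3}, A_idx = {0,1,1};
    std::vector<double> A_val = {1.,2.,3.};
    std::vector<double> x = {1.,1.}, y = {10.,10.};
    dg::detail::CSRCache_cpu cache; // stateless, see the note above
    // y <- 2*A*x + 1*y (beta == 1 takes the first branch of the kernel)
    dg::detail::spmv_cpu_kernel( cache, 2, 2, 3,
        A_pos.data(), A_idx.data(), A_val.data(),
        2., 1., x.data(), y.data());
    std::cout << y[0] << " " << y[1] << "\n"; // expected: 16 16
}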
219
220
221
222}// namespace detail
223
224} // namespace dg
void spadd_cpu_kernel(size_t B_num_rows, size_t B_num_cols, const I &B_pos, const I &B_idx, const V &B_val, const I &C_pos, const I &C_idx, const V &C_val, I &A_pos, I &A_idx, V &A_val)
Definition sparsematrix_cpu.h:101
void spmv_cpu_kernel(CSRCache_cpu &cache, size_t A_num_rows, size_t A_num_cols, size_t A_nnz, const I *RESTRICT A_pos, const I *RESTRICT A_idx, const V *RESTRICT A_val, value_type alpha, value_type beta, const C1 *RESTRICT x_ptr, C2 *RESTRICT y_ptr)
Definition sparsematrix_cpu.h:184
void spgemm_cpu_kernel(size_t B_num_rows, size_t B_num_cols, size_t C_num_cols, const I &B_pos, const I &B_idx, const V &B_val, const I &C_pos, const I &C_idx, const V &C_val, I &A_pos, I &A_idx, V &A_val)
Definition sparsematrix_cpu.h:20
dg
This is the namespace for all functions and classes defined and used by the discontinuous Galerkin library.
CSRCache_cpu
Definition sparsematrix_cpu.h:179
void forget()
Definition sparsematrix_cpu.h:180