Discontinuous Galerkin Library
#include "dg/algorithm.h"
mpi_matrix.h
Go to the documentation of this file.
1#pragma once
2
3#include "mpi_gather_kron.h"
4#include "sparseblockmat.h"
5#include "memory.h"
6#include "timer.h"
7
8
9
16namespace dg {
17namespace blas2 {
18//forward declare blas2 symv functions
19template< class MatrixType, class ContainerType1, class ContainerType2>
20void symv( MatrixType&& M,
21 const ContainerType1& x,
22 ContainerType2& y);
23template< class FunctorType, class MatrixType, class ContainerType1, class ContainerType2>
24void stencil( FunctorType f, MatrixType&& M,
25 const ContainerType1& x,
26 ContainerType2& y);
27template< class MatrixType, class ContainerType1, class ContainerType2>
28void symv( get_value_type<ContainerType1> alpha,
29 MatrixType&& M,
30 const ContainerType1& x,
31 get_value_type<ContainerType1> beta,
32 ContainerType2& y);
33namespace detail
34{
35template< class Stencil, class ContainerType, class ...ContainerTypes>
36void doParallelFor( SharedVectorTag, Stencil f, unsigned N, ContainerType&& x, ContainerTypes&&... xs);
37}
38}//namespace blas2
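The three overloads forward declared above are the free functions that the matrix classes below call on their local pieces. As a reminder of the call syntax, here is a minimal shared-memory sketch (serial, no MPI; the grid sizes are arbitrary example values):

#include <cmath>
#include "dg/algorithm.h"

int main()
{
    // a 1d grid with 3 polynomial coefficients per cell and 20 cells (example values)
    dg::Grid1d g1d( 0., 2.*M_PI, 3, 20, dg::PER);
    dg::HVec x = dg::evaluate( [](double s){ return sin(s);}, g1d), y( x);
    auto dx = dg::create::dx( g1d);      // a local EllSparseBlockMat
    dg::blas2::symv( dx, x, y);          // y = (d/dx) sin = cos
    dg::blas2::symv( 2., dx, x, 1., y);  // y <- 2*dx*x + 1*y
    return 0;
}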
39
133
141template<template<class > class Vector, class LocalMatrixInner, class LocalMatrixOuter = LocalMatrixInner>
142struct MPISparseBlockMat
143{
144 MPISparseBlockMat() = default;
145 MPISparseBlockMat( const LocalMatrixInner& inside,
146 const LocalMatrixOuter& outside,
147 const MPIKroneckerGather<Vector>& mpi_gather // only gather foreign messages
148 )
149 : m_i(inside), m_o(outside), m_g(mpi_gather)
150 {
151 }
152 template< template<class> class V, class LI, class LO>
153 friend class MPISparseBlockMat; // enable copy
154
155 template< template<class> class V, class LI, class LO>
156 MPISparseBlockMat( const MPISparseBlockMat<V,LI,LO>& src)
157 : m_i(src.m_i), m_o(src.m_o), m_g(src.m_g)
158 {}
159
161 const LocalMatrixInner& inner_matrix() const{return m_i;}
163 const LocalMatrixOuter& outer_matrix() const{return m_o;}
165 LocalMatrixInner& inner_matrix() {return m_i;}
167 LocalMatrixOuter& outer_matrix() {return m_o;}
168
169 MPI_Comm communicator() const { return m_g.communicator();}
170
183 template<class ContainerType1, class ContainerType2>
184 void symv( dg::get_value_type<ContainerType1> alpha, const ContainerType1&
185 x, dg::get_value_type<ContainerType1> beta, ContainerType2& y) const
186 {
187 // ContainerType is MPI_Vector here...
188 //the blas2 functions should make enough static assertions on types
189 if( !m_g.isCommunicating()) //no communication needed
190 {
191 dg::blas2::symv( alpha, m_i, x.data(), beta, y.data());
192 return;
193
194 }
195 int result;
196 MPI_Comm_compare( x.communicator(), y.communicator(), &result);
197 assert( result == MPI_CONGRUENT || result == MPI_IDENT);
198 using value_type = dg::get_value_type<ContainerType1>;
199
200 m_buffer_ptrs.template set<const value_type*>( m_o.num_cols);
201 auto& buffer_ptrs = m_buffer_ptrs.template get<const value_type*>();
202 // 1 initiate communication
203 m_g.global_gather_init( x.data());
204 // 2 compute inner points
205 dg::blas2::symv( alpha, m_i, x.data(), beta, y.data());
206 // 3 wait for communication to finish
207 m_g.global_gather_wait( x.data(), buffer_ptrs);
208 // Local computation may be unnecessary
209 if( buffer_ptrs.size() > 0)
210 {
211 // 4 compute and add outer points
212 const value_type** b_ptrs = thrust::raw_pointer_cast( buffer_ptrs.data());
213 value_type* y_ptr = thrust::raw_pointer_cast( y.data().data());
214 m_o.symv( SharedVectorTag(), get_execution_policy<ContainerType1>(),
215 alpha, b_ptrs, value_type(1.), y_ptr);
216 }
217 }
218 template<class ContainerType1, class ContainerType2>
219 void symv(const ContainerType1& x, ContainerType2& y) const
220 {
221 symv( 1, x, 0, y);
222 }
223 private:
224 LocalMatrixInner m_i;
225 LocalMatrixOuter m_o;
226 MPIKroneckerGather<Vector> m_g;
227 mutable detail::AnyVector<Vector> m_buffer_ptrs;
228};
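In formulas, the symv above computes the row-distributed product in two parts, a purely local one and a halo part that only needs the gathered foreign blocks (the symbols M^inner, M^outer and G_k are notation introduced here for illustration, not library identifiers; k denotes the calling rank):

    y_k \leftarrow \alpha\,\bigl( M^{\mathrm{inner}}_k x_k + M^{\mathrm{outer}}_k\, G_k x \bigr) + \beta\, y_k

where x_k, y_k are the locally stored chunks of x and y and G_k x is the buffer filled by the MPIKroneckerGather. Because M^inner_k only touches x_k, step 2 can run while the gather of G_k x is in flight.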
229
230
241template<class real_type, class ConversionPolicyRows, class ConversionPolicyCols>
242auto make_mpi_sparseblockmat( // not actually communicating
243 const EllSparseBlockMat<real_type, thrust::host_vector>& src,
244 const ConversionPolicyRows& g_rows, // must be 1d
245 const ConversionPolicyCols& g_cols
246)
247{
248 // Indices in src are BLOCK INDICES!
249 // g_rows and g_cols can have different n!!
250 // Our approach here is:
251 // 1. some preliminary MPI tests
252 // 2. grab the correct rows of src and mark rows that are communicating
253 // we need to grab the entire row to ensure reproducibility (which is guaranteed by the order of computations)
254 // 3. Convert inner rows to local and move outer rows to Coo format vectors (invalidating them in inner)
255 // 4. Use global columns vector to construct MPI Kronecker Gather
256 // 5. Use buffer index to construct local CooSparseBlockMat
258 int result;
259 MPI_Comm_compare( g_rows.communicator(), g_cols.communicator(), &result);
260 assert( result == MPI_CONGRUENT or result == MPI_IDENT);
261 int rank;
262 MPI_Comm_rank( g_cols.communicator(), &rank);
263 int ndim;
264 MPI_Cartdim_get( g_cols.communicator(), &ndim);
265 assert( ndim == 1);
266 int dim, period, coord;
267 MPI_Cart_get( g_cols.communicator(), 1, &dim, &period, &coord); // coord of the calling process
268 if( dim == 1)
269 {
272 }
273 unsigned n = src.n, rn = g_rows.n()/n, cn = g_cols.n()/n;
274 int bpl = src.blocks_per_line;
275 EllSparseBlockMat<real_type, thrust::host_vector> inner(g_rows.local().N()*rn, g_cols.local().N()*cn, // number of blocks
276 bpl, src.data.size()/(src.n*src.n), src.n);
277 inner.set_left_size( src.left_size);
278 inner.set_right_size( src.right_size);
279 //first copy the data elements (even though not all of them may be needed, copying does not slow things down)
280 inner.data = src.data;
281 std::vector<bool> local_row( inner.num_rows, true);
282 //2. now grab the right rows of the cols and data indices (and mark communicating rows)
283 for( int i=0; i<inner.num_rows; i++)
284 for( int k=0; k<bpl; k++)
285 {
286 int gIdx=0;
287 assert( g_rows.local2globalIdx( i*src.n, rank, gIdx)); // convert from block idx to idx !
288 inner.data_idx[i*bpl + k] = src.data_idx[ gIdx/src.n*bpl + k];
289 inner.cols_idx[i*bpl + k] = src.cols_idx[ gIdx/src.n*bpl + k];
290 if( inner.cols_idx[i*bpl+k] == invalid_idx)
291 continue;
292 // find out if row is communicating
293 int lIdx = 0, pid = 0;
294 gIdx = inner.cols_idx[i*bpl + k]*src.n; // convert from block idx to idx !
295 assert( g_cols.global2localIdx( gIdx, lIdx, pid));
296 if ( pid != rank)
297 local_row[i] = false;
298 }
299 // 3. Convert inner rows to local and move outer rows to Coo format vectors (invalidating them in inner)
300 thrust::host_vector<std::array<int,2>> gColIdx;
301 thrust::host_vector<int> rowIdx;
302 thrust::host_vector<int> dataIdx;
303 for( int i=0; i<inner.num_rows; i++)
304 for( int k=0; k<bpl; k++)
305 {
306 int gIdx = inner.cols_idx[i*bpl + k];
307 int lIdx = 0, pid = 0;
308 if( gIdx != invalid_idx)
309 {
310 assert( g_cols.global2localIdx( gIdx*src.n, lIdx, pid));
311 lIdx = lIdx/src.n; // convert from idx to block idx !
312 }
313 else
314 lIdx = invalid_idx;
315 if ( !local_row[i] )
316 {
317 // Simply skip invalid entries in Coo matrix
318 if( gIdx != invalid_idx)
319 {
320 rowIdx.push_back( i);
321 dataIdx.push_back( inner.data_idx[i*bpl+k]);
322 gColIdx.push_back( {pid, lIdx});
323 inner.cols_idx[i*bpl + k] = invalid_idx;
324 }
325 }
326 else
327 {
328 inner.cols_idx[i*bpl + k] = lIdx;
329 }
330 }
331 // Now make MPI Gather object
332 thrust::host_vector<int> lColIdx;
333 auto gather_map = dg::gIdx2unique_idx( gColIdx, lColIdx);
334
336 inner.num_cols, inner.right_size, g_cols.communicator());
338 src.n, src.left_size, src.right_size);
339 outer.data = src.data; outer.cols_idx = lColIdx;
340 outer.rows_idx = rowIdx; outer.data_idx = dataIdx;
341 outer.num_entries = rowIdx.size();
342
345}
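The index juggling in steps 2 and 3 above converts between block indices and point indices: a block (column) index j corresponds to the n consecutive point indices j*n, ..., j*n+n-1, which is why the code multiplies by src.n before calling the grid's index converters and divides by src.n afterwards. A small self-contained sketch with made-up numbers (n = 3, 4 blocks per rank, contiguous partition, no boundary conditions) illustrates the arithmetic; the real work is of course done by local2globalIdx / global2localIdx:

#include <cassert>

int main()
{
    const int n = 3;        // each block couples n polynomial coefficients
    const int N_local = 4;  // blocks (cells) owned by each rank (example value)
    // local block row i on rank r starts at global point index (r*N_local + i)*n
    int rank = 1, i = 2;
    int gIdx = (rank*N_local + i)*n;      // = 18, a point index
    assert( gIdx/n == rank*N_local + i);  // back to the global block index
    // a global block column j is owned by rank j/N_local with local block index j%N_local
    int j = 5;                            // global block column
    int pid  = j/N_local;                 // = 1, the owning rank
    int lIdx = j%N_local;                 // = 1, the local block index on that rank
    assert( pid == 1 && lIdx == 1);
    return 0;
}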
346
347/*
348// ///@cond
349// namespace detail
350// {
351// // A combined Symv with a scatter of rows
352// // Not good for cuda if number of columns is high so can be removed
353// struct CSRSymvScatterFilter
354// {
355// template<class real_type>
356// DG_DEVICE
357// void operator()( unsigned i, const int* scatter, const int* row_offsets,
358// const int* column_indices, const real_type* values,
359// const real_type* x, real_type* y)
360// {
361// for( int k=row_offsets[i]; k<row_offsets[i+1]; k++)
362// y[scatter[i]] += x[column_indices[k]]*values[k];
363// }
364// };
365// }// namespace detail
366// /// @endcond
367*/
370enum dist_type
371{
372 row_dist=0,
373 allreduce=2
374};
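In short, the two distributions differ in where the final sum over ranks happens (A_{I_k,:}, A^{(k)} and the index set I_k are notation introduced here for illustration):

    row_dist:   rank k owns the complete rows I_k of A, so  y_{I_k} = A_{I_k,:}\, x  needs gathered columns of x but no reduction of y;
    allreduce:  rank k only holds a partial matrix A^{(k)} with A = \sum_k A^{(k)} acting on its local points, so  y = \sum_k A^{(k)} x_k  is finished by an MPI_Allreduce on y (what m_reduce.reduce does below).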
376
377
378
393template<template<class > class Vector, class LocalMatrixInner, class LocalMatrixOuter = LocalMatrixInner>
394struct MPIDistMat
395{
397 MPIDistMat() = default;
398
404 MPIDistMat( const LocalMatrixInner& m)
405 : m_i(m), m_dist(row_dist){}
406
407 template< template<class> class V, class LI, class LO>
408 friend class MPIDistMat; // enable copy
427 MPIDistMat( const LocalMatrixInner& inside,
428 const LocalMatrixOuter& outside,
429 const MPIGather<Vector>& mpi_gather,
430 const Vector<int>& scatter // TODO scatter comp -> y from symv(outside, recv, comp);
431 )
432 : m_i(inside), m_o(outside), m_g(mpi_gather), m_scatter(scatter),
433 m_dist( row_dist), m_reduce(mpi_gather.communicator()) { }
434
442 MPIDistMat( const LocalMatrixInner& m, MPI_Comm comm)
443 : m_i(m), m_dist(allreduce), m_reduce(comm){}
444
457 template<template<class> class OtherVector, class OtherMatrixInner, class OtherMatrixOuter>
458 MPIDistMat( const MPIDistMat<OtherVector, OtherMatrixInner, OtherMatrixOuter>& src)
459 : m_i(src.inner_matrix()), m_o( src.outer_matrix()), m_g(src.mpi_gather()),
460 m_scatter( src.m_scatter), m_dist( src.m_dist), m_reduce(src.communicator())
461 { }
463 const LocalMatrixInner& inner_matrix() const{return m_i;}
465 const LocalMatrixOuter& outer_matrix() const{return m_o;}
467 const MPIGather<Vector>& mpi_gather() const{return m_g;}
468
470 MPI_Comm communicator() const{ return m_reduce.communicator();}
471
472 const Vector<int>& scatter() const {return m_scatter;}
473
484 template<class ContainerType1, class ContainerType2>
485 void symv( const ContainerType1& x, ContainerType2& y) const
486 {
487 //the blas2 functions should make enough static assertions on types
488 if( !m_g.isCommunicating()) //no communication needed
489 {
490 dg::blas2::symv( m_i, x.data(), y.data());
491 if( m_dist == allreduce)
492 {
493 m_reduce.reduce( y.data());
494 }
495 return;
496
497 }
498 assert( m_dist == row_dist);
499 int result;
500 MPI_Comm_compare( x.communicator(), y.communicator(), &result);
501 assert( result == MPI_CONGRUENT || result == MPI_IDENT);
502
503 using value_type = dg::get_value_type<ContainerType1>;
504 m_recv_buffer.template set<value_type>( m_g.buffer_size());
505 m_comp_buffer.template set<value_type>( m_o.num_rows());
506 auto& recv_buffer = m_recv_buffer.template get<value_type>();
507 auto& comp_buffer = m_comp_buffer.template get<value_type>();
508
509 // 1 initiate communication
510 m_g.global_gather_init( x.data(), recv_buffer);
511 // 2 compute inner points
512 dg::blas2::symv( m_i, x.data(), y.data());
513 // 3 wait for communication to finish
514 m_g.global_gather_wait( recv_buffer);
515 if( comp_buffer.size() > 0)
516 {
517 // 4 compute and add outer points
518 dg::blas2::symv( m_o, recv_buffer, comp_buffer);
519 unsigned size = comp_buffer.size();
520 dg::blas2::detail::doParallelFor( SharedVectorTag(),
521 []DG_DEVICE( unsigned i, const value_type* x,
522 const int* idx, value_type* y) {
523 y[idx[i]] += x[i];
524 }, size, comp_buffer, m_scatter, y.data());
525 // What we need is a fused symv + scatter:
526 // Something like
527 // dg::blas2::parallel_for( detail::CSRSymvScatterFilter(),
528 // m_o.num_rows, m_scatter, m_o.row_offsets,
529 // m_o.column_indices, m_o.values, recv_buffer, y.data());
530 // parallel_for is quite slow for ds_mpit because there are ~50 entries per row
531 // TODO Find out if cuSparse can do that and how fast
532 }
533 }
534
536 template<class Functor, class ContainerType1, class ContainerType2>
537 void stencil( const Functor f, const ContainerType1& x, ContainerType2& y) const
538 {
539 //the blas2 functions should make enough static assertions on types
540 if( !m_g.isCommunicating()) //no communication needed
541 {
542 dg::blas2::stencil( f, m_i, x.data(), y.data());
543 if( m_dist == allreduce)
544 {
545 m_reduce.reduce( y.data());
546 }
547 return;
548 }
549 assert( m_dist == row_dist);
550 int result;
551 MPI_Comm_compare( x.communicator(), y.communicator(), &result);
552 assert( result == MPI_CONGRUENT || result == MPI_IDENT);
553
554 using value_type = dg::get_value_type<ContainerType1>;
555 m_recv_buffer.template set<value_type>( m_g.buffer_size());
556 m_comp_buffer.template set<value_type>( m_o.num_rows());
557 auto& recv_buffer = m_recv_buffer.template get<value_type>();
558 auto& comp_buffer = m_comp_buffer.template get<value_type>();
559
560 // 1 initiate communication
561 m_g.global_gather_init( x.data(), recv_buffer);
562 // 2 compute inner points
563 dg::blas2::stencil( f, m_i, x.data(), y.data());
564 // 3 wait for communication to finish
565 m_g.global_gather_wait( recv_buffer);
566 // 4 compute and add outer points
567 dg::blas2::stencil( f, m_o, recv_buffer, comp_buffer);
568 unsigned size = comp_buffer.size();
569 dg::blas2::detail::doParallelFor( SharedVectorTag(),
570 [size]DG_DEVICE( unsigned i, const value_type* x,
571 const int* idx, value_type* y) {
572 y[idx[i]] += x[i];
573 }, size, comp_buffer, m_scatter, y.data());
574 // What we really need here is a sparse vector format
575 // so we can fuse the symv with the scatter
576 // OR y += Ax
577 }
578
579 private:
580 LocalMatrixInner m_i;
581 LocalMatrixOuter m_o;
582 MPIGather<Vector> m_g;
583 Vector<int> m_scatter;
584 enum dist_type m_dist;
585 detail::MPIAllreduce m_reduce;
586 mutable detail::AnyVector<Vector> m_recv_buffer, m_comp_buffer ;
587};
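Both symv implementations above (and stencil) follow the same asynchronous pattern: initiate the gather, compute the part that needs no foreign data, wait, then add the contribution that does. The following self-contained plain-MPI program illustrates that pattern for a periodic centered-difference stencil; it is a sketch of the idea only, not the library's internal communication code:

#include <mpi.h>
#include <vector>

int main( int argc, char* argv[])
{
    MPI_Init( &argc, &argv);
    int rank, size;
    MPI_Comm_rank( MPI_COMM_WORLD, &rank);
    MPI_Comm_size( MPI_COMM_WORLD, &size);
    int left = (rank+size-1)%size, right = (rank+1)%size;

    std::vector<double> x( 100, double(rank)), y( 100, 0.);
    double halo_left = 0., halo_right = 0.;   // boundary values of the neighbours

    // 1 initiate communication (send my boundary values, receive the neighbours')
    MPI_Request req[4];
    MPI_Irecv( &halo_left,  1, MPI_DOUBLE, left,  0, MPI_COMM_WORLD, &req[0]);
    MPI_Irecv( &halo_right, 1, MPI_DOUBLE, right, 1, MPI_COMM_WORLD, &req[1]);
    MPI_Isend( &x.front(),  1, MPI_DOUBLE, left,  1, MPI_COMM_WORLD, &req[2]);
    MPI_Isend( &x.back(),   1, MPI_DOUBLE, right, 0, MPI_COMM_WORLD, &req[3]);

    // 2 compute inner points (no foreign data needed)
    for( unsigned i=1; i<x.size()-1; i++)
        y[i] = x[i+1] - x[i-1];

    // 3 wait for communication to finish
    MPI_Waitall( 4, req, MPI_STATUSES_IGNORE);

    // 4 compute the outer points that need the received halo values
    y.front() = x[1] - halo_left;
    y.back()  = halo_right - x[x.size()-2];

    MPI_Finalize();
    return 0;
}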
588
590
593template<template<class>class V, class LI, class LO>
594struct TensorTraits<MPISparseBlockMat<V, LI, LO>>
595{
596 using value_type = get_value_type<LI>;
597 using tensor_category = MPIMatrixTag;
598};
599template<template<class>class V, class LI, class LO>
600struct TensorTraits<MPIDistMat<V, LI, LO>>
601{
602 using value_type = get_value_type<LI>;
603 using tensor_category = MPIMatrixTag;
604};
606
607} //namespace dg