Discontinuous Galerkin Library
#include "dg/algorithm.h"
mpi_vector.h
#pragma once

#include <cassert>
#include <vector>
#include <thrust/host_vector.h>
#include <thrust/gather.h>
#include "exceptions.h"
#include "exblas/mpi_accumulate.h" //exblas::mpi_reduce_communicator
#include "tensor_traits.h"
#include "blas1_dispatch_shared.h"
#include "mpi_communicator.h"
#include "memory.h"
#include "config.h"

//TODO: should we catch the cases where outer_size \in {1,2,3} in NearestNeighborComm?
namespace dg
{

///mpi Vector class: a data container paired with an MPI communicator
template<class container>
struct MPI_Vector
{
    typedef container container_type; //!< typedef to access the underlying container
    ///no data is allocated, communicators are MPI_COMM_NULL
    MPI_Vector(){
        m_comm = m_comm128 = m_comm128Reduce = MPI_COMM_NULL;
    }
    ///construct a vector from given data and the communicator it belongs to;
    ///generates the reduction communicators via exblas::mpi_reduce_communicator
    MPI_Vector( const container& data, MPI_Comm comm): m_data( data), m_comm(comm) {
        exblas::mpi_reduce_communicator( comm, &m_comm128, &m_comm128Reduce);
    }

    ///Conversion operator: copy-construct from an MPI_Vector with a different container type
    template<class OtherContainer>
    MPI_Vector( const MPI_Vector<OtherContainer>& src){
        m_data = src.data();
        m_comm = src.communicator();
        m_comm128 = src.communicator_mod();
        m_comm128Reduce = src.communicator_mod_reduce();
    }

    ///Get underlying data
    const container& data() const {return m_data;}
    ///Set underlying data
    container& data() {return m_data;}

    ///Get the communicator to which this vector belongs
    MPI_Comm communicator() const{return m_comm;}
    ///Returns a communicator of fixed size 128
    MPI_Comm communicator_mod() const{return m_comm128;}

    ///Returns a communicator consisting of all processes with rank 0 in communicator_mod()
    MPI_Comm communicator_mod_reduce() const{return m_comm128Reduce;}

    ///Set the communicators manually, e.g. with dg::exblas::mpi_reduce_communicator
    void set_communicator(MPI_Comm comm, MPI_Comm comm_mod, MPI_Comm comm_mod_reduce){
        m_comm = comm;
        m_comm128 = comm_mod;
        m_comm128Reduce = comm_mod_reduce;
    }

    ///Return the size of the data object
    unsigned size() const{return m_data.size();}

    ///Swap data and communicators
    void swap( MPI_Vector& src){
        m_data.swap(src.m_data);
        std::swap( m_comm , src.m_comm);
        std::swap( m_comm128 , src.m_comm128);
        std::swap( m_comm128Reduce , src.m_comm128Reduce);
    }
    private:
    container m_data;
    MPI_Comm m_comm, m_comm128, m_comm128Reduce;

};

//free function as required by the std for Swappable
//https://en.cppreference.com/w/cpp/named_req/Swappable
//even though with move assignments std::swap works just as fast
template<class container>
void swap( MPI_Vector<container>& a, MPI_Vector<container>& b){
    a.swap(b);
}
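
/* A minimal usage sketch (added for illustration; assumes an MPI communicator
   'comm', e.g. from MPI_Cart_create, and serial host data):

       thrust::host_vector<double> w( 100, 3.14); //the local chunk of a global vector
       dg::MPI_Vector<thrust::host_vector<double>> v( w, comm);
       unsigned local = v.size();       //local size only, here 100
       dg::MPI_Vector<thrust::host_vector<double>> u;
       u = v;                           //deep copy of data and communicators
       swap( u, v);                     //the free function above, O(1)
*/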

///The tensor traits of MPI_Vector: dispatch on the underlying container
template<class container>
struct TensorTraits<MPI_Vector<container> > {
    using value_type = get_value_type<container>;
    using tensor_category = MPIVectorTag;
    using execution_policy = get_execution_policy<container>;
};

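/* The specialization above is what the blas1 dispatch machinery inspects; a
   small illustrative check of what the traits evaluate to (the static_asserts
   are not part of the library):

       using MHVec = dg::MPI_Vector<thrust::host_vector<double>>;
       static_assert( std::is_same< dg::get_value_type<MHVec>, double>::value, "");
       static_assert( std::is_same< dg::get_tensor_category<MHVec>,
               dg::MPIVectorTag>::value, "");
*/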

///Communicator for asynchronous nearest neighbor communication
template<class Index, class Buffer, class Vector>
struct NearestNeighborComm
{
    using container_type = Vector;
    using pointer_type = get_value_type<Vector>*;
    using const_pointer_type = get_value_type<Vector> const *;
    using buffer_type = Buffer; //must be consistent with what allocate_buffer() returns
    ///no communication
    NearestNeighborComm( MPI_Comm comm = MPI_COMM_NULL){
        m_comm = comm;
        m_silent = true;
    }
    ///Construct from halo size n, the local vector dimensions, a Cartesian communicator, and the direction of communication
    NearestNeighborComm( unsigned n, const unsigned vector_dimensions[3], MPI_Comm comm, unsigned direction)
    {
        static_assert( std::is_same<const_pointer_type, get_value_type<Buffer>>::value, "Must be same pointer types");
        construct( n, vector_dimensions, comm, direction);
    }

    ///Construct from another Communicator of possibly different underlying types
    template< class OtherIndex, class OtherBuffer, class OtherVector>
    NearestNeighborComm( const NearestNeighborComm<OtherIndex, OtherBuffer, OtherVector>& src){
        if( src.buffer_size() == 0) m_silent=true;
        else
            construct( src.n(), src.dims(), src.communicator(), src.direction());
    }

    ///halo size
    unsigned n() const{return m_n;}
    ///the local dimensions of the input vector
    const unsigned* dims() const{return m_dim;}
    ///the direction of communication
    unsigned direction() const {return m_direction;}
    ///the internal MPI communicator used
    MPI_Comm communicator() const{return m_comm;}

    ///Allocate a buffer object holding the six pointers to the halo chunks
    Buffer allocate_buffer() const{
        if( buffer_size() == 0 ) return Buffer();
        return Buffer(6);
    }
    ///The size of the halo
    unsigned buffer_size() const;
    ///True if the gather/scatter operation involves actual MPI communication
    bool isCommunicating() const{
        if( buffer_size() == 0) return false;
        return true;
    }
    ///Map a local matrix index to a buffer index
    int map_index(int i) const{
        if( i==-1) return 0;
        if( i== 0) return 1;
        if( i==+1) return 2;
        if( i==(int)m_outer_size-0) return 5;
        if( i==(int)m_outer_size-1) return 4;
        if( i==(int)m_outer_size-2) return 3;
        throw Error( Message(_ping_)<<"Index not mappable!");
        return -1; //unreachable, silences compiler warnings
    }
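    /* Layout note (added): the internal buffer is partitioned into six chunks
       of buffer_size() elements each. Chunks 1,2 alias (or copy) the first two
       halo-sized slices of the local data and chunks 3,4 the last two, while
       chunk 0 receives the left neighbor's last slice and chunk 5 the right
       neighbor's first slice via MPI. map_index above translates the stencil
       offsets -1, 0, +1 and m_outer_size-2, m_outer_size-1, m_outer_size to
       exactly these chunks. */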

    ///Gather values from the given vector and initiate asynchronous MPI communication
    void global_gather_init( const_pointer_type input, buffer_type& buffer, MPI_Request rqst[4])const
    {
        unsigned size = buffer_size();
        //init pointers on host
        const_pointer_type host_ptr[6];
        if(m_trivial)
        {
            host_ptr[0] = thrust::raw_pointer_cast(&m_internal_buffer.data()[0*size]);
            host_ptr[1] = input;
            host_ptr[2] = input+size;
            host_ptr[3] = input+(m_outer_size-2)*size;
            host_ptr[4] = input+(m_outer_size-1)*size;
            host_ptr[5] = thrust::raw_pointer_cast(&m_internal_buffer.data()[5*size]);
        }
        else
        {
            host_ptr[0] = thrust::raw_pointer_cast(&m_internal_buffer.data()[0*size]);
            host_ptr[1] = thrust::raw_pointer_cast(&m_internal_buffer.data()[1*size]);
            host_ptr[2] = thrust::raw_pointer_cast(&m_internal_buffer.data()[2*size]);
            host_ptr[3] = thrust::raw_pointer_cast(&m_internal_buffer.data()[3*size]);
            host_ptr[4] = thrust::raw_pointer_cast(&m_internal_buffer.data()[4*size]);
            host_ptr[5] = thrust::raw_pointer_cast(&m_internal_buffer.data()[5*size]);
        }
        //copy pointers to device
        thrust::copy( host_ptr, host_ptr+6, buffer.begin());
        //fill internal_buffer if !trivial
        do_global_gather_init( get_execution_policy<Vector>(), input, rqst);
        sendrecv( host_ptr[1], host_ptr[4],
            thrust::raw_pointer_cast(&m_internal_buffer.data()[0*size]), //host_ptr is const!
            thrust::raw_pointer_cast(&m_internal_buffer.data()[5*size]), //host_ptr is const!
            rqst);
    }
    ///Wait for asynchronous communication to finish and gather received data into buffer
    void global_gather_wait(const_pointer_type input, const buffer_type& buffer, MPI_Request rqst[4])const
    {
        MPI_Waitall( 4, rqst, MPI_STATUSES_IGNORE );
#ifdef _DG_CUDA_UNAWARE_MPI
        if( std::is_same< get_execution_policy<Vector>, CudaTag>::value ) //could be serial tag
        {
            unsigned size = buffer_size();
            cudaError_t code = cudaGetLastError( );
            if( code != cudaSuccess)
                throw dg::Error(dg::Message(_ping_)<<cudaGetErrorString(code));
            code = cudaMemcpy( thrust::raw_pointer_cast(&m_internal_buffer.data()[0*size]), //dst
                thrust::raw_pointer_cast(&m_internal_host_buffer.data()[0*size]), //src
                size*sizeof(get_value_type<Vector>), cudaMemcpyHostToDevice);
            if( code != cudaSuccess)
                throw dg::Error(dg::Message(_ping_)<<cudaGetErrorString(code));

            code = cudaMemcpy( thrust::raw_pointer_cast(&m_internal_buffer.data()[5*size]), //dst
                thrust::raw_pointer_cast(&m_internal_host_buffer.data()[5*size]), //src
                size*sizeof(get_value_type<Vector>), cudaMemcpyHostToDevice);
            if( code != cudaSuccess)
                throw dg::Error(dg::Message(_ping_)<<cudaGetErrorString(code));
        }
#endif
    }
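    /* Note (added): with _DG_CUDA_UNAWARE_MPI defined, MPI cannot read device
       memory directly; sendrecv then stages the send chunks through
       m_internal_host_buffer and also receives into it, which is why the wait
       step above copies the received chunks 0 and 5 back to the device. */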
    private:
    void do_global_gather_init( OmpTag, const_pointer_type, MPI_Request rqst[4])const;
    void do_global_gather_init( SerialTag, const_pointer_type, MPI_Request rqst[4])const;
    void do_global_gather_init( CudaTag, const_pointer_type, MPI_Request rqst[4])const;
    void construct( unsigned n, const unsigned vector_dimensions[3], MPI_Comm comm, unsigned direction);

    unsigned m_n, m_dim[3]; //deepness, dimensions
    MPI_Comm m_comm;
    unsigned m_direction;
    bool m_silent, m_trivial=false; //silent -> no comm, m_trivial -> comm in last dim
    unsigned m_outer_size = 1; //size of vector in units of buffer_size
    Index m_gather_map_middle;
    dg::Buffer<Vector> m_internal_buffer;
#ifdef _DG_CUDA_UNAWARE_MPI
    //a copy of the data on the host (we need to send data manually through the host)
    dg::Buffer<thrust::host_vector<get_value_type<Vector>>> m_internal_host_buffer;
#endif

    void sendrecv(const_pointer_type, const_pointer_type, pointer_type, pointer_type, MPI_Request rqst[4])const;
    int m_source[2], m_dest[2];
};

template<class I, class B, class V>
void NearestNeighborComm<I,B,V>::construct( unsigned n, const unsigned dimensions[3], MPI_Comm comm, unsigned direction)
{
    static_assert( std::is_base_of<SharedVectorTag, get_tensor_category<V>>::value,
        "Only Shared vectors allowed");
    m_silent=false;
    m_n=n;
    m_dim[0] = dimensions[0], m_dim[1] = dimensions[1], m_dim[2] = dimensions[2];
    m_direction = direction;
    if( dimensions[2] == 1 && direction == 1) m_trivial = true;
    else if( direction == 2) m_trivial = true;
    else m_trivial = false;
    assert( direction < 3);
    m_comm = comm;
    //MPI_Cart_shift may return MPI_PROC_NULL; then the receive buffer is not modified
    MPI_Cart_shift( m_comm, m_direction, -1, &m_source[0], &m_dest[0]);
    MPI_Cart_shift( m_comm, m_direction, +1, &m_source[1], &m_dest[1]);
    {
        int ndims;
        MPI_Cartdim_get( comm, &ndims);
        //use std::vector instead of a variable length array (not standard C++)
        std::vector<int> dims(ndims), periods(ndims), coords(ndims);
        MPI_Cart_get( comm, ndims, dims.data(), periods.data(), coords.data());
        if( dims[direction] == 1) m_silent = true;
    }
    if( !m_silent)
    {
        m_outer_size = dimensions[0]*dimensions[1]*dimensions[2]/buffer_size();
        assert( m_outer_size > 1 && "Parallelization too fine grained!"); //right now we cannot have that
        thrust::host_vector<int> mid_gather( 4*buffer_size());
        switch( direction)
        {
            case( 0):
            for( unsigned i=0; i<m_dim[2]*m_dim[1]; i++)
                for( unsigned j=0; j<n; j++)
                {
                    mid_gather[(0*n+j)*m_dim[2]*m_dim[1]+i] = i*m_dim[0] + j;
                    mid_gather[(1*n+j)*m_dim[2]*m_dim[1]+i] = i*m_dim[0] + n + j;
                    mid_gather[(2*n+j)*m_dim[2]*m_dim[1]+i] = i*m_dim[0] + m_dim[0]-2*n + j;
                    mid_gather[(3*n+j)*m_dim[2]*m_dim[1]+i] = i*m_dim[0] + m_dim[0]-  n + j;
                }
            break;
            case( 1):
            for( unsigned i=0; i<m_dim[2]; i++)
                for( unsigned j=0; j<n; j++)
                    for( unsigned k=0; k<m_dim[0]; k++)
                    {
                        mid_gather[((0*n+j)*m_dim[2]+i)*m_dim[0] + k] = (i*m_dim[1] + j)*m_dim[0] + k;
                        mid_gather[((1*n+j)*m_dim[2]+i)*m_dim[0] + k] = (i*m_dim[1] + n + j)*m_dim[0] + k;
                        mid_gather[((2*n+j)*m_dim[2]+i)*m_dim[0] + k] = (i*m_dim[1] + m_dim[1]-2*n + j)*m_dim[0] + k;
                        mid_gather[((3*n+j)*m_dim[2]+i)*m_dim[0] + k] = (i*m_dim[1] + m_dim[1]-  n + j)*m_dim[0] + k;
                    }
            break;
            case( 2):
            for( unsigned i=0; i<n; i++)
                for( unsigned j=0; j<m_dim[0]*m_dim[1]; j++)
                {
                    mid_gather[(0*n+i)*m_dim[0]*m_dim[1]+j] = (i               )*m_dim[0]*m_dim[1] + j;
                    mid_gather[(1*n+i)*m_dim[0]*m_dim[1]+j] = (i + n           )*m_dim[0]*m_dim[1] + j;
                    mid_gather[(2*n+i)*m_dim[0]*m_dim[1]+j] = (i + m_dim[2]-2*n)*m_dim[0]*m_dim[1] + j;
                    mid_gather[(3*n+i)*m_dim[0]*m_dim[1]+j] = (i + m_dim[2]-  n)*m_dim[0]*m_dim[1] + j;
                }
            break;
        }
        m_gather_map_middle = mid_gather; //transfer to device
        m_internal_buffer.data().resize( 6*buffer_size() );
#ifdef _DG_CUDA_UNAWARE_MPI
        m_internal_host_buffer.data().resize( 6*buffer_size() );
#endif
    }
}

template<class I, class B, class V>
unsigned NearestNeighborComm<I,B,V>::buffer_size() const
{
    if( m_silent) return 0;
    switch( m_direction)
    {
        case( 0): //x-direction
            return m_n*m_dim[1]*m_dim[2];
        case( 1): //y-direction
            return m_n*m_dim[0]*m_dim[2];
        case( 2): //z-direction
            return m_n*m_dim[0]*m_dim[1]; //no further m_n (hidden in m_dim)
        default:
            return 0;
    }
}
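
/* A worked example (added for illustration): with n=3 and local dimensions
   {12, 24, 1}, communication in direction 0 gives
       buffer_size() = 3*24*1 = 72
   and construct() computes m_outer_size = 12*24*1/72 = 4, i.e. the local
   vector decomposes into 4 slices of halo width in x. */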

template<class I, class B, class V>
void NearestNeighborComm<I,B,V>::do_global_gather_init( SerialTag, const_pointer_type input, MPI_Request rqst[4]) const
{
    if( !m_trivial)
    {
        unsigned size = buffer_size();
        for( unsigned i=0; i<4*size; i++)
            m_internal_buffer.data()[i+size] = input[m_gather_map_middle[i]];
    }
}
#ifdef _OPENMP
template<class I, class B, class V>
void NearestNeighborComm<I,B,V>::do_global_gather_init( OmpTag, const_pointer_type input, MPI_Request rqst[4]) const
{
    if(!m_trivial)
    {
        unsigned size = buffer_size();
        #pragma omp parallel for
        for( unsigned i=0; i<4*size; i++)
            m_internal_buffer.data()[size+i] = input[m_gather_map_middle[i]];
    }
}
#endif
#if THRUST_DEVICE_SYSTEM==THRUST_DEVICE_SYSTEM_CUDA
template<class I, class B, class V>
void NearestNeighborComm<I,B,V>::do_global_gather_init( CudaTag, const_pointer_type input, MPI_Request rqst[4]) const
{
    //gather values from input into the send buffer
    if(!m_trivial)
    {
        unsigned size = buffer_size();
        thrust::gather( thrust::cuda::tag(), m_gather_map_middle.begin(), m_gather_map_middle.end(), input, m_internal_buffer.data().begin()+size);
    }
    cudaError_t code = cudaGetLastError( );
    if( code != cudaSuccess)
        throw dg::Error(dg::Message(_ping_)<<cudaGetErrorString(code));
    code = cudaDeviceSynchronize(); //wait until device functions are finished before sending data
    if( code != cudaSuccess)
        throw dg::Error(dg::Message(_ping_)<<cudaGetErrorString(code));
}
#endif

template<class I, class B, class V>
void NearestNeighborComm<I,B,V>::sendrecv( const_pointer_type sb1_ptr, const_pointer_type sb2_ptr, pointer_type rb1_ptr, pointer_type rb2_ptr, MPI_Request rqst[4]) const
{
    unsigned size = buffer_size();
#ifdef _DG_CUDA_UNAWARE_MPI
    if( std::is_same< get_execution_policy<V>, CudaTag>::value ) //could be serial tag
    {
        cudaError_t code = cudaGetLastError( );
        if( code != cudaSuccess)
            throw dg::Error(dg::Message(_ping_)<<cudaGetErrorString(code));
        code = cudaMemcpy( thrust::raw_pointer_cast(&m_internal_host_buffer.data()[1*size]), //dst
            sb1_ptr, size*sizeof(get_value_type<V>), cudaMemcpyDeviceToHost); //src
        if( code != cudaSuccess)
            throw dg::Error(dg::Message(_ping_)<<cudaGetErrorString(code));
        code = cudaMemcpy( thrust::raw_pointer_cast(&m_internal_host_buffer.data()[4*size]), //dst
            sb2_ptr, size*sizeof(get_value_type<V>), cudaMemcpyDeviceToHost); //src
        if( code != cudaSuccess)
            throw dg::Error(dg::Message(_ping_)<<cudaGetErrorString(code));
        sb1_ptr = thrust::raw_pointer_cast(&m_internal_host_buffer.data()[1*size]);
        sb2_ptr = thrust::raw_pointer_cast(&m_internal_host_buffer.data()[4*size]);
        rb1_ptr = thrust::raw_pointer_cast(&m_internal_host_buffer.data()[0*size]);
        rb2_ptr = thrust::raw_pointer_cast(&m_internal_host_buffer.data()[5*size]);
    }
    //executing the copies above would be a mistake if called with a host_vector
#endif
    MPI_Isend( sb1_ptr, size,
        getMPIDataType<get_value_type<V>>(), //sender
        m_dest[0], 3, m_comm, &rqst[0]); //destination
    MPI_Irecv( rb2_ptr, size,
        getMPIDataType<get_value_type<V>>(), //receiver
        m_source[0], 3, m_comm, &rqst[1]); //source

    MPI_Isend( sb2_ptr, size,
        getMPIDataType<get_value_type<V>>(), //sender
        m_dest[1], 9, m_comm, &rqst[2]); //destination
    MPI_Irecv( rb1_ptr, size,
        getMPIDataType<get_value_type<V>>(), //receiver
        m_source[1], 9, m_comm, &rqst[3]); //source
}


}//namespace dg
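
/* End-to-end sketch of the halo exchange (added for illustration; the alias
   NNC and all runtime values are hypothetical, error checking is omitted):

       int rank, np;
       MPI_Comm_rank( MPI_COMM_WORLD, &rank);
       MPI_Comm_size( MPI_COMM_WORLD, &np);
       int dims1d[1] = {np}, periods[1] = {1};
       MPI_Comm cart;
       MPI_Cart_create( MPI_COMM_WORLD, 1, dims1d, periods, 0, &cart);

       using NNC = dg::NearestNeighborComm< thrust::host_vector<int>,
           thrust::host_vector<double const*>, thrust::host_vector<double>>;
       unsigned dims[3] = {12, 1, 1};
       NNC nnc( 3, dims, cart, 0); //n=3 halo cells, communicate in x

       thrust::host_vector<double> in( 12, (double)rank);
       auto buffer = nnc.allocate_buffer();
       MPI_Request rqst[4];
       nnc.global_gather_init( thrust::raw_pointer_cast( in.data()), buffer, rqst);
       // ... overlap computation on interior points here ...
       nnc.global_gather_wait( thrust::raw_pointer_cast( in.data()), buffer, rqst);
       //buffer now holds six pointers to the halo chunks, see map_index
*/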