4#include <thrust/host_vector.h>
5#include <thrust/gather.h>
9#include "blas1_dispatch_shared.h"
30template<
class container>
36 m_comm = m_comm128 = m_comm128Reduce = MPI_COMM_NULL;
56 template<
class OtherContainer>
// Read-only access to the underlying data: returns a const reference to the
// stored container m_data (no copy is made).
66 const container&
data()
const {
return m_data;}
// Writable access to the underlying data: returns a non-const reference to
// the stored container m_data, so the caller can modify it in place.
69 container&
data() {
return m_data;}
99 m_comm128Reduce = comm_mod_reduce;
// Return the size of the data object, i.e. m_data.size() converted to unsigned.
104 unsigned size()
const{
return m_data.size();}
109 m_data.swap(src.m_data);
110 std::swap( m_comm , src.m_comm);
111 std::swap( m_comm128 , src.m_comm128);
112 std::swap( m_comm128Reduce , src.m_comm128Reduce);
116 MPI_Comm m_comm, m_comm128, m_comm128Reduce;
123template<
class container>
124void swap( MPI_Vector<container>& a, MPI_Vector<container>& b){
133template<
class container>
179template<
class Index,
class Buffer,
class Vector>
202 static_assert( std::is_same<const_pointer_type, get_value_type<Buffer>>::value,
"Must be same pointer types");
203 construct(
n, vector_dimensions, comm,
direction);
215 template<
class OtherIndex,
class OtherBuffer,
class OtherVector>
// Return the stored halo size m_n.
226 unsigned n()
const{
return m_n;}
// Return a pointer to the first element of m_dim (declared as unsigned m_dim[3]),
// which holds the three vector dimensions copied from the array given to construct().
231 const unsigned*
dims()
const{
return m_dim;}
270 if( i==(
int)m_outer_size-0)
return 5;
271 if( i==(
int)m_outer_size-1)
return 4;
272 if( i==(
int)m_outer_size-2)
return 3;
290 host_ptr[0] = thrust::raw_pointer_cast(&m_internal_buffer.
data()[0*size]);
292 host_ptr[2] = input+size;
293 host_ptr[3] = input+(m_outer_size-2)*size;
294 host_ptr[4] = input+(m_outer_size-1)*size;
295 host_ptr[5] = thrust::raw_pointer_cast(&m_internal_buffer.
data()[5*size]);
299 host_ptr[0] = thrust::raw_pointer_cast(&m_internal_buffer.
data()[0*size]);
300 host_ptr[1] = thrust::raw_pointer_cast(&m_internal_buffer.
data()[1*size]);
301 host_ptr[2] = thrust::raw_pointer_cast(&m_internal_buffer.
data()[2*size]);
302 host_ptr[3] = thrust::raw_pointer_cast(&m_internal_buffer.
data()[3*size]);
303 host_ptr[4] = thrust::raw_pointer_cast(&m_internal_buffer.
data()[4*size]);
304 host_ptr[5] = thrust::raw_pointer_cast(&m_internal_buffer.
data()[5*size]);
310 sendrecv( host_ptr[1], host_ptr[4],
311 thrust::raw_pointer_cast(&m_internal_buffer.
data()[0*size]),
312 thrust::raw_pointer_cast(&m_internal_buffer.
data()[5*size]),
325 MPI_Waitall( 4, rqst, MPI_STATUSES_IGNORE );
326#ifdef _DG_CUDA_UNAWARE_MPI
330 cudaError_t code = cudaGetLastError( );
331 if( code != cudaSuccess)
333 code = cudaMemcpy( thrust::raw_pointer_cast(&m_internal_buffer.
data()[0*size]),
334 thrust::raw_pointer_cast(&m_internal_host_buffer.
data()[0*size]),
336 if( code != cudaSuccess)
339 code = cudaMemcpy( thrust::raw_pointer_cast(&m_internal_buffer.
data()[5*size]),
340 thrust::raw_pointer_cast(&m_internal_host_buffer.
data()[5*size]),
342 if( code != cudaSuccess)
// Set-up routine that the (n, vector_dimensions, comm, direction) constructor
// forwards its arguments to.
//  - n: presumably the halo size that n() reports — TODO confirm, the assignment
//       to m_n is not visible here.
//  - vector_dimensions: three extents, copied element-wise into m_dim.
//  - comm: queried with MPI_Cartdim_get / MPI_Cart_get, so it is expected to
//       carry a Cartesian topology.
//  - direction: used as an index into the Cartesian dims of comm.
351 void construct(
unsigned n,
const unsigned vector_dimensions[3], MPI_Comm comm,
unsigned direction);
353 unsigned m_n, m_dim[3];
355 unsigned m_direction;
356 bool m_silent, m_trivial=
false;
357 unsigned m_outer_size = 1;
358 Index m_gather_map_middle;
360#ifdef _DG_CUDA_UNAWARE_MPI
366 int m_source[2], m_dest[2];
371template<
class I,
class B,
class V>
372void NearestNeighborComm<I,B,V>::construct(
unsigned n,
const unsigned dimensions[3], MPI_Comm comm,
unsigned direction)
374 static_assert( std::is_base_of<SharedVectorTag, get_tensor_category<V>>::value,
375 "Only Shared vectors allowed");
378 m_dim[0] = dimensions[0], m_dim[1] = dimensions[1], m_dim[2] = dimensions[2];
380 if( dimensions[2] == 1 &&
direction == 1) m_trivial =
true;
381 else if(
direction == 2) m_trivial =
true;
382 else m_trivial =
false;
386 MPI_Cart_shift( m_comm, m_direction, -1, &m_source[0], &m_dest[0]);
387 MPI_Cart_shift( m_comm, m_direction, +1, &m_source[1], &m_dest[1]);
390 MPI_Cartdim_get( comm, &ndims);
391 int dims[ndims], periods[ndims], coords[ndims];
392 MPI_Cart_get( comm, ndims, dims, periods, coords);
393 if( dims[
direction] == 1) m_silent =
true;
397 m_outer_size = dimensions[0]*dimensions[1]*dimensions[2]/buffer_size();
398 assert( m_outer_size > 1 &&
"Parallelization too fine grained!");
399 thrust::host_vector<int> mid_gather( 4*buffer_size());
403 for(
unsigned i=0; i<m_dim[2]*m_dim[1]; i++)
404 for(
unsigned j=0; j<n; j++)
406 mid_gather[(0*n+j)*m_dim[2]*m_dim[1]+i] = i*m_dim[0] + j;
407 mid_gather[(1*n+j)*m_dim[2]*m_dim[1]+i] = i*m_dim[0] + n + j;
408 mid_gather[(2*n+j)*m_dim[2]*m_dim[1]+i] = i*m_dim[0] + m_dim[0]-2*n + j;
409 mid_gather[(3*n+j)*m_dim[2]*m_dim[1]+i] = i*m_dim[0] + m_dim[0]- n + j;
413 for(
unsigned i=0; i<m_dim[2]; i++)
414 for(
unsigned j=0; j<n; j++)
415 for(
unsigned k=0; k<m_dim[0]; k++)
417 mid_gather[((0*n+j)*m_dim[2]+i)*m_dim[0] + k] = (i*m_dim[1] + j)*m_dim[0] + k;
418 mid_gather[((1*n+j)*m_dim[2]+i)*m_dim[0] + k] = (i*m_dim[1] + n + j)*m_dim[0] + k;
419 mid_gather[((2*n+j)*m_dim[2]+i)*m_dim[0] + k] = (i*m_dim[1] + m_dim[1]-2*n + j)*m_dim[0] + k;
420 mid_gather[((3*n+j)*m_dim[2]+i)*m_dim[0] + k] = (i*m_dim[1] + m_dim[1]- n + j)*m_dim[0] + k;
424 for(
unsigned i=0; i<n; i++)
425 for(
unsigned j=0; j<m_dim[0]*m_dim[1]; j++)
427 mid_gather[(0*n+i)*m_dim[0]*m_dim[1]+j] = (i )*m_dim[0]*m_dim[1] + j;
428 mid_gather[(1*n+i)*m_dim[0]*m_dim[1]+j] = (i + n )*m_dim[0]*m_dim[1] + j;
429 mid_gather[(2*n+i)*m_dim[0]*m_dim[1]+j] = (i + m_dim[2]-2*n )*m_dim[0]*m_dim[1] + j;
430 mid_gather[(3*n+i)*m_dim[0]*m_dim[1]+j] = (i + m_dim[2]- n )*m_dim[0]*m_dim[1] + j;
434 m_gather_map_middle = mid_gather;
435 m_internal_buffer.data().resize( 6*buffer_size() );
436#ifdef _DG_CUDA_UNAWARE_MPI
437 m_internal_host_buffer.data().resize( 6*buffer_size() );
442template<
class I,
class B,
class V>
445 if( m_silent)
return 0;
449 return m_n*m_dim[1]*m_dim[2];
451 return m_n*m_dim[0]*m_dim[2];
453 return m_n*m_dim[0]*m_dim[1];
459template<
class I,
class B,
class V>
460void NearestNeighborComm<I,B,V>::do_global_gather_init( SerialTag, const_pointer_type input, MPI_Request rqst[4])
const
464 unsigned size = buffer_size();
465 for(
unsigned i=0; i<4*size; i++)
466 m_internal_buffer.data()[i+size] = input[m_gather_map_middle[i]];
470template<
class I,
class B,
class V>
471void NearestNeighborComm<I,B,V>::do_global_gather_init( OmpTag, const_pointer_type input, MPI_Request rqst[4])
const
475 unsigned size = buffer_size();
476 #pragma omp parallel for
477 for(
unsigned i=0; i<4*size; i++)
478 m_internal_buffer.data()[size+i] = input[m_gather_map_middle[i]];
482#if THRUST_DEVICE_SYSTEM==THRUST_DEVICE_SYSTEM_CUDA
483template<
class I,
class B,
class V>
484void NearestNeighborComm<I,B,V>::do_global_gather_init( CudaTag, const_pointer_type input, MPI_Request rqst[4])
const
489 unsigned size = buffer_size();
490 thrust::gather( thrust::cuda::tag(), m_gather_map_middle.begin(), m_gather_map_middle.end(), input, m_internal_buffer.data().begin()+size);
492 cudaError_t code = cudaGetLastError( );
493 if( code != cudaSuccess)
495 code = cudaDeviceSynchronize();
496 if( code != cudaSuccess)
501template<
class I,
class B,
class V>
502void NearestNeighborComm<I,B,V>::sendrecv( const_pointer_type sb1_ptr, const_pointer_type sb2_ptr, pointer_type rb1_ptr, pointer_type rb2_ptr, MPI_Request rqst[4])
const
504 unsigned size = buffer_size();
505#ifdef _DG_CUDA_UNAWARE_MPI
506 if(
std::is_same< get_execution_policy<V>, CudaTag>::value )
508 cudaError_t code = cudaGetLastError( );
509 if( code != cudaSuccess)
511 code = cudaMemcpy( thrust::raw_pointer_cast(&m_internal_host_buffer.data()[1*size]),
512 sb1_ptr, size*
sizeof(get_value_type<V>), cudaMemcpyDeviceToHost);
513 if( code != cudaSuccess)
515 code = cudaMemcpy( thrust::raw_pointer_cast(&m_internal_host_buffer.data()[4*size]),
516 sb2_ptr, size*
sizeof(get_value_type<V>), cudaMemcpyDeviceToHost);
517 if( code != cudaSuccess)
519 sb1_ptr = thrust::raw_pointer_cast(&m_internal_host_buffer.data()[1*size]);
520 sb2_ptr = thrust::raw_pointer_cast(&m_internal_host_buffer.data()[4*size]);
521 rb1_ptr = thrust::raw_pointer_cast(&m_internal_host_buffer.data()[0*size]);
522 rb2_ptr = thrust::raw_pointer_cast(&m_internal_host_buffer.data()[5*size]);
526 MPI_Isend( sb1_ptr, size,
527 getMPIDataType<get_value_type<V>>(),
528 m_dest[0], 3, m_comm, &rqst[0]);
529 MPI_Irecv( rb2_ptr, size,
530 getMPIDataType<get_value_type<V>>(),
531 m_source[0], 3, m_comm, &rqst[1]);
533 MPI_Isend( sb2_ptr, size,
534 getMPIDataType<get_value_type<V>>(),
535 m_dest[1], 9, m_comm, &rqst[2]);
536 MPI_Irecv( rb1_ptr, size,
537 getMPIDataType<get_value_type<V>>(),
538 m_source[1], 9, m_comm, &rqst[3]);
Class intended for use in throw statements
Definition: exceptions.h:83
small class holding a stringstream
Definition: exceptions.h:29
Error classes of the dg library.
#define _ping_
Definition: exceptions.h:12
void copy(const ContainerTypeIn &source, ContainerTypeOut &target)
Definition: blas1.h:164
direction
Direction of a discrete derivative.
Definition: enums.h:97
bool is_same(double x, double y, double eps=1e-15)
Definition: runge_kutta.h:948
typename TensorTraits< std::decay_t< Vector > >::value_type get_value_type
Definition: tensor_traits.h:38
typename TensorTraits< std::decay_t< Vector > >::execution_policy get_execution_policy
Definition: tensor_traits.h:42
static void mpi_reduce_communicator(MPI_Comm comm, MPI_Comm *comm_mod, MPI_Comm *comm_mod_reduce)
This is the namespace for all functions and classes defined and used by the discontinuous Galerkin library.
a manager class that invokes the copy constructor on the managed ptr when copied (deep copy)
Definition: memory.h:152
T & data() const
Get write access to the data on the heap.
Definition: memory.h:187
CUDA implementation.
Definition: execution_policy.h:27
MPI Vector class
Definition: mpi_vector.h:32
MPI_Vector()
no data is allocated, communicators are MPI_COMM_NULL
Definition: mpi_vector.h:35
const container & data() const
Get underlying data.
Definition: mpi_vector.h:66
MPI_Comm communicator_mod() const
Returns a communicator of fixed size 128.
Definition: mpi_vector.h:75
MPI_Vector(const MPI_Vector< OtherContainer > &src)
Conversion operator.
Definition: mpi_vector.h:57
container & data()
Set underlying data (returns a writable reference to the container).
Definition: mpi_vector.h:69
MPI_Comm communicator() const
Get the communicator to which this vector belongs.
Definition: mpi_vector.h:73
MPI_Comm communicator_mod_reduce() const
Returns a communicator consisting of all processes with rank 0 in communicator_mod()
Definition: mpi_vector.h:82
MPI_Vector(const container &data, MPI_Comm comm)
construct a vector
Definition: mpi_vector.h:45
void set_communicator(MPI_Comm comm, MPI_Comm comm_mod, MPI_Comm comm_mod_reduce)
Set the communicators with dg::exblas::mpi_reduce_communicator.
Definition: mpi_vector.h:96
container container_type
Definition: mpi_vector.h:33
unsigned size() const
Return the size of the data object.
Definition: mpi_vector.h:104
void swap(MPI_Vector &src)
Swap data and communicator.
Definition: mpi_vector.h:108
A distributed vector contains a data container and a MPI communicator.
Definition: vector_categories.h:52
Communicator for asynchronous nearest neighbor communication.
Definition: mpi_vector.h:181
unsigned buffer_size() const
The size of the halo.
const unsigned * dims() const
The dimensionality of the input vector.
Definition: mpi_vector.h:231
get_value_type< Vector > * pointer_type
Definition: mpi_vector.h:184
unsigned n() const
halo size
Definition: mpi_vector.h:226
void global_gather_wait(const_pointer_type input, const buffer_type &buffer, MPI_Request rqst[4]) const
Wait for asynchronous communication to finish and gather received data into buffer.
Definition: mpi_vector.h:323
MPI_Comm communicator() const
The internal MPI communicator used.
Definition: mpi_vector.h:239
NearestNeighborComm(MPI_Comm comm=MPI_COMM_NULL)
no communication
Definition: mpi_vector.h:188
int map_index(int i) const
Map a local matrix index to a buffer index.
Definition: mpi_vector.h:266
Buffer allocate_buffer() const
Allocate a buffer object.
Definition: mpi_vector.h:248
NearestNeighborComm(unsigned n, const unsigned vector_dimensions[3], MPI_Comm comm, unsigned direction)
Construct.
Definition: mpi_vector.h:200
Vector container_type
Definition: mpi_vector.h:182
get_value_type< Vector > const * const_pointer_type
Definition: mpi_vector.h:185
unsigned direction() const
The direction of communication.
Definition: mpi_vector.h:237
NearestNeighborComm(const NearestNeighborComm< OtherIndex, OtherBuffer, OtherVector > &src)
Construct from other Communicator.
Definition: mpi_vector.h:216
void global_gather_init(const_pointer_type input, buffer_type &buffer, MPI_Request rqst[4]) const
Gather values from given Vector and initiate asynchronous MPI communication.
Definition: mpi_vector.h:283
bool isCommunicating() const
True if the gather/scatter operation involves actual MPI communication.
Definition: mpi_vector.h:257
OpenMP parallel execution.
Definition: execution_policy.h:28
Indicate sequential execution.
Definition: execution_policy.h:26
get_value_type< container > value_type
Definition: mpi_vector.h:135
get_execution_policy< container > execution_policy
Definition: mpi_vector.h:137
The vector traits.
Definition: tensor_traits.h:31