Extension: ExBLAS
#include "dg/algorithm.h" (or as a standalone library as "dg/exblas/exblas.h")
exdot_serial.h
Go to the documentation of this file.
1/*
2 * %%%%%%%%%%%%%%%%%%%%%%%Original development%%%%%%%%%%%%%%%%%%%%%%%%%
3 * Copyright (c) 2016 Inria and University Pierre and Marie Curie
4 * %%%%%%%%%%%%%%%%%%%%%%%Modifications and further additions%%%%%%%%%%
5 * Matthias Wiesenberger, 2017, within FELTOR and EXBLAS licenses
6 */
7
18#pragma once
19#include <cassert>
20#include <cstdlib>
21#include <cstdio>
22#include <cmath>
23#include <iostream>
24
25#include "accumulate.h"
26#include "ExSUM.FPE.hpp"
27
28namespace dg
29{
30namespace exblas{
31
40union udouble{
41 double d;
42 int64_t i;
43};
44
46namespace cpu{
47
48template<typename CACHE, typename PointerOrValue1, typename PointerOrValue2>
49void ExDOTFPE_cpu(int N, PointerOrValue1 a, PointerOrValue2 b, int64_t* acc, bool* error) {
50 CACHE cache(acc);
51#ifndef _WITHOUT_VCL
52 int r = (( int64_t(N) ) & ~7ul);
53 for(int i = 0; i < r; i+=8) {
54#ifndef _MSC_VER
55 asm ("# myloop");
56#endif
57 //vcl::Vec8d r1 ;
58 //vcl::Vec8d x = TwoProductFMA(make_vcl_vec8d(a,i), make_vcl_vec8d(b,i), r1);
59 //vcl::Vec8d x = TwoProductFMA(vcl::Vec8d().load(a+i), vcl::Vec8d().load(b+i), r1);
60 vcl::Vec8d x = make_vcl_vec8d(a,i)* make_vcl_vec8d(b,i);
61 vcl::Vec8db finite = vcl::is_finite( x);
62 if( !vcl::horizontal_and( finite) ) *error = true;
63 cache.Accumulate(x);
64 //cache.Accumulate(r1);
65 }
66 if( r != N) {
67 //accumulate remainder
68 //vcl::Vec8d r1;
69 //vcl::Vec8d x = TwoProductFMA(make_vcl_vec8d(a,r,N-r), make_vcl_vec8d(b,r,N-r), r1);
70 //vcl::Vec8d x = TwoProductFMA(vcl::Vec8d().load_partial(N-r, a+r), vcl::Vec8d().load_partial(N-r,b+r), r1);
71 vcl::Vec8d x = make_vcl_vec8d(a,r,N-r)*make_vcl_vec8d(b,r,N-r);
72 vcl::Vec8db finite = vcl::is_finite( x);
73 if( !vcl::horizontal_and( finite) ) *error = true;
74 cache.Accumulate(x);
75 //cache.Accumulate(r1);
76 }
77#else// _WITHOUT_VCL
78 for(int i = 0; i < N; i++) {
79 //double r1;
80 //double x = TwoProductFMA(get_element(a,i),get_element(b,i),r1);
81 double x = get_element(a,i)*get_element(b,i);
82 if( !std::isfinite(x) ) *error = true;
83 cache.Accumulate(x);
84 //cache.Accumulate(r1);
85 }
86#endif// _WITHOUT_VCL
87 cache.Flush();
88}
89
90template<typename CACHE, typename PointerOrValue1, typename PointerOrValue2, typename PointerOrValue3>
91void ExDOTFPE_cpu(int N, PointerOrValue1 a, PointerOrValue2 b, PointerOrValue3 c, int64_t* acc, bool* error) {
92 CACHE cache(acc);
93#ifndef _WITHOUT_VCL
94 int r = (( int64_t(N)) & ~7ul);
95 for(int i = 0; i < r; i+=8) {
96#ifndef _MSC_VER
97 asm ("# myloop");
98#endif
99 //vcl::Vec8d r1 , r2, cvec = vcl::Vec8d().load(c+i);
100 //vcl::Vec8d x = TwoProductFMA(vcl::Vec8d().load(a+i), vcl::Vec8d().load(b+i), r1);
101 //vcl::Vec8d x2 = TwoProductFMA(x , cvec, r2);
102 //vcl::Vec8d x1 = vcl::mul_add(vcl::Vec8d().load(a+i),vcl::Vec8d().load(b+i), 0);
103 //vcl::Vec8d x2 = vcl::mul_add( x1 ,vcl::Vec8d().load(c+i), 0);
104 vcl::Vec8d x1 = vcl::mul_add(make_vcl_vec8d(a,i),make_vcl_vec8d(b,i), 0);
105 vcl::Vec8d x2 = vcl::mul_add( x1 ,make_vcl_vec8d(c,i), 0);
106 vcl::Vec8db finite = vcl::is_finite( x2);
107 if( !vcl::horizontal_and( finite) ) *error = true;
108 cache.Accumulate(x2);
109 //cache.Accumulate(r2);
110 //x2 = TwoProductFMA(r1, cvec, r2);
111 //cache.Accumulate(x2);
112 //cache.Accumulate(r2);
113 }
114 if( r != N) {
115 //accumulate remainder
116 //vcl::Vec8d r1 , r2, cvec = vcl::Vec8d().load_partial(N-r, c+r);
117 //vcl::Vec8d x = TwoProductFMA(vcl::Vec8d().load_partial(N-r, a+r), vcl::Vec8d().load_partial(N-r,b+r), r1);
118 //vcl::Vec8d x2 = TwoProductFMA(x , cvec, r2);
119 vcl::Vec8d x1 = vcl::mul_add(make_vcl_vec8d(a,r,N-r),make_vcl_vec8d(b,r,N-r), 0);
120 vcl::Vec8d x2 = vcl::mul_add( x1 ,make_vcl_vec8d(c,r,N-r), 0);
121 vcl::Vec8db finite = vcl::is_finite( x2);
122 if( !vcl::horizontal_and( finite) ) *error = true;
123 cache.Accumulate(x2);
124 //cache.Accumulate(r2);
125 //x2 = TwoProductFMA(r1, cvec, r2);
126 //cache.Accumulate(x2);
127 //cache.Accumulate(r2);
128 }
129#else// _WITHOUT_VCL
130 for(int i = 0; i < N; i++) {
131 double x1 = get_element(a,i)*get_element(b,i);
132 double x2 = x1*get_element(c,i);
133 if( !std::isfinite(x2) ) *error = true;
134 cache.Accumulate(x2);
135 }
136#endif// _WITHOUT_VCL
137 cache.Flush();
138}
139}//namespace cpu
141
199template<class PointerOrValue1, class PointerOrValue2, size_t NBFPE=8>
200void exdot_cpu(unsigned size, PointerOrValue1 x1_ptr, PointerOrValue2 x2_ptr, int64_t* h_superacc, int* status){
201 static_assert( has_floating_value<PointerOrValue1>::value, "PointerOrValue1 needs to be T or T* with T one of (const) float or (const) double");
202 static_assert( has_floating_value<PointerOrValue2>::value, "PointerOrValue2 needs to be T or T* with T one of (const) float or (const) double");
203 for( int i=0; i<exblas::BIN_COUNT; i++)
204 h_superacc[i] = 0;
205 bool error = false;
206#ifndef _WITHOUT_VCL
207 cpu::ExDOTFPE_cpu<cpu::FPExpansionVect<vcl::Vec8d, NBFPE, cpu::FPExpansionTraits<true> > >((int)size,x1_ptr,x2_ptr, h_superacc, &error);
208#else
209 cpu::ExDOTFPE_cpu<cpu::FPExpansionVect<double, NBFPE, cpu::FPExpansionTraits<true> > >((int)size,x1_ptr,x2_ptr, h_superacc, &error);
210#endif//_WITHOUT_VCL
211 *status = 0;
212 if( error ) *status = 1;
213}
214
218template<class PointerOrValue1, class PointerOrValue2, class PointerOrValue3, size_t NBFPE=8>
219void exdot_cpu(unsigned size, PointerOrValue1 x1_ptr, PointerOrValue2 x2_ptr, PointerOrValue3 x3_ptr, int64_t* h_superacc, int* status) {
220 static_assert( has_floating_value<PointerOrValue1>::value, "PointerOrValue1 needs to be T or T* with T one of (const) float or (const) double");
221 static_assert( has_floating_value<PointerOrValue2>::value, "PointerOrValue2 needs to be T or T* with T one of (const) float or (const) double");
222 static_assert( has_floating_value<PointerOrValue3>::value, "PointerOrValue3 needs to be T or T* with T one of (const) float or (const) double");
223 for( int i=0; i<exblas::BIN_COUNT; i++)
224 h_superacc[i] = 0;
225 bool error = false;
226#ifndef _WITHOUT_VCL
227 cpu::ExDOTFPE_cpu<cpu::FPExpansionVect<vcl::Vec8d, NBFPE, cpu::FPExpansionTraits<true> > >((int)size,x1_ptr,x2_ptr, x3_ptr, h_superacc, &error);
228#else
229 cpu::ExDOTFPE_cpu<cpu::FPExpansionVect<double, NBFPE, cpu::FPExpansionTraits<true> > >((int)size,x1_ptr,x2_ptr, x3_ptr, h_superacc, &error);
230#endif//_WITHOUT_VCL
231 *status = 0;
232 if( error ) *status = 1;
233}
234
235
236
237}//namespace exblas
238} //namespace dg
Primitives for accumulation into superaccumulator.
void exdot_cpu(unsigned size, PointerOrValue1 x1_ptr, PointerOrValue2 x2_ptr, int64_t *h_superacc, int *status)
Serial version of exact dot product.
Definition: exdot_serial.h:200
Utility union to display all bits of a double (using type-punning)
Definition: exdot_serial.h:40
int64_t i
a 64 bit integer
Definition: exdot_serial.h:42
double d
a double
Definition: exdot_serial.h:41