Extension: ExBLAS
#include "dg/algorithm.h" (or as a standalone library as "dg/exblas/exblas.h")
Loading...
Searching...
No Matches
exdot_serial.h
Go to the documentation of this file.
1/*
2 * %%%%%%%%%%%%%%%%%%%%%%%Original development%%%%%%%%%%%%%%%%%%%%%%%%%
3 * Copyright (c) 2016 Inria and University Pierre and Marie Curie
4 * %%%%%%%%%%%%%%%%%%%%%%%Modifications and further additions%%%%%%%%%%
5 * Matthias Wiesenberger, 2017, within FELTOR and EXBLAS licenses
6 */
7
18#pragma once
19#include <cassert>
20#include <cstdlib>
21#include <cstdio>
22#include <cmath>
23#include <iostream>
24
25#include "accumulate.h"
26#include "ExSUM.FPE.hpp"
27
28namespace dg
29{
30namespace exblas{
31
40union udouble{
41 double d;
42 int64_t i;
43};
44
53union ufloat{
54 float f;
55 int32_t i;
56};
57
59namespace cpu{
60
61template<typename CACHE, typename PointerOrValue1, typename PointerOrValue2>
62void ExDOTFPE_cpu(int N, PointerOrValue1 a, PointerOrValue2 b, int64_t* acc, bool* error) {
63 CACHE cache(acc);
64#ifndef _WITHOUT_VCL
65 int r = (( int64_t(N) ) & ~7ul);
66 for(int i = 0; i < r; i+=8) {
67#ifndef _MSC_VER
68 asm ("# myloop");
69#endif
70 //vcl::Vec8d r1 ;
71 //vcl::Vec8d x = TwoProductFMA(make_vcl_vec8d(a,i), make_vcl_vec8d(b,i), r1);
72 //vcl::Vec8d x = TwoProductFMA(vcl::Vec8d().load(a+i), vcl::Vec8d().load(b+i), r1);
73 vcl::Vec8d x = make_vcl_vec8d(a,i)* make_vcl_vec8d(b,i);
74 vcl::Vec8db finite = vcl::is_finite( x);
75 if( !vcl::horizontal_and( finite) ) *error = true;
76 cache.Accumulate(x);
77 //cache.Accumulate(r1);
78 }
79 if( r != N) {
80 //accumulate remainder
81 //vcl::Vec8d r1;
82 //vcl::Vec8d x = TwoProductFMA(make_vcl_vec8d(a,r,N-r), make_vcl_vec8d(b,r,N-r), r1);
83 //vcl::Vec8d x = TwoProductFMA(vcl::Vec8d().load_partial(N-r, a+r), vcl::Vec8d().load_partial(N-r,b+r), r1);
84 vcl::Vec8d x = make_vcl_vec8d(a,r,N-r)*make_vcl_vec8d(b,r,N-r);
85 vcl::Vec8db finite = vcl::is_finite( x);
86 if( !vcl::horizontal_and( finite) ) *error = true;
87 cache.Accumulate(x);
88 //cache.Accumulate(r1);
89 }
90#else// _WITHOUT_VCL
91 for(int i = 0; i < N; i++) {
92 //double r1;
93 //double x = TwoProductFMA(get_element(a,i),get_element(b,i),r1);
94 double x = (double)get_element(a,i)*(double)get_element(b,i);
95 if( !std::isfinite(x) ) *error = true;
96 cache.Accumulate(x);
97 //cache.Accumulate(r1);
98 }
99#endif// _WITHOUT_VCL
100 cache.Flush();
101}
102
103template<typename CACHE, typename PointerOrValue1, typename PointerOrValue2, typename PointerOrValue3>
104void ExDOTFPE_cpu(int N, PointerOrValue1 a, PointerOrValue2 b, PointerOrValue3 c, int64_t* acc, bool* error) {
105 CACHE cache(acc);
106#ifndef _WITHOUT_VCL
107 int r = (( int64_t(N)) & ~7ul);
108 for(int i = 0; i < r; i+=8) {
109#ifndef _MSC_VER
110 asm ("# myloop");
111#endif
112 //vcl::Vec8d r1 , r2, cvec = vcl::Vec8d().load(c+i);
113 //vcl::Vec8d x = TwoProductFMA(vcl::Vec8d().load(a+i), vcl::Vec8d().load(b+i), r1);
114 //vcl::Vec8d x2 = TwoProductFMA(x , cvec, r2);
115 //vcl::Vec8d x1 = vcl::mul_add(vcl::Vec8d().load(a+i),vcl::Vec8d().load(b+i), 0);
116 //vcl::Vec8d x2 = vcl::mul_add( x1 ,vcl::Vec8d().load(c+i), 0);
117 vcl::Vec8d x1 = vcl::mul_add(make_vcl_vec8d(a,i),make_vcl_vec8d(b,i), 0);
118 vcl::Vec8d x2 = vcl::mul_add( x1 ,make_vcl_vec8d(c,i), 0);
119 vcl::Vec8db finite = vcl::is_finite( x2);
120 if( !vcl::horizontal_and( finite) ) *error = true;
121 cache.Accumulate(x2);
122 //cache.Accumulate(r2);
123 //x2 = TwoProductFMA(r1, cvec, r2);
124 //cache.Accumulate(x2);
125 //cache.Accumulate(r2);
126 }
127 if( r != N) {
128 //accumulate remainder
129 //vcl::Vec8d r1 , r2, cvec = vcl::Vec8d().load_partial(N-r, c+r);
130 //vcl::Vec8d x = TwoProductFMA(vcl::Vec8d().load_partial(N-r, a+r), vcl::Vec8d().load_partial(N-r,b+r), r1);
131 //vcl::Vec8d x2 = TwoProductFMA(x , cvec, r2);
132 vcl::Vec8d x1 = vcl::mul_add(make_vcl_vec8d(a,r,N-r),make_vcl_vec8d(b,r,N-r), 0);
133 vcl::Vec8d x2 = vcl::mul_add( x1 ,make_vcl_vec8d(c,r,N-r), 0);
134 vcl::Vec8db finite = vcl::is_finite( x2);
135 if( !vcl::horizontal_and( finite) ) *error = true;
136 cache.Accumulate(x2);
137 //cache.Accumulate(r2);
138 //x2 = TwoProductFMA(r1, cvec, r2);
139 //cache.Accumulate(x2);
140 //cache.Accumulate(r2);
141 }
142#else// _WITHOUT_VCL
143 for(int i = 0; i < N; i++) {
144 double x1 = (double)get_element(a,i)*(double)get_element(b,i);
145 double x2 = x1*(double)get_element(c,i);
146 if( !std::isfinite(x2) ) *error = true;
147 cache.Accumulate(x2);
148 }
149#endif// _WITHOUT_VCL
150 cache.Flush();
151}
152}//namespace cpu
154
210template<class PointerOrValue1, class PointerOrValue2, size_t NBFPE=8>
211void exdot_cpu(unsigned size, PointerOrValue1 x1_ptr, PointerOrValue2 x2_ptr, int64_t* h_superacc, int* status){
212 static_assert( has_floating_value<PointerOrValue1>::value, "PointerOrValue1 needs to be T or T* with T one of (const) float or (const) double");
213 static_assert( has_floating_value<PointerOrValue2>::value, "PointerOrValue2 needs to be T or T* with T one of (const) float or (const) double");
214 for( int i=0; i<exblas::BIN_COUNT; i++)
215 h_superacc[i] = 0;
216 bool error = false;
217#ifndef _WITHOUT_VCL
218 cpu::ExDOTFPE_cpu<cpu::FPExpansionVect<vcl::Vec8d, NBFPE, cpu::FPExpansionTraits<true> > >((int)size,x1_ptr,x2_ptr, h_superacc, &error);
219#else
220 cpu::ExDOTFPE_cpu<cpu::FPExpansionVect<double, NBFPE, cpu::FPExpansionTraits<true> > >((int)size,x1_ptr,x2_ptr, h_superacc, &error);
221#endif//_WITHOUT_VCL
222 *status = 0;
223 if( error ) *status = 1;
224}
225
229template<class PointerOrValue1, class PointerOrValue2, class PointerOrValue3, size_t NBFPE=8>
230void exdot_cpu(unsigned size, PointerOrValue1 x1_ptr, PointerOrValue2 x2_ptr, PointerOrValue3 x3_ptr, int64_t* h_superacc, int* status) {
231 static_assert( has_floating_value<PointerOrValue1>::value, "PointerOrValue1 needs to be T or T* with T one of (const) float or (const) double");
232 static_assert( has_floating_value<PointerOrValue2>::value, "PointerOrValue2 needs to be T or T* with T one of (const) float or (const) double");
233 static_assert( has_floating_value<PointerOrValue3>::value, "PointerOrValue3 needs to be T or T* with T one of (const) float or (const) double");
234 for( int i=0; i<exblas::BIN_COUNT; i++)
235 h_superacc[i] = 0;
236 bool error = false;
237#ifndef _WITHOUT_VCL
238 cpu::ExDOTFPE_cpu<cpu::FPExpansionVect<vcl::Vec8d, NBFPE, cpu::FPExpansionTraits<true> > >((int)size,x1_ptr,x2_ptr, x3_ptr, h_superacc, &error);
239#else
240 cpu::ExDOTFPE_cpu<cpu::FPExpansionVect<double, NBFPE, cpu::FPExpansionTraits<true> > >((int)size,x1_ptr,x2_ptr, x3_ptr, h_superacc, &error);
241#endif//_WITHOUT_VCL
242 *status = 0;
243 if( error ) *status = 1;
244}
245
246
247
248}//namespace exblas
249} //namespace dg
Primitives for accumulation into superaccumulator.
void exdot_cpu(unsigned size, PointerOrValue1 x1_ptr, PointerOrValue2 x2_ptr, int64_t *h_superacc, int *status)
Serial version of exact dot product.
Definition exdot_serial.h:211
Utility union to display all bits of a double (using type-punning)
Definition exdot_serial.h:40
int64_t i
a 64 bit integer
Definition exdot_serial.h:42
double d
a double
Definition exdot_serial.h:41
Utility union to display all bits of a float (using type-punning)
Definition exdot_serial.h:53
float f
a float
Definition exdot_serial.h:54
int32_t i
a 32 bit integer
Definition exdot_serial.h:55