Extension: ExBLAS
#include "dg/algorithm.h" (or as a standalone library as "dg/exblas/exblas.h")
Loading...
Searching...
No Matches
accumulate.h
Go to the documentation of this file.
1/*
2 * %%%%%%%%%%%%%%%%%%%%%%%Original development%%%%%%%%%%%%%%%%%%%%%%%%%
3 * Copyright (c) 2016 Inria and University Pierre and Marie Curie
4 * All rights reserved.
5 * %%%%%%%%%%%%%%%%%%%%%%%Modifications and further additions%%%%%%%%%%
6 * Matthias Wiesenberger, 2017, within FELTOR and EXBLAS licenses
7 */
18#pragma once
19#include "config.h"
20#include "mylibm.hpp"
21//this file has a direct correspondance to gpu code accumulate.cuh
22
23namespace dg
24{
25namespace exblas {
26namespace cpu {
29//********* Here, the change from float to double happens ***************//
31#ifndef _WITHOUT_VCL
32inline vcl::Vec8d make_vcl_vec8d( double x, int i){
33 return vcl::Vec8d(x);
34}
35inline vcl::Vec8d make_vcl_vec8d( const double* x, int i){
36 return vcl::Vec8d().load( x+i);
37}
38inline vcl::Vec8d make_vcl_vec8d( double x, int i, int num){
39 return vcl::Vec8d(x);
40}
41inline vcl::Vec8d make_vcl_vec8d( const double* x, int i, int num){
42 return vcl::Vec8d().load_partial( num, x+i);
43}
44inline vcl::Vec8d make_vcl_vec8d( float x, int i){
45 return vcl::Vec8d((double)x);
46}
47inline vcl::Vec8d make_vcl_vec8d( const float* x, int i){
48 return vcl::Vec8d( x[i], x[i+1], x[i+2], x[i+3], x[i+4], x[i+5], x[i+6], x[i+7]);
49}
50inline vcl::Vec8d make_vcl_vec8d( float x, int i, int num){
51 return vcl::Vec8d((double)x);
52}
53inline vcl::Vec8d make_vcl_vec8d( const float* x, int i, int num){
54 double tmp[8];
55 for(int j=0; j<num; j++)
56 tmp[j] = (double)x[i+j];
57 return vcl::Vec8d().load_partial( num, tmp);
58}
59#endif//_WITHOUT_VCL
60template<class T>
61inline T get_element( T x, int i){
62 return x;
63}
64template<class T>
65inline T get_element( T* x, int i){
66 return *(x+i);
67}
69// Auxiliary functions
71
72// Knuth 2Sum.
73template<typename T>
74inline std::enable_if_t<!std::is_integral_v<T>,T> KnuthTwoSum(T a, T b, T & s)
75{
76 T r = a + b;
77 T z = r - a;
78 s = (a - (r - z)) + (b - z);
79 return r;
80}
81template<typename T> // for unsigned, int, char etc.
82inline std::enable_if_t<std::is_integral_v<T>,T> KnuthTwoSum(T a, T b, T & s)
83{
84 s = 0;
85 return a + b;
86}
87
88template<typename T>
89inline T TwoProductFMA(T a, T b, T &d) {
90 T p = a * b;
91#ifdef _WITHOUT_VCL
92 d = a*b-p;
93#else
94 d = vcl::mul_sub_x(a, b, p); //extra precision even if FMA is not available
95#endif//_WITHOUT_VCL
96 return p;
97}
98
100// FPE Accumulate and Round function for leightweight implementation
102//does not check for NaN
103template<typename T, size_t N> UNROLL_ATTRIBUTE
104void Accumulate(T x, std::array<T,N>& fpe , int* status)
105{
106 if( x == T(0) )
107 return;
108 for(unsigned int i = 0; i != N; ++i) {
109 T s;
110 fpe[i] = KnuthTwoSum(fpe[i], x, s);
111 x = s;
112 if( x == T(0)) //early exit
113 return;
114 }
115
116 if (x != T(0) && *status != 1) {
117 *status = 2;
118 }
119}
126template<class T, size_t N>
127inline T Round( const std::array<T,N>& fpe ) {
128 // Our own implementation
129 // Just accumulate to a FPE of size 2 and return sum;
130 std::array<T, 2> fpe_red{T(0),T(0)};
131 int status_red;
132 for( unsigned u = 0; u<N; u++)
133 Accumulate( fpe[u], fpe_red, &status_red);
134 return fpe_red[0] + fpe_red[1];
135
136 // The problem with the following is to get it to work for complex
137
141 //static_assert( N > 2, "FPE size must be greater than 2");
142 //union {
143 // double d;
144 // int64_t l;
145 //} thdb;
146
147 //double tl;
148 //double th = KnuthTwoSum(fpe[1], fpe[2], tl);
149
150 //if (tl != 0.0) {
151 // thdb.d = th;
152 // // if the mantissa of th is odd, there is nothing to do
153 // if (!(thdb.l & 1)) {
154 // // choose the rounding direction
155 // // depending of the signs of th and tl
156 // if ((tl > 0.0) ^ (th < 0.0))
157 // thdb.l++;
158 // else
159 // thdb.l--;
160 // th = thdb.d;
161 // }
162 //}
163
165 //return fpe[0] + th;
166}
167
169// Main computation pass: compute partial superaccs
171inline void AccumulateWord( int64_t *accumulator, int i, int64_t x) {
172 // With atomic accumulator updates
173 // accumulation and carry propagation can happen in any order,
174 // as long as addition is atomic
175 // only constraint is: never forget an overflow bit
176 //assert(i >= 0 && i < BIN_COUNT);
177 unsigned char overflow;
178 int64_t carry = x;
179 int64_t carrybit;
180 int64_t oldword = cpu::xadd(accumulator[i], x, overflow);
181 // To propagate over- or underflow
182 while(unlikely(overflow)) {
183 // Carry or borrow
184 // oldword has sign S
185 // x has sign S
186 // accumulator[i] has sign !S (just after update)
187 // carry has sign !S
188 // carrybit has sign S
189 carry = (oldword + carry) >> DIGITS; // Arithmetic shift
190 bool s = oldword > 0;
191 carrybit = (s ? 1ll << KRX : (unsigned long long)(-1ll) << KRX); //MW left shift of negative number is undefined so convert to unsigned
192
193 // Cancel carry-save bits
194 cpu::xadd(accumulator[i], (int64_t) -(carry << DIGITS), overflow);
195 if(TSAFE && unlikely(s ^ overflow)) {
196 // (Another) overflow of sign S
197 carrybit *= 2;
198 }
199 carry += carrybit;
200
201 ++i;
202 if (i >= BIN_COUNT){
203 //status = Overflow;
204 return;
205 }
206 oldword = cpu::xadd(accumulator[i], carry, overflow);
207 }
208}
210
217inline void Accumulate( int64_t* accumulator, double x) {
218 if (x == 0)
219 return;
220 //assert( !std::isnan(x) && "Detected NaN in dot product!!");
221
222
223 int e = cpu::exponent(x);
224 int exp_word = e / DIGITS; // Word containing MSbit (upper bound)
225 int iup = exp_word + F_WORDS;
226
227 double xscaled = cpu::myldexp(x, -DIGITS * exp_word);
228
229 int i;
230 for (i = iup; i>=0 && xscaled != 0; --i) { //MW: i>=0 protects against NaN
231 double xrounded = cpu::myrint(xscaled);
232 int64_t xint = cpu::myllrint(xscaled);
233 AccumulateWord(accumulator, i, xint);
234
235 xscaled -= xrounded;
236 xscaled *= DELTASCALE;
237 }
238}
239#ifndef _WITHOUT_VCL
240inline void Accumulate( int64_t* accumulator, vcl::Vec8d x) {
241 double v[8];
242 x.store(v);
243
244#if INSTRSET >= 7
245 _mm256_zeroupper();
246#endif
247 for(unsigned int j = 0; j != 8; ++j) {
248 exblas::cpu::Accumulate(accumulator, v[j]);
249 }
250}
251#endif //_WITHOUT_VCL
253// Normalize functions
255// Returns sign
256// Does not really normalize! MW: what does that mean?
257//
267inline bool Normalize( int64_t *accumulator, int& imin, int& imax) {
268 int64_t carry_in = accumulator[imin] >> DIGITS;
269 accumulator[imin] -= carry_in << DIGITS;
270 int i;
271 // Sign-extend all the way
272 for (i = imin + 1; i < BIN_COUNT; ++i) {
273 accumulator[i] += carry_in;
274 int64_t carry_out = accumulator[i] >> DIGITS; // Arithmetic shift
275 accumulator[i] -= (carry_out << DIGITS);
276 carry_in = carry_out;
277 }
278 imax = i - 1;
279
280 // Do not cancel the last carry to avoid losing information
281 accumulator[imax] += carry_in << DIGITS;
282
283 return carry_in < 0;
284}
285
287// Rounding functions
289
290
297inline double Round( int64_t * accumulator) {
298 int imin = IMIN;
299 int imax = IMAX;
300 bool negative = Normalize(accumulator, imin, imax);
301
302 // Find leading word
303 int i;
304 // Skip zeroes
305 for(i = imax; i >= imin && accumulator[i] == 0; --i) {
306 // MW: note that i >= imin has to come *before* accumulator[i]
307 // else it is possible that accumulator[-1] is accessed
308 }
309 if (negative) {
310 // Skip ones
311 for(; i >= imin && (accumulator[i] & ((1ll << DIGITS) - 1)) == ((1ll << DIGITS) - 1); --i) {
312 }
313 }
314 if (i < 0) {
315 return 0.0;
316 }
317
318 int64_t hiword = negative ? ((1ll << DIGITS) - 1) - accumulator[i] : accumulator[i];
319 double rounded = (double)hiword;
320 double hi = ldexp(rounded, (i - F_WORDS) * DIGITS);
321 if (i == 0) {
322 return negative ? -hi : hi; // Correct rounding achieved
323 }
324 hiword -= std::llrint(rounded);
325 double mid = ldexp((double) hiword, (i - F_WORDS) * DIGITS);
326
327 // Compute sticky
328 int64_t sticky = 0;
329 for (int j = imin; j != i - 1; ++j) {
330 sticky |= negative ? ((1ll << DIGITS) - accumulator[j]) : accumulator[j];
331 }
332
333 int64_t loword = negative ? ((1ll << DIGITS) - accumulator[i - 1]) : accumulator[i - 1];
334 loword |= !!sticky;
335 double lo = ldexp((double) loword, (i - 1 - F_WORDS) * DIGITS);
336
337 // Now add3(hi, mid, lo)
338 // No overlap, we have already normalized
339 if (mid != 0) {
340 lo = cpu::OddRoundSumNonnegative(mid, lo);
341 }
342 // Final rounding
343 hi = hi + lo;
344 return negative ? -hi : hi;
345}
346
347}//namespace cpu
348} //namespace exblas
349} //namespace dg
void Accumulate(int64_t *accumulator, double x)
Accumulate a double to the superaccumulator.
Definition accumulate.h:217
double Round(int64_t *accumulator)
Convert a superaccumulator to the nearest double precision number (CPU version)
Definition accumulate.h:297
bool Normalize(int64_t *accumulator, int &imin, int &imax)
Normalize a superaccumulator.
Definition accumulate.h:267