Extension: ExBLAS
#include "dg/algorithm.h" (or as a standalone library as "dg/exblas/exblas.h")
accumulate.h
Go to the documentation of this file.
1/*
2 * %%%%%%%%%%%%%%%%%%%%%%%Original development%%%%%%%%%%%%%%%%%%%%%%%%%
3 * Copyright (c) 2016 Inria and University Pierre and Marie Curie
4 * All rights reserved.
5 * %%%%%%%%%%%%%%%%%%%%%%%Modifications and further additions%%%%%%%%%%
6 * Matthias Wiesenberger, 2017, within FELTOR and EXBLAS licenses
7 */
18#pragma once
19#include "config.h"
20#include "mylibm.hpp"
21//this file has a direct correspondance to gpu code accumulate.cuh
22
23namespace dg
24{
25namespace exblas {
26namespace cpu {
29//********* Here, the change from float to double happens ***************//
31#ifndef _WITHOUT_VCL
32static inline vcl::Vec8d make_vcl_vec8d( double x, int i){
33 return vcl::Vec8d(x);
34}
35static inline vcl::Vec8d make_vcl_vec8d( const double* x, int i){
36 return vcl::Vec8d().load( x+i);
37}
38static inline vcl::Vec8d make_vcl_vec8d( double x, int i, int num){
39 return vcl::Vec8d(x);
40}
41static inline vcl::Vec8d make_vcl_vec8d( const double* x, int i, int num){
42 return vcl::Vec8d().load_partial( num, x+i);
43}
44static inline vcl::Vec8d make_vcl_vec8d( float x, int i){
45 return vcl::Vec8d((double)x);
46}
47static inline vcl::Vec8d make_vcl_vec8d( const float* x, int i){
48 return vcl::Vec8d( x[i], x[i+1], x[i+2], x[i+3], x[i+4], x[i+5], x[i+6], x[i+7]);
49}
50static inline vcl::Vec8d make_vcl_vec8d( float x, int i, int num){
51 return vcl::Vec8d((double)x);
52}
53static inline vcl::Vec8d make_vcl_vec8d( const float* x, int i, int num){
54 double tmp[8];
55 for(int j=0; j<num; j++)
56 tmp[j] = (double)x[i+j];
57 return vcl::Vec8d().load_partial( num, tmp);
58}
59#endif//_WITHOUT_VCL
60template<class T>
61inline double get_element( T x, int i){
62 return (double)x;
63}
64template<class T>
65inline double get_element( T* x, int i){
66 return (double)(*(x+i));
67}
68
69
71// Main computation pass: compute partial superaccs
73static inline void AccumulateWord( int64_t *accumulator, int i, int64_t x) {
74 // With atomic accumulator updates
75 // accumulation and carry propagation can happen in any order,
76 // as long as addition is atomic
77 // only constraint is: never forget an overflow bit
78 //assert(i >= 0 && i < BIN_COUNT);
79 unsigned char overflow;
80 int64_t carry = x;
81 int64_t carrybit;
82 int64_t oldword = cpu::xadd(accumulator[i], x, overflow);
83 // To propagate over- or underflow
84 while(unlikely(overflow)) {
85 // Carry or borrow
86 // oldword has sign S
87 // x has sign S
88 // accumulator[i] has sign !S (just after update)
89 // carry has sign !S
90 // carrybit has sign S
91 carry = (oldword + carry) >> DIGITS; // Arithmetic shift
92 bool s = oldword > 0;
93 carrybit = (s ? 1ll << KRX : (unsigned long long)(-1ll) << KRX); //MW left shift of negative number is undefined so convert to unsigned
94
95 // Cancel carry-save bits
96 cpu::xadd(accumulator[i], (int64_t) -(carry << DIGITS), overflow);
97 if(TSAFE && unlikely(s ^ overflow)) {
98 // (Another) overflow of sign S
99 carrybit *= 2;
100 }
101 carry += carrybit;
102
103 ++i;
104 if (i >= BIN_COUNT){
105 //status = Overflow;
106 return;
107 }
108 oldword = cpu::xadd(accumulator[i], carry, overflow);
109 }
110}
112
120static inline void Accumulate( int64_t* accumulator, double x) {
121 if (x == 0)
122 return;
123 //assert( !std::isnan(x) && "Detected NaN in dot product!!");
124
125
126 int e = cpu::exponent(x);
127 int exp_word = e / DIGITS; // Word containing MSbit (upper bound)
128 int iup = exp_word + F_WORDS;
129
130 double xscaled = cpu::myldexp(x, -DIGITS * exp_word);
131
132 int i;
133 for (i = iup; i>=0 && xscaled != 0; --i) { //MW: i>=0 protects against NaN
134 double xrounded = cpu::myrint(xscaled);
135 int64_t xint = cpu::myllrint(xscaled);
136 AccumulateWord(accumulator, i, xint);
137
138 xscaled -= xrounded;
139 xscaled *= DELTASCALE;
140 }
141}
142#ifndef _WITHOUT_VCL
143static inline void Accumulate( int64_t* accumulator, vcl::Vec8d x) {
144 double v[8];
145 x.store(v);
146
147#if INSTRSET >= 7
148 _mm256_zeroupper();
149#endif
150 for(unsigned int j = 0; j != 8; ++j) {
151 exblas::cpu::Accumulate(accumulator, v[j]);
152 }
153}
154#endif //_WITHOUT_VCL
156// Normalize functions
158// Returns sign
159// Does not really normalize! MW: what does that mean?
160//
171static inline bool Normalize( int64_t *accumulator, int& imin, int& imax) {
172 int64_t carry_in = accumulator[imin] >> DIGITS;
173 accumulator[imin] -= carry_in << DIGITS;
174 int i;
175 // Sign-extend all the way
176 for (i = imin + 1; i < BIN_COUNT; ++i) {
177 accumulator[i] += carry_in;
178 int64_t carry_out = accumulator[i] >> DIGITS; // Arithmetic shift
179 accumulator[i] -= (carry_out << DIGITS);
180 carry_in = carry_out;
181 }
182 imax = i - 1;
183
184 // Do not cancel the last carry to avoid losing information
185 accumulator[imax] += carry_in << DIGITS;
186
187 return carry_in < 0;
188}
189
191// Rounding functions
193
194
202static inline double Round( int64_t * accumulator) {
203 int imin = IMIN;
204 int imax = IMAX;
205 bool negative = Normalize(accumulator, imin, imax);
206
207 // Find leading word
208 int i;
209 // Skip zeroes
210 for(i = imax; i >= imin && accumulator[i] == 0; --i) {
211 // MW: note that i >= imin has to come *before* accumulator[i]
212 // else it is possible that accumulator[-1] is accessed
213 }
214 if (negative) {
215 // Skip ones
216 for(; i >= imin && (accumulator[i] & ((1ll << DIGITS) - 1)) == ((1ll << DIGITS) - 1); --i) {
217 }
218 }
219 if (i < 0) {
220 return 0.0;
221 }
222
223 int64_t hiword = negative ? ((1ll << DIGITS) - 1) - accumulator[i] : accumulator[i];
224 double rounded = (double)hiword;
225 double hi = ldexp(rounded, (i - F_WORDS) * DIGITS);
226 if (i == 0) {
227 return negative ? -hi : hi; // Correct rounding achieved
228 }
229 hiword -= std::llrint(rounded);
230 double mid = ldexp((double) hiword, (i - F_WORDS) * DIGITS);
231
232 // Compute sticky
233 int64_t sticky = 0;
234 for (int j = imin; j != i - 1; ++j) {
235 sticky |= negative ? ((1ll << DIGITS) - accumulator[j]) : accumulator[j];
236 }
237
238 int64_t loword = negative ? ((1ll << DIGITS) - accumulator[i - 1]) : accumulator[i - 1];
239 loword |= !!sticky;
240 double lo = ldexp((double) loword, (i - 1 - F_WORDS) * DIGITS);
241
242 // Now add3(hi, mid, lo)
243 // No overlap, we have already normalized
244 if (mid != 0) {
245 lo = cpu::OddRoundSumNonnegative(mid, lo);
246 }
247 // Final rounding
248 hi = hi + lo;
249 return negative ? -hi : hi;
250}
251
252}//namespace cpu
253} //namespace exblas
254} //namespace dg
static double Round(int64_t *accumulator)
Convert a superaccumulator to the nearest double precision number (CPU version)
Definition: accumulate.h:202
static void Accumulate(int64_t *accumulator, double x)
Accumulate a double to the superaccumulator.
Definition: accumulate.h:120
static bool Normalize(int64_t *accumulator, int &imin, int &imax)
Normalize a superaccumulator.
Definition: accumulate.h:171