Details on the hardware used to gather the performance data

import pandas as pd
from collections import OrderedDict as odict
# device name, cache size (kB), single-node configuration
hardware = odict({})
hardware['i5']  = ('Intel Core i5-6600 @ 3.30GHz (2x 8GB DDR4, 4 cores)',6144, 
                  '1 MPI task x 4 OpenMP threads (1 per core)')
hardware['skl'] = ('2x Intel Xeon 8160 (Skylake) at 2.10 GHz (12x 16GB DDR4, 2x 24 cores)',2*33000, 
                   '2 MPI tasks (1 per socket) x 24 OpenMP threads (1 per core)')
hardware['knl'] = ('Intel Xeon Phi 7250 (Knights Landing) at 1.40 GHz (16GB MCDRAM, 68 cores)',34000, 
                   '1 MPI task x 136 OpenMP hyperthreads (2 per core)')
hardware['gtx1060'] = ('Nvidia GeForce GTX 1060 (6GB global memory)',1572.864, '1 MPI task per GPU')
hardware['p100']    = ('Nvidia Tesla P100-PCIe (16GB global memory)',4194.304, '1 MPI task per GPU')
hardware['v100']    = ('Nvidia Tesla V100-PCIe (16GB global memory)',6291.456, '1 MPI task per GPU')
hardware['p100nv']    = ('Nvidia Tesla P100-Power8 (16GB global memory)',4194.304, '1 MPI task per GPU')
hardware['v100nv']    = ('Nvidia Tesla V100-Power9 (16GB global memory)',6291.456, '1 MPI task per GPU')

memory = odict({})  # find with 'dmidecode --type 17'
# description, I/O bus clockrate (MHz), buswidth (bit), size (MB)
memory['i5'] = ('2x 8GB Kingston DDR4', 1066, 2*64, 2*8192) #ECC: no (actually it is DDR4-2400 but i5 has max DDR4-2133)
memory['skl'] = ('12x 16GB DDR4',1333,12*64,12*16000) #ECC:  ?
memory['knl'] = ('MCDRAM',None,None,16000) #ECC: ?
memory['gtx1060'] = ('on-card global memory',4004,192,6069) #ECC: no
memory['p100'] = ('on-card global memory',715,4096,16276) # ECC: yes
memory['v100'] = ('on-card global memory',877,4096,16152) # ECC: yes
compiler=odict({})
compiler['i5']  = ('mpic++ (gcc-5.4) -mavx -mfma -O3 -fopenmp')
compiler['skl'] = ('mpiicc-17.0.4 -mt_mpi -xCORE-AVX512 -mtune=skylake -O3 -restrict -fp-model precise -fimf-arch-consistency=true -qopenmp')
compiler['knl'] = ('mpiicc-17.0.4 -mt_mpi -xMIC-AVX512 -O3 -restrict -fp-model precise -fimf-arch-consistency=true -qopenmp')
compiler['gtx1060'] = ('nvcc-7.0 --compiler-bindir mpic++ (gcc-5.4) -O3 -arch sm_35')
compiler['p100']    = ('nvcc-8.0 --compiler-bindir mpic++ (gcc-5.4) -O3 -arch sm_60 -Xcompiler "-O3 -mavx -mfma"')
compiler['v100']    = ('nvcc-8.0 --compiler-bindir mpic++ (gcc-5.4) -O3 -arch sm_60 -Xcompiler "-O3 -mavx -mfma"')
df = pd.DataFrame(hardware)
df = df.transpose()
df.columns= ['device-name', 'cache-size-kB','single-node configuration']
com = pd.DataFrame(compiler, index=['compiler flags'])
com = com.transpose()
com
df = df.join(com)
mem = pd.DataFrame(memory)
mem = mem.transpose()
mem.columns = ['mem-description', 'clockrate-MHz', 'buswidth-bit', 'size-MB']
df=df.join(mem)
#df

From the available data we can compute the theoretical memory bandwidth via $bw = 2 \cdot clockrate \cdot buswidth$, where the factor 2 accounts for the double data rate (DDR). Converting the bus width from bit to byte and the clock rate from MHz to Hz yields the bandwidth in GB/s.

# theoretical bandwidth in GB/s: clockrate [MHz] -> 1/s, buswidth [bit] -> byte
df['bandwidth'] = 2*df['clockrate-MHz']*1e6*df['buswidth-bit']/8/1e9
#df
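
For example, for the i5 with its 1066 MHz I/O clock and two 64-bit channels the formula gives $2 \cdot 1066\,\mathrm{MHz} \cdot 128\,\mathrm{bit} \approx 34.1$ GB/s, which can be checked directly:

# quick sanity check of the bandwidth formula for the i5 entry
# 2 * 1066e6 1/s * 128 bit / 8 bit-per-byte = 34.112e9 byte/s
print(2 * 1066e6 * 128 / 8 / 1e9)   # ~ 34.11 GB/s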

Let us compare the theoretical bandwidth with the peak bandwidth we previously measured with the axpby benchmark.
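
The code below assumes performance.csv to be a space-separated file with an 'arch' column that matches the index keys used above and an 'axpby_bw' column holding the measured bandwidth in GB/s, roughly of the form (illustrative rows using the values that appear in the table further below; the actual file may contain more columns and more digits):

arch axpby_bw
i5 29.99
v100 846.42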

exp = pd.read_csv('performance.csv',delimiter=' ')
exp.set_index('arch',inplace=True)
exp.index.name = None
df = df.join(exp['axpby_bw'])
# memory efficiency: measured / theoretical bandwidth in percent
df['mem_efficiency']=df['axpby_bw']/df['bandwidth']*100
pd.set_option('display.float_format', lambda x: '%.2f' % x)
df
(text columns device-name, single-node configuration, compiler flags and mem-description omitted here; they are exactly as defined in the code above; bandwidth and axpby_bw in GB/s, mem_efficiency in %)

         cache-size-kB  clockrate-MHz  buswidth-bit  size-MB  bandwidth  axpby_bw  mem_efficiency
i5                6144           1066           128    16384      34.11     29.99           87.90
skl              66000           1333           768   192000     255.94    206.71           80.77
knl              34000           None          None    16000        NaN    393.15             NaN
gtx1060        1572.86           4004           192     6069     192.19    157.05           81.71
p100           4194.30            715          4096    16276     732.16    550.51           75.19
v100           6291.46            877          4096    16152     898.05    846.42           94.25
p100nv         4194.30            NaN           NaN      NaN        NaN       NaN             NaN
v100nv         6291.46            NaN           NaN      NaN        NaN       NaN             NaN

Let us write a summarized LaTeX table to be used for publication.

pd.set_option('display.precision', 3)
file = df.loc[:,['device-name','single-node configuration']]#,'bandwidth']]
#file.loc['knl','bandwidth'] = '>400'
file.columns = ['device description', 'single-node configuration']#, 'bandwidth [GB/s]']
filename='hardware.tex'
df.loc['knl','bandwidth'] = '$>$400' # MCDRAM bandwidth is not computed above; insert a lower bound by hand
pd.set_option('display.max_colwidth', 200)
with open(filename, 'wb') as f:
    f.write(bytes(file.to_latex(
        column_format='@{}lp{6.5cm}p{5cm}@{}',
        bold_rows=True),'UTF-8'))
file
         device description                                                         single-node configuration
i5       Intel Core i5-6600 @ 3.30GHz (2x 8GB DDR4, 4 cores)                        1 MPI task x 4 OpenMP threads (1 per core)
skl      2x Intel Xeon 8160 (Skylake) at 2.10 GHz (12x 16GB DDR4, 2x 24 cores)      2 MPI tasks (1 per socket) x 24 OpenMP threads (1 per core)
knl      Intel Xeon Phi 7250 (Knights Landing) at 1.40 GHz (16GB MCDRAM, 68 cores)  1 MPI task x 136 OpenMP hyperthreads (2 per core)
gtx1060  Nvidia GeForce GTX 1060 (6GB global memory)                                1 MPI task per GPU
p100     Nvidia Tesla P100-PCIe (16GB global memory)                                1 MPI task per GPU
v100     Nvidia Tesla V100-PCIe (16GB global memory)                                1 MPI task per GPU
p100nv   Nvidia Tesla P100-Power8 (16GB global memory)                              1 MPI task per GPU
v100nv   Nvidia Tesla V100-Power9 (16GB global memory)                              1 MPI task per GPU
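
The generated hardware.tex should then contain a booktabs tabular roughly of the following form (a sketch assuming pandas' default to_latex layout with the column format and bold row labels requested above; exact spacing and escaping depend on the pandas version):

\begin{tabular}{@{}lp{6.5cm}p{5cm}@{}}
\toprule
{} & device description & single-node configuration \\
\midrule
\textbf{i5} & Intel Core i5-6600 @ 3.30GHz (2x 8GB DDR4, 4 cores) & 1 MPI task x 4 OpenMP threads (1 per core) \\
\textbf{skl} & 2x Intel Xeon 8160 (Skylake) at 2.10 GHz (12x 16GB DDR4, 2x 24 cores) & 2 MPI tasks (1 per socket) x 24 OpenMP threads (1 per core) \\
... (one row per device) ...
\bottomrule
\end{tabular}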