Details on the hardware used to gather the performance data
import pandas as pd
from collections import OrderedDict as odict
# value: (device description, cache size in kB, single-node configuration)
hardware = odict()
hardware['i5'] = ('Intel Core i5-6600 @ 3.30GHz (2x 8GB DDR4, 4 cores)',6144,
'1 MPI task x 4 OpenMP threads (1 per core)')
hardware['skl'] = ('2x Intel Xeon 8160 (Skylake) at 2.10 GHz (12x 16GB DDR4, 2x 24 cores)',2*33000,
'2 MPI tasks (1 per socket) x 24 OpenMP threads (1 per core)')
hardware['knl'] = ('Intel Xeon Phi 7250 (Knights Landing) at 1.40 GHz (16GB MCDRAM, 68 cores)',34000,
'1 MPI task x 136 OpenMP hyperthreads (2 per core)')
hardware['gtx1060'] = ('Nvidia GeForce GTX 1060 (6GB global memory)',1572.864, '1 MPI task per GPU')
hardware['p100'] = ('Nvidia Tesla P100-PCIe (16GB global memory)',4194.304, '1 MPI task per GPU')
hardware['v100'] = ('Nvidia Tesla V100-PCIe (16GB global memory)',6291.456, '1 MPI task per GPU')
hardware['p100nv'] = ('Nvidia Tesla P100-Power8 (16GB global memory)',4194.304, '1 MPI task per GPU')
hardware['v100nv'] = ('Nvidia Tesla V100-Power9 (16GB global memory)',6291.456, '1 MPI task per GPU')
memory = odict()  # values found with 'dmidecode --type 17'
# value: (description, I/O bus clockrate in MHz, buswidth in bit, size in MB)
memory['i5'] = ('2x 8GB Kingston DDR4', 1066, 2*64, 2*8192) #ECC: no (actually it is DDR4-2400 but i5 has max DDR4-2133)
memory['skl'] = ('12x 16GB DDR4',1333,12*64,12*16000) #ECC: ?
memory['knl'] = ('MCDRAM',None,None,16000) #ECC: ?
memory['gtx1060'] = ('on-card global memory',4004,192,6069) #ECC: no
memory['p100'] = ('on-card global memory',715,4096,16276) # ECC: yes
memory['v100'] = ('on-card global memory',877,4096,16152) # ECC: yes
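On Linux, `dmidecode --type 17` typically prints one record per DIMM with `Size`, `Data Width` and `Speed` fields, from which the values above were read off. Note that for DDR memory the reported speed is usually the transfer rate in MT/s, i.e. twice the I/O bus clock used here.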
compiler = odict()
compiler['i5'] = ('mpic++ (gcc-5.4) -mavx -mfma -O3 -fopenmp')
compiler['skl'] = ('mpiicc-17.0.4 -mt_mpi -xCORE-AVX512 -mtune=skylake -O3 -restrict -fp-model precise -fimf-arch-consistency=true -qopenmp')
compiler['knl'] = ('mpiicc-17.0.4 -mt_mpi -xMIC-AVX512 -O3 -restrict -fp-model precise -fimf-arch-consistency=true -qopenmp')
compiler['gtx1060'] = ('nvcc-7.0 --compiler-bindir mpic++ (gcc-5.4) -O3 -arch sm_35')
compiler['p100'] = ('nvcc-8.0 --compiler-bindir mpic++ (gcc-5.4) -O3 -arch sm_60 -Xcompiler "-O3 -mavx -mfma"')
compiler['v100'] = ('nvcc-8.0 --compiler-bindir mpic++ (gcc-5.4) -O3 -arch sm_60 -Xcompiler "-O3 -mavx -mfma"')
df = pd.DataFrame(hardware)
df = df.transpose()
df.columns= ['device-name', 'cache-size-kB','single-node configuration']
com = pd.DataFrame(compiler, index=['compiler flags'])
com = com.transpose()
df = df.join(com)
mem = pd.DataFrame(memory)
mem = mem.transpose()
mem.columns = ['mem-description', 'clockrate-MHz', 'buswidth-bit', 'size-MB']
df = df.join(mem)
#df
From the available data we can compute the theoretical memory bandwidth via $bw = 2 \cdot clockrate \cdot buswidth$, where the factor 2 accounts for the double data rate (DDR); in the code below we additionally divide by 8 to convert bits to bytes and scale the result to GB/s.
df['bandwidth'] = 2*df['clockrate-MHz']*1e6*df['buswidth-bit']/8/1e9
#df
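As a quick sanity check, for the i5 this gives

$bw = 2 \cdot 1066\,\mathrm{MHz} \cdot 128\,\mathrm{bit} / 8 \approx 34.1\,\mathrm{GB/s},$

which matches the `bandwidth` column in the table below.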
Let us compare the theoretical bandwidth with the peak bandwidth we previously measured with the axpby benchmark:
exp = pd.read_csv('performance.csv',delimiter=' ')
exp.set_index('arch',inplace=True)
exp.index.name = None
df = df.join(exp['axpby_bw'])
df['mem_efficiency']=df['axpby_bw']/df['bandwidth']*100
pd.set_option('display.float_format', lambda x: '%.2f' % x)
df
|  | device-name | cache-size-kB | single-node configuration | compiler flags | mem-description | clockrate-MHz | buswidth-bit | size-MB | bandwidth | axpby_bw | mem_efficiency |
|---|---|---|---|---|---|---|---|---|---|---|---|
| i5 | Intel Core i5-6600 @ 3.30GHz (2x 8GB DDR4, 4 c... | 6144 | 1 MPI task x 4 OpenMP threads (1 per core) | mpic++ (gcc-5.4) -mavx -mfma -O3 -fopenmp | 2x 8GB Kingston DDR4 | 1066 | 128 | 16384 | 34.11 | 29.99 | 87.90 |
| skl | 2x Intel Xeon 8160 (Skylake) at 2.10 GHz (12x ... | 66000 | 2 MPI tasks (1 per socket) x 24 OpenMP threads... | mpiicc-17.0.4 -mt_mpi -xCORE-AVX512 -mtune=sky... | 12x 16GB DDR4 | 1333 | 768 | 192000 | 255.94 | 206.71 | 80.77 |
| knl | Intel Xeon Phi 7250 (Knights Landing) at 1.40 ... | 34000 | 1 MPI task x 136 OpenMP hyperthreads (2 per core) | mpiicc-17.0.4 -mt_mpi -xMIC-AVX512 -O3 -restri... | MCDRAM | None | None | 16000 | NaN | 393.15 | NaN |
| gtx1060 | Nvidia GeForce GTX 1060 (6GB global memory) | 1572.86 | 1 MPI task per GPU | nvcc-7.0 --compiler-bindir mpic++ (gcc-5.4) -O... | on-card global memory | 4004 | 192 | 6069 | 192.19 | 157.05 | 81.71 |
| p100 | Nvidia Tesla P100-PCIe (16GB global memory) | 4194.30 | 1 MPI task per GPU | nvcc-8.0 --compiler-bindir mpic++ (gcc-5.4) -O... | on-card global memory | 715 | 4096 | 16276 | 732.16 | 550.51 | 75.19 |
| v100 | Nvidia Tesla V100-PCIe (16GB global memory) | 6291.46 | 1 MPI task per GPU | nvcc-8.0 --compiler-bindir mpic++ (gcc-5.4) -O... | on-card global memory | 877 | 4096 | 16152 | 898.05 | 846.42 | 94.25 |
| p100nv | Nvidia Tesla P100-Power8 (16GB global memory) | 4194.30 | 1 MPI task per GPU | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| v100nv | Nvidia Tesla V100-Power9 (16GB global memory) | 6291.46 | 1 MPI task per GPU | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
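The measured `axpby_bw` values come from the compiled benchmarks listed above, and `mem_efficiency` is simply their ratio to the theoretical peak in percent. To illustrate how such a figure is obtained, here is a minimal numpy sketch (hypothetical, not the actual benchmark code) counting the bytes moved by $y \leftarrow \alpha x + \beta y$: three doubles per element (read $x$, read $y$, write $y$).

import time
import numpy as np

def axpby_bandwidth_gbs(n=2**24, alpha=1.5, beta=0.5, reps=10):
    # y = alpha*x + beta*y touches 3 doubles per element: read x, read y, write y
    x = np.random.rand(n)
    y = np.random.rand(n)
    best = float('inf')
    for _ in range(reps):
        t0 = time.perf_counter()
        y = alpha * x + beta * y  # numpy temporaries add extra traffic, so this underestimates
        best = min(best, time.perf_counter() - t0)
    bytes_moved = 3 * n * 8  # two reads + one write, 8 bytes per double
    return bytes_moved / best / 1e9

print('estimated axpby bandwidth: %.1f GB/s' % axpby_bandwidth_gbs())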
Let us write a summary LaTeX table to be used for publication:
pd.set_option('precision', 3)
pd.set_option('display.max_colwidth', 200)
df.loc['knl', 'bandwidth'] = '$>$400'  # escaped placeholder; only appears if the bandwidth column is re-enabled below
file = df.loc[:, ['device-name', 'single-node configuration']]  # , 'bandwidth']]
file.columns = ['device description', 'single-node configuration']  # , 'bandwidth [GB/s]']
filename = 'hardware.tex'
with open(filename, 'w', encoding='UTF-8') as f:
    f.write(file.to_latex(
        column_format='@{}lp{6.5cm}p{5cm}@{}',
        bold_rows=True))
file
|  | device description | single-node configuration |
|---|---|---|
| i5 | Intel Core i5-6600 @ 3.30GHz (2x 8GB DDR4, 4 cores) | 1 MPI task x 4 OpenMP threads (1 per core) |
| skl | 2x Intel Xeon 8160 (Skylake) at 2.10 GHz (12x 16GB DDR4, 2x 24 cores) | 2 MPI tasks (1 per socket) x 24 OpenMP threads (1 per core) |
| knl | Intel Xeon Phi 7250 (Knights Landing) at 1.40 GHz (16GB MCDRAM, 68 cores) | 1 MPI task x 136 OpenMP hyperthreads (2 per core) |
| gtx1060 | Nvidia GeForce GTX 1060 (6GB global memory) | 1 MPI task per GPU |
| p100 | Nvidia Tesla P100-PCIe (16GB global memory) | 1 MPI task per GPU |
| v100 | Nvidia Tesla V100-PCIe (16GB global memory) | 1 MPI task per GPU |
| p100nv | Nvidia Tesla P100-Power8 (16GB global memory) | 1 MPI task per GPU |
| v100nv | Nvidia Tesla V100-Power9 (16GB global memory) | 1 MPI task per GPU |
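The generated `hardware.tex` contains just the tabular itself (to_latex does not emit a surrounding table environment), so it can be pulled into the paper with `\input{hardware.tex}` inside a `table` environment that supplies the caption and label.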