{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### A test of the performance prediction model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from collections import OrderedDict as odict\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.cm as cm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Benchmark tag -> (hardware name, number of nodes).\n",
    "# sorted() is stable, so ordering by the number of nodes puts all single-node\n",
    "# entries first (in the order listed); this makes labeling easier further down\n",
    "files = odict(sorted([\n",
    "    ('i5', ('i5', 1)),\n",
    "    ('gtx1060', ('gtx1060', 1)),\n",
    "    ('skl_mpi1', ('skl', 1)),\n",
    "    ('skl_mpi2', ('skl', 2)),\n",
    "    ('skl_mpi4', ('skl', 4)),\n",
    "    ('knl_mpi1', ('knl', 1)),\n",
    "    ('knl_mpi2', ('knl', 2)),\n",
    "    ('knl_mpi4', ('knl', 4)),\n",
    "    ('p100nv_mpi1', ('p100', 1)),\n",
    "    ('p100nv_mpi2', ('p100', 2)),\n",
    "    ('p100nv_mpi4', ('p100', 4)),\n",
    "    ('v100nv_mpi1', ('v100', 1)),\n",
    "    ('v100nv_mpi2', ('v100', 2)),\n",
    "    ('v100nv_mpi4', ('v100', 4)),\n",
    "], key=lambda entry: entry[1][1]))\n",
    "# count the single-node entries\n",
    "number = sum(1 for _, nodes in files.values() if nodes == 1)\n",
    "# plotting specifications per hardware name; the first entry is the colormap\n",
    "# NOTE(review): the three numbers are not used in this notebook -- meaning to be confirmed\n",
    "arch = {\n",
    "    'knl': (cm.Greens, 450, 0.5, 0.33),\n",
    "    'skl': (cm.Greys, 200, 0.5, 0.75),\n",
    "    'p100': (cm.Blues, 550, 0.5, 0.43),\n",
    "    'v100': (cm.Purples, 850, 0.5, 0.85),\n",
    "    'i5': (cm.Wistia, 30, 0.5, 0.79),\n",
    "    'gtx1060': (cm.Oranges, 155, 0.5, 0.70),\n",
    "}\n",
    "# position in the colormap for each number of nodes\n",
    "intens = {1: 0.8, 2: 0.6, 4: 0.4}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here, we set up the prediction model by giving the number of function calls and memory operations of each of the three types of primitive functions axpby, dot and dxdy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of function calls per algorithm as (axpby, dot, dxdy); N is their sum\n",
    "latencies = odict([\n",
    "    ('scal', (1, 0, 0)),\n",
    "    ('axpby', (1, 0, 0)),\n",
    "    ('pointwiseDot', (1, 0, 0)),\n",
    "    ('dot', (0, 1, 0)),\n",
    "    ('dx', (0, 0, 1)),\n",
    "    ('dy', (0, 0, 1)),\n",
    "    ('arakawa', (3, 0, 6)),  # N = 9\n",
    "    ('cg', (6, 2, 6)),  # N = 14\n",
    "    ('avg', (9, 2, 12)),  # N = 23 (arakawa + cg)\n",
    "])\n",
    "# Number of memory operations per algorithm as (axpby, dot, dxdy); M is their sum\n",
    "memops = odict([\n",
    "    ('scal', (2, 0, 0)),\n",
    "    ('axpby', (3, 0, 0)),\n",
    "    ('pointwiseDot', (6, 0, 0)),\n",
    "    ('dot', (0, 2, 0)),\n",
    "    ('dx', (0, 0, 3)),\n",
    "    ('dy', (0, 0, 3)),\n",
    "    ('arakawa', (16, 0, 18)),  # M = 34 -> M/N = 3.78\n",
    "    ('cg', (20, 4, 18)),  # M = 42 -> M/N = 3.00\n",
    "    ('avg', (36, 4, 36)),  # M = 76 -> M/N = 3.30\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let us read in the previously measured bandwidths and latencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
axpby_bwaxpby_bw_errdot_bwdot_bw_errdxdy2_bwdxdy2_bw_errdxdy3_bwdxdy3_bw_errdxdy4_bwdxdy4_bw_err...axpby_lat_distaxpby_lat_dist_errdot_lat_shareddot_lat_shared_errdot_lat_distdot_lat_dist_errdxdy_lat_shareddxdy_lat_shared_errdxdy_lat_distdxdy_lat_dist_err
i529.990.199.310.0427.792.9729.122.8425.581.49...nannan4.760.23nannan0.001.44nannan
gtx1060157.050.0626.500.10130.630.40111.231.1183.8213.83...nannan92.068.70nannan0.000.82nannan
skl206.715.87192.0518.31181.5635.38161.7513.00118.0618.39...0.000.2617.282.3237.934.1422.702.1128.522.10
knl393.1522.19141.366.63239.0417.02172.6926.80126.0418.59...9.160.0954.831.79119.595.149.930.7052.673.72
p100550.511.23375.611.94293.257.11238.9912.63208.447.05...0.000.2750.897.0651.670.5926.230.0554.400.35
titanXp431.243.4561.370.12372.854.16308.929.47246.737.92...nannan44.375.15nannan2.380.57nannan
v100846.420.95610.155.99794.4320.52735.4233.02696.4915.14...0.000.3188.494.6897.580.794.200.0237.190.42
\n", "

7 rows × 24 columns

\n", "
" ], "text/plain": [ " axpby_bw axpby_bw_err dot_bw dot_bw_err dxdy2_bw dxdy2_bw_err \\\n", "i5 29.99 0.19 9.31 0.04 27.79 2.97 \n", "gtx1060 157.05 0.06 26.50 0.10 130.63 0.40 \n", "skl 206.71 5.87 192.05 18.31 181.56 35.38 \n", "knl 393.15 22.19 141.36 6.63 239.04 17.02 \n", "p100 550.51 1.23 375.61 1.94 293.25 7.11 \n", "titanXp 431.24 3.45 61.37 0.12 372.85 4.16 \n", "v100 846.42 0.95 610.15 5.99 794.43 20.52 \n", "\n", " dxdy3_bw dxdy3_bw_err dxdy4_bw dxdy4_bw_err ... axpby_lat_dist \\\n", "i5 29.12 2.84 25.58 1.49 ... nan \n", "gtx1060 111.23 1.11 83.82 13.83 ... nan \n", "skl 161.75 13.00 118.06 18.39 ... 0.00 \n", "knl 172.69 26.80 126.04 18.59 ... 9.16 \n", "p100 238.99 12.63 208.44 7.05 ... 0.00 \n", "titanXp 308.92 9.47 246.73 7.92 ... nan \n", "v100 735.42 33.02 696.49 15.14 ... 0.00 \n", "\n", " axpby_lat_dist_err dot_lat_shared dot_lat_shared_err dot_lat_dist \\\n", "i5 nan 4.76 0.23 nan \n", "gtx1060 nan 92.06 8.70 nan \n", "skl 0.26 17.28 2.32 37.93 \n", "knl 0.09 54.83 1.79 119.59 \n", "p100 0.27 50.89 7.06 51.67 \n", "titanXp nan 44.37 5.15 nan \n", "v100 0.31 88.49 4.68 97.58 \n", "\n", " dot_lat_dist_err dxdy_lat_shared dxdy_lat_shared_err \\\n", "i5 nan 0.00 1.44 \n", "gtx1060 nan 0.00 0.82 \n", "skl 4.14 22.70 2.11 \n", "knl 5.14 9.93 0.70 \n", "p100 0.59 26.23 0.05 \n", "titanXp nan 2.38 0.57 \n", "v100 0.79 4.20 0.02 \n", "\n", " dxdy_lat_dist dxdy_lat_dist_err \n", "i5 nan nan \n", "gtx1060 nan nan \n", "skl 28.52 2.10 \n", "knl 52.67 3.72 \n", "p100 54.40 0.35 \n", "titanXp nan nan \n", "v100 37.19 0.42 \n", "\n", "[7 rows x 24 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "theo = pd.read_csv('performance.csv',delimiter=' ')\n", "theo.set_index('arch',inplace=True)\n", "theo.index.name = None\n", "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n", "theo" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#define conversion 
function \n", "def toString(x): \n", " if pd.isnull(x) : return 'n/a'\n", " #string = '%.1f'% x\n", " string = '%d' %np.ceil(x)\n", " #if np.ceil(x)<100 : string = '0'+string\n", " if np.ceil(x)<10 : string = '0'+string\n", " return string" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the followin cell we construct a table that shows the average bandwiths and latencies among a typical selection of primitive algorithms. " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
B(P=2) [GB/s]B(P=3) [GB/s]B(P=4) [GB/s]B(P=5) [GB/s]$T_{lat}(1)$ [$\\mu$s]$T_{lat}(4)$ [$\\mu$s]
i526 $\\pm$ 0227 $\\pm$ 0226 $\\pm$ 0123 $\\pm$ 0201 $\\pm$ 01n/a
gtx1060116 $\\pm$ 01108 $\\pm$ 0194 $\\pm$ 0985 $\\pm$ 1209 $\\pm$ 01n/a
skl194 $\\pm$ 20183 $\\pm$ 09153 $\\pm$ 15147 $\\pm$ 0714 $\\pm$ 0219 $\\pm$ 02
knl281 $\\pm$ 13232 $\\pm$ 24188 $\\pm$ 20160 $\\pm$ 1813 $\\pm$ 0142 $\\pm$ 02
titanXp310 $\\pm$ 02287 $\\pm$ 04259 $\\pm$ 05230 $\\pm$ 2106 $\\pm$ 01n/a
p100383 $\\pm$ 06336 $\\pm$ 12306 $\\pm$ 08267 $\\pm$ 2122 $\\pm$ 0133 $\\pm$ 01
v100806 $\\pm$ 11776 $\\pm$ 18755 $\\pm$ 09691 $\\pm$ 4311 $\\pm$ 0128 $\\pm$ 01
\n", "
" ], "text/plain": [ " B(P=2) [GB/s] B(P=3) [GB/s] B(P=4) [GB/s] B(P=5) [GB/s] \\\n", "i5 26 $\\pm$ 02 27 $\\pm$ 02 26 $\\pm$ 01 23 $\\pm$ 02 \n", "gtx1060 116 $\\pm$ 01 108 $\\pm$ 01 94 $\\pm$ 09 85 $\\pm$ 12 \n", "skl 194 $\\pm$ 20 183 $\\pm$ 09 153 $\\pm$ 15 147 $\\pm$ 07 \n", "knl 281 $\\pm$ 13 232 $\\pm$ 24 188 $\\pm$ 20 160 $\\pm$ 18 \n", "titanXp 310 $\\pm$ 02 287 $\\pm$ 04 259 $\\pm$ 05 230 $\\pm$ 21 \n", "p100 383 $\\pm$ 06 336 $\\pm$ 12 306 $\\pm$ 08 267 $\\pm$ 21 \n", "v100 806 $\\pm$ 11 776 $\\pm$ 18 755 $\\pm$ 09 691 $\\pm$ 43 \n", "\n", " $T_{lat}(1)$ [$\\mu$s] $T_{lat}(4)$ [$\\mu$s] \n", "i5 01 $\\pm$ 01 n/a \n", "gtx1060 09 $\\pm$ 01 n/a \n", "skl 14 $\\pm$ 02 19 $\\pm$ 02 \n", "knl 13 $\\pm$ 01 42 $\\pm$ 02 \n", "titanXp 06 $\\pm$ 01 n/a \n", "p100 22 $\\pm$ 01 33 $\\pm$ 01 \n", "v100 11 $\\pm$ 01 28 $\\pm$ 01 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines=[]\n", "#now compute and plot the prediction \n", "archs = ['i5','gtx1060','skl','knl','titanXp','p100','v100']\n", "for k in archs : \n", " line =[]\n", " for q,l in latencies.items():\n", " m = memops[q]\n", " M = m[0]+m[1]+m[2]\n", " for n in [2,3,4,5]:\n", " bw = [theo.loc[k,'axpby_bw'],theo.loc[k,'dot_bw'],theo.loc[k,'dxdy'+str(n)+'_bw']]\n", " err_bw = [theo.loc[k,'axpby_bw_err'],theo.loc[k,'dot_bw_err'],theo.loc[k,'dxdy'+str(n)+'_bw_err']]\n", " bandwidth = M/(m[0]/bw[0] + m[1]/bw[1] + m[2]/bw[2])\n", " err_bandwidth = bandwidth/(m[0]/bw[0] + m[1]/bw[1] + m[2]/bw[2])*np.sqrt(\n", " (m[0]/bw[0]**2*err_bw[0])**2 + (m[1]/bw[1]**2*err_bw[1])**2 + (m[2]/bw[2]**2*err_bw[2])**2 )\n", " line.append( toString( bandwidth)+\" $\\pm$ \"+toString(err_bandwidth))\n", " L = l[0]+l[1]+l[2]\n", " for dist in ['shared','dist']:\n", " lat = [theo.loc[k,'axpby_lat_'+dist], theo.loc[k,'dot_lat_'+dist], theo.loc[k,'dxdy_lat_'+dist]]\n", " err_lat = [theo.loc[k,'axpby_lat_'+dist+'_err'], theo.loc[k,'dot_lat_'+dist+'_err'], 
theo.loc[k,'dxdy_lat_'+dist+'_err']]\n", " latency = ( l[0]*lat[0]+ l[1]* lat[1] + l[2]*lat[2])/L #in us\n", " err_latency = np.sqrt( (l[0]*err_lat[0])**2 + (l[1]*err_lat[1])**2 + (l[2]*err_lat[2])**2 )/L\n", " if (dist == 'dist') and ((k == 'i5') or (k=='gtx1060') or (k=='titanXp')):\n", " line.append(toString( None))\n", " else: line.append(toString( latency)+\" $\\pm$ \"+toString(err_latency))\n", " #print(q,latency)\n", "\n", " lines.append(line)\n", "index = archs\n", "tuples=[] \n", "for p in latencies.keys():\n", " for q in ['B(P=2) [GB/s]','B(P=3) [GB/s]','B(P=4) [GB/s]','B(P=5) [GB/s]',\n", " '$T_{lat}(1)$ [$\\mu$s]','$T_{lat}(4)$ [$\\mu$s]']:\n", " tuples.append((p,q))\n", " \n", "\n", "cols=pd.MultiIndex.from_tuples(tuples)\n", "\n", "avg = pd.DataFrame(lines, index=index, columns=cols)\n", "#avg.sort_values(by=('avg','B(P=2) [GB/s]'), inplace=True)\n", "#avg.loc[:,('cg','size')]=(avg['cg']['P=3']/1e3*9/34\n", "# )*avg['cg']['lat 4 [$\\mu$s]']\n", "pd.set_option('display.float_format', lambda x: '%.0f' % x)\n", "#avg.loc['i5',('avg','$T_{lat}(4)$ [$\\mu$s]')]='n/a'\n", "#avg.loc['gtx1060',('avg','$T_{lat}(4)$ [$\\mu$s]')]='n/a'\n", "filename='avg.tex'\n", "with open(filename, 'wb') as f:\n", " f.write(bytes(avg['avg'].to_latex(\n", " escape=False,column_format='lp{1.5cm}p{1.5cm}p{1.5cm}p{1.5cm}p{1.2cm}p{1.2cm}',\n", " bold_rows=True),'UTF-8'))\n", "avg['avg']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the following we compare the predicted with the measured values for the Arakawa and CG algorithm" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": false }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#now compute and plot the prediction \n", "del latencies['avg']\n", "del memops['avg']\n", "for n in ['arakawa','cg'] :#latencies.keys():\n", " fig,ax=plt.subplots(1,1,figsize=(6,3.7),dpi= 80, facecolor='w', edgecolor='k')\n", " xs = np.array([0.1,1000])\n", " ys = np.array([1.0,1.0])\n", " for frac in [1.0,4/3,8/4]:\n", " plt.plot(xs,frac*ys,ls='--',color=cm.Greys(0.8/frac))\n", " plt.plot(xs,1/frac*ys,ls='--',color=cm.Greys(0.8/frac))\n", " \n", " for f, v in files.items() :#{'knl_mpi2':('knl',2)}.items():\n", " df=pd.read_csv('benchmark_'+f+'.csv', delimiter=' ')\n", " #add size and get rid of non-relevant columns\n", " df.insert(0,'size', 8*df['n']*df['n']*df['Nx']*df['Ny']/1e6/v[1])\n", " dfr = df[['n','Nx','Ny','size']+list(memops.keys())]\n", " #compute mean and standard derivation of 'same' groups \n", " dfr=dfr.groupby(['n', 'Nx','Ny','size']).mean()\n", " dfr=dfr.reset_index(level=['n','Nx','Ny','size'])\n", "\n", " dfr['FirstLevel']='measured'\n", " dfr.columns=pd.MultiIndex.from_product([dfr.columns,['measured']])\n", " del dfr['FirstLevel']\n", " \n", " dfr['dxdy_bw'] = dfr.apply( \n", " lambda row: theo.loc[v[0],'dxdy'+str(row['n','measured'].astype(int))+'_bw'], axis=1)\n", "\n", " dxdystring = 'dxdy_lat_shared'\n", " dotstring = 'dot_lat_shared'\n", " if v[1] > 1 : \n", " dxdystring = 'dxdy_lat_dist'\n", " dotstrint = 'dot_lat_dist'\n", " for q,l in latencies.items():\n", " m = memops[q]\n", " dfr.loc[:,(q,'predicted')] = (\n", " ( l[0]*theo.loc[v[0],'axpby_lat_shared']+\n", " l[1]*theo.loc[v[0],dotstring] +\n", " l[2]*theo.loc[v[0],dxdystring]\n", " )*1e-6+\n", " (m[0]/theo.loc[v[0],'axpby_bw'] + m[1]/theo.loc[v[0],'dot_bw'] + m[2]/dfr['dxdy_bw',''])\n", " *dfr[('size','measured')]/1000)\n", " dfr.loc[:,(q,'meas/pred')]=dfr[(q,'measured')]/dfr[(q,'predicted')]\n", "\n", " toPlot = dfr[n].join(dfr[('size')],rsuffix='_size')\n", " 
toPlot.plot(kind='scatter',ax=ax,color=arch[v[0]][0](intens[v[1]]),edgecolors='k',\n", " x='measured_size', y='meas/pred',label=v[0],s=64)\n", " handles, labels = plt.gca().get_legend_handles_labels()\n", " handles = handles[0:number]; labels = labels[0:number]\n", "\n", " #plt.plot(xs,ys)\n", "\n", " \n", " plt.legend(handles, labels, loc='upper right',ncol=2,\n", " scatterpoints=1,fontsize='medium',framealpha=0.5)\n", " plt.xscale('log')\n", " plt.xlim(xs[0],xs[1])\n", " #plt.xlabel('measured time in s')\n", " plt.xlabel('array size [MB] / # of nodes')\n", " plt.ylabel('measured / predicted time')\n", " plt.yscale('log', subsy=[0])#log scale, turn minor ticks off\n", " plt.ylim(1/3,3)\n", " plt.yticks([1/3,0.5,0.75,1,1.33,2,3],['1/3','1/2','3/4',1,'4/3',2,3])\n", " #plt.title(n)\n", " plt.savefig(n+'.pdf',bbox_inches='tight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Observations\n", "- plots show deviations from the predicted timing\n", "- the goal of the discussion is to prove that the time formula is correct because then performance can be discussed analytically and makes \"scaling\" plots somewhat obsolete\n", "- there seems to be a systematic overestimation of the knl MPI scaling (is this the fault of the implemenation? What is wrong there?)\n", "- skl and i5 for small sizes are mostly faster than predicted because cache effects are not included in the model\n", "- there seems to a drop in efficiency in GPUs when the problem size nears the full size of the GPU memory (last single node points in GTX, P100 and V100)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 }