:"=&./%$#()0+&23$3**+*.,'O& !"#$%&'#()*+,&-#$&./%$#()0./1& 23$3**+*&0#/0+2%,& T"+&'#()*+,& S#()*+C>3,+(&322$#30"& S=&0#/%+8%& ?#%+&#/&Z%)%#$.


Transcription

1 :"=&./%$#()0+&23$3**+*.,'O&!"#$%&'#()*+,&-#$&./%$#()0./1& 23$3**+*&0#/0+2%,& P%Q,&"+$+& 99&GEDR&,3=,&=#)&,"#)*(&!%)(+/%,&N3/%&%#&,++&.%& 435.(&6)/(+& 7/#8&9#**+1+& S#()*+C>3,+(&322$#30"& P%Q,&"3$(&%#&$+5.,+&0)$$.0)*)'&#$&+/<$+&0#)$,+K& >)%&$+*3<5+*=&+3,=&%#&03$5+&#)%&3&0#)2*+&#-& (3=,& S#()*+,&3$+&,+*-C0#/%3./+(&GCR&(3=&)/.%,&%"3%& L%&N.%"./&+8.,</1&0#)$,+,& P/0*)(+&0#)$,+&'3%+$.3*,&3/(&>30;1$#)/(&,)22#$%& T"+&'#()*+,& S3/(+*>$#%&,+%&N.%"&U2+/SV&!"#$%&+8+$0.,+,&N.%"&9A4J& 9"32+*&./&J*1#$.%"',& S3%+$.3*,&353.*3>*+W& &"X2WYY-30)*%=I;/#8I+()Y(>)/(+Y%+30"./1Y99!9CS:DR& S=&0#/%+8%&?#%+&#/&Z%)%#$.3*[& T$.'+,%+$&03*+/(3$&!%)(+/%,&%3;+&R&0*3,,+,&3&%+$'K&N+&%+30"&G& 9#5+$&`D&,+'+,%+$&#-&'3%+$.3*&./%#&DE&N++;,& 9*3,,+,&N.%"&DECGE& S30&*3>,K&b./)8&,+$5+$,&

2 Overview

Module 1: Mandelbrot set with OpenMP
- Built around a program that generates the Mandelbrot set as a .bmp file
- OpenMP: threading library built into most C compilers
- Used several ways as part of a discussion of threads and concurrency in an OS course

Setting all the pixels:

  for (int i = 0; i < numCols; i++) {
    for (int j = 0; j < numRows; j++) {
      x = ((double)i / numCols - 0.5) * 2;
      y = ((double)j / numRows - 0.5) * 2;
      color = mandelbrot(x, y);
      pixels[i][j].rgbtBlue = pixels[i][j].rgbtGreen =
          pixels[i][j].rgbtRed = color;
    }
  }

OpenMP
- Old standard (1st in 1997), but still widely used
- Widely supported (gcc, Visual Studio, Intel, ...)
- Requires -fopenmp flag in gcc

Parallel for loop:

  #pragma omp parallel for
  for (int i = 0; i < n; i++) ...

[Figure: prior code runs in a single thread; the loop iterations are divided among the threads; subsequent code runs in a single thread again]

Applying parallel for: place the pragma directly above the outer loop of the pixel-setting code.

3 Resulting output (closeup)

Privatizing local variables:

  #pragma omp parallel for private(x,y,color)
  for (int i = 0; i < numCols; i++) {
    for (int j = 0; j < numRows; j++) {
      x = ((double)i / numCols - 0.5) * 2;
      y = ((double)j / numRows - 0.5) * 2;
      color = mandelbrot(x, y);
      pixels[i][j].rgbtBlue = pixels[i][j].rgbtGreen =
          pixels[i][j].rgbtRed = color;
    }
  }

How well does it parallelize?
- Original (serial) running time: 2.39 seconds
- Parallel running time: 1.43 seconds
- Speedup = Serial time / Parallel time = 1.67
(On my MacBook Pro, with Intel Core i5 processor)

Parallelizing the inner loop instead. Outer-loop version:

  #pragma omp parallel for private(x,y,color)
  for (int i = 0; i < numCols; i++) {
    for (int j = 0; j < numRows; j++) {
      ...

Time: 1.43 sec

Inner-loop version:

  for (int i = 0; i < numCols; i++) {
    #pragma omp parallel for private(x,y,color)
    for (int j = 0; j < numRows; j++) {
      ...

Time: 1.35 sec

Inside mandelbrot function:

  double mandelbrot(double x, double y) {
    int maxIteration = 1000;
    int iteration = 0;
    double re = 0, im = 0;
    while((re*re + im*im <= 4) && (iteration < maxIteration)) {
      double temp = re*re - im*im + x;
      im = 2*re*im + y;
      re = temp;
      iteration++;
    }
    if(iteration >= maxIteration) return 255; else return 0;
  }

(A self-contained, timeable version of this program follows this page.)
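As referenced above, the slide's pieces combine into a program one can actually time. A minimal sketch, assuming a grayscale pixel buffer and an arbitrary image size as stand-ins for the slide's .bmp writer; compile with gcc -fopenmp:

  #include <omp.h>
  #include <stdio.h>

  #define COLS 2048   /* image size: my own choice for this sketch */
  #define ROWS 2048

  static unsigned char pixels[COLS][ROWS];

  /* escape-time function from the slide */
  static int mandelbrot(double x, double y) {
      int maxIteration = 1000, iteration = 0;
      double re = 0, im = 0;
      while ((re*re + im*im <= 4) && (iteration < maxIteration)) {
          double temp = re*re - im*im + x;
          im = 2*re*im + y;
          re = temp;
          iteration++;
      }
      return (iteration >= maxIteration) ? 255 : 0;
  }

  int main(void) {
      double x, y;
      int color;
      double start = omp_get_wtime();

      /* the slide's pragma: parallelize the outer loop, giving each
         thread private copies of the loop-carried temporaries */
      #pragma omp parallel for private(x, y, color)
      for (int i = 0; i < COLS; i++) {
          for (int j = 0; j < ROWS; j++) {
              x = ((double)i / COLS - 0.5) * 2;
              y = ((double)j / ROWS - 0.5) * 2;
              color = mandelbrot(x, y);
              pixels[i][j] = (unsigned char)color;
          }
      }
      printf("time: %f sec\n", omp_get_wtime() - start);
      return 0;
  }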

4 Inside mandelbrot function (continued)
- Takes longer for points in the set

Swapping loop order:

  #pragma omp parallel for private(x,y,color)
  for (int j = 0; j < numRows; j++) {
    for (int i = 0; i < numCols; i++) {
      ...

Time: 1.26 sec (inner-loop version: 1.35 sec)

Dynamic scheduling:

  #pragma omp parallel for ... schedule(dynamic)
  for (int i = 0; i < numCols; i++) {
    for (int j = 0; j < numRows; j++) {
      ...

Time: 0.98 sec

Summary of versions:
  Serial version                      2.39 sec
  Incorrect parallel version (race)
  Parallel outer loop                 1.43 sec
  Parallel inner loop                 1.35 sec
  Swap loop order                     1.26 sec
  Dynamic scheduling                  0.98 sec

Alternative: Pthread library
- Can do (most of) lesson using POSIX-standard threads (pthreads)

Classroom hints
- Can't have too many students sharing the same machine
[Figure: the main thread runs the prior code, calls pthread_create(..., func_ptr, arg) to start a child thread running void* func(void* arg) {...}, then pthread_join(..., &retval) to wait for it before the subsequent code]
- Go over concepts before and/or after showing code (a minimal pthreads sketch follows this page)
- Not easy to do dynamic scheduling
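The fork-join picture above maps directly onto three pthreads calls. A minimal sketch; the work function body and the argument are my own illustration. Compile with gcc -pthread:

  #include <pthread.h>
  #include <stdio.h>

  /* child thread body: matches the diagram's void* func(void* arg) {...} */
  void* func(void* arg) {
      int* n = (int*)arg;
      *n = *n * 2;          /* some work on the argument */
      return arg;           /* becomes retval in pthread_join */
  }

  int main(void) {
      pthread_t child;
      int value = 21;
      void* retval;

      /* ... prior code runs in the main thread ... */
      pthread_create(&child, NULL, func, &value);  /* fork: start child */
      /* ... main thread can keep working here ... */
      pthread_join(child, &retval);                /* join: wait for child */
      printf("%d\n", *(int*)retval);               /* prints 42 */
      return 0;
  }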

5 How I've used it
- Previous lecture introducing threads
- Lab using pthreads (Mandelbrot or other example)
- Lecture on lab and using Mandelbrot (OpenMP) to illustrate concepts
- Definite improvement over doing the same material with Pthreads in lecture

OpenMP or Pthreads first?
- OpenMP first: give high-level concepts before lots of syntax; want to spend most of the time on concepts, so do it first
- Pthreads first: demonstrate execution model before showing "magic"; could use other examples for simplicity

"TODO" list
- Which order for Pthreads vs. OpenMP? Join my experiment!
- More colorful versions of Mandelbrot
- Interactive image generation
- Other examples
- Please share!

Module 2: Short exercises with CUDA
(Part of Bunde, Karavanic, Mache, Mitchell, "Adding GPU computing to Computer Organization courses", EduPar 2013)

What is CUDA?
- "Compute Unified Device Architecture"
- NVIDIA's architecture and language for general-purpose programming on graphics cards
- Really a library and extension of C (and other languages)

Why CUDA?
- Easy to get the hardware: my laptop came with a 48-core card; department has a 448-core card (< $600); NVIDIA willing to donate equipment
- Exciting for students: they have cards and want to use them; easy to see performance benefits

6 Game of Life (GoL)
- Simulation with cells updating in lock step
- Each turn, count living neighbors
- Cell alive next turn if: alive this time and has 2 living neighbors, or has 3 living neighbors (see the sketch at the end of this page)

Module constraints
- Brief time: course has lots of other goals; one 70-minute lab and parts of 2 lectures
- Relatively inexperienced students: some just out of CS 2; many didn't know C or Unix programming

Unit goals
- Idea of parallelism
- Benefits and costs of system heterogeneity
- Data movement and NUMA
- Generally, the effect of architecture on program performance

Approach taken
- Introductory lecture: GPUs (massively parallel, outside CPU), kernels, SIMD
- Lab illustrating features of CUDA architecture: data transfer time, thread divergence, memory types (next time)
- "Lessons learned" lecture: reiterate architecture; demonstrate speedup with Game of Life; talk about use in Top 500 systems

CUDA programming model
[Figure: the CPU "host" sends data and kernel invocations to the GPU "device", which runs the code (the "kernel")]
- GPU has many cores, organized into groups
- 32-thread warps execute the same instruction

Data transfer:

  //allocate memory on the device:
  cudaMalloc((void**) &a_dev, N*sizeof(int));
  ...
  //transfer array a to GPU:
  cudaMemcpy(a_dev, a, N*sizeof(int), cudaMemcpyHostToDevice);
  ...
  //transfer array res back from GPU:
  cudaMemcpy(res, res_dev, N*sizeof(int), cudaMemcpyDeviceToHost);

(The last argument is a direction indicator.)
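The life-or-death rule at the top of this page is compact enough to state directly in code. A minimal sketch; the function is my own distillation of the stated rule, not code from the slides:

  /* Game of Life update rule as given above: a cell lives next turn
     if it is alive now with exactly 2 living neighbors, or if it has
     exactly 3 living neighbors (whether or not it is alive now). */
  int next_state(int alive_now, int living_neighbors) {
      return (alive_now && living_neighbors == 2) || living_neighbors == 3;
  }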

7 Invoking the kernel:

  int threads = 512;                         //# threads per block
  int blocks = (N + threads - 1) / threads;  //# blocks (N/threads rounded up)
  kernel<<<blocks,threads>>>(res_dev, a_dev, b_dev);

- Blocks are an organizational unit for threads
- Performance is very dependent on #blocks and #threads
- One rule: #threads should be a multiple of 32

Kernel itself:

  __global__ void kernel(int* res, int* a, int* b) {
    //function that runs on GPU to do the addition
    //sets res[i] = a[i] + b[i]; each thread is responsible for one value of i
    int thread_id = threadIdx.x + blockIdx.x*blockDim.x;
    if(thread_id < N) {   //since #threads potentially > array size
      res[thread_id] = a[thread_id] + b[thread_id];
    }
  }

(A complete program assembling these pieces follows this page.)

Lab activity 1: Data transfer time
Students compare the running time of:
- a working CUDA program to add a pair of vectors
- a program with data transfer, but no arithmetic
- a program that does arithmetic and only 1 direction of data transfer
Observe that data transfer is the bulk of the time.

Lab activity 2: Thread divergence
Compare two apparently equivalent kernels:

  __global__ void kernel_1(int *a) {
      int tid = threadIdx.x;
      int cell = tid % 32;
      a[cell]++;
  }

  __global__ void kernel_2(int *a) {
      int cell = threadIdx.x % 32;
      switch(cell) {
      case 0: a[0]++; break;
      case 1: a[1]++; break;
      ...   //continues to case 7
      default: a[cell]++;
      }
  }

- Observe vastly different running times
- Threads in a warp devote time to 1 instruction per clock cycle even if not all run it (others nop)
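Assembling the data-transfer, launch, and kernel snippets above gives one runnable program. A minimal sketch with error checking omitted; the array length and initial values are my own choices. Compile with nvcc; the lab's timing variants just delete the kernel launch or one of the cudaMemcpy calls:

  #include <stdio.h>
  #include <stdlib.h>

  #define N (1 << 20)   /* array length: an assumption for this sketch */

  __global__ void kernel(int* res, int* a, int* b) {
      int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
      if (thread_id < N)   /* #threads potentially > array size */
          res[thread_id] = a[thread_id] + b[thread_id];
  }

  int main(void) {
      int *a = (int*)malloc(N * sizeof(int));
      int *b = (int*)malloc(N * sizeof(int));
      int *res = (int*)malloc(N * sizeof(int));
      for (int i = 0; i < N; i++) { a[i] = i; b[i] = 2 * i; }

      /* allocate memory on the device */
      int *a_dev, *b_dev, *res_dev;
      cudaMalloc((void**) &a_dev, N * sizeof(int));
      cudaMalloc((void**) &b_dev, N * sizeof(int));
      cudaMalloc((void**) &res_dev, N * sizeof(int));

      /* transfer input arrays to the GPU */
      cudaMemcpy(a_dev, a, N * sizeof(int), cudaMemcpyHostToDevice);
      cudaMemcpy(b_dev, b, N * sizeof(int), cudaMemcpyHostToDevice);

      int threads = 512;                         /* threads per block */
      int blocks = (N + threads - 1) / threads;  /* rounded up */
      kernel<<<blocks,threads>>>(res_dev, a_dev, b_dev);

      /* transfer result back from the GPU */
      cudaMemcpy(res, res_dev, N * sizeof(int), cudaMemcpyDeviceToHost);
      printf("res[10] = %d\n", res[10]);         /* expect 30 */

      cudaFree(a_dev); cudaFree(b_dev); cudaFree(res_dev);
      free(a); free(b); free(res);
      return 0;
  }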

8 Lab activity 3: Memory types
(Based on Chap. 6 of [Sanders and Kandrot, "CUDA by Example", 2011])
- "Ray tracing" that tests intersections with an array of objects in the same order
- Speeds up with switch to constant memory: values are transmitted to an entire half warp; allows caching
- Performance is worse if threads access objects in different orders

Survey results: Good news
Asked to describe CPU/GPU interaction:
- 9 of 11 mention both data movement and invoking the kernel
- another just mentions invoking the kernel
Asked to explain the experiment illustrating data movement cost:
- 9 of 12 say comparing computation and communication cost
- 2 more talk about comparing different operations

Survey results: Not so good news
Asked to explain the experiment illustrating thread divergence:
- 2 of 9 were correct
- 2 more seemed to understand, but misused terminology
- 3 more remembered the performance effect, but said nothing about the cause

Conclusions
- Unit was mostly successful, but thread divergence is a harder concept
- Students interested in CUDA and about half the class requested more of it
- Bottom line: a brief introduction is possible even to students with limited background

9 Classroom hints
- Need graphics card on local machine (at least ...)

Alternate models
- Lewis and Clark, Portland State: lecture introducing CUDA; lab/HW using it to speed up Game of Life
- Daniel Ernst: longer unit with both OpenMP and CUDA; general emphasis on tuning data layout and access pattern

"TODO" list
- New example for types of memory
- Explain thread divergence better
- Middle ground: adding programming to mine or conceptual material to L&C version
- Porting code to other base languages (Java)
- Other programming example (?)
- Please share!

Module 3a: Chapel in Algorithms
(Based on experiences of Kyle Burke and our joint tutorial at the SC Ed Program, 2012)

What is Chapel?
- Parallel programming language developed with programmer productivity in mind
- Originally Cray's project under DARPA's High Productivity Computing Systems program
- Suitable for shared- or distributed-memory systems
- Installs easily on Linux and Mac OS; use Cygwin to install on Windows
- Provides high-level operations
- Designed with parallelism in mind

10 Flexible syntax
- Supports scripting-like programs:
    writeln("Hello World!");
- Also provides objects and modules

Provides high-level operations
- B = f(A);   //applies f elementwise for any function f (see the sketch after this page)
- Includes built-in operators:
    C = A + 1;
    D = A + B;
    E = A * B;
    ...

Designed with parallelism in mind
- Operations on previous slides parallelized automatically
- Create asynchronous task w/ single keyword
- Built-in synchronization for tasks and variables

"Hello World" in Chapel
- Create file hello.chpl containing:
    writeln("Hello World!");
- Compile with: chpl -o hello hello.chpl
- Run with: ./hello

Variables and Constants
Variable declaration format:

  [config] var/const identifier : type;

  var x : int;
  const pi : real = 3.14;
  config const numSides : int = 4;

Serial Control Structures
- if statements, while loops, and do-while loops are all pretty standard
- Difference: statement bodies must either use braces or an extra keyword:
    if(x == 5) then y = 3; else y = 1;
    while(x < 5) do x++;
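As referenced above, Chapel's promotion makes the one-line elementwise claim concrete. A minimal runnable sketch; the function f and the sample values are my own illustration, not from the slides:

  // promotion: calling a scalar function on an array applies it
  // elementwise, and the elementwise calls may run in parallel
  proc f(x : int) : int {
    return 2*x + 1;
  }

  var A : [1..5] int = [1, 2, 3, 4, 5];
  var B = f(A);    // B is [3, 5, 7, 9, 11]
  writeln(B);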

11 Example: Reading until eof

  var x : int;
  while stdin.read(x) {
    writeln("Read value ", x);
  }

Procedures/Functions

  proc addOne(in val : int, inout val2 : int) : int {
    val2 = val + 1;
    return val + 1;
  }

(Arguments have the form name : arg_type; omit the type for a generic function. Omit the return type if none or if it can be inferred.)

Arrays
Indices determined by a range:

  var A : [1..5] int;          //declares A as array of 5 ints
  var B : [-3..3] int;         //has indices -3 thru 3
  var C : [1..10, 1..10] int;  //multi-dimensional array

Accessing individual cells:

  A[1] = A[2] + 23;

Ranges also used in for loops:

  for i in 1..10 do statement;

  for i in 1..10 {
    loop body
  }

Can also use an array or anything iterable.

Parallel Loops
Two kinds of parallel loops:

  forall i in 1..10 do statement;     //omit do w/ braces
  coforall i in 1..10 do statement;

- forall creates 1 task per processing unit
- coforall creates 1 per loop iteration; used when each iteration requires lots of work and/or they must be done in parallel

Asynchronous Tasks
Easy asynchronous task creation:

  begin statement;

Easy fork-join parallelism:

  cobegin {
    statement1;
    statement2;
    ...
  }   //creates task per statement and waits here

12 Sync blocks
sync blocks wait for tasks created inside them. These are equivalent:

  sync {
    begin statement1;
    begin statement2;
    ...
  }

  cobegin {
    statement1;
    statement2;
    ...
  }

Sync variables
sync variables have a value and an empty/full state: they store <= 1 value, and operations that can't proceed block.
Can be used as a lock (a complete example follows this page):

  var lock : sync int;
  lock = 1;             //acquires lock
  ...
  var temp = lock;      //releases the lock

Analysis of Algorithms
Chapel material:
- Assign basic tutorial
- Teach forall & cobegin (also algorithmic notation)
Projects:
- Partition integers
- BubbleSort
- MergeSort
- Nearest Neighbors

Algorithms Project: List Partition
- Partition a list into two equal-summing halves
- Brute-force algorithm (don't know P vs. NP yet)
- Questions: What are the longest lists you can test? What about in parallel?
- Trick: enumerate possibilities and use forall

Algorithms Project: BubbleSort
- Instead of left-to-right, test all pairs in two steps!

Algorithms Project: MergeSort
- Parallel divide-and-conquer: use cobegin
- Elegant division: split the Domain
- Speedup not as noticeable: example of expensive parallel overhead; two nested forall loops (in sequence) inside a for loop
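As referenced above, the lock idiom drops into a complete program. A minimal sketch protecting a shared counter; the counter and the task count are my own illustration, and it uses the slide's 2013-era implicit sync read/write style (recent Chapel versions spell these writeEF/readFE):

  var lock : sync int;     // starts empty
  var counter = 0;

  coforall tid in 1..4 {
    lock = 1;              // write fills the sync var: acquires the lock
                           // (another write blocks until it is emptied)
    counter += 1;          // critical section
    var temp = lock;       // read empties it: releases the lock
  }
  writeln(counter);        // always 4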

13 Two algorithms:
- brute force (use a forall like BubbleSort)
- Divide-and-Conquer (use cobegin): a bit tricky
Value of parallelism: much easier to program the brute-force method

Algorithms Takeaway
- Learning curve of Chapel is so low, students can start using parallelism very quickly

Module 3b: Reductions
(Reduction framework from Lin and Snyder, Principles of Parallel Programming, 2009)

Summing values in an array
[Figure: tree-based sum of the array 2 1 4 3 1 3 0 2: pairwise sums 3 7 4 2, then 10 and 6, then the total 16]

14 Summing values in an array (continued)
[Figure: the same tree over (value, index) pairs: leaves (2,0) (1,1) (4,2) (3,3) (1,4) (3,5) (0,6) (2,7); each combining step keeps the pair with the larger value, ending at (4,2)]

Parts of a reduction
- Tally: intermediate state of computation
- Combine: combine 2 tallies
- Reduce-gen: generate result from tally
- Init: create "empty" tally
- Accumulate: add 1 value to tally

15 Parts of a reduction (example: index of the maximum element)
- Tally: intermediate state of computation: here a (value, index) pair
- Combine: combine 2 tallies: take whichever pair has the larger value
- Reduce-gen: generate result from tally: return the index
- Init: create "empty" tally
- Accumulate: add 1 value to tally

Two issues
- Need to convert initial values into tallies
- May want a separate operation for values local to a single processor
[Figure: an "empty" tally vs. a tally of several values]

Parallel reduction framework
[Figure: each processor runs i (Init: create "empty" tally) followed by a chain of a steps (Accumulate: add 1 value to tally) over its values; partial tallies such as 12 are merged pairwise by c (Combine: combine 2 tallies), and rg (Reduce gen: generate result from tally) produces the final result, 36 in the example]

Defining reductions
- Tally: intermediate state of computation
- Combine: combine 2 tallies
- Reduce-gen: generate result from tally
- Init: create "empty" tally
- Accumulate: add 1 value to tally
Sample problems: + (a worked sketch of these operations follows this page)
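The five operations above are easiest to see in code. A small serial sketch in C for the running example (index of the maximum element), reusing the array from the figures; the function names are mine:

  #include <stdio.h>
  #include <limits.h>

  typedef struct { int value; int index; } Tally;  /* tally: (value, index) */

  Tally init(void) {                        /* Init: create "empty" tally */
      Tally t = { INT_MIN, -1 };
      return t;
  }
  Tally accumulate(Tally t, int value, int index) {  /* add 1 value to tally */
      if (value > t.value) { t.value = value; t.index = index; }
      return t;
  }
  Tally combine(Tally a, Tally b) {         /* keep pair with larger value */
      return (a.value >= b.value) ? a : b;
  }
  int reduce_gen(Tally t) {                 /* generate result: the index */
      return t.index;
  }

  int main(void) {
      int a[8] = {2, 1, 4, 3, 1, 3, 0, 2};  /* the array from the figures */
      Tally left = init(), right = init();  /* two "processors" */
      for (int i = 0; i < 4; i++) left  = accumulate(left,  a[i], i);
      for (int i = 4; i < 8; i++) right = accumulate(right, a[i], i);
      printf("index of max = %d\n", reduce_gen(combine(left, right)));  /* 2 */
      return 0;
  }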

16 Defining reductions (continued)
- Same five parts: Tally, Combine, Reduce-gen, Init, Accumulate
- Sample problems: +, histogram, max, 2nd largest, length of longest run

Can go beyond these...
- indexOf (find index of first occurrence)
- sequence alignment [Srinivas Aluru]
- n-body problem [Srinivas Aluru]

Relationship to dynamic programming
- Challenges in dynamic programming: What are the table entries? How to compute a table entry from previous entries?
- Challenges in reduction framework: What is the tally? How to compute new tallies from previous ones?

17 Reductions in Chapel
Express a reduction operation in a single line:

  var s = + reduce A;   //A is array, s gets sum

- Supports +, *, ^ (xor), &&, ||, max, min, ...
- minloc and maxloc return a tuple with the value and its index:

  var (val, loc) = minloc reduce A;

Reduction example
Can also use reduce on a function plus a range.
Ex: Approximate pi/2 using the integral of sqrt(1 - x^2) from -1 to 1:

  config const numRect = 10000000;
  const width = 2.0 / numRect;      //rectangle width
  const baseX = -1 - width/2;
  const halfPI = + reduce [i in 1..numRect]
      (width * sqrt(1.0 - (baseX + i*width)**2));

Defining a custom reduction
Create an object to represent the intermediate state. Must support:
- accumulate: adds a single element to the state
- combine: adds another intermediate state
- generate: converts the state object into the final output

Classes in Chapel

  class Circle {
    var radius : real;
    proc area() : real {
      return 3.14 * radius * radius;
    }
  }

  var c1, c2 : Circle;   //creates 2 Circle references
  c1 = new Circle(10);   /* uses system-supplied constructor to create
                            a Circle object and makes c1 refer to it */
  c2 = c1;               //makes c2 refer to the same object
  delete c1;             //memory must be manually freed

Inheritance

  class Circle : Shape {   //Circle inherits from Shape
    ...
  }

  var s : Shape;
  s = new Circle(10.0);    //automatic cast to base class
  var area = s.area();     /* call recipient determined by
                              object's dynamic type */

Example "custom" reduction (a usage sketch follows this page):

  class MyMin : ReduceScanOp {   //finds min element (equiv. to built-in "min")
    type eltType;                          //type of elements
    var soFar : eltType = max(eltType);    //minimum so far
    proc accumulate(val : eltType) {
      if(val < soFar) { soFar = val; }
    }
    proc combine(other : MyMin) {
      if(other.soFar < soFar) { soFar = other.soFar; }
    }
    proc generate() { return soFar; }
  }
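For completeness, a short usage sketch of the class above; the array values are mine, and I am assuming the identifier-based reduce syntax for user-defined reductions:

  var A : [1..5] int = [7, 3, 9, 1, 4];
  var smallest = MyMin reduce A;   // instantiates MyMin with eltType = int
  writeln(smallest);               // 1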

18 And that's not all... (scans)
- Instead of just getting the overall value, also compute the value for every prefix
[Figure: an array A and the array sum of its prefix sums]
- Useful for answering queries like "What is the sum of elements 2 thru 7?" = sum[7] - sum[1]

Computing the scan in parallel
- Upward pass to compute reduction
- Downward pass to also compute scan
[Figure: the two tree passes over input a a a a a a a a; a second version labels the downward pass with the framework functions, i = init, a = accumulate]
(A serial sketch of the scan and the range query follows this page.)

Many options for module 3
- Using Chapel for ease of parallelization
- Reductions on paper (defining and/or using)
- Also implementing reductions in Chapel
- Side question: Where to put it?
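As referenced above, a serial version of the scan and the range query as a small C sketch; the array values are reused from the reduction figures, and in parallel the same result comes from the two tree passes:

  #include <stdio.h>

  int main(void) {
      /* serial inclusive scan (prefix sums): sum[i] = a[0] + ... + a[i] */
      int a[8] = {2, 1, 4, 3, 1, 3, 0, 2};
      int sum[8];
      sum[0] = a[0];
      for (int i = 1; i < 8; i++)
          sum[i] = sum[i-1] + a[i];

      /* range query from the slide: sum of elements 2 thru 7 */
      printf("%d\n", sum[7] - sum[1]);   /* 16 - 3 = 13 */
      return 0;
  }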

19 Caveats
Still in development:
- Reductions serialized on multicore (as of 1.6)
- Error messages thin
- New versions every 6 months, some big changes
- Not many libraries
- No development environment; command-line compilation in Linux

"TODO" list
- Notes, slides, assignments, etc.
- Evidence on tie to dynamic programming
- Sample adoption strategies
- More applications of reductions and scans
- Please share!

Other resources
- CS in Parallel: http://csinparallel.org
- Dan Grossman's CS 2 notes
- http://www.cs.gsu.edu/~tcpp/curriculum/

Thanks for your time
dbunde@knox.edu
http://faculty.knox.edu/dbunde/teaching/CCSC-MW13
