:"=&./%$#()0+&23$3**+*.,'O& !"#$%&'#()*+,&-#$&./%$#()0./1& 23$3**+*&0#/0+2%,& T"+&'#()*+,& S#()*+C>3,+(&322$#30"& S=&0#/%+8%& ?#%+&#/&Z%)%#$.
|
|
- Aileen Bailey
- 5 years ago
- Views:
Transcription
1 :"=&./%$#()0+&23$3**+*.,'O&!"#$%&'#()*+,&-#$&./%$#()0./1& 23$3**+*&0#/0+2%,& P%Q,&"+$+& 99&GEDR&,3=,&=#)&,"#)*(&!%)(+/%,&N3/%&%#&,++&.%& 435.(&6)/(+& 7/#8&9#**+1+& S#()*+C>3,+(&322$#30"& P%Q,&"3$(&%#&$+5.,+&0)$$.0)*)'&#$&+/<$+&0#)$,+K& >)%&$+*3<5+*=&+3,=&%#&03$5+&#)%&3&0#)2*+&#-& (3=,& S#()*+,&3$+&,+*-C0#/%3./+(&GCR&(3=&)/.%,&%"3%& L%&N.%"./&+8.,</1&0#)$,+,& P/0*)(+&0#)$,+&'3%+$.3*,&3/(&>30;1$#)/(&,)22#$%& T"+&'#()*+,& S3/(+*>$#%&,+%&N.%"&U2+/SV&!"#$%&+8+$0.,+,&N.%"&9A4J& 9"32+*&./&J*1#$.%"',& S3%+$.3*,&353.*3>*+W& &"X2WYY-30)*%=I;/#8I+()Y(>)/(+Y%+30"./1Y99!9CS:DR& S=&0#/%+8%&?#%+&#/&Z%)%#$.3*[& T$.'+,%+$&03*+/(3$&!%)(+/%,&%3;+&R&0*3,,+,&3&%+$'K&N+&%+30"&G& 9#5+$&`D&,+'+,%+$&#-&'3%+$.3*&./%#&DE&N++;,& 9*3,,+,&N.%"&DECGE& S30&*3>,K&b./)8&,+$5+$,&
2 U5+$5.+N& S#()*+&D& S3/(+*>$#%&,+%&N.%"&U2+/SV& 6).*%&3$#)/(&2$#1$3'&%"3%& &&1+/+$3%+,&S3/(+*>$#%&,+%& &&3,&I>'2&L*+& U2+/SV& %"$+3(./1&*.>$3$=&>).*%&./%#&'#,%&9&0#'2.*+$,& A,+(&,+5+$3*&N3=,&3,&23$%&#-&(.,0),,.#/&#-& %"$+3(,&3/(&0#/0)$$+/0=&./&U!&0#)$,+&!+c/1&3**&%"+&2.8+*,& -#$&^./%&.&d&ea&.&e&/)'9#*,a&.ff_&g& &-#$&^./%&]&d&ea&]&e&/)'h#n,a&]ff_&g& && &8&d&^^(#)>*+_.&Y&/)'9#*,&CEIi_&j&Ga& && &=&d&^^(#)>*+_]&y&/)'h#n,&ceii_&j&ga& && &0#*#$&d&'3/(+*>$#%^8K=_a& && &2.8+*,k.lk]lI$1>%6*)+&d&2.8+*,k.lk]lI$1>%m$++/&d&& && & &2.8+*,k.lk]lI$1>%h+(&d&0#*#$a& & U2+/SV& U*(&,%3/(3$(&^D,% &./&DHH\_K&>)%&,<**&N.(+*=&),+(& :.(+*=&,)22#$%+(&^100K&o.,)3*&!%)(.#K&P/%+*K&III_& $+p).$+,&q-#2+/'2&m31&./&100& V3$3**+*&-#$&*##2& r2$31'3&#'2&23$3**+*&-#$& -#$^./%&.dda&.eddeea&.ff_&iii& Prior code J22*=./1&23$3**+*&-#$& r2$31'3&#'2&23$3**+*&-#$& -#$&^./%&.&d&ea&.&e&/)'9#*,a&.ff_&g& &-#$&^./%&]&d&ea&]&e&/)'h#n,a&]ff_&g& && &8&d&^^(#)>*+_.&Y&/)'9#*,&CEIi_&j&Ga& && &=&d&^^(#)>*+_]&y&/)'h#n,&ceii_&j&ga& && &0#*#$&d&'3/(+*>$#%^8K=_a& Iterations 1 25 Iterations Iterations Subsequent code Iterations && &2.8+*,k.lk]lI$1>%6*)+&d&2.8+*,k.lk]lI$1>%m$++/&d&& && & &2.8+*,k.lk]lI$1>%h+(&d&0#*#$a& &
3 h+,)*</1&#)%2)%&^0*#,+)2_& V$.53<s./1&*#03*&53$.3>*+,& r2$31'3&#'2&23$3**+*&-#$&2$.53%+^8k=k0#*#$_& -#$&^./%&.&d&ea&.&e&/)'9#*,a&.ff_&g& &-#$&^./%&]&d&ea&]&e&/)'h#n,a&]ff_&g& && &8&d&^^(#)>*+_.&Y&/)'9#*,&CEIi_&j&Ga& && &=&d&^^(#)>*+_]&y&/)'h#n,&ceii_&j&ga& && &0#*#$&d&'3/(+*>$#%^8K=_a& && &2.8+*,k.lk]lI$1>%6*)+&d&2.8+*,k.lk]lI$1>%m$++/&d&& && & &2.8+*,k.lk]lI$1>%h+(&d&0#*#$a& & t#n&n+**&(#+,&.%&23$3**+*.s+o& U$.1./3*&^,+$.3*_&$)//./1&<'+W&GIRH&,+0#/(,& V3$3**+*&$)//./1&<'+W&&&&&&&&&&&&&&&DIFR&,+0#/(,& &&&&!2++()2&d&&&&&&&&&&&&&&&&&&&&&&&&&d&DIu\&!+$.3*&<'+&& V3$3**+*&<'+& ^U/&'=&S30>##;&V$#K&N.%"&P/%+*&9#$+&.i&2$#0+,,#$_& V3$3**+*.s./1&.//+$&*##2& r2$31'3&#'2&23$3**+*&-#$&2$.53%+^8k=k0#*#$_& -#$&^./%&.&d&ea&.&e&/)'9#*,a&.ff_&g& &-#$&^./%&]&d&ea&]&e&/)'h#n,a&]ff_&g& && & & &III& -#$&^./%&.&d&ea&.&e&/)'9#*,a&.ff_&g& &&&&r2$31'3&#'2&23$3**+*&-#$&2$.53%+^8k=k0#*#$_& &-#$&^./%&]&d&ea&]&e&/)'h#n,a&]ff_&g& && & & &III& V3$3**+*.s./1&.//+$&*##2& r2$31'3&#'2&23$3**+*&-#$&2$.53%+^8k=k0#*#$_& -#$&^./%&.&d&ea&.&e&/)'9#*,a&.ff_&g& &-#$&^./%&]&d&ea&]&e&/)'h#n,a&]ff_&g& && & & &III& T.'+W&DIFR&,+0& -#$&^./%&.&d&ea&.&e&/)'9#*,a&.ff_&g& &&&&r2$31'3&#'2&23$3**+*&-#$&2$.53%+^8k=k0#*#$_& &-#$&^./%&]&d&ea&]&e&/)'h#n,a&]ff_&g& && & & &III& T.'+W&DIRi&,+0& P/,.(+&'3/(+*>$#%&-)/0<#/& (#)>*+&'3/(+*>$#%^(#)>*+&8K&(#)>*+&=_&g& &./%&'38P%+$3<#/&d&DEEEa&&./%&.%+$3<#/&d&Ea& &(#)>*+&$+&d&ek&.'&d&ea& &N".*+^^$+j$+&f&.'j.'&ed&F_&vv&^.%+$3<#/&e&'38P%+$3<#/ &g& & & &(#)>*+&%+'2&d&$+j$+&C&.'j.'&f&8a& & & &.'&d&gj$+j.'&f&=a& & & &$+&d&%+'2a& & & &.%+$3<#/ffa& & &.-^.%+$3<#/&wd&'38P%+$3<#/_&$+%)$/&Giia&+*,+&$+%)$/&Ea&
4 P/,.(+&'3/(+*>$#%&-)/0<#/& (#)>*+&'3/(+*>$#%^(#)>*+&8K&(#)>*+&=_&g& &./%&'38P%+$3<#/&d&DEEEa&&./%&.%+$3<#/&d&Ea& &(#)>*+&$+&d&ek&.'&d&ea& &N".*+^^$+j$+&f&.'j.'&ed&F_&vv&^.%+$3<#/&e&'38P%+$3<#/ &g& & & &(#)>*+&%+'2&d&$+j$+&C&.'j.'&f&8a& & & &.'&d&gj$+j.'&f&=a& & & &$+&d&%+'2a& & & &.%+$3<#/ffa& & T3;+,&*#/1+$&-#$&& 2#./%,&./&%"+&,+%&!N322./1&*##2&#$(+$& r2$31'3&#'2&23$3**+*&-#$&2$.53%+^8k=k0#*#$_& -#$&^./%&]&d&ea&]&e&/)'h#n,a&]ff_&g& &-#$&^./%&.&d&ea&.&e&/)'9#*,a&.ff_&g& T.'+W&DIRi&,+0& &.-^.%+$3<#/&wd&'38P%+$3<#/_&$+%)$/&Giia&+*,+&$+%)$/&Ea& 4=/3'.0&,0"+()*./1& r2$31'3&#'2&23$3**+*&-#$&iii&&,0"+()*+^(=/3'.0_& -#$&^./%&.&d&ea&.&e&/)'9#*,a&.ff_&g& &-#$&^./%&]&d&ea&]&e&/)'h#n,a&]ff_&g& && & & &III& T.'+W&EIHx&,+0&!)''3$=&#-&5+$,.#/,&!+$.3*&5+$,.#/&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&GIRH&,+0& P/0#$$+0%&23$3**+*&5+$,.#/&^$30+_& V3$3**+*&#)%+$&*##2&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&DIFR&,+0& V3$3**+*&.//+$&*##2&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&DIRi&,+0&!N32&*##2&#$(+$&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&DIGu&,+0& 4=/3'.0&,0"+()*./1&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&EIHx&,+0& J*%+$/3<5+W&V%"$+3(&*.>$3$=& 93/&(#&^'#,%&#-_&*+,,#/&),./1&VU!PyC,%3/(3$(& %"$+3(,&^2%"$+3(,_& 9*3,,$##'&"./%,& 93/Q%&"35+&%##&'3/=&,%)(+/%,&,"3$./1&,3'+& '30"./+& Prior code Same Thread Subsequent code Child Thread pthread_create(..., func_ptr, arg) pthread_join(..., &retval) void* func(void* arg) {... } m#+$&0#/0+2%,&>+-#$+&3/(y#$&3z+$&,"#n./1& 0#(+&?#%&+3,=&%#&(#&(=/3'.0&,0"+()*./1&
5 t#n&pq5+&),+(&.%& V$+5.#),&*+0%)$+&./%$#()0./1&%"$+3(,& b3>&),./1&2%"$+3(,&^s3/(+*>$#%&#$&#%"+$& +83'2*+_& b+0%)$+&#/&*3>&3/(&),./1&s3/(+*>$#%& ^U2+/SV_&%#&.**),%$3%+&0#/0+2%,& 4+L/.%+&.'2$#5+'+/%+$&(#./1&,3'+&'3%+$.3*& N.%"&V%"$+3(,&./&*+0%)$+& U2+/SV&#$&V%"$+3(,&L$,%O& U2+/SV&L$,%& m.5+&".1"c*+5+*&0#/0+2%,&>+-#$+&*#%,&#-&,=/%38& :3/%&%#&,2+/(&'#,%&#-&<'+&#/&0#/0+2%,&,#&(#&.%& L$,%& V%"$+3(,&L$,%& 4+'#/,%$3%+&+8+0)<#/&'#(+*&>+-#$+&,"#N./1& Z'31.0[& 9#)*(&),+&#%"+$&+83'2*+,&-#$&,.'2*.0.%=& ZTU4U[&*.,%& :".0"&#$(+$&-#$&V%"$+3(,&5,I&U2+/SVO& {#./&'=&+82+$.'+/%w& S#$+&0#*#$-)*&5+$,.#/,&#-&S3/(+*>$#%& P/%+$30<5+&.'31+&1+/+$3<#/& U%"+$&+83'2*+,& V*+3,+&,"3$+w& S#()*+&G&!"#$%&+8+$0.,+,&N.%"&9A4J& V3$%&#-&6)/(+K&73$353/.0K&S30"+K& S.%0"+**K&ZJ((./1&mVA&0#'2)</1&%#& 9#'2)%+$&U$13/.s3<#/&0#)$,+,[K& B()V3$&GEDR&& :"3%&.,&9A4JO& Z9#'2)%+&A/.L+(&4+5.0+&J$0".%+0%)$+[&?oP4PJQ,&3$0".%+0%)$+&3/(&*3/1)31+&-#$& 1+/+$3*C2)$2#,+&2$#1$3''./1&#/&1$32".0,& 03$(,& h+3**=&3&*.>$3$=&3/(&+8%+/,.#/&#-&9&^3/(&#%"+$& *3/1)31+,_& :"=&9A4JO& B3,=&%#&1+%&%"+&"3$(N3$+& S=&*32%#2&03'+&N.%"&3&FxC0#$+&03$(& 4+23$%'+/%&"3,&FFxC0#$+&03$(&^e& uee_&?op4pj&n.**./1&%#&(#/3%+&+p).2'+/%& B80.</1&-#$&,%)(+/%,& T"+=&"35+&03$(,&3/(&N3/%&%#&),+&%"+'& B3,=&%#&,++&2+$-#$'3/0+&>+/+L%,&
6 m3'+&#-&b.-+&^m#b_&!.')*3<#/&n.%"&0+**,&)2(3</1&./&*#0;&,%+2& B30"&%)$/K&0#)/%&*.5./1&/+.1">#$,& 9+**&3*.5+&/+8%&%)$/&.-& 3*.5+&%".,&<'+&3/(&"35+&G&*.5./1&/+.1">#$,K&#$& "35+&R&*.5./1&/+.1">#$,& S#()*+&0#/,%$3./%,& 6$.+-&<'+W&9#)$,+&"3,&*#%,&#-&#%"+$&1#3*,& U/+&\EC'./)%+&*3>&3/(&23$%,&#-&G&*+0%)$+,& h+*3<5+*=&./+82+$.+/0+(&,%)(+/%,&!#'+&]),%&#)%&#-&9!&g& S3/=&(.(/Q%&;/#N&9&#$&A/.8&2$#1$3''./1& P(+3&#-&23$3**+*.,'& A/.%&1#3*,& 6+/+L%,&3/(&0#,%,&#-&,=,%+'&"+%+$#1+/+.%=& 43%3&'#5+'+/%&3/(&?ASJ& m+/+$3**=k&%"+&+}+0%&#-&3$0".%+0%)$+&#/& 2$#1$3'&2+$-#$'3/0+& J22$#30"&%3;+/& P/%$#()0%#$=&*+0%)$+& mva,w&'3,,.5+*=&23$3**+*k&#)%,.(+&9vak&;+$/+*,k&!ps4& b3>&.**),%$3</1&-+3%)$+,&#-&9a4j&3$0".%+0%)$+& 43%3&%$3/,-+$&<'+& T"$+3(&(.5+$1+/0+& S+'#$=&%=2+,&^/+8%&<'+_& Zb+,,#/,&*+3$/+([&*+0%)$+& h+.%+$3%+&3$0".%+0%)$+& 4+'#/,%$3%+&,2++()2&N.%"&m3'+&#-&b.-+& T3*;&3>#)%&),+&./&T#2&iEE&,=,%+',& 9A4J&2$#1$3''./1&'#(+*& CPU "host" data GPU "device" code ("kernel") data kernel invocations &"3,&'3/=&0#$+,K&#$13/.s+(&./%#&1$#)2,& RGC%"$+3(&N3$2,&+8+0)%+&%"+&,3'+&./,%$)0<#/& 43%3&%$3/,-+$& //allocate memory on the device: cudamalloc((void**) &a_dev, N*sizeof(int));... //transfer array a to GPU cudamemcpy(a_dev, a, N*sizeof(int), cudamemcpyhosttodevice);... direction indicator //transfer array res back from GPU: cudamemcpy(res, res_dev, N*sizeof(int), cudamemcpydevicetohost);
7 P/5#;./1&%"+&;+$/+*& 7+$/+*&.%,+*-& int threads = 512; //# threads per block int blocks = (N+threads 1)/threads; //# blocks (N/threads rounded up) kernel<<<blocks,threads>>>(res_dev, a_dev, b_dev); 6*#0;,&3$+&3/&#$13/.s3<#/3*&)/.%&-#$&%"$+3(,& V+$-#$'3/0+&.,&5+$=&(+2+/(+/%&#/&r>*#0;,& 3/(&r%"$+3(,& U/+&$)*+W&r%"$+3(,&,"#)*(&>+&')*<2*+&#-&RG& global void kernel(int* res, int* a, int* b) { //function that runs on GPU to do the addition //sets res[i] = a[i] + b[i]; each thread is responsible for one value of i int thread_id = threadidx.x + blockidx.x*blockdim.x; if(thread_id < N) { res[thread_id] = a[thread_id] + b[thread_id]; } } since #threads potentially > array size b3>&30<5.%=&dw&43%3&%$3/,-+$&<'+&!%)(+/%,&0#'23$+&$)//./1&<'+&#-& N#$;./1&9A4J&2$#1$3'&%#&3((&23.$&#-&5+0%#$,& 2$#1$3'&N.%"&(3%3&%$3/,-+$K&>)%&/#&3$.%"'+<0& 2$#1$3'&%"3%&(#+,&3$.%"'+<0&3/(&#/*=&D&(.$+0<#/& #-&(3%3&%$3/,-+$& U>,+$5+&%"3%&(3%3&%$3/,-+$&.,&>)*;&#-&%"+&<'+& b3>&30<5.%=&dw&43%3&%$3/,-+$&<'+&!%)(+/%,&0#'23$+&$)//./1&<'+&#-& N#$;./1&9A4J&2$#1$3'&%#&3((&23.$&#-&5+0%#$,& 2$#1$3'&N.%"&(3%3&%$3/,-+$K&>)%&/#&3$.%"'+<0& 2$#1$3'&%"3%&(#+,&3$.%"'+<0&3/(&#/*=&D&(.$+0<#/& #-&(3%3&%$3/,-+$& U>,+$5+&%"3%&(3%3&%$3/,-+$&.,&>)*;&#-&%"+&<'+& b3>&30<5.%=&gw&t"$+3(&(.5+$1+/0+&& 9#'23$+&%N#&3223$+/%*=&+p).53*+/%&;+$/+*,W& ~~1*#>3*~~&5#.(&;+$/+*~D^./%&j3_&g& &&&&./%&<(&d&%"$+3(p(8i8a& &&&&./%&0+**&d&<(&&RGa& &&&&3k0+**lffa& ~~1*#>3*~~&5#.(&;+$/+*~G^./%&j3_&g& &&&&./%&0+**&d&%"$+3(P(8I8&&RGa& &&&&,N.%0"^0+**_&g& &&&&03,+&EW&3kElffa&>$+3;a& &&&&03,+&DW&3kDlffa&>$+3;a& &&&&III&&&YY0#/</)+,&%#&03,+&\& &&&&(+-3)*%W&3k0+**lffa& && U>,+$5+&53,%*=&(.}+$+/%&$)//./1&<'+,& T"$+3(,&./&3&N3$2&(+5#%+&<'+&%#&D&./,%$)0<#/&2+$& 0*#0;&0=0*+&!"!#$%&$#'($)**$+,#$%($^#%"+$,&/#2_& b3>&30<5.%=&gw&t"$+3(&(.5+$1+/0+&& 9#'23$+&%N#&3223$+/%*=&+p).53*+/%&;+$/+*,W& ~~1*#>3*~~&5#.(&;+$/+*~D^./%&j3_&g& &&&&./%&<(&d&%"$+3(p(8i8a& &&&&./%&0+**&d&<(&&RGa& &&&&3k0+**lffa& ~~1*#>3*~~&5#.(&;+$/+*~G^./%&j3_&g& &&&&./%&0+**&d&%"$+3(P(8I8&&RGa& 
&&&&,N.%0"^0+**_&g& &&&&03,+&EW&3kElffa&>$+3;a& &&&&03,+&DW&3kDlffa&>$+3;a& &&&&III&&&YY0#/</)+,&%#&03,+&\& &&&&(+-3)*%W&3k0+**lffa& && U>,+$5+&53,%*=&(.}+$+/%&$)//./1&<'+,& T"$+3(,&./&3&N3$2&(+5#%+&<'+&%#&D&./,%$)0<#/&2+$& 0*#0;&0=0*+&!"!#$%&$#'($)**$+,#$%($^#%"+$,&/#2_&
8 b3>&30<5.%=&rw&s+'#$=&%=2+,&& 63,+(&#/&9"32&u&#-&k!3/(+$,&3/(&73/($#%K&Z9A4J&>=&+83'2*+[K&GEDDl& Zh3=&%$30./1[&%"3%&%+,%,&./%+$,+0<#/,&N.%"& 3$$3=&#-&#>]+0%,&./&%"+&,3'+&#$(+$&!2++(,&)2&N.%"&,N.%0"&%#&0#/,%3/%&'+'#$=& &53*)+,&3$+&%$3/,'.X+(&%#&+/<$+&"3*-&N3$2& &3**#N,&030"./1& V+$-#$'3/0+&.,&N#$,+&.-&%"$+3(,&300+,,& #>]+0%,&./&(.}+$+/%&#$(+$,& b3>&30<5.%=&rw&s+'#$=&%=2+,&& 63,+(&#/&9"32&u&#-&k!3/(+$,&3/(&73/($#%K&Z9A4J&>=&+83'2*+[K&GEDDl& Zh3=&%$30./1[&%"3%&%+,%,&./%+$,+0<#/,&N.%"& 3$$3=&#-&#>]+0%,&./&%"+&,3'+&#$(+$&!2++(,&)2&N.%"&,N.%0"&%#&0#/,%3/%&'+'#$=& &53*)+,&3$+&%$3/,'.X+(&%#&+/<$+&"3*-&N3$2& &3**#N,&030"./1& V+$-#$'3/0+&.,&N#$,+&.-&%"$+3(,&300+,,& #>]+0%,&./&(.}+$+/%&#$(+$,&!)$5+=&$+,)*%,W&m##(&/+N,& J,;+(&%#&(+,0$.>+&9VAYmVA&./%+$30<#/W& H&#-&DD&'+/<#/&>#%"&(3%3&'#5+'+/%&3/(&./5#;./1&;+$/+*& J/#%"+$&]),%&'+/<#/,&./5#;./1&%"+&;+$/+*& J,;+(&%#&+82*3./&+82+$.'+/%&.**),%$3</1&(3%3& '#5+'+/%&0#,%W& H&#-&DG&,3=&0#'23$./1&0#'2)%3<#/&3/(& 0#'')/.03<#/&0#,%& G&'#$+&%3*;&3>#)%&0#'23$./1&(.}+$+/%+$3<#/,&!)$5+=&$+,)*%,W&m##(&/+N,& J,;+(&%#&(+,0$.>+&9VAYmVA&./%+$30<#/W& H&#-&DD&'+/<#/&>#%"&(3%3&'#5+'+/%&3/(&./5#;./1&;+$/+*& J/#%"+$&]),%&'+/<#/,&./5#;./1&%"+&;+$/+*& J,;+(&%#&+82*3./&+82+$.'+/%&.**),%$3</1&(3%3& '#5+'+/%&0#,%W& H&#-&DG&,3=&0#'23$./1&0#'2)%3<#/&3/(& 0#'')/.03<#/&0#,%& G&'#$+&%3*;&3>#)%&0#'23$./1&(.}+$+/%+$3<#/,&!)$5+=&$+,)*%,W&?#%&,#&1##(&/+N,& J,;+(&%#&+82*3./&+82+$.'+/%&.**),%$3</1&%"$+3(& (.5+$1+/0+W& G&#-&H&N+$+&0#$$+0%& G&'#$+&,++'+(&%#&)/(+$,%3/(K&>)%&'.,),+(& %+$'./#*#1=& R&'#$+&$+'+'>+$+(&2+$-#$'3/0+&+}+0%K&>)%&,3.(& /#%"./1&3>#)%&%"+&03),+&&&& 9#/0*),.#/,& A/.%&N3,&'#,%*=&,)00+,,-)*K&>)%&%"$+3(& (.5+$1+/0+&.,&3&"3$(+$&0#/0+2%&!%)(+/%,&./%+$+,%+(&./&9A4J&3/(&3>#)%&"3*-& %"+&0*3,,&$+p)+,%+(&'#$+&#-&.%& 6#X#'&*./+W&J&>$.+-&./%$#()0<#/&.,&2#,,.>*+& +5+/&%#&,%)(+/%,&N.%"&*.'.%+(&>30;1$#)/(&
9 9*3,,$##'&"./%,&?++(&1$32".0,&03$(&#/&*#03*&'30"./+&^3%&*+3,%& J*%+$/3%+&'#(+*,& b+n.,&3/(&9*3$;k&v#$%*3/(&!%3%+& b+0%)$+&./%$#()0./1&9a4j& b3>yt:&),./1&.%&%#&,2++(&)2&m3'+&#-&b.-+& 43/.+*&B$/,%& b#/1+$&)/.%&n.%"&>#%"&u2+/sv&3/(&9a4j& m+/+$3*&+'2"3,.,&#/&%)/./1&(3%3&*3=#)%&3/(& 300+,,&23X+$/& ZTU4U[&*.,%&?+N&+83'2*+&-#$&%=2+,&#-&'+'#$=& B82*3./&%"$+3(&(.5+$1+/0+&>+X+$& S.((*+&1$#)/(W&3((./1&2$#1$3''./1&%#&'./+& #$&0#/0+2%)3*&'3%+$.3*&%#&bv9&5+$,.#/& V#$</1&0#(+&%#&#%"+$&>3,+&*3/1)31+,&^{353_& U%"+$&2$#1$3''./1&+83'2*+&^O_& V*+3,+&,"3$+w& S#()*+&R3& 9"32+*&./&J*1#$.%"',& ^63,+(&#/&+82+$.+/0+,&#-&7=*+&6)$;+& 3/(&#)$&]#./%&%)%#$.3*&3%&!9&B(& V$#1$3'K&GEDG_& :"3%&.,&9"32+*O& V3$3**+*&2$#1$3''./1&*3/1)31+&(+5+*#2+(& N.%"&2$#1$3''+$&2$#()0<5.%=&./&'./(& U$.1./3**=&9$3=Q,&2$#]+0%&)/(+$&4JhVJQ,&t.1"& V$#()0<5.%=&9#'2)</1&!=,%+',&2$#1$3'&!).%3>*+&-#$&,"3$+(C&#$&(.,%$.>)%+(&'+'#$=&,=,%+',& P/,%3**,&+3,.*=&#/&b./)8&3/(&S30&U!a&),+& 9=1N./&%#&./,%3**&#/&:./(#N,& %"3%&=#)&/++(& V$#5.(+,&".1"C*+5+*+$3<#/,& 4+,.1/+(&N.%"&23$3**+*.,'&./&'./(&
10 @*+8.>*+&,=/%38&!)22#$%,&,0$.2</1C*.;+&2$#1$3',W& N$.%+*/^Zt+**#&:#$*(w[_a& J*,#&2$#5.(+,&#>]+0%,&3/(&'#()*+,& V$#5.(+,&".1"C*+5+*+$3<#/,& &6&d&-^J_a&&YY322*.+,&-&+*+'+/%N.,+&-#$&3/=&-)/0<#/&-& P/0*)(+,&>).*%C./+$3%#$,W& &9&d&J&f&Da& &4&d&J&f&6a& &B&d&J&j&6a& &III& 4+,.1/+(&N.%"&23$3**+*.,'&./&'./(& U2+$3<#/,&#/&2$+5.#),&,*.(+,&23$3**+*.s+(& 3)%#'3<03**=& 9$+3%+&3,=/0"$#/#),&%3,;&NY&,./1*+&;+=N#$(& 6).*%C./&,=/0"$#/.s3<#/&-#$&%3,;,&3/(&53$.3>*+,& Zt+**#&:#$*([&./&9"32+*& 9$+3%+&L*+&"+**#I0"2*&0#/%3././1& &N$.%+*/^Zt+**#&:#$*(w[_a& 9#'2.*+&N.%"& &0"2*&q#&"+**#&"+**#I0"2*& h)/&n.%"& &IY"+**#& o3$.3>*+,&3/(&9#/,%3/%,& o3$.3>*+&(+0*3$3<#/&-#$'3%w& k0#/l1l&53$y0#/,%&.(+/<l+$&w&%=2+a& 53$&8&W&./%a& 0#/,%&2.&W&$+3*&d&RIDFa&& 0#/L1&0#/,%&/)'!.(+,&W&./%&d&Fa&&!+$.3*&9#/%$#*&!%$)0%)$+,&.-&,%3%+'+/%,K&N".*+&*##2,K&3/(&(#CN".*+&*##2,& 3$+&3**&2$+X=&,%3/(3$(& 4.}+$+/0+W&!%3%+'+/%&>#(.+,&'),%&+.%"+$&),+& >$30+,&#$&3/&+8%$3&;+=N#$(W& &.-^8&dd&i_&!"#$&=&d&Ra&+*,+&=&d&Da& &N".*+^8&e&i_&%&&8ffa&&
11 B83'2*+W&h+3(./1&)/<*&+#-& 53$&8&W&./%a& N".*+&,%(./I$+3(^8_&g& & &N$.%+*/^Zh+3(&53*)+&ZK&8_a& n && arg_type V$#0+()$+,Y@)/0<#/,& argument omit for generic function proc addone(in val : int, inout val2 : int) : int { val2 = val + 1; return val + 1; } return type (omit if none or if can be inferred) J$$3=,& P/(.0+,&(+%+$'./+(&>=&3&$3/1+W& &53$&J&W&kDIIil&./%a &&&&&&&&&&&&YY(+0*3$+,&J&3,&3$$3=&#-&i&./%,& &53$&6&W&kCRIIRl&./%a &&&&&&&&&&&&YY"3,&./(.0+,&CR&%"$)&R& &53$&9&W&kDIIDEK&DIIDEl&./%a&&YY')*<C(.'+/,.#/3*&3$$3=& J00+,,./1&./(.5.()3*&0+**,W& &JkDl&d&JkGl&f&GRa& h3/1+,&3*,#&),+(&./&-#$&*##2,w& &-#$&.&./&DIIDE&(#&,%3%+'+/%a& &-#$&.&./&DIIDE&g& & &*##2&>#(=& & 93/&3*,#&),+&3$$3=&#$&3/=%"./1&.%+$3>*+& V3$3**+*&b##2,& TN#&;./(,&#-&23$3**+*&*##2,W& &-#$3**&.&./&DIIDE&(#&,%3%+'+/%a&&YY#'.%&(#&NY&>$30+,& &0#-#$3**&.&./&DIIDE&(#&,%3%+'+/%a& -#$3**&0$+3%+,&D&%3,;&2+$&2$#0+,,./1&)/.%& 0#-#$3**&0$+3%+,&D&2+$&*##2&.%+$3<#/& A,+(&N"+/&+30"&.%+$3<#/&$+p).$+,&*#%,&#-&N#$;&3/(Y#$& %"+=&'),%&>+&(#/+&./&23$3**+*& J,=/0"$#/#),&T3,;,& B3,=&3,=/0"$#/#),&%3,;&0$+3<#/W& &>+1./&,%3%+'+/%a&& B3,=&-#$;C]#./&23$3**+*.,'W& &0#>+1./&g& & &,%3%+'+/%Da& & &,%3%+'+/%Ga& & &III& & &YY0$+3%+,&%3,;&2+$&,%3%+'+/%&3/(&N3.%,&"+$+&&
12 !=/0&>*#0;,&,=/0&>*#0;,&N3.%&-#$&%3,;,&0$+3%+(&./,.(+&.%& T"+,+&3$+&+p).53*+/%W& &&,=/0&g & & & & & &&&&&0#>+1./&g& &>+1./&,%3%+'+/%Da & & &&&&&,%3%+'+/%Da& &>+1./&,%3%+'+/%Ga & & &&&&&,%3%+'+/%Ga& &III & & & & & & & &&&&&III& n & & & & & & & & &&&&&!=/0&53$.3>*+,&,=/0&53$.3>*+,&"35+&53*)+&3/(&+'2%=Y-)**&,%3%+&,%#$+&Ä&D&53*)+&3/(&>*#0;+$3<#/,&03/Q%&2$#0++(& 93/&>+&),+(&3,&*#0;W& &53$&*#0;&W&,=/0&./%a& &*#0;&d&Da & & & &YY30p).$+,&*#0;& &III& &53$&%+'2&d&*#0;a & &YY$+*+3,+,&%"+&*#0;& J/3*=,.,&#-&J*1#$.%"',& 9"32+*&'3%+$.3*& J,,.1/&>3,.0&%)%#$.3*& T+30"&-#$3**&v&0#>+1./&^3*,#&3*1#$.%"'.0&/#%3<#/_& V$#]+0%,& V3$<<#/&./%+1+$,& 6)>>*+!#$%& S+$1+!#$%&?+3$+,%&?+.1">#$,& J*1#$.%"',&V$#]+0%W&b.,%&V3$<<#/& V3$<<#/&3&*.,%&%#&%N#&+p)3*C,)''./1&"3*5+,I& 6$)%+C-#$0+&3*1#$.%"'&^(#/Å%&;/#N&V&5,&?V&=+%_& Ç)+,<#/,W& :"3%&3$+&*#/1+,%&*.,%,&=#)&03/&%+,%O& :"3%&3>#)%&./&23$3**+*O& T$.0;W&+/)'+$3%+&2#,,.>.*.<+,&3/(&),+&-#$3**& J*1#$.%"',&V$#]+0%W&6)>>*+!#$%&! P/,%+3(&#-&*+zC%#C$.1"%K&%+,%&3**&23.$,&./&%N#&,%+2,w& J*1#$.%"',&V$#]+0%W&S+$1+!#$%& V3$3**+*&(.5.(+C3/(C0#/p)+$W&),+&0#>+1./& B*+13/%&(.5.,.#/W&,2*.%&%"+&4#'3./&!2++()2&/#%&3,&/#<0+3>*+& B83'2*+&#-&+82+/,.5+&23$3**+*+$"+3(&!&TN#&/+,%+(&-#$3**&*##2,&^./&,+p)+/0+_&./,.(+&3&-#$&*##2&
13 TN#&3*1#$.%"',W& ^),+&3&-#$3**&*.;+&>)>>*+!#$%_& 4.5.(+C3/(C9#/p)+$& ^),+&0#>+1./_& J&>.%&%$.0;=& o3*)+&#-&23$3**+*.,'w&')0"&+3,.+$&%#&2$#1$3'& %"+&>$)%+C-#$0+&'+%"#(& J*1#$.%"',&T3;+3N3=& b+3$/./1&0)$5+&#-&9"32+*&.,&,#&*#nk&,%)(+/%,& 03/&,%3$%&),./1&23$3**+*.,'&5+$=&p).0;*=&!)''./1&53*)+,&./&3/&3$$3=& S#()*+&R>& h+()0<#/,& ^h+()0<#/&-$3'+n#$;&-$#'&b./&3/(&!/=(+$k&-+%#.%/*!0$'&$/)+)**!*$ /+'1+)22%#1K&GEEHI_& !)''./1&53*)+,&./&3/&3$$3=& 16!)''./1&53*)+,&./&3/&3$$3=&
14 !)''./1&53*)+,&./&3/&3$$3=& 16!)''./1&53*)+,&./&3/&3$$3=& ,2 4,2 3, ,0 4,2 3,5 2, ,0 1,1 4,2 3,3 1,4 3,5 0,6 4,2 2 4,2 3,5 V3$%,&#-&3&$+()0<#/& T3**=W&P/%+$'+(.3%+&,%3%+&#-&0#'2)%3<#/& 9#'>./+W&9#'>./+&G&%3**.+,& 2,0 4,2 3,5 2,7 h+()0+c1+/w&m+/+$3%+&$+,)*%&-$#'&%3**=& 2,0 1,1 4,2 3,3 1,4 3,5 0,6 2,7 P/.%W&9$+3%+&Z+'2%=[&%3**=& J00)')*3%+W&J((&D&53*)+&%#&%3**=&
15 V3$%,&#-&3&$+()0<#/& T3**=W&P/%+$'+(.3%+&,%3%+&#-&0#'2)%3<#/& & && & & & &^53*)+K&./(+8_& 9#'>./+W&9#'>./+&G&%3**.+,& & && & & & &%3;+&N".0"+5+$&23.$&"3,&*3$1+$&53*)+& h+()0+c1+/w&m+/+$3%+&$+,)*%&-$#'&%3**=& & & & & &$+%)$/&%"+&./(+8& P/.%W&9$+3%+&Z+'2%=[&%3**=& TN#&.,,)+,&?++(&%#&0#/5+$%&./.<3*&53*)+,&./%#&%3**.+,& S3=&N3/%&,+23$3%++$3<#/&-#$&53*)+,&*#03*& %#&3&,./1*+&2$#0+,,#$& "Empty" tally Tally of these values J00)')*3%+W&J((&D&53*)+&%#&%3**=& TN#&.,,)+,&?++(&%#&0#/5+$%&./.<3*&53*)+,&./%#&%3**.+,& S3=&N3/%&,+23$3%++$3<#/&-#$&53*)+,&*#03*& %#&3&,./1*+&2$#0+,,#$& "Empty" tally Tally of these values V3$%,&#-&3&$+()0<#/& T3**=W&P/%+$'+(.3%+&,%3%+&#-&0#'2)%3<#/& 9#'>./+W&9#'>./+&G&%3**.+,& h+()0+c1+/w&m+/+$3%+&$+,)*%&-$#'&%3**=& P/.%W&9$+3%+&Z+'2%=[&%3**=& J00)')*3%+W&J((&D&53*)+&%#&%3**=& 0 i V3$3**+*&$+()0<#/&-$3'+N#$;& 7 c rg 36 c Tally: Intermediate state of computation i = Init: Create "empty" tally a = Accumulate: Add 1 value to tally c = Combine: Combine 2 tallies rg = Reduce gen: Generate result from tally a a a a a a a a i 3 2 i 8 4 i c 12 4+L/./1&$+()0<#/,& T3**=W&P/%+$'+(.3%+&,%3%+&#-&0#'2)%3<#/& 9#'>./+W&9#'>./+&G&%3**.+,& h+()0+c1+/w&m+/+$3%+&$+,)*%&-$#'&%3**=& P/.%W&9$+3%+&Z+'2%=[&%3**=& J00)')*3%+W&J((&D&53*)+&%#&%3**=&!3'2*+&2$#>*+',W&f&
16 4+L/./1&$+()0<#/,& T3**=W&P/%+$'+(.3%+&,%3%+&#-&0#'2)%3<#/& 9#'>./+W&9#'>./+&G&%3**.+,& h+()0+c1+/w&m+/+$3%+&$+,)*%&-$#'&%3**=& P/.%W&9$+3%+&Z+'2%=[&%3**=& J00)')*3%+W&J((&D&53*)+&%#&%3**=&!3'2*+&2$#>*+',W&fK&".,%#1$3'& 4+L/./1&$+()0<#/,& T3**=W&P/%+$'+(.3%+&,%3%+&#-&0#'2)%3<#/& 9#'>./+W&9#'>./+&G&%3**.+,& h+()0+c1+/w&m+/+$3%+&$+,)*%&-$#'&%3**=& P/.%W&9$+3%+&Z+'2%=[&%3**=& J00)')*3%+W&J((&D&53*)+&%#&%3**=&!3'2*+&2$#>*+',W&fK&".,%#1$3'K&'38& 4+L/./1&$+()0<#/,& T3**=W&P/%+$'+(.3%+&,%3%+&#-&0#'2)%3<#/& 9#'>./+W&9#'>./+&G&%3**.+,& h+()0+c1+/w&m+/+$3%+&$+,)*%&-$#'&%3**=& P/.%W&9$+3%+&Z+'2%=[&%3**=& J00)')*3%+W&J((&D&53*)+&%#&%3**=&!3'2*+&2$#>*+',W&fK&".,%#1$3'K&'38K&G /( &*3$1+,%&& 4+L/./1&$+()0<#/,& T3**=W&P/%+$'+(.3%+&,%3%+&#-&0#'2)%3<#/& 9#'>./+W&9#'>./+&G&%3**.+,& h+()0+c1+/w&m+/+$3%+&$+,)*%&-$#'&%3**=& P/.%W&9$+3%+&Z+'2%=[&%3**=& J00)')*3%+W&J((&D&53*)+&%#&%3**=&!3'2*+&2$#>*+',W&fK&".,%#1$3'K&'38K&G /( &*3$1+,%K&&&&& &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&*+/1%"&#-&*#/1+,%&$)/&& 93/&1#&>+=#/(&%"+,+III&./(+8U-&^L/(&./(+8&#-&L$,%�)$$+/0+_&,+p)+/0+&3*.1/'+/%&&&&&&&&&&&&&&&&&&&&&&&&&&&&&k!$./.53,&J*)$)l& /C>#(=&2$#>*+'&&& & & & & &&&&&&&k!$./.53,&j*)$)l& h+*3<#/,".2&%#&(=/3'.0&2$#1$3''./1& 9"3**+/1+,&./&(=/3'.0&2$#1$3''./1W& :"3%&3$+&%"+&%3>*+&+/%$.+,O& t#n&%#&0#'2)%+&3&%3>*+&+/%$=&-$#'&2$+5.#),&+/%$.+,o& 9"3**+/1+,&./&$+()0<#/&-$3'+N#$;W& :"3%&.,&%"+&%3**=O& t#n&%#&0#'2)%+&3&/+n&%3**.+,&-$#'&2$+5.#),&#/+,o&
17 h+()0<#/,&./&9"32+*& B82$+,,&$+()0<#/+$3<#/&./&,./1*+&*./+W& &53$&,&d&f&$+()0+&Ja &YYJ&.,&3$$3=K&,&1+%,&,)'&!)22#$%,&fK&jK&É&^8#$_K&vvK&ÑÑK&'38K&'./K&III& './*#0&3/(&'38*#0&$+%)$/&3&%)2*+&N.%"&53*)+& 3/(&.%,&./(+8W& &53$&^53*K&*#0_&d&'./*#0&$+()0+&Ja& h+()0<#/&+83'2*+& 93/&3*,#&),+&$+()0+&#/&-)/0<#/&2*),&3&$3/1+& 1 1 " x B8W&J22$#8.'3%+&ÖYG&),./1&&&&&&&&&&&&&&&&&W& # 2 dx "1 &0#/L1&0#/,%&/)'h+0%&d&DEEEEEEEa& &0#/,%&N.(%"&d&GIE&Y&/)'h+0%a&&&&&&&&&&&&YY$+0%3/1*+&N.(%"& &0#/,%&>3,+y&d&CD&C&N.(%"YGa& &0#/,%&"3*-VP&d&f&$+()0+&k.&./&DII/)'h+0%l& && &^N.(%"&j&,p$%^DIE&q&^>3,+y&f&.jN.(%"_jjG a& 4+L/./1&3&0),%#'&$+()0<#/& 9$+3%+&#>]+0%&%#&$+2$+,+/%&./%+$'+(.3%+&,%3%+& S),%&,)22#$%& 300)')*3%+W&3((,&3&,./1*+&+*+'+/%&%#&%"+&,%3%+& 0#'>./+W&3((,&3/#%"+$&./%+$'+(.3%+&,%3%+& 1+/+$3%+W&0#/5+$%,&,%3%+&#>]+0%&./%#&L/3*&#)%2)%& 0*3,,&9.$0*+&g& 53$&$3(.),&W&$+3*a& 2$#0&3$+3^_&W&$+3*&g& $+%)$/&RIDF&j&$3(.),&j&$3(.),a& 9*3,,+,&./&9"32+*& 53$&0DK&0G&W&9.$0*+a & & &YY0$+3%+,&G&9.$0*+&$+-+$+/0+,& 0D&d&/+N&9.$0*+^DE_a& &Yj&),+,&,=,%+'C,)22*.+(&0#/,%$)0%#$& & & & & & & & & & &%#&0$+3%+&3&9.$0*+&#>]+0%& & & & & & & & & & &3/(&'3;+,&0D&$+-+$&%#&.%&jY& 0G&d&0Da& & & & &YY'3;+,&0G&$+-+$&%#&%"+&,3'+&#>]+0%& (+*+%+&0Da & & & &YY'+'#$=&'),%&>+&'3/)3**=&-$++(&& 0*3,,&9.$0*+&W&!"32+&g& &&III& P/"+$.%3/0+& &YY9.$0*+&./"+$.%,&-$#'&!"32+& 53$&,&W&!"32+a&,&d&/+N&9.$0*+^DEIE_a&&&YY3)%#'3<0&03,%&%#&>3,+&0*3,,& 53$&3$+3&d&,I3$+3^_a &&&Yj&03**&$+0.2.+/%&(+%+$'./+(&& && & & & & & & &&>=&#>]+0%Q,&(=/3'.0&%=2+&jY& B83'2*+&Z0),%#'[&$+()0<#/& 0*3,,&S=S./&W&h+()0+!03/U2&g&YYL/(,&'./&+*+'+/%&^+p).5I&%#&>).*%C./&Z'./[_& &%=2+&+*%T=2+a & & & & & &YY%=2+&#-&+*+'+/%,& &53$&,#@3$&W&+*%T=2+&d&'38^+*%T=2+_a &YY'./.')'&,#&-3$& &2$#0&300)')*3%+^53*&W&+*%T=2+_&g& & & &.-^53*&e&,#@3$_&g&,#@3$&d&53*a& & &2$#0&0#'>./+^#%"+$&W&S=S./_&g& & & &.-^#%"+$I,#@3$&e&,#@3$_&g&,#@3$&d&#%"+$I,#@3$a& & &2$#0&1+/+$3%+^_&g&$+%)$/&,#@3$a& &
18 J/(&%"3%Q,&/#%&3**III&&&^,03/,_& P/,%+3(&#-&]),%&1+c/1+$3**&53*)+K&3*,#&0#'2)%+& 53*)+&-#$&+5+$=&2$+L8& J/(&%"3%Q,&/#%&3**III&&&^,03/,_& P/,%+3(&#-&]),%&1+c/1+$3**&53*)+K&3*,#&0#'2)%+& 53*)+&-#$&+5+$=&2$+L8& A A sum sum A,+-)*&3/,N+$./1&p)+$.+,&*.;+&& &&&&Z:"3%&.,&%"+&,)'&#-&+*+'+/%,&G&%"$)&\O[& && & & & & &d&,)'k\l&q&,)'kdl& A,+-)*&3/,N+$./1&p)+$.+,&*.;+&& &&&&Z:"3%&.,&%"+&,)'&#-&+*+'+/%,&G&%"$)&\O[& && & & & & &d&,)'k\l&q&,)'kdl& 9#'2)</1&%"+&,03/&./&23$3**+*& 9#'2)</1&%"+&,03/&./&23$3**+*& Upward pass to compute reduction. Downward pass to also compute scan Upward pass to compute reduction. Downward pass to also compute scan #N/N3$(&23,,&N.%"&-)/0<#/&*3>+*,& S3/=<#/,&-#$&'#()*+&R& 0 i 0 19 i = init a = accumulate A,./1&9"32+*&-#$&+3,+&#-&23$3**+*.s3<#/& h+()0<#/,&#/&232+$&^(+l/./1&3/(y#$&),./1_& input: a a a a a a a a J*,#&.'2*+'+/</1&$+()0<#/,&./&9"32+*& output: !.(+&p)+,<#/W&:"+$+&%#&2)%&.%O&
19 935+3%,&!<**&./&(+5+*#2'+/%& h+()0<#/,&,+$.3*.s+(&#/&')*<0#$+&^3,&#-&diu_& B$$#$&'+,,31+,&%"./&?+N&5+$,.#/,&+5+$=&u&'#/%",&q&,#'+&>.1&0"3/1+,&?#%&'3/=&*.>$3$.+,& ZTU4U[&*.,%&?#%+,K&,*.(+,K&3,,.1/'+/%,K&+%0& B5.(+/0+&#/&<+&%#&(=/3'.0&2$#1$3''./1&!3'2*+&3(#2<#/&,%$3%+1.+,& S#$+&322*.03<#/,&#-&$+()0<#/,&3/(&,03/,&?#&(+5+*#2'+/%&+/5.$#/'+/%& 9#''3/(C*./+&0#'2.*3<#/&./&b./)8& V*+3,+&,"3$+w& U%"+$&$+,#)$0+,& 9!&./&V3$3**+*& "X2WYY0,./23$3**+*I#$1& 43/&m$#,,'3/Q,&9!&G&/#%+,& "X2WYYNNNI0,I1,)I+()Y`%022Y0)$$.0)*)'Y& T"3/;,&-#$&=#)$&<'+& (>)/(+Ü;/#8I+()& "X2WYY-30)*%=I;/#8I+()Y(>)/(+Y%+30"./1Y99!9CS:DR&
Introduction to CUDA CIRC Summer School 2014
Introduction to CUDA CIRC Summer School 2014 Baowei Liu Center of Integrated Research Computing University of Rochester October 20, 2014 Introduction Overview What will you learn on this class? Start from
More informationShort modules for introducing parallel concepts
Short modules for introducing parallel concepts David Bunde Knox College Work par
More informationCUDA. More on threads, shared memory, synchronization. cuprintf
CUDA More on threads, shared memory, synchronization cuprintf Library function for CUDA Developers Copy the files from /opt/cuprintf into your source code folder #include cuprintf.cu global void testkernel(int
More informationCD _. _. 'p ~~M CD, CD~~~~V. C ~'* Co ~~~~~~~~~~~~- CD / X. pd.0 & CD. On 0 CDC _ C _- CD C P O ttic 2 _. OCt CD CD (IQ. q"3. 3 > n)1t.
n 5 L n q"3 +, / X g ( E 4 11 " ') $ n 4 ) w Z$ > _ X ~'* ) i 1 _ /3 L 2 _ L 4 : 5 n W 9 U~~~~~~ 5 T f V ~~~~~~~~~~~~ (Q ' ~~M 3 > n)1 % ~~~~V v,~~ _ + d V)m X LA) z~~11 4 _ N cc ', f 'd 4 5 L L " V +,
More informationCUDA Kenjiro Taura 1 / 36
CUDA Kenjiro Taura 1 / 36 Contents 1 Overview 2 CUDA Basics 3 Kernels 4 Threads and thread blocks 5 Moving data between host and device 6 Data sharing among threads in the device 2 / 36 Contents 1 Overview
More informationLecture 3: Introduction to CUDA
CSCI-GA.3033-004 Graphics Processing Units (GPUs): Architecture and Programming Lecture 3: Introduction to CUDA Some slides here are adopted from: NVIDIA teaching kit Mohamed Zahran (aka Z) mzahran@cs.nyu.edu
More informationLecture 8: GPU Programming. CSE599G1: Spring 2017
Lecture 8: GPU Programming CSE599G1: Spring 2017 Announcements Project proposal due on Thursday (4/28) 5pm. Assignment 2 will be out today, due in two weeks. Implement GPU kernels and use cublas library
More information;/ *. 7 &9*R . C 2 <0* # "2 <AC (:* "' < %)V AB C. Downloaded from qjal.smtc.ac.ir at 23: on Sunday May 13th 2018
( - ) 1394 @R$ 8 %& b ;/ *. 7 1393/12/18 1394/2/28 :6# :G"#a 2 "V*D 1 AB C &9*R "' : > ; *$ id: 0* :6 i: > (:* 0 > ' # *,2 T# "? 0 > # "5 EOAB N*> # "5 UA;
More informationOutline 2011/10/8. Memory Management. Kernels. Matrix multiplication. CIS 565 Fall 2011 Qing Sun
Outline Memory Management CIS 565 Fall 2011 Qing Sun sunqing@seas.upenn.edu Kernels Matrix multiplication Managing Memory CPU and GPU have separate memory spaces Host (CPU) code manages device (GPU) memory
More informationPlenty of Whoopee Strand New Years
N B V Y-NN \ - 4! / N N ) B > 3 9 N - N 95 q N N B zz 3 - z N Y B - 933 55 2 -- - -» 25-5 & V X X X X N Y B 5 932 / q - 8 4 6 B N 3 BN NY N ; -! 2-- - - 2 B z ; - - B VN N 4) - - - B N N N V 4- - 8 N-
More informationCUDA. Sathish Vadhiyar High Performance Computing
CUDA Sathish Vadhiyar High Performance Computing Hierarchical Parallelism Parallel computations arranged as grids One grid executes after another Grid consists of blocks Blocks assigned to SM. A single
More informationLecture 2: Introduction to CUDA C
CS/EE 217 GPU Architecture and Programming Lecture 2: Introduction to CUDA C David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2013 1 CUDA /OpenCL Execution Model Integrated host+device app C program Serial or
More informationModule 2: Introduction to CUDA C
ECE 8823A GPU Architectures Module 2: Introduction to CUDA C 1 Objective To understand the major elements of a CUDA program Introduce the basic constructs of the programming model Illustrate the preceding
More informationCUDA Programming (Basics, Cuda Threads, Atomics) Ezio Bartocci
TECHNISCHE UNIVERSITÄT WIEN Fakultät für Informatik Cyber-Physical Systems Group CUDA Programming (Basics, Cuda Threads, Atomics) Ezio Bartocci Outline of CUDA Basics Basic Kernels and Execution on GPU
More informationSPAREPARTSCATALOG: CONNECTORS SPARE CONNECTORS KTM ART.-NR.: 3CM EN
SPAREPARTSCATALOG: CONNECTORS ART.-NR.: 3CM3208201EN CONTENT SPARE CONNECTORS AA-AN SPARE CONNECTORS AO-BC SPARE CONNECTORS BD-BQ SPARE CONNECTORS BR-CD 3 4 5 6 SPARE CONNECTORS CE-CR SPARE CONNECTORS
More informationParallel Computing. Lecture 19: CUDA - I
CSCI-UA.0480-003 Parallel Computing Lecture 19: CUDA - I Mohamed Zahran (aka Z) mzahran@cs.nyu.edu http://www.mzahran.com GPU w/ local DRAM (device) Behind CUDA CPU (host) Source: http://hothardware.com/reviews/intel-core-i5-and-i7-processors-and-p55-chipset/?page=4
More informationHigh-Performance Computing Using GPUs
High-Performance Computing Using GPUs Luca Caucci caucci@email.arizona.edu Center for Gamma-Ray Imaging November 7, 2012 Outline Slide 1 of 27 Why GPUs? What is CUDA? The CUDA programming model Anatomy
More informationSPARE CONNECTORS KTM 2014
SPAREPARTSCATALOG: // ENGINE ART.-NR.: 3208201EN CONTENT CONNECTORS FOR WIRING HARNESS AA-AN CONNECTORS FOR WIRING HARNESS AO-BC CONNECTORS FOR WIRING HARNESS BD-BQ CONNECTORS FOR WIRING HARNESS BR-CD
More informationPerformance Diagnosis for Hybrid CPU/GPU Environments
Performance Diagnosis for Hybrid CPU/GPU Environments Michael M. Smith and Karen L. Karavanic Computer Science Department Portland State University Performance Diagnosis for Hybrid CPU/GPU Environments
More information22ND CENTURY_J1.xls Government Site Hourly Rate
Escalation rate 000 AA0 Administrative Assistant Level I 000 AA0 Administrative Assistant Level II 000 AB0 Application Engineer Level I 000 AB0 Application Engineer Level II 000 AC0 Application Programmer
More informationCS 179: GPU Computing. Lecture 2: The Basics
CS 179: GPU Computing Lecture 2: The Basics Recap Can use GPU to solve highly parallelizable problems Performance benefits vs. CPU Straightforward extension to C language Disclaimer Goal for Week 1: Fast-paced
More informationModule 2: Introduction to CUDA C. Objective
ECE 8823A GPU Architectures Module 2: Introduction to CUDA C 1 Objective To understand the major elements of a CUDA program Introduce the basic constructs of the programming model Illustrate the preceding
More informationAppendix 5-1: Attachment J.1 Pricing Table -1: IMS Ceiling Loaded Rates at Contractor Site
Appendix 5-1: Attachment J.1 Pricing Table -1: IMS Ceiling Loaded Rates at Contractor Site Escalation rate 4.6% 4.6% 4.6% 4.6% 4.6% 4.6% 4.6% 4.6% 4.6% 0001 AA01 Administrative Assistant Level I $51.00
More informationCS/CoE 1541 Final exam (Fall 2017). This is the cumulative final exam given in the Fall of Question 1 (12 points): was on Chapter 4
CS/CoE 1541 Final exam (Fall 2017). Name: This is the cumulative final exam given in the Fall of 2017. Question 1 (12 points): was on Chapter 4 Question 2 (13 points): was on Chapter 4 For Exam 2, you
More informationGPU Programming. Lecture 2: CUDA C Basics. Miaoqing Huang University of Arkansas 1 / 34
1 / 34 GPU Programming Lecture 2: CUDA C Basics Miaoqing Huang University of Arkansas 2 / 34 Outline Evolvements of NVIDIA GPU CUDA Basic Detailed Steps Device Memories and Data Transfer Kernel Functions
More informationGPU Programming with CUDA. Pedro Velho
GPU Programming with CUDA Pedro Velho Meeting the audience! How many of you used concurrent programming before? How many threads? How many already used CUDA? Introduction from games to science 1 2 Architecture
More informationRegister file. A single large register file (ex. 16K registers) is partitioned among the threads of the dispatched blocks.
Sharing the resources of an SM Warp 0 Warp 1 Warp 47 Register file A single large register file (ex. 16K registers) is partitioned among the threads of the dispatched blocks Shared A single SRAM (ex. 16KB)
More informationComputation to Core Mapping Lessons learned from a simple application
Lessons learned from a simple application Matrix Multiplication Used as an example throughout the course Goal for today: Show the concept of Computation-to-Core Mapping Block schedule, Occupancy, and thread
More informationCUDA Programming. Week 1. Basic Programming Concepts Materials are copied from the reference list
CUDA Programming Week 1. Basic Programming Concepts Materials are copied from the reference list G80/G92 Device SP: Streaming Processor (Thread Processors) SM: Streaming Multiprocessor 128 SP grouped into
More informationPOSIX threads CS 241. February 17, Copyright University of Illinois CS 241 Staff
POSIX threads CS 241 February 17, 2012 Copyright University of Illinois CS 241 Staff 1 Recall: Why threads over processes? Creating a new process can be expensive Time A call into the operating system
More informationCS377P Programming for Performance GPU Programming - I
CS377P Programming for Performance GPU Programming - I Sreepathi Pai UTCS November 9, 2015 Outline 1 Introduction to CUDA 2 Basic Performance 3 Memory Performance Outline 1 Introduction to CUDA 2 Basic
More informationMassively Parallel Algorithms
Massively Parallel Algorithms Introduction to CUDA & Many Fundamental Concepts of Parallel Programming G. Zachmann University of Bremen, Germany cgvr.cs.uni-bremen.de Hybrid/Heterogeneous Computation/Architecture
More informationProgrammable Accelerators
Programmable Accelerators Jason Lowe-Power powerjg@cs.wisc.edu cs.wisc.edu/~powerjg Increasing specialization Need to program these accelerators Challenges 1. Consistent pointers 2. Data movement 3. Security
More informationGPU Computing: A Quick Start
GPU Computing: A Quick Start Orest Shardt Department of Chemical and Materials Engineering University of Alberta August 25, 2011 Session Goals Get you started with highly parallel LBM Take a practical
More informationIntroduction to CUDA CME343 / ME May James Balfour [ NVIDIA Research
Introduction to CUDA CME343 / ME339 18 May 2011 James Balfour [ jbalfour@nvidia.com] NVIDIA Research CUDA Programing system for machines with GPUs Programming Language Compilers Runtime Environments Drivers
More informationSMBJ5.0 thru SMBJ188CA. Surface Mount TRANSZORB Transient Voltage Suppressors. Vishay General Semiconductor
SMBJ5.0 thru SMBJ188CA Surface Mount TRANSZORB Transient Voltage Suppressors DO-214AA (SMB J-Bend) PRIMARY CHARACTERISTICS V WM 5.0 V to 188 V P PPM 600 W I FSM (uni-directional only) A T J max. 150 C
More informationFall 2007, Final Exam, Data Structures and Algorithms
Fall 2007, Final Exam, Data Structures and Algorithms Name: Section: Email id: 12th December, 2007 This is an open book, one crib sheet (2 sides), closed notebook exam. Answer all twelve questions. Each
More informationBy: Tomer Morad Based on: Erik Lindholm, John Nickolls, Stuart Oberman, John Montrym. NVIDIA TESLA: A UNIFIED GRAPHICS AND COMPUTING ARCHITECTURE In IEEE Micro 28(2), 2008 } } Erik Lindholm, John Nickolls,
More informationUsing Chapel to teach parallel concepts. David Bunde Knox College
Using Chapel to teach parallel concepts David Bunde Knox College dbunde@knox.edu Acknowledgements Silent partner: Kyle Burke Material drawn from tutorials created with contributions from Johnathan Ebbers,
More informationCOSC 6374 Parallel Computations Introduction to CUDA
COSC 6374 Parallel Computations Introduction to CUDA Edgar Gabriel Fall 2014 Disclaimer Material for this lecture has been adopted based on various sources Matt Heavener, CS, State Univ. of NY at Buffalo
More informationLessons learned from a simple application
Computation to Core Mapping Lessons learned from a simple application A Simple Application Matrix Multiplication Used as an example throughout the course Goal for today: Show the concept of Computation-to-Core
More informationπ = 4 N in_circle N total ... // includes # define N 1000000 float uniform_rand(unsigned* seed, float lower, float upper) {... int main () { int num_in_circ = 0; float x, y, dist; #pragma omp parallel
More informationReductions II: The Revenge
Reductions II: The Revenge Summing values in an array 16 10 6 3 7 4 2 2 1 4 3 1 3 0 2 Summing values in an array 16 10 6 3 7 4 2 2 1 4 3 1 3 0 2 Finding max of an array 4 4 3 2 4 3 2 2 1 4 3 1 3 0 2 Finding
More informationCUDA C Programming Mark Harris NVIDIA Corporation
CUDA C Programming Mark Harris NVIDIA Corporation Agenda Tesla GPU Computing CUDA Fermi What is GPU Computing? Introduction to Tesla CUDA Architecture Programming & Memory Models Programming Environment
More informationGPU programming CUDA C. GPU programming,ii. COMP528 Multi-Core Programming. Different ways:
COMP528 Multi-Core Programming GPU programming,ii www.csc.liv.ac.uk/~alexei/comp528 Alexei Lisitsa Dept of computer science University of Liverpool a.lisitsa@.liverpool.ac.uk Different ways: GPU programming
More informationParallel Numerical Algorithms
Parallel Numerical Algorithms http://sudalab.is.s.u-tokyo.ac.jp/~reiji/pna14/ [ 10 ] GPU and CUDA Parallel Numerical Algorithms / IST / UTokyo 1 PNA16 Lecture Plan General Topics 1. Architecture and Performance
More informationEfficient CPU GPU data transfers CUDA 6.0 Unified Virtual Memory
Institute of Computational Science Efficient CPU GPU data transfers CUDA 6.0 Unified Virtual Memory Juraj Kardoš (University of Lugano) July 9, 2014 Juraj Kardoš Efficient GPU data transfers July 9, 2014
More informationTechnische Universität München. GPU Programming. Rüdiger Westermann Chair for Computer Graphics & Visualization. Faculty of Informatics
GPU Programming Rüdiger Westermann Chair for Computer Graphics & Visualization Faculty of Informatics Overview Programming interfaces and support libraries The CUDA programming abstraction An in-depth
More informationOptimizing Parallel Reduction in CUDA. Mark Harris NVIDIA Developer Technology
Optimizing Parallel Reduction in CUDA Mark Harris NVIDIA Developer Technology Parallel Reduction Common and important data parallel primitive Easy to implement in CUDA Harder to get it right Serves as
More informationCS/EE 217 GPU Architecture and Parallel Programming. Lecture 10. Reduction Trees
CS/EE 217 GPU Architecture and Parallel Programming Lecture 10 Reduction Trees David Kirk/NVIDIA and Wen-mei W. Hwu University of Illinois, 2007-2012 1 Objective To master Reduction Trees, arguably the
More informationEEM528 GPU COMPUTING
EEM528 CS 193G GPU COMPUTING Lecture 2: GPU History & CUDA Programming Basics Slides Credit: Jared Hoberock & David Tarjan CS 193G History of GPUs Graphics in a Nutshell Make great images intricate shapes
More informationComputability and Complexity Sample Exam Questions
Computability and Complexity Sample Exam Questions Wolfgang Schreiner Wolfgang.Schreiner@risc.jku.at Family Name: Given Name: Matriculation Number: Study Code: Total: 100 Points. 51 Points: GEN4 64 Points:
More informationCS 179: GPU Programming. Lecture 7
CS 179: GPU Programming Lecture 7 Week 3 Goals: More involved GPU-accelerable algorithms Relevant hardware quirks CUDA libraries Outline GPU-accelerated: Reduction Prefix sum Stream compaction Sorting(quicksort)
More informationAtomic Operations. Atomic operations, fast reduction. GPU Programming. Szénási Sándor.
Atomic Operations Atomic operations, fast reduction GPU Programming http://cuda.nik.uni-obuda.hu Szénási Sándor szenasi.sandor@nik.uni-obuda.hu GPU Education Center of Óbuda University ATOMIC OPERATIONS
More informationIntroduction to GPGPU and GPU-architectures
Introduction to GPGPU and GPU-architectures Henk Corporaal Gert-Jan van den Braak http://www.es.ele.tue.nl/ Contents 1. What is a GPU 2. Programming a GPU 3. GPU thread scheduling 4. GPU performance bottlenecks
More informationLecture 6 CSE 260 Parallel Computation (Fall 2015) Scott B. Baden. Computing with Graphical Processing Units CUDA Programming Matrix multiplication
Lecture 6 CSE 260 Parallel Computation (Fall 2015) Scott B. Baden Computing with Graphical Processing Units CUDA Programming Matrix multiplication Announcements A2 has been released: Matrix multiplication
More informationCUDA. GPU Computing. K. Cooper 1. 1 Department of Mathematics. Washington State University
GPU Computing K. Cooper 1 1 Department of Mathematics Washington State University 2014 Review of Parallel Paradigms MIMD Computing Multiple Instruction Multiple Data Several separate program streams, each
More informationOptimizing Parallel Reduction in CUDA. Mark Harris NVIDIA Developer Technology
Optimizing Parallel Reduction in CUDA Mark Harris NVIDIA Developer Technology Parallel Reduction Common and important data parallel primitive Easy to implement in CUDA Harder to get it right Serves as
More informationCS333 Intro to Operating Systems. Jonathan Walpole
CS333 Intro to Operating Systems Jonathan Walpole Threads & Concurrency 2 Threads Processes have the following components: - an address space - a collection of operating system state - a CPU context or
More informationGPU CUDA Programming
GPU CUDA Programming 이정근 (Jeong-Gun Lee) 한림대학교컴퓨터공학과, 임베디드 SoC 연구실 www.onchip.net Email: Jeonggun.Lee@hallym.ac.kr ALTERA JOINT LAB Introduction 차례 Multicore/Manycore and GPU GPU on Medical Applications
More informationIntroduction to parallel computing. Seminar Organization
Introduction to parallel computing Rami Melhem Department of Computer Science 1 Seminar Organization 1) Introductory lectures (probably 4) 2) aper presentations by students (2/3 per short/long class) -
More informationLecture 5. Performance Programming with CUDA
Lecture 5 Performance Programming with CUDA Announcements 2011 Scott B. Baden / CSE 262 / Spring 2011 2 Today s lecture Matrix multiplication 2011 Scott B. Baden / CSE 262 / Spring 2011 3 Memory Hierarchy
More informationSMBJ5.0 thru SMBJ188CA. Surface Mount TRANSZORB Transient Voltage Suppressors. Vishay General Semiconductor
SMBJ5.0 thru SMBJ188CA Surface Mount TRANSZORB Transient Voltage Suppressors DO-214AA (SMB J-Bend) PRIMARY CHARACTERISTICS V WM 5.0 V to 188 V P PPM 600 W I FSM (uni-directional only) A T J max. 150 C
More informationObjective. GPU Teaching Kit. OpenACC. To understand the OpenACC programming model. Introduction to OpenACC
GPU Teaching Kit Accelerated Computing OpenACC Introduction to OpenACC Objective To understand the OpenACC programming model basic concepts and pragma types simple examples 2 2 OpenACC The OpenACC Application
More informationSMF Transient Voltage Suppressor Diode Series
SMF Transient Voltage Suppressor Diode Series General Information The SMF series is designed specifically to protect sensitive electronic equipment from voltage transients induced by lightning and other
More informationSC13 GPU Technology Theater. Accessing New CUDA Features from CUDA Fortran Brent Leback, Compiler Manager, PGI
SC13 GPU Technology Theater Accessing New CUDA Features from CUDA Fortran Brent Leback, Compiler Manager, PGI The Case for Fortran Clear, straight-forward syntax Successful legacy in the scientific community
More informationFLIGHTS TO / FROM CANADA ARE DOMESTIC
MINIMUM CONNECTING TIME Houston, USA FLIGHTS TO / FROM CANADA ARE DOMESTIC IAH (Intercontinental Airport) DOMESTIC TO DOMESTIC :45 AA TO DL :40 CO TO AC :30 AA, UA, US :20 CO :30 DL, WN :25 DOMESTIC TO
More informationCS510 Operating System Foundations. Jonathan Walpole
CS510 Operating System Foundations Jonathan Walpole The Process Concept 2 The Process Concept Process a program in execution Program - description of how to perform an activity instructions and static
More informationCSE 591: GPU Programming. Using CUDA in Practice. Klaus Mueller. Computer Science Department Stony Brook University
CSE 591: GPU Programming Using CUDA in Practice Klaus Mueller Computer Science Department Stony Brook University Code examples from Shane Cook CUDA Programming Related to: score boarding load and store
More informationParallel Accelerators
Parallel Accelerators Přemysl Šůcha ``Parallel algorithms'', 2017/2018 CTU/FEL 1 Topic Overview Graphical Processing Units (GPU) and CUDA Vector addition on CUDA Intel Xeon Phi Matrix equations on Xeon
More informationHPCSE II. GPU programming and CUDA
HPCSE II GPU programming and CUDA What is a GPU? Specialized for compute-intensive, highly-parallel computation, i.e. graphic output Evolution pushed by gaming industry CPU: large die area for control
More informationReduction of a Symmetrical Matrix. to Tridiagonal Form on GPUs
Reduction of a Symmetrical Matrix to Tridiagonal Form on GPUs By Shuotian Chen Department of Electrical and Computer Engineering University of Illinois at Urbana-Champaign Adviser: Professor Volodymyr
More informationIntroduction to GPU Programming
Introduction to GPU Programming Mubashir Adnan Qureshi http://www.ncsa.illinois.edu/people/kindr/projects/hpca/files/singapore_p1.pdf http://developer.download.nvidia.com/cuda/training/nvidia_gpu_computing_webinars_cuda_memory_optimization.pdf
More informationHigh Performance Computing and GPU Programming
High Performance Computing and GPU Programming Lecture 3: GPU Application GPU Intro Review Simple Example Memory Effects GPU Intro Review GPU Intro Review Shared Multiprocessors Global parallelism Assign
More informationWorking with Data sent to a Computer or Flash Stick
Working with Data sent to a Computer or Flash Stick File Names and File Formats Data sent to a flash stick is saved as a.txt file. The file name for the saved file is in the format: DATA FILE TYPE Send
More informationUnrolling parallel loops
Unrolling parallel loops Vasily Volkov UC Berkeley November 14, 2011 1 Today Very simple optimization technique Closely resembles loop unrolling Widely used in high performance codes 2 Mapping to GPU:
More informationStanford University. NVIDIA Tesla M2090. NVIDIA GeForce GTX 690
Stanford University NVIDIA Tesla M2090 NVIDIA GeForce GTX 690 Moore s Law 2 Clock Speed 10000 Pentium 4 Prescott Core 2 Nehalem Sandy Bridge 1000 Pentium 4 Williamette Clock Speed (MHz) 100 80486 Pentium
More informationCUDA Basics. July 6, 2016
Mitglied der Helmholtz-Gemeinschaft CUDA Basics July 6, 2016 CUDA Kernels Parallel portion of application: execute as a kernel Entire GPU executes kernel, many threads CUDA threads: Lightweight Fast switching
More informationLecture 3. Programming with GPUs
Lecture 3 Programming with GPUs GPU access Announcements lilliput: Tesla C1060 (4 devices) cseclass0{1,2}: Fermi GTX 570 (1 device each) MPI Trestles @ SDSC Kraken @ NICS 2011 Scott B. Baden / CSE 262
More informationPinned-Memory. Table of Contents. Streams Learning CUDA to Solve Scientific Problems. Objectives. Technical Issues Stream. Pinned-memory.
Table of Contents Streams Learning CUDA to Solve Scientific Problems. 1 Objectives Miguel Cárdenas Montes Centro de Investigaciones Energéticas Medioambientales y Tecnológicas, Madrid, Spain miguel.cardenas@ciemat.es
More informationParallel Programming and Debugging with CUDA C. Geoff Gerfin Sr. System Software Engineer
Parallel Programming and Debugging with CUDA C Geoff Gerfin Sr. System Software Engineer CUDA - NVIDIA s Architecture for GPU Computing Broad Adoption Over 250M installed CUDA-enabled GPUs GPU Computing
More informationBasic Elements of CUDA Algoritmi e Calcolo Parallelo. Daniele Loiacono
Basic Elements of CUDA Algoritmi e Calcolo Parallelo References q This set of slides is mainly based on: " CUDA Technical Training, Dr. Antonino Tumeo, Pacific Northwest National Laboratory " Slide of
More informationGPU Programming. Alan Gray, James Perry EPCC The University of Edinburgh
GPU Programming EPCC The University of Edinburgh Contents NVIDIA CUDA C Proprietary interface to NVIDIA architecture CUDA Fortran Provided by PGI OpenCL Cross platform API 2 NVIDIA CUDA CUDA allows NVIDIA
More informationIntroduction to Parallel Programming
Introduction to Parallel Programming Pablo Brubeck Department of Physics Tecnologico de Monterrey October 14, 2016 Student Chapter Tecnológico de Monterrey Tecnológico de Monterrey Student Chapter Outline
More informationCS510 Operating System Foundations. Jonathan Walpole
CS510 Operating System Foundations Jonathan Walpole Threads & Concurrency 2 Why Use Threads? Utilize multiple CPU s concurrently Low cost communication via shared memory Overlap computation and blocking
More informationParallel Accelerators
Parallel Accelerators Přemysl Šůcha ``Parallel algorithms'', 2017/2018 CTU/FEL 1 Topic Overview Graphical Processing Units (GPU) and CUDA Vector addition on CUDA Intel Xeon Phi Matrix equations on Xeon
More informationIntroduction to GPU Computing Using CUDA. Spring 2014 Westgid Seminar Series
Introduction to GPU Computing Using CUDA Spring 2014 Westgid Seminar Series Scott Northrup SciNet www.scinethpc.ca March 13, 2014 Outline 1 Heterogeneous Computing 2 GPGPU - Overview Hardware Software
More informationCS 470 Spring Other Architectures. Mike Lam, Professor. (with an aside on linear algebra)
CS 470 Spring 2016 Mike Lam, Professor Other Architectures (with an aside on linear algebra) Parallel Systems Shared memory (uniform global address space) Primary story: make faster computers Programming
More informationGPU Programming Using CUDA
GPU Programming Using CUDA Michael J. Schnieders Depts. of Biomedical Engineering & Biochemistry The University of Iowa & Gregory G. Howes Department of Physics and Astronomy The University of Iowa Iowa
More informationProgram Optimization. Jo, Heeseung
Program Optimization Jo, Heeseung Today Overview Generally Useful Optimizations Code motion/precomputation Strength reduction Sharing of common subexpressions Removing unnecessary procedure calls Optimization
More informationIntroduction to GPU Computing Using CUDA. Spring 2014 Westgid Seminar Series
Introduction to GPU Computing Using CUDA Spring 2014 Westgid Seminar Series Scott Northrup SciNet www.scinethpc.ca (Slides http://support.scinet.utoronto.ca/ northrup/westgrid CUDA.pdf) March 12, 2014
More informationBOOK-IT 8.0. SIP2 implementation in the Z39.70 server
BOOK-IT 8.0 2015-07-29 Axiell Sverige AB, Box 24014, 224 21 LUND. Besöksadress: Fältspatsvägen 4, 224 78 LUND Tel 046-270 04 00, e-post: axiellsverige@axiell.com, www.axiell.se Table of contents 1 Introduction...
More informationIntroduction to CUDA C/C++ Mark Ebersole, NVIDIA CUDA Educator
Introduction to CUDA C/C++ Mark Ebersole, NVIDIA CUDA Educator What is CUDA? Programming language? Compiler? Classic car? Beer? Coffee? CUDA Parallel Computing Platform www.nvidia.com/getcuda Programming
More informationGPU Computing: Introduction to CUDA. Dr Paul Richmond
GPU Computing: Introduction to CUDA Dr Paul Richmond http://paulrichmond.shef.ac.uk This lecture CUDA Programming Model CUDA Device Code CUDA Host Code and Memory Management CUDA Compilation Programming
More informationGPU programming: CUDA basics. Sylvain Collange Inria Rennes Bretagne Atlantique
GPU programming: CUDA basics Sylvain Collange Inria Rennes Bretagne Atlantique sylvain.collange@inria.fr This lecture: CUDA programming We have seen some GPU architecture Now how to program it? 2 Outline
More informationStandard Interchange Protocol 3.0 Part I
3 Standard Interchange Protocol 3.0 Part I 78-8129-4832-7 Copyright 2011, 3M. All rights reserved. Page 1 Contents Introduction...5 History of SIP...5 Definitions...6 Document Conventions...6 What s New
More informationLast class: Today: Thread Background. Thread Systems
1 Last class: Thread Background Today: Thread Systems 2 Threading Systems 3 What kind of problems would you solve with threads? Imagine you are building a web server You could allocate a pool of threads,
More informationCS 1110: Introduction to Computing Using Python Loop Invariants
CS 1110: Introduction to Computing Using Python Lecture 21 Loop Invariants [Andersen, Gries, Lee, Marschner, Van Loan, White] Announcements Prelim 2 conflicts due by midnight tonight Lab 11 is out Due
More informationGPU Computing with CUDA
GPU Computing with CUDA Hands-on: Shared Memory Use (Dot Product, Matrix Multiplication) Dan Melanz & Dan Negrut Simulation-Based Engineering Lab Wisconsin Applied Computing Center Department of Mechanical
More informationCS 220: Introduction to Parallel Computing. Condition Variables. Lecture 24
CS 220: Introduction to Parallel Computing Condition Variables Lecture 24 Remember: Creating a Thread int pthread_create( pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine)(void *),
More information