Local copy of object argument to CUDA kernel -
i understand this answer, arguments cuda kernel passed via constant memory (for compute capability 2.0 , higher) and, if modified, stored local copies in either registers or on stack. happens if argument object , of members modified kernel? whole object have stored locally, or copies created modified members?
this interesting question hadn't considered before, , answer appears used members of structure loaded register (at least empirically based on 1 example).
consider following contrived example:
struct parameters { float w,x,y,z; int a,b,c,d; }; __global__ void kernel(float *in, float *out, parameters p) { unsigned int tid = threadidx.x + blockidx.x * blockdim.x; float val_in = in[tid]; p.b += 10; p.w *= 2.0f; p.z /= 5.0f; out[tid] = (p.b>0) ? (p.w*val_in) : (p.z*val_in); }
if compiler loads parameters required, should see 3 32 bit parameter loads p
register. resulting ptx emitted compiler (cuda 5.0 release compiler sm_30) looks this:
// // generated nvidia nvvm compiler // compiler built on sat sep 22 02:35:14 2012 (1348274114) // cuda compilation tools, release 5.0, v0.2.1221 // .version 3.1 .target sm_30 .address_size 64 .file 1 "/tmp/tmpxft_00000b1a_00000000-9_parameters.cpp3.i" .file 2 "/home/talonmies/parameters.cu" .file 3 "/opt/cuda-5.0/bin/../include/device_functions.h" .visible .entry _z6kernelpfs_10parameters( .param .u64 _z6kernelpfs_10parameters_param_0, .param .u64 _z6kernelpfs_10parameters_param_1, .param .align 4 .b8 _z6kernelpfs_10parameters_param_2[32] ) { .reg .pred %p<2>; .reg .s32 %r<9>; .reg .f32 %f<8>; .reg .s64 %rd<8>; ld.param.u64 %rd1, [_z6kernelpfs_10parameters_param_0]; ld.param.u64 %rd2, [_z6kernelpfs_10parameters_param_1]; ld.param.f32 %f1, [_z6kernelpfs_10parameters_param_2+12]; ld.param.f32 %f2, [_z6kernelpfs_10parameters_param_2]; ld.param.u32 %r1, [_z6kernelpfs_10parameters_param_2+20]; cvta.to.global.u64 %rd3, %rd2; ///home/talonmies/parameters.cu:11 unsigned int tid = threadidx.x + blockidx.x * blockdim.x; .loc 2 11 1 mov.u32 %r2, %ntid.x; mov.u32 %r3, %ctaid.x; mov.u32 %r4, %tid.x; mad.lo.s32 %r5, %r2, %r3, %r4; cvta.to.global.u64 %rd4, %rd1; ///home/talonmies/parameters.cu:12 float val_in = in[tid]; .loc 2 12 1 mul.wide.u32 %rd5, %r5, 4; add.s64 %rd6, %rd4, %rd5; ///home/talonmies/parameters.cu:14 p.b += 10; .loc 2 14 1 add.s32 %r6, %r1, 10; ///home/talonmies/parameters.cu:15 p.w *= 2.0f; .loc 2 15 1 add.f32 %f3, %f2, %f2; ///opt/cuda-5.0/bin/../include/device_functions.h:2399 return / b; .loc 3 2399 3 div.rn.f32 %f4, %f1, 0f40a00000; ///home/talonmies/parameters.cu:18 out[tid] = (p.b>0) ? (p.w*val_in) : (p.z*val_in); .loc 2 18 1 setp.gt.s32 %p1, %r6, 0; selp.f32 %f5, %f3, %f4, %p1; ///home/talonmies/parameters.cu:12 float val_in = in[tid]; .loc 2 12 1 ld.global.f32 %f6, [%rd6]; ///home/talonmies/parameters.cu:18 out[tid] = (p.b>0) ? (p.w*val_in) : (p.z*val_in); .loc 2 18 1 mul.f32 %f7, %f5, %f6; add.s64 %rd7, %rd3, %rd5; st.global.f32 [%rd7], %f7; ///home/talonmies/parameters.cu:19 } .loc 2 19 2 ret; }
you can see _z6kernelpfs_10parameters_param_2
(which p.w
), _z6kernelpfs_10parameters_param_2+12
(which p.z
), , _z6kernelpfs_10parameters_param_2+20
(which p.b
) loaded registers. other members of structure never loaded kernel.
Comments
Post a Comment