Local copy of object argument to CUDA kernel -


i understand this answer, arguments cuda kernel passed via constant memory (for compute capability 2.0 , higher) and, if modified, stored local copies in either registers or on stack. happens if argument object , of members modified kernel? whole object have stored locally, or copies created modified members?

this interesting question hadn't considered before, , answer appears used members of structure loaded register (at least empirically based on 1 example).

consider following contrived example:

struct parameters {     float w,x,y,z;     int a,b,c,d; };  __global__ void kernel(float *in, float *out, parameters p) {     unsigned int tid = threadidx.x + blockidx.x * blockdim.x;     float val_in = in[tid];      p.b += 10;     p.w *= 2.0f;     p.z /= 5.0f;      out[tid] = (p.b>0) ? (p.w*val_in) : (p.z*val_in); } 

if compiler loads parameters required, should see 3 32 bit parameter loads p register. resulting ptx emitted compiler (cuda 5.0 release compiler sm_30) looks this:

// // generated nvidia nvvm compiler // compiler built on sat sep 22 02:35:14 2012 (1348274114) // cuda compilation tools, release 5.0, v0.2.1221 //  .version 3.1 .target sm_30 .address_size 64      .file   1 "/tmp/tmpxft_00000b1a_00000000-9_parameters.cpp3.i"     .file   2 "/home/talonmies/parameters.cu"     .file   3 "/opt/cuda-5.0/bin/../include/device_functions.h"  .visible .entry _z6kernelpfs_10parameters(     .param .u64 _z6kernelpfs_10parameters_param_0,     .param .u64 _z6kernelpfs_10parameters_param_1,     .param .align 4 .b8 _z6kernelpfs_10parameters_param_2[32] ) {     .reg .pred  %p<2>;     .reg .s32   %r<9>;     .reg .f32   %f<8>;     .reg .s64   %rd<8>;       ld.param.u64    %rd1, [_z6kernelpfs_10parameters_param_0];     ld.param.u64    %rd2, [_z6kernelpfs_10parameters_param_1];     ld.param.f32    %f1, [_z6kernelpfs_10parameters_param_2+12];     ld.param.f32    %f2, [_z6kernelpfs_10parameters_param_2];     ld.param.u32    %r1, [_z6kernelpfs_10parameters_param_2+20];     cvta.to.global.u64  %rd3, %rd2;  ///home/talonmies/parameters.cu:11     unsigned int tid = threadidx.x + blockidx.x * blockdim.x;     .loc 2 11 1     mov.u32     %r2, %ntid.x;     mov.u32     %r3, %ctaid.x;     mov.u32     %r4, %tid.x;     mad.lo.s32  %r5, %r2, %r3, %r4;     cvta.to.global.u64  %rd4, %rd1;  ///home/talonmies/parameters.cu:12     float val_in = in[tid];     .loc 2 12 1     mul.wide.u32    %rd5, %r5, 4;     add.s64     %rd6, %rd4, %rd5;  ///home/talonmies/parameters.cu:14     p.b += 10;     .loc 2 14 1     add.s32     %r6, %r1, 10;  ///home/talonmies/parameters.cu:15     p.w *= 2.0f;     .loc 2 15 1     add.f32     %f3, %f2, %f2;  ///opt/cuda-5.0/bin/../include/device_functions.h:2399   return / b;     .loc 3 2399 3     div.rn.f32  %f4, %f1, 0f40a00000;  ///home/talonmies/parameters.cu:18     out[tid] = (p.b>0) ? (p.w*val_in) : (p.z*val_in);     .loc 2 18 1     setp.gt.s32     %p1, %r6, 0;     selp.f32    %f5, %f3, %f4, %p1;  ///home/talonmies/parameters.cu:12     float val_in = in[tid];     .loc 2 12 1     ld.global.f32   %f6, [%rd6];  ///home/talonmies/parameters.cu:18     out[tid] = (p.b>0) ? (p.w*val_in) : (p.z*val_in);     .loc 2 18 1     mul.f32     %f7, %f5, %f6;     add.s64     %rd7, %rd3, %rd5;     st.global.f32   [%rd7], %f7;  ///home/talonmies/parameters.cu:19 }     .loc 2 19 2     ret; } 

you can see _z6kernelpfs_10parameters_param_2 (which p.w), _z6kernelpfs_10parameters_param_2+12(which p.z), , _z6kernelpfs_10parameters_param_2+20 (which p.b) loaded registers. other members of structure never loaded kernel.


Comments

Popular posts from this blog

Why does Ruby on Rails generate add a blank line to the end of a file? -

keyboard - Smiles and long press feature in Android -

node.js - Bad Request - node js ajax post -