c++ - 3D array writing and reading as texture in CUDA -


due nature of algorithm programming need write/fill 3d matrix specific maths , read matrix (in separate kernel) 3d linearly interpolated texture.

as texture reading mode, assuming can somehow write in global memory bind texture, , in separate read it, without need of double memory , copying values write read matrix. don't seem figure out how this.

  • how can use 3d texture memory read , write (in separate kernels) ?

my problem don't know how define global read/write array. in sample below, have created 3d texture, using code cudaextent , cudaarray. don't seem able use types write on them, neither seem able create them float* or likes.

i may not able , need memcpy somewhere in middle, these arrays big, i'd save memory.

sample code (doesn't compile, defines structure of trying do). uses 100x100x100 3d memory default because yes.

#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <cuda_runtime_api.h> #include <cuda.h>  #define maxtreads 1024  cudaerror_t addwithcuda(int *c, const int *a, const int *b, unsigned int size); texture<float, cudatexturetype3d, cudareadmodeelementtype> tex;  __global__ void readkernel(float* imageend ) {     int indy = blockidx.y * blockdim.y + threadidx.y;     int indx = blockidx.x * blockdim.x + threadidx.x;     int indz = blockidx.z * blockdim.z + threadidx.z;     //make sure dont go out of bounds     size_t idx = indz * 100 * 100 + indy * 100 + indx;     if (indx >= 100 | indy >= 100 | indz >= 100)         return;     imageend[idx] = tex3d(tex, indx + 0.5, indy + 0.5, indz + 0.5);  } __global__ void writekernel(float* imageaux){     int indy = blockidx.y * blockdim.y + threadidx.y;     int indx = blockidx.x * blockdim.x + threadidx.x;     int indz = blockidx.z * blockdim.z + threadidx.z;     //make sure dont go out of bounds     size_t idx = indz * 100 * 100 + indy * 100 + indx;     if (indx >= 100 | indy >= 100 | indz >= 100)         return;     imageaux[idx] = (float)idx;  } int main() {      cudaarray *d_image_aux= 0;     const cudaextent extent = make_cudaextent(100, 100, 100);     cudachannelformatdesc channeldesc = cudacreatechanneldesc<float>();     cudamalloc3darray(&d_image_aux, &channeldesc, extent);      // configure texture options     tex.normalized = false;     tex.filtermode = cudafiltermodelinear;     tex.addressmode[0] = cudaaddressmodeborder;     tex.addressmode[1] = cudaaddressmodeborder;     tex.addressmode[2] = cudaaddressmodeborder;      cudabindtexturetoarray(tex, d_image_aux, channeldesc);      float *d_image_end = 0;     size_t num_bytes = 100 * 100 * 100 * sizeof(float);     cudamalloc((void**)&d_image_end, num_bytes);     cudamemset(d_image_end, 0, num_bytes);      int divx, divy, divz; //irrelevant demo, important main code     divx = 32;     divy = 32;     divz = 1;     dim3 grid((100 + divx - 1) / divx,         (100 + divy - 1) / divy,         (100 + divz - 1) / divz);     dim3 block(divx, divy, divz);      // kernels     writekernel << <grid, block >> >(d_image_aux);     readkernel  << <grid, block >> >(d_image_end);       cudaunbindtexture(tex);     cudafree(d_image_aux);     cudafree(d_image_end);      return 0; } 

note: aware can not write "interpolated" or whatever be. write operation in integer indexes, while read operation needs use trilinear interpolation.

i believe of necessary pieces demonstrate kernel writing 3d surface (bound underlying 3d cudaarray), followed kernel texturing (i.e. auto interpolation) same data (a 3d texture bound same underlying 3d cudaarray) contained in volumefiltering cuda sample code.

the conceptual difference sample code has 2 different underlying 3d cudaarrays (one texture, 1 surface) can combine these, data written surface subsequently read during texturing operation.

here's worked example:

$ cat texsurf.cu #include <stdio.h> #include <helper_cuda.h>  texture<float, cudatexturetype3d, cudareadmodeelementtype>  volumetexin; surface<void,  3>                                    volumetexout;  __global__ void surf_write(float *data,cudaextent volumesize) {     int x = blockidx.x*blockdim.x + threadidx.x;     int y = blockidx.y*blockdim.y + threadidx.y;     int z = blockidx.z*blockdim.z + threadidx.z;      if (x >= volumesize.width || y >= volumesize.height || z >= volumesize.depth)     {         return;     }     float output = data[z*(volumesize.width*volumesize.height)+y*(volumesize.width)+x];     // surface writes need byte offsets x!     surf3dwrite(output,volumetexout,x * sizeof(float),y,z);  }  __global__ void tex_read(float x, float y, float z){     printf("x: %f, y: %f, z:%f, val: %f\n", x,y,z,tex3d(volumetexin,x,y,z)); }  void runtest(float *data, cudaextent vol, float x, float y, float z) {     // create 3d array     cudachannelformatdesc channeldesc = cudacreatechanneldesc<float>();     cudaarray_t content;     checkcudaerrors(cudamalloc3darray(&content, &channeldesc, vol, cudaarraysurfaceloadstore));      // copy data device     float *d_data;     checkcudaerrors(cudamalloc(&d_data, vol.width*vol.height*vol.depth*sizeof(float)));     checkcudaerrors(cudamemcpy(d_data, data, vol.width*vol.height*vol.depth*sizeof(float), cudamemcpyhosttodevice));      dim3 blocksize(8,8,8);     dim3 gridsize((vol.width+7)/8,(vol.height+7)/8,(vol.depth+7)/8);     volumetexin.filtermode     = cudafiltermodelinear;     checkcudaerrors(cudabindsurfacetoarray(volumetexout,content));     surf_write<<<gridsize, blocksize>>>(d_data, vol);     // bind array 3d texture     checkcudaerrors(cudabindtexturetoarray(volumetexin, content));     tex_read<<<1,1>>>(x, y, z);     checkcudaerrors(cudadevicesynchronize());     cudafreearray(content);     cudafree(d_data);     return; }  int main(){    const int dim = 8;    float *data = (float *)malloc(dim*dim*dim*sizeof(float));    (int z = 0; z < dim; z++)      (int y = 0; y < dim; y++)        (int x = 0; x < dim; x++)          data[z*dim*dim+y*dim+x] = z*100+y*10+x;    cudaextent vol = {dim,dim,dim};    runtest(data, vol, 1.5, 1.5, 1.5);    runtest(data, vol, 1.6, 1.6, 1.6);    return 0; }   $ nvcc -i/usr/local/cuda/samples/common/inc texsurf.cu -o texsurf $ cuda-memcheck ./texsurf ========= cuda-memcheck x: 1.500000, y: 1.500000, z:1.500000, val: 111.000000 x: 1.600000, y: 1.600000, z:1.600000, val: 122.234375 ========= error summary: 0 errors $ 

i'm not going try give full tutorial on linear texture filtering here. there plenty of other example questions here cover details of indexing , filtering, , doesn't seem crux of question. i've chosen points (1.5, 1.5, 1.5) , (1.6, 1.6, 1.6) easy verification of underlying data; results make sense me.


Comments