Minimize cudaDeviceSynchronize launch overhead


I'm working on a CUDA pipeline project that receives 200 to 10000 new events every 1 ms. Each time, I want to call one (or two) kernels that compute a small list of outputs; those outputs are then fed to the next element of the pipeline.

The theoretical flow is:

  1. Receive data in an std::vector
  2. cudaMemcpy the vector to the GPU
  3. Processing
  4. Generate a small list of outputs
  5. cudaMemcpy the output back to an std::vector
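
For illustration, one iteration of that flow looks roughly like the sketch below; process_events and Output are placeholder names for this sketch, not the real kernel and types from my project:

    // Sketch of one 1 ms pipeline iteration; process_events and Output
    // are hypothetical placeholders, not code from the actual project.
    #include <vector>
    #include <cuda_runtime.h>

    struct Event  { unsigned short x, y; short a; long long b; };
    struct Output { float value; };

    __global__ void process_events(const Event *in, int n, Output *out) {
        // ... compute the small list of outputs ...
    }

    void pipeline_step(const std::vector<Event> &events,
                       Event *d_in, Output *d_out,
                       std::vector<Output> &outputs) {
        // 2. copy the freshly received events to the GPU
        cudaMemcpy(d_in, events.data(), events.size() * sizeof(Event),
                   cudaMemcpyHostToDevice);
        // 3./4. processing: one kernel producing the small output list
        process_events<<<((int)events.size() + 255) / 256, 256>>>(
                d_in, (int)events.size(), d_out);
        // 5. copy the outputs back for the next element of the pipeline;
        // this synchronous memcpy already waits for the kernel to finish
        cudaMemcpy(outputs.data(), d_out, outputs.size() * sizeof(Output),
                   cudaMemcpyDeviceToHost);
    }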

But when I call cudaDeviceSynchronize on a 1-block/1-thread empty kernel with no processing, the call alone takes on average 0.7 to 1.4 ms, which is already higher than my 1 ms timeframe.

I could change the timeframe of the pipeline to receive events every 5 ms, with 5x more events each time, but that wouldn't be ideal.

What is the best way to minimize the overhead of cudaDeviceSynchronize? Would streams be helpful in this situation? Or is there another solution to run the pipeline efficiently?
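
(By streams I mean replacing the device-wide sync with a per-stream one, along these lines; this is just a sketch, and my_kernel is a placeholder:)

    // Sketch: per-stream synchronization instead of a device-wide one.
    // my_kernel is a placeholder name, not a kernel from my project.
    #include <cuda_runtime.h>

    __global__ void my_kernel() { /* ... */ }

    void run_on_stream() {
        cudaStream_t stream;
        cudaStreamCreate(&stream);
        my_kernel<<<1, 1, 0, stream>>>();
        cudaStreamSynchronize(stream);  // waits only for work queued in `stream`
        cudaStreamDestroy(stream);
    }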

(Jetson TK1, compute capability 3.2)

Here's an nvprof log of the application:

==8285== NVPROF is profiling process 8285, command: python player.py test.rec
==8285== Profiling application: python player.py test.rec
==8285== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 94.92%  47.697ms      5005  9.5290us  1.7500us  13.083us  reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, Event*)
  5.08%  2.5538ms         8  319.23us  99.750us  413.42us  [CUDA memset]

==8285== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 75.00%  5.03966s      5005  1.0069ms  25.083us  11.143ms  cudaDeviceSynchronize
 17.44%  1.17181s      5005  234.13us  83.750us  3.1391ms  cudaLaunch
  4.71%  316.62ms         9  35.180ms  23.083us  314.99ms  cudaMalloc
  2.30%  154.31ms     50050  3.0830us  1.0000us  2.6866ms  cudaSetupArgument
  0.52%  34.857ms      5005  6.9640us  2.5000us  464.67us  cudaConfigureCall
  0.02%  1.2048ms         8  150.60us  71.917us  183.33us  cudaMemset
  0.01%  643.25us        83  7.7490us  1.3330us  287.42us  cuDeviceGetAttribute
  0.00%  12.916us         2  6.4580us  2.0000us  10.916us  cuDeviceGetCount
  0.00%  5.3330us         1  5.3330us  5.3330us  5.3330us  cuDeviceTotalMem
  0.00%  4.0830us         1  4.0830us  4.0830us  4.0830us  cuDeviceGetName
  0.00%  3.4160us         2  1.7080us  1.5830us  1.8330us  cuDeviceGet

A small reconstitution of the program (nvprof log at the end) - for some reason, the average of cudaDeviceSynchronize is 4 times lower here, but it's still too high for an empty 1-thread kernel:

/* Compile with `nvcc test.cu -I.`
 * with -I pointing to "helper_cuda.h" and "helper_string.h" from the CUDA samples
 **/
#include <iostream>
#include <cuda.h>
#include <helper_cuda.h>

#define MAX_INPUT_BUFFER_SIZE 131072

typedef struct {
    unsigned short x;
    unsigned short y;
    short a;
    long long b;
} Event;

long long *d_a_[2], *d_b_[2];
float *d_as_, *d_bs_;
bool *d_some_bool_[2];
Event *d_data_;
int width_ = 320;
int height_ = 240;

__global__ void reset_timesurface(long long ts,
        long long *d_a_0, long long *d_a_1,
        long long *d_b_0, long long *d_b_1,
        float *d_as, float *d_bs,
        bool *d_some_bool_0, bool *d_some_bool_1, Event *d_data) {
    // nothing here
}

void reset_errors(long long ts) {
    static const int n = 1024;
    static const dim3 grid_size(width_ * height_ / n
            + (width_ * height_ % n != 0), 1, 1);
    static const dim3 block_dim(n, 1, 1);

    reset_timesurface<<<1, 1>>>(ts, d_a_[0], d_a_[1],
            d_b_[0], d_b_[1],
            d_as_, d_bs_,
            d_some_bool_[0], d_some_bool_[1], d_data_);
    cudaDeviceSynchronize();
    //  static long long *h_holder = (long long*)malloc(sizeof(long long) * 2000);
    //  cudaMemcpy(h_holder, d_a_[0], 0, cudaMemcpyDeviceToHost);
}

int main(void) {
    checkCudaErrors(cudaMalloc(&(d_a_[0]), sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_a_[0], 0, sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_a_[1]), sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_a_[1], 0, sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_b_[0]), sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_b_[0], 0, sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_b_[1]), sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_b_[1], 0, sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&d_as_, sizeof(float)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_as_, 0, sizeof(float)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&d_bs_, sizeof(float)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_bs_, 0, sizeof(float)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_some_bool_[0]), sizeof(bool)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_some_bool_[0], 0, sizeof(bool)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_some_bool_[1]), sizeof(bool)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_some_bool_[1], 0, sizeof(bool)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&d_data_, sizeof(Event)*MAX_INPUT_BUFFER_SIZE));

    for (int i = 0; i < 5005; ++i)
        reset_errors(16487L);

    cudaFree(d_a_[0]);
    cudaFree(d_a_[1]);
    cudaFree(d_b_[0]);
    cudaFree(d_b_[1]);
    cudaFree(d_as_);
    cudaFree(d_bs_);
    cudaFree(d_some_bool_[0]);
    cudaFree(d_some_bool_[1]);
    cudaFree(d_data_);
    cudaDeviceReset();
}

/* nvprof ./a.out
==9258== NVPROF is profiling process 9258, command: ./a.out
==9258== Profiling application: ./a.out
==9258== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 92.64%  48.161ms      5005  9.6220us  6.4160us  13.250us  reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, Event*)
  7.36%  3.8239ms         8  477.99us  148.92us  620.17us  [CUDA memset]

==9258== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 53.12%  1.22036s      5005  243.83us  9.6670us  8.5762ms  cudaDeviceSynchronize
 25.10%  576.78ms      5005  115.24us  44.250us  11.888ms  cudaLaunch
  9.13%  209.77ms         9  23.308ms  16.667us  208.54ms  cudaMalloc
  6.56%  150.65ms         1  150.65ms  150.65ms  150.65ms  cudaDeviceReset
  5.33%  122.39ms     50050  2.4450us     833ns  6.1167ms  cudaSetupArgument
  0.60%  13.808ms      5005  2.7580us  1.0830us  104.25us  cudaConfigureCall
  0.10%  2.3845ms         9  264.94us  22.333us  537.75us  cudaFree
  0.04%  938.75us         8  117.34us  58.917us  169.08us  cudaMemset
  0.02%  461.33us        83  5.5580us  1.4160us  197.58us  cuDeviceGetAttribute
  0.00%  15.500us         2  7.7500us  3.6670us  11.833us  cuDeviceGetCount
  0.00%  7.6670us         1  7.6670us  7.6670us  7.6670us  cuDeviceTotalMem
  0.00%  4.8340us         1  4.8340us  4.8340us  4.8340us  cuDeviceGetName
  0.00%  3.6670us         2  1.8330us  1.6670us  2.0000us  cuDeviceGet
*/

As detailed in the comments of the original message, my problem was entirely related to the GPU I'm using (Tegra K1). Here's the answer I found for this particular problem; it might be useful for other GPUs as well. The average cudaDeviceSynchronize on the Jetson TK1 went from 250 us down to 10 us.
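
For reference, that average can be measured with a simple host-side timing loop around an empty kernel, along these lines (a sketch, not the exact code from my project):

    // Sketch: measure the average cost of launching an empty kernel
    // and waiting for it with cudaDeviceSynchronize.
    #include <chrono>
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void empty_kernel() {}

    int main() {
        const int iters = 5000;
        empty_kernel<<<1, 1>>>();      // warm-up launch
        cudaDeviceSynchronize();

        auto start = std::chrono::steady_clock::now();
        for (int i = 0; i < iters; ++i) {
            empty_kernel<<<1, 1>>>();
            cudaDeviceSynchronize();   // the call being measured
        }
        auto stop = std::chrono::steady_clock::now();

        double us = std::chrono::duration<double, std::micro>(stop - start).count();
        std::printf("average launch+sync: %.2f us\n", us / iters);
    }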

The GPU clock (gbus) rate of the Tegra is 72000 kHz by default; we'll have to set it to 852000 kHz using these commands:

$ echo 852000000 > /sys/kernel/debug/clock/override.gbus/rate
$ echo 1 > /sys/kernel/debug/clock/override.gbus/state

The list of available frequencies can be found using this command:

$ cat /sys/kernel/debug/clock/gbus/possible_rates
72000 108000 180000 252000 324000 396000 468000 540000 612000 648000 684000 708000 756000 804000 852000 (kHz)

More performance can be obtained (again, in exchange for a higher power draw) on both the CPU and GPU; check this link for more information.

