I'm working on a project with a CUDA pipeline that receives 200-10000 new events every 1 ms. Each time, I want to call one (or two) kernels that compute a small list of outputs; those outputs are then fed to the next element of the pipeline.
The theoretical flow is (a minimal sketch of one iteration follows the list):

- receive the data into an std::vector
- cudaMemcpy the vector to the GPU
- processing
- generate a small list of outputs
- cudaMemcpy the outputs back to an std::vector
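To illustrate, here is a minimal sketch of what one iteration of that flow looks like in code; the event layout, the kernel body, and the buffer sizes are placeholders, not my actual processing:

// Minimal sketch of one pipeline iteration; the kernel body and sizes are placeholders.
#include <vector>
#include <cuda_runtime.h>

struct event { unsigned short x, y; short a; long long b; };

__global__ void process_events(const event *in, int n, float *out) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = static_cast<float>(in[i].b);   // placeholder computation
}

int main() {
    const int max_events = 10000;
    event *d_in;  float *d_out;
    cudaMalloc(&d_in,  max_events * sizeof(event));
    cudaMalloc(&d_out, max_events * sizeof(float));

    std::vector<event> h_in(200);            // events received in this 1 ms slot
    std::vector<float> h_out(h_in.size());

    // host -> device copy of the new events
    cudaMemcpy(d_in, h_in.data(), h_in.size() * sizeof(event), cudaMemcpyHostToDevice);
    // processing
    process_events<<<(int(h_in.size()) + 255) / 256, 256>>>(d_in, int(h_in.size()), d_out);
    // device -> host copy of the small list of outputs (this copy synchronizes implicitly)
    cudaMemcpy(h_out.data(), d_out, h_out.size() * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}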
But when I call cudaDeviceSynchronize on a 1-block/1-thread empty kernel (no processing at all), it takes on average 0.7 to 1.4 ms, which is higher than my 1 ms timeframe.
I could change the timeframe of the pipeline so that it receives events every 5 ms (with 5x more events each time), but that wouldn't be ideal.
What is the best way to minimize the overhead of cudaDeviceSynchronize? Would streams be helpful in this situation (a sketch of what I mean is below)? Or is there another solution to run this pipeline efficiently?
(Jetson TK1, compute capability 3.2)
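To make the streams question concrete, this is roughly the kind of change I have in mind: launching into a dedicated stream and waiting only on that stream, or polling an event so the CPU can keep working. The empty kernel and loop count below are placeholders, and I don't know whether this actually reduces the overhead on the TK1:

// Sketch of the stream-based alternative (not verified to help on the TK1):
// wait only on the stream that holds the work, or poll an event, instead of
// synchronizing the whole device.
#include <cuda_runtime.h>

__global__ void empty_kernel() {}

int main() {
    cudaStream_t stream;
    cudaEvent_t done;
    cudaStreamCreate(&stream);
    cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

    for (int i = 0; i < 1000; ++i) {
        empty_kernel<<<1, 1, 0, stream>>>();
        cudaEventRecord(done, stream);

        // Option 1: block only on this stream's work.
        // cudaStreamSynchronize(stream);

        // Option 2: poll so the CPU can do other pipeline work while waiting.
        while (cudaEventQuery(done) == cudaErrorNotReady) {
            /* do other host-side work here */
        }
    }

    cudaEventDestroy(done);
    cudaStreamDestroy(stream);
    return 0;
}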
Here's an nvprof log of the application:
==8285== NVPROF is profiling process 8285, command: python player.py test.rec
==8285== Profiling application: python player.py test.rec
==8285== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 94.92%  47.697ms      5005  9.5290us  1.7500us  13.083us  reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, event*)
  5.08%  2.5538ms         8  319.23us  99.750us  413.42us  [CUDA memset]

==8285== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 75.00%  5.03966s      5005  1.0069ms  25.083us  11.143ms  cudaDeviceSynchronize
 17.44%  1.17181s      5005  234.13us  83.750us  3.1391ms  cudaLaunch
  4.71%  316.62ms         9  35.180ms  23.083us  314.99ms  cudaMalloc
  2.30%  154.31ms     50050  3.0830us  1.0000us  2.6866ms  cudaSetupArgument
  0.52%  34.857ms      5005  6.9640us  2.5000us  464.67us  cudaConfigureCall
  0.02%  1.2048ms         8  150.60us  71.917us  183.33us  cudaMemset
  0.01%  643.25us        83  7.7490us  1.3330us  287.42us  cuDeviceGetAttribute
  0.00%  12.916us         2  6.4580us  2.0000us  10.916us  cuDeviceGetCount
  0.00%  5.3330us         1  5.3330us  5.3330us  5.3330us  cuDeviceTotalMem
  0.00%  4.0830us         1  4.0830us  4.0830us  4.0830us  cuDeviceGetName
  0.00%  3.4160us         2  1.7080us  1.5830us  1.8330us  cuDeviceGet
Here is a small reconstitution of the program (nvprof log at the end). For some reason, the average of cudaDeviceSynchronize is 4 times lower here, but it is still far too high for an empty 1-thread kernel:
/* Compile with `nvcc test.cu -I.`
 * with -I pointing to "helper_cuda.h" and "helper_string.h" from the CUDA samples
 **/
#include <iostream>
#include <cuda.h>
#include <helper_cuda.h>

#define MAX_INPUT_BUFFER_SIZE 131072

typedef struct {
  unsigned short x;
  unsigned short y;
  short a;
  long long b;
} event;

long long *d_a_[2], *d_b_[2];
float *d_as_, *d_bs_;
bool *d_some_bool_[2];
event *d_data_;
int width_ = 320;
int height_ = 240;

__global__ void reset_timesurface(long long ts,
        long long *d_a_0, long long *d_a_1,
        long long *d_b_0, long long *d_b_1,
        float *d_as, float *d_bs,
        bool *d_some_bool_0, bool *d_some_bool_1, event *d_data) {
  // nothing here
}

void reset_errors(long long ts) {
  static const int n = 1024;
  static const dim3 grid_size(width_ * height_ / n + (width_ * height_ % n != 0), 1, 1);
  static const dim3 block_dim(n, 1, 1);

  reset_timesurface<<<1, 1>>>(ts, d_a_[0], d_a_[1],
      d_b_[0], d_b_[1],
      d_as_, d_bs_,
      d_some_bool_[0], d_some_bool_[1], d_data_);
  cudaDeviceSynchronize();
  // static long long *h_holder = (long long*)malloc(sizeof(long long) * 2000);
  // cudaMemcpy(h_holder, d_a_[0], 0, cudaMemcpyDeviceToHost);
}

int main(void) {
  checkCudaErrors(cudaMalloc(&(d_a_[0]), sizeof(long long)*width_*height_*2));
  checkCudaErrors(cudaMemset(d_a_[0], 0, sizeof(long long)*width_*height_*2));
  checkCudaErrors(cudaMalloc(&(d_a_[1]), sizeof(long long)*width_*height_*2));
  checkCudaErrors(cudaMemset(d_a_[1], 0, sizeof(long long)*width_*height_*2));
  checkCudaErrors(cudaMalloc(&(d_b_[0]), sizeof(long long)*width_*height_*2));
  checkCudaErrors(cudaMemset(d_b_[0], 0, sizeof(long long)*width_*height_*2));
  checkCudaErrors(cudaMalloc(&(d_b_[1]), sizeof(long long)*width_*height_*2));
  checkCudaErrors(cudaMemset(d_b_[1], 0, sizeof(long long)*width_*height_*2));
  checkCudaErrors(cudaMalloc(&d_as_, sizeof(float)*width_*height_*2));
  checkCudaErrors(cudaMemset(d_as_, 0, sizeof(float)*width_*height_*2));
  checkCudaErrors(cudaMalloc(&d_bs_, sizeof(float)*width_*height_*2));
  checkCudaErrors(cudaMemset(d_bs_, 0, sizeof(float)*width_*height_*2));
  checkCudaErrors(cudaMalloc(&(d_some_bool_[0]), sizeof(bool)*width_*height_*2));
  checkCudaErrors(cudaMemset(d_some_bool_[0], 0, sizeof(bool)*width_*height_*2));
  checkCudaErrors(cudaMalloc(&(d_some_bool_[1]), sizeof(bool)*width_*height_*2));
  checkCudaErrors(cudaMemset(d_some_bool_[1], 0, sizeof(bool)*width_*height_*2));
  checkCudaErrors(cudaMalloc(&d_data_, sizeof(event)*MAX_INPUT_BUFFER_SIZE));

  for (int i = 0; i < 5005; ++i)
    reset_errors(16487L);

  cudaFree(d_a_[0]);
  cudaFree(d_a_[1]);
  cudaFree(d_b_[0]);
  cudaFree(d_b_[1]);
  cudaFree(d_as_);
  cudaFree(d_bs_);
  cudaFree(d_some_bool_[0]);
  cudaFree(d_some_bool_[1]);
  cudaFree(d_data_);
  cudaDeviceReset();
}

/* nvprof ./a.out
==9258== NVPROF is profiling process 9258, command: ./a.out
==9258== Profiling application: ./a.out
==9258== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 92.64%  48.161ms      5005  9.6220us  6.4160us  13.250us  reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, event*)
  7.36%  3.8239ms         8  477.99us  148.92us  620.17us  [CUDA memset]

==9258== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 53.12%  1.22036s      5005  243.83us  9.6670us  8.5762ms  cudaDeviceSynchronize
 25.10%  576.78ms      5005  115.24us  44.250us  11.888ms  cudaLaunch
  9.13%  209.77ms         9  23.308ms  16.667us  208.54ms  cudaMalloc
  6.56%  150.65ms         1  150.65ms  150.65ms  150.65ms  cudaDeviceReset
  5.33%  122.39ms     50050  2.4450us     833ns  6.1167ms  cudaSetupArgument
  0.60%  13.808ms      5005  2.7580us  1.0830us  104.25us  cudaConfigureCall
  0.10%  2.3845ms         9  264.94us  22.333us  537.75us  cudaFree
  0.04%  938.75us         8  117.34us  58.917us  169.08us  cudaMemset
  0.02%  461.33us        83  5.5580us  1.4160us  197.58us  cuDeviceGetAttribute
  0.00%  15.500us         2  7.7500us  3.6670us  11.833us  cuDeviceGetCount
  0.00%  7.6670us         1  7.6670us  7.6670us  7.6670us  cuDeviceTotalMem
  0.00%  4.8340us         1  4.8340us  4.8340us  4.8340us  cuDeviceGetName
  0.00%  3.6670us         2  1.8330us  1.6670us  2.0000us  cuDeviceGet
*/
As detailed in the comments of the original message, my problem was entirely related to the GPU I'm using (Tegra K1). Here's the answer I found for this particular problem; it might be useful for other GPUs as well. The average time of cudaDeviceSynchronize on the Jetson TK1 went from 250 us down to 10 us.
The clock rate of the Tegra was 72000 kHz by default; we'll have to set it to 852000 kHz using these commands:
$ echo 852000000 > /sys/kernel/debug/clock/override.gbus/rate
$ echo 1 > /sys/kernel/debug/clock/override.gbus/state
We can find the list of available frequencies using this command:
$ cat /sys/kernel/debug/clock/gbus/possible_rates
72000 108000 180000 252000 324000 396000 468000 540000 612000 648000 684000 708000 756000 804000 852000 (kHz)
More performance can be obtained (again, in exchange for a higher power draw) on both the CPU and GPU; check this link for more information.
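A simple way to check the improvement is a timing loop along these lines (illustrative only; it measures the average launch + cudaDeviceSynchronize latency for an empty kernel, which is the number that dropped from roughly 250 us to roughly 10 us after raising the clock):

// Quick check of the average cudaDeviceSynchronize latency around an empty kernel.
// Run it before and after changing the gbus rate; the iteration count is arbitrary.
#include <chrono>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void empty_kernel() {}

int main() {
    const int iterations = 5005;
    // warm-up launch so context creation is not included in the measurement
    empty_kernel<<<1, 1>>>();
    cudaDeviceSynchronize();

    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < iterations; ++i) {
        empty_kernel<<<1, 1>>>();
        cudaDeviceSynchronize();
    }
    auto stop = std::chrono::steady_clock::now();

    double total_us = std::chrono::duration<double, std::micro>(stop - start).count();
    std::printf("average launch+sync latency: %.2f us\n", total_us / iterations);
    return 0;
}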