I'm working on a project with a CUDA pipeline that is refreshed with 200-10000 new events every 1 ms. Each time, I want to call one (or two) kernels which compute a small list of outputs; these outputs are then fed to the next element of the pipeline.
The theoretical flow is (a minimal sketch in code follows the list):
- receive data in an std::vector
- cudaMemcpy the vector to the GPU
- processing
- generate a small list of outputs
- cudaMemcpy the outputs to an std::vector
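As an illustration, here is a minimal sketch of what one iteration of this flow could look like (the event type, buffer sizes and kernel are placeholders, not the actual pipeline code):

// Hypothetical sketch of one pipeline iteration (names, sizes and the kernel
// body are placeholders, not the actual processing code).
#include <vector>
#include <cuda_runtime.h>

struct event { unsigned short x, y; short a; long long b; };

__global__ void process_events(const event* in, int n, float* out) {
    // real processing would go here; this is only a placeholder
}

void run_iteration(const std::vector<event>& events,
                   event* d_in, float* d_out,
                   std::vector<float>& outputs) {
    // 1. copy the freshly received events to the GPU
    cudaMemcpy(d_in, events.data(), events.size() * sizeof(event),
               cudaMemcpyHostToDevice);
    // 2. run the processing kernel
    int n = static_cast<int>(events.size());
    process_events<<<(n + 255) / 256, 256>>>(d_in, n, d_out);
    // 3. copy the small list of outputs back to the host
    //    (this blocking cudaMemcpy also synchronizes with the kernel)
    cudaMemcpy(outputs.data(), d_out, outputs.size() * sizeof(float),
               cudaMemcpyDeviceToHost);
}

int main() {
    const int max_events = 10000, max_outputs = 16;
    event* d_in;  float* d_out;
    cudaMalloc(&d_in, max_events * sizeof(event));
    cudaMalloc(&d_out, max_outputs * sizeof(float));

    std::vector<event> events(200);          // one batch of incoming events
    std::vector<float> outputs(max_outputs); // small list of outputs
    run_iteration(events, d_in, d_out, outputs);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}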
But when I call cudaDeviceSynchronize on a 1-block/1-thread empty kernel with no processing, it takes on average between 0.7 and 1.4 ms, which is higher than my 1 ms timeframe.
I could change the timeframe of the pipeline so that it receives events every 5 ms (with 5x more events each time), but that wouldn't be ideal.
What would be the best way to minimize the overhead of cudaDeviceSynchronize? Would streams be helpful in this situation? Or is there another solution to run the pipeline efficiently?
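As an illustration of the stream idea, here is a minimal (untested) sketch that synchronizes on a single stream instead of the whole device:

// Hypothetical sketch: wait only on the work queued on one stream rather
// than blocking on every outstanding operation. Untested on the TK1;
// the kernel and launch configuration are placeholders.
#include <cuda_runtime.h>

__global__ void empty_kernel() {}

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    empty_kernel<<<1, 1, 0, stream>>>();

    // synchronize with this stream only, not the whole device
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    return 0;
}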
(Jetson TK1, compute capability 3.2)
Here is the nvprof log of the application:
==8285== NVPROF is profiling process 8285, command: python player.py test.rec
==8285== Profiling application: python player.py test.rec
==8285== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 94.92%  47.697ms      5005  9.5290us  1.7500us  13.083us  reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, event*)
  5.08%  2.5538ms         8  319.23us  99.750us  413.42us  [CUDA memset]

==8285== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 75.00%  5.03966s      5005  1.0069ms  25.083us  11.143ms  cudaDeviceSynchronize
 17.44%  1.17181s      5005  234.13us  83.750us  3.1391ms  cudaLaunch
  4.71%  316.62ms         9  35.180ms  23.083us  314.99ms  cudaMalloc
  2.30%  154.31ms     50050  3.0830us  1.0000us  2.6866ms  cudaSetupArgument
  0.52%  34.857ms      5005  6.9640us  2.5000us  464.67us  cudaConfigureCall
  0.02%  1.2048ms         8  150.60us  71.917us  183.33us  cudaMemset
  0.01%  643.25us        83  7.7490us  1.3330us  287.42us  cuDeviceGetAttribute
  0.00%  12.916us         2  6.4580us  2.0000us  10.916us  cuDeviceGetCount
  0.00%  5.3330us         1  5.3330us  5.3330us  5.3330us  cuDeviceTotalMem
  0.00%  4.0830us         1  4.0830us  4.0830us  4.0830us  cuDeviceGetName
  0.00%  3.4160us         2  1.7080us  1.5830us  1.8330us  cuDeviceGet

Here is a small reconstitution of the program (nvprof log at the end); for some reason the average cudaDeviceSynchronize there is 4 times lower, but it is still far too high for an empty 1-thread kernel:
/* Compile with `nvcc test.cu -I.`
 * with -I pointing to "helper_cuda.h" and "helper_string.h" from the CUDA samples
 **/
#include <iostream>
#include <cuda.h>
#include <helper_cuda.h>

#define MAX_INPUT_BUFFER_SIZE 131072

typedef struct {
    unsigned short x;
    unsigned short y;
    short a;
    long long b;
} event;

long long *d_a_[2], *d_b_[2];
float *d_as_, *d_bs_;
bool *d_some_bool_[2];
event *d_data_;
int width_ = 320;
int height_ = 240;

__global__ void reset_timesurface(long long ts,
        long long *d_a_0, long long *d_a_1,
        long long *d_b_0, long long *d_b_1,
        float *d_as, float *d_bs,
        bool *d_some_bool_0, bool *d_some_bool_1, event *d_data) {
    // nothing here
}

void reset_errors(long long ts) {
    static const int n = 1024;
    static const dim3 grid_size(width_ * height_ / n + (width_ * height_ % n != 0), 1, 1);
    static const dim3 block_dim(n, 1, 1);

    reset_timesurface<<<1, 1>>>(ts, d_a_[0], d_a_[1],
            d_b_[0], d_b_[1],
            d_as_, d_bs_,
            d_some_bool_[0], d_some_bool_[1], d_data_);
    cudaDeviceSynchronize();
    // static long long *h_holder = (long long*)malloc(sizeof(long long) * 2000);
    // cudaMemcpy(h_holder, d_a_[0], 0, cudaMemcpyDeviceToHost);
}

int main(void) {
    checkCudaErrors(cudaMalloc(&(d_a_[0]), sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_a_[0], 0, sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_a_[1]), sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_a_[1], 0, sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_b_[0]), sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_b_[0], 0, sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_b_[1]), sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_b_[1], 0, sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&d_as_, sizeof(float)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_as_, 0, sizeof(float)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&d_bs_, sizeof(float)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_bs_, 0, sizeof(float)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_some_bool_[0]), sizeof(bool)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_some_bool_[0], 0, sizeof(bool)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_some_bool_[1]), sizeof(bool)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_some_bool_[1], 0, sizeof(bool)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&d_data_, sizeof(event)*MAX_INPUT_BUFFER_SIZE));

    for (int i = 0; i < 5005; ++i)
        reset_errors(16487L);

    cudaFree(d_a_[0]);
    cudaFree(d_a_[1]);
    cudaFree(d_b_[0]);
    cudaFree(d_b_[1]);
    cudaFree(d_as_);
    cudaFree(d_bs_);
    cudaFree(d_some_bool_[0]);
    cudaFree(d_some_bool_[1]);
    cudaFree(d_data_);
    cudaDeviceReset();
}

/* nvprof ./a.out
==9258== NVPROF is profiling process 9258, command: ./a.out
==9258== Profiling application: ./a.out
==9258== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 92.64%  48.161ms      5005  9.6220us  6.4160us  13.250us  reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, event*)
  7.36%  3.8239ms         8  477.99us  148.92us  620.17us  [CUDA memset]

==9258== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 53.12%  1.22036s      5005  243.83us  9.6670us  8.5762ms  cudaDeviceSynchronize
 25.10%  576.78ms      5005  115.24us  44.250us  11.888ms  cudaLaunch
  9.13%  209.77ms         9  23.308ms  16.667us  208.54ms  cudaMalloc
  6.56%  150.65ms         1  150.65ms  150.65ms  150.65ms  cudaDeviceReset
  5.33%  122.39ms     50050  2.4450us     833ns  6.1167ms  cudaSetupArgument
  0.60%  13.808ms      5005  2.7580us  1.0830us  104.25us  cudaConfigureCall
  0.10%  2.3845ms         9  264.94us  22.333us  537.75us  cudaFree
  0.04%  938.75us         8  117.34us  58.917us  169.08us  cudaMemset
  0.02%  461.33us        83  5.5580us  1.4160us  197.58us  cuDeviceGetAttribute
  0.00%  15.500us         2  7.7500us  3.6670us  11.833us  cuDeviceGetCount
  0.00%  7.6670us         1  7.6670us  7.6670us  7.6670us  cuDeviceTotalMem
  0.00%  4.8340us         1  4.8340us  4.8340us  4.8340us  cuDeviceGetName
  0.00%  3.6670us         2  1.8330us  1.6670us  2.0000us  cuDeviceGet
*/
As detailed in the comments of the original message, the problem was entirely related to the GPU I'm using (Tegra K1). Here is the answer I found for this particular problem; it might be useful for other GPUs as well. The average cudaDeviceSynchronize time on the Jetson TK1 went from about 250 us down to 10 us.
The GPU clock rate of the Tegra is 72000 kHz by default; we have to set it to 852000 kHz using the following commands:
$ echo 852000000 > /sys/kernel/debug/clock/override.gbus/rate
$ echo 1 > /sys/kernel/debug/clock/override.gbus/state

The list of available frequencies can be found with this command:
$ cat /sys/kernel/debug/clock/gbus/possible_rates
72000 108000 180000 252000 324000 396000 468000 540000 612000 648000 684000 708000 756000 804000 852000 (kHz)

More performance can be obtained (again, in exchange for a higher power draw) on both the CPU and GPU; check this link for more information.
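If you want to verify the improvement on your own board, a minimal timing sketch such as the following can be used (host-side std::chrono timing around an empty kernel launch; the exact numbers will of course depend on the board and the clock settings):

// Minimal sketch for measuring the average launch + cudaDeviceSynchronize
// latency of an empty kernel (results depend on the GPU clock settings).
#include <chrono>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void empty_kernel() {}

int main() {
    const int iterations = 5000;

    // warm-up launch so context creation is not included in the measurement
    empty_kernel<<<1, 1>>>();
    cudaDeviceSynchronize();

    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < iterations; ++i) {
        empty_kernel<<<1, 1>>>();
        cudaDeviceSynchronize();
    }
    auto end = std::chrono::steady_clock::now();

    double avg_us =
        std::chrono::duration<double, std::micro>(end - start).count() / iterations;
    printf("average launch + cudaDeviceSynchronize: %.2f us\n", avg_us);
    return 0;
}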