Lines Matching refs:nthreads
261 template <int nthreads, // Number of threads which process one block historgam
274 __shared__ float sh_squares[nthreads * nblocks]; in normalize_hists_kernel_many_blocks()
275 float* squares = sh_squares + threadIdx.z * nthreads; in normalize_hists_kernel_many_blocks()
281 float sum = reduce_smem<nthreads>(squares, elem * elem); in normalize_hists_kernel_many_blocks()
286 sum = reduce_smem<nthreads>(squares, elem * elem); in normalize_hists_kernel_many_blocks()
301 int nthreads = power_2up(block_hist_size); in normalize_hists() local
302 dim3 threads(nthreads, 1, nblocks); in normalize_hists()
308 if (nthreads == 32) in normalize_hists()
310 else if (nthreads == 64) in normalize_hists()
312 else if (nthreads == 128) in normalize_hists()
314 else if (nthreads == 256) in normalize_hists()
316 else if (nthreads == 512) in normalize_hists()
332 template <int nthreads, // Number of threads per one histogram block
348 for (int i = threadIdx.x; i < cdescr_size; i += nthreads) in compute_confidence_hists_kernel_many_blocks()
355 __shared__ float products[nthreads * nblocks]; in compute_confidence_hists_kernel_many_blocks()
357 const int tid = threadIdx.z * nthreads + threadIdx.x; in compute_confidence_hists_kernel_many_blocks()
359 reduce<nthreads>(products, product, tid, plus<float>()); in compute_confidence_hists_kernel_many_blocks()
370 const int nthreads = 256; in compute_confidence_hists() local
378 dim3 threads(nthreads, 1, nblocks); in compute_confidence_hists()
381 …cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>, in compute_confidence_hists()
386 compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>( in compute_confidence_hists()
394 template <int nthreads, // Number of threads per one histogram block
410 for (int i = threadIdx.x; i < cdescr_size; i += nthreads) in classify_hists_kernel_many_blocks()
417 __shared__ float products[nthreads * nblocks]; in classify_hists_kernel_many_blocks()
419 const int tid = threadIdx.z * nthreads + threadIdx.x; in classify_hists_kernel_many_blocks()
421 reduce<nthreads>(products, product, tid, plus<float>()); in classify_hists_kernel_many_blocks()
432 const int nthreads = 256; in classify_hists() local
440 dim3 threads(nthreads, 1, nblocks); in classify_hists()
443 …cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>, cudaFunc… in classify_hists()
446 classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>( in classify_hists()
458 template <int nthreads>
470 for (int i = threadIdx.x; i < cdescr_size; i += nthreads) in extract_descrs_by_rows_kernel()
482 const int nthreads = 256; in extract_descrs_by_rows() local
488 dim3 threads(nthreads, 1); in extract_descrs_by_rows()
492 extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>( in extract_descrs_by_rows()
500 template <int nthreads>
513 for (int i = threadIdx.x; i < cdescr_size; i += nthreads) in extract_descrs_by_cols_kernel()
531 const int nthreads = 256; in extract_descrs_by_cols() local
537 dim3 threads(nthreads, 1); in extract_descrs_by_cols()
541 extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>( in extract_descrs_by_cols()
552 template <int nthreads, int correct_gamma>
560 __shared__ float sh_row[(nthreads + 2) * 3]; in compute_gradients_8UC4_kernel()
569 sh_row[threadIdx.x + 1 + (nthreads + 2)] = val.y; in compute_gradients_8UC4_kernel()
570 sh_row[threadIdx.x + 1 + 2 * (nthreads + 2)] = val.z; in compute_gradients_8UC4_kernel()
576 sh_row[(nthreads + 2)] = val.y; in compute_gradients_8UC4_kernel()
577 sh_row[2 * (nthreads + 2)] = val.z; in compute_gradients_8UC4_kernel()
584 sh_row[blockDim.x + 1 + (nthreads + 2)] = val.y; in compute_gradients_8UC4_kernel()
585 sh_row[blockDim.x + 1 + 2 * (nthreads + 2)] = val.z; in compute_gradients_8UC4_kernel()
594 b.y = sh_row[threadIdx.x + 2 + (nthreads + 2)]; in compute_gradients_8UC4_kernel()
595 b.z = sh_row[threadIdx.x + 2 + 2 * (nthreads + 2)]; in compute_gradients_8UC4_kernel()
597 a.y = sh_row[threadIdx.x + (nthreads + 2)]; in compute_gradients_8UC4_kernel()
598 a.z = sh_row[threadIdx.x + 2 * (nthreads + 2)]; in compute_gradients_8UC4_kernel()
659 const int nthreads = 256; in compute_gradients_8UC4() local
661 dim3 bdim(nthreads, 1); in compute_gradients_8UC4()
665 …compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, … in compute_gradients_8UC4()
667 …compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, … in compute_gradients_8UC4()
674 template <int nthreads, int correct_gamma>
682 __shared__ float sh_row[nthreads + 2]; in compute_gradients_8UC1_kernel()
732 const int nthreads = 256; in compute_gradients_8UC1() local
734 dim3 bdim(nthreads, 1); in compute_gradients_8UC1()
738 …compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, … in compute_gradients_8UC1()
740 …compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, … in compute_gradients_8UC1()