Searched refs:ldg (Results 1 – 25 of 65) sorted by relevance


/external/tensorflow/tensorflow/core/kernels/
compare_and_bitpack_op_gpu.cu.cc:45 const T thresh = ldg(threshold); in CompareAndBitpackKernel()
49 ((((ldg(block) > thresh) << 7)) | (((ldg(block + 1) > thresh) << 6)) | in CompareAndBitpackKernel()
50 (((ldg(block + 2) > thresh) << 5)) | in CompareAndBitpackKernel()
51 (((ldg(block + 3) > thresh) << 4)) | in CompareAndBitpackKernel()
52 (((ldg(block + 4) > thresh) << 3)) | in CompareAndBitpackKernel()
53 (((ldg(block + 5) > thresh) << 2)) | in CompareAndBitpackKernel()
54 (((ldg(block + 6) > thresh) << 1)) | (((ldg(block + 7) > thresh)))); in CompareAndBitpackKernel()
66 const int64 block = ldg(reinterpret_cast<const int64*>(input + 8 * i)); in CompareAndBitpackKernel()
83 const float thresh = ldg(threshold); in CompareAndBitpackKernel()
85 const float4 block0 = ldg(reinterpret_cast<const float4*>(input + 8 * i)); in CompareAndBitpackKernel()
[all …]
softmax_op_gpu.cu.cc:88 max_val[i] = strict_cast<U>(ldg(max_logits + row)); in GenerateNormalizedProb()
99 result[i] = input[i] - max_val[i] - log(ldg(sum_probs + row)); in GenerateNormalizedProb()
101 result[i] = exp(input[i] - max_val[i]) / ldg(sum_probs + row); in GenerateNormalizedProb()
134 max_val[i] = strict_cast<float>(ldg(max_logits + row[i])); in GenerateNormalizedProb()
136 result[i] = input[i] - max_val[i] - log(ldg(sum_probs + row[i])); in GenerateNormalizedProb()
138 result[i] = exp(input[i] - max_val[i]) / ldg(sum_probs + row[i]); in GenerateNormalizedProb()
150 max_val[i] = strict_cast<float>(ldg(max_logits + row[i])); in GenerateNormalizedProb()
152 result[i] = input[i] - max_val[i] - log(ldg(sum_probs + row[i])); in GenerateNormalizedProb()
154 result[i] = exp(input[i] - max_val[i]) / ldg(sum_probs + row[i]); in GenerateNormalizedProb()
172 strict_cast<U>(logits_[gid] - ldg(max_logits_ + gid / num_cols_)); in operator ()()
population_count_op_gpu.cu.cc:39 GPU_1D_KERNEL_LOOP(i, size) { output[i] = __popc(ldg(input + i)); } in PopulationCountKernel()
48 output[i] = __popc(ldg(reinterpret_cast<const uint8*>(input + i))); in PopulationCountKernel()
58 output[i] = __popc(ldg(reinterpret_cast<const uint16*>(input + i))); in PopulationCountKernel()
66 GPU_1D_KERNEL_LOOP(i, size) { output[i] = __popcll(ldg(input + i)); } in PopulationCountKernel()
sparse_tensor_dense_matmul_op_gpu.cu.cc:40 const int i = ldg(a_indices + 2 * a_ix + ((ADJ_A) ? 1 : 0)); in SparseTensorDenseMatMulKernel()
41 const int k = ldg(a_indices + 2 * a_ix + ((ADJ_A) ? 0 : 1)); in SparseTensorDenseMatMulKernel()
53 const T a_value = ldg(a_values + a_ix); in SparseTensorDenseMatMulKernel()
56 const T b_value = ldg(b + ((ADJ_B) ? j * b_cols + k : k * b_cols + j)); in SparseTensorDenseMatMulKernel()
bias_op_gpu.cu.cc:59 output[index] = ldg(input + index) + ldg(bias + bias_offset); in BiasNHWCKernel()
71 output[index] = ldg(input + index) + ldg(bias + bias_offset); in BiasNCHWKernel()
109 GpuAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index)); in BiasGradNHWC_Naive()
122 GpuAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index)); in BiasGradNCHW_Naive()
141 GpuAtomicAdd(s_data + bias_offset, AccT(ldg(output_backprop + index))); in BiasGradNHWC_SharedAtomics()
173 T val = ldg(output_backprop + in BiasGradNCHW_SharedAtomics()
depthwise_conv_op_gpu.h:144 sum += static_cast<S>(ldg(input + input_offset)) *
145 static_cast<S>(ldg(filter + filter_offset));
167 sum += static_cast<S>(ldg(input + input_offset)) *
168 static_cast<S>(ldg(filter + filter_offset));
281 tile_ptr[0] = static_cast<S>(ldg(in_ptr));
283 tile_ptr[tile_offset] = static_cast<S>(ldg(tensor_offset + in_ptr));
288 static_cast<S>(ldg(filter_offset + filter));
429 sum += static_cast<S>(ldg(input + input_offset)) *
430 static_cast<S>(ldg(filter + filter_offset));
457 sum += static_cast<S>(ldg(input + input_offset)) *
[all …]
bincount_op_gpu.cu.cc:108 Tidx bin = ldg(in + index); in BincountReduceKernel()
139 Tidx bin = ldg(in + index); in BincountColReduceKernel()
146 T value = (weights_size == 0) ? T(1) : ldg(weights + index); in BincountColReduceKernel()
169 Tidx bin = ldg(in + index); in BincountColReduceSharedKernel()
176 T value = (weights_size == 0) ? T(1) : ldg(weights + index); in BincountColReduceSharedKernel()
in_topk_op_gpu.cu.cc:46 TargetT target_idx = ldg(targets + batch_index); in ComputePredictionMaskKernel()
53 T prediction = ldg(predictions + i); in ComputePredictionMaskKernel()
55 ldg(predictions + batch_index * num_classes + target_idx); in ComputePredictionMaskKernel()
inplace_ops_functor_gpu.cu.cc:39 *p = ldg(q); in DoParallelConcatOpKernel()
96 *p = ldg(q); in DoInplaceOpKernel()
99 *p += ldg(q); in DoInplaceOpKernel()
102 *p -= ldg(q); in DoInplaceOpKernel()
gather_functor_batched_gpu.cu.h:67 Index gather_i = ldg(indices + batch_i * indices_size + indices_i); in GatherOpKernel()
80 out[i] = ldg(params + params_i); in GatherOpKernel()
gather_functor_gpu.cu.h:56 Index gather_i = ldg(indices + indices_i); in GatherOpKernel()
69 out[i] = ldg(params + params_i); in GatherOpKernel()
depthtospace_op_gpu.cu.cc:57 *(output_ptr + out_idx) = ldg(input_ptr + inp_idx); in D2S_NHWC()
96 *(output_ptr + output_idx) = ldg(input_ptr + input_idx); in D2S_NCHW()
134 output_ptr[bY * output_width + bX] = ldg( in D2S_NCHW_LOOP()
spacetodepth_op_gpu.cu.cc:57 *(output_ptr + out_idx) = ldg(input_ptr + inp_idx); in S2D_NHWC()
95 *(output_ptr + output_idx) = ldg(input_ptr + input_idx); in S2D_NCHW()
135 ldg(input_ptr + bY * input_width + bX); in S2D_NCHW_LOOP()
gather_nd_op_gpu.cu.cc:44 const Index index_j = ldg(indices_i + j); in GatherSliceOpKernel()
61 out[i] = (out_of_bounds) ? T(0) : ldg(params + offset + loc_offset); in GatherSliceOpKernel()
multinomial_op_gpu.cu.cc:49 if (ldg(maxima + maxima_idx) == ldg(scores + index)) { in MultinomialKernel()
spacetobatch_functor_gpu.cu.cc:89 ldg(space_tensor_ptr + space_tensor_idx); in S2B()
92 ldg(batch_tensor_ptr + batch_tensor_idx); in S2B()
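
Note on the TensorFlow hits above: in these GPU kernels, ldg() is TensorFlow's thin wrapper that routes read-only loads through the GPU's read-only data cache (CUDA's __ldg intrinsic on compute capability 3.5+, with a plain dereference as fallback). Below is a minimal sketch of that pattern, assuming a simplified wrapper; MyLdg and BiasAddSketch are hypothetical names, not TensorFlow's actual helpers.

#include <cuda_runtime.h>

// Illustrative stand-in for TensorFlow's ldg() helper.
template <typename T>
__device__ inline T MyLdg(const T* ptr) {
#if __CUDA_ARCH__ >= 350
  return __ldg(ptr);  // load through the read-only data cache
#else
  return *ptr;        // older architectures: ordinary global load
#endif
}

// Mirrors the BiasNHWCKernel hit above: operands the kernel only reads go
// through MyLdg, the output is written with a normal store.
template <typename T>
__global__ void BiasAddSketch(const T* input, const T* bias, T* output,
                              int size, int bias_size) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < size;
       index += blockDim.x * gridDim.x) {
    output[index] = MyLdg(input + index) + MyLdg(bias + index % bias_size);
  }
}
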
/external/tensorflow/tensorflow/core/kernels/sparse/
kernels_gpu.cu.cc:44 return static_cast<int>(ldg(begin_ + idx * stride_)); in operator ()()
141 coo_rows_out[i] = static_cast<int>(ldg(indices + i * stride + offset)); in SparseTensorToCOOMatrixKernel()
142 coo_cols_out[i] = static_cast<int>(ldg(indices + i * stride + offset + 1)); in SparseTensorToCOOMatrixKernel()
173 indices_out[i * 2] = static_cast<int64>(ldg(coo_rows + i)); in COOMatrixToSparseTensorKernel2D()
174 indices_out[i * 2 + 1] = static_cast<int64>(ldg(coo_cols + i)); in COOMatrixToSparseTensorKernel2D()
211 indices_out[i * 3 + 1] = static_cast<int64>(ldg(coo_rows + i)); in COOMatrixToSparseTensorKernel3D()
212 indices_out[i * 3 + 2] = static_cast<int64>(ldg(coo_cols + i)); in COOMatrixToSparseTensorKernel3D()
279 c_values[i] = ldg(a_values + i) * local_batch_values[b]; in CSRSparseMatrixBatchMulVecKernel3D()
356 row_max = Eigen::numext::maxi(row_max, ldg(logits + r_i)); in CalculateRowSoftmax()
360 const T exp_i = Eigen::numext::exp(ldg(logits + r_i) - row_max); in CalculateRowSoftmax()
[all …]
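
The CalculateRowSoftmax hits above (kernels_gpu.cu.cc lines 356/360) use ldg inside the usual three-pass row softmax over one CSR row. A hedged sketch of that pattern follows, with hypothetical names and plain float instead of the templated Eigen helpers the real kernel uses.

// Sketch only: begin/end delimit one CSR row's entries in logits/softmax.
__device__ void RowSoftmaxSketch(int begin, int end,
                                 const float* logits, float* softmax) {
  // Pass 1: row maximum, for numerical stability.
  float row_max = -INFINITY;
  for (int r_i = begin; r_i < end; ++r_i) {
    row_max = fmaxf(row_max, __ldg(logits + r_i));
  }
  // Pass 2: exponentiate and accumulate the normalizer.
  float sum_exp = 0.0f;
  for (int r_i = begin; r_i < end; ++r_i) {
    const float exp_i = expf(__ldg(logits + r_i) - row_max);
    softmax[r_i] = exp_i;
    sum_exp += exp_i;
  }
  // Pass 3: normalize.
  for (int r_i = begin; r_i < end; ++r_i) {
    softmax[r_i] /= sum_exp;
  }
}
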
/external/llvm-project/llvm/test/CodeGen/AArch64/
arm64-mte.ll:173 %1 = tail call i8* @llvm.aarch64.ldg(i8* %0, i8* %0)
176 ; CHECK: ldg x0, [x0]
184 %2 = tail call i8* @llvm.aarch64.ldg(i8* %0, i8* %1)
187 ; CHECK: ldg x0, [x1]
196 %1 = call i8* @llvm.aarch64.ldg(i8* nonnull %0, i8* nonnull %0)
201 ; CHECK: ldg [[T0]], [sp]
211 %2 = call i8* @llvm.aarch64.ldg(i8* nonnull %1, i8* nonnull %0)
216 ; CHECK: ldg x0, [sp]
225 %1 = tail call i8* @llvm.aarch64.ldg(i8* nonnull %0, i8* nonnull %0)
229 ; CHECK: ldg [[T0]], [x0, #16]
[all …]
/external/llvm/test/CodeGen/NVPTX/
ldu-ldg.ll:6 declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
7 declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
27 %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4)
34 %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4)
bug26185-2.ll:3 ; Verify that we correctly emit code for extending ldg/ldu. We do not expose
4 ; extending variants in the backend, but the ldg/ldu selection code may pick
/external/llvm-project/llvm/test/CodeGen/NVPTX/
ldu-ldg.ll:6 declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
7 declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
27 %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4)
34 %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4)
bug26185-2.ll:3 ; Verify that we correctly emit code for extending ldg/ldu. We do not expose
4 ; extending variants in the backend, but the ldg/ldu selection code may pick
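
Both NVPTX groups above exercise the llvm.nvvm.ldg.global.* intrinsics directly at the IR level. From CUDA source, the usual way to reach that path is the __ldg() builtin, which the backend lowers to ld.global.nc on sm_35 and later. A short hedged sketch (the kernel name is illustrative only):

#include <cstdint>
#include <cuda_runtime.h>

__global__ void LdgWidthsSketch(const int8_t* p8, const int32_t* p32,
                                int8_t* out8, int32_t* out32) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  // 8-bit and 32-bit read-only loads, mirroring the i8/i32 intrinsic
  // declarations in ldu-ldg.ll.
  out8[i] = __ldg(p8 + i);
  out32[i] = __ldg(p32 + i);
}
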
/external/llvm-project/llvm/test/MC/AArch64/
armv8.5a-mte-error.s:878 ldg sp, [x0, #0]
879 ldg x0, [x0, x0]
880 ldg x0, [x0, #4096]
881 ldg x0, [x0, #-4112]
882 ldg #1, [x0, #255]
883 ldg x0, [#1, #255]
898 ldg
899 ldg x0
900 ldg x0, [#0]
901 ldg w0, [x1]
[all …]
armv8.5a-mte.s:573 ldg X0, [X1, #0]
574 ldg X2, [sp, #-4096]
575 ldg x3, [x4, #4080]
/external/tensorflow/tensorflow/core/kernels/image/
resize_nearest_neighbor_op_gpu.cu.cc:61 top_data[index] = ldg(bottom_data_n + idx); in ResizeNearestNeighborNHWC()
90 top_data[index] = ldg(bottom_data_n + idx); in LegacyResizeNearestNeighborNHWC()
121 GpuAtomicAdd(bottom_diff_n + idx, ldg(top_diff + index)); in ResizeNearestNeighborBackwardNHWC()
150 GpuAtomicAdd(bottom_diff_n + idx, ldg(top_diff + index)); in LegacyResizeNearestNeighborBackwardNHWC()
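
The backward resize hits above combine ldg with GpuAtomicAdd: gradient values are read through the read-only cache and scattered with atomics, since several output pixels can map to the same input pixel. A hedged sketch of that combination, with an illustrative kernel name and an assumed precomputed src_index mapping:

__global__ void NearestNeighborGradSketch(const float* top_diff,
                                          const int* src_index,
                                          float* bottom_diff, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    // top_diff is only read by this kernel, so the read-only cache is safe;
    // collisions on bottom_diff are resolved with atomicAdd.
    atomicAdd(bottom_diff + src_index[i], __ldg(top_diff + i));
  }
}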
