// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \
// RUN:   FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \
// RUN:   FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
6 
// Hand-written stand-ins for the CUDA qualifiers. No CUDA header is included
// by this test, so each qualifier is mapped directly onto the corresponding
// Clang attribute spelling.
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))
11 
// Reads the tid.{x,y,z,w} special registers through the
// __nvvm_read_ptx_sreg_tid_* builtins and returns their sum. Each builtin is
// expected to lower to a call to the matching llvm.nvvm.read.ptx.sreg.tid.*
// intrinsic, in the order the builtins appear below.
// CHECK-LABEL: read_tid
__device__ int read_tid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.w()

  int x = __nvvm_read_ptx_sreg_tid_x();
  int y = __nvvm_read_ptx_sreg_tid_y();
  int z = __nvvm_read_ptx_sreg_tid_z();
  int w = __nvvm_read_ptx_sreg_tid_w();

  return x + y + z + w;

}
27 
// Reads the ntid.{x,y,z,w} special registers through the
// __nvvm_read_ptx_sreg_ntid_* builtins and returns their sum. Each builtin is
// expected to lower to a call to the matching llvm.nvvm.read.ptx.sreg.ntid.*
// intrinsic, in the order the builtins appear below.
// CHECK-LABEL: read_ntid
__device__ int read_ntid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.w()

  int x = __nvvm_read_ptx_sreg_ntid_x();
  int y = __nvvm_read_ptx_sreg_ntid_y();
  int z = __nvvm_read_ptx_sreg_ntid_z();
  int w = __nvvm_read_ptx_sreg_ntid_w();

  return x + y + z + w;

}
43 
// Reads the ctaid.{x,y,z,w} special registers through the
// __nvvm_read_ptx_sreg_ctaid_* builtins and returns their sum. Each builtin is
// expected to lower to a call to the matching llvm.nvvm.read.ptx.sreg.ctaid.*
// intrinsic, in the order the builtins appear below.
// CHECK-LABEL: read_ctaid
__device__ int read_ctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()

  int x = __nvvm_read_ptx_sreg_ctaid_x();
  int y = __nvvm_read_ptx_sreg_ctaid_y();
  int z = __nvvm_read_ptx_sreg_ctaid_z();
  int w = __nvvm_read_ptx_sreg_ctaid_w();

  return x + y + z + w;

}
59 
// Reads the nctaid.{x,y,z,w} special registers through the
// __nvvm_read_ptx_sreg_nctaid_* builtins and returns their sum. Each builtin
// is expected to lower to a call to the matching
// llvm.nvvm.read.ptx.sreg.nctaid.* intrinsic, in the order the builtins appear
// below.
// CHECK-LABEL: read_nctaid
__device__ int read_nctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()

  int x = __nvvm_read_ptx_sreg_nctaid_x();
  int y = __nvvm_read_ptx_sreg_nctaid_y();
  int z = __nvvm_read_ptx_sreg_nctaid_z();
  int w = __nvvm_read_ptx_sreg_nctaid_w();

  return x + y + z + w;

}
75 
// Reads the laneid, warpid, nwarpid, smid, nsmid and gridid special registers
// through their __nvvm_read_ptx_sreg_* builtins and returns the sum. Each
// builtin is expected to lower to a call to the same-named
// llvm.nvvm.read.ptx.sreg.* intrinsic returning i32.
// CHECK-LABEL: read_ids
__device__ int read_ids() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.laneid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid()

  int a = __nvvm_read_ptx_sreg_laneid();
  int b = __nvvm_read_ptx_sreg_warpid();
  int c = __nvvm_read_ptx_sreg_nwarpid();
  int d = __nvvm_read_ptx_sreg_smid();
  int e = __nvvm_read_ptx_sreg_nsmid();
  int f = __nvvm_read_ptx_sreg_gridid();

  return a + b + c + d + e + f;

}
95 
// Reads the lanemask.{eq,le,lt,ge,gt} special registers through the
// __nvvm_read_ptx_sreg_lanemask_* builtins and returns their sum. Each builtin
// is expected to lower to a call to the matching
// llvm.nvvm.read.ptx.sreg.lanemask.* intrinsic returning i32.
// CHECK-LABEL: read_lanemasks
__device__ int read_lanemasks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()

  int a = __nvvm_read_ptx_sreg_lanemask_eq();
  int b = __nvvm_read_ptx_sreg_lanemask_le();
  int c = __nvvm_read_ptx_sreg_lanemask_lt();
  int d = __nvvm_read_ptx_sreg_lanemask_ge();
  int e = __nvvm_read_ptx_sreg_lanemask_gt();

  return a + b + c + d + e;

}
113 
// Reads the clock and clock64 special registers and returns their sum as a
// long long. The 32-bit read is expected to lower to an i32 intrinsic call and
// the 64-bit read to an i64 intrinsic call.
// CHECK-LABEL: read_clocks
__device__ long long read_clocks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock()
// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64()

  int a = __nvvm_read_ptx_sreg_clock();
  long long b = __nvvm_read_ptx_sreg_clock64();

  return a + b;
}
124 
// Reads the pm0..pm3 special registers through the __nvvm_read_ptx_sreg_pm*
// builtins and returns their sum. Each builtin is expected to lower to a call
// to the matching llvm.nvvm.read.ptx.sreg.pm* intrinsic returning i32.
// CHECK-LABEL: read_pms
__device__ int read_pms() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm0()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm1()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm2()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm3()

  int a = __nvvm_read_ptx_sreg_pm0();
  int b = __nvvm_read_ptx_sreg_pm1();
  int c = __nvvm_read_ptx_sreg_pm2();
  int d = __nvvm_read_ptx_sreg_pm3();

  return a + b + c + d;

}
140 
// Calls __nvvm_bar_sync with barrier argument 0; the builtin is expected to
// lower to a call to llvm.nvvm.bar.sync with the same i32 operand.
// The label below anchors on the function's define line rather than the bare
// name, because "sync" alone is too generic a substring to match safely.
// CHECK-LABEL: define{{.*}}sync
__device__ void sync() {

// CHECK: call void @llvm.nvvm.bar.sync(i32 0)

  __nvvm_bar_sync(0);

}
148 
149 
// NVVM intrinsics

// The idea is not to test every intrinsic, just that Clang recognizes the
// builtins defined in BuiltinsNVPTX.def.
// Exercises a sample of the NVVM math, memory-barrier and barrier builtins.
//   f1, f2: float operands for the single-precision builtins.
//   d1, d2: double operands for the double-precision builtins.
// Each builtin call is paired with the intrinsic call it is expected to lower
// to; the results are stored in locals that are otherwise unused.
// CHECK-LABEL: nvvm_math
__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
// CHECK: call float @llvm.nvvm.fmax.f
  float t1 = __nvvm_fmax_f(f1, f2);
// CHECK: call float @llvm.nvvm.fmin.f
  float t2 = __nvvm_fmin_f(f1, f2);
// CHECK: call float @llvm.nvvm.sqrt.rn.f
  float t3 = __nvvm_sqrt_rn_f(f1);
// CHECK: call float @llvm.nvvm.rcp.rn.f
  float t4 = __nvvm_rcp_rn_f(f2);
// CHECK: call float @llvm.nvvm.add.rn.f
  float t5 = __nvvm_add_rn_f(f1, f2);

// CHECK: call double @llvm.nvvm.fmax.d
  double td1 = __nvvm_fmax_d(d1, d2);
// CHECK: call double @llvm.nvvm.fmin.d
  double td2 = __nvvm_fmin_d(d1, d2);
// CHECK: call double @llvm.nvvm.sqrt.rn.d
  double td3 = __nvvm_sqrt_rn_d(d1);
// CHECK: call double @llvm.nvvm.rcp.rn.d
  double td4 = __nvvm_rcp_rn_d(d2);

// CHECK: call void @llvm.nvvm.membar.cta()
  __nvvm_membar_cta();
// CHECK: call void @llvm.nvvm.membar.gl()
  __nvvm_membar_gl();
// CHECK: call void @llvm.nvvm.membar.sys()
  __nvvm_membar_sys();
// CHECK: call void @llvm.nvvm.barrier0()
  __syncthreads();
}
184 
// Device- and shared-qualified globals of each integer width. In the visible
// part of this file only dl and sll are referenced (as atomic targets in
// nvvm_atom); the remaining declarations are kept as-is.
__device__ int di;
__shared__ int si;
__device__ long dl;
__shared__ long sl;
__device__ long long dll;
__shared__ long long sll;
191 
// Check for atomic intrinsics.
//
// Exercises the __nvvm_atom_*_gen_* builtins on int, long and long long
// operands (plus float add and unsigned inc/dec). Expected lowering:
//   - add/sub/and/or/xor/xchg/min/max -> atomicrmw with the matching operation
//     (umax/umin for the unsigned variants);
//   - cas -> cmpxchg followed by an extractvalue of element 0;
//   - float add, inc and dec -> calls to llvm.nvvm.atomic.load.* intrinsics.
// The long variants use the global dl and the long long variants use the
// shared sll. The cas-on-long directive accepts either i32 or i64 because the
// same directive is run for both the nvptx and nvptx64 triples.
// CHECK-LABEL: nvvm_atom
__device__ void nvvm_atom(float *fp, float f, int *ip, int i, unsigned int *uip, unsigned ui, long *lp, long l,
                          long long *llp, long long ll) {
  // CHECK: atomicrmw add
  __nvvm_atom_add_gen_i(ip, i);
  // CHECK: atomicrmw add
  __nvvm_atom_add_gen_l(&dl, l);
  // CHECK: atomicrmw add
  __nvvm_atom_add_gen_ll(&sll, ll);

  // CHECK: atomicrmw sub
  __nvvm_atom_sub_gen_i(ip, i);
  // CHECK: atomicrmw sub
  __nvvm_atom_sub_gen_l(&dl, l);
  // CHECK: atomicrmw sub
  __nvvm_atom_sub_gen_ll(&sll, ll);

  // CHECK: atomicrmw and
  __nvvm_atom_and_gen_i(ip, i);
  // CHECK: atomicrmw and
  __nvvm_atom_and_gen_l(&dl, l);
  // CHECK: atomicrmw and
  __nvvm_atom_and_gen_ll(&sll, ll);

  // CHECK: atomicrmw or
  __nvvm_atom_or_gen_i(ip, i);
  // CHECK: atomicrmw or
  __nvvm_atom_or_gen_l(&dl, l);
  // CHECK: atomicrmw or
  __nvvm_atom_or_gen_ll(&sll, ll);

  // CHECK: atomicrmw xor
  __nvvm_atom_xor_gen_i(ip, i);
  // CHECK: atomicrmw xor
  __nvvm_atom_xor_gen_l(&dl, l);
  // CHECK: atomicrmw xor
  __nvvm_atom_xor_gen_ll(&sll, ll);

  // CHECK: atomicrmw xchg
  __nvvm_atom_xchg_gen_i(ip, i);
  // CHECK: atomicrmw xchg
  __nvvm_atom_xchg_gen_l(&dl, l);
  // CHECK: atomicrmw xchg
  __nvvm_atom_xchg_gen_ll(&sll, ll);

  // CHECK: atomicrmw max i32*
  __nvvm_atom_max_gen_i(ip, i);
  // CHECK: atomicrmw umax i32*
  __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw max
  __nvvm_atom_max_gen_l(&dl, l);
  // CHECK: atomicrmw umax
  __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw max i64*
  __nvvm_atom_max_gen_ll(&sll, ll);
  // CHECK: atomicrmw umax i64*
  __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: atomicrmw min i32*
  __nvvm_atom_min_gen_i(ip, i);
  // CHECK: atomicrmw umin i32*
  __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw min
  __nvvm_atom_min_gen_l(&dl, l);
  // CHECK: atomicrmw umin
  __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw min i64*
  __nvvm_atom_min_gen_ll(&sll, ll);
  // CHECK: atomicrmw umin i64*
  __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: cmpxchg
  // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_i(ip, 0, i);
  // CHECK: cmpxchg
  // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_l(&dl, 0, l);
  // CHECK: cmpxchg
  // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_ll(&sll, 0, ll);

  // CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
  __nvvm_atom_add_gen_f(fp, f);

  // CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0i32
  __nvvm_atom_inc_gen_ui(uip, ui);

  // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0i32
  __nvvm_atom_dec_gen_ui(uip, ui);

  // CHECK: ret
}
285 
// Exercises the __nvvm_ldg_* builtins on the untyped pointer p, cast to each
// supported scalar type and to 2- and 4-element ext_vector_type vectors.
// Signed and unsigned variants of the same width are expected to produce the
// same llvm.nvvm.ldg.global.* intrinsic call. The long variants are checked
// under the LP32/LP64 prefixes, which expect i32 and i64 loads respectively.
// NOTE(review): the trailing i32 operand of each expected call appears to be
// the alignment in bytes of the pointee type — confirm against the intrinsic
// definition.
// CHECK-LABEL: nvvm_ldg
__device__ void nvvm_ldg(const void *p) {
  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
  __nvvm_ldg_c((const char *)p);
  __nvvm_ldg_uc((const unsigned char *)p);

  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
  __nvvm_ldg_s((const short *)p);
  __nvvm_ldg_us((const unsigned short *)p);

  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  __nvvm_ldg_i((const int *)p);
  __nvvm_ldg_ui((const unsigned int *)p);

  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
  __nvvm_ldg_l((const long *)p);
  __nvvm_ldg_ul((const unsigned long *)p);

  // CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* {{%[0-9]+}}, i32 4)
  __nvvm_ldg_f((const float *)p);
  // CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0f64(double* {{%[0-9]+}}, i32 8)
  __nvvm_ldg_d((const double *)p);

  // In practice, the pointers we pass to __ldg will be aligned as appropriate
  // for the CUDA <type>N vector types (e.g. short4), which are not the same as
  // the LLVM vector types.  However, each LLVM vector type has an alignment
  // less than or equal to its corresponding CUDA type, so we're OK.
  //
  // PTX Interoperability section 2.2: "For a vector with an even number of
  // elements, its alignment is set to number of elements times the alignment of
  // its member: n*alignof(t)."

  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_c2((const char2 *)p);
  __nvvm_ldg_uc2((const uchar2 *)p);

  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_c4((const char4 *)p);
  __nvvm_ldg_uc4((const uchar4 *)p);

  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_s2((const short2 *)p);
  __nvvm_ldg_us2((const ushort2 *)p);

  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_s4((const short4 *)p);
  __nvvm_ldg_us4((const ushort4 *)p);

  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_i2((const int2 *)p);
  __nvvm_ldg_ui2((const uint2 *)p);

  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_i4((const int4 *)p);
  __nvvm_ldg_ui4((const uint4 *)p);

  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_ll2((const longlong2 *)p);
  __nvvm_ldg_ull2((const ulonglong2 *)p);

  // CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0v2f32(<2 x float>* {{%[0-9]+}}, i32 8)
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_f2((const float2 *)p);

  // CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>* {{%[0-9]+}}, i32 16)
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_f4((const float4 *)p);

  // CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* {{%[0-9]+}}, i32 16)
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_d2((const double2 *)p);
}
385