; The SICI prefix is enabled on both the SI and the CI (bonaire) invocations so
; that the checks shared by those two targets in this file are actually verified.
1; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SICI --check-prefix=SIVI %s
2; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=CI --check-prefix=GCN --check-prefix=SICI %s
3; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI %s
4
5; SMRD load with an immediate offset.
6; GCN-LABEL: {{^}}smrd0:
7; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
8; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
; Test input: reads the i32 at element index 1 of %ptr (addrspace(2)) and
; writes it to %out (addrspace(1)).
9define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
10entry:
; Index 1 over i32 elements, i.e. 4 bytes past %ptr.
11  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
12  %1 = load i32, i32 addrspace(2)* %0
13  store i32 %1, i32 addrspace(1)* %out
14  ret void
15}
16
17; SMRD load with the largest possible immediate offset.
18; GCN-LABEL: {{^}}smrd1:
19; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
20; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
; Test input: reads the i32 at element index 255 of %ptr (addrspace(2)) and
; writes it to %out (addrspace(1)).
21define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
22entry:
; Index 255 over i32 elements, i.e. 1020 (0x3fc) bytes past %ptr.
23  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
24  %1 = load i32, i32 addrspace(2)* %0
25  store i32 %1, i32 addrspace(1)* %out
26  ret void
27}
28
29; SMRD load with an offset greater than the largest possible immediate.
30; GCN-LABEL: {{^}}smrd2:
31; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
32; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
33; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
34; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
35; GCN: s_endpgm
; Test input: reads the i32 at element index 256 of %ptr (addrspace(2)) and
; writes it to %out (addrspace(1)).
36define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
37entry:
; Index 256 over i32 elements, i.e. 1024 (0x400) bytes past %ptr; one element
; beyond the index used by smrd1.
38  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
39  %1 = load i32, i32 addrspace(2)* %0
40  store i32 %1, i32 addrspace(1)* %out
41  ret void
42}
43
44; SMRD load with a 64-bit offset
45; GCN-LABEL: {{^}}smrd3:
46; FIXME: There are too many copies here because we don't fold immediates
47;        through REG_SEQUENCE
48; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
49; TODO: Add VI checks
50; GCN: s_endpgm
; Test input: reads the i32 at element index 2^32 of %ptr (addrspace(2)) and
; writes it to %out (addrspace(1)). The index does not fit in 32 bits.
51define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
52entry:
53  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
54  %1 = load i32, i32 addrspace(2)* %0
55  store i32 %1, i32 addrspace(1)* %out
56  ret void
57}
58
59; SMRD load with the largest possible immediate offset on VI
60; GCN-LABEL: {{^}}smrd4:
61; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
62; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
63; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
64; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
; Test input: reads the i32 at element index 262143 of %ptr (addrspace(2)) and
; writes it to %out (addrspace(1)).
65define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
66entry:
; Index 262143 (0x3ffff) over i32 elements, i.e. 1048572 (0xffffc) bytes past %ptr.
67  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
68  %1 = load i32, i32 addrspace(2)* %0
69  store i32 %1, i32 addrspace(1)* %out
70  ret void
71}
72
73; SMRD load with an offset greater than the largest possible immediate on VI
74; GCN-LABEL: {{^}}smrd5:
75; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
76; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
77; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
78; GCN: s_endpgm
; Test input: reads the i32 at element index 262144 of %ptr (addrspace(2)) and
; writes it to %out (addrspace(1)).
79define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
80entry:
; Index 262144 (0x40000) over i32 elements, i.e. 1048576 (0x100000) bytes past
; %ptr; one element beyond the index used by smrd4.
81  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
82  %1 = load i32, i32 addrspace(2)* %0
83  store i32 %1, i32 addrspace(1)* %out
84  ret void
85}
86
87; SMRD load using the load.const intrinsic with an immediate offset
88; GCN-LABEL: {{^}}smrd_load_const0:
89; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
90; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
; Test input: loads a <16 x i8> value through the first argument, passes it to
; llvm.SI.load.const with an i32 operand of 16, and hands the result to
; llvm.SI.export. The 20 unnamed arguments occupy %0-%19, so the instruction
; values start at %20.
91define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
92main_body:
; Index 0, so %20 addresses the same element as the first argument %0.
93  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
94  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
; NOTE(review): the i32 operand (16) is presumably a byte offset, judging by the
; 0x10 / 0x4 immediates expected by the checks for this function -- confirm.
95  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
; The loaded float is passed as all four float operands of the export.
96  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
97  ret void
98}
99
100; SMRD load using the load.const intrinsic with the largest possible immediate
101; offset.
102; GCN-LABEL: {{^}}smrd_load_const1:
103; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
104; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
; Test input: loads a <16 x i8> value through the first argument, passes it to
; llvm.SI.load.const with an i32 operand of 1020 (0x3fc), and hands the result
; to llvm.SI.export. The 20 unnamed arguments occupy %0-%19, so the instruction
; values start at %20.
105define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
106main_body:
; Index 0, so %20 addresses the same element as the first argument %0.
107  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
108  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
109  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020)
; The loaded float is passed as all four float operands of the export.
110  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
111  ret void
112}
113; SMRD load using the load.const intrinsic with an offset greater than the
114; largest possible
115; immediate offset.
116; GCN-LABEL: {{^}}smrd_load_const2:
117; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
118; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
119; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
120; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
; Test input: loads a <16 x i8> value through the first argument, passes it to
; llvm.SI.load.const with an i32 operand of 1024 (0x400), and hands the result
; to llvm.SI.export. The 20 unnamed arguments occupy %0-%19, so the instruction
; values start at %20.
121define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
122main_body:
; Index 0, so %20 addresses the same element as the first argument %0.
123  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
124  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
125  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024)
; The loaded float is passed as all four float operands of the export.
126  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
127  ret void
128}
129
130; SMRD load with the largest possible immediate offset on VI
131; GCN-LABEL: {{^}}smrd_load_const3:
132; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
133; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
134; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
135; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
; Test input: loads a <16 x i8> value through the first argument, passes it to
; llvm.SI.load.const with an i32 operand of 1048572 (0xffffc), and hands the
; result to llvm.SI.export. The 20 unnamed arguments occupy %0-%19, so the
; instruction values start at %20.
136define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
137main_body:
; Index 0, so %20 addresses the same element as the first argument %0.
138  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
139  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
140  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048572)
; The loaded float is passed as all four float operands of the export.
141  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
142  ret void
143}
144
145; SMRD load with an offset greater than the largest possible immediate on VI
146; GCN-LABEL: {{^}}smrd_load_const4:
147; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
148; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
149; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
150; GCN: s_endpgm
; Test input: loads a <16 x i8> value through the first argument, passes it to
; llvm.SI.load.const with an i32 operand of 1048576 (0x100000), and hands the
; result to llvm.SI.export. The 20 unnamed arguments occupy %0-%19, so the
; instruction values start at %20.
151define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
152main_body:
; Index 0, so %20 addresses the same element as the first argument %0.
153  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
154  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
155  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048576)
; The loaded float is passed as all four float operands of the export.
156  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
157  ret void
158}
159
160; Function Attrs: nounwind readnone
; Intrinsic called by the smrd_load_const* functions: takes a <16 x i8> value
; and an i32, returns a float. Carries attribute group #0.
161declare float @llvm.SI.load.const(<16 x i8>, i32) #0
162
; Intrinsic called by the smrd_load_const* functions with the loaded float in
; its four float operands; declared here without an attribute group.
163declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
164
; Attribute group referenced by the llvm.SI.load.const declaration.
165attributes #0 = { nounwind readnone }
166