1# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
2# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
3
4# GCN-LABEL: {{^}}sdwa_imm_operand:
5# GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
6# GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
7# GCN: BB0_1:
8# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
9# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
10
11# GCN-LABEL: {{^}}sdwa_sgpr_operand:
12# VI: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
13# VI-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
14# VI: BB1_1:
15# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
16# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
17
18# GFX9: s_mov_b32 s[[SHIFT:[0-9]+]], 2
19# GFX9-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
20# GFX9: BB1_1:
21# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
22# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
23
24--- |
25  ; ModuleID = 'sdwa-scalar-ops.opt.ll'
26  source_filename = "sdwa-scalar-ops.opt.ll"
27  target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
28
29  define amdgpu_kernel void @sdwa_imm_operand(i32 addrspace(1)* nocapture %arg) {
  ; IR counterpart of the MIR function with the same name (the MIR memory
  ; operands refer to the %ir.* values defined here). Each loop iteration loads
  ; two consecutive i32 words from %arg, uses bits 8..15 of each word as an
  ; i32 index into %arg, and stores 1 at that index.
30  bb:
31    br label %bb2
32
33  bb1:                                              ; preds = %bb2
34    ret void
35
36  bb2:                                              ; preds = %bb2, %bb
    ; %lsr.iv is a byte offset into %arg; it starts at 0 on entry from %bb.
37    %lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ]
38    %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
39    %uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv
40    %uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)*
41    %tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4
    ; (word >> 8) & 255 keeps bits 8..15 of the loaded word.
42    %tmp6 = lshr i32 %tmp5, 8
43    %tmp7 = and i32 %tmp6, 255
44    %tmp8 = zext i32 %tmp7 to i64
45    %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8
46    store i32 1, i32 addrspace(1)* %tmp9, align 4
    ; Second access: the same extract/index/store on the next i32 word.
47    %scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1
48    %tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4
49    %tmp14 = lshr i32 %tmp13, 8
50    %tmp15 = and i32 %tmp14, 255
51    %tmp16 = zext i32 %tmp15 to i64
52    %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
53    store i32 1, i32 addrspace(1)* %tmp17, align 4
    ; Advance by 8 bytes (the two words just processed) and leave the loop once
    ; the truncated offset equals 4096.
54    %lsr.iv.next = add nuw nsw i64 %lsr.iv, 8
55    %tmp1 = trunc i64 %lsr.iv.next to i32
56    %tmp19 = icmp eq i32 %tmp1, 4096
57    br i1 %tmp19, label %bb1, label %bb2
58  }
59
60  define amdgpu_kernel void @sdwa_sgpr_operand(i32 addrspace(1)* nocapture %arg) {
  ; IR counterpart of the MIR function with the same name. The IR body is the
  ; same loop as in @sdwa_imm_operand; the two test cases differ only in their
  ; MIR bodies. Each iteration loads two consecutive i32 words from %arg, uses
  ; bits 8..15 of each word as an i32 index into %arg, and stores 1 there.
61  bb:
62    br label %bb2
63
64  bb1:                                              ; preds = %bb2
65    ret void
66
67  bb2:                                              ; preds = %bb2, %bb
    ; %lsr.iv is a byte offset into %arg; it starts at 0 on entry from %bb.
68    %lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ]
69    %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
70    %uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv
71    %uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)*
72    %tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4
    ; (word >> 8) & 255 keeps bits 8..15 of the loaded word.
73    %tmp6 = lshr i32 %tmp5, 8
74    %tmp7 = and i32 %tmp6, 255
75    %tmp8 = zext i32 %tmp7 to i64
76    %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8
77    store i32 1, i32 addrspace(1)* %tmp9, align 4
    ; Second access: the same extract/index/store on the next i32 word.
78    %scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1
79    %tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4
80    %tmp14 = lshr i32 %tmp13, 8
81    %tmp15 = and i32 %tmp14, 255
82    %tmp16 = zext i32 %tmp15 to i64
83    %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
84    store i32 1, i32 addrspace(1)* %tmp17, align 4
    ; Advance by 8 bytes (the two words just processed) and leave the loop once
    ; the truncated offset equals 4096.
85    %lsr.iv.next = add nuw nsw i64 %lsr.iv, 8
86    %tmp1 = trunc i64 %lsr.iv.next to i32
87    %tmp19 = icmp eq i32 %tmp1, 4096
88    br i1 %tmp19, label %bb1, label %bb2
89  }
90
91...
92---
93name:            sdwa_imm_operand
# Immediate-shift case: both V_LSHLREV_B32_e32 instructions in bb.2 take the
# literal 2 as their shift amount. The check lines at the top of the file
# expect a single v_mov_b32 of 2 ahead of the loop and two v_lshlrev_b32_sdwa
# instructions (src1_sel BYTE_1) that read the shift amount from that VGPR.
94alignment:       1
95exposesReturnsTwice: false
96legalized:       false
97regBankSelected: false
98selected:        false
99tracksRegLiveness: true
# Virtual register table. Only some of these ids are referenced in the body
# below; the rest are declared but unused here.
100registers:
101  - { id: 0, class: sreg_64 }
102  - { id: 1, class: sreg_64 }
103  - { id: 2, class: vgpr_32 }
104  - { id: 3, class: sgpr_128 }
105  - { id: 4, class: sgpr_64 }
106  - { id: 5, class: sreg_32_xm0 }
107  - { id: 6, class: sgpr_32 }
108  - { id: 7, class: sreg_64 }
109  - { id: 8, class: sreg_64 }
110  - { id: 9, class: sreg_64_xexec }
111  - { id: 10, class: sreg_32_xm0 }
112  - { id: 11, class: sreg_32_xm0 }
113  - { id: 12, class: sreg_32_xm0 }
114  - { id: 13, class: sreg_32_xm0 }
115  - { id: 14, class: sreg_32_xm0 }
116  - { id: 15, class: sreg_32_xm0 }
117  - { id: 16, class: sreg_64 }
118  - { id: 17, class: vgpr_32 }
119  - { id: 18, class: vreg_64 }
120  - { id: 19, class: sreg_32_xm0 }
121  - { id: 20, class: sreg_32 }
122  - { id: 21, class: sreg_32_xm0 }
123  - { id: 22, class: sreg_32_xm0 }
124  - { id: 23, class: sreg_32_xm0 }
125  - { id: 24, class: sreg_64 }
126  - { id: 25, class: sreg_32_xm0 }
127  - { id: 26, class: sreg_32_xm0 }
128  - { id: 27, class: sreg_32_xm0 }
129  - { id: 28, class: sreg_32_xm0 }
130  - { id: 29, class: sreg_64 }
131  - { id: 30, class: vgpr_32 }
132  - { id: 31, class: vreg_64 }
133  - { id: 32, class: sreg_32_xm0 }
134  - { id: 33, class: sreg_32_xm0 }
135  - { id: 34, class: sreg_64 }
136  - { id: 35, class: sreg_32_xm0 }
137  - { id: 36, class: sreg_32_xm0 }
138  - { id: 37, class: sreg_32_xm0 }
139  - { id: 38, class: sreg_32_xm0 }
140  - { id: 39, class: vreg_64 }
141  - { id: 40, class: vgpr_32 }
142  - { id: 41, class: vreg_64 }
143  - { id: 42, class: sreg_32_xm0 }
144  - { id: 43, class: sreg_32 }
145  - { id: 44, class: sreg_32_xm0 }
146  - { id: 45, class: sreg_64 }
147  - { id: 46, class: sreg_32_xm0 }
148  - { id: 47, class: sreg_32_xm0 }
149  - { id: 48, class: sreg_32_xm0 }
150  - { id: 49, class: sreg_32_xm0 }
151  - { id: 50, class: sreg_64 }
152  - { id: 51, class: vreg_64 }
153  - { id: 52, class: sreg_64 }
154  - { id: 53, class: sreg_32_xm0 }
155  - { id: 54, class: sreg_32_xm0 }
156  - { id: 55, class: sreg_32_xm0 }
157  - { id: 56, class: sreg_32_xm0 }
158  - { id: 57, class: sreg_64 }
159  - { id: 58, class: sreg_32_xm0 }
160  - { id: 59, class: sreg_32_xm0 }
161  - { id: 60, class: vgpr_32 }
162  - { id: 61, class: vgpr_32 }
163  - { id: 62, class: vreg_64 }
164  - { id: 63, class: vgpr_32 }
165  - { id: 64, class: vgpr_32 }
166  - { id: 65, class: vgpr_32 }
167  - { id: 66, class: vgpr_32 }
168  - { id: 67, class: vreg_64 }
169  - { id: 68, class: vgpr_32 }
170  - { id: 69, class: vgpr_32 }
171  - { id: 70, class: vgpr_32 }
172  - { id: 71, class: vgpr_32 }
173  - { id: 72, class: vgpr_32 }
174  - { id: 73, class: vgpr_32 }
175  - { id: 74, class: vgpr_32 }
176  - { id: 75, class: vreg_64 }
177  - { id: 76, class: vgpr_32 }
178  - { id: 77, class: vgpr_32 }
179  - { id: 78, class: vgpr_32 }
180  - { id: 79, class: vgpr_32 }
181  - { id: 80, class: vreg_64 }
182  - { id: 81, class: vgpr_32 }
183  - { id: 82, class: vgpr_32 }
184  - { id: 83, class: vgpr_32 }
# The only live-in is $sgpr4_sgpr5, bound to %4 (copied at the top of bb.0).
185liveins:
186  - { reg: '$sgpr4_sgpr5', virtual-reg: '%4' }
187frameInfo:
188  isFrameAddressTaken: false
189  isReturnAddressTaken: false
190  hasStackMap:     false
191  hasPatchPoint:   false
192  stackSize:       0
193  offsetAdjustment: 0
194  maxAlignment:    0
195  adjustsStack:    false
196  hasCalls:        false
197  hasOpaqueSPAdjustment: false
198  hasVAStart:      false
199  hasMustTailInVarArgFunc: false
200body:             |
201  bb.0.bb:
202    successors: %bb.2.bb2(0x80000000)
203    liveins: $sgpr4_sgpr5
204
    ; Entry: load a 64-bit value through the pointer in $sgpr4_sgpr5; it is
    ; used below as the base address (%7). %8 is the zero start value of the
    ; loop offset and %30 holds the constant 1 that the loop stores.
205    %4 = COPY $sgpr4_sgpr5
206    %9 = S_LOAD_DWORDX2_IMM %4, 0, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`)
207    %8 = S_MOV_B64 0
208    %7 = COPY %9
209    %30 = V_MOV_B32_e32 1, implicit $exec
210    S_BRANCH %bb.2.bb2
211
212  bb.1.bb1:
213    S_ENDPGM 0
214
215  bb.2.bb2:
216    successors: %bb.1.bb1(0x04000000), %bb.2.bb2(0x7c000000)
217
    ; %0 is the 64-bit loop offset (%8 on entry, %1 on the back edge).
    ; %14/%15 form base + offset as two 32-bit adds chained through $scc.
218    %0 = PHI %8, %bb.0.bb, %1, %bb.2.bb2
219    %13 = COPY %7.sub1
220    %14 = S_ADD_U32 %7.sub0, %0.sub0, implicit-def $scc
221    %15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead $scc, implicit $scc
222    %16 = REG_SEQUENCE %14, %subreg.sub0, %15, %subreg.sub1
223    %18 = COPY %16
224    %17 = FLAT_LOAD_DWORD %18, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.uglygep45)
    ; Pattern under test: a bitfield extract with 8, 8 operands (the IR's
    ; lshr 8 / and 255) feeding a shift by the immediate 2.
225    %60 = V_BFE_U32 %17, 8, 8, implicit $exec
226    %61 = V_LSHLREV_B32_e32 2, killed %60, implicit $exec
    ; Store address = %7 + shifted index, built from two 32-bit adds with the
    ; carry passed in $vcc; %66 is a VGPR copy of the high half of %7.
227    %70 = V_ADD_CO_U32_e32 %7.sub0, %61, implicit-def $vcc, implicit $exec
228    %66 = COPY %13
229    %65 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec
230    %67 = REG_SEQUENCE %70, %subreg.sub0, killed %65, %subreg.sub1
231    FLAT_STORE_DWORD %67, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp9)
    ; Second access: the same load/extract/shift/store sequence on the word
    ; 4 bytes after the first one.
232    %37 = S_ADD_U32 %14, 4, implicit-def $scc
233    %38 = S_ADDC_U32 %15, 0, implicit-def dead $scc, implicit $scc
234    %71 = COPY killed %37
235    %72 = COPY killed %38
236    %41 = REG_SEQUENCE killed %71, %subreg.sub0, killed %72, %subreg.sub1
237    %40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.scevgep)
238    %73 = V_BFE_U32 %40, 8, 8, implicit $exec
239    %74 = V_LSHLREV_B32_e32 2, killed %73, implicit $exec
240    %83 = V_ADD_CO_U32_e32 %7.sub0, %74, implicit-def $vcc, implicit $exec
241    %78 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec
242    %80 = REG_SEQUENCE %83, %subreg.sub0, killed %78, %subreg.sub1
243    FLAT_STORE_DWORD %80, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp17)
    ; Advance the loop offset by 8 and exit to bb.1 once its low half (%55)
    ; equals 4096; otherwise take the back edge.
244    %55 = S_ADD_U32 %0.sub0, 8, implicit-def $scc
245    %56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead $scc, implicit $scc
246    %57 = REG_SEQUENCE %55, %subreg.sub0, killed %56, %subreg.sub1
247    %1 = COPY %57
248    S_CMPK_EQ_I32 %55, 4096, implicit-def $scc
249    S_CBRANCH_SCC1 %bb.1.bb1, implicit $scc
250    S_BRANCH %bb.2.bb2
251
252...
253---
254name:            sdwa_sgpr_operand
# SGPR-shift case: the shift amount 2 is materialized in SGPR %84 in bb.0 and
# both V_LSHLREV_B32_e32 instructions in bb.2 read it from there. The check
# lines at the top of the file expect the fiji run to move 2 into a VGPR and
# use that in v_lshlrev_b32_sdwa, while the gfx900 run keeps s_mov_b32 and
# passes the SGPR directly as the shift operand.
255alignment:       1
256exposesReturnsTwice: false
257legalized:       false
258regBankSelected: false
259selected:        false
260tracksRegLiveness: true
# Virtual register table. Only some of these ids are referenced in the body
# below; %84 is the SGPR that carries the shift amount.
261registers:
262  - { id: 0, class: sreg_64 }
263  - { id: 1, class: sreg_64 }
264  - { id: 2, class: vgpr_32 }
265  - { id: 3, class: sgpr_128 }
266  - { id: 4, class: sgpr_64 }
267  - { id: 5, class: sreg_32_xm0 }
268  - { id: 6, class: sgpr_32 }
269  - { id: 7, class: sreg_64 }
270  - { id: 8, class: sreg_64 }
271  - { id: 9, class: sreg_64_xexec }
272  - { id: 10, class: sreg_32_xm0 }
273  - { id: 11, class: sreg_32_xm0 }
274  - { id: 12, class: sreg_32_xm0 }
275  - { id: 13, class: sreg_32_xm0 }
276  - { id: 14, class: sreg_32_xm0 }
277  - { id: 15, class: sreg_32_xm0 }
278  - { id: 16, class: sreg_64 }
279  - { id: 17, class: vgpr_32 }
280  - { id: 18, class: vreg_64 }
281  - { id: 19, class: sreg_32_xm0 }
282  - { id: 20, class: sreg_32 }
283  - { id: 21, class: sreg_32_xm0 }
284  - { id: 22, class: sreg_32_xm0 }
285  - { id: 23, class: sreg_32_xm0 }
286  - { id: 24, class: sreg_64 }
287  - { id: 25, class: sreg_32_xm0 }
288  - { id: 26, class: sreg_32_xm0 }
289  - { id: 27, class: sreg_32_xm0 }
290  - { id: 28, class: sreg_32_xm0 }
291  - { id: 29, class: sreg_64 }
292  - { id: 30, class: vgpr_32 }
293  - { id: 31, class: vreg_64 }
294  - { id: 32, class: sreg_32_xm0 }
295  - { id: 33, class: sreg_32_xm0 }
296  - { id: 34, class: sreg_64 }
297  - { id: 35, class: sreg_32_xm0 }
298  - { id: 36, class: sreg_32_xm0 }
299  - { id: 37, class: sreg_32_xm0 }
300  - { id: 38, class: sreg_32_xm0 }
301  - { id: 39, class: vreg_64 }
302  - { id: 40, class: vgpr_32 }
303  - { id: 41, class: vreg_64 }
304  - { id: 42, class: sreg_32_xm0 }
305  - { id: 43, class: sreg_32 }
306  - { id: 44, class: sreg_32_xm0 }
307  - { id: 45, class: sreg_64 }
308  - { id: 46, class: sreg_32_xm0 }
309  - { id: 47, class: sreg_32_xm0 }
310  - { id: 48, class: sreg_32_xm0 }
311  - { id: 49, class: sreg_32_xm0 }
312  - { id: 50, class: sreg_64 }
313  - { id: 51, class: vreg_64 }
314  - { id: 52, class: sreg_64 }
315  - { id: 53, class: sreg_32_xm0 }
316  - { id: 54, class: sreg_32_xm0 }
317  - { id: 55, class: sreg_32_xm0 }
318  - { id: 56, class: sreg_32_xm0 }
319  - { id: 57, class: sreg_64 }
320  - { id: 58, class: sreg_32_xm0 }
321  - { id: 59, class: sreg_32_xm0 }
322  - { id: 60, class: vgpr_32 }
323  - { id: 61, class: vgpr_32 }
324  - { id: 62, class: vreg_64 }
325  - { id: 63, class: vgpr_32 }
326  - { id: 64, class: vgpr_32 }
327  - { id: 65, class: vgpr_32 }
328  - { id: 66, class: vgpr_32 }
329  - { id: 67, class: vreg_64 }
330  - { id: 68, class: vgpr_32 }
331  - { id: 69, class: vgpr_32 }
332  - { id: 70, class: vgpr_32 }
333  - { id: 71, class: vgpr_32 }
334  - { id: 72, class: vgpr_32 }
335  - { id: 73, class: vgpr_32 }
336  - { id: 74, class: vgpr_32 }
337  - { id: 75, class: vreg_64 }
338  - { id: 76, class: vgpr_32 }
339  - { id: 77, class: vgpr_32 }
340  - { id: 78, class: vgpr_32 }
341  - { id: 79, class: vgpr_32 }
342  - { id: 80, class: vreg_64 }
343  - { id: 81, class: vgpr_32 }
344  - { id: 82, class: vgpr_32 }
345  - { id: 83, class: vgpr_32 }
346  - { id: 84, class: sreg_32_xm0 }
# The only live-in is $sgpr4_sgpr5, bound to %4 (copied at the top of bb.0).
347liveins:
348  - { reg: '$sgpr4_sgpr5', virtual-reg: '%4' }
349frameInfo:
350  isFrameAddressTaken: false
351  isReturnAddressTaken: false
352  hasStackMap:     false
353  hasPatchPoint:   false
354  stackSize:       0
355  offsetAdjustment: 0
356  maxAlignment:    0
357  adjustsStack:    false
358  hasCalls:        false
359  hasOpaqueSPAdjustment: false
360  hasVAStart:      false
361  hasMustTailInVarArgFunc: false
362body:             |
363  bb.0.bb:
364    successors: %bb.2.bb2(0x80000000)
365    liveins: $sgpr4_sgpr5
366
    ; Entry: load a 64-bit value through the pointer in $sgpr4_sgpr5; it is
    ; used below as the base address (%7). %8 is the zero start value of the
    ; loop offset, %30 holds the constant 1 that the loop stores, and %84 is
    ; the scalar shift amount consumed by both shifts in bb.2.
367    %4 = COPY $sgpr4_sgpr5
368    %9 = S_LOAD_DWORDX2_IMM %4, 0, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`)
369    %8 = S_MOV_B64 0
370    %7 = COPY %9
371    %30 = V_MOV_B32_e32 1, implicit $exec
372    %84 = S_MOV_B32 2
373    S_BRANCH %bb.2.bb2
374
375  bb.1.bb1:
376    S_ENDPGM 0
377
378  bb.2.bb2:
379    successors: %bb.1.bb1(0x04000000), %bb.2.bb2(0x7c000000)
380
    ; %0 is the 64-bit loop offset (%8 on entry, %1 on the back edge).
    ; %14/%15 form base + offset as two 32-bit adds chained through $scc.
381    %0 = PHI %8, %bb.0.bb, %1, %bb.2.bb2
382    %13 = COPY %7.sub1
383    %14 = S_ADD_U32 %7.sub0, %0.sub0, implicit-def $scc
384    %15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead $scc, implicit $scc
385    %16 = REG_SEQUENCE %14, %subreg.sub0, %15, %subreg.sub1
386    %18 = COPY %16
387    %17 = FLAT_LOAD_DWORD %18, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.uglygep45)
    ; Pattern under test: a bitfield extract with 8, 8 operands (the IR's
    ; lshr 8 / and 255) feeding a shift whose amount comes from SGPR %84.
388    %60 = V_BFE_U32 %17, 8, 8, implicit $exec
389    %61 = V_LSHLREV_B32_e32 %84, killed %60, implicit $exec
    ; Store address = %7 + shifted index, built from two 32-bit adds with the
    ; carry passed in $vcc; %66 is a VGPR copy of the high half of %7.
390    %70 = V_ADD_CO_U32_e32 %7.sub0, %61, implicit-def $vcc, implicit $exec
391    %66 = COPY %13
392    %65 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec
393    %67 = REG_SEQUENCE %70, %subreg.sub0, killed %65, %subreg.sub1
394    FLAT_STORE_DWORD %67, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp9)
    ; Second access: the same load/extract/shift/store sequence on the word
    ; 4 bytes after the first one, again shifting by %84.
395    %37 = S_ADD_U32 %14, 4, implicit-def $scc
396    %38 = S_ADDC_U32 %15, 0, implicit-def dead $scc, implicit $scc
397    %71 = COPY killed %37
398    %72 = COPY killed %38
399    %41 = REG_SEQUENCE killed %71, %subreg.sub0, killed %72, %subreg.sub1
400    %40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.scevgep)
401    %73 = V_BFE_U32 %40, 8, 8, implicit $exec
402    %74 = V_LSHLREV_B32_e32 %84, killed %73, implicit $exec
403    %83 = V_ADD_CO_U32_e32 %7.sub0, %74, implicit-def $vcc, implicit $exec
404    %78 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec
405    %80 = REG_SEQUENCE %83, %subreg.sub0, killed %78, %subreg.sub1
406    FLAT_STORE_DWORD %80, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp17)
    ; Advance the loop offset by 8 and exit to bb.1 once its low half (%55)
    ; equals 4096; otherwise take the back edge.
407    %55 = S_ADD_U32 %0.sub0, 8, implicit-def $scc
408    %56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead $scc, implicit $scc
409    %57 = REG_SEQUENCE %55, %subreg.sub0, killed %56, %subreg.sub1
410    %1 = COPY %57
411    S_CMPK_EQ_I32 %55, 4096, implicit-def $scc
412    S_CBRANCH_SCC1 %bb.1.bb1, implicit $scc
413    S_BRANCH %bb.2.bb2
414
415...
416