; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s


; There is no dependence between the store and the two loads. So we can combine
; the loads and schedule it freely.
;
; Layout (DS offsets in dwords): the <3 x float> load covers dwords 6-8
; (byte 24 onward) and only element 2 (dword 8) is used; the second load reads
; dword 7; the store writes dwords 26-27. The loaded and stored ranges are
; disjoint, so the two loads combine into one ds_read2 (offsets 7 and 8) and
; the write2 may be scheduled on either side of it (hence GCN-DAG).

; GCN-LABEL: {{^}}ds_combine_nodep

; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8
; GCN: s_waitcnt lgkmcnt({{[0-9]+}})
define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addrspace(3)* %inptr) {

  ; First load: <3 x float> starting at byte offset 24 (dwords 6-8).
  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  ; Only element 2 (dword 8) is live.
  %v0 = extractelement <3 x float> %load0, i32 2

  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1

  ; Store <1.0, 2.0> to dwords 26-27 -- does not alias either load.
  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  ; Second load: dword 7, adjacent to the live dword 8 of the first load.
  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}


; The store depends on the first load, so we could not move the first load down to combine with
; the second load directly. However, we can move the store after the combined load.
; GCN-LABEL: {{^}}ds_combine_WAR

; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27
; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
define amdgpu_kernel void @ds_combine_WAR(float addrspace(1)* %out, float addrspace(3)* %inptr) {

  ; First load: <3 x float> at byte offset 100 (dwords 25-27); only element 2
  ; (dword 27) is live. The store below writes dwords 26-27, which overlaps
  ; this load -> write-after-read dependence on the store.
  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1

  ; Store <1.0, 2.0> to dwords 26-27 (overlaps the first load's range).
  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  ; Second load: dword 7. Combining the loads (offsets 7 and 27) is legal as
  ; long as the store is moved below the combined ds_read2.
  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}


; The second load depends on the store. We can combine the two loads, and the combined load is
; at the original place of the second load.
; GCN-LABEL: {{^}}ds_combine_RAW

; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:26
define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {

  ; First load: <3 x float> at byte offset 24 (dwords 6-8); only element 2
  ; (dword 8) is live. This range does not overlap the store below.
  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1

  ; Store <1.0, 2.0> to dwords 26-27.
  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  ; Second load: dword 26, which the store just wrote -> read-after-write
  ; dependence. The combined ds_read2 (offsets 8 and 26) must stay after the
  ; write2, i.e. at the second load's original position.
  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}


; The store depends on the first load, also the second load depends on the store.
; So we can not combine the two loads.
; GCN-LABEL: {{^}}ds_combine_WAR_RAW

; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:108
; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
define amdgpu_kernel void @ds_combine_WAR_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {

  ; First load: <3 x float> at byte offset 100 (dwords 25-27); only element 2
  ; (dword 27, byte 108) is live. The store below writes dwords 26-27, which
  ; overlaps -> WAR: the store may not move above this load.
  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1

  ; Store <1.0, 2.0> to dwords 26-27.
  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  ; Second load: dword 26 (byte 104), which the store just wrote -> RAW: this
  ; load may not move above the store. With the store pinned between them, the
  ; two loads cannot be combined and stay as separate ds_read_b32s.
  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}