; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 --check-prefix=CHECK
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL --check-prefix=CHECK

; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
; because that is slower than two 16-byte loads.
; Other AVX-capable chips don't have that problem.

define <8 x float> @load32bytes(<8 x float>* %Ap) {
  ; CHECK-LABEL: load32bytes

  ; SANDYB: vmovaps
  ; SANDYB: vinsertf128
  ; SANDYB: retq

  ; BTVER2: vmovups
  ; BTVER2: retq

  ; HASWELL: vmovups
  ; HASWELL: retq

  %A = load <8 x float>, <8 x float>* %Ap, align 16
  ret <8 x float> %A
}

; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
; because that is slower than two 16-byte stores.
; Other AVX-capable chips don't have that problem.

define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
  ; CHECK-LABEL: store32bytes

  ; SANDYB: vextractf128
  ; SANDYB: vmovaps
  ; SANDYB: retq

  ; BTVER2: vmovups
  ; BTVER2: retq

  ; HASWELL: vmovups
  ; HASWELL: retq

  store <8 x float> %A, <8 x float>* %P, align 16
  ret void
}

; Merge two consecutive 16-byte subvector loads into a single 32-byte load
; if it's faster.

define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
  ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic

  ; SANDYB: vmovups
  ; SANDYB-NEXT: vinsertf128
  ; SANDYB-NEXT: retq

  ; BTVER2: vmovups
  ; BTVER2-NEXT: retq

  ; HASWELL: vmovups
  ; HASWELL-NEXT: retq

  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %v3
}

; Swap the order of the shufflevector operands to ensure that the
; pattern still matches.
define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
  ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap

  ; SANDYB: vmovups
  ; SANDYB-NEXT: vinsertf128
  ; SANDYB-NEXT: retq

  ; BTVER2: vmovups
  ; BTVER2-NEXT: retq

  ; HASWELL: vmovups
  ; HASWELL-NEXT: retq

  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %v3
}

; Check each element type other than float to make sure it is handled correctly.
; Use the loaded values with an 'add' to make sure we're using the correct load type.
; Even though BtVer2 has fast 32-byte loads, we should not generate those for
; 256-bit integer vectors because BtVer2 doesn't have AVX2.

define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
  ; CHECK-LABEL: combine_16_byte_loads_i64

  ; SANDYB: vextractf128
  ; SANDYB-NEXT: vpaddq
  ; SANDYB-NEXT: vpaddq
  ; SANDYB-NEXT: vinsertf128
  ; SANDYB-NEXT: retq

  ; BTVER2: vextractf128
  ; BTVER2-NEXT: vpaddq
  ; BTVER2-NEXT: vpaddq
  ; BTVER2-NEXT: vinsertf128
  ; BTVER2-NEXT: retq

  ; HASWELL-NOT: vextract
  ; HASWELL: vpaddq
  ; HASWELL-NEXT: retq

  %ptr1 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 5
  %ptr2 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 6
  %v1 = load <2 x i64>, <2 x i64>* %ptr1, align 1
  %v2 = load <2 x i64>, <2 x i64>* %ptr2, align 1
  %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v4 = add <4 x i64> %v3, %x
  ret <4 x i64> %v4
}

define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
  ; CHECK-LABEL: combine_16_byte_loads_i32

  ; SANDYB: vextractf128
  ; SANDYB-NEXT: vpaddd
  ; SANDYB-NEXT: vpaddd
  ; SANDYB-NEXT: vinsertf128
  ; SANDYB-NEXT: retq

  ; BTVER2: vextractf128
  ; BTVER2-NEXT: vpaddd
  ; BTVER2-NEXT: vpaddd
  ; BTVER2-NEXT: vinsertf128
  ; BTVER2-NEXT: retq

  ; HASWELL-NOT: vextract
  ; HASWELL: vpaddd
  ; HASWELL-NEXT: retq

  %ptr1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 6
  %ptr2 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 7
  %v1 = load <4 x i32>, <4 x i32>* %ptr1, align 1
  %v2 = load <4 x i32>, <4 x i32>* %ptr2, align 1
  %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v4 = add <8 x i32> %v3, %x
  ret <8 x i32> %v4
}

define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
  ; CHECK-LABEL: combine_16_byte_loads_i16

  ; SANDYB: vextractf128
  ; SANDYB-NEXT: vpaddw
  ; SANDYB-NEXT: vpaddw
  ; SANDYB-NEXT: vinsertf128
  ; SANDYB-NEXT: retq

  ; BTVER2: vextractf128
  ; BTVER2-NEXT: vpaddw
  ; BTVER2-NEXT: vpaddw
  ; BTVER2-NEXT: vinsertf128
  ; BTVER2-NEXT: retq

  ; HASWELL-NOT: vextract
  ; HASWELL: vpaddw
  ; HASWELL-NEXT: retq

  %ptr1 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 7
  %ptr2 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 8
  %v1 = load <8 x i16>, <8 x i16>* %ptr1, align 1
  %v2 = load <8 x i16>, <8 x i16>* %ptr2, align 1
  %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %v4 = add <16 x i16> %v3, %x
  ret <16 x i16> %v4
}

define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
  ; CHECK-LABEL: combine_16_byte_loads_i8

  ; SANDYB: vextractf128
  ; SANDYB-NEXT: vpaddb
  ; SANDYB-NEXT: vpaddb
  ; SANDYB-NEXT: vinsertf128
  ; SANDYB-NEXT: retq

  ; BTVER2: vextractf128
  ; BTVER2-NEXT: vpaddb
  ; BTVER2-NEXT: vpaddb
  ; BTVER2-NEXT: vinsertf128
  ; BTVER2-NEXT: retq

  ; HASWELL-NOT: vextract
  ; HASWELL: vpaddb
  ; HASWELL-NEXT: retq

  %ptr1 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 8
  %ptr2 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 9
  %v1 = load <16 x i8>, <16 x i8>* %ptr1, align 1
  %v2 = load <16 x i8>, <16 x i8>* %ptr2, align 1
  %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %v4 = add <32 x i8> %v3, %x
  ret <32 x i8> %v4
}

define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
  ; CHECK-LABEL: combine_16_byte_loads_double

  ; SANDYB: vmovupd
  ; SANDYB-NEXT: vinsertf128
  ; SANDYB-NEXT: vaddpd
  ; SANDYB-NEXT: retq

  ; BTVER2-NOT: vinsertf128
  ; BTVER2: vaddpd
  ; BTVER2-NEXT: retq

  ; HASWELL-NOT: vinsertf128
  ; HASWELL: vaddpd
  ; HASWELL-NEXT: retq

  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 9
  %ptr2 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 10
  %v1 = load <2 x double>, <2 x double>* %ptr1, align 1
  %v2 = load <2 x double>, <2 x double>* %ptr2, align 1
  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v4 = fadd <4 x double> %v3, %x
  ret <4 x double> %v4
}