; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,+slow-unaligned-mem-32 | FileCheck %s --check-prefix=AVXSLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,-slow-unaligned-mem-32 | FileCheck %s --check-prefix=AVXFAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX2

; Don't generate an unaligned 32-byte load on this test if that is slower than two 16-byte loads.

define <8 x float> @load32bytes(<8 x float>* %Ap) {
; AVXSLOW-LABEL: load32bytes:
; AVXSLOW:       # %bb.0:
; AVXSLOW-NEXT:    vmovaps (%rdi), %xmm0
; AVXSLOW-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: load32bytes:
; AVXFAST:       # %bb.0:
; AVXFAST-NEXT:    vmovups (%rdi), %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: load32bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    retq
  %A = load <8 x float>, <8 x float>* %Ap, align 16
  ret <8 x float> %A
}

; Don't generate an unaligned 32-byte store on this test if that is slower than two 16-byte stores.

define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
; AVXSLOW-LABEL: store32bytes:
; AVXSLOW:       # %bb.0:
; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, 16(%rdi)
; AVXSLOW-NEXT:    vmovaps %xmm0, (%rdi)
; AVXSLOW-NEXT:    vzeroupper
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: store32bytes:
; AVXFAST:       # %bb.0:
; AVXFAST-NEXT:    vmovups %ymm0, (%rdi)
; AVXFAST-NEXT:    vzeroupper
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: store32bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  store <8 x float> %A, <8 x float>* %P, align 16
  ret void
}

; Merge two consecutive 16-byte subvector loads into a single 32-byte load if it's faster.

define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
; AVXSLOW-LABEL: combine_16_byte_loads_no_intrinsic:
; AVXSLOW:       # %bb.0:
; AVXSLOW-NEXT:    vmovups 48(%rdi), %xmm0
; AVXSLOW-NEXT:    vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_no_intrinsic:
; AVXFAST:       # %bb.0:
; AVXFAST-NEXT:    vmovups 48(%rdi), %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_no_intrinsic:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups 48(%rdi), %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %v3
}

; If the first load is 32-byte aligned, then the loads should be merged in all cases.

define <8 x float> @combine_16_byte_loads_aligned(<4 x float>* %ptr) {
; AVXSLOW-LABEL: combine_16_byte_loads_aligned:
; AVXSLOW:       # %bb.0:
; AVXSLOW-NEXT:    vmovaps 48(%rdi), %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_aligned:
; AVXFAST:       # %bb.0:
; AVXFAST-NEXT:    vmovaps 48(%rdi), %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_aligned:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps 48(%rdi), %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
  %v1 = load <4 x float>, <4 x float>* %ptr1, align 32
  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %v3
}

; Swap the order of the shufflevector operands to ensure that the pattern still matches.

define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
; AVXSLOW-LABEL: combine_16_byte_loads_no_intrinsic_swap:
; AVXSLOW:       # %bb.0:
; AVXSLOW-NEXT:    vmovups 64(%rdi), %xmm0
; AVXSLOW-NEXT:    vinsertf128 $1, 80(%rdi), %ymm0, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_no_intrinsic_swap:
; AVXFAST:       # %bb.0:
; AVXFAST-NEXT:    vmovups 64(%rdi), %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_no_intrinsic_swap:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups 64(%rdi), %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %v3
}

; Check each element type other than float to make sure it is handled correctly.
; Use the loaded values with an 'add' to make sure we're using the correct load type.
; Don't generate 32-byte loads for integer ops unless we have AVX2.

define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_i64:
; AVXSLOW:       # %bb.0:
; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXSLOW-NEXT:    vpaddq 96(%rdi), %xmm1, %xmm1
; AVXSLOW-NEXT:    vpaddq 80(%rdi), %xmm0, %xmm0
; AVXSLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_i64:
; AVXFAST:       # %bb.0:
; AVXFAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXFAST-NEXT:    vpaddq 96(%rdi), %xmm1, %xmm1
; AVXFAST-NEXT:    vpaddq 80(%rdi), %xmm0, %xmm0
; AVXFAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq 80(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 5
  %ptr2 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 6
  %v1 = load <2 x i64>, <2 x i64>* %ptr1, align 1
  %v2 = load <2 x i64>, <2 x i64>* %ptr2, align 1
  %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v4 = add <4 x i64> %v3, %x
  ret <4 x i64> %v4
}

define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_i32:
; AVXSLOW:       # %bb.0:
; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXSLOW-NEXT:    vpaddd 112(%rdi), %xmm1, %xmm1
; AVXSLOW-NEXT:    vpaddd 96(%rdi), %xmm0, %xmm0
; AVXSLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_i32:
; AVXFAST:       # %bb.0:
; AVXFAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXFAST-NEXT:    vpaddd 112(%rdi), %xmm1, %xmm1
; AVXFAST-NEXT:    vpaddd 96(%rdi), %xmm0, %xmm0
; AVXFAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd 96(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 6
  %ptr2 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 7
  %v1 = load <4 x i32>, <4 x i32>* %ptr1, align 1
  %v2 = load <4 x i32>, <4 x i32>* %ptr2, align 1
  %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v4 = add <8 x i32> %v3, %x
  ret <8 x i32> %v4
}

define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_i16:
; AVXSLOW:       # %bb.0:
; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXSLOW-NEXT:    vpaddw 128(%rdi), %xmm1, %xmm1
; AVXSLOW-NEXT:    vpaddw 112(%rdi), %xmm0, %xmm0
; AVXSLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_i16:
; AVXFAST:       # %bb.0:
; AVXFAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXFAST-NEXT:    vpaddw 128(%rdi), %xmm1, %xmm1
; AVXFAST-NEXT:    vpaddw 112(%rdi), %xmm0, %xmm0
; AVXFAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddw 112(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 7
  %ptr2 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 8
  %v1 = load <8 x i16>, <8 x i16>* %ptr1, align 1
  %v2 = load <8 x i16>, <8 x i16>* %ptr2, align 1
  %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %v4 = add <16 x i16> %v3, %x
  ret <16 x i16> %v4
}

define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_i8:
; AVXSLOW:       # %bb.0:
; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXSLOW-NEXT:    vpaddb 144(%rdi), %xmm1, %xmm1
; AVXSLOW-NEXT:    vpaddb 128(%rdi), %xmm0, %xmm0
; AVXSLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_i8:
; AVXFAST:       # %bb.0:
; AVXFAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXFAST-NEXT:    vpaddb 144(%rdi), %xmm1, %xmm1
; AVXFAST-NEXT:    vpaddb 128(%rdi), %xmm0, %xmm0
; AVXFAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddb 128(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 8
  %ptr2 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 9
  %v1 = load <16 x i8>, <16 x i8>* %ptr1, align 1
  %v2 = load <16 x i8>, <16 x i8>* %ptr2, align 1
  %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %v4 = add <32 x i8> %v3, %x
  ret <32 x i8> %v4
}

define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_double:
; AVXSLOW:       # %bb.0:
; AVXSLOW-NEXT:    vmovups 144(%rdi), %xmm1
; AVXSLOW-NEXT:    vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
; AVXSLOW-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_double:
; AVXFAST:       # %bb.0:
; AVXFAST-NEXT:    vaddpd 144(%rdi), %ymm0, %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_double:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd 144(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 9
  %ptr2 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 10
  %v1 = load <2 x double>, <2 x double>* %ptr1, align 1
  %v2 = load <2 x double>, <2 x double>* %ptr2, align 1
  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v4 = fadd <4 x double> %v3, %x
  ret <4 x double> %v4
}