; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s

; Test based on pr5626: loads and stores of illegally-sized vectors
; (e.g. <3 x i32>) are widened for the arithmetic, but the stores must
; still write exactly the vector's size in memory.

%i32vec3 = type <3 x i32>
define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; CHECK-LABEL: add3i32:
; CHECK:       # BB#0:
; CHECK-NEXT:    movdqa (%rsi), %xmm0
; CHECK-NEXT:    paddd (%rdx), %xmm0
; CHECK-NEXT:    pextrd $2, %xmm0, 8(%rdi)
; CHECK-NEXT:    movq %xmm0, (%rdi)
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    retq
  %a = load %i32vec3, %i32vec3* %ap, align 16
  %b = load %i32vec3, %i32vec3* %bp, align 16
  %x = add %i32vec3 %a, %b
  store %i32vec3 %x, %i32vec3* %ret, align 16
  ret void
}

define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; CHECK-LABEL: add3i32_2:
; CHECK:       # BB#0:
; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    pinsrd $2, 8(%rsi), %xmm0
; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT:    pinsrd $2, 8(%rdx), %xmm1
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    pextrd $2, %xmm1, 8(%rdi)
; CHECK-NEXT:    movq %xmm1, (%rdi)
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    retq
  %a = load %i32vec3, %i32vec3* %ap, align 8
  %b = load %i32vec3, %i32vec3* %bp, align 8
  %x = add %i32vec3 %a, %b
  store %i32vec3 %x, %i32vec3* %ret, align 8
  ret void
}

%i32vec7 = type <7 x i32>
define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
; CHECK-LABEL: add7i32:
; CHECK:       # BB#0:
; CHECK-NEXT:    movdqa (%rsi), %xmm0
; CHECK-NEXT:    movdqa 16(%rsi), %xmm1
; CHECK-NEXT:    paddd (%rdx), %xmm0
; CHECK-NEXT:    paddd 16(%rdx), %xmm1
; CHECK-NEXT:    pextrd $2, %xmm1, 24(%rdi)
; CHECK-NEXT:    movq %xmm1, 16(%rdi)
; CHECK-NEXT:    movdqa %xmm0, (%rdi)
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    retq
  %a = load %i32vec7, %i32vec7* %ap, align 16
  %b = load %i32vec7, %i32vec7* %bp, align 16
  %x = add %i32vec7 %a, %b
  store %i32vec7 %x, %i32vec7* %ret, align 16
  ret void
}

%i32vec12 = type <12 x i32>
define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
; CHECK-LABEL: add12i32:
; CHECK:       # BB#0:
; CHECK-NEXT:    movdqa (%rsi), %xmm0
; CHECK-NEXT:    movdqa 16(%rsi), %xmm1
; CHECK-NEXT:    movdqa 32(%rsi), %xmm2
; CHECK-NEXT:    paddd (%rdx), %xmm0
; CHECK-NEXT:    paddd 16(%rdx), %xmm1
; CHECK-NEXT:    paddd 32(%rdx), %xmm2
; CHECK-NEXT:    movdqa %xmm2, 32(%rdi)
; CHECK-NEXT:    movdqa %xmm1, 16(%rdi)
; CHECK-NEXT:    movdqa %xmm0, (%rdi)
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    retq
  %a = load %i32vec12, %i32vec12* %ap, align 16
  %b = load %i32vec12, %i32vec12* %bp, align 16
  %x = add %i32vec12 %a, %b
  store %i32vec12 %x, %i32vec12* %ret, align 16
  ret void
}


%i16vec3 = type <3 x i16>
define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
; CHECK-LABEL: add3i16:
; CHECK:       # BB#0:
; CHECK-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; CHECK-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    pextrw $4, %xmm1, 4(%rdi)
; CHECK-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; CHECK-NEXT:    movd %xmm0, (%rdi)
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    retq
  %a = load %i16vec3, %i16vec3* %ap, align 16
  %b = load %i16vec3, %i16vec3* %bp, align 16
  %x = add %i16vec3 %a, %b
  store %i16vec3 %x, %i16vec3* %ret, align 16
  ret void
}
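
; <4 x i16> is exactly 8 bytes, so no partial stores are needed: both
; operands are loaded and the result is stored with a single 64-bit movq.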
%i16vec4 = type <4 x i16>
define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
; CHECK-LABEL: add4i16:
; CHECK:       # BB#0:
; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT:    paddw %xmm0, %xmm1
; CHECK-NEXT:    movq %xmm1, (%rdi)
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    retq
  %a = load %i16vec4, %i16vec4* %ap, align 16
  %b = load %i16vec4, %i16vec4* %bp, align 16
  %x = add %i16vec4 %a, %b
  store %i16vec4 %x, %i16vec4* %ret, align 16
  ret void
}

%i16vec12 = type <12 x i16>
define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
; CHECK-LABEL: add12i16:
; CHECK:       # BB#0:
; CHECK-NEXT:    movdqa (%rsi), %xmm0
; CHECK-NEXT:    movdqa 16(%rsi), %xmm1
; CHECK-NEXT:    paddw (%rdx), %xmm0
; CHECK-NEXT:    paddw 16(%rdx), %xmm1
; CHECK-NEXT:    movq %xmm1, 16(%rdi)
; CHECK-NEXT:    movdqa %xmm0, (%rdi)
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    retq
  %a = load %i16vec12, %i16vec12* %ap, align 16
  %b = load %i16vec12, %i16vec12* %bp, align 16
  %x = add %i16vec12 %a, %b
  store %i16vec12 %x, %i16vec12* %ret, align 16
  ret void
}

%i16vec18 = type <18 x i16>
define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
; CHECK-LABEL: add18i16:
; CHECK:       # BB#0:
; CHECK-NEXT:    movdqa (%rsi), %xmm0
; CHECK-NEXT:    movdqa 16(%rsi), %xmm1
; CHECK-NEXT:    movdqa 32(%rsi), %xmm2
; CHECK-NEXT:    paddw (%rdx), %xmm0
; CHECK-NEXT:    paddw 16(%rdx), %xmm1
; CHECK-NEXT:    paddw 32(%rdx), %xmm2
; CHECK-NEXT:    movd %xmm2, 32(%rdi)
; CHECK-NEXT:    movdqa %xmm1, 16(%rdi)
; CHECK-NEXT:    movdqa %xmm0, (%rdi)
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    retq
  %a = load %i16vec18, %i16vec18* %ap, align 16
  %b = load %i16vec18, %i16vec18* %bp, align 16
  %x = add %i16vec18 %a, %b
  store %i16vec18 %x, %i16vec18* %ret, align 16
  ret void
}


%i8vec3 = type <3 x i8>
define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
; CHECK-LABEL: add3i8:
; CHECK:       # BB#0:
; CHECK-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; CHECK-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    pextrb $8, %xmm1, 2(%rdi)
; CHECK-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; CHECK-NEXT:    movd %xmm0, %eax
; CHECK-NEXT:    movw %ax, (%rdi)
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    retq
  %a = load %i8vec3, %i8vec3* %ap, align 16
  %b = load %i8vec3, %i8vec3* %bp, align 16
  %x = add %i8vec3 %a, %b
  store %i8vec3 %x, %i8vec3* %ret, align 16
  ret void
}

%i8vec31 = type <31 x i8>
define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
; CHECK-LABEL: add31i8:
; CHECK:       # BB#0:
; CHECK-NEXT:    movdqa (%rsi), %xmm0
; CHECK-NEXT:    movdqa 16(%rsi), %xmm1
; CHECK-NEXT:    paddb (%rdx), %xmm0
; CHECK-NEXT:    paddb 16(%rdx), %xmm1
; CHECK-NEXT:    pextrb $14, %xmm1, 30(%rdi)
; CHECK-NEXT:    pextrw $6, %xmm1, 28(%rdi)
; CHECK-NEXT:    pextrd $2, %xmm1, 24(%rdi)
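; PLACEHOLDER-COMMENT-BLOCK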
; CHECK-NEXT:    movq %xmm1, 16(%rdi)
; CHECK-NEXT:    movdqa %xmm0, (%rdi)
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    retq
  %a = load %i8vec31, %i8vec31* %ap, align 16
  %b = load %i8vec31, %i8vec31* %bp, align 16
  %x = add %i8vec31 %a, %b
  store %i8vec31 %x, %i8vec31* %ret, align 16
  ret void
}


%i8vec3pack = type { <3 x i8>, i8 }
define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
; CHECK-LABEL: rot:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <158,158,158,u>
; CHECK-NEXT:    pshufb %xmm0, %xmm1
; CHECK-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; CHECK-NEXT:    movd %xmm1, %eax
; CHECK-NEXT:    movw %ax, (%rsi)
; CHECK-NEXT:    movb $-98, 2(%rsi)
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <1,1,1,u>
; CHECK-NEXT:    pshufb %xmm0, %xmm1
; CHECK-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; CHECK-NEXT:    movd %xmm0, %eax
; CHECK-NEXT:    movw %ax, (%rdx)
; CHECK-NEXT:    movb $1, 2(%rdx)
; CHECK-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; CHECK-NEXT:    movdqa %xmm0, %xmm1
; CHECK-NEXT:    psrld $1, %xmm1
; CHECK-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; CHECK-NEXT:    pextrb $8, %xmm1, 2(%rdi)
; CHECK-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; CHECK-NEXT:    movd %xmm0, %eax
; CHECK-NEXT:    movw %ax, (%rdi)
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    retq
entry:
  %storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
  store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp
  %storetmp1 = bitcast %i8vec3pack* %rot to <3 x i8>*
  store <3 x i8> <i8 1, i8 1, i8 1>, <3 x i8>* %storetmp1
  %tmp = load %i8vec3pack, %i8vec3pack* %X
  %extractVec = extractvalue %i8vec3pack %tmp, 0
  %tmp2 = load %i8vec3pack, %i8vec3pack* %rot
  %extractVec3 = extractvalue %i8vec3pack %tmp2, 0
  %shr = lshr <3 x i8> %extractVec, %extractVec3
  %storetmp4 = bitcast %i8vec3pack* %result to <3 x i8>*
  store <3 x i8> %shr, <3 x i8>* %storetmp4
  ret void
}