1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s
3
; test1: unmasked load of a full <64 x i8> vector through an i8* argument.
; The pointer is bitcast to <64 x i8>* and loaded with align 1; the
; autogenerated assertions expect a single vmovdqu8 from (%rdi) into %zmm0.
4define <64 x i8> @test1(i8 * %addr) {
5; CHECK-LABEL: test1:
6; CHECK:       ## BB#0:
7; CHECK-NEXT:    vmovdqu8 (%rdi), %zmm0
8; CHECK-NEXT:    retq
9  %vaddr = bitcast i8* %addr to <64 x i8>*
10  %res = load <64 x i8>, <64 x i8>* %vaddr, align 1
11  ret <64 x i8>%res
12}
13
; test2: unmasked store of a full <64 x i8> vector through an i8* argument.
; The pointer is bitcast to <64 x i8>* and %data is stored with align 1; the
; autogenerated assertions expect a single vmovdqu8 of %zmm0 to (%rdi).
14define void @test2(i8 * %addr, <64 x i8> %data) {
15; CHECK-LABEL: test2:
16; CHECK:       ## BB#0:
17; CHECK-NEXT:    vmovdqu8 %zmm0, (%rdi)
18; CHECK-NEXT:    retq
19  %vaddr = bitcast i8* %addr to <64 x i8>*
20  store <64 x i8>%data, <64 x i8>* %vaddr, align 1
21  ret void
22}
23
; test3: load of <64 x i8> (align 1) merged with %old under a per-lane mask.
; The mask is computed as (%mask1 != 0) per byte; the select returns the
; loaded lane where the mask is true and the %old lane otherwise.
; The autogenerated assertions expect: a zeroed register (vpxord), a byte
; not-equal compare into %k1 (vpcmpneqb), and a vpblendmb that takes its
; memory operand from (%rdi) under %k1.
24define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
25; CHECK-LABEL: test3:
26; CHECK:       ## BB#0:
27; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
28; CHECK-NEXT:    vpcmpneqb %zmm2, %zmm1, %k1
29; CHECK-NEXT:    vpblendmb (%rdi), %zmm0, %zmm0 {%k1}
30; CHECK-NEXT:    retq
31  %mask = icmp ne <64 x i8> %mask1, zeroinitializer
32  %vaddr = bitcast i8* %addr to <64 x i8>*
33  %r = load <64 x i8>, <64 x i8>* %vaddr, align 1
34  %res = select <64 x i1> %mask, <64 x i8> %r, <64 x i8> %old
35  ret <64 x i8>%res
36}
37
; test4: load of <64 x i8> (align 1) with masked-off lanes forced to zero.
; The mask is computed as (%mask1 != 0) per byte; the select returns the
; loaded lane where the mask is true and zero otherwise.
; The autogenerated assertions expect: a zeroed register (vpxord), a byte
; not-equal compare into %k1 (vpcmpneqb), and a vmovdqu8 load from (%rdi)
; carrying both the {%k1} and {z} decorations.
38define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) {
39; CHECK-LABEL: test4:
40; CHECK:       ## BB#0:
41; CHECK-NEXT:    vpxord %zmm1, %zmm1, %zmm1
42; CHECK-NEXT:    vpcmpneqb %zmm1, %zmm0, %k1
43; CHECK-NEXT:    vmovdqu8 (%rdi), %zmm0 {%k1} {z}
44; CHECK-NEXT:    retq
45  %mask = icmp ne <64 x i8> %mask1, zeroinitializer
46  %vaddr = bitcast i8* %addr to <64 x i8>*
47  %r = load <64 x i8>, <64 x i8>* %vaddr, align 1
48  %res = select <64 x i1> %mask, <64 x i8> %r, <64 x i8> zeroinitializer
49  ret <64 x i8>%res
50}
51
; test5: unmasked load of a full <32 x i16> vector through an i8* argument.
; The pointer is bitcast to <32 x i16>* and loaded with align 1; the
; autogenerated assertions expect a single vmovdqu16 from (%rdi) into %zmm0.
52define <32 x i16> @test5(i8 * %addr) {
53; CHECK-LABEL: test5:
54; CHECK:       ## BB#0:
55; CHECK-NEXT:    vmovdqu16 (%rdi), %zmm0
56; CHECK-NEXT:    retq
57  %vaddr = bitcast i8* %addr to <32 x i16>*
58  %res = load <32 x i16>, <32 x i16>* %vaddr, align 1
59  ret <32 x i16>%res
60}
61
; test6: unmasked store of a full <32 x i16> vector through an i8* argument.
; The pointer is bitcast to <32 x i16>* and %data is stored with align 1; the
; autogenerated assertions expect a single vmovdqu16 of %zmm0 to (%rdi).
62define void @test6(i8 * %addr, <32 x i16> %data) {
63; CHECK-LABEL: test6:
64; CHECK:       ## BB#0:
65; CHECK-NEXT:    vmovdqu16 %zmm0, (%rdi)
66; CHECK-NEXT:    retq
67  %vaddr = bitcast i8* %addr to <32 x i16>*
68  store <32 x i16>%data, <32 x i16>* %vaddr, align 1
69  ret void
70}
71
; test7: load of <32 x i16> (align 1) merged with %old under a per-lane mask.
; The mask is computed as (%mask1 != 0) per 16-bit lane; the select returns
; the loaded lane where the mask is true and the %old lane otherwise.
; The autogenerated assertions expect: a zeroed register (vpxord), a word
; not-equal compare into %k1 (vpcmpneqw), and a vpblendmw that takes its
; memory operand from (%rdi) under %k1.
72define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
73; CHECK-LABEL: test7:
74; CHECK:       ## BB#0:
75; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
76; CHECK-NEXT:    vpcmpneqw %zmm2, %zmm1, %k1
77; CHECK-NEXT:    vpblendmw (%rdi), %zmm0, %zmm0 {%k1}
78; CHECK-NEXT:    retq
79  %mask = icmp ne <32 x i16> %mask1, zeroinitializer
80  %vaddr = bitcast i8* %addr to <32 x i16>*
81  %r = load <32 x i16>, <32 x i16>* %vaddr, align 1
82  %res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> %old
83  ret <32 x i16>%res
84}
85
; test8: load of <32 x i16> (align 1) with masked-off lanes forced to zero.
; The mask is computed as (%mask1 != 0) per 16-bit lane; the select returns
; the loaded lane where the mask is true and zero otherwise.
; The autogenerated assertions expect: a zeroed register (vpxord), a word
; not-equal compare into %k1 (vpcmpneqw), and a vmovdqu16 load from (%rdi)
; carrying both the {%k1} and {z} decorations.
86define <32 x i16> @test8(i8 * %addr, <32 x i16> %mask1) {
87; CHECK-LABEL: test8:
88; CHECK:       ## BB#0:
89; CHECK-NEXT:    vpxord %zmm1, %zmm1, %zmm1
90; CHECK-NEXT:    vpcmpneqw %zmm1, %zmm0, %k1
91; CHECK-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
92; CHECK-NEXT:    retq
93  %mask = icmp ne <32 x i16> %mask1, zeroinitializer
94  %vaddr = bitcast i8* %addr to <32 x i16>*
95  %r = load <32 x i16>, <32 x i16>* %vaddr, align 1
96  %res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> zeroinitializer
97  ret <32 x i16>%res
98}
99
; test_mask_load_16xi8: masked load of <16 x i8> via @llvm.masked.load.v16i8,
; called with i32 4 as its second operand (the alignment operand, per the
; LangRef for this intrinsic form) and an undef pass-through vector.
; The %val argument is not referenced in the body.
; The autogenerated assertions expect the <16 x i1> mask to be turned into a
; k-register (vpsllw $7 on %xmm0, then vpmovb2m on %zmm0), a
; kshiftlq/kshiftrq pair by 48 producing %k1, and then a zero-masked
; vmovdqu8 load into the full %zmm0.
; NOTE(review): the shift pair presumably clears the mask bits above the low
; 16 lanes because the operation is carried out at 512-bit width -- confirm.
100define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
101; CHECK-LABEL: test_mask_load_16xi8:
102; CHECK:       ## BB#0:
103; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0
104; CHECK-NEXT:    vpmovb2m %zmm0, %k0
105; CHECK-NEXT:    kshiftlq $48, %k0, %k0
106; CHECK-NEXT:    kshiftrq $48, %k0, %k1
107; CHECK-NEXT:    vmovdqu8 (%rdi), %zmm0 {%k1} {z}
108; CHECK-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
109; CHECK-NEXT:    retq
110  %res = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %addr, i32 4, <16 x i1>%mask, <16 x i8> undef)
111  ret <16 x i8> %res
112}
; Declaration of the masked-load intrinsic called by test_mask_load_16xi8.
113declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
114
; test_mask_load_32xi8: masked load of <32 x i8> via @llvm.masked.load.v32i8,
; called with i32 4 as its second operand (the alignment operand, per the
; LangRef for this intrinsic form) and a zeroinitializer pass-through vector.
; The %val argument is not referenced in the body.
; The autogenerated assertions expect the <32 x i1> mask to be turned into a
; k-register (vpsllw $7 on %ymm0, then vpmovb2m on %zmm0), a
; kshiftlq/kshiftrq pair by 32 producing %k1, and then a zero-masked
; vmovdqu8 load into the full %zmm0.
; NOTE(review): the shift pair presumably clears the mask bits above the low
; 32 lanes because the operation is carried out at 512-bit width -- confirm.
115define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
116; CHECK-LABEL: test_mask_load_32xi8:
117; CHECK:       ## BB#0:
118; CHECK-NEXT:    vpsllw $7, %ymm0, %ymm0
119; CHECK-NEXT:    vpmovb2m %zmm0, %k0
120; CHECK-NEXT:    kshiftlq $32, %k0, %k0
121; CHECK-NEXT:    kshiftrq $32, %k0, %k1
122; CHECK-NEXT:    vmovdqu8 (%rdi), %zmm0 {%k1} {z}
123; CHECK-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
124; CHECK-NEXT:    retq
125  %res = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> zeroinitializer)
126  ret <32 x i8> %res
127}
; Declaration of the masked-load intrinsic called by test_mask_load_32xi8.
128declare <32 x i8> @llvm.masked.load.v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
129
; test_mask_load_8xi16: masked load of <8 x i16> via @llvm.masked.load.v8i16,
; called with i32 4 as its second operand (the alignment operand, per the
; LangRef for this intrinsic form) and an undef pass-through vector.
; The %val argument is not referenced in the body.
; The autogenerated assertions expect the <8 x i1> mask to be turned into a
; k-register (vpsllw $15 on %xmm0, then vpmovw2m on %zmm0), a
; kshiftld/kshiftrd pair by 24 producing %k1, and then a zero-masked
; vmovdqu16 load into the full %zmm0.
; NOTE(review): the shift pair presumably clears the mask bits above the low
; 8 lanes because the operation is carried out at 512-bit width -- confirm.
130define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
131; CHECK-LABEL: test_mask_load_8xi16:
132; CHECK:       ## BB#0:
133; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0
134; CHECK-NEXT:    vpmovw2m %zmm0, %k0
135; CHECK-NEXT:    kshiftld $24, %k0, %k0
136; CHECK-NEXT:    kshiftrd $24, %k0, %k1
137; CHECK-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
138; CHECK-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
139; CHECK-NEXT:    retq
140  %res = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %addr, i32 4, <8 x i1>%mask, <8 x i16> undef)
141  ret <8 x i16> %res
142}
; Declaration of the masked-load intrinsic called by test_mask_load_8xi16.
143declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
144
; test_mask_load_16xi16: masked load of <16 x i16> via
; @llvm.masked.load.v16i16, called with i32 4 as its second operand (the
; alignment operand, per the LangRef for this intrinsic form) and a
; zeroinitializer pass-through vector.
; The %val argument is not referenced in the body.
; The autogenerated assertions expect the <16 x i1> mask to be turned into a
; k-register with byte-granular instructions (vpsllw $7 on %xmm0, then
; vpmovb2m on %zmm0), a kshiftld/kshiftrd pair by 16 producing %k1, and then
; a zero-masked vmovdqu16 load into the full %zmm0.
; NOTE(review): the byte-granular conversion suggests the <16 x i1> mask
; arrives as 16 bytes in %xmm0 -- confirm against the calling convention.
145define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
146; CHECK-LABEL: test_mask_load_16xi16:
147; CHECK:       ## BB#0:
148; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0
149; CHECK-NEXT:    vpmovb2m %zmm0, %k0
150; CHECK-NEXT:    kshiftld $16, %k0, %k0
151; CHECK-NEXT:    kshiftrd $16, %k0, %k1
152; CHECK-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
153; CHECK-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
154; CHECK-NEXT:    retq
155  %res = call <16 x i16> @llvm.masked.load.v16i16(<16 x i16>* %addr, i32 4, <16 x i1>%mask, <16 x i16> zeroinitializer)
156  ret <16 x i16> %res
157}
; Declaration of the masked-load intrinsic called by test_mask_load_16xi16.
158declare <16 x i16> @llvm.masked.load.v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
159
; test_mask_store_16xi8: masked store of <16 x i8> %val to %addr via
; @llvm.masked.store.v16i8, called with i32 4 as its third operand (the
; alignment operand, per the LangRef for this intrinsic form).
; The autogenerated assertions expect the <16 x i1> mask to be turned into a
; k-register (vpsllw $7 on %xmm0, then vpmovb2m on %zmm0), a
; kshiftlq/kshiftrq pair by 48 producing %k1, and then a vmovdqu8 store of
; %zmm1 to (%rdi) under %k1.
; NOTE(review): the shift pair presumably limits the store to the low 16
; lanes because the operation is carried out at 512-bit width -- confirm.
160define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
161; CHECK-LABEL: test_mask_store_16xi8:
162; CHECK:       ## BB#0:
163; CHECK-NEXT:    ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
164; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0
165; CHECK-NEXT:    vpmovb2m %zmm0, %k0
166; CHECK-NEXT:    kshiftlq $48, %k0, %k0
167; CHECK-NEXT:    kshiftrq $48, %k0, %k1
168; CHECK-NEXT:    vmovdqu8 %zmm1, (%rdi) {%k1}
169; CHECK-NEXT:    retq
170  call void @llvm.masked.store.v16i8(<16 x i8> %val, <16 x i8>* %addr, i32 4, <16 x i1>%mask)
171  ret void
172}
; Declaration of the masked-store intrinsic called by test_mask_store_16xi8.
173declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
174
; test_mask_store_32xi8: masked store of <32 x i8> %val to %addr via
; @llvm.masked.store.v32i8, called with i32 4 as its third operand (the
; alignment operand, per the LangRef for this intrinsic form).
; The autogenerated assertions expect the <32 x i1> mask to be turned into a
; k-register (vpsllw $7 on %ymm0, then vpmovb2m on %zmm0), a
; kshiftlq/kshiftrq pair by 32 producing %k1, and then a vmovdqu8 store of
; %zmm1 to (%rdi) under %k1.
; NOTE(review): the shift pair presumably limits the store to the low 32
; lanes because the operation is carried out at 512-bit width -- confirm.
175define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
176; CHECK-LABEL: test_mask_store_32xi8:
177; CHECK:       ## BB#0:
178; CHECK-NEXT:    ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
179; CHECK-NEXT:    vpsllw $7, %ymm0, %ymm0
180; CHECK-NEXT:    vpmovb2m %zmm0, %k0
181; CHECK-NEXT:    kshiftlq $32, %k0, %k0
182; CHECK-NEXT:    kshiftrq $32, %k0, %k1
183; CHECK-NEXT:    vmovdqu8 %zmm1, (%rdi) {%k1}
184; CHECK-NEXT:    retq
185  call void @llvm.masked.store.v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask)
186  ret void
187}
; Declaration of the masked-store intrinsic called by test_mask_store_32xi8.
188declare void @llvm.masked.store.v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>)
189
; test_mask_store_8xi16: masked store of <8 x i16> %val to %addr via
; @llvm.masked.store.v8i16, called with i32 4 as its third operand (the
; alignment operand, per the LangRef for this intrinsic form).
; The autogenerated assertions expect the <8 x i1> mask to be turned into a
; k-register (vpsllw $15 on %xmm0, then vpmovw2m on %zmm0), a
; kshiftld/kshiftrd pair by 24 producing %k1, and then a vmovdqu16 store of
; %zmm1 to (%rdi) under %k1.
; NOTE(review): the shift pair presumably limits the store to the low 8
; lanes because the operation is carried out at 512-bit width -- confirm.
190define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
191; CHECK-LABEL: test_mask_store_8xi16:
192; CHECK:       ## BB#0:
193; CHECK-NEXT:    ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
194; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0
195; CHECK-NEXT:    vpmovw2m %zmm0, %k0
196; CHECK-NEXT:    kshiftld $24, %k0, %k0
197; CHECK-NEXT:    kshiftrd $24, %k0, %k1
198; CHECK-NEXT:    vmovdqu16 %zmm1, (%rdi) {%k1}
199; CHECK-NEXT:    retq
200  call void @llvm.masked.store.v8i16(<8 x i16> %val, <8 x i16>* %addr, i32 4, <8 x i1>%mask)
201  ret void
202}
; Declaration of the masked-store intrinsic called by test_mask_store_8xi16.
203declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
204
; test_mask_store_16xi16: masked store of <16 x i16> %val to %addr via
; @llvm.masked.store.v16i16, called with i32 4 as its third operand (the
; alignment operand, per the LangRef for this intrinsic form).
; The autogenerated assertions expect the <16 x i1> mask to be turned into a
; k-register with byte-granular instructions (vpsllw $7 on %xmm0, then
; vpmovb2m on %zmm0), a kshiftld/kshiftrd pair by 16 producing %k1, and then
; a vmovdqu16 store of %zmm1 to (%rdi) under %k1.
; NOTE(review): the byte-granular conversion suggests the <16 x i1> mask
; arrives as 16 bytes in %xmm0 -- confirm against the calling convention.
205define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
206; CHECK-LABEL: test_mask_store_16xi16:
207; CHECK:       ## BB#0:
208; CHECK-NEXT:    ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
209; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0
210; CHECK-NEXT:    vpmovb2m %zmm0, %k0
211; CHECK-NEXT:    kshiftld $16, %k0, %k0
212; CHECK-NEXT:    kshiftrd $16, %k0, %k1
213; CHECK-NEXT:    vmovdqu16 %zmm1, (%rdi) {%k1}
214; CHECK-NEXT:    retq
215  call void @llvm.masked.store.v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask)
216  ret void
217}
; Declaration of the masked-store intrinsic called by test_mask_store_16xi16.
218declare void @llvm.masked.store.v16i16(<16 x i16>, <16 x i16>*, i32, <16 x i1>)
219