; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=X32
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=skx | FileCheck %s -check-prefix=X32
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=knl | FileCheck %s -check-prefix=WIN32
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=skx | FileCheck %s -check-prefix=WIN32
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=knl | FileCheck %s -check-prefixes=WIN64,WIN64-KNL
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=skx | FileCheck %s -check-prefixes=WIN64,WIN64-SKX
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefixes=X64,X64-KNL
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s -check-prefixes=X64,X64-SKX

declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
declare i32 @func_int(i32, i32)

; test calling conventions - input parameters
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: testf16_inp:
; X32:       ## %bb.0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    andl $-64, %esp
; X32-NEXT:    subl $192, %esp
; X32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, (%esp)
; X32-NEXT:    calll _func_float16_ptr
; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; WIN32-LABEL: testf16_inp:
; WIN32:       # %bb.0:
; WIN32-NEXT:    pushl %ebp
; WIN32-NEXT:    movl %esp, %ebp
; WIN32-NEXT:    andl $-64, %esp
; WIN32-NEXT:    subl $128, %esp
; WIN32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; WIN32-NEXT:    movl %esp, %eax
; WIN32-NEXT:    pushl %eax
; WIN32-NEXT:    calll _func_float16_ptr
; WIN32-NEXT:    addl $4, %esp
; WIN32-NEXT:    vaddps (%esp), %zmm0, %zmm0
; WIN32-NEXT:    movl %ebp, %esp
; WIN32-NEXT:    popl %ebp
; WIN32-NEXT:    retl
;
; WIN64-LABEL: testf16_inp:
; WIN64:       # %bb.0:
; WIN64-NEXT:    pushq %rbp
; WIN64-NEXT:    subq $176, %rsp
; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-NEXT:    andq $-64, %rsp
; WIN64-NEXT:    vmovaps (%rcx), %zmm0
; WIN64-NEXT:    vaddps (%rdx), %zmm0, %zmm0
; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-NEXT:    callq func_float16_ptr
; WIN64-NEXT:    vaddps {{[0-9]+}}(%rsp), %zmm0, %zmm0
; WIN64-NEXT:    leaq 48(%rbp), %rsp
; WIN64-NEXT:    popq %rbp
; WIN64-NEXT:    retq
;
; X64-LABEL: testf16_inp:
; X64:       ## %bb.0:
; X64-NEXT:    pushq %rbp
; X64-NEXT:    movq %rsp, %rbp
; X64-NEXT:    pushq %r13
; X64-NEXT:    pushq %r12
; X64-NEXT:    andq $-64, %rsp
; X64-NEXT:    subq $128, %rsp
; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; X64-NEXT:    movq %rsp, %rdi
; X64-NEXT:    callq _func_float16_ptr
; X64-NEXT:    vaddps (%rsp), %zmm0, %zmm0
; X64-NEXT:    leaq -16(%rbp), %rsp
; X64-NEXT:    popq %r12
; X64-NEXT:    popq %r13
; X64-NEXT:    popq %rbp
; X64-NEXT:    retq
  %y = alloca <16 x float>, align 64
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %2, %1
  ret <16 x float> %3
}

; test calling conventions - preserved registers

define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: testf16_regs:
; X32:       ## %bb.0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    andl $-64, %esp
; X32-NEXT:    subl $256, %esp ## imm = 0x100
; X32-NEXT:    vmovaps %zmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill
; X32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, (%esp)
; X32-NEXT:    calll _func_float16_ptr
; X32-NEXT:    vaddps {{[-0-9]+}}(%e{{[sb]}}p), %zmm0, %zmm0 ## 64-byte Folded Reload
; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; WIN32-LABEL: testf16_regs:
; WIN32:       # %bb.0:
; WIN32-NEXT:    pushl %ebp
; WIN32-NEXT:    movl %esp, %ebp
; WIN32-NEXT:    andl $-64, %esp
; WIN32-NEXT:    subl $192, %esp
; WIN32-NEXT:    vmovaps %zmm1, (%esp) # 64-byte Spill
; WIN32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
; WIN32-NEXT:    pushl %eax
; WIN32-NEXT:    calll _func_float16_ptr
; WIN32-NEXT:    addl $4, %esp
; WIN32-NEXT:    vaddps (%esp), %zmm0, %zmm0 # 64-byte Folded Reload
; WIN32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; WIN32-NEXT:    movl %ebp, %esp
; WIN32-NEXT:    popl %ebp
; WIN32-NEXT:    retl
;
; WIN64-LABEL: testf16_regs:
; WIN64:       # %bb.0:
; WIN64-NEXT:    pushq %rbp
; WIN64-NEXT:    subq $176, %rsp
; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-NEXT:    andq $-64, %rsp
; WIN64-NEXT:    vmovaps (%rdx), %zmm16
; WIN64-NEXT:    vaddps (%rcx), %zmm16, %zmm0
; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-NEXT:    callq func_float16_ptr
; WIN64-NEXT:    vaddps %zmm16, %zmm0, %zmm0
; WIN64-NEXT:    vaddps {{[0-9]+}}(%rsp), %zmm0, %zmm0
; WIN64-NEXT:    leaq 48(%rbp), %rsp
; WIN64-NEXT:    popq %rbp
; WIN64-NEXT:    retq
;
; X64-LABEL: testf16_regs:
; X64:       ## %bb.0:
; X64-NEXT:    pushq %rbp
; X64-NEXT:    movq %rsp, %rbp
; X64-NEXT:    pushq %r13
; X64-NEXT:    pushq %r12
; X64-NEXT:    andq $-64, %rsp
; X64-NEXT:    subq $128, %rsp
; X64-NEXT:    vmovaps %zmm1, %zmm16
; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; X64-NEXT:    movq %rsp, %rdi
; X64-NEXT:    callq _func_float16_ptr
; X64-NEXT:    vaddps %zmm16, %zmm0, %zmm0
; X64-NEXT:    vaddps (%rsp), %zmm0, %zmm0
; X64-NEXT:    leaq -16(%rbp), %rsp
; X64-NEXT:    popq %r12
; X64-NEXT:    popq %r13
; X64-NEXT:    popq %rbp
; X64-NEXT:    retq
  %y = alloca <16 x float>, align 64
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %1, %b
  %4 = fadd <16 x float> %2, %3
  ret <16 x float> %4
}

; test calling conventions - prolog and epilog
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: test_prolog_epilog:
; X32:       ## %bb.0:
; X32-NEXT:    subl $12, %esp
; X32-NEXT:    calll _func_float16
; X32-NEXT:    addl $12, %esp
; X32-NEXT:    retl
;
; WIN32-LABEL: test_prolog_epilog:
; WIN32:       # %bb.0:
; WIN32-NEXT:    calll _func_float16
; WIN32-NEXT:    retl
;
; WIN64-KNL-LABEL: test_prolog_epilog:
; WIN64-KNL:       # %bb.0:
; WIN64-KNL-NEXT:    pushq %rbp
; WIN64-KNL-NEXT:    subq $1264, %rsp # imm = 0x4F0
; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-KNL-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; WIN64-KNL-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; WIN64-KNL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; WIN64-KNL-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    andq $-64, %rsp
; WIN64-KNL-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%rsp)
; WIN64-KNL-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; WIN64-KNL-NEXT:    callq func_float16
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
; WIN64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; WIN64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; WIN64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; WIN64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; WIN64-KNL-NEXT:    leaq 1136(%rbp), %rsp
; WIN64-KNL-NEXT:    popq %rbp
; WIN64-KNL-NEXT:    retq
;
; WIN64-SKX-LABEL: test_prolog_epilog:
; WIN64-SKX:       # %bb.0:
; WIN64-SKX-NEXT:    pushq %rbp
; WIN64-SKX-NEXT:    subq $1264, %rsp # imm = 0x4F0
; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-SKX-NEXT:    kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; WIN64-SKX-NEXT:    kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; WIN64-SKX-NEXT:    kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; WIN64-SKX-NEXT:    kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    andq $-64, %rsp
; WIN64-SKX-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%rsp)
; WIN64-SKX-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; WIN64-SKX-NEXT:    callq func_float16
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
; WIN64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
; WIN64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; WIN64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 8-byte Reload
; WIN64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; WIN64-SKX-NEXT:    leaq 1136(%rbp), %rsp
; WIN64-SKX-NEXT:    popq %rbp
; WIN64-SKX-NEXT:    retq
;
; X64-KNL-LABEL: test_prolog_epilog:
; X64-KNL:       ## %bb.0:
; X64-KNL-NEXT:    pushq %rsi
; X64-KNL-NEXT:    subq $1072, %rsp ## imm = 0x430
; X64-KNL-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; X64-KNL-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; X64-KNL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; X64-KNL-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; X64-KNL-NEXT:    vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm16, (%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    callq _func_float16
; X64-KNL-NEXT:    vmovups (%rsp), %zmm16 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload
; X64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
; X64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
; X64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
; X64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
; X64-KNL-NEXT:    addq $1072, %rsp ## imm = 0x430
; X64-KNL-NEXT:    popq %rsi
; X64-KNL-NEXT:    retq
;
; X64-SKX-LABEL: test_prolog_epilog:
; X64-SKX:       ## %bb.0:
; X64-SKX-NEXT:    pushq %rsi
; X64-SKX-NEXT:    subq $1072, %rsp ## imm = 0x430
; X64-SKX-NEXT:    kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-SKX-NEXT:    kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-SKX-NEXT:    kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-SKX-NEXT:    kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-SKX-NEXT:    vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm16, (%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    callq _func_float16
; X64-SKX-NEXT:    vmovups (%rsp), %zmm16 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload
; X64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 8-byte Reload
; X64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 8-byte Reload
; X64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 8-byte Reload
; X64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 8-byte Reload
; X64-SKX-NEXT:    addq $1072, %rsp ## imm = 0x430
; X64-SKX-NEXT:    popq %rsi
; X64-SKX-NEXT:    retq
  %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
  ret <16 x float> %c
}


declare <16 x float> @func_float16_mask(<16 x float>, <16 x i1>)

define <16 x float> @testf16_inp_mask(<16 x float> %a, i16 %mask) {
; X32-LABEL: testf16_inp_mask:
; X32:       ## %bb.0:
; X32-NEXT:    subl $12, %esp
; X32-NEXT:    .cfi_def_cfa_offset 16
; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT:    calll _func_float16_mask
; X32-NEXT:    addl $12, %esp
; X32-NEXT:    retl
;
; WIN32-LABEL: testf16_inp_mask:
; WIN32:       # %bb.0:
; WIN32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; WIN32-NEXT:    calll _func_float16_mask
; WIN32-NEXT:    retl
;
; WIN64-KNL-LABEL: testf16_inp_mask:
; WIN64-KNL:       # %bb.0:
; WIN64-KNL-NEXT:    subq $40, %rsp
; WIN64-KNL-NEXT:    .seh_stackalloc 40
; WIN64-KNL-NEXT:    .seh_endprologue
; WIN64-KNL-NEXT:    # kill: def $dx killed $dx def $edx
; WIN64-KNL-NEXT:    vmovaps (%rcx), %zmm0
; WIN64-KNL-NEXT:    kmovw %edx, %k1
; WIN64-KNL-NEXT:    callq func_float16_mask
; WIN64-KNL-NEXT:    nop
; WIN64-KNL-NEXT:    addq $40, %rsp
; WIN64-KNL-NEXT:    retq
; WIN64-KNL-NEXT:    .seh_endproc
;
; WIN64-SKX-LABEL: testf16_inp_mask:
; WIN64-SKX:       # %bb.0:
; WIN64-SKX-NEXT:    subq $40, %rsp
; WIN64-SKX-NEXT:    .seh_stackalloc 40
; WIN64-SKX-NEXT:    .seh_endprologue
; WIN64-SKX-NEXT:    # kill: def $dx killed $dx def $edx
; WIN64-SKX-NEXT:    vmovaps (%rcx), %zmm0
; WIN64-SKX-NEXT:    kmovd %edx, %k1
; WIN64-SKX-NEXT:    callq func_float16_mask
; WIN64-SKX-NEXT:    nop
; WIN64-SKX-NEXT:    addq $40, %rsp
; WIN64-SKX-NEXT:    retq
; WIN64-SKX-NEXT:    .seh_endproc
;
; X64-KNL-LABEL: testf16_inp_mask:
; X64-KNL:       ## %bb.0:
; X64-KNL-NEXT:    pushq %rbp
; X64-KNL-NEXT:    .cfi_def_cfa_offset 16
; X64-KNL-NEXT:    pushq %r13
; X64-KNL-NEXT:    .cfi_def_cfa_offset 24
; X64-KNL-NEXT:    pushq %r12
; X64-KNL-NEXT:    .cfi_def_cfa_offset 32
; X64-KNL-NEXT:    .cfi_offset %r12, -32
; X64-KNL-NEXT:    .cfi_offset %r13, -24
; X64-KNL-NEXT:    .cfi_offset %rbp, -16
; X64-KNL-NEXT:    kmovw %edi, %k1
; X64-KNL-NEXT:    callq _func_float16_mask
; X64-KNL-NEXT:    popq %r12
; X64-KNL-NEXT:    popq %r13
; X64-KNL-NEXT:    popq %rbp
; X64-KNL-NEXT:    retq
;
; X64-SKX-LABEL: testf16_inp_mask:
; X64-SKX:       ## %bb.0:
; X64-SKX-NEXT:    pushq %rbp
; X64-SKX-NEXT:    .cfi_def_cfa_offset 16
; X64-SKX-NEXT:    pushq %r13
; X64-SKX-NEXT:    .cfi_def_cfa_offset 24
; X64-SKX-NEXT:    pushq %r12
; X64-SKX-NEXT:    .cfi_def_cfa_offset 32
; X64-SKX-NEXT:    .cfi_offset %r12, -32
; X64-SKX-NEXT:    .cfi_offset %r13, -24
; X64-SKX-NEXT:    .cfi_offset %rbp, -16
; X64-SKX-NEXT:    kmovd %edi, %k1
; X64-SKX-NEXT:    callq _func_float16_mask
; X64-SKX-NEXT:    popq %r12
; X64-SKX-NEXT:    popq %r13
; X64-SKX-NEXT:    popq %rbp
; X64-SKX-NEXT:    retq
  %imask = bitcast i16 %mask to <16 x i1>
  %1 = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %imask)
  ret <16 x float> %1
}

define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a, <16 x i32> %x1, <16 x i32> %x2, <16 x i1> %mask) nounwind {
; X32-LABEL: test_prolog_epilog_with_mask:
; X32:       ## %bb.0:
; X32-NEXT:    subl $12, %esp
; X32-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
; X32-NEXT:    kxorw %k1, %k0, %k1
; X32-NEXT:    calll _func_float16_mask
; X32-NEXT:    addl $12, %esp
; X32-NEXT:    retl
;
; WIN32-LABEL: test_prolog_epilog_with_mask:
; WIN32:       # %bb.0:
; WIN32-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
; WIN32-NEXT:    kxorw %k1, %k0, %k1
; WIN32-NEXT:    calll _func_float16_mask
; WIN32-NEXT:    retl
;
; WIN64-LABEL: test_prolog_epilog_with_mask:
; WIN64:       # %bb.0:
; WIN64-NEXT:    subq $40, %rsp
; WIN64-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
; WIN64-NEXT:    kxorw %k1, %k0, %k1
; WIN64-NEXT:    callq func_float16_mask
; WIN64-NEXT:    addq $40, %rsp
; WIN64-NEXT:    retq
;
; X64-LABEL: test_prolog_epilog_with_mask:
; X64:       ## %bb.0:
; X64-NEXT:    pushq %rax
; X64-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
; X64-NEXT:    kxorw %k1, %k0, %k1
; X64-NEXT:    callq _func_float16_mask
; X64-NEXT:    popq %rax
; X64-NEXT:    retq
  %cmp_res = icmp eq <16 x i32> %x1, %x2
  %mask1 = xor <16 x i1> %cmp_res, %mask
  %c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %mask1)
  ret <16 x float> %c
}