; Tests for the intel_ocl_bicc calling convention with AVX enabled:
; argument passing (registers vs. stack), callee-preserved ymm registers,
; prolog/epilog spill/reload of ymm6-15 (Win64) / ymm8-15 (SysV), and
; vzeroupper insertion around calls.
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck -check-prefix=X32 %s
; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx | FileCheck -check-prefix=X32 %s
; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx | FileCheck -check-prefix=WIN64 %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck -check-prefix=X64 %s

declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
declare i32 @func_int(i32, i32)

; WIN64-LABEL: testf16_inp
; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
; WIN64: leaq {{.*}}(%rsp), %rcx
; WIN64: call
; WIN64: ret

; X32-LABEL: testf16_inp
; X32: movl %eax, (%esp)
; X32: vaddps {{.*}}, {{%ymm[0-1]}}
; X32: vaddps {{.*}}, {{%ymm[0-1]}}
; X32: call
; X32: ret

; X64-LABEL: testf16_inp
; X64: vaddps {{.*}}, {{%ymm[0-1]}}
; X64: vaddps {{.*}}, {{%ymm[0-1]}}
; X64: leaq {{.*}}(%rsp), %rdi
; X64: call
; X64: ret

;test calling conventions - input parameters
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %2, %1
  ret <16 x float> %3
}

;test calling conventions - preserved registers

; preserved ymm6-ymm15
; WIN64-LABEL: testf16_regs
; WIN64: call
; WIN64: vaddps {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; WIN64: vaddps {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; WIN64: ret

; preserved ymm8-ymm15
; X64-LABEL: testf16_regs
; X64: call
; X64: vaddps {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; X64: vaddps {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; X64: ret

define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %1, %b
  %4 = fadd <16 x float> %2, %3
  ret <16 x float> %4
}

; test calling conventions - prolog and epilog
; WIN64-LABEL: test_prolog_epilog
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: call
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload

; X64-LABEL: test_prolog_epilog
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: call
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
  %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
  ret <16 x float> %c
}

; test functions with integer parameters
; pass parameters on stack for 32-bit platform
; X32-LABEL: test_int
; X32: movl {{.*}}, 4(%esp)
; X32: movl {{.*}}, (%esp)
; X32: call
; X32: addl {{.*}}, %eax

; pass parameters in registers for 64-bit platform
; X64-LABEL: test_int
; X64: leal {{.*}}, %edi
; X64: movl {{.*}}, %esi
; X64: call
; X64: addl {{.*}}, %eax
define i32 @test_int(i32 %a, i32 %b) nounwind {
  %c1 = add i32 %a, %b
  %c2 = call intel_ocl_bicc i32 @func_int(i32 %c1, i32 %a)
  %c = add i32 %c2, %b
  ret i32 %c
}

; WIN64-LABEL: test_float4
; WIN64-NOT: vzeroupper
; WIN64: call
; WIN64-NOT: vzeroupper
; WIN64: call
; WIN64: ret

; X64-LABEL: test_float4
; X64-NOT: vzeroupper
; X64: call
; X64-NOT: vzeroupper
; X64: call
; X64: ret

; X32-LABEL: test_float4
; X32: vzeroupper
; X32: call
; X32: vzeroupper
; X32: call
; X32: ret

declare <4 x float> @func_float4(<4 x float>, <4 x float>, <4 x float>)

define <8 x float> @test_float4(<8 x float> %a, <8 x float> %b, <8 x float> %c) nounwind readnone {
entry:
  %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %call.i = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %0, <4 x float> %1, <4 x float> %2) nounwind
  %3 = shufflevector <4 x float> %call.i, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %5 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %6 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %call.i2 = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %4, <4 x float> %5, <4 x float> %6) nounwind
  %7 = shufflevector <4 x float> %call.i2, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = shufflevector <8 x float> %3, <8 x float> %7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %8
}