; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math | FileCheck %s --check-prefix=CST --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+sse4.1 | FileCheck %s --check-prefix=CST --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx | FileCheck %s --check-prefix=CST --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL

; Checks the x86-64 lowering of vector uitofp (<4 x i32> and <8 x i32> to
; float vectors) for each of the feature sets selected by the RUN lines above.
;
; The non-AVX512 sequences checked below split every 32-bit lane into two
; 16-bit halves and rebuild the value with float arithmetic:
;   low  = (x & 0xffff) | 0x4b000000   -> the float 2^23 + lo
;   high = (x >> 16)    | 0x53000000   -> the float 2^39 + hi * 2^16
;   res  = (high - 0x53000080) + low   -> hi * 2^16 + lo
; where 0x53000080 is the float 2^39 + 2^23 (5.49764202E+11).

; Check that the constants used in the vectors are the right ones.
; SSE2: [[MASKCSTADDR:.LCPI[0-9_]+]]:
; SSE2-NEXT: .long 65535 # 0xffff
; SSE2-NEXT: .long 65535 # 0xffff
; SSE2-NEXT: .long 65535 # 0xffff
; SSE2-NEXT: .long 65535 # 0xffff

; CST: [[LOWCSTADDR:.LCPI[0-9_]+]]:
; CST-NEXT: .long 1258291200 # 0x4b000000
; CST-NEXT: .long 1258291200 # 0x4b000000
; CST-NEXT: .long 1258291200 # 0x4b000000
; CST-NEXT: .long 1258291200 # 0x4b000000

; CST: [[HIGHCSTADDR:.LCPI[0-9_]+]]:
; CST-NEXT: .long 1392508928 # 0x53000000
; CST-NEXT: .long 1392508928 # 0x53000000
; CST-NEXT: .long 1392508928 # 0x53000000
; CST-NEXT: .long 1392508928 # 0x53000000

; CST: [[MAGICCSTADDR:.LCPI[0-9_]+]]:
; CST-NEXT: .long 0x53000080 # float 5.49764202E+11
; CST-NEXT: .long 0x53000080 # float 5.49764202E+11
; CST-NEXT: .long 0x53000080 # float 5.49764202E+11
; CST-NEXT: .long 0x53000080 # float 5.49764202E+11

; AVX2: [[LOWCSTADDR:.LCPI[0-9_]+]]:
; AVX2-NEXT: .long 1258291200 # 0x4b000000

; AVX2: [[HIGHCSTADDR:.LCPI[0-9_]+]]:
; AVX2-NEXT: .long 1392508928 # 0x53000000

; AVX2: [[MAGICCSTADDR:.LCPI[0-9_]+]]:
; AVX2-NEXT: .long 0x53000080 # float 5.49764202E+11

define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
; SSE2-LABEL: test_uitofp_v4i32_to_v4f32:
; SSE2: movdqa [[MASKCSTADDR]](%rip), [[MASK:%xmm[0-9]+]]
; SSE2-NEXT: pand %xmm0, [[MASK]]
; After this instruction, MASK will have the value of the low parts
; of the vector.
; SSE2-NEXT: por [[LOWCSTADDR]](%rip), [[MASK]]
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: por [[HIGHCSTADDR]](%rip), %xmm0
; SSE2-NEXT: subps [[MAGICCSTADDR]](%rip), %xmm0
; SSE2-NEXT: addps [[MASK]], %xmm0
; SSE2-NEXT: retq
;
; Currently we commute the arguments of the first blend, but this could be
; improved to match the lowering of the second blend.
; SSE41-LABEL: test_uitofp_v4i32_to_v4f32:
; SSE41: movdqa [[LOWCSTADDR]](%rip), [[LOWVEC:%xmm[0-9]+]]
; SSE41-NEXT: pblendw $85, %xmm0, [[LOWVEC]]
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: pblendw $170, [[HIGHCSTADDR]](%rip), %xmm0
; SSE41-NEXT: subps [[MAGICCSTADDR]](%rip), %xmm0
; SSE41-NEXT: addps [[LOWVEC]], %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_uitofp_v4i32_to_v4f32:
; AVX: vpblendw $170, [[LOWCSTADDR]](%rip), %xmm0, [[LOWVEC:%xmm[0-9]+]]
; AVX-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]]
; AVX-NEXT: vpblendw $170, [[HIGHCSTADDR]](%rip), [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]]
; AVX-NEXT: vsubps [[MAGICCSTADDR]](%rip), [[HIGHVEC]], [[TMP:%xmm[0-9]+]]
; AVX-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0
; AVX-NEXT: retq
;
; The lowering for AVX2 is a bit messy, because we select broadcast
; instructions, instead of folding the constant loads.
; AVX2-LABEL: test_uitofp_v4i32_to_v4f32:
; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%xmm[0-9]+]]
; AVX2-NEXT: vpblendw $170, [[LOWCST]], %xmm0, [[LOWVEC:%xmm[0-9]+]]
; AVX2-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]]
; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%xmm[0-9]+]]
; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]]
; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%xmm[0-9]+]]
; AVX2-NEXT: vsubps [[MAGICCST]], [[HIGHVEC]], [[TMP:%xmm[0-9]+]]
; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_uitofp_v4i32_to_v4f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_uitofp_v4i32_to_v4f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; AVX512VL-NEXT: retq
  %tmp = uitofp <4 x i32> %arg to <4 x float>
  ret <4 x float> %tmp
}

; Match the AVX2 constants used in the next function
; AVX2: [[LOWCSTADDR:.LCPI[0-9_]+]]:
; AVX2-NEXT: .long 1258291200 # 0x4b000000

; AVX2: [[HIGHCSTADDR:.LCPI[0-9_]+]]:
; AVX2-NEXT: .long 1392508928 # 0x53000000

; AVX2: [[MAGICCSTADDR:.LCPI[0-9_]+]]:
; AVX2-NEXT: .long 0x53000080 # float 5.49764202E+11

define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
; Legalization will break this into 2 x <4 x i32> on anything prior to AVX.
; The constants used in the vector instructions are shared between the
; two sequences of instructions.
;
; SSE2-LABEL: test_uitofp_v8i32_to_v8f32:
; SSE2: movdqa {{.*#+}} [[MASK:xmm[0-9]+]] = [65535,65535,65535,65535]
; SSE2-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]]
; SSE2-NEXT: pand %[[MASK]], [[VECLOW]]
; SSE2-NEXT: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200]
; SSE2-NEXT: por %[[LOWCST]], [[VECLOW]]
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928]
; SSE2-NEXT: por %[[HIGHCST]], %xmm0
; SSE2-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
; SSE2-NEXT: subps %[[MAGICCST]], %xmm0
; SSE2-NEXT: addps [[VECLOW]], %xmm0
; MASK is the low vector of the second part after this point.
; SSE2-NEXT: pand %xmm1, %[[MASK]]
; SSE2-NEXT: por %[[LOWCST]], %[[MASK]]
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: por %[[HIGHCST]], %xmm1
; SSE2-NEXT: subps %[[MAGICCST]], %xmm1
; SSE2-NEXT: addps %[[MASK]], %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_uitofp_v8i32_to_v8f32:
; SSE41: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200]
; SSE41-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]]
; SSE41-NEXT: pblendw $170, %[[LOWCST]], [[VECLOW]]
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928]
; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm0
; SSE41-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
; SSE41-NEXT: subps %[[MAGICCST]], %xmm0
; SSE41-NEXT: addps [[VECLOW]], %xmm0
; LOWCST is the low vector of the second part after this point.
; The operands of the blend are inverted because we reuse xmm1
; in the next shift.
; SSE41-NEXT: pblendw $85, %xmm1, %[[LOWCST]]
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm1
; SSE41-NEXT: subps %[[MAGICCST]], %xmm1
; SSE41-NEXT: addps %[[LOWCST]], %xmm1
; SSE41-NEXT: retq
;
; Test that we are not lowering uitofp to scalars.
; AVX-NOT: cvtsd2ss
; AVX: retq
;
; AVX2-LABEL: test_uitofp_v8i32_to_v8f32:
; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%ymm[0-9]+]]
; AVX2-NEXT: vpblendw $170, [[LOWCST]], %ymm0, [[LOWVEC:%ymm[0-9]+]]
; AVX2-NEXT: vpsrld $16, %ymm0, [[SHIFTVEC:%ymm[0-9]+]]
; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%ymm[0-9]+]]
; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%ymm[0-9]+]]
; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%ymm[0-9]+]]
; AVX2-NEXT: vsubps [[MAGICCST]], [[HIGHVEC]], [[TMP:%ymm[0-9]+]]
; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_uitofp_v8i32_to_v8f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_uitofp_v8i32_to_v8f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvtudq2ps %ymm0, %ymm0
; AVX512VL-NEXT: retq
  %tmp = uitofp <8 x i32> %arg to <8 x float>
  ret <8 x float> %tmp
}