Lines Matching refs:FAST
3 …s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
5 … %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
7 … %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
9 … %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
52 ; SSE3-FAST-LABEL: haddpd3:
53 ; SSE3-FAST: # %bb.0:
54 ; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
55 ; SSE3-FAST-NEXT: retq
63 ; AVX-FAST-LABEL: haddpd3:
64 ; AVX-FAST: # %bb.0:
65 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
66 ; AVX-FAST-NEXT: retq
160 ; SSE3-FAST-LABEL: haddps6:
161 ; SSE3-FAST: # %bb.0:
162 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
163 ; SSE3-FAST-NEXT: retq
171 ; AVX-FAST-LABEL: haddps6:
172 ; AVX-FAST: # %bb.0:
173 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
174 ; AVX-FAST-NEXT: retq
221 ; SSE3-FAST-LABEL: hsubpd2:
222 ; SSE3-FAST: # %bb.0:
223 ; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
224 ; SSE3-FAST-NEXT: retq
232 ; AVX-FAST-LABEL: hsubpd2:
233 ; AVX-FAST: # %bb.0:
234 ; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
235 ; AVX-FAST-NEXT: retq
297 ; SSE3-FAST-LABEL: hsubps4:
298 ; SSE3-FAST: # %bb.0:
299 ; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
300 ; SSE3-FAST-NEXT: retq
308 ; AVX-FAST-LABEL: hsubps4:
309 ; AVX-FAST: # %bb.0:
310 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
311 ; AVX-FAST-NEXT: retq
467 ; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
468 ; SSE3-FAST: # %bb.0:
469 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
470 ; SSE3-FAST-NEXT: retq
478 ; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
479 ; AVX-FAST: # %bb.0:
480 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
481 ; AVX-FAST-NEXT: retq
497 ; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
498 ; SSE3-FAST: # %bb.0:
499 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
500 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
501 ; SSE3-FAST-NEXT: retq
510 ; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
511 ; AVX-FAST: # %bb.0:
512 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
513 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
514 ; AVX-FAST-NEXT: retq
528 ; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
529 ; SSE3-FAST: # %bb.0:
530 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
531 ; SSE3-FAST-NEXT: retq
539 ; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
540 ; AVX-FAST: # %bb.0:
541 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
542 ; AVX-FAST-NEXT: retq
558 ; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
559 ; SSE3-FAST: # %bb.0:
560 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
561 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
562 ; SSE3-FAST-NEXT: retq
571 ; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
572 ; AVX-FAST: # %bb.0:
573 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
574 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
575 ; AVX-FAST-NEXT: retq
591 ; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
592 ; SSE3-FAST: # %bb.0:
593 ; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
594 ; SSE3-FAST-NEXT: retq
602 ; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
603 ; AVX-FAST: # %bb.0:
604 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
605 ; AVX-FAST-NEXT: retq
621 ; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
622 ; SSE3-FAST: # %bb.0:
623 ; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
624 ; SSE3-FAST-NEXT: retq
632 ; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
633 ; AVX-FAST: # %bb.0:
634 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
635 ; AVX-FAST-NEXT: retq
649 ; SSE3-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
650 ; SSE3-FAST: # %bb.0:
651 ; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
652 ; SSE3-FAST-NEXT: retq
660 ; AVX-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
661 ; AVX-FAST: # %bb.0:
662 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
663 ; AVX-FAST-NEXT: retq
680 ; SSE3-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
681 ; SSE3-FAST: # %bb.0:
682 ; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
683 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
684 ; SSE3-FAST-NEXT: retq
693 ; AVX-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
694 ; AVX-FAST: # %bb.0:
695 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
696 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
697 ; AVX-FAST-NEXT: retq
752 ; SSE3-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
753 ; SSE3-FAST: # %bb.0:
754 ; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
755 ; SSE3-FAST-NEXT: retq
763 ; AVX-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
764 ; AVX-FAST: # %bb.0:
765 ; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
766 ; AVX-FAST-NEXT: retq
802 ; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
803 ; SSE3-FAST: # %bb.0:
804 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
805 ; SSE3-FAST-NEXT: retq
814 ; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
815 ; AVX-FAST: # %bb.0:
816 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
817 ; AVX-FAST-NEXT: vzeroupper
818 ; AVX-FAST-NEXT: retq
834 ; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
835 ; SSE3-FAST: # %bb.0:
836 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
837 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
838 ; SSE3-FAST-NEXT: retq
848 ; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
849 ; AVX-FAST: # %bb.0:
850 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
851 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
852 ; AVX-FAST-NEXT: vzeroupper
853 ; AVX-FAST-NEXT: retq
869 ; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
870 ; SSE3-FAST: # %bb.0:
871 ; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
872 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
873 ; SSE3-FAST-NEXT: retq
884 ; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
885 ; AVX-FAST: # %bb.0:
886 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
887 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
888 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
889 ; AVX-FAST-NEXT: vzeroupper
890 ; AVX-FAST-NEXT: retq
904 ; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
905 ; SSE3-FAST: # %bb.0:
906 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
907 ; SSE3-FAST-NEXT: retq
916 ; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
917 ; AVX-FAST: # %bb.0:
918 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
919 ; AVX-FAST-NEXT: vzeroupper
920 ; AVX-FAST-NEXT: retq
936 ; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
937 ; SSE3-FAST: # %bb.0:
938 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
939 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
940 ; SSE3-FAST-NEXT: retq
950 ; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
951 ; AVX-FAST: # %bb.0:
952 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
953 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
954 ; AVX-FAST-NEXT: vzeroupper
955 ; AVX-FAST-NEXT: retq
971 ; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
972 ; SSE3-FAST: # %bb.0:
973 ; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
974 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
975 ; SSE3-FAST-NEXT: retq
986 ; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
987 ; AVX-FAST: # %bb.0:
988 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
989 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
990 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
991 ; AVX-FAST-NEXT: vzeroupper
992 ; AVX-FAST-NEXT: retq
1008 ; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
1009 ; SSE3-FAST: # %bb.0:
1010 ; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
1011 ; SSE3-FAST-NEXT: retq
1020 ; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
1021 ; AVX-FAST: # %bb.0:
1022 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
1023 ; AVX-FAST-NEXT: vzeroupper
1024 ; AVX-FAST-NEXT: retq
1039 ; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
1040 ; SSE3-FAST: # %bb.0:
1041 ; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
1042 ; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
1043 ; SSE3-FAST-NEXT: retq
1053 ; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
1054 ; AVX-FAST: # %bb.0:
1055 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
1056 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
1057 ; AVX-FAST-NEXT: vzeroupper
1058 ; AVX-FAST-NEXT: retq
1074 ; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
1075 ; SSE3-FAST: # %bb.0:
1076 ; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
1077 ; SSE3-FAST-NEXT: retq
1086 ; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
1087 ; AVX-FAST: # %bb.0:
1088 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
1089 ; AVX-FAST-NEXT: vzeroupper
1090 ; AVX-FAST-NEXT: retq
1105 ; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
1106 ; SSE3-FAST: # %bb.0:
1107 ; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
1108 ; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
1109 ; SSE3-FAST-NEXT: retq
1119 ; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
1120 ; AVX-FAST: # %bb.0:
1121 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
1122 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
1123 ; AVX-FAST-NEXT: vzeroupper
1124 ; AVX-FAST-NEXT: retq
1138 ; SSE3-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
1139 ; SSE3-FAST: # %bb.0:
1140 ; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
1141 ; SSE3-FAST-NEXT: retq
1150 ; AVX-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
1151 ; AVX-FAST: # %bb.0:
1152 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
1153 ; AVX-FAST-NEXT: vzeroupper
1154 ; AVX-FAST-NEXT: retq
1171 ; SSE3-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
1172 ; SSE3-FAST: # %bb.0:
1173 ; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
1174 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1175 ; SSE3-FAST-NEXT: retq
1185 ; AVX-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
1186 ; AVX-FAST: # %bb.0:
1187 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
1188 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1189 ; AVX-FAST-NEXT: vzeroupper
1190 ; AVX-FAST-NEXT: retq
1205 ; SSE3-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
1206 ; SSE3-FAST: # %bb.0:
1207 ; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
1208 ; SSE3-FAST-NEXT: hsubps %xmm1, %xmm0
1209 ; SSE3-FAST-NEXT: retq
1219 ; AVX-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
1220 ; AVX-FAST: # %bb.0:
1221 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
1222 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
1223 ; AVX-FAST-NEXT: vzeroupper
1224 ; AVX-FAST-NEXT: retq
1261 ; SSE3-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
1262 ; SSE3-FAST: # %bb.0:
1263 ; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
1264 ; SSE3-FAST-NEXT: retq
1273 ; AVX-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
1274 ; AVX-FAST: # %bb.0:
1275 ; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
1276 ; AVX-FAST-NEXT: vzeroupper
1277 ; AVX-FAST-NEXT: retq
1316 ; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
1317 ; SSE3-FAST: # %bb.0:
1318 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
1319 ; SSE3-FAST-NEXT: retq
1328 ; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
1329 ; AVX-FAST: # %bb.0:
1330 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1331 ; AVX-FAST-NEXT: vzeroupper
1332 ; AVX-FAST-NEXT: retq
1346 ; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
1347 ; SSE3-FAST: # %bb.0:
1348 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
1349 ; SSE3-FAST-NEXT: retq
1358 ; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
1359 ; AVX-FAST: # %bb.0:
1360 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1361 ; AVX-FAST-NEXT: vzeroupper
1362 ; AVX-FAST-NEXT: retq
1378 ; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
1379 ; SSE3-FAST: # %bb.0:
1380 ; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
1381 ; SSE3-FAST-NEXT: retq
1390 ; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
1391 ; AVX-FAST: # %bb.0:
1392 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
1393 ; AVX-FAST-NEXT: vzeroupper
1394 ; AVX-FAST-NEXT: retq
1410 ; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
1411 ; SSE3-FAST: # %bb.0:
1412 ; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
1413 ; SSE3-FAST-NEXT: retq
1422 ; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
1423 ; AVX-FAST: # %bb.0:
1424 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
1425 ; AVX-FAST-NEXT: vzeroupper
1426 ; AVX-FAST-NEXT: retq
1440 ; SSE3-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
1441 ; SSE3-FAST: # %bb.0:
1442 ; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
1443 ; SSE3-FAST-NEXT: retq
1452 ; AVX-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
1453 ; AVX-FAST: # %bb.0:
1454 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
1455 ; AVX-FAST-NEXT: vzeroupper
1456 ; AVX-FAST-NEXT: retq
1491 ; SSE3-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
1492 ; SSE3-FAST: # %bb.0:
1493 ; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
1494 ; SSE3-FAST-NEXT: retq
1503 ; AVX-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
1504 ; AVX-FAST: # %bb.0:
1505 ; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
1506 ; AVX-FAST-NEXT: vzeroupper
1507 ; AVX-FAST-NEXT: retq
1545 ; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
1546 ; SSE3-FAST: # %bb.0:
1547 ; SSE3-FAST-NEXT: movss %xmm0, (%rdi)
1548 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
1549 ; SSE3-FAST-NEXT: retq
1558 ; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
1559 ; AVX-FAST: # %bb.0:
1560 ; AVX-FAST-NEXT: vmovss %xmm0, (%rdi)
1561 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1562 ; AVX-FAST-NEXT: retq
1578 ; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
1579 ; SSE3-FAST: # %bb.0:
1580 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1581 ; SSE3-FAST-NEXT: movss %xmm1, (%rdi)
1582 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
1583 ; SSE3-FAST-NEXT: retq
1592 ; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
1593 ; AVX-FAST: # %bb.0:
1594 ; AVX-FAST-NEXT: vextractps $1, %xmm0, (%rdi)
1595 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1596 ; AVX-FAST-NEXT: retq
1646 ; SSE3-FAST-LABEL: fadd_reduce_v8f32:
1647 ; SSE3-FAST: # %bb.0:
1648 ; SSE3-FAST-NEXT: haddps %xmm1, %xmm2
1649 ; SSE3-FAST-NEXT: haddps %xmm2, %xmm2
1650 ; SSE3-FAST-NEXT: haddps %xmm2, %xmm2
1651 ; SSE3-FAST-NEXT: addss %xmm2, %xmm0
1652 ; SSE3-FAST-NEXT: retq
1666 ; AVX-FAST-LABEL: fadd_reduce_v8f32:
1667 ; AVX-FAST: # %bb.0:
1668 ; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
1669 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1
1670 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
1671 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
1672 ; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
1673 ; AVX-FAST-NEXT: vzeroupper
1674 ; AVX-FAST-NEXT: retq
1689 ; SSE3-FAST-LABEL: fadd_reduce_v4f64:
1690 ; SSE3-FAST: # %bb.0:
1691 ; SSE3-FAST-NEXT: haddpd %xmm1, %xmm2
1692 ; SSE3-FAST-NEXT: haddpd %xmm2, %xmm2
1693 ; SSE3-FAST-NEXT: addsd %xmm2, %xmm0
1694 ; SSE3-FAST-NEXT: retq
1706 ; AVX-FAST-LABEL: fadd_reduce_v4f64:
1707 ; AVX-FAST: # %bb.0:
1708 ; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
1709 ; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm2, %xmm1
1710 ; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
1711 ; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1712 ; AVX-FAST-NEXT: vzeroupper
1713 ; AVX-FAST-NEXT: retq
1730 ; SSSE3-FAST-LABEL: PR39936_v8f32:
1731 ; SSSE3-FAST: # %bb.0:
1732 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
1733 ; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
1734 ; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
1735 ; SSSE3-FAST-NEXT: retq
1745 ; SSE3-FAST-LABEL: PR39936_v8f32:
1746 ; SSE3-FAST: # %bb.0:
1747 ; SSE3-FAST-NEXT: haddps %xmm1, %xmm0
1748 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
1749 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
1750 ; SSE3-FAST-NEXT: retq
1762 ; AVX-FAST-LABEL: PR39936_v8f32:
1763 ; AVX-FAST: # %bb.0:
1764 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
1765 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1766 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1767 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1768 ; AVX-FAST-NEXT: vzeroupper
1769 ; AVX-FAST-NEXT: retq
1793 ; SSE3-FAST-LABEL: hadd32_4:
1794 ; SSE3-FAST: # %bb.0:
1795 ; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
1796 ; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1797 ; SSE3-FAST-NEXT: addps %xmm0, %xmm1
1798 ; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
1799 ; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
1800 ; SSE3-FAST-NEXT: retq
1810 ; AVX-FAST-LABEL: hadd32_4:
1811 ; AVX-FAST: # %bb.0:
1812 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1813 ; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
1814 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1815 ; AVX-FAST-NEXT: retq
1835 ; SSE3-FAST-LABEL: hadd32_8:
1836 ; SSE3-FAST: # %bb.0:
1837 ; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
1838 ; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1839 ; SSE3-FAST-NEXT: addps %xmm0, %xmm1
1840 ; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
1841 ; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
1842 ; SSE3-FAST-NEXT: retq
1853 ; AVX-FAST-LABEL: hadd32_8:
1854 ; AVX-FAST: # %bb.0:
1855 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1856 ; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
1857 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1858 ; AVX-FAST-NEXT: vzeroupper
1859 ; AVX-FAST-NEXT: retq
1879 ; SSE3-FAST-LABEL: hadd32_16:
1880 ; SSE3-FAST: # %bb.0:
1881 ; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
1882 ; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1883 ; SSE3-FAST-NEXT: addps %xmm0, %xmm1
1884 ; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
1885 ; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
1886 ; SSE3-FAST-NEXT: retq
1897 ; AVX-FAST-LABEL: hadd32_16:
1898 ; AVX-FAST: # %bb.0:
1899 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1900 ; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
1901 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1902 ; AVX-FAST-NEXT: vzeroupper
1903 ; AVX-FAST-NEXT: retq
2071 ; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32:
2072 ; SSE3-FAST: # %bb.0:
2073 ; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
2074 ; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2075 ; SSE3-FAST-NEXT: addps %xmm0, %xmm1
2076 ; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
2077 ; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
2078 ; SSE3-FAST-NEXT: retq
2089 ; AVX-FAST-LABEL: partial_reduction_fadd_v8f32:
2090 ; AVX-FAST: # %bb.0:
2091 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
2092 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
2093 ; AVX-FAST-NEXT: vzeroupper
2094 ; AVX-FAST-NEXT: retq
2117 ; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
2118 ; SSE3-FAST: # %bb.0:
2119 ; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
2120 ; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2121 ; SSE3-FAST-NEXT: addps %xmm0, %xmm1
2122 ; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
2123 ; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
2124 ; SSE3-FAST-NEXT: retq
2135 ; AVX-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
2136 ; AVX-FAST: # %bb.0:
2137 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2138 ; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
2139 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
2140 ; AVX-FAST-NEXT: vzeroupper
2141 ; AVX-FAST-NEXT: retq
2161 ; SSE3-FAST-LABEL: partial_reduction_fadd_v16f32:
2162 ; SSE3-FAST: # %bb.0:
2163 ; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
2164 ; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2165 ; SSE3-FAST-NEXT: addps %xmm0, %xmm1
2166 ; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
2167 ; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
2168 ; SSE3-FAST-NEXT: retq
2179 ; AVX-FAST-LABEL: partial_reduction_fadd_v16f32:
2180 ; AVX-FAST: # %bb.0:
2181 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
2182 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
2183 ; AVX-FAST-NEXT: vzeroupper
2184 ; AVX-FAST-NEXT: retq