Lines Matching refs:uint16x8_t
1273 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1)); in DC4()
1274 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1)); in DC4()
1275 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1)); in DC4()
1276 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1)); in DC4()
1277 const uint16x8_t s0 = vaddq_u16(L0, L1); in DC4()
1278 const uint16x8_t s1 = vaddq_u16(L2, L3); in DC4()
1279 const uint16x8_t s01 = vaddq_u16(s0, s1); in DC4()
1280 const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1)); in DC4()
1411 uint16x8_t sum_top; in DC8()
1412 uint16x8_t sum_left; in DC8()
1424 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1)); in DC8()
1425 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1)); in DC8()
1426 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1)); in DC8()
1427 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1)); in DC8()
1428 const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1)); in DC8()
1429 const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1)); in DC8()
1430 const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1)); in DC8()
1431 const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1)); in DC8()
1432 const uint16x8_t s0 = vaddq_u16(L0, L1); in DC8()
1433 const uint16x8_t s1 = vaddq_u16(L2, L3); in DC8()
1434 const uint16x8_t s2 = vaddq_u16(L4, L5); in DC8()
1435 const uint16x8_t s3 = vaddq_u16(L6, L7); in DC8()
1436 const uint16x8_t s01 = vaddq_u16(s0, s1); in DC8()
1437 const uint16x8_t s23 = vaddq_u16(s2, s3); in DC8()
1442 const uint16x8_t sum = vaddq_u16(sum_left, sum_top); in DC8()
1489 uint16x8_t sum_top; in DC16()
1490 uint16x8_t sum_left; in DC16()
1495 const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top in DC16()
1506 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1)); in DC16()
1507 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1)); in DC16()
1508 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1)); in DC16()
1509 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1)); in DC16()
1510 const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1)); in DC16()
1511 const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1)); in DC16()
1512 const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1)); in DC16()
1513 const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1)); in DC16()
1514 const uint16x8_t s0 = vaddq_u16(L0, L1); in DC16()
1515 const uint16x8_t s1 = vaddq_u16(L2, L3); in DC16()
1516 const uint16x8_t s2 = vaddq_u16(L4, L5); in DC16()
1517 const uint16x8_t s3 = vaddq_u16(L6, L7); in DC16()
1518 const uint16x8_t s01 = vaddq_u16(s0, s1); in DC16()
1519 const uint16x8_t s23 = vaddq_u16(s2, s3); in DC16()
1520 const uint16x8_t sum = vaddq_u16(s01, s23); in DC16()
1526 const uint16x8_t sum = vaddq_u16(sum_left, sum_top); in DC16()