1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
13 #include "./vpx_config.h"
14 #include "vpx/vpx_integer.h"
15
// Widening-subtracts two already-loaded 16-byte vectors lane by lane
// (s - p, computed in 16 bits) and stores the 16 results to dst[0..15].
static void store_diff_16(int16_t *dst, const uint8x16_t s,
                          const uint8x16_t p) {
  const uint16x8_t lo = vsubl_u8(vget_low_u8(s), vget_low_u8(p));
  const uint16x8_t hi = vsubl_u8(vget_high_u8(s), vget_high_u8(p));
  // The unsigned 16-bit wrap-around result has the same bit pattern as the
  // signed difference, so it is stored through a reinterpret.
  vst1q_s16(dst, vreinterpretq_s16_u16(lo));
  vst1q_s16(dst + 8, vreinterpretq_s16_u16(hi));
}

// Computes diff[r][c] = src[r][c] - pred[r][c] for a rows x cols block.
//
// src and pred are read as 8-bit samples and the difference is written as a
// signed 16-bit value. Each buffer is walked row by row using its own stride,
// given in elements.
//
// The path is selected from cols:
//   cols <= 4  : scalar loop over exactly cols columns.
//   cols <= 8  : one 8-lane vector per row (always touches 8 columns).
//   cols <= 16 : one 16-lane vector per row (always touches 16 columns).
//   otherwise  : 32 columns per step (touches cols rounded up to a multiple
//                of 32).
// The vector paths therefore read and write past cols when cols is not
// exactly 8, 16 or a multiple of 32.
void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
                             ptrdiff_t diff_stride, const uint8_t *src,
                             ptrdiff_t src_stride, const uint8_t *pred,
                             ptrdiff_t pred_stride) {
  int row, x;

  if (cols <= 4) {
    // Narrow blocks: plain C, one sample at a time.
    for (row = 0; row < rows; ++row) {
      for (x = 0; x < cols; ++x) {
        diff[x] = (int16_t)(src[x] - pred[x]);
      }
      src += src_stride;
      pred += pred_stride;
      diff += diff_stride;
    }
    return;
  }

  if (cols <= 8) {
    // A single 64-bit load per input covers the whole row.
    for (row = 0; row < rows; ++row) {
      const uint8x8_t s = vld1_u8(src);
      const uint8x8_t p = vld1_u8(pred);
      vst1q_s16(diff, vreinterpretq_s16_u16(vsubl_u8(s, p)));
      src += src_stride;
      pred += pred_stride;
      diff += diff_stride;
    }
    return;
  }

  if (cols <= 16) {
    // A single 128-bit load per input covers the whole row.
    for (row = 0; row < rows; ++row) {
      const uint8x16_t s = vld1q_u8(src);
      const uint8x16_t p = vld1q_u8(pred);
      store_diff_16(diff, s, p);
      src += src_stride;
      pred += pred_stride;
      diff += diff_stride;
    }
    return;
  }

  // Wide blocks: 32 columns per inner iteration. All four loads are issued
  // before any store.
  for (row = 0; row < rows; ++row) {
    for (x = 0; x < cols; x += 32) {
      const uint8x16_t s0 = vld1q_u8(src + x);
      const uint8x16_t s1 = vld1q_u8(src + x + 16);
      const uint8x16_t p0 = vld1q_u8(pred + x);
      const uint8x16_t p1 = vld1q_u8(pred + x + 16);
      store_diff_16(diff + x, s0, p0);
      store_diff_16(diff + x + 16, s1, p1);
    }
    src += src_stride;
    pred += pred_stride;
    diff += diff_stride;
  }
}
81