1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vp9_rtcd.h"
12 #include "vp9/common/vp9_onyxc_int.h"
13 #include "vpx_dsp/mips/macros_msa.h"
14
filter_by_weight8x8_msa(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,int32_t src_weight)15 static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
16 uint8_t *dst_ptr, int32_t dst_stride,
17 int32_t src_weight) {
18 int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
19 int32_t row;
20 uint64_t src0_d, src1_d, dst0_d, dst1_d;
21 v16i8 src0 = { 0 };
22 v16i8 src1 = { 0 };
23 v16i8 dst0 = { 0 };
24 v16i8 dst1 = { 0 };
25 v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
26
27 src_wt = __msa_fill_h(src_weight);
28 dst_wt = __msa_fill_h(dst_weight);
29
30 for (row = 2; row--;) {
31 LD2(src_ptr, src_stride, src0_d, src1_d);
32 src_ptr += (2 * src_stride);
33 LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
34 INSERT_D2_SB(src0_d, src1_d, src0);
35 INSERT_D2_SB(dst0_d, dst1_d, dst0);
36
37 LD2(src_ptr, src_stride, src0_d, src1_d);
38 src_ptr += (2 * src_stride);
39 LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
40 INSERT_D2_SB(src0_d, src1_d, src1);
41 INSERT_D2_SB(dst0_d, dst1_d, dst1);
42
43 UNPCK_UB_SH(src0, src_r, src_l);
44 UNPCK_UB_SH(dst0, dst_r, dst_l);
45 res_h_r = (src_r * src_wt);
46 res_h_r += (dst_r * dst_wt);
47 res_h_l = (src_l * src_wt);
48 res_h_l += (dst_l * dst_wt);
49 SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
50 dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
51 ST8x2_UB(dst0, dst_ptr, dst_stride);
52 dst_ptr += (2 * dst_stride);
53
54 UNPCK_UB_SH(src1, src_r, src_l);
55 UNPCK_UB_SH(dst1, dst_r, dst_l);
56 res_h_r = (src_r * src_wt);
57 res_h_r += (dst_r * dst_wt);
58 res_h_l = (src_l * src_wt);
59 res_h_l += (dst_l * dst_wt);
60 SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
61 dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
62 ST8x2_UB(dst1, dst_ptr, dst_stride);
63 dst_ptr += (2 * dst_stride);
64 }
65 }
66
filter_by_weight16x16_msa(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,int32_t src_weight)67 static void filter_by_weight16x16_msa(const uint8_t *src_ptr,
68 int32_t src_stride,
69 uint8_t *dst_ptr,
70 int32_t dst_stride,
71 int32_t src_weight) {
72 int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
73 int32_t row;
74 v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
75 v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
76
77 src_wt = __msa_fill_h(src_weight);
78 dst_wt = __msa_fill_h(dst_weight);
79
80 for (row = 4; row--;) {
81 LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
82 src_ptr += (4 * src_stride);
83 LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
84
85 UNPCK_UB_SH(src0, src_r, src_l);
86 UNPCK_UB_SH(dst0, dst_r, dst_l);
87 res_h_r = (src_r * src_wt);
88 res_h_r += (dst_r * dst_wt);
89 res_h_l = (src_l * src_wt);
90 res_h_l += (dst_l * dst_wt);
91 SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
92 PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
93 dst_ptr += dst_stride;
94
95 UNPCK_UB_SH(src1, src_r, src_l);
96 UNPCK_UB_SH(dst1, dst_r, dst_l);
97 res_h_r = (src_r * src_wt);
98 res_h_r += (dst_r * dst_wt);
99 res_h_l = (src_l * src_wt);
100 res_h_l += (dst_l * dst_wt);
101 SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
102 PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
103 dst_ptr += dst_stride;
104
105 UNPCK_UB_SH(src2, src_r, src_l);
106 UNPCK_UB_SH(dst2, dst_r, dst_l);
107 res_h_r = (src_r * src_wt);
108 res_h_r += (dst_r * dst_wt);
109 res_h_l = (src_l * src_wt);
110 res_h_l += (dst_l * dst_wt);
111 SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
112 PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
113 dst_ptr += dst_stride;
114
115 UNPCK_UB_SH(src3, src_r, src_l);
116 UNPCK_UB_SH(dst3, dst_r, dst_l);
117 res_h_r = (src_r * src_wt);
118 res_h_r += (dst_r * dst_wt);
119 res_h_l = (src_l * src_wt);
120 res_h_l += (dst_l * dst_wt);
121 SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
122 PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
123 dst_ptr += dst_stride;
124 }
125 }
126
/*
 * Externally visible 8x8 weighted-blend entry point. All arguments are handed
 * on unchanged to the file-local filter_by_weight8x8_msa(), which rewrites
 * dst in place.
 */
void vp9_filter_by_weight8x8_msa(const uint8_t *src,
                                 int src_stride,
                                 uint8_t *dst,
                                 int dst_stride,
                                 int src_weight) {
  filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight);
}
132
/*
 * Externally visible 16x16 weighted-blend entry point. All arguments are
 * handed on unchanged to the file-local filter_by_weight16x16_msa(), which
 * rewrites dst in place.
 */
void vp9_filter_by_weight16x16_msa(const uint8_t *src,
                                   int src_stride,
                                   uint8_t *dst,
                                   int dst_stride,
                                   int src_weight) {
  filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight);
}
138