/*
 *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
#define INCLUDE_LIBYUV_MACROS_MSA_H_

#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include <msa.h>
#include <stdint.h>

#if (__mips_isa_rev >= 6)
#define LW(psrc)                                        \
  ({                                                    \
    uint8_t* psrc_lw_m = (uint8_t*)(psrc); /* NOLINT */ \
    uint32_t val_m;                                     \
    asm volatile("lw %[val_m], %[psrc_lw_m] \n"         \
                 : [val_m] "=r"(val_m)                  \
                 : [psrc_lw_m] "m"(*psrc_lw_m));        \
    val_m;                                              \
  })

#if (__mips == 64)
#define LD(psrc)                                        \
  ({                                                    \
    uint8_t* psrc_ld_m = (uint8_t*)(psrc); /* NOLINT */ \
    uint64_t val_m = 0;                                 \
    asm volatile("ld %[val_m], %[psrc_ld_m] \n"         \
                 : [val_m] "=r"(val_m)                  \
                 : [psrc_ld_m] "m"(*psrc_ld_m));        \
    val_m;                                              \
  })
#else  // !(__mips == 64)
#define LD(psrc)                                                         \
  ({                                                                     \
    uint8_t* psrc_ld_m = (uint8_t*)(psrc); /* NOLINT */                  \
    uint32_t val0_m, val1_m;                                             \
    uint64_t val_m = 0;                                                  \
    val0_m = LW(psrc_ld_m);                                              \
    val1_m = LW(psrc_ld_m + 4);                                          \
    val_m = (uint64_t)(val1_m); /* NOLINT */                             \
    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
    val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */           \
    val_m;                                                               \
  })
#endif  // (__mips == 64)

#define SW(val, pdst)                                   \
  ({                                                    \
    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint32_t val_m = (val);                             \
    asm volatile("sw %[val_m], %[pdst_sw_m] \n"         \
                 : [pdst_sw_m] "=m"(*pdst_sw_m)         \
                 : [val_m] "r"(val_m));                 \
  })

#if (__mips == 64)
#define SD(val, pdst)                                   \
  ({                                                    \
    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint64_t val_m = (val);                             \
    asm volatile("sd %[val_m], %[pdst_sd_m] \n"         \
                 : [pdst_sd_m] "=m"(*pdst_sd_m)         \
                 : [val_m] "r"(val_m));                 \
  })
#else  // !(__mips == 64)
#define SD(val, pdst)                                          \
  ({                                                           \
    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */        \
    uint32_t val0_m, val1_m;                                   \
    val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);           \
    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF);   \
    SW(val0_m, pdst_sd_m);                                     \
    SW(val1_m, pdst_sd_m + 4);                                 \
  })
#endif  // (__mips == 64)
#else  // !(__mips_isa_rev >= 6)
#define LW(psrc)                                        \
  ({                                                    \
    uint8_t* psrc_lw_m = (uint8_t*)(psrc); /* NOLINT */ \
    uint32_t val_m;                                     \
    asm volatile("ulw %[val_m], %[psrc_lw_m] \n"        \
                 : [val_m] "=r"(val_m)                  \
                 : [psrc_lw_m] "m"(*psrc_lw_m));        \
    val_m;                                              \
  })

#if (__mips == 64)
#define LD(psrc)                                        \
  ({                                                    \
    uint8_t* psrc_ld_m = (uint8_t*)(psrc); /* NOLINT */ \
    uint64_t val_m = 0;                                 \
    asm volatile("uld %[val_m], %[psrc_ld_m] \n"        \
                 : [val_m] "=r"(val_m)                  \
                 : [psrc_ld_m] "m"(*psrc_ld_m));        \
    val_m;                                              \
  })
#else  // !(__mips == 64)
#define LD(psrc)                                                         \
  ({                                                                     \
    uint8_t* psrc_ld_m = (uint8_t*)(psrc); /* NOLINT */                  \
    uint32_t val0_m, val1_m;                                             \
    uint64_t val_m = 0;                                                  \
    val0_m = LW(psrc_ld_m);                                              \
    val1_m = LW(psrc_ld_m + 4);                                          \
    val_m = (uint64_t)(val1_m); /* NOLINT */                             \
    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
    val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */           \
    val_m;                                                               \
  })
#endif  // (__mips == 64)

#define SW(val, pdst)                                   \
  ({                                                    \
    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint32_t val_m = (val);                             \
    asm volatile("usw %[val_m], %[pdst_sw_m] \n"        \
                 : [pdst_sw_m] "=m"(*pdst_sw_m)         \
                 : [val_m] "r"(val_m));                 \
  })

#define SD(val, pdst)                                          \
  ({                                                           \
    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */        \
    uint32_t val0_m, val1_m;                                   \
    val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);           \
    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF);   \
    SW(val0_m, pdst_sd_m);                                     \
    SW(val1_m, pdst_sd_m + 4);                                 \
  })
#endif  // (__mips_isa_rev >= 6)
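
/* Illustrative usage (a minimal sketch, not part of this header): copy 8
   bytes between possibly unaligned buffers with the scalar macros above.
   'src' and 'dst' are hypothetical caller-owned uint8_t pointers.

     uint64_t v = LD(src);  // 64-bit load; two 32-bit LWs on 32-bit MIPS
     SD(v, dst);            // 64-bit store; two 32-bit SWs on 32-bit MIPS
*/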

// TODO(fbarchard): Consider removing __VAR_ARGS versions.
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)

#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)

/* Description : Load two vectors with 16 'byte' sized elements
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_B(RTYPE, (psrc));                \
    out1 = LD_B(RTYPE, (psrc) + stride);       \
  }
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
  {                                                        \
    LD_B2(RTYPE, (psrc), stride, out0, out1);              \
    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
  }
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)

/* Description : Store two vectors with stride, each having 16 'byte' sized
                 elements
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_B(RTYPE, in0, (pdst));                \
    ST_B(RTYPE, in1, (pdst) + stride);       \
  }
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
  {                                                      \
    ST_B2(RTYPE, in0, in1, (pdst), stride);              \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
  }
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)

/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_H(RTYPE, in0, (pdst));                \
    ST_H(RTYPE, in1, (pdst) + stride);       \
  }
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
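
/* Illustrative usage (a minimal sketch, not part of this header): copy two
   16-byte rows with the paired load/store macros above. 'src', 'dst', and
   'stride' are hypothetical caller-supplied values, with 'stride' in bytes
   when the pointers are uint8_t*.

     v16u8 row0, row1;
     LD_UB2(src, stride, row0, row1);
     ST_UB2(row0, row1, dst, stride);
*/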

// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'; likewise, elements from
                 'in2' & 'in3' are copied to 'out1' as per 'mask1'
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
  {                                                                  \
    out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
    out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
  }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)

/* Description : Interleave both left and right halves of the input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'; left halves are
                 interleaved and written to 'out1'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)           \
  {                                                     \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)

#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */

#endif  // INCLUDE_LIBYUV_MACROS_MSA_H_