1 /* ------------------------------------------------------------------
2  * Copyright (C) 1998-2009 PacketVideo
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13  * express or implied.
14  * See the License for the specific language governing permissions
15  * and limitations under the License.
16  * -------------------------------------------------------------------
17  */
18 /*********************************************************************************/
19 /*  Filename: sad_mb_offset.h                                                       */
20 /*  Description: Implementation for in-line functions used in dct.cpp           */
21 /*  Modified:                                                                   */
22 /*********************************************************************************/
23 
24 #if !defined(PV_ARM_GCC_V4) && !defined(PV_ARM_GCC_V5) /* ARM GNU COMPILER  */
25 
26 #if (NUMBER==3)
sad_mb_offset3(UChar * ref,UChar * blk,Int lx,Int dmin)27 __inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin)
28 #elif (NUMBER==2)
29 __inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin)
30 #elif (NUMBER==1)
31 __inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin)
32 #endif
33 {
34     int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
35 
36     //  x5 = (x4<<8) - x4;
37     x4 = x5 = 0;
38     x6 = 0xFFFF00FF;
39     x9 = 0x80808080; /* const. */
40     ref -= NUMBER; /* bic ref, ref, #3 */
41     ref -= lx;
42     blk -= 16;
43     x8 = 16;
44 
45 #if (NUMBER==3)
46 LOOP_SAD3:
47 #elif (NUMBER==2)
48 LOOP_SAD2:
49 #elif (NUMBER==1)
50 LOOP_SAD1:
51 #endif
52     /****** process 8 pixels ******/
53     x10 = *((uint32*)(ref += lx)); /* D C B A */
54     x11 = *((uint32*)(ref + 4));    /* H G F E */
55     x12 = *((uint32*)(ref + 8));    /* L K J I */
56 
57     x10 = ((uint32)x10 >> SHIFT); /* 0 0 0 D */
58     x10 = x10 | (x11 << (32 - SHIFT));        /* G F E D */
59     x11 = ((uint32)x11 >> SHIFT); /* 0 0 0 H */
60     x11 = x11 | (x12 << (32 - SHIFT));        /* K J I H */
61 
62     x12 = *((uint32*)(blk += 16));
63     x14 = *((uint32*)(blk + 4));
64 
65     /* process x11 & x14 */
66     x11 = sad_4pixel(x11, x14, x9);
67 
68     /* process x12 & x10 */
69     x10 = sad_4pixel(x10, x12, x9);
70 
71     x5 = x5 + x10; /* accumulate low bytes */
72     x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
73     x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
74     x5 = x5 + x11;  /* accumulate low bytes */
75     x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
76     x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
77 
78     /****** process 8 pixels ******/
79     x10 = *((uint32*)(ref + 8)); /* D C B A */
80     x11 = *((uint32*)(ref + 12));   /* H G F E */
81     x12 = *((uint32*)(ref + 16));   /* L K J I */
82 
83     x10 = ((uint32)x10 >> SHIFT); /* mvn x10, x10, lsr #24  = 0xFF 0xFF 0xFF ~D */
84     x10 = x10 | (x11 << (32 - SHIFT));        /* bic x10, x10, x11, lsl #8 = ~G ~F ~E ~D */
85     x11 = ((uint32)x11 >> SHIFT); /* 0xFF 0xFF 0xFF ~H */
86     x11 = x11 | (x12 << (32 - SHIFT));        /* ~K ~J ~I ~H */
87 
88     x12 = *((uint32*)(blk + 8));
89     x14 = *((uint32*)(blk + 12));
90 
91     /* process x11 & x14 */
92     x11 = sad_4pixel(x11, x14, x9);
93 
94     /* process x12 & x10 */
95     x10 = sad_4pixel(x10, x12, x9);
96 
97     x5 = x5 + x10; /* accumulate low bytes */
98     x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
99     x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
100     x5 = x5 + x11;  /* accumulate low bytes */
101     x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
102     x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
103 
104     /****************/
105     x10 = x5 - (x4 << 8); /* extract low bytes */
106     x10 = x10 + x4;     /* add with high bytes */
107     x10 = x10 + (x10 << 16); /* add with lower half word */
108 
109     if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
110     {
111         if (--x8)
112         {
113 #if (NUMBER==3)
114             goto         LOOP_SAD3;
115 #elif (NUMBER==2)
116             goto         LOOP_SAD2;
117 #elif (NUMBER==1)
118             goto         LOOP_SAD1;
119 #endif
120         }
121 
122     }
123 
124     return ((uint32)x10 >> 16);
125 }
126 
127 #elif defined(__CC_ARM)  /* only work with arm v5 */
128 
/*
 * sad_mb_offset<NUMBER>() -- ARM compiler (__CC_ARM) inline-assembly version.
 *
 * Unlike the other branches of this file, this variant takes the loop
 * counter state `x8` as a fifth argument. It is advanced by INC_X8 on every
 * row and the loop is controlled by the flags of that addition.
 * NOTE(review): the initial value of x8 and INC_X8 are defined outside this
 * file; presumably they are chosen so the addition changes the LS condition
 * after 16 rows -- confirm against the caller.
 *
 * Each pass handles one 16-byte row: first bytes 8..15, then bytes 0..7.
 * The reference words are realigned by SHIFT bits and bitwise complemented
 * before being passed to sad_4pixelN(); sum_accumulate (a macro defined
 * elsewhere) folds the results into the running sums.
 *
 * Returns bits 31..16 of the folded accumulator.
 */
#if (NUMBER==3)
__inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
#endif
{
    int32 x4, x5, x6, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;

    /* x6 = ~0x00ff0000; it is not referenced by name again in this function,
       presumably sum_accumulate consumes it -- TODO confirm.
       BIC clears the two low bits of ref, i.e. rounds it down to a word boundary. */
    __asm{
        MVN      x6, #0xff0000;
        BIC      ref, ref, #3;

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    }
    /****** process 8 pixels ******/
    /* Bytes 8..15 of the row: three reference words and the block word at +12. */
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

    /* Each MVN/BIC pair yields ~((lo >> SHIFT) | (hi << (32-SHIFT))):
       the realigned reference word, bitwise complemented.
       x12 is then reused for the block word at +8. */
    __asm{
        MVN      x10, x10, lsr #SHIFT;
        BIC      x10, x10, x11, lsl #(32-SHIFT);
        MVN      x11, x11, lsr #SHIFT;
        BIC      x11, x11, x12, lsl #(32-SHIFT);

        LDR      x12, [blk, #8];
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /* Bytes 0..7 of the row. The post-indexed loads also step ref by lx and
       blk by 16, so both pointers address the next row afterwards. */
    __asm{
        /****** process 8 pixels ******/
        LDR      x11, [ref, #4];
        LDR      x12, [ref, #8];
        LDR  x10, [ref], lx ;
        LDR  x14, [blk, #4];

        MVN      x10, x10, lsr #SHIFT;
        BIC      x10, x10, x11, lsl #(32-SHIFT);
        MVN      x11, x11, lsr #SHIFT;
        BIC      x11, x11, x12, lsl #(32-SHIFT);

        LDR      x12, [blk], #16;
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    /* RSBS computes (x10 >> 16) - dmin and sets the flags; LS (unsigned lower
       or same) therefore means the running total is still <= dmin.
       ADDLSS runs only in that case and replaces the flags with those of
       x8 + INC_X8; BLS then branches back on the resulting flags. If the
       total already exceeds dmin, neither conditional instruction executes. */
    __asm{
        RSBS     x11, dmin, x10, lsr #16
        ADDLSS   x8, x8, #INC_X8
#if (NUMBER==3)
        BLS      LOOP_SAD3;
#elif (NUMBER==2)
BLS      LOOP_SAD2;
#elif (NUMBER==1)
BLS      LOOP_SAD1;
#endif
    }

    return ((uint32)x10 >> 16);
}
219 
220 #elif ( defined(PV_ARM_GCC_V5) || defined(PV_ARM_GCC_V4) ) /* ARM GNU COMPILER  */
221 
222 #if (NUMBER==3)
sad_mb_offset3(UChar * ref,UChar * blk,Int lx,Int dmin)223 __inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin)
224 #elif (NUMBER==2)
225 __inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin)
226 #elif (NUMBER==1)
227 __inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin)
228 #endif
229 {
230     int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
231 
232     //  x5 = (x4<<8) - x4;
233     x4 = x5 = 0;
234     x6 = 0xFFFF00FF;
235     x9 = 0x80808080; /* const. */
236     ref -= NUMBER; /* bic ref, ref, #3 */
237     ref -= lx;
238     x8 = 16;
239 
240 #if (NUMBER==3)
241 LOOP_SAD3:
242 #elif (NUMBER==2)
243 LOOP_SAD2:
244 #elif (NUMBER==1)
245 LOOP_SAD1:
246 #endif
247     /****** process 8 pixels ******/
248     x10 = *((uint32*)(ref += lx)); /* D C B A */
249     x11 = *((uint32*)(ref + 4));    /* H G F E */
250     x12 = *((uint32*)(ref + 8));    /* L K J I */
251 
252     int32 shift = SHIFT;
253     int32 shift2 = 32 - SHIFT;
254     asm volatile("ldr  %3, [%4, #4]\n\t"
255                  "mvn  %0, %0, lsr %5\n\t"
256                  "bic  %0, %0, %1, lsl %6\n\t"
257                  "mvn  %1, %1, lsr %5\n\t"
258                  "bic  %1, %1, %2, lsl %6\n\t"
259                  "ldr  %2, [%4, #8]"
260              : "+r"(x10), "+r"(x11), "+r"(x12), "=r"(x14)
261                          : "r"(blk), "r"(shift), "r"(shift2));
262 
263     /* process x11 & x14 */
264     x11 = sad_4pixel(x11, x14, x9);
265 
266     /* process x12 & x10 */
267     x10 = sad_4pixel(x10, x12, x9);
268 
269     sum_accumulate;
270 
271     /****** process 8 pixels ******/
272     x10 = *((uint32*)(ref + 8)); /* D C B A */
273     x11 = *((uint32*)(ref + 12));   /* H G F E */
274     x12 = *((uint32*)(ref + 16));   /* L K J I */
275 
276     asm volatile("ldr  %3, [%4, #4]\n\t"
277                  "mvn  %0, %0, lsr %5\n\t"
278                  "bic  %0, %0, %1, lsl %6\n\t"
279                  "mvn  %1, %1, lsr %5\n\t"
280                  "bic  %1, %1, %2, lsl %6\n\t"
281                  "ldr  %2, [%4, #8]"
282              : "+r"(x10), "+r"(x11), "+r"(x12), "=r"(x14)
283                          : "r"(blk), "r"(shift), "r"(shift2));
284 
285     /* process x11 & x14 */
286     x11 = sad_4pixel(x11, x14, x9);
287 
288     /* process x12 & x10 */
289     x10 = sad_4pixel(x10, x12, x9);
290 
291     sum_accumulate;
292 
293     /****************/
294     x10 = x5 - (x4 << 8); /* extract low bytes */
295     x10 = x10 + x4;     /* add with high bytes */
296     x10 = x10 + (x10 << 16); /* add with lower half word */
297 
298     if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
299     {
300         if (--x8)
301         {
302 #if (NUMBER==3)
303             goto         LOOP_SAD3;
304 #elif (NUMBER==2)
305 goto         LOOP_SAD2;
306 #elif (NUMBER==1)
307 goto         LOOP_SAD1;
308 #endif
309         }
310 
311     }
312 
313     return ((uint32)x10 >> 16);
314 }
315 
316 #endif
317 
318