1 /* ------------------------------------------------------------------
2  * Copyright (C) 1998-2009 PacketVideo
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13  * express or implied.
14  * See the License for the specific language governing permissions
15  * and limitations under the License.
16  * -------------------------------------------------------------------
17  */
18 /*********************************************************************************/
19 /*  Filename: sad_inline.h                                                      */
20 /*  Description: Implementation for in-line functions used in dct.cpp           */
21 /*  Modified:                                                                   */
22 /*********************************************************************************/
23 #ifndef _SAD_INLINE_H_
24 #define _SAD_INLINE_H_
25 
26 #ifdef __cplusplus
27 extern "C"
28 {
29 #endif
30 
31 #if !defined(PV_ARM_GCC_V5) && !defined(PV_ARM_GCC_V4) /* ARM GNU COMPILER  */
32 
/*
 * Add the absolute difference of two values to a running SAD.
 *
 *   sad   running sum of absolute differences
 *   tmp   first value
 *   tmp2  second value
 *
 * Returns sad + |tmp - tmp2|.
 *
 * Declared static: a plain (non-static) inline definition in a header gives
 * no external definition under C99/C11 inline rules and a duplicate symbol
 * per translation unit under gnu89 rules.
 */
static __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
    const int32 diff = tmp - tmp2;

    /* a non-positive difference is subtracted, i.e. its magnitude is added */
    return (diff > 0) ? (sad + diff) : (sad - diff);
}
41 
/*
 * Byte-wise absolute difference of two packed words.
 *
 *   src1, src2  four 8-bit values packed into one 32-bit word each
 *   mask        lane flag mask; simd_sad_mb passes 0x80808080 (bit 7 of
 *               every byte)
 *
 * Returns a word whose byte k is |byte k of src1 - byte k of src2| when
 * mask is 0x80808080.
 *
 * All intermediate arithmetic is done on uint32: the original int32 form
 * overflowed on the "<< 8", the subtraction and the final add, which is
 * undefined behaviour for signed types.  The bit patterns are unchanged.
 */
static __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
    const uint32 a = (uint32)src1;
    const uint32 b = (uint32)src2;
    uint32 diff;
    uint32 flags;
    int32 fill;

    /* subtract the smaller word from the larger so no borrow leaves bit 31 */
    diff = (b >= a) ? (b - a) : (a - b);

    /* a ^ b ^ diff has a 1 wherever a borrow entered that bit; shifted down
       one bit and masked, bit 7 of a byte marks "this byte borrowed" */
    flags = (uint32)mask & (((a ^ b) ^ diff) >> 1);

    /* flags * 255, then >> 7, widens each flagged bit 7 into a full 0xFF.
       The shift stays on int32, exactly as in the original code. */
    fill = (int32)((flags << 8) - flags) >> 7;

    /* adding 0xFF returns the borrow to the next byte, XOR 0xFF negates */
    return (int32)((diff + (uint32)fill) ^ (uint32)fill);
}
63 
/*
 * sad_mb_offset.h is included three times, each time with a different
 * NUMBER / SHIFT pair: (3, 24), (2, 16) and (1, 8).
 * simd_sad_mb below calls sad_mb_offset3, sad_mb_offset2 and sad_mb_offset1;
 * presumably each inclusion generates the function for its NUMBER (the
 * header is not visible here -- confirm).
 */
64 #define NUMBER 3
65 #define SHIFT 24
66
67 #include "sad_mb_offset.h"
68
69 #undef NUMBER
70 #define NUMBER 2
71 #undef SHIFT
72 #define SHIFT 16
73 #include "sad_mb_offset.h"
74
75 #undef NUMBER
76 #define NUMBER 1
77 #undef SHIFT
78 #define SHIFT 8
79 #include "sad_mb_offset.h"
80 
81 
simd_sad_mb(UChar * ref,UChar * blk,Int dmin,Int lx)82     __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
83     {
84         int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
85 
86         x9 = 0x80808080; /* const. */
87 
88         x8 = (uintptr_t)ref & 0x3;
89         if (x8 == 3)
90             goto SadMBOffset3;
91         if (x8 == 2)
92             goto SadMBOffset2;
93         if (x8 == 1)
94             goto SadMBOffset1;
95 
96 //  x5 = (x4<<8)-x4; /* x5 = x4*255; */
97         x4 = x5 = 0;
98 
99         x6 = 0xFFFF00FF;
100 
101         ref -= lx;
102         blk -= 16;
103 
104         x8 = 16;
105 
106 LOOP_SAD0:
107         /****** process 8 pixels ******/
108         x10 = *((uint32*)(ref += lx));
109         x11 = *((uint32*)(ref + 4));
110         x12 = *((uint32*)(blk += 16));
111         x14 = *((uint32*)(blk + 4));
112 
113         /* process x11 & x14 */
114         x11 = sad_4pixel(x11, x14, x9);
115 
116         /* process x12 & x10 */
117         x10 = sad_4pixel(x10, x12, x9);
118 
119         x5 = x5 + x10; /* accumulate low bytes */
120         x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
121         x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
122         x5 = x5 + x11;  /* accumulate low bytes */
123         x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
124         x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
125 
126         /****** process 8 pixels ******/
127         x10 = *((uint32*)(ref + 8));
128         x11 = *((uint32*)(ref + 12));
129         x12 = *((uint32*)(blk + 8));
130         x14 = *((uint32*)(blk + 12));
131 
132         /* process x11 & x14 */
133         x11 = sad_4pixel(x11, x14, x9);
134 
135         /* process x12 & x10 */
136         x10 = sad_4pixel(x10, x12, x9);
137 
138         x5 = x5 + x10;  /* accumulate low bytes */
139         x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
140         x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
141         x5 = x5 + x11;  /* accumulate low bytes */
142         x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
143         x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
144 
145         /****************/
146         x10 = x5 - (x4 << 8); /* extract low bytes */
147         x10 = x10 + x4;     /* add with high bytes */
148         x10 = x10 + (x10 << 16); /* add with lower half word */
149 
150         if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
151         {
152             if (--x8)
153             {
154                 goto LOOP_SAD0;
155             }
156 
157         }
158 
159         return ((uint32)x10 >> 16);
160 
161 SadMBOffset3:
162 
163         return sad_mb_offset3(ref, blk, lx, dmin);
164 
165 SadMBOffset2:
166 
167         return sad_mb_offset2(ref, blk, lx, dmin);
168 
169 SadMBOffset1:
170 
171         return sad_mb_offset1(ref, blk, lx, dmin);
172 
173     }
174 
175 #elif defined(__CC_ARM)  /* only work with arm v5 */
176 
/*
 * ARM compiler (__CC_ARM) inline-assembler version of SUB_SAD.
 *
 * rsbs  computes tmp2 - tmp and sets the flags, rsbmi negates the result
 * when it is negative, add accumulates it.
 * Returns sad + |tmp2 - tmp|.
 *
 * NOTE(review): this #elif branch is only evaluated when the first #if of
 * the chain is false, i.e. when PV_ARM_GCC_V5 or PV_ARM_GCC_V4 is defined.
 */
177     __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
178     {
179         __asm
180         {
181             rsbs    tmp, tmp, tmp2 ;
182             rsbmi   tmp, tmp, #0 ;
183             add     sad, sad, tmp ;
184         }
185
186         return sad;
187     }
188 
/*
 * ARM compiler (__CC_ARM) inline-assembler version of sad_4pixel.
 *
 *   src1, src2  packed 32-bit words
 *   mask        ANDed with the shifted borrow pattern; simd_sad_mb passes
 *               0x80808080
 *
 * SUBS computes src2 - src1 on the whole word.  x7 = src1 ^ src2 ^ result,
 * shifted right by one and masked, flags the bytes that borrowed.  ORRCC
 * additionally sets bit 31 when the subtraction left the carry flag clear.
 * RSB turns x7 into x7 * 255, and (x7 asr #7) is then added to and XORed
 * with the difference.
 * Returns the corrected difference word.
 */
189     __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
190     {
191         int32 x7;
192
193         __asm
194         {
195             EOR     x7, src2, src1;     /* check odd/even combination */
196             SUBS    src1, src2, src1;
197             EOR     x7, x7, src1;
198             AND     x7, mask, x7, lsr #1;
199             ORRCC   x7, x7, #0x80000000;
200             RSB     x7, x7, x7, lsl #8;
201             ADD     src1, src1, x7, asr #7;   /* add 0xFF to the negative byte, add back carry */
202             EOR     src1, src1, x7, asr #7;   /* take absolute value of negative byte */
203         }
204
205         return src1;
206     }
207 
/*
 * ARM compiler (__CC_ARM) inline-assembler variant of sad_4pixel that ADDS
 * the two words instead of subtracting them.
 *
 * ADDS computes src2 + src1 and sets the carry flag.  x7 = src1 ^ src2 ^ sum
 * is rotated right through carry (rrx), so the carry of the addition lands
 * in bit 31, and is then masked; ANDS updates the flags again.  RSB turns
 * x7 into x7 * 255, and (x7 asr #7) is subtracted from and then XORed with
 * the sum.
 * Returns the corrected word.
 *
 * NOTE(review): presumably callers pass src1 in an already complemented or
 * negated form (hence the N suffix) -- confirm at the call sites, which are
 * not in this file.  The two trailing comments below were carried over from
 * sad_4pixel; the first of them sits on a SUB, not an ADD.
 */
208     __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
209     {
210         int32 x7;
211
212         __asm
213         {
214             EOR      x7, src2, src1;        /* check odd/even combination */
215             ADDS     src1, src2, src1;
216             EOR      x7, x7, src1;      /* only odd bytes need to add carry */
217             ANDS     x7, mask, x7, rrx;
218             RSB      x7, x7, x7, lsl #8;
219             SUB      src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
220             EOR      src1, src1, x7, asr #7; /* take absolute value of negative byte */
221         }
222
223         return src1;
224     }
225 
/*
 * sum_accumulate (ARM compiler version): statement macro that expects the
 * variables x4, x5, x6, x10 and x11 to exist at the point of use.
 *
 *   x5  -= x10 (subtract with carry), then x10 = x6 & ~x10, x4 += x10 >> 8
 *   x5  -= x11 (subtract with carry), then x11 = x6 & ~x11, x4 += x11 >> 8
 *
 * SBC reads the carry flag, so the result depends on the flags left by the
 * code that runs immediately before the macro.
 * NOTE(review): the macro is not used in this file; presumably it is used
 * by sad_mb_offset.h right after sad_4pixelN -- confirm there.
 */
226 #define sum_accumulate  __asm{      SBC      x5, x5, x10;  /* accumulate low bytes */ \
227         BIC      x10, x6, x10;   /* x10 & 0xFF00FF00 */ \
228         ADD      x4, x4, x10,lsr #8;   /* accumulate high bytes */ \
229         SBC      x5, x5, x11;    /* accumulate low bytes */ \
230         BIC      x11, x6, x11;   /* x11 & 0xFF00FF00 */ \
231         ADD      x4, x4, x11,lsr #8; } /* accumulate high bytes */
232 
233 
/*
 * sad_mb_offset.h is included three times with
 *   NUMBER 3, SHIFT 24, INC_X8 0x08000001
 *   NUMBER 2, SHIFT 16, INC_X8 0x10000001
 *   NUMBER 1, SHIFT  8, INC_X8 0x08000001
 * simd_sad_mb below calls sad_mb_offset3, sad_mb_offset2 and sad_mb_offset1
 * with x8 as a fifth argument; presumably each inclusion generates the
 * function for its NUMBER and INC_X8 is the per-row increment applied to
 * that counter (the header is not visible here -- confirm).
 */
234 #define NUMBER 3
235 #define SHIFT 24
236 #define INC_X8 0x08000001
237
238 #include "sad_mb_offset.h"
239
240 #undef NUMBER
241 #define NUMBER 2
242 #undef SHIFT
243 #define SHIFT 16
244 #undef INC_X8
245 #define INC_X8 0x10000001
246 #include "sad_mb_offset.h"
247
248 #undef NUMBER
249 #define NUMBER 1
250 #undef SHIFT
251 #define SHIFT 8
252 #undef INC_X8
253 #define INC_X8 0x08000001
254 #include "sad_mb_offset.h"
255 
256 
/*
 * ARM compiler (__CC_ARM) version of simd_sad_mb: sum of absolute
 * differences between rows of ref (stride lx) and rows of blk (stride 16),
 * 16 bytes per row, with early exit once the running SAD exceeds dmin.
 *
 * Alignment dispatch: MOVS x8, ref, lsl #31 leaves bit 0 of ref in bit 31
 * of x8 (N flag) and bit 1 of ref in the carry flag, so
 *   BHI (carry set, result non-zero)  -> (ref & 3) == 3
 *   BCS (carry set)                   -> (ref & 3) == 2
 *   BMI (negative)                    -> (ref & 3) == 1
 * and the matching sad_mb_offsetN function is called with x8 passed on.
 *
 * Aligned path: x8 is 0 here and serves as the row counter.  MVN sets x6 to
 * 0xFFFF00FF, so (x6 << 8) is 0xFF00FF00.  Each iteration handles bytes
 * 8..15 of the row with plain C loads, then bytes 0..7 with LDRs that also
 * post-increment ref by lx and blk by 16.  x5 accumulates the difference
 * words, x4 the bytes selected by 0xFF00FF00 shifted down by 8; the three
 * statements after the second accumulation fold them into a total in the
 * upper 16 bits of x10.
 *
 * Loop control: RSBS compares (x10 >> 16) with dmin.  While it is lower or
 * same, ADDLSS adds 0x10000001 to x8; BLS repeats the loop until that
 * addition carries out, which happens on the 16th addition.
 *
 * Returns (uint32)x10 >> 16.
 */
257     __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
258     {
259         int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
260
261         x9 = 0x80808080; /* const. */
262         x4 = x5 = 0;
263
264         __asm
265         {
266             MOVS    x8, ref, lsl #31 ;
267             BHI     SadMBOffset3;
268             BCS     SadMBOffset2;
269             BMI     SadMBOffset1;
270
271             MVN     x6, #0xFF00;
272         }
273 LOOP_SAD0:
274         /****** process 8 pixels ******/
275         x11 = *((int32*)(ref + 12));
276         x10 = *((int32*)(ref + 8));
277         x14 = *((int32*)(blk + 12));
278         x12 = *((int32*)(blk + 8));
279
280         /* process x11 & x14 */
281         x11 = sad_4pixel(x11, x14, x9);
282
283         /* process x12 & x10 */
284         x10 = sad_4pixel(x10, x12, x9);
285
286         x5 = x5 + x10;  /* accumulate low bytes */
287         x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
288         x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
289         x5 = x5 + x11;  /* accumulate low bytes */
290         x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
291         x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
292
293         __asm
294         {
295             /****** process 8 pixels ******/
296             LDR     x11, [ref, #4];
297             LDR     x10, [ref], lx ;
298             LDR     x14, [blk, #4];
299             LDR     x12, [blk], #16 ;
300         }
301
302         /* process x11 & x14 */
303         x11 = sad_4pixel(x11, x14, x9);
304
305         /* process x12 & x10 */
306         x10 = sad_4pixel(x10, x12, x9);
307
308         x5 = x5 + x10;  /* accumulate low bytes */
309         x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
310         x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
311         x5 = x5 + x11;  /* accumulate low bytes */
312         x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
313         x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
314
315         /****************/
316         x10 = x5 - (x4 << 8); /* extract low bytes */
317         x10 = x10 + x4;     /* add with high bytes */
318         x10 = x10 + (x10 << 16); /* add with lower half word */
319
320         __asm
321         {
322             /****************/
323             RSBS    x11, dmin, x10, lsr #16;
324             ADDLSS  x8, x8, #0x10000001;
325             BLS     LOOP_SAD0;
326         }
327
328         return ((uint32)x10 >> 16);
329
330 SadMBOffset3:
331
332         return sad_mb_offset3(ref, blk, lx, dmin, x8);
333
334 SadMBOffset2:
335
336         return sad_mb_offset2(ref, blk, lx, dmin, x8);
337
338 SadMBOffset1:
339
340         return sad_mb_offset1(ref, blk, lx, dmin, x8);
341     }
342 
343 
344 #elif ( defined(PV_ARM_GCC_V5) || defined(PV_ARM_GCC_V4) ) /* ARM GNU COMPILER  */
345 
346     __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
347     {
348         register int32 out;
349         register int32 temp1;
350         register int32 ss = sad;
351         register int32 tt = tmp;
352         register int32 uu = tmp2;
353 
354         asm volatile("rsbs  %1, %4, %3\n\t"
355                      "rsbmi %1, %1, #0\n\t"
356                      "add   %0, %2, %1"
357              : "=&r"(out),
358                      "=&r"(temp1)
359                              : "r"(ss),
360                              "r"(tt),
361                              "r"(uu));
362         return out;
363     }
364 
365     __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
366 {
367         register int32 out;
368         register int32 temp1;
369         register int32 s1 = src1;
370         register int32 s2 = src2;
371         register int32 mm = mask;
372 
373         asm volatile("eor   %0, %3, %2\n\t"
374                      "subs  %1, %3, %2\n\t"
375                      "eor   %0, %0, %1\n\t"
376                      "and   %0, %4, %0, lsr #1\n\t"
377                      "orrcc %0, %0, #0x80000000\n\t"
378                      "rsb   %0, %0, %0, lsl #8\n\t"
379                      "add   %1, %1, %0, asr #7\n\t"
380                      "eor   %1, %1, %0, asr #7"
381              : "=&r"(out),
382                      "=&r"(temp1)
383                              : "r"(s1),
384                              "r"(s2),
385                              "r"(mm));
386 
387         return temp1;
388     }
389 
390     __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
391 {
392         register int32 out;
393         register int32 temp1;
394         register int32 s1 = src1;
395         register int32 s2 = src2;
396         register int32 mm = mask;
397 
398         asm volatile("eor    %1, %3, %2\n\t"
399                      "adds   %0, %3, %2\n\t"
400                      "eor    %1, %1, %0\n\t"
401                      "ands   %1, %4, %1,rrx\n\t"
402                      "rsb    %1, %1, %1, lsl #8\n\t"
403                      "sub    %0, %0, %1, asr #7\n\t"
404                      "eor    %0, %0, %1, asr #7"
405              : "=&r"(out),
406                      "=&r"(temp1)
407                              : "r"(s1),
408                              "r"(s2),
409                              "r"(mm));
410 
411         return (out);
412     }
413 
/*
 * sum_accumulate (ARM GCC version): statement macro that expects the
 * variables x4, x5, x6, x10 and x11 to exist at the point of use.  The
 * expansion already ends with a semicolon.
 *
 *   x5  -= x10 (subtract with carry), then x10 = x6 & ~x10, x4 += x10 >> 8
 *   x5  -= x11 (subtract with carry), then x11 = x6 & ~x11, x4 += x11 >> 8
 *
 * sbc reads the carry flag, so the result depends on the flags left by the
 * code that runs immediately before the macro.
 * NOTE(review): the macro is not used in this file; presumably it is used
 * by sad_mb_offset.h right after sad_4pixelN -- confirm there.
 */
414 #define sum_accumulate asm volatile("sbc  %0, %0, %1\n\t" \
415                                 "bic  %1, %4, %1\n\t" \
416                                 "add  %2, %2, %1, lsr #8\n\t" \
417                                 "sbc  %0, %0, %3\n\t" \
418                                 "bic  %3, %4, %3\n\t" \
419                                 "add  %2, %2, %3, lsr #8" \
420                                 :"+r"(x5), "+r"(x10), "+r"(x4), "+r"(x11) \
421                                 :"r"(x6));
422 
/*
 * sad_mb_offset.h is included three times with
 *   NUMBER 3, SHIFT 24, INC_X8 0x08000001
 *   NUMBER 2, SHIFT 16, INC_X8 0x10000001
 *   NUMBER 1, SHIFT  8, INC_X8 0x08000001
 * simd_sad_mb below calls sad_mb_offset3, sad_mb_offset2 and sad_mb_offset1
 * with four arguments; presumably each inclusion generates the function for
 * its NUMBER (the header is not visible here -- confirm).
 */
423 #define NUMBER 3
424 #define SHIFT 24
425 #define INC_X8 0x08000001
426
427 #include "sad_mb_offset.h"
428
429 #undef NUMBER
430 #define NUMBER 2
431 #undef SHIFT
432 #define SHIFT 16
433 #undef INC_X8
434 #define INC_X8 0x10000001
435 #include "sad_mb_offset.h"
436
437 #undef NUMBER
438 #define NUMBER 1
439 #undef SHIFT
440 #define SHIFT 8
441 #undef INC_X8
442 #define INC_X8 0x08000001
443 #include "sad_mb_offset.h"
444 
445 
446     __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
447 {
448         int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
449 
450         x9 = 0x80808080; /* const. */
451         x4 = x5 = 0;
452 
453         x8 = (uint32)ref & 0x3;
454         if (x8 == 3)
455             goto SadMBOffset3;
456         if (x8 == 2)
457             goto SadMBOffset2;
458         if (x8 == 1)
459             goto SadMBOffset1;
460 
461 asm volatile("mvn %0, #0xFF00": "=r"(x6));
462 
463 LOOP_SAD0:
464         /****** process 8 pixels ******/
465         x11 = *((int32*)(ref + 12));
466         x10 = *((int32*)(ref + 8));
467         x14 = *((int32*)(blk + 12));
468         x12 = *((int32*)(blk + 8));
469 
470         /* process x11 & x14 */
471         x11 = sad_4pixel(x11, x14, x9);
472 
473         /* process x12 & x10 */
474         x10 = sad_4pixel(x10, x12, x9);
475 
476         x5 = x5 + x10;  /* accumulate low bytes */
477         x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
478         x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
479         x5 = x5 + x11;  /* accumulate low bytes */
480         x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
481         x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
482 
483         asm volatile("ldr  %0, [%4, #4]\n\t"
484                      "ldr  %1, [%4], %6\n\t"
485                      "ldr  %2, [%5, #4]\n\t"
486                      "ldr  %3, [%5], #16"
487              : "=r"(x11), "=r"(x10), "=r"(x14), "=r"(x12), "+r"(ref), "+r"(blk)
488                              : "r"(lx));
489 
490         /* process x11 & x14 */
491         x11 = sad_4pixel(x11, x14, x9);
492 
493         /* process x12 & x10 */
494         x10 = sad_4pixel(x10, x12, x9);
495 
496         x5 = x5 + x10;  /* accumulate low bytes */
497         x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
498         x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
499         x5 = x5 + x11;  /* accumulate low bytes */
500         x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
501         x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
502 
503         /****************/
504         x10 = x5 - (x4 << 8); /* extract low bytes */
505         x10 = x10 + x4;     /* add with high bytes */
506         x10 = x10 + (x10 << 16); /* add with lower half word */
507 
508         if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
509         {
510             if (--x8)
511             {
512                 goto LOOP_SAD0;
513             }
514 
515         }
516 
517         return ((uint32)x10 >> 16);
518 
519 SadMBOffset3:
520 
521         return sad_mb_offset3(ref, blk, lx, dmin);
522 
523 SadMBOffset2:
524 
525         return sad_mb_offset2(ref, blk, lx, dmin);
526 
527 SadMBOffset1:
528 
529         return sad_mb_offset1(ref, blk, lx, dmin);
530     }
531 
532 #endif // OS
533 
534 #ifdef __cplusplus
535 }
536 #endif
537 
538 #endif // _SAD_INLINE_H_
539 
540