1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20
21@/*
22@//----------------------------------------------------------------------------
23@// File Name            : impeg2_inter_pred.s
24@//
25@// Description          : This file has motion compensation related
26@//                        interpolation functions on Neon + CortexA-8 platform
27@//
28@// Reference Document   :
29@//
30@// Revision History     :
31@//      Date            Author                  Detail Description
32@//   ------------    ----------------    ----------------------------------
33@//   18 jun 2010     S Hamsalekha              Created
34@//
35@//-------------------------------------------------------------------------
36@*/
37
38@/*
39@// ----------------------------------------------------------------------------
40@// Include Files
41@// ----------------------------------------------------------------------------
42@*/
43.text
44.p2align 2
45
46
47@/*
48@// ----------------------------------------------------------------------------
49@// Struct/Union Types and Define
50@// ----------------------------------------------------------------------------
51@*/
52
53
54@/*
55@// ----------------------------------------------------------------------------
56@// Static Global Data section variables
57@// ----------------------------------------------------------------------------
58@*/
59@// -------------------------- NONE --------------------------------------------
60
61
62@/*
63@// ----------------------------------------------------------------------------
64@// Static Prototype Functions
65@// ----------------------------------------------------------------------------
66@*/
67@// -------------------------- NONE --------------------------------------------
68
69@/*
70@// ----------------------------------------------------------------------------
71@// Exported functions
72@// ----------------------------------------------------------------------------
73@*/
74
@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_copy_mb_a9q()
@//
@// Detail Description : Copies one macroblock from src to dst: 16 rows of
@//                      16 bytes for y, followed by 8 rows of 8 bytes for
@//                      each of u and v.
@//
@// Inputs             : r0 - pointer to src; the words at offsets 0, 4 and 8
@//                           are used as the y, u and v plane pointers
@//                      r1 - pointer to dst; same layout as src
@//                      r2 - source width (row step used for y)
@//                      r3 - destination width (row step used for y)
@// Registers Used     : r2 - r5, d0, d1
@//
@// Stack Usage        : 12 bytes
@//
@// Outputs            : 16x16 y, 8x8 u and 8x8 v bytes written through dst
@//
@// Return Data        : None
@//
@// Programming Note   : u and v rows are stepped by half of the y widths
@//                      (logical shift right by one).
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_copy_mb_a9q

impeg2_copy_mb_a9q:

    push            {r4, r5, lr}

    @ ---- y plane : 16 rows, 16 bytes per row ----
    ldr             r4, [r0, #0]        @ r4 = src->y
    ldr             r5, [r1, #0]        @ r5 = dst->y
    .rept           16
    vld1.8          {d0-d1}, [r4], r2   @ fetch one row, step src by r2
    vst1.8          {d0-d1}, [r5], r3   @ write one row, step dst by r3
    .endr

    @ chroma rows advance by half of the luma step
    mov             r2, r2, lsr #1
    mov             r3, r3, lsr #1

    @ ---- u plane : 8 rows, 8 bytes per row ----
    ldr             r4, [r0, #4]        @ r4 = src->u
    ldr             r5, [r1, #4]        @ r5 = dst->u
    .rept           8
    vld1.8          {d0}, [r4], r2
    vst1.8          {d0}, [r5], r3
    .endr

    @ ---- v plane : 8 rows, 8 bytes per row ----
    ldr             r4, [r0, #8]        @ r4 = src->v
    ldr             r5, [r1, #8]        @ r5 = dst->v
    .rept           8
    vld1.8          {d0}, [r4], r2
    vst1.8          {d0}, [r5], r3
    .endr

    pop             {r4, r5, pc}
192
193
194
195
@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_mc_fullx_halfy_8x8_a9q()
@//
@// Detail Description : Vertical half-pel prediction of an 8x8 block.
@//                      Nine consecutive reference rows of 8 bytes are read
@//                      and each output row is the rounded average of two
@//                      vertically adjacent reference rows:
@//                          out[r][c] = (ref[r][c] + ref[r+1][c] + 1) >> 1
@//
@// Inputs             : r0 - out    : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width
@//                      r3 - out_wid : Current Block Width
@//
@// Registers Used     : r14, d0-d7, d16, d17
@//
@// Stack Usage        : 4 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Only caller-saved NEON registers are used; d8-d15
@//                      are callee-saved under the AAPCS and are not touched.
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_mc_fullx_halfy_8x8_a9q

impeg2_mc_fullx_halfy_8x8_a9q:

    stmfd           r13!, {r14}
    add             r14, r1, r2         @// r14 -> reference row 2
    mov             r2, r2, lsl #1      @// r1 walks odd rows, r14 even rows

@/* Load 8 + 1 rows from the reference block */
@/* vrhadd.u8 is a rounding halving add: (a + b + 1) >> 1 per byte */
    vld1.8          {d0}, [r1], r2      @// ref row 1 = D0
    vld1.8          {d2}, [r14], r2     @// ref row 2 = D2
    vld1.8          {d4}, [r1], r2      @// ref row 3 = D4
    vld1.8          {d6}, [r14], r2     @// ref row 4 = D6
    vld1.8          {d1}, [r1], r2      @// ref row 5 = D1
    vld1.8          {d3}, [r14], r2     @// ref row 6 = D3
    vrhadd.u8       d17, d1, d6         @// out row 4 = D17 (needs D1 before it is overwritten)
    vld1.8          {d5}, [r1], r2      @// ref row 7 = D5
    vrhadd.u8       q0, q0, q1          @// out row 1 = D0, out row 5 = D1
    vld1.8          {d7}, [r14], r2     @// ref row 8 = D7
    vrhadd.u8       q1, q1, q2          @// out row 2 = D2, out row 6 = D3
    vld1.8          {d16}, [r1], r2     @// ref row 9 = D16
    vrhadd.u8       q2, q2, q3          @// out row 3 = D4, out row 7 = D5

    add             r14, r0, r3         @// r14 -> output row 2
    mov             r3, r3, lsl #1      @// r0 walks odd rows, r14 even rows

@/* Store the eight rows calculated above */
    vst1.8          {d2}, [r14], r3     @// out row 2
    vrhadd.u8       d7, d7, d16         @// out row 8 = D7
    vst1.8          {d0}, [r0], r3      @// out row 1
    vst1.8          {d17}, [r14], r3    @// out row 4
    vst1.8          {d4}, [r0], r3      @// out row 3
    vst1.8          {d3}, [r14], r3     @// out row 6
    vst1.8          {d1}, [r0], r3      @// out row 5
    vst1.8          {d7}, [r14], r3     @// out row 8
    vst1.8          {d5}, [r0], r3      @// out row 7

    ldmfd           sp!, {pc}
261
262
263
264
265
266
@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_mc_halfx_fully_8x8_a9q()
@//
@// Detail Description : Horizontal half-pel prediction of an 8x8 block.
@//                      Each output byte is the rounded average of two
@//                      horizontally adjacent reference bytes:
@//                          out[r][c] = (ref[r][c] + ref[r][c+1] + 1) >> 1
@//
@// Inputs             : r0 - out    : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width
@//                      r3 - out_wid : Current Block Width
@//
@// Registers Used     : r12, r14, d0-d7, d16-d18, d20-d22, d24-d26, d28-d30
@//
@// Stack Usage        : 8 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Every reference row is fetched as 16 bytes although
@//                      only the first 9 contribute to the result.
@//                      Only caller-saved NEON registers are used; d8-d15
@//                      are callee-saved under the AAPCS and are not touched.
@//-----------------------------------------------------------------------------
@*/



        .global impeg2_mc_halfx_fully_8x8_a9q



impeg2_mc_halfx_fully_8x8_a9q:

    stmfd           sp!, {r12, lr}

    add             r14, r1, r2, lsl #2 @r14 -> ref row5 (r1 handles rows 1-4)

    add             r12, r0, r3, lsl#2  @r12 -> out row5 (r0 handles rows 1-4)

    vld1.8          {d0, d1}, [r1], r2  @load 16 pixels of row1

    vld1.8          {d2, d3}, [r14], r2 @row5


    vld1.8          {d4, d5}, [r1], r2  @load 16 pixels of row2

    vld1.8          {d6, d7}, [r14], r2 @row6


    vext.8          d24, d0, d1, #1     @Extract pixels (1-8) of row1

    vext.8          d28, d2, d3, #1     @Extract pixels (1-8) of row5

    vext.8          d16, d4, d5, #1     @Extract pixels (1-8) of row2

    vext.8          d20, d6, d7, #1     @Extract pixels (1-8) of row6


    @ rows 3, 7, 4, 8 go into the upper halves of q12, q14, q8, q10 so that
    @ one q-wide vrhadd below handles two rows at a time

    vld1.8          {d25, d26}, [r1], r2 @load row3

    vld1.8          {d29, d30}, [r14], r2 @load row7

    vld1.8          {d17, d18}, [r1], r2 @load row4

    vld1.8          {d21, d22}, [r14], r2 @load row8


    vext.8          d1, d25, d26, #1    @Extract pixels (1-8) of row3

    vext.8          d3, d29, d30, #1    @Extract pixels (1-8) of row7

    vext.8          d5, d17, d18, #1    @Extract pixels (1-8) of row4

    vext.8          d7, d21, d22, #1    @Extract pixels (1-8) of row8


    vrhadd.u8       q0, q0, q12         @d0 = out row1, d1 = out row3

    vrhadd.u8       q1, q1, q14         @d2 = out row5, d3 = out row7

    vrhadd.u8       q2, q2, q8          @d4 = out row2, d5 = out row4

    vrhadd.u8       q3, q3, q10         @d6 = out row6, d7 = out row8

    vst1.8          d0, [r0], r3        @store row1

    vst1.8          d2, [r12], r3       @store row5

    vst1.8          d4, [r0], r3        @store row2

    vst1.8          d6, [r12], r3       @store row6

    vst1.8          d1, [r0], r3        @store row3

    vst1.8          d3, [r12], r3       @store row7

    vst1.8          d5, [r0], r3        @store row4

    vst1.8          d7, [r12], r3       @store row8



    ldmfd           sp!, {r12, pc}
377
378
379
380
381
382
383
384
@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_mc_halfx_halfy_8x8_a9q()
@//
@// Detail Description : Horizontal + vertical half-pel prediction of an 8x8
@//                      block. Each output byte is the rounded average of a
@//                      2x2 neighbourhood of reference bytes:
@//                          out[r][c] = (ref[r][c]   + ref[r][c+1] +
@//                                       ref[r+1][c] + ref[r+1][c+1] + 2) >> 2
@//
@// Inputs             : r0 - out    : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width
@//                      r3 - out_wid : Current Block Width
@//
@// Registers Used     : r14, q0-q15
@//
@// Stack Usage        : 68 bytes (4 for r14, 64 for d8-d15)
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Nine reference rows are fetched, 16 bytes each,
@//                      although only the first 9 bytes of a row are used.
@//                      d8-d15 are callee-saved under the AAPCS, so they are
@//                      saved on entry and restored before returning.
@//-----------------------------------------------------------------------------
@*/


        .global impeg2_mc_halfx_halfy_8x8_a9q

impeg2_mc_halfx_halfy_8x8_a9q:

    stmfd           sp!, {r14}

    vpush           {d8-d15}            @q4-q7 are used below; preserve them for the caller

    add             r14, r1, r2, lsl #2 @r14 -> ref row5 (r1 handles rows 1-4)

    vld1.8          {d0, d1}, [r1], r2  @load 16 pixels of row1

    vld1.8          {d2, d3}, [r14], r2 @row5

    vld1.8          {d4, d5}, [r1], r2  @load 16 pixels of row2

    vld1.8          {d6, d7}, [r14], r2 @row6

    vext.8          d1, d0, d1, #1      @Extract pixels (1-8) of row1

    vext.8          d3, d2, d3, #1      @Extract pixels (1-8) of row5

    vext.8          d5, d4, d5, #1      @Extract pixels (1-8) of row2

    vext.8          d7, d6, d7, #1      @Extract pixels (1-8) of row6


    vld1.8          {d8, d9}, [r1], r2  @load row3

    vld1.8          {d10, d11}, [r14], r2 @load row7

    vld1.8          {d12, d13}, [r1], r2 @load row4

    vld1.8          {d14, d15}, [r14], r2 @load row8

    vext.8          d9, d8, d9, #1      @Extract pixels (1-8) of row3

    vld1.8          {d16, d17}, [r14], r2 @load row9

    vext.8          d11, d10, d11, #1   @Extract pixels (1-8) of row7

    vext.8          d13, d12, d13, #1   @Extract pixels (1-8) of row4

    vext.8          d15, d14, d15, #1   @Extract pixels (1-8) of row8

    vext.8          d17, d16, d17, #1   @Extract pixels (1-8) of row9


    @interpolation in x direction: widen to 16 bit, qN = pixel + right neighbour

    vaddl.u8        q0, d0, d1          @operate row1

    vaddl.u8        q1, d2, d3          @operate row5

    vaddl.u8        q2, d4, d5          @operate row2

    vaddl.u8        q3, d6, d7          @operate row6

    vaddl.u8        q4, d8, d9          @operate row3

    vaddl.u8        q5, d10, d11        @operate row7

    vaddl.u8        q6, d12, d13        @operate row4

    vaddl.u8        q7, d14, d15        @operate row8

    vaddl.u8        q8, d16, d17        @operate row9

    @interpolation in y direction: add adjacent row sums, then (sum + 2) >> 2

    add             r14, r0, r3, lsl #2 @r14 -> out row5 (r0 handles rows 1-4)

    vadd.u16        q9, q0, q2          @operate row1 and row2

    vadd.u16        q13, q1, q3         @operate row5 and row6

    vadd.u16        q10, q2, q4         @operate row2 and row3

    vadd.u16        q14, q3, q5         @operate row6 and row7

    vrshrn.u16      d18, q9, #2         @row1

    vrshrn.u16      d26, q13, #2        @row5

    vrshrn.u16      d20, q10, #2        @row2

    vrshrn.u16      d28, q14, #2        @row6

    vadd.u16        q11, q4, q6         @operate row3 and row4

    vst1.8          d18, [r0], r3       @store row1

    vadd.u16        q15, q5, q7         @operate row7 and row8

    vst1.8          d26, [r14], r3      @store row5

    vadd.u16        q12, q6, q1         @operate row4 and row5

    vst1.8          d20, [r0], r3       @store row2

    vadd.u16        q7, q7, q8          @operate row8 and row9

    vst1.8          d28, [r14], r3      @store row6


    vrshrn.u16      d22, q11, #2        @row3

    vrshrn.u16      d30, q15, #2        @row7

    vrshrn.u16      d24, q12, #2        @row4

    vrshrn.u16      d14, q7, #2         @row8


    vst1.8          d22, [r0], r3       @store row3
    vst1.8          d30, [r14], r3      @store row7
    vst1.8          d24, [r0], r3       @store row4
    vst1.8          d14, [r14], r3      @store row8


    vpop            {d8-d15}            @restore the caller's q4-q7

    ldmfd           sp!, {pc}
552
553
554
555
556
@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_mc_fullx_fully_8x8_a9q()
@//
@// Detail Description : Copies an 8x8 block of bytes from the reference
@//                      block to the current block without any
@//                      interpolation.
@//
@// Inputs             : r0 - out    : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width
@//                      r3 - out_wid : Current Block Width
@//
@// Registers Used     : r12, r14, d0-d3
@//
@// Stack Usage        : 8 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Rows 1-4 are walked through r1/r0 and rows 5-8
@//                      through r14/r12, two rows from each half per pass.
@//-----------------------------------------------------------------------------
@*/


        .global impeg2_mc_fullx_fully_8x8_a9q
impeg2_mc_fullx_fully_8x8_a9q:

    push            {r12, lr}

    add             lr, r1, r2, lsl #2  @ lr  -> ref row5
    add             r12, r0, r3, lsl #2 @ r12 -> out row5

    @ ---- pass 1 : rows 1, 5, 2, 6 ----
    vld1.8          {d0}, [r1], r2      @ ref row1
    vld1.8          {d1}, [lr], r2      @ ref row5
    vld1.8          {d2}, [r1], r2      @ ref row2
    vld1.8          {d3}, [lr], r2      @ ref row6

    vst1.8          {d0}, [r0], r3      @ out row1
    vst1.8          {d1}, [r12], r3     @ out row5
    vst1.8          {d2}, [r0], r3      @ out row2
    vst1.8          {d3}, [r12], r3     @ out row6

    @ ---- pass 2 : rows 3, 7, 4, 8 ----
    vld1.8          {d0}, [r1], r2      @ ref row3
    vld1.8          {d1}, [lr], r2      @ ref row7
    vld1.8          {d2}, [r1], r2      @ ref row4
    vld1.8          {d3}, [lr], r2      @ ref row8

    vst1.8          {d0}, [r0], r3      @ out row3
    vst1.8          {d1}, [r12], r3     @ out row7
    vst1.8          {d2}, [r0], r3      @ out row4
    vst1.8          {d3}, [r12], r3     @ out row8

    pop             {r12, pc}
633
634
635
636
637
@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_interpolate_a9q()
@//
@// Detail Description : Averages two macroblock buffers with rounding,
@//                          dst = (src1 + src2 + 1) >> 1
@//                      for 16 rows of 16 y bytes and 8 rows of 8 bytes for
@//                      each of u and v.
@//
@// Inputs             : r0 - pointer to src1; the words at offsets 0, 4 and 8
@//                           are used as the y, u and v plane pointers
@//                      r1 - pointer to src2; same layout as src1
@//                      r2 - dest buf; same layout as src1
@//                      r3 - dst stride
@// Registers Used     : r3, r4, r5, r7, r12, d0-d7, d16-d23
@//
@// Stack Usage        : 20 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Both sources are read as contiguous bytes (16 per y
@//                      row, 8 per u/v row); only the destination is
@//                      strided, with u/v using stride >> 1.
@//                      Only caller-saved NEON registers are used; d8-d15
@//                      are callee-saved under the AAPCS and are not touched.
@//-----------------------------------------------------------------------------
@*/


        .global impeg2_interpolate_a9q


impeg2_interpolate_a9q:

    stmfd           r13!, {r4, r5, r7, r12, r14}

    ldr             r4, [r0, #0]        @ptr_y src1

    ldr             r5, [r1, #0]        @ptr_y src2

    ldr             r7, [r2, #0]        @ptr_y dst buf

    mov             r12, #4             @4 passes of 4 rows = 16 luma rows


interp_lumablocks_stride:

    vld1.8          {d0, d1}, [r4]!     @row1 src1

    vld1.8          {d2, d3}, [r4]!     @row2 src1

    vld1.8          {d4, d5}, [r4]!     @row3 src1

    vld1.8          {d6, d7}, [r4]!     @row4 src1


    vld1.8          {d16, d17}, [r5]!   @row1 src2

    vld1.8          {d18, d19}, [r5]!   @row2 src2

    vld1.8          {d20, d21}, [r5]!   @row3 src2

    vld1.8          {d22, d23}, [r5]!   @row4 src2


    vrhadd.u8       q0, q0, q8          @operate on row1

    vrhadd.u8       q1, q1, q9          @operate on row2

    vrhadd.u8       q2, q2, q10         @operate on row3

    vrhadd.u8       q3, q3, q11         @operate on row4


    vst1.8          {d0, d1}, [r7], r3  @row1

    vst1.8          {d2, d3}, [r7], r3  @row2

    vst1.8          {d4, d5}, [r7], r3  @row3

    vst1.8          {d6, d7}, [r7], r3  @row4

    subs            r12, r12, #1

    bne             interp_lumablocks_stride


    mov             r3, r3, lsr #1      @stride >> 1

    ldr             r4, [r0, #4]        @ptr_u src1

    ldr             r5, [r1, #4]        @ptr_u src2

    ldr             r7 , [r2, #4]       @ptr_u dst buf

    mov             r12, #2             @2 passes: first u, then v



@chroma blocks

interp_chromablocks_stride:

    vld1.8          {d0, d1}, [r4]!     @row1 & 2 src1

    vld1.8          {d2, d3}, [r4]!     @row3 & 4 src1

    vld1.8          {d4, d5}, [r4]!     @row5 & 6 src1

    vld1.8          {d6, d7}, [r4]!     @row7 & 8 src1


    vld1.8          {d16, d17}, [r5]!   @row1 & 2 src2

    vld1.8          {d18, d19}, [r5]!   @row3 & 4 src2

    vld1.8          {d20, d21}, [r5]!   @row5 & 6 src2

    vld1.8          {d22, d23}, [r5]!   @row7 & 8 src2


    vrhadd.u8       q0, q0, q8          @operate on row1 & 2

    vrhadd.u8       q1, q1, q9          @operate on row3 & 4

    vrhadd.u8       q2, q2, q10         @operate on row5 & 6

    vrhadd.u8       q3, q3, q11         @operate on row7 & 8


    vst1.8          {d0}, [r7], r3      @row1

    vst1.8          {d1}, [r7], r3      @row2

    vst1.8          {d2}, [r7], r3      @row3

    vst1.8          {d3}, [r7], r3      @row4

    vst1.8          {d4}, [r7], r3      @row5

    vst1.8          {d5}, [r7], r3      @row6

    vst1.8          {d6}, [r7], r3      @row7

    vst1.8          {d7}, [r7], r3      @row8


    @ set up the v plane for the second pass (reloaded, unused, after the last pass)

    ldr             r4, [r0, #8]        @ptr_v src1

    ldr             r5, [r1, #8]        @ptr_v src2

    ldr             r7, [r2, #8]        @ptr_v dst buf

    subs            r12, r12, #1

    bne             interp_chromablocks_stride


    ldmfd           r13!, {r4, r5, r7, r12, pc}
797
798
799
800
801
802