1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20
21///*
22////----------------------------------------------------------------------------
23//// File Name            : impeg2_inter_pred.s
24////
//// Description          : This file has motion compensation related
////                        interpolation functions for AArch64 (ARMv8-A)
////                        using Advanced SIMD (NEON)
27////
28//// Reference Document   :
29////
30//// Revision History     :
31////      Date            Author                  Detail Description
32////   ------------    ----------------    ----------------------------------
33////   18 jun 2010      S Hamsalekha              Created
34////
35////-------------------------------------------------------------------------
36//*/
37
38///*
39//// ----------------------------------------------------------------------------
40//// Include Files
41//// ----------------------------------------------------------------------------
42//*/
43//              PRESERVE8
44.text
45.include "impeg2_neon_macros.s"
46
47///*
48//// ----------------------------------------------------------------------------
49//// Struct/Union Types and Define
50//// ----------------------------------------------------------------------------
51//*/
52
53
54///*
55//// ----------------------------------------------------------------------------
56//// Static Global Data section variables
57//// ----------------------------------------------------------------------------
58//*/
59//// -------------------------- NONE --------------------------------------------
60
61
62///*
63//// ----------------------------------------------------------------------------
64//// Static Prototype Functions
65//// ----------------------------------------------------------------------------
66//*/
67//// -------------------------- NONE --------------------------------------------
68
69///*
70//// ----------------------------------------------------------------------------
71//// Exported functions
72//// ----------------------------------------------------------------------------
73//*/
74
75
///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_copy_mb_av8()
////
//// Detail Description : Copies one macroblock from src to dst: 16 rows of
////                      16 luma bytes, then 8 rows of 8 bytes for u and
////                      8 rows of 8 bytes for v.
////
//// Inputs             : x0 - pointer to the src descriptor; the y, u and v
////                           plane pointers are read from byte offsets
////                           0, 8 and 16
////                      x1 - pointer to the dst descriptor, read with the
////                           same offsets as src
////                      x2 - source luma stride in bytes (halved for chroma)
////                      x3 - destination luma stride in bytes (halved for
////                           chroma)
////
//// Registers Used     : x2-x5, v0, v1 (all caller-saved under AAPCS64)
////
//// Stack Usage        : None
////
//// Outputs            : The dst planes hold a copy of the src macroblock
////
//// Return Data        : None
////
//// Programming Note   : x2 and x3 are consumed as full 64-bit registers.
////                      NOTE(review): if the C prototype declares the
////                      strides as 32-bit values, confirm that the callers
////                      pass them zero-extended.
////-----------------------------------------------------------------------------
//*/

.global impeg2_copy_mb_av8

impeg2_copy_mb_av8:

    // Only caller-saved registers are written in this routine, so the
    // callee-saved vector registers do not have to be spilled to the stack.

    ldr             x4, [x0]            // x4 = src->y
    ldr             x5, [x1]            // x5 = dst->y

    // Luma: 16 rows of 16 bytes. Each pass copies one row and steps both
    // pointers by their own stride.
    .rept 16
    ld1             {v0.8b, v1.8b}, [x4], x2
    st1             {v0.8b, v1.8b}, [x5], x3
    .endr

    // The chroma planes are walked with half of the luma stride.
    lsr             x2, x2, #1
    lsr             x3, x3, #1

    ldr             x4, [x0, #8]        // x4 = src->u
    ldr             x5, [x1, #8]        // x5 = dst->u

    // U: 8 rows of 8 bytes
    .rept 8
    ld1             {v0.8b}, [x4], x2
    st1             {v0.8b}, [x5], x3
    .endr

    ldr             x4, [x0, #16]       // x4 = src->v
    ldr             x5, [x1, #16]       // x5 = dst->v

    // V: 8 rows of 8 bytes
    .rept 8
    ld1             {v0.8b}, [x4], x2
    st1             {v0.8b}, [x5], x3
    .endr

    ret
200
201
///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_mc_fullx_halfy_8x8_av8()
////
//// Detail Description : Builds an 8x8 prediction block with a vertical
////                      half-pel offset. Nine reference rows of 8 bytes are
////                      read and each output row is the rounded average of
////                      two vertically adjacent reference rows:
////                      out[r][c] = (ref[r][c] + ref[r+1][c] + 1) >> 1
////
//// Inputs             : x0 - out     : Current Block Pointer
////                      x1 - ref     : Reference Block Pointer
////                      x2 - ref_wid : Reference Block Width (row stride)
////                      x3 - out_wid : Current Block Width (row stride)
////
//// Registers Used     : x14, v0-v9 (v8 and v9 are preserved through
////                      push_v_regs / pop_v_regs)
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : The Motion Compensated Block
////
//// Return Data        : None
////
//// Programming Note   : All reference rows are read before the first
////                      output row is written.
////-----------------------------------------------------------------------------
//*/

.global impeg2_mc_fullx_halfy_8x8_av8

impeg2_mc_fullx_halfy_8x8_av8:

    push_v_regs                         // v8 and v9 are written below

    // Two read pointers, each stepping two rows at a time:
    // x1 visits rows 1,3,5,7,9 and x14 visits rows 2,4,6,8.
    add             x14, x1, x2
    lsl             x2, x2, #1

    ld1             {v0.8b}, [x1], x2   // ref row 1
    ld1             {v2.8b}, [x14], x2  // ref row 2
    ld1             {v4.8b}, [x1], x2   // ref row 3
    ld1             {v6.8b}, [x14], x2  // ref row 4
    ld1             {v1.8b}, [x1], x2   // ref row 5
    ld1             {v3.8b}, [x14], x2  // ref row 6
    ld1             {v5.8b}, [x1], x2   // ref row 7
    ld1             {v7.8b}, [x14], x2  // ref row 8
    ld1             {v8.8b}, [x1], x2   // ref row 9

    // Rounded averages of neighbouring rows. The order matters: each source
    // register is consumed by the row above before it is overwritten with
    // its own result.
    urhadd          v9.8b, v1.8b, v6.8b // out row 4 = avg(ref 5, ref 4)
    urhadd          v0.8b, v0.8b, v2.8b // out row 1 = avg(ref 1, ref 2)
    urhadd          v1.8b, v1.8b, v3.8b // out row 5 = avg(ref 5, ref 6)
    urhadd          v2.8b, v2.8b, v4.8b // out row 2 = avg(ref 2, ref 3)
    urhadd          v3.8b, v3.8b, v5.8b // out row 6 = avg(ref 6, ref 7)
    urhadd          v4.8b, v4.8b, v6.8b // out row 3 = avg(ref 3, ref 4)
    urhadd          v5.8b, v5.8b, v7.8b // out row 7 = avg(ref 7, ref 8)
    urhadd          v7.8b, v7.8b, v8.8b // out row 8 = avg(ref 8, ref 9)

    // Two write pointers with the same odd/even split as the reads:
    // x0 writes rows 1,3,5,7 and x14 writes rows 2,4,6,8.
    add             x14, x0, x3
    lsl             x3, x3, #1

    st1             {v2.8b}, [x14], x3  // out row 2
    st1             {v0.8b}, [x0], x3   // out row 1
    st1             {v9.8b}, [x14], x3  // out row 4
    st1             {v4.8b}, [x0], x3   // out row 3
    st1             {v3.8b}, [x14], x3  // out row 6
    st1             {v1.8b}, [x0], x3   // out row 5
    st1             {v7.8b}, [x14], x3  // out row 8
    st1             {v5.8b}, [x0], x3   // out row 7

    pop_v_regs
    ret
273
274
275
276
277
///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_mc_halfx_fully_8x8_av8()
////
//// Detail Description : Builds an 8x8 prediction block with a horizontal
////                      half-pel offset. Each output byte is the rounded
////                      average of two horizontally adjacent reference
////                      bytes:
////                      out[r][c] = (ref[r][c] + ref[r][c+1] + 1) >> 1
////
//// Inputs             : x0 - out     : Current Block Pointer
////                      x1 - ref     : Reference Block Pointer
////                      x2 - ref_wid : Reference Block Width (row stride)
////                      x3 - out_wid : Current Block Width (row stride)
////
//// Registers Used     : x12, x14, v0-v10, v12-v14, v16-v18, v20-v22
////                      (v8-v15 are preserved through push_v_regs /
////                      pop_v_regs)
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : The Motion Compensated Block
////
//// Return Data        : None
////
//// Programming Note   : Every row load reads 16 bytes from ref although
////                      only the first 9 bytes take part in the result.
////-----------------------------------------------------------------------------
//*/

.global impeg2_mc_halfx_fully_8x8_av8

impeg2_mc_halfx_fully_8x8_av8:

    push_v_regs                         // v8-v10 and v12-v14 are written below

    // The block is handled as two halves: x1/x0 walk rows 1-4 and
    // x14/x12 walk rows 5-8.
    add             x14, x1, x2, lsl #2 // x14 = ref row 5
    add             x12, x0, x3, lsl #2 // x12 = out row 5

    // Row r is loaded as bytes 0-7 in the first register of the pair and
    // bytes 8-15 in the second one.
    ld1             {v0.8b, v1.8b}, [x1], x2    // ref row 1
    ld1             {v2.8b, v3.8b}, [x14], x2   // ref row 5
    ld1             {v4.8b, v5.8b}, [x1], x2    // ref row 2
    ld1             {v6.8b, v7.8b}, [x14], x2   // ref row 6
    ld1             {v9.8b, v10.8b}, [x1], x2   // ref row 3
    ld1             {v13.8b, v14.8b}, [x14], x2 // ref row 7
    ld1             {v17.8b, v18.8b}, [x1], x2  // ref row 4
    ld1             {v21.8b, v22.8b}, [x14], x2 // ref row 8

    // Bytes 1-8 of every row, i.e. the right-hand neighbours. The first four
    // extracts read v1/v3/v5/v7 before the last four reuse them as
    // destinations.
    ext             v8.8b, v0.8b, v1.8b, #1     // row 1 shifted by one byte
    ext             v12.8b, v2.8b, v3.8b, #1    // row 5 shifted by one byte
    ext             v16.8b, v4.8b, v5.8b, #1    // row 2 shifted by one byte
    ext             v20.8b, v6.8b, v7.8b, #1    // row 6 shifted by one byte
    ext             v1.8b, v9.8b, v10.8b, #1    // row 3 shifted by one byte
    ext             v3.8b, v13.8b, v14.8b, #1   // row 7 shifted by one byte
    ext             v5.8b, v17.8b, v18.8b, #1   // row 4 shifted by one byte
    ext             v7.8b, v21.8b, v22.8b, #1   // row 8 shifted by one byte

    // Rounded average of each byte with its right-hand neighbour.
    urhadd          v0.8b, v0.8b, v8.8b         // out row 1
    urhadd          v1.8b, v1.8b, v9.8b         // out row 3
    urhadd          v2.8b, v2.8b, v12.8b        // out row 5
    urhadd          v3.8b, v3.8b, v13.8b        // out row 7
    urhadd          v4.8b, v4.8b, v16.8b        // out row 2
    urhadd          v5.8b, v5.8b, v17.8b        // out row 4
    urhadd          v6.8b, v6.8b, v20.8b        // out row 6
    urhadd          v7.8b, v7.8b, v21.8b        // out row 8

    st1             {v0.8b}, [x0], x3           // out row 1
    st1             {v2.8b}, [x12], x3          // out row 5
    st1             {v4.8b}, [x0], x3           // out row 2
    st1             {v6.8b}, [x12], x3          // out row 6
    st1             {v1.8b}, [x0], x3           // out row 3
    st1             {v3.8b}, [x12], x3          // out row 7
    st1             {v5.8b}, [x0], x3           // out row 4
    st1             {v7.8b}, [x12], x3          // out row 8

    pop_v_regs
    ret
394
395
396
397
398
399
400
///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_mc_halfx_halfy_8x8_av8()
////
//// Detail Description : Builds an 8x8 prediction block with a half-pel
////                      offset in both directions. Each output byte is the
////                      rounded average of a 2x2 group of reference bytes:
////                      out[r][c] = (ref[r][c]   + ref[r][c+1] +
////                                   ref[r+1][c] + ref[r+1][c+1] + 2) >> 2
////
//// Inputs             : x0 - out     : Current Block Pointer
////                      x1 - ref     : Reference Block Pointer
////                      x2 - ref_wid : Reference Block Width (row stride)
////                      x3 - out_wid : Current Block Width (row stride)
////
//// Registers Used     : x14, v0-v18, v20, v22, v24, v26, v28, v30
////                      (v8-v15 are preserved through push_v_regs /
////                      pop_v_regs)
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : The Motion Compensated Block
////
//// Return Data        : None
////
//// Programming Note   : Nine reference rows are read, 16 bytes each,
////                      although only the first 9 bytes of a row take part
////                      in the result. All reads happen before the first
////                      write.
////-----------------------------------------------------------------------------
//*/

.global impeg2_mc_halfx_halfy_8x8_av8

impeg2_mc_halfx_halfy_8x8_av8:

    push_v_regs                         // v8-v15 are written below

    add             x14, x1, x2, lsl #2 // x14 = ref row 5

    // x1 walks ref rows 1-4, x14 walks ref rows 5-9.
    ld1             {v0.8b, v1.8b}, [x1], x2    // ref row 1
    ld1             {v2.8b, v3.8b}, [x14], x2   // ref row 5
    ld1             {v4.8b, v5.8b}, [x1], x2    // ref row 2
    ld1             {v6.8b, v7.8b}, [x14], x2   // ref row 6
    ld1             {v8.8b, v9.8b}, [x1], x2    // ref row 3
    ld1             {v10.8b, v11.8b}, [x14], x2 // ref row 7
    ld1             {v12.8b, v13.8b}, [x1], x2  // ref row 4
    ld1             {v14.8b, v15.8b}, [x14], x2 // ref row 8
    ld1             {v16.8b, v17.8b}, [x14], x2 // ref row 9

    // Horizontal step, one row at a time: form bytes 1-8 of the row and add
    // them to bytes 0-7, widening to 16 bits so the sums cannot overflow.
    ext             v1.8b, v0.8b, v1.8b, #1
    uaddl           v0.8h, v0.8b, v1.8b         // h-sum of ref row 1

    ext             v3.8b, v2.8b, v3.8b, #1
    uaddl           v2.8h, v2.8b, v3.8b         // h-sum of ref row 5

    ext             v5.8b, v4.8b, v5.8b, #1
    uaddl           v4.8h, v4.8b, v5.8b         // h-sum of ref row 2

    ext             v7.8b, v6.8b, v7.8b, #1
    uaddl           v6.8h, v6.8b, v7.8b         // h-sum of ref row 6

    ext             v9.8b, v8.8b, v9.8b, #1
    uaddl           v8.8h, v8.8b, v9.8b         // h-sum of ref row 3

    ext             v11.8b, v10.8b, v11.8b, #1
    uaddl           v10.8h, v10.8b, v11.8b      // h-sum of ref row 7

    ext             v13.8b, v12.8b, v13.8b, #1
    uaddl           v12.8h, v12.8b, v13.8b      // h-sum of ref row 4

    ext             v15.8b, v14.8b, v15.8b, #1
    uaddl           v14.8h, v14.8b, v15.8b      // h-sum of ref row 8

    ext             v17.8b, v16.8b, v17.8b, #1
    uaddl           v16.8h, v16.8b, v17.8b      // h-sum of ref row 9

    // Vertical step: add the h-sums of two consecutive rows. v14 is reused
    // as the destination for row 8, so it is written last, after the row 7
    // sum has consumed it.
    add             v18.8h, v0.8h, v4.8h        // rows 1 + 2
    add             v20.8h, v4.8h, v8.8h        // rows 2 + 3
    add             v22.8h, v8.8h, v12.8h       // rows 3 + 4
    add             v24.8h, v12.8h, v2.8h       // rows 4 + 5
    add             v26.8h, v2.8h, v6.8h        // rows 5 + 6
    add             v28.8h, v6.8h, v10.8h       // rows 6 + 7
    add             v30.8h, v10.8h, v14.8h      // rows 7 + 8
    add             v14.8h, v14.8h, v16.8h      // rows 8 + 9

    // Divide the four-sample sums by 4 with rounding and narrow to bytes.
    rshrn           v18.8b, v18.8h, #2          // out row 1
    rshrn           v20.8b, v20.8h, #2          // out row 2
    rshrn           v22.8b, v22.8h, #2          // out row 3
    rshrn           v24.8b, v24.8h, #2          // out row 4
    rshrn           v26.8b, v26.8h, #2          // out row 5
    rshrn           v28.8b, v28.8h, #2          // out row 6
    rshrn           v30.8b, v30.8h, #2          // out row 7
    rshrn           v14.8b, v14.8h, #2          // out row 8

    add             x14, x0, x3, lsl #2 // x14 = out row 5

    // x0 writes out rows 1-4, x14 writes out rows 5-8.
    st1             {v18.8b}, [x0], x3          // out row 1
    st1             {v26.8b}, [x14], x3         // out row 5
    st1             {v20.8b}, [x0], x3          // out row 2
    st1             {v28.8b}, [x14], x3         // out row 6
    st1             {v22.8b}, [x0], x3          // out row 3
    st1             {v30.8b}, [x14], x3         // out row 7
    st1             {v24.8b}, [x0], x3          // out row 4
    st1             {v14.8b}, [x14], x3         // out row 8

    pop_v_regs
    ret
571
572
573
574
///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_mc_fullx_fully_8x8_av8()
////
//// Detail Description : Copies an 8x8 block from the reference buffer to
////                      the current frame buffer without any
////                      interpolation.
////
//// Inputs             : x0 - out     : Current Block Pointer
////                      x1 - ref     : Reference Block Pointer
////                      x2 - ref_wid : Reference Block Width (row stride)
////                      x3 - out_wid : Current Block Width (row stride)
////
//// Registers Used     : x12, x14, v0-v3 (all caller-saved under AAPCS64)
////
//// Stack Usage        : None
////
//// Outputs            : The Motion Compensated Block
////
//// Return Data        : None
////
//// Programming Note   : Rows are moved in two passes of four rows each;
////                      within a pass the four loads are issued before the
////                      four stores.
////-----------------------------------------------------------------------------
//*/

.global impeg2_mc_fullx_fully_8x8_av8
impeg2_mc_fullx_fully_8x8_av8:

    // Only caller-saved registers are written in this routine, so the
    // callee-saved vector registers do not have to be spilled to the stack.

    // The block is handled as two halves: x1/x0 walk rows 1-4 and
    // x14/x12 walk rows 5-8.
    add             x14, x1, x2, lsl #2 // x14 = ref row 5
    add             x12, x0, x3, lsl #2 // x12 = out row 5

    // Pass 1 moves rows 1, 5, 2, 6; pass 2 moves rows 3, 7, 4, 8.
    .rept 2
    ld1             {v0.8b}, [x1], x2   // next row of the upper half
    ld1             {v1.8b}, [x14], x2  // next row of the lower half
    ld1             {v2.8b}, [x1], x2   // following row of the upper half
    ld1             {v3.8b}, [x14], x2  // following row of the lower half

    st1             {v0.8b}, [x0], x3
    st1             {v1.8b}, [x12], x3
    st1             {v2.8b}, [x0], x3
    st1             {v3.8b}, [x12], x3
    .endr

    ret
654
655
656
657
///*
////---------------------------------------------------------------------------
//// Function Name      :   impeg2_interpolate_av8()
////
//// Detail Description : Averages two macroblock predictions with rounding,
////                      dst = (src1 + src2 + 1) >> 1, for a 16x16 luma
////                      block and two 8x8 chroma blocks (u, then v).
////
//// Inputs             : x0 - pointer to the src1 descriptor; the y, u and v
////                           plane pointers are read from byte offsets
////                           0, 8 and 16
////                      x1 - pointer to the src2 descriptor, read with the
////                           same offsets
////                      x2 - pointer to the dest buf descriptor, read with
////                           the same offsets
////                      x3 - dst luma stride in bytes (halved for chroma)
////
//// Registers Used     : x3-x5, x7, x12, v0-v15 (v8-v15 are preserved
////                      through push_v_regs / pop_v_regs)
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : The Motion Compensated Block
////
//// Return Data        : None
////
//// Programming Note   : Both sources are read as packed blocks (16 bytes
////                      per luma row, 8 bytes per chroma row); only the
////                      destination is addressed through a stride.
////-----------------------------------------------------------------------------
//*/

.global impeg2_interpolate_av8

impeg2_interpolate_av8:

    push_v_regs                         // v8-v15 are written below

    ldr             x4, [x0, #0]        // x4 = src1 y pointer
    ldr             x5, [x1, #0]        // x5 = src2 y pointer
    ldr             x7, [x2, #0]        // x7 = dst  y pointer

    mov             x12, #4             // 4 passes of 4 rows = 16 luma rows

.Linterp_luma_loop:
    // Four 16-byte rows from each source
    ld1             {v0.16b}, [x4], #16
    ld1             {v2.16b}, [x4], #16
    ld1             {v4.16b}, [x4], #16
    ld1             {v6.16b}, [x4], #16

    ld1             {v8.16b}, [x5], #16
    ld1             {v10.16b}, [x5], #16
    ld1             {v12.16b}, [x5], #16
    ld1             {v14.16b}, [x5], #16

    // Rounded average of the two sources, row by row
    urhadd          v0.16b, v0.16b, v8.16b
    urhadd          v2.16b, v2.16b, v10.16b
    urhadd          v4.16b, v4.16b, v12.16b
    urhadd          v6.16b, v6.16b, v14.16b

    st1             {v0.16b}, [x7], x3
    st1             {v2.16b}, [x7], x3
    st1             {v4.16b}, [x7], x3
    st1             {v6.16b}, [x7], x3

    subs            x12, x12, #1
    bne             .Linterp_luma_loop

    lsr             x3, x3, #1          // chroma uses half of the luma stride

    ldr             x4, [x0, #8]        // x4 = src1 u pointer
    ldr             x5, [x1, #8]        // x5 = src2 u pointer
    ldr             x7, [x2, #8]        // x7 = dst  u pointer

    mov             x12, #2             // first pass handles u, second pass v

.Linterp_chroma_loop:
    // Eight 8-byte rows from each source, two rows per load
    ld1             {v0.8b, v1.8b}, [x4], #16   // src1 rows 1, 2
    ld1             {v2.8b, v3.8b}, [x4], #16   // src1 rows 3, 4
    ld1             {v4.8b, v5.8b}, [x4], #16   // src1 rows 5, 6
    ld1             {v6.8b, v7.8b}, [x4], #16   // src1 rows 7, 8

    ld1             {v8.8b, v9.8b}, [x5], #16   // src2 rows 1, 2
    ld1             {v10.8b, v11.8b}, [x5], #16 // src2 rows 3, 4
    ld1             {v12.8b, v13.8b}, [x5], #16 // src2 rows 5, 6
    ld1             {v14.8b, v15.8b}, [x5], #16 // src2 rows 7, 8

    // Rounded average of the two sources, one register per row
    urhadd          v0.8b, v0.8b, v8.8b         // row 1
    urhadd          v1.8b, v1.8b, v9.8b         // row 2
    urhadd          v2.8b, v2.8b, v10.8b        // row 3
    urhadd          v3.8b, v3.8b, v11.8b        // row 4
    urhadd          v4.8b, v4.8b, v12.8b        // row 5
    urhadd          v5.8b, v5.8b, v13.8b        // row 6
    urhadd          v6.8b, v6.8b, v14.8b        // row 7
    urhadd          v7.8b, v7.8b, v15.8b        // row 8

    st1             {v0.8b}, [x7], x3
    st1             {v1.8b}, [x7], x3
    st1             {v2.8b}, [x7], x3
    st1             {v3.8b}, [x7], x3
    st1             {v4.8b}, [x7], x3
    st1             {v5.8b}, [x7], x3
    st1             {v6.8b}, [x7], x3
    st1             {v7.8b}, [x7], x3

    // Switch to the v plane for the second pass. These loads are also
    // executed after the final pass, where their results are not used.
    ldr             x4, [x0, #16]       // x4 = src1 v pointer
    ldr             x5, [x1, #16]       // x5 = src2 v pointer
    ldr             x7, [x2, #16]       // x7 = dst  v pointer

    subs            x12, x12, #1
    bne             .Linterp_chroma_loop

    pop_v_regs
    ret
811
812
813
814
815