///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
// *******************************************************************************
// * @file
// *  ihevc_itrans_recon_8x8_neon.s
// *
// * @brief
// *  contains function definitions for single stage inverse transform
// *
// * @author
// *  anand s
// *
// * @par list of functions:
// *  - ihevc_itrans_recon_8x8()
// *
// * @remarks
// *  none
// *
// *******************************************************************************
//*/

///**
// *******************************************************************************
// *
// * @brief
// *  this function performs inverse transform and reconstruction for an 8x8
// * input block
// *
// * @par description:
// *  performs inverse transform and adds the prediction data and clips output
// * to 8 bit
// *
// * @param[in] pi2_src
// *  input 8x8 coefficients
// *
// * @param[in] pi2_tmp
// *  temporary 8x8 buffer for storing inverse
// *  transform 1st stage output
// *
// * @param[in] pu1_pred
// *  prediction 8x8 block
// *
// * @param[out] pu1_dst
// *  output 8x8 block
// *
// * @param[in] src_strd
// *  input stride
// *
// * @param[in] pred_strd
// *  prediction stride
// *
// * @param[in] dst_strd
// *  output stride
// *
// * @param[in] shift
// *  output shift
// *
// * @param[in] zero_cols
// *  zero columns in pi2_src
// *
// * @returns  void
// *
// * @remarks
// *  none
// *
// *******************************************************************************
// */

//void ihevc_itrans_recon_8x8(word16 *pi2_src,
//                            word16 *pi2_tmp,
//                            uword8 *pu1_pred,
//                            uword8 *pu1_dst,
//                            word32 src_strd,
//                            word32 pred_strd,
//                            word32 dst_strd,
//                            word32 zero_cols,
//                            word32 zero_rows)

//**************variables vs registers*************************
//    x0 => *pi2_src
//    x1 => *pi2_tmp
//    x2 => *pu1_pred
//    x3 => *pu1_dst
//    src_strd
//    pred_strd
//    dst_strd
//    zero_cols

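//**************operation overview (informal)*******************
//    hedged outline only, not the reference C code: the routine performs a
//    first-stage 8-point inverse transform along one direction of the 8x8
//    block, transposes the intermediate values in registers, performs the
//    second-stage transform along the other direction, then adds the
//    prediction block and clips to 8 bit.  very roughly, in C (all names
//    below are illustrative placeholders):
//
//        stage1(pi2_src, tmp);               /* rounding shift by 7  */
//        transpose_8x8(tmp);
//        stage2(tmp, residue);               /* rounding shift by 12 */
//        for(r = 0; r < 8; r++)
//            for(c = 0; c < 8; c++)
//                pu1_dst[r * dst_strd + c] =
//                    CLIP_U8(pu1_pred[r * pred_strd + c] + residue[r * 8 + c]);
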


.text
.align 4
.include "ihevc_neon_macros.s"



.set width_x_size_x5 ,   40
.set width_x_size_x2 ,   32
.set shift_stage1_idct ,   7
.set shift_stage2_idct ,   12
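
// shift_stage1_idct/shift_stage2_idct feed the sqrshrn instructions below,
// which perform a rounding right shift with a saturating narrow.  as a C
// sketch (illustrative only), each narrowed value is
//
//        res = clip_s16((x + (1 << (shift - 1))) >> shift);
//
// with shift = 7 after the first stage and shift = 12 after the second.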

.globl ihevc_itrans_recon_8x8_av8

.extern g_ai2_ihevc_trans_8_transpose

.type ihevc_itrans_recon_8x8_av8, %function

ihevc_itrans_recon_8x8_av8:
////register usage.extern        - loading and until idct of columns
////    cosine constants     -     d0
////    sine constants         -     d1
////    row 0 first half     -     d2        -    y0
////    row 1 first half     -     d6        -    y1
////    row 2 first half     -     d3        -    y2
////    row 3 first half     -     d7        -    y3
////    row 4 first half     -     d10        -    y4
////    row 5 first half     -     d14        -    y5
////    row 6 first half     -     d11        -    y6
////    row 7 first half     -     d15        -    y7

////    row 0 second half    -     d4        -    y0
////    row 1 second half    -     d8      -    y1
////    row 2 second half    -     d5      -    y2
////    row 3 second half    -     d9      -    y3
////    row 4 second half    -     d12     -    y4
////    row 5 second half    -     d16     -    y5
////    row 6 second half    -     d13     -    y6
////    row 7 second half    -     d17     -    y7

    //// copy the input pointer to another register
    //// step 1 : load all constants
    // stmfd sp!,{x4-x12,x14}

    ldr         w11, [sp]                   // zero rows

    push_v_regs
    stp         x19, x20,[sp,#-16]!

    mov         x12, x7 // zero columns
    mov         x8, x5 // prediction stride
    mov         x7, x6 // destination stride
    mov         x6, x4 // src stride
    lsl         x6, x6, #1                  // x sizeof(word16)
    add         x9,x0,x6, lsl #1            // 2 rows

    add         x10,x6,x6, lsl #1           // 3 rows

    sub         x10,x10, #8                 // - 4 cols * sizeof(word16)
    sub         x5,x6, #8                   // src_strd - 4 cols * sizeof(word16)

    adrp        x14, :got:g_ai2_ihevc_trans_8_transpose
    ldr         x14, [x14, #:got_lo12:g_ai2_ihevc_trans_8_transpose]

    ld1         {v0.4h, v1.4h},[x14]        ////d0,d1 are used for storing the constant data

    ////step 2 load all the input data
    ////step 3 operate first 4 columns at a time

    and         x11,x11,#0xff
    and         x12,x12,#0xff

    cmp         x11,#0xf0
    bge         skip_last4_rows
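
    //// zero_rows / zero_cols carry one bit per row / column of pi2_src,
    //// a set bit meaning that row / column is entirely zero.  masking with
    //// 0xff and comparing against 0xf0 therefore asks "are rows (or, later,
    //// columns) 4..7 all zero?", roughly (illustrative C only):
    ////
    ////        if((zero_rows & 0xf0) == 0xf0)  /* skip_last4_rows path */
    ////        if((zero_cols & 0xf0) == 0xf0)  /* skip_last4cols path  */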


    ld1         {v2.4h},[x0],#8
    ld1         {v3.4h},[x9],#8
    ld1         {v4.4h},[x0],x5
    smull       v20.4s, v2.4h, v0.h[0]      //// y0 * cos4(part of c0 and c1)
    ld1         {v5.4h},[x9],x5
    smull       v18.4s, v3.4h, v1.h[2]      //// y2 * sin2 (q3 is freed by this time)(part of d1)
    ld1         {v6.4h},[x0],#8
    ld1         {v7.4h},[x9],#8
    smull       v24.4s, v6.4h, v0.h[1]      //// y1 * cos1(part of b0)
    ld1         {v8.4h},[x0],x10
    smull       v26.4s, v6.4h, v0.h[3]      //// y1 * cos3(part of b1)
    ld1         {v9.4h},[x9],x10
    smull       v28.4s, v6.4h, v1.h[1]      //// y1 * sin3(part of b2)
    ld1         {v10.4h},[x0],#8
    smull       v30.4s, v6.4h, v1.h[3]      //// y1 * sin1(part of b3)
    ld1         {v11.4h},[x9],#8
    smlal       v24.4s, v7.4h, v0.h[3]      //// y1 * cos1 + y3 * cos3(part of b0)
    ld1         {v12.4h},[x0],x5
    smlsl       v26.4s, v7.4h, v1.h[3]      //// y1 * cos3 - y3 * sin1(part of b1)
    ld1         {v13.4h},[x9],x5
    smlsl       v28.4s, v7.4h, v0.h[1]      //// y1 * sin3 - y3 * cos1(part of b2)
    ld1         {v14.4h},[x0],#8
    smlsl       v30.4s, v7.4h, v1.h[1]      //// y1 * sin1 - y3 * sin3(part of b3)
    ld1         {v15.4h},[x9],#8
    smull       v22.4s, v10.4h, v0.h[0]     //// y4 * cos4(part of c0 and c1)
    ld1         {v16.4h},[x0],x10
    smull       v6.4s, v3.4h, v0.h[2]       //// y2 * cos2(part of d0)
    ld1         {v17.4h},[x9],x10

    ///* the following loads were used when the input was not guaranteed to be aligned */
////    vld1.16        d2,[x0]!
////    vld1.16        d3,[x2]!
////    vld1.16        d4,[x0]!
////    vld1.16        d5,[x2]!
////    vld1.16        d6,[x0]!
////    vld1.16        d7,[x2]!
////    vld1.16        d8,[x0],x3
////    vld1.16        d9,[x2],x3
////    vld1.16        d10,[x0]!
////    vld1.16        d11,[x2]!
////    vld1.16        d12,[x0]!
////    vld1.16        d13,[x2]!
////    vld1.16        d14,[x0]!
////    vld1.16        d15,[x2]!
////    vld1.16        d16,[x0],x3
////    vld1.16        d17,[x2],x3




    smlal       v24.4s, v14.4h, v1.h[1]     //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl       v26.4s, v14.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal       v28.4s, v14.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal       v30.4s, v14.4h, v0.h[3]     //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl       v18.4s, v11.4h, v0.h[2]     //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal       v6.4s, v11.4h, v1.h[2]      //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add         v10.4s,  v20.4s ,  v22.4s   //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub         v20.4s,  v20.4s ,  v22.4s   //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal       v24.4s, v15.4h, v1.h[3]     //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    smlsl       v26.4s, v15.4h, v1.h[1]     //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    smlal       v28.4s, v15.4h, v0.h[3]     //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl       v30.4s, v15.4h, v0.h[1]     //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    add         v14.4s,  v10.4s ,  v6.4s    ////    a0 = c0 + d0(part of x0,x7)
    sub         v10.4s,  v10.4s ,  v6.4s    //// a3 = c0 - d0(part of x3,x4)
    sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
    add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)

    add         v20.4s,  v14.4s ,  v24.4s   //// a0 + b0(part of x0)
    sub         v6.4s,  v14.4s ,  v24.4s    //// a0 - b0(part of x7)

    add         v24.4s,  v22.4s ,  v28.4s   //// a2 + b2(part of x2)
    sub         v22.4s,  v22.4s ,  v28.4s   //// a2 - b2(part of x5)

    add         v28.4s,  v18.4s ,  v26.4s   //// a1 + b1(part of x1)
    sub         v18.4s,  v18.4s ,  v26.4s   //// a1 - b1(part of x6)

    add         v26.4s,  v10.4s ,  v30.4s   //// a3 + b3(part of x3)
    sub         v30.4s,  v10.4s ,  v30.4s   //// a3 - b3(part of x4)

    sqrshrn     v2.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v15.4h, v6.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v3.4h, v24.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v14.4h, v22.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v6.4h, v28.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v11.4h, v18.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v7.4h, v26.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v10.4h, v30.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)


    b           last4_cols
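
    //// the multiply/accumulate network above is the standard hevc 8-point
    //// partial butterfly.  as a C sketch with the usual basis values
    //// (variable names are illustrative only):
    ////
    ////        o0 = 89*y1 + 75*y3 + 50*y5 + 18*y7;          /* b0 */
    ////        o1 = 75*y1 - 18*y3 - 89*y5 - 50*y7;          /* b1 */
    ////        o2 = 50*y1 - 89*y3 + 18*y5 + 75*y7;          /* b2 */
    ////        o3 = 18*y1 - 50*y3 + 75*y5 - 89*y7;          /* b3 */
    ////        ee0 = 64*(y0 + y4);   eo0 = 83*y2 + 36*y6;   /* c0, d0 */
    ////        ee1 = 64*(y0 - y4);   eo1 = 36*y2 - 83*y6;   /* c1, d1 */
    ////        e0 = ee0 + eo0;  e3 = ee0 - eo0;             /* a0, a3 */
    ////        e1 = ee1 + eo1;  e2 = ee1 - eo1;             /* a1, a2 */
    ////        x0 = rnd_shift(e0 + o0);   x7 = rnd_shift(e0 - o0);
    ////        x1 = rnd_shift(e1 + o1);   x6 = rnd_shift(e1 - o1);
    ////        x2 = rnd_shift(e2 + o2);   x5 = rnd_shift(e2 - o2);
    ////        x3 = rnd_shift(e3 + o3);   x4 = rnd_shift(e3 - o3);
    ////        /* rnd_shift = rounding shift by shift_stage1_idct */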



skip_last4_rows:



    ld1         {v2.4h},[x0],#8
    ld1         {v3.4h},[x9],#8
    ld1         {v4.4h},[x0],x5
    ld1         {v5.4h},[x9],x5
    ld1         {v6.4h},[x0],#8
    ld1         {v7.4h},[x9],#8
    ld1         {v8.4h},[x0],x10
    ld1         {v9.4h},[x9],x10



    movi        v12.4h, #0
    movi        v13.4h, #0
    movi        v16.4h, #0
    movi        v17.4h, #0

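    //// rows 4..7 are known to be zero on this path, so the registers that
    //// would have held their right-half samples (v12, v13, v16, v17) are
    //// simply cleared and the shared last4_cols code can run unchanged.
    //// the butterfly below then collapses to (illustrative C only):
    ////
    ////        c0 = c1 = 64*y0;
    ////        o0 = 89*y1 + 75*y3;   o1 = 75*y1 - 18*y3;
    ////        o2 = 50*y1 - 89*y3;   o3 = 18*y1 - 50*y3;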



    smull       v24.4s, v6.4h, v0.h[1]      //// y1 * cos1(part of b0)
    smull       v26.4s, v6.4h, v0.h[3]      //// y1 * cos3(part of b1)
    smull       v28.4s, v6.4h, v1.h[1]      //// y1 * sin3(part of b2)
    smull       v30.4s, v6.4h, v1.h[3]      //// y1 * sin1(part of b3)

    smlal       v24.4s, v7.4h, v0.h[3]      //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl       v26.4s, v7.4h, v1.h[3]      //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl       v28.4s, v7.4h, v0.h[1]      //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl       v30.4s, v7.4h, v1.h[1]      //// y1 * sin1 - y3 * sin3(part of b3)

    smull       v18.4s, v3.4h, v1.h[2]      //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull       v6.4s, v3.4h, v0.h[2]       //// y2 * cos2(part of d0)

    smull       v20.4s, v2.4h, v0.h[0]      //// y0 * cos4(part of c0 and c1)


    add         v14.4s,  v20.4s ,  v6.4s    ////    a0 = c0 + d0(part of x0,x7)
    sub         v10.4s,  v20.4s ,  v6.4s    //// a3 = c0 - d0(part of x3,x4)
    sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
    add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)

    add         v20.4s,  v14.4s ,  v24.4s   //// a0 + b0(part of x0)
    sub         v6.4s,  v14.4s ,  v24.4s    //// a0 - b0(part of x7)

    add         v24.4s,  v22.4s ,  v28.4s   //// a2 + b2(part of x2)
    sub         v22.4s,  v22.4s ,  v28.4s   //// a2 - b2(part of x5)

    add         v28.4s,  v18.4s ,  v26.4s   //// a1 + b1(part of x1)
    sub         v18.4s,  v18.4s ,  v26.4s   //// a1 - b1(part of x6)

    add         v26.4s,  v10.4s ,  v30.4s   //// a3 + b3(part of x3)
    sub         v30.4s,  v10.4s ,  v30.4s   //// a3 - b3(part of x4)

    sqrshrn     v2.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v15.4h, v6.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v3.4h, v24.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v14.4h, v22.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v6.4h, v28.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v11.4h, v18.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v7.4h, v26.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v10.4h, v30.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)

last4_cols:


    cmp         x12,#0xf0
    bge         skip_last4cols

    smull       v24.4s, v8.4h, v0.h[1]      //// y1 * cos1(part of b0)
    smull       v26.4s, v8.4h, v0.h[3]      //// y1 * cos3(part of b1)
    smull       v28.4s, v8.4h, v1.h[1]      //// y1 * sin3(part of b2)
    smull       v30.4s, v8.4h, v1.h[3]      //// y1 * sin1(part of b3)

    smlal       v24.4s, v9.4h, v0.h[3]      //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl       v26.4s, v9.4h, v1.h[3]      //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl       v28.4s, v9.4h, v0.h[1]      //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl       v30.4s, v9.4h, v1.h[1]      //// y1 * sin1 - y3 * sin3(part of b3)

    smull       v18.4s, v5.4h, v1.h[2]      //// y2 * sin2 (q4 is freed by this time)(part of d1)
    smull       v8.4s, v5.4h, v0.h[2]       //// y2 * cos2(part of d0)

    smull       v20.4s, v4.4h, v0.h[0]      //// y0 * cos4(part of c0 and c1)
    smull       v22.4s, v12.4h, v0.h[0]     //// y4 * cos4(part of c0 and c1)

    smlal       v24.4s, v16.4h, v1.h[1]     //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl       v26.4s, v16.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal       v28.4s, v16.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal       v30.4s, v16.4h, v0.h[3]     //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl       v18.4s, v13.4h, v0.h[2]     //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal       v8.4s, v13.4h, v1.h[2]      //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add         v12.4s,  v20.4s ,  v22.4s   //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub         v20.4s,  v20.4s ,  v22.4s   //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal       v24.4s, v17.4h, v1.h[3]     //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
    smlsl       v26.4s, v17.4h, v1.h[1]     //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
    smlal       v28.4s, v17.4h, v0.h[3]     //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
    smlsl       v30.4s, v17.4h, v0.h[1]     //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)

    add         v16.4s,  v12.4s ,  v8.4s    ////    a0 = c0 + d0(part of e0,e7)
    sub         v12.4s,  v12.4s ,  v8.4s    //// a3 = c0 - d0(part of e3,e4)
    sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of e2,e5)
    add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of e1,e6)

    add         v20.4s,  v16.4s ,  v24.4s   //// a0 + b0(part of e0)
    sub         v8.4s,  v16.4s ,  v24.4s    //// a0 - b0(part of e7)

    add         v24.4s,  v22.4s ,  v28.4s   //// a2 + b2(part of e2)
    sub         v22.4s,  v22.4s ,  v28.4s   //// a2 - b2(part of e5)

    add         v28.4s,  v18.4s ,  v26.4s   //// a1 + b1(part of e1)
    sub         v18.4s,  v18.4s ,  v26.4s   //// a1 - b1(part of e6)

    add         v26.4s,  v12.4s ,  v30.4s   //// a3 + b3(part of e3)
    sub         v30.4s,  v12.4s ,  v30.4s   //// a3 - b3(part of e4)

    sqrshrn     v4.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v17.4h, v8.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v5.4h, v24.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v16.4h, v22.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v8.4h, v28.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v13.4h, v18.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v9.4h, v26.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    sqrshrn     v12.4h, v30.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
    b           end_skip_last4cols



skip_last4cols:

    umov        x15,v25.d[0]

    trn1        v25.4h, v2.4h, v6.4h
    trn2        v29.4h, v2.4h, v6.4h        ////[x3,x1],[x2,x0] first quadrant transposing

    trn1        v27.4h, v3.4h, v7.4h
    trn2        v31.4h, v3.4h, v7.4h        ////[x3,x1],[x2,x0] first quadrant transposing

    trn1        v6.2s, v29.2s, v31.2s
    trn2        v7.2s, v29.2s, v31.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....
    trn1        v2.2s, v25.2s, v27.2s
    trn2        v3.2s, v25.2s, v27.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....


    trn1        v25.4h, v10.4h, v14.4h
    trn2        v29.4h, v10.4h, v14.4h      ////[x7,x5],[x6,x4] third quadrant transposing

    trn1        v27.4h, v11.4h, v15.4h
    trn2        v31.4h, v11.4h, v15.4h      ////[x7,x5],[x6,x4] third quadrant transposing

    trn1        v10.2s, v25.2s, v27.2s
    trn2        v11.2s, v25.2s, v27.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....
    trn1        v14.2s, v29.2s, v31.2s
    trn2        v15.2s, v29.2s, v31.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....

    mov         v25.d[0],x15

    smull       v24.4s, v6.4h, v0.h[1]      //// y1 * cos1(part of b0)
    smull       v26.4s, v6.4h, v0.h[3]      //// y1 * cos3(part of b1)
    smull       v28.4s, v6.4h, v1.h[1]      //// y1 * sin3(part of b2)
    smull       v30.4s, v6.4h, v1.h[3]      //// y1 * sin1(part of b3)

    smlal       v24.4s, v7.4h, v0.h[3]      //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl       v26.4s, v7.4h, v1.h[3]      //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl       v28.4s, v7.4h, v0.h[1]      //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl       v30.4s, v7.4h, v1.h[1]      //// y1 * sin1 - y3 * sin3(part of b3)

    smull       v20.4s, v2.4h, v0.h[0]      //// y0 * cos4(part of c0 and c1)
//    vmull.s16    q11,d4,d0[0]                    @// y4 * cos4(part of c0 and c1)

    smull       v18.4s, v3.4h, v1.h[2]      //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull       v6.4s, v3.4h, v0.h[2]       //// y2 * cos2(part of d0)




    sub         v22.4s,  v20.4s ,  v6.4s    //// a3 = c0 - d0(part of x3,x4)
    add         v4.4s,  v20.4s ,  v6.4s     ////    a0 = c0 + d0(part of x0,x7)


    add         v2.4s,  v4.4s ,  v24.4s

    sub         v6.4s,  v4.4s ,  v24.4s

    add         v8.4s,  v22.4s ,  v30.4s

    sub         v24.4s,  v22.4s ,  v30.4s

    sqrshrn     v5.4h, v8.4s,#shift_stage2_idct
    sqrshrn     v2.4h, v2.4s,#shift_stage2_idct
    sqrshrn     v9.4h, v6.4s,#shift_stage2_idct
    sqrshrn     v6.4h, v24.4s,#shift_stage2_idct

    sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
    add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)


    add         v30.4s,  v22.4s ,  v28.4s

    sub         v24.4s,  v22.4s ,  v28.4s

    add         v28.4s,  v18.4s ,  v26.4s

    sub         v22.4s,  v18.4s ,  v26.4s
    sqrshrn     v4.4h, v30.4s,#shift_stage2_idct
    sqrshrn     v7.4h, v24.4s,#shift_stage2_idct
    sqrshrn     v3.4h, v28.4s,#shift_stage2_idct
    sqrshrn     v8.4h, v22.4s,#shift_stage2_idct



    umov        x19,v25.d[0]
    umov        x20,v25.d[1]

    trn1        v27.4h, v2.4h, v3.4h
    trn2        v29.4h, v2.4h, v3.4h
    trn1        v25.4h, v4.4h, v5.4h
    trn2        v31.4h, v4.4h, v5.4h

    trn1        v2.2s, v27.2s, v25.2s
    trn2        v4.2s, v27.2s, v25.2s
    trn1        v3.2s, v29.2s, v31.2s
    trn2        v5.2s, v29.2s, v31.2s

    trn1        v27.4h, v6.4h, v7.4h
    trn2        v29.4h, v6.4h, v7.4h
    trn1        v25.4h, v8.4h, v9.4h
    trn2        v31.4h, v8.4h, v9.4h

    trn1        v6.2s, v27.2s, v25.2s
    trn2        v8.2s, v27.2s, v25.2s
    trn1        v7.2s, v29.2s, v31.2s
    trn2        v9.2s, v29.2s, v31.2s

    mov         v25.d[0],x19
    mov         v25.d[1],x20

    smull       v24.4s, v14.4h, v0.h[1]     //// y1 * cos1(part of b0)

    smull       v26.4s, v14.4h, v0.h[3]     //// y1 * cos3(part of b1)
    smull       v28.4s, v14.4h, v1.h[1]     //// y1 * sin3(part of b2)
    smull       v30.4s, v14.4h, v1.h[3]     //// y1 * sin1(part of b3)

    smlal       v24.4s, v15.4h, v0.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl       v26.4s, v15.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl       v28.4s, v15.4h, v0.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl       v30.4s, v15.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    smull       v20.4s, v10.4h, v0.h[0]     //// y0 * cos4(part of c0 and c1)
    smull       v18.4s, v11.4h, v1.h[2]     //// y2 * sin2 (q7 is freed by this time)(part of d1)
    smull       v14.4s, v11.4h, v0.h[2]     //// y2 * cos2(part of d0)


    add         x4,x2,x8, lsl #1            // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data


    add         x5,x8,x8, lsl #1            //


    add         x0,x3,x7, lsl #1            // x0 points to 3rd row of dest data


    add         x10,x7,x7, lsl #1           //

    // swapping v3 and v6
    mov         v31.d[0], v3.d[0]
    mov         v3.d[0], v6.d[0]
    mov         v6.d[0], v31.d[0]

    // swapping v5 and v8
    mov         v31.d[0], v5.d[0]
    mov         v5.d[0], v8.d[0]
    mov         v8.d[0], v31.d[0]


    sub         v22.4s,  v20.4s ,  v14.4s   //// a3 = c0 - d0(part of x3,x4)
    add         v12.4s,  v20.4s ,  v14.4s   ////    a0 = c0 + d0(part of x0,x7)


    add         v0.4s,  v12.4s ,  v24.4s


    sub         v24.4s,  v12.4s ,  v24.4s


    add         v12.4s,  v22.4s ,  v30.4s


    sub         v14.4s,  v22.4s ,  v30.4s

    sqrshrn     v10.4h, v0.4s,#shift_stage2_idct
    sqrshrn     v17.4h, v24.4s,#shift_stage2_idct
    sqrshrn     v13.4h, v12.4s,#shift_stage2_idct
    sqrshrn     v14.4h, v14.4s,#shift_stage2_idct

    sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
    add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)


    add         v0.4s,  v22.4s ,  v28.4s


    sub         v24.4s,  v22.4s ,  v28.4s


    add         v28.4s,  v18.4s ,  v26.4s


    sub         v26.4s,  v18.4s ,  v26.4s
    ld1         {v18.8b},[x2],x8

    sqrshrn     v12.4h, v0.4s,#shift_stage2_idct
    ld1         {v20.8b},[x2],x5


    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct
    ld1         {v19.8b},[x2],x8




    sqrshrn     v11.4h, v28.4s,#shift_stage2_idct
    ld1         {v22.8b},[x4],x8




    sqrshrn     v16.4h, v26.4s,#shift_stage2_idct
    ld1         {v21.8b},[x2],x5


    b           pred_buff_addition
end_skip_last4cols:


    umov        x19,v25.d[0]
    umov        x20,v25.d[1]

///* now the idct of columns is done, transpose so that the row idct is done efficiently(step5) */
    trn1        v27.4h, v2.4h, v6.4h
    trn2        v29.4h, v2.4h, v6.4h        ////[x3,x1],[x2,x0] first quadrant transposing
    trn1        v25.4h, v3.4h, v7.4h
    trn2        v31.4h, v3.4h, v7.4h        ////[x3,x1],[x2,x0] first quadrant transposing

    trn1        v2.2s, v27.2s, v25.2s
    trn2        v3.2s, v27.2s, v25.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....
    trn1        v6.2s, v29.2s, v31.2s
    trn2        v7.2s, v29.2s, v31.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....

    trn1        v27.4h, v4.4h, v8.4h
    trn2        v29.4h, v4.4h, v8.4h        ////[x3,x1],[x2,x0] second quadrant transposing
    trn1        v25.4h, v5.4h, v9.4h
    trn2        v31.4h, v5.4h, v9.4h        ////[x3,x1],[x2,x0] second quadrant transposing

    trn1        v4.2s, v27.2s, v25.2s
    trn2        v5.2s, v27.2s, v25.2s       ////x0,x1,x2,x3 second quadrant transposing continued.....
    trn1        v8.2s, v29.2s, v31.2s
    trn2        v9.2s, v29.2s, v31.2s       ////x0,x1,x2,x3 second quadrant transposing continued.....

    trn1        v27.4h, v10.4h, v14.4h
    trn2        v29.4h, v10.4h, v14.4h      ////[x7,x5],[x6,x4] third quadrant transposing
    trn1        v25.4h, v11.4h, v15.4h
    trn2        v31.4h, v11.4h, v15.4h      ////[x7,x5],[x6,x4] third quadrant transposing

    trn1        v10.2s, v27.2s, v25.2s
    trn2        v11.2s, v27.2s, v25.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....
    trn1        v14.2s, v29.2s, v31.2s
    trn2        v15.2s, v29.2s, v31.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....

    trn1        v27.4h, v12.4h, v16.4h
    trn2        v29.4h, v12.4h, v16.4h      ////[x7,x5],[x6,x4] fourth quadrant transposing
    trn1        v25.4h, v13.4h, v17.4h
    trn2        v31.4h, v13.4h, v17.4h      ////[x7,x5],[x6,x4] fourth quadrant transposing

    trn1        v12.2s, v27.2s, v25.2s
    trn2        v13.2s, v27.2s, v25.2s      ////x4,x5,x6,x7 fourth quadrant transposing continued.....
    trn1        v16.2s, v29.2s, v31.2s
    trn2        v17.2s, v29.2s, v31.2s      ////x4,x5,x6,x7 fourth quadrant transposing continued.....
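
    //// each trn1/trn2 pair on .4h lanes followed by a trn1/trn2 pair on .2s
    //// lanes transposes one 4x4 quadrant of 16-bit values, i.e. in plain C
    //// terms (illustrative only):
    ////
    ////        for(i = 0; i < 4; i++)
    ////            for(j = 0; j < 4; j++)
    ////                out[j][i] = in[i][j];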

    mov         v25.d[0],x19
    mov         v25.d[1],x20

    ////step6 operate on first four rows and find their idct
    ////register usage.extern        - storing and idct of rows
////    cosine constants     -     d0
////    sine constants         -     d1
////    element 0 first four     -     d2        -    y0
////    element 1 first four     -     d6        -    y1
////    element 2 first four     -     d3        -    y2
////    element 3 first four     -     d7        -    y3
////    element 4 first four     -     d4        -    y4
////    element 5 first four     -     d8        -    y5
////    element 6 first four     -     d5        -    y6
////    element 7 first four     -     d9        -    y7
////    element 0 second four    -     d10        -    y0
////    element 1 second four    -     d14     -    y1
////    element 2 second four    -     d11     -    y2
////    element 3 second four    -     d15     -    y3
////    element 4 second four    -     d12     -    y4
////    element 5 second four    -     d16     -    y5
////    element 6 second four    -     d13     -    y6
////    element 7 second four    -     d17     -    y7

    //// map between first kernel code seq and current
////        d2    ->    d2
////        d6    ->    d6
////        d3    ->    d3
////        d7    ->    d7
////        d10    ->    d4
////        d14    ->    d8
////        d11    ->    d5
////        d15    ->    d9
////        q3    ->    q3
////        q5    ->    q2
////        q7    ->    q4

    smull       v24.4s, v6.4h, v0.h[1]      //// y1 * cos1(part of b0)
    smull       v26.4s, v6.4h, v0.h[3]      //// y1 * cos3(part of b1)
    smull       v28.4s, v6.4h, v1.h[1]      //// y1 * sin3(part of b2)
    smull       v30.4s, v6.4h, v1.h[3]      //// y1 * sin1(part of b3)

    smlal       v24.4s, v7.4h, v0.h[3]      //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl       v26.4s, v7.4h, v1.h[3]      //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl       v28.4s, v7.4h, v0.h[1]      //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl       v30.4s, v7.4h, v1.h[1]      //// y1 * sin1 - y3 * sin3(part of b3)

    smull       v20.4s, v2.4h, v0.h[0]      //// y0 * cos4(part of c0 and c1)
    smull       v22.4s, v4.4h, v0.h[0]      //// y4 * cos4(part of c0 and c1)

    smull       v18.4s, v3.4h, v1.h[2]      //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull       v6.4s, v3.4h, v0.h[2]       //// y2 * cos2(part of d0)


    smlal       v24.4s, v8.4h, v1.h[1]      //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl       v26.4s, v8.4h, v0.h[1]      //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal       v28.4s, v8.4h, v1.h[3]      //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal       v30.4s, v8.4h, v0.h[3]      //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl       v18.4s, v5.4h, v0.h[2]      //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal       v6.4s, v5.4h, v1.h[2]       //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add         v2.4s,  v20.4s ,  v22.4s    //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub         v20.4s,  v20.4s ,  v22.4s   //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal       v24.4s, v9.4h, v1.h[3]      //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    smlsl       v26.4s, v9.4h, v1.h[1]      //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    smlal       v28.4s, v9.4h, v0.h[3]      //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl       v30.4s, v9.4h, v0.h[1]      //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    sub         v22.4s,  v2.4s ,  v6.4s     //// a3 = c0 - d0(part of x3,x4)
    add         v4.4s,  v2.4s ,  v6.4s      ////    a0 = c0 + d0(part of x0,x7)


    add         v2.4s,  v4.4s ,  v24.4s

    sub         v6.4s,  v4.4s ,  v24.4s

    add         v8.4s,  v22.4s ,  v30.4s

    sub         v24.4s,  v22.4s ,  v30.4s

    sqrshrn     v5.4h, v8.4s,#shift_stage2_idct
    sqrshrn     v2.4h, v2.4s,#shift_stage2_idct
    sqrshrn     v9.4h, v6.4s,#shift_stage2_idct
    sqrshrn     v6.4h, v24.4s,#shift_stage2_idct

    sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
    add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)


    add         v30.4s,  v22.4s ,  v28.4s

    sub         v24.4s,  v22.4s ,  v28.4s

    add         v28.4s,  v18.4s ,  v26.4s

    sub         v22.4s,  v18.4s ,  v26.4s
    sqrshrn     v4.4h, v30.4s,#shift_stage2_idct
    sqrshrn     v7.4h, v24.4s,#shift_stage2_idct
    sqrshrn     v3.4h, v28.4s,#shift_stage2_idct
    sqrshrn     v8.4h, v22.4s,#shift_stage2_idct



    umov        x19,v25.d[0]
    umov        x20,v25.d[1]

    trn1        v27.4h, v2.4h, v3.4h
    trn2        v29.4h, v2.4h, v3.4h
    trn1        v25.4h, v4.4h, v5.4h
    trn2        v31.4h, v4.4h, v5.4h

    trn1        v2.2s, v27.2s, v25.2s
    trn2        v4.2s, v27.2s, v25.2s
    trn1        v3.2s, v29.2s, v31.2s
    trn2        v5.2s, v29.2s, v31.2s

    trn1        v27.4h, v6.4h, v7.4h
    trn2        v29.4h, v6.4h, v7.4h
    trn1        v25.4h, v8.4h, v9.4h
    trn2        v31.4h, v8.4h, v9.4h

    trn1        v6.2s, v27.2s, v25.2s
    trn2        v8.2s, v27.2s, v25.2s
    trn1        v7.2s, v29.2s, v31.2s
    trn2        v9.2s, v29.2s, v31.2s

    mov         v25.d[0],x19
    mov         v25.d[1],x20



    smull       v24.4s, v14.4h, v0.h[1]     //// y1 * cos1(part of b0)
    smull       v26.4s, v14.4h, v0.h[3]     //// y1 * cos3(part of b1)
    smull       v28.4s, v14.4h, v1.h[1]     //// y1 * sin3(part of b2)
    smull       v30.4s, v14.4h, v1.h[3]     //// y1 * sin1(part of b3)
    smlal       v24.4s, v15.4h, v0.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl       v26.4s, v15.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl       v28.4s, v15.4h, v0.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl       v30.4s, v15.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    smull       v20.4s, v10.4h, v0.h[0]     //// y0 * cos4(part of c0 and c1)
    smull       v22.4s, v12.4h, v0.h[0]     //// y4 * cos4(part of c0 and c1)
    smull       v18.4s, v11.4h, v1.h[2]     //// y2 * sin2 (q7 is freed by this time)(part of d1)
    smull       v14.4s, v11.4h, v0.h[2]     //// y2 * cos2(part of d0)
    smlal       v24.4s, v16.4h, v1.h[1]     //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)

    add         x4,x2,x8, lsl #1            // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
    smlsl       v26.4s, v16.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)

    add         x5,x8,x8, lsl #1            //
    smlal       v28.4s, v16.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)

    add         x0,x3,x7, lsl #1            // x0 points to 3rd row of dest data
    smlal       v30.4s, v16.4h, v0.h[3]     //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    add         x10,x7,x7, lsl #1           //
    smlsl       v18.4s, v13.4h, v0.h[2]     //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)


    smlal       v14.4s, v13.4h, v1.h[2]     //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add         v12.4s,  v20.4s ,  v22.4s   //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub         v20.4s,  v20.4s ,  v22.4s   //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal       v24.4s, v17.4h, v1.h[3]     //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)

    // swapping v3 and v6
    mov         v31.d[0], v3.d[0]
    mov         v3.d[0], v6.d[0]
    mov         v6.d[0], v31.d[0]

    smlsl       v26.4s, v17.4h, v1.h[1]     //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    // swapping v5 and v8
    mov         v31.d[0], v5.d[0]
    mov         v5.d[0], v8.d[0]
    mov         v8.d[0], v31.d[0]

    smlal       v28.4s, v17.4h, v0.h[3]     //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl       v30.4s, v17.4h, v0.h[1]     //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    sub         v22.4s,  v12.4s ,  v14.4s   //// a3 = c0 - d0(part of x3,x4)
    add         v12.4s,  v12.4s ,  v14.4s   ////    a0 = c0 + d0(part of x0,x7)


    add         v0.4s,  v12.4s ,  v24.4s


    sub         v24.4s,  v12.4s ,  v24.4s


    add         v12.4s,  v22.4s ,  v30.4s


    sub         v14.4s,  v22.4s ,  v30.4s

    sqrshrn     v10.4h, v0.4s,#shift_stage2_idct
    sqrshrn     v17.4h, v24.4s,#shift_stage2_idct
    sqrshrn     v13.4h, v12.4s,#shift_stage2_idct
    sqrshrn     v14.4h, v14.4s,#shift_stage2_idct

    sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
    add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)


    add         v0.4s,  v22.4s ,  v28.4s


    sub         v24.4s,  v22.4s ,  v28.4s


    add         v28.4s,  v18.4s ,  v26.4s


    sub         v26.4s,  v18.4s ,  v26.4s
    ld1         {v18.8b},[x2],x8

    sqrshrn     v12.4h, v0.4s,#shift_stage2_idct
    ld1         {v20.8b},[x2],x5


    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct
    ld1         {v19.8b},[x2],x8




    sqrshrn     v11.4h, v28.4s,#shift_stage2_idct
    ld1         {v22.8b},[x4],x8




    sqrshrn     v16.4h, v26.4s,#shift_stage2_idct
    ld1         {v21.8b},[x2],x5



pred_buff_addition:

    umov        x19,v25.d[0]
    umov        x20,v25.d[1]

    trn1        v27.4h, v10.4h, v11.4h
    trn2        v29.4h, v10.4h, v11.4h
    trn1        v25.4h, v12.4h, v13.4h
    trn2        v31.4h, v12.4h, v13.4h

    trn1        v10.2s, v27.2s, v25.2s
    trn2        v12.2s, v27.2s, v25.2s
    trn1        v11.2s, v29.2s, v31.2s
    trn2        v13.2s, v29.2s, v31.2s

    trn1        v27.4h, v14.4h, v15.4h
    trn2        v29.4h, v14.4h, v15.4h
    trn1        v25.4h, v16.4h, v17.4h
    trn2        v31.4h, v16.4h, v17.4h

    trn1        v14.2s, v27.2s, v25.2s
    trn2        v16.2s, v27.2s, v25.2s
    trn1        v15.2s, v29.2s, v31.2s
    trn2        v17.2s, v29.2s, v31.2s


    mov         v25.d[0],x19
    mov         v25.d[1],x20


    ld1         {v24.8b},[x4],x5
    ld1         {v23.8b},[x4],x8
    ld1         {v25.8b},[x4],x5
    mov         v2.d[1], v3.d[0]
    mov         v4.d[1], v5.d[0]
    mov         v6.d[1], v7.d[0]
    mov         v8.d[1], v9.d[0]
    uaddw       v2.8h,  v2.8h ,  v18.8b
    uaddw       v4.8h,  v4.8h ,  v22.8b
    uaddw       v6.8h,  v6.8h ,  v20.8b
    uaddw       v8.8h,  v8.8h ,  v24.8b

    // swapping v11 and v14
    mov         v31.d[0], v11.d[0]
    mov         v11.d[0], v14.d[0]
    mov         v14.d[0], v31.d[0]

    // swapping v13 and v16
    mov         v31.d[0], v13.d[0]
    mov         v13.d[0], v16.d[0]
    mov         v16.d[0], v31.d[0]
// row values stored in the q registers.

//q1 :x0
//q3: x1
//q2: x2
//q4: x3
//q5: x4
//q7: x5
//q6: x6
//q8: x7



///// adding the prediction buffer




    // load prediction data



    //adding recon with prediction


    mov         v10.d[1], v11.d[0]
    mov         v12.d[1], v13.d[0]
    mov         v14.d[1], v15.d[0]
    mov         v16.d[1], v17.d[0]
    uaddw       v10.8h,  v10.8h ,  v19.8b
    sqxtun      v2.8b, v2.8h
    uaddw       v14.8h,  v14.8h ,  v21.8b
    sqxtun      v4.8b, v4.8h
    uaddw       v12.8h,  v12.8h ,  v23.8b
    sqxtun      v6.8b, v6.8h
    uaddw       v16.8h,  v16.8h ,  v25.8b
    sqxtun      v8.8b, v8.8h



    st1         {v2.8b},[x3],x7
    sqxtun      v10.8b, v10.8h
    st1         {v6.8b},[x3],x10
    sqxtun      v14.8b, v14.8h
    st1         {v4.8b},[x0],x7
    sqxtun      v12.8b, v12.8h
    st1         {v8.8b},[x0],x10
    sqxtun      v16.8b, v16.8h



    st1         {v10.8b},[x3],x7
    st1         {v14.8b},[x3],x10
    st1         {v12.8b},[x0],x7
    st1         {v16.8b},[x0],x10
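
    // the uaddw/sqxtun pairs above add the 8-bit prediction samples to the
    // 16-bit residue and saturate the sums back to unsigned 8 bit, which is
    // what the st1 stores write out.  per pixel (illustrative C only):
    //
    //        pu1_dst[i] = CLIP_U8((word32)pu1_pred[i] + residue[i]);
    //        /* CLIP_U8(x) : x < 0 ? 0 : (x > 255 ? 255 : x) */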



    // ldmfd sp!,{x4-x12,pc}
    ldp         x19, x20,[sp],#16
    pop_v_regs
    ret
