@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_luma_mode_3_to_9.s
@*
@* @brief
@*  contains function definitions for luma intra prediction, modes 3 to 9.
@* functions are coded in neon assembly and can be compiled using
@* rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    luma intra prediction for angular modes 3 to 9
@*
@* @par description:
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  intra prediction mode (3 to 9)
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_mode_3_to_9(uword8* pu1_ref,
@                               word32 src_strd,
@                               uword8* pu1_dst,
@                               word32 dst_strd,
@                               word32 nt,
@                               word32 mode)
@
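@an illustrative call (a sketch only; the block size and mode values below
@are hypothetical, and pu1_ref is assumed to point at the already prepared
@reference-sample array):
@
@    ihevc_intra_pred_luma_mode_3_to_9(pu1_ref, src_strd, pu1_dst, dst_strd,
@                                      8 /* nt */, 5 /* mode */);
@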
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #40
@   nt
@   mode

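@rough c sketch (illustrative only, names are hypothetical) of the two-tap
@interpolation each predicted sample below computes; intra_pred_ang comes
@from gai4_ihevc_ang_table[mode]:
@
@    pos   = (k + 1) * intra_pred_ang;
@    idx   = pos >> 5;        /* integer offset into the reference samples */
@    fract = pos & 31;        /* 5-bit sub-sample position                 */
@    pred  = ((32 - fract) * ref[idx] + fract * ref[idx + 1] + 16) >> 5;
@
@the vmull/vmlal pairs form the weighted sum and vrshrn.i16 #5 applies the
@rounding (+16) and the final shift.
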
.text
.align 4




.globl ihevc_intra_pred_luma_mode_3_to_9_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_luma
.extern idx_neg_idx_3_9

gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl2 - 8

idx_neg_idx_3_9_addr_1:
.long idx_neg_idx_3_9 - ulbl3_1 - 8

idx_neg_idx_3_9_addr_2:
.long idx_neg_idx_3_9 - ulbl3_2 - 8

col_for_intra_luma_addr_1:
.long col_for_intra_luma - ulbl4_1 - 8

col_for_intra_luma_addr_2:
.long col_for_intra_luma - ulbl4_2 - 8

col_for_intra_luma_addr_3:
.long col_for_intra_luma - ulbl4_3 - 8

.type ihevc_intra_pred_luma_mode_3_to_9_a9q, %function

ihevc_intra_pred_luma_mode_3_to_9_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr
ulbl1:
    add         r7,r7,pc

    ldr         r5,[sp,#44]                 @mode (3 to 9)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
    add         r8,r8,pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    ldr         r7, [r7]                    @intra_pred_ang
    vdup.8      d30, r7                     @intra_pred_ang

    ldr         r14, col_for_intra_luma_addr_1
ulbl4_1:
    add         r14,r14,pc
    cmp         r4, #4

    beq         sz_4_proc
    b           prologue_8_16_32

prologue_8_16_32:
    lsr         r10, r4, #3
    vld1.8      d31, [r14]!
    mul         r10, r4, r10                @block counter (dec by #8)

    mov         r11, r4                     @col counter to be inc/dec by #8
    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)

    sub         r7, r5, #3
    vmov.i8     d2,#1                       @contains #1 for adding to get ref_main_idx + 1
    ldr         r12, idx_neg_idx_3_9_addr_1 @load least idx table
ulbl3_1:
    add         r12,r12,pc

    vmov.i8     d3, #2

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    mov         r7, #8
    sub         r7, r7, r3, lsl #3          @r7 = 8-8r3

    ldr         r9, [r8]
    add         r1, r0, r4, lsl #1          @pu1_ref + nt

    vmovn.s16   d6, q11
    vdup.8      d26, r9                     @least idx added to final idx values
    sub         r1, r1, #9                  @ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row

    sub         r6, r1, r9

    vld1.8      {d0,d1}, [r6]               @stores the 32 values reqd based on indices values (from least idx)
    vshr.s16    q11, q11, #5

    vmov.i8     d29, #31                    @contains #31 for vand operation

    vmov.i8     d28, #32

    vqmovn.s16  d8, q11

    vand        d6, d6, d29                 @fract values in d1/ idx values in d0

    mov         r0, #1

    vmov.i8     d27, #7                     @row 0 to 7

    vsub.s8     d8, d8, d2                  @ref_main_idx (sub row)
    vsub.s8     d8, d26, d8                 @ref_main_idx (row 0)
    vadd.s8     d8, d8, d27                 @to compensate the pu1_src idx incremented by 8
    vsub.s8     d9, d8, d2                  @ref_main_idx + 1 (row 0)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vsub.s8     d7, d28, d6                 @32-fract

    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
    vsub.s8     d4, d8, d2                  @ref_main_idx (row 1)
    vsub.s8     d5, d9, d2                  @ref_main_idx + 1 (row 1)

    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vmull.u8    q12, d12, d7                @mul (row 0)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vsub.s8     d8, d8, d3                  @ref_main_idx (row 2)
    vsub.s8     d9, d9, d3                  @ref_main_idx + 1 (row 2)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)

    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vmull.u8    q11, d16, d7                @mul (row 1)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vsub.s8     d4, d4, d3                  @ref_main_idx (row 3)
    vsub.s8     d5, d5, d3                  @ref_main_idx + 1 (row 3)

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vmull.u8    q10, d14, d7                @mul (row 2)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
    vsub.s8     d8, d8, d3                  @ref_main_idx (row 4)
    vsub.s8     d9, d9, d3                  @ref_main_idx + 1 (row 4)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 4)
    vmull.u8    q9, d10, d7                 @mul (row 3)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 4)
    vsub.s8     d4, d4, d3                  @ref_main_idx (row 5)
    vsub.s8     d5, d5, d3                  @ref_main_idx + 1 (row 5)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 5)
    vmull.u8    q12, d12, d7                @mul (row 4)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 5)
    vsub.s8     d8, d8, d3                  @ref_main_idx (row 6)
    vsub.s8     d9, d9, d3                  @ref_main_idx + 1 (row 6)

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 6)
    vmull.u8    q11, d16, d7                @mul (row 5)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 6)
    vsub.s8     d4, d4, d3                  @ref_main_idx (row 7)
    vsub.s8     d5, d5, d3                  @ref_main_idx + 1 (row 7)

    vst1.8      d24, [r2], r3               @st (row 4)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
    vmull.u8    q10, d14, d7                @mul (row 6)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d22, [r2], r3               @st (row 5)
    vrshrn.i16  d20, q10, #5                @round shft (row 6)
    vrshrn.i16  d18, q9, #5                 @round shft (row 7)

    vst1.8      d20, [r2], r3               @st (row 6)

    subs        r10, r10, #8                @subtract 8 and go to end if 8x8

    vst1.8      d18, [r2], r3               @st (row 7)

    beq         end_func

    subs        r11, r11, #8
    addgt       r8, r8, #4
    addgt       r2, r2, r7
    movle       r8, r12
    suble       r2, r2, r4
    addle       r2, r2, #8
    movle       r11, r4
    ldrle       r14, col_for_intra_luma_addr_2
ulbl4_2:
    addle       r14,r14,pc
    addle       r0, r0, #8

    mov         r5,r2
    vld1.8      d31, [r14]!
    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    vmovn.s16   d10, q6
    vshr.s16    q6, q6, #5
    vqmovn.s16  d11, q6
    ldr         r9, [r8]
    add         r9, r0, r9
    sub         r9, r9, #1
    vdup.8      d26, r9
    vmov.i8     d16,#8

    sub         r4,r4,#8

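@the kernel below is software pipelined across 8x8 tiles: the stores and
@multiplies tagged "(from previous loop)" finish rows 4-7 of the previous
@tile while the table lookups and index updates for the current tile are
@already in flight.
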
kernel_8_16_32:

    vsub.s8     d8, d26, d11                @ref_main_idx
    vmov        d26,d10

    subs        r11, r11, #8
    sub         r6, r1, r9
    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
    vadd.s8     d8, d8, d16                 @to compensate the pu1_src idx incremented by 8

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx - 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vsub.s8     d9, d8, d2                  @ref_main_idx - 1
    addle       r0, r0, #8
    addgt       r8, r8, #4
    vld1.8      {d0,d1}, [r6]               @stores the 32 values reqd based on indices values (from least idx)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    ldrle       r14, col_for_intra_luma_addr_3
ulbl4_3:
    addle       r14,r14,pc

    movle       r8, r12
    vdup.8      d27, r0                     @row value inc or reset accordingly

    vsub.s8     d4, d8, d2                  @ref_main_idx (row 1)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vsub.s8     d5, d9, d2                  @ref_main_idx - 1 (row 1)


    vmull.u8    q9, d10, d7                 @mul (row 7)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vld1.8      d31, [r14]!
    vand        d6, d29, d26                @fract values in d1/ idx values in d0

    vst1.8      d22, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vsub.s8     d8, d8, d3                  @ref_main_idx (row 2)
    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vsub.s8     d9, d9, d3                  @ref_main_idx - 1 (row 2)

    addle       r11, r4, #8
    ldr         r9, [r8]
    vsub.s8     d7, d28, d6                 @32-fract

    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vsub.s8     d4, d4, d3                  @ref_main_idx (row 3)
    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vsub.s8     d5, d5, d3                  @ref_main_idx - 1 (row 3)

    vmull.u8    q11, d10, d7                @mul (row 1)
    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)
    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)

    vsub.s8     d8, d8, d3                  @ref_main_idx (row 4)
    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vsub.s8     d9, d9, d3                  @ref_main_idx - 1 (row 4)

    vmull.u8    q10, d14, d7                @mul (row 2)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    add         r5,r2,r3,lsl#2
    add         r9, r0, r9

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vsub.s8     d4, d4, d3                  @ref_main_idx (row 5)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 4)
    vsub.s8     d5, d5, d3                  @ref_main_idx - 1 (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 3)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 4)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vmovn.s16   d10, q7
    vshr.s16    q7, q7, #5

    vsub.s8     d8, d8, d3                  @ref_main_idx (row 6)
    vtbl.8      d21, {d0,d1}, d4            @load from ref_main_idx (row 5)
    vsub.s8     d9, d9, d3                  @ref_main_idx - 1 (row 6)

    vmull.u8    q12, d12, d7                @mul (row 4)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 5)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    sub         r9, r9, #1
    vqmovn.s16  d11, q7

    vsub.s8     d4, d4, d3                  @ref_main_idx (row 7)
    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 6)
    vsub.s8     d5, d5, d3                  @ref_main_idx - 1 (row 7)

    vmull.u8    q11, d21, d7                @mul (row 5)
    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 6)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vadd.s8     d11, d27, d11               @ref_main_idx (add row)
    vdup.8      d26, r9

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    add         r2, r2, r3, lsl #2
    vsub.s8     d11, d11, d2                @ref_main_idx -1 (sub 1)
    addgt       r2, r7, r2

    suble       r2, r2, r4

    subs        r10, r10, #8                @subtract 8 and go to end if 8x8

    bne         kernel_8_16_32

epil_8_16_32:
    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vst1.8      d18, [r5], r3               @st (row 7)

    b           end_func

sz_4_proc:
    vld1.8      d31, [r14]
    vmov.i8     d2, #1                      @contains #1 for adding to get ref_main_idx - 1

    vmov.i8     d3, #2
    ldr         r12, idx_neg_idx_3_9_addr_2 @load least idx table
ulbl3_2:
    add         r12,r12,pc

    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)
    sub         r7, r5, #3

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    ldr         r9, [r8]

    vdup.8      d26, r9                     @least idx added to final idx values
    add         r6, r0, r4, lsl #1          @pu1_ref + 2nt

    vmovn.s16   d6, q11
    sub         r6, r6, #9                  @ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row
    sub         r6, r6, r9

    vld1.8      {d0,d1}, [r6]               @stores the 32 values reqd based on indices values (from least idx)

    vmov.i8     d29, #31                    @contains #31 for vand operation

    vmov.i8     d28, #32

    vshr.s16    q11, q11, #5
    vqmovn.s16  d8, q11

    vand        d6, d6, d29                 @fract values in d1/ idx values in d0
    vsub.s8     d7, d28, d6                 @32-fract

    vmov.i8     d27, #7                     @row 0 to 7(row-1)
    vsub.s8     d8, d8, d2                  @ref_main_idx (add 1)
    vsub.s8     d8, d26, d8                 @ref_main_idx
    vadd.s8     d8, d8, d27                 @to compensate the pu1_src idx incremented by 8
    vsub.s8     d9, d8, d2                  @ref_main_idx - 1

    vsub.s8     d4, d8, d2                  @row 1 ref_main_idx
    vsub.s8     d5, d9, d2

    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)


    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vsub.s8     d8, d8, d3                  @idx (row 2)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vsub.s8     d9, d9, d3                  @idx+1 (row 2)

    vmull.u8    q11, d16, d7                @mul (row 1)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shift (row 0)

    vsub.s8     d4, d4, d3                  @idx (row 3)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vsub.s8     d5, d5, d3                  @idx+1 (row 3)

    vmull.u8    q10, d12, d7                @mul (row 2)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vmlal.u8    q10, d13, d6                @mul (row 2)

    vst1.32     d24[0], [r2], r3            @st row 0
    vrshrn.i16  d22, q11, #5                @round shift (row 1)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)

    vmull.u8    q9, d16, d7                 @mul (row 3)
    vmlal.u8    q9, d17, d6                 @mul (row 3)

    vst1.32     d22[0], [r2], r3            @st row 1
    vrshrn.i16  d20, q10, #5                @round shift (row 2)

    vst1.32     d20[0], [r2], r3            @st row 2

    vrshrn.i16  d18, q9, #5                 @round shift (row 3)

    vst1.32     d18[0], [r2], r3            @st (row 3)

end_func:
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
