1@
2@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3@
4@ Use of this source code is governed by a BSD-style license
5@ that can be found in the LICENSE file in the root of the source
6@ tree. An additional intellectual property rights grant can be found
7@ in the file PATENTS.  All contributing project authors may
8@ be found in the AUTHORS file in the root of the source tree.
9@
10
11@ Contains a function for WebRtcIsacfix_CalculateResidualEnergyNeon() in
12@ iSAC codec, optimized for ARM Neon platform. Reference code in
13@ lpc_masking_model.c.
14
.arch armv7-a
.fpu neon

@ FPSCR.QC (bit 27) is the sticky cumulative-saturation flag. Saturating NEON
@ instructions such as VQADD set it on overflow; they do NOT change the APSR
@ N/Z/C/V flags, so the overflow test must read FPSCR explicitly.
.equ FPSCR_QC_BIT, 0x08000000

.global WebRtcIsacfix_CalculateResidualEnergyNeon
.type WebRtcIsacfix_CalculateResidualEnergyNeon, %function
.align  2

@ int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order,
@                                                   int32_t q_val_corr,
@                                                   int q_val_polynomial,
@                                                   int16_t* a_polynomial,
@                                                   int32_t* corr_coeffs,
@                                                   int* q_val_residual_energy);
@
@ Accumulates, for 0 <= i <= j <= lpc_order,
@   a_polynomial[j] * a_polynomial[j - i] * corr_coeffs[i] * (i == 0 ? 1 : 2)
@ into a 64-bit sum. Whenever an addition would overflow 64 bits, every live
@ accumulator is halved and shift_internal is incremented. The sum is then
@ normalized to 32 bits and returned; the resulting Q domain is stored
@ through q_val_residual_energy.
@
@ ABI:    AAPCS, ARM state.
@ In:     r0 = lpc_order, r1 = q_val_corr, r2 = q_val_polynomial,
@         r3 = a_polynomial, [sp] = corr_coeffs,
@         [sp, #4] = q_val_residual_energy.
@ Out:    r0 = residual energy, *q_val_residual_energy = its Q value.
@ Clobb:  r1-r3, r12, q0, q1, q10-q15, APSR flags, FPSCR.QC.
@         r4-r11 are saved/restored; d8-d15 are not touched.
@
@ Stack frame after the prologue:
@   [sp, #0]   unused (keeps sp 8-byte aligned)
@   [sp, #4]   &a_polynomial[lpc_order]
@   [sp, #8]   q_val_corr
@   [sp, #12]  q_val_polynomial
@   [sp, #16]  saved r4-r11 (32 bytes)
@   [sp, #48]  corr_coeffs             (5th argument)
@   [sp, #52]  q_val_residual_energy   (6th argument)
@
@ Register roles inside the loops:
@   r6  = i                      r1  = j (advances by 2)
@   r7  = &a_polynomial[i]       r9  = &a_polynomial[lpc_order - i]
@   r10 = lpc_order - 1 - i      r11 = &corr_coeffs[i]
@   q11 = -shift_internal in both 64-bit lanes (negative => right shift)
@   q10 = {1, 1}                 d25 = corr_coeffs[i] in both 32-bit lanes
@   q13 = sum64 (two partial lanes), q15 = per-i partial sum (two lanes)
@   r5 / r3 = scratch for FPSCR reads

WebRtcIsacfix_CalculateResidualEnergyNeon:
.fnstart
.save {r4-r11}
  push {r4-r11}

.pad #16
  sub sp, sp, #16
  str r1, [sp, #8]            @ Spill q_val_corr.
  str r2, [sp, #12]           @ Spill q_val_polynomial.

  @ QC is sticky and may have been left set by earlier code; start clean so
  @ that every later test reflects only our own VQADDs.
  vmrs r4, fpscr
  bic r4, r4, #FPSCR_QC_BIT
  vmsr fpscr, r4

  mov r4, #1
  vmov.s64 q11, #0            @ Initialize shift_internal.
  vmov.s64 q13, #0            @ Initialize sum64.
  vmov.s64 q10, #0
  vmov.u8 d20[0], r4          @ Set both 64-bit lanes of q10 to 1, so that
  vmov.u8 d21[0], r4          @ the shift count is updated in both lanes.

  cmp r0, #0
  blt POST_LOOP_I             @ Negative order: nothing to accumulate.

  add r9, r3, r0, asl #1      @ &a_polynomial[lpc_order]
  mov r6, #0                  @ Loop counter i.
  ldr r11, [sp, #48]          @ corr_coeffs
  sub r10, r0, #1
  mov r7, r3                  @ &a_polynomial[0]
  str r9, [sp, #4]

LOOP_I:
  ldr r2, [r11], #4           @ corr_coeffs[i]
  vmov.s64 q15, #0            @ Initialize the sum64_tmp.
  vdup.s32 d25, r2

  cmp r0, r6                  @ Compare lpc_order to i.
  movle r2, r6                @ No pairs left: next unprocessed j is i.
  ble POST_LOOP_J

  mov r1, r6                  @ j = i;
  mov r12, r7                 @ &a_polynomial[i]
  mov r4, r3                  @ &a_polynomial[j - i]

  @ Process two values of j per iteration: each 32-bit load fetches two
  @ adjacent int16 coefficients.
LOOP_J:
  ldr r8, [r12], #4           @ a_polynomial[j], a_polynomial[j + 1]
  ldr r5, [r4], #4            @ a_polynomial[j - i], a_polynomial[j - i + 1]
  vmov.u32 d0[0], r8
  vmov.u32 d1[0], r5
  vmull.s16 q0, d0, d1        @ 32-bit products; only lanes 0 and 1 are used.
  vmull.s32 q0, d0, d25       @ Two 64-bit terms: product * corr_coeffs[i].
  cmp r6, #0                  @ i == 0?
  vshl.s64 q0, q11            @ Scale by the current shift_internal.
  beq SUM1
  vshl.s64 q0, #1             @ Off-diagonal terms count twice.

SUM1:
  vqadd.s64 q14, q0, q15      @ Saturating sum; sets FPSCR.QC on overflow.
  add r1, r1, #2
  vmrs r5, fpscr
  tst r5, #FPSCR_QC_BIT
  beq MOV1                    @ Skip the shift if there's no overflow.
  bic r5, r5, #FPSCR_QC_BIT   @ Re-arm the sticky flag for the next test.
  vmsr fpscr, r5
  vshr.s64 q0, #1
  vshr.s64 q15, #1
  vshr.s64 q13, #1            @ Keep sum64 at the same scale as sum64_tmp.
  vadd.s64 q14, q0, q15
  vsub.s64 q11, q10           @ shift_internal += 1.

MOV1:
  cmp r0, r1                  @ Compare lpc_order to j.
  vmov.s64 q15, q14
  bgt LOOP_J

  @ r2 = first j not covered by the pair loop.
  bic r1, r10, #1
  add r2, r6, #2
  add r2, r1, r2

POST_LOOP_J:
  vqadd.s64 q0, q13, q15      @ Saturating sum; sets FPSCR.QC on overflow.
  vmrs r5, fpscr
  tst r5, #FPSCR_QC_BIT
  beq MOV2                    @ Skip the shift if there's no overflow.
  bic r5, r5, #FPSCR_QC_BIT
  vmsr fpscr, r5
  vshr.s64 q13, #1
  vshr.s64 q15, #1
  vadd.s64 q0, q13, q15
  vsub.s64 q11, q10           @ shift_internal += 1.

MOV2:
  vmov.s64 q13, q0            @ update sum64.
  cmp r2, r0
  bne CHECK_LOOP_CONDITION    @ j == lpc_order already handled as a pair.

  @ Last sample in the inner loop (j == lpc_order), handled as a scalar.
  ldr r4, [sp, #4]
  ldrsh r8, [r4]              @ a_polynomial[lpc_order]
  ldrsh r12, [r9]             @ a_polynomial[lpc_order - i]
  mul r8, r8, r12
  vmov.s32 d0[0], r8
  vmull.s32 q0, d0, d25       @ Only the low 64-bit lane (d0) is meaningful.
  cmp r6, #0                  @ i == 0?
  vshl.s64 q0, q11            @ Scale by the current shift_internal.
  beq SUM2
  vshl.s64 q0, #1             @ Off-diagonal terms count twice.

SUM2:
  vqadd.s64 d1, d0, d26       @ Saturating sum; sets FPSCR.QC on overflow.
  vmrs r5, fpscr
  tst r5, #FPSCR_QC_BIT
  beq MOV3                    @ Skip the shift if there's no overflow.
  bic r5, r5, #FPSCR_QC_BIT
  vmsr fpscr, r5
  vshr.s64 q13, #1            @ Both lanes of sum64 change scale together.
  vshr.s64 d0, #1
  vadd.s64 d1, d0, d26
  vsub.s64 q11, q10           @ shift_internal += 1.

MOV3:
  vmov.s64 d26, d1            @ update sum64.

CHECK_LOOP_CONDITION:
  add r6, r6, #1
  sub r9, r9, #2
  cmp r0, r6                  @ Compare i to lpc_order.
  sub r10, r10, #1
  add r7, r7, #2
  bge LOOP_I

POST_LOOP_I:
  @ Fold the two partial lanes of sum64 into a single 64-bit value in d0.
  vqadd.s64 d0, d26, d27      @ Saturating sum; sets FPSCR.QC on overflow.
  vmrs r3, fpscr
  tst r3, #FPSCR_QC_BIT
  beq GET_SHIFT_NORM          @ Skip the shift if there's no overflow.
  bic r3, r3, #FPSCR_QC_BIT
  vmsr fpscr, r3
  vshr.s64 q13, #1
  vadd.s64 d0, d26, d27
  vsub.s64 q11, q10           @ shift_internal += 1.

GET_SHIFT_NORM:
  @ Normalize in two passes: each pass shifts left by the number of redundant
  @ sign bits in the upper 32 bits (at most 31 per pass).
  vcls.s32 d1, d0             @ Count leading extra sign bits.
  vmov.32 r2, d1[1]           @ Store # of sign bits of only the 32 MSBs.
  vmovl.s32 q1, d1
  vshl.s64 d0, d3             @ d3 contains # of sign bits of the 32 MSBs.

  vcls.s32 d1, d0             @ Count again the leading extra sign bits.
  vmov.s32 r1, d1[1]          @ Store # of sign bits of only the 32 MSBs.
  vmovl.s32 q1, d1
  vshl.s64 d0, d3             @ d3 contains # of sign bits of the 32 MSBs.

  vmov.s32 r0, d0[1]          @ residual_energy = upper 32 bits of the sum.
  vmov.s32 r3, d22[0]         @ -shift_internal

  @ Calculate the value for q_val_residual_energy:
  @   q_val_corr + 2 * q_val_polynomial - 32
  @     + (both normalization left shifts) - shift_internal
  ldr r4, [sp, #8]            @ q_val_corr
  ldr r5, [sp, #12]           @ q_val_polynomial
  sub r12, r4, #32
  add r12, r12, r5, asl #1
  add r1, r12, r1             @ add 2nd normalization shift.
  add r12, r1, r2             @ add 1st normalization shift.
  ldr r2, [sp, #52]           @ q_val_residual_energy
  add r3, r12, r3             @ value for q_val_residual_energy.
  str r3, [r2, #0]

  add sp, sp, #16
  pop {r4-r11}
  bx  lr

.fnend
.size WebRtcIsacfix_CalculateResidualEnergyNeon, .-WebRtcIsacfix_CalculateResidualEnergyNeon
177
178