Dot_p_neon.s revision b676a05348e4c516fa8b57e33b10548e6142c3f8
1@/*
2@ ** Copyright 2003-2010, VisualOn, Inc.
3@ **
4@ ** Licensed under the Apache License, Version 2.0 (the "License");
5@ ** you may not use this file except in compliance with the License.
6@ ** You may obtain a copy of the License at
7@ **
8@ **     http://www.apache.org/licenses/LICENSE-2.0
9@ **
10@ ** Unless required by applicable law or agreed to in writing, software
11@ ** distributed under the License is distributed on an "AS IS" BASIS,
12@ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13@ ** See the License for the specific language governing permissions and
14@ ** limitations under the License.
15@ */
16@
17@**********************************************************************/
18@Word32 Dot_product12(                      /* (o) Q31: normalized result (-1 <= val < 1) */
19@       Word16 x[],                           /* (i) 12bits: x vector                       */
20@       Word16 y[],                           /* (i) 12bits: y vector                       */
21@       Word16 lg,                            /* (i)    : vector length                     */
22@       Word16 * exp                          /* (o)    : exponent of result (0..+30)       */
23@)
24@************************************************************************
25@  x[]   ---  r0
26@  y[]   ---  r1
27@  lg    ---  r2
28@  *exp  ---  r3
29
@-----------------------------------------------------------------------
@ Word32 Dot_product12_asm(Word16 x[], Word16 y[], Word16 lg, Word16 *exp)
@
@ Syntax: GNU as, ARM (A32) with NEON.   ABI: AAPCS.
@
@ In:     r0 = x[]    r1 = y[]    r2 = lg    r3 = exp
@ Out:    r0   = L_sum << sft, where L_sum = 2 * sum(x[i] * y[i]) + 1
@                and sft = clz(|L_sum|) - 1
@         *exp = 30 - sft                     (stored as a halfword)
@
@ Length: only two lengths are implemented. lg == 64 accumulates 64
@         products; ANY other lg accumulates exactly 80 (64 + 16).
@         NOTE(review): callers presumably pass only 64 or 80 - confirm.
@
@ Paths:  r0 == r1 (same vector) takes LOOP_EQ, which loads the data once
@         and accumulates squares.
@
@ Saved:  r4-r12, lr and d8-d15 are pushed on entry and restored on exit.
@         q4-q7 (d8-d15) are callee-saved under AAPCS and are used below
@         as scratch, so they must be preserved for the caller.
@ Clobb:  q0-q3, q8-q13, q15, flags.
@-----------------------------------------------------------------------

          .section   .text
          .global    Dot_product12_asm

Dot_product12_asm:

          STMFD             r13!, {r4 - r12, r14}
          VPUSH             {D8 - D15}                    @ preserve callee-saved q4-q7
          CMP               r0, r1
          BEQ               LOOP_EQ                       @ x and y are the same vector

          @ ---- general case: x != y --------------------------------------
          VLD1.S16          {Q0, Q1}, [r0]!               @ x[0..15]
          VLD1.S16          {Q2, Q3}, [r0]!               @ x[16..31]
          VLD1.S16          {Q4, Q5}, [r0]!               @ x[32..47]
          VLD1.S16          {Q6, Q7}, [r0]!               @ x[48..63]
          VLD1.S16          {Q8, Q9}, [r1]!               @ y[0..15]
          VLD1.S16          {Q10, Q11}, [r1]!             @ y[16..31]
          VLD1.S16          {Q12, Q13}, [r1]!             @ y[32..47]

          @ Q15 holds four 32-bit partial sums.
          VMULL.S16         Q15, D16, D0
          VMLAL.S16         Q15, D17, D1
          VMLAL.S16         Q15, D18, D2
          VMLAL.S16         Q15, D19, D3
          VLD1.S16          {Q0, Q1}, [r1]!               @ y[48..63]; x[0..15] already consumed
          VMLAL.S16         Q15, D20, D4
          VMLAL.S16         Q15, D21, D5
          VMLAL.S16         Q15, D22, D6
          VMLAL.S16         Q15, D23, D7
          VMLAL.S16         Q15, D24, D8
          VMLAL.S16         Q15, D25, D9
          VMLAL.S16         Q15, D26, D10
          VMLAL.S16         Q15, D27, D11
          VMLAL.S16         Q15, D0, D12
          VMLAL.S16         Q15, D1, D13
          VMLAL.S16         Q15, D2, D14
          VMLAL.S16         Q15, D3, D15

          CMP               r2, #64
          BEQ               SUM_DONE                      @ lg == 64: all elements done
          VLD1.S16          {Q0, Q1}, [r0]!               @ x[64..79]
          VLD1.S16          {Q2, Q3}, [r1]!               @ y[64..79]
          VMLAL.S16         Q15, D4, D0
          VMLAL.S16         Q15, D5, D1
          VMLAL.S16         Q15, D6, D2
          VMLAL.S16         Q15, D7, D3
          B                 SUM_DONE                      @ plain jump: no link needed

          @ ---- x == y: accumulate squares from a single load stream ------
LOOP_EQ:
          VLD1.S16          {Q0, Q1}, [r0]!               @ x[0..15]
          VLD1.S16          {Q2, Q3}, [r0]!               @ x[16..31]
          VLD1.S16          {Q4, Q5}, [r0]!               @ x[32..47]
          VLD1.S16          {Q6, Q7}, [r0]!               @ x[48..63]
          VMULL.S16         Q15, D0, D0
          VMLAL.S16         Q15, D1, D1
          VMLAL.S16         Q15, D2, D2
          VMLAL.S16         Q15, D3, D3
          VMLAL.S16         Q15, D4, D4
          VMLAL.S16         Q15, D5, D5
          VMLAL.S16         Q15, D6, D6
          VMLAL.S16         Q15, D7, D7
          VMLAL.S16         Q15, D8, D8
          VMLAL.S16         Q15, D9, D9
          VMLAL.S16         Q15, D10, D10
          VMLAL.S16         Q15, D11, D11
          VMLAL.S16         Q15, D12, D12
          VMLAL.S16         Q15, D13, D13
          VMLAL.S16         Q15, D14, D14
          VMLAL.S16         Q15, D15, D15

          CMP               r2, #64
          BEQ               SUM_DONE                      @ lg == 64: all elements done
          VLD1.S16          {Q0, Q1}, [r0]!               @ x[64..79]
          VMLAL.S16         Q15, D0, D0
          VMLAL.S16         Q15, D1, D1
          VMLAL.S16         Q15, D2, D2
          VMLAL.S16         Q15, D3, D3

          @ ---- reduce the four lanes, then normalize ---------------------
SUM_DONE:

          VQADD.S32         D30, D30, D31                 @ fold high half onto low half (saturating)
          VPADD.S32         D30, D30, D30                 @ lane 0 = lane 0 + lane 1
          VMOV.32           r12, D30[0]                   @ r12 = sum of products

          ADD               r12, r12, r12
          ADD               r12, r12, #1                  @ L_sum = (L_sum << 1) + 1
          MOV               r4, r12
          CMP               r12, #0
          RSBLT             r4, r12, #0                   @ r4 = |L_sum|
          CLZ               r10, r4
          SUB               r10, r10, #1                  @ sft = clz(|L_sum|) - 1
          MOV               r0, r12, LSL r10              @ return L_sum << sft
          RSB               r11, r10, #30                 @ *exp = 30 - sft
          STRH              r11, [r3]

Dot_product12_end:

          VPOP              {D8 - D15}                    @ restore callee-saved q4-q7
          LDMFD             r13!, {r4 - r12, r15}         @ restore and return (pc <- saved lr)

          .END
127
128