; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorVerQuarter
;--            function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE

;// h264bsdInterpolateHorVerQuarter register allocation

ref     RN 0

mb      RN 1
buff    RN 1

count   RN 2
x0      RN 2

y0      RN 3
x_2_0   RN 3
res     RN 3

x_3_1   RN 4
tmp1    RN 4

height  RN 5
x_6_4   RN 5
tmp2    RN 5

partW   RN 6
x_7_5   RN 6
tmp3    RN 6

partH   RN 7
tmp4    RN 7

tmp5    RN 8

tmp6    RN 9

tmpa    RN 10

mult_20_01  RN 11
tmpb        RN 11

mult_20_m5  RN 12
width       RN 12

plus16  RN 14


;// function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateHorVerQuarter
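
;// Judging from the register use and the stack offsets below, the C-level
;// signature is presumably as follows (a sketch inferred from this file,
;// not taken from a header; the type names u8/i32/u32 are assumptions):
;//
;// void h264bsdInterpolateHorVerQuarter(u8 *ref, u8 *mb,
;//     i32 x0, i32 y0, u32 width, u32 height,
;//     u32 partWidth, u32 partHeight, u32 horVerOffset);
;//
;// r0-r3 carry ref, mb, x0 and y0; the remaining arguments are read from
;// the stack (at sp+0x218..sp+0x228 once the prologue has run).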

;// Horizontal filter approach
;//
;// The basic idea of the horizontal filtering is to adjust the filter
;// coefficients as shown below. Calculation is done with 16-bit arithmetic.
;//
;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
;// y_0 =   20  1     20 -5        -5         1
;// y_1 =   -5        20  1      1 20        -5
;// y_2 =    1        -5        -5 20      1 20
;// y_3 =              1        20 -5     -5 20         1
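;//
;// For reference, each output below is the standard 6-tap H.264 luma
;// filter; a minimal C sketch of one half-pel output pixel (clip8(),
;// saturating to 0..255, is an assumed helper):
;//
;// int acc = x[-2] - 5*x[-1] + 20*x[0] + 20*x[1] - 5*x[2] + x[3];
;// out = clip8((acc + 16) >> 5);    /* round, shift, saturate */
;//
;// The SMLAD/SMLATB sequences below build four such accumulators
;// (tmp4..tmpa) in parallel, two products at a time, using the packed
;// multipliers 0x00140001 (20, 1) and 0x0014FFFB (20, -5).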


h264bsdInterpolateHorVerQuarter
    STMFD   sp!, {r0-r11, lr}
    SUB     sp, sp, #0x1e4

    CMP     x0, #0
    BLT     do_fill                 ;// (x0 < 0)
    LDR     partW, [sp,#0x220]      ;// partWidth
    LDR     width, [sp,#0x218]      ;// width
    ADD     tmpa, x0, partW         ;// (x0+partWidth)
    ADD     tmpa, tmpa, #5          ;// (x0+partW+5)
    CMP     tmpa, width
    BHI     do_fill                 ;// (x0+partW+5) > width

    CMP     y0, #0
    BLT     do_fill                 ;// (y0 < 0)
    LDR     partH, [sp,#0x224]      ;// partHeight
    LDR     height, [sp,#0x21c]     ;// height
    ADD     tmp5, y0, partH         ;// (y0+partHeight)
    ADD     tmp5, tmp5, #5          ;// (y0+partH+5)
    CMP     tmp5, height
    BLS     skip_fill               ;// no overfill needed


do_fill
    LDR     partH, [sp,#0x224]      ;// partHeight
    LDR     partW, [sp,#0x220]      ;// partWidth
    LDR     height, [sp,#0x21c]     ;// height
    ADD     tmp5, partH, #5         ;// tmp5 = partH + 5
    ADD     tmpa, partW, #5         ;// tmpa = partW + 5
    STMIB   sp, {height, tmpa}      ;// sp+4 = height, sp+8 = partWidth+5
    LDR     width, [sp,#0x218]      ;// width
    STR     tmp5, [sp,#0xc]         ;// sp+c = partHeight+5
    STR     tmpa, [sp,#0x10]        ;// sp+10 = partWidth+5
    STR     width, [sp,#0]          ;// sp+0 = width
    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
    BL      h264bsdFillBlock

    MOV     x0, #0
    STR     x0,[sp,#0x1ec]          ;// x0 = 0
    STR     x0,[sp,#0x1f0]          ;// y0 = 0
    ADD     ref,sp,#0x28            ;// ref = p1
    STR     tmpa, [sp,#0x218]       ;// width = partWidth+5
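
;// The call above hands h264bsdFillBlock a padded (partWidth+5) x
;// (partHeight+5) block copied into the stack buffer. Reading the
;// argument marshaling (r0-r3 plus sp+0..sp+0x10), the imported function
;// presumably has a shape like this C sketch (parameter names are
;// assumptions, not taken from a header):
;//
;// void h264bsdFillBlock(u8 *ref, u8 *fill, i32 x0, i32 y0,
;//     u32 width, u32 height, u32 blockWidth, u32 blockHeight,
;//     u32 fillScanLength);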


skip_fill
    LDR     x0 ,[sp,#0x1ec]         ;// x0
    LDR     y0 ,[sp,#0x1f0]         ;// y0
    LDR     width, [sp,#0x218]      ;// width
    LDR     tmp6, [sp,#0x228]       ;// horVerOffset
    LDR     mb, [sp, #0x1e8]        ;// mb
    MLA     tmp5, width, y0, x0     ;// y0*width+x0
    ADD     ref, ref, tmp5          ;// ref += y0*width+x0
    STR     ref, [sp, #0x1e4]       ;// store "ref" for vertical filtering
    AND     tmp6, tmp6, #2          ;// calculate ref for horizontal filter
    MOV     tmpa, #2
    ADD     tmp6, tmpa, tmp6, LSR #1
    MLA     ref, tmp6, width, ref
    ADD     ref, ref, #8            ;// ref = ref+8

    ;// pack values to count register
    ;// [31:28] loop_x (partWidth-1)
    ;// [27:24] loop_y (partHeight-1)
    ;// [23:20] partWidth-1
    ;// [19:16] partHeight-1
    ;// [15:00] width
    MOV     count, width
    SUB     partW, partW, #1
    SUB     partH, partH, #1
    ADD     tmp5, partH, partW, LSL #4
    ADD     count, count, tmp5, LSL #16
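
;// In C terms the packing amounts to this sketch (field naming is
;// illustrative only):
;//
;// count  = width;                     /* bits [15:00] */
;// count += (partHeight - 1) << 16;    /* bits [19:16] */
;// count += (partWidth  - 1) << 20;    /* bits [23:20] */
;//
;// Bits [27:24] and [31:28] hold the live y/x loop counters; they are
;// (re)loaded from the fields above with shifted ADDs below.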


    LDR     mult_20_01, = 0x00140001    ;// constant multipliers
    LDR     mult_20_m5, = 0x0014FFFB    ;// constant multipliers
    MOV     plus16, #16                 ;// constant for add
    AND     tmp4, count, #0x000F0000    ;// partHeight-1
    AND     tmp6, count, #0x00F00000    ;// partWidth-1
    ADD     count, count, tmp4, LSL #8  ;// partH-1 to lower part of top byte

;// HORIZONTAL PART

loop_y_hor
    LDR     x_3_1, [ref, #-8]
    ADD     count, count, tmp6, LSL #8   ;// partW-1 to upper part of top byte
    LDR     x_7_5, [ref, #-4]
    UXTB16  x_2_0, x_3_1
    UXTB16  x_3_1, x_3_1, ROR #8
    UXTB16  x_6_4, x_7_5

loop_x_hor
    UXTB16  x_7_5, x_7_5, ROR #8

    SMLAD   tmp4, x_2_0, mult_20_01, plus16
    SMLATB  tmp6, x_2_0, mult_20_01, plus16
    SMLATB  tmp5, x_2_0, mult_20_m5, plus16
    SMLATB  tmpa, x_3_1, mult_20_01, plus16

    SMLAD   tmp4, x_3_1, mult_20_m5, tmp4
    SMLATB  tmp6, x_3_1, mult_20_m5, tmp6
    SMLAD   tmp5, x_3_1, mult_20_01, tmp5
    LDR     x_3_1, [ref], #4
    SMLAD   tmpa, x_6_4, mult_20_m5, tmpa

    SMLABB  tmp4, x_6_4, mult_20_m5, tmp4
    SMLADX  tmp6, x_6_4, mult_20_m5, tmp6
    SMLADX  tmp5, x_6_4, mult_20_01, tmp5
    SMLADX  tmpa, x_7_5, mult_20_m5, tmpa

    SMLABB  tmp4, x_7_5, mult_20_01, tmp4
    UXTB16  x_2_0, x_3_1
    SMLABB  tmp5, x_7_5, mult_20_m5, tmp5
    SMLADX  tmp6, x_7_5, mult_20_01, tmp6
    SMLABB  tmpa, x_2_0, mult_20_01, tmpa

    MOV     tmp5, tmp5, ASR #5
    MOV     tmp4, tmp4, ASR #5
    PKHBT   tmp5, tmp5, tmpa, LSL #(16-5)
    PKHBT   tmp4, tmp4, tmp6, LSL #(16-5)
    USAT16  tmp5, #8, tmp5
    USAT16  tmp4, #8, tmp4

    SUBS    count, count, #4<<28
    ORR     tmp4, tmp4, tmp5, LSL #8
    STR     tmp4, [mb], #4
    BCC     next_y_hor

    UXTB16  x_3_1, x_3_1, ROR #8

    SMLAD   tmp4, x_6_4, mult_20_01, plus16
    SMLATB  tmp6, x_6_4, mult_20_01, plus16
    SMLATB  tmp5, x_6_4, mult_20_m5, plus16
    SMLATB  tmpa, x_7_5, mult_20_01, plus16

    SMLAD   tmp4, x_7_5, mult_20_m5, tmp4
    SMLATB  tmp6, x_7_5, mult_20_m5, tmp6
    SMLAD   tmp5, x_7_5, mult_20_01, tmp5
    LDR     x_7_5, [ref], #4
    SMLAD   tmpa, x_2_0, mult_20_m5, tmpa

    SMLABB  tmp4, x_2_0, mult_20_m5, tmp4
    SMLADX  tmp6, x_2_0, mult_20_m5, tmp6
    SMLADX  tmp5, x_2_0, mult_20_01, tmp5
    SMLADX  tmpa, x_3_1, mult_20_m5, tmpa

    SMLABB  tmp4, x_3_1, mult_20_01, tmp4
    UXTB16  x_6_4, x_7_5
    SMLABB  tmp5, x_3_1, mult_20_m5, tmp5
    SMLADX  tmp6, x_3_1, mult_20_01, tmp6
    SMLABB  tmpa, x_6_4, mult_20_01, tmpa

    MOV     tmp5, tmp5, ASR #5
    MOV     tmp4, tmp4, ASR #5
    PKHBT   tmp5, tmp5, tmpa, LSL #(16-5)
    PKHBT   tmp4, tmp4, tmp6, LSL #(16-5)
    USAT16  tmp5, #8, tmp5
    USAT16  tmp4, #8, tmp4

    SUBS    count, count, #4<<28
    ORR     tmp4, tmp4, tmp5, LSL #8
    STR     tmp4, [mb], #4
    BCS     loop_x_hor

next_y_hor
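    ;// Pointer bookkeeping for the next row; two non-obvious tricks here:
    ;// SMLABB multiplies the bottom halfwords of count and mult_20_01,
    ;// i.e. width * 1, so it is simply "ref += width" without needing a
    ;// spare register. ADDS clears the carry flag, so each following SBC
    ;// subtracts (partWidth-1) + 1 = partWidth in a single instruction.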
    AND     tmp6, count, #0x00F00000        ;// partWidth-1
    SMLABB  ref, count, mult_20_01, ref     ;// +width
    ADDS    mb, mb, #16                     ;// +16, Carry=0
    SBC     mb, mb, tmp6, LSR #20           ;// -(partWidth-1)-1
    SBC     ref, ref, tmp6, LSR #20         ;// -(partWidth-1)-1
    ADDS    count, count, #(1<<28)-(1<<24)  ;// decrement y loop counter
    BGE     loop_y_hor



;// VERTICAL PART
;//
;// Approach to vertical interpolation
;//
;// Interpolation is done by using 32-bit loads and stores
;// and by using 16-bit arithmetic. A 4x4 block is processed
;// in each round.
;//
;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
;//           ..
;//           ..
;// |a_m1|a_m1|a_m1|a_m1|...
;// |b_m1|b_m1|b_m1|b_m1|...
;// |c_m1|c_m1|c_m1|c_m1|...
;// |d_m1|d_m1|d_m1|d_m1|...
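;//
;// The column filter is the same 6-tap kernel as in the horizontal part;
;// a per-pixel C sketch of one vertical half-pel output (rows named A, C,
;// G, M, R, T as in the load comments below; clip8() is an assumed
;// saturating helper):
;//
;// int acc = A - 5*C + 20*G + 20*M - 5*R + T;   /* six rows, one column */
;// out = clip8((acc + 16) >> 5);
;//
;// UXTB16/UXTAB16 split each 32-bit row into its even and odd bytes, so
;// four such columns are accumulated in parallel as 16-bit lanes.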

;// Approach to bilinear interpolation to quarter-pel position:
;// 4 bytes are processed in parallel.
;//
;// The algorithm is (a+b+1)/2. Rounding upwards (+1) can be achieved by
;// negating the second operand to get its one's complement (instead of
;// two's complement) and using a halving subtraction; EOR is then used
;// to correct the sign.
;//
;// MVN     b, b
;// UHSUB8  a, a, b
;// EOR     a, a, 0x80808080
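;//
;// Why this rounds upwards, per byte (a sketch of the arithmetic):
;//
;//   MVN:        ~b                 = 255 - b
;//   UHSUB8:     (a - (255-b)) >> 1 = (a + b - 255) >> 1
;//   EOR 0x80:   adds 128 (mod 256) = (a + b - 255 + 256) >> 1
;//                                  = (a + b + 1) >> 1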


    LDR     ref, [sp, #0x1e4]           ;// ref
    LDR     tmpa, [sp, #0x228]          ;// horVerOffset
    LDR     mb, [sp, #0x1e8]            ;// mb
    LDR     width, [sp, #0x218]         ;// width
    ADD     ref, ref, #2                ;// calculate correct position
    AND     tmpa, tmpa, #1
    ADD     ref, ref, tmpa
    LDR     plus16, = 0x00100010        ;// +16 to lower and upper halfwords
    AND     count, count, #0x00FFFFFF   ;// clear old loop counters [31:24]

    AND     tmpa, count, #0x000F0000    ;// partHeight-1
    ADD     count, count, tmpa, LSL #8

loop_y
    ADD     count, count, tmp6, LSL #8  ;// partW-1 to upper part of top byte

loop_x
    LDR     tmp1, [ref], width     ;// |a4|a3|a2|a1|
    LDR     tmp2, [ref], width     ;// |c4|c3|c2|c1|
    LDR     tmp3, [ref], width     ;// |g4|g3|g2|g1|
    LDR     tmp4, [ref], width     ;// |m4|m3|m2|m1|
    LDR     tmp5, [ref], width     ;// |r4|r3|r2|r1|
    LDR     tmp6, [ref], width     ;// |t4|t3|t2|t1|

    ;// first four pixels
    UXTB16  tmpa, tmp3                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp4            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp2                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)

    UXTAB16 tmpb, tmpb, tmp5            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp3, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp2, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp5, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp1, [mb]
    LDR     tmpa, = 0xFF00FF00
    MVN     tmp1, tmp1
    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa

    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp1              ;// bilinear interpolation
    LDR     tmp1, [ref], width          ;// load next row
    EOR     res, res, tmpa              ;// correct sign

    STR     res, [mb], #16              ;// next row (mb)


    ;// tmp2 = |a4|a3|a2|a1|
    ;// tmp3 = |c4|c3|c2|c1|
    ;// tmp4 = |g4|g3|g2|g1|
    ;// tmp5 = |m4|m3|m2|m1|
    ;// tmp6 = |r4|r3|r2|r1|
    ;// tmp1 = |t4|t3|t2|t1|

    ;// second four pixels
    UXTB16  tmpa, tmp4                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp5            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp3                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp6            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp4, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp5, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp3, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp6, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp2, [mb]
    LDR     tmpa, = 0xFF00FF00
    MVN     tmp2, tmp2

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp2              ;// bilinear interpolation
    LDR     tmp2, [ref], width          ;// load next row
    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #16              ;// next row

    ;// tmp3 = |a4|a3|a2|a1|
    ;// tmp4 = |c4|c3|c2|c1|
    ;// tmp5 = |g4|g3|g2|g1|
    ;// tmp6 = |m4|m3|m2|m1|
    ;// tmp1 = |r4|r3|r2|r1|
    ;// tmp2 = |t4|t3|t2|t1|

    ;// third four pixels
    UXTB16  tmpa, tmp5                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp6            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp4                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp1            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp5, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp4, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp1, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp3, [mb]
    LDR     tmpa, = 0xFF00FF00
    MVN     tmp3, tmp3

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp3              ;// bilinear interpolation
    LDR     tmp3, [ref]                 ;// load next row
    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #16              ;// next row

    ;// tmp4 = |a4|a3|a2|a1|
    ;// tmp5 = |c4|c3|c2|c1|
    ;// tmp6 = |g4|g3|g2|g1|
    ;// tmp1 = |m4|m3|m2|m1|
    ;// tmp2 = |r4|r3|r2|r1|
    ;// tmp3 = |t4|t3|t2|t1|

    ;// fourth four pixels
    UXTB16  tmpa, tmp6                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp1            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp5                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp2            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp4            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp6, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp5, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp2, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp5, [mb]
    LDR     tmp4, = 0xFF00FF00
    MVN     tmp5, tmp5

    AND     tmpa, tmp4, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp5              ;// bilinear interpolation

    ;// decrement loop_x counter
    SUBS    count, count, #4<<28        ;// decrement x loop counter

    ;// calculate "ref" address for next round
    SUB     ref, ref, width, LSL #3     ;// ref -= 8*width;
    ADD     ref, ref, #4                ;// next column (4 pixels)

    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #-44

    BCS     loop_x

    ADDS    mb, mb, #64                 ;// set Carry=0
    ADD     ref, ref, width, LSL #2     ;// ref += 4*width
    AND     tmp6, count, #0x00F00000    ;// partWidth-1
    SBC     ref, ref, tmp6, LSR #20     ;// -(partWidth-1)-1
    SBC     mb, mb, tmp6, LSR #20       ;// -(partWidth-1)-1

    ADDS    count, count, #0xC << 24    ;// decrement y loop counter
    BGE     loop_y

    ADD     sp, sp, #0x1f4
    LDMFD   sp!, {r4-r11, pc}

    END