1#
2# Copyright (C) 2011 The Android Open Source Project
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#      http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16
17# IDCT implementation using the MIPS DSP ASE (little endian version)
18#
19# See MIPS Technologies Inc documents:
20# "JPEG Decoder Optimization for MIPS32(R) Cores"  MD00483
21#
22# "MIPS32(R) Architecture for Programmers Volume IV-e: The MIPS(R) DSP
23#       Application Specific Extension to the MIPS32(R) Architecture" MD00374
24#
25
# Assembler modes for all code below.  The instruction stream is scheduled
# by hand, so the assembler must emit it exactly as written.
26        .set            noreorder               # no reordering: branch delay slots are filled by hand
27        .set            nomacro                 # flag any instruction that would expand to several
28        .set            noat                    # $at is used explicitly as a normal register
29
30# This table has been moved to mips_jidctfst.c to avoid having to mess
31# with the global pointer to make this code PIC.
32#       .rdata
33#
34# mips_idct_coefs:
35#       # Constant table of scaled IDCT coefficients.
36#
37#       .word           0x45464546              # FIX( 1.082392200 / 2) =  17734 = 0x4546
38#       .word           0x5A825A82              # FIX( 1.414213562 / 2) =  23170 = 0x5A82
39#       .word           0x76427642              # FIX( 1.847759065 / 2) =  30274 = 0x7642
40#       .word           0xAC61AC61              # FIX(-2.613125930 / 4) = -21407 = 0xAC61
41
42        .text
43
44        .global         mips_idct_columns
45        .ent            mips_idct_columns
46
47# void mips_idct_columns(JCOEF * inptr, IFAST_MULT_TYPE * quantptr,
48#                        DCTELEM * wsptr, const int * mips_idct_coefs);
#
# Column pass of the IDCT: dequantizes the input coefficients, transforms
# the columns and stores the results in the workspace at wsptr.
#
# Every load and store moves one 32-bit word that is treated as two packed
# 16-bit values by the paired-halfword (.ph) DSP instructions, so each loop
# iteration handles two adjacent columns.  inptr, quantptr and wsptr all
# advance by 4 bytes per iteration and the loop ends once inptr has moved
# 16 bytes (4 iterations).  Consecutive rows are 16 bytes apart in all
# three arrays.
#
# Dequantization: muleq_s.w.phl / muleq_s.w.phr form the 32-bit fractional
# (doubled) product of the left / right halfword pair, and "ins" packs the
# low 16 bits of both products back into a single register.
# NOTE(review): this implies quantptr holds values pre-scaled for that
# format -- confirm against the table setup in the C caller.
#
# Shortcut: when the words of rows 1..7 are all zero, the dequantized row 0
# word is replicated to all eight workspace rows and the transform is
# skipped.
#
# Coefficients are read at byte offsets 0, 4, 8 and 12 from $at.  The
# "x2" / "x4" shifts that follow mulq_rs.ph correspond to the /2 and /4
# pre-scaling shown in the commented-out table near the top of this file.
#
# The code runs under .set noreorder: the instruction after each branch is
# its delay slot and executes whether or not the branch is taken.
#
# Saves and restores $s0-$s7 on a 32-byte stack frame.  Overwrites $at,
# $v0, $v1 and $t0-$t9, and leaves $a0-$a2 advanced past the data they
# pointed to.  No return value is produced.
# NOTE(review): the DSP arithmetic instructions can set overflow flags in
# DSPControl; nothing in this routine reads or clears them.
49
50mips_idct_columns:
51
52# $a0   - inptr
53# $a1   - quantptr
54# $a2   - wsptr
55# $a3, $at   - mips_idct_coefs (passed in $a3, copied to $at below)
56# $t0:7 - simd data
57# $t8   - coefficients, temp
58# $t9   - loop end address
59# $s0:3 - simd quantization factors
60# $s4:7 - temp results
61# $v0:1 - temp results
62
63        addiu           $sp, $sp, -32           # reserve stack space for s0-s7
64
65        sw              $s0, 28($sp)
66        sw              $s1, 24($sp)
67        sw              $s2, 20($sp)
68        sw              $s3, 16($sp)
69        sw              $s4, 12($sp)
70        sw              $s5,  8($sp)
71        sw              $s6,  4($sp)
72        sw              $s7,  0($sp)
73
74        addiu           $t9, $a0, 16            # end address: inptr + 16 bytes
75
76        #lui            $at, %hi(mips_idct_coefs)
77        #ori            $at, %lo(mips_idct_coefs)
78        # move mips_idct_coefs address from $a3 into $at where the rest of this code expects it
79        or              $at, $a3, $zero
80
81loop_columns:
82
83        lw              $s0, 0($a1)             # quantptr[DCTSIZE*0]
84
85        lw              $t0, 0($a0)             # inptr[DCTSIZE*0]
86        lw              $t1, 16($a0)            # inptr[DCTSIZE*1]
87
88        muleq_s.w.phl   $v0, $t0, $s0           # tmp0 ...
89
90        lw              $t2, 32($a0)            # inptr[DCTSIZE*2]
91        lw              $t3, 48($a0)            # inptr[DCTSIZE*3]
92        lw              $t4, 64($a0)            # inptr[DCTSIZE*4]
93        lw              $t5, 80($a0)            # inptr[DCTSIZE*5]
94
95        muleq_s.w.phr   $t0, $t0, $s0           # ... tmp0 ...
96
97        lw              $t6, 96($a0)            # inptr[DCTSIZE*6]
98        lw              $t7, 112($a0)           # inptr[DCTSIZE*7]
99
# Zero test for rows 1..7; any non-zero word selects the full transform.
100        or              $s4, $t1, $t2           # rows 1 | 2
101        or              $s5, $t3, $t4           # rows 3 | 4
102
103        bnez            $s4, full_column
104        ins             $t0, $v0, 16, 16        # ... tmp0 (delay slot, needed on both paths)
105
106        bnez            $s5, full_column
107        or              $s6, $t5, $t6           # rows 5 | 6 (delay slot)
108        or              $s6, $s6, $t7           # ... | row 7
109        bnez            $s6, full_column
110
# The first store below is the delay slot of the bnez above, so it also
# executes when branching to full_column; that path rewrites
# wsptr[DCTSIZE*0] with its own result later on.
111        sw              $t0, 0($a2)             # wsptr[DCTSIZE*0]
112        sw              $t0, 16($a2)            # wsptr[DCTSIZE*1]
113        sw              $t0, 32($a2)            # wsptr[DCTSIZE*2]
114        sw              $t0, 48($a2)            # wsptr[DCTSIZE*3]
115        sw              $t0, 64($a2)            # wsptr[DCTSIZE*4]
116        sw              $t0, 80($a2)            # wsptr[DCTSIZE*5]
117        sw              $t0, 96($a2)            # wsptr[DCTSIZE*6]
118        sw              $t0, 112($a2)           # wsptr[DCTSIZE*7]
119
120        addiu           $a0, $a0, 4             # next pair of columns in inptr
121
122        b               continue_columns
123        addiu           $a1, $a1, 4             # (delay slot) next pair of columns in quantptr
124
125
126full_column:
127
# Even part (rows 0, 2, 4, 6).  Loads for later steps are interleaved with
# the multiplies.
128        lw              $s1, 32($a1)            # quantptr[DCTSIZE*2]
129        lw              $s2, 64($a1)            # quantptr[DCTSIZE*4]
130
131        muleq_s.w.phl   $v0, $t2, $s1           # tmp1 ...
132        muleq_s.w.phr   $t2, $t2, $s1           # ... tmp1 ...
133
134        lw              $s0, 16($a1)            # quantptr[DCTSIZE*1]
135        lw              $s1, 48($a1)            # quantptr[DCTSIZE*3]
136        lw              $s3, 96($a1)            # quantptr[DCTSIZE*6]
137
138        muleq_s.w.phl   $v1, $t4, $s2           # tmp2 ...
139        muleq_s.w.phr   $t4, $t4, $s2           # ... tmp2 ...
140
141        lw              $s2, 80($a1)            # quantptr[DCTSIZE*5]
142        lw              $t8, 4($at)             # FIX(1.414213562)
143        ins             $t2, $v0, 16, 16        # ... tmp1
144
145        muleq_s.w.phl   $v0, $t6, $s3           # tmp3 ...
146        muleq_s.w.phr   $t6, $t6, $s3           # ... tmp3 ...
147
148        ins             $t4, $v1, 16, 16        # ... tmp2
149
150        addq.ph         $s4, $t0, $t4           # tmp10
151        subq.ph         $s5, $t0, $t4           # tmp11
152
153        ins             $t6, $v0, 16, 16        # ... tmp3
154
155        subq.ph         $s6, $t2, $t6           # tmp12 ...
156        addq.ph         $s7, $t2, $t6           # tmp13
157
158        mulq_rs.ph      $s6, $s6, $t8           # ... tmp12 ...
159
160        addq.ph         $t0, $s4, $s7           # tmp0
161        subq.ph         $t6, $s4, $s7           # tmp3
162
163################
# Odd part (rows 1, 3, 5, 7), overlapped with the tail of the even part.
164
165        muleq_s.w.phl   $v0, $t1, $s0           # tmp4 ...
166        muleq_s.w.phr   $t1, $t1, $s0           # ... tmp4 ...
167
168        shll_s.ph       $s6, $s6, 1             # x2
169
170        lw              $s3, 112($a1)           # quantptr[DCTSIZE*7]
171
172        subq.ph         $s6, $s6, $s7           # ... tmp12
173
174        muleq_s.w.phl   $v1, $t7, $s3           # tmp7 ...
175        muleq_s.w.phr   $t7, $t7, $s3           # ... tmp7 ...
176
177        ins             $t1, $v0, 16, 16        # ... tmp4
178
179        addq.ph         $t2, $s5, $s6           # tmp1
180        subq.ph         $t4, $s5, $s6           # tmp2
181
182        muleq_s.w.phl   $v0, $t5, $s2           # tmp6 ...
183        muleq_s.w.phr   $t5, $t5, $s2           # ... tmp6 ...
184
185        ins             $t7, $v1, 16, 16        # ... tmp7
186
187        addq.ph         $s5, $t1, $t7           # z11
188        subq.ph         $s6, $t1, $t7           # z12
189
190        muleq_s.w.phl   $v1, $t3, $s1           # tmp5 ...
191        muleq_s.w.phr   $t3, $t3, $s1           # ... tmp5 ...
192
193        ins             $t5, $v0, 16, 16        # ... tmp6
194
195# stalls
196
197        ins             $t3, $v1, 16, 16        # ... tmp5
198
199
200        addq.ph         $s7, $t5, $t3           # z13
201        subq.ph         $v0, $t5, $t3           # z10
202
203        addq.ph         $t7, $s5, $s7           # tmp7
204        subq.ph         $s5, $s5, $s7           # tmp11 ...
205
206        addq.ph         $v1, $v0, $s6           # z5 ...
207
208        mulq_rs.ph      $s5, $s5, $t8           # ... tmp11
209
210        lw              $t8, 8($at)             # FIX(1.847759065)
211        lw              $s4, 0($at)             # FIX(1.082392200)
212
213        addq.ph         $s0, $t0, $t7           # tmp0 + tmp7
214        subq.ph         $s1, $t0, $t7           # tmp0 - tmp7
215
216        mulq_rs.ph      $v1, $v1, $t8           # ... z5
217
218        shll_s.ph       $s5, $s5, 1             # x2
219
220        lw              $t8, 12($at)            # FIX(-2.613125930)
221        sw              $s0, 0($a2)             # wsptr[DCTSIZE*0]
222
223        mulq_rs.ph      $v0, $v0, $t8           # tmp12 ...
224        mulq_rs.ph      $s4, $s6, $s4           # tmp10 ...
225
226        shll_s.ph       $v1, $v1, 1             # x2
227
228        addiu           $a0, $a0, 4             # next pair of columns in inptr
229        addiu           $a1, $a1, 4             # next pair of columns in quantptr
230
231        sw              $s1, 112($a2)           # wsptr[DCTSIZE*7]
232
233        shll_s.ph       $s6, $v0, 2             # x4
234        shll_s.ph       $s4, $s4, 1             # x2
235        addq.ph         $s6, $s6, $v1           # ... tmp12
236
237        subq.ph         $t5, $s6, $t7           # tmp6
238        subq.ph         $s4, $s4, $v1           # ... tmp10
239        subq.ph         $t3, $s5, $t5           # tmp5
240        addq.ph         $s2, $t2, $t5           # tmp1 + tmp6
241        addq.ph         $t1, $s4, $t3           # tmp4
242        subq.ph         $s3, $t2, $t5           # tmp1 - tmp6
243
244        sw              $s2, 16($a2)            # wsptr[DCTSIZE*1]
245        sw              $s3, 96($a2)            # wsptr[DCTSIZE*6]
246
247        addq.ph         $v0, $t4, $t3           # tmp2 + tmp5
248        subq.ph         $v1, $t4, $t3           # tmp2 - tmp5
249
250        sw              $v0, 32($a2)            # wsptr[DCTSIZE*2]
251        sw              $v1, 80($a2)            # wsptr[DCTSIZE*5]
252
253        addq.ph         $v0, $t6, $t1           # tmp3 + tmp4
254        subq.ph         $v1, $t6, $t1           # tmp3 - tmp4
255
256        sw              $v0, 64($a2)            # wsptr[DCTSIZE*4]
257        sw              $v1, 48($a2)            # wsptr[DCTSIZE*3]
258
259continue_columns:
260
261        bne             $a0, $t9, loop_columns
262        addiu           $a2, $a2, 4             # (delay slot) next pair of columns in wsptr
263
264
265        lw              $s0, 28($sp)
266        lw              $s1, 24($sp)
267        lw              $s2, 20($sp)
268        lw              $s3, 16($sp)
269        lw              $s4, 12($sp)
270        lw              $s5,  8($sp)
271        lw              $s6,  4($sp)
272        lw              $s7,  0($sp)
273
274        jr              $ra
275        addiu           $sp, $sp, 32            # (delay slot) release the stack frame
276
277
278        .end            mips_idct_columns
279
280
281##################################################################
282
283
284        .global         mips_idct_rows
285        .ent            mips_idct_rows
286
287# void mips_idct_rows(DCTELEM * wsptr, JSAMPARRAY output_buf,
288#                     JDIMENSION output_col, const int * mips_idct_coefs);
#
# Row pass of the IDCT: transforms the rows of the workspace at wsptr and
# stores the results as bytes in the output rows.
#
# Each iteration reads 32 bytes of workspace as two rows of eight 16-bit
# values (row stride 16 bytes) and two row pointers from output_buf; the
# loop ends once wsptr has moved 128 bytes (4 iterations).  output_buf is
# indexed with 4-byte entries, and each output row receives 8 bytes at
# (row pointer + output_col) through two word stores.
# NOTE(review): the word stores assume row pointer + output_col is
# word-aligned; nothing here checks that -- confirm against the caller.
#
# The two rows are interleaved first so that every register holds the same
# column position of both rows (upper halfword = second row, lower
# halfword = first row); the paired-halfword (.ph) instructions then
# transform both rows at once.
#
# Shortcut: when columns 1..7 of both rows are zero, only column 0 is
# converted and replicated across each output row.
#
# Output conversion: shll_s.ph shifts left by SHIFT with saturation,
# precrq.qb.ph keeps the upper byte of every halfword, and addu.qb adds
# 0x80 to every byte.
#
# $a3 is needed as an output pointer, so the incoming mips_idct_coefs
# pointer is parked on the stack and reloaded into $at at the top of every
# iteration (the shortcut path also overwrites $at).
#
# The code runs under .set noreorder: the instruction after each branch is
# its delay slot and executes whether or not the branch is taken.
#
# Saves and restores $s0-$s8.  Overwrites $at, $v0, $v1, $t0-$t9 and $a3,
# and leaves $a0 / $a1 advanced.  No return value is produced.
# NOTE(review): the DSP arithmetic instructions can set overflow flags in
# DSPControl; nothing in this routine reads or clears them.
289
290mips_idct_rows:
291
292# $a0   - wsptr
293# $a1   - output_buf
294# $a2   - output_col
295# $a3   - mips_idct_coefs on entry (saved to the stack), then outptr
296# $at   - mips_idct_coefs, reloaded each iteration; second outptr on the shortcut path
297# $t0:7 - simd data
298# $t8   - coefficients, temp
299# $t9   - loop end address
300# $s0:3 - simd data, temp results
301# $s4:7 - simd data, temp results
302# $s8   - const 0x80808080
303# $v0:1 - temp results
304
305SHIFT   =               2
306
307        addiu           $sp, $sp, -48           # 48-byte frame: $a3 + s0-s8 use the low 40 bytes
308
309        # save $a3 (mips_idct_coefs) because it might get clobbered below
310        sw              $a3, 36($sp)
311
312        sw              $s0, 32($sp)
313        sw              $s1, 28($sp)
314        sw              $s2, 24($sp)
315        sw              $s3, 20($sp)
316        sw              $s4, 16($sp)
317        sw              $s5, 12($sp)
318        sw              $s6,  8($sp)
319        sw              $s7,  4($sp)
320        sw              $s8,  0($sp)
321
322        addiu           $t9, $a0, 128           # end address: wsptr + 128 bytes
323
324        lui             $s8, 0x8080
325        ori             $s8, $s8, 0x8080        # $s8 = 0x80808080, added to every output byte
326
327loop_rows:
328
329        lw              $at, 36($sp)            # restore saved $a3 (mips_idct_coefs)
330
# Lower-case letters: first row of the pair; upper-case: second row.
331        lw              $t0, 0+0($a0)           # wsptr[DCTSIZE*0+0/1]  b a
332        lw              $s0, 16+0($a0)          # wsptr[DCTSIZE*1+0/1]  B A
333        lw              $t2, 0+4($a0)           # wsptr[DCTSIZE*0+2/3]  d c
334        lw              $s2, 16+4($a0)          # wsptr[DCTSIZE*1+2/3]  D C
335        lw              $t4, 0+8($a0)           # wsptr[DCTSIZE*0+4/5]  f e
336        lw              $s4, 16+8($a0)          # wsptr[DCTSIZE*1+4/5]  F E
337        lw              $t6, 0+12($a0)          # wsptr[DCTSIZE*0+6/7]  h g
338        lw              $s6, 16+12($a0)         # wsptr[DCTSIZE*1+6/7]  H G
339
340        precrq.ph.w     $t1, $s0, $t0           # B b
341        ins             $t0, $s0, 16, 16        # A a
342
# Zero test for columns 1..7 of both rows; each "or" is the delay slot of
# the branch above it and prepares the operand of the next branch.
343        bnez            $t1, full_row
344        or              $s0, $t2, $s2
345        bnez            $s0, full_row
346        or              $s0, $t4, $s4
347        bnez            $s0, full_row
348        or              $s0, $t6, $s6
349        bnez            $s0, full_row
350
# The shll_s.ph below is the delay slot of the last bnez, so it also runs
# when branching to full_row; full_row overwrites $s0 before reading it.
351        shll_s.ph       $s0, $t0, SHIFT         # A a
352
353        lw              $a3, 0($a1)             # output row pointer, first row
354        lw              $at, 4($a1)             # output row pointer, second row
355
356        precrq.ph.w     $t0, $s0, $s0           # A A
357        ins             $s0, $s0, 16, 16        # a a
358
359        addu            $a3, $a3, $a2           # + output_col
360        addu            $at, $at, $a2           # + output_col
361
362        precrq.qb.ph    $t0, $t0, $t0           # A A A A
363        precrq.qb.ph    $s0, $s0, $s0           # a a a a
364
365
366        addu.qb         $s0, $s0, $s8
367        addu.qb         $t0, $t0, $s8
368
369
370        sw              $s0, 0($a3)
371        sw              $s0, 4($a3)
372
373        sw              $t0, 0($at)
374        sw              $t0, 4($at)
375
376
377        addiu           $a0, $a0, 32            # next pair of workspace rows
378
379        bne             $a0, $t9, loop_rows
380        addiu           $a1, $a1, 8             # (delay slot) next pair of output row pointers
381
382        b               exit_rows
383        nop                                     # (delay slot)
384
385
386full_row:
387
# Finish interleaving the two rows: $t2..$t7 = columns 2..7, second row in
# the upper halfword and first row in the lower halfword.
388        precrq.ph.w     $t3, $s2, $t2
389        ins             $t2, $s2, 16, 16
390
391        precrq.ph.w     $t5, $s4, $t4
392        ins             $t4, $s4, 16, 16
393
394        precrq.ph.w     $t7, $s6, $t6
395        ins             $t6, $s6, 16, 16
396
397
# Even part (columns 0, 2, 4, 6).
398        lw              $t8, 4($at)             # FIX(1.414213562)
399
400        addq.ph         $s4, $t0, $t4           # tmp10
401        subq.ph         $s5, $t0, $t4           # tmp11
402
403        subq.ph         $s6, $t2, $t6           # tmp12 ...
404        addq.ph         $s7, $t2, $t6           # tmp13
405
406        mulq_rs.ph      $s6, $s6, $t8           # ... tmp12 ...
407
408        addq.ph         $t0, $s4, $s7           # tmp0
409        subq.ph         $t6, $s4, $s7           # tmp3
410
411        shll_s.ph       $s6, $s6, 1             # x2
412
413        subq.ph         $s6, $s6, $s7           # ... tmp12
414
415        addq.ph         $t2, $s5, $s6           # tmp1
416        subq.ph         $t4, $s5, $s6           # tmp2
417
418################
# Odd part (columns 1, 3, 5, 7).
419
420        addq.ph         $s5, $t1, $t7           # z11
421        subq.ph         $s6, $t1, $t7           # z12
422
423        addq.ph         $s7, $t5, $t3           # z13
424        subq.ph         $v0, $t5, $t3           # z10
425
426        addq.ph         $t7, $s5, $s7           # tmp7
427        subq.ph         $s5, $s5, $s7           # tmp11 ...
428
429        addq.ph         $v1, $v0, $s6           # z5 ...
430
431        mulq_rs.ph      $s5, $s5, $t8           # ... tmp11
432
433        lw              $t8, 8($at)             # FIX(1.847759065)
434        lw              $s4, 0($at)             # FIX(1.082392200)
435
436        addq.ph         $s0, $t0, $t7           # tmp0 + tmp7
437        subq.ph         $s7, $t0, $t7           # tmp0 - tmp7
438
439        mulq_rs.ph      $v1, $v1, $t8           # ... z5
440
441        lw              $a3, 0($a1)             # output row pointer, first row
442        lw              $t8, 12($at)            # FIX(-2.613125930)
443
444        shll_s.ph       $s5, $s5, 1             # x2
445
446        addu            $a3, $a3, $a2           # + output_col
447
448        mulq_rs.ph      $v0, $v0, $t8           # tmp12 ...
449        mulq_rs.ph      $s4, $s6, $s4           # tmp10 ...
450
451        shll_s.ph       $v1, $v1, 1             # x2
452
453        addiu           $a0, $a0, 32            # next pair of workspace rows
454        addiu           $a1, $a1, 8             # next pair of output row pointers
455
456
457        shll_s.ph       $s6, $v0, 2             # x4
458        shll_s.ph       $s4, $s4, 1             # x2
459        addq.ph         $s6, $s6, $v1           # ... tmp12
460
461        shll_s.ph       $s0, $s0, SHIFT
462
463        subq.ph         $t5, $s6, $t7           # tmp6
464        subq.ph         $s4, $s4, $v1           # ... tmp10
465        subq.ph         $t3, $s5, $t5           # tmp5
466
467        shll_s.ph       $s7, $s7, SHIFT
468
469        addq.ph         $t1, $s4, $t3           # tmp4
470
471
472        addq.ph         $s1, $t2, $t5           # tmp1 + tmp6
473        subq.ph         $s6, $t2, $t5           # tmp1 - tmp6
474
475        addq.ph         $s2, $t4, $t3           # tmp2 + tmp5
476        subq.ph         $s5, $t4, $t3           # tmp2 - tmp5
477
478        addq.ph         $s4, $t6, $t1           # tmp3 + tmp4
479        subq.ph         $s3, $t6, $t1           # tmp3 - tmp4
480
481
482        shll_s.ph       $s1, $s1, SHIFT
483        shll_s.ph       $s2, $s2, SHIFT
484        shll_s.ph       $s3, $s3, SHIFT
485        shll_s.ph       $s4, $s4, SHIFT
486        shll_s.ph       $s5, $s5, SHIFT
487        shll_s.ph       $s6, $s6, SHIFT
488
489
# De-interleave: $s0..$s7 hold output columns 0..7 for both rows.  Split
# them back into per-row registers, then keep the upper byte of every
# halfword to form the packed output bytes.
490        precrq.ph.w     $t0, $s1, $s0           # B A
491        ins             $s0, $s1, 16, 16        # b a
492
493        precrq.ph.w     $t2, $s3, $s2           # D C
494        ins             $s2, $s3, 16, 16        # d c
495
496        precrq.ph.w     $t4, $s5, $s4           # F E
497        ins             $s4, $s5, 16, 16        # f e
498
499        precrq.ph.w     $t6, $s7, $s6           # H G
500        ins             $s6, $s7, 16, 16        # h g
501
502        precrq.qb.ph    $t0, $t2, $t0           # D C B A
503        precrq.qb.ph    $s0, $s2, $s0           # d c b a
504
505        precrq.qb.ph    $t4, $t6, $t4           # H G F E
506        precrq.qb.ph    $s4, $s6, $s4           # h g f e
507
508
509        addu.qb         $s0, $s0, $s8
510        addu.qb         $s4, $s4, $s8
511
512
513        sw              $s0, 0($a3)             # outptr[0/1/2/3]       d c b a
514        sw              $s4, 4($a3)             # outptr[4/5/6/7]       h g f e
515
516        lw              $a3, -4($a1)            # second row pointer ($a1 was already advanced by 8)
517
518        addu.qb         $t0, $t0, $s8
519
520        addu            $a3, $a3, $a2           # + output_col
521
522        addu.qb         $t4, $t4, $s8
523
524
525        sw              $t0, 0($a3)             # outptr[0/1/2/3]       D C B A
526
527        bne             $a0, $t9, loop_rows
528        sw              $t4, 4($a3)             # outptr[4/5/6/7]       H G F E (delay slot)
529
530
531exit_rows:
532
533        lw              $s0, 32($sp)
534        lw              $s1, 28($sp)
535        lw              $s2, 24($sp)
536        lw              $s3, 20($sp)
537        lw              $s4, 16($sp)
538        lw              $s5, 12($sp)
539        lw              $s6,  8($sp)
540        lw              $s7,  4($sp)
541        lw              $s8,  0($sp)
542
543        jr              $ra
544        addiu           $sp, $sp, 48            # (delay slot) release the stack frame
545
546
547        .end            mips_idct_rows
548