/*
 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC MIPS DSPR2
#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
    (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)

// 1/2 horizontal scale, point sample: copies every second source pixel
// (the odd-indexed one of each pair) to dst.  Main loop emits 16 output
// pixels per iteration; a byte-at-a-time loop handles dst_width % 16.
// src_stride is unused here (single-row point sampling); the parameter is
// kept so all row functions share one signature.
void ScaleRowDown2_DSPR2(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst,
                         int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "srl $t9, %[dst_width], 4 \n"  // iterations -> by 16
      "beqz $t9, 2f \n"
      " nop \n"

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"   // |3|2|1|0|
      "lw $t1, 4(%[src_ptr]) \n"   // |7|6|5|4|
      "lw $t2, 8(%[src_ptr]) \n"   // |11|10|9|8|
      "lw $t3, 12(%[src_ptr]) \n"  // |15|14|13|12|
      "lw $t4, 16(%[src_ptr]) \n"  // |19|18|17|16|
      "lw $t5, 20(%[src_ptr]) \n"  // |23|22|21|20|
      "lw $t6, 24(%[src_ptr]) \n"  // |27|26|25|24|
      "lw $t7, 28(%[src_ptr]) \n"  // |31|30|29|28|
      // TODO(fbarchard): Use odd pixels instead of even.
      // precrq.qb.ph keeps the high byte of each halfword, i.e. the
      // odd-indexed pixels of the four input pairs.
      "precrq.qb.ph $t8, $t1, $t0 \n"  // |7|5|3|1|
      "precrq.qb.ph $t0, $t3, $t2 \n"  // |15|13|11|9|
      "precrq.qb.ph $t1, $t5, $t4 \n"  // |23|21|19|17|
      "precrq.qb.ph $t2, $t7, $t6 \n"  // |31|29|27|25|
      "addiu %[src_ptr], %[src_ptr], 32 \n"
      "addiu $t9, $t9, -1 \n"
      "sw $t8, 0(%[dst]) \n"
      "sw $t0, 4(%[dst]) \n"
      "sw $t1, 8(%[dst]) \n"
      "sw $t2, 12(%[dst]) \n"
      "bgtz $t9, 1b \n"
      " addiu %[dst], %[dst], 16 \n"  // branch delay slot

      "2: \n"
      "andi $t9, %[dst_width], 0xf \n"  // residue
      "beqz $t9, 3f \n"
      " nop \n"

      // Residue: one output pixel at a time (source pixel at offset 1).
      "21: \n"
      "lbu $t0, 1(%[src_ptr]) \n"
      "addiu %[src_ptr], %[src_ptr], 2 \n"
      "addiu $t9, $t9, -1 \n"
      "sb $t0, 0(%[dst]) \n"
      "bgtz $t9, 21b \n"
      " addiu %[dst], %[dst], 1 \n"

      "3: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
      : [dst_width] "r"(dst_width)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}

// 1/2 scale with 2x2 box filter: each output pixel is the rounded average
// of a 2x2 block taken from src_ptr and the next row (src_ptr + src_stride).
// Main loop emits 8 output pixels per iteration; a 2-at-a-time residue loop
// handles dst_width % 8.
// NOTE(review): the main-loop guard is 'bltz $t9, 2f', which only skips the
// loop for negative t9; for dst_width in [0,7] t9 == 0 and one full 8-pixel
// iteration still runs (sibling functions use 'beqz').  Presumably callers
// guarantee dst_width >= 8 here — confirm before relying on small widths.
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst,
                            int dst_width) {
  const uint8* t = src_ptr + src_stride;  // second source row

  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "srl $t9, %[dst_width], 3 \n"  // iterations -> step 8
      "bltz $t9, 2f \n"
      " nop \n"

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"   // |3|2|1|0|
      "lw $t1, 4(%[src_ptr]) \n"   // |7|6|5|4|
      "lw $t2, 8(%[src_ptr]) \n"   // |11|10|9|8|
      "lw $t3, 12(%[src_ptr]) \n"  // |15|14|13|12|
      "lw $t4, 0(%[t]) \n"         // |19|18|17|16|
      "lw $t5, 4(%[t]) \n"         // |23|22|21|20|
      "lw $t6, 8(%[t]) \n"         // |27|26|25|24|
      "lw $t7, 12(%[t]) \n"        // |31|30|29|28|
      "addiu $t9, $t9, -1 \n"
      // Pair each 2-pixel group of row 0 with the matching group of row 1,
      // then raddu.w.qb sums the 4 bytes and shra_r.w rounds the /4.
      "srl $t8, $t0, 16 \n"     // |X|X|3|2|
      "ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
      "ins $t4, $t8, 0, 16 \n"  // |19|18|3|2|
      "raddu.w.qb $t0, $t0 \n"  // |17+16+1+0|
      "raddu.w.qb $t4, $t4 \n"  // |19+18+3+2|
      "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
      "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
      "srl $t8, $t1, 16 \n"     // |X|X|7|6|
      "ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
      "ins $t5, $t8, 0, 16 \n"  // |23|22|7|6|
      "raddu.w.qb $t1, $t1 \n"  // |21+20+5+4|
      "raddu.w.qb $t5, $t5 \n"  // |23+22+7+6|
      "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
      "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
      "srl $t8, $t2, 16 \n"     // |X|X|11|10|
      "ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
      "ins $t6, $t8, 0, 16 \n"  // |27|26|11|10|
      "raddu.w.qb $t2, $t2 \n"  // |25+24+9+8|
      "raddu.w.qb $t6, $t6 \n"  // |27+26+11+10|
      "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
      "shra_r.w $t6, $t6, 2 \n" // |t6+2|>>2
      "srl $t8, $t3, 16 \n"     // |X|X|15|14|
      "ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
      "ins $t7, $t8, 0, 16 \n"  // |31|30|15|14|
      "raddu.w.qb $t3, $t3 \n"  // |29+28+13+12|
      "raddu.w.qb $t7, $t7 \n"  // |31+30+15+14|
      "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
      "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
      "addiu %[src_ptr], %[src_ptr], 16 \n"
      "addiu %[t], %[t], 16 \n"
      "sb $t0, 0(%[dst]) \n"
      "sb $t4, 1(%[dst]) \n"
      "sb $t1, 2(%[dst]) \n"
      "sb $t5, 3(%[dst]) \n"
      "sb $t2, 4(%[dst]) \n"
      "sb $t6, 5(%[dst]) \n"
      "sb $t3, 6(%[dst]) \n"
      "sb $t7, 7(%[dst]) \n"
      "bgtz $t9, 1b \n"
      " addiu %[dst], %[dst], 8 \n"  // branch delay slot

      "2: \n"
      "andi $t9, %[dst_width], 0x7 \n"  // x = residue
      "beqz $t9, 3f \n"
      " nop \n"

      // Residue: lwr/lwl form an unaligned 32-bit load; two output pixels
      // per iteration (t9 decremented by 2, so t9 is presumably even here).
      "21: \n"
      "lwr $t1, 0(%[src_ptr]) \n"
      "lwl $t1, 3(%[src_ptr]) \n"
      "lwr $t2, 0(%[t]) \n"
      "lwl $t2, 3(%[t]) \n"
      "srl $t8, $t1, 16 \n"
      "ins $t1, $t2, 16, 16 \n"
      "ins $t2, $t8, 0, 16 \n"
      "raddu.w.qb $t1, $t1 \n"
      "raddu.w.qb $t2, $t2 \n"
      "shra_r.w $t1, $t1, 2 \n"
      "shra_r.w $t2, $t2, 2 \n"
      "sb $t1, 0(%[dst]) \n"
      "sb $t2, 1(%[dst]) \n"
      "addiu %[src_ptr], %[src_ptr], 4 \n"
      "addiu $t9, $t9, -2 \n"
      "addiu %[t], %[t], 4 \n"
      "bgtz $t9, 21b \n"
      " addiu %[dst], %[dst], 2 \n"

      "3: \n"
      ".set pop \n"

      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t)
      : [dst_width] "r"(dst_width)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}

// 1/4 horizontal scale, point sample: copies every fourth source pixel
// (index 2 of each group of 4).  Main loop emits 8 output pixels per
// iteration; byte-at-a-time residue handles dst_width % 8.
// src_stride is unused (single-row point sampling).
void ScaleRowDown4_DSPR2(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst,
                         int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "srl $t9, %[dst_width], 3 \n"
      "beqz $t9, 2f \n"
      " nop \n"

      "1: \n"
      "lw $t1, 0(%[src_ptr]) \n"   // |3|2|1|0|
      "lw $t2, 4(%[src_ptr]) \n"   // |7|6|5|4|
      "lw $t3, 8(%[src_ptr]) \n"   // |11|10|9|8|
      "lw $t4, 12(%[src_ptr]) \n"  // |15|14|13|12|
      "lw $t5, 16(%[src_ptr]) \n"  // |19|18|17|16|
      "lw $t6, 20(%[src_ptr]) \n"  // |23|22|21|20|
      "lw $t7, 24(%[src_ptr]) \n"  // |27|26|25|24|
      "lw $t8, 28(%[src_ptr]) \n"  // |31|30|29|28|
      // Two rounds of precision-reduce: keep even pixels, then the odd
      // halfwords of those, leaving pixels 2, 6, 10, ... of the source.
      "precr.qb.ph $t1, $t2, $t1 \n"   // |6|4|2|0|
      "precr.qb.ph $t2, $t4, $t3 \n"   // |14|12|10|8|
      "precr.qb.ph $t5, $t6, $t5 \n"   // |22|20|18|16|
      "precr.qb.ph $t6, $t8, $t7 \n"   // |30|28|26|24|
      "precrq.qb.ph $t1, $t2, $t1 \n"  // |14|10|6|2|
      "precrq.qb.ph $t5, $t6, $t5 \n"  // |30|26|22|18|
      "addiu %[src_ptr], %[src_ptr], 32 \n"
      "addiu $t9, $t9, -1 \n"
      "sw $t1, 0(%[dst]) \n"
      "sw $t5, 4(%[dst]) \n"
      "bgtz $t9, 1b \n"
      " addiu %[dst], %[dst], 8 \n"  // branch delay slot

      "2: \n"
      "andi $t9, %[dst_width], 7 \n"  // residue
      "beqz $t9, 3f \n"
      " nop \n"

      // Residue: pick pixel 2 of each remaining group of 4.
      "21: \n"
      "lbu $t1, 2(%[src_ptr]) \n"
      "addiu %[src_ptr], %[src_ptr], 4 \n"
      "addiu $t9, $t9, -1 \n"
      "sb $t1, 0(%[dst]) \n"
      "bgtz $t9, 21b \n"
      " addiu %[dst], %[dst], 1 \n"

      "3: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
      : [dst_width] "r"(dst_width)
      : "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}

// 1/4 scale with 4x4 box filter: each output pixel is the rounded average
// of a 4x4 block spanning rows src_ptr, s1, s2, s3.  Main loop emits 2
// output pixels per iteration; the tail handles one odd output pixel.
// NOTE(review): the main loop has no zero-iteration guard — it always runs
// at least once — so this assumes dst_width >= 2; confirm against callers.
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst,
                            int dst_width) {
  intptr_t stride = src_stride;
  const uint8* s1 = src_ptr + stride;
  const uint8* s2 = s1 + stride;
  const uint8* s3 = s2 + stride;

  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "srl $t9, %[dst_width], 1 \n"   // pair count
      "andi $t8, %[dst_width], 1 \n"  // odd-tail flag

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"  // |3|2|1|0|
      "lw $t1, 0(%[s1]) \n"       // |7|6|5|4|
      "lw $t2, 0(%[s2]) \n"       // |11|10|9|8|
      "lw $t3, 0(%[s3]) \n"       // |15|14|13|12|
      "lw $t4, 4(%[src_ptr]) \n"  // |19|18|17|16|
      "lw $t5, 4(%[s1]) \n"       // |23|22|21|20|
      "lw $t6, 4(%[s2]) \n"       // |27|26|25|24|
      "lw $t7, 4(%[s3]) \n"       // |31|30|29|28|
      // raddu.w.qb sums the 4 bytes of each word; the sums of the 4 rows
      // are added and rounded-divided by 16 (shra_r.w ... 4).
      "raddu.w.qb $t0, $t0 \n"  // |3 + 2 + 1 + 0|
      "raddu.w.qb $t1, $t1 \n"  // |7 + 6 + 5 + 4|
      "raddu.w.qb $t2, $t2 \n"  // |11 + 10 + 9 + 8|
      "raddu.w.qb $t3, $t3 \n"  // |15 + 14 + 13 + 12|
      "raddu.w.qb $t4, $t4 \n"  // |19 + 18 + 17 + 16|
      "raddu.w.qb $t5, $t5 \n"  // |23 + 22 + 21 + 20|
      "raddu.w.qb $t6, $t6 \n"  // |27 + 26 + 25 + 24|
      "raddu.w.qb $t7, $t7 \n"  // |31 + 30 + 29 + 28|
      "add $t0, $t0, $t1 \n"
      "add $t1, $t2, $t3 \n"
      "add $t0, $t0, $t1 \n"
      "add $t4, $t4, $t5 \n"
      "add $t6, $t6, $t7 \n"
      "add $t4, $t4, $t6 \n"
      "shra_r.w $t0, $t0, 4 \n"
      "shra_r.w $t4, $t4, 4 \n"
      "sb $t0, 0(%[dst]) \n"
      "sb $t4, 1(%[dst]) \n"
      "addiu %[src_ptr], %[src_ptr], 8 \n"
      "addiu %[s1], %[s1], 8 \n"
      "addiu %[s2], %[s2], 8 \n"
      "addiu %[s3], %[s3], 8 \n"
      "addiu $t9, $t9, -1 \n"
      "bgtz $t9, 1b \n"
      " addiu %[dst], %[dst], 2 \n"  // branch delay slot
      "beqz $t8, 2f \n"
      " nop \n"

      // Odd tail: one more 4x4 box -> one output pixel.
      "lw $t0, 0(%[src_ptr]) \n"  // |3|2|1|0|
      "lw $t1, 0(%[s1]) \n"       // |7|6|5|4|
      "lw $t2, 0(%[s2]) \n"       // |11|10|9|8|
      "lw $t3, 0(%[s3]) \n"       // |15|14|13|12|
      "raddu.w.qb $t0, $t0 \n"    // |3 + 2 + 1 + 0|
      "raddu.w.qb $t1, $t1 \n"    // |7 + 6 + 5 + 4|
      "raddu.w.qb $t2, $t2 \n"    // |11 + 10 + 9 + 8|
      "raddu.w.qb $t3, $t3 \n"    // |15 + 14 + 13 + 12|
      "add $t0, $t0, $t1 \n"
      "add $t1, $t2, $t3 \n"
      "add $t0, $t0, $t1 \n"
      "shra_r.w $t0, $t0, 4 \n"
      "sb $t0, 0(%[dst]) \n"

      "2: \n"
      ".set pop \n"

      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1), [s2] "+r"(s2),
        [s3] "+r"(s3)
      : [dst_width] "r"(dst_width)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}

// 3/4 horizontal scale, point sample: 32 input pixels -> 24 output pixels
// per iteration (drops every fourth pixel).
// NOTE(review): the loop exits only when dst_width reaches exactly 0, so
// dst_width is presumably a multiple of 24 — confirm against callers.
// src_stride is unused (single-row point sampling).
void ScaleRowDown34_DSPR2(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst,
                          int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      "1: \n"
      "lw $t1, 0(%[src_ptr]) \n"   // |3|2|1|0|
      "lw $t2, 4(%[src_ptr]) \n"   // |7|6|5|4|
      "lw $t3, 8(%[src_ptr]) \n"   // |11|10|9|8|
      "lw $t4, 12(%[src_ptr]) \n"  // |15|14|13|12|
      "lw $t5, 16(%[src_ptr]) \n"  // |19|18|17|16|
      "lw $t6, 20(%[src_ptr]) \n"  // |23|22|21|20|
      "lw $t7, 24(%[src_ptr]) \n"  // |27|26|25|24|
      "lw $t8, 28(%[src_ptr]) \n"  // |31|30|29|28|
      "precrq.qb.ph $t0, $t2, $t4 \n"  // |7|5|15|13|
      "precrq.qb.ph $t9, $t6, $t8 \n"  // |23|21|31|29|
      "addiu %[dst_width], %[dst_width], -24 \n"
      "ins $t1, $t1, 8, 16 \n"  // |3|1|0|X|
      "ins $t4, $t0, 8, 16 \n"  // |X|15|13|12|
      "ins $t5, $t5, 8, 16 \n"  // |19|17|16|X|
      "ins $t8, $t9, 8, 16 \n"  // |X|31|29|28|
      "addiu %[src_ptr], %[src_ptr], 32 \n"
      "packrl.ph $t0, $t3, $t0 \n"  // |9|8|7|5|
      "packrl.ph $t9, $t7, $t9 \n"  // |25|24|23|21|
      "prepend $t1, $t2, 8 \n"      // |4|3|1|0|
      "prepend $t3, $t4, 24 \n"     // |15|13|12|11|
      "prepend $t5, $t6, 8 \n"      // |20|19|17|16|
      "prepend $t7, $t8, 24 \n"     // |31|29|28|27|
      "sw $t1, 0(%[dst]) \n"
      "sw $t0, 4(%[dst]) \n"
      "sw $t3, 8(%[dst]) \n"
      "sw $t5, 12(%[dst]) \n"
      "sw $t9, 16(%[dst]) \n"
      "sw $t7, 20(%[dst]) \n"
      "bnez %[dst_width], 1b \n"
      " addiu %[dst], %[dst], 24 \n"  // branch delay slot
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
      : [src_ptr] is advanced in the loop
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}

// 3/4 scale with weighted vertical filter between two rows (src_ptr and
// src_ptr + src_stride): reads 4 pixels from each row and emits 3 output
// pixels per iteration.  Loops until dst_width (decremented by 3) is <= 0.
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* d,
                                int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      "repl.ph $t3, 3 \n"  // 0x00030003 -> per-halfword weight of 3

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"               // |S3|S2|S1|S0|
      "lwx $t1, %[src_stride](%[src_ptr]) \n"  // |T3|T2|T1|T0|
      "rotr $t2, $t0, 8 \n"                    // |S0|S3|S2|S1|
      "rotr $t6, $t1, 8 \n"                    // |T0|T3|T2|T1|
      "muleu_s.ph.qbl $t4, $t2, $t3 \n"        // |S0*3|S3*3|
      "muleu_s.ph.qbl $t5, $t6, $t3 \n"        // |T0*3|T3*3|
      "andi $t0, $t2, 0xFFFF \n"               // |0|0|S2|S1|
      "andi $t1, $t6, 0xFFFF \n"               // |0|0|T2|T1|
      "raddu.w.qb $t0, $t0 \n"
      "raddu.w.qb $t1, $t1 \n"
      "shra_r.w $t0, $t0, 1 \n"   // rounded (S1+S2)/2
      "shra_r.w $t1, $t1, 1 \n"   // rounded (T1+T2)/2
      "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
      "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
      "rotr $t2, $t2, 16 \n"      // |0|S1|0|S2|
      "rotr $t6, $t6, 16 \n"      // |0|T1|0|T2|
      "addu.ph $t2, $t2, $t4 \n"
      "addu.ph $t6, $t6, $t5 \n"
      "sll $t5, $t0, 1 \n"
      "add $t0, $t5, $t0 \n"  // t0 *= 3 (weight for the S row)
      "shra_r.ph $t2, $t2, 2 \n"
      "shra_r.ph $t6, $t6, 2 \n"
      "shll.ph $t4, $t2, 1 \n"
      "addq.ph $t4, $t4, $t2 \n"  // t4 = t2 * 3
      "addu $t0, $t0, $t1 \n"
      "addiu %[src_ptr], %[src_ptr], 4 \n"
      "shra_r.w $t0, $t0, 2 \n"
      "addu.ph $t6, $t6, $t4 \n"
      "shra_r.ph $t6, $t6, 2 \n"
      "srl $t1, $t6, 16 \n"
      "addiu %[dst_width], %[dst_width], -3 \n"
      "sb $t1, 0(%[d]) \n"
      "sb $t0, 1(%[d]) \n"
      "sb $t6, 2(%[d]) \n"
      "bgtz %[dst_width], 1b \n"
      " addiu %[d], %[d], 3 \n"  // branch delay slot
      "3: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
        [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}

// 3/4 scale with even (1:1) vertical averaging of two rows (src_ptr and
// src_ptr + src_stride): 4 input pixels from each row -> 3 output pixels
// per iteration.  Same structure as ScaleRowDown34_0_Box_DSPR2 but the two
// rows are weighted equally (final shra_r by 1 instead of the 3:1 weighting).
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* d,
                                int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"
      "repl.ph $t2, 3 \n"  // 0x00030003 -> per-halfword weight of 3

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"               // |S3|S2|S1|S0|
      "lwx $t1, %[src_stride](%[src_ptr]) \n"  // |T3|T2|T1|T0|
      "rotr $t4, $t0, 8 \n"                    // |S0|S3|S2|S1|
      "rotr $t6, $t1, 8 \n"                    // |T0|T3|T2|T1|
      "muleu_s.ph.qbl $t3, $t4, $t2 \n"        // |S0*3|S3*3|
      "muleu_s.ph.qbl $t5, $t6, $t2 \n"        // |T0*3|T3*3|
      "andi $t0, $t4, 0xFFFF \n"               // |0|0|S2|S1|
      "andi $t1, $t6, 0xFFFF \n"               // |0|0|T2|T1|
      "raddu.w.qb $t0, $t0 \n"
      "raddu.w.qb $t1, $t1 \n"
      "shra_r.w $t0, $t0, 1 \n"   // rounded (S1+S2)/2
      "shra_r.w $t1, $t1, 1 \n"   // rounded (T1+T2)/2
      "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
      "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
      "rotr $t4, $t4, 16 \n"      // |0|S1|0|S2|
      "rotr $t6, $t6, 16 \n"      // |0|T1|0|T2|
      "addu.ph $t4, $t4, $t3 \n"
      "addu.ph $t6, $t6, $t5 \n"
      "shra_r.ph $t6, $t6, 2 \n"
      "shra_r.ph $t4, $t4, 2 \n"
      "addu.ph $t6, $t6, $t4 \n"
      "addiu %[src_ptr], %[src_ptr], 4 \n"
      "shra_r.ph $t6, $t6, 1 \n"  // average the two rows equally
      "addu $t0, $t0, $t1 \n"
      "addiu %[dst_width], %[dst_width], -3 \n"
      "shra_r.w $t0, $t0, 1 \n"
      "srl $t1, $t6, 16 \n"
      "sb $t1, 0(%[d]) \n"
      "sb $t0, 1(%[d]) \n"
      "sb $t6, 2(%[d]) \n"
      "bgtz %[dst_width], 1b \n"
      " addiu %[d], %[d], 3 \n"  // branch delay slot
      "3: \n"
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
        [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}

// 3/8 horizontal scale, point sample: 32 input pixels -> 12 output pixels
// per iteration.  Loops while dst_width - 12 (after the decrement) >= 0,
// i.e. while at least another full group of 12 remains.
// src_stride is unused (single-row point sampling).
void ScaleRowDown38_DSPR2(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst,
                          int dst_width) {
  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"   // |3|2|1|0|
      "lw $t1, 4(%[src_ptr]) \n"   // |7|6|5|4|
      "lw $t2, 8(%[src_ptr]) \n"   // |11|10|9|8|
      "lw $t3, 12(%[src_ptr]) \n"  // |15|14|13|12|
      "lw $t4, 16(%[src_ptr]) \n"  // |19|18|17|16|
      "lw $t5, 20(%[src_ptr]) \n"  // |23|22|21|20|
      "lw $t6, 24(%[src_ptr]) \n"  // |27|26|25|24|
      "lw $t7, 28(%[src_ptr]) \n"  // |31|30|29|28|
      "wsbh $t0, $t0 \n"           // |2|3|0|1|
      "wsbh $t6, $t6 \n"           // |26|27|24|25|
      "srl $t0, $t0, 8 \n"         // |X|2|3|0|
      "srl $t3, $t3, 16 \n"        // |X|X|15|14|
      "srl $t5, $t5, 16 \n"        // |X|X|23|22|
      "srl $t7, $t7, 16 \n"        // |X|X|31|30|
      "ins $t1, $t2, 24, 8 \n"     // |8|6|5|4|
      "ins $t6, $t5, 0, 8 \n"      // |26|27|24|22|
      "ins $t1, $t0, 0, 16 \n"     // |8|6|3|0|
      "ins $t6, $t7, 24, 8 \n"     // |30|27|24|22|
      "prepend $t2, $t3, 24 \n"    // |X|15|14|11|
      "ins $t4, $t4, 16, 8 \n"     // |19|16|17|X|
      "ins $t4, $t2, 0, 16 \n"     // |19|16|14|11|
      "addiu %[src_ptr], %[src_ptr], 32 \n"
      "addiu %[dst_width], %[dst_width], -12 \n"
      "addiu $t8,%[dst_width], -12 \n"
      "sw $t1, 0(%[dst]) \n"
      "sw $t4, 4(%[dst]) \n"
      "sw $t6, 8(%[dst]) \n"
      "bgez $t8, 1b \n"
      " addiu %[dst], %[dst], 12 \n"  // branch delay slot
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
}

// 3/8 scale with a box filter over two rows: 8 input pixels from src_ptr
// and 8 from the next row -> 3 output pixels per iteration.
// c = 0x2AAA ~= 65536/6, so 'mul' followed by '>> 16' approximates a
// rounded-down division by 6 (the 6-pixel box sums).
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  intptr_t stride = src_stride;
  const uint8* t = src_ptr + stride;  // second source row
  const int c = 0x2AAA;               // fixed-point 1/6 in Q16

  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"  // |S3|S2|S1|S0|
      "lw $t1, 4(%[src_ptr]) \n"  // |S7|S6|S5|S4|
      "lw $t2, 0(%[t]) \n"        // |T3|T2|T1|T0|
      "lw $t3, 4(%[t]) \n"        // |T7|T6|T5|T4|
      "rotr $t1, $t1, 16 \n"      // |S5|S4|S7|S6|
      "packrl.ph $t4, $t1, $t3 \n"  // |S7|S6|T7|T6|
      "packrl.ph $t5, $t3, $t1 \n"  // |T5|T4|S5|S4|
      "raddu.w.qb $t4, $t4 \n"      // S7+S6+T7+T6
      "raddu.w.qb $t5, $t5 \n"      // T5+T4+S5+S4
      "precrq.qb.ph $t6, $t0, $t2 \n"  // |S3|S1|T3|T1|
      "precrq.qb.ph $t6, $t6, $t6 \n"  // |S3|T3|S3|T3|
      "srl $t4, $t4, 2 \n"   // t4 / 4 (2x2 box)
      "srl $t6, $t6, 16 \n"  // |0|0|S3|T3|
      "raddu.w.qb $t6, $t6 \n"  // 0+0+S3+T3
      "addu $t6, $t5, $t6 \n"
      "mul $t6, $t6, %[c] \n"  // t6 * 0x2AAA (~= t6/6 after >>16)
      "sll $t0, $t0, 8 \n"     // |S2|S1|S0|0|
      "sll $t2, $t2, 8 \n"     // |T2|T1|T0|0|
      "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
      "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
      "addu $t0, $t0, $t2 \n"
      "mul $t0, $t0, %[c] \n"  // t0 * 0x2AAA (~= t0/6 after >>16)
      "addiu %[src_ptr], %[src_ptr], 8 \n"
      "addiu %[t], %[t], 8 \n"
      "addiu %[dst_width], %[dst_width], -3 \n"
      "addiu %[dst_ptr], %[dst_ptr], 3 \n"
      "srl $t6, $t6, 16 \n"
      "srl $t0, $t0, 16 \n"
      "sb $t4, -1(%[dst_ptr]) \n"
      "sb $t6, -2(%[dst_ptr]) \n"
      "bgtz %[dst_width], 1b \n"
      " sb $t0, -3(%[dst_ptr]) \n"  // branch delay slot
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t),
        [dst_width] "+r"(dst_width)
      : [c] "r"(c)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}

// 3/8 scale with a box filter over three rows (src_ptr, s1, s2): 8 input
// pixels from each of the 3 rows -> 3 output pixels per iteration.
// c1 = 0x1C71 ~= 65536/9 (9-pixel boxes), c2 = 0x2AAA ~= 65536/6
// (6-pixel box); 'mul' followed by '>> 16' approximates the division.
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  intptr_t stride = src_stride;
  const uint8* s1 = src_ptr + stride;
  stride += stride;
  const uint8* s2 = src_ptr + stride;
  const int c1 = 0x1C71;  // fixed-point 1/9 in Q16
  const int c2 = 0x2AAA;  // fixed-point 1/6 in Q16

  __asm__ __volatile__(
      ".set push \n"
      ".set noreorder \n"

      "1: \n"
      "lw $t0, 0(%[src_ptr]) \n"  // |S3|S2|S1|S0|
      "lw $t1, 4(%[src_ptr]) \n"  // |S7|S6|S5|S4|
      "lw $t2, 0(%[s1]) \n"       // |T3|T2|T1|T0|
      "lw $t3, 4(%[s1]) \n"       // |T7|T6|T5|T4|
      "lw $t4, 0(%[s2]) \n"       // |R3|R2|R1|R0|
      "lw $t5, 4(%[s2]) \n"       // |R7|R6|R5|R4|
      "rotr $t1, $t1, 16 \n"      // |S5|S4|S7|S6|
      "packrl.ph $t6, $t1, $t3 \n"  // |S7|S6|T7|T6|
      "raddu.w.qb $t6, $t6 \n"      // S7+S6+T7+T6
      "packrl.ph $t7, $t3, $t1 \n"  // |T5|T4|S5|S4|
      "raddu.w.qb $t7, $t7 \n"      // T5+T4+S5+S4
      "sll $t8, $t5, 16 \n"         // |R5|R4|0|0|
      "raddu.w.qb $t8, $t8 \n"      // R5+R4
      "addu $t7, $t7, $t8 \n"
      "srl $t8, $t5, 16 \n"       // |0|0|R7|R6|
      "raddu.w.qb $t8, $t8 \n"    // R7 + R6
      "addu $t6, $t6, $t8 \n"
      "mul $t6, $t6, %[c2] \n"    // t6 * 0x2AAA (~= t6/6 after >>16)
      "precrq.qb.ph $t8, $t0, $t2 \n"  // |S3|S1|T3|T1|
      "precrq.qb.ph $t8, $t8, $t4 \n"  // |S3|T3|R3|R1|
      "srl $t8, $t8, 8 \n"             // |0|S3|T3|R3|
      "raddu.w.qb $t8, $t8 \n"         // S3 + T3 + R3
      "addu $t7, $t7, $t8 \n"
      "mul $t7, $t7, %[c1] \n"  // t7 * 0x1C71 (~= t7/9 after >>16)
      "sll $t0, $t0, 8 \n"      // |S2|S1|S0|0|
      "sll $t2, $t2, 8 \n"      // |T2|T1|T0|0|
      "sll $t4, $t4, 8 \n"      // |R2|R1|R0|0|
      "raddu.w.qb $t0, $t0 \n"
      "raddu.w.qb $t2, $t2 \n"
      "raddu.w.qb $t4, $t4 \n"
      "addu $t0, $t0, $t2 \n"
      "addu $t0, $t0, $t4 \n"
      "mul $t0, $t0, %[c1] \n"  // t0 * 0x1C71 (~= t0/9 after >>16)
      "addiu %[src_ptr], %[src_ptr], 8 \n"
      "addiu %[s1], %[s1], 8 \n"
      "addiu %[s2], %[s2], 8 \n"
      "addiu %[dst_width], %[dst_width], -3 \n"
      "addiu %[dst_ptr], %[dst_ptr], 3 \n"
      "srl $t6, $t6, 16 \n"
      "srl $t7, $t7, 16 \n"
      "srl $t0, $t0, 16 \n"
      "sb $t6, -1(%[dst_ptr]) \n"
      "sb $t7, -2(%[dst_ptr]) \n"
      "bgtz %[dst_width], 1b \n"
      " sb $t0, -3(%[dst_ptr]) \n"  // branch delay slot
      ".set pop \n"
      : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1),
        [s2] "+r"(s2), [dst_width] "+r"(dst_width)
      : [c1] "r"(c1), [c2] "r"(c2)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
}

// Accumulates a row of uint8 pixels into a row of uint16 sums:
// dst_ptr[i] += src_ptr[i].  The asm body widens 8 bytes to 8 halfwords
// (preceu.ph.qbr/qbl) and adds them pairwise (addu.ph) per iteration.
// NOTE(review): the main loop runs while x < src_width - 1 in steps of 8
// and the residue loop runs (src_width - 1) & 7 times rather than
// src_width & 7 — for e.g. src_width == 9 the last pixel is never added.
// Presumably callers only pass widths where the bounds line up (multiples
// of 8 work) — confirm before relying on arbitrary widths.
void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  int x;
  for (x = 0; x < ((src_width - 1)); x += 8) {
    uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4;
    uint32 tmp_t5, tmp_t6, tmp_t7, tmp_t8;
    __asm__ __volatile__(
        ".set push \n"
        ".set noreorder \n"
        "lw %[tmp_t5], 0(%[src_ptr]) \n"   // 4 source pixels
        "lw %[tmp_t6], 4(%[src_ptr]) \n"   // next 4 source pixels
        "lw %[tmp_t1], 0(%[dst_ptr]) \n"   // 2 uint16 sums
        "lw %[tmp_t2], 4(%[dst_ptr]) \n"
        "lw %[tmp_t3], 8(%[dst_ptr]) \n"
        "lw %[tmp_t4], 12(%[dst_ptr]) \n"
        "preceu.ph.qbr %[tmp_t7], %[tmp_t5] \n"  // zero-extend low 2 bytes
        "preceu.ph.qbl %[tmp_t8], %[tmp_t5] \n"  // zero-extend high 2 bytes
        "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t7] \n"
        "addu.ph %[tmp_t2], %[tmp_t2], %[tmp_t8] \n"
        "preceu.ph.qbr %[tmp_t7], %[tmp_t6] \n"
        "preceu.ph.qbl %[tmp_t8], %[tmp_t6] \n"
        "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t7] \n"
        "addu.ph %[tmp_t4], %[tmp_t4], %[tmp_t8] \n"
        "sw %[tmp_t1], 0(%[dst_ptr]) \n"
        "sw %[tmp_t2], 4(%[dst_ptr]) \n"
        "sw %[tmp_t3], 8(%[dst_ptr]) \n"
        "sw %[tmp_t4], 12(%[dst_ptr]) \n"
        ".set pop \n"
        :
        [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), [tmp_t3] "=&r"(tmp_t3),
        [tmp_t4] "=&r"(tmp_t4), [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
        [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [src_ptr] "+r"(src_ptr)
        : [dst_ptr] "r"(dst_ptr));
    src_ptr += 8;
    dst_ptr += 8;
  }

  // Scalar residue (see NOTE(review) above on the loop bound).
  if ((src_width)&7) {
    for (x = 0; x < ((src_width - 1) & 7); x += 1) {
      dst_ptr[0] += src_ptr[0];
      src_ptr += 1;
      dst_ptr += 1;
    }
  }
}

#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif