# libFLAC - Free Lossless Audio Codec library
# Copyright (C) 2004,2005,2006,2007 Josh Coalson
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# - Neither the name of the Xiph.org Foundation nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 31.text 32 .align 2 33.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16 34.type _FLAC__lpc_restore_signal_asm_ppc_altivec_16, @function 35 36.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8 37.type _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8, @function 38 39_FLAC__lpc_restore_signal_asm_ppc_altivec_16: 40# r3: residual[] 41# r4: data_len 42# r5: qlp_coeff[] 43# r6: order 44# r7: lp_quantization 45# r8: data[] 46 47# see src/libFLAC/lpc.c:FLAC__lpc_restore_signal() 48# these is a PowerPC/Altivec assembly version which requires bps<=16 (or actual 49# bps<=15 for mid-side coding, since that uses an extra bit) 50 51# these should be fast; the inner loop is unrolled (it takes no more than 52# 3*(order%4) instructions, all of which are arithmetic), and all of the 53# coefficients and all relevant history stay in registers, so the outer loop 54# has only one load from memory (the residual) 55 56# I have not yet run this through simg4, so there may be some avoidable stalls, 57# and there may be a somewhat more clever way to do the outer loop 58 59# the branch mechanism may prevent dynamic loading; I still need to examine 60# this issue, and there may be a more elegant method 61 62 stmw r31,-4(r1) 63 64 addi r9,r1,-28 65 li r31,0xf 66 andc r9,r9,r31 # for quadword-aligned stack data 67 68 slwi r6,r6,2 # adjust for word size 69 slwi r4,r4,2 70 add r4,r4,r8 # r4 = data+data_len 71 72 mfspr r0,256 # cache old vrsave 73 addis r31,0,0xffff 74 ori r31,r31,0xfc00 75 mtspr 256,r31 # declare VRs in vrsave 76 77 cmplw cr0,r8,r4 # i<data_len 78 bc 4,0,L1400 79 80 # load coefficients into v0-v7 and initial history into v8-v15 81 li r31,0xf 82 and r31,r8,r31 # r31: data%4 83 li r11,16 84 subf r31,r31,r11 # r31: 4-(data%4) 85 slwi r31,r31,3 # convert to bits for vsro 86 li r10,-4 87 stw r31,-4(r9) 88 lvewx v0,r10,r9 89 vspltisb v18,-1 90 vsro v18,v18,v0 # v18: mask vector 91 92 li r31,0x8 93 lvsl v0,0,r31 94 vsldoi v0,v0,v0,12 95 li r31,0xc 96 lvsl v1,0,r31 97 
vspltisb v2,0 98 vspltisb v3,-1 99 vmrglw v2,v2,v3 100 vsel v0,v1,v0,v2 # v0: reversal permutation vector 101 102 add r10,r5,r6 103 lvsl v17,0,r5 # v17: coefficient alignment permutation vector 104 vperm v17,v17,v17,v0 # v17: reversal coefficient alignment permutation vector 105 106 mr r11,r8 107 lvsl v16,0,r11 # v16: history alignment permutation vector 108 109 lvx v0,0,r5 110 addi r5,r5,16 111 lvx v1,0,r5 112 vperm v0,v0,v1,v17 113 lvx v8,0,r11 114 addi r11,r11,-16 115 lvx v9,0,r11 116 vperm v8,v9,v8,v16 117 cmplw cr0,r5,r10 118 bc 12,0,L1101 119 vand v0,v0,v18 120 addis r31,0,L1307@ha 121 ori r31,r31,L1307@l 122 b L1199 123 124L1101: 125 addi r5,r5,16 126 lvx v2,0,r5 127 vperm v1,v1,v2,v17 128 addi r11,r11,-16 129 lvx v10,0,r11 130 vperm v9,v10,v9,v16 131 cmplw cr0,r5,r10 132 bc 12,0,L1102 133 vand v1,v1,v18 134 addis r31,0,L1306@ha 135 ori r31,r31,L1306@l 136 b L1199 137 138L1102: 139 addi r5,r5,16 140 lvx v3,0,r5 141 vperm v2,v2,v3,v17 142 addi r11,r11,-16 143 lvx v11,0,r11 144 vperm v10,v11,v10,v16 145 cmplw cr0,r5,r10 146 bc 12,0,L1103 147 vand v2,v2,v18 148 lis r31,L1305@ha 149 la r31,L1305@l(r31) 150 b L1199 151 152L1103: 153 addi r5,r5,16 154 lvx v4,0,r5 155 vperm v3,v3,v4,v17 156 addi r11,r11,-16 157 lvx v12,0,r11 158 vperm v11,v12,v11,v16 159 cmplw cr0,r5,r10 160 bc 12,0,L1104 161 vand v3,v3,v18 162 lis r31,L1304@ha 163 la r31,L1304@l(r31) 164 b L1199 165 166L1104: 167 addi r5,r5,16 168 lvx v5,0,r5 169 vperm v4,v4,v5,v17 170 addi r11,r11,-16 171 lvx v13,0,r11 172 vperm v12,v13,v12,v16 173 cmplw cr0,r5,r10 174 bc 12,0,L1105 175 vand v4,v4,v18 176 lis r31,L1303@ha 177 la r31,L1303@l(r31) 178 b L1199 179 180L1105: 181 addi r5,r5,16 182 lvx v6,0,r5 183 vperm v5,v5,v6,v17 184 addi r11,r11,-16 185 lvx v14,0,r11 186 vperm v13,v14,v13,v16 187 cmplw cr0,r5,r10 188 bc 12,0,L1106 189 vand v5,v5,v18 190 lis r31,L1302@ha 191 la r31,L1302@l(r31) 192 b L1199 193 194L1106: 195 addi r5,r5,16 196 lvx v7,0,r5 197 vperm v6,v6,v7,v17 198 addi r11,r11,-16 199 lvx v15,0,r11 
200 vperm v14,v15,v14,v16 201 cmplw cr0,r5,r10 202 bc 12,0,L1107 203 vand v6,v6,v18 204 lis r31,L1301@ha 205 la r31,L1301@l(r31) 206 b L1199 207 208L1107: 209 addi r5,r5,16 210 lvx v19,0,r5 211 vperm v7,v7,v19,v17 212 addi r11,r11,-16 213 lvx v19,0,r11 214 vperm v15,v19,v15,v16 215 vand v7,v7,v18 216 lis r31,L1300@ha 217 la r31,L1300@l(r31) 218 219L1199: 220 mtctr r31 221 222 # set up invariant vectors 223 vspltish v16,0 # v16: zero vector 224 225 li r10,-12 226 lvsr v17,r10,r8 # v17: result shift vector 227 lvsl v18,r10,r3 # v18: residual shift back vector 228 229 li r10,-4 230 stw r7,-4(r9) 231 lvewx v19,r10,r9 # v19: lp_quantization vector 232 233L1200: 234 vmulosh v20,v0,v8 # v20: sum vector 235 bcctr 20,0 236 237L1300: 238 vmulosh v21,v7,v15 239 vsldoi v15,v15,v14,4 # increment history 240 vaddsws v20,v20,v21 241 242L1301: 243 vmulosh v21,v6,v14 244 vsldoi v14,v14,v13,4 245 vaddsws v20,v20,v21 246 247L1302: 248 vmulosh v21,v5,v13 249 vsldoi v13,v13,v12,4 250 vaddsws v20,v20,v21 251 252L1303: 253 vmulosh v21,v4,v12 254 vsldoi v12,v12,v11,4 255 vaddsws v20,v20,v21 256 257L1304: 258 vmulosh v21,v3,v11 259 vsldoi v11,v11,v10,4 260 vaddsws v20,v20,v21 261 262L1305: 263 vmulosh v21,v2,v10 264 vsldoi v10,v10,v9,4 265 vaddsws v20,v20,v21 266 267L1306: 268 vmulosh v21,v1,v9 269 vsldoi v9,v9,v8,4 270 vaddsws v20,v20,v21 271 272L1307: 273 vsumsws v20,v20,v16 # v20[3]: sum 274 vsraw v20,v20,v19 # v20[3]: sum >> lp_quantization 275 276 lvewx v21,0,r3 # v21[n]: *residual 277 vperm v21,v21,v21,v18 # v21[3]: *residual 278 vaddsws v20,v21,v20 # v20[3]: *residual + (sum >> lp_quantization) 279 vsldoi v18,v18,v18,4 # increment shift vector 280 281 vperm v21,v20,v20,v17 # v21[n]: shift for storage 282 vsldoi v17,v17,v17,12 # increment shift vector 283 stvewx v21,0,r8 284 285 vsldoi v20,v20,v20,12 286 vsldoi v8,v8,v20,4 # insert value onto history 287 288 addi r3,r3,4 289 addi r8,r8,4 290 cmplw cr0,r8,r4 # i<data_len 291 bc 12,0,L1200 292 293L1400: 294 mtspr 256,r0 # restore old 
vrsave 295 lmw r31,-4(r1) 296 blr 297 298_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8: 299# r3: residual[] 300# r4: data_len 301# r5: qlp_coeff[] 302# r6: order 303# r7: lp_quantization 304# r8: data[] 305 306# see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above 307# this version assumes order<=8; it uses fewer vector registers, which should 308# save time in context switches, and has less code, which may improve 309# instruction caching 310 311 stmw r31,-4(r1) 312 313 addi r9,r1,-28 314 li r31,0xf 315 andc r9,r9,r31 # for quadword-aligned stack data 316 317 slwi r6,r6,2 # adjust for word size 318 slwi r4,r4,2 319 add r4,r4,r8 # r4 = data+data_len 320 321 mfspr r0,256 # cache old vrsave 322 addis r31,0,0xffc0 323 ori r31,r31,0x0000 324 mtspr 256,r31 # declare VRs in vrsave 325 326 cmplw cr0,r8,r4 # i<data_len 327 bc 4,0,L2400 328 329 # load coefficients into v0-v1 and initial history into v2-v3 330 li r31,0xf 331 and r31,r8,r31 # r31: data%4 332 li r11,16 333 subf r31,r31,r11 # r31: 4-(data%4) 334 slwi r31,r31,3 # convert to bits for vsro 335 li r10,-4 336 stw r31,-4(r9) 337 lvewx v0,r10,r9 338 vspltisb v6,-1 339 vsro v6,v6,v0 # v6: mask vector 340 341 li r31,0x8 342 lvsl v0,0,r31 343 vsldoi v0,v0,v0,12 344 li r31,0xc 345 lvsl v1,0,r31 346 vspltisb v2,0 347 vspltisb v3,-1 348 vmrglw v2,v2,v3 349 vsel v0,v1,v0,v2 # v0: reversal permutation vector 350 351 add r10,r5,r6 352 lvsl v5,0,r5 # v5: coefficient alignment permutation vector 353 vperm v5,v5,v5,v0 # v5: reversal coefficient alignment permutation vector 354 355 mr r11,r8 356 lvsl v4,0,r11 # v4: history alignment permutation vector 357 358 lvx v0,0,r5 359 addi r5,r5,16 360 lvx v1,0,r5 361 vperm v0,v0,v1,v5 362 lvx v2,0,r11 363 addi r11,r11,-16 364 lvx v3,0,r11 365 vperm v2,v3,v2,v4 366 cmplw cr0,r5,r10 367 bc 12,0,L2101 368 vand v0,v0,v6 369 lis r31,L2301@ha 370 la r31,L2301@l(r31) 371 b L2199 372 373L2101: 374 addi r5,r5,16 375 lvx v7,0,r5 376 vperm v1,v1,v7,v5 377 addi r11,r11,-16 378 lvx v7,0,r11 
379 vperm v3,v7,v3,v4 380 vand v1,v1,v6 381 lis r31,L2300@ha 382 la r31,L2300@l(r31) 383 384L2199: 385 mtctr r31 386 387 # set up invariant vectors 388 vspltish v4,0 # v4: zero vector 389 390 li r10,-12 391 lvsr v5,r10,r8 # v5: result shift vector 392 lvsl v6,r10,r3 # v6: residual shift back vector 393 394 li r10,-4 395 stw r7,-4(r9) 396 lvewx v7,r10,r9 # v7: lp_quantization vector 397 398L2200: 399 vmulosh v8,v0,v2 # v8: sum vector 400 bcctr 20,0 401 402L2300: 403 vmulosh v9,v1,v3 404 vsldoi v3,v3,v2,4 405 vaddsws v8,v8,v9 406 407L2301: 408 vsumsws v8,v8,v4 # v8[3]: sum 409 vsraw v8,v8,v7 # v8[3]: sum >> lp_quantization 410 411 lvewx v9,0,r3 # v9[n]: *residual 412 vperm v9,v9,v9,v6 # v9[3]: *residual 413 vaddsws v8,v9,v8 # v8[3]: *residual + (sum >> lp_quantization) 414 vsldoi v6,v6,v6,4 # increment shift vector 415 416 vperm v9,v8,v8,v5 # v9[n]: shift for storage 417 vsldoi v5,v5,v5,12 # increment shift vector 418 stvewx v9,0,r8 419 420 vsldoi v8,v8,v8,12 421 vsldoi v2,v2,v8,4 # insert value onto history 422 423 addi r3,r3,4 424 addi r8,r8,4 425 cmplw cr0,r8,r4 # i<data_len 426 bc 12,0,L2200 427 428L2400: 429 mtspr 256,r0 # restore old vrsave 430 lmw r31,-4(r1) 431 blr 432