16eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis/* 26eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * AltiVec optimizations for libjpeg-turbo 36eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 46eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Copyright (C) 2014, D. R. Commander. All Rights Reserved. 56eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 66eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * This software is provided 'as-is', without any express or implied 76eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * warranty. In no event will the authors be held liable for any damages 86eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * arising from the use of this software. 96eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 106eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Permission is granted to anyone to use this software for any purpose, 116eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * including commercial applications, and to alter it and redistribute it 126eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * freely, subject to the following restrictions: 136eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 146eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 1. The origin of this software must not be misrepresented; you must not 156eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * claim that you wrote the original software. If you use this software 166eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * in a product, an acknowledgment in the product documentation would be 176eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * appreciated but is not required. 186eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 2. Altered source versions must be plainly marked as such, and must not be 196eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * misrepresented as being the original software. 206eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 3. This notice may not be removed or altered from any source distribution. 216eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis */ 226eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 236eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis/* FAST INTEGER FORWARD DCT 246eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 256eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * This is similar to the SSE2 implementation, except that we left-shift the 266eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because 276eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of: 286eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * the elements in arg3 + the most significant 17 bits of 296eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * (the elements in arg1 * the elements in arg2). 306eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis */ 316eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 326eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#include "jsimd_altivec.h" 336eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 346eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 356eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define F_0_382 98 /* FIX(0.382683433) */ 366eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define F_0_541 139 /* FIX(0.541196100) */ 376eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define F_0_707 181 /* FIX(0.707106781) */ 386eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define F_1_306 334 /* FIX(1.306562965) */ 396eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 406eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define CONST_BITS 8 416eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define PRE_MULTIPLY_SCALE_BITS 2 426eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) 436eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 446eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 456eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define DO_FDCT() \ 466eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis{ \ 476eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Even part */ \ 486eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis \ 496eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp10 = vec_add(tmp0, tmp3); \ 506eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp13 = vec_sub(tmp0, tmp3); \ 516eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp11 = vec_add(tmp1, tmp2); \ 526eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp12 = vec_sub(tmp1, tmp2); \ 536eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis \ 546eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out0 = vec_add(tmp10, tmp11); \ 556eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out4 = vec_sub(tmp10, tmp11); \ 566eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis \ 576eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis z1 = vec_add(tmp12, tmp13); \ 586eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis z1 = vec_sl(z1, pre_multiply_scale_bits); \ 596eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis z1 = vec_madds(z1, pw_0707, pw_zero); \ 606eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis \ 616eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out2 = vec_add(tmp13, z1); \ 626eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out6 = vec_sub(tmp13, z1); \ 636eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis \ 646eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Odd part */ \ 656eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis \ 666eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp10 = vec_add(tmp4, tmp5); \ 676eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp11 = vec_add(tmp5, tmp6); \ 686eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp12 = vec_add(tmp6, tmp7); \ 696eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis \ 706eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \ 716eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \ 726eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis z5 = vec_sub(tmp10, tmp12); \ 736eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis z5 = vec_madds(z5, pw_0382, pw_zero); \ 746eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis \ 756eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis z2 = vec_madds(tmp10, pw_0541, z5); \ 766eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis z4 = vec_madds(tmp12, pw_1306, z5); \ 776eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis \ 786eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \ 796eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis z3 = vec_madds(tmp11, pw_0707, pw_zero); \ 806eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis \ 816eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis z11 = vec_add(tmp7, z3); \ 826eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis z13 = vec_sub(tmp7, z3); \ 836eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis \ 846eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out5 = vec_add(z13, z2); \ 856eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out3 = vec_sub(z13, z2); \ 866eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out1 = vec_add(z11, z4); \ 876eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out7 = vec_sub(z11, z4); \ 886eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis} 896eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 906eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 916eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidisvoid 926eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidisjsimd_fdct_ifast_altivec (DCTELEM *data) 936eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis{ 946eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector short row0, row1, row2, row3, row4, row5, row6, row7, 956eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col0, col1, col2, col3, col4, col5, col6, col7, 966eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, 976eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis z1, z2, z3, z4, z5, z11, z13, 986eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out0, out1, out2, out3, out4, out5, out6, out7; 996eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1006eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Constants */ 1016eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector short pw_zero = { __8X(0) }, 1026eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis pw_0382 = { __8X(F_0_382 << CONST_SHIFT) }, 1036eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis pw_0541 = { __8X(F_0_541 << CONST_SHIFT) }, 1046eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis pw_0707 = { __8X(F_0_707 << CONST_SHIFT) }, 1056eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis pw_1306 = { __8X(F_1_306 << CONST_SHIFT) }; 1066eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned short 1076eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) }; 1086eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1096eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Pass 1: process rows */ 1106eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1116eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row0 = vec_ld(0, data); 1126eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row1 = vec_ld(16, data); 1136eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row2 = vec_ld(32, data); 1146eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row3 = vec_ld(48, data); 1156eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row4 = vec_ld(64, data); 1166eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row5 = vec_ld(80, data); 1176eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row6 = vec_ld(96, data); 1186eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row7 = vec_ld(112, data); 1196eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1206eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis TRANSPOSE(row, col); 1216eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1226eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp0 = vec_add(col0, col7); 1236eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp7 = vec_sub(col0, col7); 1246eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp1 = vec_add(col1, col6); 1256eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp6 = vec_sub(col1, col6); 1266eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp2 = vec_add(col2, col5); 1276eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp5 = vec_sub(col2, col5); 1286eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp3 = vec_add(col3, col4); 1296eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp4 = vec_sub(col3, col4); 1306eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1316eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis DO_FDCT(); 1326eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1336eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Pass 2: process columns */ 1346eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1356eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis TRANSPOSE(out, row); 1366eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1376eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp0 = vec_add(row0, row7); 1386eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp7 = vec_sub(row0, row7); 1396eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp1 = vec_add(row1, row6); 1406eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp6 = vec_sub(row1, row6); 1416eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp2 = vec_add(row2, row5); 1426eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp5 = vec_sub(row2, row5); 1436eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp3 = vec_add(row3, row4); 1446eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis tmp4 = vec_sub(row3, row4); 1456eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1466eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis DO_FDCT(); 1476eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1486eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis vec_st(out0, 0, data); 1496eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis vec_st(out1, 16, data); 1506eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis vec_st(out2, 32, data); 1516eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis vec_st(out3, 48, data); 1526eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis vec_st(out4, 64, data); 1536eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis vec_st(out5, 80, data); 1546eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis vec_st(out6, 96, data); 1556eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis vec_st(out7, 112, data); 1566eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis} 157