/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* FAST INTEGER FORWARD DCT
 *
 * This is similar to the SSE2 implementation, except that we left-shift the
 * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
 * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
 *   the elements in arg3 + the most significant 17 bits of
 *     (the elements in arg1 * the elements in arg2).
 */

#include "jsimd_altivec.h"


/*
 * Fixed-point multipliers for the fast integer DCT.  Each value is the
 * real constant named in its comment, scaled by 2^CONST_BITS (256) and
 * rounded to the nearest integer.
 */
#define F_0_382 98   /* FIX(0.382683433) */
#define F_0_541 139  /* FIX(0.541196100) */
#define F_0_707 181  /* FIX(0.707106781) */
#define F_1_306 334  /* FIX(1.306562965) */

/* Number of fractional bits carried by the F_* constants above. */
#define CONST_BITS 8
/* Left shift applied to a multiplicand before it is passed to vec_madds(). */
#define PRE_MULTIPLY_SCALE_BITS 2
/*
 * Left shift applied to the F_* constants when the pw_* vectors are built.
 * With the values above this is 16 - 2 - 8 - 1 = 5, so the largest shifted
 * constant (F_1_306 << 5 = 10688) still fits in a signed 16-bit element.
 */
#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)


/*
 * One 1-D pass of the fast integer forward DCT.
 *
 * This macro takes no arguments; it works entirely on names that must be
 * in scope at the point of expansion:
 *   inputs:   tmp0..tmp7 (the butterfly sums/differences of the eight
 *             input vectors), pw_zero, pw_0382, pw_0541, pw_0707, pw_1306
 *             and pre_multiply_scale_bits
 *   outputs:  out0..out7
 *   scratch:  tmp10..tmp13, z1..z5, z11, z13 (overwritten)
 *
 * Every multiplicand is shifted left by pre_multiply_scale_bits before it
 * is handed to vec_madds().
 *
 * The body is wrapped in do { ... } while (0) so that "DO_FDCT();" is a
 * single statement wherever it is expanded.
 */
#define DO_FDCT()  \
do {  \
  /* Even part */  \
  \
  tmp10 = vec_add(tmp0, tmp3);  \
  tmp13 = vec_sub(tmp0, tmp3);  \
  tmp11 = vec_add(tmp1, tmp2);  \
  tmp12 = vec_sub(tmp1, tmp2);  \
  \
  out0  = vec_add(tmp10, tmp11);  \
  out4  = vec_sub(tmp10, tmp11);  \
  \
  z1 = vec_add(tmp12, tmp13);  \
  z1 = vec_sl(z1, pre_multiply_scale_bits);  \
  z1 = vec_madds(z1, pw_0707, pw_zero);  \
  \
  out2 = vec_add(tmp13, z1);  \
  out6 = vec_sub(tmp13, z1);  \
  \
  /* Odd part */  \
  \
  tmp10 = vec_add(tmp4, tmp5);  \
  tmp11 = vec_add(tmp5, tmp6);  \
  tmp12 = vec_add(tmp6, tmp7);  \
  \
  tmp10 = vec_sl(tmp10, pre_multiply_scale_bits);  \
  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
  z5 = vec_sub(tmp10, tmp12);  \
  z5 = vec_madds(z5, pw_0382, pw_zero);  \
  \
  /* z5 is folded in through the addend operand of vec_madds() */  \
  z2 = vec_madds(tmp10, pw_0541, z5);  \
  z4 = vec_madds(tmp12, pw_1306, z5);  \
  \
  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
  z3 = vec_madds(tmp11, pw_0707, pw_zero);  \
  \
  z11 = vec_add(tmp7, z3);  \
  z13 = vec_sub(tmp7, z3);  \
  \
  out5 = vec_add(z13, z2);  \
  out3 = vec_sub(z13, z2);  \
  out1 = vec_add(z11, z4);  \
  out7 = vec_sub(z11, z4);  \
} while (0)


916eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidisvoid
926eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidisjsimd_fdct_ifast_altivec (DCTELEM *data)
936eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis{
946eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
956eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis    col0, col1, col2, col3, col4, col5, col6, col7,
966eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
976eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis    z1, z2, z3, z4, z5, z11, z13,
986eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis    out0, out1, out2, out3, out4, out5, out6, out7;
996eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis
1006eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  /* Constants */
1016eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  __vector short pw_zero = { __8X(0) },
1026eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis    pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
1036eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis    pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
1046eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis    pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
1056eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis    pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
1066eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  __vector unsigned short
1076eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
1086eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis
1096eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  /* Pass 1: process rows */
1106eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis
1116eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  row0 = vec_ld(0, data);
1126eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  row1 = vec_ld(16, data);
1136eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  row2 = vec_ld(32, data);
1146eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  row3 = vec_ld(48, data);
1156eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  row4 = vec_ld(64, data);
1166eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  row5 = vec_ld(80, data);
1176eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  row6 = vec_ld(96, data);
1186eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  row7 = vec_ld(112, data);
1196eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis
1206eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  TRANSPOSE(row, col);
1216eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis
1226eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp0 = vec_add(col0, col7);
1236eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp7 = vec_sub(col0, col7);
1246eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp1 = vec_add(col1, col6);
1256eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp6 = vec_sub(col1, col6);
1266eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp2 = vec_add(col2, col5);
1276eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp5 = vec_sub(col2, col5);
1286eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp3 = vec_add(col3, col4);
1296eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp4 = vec_sub(col3, col4);
1306eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis
1316eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  DO_FDCT();
1326eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis
1336eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  /* Pass 2: process columns */
1346eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis
1356eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  TRANSPOSE(out, row);
1366eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis
1376eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp0 = vec_add(row0, row7);
1386eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp7 = vec_sub(row0, row7);
1396eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp1 = vec_add(row1, row6);
1406eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp6 = vec_sub(row1, row6);
1416eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp2 = vec_add(row2, row5);
1426eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp5 = vec_sub(row2, row5);
1436eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp3 = vec_add(row3, row4);
1446eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  tmp4 = vec_sub(row3, row4);
1456eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis
1466eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  DO_FDCT();
1476eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis
1486eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  vec_st(out0, 0, data);
1496eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  vec_st(out1, 16, data);
1506eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  vec_st(out2, 32, data);
1516eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  vec_st(out3, 48, data);
1526eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  vec_st(out4, 64, data);
1536eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  vec_st(out5, 80, data);
1546eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  vec_st(out6, 96, data);
1556eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis  vec_st(out7, 112, data);
1566eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis}
157