16eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis/* 26eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * AltiVec optimizations for libjpeg-turbo 36eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 46eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. 56eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 66eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * This software is provided 'as-is', without any express or implied 76eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * warranty. In no event will the authors be held liable for any damages 86eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * arising from the use of this software. 96eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 106eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Permission is granted to anyone to use this software for any purpose, 116eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * including commercial applications, and to alter it and redistribute it 126eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * freely, subject to the following restrictions: 136eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 146eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 1. The origin of this software must not be misrepresented; you must not 156eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * claim that you wrote the original software. If you use this software 166eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * in a product, an acknowledgment in the product documentation would be 176eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * appreciated but is not required. 186eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 2. Altered source versions must be plainly marked as such, and must not be 196eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * misrepresented as being the original software. 206eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 3. This notice may not be removed or altered from any source distribution. 216eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis */ 226eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 236eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define JPEG_INTERNALS 246eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#include "../jinclude.h" 256eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#include "../jpeglib.h" 266eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#include "../jsimd.h" 276eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#include "../jdct.h" 286eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#include "../jsimddct.h" 296eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#include "jsimd.h" 306eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#include <altivec.h> 316eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 326eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 336eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis/* Common code */ 346eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 356eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define __4X(a) a, a, a, a 366eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define __4X2(a, b) a, b, a, b, a, b, a, b 376eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define __8X(a) __4X(a), __4X(a) 386eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define __16X(a) __8X(a), __8X(a) 396eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 406eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define TRANSPOSE(row, col) \ 416eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis{ \ 426eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector short row04l, row04h, row15l, row15h, \ 436eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row26l, row26h, row37l, row37h; \ 446eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector short col01e, col01o, col23e, col23o, \ 456eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col45e, col45o, col67e, col67o; \ 466eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis \ 476eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* transpose coefficients (phase 1) */ \ 486eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \ 496eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \ 506eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \ 516eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \ 526eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \ 536eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \ 546eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \ 556eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \ 566eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis \ 576eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* transpose coefficients (phase 2) */ \ 586eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \ 596eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \ 606eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \ 616eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \ 626eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \ 636eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \ 646eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \ 656eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \ 666eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis \ 676eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* transpose coefficients (phase 3) */ \ 686eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \ 696eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \ 706eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \ 716eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \ 726eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \ 736eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \ 746eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \ 756eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \ 766eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis} 776eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 786eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#ifndef min 796eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define min(a,b) ((a) < (b) ? (a) : (b)) 806eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#endif 816eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 826eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 836eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis/* Macros to abstract big/little endian bit twiddling */ 846eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 856eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#if __BIG_ENDIAN__ 866eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 876eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define VEC_LD(a, b) vec_ld(a, b) 886eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define VEC_ST(a, b, c) vec_st(a, b, c) 896eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a) 906eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a) 916eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 926eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#else 936eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 946eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define VEC_LD(a, b) vec_vsx_ld(a, b) 956eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define VEC_ST(a, b, c) vec_vsx_st(a, b, c) 966eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero) 976eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero) 986eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 996eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#endif 100