16eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis/* 26eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * AltiVec optimizations for libjpeg-turbo 36eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 46eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Copyright (C) 2015, D. R. Commander. All Rights Reserved. 56eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 66eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * This software is provided 'as-is', without any express or implied 76eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * warranty. In no event will the authors be held liable for any damages 86eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * arising from the use of this software. 96eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 106eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Permission is granted to anyone to use this software for any purpose, 116eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * including commercial applications, and to alter it and redistribute it 126eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * freely, subject to the following restrictions: 136eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 146eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 1. The origin of this software must not be misrepresented; you must not 156eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * claim that you wrote the original software. If you use this software 166eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * in a product, an acknowledgment in the product documentation would be 176eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * appreciated but is not required. 186eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 2. Altered source versions must be plainly marked as such, and must not be 196eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * misrepresented as being the original software. 206eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 3. This notice may not be removed or altered from any source distribution. 216eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis */ 226eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 236eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis/* CHROMA DOWNSAMPLING */ 246eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 256eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#include "jsimd_altivec.h" 266eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#include "jcsample.h" 276eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 286eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 296eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidisvoid 306eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidisjsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor, 316eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis JDIMENSION v_samp_factor, 326eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis JDIMENSION width_blocks, 336eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis JSAMPARRAY input_data, JSAMPARRAY output_data) 346eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis{ 356eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis int outrow, outcol; 366eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis JDIMENSION output_cols = width_blocks * DCTSIZE; 376eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis JSAMPROW inptr, outptr; 386eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 396eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned char this0, next0, out; 406eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned short this0e, this0o, next0e, next0o, outl, outh; 416eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 426eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Constants */ 436eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned short pw_bias = { __4X2(0, 1) }, 446eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis pw_one = { __8X(1) }; 456eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned char even_odd_index = 466eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15}, 476eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis pb_zero = { __16X(0) }; 486eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 496eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis expand_right_edge(input_data, max_v_samp_factor, image_width, 506eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis output_cols * 2); 516eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 526eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis for (outrow = 0; outrow < v_samp_factor; outrow++) { 536eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outptr = output_data[outrow]; 546eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis inptr = input_data[outrow]; 556eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 566eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis for (outcol = output_cols; outcol > 0; 576eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outcol -= 16, inptr += 32, outptr += 16) { 586eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 596eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis this0 = vec_ld(0, inptr); 606eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis this0 = vec_perm(this0, this0, even_odd_index); 616eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis this0e = (__vector unsigned short)VEC_UNPACKHU(this0); 626eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis this0o = (__vector unsigned short)VEC_UNPACKLU(this0); 636eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outl = vec_add(this0e, this0o); 646eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outl = vec_add(outl, pw_bias); 656eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outl = vec_sr(outl, pw_one); 666eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 676eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis if (outcol > 8) { 686eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis next0 = vec_ld(16, inptr); 696eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis next0 = vec_perm(next0, next0, even_odd_index); 706eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis next0e = (__vector unsigned short)VEC_UNPACKHU(next0); 716eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis next0o = (__vector unsigned short)VEC_UNPACKLU(next0); 726eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outh = vec_add(next0e, next0o); 736eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outh = vec_add(outh, pw_bias); 746eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outh = vec_sr(outh, pw_one); 756eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis } else 766eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outh = vec_splat_u16(0); 776eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 786eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out = vec_pack(outl, outh); 796eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis vec_st(out, 0, outptr); 806eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis } 816eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis } 826eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis} 836eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 846eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 856eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidisvoid 866eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidisjsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor, 876eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis JDIMENSION v_samp_factor, 886eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis JDIMENSION width_blocks, 896eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis JSAMPARRAY input_data, JSAMPARRAY output_data) 906eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis{ 916eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis int inrow, outrow, outcol; 926eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis JDIMENSION output_cols = width_blocks * DCTSIZE; 936eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis JSAMPROW inptr0, inptr1, outptr; 946eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 956eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned char this0, next0, this1, next1, out; 966eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o, 976eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis next1e, next1o, out0l, out0h, out1l, out1h, outl, outh; 986eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 996eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Constants */ 1006eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned short pw_bias = { __4X2(1, 2) }, 1016eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis pw_two = { __8X(2) }; 1026eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned char even_odd_index = 1036eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, 1046eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis pb_zero = { __16X(0) }; 1056eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1066eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis expand_right_edge(input_data, max_v_samp_factor, image_width, 1076eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis output_cols * 2); 1086eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1096eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis for (inrow = 0, outrow = 0; outrow < v_samp_factor; 1106eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis inrow += 2, outrow++) { 1116eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1126eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis inptr0 = input_data[inrow]; 1136eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis inptr1 = input_data[inrow + 1]; 1146eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outptr = output_data[outrow]; 1156eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1166eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis for (outcol = output_cols; outcol > 0; 1176eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) { 1186eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1196eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis this0 = vec_ld(0, inptr0); 1206eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis this0 = vec_perm(this0, this0, even_odd_index); 1216eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis this0e = (__vector unsigned short)VEC_UNPACKHU(this0); 1226eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis this0o = (__vector unsigned short)VEC_UNPACKLU(this0); 1236eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out0l = vec_add(this0e, this0o); 1246eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1256eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis this1 = vec_ld(0, inptr1); 1266eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis this1 = vec_perm(this1, this1, even_odd_index); 1276eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis this1e = (__vector unsigned short)VEC_UNPACKHU(this1); 1286eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis this1o = (__vector unsigned short)VEC_UNPACKLU(this1); 1296eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out1l = vec_add(this1e, this1o); 1306eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1316eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outl = vec_add(out0l, out1l); 1326eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outl = vec_add(outl, pw_bias); 1336eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outl = vec_sr(outl, pw_two); 1346eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1356eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis if (outcol > 8) { 1366eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis next0 = vec_ld(16, inptr0); 1376eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis next0 = vec_perm(next0, next0, even_odd_index); 1386eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis next0e = (__vector unsigned short)VEC_UNPACKHU(next0); 1396eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis next0o = (__vector unsigned short)VEC_UNPACKLU(next0); 1406eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out0h = vec_add(next0e, next0o); 1416eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1426eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis next1 = vec_ld(16, inptr1); 1436eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis next1 = vec_perm(next1, next1, even_odd_index); 1446eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis next1e = (__vector unsigned short)VEC_UNPACKHU(next1); 1456eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis next1o = (__vector unsigned short)VEC_UNPACKLU(next1); 1466eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out1h = vec_add(next1e, next1o); 1476eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1486eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outh = vec_add(out0h, out1h); 1496eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outh = vec_add(outh, pw_bias); 1506eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outh = vec_sr(outh, pw_two); 1516eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis } else 1526eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outh = vec_splat_u16(0); 1536eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1546eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis out = vec_pack(outl, outh); 1556eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis vec_st(out, 0, outptr); 1566eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis } 1576eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis } 1586eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis} 159