16eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis/* 26eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * AltiVec optimizations for libjpeg-turbo 36eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 46eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. 56eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Copyright (C) 2014, Jay Foad. All Rights Reserved. 66eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 76eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * This software is provided 'as-is', without any express or implied 86eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * warranty. In no event will the authors be held liable for any damages 96eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * arising from the use of this software. 106eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 116eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Permission is granted to anyone to use this software for any purpose, 126eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * including commercial applications, and to alter it and redistribute it 136eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * freely, subject to the following restrictions: 146eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 156eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 1. The origin of this software must not be misrepresented; you must not 166eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * claim that you wrote the original software. If you use this software 176eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * in a product, an acknowledgment in the product documentation would be 186eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * appreciated but is not required. 196eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 2. Altered source versions must be plainly marked as such, and must not be 206eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * misrepresented as being the original software. 216eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 3. This notice may not be removed or altered from any source distribution. 226eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis */ 236eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 246eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis/* This file is included by jccolor-altivec.c */ 256eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 266eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 276eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidisvoid jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf, 286eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis JSAMPIMAGE output_buf, 296eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis JDIMENSION output_row, int num_rows) 306eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis{ 316eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis JSAMPROW inptr, outptr0, outptr1, outptr2; 326eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis int pitch = img_width * RGB_PIXELSIZE, num_cols; 336eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#if __BIG_ENDIAN__ 346eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis int offset; 356eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#endif 366eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; 376eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 386eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, 396eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr; 406eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4 416eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned char rgb3 = {0}; 426eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#endif 436eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4 446eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned char rgb4 = {0}; 456eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#endif 466eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; 476eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned short yl, yh, crl, crh, cbl, cbh; 486eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3; 496eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 506eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Constants */ 516eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) }, 526eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) }, 536eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) }, 546eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) }; 556eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) }; 566eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector int pd_onehalf = { __4X(ONE_HALF) }, 576eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) }; 586eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned char pb_zero = { __16X(0) }, 596eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#if __BIG_ENDIAN__ 606eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29}; 616eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#else 626eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31}; 636eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#endif 646eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 656eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis while (--num_rows >= 0) { 666eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis inptr = *input_buf++; 676eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outptr0 = output_buf[0][output_row]; 686eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outptr1 = output_buf[1][output_row]; 696eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outptr2 = output_buf[2][output_row]; 706eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis output_row++; 716eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 726eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis for (num_cols = pitch; num_cols > 0; 736eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16, 746eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis outptr0 += 16, outptr1 += 16, outptr2 += 16) { 756eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 766eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#if __BIG_ENDIAN__ 776eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Load 16 pixels == 48 or 64 bytes */ 786eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis offset = (size_t)inptr & 15; 796eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis if (offset) { 806eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis __vector unsigned char unaligned_shift_index; 816eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis int bytes = num_cols + offset; 826eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 836eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { 846eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Slow path to prevent buffer overread. Since there is no way to 856eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * read a partial AltiVec register, overread would occur on the last 866eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * chunk of the last image row if the right edge is not on a 16-byte 876eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * boundary. It could also occur on other rows if the bytes per row 886eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * is low enough. Since we can't determine whether we're on the last 896eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * image row, we have to assume every row is the last. 906eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis */ 916eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); 926eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb0 = vec_ld(0, tmpbuf); 936eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb1 = vec_ld(16, tmpbuf); 946eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb2 = vec_ld(32, tmpbuf); 956eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#if RGB_PIXELSIZE == 4 966eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb3 = vec_ld(48, tmpbuf); 976eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#endif 986eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis } else { 996eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Fast path */ 1006eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb0 = vec_ld(0, inptr); 1016eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis if (bytes > 16) 1026eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb1 = vec_ld(16, inptr); 1036eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis if (bytes > 32) 1046eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb2 = vec_ld(32, inptr); 1056eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis if (bytes > 48) 1066eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb3 = vec_ld(48, inptr); 1076eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#if RGB_PIXELSIZE == 4 1086eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis if (bytes > 64) 1096eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb4 = vec_ld(64, inptr); 1106eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#endif 1116eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis unaligned_shift_index = vec_lvsl(0, inptr); 1126eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index); 1136eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index); 1146eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index); 1156eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#if RGB_PIXELSIZE == 4 1166eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index); 1176eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#endif 1186eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis } 1196eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis } else { 1206eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#endif /* __BIG_ENDIAN__ */ 1216eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { 1226eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Slow path */ 1236eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); 1246eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb0 = VEC_LD(0, tmpbuf); 1256eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb1 = VEC_LD(16, tmpbuf); 1266eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb2 = VEC_LD(32, tmpbuf); 1276eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#if RGB_PIXELSIZE == 4 1286eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb3 = VEC_LD(48, tmpbuf); 1296eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#endif 1306eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis } else { 1316eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Fast path */ 1326eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb0 = VEC_LD(0, inptr); 1336eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis if (num_cols > 16) 1346eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb1 = VEC_LD(16, inptr); 1356eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis if (num_cols > 32) 1366eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb2 = VEC_LD(32, inptr); 1376eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#if RGB_PIXELSIZE == 4 1386eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis if (num_cols > 48) 1396eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgb3 = VEC_LD(48, inptr); 1406eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#endif 1416eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis } 1426eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#if __BIG_ENDIAN__ 1436eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis } 1446eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#endif 1456eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1466eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#if RGB_PIXELSIZE == 3 1476eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 1486eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga 1496eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf 1506eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 1516eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 1526eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 1536eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb 1546eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf 1556eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis */ 1566eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0); 1576eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1); 1586eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2); 1596eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3); 1606eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#else 1616eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 1626eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 1636eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb 1646eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf 1656eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 1666eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 1676eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 1686eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb 1696eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf 1706eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis */ 1716eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX); 1726eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX); 1736eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX); 1746eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX); 1756eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis#endif 1766eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1776eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3 1786eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 1796eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * ... 1806eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 1816eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't 1826eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * support unsigned vectors. 1836eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis */ 1846eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0); 1856eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0); 1866eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1); 1876eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1); 1886eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2); 1896eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2); 1906eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3); 1916eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3); 1926eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 1936eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* (Original) 1946eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B 1956eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE 1966eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE 1976eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * 1986eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * (This implementation) 1996eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G 2006eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE 2016eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE 2026eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis */ 2036eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 2046eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Calculate Y values */ 2056eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 2066eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf); 2076eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf); 2086eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf); 2096eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf); 2106eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis y0 = vec_msums(bg0, pw_f0114_f0250, y0); 2116eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis y1 = vec_msums(bg1, pw_f0114_f0250, y1); 2126eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis y2 = vec_msums(bg2, pw_f0114_f0250, y2); 2136eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis y3 = vec_msums(bg3, pw_f0114_f0250, y3); 2146eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from 2156eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * each dword into a new 16-bit vector, which is the equivalent of 2166eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * descaling the 32-bit results (right-shifting by 16 bits) and then 2176eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis * packing them. 2186eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis */ 2196eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1, 2206eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis shift_pack_index); 2216eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3, 2226eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis shift_pack_index); 2236eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis y = vec_pack(yl, yh); 2246eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis vec_st(y, 0, outptr0); 2256eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 2266eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Calculate Cb values */ 2276eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj); 2286eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj); 2296eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj); 2306eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj); 2316eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000, 2326eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis (__vector unsigned int)cb0); 2336eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000, 2346eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis (__vector unsigned int)cb1); 2356eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000, 2366eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis (__vector unsigned int)cb2); 2376eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000, 2386eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis (__vector unsigned int)cb3); 2396eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cbl = vec_perm((__vector unsigned short)cb0, 2406eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis (__vector unsigned short)cb1, shift_pack_index); 2416eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cbh = vec_perm((__vector unsigned short)cb2, 2426eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis (__vector unsigned short)cb3, shift_pack_index); 2436eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cb = vec_pack(cbl, cbh); 2446eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis vec_st(cb, 0, outptr1); 2456eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis 2466eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis /* Calculate Cr values */ 2476eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj); 2486eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj); 2496eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj); 2506eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj); 2516eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000, 2526eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis (__vector unsigned int)cr0); 2536eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000, 2546eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis (__vector unsigned int)cr1); 2556eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000, 2566eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis (__vector unsigned int)cr2); 2576eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000, 2586eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis (__vector unsigned int)cr3); 2596eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis crl = vec_perm((__vector unsigned short)cr0, 2606eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis (__vector unsigned short)cr1, shift_pack_index); 2616eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis crh = vec_perm((__vector unsigned short)cr2, 2626eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis (__vector unsigned short)cr3, shift_pack_index); 2636eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis cr = vec_pack(crl, crh); 2646eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis vec_st(cr, 0, outptr2); 2656eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis } 2666eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis } 2676eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis} 268