// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of lossless functions
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
//            Jovan Zelincevic (jovan.zelincevic@imgtec.com)

#include "./dsp.h"
#include "./lossless.h"
#include "./lossless_common.h"

#if defined(WEBP_USE_MIPS32)

#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

// Slow path for VP8LFastSLog2(): returns an approximation of v * log2(v)
// for values too large for the small lookup table (v >= LOG_LOOKUP_IDX_MAX).
static float FastSLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y, correction;
    const int c24 = 24;
    const float v_f = (float)v;
    uint32_t temp;

    // Xf = 256 = 2^8
    // log_cnt is index of leading one in upper 24 bits
    // (clz counts leading zeros; y = 2^log_cnt, temp = v >> log_cnt so that
    // temp < 256 indexes kLog2Table).
    __asm__ volatile(
      "clz      %[log_cnt], %[v]                       \n\t"
      "addiu    %[y],       $zero,        1            \n\t"
      "subu     %[log_cnt], %[c24],       %[log_cnt]   \n\t"
      "sllv     %[y],       %[y],         %[log_cnt]   \n\t"
      "srlv     %[temp],    %[v],         %[log_cnt]   \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
    // Xf = floor(Xf) * (1 + (v % y) / v)
    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
    // The correction factor: log(1 + d) ~ d; for very small d values, so
    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
    // LOG_2_RECIPROCAL ~ 23/16

    // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
    correction = (23 * (v & (y - 1))) >> 4;
    return v_f * (kLog2Table[temp] + log_cnt) + correction;
  } else {
    // Very large values: fall back to the exact libm computation.
    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
  }
}

// Slow path for VP8LFastLog2(): returns an approximation of log2(v)
// for v >= LOG_LOOKUP_IDX_MAX. Same normalization trick as FastSLog2Slow.
static float FastLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y;
    const int c24 = 24;
    double log_2;
    uint32_t temp;

    __asm__ volatile(
      "clz      %[log_cnt], %[v]                       \n\t"
      "addiu    %[y],       $zero,        1            \n\t"
      "subu     %[log_cnt], %[c24],       %[log_cnt]   \n\t"
      "sllv     %[y],       %[y],         %[log_cnt]   \n\t"
      "srlv     %[temp],    %[v],         %[log_cnt]   \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    log_2 = kLog2Table[temp] + log_cnt;
    if (v >= APPROX_LOG_MAX) {
      // Since the division is still expensive, add this correction factor only
      // for large values of 'v'.

      const uint32_t correction = (23 * (v & (y - 1))) >> 4;
      log_2 += (double)correction / v;
    }
    return (float)log_2;
  } else {
    return (float)(LOG_2_RECIPROCAL * log((double)v));
  }
}

// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pop = &population[4];
//   const uint32_t* LoopEnd = &population[length];
//   while (pop != LoopEnd) {
//     ++i;
//     cost += i * *pop;
//     cost += i * *(pop + 1);
//     pop += 2;
//   }
//   return (double)cost;
// The 64-bit sum is accumulated in the hi/lo register pair via 'madd'
// (multiply-add to accumulator) and read back with mfhi/mflo.
static double ExtraCost(const uint32_t* const population, int length) {
  int i, temp0, temp1;
  const uint32_t* pop = &population[4];
  const uint32_t* const LoopEnd = &population[length];

  __asm__ volatile(
    "mult   $zero,    $zero                  \n\t"
    "xor    %[i],     %[i],       %[i]       \n\t"
    "beq    %[pop],   %[LoopEnd], 2f         \n\t"
  "1:                                        \n\t"
    "lw     %[temp0], 0(%[pop])              \n\t"
    "lw     %[temp1], 4(%[pop])              \n\t"
    "addiu  %[i],     %[i],       1          \n\t"
    "addiu  %[pop],   %[pop],     8          \n\t"
    "madd   %[i],     %[temp0]               \n\t"
    "madd   %[i],     %[temp1]               \n\t"
    "bne    %[pop],   %[LoopEnd], 1b         \n\t"
  "2:                                        \n\t"
    "mfhi   %[temp0]                         \n\t"
    "mflo   %[temp1]                         \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [i]"=&r"(i), [pop]"+r"(pop)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );

  // Reassemble the 64-bit accumulator from the hi/lo halves.
  return (double)((int64_t)temp0 << 32 | temp1);
}

// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pX = &X[4];
//   const uint32_t* pY = &Y[4];
//   const uint32_t* LoopEnd = &X[length];
//   while (pX != LoopEnd) {
//     const uint32_t xy0 = *pX + *pY;
//     const uint32_t xy1 = *(pX + 1) + *(pY + 1);
//     ++i;
//     cost += i * xy0;
//     cost += i * xy1;
//     pX += 2;
//     pY += 2;
//   }
//   return (double)cost;
static double ExtraCostCombined(const uint32_t* const X,
                                const uint32_t* const Y, int length) {
  int i, temp0, temp1, temp2, temp3;
  const uint32_t* pX = &X[4];
  const uint32_t* pY = &Y[4];
  const uint32_t* const LoopEnd = &X[length];

  __asm__ volatile(
    "mult   $zero,    $zero                  \n\t"
    "xor    %[i],     %[i],       %[i]       \n\t"
    "beq    %[pX],    %[LoopEnd], 2f         \n\t"
  "1:                                        \n\t"
    "lw     %[temp0], 0(%[pX])               \n\t"
    "lw     %[temp1], 0(%[pY])               \n\t"
    "lw     %[temp2], 4(%[pX])               \n\t"
    "lw     %[temp3], 4(%[pY])               \n\t"
    "addiu  %[i],     %[i],       1          \n\t"
    "addu   %[temp0], %[temp0],   %[temp1]   \n\t"
    "addu   %[temp2], %[temp2],   %[temp3]   \n\t"
    "addiu  %[pX],    %[pX],      8          \n\t"
    "addiu  %[pY],    %[pY],      8          \n\t"
    "madd   %[i],     %[temp0]               \n\t"
    "madd   %[i],     %[temp2]               \n\t"
    "bne    %[pX],    %[LoopEnd], 1b         \n\t"
  "2:                                        \n\t"
    "mfhi   %[temp0]                         \n\t"
    "mflo   %[temp1]                         \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
      [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );

  return (double)((int64_t)temp0 << 32 | temp1);
}

// Records one RLE streak into 'stats'. On entry temp0 is 0 or 1 (whether the
// streaked value was non-zero). temp0 * 8 indexes the streaks[temp0][2] row
// (two ints per row); streaks shorter than 4 go to streaks[temp0][0],
// longer ones to streaks[temp0][1] with counts[temp0] incremented.
#define HUFFMAN_COST_PASS                                 \
  __asm__ volatile(                                       \
    "sll   %[temp1],  %[temp0],    3           \n\t"      \
    "addiu %[temp3],  %[streak],   -3          \n\t"      \
    "addu  %[temp2],  %[pstreaks], %[temp1]    \n\t"      \
    "blez  %[temp3],  1f                       \n\t"      \
    "srl   %[temp1],  %[temp1],    1           \n\t"      \
    "addu  %[temp3],  %[pcnts],    %[temp1]    \n\t"      \
    "lw    %[temp0],  4(%[temp2])              \n\t"      \
    "lw    %[temp1],  0(%[temp3])              \n\t"      \
    "addu  %[temp0],  %[temp0],    %[streak]   \n\t"      \
    "addiu %[temp1],  %[temp1],    1           \n\t"      \
    "sw    %[temp0],  4(%[temp2])              \n\t"      \
    "sw    %[temp1],  0(%[temp3])              \n\t"      \
    "b     2f                                  \n\t"      \
  "1:                                          \n\t"      \
    "lw    %[temp0],  0(%[temp2])              \n\t"      \
    "addu  %[temp0],  %[temp0],    %[streak]   \n\t"      \
    "sw    %[temp0],  0(%[temp2])              \n\t"      \
  "2:                                          \n\t"      \
    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),           \
      [temp3]"=&r"(temp3), [temp0]"+r"(temp0)             \
    : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts),         \
      [streak]"r"(streak)                                 \
    : "memory"                                            \
  );

// Returns the various RLE counts
// Folds the streak that just ended at index 'i' (it ran from *i_prev with
// value *val_prev) into both 'bit_entropy' and 'stats', then restarts the
// streak bookkeeping at (val, i).
static WEBP_INLINE void GetEntropyUnrefinedHelper(
    uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
  int* const pstreaks = &stats->streaks[0][0];
  int* const pcnts = &stats->counts[0];
  int temp0, temp1, temp2, temp3;
  const int streak = i - *i_prev;

  // Gather info for the bit entropy.
  if (*val_prev != 0) {
    bit_entropy->sum += (*val_prev) * streak;
    bit_entropy->nonzeros += streak;
    bit_entropy->nonzero_code = *i_prev;
    bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
    if (bit_entropy->max_val < *val_prev) {
      bit_entropy->max_val = *val_prev;
    }
  }

  // Gather info for the Huffman cost.
  temp0 = (*val_prev != 0);  // row selector for HUFFMAN_COST_PASS
  HUFFMAN_COST_PASS

  *val_prev = val;
  *i_prev = i;
}

// Scans the histogram X[0..length-1] and accumulates entropy and RLE-streak
// statistics; a final sentinel call flushes the last open streak.
static void GetEntropyUnrefined(const uint32_t X[], int length,
                                VP8LBitEntropy* const bit_entropy,
                                VP8LStreaks* const stats) {
  int i;
  int i_prev = 0;
  uint32_t x_prev = X[0];

  memset(stats, 0, sizeof(*stats));
  VP8LBitEntropyInit(bit_entropy);

  for (i = 1; i < length; ++i) {
    const uint32_t x = X[i];
    if (x != x_prev) {
      GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
    }
  }
  // Flush the trailing streak (value 0 is a sentinel, never stored).
  GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);

  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}

// Same as GetEntropyUnrefined() but over the element-wise sum X[i] + Y[i].
static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
                                        int length,
                                        VP8LBitEntropy* const bit_entropy,
                                        VP8LStreaks* const stats) {
  int i = 1;
  int i_prev = 0;
  uint32_t xy_prev = X[0] + Y[0];

  memset(stats, 0, sizeof(*stats));
  VP8LBitEntropyInit(bit_entropy);

  for (i = 1; i < length; ++i) {
    const uint32_t xy = X[i] + Y[i];
    if (xy != xy_prev) {
      GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
    }
  }
  GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);

  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}

#define ASM_START                                       \
  __asm__ volatile(                                     \
    ".set   push                            \n\t"       \
    ".set   at                              \n\t"       \
    ".set   macro                           \n\t"       \
  "1:                                       \n\t"

// P2 = P0 + P1
// A..D - offsets
// E - temp variable to tell macro
//     if pointer should be incremented
// literal_ and successive histograms could be unaligned
// so we must use ulw and usw
#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2)           \
    "ulw    %[temp0], " #A "(%[" #P0 "])    \n\t"       \
    "ulw    %[temp1], " #B "(%[" #P0 "])    \n\t"       \
    "ulw    %[temp2], " #C "(%[" #P0 "])    \n\t"       \
    "ulw    %[temp3], " #D "(%[" #P0 "])    \n\t"       \
    "ulw    %[temp4], " #A "(%[" #P1 "])    \n\t"       \
    "ulw    %[temp5], " #B "(%[" #P1 "])    \n\t"       \
    "ulw    %[temp6], " #C "(%[" #P1 "])    \n\t"       \
    "ulw    %[temp7], " #D "(%[" #P1 "])    \n\t"       \
    "addu   %[temp4], %[temp4],   %[temp0]  \n\t"       \
    "addu   %[temp5], %[temp5],   %[temp1]  \n\t"       \
    "addu   %[temp6], %[temp6],   %[temp2]  \n\t"       \
    "addu   %[temp7], %[temp7],   %[temp3]  \n\t"       \
    "addiu  %[" #P0 "],  %[" #P0 "],  16    \n\t"       \
  ".if " #E " == 1                          \n\t"       \
    "addiu  %[" #P1 "],  %[" #P1 "],  16    \n\t"       \
  ".endif                                   \n\t"       \
    "usw    %[temp4], " #A "(%[" #P2 "])    \n\t"       \
    "usw    %[temp5], " #B "(%[" #P2 "])    \n\t"       \
    "usw    %[temp6], " #C "(%[" #P2 "])    \n\t"       \
    "usw    %[temp7], " #D "(%[" #P2 "])    \n\t"       \
    "addiu  %[" #P2 "], %[" #P2 "],   16    \n\t"       \
    "bne    %[" #P0 "], %[LoopEnd], 1b      \n\t"       \
    ".set   pop                             \n\t"       \

#define ASM_END_COMMON_0                                \
      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),       \
        [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),       \
        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),       \
        [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),       \
        [pa]"+r"(pa), [pout]"+r"(pout)

#define ASM_END_COMMON_1                                \
      : [LoopEnd]"r"(LoopEnd)                           \
      : "memory", "at"                                  \
    );

// Constraint list for the two-source form (pa + pb -> pout).
#define ASM_END_0                                       \
    ASM_END_COMMON_0                                    \
      , [pb]"+r"(pb)                                    \
    ASM_END_COMMON_1

// Constraint list for the in-place form (pa + pout -> pout).
#define ASM_END_1                                       \
    ASM_END_COMMON_0                                    \
    ASM_END_COMMON_1

// OUT[i] = A[i] + B[i] for i in [0, SIZE + EXTRA_SIZE); the main loop handles
// SIZE words (a multiple of 4), the scalar tail handles EXTRA_SIZE more.
#define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE)  do {   \
  const uint32_t* pa = (const uint32_t*)(A);            \
  const uint32_t* pb = (const uint32_t*)(B);            \
  uint32_t* pout = (uint32_t*)(OUT);                    \
  const uint32_t* const LoopEnd = pa + (SIZE);          \
  assert((SIZE) % 4 == 0);                              \
  ASM_START                                             \
  ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)              \
  ASM_END_0                                             \
  if ((EXTRA_SIZE) > 0) {                               \
    const int last = (EXTRA_SIZE);                      \
    int i;                                              \
    for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i]; \
  }                                                     \
} while (0)

// OUT[i] += A[i]; same tiling as ADD_VECTOR but accumulates in place.
#define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE)  do {   \
  const uint32_t* pa = (const uint32_t*)(A);            \
  uint32_t* pout = (uint32_t*)(OUT);                    \
  const uint32_t* const LoopEnd = pa + (SIZE);          \
  assert((SIZE) % 4 == 0);                              \
  ASM_START                                             \
  ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)            \
  ASM_END_1                                             \
  if ((EXTRA_SIZE) > 0) {                               \
    const int last = (EXTRA_SIZE);                      \
    int i;                                              \
    for (i = 0; i < last; ++i) pout[i] += pa[i];        \
  }                                                     \
} while (0)

// out = a + b, element-wise over all histogram arrays. Uses the in-place
// variant when 'b' aliases 'out' so 'b' is read through 'pout'.
static void HistogramAdd(const VP8LHistogram* const a,
                         const VP8LHistogram* const b,
                         VP8LHistogram* const out) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
                             - (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
  assert(a->palette_code_bits_ == b->palette_code_bits_);

  if (b != out) {
    ADD_VECTOR(a->literal_, b->literal_, out->literal_,
               NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR(a->distance_, b->distance_, out->distance_,
               NUM_DISTANCE_CODES, 0);
    ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  } else {
    ADD_VECTOR_EQ(a->literal_, out->literal_,
                  NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0);
    ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  }
}

#undef ADD_VECTOR_EQ
#undef ADD_VECTOR
#undef ASM_END_1
#undef ASM_END_0
#undef ASM_END_COMMON_1
#undef ASM_END_COMMON_0
#undef ADD_TO_OUT
#undef ASM_START

//------------------------------------------------------------------------------
// Entry point

extern void VP8LEncDspInitMIPS32(void);

// Installs the MIPS32 implementations into the global dispatch pointers.
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
  VP8LFastSLog2Slow = FastSLog2Slow;
  VP8LFastLog2Slow = FastLog2Slow;
  VP8LExtraCost = ExtraCost;
  VP8LExtraCostCombined = ExtraCostCombined;
  VP8LGetEntropyUnrefined = GetEntropyUnrefined;
  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
  VP8LHistogramAdd = HistogramAdd;
}

#else  // !WEBP_USE_MIPS32

WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)

#endif  // WEBP_USE_MIPS32