1d012387afef0ba02185ebe27bc6bb15551912e92Havoc Pennington/* 270bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker * Copyright 2014 Google Inc. 370bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker * 470bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker * Use of this source code is governed by a BSD-style license that can be 570bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker * found in the LICENSE file. 670bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker */ 770bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker 870bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker#ifndef SkHalf_DEFINED 970bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker#define SkHalf_DEFINED 1070bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker 1170bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker#include "SkNx.h" 1270bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker#include "SkTypes.h" 1370bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker 1470bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker// 16-bit floating point value 1570bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker// format is 1 bit sign, 5 bits exponent, 10 bits mantissa 1670bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker// only used for storage 1770bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habackertypedef uint16_t SkHalf; 1870bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker 1970bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker#define SK_HalfMin 0x0400 // 2^-24 (minimum positive normal value) 2070bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker#define SK_HalfMax 0x7bff // 65504 215baf2f856a9c6625993234855b07680da1c8916fTobias Mueller#define SK_HalfEpsilon 0x1400 // 2^-10 2270bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker 2370bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker// convert between half and single precision floating point 2470bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habackerfloat SkHalfToFloat(SkHalf h); 25dbecdeabb20e0ce11121819c63373f0afba57c58Marcus BrinkmannSkHalf SkFloatToHalf(float f); 26dbecdeabb20e0ce11121819c63373f0afba57c58Marcus Brinkmann 2770bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker// Convert between half and single precision floating point, but pull any dirty 2870bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker// trick we can to make it faster as long as it's correct enough for values in [0,1]. 2970bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habackerstatic inline Sk4f SkHalfToFloat_01(uint64_t); 3070bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habackerstatic inline uint64_t SkFloatToHalf_01(const Sk4f&); 3170bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker 3270bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker// ~~~~~~~~~~~ impl ~~~~~~~~~~~~~~ // 3370bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker 3470bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker// Like the serial versions in SkHalf.cpp, these are based on 3584401ec697281090dc2d02c45504c6fdd174f5ddChristian Ehrlicher// https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ 3670bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker 373222b64d4a5e333ad3f95374a17fc4ecd6bc1431Romain Pokrzywka// GCC 4.9 lacks the intrinsics to use ARMv8 f16<->f32 instructions, so we use inline assembly. 3870bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker 3970bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habackerstatic inline Sk4f SkHalfToFloat_01(uint64_t hs) { 4070bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker#if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64) 416e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann float32x4_t fs; 42d9b9b45554b43d8c41eb1b8bd7c0421620cddcd4Ralf Habacker asm ("fmov %d[fs], %[hs] \n" // vcreate_f16(hs) 436e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann "fcvtl %[fs].4s, %[fs].4h \n" // vcvt_f32_f16(...) 4484401ec697281090dc2d02c45504c6fdd174f5ddChristian Ehrlicher : [fs] "=w" (fs) // =w: write-only NEON register 4570bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker : [hs] "r" (hs)); // r: read-only 64-bit general register 463404bb7238f2bb6dd5d678bc8f782810f3079241Marcus Brinkmann return fs; 473404bb7238f2bb6dd5d678bc8f782810f3079241Marcus Brinkmann 483404bb7238f2bb6dd5d678bc8f782810f3079241Marcus Brinkmann#elif !defined(SKNX_NO_SIMD) && defined(SK_ARM_HAS_NEON) 493404bb7238f2bb6dd5d678bc8f782810f3079241Marcus Brinkmann // NEON makes this pretty easy: 503404bb7238f2bb6dd5d678bc8f782810f3079241Marcus Brinkmann // - denormals are 10-bit * 2^-14 == 24-bit fixed point; 513404bb7238f2bb6dd5d678bc8f782810f3079241Marcus Brinkmann // - handle normals the same way as in SSE: align mantissa, then rebias exponent. 523404bb7238f2bb6dd5d678bc8f782810f3079241Marcus Brinkmann uint32x4_t h = vmovl_u16(vcreate_u16(hs)), 53378053ba594cca44e1bc9e069eab91b0a0954308Ralf Habacker is_denorm = vcltq_u32(h, vdupq_n_u32(1<<10)); 5470bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker float32x4_t denorm = vcvtq_n_f32_u32(h, 24), 5570bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker norm = vreinterpretq_f32_u32(vaddq_u32(vshlq_n_u32(h, 13), 5670bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker vdupq_n_u32((127-15) << 23))); 5770bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker return vbslq_f32(is_denorm, denorm, norm); 5870bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker 590314e701c812565bd7bdac548cadfea5d310d66cMatt McCutchen#elif !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 6070bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker // If our input is a normal 16-bit float, things are pretty easy: 6170bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker // - shift left by 13 to put the mantissa in the right place; 6270bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker // - the exponent is wrong, but it just needs to be rebiased; 6370bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker // - re-bias the exponent from 15-bias to 127-bias by adding (127-15). 64d372907895ffed3a2df06146d5dcc8601eab04dcRalf Habacker 650314e701c812565bd7bdac548cadfea5d310d66cMatt McCutchen // If our input is denormalized, we're going to do the same steps, plus a few more fix ups: 660314e701c812565bd7bdac548cadfea5d310d66cMatt McCutchen // - the input is h = K*2^-14, for some 10-bit fixed point K in [0,1); 6770bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker // - by shifting left 13 and adding (127-15) to the exponent, we constructed the float value 6870bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker // 2^-15*(1+K); 6970bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker // - we'd need to subtract 2^-15 and multiply by 2 to get back to K*2^-14, or equivallently 7070bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker // multiply by 2 then subtract 2^-14. 7170bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker // 7270bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker // - We'll work that multiply by 2 into the rebias, by adding 1 more to the exponent. 7370bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker // - Conveniently, this leaves that rebias constant 2^-14, exactly what we want to subtract. 7470bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker 7570bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker __m128i h = _mm_unpacklo_epi16(_mm_loadl_epi64((const __m128i*)&hs), _mm_setzero_si128()); 7670bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker const __m128i is_denorm = _mm_cmplt_epi32(h, _mm_set1_epi32(1<<10)); 7770bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker 7870bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker __m128i rebias = _mm_set1_epi32((127-15) << 23); 79459c19b9dd157cd360c3082d015a4c5ae4689cf8Tor Lillqvist rebias = _mm_add_epi32(rebias, _mm_and_si128(is_denorm, _mm_set1_epi32(1<<23))); 8070bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker 8170bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker __m128i f = _mm_add_epi32(_mm_slli_epi32(h, 13), rebias); 8270bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker return _mm_sub_ps(_mm_castsi128_ps(f), 8370bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker _mm_castsi128_ps(_mm_and_si128(is_denorm, rebias))); 8470bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker#else 856e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann float fs[4]; 866e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann for (int i = 0; i < 4; i++) { 876e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann fs[i] = SkHalfToFloat(hs >> (i*16)); 886e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann } 896e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann return Sk4f::Load(fs); 906e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann#endif 9170bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker} 9270bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker 936e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmannstatic inline uint64_t SkFloatToHalf_01(const Sk4f& fs) { 946e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann uint64_t r; 956e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann#if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64) 966e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann float32x4_t vec = fs.fVec; 976e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann asm ("fcvtn %[vec].4h, %[vec].4s \n" // vcvt_f16_f32(vec) 986e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann "fmov %[r], %d[vec] \n" // vst1_f16(&r, ...) 9970bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker : [r] "=r" (r) // =r: write-only 64-bit general register 1006e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann , [vec] "+w" (vec)); // +w: read-write NEON register 1016e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann 1026e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann// TODO: ARMv7 NEON float->half? 1036e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann 1046e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann#elif !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 10570bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker // Scale down from 127-bias to 15-bias, then cut off bottom 13 mantissa bits. 10670bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker // This doesn't round, so it can be 1 bit too small. 10770bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker const __m128 rebias = _mm_castsi128_ps(_mm_set1_epi32((127 - (127-15)) << 23)); 1086e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann __m128i h = _mm_srli_epi32(_mm_castps_si128(_mm_mul_ps(fs.fVec, rebias)), 13); 10970bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker _mm_storel_epi64((__m128i*)&r, _mm_packs_epi32(h,h)); 1106e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann 1116e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann#else 1126e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann SkHalf hs[4]; 1136e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann for (int i = 0; i < 4; i++) { 11470bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker hs[i] = SkFloatToHalf(fs[i]); 11570bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker } 11670bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker r = (uint64_t)hs[3] << 48 1176e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann | (uint64_t)hs[2] << 32 1186e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann | (uint64_t)hs[1] << 16 1196e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann | (uint64_t)hs[0] << 0; 1206e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann#endif 12170bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker return r; 1226e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann} 1236e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann 12470bfc74e54ac8a9a93885710cd8350d1a58b3406Ralf Habacker#endif 1256e214b5b3c283798b5743b4ebf7c9ec466fe3667Marcus Brinkmann