/*
 * Copyright 2014 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7c9cf2eb0a9b6694d0fda3dbc313844955db60820Adam Powell
8c9cf2eb0a9b6694d0fda3dbc313844955db60820Adam Powell#ifndef SkMath_opts_SSE2_DEFINED
9c9cf2eb0a9b6694d0fda3dbc313844955db60820Adam Powell#define SkMath_opts_SSE2_DEFINED
10c9cf2eb0a9b6694d0fda3dbc313844955db60820Adam Powell
11c9cf2eb0a9b6694d0fda3dbc313844955db60820Adam Powell#include <emmintrin.h>
12c9cf2eb0a9b6694d0fda3dbc313844955db60820Adam Powell
// SSE2 has no _mm_div_epi32(), so packed 32-bit integer division is emulated
// with a single-precision float divide.
//
// a: four int32 numerators, one per lane.
// b: four int32 denominators, one per lane.
// Returns the four quotients a / b, converted back to int32 with truncation.
//
// Both operands are converted to float before dividing, so callers must make
// sure a and b do not exceed what a float can represent exactly; larger
// magnitudes lose precision in the conversion.
static inline __m128i shim_mm_div_epi32(const __m128i& a, const __m128i& b) {
    const __m128 numerators = _mm_cvtepi32_ps(a);
    const __m128 denominators = _mm_cvtepi32_ps(b);
    const __m128 quotients = _mm_div_ps(numerators, denominators);
    // The "tt" conversion truncates instead of using the current rounding mode.
    return _mm_cvttps_epi32(quotients);
}
20c9cf2eb0a9b6694d0fda3dbc313844955db60820Adam Powell
// Four-lane SSE2 counterpart of SkSqrtBits; the portable version is in
// SkMath.cpp.
//
// Runs a bit-by-bit square root independently on each 32-bit lane of x. Every
// pass moves the top two bits of the remaining input into the running
// remainder and decides one more bit of the root. The loop is a do/while on
// --count >= 0, so it executes count + 1 passes; count == 15 therefore
// consumes all 32 input bits.
//
// x:     four 32-bit inputs, one per lane.
// count: number of passes minus one.
// Returns the four roots, one per lane.
//
// NOTE(review): lanes are compared with the signed _mm_cmplt_epi32, so the
// result is only meaningful while the remainder and the trial divisor stay
// below 2^31 -- confirm this covers the count values callers pass.
static inline __m128i SkSqrtBits_SSE2(const __m128i& x, int count) {
    const __m128i one128 = _mm_set1_epi32(1);

    __m128i root = _mm_setzero_si128();
    __m128i remHi = _mm_setzero_si128();
    __m128i remLo = x;

    do {
        root = _mm_slli_epi32(root, 1);

        // Shift the next two input bits from remLo into the remainder.
        remHi = _mm_or_si128(_mm_slli_epi32(remHi, 2),
                             _mm_srli_epi32(remLo, 30));
        remLo = _mm_slli_epi32(remLo, 2);

        // Trial divisor: 2 * root + 1.
        const __m128i testDiv = _mm_add_epi32(_mm_slli_epi32(root, 1), one128);

        // All-ones in every lane whose remainder is smaller than the trial
        // divisor; those lanes keep remHi and root unchanged.
        const __m128i keep = _mm_cmplt_epi32(remHi, testDiv);

        // Branch-free select: kept lanes pass through, the others subtract the
        // trial divisor from the remainder and set the new low bit of the root.
        const __m128i reducedRem = _mm_sub_epi32(remHi, testDiv);
        const __m128i bumpedRoot = _mm_add_epi32(root, one128);
        remHi = _mm_or_si128(_mm_and_si128(keep, remHi),
                             _mm_andnot_si128(keep, reducedRem));
        root = _mm_or_si128(_mm_and_si128(keep, root),
                            _mm_andnot_si128(keep, bumpedRoot));
    } while (--count >= 0);

    return root;
}
50
51#endif // SkMath_opts_SSE2_DEFINED
52