16bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---=== 26bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * 36bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * Permission is hereby granted, free of charge, to any person obtaining a copy 46bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * of this software and associated documentation files (the "Software"), to deal 56bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * in the Software without restriction, including without limitation the rights 66bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 76bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * copies of the Software, and to permit persons to whom the Software is 86bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * furnished to do so, subject to the following conditions: 96bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * 106bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * The above copyright notice and this permission notice shall be included in 116bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * all copies or substantial portions of the Software. 126bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * 136bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 146bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 156bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 166bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 176bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 186bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 196bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * THE SOFTWARE. 206bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker * 216bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker *===-----------------------------------------------------------------------=== 226bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker */ 236bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#ifndef __CLANG_CUDA_INTRINSICS_H__ 246bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#define __CLANG_CUDA_INTRINSICS_H__ 256bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#ifndef __CUDA__ 266bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#error "This file is for CUDA compilation only." 276bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#endif 286bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 296bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker// sm_30 intrinsics: __shfl_{up,down,xor}. 306bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 316bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#define __SM_30_INTRINSICS_H__ 326bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#define __SM_30_INTRINSICS_HPP__ 336bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 346bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 356bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 366bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#pragma push_macro("__MAKE_SHUFFLES") 376bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask) \ 386bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker inline __device__ int __FnName(int __in, int __offset, \ 396bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker int __width = warpSize) { \ 406bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return __IntIntrinsic(__in, __offset, \ 416bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ((warpSize - __width) << 8) | (__Mask)); \ 426bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker } \ 436bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker inline __device__ float __FnName(float __in, int __offset, \ 446bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker int __width = warpSize) { \ 456bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return __FloatIntrinsic(__in, __offset, \ 466bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ((warpSize - __width) << 8) | (__Mask)); \ 476bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker } \ 486bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker inline __device__ unsigned int __FnName(unsigned int __in, int __offset, \ 496bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker int __width = warpSize) { \ 506bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return static_cast<unsigned int>( \ 516bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ::__FnName(static_cast<int>(__in), __offset, __width)); \ 526bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker } \ 536bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker inline __device__ long long __FnName(long long __in, int __offset, \ 546bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker int __width = warpSize) { \ 556bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker struct __Bits { \ 566bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker int __a, __b; \ 576bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker }; \ 586bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker _Static_assert(sizeof(__in) == sizeof(__Bits)); \ 596bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \ 606bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker __Bits __tmp; \ 616bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker memcpy(&__in, &__tmp, sizeof(__in)); \ 626bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \ 636bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \ 646bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker long long __out; \ 656bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker memcpy(&__out, &__tmp, sizeof(__tmp)); \ 666bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return __out; \ 676bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker } \ 686bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker inline __device__ unsigned long long __FnName( \ 696bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker unsigned long long __in, int __offset, int __width = warpSize) { \ 706bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return static_cast<unsigned long long>( \ 716bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ::__FnName(static_cast<unsigned long long>(__in), __offset, __width)); \ 726bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker } \ 736bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker inline __device__ double __FnName(double __in, int __offset, \ 746bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker int __width = warpSize) { \ 756bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker long long __tmp; \ 766bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker _Static_assert(sizeof(__tmp) == sizeof(__in)); \ 776bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker memcpy(&__tmp, &__in, sizeof(__in)); \ 786bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker __tmp = ::__FnName(__tmp, __offset, __width); \ 796bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker double __out; \ 806bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker memcpy(&__out, &__tmp, sizeof(__out)); \ 816bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return __out; \ 826bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker } 836bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 846bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f); 856bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >= 866bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker// maxLane. 876bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0); 886bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f); 896bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f); 906bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 916bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#pragma pop_macro("__MAKE_SHUFFLES") 926bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 936bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 946bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 956bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}. 966bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 976bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker// Prevent the vanilla sm_32 intrinsics header from being included. 986bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#define __SM_32_INTRINSICS_H__ 996bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#define __SM_32_INTRINSICS_HPP__ 1006bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 1016bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 1026bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 1036bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); } 1046bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); } 1056bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); } 1066bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); } 1076bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ long long __ldg(const long long *ptr) { 1086bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return __nvvm_ldg_ll(ptr); 1096bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 1106bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ unsigned char __ldg(const unsigned char *ptr) { 1116bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return __nvvm_ldg_uc(ptr); 1126bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 1136bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ unsigned short __ldg(const unsigned short *ptr) { 1146bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return __nvvm_ldg_us(ptr); 1156bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 1166bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ unsigned int __ldg(const unsigned int *ptr) { 1176bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return __nvvm_ldg_ui(ptr); 1186bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 1196bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ unsigned long __ldg(const unsigned long *ptr) { 1206bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return __nvvm_ldg_ul(ptr); 1216bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 1226bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ unsigned long long __ldg(const unsigned long long *ptr) { 1236bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return __nvvm_ldg_ull(ptr); 1246bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 1256bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); } 1266bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); } 1276bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 1286bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ char2 __ldg(const char2 *ptr) { 1296bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef char c2 __attribute__((ext_vector_type(2))); 1306bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker // We can assume that ptr is aligned at least to char2's alignment, but the 1316bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker // load will assume that ptr is aligned to char2's alignment. This is only 1326bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker // safe if alignof(c2) <= alignof(char2). 1336bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr)); 1346bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker char2 ret; 1356bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 1366bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 1376bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 1386bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 1396bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ char4 __ldg(const char4 *ptr) { 1406bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef char c4 __attribute__((ext_vector_type(4))); 1416bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr)); 1426bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker char4 ret; 1436bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 1446bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 1456bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.z = rv[2]; 1466bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.w = rv[3]; 1476bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 1486bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 1496bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ short2 __ldg(const short2 *ptr) { 1506bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef short s2 __attribute__((ext_vector_type(2))); 1516bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr)); 1526bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker short2 ret; 1536bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 1546bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 1556bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 1566bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 1576bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ short4 __ldg(const short4 *ptr) { 1586bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef short s4 __attribute__((ext_vector_type(4))); 1596bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr)); 1606bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker short4 ret; 1616bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 1626bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 1636bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.z = rv[2]; 1646bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.w = rv[3]; 1656bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 1666bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 1676bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ int2 __ldg(const int2 *ptr) { 1686bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef int i2 __attribute__((ext_vector_type(2))); 1696bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr)); 1706bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker int2 ret; 1716bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 1726bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 1736bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 1746bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 1756bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ int4 __ldg(const int4 *ptr) { 1766bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef int i4 __attribute__((ext_vector_type(4))); 1776bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr)); 1786bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker int4 ret; 1796bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 1806bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 1816bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.z = rv[2]; 1826bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.w = rv[3]; 1836bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 1846bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 1856bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ longlong2 __ldg(const longlong2 *ptr) { 1866bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef long long ll2 __attribute__((ext_vector_type(2))); 1876bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr)); 1886bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker longlong2 ret; 1896bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 1906bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 1916bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 1926bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 1936bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 1946bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ uchar2 __ldg(const uchar2 *ptr) { 1956bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef unsigned char uc2 __attribute__((ext_vector_type(2))); 1966bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr)); 1976bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker uchar2 ret; 1986bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 1996bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 2006bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 2016bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 2026bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ uchar4 __ldg(const uchar4 *ptr) { 2036bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef unsigned char uc4 __attribute__((ext_vector_type(4))); 2046bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr)); 2056bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker uchar4 ret; 2066bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 2076bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 2086bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.z = rv[2]; 2096bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.w = rv[3]; 2106bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 2116bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 2126bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ ushort2 __ldg(const ushort2 *ptr) { 2136bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef unsigned short us2 __attribute__((ext_vector_type(2))); 2146bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr)); 2156bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ushort2 ret; 2166bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 2176bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 2186bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 2196bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 2206bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ ushort4 __ldg(const ushort4 *ptr) { 2216bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef unsigned short us4 __attribute__((ext_vector_type(4))); 2226bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr)); 2236bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ushort4 ret; 2246bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 2256bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 2266bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.z = rv[2]; 2276bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.w = rv[3]; 2286bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 2296bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 2306bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ uint2 __ldg(const uint2 *ptr) { 2316bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef unsigned int ui2 __attribute__((ext_vector_type(2))); 2326bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr)); 2336bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker uint2 ret; 2346bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 2356bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 2366bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 2376bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 2386bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ uint4 __ldg(const uint4 *ptr) { 2396bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef unsigned int ui4 __attribute__((ext_vector_type(4))); 2406bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr)); 2416bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker uint4 ret; 2426bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 2436bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 2446bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.z = rv[2]; 2456bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.w = rv[3]; 2466bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 2476bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 2486bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) { 2496bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef unsigned long long ull2 __attribute__((ext_vector_type(2))); 2506bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr)); 2516bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ulonglong2 ret; 2526bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 2536bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 2546bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 2556bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 2566bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 2576bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ float2 __ldg(const float2 *ptr) { 2586bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef float f2 __attribute__((ext_vector_type(2))); 2596bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr)); 2606bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker float2 ret; 2616bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 2626bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 2636bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 2646bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 2656bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ float4 __ldg(const float4 *ptr) { 2666bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef float f4 __attribute__((ext_vector_type(4))); 2676bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr)); 2686bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker float4 ret; 2696bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 2706bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 2716bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.z = rv[2]; 2726bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.w = rv[3]; 2736bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 2746bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 2756bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ double2 __ldg(const double2 *ptr) { 2766bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker typedef double d2 __attribute__((ext_vector_type(2))); 2776bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr)); 2786bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker double2 ret; 2796bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.x = rv[0]; 2806bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker ret.y = rv[1]; 2816bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 2826bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 2836bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 2846bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker// TODO: Implement these as intrinsics, so the backend can work its magic on 2856bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker// these. Alternatively, we could implement these as plain C and try to get 2866bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker// llvm to recognize the relevant patterns. 2876bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32, 2886bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker unsigned shiftWidth) { 2896bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker unsigned result; 2906bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker asm("shf.l.wrap.b32 %0, %1, %2, %3;" 2916bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker : "=r"(result) 2926bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker : "r"(low32), "r"(high32), "r"(shiftWidth)); 2936bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return result; 2946bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 2956bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32, 2966bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker unsigned shiftWidth) { 2976bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker unsigned result; 2986bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker asm("shf.l.clamp.b32 %0, %1, %2, %3;" 2996bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker : "=r"(result) 3006bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker : "r"(low32), "r"(high32), "r"(shiftWidth)); 3016bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return result; 3026bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 3036bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32, 3046bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker unsigned shiftWidth) { 3056bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker unsigned result; 3066bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker asm("shf.r.wrap.b32 %0, %1, %2, %3;" 3076bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker : "=r"(result) 3086bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker : "r"(low32), "r"(high32), "r"(shiftWidth)); 3096bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return result; 3106bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 3116bdbd720989797e8a53237ef3ef213c4114f869gitbuildkickerinline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32, 3126bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker unsigned shiftWidth) { 3136bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker unsigned ret; 3146bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker asm("shf.r.clamp.b32 %0, %1, %2, %3;" 3156bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker : "=r"(ret) 3166bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker : "r"(low32), "r"(high32), "r"(shiftWidth)); 3176bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker return ret; 3186bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker} 3196bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 3206bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 3216bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker 3226bdbd720989797e8a53237ef3ef213c4114f869gitbuildkicker#endif // defined(__CLANG_CUDA_INTRINSICS_H__) 323