// This file is part of the ustl library, an STL implementation.
//
// Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.
//
// ualgobase.cc
//
// Copy and fill optimizations are here.
//

#ifndef NDEBUG	// Optimized code here. Asserts slow it down and are checked elsewhere.
#define NDEBUG
#endif

#include "ualgo.h"

#undef CPU_HAS_MMX	// Compiles out the MMX/SSE paths below.

namespace ustl {

// Generic version for implementing fill_nX_fast on non-i386 architectures.
template <typename T> inline void stosv (T*& p, size_t n, T v)
    { while (n--) *p++ = v; }

#if defined(__i386__) || defined(__x86_64__)

//----------------------------------------------------------------------
// Copy functions
//----------------------------------------------------------------------

#if __GNUC__ >= 3
static inline void movsb_dir_up (void) __attribute__((always_inline));
static inline void movsb_dir_down (void) __attribute__((always_inline));
static inline void movsb (const void*& src, size_t nBytes, void*& dest) __attribute__((always_inline));
static inline void movsd (const void*& src, size_t nWords, void*& dest) __attribute__((always_inline));
#endif

// Set the direction flag: movs/stos walk forward (cld) or backward (std).
static inline void movsb_dir_up (void) { asm volatile ("cld"); }
static inline void movsb_dir_down (void) { asm volatile ("std"); }

// rep movsb: copies nBytes bytes, advancing both pointers past the copied range.
static inline void movsb (const void*& src, size_t nBytes, void*& dest)
{
    asm volatile ("rep;\n\tmovsb"
	: "=&S"(src), "=&D"(dest), "=&c"(nBytes)
	: "0"(src), "1"(dest), "2"(nBytes)
	: "memory");
}

// rep movsl: copies nWords 4-byte words in the current direction.
static inline void movsd (const void*& src, size_t nWords, void*& dest)
{
    asm volatile ("rep;\n\tmovsl"
	: "=&S"(src), "=&D"(dest), "=&c"(nWords)
	: "0"(src), "1"(dest), "2"(nWords)
	: "memory");
}

// i386 specializations of stosv, using rep stos of the matching width.
template <> inline void stosv (uint8_t*& p, size_t n, uint8_t v)
{ asm volatile ("rep;\n\tstosb" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
template <> inline void stosv (uint16_t*& p, size_t n, uint16_t v)
{ asm volatile ("rep;\n\tstosw" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
template <> inline void stosv (uint32_t*& p, size_t n, uint32_t v)
{ asm volatile ("rep;\n\tstosl" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
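
// Usage sketch for stosv (illustrative comment only, not compiled in):
// the pointer is taken by reference and is left one past the last element
// written, mirroring how "rep stos" leaves EDI past the filled range.
//
//	uint32_t buf [64];
//	uint32_t* p = buf;
//	stosv (p, 64, 0u);	// fills buf with zeroes; p == buf + 64 afterwards
//
// fill_n_fast below relies on this to fill the unaligned head, then resume
// at the updated pointer for the SIMD middle and the tail.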

#if CPU_HAS_MMX
#define MMX_ALIGN	16U	// Data must be aligned on this grain
#define MMX_BS		32U	// Assembly routines copy data this many bytes at a time.

static inline void simd_block_copy (const void* src, void* dest) __attribute__((always_inline));
static inline void simd_block_store (uint8_t* dest) __attribute__((always_inline));
static inline void simd_block_cleanup (void) __attribute__((always_inline));

// Copies one MMX_BS-byte block from src to dest; both must be MMX_ALIGN-aligned.
static inline void simd_block_copy (const void* src, void* dest)
{
    const char* csrc ((const char*) src);
    char* cdest ((char*) dest);
    #if CPU_HAS_SSE
    asm (
	"movaps\t%2, %%xmm0	\n\t"
	"movaps\t%3, %%xmm1	\n\t"
	"movntps\t%%xmm0, %0	\n\t"
	"movntps\t%%xmm1, %1"
	: "=m"(cdest[0]), "=m"(cdest[16])
	: "m"(csrc[0]), "m"(csrc[16])
	: "xmm0", "xmm1");
    #else
    asm (
	"movq	%4, %%mm0	\n\t"
	"movq	%5, %%mm1	\n\t"
	"movq	%6, %%mm2	\n\t"
	"movq	%7, %%mm3	\n\t"
	"movq	%%mm0, %0	\n\t"
	"movq	%%mm1, %1	\n\t"
	"movq	%%mm2, %2	\n\t"
	"movq	%%mm3, %3"
	: "=m"(cdest[0]), "=m"(cdest[8]), "=m"(cdest[16]), "=m"(cdest[24])
	: "m"(csrc[0]), "m"(csrc[8]), "m"(csrc[16]), "m"(csrc[24])
	: "mm0", "mm1", "mm2", "mm3", "st", "st(1)", "st(2)", "st(3)");
    #endif
}

// Stores the block value prebuilt in mm0 (see build_block) into one MMX_BS-byte block at dest.
static inline void simd_block_store (uint8_t* dest)
{
    #if CPU_HAS_SSE
    asm volatile (
	"movntq %%mm0, %0\n\t"
	"movntq %%mm0, %1\n\t"
	"movntq %%mm0, %2\n\t"
	"movntq %%mm0, %3"
	: "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24]));
    #else
    asm volatile (
	"movq %%mm0, %0	\n\t"
	"movq %%mm0, %1	\n\t"
	"movq %%mm0, %2	\n\t"
	"movq %%mm0, %3"
	: "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24]));
    #endif
}

// Resets MMX state on non-SSE builds and drains the write-combining buffers.
static inline void simd_block_cleanup (void)
{
    #if !CPU_HAS_SSE
    simd::reset_mmx();
    #endif
    asm volatile ("sfence");
}

/// The fastest optimized raw memory copy.
void copy_n_fast (const void* src, size_t nBytes, void* dest)
{
    movsb_dir_up();
    // Copy the unaligned head byte-by-byte until src reaches MMX_ALIGN.
    size_t nHeadBytes = Align(uintptr_t(src), MMX_ALIGN) - uintptr_t(src);
    nHeadBytes = min (nHeadBytes, nBytes);
    movsb (src, nHeadBytes, dest);
    nBytes -= nHeadBytes;
    // Use the SIMD block copy only if dest also ended up aligned.
    if (!(uintptr_t(dest) % MMX_ALIGN)) {
	const size_t nMiddleBlocks = nBytes / MMX_BS;
	for (uoff_t i = 0; i < nMiddleBlocks; ++ i) {
	    prefetch (advance (src, 512), 0, 0);
	    simd_block_copy (src, dest);
	    src = advance (src, MMX_BS);
	    dest = advance (dest, MMX_BS);
	}
	simd_block_cleanup();
	nBytes %= MMX_BS;
    }
    // Copy whatever tail remains.
    movsb (src, nBytes, dest);
}
#endif // CPU_HAS_MMX
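
// Worked example of the head/middle/tail split in copy_n_fast (illustrative):
// copying nBytes=100 from src=0x1003 to dest=0x2003 gives
//	nHeadBytes = Align(0x1003,16) - 0x1003 = 0x1010 - 0x1003 = 13,
// after which src=0x1010, dest=0x2010 and nBytes=87. dest is now 16-aligned,
// so 87/32 = 2 SIMD blocks move 64 bytes and the final movsb copies the
// remaining 87%32 = 23 bytes. Had dest not come out aligned, the whole
// 87-byte remainder would go through movsb instead.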

/// The fastest optimized backwards raw memory copy.
void copy_backward_fast (const void* first, const void* last, void* result)
{
    prefetch (first, 0, 0);
    prefetch (result, 1, 0);
    size_t nBytes (distance (first, last));
    movsb_dir_down();
    // Copy backwards byte-by-byte until last is word-aligned.
    size_t nHeadBytes = uintptr_t(last) % 4;
    last = advance (last, -1);
    result = advance (result, -1);
    movsb (last, nHeadBytes, result);
    nBytes -= nHeadBytes;
    // Move the middle 4 bytes at a time if the write pointer now sits on
    // the last byte of a word; otherwise fall through to plain movsb.
    if (uintptr_t(result) % 4 == 3) {
	const size_t nMiddleBlocks = nBytes / 4;
	last = advance (last, -3);
	result = advance (result, -3);
	movsd (last, nMiddleBlocks, result);
	nBytes %= 4;
    }
    movsb (last, nBytes, result);
    movsb_dir_up();	// Restore the expected direction flag.
}
#endif // __i386__ || __x86_64__

//----------------------------------------------------------------------
// Fill functions
//----------------------------------------------------------------------

#if CPU_HAS_MMX
// Replicates v into all lanes of mm0 for use by simd_block_store.
template <typename T> inline void build_block (T) {}
template <> inline void build_block (uint8_t v)
{
    asm volatile (
	"movd %0, %%mm0\n\tpunpcklbw %%mm0, %%mm0\n\tpshufw $0, %%mm0, %%mm0"
	: : "g"(uint32_t(v)) : "mm0");
}
template <> inline void build_block (uint16_t v)
{
    asm volatile (
	"movd %0, %%mm0\n\tpshufw $0, %%mm0, %%mm0"
	: : "g"(uint32_t(v)) : "mm0");
}
template <> inline void build_block (uint32_t v)
{
    asm volatile (
	"movd %0, %%mm0\n\tpunpckldq %%mm0, %%mm0"
	: : "g"(uint32_t(v)) : "mm0");
}

// Stores count MMX_BS-byte blocks of the value in mm0, advancing dest.
static inline void simd_block_fill_loop (uint8_t*& dest, size_t count)
{
    prefetch (advance (dest, 512), 1, 0);
    for (uoff_t i = 0; i < count; ++ i, dest += MMX_BS)
	simd_block_store (dest);
    simd_block_cleanup();
    simd::reset_mmx();
}

template <typename T>
inline void fill_n_fast (T* dest, size_t count, T v)
{
    // Head: fill element-by-element until dest reaches MMX_ALIGN.
    // The byte gap to the aligned address is converted to elements.
    size_t nHead = (Align(uintptr_t(dest), MMX_ALIGN) - uintptr_t(dest)) / sizeof(T);
    nHead = min (nHead, count);
    stosv (dest, nHead, v);
    count -= nHead;
    build_block (v);
    simd_block_fill_loop ((uint8_t*&) dest, count * sizeof(T) / MMX_BS);
    count %= MMX_BS / sizeof(T);	// Elements, not bytes, remain for the tail.
    stosv (dest, count, v);
}

void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v)
    { fill_n_fast (dest, count, v); }
void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v)
    { fill_n_fast (dest, count, v); }
void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v)
    { fill_n_fast (dest, count, v); }
#else
void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v) { memset (dest, v, count); }
void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v) { stosv (dest, count, v); }
void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v) { stosv (dest, count, v); }
#endif // CPU_HAS_MMX
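
// Worked example of the fill arithmetic above (illustrative): filling
// count=1000 uint16_t values at dest=0x100A gives
//	nHead = (0x1010 - 0x100A) / 2 = 3 elements,
// leaving count=997. The SIMD loop then stores 997*2/32 = 62 blocks
// (992 elements), and the tail stosv writes the 997 % (32/2) = 5 elements
// that remain.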

/// Exchanges ranges [first, middle) and [middle, last)
void rotate_fast (void* first, void* middle, void* last)
{
#ifdef HAVE_ALLOCA_H
    const size_t half1 (distance (first, middle)), half2 (distance (middle, last));
    const size_t hmin (min (half1, half2));
    if (!hmin)
	return;
    // Buffer the smaller half on the stack and shift the larger one over it.
    void* buf = alloca (hmin);
    if (buf) {
	if (half2 < half1) {
	    copy_n_fast (middle, half2, buf);
	    copy_backward_fast (first, middle, last);
	    copy_n_fast (buf, half2, first);
	} else {
	    copy_n_fast (first, half1, buf);
	    copy_n_fast (middle, half2, first);
	    copy_n_fast (buf, half1, advance (first, half2));
	}
    } else
#else
    if (first == middle || middle == last)
	return;
#endif
    {
	// Fallback: rotate in place by reversing both halves, then the whole.
	char* f = (char*) first;
	char* m = (char*) middle;
	char* l = (char*) last;
	reverse (f, m);
	reverse (m, l);
	while (f != m && m != l)
	    iter_swap (f++, --l);
	reverse (f, (f == m ? l : m));
    }
}

#if __GNUC__ < 4
/// \brief Returns the number of 1s in \p v in binary.
size_t popcount (uint32_t v)
{
    const uint32_t w = v - ((v >> 1) & 0x55555555);	// Algorithm from AMD optimization guide
    const uint32_t x = (w & 0x33333333) + ((w >> 2) & 0x33333333);
    return ((((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24);
}

#if HAVE_INT64_T
/// \brief Returns the number of 1s in \p v in binary.
size_t popcount (uint64_t v)
{
    v -= (v >> 1) & UINT64_C(0x5555555555555555);	// Algorithm from Wikipedia
    v = (v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333));
    v = (v + (v >> 4)) & UINT64_C(0x0F0F0F0F0F0F0F0F);
    return ((v * UINT64_C(0x0101010101010101)) >> 56);
}
#endif // HAVE_INT64_T
#endif // __GNUC__ < 4

} // namespace ustl
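
// Worked example of the 32-bit popcount above (illustrative): for v = 0xF0,
//	w = 0xF0 - 0x50 = 0xA0	(2-bit fields now hold the pair counts 2,2,0,0)
//	x = 0x20 + 0x20 = 0x40	(4-bit fields hold the nibble counts 4,0)
// and ((0x44 & 0x0F0F0F0F) * 0x01010101) >> 24 = 0x04040404 >> 24 = 4;
// the multiply sums all four byte counts into the top byte.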