// This file is part of the ustl library, an STL implementation.
//
// Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.
//
// ualgobase.cc
//
// Copy and fill optimizations are here.
//

#ifndef NDEBUG	// Optimized code here. Asserts slow it down and are checked elsewhere.
#define NDEBUG
#endif

#include "ualgo.h"

#undef CPU_HAS_MMX

namespace ustl {

// Generic version for implementing fill_nX_fast on non-i386 architectures.
template <typename T> inline void stosv (T*& p, size_t n, T v)
    { while (n--) *p++ = v; }

#if defined(__i386__) || defined(__x86_64__)

//----------------------------------------------------------------------
// Copy functions
//----------------------------------------------------------------------

#if __GNUC__ >= 3
static inline void movsb_dir_up (void) __attribute__((always_inline));
static inline void movsb_dir_down (void) __attribute__((always_inline));
static inline void movsb (const void*& src, size_t nBytes, void*& dest) __attribute__((always_inline));
static inline void movsd (const void*& src, size_t nWords, void*& dest) __attribute__((always_inline));
#endif

static inline void movsb_dir_up (void) { asm volatile ("cld"); }
static inline void movsb_dir_down (void) { asm volatile ("std"); }

static inline void movsb (const void*& src, size_t nBytes, void*& dest)
{
    asm volatile ("rep;\n\tmovsb"
	: "=&S"(src), "=&D"(dest), "=&c"(nBytes)
	: "0"(src), "1"(dest), "2"(nBytes)
	: "memory");
}

static inline void movsd (const void*& src, size_t nWords, void*& dest)
{
    asm volatile ("rep;\n\tmovsl"
	: "=&S"(src), "=&D"(dest), "=&c"(nWords)
	: "0"(src), "1"(dest), "2"(nWords)
	: "memory");
}
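
// Illustrative sketch: with the direction flag cleared (cld), movsb above is
// roughly equivalent to the portable loop below, which also leaves src and
// dest advanced past the copied range; after "std" it walks downward instead.
//	const uint8_t* s = (const uint8_t*) src;
//	uint8_t* d = (uint8_t*) dest;
//	while (nBytes--) *d++ = *s++;	// copy nBytes bytes, one at a time
//	src = s; dest = d;
// movsd does the same one 32-bit word at a time (nWords iterations).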

template <> inline void stosv (uint8_t*& p, size_t n, uint8_t v)
{ asm volatile ("rep;\n\tstosb" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
template <> inline void stosv (uint16_t*& p, size_t n, uint16_t v)
{ asm volatile ("rep;\n\tstosw" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
template <> inline void stosv (uint32_t*& p, size_t n, uint32_t v)
{ asm volatile ("rep;\n\tstosl" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }

#if CPU_HAS_MMX
#define MMX_ALIGN	16U	// Data must be aligned on this grain
#define MMX_BS		32U	// Assembly routines copy data this many bytes at a time.

static inline void simd_block_copy (const void* src, void* dest) __attribute__((always_inline));
static inline void simd_block_store (uint8_t* dest) __attribute__((always_inline));
static inline void simd_block_cleanup (void) __attribute__((always_inline));

static inline void simd_block_copy (const void* src, void* dest)
{
    const char* csrc ((const char*) src);
    char* cdest ((char*) dest);
    #if CPU_HAS_SSE
    asm (
	"movaps\t%2, %%xmm0	\n\t"
	"movaps\t%3, %%xmm1	\n\t"
	"movntps\t%%xmm0, %0	\n\t"
	"movntps\t%%xmm1, %1"
	: "=m"(cdest[0]), "=m"(cdest[16])
	: "m"(csrc[0]), "m"(csrc[16])
	: "xmm0", "xmm1");
    #else
    asm (
	"movq	%4, %%mm0	\n\t"
	"movq	%5, %%mm1	\n\t"
	"movq	%6, %%mm2	\n\t"
	"movq	%7, %%mm3	\n\t"
	"movq	%%mm0, %0	\n\t"
	"movq	%%mm1, %1	\n\t"
	"movq	%%mm2, %2	\n\t"
	"movq	%%mm3, %3"
	: "=m"(cdest[0]), "=m"(cdest[8]), "=m"(cdest[16]), "=m"(cdest[24])
	: "m"(csrc[0]), "m"(csrc[8]), "m"(csrc[16]), "m"(csrc[24])
	: "mm0", "mm1", "mm2", "mm3", "st", "st(1)", "st(2)", "st(3)");
    #endif
}
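
// In effect, simd_block_copy moves one MMX_BS (32 byte) block from a 16-byte
// aligned source to a 16-byte aligned destination. Apart from the streaming
// (non-temporal) stores in the SSE path, an illustrative portable equivalent
// would simply be:
//	memcpy (dest, src, MMX_BS);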

static inline void simd_block_store (uint8_t* dest)
{
    #if CPU_HAS_SSE
    asm volatile (
	"movntq %%mm0, %0\n\t"
	"movntq %%mm0, %1\n\t"
	"movntq %%mm0, %2\n\t"
	"movntq %%mm0, %3"
	: "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24]));
    #else
    asm volatile (
	"movq %%mm0, %0	\n\t"
	"movq %%mm0, %1	\n\t"
	"movq %%mm0, %2	\n\t"
	"movq %%mm0, %3"
	: "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24]));
    #endif
}

static inline void simd_block_cleanup (void)
{
    #if !CPU_HAS_SSE
	simd::reset_mmx();
    #endif
    asm volatile ("sfence");
}

/// The fastest optimized raw memory copy.
void copy_n_fast (const void* src, size_t nBytes, void* dest)
{
    movsb_dir_up();
    size_t nHeadBytes = Align(uintptr_t(src), MMX_ALIGN) - uintptr_t(src);
    nHeadBytes = min (nHeadBytes, nBytes);
    movsb (src, nHeadBytes, dest);
    nBytes -= nHeadBytes;
    if (!(uintptr_t(dest) % MMX_ALIGN)) {
	const size_t nMiddleBlocks = nBytes / MMX_BS;
	for (uoff_t i = 0; i < nMiddleBlocks; ++ i) {
	    prefetch (advance (src, 512), 0, 0);
	    simd_block_copy (src, dest);
	    src = advance (src, MMX_BS);
	    dest = advance (dest, MMX_BS);
	}
	simd_block_cleanup();
	nBytes %= MMX_BS;
    }
    movsb (src, nBytes, dest);
}
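
// Usage sketch (illustrative only; note that the byte count is the middle argument):
//	char src [256], dst [256];
//	copy_n_fast (src, sizeof(src), dst);	// same result as memcpy (dst, src, sizeof(src))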
#endif // CPU_HAS_MMX

/// The fastest optimized backwards raw memory copy.
void copy_backward_fast (const void* first, const void* last, void* result)
{
    prefetch (first, 0, 0);
    prefetch (result, 1, 0);
    size_t nBytes (distance (first, last));
    movsb_dir_down();
    size_t nHeadBytes = uintptr_t(last) % 4;
    last = advance (last, -1);
    result = advance (result, -1);
    movsb (last, nHeadBytes, result);
    nBytes -= nHeadBytes;
    if (uintptr_t(result) % 4 == 3) {
	const size_t nMiddleBlocks = nBytes / 4;
	last = advance (last, -3);
	result = advance (result, -3);
	movsd (last, nMiddleBlocks, result);
	nBytes %= 4;
    }
    movsb (last, nBytes, result);
    movsb_dir_up();
}
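
// Usage sketch (illustrative only): as with std::copy_backward, 'result' is the
// end of the destination range, so right-shifting within an overlapping buffer
// is safe:
//	char buf [8] = "abcdef";
//	copy_backward_fast (buf, buf + 5, buf + 6);	// like memmove (buf + 1, buf, 5)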
#endif // __i386__

//----------------------------------------------------------------------
// Fill functions
//----------------------------------------------------------------------

#if CPU_HAS_MMX
template <typename T> inline void build_block (T) {}
template <> inline void build_block (uint8_t v)
{
    asm volatile (
	"movd %0, %%mm0\n\tpunpcklbw %%mm0, %%mm0\n\tpshufw $0, %%mm0, %%mm0"
	: : "g"(uint32_t(v)) : "mm0");
}
template <> inline void build_block (uint16_t v)
{
    asm volatile (
	"movd %0, %%mm0\n\tpshufw $0, %%mm0, %%mm0"
	: : "g"(uint32_t(v)) : "mm0");
}
template <> inline void build_block (uint32_t v)
{
    asm volatile (
	"movd %0, %%mm0\n\tpunpckldq %%mm0, %%mm0"
	: : "g"(uint32_t(v)) : "mm0");
}

static inline void simd_block_fill_loop (uint8_t*& dest, size_t count)
{
    prefetch (advance (dest, 512), 1, 0);
    for (uoff_t i = 0; i < count; ++ i, dest += MMX_BS)
	simd_block_store (dest);
    simd_block_cleanup();
    simd::reset_mmx();
}

template <typename T>
inline void fill_n_fast (T* dest, size_t count, T v)
{
    size_t nHead = (Align(uintptr_t(dest), MMX_ALIGN) - uintptr_t(dest)) / sizeof(T);	// elements until dest is MMX_ALIGN-aligned
    nHead = min (nHead, count);
    stosv (dest, nHead, v);
    count -= nHead;
    build_block (v);
    simd_block_fill_loop ((uint8_t*&) dest, count * sizeof(T) / MMX_BS);
    count %= MMX_BS / sizeof(T);	// count is in elements; MMX_BS is in bytes
    stosv (dest, count, v);
}

void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v)
    { fill_n_fast (dest, count, v); }
void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v)
    { fill_n_fast (dest, count, v); }
void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v)
    { fill_n_fast (dest, count, v); }
#else
void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v) { memset (dest, v, count); }
void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v) { stosv (dest, count, v); }
void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v) { stosv (dest, count, v); }
#endif // CPU_HAS_MMX
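
// Usage sketch (illustrative only): the fill_nX_fast functions take an element
// count, not a byte count:
//	uint32_t v [64];
//	fill_n32_fast (v, 64, 0xDEADBEEF);	// fills all 64 words (256 bytes) with the value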

/// Exchanges ranges [first, middle) and [middle, last)
void rotate_fast (void* first, void* middle, void* last)
{
#ifdef HAVE_ALLOCA_H
    const size_t half1 (distance (first, middle)), half2 (distance (middle, last));
    const size_t hmin (min (half1, half2));
    if (!hmin) {
	return;
    }
    void* buf = alloca (hmin);
    if (buf) {
	if (half2 < half1) {
	    copy_n_fast (middle, half2, buf);
	    copy_backward_fast (first, middle, last);
	    copy_n_fast (buf, half2, first);
	} else {
	    copy_n_fast (first, half1, buf);
	    copy_n_fast (middle, half2, first);
	    copy_n_fast (buf, half1, advance (first, half2));
	}
    } else
#else
    if (first == middle || middle == last) {
	return;
    }
#endif
    {
	char* f = (char*) first;
	char* m = (char*) middle;
	char* l = (char*) last;
	reverse (f, m);
	reverse (m, l);
	while (f != m && m != l)
	    iter_swap (f++, --l);
	reverse (f, (f == m ? l : m));
    }
}
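
// Usage sketch (illustrative only): rotate_fast works on raw bytes, like
// std::rotate applied to a char range:
//	char s [7] = "abcdef";
//	rotate_fast (s, s + 2, s + 6);	// s now holds "cdefab"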

#if __GNUC__ < 4
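/// \brief Returns the number of 1s in \p v in binary.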
size_t popcount (uint32_t v)
{
    const uint32_t w = v - ((v >> 1) & 0x55555555); // Algorithm from AMD optimization guide
    const uint32_t x = (w & 0x33333333) + ((w >> 2) & 0x33333333);
    return ((((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24);
}
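
// For example, popcount (uint32_t(0xAAAAAAAA)) == 16 and popcount (uint32_t(7)) == 3.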

#if HAVE_INT64_T
/// \brief Returns the number of 1s in \p v in binary.
size_t popcount (uint64_t v)
{
    v -= (v >> 1) & UINT64_C(0x5555555555555555);		// Algorithm from Wikipedia
    v = (v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333));
    v = (v + (v >> 4)) & UINT64_C(0x0F0F0F0F0F0F0F0F);
    return ((v * UINT64_C(0x0101010101010101)) >> 56);
}
#endif	// HAVE_INT64_T
#endif	// __GNUC__ < 4

} // namespace ustl