// Copyright 2008 Google Inc. All Rights Reserved.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//      http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "adler32memcpy.h"

// We use a (modified form of the) Adler-32 checksum algorithm instead
// of CRC, since Adler-32 is faster than CRC.
// (Comparison: http://guru.multimedia.cx/crc32-vs-adler32/)
// The modification: instead of treating the data in units of bytes,
// 32-bit words are taken as the unit, and two 64-bit checksums are
// kept in parallel (we could keep a single checksum, but two checksums
// make the code run faster).

// Adler-32 implementation:
//   Data is treated as 1-byte numbers, and
//   there are two 16-bit numbers, a and b.
//   Initialize a with 1 and b with 0.
//   for each data unit 'd'
//      a += d
//      b += a
//   checksum = b<<16 + a
//   (In the standard algorithm, a and b are reduced modulo the prime
//   65521 so that they never overflow 16 bits.)
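//
//   For illustration only (a sketch, not part of this build; the name
//   SimpleAdler32 is hypothetical), the classic byte-wise algorithm in C:
//
//     uint32 SimpleAdler32(const unsigned char *data, int len) {
//       uint32 a = 1, b = 0;
//       for (int i = 0; i < len; ++i) {
//         a = (a + data[i]) % 65521;  // 65521 = largest prime below 2^16.
//         b = (b + a) % 65521;
//       }
//       return (b << 16) + a;  // b in the high half, a in the low half.
//     }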
//
// Adler-64+64 implementation:
//   (applied in this code)
//   Data is treated as 32-bit numbers and the data is split into two
//   streams, hence the four checksum terms a1, b1, a2 and b2.
//   Initialize a1 and a2 with 1, b1 and b2 with 0.
//   add first dataunit to a1
//   add a1 to b1
//   add second dataunit to a1
//   add a1 to b1
//   add third dataunit to a2
//   add a2 to b2
//   add fourth dataunit to a2
//   add a2 to b2
//   ...
//   repeat the same sequence for each following group of 4 dataunits.
//   (CalculateAdlerChecksum below is a direct C implementation of this
//   recurrence.)
//
//   In the asm implementation, variable A lives in XMM6 and variable B
//   in XMM7 (a1 = lower 8 bytes of XMM6, b1 = lower 8 bytes of XMM7).

// Assumptions
// 1. size_in_bytes is a multiple of 16.
// 2. srcmem and dstmem are 16 byte aligned.
// 3. size_in_bytes is less than 2^19 bytes.

// Assumption 3 ensures that there is no overflow while the sums are
// accumulated (we could remove this assumption by reducing modulo a
// prime number whenever a sum is about to overflow, but that would be
// a very costly exercise).
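//
// A rough back-of-the-envelope bound: 2^19 bytes = 2^17 32-bit
// dataunits = 2^16 dataunits per stream. Each dataunit is < 2^32, so
// after n dataunits a < 1 + n * 2^32, and b, the running sum of the
// a values, stays below roughly n + (n * (n + 1) / 2) * 2^32, which is
// about 2^31 * 2^32 = 2^63 for n = 2^16. That still fits in a uint64.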

// Returns true if the checksums are equal.
bool AdlerChecksum::Equals(const AdlerChecksum &other) const {
  return ( (a1_ == other.a1_) && (a2_ == other.a2_) &&
           (b1_ == other.b1_) && (b2_ == other.b2_) );
}

// Returns a string representation of the Adler checksum.
string AdlerChecksum::ToHexString() const {
  char buffer[128];
  snprintf(buffer, sizeof(buffer), "%016llx %016llx %016llx %016llx",
           a1_, a2_, b1_, b2_);
  return string(buffer);
}

// Sets components of the Adler checksum.
void AdlerChecksum::Set(uint64 a1, uint64 a2, uint64 b1, uint64 b2) {
  a1_ = a1;
  a2_ = a2;
  b1_ = b1;
  b2_ = b2;
}

// Calculates Adler checksum for supplied data.
bool CalculateAdlerChecksum(uint64 *data64, unsigned int size_in_bytes,
                            AdlerChecksum *checksum) {
  // Use this data wrapper to access memory with 64bit read/write.
  datacast_t data;
  unsigned int count = size_in_bytes / sizeof(data);

  if (count > ((1U) << 16)) {
    // Size is too large. count is in 8-byte units, so this limits
    // size_in_bytes to 2^19 bytes = 512 KB (see assumption 3 above).
    return false;
  }
96
97  uint64 a1 = 1;
98  uint64 a2 = 1;
99  uint64 b1 = 0;
100  uint64 b2 = 0;
101
102  unsigned int i = 0;
  while (i < count) {
    // Process two 64-bit words per iteration: the first word feeds
    // stream 1 (a1/b1), the second feeds stream 2 (a2/b2).
    data.l64 = data64[i];
    a1 = a1 + data.l32.l;
    b1 = b1 + a1;
    a1 = a1 + data.l32.h;
    b1 = b1 + a1;
    i++;

    data.l64 = data64[i];
    a2 = a2 + data.l32.l;
    b2 = b2 + a2;
    a2 = a2 + data.l32.h;
    b2 = b2 + a2;
    i++;
  }
  checksum->Set(a1, a2, b1, b2);
  return true;
}
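
// A minimal usage sketch (illustrative only; src, dst and size are
// hypothetical and must satisfy the assumptions above):
//
//   AdlerChecksum expected, actual;
//   CalculateAdlerChecksum(src, size, &expected);  // Checksum the source.
//   AdlerMemcpyC(dst, src, size, &actual);         // Copy and checksum.
//   if (!expected.Equals(actual)) {
//     // Mismatch: data was corrupted somewhere between the two passes.
//   }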
// C implementation of Adler memory copy.
bool AdlerMemcpyC(uint64 *dstmem64, uint64 *srcmem64,
                  unsigned int size_in_bytes, AdlerChecksum *checksum) {
  // Use this data wrapper to access memory with 64bit read/write.
  datacast_t data;
  unsigned int count = size_in_bytes / sizeof(data);

  if (count > ((1U) << 16)) {
    // Size is too large. count is in 8-byte units, so this limits
    // size_in_bytes to 2^19 bytes = 512 KB (see assumption 3 above).
    return false;
  }

  uint64 a1 = 1;
  uint64 a2 = 1;
  uint64 b1 = 0;
  uint64 b2 = 0;

  unsigned int i = 0;
  while (i < count) {
    // Process two 64-bit words per iteration, copying as we sum: the
    // first word feeds stream 1 (a1/b1), the second feeds stream 2.
    data.l64 = srcmem64[i];
    a1 = a1 + data.l32.l;
    b1 = b1 + a1;
    a1 = a1 + data.l32.h;
    b1 = b1 + a1;
    dstmem64[i] = data.l64;
    i++;

    data.l64 = srcmem64[i];
    a2 = a2 + data.l32.l;
    b2 = b2 + a2;
    a2 = a2 + data.l32.h;
    b2 = b2 + a2;
    dstmem64[i] = data.l64;
    i++;
  }
  checksum->Set(a1, a2, b1, b2);
  return true;
}

// C implementation of Adler memory copy with some floating-point ops
// mixed in, attempting to warm up the CPU.
bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64,
                      unsigned int size_in_bytes, AdlerChecksum *checksum) {
  // Use this data wrapper to access memory with 64bit read/write.
  datacast_t data;
  unsigned int count = size_in_bytes / sizeof(data);

  if (count > ((1U) << 16)) {
    // Size is too large. count is in 8-byte units, so this limits
    // size_in_bytes to 2^19 bytes = 512 KB (see assumption 3 above).
    return false;
  }

  uint64 a1 = 1;
  uint64 a2 = 1;
  uint64 b1 = 0;
  uint64 b2 = 0;

  double a = 2.0 * static_cast<double>(srcmem64[0]);
  double b = 5.0 * static_cast<double>(srcmem64[0]);
  double c = 7.0 * static_cast<double>(srcmem64[0]);
  double d = 9.0 * static_cast<double>(srcmem64[0]);

  unsigned int i = 0;
  while (i < count) {
    // Process two 64-bit words per iteration: the first word feeds
    // stream 1 (a1/b1), the second feeds stream 2 (a2/b2).
    data.l64 = srcmem64[i];
    a1 = a1 + data.l32.l;
    b1 = b1 + a1;
    a1 = a1 + data.l32.h;
    b1 = b1 + a1;
    dstmem64[i] = data.l64;
    i++;

    // Warm the CPU up.
    a = a * b;
    b = b + c;

    data.l64 = srcmem64[i];
    a2 = a2 + data.l32.l;
    b2 = b2 + a2;
    a2 = a2 + data.l32.h;
    b2 = b2 + a2;
    dstmem64[i] = data.l64;
    i++;

    // Warm the CPU up.
    c = c * d;
    d = d + d;
  }

  // Warm the CPU up.
  d = a + b + c + d;
  if (d == 1.0) {
    // Reference the result so that it can't be discarded by the compiler.
    printf("Log: This will probably never happen.\n");
  }

  checksum->Set(a1, a2, b1, b2);
  return true;
}

// x86 (32- and 64-bit) SSE2 assembly implementation of fast and stressful
// Adler memory copy.
bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
                    unsigned int size_in_bytes, AdlerChecksum *checksum) {
// Use the assembly implementation where supported.
#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)

// A bit of tricky preprocessing makes the same inline asm build for
// both 32-bit and 64-bit x86.
#ifdef STRESSAPPTEST_CPU_I686  // Instead of coding both, x86...
#define rAX "%%eax"
#define rCX "%%ecx"
#define rDX "%%edx"
#define rBX "%%ebx"
#define rSP "%%esp"
#define rBP "%%ebp"
#define rSI "%%esi"
#define rDI "%%edi"
#endif

#ifdef STRESSAPPTEST_CPU_X86_64  // ...and x64, we use rXX macros.
#define rAX "%%rax"
#define rCX "%%rcx"
#define rDX "%%rdx"
#define rBX "%%rbx"
#define rSP "%%rsp"
#define rBP "%%rbp"
#define rSI "%%rsi"
#define rDI "%%rdi"
#endif

  // Elements 0 to 3 hold the checksum terms a1, a2, b1, b2 respectively;
  // they are filled in by the asm code.
  // Elements 4 and 5 are the mask the asm code ANDs into the XMM data to
  // clear 2 words of each XMM register (an XMM reg holds 4 32-bit words;
  // the AND sets word index 1 and word index 3 to zero).
  // Elements 6 and 7 are used to initialize a1 and a2 to 1.
  volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
      {0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1};

  if ((size_in_bytes >> 19) > 0) {
    // Size is too large. Must be less than 2^19 bytes = 512 KB.
    return false;
  }

  // Number of 32-bit words that are not covered by the 48-byte main
  // loop and are handled by the tail loop instead.
  uint32 remaining_words = (size_in_bytes % 48) / 4;

  // Since we move 48 bytes at a time, the number of iterations, i.e. the
  // value of the loop counter, is size_in_bytes / 48.
  uint32 num_of_48_byte_units = size_in_bytes / 48;
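
  // Worked example (hypothetical numbers, for clarity): with
  // size_in_bytes = 160, num_of_48_byte_units = 160 / 48 = 3 (covering
  // 144 bytes) and remaining_words = (160 % 48) / 4 = 16 / 4 = 4
  // leftover 32-bit words for the tail loop.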

  asm volatile (
      // The source address is in rSI/ESI, the destination address is in
      // rDI/EDI, and the counter is already in rCX/ECX.
      "cmp  $0, " rCX ";"   // Compare counter to zero.
      "jz END;"

      // Initialize both 64-bit halves of XMM6 with 1 (a1 = a2 = 1, taken
      // from elements 6 and 7 of checksum_arr) and XMM7 with 0
      // (b1 = b2 = 0).
      "prefetchnta  0(" rSI ");"
      "prefetchnta 64(" rSI ");"
      "movdqu   48(" rAX "), %%xmm6;"
      "xorps      %%xmm7, %%xmm7;"

      // Start of the loop which copies 48 bytes from source to dst each time.
      "TOP:\n"

      // Do 6 moves, each of 16 bytes, from srcmem to the XMM registers.
      // Only 2 of the 4 words in each XMM register will be used:
      // word index 0 and word index 2 (the andps below clears the rest).
      "movdqa   0(" rSI "), %%xmm0;"
      "movdqu   4(" rSI "), %%xmm1;"  // Be careful to use unaligned move here.
      "movdqa  16(" rSI "), %%xmm2;"
      "movdqu  20(" rSI "), %%xmm3;"
      "movdqa  32(" rSI "), %%xmm4;"
      "movdqu  36(" rSI "), %%xmm5;"

      // Move 3 * 16 bytes from the XMM registers to dstmem.
      // Note: this copy must be performed before the andps instructions
      // below, since they will modify the XMM registers.
      "movntdq %%xmm0,  0(" rDI ");"
      "movntdq %%xmm2, 16(" rDI ");"
      "movntdq %%xmm4, 32(" rDI ");"

      // Set word[1] and word[3] of XMM0 through XMM5 to zero, using the
      // mask stored at checksum_arr[4..5] (offset 32 from rAX).
      "andps 32(" rAX "), %%xmm0;"
      "andps 32(" rAX "), %%xmm1;"
      "andps 32(" rAX "), %%xmm2;"
      "andps 32(" rAX "), %%xmm3;"
      "andps 32(" rAX "), %%xmm4;"
      "andps 32(" rAX "), %%xmm5;"
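
      // (Illustration: if XMM0 held the 32-bit words {w0, w1, w2, w3},
      // after ANDing with the mask {0xffffffff, 0, 0xffffffff, 0} it
      // holds {w0, 0, w2, 0}, i.e. two zero-extended 64-bit lanes ready
      // for paddq.)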
      // Add XMM0 to XMM6 and then add XMM6 to XMM7.
      // Repeat this for XMM1, ..., XMM5.
      // Overflow (in XMM7) can occur only if there are more than 2^16
      // additions => more than 2^17 words => more than 2^19 bytes, so
      // overflow is possible only if size_in_bytes > 2^19.
      "paddq %%xmm0, %%xmm6;"
      "paddq %%xmm6, %%xmm7;"
      "paddq %%xmm1, %%xmm6;"
      "paddq %%xmm6, %%xmm7;"
      "paddq %%xmm2, %%xmm6;"
      "paddq %%xmm6, %%xmm7;"
      "paddq %%xmm3, %%xmm6;"
      "paddq %%xmm6, %%xmm7;"
      "paddq %%xmm4, %%xmm6;"
      "paddq %%xmm6, %%xmm7;"
      "paddq %%xmm5, %%xmm6;"
      "paddq %%xmm6, %%xmm7;"
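
      // (Net effect per 48-byte unit, noted for clarity: lane 0 of
      // XMM6/XMM7 accumulates a1/b1 from words 0, 1, 4, 5, 8, 9, and
      // lane 1 accumulates a2/b2 from words 2, 3, 6, 7, 10, 11. This is
      // the same word-to-stream assignment as in the C implementations
      // above.)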

      // Advance rSI and rDI by 48 bytes and decrement the counter by 1.
      "add $48, " rSI ";"
      "add $48, " rDI ";"
      "prefetchnta  0(" rSI ");"
      "prefetchnta 64(" rSI ");"
      "dec " rCX ";"
      "jnz TOP;"

      // Now only remaining_words 32-bit words are left (in rDX).
      // Loop over them, adding the first two words to a1 and the next two
      // to a2, just like the loop above; the only extra work is rechecking
      // rDX (= remaining_words) every time we add a chunk to a1/a2.
      "REM_IS_STILL_NOT_ZERO:\n"
      // By assumption remaining_words is a multiple of four, so there is
      // no issue unless it somehow drops below 4 words (16 bytes).
      "cmp $4, " rDX ";"
      // If for some weird reason remaining_words does become less than 4
      // but nonzero, break out of the loop and go to END.
      "jl END;"
      // Otherwise keep copying the data in chunks of 4 words at a time
      // until the whole remainder (< 48 bytes) is copied.
      "movdqa  0(" rSI "), %%xmm0;"    // Copy the next 4 words to XMM0/XMM1.

      "movdqa  0(" rSI "), %%xmm5;"    // Accomplish movdqu 4(%rSI) without
      "pshufd $0x39, %%xmm5, %%xmm1;"  // indexing off the memory boundary.
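
      // (Note on the shuffle: imm8 0x39 selects source words {1, 2, 3, 0},
      // so XMM1 = {w1, w2, w3, w0}. The AND below keeps words 0 and 2,
      // i.e. w1 and w3, which is exactly what a movdqu load from offset 4
      // would have contributed.)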
      "movntdq %%xmm0,  0(" rDI ");"   // Copy 4 words to destination.
      "andps  32(" rAX "), %%xmm0;"
      "andps  32(" rAX "), %%xmm1;"
      "paddq     %%xmm0, %%xmm6;"
      "paddq     %%xmm6, %%xmm7;"
      "paddq     %%xmm1, %%xmm6;"
      "paddq     %%xmm6, %%xmm7;"
      "add $16, " rSI ";"
      "add $16, " rDI ";"
      // Decrement rDX by 4 since rDX is the number of 32-bit
      // words left after considering all 48-byte units.
      "sub $4, " rDX ";"
      "jmp REM_IS_STILL_NOT_ZERO;"

      "END:\n"
      // Report the checksum values A and B by writing them back to
      // checksum_arr. Each XMM register holds two concatenated 64-bit
      // sums (a1/a2 in XMM6, b1/b2 in XMM7), so the result looks like an
      // Adler-128 in the sense that each data unit is 4 bytes rather
      // than 1 byte.
      "movdqa %%xmm6,   0(" rAX ");"
      "movdqa %%xmm7,  16(" rAX ");"
      "sfence;"

      // No output registers.
      :
      // Input registers.
      : "S" (srcmem64), "D" (dstmem64), "a" (checksum_arr),
        "c" (num_of_48_byte_units), "d" (remaining_words)
  );  // asm.

  if (checksum != NULL) {
    checksum->Set(checksum_arr[0], checksum_arr[1],
                  checksum_arr[2], checksum_arr[3]);
  }

  // Everything went fine, so return true (this does not mean that the
  // memory has no problems; it just means that the data was copied from
  // src to dst and the checksum was calculated successfully).
  return true;
#else
  // Fall back to the C implementation everywhere else.
  return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
#endif
}
