// memcpy_perf: sweeps buffer sizes from size_start to size_end on a
// logarithmic scale and reports the throughput of one of three externally
// defined, non-inlined kernels (memcpy, memset, or a summing read).
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <memory>
#include <numeric>
#include <stdlib.h>
#include <string>
#include <vector>

using namespace std;

// Smallest and largest buffer sizes (in bytes) covered by the sweep.
const size_t size_start = 64;
const size_t size_end = 16 * (1ull << 20);
// Number of logarithmic steps between size_start and size_end.
const size_t samples = 2048;
// Approximate number of bytes processed per measured size; the iteration
// count for each size is size_per_test / cur_size.
size_t size_per_test = 64 * (1ull << 20);
// Accumulates the results of the sum benchmark so they are stored to a global
// after every measurement.
size_t tot_sum = 0;

// Kernels under test; only declared here, their definitions are not part of
// this file.
void __attribute__((noinline)) memcpy_noinline(void *dst, void *src, size_t size);
void __attribute__((noinline)) memset_noinline(void *dst, int value, size_t size);
uint64_t __attribute__((noinline)) sum(volatile void *src, size_t size);

// Which kernel the sweep measures.
enum BenchType {
    MemcpyBench,
    MemsetBench,
    SumBench,
};

// Maps the first command-line argument to a benchmark type.
// Unrecognised options fall back to the memcpy benchmark.
static BenchType parse_bench_type(const string &option)
{
    if (option == "--memset")
        return MemsetBench;
    if (option == "--sum")
        return SumBench;
    return MemcpyBench;
}

// Entry point.
//   argv[1]: --memcpy | --memset | --sum (anything else selects memcpy).
// Prints one line per measured size with the size, throughput and iteration
// count. Returns 0 in all cases, including when only the usage text is shown.
int main(int argc, char *argv[])
{
    if (argc <= 1) {
        cerr << "memcpy_perf [--memcpy|--memset|--sum]" << endl;
        return 0;
    }
    const BenchType type = parse_bench_type(argv[1]);

    // dst is intentionally left uninitialised here; every benchmark path
    // writes the first cur_size bytes of dst before reading them.
    unique_ptr<uint8_t[]> src(new uint8_t[size_end]);
    unique_ptr<uint8_t[]> dst(new uint8_t[size_end]);
    memset(src.get(), 1, size_end);

    // Fill value handed to memset_noinline. The cast makes the narrowing of
    // the unsigned literal explicit; the value passed is the same one the
    // implicit conversion produced.
    const int fill_value = static_cast<int>(0xdeadbeef);

    const double start_pow = log10(static_cast<double>(size_start));
    const double end_pow = log10(static_cast<double>(size_end));
    const double pow_inc = (end_pow - start_pow) / samples;

    // Drive the sweep with an integer counter: accumulating pow_inc in a
    // double lets rounding error decide whether the final (largest) size is
    // measured at all.
    for (size_t step = 0; step <= samples; ++step) {
        const double cur_pow = start_pow + step * pow_inc;

        // pow(10, log10(x)) can land a hair below or above x, so clamp the
        // truncated size to the range the buffers actually cover.
        size_t cur_size = static_cast<size_t>(pow(10.0, cur_pow));
        cur_size = min(max(cur_size, size_start), size_end);

        const size_t iter_per_size = size_per_test / cur_size;

        // steady_clock is monotonic, which is what an interval measurement
        // needs; high_resolution_clock may alias a non-steady clock.
        chrono::steady_clock::time_point copy_start, copy_end;

        // run benchmark
        switch (type) {
        case MemsetBench: {
            // Warm-up: populate dst from src, then run the kernel once
            // untimed. (Copying in the other direction would overwrite the
            // prepared src buffer with uninitialised data.)
            memcpy_noinline(dst.get(), src.get(), cur_size);
            memset_noinline(dst.get(), fill_value, cur_size);
            copy_start = chrono::steady_clock::now();
            for (size_t i = 0; i < iter_per_size; i++) {
                memset_noinline(dst.get(), fill_value, cur_size);
            }
            copy_end = chrono::steady_clock::now();
            break;
        }
        case MemcpyBench: {
            // Warm-up in both directions before timing dst <- src.
            memcpy_noinline(dst.get(), src.get(), cur_size);
            memcpy_noinline(src.get(), dst.get(), cur_size);
            copy_start = chrono::steady_clock::now();
            for (size_t i = 0; i < iter_per_size; i++) {
                memcpy_noinline(dst.get(), src.get(), cur_size);
            }
            copy_end = chrono::steady_clock::now();
            break;
        }
        case SumBench: {
            uint64_t s = 0;
            // Untimed warm-up pass.
            s += sum(src.get(), cur_size);
            copy_start = chrono::steady_clock::now();
            for (size_t i = 0; i < iter_per_size; i++) {
                s += sum(src.get(), cur_size);
            }
            copy_end = chrono::steady_clock::now();
            // Publish the result to a global so the calls have an observable
            // effect.
            tot_sum += s;
            break;
        }
        }

        const double elapsed_ns = static_cast<double>(
            chrono::duration_cast<chrono::nanoseconds>(copy_end - copy_start).count());
        const double ns_per_copy = elapsed_ns / double(iter_per_size);
        // Note: the divisor is 2^30, so the figure is in GiB/s even though
        // the label printed below says GB/s.
        double gb_per_sec = ((double)cur_size / (1ull << 30)) / (ns_per_copy / 1.0E9);
        // A copy both reads and writes cur_size bytes, so count the traffic
        // twice.
        if (type == MemcpyBench)
            gb_per_sec *= 2.0;
        cout << "size: " << cur_size << ", perf: " << gb_per_sec << "GB/s, iter: " << iter_per_size << endl;
    }
    return 0;
}