1#include <iostream>
2#include <chrono>
3#include <vector>
4#include <algorithm>
5#include <numeric>
6#include <stdlib.h>
7#include <memory>
8#include <cmath>
9#include <string>
10
11using namespace std;
12
// Smallest buffer size, in bytes, visited by the size sweep in main().
const size_t size_start{64};
// Largest buffer size, in bytes (16 MiB); main() also allocates its buffers at this size.
const size_t size_end{(1ull << 20) * 16};
// Number of logarithmic steps the sweep takes between size_start and size_end.
const size_t samples{2048};
// Byte budget per sampled size (64 MiB); main() divides it by the current
// size to obtain the number of timed iterations.
size_t size_per_test{(1ull << 20) * 64};
// Running total of the values returned by sum(); main() adds to it after
// each sum-benchmark sample.
size_t tot_sum{0};
18
// Benchmark kernels. Only the declarations are visible here; the definitions
// live outside this part of the file. Each one carries the GCC/Clang
// `noinline` attribute, which asks the compiler to keep the call out of line.

// Kernel timed by the --memcpy benchmark; called as (dst, src, size).
// NOTE(review): presumably copies `size` bytes from `src` to `dst` - confirm against the definition.
void memcpy_noinline(void *dst, void *src, size_t size) __attribute__((noinline));

// Kernel timed by the --memset benchmark; called as (dst, value, size).
// NOTE(review): presumably fills `size` bytes at `dst` with `value` - confirm against the definition.
void memset_noinline(void *dst, int value, size_t size) __attribute__((noinline));

// Kernel timed by the --sum benchmark; reads through a volatile pointer and
// returns a 64-bit result that main() accumulates.
// NOTE(review): presumably the sum of the buffer contents - confirm against the definition.
uint64_t sum(volatile void *src, size_t size) __attribute__((noinline));
22
// Identifies which kernel main() times. The numeric values match the
// implicit numbering of the enumerators.
enum BenchType {
    MemcpyBench = 0,  // chosen by --memcpy, and by any unrecognised option
    MemsetBench = 1,  // chosen by --memset
    SumBench = 2      // chosen by --sum
};
28
29int main(int argc, char *argv[])
30{
31    BenchType type;
32    if (argc <= 1) {
33        cerr << "memcpy_perf [--memcpy|--memset|--sum]" << endl;
34        return 0;
35    }
36    if (string(argv[1]) == string("--memcpy")) {
37        type = MemcpyBench;
38    } else if (string(argv[1]) == string("--memset")) {
39        type = MemsetBench;
40    } else if (string(argv[1]) == string("--sum")) {
41        type = SumBench;
42    } else {
43        type = MemcpyBench;
44    }
45
46    unique_ptr<uint8_t[]> src(new uint8_t[size_end]);
47    unique_ptr<uint8_t[]> dst(new uint8_t[size_end]);
48    memset(src.get(), 1, size_end);
49
50    double start_pow = log10(size_start);
51    double end_pow = log10(size_end);
52    double pow_inc = (end_pow - start_pow) / samples;
53
54    //cout << "src: " << (uintptr_t)src.get() << endl;
55    //cout << "dst: " <<  (uintptr_t)dst.get() << endl;
56
57    for (double cur_pow = start_pow; cur_pow <= end_pow; cur_pow += pow_inc) {
58        chrono::time_point<chrono::high_resolution_clock> copy_start, copy_end;
59
60        size_t cur_size = (size_t)pow(10.0, cur_pow);
61        size_t iter_per_size = size_per_test / cur_size;
62
63        // run benchmark
64        switch (type) {
65            case MemsetBench: {
66                memcpy_noinline(src.get(), dst.get(), cur_size);
67                memset_noinline(dst.get(), 0xdeadbeef, cur_size);
68                copy_start = chrono::high_resolution_clock::now();
69                for (int i = 0; i < iter_per_size; i++) {
70                    memset_noinline(dst.get(), 0xdeadbeef, cur_size);
71                }
72                copy_end = chrono::high_resolution_clock::now();
73                break;
74            }
75            case MemcpyBench: {
76                memcpy_noinline(dst.get(), src.get(), cur_size);
77                memcpy_noinline(src.get(), dst.get(), cur_size);
78                copy_start = chrono::high_resolution_clock::now();
79                for (int i = 0; i < iter_per_size; i++) {
80                    memcpy_noinline(dst.get(), src.get(), cur_size);
81                }
82                copy_end = chrono::high_resolution_clock::now();
83                break;
84            }
85            case SumBench: {
86                uint64_t s = 0;
87                s += sum(src.get(), cur_size);
88                copy_start = chrono::high_resolution_clock::now();
89                for (int i = 0; i < iter_per_size; i++) {
90                    s += sum(src.get(), cur_size);
91                }
92                copy_end = chrono::high_resolution_clock::now();
93                tot_sum += s;
94                break;
95            }
96        }
97
98        double ns_per_copy = chrono::duration_cast<chrono::nanoseconds>(copy_end - copy_start).count() / double(iter_per_size);
99        double gb_per_sec = ((double)cur_size / (1ull<<30)) / (ns_per_copy / 1.0E9);
100        if (type == MemcpyBench)
101            gb_per_sec *= 2.0;
102        cout << "size: " << cur_size << ", perf: " << gb_per_sec << "GB/s, iter: " << iter_per_size << endl;
103    }
104    return 0;
105}
106