1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef __BANDWIDTH_H__
18#define __BANDWIDTH_H__
19
20#include <stdlib.h>
21#include <string.h>
22
23#include "utils/Compat.h"
24#include "memtest.h"
25
26// Bandwidth Class definitions.
27class BandwidthBenchmark {
28public:
29    BandwidthBenchmark()
30        : _size(0),
31          _num_warm_loops(DEFAULT_NUM_WARM_LOOPS),
32          _num_loops(DEFAULT_NUM_LOOPS) {}
33    virtual ~BandwidthBenchmark() {}
34
35    bool run() {
36        if (_size == 0) {
37            return false;
38        }
39        if (!canRun()) {
40            return false;
41        }
42
43        bench(_num_warm_loops);
44
45        nsecs_t t = system_time();
46        bench(_num_loops);
47        t = system_time() - t;
48
49        _mb_per_sec = (_size*(_num_loops/_BYTES_PER_MB))/(t/_NUM_NS_PER_SEC);
50
51        return true;
52    }
53
54    bool canRun() { return !usesNeon() || isNeonSupported(); }
55
56    virtual bool setSize(size_t size) = 0;
57
58    virtual const char *getName() = 0;
59
60    virtual bool verify() = 0;
61
62    virtual bool usesNeon() { return false; }
63
64    bool isNeonSupported() {
65#if defined(__ARM_NEON__)
66        return true;
67#else
68        return false;
69#endif
70    }
71
72    // Accessors/mutators.
73    double mb_per_sec() { return _mb_per_sec; }
74    size_t num_warm_loops() { return _num_warm_loops; }
75    size_t num_loops() { return _num_loops; }
76    size_t size() { return _size; }
77
78    void set_num_warm_loops(size_t num_warm_loops) {
79        _num_warm_loops = num_warm_loops;
80    }
81    void set_num_loops(size_t num_loops) { _num_loops = num_loops; }
82
83    // Static constants
84    static const unsigned int DEFAULT_NUM_WARM_LOOPS = 1000000;
85    static const unsigned int DEFAULT_NUM_LOOPS = 20000000;
86
87protected:
88    virtual void bench(size_t num_loops) = 0;
89
90    double _mb_per_sec;
91    size_t _size;
92    size_t _num_warm_loops;
93    size_t _num_loops;
94
95private:
96    // Static constants
97    static const CONSTEXPR double _NUM_NS_PER_SEC = 1000000000.0;
98    static const CONSTEXPR double _BYTES_PER_MB = 1024.0* 1024.0;
99};
100
101class CopyBandwidthBenchmark : public BandwidthBenchmark {
102public:
103    CopyBandwidthBenchmark() : BandwidthBenchmark(), _src(NULL), _dst(NULL) { }
104
105    bool setSize(size_t size) {
106        if (_src) {
107           free(_src);
108           _src = NULL;
109        }
110        if (_dst) {
111            free(_dst);
112            _dst = NULL;
113        }
114
115        if (size == 0) {
116            _size = DEFAULT_COPY_SIZE;
117        } else {
118            _size = size;
119        }
120
121        _src = reinterpret_cast<char*>(memalign(64, _size));
122        if (!_src) {
123            perror("Failed to allocate memory for test.");
124            return false;
125        }
126        _dst = reinterpret_cast<char*>(memalign(64, _size));
127        if (!_dst) {
128            perror("Failed to allocate memory for test.");
129            return false;
130        }
131
132        return true;
133    }
134    virtual ~CopyBandwidthBenchmark() {
135        if (_src) {
136            free(_src);
137            _src = NULL;
138        }
139        if (_dst) {
140            free(_dst);
141            _dst = NULL;
142        }
143    }
144
145    bool verify() {
146        memset(_src, 0x23, _size);
147        memset(_dst, 0, _size);
148        bench(1);
149        if (memcmp(_src, _dst, _size) != 0) {
150            printf("Buffers failed to compare after one loop.\n");
151            return false;
152        }
153
154        memset(_src, 0x23, _size);
155        memset(_dst, 0, _size);
156        _num_loops = 2;
157        bench(2);
158        if (memcmp(_src, _dst, _size) != 0) {
159            printf("Buffers failed to compare after two loops.\n");
160            return false;
161        }
162
163        return true;
164    }
165
166protected:
167    char *_src;
168    char *_dst;
169
170    static const unsigned int DEFAULT_COPY_SIZE = 8000;
171};
172
173class CopyLdrdStrdBenchmark : public CopyBandwidthBenchmark {
174public:
175    CopyLdrdStrdBenchmark() : CopyBandwidthBenchmark() { }
176    virtual ~CopyLdrdStrdBenchmark() {}
177
178    const char *getName() { return "ldrd/strd"; }
179
180protected:
181    // Copy using ldrd/strd instructions.
182    void bench(size_t num_loops) {
183        asm volatile(
184            "stmfd sp!, {r0,r1,r2,r3,r4,r6,r7}\n"
185
186            "mov r0, %0\n"
187            "mov r1, %1\n"
188            "mov r2, %2\n"
189            "mov r3, %3\n"
190
191            "0:\n"
192            "mov r4, r2, lsr #6\n"
193
194            "1:\n"
195            "ldrd r6, r7, [r0]\n"
196            "strd r6, r7, [r1]\n"
197            "ldrd r6, r7, [r0, #8]\n"
198            "strd r6, r7, [r1, #8]\n"
199            "ldrd r6, r7, [r0, #16]\n"
200            "strd r6, r7, [r1, #16]\n"
201            "ldrd r6, r7, [r0, #24]\n"
202            "strd r6, r7, [r1, #24]\n"
203            "ldrd r6, r7, [r0, #32]\n"
204            "strd r6, r7, [r1, #32]\n"
205            "ldrd r6, r7, [r0, #40]\n"
206            "strd r6, r7, [r1, #40]\n"
207            "ldrd r6, r7, [r0, #48]\n"
208            "strd r6, r7, [r1, #48]\n"
209            "ldrd r6, r7, [r0, #56]\n"
210            "strd r6, r7, [r1, #56]\n"
211
212            "add  r0, r0, #64\n"
213            "add  r1, r1, #64\n"
214            "subs r4, r4, #1\n"
215            "bgt 1b\n"
216
217            "sub r0, r0, r2\n"
218            "sub r1, r1, r2\n"
219            "subs r3, r3, #1\n"
220            "bgt 0b\n"
221
222            "ldmfd sp!, {r0,r1,r2,r3,r4,r6,r7}\n"
223        :: "r" (_src), "r" (_dst), "r" (_size), "r" (num_loops) : "r0", "r1", "r2", "r3");
224    }
225};
226
227class CopyLdmiaStmiaBenchmark : public CopyBandwidthBenchmark {
228public:
229    CopyLdmiaStmiaBenchmark() : CopyBandwidthBenchmark() { }
230    virtual ~CopyLdmiaStmiaBenchmark() {}
231
232    const char *getName() { return "ldmia/stmia"; }
233
234protected:
235    // Copy using ldmia/stmia instructions.
236    void bench(size_t num_loops) {
237        asm volatile(
238            "stmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12}\n"
239
240            "mov r0, %0\n"
241            "mov r1, %1\n"
242            "mov r2, %2\n"
243            "mov r3, %3\n"
244
245            "0:\n"
246            "mov r4, r2, lsr #6\n"
247
248            "1:\n"
249            "ldmia r0!, {r5, r6, r7, r8, r9, r10, r11, r12}\n"
250            "stmia r1!, {r5, r6, r7, r8, r9, r10, r11, r12}\n"
251            "subs r4, r4, #1\n"
252            "ldmia r0!, {r5, r6, r7, r8, r9, r10, r11, r12}\n"
253            "stmia r1!, {r5, r6, r7, r8, r9, r10, r11, r12}\n"
254            "bgt 1b\n"
255
256            "sub r0, r0, r2\n"
257            "sub r1, r1, r2\n"
258            "subs r3, r3, #1\n"
259            "bgt 0b\n"
260
261            "ldmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12}\n"
262        :: "r" (_src), "r" (_dst), "r" (_size), "r" (num_loops) : "r0", "r1", "r2", "r3");
263    }
264};
265
266class CopyVld1Vst1Benchmark : public CopyBandwidthBenchmark {
267public:
268    CopyVld1Vst1Benchmark() : CopyBandwidthBenchmark() { }
269    virtual ~CopyVld1Vst1Benchmark() {}
270
271    const char *getName() { return "vld1/vst1"; }
272
273    bool usesNeon() { return true; }
274
275protected:
276    // Copy using vld1/vst1 instructions.
277#if defined(__ARM_NEON__)
278    void bench(size_t num_loops) {
279        asm volatile(
280            "stmfd sp!, {r0,r1,r2,r3,r4}\n"
281
282            "mov r0, %0\n"
283            "mov r1, %1\n"
284            "mov r2, %2\n"
285            "mov r3, %3\n"
286
287            "0:\n"
288            "mov r4, r2, lsr #6\n"
289
290            "1:\n"
291            "vld1.8 {d0-d3}, [r0]!\n"
292            "vld1.8 {d4-d7}, [r0]!\n"
293            "subs r4, r4, #1\n"
294            "vst1.8 {d0-d3}, [r1:128]!\n"
295            "vst1.8 {d4-d7}, [r1:128]!\n"
296            "bgt 1b\n"
297
298            "sub r0, r0, r2\n"
299            "sub r1, r1, r2\n"
300            "subs r3, r3, #1\n"
301            "bgt 0b\n"
302
303            "ldmfd sp!, {r0,r1,r2,r3,r4}\n"
304        :: "r" (_src), "r" (_dst), "r" (_size), "r" (num_loops) : "r0", "r1", "r2", "r3");
305#else
306    void bench(size_t) {
307#endif
308    }
309};
310
311class CopyVldrVstrBenchmark : public CopyBandwidthBenchmark {
312public:
313    CopyVldrVstrBenchmark() : CopyBandwidthBenchmark() { }
314    virtual ~CopyVldrVstrBenchmark() {}
315
316    const char *getName() { return "vldr/vstr"; }
317
318    bool usesNeon() { return true; }
319
320protected:
321    // Copy using vldr/vstr instructions.
322#if defined(__ARM_NEON__)
323    void bench(size_t num_loops) {
324        asm volatile(
325            "stmfd sp!, {r0,r1,r2,r3,r4}\n"
326
327            "mov r0, %0\n"
328            "mov r1, %1\n"
329            "mov r2, %2\n"
330            "mov r3, %3\n"
331
332            "0:\n"
333            "mov r4, r2, lsr #6\n"
334
335            "1:\n"
336            "vldr d0, [r0, #0]\n"
337            "subs r4, r4, #1\n"
338            "vldr d1, [r0, #8]\n"
339            "vstr d0, [r1, #0]\n"
340            "vldr d0, [r0, #16]\n"
341            "vstr d1, [r1, #8]\n"
342            "vldr d1, [r0, #24]\n"
343            "vstr d0, [r1, #16]\n"
344            "vldr d0, [r0, #32]\n"
345            "vstr d1, [r1, #24]\n"
346            "vldr d1, [r0, #40]\n"
347            "vstr d0, [r1, #32]\n"
348            "vldr d0, [r0, #48]\n"
349            "vstr d1, [r1, #40]\n"
350            "vldr d1, [r0, #56]\n"
351            "vstr d0, [r1, #48]\n"
352            "add r0, r0, #64\n"
353            "vstr d1, [r1, #56]\n"
354            "add r1, r1, #64\n"
355            "bgt 1b\n"
356
357            "sub r0, r0, r2\n"
358            "sub r1, r1, r2\n"
359            "subs r3, r3, #1\n"
360            "bgt 0b\n"
361
362            "ldmfd sp!, {r0,r1,r2,r3,r4}\n"
363        :: "r" (_src), "r" (_dst), "r" (_size), "r" (num_loops) : "r0", "r1", "r2", "r3");
364#else
365    void bench(size_t) {
366#endif
367    }
368};
369
370class CopyVldmiaVstmiaBenchmark : public CopyBandwidthBenchmark {
371public:
372    CopyVldmiaVstmiaBenchmark() : CopyBandwidthBenchmark() { }
373    virtual ~CopyVldmiaVstmiaBenchmark() {}
374
375    const char *getName() { return "vldmia/vstmia"; }
376
377    bool usesNeon() { return true; }
378
379protected:
380    // Copy using vldmia/vstmia instructions.
381#if defined(__ARM_NEON__)
382    void bench(size_t num_loops) {
383        asm volatile(
384            "stmfd sp!, {r0,r1,r2,r3,r4}\n"
385
386            "mov r0, %0\n"
387            "mov r1, %1\n"
388            "mov r2, %2\n"
389            "mov r3, %3\n"
390
391            "0:\n"
392            "mov r4, r2, lsr #6\n"
393
394            "1:\n"
395            "vldmia r0!, {d0-d7}\n"
396            "subs r4, r4, #1\n"
397            "vstmia r1!, {d0-d7}\n"
398            "bgt 1b\n"
399
400            "sub r0, r0, r2\n"
401            "sub r1, r1, r2\n"
402            "subs r3, r3, #1\n"
403            "bgt 0b\n"
404
405            "ldmfd sp!, {r0,r1,r2,r3,r4}\n"
406        :: "r" (_src), "r" (_dst), "r" (_size), "r" (num_loops) : "r0", "r1", "r2", "r3");
407#else
408    void bench(size_t) {
409#endif
410    }
411};
412
413class MemcpyBenchmark : public CopyBandwidthBenchmark {
414public:
415    MemcpyBenchmark() : CopyBandwidthBenchmark() { }
416    virtual ~MemcpyBenchmark() {}
417
418    const char *getName() { return "memcpy"; }
419
420protected:
421    void bench(size_t num_loops) {
422        for (size_t i = 0; i < num_loops; i++) {
423            memcpy(_dst, _src, _size);
424        }
425    }
426};
427
428class SingleBufferBandwidthBenchmark : public BandwidthBenchmark {
429public:
430    SingleBufferBandwidthBenchmark() : BandwidthBenchmark(), _buffer(NULL) { }
431    virtual ~SingleBufferBandwidthBenchmark() {
432        if (_buffer) {
433            free(_buffer);
434            _buffer = NULL;
435        }
436    }
437
438    bool setSize(size_t size) {
439        if (_buffer) {
440            free(_buffer);
441            _buffer = NULL;
442        }
443
444        if (size == 0) {
445            _size = DEFAULT_SINGLE_BUFFER_SIZE;
446        } else {
447            _size = size;
448        }
449
450        _buffer = reinterpret_cast<char*>(memalign(64, _size));
451        if (!_buffer) {
452            perror("Failed to allocate memory for test.");
453            return false;
454        }
455        memset(_buffer, 0, _size);
456
457        return true;
458    }
459
460    bool verify() { return true; }
461
462protected:
463    char *_buffer;
464
465    static const unsigned int DEFAULT_SINGLE_BUFFER_SIZE = 16000;
466};
467
468class WriteBandwidthBenchmark : public SingleBufferBandwidthBenchmark {
469public:
470    WriteBandwidthBenchmark() : SingleBufferBandwidthBenchmark() { }
471    virtual ~WriteBandwidthBenchmark() { }
472
473    bool verify() {
474        memset(_buffer, 0, _size);
475        bench(1);
476        for (size_t i = 0; i < _size; i++) {
477            if (_buffer[i] != 1) {
478                printf("Buffer failed to compare after one loop.\n");
479                return false;
480            }
481        }
482
483        memset(_buffer, 0, _size);
484        bench(2);
485        for (size_t i = 0; i < _size; i++) {
486            if (_buffer[i] != 2) {
487                printf("Buffer failed to compare after two loops.\n");
488                return false;
489            }
490        }
491
492        return true;
493    }
494};
495
496class WriteStrdBenchmark : public WriteBandwidthBenchmark {
497public:
498    WriteStrdBenchmark() : WriteBandwidthBenchmark() { }
499    virtual ~WriteStrdBenchmark() {}
500
501    const char *getName() { return "strd"; }
502
503protected:
504    // Write a given value using strd.
505    void bench(size_t num_loops) {
506        asm volatile(
507            "stmfd sp!, {r0,r1,r2,r3,r4,r5}\n"
508
509            "mov r0, %0\n"
510            "mov r1, %1\n"
511            "mov r2, %2\n"
512
513            "mov r4, #0\n"
514            "mov r5, #0\n"
515
516            "0:\n"
517            "mov r3, r1, lsr #5\n"
518
519            "add r4, r4, #0x01010101\n"
520            "mov r5, r4\n"
521
522            "1:\n"
523            "subs r3, r3, #1\n"
524            "strd r4, r5, [r0]\n"
525            "strd r4, r5, [r0, #8]\n"
526            "strd r4, r5, [r0, #16]\n"
527            "strd r4, r5, [r0, #24]\n"
528            "add  r0, r0, #32\n"
529            "bgt 1b\n"
530
531            "sub r0, r0, r1\n"
532            "subs r2, r2, #1\n"
533            "bgt 0b\n"
534
535            "ldmfd sp!, {r0,r1,r2,r3,r4,r5}\n"
536          :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
537    }
538};
539
540class WriteStmiaBenchmark : public WriteBandwidthBenchmark {
541public:
542    WriteStmiaBenchmark() : WriteBandwidthBenchmark() { }
543    virtual ~WriteStmiaBenchmark() {}
544
545    const char *getName() { return "stmia"; }
546
547protected:
548      // Write a given value using stmia.
549      void bench(size_t num_loops) {
550          asm volatile(
551              "stmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11}\n"
552
553              "mov r0, %0\n"
554              "mov r1, %1\n"
555              "mov r2, %2\n"
556
557              "mov r4, #0\n"
558
559              "0:\n"
560              "mov r3, r1, lsr #5\n"
561
562              "add r4, r4, #0x01010101\n"
563              "mov r5, r4\n"
564              "mov r6, r4\n"
565              "mov r7, r4\n"
566              "mov r8, r4\n"
567              "mov r9, r4\n"
568              "mov r10, r4\n"
569              "mov r11, r4\n"
570
571              "1:\n"
572              "subs r3, r3, #1\n"
573              "stmia r0!, {r4, r5, r6, r7, r8, r9, r10, r11}\n"
574              "bgt 1b\n"
575
576              "sub r0, r0, r1\n"
577              "subs r2, r2, #1\n"
578              "bgt 0b\n"
579
580              "ldmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11}\n"
581        :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
582    }
583};
584
585class WriteVst1Benchmark : public WriteBandwidthBenchmark {
586public:
587    WriteVst1Benchmark() : WriteBandwidthBenchmark() { }
588    virtual ~WriteVst1Benchmark() {}
589
590    const char *getName() { return "vst1"; }
591
592    bool usesNeon() { return true; }
593
594protected:
595    // Write a given value using vst.
596#if defined(__ARM_NEON__)
597    void bench(size_t num_loops) {
598        asm volatile(
599            "stmfd sp!, {r0,r1,r2,r3,r4}\n"
600
601            "mov r0, %0\n"
602            "mov r1, %1\n"
603            "mov r2, %2\n"
604            "mov r4, #0\n"
605
606            "0:\n"
607            "mov r3, r1, lsr #5\n"
608
609            "add r4, r4, #1\n"
610            "vdup.8 d0, r4\n"
611            "vmov d1, d0\n"
612            "vmov d2, d0\n"
613            "vmov d3, d0\n"
614
615            "1:\n"
616            "subs r3, r3, #1\n"
617            "vst1.8 {d0-d3}, [r0:128]!\n"
618            "bgt 1b\n"
619
620            "sub r0, r0, r1\n"
621            "subs r2, r2, #1\n"
622            "bgt 0b\n"
623
624            "ldmfd sp!, {r0,r1,r2,r3,r4}\n"
625        :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
626#else
627    void bench(size_t) {
628#endif
629    }
630};
631
632class WriteVstrBenchmark : public WriteBandwidthBenchmark {
633public:
634    WriteVstrBenchmark() : WriteBandwidthBenchmark() { }
635    virtual ~WriteVstrBenchmark() {}
636
637    const char *getName() { return "vstr"; }
638
639    bool usesNeon() { return true; }
640
641protected:
642    // Write a given value using vst.
643#if defined(__ARM_NEON__)
644    void bench(size_t num_loops) {
645        asm volatile(
646            "stmfd sp!, {r0,r1,r2,r3,r4}\n"
647
648            "mov r0, %0\n"
649            "mov r1, %1\n"
650            "mov r2, %2\n"
651            "mov r4, #0\n"
652
653            "0:\n"
654            "mov r3, r1, lsr #5\n"
655
656            "add r4, r4, #1\n"
657            "vdup.8 d0, r4\n"
658            "vmov d1, d0\n"
659            "vmov d2, d0\n"
660            "vmov d3, d0\n"
661
662            "1:\n"
663            "vstr d0, [r0, #0]\n"
664            "subs r3, r3, #1\n"
665            "vstr d1, [r0, #8]\n"
666            "vstr d0, [r0, #16]\n"
667            "vstr d1, [r0, #24]\n"
668            "add r0, r0, #32\n"
669            "bgt 1b\n"
670
671            "sub r0, r0, r1\n"
672            "subs r2, r2, #1\n"
673            "bgt 0b\n"
674
675            "ldmfd sp!, {r0,r1,r2,r3,r4}\n"
676        :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
677#else
678    void bench(size_t) {
679#endif
680    }
681};
682
683class WriteVstmiaBenchmark : public WriteBandwidthBenchmark {
684public:
685    WriteVstmiaBenchmark() : WriteBandwidthBenchmark() { }
686    virtual ~WriteVstmiaBenchmark() {}
687
688    const char *getName() { return "vstmia"; }
689
690    bool usesNeon() { return true; }
691
692protected:
693    // Write a given value using vstmia.
694#if defined(__ARM_NEON__)
695    void bench(size_t num_loops) {
696        asm volatile(
697            "stmfd sp!, {r0,r1,r2,r3,r4}\n"
698
699            "mov r0, %0\n"
700            "mov r1, %1\n"
701            "mov r2, %2\n"
702            "mov r4, #0\n"
703
704            "0:\n"
705            "mov r3, r1, lsr #5\n"
706
707            "add r4, r4, #1\n"
708            "vdup.8 d0, r4\n"
709            "vmov d1, d0\n"
710            "vmov d2, d0\n"
711            "vmov d3, d0\n"
712
713            "1:\n"
714            "subs r3, r3, #1\n"
715            "vstmia r0!, {d0-d3}\n"
716            "bgt 1b\n"
717
718            "sub r0, r0, r1\n"
719            "subs r2, r2, #1\n"
720            "bgt 0b\n"
721
722            "ldmfd sp!, {r0,r1,r2,r3,r4}\n"
723        :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
724#else
725    void bench(size_t) {
726#endif
727    }
728};
729
730class MemsetBenchmark : public WriteBandwidthBenchmark {
731public:
732    MemsetBenchmark() : WriteBandwidthBenchmark() { }
733    virtual ~MemsetBenchmark() {}
734
735    const char *getName() { return "memset"; }
736
737protected:
738    void bench(size_t num_loops) {
739        for (size_t i = 0; i < num_loops; i++) {
740            memset(_buffer, (i % 255) + 1, _size);
741        }
742    }
743};
744
745class ReadLdrdBenchmark : public SingleBufferBandwidthBenchmark {
746public:
747    ReadLdrdBenchmark() : SingleBufferBandwidthBenchmark() { }
748    virtual ~ReadLdrdBenchmark() {}
749
750    const char *getName() { return "ldrd"; }
751
752protected:
753    // Write a given value using strd.
754    void bench(size_t num_loops) {
755        asm volatile(
756            "stmfd sp!, {r0,r1,r2,r3,r4,r5}\n"
757
758            "mov r0, %0\n"
759            "mov r1, %1\n"
760            "mov r2, %2\n"
761
762            "0:\n"
763            "mov r3, r1, lsr #5\n"
764
765            "1:\n"
766            "subs r3, r3, #1\n"
767            "ldrd r4, r5, [r0]\n"
768            "ldrd r4, r5, [r0, #8]\n"
769            "ldrd r4, r5, [r0, #16]\n"
770            "ldrd r4, r5, [r0, #24]\n"
771            "add  r0, r0, #32\n"
772            "bgt 1b\n"
773
774            "sub r0, r0, r1\n"
775            "subs r2, r2, #1\n"
776            "bgt 0b\n"
777
778            "ldmfd sp!, {r0,r1,r2,r3,r4,r5}\n"
779          :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
780    }
781};
782
783class ReadLdmiaBenchmark : public SingleBufferBandwidthBenchmark {
784public:
785    ReadLdmiaBenchmark() : SingleBufferBandwidthBenchmark() { }
786    virtual ~ReadLdmiaBenchmark() {}
787
788    const char *getName() { return "ldmia"; }
789
790protected:
791      // Write a given value using stmia.
792      void bench(size_t num_loops) {
793          asm volatile(
794              "stmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11}\n"
795
796              "mov r0, %0\n"
797              "mov r1, %1\n"
798              "mov r2, %2\n"
799
800              "0:\n"
801              "mov r3, r1, lsr #5\n"
802
803              "1:\n"
804              "subs r3, r3, #1\n"
805              "ldmia r0!, {r4, r5, r6, r7, r8, r9, r10, r11}\n"
806              "bgt 1b\n"
807
808              "sub r0, r0, r1\n"
809              "subs r2, r2, #1\n"
810              "bgt 0b\n"
811
812              "ldmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11}\n"
813        :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
814    }
815};
816
817class ReadVld1Benchmark : public SingleBufferBandwidthBenchmark {
818public:
819    ReadVld1Benchmark() : SingleBufferBandwidthBenchmark() { }
820    virtual ~ReadVld1Benchmark() {}
821
822    const char *getName() { return "vld1"; }
823
824    bool usesNeon() { return true; }
825
826protected:
827    // Write a given value using vst.
828#if defined(__ARM_NEON__)
829    void bench(size_t num_loops) {
830        asm volatile(
831            "stmfd sp!, {r0,r1,r2,r3}\n"
832
833            "mov r0, %0\n"
834            "mov r1, %1\n"
835            "mov r2, %2\n"
836
837            "0:\n"
838            "mov r3, r1, lsr #5\n"
839
840            "1:\n"
841            "subs r3, r3, #1\n"
842            "vld1.8 {d0-d3}, [r0:128]!\n"
843            "bgt 1b\n"
844
845            "sub r0, r0, r1\n"
846            "subs r2, r2, #1\n"
847            "bgt 0b\n"
848
849            "ldmfd sp!, {r0,r1,r2,r3}\n"
850        :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
851#else
852    void bench(size_t) {
853#endif
854    }
855};
856
857class ReadVldrBenchmark : public SingleBufferBandwidthBenchmark {
858public:
859    ReadVldrBenchmark() : SingleBufferBandwidthBenchmark() { }
860    virtual ~ReadVldrBenchmark() {}
861
862    const char *getName() { return "vldr"; }
863
864    bool usesNeon() { return true; }
865
866protected:
867    // Write a given value using vst.
868#if defined(__ARM_NEON__)
869    void bench(size_t num_loops) {
870        asm volatile(
871            "stmfd sp!, {r0,r1,r2,r3}\n"
872
873            "mov r0, %0\n"
874            "mov r1, %1\n"
875            "mov r2, %2\n"
876
877            "0:\n"
878            "mov r3, r1, lsr #5\n"
879
880            "1:\n"
881            "vldr d0, [r0, #0]\n"
882            "subs r3, r3, #1\n"
883            "vldr d1, [r0, #8]\n"
884            "vldr d0, [r0, #16]\n"
885            "vldr d1, [r0, #24]\n"
886            "add r0, r0, #32\n"
887            "bgt 1b\n"
888
889            "sub r0, r0, r1\n"
890            "subs r2, r2, #1\n"
891            "bgt 0b\n"
892
893            "ldmfd sp!, {r0,r1,r2,r3}\n"
894        :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
895#else
896    void bench(size_t) {
897#endif
898    }
899};
900
901
902class ReadVldmiaBenchmark : public SingleBufferBandwidthBenchmark {
903public:
904    ReadVldmiaBenchmark() : SingleBufferBandwidthBenchmark() { }
905    virtual ~ReadVldmiaBenchmark() {}
906
907    const char *getName() { return "vldmia"; }
908
909    bool usesNeon() { return true; }
910
911protected:
912    // Write a given value using vstmia.
913#if defined(__ARM_NEON__)
914    void bench(size_t num_loops) {
915        asm volatile(
916            "stmfd sp!, {r0,r1,r2,r3}\n"
917
918            "mov r0, %0\n"
919            "mov r1, %1\n"
920            "mov r2, %2\n"
921
922            "0:\n"
923            "mov r3, r1, lsr #5\n"
924
925            "1:\n"
926            "subs r3, r3, #1\n"
927            "vldmia r0!, {d0-d3}\n"
928            "bgt 1b\n"
929
930            "sub r0, r0, r1\n"
931            "subs r2, r2, #1\n"
932            "bgt 0b\n"
933
934            "ldmfd sp!, {r0,r1,r2,r3}\n"
935        :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2");
936#else
937    void bench(size_t) {
938#endif
939    }
940};
941
942#endif  // __BANDWIDTH_H__
943