15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2004, Google Inc.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// All rights reserved.
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Redistribution and use in source and binary forms, with or without
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// modification, are permitted provided that the following conditions are
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// met:
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     * Redistributions of source code must retain the above copyright
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// notice, this list of conditions and the following disclaimer.
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     * Redistributions in binary form must reproduce the above
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// copyright notice, this list of conditions and the following disclaimer
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// in the documentation and/or other materials provided with the
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// distribution.
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     * Neither the name of Google Inc. nor the names of its
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// contributors may be used to endorse or promote products derived from
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// this software without specific prior written permission.
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// ----------------------------------------------------------------------
// CycleClock
//    A CycleClock tells you the current time in cycles.  On most
//    platforms the "time" is actually time since power-on, read from a
//    hardware counter.  This is like time() but, where a hardware counter
//    is available, it doesn't involve a system call and is much more
//    precise.  (Platforms without a usable counter fall back to
//    gettimeofday() scaled by the nominal clock rate.)
//
// NOTE: Not all cpu/platform/kernel combinations guarantee that this
// clock increments at a constant rate or is synchronized across all logical
// cpus in a system.
//
// Also, in some out-of-order CPU implementations, reading the CycleClock is
// not a serializing operation.  So if you're trying to count at cycle
// granularity, your data might be inaccurate due to out-of-order
// instruction execution.
// ----------------------------------------------------------------------
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef GOOGLE_BASE_CYCLECLOCK_H_
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define GOOGLE_BASE_CYCLECLOCK_H_
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h"   // make sure we get the def for int64
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/arm_instruction_set_select.h"
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// base/sysinfo.h is really big and we don't want to include it unless
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// it is necessary.
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(__arm__) || defined(__mips__)
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)# include "base/sysinfo.h"
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(__MACH__) && defined(__APPLE__)
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)# include <mach/mach_time.h>
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For MSVC, we want to use '_asm rdtsc' when possible (since it works
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// with even ancient MSVC compilers), and when not possible the
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// __rdtsc intrinsic, declared in <intrin.h>.  Unfortunately, in some
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// environments, <windows.h> and <intrin.h> have conflicting
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// declarations of some other intrinsics, breaking compilation.
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Therefore, we simply declare __rdtsc ourselves. See also
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// http://connect.microsoft.com/VisualStudio/feedback/details/262047
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(_MSC_VER) && !defined(_M_IX86)
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern "C" uint64 __rdtsc();
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#pragma intrinsic(__rdtsc)
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(ARMV3) || defined(__mips__)
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <sys/time.h>
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// NOTE: only i386 and x86_64 have been well tested.
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// PPC, sparc, alpha, and ia64 are based on
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//    http://peter.kuscsik.com/wordpress/?p=14
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// with modifications by m3b.  See also
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//    https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct CycleClock {
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This should return the number of cycles since power-on.  Thread-safe.
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static inline int64 Now() {
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(__MACH__) && defined(__APPLE__)
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // this goes at the top because we need ALL Macs, regardless of
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // architecture, to return the number of "mach time units" that
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // have passed since startup.  See sysinfo.cc where
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // InitializeSystemInfo() sets the supposed cpu clock frequency of
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // macs to the number of mach time units per second, not actual
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // CPU clock frequency (which can change in the face of CPU
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // frequency scaling).  Also note that when the Mac sleeps, this
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // counter pauses; it does not continue counting, nor does it
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // reset to zero.
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return mach_absolute_time();
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(__i386__)
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int64 ret;
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    __asm__ volatile ("rdtsc" : "=A" (ret) );
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return ret;
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(__x86_64__) || defined(__amd64__)
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    uint64 low, high;
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    __asm__ volatile ("rdtsc" : "=a" (low), "=d" (high));
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return (high << 32) | low;
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(__powerpc__) || defined(__ppc__)
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // This returns a time-base, which is not always precisely a cycle-count.
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int64 tbl, tbu0, tbu1;
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    asm("mftbu %0" : "=r" (tbu0));
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    asm("mftb  %0" : "=r" (tbl));
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    asm("mftbu %0" : "=r" (tbu1));
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    tbl &= -static_cast<int64>(tbu0 == tbu1);
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is garbage)
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return (tbu1 << 32) | tbl;
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(__sparc__)
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int64 tick;
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    asm(".byte 0x83, 0x41, 0x00, 0x00");
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    asm("mov   %%g1, %0" : "=r" (tick));
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return tick;
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(__ia64__)
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int64 itc;
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    asm("mov %0 = ar.itc" : "=r" (itc));
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return itc;
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(_MSC_VER) && defined(_M_IX86)
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Older MSVC compilers (like 7.x) don't seem to support the
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // __rdtsc intrinsic properly, so I prefer to use _asm instead
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // when I know it will work.  Otherwise, I'll use __rdtsc and hope
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // the code is being compiled with a non-ancient compiler.
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    _asm rdtsc
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(_MSC_VER)
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return __rdtsc();
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(ARMV3)
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(ARMV6)  // V6 is the earliest arch that has a standard cyclecount
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    uint32 pmccntr;
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    uint32 pmuseren;
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    uint32 pmcntenset;
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Read the user mode perf monitor counter access permissions.
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    asm volatile ("mrc p15, 0, %0, c9, c14, 0" : "=r" (pmuseren));
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (pmuseren & 1) {  // Allows reading perfmon counters for user mode code.
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      asm volatile ("mrc p15, 0, %0, c9, c12, 1" : "=r" (pmcntenset));
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (pmcntenset & 0x80000000ul) {  // Is it counting?
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        asm volatile ("mrc p15, 0, %0, c9, c13, 0" : "=r" (pmccntr));
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // The counter is set up to count every 64th cycle
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return static_cast<int64>(pmccntr) * 64;  // Should optimize to << 6
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    struct timeval tv;
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    gettimeofday(&tv, NULL);
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return static_cast<int64>((tv.tv_sec + tv.tv_usec * 0.000001)
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                              * CyclesPerSecond());
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(__mips__)
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // mips apparently only allows rdtsc for superusers, so we fall
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // back to gettimeofday.  It's possible clock_gettime would be better.
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    struct timeval tv;
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    gettimeofday(&tv, NULL);
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return static_cast<int64>((tv.tv_sec + tv.tv_usec * 0.000001)
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                              * CyclesPerSecond());
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The soft failover to a generic implementation is automatic only for ARM.
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For other platforms the developer is expected to make an attempt to create
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// a fast implementation and use generic version if nothing better is available.
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#error You need to define CycleTimer for your O/S and CPU
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // GOOGLE_BASE_CYCLECLOCK_H_
164