TscClock.h

Fully qualified name: carb/clock/TscClock.h

File members: carb/clock/TscClock.h

// SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: LicenseRef-NvidiaProprietary
//
// NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
// property and proprietary rights in and to this material, related
// documentation and any modifications thereto. Any use, reproduction,
// disclosure or distribution of this material and related documentation
// without an express license agreement from NVIDIA CORPORATION or
// its affiliates is strictly prohibited.

#pragma once

#include "../Defines.h"

#include "../cpp/Thread.h"
#include "../extras/CpuInfo.h"
#include "../math/MulDiv.h"
#include "../Strong.h"

#include <algorithm>
#include <array>
#include <atomic>
#include <chrono>
#include <cstdint>
#include <thread>
#include <type_traits>

#if CARB_EXCEPTIONS_ENABLED
#    include <stdexcept>
#endif

// File-local helper macros and intrinsic declarations used by the counter-reading functions below.
// All CARBLOCAL_* macros defined here are #undef'd again at the end of this file.
//
// CARBLOCAL_COMPILER_BARRIER expands to a compiler-level barrier statement:
//  - GCC/Clang (and Doxygen builds): an empty asm statement with a "memory" clobber.
//  - MSVC: the _ReadWriteBarrier() intrinsic.
#if CARB_COMPILER_GNUC || CARB_TOOLCHAIN_CLANG || defined(DOXYGEN_BUILD)
#    define CARBLOCAL_COMPILER_BARRIER __asm__ volatile("" ::: "memory")
#elif CARB_COMPILER_MSC
#    include <intrin.h>
// Declare the MSVC intrinsics used in this file with C linkage and request their intrinsic forms.
extern "C"
{
    void _ReadWriteBarrier(void);
#    pragma intrinsic(_ReadWriteBarrier)
#    define CARBLOCAL_COMPILER_BARRIER _ReadWriteBarrier()
#    if CARB_X86_64
// x86-64: the timestamp counter is read through __rdtsc / __rdtscp.
#        pragma intrinsic(__rdtsc)
#        pragma intrinsic(__rdtscp)
#    elif CARB_AARCH64
// AArch64: the counter is read through _ReadStatusReg with an encoded system-register id.
// CARBLOCAL_AARCH64_SYSREG packs the (op0, op1, crn, crm, op2) fields into the bit positions shown below.
// Adapted from https://github.com/llvm-mirror/clang/blob/master/test/CodeGen/arm64-microsoft-status-reg.cpp
#        define CARBLOCAL_AARCH64_SYSREG(op0, op1, crn, crm, op2)                                                      \
            (((op0 & 1) << 14) | ((op1 & 7) << 11) | ((crn & 15) << 7) | ((crm & 15) << 3) | ((op2 & 7) << 0))
#        define CARBLOCAL_AARCH64_CNTVCT CARBLOCAL_AARCH64_SYSREG(3, 3, 14, 0, 2) // Generic Timer counter register
    __int64 _ReadStatusReg(int);
#        pragma intrinsic(_ReadStatusReg)
#    else
    CARB_UNSUPPORTED_ARCHITECTURE();
#    endif
}
#else
CARB_UNSUPPORTED_COMPILER(); // For brevity, every compiler ifdef block below does not have this check
#endif

#if CARB_PLATFORM_WINDOWS
#    include "../CarbWindows.h"
#elif CARB_POSIX
#    include <time.h>
#else
CARB_UNSUPPORTED_PLATFORM();
#endif

namespace carb::clock
{

namespace detail
{

// Reads the raw hardware tick counter (non pipeline-flushing variant).
//
// On x86-64 this is the TSC read via the compiler's RDTSC intrinsic; on AArch64 it is the virtual counter register
// (cntvct_el0). No compiler barriers are placed around the read; see readTscp() for the variant that adds them.
//
// Returns the current counter value.
inline uint64_t readTsc(void) noexcept
{
#if CARB_X86_64
#    if CARB_COMPILER_GNUC || CARB_TOOLCHAIN_CLANG
    const uint64_t ticks = __builtin_ia32_rdtsc();
    return ticks;
#    elif CARB_COMPILER_MSC
    const uint64_t ticks = __rdtsc();
    return ticks;
#    endif
#elif CARB_AARCH64
#    if CARB_COMPILER_GNUC || CARB_TOOLCHAIN_CLANG
    // Approach taken from https://github.com/google/benchmark/blob/master/src/cycleclock.h
    // The ARMv8 system timer does not tick at the CPU clock rate. Its rate is fixed (typically somewhere in the
    // 1-50MHz range) and is published in the CNTFRQ special register. The OS is assumed to have configured the
    // virtual timer correctly.
    uint64_t counter;
    asm volatile("mrs %0, cntvct_el0" : "=r"(counter));
    return counter;
#    elif CARB_COMPILER_MSC
    return static_cast<uint64_t>(_ReadStatusReg(CARBLOCAL_AARCH64_CNTVCT));
#    endif
#else
    CARB_UNSUPPORTED_ARCHITECTURE();
#endif
}

// Reads the hardware tick counter with compiler barriers on both sides of the read (the "flushes pipeline" variant).
//
// On x86-64 the RDTSCP intrinsic is used and the processor id it reports is discarded. On AArch64 this is simply
// readTsc() wrapped in the same compiler barriers.
//
// Returns the current counter value.
inline uint64_t readTscp(void) noexcept
{
#if CARB_X86_64
    unsigned int cpu; // written by RDTSCP; the value is not used
    // Compiler barriers keep the timer read sequentially consistent with respect to the surrounding code.
    CARBLOCAL_COMPILER_BARRIER;
#    if CARB_COMPILER_GNUC || CARB_TOOLCHAIN_CLANG
    const auto stamp = __builtin_ia32_rdtscp(&cpu);
#    elif CARB_COMPILER_MSC
    const auto stamp = __rdtscp(&cpu);
#    endif
    CARBLOCAL_COMPILER_BARRIER;
    return stamp;
#elif CARB_AARCH64
    CARBLOCAL_COMPILER_BARRIER;
    const auto stamp = readTsc();
    CARBLOCAL_COMPILER_BARRIER;
    return stamp;
#else
    CARB_UNSUPPORTED_ARCHITECTURE();
#endif
}

// Reports whether the hardware counter read by readTsc()/readTscp() is invariant.
//
// AArch64 always reports true. On x86-64 the answer comes from CPUID: extended leaf 0x80000007 must exist and bit 8
// of its EDX register (the invariant TSC bit) must be set.
inline bool isInvariant() noexcept
{
#if CARB_AARCH64
    return true; // the AArch64 counter is always invariant
#elif CARB_X86_64
    constexpr auto kExtLeafRangeQuery = 0x80000000; // EAX of this leaf is the highest supported extended leaf
    constexpr auto kMiscFeatureLeaf = 0x80000007; // leaf carrying the misc feature flags
    constexpr auto kInvariantTscMask = (1 << 8); // invariant TSC bit within EDX

    if (extras::detail::cpuid(kExtLeafRangeQuery).eax >= kMiscFeatureLeaf)
    {
        const auto miscFeatures = extras::detail::cpuid(kMiscFeatureLeaf);
        return (miscFeatures.edx & kInvariantTscMask) != 0;
    }

    // The CPU does not expose the misc feature flags leaf at all.
    return false;
#else
    CARB_UNSUPPORTED_ARCHITECTURE();
#endif
}

// Returns the tick rate (ticks per second) of the clock read by readMonotonic().
//
// Windows: the value of QueryPerformanceFrequency(), queried on first use and then cached in an atomic.
// POSIX: a constant 1'000'000'000, since readMonotonic() reports nanoseconds there.
inline uint64_t readMonotonicFreq(void) noexcept
{
#if CARB_PLATFORM_WINDOWS
    static std::atomic_uint64_t s_ticksPerSecond{ 0 };
    if (auto known = s_ticksPerSecond.load(std::memory_order_relaxed); CARB_LIKELY(known != 0))
        CARB_CPP20_LIKELY
        {
            return known;
        }

    // Several threads can get here concurrently; that is acceptable because they should all compute the same value.
    // Caching through an atomic rather than a thread-safe static keeps the fast path of this function very simple.
    const auto queryFrequency = []() CARB_NOINLINE {
        CARBWIN_LARGE_INTEGER value;
        [[maybe_unused]] BOOL ok = QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER*>(&value));
        CARB_ASSERT(ok);
        return value.QuadPart;
    };
    const uint64_t queried = queryFrequency();
    s_ticksPerSecond.store(queried, std::memory_order_relaxed);
    return queried;
#elif CARB_POSIX
    constexpr uint64_t kNanosecondsPerSecond = 1'000'000'000; // nanosecond resolution
    return kNanosecondsPerSecond;
#else
    CARB_UNSUPPORTED_PLATFORM();
#endif
}

// Reads the platform monotonic clock, in units of 1 / readMonotonicFreq() seconds.
//
// Windows: the QueryPerformanceCounter() value. POSIX: CLOCK_MONOTONIC flattened to a nanosecond count.
// The POSIX path does not check the return value of clock_gettime().
inline uint64_t readMonotonic(void) noexcept
{
#if CARB_PLATFORM_WINDOWS
    CARBWIN_LARGE_INTEGER counter;
    [[maybe_unused]] BOOL ok = QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER*>(&counter));
    CARB_ASSERT(ok);
    return uint64_t(counter.QuadPart);
#elif CARB_POSIX
    constexpr uint64_t kNanosecondsPerSecond = 1'000'000'000; // nanosecond resolution

    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const uint64_t wholeSeconds = static_cast<uint64_t>(now.tv_sec);
    const uint64_t nanoRemainder = static_cast<uint64_t>(now.tv_nsec);
    return (wholeSeconds * kNanosecondsPerSecond) + nanoRemainder;
#else
    CARB_UNSUPPORTED_PLATFORM();
#endif
}

// Captures a TSC reading and a monotonic-clock reading that correspond as closely as possible to the same instant.
//
// The two clocks are read in an interleaved pattern: TSC, mono, TSC, mono, ..., TSC. Each monotonic read is thereby
// bracketed by two TSC reads, which measures how long that monotonic read took in TSC ticks. The monotonic read with
// the narrowest bracket is selected and paired with the TSC value at the midpoint of its bracket.
//
// tsc  [out] TSC value at the midpoint of the narrowest bracket.
// mono [out] Monotonic clock value read inside that bracket.
//
// Returns true on success. Returns false if any TSC read failed to be strictly greater than the one before it; in
// that case `tsc` and `mono` may already have been written and must be ignored by the caller.
inline bool sampleClocks(uint64_t& tsc, uint64_t& mono) noexcept
{
    static constexpr int kIterations = 100;
    static constexpr int kUnroll = 4; // number of samples taken per pass of the manually unrolled loop below
    static_assert(kIterations % kUnroll == 0,
                  "kIterations must be a multiple of the unroll factor or the sampling loop writes out of bounds");

    struct Sample
    {
        uint64_t mono, tsc;
    };
    std::array<Sample, kIterations> samples;

    // Interleave sampling the TSC and monotonic clocks, ending on a TSC read.
    const auto end = samples.end();
    const auto startTsc = readTscp();
    for (auto it = samples.begin(); it != end; /*advanced in the body*/)
    {
        // Manually unrolled kUnroll (4) times to keep loop overhead out of the measured brackets.
        it->mono = readMonotonic();
        it->tsc = readTscp();
        ++it;
        it->mono = readMonotonic();
        it->tsc = readTscp();
        ++it;
        it->mono = readMonotonic();
        it->tsc = readTscp();
        ++it;
        it->mono = readMonotonic();
        it->tsc = readTscp();
        ++it;
        CARB_ASSERT(it <= end);
    }

    // Find the narrowest bracket. INT64_MAX guarantees the first valid bracket becomes the initial baseline.
    int64_t tscDiff = INT64_MAX;
    uint64_t prevTsc = startTsc;
    for (auto& sample : samples)
    {
        // Unsigned subtraction followed by a signed cast: a non-positive result means the TSC did not advance.
        auto diff = int64_t(sample.tsc - prevTsc);
        if (diff <= 0)
            return false; // TSC ran backwards

        if (diff < tscDiff)
        {
            tscDiff = diff;
            tsc = prevTsc + uint64_t(diff / 2); // midpoint of the bracket around this monotonic read
            mono = sample.mono;
        }
        prevTsc = sample.tsc;
    }
    return true;
}

// Common failure path for the frequency calculation.
//
// Throws std::runtime_error when exceptions are enabled; otherwise returns 0 so the caller can report "no frequency".
inline uint64_t freqCalcFailed() noexcept(!CARB_EXCEPTIONS_ENABLED)
{
#if !CARB_EXCEPTIONS_ENABLED
    return 0;
#else
    throw std::runtime_error("Cannot calculate frequency: TSC ran backwards");
#endif
}

// Measures the TSC frequency (ticks per second) by comparing it against the monotonic clock.
//
// The TSC and the monotonic clock live in different domains and the TSC rate is not reported anywhere, so it is
// derived here: both clocks are sampled twice, a short sleep apart, and the TSC delta is scaled by the monotonic
// clock's frequency over the monotonic delta.
//
// Every failure is routed through freqCalcFailed(), so this returns 0 or throws depending on
// CARB_EXCEPTIONS_ENABLED. Failures are: a monotonic clock coarser than one microsecond, a TSC that did not advance,
// or mulDiv() yielding no value.
inline uint64_t computeTscFrequency() noexcept(!CARB_EXCEPTIONS_ENABLED)
{
    const auto monoTicksPerSecond = readMonotonicFreq();
    if (monoTicksPerSecond < 1'000'000) // the monotonic clock must offer at least microsecond resolution
        return freqCalcFailed();

    // Yield first so that, hopefully, we begin on a fresh scheduler quantum and are less likely to be context
    // switched while measuring.
    cpp::this_thread::sleep_for(std::chrono::microseconds(10));

    // First sample pair: the start of the measured interval.
    uint64_t tscBegin = 0;
    uint64_t monoBegin = 0;
    if (!sampleClocks(tscBegin, monoBegin))
        return freqCalcFailed();

    // Let some time elapse...
    cpp::this_thread::sleep_for(std::chrono::microseconds(50));

    // Second sample pair: the end of the measured interval.
    uint64_t tscEnd = 0;
    uint64_t monoEnd = 0;
    if (!sampleClocks(tscEnd, monoEnd))
        return freqCalcFailed();
    if (int64_t(tscEnd - tscBegin) < 0)
        return freqCalcFailed();

    // Not expected to trigger, given the sleep between the two sample pairs.
    CARB_ASSERT(monoEnd != monoBegin);
    CARB_IGNOREWARNING_MSC_WITH_PUSH(4702) // unreachable code
    return math::mulDiv(math::round_nearest_neighbor, tscEnd - tscBegin, monoTicksPerSecond, monoEnd - monoBegin)
        .or_else([] { return cpp::optional<uint64_t>(freqCalcFailed()); })
        .value();
    CARB_IGNOREWARNING_MSC_POP
}

// Determines the frequency of the hardware counter, in ticks per second.
//
// As written, this always measures the frequency via computeTscFrequency(): the x86-64 CPUID-based lookup below is
// compiled out by the `&& 0` in its #if condition, and the AArch64 branch contains only a TODO.
//
// Returns the value from computeTscFrequency(), which reports failure by returning 0 or throwing, depending on
// CARB_EXCEPTIONS_ENABLED.
inline uint64_t determineFrequency() noexcept(!CARB_EXCEPTIONS_ENABLED)
{
    // See if we can read the frequency from cpuid
#if CARB_X86_64 && 0 // disabled because it's more accurate for our purposes to measure it
    do
    {
        // Figure out the highest leaf command
        const unsigned highestLeaf = extras::detail::cpuid(0).eax;
        if (highestLeaf >= 0x15) // see if we can read leaf 15h (tsc and core crystal frequencies)
        {
            auto leaf15h = extras::detail::cpuid(0x15);
            if (leaf15h.eax != 0 && leaf15h.ebx != 0 && leaf15h.ecx != 0)
            {
                // Compute the frequency: TSCFreq = ECX*(EBX/EAX)
                uint64_t freq = uint64_t(leaf15h.ecx) * leaf15h.ebx / leaf15h.eax;
                // Sanity check, should be at least 10 MHz
                if (freq >= 10'000'000)
                    return freq;
            }

            // Some processors, like Skylake, 15h_ECX is 0 so we need to read from 16h (processor and bus specification
            // frequencies)
            if (highestLeaf >= 0x16)
            {
                auto leaf16h = extras::detail::cpuid(0x16);

                // TSC frequency should be equal to the processor base frequency
                uint64_t freq = uint64_t(leaf16h.eax & 0xffff) * 1'000'000;
                if (freq >= 10'000'000)
                    return freq;
            }

            // TODO: Attempt to look up frequency based on family/model/stepping information?

            // All else failed, fall through to measurement
        }
    } while (false);
#elif CARB_AARCH64
    // TODO? We can read the frequency from the CNTFRQ special register if desired
#endif

    // Measure the frequency against the monotonic clock.
    return computeTscFrequency();
}

} // namespace detail

//! Clock built on the hardware tick counter (the x86-64 TSC or the AArch64 virtual counter).
//!
//! Samples are raw counter readings. Use frequency() or duration() to relate them to real time.
class tsc_clock
{
public:
    //! Strong type for a raw counter reading, as returned by sample().
    CARB_STRONGTYPE(Sample, uint64_t);

    //! Strong type for the counter frequency in ticks per second, as returned by frequency().
    CARB_STRONGTYPE(Freq, uint64_t);

    //! Reports whether the underlying counter is invariant.
    //!
    //! The hardware is queried once, on the first call, and the answer is kept in a function-local static.
    //! @returns `true` if detail::isInvariant() reported an invariant counter.
    static bool isInvariant() noexcept
    {
        static bool val = detail::isInvariant();
        return val;
    }

    //! Reads the current counter value via detail::readTscp().
    //! @returns The raw counter reading.
    static Sample sample() noexcept
    {
        return Sample(detail::readTscp());
    }

    //! Returns the frequency of the counter in ticks per second.
    //!
    //! The frequency is determined on first use and cached. A failed determination is not cached, so a later call
    //! tries again.
    //! @returns The counter frequency. With exceptions disabled, `Freq(0)` indicates that it could not be determined.
    //! @throws std::runtime_error (only when exceptions are enabled) if the frequency could not be determined.
    static Freq frequency() noexcept(!CARB_EXCEPTIONS_ENABLED)
    {
        static std::atomic_uint64_t cached{ 0 };
        if (auto freq = cached.load(std::memory_order_relaxed); CARB_LIKELY(freq != 0))
            CARB_CPP20_LIKELY
            {
                return Freq(freq);
            }
        // NOTE: multiple threads may reach here and each will measure a valid value within a few fractions of a
        // percent of the others. The use of an atomic instead of thread-safe statics greatly simplifies this
        // function's execution, plus allows us to handle exceptions enabled or not.
        auto freq = detail::determineFrequency();
        if (freq)
        {
            // Publish only the first successful measurement. A thread that loses the race adopts the published value
            // so that every call, on every thread, reports the same frequency.
            uint64_t expected = 0;
            if (!cached.compare_exchange_strong(expected, freq, std::memory_order_relaxed))
                freq = expected;
        }
        return Freq(freq);
    }

    //! Converts the interval between two samples into a `std::chrono`-style duration.
    //!
    //! Computes `(newer - older) * Period::den / (frequency() * Period::num)` via math::mulDiv(). If mulDiv() yields
    //! no value, a zero duration is returned.
    //!
    //! @note This function is `noexcept` but calls frequency(). When exceptions are enabled and the frequency cannot
    //! be determined, the exception cannot propagate out of this function and the program is terminated.
    //!
    //! @tparam Duration The duration type to produce; its `rep` and `period` members select the arithmetic used.
    //! @param older The earlier sample.
    //! @param newer The later sample.
    //! @returns The elapsed time from @p older to @p newer expressed as @p Duration.
    template <class Duration>
    static Duration duration(Sample older, Sample newer) noexcept
    {
        using DurationRep = typename Duration::rep;
        // Arithmetic type handed to mulDiv: double for floating-point reps, otherwise a 64-bit integer whose
        // signedness matches the duration's rep.
        using Rep = std::conditional_t<std::is_floating_point<DurationRep>::value, double,
                                       std::conditional_t<std::is_signed<DurationRep>::value, int64_t, uint64_t>>;
        using Period = typename Duration::period;
        // Subtract in the unsigned domain (well-defined wraparound) and only then reinterpret as signed. Subtracting
        // two int64_t casts could overflow, which is undefined behavior.
        int64_t const diff = int64_t(newer.get() - older.get());
        int64_t const freq = int64_t(frequency().get());
        CARB_ASSERT(freq > 0);

        // diff * period::den / (freq * period::num)
        auto duration = math::mulDiv(Rep(diff), Rep(Period::den), Rep(freq * Period::num));
        return Duration(DurationRep(duration.value_or(Rep{})));
    }
};
} // namespace carb::clock

#undef CARBLOCAL_AARCH64_SYSREG
#undef CARBLOCAL_AARCH64_CNTVCT
#undef CARBLOCAL_COMPILER_BARRIER