// carb/time/TscClock.h
//
// File members: carb/time/TscClock.h

// Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//

#pragma once

#include "../Defines.h"

#include "../cpp/Numeric.h"
#include "../Strong.h"

#include <thread>

#if CARB_PLATFORM_WINDOWS
// Forward declarations of the compiler's time-stamp-counter intrinsics (from immintrin.h).
// The names must be spelled exactly as the intrinsics are (with the leading double underscore) so that they match
// the `#pragma intrinsic` directives below and the calls in detail::readTsc() / detail::readTscp().
extern "C" unsigned __int64 __rdtsc(void);
extern "C" unsigned __int64 __rdtscp(unsigned int*);
#    pragma intrinsic(__rdtsc)
#    pragma intrinsic(__rdtscp)
#    include "../CarbWindows.h"
#elif CARB_POSIX
// clock_gettime() / CLOCK_MONOTONIC / struct timespec
#    include <time.h>
#else
CARB_UNSUPPORTED_PLATFORM();
#endif

namespace carb
{
namespace time
{

namespace detail
{

#if CARB_PLATFORM_WINDOWS
// Reads the CPU time-stamp counter through the `__rdtsc` intrinsic.
// This is the non pipeline-flushing variant; readTscp() is the flushing counterpart.
// Returns the raw counter value.
inline uint64_t readTsc(void) noexcept
{
    const uint64_t ticks = __rdtsc();
    return ticks;
}

// Reads the CPU time-stamp counter through the `__rdtscp` intrinsic (the pipeline-flushing variant).
// The intrinsic also writes an auxiliary value through its pointer argument; that value is not used here.
// Returns the raw counter value.
inline uint64_t readTscp(void) noexcept
{
    unsigned int aux;
    const uint64_t ticks = __rdtscp(&aux);
    return ticks;
}

inline uint64_t readMonotonic(void) noexcept
{
    CARBWIN_LARGE_INTEGER li;
    BOOL b = QueryPerformanceCounter((LARGE_INTEGER*)&li);
    CARB_ASSERT(b);
    CARB_UNUSED(b);
    return li.QuadPart;
}

inline uint64_t readMonotonicFreq(void) noexcept
{
    CARBWIN_LARGE_INTEGER li;
    BOOL b = QueryPerformanceFrequency((LARGE_INTEGER*)&li);
    CARB_ASSERT(b);
    CARB_UNUSED(b);
    return li.QuadPart;
}
#elif CARB_POSIX
#    if CARB_X86_64
// Non pipeline-flushing read of the x86-64 time-stamp counter using the RDTSC instruction.
// RDTSC delivers the counter as two 32-bit halves (low half in EAX, high half in EDX); the halves are captured as
// separate outputs and merged into one 64-bit value in C++.
__inline__ uint64_t readTsc(void) noexcept
{
    uint32_t low;
    uint32_t high;
    __asm__ __volatile__("rdtsc" : "=a"(low), "=d"(high));
    return (static_cast<uint64_t>(high) << 32) | static_cast<uint64_t>(low);
}

// Pipeline-flushing read of the x86-64 time-stamp counter using the RDTSCP instruction.
// RDTSCP is chosen over RDTSC because it waits for preceding instructions to complete before sampling the counter.
// Like RDTSC it reports the counter as two 32-bit halves (EAX low, EDX high), merged here into one 64-bit value.
__inline__ uint64_t readTscp(void) noexcept
{
    uint32_t low;
    uint32_t high;
    // RDTSCP additionally writes to RCX; that value is not needed, so RCX is only listed as clobbered.
    __asm__ __volatile__("rdtscp" : "=a"(low), "=d"(high) : : "%rcx");
    return (static_cast<uint64_t>(high) << 32) | static_cast<uint64_t>(low);
}
#    elif CARB_AARCH64
__inline__ uint64_t readTsc(void) noexcept
{
    // From: https://github.com/google/benchmark/blob/master/src/cycleclock.h
    // System timer of ARMv8 runs at a different frequency than the CPU's.
    // The frequency is fixed, typically in the range 1-50MHz. It can be
    // read at CNTFRQ special register. We assume the OS has set up
    // the virtual timer properly.
    uint64_t virtualTimer;
    asm volatile("mrs %0, cntvct_el0" : "=r"(virtualTimer));
    return virtualTimer;
}

// The pipeline-flushing variant on aarch64. Since aarch64 does not specify whether the counter read flushes the
// pipeline, there is no distinct implementation: this simply forwards to readTsc().
__inline__ uint64_t readTscp(void) noexcept
{
    const uint64_t counter = readTsc();
    return counter;
}
#    else
CARB_UNSUPPORTED_ARCHITECTURE();
#    endif

// Samples CLOCK_MONOTONIC and returns it in units of 10 nanoseconds (matching readMonotonicFreq()).
//
// The timespec is zero-initialized so that a failing clock_gettime() produces 0 instead of reading indeterminate
// memory, and the conversion is carried out explicitly in uint64_t rather than in the platform's (signed) time_t
// width.
inline uint64_t readMonotonic(void) noexcept
{
    constexpr uint64_t kNanosPerSecond = 1'000'000'000;
    constexpr uint64_t kNanosPerTick = 10;

    struct timespec tp = {};
    (void)clock_gettime(CLOCK_MONOTONIC, &tp);

    const uint64_t nanos = (static_cast<uint64_t>(tp.tv_sec) * kNanosPerSecond) + static_cast<uint64_t>(tp.tv_nsec);
    return nanos / kNanosPerTick;
}

// Reports the tick rate of readMonotonic(): one tick per 10 nanoseconds, i.e. 100,000,000 ticks per second.
// A 10ns resolution is sufficient for the system clock, and the smaller frequency (compared to a 1ns resolution)
// leaves less chance of overflow in computeTscFrequency().
inline uint64_t readMonotonicFreq(void) noexcept
{
    constexpr uint64_t kNanosPerSecond = 1'000'000'000;
    constexpr uint64_t kNanosPerTick = 10;
    return kNanosPerSecond / kNanosPerTick;
}
#endif

// Captures a TSC timestamp and a monotonic-clock timestamp that correspond to (as nearly as possible) the same
// instant.
//
// The two clocks are sampled interleaved in the pattern TSC, mono, TSC, mono, ..., TSC. Each monotonic sample is
// thereby bracketed by two TSC samples, which measures how long that monotonic read took in terms of the much faster
// TSC. The monotonic sample with the tightest bracket is selected, and the TSC value reported for it is the midpoint
// of its bracket.
//
// tsc:       receives the TSC value (bracket midpoint) for the selected sample.
// monotonic: receives the selected monotonic clock value (in readMonotonic() units).
inline void sampleClocks(uint64_t& tsc, uint64_t& monotonic) noexcept
{
    static constexpr int kIterations = 100; // number of (TSC, monotonic) pairs
    static constexpr int kUnroll = 4; // pairs recorded per pass of the sampling loop below

    // The sampling loop terminates on `stamp != end`; if the pair count were not a multiple of the unroll factor the
    // loop would step over `end` and write past the buffer.
    static_assert(kIterations % kUnroll == 0, "kIterations must be a multiple of the loop unroll factor");

    uint64_t stamps[kIterations * 2 + 1]; // even indices: TSC, odd indices: monotonic, plus one trailing TSC
    uint64_t* stamp = stamps;
    uint64_t* const end = stamp + (kIterations * 2);

    // Sleep so that we hopefully start with a full quanta and are less likely to context switch during this function.
    std::this_thread::sleep_for(std::chrono::milliseconds(1));

    // Interleave sampling the TSC and monotonic clocks
    while (stamp != end)
    {
        // Unrolled kUnroll (4) times to reduce loop overhead between samples
        *(stamp++) = readTscp();
        *(stamp++) = readMonotonic();
        *(stamp++) = readTscp();
        *(stamp++) = readMonotonic();
        *(stamp++) = readTscp();
        *(stamp++) = readMonotonic();
        *(stamp++) = readTscp();
        *(stamp++) = readMonotonic();
        CARB_ASSERT(stamp <= end);
    }
    // Final TSC sample closes the bracket around the last monotonic sample
    *(stamp++) = readTscp();

    // The first pair is the baseline
    uint64_t best = stamps[2] - stamps[0];
    tsc = stamps[0] + (best / 2);
    monotonic = stamps[1];

    // Look for a pair with a tighter TSC bracket among the remaining ones
    for (int i = 1; i != kIterations; ++i)
    {
        const uint64_t* const bracket = stamps + (2 * i); // bracket[0]: TSC, bracket[1]: monotonic, bracket[2]: TSC
        const uint64_t tscDiff = bracket[2] - bracket[0];
        if (tscDiff < best)
        {
            best = tscDiff;
            // Use a tsc sample midway between the two bracketing samples
            tsc = bracket[0] + (tscDiff / 2);
            monotonic = bracket[1];
        }
    }
}

inline uint64_t computeTscFrequency() noexcept
{
    // We have two clocks in two different domains. The CPU-specific TSC and the monotonic clock. We need to compute the
    // frequency of the TSC since it is not presented in any way.
    uint64_t tsc[2];
    uint64_t monotonic[2];

    // Sample our clocks and wait briefly then sample again
    sampleClocks(tsc[0], monotonic[0]);
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    sampleClocks(tsc[1], monotonic[1]);

    // This shouldn't happen, given the delay
    CARB_ASSERT(monotonic[1] != monotonic[0]);
    return ((tsc[1] - tsc[0]) * readMonotonicFreq()) / (monotonic[1] - monotonic[0]);
}

} // namespace detail

// Static interface for timing with the CPU time-stamp counter (TSC).
//
// Timestamps are taken with sample() as raw counter values. Two samples are converted to a std::chrono-style
// duration with duration(), which scales the tick difference by the measured counter frequency().
class tsc_clock
{
public:
    // Strong type for a raw counter reading, as produced by sample().
    CARB_STRONGTYPE(Sample, uint64_t);

    // Strong type for the counter frequency in ticks per second, as produced by frequency().
    CARB_STRONGTYPE(Freq, uint64_t);

    // Takes a timestamp by reading the counter with the pipeline-flushing read (detail::readTscp()).
    static Sample sample() noexcept
    {
        return Sample(detail::readTscp());
    }

    // Returns the counter frequency in ticks per second.
    // The frequency is measured by detail::computeTscFrequency() the first time this function is called and the
    // result is cached in a function-local static for all later calls.
    static Freq frequency() noexcept
    {
        static Freq freq{ detail::computeTscFrequency() };
        return freq;
    }

    // Converts the span between two samples into a duration.
    //
    // Duration: the duration type to produce; must provide `rep` and `period` members in the manner of
    //           std::chrono::duration.
    // older:    the earlier sample.
    // newer:    the later sample. If it is actually earlier than `older` the tick difference is negative.
    // Returns the tick difference scaled by (Period::den / (frequency() * Period::num)), converted to Duration::rep.
    template <class Duration>
    static Duration duration(Sample older, Sample newer) noexcept
    {
        using To = Duration;
        using Rep = typename Duration::rep;
        using Period = typename Duration::period;

        // Unsigned subtraction followed by conversion to signed yields a negative count when newer < older.
        const int64_t diff = static_cast<int64_t>(newer.get() - older.get());

        // std::ratio is compile-time, so we have to do our own computations
        using CT = std::common_type_t<Rep, int64_t, intmax_t>;

        // One tick lasts (1 / frequency) seconds. Dividing that by Period (num / den) gives the factor
        //     Period::den / (frequency * Period::num)
        // that converts a tick count into Duration units. Since the tick period's numerator is 1, the only common
        // factor that can be removed is the one shared by Period::den and the frequency.
        const intmax_t tickFreq = intmax_t(frequency().get());
        const intmax_t periodDen = Period::den;
        const intmax_t periodNum = Period::num;

        const intmax_t common = carb::cpp::gcd(periodDen, tickFreq);

        const intmax_t ratio_num = periodDen / common;
        const intmax_t ratio_den = (tickFreq / common) * periodNum; // TODO: Check for overflow

        if (ratio_num == 1 && ratio_den == 1)
            return To(Rep(diff));
        if (ratio_num != 1 && ratio_den == 1)
            return To(Rep(CT(diff) * CT(ratio_num)));
        if (ratio_num == 1 && ratio_den != 1)
            return To(Rep(CT(diff) / CT(ratio_den)));
        // Unfortunately, our frequency() is often not even numbers so the gcd() will be low. Which means that we often
        // need to multiply and divide large numbers that end up overflowing. So use double here to keep better
        // precision. As an alternative we could try to round the frequency up or down slightly, though this will impact
        // precision.
        return To(Rep(double(diff) * double(ratio_num) / double(ratio_den)));
    }
};
} // namespace time
} // namespace carb