carb/cpp/Bit.h

File members: carb/cpp/Bit.h

// Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//

#pragma once

#include "TypeTraits.h"
#include "detail/ImplDummy.h"
#include "../extras/CpuInfo.h"

#if CARB_HAS_CPP20
#    include <bit>
#endif

#if !defined(DOXYGEN_BUILD) && !defined(__cpp_lib_bitops)

// CARB_POPCNT is 1 if the compiler is targeting a CPU with AVX instructions or GCC reports popcnt is available. It is
// undefined at the bottom of the file.
#    if defined(__AVX__) /* MSC/GCC */ || defined(__POPCNT__) /* GCC */
#        define CARB_POPCNT 1
#    else
#        define CARB_POPCNT 0
#    endif

// CARB_LZCNT is 1 if the compiler is targeting a CPU with AVX2 instructions or GCC reports lzcnt is available. It is
// undefined at the bottom of the file.
#    if defined(__AVX2__) /* MSC/GCC */ || defined(__LZCNT__) /* GCC */
#        define CARB_LZCNT 1
#    else
#        define CARB_LZCNT 0
#    endif

#    if CARB_COMPILER_MSC
extern "C"
{
    unsigned int __popcnt(unsigned int value);
    unsigned __int64 __popcnt64(unsigned __int64 value);
    unsigned char _BitScanReverse(unsigned long* _Index, unsigned long _Mask);
    unsigned char _BitScanReverse64(unsigned long* _Index, unsigned __int64 _Mask);
    unsigned char _BitScanForward(unsigned long* _Index, unsigned long _Mask);
    unsigned char _BitScanForward64(unsigned long* _Index, unsigned __int64 _Mask);
#        if CARB_LZCNT
    unsigned int __lzcnt(unsigned int);
    unsigned short __lzcnt16(unsigned short);
    unsigned __int64 __lzcnt64(unsigned __int64);
#        endif
}

#        pragma intrinsic(__popcnt)
#        pragma intrinsic(__popcnt64)
#        pragma intrinsic(_BitScanReverse)
#        pragma intrinsic(_BitScanReverse64)
#        pragma intrinsic(_BitScanForward)
#        pragma intrinsic(_BitScanForward64)
#        if CARB_LZCNT
#            pragma intrinsic(__lzcnt)
#            pragma intrinsic(__lzcnt16)
#            pragma intrinsic(__lzcnt64)
#        endif
#    elif CARB_COMPILER_GNUC
#    else
CARB_UNSUPPORTED_PLATFORM();
#    endif
#endif

namespace carb
{
namespace cpp
{

#if !defined(DOXYGEN_BUILD) && !defined(__cpp_lib_bitops)
namespace detail
{

// Naive implementation of popcnt for CPUs without built in instructions.
template <class T, std::enable_if_t<std::is_unsigned<T>::value, bool> = true>
constexpr int popCountImpl(T val) noexcept
{
    int count = 0;
    while (val != 0)
    {
        ++count;
        val = val & (val - 1);
    }
    return count;
}

// The Helper class is specialized by type and size since many intrinsics have different names for different sizes.
template <class T, size_t Size = sizeof(T)>
class Helper;

// Specialization for functions where sizeof(T) >= 1
template <class T>
class Helper<T, 1>
{
public:
    static_assert(std::numeric_limits<T>::is_specialized, "Requires numeric type");
    using Signed = typename std::make_signed_t<T>;
    using Unsigned = typename std::make_unsigned_t<T>;
    // popCount implementation for 1-4 bytes integers
    static int popCount(const T& val)
    {
#    if CARB_COMPILER_MSC
#        if !CARB_POPCNT // Skip the check if we know we have the instruction
        // Only use the intrinsic if it's supported on the CPU
        static extras::CpuInfo cpuInfo;
        if (!cpuInfo.popcntSupported())
        {
            return popCountImpl((Unsigned)val);
        }
        else
#        endif
        {
            return (int)__popcnt((unsigned long)(Unsigned)val);
        }
#    else
        return __builtin_popcount((unsigned long)(Unsigned)val);
#    endif
    }
    static constexpr void propagateHighBit(T& n)
    {
        n |= (n >> 1);
        n |= (n >> 2);
        n |= (n >> 4);
    }

    static int countl_zero(T val)
    {
#    if CARB_LZCNT
#        if CARB_COMPILER_MSC
        return int(__lzcnt16((unsigned short)(Unsigned)val)) - (16 - std::numeric_limits<T>::digits);
#        else
        return int(__builtin_ia32_lzcnt_u16((unsigned short)(Unsigned)val)) - (16 - std::numeric_limits<T>::digits);
#        endif
#    else
#        if CARB_COMPILER_MSC
        unsigned long index;
        constexpr static int diff = std::numeric_limits<unsigned long>::digits - std::numeric_limits<T>::digits;
        return _BitScanReverse(&index, (unsigned long)(Unsigned)val) ?
                   (std::numeric_limits<unsigned long>::digits - 1 - index - diff) :
                   std::numeric_limits<T>::digits;
#        else
        // According to docs, undefined if val == 0
        return val ? __builtin_clz((unsigned int)(Unsigned)val) - (32 - std::numeric_limits<T>::digits) :
                     std::numeric_limits<T>::digits;
#        endif
#    endif
    }

    static int countr_zero(T val)
    {
        if (val == 0)
        {
            return std::numeric_limits<T>::digits;
        }
        else
        {
#    if CARB_COMPILER_MSC
            unsigned long result;
            _BitScanForward(&result, (unsigned long)(Unsigned)val);
            return (int)result;
#    else
            return __builtin_ctz((unsigned int)(Unsigned)val);
#    endif
        }
    }
};

// Specialization for functions where sizeof(T) >= 2
template <class T>
class Helper<T, 2> : public Helper<T, 1>
{
public:
    using Base = Helper<T, 1>;
    using typename Base::Signed;
    using typename Base::Unsigned;
    static constexpr void propagateHighBit(T& n)
    {
        Base::propagateHighBit(n);
        n |= (n >> 8);
    }
};

// Specialization for functions where sizeof(T) >= 4
template <class T>
class Helper<T, 4> : public Helper<T, 2>
{
public:
    using Base = Helper<T, 2>;
    using typename Base::Signed;
    using typename Base::Unsigned;
    static constexpr void propagateHighBit(T& n)
    {
        Base::propagateHighBit(n);
        n |= (n >> 16);
    }

#    if CARB_LZCNT
#        if CARB_COMPILER_MSC
    static int countl_zero(T val)
    {
        static_assert(std::numeric_limits<T>::digits == 32, "Invalid assumption");
        return int(__lzcnt((unsigned int)(Unsigned)val));
    }
#        else
    static int countl_zero(T val)
    {
        static_assert(std::numeric_limits<T>::digits == 32, "Invalid assumption");
        return int(__builtin_ia32_lzcnt_u32((unsigned int)(Unsigned)val));
    }
#        endif
#    endif
};

// Specialization for functions where sizeof(T) >= 8
template <class T>
class Helper<T, 8> : public Helper<T, 4>
{
public:
    using Base = Helper<T, 4>;
    using typename Base::Signed;
    using typename Base::Unsigned;

    // popCount implementation for 8 byte integers
    static int popCount(const T& val)
    {
        static_assert(sizeof(T) == sizeof(uint64_t), "Unexpected size");
#    if CARB_COMPILER_MSC
#        if !CARB_POPCNT // Skip the check if we know we have the instruction
        // Only use the intrinsic if it's supported on the CPU
        static extras::CpuInfo cpuInfo;
        if (!cpuInfo.popcntSupported())
        {
            return popCountImpl((Unsigned)val);
        }
        else
#        endif
        {
            return (int)__popcnt64((Unsigned)val);
        }
#    else
        return __builtin_popcountll((Unsigned)val);
#    endif
    }
    static constexpr void propagateHighBit(T& n)
    {
        Base::propagateHighBit(n);
        n |= (n >> 32);
    }

    static int countl_zero(T val)
    {
#    if CARB_LZCNT
#        if CARB_COMPILER_MSC
        return int(__lzcnt64((Unsigned)val));
#        else
        return int(__builtin_ia32_lzcnt_u64((Unsigned)val));
#        endif
#    else
#        if CARB_COMPILER_MSC
        unsigned long index;
        static_assert(sizeof(val) == sizeof(unsigned __int64), "Invalid assumption");
        return _BitScanReverse64(&index, val) ? std::numeric_limits<T>::digits - 1 - index :
                                                std::numeric_limits<T>::digits;
#        else
        // According to docs, undefined if val == 0
        return val ? __builtin_clzll((Unsigned)val) : std::numeric_limits<T>::digits;
#        endif
#    endif
    }

    static int countr_zero(T val)
    {
        if (val == 0)
        {
            return std::numeric_limits<T>::digits;
        }
        else
        {
#    if CARB_COMPILER_MSC
            unsigned long result;
            _BitScanForward64(&result, (Unsigned)val);
            return (int)result;
#    else
            return __builtin_ctzll((Unsigned)val);
#    endif
        }
    }
};

template <class T>
using IsUnsignedIntegral = cpp::conjunction<std::is_unsigned<T>, std::is_integral<T>>;

} // namespace detail
#endif

#if defined(DOXYGEN_BUILD)
enum class endian
{
    little = 0,
    big = 1,
    native = little
};
#elif defined(__cpp_lib_endian)
using endian = std::endian;
#else
enum class endian
{
#    if CARB_COMPILER_GNUC
    little = __ORDER_LITTLE_ENDIAN__,
    big = __ORDER_BIG_ENDIAN__,
    native = __BYTE_ORDER__
#    else
    little = 0,
    big = 1,
#        if CARB_X86_64 || CARB_AARCH64
    native = little
#        else
    CARB_UNSUPPORTED_PLATFORM();
#        endif
#    endif
};
#endif

#if CARB_COMPILER_MSC && _MSC_VER >= 1926
// VS 2019 16.6 has it https://github.com/microsoft/STL/issues/22#issuecomment-571378845
#    define CARBLOCAL_HAS_BUILTIN_BIT_CAST 1
#elif defined(__has_builtin)
#    if __has_builtin(__builtin_bit_cast) // most compilers don't like this combined with previous
#        define CARBLOCAL_HAS_BUILTIN_BIT_CAST 1
#    endif
#endif

#if !defined(DOXYGEN_BUILD) && defined(__cpp_lib_bit_cast)
using std::bit_cast;
#elif !defined(DOXYGEN_BUILD) && defined(CARBLOCAL_HAS_BUILTIN_BIT_CAST)
template <class To,
          class From,
          std::enable_if_t<cpp::conjunction<cpp::bool_constant<sizeof(To) == sizeof(From)>,
                                            std::is_trivially_copyable<To>,
                                            std::is_trivially_copyable<From>>::value,
                           bool> = false>
constexpr To bit_cast(const From& src) noexcept
{
    return __builtin_bit_cast(To, src);
}
#else
template <class To,
          class From CARB_NO_DOC(,
                                 std::enable_if_t<cpp::conjunction<cpp::bool_constant<sizeof(To) == sizeof(From)>,
                                                                   std::is_trivially_copyable<To>,
                                                                   std::is_trivially_copyable<From>>::value,
                                                  bool> = false)>
To bit_cast(const From& src) noexcept // requires compiler support to be constexpr
{
    // This union allows us to bypass `dest`'s constructor and just trivially copy into it.
    union
    {
        detail::NontrivialDummyType dummy;
        To dest;
    } u = {};
    std::memcpy(reinterpret_cast<void*>(&u.dest), &src, sizeof(From));
    return u.dest;
}
#endif
#undef CARBLOCAL_HAS_BUILTIN_BIT_CAST

#if defined(__cpp_lib_bitops) && !defined(DOXYGEN_BUILD)
// Use C++20 definitions
using std::bit_ceil;
using std::bit_floor;
using std::countl_one;
using std::countl_zero;
using std::countr_one;
using std::countr_zero;
using std::has_single_bit;
using std::popcount;

// Wrapper implementations guaranteed to be constexpr
template <class T>
constexpr int popcount_constexpr(T val) noexcept
{
    return std::popcount(val);
}

template <class T>
constexpr int countl_zero_constexpr(T val) noexcept
{
    return std::countl_zero(val);
}

template <class T>
constexpr int countr_zero_constexpr(T val) noexcept
{
    return std::countr_zero(val);
}

template <class T>
constexpr int countl_one_constexpr(T val) noexcept
{
    return std::countl_one(val);
}

template <class T>
constexpr int countr_one_constexpr(T val) noexcept
{
    return std::countr_one(val);
}
#else
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr bool has_single_bit(T val) noexcept
{
    return val != T(0) && (val & (val - 1)) == T(0);
}

template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr T bit_ceil(T val) noexcept
{
    if (val <= 1)
        return T(1);

    // Yes, this could be implemented with a `nlz` instruction but cannot be constexpr without compiler support.
    --val;
    detail::Helper<T>::propagateHighBit(val);
    ++val;
    return val;
}

template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr T bit_floor(T val) noexcept
{
    // Yes, this could be implemented with a `nlz` instruction but cannot be constexpr without compiler support.
    detail::Helper<T>::propagateHighBit(val);
    return val - (val >> 1);
}

template <class T, std::enable_if_t<std::is_unsigned<T>::value, bool> = true>
int popcount(T val) noexcept
{
    return detail::Helper<T>::popCount(val);
}

template <class T, std::enable_if_t<std::is_unsigned<T>::value, bool> = true>
constexpr int popcount_constexpr(T val) noexcept
{
    return detail::popCountImpl(val);
}

template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
int countl_zero(T val) noexcept
{
    return detail::Helper<T>::countl_zero(val);
}

template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr int countl_zero_constexpr(T val) noexcept
{
    if (val == 0)
        return std::numeric_limits<T>::digits;

    unsigned zeros = 0;
    for (T shift = std::numeric_limits<T>::digits >> 1; shift; shift >>= 1)
    {
        T temp = val >> shift;
        if (temp)
            val = temp;
        else
            zeros |= shift;
    }
    return int(zeros);
}

template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
int countr_zero(T val) noexcept
{
    return detail::Helper<T>::countr_zero(val);
}

#    pragma push_macro("max")
#    undef max

template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr int countr_zero_constexpr(T val) noexcept
{
    if (val == 0)
        return std::numeric_limits<T>::digits;
    if (val & 1)
        return 0;

    int zeros = 0;
    T shift = std::numeric_limits<T>::digits >> 1;
    T mask = std::numeric_limits<T>::max() >> shift;
    while (shift)
    {
        if (!(val & mask))
        {
            val >>= shift;
            zeros |= shift;
        }
        shift >>= 1;
        mask >>= shift;
    }
    return zeros;
}

#    pragma pop_macro("max")

template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
int countl_one(T val) noexcept
{
    return detail::Helper<T>::countl_zero(T(~val));
}

template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr int countl_one_constexpr(T val) noexcept
{
    return countl_zero_constexpr(T(~val));
}

template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
int countr_one(T val) noexcept
{
    return detail::Helper<T>::countr_zero(T(~val));
}

template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr int countr_one_constexpr(T val) noexcept
{
    return countr_zero_constexpr(T(~val));
}
#endif

} // namespace cpp
} // namespace carb

#undef CARB_LZCNT
#undef CARB_POPCNT