Bit.h#
Fully qualified name: carb/cpp/Bit.h
File members: carb/cpp/Bit.h
// Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//
#pragma once
#include "TypeTraits.h"
#include "detail/ImplDummy.h"
#include "../extras/CpuInfo.h"
#include <limits>
#include <type_traits>
#if CARB_HAS_CPP20
# include <bit>
#endif
#if !defined(DOXYGEN_BUILD) && !defined(__cpp_lib_bitops)
// CARB_POPCNT is 1 if the compiler is targeting a CPU with AVX instructions or GCC reports popcnt is available. It is
// undefined at the bottom of the file.
# if defined(__AVX__) /* MSC/GCC */ || defined(__POPCNT__) /* GCC */
# define CARB_POPCNT 1
# else
# define CARB_POPCNT 0
# endif
// CARB_LZCNT is 1 if the compiler is targeting a CPU with AVX2 instructions or GCC reports lzcnt is available. It is
// undefined at the bottom of the file.
# if defined(__AVX2__) /* MSC/GCC */ || defined(__LZCNT__) /* GCC */
# define CARB_LZCNT 1
# else
# define CARB_LZCNT 0
# endif
# if CARB_COMPILER_MSC
extern "C"
{
# if CARB_AARCH64
# if _MSC_VER >= 1932 || CARB_TOOLCHAIN_CLANG
# pragma intrinsic(_CountOneBits)
# pragma intrinsic(_CountOneBits64)
unsigned int _CountOneBits(unsigned long);
unsigned int _CountOneBits64(unsigned __int64);
# else
// VS2019 is missing these intrinsics. Fixed in VS2022 Update 1
// See: https://developercommunity.visualstudio.com/t/-countonebits-intrinsics-are-not-defined-for-arm64/957551
inline unsigned int _CountOneBits64(unsigned __int64 val)
{
// Naive impl
int count = 0;
while (val != 0)
{
++count;
val = val & (val - 1);
}
return count;
}
inline unsigned int _CountOneBits(unsigned long val)
{
return _CountOneBits64(val);
}
# endif
# else
# pragma intrinsic(__popcnt)
# pragma intrinsic(__popcnt64)
unsigned int __popcnt(unsigned int value);
unsigned __int64 __popcnt64(unsigned __int64 value);
# endif
# pragma intrinsic(_BitScanReverse)
# pragma intrinsic(_BitScanReverse64)
# pragma intrinsic(_BitScanForward)
# pragma intrinsic(_BitScanForward64)
unsigned char _BitScanReverse(unsigned long* _Index, unsigned long _Mask);
unsigned char _BitScanReverse64(unsigned long* _Index, unsigned __int64 _Mask);
unsigned char _BitScanForward(unsigned long* _Index, unsigned long _Mask);
unsigned char _BitScanForward64(unsigned long* _Index, unsigned __int64 _Mask);
# if CARB_LZCNT
# pragma intrinsic(__lzcnt)
# pragma intrinsic(__lzcnt16)
# pragma intrinsic(__lzcnt64)
unsigned int __lzcnt(unsigned int);
unsigned short __lzcnt16(unsigned short);
unsigned __int64 __lzcnt64(unsigned __int64);
# endif
}
# elif CARB_COMPILER_GNUC
# else
CARB_UNSUPPORTED_PLATFORM();
# endif
#endif
namespace carb
{
namespace cpp
{
namespace detail
{
#if !defined(DOXYGEN_BUILD) && !defined(__cpp_lib_bitops)
// Naive implementation of popcnt for CPUs without built in instructions.
template <class T, std::enable_if_t<std::is_unsigned<T>::value, bool> = true>
constexpr int popCountImpl(T val) noexcept
{
int count = 0;
while (val != 0)
{
++count;
val = val & (val - 1);
}
return count;
}
// The Helper class is specialized by type and size since many intrinsics have different names for different sizes.
template <class T, size_t Size = sizeof(T)>
class Helper;
// Specialization for functions where sizeof(T) >= 1
template <class T>
class Helper<T, 1>
{
public:
static_assert(std::numeric_limits<T>::is_specialized, "Requires numeric type");
using Signed = typename std::make_signed_t<T>;
using Unsigned = typename std::make_unsigned_t<T>;
// popCount implementation for 1-4 bytes integers
static int popCount(const T& val)
{
# if CARB_COMPILER_MSC
# if !CARB_POPCNT // Skip the check if we know we have the instruction
// Only use the intrinsic if it's supported on the CPU
static extras::CpuInfo cpuInfo;
if (!cpuInfo.popcntSupported())
{
return popCountImpl((Unsigned)val);
}
else
# endif
{
# if CARB_X86_64
return (int)__popcnt((unsigned long)(Unsigned)val);
# else
return (int)_CountOneBits((unsigned long)(Unsigned)val);
# endif
}
# else
return __builtin_popcount((unsigned long)(Unsigned)val);
# endif
}
static constexpr void propagateHighBit(T& n)
{
n |= (n >> 1);
n |= (n >> 2);
n |= (n >> 4);
}
static int countl_zero(T val)
{
# if CARB_LZCNT
# if CARB_COMPILER_MSC
return int(__lzcnt16((unsigned short)(Unsigned)val)) - (16 - std::numeric_limits<T>::digits);
# else
return int(__builtin_ia32_lzcnt_u16((unsigned short)(Unsigned)val)) - (16 - std::numeric_limits<T>::digits);
# endif
# else
# if CARB_COMPILER_MSC
unsigned long index;
constexpr static int diff = std::numeric_limits<unsigned long>::digits - std::numeric_limits<T>::digits;
return _BitScanReverse(&index, (unsigned long)(Unsigned)val) ?
(std::numeric_limits<unsigned long>::digits - 1 - index - diff) :
std::numeric_limits<T>::digits;
# else
// According to docs, undefined if val == 0
return val ? __builtin_clz((unsigned int)(Unsigned)val) - (32 - std::numeric_limits<T>::digits) :
std::numeric_limits<T>::digits;
# endif
# endif
}
static int countr_zero(T val)
{
if (val == 0)
{
return std::numeric_limits<T>::digits;
}
else
{
# if CARB_COMPILER_MSC
unsigned long result;
_BitScanForward(&result, (unsigned long)(Unsigned)val);
return (int)result;
# else
return __builtin_ctz((unsigned int)(Unsigned)val);
# endif
}
}
};
// Specialization for functions where sizeof(T) >= 2
template <class T>
class Helper<T, 2> : public Helper<T, 1>
{
public:
using Base = Helper<T, 1>;
using typename Base::Signed;
using typename Base::Unsigned;
static constexpr void propagateHighBit(T& n)
{
Base::propagateHighBit(n);
n |= (n >> 8);
}
};
// Specialization for functions where sizeof(T) >= 4
template <class T>
class Helper<T, 4> : public Helper<T, 2>
{
public:
using Base = Helper<T, 2>;
using typename Base::Signed;
using typename Base::Unsigned;
static constexpr void propagateHighBit(T& n)
{
Base::propagateHighBit(n);
n |= (n >> 16);
}
# if CARB_LZCNT
# if CARB_COMPILER_MSC
static int countl_zero(T val)
{
static_assert(std::numeric_limits<T>::digits == 32, "Invalid assumption");
return int(__lzcnt((unsigned int)(Unsigned)val));
}
# else
static int countl_zero(T val)
{
static_assert(std::numeric_limits<T>::digits == 32, "Invalid assumption");
return int(__builtin_ia32_lzcnt_u32((unsigned int)(Unsigned)val));
}
# endif
# endif
};
// Specialization for functions where sizeof(T) >= 8
template <class T>
class Helper<T, 8> : public Helper<T, 4>
{
public:
using Base = Helper<T, 4>;
using typename Base::Signed;
using typename Base::Unsigned;
// popCount implementation for 8 byte integers
static int popCount(const T& val)
{
static_assert(sizeof(T) == sizeof(uint64_t), "Unexpected size");
# if CARB_COMPILER_MSC
# if !CARB_POPCNT // Skip the check if we know we have the instruction
// Only use the intrinsic if it's supported on the CPU
static extras::CpuInfo cpuInfo;
if (!cpuInfo.popcntSupported())
{
return popCountImpl((Unsigned)val);
}
else
# endif
{
# if CARB_X86_64
return (int)__popcnt64((Unsigned)val);
# else
return (int)_CountOneBits64((Unsigned)val);
# endif
}
# else
return __builtin_popcountll((Unsigned)val);
# endif
}
static constexpr void propagateHighBit(T& n)
{
Base::propagateHighBit(n);
n |= (n >> 32);
}
static int countl_zero(T val)
{
# if CARB_LZCNT
# if CARB_COMPILER_MSC
return int(__lzcnt64((Unsigned)val));
# else
return int(__builtin_ia32_lzcnt_u64((Unsigned)val));
# endif
# else
# if CARB_COMPILER_MSC
unsigned long index;
static_assert(sizeof(val) == sizeof(unsigned __int64), "Invalid assumption");
return _BitScanReverse64(&index, val) ? std::numeric_limits<T>::digits - 1 - index :
std::numeric_limits<T>::digits;
# else
// According to docs, undefined if val == 0
return val ? __builtin_clzll((Unsigned)val) : std::numeric_limits<T>::digits;
# endif
# endif
}
static int countr_zero(T val)
{
if (val == 0)
{
return std::numeric_limits<T>::digits;
}
else
{
# if CARB_COMPILER_MSC
unsigned long result;
_BitScanForward64(&result, (Unsigned)val);
return (int)result;
# else
return __builtin_ctzll((Unsigned)val);
# endif
}
}
};
#endif
template <class T>
using IsUnsignedIntegral = std::conjunction<std::is_unsigned<T>, std::is_integral<T>>;
template <class T>
std::enable_if_t<IsUnsignedIntegral<T>::value, unsigned> log2(const T in) noexcept
{
CARB_ASSERT(in > 0, "Undefined");
#if CARB_COMPILER_MSC
unsigned long out;
_BitScanReverse64(&out, (uint64_t)in);
return out;
#elif CARB_COMPILER_GNUC
return 63 ^ __builtin_clzll(in);
#else
CARB_UNSUPPORTED_COMPILER()
#endif
}
inline uint8_t reverse_byte(uint8_t in) noexcept
{
// clang-format off
constexpr static uint8_t reversed[] = {
0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};
// clang-format on
return reversed[in];
}
template <class T>
std::enable_if_t<IsUnsignedIntegral<T>::value, T> reverse_bits(const T in) noexcept
{
T out;
auto pin = reinterpret_cast<const uint8_t*>(&in);
auto pout = reinterpret_cast<uint8_t*>(&out);
for (ptrdiff_t i = sizeof(T) - 1; i >= 0; --i)
pout[i] = reverse_byte(pin[(sizeof(T) - 1) - i]);
return out;
}
template <class T>
std::enable_if_t<IsUnsignedIntegral<T>::value, T> reverse_n_bits(const T in, std::size_t n) noexcept
{
CARB_ASSERT(n != 0, "Undefined behavior");
return reverse_bits(in) >> (std::numeric_limits<T>::digits - n);
}
} // namespace detail
#if defined(DOXYGEN_BUILD)
enum class endian
{
little = 0,
big = 1,
native = little
};
#elif defined(__cpp_lib_endian)
using endian = std::endian;
#else
enum class endian
{
# if CARB_COMPILER_GNUC
little = __ORDER_LITTLE_ENDIAN__,
big = __ORDER_BIG_ENDIAN__,
native = __BYTE_ORDER__
# else
little = 0,
big = 1,
# if CARB_X86_64 || CARB_AARCH64
native = little
# else
CARB_UNSUPPORTED_PLATFORM();
# endif
# endif
};
#endif
#if CARB_COMPILER_MSC && _MSC_VER >= 1926
// VS 2019 16.6 has it https://github.com/microsoft/STL/issues/22#issuecomment-571378845
# define CARBLOCAL_HAS_BUILTIN_BIT_CAST 1
#elif defined(__has_builtin) && !defined(NV_SONAR_BUILD)
# if __has_builtin(__builtin_bit_cast) // most compilers don't like this combined with previous
# define CARBLOCAL_HAS_BUILTIN_BIT_CAST 1
# endif
#endif
#if !defined(DOXYGEN_BUILD) && defined(__cpp_lib_bit_cast)
using std::bit_cast;
#elif !defined(DOXYGEN_BUILD) && defined(CARBLOCAL_HAS_BUILTIN_BIT_CAST)
template <class To,
class From,
std::enable_if_t<std::conjunction<std::bool_constant<sizeof(To) == sizeof(From)>,
std::is_trivially_copyable<To>,
std::is_trivially_copyable<From>>::value,
bool> = false>
constexpr To bit_cast(const From& src) noexcept
{
return __builtin_bit_cast(To, src);
}
#else
template <class To,
class From CARB_NO_DOC(,
std::enable_if_t<std::conjunction<std::bool_constant<sizeof(To) == sizeof(From)>,
std::is_trivially_copyable<To>,
std::is_trivially_copyable<From>>::value,
bool> = false)>
To bit_cast(const From& src) noexcept // requires compiler support to be constexpr
{
// This union allows us to bypass `dest`'s constructor and just trivially copy into it.
union
{
detail::NontrivialDummyType dummy;
To dest;
} u = {};
std::memcpy(reinterpret_cast<void*>(&u.dest), &src, sizeof(From));
return u.dest;
}
#endif
#undef CARBLOCAL_HAS_BUILTIN_BIT_CAST
#if defined(__cpp_lib_bitops) && !defined(DOXYGEN_BUILD)
// Use C++20 definitions
using std::bit_ceil;
using std::bit_floor;
using std::bit_width;
using std::countl_one;
using std::countl_zero;
using std::countr_one;
using std::countr_zero;
using std::has_single_bit;
using std::popcount;
// Wrapper implementations guaranteed to be constexpr
template <class T>
constexpr int popcount_constexpr(T val) noexcept
{
return std::popcount(val);
}
template <class T>
constexpr int countl_zero_constexpr(T val) noexcept
{
return std::countl_zero(val);
}
template <class T>
constexpr int countr_zero_constexpr(T val) noexcept
{
return std::countr_zero(val);
}
template <class T>
constexpr int countl_one_constexpr(T val) noexcept
{
return std::countl_one(val);
}
template <class T>
constexpr int countr_one_constexpr(T val) noexcept
{
return std::countr_one(val);
}
template <class T>
constexpr auto bit_width_constexpr(T val) noexcept
{
return std::bit_width(val);
}
#else
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr bool has_single_bit(T val) noexcept
{
return val != T(0) && (val & (val - 1)) == T(0);
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr T bit_ceil(T val) noexcept
{
if (val <= 1)
return T(1);
// Yes, this could be implemented with a `nlz` instruction but cannot be constexpr without compiler support.
--val;
detail::Helper<T>::propagateHighBit(val);
++val;
return val;
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr T bit_floor(T val) noexcept
{
// Yes, this could be implemented with a `nlz` instruction but cannot be constexpr without compiler support.
detail::Helper<T>::propagateHighBit(val);
return val - (val >> 1);
}
template <class T, std::enable_if_t<std::is_unsigned<T>::value, bool> = true>
int popcount(T val) noexcept
{
return detail::Helper<T>::popCount(val);
}
template <class T, std::enable_if_t<std::is_unsigned<T>::value, bool> = true>
constexpr int popcount_constexpr(T val) noexcept
{
return detail::popCountImpl(val);
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
int countl_zero(T val) noexcept
{
return detail::Helper<T>::countl_zero(val);
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr int countl_zero_constexpr(T val) noexcept
{
if (val == 0)
return std::numeric_limits<T>::digits;
unsigned zeros = 0;
for (T shift = std::numeric_limits<T>::digits >> 1; shift; shift >>= 1)
{
T temp = val >> shift;
if (temp)
val = temp;
else
zeros |= shift;
}
return int(zeros);
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
int countr_zero(T val) noexcept
{
return detail::Helper<T>::countr_zero(val);
}
# pragma push_macro("max")
# undef max
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr int countr_zero_constexpr(T val) noexcept
{
if (val == 0)
return std::numeric_limits<T>::digits;
if (val & 1)
return 0;
int zeros = 0;
T shift = std::numeric_limits<T>::digits >> 1;
T mask = std::numeric_limits<T>::max() >> shift;
while (shift)
{
if (!(val & mask))
{
val >>= shift;
zeros |= shift;
}
shift >>= 1;
mask >>= shift;
}
return zeros;
}
# pragma pop_macro("max")
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
int countl_one(T val) noexcept
{
return detail::Helper<T>::countl_zero(T(~val));
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr int countl_one_constexpr(T val) noexcept
{
return countl_zero_constexpr(T(~val));
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
int countr_one(T val) noexcept
{
return detail::Helper<T>::countr_zero(T(~val));
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr int countr_one_constexpr(T val) noexcept
{
return countr_zero_constexpr(T(~val));
}
template <class T>
int bit_width(T val) noexcept
{
return std::numeric_limits<T>::digits - countl_zero(val);
}
template <class T>
constexpr int bit_width_constexpr(T val) noexcept
{
return std::numeric_limits<T>::digits - countl_zero_constexpr(val);
}
#endif
} // namespace cpp
} // namespace carb
#undef CARB_LZCNT
#undef CARB_POPCNT