carb/cpp/Bit.h
File members: carb/cpp/Bit.h
// Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//
#pragma once
#include "TypeTraits.h"
#include "detail/ImplDummy.h"
#include "../extras/CpuInfo.h"
#if CARB_HAS_CPP20
# include <bit>
#endif
#if !defined(DOXYGEN_BUILD) && !defined(__cpp_lib_bitops)
// CARB_POPCNT is 1 if the compiler is targeting a CPU with AVX instructions or GCC reports popcnt is available. It is
// undefined at the bottom of the file.
# if defined(__AVX__) /* MSC/GCC */ || defined(__POPCNT__) /* GCC */
# define CARB_POPCNT 1
# else
# define CARB_POPCNT 0
# endif
// CARB_LZCNT is 1 if the compiler is targeting a CPU with AVX2 instructions or GCC reports lzcnt is available. It is
// undefined at the bottom of the file.
# if defined(__AVX2__) /* MSC/GCC */ || defined(__LZCNT__) /* GCC */
# define CARB_LZCNT 1
# else
# define CARB_LZCNT 0
# endif
# if CARB_COMPILER_MSC
extern "C"
{
unsigned int __popcnt(unsigned int value);
unsigned __int64 __popcnt64(unsigned __int64 value);
unsigned char _BitScanReverse(unsigned long* _Index, unsigned long _Mask);
unsigned char _BitScanReverse64(unsigned long* _Index, unsigned __int64 _Mask);
unsigned char _BitScanForward(unsigned long* _Index, unsigned long _Mask);
unsigned char _BitScanForward64(unsigned long* _Index, unsigned __int64 _Mask);
# if CARB_LZCNT
unsigned int __lzcnt(unsigned int);
unsigned short __lzcnt16(unsigned short);
unsigned __int64 __lzcnt64(unsigned __int64);
# endif
}
# pragma intrinsic(__popcnt)
# pragma intrinsic(__popcnt64)
# pragma intrinsic(_BitScanReverse)
# pragma intrinsic(_BitScanReverse64)
# pragma intrinsic(_BitScanForward)
# pragma intrinsic(_BitScanForward64)
# if CARB_LZCNT
# pragma intrinsic(__lzcnt)
# pragma intrinsic(__lzcnt16)
# pragma intrinsic(__lzcnt64)
# endif
# elif CARB_COMPILER_GNUC
# else
CARB_UNSUPPORTED_PLATFORM();
# endif
#endif
namespace carb
{
namespace cpp
{
#if !defined(DOXYGEN_BUILD) && !defined(__cpp_lib_bitops)
namespace detail
{
// Naive implementation of popcnt for CPUs without built in instructions.
template <class T, std::enable_if_t<std::is_unsigned<T>::value, bool> = true>
constexpr int popCountImpl(T val) noexcept
{
int count = 0;
while (val != 0)
{
++count;
val = val & (val - 1);
}
return count;
}
// The Helper class is specialized by type and size since many intrinsics have different names for different sizes.
template <class T, size_t Size = sizeof(T)>
class Helper;
// Specialization for functions where sizeof(T) >= 1
template <class T>
class Helper<T, 1>
{
public:
static_assert(std::numeric_limits<T>::is_specialized, "Requires numeric type");
using Signed = typename std::make_signed_t<T>;
using Unsigned = typename std::make_unsigned_t<T>;
// popCount implementation for 1-4 bytes integers
static int popCount(const T& val)
{
# if CARB_COMPILER_MSC
# if !CARB_POPCNT // Skip the check if we know we have the instruction
// Only use the intrinsic if it's supported on the CPU
static extras::CpuInfo cpuInfo;
if (!cpuInfo.popcntSupported())
{
return popCountImpl((Unsigned)val);
}
else
# endif
{
return (int)__popcnt((unsigned long)(Unsigned)val);
}
# else
return __builtin_popcount((unsigned long)(Unsigned)val);
# endif
}
static constexpr void propagateHighBit(T& n)
{
n |= (n >> 1);
n |= (n >> 2);
n |= (n >> 4);
}
static int countl_zero(T val)
{
# if CARB_LZCNT
# if CARB_COMPILER_MSC
return int(__lzcnt16((unsigned short)(Unsigned)val)) - (16 - std::numeric_limits<T>::digits);
# else
return int(__builtin_ia32_lzcnt_u16((unsigned short)(Unsigned)val)) - (16 - std::numeric_limits<T>::digits);
# endif
# else
# if CARB_COMPILER_MSC
unsigned long index;
constexpr static int diff = std::numeric_limits<unsigned long>::digits - std::numeric_limits<T>::digits;
return _BitScanReverse(&index, (unsigned long)(Unsigned)val) ?
(std::numeric_limits<unsigned long>::digits - 1 - index - diff) :
std::numeric_limits<T>::digits;
# else
// According to docs, undefined if val == 0
return val ? __builtin_clz((unsigned int)(Unsigned)val) - (32 - std::numeric_limits<T>::digits) :
std::numeric_limits<T>::digits;
# endif
# endif
}
static int countr_zero(T val)
{
if (val == 0)
{
return std::numeric_limits<T>::digits;
}
else
{
# if CARB_COMPILER_MSC
unsigned long result;
_BitScanForward(&result, (unsigned long)(Unsigned)val);
return (int)result;
# else
return __builtin_ctz((unsigned int)(Unsigned)val);
# endif
}
}
};
// Specialization for functions where sizeof(T) >= 2
template <class T>
class Helper<T, 2> : public Helper<T, 1>
{
public:
using Base = Helper<T, 1>;
using typename Base::Signed;
using typename Base::Unsigned;
static constexpr void propagateHighBit(T& n)
{
Base::propagateHighBit(n);
n |= (n >> 8);
}
};
// Specialization for functions where sizeof(T) >= 4
template <class T>
class Helper<T, 4> : public Helper<T, 2>
{
public:
using Base = Helper<T, 2>;
using typename Base::Signed;
using typename Base::Unsigned;
static constexpr void propagateHighBit(T& n)
{
Base::propagateHighBit(n);
n |= (n >> 16);
}
# if CARB_LZCNT
# if CARB_COMPILER_MSC
static int countl_zero(T val)
{
static_assert(std::numeric_limits<T>::digits == 32, "Invalid assumption");
return int(__lzcnt((unsigned int)(Unsigned)val));
}
# else
static int countl_zero(T val)
{
static_assert(std::numeric_limits<T>::digits == 32, "Invalid assumption");
return int(__builtin_ia32_lzcnt_u32((unsigned int)(Unsigned)val));
}
# endif
# endif
};
// Specialization for functions where sizeof(T) >= 8
template <class T>
class Helper<T, 8> : public Helper<T, 4>
{
public:
using Base = Helper<T, 4>;
using typename Base::Signed;
using typename Base::Unsigned;
// popCount implementation for 8 byte integers
static int popCount(const T& val)
{
static_assert(sizeof(T) == sizeof(uint64_t), "Unexpected size");
# if CARB_COMPILER_MSC
# if !CARB_POPCNT // Skip the check if we know we have the instruction
// Only use the intrinsic if it's supported on the CPU
static extras::CpuInfo cpuInfo;
if (!cpuInfo.popcntSupported())
{
return popCountImpl((Unsigned)val);
}
else
# endif
{
return (int)__popcnt64((Unsigned)val);
}
# else
return __builtin_popcountll((Unsigned)val);
# endif
}
static constexpr void propagateHighBit(T& n)
{
Base::propagateHighBit(n);
n |= (n >> 32);
}
static int countl_zero(T val)
{
# if CARB_LZCNT
# if CARB_COMPILER_MSC
return int(__lzcnt64((Unsigned)val));
# else
return int(__builtin_ia32_lzcnt_u64((Unsigned)val));
# endif
# else
# if CARB_COMPILER_MSC
unsigned long index;
static_assert(sizeof(val) == sizeof(unsigned __int64), "Invalid assumption");
return _BitScanReverse64(&index, val) ? std::numeric_limits<T>::digits - 1 - index :
std::numeric_limits<T>::digits;
# else
// According to docs, undefined if val == 0
return val ? __builtin_clzll((Unsigned)val) : std::numeric_limits<T>::digits;
# endif
# endif
}
static int countr_zero(T val)
{
if (val == 0)
{
return std::numeric_limits<T>::digits;
}
else
{
# if CARB_COMPILER_MSC
unsigned long result;
_BitScanForward64(&result, (Unsigned)val);
return (int)result;
# else
return __builtin_ctzll((Unsigned)val);
# endif
}
}
};
template <class T>
using IsUnsignedIntegral = cpp::conjunction<std::is_unsigned<T>, std::is_integral<T>>;
} // namespace detail
#endif
#if defined(DOXYGEN_BUILD)
enum class endian
{
little = 0,
big = 1,
native = little
};
#elif defined(__cpp_lib_endian)
using endian = std::endian;
#else
enum class endian
{
# if CARB_COMPILER_GNUC
little = __ORDER_LITTLE_ENDIAN__,
big = __ORDER_BIG_ENDIAN__,
native = __BYTE_ORDER__
# else
little = 0,
big = 1,
# if CARB_X86_64 || CARB_AARCH64
native = little
# else
CARB_UNSUPPORTED_PLATFORM();
# endif
# endif
};
#endif
#if CARB_COMPILER_MSC && _MSC_VER >= 1926
// VS 2019 16.6 has it https://github.com/microsoft/STL/issues/22#issuecomment-571378845
# define CARBLOCAL_HAS_BUILTIN_BIT_CAST 1
#elif defined(__has_builtin)
# if __has_builtin(__builtin_bit_cast) // most compilers don't like this combined with previous
# define CARBLOCAL_HAS_BUILTIN_BIT_CAST 1
# endif
#endif
#if !defined(DOXYGEN_BUILD) && defined(__cpp_lib_bit_cast)
using std::bit_cast;
#elif !defined(DOXYGEN_BUILD) && defined(CARBLOCAL_HAS_BUILTIN_BIT_CAST)
template <class To,
class From,
std::enable_if_t<cpp::conjunction<cpp::bool_constant<sizeof(To) == sizeof(From)>,
std::is_trivially_copyable<To>,
std::is_trivially_copyable<From>>::value,
bool> = false>
constexpr To bit_cast(const From& src) noexcept
{
return __builtin_bit_cast(To, src);
}
#else
template <class To,
class From CARB_NO_DOC(,
std::enable_if_t<cpp::conjunction<cpp::bool_constant<sizeof(To) == sizeof(From)>,
std::is_trivially_copyable<To>,
std::is_trivially_copyable<From>>::value,
bool> = false)>
To bit_cast(const From& src) noexcept // requires compiler support to be constexpr
{
// This union allows us to bypass `dest`'s constructor and just trivially copy into it.
union
{
detail::NontrivialDummyType dummy;
To dest;
} u = {};
std::memcpy(reinterpret_cast<void*>(&u.dest), &src, sizeof(From));
return u.dest;
}
#endif
#undef CARBLOCAL_HAS_BUILTIN_BIT_CAST
#if defined(__cpp_lib_bitops) && !defined(DOXYGEN_BUILD)
// Use C++20 definitions
using std::bit_ceil;
using std::bit_floor;
using std::countl_one;
using std::countl_zero;
using std::countr_one;
using std::countr_zero;
using std::has_single_bit;
using std::popcount;
// Wrapper implementations guaranteed to be constexpr
template <class T>
constexpr int popcount_constexpr(T val) noexcept
{
return std::popcount(val);
}
template <class T>
constexpr int countl_zero_constexpr(T val) noexcept
{
return std::countl_zero(val);
}
template <class T>
constexpr int countr_zero_constexpr(T val) noexcept
{
return std::countr_zero(val);
}
template <class T>
constexpr int countl_one_constexpr(T val) noexcept
{
return std::countl_one(val);
}
template <class T>
constexpr int countr_one_constexpr(T val) noexcept
{
return std::countr_one(val);
}
#else
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr bool has_single_bit(T val) noexcept
{
return val != T(0) && (val & (val - 1)) == T(0);
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr T bit_ceil(T val) noexcept
{
if (val <= 1)
return T(1);
// Yes, this could be implemented with a `nlz` instruction but cannot be constexpr without compiler support.
--val;
detail::Helper<T>::propagateHighBit(val);
++val;
return val;
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr T bit_floor(T val) noexcept
{
// Yes, this could be implemented with a `nlz` instruction but cannot be constexpr without compiler support.
detail::Helper<T>::propagateHighBit(val);
return val - (val >> 1);
}
template <class T, std::enable_if_t<std::is_unsigned<T>::value, bool> = true>
int popcount(T val) noexcept
{
return detail::Helper<T>::popCount(val);
}
template <class T, std::enable_if_t<std::is_unsigned<T>::value, bool> = true>
constexpr int popcount_constexpr(T val) noexcept
{
return detail::popCountImpl(val);
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
int countl_zero(T val) noexcept
{
return detail::Helper<T>::countl_zero(val);
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr int countl_zero_constexpr(T val) noexcept
{
if (val == 0)
return std::numeric_limits<T>::digits;
unsigned zeros = 0;
for (T shift = std::numeric_limits<T>::digits >> 1; shift; shift >>= 1)
{
T temp = val >> shift;
if (temp)
val = temp;
else
zeros |= shift;
}
return int(zeros);
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
int countr_zero(T val) noexcept
{
return detail::Helper<T>::countr_zero(val);
}
# pragma push_macro("max")
# undef max
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr int countr_zero_constexpr(T val) noexcept
{
if (val == 0)
return std::numeric_limits<T>::digits;
if (val & 1)
return 0;
int zeros = 0;
T shift = std::numeric_limits<T>::digits >> 1;
T mask = std::numeric_limits<T>::max() >> shift;
while (shift)
{
if (!(val & mask))
{
val >>= shift;
zeros |= shift;
}
shift >>= 1;
mask >>= shift;
}
return zeros;
}
# pragma pop_macro("max")
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
int countl_one(T val) noexcept
{
return detail::Helper<T>::countl_zero(T(~val));
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr int countl_one_constexpr(T val) noexcept
{
return countl_zero_constexpr(T(~val));
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
int countr_one(T val) noexcept
{
return detail::Helper<T>::countr_zero(T(~val));
}
template <class T, std::enable_if_t<detail::IsUnsignedIntegral<T>::value, bool> = false>
constexpr int countr_one_constexpr(T val) noexcept
{
return countr_zero_constexpr(T(~val));
}
#endif
} // namespace cpp
} // namespace carb
#undef CARB_LZCNT
#undef CARB_POPCNT