Utf8Parser.h#
Fully qualified name: carb/extras/Utf8Parser.h
File members: carb/extras/Utf8Parser.h
// SPDX-FileCopyrightText: Copyright (c) 2019-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: LicenseRef-NvidiaProprietary
//
// NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
// property and proprietary rights in and to this material, related
// documentation and any modifications thereto. Any use, reproduction,
// disclosure or distribution of this material and related documentation
// without an express license agreement from NVIDIA CORPORATION or
// its affiliates is strictly prohibited.
#pragma once
#include "../Defines.h"
#include "../../omni/extras/ScratchBuffer.h"
#include "../cpp/Optional.h"
#include "../cpp/StringView.h"
#include "../cpp/ZStringView.h"
#include <algorithm>
#include <cstdint>
#include <string>
namespace carb::extras
{
namespace detail
{
/// A decoded Unicode code point, stored as a 32-bit value.
using CodePoint = char32_t;

/// U+FFFD, substituted wherever a decode or encode step meets invalid data.
constexpr CodePoint kReplacementChar = 0xFFFD;

/// Highest code point accepted by `isValidCodePoint()`.
constexpr CodePoint kLastCodePoint = 0x10FFFF;

// UTF-16 surrogate pairs.
// `(unit & kSurrogateFirstMask) == kSurrogateFirst` matches any surrogate code unit, while
// `(unit & kSurrogateSecondMask) == kSurrogateSecond` matches only the second (low) half.
constexpr char16_t kSurrogateFirstMask = 0xf800;
constexpr char16_t kSurrogateFirst = 0xd800;
constexpr char16_t kSurrogateSecondMask = 0xfc00;
constexpr char16_t kSurrogateSecond = 0xdc00;
constexpr char16_t kSurrogateLast = 0xdfff;

/// Value subtracted from a code point before it is split into a surrogate pair.
constexpr char32_t kSurrogateBias = 0x10000;

/// Number of payload bits carried by each half of a surrogate pair.
constexpr size_t kSurrogateShift{ 10 };

/// Mask selecting the payload bits of one surrogate half.
constexpr char16_t kSurrogateMask = static_cast<char16_t>((1u << kSurrogateShift) - 1u);
} // namespace detail
class Utf8Parser
{
public:
// A decoded Unicode code point.
using CodePoint = detail::CodePoint;
// A single UTF-16 code unit.
using Utf16CodeUnit = char16_t;
// A single byte of a UTF-8 encoded string.
using CodeByte = char;
// Bit set built from the `fDecode*` / `fEncode*` values below.
using Flags = uint32_t;
// Result of `classifyUtf16SurrogateMember()`.
enum class SurrogateMember
{
// the code point is not in the surrogate range.
eNone,
// the code point is the first (high) half of a surrogate pair.
eHigh,
// the code point is the second (low) half of a surrogate pair.
eLow,
};
// On a decode failure report `kDefaultCodePoint` instead of 0.
static constexpr Flags fDecodeUseDefault = 0x00000001;
// On a decode failure try to skip ahead to the next plausible sequence start instead of giving up.
static constexpr Flags fDecodeSkipInvalid = 0x00000002;
// When encoding, write code points at or above 0x10000 as a surrogate pair of two 3-byte sequences.
static constexpr Flags fEncodeUseUtf16 = 0x00000004;
// When decoding with `nextCodePoint()`, do not combine an encoded surrogate pair into one code point.
static constexpr Flags fEncodeIgnoreSurrogatePairs = 0x00000008;
// Length value meaning "the string ends at its null terminator".
static constexpr size_t kNullTerminated = ~0ull;
// NOTE(review): not referenced by the code visible in this file; presumably a sentinel for callers.
static constexpr CodePoint kInvalidCodePoint = ~0u;
// Maximum number of bytes in one encoded sequence; bounds the backwards scan in `lastCodePoint()`.
static constexpr size_t kMaxSequenceLength = 7;
// Code point reported for failed decodes when `fDecodeUseDefault` is set.
static constexpr CodePoint kDefaultCodePoint = detail::kReplacementChar;
/**
 * Decodes the code point at the start of @p str and returns where the next one begins.
 *
 * @param str            UTF-8 bytes to decode.
 * @param lengthInBytes  Number of readable bytes in @p str, or `kNullTerminated`.
 * @param codepoint      Optional; receives the decoded code point, or the failure code point
 *                       (see `fDecodeUseDefault`) when decoding fails.
 * @param flags          Combination of `fDecode*` flags and `fEncodeIgnoreSurrogatePairs`.
 * @returns Pointer to the start of the following code point. Returns `nullptr` at the end of the
 *          string, or on a decode error when no resume position was found.
 *
 * Unless `fEncodeIgnoreSurrogatePairs` is set, an encoded high surrogate must be followed by an
 * encoded low surrogate; the two are merged into one code point.
 */
static const CodeByte* nextCodePoint(const CodeByte* str,
                                     size_t lengthInBytes = kNullTerminated,
                                     CodePoint* codepoint = nullptr,
                                     Flags flags = 0)
{
    // Decode the first sequence and hand its value out right away.
    const CodeByte* cursor = nullptr;
    CodePoint first = 0;
    const bool firstOk = parseUtf8(str, &cursor, &first, lengthInBytes, flags);
    if (codepoint != nullptr)
    {
        *codepoint = first;
    }

    // Nothing more to do on failure, when pairing is disabled, or for anything but a high surrogate.
    const bool pairingEnabled = (flags & fEncodeIgnoreSurrogatePairs) == 0;
    if (!firstOk || !pairingEnabled || classifyUtf16SurrogateMember(first) != SurrogateMember::eHigh)
    {
        return cursor;
    }

    // Work out how many bytes remain for the second half of the pair.
    size_t remaining = kNullTerminated;
    if (lengthInBytes != kNullTerminated)
    {
        remaining = lengthInBytes - static_cast<size_t>(cursor - str);
    }

    // Decode what should be the low surrogate.
    CodePoint second = 0;
    const bool secondOk = parseUtf8(cursor, &cursor, &second, remaining, flags);
    const bool isPair = secondOk && classifyUtf16SurrogateMember(second) == SurrogateMember::eLow;

    if (codepoint != nullptr)
    {
        if (isPair)
        {
            *codepoint = (((first & kSurrogateMask) << kSurrogateBits) | (second & kSurrogateMask)) + kSurrogateBias;
        }
        else
        {
            *codepoint = getFailureCodepoint(flags);
        }
    }
    return cursor;
}
// Finds and decodes the last code point of `str` by scanning backwards from its final byte.
//
// str       - the UTF-8 bytes to search.
// codepoint - optional; receives the decoded code point. It is pre-set to the failure code point
//             (`kDefaultCodePoint` when `fDecodeUseDefault` is set, otherwise 0) and only overwritten
//             on success.
// flags     - only `fDecodeUseDefault` and `fDecodeSkipInvalid` are forwarded to the helper decode of a
//             single sequence. With `fDecodeSkipInvalid` the search continues past invalid bytes
//             through the whole string; without it only the trailing `kMaxSequenceLength` bytes are
//             examined and any malformed data is an error.
//
// Returns a pointer to the first byte of the last code point (for an encoded surrogate pair, the first
// byte of the high surrogate), or nullptr when `str` is empty or no code point could be decoded.
static const CodeByte* lastCodePoint(carb::cpp::basic_string_view<CodeByte> str,
CodePoint* codepoint = nullptr,
Flags flags = fDecodeUseDefault)
{
// Prepare error value result
if (codepoint != nullptr)
{
*codepoint = getFailureCodepoint(flags);
}
// Check if it's a null or empty string
if (str.empty())
{
return nullptr;
}
// Make sure no unexpected flags pass into used `nextCodePoint` function
constexpr Flags kErrorHandlingMask = fDecodeSkipInvalid | fDecodeUseDefault;
const Flags helperParserFlags = (flags & kErrorHandlingMask);
size_t curCodePointSize = 0; // Keeps max number of bytes for a decoding attempt with `nextCodePoint`
const bool skipInvalid = (flags & fDecodeSkipInvalid) != 0;
// Walk the string backwards to find the start of the last CodePoint and decode it
// Note: it can be a single byte or a sequence, also if not searching for the last valid CodePoint
// the maximum number of bytes to check is just last `kMaxSequenceLength` bytes instead of the full string
//
// Note that the 'array-bounds' warning needs to be disabled here since it can pick up the
// calculation of `rIterEnd` as being out of bounds. However, for cases of short strings
// this is intentional since the iterator needs to point to one byte before the start of the
// string.
CARB_IGNOREWARNING_GNUC_WITH_PUSH("-Warray-bounds")
// `rIterBegin` is the last byte of the string; `rIterEnd` is the exclusive lower bound of the scan.
const CodeByte* const rIterBegin = str.data() - 1 + str.size();
const CodeByte* const rIterEnd =
(flags & fDecodeSkipInvalid) ? str.data() - 1 : CARB_MAX(str.data() - 1, rIterBegin - kMaxSequenceLength);
for (const CodeByte* rIter = rIterBegin; rIter != rIterEnd; --rIter)
{
const uint8_t curByte = static_cast<uint8_t>(*rIter);
++curCodePointSize;
// Check if the current code byte is a direct ASCII character
if (curByte < k7BitLimit)
{
// If parsed more than one byte then it's an error
if (curCodePointSize > 1 && !skipInvalid)
{
return nullptr;
}
if (codepoint != nullptr)
{
*codepoint = curByte;
}
return rIter;
}
// The current code byte is a continuation byte so step further
if (curByte < kMinLeadByte)
{
continue;
}
// The current code byte is a lead byte, decode the sequence and check that all bytes were used
CodePoint cp{};
const CodeByte* next = nextCodePoint(rIter, curCodePointSize, &cp, helperParserFlags);
if (!next)
{
if (skipInvalid)
{
// Forget the bytes gathered so far and keep searching towards the start of the string
curCodePointSize = 0;
continue;
}
return nullptr;
}
// Validate that all bytes till the end were used if expecting no invalid bytes
// Ex: "\xce\xa6\xa6" is a 2 byte sequence "\xce\xa6" for a 0x03A6 code point followed by excessive
// follow up byte "\xa6". The first 2 bytes will be decoded by the `nextCodePoint` properly
// and `next` will be pointing at the last "\xa6" byte
if (!skipInvalid && curCodePointSize != static_cast<size_t>(next - rIter))
{
return nullptr;
}
const SurrogateMember surrogateType = classifyUtf16SurrogateMember(cp);
// Encountered the high surrogate part first which is an error
if (CARB_UNLIKELY(surrogateType == SurrogateMember::eHigh))
{
if (skipInvalid)
{
// Just skip it and search further
curCodePointSize = 0;
continue;
}
return nullptr;
}
// Found the low part of a surrogate pair, need to continue parsing to get the high part
else if (CARB_UNLIKELY(surrogateType == SurrogateMember::eLow))
{
constexpr int kSurrogatePartSize = 3;
constexpr int kFullSurrogatePairSize = 2 * kSurrogatePartSize;
// To prepare for possible continuation of parsing if skipping invalid bytes and no high surrogate is
// found reset the possible CodePoint size
curCodePointSize = 0;
// For a valid UTF-8 string there must be a high surrogate (3 bytes) preceding the low surrogate (3 bytes)
if (rIter <= rIterEnd + kSurrogatePartSize)
{
if (skipInvalid)
{
// Skip the low surrogate data and continue to check the preceding byte
continue;
}
return nullptr;
}
// Step 3 bytes preceding the low surrogate
const CodeByte* const possibleHighSurStart = rIter - kSurrogatePartSize;
// Check if it starts with a lead byte
if (static_cast<uint8_t>(*possibleHighSurStart) < kMinLeadByte)
{
if (skipInvalid)
{
continue;
}
return nullptr;
}
// Try to parse 6 bytes (full surrogate pair size) to get the whole CodePoint without skipping invalid
// bytes
const CodeByte* const decodedPairEnd =
nextCodePoint(possibleHighSurStart, kFullSurrogatePairSize, &cp, 0);
if (!decodedPairEnd)
{
if (skipInvalid)
{
continue;
}
return nullptr;
}
// Check if used all 6 bytes (as expected from a surrogate pair)
if (decodedPairEnd - possibleHighSurStart != kFullSurrogatePairSize)
{
if (skipInvalid)
{
continue;
}
return nullptr;
}
// A proper surrogate pair was parsed into the `cp`
// and only the `rIter` has invalid value at this point
rIter = possibleHighSurStart;
// Just exit the block so the code below reports the result
}
if (codepoint)
{
*codepoint = cp;
}
// Everything is fine thus return start of the sequence
return rIter;
}
CARB_IGNOREWARNING_GNUC_POP
// Didn't find start of a valid CodePoint
return nullptr;
}
static size_t getLengthInCodePoints(const CodeByte* str, size_t maxLengthInBytes = kNullTerminated, Flags flags = 0)
{
const CodeByte* current;
const CodeByte* next;
size_t count = 0;
// get the second codepoint in the string.
current = str;
next = nextCodePoint(str, maxLengthInBytes, nullptr, flags);
// empty or invalid string => fail.
if (next == nullptr)
return 0;
if (maxLengthInBytes != kNullTerminated)
{
maxLengthInBytes -= (size_t)(next - current);
}
count++;
do
{
current = next;
next = nextCodePoint(current, maxLengthInBytes, nullptr, flags);
if (next == nullptr)
return count;
if (maxLengthInBytes != kNullTerminated)
{
maxLengthInBytes -= (size_t)(next - current);
}
count++;
} while (maxLengthInBytes > 0);
return count;
}
static size_t getLengthInCodeBytes(const CodePoint* str,
size_t maxLengthInCodePoints = kNullTerminated,
Flags flags = 0)
{
size_t count = 0;
size_t largeCodePointSize = 4;
if ((flags & fEncodeUseUtf16) != 0)
largeCodePointSize = 6;
for (size_t i = 0; str[i] != 0 && i < maxLengthInCodePoints; i++)
{
if (str[i] < getMaxCodePoint(0))
count++;
else if (str[i] < getMaxCodePoint(1))
count += 2;
else if (str[i] < getMaxCodePoint(2))
count += 3;
else if (str[i] < getMaxCodePoint(3))
count += largeCodePointSize;
else if (str[i] < getMaxCodePoint(4))
count += 5;
else if (str[i] < getMaxCodePoint(5))
count += 6;
else
count += 7;
}
return count;
}
/**
 * Computes how many UTF-8 bytes are needed to encode a UTF-16 string.
 *
 * @param str                    UTF-16 code units to measure. Must be readable up to the first zero
 *                               unit or up to @p maxLengthInCodePoints units, whichever comes first.
 * @param maxLengthInCodePoints  Maximum number of code units to read, or `kNullTerminated`.
 * @param flags                  When `fEncodeUseUtf16` is set, a surrogate pair is counted as 6
 *                               bytes instead of 4.
 * @returns The number of bytes required, not including any terminator.
 */
static size_t getLengthInCodeBytes(const Utf16CodeUnit* str,
                                   size_t maxLengthInCodePoints = kNullTerminated,
                                   Flags flags = 0)
{
    const size_t largeCodePointSize = (flags & fEncodeUseUtf16) != 0 ? 6 : 4;
    size_t count = 0;

    // The bound is tested before the element is read, so a buffer holding exactly
    // `maxLengthInCodePoints` units with no terminator is never read past its end.
    for (size_t i = 0; i < maxLengthInCodePoints && str[i] != 0; i++)
    {
        if (str[i] < getMaxCodePoint(0))
        {
            count++;
        }
        else if (str[i] < getMaxCodePoint(1))
        {
            count += 2;
        }
        // found a surrogate pair in the string -> both of these code units will decode to a
        // single UTF-32 codepoint => skip the low surrogate and add the size of a single
        // encoded codepoint.
        else if (str[i] >= kSurrogateBaseHigh && str[i] < kSurrogateBaseLow && i + 1 < maxLengthInCodePoints &&
                 str[i + 1] >= kSurrogateBaseLow && str[i + 1] <= kSurrogateMax)
        {
            i++;
            count += largeCodePointSize;
        }
        // not part of a UTF-16 surrogate pair => this will encode to 3 bytes in UTF-8.
        else
        {
            count += 3;
        }
    }

    return count;
}
/**
 * Decodes and returns the first code point of @p str.
 *
 * @param str            UTF-8 bytes to decode.
 * @param lengthInBytes  Number of readable bytes, or `kNullTerminated`.
 * @param flags          Decode flags forwarded to `nextCodePoint()`.
 * @returns The code point reported by `nextCodePoint()`: the decoded value, or the failure code
 *          point (0, or `kDefaultCodePoint` with `fDecodeUseDefault`) when decoding fails.
 */
static CodePoint getCodePoint(const CodeByte* str, size_t lengthInBytes = kNullTerminated, Flags flags = 0)
{
    CodePoint decoded = 0;
    // only the decoded value matters here; the returned position is discarded.
    (void)nextCodePoint(str, lengthInBytes, &decoded, flags);
    return decoded;
}
/**
 * Encodes one code point as UTF-8 into a caller supplied buffer.
 *
 * @param cp             Code point to encode.
 * @param str            Destination buffer. No terminator is written.
 * @param lengthInBytes  Capacity of @p str in bytes.
 * @param bytesWritten   Receives the number of bytes stored (0 on failure). It is written
 *                       unconditionally, so it must not be `nullptr`.
 * @param flags          With `fEncodeUseUtf16`, code points at or above 0x10000 are written as a
 *                       surrogate pair: two 3-byte sequences.
 * @returns @p str on success; `nullptr` when the buffer is too small or no sequence length fits
 *          the code point.
 */
static CodeByte* getCodeBytes(CodePoint cp, CodeByte* str, size_t lengthInBytes, size_t* bytesWritten, Flags flags = 0)
{
    // shared failure exit: report zero bytes written and hand back null.
    const auto reject = [bytesWritten]() -> CodeByte* {
        *bytesWritten = 0;
        return nullptr;
    };

    // no room at all => fail.
    if (lengthInBytes == 0)
    {
        return reject();
    }

    // 7-bit ASCII is stored verbatim.
    if (cp < k7BitLimit)
    {
        str[0] = CodeByte((cp & 0xff));
        *bytesWritten = 1;
        return str;
    }

    // From here on at least two bytes are needed. `units` holds the value(s) to encode and
    // `trailing` the number of continuation bytes each of them takes.
    CodePoint units[2] = { cp, 0 };
    size_t unitCount = 1;
    size_t trailing = 0;

    if ((flags & fEncodeUseUtf16) != 0 && cp >= kSurrogateBias)
    {
        // split into a high/low surrogate pair; each half is a 3-byte sequence.
        const CodePoint biased = cp - kSurrogateBias;
        units[0] = kSurrogateBaseHigh | ((biased >> kSurrogateBits) & kSurrogateMask);
        units[1] = kSurrogateBaseLow | (biased & kSurrogateMask);
        unitCount = 2;
        trailing = 2;
    }
    else
    {
        // pick the shortest sequence whose range contains the code point.
        for (size_t n = 1; n < kMaxSequenceBytes && trailing == 0; ++n)
        {
            if (cp < getMaxCodePoint(n))
            {
                trailing = n;
            }
        }

        // nothing fits (this should never happen) => fail.
        if (trailing == 0)
        {
            return reject();
        }
    }

    // not enough space for the entire output => fail without writing anything.
    const size_t totalBytes = (trailing + 1) * unitCount;
    if (lengthInBytes < totalBytes)
    {
        return reject();
    }

    CodeByte* cursor = str;
    for (size_t u = 0; u < unitCount; ++u)
    {
        const CodePoint value = units[u];

        // lead byte: length marker plus the top payload bits.
        *cursor++ = CodeByte(getLeadByte(trailing) | ((value >> (trailing * kContinuationShift)) & getLeadMask(trailing)));

        // continuation bytes, most significant group first.
        for (size_t group = trailing; group-- > 0;)
        {
            *cursor++ = CodeByte(kContinuationBits | ((value >> (group * kContinuationShift)) & kContinuationMask));
        }
    }

    *bytesWritten = totalBytes;
    return str;
}
/**
 * Reports whether a code point is the high half, the low half, or no part of a UTF-16
 * surrogate pair.
 *
 * @param cp  Code point to classify.
 * @returns `eHigh` for [kSurrogateBaseHigh, kSurrogateBaseLow), `eLow` for
 *          [kSurrogateBaseLow, kSurrogateMax], otherwise `eNone`.
 */
static SurrogateMember classifyUtf16SurrogateMember(CodePoint cp)
{
    // outside the whole surrogate window => not a surrogate.
    if (cp < kSurrogateBaseHigh || cp > kSurrogateMax)
    {
        return SurrogateMember::eNone;
    }
    return cp < kSurrogateBaseLow ? SurrogateMember::eHigh : SurrogateMember::eLow;
}
/**
 * Combines a UTF-16 surrogate pair into the code point it represents.
 *
 * @param high  First (high) surrogate of the pair.
 * @param low   Second (low) surrogate of the pair.
 * @returns The decoded code point, or 0 when either value is outside its surrogate range.
 */
static CodePoint decodeUtf16CodePoint(CodePoint high, CodePoint low)
{
    const bool highInRange = high >= kSurrogateBaseHigh && high < kSurrogateBaseLow;
    const bool lowInRange = low >= kSurrogateBaseLow && low <= kSurrogateMax;

    // either half is not a surrogate of the expected kind -> cannot decode => fail.
    if (!(highInRange && lowInRange))
    {
        return 0;
    }

    const CodePoint upperBits = (high & kSurrogateMask) << kSurrogateBits;
    const CodePoint lowerBits = low & kSurrogateMask;
    return (upperBits | lowerBits) + kSurrogateBias;
}
/**
 * Encodes a code point as UTF-16 packed into a single 32-bit value.
 *
 * @param cp   Code point to encode.
 * @param out  Optional; receives @p cp unchanged when it is below 0x10000, otherwise the high
 *             surrogate in the low 16 bits and the low surrogate in the high 16 bits.
 * @returns The number of UTF-16 code units the encoding takes: 1 or 2.
 */
static size_t encodeUtf16CodePoint(CodePoint cp, CodePoint* out)
{
    const bool needsPair = cp >= kSurrogateBias;

    if (out != nullptr)
    {
        if (needsPair)
        {
            // too big for one code unit => build the surrogate pair and pack both halves.
            const CodePoint biased = cp - kSurrogateBias;
            const CodePoint lowUnit = kSurrogateBaseLow | (biased & kSurrogateMask);
            const CodePoint highUnit = kSurrogateBaseHigh | ((biased >> kSurrogateBits) & kSurrogateMask);
            *out = highUnit | (lowUnit << 16);
        }
        else
        {
            // small enough for a direct encoding => just store it.
            *out = cp;
        }
    }

    return needsPair ? 2 : 1;
}
/**
 * Tests whether a code point is one of the whitespace characters listed in the table below.
 *
 * @param cp  Code point to test.
 * @returns `true` when @p cp appears in the table, `false` otherwise.
 */
inline static bool isSpaceCodePoint(CodePoint cp)
{
    // Taken from https://en.wikipedia.org/wiki/Whitespace_character
    // Note: must stay sorted in ascending order, the lookup below is a binary search.
    static constexpr CodePoint kWhitespaceTable[] = {
        0x0009, // character tabulation
        0x000A, // line feed
        0x000B, // line tabulation
        0x000C, // form feed
        0x000D, // carriage return
        0x0020, // space
        0x0085, // next line
        0x00A0, // no-break space
        0x1680, // ogham space mark
        0x180E, // Mongolian vowel separator
        0x2000, // en quad
        0x2001, // em quad
        0x2002, // en space
        0x2003, // em space
        0x2004, // three-per-em space
        0x2005, // four-per-em space
        0x2006, // six-per-em space
        0x2007, // figure space
        0x2008, // punctuation space
        0x2009, // thin space
        0x200A, // hair space
        0x200B, // zero width space
        0x200C, // zero width non-joiner
        0x200D, // zero width joiner
        0x2028, // line separator
        0x2029, // paragraph separator
        0x202F, // narrow no-break space
        0x205F, // medium mathematical space
        0x2060, // word joiner
        0x3000, // ideograph space
        0xFEFF, // zero width non-breaking space
    };

    const CodePoint* const tableEnd = kWhitespaceTable + CARB_COUNTOF(kWhitespaceTable);
    const CodePoint* const candidate = std::lower_bound(kWhitespaceTable, tableEnd, cp);
    return candidate != tableEnd && *candidate == cp;
}
private:
// Number of payload bits stored in the lead byte, indexed by the number of continuation bytes (0-6).
static constexpr uint8_t s_leadBits[] = { 7, 5, 4, 3, 2, 1, 0 };
// Highest valid Unicode code point.
static constexpr CodePoint kMaxCodePoint = detail::kLastCodePoint;
// Number of payload bits carried by each continuation byte.
static constexpr uint32_t kContinuationShift = 6;
// Marker bits of a continuation byte (10xxxxxx).
static constexpr uint8_t kContinuationBits = 0x80;
// Mask selecting the payload bits of a continuation byte.
static constexpr uint8_t kContinuationMask = (1u << kContinuationShift) - 1;
// Value subtracted from a code point before it is split into a surrogate pair.
static constexpr CodePoint kSurrogateBias = detail::kSurrogateBias;
// First high surrogate.
static constexpr CodePoint kSurrogateBaseHigh = detail::kSurrogateFirst;
// First low surrogate.
static constexpr CodePoint kSurrogateBaseLow = detail::kSurrogateSecond;
// First and last values of the whole surrogate range.
static constexpr CodePoint kSurrogateMin = detail::kSurrogateFirst;
static constexpr CodePoint kSurrogateMax = detail::kSurrogateLast;
// Number of payload bits in each surrogate and the mask selecting them.
static constexpr uint32_t kSurrogateBits = detail::kSurrogateShift;
static constexpr CodePoint kSurrogateMask = detail::kSurrogateMask;
// Upper bound (exclusive) on the continuation counts tried when encoding in `getCodeBytes()`.
static constexpr size_t kMaxSequenceBytes = 7;
// Bytes below this value are plain ASCII.
static constexpr uint8_t k7BitLimit = 0x80;
// Bytes at or above this value are lead bytes; bytes in [k7BitLimit, kMinLeadByte) are continuation bytes.
static constexpr uint8_t kMinLeadByte = 0xc0;
// Returns the number of continuation bytes that follow the given lead byte.
//
// leadByte must be in [kMinLeadByte, 0xff]; the table is indexed by `leadByte - kMinLeadByte` and
// no range check is done here. A result of 0 marks a lead byte the table treats as invalid
// (0xc0, 0xc1 and 0xff).
static constexpr uint8_t getContinuationLength(size_t leadByte)
{
constexpr uint8_t s_continuationSize[] = {
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xc0 - 0xcf */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xd0 - 0xdf */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xe0 - 0xef */
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 0, /* 0xf0 - 0xff */
};
return s_continuationSize[leadByte - kMinLeadByte];
}
/**
 * Returns the mask that selects the payload bits of a lead byte.
 *
 * @param continuationLength  Number of continuation bytes in the sequence (0-6); not range checked.
 * @returns A mask with the low `s_leadBits[continuationLength]` bits set.
 */
static constexpr uint8_t getLeadMask(size_t continuationLength)
{
    return static_cast<uint8_t>((1u << s_leadBits[continuationLength]) - 1u);
}
static constexpr uint8_t getLeadByte(size_t continuationLength)
{
constexpr uint8_t s_leadBytes[] = {
(0xffu << (s_leadBits[0] + 1)) & 0xff, (0xffu << (s_leadBits[1] + 1)) & 0xff,
(0xffu << (s_leadBits[2] + 1)) & 0xff, (0xffu << (s_leadBits[3] + 1)) & 0xff,
(0xffu << (s_leadBits[4] + 1)) & 0xff, (0xffu << (s_leadBits[5] + 1)) & 0xff,
(0xffu << (s_leadBits[6] + 1)) & 0xff
};
return s_leadBytes[continuationLength];
}
/**
 * Returns the exclusive upper limit of the code points that fit in a sequence with the given
 * number of continuation bytes.
 *
 * @param continuationLength  Number of continuation bytes (0-6); not range checked.
 * @returns 2^7, 2^11, 2^16, 2^21, 2^26, 2^31 for 0 through 5, and 0xffffffff for 6.
 */
static constexpr CodePoint getMaxCodePoint(size_t continuationLength)
{
    constexpr CodePoint kLimits[] = {
        CodePoint(1) << 7,  CodePoint(1) << 11, CodePoint(1) << 16, CodePoint(1) << 21,
        CodePoint(1) << 26, CodePoint(1) << 31, ~CodePoint(0),
    };
    return kLimits[continuationLength];
}
/**
 * Extracts the payload bits of a continuation byte and moves them to their place in the code point.
 *
 * @param byte                The continuation byte; only its low 6 bits are used.
 * @param continuationLength  1-based position of this byte counted from the end of the sequence
 *                            (1 for the last continuation byte).
 * @returns The payload shifted left by `(continuationLength - 1) * kContinuationShift` bits.
 */
inline static CodePoint decodeContinuationValue(int32_t byte, size_t continuationLength)
{
    // Shift an unsigned value: for the first continuation byte of a 7-byte sequence the shift is
    // 30 bits, which overflows a signed `int` (undefined behaviour before C++20). In the unsigned
    // 32-bit domain the excess bits are simply discarded.
    const CodePoint payload = static_cast<CodePoint>(byte & kContinuationMask);
    return static_cast<CodePoint>(payload << ((continuationLength - 1) * kContinuationShift));
}
/**
 * Selects the code point reported when decoding fails.
 *
 * @param flags  Decode flags supplied by the caller.
 * @returns `kDefaultCodePoint` when `fDecodeUseDefault` is set, otherwise 0.
 */
static constexpr CodePoint getFailureCodepoint(Flags flags)
{
    return (flags & fDecodeUseDefault) == 0 ? CodePoint(0) : kDefaultCodePoint;
}
/**
 * Decodes a single UTF-8 sequence without any surrogate pair handling.
 *
 * @param str            Bytes to decode.
 * @param outNext        Receives the start of the next sequence. Set to `nullptr` at the end of
 *                       the string, and on failure unless `fDecodeSkipInvalid` located a byte to
 *                       resume from. Must not be `nullptr`.
 * @param outCodePoint   Receives the decoded code point, or the failure code point when decoding
 *                       fails. Must not be `nullptr`.
 * @param lengthInBytes  Number of readable bytes in @p str, or `kNullTerminated`.
 * @param flags          `fDecodeUseDefault` and `fDecodeSkipInvalid` are honoured.
 * @returns `true` when a code point was decoded (including the null terminator, which decodes
 *          to 0 with `*outNext == nullptr`); `false` on failure.
 */
static bool parseUtf8(const CodeByte* str,
                      const CodeByte** outNext,
                      CodePoint* outCodePoint,
                      size_t lengthInBytes = kNullTerminated,
                      Flags flags = 0)
{
    auto fail = [&]() -> bool {
        // we weren't asked to attempt to skip over invalid code sequences => just fail out
        if ((flags & fDecodeSkipInvalid) == 0)
        {
            return false;
        }

        // walk the rest of the string skipping over continuation bytes and invalid lead bytes.
        // Note that we've already tested and rejected the first byte so we just need to continue
        // the search starting at the next byte.
        for (size_t i = 1; i < lengthInBytes; i++)
        {
            const auto b = static_cast<uint8_t>(str[i]);

            // continuation byte => skip it.
            if ((b & ~kContinuationMask) == kContinuationBits)
                continue;

            // invalid lead byte => skip it.
            if (b >= kMinLeadByte && getContinuationLength(b) == 0)
                continue;

            // invalid range of bytes
            if (b >= k7BitLimit && b < kMinLeadByte)
                continue;

            *outNext = str + i;
            return false;
        }

        // We've hit the end of the string. This means that the sequence is
        // either invalid, misaligned, or an illegal overlong sequence was
        // used. We aren't able to write out the next character pointer if
        // we hit this point.
        return false;
    };

    // initialize to failure values;
    *outCodePoint = getFailureCodepoint(flags);
    *outNext = nullptr;

    // the string doesn't have any more bytes in it -> no more codepoints => fail.
    if (lengthInBytes == 0)
    {
        return false;
    }

    const auto byte = static_cast<uint8_t>(*str);

    // the current code byte is at the null terminator -> no more codepoints => finish.
    if (byte == '\0')
    {
        *outCodePoint = byte;
        return true;
    }

    // the current code byte is a direct ASCII character => finish.
    if (byte < k7BitLimit)
    {
        *outCodePoint = byte;
        *outNext = str + 1;
        return true;
    }

    // a continuation byte cannot start a sequence => fail.
    if (byte < kMinLeadByte)
    {
        return fail();
    }

    // the current code byte is a lead byte => calculate the sequence length and return the
    // start of the next codepoint.
    const size_t continuationLength = getContinuationLength(byte);
    const size_t sequenceLength = continuationLength + 1;

    // not enough bytes left in the string to complete this codepoint => fail.
    // continuationLength of 0 is invalid => fail
    if (lengthInBytes < sequenceLength || continuationLength == 0)
    {
        return fail();
    }

    // decode the codepoint.
    {
        // The lead payload is shifted as a 64-bit value: a lead byte of 0xfe has six continuation
        // bytes, giving a shift of 36 bits, and shifting a 32-bit operand by 32 or more bits is
        // undefined behaviour. The result is then truncated to the 32-bit code point.
        CodePoint cp = static_cast<CodePoint>(static_cast<uint64_t>(byte & getLeadMask(continuationLength))
                                              << (continuationLength * kContinuationShift));

        for (size_t i = 0; i < continuationLength; i++)
        {
            // validate the continuation byte so we don't walk past the
            // end of a null terminated string
            if ((uint8_t(str[i + 1]) & ~kContinuationMask) != kContinuationBits)
            {
                return fail();
            }

            cp |= decodeContinuationValue(str[i + 1], continuationLength - i);
        }

        *outCodePoint = cp;
        *outNext = str + sequenceLength;
        return true;
    }
}
};
/**
 * Forward iterator that walks a UTF-8 string one code point at a time using
 * `Utf8Parser::nextCodePoint()`.
 *
 * The iterator decodes the first code point on construction. It tests `false` once the end of
 * the string is reached, a zero code point is decoded, or decoding fails without a resume
 * position.
 */
class Utf8Iterator
{
public:
    using CodeByte = Utf8Parser::CodeByte;
    using CodePoint = Utf8Parser::CodePoint;
    using Flags = Utf8Parser::Flags;

    // Reference the special length value used for null terminated strings.
    static constexpr size_t kNullTerminated = Utf8Parser::kNullTerminated;

    /** Creates an iterator that refers to no string and always tests `false`. */
    Utf8Iterator()
        : m_prev(nullptr), m_string(nullptr), m_length(kNullTerminated), m_flags(0), m_lastCodePoint(0), m_index(0)
    {
    }

    /**
     * Creates an iterator over @p string and decodes its first code point.
     *
     * @param string         UTF-8 bytes to walk; may be `nullptr`.
     * @param lengthInBytes  Number of readable bytes, or `kNullTerminated`.
     * @param flags          Decode flags forwarded to `Utf8Parser::nextCodePoint()`.
     */
    Utf8Iterator(const CodeByte* string, size_t lengthInBytes = kNullTerminated, Flags flags = 0)
        : m_prev(nullptr), m_string(string), m_length(lengthInBytes), m_flags(flags), m_lastCodePoint(0), m_index(0)
    {
        next();
    }

    /** Copies the complete state of @p it. */
    Utf8Iterator(const Utf8Iterator& it)
    {
        copy(it);
    }

    /** `true` while the iterator refers to a decoded, non-zero code point. */
    explicit operator bool() const
    {
        return isValid();
    }

    /** Returns the most recently decoded code point. */
    CodePoint operator*() const
    {
        return m_lastCodePoint;
    }

    /** Returns the address of the first byte of the current code point (not of the iterator). */
    const CodeByte* operator&() const
    {
        return m_prev;
    }

    /** Pre-increment: steps to the next code point. */
    Utf8Iterator& operator++()
    {
        next();
        return *this;
    }

    /** Post-increment: steps to the next code point and returns the previous state. */
    Utf8Iterator operator++(int32_t)
    {
        Utf8Iterator tmp = (*this);
        next();
        return tmp;
    }

    /** Steps forward by up to @p count code points, stopping early once the string is exhausted. */
    template <typename T>
    Utf8Iterator& operator+=(T count)
    {
        for (T i = 0; i < count && m_prev != nullptr; i++)
            next();
        return *this;
    }

    /** Returns a copy of this iterator advanced by up to @p count code points. */
    template <typename T>
    Utf8Iterator operator+(T count) const
    {
        Utf8Iterator tmp = *this;
        return (tmp += count);
    }

    // The comparison operators compare the position of the *next* code point to be decoded
    // (`m_string`), which is `nullptr` once the string is exhausted.
    bool operator==(const Utf8Iterator& it) const
    {
        return m_string == it.m_string;
    }

    bool operator!=(const Utf8Iterator& it) const
    {
        return m_string != it.m_string;
    }

    bool operator<(const Utf8Iterator& it) const
    {
        return m_string < it.m_string;
    }

    bool operator<=(const Utf8Iterator& it) const
    {
        return m_string <= it.m_string;
    }

    bool operator>(const Utf8Iterator& it) const
    {
        return m_string > it.m_string;
    }

    bool operator>=(const Utf8Iterator& it) const
    {
        return m_string >= it.m_string;
    }

    /** Copies the complete state of @p it. */
    Utf8Iterator& operator=(const Utf8Iterator& it)
    {
        // Note: normally we'd check for an identity assignment in this operator overload and
        //       ignore.  Unfortunately we can't do that here since we also override the '&'
        //       operator above.  Since this copy operation should still be safe for an identity
        //       assignment, we'll just let it proceed.
        copy(it);
        return *this;
    }

    /** Restarts the iterator on a new null terminated string with no flags set. */
    Utf8Iterator& operator=(const CodeByte* str)
    {
        m_prev = nullptr;
        m_string = str;
        m_length = kNullTerminated;
        m_lastCodePoint = 0;
        m_flags = 0;
        m_index = 0;
        next();
        return *this;
    }

    /**
     * Returns the zero based index of the current code point.
     *
     * The value is `m_index - 1`, so it wraps around to the largest `size_t` when no code point
     * has been decoded yet (for example on a default constructed iterator).
     */
    size_t getIndex() const
    {
        return m_index - 1;
    }

    /**
     * Returns the size in bytes of the current code point.
     *
     * When the next position is unknown (`m_string` is `nullptr`) the size cannot be measured;
     * 1 is reported if there is a current code point and 0 otherwise.
     */
    size_t getCodepointSize() const
    {
        if (m_string == nullptr)
            return m_prev == nullptr ? 0 : 1;
        return (size_t)(m_string - m_prev);
    }

private:
    /** Member-wise copy shared by the copy constructor and copy assignment. */
    void copy(const Utf8Iterator& it)
    {
        m_prev = it.m_prev;
        m_string = it.m_string;
        m_length = it.m_length;
        m_flags = it.m_flags;
        m_lastCodePoint = it.m_lastCodePoint;
        m_index = it.m_index;
    }

    bool isValid() const
    {
        return m_string != nullptr && m_lastCodePoint != 0;
    }

    /** Decodes the code point at `m_string` and advances the iterator past it. */
    void next()
    {
        // already exhausted => there is no current code point any more.
        if (m_string == nullptr)
        {
            m_prev = nullptr;
            return;
        }

        // the byte budget of a bounded string has been used up => finish.
        if (m_length == 0)
        {
            m_string = nullptr;
            m_prev = nullptr;
            m_lastCodePoint = 0;
            return;
        }

        const CodeByte* const ptr = Utf8Parser::nextCodePoint(m_string, m_length, &m_lastCodePoint, m_flags);

        // Only adjust the remaining length when a next position was actually returned. A null
        // result means the walk is over; subtracting `m_string` from a null pointer would be
        // undefined, and the length is never consulted again once `m_string` is null.
        if (ptr != nullptr && m_length != kNullTerminated)
            m_length -= (size_t)(ptr - m_string);

        m_prev = m_string;
        m_string = ptr;
        m_index++;
    }

    const CodeByte* m_prev; ///< Start of the current code point, or nullptr when there is none.
    const CodeByte* m_string; ///< Start of the next code point to decode, or nullptr when done.
    size_t m_length; ///< Bytes remaining from `m_string`, or `kNullTerminated`.
    Flags m_flags; ///< Decode flags passed to the parser.
    CodePoint m_lastCodePoint; ///< Most recently decoded code point.
    size_t m_index; ///< Number of decode steps performed so far.
};
// implementation details used for string conversions
namespace detail
{
/**
 * Gives a raw, fixed-size output buffer the small string-like interface used by the conversion
 * helpers (`push_back`, `resize`, `operator[]`, `size`).
 *
 * When constructed with a null buffer nothing is stored and only the required size is tracked.
 * With a real buffer, the contents are kept null terminated after every successful operation.
 */
template <typename CharT>
class BufferAdapter
{
public:
    using value_type = CharT;

    /**
     * @param buffer        Destination buffer, or `nullptr` to only measure.
     * @param bufferLength  Capacity of @p buffer in characters, including room for the terminator.
     *                      Ignored when @p buffer is `nullptr`.
     */
    constexpr BufferAdapter(CharT* buffer, size_t bufferLength) noexcept
        : m_buffer(buffer), m_capacityWithTerm(buffer ? bufferLength : size_t(-1))
    {
        // start out as an empty, terminated string when there is room for the terminator.
        if (buffer != nullptr && bufferLength != 0)
            buffer[0] = CharT{};
    }

    /**
     * Grows the logical size to @p size, zero filling the new characters and the terminator.
     *
     * @returns `false` (leaving everything untouched) when @p size exceeds the capacity.
     */
    bool resize(size_t size) noexcept
    {
        CARB_ASSERT(size > m_size); // Only allow increasing

        if (size > capacity())
            return false;

        if (m_buffer)
        {
            // clear [m_size, size] inclusive so that the terminator slot is covered too.
            CharT* const fillBegin = m_buffer + m_size;
            CharT* const fillEnd = m_buffer + size + 1;
#ifdef __CUDACC__ // Silence warning #128-D: loop is not reachable
            for (CharT* it = fillBegin; it != fillEnd; ++it)
                *it = CharT{};
#else
            std::fill(fillBegin, fillEnd, CharT{});
#endif
        }

        m_size = size;
        return true;
    }

    /** Accesses a character; in measuring mode a shared dummy character is returned instead. */
    CharT& operator[](size_t index)
    {
        static CharT dummy;
        CARB_ASSERT(index < m_size);
        if (!m_buffer)
            return dummy;
        return m_buffer[index];
    }

    /** Number of characters that can be stored, not counting the terminator. */
    constexpr size_t capacity() const noexcept
    {
        // our capacity includes the terminator, so remove that
        return m_capacityWithTerm == 0 ? 0 : m_capacityWithTerm - 1;
    }

    /** Number of characters stored (or counted, in measuring mode) so far. */
    constexpr size_t size() const noexcept
    {
        return m_size;
    }

    /**
     * Appends one character and re-terminates the buffer.
     *
     * @returns `false` when the buffer is full; always `true` in measuring mode.
     */
    bool push_back(CharT c) noexcept
    {
        if (!m_buffer)
        {
            ++m_size; // just adding the size that we need
            return true;
        }

        if (m_size >= capacity())
            return false;

        m_buffer[m_size] = c;
        ++m_size;
        m_buffer[m_size] = CharT{}; // null terminate
        return true;
    }

    /**
     * Size including the terminator. Returns 0 only for a real buffer of zero capacity, which
     * cannot even hold the terminator.
     */
    size_t sizeWithTerminator() const noexcept
    {
        if (m_buffer && m_capacityWithTerm == 0)
            return 0;
        return m_size + 1;
    }

private:
    CharT* m_buffer;
    size_t m_size{ 0 };
    const size_t m_capacityWithTerm;
};
/// `true` when `T` is a character type used for UTF-8 data: `char`, plus `char8_t` where the
/// compiler provides it.
#if CARB_HAS_CPP20 && defined(__cpp_char8_t)
template <class T>
constexpr bool IsUtf8Char_v = std::is_same_v<T, char> || std::is_same_v<T, char8_t>;
#else
template <class T>
constexpr bool IsUtf8Char_v = std::is_same_v<T, char>;
#endif
/**
 * Checks that a value is a Unicode scalar value.
 *
 * @param cp  Code point to check.
 * @returns `false` when @p cp is above `kLastCodePoint` or inside the UTF-16 surrogate range
 *          [kSurrogateFirst, kSurrogateLast]; `true` otherwise.
 */
constexpr bool isValidCodePoint(CodePoint cp) noexcept
{
    if (cp > kLastCodePoint)
        return false;

    // Compare against the surrogate range directly. Masking with the 16-bit `kSurrogateFirstMask`
    // would drop every bit above bit 15 and wrongly reject valid supplementary code points whose
    // low 16 bits happen to look like a surrogate (for example U+1D800 or U+10DC00).
    if (cp >= kSurrogateFirst && cp <= kSurrogateLast)
        return false;

    return true;
}
/**
 * Substitutes the replacement character for anything `isValidCodePoint()` rejects.
 *
 * @param cp  Code point to sanitize.
 * @returns @p cp when it is valid, otherwise `kReplacementChar`.
 */
constexpr CodePoint replaceInvalid(CodePoint cp) noexcept
{
    if (!isValidCodePoint(cp))
        return kReplacementChar;
    return cp;
}
/**
 * Classifies a UTF-8 lead byte.
 *
 * @param lead  A byte with its top bit set (ASCII bytes must be handled by the caller).
 * @returns The number of continuation bytes (1-3) paired with the mask selecting the payload
 *          bits of @p lead, or `{ 0, 0 }` when @p lead cannot start a 2-4 byte sequence.
 */
constexpr std::pair<size_t, const uint8_t> parseContinuation(uint8_t lead) noexcept
{
    // 0xxx xxxx - ASCII, directly convertible with no continuation bytes. Shouldn't get here.
    CARB_ASSERT(lead & 0x80);
#if CARB_HAS_CPP20 && defined(__cpp_lib_bitops)
    // The count of leading one bits is the total sequence length.
    const auto leadingOnes = size_t(std::countl_one(lead));
    if (leadingOnes >= 2 && leadingOnes <= 4)
        return { leadingOnes - 1, uint8_t(uint8_t(-1) >> (leadingOnes + 1)) };
#else
    // 110x xxxx - 1 continuation byte
    if ((lead & 0xE0) == 0xC0)
        return { 1, uint8_t(0x1F) };
    // 1110 xxxx - 2 continuation bytes
    if ((lead & 0xF0) == 0xE0)
        return { 2, uint8_t(0x0F) };
    // 1111 0xxx - 3 continuation bytes
    if ((lead & 0xF8) == 0xF0)
        return { 3, uint8_t(0x07) };
#endif
    // a stray continuation byte or a lead byte announcing more than 4 bytes.
    return { 0, uint8_t(0) };
}
/**
 * Decodes one code point from the front of a UTF-8 view and removes the consumed bytes.
 *
 * @param in  Non-empty view; advanced past the bytes that were consumed. When a bad continuation
 *            byte is met, the bytes before it are consumed and the offending byte is left in
 *            place.
 * @returns The decoded code point, or `kReplacementChar` for malformed or invalid input.
 */
template <class T> // UTF-8
char32_t toCodePoint(cpp::basic_string_view<std::enable_if_t<sizeof(T) == 1, T>>& in)
{
    CARB_ASSERT(!in.empty());

    const uint8_t first = static_cast<uint8_t>(in.front());
    in.remove_prefix(1);

    // Top bit clear => ASCII, which is directly convertible
    if ((first & 0x80) == 0)
        return char32_t{ first };

    // Ask the lead byte how many continuation bytes follow and which of its bits are payload
    const auto leadInfo = parseContinuation(first);
    size_t pending = leadInfo.first;
    const uint8_t leadMask = leadInfo.second;

    if (pending == 0 || in.size() < pending) // invalid or not enough characters remain
        return kReplacementChar;

    constexpr size_t kBitsPerContinuation = 6;
    constexpr uint8_t kPayloadMask = 0x3F;

    char32_t cp = char32_t(first & leadMask) << (pending * kBitsPerContinuation);
    while (pending != 0)
    {
        const uint8_t trail = static_cast<uint8_t>(in.front());

        // 10xx xxxx - continuation bytes
        if ((trail & 0xC0) != 0x80)
            return kReplacementChar; // not a valid continuation byte

        in.remove_prefix(1);
        --pending;
        cp |= (char32_t(trail & kPayloadMask) << (pending * kBitsPerContinuation));
    }

    // Make sure our decode was valid
    return replaceInvalid(cp);
}
/**
 * Invokes @p pred and normalizes its outcome to a success flag.
 *
 * @param pred  Callable taking no arguments.
 * @returns The callable's own result when it returns exactly `bool`; otherwise the callable is
 *          invoked, its result (if any) is ignored, and `true` is returned.
 */
template <typename Pred>
bool handleBuffer(Pred&& pred)
{
    using Result = decltype(pred());
    if constexpr (!std::is_same_v<bool, Result>)
    {
        pred();
        return true;
    }
    else
    {
        return pred();
    }
}
/**
 * Appends a code point to @p out encoded as UTF-8.
 *
 * Invalid code points are replaced by `kReplacementChar` before encoding.
 *
 * @param cp   Code point to encode.
 * @param out  Destination offering `size()`, `push_back()`, `resize()` and `operator[]`.
 * @returns `false` when @p out reported that it has no room (nothing is appended in that case),
 *          `true` otherwise.
 */
template <class StringType, typename Out = typename StringType::value_type> // UTF-8
std::enable_if_t<sizeof(Out) == 1, bool> fromCodePoint(char32_t cp, StringType& out)
{
    cp = replaceInvalid(cp);

    // Check for ASCII character that requires no encoding
    CARB_LIKELY_IF(cp < char32_t{ 0x80 })
    {
        return handleBuffer([&] { return out.push_back(Out(cp)); });
    }

    // We will need a continuation. Figure out how many continuation bytes.
    // The ranges are inclusive: U+0080..U+07FF take one continuation byte and U+0800..U+FFFF take
    // two. Using exclusive bounds here would emit overlong (invalid) sequences for U+07FF and U+FFFF.
    const size_t continuationLength = cp <= 0x7ff ? 1 : cp <= 0xffff ? 2 : 3;
    CARB_ASSERT(continuationLength >= 1 && continuationLength <= 3);

    auto index = out.size();

    // Resize with an extra for the lead byte. If our buffer doesn't have space,
    // return instead of pushing a partial sequence.
    if (!handleBuffer([&] { return out.resize(index + continuationLength + 1); }))
        return false;

    constexpr static size_t kContinuationBits = 6;
    constexpr static uint8_t kContinuationBase = 0b10000000; // actually 0b10xxxxxx
    constexpr static char32_t kContinuationMask = char32_t((1 << kContinuationBits) - 1);

    // Lead byte: 110xxxxx, 1110xxxx or 11110xxx followed by the top payload bits.
    uint8_t lead = uint8_t(0xf0 << (3 - continuationLength));
    lead |= uint8_t(cp >> (continuationLength * kContinuationBits));
    out[index++] = Out(lead);

    // Continuation bytes, most significant group first.
    switch (continuationLength)
    {
        case 3:
            out[index++] = Out(kContinuationBase | uint8_t((cp >> (2 * kContinuationBits)) & kContinuationMask));
            [[fallthrough]];
        case 2:
            out[index++] = Out(kContinuationBase | uint8_t((cp >> kContinuationBits) & kContinuationMask));
            [[fallthrough]];
        case 1:
            out[index] = Out(kContinuationBase | uint8_t(cp & kContinuationMask));
            break;
    }

    return true;
}
/**
 * Decodes one code point from the front of a UTF-16 view and removes the consumed code units.
 *
 * @param in  Non-empty view; advanced by one unit, or by two for a valid surrogate pair.
 * @returns The decoded code point, or `kReplacementChar` for an unpaired surrogate.
 */
template <class T> // UTF-16
char32_t toCodePoint(cpp::basic_string_view<std::enable_if_t<sizeof(T) == 2, T>>& in)
{
    CARB_ASSERT(!in.empty());

    const char16_t lead = static_cast<char16_t>(in.front());
    in.remove_prefix(1);

    // Anything outside the surrogate range converts directly
    const bool leadIsSurrogate = (lead & kSurrogateFirstMask) == kSurrogateFirst;
    if (!leadIsSurrogate)
        return char32_t{ lead };

    // A pair has to start with its first (high) member; a leading low member is invalid,
    // and so is a high member with nothing after it
    const bool leadIsLow = (lead & kSurrogateSecondMask) == kSurrogateSecond;
    if (leadIsLow || in.empty())
        return kReplacementChar;

    // The unit that follows must be the second (low) member; otherwise leave it in place
    const auto trail = in.front();
    if ((trail & kSurrogateSecondMask) != kSurrogateSecond)
        return kReplacementChar;
    in.remove_prefix(1);

    // Combine the surrogates
    char32_t cp = char32_t(lead & kSurrogateMask) << kSurrogateShift;
    cp |= (trail & kSurrogateMask);
    cp += kSurrogateBias;
    return replaceInvalid(cp);
}
/**
 * Appends a code point to @p out encoded as UTF-16.
 *
 * Invalid code points are replaced by `kReplacementChar` before encoding.
 *
 * @param cp   Code point to encode.
 * @param out  Destination offering `size()`, `push_back()`, `resize()` and `operator[]`.
 * @returns `false` when @p out reported that it has no room (nothing is appended in that case),
 *          `true` otherwise.
 */
template <class StringType, typename Out = typename StringType::value_type> // UTF-16
std::enable_if_t<sizeof(Out) == 2, bool> fromCodePoint(char32_t cp, StringType& out)
{
    cp = replaceInvalid(cp);

    if (cp >= kSurrogateBias)
    {
        // Doesn't fit in a single code unit => make room for a whole surrogate pair, or bail out
        // without writing half of one
        const auto base = out.size();
        if (!handleBuffer([&] { return out.resize(base + 2); }))
            return false;

        const char32_t biased = cp - kSurrogateBias;
        out[base] = Out(kSurrogateFirst | (char16_t(biased >> kSurrogateShift) & kSurrogateMask));
        out[base + 1] = Out(kSurrogateSecond | char16_t(biased & kSurrogateMask));
        return true;
    }

    // Fits in a single UTF-16 code unit
    return handleBuffer([&] { return out.push_back(Out(cp)); });
}
// Decodes one code point from the front of a UTF-32 view (selected when the code unit is 4 bytes wide).
//
// `in` must not be empty. Exactly one unit is consumed from the front of `in`, and its value is passed through
// replaceInvalid() before being returned.
template <class T> // UTF-32
char32_t toCodePoint(cpp::basic_string_view<std::enable_if_t<sizeof(T) == 4, T>>& in)
{
    CARB_ASSERT(!in.empty());
    char32_t unit = static_cast<char32_t>(in.front());
    in.remove_prefix(1);
    return replaceInvalid(unit);
}
// Appends `cp` to `out` as a single UTF-32 code unit (selected when the output code unit is 4 bytes wide).
//
// `cp` is passed through replaceInvalid() first; the return value is the handleBuffer() result of the push.
template <class StringType, typename Out = typename StringType::value_type> // UTF-32
std::enable_if_t<sizeof(Out) == 4, bool> fromCodePoint(char32_t cp, StringType& out)
{
    const char32_t sanitized = replaceInvalid(cp);
    return handleBuffer([&] { return out.push_back(Out(sanitized)); });
}
// Transcodes `str` into `out`, one code point at a time.
//
// Each iteration decodes a code point with toCodePoint<In>() (which consumes it from the local copy of `str`) and
// appends it with fromCodePoint(). Stops and returns `false` as soon as an append reports failure; returns `true`
// once the whole input has been consumed. Input and output code unit types must differ.
template <typename StringType, typename In>
inline bool convertBetweenUnicodeFormats(cpp::basic_string_view<In> str, StringType& out)
{
    using Out = typename StringType::value_type;
    static_assert(!std::is_same_v<In, Out>, "Pointless to convert between same types");

    while (!str.empty())
    {
        const char32_t decoded = toCodePoint<In>(str);
        if (!fromCodePoint(decoded, out))
            return false;
    }
    return true;
}
} // namespace detail
/**
 * Converts a UTF-8 string to UTF-32, writing into a caller-supplied buffer.
 *
 * The output is produced through a `detail::BufferAdapter` constructed over @p out and @p outLen. The boolean
 * result of the conversion loop is not inspected here; the return value comes solely from the adapter.
 *
 * @param str    The UTF-8 text to convert.
 * @param out    Destination buffer handed to the adapter.
 * @param outLen Size of @p out as given to the adapter (presumably in `char32_t` units -- confirm against
 *               `detail::BufferAdapter`).
 * @returns The adapter's `sizeWithTerminator()` after conversion (presumably the code unit count including the
 *          terminator -- confirm against `detail::BufferAdapter`).
 */
inline size_t convertUtf8StringToUtf32(cpp::string_view str, char32_t* out, size_t outLen) noexcept
{
    detail::BufferAdapter<char32_t> sink(out, outLen);
    // Result deliberately unused: the adapter tracks what was produced.
    detail::convertBetweenUnicodeFormats(str, sink);
    return sink.sizeWithTerminator();
}
/**
 * Converts a UTF-8 string of unknown length to UTF-32, writing into a caller-supplied buffer.
 *
 * Wraps @p str in a `cpp::string_view` built with `cpp::unsafe_length` and forwards to the `cpp::string_view`
 * overload.
 *
 * @param str    The UTF-8 text to convert (presumably null-terminated -- confirm against `cpp::unsafe_length`).
 * @param out    Destination buffer.
 * @param outLen Size of @p out, forwarded unchanged.
 * @returns Whatever the `cpp::string_view` overload returns.
 */
inline size_t convertUtf8StringToUtf32(cpp::unbounded_string str, char32_t* out, size_t outLen) noexcept
{
    const cpp::string_view view(cpp::unsafe_length, str);
    return convertUtf8StringToUtf32(view, out, outLen);
}
#ifndef DOXYGEN_BUILD // sphinx can't cope with this overload
// Converts a UTF-8 string to UTF-32 and returns it in a caller-chosen string type.
//
// Only participates in overload resolution when `StringType::value_type` is `char32_t`. A default-constructed
// `StringType` is filled by detail::convertBetweenUnicodeFormats() and returned; the boolean result of the
// conversion is not inspected.
template <class StringType>
inline std::enable_if_t<std::is_same_v<typename StringType::value_type, char32_t>, StringType> convertUtf8StringToUtf32(
    cpp::string_view str)
{
    StringType converted;
    detail::convertBetweenUnicodeFormats(str, converted);
    return converted;
}
#endif
/**
 * Converts a UTF-8 string to UTF-32.
 *
 * @param str The UTF-8 text to convert.
 * @returns The converted text as a `std::u32string`, produced by the `StringType` template overload.
 */
inline std::u32string convertUtf8StringToUtf32(cpp::string_view str)
{
    using Result = std::u32string;
    return convertUtf8StringToUtf32<Result>(str);
}
/**
 * Converts a UTF-8 string of unknown length to UTF-32.
 *
 * @param str The UTF-8 text to convert; it is wrapped in a `cpp::string_view` built with `cpp::unsafe_length`
 *            (presumably requiring null termination -- confirm against `cpp::unsafe_length`).
 * @returns The converted text as a `std::u32string`.
 */
inline std::u32string convertUtf8StringToUtf32(cpp::unbounded_string str)
{
    const cpp::string_view view(cpp::unsafe_length, str);
    return convertUtf8StringToUtf32<std::u32string>(view);
}
// TODO: OVCC-1591: would be nice to have basic_string_view versions of these functions and InputIterator versions
// which would also allow passing two `const char*` as a [begin, end).
/**
 * Converts a UTF-32 string to UTF-8, writing into a caller-supplied buffer.
 *
 * The output is produced through a `detail::BufferAdapter` constructed over @p out and @p outLen. The boolean
 * result of the conversion loop is not inspected here; the return value comes solely from the adapter.
 *
 * Declared `noexcept` to match the other fixed-buffer converters in this header that use the same
 * `detail::BufferAdapter<char>` code path (e.g. convertWideStringToUtf8()).
 *
 * @param str    The UTF-32 text to convert.
 * @param out    Destination buffer handed to the adapter.
 * @param outLen Size of @p out as given to the adapter (presumably in `char` units -- confirm against
 *               `detail::BufferAdapter`).
 * @returns The adapter's `sizeWithTerminator()` after conversion (presumably the code unit count including the
 *          terminator -- confirm against `detail::BufferAdapter`).
 */
inline size_t convertUtf32StringToUtf8(cpp::u32string_view str, char* out, size_t outLen) noexcept
{
    detail::BufferAdapter<char> adapter(out, outLen);
    // Result deliberately unused: the adapter tracks what was produced.
    detail::convertBetweenUnicodeFormats(str, adapter);
    return adapter.sizeWithTerminator();
}
/**
 * Converts a UTF-32 string of unknown length to UTF-8, writing into a caller-supplied buffer.
 *
 * Wraps @p str in a `cpp::u32string_view` built with `cpp::unsafe_length` and forwards to the
 * `cpp::u32string_view` overload. Declared `noexcept` like the equivalent unbounded wrappers for the other
 * fixed-buffer converters in this header.
 *
 * @param str    The UTF-32 text to convert (presumably null-terminated -- confirm against `cpp::unsafe_length`).
 * @param out    Destination buffer.
 * @param outLen Size of @p out, forwarded unchanged.
 * @returns Whatever the `cpp::u32string_view` overload returns.
 */
inline size_t convertUtf32StringToUtf8(cpp::unbounded_u32string str, char* out, size_t outLen) noexcept
{
    return convertUtf32StringToUtf8(cpp::u32string_view(cpp::unsafe_length, str), out, outLen);
}
#ifndef DOXYGEN_BUILD // sphinx can't cope with this overload
// Converts a UTF-32 string to UTF-8 and returns it in a caller-chosen string type.
//
// Only participates in overload resolution when `detail::IsUtf8Char_v` accepts `StringType::value_type`. A
// default-constructed `StringType` is filled by detail::convertBetweenUnicodeFormats() and returned; the boolean
// result of the conversion is not inspected.
template <class StringType>
inline std::enable_if_t<detail::IsUtf8Char_v<typename StringType::value_type>, StringType> convertUtf32StringToUtf8(
    cpp::u32string_view str)
{
    StringType converted;
    detail::convertBetweenUnicodeFormats(str, converted);
    return converted;
}
#endif
/**
 * Converts a UTF-32 string to UTF-8.
 *
 * @param str The UTF-32 text to convert.
 * @returns The converted text as a `std::string`, produced by the `StringType` template overload.
 */
inline std::string convertUtf32StringToUtf8(cpp::u32string_view str)
{
    using Result = std::string;
    return convertUtf32StringToUtf8<Result>(str);
}
/**
 * Converts a UTF-32 string of unknown length to UTF-8.
 *
 * @param str The UTF-32 text to convert; it is wrapped in a `cpp::u32string_view` built with `cpp::unsafe_length`
 *            (presumably requiring null termination -- confirm against `cpp::unsafe_length`).
 * @returns The converted text as a `std::string`.
 */
inline std::string convertUtf32StringToUtf8(cpp::unbounded_u32string str)
{
    const cpp::u32string_view view(cpp::unsafe_length, str);
    return convertUtf32StringToUtf8<std::string>(view);
}
/**
 * Converts a UTF-16 string to UTF-8, writing into a caller-supplied buffer.
 *
 * The output is produced through a `detail::BufferAdapter` constructed over @p out and @p outLen. The boolean
 * result of the conversion loop is not inspected here; the return value comes solely from the adapter.
 *
 * Declared `noexcept` to match the other fixed-buffer converters in this header that use the same
 * `detail::BufferAdapter<char>` code path (e.g. convertWideStringToUtf8()).
 *
 * @param str    The UTF-16 text to convert.
 * @param out    Destination buffer handed to the adapter.
 * @param outLen Size of @p out as given to the adapter (presumably in `char` units -- confirm against
 *               `detail::BufferAdapter`).
 * @returns The adapter's `sizeWithTerminator()` after conversion (presumably the code unit count including the
 *          terminator -- confirm against `detail::BufferAdapter`).
 */
inline size_t convertUtf16StringToUtf8(cpp::u16string_view str, char* out, size_t outLen) noexcept
{
    detail::BufferAdapter<char> adapter(out, outLen);
    // Result deliberately unused: the adapter tracks what was produced.
    detail::convertBetweenUnicodeFormats(str, adapter);
    return adapter.sizeWithTerminator();
}
/**
 * Converts a UTF-16 string of unknown length to UTF-8, writing into a caller-supplied buffer.
 *
 * Wraps @p str in a `cpp::u16string_view` built with `cpp::unsafe_length` and forwards to the
 * `cpp::u16string_view` overload. Declared `noexcept` like the equivalent unbounded wrappers for the other
 * fixed-buffer converters in this header.
 *
 * @param str    The UTF-16 text to convert (presumably null-terminated -- confirm against `cpp::unsafe_length`).
 * @param out    Destination buffer.
 * @param outLen Size of @p out, forwarded unchanged.
 * @returns Whatever the `cpp::u16string_view` overload returns.
 */
inline size_t convertUtf16StringToUtf8(cpp::unbounded_u16string str, char* out, size_t outLen) noexcept
{
    return convertUtf16StringToUtf8(cpp::u16string_view(cpp::unsafe_length, str), out, outLen);
}
#ifndef DOXYGEN_BUILD // sphinx can't cope with this overload
// Converts a UTF-16 string to UTF-8 and returns it in a caller-chosen string type.
//
// Only participates in overload resolution when `detail::IsUtf8Char_v` accepts `StringType::value_type`. A
// default-constructed `StringType` is filled by detail::convertBetweenUnicodeFormats() and returned; the boolean
// result of the conversion is not inspected.
template <class StringType>
inline std::enable_if_t<detail::IsUtf8Char_v<typename StringType::value_type>, StringType> convertUtf16StringToUtf8(
    cpp::u16string_view str)
{
    StringType converted;
    detail::convertBetweenUnicodeFormats(str, converted);
    return converted;
}
#endif
/**
 * Converts a UTF-16 string to UTF-8.
 *
 * @param str The UTF-16 text to convert.
 * @returns The converted text as a `std::string`, produced by the `StringType` template overload.
 */
inline std::string convertUtf16StringToUtf8(cpp::u16string_view str)
{
    using Result = std::string;
    return convertUtf16StringToUtf8<Result>(str);
}
/**
 * Converts a UTF-16 string of unknown length to UTF-8.
 *
 * @param str The UTF-16 text to convert; it is wrapped in a `cpp::u16string_view` built with `cpp::unsafe_length`
 *            (presumably requiring null termination -- confirm against `cpp::unsafe_length`).
 * @returns The converted text as a `std::string`.
 */
inline std::string convertUtf16StringToUtf8(cpp::unbounded_u16string str)
{
    const cpp::u16string_view view(cpp::unsafe_length, str);
    return convertUtf16StringToUtf8<std::string>(view);
}
/**
 * Converts a UTF-8 string to UTF-16, writing into a caller-supplied buffer.
 *
 * The output is produced through a `detail::BufferAdapter` constructed over @p out and @p outLen. The boolean
 * result of the conversion loop is not inspected here; the return value comes solely from the adapter.
 *
 * @param str    The UTF-8 text to convert.
 * @param out    Destination buffer handed to the adapter.
 * @param outLen Size of @p out as given to the adapter (presumably in `char16_t` units -- confirm against
 *               `detail::BufferAdapter`).
 * @returns The adapter's `sizeWithTerminator()` after conversion (presumably the code unit count including the
 *          terminator -- confirm against `detail::BufferAdapter`).
 */
inline size_t convertUtf8StringToUtf16(cpp::string_view str, char16_t* out, size_t outLen) noexcept
{
    detail::BufferAdapter<char16_t> sink(out, outLen);
    // Result deliberately unused: the adapter tracks what was produced.
    detail::convertBetweenUnicodeFormats(str, sink);
    return sink.sizeWithTerminator();
}
/**
 * Converts a UTF-8 string of unknown length to UTF-16, writing into a caller-supplied buffer.
 *
 * Wraps @p str in a `cpp::string_view` built with `cpp::unsafe_length` and forwards to the `cpp::string_view`
 * overload. Declared `noexcept`: the overload it forwards to is `noexcept`, and the equivalent unbounded wrappers
 * convertUtf8StringToUtf32() and convertUtf8StringToWide() are `noexcept` as well.
 *
 * @param str    The UTF-8 text to convert (presumably null-terminated -- confirm against `cpp::unsafe_length`).
 * @param out    Destination buffer.
 * @param outLen Size of @p out, forwarded unchanged.
 * @returns Whatever the `cpp::string_view` overload returns.
 */
inline size_t convertUtf8StringToUtf16(cpp::unbounded_string str, char16_t* out, size_t outLen) noexcept
{
    return convertUtf8StringToUtf16(cpp::string_view(cpp::unsafe_length, str), out, outLen);
}
#ifndef DOXYGEN_BUILD // sphinx can't cope with this overload
// Converts a UTF-8 string to UTF-16 and returns it in a caller-chosen string type.
//
// Only participates in overload resolution when `StringType::value_type` is `char16_t`. A default-constructed
// `StringType` is filled by detail::convertBetweenUnicodeFormats() and returned; the boolean result of the
// conversion is not inspected.
template <class StringType>
inline std::enable_if_t<std::is_same_v<typename StringType::value_type, char16_t>, StringType> convertUtf8StringToUtf16(
    cpp::string_view str)
{
    StringType converted;
    detail::convertBetweenUnicodeFormats(str, converted);
    return converted;
}
#endif
/**
 * Converts a UTF-8 string to UTF-16.
 *
 * @param str The UTF-8 text to convert.
 * @returns The converted text as a `std::u16string`, produced by the `StringType` template overload.
 */
inline std::u16string convertUtf8StringToUtf16(cpp::string_view str)
{
    using Result = std::u16string;
    return convertUtf8StringToUtf16<Result>(str);
}
/**
 * Converts a UTF-8 string of unknown length to UTF-16.
 *
 * @param str The UTF-8 text to convert; it is wrapped in a `cpp::string_view` built with `cpp::unsafe_length`
 *            (presumably requiring null termination -- confirm against `cpp::unsafe_length`).
 * @returns The converted text as a `std::u16string`.
 */
inline std::u16string convertUtf8StringToUtf16(cpp::unbounded_string str)
{
    const cpp::string_view view(cpp::unsafe_length, str);
    return convertUtf8StringToUtf16<std::u16string>(view);
}
/**
 * Converts a UTF-8 string to a wide (`wchar_t`) string, writing into a caller-supplied buffer.
 *
 * The output is produced through a `detail::BufferAdapter` constructed over @p out and @p outLen. Which encoding
 * is written follows from the size of `wchar_t`: the detail encoders are selected by code unit size (2 bytes =>
 * UTF-16, 4 bytes => UTF-32). The boolean result of the conversion loop is not inspected here.
 *
 * @param str    The UTF-8 text to convert.
 * @param out    Destination buffer handed to the adapter.
 * @param outLen Size of @p out as given to the adapter (presumably in `wchar_t` units -- confirm against
 *               `detail::BufferAdapter`).
 * @returns The adapter's `sizeWithTerminator()` after conversion (presumably the code unit count including the
 *          terminator -- confirm against `detail::BufferAdapter`).
 */
inline size_t convertUtf8StringToWide(cpp::string_view str, wchar_t* out, size_t outLen) noexcept
{
    detail::BufferAdapter<wchar_t> sink(out, outLen);
    // Result deliberately unused: the adapter tracks what was produced.
    detail::convertBetweenUnicodeFormats(str, sink);
    return sink.sizeWithTerminator();
}
/**
 * Converts a UTF-8 string of unknown length to a wide (`wchar_t`) string, writing into a caller-supplied buffer.
 *
 * Wraps @p str in a `cpp::string_view` built with `cpp::unsafe_length` and forwards to the `cpp::string_view`
 * overload.
 *
 * @param str    The UTF-8 text to convert (presumably null-terminated -- confirm against `cpp::unsafe_length`).
 * @param out    Destination buffer.
 * @param outLen Size of @p out, forwarded unchanged.
 * @returns Whatever the `cpp::string_view` overload returns.
 */
inline size_t convertUtf8StringToWide(cpp::unbounded_string str, wchar_t* out, size_t outLen) noexcept
{
    const cpp::string_view view(cpp::unsafe_length, str);
    return convertUtf8StringToWide(view, out, outLen);
}
#ifndef DOXYGEN_BUILD // sphinx can't cope with this overload
// Converts a UTF-8 string to a wide string and returns it in a caller-chosen string type.
//
// Only participates in overload resolution when `StringType::value_type` is `wchar_t`. A default-constructed
// `StringType` is filled by detail::convertBetweenUnicodeFormats() and returned; the boolean result of the
// conversion is not inspected.
template <class StringType>
inline std::enable_if_t<std::is_same_v<typename StringType::value_type, wchar_t>, StringType> convertUtf8StringToWide(
    cpp::string_view str)
{
    StringType converted;
    detail::convertBetweenUnicodeFormats(str, converted);
    return converted;
}
#endif
/**
 * Converts a UTF-8 string to a wide (`wchar_t`) string.
 *
 * @param str The UTF-8 text to convert.
 * @returns The converted text as a `std::wstring`, produced by the `StringType` template overload.
 */
inline std::wstring convertUtf8StringToWide(cpp::string_view str)
{
    using Result = std::wstring;
    return convertUtf8StringToWide<Result>(str);
}
inline std::wstring convertUtf8StringToWide(const char* str)
{
return convertUtf8StringToWide<std::wstring>(cpp::string_view(cpp::unsafe_length, str));
}
/**
 * Converts a wide (`wchar_t`) string to UTF-8, writing into a caller-supplied buffer.
 *
 * The output is produced through a `detail::BufferAdapter` constructed over @p out and @p outLen. How the input
 * is decoded follows from the size of `wchar_t`: the detail decoders are selected by code unit size (2 bytes =>
 * UTF-16, 4 bytes => UTF-32). The boolean result of the conversion loop is not inspected here.
 *
 * @param str    The wide text to convert.
 * @param out    Destination buffer handed to the adapter.
 * @param outLen Size of @p out as given to the adapter (presumably in `char` units -- confirm against
 *               `detail::BufferAdapter`).
 * @returns The adapter's `sizeWithTerminator()` after conversion (presumably the code unit count including the
 *          terminator -- confirm against `detail::BufferAdapter`).
 */
inline size_t convertWideStringToUtf8(cpp::wstring_view str, char* out, size_t outLen) noexcept
{
    detail::BufferAdapter<char> sink(out, outLen);
    // Result deliberately unused: the adapter tracks what was produced.
    detail::convertBetweenUnicodeFormats(str, sink);
    return sink.sizeWithTerminator();
}
/**
 * Converts a wide (`wchar_t`) string of unknown length to UTF-8, writing into a caller-supplied buffer.
 *
 * Wraps @p str in a `cpp::wstring_view` built with `cpp::unsafe_length` and forwards to the `cpp::wstring_view`
 * overload.
 *
 * @param str    The wide text to convert (presumably null-terminated -- confirm against `cpp::unsafe_length`).
 * @param out    Destination buffer.
 * @param outLen Size of @p out, forwarded unchanged.
 * @returns Whatever the `cpp::wstring_view` overload returns.
 */
inline size_t convertWideStringToUtf8(cpp::unbounded_wstring str, char* out, size_t outLen) noexcept
{
    const cpp::wstring_view view(cpp::unsafe_length, str);
    return convertWideStringToUtf8(view, out, outLen);
}
#ifndef DOXYGEN_BUILD // sphinx can't cope with this overload
// Converts a wide string to UTF-8 and returns it in a caller-chosen string type.
//
// Only participates in overload resolution when `detail::IsUtf8Char_v` accepts `StringType::value_type`. A
// default-constructed `StringType` is filled by detail::convertBetweenUnicodeFormats() and returned; the boolean
// result of the conversion is not inspected.
template <class StringType>
std::enable_if_t<detail::IsUtf8Char_v<typename StringType::value_type>, StringType> convertWideStringToUtf8(
    cpp::wstring_view str)
{
    StringType converted;
    detail::convertBetweenUnicodeFormats(str, converted);
    return converted;
}
#endif
/**
 * Converts a wide (`wchar_t`) string to UTF-8.
 *
 * @param str The wide text to convert.
 * @returns The converted text as a `std::string`, produced by the `StringType` template overload.
 */
inline std::string convertWideStringToUtf8(cpp::wstring_view str)
{
    using Result = std::string;
    return convertWideStringToUtf8<Result>(str);
}
/**
 * Converts a wide (`wchar_t`) string of unknown length to UTF-8.
 *
 * @param str The wide text to convert; it is wrapped in a `cpp::wstring_view` built with `cpp::unsafe_length`
 *            (presumably requiring null termination -- confirm against `cpp::unsafe_length`).
 * @returns The converted text as a `std::string`.
 */
inline std::string convertWideStringToUtf8(cpp::unbounded_wstring str)
{
    const cpp::wstring_view view(cpp::unsafe_length, str);
    return convertWideStringToUtf8<std::string>(view);
}
} // namespace carb::extras