carb/extras/Utf8Parser.h
File members: carb/extras/Utf8Parser.h
// Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//
#pragma once
#include "../Defines.h"
#include "../../omni/extras/ScratchBuffer.h"
#include <cstdint>
#include <algorithm>
#include <cmath>
namespace carb
{
namespace extras
{
class Utf8Parser
{
public:
using CodePoint = char32_t;
using Utf16CodeUnit = char16_t;
using CodeByte = char;
using Flags = uint32_t;
enum class SurrogateMember
{
eNone,
eHigh,
eLow,
};
static constexpr Flags fDecodeUseDefault = 0x00000001;
static constexpr Flags fDecodeSkipInvalid = 0x00000002;
static constexpr Flags fEncodeUseUtf16 = 0x00000004;
static constexpr Flags fEncodeIgnoreSurrogatePairs = 0x00000008;
static constexpr size_t kNullTerminated = ~0ull;
static constexpr CodePoint kInvalidCodePoint = ~0u;
static constexpr size_t kMaxSequenceLength = 7;
static constexpr CodePoint kDefaultCodePoint = 0x0000fffd;
static const CodeByte* nextCodePoint(const CodeByte* str,
size_t lengthInBytes = kNullTerminated,
CodePoint* codepoint = nullptr,
Flags flags = 0)
{
// retrieve the next code point
CodePoint high = 0;
const CodeByte* next = nullptr;
bool r = parseUtf8(str, &next, &high, lengthInBytes, flags);
if (codepoint != nullptr)
{
*codepoint = high;
}
// parsing failed => just fail out
if (!r)
{
return next;
}
// it's a surrogate pair (and we're allowed to parse those) => parse out the full pair
if ((flags & fEncodeIgnoreSurrogatePairs) == 0 && classifyUtf16SurrogateMember(high) == SurrogateMember::eHigh)
{
// figure out the new length if it's not null terminated
const size_t newLen = (lengthInBytes == kNullTerminated) ? kNullTerminated : (lengthInBytes - (next - str));
// parse out the next code point
CodePoint low = 0;
r = parseUtf8(next, &next, &low, newLen, flags);
// invalid surrogate pair => fail
if (!r || classifyUtf16SurrogateMember(low) != SurrogateMember::eLow)
{
if (codepoint != nullptr)
{
*codepoint = getFailureCodepoint(flags);
}
return next;
}
// valid surrogate pair => calculate the code point
if (codepoint != nullptr)
{
*codepoint = (((high & kSurrogateMask) << kSurrogateBits) | (low & kSurrogateMask)) + kSurrogateBias;
}
return next;
}
return next;
}
static const CodeByte* lastCodePoint(const CodeByte* str,
size_t lengthInBytes = kNullTerminated,
CodePoint* codepoint = nullptr,
Flags flags = fDecodeUseDefault)
{
// Prepare error value result
if (codepoint != nullptr)
{
*codepoint = getFailureCodepoint(flags);
}
// Check if it's a null or empty string
if (!str || *str == 0)
{
return nullptr;
}
if (lengthInBytes == kNullTerminated)
{
lengthInBytes = std::strlen(str);
}
// Make sure no unexpected flags pass into used `nextCodePoint` function
constexpr Flags kErrorHandlingMask = fDecodeSkipInvalid | fDecodeUseDefault;
const Flags helperParserFlags = (flags & kErrorHandlingMask);
size_t curCodePointSize = 0; // Keeps max number of bytes for a decoding attempt with `nextCodePoint`
const bool skipInvalid = (flags & fDecodeSkipInvalid) != 0;
// Walk the string backwards to find the start of the last CodePoint and decode it
// Note: it can be a single byte or a sequence, also if not searching for the last valid CodePoint
// the maximum number of bytes to check is just last `kMaxSequenceLength` bytes instead of the full string
//
// Note that the 'array-bounds' warning needs to be disabled here since it can pick up the
// calculation of `rIterEnd` as being out of bounds. However, for cases of short strings
// this is intentional since the iterator needs to point to one byte before the start of the
// string.
CARB_IGNOREWARNING_GNUC_WITH_PUSH("-Warray-bounds")
const CodeByte* const rIterBegin = str - 1 + lengthInBytes;
const CodeByte* const rIterEnd =
(flags & fDecodeSkipInvalid) ? str - 1 : CARB_MAX(str - 1, rIterBegin - kMaxSequenceLength);
for (const CodeByte* rIter = rIterBegin; rIter != rIterEnd; --rIter)
{
const uint8_t curByte = static_cast<uint8_t>(*rIter);
++curCodePointSize;
// Check if the current code byte is a direct ASCII character
if (curByte < k7BitLimit)
{
// If parsed more than one byte then it's an error
if (curCodePointSize > 1 && !skipInvalid)
{
return nullptr;
}
if (codepoint != nullptr)
{
*codepoint = curByte;
}
return rIter;
}
// The current code byte is a continuation byte so step further
if (curByte < kMinLeadByte)
{
continue;
}
// The current code byte is a lead byte, decode the sequence and check that all bytes were used
CodePoint cp{};
const CodeByte* next = nextCodePoint(rIter, curCodePointSize, &cp, helperParserFlags);
if (!next)
{
if (skipInvalid)
{
curCodePointSize = 0;
continue;
}
return nullptr;
}
// Validate that all bytes till the end were used if expecting no invalid bytes
// Ex: "\xce\xa6\xa6" is a 2 byte sequence "\xce\xa6" for a 0x03A6 code point followed by excessive
// follow up byte "\xa6". The first 2 bytes will be decoded by the `nextCodePoint` properly
// and `next` will be pointing at the last "\xa6" byte
if (!skipInvalid && curCodePointSize != static_cast<size_t>(next - rIter))
{
return nullptr;
}
const SurrogateMember surrogateType = classifyUtf16SurrogateMember(cp);
// Encountered the high surrogate part first which is an error
if (CARB_UNLIKELY(surrogateType == SurrogateMember::eHigh))
{
if (skipInvalid)
{
// Just skip it and search further
curCodePointSize = 0;
continue;
}
return nullptr;
}
// Found the low part of a surrogate pair, need to continue parsing to get the high part
else if (CARB_UNLIKELY(surrogateType == SurrogateMember::eLow))
{
constexpr int kSurrogatePartSize = 3;
constexpr int kFullSurrogatePairSize = 2 * kSurrogatePartSize;
// To prepare for possible continuation of parsing if skipping invalid bytes and no high surrogate is
// found reset the possible CodePoint size
curCodePointSize = 0;
// For a valid UTF-8 string there are must be high surrogate (3 bytes) preceding low surrogate (3 bytes)
if (rIter <= rIterEnd + kSurrogatePartSize)
{
if (skipInvalid)
{
// Skip the low surrogate data and continue to check the preceding byte
continue;
}
return nullptr;
}
// Step 3 bytes preceding the low surrogate
const CodeByte* const possibleHighSurStart = rIter - kSurrogatePartSize;
// Check if it starts with a lead byte
if (static_cast<uint8_t>(*possibleHighSurStart) < kMinLeadByte)
{
if (skipInvalid)
{
continue;
}
return nullptr;
}
// Try to parse 6 bytes (full surrogate pair size) to get the whole CodePoint without skipping invalid
// bytes
const CodeByte* const decodedPairEnd =
nextCodePoint(possibleHighSurStart, kFullSurrogatePairSize, &cp, 0);
if (!decodedPairEnd)
{
if (skipInvalid)
{
continue;
}
return nullptr;
}
// Check if used all 6 bytes (as expected from a surrogate pair)
if (decodedPairEnd - possibleHighSurStart != kFullSurrogatePairSize)
{
if (skipInvalid)
{
continue;
}
return nullptr;
}
// A proper surrogate pair was parsed into the `cp`
// and only the `rIter` has invalid value at this point
rIter = possibleHighSurStart;
// Just exit the block so the code below reports the result
}
if (codepoint)
{
*codepoint = cp;
}
// Everything is fine thus return start of the sequence
return rIter;
}
CARB_IGNOREWARNING_GNUC_POP
// Didn't find start of a valid CodePoint
return nullptr;
}
static size_t getLengthInCodePoints(const CodeByte* str, size_t maxLengthInBytes = kNullTerminated, Flags flags = 0)
{
const CodeByte* current;
const CodeByte* next;
size_t count = 0;
// get the second codepoint in the string.
current = str;
next = nextCodePoint(str, maxLengthInBytes, nullptr, flags);
// empty or invalid string => fail.
if (next == nullptr)
return 0;
if (maxLengthInBytes != kNullTerminated)
{
maxLengthInBytes -= next - current;
}
count++;
do
{
current = next;
next = nextCodePoint(current, maxLengthInBytes, nullptr, flags);
if (next == nullptr)
return count;
if (maxLengthInBytes != kNullTerminated)
{
maxLengthInBytes -= next - current;
}
count++;
} while (maxLengthInBytes > 0);
return count;
}
static size_t getLengthInCodeBytes(const CodePoint* str,
size_t maxLengthInCodePoints = kNullTerminated,
Flags flags = 0)
{
size_t count = 0;
size_t largeCodePointSize = 4;
if ((flags & fEncodeUseUtf16) != 0)
largeCodePointSize = 6;
for (size_t i = 0; str[i] != 0 && i < maxLengthInCodePoints; i++)
{
if (str[i] < getMaxCodePoint(0))
count++;
else if (str[i] < getMaxCodePoint(1))
count += 2;
else if (str[i] < getMaxCodePoint(2))
count += 3;
else if (str[i] < getMaxCodePoint(3))
count += largeCodePointSize;
else if (str[i] < getMaxCodePoint(4))
count += 5;
else if (str[i] < getMaxCodePoint(5))
count += 6;
else
count += 7;
}
return count;
}
static size_t getLengthInCodeBytes(const Utf16CodeUnit* str,
size_t maxLengthInCodePoints = kNullTerminated,
Flags flags = 0)
{
size_t count = 0;
size_t largeCodePointSize = 4;
if ((flags & fEncodeUseUtf16) != 0)
largeCodePointSize = 6;
for (size_t i = 0; str[i] != 0 && i < maxLengthInCodePoints; i++)
{
if (str[i] < getMaxCodePoint(0))
count++;
else if (str[i] < getMaxCodePoint(1))
count += 2;
else
{
// found a surrogate pair in the string -> both of these codepoints will decode to
// a single UTF-32 codepoint => skip the low surrogate and add the size of a
// single encoded codepoint.
if (str[i] >= kSurrogateBaseHigh && str[i] < kSurrogateBaseLow && i + 1 < maxLengthInCodePoints &&
str[i + 1] >= kSurrogateBaseLow && str[i + 1] <= kSurrogateMax)
{
i++;
count += largeCodePointSize;
}
// not part of a UTF-16 surrogate pair => this will encode to 3 bytes in UTF-8.
else
count += 3;
}
}
return count;
}
static CodePoint getCodePoint(const CodeByte* str, size_t lengthInBytes = kNullTerminated, Flags flags = 0)
{
char32_t c = 0;
nextCodePoint(str, lengthInBytes, &c, flags);
return c;
}
static CodeByte* getCodeBytes(CodePoint cp, CodeByte* str, size_t lengthInBytes, size_t* bytesWritten, Flags flags = 0)
{
size_t sequenceLength = 0;
size_t continuationLength = 0;
size_t codePointCount = 1;
CodePoint codePoint[2] = { cp, 0 };
CodeByte* result;
// not enough room in the buffer => fail.
if (lengthInBytes == 0)
{
*bytesWritten = 0;
return nullptr;
}
// a 7-bit ASCII character -> this can be directly stored => store and return.
if (codePoint[0] < k7BitLimit)
{
str[0] = CodeByte((codePoint[0] & 0xff));
*bytesWritten = 1;
return str;
}
// at this point we know that the encoding for the codepoint is going to require at least
// two bytes. We need to calculate the sequence length and encode the bytes.
// allowing a UTF-16 surrogate pair encoding in the string and the codepoint is above the
// range where a surrogate pair is necessary => calculate the low and high codepoints
// for the pair and set the sequence length.
if ((flags & fEncodeUseUtf16) != 0 && codePoint[0] >= kSurrogateBias)
{
sequenceLength = 3;
continuationLength = 2;
codePointCount = 2;
codePoint[0] -= kSurrogateBias;
codePoint[1] = kSurrogateBaseLow | (codePoint[0] & kSurrogateMask);
codePoint[0] = kSurrogateBaseHigh | ((codePoint[0] >> kSurrogateBits) & kSurrogateMask);
}
// not using a UTF-16 surrogate pair => search for the required length of the sequence.
else
{
// figure out the required sequence length for the given for this codepoint.
for (size_t i = 1; i < kMaxSequenceBytes; i++)
{
if (codePoint[0] < getMaxCodePoint(i))
{
sequenceLength = i + 1;
continuationLength = i;
break;
}
}
// failed to find a sequence length for the given codepoint (?!?) => fail (this should
// never happen).
if (sequenceLength == 0)
{
*bytesWritten = 0;
return nullptr;
}
}
// not enough space in the buffer to store the entire sequence => fail.
if (lengthInBytes < sequenceLength * codePointCount)
{
*bytesWritten = 0;
return nullptr;
}
result = str;
// write out each of the codepoints. If UTF-16 encoding is not being used, there will only
// be one codepoint and this loop will exit after the first iteration.
for (size_t j = 0; j < codePointCount; j++)
{
cp = codePoint[j];
// write out the lead byte.
*str = CodeByte(getLeadByte(continuationLength) |
((cp >> (continuationLength * kContinuationShift)) & getLeadMask(continuationLength)));
str++;
// write out the continuation bytes.
for (size_t i = 0; i < continuationLength; i++)
{
*str = CodeByte(kContinuationBits |
((cp >> ((continuationLength - i - 1) * kContinuationShift)) & kContinuationMask));
str++;
}
}
*bytesWritten = sequenceLength * codePointCount;
return result;
}
static SurrogateMember classifyUtf16SurrogateMember(CodePoint cp)
{
if (cp >= kSurrogateBaseHigh && cp < kSurrogateBaseLow)
return SurrogateMember::eHigh;
if (cp >= kSurrogateBaseLow && cp <= kSurrogateMax)
return SurrogateMember::eLow;
return SurrogateMember::eNone;
}
static CodePoint decodeUtf16CodePoint(CodePoint high, CodePoint low)
{
CodePoint cp;
// the high and low codepoints are out of the surrogate pair range -> cannot decode => fail.
if (high < kSurrogateBaseHigh || high >= kSurrogateBaseLow || low < kSurrogateBaseLow || low > kSurrogateMax)
return 0;
// decode the surrogate pair into a single Unicode codepoint.
cp = (((high & kSurrogateMask) << kSurrogateBits) | (low & kSurrogateMask)) + kSurrogateBias;
return cp;
}
static size_t encodeUtf16CodePoint(CodePoint cp, CodePoint* out)
{
CodePoint high;
CodePoint low;
// small enough for a direct encoding => just store it.
if (cp < kSurrogateBias)
{
if (out != nullptr)
*out = cp;
return 1;
}
// too big for direct encoding => convert it to a surrogate pair and store both in the
// output buffer.
cp -= kSurrogateBias;
low = kSurrogateBaseLow | (cp & kSurrogateMask);
high = kSurrogateBaseHigh | ((cp >> kSurrogateBits) & kSurrogateMask);
if (out != nullptr)
*out = high | (low << 16);
return 2;
}
inline static bool isSpaceCodePoint(CodePoint cp)
{
// Taken from https://en.wikipedia.org/wiki/Whitespace_character
// Note: sorted to allow binary search
static constexpr CodePoint kSpaceCodePoints[] = {
0x0009, // character tabulation
0x000A, // line feed
0x000B, // line tabulation
0x000C, // form feed
0x000D, // carriage return
0x0020, // space
0x0085, // next line
0x00A0, // no-break space
0x1680, // ogham space mark
0x180E, // Mongolian vowel separator
0x2000, // en quad
0x2001, // em quad
0x2002, // en space
0x2003, // em space
0x2004, // three-per-em space
0x2005, // four-per-em space
0x2006, // six-per-em space
0x2007, // figure space
0x2008, // punctuation space
0x2009, // thin space
0x200A, // hair space
0x200B, // zero width space
0x200C, // zero width non-joiner
0x200D, // zero width joiner
0x2028, // line separator
0x2029, // paragraph separator
0x202F, // narrow no-break space
0x205F, // medium mathematical space
0x2060, // word joiner
0x3000, // ideograph space
0xFEFF, // zero width non-breaking space
};
constexpr size_t kSpaceCodePointsCount = CARB_COUNTOF(kSpaceCodePoints);
constexpr const CodePoint* const kSpaceCodePointsEnd = kSpaceCodePoints + kSpaceCodePointsCount;
return std::binary_search(kSpaceCodePoints, kSpaceCodePointsEnd, cp);
}
private:
static constexpr uint8_t s_leadBits[] = { 7, 5, 4, 3, 2, 1, 0 };
static constexpr CodePoint kMaxCodePoint = 0x0010ffff;
static constexpr uint32_t kContinuationShift = 6;
static constexpr uint8_t kContinuationBits = 0x80;
static constexpr uint8_t kContinuationMask = (1u << kContinuationShift) - 1;
static constexpr CodePoint kSurrogateBias = 0x00010000;
static constexpr CodePoint kSurrogateBaseHigh = 0x0000d800;
static constexpr CodePoint kSurrogateBaseLow = 0x0000dc00;
static constexpr CodePoint kSurrogateMin = 0x0000d800;
static constexpr CodePoint kSurrogateMax = 0x0000dfff;
static constexpr uint32_t kSurrogateBits = 10;
static constexpr CodePoint kSurrogateMask = ((1 << kSurrogateBits) - 1);
static constexpr size_t kMaxSequenceBytes = 7;
static constexpr uint8_t k7BitLimit = 0x80;
static constexpr uint8_t kMinLeadByte = 0xc0;
static constexpr uint8_t getContinuationLength(size_t leadByte)
{
constexpr uint8_t s_continuationSize[] = {
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xc0 - 0xcf */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xd0 - 0xdf */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xe0 - 0xef */
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 0, /* 0xf0 - 0xff */
};
return s_continuationSize[leadByte - kMinLeadByte];
}
static constexpr uint8_t getLeadMask(size_t continuationLength)
{
constexpr uint8_t s_leadMasks[] = { (1u << s_leadBits[0]) - 1, (1u << s_leadBits[1]) - 1,
(1u << s_leadBits[2]) - 1, (1u << s_leadBits[3]) - 1,
(1u << s_leadBits[4]) - 1, (1u << s_leadBits[5]) - 1,
(1u << s_leadBits[6]) - 1 };
return s_leadMasks[continuationLength];
}
static constexpr uint8_t getLeadByte(size_t continuationLength)
{
constexpr uint8_t s_leadBytes[] = {
(0xffu << (s_leadBits[0] + 1)) & 0xff, (0xffu << (s_leadBits[1] + 1)) & 0xff,
(0xffu << (s_leadBits[2] + 1)) & 0xff, (0xffu << (s_leadBits[3] + 1)) & 0xff,
(0xffu << (s_leadBits[4] + 1)) & 0xff, (0xffu << (s_leadBits[5] + 1)) & 0xff,
(0xffu << (s_leadBits[6] + 1)) & 0xff
};
return s_leadBytes[continuationLength];
}
static constexpr CodePoint getMaxCodePoint(size_t continuationLength)
{
constexpr CodePoint s_maxCodePoint[] = { 0x00000080, 0x00000800, 0x00010000, 0x00200000,
0x04000000, 0x80000000, 0xffffffff };
return s_maxCodePoint[continuationLength];
}
inline static CodePoint decodeContinuationByte(uint8_t byte, size_t continuationLength)
{
return (byte & kContinuationMask) << ((continuationLength - 1) * kContinuationShift);
}
static constexpr CodePoint getFailureCodepoint(Flags flags)
{
return (flags & fDecodeUseDefault) != 0 ? kDefaultCodePoint : 0;
}
static bool parseUtf8(const CodeByte* str,
const CodeByte** outNext,
CodePoint* outCodePoint,
size_t lengthInBytes = kNullTerminated,
Flags flags = 0)
{
auto fail = [&]() -> bool {
// we weren't asked to attempt to skip over invalid code sequences => just fail out
if ((flags & fDecodeSkipInvalid) == 0)
{
return false;
}
// walk the rest of the string skipping over continuation bytes and invalid lead bytes.
// Note that we've already tested and rejected the first byte so we just need to continue
// the search starting at the next byte.
for (size_t i = 1; i < lengthInBytes; i++)
{
const auto b = static_cast<uint8_t>(str[i]);
// continuation byte => skip it.
if ((b & ~kContinuationMask) == kContinuationBits)
continue;
// invalid lead byte => skip it.
if (b >= kMinLeadByte && getContinuationLength(b) == 0)
continue;
// invalid range of bytes
if (b >= k7BitLimit && b < kMinLeadByte)
continue;
*outNext = str + i;
return false;
}
// We've hit the end of the string. This mean that the sequence is
// either invalid, misaligned, or an illegal overlong sequence was
// used. We aren't able to write out the next character pointer if
// we hit this point.
return false;
};
// initialize to failure values;
*outCodePoint = getFailureCodepoint(flags);
*outNext = nullptr;
// the string doesn't have any more bytes in it -> no more codepoints => fail.
if (lengthInBytes == 0)
{
return false;
}
const auto byte = static_cast<uint8_t>(*str);
// the current code byte is at the null terminator -> no more codepoints => finish.
if (byte == '\0')
{
*outCodePoint = byte;
return true;
}
// the current code byte is a direct ASCII character => finish.
if (byte < k7BitLimit)
{
*outCodePoint = byte;
*outNext = str + 1;
return true;
}
if (byte < kMinLeadByte)
{
return fail();
}
// the current code byte is a lead byte => calculate the sequence length and return the
// start of the next codepoint.
const size_t continuationLength = getContinuationLength(byte);
const size_t sequenceLength = continuationLength + 1;
// not enough bytes left in the string to complete this codepoint => fail.
// continuationLength of 0 is invalid => fail
if (lengthInBytes < sequenceLength || continuationLength == 0)
{
return fail();
}
// decode the codepoint.
{
CodePoint cp = (byte & getLeadMask(continuationLength)) << (continuationLength * kContinuationShift);
for (size_t i = 0; i < continuationLength; i++)
{
// validate the continuation byte so we don't walk past the
// end of a null terminated string
if ((uint8_t(str[i + 1]) & ~kContinuationMask) != kContinuationBits)
{
return fail();
}
cp |= decodeContinuationByte(str[i + 1], continuationLength - i);
}
*outCodePoint = cp;
*outNext = str + sequenceLength;
return true;
}
}
};
class Utf8Iterator
{
public:
using CodeByte = Utf8Parser::CodeByte;
using CodePoint = Utf8Parser::CodePoint;
using Flags = Utf8Parser::Flags;
// Reference the special length value used for null terminated strings.
static constexpr size_t kNullTerminated = Utf8Parser::kNullTerminated;
Utf8Iterator()
: m_prev(nullptr), m_string(nullptr), m_length(kNullTerminated), m_flags(0), m_lastCodePoint(0), m_index(0)
{
}
Utf8Iterator(const CodeByte* string, size_t lengthInBytes = kNullTerminated, Flags flags = 0)
: m_prev(nullptr), m_string(string), m_length(lengthInBytes), m_flags(flags), m_lastCodePoint(0), m_index(0)
{
next();
}
Utf8Iterator(const Utf8Iterator& it)
{
copy(it);
}
operator bool() const
{
return isValid();
}
bool operator!() const
{
return !isValid();
}
CodePoint operator*() const
{
return m_lastCodePoint;
}
const CodeByte* operator&() const
{
return m_prev;
}
Utf8Iterator& operator++()
{
next();
return *this;
}
Utf8Iterator operator++(int32_t)
{
Utf8Iterator tmp = (*this);
next();
return tmp;
}
template <typename T>
Utf8Iterator& operator+=(T count)
{
for (T i = 0; i < count && m_prev != nullptr; i++)
next();
return *this;
}
template <typename T>
Utf8Iterator operator+(T count) const
{
Utf8Iterator tmp = *this;
return (tmp += count);
}
bool operator==(const Utf8Iterator& it) const
{
return m_string == it.m_string;
}
bool operator!=(const Utf8Iterator& it) const
{
return m_string != it.m_string;
}
bool operator<(const Utf8Iterator& it) const
{
return m_string < it.m_string;
}
bool operator<=(const Utf8Iterator& it) const
{
return m_string <= it.m_string;
}
bool operator>(const Utf8Iterator& it) const
{
return m_string > it.m_string;
}
bool operator>=(const Utf8Iterator& it) const
{
return m_string >= it.m_string;
}
Utf8Iterator& operator=(const Utf8Iterator& it)
{
// Note: normally we'd check for an identity assignment in this operator overload and
// ignore. Unfortunately we can't do that here since we also override the '&'
// operator above. Since this copy operation should still be safe for an identity
// assignment, we'll just let it proceed.
copy(it);
return *this;
}
Utf8Iterator& operator=(const CodeByte* str)
{
m_prev = nullptr;
m_string = str;
m_length = kNullTerminated;
m_lastCodePoint = 0;
m_flags = 0;
m_index = 0;
next();
return *this;
}
size_t getIndex() const
{
return m_index - 1;
}
size_t getCodepointSize() const
{
if (m_string == nullptr)
return m_prev == nullptr ? 0 : 1;
return m_string - m_prev;
}
private:
void copy(const Utf8Iterator& it)
{
m_prev = it.m_prev;
m_string = it.m_string;
m_length = it.m_length;
m_flags = it.m_flags;
m_lastCodePoint = it.m_lastCodePoint;
m_index = it.m_index;
}
bool isValid() const
{
return m_string != nullptr && m_lastCodePoint != 0;
}
void next()
{
const CodeByte* ptr;
if (m_string == nullptr)
{
m_prev = nullptr;
return;
}
if (m_length == 0)
{
m_string = nullptr;
m_prev = nullptr;
m_lastCodePoint = 0;
return;
}
ptr = Utf8Parser::nextCodePoint(m_string, m_length, &m_lastCodePoint, m_flags);
if (m_length != kNullTerminated)
m_length -= ptr - m_string;
m_prev = m_string;
m_string = ptr;
m_index++;
}
const CodeByte* m_prev;
const CodeByte* m_string;
size_t m_length;
Flags m_flags;
CodePoint m_lastCodePoint;
size_t m_index;
};
// implementation details used for string conversions - ignore this!
#ifndef DOXYGEN_SHOULD_SKIP_THIS
namespace detail
{
template <typename T,
typename O,
std::pair<size_t, char32_t>(toCodePoint)(const T*),
size_t(fromCodePoint)(char32_t c, O* out, size_t outLen)>
inline size_t convertBetweenUnicodeFormatsRaw(const T* str, O* out, size_t outLen)
{
// the last element written to the output
O* prev = nullptr;
size_t prevLen = 0;
size_t written = 0;
size_t read = 0;
bool fullyRead = false;
if (str == nullptr)
{
return 0;
}
// no output => ignore outLen in the loop
if (out == nullptr)
{
outLen = SIZE_MAX;
}
if (outLen == 0)
{
return 0;
}
for (;;)
{
size_t len;
// decode the input to UTF-32
std::pair<size_t, char32_t> decoded = toCodePoint(str + read);
// decode failed
if (decoded.first == 0)
{
break;
}
// encode from UTF-32 to the output format
len = fromCodePoint(decoded.second, (out == nullptr) ? nullptr : out + written, outLen - written);
// out of buffer space
if (len == 0)
{
break;
}
// store the last written character
if (out != nullptr)
{
prev = out + written;
}
prevLen = len;
// advance the indices
written += len;
read += decoded.first;
// hit the null terminator => finished
if (decoded.second == '\0')
{
fullyRead = true;
break;
}
}
// if the string was truncated, we need to cut out the last character
// from the written count
if (!fullyRead)
{
if (written == outLen)
{
written -= prevLen;
written += 1;
if (out != nullptr)
{
*prev = '\0';
}
}
else
{
if (out != nullptr)
{
out[written] = '\0';
}
written++;
}
}
return written;
}
inline std::pair<size_t, char32_t> utf8ToCodePoint(const char* str)
{
char32_t c = 0;
const char* p = Utf8Parser::nextCodePoint(str, Utf8Parser::kNullTerminated, &c, Utf8Parser::fDecodeUseDefault);
if (c == '\0')
{
return std::pair<size_t, char32_t>(1, '\0');
}
else if (p == nullptr)
{
// invalid character, skip it
return std::pair<size_t, char32_t>(1, c);
}
else
{
return std::pair<size_t, char32_t>(p - str, c);
}
}
inline size_t utf32FromCodePoint(char32_t c, char32_t* out, size_t outLen)
{
if (outLen == 0)
{
return 0;
}
else
{
if (out != nullptr)
{
out[0] = c;
}
return 1;
}
};
inline std::pair<size_t, char32_t> utf32ToCodePoint(const char32_t* str)
{
return std::pair<size_t, char32_t>(1, *str);
}
inline size_t utf8FromCodePoint(char32_t c, char* out, size_t outLen)
{
char dummy[8];
size_t len = 0;
char* p;
if (out == nullptr)
{
out = dummy;
outLen = CARB_MIN(outLen, CARB_COUNTOF(dummy));
}
p = Utf8Parser::getCodeBytes(c, out, outLen, &len);
if (p == nullptr)
{
return 0;
}
else
{
return len;
}
}
inline std::pair<size_t, char32_t> utf16ToCodePoint(const char16_t* str)
{
char32_t c;
switch (Utf8Parser::classifyUtf16SurrogateMember(str[0]))
{
case Utf8Parser::SurrogateMember::eHigh:
c = Utf8Parser::decodeUtf16CodePoint(str[0], str[1]);
if (c == 0) // invalid surrogate pair
{
break;
}
return std::pair<size_t, char32_t>(2, c);
// a stray low surrogate is invalid
case Utf8Parser::SurrogateMember::eLow:
break;
default:
return std::pair<size_t, char32_t>(1, str[0]);
}
// failed to parse => just return the invalid character code point
constexpr static auto kDefaultCodePoint_ = Utf8Parser::kDefaultCodePoint; // CC-1110
return std::pair<size_t, char32_t>(1, kDefaultCodePoint_);
}
inline size_t utf16FromCodePoint(char32_t c, char16_t* out, size_t outLen)
{
char32_t mangled = 0;
size_t len;
len = Utf8Parser::encodeUtf16CodePoint(c, &mangled);
if (outLen < len)
{
return 0;
}
if (out != nullptr)
{
switch (len)
{
default:
break;
case 1:
out[0] = char16_t(mangled);
break;
case 2:
out[0] = char16_t(mangled & 0xFFFF);
out[1] = char16_t(mangled >> 16);
break;
}
}
return len;
}
template <typename T, typename O, size_t conv(const T* str, O* out, size_t outLen)>
inline std::basic_string<O> convertBetweenUnicodeFormats(const T* str)
{
omni::extras::ScratchBuffer<O, 4096> buffer;
size_t len = conv(str, nullptr, 0);
if (len == 0 || !buffer.resize(len))
{
return std::basic_string<O>();
}
conv(str, buffer.data(), buffer.size());
return std::basic_string<O>(buffer.data(), buffer.data() + len - 1);
}
} // namespace detail
#endif
inline size_t convertUtf8StringToUtf32(const char* str, char32_t* out, size_t outLen) noexcept
{
return detail::convertBetweenUnicodeFormatsRaw<char, char32_t, detail::utf8ToCodePoint, detail::utf32FromCodePoint>(
str, out, outLen);
}
inline std::u32string convertUtf8StringToUtf32(const char* str)
{
return detail::convertBetweenUnicodeFormats<char, char32_t, convertUtf8StringToUtf32>(str);
}
inline std::u32string convertUtf8StringToUtf32(std::string str)
{
return convertUtf8StringToUtf32(str.c_str());
}
inline size_t convertUtf32StringToUtf8(const char32_t* str, char* out, size_t outLen)
{
return detail::convertBetweenUnicodeFormatsRaw<char32_t, char, detail::utf32ToCodePoint, detail::utf8FromCodePoint>(
str, out, outLen);
}
inline std::string convertUtf32StringToUtf8(const char32_t* str)
{
return detail::convertBetweenUnicodeFormats<char32_t, char, convertUtf32StringToUtf8>(str);
}
inline std::string convertUtf32StringToUtf8(std::u32string str)
{
return convertUtf32StringToUtf8(str.c_str());
}
inline size_t convertUtf16StringToUtf8(const char16_t* str, char* out, size_t outLen)
{
return detail::convertBetweenUnicodeFormatsRaw<char16_t, char, detail::utf16ToCodePoint, detail::utf8FromCodePoint>(
str, out, outLen);
}
inline std::string convertUtf16StringToUtf8(const char16_t* str)
{
return detail::convertBetweenUnicodeFormats<char16_t, char, convertUtf16StringToUtf8>(str);
}
inline std::string convertUtf16StringToUtf8(std::u16string str)
{
return convertUtf16StringToUtf8(str.c_str());
}
inline size_t convertUtf8StringToUtf16(const char* str, char16_t* out, size_t outLen) noexcept
{
return detail::convertBetweenUnicodeFormatsRaw<char, char16_t, detail::utf8ToCodePoint, detail::utf16FromCodePoint>(
str, out, outLen);
}
inline std::u16string convertUtf8StringToUtf16(const char* str)
{
return detail::convertBetweenUnicodeFormats<char, char16_t, convertUtf8StringToUtf16>(str);
}
inline std::u16string convertUtf8StringToUtf16(std::string str)
{
return convertUtf8StringToUtf16(str.c_str());
}
inline size_t convertUtf8StringToWide(const char* str, wchar_t* out, size_t outLen) noexcept
{
#if CARB_PLATFORM_WINDOWS
static_assert(sizeof(*out) == sizeof(char16_t), "unexpected wchar_t type");
return detail::convertBetweenUnicodeFormatsRaw<char, char16_t, detail::utf8ToCodePoint, detail::utf16FromCodePoint>(
str, reinterpret_cast<char16_t*>(out), outLen);
#else
static_assert(sizeof(*out) == sizeof(char32_t), "unexpected wchar_t type");
return detail::convertBetweenUnicodeFormatsRaw<char, char32_t, detail::utf8ToCodePoint, detail::utf32FromCodePoint>(
str, reinterpret_cast<char32_t*>(out), outLen);
#endif
}
inline std::wstring convertUtf8StringToWide(const char* str)
{
return detail::convertBetweenUnicodeFormats<char, wchar_t, convertUtf8StringToWide>(str);
}
inline std::wstring convertUtf8StringToWide(std::string str)
{
return convertUtf8StringToWide(str.c_str());
}
inline size_t convertWideStringToUtf8(const wchar_t* str, char* out, size_t outLen) noexcept
{
#if CARB_PLATFORM_WINDOWS
static_assert(sizeof(*str) == sizeof(char16_t), "unexpected wchar_t type");
return detail::convertBetweenUnicodeFormatsRaw<char16_t, char, detail::utf16ToCodePoint, detail::utf8FromCodePoint>(
reinterpret_cast<const char16_t*>(str), out, outLen);
#else
static_assert(sizeof(*str) == sizeof(char32_t), "unexpected wchar_t type");
return detail::convertBetweenUnicodeFormatsRaw<char32_t, char, detail::utf32ToCodePoint, detail::utf8FromCodePoint>(
reinterpret_cast<const char32_t*>(str), out, outLen);
#endif
}
inline std::string convertWideStringToUtf8(const wchar_t* str)
{
return detail::convertBetweenUnicodeFormats<wchar_t, char, convertWideStringToUtf8>(str);
}
inline std::string convertWideStringToUtf8(std::wstring str)
{
return convertWideStringToUtf8(str.c_str());
}
} // namespace extras
} // namespace carb