Utf8Parser.h

Fully qualified name: carb/extras/Utf8Parser.h

File members: carb/extras/Utf8Parser.h

// SPDX-FileCopyrightText: Copyright (c) 2019-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: LicenseRef-NvidiaProprietary
//
// NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
// property and proprietary rights in and to this material, related
// documentation and any modifications thereto. Any use, reproduction,
// disclosure or distribution of this material and related documentation
// without an express license agreement from NVIDIA CORPORATION or
// its affiliates is strictly prohibited.
#pragma once

#include "../Defines.h"
#include "../../omni/extras/ScratchBuffer.h"
#include "../cpp/Optional.h"
#include "../cpp/StringView.h"
#include "../cpp/ZStringView.h"

#include <algorithm>
#include <cstdint>
#include <string>

namespace carb::extras
{
namespace detail
{

// Type used to hold a single decoded code point (32 bits wide).
using CodePoint = char32_t;

// U+FFFD.  Utf8Parser exposes this value as its default code point for failed decodes.
constexpr CodePoint kReplacementChar{ 0xFFFD };

// U+10FFFF.  Utf8Parser exposes this value as its maximum code point.
constexpr CodePoint kLastCodePoint{ 0x10FFFF };

// UTF-16 surrogate pairs
// NOTE(review): the two `...Mask` constants below are not referenced by the code in this part of
// the file; presumably `(unit & kSurrogateFirstMask) == kSurrogateFirst` tests for any surrogate and
// `(unit & kSurrogateSecondMask) == kSurrogateSecond` tests for a low surrogate - confirm at the
// usage sites.
constexpr char16_t kSurrogateFirstMask{ 0xf800 };
// First value of the surrogate range; also the first high surrogate.
constexpr char16_t kSurrogateFirst{ 0xd800 };
constexpr char16_t kSurrogateSecondMask{ 0xfc00 };
// First value of the low surrogate range.
constexpr char16_t kSurrogateSecond{ 0xdc00 };
// Last value of the surrogate range (inclusive).
constexpr char16_t kSurrogateLast{ 0xdfff };
// Value added to the combined payload of a surrogate pair to obtain the final code point.
constexpr char32_t kSurrogateBias{ 0x10000 };
// Number of payload bits carried by each member of a surrogate pair.
constexpr size_t kSurrogateShift = 10;
// Mask selecting the `kSurrogateShift` payload bits of a surrogate (0x3ff).
constexpr char16_t kSurrogateMask = (char16_t(1) << kSurrogateShift) - 1;

} // namespace detail

class Utf8Parser
{
public:
    // A single decoded code point.
    using CodePoint = detail::CodePoint;

    // A single UTF-16 code unit.
    using Utf16CodeUnit = char16_t;

    // A single byte of a UTF-8 encoded string.
    using CodeByte = char;

    // Bit flags that control decoding and encoding (see the `fDecode*` and `fEncode*` constants).
    using Flags = uint32_t;

    // Result of classifyUtf16SurrogateMember().
    enum class SurrogateMember
    {
        // The code point is outside of the surrogate range.
        eNone,

        // The code point is in the range [0xd800, 0xdc00).
        eHigh,

        // The code point is in the range [0xdc00, 0xdfff].
        eLow,
    };

    // When set, a failed decode reports `kDefaultCodePoint` instead of 0 as the code point.
    static constexpr Flags fDecodeUseDefault = 0x00000001;

    // When set, a failed decode searches forward for the next byte that could start a code point
    // and returns that position instead of returning `nullptr` immediately.
    static constexpr Flags fDecodeSkipInvalid = 0x00000002;

    // When set, code points at or above 0x10000 are encoded as a surrogate pair, with each
    // surrogate written as its own three byte sequence (six bytes in total).
    static constexpr Flags fEncodeUseUtf16 = 0x00000004;

    // When set, nextCodePoint() returns a high surrogate as-is instead of attempting to combine it
    // with the low surrogate that follows it.
    static constexpr Flags fEncodeIgnoreSurrogatePairs = 0x00000008;

    // Special length value indicating that the string is null terminated.
    static constexpr size_t kNullTerminated = ~0ull;

    // NOTE(review): not referenced by the code in this part of the file; presumably a sentinel for
    // an invalid code point - confirm against callers.
    static constexpr CodePoint kInvalidCodePoint = ~0u;

    // Size in bytes of the longest sequence handled by this parser.  lastCodePoint() uses it as
    // the size of its look-back window.
    static constexpr size_t kMaxSequenceLength = 7;

    // Code point reported for a failed decode when `fDecodeUseDefault` is set.
    static constexpr CodePoint kDefaultCodePoint = detail::kReplacementChar;

    // Decodes the code point that starts at `str` and returns the position of the one after it.
    //
    // `lengthInBytes` is the number of bytes available at `str`, or `kNullTerminated`.  When
    // `codepoint` is not null it receives the decoded value, or the failure value selected by
    // `flags` when decoding fails.  Unless `fEncodeIgnoreSurrogatePairs` is set, a high surrogate is
    // combined with the low surrogate following it into a single code point.
    //
    // Returns whatever position parseUtf8() reported: `nullptr` at the null terminator and, unless
    // `fDecodeSkipInvalid` lets the parser resynchronize, on a failed decode.
    static const CodeByte* nextCodePoint(const CodeByte* str,
                                         size_t lengthInBytes = kNullTerminated,
                                         CodePoint* codepoint = nullptr,
                                         Flags flags = 0)
    {
        // the output parameter is optional, so funnel every write through one guarded helper.
        const auto store = [codepoint](CodePoint value) {
            if (codepoint != nullptr)
            {
                *codepoint = value;
            }
        };

        CodePoint first = 0;
        const CodeByte* cursor = nullptr;
        const bool firstOk = parseUtf8(str, &cursor, &first, lengthInBytes, flags);
        store(first);

        // nothing more to do on a failed parse.
        if (!firstOk)
        {
            return cursor;
        }

        // nothing more to do unless this is a high surrogate that the caller wants combined.
        const bool combinePairs = (flags & fEncodeIgnoreSurrogatePairs) == 0;
        if (!combinePairs || classifyUtf16SurrogateMember(first) != SurrogateMember::eHigh)
        {
            return cursor;
        }

        // a bounded string has fewer bytes left now that the high surrogate has been consumed.
        size_t remaining = kNullTerminated;
        if (lengthInBytes != kNullTerminated)
        {
            remaining = lengthInBytes - static_cast<size_t>(cursor - str);
        }

        // decode what should be the low surrogate.
        CodePoint second = 0;
        const CodeByte* const secondStart = cursor;
        const bool secondOk = parseUtf8(secondStart, &cursor, &second, remaining, flags);

        if (secondOk && classifyUtf16SurrogateMember(second) == SurrogateMember::eLow)
        {
            // well formed pair => merge both payloads and add the bias.
            store((((first & kSurrogateMask) << kSurrogateBits) | (second & kSurrogateMask)) + kSurrogateBias);
        }
        else
        {
            // the high surrogate was not followed by a low surrogate => report a failure.
            store(getFailureCodepoint(flags));
        }

        return cursor;
    }

    // Finds the last code point of `str` by scanning the string backwards.
    //
    // `codepoint`, when not null, is first set to the failure value selected by `flags` and is
    // overwritten with the decoded value on success.  Of `flags`, only `fDecodeSkipInvalid` and
    // `fDecodeUseDefault` are forwarded to the decoder.  Without `fDecodeSkipInvalid` only the last
    // `kMaxSequenceLength` bytes are examined and any malformed trailing data is an error; with it,
    // malformed data is skipped and the search continues toward the start of the string.
    //
    // Returns a pointer to the first byte of the code point that was found, or `nullptr` when the
    // string is empty or no code point could be decoded.
    static const CodeByte* lastCodePoint(carb::cpp::basic_string_view<CodeByte> str,
                                         CodePoint* codepoint = nullptr,
                                         Flags flags = fDecodeUseDefault)
    {
        // Prepare error value result
        if (codepoint != nullptr)
        {
            *codepoint = getFailureCodepoint(flags);
        }

        // Check if it's a null or empty string
        if (str.empty())
        {
            return nullptr;
        }

        // Make sure no unexpected flags pass into used `nextCodePoint` function
        constexpr Flags kErrorHandlingMask = fDecodeSkipInvalid | fDecodeUseDefault;
        const Flags helperParserFlags = (flags & kErrorHandlingMask);

        size_t curCodePointSize = 0; // Keeps max number of bytes for a decoding attempt with `nextCodePoint`
        const bool skipInvalid = (flags & fDecodeSkipInvalid) != 0;

        // Walk the string backwards to find the start of the last CodePoint and decode it
        // Note: it can be a single byte or a sequence, also if not searching for the last valid CodePoint
        // the maximum number of bytes to check is just last `kMaxSequenceLength` bytes instead of the full string
        //
        // Note that the 'array-bounds' warning needs to be disabled here since it can pick up the
        // calculation of `rIterEnd` as being out of bounds.  However, for cases of short strings
        // this is intentional since the iterator needs to point to one byte before the start of the
        // string.
        CARB_IGNOREWARNING_GNUC_WITH_PUSH("-Warray-bounds")
        // `rIterBegin` is the last byte of the string; `rIterEnd` is the (exclusive) stop position.
        const CodeByte* const rIterBegin = str.data() - 1 + str.size();
        const CodeByte* const rIterEnd =
            (flags & fDecodeSkipInvalid) ? str.data() - 1 : CARB_MAX(str.data() - 1, rIterBegin - kMaxSequenceLength);
        for (const CodeByte* rIter = rIterBegin; rIter != rIterEnd; --rIter)
        {
            const uint8_t curByte = static_cast<uint8_t>(*rIter);

            ++curCodePointSize;

            // Check if the current code byte is a direct ASCII character
            if (curByte < k7BitLimit)
            {
                // If parsed more than one byte then it's an error
                if (curCodePointSize > 1 && !skipInvalid)
                {
                    return nullptr;
                }

                if (codepoint != nullptr)
                {
                    *codepoint = curByte;
                }
                return rIter;
            }

            // The current code byte is a continuation byte so step further
            if (curByte < kMinLeadByte)
            {
                continue;
            }

            // The current code byte is a lead byte, decode the sequence and check that all bytes were used
            CodePoint cp{};
            const CodeByte* next = nextCodePoint(rIter, curCodePointSize, &cp, helperParserFlags);

            if (!next)
            {
                if (skipInvalid)
                {
                    // Restart the byte count so the next attempt only covers bytes before this one
                    curCodePointSize = 0;
                    continue;
                }

                return nullptr;
            }

            // Validate that all bytes till the end were used if expecting no invalid bytes
            // Ex: "\xce\xa6\xa6" is a 2 byte sequence "\xce\xa6" for a 0x03A6 code point followed by excessive
            // follow up byte "\xa6". The first 2 bytes will be decoded by the `nextCodePoint` properly
            // and `next` will be pointing at the last "\xa6" byte
            if (!skipInvalid && curCodePointSize != static_cast<size_t>(next - rIter))
            {
                return nullptr;
            }

            const SurrogateMember surrogateType = classifyUtf16SurrogateMember(cp);

            // Encountered the high surrogate part first which is an error
            if (CARB_UNLIKELY(surrogateType == SurrogateMember::eHigh))
            {
                if (skipInvalid)
                {
                    // Just skip it and search further
                    curCodePointSize = 0;
                    continue;
                }

                return nullptr;
            }
            // Found the low part of a surrogate pair, need to continue parsing to get the high part
            else if (CARB_UNLIKELY(surrogateType == SurrogateMember::eLow))
            {
                constexpr int kSurrogatePartSize = 3;
                constexpr int kFullSurrogatePairSize = 2 * kSurrogatePartSize;

                // To prepare for possible continuation of parsing if skipping invalid bytes and no high surrogate is
                // found reset the possible CodePoint size
                curCodePointSize = 0;

                // For a valid UTF-8 string there must be a high surrogate (3 bytes) preceding the low surrogate
                // (3 bytes), so make sure that many bytes are available before the current position
                if (rIter <= rIterEnd + kSurrogatePartSize)
                {
                    if (skipInvalid)
                    {
                        // Skip the low surrogate data and continue to check the preceding byte
                        continue;
                    }
                    return nullptr;
                }

                // Step 3 bytes preceding the low surrogate
                const CodeByte* const possibleHighSurStart = rIter - kSurrogatePartSize;
                // Check if it starts with a lead byte
                if (static_cast<uint8_t>(*possibleHighSurStart) < kMinLeadByte)
                {
                    if (skipInvalid)
                    {
                        continue;
                    }
                    return nullptr;
                }

                // Try to parse 6 bytes (full surrogate pair size) to get the whole CodePoint without skipping invalid
                // bytes
                const CodeByte* const decodedPairEnd =
                    nextCodePoint(possibleHighSurStart, kFullSurrogatePairSize, &cp, 0);

                if (!decodedPairEnd)
                {
                    if (skipInvalid)
                    {
                        continue;
                    }
                    return nullptr;
                }

                // Check if used all 6 bytes (as expected from a surrogate pair)
                if (decodedPairEnd - possibleHighSurStart != kFullSurrogatePairSize)
                {
                    if (skipInvalid)
                    {
                        continue;
                    }
                    return nullptr;
                }

                // A proper surrogate pair was parsed into the `cp`
                // and only the `rIter` has invalid value at this point
                rIter = possibleHighSurStart;
                // Just exit the block so the code below reports the result
            }

            if (codepoint)
            {
                *codepoint = cp;
            }

            // Everything is fine thus return start of the sequence
            return rIter;
        }
        CARB_IGNOREWARNING_GNUC_POP

        // Didn't find start of a valid CodePoint
        return nullptr;
    }

    static size_t getLengthInCodePoints(const CodeByte* str, size_t maxLengthInBytes = kNullTerminated, Flags flags = 0)
    {
        const CodeByte* current;
        const CodeByte* next;
        size_t count = 0;

        // get the second codepoint in the string.
        current = str;
        next = nextCodePoint(str, maxLengthInBytes, nullptr, flags);

        // empty or invalid string => fail.
        if (next == nullptr)
            return 0;

        if (maxLengthInBytes != kNullTerminated)
        {
            maxLengthInBytes -= (size_t)(next - current);
        }
        count++;

        do
        {
            current = next;
            next = nextCodePoint(current, maxLengthInBytes, nullptr, flags);

            if (next == nullptr)
                return count;

            if (maxLengthInBytes != kNullTerminated)
            {
                maxLengthInBytes -= (size_t)(next - current);
            }
            count++;
        } while (maxLengthInBytes > 0);

        return count;
    }

    // Calculates how many UTF-8 code bytes are needed to encode the code points in `str`.
    //
    // `str` is read until a zero code point is found or `maxLengthInCodePoints` entries have been
    // processed, whichever comes first.  When `fEncodeUseUtf16` is set in `flags`, code points in
    // the range [0x10000, 0x200000) are counted as six bytes (a surrogate pair of two three byte
    // sequences) instead of four.  The returned size does not include a terminator.
    static size_t getLengthInCodeBytes(const CodePoint* str,
                                       size_t maxLengthInCodePoints = kNullTerminated,
                                       Flags flags = 0)
    {
        const size_t largeCodePointSize = ((flags & fEncodeUseUtf16) != 0) ? 6 : 4;
        size_t count = 0;

        // the bound must be tested before the element is read: a buffer that holds exactly
        // `maxLengthInCodePoints` entries and no terminator must never be read past its end.
        for (size_t i = 0; i < maxLengthInCodePoints && str[i] != 0; i++)
        {
            if (str[i] < getMaxCodePoint(0))
                count++;

            else if (str[i] < getMaxCodePoint(1))
                count += 2;

            else if (str[i] < getMaxCodePoint(2))
                count += 3;

            else if (str[i] < getMaxCodePoint(3))
                count += largeCodePointSize;

            else if (str[i] < getMaxCodePoint(4))
                count += 5;

            else if (str[i] < getMaxCodePoint(5))
                count += 6;

            else
                count += 7;
        }

        return count;
    }

    // Calculates how many UTF-8 code bytes are needed to encode the UTF-16 string `str`.
    //
    // `str` is read until a zero code unit is found or `maxLengthInCodePoints` code units have been
    // processed, whichever comes first.  A high surrogate immediately followed by a low surrogate
    // is counted once as a single code point: four bytes, or six bytes when `fEncodeUseUtf16` is
    // set in `flags`.  The returned size does not include a terminator.
    static size_t getLengthInCodeBytes(const Utf16CodeUnit* str,
                                       size_t maxLengthInCodePoints = kNullTerminated,
                                       Flags flags = 0)
    {
        const size_t largeCodePointSize = ((flags & fEncodeUseUtf16) != 0) ? 6 : 4;
        size_t count = 0;

        // the bound must be tested before the element is read: a buffer that holds exactly
        // `maxLengthInCodePoints` code units and no terminator must never be read past its end.
        for (size_t i = 0; i < maxLengthInCodePoints && str[i] != 0; i++)
        {
            if (str[i] < getMaxCodePoint(0))
                count++;

            else if (str[i] < getMaxCodePoint(1))
                count += 2;

            else
            {
                // found a surrogate pair in the string -> both of these code units will decode to
                // a single UTF-32 codepoint => skip the low surrogate and add the size of a
                // single encoded codepoint.  Note that `str[i + 1]` is only read when it is within
                // the requested length.
                if (str[i] >= kSurrogateBaseHigh && str[i] < kSurrogateBaseLow && i + 1 < maxLengthInCodePoints &&
                    str[i + 1] >= kSurrogateBaseLow && str[i + 1] <= kSurrogateMax)
                {
                    i++;
                    count += largeCodePointSize;
                }

                // not part of a UTF-16 surrogate pair => this will encode to 3 bytes in UTF-8.
                else
                    count += 3;
            }
        }

        return count;
    }

    // Decodes and returns the code point that starts at `str`.
    //
    // This is nextCodePoint() with the position of the following code point discarded, so the
    // result is the failure value selected by `flags` when decoding fails.
    static CodePoint getCodePoint(const CodeByte* str, size_t lengthInBytes = kNullTerminated, Flags flags = 0)
    {
        CodePoint decoded{ 0 };
        static_cast<void>(nextCodePoint(str, lengthInBytes, &decoded, flags));
        return decoded;
    }

    // Encodes the code point `cp` as UTF-8 into the buffer `str`.
    //
    // `lengthInBytes` is the size of the buffer at `str`.  `bytesWritten` must not be null; it
    // receives the number of bytes stored, or 0 on failure.  When `fEncodeUseUtf16` is set in
    // `flags`, a code point at or above 0x10000 is written as a surrogate pair, each surrogate
    // being encoded as its own three byte sequence.  No terminator is written.
    //
    // Returns `str` on success, or `nullptr` when the buffer is too small or no sequence length
    // can represent `cp`.
    static CodeByte* getCodeBytes(CodePoint cp, CodeByte* str, size_t lengthInBytes, size_t* bytesWritten, Flags flags = 0)
    {
        size_t sequenceLength = 0;
        size_t continuationLength = 0;
        size_t codePointCount = 1;
        CodePoint codePoint[2] = { cp, 0 };
        CodeByte* result;

        // not enough room in the buffer => fail.
        if (lengthInBytes == 0)
        {
            *bytesWritten = 0;
            return nullptr;
        }

        // a 7-bit ASCII character -> this can be directly stored => store and return.
        if (codePoint[0] < k7BitLimit)
        {
            str[0] = CodeByte((codePoint[0] & 0xff));
            *bytesWritten = 1;
            return str;
        }

        // at this point we know that the encoding for the codepoint is going to require at least
        // two bytes.  We need to calculate the sequence length and encode the bytes.

        // allowing a UTF-16 surrogate pair encoding in the string and the codepoint is above the
        //   range where a surrogate pair is necessary => calculate the low and high codepoints
        //   for the pair and set the sequence length.
        if ((flags & fEncodeUseUtf16) != 0 && codePoint[0] >= kSurrogateBias)
        {
            sequenceLength = 3;
            continuationLength = 2;
            codePointCount = 2;

            codePoint[0] -= kSurrogateBias;

            codePoint[1] = kSurrogateBaseLow | (codePoint[0] & kSurrogateMask);
            codePoint[0] = kSurrogateBaseHigh | ((codePoint[0] >> kSurrogateBits) & kSurrogateMask);
        }

        // not using a UTF-16 surrogate pair => search for the required length of the sequence.
        else
        {
            // figure out the required sequence length for this codepoint.
            for (size_t i = 1; i < kMaxSequenceBytes; i++)
            {
                if (codePoint[0] < getMaxCodePoint(i))
                {
                    sequenceLength = i + 1;
                    continuationLength = i;
                    break;
                }
            }

            // failed to find a sequence length for the given codepoint (?!?) => fail (this should
            //   never happen).
            if (sequenceLength == 0)
            {
                *bytesWritten = 0;
                return nullptr;
            }
        }

        // not enough space in the buffer to store the entire sequence => fail.
        if (lengthInBytes < sequenceLength * codePointCount)
        {
            *bytesWritten = 0;
            return nullptr;
        }

        result = str;

        // write out each of the codepoints.  If UTF-16 encoding is not being used, there will only
        // be one codepoint and this loop will exit after the first iteration.
        for (size_t j = 0; j < codePointCount; j++)
        {
            cp = codePoint[j];

            // write out the lead byte.  The shift is performed on a 64-bit value because a seven
            // byte sequence shifts by 36 bits, which would be undefined on the 32-bit code point.
            const size_t leadShift = continuationLength * kContinuationShift;
            *str = CodeByte(getLeadByte(continuationLength) |
                            ((static_cast<uint64_t>(cp) >> leadShift) & getLeadMask(continuationLength)));
            str++;

            // write out the continuation bytes, most significant six bits first.
            for (size_t i = 0; i < continuationLength; i++)
            {
                *str = CodeByte(kContinuationBits |
                                ((cp >> ((continuationLength - i - 1) * kContinuationShift)) & kContinuationMask));
                str++;
            }
        }

        *bytesWritten = sequenceLength * codePointCount;
        return result;
    }

    // Reports whether `cp` is a high surrogate, a low surrogate, or neither.
    static SurrogateMember classifyUtf16SurrogateMember(CodePoint cp)
    {
        // anything outside of [kSurrogateBaseHigh, kSurrogateMax] is not a surrogate at all.
        if (cp < kSurrogateBaseHigh || cp > kSurrogateMax)
        {
            return SurrogateMember::eNone;
        }

        // inside the surrogate range, the low surrogates start at kSurrogateBaseLow.
        return (cp < kSurrogateBaseLow) ? SurrogateMember::eHigh : SurrogateMember::eLow;
    }

    // Combines the surrogate pair `high` / `low` into the code point that it represents.
    //
    // Returns 0 when `high` is not a high surrogate or `low` is not a low surrogate.
    static CodePoint decodeUtf16CodePoint(CodePoint high, CodePoint low)
    {
        const bool highInRange = high >= kSurrogateBaseHigh && high < kSurrogateBaseLow;
        const bool lowInRange = low >= kSurrogateBaseLow && low <= kSurrogateMax;

        // either member is outside of its expected range -> cannot decode => fail.
        if (!(highInRange && lowInRange))
        {
            return 0;
        }

        // each surrogate carries kSurrogateBits payload bits; the high surrogate holds the upper ones.
        const CodePoint upperBits = (high & kSurrogateMask) << kSurrogateBits;
        const CodePoint lowerBits = low & kSurrogateMask;
        return (upperBits | lowerBits) + kSurrogateBias;
    }

    // Encodes the code point `cp` as UTF-16.
    //
    // A code point below 0x10000 is stored to `*out` unchanged.  Anything else is split into a
    // surrogate pair that is packed into `*out` with the high surrogate in the lower 16 bits and
    // the low surrogate in the upper 16 bits.  `out` may be null to only query the count.
    //
    // Returns the number of UTF-16 code units that the encoding uses (1 or 2).
    static size_t encodeUtf16CodePoint(CodePoint cp, CodePoint* out)
    {
        // too big for a direct encoding => convert it to a surrogate pair.
        if (cp >= kSurrogateBias)
        {
            const CodePoint offset = cp - kSurrogateBias;
            const CodePoint lowUnit = kSurrogateBaseLow | (offset & kSurrogateMask);
            const CodePoint highUnit = kSurrogateBaseHigh | ((offset >> kSurrogateBits) & kSurrogateMask);

            if (out != nullptr)
            {
                *out = highUnit | (lowUnit << 16);
            }

            return 2;
        }

        // small enough for a direct encoding => just store it.
        if (out != nullptr)
        {
            *out = cp;
        }

        return 1;
    }

    // Reports whether `cp` is one of the whitespace code points listed in the table below.
    inline static bool isSpaceCodePoint(CodePoint cp)
    {
        // Source: https://en.wikipedia.org/wiki/Whitespace_character
        // The entries must stay in ascending order since the lookup is a binary search.
        static constexpr CodePoint kWhitespaceTable[] = {
            0x0009, // U+0009 character tabulation
            0x000A, // U+000A line feed
            0x000B, // U+000B line tabulation
            0x000C, // U+000C form feed
            0x000D, // U+000D carriage return
            0x0020, // U+0020 space
            0x0085, // U+0085 next line
            0x00A0, // U+00A0 no-break space
            0x1680, // U+1680 ogham space mark
            0x180E, // U+180E Mongolian vowel separator
            0x2000, // U+2000 en quad
            0x2001, // U+2001 em quad
            0x2002, // U+2002 en space
            0x2003, // U+2003 em space
            0x2004, // U+2004 three-per-em space
            0x2005, // U+2005 four-per-em space
            0x2006, // U+2006 six-per-em space
            0x2007, // U+2007 figure space
            0x2008, // U+2008 punctuation space
            0x2009, // U+2009 thin space
            0x200A, // U+200A hair space
            0x200B, // U+200B zero width space
            0x200C, // U+200C zero width non-joiner
            0x200D, // U+200D zero width joiner
            0x2028, // U+2028 line separator
            0x2029, // U+2029 paragraph separator
            0x202F, // U+202F narrow no-break space
            0x205F, // U+205F medium mathematical space
            0x2060, // U+2060 word joiner
            0x3000, // U+3000 ideograph space
            0xFEFF, // U+FEFF zero width non-breaking space
        };

        const CodePoint* const tableBegin = std::begin(kWhitespaceTable);
        const CodePoint* const tableEnd = std::end(kWhitespaceTable);
        return std::binary_search(tableBegin, tableEnd, cp);
    }

private:
    // Number of payload bits stored in the lead byte of a sequence, indexed by the number of
    // continuation bytes that follow it (0 through 6).
    static constexpr uint8_t s_leadBits[] = { 7, 5, 4, 3, 2, 1, 0 };

    // NOTE(review): not referenced by the code in this part of the file.
    static constexpr CodePoint kMaxCodePoint = detail::kLastCodePoint;

    // Number of payload bits stored in each continuation byte.
    static constexpr uint32_t kContinuationShift = 6;

    // Marker bits found in the top two bits of every continuation byte (binary 10xxxxxx).
    static constexpr uint8_t kContinuationBits = 0x80;

    // Mask selecting the payload bits of a continuation byte (0x3f).
    static constexpr uint8_t kContinuationMask = (1u << kContinuationShift) - 1;

    // Value added to the combined payload of a surrogate pair; also the first code point that
    // needs a surrogate pair in UTF-16.
    static constexpr CodePoint kSurrogateBias = detail::kSurrogateBias;

    // First high surrogate value.
    static constexpr CodePoint kSurrogateBaseHigh = detail::kSurrogateFirst;

    // First low surrogate value.
    static constexpr CodePoint kSurrogateBaseLow = detail::kSurrogateSecond;

    // First value of the surrogate range.
    // NOTE(review): not referenced by the code in this part of the file.
    static constexpr CodePoint kSurrogateMin = detail::kSurrogateFirst;

    // Last value of the surrogate range (inclusive).
    static constexpr CodePoint kSurrogateMax = detail::kSurrogateLast;

    // Number of payload bits carried by each member of a surrogate pair.
    static constexpr uint32_t kSurrogateBits = detail::kSurrogateShift;

    // Mask selecting the payload bits of a surrogate.
    static constexpr CodePoint kSurrogateMask = detail::kSurrogateMask;

    // Exclusive upper bound used by getCodeBytes() when it searches for a continuation length.
    static constexpr size_t kMaxSequenceBytes = 7;

    // Bytes below this value are single byte (7-bit ASCII) code points.
    static constexpr uint8_t k7BitLimit = 0x80;

    // Bytes at or above this value are treated as lead bytes; bytes in [k7BitLimit, kMinLeadByte)
    // are continuation bytes.
    static constexpr uint8_t kMinLeadByte = 0xc0;

    // Returns the number of continuation bytes that follow the lead byte `leadByte`, or 0 when the
    // value is not accepted as a lead byte (0xc0, 0xc1 and 0xff).
    //
    // `leadByte` must be in the range [kMinLeadByte, 0xff]; the table has no entries for anything
    // else.
    static constexpr uint8_t getContinuationLength(size_t leadByte)
    {
        // one entry for each value from kMinLeadByte (0xc0) up to 0xff.
        constexpr uint8_t kLengthByLeadByte[] = {
            0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xc0 - 0xcf */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xd0 - 0xdf */
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xe0 - 0xef */
            3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 0, /* 0xf0 - 0xff */
        };
        const size_t slot = leadByte - kMinLeadByte;
        return kLengthByLeadByte[slot];
    }

    // Returns the mask selecting the payload bits in the lead byte of a sequence that has
    // `continuationLength` continuation bytes (0 through 6).
    static constexpr uint8_t getLeadMask(size_t continuationLength)
    {
        // the lead byte's payload occupies its lowest `s_leadBits[continuationLength]` bits.
        const uint32_t payloadBits = s_leadBits[continuationLength];
        return static_cast<uint8_t>((1u << payloadBits) - 1);
    }

    static constexpr uint8_t getLeadByte(size_t continuationLength)
    {
        constexpr uint8_t s_leadBytes[] = {
            (0xffu << (s_leadBits[0] + 1)) & 0xff, (0xffu << (s_leadBits[1] + 1)) & 0xff,
            (0xffu << (s_leadBits[2] + 1)) & 0xff, (0xffu << (s_leadBits[3] + 1)) & 0xff,
            (0xffu << (s_leadBits[4] + 1)) & 0xff, (0xffu << (s_leadBits[5] + 1)) & 0xff,
            (0xffu << (s_leadBits[6] + 1)) & 0xff
        };
        return s_leadBytes[continuationLength];
    }

    // Returns the exclusive upper limit of the code points that fit into a sequence that has
    // `continuationLength` continuation bytes (0 through 6).
    static constexpr CodePoint getMaxCodePoint(size_t continuationLength)
    {
        constexpr CodePoint kLimitByLength[] = {
            0x00000080, // 1 byte sequence
            0x00000800, // 2 byte sequence
            0x00010000, // 3 byte sequence
            0x00200000, // 4 byte sequence
            0x04000000, // 5 byte sequence
            0x80000000, // 6 byte sequence
            0xffffffff, // 7 byte sequence
        };
        return kLimitByLength[continuationLength];
    }

    // Extracts the payload of the continuation byte `byte` and moves it to its position in the
    // code point.
    //
    // `continuationLength` is the number of continuation bytes left in the sequence including
    // this one, so it must be at least 1; the last byte of a sequence is not shifted at all.
    inline static CodePoint decodeContinuationValue(int32_t byte, size_t continuationLength)
    {
        // convert to the unsigned code point type before shifting: the first continuation byte of a
        // seven byte sequence is shifted by 30 bits, which overflows a signed 32-bit operand.
        const CodePoint payload = static_cast<CodePoint>(byte & kContinuationMask);
        return static_cast<CodePoint>(payload << ((continuationLength - 1) * kContinuationShift));
    }

    // Returns the code point to report for a failed decode: `kDefaultCodePoint` when
    // `fDecodeUseDefault` is set in `flags`, otherwise 0.
    static constexpr CodePoint getFailureCodepoint(Flags flags)
    {
        if ((flags & fDecodeUseDefault) == 0)
        {
            return 0;
        }

        return kDefaultCodePoint;
    }

    static bool parseUtf8(const CodeByte* str,
                          const CodeByte** outNext,
                          CodePoint* outCodePoint,
                          size_t lengthInBytes = kNullTerminated,
                          Flags flags = 0)
    {
        auto fail = [&]() -> bool {
            // we weren't asked to attempt to skip over invalid code sequences => just fail out
            if ((flags & fDecodeSkipInvalid) == 0)
            {
                return false;
            }

            // walk the rest of the string skipping over continuation bytes and invalid lead bytes.
            // Note that we've already tested and rejected the first byte so we just need to continue
            // the search starting at the next byte.
            for (size_t i = 1; i < lengthInBytes; i++)
            {
                const auto b = static_cast<uint8_t>(str[i]);
                // continuation byte => skip it.
                if ((b & ~kContinuationMask) == kContinuationBits)
                    continue;

                // invalid lead byte => skip it.
                if (b >= kMinLeadByte && getContinuationLength(b) == 0)
                    continue;

                // invalid range of bytes
                if (b >= k7BitLimit && b < kMinLeadByte)
                    continue;

                *outNext = str + i;
                return false;
            }

            // We've hit the end of the string.  This mean that the sequence is
            // either invalid, misaligned, or an illegal overlong sequence was
            // used.  We aren't able to write out the next character pointer if
            // we hit this point.
            return false;
        };

        // initialize to failure values;
        *outCodePoint = getFailureCodepoint(flags);
        *outNext = nullptr;

        // the string doesn't have any more bytes in it -> no more codepoints => fail.
        if (lengthInBytes == 0)
        {
            return false;
        }

        const auto byte = static_cast<uint8_t>(*str);

        // the current code byte is at the null terminator -> no more codepoints => finish.
        if (byte == '\0')
        {
            *outCodePoint = byte;
            return true;
        }

        // the current code byte is a direct ASCII character => finish.
        if (byte < k7BitLimit)
        {
            *outCodePoint = byte;
            *outNext = str + 1;
            return true;
        }

        if (byte < kMinLeadByte)
        {
            return fail();
        }

        // the current code byte is a lead byte => calculate the sequence length and return the
        //   start of the next codepoint.
        const size_t continuationLength = getContinuationLength(byte);
        const size_t sequenceLength = continuationLength + 1;

        // not enough bytes left in the string to complete this codepoint => fail.
        // continuationLength of 0 is invalid => fail
        if (lengthInBytes < sequenceLength || continuationLength == 0)
        {
            return fail();
        }

        // decode the codepoint.
        {
            CodePoint cp =
                (CodePoint)((byte & getLeadMask(continuationLength)) << (continuationLength * kContinuationShift));

            for (size_t i = 0; i < continuationLength; i++)
            {
                // validate the continuation byte so we don't walk past the
                // end of a null terminated string
                if ((uint8_t(str[i + 1]) & ~kContinuationMask) != kContinuationBits)
                {
                    return fail();
                }

                cp |= decodeContinuationValue(str[i + 1], continuationLength - i);
            }

            *outCodePoint = cp;
            *outNext = str + sequenceLength;
            return true;
        }
    }
};

// Forward iterator that walks the code points of a UTF-8 string using Utf8Parser::nextCodePoint().
//
// The iterator stores the start of the current code point, the start of the following one, and
// the decoded value of the current one.  It does not copy or own the string.
class Utf8Iterator
{
public:
    using CodeByte = Utf8Parser::CodeByte;
    using CodePoint = Utf8Parser::CodePoint;
    using Flags = Utf8Parser::Flags;
    // Reference the special length value used for null terminated strings.
    static constexpr size_t kNullTerminated = Utf8Parser::kNullTerminated;

    // Creates an iterator that refers to no string.  It converts to `false`.
    Utf8Iterator()
        : m_prev(nullptr), m_string(nullptr), m_length(kNullTerminated), m_flags(0), m_lastCodePoint(0), m_index(0)
    {
    }

    // Creates an iterator over `string` and decodes its first code point.  `lengthInBytes` is the
    // number of bytes available, or `kNullTerminated`.  `flags` is passed to the parser on every
    // step.
    Utf8Iterator(const CodeByte* string, size_t lengthInBytes = kNullTerminated, Flags flags = 0)
        : m_prev(nullptr), m_string(string), m_length(lengthInBytes), m_flags(flags), m_lastCodePoint(0), m_index(0)
    {
        next();
    }

    // Creates a copy of `it` that refers to the same position in the same string.
    Utf8Iterator(const Utf8Iterator& it)
    {
        copy(it);
    }

    // `true` while there is a following position and the current code point is not 0.
    explicit operator bool() const
    {
        return isValid();
    }

    // Returns the code point at the current position.
    CodePoint operator*() const
    {
        return m_lastCodePoint;
    }

    // Returns the address of the first byte of the current code point (not of the iterator).
    const CodeByte* operator&() const
    {
        return m_prev;
    }

    // Steps to the next code point.
    Utf8Iterator& operator++()
    {
        next();
        return *this;
    }

    // Steps to the next code point and returns a copy of the iterator as it was before the step.
    Utf8Iterator operator++(int32_t)
    {
        Utf8Iterator tmp = (*this);
        next();
        return tmp;
    }

    // Steps forward by up to `count` code points, stopping early once the iterator has run out.
    template <typename T>
    Utf8Iterator& operator+=(T count)
    {
        for (T i = 0; i < count && m_prev != nullptr; i++)
            next();

        return *this;
    }

    // Returns a copy of this iterator stepped forward by up to `count` code points.
    template <typename T>
    Utf8Iterator operator+(T count) const
    {
        Utf8Iterator tmp = *this;
        return (tmp += count);
    }

    // All of the comparison operators compare the position of the *following* code point of each
    // iterator; no other state takes part in the comparison.
    bool operator==(const Utf8Iterator& it) const
    {
        return m_string == it.m_string;
    }

    bool operator!=(const Utf8Iterator& it) const
    {
        return m_string != it.m_string;
    }

    bool operator<(const Utf8Iterator& it) const
    {
        return m_string < it.m_string;
    }

    bool operator<=(const Utf8Iterator& it) const
    {
        return m_string <= it.m_string;
    }

    bool operator>(const Utf8Iterator& it) const
    {
        return m_string > it.m_string;
    }

    bool operator>=(const Utf8Iterator& it) const
    {
        return m_string >= it.m_string;
    }

    // Makes this iterator refer to the same position in the same string as `it`.
    Utf8Iterator& operator=(const Utf8Iterator& it)
    {
        // Note: normally we'd check for an identity assignment in this operator overload and
        //       ignore.  Unfortunately we can't do that here since we also override the '&'
        //       operator above.  Since this copy operation should still be safe for an identity
        //       assignment, we'll just let it proceed.
        copy(it);
        return *this;
    }

    // Restarts the iterator on the null terminated string `str` with no flags set and decodes its
    // first code point.
    Utf8Iterator& operator=(const CodeByte* str)
    {
        m_prev = nullptr;
        m_string = str;
        m_length = kNullTerminated;
        m_lastCodePoint = 0;
        m_flags = 0;
        m_index = 0;
        next();
        return *this;
    }

    // Returns the zero based index of the current code point.  The value is only meaningful once a
    // code point has been decoded; it wraps around for a default constructed iterator.
    size_t getIndex() const
    {
        return m_index - 1;
    }

    // Returns the size in bytes of the current code point.  When there is no following position to
    // measure against, this is 1 if there is a current position and 0 otherwise.
    size_t getCodepointSize() const
    {
        if (m_string == nullptr)
            return m_prev == nullptr ? 0 : 1;

        return (size_t)(m_string - m_prev);
    }

private:
    // Copies every member of `it` into this object.
    void copy(const Utf8Iterator& it)
    {
        m_prev = it.m_prev;
        m_string = it.m_string;
        m_length = it.m_length;
        m_flags = it.m_flags;
        m_lastCodePoint = it.m_lastCodePoint;
        m_index = it.m_index;
    }

    bool isValid() const
    {
        return m_string != nullptr && m_lastCodePoint != 0;
    }

    // Decodes the code point at `m_string` and advances the iterator past it.
    void next()
    {
        const CodeByte* ptr;

        // already ran out of string => there is no current code point any more.
        if (m_string == nullptr)
        {
            m_prev = nullptr;
            return;
        }

        // all of the bytes of a bounded string have been consumed => the iterator is finished.
        if (m_length == 0)
        {
            m_string = nullptr;
            m_prev = nullptr;
            m_lastCodePoint = 0;
            return;
        }

        ptr = Utf8Parser::nextCodePoint(m_string, m_length, &m_lastCodePoint, m_flags);

        // the parser returns a null pointer at the end of the string and on decode failures.  The
        // remaining length can only be updated when there is a real position to measure from;
        // subtracting from a null pointer is undefined.  The length is not used again after that
        // since `m_string` becomes null below.
        if (ptr != nullptr && m_length != kNullTerminated)
            m_length -= (size_t)(ptr - m_string);

        m_prev = m_string;
        m_string = ptr;
        m_index++;
    }

    // Start of the current code point, or `nullptr` when there is none.
    const CodeByte* m_prev;

    // Start of the following code point, or `nullptr` once the parser has stopped.
    const CodeByte* m_string;

    // Number of bytes left at `m_string`, or `kNullTerminated`.
    size_t m_length;

    // Flags passed to Utf8Parser::nextCodePoint() on every step.
    Flags m_flags;

    // Decoded value of the current code point.
    CodePoint m_lastCodePoint;

    // Number of times a code point has been decoded; one more than the index of the current one.
    size_t m_index;
};

// implementation details used for string conversions
namespace detail
{
// Minimal string-like sink over a caller-supplied, fixed-size character buffer.
//
// It offers just the operations the conversion helpers need (`push_back`, `resize`, `operator[]`
// and `size`) and keeps the buffer null terminated after every successful modification.  When it
// is constructed with a null buffer nothing is stored; the adapter then only counts characters so
// that the size required for a full conversion can be reported.
template <typename CharT>
class BufferAdapter
{
public:
    using value_type = CharT;

    // `buffer` is the destination storage, or nullptr to only measure.
    // `bufferLength` is the capacity of `buffer` in characters *including* room for the
    // terminator; it is ignored (treated as unlimited) when `buffer` is nullptr.
    constexpr BufferAdapter(CharT* buffer, size_t bufferLength) noexcept
        : m_buffer(buffer), m_capacityWithTerm(buffer != nullptr ? bufferLength : size_t(-1))
    {
        // Begin as an empty, terminated string whenever there is room for a terminator.
        if (buffer != nullptr && bufferLength != 0)
            *buffer = CharT{};
    }

    // Grows the string to `size` characters, value-initializing the new characters and the
    // terminator that follows them.  Returns false, changing nothing, if `size` does not fit.
    bool resize(size_t size) noexcept
    {
        CARB_ASSERT(size > m_size); // Only allow increasing
        if (size > capacity())
            return false;

        if (m_buffer != nullptr)
        {
            CharT* const first = m_buffer + m_size;
            CharT* const last = m_buffer + size + 1; // one past the new terminator
#ifdef __CUDACC__ // Silence warning #128-D: loop is not reachable
            for (CharT* cur = first; cur != last; ++cur)
                *cur = CharT{};
#else
            std::fill(first, last, CharT{});
#endif
        }

        m_size = size;
        return true;
    }

    // Accesses the character at `index`.  In measuring mode (no buffer) a shared scratch character
    // is returned so that writes through the reference are simply discarded.
    CharT& operator[](size_t index)
    {
        static CharT dummy;
        CARB_ASSERT(index < m_size);
        if (m_buffer != nullptr)
            return m_buffer[index];
        return dummy;
    }

    // Maximum number of characters that can be held, not counting the terminator.
    constexpr size_t capacity() const noexcept
    {
        if (m_capacityWithTerm == 0)
            return 0;
        return m_capacityWithTerm - 1; // the stored capacity includes the terminator
    }

    // Number of characters currently held (or counted, in measuring mode), without terminator.
    constexpr size_t size() const noexcept
    {
        return m_size;
    }

    // Appends one character and re-terminates.  Returns false if the buffer is already full.
    // In measuring mode the character is only counted and the call always succeeds.
    bool push_back(CharT c) noexcept
    {
        if (m_buffer == nullptr)
        {
            ++m_size; // just accumulating the size that would be needed
            return true;
        }

        if (m_size >= capacity())
            return false;

        m_buffer[m_size] = c;
        ++m_size;
        m_buffer[m_size] = CharT{}; // null terminate
        return true;
    }

    // Number of characters including the terminator.  A real buffer of length zero cannot even
    // hold a terminator, so 0 is reported for it.
    size_t sizeWithTerminator() const noexcept
    {
        const bool zeroLengthBuffer = (m_buffer != nullptr) && (m_capacityWithTerm == 0);
        return zeroLengthBuffer ? 0 : m_size + 1;
    }

private:
    CharT* m_buffer; // destination storage; nullptr in measuring mode
    size_t m_size{ 0 }; // characters held so far, excluding the terminator
    const size_t m_capacityWithTerm; // total capacity including the terminator slot
};

// True when `T` is a character type used to hold UTF-8 code units: `char`, and additionally
// `char8_t` when building as C++20 with `char8_t` support.
template <class T>
constexpr bool IsUtf8Char_v =
#if CARB_HAS_CPP20 && defined(__cpp_char8_t)
    std::is_same_v<T, char8_t> ||
#endif
    std::is_same_v<T, char>;

// Returns true if `cp` is a value that may be encoded: it must not exceed the last Unicode code
// point and must not fall in the UTF-16 surrogate range.
constexpr bool isValidCodePoint(CodePoint cp) noexcept
{
    const bool inRange = cp <= kLastCodePoint;
    const bool isSurrogate = (cp & kSurrogateFirstMask) == kSurrogateFirst;
    return inRange && !isSurrogate;
}

// Returns `cp` unchanged when it is a valid code point, otherwise the replacement character.
constexpr CodePoint replaceInvalid(CodePoint cp) noexcept
{
    if (!isValidCodePoint(cp))
        return kReplacementChar;
    return cp;
}

// Classifies the lead byte of a multi-byte UTF-8 sequence.
//
// Returns a pair holding the number of continuation bytes that must follow `lead` and the mask
// that extracts the payload bits from `lead` itself.  A pair of zeros is returned when `lead` is
// not a valid lead byte (for example when it is itself a continuation byte).
constexpr std::pair<size_t, const uint8_t> parseContinuation(uint8_t lead) noexcept
{
    // 0xxx xxxx - ASCII, directly convertible with no continuation bytes. Shouldn't get here.
    CARB_ASSERT(lead & 0x80);
#if CARB_HAS_CPP20 && defined(__cpp_lib_bitops)
    // The number of leading one bits is the total sequence length (2, 3 or 4 bytes).
    const auto leadingOnes = size_t(std::countl_one(lead));
    const bool isLeadByte = leadingOnes >= 2 && leadingOnes <= 4;
    if (isLeadByte)
        return { leadingOnes - 1, uint8_t(uint8_t(-1) >> (leadingOnes + 1)) };
#else
    // 110x xxxx - 1 continuation byte, 5 payload bits in the lead
    if ((lead >> 5) == 0b110)
        return { 1, uint8_t(0x1f) };
    // 1110 xxxx - 2 continuation bytes, 4 payload bits in the lead
    if ((lead >> 4) == 0b1110)
        return { 2, uint8_t(0x0f) };
    // 1111 0xxx - 3 continuation bytes, 3 payload bits in the lead
    if ((lead >> 3) == 0b11110)
        return { 3, uint8_t(0x07) };
#endif
    // Anything else cannot start a sequence.
    return { 0, uint8_t(0) };
}

// Decodes one code point from the front of a UTF-8 string view and removes the consumed bytes.
//
// `in` must not be empty.  The lead byte is always consumed.  If the sequence is malformed (bad
// lead byte, too few bytes left, or a byte that is not a continuation byte) the replacement
// character is returned and the offending byte is left in `in` so decoding can resume from it.
// Decoded values that are out of range or surrogates are also turned into the replacement
// character.
// NOTE(review): overlong encodings (e.g. C0 80) are not rejected here -- the decoded value is only
//               checked by replaceInvalid(); confirm whether that is acceptable for callers.
template <class T> // UTF-8
char32_t toCodePoint(cpp::basic_string_view<std::enable_if_t<sizeof(T) == 1, T>>& in)
{
    CARB_ASSERT(!in.empty());
    const uint8_t lead = static_cast<uint8_t>(in.front());
    in.remove_prefix(1);

    // High bit clear => plain ASCII, which maps straight to the code point
    if ((lead & 0x80) == 0)
        return char32_t{ lead };

    // Work out how many continuation bytes follow and which lead bits carry payload
    const auto parsed = parseContinuation(lead);
    size_t remaining = parsed.first;
    const uint8_t leadMask = parsed.second;
    if (remaining == 0 || in.size() < remaining) // invalid or not enough characters remain
        return kReplacementChar;

    constexpr size_t kContShift = 6;
    constexpr uint8_t kContMask = (uint8_t(1) << kContShift) - 1;

    // Accumulate the payload most-significant bits first, six bits per continuation byte.
    char32_t cp = char32_t(lead & leadMask);
    for (; remaining != 0; --remaining)
    {
        const auto cont = static_cast<uint8_t>(in.front());
        // 10xx xxxx - continuation bytes
        if ((cont & 0b1100'0000) != 0b1000'0000)
            return kReplacementChar; // not a valid continuation byte; leave it for the next call
        in.remove_prefix(1);
        cp = (cp << kContShift) | char32_t(cont & kContMask);
    }

    // Make sure what we decoded is actually a legal code point
    return replaceInvalid(cp);
}

// Invokes `pred` and normalizes its outcome to a success flag.
//
// Sinks such as BufferAdapter report failure through a `bool` return value, while standard
// strings return nothing from the same operations.  If `pred()` yields exactly `bool`, that value
// is returned; for any other result type the call is treated as always succeeding.
template <typename Pred>
bool handleBuffer(Pred&& pred)
{
    using Result = decltype(pred());
    if constexpr (!std::is_same_v<bool, Result>)
    {
        pred();
        return true;
    }
    else
    {
        return pred();
    }
}

// Appends the UTF-8 encoding of `cp` to `out`.
//
// `cp` is first sanitized: invalid values (out of range or surrogates) are encoded as the
// replacement character.  `out` is any string-like sink offering `push_back`, `resize`, `size`
// and `operator[]`.  Returns false, without writing a partial sequence, if the sink reports that
// the encoded bytes do not fit; returns true otherwise.
//
// Sequence lengths follow the UTF-8 definition:
//   U+0000  .. U+007F   - 1 byte
//   U+0080  .. U+07FF   - 2 bytes (1 continuation byte)
//   U+0800  .. U+FFFF   - 3 bytes (2 continuation bytes)
//   U+10000 .. U+10FFFF - 4 bytes (3 continuation bytes)
template <class StringType, typename Out = typename StringType::value_type> // UTF-8
std::enable_if_t<sizeof(Out) == 1, bool> fromCodePoint(char32_t cp, StringType& out)
{
    cp = replaceInvalid(cp);

    // Check for ASCII character that requires no encoding
    CARB_LIKELY_IF(cp < char32_t{ 0x80 })
    {
        return handleBuffer([&] { return out.push_back(Out(cp)); });
    }

    // We will need a continuation. Figure out how many continuation bytes.
    // The boundaries are exclusive upper limits: U+07FF is the last two-byte code point and
    // U+FFFF the last three-byte one.  Using a longer form for them would be an overlong
    // (ill-formed) encoding.
    const size_t continuationLength = cp < 0x800 ? 1 : cp < 0x10000 ? 2 : 3;
    CARB_ASSERT(continuationLength >= 1 && continuationLength <= 3);

    auto index = out.size();

    // Resize with an extra for the lead byte. If our buffer doesn't have space,
    // return instead of pushing a partial sequence.
    if (!handleBuffer([&] { return out.resize(index + continuationLength + 1); }))
        return false;

    constexpr static size_t kContinuationBits = 6;
    constexpr static uint8_t kContinuationBase = 0b10000000; // actually 0b10xxxxxx
    constexpr static char32_t kContinuationMask = char32_t((1 << kContinuationBits) - 1);

    // Lead byte: 110xxxxx, 1110xxxx or 11110xxx depending on the length, followed by the most
    // significant payload bits of the code point.
    uint8_t lead = uint8_t(0xf0 << (3 - continuationLength));
    lead |= uint8_t(cp >> (continuationLength * kContinuationBits));
    out[index++] = Out(lead);

    // Continuation bytes carry six payload bits each, most significant group first.
    switch (continuationLength)
    {
        case 3:
            out[index++] = Out(kContinuationBase | uint8_t((cp >> (2 * kContinuationBits)) & kContinuationMask));
            [[fallthrough]];
        case 2:
            out[index++] = Out(kContinuationBase | uint8_t((cp >> kContinuationBits) & kContinuationMask));
            [[fallthrough]];
        case 1:
            out[index] = Out(kContinuationBase | uint8_t(cp & kContinuationMask));
            break;
    }
    return true;
}

// Decodes one code point from the front of a UTF-16 string view and removes the consumed units.
//
// `in` must not be empty.  The first code unit is always consumed.  A unit outside the surrogate
// range is returned as is.  A lone low surrogate, or a high surrogate that is not followed by a
// low surrogate, produces the replacement character; in the latter case the following unit is
// left in `in`.  A proper pair is combined into a single code point.
template <class T> // UTF-16
char32_t toCodePoint(cpp::basic_string_view<std::enable_if_t<sizeof(T) == 2, T>>& in)
{
    CARB_ASSERT(!in.empty());
    const auto first = static_cast<char16_t>(in.front());
    in.remove_prefix(1);

    // Anything outside the surrogate range converts directly
    const bool isSurrogate = (first & kSurrogateFirstMask) == kSurrogateFirst;
    if (!isSurrogate)
        return char32_t{ first };

    // A pair may not start with its second (low) member
    const bool startsWithLow = (first & kSurrogateSecondMask) == kSurrogateSecond;
    if (startsWithLow)
        return kReplacementChar;

    // The high surrogate must be followed by a low surrogate
    if (in.empty())
        return kReplacementChar;
    const auto second = in.front();
    if ((second & kSurrogateSecondMask) != kSurrogateSecond)
        return kReplacementChar;
    in.remove_prefix(1);

    // Combine the payload bits of both surrogates and re-apply the bias
    const char32_t highBits = char32_t(first & kSurrogateMask) << kSurrogateShift;
    const char32_t lowBits = second & kSurrogateMask;
    return replaceInvalid((highBits | lowBits) + kSurrogateBias);
}

// Appends the UTF-16 encoding of `cp` to `out`.
//
// Invalid values of `cp` are encoded as the replacement character.  Code points below the
// surrogate bias take a single code unit; everything else is written as a surrogate pair.
// Returns false, without writing half a pair, if the sink reports that the units do not fit.
template <class StringType, typename Out = typename StringType::value_type> // UTF-16
std::enable_if_t<sizeof(Out) == 2, bool> fromCodePoint(char32_t cp, StringType& out)
{
    cp = replaceInvalid(cp);

    // Fits in a single UTF-16 code unit
    if (cp < kSurrogateBias)
        return handleBuffer([&] { return out.push_back(Out(cp)); });

    // Needs a surrogate pair: make room for both halves up front or give up
    const auto base = out.size();
    if (!handleBuffer([&] { return out.resize(base + 2); }))
        return false;

    const char32_t offset = cp - kSurrogateBias;
    const auto highBits = char16_t(offset >> kSurrogateShift) & kSurrogateMask;
    const auto lowBits = char16_t(offset & kSurrogateMask);
    out[base] = Out(kSurrogateFirst | highBits);
    out[base + 1] = Out(kSurrogateSecond | lowBits);
    return true;
}

// Reads one code point from the front of a UTF-32 string view and removes it from the view.
//
// `in` must not be empty.  Values that are not valid code points come back as the replacement
// character.
template <class T> // UTF-32
char32_t toCodePoint(cpp::basic_string_view<std::enable_if_t<sizeof(T) == 4, T>>& in)
{
    CARB_ASSERT(!in.empty());
    const char32_t raw = static_cast<char32_t>(in.front());
    in.remove_prefix(1);
    return replaceInvalid(raw);
}

// Appends `cp` to `out` as a single UTF-32 code unit.
//
// Invalid values of `cp` are stored as the replacement character.  Returns false if the sink
// reports that the unit does not fit, true otherwise.
template <class StringType, typename Out = typename StringType::value_type> // UTF-32
std::enable_if_t<sizeof(Out) == 4, bool> fromCodePoint(char32_t cp, StringType& out)
{
    const Out unit = Out(replaceInvalid(cp));
    return handleBuffer([&] { return out.push_back(unit); });
}

// Transcodes `str` into `out`, one code point at a time.
//
// The source encoding is chosen by the size of `In` and the target encoding by the size of the
// sink's `value_type`.  Returns false as soon as the sink refuses a code point (whatever was
// appended before that stays in `out`), and true once all of `str` has been converted.
template <typename StringType, typename In>
inline bool convertBetweenUnicodeFormats(cpp::basic_string_view<In> str, StringType& out)
{
    using Out = typename StringType::value_type;
    static_assert(!std::is_same_v<In, Out>, "Pointless to convert between same types");

    while (!str.empty())
    {
        // Decoding consumes the front of `str`, so the loop always makes progress.
        const char32_t cp = toCodePoint<In>(str);
        if (!fromCodePoint(cp, out))
            return false;
    }
    return true;
}
} // namespace detail

// Converts a UTF-8 string to UTF-32 in a caller-provided buffer.
//
// `str` is the UTF-8 input.  `out` is the destination buffer, or nullptr to only compute the
// size a full conversion needs.  `outLen` is the capacity of `out` in `char32_t` units including
// the terminator.  Returns the number of units produced including the null terminator, or 0 when
// `out` is non-null but `outLen` is 0.  If the buffer fills up, conversion stops at the last code
// point that fit and the output stays terminated.
inline size_t convertUtf8StringToUtf32(cpp::string_view str, char32_t* out, size_t outLen) noexcept
{
    detail::BufferAdapter<char32_t> sink{ out, outLen };
    // A `false` result only means the buffer filled up; the size below already reflects that.
    static_cast<void>(detail::convertBetweenUnicodeFormats(str, sink));
    return sink.sizeWithTerminator();
}

// Overload of the buffer-based UTF-8 to UTF-32 conversion for a string without an explicit
// length.  The string is wrapped in a view via `cpp::unsafe_length` and forwarded to the
// `cpp::string_view` overload; parameters and return value are the same as for that overload.
inline size_t convertUtf8StringToUtf32(cpp::unbounded_string str, char32_t* out, size_t outLen) noexcept
{
    const cpp::string_view view(cpp::unsafe_length, str);
    return convertUtf8StringToUtf32(view, out, outLen);
}

#ifndef DOXYGEN_BUILD // sphinx can't cope with this overload
// Converts a UTF-8 string to UTF-32 and returns it in a string type chosen by the caller.
// `StringType` must have `char32_t` as its `value_type`.
template <class StringType>
inline std::enable_if_t<std::is_same_v<typename StringType::value_type, char32_t>, StringType> convertUtf8StringToUtf32(
    cpp::string_view str)
{
    StringType converted;
    static_cast<void>(detail::convertBetweenUnicodeFormats(str, converted));
    return converted;
}
#endif

// Converts the UTF-8 string `str` to UTF-32 and returns the result as a `std::u32string`.
inline std::u32string convertUtf8StringToUtf32(cpp::string_view str)
{
    return convertUtf8StringToUtf32<std::u32string>(str);
}
// Converts a UTF-8 string without an explicit length to UTF-32, returned as a `std::u32string`.
// The input is wrapped in a view via `cpp::unsafe_length` before converting.
inline std::u32string convertUtf8StringToUtf32(cpp::unbounded_string str)
{
    const cpp::string_view view(cpp::unsafe_length, str);
    return convertUtf8StringToUtf32<std::u32string>(view);
}

// TODO: OVCC-1591: would be nice to have basic_string_view versions of these functions and InputIterator versions
// which would also allow passing two `const char*` as a [begin, end).

// Converts a UTF-32 string to UTF-8 in a caller-provided buffer.
//
// `str` is the UTF-32 input.  `out` is the destination buffer, or nullptr to only compute the
// size a full conversion needs.  `outLen` is the capacity of `out` in bytes including the
// terminator.  Returns the number of bytes produced including the null terminator, or 0 when
// `out` is non-null but `outLen` is 0.  If the buffer fills up, conversion stops at the last code
// point that fit completely and the output stays terminated.
inline size_t convertUtf32StringToUtf8(cpp::u32string_view str, char* out, size_t outLen)
{
    detail::BufferAdapter<char> sink{ out, outLen };
    // A `false` result only means the buffer filled up; the size below already reflects that.
    static_cast<void>(detail::convertBetweenUnicodeFormats(str, sink));
    return sink.sizeWithTerminator();
}

// Overload of the buffer-based UTF-32 to UTF-8 conversion for a string without an explicit
// length.  The string is wrapped in a view via `cpp::unsafe_length` and forwarded to the
// `cpp::u32string_view` overload; parameters and return value are the same as for that overload.
inline size_t convertUtf32StringToUtf8(cpp::unbounded_u32string str, char* out, size_t outLen)
{
    const cpp::u32string_view view(cpp::unsafe_length, str);
    return convertUtf32StringToUtf8(view, out, outLen);
}

#ifndef DOXYGEN_BUILD // sphinx can't cope with this overload
// Converts a UTF-32 string to UTF-8 and returns it in a string type chosen by the caller.
// `StringType` must use a UTF-8 character type (see detail::IsUtf8Char_v) as its `value_type`.
template <class StringType>
inline std::enable_if_t<detail::IsUtf8Char_v<typename StringType::value_type>, StringType> convertUtf32StringToUtf8(
    cpp::u32string_view str)
{
    StringType converted;
    static_cast<void>(detail::convertBetweenUnicodeFormats(str, converted));
    return converted;
}
#endif

// Converts the UTF-32 string `str` to UTF-8 and returns the result as a `std::string`.
inline std::string convertUtf32StringToUtf8(cpp::u32string_view str)
{
    return convertUtf32StringToUtf8<std::string>(str);
}
// Converts a UTF-32 string without an explicit length to UTF-8, returned as a `std::string`.
// The input is wrapped in a view via `cpp::unsafe_length` before converting.
inline std::string convertUtf32StringToUtf8(cpp::unbounded_u32string str)
{
    const cpp::u32string_view view(cpp::unsafe_length, str);
    return convertUtf32StringToUtf8<std::string>(view);
}

// Converts a UTF-16 string to UTF-8 in a caller-provided buffer.
//
// `str` is the UTF-16 input.  `out` is the destination buffer, or nullptr to only compute the
// size a full conversion needs.  `outLen` is the capacity of `out` in bytes including the
// terminator.  Returns the number of bytes produced including the null terminator, or 0 when
// `out` is non-null but `outLen` is 0.  If the buffer fills up, conversion stops at the last code
// point that fit completely and the output stays terminated.
inline size_t convertUtf16StringToUtf8(cpp::u16string_view str, char* out, size_t outLen)
{
    detail::BufferAdapter<char> sink{ out, outLen };
    // A `false` result only means the buffer filled up; the size below already reflects that.
    static_cast<void>(detail::convertBetweenUnicodeFormats(str, sink));
    return sink.sizeWithTerminator();
}

// Overload of the buffer-based UTF-16 to UTF-8 conversion for a string without an explicit
// length.  The string is wrapped in a view via `cpp::unsafe_length` and forwarded to the
// `cpp::u16string_view` overload; parameters and return value are the same as for that overload.
inline size_t convertUtf16StringToUtf8(cpp::unbounded_u16string str, char* out, size_t outLen)
{
    const cpp::u16string_view view(cpp::unsafe_length, str);
    return convertUtf16StringToUtf8(view, out, outLen);
}

#ifndef DOXYGEN_BUILD // sphinx can't cope with this overload
// Converts a UTF-16 string to UTF-8 and returns it in a string type chosen by the caller.
// `StringType` must use a UTF-8 character type (see detail::IsUtf8Char_v) as its `value_type`.
template <class StringType>
inline std::enable_if_t<detail::IsUtf8Char_v<typename StringType::value_type>, StringType> convertUtf16StringToUtf8(
    cpp::u16string_view str)
{
    StringType converted;
    static_cast<void>(detail::convertBetweenUnicodeFormats(str, converted));
    return converted;
}
#endif

// Converts the UTF-16 string `str` to UTF-8 and returns the result as a `std::string`.
inline std::string convertUtf16StringToUtf8(cpp::u16string_view str)
{
    return convertUtf16StringToUtf8<std::string>(str);
}
// Converts a UTF-16 string without an explicit length to UTF-8, returned as a `std::string`.
// The input is wrapped in a view via `cpp::unsafe_length` before converting.
inline std::string convertUtf16StringToUtf8(cpp::unbounded_u16string str)
{
    const cpp::u16string_view view(cpp::unsafe_length, str);
    return convertUtf16StringToUtf8<std::string>(view);
}

// Converts a UTF-8 string to UTF-16 in a caller-provided buffer.
//
// `str` is the UTF-8 input.  `out` is the destination buffer, or nullptr to only compute the
// size a full conversion needs.  `outLen` is the capacity of `out` in `char16_t` units including
// the terminator.  Returns the number of units produced including the null terminator, or 0 when
// `out` is non-null but `outLen` is 0.  If the buffer fills up, conversion stops at the last code
// point that fit completely (a surrogate pair is never split) and the output stays terminated.
inline size_t convertUtf8StringToUtf16(cpp::string_view str, char16_t* out, size_t outLen) noexcept
{
    detail::BufferAdapter<char16_t> sink{ out, outLen };
    // A `false` result only means the buffer filled up; the size below already reflects that.
    static_cast<void>(detail::convertBetweenUnicodeFormats(str, sink));
    return sink.sizeWithTerminator();
}

// Overload of the buffer-based UTF-8 to UTF-16 conversion for a string without an explicit
// length.  The string is wrapped in a view via `cpp::unsafe_length` and forwarded to the
// `cpp::string_view` overload; parameters and return value are the same as for that overload.
inline size_t convertUtf8StringToUtf16(cpp::unbounded_string str, char16_t* out, size_t outLen)
{
    const cpp::string_view view(cpp::unsafe_length, str);
    return convertUtf8StringToUtf16(view, out, outLen);
}

#ifndef DOXYGEN_BUILD // sphinx can't cope with this overload
// Converts a UTF-8 string to UTF-16 and returns it in a string type chosen by the caller.
// `StringType` must have `char16_t` as its `value_type`.
template <class StringType>
inline std::enable_if_t<std::is_same_v<typename StringType::value_type, char16_t>, StringType> convertUtf8StringToUtf16(
    cpp::string_view str)
{
    StringType converted;
    static_cast<void>(detail::convertBetweenUnicodeFormats(str, converted));
    return converted;
}
#endif

// Converts the UTF-8 string `str` to UTF-16 and returns the result as a `std::u16string`.
inline std::u16string convertUtf8StringToUtf16(cpp::string_view str)
{
    return convertUtf8StringToUtf16<std::u16string>(str);
}
// Converts a UTF-8 string without an explicit length to UTF-16, returned as a `std::u16string`.
// The input is wrapped in a view via `cpp::unsafe_length` before converting.
inline std::u16string convertUtf8StringToUtf16(cpp::unbounded_string str)
{
    const cpp::string_view view(cpp::unsafe_length, str);
    return convertUtf8StringToUtf16<std::u16string>(view);
}

// Converts a UTF-8 string to a wide-character string in a caller-provided buffer.
//
// The wide encoding follows the size of `wchar_t`: UTF-16 where it is two bytes and UTF-32 where
// it is four.  `str` is the UTF-8 input.  `out` is the destination buffer, or nullptr to only
// compute the size a full conversion needs.  `outLen` is the capacity of `out` in `wchar_t` units
// including the terminator.  Returns the number of units produced including the null terminator,
// or 0 when `out` is non-null but `outLen` is 0.  If the buffer fills up, conversion stops at the
// last code point that fit completely and the output stays terminated.
inline size_t convertUtf8StringToWide(cpp::string_view str, wchar_t* out, size_t outLen) noexcept
{
    detail::BufferAdapter<wchar_t> sink{ out, outLen };
    // A `false` result only means the buffer filled up; the size below already reflects that.
    static_cast<void>(detail::convertBetweenUnicodeFormats(str, sink));
    return sink.sizeWithTerminator();
}

// Overload of the buffer-based UTF-8 to wide conversion for a string without an explicit length.
// The string is wrapped in a view via `cpp::unsafe_length` and forwarded to the
// `cpp::string_view` overload; parameters and return value are the same as for that overload.
inline size_t convertUtf8StringToWide(cpp::unbounded_string str, wchar_t* out, size_t outLen) noexcept
{
    const cpp::string_view view(cpp::unsafe_length, str);
    return convertUtf8StringToWide(view, out, outLen);
}

#ifndef DOXYGEN_BUILD // sphinx can't cope with this overload
// Converts a UTF-8 string to a wide string and returns it in a string type chosen by the caller.
// `StringType` must have `wchar_t` as its `value_type`.
template <class StringType>
inline std::enable_if_t<std::is_same_v<typename StringType::value_type, wchar_t>, StringType> convertUtf8StringToWide(
    cpp::string_view str)
{
    StringType converted;
    static_cast<void>(detail::convertBetweenUnicodeFormats(str, converted));
    return converted;
}
#endif

// Converts the UTF-8 string `str` to a wide string and returns the result as a `std::wstring`.
inline std::wstring convertUtf8StringToWide(cpp::string_view str)
{
    return convertUtf8StringToWide<std::wstring>(str);
}
inline std::wstring convertUtf8StringToWide(const char* str)
{
    return convertUtf8StringToWide<std::wstring>(cpp::string_view(cpp::unsafe_length, str));
}

// Converts a wide-character string to UTF-8 in a caller-provided buffer.
//
// The wide input is read as UTF-16 where `wchar_t` is two bytes and as UTF-32 where it is four.
// `str` is the wide input.  `out` is the destination buffer, or nullptr to only compute the size
// a full conversion needs.  `outLen` is the capacity of `out` in bytes including the terminator.
// Returns the number of bytes produced including the null terminator, or 0 when `out` is non-null
// but `outLen` is 0.  If the buffer fills up, conversion stops at the last code point that fit
// completely and the output stays terminated.
inline size_t convertWideStringToUtf8(cpp::wstring_view str, char* out, size_t outLen) noexcept
{
    detail::BufferAdapter<char> sink{ out, outLen };
    // A `false` result only means the buffer filled up; the size below already reflects that.
    static_cast<void>(detail::convertBetweenUnicodeFormats(str, sink));
    return sink.sizeWithTerminator();
}

// Overload of the buffer-based wide to UTF-8 conversion for a string without an explicit length.
// The string is wrapped in a view via `cpp::unsafe_length` and forwarded to the
// `cpp::wstring_view` overload; parameters and return value are the same as for that overload.
inline size_t convertWideStringToUtf8(cpp::unbounded_wstring str, char* out, size_t outLen) noexcept
{
    const cpp::wstring_view view(cpp::unsafe_length, str);
    return convertWideStringToUtf8(view, out, outLen);
}

#ifndef DOXYGEN_BUILD // sphinx can't cope with this overload
// Converts a wide string to UTF-8 and returns it in a string type chosen by the caller.
// `StringType` must use a UTF-8 character type (see detail::IsUtf8Char_v) as its `value_type`.
template <class StringType>
std::enable_if_t<detail::IsUtf8Char_v<typename StringType::value_type>, StringType> convertWideStringToUtf8(
    cpp::wstring_view str)
{
    StringType converted;
    static_cast<void>(detail::convertBetweenUnicodeFormats(str, converted));
    return converted;
}
#endif

// Converts the wide string `str` to UTF-8 and returns the result as a `std::string`.
inline std::string convertWideStringToUtf8(cpp::wstring_view str)
{
    return convertWideStringToUtf8<std::string>(str);
}
// Converts a wide string without an explicit length to UTF-8, returned as a `std::string`.
// The input is wrapped in a view via `cpp::unsafe_length` before converting.
inline std::string convertWideStringToUtf8(cpp::unbounded_wstring str)
{
    const cpp::wstring_view view(cpp::unsafe_length, str);
    return convertWideStringToUtf8<std::string>(view);
}

} // namespace carb::extras