
// File: carb/extras/Utf8Parser.h

// Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once

#include "../Defines.h"
#include "../../omni/extras/ScratchBuffer.h"

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>
#include <utility>

namespace carb
namespace extras

/** Static helper class for decoding and encoding UTF-8 byte sequences.
 *
 *  All members are static; the class only groups the related types, flags, constants and
 *  functions.  Decoding accepts lead bytes up to 0xfe, i.e. sequences of up to
 *  @ref kMaxSequenceLength bytes, and can optionally combine or produce UTF-16 surrogate
 *  pairs that were stored as two 3-byte UTF-8 sequences.
 */
class Utf8Parser
{
public:
    /** A decoded code point value. */
    using CodePoint = char32_t;

    /** A single UTF-16 code unit. */
    using Utf16CodeUnit = char16_t;

    /** A single byte of a UTF-8 encoded string. */
    using CodeByte = char;

    /** Bit flags accepted by the decode and encode functions (`fDecode*` and `fEncode*`). */
    using Flags = uint32_t;

    /** Result of classifyUtf16SurrogateMember(). */
    enum class SurrogateMember
    {
        /** The value is not in the UTF-16 surrogate range. */
        eNone,
        /** The value is a high (leading) surrogate. */
        eHigh,
        /** The value is a low (trailing) surrogate. */
        eLow,
    };

    /** On a decode failure report @ref kDefaultCodePoint instead of 0. */
    static constexpr Flags fDecodeUseDefault = 0x00000001;

    /** On a decode failure search forward for the next byte that could start a code point and
     *  return that position instead of `nullptr`.
     */
    static constexpr Flags fDecodeSkipInvalid = 0x00000002;

    /** When encoding, write code points at or above 0x10000 as a UTF-16 surrogate pair made of
     *  two 3-byte sequences.
     */
    static constexpr Flags fEncodeUseUtf16 = 0x00000004;

    /** When decoding, do not combine a high/low surrogate pair into a single code point. */
    static constexpr Flags fEncodeIgnoreSurrogatePairs = 0x00000008;

    /** Special length value meaning "the string ends at its null terminator". */
    static constexpr size_t kNullTerminated = ~0ull;

    /** All-bits-set code point value.  getCodeBytes() cannot encode this value. */
    static constexpr CodePoint kInvalidCodePoint = ~0u;

    /** Largest number of bytes a single encoded sequence can occupy. */
    static constexpr size_t kMaxSequenceLength = 7;

    /** Code point reported for failed decodes when @ref fDecodeUseDefault is set. */
    static constexpr CodePoint kDefaultCodePoint = 0x0000fffd;

    /** Decodes the code point at the start of @p str and finds where the following one begins.
     *
     *  @param[in] str            The UTF-8 bytes to decode.  A null pointer is treated as a
     *                            decode failure.
     *  @param[in] lengthInBytes  Number of readable bytes in @p str, or @ref kNullTerminated.
     *  @param[out] codepoint     Optional; receives the decoded value.  On failure it receives
     *                            0, or @ref kDefaultCodePoint if @ref fDecodeUseDefault is set.
     *  @param[in] flags          Combination of the `fDecode*` flags and
     *                            @ref fEncodeIgnoreSurrogatePairs.
     *  @returns The start of the next code point.  Returns `nullptr` when the null terminator
     *           was reached, the length was exhausted, or decoding failed and no resume
     *           position was found (only searched for with @ref fDecodeSkipInvalid).
     */
    static const CodeByte* nextCodePoint(const CodeByte* str,
                                         size_t lengthInBytes = kNullTerminated,
                                         CodePoint* codepoint = nullptr,
                                         Flags flags = 0)
    {
        // retrieve the next code point
        CodePoint high = 0;
        const CodeByte* next = nullptr;
        bool r = parseUtf8(str, &next, &high, lengthInBytes, flags);

        if (codepoint != nullptr)
        {
            *codepoint = high;
        }

        // parsing failed => just fail out
        if (!r)
        {
            return next;
        }

        // it's a surrogate pair (and we're allowed to parse those) => parse out the full pair
        if ((flags & fEncodeIgnoreSurrogatePairs) == 0 && classifyUtf16SurrogateMember(high) == SurrogateMember::eHigh)
        {
            // figure out the new length if it's not null terminated
            const size_t newLen = (lengthInBytes == kNullTerminated) ? kNullTerminated : (lengthInBytes - (next - str));

            // parse out the next code point
            CodePoint low = 0;
            r = parseUtf8(next, &next, &low, newLen, flags);

            // invalid surrogate pair => fail
            if (!r || classifyUtf16SurrogateMember(low) != SurrogateMember::eLow)
            {
                if (codepoint != nullptr)
                {
                    *codepoint = getFailureCodepoint(flags);
                }

                return next;
            }

            // valid surrogate pair => calculate the code point
            if (codepoint != nullptr)
            {
                *codepoint = (((high & kSurrogateMask) << kSurrogateBits) | (low & kSurrogateMask)) + kSurrogateBias;
            }

            return next;
        }

        return next;
    }

    /** Finds and decodes the last code point of a string by scanning it backwards.
     *
     *  Without @ref fDecodeSkipInvalid only the trailing @ref kMaxSequenceLength bytes are
     *  examined and any malformed tail makes the call fail.  With it, invalid trailing data is
     *  skipped and the scan may continue to the start of the string.
     *
     *  @param[in] str            The UTF-8 string.  Null or empty strings fail.
     *  @param[in] lengthInBytes  Number of bytes in @p str, or @ref kNullTerminated.
     *  @param[out] codepoint     Optional; receives the decoded value, or the failure value
     *                            selected by @p flags when the call fails.
     *  @param[in] flags          Only the `fDecode*` flags are honoured.
     *  @returns Pointer to the first byte of the last code point, or `nullptr` on failure.
     */
    static const CodeByte* lastCodePoint(const CodeByte* str,
                                         size_t lengthInBytes = kNullTerminated,
                                         CodePoint* codepoint = nullptr,
                                         Flags flags = fDecodeUseDefault)
    {
        // Prepare error value result
        if (codepoint != nullptr)
        {
            *codepoint = getFailureCodepoint(flags);
        }

        // Check if it's a null or empty string
        if (!str || *str == 0)
        {
            return nullptr;
        }

        if (lengthInBytes == kNullTerminated)
        {
            lengthInBytes = std::strlen(str);
        }

        // Make sure no unexpected flags pass into used `nextCodePoint` function
        constexpr Flags kErrorHandlingMask = fDecodeSkipInvalid | fDecodeUseDefault;
        const Flags helperParserFlags = (flags & kErrorHandlingMask);

        size_t curCodePointSize = 0; // Keeps max number of bytes for a decoding attempt with `nextCodePoint`
        const bool skipInvalid = (flags & fDecodeSkipInvalid) != 0;

        // Walk the string backwards to find the start of the last CodePoint and decode it.
        // When not skipping invalid data only the last `kMaxSequenceLength` bytes can belong to
        // the last CodePoint, so the scan is limited to them.  `scanFloor` is the lowest byte
        // that may be visited; using an inclusive in-bounds floor (instead of a
        // one-before-begin sentinel) keeps every pointer formed here inside the string.
        const size_t scanLimit =
            skipInvalid ? lengthInBytes : (lengthInBytes < kMaxSequenceLength ? lengthInBytes : kMaxSequenceLength);
        const CodeByte* const scanFloor = str + (lengthInBytes - scanLimit);

        for (const CodeByte* rIter = str + lengthInBytes; rIter != scanFloor;)
        {
            --rIter;

            const uint8_t curByte = static_cast<uint8_t>(*rIter);

            ++curCodePointSize;

            // Check if the current code byte is a direct ASCII character
            if (curByte < k7BitLimit)
            {
                // If parsed more than one byte then it's an error
                if (curCodePointSize > 1 && !skipInvalid)
                {
                    return nullptr;
                }

                if (codepoint != nullptr)
                {
                    *codepoint = curByte;
                }
                return rIter;
            }

            // The current code byte is a continuation byte so step further
            if (curByte < kMinLeadByte)
            {
                continue;
            }

            // The current code byte is a lead byte, decode the sequence and check that all bytes were used
            CodePoint cp{};
            const CodeByte* next = nextCodePoint(rIter, curCodePointSize, &cp, helperParserFlags);

            if (!next)
            {
                if (skipInvalid)
                {
                    curCodePointSize = 0;
                    continue;
                }

                return nullptr;
            }

            // Validate that all bytes till the end were used if expecting no invalid bytes
            // Ex: "\xce\xa6\xa6" is a 2 byte sequence "\xce\xa6" for a 0x03A6 code point followed by excessive
            // follow up byte "\xa6". The first 2 bytes will be decoded by the `nextCodePoint` properly
            // and `next` will be pointing at the last "\xa6" byte
            if (!skipInvalid && curCodePointSize != static_cast<size_t>(next - rIter))
            {
                return nullptr;
            }

            const SurrogateMember surrogateType = classifyUtf16SurrogateMember(cp);

            // Encountered the high surrogate part first which is an error (rare path)
            if (surrogateType == SurrogateMember::eHigh)
            {
                if (skipInvalid)
                {
                    // Just skip it and search further
                    curCodePointSize = 0;
                    continue;
                }

                return nullptr;
            }
            // Found the low part of a surrogate pair, need to continue parsing to get the high part (rare path)
            else if (surrogateType == SurrogateMember::eLow)
            {
                constexpr int kSurrogatePartSize = 3;
                constexpr int kFullSurrogatePairSize = 2 * kSurrogatePartSize;

                // To prepare for possible continuation of parsing if skipping invalid bytes and no high surrogate is
                // found reset the possible CodePoint size
                curCodePointSize = 0;

                // For a valid UTF-8 string there must be a high surrogate (3 bytes) preceding the low surrogate
                // (3 bytes), and those 3 bytes must lie inside the scanned range
                if ((rIter - scanFloor) < kSurrogatePartSize)
                {
                    if (skipInvalid)
                    {
                        // Skip the low surrogate data and continue to check the preceding byte
                        continue;
                    }
                    return nullptr;
                }

                // Step 3 bytes preceding the low surrogate
                const CodeByte* const possibleHighSurStart = rIter - kSurrogatePartSize;
                // Check if it starts with a lead byte
                if (static_cast<uint8_t>(*possibleHighSurStart) < kMinLeadByte)
                {
                    if (skipInvalid)
                    {
                        continue;
                    }
                    return nullptr;
                }

                // Try to parse 6 bytes (full surrogate pair size) to get the whole CodePoint without skipping invalid
                // bytes
                const CodeByte* const decodedPairEnd =
                    nextCodePoint(possibleHighSurStart, kFullSurrogatePairSize, &cp, 0);

                if (!decodedPairEnd)
                {
                    if (skipInvalid)
                    {
                        continue;
                    }
                    return nullptr;
                }

                // Check if used all 6 bytes (as expected from a surrogate pair)
                if (decodedPairEnd - possibleHighSurStart != kFullSurrogatePairSize)
                {
                    if (skipInvalid)
                    {
                        continue;
                    }
                    return nullptr;
                }

                // A proper surrogate pair was parsed into the `cp`
                // and only the `rIter` has invalid value at this point
                rIter = possibleHighSurStart;
                // Just exit the block so the code below reports the result
            }

            if (codepoint)
            {
                *codepoint = cp;
            }

            // Everything is fine thus return start of the sequence
            return rIter;
        }

        // Didn't find start of a valid CodePoint
        return nullptr;
    }

    /** Counts the code points in a UTF-8 string.
     *
     *  Counting stops at the null terminator, when @p maxLengthInBytes is exhausted, or at the
     *  first position from which nextCodePoint() returns `nullptr`.
     *
     *  @param[in] str               The UTF-8 string.
     *  @param[in] maxLengthInBytes  Maximum number of bytes to read, or @ref kNullTerminated.
     *  @param[in] flags             Flags forwarded to nextCodePoint().
     *  @returns The number of code points counted; 0 for an empty or immediately invalid string.
     */
    static size_t getLengthInCodePoints(const CodeByte* str, size_t maxLengthInBytes = kNullTerminated, Flags flags = 0)
    {
        const CodeByte* current;
        const CodeByte* next;
        size_t count = 0;

        // get the second codepoint in the string.
        current = str;
        next = nextCodePoint(str, maxLengthInBytes, nullptr, flags);

        // empty or invalid string => fail.
        if (next == nullptr)
        {
            return 0;
        }

        if (maxLengthInBytes != kNullTerminated)
        {
            maxLengthInBytes -= next - current;
        }

        count++;

        do
        {
            current = next;
            next = nextCodePoint(current, maxLengthInBytes, nullptr, flags);

            if (next == nullptr)
            {
                return count;
            }

            if (maxLengthInBytes != kNullTerminated)
            {
                maxLengthInBytes -= next - current;
            }

            count++;
        } while (maxLengthInBytes > 0);

        return count;
    }

    /** Computes how many UTF-8 bytes are needed to encode a UTF-32 string.
     *
     *  @param[in] str                    The code points to measure.  `nullptr` yields 0.
     *  @param[in] maxLengthInCodePoints  Maximum number of elements to read, or
     *                                    @ref kNullTerminated.
     *  @param[in] flags                  With @ref fEncodeUseUtf16, code points in
     *                                    [0x10000, 0x200000) are counted as 6 bytes instead of 4.
     *  @returns The byte count, not including any null terminator.
     */
    static size_t getLengthInCodeBytes(const CodePoint* str,
                                       size_t maxLengthInCodePoints = kNullTerminated,
                                       Flags flags = 0)
    {
        size_t count = 0;
        size_t largeCodePointSize = 4;

        if (str == nullptr)
        {
            return 0;
        }

        if ((flags & fEncodeUseUtf16) != 0)
        {
            largeCodePointSize = 6;
        }

        // the bound is tested before the element is read so that a buffer of exactly
        // `maxLengthInCodePoints` elements without a terminator is never read past its end.
        for (size_t i = 0; i < maxLengthInCodePoints && str[i] != 0; i++)
        {
            if (str[i] < getMaxCodePoint(0))
            {
                count++;
            }
            else if (str[i] < getMaxCodePoint(1))
            {
                count += 2;
            }
            else if (str[i] < getMaxCodePoint(2))
            {
                count += 3;
            }
            else if (str[i] < getMaxCodePoint(3))
            {
                count += largeCodePointSize;
            }
            else if (str[i] < getMaxCodePoint(4))
            {
                count += 5;
            }
            else if (str[i] < getMaxCodePoint(5))
            {
                count += 6;
            }
            else
            {
                count += 7;
            }
        }

        return count;
    }

    /** Computes how many UTF-8 bytes are needed to encode a UTF-16 string.
     *
     *  @param[in] str                    The UTF-16 code units to measure.  `nullptr` yields 0.
     *  @param[in] maxLengthInCodePoints  Maximum number of code units to read, or
     *                                    @ref kNullTerminated.
     *  @param[in] flags                  With @ref fEncodeUseUtf16, a surrogate pair is counted
     *                                    as 6 bytes instead of 4.
     *  @returns The byte count, not including any null terminator.
     */
    static size_t getLengthInCodeBytes(const Utf16CodeUnit* str,
                                       size_t maxLengthInCodePoints = kNullTerminated,
                                       Flags flags = 0)
    {
        size_t count = 0;
        size_t largeCodePointSize = 4;

        if (str == nullptr)
        {
            return 0;
        }

        if ((flags & fEncodeUseUtf16) != 0)
        {
            largeCodePointSize = 6;
        }

        // the bound is tested before the element is read (see the UTF-32 overload).
        for (size_t i = 0; i < maxLengthInCodePoints && str[i] != 0; i++)
        {
            if (str[i] < getMaxCodePoint(0))
            {
                count++;
            }
            else if (str[i] < getMaxCodePoint(1))
            {
                count += 2;
            }
            else
            {
                // found a surrogate pair in the string -> both of these codepoints will decode to
                // a single UTF-32 codepoint => skip the low surrogate and add the size of a
                //   single encoded codepoint.
                if (str[i] >= kSurrogateBaseHigh && str[i] < kSurrogateBaseLow && i + 1 < maxLengthInCodePoints &&
                    str[i + 1] >= kSurrogateBaseLow && str[i + 1] <= kSurrogateMax)
                {
                    count += largeCodePointSize;
                    i++;
                }
                // not part of a UTF-16 surrogate pair => this will encode to 3 bytes in UTF-8.
                else
                {
                    count += 3;
                }
            }
        }

        return count;
    }

    /** Decodes the first code point of a string.
     *
     *  @param[in] str            The UTF-8 bytes to decode.
     *  @param[in] lengthInBytes  Number of readable bytes in @p str, or @ref kNullTerminated.
     *  @param[in] flags          Flags forwarded to nextCodePoint().
     *  @returns The value nextCodePoint() stored in its `codepoint` argument.
     */
    static CodePoint getCodePoint(const CodeByte* str, size_t lengthInBytes = kNullTerminated, Flags flags = 0)
    {
        char32_t c = 0;
        nextCodePoint(str, lengthInBytes, &c, flags);
        return c;
    }

    /** Encodes one code point as UTF-8.
     *
     *  No null terminator is written.
     *
     *  @param[in] cp             The code point to encode.
     *  @param[out] str           Destination buffer.
     *  @param[in] lengthInBytes  Capacity of @p str in bytes.
     *  @param[out] bytesWritten  Receives the number of bytes stored; 0 on failure.  Must not
     *                            be `nullptr`.
     *  @param[in] flags          With @ref fEncodeUseUtf16, values at or above 0x10000 are
     *                            written as a surrogate pair (two 3-byte sequences).
     *  @returns @p str on success.  Returns `nullptr` if @p str is null, the buffer is too
     *           small, or no sequence length exists for @p cp (only 0xffffffff).
     */
    static CodeByte* getCodeBytes(CodePoint cp, CodeByte* str, size_t lengthInBytes, size_t* bytesWritten, Flags flags = 0)
    {
        size_t sequenceLength = 0;
        size_t continuationLength = 0;
        size_t codePointCount = 1;
        CodePoint codePoint[2] = { cp, 0 };
        CodeByte* result;

        // no buffer or not enough room in the buffer => fail.
        if (str == nullptr || lengthInBytes == 0)
        {
            *bytesWritten = 0;
            return nullptr;
        }

        // a 7-bit ASCII character -> this can be directly stored => store and return.
        if (codePoint[0] < k7BitLimit)
        {
            str[0] = CodeByte((codePoint[0] & 0xff));
            *bytesWritten = 1;
            return str;
        }

        // at this point we know that the encoding for the codepoint is going to require at least
        // two bytes.  We need to calculate the sequence length and encode the bytes.

        // allowing a UTF-16 surrogate pair encoding in the string and the codepoint is above the
        //   range where a surrogate pair is necessary => calculate the low and high codepoints
        //   for the pair and set the sequence length.
        if ((flags & fEncodeUseUtf16) != 0 && codePoint[0] >= kSurrogateBias)
        {
            sequenceLength = 3;
            continuationLength = 2;
            codePointCount = 2;

            codePoint[0] -= kSurrogateBias;

            codePoint[1] = kSurrogateBaseLow | (codePoint[0] & kSurrogateMask);
            codePoint[0] = kSurrogateBaseHigh | ((codePoint[0] >> kSurrogateBits) & kSurrogateMask);
        }

        // not using a UTF-16 surrogate pair => search for the required length of the sequence.
        else
        {
            // figure out the required sequence length for this codepoint.
            for (size_t i = 1; i < kMaxSequenceBytes; i++)
            {
                if (codePoint[0] < getMaxCodePoint(i))
                {
                    sequenceLength = i + 1;
                    continuationLength = i;
                    break;
                }
            }

            // failed to find a sequence length for the given codepoint => fail.
            if (sequenceLength == 0)
            {
                *bytesWritten = 0;
                return nullptr;
            }
        }

        // not enough space in the buffer to store the entire sequence => fail.
        if (lengthInBytes < sequenceLength * codePointCount)
        {
            *bytesWritten = 0;
            return nullptr;
        }

        result = str;

        // write out each of the codepoints.  If UTF-16 encoding is not being used, there will only
        // be one codepoint and this loop will exit after the first iteration.
        for (size_t j = 0; j < codePointCount; j++)
        {
            cp = codePoint[j];

            // write out the lead byte.  A 7-byte sequence places the lead payload 36 bits up,
            // which is beyond the width of a code point; shifting that far is undefined, and the
            // lead byte carries no payload bits in that case anyway.
            const size_t leadShift = continuationLength * kContinuationShift;
            const CodePoint leadBits =
                (leadShift < kCodePointBits) ? static_cast<CodePoint>(cp >> leadShift) : CodePoint(0);

            *str = CodeByte(getLeadByte(continuationLength) | (leadBits & getLeadMask(continuationLength)));
            str++;

            // write out the continuation bytes.
            for (size_t i = 0; i < continuationLength; i++)
            {
                *str = CodeByte(kContinuationBits |
                                ((cp >> ((continuationLength - i - 1) * kContinuationShift)) & kContinuationMask));
                str++;
            }
        }

        *bytesWritten = sequenceLength * codePointCount;
        return result;
    }

    /** Classifies a value as a high surrogate, a low surrogate, or neither.
     *
     *  @param[in] cp  The value to classify.
     *  @returns `eHigh` for [0xd800, 0xdc00), `eLow` for [0xdc00, 0xdfff], otherwise `eNone`.
     */
    static SurrogateMember classifyUtf16SurrogateMember(CodePoint cp)
    {
        if (cp >= kSurrogateBaseHigh && cp < kSurrogateBaseLow)
        {
            return SurrogateMember::eHigh;
        }

        if (cp >= kSurrogateBaseLow && cp <= kSurrogateMax)
        {
            return SurrogateMember::eLow;
        }

        return SurrogateMember::eNone;
    }

    /** Combines a UTF-16 surrogate pair into a single code point.
     *
     *  @param[in] high  The high surrogate.
     *  @param[in] low   The low surrogate.
     *  @returns The combined code point, or 0 if either value is outside its surrogate range.
     */
    static CodePoint decodeUtf16CodePoint(CodePoint high, CodePoint low)
    {
        CodePoint cp;

        // the high and low codepoints are out of the surrogate pair range -> cannot decode => fail.
        if (high < kSurrogateBaseHigh || high >= kSurrogateBaseLow || low < kSurrogateBaseLow || low > kSurrogateMax)
        {
            return 0;
        }

        // decode the surrogate pair into a single Unicode codepoint.
        cp = (((high & kSurrogateMask) << kSurrogateBits) | (low & kSurrogateMask)) + kSurrogateBias;
        return cp;
    }

    /** Encodes a code point as one or two UTF-16 code units packed into one 32-bit value.
     *
     *  @param[in] cp    The code point to encode.
     *  @param[out] out  Optional.  Receives @p cp unchanged when one unit suffices; otherwise
     *                   the high surrogate in the low 16 bits and the low surrogate in the
     *                   high 16 bits.
     *  @returns The number of UTF-16 code units required (1 or 2).
     */
    static size_t encodeUtf16CodePoint(CodePoint cp, CodePoint* out)
    {
        CodePoint high;
        CodePoint low;

        // small enough for a direct encoding => just store it.
        if (cp < kSurrogateBias)
        {
            if (out != nullptr)
            {
                *out = cp;
            }

            return 1;
        }

        // too big for direct encoding => convert it to a surrogate pair and store both in the
        //   output buffer.
        cp -= kSurrogateBias;
        low = kSurrogateBaseLow | (cp & kSurrogateMask);
        high = kSurrogateBaseHigh | ((cp >> kSurrogateBits) & kSurrogateMask);

        if (out != nullptr)
        {
            *out = high | (low << 16);
        }

        return 2;
    }

    /** Tests whether a code point is one of the space characters in the table below.
     *
     *  @param[in] cp  The code point to test.
     *  @returns `true` if @p cp appears in the table.
     */
    inline static bool isSpaceCodePoint(CodePoint cp)
    {
        // Note: sorted to allow binary search
        static constexpr CodePoint kSpaceCodePoints[] = {
            0x0009, //  character tabulation
            0x000A, //  line feed
            0x000B, //  line tabulation
            0x000C, //  form feed
            0x000D, //  carriage return
            0x0020, //  space
            0x0085, //  next line
            0x00A0, //  no-break space
            0x1680, //  ogham space mark
            0x180E, //  Mongolian vowel separator
            0x2000, //  en quad
            0x2001, //  em quad
            0x2002, //  en space
            0x2003, //  em space
            0x2004, //  three-per-em space
            0x2005, //  four-per-em space
            0x2006, //  six-per-em space
            0x2007, //  figure space
            0x2008, //  punctuation space
            0x2009, //  thin space
            0x200A, //  hair space
            0x200B, //  zero width space
            0x200C, //  zero width non-joiner
            0x200D, //  zero width joiner
            0x2028, //  line separator
            0x2029, //  paragraph separator
            0x202F, //  narrow no-break space
            0x205F, //  medium mathematical space
            0x2060, //  word joiner
            0x3000, //  ideograph space
            0xFEFF, //  zero width non-breaking space
        };
        constexpr size_t kSpaceCodePointsCount = sizeof(kSpaceCodePoints) / sizeof(kSpaceCodePoints[0]);
        constexpr const CodePoint* const kSpaceCodePointsEnd = kSpaceCodePoints + kSpaceCodePointsCount;
        return std::binary_search(kSpaceCodePoints, kSpaceCodePointsEnd, cp);
    }

    // NOTE(review): SOURCE lost its access specifiers; the members below are assumed to be
    // private implementation details - confirm against the upstream header.
private:
    /** Number of payload bits in the lead byte, indexed by continuation length. */
    static constexpr uint8_t s_leadBits[] = { 7, 5, 4, 3, 2, 1, 0 };

    /** Largest valid Unicode code point. */
    static constexpr CodePoint kMaxCodePoint = 0x0010ffff;

    /** Number of payload bits carried by each continuation byte. */
    static constexpr uint32_t kContinuationShift = 6;

    /** Bit pattern that marks a continuation byte (10xxxxxx). */
    static constexpr uint8_t kContinuationBits = 0x80;

    /** Mask selecting the payload bits of a continuation byte. */
    static constexpr uint8_t kContinuationMask = (1u << kContinuationShift) - 1;

    /** Value subtracted from / added to a code point when converting to / from a surrogate pair. */
    static constexpr CodePoint kSurrogateBias = 0x00010000;

    /** First high surrogate value. */
    static constexpr CodePoint kSurrogateBaseHigh = 0x0000d800;

    /** First low surrogate value. */
    static constexpr CodePoint kSurrogateBaseLow = 0x0000dc00;

    /** Lowest value of the whole surrogate range. */
    static constexpr CodePoint kSurrogateMin = 0x0000d800;

    /** Highest value of the whole surrogate range. */
    static constexpr CodePoint kSurrogateMax = 0x0000dfff;

    /** Number of payload bits carried by each surrogate. */
    static constexpr uint32_t kSurrogateBits = 10;

    /** Mask selecting the payload bits of a surrogate. */
    static constexpr CodePoint kSurrogateMask = ((1 << kSurrogateBits) - 1);

    /** Upper bound (exclusive) on the continuation length searched by getCodeBytes(). */
    static constexpr size_t kMaxSequenceBytes = 7;

    /** Bytes below this value are single-byte (ASCII) characters. */
    static constexpr uint8_t k7BitLimit = 0x80;

    /** Bytes at or above this value are lead bytes; bytes in [0x80, 0xc0) are continuations. */
    static constexpr uint8_t kMinLeadByte = 0xc0;

    /** Width of a code point in bits; shifts of this size or more must be avoided. */
    static constexpr size_t kCodePointBits = 32;

    /** Looks up the number of continuation bytes that follow a lead byte.
     *
     *  @param[in] leadByte  A byte value in [0xc0, 0xff].  Smaller values index out of the table.
     *  @returns The continuation length; 0 marks an invalid lead byte (0xc0, 0xc1, 0xff).
     */
    static constexpr uint8_t getContinuationLength(size_t leadByte)
    {
        constexpr uint8_t s_continuationSize[] = {
            0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xc0 - 0xcf */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xd0 - 0xdf */
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xe0 - 0xef */
            3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 0, /* 0xf0 - 0xff */
        };
        return s_continuationSize[leadByte - kMinLeadByte];
    }

    /** Returns the mask selecting the payload bits of a lead byte.
     *
     *  @param[in] continuationLength  Continuation length of the sequence, 0 to 6.
     */
    static constexpr uint8_t getLeadMask(size_t continuationLength)
    {
        constexpr uint8_t s_leadMasks[] = { (1u << s_leadBits[0]) - 1, (1u << s_leadBits[1]) - 1,
                                            (1u << s_leadBits[2]) - 1, (1u << s_leadBits[3]) - 1,
                                            (1u << s_leadBits[4]) - 1, (1u << s_leadBits[5]) - 1,
                                            (1u << s_leadBits[6]) - 1 };
        return s_leadMasks[continuationLength];
    }

    /** Returns the marker bits of a lead byte (its value with an all-zero payload).
     *
     *  @param[in] continuationLength  Continuation length of the sequence, 0 to 6.
     */
    static constexpr uint8_t getLeadByte(size_t continuationLength)
    {
        constexpr uint8_t s_leadBytes[] = {
            (0xffu << (s_leadBits[0] + 1)) & 0xff, (0xffu << (s_leadBits[1] + 1)) & 0xff,
            (0xffu << (s_leadBits[2] + 1)) & 0xff, (0xffu << (s_leadBits[3] + 1)) & 0xff,
            (0xffu << (s_leadBits[4] + 1)) & 0xff, (0xffu << (s_leadBits[5] + 1)) & 0xff,
            (0xffu << (s_leadBits[6] + 1)) & 0xff
        };
        return s_leadBytes[continuationLength];
    }

    /** Returns the exclusive upper limit of code points encodable with a continuation length.
     *
     *  @param[in] continuationLength  Continuation length of the sequence, 0 to 6.
     */
    static constexpr CodePoint getMaxCodePoint(size_t continuationLength)
    {
        constexpr CodePoint s_maxCodePoint[] = { 0x00000080, 0x00000800, 0x00010000, 0x00200000,
                                                 0x04000000, 0x80000000, 0xffffffff };
        return s_maxCodePoint[continuationLength];
    }

    /** Extracts the payload of a continuation byte and moves it to its place in the code point.
     *
     *  The payload is widened to @ref CodePoint before shifting so that the shift is performed
     *  on an unsigned 32-bit value; bits shifted beyond the width are discarded.
     *
     *  @param[in] byte                The continuation byte.
     *  @param[in] continuationLength  Number of continuation bytes remaining, including this
     *                                 one (at least 1).
     */
    inline static CodePoint decodeContinuationByte(uint8_t byte, size_t continuationLength)
    {
        return static_cast<CodePoint>(byte & kContinuationMask) << ((continuationLength - 1) * kContinuationShift);
    }

    /** Returns the code point to report for a failed decode: @ref kDefaultCodePoint when
     *  @ref fDecodeUseDefault is set, 0 otherwise.
     */
    static constexpr CodePoint getFailureCodepoint(Flags flags)
    {
        return (flags & fDecodeUseDefault) != 0 ? kDefaultCodePoint : 0;
    }

    /** Decodes a single UTF-8 sequence without any surrogate pair handling.
     *
     *  @param[in] str            The bytes to decode.  `nullptr` is treated as a failure.
     *  @param[out] outNext       Receives the start of the next sequence.  Set to `nullptr`
     *                            when the null terminator was hit or no resume point exists.
     *  @param[out] outCodePoint  Receives the decoded value or the failure code point.
     *  @param[in] lengthInBytes  Number of readable bytes in @p str, or @ref kNullTerminated.
     *  @param[in] flags          `fDecode*` flags.
     *  @returns `true` if a code point (including the null terminator) was decoded.
     */
    static bool parseUtf8(const CodeByte* str,
                          const CodeByte** outNext,
                          CodePoint* outCodePoint,
                          size_t lengthInBytes = kNullTerminated,
                          Flags flags = 0)
    {
        auto fail = [&]() -> bool {
            // we weren't asked to attempt to skip over invalid code sequences => just fail out
            if ((flags & fDecodeSkipInvalid) == 0)
            {
                return false;
            }

            // walk the rest of the string skipping over continuation bytes and invalid lead bytes.
            // Note that we've already tested and rejected the first byte so we just need to continue
            // the search starting at the next byte.
            for (size_t i = 1; i < lengthInBytes; i++)
            {
                const auto b = static_cast<uint8_t>(str[i]);

                // continuation byte => skip it.
                if ((b & ~kContinuationMask) == kContinuationBits)
                {
                    continue;
                }

                // invalid lead byte => skip it.
                if (b >= kMinLeadByte && getContinuationLength(b) == 0)
                {
                    continue;
                }

                // invalid range of bytes
                if (b >= k7BitLimit && b < kMinLeadByte)
                {
                    continue;
                }

                *outNext = str + i;
                return false;
            }

            // We've hit the end of the string.  This means that the sequence is
            // either invalid, misaligned, or an illegal overlong sequence was
            // used.  We aren't able to write out the next character pointer if
            // we hit this point.
            return false;
        };

        // initialize to failure values;
        *outCodePoint = getFailureCodepoint(flags);
        *outNext = nullptr;

        // no string, or the string doesn't have any more bytes in it -> no more codepoints => fail.
        if (str == nullptr || lengthInBytes == 0)
        {
            return false;
        }

        const auto byte = static_cast<uint8_t>(*str);

        // the current code byte is at the null terminator -> no more codepoints => finish.
        if (byte == '\0')
        {
            *outCodePoint = byte;
            return true;
        }

        // the current code byte is a direct ASCII character => finish.
        if (byte < k7BitLimit)
        {
            *outCodePoint = byte;
            *outNext = str + 1;
            return true;
        }

        // a continuation byte cannot start a sequence.
        if (byte < kMinLeadByte)
        {
            return fail();
        }

        // the current code byte is a lead byte => calculate the sequence length and return the
        //   start of the next codepoint.
        const size_t continuationLength = getContinuationLength(byte);
        const size_t sequenceLength = continuationLength + 1;

        // not enough bytes left in the string to complete this codepoint => fail.
        // continuationLength of 0 is invalid => fail
        if (lengthInBytes < sequenceLength || continuationLength == 0)
        {
            return fail();
        }

        // decode the codepoint.  The lead payload of a 7-byte sequence would sit 36 bits up,
        // outside the code point; it is empty in that case, so it is skipped rather than
        // shifted by more than the type's width.
        const size_t leadShift = continuationLength * kContinuationShift;
        CodePoint cp = 0;

        if (leadShift < kCodePointBits)
        {
            cp = static_cast<CodePoint>(byte & getLeadMask(continuationLength)) << leadShift;
        }

        for (size_t i = 0; i < continuationLength; i++)
        {
            // validate the continuation byte so we don't walk past the
            // end of a null terminated string
            if ((uint8_t(str[i + 1]) & ~kContinuationMask) != kContinuationBits)
            {
                return fail();
            }

            cp |= decodeContinuationByte(static_cast<uint8_t>(str[i + 1]), continuationLength - i);
        }

        *outCodePoint = cp;
        *outNext = str + sequenceLength;
        return true;
    }
};

class Utf8Iterator
    using CodeByte = Utf8Parser::CodeByte;
    using CodePoint = Utf8Parser::CodePoint;
    using Flags = Utf8Parser::Flags;
    // Reference the special length value used for null terminated strings.
    static constexpr size_t kNullTerminated = Utf8Parser::kNullTerminated;

        : m_prev(nullptr), m_string(nullptr), m_length(kNullTerminated), m_flags(0), m_lastCodePoint(0), m_index(0)

    Utf8Iterator(const CodeByte* string, size_t lengthInBytes = kNullTerminated, Flags flags = 0)
        : m_prev(nullptr), m_string(string), m_length(lengthInBytes), m_flags(flags), m_lastCodePoint(0), m_index(0)

    Utf8Iterator(const Utf8Iterator& it)

    operator bool() const
        return isValid();

    bool operator!() const
        return !isValid();

    CodePoint operator*() const
        return m_lastCodePoint;

    const CodeByte* operator&() const
        return m_prev;

    Utf8Iterator& operator++()
        return *this;

    Utf8Iterator operator++(int32_t)
        Utf8Iterator tmp = (*this);
        return tmp;

    template <typename T>
    Utf8Iterator& operator+=(T count)
        for (T i = 0; i < count && m_prev != nullptr; i++)

        return *this;

    template <typename T>
    Utf8Iterator operator+(T count) const
        Utf8Iterator tmp = *this;
        return (tmp += count);

    bool operator==(const Utf8Iterator& it) const
        return m_string == it.m_string;

    bool operator!=(const Utf8Iterator& it) const
        return m_string != it.m_string;

    bool operator<(const Utf8Iterator& it) const
        return m_string < it.m_string;

    bool operator<=(const Utf8Iterator& it) const
        return m_string <= it.m_string;

    bool operator>(const Utf8Iterator& it) const
        return m_string > it.m_string;

    bool operator>=(const Utf8Iterator& it) const
        return m_string >= it.m_string;

    Utf8Iterator& operator=(const Utf8Iterator& it)
        // Note: normally we'd check for an identity assignment in this operator overload and
        //       ignore.  Unfortunately we can't do that here since we also override the '&'
        //       operator above.  Since this copy operation should still be safe for an identity
        //       assignment, we'll just let it proceed.
        return *this;

    Utf8Iterator& operator=(const CodeByte* str)
        m_prev = nullptr;
        m_string = str;
        m_length = kNullTerminated;
        m_lastCodePoint = 0;
        m_flags = 0;
        m_index = 0;
        return *this;

    size_t getIndex() const
        return m_index - 1;

    size_t getCodepointSize() const
        if (m_string == nullptr)
            return m_prev == nullptr ? 0 : 1;

        return m_string - m_prev;

    void copy(const Utf8Iterator& it)
        m_prev = it.m_prev;
        m_string = it.m_string;
        m_length = it.m_length;
        m_flags = it.m_flags;
        m_lastCodePoint = it.m_lastCodePoint;
        m_index = it.m_index;

    bool isValid() const
        return m_string != nullptr && m_lastCodePoint != 0;

    void next()
        const CodeByte* ptr;

        if (m_string == nullptr)
            m_prev = nullptr;

        if (m_length == 0)
            m_string = nullptr;
            m_prev = nullptr;
            m_lastCodePoint = 0;

        ptr = Utf8Parser::nextCodePoint(m_string, m_length, &m_lastCodePoint, m_flags);

        if (m_length != kNullTerminated)
            m_length -= ptr - m_string;

        m_prev = m_string;
        m_string = ptr;

    const CodeByte* m_prev;

    const CodeByte* m_string;

    size_t m_length;

    Flags m_flags;

    CodePoint m_lastCodePoint;

    size_t m_index;

// implementation details used for string conversions - ignore this!
namespace detail
template <typename T,
          typename O,
          std::pair<size_t, char32_t>(toCodePoint)(const T*),
          size_t(fromCodePoint)(char32_t c, O* out, size_t outLen)>
inline size_t convertBetweenUnicodeFormatsRaw(const T* str, O* out, size_t outLen)
    // the last element written to the output
    O* prev = nullptr;
    size_t prevLen = 0;
    size_t written = 0;
    size_t read = 0;
    bool fullyRead = false;

    if (str == nullptr)
        return 0;

    // no output => ignore outLen in the loop
    if (out == nullptr)
        outLen = SIZE_MAX;

    if (outLen == 0)
        return 0;

    for (;;)
        size_t len;

        // decode the input to UTF-32
        std::pair<size_t, char32_t> decoded = toCodePoint(str + read);

        // decode failed
        if (decoded.first == 0)

        // encode from UTF-32 to the output format
        len = fromCodePoint(decoded.second, (out == nullptr) ? nullptr : out + written, outLen - written);

        // out of buffer space
        if (len == 0)

        // store the last written character
        if (out != nullptr)
            prev = out + written;
        prevLen = len;

        // advance the indices
        written += len;
        read += decoded.first;

        // hit the null terminator => finished
        if (decoded.second == '\0')
            fullyRead = true;

    // if the string was truncated, we need to cut out the last character
    // from the written count
    if (!fullyRead)
        if (written == outLen)
            written -= prevLen;
            written += 1;

            if (out != nullptr)
                *prev = '\0';
            if (out != nullptr)
                out[written] = '\0';

    return written;

inline std::pair<size_t, char32_t> utf8ToCodePoint(const char* str)
    char32_t c = 0;
    const char* p = Utf8Parser::nextCodePoint(str, Utf8Parser::kNullTerminated, &c, Utf8Parser::fDecodeUseDefault);
    if (c == '\0')
        return std::pair<size_t, char32_t>(1, '\0');
    else if (p == nullptr)
        // invalid character, skip it
        return std::pair<size_t, char32_t>(1, c);
        return std::pair<size_t, char32_t>(p - str, c);

inline size_t utf32FromCodePoint(char32_t c, char32_t* out, size_t outLen)
    if (outLen == 0)
        return 0;
        if (out != nullptr)
            out[0] = c;
        return 1;

inline std::pair<size_t, char32_t> utf32ToCodePoint(const char32_t* str)
    return std::pair<size_t, char32_t>(1, *str);

inline size_t utf8FromCodePoint(char32_t c, char* out, size_t outLen)
    char dummy[8];
    size_t len = 0;
    char* p;

    if (out == nullptr)
        out = dummy;
        outLen = CARB_MIN(outLen, CARB_COUNTOF(dummy));

    p = Utf8Parser::getCodeBytes(c, out, outLen, &len);
    if (p == nullptr)
        return 0;
        return len;

inline std::pair<size_t, char32_t> utf16ToCodePoint(const char16_t* str)
    char32_t c;
    switch (Utf8Parser::classifyUtf16SurrogateMember(str[0]))
        case Utf8Parser::SurrogateMember::eHigh:
            c = Utf8Parser::decodeUtf16CodePoint(str[0], str[1]);
            if (c == 0) // invalid surrogate pair

            return std::pair<size_t, char32_t>(2, c);

        // a stray low surrogate is invalid
        case Utf8Parser::SurrogateMember::eLow:

            return std::pair<size_t, char32_t>(1, str[0]);

    // failed to parse => just return the invalid character code point
    constexpr static auto kDefaultCodePoint_ = Utf8Parser::kDefaultCodePoint; // CC-1110
    return std::pair<size_t, char32_t>(1, kDefaultCodePoint_);

inline size_t utf16FromCodePoint(char32_t c, char16_t* out, size_t outLen)
    char32_t mangled = 0;
    size_t len;

    len = Utf8Parser::encodeUtf16CodePoint(c, &mangled);
    if (outLen < len)
        return 0;

    if (out != nullptr)
        switch (len)

            case 1:
                out[0] = char16_t(mangled);

            case 2:
                out[0] = char16_t(mangled & 0xFFFF);
                out[1] = char16_t(mangled >> 16);

    return len;

template <typename T, typename O, size_t conv(const T* str, O* out, size_t outLen)>
inline std::basic_string<O> convertBetweenUnicodeFormats(const T* str)
    omni::extras::ScratchBuffer<O, 4096> buffer;
    size_t len = conv(str, nullptr, 0);
    if (len == 0 || !buffer.resize(len))
        return std::basic_string<O>();
    conv(str,, buffer.size());
    return std::basic_string<O>(, + len - 1);
} // namespace detail

inline size_t convertUtf8StringToUtf32(const char* str, char32_t* out, size_t outLen) noexcept
    return detail::convertBetweenUnicodeFormatsRaw<char, char32_t, detail::utf8ToCodePoint, detail::utf32FromCodePoint>(
        str, out, outLen);

inline std::u32string convertUtf8StringToUtf32(const char* str)
    return detail::convertBetweenUnicodeFormats<char, char32_t, convertUtf8StringToUtf32>(str);

/** Converts a UTF-8 `std::string` to a UTF-32 `std::u32string`.
 *
 *  The string is only read through `c_str()`, so it is taken by const reference to avoid
 *  copying it on every call.  Because `c_str()` is used, conversion stops at the first
 *  embedded null character.
 *
 *  @param[in] str  The UTF-8 string to convert.
 *  @returns The result of the `const char*` overload for `str.c_str()`.
 */
inline std::u32string convertUtf8StringToUtf32(const std::string& str)
{
    return convertUtf8StringToUtf32(str.c_str());
}

inline size_t convertUtf32StringToUtf8(const char32_t* str, char* out, size_t outLen)
    return detail::convertBetweenUnicodeFormatsRaw<char32_t, char, detail::utf32ToCodePoint, detail::utf8FromCodePoint>(
        str, out, outLen);

inline std::string convertUtf32StringToUtf8(const char32_t* str)
    return detail::convertBetweenUnicodeFormats<char32_t, char, convertUtf32StringToUtf8>(str);

/** Converts a UTF-32 `std::u32string` to a UTF-8 `std::string`.
 *
 *  The string is only read through `c_str()`, so it is taken by const reference to avoid
 *  copying it on every call.  Because `c_str()` is used, conversion stops at the first
 *  embedded null character.
 *
 *  @param[in] str  The UTF-32 string to convert.
 *  @returns The result of the `const char32_t*` overload for `str.c_str()`.
 */
inline std::string convertUtf32StringToUtf8(const std::u32string& str)
{
    return convertUtf32StringToUtf8(str.c_str());
}

inline size_t convertUtf16StringToUtf8(const char16_t* str, char* out, size_t outLen)
    return detail::convertBetweenUnicodeFormatsRaw<char16_t, char, detail::utf16ToCodePoint, detail::utf8FromCodePoint>(
        str, out, outLen);

inline std::string convertUtf16StringToUtf8(const char16_t* str)
    return detail::convertBetweenUnicodeFormats<char16_t, char, convertUtf16StringToUtf8>(str);

/** Converts a UTF-16 `std::u16string` to a UTF-8 `std::string`.
 *
 *  The string is only read through `c_str()`, so it is taken by const reference to avoid
 *  copying it on every call.  Because `c_str()` is used, conversion stops at the first
 *  embedded null character.
 *
 *  @param[in] str  The UTF-16 string to convert.
 *  @returns The result of the `const char16_t*` overload for `str.c_str()`.
 */
inline std::string convertUtf16StringToUtf8(const std::u16string& str)
{
    return convertUtf16StringToUtf8(str.c_str());
}

inline size_t convertUtf8StringToUtf16(const char* str, char16_t* out, size_t outLen) noexcept
    return detail::convertBetweenUnicodeFormatsRaw<char, char16_t, detail::utf8ToCodePoint, detail::utf16FromCodePoint>(
        str, out, outLen);

inline std::u16string convertUtf8StringToUtf16(const char* str)
    return detail::convertBetweenUnicodeFormats<char, char16_t, convertUtf8StringToUtf16>(str);

/** Converts a UTF-8 `std::string` to a UTF-16 `std::u16string`.
 *
 *  The string is only read through `c_str()`, so it is taken by const reference to avoid
 *  copying it on every call.  Because `c_str()` is used, conversion stops at the first
 *  embedded null character.
 *
 *  @param[in] str  The UTF-8 string to convert.
 *  @returns The result of the `const char*` overload for `str.c_str()`.
 */
inline std::u16string convertUtf8StringToUtf16(const std::string& str)
{
    return convertUtf8StringToUtf16(str.c_str());
}

/** Converts a UTF-8 string to a wide (`wchar_t`) string into a caller provided buffer.
 *
 *  Two variants are present: one treating `wchar_t` as a UTF-16 code unit and one treating it as
 *  a UTF-32 code unit.  Each is guarded by a `static_assert` on the size of `wchar_t`.
 *  NOTE(review): both assertions cannot hold at once, so exactly one variant is presumably
 *  selected by a platform conditional that is not visible here - confirm.
 *
 *  @param[in] str      The UTF-8 string to convert.
 *  @param[out] out     The buffer that receives the wide code units.
 *  @param[in] outLen   The capacity of @p out in code units.
 *  @returns The value returned by detail::convertBetweenUnicodeFormatsRaw().
 */
inline size_t convertUtf8StringToWide(const char* str, wchar_t* out, size_t outLen) noexcept
    // 16-bit wchar_t variant: encode as UTF-16 and write through the buffer viewed as char16_t
    static_assert(sizeof(*out) == sizeof(char16_t), "unexpected wchar_t type");
    return detail::convertBetweenUnicodeFormatsRaw<char, char16_t, detail::utf8ToCodePoint, detail::utf16FromCodePoint>(
        str, reinterpret_cast<char16_t*>(out), outLen);
    // 32-bit wchar_t variant: encode as UTF-32 and write through the buffer viewed as char32_t
    static_assert(sizeof(*out) == sizeof(char32_t), "unexpected wchar_t type");
    return detail::convertBetweenUnicodeFormatsRaw<char, char32_t, detail::utf8ToCodePoint, detail::utf32FromCodePoint>(
        str, reinterpret_cast<char32_t*>(out), outLen);

inline std::wstring convertUtf8StringToWide(const char* str)
    return detail::convertBetweenUnicodeFormats<char, wchar_t, convertUtf8StringToWide>(str);

/** Converts a UTF-8 `std::string` to a wide `std::wstring`.
 *
 *  The string is only read through `c_str()`, so it is taken by const reference to avoid
 *  copying it on every call.  Because `c_str()` is used, conversion stops at the first
 *  embedded null character.
 *
 *  @param[in] str  The UTF-8 string to convert.
 *  @returns The result of the `const char*` overload for `str.c_str()`.
 */
inline std::wstring convertUtf8StringToWide(const std::string& str)
{
    return convertUtf8StringToWide(str.c_str());
}

/** Converts a wide (`wchar_t`) string to UTF-8 into a caller provided buffer.
 *
 *  Two variants are present: one treating `wchar_t` as a UTF-16 code unit and one treating it as
 *  a UTF-32 code unit.  Each is guarded by a `static_assert` on the size of `wchar_t`.
 *  NOTE(review): both assertions cannot hold at once, so exactly one variant is presumably
 *  selected by a platform conditional that is not visible here - confirm.
 *
 *  @param[in] str      The wide string to convert.
 *  @param[out] out     The buffer that receives the UTF-8 bytes.
 *  @param[in] outLen   The capacity of @p out in bytes.
 *  @returns The value returned by detail::convertBetweenUnicodeFormatsRaw().
 */
inline size_t convertWideStringToUtf8(const wchar_t* str, char* out, size_t outLen) noexcept
    // 16-bit wchar_t variant: read the input as UTF-16 through a reinterpreted pointer
    static_assert(sizeof(*str) == sizeof(char16_t), "unexpected wchar_t type");
    return detail::convertBetweenUnicodeFormatsRaw<char16_t, char, detail::utf16ToCodePoint, detail::utf8FromCodePoint>(
        reinterpret_cast<const char16_t*>(str), out, outLen);
    // 32-bit wchar_t variant: read the input as UTF-32 through a reinterpreted pointer
    static_assert(sizeof(*str) == sizeof(char32_t), "unexpected wchar_t type");
    return detail::convertBetweenUnicodeFormatsRaw<char32_t, char, detail::utf32ToCodePoint, detail::utf8FromCodePoint>(
        reinterpret_cast<const char32_t*>(str), out, outLen);

inline std::string convertWideStringToUtf8(const wchar_t* str)
    return detail::convertBetweenUnicodeFormats<wchar_t, char, convertWideStringToUtf8>(str);

/** Converts a wide `std::wstring` to a UTF-8 `std::string`.
 *
 *  The string is only read through `c_str()`, so it is taken by const reference to avoid
 *  copying it on every call.  Because `c_str()` is used, conversion stops at the first
 *  embedded null character.
 *
 *  @param[in] str  The wide string to convert.
 *  @returns The result of the `const wchar_t*` overload for `str.c_str()`.
 */
inline std::string convertWideStringToUtf8(const std::wstring& str)
{
    return convertWideStringToUtf8(str.c_str());
}

} // namespace extras
} // namespace carb