// Utf8Parser.h
//
// File members: carb/extras/Utf8Parser.h
// Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//
#pragma once

#include "../Defines.h"
#include "../../omni/extras/ScratchBuffer.h"

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <utility>

namespace carb
{
namespace extras
{

class Utf8Parser
{
public:
    /** The type for a single decoded Unicode code point (a UTF-32 value). */
    using CodePoint = char32_t;

    /** The type for a single UTF-16 code unit. */
    using Utf16CodeUnit = char16_t;

    /** The type for a single byte of a UTF-8 encoded string. */
    using CodeByte = char;

    /** Bit mask type holding a combination of the `fDecode*` and `fEncode*` flags below. */
    using Flags = uint32_t;

    /** Result of classifyUtf16SurrogateMember(): where a code point sits relative to the UTF-16
     *  surrogate range.
     */
    enum class SurrogateMember
    {
        /** The code point is outside of the surrogate range. */
        eNone,

        /** The code point is a high surrogate: `[kSurrogateBaseHigh, kSurrogateBaseLow)`. */
        eHigh,

        /** The code point is a low surrogate: `[kSurrogateBaseLow, kSurrogateMax]`. */
        eLow,
    };

    /** Decode flag: when decoding fails, report @ref kDefaultCodePoint instead of `0` as the
     *  resulting code point (see getFailureCodepoint()).
     */
    static constexpr Flags fDecodeUseDefault = 0x00000001;

    /** Decode flag: when decoding fails, try to skip over the invalid bytes instead of stopping.
     *  parseUtf8() scans forward for the next byte that could start a code point and
     *  lastCodePoint() keeps searching backward.
     */
    static constexpr Flags fDecodeSkipInvalid = 0x00000002;

    /** Encode flag: code points at or above 0x10000 are written by getCodeBytes() as a UTF-16
     *  surrogate pair (two 3-byte UTF-8 sequences) and are counted as 6 bytes by
     *  getLengthInCodeBytes().
     */
    static constexpr Flags fEncodeUseUtf16 = 0x00000004;

    /** When set, nextCodePoint() does not combine an encoded high/low surrogate pair into a single
     *  code point; each surrogate is returned on its own.
     */
    static constexpr Flags fEncodeIgnoreSurrogatePairs = 0x00000008;

    /** Special length value meaning "the string is null terminated; stop at the terminator". */
    static constexpr size_t kNullTerminated = ~0ull;

    /** All-bits-set code point value.
     *  NOTE(review): not referenced by any code visible in this part of the file; presumably a
     *  sentinel for callers - confirm.
     */
    static constexpr CodePoint kInvalidCodePoint = ~0u;

    /** The longest sequence (in bytes) this parser considers for one encoded code point.
     *  lastCodePoint() uses it to bound how far back it looks when not skipping invalid bytes.
     */
    static constexpr size_t kMaxSequenceLength = 7;

    /** The code point reported for failed decodes when @ref fDecodeUseDefault is used
     *  (U+FFFD, the Unicode replacement character).
     */
    static constexpr CodePoint kDefaultCodePoint = 0x0000fffd;

    /** Decodes the code point at the start of a UTF-8 string and finds where the following one starts.
     *
     *  @param[in] str              The string to decode from.
     *  @param[in] lengthInBytes    The number of bytes available in @p str, or
     *                              @ref kNullTerminated for a null terminated string.
     *  @param[out] codepoint       Optionally receives the decoded code point.  On failure it
     *                              receives the failure value chosen by @p flags (`0`, or
     *                              @ref kDefaultCodePoint with @ref fDecodeUseDefault).
     *  @param[in] flags            Decoding flags.  Unless @ref fEncodeIgnoreSurrogatePairs is
     *                              set, an encoded high surrogate must be followed by an encoded
     *                              low surrogate and both are merged into one code point.
     *  @returns The value written by parseUtf8() as the start of the next code point.  This is
     *           `nullptr` at the null terminator, and also when decoding failed and no resume
     *           position was found.
     */
    static const CodeByte* nextCodePoint(const CodeByte* str,
                                         size_t lengthInBytes = kNullTerminated,
                                         CodePoint* codepoint = nullptr,
                                         Flags flags = 0)
    {
        CodePoint first = 0;
        const CodeByte* cursor = nullptr;
        const bool firstOk = parseUtf8(str, &cursor, &first, lengthInBytes, flags);

        // always report what the first decode produced, even when it failed.
        if (codepoint != nullptr)
            *codepoint = first;

        // done unless a successfully decoded high surrogate needs to be paired up.
        const bool mergePairs = (flags & fEncodeIgnoreSurrogatePairs) == 0;
        if (!firstOk || !mergePairs || classifyUtf16SurrogateMember(first) != SurrogateMember::eHigh)
            return cursor;

        // bytes left after the high surrogate (unbounded for null terminated strings).
        size_t remaining = kNullTerminated;
        if (lengthInBytes != kNullTerminated)
            remaining = lengthInBytes - (cursor - str);

        // decode whatever follows; it has to be a low surrogate for the pair to be valid.
        CodePoint second = 0;
        const CodeByte* const secondStart = cursor;
        const bool secondOk = parseUtf8(secondStart, &cursor, &second, remaining, flags);
        const bool isPair = secondOk && classifyUtf16SurrogateMember(second) == SurrogateMember::eLow;

        if (codepoint != nullptr)
        {
            if (isPair)
                *codepoint = (((first & kSurrogateMask) << kSurrogateBits) | (second & kSurrogateMask)) + kSurrogateBias;
            else
                *codepoint = getFailureCodepoint(flags);
        }

        return cursor;
    }

    /** Finds and decodes the last code point of a UTF-8 string by walking the string backward.
     *
     *  @param[in] str              The string to search.  May be `nullptr`.
     *  @param[in] lengthInBytes    The number of bytes in @p str, or @ref kNullTerminated to have
     *                              the length measured with `std::strlen()`.
     *  @param[out] codepoint       Optionally receives the decoded code point.  It is preset to the
     *                              failure value for @p flags (`0`, or @ref kDefaultCodePoint with
     *                              @ref fDecodeUseDefault) and only overwritten on success.
     *  @param[in] flags            Only @ref fDecodeUseDefault and @ref fDecodeSkipInvalid are
     *                              honoured; all other bits are masked off before decoding.
     *  @returns A pointer to the first byte of the last code point (for a surrogate pair, the
     *           first byte of the encoded high surrogate), or `nullptr` if @p str is null, its
     *           first byte is `0`, or no valid code point was found.
     */
    static const CodeByte* lastCodePoint(const CodeByte* str,
                                         size_t lengthInBytes = kNullTerminated,
                                         CodePoint* codepoint = nullptr,
                                         Flags flags = fDecodeUseDefault)
    {
        // Prepare error value result
        if (codepoint != nullptr)
        {
            *codepoint = getFailureCodepoint(flags);
        }

        // Check if it's a null or empty string.  Note that the first byte is inspected even when an
        // explicit length was given.
        if (!str || *str == 0)
        {
            return nullptr;
        }

        if (lengthInBytes == kNullTerminated)
        {
            lengthInBytes = std::strlen(str);
        }

        // Make sure no unexpected flags pass into used `nextCodePoint` function
        constexpr Flags kErrorHandlingMask = fDecodeSkipInvalid | fDecodeUseDefault;
        const Flags helperParserFlags = (flags & kErrorHandlingMask);

        size_t curCodePointSize = 0; // Keeps max number of bytes for a decoding attempt with `nextCodePoint`
        const bool skipInvalid = (flags & fDecodeSkipInvalid) != 0;

        // Walk the string backwards to find the start of the last CodePoint and decode it.
        // Note: it can be a single byte or a sequence.  When not skipping invalid bytes, only the
        // last `kMaxSequenceLength` bytes need to be checked instead of the full string.
        //
        // Note that the 'array-bounds' warning needs to be disabled here since it can pick up the
        // calculation of `rIterEnd` as being out of bounds.  However, for cases of short strings
        // this is intentional since the iterator needs to point to one byte before the start of the
        // string.
        CARB_IGNOREWARNING_GNUC_WITH_PUSH("-Warray-bounds")
        // `rIterBegin` is the last byte of the string; `rIterEnd` is the (exclusive) stop position.
        const CodeByte* const rIterBegin = str - 1 + lengthInBytes;
        const CodeByte* const rIterEnd =
            (flags & fDecodeSkipInvalid) ? str - 1 : CARB_MAX(str - 1, rIterBegin - kMaxSequenceLength);
        for (const CodeByte* rIter = rIterBegin; rIter != rIterEnd; --rIter)
        {
            const uint8_t curByte = static_cast<uint8_t>(*rIter);

            // Count this byte as part of the candidate sequence that ends at the last unconsumed byte
            ++curCodePointSize;

            // Check if the current code byte is a direct ASCII character
            if (curByte < k7BitLimit)
            {
                // If more than one byte was walked then continuation bytes followed an ASCII
                // character, which is an error unless invalid bytes are being skipped
                if (curCodePointSize > 1 && !skipInvalid)
                {
                    return nullptr;
                }

                if (codepoint != nullptr)
                {
                    *codepoint = curByte;
                }
                return rIter;
            }

            // The current code byte is a continuation byte so step further
            if (curByte < kMinLeadByte)
            {
                continue;
            }

            // The current code byte is a lead byte, decode the sequence and check that all bytes were used
            CodePoint cp{};
            const CodeByte* next = nextCodePoint(rIter, curCodePointSize, &cp, helperParserFlags);

            if (!next)
            {
                if (skipInvalid)
                {
                    // Discard the bytes walked so far and keep searching before this lead byte
                    curCodePointSize = 0;
                    continue;
                }

                return nullptr;
            }

            // Validate that all bytes till the end were used if expecting no invalid bytes
            // Ex: "\xce\xa6\xa6" is a 2 byte sequence "\xce\xa6" for a 0x03A6 code point followed by an
            // excess continuation byte "\xa6".  The first 2 bytes will be decoded by `nextCodePoint`
            // properly and `next` will be pointing at the last "\xa6" byte
            if (!skipInvalid && curCodePointSize != static_cast<size_t>(next - rIter))
            {
                return nullptr;
            }

            const SurrogateMember surrogateType = classifyUtf16SurrogateMember(cp);

            // Encountered the high surrogate part first which is an error
            if (CARB_UNLIKELY(surrogateType == SurrogateMember::eHigh))
            {
                if (skipInvalid)
                {
                    // Just skip it and search further
                    curCodePointSize = 0;
                    continue;
                }

                return nullptr;
            }
            // Found the low part of a surrogate pair, need to continue parsing to get the high part
            else if (CARB_UNLIKELY(surrogateType == SurrogateMember::eLow))
            {
                constexpr int kSurrogatePartSize = 3;
                constexpr int kFullSurrogatePairSize = 2 * kSurrogatePartSize;

                // To prepare for possible continuation of parsing if skipping invalid bytes and no high surrogate is
                // found reset the possible CodePoint size
                curCodePointSize = 0;

                // For a valid UTF-8 string there must be a high surrogate (3 bytes) preceding the low
                // surrogate (3 bytes).  Fail/skip if fewer than 3 bytes remain before the stop position.
                if (rIter <= rIterEnd + kSurrogatePartSize)
                {
                    if (skipInvalid)
                    {
                        // Skip the low surrogate data and continue to check the preceding byte
                        continue;
                    }
                    return nullptr;
                }

                // Step 3 bytes preceding the low surrogate
                const CodeByte* const possibleHighSurStart = rIter - kSurrogatePartSize;
                // Check if it starts with a lead byte
                if (static_cast<uint8_t>(*possibleHighSurStart) < kMinLeadByte)
                {
                    if (skipInvalid)
                    {
                        continue;
                    }
                    return nullptr;
                }

                // Try to parse 6 bytes (full surrogate pair size) to get the whole CodePoint without skipping invalid
                // bytes (flags are passed as 0 so a failed decode reports `nullptr`)
                const CodeByte* const decodedPairEnd =
                    nextCodePoint(possibleHighSurStart, kFullSurrogatePairSize, &cp, 0);

                if (!decodedPairEnd)
                {
                    if (skipInvalid)
                    {
                        continue;
                    }
                    return nullptr;
                }

                // Check if used all 6 bytes (as expected from a surrogate pair)
                if (decodedPairEnd - possibleHighSurStart != kFullSurrogatePairSize)
                {
                    if (skipInvalid)
                    {
                        continue;
                    }
                    return nullptr;
                }

                // A proper surrogate pair was parsed into the `cp`
                // and only the `rIter` has invalid value at this point
                rIter = possibleHighSurStart;
                // Just exit the block so the code below reports the result
            }

            if (codepoint)
            {
                *codepoint = cp;
            }

            // Everything is fine thus return start of the sequence
            return rIter;
        }
        CARB_IGNOREWARNING_GNUC_POP

        // Didn't find start of a valid CodePoint
        return nullptr;
    }

    static size_t getLengthInCodePoints(const CodeByte* str, size_t maxLengthInBytes = kNullTerminated, Flags flags = 0)
    {
        const CodeByte* current;
        const CodeByte* next;
        size_t count = 0;

        // get the second codepoint in the string.
        current = str;
        next = nextCodePoint(str, maxLengthInBytes, nullptr, flags);

        // empty or invalid string => fail.
        if (next == nullptr)
            return 0;

        if (maxLengthInBytes != kNullTerminated)
        {
            maxLengthInBytes -= next - current;
        }
        count++;

        do
        {
            current = next;
            next = nextCodePoint(current, maxLengthInBytes, nullptr, flags);

            if (next == nullptr)
                return count;

            if (maxLengthInBytes != kNullTerminated)
            {
                maxLengthInBytes -= next - current;
            }
            count++;
        } while (maxLengthInBytes > 0);

        return count;
    }

    static size_t getLengthInCodeBytes(const CodePoint* str,
                                       size_t maxLengthInCodePoints = kNullTerminated,
                                       Flags flags = 0)
    {
        size_t count = 0;
        size_t largeCodePointSize = 4;

        if ((flags & fEncodeUseUtf16) != 0)
            largeCodePointSize = 6;

        for (size_t i = 0; str[i] != 0 && i < maxLengthInCodePoints; i++)
        {
            if (str[i] < getMaxCodePoint(0))
                count++;

            else if (str[i] < getMaxCodePoint(1))
                count += 2;

            else if (str[i] < getMaxCodePoint(2))
                count += 3;

            else if (str[i] < getMaxCodePoint(3))
                count += largeCodePointSize;

            else if (str[i] < getMaxCodePoint(4))
                count += 5;

            else if (str[i] < getMaxCodePoint(5))
                count += 6;

            else
                count += 7;
        }

        return count;
    }

    /** Calculates how many UTF-8 bytes are needed to encode a UTF-16 string.
     *
     *  @param[in] str                      The UTF-16 string to measure.  `nullptr` yields 0.
     *  @param[in] maxLengthInCodePoints    The maximum number of UTF-16 code units to read from
     *                                      @p str, or @ref kNullTerminated to stop only at the
     *                                      null terminator.  Counting always stops at the first
     *                                      `0` code unit.
     *  @param[in] flags                    With @ref fEncodeUseUtf16, a surrogate pair is counted
     *                                      as 6 bytes instead of 4.
     *  @returns The number of bytes required, not including a null terminator.
     */
    static size_t getLengthInCodeBytes(const Utf16CodeUnit* str,
                                       size_t maxLengthInCodePoints = kNullTerminated,
                                       Flags flags = 0)
    {
        if (str == nullptr)
            return 0;

        size_t count = 0;
        const size_t largeCodePointSize = ((flags & fEncodeUseUtf16) != 0) ? 6 : 4;

        // The bound must be tested before the element is read so that a length-limited buffer
        // without a null terminator is never read past its end.
        for (size_t i = 0; i < maxLengthInCodePoints && str[i] != 0; i++)
        {
            if (str[i] < getMaxCodePoint(0))
                count++;

            else if (str[i] < getMaxCodePoint(1))
                count += 2;

            else
            {
                // found a surrogate pair in the string -> both of these code units will decode to
                // a single UTF-32 codepoint => skip the low surrogate and add the size of a
                // single encoded codepoint.  The low surrogate is only read when it is within
                // the allowed length.
                if (str[i] >= kSurrogateBaseHigh && str[i] < kSurrogateBaseLow && i + 1 < maxLengthInCodePoints &&
                    str[i + 1] >= kSurrogateBaseLow && str[i + 1] <= kSurrogateMax)
                {
                    i++;
                    count += largeCodePointSize;
                }

                // not part of a UTF-16 surrogate pair => this will encode to 3 bytes in UTF-8.
                else
                    count += 3;
            }
        }

        return count;
    }

    /** Decodes the first code point of a UTF-8 string.
     *
     *  @param[in] str              The string to decode from.
     *  @param[in] lengthInBytes    The number of bytes available in @p str, or
     *                              @ref kNullTerminated for a null terminated string.
     *  @param[in] flags            Decoding flags passed through to nextCodePoint().
     *  @returns The code point reported by nextCodePoint(); on failure this is the failure value
     *           selected by @p flags.
     */
    static CodePoint getCodePoint(const CodeByte* str, size_t lengthInBytes = kNullTerminated, Flags flags = 0)
    {
        CodePoint decoded = 0;
        // only the decoded value matters here; the position of the next code point is dropped.
        (void)nextCodePoint(str, lengthInBytes, &decoded, flags);
        return decoded;
    }

    /** Encodes a single code point as UTF-8 into a caller supplied buffer.
     *
     *  @param[in] cp               The code point to encode.
     *  @param[out] str             The buffer that receives the encoded bytes.  No null
     *                              terminator is written.
     *  @param[in] lengthInBytes    The size of @p str in bytes.
     *  @param[out] bytesWritten    Receives the number of bytes stored in @p str (0 on failure).
     *                              This is written unconditionally so it must not be `nullptr`.
     *  @param[in] flags            With @ref fEncodeUseUtf16, code points at or above 0x10000 are
     *                              written as an encoded surrogate pair (two 3-byte sequences).
     *                              Only the low 20 bits of the biased value are representable in
     *                              that form; higher bits are masked off.
     *  @returns @p str on success, or `nullptr` if the buffer is too small or no sequence length
     *           exists for @p cp.
     */
    static CodeByte* getCodeBytes(CodePoint cp, CodeByte* str, size_t lengthInBytes, size_t* bytesWritten, Flags flags = 0)
    {
        // width of a code point in bits; a shift by this many bits or more is undefined.
        constexpr size_t kCodePointBits = sizeof(CodePoint) * 8;

        size_t sequenceLength = 0;
        size_t continuationLength = 0;
        size_t codePointCount = 1;
        CodePoint codePoint[2] = { cp, 0 };
        CodeByte* result;

        // not enough room in the buffer => fail.
        if (lengthInBytes == 0)
        {
            *bytesWritten = 0;
            return nullptr;
        }

        // a 7-bit ASCII character -> this can be directly stored => store and return.
        if (codePoint[0] < k7BitLimit)
        {
            str[0] = CodeByte((codePoint[0] & 0xff));
            *bytesWritten = 1;
            return str;
        }

        // at this point we know that the encoding for the codepoint is going to require at least
        // two bytes.  We need to calculate the sequence length and encode the bytes.

        // allowing a UTF-16 surrogate pair encoding in the string and the codepoint is above the
        //   range where a surrogate pair is necessary => calculate the low and high codepoints
        //   for the pair and set the sequence length.
        if ((flags & fEncodeUseUtf16) != 0 && codePoint[0] >= kSurrogateBias)
        {
            sequenceLength = 3;
            continuationLength = 2;
            codePointCount = 2;

            codePoint[0] -= kSurrogateBias;

            codePoint[1] = kSurrogateBaseLow | (codePoint[0] & kSurrogateMask);
            codePoint[0] = kSurrogateBaseHigh | ((codePoint[0] >> kSurrogateBits) & kSurrogateMask);
        }

        // not using a UTF-16 surrogate pair => search for the required length of the sequence.
        else
        {
            // find the smallest number of continuation bytes that can hold this codepoint.
            for (size_t i = 1; i < kMaxSequenceBytes; i++)
            {
                if (codePoint[0] < getMaxCodePoint(i))
                {
                    sequenceLength = i + 1;
                    continuationLength = i;
                    break;
                }
            }

            // failed to find a sequence length for the given codepoint => fail.  This happens
            //   for 0xffffffff, which is not below any entry of the limits table.
            if (sequenceLength == 0)
            {
                *bytesWritten = 0;
                return nullptr;
            }
        }

        // not enough space in the buffer to store the entire sequence => fail.
        if (lengthInBytes < sequenceLength * codePointCount)
        {
            *bytesWritten = 0;
            return nullptr;
        }

        result = str;

        // write out each of the codepoints.  If UTF-16 encoding is not being used, there will only
        // be one codepoint and this loop will exit after the first iteration.
        const size_t leadShift = continuationLength * kContinuationShift;
        for (size_t j = 0; j < codePointCount; j++)
        {
            cp = codePoint[j];

            // write out the lead byte.  For a 7 byte sequence the shift would be 36 bits, which
            // is undefined on a 32-bit code point; the lead byte carries no payload bits in that
            // case (its mask is 0) so the payload is simply 0.
            const CodePoint leadPayload = (leadShift < kCodePointBits) ? (cp >> leadShift) : 0;
            *str = CodeByte(getLeadByte(continuationLength) | (leadPayload & getLeadMask(continuationLength)));
            str++;

            // write out the continuation bytes, most significant 6-bit group first.
            for (size_t i = 0; i < continuationLength; i++)
            {
                *str = CodeByte(kContinuationBits |
                                ((cp >> ((continuationLength - i - 1) * kContinuationShift)) & kContinuationMask));
                str++;
            }
        }

        *bytesWritten = sequenceLength * codePointCount;
        return result;
    }

    /** Reports whether a code point is a UTF-16 high surrogate, low surrogate, or neither.
     *
     *  @param[in] cp   The code point to classify.
     *  @returns @ref SurrogateMember::eHigh for `[kSurrogateBaseHigh, kSurrogateBaseLow)`,
     *           @ref SurrogateMember::eLow for `[kSurrogateBaseLow, kSurrogateMax]`, and
     *           @ref SurrogateMember::eNone otherwise.
     */
    static SurrogateMember classifyUtf16SurrogateMember(CodePoint cp)
    {
        // outside of the whole surrogate range => not a surrogate at all.
        if (cp < kSurrogateBaseHigh || cp > kSurrogateMax)
            return SurrogateMember::eNone;

        // inside the range: the lower half is the high surrogates, the upper half the low ones.
        return (cp < kSurrogateBaseLow) ? SurrogateMember::eHigh : SurrogateMember::eLow;
    }

    /** Combines a UTF-16 surrogate pair into the code point it represents.
     *
     *  @param[in] high     The high surrogate, in `[kSurrogateBaseHigh, kSurrogateBaseLow)`.
     *  @param[in] low      The low surrogate, in `[kSurrogateBaseLow, kSurrogateMax]`.
     *  @returns The decoded code point, or `0` if either value is outside of its range.
     */
    static CodePoint decodeUtf16CodePoint(CodePoint high, CodePoint low)
    {
        const bool highInRange = (high >= kSurrogateBaseHigh) && (high < kSurrogateBaseLow);
        const bool lowInRange = (low >= kSurrogateBaseLow) && (low <= kSurrogateMax);

        // either half is not a proper surrogate -> cannot decode => fail.
        if (!(highInRange && lowInRange))
            return 0;

        // each surrogate contributes 10 payload bits; the bias restores the 0x10000 offset.
        const CodePoint upperBits = (high & kSurrogateMask) << kSurrogateBits;
        const CodePoint lowerBits = low & kSurrogateMask;
        return (upperBits | lowerBits) + kSurrogateBias;
    }

    /** Encodes a code point as UTF-16, packed into a single 32-bit value.
     *
     *  @param[in] cp       The code point to encode.
     *  @param[out] out     Optionally receives the encoding.  For a code point below 0x10000 this
     *                      is the code point itself.  Otherwise the high surrogate is stored in
     *                      the low 16 bits and the low surrogate in the high 16 bits.
     *  @returns The number of UTF-16 code units in the encoding: 1 or 2.
     */
    static size_t encodeUtf16CodePoint(CodePoint cp, CodePoint* out)
    {
        // too big for a direct encoding => split it into a surrogate pair.
        if (cp >= kSurrogateBias)
        {
            const CodePoint offset = cp - kSurrogateBias;
            const CodePoint lowHalf = kSurrogateBaseLow | (offset & kSurrogateMask);
            const CodePoint highHalf = kSurrogateBaseHigh | ((offset >> kSurrogateBits) & kSurrogateMask);

            if (out != nullptr)
                *out = highHalf | (lowHalf << 16);

            return 2;
        }

        // small enough to be stored as a single code unit.
        if (out != nullptr)
            *out = cp;

        return 1;
    }

    /** Tests whether a code point is one of the whitespace (or related zero-width) characters
     *  listed in the table below.
     *
     *  @param[in] cp   The code point to test.
     *  @returns `true` if @p cp is in the table; `false` otherwise.
     */
    inline static bool isSpaceCodePoint(CodePoint cp)
    {
        // List taken from https://en.wikipedia.org/wiki/Whitespace_character
        // The entries MUST stay in ascending order: the lookup below is a binary search.
        static constexpr CodePoint kSpaceCodePoints[] = {
            0x0009, // character tabulation
            0x000A, // line feed
            0x000B, // line tabulation
            0x000C, // form feed
            0x000D, // carriage return
            0x0020, // space
            0x0085, // next line
            0x00A0, // no-break space
            0x1680, // ogham space mark
            0x180E, // Mongolian vowel separator
            0x2000, // en quad
            0x2001, // em quad
            0x2002, // en space
            0x2003, // em space
            0x2004, // three-per-em space
            0x2005, // four-per-em space
            0x2006, // six-per-em space
            0x2007, // figure space
            0x2008, // punctuation space
            0x2009, // thin space
            0x200A, // hair space
            0x200B, // zero width space
            0x200C, // zero width non-joiner
            0x200D, // zero width joiner
            0x2028, // line separator
            0x2029, // paragraph separator
            0x202F, // narrow no-break space
            0x205F, // medium mathematical space
            0x2060, // word joiner
            0x3000, // ideograph space
            0xFEFF, // zero width non-breaking space
        };

        const CodePoint* const tableBegin = kSpaceCodePoints;
        const CodePoint* const tableEnd = tableBegin + CARB_COUNTOF(kSpaceCodePoints);
        return std::binary_search(tableBegin, tableEnd, cp);
    }

private:
    /** Number of payload bits carried by the lead byte, indexed by the number of continuation
     *  bytes in the sequence.  Used to build the tables in getLeadMask() and getLeadByte().
     */
    static constexpr uint8_t s_leadBits[] = { 7, 5, 4, 3, 2, 1, 0 };

    /** The highest code point defined by Unicode.
     *  NOTE(review): not referenced by any code visible in this part of the file.
     */
    static constexpr CodePoint kMaxCodePoint = 0x0010ffff;

    /** Number of payload bits carried by each continuation byte. */
    static constexpr uint32_t kContinuationShift = 6;

    /** The marker bits (`10xxxxxx`) that identify a continuation byte. */
    static constexpr uint8_t kContinuationBits = 0x80;

    /** Mask selecting the payload bits of a continuation byte (0x3f). */
    static constexpr uint8_t kContinuationMask = (1u << kContinuationShift) - 1;

    /** Value subtracted from a code point before it is split into a surrogate pair, and added
     *  back when a pair is combined.  Code points below this value need no surrogate pair.
     */
    static constexpr CodePoint kSurrogateBias = 0x00010000;

    /** First high surrogate value. */
    static constexpr CodePoint kSurrogateBaseHigh = 0x0000d800;

    /** First low surrogate value (one past the last high surrogate). */
    static constexpr CodePoint kSurrogateBaseLow = 0x0000dc00;

    /** First value of the whole surrogate range (same as @ref kSurrogateBaseHigh). */
    static constexpr CodePoint kSurrogateMin = 0x0000d800;

    /** Last value of the whole surrogate range (the last low surrogate). */
    static constexpr CodePoint kSurrogateMax = 0x0000dfff;

    /** Number of payload bits carried by each member of a surrogate pair. */
    static constexpr uint32_t kSurrogateBits = 10;

    /** Mask selecting the payload bits of a surrogate (0x3ff). */
    static constexpr CodePoint kSurrogateMask = ((1 << kSurrogateBits) - 1);

    /** Exclusive upper bound used by getCodeBytes() when searching for the number of
     *  continuation bytes, i.e. the longest sequence considered is this many bytes.
     */
    static constexpr size_t kMaxSequenceBytes = 7;

    /** Bytes below this value are plain 7-bit ASCII characters. */
    static constexpr uint8_t k7BitLimit = 0x80;

    /** Bytes at or above this value are treated as lead bytes; bytes in
     *  `[k7BitLimit, kMinLeadByte)` are continuation bytes.
     */
    static constexpr uint8_t kMinLeadByte = 0xc0;

    /** Looks up how many continuation bytes follow a given lead byte.
     *
     *  @param[in] leadByte     The lead byte.  Must be in `[kMinLeadByte, 0xff]`; the table is
     *                          indexed without a range check.
     *  @returns The number of continuation bytes, or `0` for the lead bytes the table marks as
     *           invalid (0xc0, 0xc1 and 0xff).
     */
    static constexpr uint8_t getContinuationLength(size_t leadByte)
    {
        // one entry per possible lead byte value, starting at kMinLeadByte (0xc0).
        constexpr uint8_t kContinuationCounts[] = {
            /* 0xc0 - 0xcf */ 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            /* 0xd0 - 0xdf */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            /* 0xe0 - 0xef */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            /* 0xf0 - 0xff */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 0,
        };
        const size_t slot = leadByte - kMinLeadByte;
        return kContinuationCounts[slot];
    }

    /** Retrieves the mask that selects the payload bits of a lead byte.
     *
     *  @param[in] continuationLength   The number of continuation bytes in the sequence (0-6).
     *                                  The table is indexed without a range check.
     *  @returns The mask for the lead byte: 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 or 0x00.
     */
    static constexpr uint8_t getLeadMask(size_t continuationLength)
    {
        // built from s_leadBits using constant indices only.
        constexpr uint8_t kPayloadMasks[] = {
            (1u << s_leadBits[0]) - 1,
            (1u << s_leadBits[1]) - 1,
            (1u << s_leadBits[2]) - 1,
            (1u << s_leadBits[3]) - 1,
            (1u << s_leadBits[4]) - 1,
            (1u << s_leadBits[5]) - 1,
            (1u << s_leadBits[6]) - 1,
        };
        return kPayloadMasks[continuationLength];
    }

    static constexpr uint8_t getLeadByte(size_t continuationLength)
    {
        constexpr uint8_t s_leadBytes[] = {
            (0xffu << (s_leadBits[0] + 1)) & 0xff, (0xffu << (s_leadBits[1] + 1)) & 0xff,
            (0xffu << (s_leadBits[2] + 1)) & 0xff, (0xffu << (s_leadBits[3] + 1)) & 0xff,
            (0xffu << (s_leadBits[4] + 1)) & 0xff, (0xffu << (s_leadBits[5] + 1)) & 0xff,
            (0xffu << (s_leadBits[6] + 1)) & 0xff
        };
        return s_leadBytes[continuationLength];
    }

    /** Retrieves the exclusive upper limit of the code points that fit in a sequence with a
     *  given number of continuation bytes.
     *
     *  @param[in] continuationLength   The number of continuation bytes in the sequence (0-6).
     *                                  The table is indexed without a range check.
     *  @returns The first code point value that is too large for that sequence length.
     */
    static constexpr CodePoint getMaxCodePoint(size_t continuationLength)
    {
        constexpr CodePoint kExclusiveLimits[] = {
            0x00000080, // 1 byte
            0x00000800, // 2 bytes
            0x00010000, // 3 bytes
            0x00200000, // 4 bytes
            0x04000000, // 5 bytes
            0x80000000, // 6 bytes
            0xffffffff, // 7 bytes
        };
        return kExclusiveLimits[continuationLength];
    }

    /** Extracts the payload of a continuation byte and moves it to its position in the code point.
     *
     *  @param[in] byte                 The continuation byte to decode.
     *  @param[in] continuationLength   The number of continuation bytes that remain in the
     *                                  sequence, counting this one.  Must be at least 1; the
     *                                  callers in this class pass values from 1 to 6.
     *  @returns The 6 payload bits of @p byte shifted left by
     *           `(continuationLength - 1) * kContinuationShift` bits.  Bits that do not fit in a
     *           32-bit code point are discarded.
     */
    inline static CodePoint decodeContinuationByte(uint8_t byte, size_t continuationLength)
    {
        // Convert to the unsigned code point type before shifting.  `byte & kContinuationMask`
        // is promoted to a signed `int`, and shifting it by up to 30 bits could overflow `int`.
        const CodePoint payload = static_cast<CodePoint>(byte & kContinuationMask);
        return payload << ((continuationLength - 1) * kContinuationShift);
    }

    /** Selects the code point that is reported when decoding fails.
     *
     *  @param[in] flags    The decoding flags.
     *  @returns @ref kDefaultCodePoint if @ref fDecodeUseDefault is set in @p flags; `0` otherwise.
     */
    static constexpr CodePoint getFailureCodepoint(Flags flags)
    {
        if ((flags & fDecodeUseDefault) == 0)
            return 0;

        return kDefaultCodePoint;
    }

    static bool parseUtf8(const CodeByte* str,
                          const CodeByte** outNext,
                          CodePoint* outCodePoint,
                          size_t lengthInBytes = kNullTerminated,
                          Flags flags = 0)
    {
        auto fail = [&]() -> bool {
            // we weren't asked to attempt to skip over invalid code sequences => just fail out
            if ((flags & fDecodeSkipInvalid) == 0)
            {
                return false;
            }

            // walk the rest of the string skipping over continuation bytes and invalid lead bytes.
            // Note that we've already tested and rejected the first byte so we just need to continue
            // the search starting at the next byte.
            for (size_t i = 1; i < lengthInBytes; i++)
            {
                const auto b = static_cast<uint8_t>(str[i]);
                // continuation byte => skip it.
                if ((b & ~kContinuationMask) == kContinuationBits)
                    continue;

                // invalid lead byte => skip it.
                if (b >= kMinLeadByte && getContinuationLength(b) == 0)
                    continue;

                // invalid range of bytes
                if (b >= k7BitLimit && b < kMinLeadByte)
                    continue;

                *outNext = str + i;
                return false;
            }

            // We've hit the end of the string.  This mean that the sequence is
            // either invalid, misaligned, or an illegal overlong sequence was
            // used.  We aren't able to write out the next character pointer if
            // we hit this point.
            return false;
        };

        // initialize to failure values;
        *outCodePoint = getFailureCodepoint(flags);
        *outNext = nullptr;

        // the string doesn't have any more bytes in it -> no more codepoints => fail.
        if (lengthInBytes == 0)
        {
            return false;
        }

        const auto byte = static_cast<uint8_t>(*str);

        // the current code byte is at the null terminator -> no more codepoints => finish.
        if (byte == '\0')
        {
            *outCodePoint = byte;
            return true;
        }

        // the current code byte is a direct ASCII character => finish.
        if (byte < k7BitLimit)
        {
            *outCodePoint = byte;
            *outNext = str + 1;
            return true;
        }

        if (byte < kMinLeadByte)
        {
            return fail();
        }

        // the current code byte is a lead byte => calculate the sequence length and return the
        //   start of the next codepoint.
        const size_t continuationLength = getContinuationLength(byte);
        const size_t sequenceLength = continuationLength + 1;

        // not enough bytes left in the string to complete this codepoint => fail.
        // continuationLength of 0 is invalid => fail
        if (lengthInBytes < sequenceLength || continuationLength == 0)
        {
            return fail();
        }

        // decode the codepoint.
        {
            CodePoint cp = (byte & getLeadMask(continuationLength)) << (continuationLength * kContinuationShift);

            for (size_t i = 0; i < continuationLength; i++)
            {
                // validate the continuation byte so we don't walk past the
                // end of a null terminated string
                if ((uint8_t(str[i + 1]) & ~kContinuationMask) != kContinuationBits)
                {
                    return fail();
                }

                cp |= decodeContinuationByte(str[i + 1], continuationLength - i);
            }

            *outCodePoint = cp;
            *outNext = str + sequenceLength;
            return true;
        }
    }
};

/** A forward iterator over the code points of a UTF-8 string.
 *
 *  The iterator decodes one code point at a time with Utf8Parser::nextCodePoint().  It does not
 *  copy the string, so the string must outlive the iterator.  The iterator becomes invalid
 *  (tests as `false`) once the end of the string or undecodable data is reached.
 */
class Utf8Iterator
{
public:
    /** The type for a single byte of the UTF-8 string. */
    using CodeByte = Utf8Parser::CodeByte;
    /** The type for a decoded code point. */
    using CodePoint = Utf8Parser::CodePoint;
    /** The type for the decoding flags. */
    using Flags = Utf8Parser::Flags;
    /** The special length value used for null terminated strings. */
    static constexpr size_t kNullTerminated = Utf8Parser::kNullTerminated;

    /** Constructs an iterator that is not attached to any string.  It tests as invalid. */
    Utf8Iterator()
        : m_prev(nullptr), m_string(nullptr), m_length(kNullTerminated), m_flags(0), m_lastCodePoint(0), m_index(0)
    {
    }

    /** Constructs an iterator positioned on the first code point of a string.
     *
     *  @param[in] string           The string to walk.  May be `nullptr`.
     *  @param[in] lengthInBytes    The number of bytes in @p string, or @ref kNullTerminated.
     *  @param[in] flags            Decoding flags passed to Utf8Parser::nextCodePoint().
     */
    Utf8Iterator(const CodeByte* string, size_t lengthInBytes = kNullTerminated, Flags flags = 0)
        : m_prev(nullptr), m_string(string), m_length(lengthInBytes), m_flags(flags), m_lastCodePoint(0), m_index(0)
    {
        next();
    }

    /** Copy constructor: duplicates the position and settings of @p it. */
    Utf8Iterator(const Utf8Iterator& it)
    {
        copy(it);
    }

    /** @returns `true` while the iterator refers to a decoded, non-zero code point. */
    operator bool() const
    {
        return isValid();
    }

    /** @returns `true` once the iterator no longer refers to a decoded code point. */
    bool operator!() const
    {
        return !isValid();
    }

    /** @returns The most recently decoded code point. */
    CodePoint operator*() const
    {
        return m_lastCodePoint;
    }

    /** @returns A pointer to the first byte of the current code point (not the address of the
     *           iterator object), or `nullptr` if there is none.
     */
    const CodeByte* operator&() const
    {
        return m_prev;
    }

    /** Pre-increment: advances to the next code point and returns this iterator. */
    Utf8Iterator& operator++()
    {
        next();
        return *this;
    }

    /** Post-increment: advances to the next code point and returns a copy of the old state. */
    Utf8Iterator operator++(int32_t)
    {
        Utf8Iterator tmp = (*this);
        next();
        return tmp;
    }

    /** Advances by up to @p count code points, stopping early once there is no current code point. */
    template <typename T>
    Utf8Iterator& operator+=(T count)
    {
        for (T i = 0; i < count && m_prev != nullptr; i++)
            next();

        return *this;
    }

    /** @returns A copy of this iterator advanced by up to @p count code points. */
    template <typename T>
    Utf8Iterator operator+(T count) const
    {
        Utf8Iterator tmp = *this;
        return (tmp += count);
    }

    // The comparison operators below compare the positions of the *next* code point to be
    // decoded (m_string); no other state takes part in the comparison.

    /** @returns `true` if both iterators have the same next-decode position. */
    bool operator==(const Utf8Iterator& it) const
    {
        return m_string == it.m_string;
    }

    /** @returns `true` if the iterators have different next-decode positions. */
    bool operator!=(const Utf8Iterator& it) const
    {
        return m_string != it.m_string;
    }

    /** @returns `true` if this iterator's next-decode position is before that of @p it. */
    bool operator<(const Utf8Iterator& it) const
    {
        return m_string < it.m_string;
    }

    /** @returns `true` if this iterator's next-decode position is at or before that of @p it. */
    bool operator<=(const Utf8Iterator& it) const
    {
        return m_string <= it.m_string;
    }

    /** @returns `true` if this iterator's next-decode position is after that of @p it. */
    bool operator>(const Utf8Iterator& it) const
    {
        return m_string > it.m_string;
    }

    /** @returns `true` if this iterator's next-decode position is at or after that of @p it. */
    bool operator>=(const Utf8Iterator& it) const
    {
        return m_string >= it.m_string;
    }

    /** Copy assignment: duplicates the position and settings of @p it. */
    Utf8Iterator& operator=(const Utf8Iterator& it)
    {
        // Note: normally we'd check for an identity assignment in this operator overload and
        //       ignore.  Unfortunately we can't do that here since we also override the '&'
        //       operator above.  Since this copy operation should still be safe for an identity
        //       assignment, we'll just let it proceed.
        copy(it);
        return *this;
    }

    /** Re-targets the iterator at the start of a new null terminated string.
     *
     *  The length is reset to @ref kNullTerminated and the flags are reset to 0.
     *
     *  @param[in] str  The new string to walk.  May be `nullptr`.
     */
    Utf8Iterator& operator=(const CodeByte* str)
    {
        m_prev = nullptr;
        m_string = str;
        m_length = kNullTerminated;
        m_lastCodePoint = 0;
        m_flags = 0;
        m_index = 0;
        next();
        return *this;
    }

    /** @returns The zero based index (in code points) of the current code point.  The value
     *           wraps around to the largest `size_t` if no code point was ever decoded.
     */
    size_t getIndex() const
    {
        return m_index - 1;
    }

    /** @returns The size in bytes of the current code point.  When the last decode did not
     *           yield a following position, this is 1 if there is a current position and 0
     *           otherwise.
     */
    size_t getCodepointSize() const
    {
        if (m_string == nullptr)
            return m_prev == nullptr ? 0 : 1;

        return m_string - m_prev;
    }

private:
    /** Copies every member from @p it. */
    void copy(const Utf8Iterator& it)
    {
        m_prev = it.m_prev;
        m_string = it.m_string;
        m_length = it.m_length;
        m_flags = it.m_flags;
        m_lastCodePoint = it.m_lastCodePoint;
        m_index = it.m_index;
    }

    /** @returns `true` if there is a following position and the last decoded code point is not 0. */
    bool isValid() const
    {
        return m_string != nullptr && m_lastCodePoint != 0;
    }

    /** Decodes the code point at `m_string` and advances the iterator state past it. */
    void next()
    {
        const CodeByte* ptr;

        // already finished (or never attached to a string) => there is no current code point.
        if (m_string == nullptr)
        {
            m_prev = nullptr;
            return;
        }

        // all of the bytes of a length-limited string were consumed => finish.
        if (m_length == 0)
        {
            m_string = nullptr;
            m_prev = nullptr;
            m_lastCodePoint = 0;
            return;
        }

        ptr = Utf8Parser::nextCodePoint(m_string, m_length, &m_lastCodePoint, m_flags);

        // Only charge consumed bytes against the remaining length when a next position was
        // produced.  `ptr` is null at the terminator and on decode failure, and subtracting
        // from a null pointer is undefined.
        if (m_length != kNullTerminated && ptr != nullptr)
            m_length -= ptr - m_string;

        m_prev = m_string;
        m_string = ptr;
        m_index++;
    }

    /** Start of the current (most recently decoded) code point, or `nullptr`. */
    const CodeByte* m_prev;

    /** Start of the next code point to decode, or `nullptr` once iteration has finished. */
    const CodeByte* m_string;

    /** Bytes remaining from `m_string`, or @ref kNullTerminated. */
    size_t m_length;

    /** Decoding flags passed to Utf8Parser::nextCodePoint(). */
    Flags m_flags;

    /** The most recently decoded code point. */
    CodePoint m_lastCodePoint;

    /** Number of decode attempts performed so far (one more than the current index). */
    size_t m_index;
};

// implementation details used for string conversions - ignore this!
#ifndef DOXYGEN_SHOULD_SKIP_THIS
namespace detail
{
template <typename T,
          typename O,
          std::pair<size_t, char32_t>(toCodePoint)(const T*),
          size_t(fromCodePoint)(char32_t c, O* out, size_t outLen)>
inline size_t convertBetweenUnicodeFormatsRaw(const T* str, O* out, size_t outLen)
{
    // the last element written to the output
    O* prev = nullptr;
    size_t prevLen = 0;
    size_t written = 0;
    size_t read = 0;
    bool fullyRead = false;

    if (str == nullptr)
    {
        return 0;
    }

    // no output => ignore outLen in the loop
    if (out == nullptr)
    {
        outLen = SIZE_MAX;
    }

    if (outLen == 0)
    {
        return 0;
    }

    for (;;)
    {
        size_t len;

        // decode the input to UTF-32
        std::pair<size_t, char32_t> decoded = toCodePoint(str + read);

        // decode failed
        if (decoded.first == 0)
        {
            break;
        }

        // encode from UTF-32 to the output format
        len = fromCodePoint(decoded.second, (out == nullptr) ? nullptr : out + written, outLen - written);

        // out of buffer space
        if (len == 0)
        {
            break;
        }

        // store the last written character
        if (out != nullptr)
        {
            prev = out + written;
        }
        prevLen = len;

        // advance the indices
        written += len;
        read += decoded.first;

        // hit the null terminator => finished
        if (decoded.second == '\0')
        {
            fullyRead = true;
            break;
        }
    }

    // if the string was truncated, we need to cut out the last character
    // from the written count
    if (!fullyRead)
    {
        if (written == outLen)
        {
            written -= prevLen;
            written += 1;

            if (out != nullptr)
            {
                *prev = '\0';
            }
        }
        else
        {
            if (out != nullptr)
            {
                out[written] = '\0';
            }
            written++;
        }
    }

    return written;
}

/** Decodes the UTF-8 code point at the start of a null terminated string.
 *
 *  Decoding is done with `Utf8Parser::fDecodeUseDefault`.
 *
 *  @param[in] str  The position to decode from.
 *  @returns The number of bytes to advance and the decoded code point.  A decoded value of
 *           zero is reported as a single-byte terminator.  When the parser reports no next
 *           position, a single byte is skipped.
 */
inline std::pair<size_t, char32_t> utf8ToCodePoint(const char* str)
{
    using Decoded = std::pair<size_t, char32_t>;

    char32_t codePoint = 0;
    const char* const after =
        Utf8Parser::nextCodePoint(str, Utf8Parser::kNullTerminated, &codePoint, Utf8Parser::fDecodeUseDefault);

    // end of the string.
    if (codePoint == '\0')
    {
        return Decoded(1, '\0');
    }

    // invalid character, skip it.
    if (after == nullptr)
    {
        return Decoded(1, codePoint);
    }

    return Decoded(after - str, codePoint);
}

/** Writes a code point to a UTF-32 buffer.
 *
 *  @param[in]  c       The code point to store.
 *  @param[out] out     Receives the code point.  May be nullptr to only count.
 *  @param[in]  outLen  Number of elements available at @p out.
 *  @returns 1 when there is room for the code point, or 0 when @p outLen is 0.
 */
inline size_t utf32FromCodePoint(char32_t c, char32_t* out, size_t outLen)
{
    if (outLen > 0)
    {
        if (out != nullptr)
        {
            *out = c;
        }
        return 1;
    }

    // no room left.
    return 0;
}

/** Reads the code point at the start of a UTF-32 string.
 *
 *  @param[in] str  The position to read from.
 *  @returns A pair holding the element count consumed (always 1) and the value at @p str.
 */
inline std::pair<size_t, char32_t> utf32ToCodePoint(const char32_t* str)
{
    // UTF-32 stores one code point per element, so nothing needs decoding.
    const std::pair<size_t, char32_t> decoded(1, str[0]);
    return decoded;
}

/** Encodes a code point as UTF-8.
 *
 *  @param[in]  c       The code point to encode.
 *  @param[out] out     Receives the encoded bytes.  May be nullptr, in which case the bytes
 *                      are encoded into a local scratch buffer and only the length is used.
 *  @param[in]  outLen  Number of bytes available at @p out.
 *  @returns The number of bytes reported by Utf8Parser::getCodeBytes(), or 0 when that
 *           call returns nullptr.
 */
inline size_t utf8FromCodePoint(char32_t c, char* out, size_t outLen)
{
    char scratch[8];
    size_t byteCount = 0;

    // measuring only => encode into the scratch buffer, limited to its capacity.
    if (out == nullptr)
    {
        out = scratch;
        outLen = CARB_MIN(outLen, CARB_COUNTOF(scratch));
    }

    const char* const encoded = Utf8Parser::getCodeBytes(c, out, outLen, &byteCount);
    return (encoded != nullptr) ? byteCount : 0;
}

/** Decodes the code point at the start of a null terminated UTF-16 string.
 *
 *  @param[in] str  The position to decode from.  When `str[0]` is a high surrogate,
 *                  `str[1]` is read as well.
 *  @returns The number of UTF-16 units consumed and the decoded code point.  A valid
 *           surrogate pair consumes 2 units.  A stray low surrogate, or a pair that decodes
 *           to 0, consumes 1 unit and yields `Utf8Parser::kDefaultCodePoint`.  Any other
 *           unit is returned unchanged.
 */
inline std::pair<size_t, char32_t> utf16ToCodePoint(const char16_t* str)
{
    using Decoded = std::pair<size_t, char32_t>;

    const auto member = Utf8Parser::classifyUtf16SurrogateMember(str[0]);

    if (member == Utf8Parser::SurrogateMember::eHigh)
    {
        const char32_t combined = Utf8Parser::decodeUtf16CodePoint(str[0], str[1]);
        if (combined != 0)
        {
            return Decoded(2, combined);
        }
        // invalid surrogate pair => fall through to the failure path.
    }
    else if (member != Utf8Parser::SurrogateMember::eLow)
    {
        // not part of a surrogate pair => the unit is the code point.
        return Decoded(1, str[0]);
    }

    // A stray low surrogate or a broken pair: report the replacement code point.  The local
    // copy of the constant is kept from the original code (tagged CC-1110).
    constexpr static auto kDefaultCodePoint_ = Utf8Parser::kDefaultCodePoint; // CC-1110
    return Decoded(1, kDefaultCodePoint_);
}

/** Encodes a code point as UTF-16.
 *
 *  @param[in]  c       The code point to encode.
 *  @param[out] out     Receives the encoded units.  May be nullptr to only count.
 *  @param[in]  outLen  Number of units available at @p out.
 *  @returns The unit count reported by Utf8Parser::encodeUtf16CodePoint(), or 0 when
 *           @p outLen is smaller than that count.
 */
inline size_t utf16FromCodePoint(char32_t c, char16_t* out, size_t outLen)
{
    char32_t packed = 0;
    const size_t units = Utf8Parser::encodeUtf16CodePoint(c, &packed);

    // not enough room for the whole encoding.
    if (units > outLen)
    {
        return 0;
    }

    if (out != nullptr)
    {
        if (units == 1)
        {
            out[0] = char16_t(packed);
        }
        else if (units == 2)
        {
            // the encoder packs both units into one value: low 16 bits first, then high.
            out[0] = char16_t(packed & 0xFFFF);
            out[1] = char16_t(packed >> 16);
        }
        // any other count writes nothing.
    }

    return units;
}

template <typename T, typename O, size_t conv(const T* str, O* out, size_t outLen)>
inline std::basic_string<O> convertBetweenUnicodeFormats(const T* str)
{
    omni::extras::ScratchBuffer<O, 4096> buffer;
    size_t len = conv(str, nullptr, 0);
    if (len == 0 || !buffer.resize(len))
    {
        return std::basic_string<O>();
    }
    conv(str, buffer.data(), buffer.size());
    return std::basic_string<O>(buffer.data(), buffer.data() + len - 1);
}
} // namespace detail
#endif

/** Converts a null terminated UTF-8 string to UTF-32.
 *
 *  @param[in]  str     The UTF-8 string to convert.  When nullptr, 0 is returned.
 *  @param[out] out     Receives the null terminated UTF-32 string.  May be nullptr to only
 *                      compute the required element count.
 *  @param[in]  outLen  Capacity of @p out in `char32_t` elements.  Ignored when @p out is
 *                      nullptr.
 *  @returns The number of `char32_t` elements produced, including the null terminator.
 */
inline size_t convertUtf8StringToUtf32(const char* str, char32_t* out, size_t outLen) noexcept
{
    const size_t count =
        detail::convertBetweenUnicodeFormatsRaw<char, char32_t, detail::utf8ToCodePoint, detail::utf32FromCodePoint>(
            str, out, outLen);
    return count;
}

/** Converts a null terminated UTF-8 string to a UTF-32 string object.
 *
 *  @param[in] str  The UTF-8 string to convert.
 *  @returns The converted string; empty when nothing could be converted.
 */
inline std::u32string convertUtf8StringToUtf32(const char* str)
{
    std::u32string converted = detail::convertBetweenUnicodeFormats<char, char32_t, convertUtf8StringToUtf32>(str);
    return converted;
}

/** Converts a UTF-8 string object to a UTF-32 string object.
 *
 *  The input is taken by const reference so that calling this with an lvalue does not copy
 *  the whole string just to read it.
 *
 *  @param[in] str  The UTF-8 string to convert.  It is passed on through `c_str()`, so
 *                  conversion stops at the first null character.
 *  @returns The converted string.
 */
inline std::u32string convertUtf8StringToUtf32(const std::string& str)
{
    return convertUtf8StringToUtf32(str.c_str());
}

/** Converts a null terminated UTF-32 string to UTF-8.
 *
 *  Marked `noexcept` to match the other raw converters in this file;
 *  convertWideStringToUtf8() is already `noexcept` and uses this same instantiation on
 *  platforms with a 32-bit `wchar_t`.
 *
 *  @param[in]  str     The UTF-32 string to convert.  When nullptr, 0 is returned.
 *  @param[out] out     Receives the null terminated UTF-8 string.  May be nullptr to only
 *                      compute the required byte count.
 *  @param[in]  outLen  Capacity of @p out in bytes.  Ignored when @p out is nullptr.
 *  @returns The number of bytes produced, including the null terminator.
 */
inline size_t convertUtf32StringToUtf8(const char32_t* str, char* out, size_t outLen) noexcept
{
    return detail::convertBetweenUnicodeFormatsRaw<char32_t, char, detail::utf32ToCodePoint, detail::utf8FromCodePoint>(
        str, out, outLen);
}

/** Converts a null terminated UTF-32 string to a UTF-8 string object.
 *
 *  @param[in] str  The UTF-32 string to convert.
 *  @returns The converted string; empty when nothing could be converted.
 */
inline std::string convertUtf32StringToUtf8(const char32_t* str)
{
    std::string converted = detail::convertBetweenUnicodeFormats<char32_t, char, convertUtf32StringToUtf8>(str);
    return converted;
}

/** Converts a UTF-32 string object to a UTF-8 string object.
 *
 *  The input is taken by const reference so that calling this with an lvalue does not copy
 *  the whole string just to read it.
 *
 *  @param[in] str  The UTF-32 string to convert.  It is passed on through `c_str()`, so
 *                  conversion stops at the first null character.
 *  @returns The converted string.
 */
inline std::string convertUtf32StringToUtf8(const std::u32string& str)
{
    return convertUtf32StringToUtf8(str.c_str());
}

/** Converts a null terminated UTF-16 string to UTF-8.
 *
 *  Marked `noexcept` to match the other raw converters in this file;
 *  convertWideStringToUtf8() is already `noexcept` and uses this same instantiation on
 *  Windows.
 *
 *  @param[in]  str     The UTF-16 string to convert.  When nullptr, 0 is returned.
 *  @param[out] out     Receives the null terminated UTF-8 string.  May be nullptr to only
 *                      compute the required byte count.
 *  @param[in]  outLen  Capacity of @p out in bytes.  Ignored when @p out is nullptr.
 *  @returns The number of bytes produced, including the null terminator.
 */
inline size_t convertUtf16StringToUtf8(const char16_t* str, char* out, size_t outLen) noexcept
{
    return detail::convertBetweenUnicodeFormatsRaw<char16_t, char, detail::utf16ToCodePoint, detail::utf8FromCodePoint>(
        str, out, outLen);
}

/** Converts a null terminated UTF-16 string to a UTF-8 string object.
 *
 *  @param[in] str  The UTF-16 string to convert.
 *  @returns The converted string; empty when nothing could be converted.
 */
inline std::string convertUtf16StringToUtf8(const char16_t* str)
{
    std::string converted = detail::convertBetweenUnicodeFormats<char16_t, char, convertUtf16StringToUtf8>(str);
    return converted;
}

/** Converts a UTF-16 string object to a UTF-8 string object.
 *
 *  The input is taken by const reference so that calling this with an lvalue does not copy
 *  the whole string just to read it.
 *
 *  @param[in] str  The UTF-16 string to convert.  It is passed on through `c_str()`, so
 *                  conversion stops at the first null character.
 *  @returns The converted string.
 */
inline std::string convertUtf16StringToUtf8(const std::u16string& str)
{
    return convertUtf16StringToUtf8(str.c_str());
}

/** Converts a null terminated UTF-8 string to UTF-16.
 *
 *  @param[in]  str     The UTF-8 string to convert.  When nullptr, 0 is returned.
 *  @param[out] out     Receives the null terminated UTF-16 string.  May be nullptr to only
 *                      compute the required element count.
 *  @param[in]  outLen  Capacity of @p out in `char16_t` elements.  Ignored when @p out is
 *                      nullptr.
 *  @returns The number of `char16_t` elements produced, including the null terminator.
 */
inline size_t convertUtf8StringToUtf16(const char* str, char16_t* out, size_t outLen) noexcept
{
    const size_t count =
        detail::convertBetweenUnicodeFormatsRaw<char, char16_t, detail::utf8ToCodePoint, detail::utf16FromCodePoint>(
            str, out, outLen);
    return count;
}

/** Converts a null terminated UTF-8 string to a UTF-16 string object.
 *
 *  @param[in] str  The UTF-8 string to convert.
 *  @returns The converted string; empty when nothing could be converted.
 */
inline std::u16string convertUtf8StringToUtf16(const char* str)
{
    std::u16string converted = detail::convertBetweenUnicodeFormats<char, char16_t, convertUtf8StringToUtf16>(str);
    return converted;
}

/** Converts a UTF-8 string object to a UTF-16 string object.
 *
 *  The input is taken by const reference so that calling this with an lvalue does not copy
 *  the whole string just to read it.
 *
 *  @param[in] str  The UTF-8 string to convert.  It is passed on through `c_str()`, so
 *                  conversion stops at the first null character.
 *  @returns The converted string.
 */
inline std::u16string convertUtf8StringToUtf16(const std::string& str)
{
    return convertUtf8StringToUtf16(str.c_str());
}

/** Converts a null terminated UTF-8 string to a wide character string.
 *
 *  The wide string is produced as UTF-16 on Windows and as UTF-32 elsewhere; a static
 *  assertion checks that `wchar_t` has the matching size.
 *
 *  @param[in]  str     The UTF-8 string to convert.  When nullptr, 0 is returned.
 *  @param[out] out     Receives the null terminated wide string.  May be nullptr to only
 *                      compute the required element count.
 *  @param[in]  outLen  Capacity of @p out in `wchar_t` elements.  Ignored when @p out is
 *                      nullptr.
 *  @returns The number of `wchar_t` elements produced, including the null terminator.
 */
inline size_t convertUtf8StringToWide(const char* str, wchar_t* out, size_t outLen) noexcept
{
#if CARB_PLATFORM_WINDOWS
    static_assert(sizeof(wchar_t) == sizeof(char16_t), "unexpected wchar_t type");
    char16_t* const units = reinterpret_cast<char16_t*>(out);
    return detail::convertBetweenUnicodeFormatsRaw<char, char16_t, detail::utf8ToCodePoint, detail::utf16FromCodePoint>(
        str, units, outLen);
#else
    static_assert(sizeof(wchar_t) == sizeof(char32_t), "unexpected wchar_t type");
    char32_t* const units = reinterpret_cast<char32_t*>(out);
    return detail::convertBetweenUnicodeFormatsRaw<char, char32_t, detail::utf8ToCodePoint, detail::utf32FromCodePoint>(
        str, units, outLen);
#endif
}

/** Converts a null terminated UTF-8 string to a wide string object.
 *
 *  @param[in] str  The UTF-8 string to convert.
 *  @returns The converted string; empty when nothing could be converted.
 */
inline std::wstring convertUtf8StringToWide(const char* str)
{
    std::wstring converted = detail::convertBetweenUnicodeFormats<char, wchar_t, convertUtf8StringToWide>(str);
    return converted;
}

/** Converts a UTF-8 string object to a wide string object.
 *
 *  The input is taken by const reference so that calling this with an lvalue does not copy
 *  the whole string just to read it.
 *
 *  @param[in] str  The UTF-8 string to convert.  It is passed on through `c_str()`, so
 *                  conversion stops at the first null character.
 *  @returns The converted string.
 */
inline std::wstring convertUtf8StringToWide(const std::string& str)
{
    return convertUtf8StringToWide(str.c_str());
}

/** Converts a null terminated wide character string to UTF-8.
 *
 *  The wide string is read as UTF-16 on Windows and as UTF-32 elsewhere; a static assertion
 *  checks that `wchar_t` has the matching size.
 *
 *  @param[in]  str     The wide string to convert.  When nullptr, 0 is returned.
 *  @param[out] out     Receives the null terminated UTF-8 string.  May be nullptr to only
 *                      compute the required byte count.
 *  @param[in]  outLen  Capacity of @p out in bytes.  Ignored when @p out is nullptr.
 *  @returns The number of bytes produced, including the null terminator.
 */
inline size_t convertWideStringToUtf8(const wchar_t* str, char* out, size_t outLen) noexcept
{
#if CARB_PLATFORM_WINDOWS
    static_assert(sizeof(wchar_t) == sizeof(char16_t), "unexpected wchar_t type");
    const char16_t* const units = reinterpret_cast<const char16_t*>(str);
    return detail::convertBetweenUnicodeFormatsRaw<char16_t, char, detail::utf16ToCodePoint, detail::utf8FromCodePoint>(
        units, out, outLen);
#else
    static_assert(sizeof(wchar_t) == sizeof(char32_t), "unexpected wchar_t type");
    const char32_t* const units = reinterpret_cast<const char32_t*>(str);
    return detail::convertBetweenUnicodeFormatsRaw<char32_t, char, detail::utf32ToCodePoint, detail::utf8FromCodePoint>(
        units, out, outLen);
#endif
}

/** Converts a null terminated wide character string to a UTF-8 string object.
 *
 *  @param[in] str  The wide string to convert.
 *  @returns The converted string; empty when nothing could be converted.
 */
inline std::string convertWideStringToUtf8(const wchar_t* str)
{
    std::string converted = detail::convertBetweenUnicodeFormats<wchar_t, char, convertWideStringToUtf8>(str);
    return converted;
}

/** Converts a wide string object to a UTF-8 string object.
 *
 *  The input is taken by const reference so that calling this with an lvalue does not copy
 *  the whole string just to read it.
 *
 *  @param[in] str  The wide string to convert.  It is passed on through `c_str()`, so
 *                  conversion stops at the first null character.
 *  @returns The converted string.
 */
inline std::string convertWideStringToUtf8(const std::wstring& str)
{
    return convertWideStringToUtf8(str.c_str());
}

} // namespace extras
} // namespace carb