Retro68/gcc/libphobos/libdruntime/rt/util/utf.d

/********************************************
 * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
 *
 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
 * wchar type.
 * For Posix systems, the C wchar_t type is UTF-32 and corresponds to
 * the D utf.dchar type.
 *
 * UTF character support is restricted to (\u0000 &lt;= character &lt;= \U0010FFFF).
 *
 * See_Also:
 *      $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
 *      $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
 *      $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
 *
 * Copyright: Copyright Digital Mars 2003 - 2016.
 * License:   $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   Walter Bright, Sean Kelly
 * Source:    $(DRUNTIMESRC src/rt/util/_utf.d)
 */

module rt.util.utf;

extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ ) @safe pure;

/*******************************
 * Test if c is a valid UTF-32 character.
 *
 * \uFFFE and \uFFFF are considered valid by this function,
 * as they are permitted for internal use by an application,
 * but they are not allowed for interchange by the Unicode standard.
 *
 * Returns: true if it is, false if not.
 */

@safe @nogc pure nothrow
bool isValidDchar(dchar c)
{
    /* Anything above U+10FFFF is rejected outright.  Below that limit
     * only the surrogate range U+D800 .. U+DFFF is excluded.
     *
     * U+FFFE and U+FFFF are deliberately NOT rejected: they are allowed
     * for application-internal use even though they must not be used
     * for interchange.
     */
    if (c > 0x10FFFF)
        return false;

    return c < 0xD800 || c > 0xDFFF;
}

unittest
{
    // printf is not imported anywhere else in this module, so pull it in
    // locally, and only when the `utf` debug identifier is enabled.
    debug(utf)
    {
        import core.stdc.stdio : printf;
        printf("utf.isValidDchar.unittest\n");
    }

    assert(isValidDchar(cast(dchar)'a') == true);
    assert(isValidDchar(cast(dchar)0x1FFFFF) == false);

    // Edges of the surrogate gap.
    assert(isValidDchar(cast(dchar)0xD7FF));
    assert(!isValidDchar(cast(dchar)0xD800));
    assert(!isValidDchar(cast(dchar)0xDFFF));
    assert(isValidDchar(cast(dchar)0xE000));

    // Upper end of the Unicode range.
    assert(isValidDchar(cast(dchar)0x10FFFF));
    assert(!isValidDchar(cast(dchar)0x110000));

    // U+FFFE and U+FFFF are accepted on purpose.
    assert(isValidDchar(cast(dchar)0xFFFE));
    assert(isValidDchar(cast(dchar)0xFFFF));
}


// Lookup table indexed by the value of a UTF-8 byte (256 entries, 16 per row).
// Each entry is the sequence length announced by that byte when it is read
// as a lead byte, or 0xFF when the byte cannot start a sequence:
//   0x00 .. 0x7F  -> 1
//   0x80 .. 0xBF  -> 0xFF
//   0xC0 .. 0xDF  -> 2
//   0xE0 .. 0xEF  -> 3
//   0xF0 .. 0xF7  -> 4
//   0xF8 .. 0xFB  -> 5
//   0xFC .. 0xFD  -> 6
//   0xFE .. 0xFF  -> 0xFF
// The leading cast(ubyte) on the first element makes the array literal's
// element type ubyte.
static immutable UTF8stride =
[
    cast(ubyte)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];

/**
 * stride() returns the length of a UTF-8 sequence starting at index i
 * in string s.
 * Returns:
 *      The number of bytes in the UTF-8 sequence or
 *      0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
@safe @nogc pure nothrow
uint stride(in char[] s, size_t i)
{
    // The byte at s[i] is used directly as the index into the stride table;
    // the table yields either a sequence length or the 0xFF marker.
    const size_t lead = s[i];
    return UTF8stride[lead];
}

/**
 * stride() returns the length of a UTF-16 sequence starting at index i
 * in string s.
 */
@safe @nogc pure nothrow
uint stride(in wchar[] s, size_t i)
{
    // A code unit in 0xD800 .. 0xDBFF (high surrogate) opens a two-unit
    // sequence; every other code unit is reported as a one-unit sequence.
    immutable wchar unit = s[i];

    if (unit >= 0xD800 && unit <= 0xDBFF)
        return 2;
    return 1;
}

/**
 * stride() returns the length of a UTF-32 sequence starting at index i
 * in string s.
 * Returns: The return value will always be 1.
 */
@safe @nogc pure nothrow
uint stride(in dchar[] s, size_t i)
{
    // Neither s nor i is inspected: the result is the constant 1.
    enum uint unitsPerSequence = 1;
    return unitsPerSequence;
}

/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 */
@safe pure
size_t toUCSindex(in char[] s, size_t i)
{
    size_t n;   // sequences counted so far
    size_t j;   // byte offset of the sequence currently being looked at

    while (j < i)
    {
        immutable uint step = stride(s, j);

        // stride() reports 0xFF when s[j] cannot start a sequence.  Adding
        // that marker to j as if it were a length could skip 255 bytes and
        // land exactly on i, silently accepting malformed input, so it is
        // reported here in the same way toUTFindex() does.
        if (step == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", j);

        j += step;
        n++;
    }

    // Overshooting i means i was not on a sequence boundary.
    if (j > i)
    {
        onUnicodeError("invalid UTF-8 sequence", j);
    }
    return n;
}

/** ditto */
@safe pure
size_t toUCSindex(in wchar[] s, size_t i)
{
    size_t offset = 0;  // code-unit offset of the current sequence
    size_t count = 0;   // sequences stepped over so far

    // Walk sequence by sequence until the offset reaches or passes i.
    while (offset < i)
    {
        offset += stride(s, offset);
        ++count;
    }

    // Passing i means i pointed into the middle of a sequence.
    if (offset > i)
        onUnicodeError("invalid UTF-16 sequence", offset);

    return count;
}

/** ditto */
@safe @nogc pure nothrow
size_t toUCSindex(in dchar[] s, size_t i)
{
    // The array is not consulted: the index is handed back unchanged.
    const size_t ucsIndex = i;
    return ucsIndex;
}

/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index.
 */
@safe pure
size_t toUTFindex(in char[] s, size_t n)
{
    size_t pos = 0;     // byte offset reached so far

    // Step over n sequences, one lead byte at a time.
    for (size_t remaining = n; remaining != 0; --remaining)
    {
        immutable uint width = UTF8stride[s[pos]];

        // 0xFF marks a byte that cannot start a sequence.
        if (width == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", pos);

        pos += width;
    }
    return pos;
}

/** ditto */
@safe @nogc pure nothrow
size_t toUTFindex(in wchar[] s, size_t n)
{
    size_t pos = 0;     // code-unit offset reached so far

    // Step over n sequences; a high surrogate (0xD800 .. 0xDBFF) counts
    // for two code units, anything else for one.
    for (size_t remaining = n; remaining != 0; --remaining)
    {
        immutable wchar unit = s[pos];
        pos += (unit >= 0xD800 && unit <= 0xDBFF) ? 2 : 1;
    }
    return pos;
}

/** ditto */
@safe @nogc pure nothrow
size_t toUTFindex(in dchar[] s, size_t n)
{
    // The array is not consulted: the index is handed back unchanged.
    const size_t utfIndex = n;
    return utfIndex;
}

/* =================== Decode ======================= */

/***************
 * Decodes and returns character starting at s[idx]. idx is advanced past the
 * decoded character. If the character is not well formed, a UtfException is
 * thrown and idx remains unchanged.
 */
@safe pure
dchar decode(in char[] s, ref size_t idx)
    in
    {
        // idx must address an existing byte (idx >= 0 is always true for size_t).
        assert(idx >= 0 && idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    body
    {
        size_t len = s.length;
        dchar V;                // code point being assembled
        size_t i = idx;         // working copy; idx itself is only written on success
        char u = s[i];          // lead byte (reused for trailing bytes further down)

        if (u & 0x80)
        {   uint n;             // number of leading 1 bits == announced sequence length
            char u2;            // second byte, needed for the overlong check

            /* The following encodings are valid, except for the 5 and 6 byte
             * combinations:
             *  0xxxxxxx
             *  110xxxxx 10xxxxxx
             *  1110xxxx 10xxxxxx 10xxxxxx
             *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
             *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
             */
            // Count the leading 1 bits of the lead byte: (u << n) & 0x80 tests
            // bit (7 - n).  The loop stops at the first 0 bit.
            for (n = 1; ; n++)
            {
                if (n > 4)
                    goto Lerr;          // only do the first 4 of 6 encodings
                if (((u << n) & 0x80) == 0)
                {
                    // n == 1 means the byte is 10xxxxxx, i.e. a trailing byte
                    // where a lead byte was expected.
                    if (n == 1)
                        goto Lerr;
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            V = cast(dchar)(u & ((1 << (7 - n)) - 1));

            // The last byte of the sequence would be s[i + n - 1].
            if (i + (n - 1) >= len)
                goto Lerr;                      // off end of string

            /* The following combinations are overlong, and illegal:
             *  1100000x (10xxxxxx)
             *  11100000 100xxxxx (10xxxxxx)
             *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
             *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
             *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
             */
            // s[i + 1] is in range here because n >= 2 and the length check
            // above passed.  The 0xF8 and 0xFC clauses cannot match any more:
            // those lead bytes have more than four leading 1 bits and were
            // already rejected by the n > 4 test.
            u2 = s[i + 1];
            if ((u & 0xFE) == 0xC0 ||
                (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
                (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
                (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
                (u == 0xFC && (u2 & 0xFC) == 0x80))
                goto Lerr;                      // overlong combination

            // Fold the low six bits of each trailing byte into V.
            for (uint j = 1; j != n; j++)
            {
                u = s[i + j];
                if ((u & 0xC0) != 0x80)
                    goto Lerr;                  // trailing bytes are 10xxxxxx
                V = (V << 6) | (u & 0x3F);
            }
            // Rejects surrogates and values above 0x10FFFF.
            if (!isValidDchar(V))
                goto Lerr;
            i += n;
        }
        else
        {
            // Single-byte (ASCII) sequence.
            V = cast(dchar) u;
            i++;
        }

        // Success: publish the advanced index.
        idx = i;
        return V;

      // Error exit: i still equals the incoming idx on every path that
      // jumps here, and idx has not been written.
      Lerr:
      onUnicodeError("invalid UTF-8 sequence", i);
    return V; // dummy return
    }

unittest
{   size_t i;
    dchar c;

    // printf is not imported anywhere else in this module, so pull it in
    // locally, and only when the `utf` debug identifier is enabled.
    debug(utf)
    {
        import core.stdc.stdio : printf;
        printf("utf.decode.unittest\n");
    }

    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    // Malformed inputs: every one of these must be rejected.
    static s4 =
    [   "\xE2\x89"c[],          // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    foreach (bad; s4)
    {
        bool rejected = false;

        i = 0;
        try
        {
            c = decode(bad, i);
        }
        // Catch Exception rather than Throwable: a Throwable handler would
        // also swallow an AssertError raised inside the try block, so the
        // check below could never fail.
        // NOTE(review): relies on onUnicodeError throwing an Exception
        // subclass; its implementation is not part of this module.
        catch (Exception e)
        {
            rejected = true;
        }
        assert(rejected);
        assert(i == 0);         // idx is left unchanged on failure
    }
}

/** ditto */
@safe pure
dchar decode(in wchar[] s, ref size_t idx)
    in
    {
        assert(idx >= 0 && idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    body
    {
        immutable size_t start = idx;   // idx is only written on success
        immutable uint lead = s[start];
        string msg;                     // set on every path that ends in an error

        if (lead < 0xD800 || lead > 0xDFFF)
        {
            // Not a surrogate: a single code unit is the whole character,
            // except for the two values rejected below.
            // NOTE(review): isValidDchar is documented as accepting U+FFFE
            // and U+FFFF, while this overload rejects them - confirm that
            // the difference is intended.
            if (lead == 0xFFFE || lead == 0xFFFF)
            {
                msg = "illegal UTF-16 value";
            }
            else
            {
                idx = start + 1;
                return cast(dchar)lead;
            }
        }
        else if (lead >= 0xDC00)
        {
            // A low surrogate with no high surrogate in front of it.
            msg = "unpaired surrogate UTF-16 value";
        }
        else if (start + 1 == s.length)
        {
            // High surrogate is the last code unit of the array.
            msg = "surrogate UTF-16 high value past end of string";
        }
        else
        {
            immutable uint trail = s[start + 1];

            if (trail < 0xDC00 || trail > 0xDFFF)
            {
                msg = "surrogate UTF-16 low value out of range";
            }
            else
            {
                // Combine the surrogate pair into one code point.
                idx = start + 2;
                return cast(dchar)(((lead - 0xD7C0) << 10) + (trail - 0xDC00));
            }
        }

        onUnicodeError(msg, start);
        return cast(dchar)lead; // dummy return
    }

/** ditto */
@safe pure
dchar decode(in dchar[] s, ref size_t idx)
    in
    {
        assert(idx >= 0 && idx < s.length);
    }
    body
    {
        immutable size_t pos = idx;     // idx is only written on success
        immutable dchar c = s[pos];

        // A UTF-32 element is already a code point; it only needs validating.
        if (isValidDchar(c))
        {
            idx = pos + 1;
            return c;
        }

        onUnicodeError("invalid UTF-32 value", pos);
        return c; // dummy return
    }


/* =================== Encode ======================= */

/*******************************
 * Encodes character c and appends it to array s[].
 */
@safe pure nothrow
void encode(ref char[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        // ASCII is appended as a single byte.
        if (c <= 0x7F)
        {
            s ~= cast(char) c;
            return;
        }

        char[4] units;  // scratch space for a 2 to 4 byte sequence
        size_t used;    // how many bytes of units are filled in

        if (c <= 0x7FF)
        {
            // 110xxxxx 10xxxxxx
            units[0] = cast(char)(0xC0 | (c >> 6));
            units[1] = cast(char)(0x80 | (c & 0x3F));
            used = 2;
        }
        else if (c <= 0xFFFF)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            units[0] = cast(char)(0xE0 | (c >> 12));
            units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            units[2] = cast(char)(0x80 | (c & 0x3F));
            used = 3;
        }
        else if (c <= 0x10FFFF)
        {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            units[0] = cast(char)(0xF0 | (c >> 18));
            units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            units[3] = cast(char)(0x80 | (c & 0x3F));
            used = 4;
        }
        else
        {
            assert(0);
        }

        s ~= units[0 .. used];
    }

unittest
{
    // printf is not imported anywhere else in this module, so pull it in
    // locally, and only when the `utf` debug identifier is enabled.
    debug(utf)
    {
        import core.stdc.stdio : printf;
        printf("utf.encode.unittest\n");
    }

    // 1-byte sequence.
    char[] s = "abcd".dup;
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    // 2-byte sequence.
    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");       // BUG: fix compiler

    // 3-byte sequence.
    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");

    // 4-byte sequence (first code point beyond the BMP).
    encode(s, cast(dchar)'\U00010000');
    assert(s.length == 14);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0\xF0\x90\x80\x80");
}

/** ditto */
@safe pure nothrow
void encode(ref wchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        // Values up to 0xFFFF are appended as one code unit.
        if (c <= 0xFFFF)
        {
            s ~= cast(wchar) c;
            return;
        }

        // Everything else becomes a surrogate pair built from the 20-bit
        // offset above 0x10000: top ten bits go into the high surrogate,
        // bottom ten bits into the low surrogate.
        immutable uint offset = c - 0x10000;
        wchar[2] pair;

        pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
        s ~= pair;
    }

/** ditto */
@safe pure nothrow
void encode(ref dchar[] s, dchar c)
    in
    {
        // Only checked when contracts are compiled in.
        assert(isValidDchar(c));
    }
    body
    {
        // The value is appended as-is: one array element per call.
        s ~= c;
    }

/**
Returns the code length of $(D c) in the encoding using $(D C) as a
code unit. The length is returned in code units, not in bytes.
 */
@safe pure nothrow @nogc
ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        // 1-byte code units: 1 to 4 units depending on magnitude.
        // Values above 0x10FFFF are a programming error.
        if (c > 0x10FFFF)
            assert(false);

        if (c > 0xFFFF) return 4;
        if (c > 0x7FF) return 3;
        if (c > 0x7F) return 2;
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // 2-byte code units: values above 0xFFFF need two units.
        if (c <= 0xFFFF)
            return 1;
        return 2;
    }
    else
    {
        // 4-byte code units: always exactly one.
        static assert(C.sizeof == 4);
        return 1;
    }
}

/* =================== Validation ======================= */

/***********************************
Checks to see if string is well formed or not. $(D S) can be an array
 of $(D char), $(D wchar), or $(D dchar). Throws a $(D UtfException)
 if it is not. Use to check all untrusted input for correctness.
 */
@safe pure
void validate(S)(in S s)
{
    immutable size_t end = s.length;
    size_t pos = 0;

    // decode() advances pos past each sequence it accepts; its result is
    // discarded because only the error reporting matters here.
    while (pos < end)
        decode(s, pos);
}

/* =================== Conversion to UTF8 ======================= */

@safe pure nothrow @nogc
char[] toUTF8(char[] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        // Writes the UTF-8 form of c into the front of buf and returns the
        // slice of buf that was filled in (1 to 4 bytes).

        if (c <= 0x7F)
        {
            // 0xxxxxxx
            buf[0] = cast(char) c;
            return buf[0 .. 1];
        }

        if (c <= 0x7FF)
        {
            // 110xxxxx 10xxxxxx
            buf[0] = cast(char)(0xC0 | (c >> 6));
            buf[1] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 2];
        }

        if (c <= 0xFFFF)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            buf[0] = cast(char)(0xE0 | (c >> 12));
            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[2] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 3];
        }

        if (c <= 0x10FFFF)
        {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[3] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 4];
        }

        assert(0);
    }

/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 */
@safe pure nothrow
string toUTF8(string s)
    in
    {
        // Well-formedness is only checked through this contract.
        validate(s);
    }
    body
    {
        // Input is already UTF-8: it is returned as-is, without copying.
        return s;
    }

/** ditto */
@trusted pure
string toUTF8(in wchar[] s)
{
    immutable size_t slen = s.length;
    char[] r;

    // Start with one byte per code unit, which is exact for pure ASCII.
    r.length = slen;

    // Fast path: copy code units straight across while they are ASCII.
    size_t i = 0;
    while (i < slen && s[i] <= 0x7F)
    {
        r[i] = cast(char) s[i];
        i++;
    }

    // Slow path: from the first non-ASCII code unit on, cut r back to what
    // has been copied so far and append every remaining character through
    // encode(); the foreach decodes the UTF-16 input into dchar values.
    if (i < slen)
    {
        r.length = i;
        foreach (dchar cp; s[i .. slen])
        {
            encode(r, cp);
        }
    }
    return cast(string)r;
}

/** ditto */
@trusted pure
string toUTF8(in dchar[] s)
{
    immutable size_t slen = s.length;
    char[] r;

    // Start with one byte per element, which is exact for pure ASCII.
    r.length = slen;

    // Fast path: copy elements straight across while they are ASCII.
    size_t i = 0;
    while (i < slen && s[i] <= 0x7F)
    {
        r[i] = cast(char) s[i];
        i++;
    }

    // Slow path: from the first non-ASCII element on, cut r back to what
    // has been copied so far and append every remaining character through
    // encode().
    if (i < slen)
    {
        r.length = i;
        foreach (dchar cp; s[i .. slen])
        {
            encode(r, cp);
        }
    }
    return cast(string)r;
}

/* =================== Conversion to UTF16 ======================= */

@safe pure nothrow @nogc
wchar[] toUTF16(wchar[] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        // Writes the UTF-16 form of c into the front of buf and returns the
        // slice of buf that was filled in (1 or 2 code units).

        if (c > 0xFFFF)
        {
            // Surrogate pair built from the 20-bit offset above 0x10000.
            buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
            buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
            return buf[0 .. 2];
        }

        buf[0] = cast(wchar) c;
        return buf[0 .. 1];
    }

/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
 * an LPWSTR or LPCWSTR argument.
 */
@trusted pure
wstring toUTF16(in char[] s)
{
    immutable size_t slen = s.length;
    wchar[] r;

    // Grow, then truncate, before appending.
    // NOTE(review): presumably meant to pre-allocate slen code units -
    // confirm that the runtime reuses the storage after the truncation.
    r.length = slen;
    r.length = 0;

    size_t i = 0;
    while (i < slen)
    {
        immutable char lead = s[i];

        if (lead > 0x7F)
        {
            // Multi-byte sequence: decode() advances i past it.
            encode(r, decode(s, i));
        }
        else
        {
            // ASCII maps one-to-one.
            r ~= cast(wchar) lead;
            i++;
        }
    }
    return cast(wstring)r;
}

// Pointer to constant UTF-16 code units; used as the return type of toUTF16z.
alias const(wchar)* wptr;
/** ditto */
@safe pure
wptr toUTF16z(in char[] s)
{
    immutable size_t slen = s.length;
    wchar[] r;

    // Grow (with room for the terminator), then truncate, before appending.
    // NOTE(review): presumably meant to pre-allocate - confirm that the
    // runtime reuses the storage after the truncation.
    r.length = slen + 1;
    r.length = 0;

    size_t i = 0;
    while (i < slen)
    {
        immutable char lead = s[i];

        if (lead > 0x7F)
        {
            // Multi-byte sequence: decode() advances i past it.
            encode(r, decode(s, i));
        }
        else
        {
            // ASCII maps one-to-one.
            r ~= cast(wchar) lead;
            i++;
        }
    }

    // Zero terminator, then hand out the address of the first code unit.
    r ~= '\000';
    return &r[0];
}

/** ditto */
@safe pure nothrow
wstring toUTF16(wstring s)
    in
    {
        // Well-formedness is only checked through this contract.
        validate(s);
    }
    body
    {
        // Input is already UTF-16: it is returned as-is, without copying.
        return s;
    }

/** ditto */
@trusted pure nothrow
wstring toUTF16(in dchar[] s)
{
    wchar[] r;

    // Grow, then truncate, before appending.
    // NOTE(review): presumably meant to pre-allocate - confirm that the
    // runtime reuses the storage after the truncation.
    r.length = s.length;
    r.length = 0;

    // Each element is appended as one or two UTF-16 code units.
    foreach (dchar cp; s)
    {
        encode(r, cp);
    }
    return cast(wstring)r;
}

/* =================== Conversion to UTF32 ======================= */

/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 */
@trusted pure
dstring toUTF32(in char[] s)
{
    immutable size_t slen = s.length;
    dchar[] r;

    r.length = slen;            // r[] will never be longer than s[]

    size_t filled = 0;          // elements of r written so far
    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];

        if (c < 0x80)
            i++;                // ASCII: the byte is the code point
        else
            c = decode(s, i);   // multi-byte: decode() advances i

        r[filled] = c;
        filled++;
    }
    return cast(dstring)r[0 .. filled];
}

/** ditto */
@trusted pure
dstring toUTF32(in wchar[] s)
{
    immutable size_t slen = s.length;
    dchar[] r;

    r.length = slen;            // r[] will never be longer than s[]

    size_t filled = 0;          // elements of r written so far
    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];

        if (c < 0x80)
            i++;                // ASCII: the code unit is the code point
        else
            c = decode(s, i);   // anything else: decode() advances i

        r[filled] = c;
        filled++;
    }
    return cast(dstring)r[0 .. filled];
}

/** ditto */
@safe pure nothrow
dstring toUTF32(dstring s)
    in
    {
        // Well-formedness is only checked through this contract.
        validate(s);
    }
    body
    {
        // Input is already UTF-32: it is returned as-is, without copying.
        return s;
    }

/* ================================ tests ================================== */

unittest
{
    // printf is not imported anywhere else in this module, so pull it in
    // locally, and only when the `utf` debug identifier is enabled.
    debug(utf)
    {
        import core.stdc.stdio : printf;
        printf("utf.toUTF.unittest\n");
    }

    // Round trips between all three encodings: pure ASCII.
    auto c = "hello"c[];
    auto w = toUTF16(c);
    assert(w == "hello");
    auto d = toUTF32(c);
    assert(d == "hello");

    c = toUTF8(w);
    assert(c == "hello");
    d = toUTF32(w);
    assert(d == "hello");

    c = toUTF8(d);
    assert(c == "hello");
    w = toUTF16(d);
    assert(w == "hello");


    // Same round trips with a BMP character (3 bytes in UTF-8).
    c = "hel\u1234o";
    w = toUTF16(c);
    assert(w == "hel\u1234o");
    d = toUTF32(c);
    assert(d == "hel\u1234o");

    c = toUTF8(w);
    assert(c == "hel\u1234o");
    d = toUTF32(w);
    assert(d == "hel\u1234o");

    c = toUTF8(d);
    assert(c == "hel\u1234o");
    w = toUTF16(d);
    assert(w == "hel\u1234o");


    // Same round trips with a character beyond the BMP (surrogate pair
    // in UTF-16, 4 bytes in UTF-8).
    c = "he\U000BAAAAllo";
    w = toUTF16(c);
    //foreach (wchar c; w) printf("c = x%x\n", c);
    //foreach (wchar c; cast(wstring)"he\U000BAAAAllo") printf("c = x%x\n", c);
    assert(w == "he\U000BAAAAllo");
    d = toUTF32(c);
    assert(d == "he\U000BAAAAllo");

    c = toUTF8(w);
    assert(c == "he\U000BAAAAllo");
    d = toUTF32(w);
    assert(d == "he\U000BAAAAllo");

    c = toUTF8(d);
    assert(c == "he\U000BAAAAllo");
    w = toUTF16(d);
    assert(w == "he\U000BAAAAllo");

    // Buffer-based single-character overload.
    wchar[2] buf;
    auto ret = toUTF16(buf, '\U000BAAAA');
    assert(ret == "\U000BAAAA");
}
gcc-9.1 2019-06-02 15:48:37 +00:00			`/********************************************`
			`* Encode and decode UTF-8, UTF-16 and UTF-32 strings.`
			`*`
			`* For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D`
			`* wchar type.`
			`* For Posix systems, the C wchar_t type is UTF-32 and corresponds to`
			`* the D utf.dchar type.`
			`*`
			`* UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).`
			`*`
			`* See_Also:`
			`* $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>`
			`* $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>`
			`* $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)`
			`*`
			`* Copyright: Copyright Digital Mars 2003 - 2016.`
			`* License: $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).`
			`* Authors: Walter Bright, Sean Kelly`
			`* Source: $(DRUNTIMESRC src/rt/util/_utf.d)`
			`*/`

			`module rt.util.utf;`

			`extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ ) @safe pure;`

			`/*******************************`
			`* Test if c is a valid UTF-32 character.`
			`*`
			`* \uFFFE and \uFFFF are considered valid by this function,`
			`* as they are permitted for internal use by an application,`
			`* but they are not allowed for interchange by the Unicode standard.`
			`*`
			`* Returns: true if it is, false if not.`
			`*/`

			`@safe @nogc pure nothrow`
			`bool isValidDchar(dchar c)`
			`{`
			`/* Note: FFFE and FFFF are specifically permitted by the`
			`* Unicode standard for application internal use, but are not`
			`* allowed for interchange.`
			`* (thanks to Arcane Jill)`
			`*/`

			`return c < 0xD800 \|\|`
			`(c > 0xDFFF && c <= 0x10FFFF /&& c != 0xFFFE && c != 0xFFFF/);`
			`}`

			`unittest`
			`{`
			`debug(utf) printf("utf.isValidDchar.unittest\n");`
			`assert(isValidDchar(cast(dchar)'a') == true);`
			`assert(isValidDchar(cast(dchar)0x1FFFFF) == false);`
			`}`



			`static immutable UTF8stride =`
			`[`
			`cast(ubyte)`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,`
			`0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,`
			`0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,`
			`0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,`
			`2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,`
			`2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,`
			`3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,`
			`4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,`
			`];`

			`/**`
			`* stride() returns the length of a UTF-8 sequence starting at index i`
			`* in string s.`
			`* Returns:`
			`* The number of bytes in the UTF-8 sequence or`
			`* 0xFF meaning s[i] is not the start of of UTF-8 sequence.`
			`*/`
			`@safe @nogc pure nothrow`
			`uint stride(in char[] s, size_t i)`
			`{`
			`return UTF8stride[s[i]];`
			`}`

			`/**`
			`* stride() returns the length of a UTF-16 sequence starting at index i`
			`* in string s.`
			`*/`
			`@safe @nogc pure nothrow`
			`uint stride(in wchar[] s, size_t i)`
			`{ uint u = s[i];`
			`return 1 + (u >= 0xD800 && u <= 0xDBFF);`
			`}`

			`/**`
			`* stride() returns the length of a UTF-32 sequence starting at index i`
			`* in string s.`
			`* Returns: The return value will always be 1.`
			`*/`
			`@safe @nogc pure nothrow`
			`uint stride(in dchar[] s, size_t i)`
			`{`
			`return 1;`
			`}`

			`/*******************************************`
			`* Given an index i into an array of characters s[],`
			`* and assuming that index i is at the start of a UTF character,`
			`* determine the number of UCS characters up to that index i.`
			`*/`
			`@safe pure`
			`size_t toUCSindex(in char[] s, size_t i)`
			`{`
			`size_t n;`
			`size_t j;`

			`for (j = 0; j < i; )`
			`{`
			`j += stride(s, j);`
			`n++;`
			`}`
			`if (j > i)`
			`{`
			`onUnicodeError("invalid UTF-8 sequence", j);`
			`}`
			`return n;`
			`}`

			`/** ditto */`
			`@safe pure`
			`size_t toUCSindex(in wchar[] s, size_t i)`
			`{`
			`size_t n;`
			`size_t j;`

			`for (j = 0; j < i; )`
			`{`
			`j += stride(s, j);`
			`n++;`
			`}`
			`if (j > i)`
			`{`
			`onUnicodeError("invalid UTF-16 sequence", j);`
			`}`
			`return n;`
			`}`

			`/** ditto */`
			`@safe @nogc pure nothrow`
			`size_t toUCSindex(in dchar[] s, size_t i)`
			`{`
			`return i;`
			`}`

			`/******************************************`
			`* Given a UCS index n into an array of characters s[], return the UTF index.`
			`*/`
			`@safe pure`
			`size_t toUTFindex(in char[] s, size_t n)`
			`{`
			`size_t i;`

			`while (n--)`
			`{`
			`uint j = UTF8stride[s[i]];`
			`if (j == 0xFF)`
			`onUnicodeError("invalid UTF-8 sequence", i);`
			`i += j;`
			`}`
			`return i;`
			`}`

			`/** ditto */`
			`@safe @nogc pure nothrow`
			`size_t toUTFindex(in wchar[] s, size_t n)`
			`{`
			`size_t i;`

			`while (n--)`
			`{ wchar u = s[i];`

			`i += 1 + (u >= 0xD800 && u <= 0xDBFF);`
			`}`
			`return i;`
			`}`

			`/** ditto */`
			`@safe @nogc pure nothrow`
			`size_t toUTFindex(in dchar[] s, size_t n)`
			`{`
			`return n;`
			`}`

			`/* =================== Decode ======================= */`

			`/***************`
			`* Decodes and returns character starting at s[idx]. idx is advanced past the`
			`* decoded character. If the character is not well formed, a UtfException is`
			`* thrown and idx remains unchanged.`
			`*/`
			`@safe pure`
			`dchar decode(in char[] s, ref size_t idx)`
			`in`
			`{`
			`assert(idx >= 0 && idx < s.length);`
			`}`
			`out (result)`
			`{`
			`assert(isValidDchar(result));`
			`}`
			`body`
			`{`
			`size_t len = s.length;`
			`dchar V;`
			`size_t i = idx;`
			`char u = s[i];`

			`if (u & 0x80)`
			`{ uint n;`
			`char u2;`

			`/* The following encodings are valid, except for the 5 and 6 byte`
			`* combinations:`
			`* 0xxxxxxx`
			`* 110xxxxx 10xxxxxx`
			`* 1110xxxx 10xxxxxx 10xxxxxx`
			`* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx`
			`* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx`
			`* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx`
			`*/`
			`for (n = 1; ; n++)`
			`{`
			`if (n > 4)`
			`goto Lerr; // only do the first 4 of 6 encodings`
			`if (((u << n) & 0x80) == 0)`
			`{`
			`if (n == 1)`
			`goto Lerr;`
			`break;`
			`}`
			`}`

			`// Pick off (7 - n) significant bits of B from first byte of octet`
			`V = cast(dchar)(u & ((1 << (7 - n)) - 1));`

			`if (i + (n - 1) >= len)`
			`goto Lerr; // off end of string`

			`/* The following combinations are overlong, and illegal:`
			`* 1100000x (10xxxxxx)`
			`* 11100000 100xxxxx (10xxxxxx)`
			`* 11110000 1000xxxx (10xxxxxx 10xxxxxx)`
			`* 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)`
			`* 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)`
			`*/`
			`u2 = s[i + 1];`
			`if ((u & 0xFE) == 0xC0 \|\|`
			`(u == 0xE0 && (u2 & 0xE0) == 0x80) \|\|`
			`(u == 0xF0 && (u2 & 0xF0) == 0x80) \|\|`
			`(u == 0xF8 && (u2 & 0xF8) == 0x80) \|\|`
			`(u == 0xFC && (u2 & 0xFC) == 0x80))`
			`goto Lerr; // overlong combination`

			`for (uint j = 1; j != n; j++)`
			`{`
			`u = s[i + j];`
			`if ((u & 0xC0) != 0x80)`
			`goto Lerr; // trailing bytes are 10xxxxxx`
			`V = (V << 6) \| (u & 0x3F);`
			`}`
			`if (!isValidDchar(V))`
			`goto Lerr;`
			`i += n;`
			`}`
			`else`
			`{`
			`V = cast(dchar) u;`
			`i++;`
			`}`

			`idx = i;`
			`return V;`

			`Lerr:`
			`onUnicodeError("invalid UTF-8 sequence", i);`
			`return V; // dummy return`
			`}`

			`unittest`
			`{ size_t i;`
			`dchar c;`

			`debug(utf) printf("utf.decode.unittest\n");`

			`static s1 = "abcd"c;`
			`i = 0;`
			`c = decode(s1, i);`
			`assert(c == cast(dchar)'a');`
			`assert(i == 1);`
			`c = decode(s1, i);`
			`assert(c == cast(dchar)'b');`
			`assert(i == 2);`

			`static s2 = "\xC2\xA9"c;`
			`i = 0;`
			`c = decode(s2, i);`
			`assert(c == cast(dchar)'\u00A9');`
			`assert(i == 2);`

			`static s3 = "\xE2\x89\xA0"c;`
			`i = 0;`
			`c = decode(s3, i);`
			`assert(c == cast(dchar)'\u2260');`
			`assert(i == 3);`

			`static s4 =`
			`[ "\xE2\x89"c[], // too short`
			`"\xC0\x8A",`
			`"\xE0\x80\x8A",`
			`"\xF0\x80\x80\x8A",`
			`"\xF8\x80\x80\x80\x8A",`
			`"\xFC\x80\x80\x80\x80\x8A",`
			`];`

			`for (int j = 0; j < s4.length; j++)`
			`{`
			`try`
			`{`
			`i = 0;`
			`c = decode(s4[j], i);`
			`assert(0);`
			`}`
			`catch (Throwable o)`
			`{`
			`i = 23;`
			`}`
			`assert(i == 23);`
			`}`
			`}`

			`/** ditto */`
			`@safe pure`
			`dchar decode(in wchar[] s, ref size_t idx)`
			`in`
			`{`
			`assert(idx >= 0 && idx < s.length);`
			`}`
			`out (result)`
			`{`
			`assert(isValidDchar(result));`
			`}`
			`body`
			`{`
			`string msg;`
			`dchar V;`
			`size_t i = idx;`
			`uint u = s[i];`

			`if (u & ~0x7F)`
			`{ if (u >= 0xD800 && u <= 0xDBFF)`
			`{ uint u2;`

			`if (i + 1 == s.length)`
			`{ msg = "surrogate UTF-16 high value past end of string";`
			`goto Lerr;`
			`}`
			`u2 = s[i + 1];`
			`if (u2 < 0xDC00 \|\| u2 > 0xDFFF)`
			`{ msg = "surrogate UTF-16 low value out of range";`
			`goto Lerr;`
			`}`
			`u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);`
			`i += 2;`
			`}`
			`else if (u >= 0xDC00 && u <= 0xDFFF)`
			`{ msg = "unpaired surrogate UTF-16 value";`
			`goto Lerr;`
			`}`
			`else if (u == 0xFFFE \|\| u == 0xFFFF)`
			`{ msg = "illegal UTF-16 value";`
			`goto Lerr;`
			`}`
			`else`
			`i++;`
			`}`
			`else`
			`{`
			`i++;`
			`}`

			`idx = i;`
			`return cast(dchar)u;`

			`Lerr:`
			`onUnicodeError(msg, i);`
			`return cast(dchar)u; // dummy return`
			`}`

			`/** ditto */`
			`@safe pure`
			`dchar decode(in dchar[] s, ref size_t idx)`
			`in`
			`{`
			`assert(idx >= 0 && idx < s.length);`
			`}`
			`body`
			`{`
			`size_t i = idx;`
			`dchar c = s[i];`

			`if (!isValidDchar(c))`
			`goto Lerr;`
			`idx = i + 1;`
			`return c;`

			`Lerr:`
			`onUnicodeError("invalid UTF-32 value", i);`
			`return c; // dummy return`
			`}`


			`/* =================== Encode ======================= */`

			/*******************************
			 * Appends the UTF-8 encoding of character c to array s[].
			 *
			 * Params:
			 *      s = array to extend; grows by one to four code units
			 *      c = code point to encode; the in-contract requires
			 *          isValidDchar(c)
			 */
			@safe pure nothrow
			void encode(ref char[] s, dchar c)
			in
			{
			    assert(isValidDchar(c));
			}
			body
			{
			    char[4] units;
			    size_t count;

			    if (c <= 0x7F)
			    {
			        // Single code unit: the value itself.
			        units[0] = cast(char) c;
			        count = 1;
			    }
			    else if (c <= 0x7FF)
			    {
			        units[0] = cast(char)(0xC0 | (c >> 6));
			        units[1] = cast(char)(0x80 | (c & 0x3F));
			        count = 2;
			    }
			    else if (c <= 0xFFFF)
			    {
			        units[0] = cast(char)(0xE0 | (c >> 12));
			        units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
			        units[2] = cast(char)(0x80 | (c & 0x3F));
			        count = 3;
			    }
			    else if (c <= 0x10FFFF)
			    {
			        units[0] = cast(char)(0xF0 | (c >> 18));
			        units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
			        units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
			        units[3] = cast(char)(0x80 | (c & 0x3F));
			        count = 4;
			    }
			    else
			    {
			        assert(0);
			    }

			    // Work on a local copy of the slice and store it back once.
			    char[] result = s;
			    result ~= units[0 .. count];
			    s = result;
			}

			unittest
			{
			    debug(utf) printf("utf.encode.unittest\n");

			    char[] text = "abcd".dup;

			    // ASCII code point: one code unit appended.
			    encode(text, cast(dchar)'a');
			    assert(text.length == 5);
			    assert(text == "abcda");

			    // U+00A9: two code units appended.
			    encode(text, cast(dchar)'\u00A9');
			    assert(text.length == 7);
			    assert(text == "abcda\xC2\xA9");
			    //assert(text == "abcda\u00A9"); // BUG: fix compiler

			    // U+2260: three code units appended.
			    encode(text, cast(dchar)'\u2260');
			    assert(text.length == 10);
			    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
			}

			/**
			 * Appends the UTF-16 encoding of character c to array s[].
			 *
			 * Params:
			 *      s = array to extend; grows by one code unit, or by two when
			 *          c is above 0xFFFF
			 *      c = code point to encode; the in-contract requires
			 *          isValidDchar(c)
			 */
			@safe pure nothrow
			void encode(ref wchar[] s, dchar c)
			in
			{
			    assert(isValidDchar(c));
			}
			body
			{
			    wchar[] result = s;

			    if (c > 0xFFFF)
			    {
			        // Split the 20-bit offset into a high and a low surrogate.
			        immutable uint offset = c - 0x10000;
			        wchar[2] pair;

			        pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
			        pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
			        result ~= pair;
			    }
			    else
			    {
			        result ~= cast(wchar) c;
			    }

			    s = result;
			}

			/**
			 * Appends character c to the UTF-32 array s[].
			 *
			 * Params:
			 *      s = array to extend by exactly one element
			 *      c = code point to append; the in-contract requires
			 *          isValidDchar(c)
			 */
			@safe pure nothrow
			void encode(ref dchar[] s, dchar c)
			in { assert(isValidDchar(c)); }
			body
			{
			    // UTF-32 stores the code point unchanged.
			    s ~= c;
			}

			/**
			 * Number of code units of type $(D C) needed to encode $(D c).
			 *
			 * Params:
			 *      C = code unit type; its size must be 1, 2 or 4 bytes
			 *      c = code point to measure
			 *
			 * Returns: the length in code units (not bytes): 1 to 4 for 1-byte
			 * units, 1 or 2 for 2-byte units, always 1 for 4-byte units.
			 */
			@safe pure nothrow @nogc
			ubyte codeLength(C)(dchar c)
			{
			    static if (C.sizeof == 1)
			    {
			        // Values above 0x10FFFF have no encoding here.
			        if (c > 0x10FFFF)
			            assert(false);
			        if (c > 0xFFFF)
			            return 4;
			        if (c > 0x7FF)
			            return 3;
			        if (c > 0x7F)
			            return 2;
			        return 1;
			    }
			    else static if (C.sizeof == 2)
			    {
			        // Anything above 0xFFFF takes two code units.
			        if (c > 0xFFFF)
			            return 2;
			        return 1;
			    }
			    else
			    {
			        static assert(C.sizeof == 4);
			        return 1;
			    }
			}

			`/* =================== Validation ======================= */`

			/***********************************
			Checks whether a string is well formed by decoding it from start to
			end. $(D S) can be an array of $(D char), $(D wchar), or $(D dchar).
			Malformed input is reported by the $(D decode) call that meets it.
			Use to check all untrusted input for correctness.
			*/
			@safe pure
			void validate(S)(in S s)
			{
			    immutable end = s.length;
			    size_t pos = 0;

			    // decode advances pos itself, so the loop has no increment step.
			    while (pos < end)
			        decode(s, pos);
			}

			`/* =================== Conversion to UTF8 ======================= */`

			/**
			 * Writes the UTF-8 encoding of c into the start of buf.
			 *
			 * Params:
			 *      buf = destination; up to four elements are written
			 *      c   = code point to encode; the in-contract requires
			 *            isValidDchar(c)
			 *
			 * Returns: the slice of buf that holds the encoded code units.
			 */
			@safe pure nothrow @nogc
			char[] toUTF8(char[] buf, dchar c)
			in
			{
			    assert(isValidDchar(c));
			}
			body
			{
			    size_t used;

			    if (c <= 0x7F)
			    {
			        buf[0] = cast(char) c;
			        used = 1;
			    }
			    else if (c <= 0x7FF)
			    {
			        buf[0] = cast(char)(0xC0 | (c >> 6));
			        buf[1] = cast(char)(0x80 | (c & 0x3F));
			        used = 2;
			    }
			    else if (c <= 0xFFFF)
			    {
			        buf[0] = cast(char)(0xE0 | (c >> 12));
			        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
			        buf[2] = cast(char)(0x80 | (c & 0x3F));
			        used = 3;
			    }
			    else if (c <= 0x10FFFF)
			    {
			        buf[0] = cast(char)(0xF0 | (c >> 18));
			        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
			        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
			        buf[3] = cast(char)(0x80 | (c & 0x3F));
			        used = 4;
			    }
			    else
			    {
			        assert(0);
			    }

			    return buf[0 .. used];
			}

			/*******************
			 * Encodes string s into UTF-8 and returns the encoded string.
			 *
			 * The input is already UTF-8, so s itself is returned; the
			 * in-contract runs validate(s) on it.
			 */
			@safe pure nothrow
			string toUTF8(string s)
			in { validate(s); }
			body
			{
			    return s;
			}

			/**
			 * Encodes the UTF-16 string s into UTF-8 and returns the encoded
			 * string.
			 *
			 * A leading run of ASCII code units is copied one for one; from the
			 * first non-ASCII code unit onwards the rest of s is iterated as
			 * dchar and appended through encode.
			 */
			@trusted pure
			string toUTF8(in wchar[] s)
			{
			    immutable size_t total = s.length;
			    char[] r;
			    size_t pos = 0;

			    r.length = total;

			    // fast path for ascii
			    while (pos < total && s[pos] <= 0x7F)
			    {
			        r[pos] = cast(char) s[pos];
			        ++pos;
			    }

			    if (pos < total)
			    {
			        // Keep only the copied prefix, then append the encoded tail.
			        r.length = pos;
			        foreach (dchar d; s[pos .. total])
			        {
			            encode(r, d);
			        }
			    }

			    return cast(string) r;
			}

			/**
			 * Encodes the UTF-32 string s into UTF-8 and returns the encoded
			 * string.
			 *
			 * A leading run of ASCII values is copied one for one; from the
			 * first non-ASCII value onwards every element is appended through
			 * encode.
			 */
			@trusted pure
			string toUTF8(in dchar[] s)
			{
			    immutable size_t total = s.length;
			    char[] r;
			    size_t pos = 0;

			    r.length = total;

			    // fast path for ascii
			    while (pos < total && s[pos] <= 0x7F)
			    {
			        r[pos] = cast(char) s[pos];
			        ++pos;
			    }

			    if (pos < total)
			    {
			        // Keep only the copied prefix, then append the encoded tail.
			        r.length = pos;
			        foreach (dchar d; s[pos .. total])
			        {
			            encode(r, d);
			        }
			    }

			    return cast(string) r;
			}

			`/* =================== Conversion to UTF16 ======================= */`

			/**
			 * Writes the UTF-16 encoding of c into the start of buf.
			 *
			 * Params:
			 *      buf = destination; one element is written, or two when c is
			 *            above 0xFFFF
			 *      c   = code point to encode; the in-contract requires
			 *            isValidDchar(c)
			 *
			 * Returns: the slice of buf that holds the encoded code units.
			 */
			@safe pure nothrow @nogc
			wchar[] toUTF16(wchar[] buf, dchar c)
			in
			{
			    assert(isValidDchar(c));
			}
			body
			{
			    if (c > 0xFFFF)
			    {
			        // Split the 20-bit offset into a high and a low surrogate.
			        immutable uint offset = c - 0x10000;

			        buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
			        buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
			        return buf[0 .. 2];
			    }

			    buf[0] = cast(wchar) c;
			    return buf[0 .. 1];
			}

			/****************
			 * Encodes the UTF-8 string s into UTF-16 and returns the encoded string.
			 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API
			 * that take an LPWSTR or LPCWSTR argument.
			 *
			 * ASCII code units are appended directly; anything else goes through
			 * decode and encode.
			 */
			@trusted pure
			wstring toUTF16(in char[] s)
			{
			    immutable size_t total = s.length;
			    wchar[] r;

			    r.length = total;
			    r.length = 0;

			    size_t pos = 0;
			    while (pos < total)
			    {
			        immutable char unit = s[pos];

			        if (unit > 0x7F)
			        {
			            // decode advances pos past the whole sequence.
			            encode(r, decode(s, pos));
			            continue;
			        }

			        r ~= cast(wchar) unit;
			        ++pos;
			    }

			    return cast(wstring) r;
			}

			alias const(wchar)* wptr;

			/**
			 * Encodes the UTF-8 string s into UTF-16, appends a zero code unit and
			 * returns a pointer to the first code unit of the result.
			 *
			 * ASCII code units are appended directly; anything else goes through
			 * decode and encode.
			 */
			@safe pure
			wptr toUTF16z(in char[] s)
			{
			    immutable size_t total = s.length;
			    wchar[] r;

			    r.length = total + 1;
			    r.length = 0;

			    size_t pos = 0;
			    while (pos < total)
			    {
			        immutable char unit = s[pos];

			        if (unit > 0x7F)
			        {
			            // decode advances pos past the whole sequence.
			            encode(r, decode(s, pos));
			            continue;
			        }

			        r ~= cast(wchar) unit;
			        ++pos;
			    }

			    // Terminator; r is therefore never empty when indexed below.
			    r ~= '\000';
			    return &r[0];
			}

			/**
			 * Encodes string s into UTF-16 and returns the encoded string.
			 *
			 * The input is already UTF-16, so s itself is returned; the
			 * in-contract runs validate(s) on it.
			 */
			@safe pure nothrow
			wstring toUTF16(wstring s)
			in { validate(s); }
			body
			{
			    return s;
			}

			/**
			 * Encodes the UTF-32 string s into UTF-16 and returns the encoded
			 * string. Every element of s is appended through encode.
			 */
			@trusted pure nothrow
			wstring toUTF16(in dchar[] s)
			{
			    wchar[] r;

			    r.length = s.length;
			    r.length = 0;

			    foreach (dchar ch; s)
			    {
			        encode(r, ch);
			    }

			    return cast(wstring) r;
			}

			`/* =================== Conversion to UTF32 ======================= */`

			/*****
			 * Encodes the UTF-8 string s into UTF-32 and returns the encoded string.
			 *
			 * ASCII code units are copied directly; anything else is read with
			 * decode.
			 */
			@trusted pure
			dstring toUTF32(in char[] s)
			{
			    immutable size_t total = s.length;
			    dchar[] r;

			    r.length = total; // r[] will never be longer than s[]

			    size_t src = 0;
			    size_t dst = 0;
			    while (src < total)
			    {
			        dchar cp = s[src];

			        if (cp < 0x80)
			            ++src; // ascii, no need for decode
			        else
			            cp = decode(s, src); // advances src itself

			        r[dst] = cp;
			        ++dst;
			    }

			    return cast(dstring) r[0 .. dst];
			}

			/**
			 * Encodes the UTF-16 string s into UTF-32 and returns the encoded
			 * string.
			 *
			 * ASCII code units are copied directly; anything else is read with
			 * decode.
			 */
			@trusted pure
			dstring toUTF32(in wchar[] s)
			{
			    immutable size_t total = s.length;
			    dchar[] r;

			    r.length = total; // r[] will never be longer than s[]

			    size_t src = 0;
			    size_t dst = 0;
			    while (src < total)
			    {
			        dchar cp = s[src];

			        if (cp < 0x80)
			            ++src; // ascii, no need for decode
			        else
			            cp = decode(s, src); // advances src itself

			        r[dst] = cp;
			        ++dst;
			    }

			    return cast(dstring) r[0 .. dst];
			}

			/**
			 * Encodes string s into UTF-32 and returns the encoded string.
			 *
			 * The input is already UTF-32, so s itself is returned; the
			 * in-contract runs validate(s) on it.
			 */
			@safe pure nothrow
			dstring toUTF32(dstring s)
			in { validate(s); }
			body
			{
			    return s;
			}

			`/* ================================ tests ================================== */`

			unittest
			{
			    debug(utf) printf("utf.toUTF.unittest\n");

			    // Round trips between all three encodings: ASCII-only text.
			    string narrow = "hello";
			    wstring wide = toUTF16(narrow);
			    assert(wide == "hello");
			    dstring full = toUTF32(narrow);
			    assert(full == "hello");

			    narrow = toUTF8(wide);
			    assert(narrow == "hello");
			    full = toUTF32(wide);
			    assert(full == "hello");

			    narrow = toUTF8(full);
			    assert(narrow == "hello");
			    wide = toUTF16(full);
			    assert(wide == "hello");

			    // Same round trips with U+1234 in the text.
			    narrow = "hel\u1234o";
			    wide = toUTF16(narrow);
			    assert(wide == "hel\u1234o");
			    full = toUTF32(narrow);
			    assert(full == "hel\u1234o");

			    narrow = toUTF8(wide);
			    assert(narrow == "hel\u1234o");
			    full = toUTF32(wide);
			    assert(full == "hel\u1234o");

			    narrow = toUTF8(full);
			    assert(narrow == "hel\u1234o");
			    wide = toUTF16(full);
			    assert(wide == "hel\u1234o");

			    // Same round trips with U+BAAAA, which is above 0xFFFF.
			    narrow = "he\U000BAAAAllo";
			    wide = toUTF16(narrow);
			    assert(wide == "he\U000BAAAAllo");
			    full = toUTF32(narrow);
			    assert(full == "he\U000BAAAAllo");

			    narrow = toUTF8(wide);
			    assert(narrow == "he\U000BAAAAllo");
			    full = toUTF32(wide);
			    assert(full == "he\U000BAAAAllo");

			    narrow = toUTF8(full);
			    assert(narrow == "he\U000BAAAAllo");
			    wide = toUTF16(full);
			    assert(wide == "he\U000BAAAAllo");

			    // Single code point into a caller-supplied buffer.
			    wchar[2] buf;
			    auto ret = toUTF16(buf, '\U000BAAAA');
			    assert(ret == "\U000BAAAA");
			}