/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- * vim: set ts=8 sts=4 et sw=4 tw=99: * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #ifndef js_CharacterEncoding_h #define js_CharacterEncoding_h #include "mozilla/Range.h" #include "js/TypeDecls.h" #include "js/Utility.h" namespace js { class ExclusiveContext; } // namespace js class JSFlatString; namespace JS { /* * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each * byte is treated as a 2-byte character, and there is no way to pass in a * string containing characters beyond U+00FF. */ class Latin1Chars : public mozilla::Range { typedef mozilla::Range Base; public: Latin1Chars() : Base() {} Latin1Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast(aBytes), aLength) {} Latin1Chars(const Latin1Char* aBytes, size_t aLength) : Base(const_cast(aBytes), aLength) {} Latin1Chars(const char* aBytes, size_t aLength) : Base(reinterpret_cast(const_cast(aBytes)), aLength) {} }; /* * A Latin1Chars, but with \0 termination for C compatibility. */ class Latin1CharsZ : public mozilla::RangedPtr { typedef mozilla::RangedPtr Base; public: Latin1CharsZ() : Base(nullptr, 0) {} Latin1CharsZ(char* aBytes, size_t aLength) : Base(reinterpret_cast(aBytes), aLength) { MOZ_ASSERT(aBytes[aLength] == '\0'); } Latin1CharsZ(Latin1Char* aBytes, size_t aLength) : Base(aBytes, aLength) { MOZ_ASSERT(aBytes[aLength] == '\0'); } using Base::operator=; char* c_str() { return reinterpret_cast(get()); } }; class UTF8Chars : public mozilla::Range { typedef mozilla::Range Base; public: UTF8Chars() : Base() {} UTF8Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast(aBytes), aLength) {} UTF8Chars(const char* aBytes, size_t aLength) : Base(reinterpret_cast(const_cast(aBytes)), aLength) {} }; /* * SpiderMonkey also deals directly with UTF-8 encoded text in some places. */ class UTF8CharsZ : public mozilla::RangedPtr { typedef mozilla::RangedPtr Base; public: UTF8CharsZ() : Base(nullptr, 0) {} UTF8CharsZ(char* aBytes, size_t aLength) : Base(reinterpret_cast(aBytes), aLength) { MOZ_ASSERT(aBytes[aLength] == '\0'); } UTF8CharsZ(unsigned char* aBytes, size_t aLength) : Base(aBytes, aLength) { MOZ_ASSERT(aBytes[aLength] == '\0'); } using Base::operator=; char* c_str() { return reinterpret_cast(get()); } }; /* * SpiderMonkey uses a 2-byte character representation: it is a * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2, * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a * sufficiently dedicated JavaScript program to be fully unicode-aware by * manually interpreting UTF-16 extension characters embedded in the JS * string. */ class TwoByteChars : public mozilla::Range { typedef mozilla::Range Base; public: TwoByteChars() : Base() {} TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {} TwoByteChars(const char16_t* aChars, size_t aLength) : Base(const_cast(aChars), aLength) {} }; /* * A TwoByteChars, but \0 terminated for compatibility with JSFlatString. */ class TwoByteCharsZ : public mozilla::RangedPtr { typedef mozilla::RangedPtr Base; public: TwoByteCharsZ() : Base(nullptr, 0) {} TwoByteCharsZ(char16_t* chars, size_t length) : Base(chars, length) { MOZ_ASSERT(chars[length] == '\0'); } using Base::operator=; }; typedef mozilla::RangedPtr ConstCharPtr; /* * Like TwoByteChars, but the chars are const. */ class ConstTwoByteChars : public mozilla::Range { typedef mozilla::Range Base; public: ConstTwoByteChars() : Base() {} ConstTwoByteChars(const char16_t* aChars, size_t aLength) : Base(aChars, aLength) {} }; /* * Convert a 2-byte character sequence to "ISO-Latin-1". This works by * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source * contains any UTF-16 extension characters, then this may give invalid Latin1 * output. The returned string is zero terminated. The returned string or the * returned string's |start()| must be freed with JS_free or js_free, * respectively. If allocation fails, an OOM error will be set and the method * will return a nullptr chars (which can be tested for with the ! operator). * This method cannot trigger GC. */ extern Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext* cx, const mozilla::Range tbchars); template extern UTF8CharsZ CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx, const mozilla::Range chars); uint32_t Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length); /* * Inflate bytes in UTF-8 encoding to char16_t. * - On error, returns an empty TwoByteCharsZ. * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold * its length; the length value excludes the trailing null. */ extern TwoByteCharsZ UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen); /* * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8 * input. */ extern TwoByteCharsZ LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen); /* * Returns the length of the char buffer required to encode |s| as UTF8. * Does not include the null-terminator. */ JS_PUBLIC_API(size_t) GetDeflatedUTF8StringLength(JSFlatString* s); /* * Encode |src| as UTF8. The caller must ensure |dst| has enough space. * Does not write the null terminator. */ JS_PUBLIC_API(void) DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr dst); } // namespace JS inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); } inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); } #endif /* js_CharacterEncoding_h */