/* Copyright (c) 2017, Computer History Museum All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted (subject to the limitations in the disclaimer below) provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Computer History Museum nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define MAX_UTF8_CHAR_LEN (((sizeof(UniChar) * 8) / 6) + ((6 - ((sizeof(UniChar) * 8) / 6)) < ((sizeof(UniChar) * 8) % 6) ? 2 : 1)) #if __profile__ #include "unicodeprofile.h" #endif typedef enum { umFlowedDir, umCharsetDir, umHtmlDir, umRichDir, umDirCount } umDirectives; typedef enum { umHeaderState, umFlowedState, umCharsetState, umHtmlState, umRichState, umTextState, } umStates; typedef struct { Byte xState; Byte tState; Byte cState; Byte oState; } UniGlobalsState; static struct UniGlobals { TECObjectRef internetToUTF8; TextEncoding internetToUTF8Encoding; Boolean maclatin1; UnicodeToTextRunInfo UTF8ToMac; IntlConverter quickConverter; ByteCount origTextLen; TextPtr *origText; ByteCount convTextLen; TextPtr *convText; ItemCount convRunsCount; TextEncodingRun **convRuns; FormatOrderPtr *order; UInt16 tecVersion; Boolean hasTextRunFlag; Boolean has88591VariantsFlag; } uGlobals = {0}; OSStatus ConvertUTF8Text(BytePtr theText, ByteCount bufLen); OSStatus DrawConvertedUTF8(short width, ScriptFontInfo fonts); OSStatus MeasureConvertedUTF8(short *width, FontInfo *maxFont, Boolean *rightToLeft, ScriptFontInfo fonts); pascal long MyVisibleLength(Ptr textPtr, long len, short direction); pascal Boolean MyRlDirProc(short theFormat, void *dirParam); long EnsureHandleSize(Handle h, Size s); void NoPurgeUniGlobals(UniGlobalsState *uniState); void ResetUniGlobals(UniGlobalsState *uniState); OSStatus GetUnicodeHint(TextEncoding encoding, StringPtr lang, UniChar *hint); OSStatus ClearIntlConverterContext(UnicodeToTextRunInfo converter, Boolean utf8); OSStatus UpdateIntlConverterLo(IntlConverter *converter, StringPtr charset, TextEncoding encoding); OSStatus UpdateTECConverter(TECObjectRef *converter, StringPtr charset, TextEncoding newEncoding, TextEncoding *fromEncoding, TextEncoding toEncoding, Boolean *maclatin1, UniChar *hint); UniCharCount UTF8CharCount(BytePtr utf8, ByteCount bufLen); OSErr AccuEnsureSize(AccuPtr a, long len, long incr); short GetIntlFont(ScriptCode script, ScriptFontInfo fonts); short GetIntlSize(ScriptCode script, ScriptFontInfo fonts); void CleanISO2022(TextPtr text, long len, IntlConverter *converter); OSStatus InsertIntlHeaders(Handle text, long len, long tOff, AccuPtr a, TextEncoding encoding, PETEHandle pte, long *pOff); long ParseCharset(Ptr textPtr, long len, PStr charset, umDirectives c); OSErr PeteGetStyleRun(PETEHandle pte, long offset, long *len, PETEStyleInfoPtr style, long validBits); OSStatus MyTECConvertText(TECObjectRef encodingConverter, ConstTextPtr inputBuffer, ByteCount inputBufferLength, ByteCount *actualInputLength, TextPtr outputBuffer, ByteCount outputBufferLength, ByteCount *actualOutputLength, Boolean maclatin1); long UnicodeMappingCount(TextEncoding encoding); OSStatus InitUnicode() { UnicodeMapping tempMapping; TECInfoHandle info; OSStatus statusCode; Zero(uGlobals); if((long)TECGetInfo == kUnresolvedCFragSymbolAddress) { Log(-1, "\pNo Unicode"); return noErr; } statusCode = TECGetInfo(&info); if(statusCode != noErr) { Log(-1, "\pTECGetInfo failed"); return noErr; } uGlobals.tecVersion = (**info).tecVersion; uGlobals.hasTextRunFlag = !(!((**info).tecUnicodeConverterFeatures & kTECAddTextRunHeuristicsMask)); uGlobals.has88591VariantsFlag = UnicodeMappingCount(DefaultEncoding(kTextEncodingMacRomanLatin1)) > 1; tempMapping.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicodeUTF8Format); statusCode = CreateUnicodeToTextRunInfo(0, &tempMapping, &uGlobals.UTF8ToMac); if(statusCode != noErr) { Log(-1, "\pUTF8 To Mac failed"); return statusCode; } statusCode = CreateIntlConverter(&uGlobals.quickConverter, kTextEncodingUnknown); if(statusCode == noErr) { if(((uGlobals.origText = NuHTempBetter(0)) == nil) || ((uGlobals.convText = NuHTempBetter(0)) == nil) || ((uGlobals.convRuns = NuHTempBetter(2 * sizeof(TextEncodingRun))) == nil) || ((uGlobals.order = NuHTempBetter(0)) == nil)) { statusCode = memFullErr; } else { HPurge((Handle)uGlobals.origText); HPurge((Handle)uGlobals.convText); HPurge((Handle)uGlobals.convRuns); HPurge((Handle)uGlobals.order); } } if(statusCode != noErr) CleanupUnicode(); return statusCode; } void CleanupUnicode() { if(uGlobals.tecVersion > 0) return; DisposeUnicodeToTextRunInfo(&uGlobals.UTF8ToMac); if(uGlobals.internetToUTF8) { TECDisposeConverter(uGlobals.internetToUTF8); uGlobals.internetToUTF8 = 0; } DisposeIntlConverter(uGlobals.quickConverter); ZapHandle(uGlobals.origText); ZapHandle(uGlobals.convText); ZapHandle(uGlobals.convRuns); ZapHandle(uGlobals.order); Zero(uGlobals); } Boolean UTF8ToRoman(BytePtr theText, ByteCount *textLen, ByteCount bufLen) { if(ConvertUTF8Text(theText, *textLen) != noErr) return false; if(uGlobals.convRunsCount != 1) return false; if(ResolveDefaultTextEncoding((*uGlobals.convRuns)[0].textEncoding) != ResolveDefaultTextEncoding(DefaultEncoding(kTextEncodingMacRoman))) return false; *textLen = MIN(bufLen,uGlobals.convTextLen); BMD(*(uGlobals.convText), theText, *textLen); return true; } Boolean RomanToUTF8(PStr s, long strSize) { UniChar temp[128]; TextToUnicodeInfo info; UnicodeMapping tempMapping; OSStatus statusCode; ByteCount len; Zero(tempMapping); tempMapping.mappingVersion = kUnicodeUseLatestMapping; tempMapping.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicodeUTF8Format); tempMapping.otherEncoding = CreateTextEncoding(kTextEncodingMacRoman, kTextEncodingDefaultVariant, kTextEncodingDefaultFormat); statusCode = CreateTextToUnicodeInfo(&tempMapping, &info); if(statusCode) return statusCode; statusCode = ConvertFromPStringToUnicode(info, s, strSize-1, &len, temp); if(statusCode && statusCode != kTECOutputBufferFullStatus && statusCode != kTECUsedFallbacksStatus) return statusCode; BlockMoveData(&temp, &s[1], len); s[0] = len; return noErr; } OSStatus DrawUTF8Text(BytePtr theText, ByteCount bufLen, short width, ScriptFontInfo fonts) { OSStatus statusCode; statusCode = ConvertUTF8Text(theText, bufLen); if(statusCode != noErr) { return statusCode; } return DrawConvertedUTF8(width, fonts); } OSStatus DrawConvertedUTF8(short width, ScriptFontInfo fonts) { ItemCount run, runDisp; OSStatus statusCode; ScriptCode script; short fontID, oldFont, oldSize; GrafPtr port; Boolean lineRight; ByteOffset offset; ByteCount runLen; UniGlobalsState uniState; NoPurgeUniGlobals(&uniState); GetPort(&port); oldFont = GetPortTextFont(port); oldSize = GetPortTextSize(port); for(statusCode = noErr, run = 0; statusCode == noErr && run < uGlobals.convRunsCount; ++run) { runDisp = (**uGlobals.order)[run]; statusCode = RevertTextEncodingToScriptInfo((*uGlobals.convRuns)[runDisp].textEncoding, &script, nil, GlobalTemp); if(statusCode == noErr) { if(GlobalTemp[0] != 0) { GetFNum(GlobalTemp, &fontID); } else { fontID = GetIntlFont(script, fonts); } TextFont(fontID); TextSize(GetIntlSize(script, fonts)); if(run == 0) { lineRight = !(!(Boolean)GetScriptVariable(script, smScriptRight)); if(lineRight) { short fullWidth; statusCode = MeasureConvertedUTF8(&fullWidth, nil, nil, fonts); if(statusCode != noErr) break; if(width < fullWidth) { Move(width - fullWidth, 0); } } } offset = (*uGlobals.convRuns)[runDisp].offset; runLen = ((runDisp == uGlobals.convRunsCount - 1) ? uGlobals.convTextLen : (*uGlobals.convRuns)[runDisp + 1].offset) - offset; HLock((Handle)uGlobals.convText); if((**uGlobals.order)[run] == uGlobals.convRunsCount - 1) { runLen = MyVisibleLength(*uGlobals.convText + offset, runLen, lineRight ? rightCaret : leftCaret); } DrawText(*uGlobals.convText, offset, runLen); HUnlock((Handle)uGlobals.convText); } } TextSize(oldSize); TextFont(oldFont); ResetUniGlobals(&uniState); return statusCode; } OSStatus MeasureUTF8Text(BytePtr theText, ByteCount bufLen, short *width, FontInfo *maxFont, Boolean *rightToLeft, ScriptFontInfo fonts) { OSStatus statusCode; statusCode = ConvertUTF8Text(theText, bufLen); if(statusCode != noErr) { return statusCode; } return MeasureConvertedUTF8(width, maxFont, rightToLeft, fonts); } OSStatus MeasureConvertedUTF8(short *width, FontInfo *maxFont, Boolean *rightToLeft, ScriptFontInfo fonts) { GrafPtr port; short fontID, oldFont, oldSize; OSStatus statusCode; ItemCount run; ByteOffset offset; ByteCount runLen; ScriptCode script; FontInfo fInfo; Boolean lineRight; UniGlobalsState uniState; NoPurgeUniGlobals(&uniState); if(width != nil) *width = 0; if(maxFont != nil) Zero(*maxFont); if(rightToLeft != nil) *rightToLeft = false; GetPort(&port); oldFont = GetPortTextFont(port); oldSize = GetPortTextSize(port); for(statusCode = noErr, run = 0; statusCode == noErr && run < uGlobals.convRunsCount; ++run) { statusCode = RevertTextEncodingToScriptInfo((*uGlobals.convRuns)[run].textEncoding, &script, nil, GlobalTemp); if(statusCode == noErr) { if(GlobalTemp[0] != 0) { GetFNum(GlobalTemp, &fontID); } else { fontID = GetIntlFont(script, fonts); } TextFont(fontID); TextSize(GetIntlSize(script, fonts)); if(maxFont != nil) { GetFontInfo(&fInfo); if(fInfo.ascent > maxFont->ascent) { maxFont->ascent = fInfo.ascent; } if(fInfo.descent > maxFont->descent) { maxFont->descent = fInfo.descent; } if(fInfo.widMax > maxFont->widMax) { maxFont->widMax = fInfo.widMax; } if(fInfo.leading > maxFont->leading) { maxFont->leading = fInfo.leading; } } if(run == 0) { lineRight = !(!(Boolean)GetScriptVariable(script, smScriptRight)); if(rightToLeft != nil) *rightToLeft = lineRight; } if(width != nil) { offset = (*uGlobals.convRuns)[run].offset; runLen = ((run == uGlobals.convRunsCount - 1) ? uGlobals.convTextLen : (*uGlobals.convRuns)[run + 1].offset) - offset; HLock((Handle)uGlobals.convText); if((**uGlobals.order)[run] == uGlobals.convRunsCount - 1) { runLen = MyVisibleLength(*uGlobals.convText + offset, runLen, lineRight ? rightCaret : leftCaret); } *width += TextWidth(*uGlobals.convText, offset, runLen); HUnlock((Handle)uGlobals.convText); } else if(maxFont == nil) { break; } } } TextSize(oldSize); TextFont(oldFont); ResetUniGlobals(&uniState); return statusCode; } OSStatus ConvertUTF8Text(BytePtr theText, ByteCount bufLen) { OSStatus statusCode = noErr; long textSize, runCount; ByteCount inputRead, readTotal, outputLen, tempLen; ItemCount numRuns; OptionBits flags; ScriptCode script; UniGlobalsState uniState; BytePtr tempText; DECLARE_UPP(MyRlDirProc,StyleRunDirection); INIT_UPP(MyRlDirProc,StyleRunDirection); flags = kUnicodeKeepInfoMask | kUnicodeUseFallbacksMask | kUnicodeDefaultDirectionMask | kUnicodeLooseMappingsMask | kUnicodeTextRunMask; if(uGlobals.hasTextRunFlag) flags |= /*kUnicodeKeepSameEncodingMask | */kUnicodeTextRunHeuristicsMask; if((*uGlobals.convText != nil) && (*uGlobals.convRuns != nil) && (*uGlobals.order != nil) && (*uGlobals.origText != nil) && (bufLen == uGlobals.origTextLen) && (!memcmp(*uGlobals.origText, theText, bufLen))) { return noErr; } NoPurgeUniGlobals(&uniState); uGlobals.convTextLen = 0; uGlobals.convRunsCount = 0; readTotal = 0; if((uGlobals.tecVersion == 0x0180) && (GetOSVersion() >= 0x1020)) { // The 10.2 version of the TEC is converting 16-bit Unicode, not UTF-8 // So, convert the UTF-8 to 16-bit Unicode and put it into uGlobals.origText temporarily tempText = theText; tempLen = UTF8CharCount(tempText, bufLen) * sizeof(UniChar); statusCode = EnsureHandleSize((Handle)uGlobals.origText, tempLen); if(statusCode > noErr) { UniChar *curUniChar = (UniChar *)*(uGlobals.origText); UniCharCount c = tempLen / sizeof(UniChar); while(c--) { *curUniChar++ = UTF8ToUniChar(tempText, &tempText); } statusCode = noErr; } } else { tempLen = GoodUTF8Len(theText, bufLen); } runCount = EnsureHandleSize((Handle)uGlobals.convRuns, 2 * sizeof(TextEncodingRun)) / sizeof(TextEncodingRun); textSize = EnsureHandleSize((Handle)uGlobals.convText, bufLen); if(!statusCode && ((statusCode = runCount) > noErr) && ((statusCode = textSize) > noErr)) do { HLock((Handle)uGlobals.convText); HLock((Handle)uGlobals.convRuns); if((uGlobals.tecVersion == 0x0180) && (GetOSVersion() >= 0x1020)) { HLock((Handle)uGlobals.origText); tempText = *(uGlobals.origText); } else { tempText = theText; } statusCode = ConvertFromUnicodeToTextRun(uGlobals.UTF8ToMac, tempLen - readTotal, (ConstUniCharArrayPtr)(tempText + readTotal), flags, 0, nil, nil, nil, textSize - uGlobals.convTextLen, &inputRead, &outputLen, *uGlobals.convText + uGlobals.convTextLen, runCount - uGlobals.convRunsCount, &numRuns, *uGlobals.convRuns + uGlobals.convRunsCount); HUnlock((Handle)uGlobals.convText); HUnlock((Handle)uGlobals.convRuns); if((uGlobals.tecVersion == 0x0180) && (GetOSVersion() >= 0x1020)) HUnlock((Handle)uGlobals.origText); uGlobals.convTextLen += outputLen; uGlobals.convRunsCount += numRuns; readTotal += inputRead; if(statusCode == kTECArrayFullErr) { SetHandleSize((Handle)uGlobals.convRuns, ++runCount * sizeof(TextEncodingRun)); goto CheckError; } else if(statusCode == kTECOutputBufferFullStatus) { ResizeBuffer : SetHandleSize((Handle)uGlobals.convText, textSize += 1 K); CheckError : if((statusCode = MemError()) != noErr) { break; } } else { if(statusCode == kTECUsedFallbacksStatus) statusCode = noErr; break; } } while(true); if((statusCode == noErr) && ((statusCode = EnsureHandleSize((Handle)uGlobals.order, sizeof(FormatOrder) * uGlobals.convRunsCount)) > noErr)) { if(uGlobals.convRunsCount == 1) { (**uGlobals.order)[0] = 0; statusCode = noErr; } else { statusCode = RevertTextEncodingToScriptInfo((*uGlobals.convRuns)[0].textEncoding, &script, nil, nil); if(statusCode == noErr) { HLock((Handle)uGlobals.order); GetFormatOrder(**uGlobals.order, 0, uGlobals.convRunsCount - 1, !(!(Boolean)GetScriptVariable(script, smScriptRight)), MyRlDirProcUPP, (Ptr)uGlobals.convRuns); HUnlock((Handle)uGlobals.order); } } } if((statusCode == noErr) && ((textSize = EnsureHandleSize((Handle)uGlobals.origText, bufLen)) > noErr)) { uGlobals.origTextLen = bufLen; BMD(theText, *uGlobals.origText, bufLen); } else { uGlobals.origTextLen = 0L; } ClearIntlConverterContext(uGlobals.UTF8ToMac, true); ResetUniGlobals(&uniState); return statusCode; } pascal Boolean MyRlDirProc(short theFormat, void *dirParam) { ScriptCode script; TextEncodingRun **convRuns = (TextEncodingRun **)dirParam; return((RevertTextEncodingToScriptInfo((*convRuns)[theFormat].textEncoding, &script, nil, nil) == noErr) && (Boolean)GetScriptVariable(script, smScriptRight)); } long EnsureHandleSize(Handle h, Size s) { OSErr errCode; long s2; if(h == nil) return nilHandleErr; if(*h == nil) ReallocateHandle(h,s); else { s2 = InlineGetHandleSize(h); if(s2 < s) { SetHandleSize(h,s); } else s = s2; } errCode = MemError(); return (errCode == noErr) ? s : errCode; } void NoPurgeUniGlobals(UniGlobalsState *uniState) { uniState->xState = HGetState((Handle)uGlobals.origText); HNoPurge((Handle)uGlobals.convText); uniState->tState = HGetState((Handle)uGlobals.convText); HNoPurge((Handle)uGlobals.convText); uniState->cState = HGetState((Handle)uGlobals.convRuns); HNoPurge((Handle)uGlobals.convRuns); uniState->oState = HGetState((Handle)uGlobals.order); HNoPurge((Handle)uGlobals.order); } void ResetUniGlobals(UniGlobalsState *uniState) { HSetState((Handle)uGlobals.origText, uniState->xState); HSetState((Handle)uGlobals.convText, uniState->tState); HSetState((Handle)uGlobals.convRuns, uniState->cState); HSetState((Handle)uGlobals.order, uniState->oState); } /* Pass -1 for inOff if inText is a pointer instead of a handle. Pass either charset or encoding. */ OSStatus InternetToUTF8Text(StringPtr charset, TextEncoding encoding, ConstTextPtr *inText, long inOff, ByteCount inLen, AccuPtr a, Boolean hint) { OSStatus err; Byte iState, aState; ByteCount usedILen, usedOLen; UniChar hintChar; ConstTextPtr textPtr; err = UpdateTECConverter(&uGlobals.internetToUTF8, charset, encoding, &uGlobals.internetToUTF8Encoding, UTF8_ENCODING, &uGlobals.maclatin1, &hintChar); if(err && err != kTECNoConversionPathErr) return err; if(hint && hintChar) { unsigned char hint8[MAX_UTF8_CHAR_LEN]; ByteCount len; len = UniCharToUTF8(hintChar, hint8, sizeof(hint8)); AccuAddPtr(a, hint8, len); } if(err == kTECNoConversionPathErr) { if(inOff >= 0) { err = AccuAddFromHandle(a, (Handle)inText, inOff, inLen); } else { err = AccuAddPtr(a, (Ptr)inText, inLen); } return err; } aState = HGetState(a->data); if(inOff >= 0) { iState = HGetState((Handle)inText); } else { textPtr = (ConstTextPtr)inText; } do { if((err = AccuEnsureSize(a, MAX(inLen, 1 K), 1 K)) == noErr) { HLock(a->data); if(inOff >= 0) { textPtr = LDRef(inText) + inOff; } err = MyTECConvertText(uGlobals.internetToUTF8, textPtr, inLen, &usedILen, *(a->data) + a->offset, a->size - a->offset, &usedOLen, uGlobals.maclatin1); if(inOff >= 0) { HSetState((Handle)inText, iState); } HSetState(a->data, aState); if(!err || (err == kTECOutputBufferFullStatus)) { a->offset += usedOLen; inLen -= usedILen; if(inOff >= 0) { inOff += usedILen; } else { textPtr += usedILen; } } if(!err) { do { if((err = AccuEnsureSize(a, kUnicodeMinBufSize, 1 K)) == noErr) { HLock(a->data); err = TECFlushText(uGlobals.internetToUTF8, *(a->data) + a->offset, a->size - a->offset, &usedOLen); HSetState(a->data, aState); if(!err || (err == kTECOutputBufferFullStatus)) { a->offset += usedOLen; } } } while(err == kTECOutputBufferFullStatus); } } } while(err == kTECOutputBufferFullStatus); TECClearConverterContextInfo(uGlobals.internetToUTF8); return err; } OSStatus GetUnicodeHint(TextEncoding encoding, StringPtr lang, UniChar *hint) { ScriptCode script; OSStatus err; // LocaleStringToLangAndRegionCodes(string, &langCode, nil); if(uGlobals.tecVersion < 0x0150) return kTextUnsupportedEncodingErr; err = NearestMacTextEncodings(encoding, &encoding, nil); if(err) return err; err = RevertTextEncodingToScriptInfo(encoding, &script, nil, nil); if(!err) switch(script) { case smJapanese : *hint = kUnicodeSourceHintJapanese; break; case smTradChinese : *hint = kUnicodeSourceHintChineseTraditional; break; case smKorean : *hint = kUnicodeSourceHintKorean; break; case smSimpChinese : *hint = kUnicodeSourceHintChineseSimplified; break; default : *hint = kUnicodeSourceHintDefault; } return err; } OSStatus CreateIntlConverter(IntlConverter *converter, TextEncoding encoding) { OSStatus err; converter->uniCount = 0L; converter->lastCharScript = smSystemScript; converter->lastCharType = 0; converter->flags = kUnicodeDefaultDirectionMask; converter->uniHandle = NuHTempBetter(129); if(!converter->uniHandle) { Log(-1, "\pCreate uniHandle failed"); return MemError(); } converter->inToUnicode = converter->unicodeToMac = 0; if(uGlobals.tecVersion == 0) { err = noErr; } else { converter->inToUnicodeEncoding = (encoding == kTextEncodingUnknown) ? CreateSystemRomanEncoding() : encoding; err = UpdateIntlConverter(converter, nil); if(err) goto HDispose; err = CreateUnicodeToTextRunInfo(0, nil, &converter->unicodeToMac); } if(err) { Log(-1L, "\pUnicode To Mac failed"); TECDisposeConverter(converter->inToUnicode); HDispose : ZapHandle(converter->uniHandle); } return err; } OSStatus ConvertIntlText(IntlConverter *converter, StringPtr inText, ByteCount *inLen, StringPtr outText, ByteCount *outLen, TextEncoding *encoding, WordSpaceEnum addSpace, ByteOffset *spaceOffset) { OptionBits flags; OSStatus err; ByteCount origILen, origOLen, usedOLen, actILen, usedUniLen, uniBufSize; Boolean full; ItemCount runCount, spaceCount; ByteOffset iSpaceOffset, oSpaceOffset; TextEncodingRun encodingRun; // ScriptCode script; Byte hState; if(converter == nil) { return paramErr; } if(converter->unicodeToMac == nil) { *encoding = kTextEncodingUnknown; if(spaceOffset != nil) { *spaceOffset = ULONG_MAX; } if(addSpace != dontAddWordSpace) { *outText++ = ' '; --*outLen; } err = (*inLen <= *outLen) ? noErr : kTECOutputBufferFullStatus; *inLen = *outLen = MIN(*outLen, *inLen); BMD(inText, outText, *outLen); if(addSpace != dontAddWordSpace) { ++*outLen; } if(converter->table != NO_TABLE) { TransLitRes(outText, *outLen, converter->table); } return err; } hState = HGetState(converter->uniHandle); HLock(converter->uniHandle); uniBufSize = GetHandleSize(converter->uniHandle); origILen = *inLen; origOLen = *outLen; *outLen = *inLen = usedOLen = 0L; flags = converter->flags | kUnicodeUseFallbacksMask | kUnicodeLooseMappingsMask | kUnicodeTextRunMask | kUnicodeKeepInfoMask; if(uGlobals.hasTextRunFlag) flags |= /*kUnicodeKeepSameEncodingMask | */kUnicodeTextRunHeuristicsMask; do { if(addSpace != dontAddWordSpace) { spaceCount = 1; iSpaceOffset = (converter->uniCount * sizeof(UniChar)); (*converter->uniHandle)[converter->uniCount++] = kUnicodeSpaceChar; addSpace = dontAddWordSpace; } else { spaceCount = 0; } if((*inLen <= origILen || origILen == 0L) && (converter->uniCount < ((uniBufSize - sizeof(UniChar)) / sizeof(UniChar)))) { UniChar *curBuf = (*converter->uniHandle) + converter->uniCount; ByteCount curSize = uniBufSize - ((converter->uniCount + 1) * sizeof(UniChar)); if(converter->inToUnicode == nil) { actILen = usedUniLen = MIN(origILen - *inLen, curSize); if(usedUniLen > curSize) err = kTECOutputBufferFullStatus; else err = noErr; BMD(inText + *inLen, curBuf, usedUniLen); } else if((origILen != 0L) && (origILen != *inLen)) { if(converter->iso2022) CleanISO2022(inText + *inLen, origILen - *inLen, converter); err = MyTECConvertText(converter->inToUnicode, inText + *inLen, origILen - *inLen, &actILen, curBuf, curSize, &usedUniLen, converter->maclatin1); } else { actILen = 0L; err = TECFlushText(converter->inToUnicode, curBuf, curSize, &usedUniLen); } full = ((err == kTECOutputBufferFullStatus) || (err == kTECBufferBelowMinimumSizeErr)); if(!full && err) break; converter->uniCount += (usedUniLen / sizeof(UniChar)); *inLen += actILen; } else { full = (origILen != 0L); } ConvertUnicode : err = ConvertFromUnicodeToTextRun(converter->unicodeToMac, converter->uniCount * sizeof(UniChar), *converter->uniHandle, flags, spaceCount, &iSpaceOffset, &spaceCount, &oSpaceOffset, origOLen - *outLen, &usedUniLen, &usedOLen, outText + *outLen, 1, &runCount, &encodingRun); *encoding = encodingRun.textEncoding; switch(err) { case kTECUsedFallbacksStatus : err = noErr; goto MoveIt; case kTECBufferBelowMinimumSizeErr : err = kTECOutputBufferFullStatus; case kTECOutputBufferFullStatus : case kTECArrayFullErr : case noErr : MoveIt : if((runCount > 0) && (usedOLen > 0)) { /* RevertTextEncodingToScriptInfo(encodingRun.textEncoding, &script, nil, nil); if(spaceCount != 0) { short charType; charType = CharacterType(outText, 0, script); } converter->lastCharScript = script; converter->lastCharType = CharacterType(outText, *outLen - 1, script); */ if(spaceOffset != nil) { if(spaceCount != 0) { *spaceOffset = oSpaceOffset + *outLen; } else { *spaceOffset = ULONG_MAX; } } } converter->uniCount -= usedUniLen / sizeof(UniChar); if(converter->uniCount > 0) { BMD((*converter->uniHandle) + (usedUniLen / sizeof(UniChar)), *converter->uniHandle, converter->uniCount * sizeof(UniChar)); if((usedOLen == 0) && (err == kTECArrayFullErr)) goto ConvertUnicode; } *outLen += usedOLen; break; default : if(converter->inToUnicode == nil) { converter->uniCount -= (actILen / sizeof(UniChar)) + spaceCount; } } } while(full && !err); if(!err && (inText == (StringPtr)-1L) && (converter->unicodeToMac != nil)) { ClearIntlConverterContext(converter->unicodeToMac, false); } HSetState(converter->uniHandle, hState); return err; } void DisposeIntlConverter(IntlConverter converter) { if(converter.unicodeToMac != nil) { TECDisposeConverter(converter.inToUnicode); DisposeUnicodeToTextRunInfo(&converter.unicodeToMac); converter.inToUnicode = converter.unicodeToMac = nil; } ZapHandle(converter.uniHandle); } OSStatus UpdateIntlConverter(IntlConverter *converter, StringPtr charset) { OSStatus err; UniChar hint; if(uGlobals.tecVersion == 0) { if((charset == nil) || (charset[0] == 0) || EqualStrRes(charset,MIME_MAC)) { converter->table = NO_TABLE; return noErr; } else { converter->table = FindMIMECharset(charset); return (converter->table == NO_TABLE) ? kTextUnsupportedEncodingErr : noErr; } } err = UpdateTECConverter(&converter->inToUnicode, charset, charset ? kTextEncodingUnknown : converter->inToUnicodeEncoding, &converter->inToUnicodeEncoding, CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat), &converter->maclatin1, &hint); if(!err) { TextEncodingBase base; if(hint) { (*converter->uniHandle)[converter->uniCount++] = hint; } base = GetTextEncodingBase(converter->inToUnicodeEncoding); converter->iso2022 = (base <= kTextEncodingISO_2022_KR && base >= kTextEncodingISO_2022_JP); converter->inDouble = false; } else { converter->iso2022 = false; } // If the transfer code has already taken a stab at // transliteration, it will pass in a charset of // orginal-charset/new-charset. Right now, we take // this as a signal we'd better not do further transliteration // At some future date, we may be able to undo some or all // of the damage that was done if this original transliteration // proved to be in error if (charset && PIndex(charset,'/')) converter->alreadyTransliterated = true; return err; } OSStatus EncodingPlusPeteStyle(TextEncoding encoding, PETEStyleEntry *pse, ScriptCode *outScript) { ScriptCode script, oldScript; LangCode lang; OSStatus err; if(uGlobals.tecVersion == 0) return noErr; err = RevertTextEncodingToScriptInfo(encoding, &script, &lang, GlobalTemp); if(err) return err; if(GlobalTemp[0] != 0) { GetFNum(GlobalTemp, &pse->psStyle.textStyle.tsFont); pse->psStyle.textStyle.tsLang = langUnspecified; script = smUninterp; } else { Boolean defaultFont = (pse->psStyle.textStyle.tsFont == kPETEDefaultFont) || (pse->psStyle.textStyle.tsFont == kPETEDefaultFixed); if(pse->psStyle.textStyle.tsLang == langUnspecified) { oldScript = smUninterp; } else if(!defaultFont) { oldScript = FontToScript(pse->psStyle.textStyle.tsFont); pse->psStyle.textStyle.tsLang = (short)GetScriptVariable(oldScript, smScriptLang); } else { TextEncoding tempEncoding; err = UpgradeScriptInfoToTextEncoding(kTextScriptDontCare, pse->psStyle.textStyle.tsLang, kTextRegionDontCare, nil, &tempEncoding); if(err) return err; err = RevertTextEncodingToScriptInfo(tempEncoding, &oldScript, nil, nil); if(err) return err; } if(lang != kTextLanguageDontCare) { if(pse->psStyle.textStyle.tsLang != lang) { pse->psStyle.textStyle.tsLang = lang; if(!defaultFont) { pse->psStyle.textStyle.tsFont = kPETEDefaultFont; } } } else { if(script != oldScript) { pse->psStyle.textStyle.tsLang = (short)GetScriptVariable(script, smScriptLang); if(!defaultFont) { pse->psStyle.textStyle.tsFont = kPETEDefaultFont; } } } } if(outScript != nil) *outScript = script; return noErr; } OSStatus ClearIntlConverterContext(UnicodeToTextRunInfo converter, Boolean utf8) { UniChar twoChars[2]; unsigned char twoUTF8Chars[2 * MAX_UTF8_CHAR_LEN]; void *iUnicodeStr; unsigned char buffer[kUnicodeMinBufSize]; TextEncodingRun encodingRun; ByteCount oLen, iLen; ItemCount runs; OptionBits flags; flags = kUnicodeUseFallbacksMask | kUnicodeDefaultDirectionMask | kUnicodeLooseMappingsMask | kUnicodeTextRunMask; if(uGlobals.hasTextRunFlag) flags |= /*kUnicodeKeepSameEncodingMask | */kUnicodeTextRunHeuristicsMask; if(utf8 && (uGlobals.tecVersion != 0x0180 || (GetOSVersion() < 0x1020))) { iLen = UniCharToUTF8(kUnicodeSourceHintDefault, &twoUTF8Chars[0], sizeof(twoUTF8Chars)); iLen += UniCharToUTF8(kUnicodeSpaceChar, &twoUTF8Chars[iLen], sizeof(twoUTF8Chars) - iLen); iUnicodeStr = &twoUTF8Chars[0]; } else { twoChars[0] = kUnicodeSourceHintDefault; twoChars[1] = kUnicodeSpaceChar; iUnicodeStr = &twoChars[0]; iLen = 2 * sizeof(UniChar); } return ConvertFromUnicodeToTextRun(converter, iLen, iUnicodeStr, flags, 0, nil, 0, nil, sizeof(buffer), &iLen, &oLen, buffer, 1, &runs, &encodingRun); } OSStatus UpdateTECConverter(TECObjectRef *converter, StringPtr charset, TextEncoding newEncoding, TextEncoding *fromEncoding, TextEncoding toEncoding, Boolean *maclatin1, UniChar *hint) { TECObjectRef tempConverter; OSStatus err = noErr; UPtr spot = nil; *maclatin1 = false; if(charset && charset[0] != 0) { Str255 string; spot = &charset[1]; PTokenPtr(&charset[1], charset[0], string, &spot, "*"); if(EqualStrRes(string, MIME_ISO_LATIN1)) { newEncoding = DefaultEncoding(kTextEncodingWindowsLatin1); } else { TextEncoding tempEncoding; err = TECGetTextEncodingFromInternetName(&tempEncoding, string); if(err) { if(newEncoding == kTextEncodingUnknown) return err; } else newEncoding = tempEncoding; // Work around another Apple bug if((uGlobals.tecVersion >= 0x0170 && uGlobals.tecVersion < 0x0190) && ((newEncoding & 0x0000FF00) == 0x00000100)) // It's Unicode newEncoding &= 0xFFFFFF00; // Make it default Unicode instead of explicit version } } if(!*converter || newEncoding != *fromEncoding) { if(toEncoding != kTextEncodingUnknown) err = TECCreateConverter(&tempConverter, newEncoding, toEncoding); else err = TECCreateOneToManyConverter(&tempConverter, newEncoding, 0, nil); if(!err) { if(*converter) TECDisposeConverter(*converter); *converter = tempConverter; *fromEncoding = newEncoding; } } if(!err && hint) { Str255 string; if(GetUnicodeHint(newEncoding, (spot && PTokenPtr(&charset[1],charset[0],string,&spot,"*")) ? string : nil, hint)) *hint = 0; } return err; } ByteCount UniCharToUTF8(UniChar c, unsigned char *utf8, ByteCount maxLen) { /* Bonehead! */ if(maxLen == 0) return 0; /* Plain ASCII just return it */ if(c <= 0x7F) { *utf8++ = c; /* Null terminate if there's room */ if(maxLen > 1) *utf8 = 0; return 1; } else { unsigned long lc; long mask = 0xFFFFFFE0; ByteCount len; /* * Count the bytes needed; trim off low six bits and see if the * remainder will fit in the first byte */ for(lc = c, len = 2; (lc >>=6) & mask; mask >>= 1, ++len) ; /* More than can fit? */ if(len > maxLen) return 0; /* Fill it in from last byte to first */ utf8 += len; /* Null terminate if there's room */ if(len < maxLen) *utf8 = 0; lc = c; do { /* Drop in the low six bits and set the high bit */ *--utf8 = ((lc & 0x3F) | 0x80); lc >>= 6; } while(lc & mask); /* Put the rest of the bits in the first byte */ *--utf8 = (((mask << 1) & 0x000000FF) | lc); return len; } } UniChar UTF8ToUniChar(unsigned char *utf8, unsigned char **next) { UniChar r; /* Plain ASCII just return it */ if(*utf8 <= 0x7F) { r = *utf8++; } else { int c; unsigned char b, m; /* * Count how many bytes there are; check how many high bits are set * and that'll be the byte count. Also, form the mask to grab the * low bits out of the first byte. */ for(c = 0, b = *utf8, m = 0x3F; ((b <<= 1) & 0x80); ++c) m >>= 1; /* Grab the low bits of the first byte */ r = *utf8++ & m; while(c--) { /* Grab the low six bits of each byte and shift them into r */ r <<= 6; r |= (*utf8++ & 0x3F); } } if(next) *next = utf8; return r; } UniCharCount UTF8CharCount(BytePtr utf8, ByteCount bufLen) { UniCharCount count = 0; UTF8Char b; ByteCount tempLen; while(bufLen) { tempLen = 1; if((b = *utf8++) > 0x7F) { while((b <<= 1) & 0x80) { ++tempLen; } if(tempLen > bufLen) break; utf8 += tempLen; } bufLen -= tempLen; ++count; } return count; } /* Return the length of the buffer that contains legal UTF-8 (remove trailing partials) */ ByteCount GoodUTF8Len(BytePtr utf8, ByteCount bufLen) { UTF8Char b; ByteCount newLen = 0, tempLen; while(bufLen) { // It's at least 1 if it's US-ASCII tempLen = 1; // If it's not US-ASCII.... if((b = *utf8++) > 0x7F) { /* * The first byte of any UTF-8 sequence has the high bit set followed by * a bit set for each subsequent byte in the sequence. So, count how many * bits are set from high to low (not including the first one) and that * tells how many bytes follow this one */ while((b <<= 1) & 0x80) { ++tempLen; } /* * If the first byte said there are more than the remaining buffer in the * sequence, don't add that to the length. */ if(tempLen > bufLen) break; // Move past this sequence of UTF-8 bytes (we already moved past the first) utf8 += tempLen - 1; } // Add the length used to the total, and subtract it from what's left newLen += tempLen; bufLen -= tempLen; } return newLen; } OSErr AccuEnsureSize(AccuPtr a, long len, long incr) { if(a->data && (a->size - a->offset >= len)) return noErr; if (!a->data) { a->offset = 0; a->size = MAX(len, 1 K); a->data = NuHTempBetter(a->size); } else { a->size += len + incr; SetHandleBig_(a->data,a->size); } return(a->err=MemError()); } short GetIntlFont(ScriptCode script, ScriptFontInfo fonts) { if(fonts[script].fontID > applFont) return fonts[script].fontID; else return LoWord(GetScriptVariable(script, smScriptAppFondSize)); } short GetIntlSize(ScriptCode script, ScriptFontInfo fonts) { if(fonts[script].fontSize > 0) return fonts[script].fontSize; else return HiWord(GetScriptVariable(script, smScriptAppFondSize)); } void CleanISO2022(TextPtr text, long len, IntlConverter *converter) { // state = 2022AnyByte; for(; --len >= 0; ++text) { /* switch state : case 2022AnyByte : if(text[offset] == 0x1B) { state = 2022Esc; break; } if(text[offset] <= 0x7F) break; */ if(*text == 0x1B) { if(++text, --len >= 0) { --len; switch(*text++) { case '$' : converter->inDouble = true; break; case '(' : converter->inDouble = false; } } } else { if(converter->inDouble && (len > 0)) { --len; if((*text++ < 0x21) || (*(text - 1) > 0x7E) || (*text < 0x21) || (*text > 0x7E)) { *(text - 1) = 0x21; *text = 0x21; } } else if(*text > 0x7F) { *text = '?'; } } } } Boolean Find2047(UPtr chars, long len, long *ewOff, long *ewTextOff, long *ewEndOff, long *ewWSLen, Boolean *qp); OSStatus PeteInsertHeader(PETEHandle pte, long *pOff, Handle text, long len, long tOff) { return InsertIntlHeaders(text, len, tOff, nil, kTextEncodingUnknown, pte, pOff); } /* * Pass encoding and a or if you want UTF8 in an accumulator; pass pte and pOff if you want a PETE handle * Pass -1 for tOff if Text is pointer instead of handle */ OSStatus InsertIntlHeaders(Handle text, long len, long tOff, AccuPtr a, TextEncoding encoding, PETEHandle pte, long *pOff) { long curOff, // Current offset into the text ewOff, // Offset of the encoded word ewTextOff, // Offset of the text portion of the encoded word ewEndOff, // Offset of the last question mark in the encoded word ewWSLen, // Whitespace after the encoded word ewLastWSLen, // Whitespace after previous encoded word skipOff; // Stuff that looked like an encoded word but wasn't OSStatus err = noErr; Boolean qp; // TRUE=quoted-printable; FALSE=base64 Ptr textPtr; // temp pointer long inTOff; // Offset into the text passed in inTOff = tOff; // < 0 means it was a pointer if(tOff < 0) tOff = 0; if(len == 0) return noErr; // Current offset into the text curOff = tOff; ewWSLen = 0; skipOff = 0; /* * Go through a single header. Find an encoded word. Output what came before the * encoded word. Convert the encoded word. Output the converted stuff. */ do { // Deref if it's a handle textPtr = inTOff < 0 ? (Ptr)text : *text; // Save whether there was any trailing WS on the last encoded word ewLastWSLen = ewWSLen; if(Find2047(textPtr + curOff + skipOff, len - ((curOff + skipOff) - tOff), &ewOff, &ewTextOff, &ewEndOff, &ewWSLen, &qp)) { // If we immediately find an encoded word after the last one, ignore the WS if(ewOff == ewLastWSLen) { curOff += ewOff; ewTextOff -= ewOff; ewEndOff -= ewOff; ewOff = 0; } } // If there was some text before the encoded word (or there was some skipped text)... if(ewOff + skipOff > 0) { // Add that text to the accumulator or document if(a) { if(encoding == kTextEncodingUnknown) encoding = CreateSystemRomanEncoding(); err = InternetToUTF8Text(nil, encoding, ((Ptr)text) + (inTOff < 0 ? curOff : 0), inTOff < 0 ? inTOff : curOff, ewOff + skipOff, a, false); } else { if(inTOff < 0) { err = PETEInsertTextPtr(PETE,pte,*pOff,textPtr+curOff,ewOff+skipOff,nil); } else { err = PETEInsertTextHandle(PETE,pte,*pOff,text,ewOff+skipOff,curOff,nil); } } } if(!err) { // If we need to keep track of the PETE offset, add what we just inserted if(pOff && *pOff != -1) *pOff += ewOff + skipOff; // Move curOff past what we just inserted curOff += ewOff + skipOff; skipOff = 0; // Make the offsets relative to curOff ewTextOff -= ewOff; ewEndOff -= ewOff; ewOff = 0; // At the end? if(ewEndOff != 0) { Str255 word; // Get the text of the encoded word and un-quoted-printable or un-base64 textPtr = inTOff < 0 ? (Ptr)text : *text; MakePStr(word, textPtr + curOff + ewTextOff, (ewEndOff - 2) - ewTextOff); if((qp && (PseudoQP(word), true) || !DecodeB64String(word))) { Str31 charset; long iOff, oOff; // Get the charset name MakePStr(charset, textPtr + curOff + 2, ewTextOff - 5); if (EqualStrRes(charset,UNKNOWN_CHARSET_NAME)) GetRString(charset,UNSPECIFIED_CHARSET); if(a) { // Adding the text to the accumulator using the charset iOff = a->offset; if(InternetToUTF8Text(charset, kTextEncodingUnknown, &word[1], -1L, word[0], a, false) != noErr) { a->offset = iOff; goto BadConversion; } else { curOff += ewEndOff; } } // If we add to the document, we update the converter using the charset else if(UpdateIntlConverter(&uGlobals.quickConverter, charset) == noErr) { // Adding the text to the document with the converter if((iOff = *pOff) < 0L) PeteGetTextAndSelection(pte,nil,&iOff,nil); err = PeteInsertIntlText(pte, pOff, &word[1], -1L, word[0], &uGlobals.quickConverter, kTextEncodingUnknown, false, true); if(!err) { curOff += ewEndOff; } else if(EncodingError(err)) { if((oOff = *pOff) < 0L) PeteGetTextAndSelection(pte,nil,&oOff,nil); PeteDelete(pte, iOff, oOff); goto BadConversion; } } else goto BadConversion; } else { BadConversion : /* * Couldn't convert, so add the entire encoded word to the skip so that * it can be added before the next encoded word. */ curOff -= ewLastWSLen; skipOff = ewEndOff + ewWSLen + ewLastWSLen; ewWSLen = 0; } } } } while((!err || EncodingError(err)) && curOff < tOff + len); return err; } Boolean Find2047(UPtr chars, long len, long *ewOff, long *ewTextOff, long *ewEndOff, long *ewWSLen, Boolean *qp) { UPtr q[4]; // Pointers to the 4 questions marks in the encoded word UPtr end=chars+len; // Past the end of the text UPtr spot=chars; // Start out at the start of the text long ewEncOff; // Offset of the encoding type (Q or B) while(true) { // If we don't have enough text left for an encoded word, break out if(end - spot < 8) { *ewOff = *ewTextOff = *ewEndOff = len; *ewWSLen = 0; break; } // Encoded words start with "=?" if((*spot++ == '=') && (*spot == '?')) { int i; // Find 3 more questions marks, break out if we hit the end for(q[0] = spot++, i = 1; i < 4; ++i) { q[i] = q[i-1]; while(q[i] < end && *++(q[i]) != '?') ; if(q[i] >= end) break; } // If we found 4 question marks and the next thing is an "="... if((i == 4) && (q[3] < end) && (q[3][1] == '=')) { // Encoded word starts one before the 1st '?' *ewOff = q[0] - chars - 1; // The encoding char is one after the 2nd '?' ewEncOff = q[1] - chars + 1; // The text is one after the 3rd '?' *ewTextOff = q[2] - chars + 1; // The end is two after the last '?' *ewEndOff = q[3] - chars + 2; // Get the length of WS after the encoded word to ignore for(spot = q[3] + 2; spot < end; ++spot) { switch(*spot) { case ' ' : case '\t' : case '\012' : case '\014' : case '\015' : continue; } break; } *ewWSLen = spot - (q[3] + 2); break; } } } // See if we've got quoted-printable or base64 if(*ewTextOff - ewEncOff == 2) { switch(chars[ewEncOff]) { default : goto badEncoding; case 'Q' : case 'q' : *qp = true; break; case 'B' : case 'b' : *qp = false; } } else { badEncoding : ewOff = ewEndOff; } return (*ewOff < *ewEndOff); } OSErr PeteSetIntlText(PETEHandle pte, Handle text, long start, long end, IntlConverter *converter, TextEncoding encoding) { OSErr err; long offset = -1; PeteDelete(pte,0,0x7fffffff); PeteScroll(pte,0,pseCenterSelection); err = PeteInsertIntlText(pte, &offset, text, start, end, converter, encoding, false, true); PeteSetURLRescan(pte,0); PeteNickScan (pte); return err; } OSErr PeteInsertIntlText(PETEHandle pte, long *offset, Handle text, long start, long end, IntlConverter *converter, TextEncoding encoding, Boolean needSpace, Boolean flush) { PETEStyleEntry pse; OSErr err = noErr; Str255 outText; long outLen, inLen, usedInLen = 0; Byte hState; Ptr textPtr; Boolean bufRep, hand; if(converter == nil) { converter = &uGlobals.quickConverter; err = UpdateTECConverter(&converter->inToUnicode, nil, encoding, &converter->inToUnicodeEncoding, CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat), &converter->maclatin1, nil); if(err) return err; err = UpdateIntlConverter(converter, nil); if(err) return err; } PETEGetStyle(PETE,pte,kPETECurrentStyle,nil,&pse); hand = ((text!=nil) && (start >= 0)); if(hand) { hState = HGetState(text); } else { textPtr = (Ptr)text; } flush = flush && (text != nil); do { do { bufRep = false; inLen = end - usedInLen; if(start >= 0) inLen -= start; outLen = sizeof(outText); if(hand) textPtr = LDRef(text) + start; err = ConvertIntlText(converter,textPtr?textPtr+usedInLen:(Ptr)-1L,&inLen,outText,&outLen,&encoding,needSpace?addWordSpaceConditional:dontAddWordSpace, nil); if(hand) HSetState(text, hState); if(outLen == 0) break; needSpace = false; switch(err) { case kTECOutputBufferFullStatus : case kTECArrayFullErr : bufRep = true; case noErr : err = EncodingPlusPeteStyle(encoding, &pse, nil); if(err == noErr) { **Pslh = pse; err = PETEInsertTextPtr(PETE,pte,offset==nil?kPETECurrentSelection:*offset,outText,outLen,Pslh); if((err == noErr) && (offset != nil) && (*offset >= 0L)) *offset+=outLen; } } usedInLen += inLen; } while((err == noErr) && bufRep); textPtr = nil; hand = false; start = end = usedInLen = 0L; } while((err == noErr) && !(flush = !flush)); if(!err && (converter->unicodeToMac != nil)) ClearIntlConverterContext(converter->unicodeToMac, false); return err; } Boolean EncodingError(OSStatus err) { switch(err) { case kTextUnsupportedEncodingErr : case kTextMalformedInputErr : case kTextUndefinedElementErr : case kTECPartialCharErr : case kTECUnmappableElementErr : case kTECIncompleteElementErr : return true; default : return false; } } Boolean HasUnicode() { return uGlobals.tecVersion != 0; } // End buffer *before* the CR OSStatus MessageToUTF8(Handle inText, long inOff, ByteCount inLen, AccuPtr a, int *context) { static Str31 charset; static Str15 dirs[umDirCount] = {0}; Str15 temp; ByteOffset next = inOff, last; long offset; Byte hState; TextEncoding encoding; OSStatus err = noErr; if(inLen == 0) return noErr; if(!HasUnicode()) return paramErr; err = UpgradeScriptInfoToTextEncoding(FontToScript(FontID), kTextLanguageDontCare, kTextRegionDontCare, nil, &encoding); if(err) return err; if(dirs[umFlowedDir][0] == 0) GetRString(dirs[umFlowedDir],EnrichedStrn+enXFlowed); if(dirs[umCharsetDir][0] == 0) GetRString(dirs[umCharsetDir],EnrichedStrn+enXCharset); if(dirs[umHtmlDir][0] == 0) GetRString(dirs[umHtmlDir],EnrichedStrn+enXHTML); if(dirs[umRichDir][0] == 0) GetRString(dirs[umRichDir],EnrichedStrn+enXRich); hState = HGetState(inText); while((err == noErr) && (next < inOff + inLen)) { switch(*context) { case umHeaderState : charset[0] = 0; temp[0] = temp[1] = 13; offset = SearchPtrPtr(temp, 2, LDRef(inText), next, inLen - (next - inOff), false, false, nil); HSetState(inText, hState); last = next; if(offset < 0) { next = inLen + inOff; } else { next += offset; *context = umTextState; } err = InsertIntlHeaders(inText, last, next - last, a, encoding, nil, nil); break; case umTextState : charset[0] = 0; temp[0] = 13; temp[1] = '<'; offset = SearchPtrPtr(temp, 2, LDRef(inText), next, inLen - (next - inOff), false, false, nil); HSetState(inText, hState); last = next; if(offset < 0) { next = inLen + inOff; offset = 0; } else { int i; // Move by the return next += offset + 1; for(i = umFlowedDir; i < umDirCount; ++i) { if((inLen - (next - inOff) > dirs[i][0]) && (PPtrFindSub(dirs[i], *inText + next + 1, dirs[i][0]) != nil)) { break; } } if(i < umDirCount) { *context = i; offset = next + dirs[i][0] + 1; offset = ParseCharset(*inText + offset, inLen - (offset - inOff), charset, i); } else offset = 0; } err = InternetToUTF8Text(nil, encoding, inText, last, next - last, a, false); next += offset; break; case umFlowedState : case umCharsetState : case umHtmlState : case umRichState : temp[0] = 3; temp[1] = 13; temp[2] = '<'; temp[3] = '/'; PCat(temp, dirs[*context]); temp[++temp[0]] = '>'; offset = SearchPtrPtr(&temp[1], temp[0], LDRef(inText), next, inLen - (next - inOff), false, false, nil); HSetState(inText, hState); last = next; if(offset < 0) { next = inLen + inOff; offset = 0; } else { // Move by the return next += offset + 1; *context = umTextState; offset = temp[0]; } err = InternetToUTF8Text(charset, encoding, inText, last, next - last, a, false); next += offset; } } return err; } long ParseCharset(Ptr textPtr, long len, PStr charset, umDirectives dir) { static Str15 paramDir = {0}, charsetAttr = {0}; Ptr tempPtr; long offset; Boolean done = false; if(paramDir[0] == 0) ComposeRString(paramDir, MIME_RICH_ON, EnrichedStrn+enParam); if(charsetAttr[0] == 0) { charsetAttr[++charsetAttr[0]] = ' '; PCatR(charsetAttr,HTMLAttributeStrn+htmlCharsetAttr); charsetAttr[++charsetAttr[0]] = '='; } charset[0] = 0; offset = 0; switch(dir) { case umHtmlDir : tempPtr = memchr(textPtr, '>', len); if(tempPtr) { offset = (tempPtr - textPtr) + 1; tempPtr = PPtrFindSub(charsetAttr, textPtr, offset); if(tempPtr) { len -= textPtr - tempPtr; while((len > 0) && (charset[0] < 32) && ((charset[charset[0] + 1] = *++tempPtr) != '"')) { --len; ++charset[0]; } done = (*tempPtr == '"'); } } break; case umRichDir : while((len > 0) && (*textPtr != '>')) { --len; ++textPtr; ++offset; } --len; ++textPtr; ++offset; if((len > paramDir[0]) && PPtrFindSub(paramDir, textPtr, paramDir[0])) { textPtr += paramDir[0]; offset += paramDir[0]; len -= paramDir[0]; while((len > 0) && (charset[0] < 32) && ((charset[charset[0] + 1] = *++textPtr) != '<')) { --len; ++offset; ++charset[0]; } if((*textPtr++ == '<') && (*textPtr++ == '/') && !strincmp(textPtr, ¶mDir[2], paramDir[0] - 1)) { done = true; offset += paramDir[0] + 1; } } break; case umFlowedDir : case umCharsetDir : while((len > 0) && (*textPtr != ' ') && (*textPtr != '>')) { --len; ++textPtr; ++offset; } ++offset; if(*textPtr == ' ') { while((len > 0) && (charset[0] < 32) && ((charset[charset[0] + 1] = *++textPtr) != '>')) { --len; ++offset; ++charset[0]; } } done = (*textPtr == '>'); } if(!done) { offset = 0; charset[0] = 0; } return offset; } OSStatus PeteGetUTF8Text(PETEHandle pte, long offset, long iLen, long *iUsed, UPtr out, long oLen, long *oUsed) { long runLen, usedIn, usedOut; OSStatus err = noErr; PETEStyleInfo style; LangCode lang; StringPtr fontName; UHandle text; TextEncoding encoding; Byte hState; *iUsed = 0; *oUsed = 0; runLen = 0; err = PETEGetRawText(PETE,pte,&text); if(err) return err; hState = HGetState(text); if(iLen > PeteLen(pte) - offset) iLen = PeteLen(pte) - offset; while((iLen > 0) && (oLen > 0) && !err) { if(runLen == 0) { err = PeteGetStyleRun(pte, offset + *iUsed, &runLen, &style, peFontValid|peLangValid); if(err) return err; } if(runLen > iLen) runLen = iLen; if((style.textStyle.tsFont != kPETEDefaultFont) && (style.textStyle.tsFont != kPETEDefaultFixed)) { lang = kTextLanguageDontCare; fontName = GlobalTemp; GetFontName(style.textStyle.tsFont, fontName); } else { lang = style.textStyle.tsLang; fontName = nil; } err = UpgradeScriptInfoToTextEncoding(kTextScriptDontCare, lang, kTextRegionDontCare, fontName, &encoding); if(err) return err; err = UpdateTECConverter(&uGlobals.internetToUTF8, nil, encoding, &uGlobals.internetToUTF8Encoding, UTF8_ENCODING, &uGlobals.maclatin1, nil); if(err) return err; HLock(text); err = MyTECConvertText(uGlobals.internetToUTF8, *text + offset + *iUsed, runLen, &usedIn, out + *oUsed, oLen, &usedOut, uGlobals.maclatin1); HSetState(text, hState); if(err && err != kTECOutputBufferFullStatus) return err; oLen -= usedOut; *oUsed += usedOut; iLen -= usedIn; *iUsed += usedIn; runLen -= usedIn; } return err; } OSErr PeteGetStyleRun(PETEHandle pte, long offset, long *len, PETEStyleInfoPtr style, long validBits) { OSErr err; long runLen, fullLen; PETEStyleEntry pse; *len = 0; fullLen = PETEGetTextLen(PETE, pte); err = PETEGetStyle(PETE, pte, offset, &runLen, &pse); if(err) return err; *style = pse.psStyle; runLen -= (offset - pse.psStartChar); while((runLen + offset + *len) < fullLen) { long tempLen, diffBits; PETEStyleEntry pse2; err = PETEGetStyle(PETE, pte, offset + *len + runLen, &tempLen, &pse2); if(err) break; err = PETECompareStyles(PETE, pte, &pse, &pse2, validBits, false, &diffBits); if(err) break; if(diffBits) break; *len += runLen; runLen = tempLen - pse2.psStartChar; } *len += runLen; return err; } TextEncoding CreateSystemRomanEncoding() { TextEncoding encoding; if(noErr != UpgradeScriptInfoToTextEncoding(smRoman, (LangCode)LoWord(GetScriptVariable(smRoman, smScriptLang)), kTextRegionDontCare, nil, &encoding)) return DefaultEncoding(kTextEncodingMacRoman); else return encoding; } OSStatus MyTECConvertText (TECObjectRef encodingConverter, ConstTextPtr inputBuffer, ByteCount inputBufferLength, ByteCount * actualInputLength, TextPtr outputBuffer, ByteCount outputBufferLength, ByteCount * actualOutputLength, Boolean maclatin1) { OSStatus err; if(!maclatin1) { err = TECConvertText(encodingConverter, inputBuffer, inputBufferLength, actualInputLength, outputBuffer, outputBufferLength, actualOutputLength); if (err == kTECUsedFallbacksStatus) err = noErr; } else { if(actualInputLength) *actualInputLength = 0; if(actualOutputLength) *actualOutputLength = 0; do { ByteCount curLen, usedILen, usedOLen; curLen = MIN(inputBufferLength, sizeof(GlobalTemp)); BMD(inputBuffer, GlobalTemp, curLen); TransLitRes(GlobalTemp, curLen, TRANS_IN_TABL); err = TECConvertText(encodingConverter, GlobalTemp, curLen, &usedILen, outputBuffer, outputBufferLength, &usedOLen); if (err == kTECUsedFallbacksStatus) err = noErr; if(actualInputLength) *actualInputLength += usedILen; if(actualOutputLength) *actualOutputLength = usedOLen; inputBufferLength -= usedILen; outputBufferLength -= usedOLen; inputBuffer += usedILen; outputBuffer += usedOLen; } while(inputBufferLength && !err); } return err; } long UnicodeMappingCount(TextEncoding encoding) { OptionBits matchFilter; UnicodeMapping matchMapping; ItemCount foundCount; matchMapping.unicodeEncoding = DefaultEncoding(kTextEncodingUnicodeDefault); matchMapping.otherEncoding = encoding; matchMapping.mappingVersion = kUnicodeUseLatestMapping; matchFilter = ( kUnicodeMatchUnicodeBaseMask | kUnicodeMatchUnicodeVariantMask | kUnicodeMatchUnicodeFormatMask | kUnicodeMatchOtherBaseMask | kUnicodeMatchOtherFormatMask ); return CountUnicodeMappings(matchFilter, &matchMapping, &foundCount) == noErr ? foundCount : 0; } /************************************************************************ * SniffAndConvertHandleToRoman - figure out what sort of text is in a handle and Romanize it ************************************************************************/ OSErr SniffAndConvertHandleToRoman(Handle *hp) { OSErr err; uLong snifferCount = 0; uLong textSize = GetHandleSize(*hp); unsigned char utf8Magic[] = { 0xef,0xbb,0xbf }; // if it's unicode, it may have a byte order mark at the start, which is fffe (intel), feff (network), or efbbbf (utf-8) // Handle these specially if (textSize>2 && (*(uShort *)**hp==0xfffe || *(uShort *)**hp==0xfeff)) return ConvertHandleToRoman(hp,CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat),0); else if (textSize>3 && !memcmp(**hp,utf8Magic,3)) return ConvertHandleToRoman(hp,CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicodeUTF8Format),3); // if it's ascii now, leave it if (!AnyFunny(*hp,0)) return noErr; // sniffers are available in tec 1.2 & up // don't bother sniffing an empty handle if (uGlobals.tecVersion >= 0x0120 && textSize) // count all the sniffers we can; we'll sniff for everything if (!TECCountAvailableSniffers(&snifferCount)) if (snifferCount) { TextEncoding *encodings = NuPtr(snifferCount*sizeof(TextEncoding)); uLong *errors = NuPtr(snifferCount*sizeof(uLong)); uLong *features = NuPtr(snifferCount*sizeof(uLong)); if (encodings && errors && features) { // ok, we have room for everything! TECSnifferObjectRef theSniffer; if (!TECGetAvailableSniffers(encodings,snifferCount,&snifferCount)) if (snifferCount) if (!TECCreateSniffer(&theSniffer,encodings,snifferCount)) { // Finally, we've made the sniffer, so let's sniff! TECSniffTextEncoding(theSniffer,LDRef(*hp),textSize,encodings,snifferCount,errors,textSize/10,features,textSize/10); TECDisposeSniffer(theSniffer); UL(*hp); // Ok, the "best" encoding will be on top. See if it's good enough if (errors[0]