From 7af97d2dfdb0efb6da48e9c19ce05d09e0d7d191 Mon Sep 17 00:00:00 2001
From: Cameron Kaiser <classilla@floodgap.com>
Date: Fri, 6 Jul 2018 17:49:00 -0700
Subject: [PATCH] closes #502: CVE-2016-6293/7415
 CVE-2017-7867/7868/14952/15422

---
 intl/icu/source/common/locid.cpp  | 78 ++++++++++++------------
 intl/icu/source/common/uloc.cpp   | 98 ++++++++++---------------------
 intl/icu/source/common/utext.cpp  | 28 +++++++--
 intl/icu/source/i18n/gregoimp.cpp |  5 ++
 intl/icu/source/i18n/gregoimp.h   | 11 ++++
 intl/icu/source/i18n/persncal.cpp |  2 +-
 intl/icu/source/i18n/zonemeta.cpp |  1 -
 7 files changed, 112 insertions(+), 111 deletions(-)

diff --git a/intl/icu/source/common/locid.cpp b/intl/icu/source/common/locid.cpp
index 27e403065..2ee1eaaec 100644
--- a/intl/icu/source/common/locid.cpp
+++ b/intl/icu/source/common/locid.cpp
@@ -42,6 +42,7 @@
 #include "uhash.h"
 #include "ucln_cmn.h"
 #include "ustr_imp.h"
+#include "charstr.h"
 
 U_CDECL_BEGIN
 static UBool U_CALLCONV locale_cleanup(void);
@@ -57,6 +58,12 @@ static UMutex gDefaultLocaleMutex = U_MUTEX_INITIALIZER;
 static UHashtable *gDefaultLocalesHashT = NULL;
 static Locale *gDefaultLocale = NULL;
 
+/**
+ * \def ULOC_STRING_LIMIT
+ * strings beyond this value crash in CharString
+ */
+#define ULOC_STRING_LIMIT 357913941
+
 U_NAMESPACE_END
 
 typedef enum ELocalePos {
@@ -283,13 +290,12 @@ Locale::Locale( const   char * newLanguage,
     }
     else
     {
-        MaybeStackArray<char, ULOC_FULLNAME_CAPACITY> togo;
+        UErrorCode status = U_ZERO_ERROR;
         int32_t size = 0;
         int32_t lsize = 0;
         int32_t csize = 0;
         int32_t vsize = 0;
         int32_t ksize = 0;
-        char    *p;
 
         // Calculate the size of the resulting string.
 
@@ -297,13 +303,23 @@ Locale::Locale( const   char * newLanguage,
         if ( newLanguage != NULL )
         {
             lsize = (int32_t)uprv_strlen(newLanguage);
+            if ( lsize < 0 || lsize > ULOC_STRING_LIMIT ) { // int32 wrap
+                setToBogus();
+                return;
+            }
             size = lsize;
         }
 
+        CharString togo(newLanguage, lsize, status); // start with newLanguage
+
         // _Country
         if ( newCountry != NULL )
         {
             csize = (int32_t)uprv_strlen(newCountry);
+            if ( csize < 0 || csize > ULOC_STRING_LIMIT ) { // int32 wrap
+                setToBogus();
+                return;
+            }
             size += csize;
         }
 
@@ -318,6 +334,10 @@ Locale::Locale( const   char * newLanguage,
 
             // remove trailing _'s
             vsize = (int32_t)uprv_strlen(newVariant);
+            if ( vsize < 0 || vsize > ULOC_STRING_LIMIT ) { // int32 wrap
+                setToBogus();
+                return;
+            }
             while( (vsize>1) && (newVariant[vsize-1] == SEP_CHAR) )
             {
                 vsize--;
@@ -342,70 +362,56 @@ Locale::Locale( const   char * newLanguage,
         if ( newKeywords != NULL)
         {
             ksize = (int32_t)uprv_strlen(newKeywords);
+            if ( ksize < 0 || ksize > ULOC_STRING_LIMIT ) {
+              setToBogus();
+              return;
+            }
             size += ksize + 1;
         }
 
-
         //  NOW we have the full locale string..
-
-        /*if the whole string is longer than our internal limit, we need
-        to go to the heap for temporary buffers*/
-        if (size >= togo.getCapacity())
-        {
-            // If togo_heap could not be created, initialize with default settings.
-            if (togo.resize(size+1) == NULL) {
-                init(NULL, FALSE);
-            }
-        }
-
-        togo[0] = 0;
-
         // Now, copy it back.
-        p = togo.getAlias();
-        if ( lsize != 0 )
-        {
-            uprv_strcpy(p, newLanguage);
-            p += lsize;
-        }
+
+        // newLanguage is already copied
 
         if ( ( vsize != 0 ) || (csize != 0) )  // at least:  __v
         {                                      //            ^
-            *p++ = SEP_CHAR;
+            togo.append(SEP_CHAR, status);
         }
 
         if ( csize != 0 )
         {
-            uprv_strcpy(p, newCountry);
-            p += csize;
+            togo.append(newCountry, status);
         }
 
         if ( vsize != 0)
         {
-            *p++ = SEP_CHAR; // at least: __v
-
-            uprv_strncpy(p, newVariant, vsize);  // Must use strncpy because
-            p += vsize;                          // of trimming (above).
-            *p = 0; // terminate
+            togo.append(SEP_CHAR, status)
+                .append(newVariant, vsize, status);
         }
 
         if ( ksize != 0)
         {
             if (uprv_strchr(newKeywords, '=')) {
-                *p++ = '@'; /* keyword parsing */
+                togo.append('@', status); /* keyword parsing */
             }
             else {
-                *p++ = '_'; /* Variant parsing with a script */
+                togo.append('_', status); /* Variant parsing with a script */
                 if ( vsize == 0) {
-                    *p++ = '_'; /* No country found */
+                    togo.append('_', status); /* No country found */
                 }
             }
-            uprv_strcpy(p, newKeywords);
-            p += ksize;
+            togo.append(newKeywords, status);
         }
 
+        if (U_FAILURE(status)) {
+            // Something went wrong with appending, etc.
+            setToBogus();
+            return;
+        }
         // Parse it, because for example 'language' might really be a complete
         // string.
-        init(togo.getAlias(), FALSE);
+        init(togo.data(), FALSE);
     }
 }
 
diff --git a/intl/icu/source/common/uloc.cpp b/intl/icu/source/common/uloc.cpp
index bdfd0c9a6..00198439e 100644
--- a/intl/icu/source/common/uloc.cpp
+++ b/intl/icu/source/common/uloc.cpp
@@ -2246,7 +2246,7 @@ _uloc_strtod(const char *start, char **end) {
 typedef struct { 
     float q;
     int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
-    char *locale;
+    char locale[ULOC_FULLNAME_CAPACITY+1];
 } _acceptLangItem;
 
 static int32_t U_CALLCONV
@@ -2288,9 +2288,7 @@ uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult
                             UEnumeration* availableLocales,
                             UErrorCode *status)
 {
-    _acceptLangItem *j;
-    _acceptLangItem smallBuffer[30];
-    char **strs;
+  icu::MaybeStackArray<_acceptLangItem, 4> items; // Struct for collecting items. 
     char tmp[ULOC_FULLNAME_CAPACITY +1];
     int32_t n = 0;
     const char *itemEnd;
@@ -2300,11 +2298,7 @@ uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult
     int32_t res;
     int32_t i;
     int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
-    int32_t jSize;
-    char *tempstr; /* Use for null pointer check */
 
-    j = smallBuffer;
-    jSize = sizeof(smallBuffer)/sizeof(smallBuffer[0]);
     if(U_FAILURE(*status)) {
         return -1;
     }
@@ -2332,93 +2326,61 @@ uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult
             while(isspace(*t)) {
                 t++;
             }
-            j[n].q = (float)_uloc_strtod(t,NULL);
+            items[n].q = (float)_uloc_strtod(t,NULL);
         } else {
             /* no semicolon - it's 1.0 */
-            j[n].q = 1.0f;
+            items[n].q = 1.0f;
             paramEnd = itemEnd;
         }
-        j[n].dummy=0;
+        items[n].dummy=0;
         /* eat spaces prior to semi */
         for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
             ;
-        /* Check for null pointer from uprv_strndup */
-        tempstr = uprv_strndup(s,(int32_t)((t+1)-s));
-        if (tempstr == NULL) {
-            *status = U_MEMORY_ALLOCATION_ERROR;
-            return -1;
+        int32_t slen = ((t+1)-s);
+        if(slen > ULOC_FULLNAME_CAPACITY) {
+          *status = U_BUFFER_OVERFLOW_ERROR;
+          return -1; // too big
         }
-        j[n].locale = tempstr;
-        uloc_canonicalize(j[n].locale,tmp,sizeof(tmp)/sizeof(tmp[0]),status);
-        if(strcmp(j[n].locale,tmp)) {
-            uprv_free(j[n].locale);
-            j[n].locale=uprv_strdup(tmp);
+        uprv_strncpy(items[n].locale, s, slen);
+        items[n].locale[slen]=0; // terminate
+        int32_t clen = uloc_canonicalize(items[n].locale, tmp, UPRV_LENGTHOF(tmp)-1, status);
+        if(U_FAILURE(*status)) return -1;
+        if((clen!=slen) || (uprv_strncmp(items[n].locale, tmp, slen))) {
+            // canonicalization had an effect- copy back
+            uprv_strncpy(items[n].locale, tmp, clen);
+            items[n].locale[clen] = 0; // terminate
         }
 #if defined(ULOC_DEBUG)
-        /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
+        /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, items[n].locale, items[n].q);*/
 #endif
         n++;
         s = itemEnd;
         while(*s==',') { /* eat duplicate commas */
             s++;
         }
-        if(n>=jSize) {
-            if(j==smallBuffer) {  /* overflowed the small buffer. */
-                j = static_cast<_acceptLangItem *>(uprv_malloc(sizeof(j[0])*(jSize*2)));
-                if(j!=NULL) {
-                    uprv_memcpy(j,smallBuffer,sizeof(j[0])*jSize);
-                }
+        if(n>=items.getCapacity()) { // If we need more items
+          if(NULL == items.resize(items.getCapacity()*2, items.getCapacity())) {
+              *status = U_MEMORY_ALLOCATION_ERROR;
+              return -1;
+          }
 #if defined(ULOC_DEBUG)
-                fprintf(stderr,"malloced at size %d\n", jSize);
+          fprintf(stderr,"malloced at size %d\n", items.getCapacity());
 #endif
-            } else {
-                j = static_cast<_acceptLangItem *>(uprv_realloc(j, sizeof(j[0])*jSize*2));
-#if defined(ULOC_DEBUG)
-                fprintf(stderr,"re-alloced at size %d\n", jSize);
-#endif
-            }
-            jSize *= 2;
-            if(j==NULL) {
-                *status = U_MEMORY_ALLOCATION_ERROR;
-                return -1;
-            }
         }
     }
-    uprv_sortArray(j, n, sizeof(j[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
+    uprv_sortArray(items.getAlias(), n, sizeof(items[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
+    icu::LocalArray<const char*> strs(new const char*[n], *status);
     if(U_FAILURE(*status)) {
-        if(j != smallBuffer) {
-#if defined(ULOC_DEBUG)
-            fprintf(stderr,"freeing j %p\n", j);
-#endif
-            uprv_free(j);
-        }
-        return -1;
-    }
-    strs = static_cast<char **>(uprv_malloc((size_t)(sizeof(strs[0])*n)));
-    /* Check for null pointer */
-    if (strs == NULL) {
-        uprv_free(j); /* Free to avoid memory leak */
-        *status = U_MEMORY_ALLOCATION_ERROR;
-        return -1;
+      return -1;
     }
     for(i=0;i<n;i++) {
 #if defined(ULOC_DEBUG)
-        /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
+        /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, items[i].locale, items[i].q);*/
 #endif
-        strs[i]=j[i].locale;
+        strs[i]=items[i].locale;
     }
     res =  uloc_acceptLanguage(result, resultAvailable, outResult, 
-        (const char**)strs, n, availableLocales, status);
-    for(i=0;i<n;i++) {
-        uprv_free(strs[i]);
-    }
-    uprv_free(strs);
-    if(j != smallBuffer) {
-#if defined(ULOC_DEBUG)
-        fprintf(stderr,"freeing j %p\n", j);
-#endif
-        uprv_free(j);
-    }
+                               strs.getAlias(), n, availableLocales, status);
     return res;
 }
 
diff --git a/intl/icu/source/common/utext.cpp b/intl/icu/source/common/utext.cpp
index d2f37baf9..010f752ea 100644
--- a/intl/icu/source/common/utext.cpp
+++ b/intl/icu/source/common/utext.cpp
@@ -845,11 +845,18 @@ U_CDECL_END
 //------------------------------------------------------------------------------
 
 // Chunk size.
-//     Must be less than 85, because of byte mapping from UChar indexes to native indexes.
-//     Worst case is three native bytes to one UChar.  (Supplemenaries are 4 native bytes
-//     to two UChars.)
+//     Must be less than 42  (256/6), because of byte mapping from UChar indexes to native indexes.
+//     Worst case there are six UTF-8 bytes per UChar.
+//         obsolete 6 byte form fd + 5 trails maps to fffd
+//         obsolete 5 byte form fc + 4 trails maps to fffd
+//         non-shortest 4 byte forms maps to fffd
+//         normal supplementaries map to a pair of utf-16, two utf8 bytes per utf-16 unit
+//     mapToUChars array size must allow for the worst case, 6.
+//     This could be brought down to 4, by treating fd and fc as pure illegal,
+//     rather than obsolete lead bytes. But that is not compatible with the utf-8 access macros.
 //
 enum { UTF8_TEXT_CHUNK_SIZE=32 };
+enum { SIZEOF_MAPTOUCHARS=UTF8_TEXT_CHUNK_SIZE*6+6 };
 
 //
 // UTF8Buf  Two of these structs will be set up in the UText's extra allocated space.
@@ -887,7 +894,7 @@ struct UTF8Buf {
                                                      //  Requires two extra slots,
                                                      //    one for a supplementary starting in the last normal position,
                                                      //    and one for an entry for the buffer limit position.
-    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
+    uint8_t   mapToUChars[SIZEOF_MAPTOUCHARS]; // Map native offset from bufNativeStart to
                                                      //   correspoding offset in filled part of buf.
     int32_t   align;
 };
@@ -1030,6 +1037,7 @@ utf8TextAccess(UText *ut, int64_t index, UBool forward) {
             // Requested index is in this buffer.
             u8b = (UTF8Buf *)ut->p;   // the current buffer
             mapIndex = ix - u8b->toUCharsMapStart;
+            U_ASSERT(mapIndex < (int32_t)SIZEOF_MAPTOUCHARS);
             ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
             return TRUE;
 
@@ -1296,6 +1304,10 @@ fillReverse:
         // Can only do this if the incoming index is somewhere in the interior of the string.
         //   If index is at the end, there is no character there to look at.
         if (ix != ut->b) {
+            // Note: this function will only move the index back if it is on a trail byte
+            //       and there is a preceding lead byte and the sequence from the lead 
+            //       through this trail could be part of a valid UTF-8 sequence
+            //       Otherwise the index remains unchanged.
             U8_SET_CP_START(s8, 0, ix);
         }
 
@@ -1309,7 +1321,10 @@ fillReverse:
         UChar   *buf = u8b->buf;
         uint8_t *mapToNative = u8b->mapToNative;
         uint8_t *mapToUChars = u8b->mapToUChars;
-        int32_t  toUCharsMapStart = ix - (UTF8_TEXT_CHUNK_SIZE*3 + 1);
+        int32_t  toUCharsMapStart = ix - SIZEOF_MAPTOUCHARS + 1;
+        // Note that toUCharsMapStart can be negative. Happens when the remaining
+        // text from current position to the beginning is less than the buffer size.
+        // + 1 because mapToUChars must have a slot at the end for the bufNativeLimit entry.
         int32_t  destIx = UTF8_TEXT_CHUNK_SIZE+2;   // Start in the overflow region
                                                     //   at end of buffer to leave room
                                                     //   for a surrogate pair at the
@@ -1336,6 +1351,7 @@ fillReverse:
             if (c<0x80) {
                 // Special case ASCII range for speed.
                 buf[destIx] = (UChar)c;
+                U_ASSERT(toUCharsMapStart <= srcIx);
                 mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
                 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
             } else {
@@ -1365,6 +1381,7 @@ fillReverse:
                 do {
                     mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
                 } while (sIx >= srcIx);
+                U_ASSERT(toUCharsMapStart <= (srcIx+1));
 
                 // Set native indexing limit to be the current position.
                 //   We are processing a non-ascii, non-native-indexing char now;
@@ -1539,6 +1556,7 @@ utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) {
     U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit);
     U_ASSERT(index<=ut->chunkNativeLimit);
     int32_t mapIndex = index - u8b->toUCharsMapStart;
+    U_ASSERT(mapIndex < (int32_t)SIZEOF_MAPTOUCHARS);
     int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
     U_ASSERT(offset>=0 && offset<=ut->chunkLength);
     return offset;
diff --git a/intl/icu/source/i18n/gregoimp.cpp b/intl/icu/source/i18n/gregoimp.cpp
index 08a3fbcef..30def0ef2 100644
--- a/intl/icu/source/i18n/gregoimp.cpp
+++ b/intl/icu/source/i18n/gregoimp.cpp
@@ -29,6 +29,11 @@ int32_t ClockMath::floorDivide(int32_t numerator, int32_t denominator) {
         numerator / denominator : ((numerator + 1) / denominator) - 1;
 }
 
+int64_t ClockMath::floorDivide(int64_t numerator, int64_t denominator) {
+    return (numerator >= 0) ?
+        numerator / denominator : ((numerator + 1) / denominator) - 1;
+}
+
 int32_t ClockMath::floorDivide(double numerator, int32_t denominator,
                           int32_t& remainder) {
     double quotient;
diff --git a/intl/icu/source/i18n/gregoimp.h b/intl/icu/source/i18n/gregoimp.h
index f65d14101..3f4d821a1 100644
--- a/intl/icu/source/i18n/gregoimp.h
+++ b/intl/icu/source/i18n/gregoimp.h
@@ -38,6 +38,17 @@ class ClockMath {
      */
     static int32_t floorDivide(int32_t numerator, int32_t denominator);
 
+    /**
+     * Divide two integers, returning the floor of the quotient.
+     * Unlike the built-in division, this is mathematically
+     * well-behaved.  E.g., <code>-1/4</code> => 0 but
+     * <code>floorDivide(-1,4)</code> => -1.
+     * @param numerator the numerator
+     * @param denominator a divisor which must be != 0
+     * @return the floor of the quotient
+     */
+    static int64_t floorDivide(int64_t numerator, int64_t denominator);
+
     /**
      * Divide two numbers, returning the floor of the quotient.
      * Unlike the built-in division, this is mathematically
diff --git a/intl/icu/source/i18n/persncal.cpp b/intl/icu/source/i18n/persncal.cpp
index bcc1411ea..775a493c2 100644
--- a/intl/icu/source/i18n/persncal.cpp
+++ b/intl/icu/source/i18n/persncal.cpp
@@ -211,7 +211,7 @@ void PersianCalendar::handleComputeFields(int32_t julianDay, UErrorCode &/*statu
     int32_t year, month, dayOfMonth, dayOfYear;
 
     int32_t daysSinceEpoch = julianDay - PERSIAN_EPOCH;
-    year = 1 + ClockMath::floorDivide(33 * daysSinceEpoch + 3, 12053);
+    year = 1 + (int32_t)ClockMath::floorDivide(33 * (int64_t)daysSinceEpoch + 3, (int64_t)12053);
 
     int32_t farvardin1 = 365 * (year - 1) + ClockMath::floorDivide(8 * year + 21, 33);
     dayOfYear = (daysSinceEpoch - farvardin1); // 0-based
diff --git a/intl/icu/source/i18n/zonemeta.cpp b/intl/icu/source/i18n/zonemeta.cpp
index e2c75e557..d78e2acc0 100644
--- a/intl/icu/source/i18n/zonemeta.cpp
+++ b/intl/icu/source/i18n/zonemeta.cpp
@@ -681,7 +681,6 @@ ZoneMeta::createMetazoneMappings(const UnicodeString &tzid) {
                     mzMappings = new UVector(deleteOlsonToMetaMappingEntry, NULL, status);
                     if (U_FAILURE(status)) {
                         delete mzMappings;
-                        deleteOlsonToMetaMappingEntry(entry);
                         uprv_free(entry);
                         break;
                     }