ls: fix handling of broken unicode sequences

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
Denys Vlasenko 2010-01-31 05:55:55 +01:00
parent d8528b8e56
commit 3d5b606931
2 changed files with 49 additions and 44 deletions

View File

@ -139,6 +139,8 @@ size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
return org_n - n; return org_n - n;
} }
#define ERROR_WCHAR (~(wchar_t)0)
static const char *mbstowc_internal(wchar_t *res, const char *src) static const char *mbstowc_internal(wchar_t *res, const char *src)
{ {
int bytes; int bytes;
@ -159,16 +161,22 @@ static const char *mbstowc_internal(wchar_t *res, const char *src)
c <<= 1; c <<= 1;
bytes++; bytes++;
} while ((c & 0x80) && bytes < 6); } while ((c & 0x80) && bytes < 6);
if (bytes == 1) if (bytes == 1) {
return NULL; /* A bare "continuation" byte. Say, 80 */
*res = ERROR_WCHAR;
return src;
}
c = (uint8_t)(c) >> bytes; c = (uint8_t)(c) >> bytes;
while (--bytes) { while (--bytes) {
unsigned ch = (unsigned char) *src++; unsigned ch = (unsigned char) *src;
if ((ch & 0xc0) != 0x80) { if ((ch & 0xc0) != 0x80) {
return NULL; /* Missing "continuation" byte. Example: e0 80 */
*res = ERROR_WCHAR;
return src;
} }
c = (c << 6) + (ch & 0x3f); c = (c << 6) + (ch & 0x3f);
src++;
} }
/* TODO */ /* TODO */
@ -177,8 +185,8 @@ static const char *mbstowc_internal(wchar_t *res, const char *src)
/* 11110000 10000000 10000100 10000000 converts to 0x100 */ /* 11110000 10000000 10000100 10000000 converts to 0x100 */
/* correct encoding: 11000100 10000000 */ /* correct encoding: 11000100 10000000 */
if (c <= 0x7f) { /* crude check */ if (c <= 0x7f) { /* crude check */
return NULL; *res = ERROR_WCHAR;
//or maybe 0xfffd; /* replacement character */ return src;
} }
*res = c; *res = c;
@ -204,7 +212,7 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
while (n) { while (n) {
wchar_t wc; wchar_t wc;
src = mbstowc_internal(&wc, src); src = mbstowc_internal(&wc, src);
if (src == NULL) /* error */ if (wc == ERROR_WCHAR) /* error */
return (size_t) -1L; return (size_t) -1L;
if (dest) if (dest)
*dest++ = wc; *dest++ = wc;
@ -312,20 +320,15 @@ static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char
goto subst; goto subst;
} }
#else #else
{ src = mbstowc_internal(&wc, src);
const char *src1 = mbstowc_internal(&wc, src); /* src is advanced to next mb char
/* src = NULL: invalid sequence is seen, * wc == ERROR_WCHAR: invalid sequence is seen
* else: wc is set, src is advanced to next mb char * else: wc is set
*/ */
if (src1) { /* no error */ if (wc == ERROR_WCHAR) /* error */
goto subst;
if (wc == 0) /* end-of-string */ if (wc == 0) /* end-of-string */
break; break;
src = src1;
} else { /* error */
src++;
goto subst;
}
}
#endif #endif
if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR) if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR)
goto subst; goto subst;
@ -411,7 +414,7 @@ unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src)
} }
#else #else
src = mbstowc_internal(&wc, src); src = mbstowc_internal(&wc, src);
if (!src || wc == 0) /* error, or end-of-string */ if (wc == ERROR_WCHAR || wc == 0) /* error, or end-of-string */
return width; return width;
#endif #endif
w = wcwidth(wc); w = wcwidth(wc);

View File

@ -11,9 +11,11 @@ mkdir ls.testdir || exit 1
# testing "test name" "command" "expected result" "file input" "stdin" # testing "test name" "command" "expected result" "file input" "stdin"
# The test isn't passing correctly now - all | chars should line up # With Unicode provided by libc locale, I'm not sure this test can pass.
# perfectly in the correctly passed test. # I suspect we might fail to skip exactly the correct number of bytes
# over broken unicode sequences.
test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \
&& test x"$CONFIG_SUBST_WCHAR" = x"63" \ && test x"$CONFIG_SUBST_WCHAR" = x"63" \
&& test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \ && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \
&& testing "ls unicode test" \ && testing "ls unicode test" \
@ -73,40 +75,40 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
0053____"?_?_"____________________________________________________________| 0053____"?_?_"____________________________________________________________|
0054_3.3__Sequences_with_last_continuation_byte_missing___________________| 0054_3.3__Sequences_with_last_continuation_byte_missing___________________|
0055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| 0055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
0056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"??"______| 0056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
0057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"???"______| 0057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
0058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"????"______| 0058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
0059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?????"______| 0059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
0060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"?"______| 0060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"?"______|
0061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"??"______| 0061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"?"______|
0062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"???"______| 0062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"?"______|
0063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"????"______| 0063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"?"______|
0064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?????"______| 0064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?"______|
0065_3.4__Concatenation_of_incomplete_sequences___________________________| 0065_3.4__Concatenation_of_incomplete_sequences___________________________|
0066____"??????????????????????????????"______________________________________________________| 0066____"??????????"______________________________________________________|
0067_3.5__Impossible_bytes________________________________________________| 0067_3.5__Impossible_bytes________________________________________________|
0068_3.5.1__fe_=_"?"______________________________________________________| 0068_3.5.1__fe_=_"?"______________________________________________________|
0069_3.5.2__ff_=_"?"______________________________________________________| 0069_3.5.2__ff_=_"?"______________________________________________________|
0070_3.5.3__fe_fe_ff_ff_=_"????"__________________________________________| 0070_3.5.3__fe_fe_ff_ff_=_"????"__________________________________________|
0071_4__Overlong_sequences________________________________________________| 0071_4__Overlong_sequences________________________________________________|
0072_4.1__Examples_of_an_overlong_ASCII_character_________________________| 0072_4.1__Examples_of_an_overlong_ASCII_character_________________________|
0073_4.1.1_U+002F_=_c0_af_____________=_"??"_______________________________| 0073_4.1.1_U+002F_=_c0_af_____________=_"?"_______________________________|
0074_4.1.2_U+002F_=_e0_80_af__________=_"???"_______________________________| 0074_4.1.2_U+002F_=_e0_80_af__________=_"?"_______________________________|
0075_4.1.3_U+002F_=_f0_80_80_af_______=_"????"_______________________________| 0075_4.1.3_U+002F_=_f0_80_80_af_______=_"?"_______________________________|
0076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?????"_______________________________| 0076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?"_______________________________|
0077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"??????"_______________________________| 0077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"?"_______________________________|
0078_4.2__Maximum_overlong_sequences______________________________________| 0078_4.2__Maximum_overlong_sequences______________________________________|
0079_4.2.1__U-0000007F_=_c1_bf_____________=_"??"__________________________| 0079_4.2.1__U-0000007F_=_c1_bf_____________=_"?"__________________________|
0080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"?"__________________________| 0080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"?"__________________________|
0081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"?"__________________________| 0081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"?"__________________________|
0082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"?"__________________________| 0082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"?"__________________________|
0083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"?"__________________________| 0083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"?"__________________________|
0084_4.3__Overlong_representation_of_the_NUL_character____________________| 0084_4.3__Overlong_representation_of_the_NUL_character____________________|
0085_4.3.1__U+0000_=_c0_80_____________=_"??"______________________________| 0085_4.3.1__U+0000_=_c0_80_____________=_"?"______________________________|
0086_4.3.2__U+0000_=_e0_80_80__________=_"???"______________________________| 0086_4.3.2__U+0000_=_e0_80_80__________=_"?"______________________________|
0087_4.3.3__U+0000_=_f0_80_80_80_______=_"????"______________________________| 0087_4.3.3__U+0000_=_f0_80_80_80_______=_"?"______________________________|
0088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?????"______________________________| 0088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?"______________________________|
0089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"??????"______________________________| 0089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"?"______________________________|
0090_5__Illegal_code_positions____________________________________________| 0090_5__Illegal_code_positions____________________________________________|
0091_5.1_Single_UTF-16_surrogates_________________________________________| 0091_5.1_Single_UTF-16_surrogates_________________________________________|
0092_5.1.1__U+D800_=_ed_a0_80_=_"?"_______________________________________| 0092_5.1.1__U+D800_=_ed_a0_80_=_"?"_______________________________________|