From aa167556cd2954bb9a9fb0a005178462087a4600 Mon Sep 17 00:00:00 2001 From: Tomas Heinrich Date: Fri, 26 Mar 2010 13:13:24 +0100 Subject: [PATCH] unicode: optional table for better handling of neutral bidi chars Off: function old new delta unicode_bidi_isrtl - 55 +55 isrtl_str 51 65 +14 unicode_isrtl 55 - -55 read_line_input 5003 4937 -66 ------------------------------------------------------------------------------ (add/remove: 1/4 grow/shrink: 1/1 up/down: 69/-121) Total: -52 bytes On: function old new delta static.neutral_b - 320 +320 static.neutral_p - 142 +142 unicode_bidi_isrtl - 55 +55 unicode_bidi_is_neutral_wchar - 55 +55 isrtl_str 51 59 +8 unicode_isrtl 55 - -55 read_line_input 5003 4937 -66 ------------------------------------------------------------------------------ (add/remove: 4/4 grow/shrink: 1/1 up/down: 580/-121) Total: 459 bytes Signed-off-by: Tomas Heinrich Signed-off-by: Denys Vlasenko --- Config.in | 13 +- include/unicode.h | 11 +- libbb/lineedit.c | 14 +- libbb/unicode.c | 348 +++++++++++++++++++++++++++++----------- libbb/unicode_wcwidth.c | 4 +- 5 files changed, 284 insertions(+), 106 deletions(-) diff --git a/Config.in b/Config.in index e0c01f3ef..4439ce4f9 100644 --- a/Config.in +++ b/Config.in @@ -198,12 +198,23 @@ config UNICODE_WIDE_WCHARS config UNICODE_BIDI_SUPPORT bool "Bidirectional character-aware line input" - default y + default n depends on FEATURE_ASSUME_UNICODE && !LOCALE_SUPPORT help With this option on, right-to-left Unicode characters are treated differently on input (e.g. cursor movement). +config UNICODE_NEUTRAL_TABLE + bool "In bidi input, support non-ASCII neutral chars too" + default n + depends on UNICODE_BIDI_SUPPORT + help + In most cases it's enough to treat only ASCII non-letters + (i.e. punctuation, numbers and space) as characters + with neutral directionality. + With this option on, more extensive (and bigger) table + of neutral chars will be used. + config LONG_OPTS bool "Support for --long-options" default y diff --git a/include/unicode.h b/include/unicode.h index 05bdbca02..deb4022c3 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -18,7 +18,8 @@ enum { UNICODE_ON = 2, }; -#define unicode_isrtl(wc) 0 +#define unicode_bidi_isrtl(wc) 0 +#define unicode_bidi_is_neutral_wchar(wc) (wc <= 126 && !isalpha(wc)) #if !ENABLE_FEATURE_ASSUME_UNICODE @@ -92,8 +93,12 @@ int iswspace(wint_t wc) FAST_FUNC; int iswalnum(wint_t wc) FAST_FUNC; int iswpunct(wint_t wc) FAST_FUNC; # if ENABLE_UNICODE_BIDI_SUPPORT -# undef unicode_isrtl -int unicode_isrtl(wint_t wc) FAST_FUNC; +# undef unicode_bidi_isrtl +int unicode_bidi_isrtl(wint_t wc) FAST_FUNC; +# if ENABLE_UNICODE_NEUTRAL_TABLE +# undef unicode_bidi_is_neutral_wchar +int unicode_bidi_is_neutral_wchar(wint_t wc) FAST_FUNC; +# endif # endif diff --git a/libbb/lineedit.c b/libbb/lineedit.c index be022e8ae..38a09cb26 100644 --- a/libbb/lineedit.c +++ b/libbb/lineedit.c @@ -1742,9 +1742,10 @@ static int lineedit_read_key(char *read_key_buffer) static int isrtl_str(void) { int idx = cursor; - while (command_ps[idx] >= ' ' && command_ps[idx] < 127 && !isalpha(command_ps[idx])) + + while (idx < command_len && unicode_bidi_is_neutral_wchar(command_ps[idx])) idx++; - return unicode_isrtl(command_ps[idx]); + return unicode_bidi_isrtl(command_ps[idx]); } #else # define isrtl_str() 0 @@ -2220,19 +2221,18 @@ int FAST_FUNC read_line_input(const char *prompt, char *command, int maxsize, li command_ps[cursor] = ic; command_ps[cursor + 1] = BB_NUL; cmdedit_set_out_char(' '); - if (unicode_isrtl(ic)) + if (unicode_bidi_isrtl(ic)) input_backward(1); } else { /* In the middle, insert */ - /* is char right-to-left, or "neutral" one (e.g. comma) added to rtl text? */ - int rtl = ENABLE_UNICODE_BIDI_SUPPORT ? (unicode_isrtl(ic) || (ic < 127 && !isalpha(ic) && isrtl_str())) : 0; int sc = cursor; memmove(command_ps + sc + 1, command_ps + sc, (command_len - sc) * sizeof(command_ps[0])); command_ps[sc] = ic; - if (!rtl) - sc++; + /* is right-to-left char, or neutral one (e.g. comma) was just added to rtl text? */ + if (!isrtl_str()) + sc++; /* no */ /* rewrite from cursor */ input_end(); /* to prev x pos + 1 */ diff --git a/libbb/unicode.c b/libbb/unicode.c index 91667ea72..bc9714562 100644 --- a/libbb/unicode.c +++ b/libbb/unicode.c @@ -242,7 +242,7 @@ int FAST_FUNC iswpunct(wint_t wc) #include "unicode_wcwidth.c" # if ENABLE_UNICODE_BIDI_SUPPORT -int FAST_FUNC unicode_isrtl(wint_t wc) +int FAST_FUNC unicode_bidi_isrtl(wint_t wc) { /* ranges taken from * http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt @@ -251,43 +251,44 @@ int FAST_FUNC unicode_isrtl(wint_t wc) static const struct interval rtl_b[] = { # define BIG_(a,b) { a, b }, # define PAIR(a,b) - PAIR(0x0590, 0x0590) - PAIR(0x05BE, 0x05BE) - PAIR(0x05C0, 0x05C0) - PAIR(0x05C3, 0x05C3) - PAIR(0x05C6, 0x05C6) - BIG_(0x05C8, 0x05FF) - PAIR(0x0604, 0x0605) - PAIR(0x0608, 0x0608) - PAIR(0x060B, 0x060B) - PAIR(0x060D, 0x060D) - BIG_(0x061B, 0x064A) - PAIR(0x065F, 0x065F) - PAIR(0x066D, 0x066F) - BIG_(0x0671, 0x06D5) - PAIR(0x06E5, 0x06E6) - PAIR(0x06EE, 0x06EF) - BIG_(0x06FA, 0x070E) - PAIR(0x0710, 0x0710) - BIG_(0x0712, 0x072F) - BIG_(0x074B, 0x07A5) - BIG_(0x07B1, 0x07EA) - PAIR(0x07F4, 0x07F5) - BIG_(0x07FA, 0x0815) - PAIR(0x081A, 0x081A) - PAIR(0x0824, 0x0824) - PAIR(0x0828, 0x0828) - BIG_(0x082E, 0x08FF) - PAIR(0x200F, 0x200F) - PAIR(0x202B, 0x202B) - PAIR(0x202E, 0x202E) - BIG_(0xFB1D, 0xFB1D) - BIG_(0xFB1F, 0xFB28) - BIG_(0xFB2A, 0xFD3D) - BIG_(0xFD40, 0xFDCF) - BIG_(0xFDC8, 0xFDCF) - BIG_(0xFDF0, 0xFDFC) - BIG_(0xFDFE, 0xFDFF) +# define ARRAY \ + PAIR(0x0590, 0x0590) \ + PAIR(0x05BE, 0x05BE) \ + PAIR(0x05C0, 0x05C0) \ + PAIR(0x05C3, 0x05C3) \ + PAIR(0x05C6, 0x05C6) \ + BIG_(0x05C8, 0x05FF) \ + PAIR(0x0604, 0x0605) \ + PAIR(0x0608, 0x0608) \ + PAIR(0x060B, 0x060B) \ + PAIR(0x060D, 0x060D) \ + BIG_(0x061B, 0x064A) \ + PAIR(0x065F, 0x065F) \ + PAIR(0x066D, 0x066F) \ + BIG_(0x0671, 0x06D5) \ + PAIR(0x06E5, 0x06E6) \ + PAIR(0x06EE, 0x06EF) \ + BIG_(0x06FA, 0x070E) \ + PAIR(0x0710, 0x0710) \ + BIG_(0x0712, 0x072F) \ + BIG_(0x074B, 0x07A5) \ + BIG_(0x07B1, 0x07EA) \ + PAIR(0x07F4, 0x07F5) \ + BIG_(0x07FA, 0x0815) \ + PAIR(0x081A, 0x081A) \ + PAIR(0x0824, 0x0824) \ + PAIR(0x0828, 0x0828) \ + BIG_(0x082E, 0x08FF) \ + PAIR(0x200F, 0x200F) \ + PAIR(0x202B, 0x202B) \ + PAIR(0x202E, 0x202E) \ + BIG_(0xFB1D, 0xFB1D) \ + BIG_(0xFB1F, 0xFB28) \ + BIG_(0xFB2A, 0xFD3D) \ + BIG_(0xFD40, 0xFDCF) \ + BIG_(0xFDC8, 0xFDCF) \ + BIG_(0xFDF0, 0xFDFC) \ + BIG_(0xFDFE, 0xFDFF) \ BIG_(0xFE70, 0xFEFE) /* Probably not necessary {0x10800, 0x1091E}, @@ -302,68 +303,21 @@ int FAST_FUNC unicode_isrtl(wint_t wc) {0x10E7F, 0x10FFF}, {0x1E800, 0x1EFFF} */ + ARRAY # undef BIG_ # undef PAIR }; - - static const uint16_t rtl_p[] = { # define BIG_(a,b) # define PAIR(a,b) (a << 2) | (b-a), - /* Exact copy-n-paste of the above: */ - PAIR(0x0590, 0x0590) - PAIR(0x05BE, 0x05BE) - PAIR(0x05C0, 0x05C0) - PAIR(0x05C3, 0x05C3) - PAIR(0x05C6, 0x05C6) - BIG_(0x05C8, 0x05FF) - PAIR(0x0604, 0x0605) - PAIR(0x0608, 0x0608) - PAIR(0x060B, 0x060B) - PAIR(0x060D, 0x060D) - BIG_(0x061B, 0x064A) - PAIR(0x065F, 0x065F) - PAIR(0x066D, 0x066F) - BIG_(0x0671, 0x06D5) - PAIR(0x06E5, 0x06E6) - PAIR(0x06EE, 0x06EF) - BIG_(0x06FA, 0x070E) - PAIR(0x0710, 0x0710) - BIG_(0x0712, 0x072F) - BIG_(0x074B, 0x07A5) - BIG_(0x07B1, 0x07EA) - PAIR(0x07F4, 0x07F5) - BIG_(0x07FA, 0x0815) - PAIR(0x081A, 0x081A) - PAIR(0x0824, 0x0824) - PAIR(0x0828, 0x0828) - BIG_(0x082E, 0x08FF) - PAIR(0x200F, 0x200F) - PAIR(0x202B, 0x202B) - PAIR(0x202E, 0x202E) - BIG_(0xFB1D, 0xFB1D) - BIG_(0xFB1F, 0xFB28) - BIG_(0xFB2A, 0xFD3D) - BIG_(0xFD40, 0xFDCF) - BIG_(0xFDC8, 0xFDCF) - BIG_(0xFDF0, 0xFDFC) - BIG_(0xFDFE, 0xFDFF) - BIG_(0xFE70, 0xFEFE) - /* Probably not necessary - {0x10800, 0x1091E}, - {0x10920, 0x10A00}, - {0x10A04, 0x10A04}, - {0x10A07, 0x10A0B}, - {0x10A10, 0x10A37}, - {0x10A3B, 0x10A3E}, - {0x10A40, 0x10A7F}, - {0x10B36, 0x10B38}, - {0x10B40, 0x10E5F}, - {0x10E7F, 0x10FFF}, - {0x1E800, 0x1EFFF} - */ + static const uint16_t rtl_p[] = { ARRAY }; # undef BIG_ # undef PAIR - }; +# define BIG_(a,b) char big_##a[b < 0x4000 && b-a <= 3 ? -1 : 1]; +# define PAIR(a,b) char pair##a[b >= 0x4000 || b-a > 3 ? -1 : 1]; + struct CHECK { ARRAY }; +# undef BIG_ +# undef PAIR +# undef ARRAY if (in_interval_table(wc, rtl_b, ARRAY_SIZE(rtl_b) - 1)) return 1; @@ -371,6 +325,214 @@ int FAST_FUNC unicode_isrtl(wint_t wc) return 1; return 0; } + +# if ENABLE_UNICODE_NEUTRAL_TABLE +int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc) +{ + /* ranges taken from + * http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt + * Bidi_Classes: Paragraph_Separator, Segment_Separator, + * White_Space, Other_Neutral, European_Number, European_Separator, + * European_Terminator, Arabic_Number, Common_Separator + */ + static const struct interval neutral_b[] = { +# define BIG_(a,b) { a, b }, +# define PAIR(a,b) +# define ARRAY \ + BIG_(0x0009, 0x000D) \ + BIG_(0x001C, 0x0040) \ + BIG_(0x005B, 0x0060) \ + PAIR(0x007B, 0x007E) \ + PAIR(0x0085, 0x0085) \ + BIG_(0x00A0, 0x00A9) \ + PAIR(0x00AB, 0x00AC) \ + BIG_(0x00AE, 0x00B4) \ + PAIR(0x00B6, 0x00B9) \ + BIG_(0x00BB, 0x00BF) \ + PAIR(0x00D7, 0x00D7) \ + PAIR(0x00F7, 0x00F7) \ + PAIR(0x02B9, 0x02BA) \ + BIG_(0x02C2, 0x02CF) \ + BIG_(0x02D2, 0x02DF) \ + BIG_(0x02E5, 0x02FF) \ + PAIR(0x0374, 0x0375) \ + PAIR(0x037E, 0x037E) \ + PAIR(0x0384, 0x0385) \ + PAIR(0x0387, 0x0387) \ + PAIR(0x03F6, 0x03F6) \ + PAIR(0x058A, 0x058A) \ + PAIR(0x0600, 0x0603) \ + PAIR(0x0606, 0x0607) \ + PAIR(0x0609, 0x060A) \ + PAIR(0x060C, 0x060C) \ + PAIR(0x060E, 0x060F) \ + BIG_(0x0660, 0x066C) \ + PAIR(0x06DD, 0x06DD) \ + PAIR(0x06E9, 0x06E9) \ + BIG_(0x06F0, 0x06F9) \ + PAIR(0x07F6, 0x07F9) \ + PAIR(0x09F2, 0x09F3) \ + PAIR(0x09FB, 0x09FB) \ + PAIR(0x0AF1, 0x0AF1) \ + BIG_(0x0BF3, 0x0BFA) \ + BIG_(0x0C78, 0x0C7E) \ + PAIR(0x0CF1, 0x0CF2) \ + PAIR(0x0E3F, 0x0E3F) \ + PAIR(0x0F3A, 0x0F3D) \ + BIG_(0x1390, 0x1400) \ + PAIR(0x1680, 0x1680) \ + PAIR(0x169B, 0x169C) \ + PAIR(0x17DB, 0x17DB) \ + BIG_(0x17F0, 0x17F9) \ + BIG_(0x1800, 0x180A) \ + PAIR(0x180E, 0x180E) \ + PAIR(0x1940, 0x1940) \ + PAIR(0x1944, 0x1945) \ + BIG_(0x19DE, 0x19FF) \ + PAIR(0x1FBD, 0x1FBD) \ + PAIR(0x1FBF, 0x1FC1) \ + PAIR(0x1FCD, 0x1FCF) \ + PAIR(0x1FDD, 0x1FDF) \ + PAIR(0x1FED, 0x1FEF) \ + PAIR(0x1FFD, 0x1FFE) \ + BIG_(0x2000, 0x200A) \ + BIG_(0x2010, 0x2029) \ + BIG_(0x202F, 0x205F) \ + PAIR(0x2070, 0x2070) \ + BIG_(0x2074, 0x207E) \ + BIG_(0x2080, 0x208E) \ + BIG_(0x20A0, 0x20B8) \ + PAIR(0x2100, 0x2101) \ + PAIR(0x2103, 0x2106) \ + PAIR(0x2108, 0x2109) \ + PAIR(0x2114, 0x2114) \ + PAIR(0x2116, 0x2118) \ + BIG_(0x211E, 0x2123) \ + PAIR(0x2125, 0x2125) \ + PAIR(0x2127, 0x2127) \ + PAIR(0x2129, 0x2129) \ + PAIR(0x212E, 0x212E) \ + PAIR(0x213A, 0x213B) \ + BIG_(0x2140, 0x2144) \ + PAIR(0x214A, 0x214D) \ + BIG_(0x2150, 0x215F) \ + PAIR(0x2189, 0x2189) \ + BIG_(0x2190, 0x2335) \ + BIG_(0x237B, 0x2394) \ + BIG_(0x2396, 0x23E8) \ + BIG_(0x2400, 0x2426) \ + BIG_(0x2440, 0x244A) \ + BIG_(0x2460, 0x249B) \ + BIG_(0x24EA, 0x26AB) \ + BIG_(0x26AD, 0x26CD) \ + BIG_(0x26CF, 0x26E1) \ + PAIR(0x26E3, 0x26E3) \ + BIG_(0x26E8, 0x26FF) \ + PAIR(0x2701, 0x2704) \ + PAIR(0x2706, 0x2709) \ + BIG_(0x270C, 0x2727) \ + BIG_(0x2729, 0x274B) \ + PAIR(0x274D, 0x274D) \ + PAIR(0x274F, 0x2752) \ + BIG_(0x2756, 0x275E) \ + BIG_(0x2761, 0x2794) \ + BIG_(0x2798, 0x27AF) \ + BIG_(0x27B1, 0x27BE) \ + BIG_(0x27C0, 0x27CA) \ + PAIR(0x27CC, 0x27CC) \ + BIG_(0x27D0, 0x27FF) \ + BIG_(0x2900, 0x2B4C) \ + BIG_(0x2B50, 0x2B59) \ + BIG_(0x2CE5, 0x2CEA) \ + BIG_(0x2CF9, 0x2CFF) \ + BIG_(0x2E00, 0x2E99) \ + BIG_(0x2E9B, 0x2EF3) \ + BIG_(0x2F00, 0x2FD5) \ + BIG_(0x2FF0, 0x2FFB) \ + BIG_(0x3000, 0x3004) \ + BIG_(0x3008, 0x3020) \ + PAIR(0x3030, 0x3030) \ + PAIR(0x3036, 0x3037) \ + PAIR(0x303D, 0x303D) \ + PAIR(0x303E, 0x303F) \ + PAIR(0x309B, 0x309C) \ + PAIR(0x30A0, 0x30A0) \ + PAIR(0x30FB, 0x30FB) \ + BIG_(0x31C0, 0x31E3) \ + PAIR(0x321D, 0x321E) \ + BIG_(0x3250, 0x325F) \ + PAIR(0x327C, 0x327E) \ + BIG_(0x32B1, 0x32BF) \ + PAIR(0x32CC, 0x32CF) \ + PAIR(0x3377, 0x337A) \ + PAIR(0x33DE, 0x33DF) \ + PAIR(0x33FF, 0x33FF) \ + BIG_(0x4DC0, 0x4DFF) \ + BIG_(0xA490, 0xA4C6) \ + BIG_(0xA60D, 0xA60F) \ + BIG_(0xA673, 0xA673) \ + BIG_(0xA67E, 0xA67F) \ + BIG_(0xA700, 0xA721) \ + BIG_(0xA788, 0xA788) \ + BIG_(0xA828, 0xA82B) \ + BIG_(0xA838, 0xA839) \ + BIG_(0xA874, 0xA877) \ + BIG_(0xFB29, 0xFB29) \ + BIG_(0xFD3E, 0xFD3F) \ + BIG_(0xFDFD, 0xFDFD) \ + BIG_(0xFE10, 0xFE19) \ + BIG_(0xFE30, 0xFE52) \ + BIG_(0xFE54, 0xFE66) \ + BIG_(0xFE68, 0xFE6B) \ + BIG_(0xFF01, 0xFF20) \ + BIG_(0xFF3B, 0xFF40) \ + BIG_(0xFF5B, 0xFF65) \ + BIG_(0xFFE0, 0xFFE6) \ + BIG_(0xFFE8, 0xFFEE) \ + BIG_(0xFFF9, 0xFFFD) + /* + {0x10101, 0x10101}, + {0x10140, 0x1019B}, + {0x1091F, 0x1091F}, + {0x10B39, 0x10B3F}, + {0x10E60, 0x10E7E}, + {0x1D200, 0x1D241}, + {0x1D245, 0x1D245}, + {0x1D300, 0x1D356}, + {0x1D6DB, 0x1D6DB}, + {0x1D715, 0x1D715}, + {0x1D74F, 0x1D74F}, + {0x1D789, 0x1D789}, + {0x1D7C3, 0x1D7C3}, + {0x1D7CE, 0x1D7FF}, + {0x1F000, 0x1F02B}, + {0x1F030, 0x1F093}, + {0x1F100, 0x1F10A} + */ + ARRAY +# undef BIG_ +# undef PAIR + }; +# define BIG_(a,b) +# define PAIR(a,b) (a << 2) | (b-a), + static const uint16_t neutral_p[] = { ARRAY }; +# undef BIG_ +# undef PAIR +# define BIG_(a,b) char big_##a[b < 0x4000 && b-a <= 3 ? -1 : 1]; +# define PAIR(a,b) char pair##a[b >= 0x4000 || b-a > 3 ? -1 : 1]; + struct CHECK { ARRAY }; +# undef BIG_ +# undef PAIR +# undef ARRAY + + if (in_interval_table(wc, neutral_b, ARRAY_SIZE(neutral_b) - 1)) + return 1; + if (in_uint16_table(wc, neutral_p, ARRAY_SIZE(neutral_p) - 1)) + return 1; + return 0; +} +# endif + # endif /* UNICODE_BIDI_SUPPORT */ #endif /* Homegrown Unicode support */ diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c index 7eccc394c..0bb622705 100644 --- a/libbb/unicode_wcwidth.c +++ b/libbb/unicode_wcwidth.c @@ -538,6 +538,6 @@ static int wcwidth(unsigned ucs) || ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */ # endif ); -# endif -#endif +# endif /* >= 0x1100 */ +#endif /* >= 0x300 */ }