#!/usr/bin/env python3
#
# Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3.  If not see
# <http://www.gnu.org/licenses/>.  */

import sys
import os
import subprocess

# Largest code point covered by the width table.
MAX_CODEPOINT = 0x10FFFF

# Width of every code point, indexed by code point.  Everything starts out
# with width 1; process_width() overrides the entries listed in the WIDTH
# section of the generated charmap.
widths = [1] * (1 + MAX_CODEPOINT)


# Parse a codepoint in the format output by glibc tools, i.e. "<Uxxxx>" with
# xxxx being hexadecimal digits.  Returns the code point as an int and raises
# ValueError if the token is not of that form.
def parse_ucn(s):
    if not (s.startswith("<U") and s.endswith(">")):
        raise ValueError("not a <Uxxxx> code point: %r" % s)
    return int(s[2:-1], base=16)


# Process a line of width output from utf8_gen.py and update the global
# widths array.  Raises ValueError for a malformed code point, width or range
# and IndexError when the line has fewer than two fields; callers are expected
# to treat both as "skip this line".
def process_width(line):
    # Accepted forms (fields separated by whitespace):
    # <Uxxxx> W
    # <Uxxxx>...<Uyyyy> W

    fields = line.split()
    width = int(fields[1])
    bounds = fields[0].split("...")
    if len(bounds) == 1:
        begin = parse_ucn(bounds[0])
        end = begin + 1
    elif len(bounds) == 2:
        begin = parse_ucn(bounds[0])
        end = parse_ucn(bounds[1]) + 1
    else:
        raise ValueError("malformed code point range: %r" % fields[0])
    # A slice assignment past the end of the list would silently grow the
    # table, and a reversed range would silently do nothing; reject both.
    if not (0 <= begin < end <= len(widths)):
        raise ValueError("code point range out of bounds: %r" % fields[0])
    widths[begin:end] = [width] * (end - begin)


# Feed every line between the "WIDTH" and "END WIDTH" markers of the file at
# PATH to process_width().  Lines that cannot be parsed are reported on stderr
# and skipped.
def read_widths(path):
    in_width_section = False
    with open(path, "r") as charmap:
        for line in charmap:
            if not in_width_section:
                in_width_section = line == "WIDTH\n"
            elif line == "END WIDTH\n":
                in_width_section = False
            else:
                try:
                    process_width(line)
                except (ValueError, IndexError) as e:
                    print(e, "warning: ignored unexpected line: %s" % line,
                          file=sys.stderr, end="")


# Condense a per-code-point width table into a list of (last_index, width)
# pairs, one per run of equal widths.  A run is only emitted when the next
# run begins, so the final run is never included.
# NOTE(review): presumably the consumer of the generated tables supplies its
# own default for code points past the last listed range -- confirm before
# changing this, since emitting the final run would alter the tables.
def condense_ranges(width_table):
    ranges = []
    last_index = -1
    current_width = 1
    for codepoint, width in enumerate(width_table):
        if width != current_width:
            ranges.append((last_index, current_width))
            current_width = width
        last_index = codepoint
    return ranges


# Return the text of a C array initializer: DECLARATION, then the already
# formatted ITEMS laid out PER_LINE to a row, then the closing "};".
def format_array(declaration, items, per_line):
    pieces = [declaration]
    for index, item in enumerate(items):
        pieces.append(" " if index % per_line else "\n ")
        pieces.append(item)
    pieces.append("\n};")
    return "".join(pieces)


# Entry point: ARGV is the full argument vector (program name first).
# Returns the process exit status.
def main(argv):
    if len(argv) != 2:
        program = argv[0] if argv else "gen_wcwidth.py"
        print("usage: %s <unicode_version>" % program, file=sys.stderr)
        return 1
    unicode_version = argv[1]

    # To keep things simple, we use glibc utf8_gen.py as-is.  It only outputs
    # to a file named UTF-8, which is not configurable.  Then we parse this
    # into the form we want it.  Check the exit status so that a failed run
    # cannot leave us parsing a stale UTF-8 file.
    try:
        subprocess.run(["from_glibc/utf8_gen.py",
                        "--unicode_version", unicode_version],
                       check=True)
    except (OSError, subprocess.CalledProcessError) as e:
        print("error: from_glibc/utf8_gen.py failed: %s" % e,
              file=sys.stderr)
        return 1
    read_widths("UTF-8")

    # All bytes < 256 we treat as width 1.
    widths[0:256] = [1] * 256

    all_ranges = condense_ranges(widths)

    # Output the arrays for generated_cpp_wcwidth.h
    print("/* Generated by contrib/unicode/gen_wcwidth.py,",
          "with the help of glibc's")
    print(" utf8_gen.py, using version %s" % unicode_version,
          "of the Unicode standard. */")
    print("\n" + format_array(
        "static const cppchar_t wcwidth_range_ends[] = {",
        ["0x%x," % last for last, _ in all_ranges], 8) + "\n")
    print(format_array(
        "static const unsigned char wcwidth_widths[] = {",
        ["%d," % width for _, width in all_ranges], 24))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))