Take Mac OS text encoding constants from headers

Among other uses, these constants are used on HFS+ volumes to choose
which encoding to use when converting Unicode filenames back to an 8-bit
string for old Mac OS APIs.

This is documented in TN 1150 under "Text Encodings".

https://developer.apple.com/library/archive/technotes/tn/tn1150.html
This commit is contained in:
Dietrich Epp 2022-04-01 12:05:15 -04:00
parent 3dcda5579e
commit 4b20b30330
2 changed files with 86 additions and 13 deletions

41
scripts/encoding.csv Normal file
View File

@ -0,0 +1,41 @@
Name,Value
kTextEncodingMacRoman,0
kTextEncodingMacJapanese,1
kTextEncodingMacChineseTrad,2
kTextEncodingMacKorean,3
kTextEncodingMacArabic,4
kTextEncodingMacHebrew,5
kTextEncodingMacGreek,6
kTextEncodingMacCyrillic,7
kTextEncodingMacDevanagari,9
kTextEncodingMacGurmukhi,10
kTextEncodingMacGujarati,11
kTextEncodingMacOriya,12
kTextEncodingMacBengali,13
kTextEncodingMacTamil,14
kTextEncodingMacTelugu,15
kTextEncodingMacKannada,16
kTextEncodingMacMalayalam,17
kTextEncodingMacSinhalese,18
kTextEncodingMacBurmese,19
kTextEncodingMacKhmer,20
kTextEncodingMacThai,21
kTextEncodingMacLaotian,22
kTextEncodingMacGeorgian,23
kTextEncodingMacArmenian,24
kTextEncodingMacChineseSimp,25
kTextEncodingMacTibetan,26
kTextEncodingMacMongolian,27
kTextEncodingMacEthiopic,28
kTextEncodingMacCentralEurRoman,29
kTextEncodingMacVietnamese,30
kTextEncodingMacExtArabic,31
kTextEncodingMacSymbol,33
kTextEncodingMacDingbats,34
kTextEncodingMacTurkish,35
kTextEncodingMacCroatian,36
kTextEncodingMacIcelandic,37
kTextEncodingMacRomanian,38
kTextEncodingMacCeltic,39
kTextEncodingMacGaelic,40
kTextEncodingMacKeyboardGlyphs,41
1 Name Value
2 kTextEncodingMacRoman 0
3 kTextEncodingMacJapanese 1
4 kTextEncodingMacChineseTrad 2
5 kTextEncodingMacKorean 3
6 kTextEncodingMacArabic 4
7 kTextEncodingMacHebrew 5
8 kTextEncodingMacGreek 6
9 kTextEncodingMacCyrillic 7
10 kTextEncodingMacDevanagari 9
11 kTextEncodingMacGurmukhi 10
12 kTextEncodingMacGujarati 11
13 kTextEncodingMacOriya 12
14 kTextEncodingMacBengali 13
15 kTextEncodingMacTamil 14
16 kTextEncodingMacTelugu 15
17 kTextEncodingMacKannada 16
18 kTextEncodingMacMalayalam 17
19 kTextEncodingMacSinhalese 18
20 kTextEncodingMacBurmese 19
21 kTextEncodingMacKhmer 20
22 kTextEncodingMacThai 21
23 kTextEncodingMacLaotian 22
24 kTextEncodingMacGeorgian 23
25 kTextEncodingMacArmenian 24
26 kTextEncodingMacChineseSimp 25
27 kTextEncodingMacTibetan 26
28 kTextEncodingMacMongolian 27
29 kTextEncodingMacEthiopic 28
30 kTextEncodingMacCentralEurRoman 29
31 kTextEncodingMacVietnamese 30
32 kTextEncodingMacExtArabic 31
33 kTextEncodingMacSymbol 33
34 kTextEncodingMacDingbats 34
35 kTextEncodingMacTurkish 35
36 kTextEncodingMacCroatian 36
37 kTextEncodingMacIcelandic 37
38 kTextEncodingMacRomanian 38
39 kTextEncodingMacCeltic 39
40 kTextEncodingMacGaelic 40
41 kTextEncodingMacKeyboardGlyphs 41

View File

@ -1,12 +1,21 @@
"""Extract script and region constants from Script.h."""
import csv
import os
import re
import sys
from typing import List, Tuple
from typing import Iterator, List, Tuple
# One enum constant from a header: (constant name, integer value).
Item = Tuple[str, int]
def list_enums(filename: str) -> Iterator[Item]:
    """List enum definitions in a file.

    Scans the file for lines of the form ``NAME = VALUE`` and yields each one
    as a ``(name, value)`` pair, in file order. ``VALUE`` may be a decimal
    literal or a ``0x``-prefixed hexadecimal literal; any trailing suffix
    (such as ``L``) or punctuation is ignored. Lines whose value is not an
    unsigned integer literal (negative numbers, expressions, casts) are
    skipped.

    Args:
        filename: Path to the header file to scan.

    Yields:
        ``(name, value)`` tuples with ``name`` decoded as ASCII.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Read as bytes so that non-ASCII bytes elsewhere in the header cannot
    # cause a decode error; a bytes pattern keeps \w and \d ASCII-only.
    with open(filename, 'rb') as fp:
        data = fp.read()
    # The hex alternative must come first: a bare \d+ would match only the
    # leading "0" of "0x7E" and silently report the value as 0.
    pattern = rb'^\s*(\w+)\s*=\s*(0[xX][0-9A-Fa-f]+|\d+)'
    for match in re.finditer(pattern, data, re.MULTILINE):
        name, value = match.groups()
        base = 16 if value[:2].lower() == b'0x' else 10
        yield name.decode('ASCII'), int(value, base)
def index_of(data: List[Item], key: str) -> int:
for i, (name, _) in enumerate(data):
if name == key:
@ -17,28 +26,51 @@ def slice(data: List[Item], first: str, last: str) -> List[Item]:
return data[index_of(data, first):index_of(data, last)+1]
def write_csv(fname: str, data: List[Item]) -> None:
    """Write items to a two-column CSV file with a ``Name,Value`` header.

    A progress line naming the output file is printed to stderr first. An
    existing file at ``fname`` is overwritten.

    Args:
        fname: Path of the CSV file to create.
        data: ``(name, value)`` pairs, written one per row in the given order.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    print('Writing', fname, file=sys.stderr)
    # newline='' is required by the csv module: the writer emits its own
    # line terminators, and newline translation on top of that would produce
    # '\r\r\n' row endings on Windows.
    with open(fname, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['Name', 'Value'])
        writer.writerows(data)
def process_script(filename: str) -> None:
    """Extract script and region constants from Script.h.

    Enum constants whose names start with ``sm`` are collected as scripts and
    those starting with ``ver`` as regions. The inclusive run from ``smRoman``
    to ``smUninterp`` is written to ``script.csv`` and the inclusive run from
    ``verUS`` to ``verGreenland`` to ``region.csv``, both relative to the
    current working directory.

    Args:
        filename: Path to Script.h.
    """
    scripts: List[Item] = []
    regions: List[Item] = []
    # list_enums already decodes names and converts values, so the header is
    # scanned exactly once here; no separate read of the raw file is needed.
    for name, value in list_enums(filename):
        if name.startswith('sm'):
            scripts.append((name, value))
        elif name.startswith('ver'):
            regions.append((name, value))
    write_csv('script.csv', slice(scripts, 'smRoman', 'smUninterp'))
    write_csv('region.csv', slice(regions, 'verUS', 'verGreenland'))
def process_textcommon(filename: str) -> None:
    """Extract text encoding constants from TextCommon.h.

    Keeps the enum constants whose names start with ``kTextEncoding`` and
    writes the inclusive run from ``kTextEncodingMacRoman`` to
    ``kTextEncodingMacKeyboardGlyphs`` to ``encoding.csv``.

    Args:
        filename: Path to TextCommon.h.
    """
    encodings: List[Item] = [
        (name, value)
        for name, value in list_enums(filename)
        if name.startswith('kTextEncoding')
    ]
    mac_encodings = slice(
        encodings,
        'kTextEncodingMacRoman',
        'kTextEncodingMacKeyboardGlyphs',
    )
    write_csv('encoding.csv', mac_encodings)
def process(filename: str) -> None:
    """Dispatch a header file to its extractor based on its base name.

    The base name is compared case-insensitively: ``script.h`` goes to
    ``process_script`` and ``textcommon.h`` to ``process_textcommon``. Any
    other name prints an error to stderr and exits with status 1.

    Args:
        filename: Path to the header file.

    Raises:
        SystemExit: With code 1 if the header is not recognized.
    """
    header = os.path.basename(filename).lower()
    if header == 'script.h':
        process_script(filename)
        return
    if header == 'textcommon.h':
        process_textcommon(filename)
        return
    print('Error: unknown header file:', repr(filename), file=sys.stderr)
    raise SystemExit(1)
def main(argv: List[str]) -> None:
    """Process every header file named on the command line.

    Args:
        argv: Command-line arguments, without the program name.

    Raises:
        SystemExit: With code 2, after writing usage text to stderr, if no
            arguments are given.
    """
    if not argv:
        usage = (
            'Usage: script_gen.py [<file.h>...]\n'
            'This will read Script.h and TextCommon.h\n'
        )
        sys.stderr.write(usage)
        raise SystemExit(2)
    for header_path in argv:
        process(header_path)
# Run only when executed as a script; pass the arguments without the
# program name.
if __name__ == '__main__':
    main(sys.argv[1:])