diff options
Diffstat (limited to 'tdemarkdown/md4c/scripts/build_folding_map.py')
-rw-r--r-- | tdemarkdown/md4c/scripts/build_folding_map.py | 120 |
1 files changed, 120 insertions, 0 deletions
#!/usr/bin/env python3
#
# Diff header of the capture this script was extracted from:
#   diff --git a/tdemarkdown/md4c/scripts/build_folding_map.py b/tdemarkdown/md4c/scripts/build_folding_map.py
#   new file mode 100644
#   index 000000000..b401775f5
#   --- /dev/null
#   +++ b/tdemarkdown/md4c/scripts/build_folding_map.py
#   @@ -0,0 +1,120 @@

import os
import sys
import textwrap


# Directory containing this script; the input data is looked up relative to it.
self_path = os.path.dirname(os.path.realpath(__file__))

# Status codes of the CaseFolding.txt entries that are kept; every other
# status is skipped.
status_list = ["C", "F"]

# folding_list[n - 1] maps a codepoint to its folding, for the foldings made
# of exactly n codepoints (n = 1, 2, 3). Each value is a list of n integers.
folding_list = [dict(), dict(), dict()]

# Filter the foldings for "full" folding.
#
# The file is opened through a context manager so that it gets closed even
# when a malformed line raises in the middle of parsing.
with open(os.path.join(self_path, "unicode", "CaseFolding.txt"), "r",
          encoding="utf-8") as f:
    for line in f:
        # Drop the trailing comment (if any) and the surrounding whitespace.
        comment_off = line.find("#")
        if comment_off >= 0:
            line = line[:comment_off]
        line = line.strip()
        if not line:
            continue

        # Expected shape: "<codepoint>; <status>; <mapping>; <rest>"
        raw_codepoint, status, raw_mapping, ignored_tail = line.split(";", 3)
        if status.strip() not in status_list:
            continue
        codepoint = int(raw_codepoint.strip(), 16)
        mapping = [int(it, 16) for it in raw_mapping.split()]
        mapping_len = len(mapping)

        # Only foldings to 1, 2 or 3 codepoints have a table. Raise instead
        # of using assert: asserts are stripped under "python -O", which
        # would silently drop the offending entry.
        if not 1 <= mapping_len <= 3:
            raise ValueError(
                "unsupported folding length {} for codepoint 0x{:04x}".format(
                    mapping_len, codepoint))
        folding_list[mapping_len - 1][codepoint] = mapping


# If we assume that (index0 ... index-1) makes a range (as defined below),
# check that the newly provided index is compatible with the range too; i.e.
# verify that the range can be extended without breaking its properties.
#
# Currently, we can handle ranges which:
#
# (1) either form consecutive sequence of codepoints and which map that range
#     to other consecutive range of codepoints (of the same length);
#
# (2) or a consecutive sequence of codepoints with step 2 where each codepoint
#     CP is mapped to the codepoint CP+1
#     (e.g. 0x1234 -> 0x1235; 0x1236 -> 0x1237; 0x1238 -> 0x1239; ...).
#
# Note: When the codepoints in the range are mapped to multiple codepoints,
# only the 1st mapped codepoint is considered. All the other ones have to be
# shared by all the mappings covered by the range.
def is_range_compatible(folding, codepoint_list, index0, index):
    """Tell whether the entry at `index` can extend the range starting at `index0`.

    The entries index0 .. index-1 are assumed to already form a range.
    Two range shapes are recognized:

    (1) consecutive codepoints whose first mapped codepoints are consecutive
        as well (codepoint0 + k -> mapping0[0] + k);
    (2) codepoints with step 2 where the first mapped codepoint of each
        codepoint CP is CP + 1.

    In both shapes all mapped codepoints after the first one must be the
    same for every entry of the range.

    Parameters:
        folding: dict mapping a codepoint to the list of its folded codepoints.
        codepoint_list: ascending list of the keys of `folding`.
        index0: index (into codepoint_list) of the first entry of the range.
        index: index of the candidate entry; must be greater than index0.

    Returns:
        True when the candidate is compatible with the range, False otherwise.
    """
    N = index - index0
    codepoint0 = codepoint_list[index0]
    codepoint1 = codepoint_list[index0 + 1]
    codepointN = codepoint_list[index]
    mapping0 = folding[codepoint0]
    mapping1 = folding[codepoint1]
    mappingN = folding[codepointN]

    # Both range shapes require the mapping tails to be shared, so a
    # mismatch there rules the candidate out regardless of the shape.
    if mapping1[1:] != mapping0[1:] or mappingN[1:] != mapping0[1:]:
        return False

    # Check the range type (1):
    if (codepoint1 - codepoint0 == 1 and codepointN - codepoint0 == N
            and mapping1[0] - mapping0[0] == 1
            and mappingN[0] - mapping0[0] == N):
        return True

    # Check the range type (2):
    if (codepoint1 - codepoint0 == 2 and codepointN - codepoint0 == 2 * N
            and mapping0[0] - codepoint0 == 1
            and mapping1[0] - codepoint1 == 1
            and mappingN[0] - codepointN == 1):
        return True

    return False


def mapping_str(list, mapping):
    """Format `mapping` as comma-separated hexadecimal literals.

    Parameters:
        list: unused. The name shadows the builtin inside this function only;
            it is kept as is so that the signature stays unchanged.
        mapping: iterable of integer codepoints.

    Returns:
        A string such as "0x0061,0x0062" (no spaces, at least 4 hex digits
        per codepoint).
    """
    return ",".join("0x{:04x}".format(x) for x in mapping)


def _build_records(folding):
    """Compress one folding table into (records, data_records) string lists.

    Runs of more than two entries accepted by is_range_compatible() become a
    single "R(first,last)" record with two data records (the mappings of the
    first and of the last codepoint); every other codepoint becomes an
    "S(codepoint)" record with one data record.
    """
    # is_range_compatible() only recognizes ascending codepoints, so sort
    # explicitly rather than depending on the insertion order of the dict.
    codepoint_list = sorted(folding)
    count = len(codepoint_list)

    records = []
    data_records = []

    index0 = 0
    while index0 < count:
        # Grow the candidate range [index0, index1) as far as possible.
        index1 = index0 + 1
        while index1 < count and is_range_compatible(folding, codepoint_list, index0, index1):
            index1 += 1

        if index1 - index0 > 2:
            # Range of codepoints
            first = codepoint_list[index0]
            last = codepoint_list[index1 - 1]
            records.append("R(0x{:04x},0x{:04x})".format(first, last))
            data_records.append(mapping_str(data_records, folding[first]))
            data_records.append(mapping_str(data_records, folding[last]))
            index0 = index1
        else:
            # Single codepoint
            single = codepoint_list[index0]
            records.append("S(0x{:04x})".format(single))
            data_records.append(mapping_str(data_records, folding[single]))
            index0 += 1

    return records, data_records


def _write_array(name, items):
    """Write `items` to stdout as the C array `static const unsigned <name>[]`."""
    sys.stdout.write("static const unsigned {}[] = {{\n".format(name))
    # NOTE(review): the indent literals are reproduced exactly as they appear
    # in this capture (a single space); whitespace in the capture looks
    # collapsed, so confirm the intended indent width against upstream.
    sys.stdout.write("\n".join(textwrap.wrap(", ".join(items), 110,
                                             initial_indent=" ",
                                             subsequent_indent=" ")))
    sys.stdout.write("\n};\n")


# Emit one record array and one data array per folding length (1, 2, 3).
for mapping_len in range(1, 4):
    records, data_records = _build_records(folding_list[mapping_len - 1])
    _write_array("FOLD_MAP_{}".format(mapping_len), records)
    _write_array("FOLD_MAP_{}_DATA".format(mapping_len), data_records)