diff options
-rwxr-xr-x | contrib/unicode/utf8-dump.py | 69 |
1 files changed, 69 insertions, 0 deletions
diff --git a/contrib/unicode/utf8-dump.py b/contrib/unicode/utf8-dump.py new file mode 100755 index 00000000000..f12ee79f9f2 --- /dev/null +++ b/contrib/unicode/utf8-dump.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +# +# Script to dump a UTF-8 file as a list of numbered lines (mimicking GCC's +# diagnostic output format), interleaved with lines per character showing +# the Unicode codepoints, the UTF-8 encoding bytes, the name of the +# character, and, where printable, the characters themselves. +# The lines are printed in logical order, which may help the reader to grok +# the relationship between visual and logical ordering in bi-di files. +# +# SPDX-License-Identifier: MIT +# +# Copyright (C) 2021 David Malcolm <dmalcolm@redhat.com>. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT +# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import sys +import unicodedata + + +def get_name(ch): + try: + return unicodedata.name(ch) + except ValueError: + if ch == '\n': + return 'LINE FEED (LF)' + return '(unknown)' + + +def get_printable(ch): + cat = unicodedata.category(ch) + if cat == 'Cc': + return '(control character)' + elif cat == 'Cf': + return '(format control)' + elif cat[0] == 'Z': + return '(separator)' + return ch + + +def dump_file(f_in): + line_num = 1 + for line in f_in: + print('%4i | %s' % (line_num, line.rstrip())) + for ch in line: + utf8_desc = '%15s' % (' '.join(['0x%02x' % b + for b in ch.encode('utf-8')])) + print('%4s | U+%04X %s %40s %s' + % ('', ord(ch), utf8_desc, get_name(ch), get_printable(ch))) + line_num += 1 + + +with open(sys.argv[1], mode='r') as f_in: + dump_file(f_in) |