# HG changeset patch
# User HackBot
# Date 1397775239 0
# Node ID d09b208e869fd1f052f28de386546913954450f1
# Parent  6d2e9d293bda193a053fa89c32db9e533bdcefa4
fetch http://sources.debian.net/data/main/u/unicode/0.9.7/unicode

diff -r 6d2e9d293bda -r d09b208e869f unicode
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/unicode	Thu Apr 17 22:53:59 2014 +0000
@@ -0,0 +1,815 @@
+#!/usr/bin/python
+
+
+import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings
+import urllib, webbrowser, textwrap
+
+# bz2 was introduced in python 2.3; we want this to work with earlier versions too
+try:
+    import bz2
+except ImportError:
+    bz2 = None
+
+# for python3
+try:
+    unicode
+except NameError:
+    unicode = str
+
+# xrange is gone in python3 (used below when UnicodeData.txt is missing)
+try:
+    xrange
+except NameError:
+    xrange = range
+
+# 'any' and 'all' were introduced in python 2.5;
+# dummy replacement for older versions
+try:
+    all
+except NameError:
+    all = lambda x: False
+
+PY3 = sys.version_info[0] >= 3
+if PY3:
+    import subprocess as cmd
+
+    def is_ascii(s):
+        "test if string s consists entirely of ascii characters (python 3)"
+        try:
+            s.encode('ascii')
+        except UnicodeEncodeError:
+            return False
+        return True
+
+    def out(*args):
+        "print args, converting them to the output charset"
+        for i in args:
+            sys.stdout.flush()
+            sys.stdout.buffer.write(i.encode(options.iocharset, 'replace'))
+
+    # ord23 converts elements of a byte string, which are already integers in python3
+    ord23 = lambda x: x
+
+    # unichr does not exist in python3
+    unichr = chr
+
+else: # python2
+
+    # getoutput() and getstatusoutput() live in the commands module in
+    # python 2; they were moved to the subprocess module in python 3
+    import commands as cmd
+
+    def is_ascii(s):
+        "test if string s consists entirely of ascii characters (python 2)"
+        try:
+            unicode(s, 'ascii')
+        except UnicodeDecodeError:
+            return False
+        return True
+
+    def out(*args):
+        "print args, converting them to the output charset"
+        for i in args:
+            sys.stdout.write(i.encode(options.iocharset, 'replace'))
+
+    ord23 = ord
+
+
+
+from optparse import OptionParser
+
+VERSION='0.9.7'
+
+
+# list of terminals that support bidi
+biditerms = ['mlterm']
+
+try:
+    locale.setlocale(locale.LC_ALL, '')
+except locale.Error:
+    pass
+
+# guess the terminal charset
+try:
+    iocharsetguess = locale.nl_langinfo(locale.CODESET) or "ascii"
+except locale.Error:
+    iocharsetguess = "ascii"
+
+if os.environ.get('TERM') in biditerms and iocharsetguess.lower().startswith('utf'):
+    LTR = u'\u202d' # left to right override
+else:
+    LTR = ''
+
+
+colours = {
+    'none'       : "",
+    'default'    : "\033[0m",
+    'bold'       : "\033[1m",
+    'underline'  : "\033[4m",
+    'blink'      : "\033[5m",
+    'reverse'    : "\033[7m",
+    'concealed'  : "\033[8m",
+
+    'black'      : "\033[30m",
+    'red'        : "\033[31m",
+    'green'      : "\033[32m",
+    'yellow'     : "\033[33m",
+    'blue'       : "\033[34m",
+    'magenta'    : "\033[35m",
+    'cyan'       : "\033[36m",
+    'white'      : "\033[37m",
+
+    'on_black'   : "\033[40m",
+    'on_red'     : "\033[41m",
+    'on_green'   : "\033[42m",
+    'on_yellow'  : "\033[43m",
+    'on_blue'    : "\033[44m",
+    'on_magenta' : "\033[45m",
+    'on_cyan'    : "\033[46m",
+    'on_white'   : "\033[47m",
+
+    'beep'       : "\007",
+}
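+
+# Example (illustrative, not executed): once option parsing below has set
+# options.iocharset, out() can emit highlighted text with these escapes, e.g.
+#     out(colours['bold'], 'U+0041 LATIN CAPITAL LETTER A', colours['default'], '\n')
+# maybe_colours(), defined further down, is the guarded way to do this.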
+
+
+general_category = {
+    'Lu': 'Letter, Uppercase',
+    'Ll': 'Letter, Lowercase',
+    'Lt': 'Letter, Titlecase',
+    'Lm': 'Letter, Modifier',
+    'Lo': 'Letter, Other',
+    'Mn': 'Mark, Non-Spacing',
+    'Mc': 'Mark, Spacing Combining',
+    'Me': 'Mark, Enclosing',
+    'Nd': 'Number, Decimal Digit',
+    'Nl': 'Number, Letter',
+    'No': 'Number, Other',
+    'Pc': 'Punctuation, Connector',
+    'Pd': 'Punctuation, Dash',
+    'Ps': 'Punctuation, Open',
+    'Pe': 'Punctuation, Close',
+    'Pi': 'Punctuation, Initial quote',
+    'Pf': 'Punctuation, Final quote',
+    'Po': 'Punctuation, Other',
+    'Sm': 'Symbol, Math',
+    'Sc': 'Symbol, Currency',
+    'Sk': 'Symbol, Modifier',
+    'So': 'Symbol, Other',
+    'Zs': 'Separator, Space',
+    'Zl': 'Separator, Line',
+    'Zp': 'Separator, Paragraph',
+    'Cc': 'Other, Control',
+    'Cf': 'Other, Format',
+    'Cs': 'Other, Surrogate',
+    'Co': 'Other, Private Use',
+    'Cn': 'Other, Not Assigned',
+}
+
+bidi_category = {
+    'L'   : 'Left-to-Right',
+    'LRE' : 'Left-to-Right Embedding',
+    'LRO' : 'Left-to-Right Override',
+    'R'   : 'Right-to-Left',
+    'AL'  : 'Right-to-Left Arabic',
+    'RLE' : 'Right-to-Left Embedding',
+    'RLO' : 'Right-to-Left Override',
+    'PDF' : 'Pop Directional Format',
+    'EN'  : 'European Number',
+    'ES'  : 'European Number Separator',
+    'ET'  : 'European Number Terminator',
+    'AN'  : 'Arabic Number',
+    'CS'  : 'Common Number Separator',
+    'NSM' : 'Non-Spacing Mark',
+    'BN'  : 'Boundary Neutral',
+    'B'   : 'Paragraph Separator',
+    'S'   : 'Segment Separator',
+    'WS'  : 'Whitespace',
+    'ON'  : 'Other Neutrals',
+}
+
+comb_classes = {
+    0:   'Spacing, split, enclosing, reordrant, and Tibetan subjoined',
+    1:   'Overlays and interior',
+    7:   'Nuktas',
+    8:   'Hiragana/Katakana voicing marks',
+    9:   'Viramas',
+    10:  'Start of fixed position classes',
+    199: 'End of fixed position classes',
+    200: 'Below left attached',
+    202: 'Below attached',
+    204: 'Below right attached',
+    208: 'Left attached (reordrant around single base character)',
+    210: 'Right attached',
+    212: 'Above left attached',
+    214: 'Above attached',
+    216: 'Above right attached',
+    218: 'Below left',
+    220: 'Below',
+    222: 'Below right',
+    224: 'Left (reordrant around single base character)',
+    226: 'Right',
+    228: 'Above left',
+    230: 'Above',
+    232: 'Above right',
+    233: 'Double below',
+    234: 'Double above',
+    240: 'Below (iota subscript)',
+}
+
+
+
+def get_unicode_properties(ch):
+    properties = {}
+    if ch in linecache:
+        fields = linecache[ch].strip().split(';')
+        proplist = ['codepoint', 'name', 'category', 'combining', 'bidi',
+                    'decomposition', 'dummy', 'digit_value', 'numeric_value',
+                    'mirrored', 'unicode1name', 'iso_comment', 'uppercase',
+                    'lowercase', 'titlecase']
+        for i, prop in enumerate(proplist):
+            if prop != 'dummy':
+                properties[prop] = fields[i]
+
+        if properties['lowercase']:
+            properties['lowercase'] = unichr(int(properties['lowercase'], 16))
+        if properties['uppercase']:
+            properties['uppercase'] = unichr(int(properties['uppercase'], 16))
+        if properties['titlecase']:
+            properties['titlecase'] = unichr(int(properties['titlecase'], 16))
+
+        properties['combining'] = int(properties['combining'])
+        properties['mirrored'] = properties['mirrored'] == 'Y'
+    else:
+        properties['codepoint'] = '%04X' % ord(ch)
+        properties['name'] = unicodedata.name(ch, '')
+        properties['category'] = unicodedata.category(ch)
+        properties['combining'] = unicodedata.combining(ch)
+        properties['bidi'] = unicodedata.bidirectional(ch)
+        properties['decomposition'] = unicodedata.decomposition(ch)
+        properties['digit_value'] = unicodedata.digit(ch, '')
+        properties['numeric_value'] = unicodedata.numeric(ch, '')
+        properties['mirrored'] = unicodedata.mirrored(ch)
+        properties['unicode1name'] = ''
+        properties['iso_comment'] = ''
+        properties['uppercase'] = ch.upper()
+        properties['lowercase'] = ch.lower()
+        properties['titlecase'] = ''
+    return properties
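+
+# Example (illustrative): even without UnicodeData.txt, the unicodedata
+# fallback above yields e.g.
+#     get_unicode_properties(u'A')['name']     == 'LATIN CAPITAL LETTER A'
+#     get_unicode_properties(u'A')['category'] == 'Lu'
+# with a cached UnicodeData.txt line, the extra fields (unicode1name,
+# iso_comment, titlecase) are filled in as well.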
+
+
+def do_init():
+    HomeDir = os.path.expanduser('~/.unicode')
+    HomeUnicodeData = os.path.join(HomeDir, "UnicodeData.txt")
+    global UnicodeDataFileNames
+    UnicodeDataFileNames = [HomeUnicodeData, '/usr/share/unicode/UnicodeData.txt',
+                            '/usr/share/unidata/UnicodeData.txt', './UnicodeData.txt'] + \
+        glob.glob('/usr/share/unidata/UnicodeData*.txt') + \
+        glob.glob('/usr/share/perl/*/unicore/UnicodeData.txt') + \
+        glob.glob('/System/Library/Perl/*/unicore/UnicodeData.txt') # for MacOSX
+
+    HomeUnihanData = os.path.join(HomeDir, "Unihan*")
+    global UnihanDataGlobs
+    UnihanDataGlobs = [HomeUnihanData, '/usr/share/unidata/Unihan*',
+                       '/usr/share/unicode/Unihan*', './Unihan*']
+
+
+def get_unihan_files():
+    fos = [] # list of file names for Unihan data file(s)
+    for gl in UnihanDataGlobs:
+        fnames = glob.glob(gl)
+        fos += fnames
+    return fos
+
+def get_unihan_properties_internal(ch):
+    properties = {}
+    ch = ord(ch)
+    global unihan_fs
+    for f in unihan_fs:
+        fo = OpenGzip(f)
+        for l in fo:
+            if l.startswith('#'):
+                continue
+            line = l.strip()
+            if not line:
+                continue
+            char, key, value = line.split('\t')
+            if int(char[2:], 16) == ch:
+                properties[key] = unicode(value, 'utf-8')
+            elif int(char[2:], 16) > ch:
+                break
+    return properties
+
+def get_unihan_properties_zgrep(ch):
+    properties = {}
+    global unihan_fs
+    ch = ord(ch)
+    chs = 'U+%X' % ch
+    for f in unihan_fs:
+        if f.endswith('.gz'):
+            grepcmd = 'zgrep'
+        elif f.endswith('.bz2'):
+            grepcmd = 'bzgrep'
+        else:
+            grepcmd = 'grep'
+        cmdline = grepcmd + ' ^' + chs + r'\\b ' + f
+        status, output = cmd.getstatusoutput(cmdline)
+        output = output.split('\n')
+        for l in output:
+            if not l:
+                continue
+            char, key, value = l.strip().split('\t')
+            if int(char[2:], 16) == ch:
+                if PY3:
+                    properties[key] = value
+                else:
+                    properties[key] = unicode(value, 'utf-8')
+            elif int(char[2:], 16) > ch:
+                break
+    return properties
+
+# basic sanity check, if e.g. you run this on MS Windows...
+if os.path.exists('/bin/grep'):
+    get_unihan_properties = get_unihan_properties_zgrep
+else:
+    get_unihan_properties = get_unihan_properties_internal
+
+
+def error(txt):
+    out(txt)
+    out('\n')
+    sys.exit(1)
+
+def get_gzip_filename(fname):
+    "return fname if it exists; otherwise try fname+'.gz', then fname+'.bz2'; return None if none of them exists"
+    if os.path.exists(fname):
+        return fname
+    if os.path.exists(fname+'.gz'):
+        return fname+'.gz'
+    if os.path.exists(fname+'.bz2') and bz2 is not None:
+        return fname+'.bz2'
+    return None
+
+
+def OpenGzip(fname):
+    "open fname; if it does not exist, try fname.gz and fname.bz2; return a file, GzipFile or BZ2File object (or None)"
+    if os.path.exists(fname) and not (fname.endswith('.gz') or fname.endswith('.bz2')):
+        return open(fname)
+    if os.path.exists(fname+'.gz'):
+        fname = fname+'.gz'
+    elif os.path.exists(fname+'.bz2') and bz2 is not None:
+        fname = fname+'.bz2'
+    if fname.endswith('.gz'):
+        return gzip.GzipFile(fname)
+    elif fname.endswith('.bz2'):
+        return bz2.BZ2File(fname)
+    return None
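+
+# Example (illustrative): OpenGzip prefers the plain file and falls back to
+# the compressed variants, so all of these resolve transparently:
+#     f = OpenGzip('./UnicodeData.txt')   # opens ./UnicodeData.txt, .gz or .bz2
+#     if f is not None:
+#         f.readline()   # first record: '0000;<control>;Cc;0;BN;;;;;N;NULL;;;;'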
+ +""" ) + for i in xrange(sys.maxunicode): + try: + name = unicodedata.name(unichr(i)) + if re.search(p, name): + yield myunichr(i) + except ValueError: + pass + else: + for l in f: + if re.search(p, l): + r = myunichr(int(l.split(';')[0], 16)) + linecache[r] = l + yield r + f.close() + else: + if f: + for l in f: + if re.search(p, l): + r = myunichr(int(l.split(';')[0], 16)) + linecache[r] = l + f.close() + + +def valfromcp(n, cp=None): + "if fromcp is defined, then the 'n' is considered to be from that codepage and is converted accordingly" + if cp: + xh = '%x' %n + if len(xh) % 2: # pad hexadecimal representation with a zero + xh = '0'+xh + cps = ( [xh[i:i+2] for i in range(0,len(xh),2)] ) + cps = ( chr(int(i, 16)) for i in cps) + cps = ''.join(cps) + """ + if 0 <= n <= 255: + s = chr(n) + elif 256 <= n <= 65535: + s = struct.pack('>H', n) + elif 65536 <= n <= sys.maxint: + s = struct.pack('>H', n) + else: # bad character code, either negative or too big + raise ValueError("Bad character code %s" %n) + print 'ee',`s` + n = unicode(s, cp) + """ + s = unicode(cps, cp) + ns = [ord(x) for x in s] + return ns + else: + return [n] + +def myunichr(n): + try: + r = unichr(n) + return r + except OverflowError: + traceback.print_exc() + error("The codepoint is too big - it does not fit into an int.") + except ValueError: + traceback.print_exc() + err = "The codepoint is too big." + if sys.maxunicode <= 0xffff: + err += "\nPerhaps your python interpreter is not compiled with wide unicode characters." + error(err) + + +def guesstype(arg): + if not arg: # empty string + return 'empty string', arg + elif not is_ascii(arg): + return 'string', arg + elif arg[:2]=='U+' or arg[:2]=='u+': # it is hexadecimal number + try: + val = int(arg[2:], 16) + if val>sys.maxunicode: + return 'regexp', arg + else: + return 'hexadecimal', arg[2:] + except ValueError: + return 'regexp', arg + elif arg[0] in "Uu" and len(arg)>4: + try: + val = int(arg[1:], 16) + if val>sys.maxunicode: + return 'regexp', arg + else: + return 'hexadecimal', arg + except ValueError: + return 'regexp', arg + elif len(arg)>=4: + if len(arg) in (8, 16, 24, 32): + if all(x in '01' for x in arg): + val = int(arg, 2) + if val<=sys.maxunicode: + return 'binary', arg + try: + val = int(arg, 16) + if val>sys.maxunicode: + return 'regexp', arg + else: + return 'hexadecimal', arg + except ValueError: + return 'regexp', arg + else: + return 'string', arg + +def process(arglist, t, fromcp=None): + # build a list of values, so that we can combine queries like + # LATIN ALPHA and search for LATIN.*ALPHA and not names that + # contain either LATIN or ALPHA + result = [] + names_query = [] # reserved for queries in names - i.e. -r + for arg_i in arglist: + if t==None: + tp, arg = guesstype(arg_i) + if tp == 'regexp': + # if the first argument is guessed to be a regexp, add + # all the following arguments to the regular expression - + # this is probably what you wanted, e.g. 
+
+def process(arglist, t, fromcp=None):
+    # build a list of values, so that we can combine queries like
+    # LATIN ALPHA and search for LATIN.*ALPHA instead of names that
+    # contain either LATIN or ALPHA
+    result = []
+    names_query = [] # reserved for queries in names - i.e. -r
+    numeric_bases = {'hexadecimal': 16, 'decimal': 10, 'octal': 8, 'binary': 2}
+    for arg_i in arglist:
+        if t == None:
+            tp, arg = guesstype(arg_i)
+            if tp == 'regexp':
+                # if the first argument is guessed to be a regexp, add
+                # all the following arguments to the regular expression -
+                # this is probably what you wanted; e.g.
+                # 'unicode cyrillic be' will now search for the 'cyrillic.*be'
+                # regular expression
+                t = 'regexp'
+        else:
+            tp, arg = t, arg_i
+        if tp in numeric_bases:
+            val = int(arg, numeric_bases[tp])
+            vals = valfromcp(val, fromcp)
+            for val in vals:
+                r = myunichr(val)
+                # fill the linecache with the character's properties
+                list(GrepInNames('%04X' % val, fillcache=True))
+                result.append(r)
+        elif tp == 'regexp':
+            names_query.append(arg)
+        elif tp == 'string':
+            try:
+                if PY3:
+                    # argv is automatically decoded into unicode in python3
+                    unirepr = arg
+                else:
+                    unirepr = unicode(arg, options.iocharset)
+            except UnicodeDecodeError:
+                error("Sequence %s is not valid in charset '%s'." % (repr(arg), options.iocharset))
+            unilist = ['%04X' % ord(x) for x in unirepr]
+            unireg = '|'.join(unilist)
+            list(GrepInNames(unireg, fillcache=True))
+            for r in unirepr:
+                result.append(r)
+        elif tp == 'empty string':
+            pass # do not do anything for an empty string
+    if names_query:
+        query = '.*'.join(names_query)
+        for r in GrepInNames(query):
+            result.append(r)
+    return result
+
+def maybe_colours(colour):
+    if use_colour:
+        return colours[colour]
+    else:
+        return ""
+
+# format key and value pairs
+def printkv(*l):
+    for i in range(0, len(l), 2):
+        if i < len(l)-2:
+            sep = '  '
+        else:
+            sep = '\n'
+        out('%s: %s%s' % (l[i], l[i+1], sep))
+
+def print_characters(clist, maxcount, query_wiki=0):
+    counter = 0
+    for c in clist:
+        counter += 1
+        if maxcount and counter > maxcount:
+            out("\nToo many characters to display, more than %s, use --max option to change it\n" % options.maxcount)
+            return
+        if query_wiki:
+            # open the character's Wikipedia page when --wikipedia was given
+            if PY3:
+                from urllib.parse import quote
+            else:
+                from urllib import quote
+            webbrowser.open('http://en.wikipedia.org/wiki/%s' % quote(c.encode('utf-8')))
+        properties = get_unicode_properties(c)
+        out(maybe_colours('bold'))
+        out('U+%04X ' % ord(c))
+        if properties['name']:
+            out(properties['name'])
+        else:
+            out(maybe_colours('default'))
+            out(" - No such unicode character name in database")
+        out(maybe_colours('default'))
+        out('\n')
+
+        ar = ["UTF-8", ' '.join([("%02x" % ord23(x)) for x in c.encode('utf-8')]),
+              "UTF-16BE", ''.join([("%02x" % ord23(x)) for x in c.encode('utf-16be')]),
+              "Decimal", "&#%s;" % ord(c)]
+        if options.addcharset:
+            try:
+                rep = ' '.join([("%02x" % ord23(x)) for x in c.encode(options.addcharset)])
+            except UnicodeError:
+                rep = "NONE"
+            ar.extend([options.addcharset, rep])
+        printkv(*ar)
+
+        if properties['combining']:
+            pc = " " + c # prepend a space so the combining mark has a base to attach to
+        else:
+            pc = c
+        out(pc)
+        uppercase = properties['uppercase']
+        lowercase = properties['lowercase']
+        if uppercase:
+            out(" (%s)" % uppercase)
+            out('\n')
+            printkv("Uppercase", 'U+%04X' % ord(properties['uppercase']))
+        elif lowercase:
+            out(" (%s)" % properties['lowercase'])
+            out('\n')
+            printkv("Lowercase", 'U+%04X' % ord(properties['lowercase']))
+        else:
+            out('\n')
+        printkv('Category', properties['category'] + " (%s)" % general_category[properties['category']])
+
+        if properties['numeric_value']:
+            printkv('Numeric value', properties['numeric_value'])
+        if properties['digit_value']:
+            printkv('Digit value', properties['digit_value'])
+
+        bidi = properties['bidi']
+        if bidi:
+            printkv('Bidi', bidi + " (%s)" % bidi_category[bidi])
+        mirrored = properties['mirrored']
+        if mirrored:
+            out('Character is mirrored\n')
+        comb = properties['combining']
+        if comb:
+            printkv('Combining', str(comb) + " (%s)" % (comb_classes.get(comb, '?')))
+        decomp = properties['decomposition']
+        if decomp:
+            printkv('Decomposition', decomp)
+        if options.verbosity > 0:
+            uhp = get_unihan_properties(c)
+            for key in uhp:
+                printkv(key, uhp[key])
+        out('\n')
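+
+# Example (illustrative): printkv() renders alternating keys and values, e.g.
+#     printkv('UTF-8', '41', 'Decimal', '&#65;')
+# prints "UTF-8: 41  Decimal: &#65;" followed by a newline, which is how the
+# per-character encoding table above is produced.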
+
+
+def print_block(block):
+    # header
+    out(" " * 10)
+    for i in range(16):
+        out(".%X " % i)
+    out('\n')
+    # body
+    for i in range(block*16, block*16+16):
+        hexi = "%X" % i
+        if len(hexi) > 3:
+            hexi = "%07X" % i
+            hexi = hexi[:4] + " " + hexi[4:]
+        else:
+            hexi = " %03X" % i
+        out(LTR + hexi + ". ")
+        for j in range(16):
+            c = unichr(i*16 + j)
+            if unicodedata.combining(c):
+                c = " " + c
+            out(c)
+            out(' ')
+        out('\n')
+    out('\n')
+
+def print_blocks(blocks):
+    for block in blocks:
+        print_block(block)
+
+def is_range(s, typ):
+    sp = s.split('..')
+    if len(sp) != 2:
+        return False
+    if not sp[1]:
+        sp[1] = sp[0]
+    elif not sp[0]:
+        sp[0] = sp[1]
+    if not sp[0]:
+        return False
+    # intentionally no fromcp here - ranges are only of unicode characters
+    low = list(process([sp[0]], typ))
+    high = list(process([sp[1]], typ))
+    if len(low) != 1 or len(high) != 1:
+        return False
+    low = ord(low[0])
+    high = ord(high[0])
+    low = low // 256
+    high = high // 256 + 1
+    return range(low, high)
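+
+# Examples (illustrative) of is_range(), which maps a 'lo..hi' argument to the
+# 256-codepoint blocks that print_blocks() should dump:
+#     is_range('0400..04FF', None) -> one block, U+0400..U+04FF (Cyrillic)
+#     is_range('0400..', None)     -> the same block; an open end defaults to the start
+#     is_range('hello', None)      -> False, not a range at all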
dest="query_wiki", + default=0, + help="Query wikipedia for the character") +parser.add_option("--list", + action="store_const", dest="list_all_encodings", + const=True, + help="List (approximately) all known encodings") + + +(options, arguments) = parser.parse_args() + +linecache = {} +do_init() + + +if options.list_all_encodings: + all_encodings = os.listdir(os.path.dirname(encodings.__file__)) + all_encodings = set([os.path.splitext(x)[0] for x in all_encodings]) + all_encodings = list(all_encodings) + all_encodings.sort() + print (textwrap.fill(' '.join(all_encodings))) + sys.exit() + +if len(arguments)==0: + parser.print_help() + sys.exit() + + +if options.use_colour.lower() in ("on", "1", "true", "yes"): + use_colour = True +elif options.use_colour.lower() in ("off", "0", "false", "no"): + use_colour = False +else: + use_colour = sys.stdout.isatty() + if sys.platform == 'win32': + use_colour = False + + +l_args = [] # list of non range arguments to process +for argum in arguments: + is_r = is_range(argum, options.type) + if is_r: + print_blocks(is_r) + else: + l_args.append(argum) + +if l_args: + unihan_fs = [] + if options.verbosity>0: + unihan_fs = get_unihan_files() # list of file names for Unihan data file(s), empty if not available + if not unihan_fs: + out( """ +Unihan_*.txt files not found. In order to view Unihan properties, +please place the file into /usr/share/unidata/, +/usr/share/unicode/, ~/.unicode/ +or current working directory (optionally you can gzip or bzip2 them). +You can get the files by unpacking ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip +Warning, listing UniHan Properties is rather slow. + +""") + options.verbosity = 0 + try: + print_characters(process(l_args, options.type, options.fromcp), options.maxcount, options.query_wiki) + except IOError: # e.g. broken pipe + pass +