# HG changeset patch
# User HackBot
# Date 1398448569 0
# Node ID b53ae6f7abbb3fbc2a7aeda333171f35e4b94688
# Parent 038fcc1f45eafef4d1ba5b47778a78775304da3b
mv bin/{un,mult}icode && (head -n -1 bin/unicode.old | sed 's/import sys/import sys, os/'; echo ' os.execvp("multicode", sys.argv[1:])') > bin/unicode && rm bin/unicode.old

diff -r 038fcc1f45ea -r b53ae6f7abbb bin/multicode
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/multicode	Fri Apr 25 17:56:09 2014 +0000
@@ -0,0 +1,815 @@
+#!/usr/bin/python
+
+
+import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings
+import urllib, webbrowser, textwrap
+
+# bz2 was introduced in 2.3, we want this to work also with earlier versions
+try:
+    import bz2
+except ImportError:
+    bz2 = None
+
+# for python3
+try:
+    unicode
+except NameError:
+    unicode = str
+
+# 'any' and 'all' were introduced in python2.5
+# dummy replacement for older versions
+try:
+    all
+except NameError:
+    all = lambda x: False
+
+PY3 = sys.version_info[0] >= 3
+if PY3:
+    import subprocess as cmd
+
+    def is_ascii(s):
+        "test whether string s consists completely of ascii characters (python 3)"
+        try:
+            s.encode('ascii')
+        except UnicodeEncodeError:
+            return False
+        return True
+
+    def out(*args):
+        "print args, converting them to the output charset"
+        for i in args:
+            sys.stdout.flush()
+            sys.stdout.buffer.write(i.encode(options.iocharset, 'replace'))
+
+    # ord23 is used to convert elements of a byte array in python3, which are integers
+    ord23 = lambda x: x
+
+    # unichr is not in python3
+    unichr = chr
+
+else: # python2
+
+    # getoutput() and getstatusoutput() methods have
+    # been moved from commands to the subprocess module
+    # with Python >= 3.x
+    import commands as cmd
+
+    def is_ascii(s):
+        "test whether string s consists completely of ascii characters (python 2)"
+        try:
+            unicode(s, 'ascii')
+        except UnicodeDecodeError:
+            return False
+        return True
+
+    def out(*args):
+        "print args, converting them to the output charset"
+        for i in args:
+            sys.stdout.write(i.encode(options.iocharset, 'replace'))
+
+    ord23 = ord
+
+
+from optparse import OptionParser
+
+VERSION='0.9.7'
+
+
+# list of terminals that support bidi
+biditerms = ['mlterm']
+
+try:
+    locale.setlocale(locale.LC_ALL, '')
+except locale.Error:
+    pass
+
+# guess terminal charset
+try:
+    iocharsetguess = locale.nl_langinfo(locale.CODESET) or "ascii"
+except locale.Error:
+    iocharsetguess = "ascii"
+
+if os.environ.get('TERM') in biditerms and iocharsetguess.lower().startswith('utf'):
+    LTR = u'\u202d' # left to right override
+else:
+    LTR = ''
+
+
+colours = {
+    'none' : "",
+    'default' : "\033[0m",
+    'bold' : "\033[1m",
+    'underline' : "\033[4m",
+    'blink' : "\033[5m",
+    'reverse' : "\033[7m",
+    'concealed' : "\033[8m",
+
+    'black' : "\033[30m",
+    'red' : "\033[31m",
+    'green' : "\033[32m",
+    'yellow' : "\033[33m",
+    'blue' : "\033[34m",
+    'magenta' : "\033[35m",
+    'cyan' : "\033[36m",
+    'white' : "\033[37m",
+
+    'on_black' : "\033[40m",
+    'on_red' : "\033[41m",
+    'on_green' : "\033[42m",
+    'on_yellow' : "\033[43m",
+    'on_blue' : "\033[44m",
+    'on_magenta' : "\033[45m",
+    'on_cyan' : "\033[46m",
+    'on_white' : "\033[47m",
+
+    'beep' : "\007",
+    }
+
+
+general_category = {
+    'Lu': 'Letter, Uppercase',
+    'Ll': 'Letter, Lowercase',
+    'Lt': 'Letter, Titlecase',
+    'Lm': 'Letter, Modifier',
+    'Lo': 'Letter, Other',
+    'Mn': 'Mark, Non-Spacing',
+    'Mc': 'Mark, Spacing Combining',
+    'Me': 'Mark, Enclosing',
+    'Nd': 'Number, Decimal Digit',
+    'Nl': 'Number, Letter',
+    'No': 'Number, Other',
+    'Pc': 'Punctuation, Connector',
+    'Pd': 'Punctuation, Dash',
+    'Ps': 'Punctuation, Open',
+    'Pe': 'Punctuation, Close',
+    'Pi': 'Punctuation, Initial quote',
+    'Pf': 'Punctuation, Final quote',
+    'Po': 'Punctuation, Other',
+    'Sm': 'Symbol, Math',
+    'Sc': 'Symbol, Currency',
+    'Sk': 'Symbol, Modifier',
+    'So': 'Symbol, Other',
+    'Zs': 'Separator, Space',
+    'Zl': 'Separator, Line',
+    'Zp': 'Separator, Paragraph',
+    'Cc': 'Other, Control',
+    'Cf': 'Other, Format',
+    'Cs': 'Other, Surrogate',
+    'Co': 'Other, Private Use',
+    'Cn': 'Other, Not Assigned',
+}
+
+bidi_category = {
+    'L' : 'Left-to-Right',
+    'LRE' : 'Left-to-Right Embedding',
+    'LRO' : 'Left-to-Right Override',
+    'R' : 'Right-to-Left',
+    'AL' : 'Right-to-Left Arabic',
+    'RLE' : 'Right-to-Left Embedding',
+    'RLO' : 'Right-to-Left Override',
+    'PDF' : 'Pop Directional Format',
+    'EN' : 'European Number',
+    'ES' : 'European Number Separator',
+    'ET' : 'European Number Terminator',
+    'AN' : 'Arabic Number',
+    'CS' : 'Common Number Separator',
+    'NSM' : 'Non-Spacing Mark',
+    'BN' : 'Boundary Neutral',
+    'B' : 'Paragraph Separator',
+    'S' : 'Segment Separator',
+    'WS' : 'Whitespace',
+    'ON' : 'Other Neutrals',
+}
+
+comb_classes = {
+    0: 'Spacing, split, enclosing, reordrant, and Tibetan subjoined',
+    1: 'Overlays and interior',
+    7: 'Nuktas',
+    8: 'Hiragana/Katakana voicing marks',
+    9: 'Viramas',
+    10: 'Start of fixed position classes',
+    199: 'End of fixed position classes',
+    200: 'Below left attached',
+    202: 'Below attached',
+    204: 'Below right attached',
+    208: 'Left attached (reordrant around single base character)',
+    210: 'Right attached',
+    212: 'Above left attached',
+    214: 'Above attached',
+    216: 'Above right attached',
+    218: 'Below left',
+    220: 'Below',
+    222: 'Below right',
+    224: 'Left (reordrant around single base character)',
+    226: 'Right',
+    228: 'Above left',
+    230: 'Above',
+    232: 'Above right',
+    233: 'Double below',
+    234: 'Double above',
+    240: 'Below (iota subscript)',
+}
+
+
+
+def get_unicode_properties(ch):
+    properties = {}
+    if ch in linecache:
+        fields = linecache[ch].strip().split(';')
+        proplist = ['codepoint', 'name', 'category', 'combining', 'bidi', 'decomposition', 'dummy', 'digit_value', 'numeric_value', 'mirrored', 'unicode1name', 'iso_comment', 'uppercase', 'lowercase', 'titlecase']
+        for i, prop in enumerate(proplist):
+            if prop!='dummy':
+                properties[prop] = fields[i]
+
+        if properties['lowercase']:
+            properties['lowercase'] = unichr(int(properties['lowercase'], 16))
+        if properties['uppercase']:
+            properties['uppercase'] = unichr(int(properties['uppercase'], 16))
+        if properties['titlecase']:
+            properties['titlecase'] = unichr(int(properties['titlecase'], 16))
+
+        properties['combining'] = int(properties['combining'])
+        properties['mirrored'] = properties['mirrored']=='Y'
+    else:
+        properties['codepoint'] = '%04X' % ord(ch)
+        properties['name'] = unicodedata.name(ch, '')
+        properties['category'] = unicodedata.category(ch)
+        properties['combining'] = unicodedata.combining(ch)
+        properties['bidi'] = unicodedata.bidirectional(ch)
+        properties['decomposition'] = unicodedata.decomposition(ch)
+        properties['digit_value'] = unicodedata.digit(ch, '')
+        properties['numeric_value'] = unicodedata.numeric(ch, '')
+        properties['mirrored'] = unicodedata.mirrored(ch)
+        properties['unicode1name'] = ''
+        properties['iso_comment'] = ''
+        properties['uppercase'] = ch.upper()
+        properties['lowercase'] = ch.lower()
+        properties['titlecase'] = ''
+    return properties
+
+
+def do_init():
+    HomeDir = os.path.expanduser('~/.unicode')
+    HomeUnicodeData = os.path.join(HomeDir, "UnicodeData.txt")
+    global UnicodeDataFileNames
+    UnicodeDataFileNames = [HomeUnicodeData, '/usr/share/unicode/UnicodeData.txt', '/usr/share/unidata/UnicodeData.txt', '/hackenv/bin/UnicodeData.txt'] + \
+        glob.glob('/usr/share/unidata/UnicodeData*.txt') + \
+        glob.glob('/usr/share/perl/*/unicore/UnicodeData.txt') + \
+        glob.glob('/System/Library/Perl/*/unicore/UnicodeData.txt') # for MacOSX
+
+    HomeUnihanData = os.path.join(HomeDir, "Unihan*")
+    global UnihanDataGlobs
+    UnihanDataGlobs = [HomeUnihanData, '/usr/share/unidata/Unihan*', '/usr/share/unicode/Unihan*', './Unihan*']
+
+
+def get_unihan_files():
+    fos = [] # list of file names for Unihan data file(s)
+    for gl in UnihanDataGlobs:
+        fnames = glob.glob(gl)
+        fos += fnames
+    return fos
+
+def get_unihan_properties_internal(ch):
+    properties = {}
+    ch = ord(ch)
+    global unihan_fs
+    for f in unihan_fs:
+        fo = OpenGzip(f)
+        for l in fo:
+            if l.startswith('#'):
+                continue
+            line = l.strip()
+            if not line:
+                continue
+            char, key, value = line.strip().split('\t')
+            if int(char[2:], 16) == ch:
+                properties[key] = unicode(value, 'utf-8')
+            elif int(char[2:], 16)>ch:
+                break
+    return properties
+
+def get_unihan_properties_zgrep(ch):
+    properties = {}
+    global unihan_fs
+    ch = ord(ch)
+    chs = 'U+%X' % ch
+    for f in unihan_fs:
+        if f.endswith('.gz'):
+            grepcmd = 'zgrep'
+        elif f.endswith('.bz2'):
+            grepcmd = 'bzgrep'
+        else:
+            grepcmd = 'grep'
+        cmdline = grepcmd+' ^'+chs+r'\\b '+f
+        status, output = cmd.getstatusoutput(cmdline)
+        output = output.split('\n')
+        for l in output:
+            if not l:
+                continue
+            char, key, value = l.strip().split('\t')
+            if int(char[2:], 16) == ch:
+                if PY3:
+                    properties[key] = value
+                else:
+                    properties[key] = unicode(value, 'utf-8')
+            elif int(char[2:], 16)>ch:
+                break
+    return properties
+
+# basic sanity check, if e.g. you run this on MS Windows...
+if os.path.exists('/bin/grep'):
+    get_unihan_properties = get_unihan_properties_zgrep
+else:
+    get_unihan_properties = get_unihan_properties_internal
+
+
+def error(txt):
+    out(txt)
+    out('\n')
+    sys.exit(1)
+
+def get_gzip_filename(fname):
+    "return fname if it exists; otherwise try fname+'.gz', then fname+'.bz2'; return None if none of them exists"
+    if os.path.exists(fname):
+        return fname
+    if os.path.exists(fname+'.gz'):
+        return fname+'.gz'
+    if os.path.exists(fname+'.bz2') and bz2 is not None:
+        return fname+'.bz2'
+    return None
+
+
+def OpenGzip(fname):
+    "open fname, trying fname.gz or fname.bz2 if fname does not exist; return a plain file, GzipFile or BZ2File object"
+    if os.path.exists(fname) and not (fname.endswith('.gz') or fname.endswith('.bz2')):
+        return open(fname)
+    if os.path.exists(fname+'.gz'):
+        fname = fname+'.gz'
+    elif os.path.exists(fname+'.bz2') and bz2 is not None:
+        fname = fname+'.bz2'
+    if fname.endswith('.gz'):
+        return gzip.GzipFile(fname)
+    elif fname.endswith('.bz2'):
+        return bz2.BZ2File(fname)
+    return None
+
+def GrepInNames(pattern, fillcache=False):
+    p = re.compile(pattern, re.I)
+    f = None
+    for name in UnicodeDataFileNames:
+        f = OpenGzip(name)
+        if f != None:
+            break
+    if not fillcache:
+        if not f:
+            out( """
+Cannot find UnicodeData.txt, please place it into
+/usr/share/unidata/UnicodeData.txt,
+/usr/share/unicode/UnicodeData.txt, ~/.unicode/ or the current
+working directory (optionally you can gzip it).
+Without the file, searching will be much slower.
+ +""" ) + for i in xrange(sys.maxunicode): + try: + name = unicodedata.name(unichr(i)) + if re.search(p, name): + yield myunichr(i) + except ValueError: + pass + else: + for l in f: + if re.search(p, l): + r = myunichr(int(l.split(';')[0], 16)) + linecache[r] = l + yield r + f.close() + else: + if f: + for l in f: + if re.search(p, l): + r = myunichr(int(l.split(';')[0], 16)) + linecache[r] = l + f.close() + + +def valfromcp(n, cp=None): + "if fromcp is defined, then the 'n' is considered to be from that codepage and is converted accordingly" + if cp: + xh = '%x' %n + if len(xh) % 2: # pad hexadecimal representation with a zero + xh = '0'+xh + cps = ( [xh[i:i+2] for i in range(0,len(xh),2)] ) + cps = ( chr(int(i, 16)) for i in cps) + cps = ''.join(cps) + """ + if 0 <= n <= 255: + s = chr(n) + elif 256 <= n <= 65535: + s = struct.pack('>H', n) + elif 65536 <= n <= sys.maxint: + s = struct.pack('>H', n) + else: # bad character code, either negative or too big + raise ValueError("Bad character code %s" %n) + print 'ee',`s` + n = unicode(s, cp) + """ + s = unicode(cps, cp) + ns = [ord(x) for x in s] + return ns + else: + return [n] + +def myunichr(n): + try: + r = unichr(n) + return r + except OverflowError: + traceback.print_exc() + error("The codepoint is too big - it does not fit into an int.") + except ValueError: + traceback.print_exc() + err = "The codepoint is too big." + if sys.maxunicode <= 0xffff: + err += "\nPerhaps your python interpreter is not compiled with wide unicode characters." + error(err) + + +def guesstype(arg): + if not arg: # empty string + return 'empty string', arg + elif not is_ascii(arg): + return 'string', arg + elif arg[:2]=='U+' or arg[:2]=='u+': # it is hexadecimal number + try: + val = int(arg[2:], 16) + if val>sys.maxunicode: + return 'regexp', arg + else: + return 'hexadecimal', arg[2:] + except ValueError: + return 'regexp', arg + elif arg[0] in "Uu" and len(arg)>4: + try: + val = int(arg[1:], 16) + if val>sys.maxunicode: + return 'regexp', arg + else: + return 'hexadecimal', arg + except ValueError: + return 'regexp', arg + elif len(arg)>=4: + if len(arg) in (8, 16, 24, 32): + if all(x in '01' for x in arg): + val = int(arg, 2) + if val<=sys.maxunicode: + return 'binary', arg + try: + val = int(arg, 16) + if val>sys.maxunicode: + return 'regexp', arg + else: + return 'hexadecimal', arg + except ValueError: + return 'regexp', arg + else: + return 'string', arg + +def process(arglist, t, fromcp=None): + # build a list of values, so that we can combine queries like + # LATIN ALPHA and search for LATIN.*ALPHA and not names that + # contain either LATIN or ALPHA + result = [] + names_query = [] # reserved for queries in names - i.e. -r + for arg_i in arglist: + if t==None: + tp, arg = guesstype(arg_i) + if tp == 'regexp': + # if the first argument is guessed to be a regexp, add + # all the following arguments to the regular expression - + # this is probably what you wanted, e.g. 
+                # 'unicode cyrillic be' will now search for the 'cyrillic.*be' regular expression
+                t = 'regexp'
+        else:
+            tp, arg = t, arg_i
+        if tp=='hexadecimal':
+            val = int(arg, 16)
+            vals = valfromcp(val, fromcp)
+            for val in vals:
+                r = myunichr(val)
+                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
+                result.append(r)
+        elif tp=='decimal':
+            val = int(arg, 10)
+            vals = valfromcp(val, fromcp)
+            for val in vals:
+                r = myunichr(val)
+                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
+                result.append(r)
+        elif tp=='octal':
+            val = int(arg, 8)
+            vals = valfromcp(val, fromcp)
+            for val in vals:
+                r = myunichr(val)
+                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
+                result.append(r)
+        elif tp=='binary':
+            val = int(arg, 2)
+            vals = valfromcp(val, fromcp)
+            for val in vals:
+                r = myunichr(val)
+                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
+                result.append(r)
+        elif tp=='regexp':
+            names_query.append(arg)
+        elif tp=='string':
+            try:
+                if PY3: # argv is automatically decoded into unicode, even padded with a bogus character if it is not encodable
+                    unirepr = arg
+                else:
+                    unirepr = unicode(arg, options.iocharset)
+            except UnicodeDecodeError:
+                error("Sequence %s is not valid in charset '%s'." % (repr(arg), options.iocharset))
+            unilist = ['%04X'%ord(x) for x in unirepr]
+            unireg = '|'.join(unilist)
+            list(GrepInNames(unireg, fillcache=True))
+            for r in unirepr:
+                result.append(r)
+        elif tp=='empty string':
+            pass # do not do anything for an empty string
+    if names_query:
+        query = '.*'.join(names_query)
+        for r in GrepInNames(query):
+            result.append(r)
+    return result
+
+def maybe_colours(colour):
+    if use_colour:
+        return colours[colour]
+    else:
+        return ""
+
+# format key and value
+def printkv(*l):
+    for i in range(0, len(l), 2):
+        if i < len(l)-2:
+            sep = "  "
+        else:
+            sep = "\n"
+        out(l[i], ": ", unicode(l[i+1]), sep)
+
+
+def print_characters(clist, maxcount, query_wiki=0):
+    counter = 0
+    for c in clist:
+        if query_wiki:
+            webbrowser.open('http://en.wikipedia.org/wiki/%s' % urllib.quote(c.encode('utf-8')))
+        counter += 1
+        if maxcount and counter > options.maxcount:
+            out("\nToo many characters to display, more than %s, use --max option to change it\n" % options.maxcount)
+            return
+        properties = get_unicode_properties(c)
+        out(maybe_colours('bold'))
+        out('U+%04X '% ord(c))
+        if properties['name']:
+            out(properties['name'])
+        else:
+            out(maybe_colours('default'))
+            out(" - No such unicode character name in database")
+        out(maybe_colours('default'))
+        out('\n')
+
+        ar = ["UTF-8", ' '.join([("%02x" % ord23(x)) for x in c.encode('utf-8')]),
+            "UTF-16BE", ''.join([("%02x" % ord23(x)) for x in c.encode('utf-16be')]),
+            "Decimal", "&#%s;" % ord(c) ]
+        if options.addcharset:
+            try:
+                rep = ' '.join([("%02x" % ord(x)) for x in c.encode(options.addcharset)] )
+            except UnicodeError:
+                rep = "NONE"
+            ar.extend( [options.addcharset, rep] )
+        printkv(*ar)
+
+
+        if properties['combining']:
+            pc = " "+c
+        else:
+            pc = c
+        out(pc)
+        uppercase = properties['uppercase']
+        lowercase = properties['lowercase']
+        if uppercase:
+            out(" (%s)" % uppercase)
+            out('\n')
+            printkv( "Uppercase", 'U+%04X'% ord(properties['uppercase']) )
+        elif lowercase:
+            out(" (%s)" % properties['lowercase'])
+            out('\n')
+            printkv( "Lowercase", 'U+%04X'% ord(properties['lowercase']) )
+        else:
+            out('\n')
+        printkv( 'Category', properties['category']+ " (%s)" % general_category[properties['category']] )
+
+        if properties['numeric_value']:
+            printkv( 'Numeric value', properties['numeric_value'])
+        if properties['digit_value']:
+            printkv( 'Digit value', properties['digit_value'])
+
+        bidi = properties['bidi']
+        if bidi:
+            printkv( 'Bidi', bidi+ " (%s)" % bidi_category[bidi] )
+        mirrored = properties['mirrored']
+        if mirrored:
+            out('Character is mirrored\n')
+        comb = properties['combining']
+        if comb:
+            printkv( 'Combining', str(comb)+ " (%s)" % (comb_classes.get(comb, '?')) )
+        decomp = properties['decomposition']
+        if decomp:
+            printkv( 'Decomposition', decomp )
+        if options.verbosity>0:
+            uhp = get_unihan_properties(c)
+            for key in uhp:
+                printkv(key, uhp[key])
+        out('\n')
+
+
+def print_block(block):
+    #header
+    out(" "*10)
+    for i in range(16):
+        out(".%X " % i)
+    out('\n')
+    #body
+    for i in range(block*16, block*16+16):
+        hexi = "%X" % i
+        if len(hexi)>3:
+            hexi = "%07X" % i
+            hexi = hexi[:4]+" "+hexi[4:]
+        else:
+            hexi = " %03X" % i
+        out(LTR+hexi+". ")
+        for j in range(16):
+            c = unichr(i*16+j)
+            if unicodedata.combining(c):
+                c = " "+c
+            out(c)
+            out(' ')
+        out('\n')
+    out('\n')
+
+def print_blocks(blocks):
+    for block in blocks:
+        print_block(block)
+
+def is_range(s, typ):
+    sp = s.split('..')
+    if len(sp)!=2:
+        return False
+    if not sp[1]:
+        sp[1] = sp[0]
+    elif not sp[0]:
+        sp[0] = sp[1]
+    if not sp[0]:
+        return False
+    low = list(process([sp[0]], typ)) # intentionally no fromcp here, ranges are only of unicode characters
+    high = list(process([sp[1]], typ))
+    if len(low)!=1 or len(high)!=1:
+        return False
+    low = ord(low[0])
+    high = ord(high[0])
+    low = low // 256
+    high = high // 256 + 1
+    return range(low, high)
+
+
+
+parser = OptionParser(usage="usage: %prog [options] arg")
+parser.add_option("-x", "--hexadecimal",
+        action="store_const", const='hexadecimal', dest="type",
+        help="Assume arg to be a hexadecimal number")
+parser.add_option("-o", "--octal",
+        action="store_const", const='octal', dest="type",
+        help="Assume arg to be an octal number")
+parser.add_option("-b", "--binary",
+        action="store_const", const='binary', dest="type",
+        help="Assume arg to be a binary number")
+parser.add_option("-d", "--decimal",
+        action="store_const", const='decimal', dest="type",
+        help="Assume arg to be a decimal number")
+parser.add_option("-r", "--regexp",
+        action="store_const", const='regexp', dest="type",
+        help="Assume arg to be a regular expression")
+parser.add_option("-s", "--string",
+        action="store_const", const='string', dest="type",
+        help="Assume arg to be a sequence of characters")
+parser.add_option("-a", "--auto",
+        action="store_const", const=None, dest="type",
+        help="Try to guess arg type (default)")
+parser.add_option("-m", "--max",
+        action="store", default=10, dest="maxcount", type="int",
+        help="Maximum number of codepoints to display, default: 10; 0=unlimited")
+parser.add_option("-i", "--io",
+        action="store", default=iocharsetguess, dest="iocharset", type="string",
+        help="I/O character set, I am guessing %s" % iocharsetguess)
+parser.add_option("--fcp", "--fromcp",
+        action="store", default='', dest="fromcp", type="string",
+        help="Convert numerical arguments from this encoding, default: no conversion")
+parser.add_option("-c", "--charset-add",
+        action="store", dest="addcharset", type="string",
+        help="Show hexadecimal representation in this additional charset")
+parser.add_option("-C", "--colour",
+        action="store", dest="use_colour", type="string",
+        default="auto",
+        help="Use colours: on, off or auto")
+parser.add_option('', "--color",
+        action="store", dest="use_colour", type="string",
+        default="auto",
+        help="Synonym for --colour")
+parser.add_option("-v", "--verbose",
+        action="count", dest="verbosity",
+        default=0,
+        help="Increase verbosity (reads Unihan properties - slow!)")
+parser.add_option("-w", "--wikipedia",
+        action="count", dest="query_wiki",
dest="query_wiki", + default=0, + help="Query wikipedia for the character") +parser.add_option("--list", + action="store_const", dest="list_all_encodings", + const=True, + help="List (approximately) all known encodings") + + +(options, arguments) = parser.parse_args() + +linecache = {} +do_init() + + +if options.list_all_encodings: + all_encodings = os.listdir(os.path.dirname(encodings.__file__)) + all_encodings = set([os.path.splitext(x)[0] for x in all_encodings]) + all_encodings = list(all_encodings) + all_encodings.sort() + print (textwrap.fill(' '.join(all_encodings))) + sys.exit() + +if len(arguments)==0: + parser.print_help() + sys.exit() + + +if options.use_colour.lower() in ("on", "1", "true", "yes"): + use_colour = True +elif options.use_colour.lower() in ("off", "0", "false", "no"): + use_colour = False +else: + use_colour = sys.stdout.isatty() + if sys.platform == 'win32': + use_colour = False + + +l_args = [] # list of non range arguments to process +for argum in arguments: + is_r = is_range(argum, options.type) + if is_r: + print_blocks(is_r) + else: + l_args.append(argum) + +if l_args: + unihan_fs = [] + if options.verbosity>0: + unihan_fs = get_unihan_files() # list of file names for Unihan data file(s), empty if not available + if not unihan_fs: + out( """ +Unihan_*.txt files not found. In order to view Unihan properties, +please place the file into /usr/share/unidata/, +/usr/share/unicode/, ~/.unicode/ +or current working directory (optionally you can gzip or bzip2 them). +You can get the files by unpacking ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip +Warning, listing UniHan Properties is rather slow. + +""") + options.verbosity = 0 + try: + print_characters(process(l_args, options.type, options.fromcp), options.maxcount, options.query_wiki) + except IOError: # e.g. 
+        pass
+
diff -r 038fcc1f45ea -r b53ae6f7abbb bin/unicode
--- a/bin/unicode	Fri Apr 25 03:31:28 2014 +0000
+++ b/bin/unicode	Fri Apr 25 17:56:09 2014 +0000
@@ -1,815 +1,10 @@
-#!/usr/bin/python
-
-
-import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings
-import urllib, webbrowser, textwrap
-
-# bz2 was introduced in 2.3, we want this to work also with earlier versions
-try:
-    import bz2
-except ImportError:
-    bz2 = None
-
-# for python3
-try:
-    unicode
-except NameError:
-    unicode = str
-
-# 'any' and 'all' were introduced in python2.5
-# dummy replacement for older versions
-try:
-    all
-except NameError:
-    all = lambda x: False
-
-PY3 = sys.version_info[0] >= 3
-if PY3:
-    import subprocess as cmd
-
-    def is_ascii(s):
-        "test whether string s consists completely of ascii characters (python 3)"
-        try:
-            s.encode('ascii')
-        except UnicodeEncodeError:
-            return False
-        return True
-
-    def out(*args):
-        "print args, converting them to the output charset"
-        for i in args:
-            sys.stdout.flush()
-            sys.stdout.buffer.write(i.encode(options.iocharset, 'replace'))
-
-    # ord23 is used to convert elements of a byte array in python3, which are integers
-    ord23 = lambda x: x
-
-    # unichr is not in python3
-    unichr = chr
-
-else: # python2
-
-    # getoutput() and getstatusoutput() methods have
-    # been moved from commands to the subprocess module
-    # with Python >= 3.x
-    import commands as cmd
-
-    def is_ascii(s):
-        "test whether string s consists completely of ascii characters (python 2)"
-        try:
-            unicode(s, 'ascii')
-        except UnicodeDecodeError:
-            return False
-        return True
-
-    def out(*args):
-        "print args, converting them to the output charset"
-        for i in args:
-            sys.stdout.write(i.encode(options.iocharset, 'replace'))
-
-    ord23 = ord
-
-
-from optparse import OptionParser
-
-VERSION='0.9.7'
-
-
-# list of terminals that support bidi
-biditerms = ['mlterm']
-
-try:
-    locale.setlocale(locale.LC_ALL, '')
-except locale.Error:
-    pass
-
-# guess terminal charset
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+import re
+import sys, os
+import unicodedata
+def l(c): m = re.match('(?:U[+])?([0-9a-f]{1,5})$', c, re.I); return unicodedata.lookup(c) if m is None else unichr(int(m.group(1),16))
 try:
-    iocharsetguess = locale.nl_langinfo(locale.CODESET) or "ascii"
-except locale.Error:
-    iocharsetguess = "ascii"
-
-if os.environ.get('TERM') in biditerms and iocharsetguess.lower().startswith('utf'):
-    LTR = u'\u202d' # left to right override
-else:
-    LTR = ''
-
-
-colours = {
-    'none' : "",
-    'default' : "\033[0m",
-    'bold' : "\033[1m",
-    'underline' : "\033[4m",
-    'blink' : "\033[5m",
-    'reverse' : "\033[7m",
-    'concealed' : "\033[8m",
-
-    'black' : "\033[30m",
-    'red' : "\033[31m",
-    'green' : "\033[32m",
-    'yellow' : "\033[33m",
-    'blue' : "\033[34m",
-    'magenta' : "\033[35m",
-    'cyan' : "\033[36m",
-    'white' : "\033[37m",
-
-    'on_black' : "\033[40m",
-    'on_red' : "\033[41m",
-    'on_green' : "\033[42m",
-    'on_yellow' : "\033[43m",
-    'on_blue' : "\033[44m",
-    'on_magenta' : "\033[45m",
-    'on_cyan' : "\033[46m",
-    'on_white' : "\033[47m",
-
-    'beep' : "\007",
-    }
-
-
-general_category = {
-    'Lu': 'Letter, Uppercase',
-    'Ll': 'Letter, Lowercase',
-    'Lt': 'Letter, Titlecase',
-    'Lm': 'Letter, Modifier',
-    'Lo': 'Letter, Other',
-    'Mn': 'Mark, Non-Spacing',
-    'Mc': 'Mark, Spacing Combining',
-    'Me': 'Mark, Enclosing',
-    'Nd': 'Number, Decimal Digit',
-    'Nl': 'Number, Letter',
-    'No': 'Number, Other',
-    'Pc': 'Punctuation, Connector',
-    'Pd': 'Punctuation, Dash',
-    'Ps': 'Punctuation, Open',
-    'Pe': 'Punctuation, Close',
-    'Pi': 'Punctuation, Initial quote',
-    'Pf': 'Punctuation, Final quote',
-    'Po': 'Punctuation, Other',
-    'Sm': 'Symbol, Math',
-    'Sc': 'Symbol, Currency',
-    'Sk': 'Symbol, Modifier',
-    'So': 'Symbol, Other',
-    'Zs': 'Separator, Space',
-    'Zl': 'Separator, Line',
-    'Zp': 'Separator, Paragraph',
-    'Cc': 'Other, Control',
-    'Cf': 'Other, Format',
-    'Cs': 'Other, Surrogate',
-    'Co': 'Other, Private Use',
-    'Cn': 'Other, Not Assigned',
-}
-
-bidi_category = {
-    'L' : 'Left-to-Right',
-    'LRE' : 'Left-to-Right Embedding',
-    'LRO' : 'Left-to-Right Override',
-    'R' : 'Right-to-Left',
-    'AL' : 'Right-to-Left Arabic',
-    'RLE' : 'Right-to-Left Embedding',
-    'RLO' : 'Right-to-Left Override',
-    'PDF' : 'Pop Directional Format',
-    'EN' : 'European Number',
-    'ES' : 'European Number Separator',
-    'ET' : 'European Number Terminator',
-    'AN' : 'Arabic Number',
-    'CS' : 'Common Number Separator',
-    'NSM' : 'Non-Spacing Mark',
-    'BN' : 'Boundary Neutral',
-    'B' : 'Paragraph Separator',
-    'S' : 'Segment Separator',
-    'WS' : 'Whitespace',
-    'ON' : 'Other Neutrals',
-}
-
-comb_classes = {
-    0: 'Spacing, split, enclosing, reordrant, and Tibetan subjoined',
-    1: 'Overlays and interior',
-    7: 'Nuktas',
-    8: 'Hiragana/Katakana voicing marks',
-    9: 'Viramas',
-    10: 'Start of fixed position classes',
-    199: 'End of fixed position classes',
-    200: 'Below left attached',
-    202: 'Below attached',
-    204: 'Below right attached',
-    208: 'Left attached (reordrant around single base character)',
-    210: 'Right attached',
-    212: 'Above left attached',
-    214: 'Above attached',
-    216: 'Above right attached',
-    218: 'Below left',
-    220: 'Below',
-    222: 'Below right',
-    224: 'Left (reordrant around single base character)',
-    226: 'Right',
-    228: 'Above left',
-    230: 'Above',
-    232: 'Above right',
-    233: 'Double below',
-    234: 'Double above',
-    240: 'Below (iota subscript)',
-}
-
-
-
-def get_unicode_properties(ch):
-    properties = {}
-    if ch in linecache:
-        fields = linecache[ch].strip().split(';')
-        proplist = ['codepoint', 'name', 'category', 'combining', 'bidi', 'decomposition', 'dummy', 'digit_value', 'numeric_value', 'mirrored', 'unicode1name', 'iso_comment', 'uppercase', 'lowercase', 'titlecase']
-        for i, prop in enumerate(proplist):
-            if prop!='dummy':
-                properties[prop] = fields[i]
-
-        if properties['lowercase']:
-            properties['lowercase'] = unichr(int(properties['lowercase'], 16))
-        if properties['uppercase']:
-            properties['uppercase'] = unichr(int(properties['uppercase'], 16))
-        if properties['titlecase']:
-            properties['titlecase'] = unichr(int(properties['titlecase'], 16))
-
-        properties['combining'] = int(properties['combining'])
-        properties['mirrored'] = properties['mirrored']=='Y'
-    else:
-        properties['codepoint'] = '%04X' % ord(ch)
-        properties['name'] = unicodedata.name(ch, '')
-        properties['category'] = unicodedata.category(ch)
-        properties['combining'] = unicodedata.combining(ch)
-        properties['bidi'] = unicodedata.bidirectional(ch)
-        properties['decomposition'] = unicodedata.decomposition(ch)
-        properties['digit_value'] = unicodedata.digit(ch, '')
-        properties['numeric_value'] = unicodedata.numeric(ch, '')
-        properties['mirrored'] = unicodedata.mirrored(ch)
-        properties['unicode1name'] = ''
-        properties['iso_comment'] = ''
-        properties['uppercase'] = ch.upper()
-        properties['lowercase'] = ch.lower()
-        properties['titlecase'] = ''
-    return properties
-
-
-def do_init():
-    HomeDir = os.path.expanduser('~/.unicode')
-    HomeUnicodeData = os.path.join(HomeDir, "UnicodeData.txt")
-    global UnicodeDataFileNames
-    UnicodeDataFileNames = [HomeUnicodeData, '/usr/share/unicode/UnicodeData.txt', '/usr/share/unidata/UnicodeData.txt', '/hackenv/bin/UnicodeData.txt'] + \
-        glob.glob('/usr/share/unidata/UnicodeData*.txt') + \
-        glob.glob('/usr/share/perl/*/unicore/UnicodeData.txt') + \
-        glob.glob('/System/Library/Perl/*/unicore/UnicodeData.txt') # for MacOSX
-
-    HomeUnihanData = os.path.join(HomeDir, "Unihan*")
-    global UnihanDataGlobs
-    UnihanDataGlobs = [HomeUnihanData, '/usr/share/unidata/Unihan*', '/usr/share/unicode/Unihan*', './Unihan*']
-
-
-def get_unihan_files():
-    fos = [] # list of file names for Unihan data file(s)
-    for gl in UnihanDataGlobs:
-        fnames = glob.glob(gl)
-        fos += fnames
-    return fos
-
-def get_unihan_properties_internal(ch):
-    properties = {}
-    ch = ord(ch)
-    global unihan_fs
-    for f in unihan_fs:
-        fo = OpenGzip(f)
-        for l in fo:
-            if l.startswith('#'):
-                continue
-            line = l.strip()
-            if not line:
-                continue
-            char, key, value = line.strip().split('\t')
-            if int(char[2:], 16) == ch:
-                properties[key] = unicode(value, 'utf-8')
-            elif int(char[2:], 16)>ch:
-                break
-    return properties
-
-def get_unihan_properties_zgrep(ch):
-    properties = {}
-    global unihan_fs
-    ch = ord(ch)
-    chs = 'U+%X' % ch
-    for f in unihan_fs:
-        if f.endswith('.gz'):
-            grepcmd = 'zgrep'
-        elif f.endswith('.bz2'):
-            grepcmd = 'bzgrep'
-        else:
-            grepcmd = 'grep'
-        cmdline = grepcmd+' ^'+chs+r'\\b '+f
-        status, output = cmd.getstatusoutput(cmdline)
-        output = output.split('\n')
-        for l in output:
-            if not l:
-                continue
-            char, key, value = l.strip().split('\t')
-            if int(char[2:], 16) == ch:
-                if PY3:
-                    properties[key] = value
-                else:
-                    properties[key] = unicode(value, 'utf-8')
-            elif int(char[2:], 16)>ch:
-                break
-    return properties
-
-# basic sanity check, if e.g. you run this on MS Windows...
-if os.path.exists('/bin/grep'):
-    get_unihan_properties = get_unihan_properties_zgrep
-else:
-    get_unihan_properties = get_unihan_properties_internal
-
-
-def error(txt):
-    out(txt)
-    out('\n')
-    sys.exit(1)
-
-def get_gzip_filename(fname):
-    "return fname if it exists; otherwise try fname+'.gz', then fname+'.bz2'; return None if none of them exists"
-    if os.path.exists(fname):
-        return fname
-    if os.path.exists(fname+'.gz'):
-        return fname+'.gz'
-    if os.path.exists(fname+'.bz2') and bz2 is not None:
-        return fname+'.bz2'
-    return None
-
-
-def OpenGzip(fname):
-    "open fname, trying fname.gz or fname.bz2 if fname does not exist; return a plain file, GzipFile or BZ2File object"
-    if os.path.exists(fname) and not (fname.endswith('.gz') or fname.endswith('.bz2')):
-        return open(fname)
-    if os.path.exists(fname+'.gz'):
-        fname = fname+'.gz'
-    elif os.path.exists(fname+'.bz2') and bz2 is not None:
-        fname = fname+'.bz2'
-    if fname.endswith('.gz'):
-        return gzip.GzipFile(fname)
-    elif fname.endswith('.bz2'):
-        return bz2.BZ2File(fname)
-    return None
-
-def GrepInNames(pattern, fillcache=False):
-    p = re.compile(pattern, re.I)
-    f = None
-    for name in UnicodeDataFileNames:
-        f = OpenGzip(name)
-        if f != None:
-            break
-    if not fillcache:
-        if not f:
-            out( """
-Cannot find UnicodeData.txt, please place it into
-/usr/share/unidata/UnicodeData.txt,
-/usr/share/unicode/UnicodeData.txt, ~/.unicode/ or the current
-working directory (optionally you can gzip it).
-Without the file, searching will be much slower.
- -""" ) - for i in xrange(sys.maxunicode): - try: - name = unicodedata.name(unichr(i)) - if re.search(p, name): - yield myunichr(i) - except ValueError: - pass - else: - for l in f: - if re.search(p, l): - r = myunichr(int(l.split(';')[0], 16)) - linecache[r] = l - yield r - f.close() - else: - if f: - for l in f: - if re.search(p, l): - r = myunichr(int(l.split(';')[0], 16)) - linecache[r] = l - f.close() - - -def valfromcp(n, cp=None): - "if fromcp is defined, then the 'n' is considered to be from that codepage and is converted accordingly" - if cp: - xh = '%x' %n - if len(xh) % 2: # pad hexadecimal representation with a zero - xh = '0'+xh - cps = ( [xh[i:i+2] for i in range(0,len(xh),2)] ) - cps = ( chr(int(i, 16)) for i in cps) - cps = ''.join(cps) - """ - if 0 <= n <= 255: - s = chr(n) - elif 256 <= n <= 65535: - s = struct.pack('>H', n) - elif 65536 <= n <= sys.maxint: - s = struct.pack('>H', n) - else: # bad character code, either negative or too big - raise ValueError("Bad character code %s" %n) - print 'ee',`s` - n = unicode(s, cp) - """ - s = unicode(cps, cp) - ns = [ord(x) for x in s] - return ns - else: - return [n] - -def myunichr(n): - try: - r = unichr(n) - return r - except OverflowError: - traceback.print_exc() - error("The codepoint is too big - it does not fit into an int.") - except ValueError: - traceback.print_exc() - err = "The codepoint is too big." - if sys.maxunicode <= 0xffff: - err += "\nPerhaps your python interpreter is not compiled with wide unicode characters." - error(err) - - -def guesstype(arg): - if not arg: # empty string - return 'empty string', arg - elif not is_ascii(arg): - return 'string', arg - elif arg[:2]=='U+' or arg[:2]=='u+': # it is hexadecimal number - try: - val = int(arg[2:], 16) - if val>sys.maxunicode: - return 'regexp', arg - else: - return 'hexadecimal', arg[2:] - except ValueError: - return 'regexp', arg - elif arg[0] in "Uu" and len(arg)>4: - try: - val = int(arg[1:], 16) - if val>sys.maxunicode: - return 'regexp', arg - else: - return 'hexadecimal', arg - except ValueError: - return 'regexp', arg - elif len(arg)>=4: - if len(arg) in (8, 16, 24, 32): - if all(x in '01' for x in arg): - val = int(arg, 2) - if val<=sys.maxunicode: - return 'binary', arg - try: - val = int(arg, 16) - if val>sys.maxunicode: - return 'regexp', arg - else: - return 'hexadecimal', arg - except ValueError: - return 'regexp', arg - else: - return 'string', arg - -def process(arglist, t, fromcp=None): - # build a list of values, so that we can combine queries like - # LATIN ALPHA and search for LATIN.*ALPHA and not names that - # contain either LATIN or ALPHA - result = [] - names_query = [] # reserved for queries in names - i.e. -r - for arg_i in arglist: - if t==None: - tp, arg = guesstype(arg_i) - if tp == 'regexp': - # if the first argument is guessed to be a regexp, add - # all the following arguments to the regular expression - - # this is probably what you wanted, e.g. 
-                # 'unicode cyrillic be' will now search for the 'cyrillic.*be' regular expression
-                t = 'regexp'
-        else:
-            tp, arg = t, arg_i
-        if tp=='hexadecimal':
-            val = int(arg, 16)
-            vals = valfromcp(val, fromcp)
-            for val in vals:
-                r = myunichr(val)
-                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
-                result.append(r)
-        elif tp=='decimal':
-            val = int(arg, 10)
-            vals = valfromcp(val, fromcp)
-            for val in vals:
-                r = myunichr(val)
-                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
-                result.append(r)
-        elif tp=='octal':
-            val = int(arg, 8)
-            vals = valfromcp(val, fromcp)
-            for val in vals:
-                r = myunichr(val)
-                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
-                result.append(r)
-        elif tp=='binary':
-            val = int(arg, 2)
-            vals = valfromcp(val, fromcp)
-            for val in vals:
-                r = myunichr(val)
-                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
-                result.append(r)
-        elif tp=='regexp':
-            names_query.append(arg)
-        elif tp=='string':
-            try:
-                if PY3: # argv is automatically decoded into unicode, even padded with a bogus character if it is not encodable
-                    unirepr = arg
-                else:
-                    unirepr = unicode(arg, options.iocharset)
-            except UnicodeDecodeError:
-                error("Sequence %s is not valid in charset '%s'." % (repr(arg), options.iocharset))
-            unilist = ['%04X'%ord(x) for x in unirepr]
-            unireg = '|'.join(unilist)
-            list(GrepInNames(unireg, fillcache=True))
-            for r in unirepr:
-                result.append(r)
-        elif tp=='empty string':
-            pass # do not do anything for an empty string
-    if names_query:
-        query = '.*'.join(names_query)
-        for r in GrepInNames(query):
-            result.append(r)
-    return result
-
-def maybe_colours(colour):
-    if use_colour:
-        return colours[colour]
-    else:
-        return ""
-
-# format key and value
-def printkv(*l):
-    for i in range(0, len(l), 2):
-        if i < len(l)-2:
-            sep = "  "
-        else:
-            sep = "\n"
-        out(l[i], ": ", unicode(l[i+1]), sep)
-
-
-def print_characters(clist, maxcount, query_wiki=0):
-    counter = 0
-    for c in clist:
-        if query_wiki:
-            webbrowser.open('http://en.wikipedia.org/wiki/%s' % urllib.quote(c.encode('utf-8')))
-        counter += 1
-        if maxcount and counter > options.maxcount:
-            out("\nToo many characters to display, more than %s, use --max option to change it\n" % options.maxcount)
-            return
-        properties = get_unicode_properties(c)
-        out(maybe_colours('bold'))
-        out('U+%04X '% ord(c))
-        if properties['name']:
-            out(properties['name'])
-        else:
-            out(maybe_colours('default'))
-            out(" - No such unicode character name in database")
-        out(maybe_colours('default'))
-        out('\n')
-
-        ar = ["UTF-8", ' '.join([("%02x" % ord23(x)) for x in c.encode('utf-8')]),
-            "UTF-16BE", ''.join([("%02x" % ord23(x)) for x in c.encode('utf-16be')]),
-            "Decimal", "&#%s;" % ord(c) ]
-        if options.addcharset:
-            try:
-                rep = ' '.join([("%02x" % ord(x)) for x in c.encode(options.addcharset)] )
-            except UnicodeError:
-                rep = "NONE"
-            ar.extend( [options.addcharset, rep] )
-        printkv(*ar)
-
-
-        if properties['combining']:
-            pc = " "+c
-        else:
-            pc = c
-        out(pc)
-        uppercase = properties['uppercase']
-        lowercase = properties['lowercase']
-        if uppercase:
-            out(" (%s)" % uppercase)
-            out('\n')
-            printkv( "Uppercase", 'U+%04X'% ord(properties['uppercase']) )
-        elif lowercase:
-            out(" (%s)" % properties['lowercase'])
-            out('\n')
-            printkv( "Lowercase", 'U+%04X'% ord(properties['lowercase']) )
-        else:
-            out('\n')
-        printkv( 'Category', properties['category']+ " (%s)" % general_category[properties['category']] )
-
-        if properties['numeric_value']:
-            printkv( 'Numeric value', properties['numeric_value'])
-        if properties['digit_value']:
-            printkv( 'Digit value', properties['digit_value'])
-
-        bidi = properties['bidi']
-        if bidi:
-            printkv( 'Bidi', bidi+ " (%s)" % bidi_category[bidi] )
-        mirrored = properties['mirrored']
-        if mirrored:
-            out('Character is mirrored\n')
-        comb = properties['combining']
-        if comb:
-            printkv( 'Combining', str(comb)+ " (%s)" % (comb_classes.get(comb, '?')) )
-        decomp = properties['decomposition']
-        if decomp:
-            printkv( 'Decomposition', decomp )
-        if options.verbosity>0:
-            uhp = get_unihan_properties(c)
-            for key in uhp:
-                printkv(key, uhp[key])
-        out('\n')
-
-
-def print_block(block):
-    #header
-    out(" "*10)
-    for i in range(16):
-        out(".%X " % i)
-    out('\n')
-    #body
-    for i in range(block*16, block*16+16):
-        hexi = "%X" % i
-        if len(hexi)>3:
-            hexi = "%07X" % i
-            hexi = hexi[:4]+" "+hexi[4:]
-        else:
-            hexi = " %03X" % i
-        out(LTR+hexi+". ")
-        for j in range(16):
-            c = unichr(i*16+j)
-            if unicodedata.combining(c):
-                c = " "+c
-            out(c)
-            out(' ')
-        out('\n')
-    out('\n')
-
-def print_blocks(blocks):
-    for block in blocks:
-        print_block(block)
-
-def is_range(s, typ):
-    sp = s.split('..')
-    if len(sp)!=2:
-        return False
-    if not sp[1]:
-        sp[1] = sp[0]
-    elif not sp[0]:
-        sp[0] = sp[1]
-    if not sp[0]:
-        return False
-    low = list(process([sp[0]], typ)) # intentionally no fromcp here, ranges are only of unicode characters
-    high = list(process([sp[1]], typ))
-    if len(low)!=1 or len(high)!=1:
-        return False
-    low = ord(low[0])
-    high = ord(high[0])
-    low = low // 256
-    high = high // 256 + 1
-    return range(low, high)
-
-
-
-parser = OptionParser(usage="usage: %prog [options] arg")
-parser.add_option("-x", "--hexadecimal",
-        action="store_const", const='hexadecimal', dest="type",
-        help="Assume arg to be a hexadecimal number")
-parser.add_option("-o", "--octal",
-        action="store_const", const='octal', dest="type",
-        help="Assume arg to be an octal number")
-parser.add_option("-b", "--binary",
-        action="store_const", const='binary', dest="type",
-        help="Assume arg to be a binary number")
-parser.add_option("-d", "--decimal",
-        action="store_const", const='decimal', dest="type",
-        help="Assume arg to be a decimal number")
-parser.add_option("-r", "--regexp",
-        action="store_const", const='regexp', dest="type",
-        help="Assume arg to be a regular expression")
-parser.add_option("-s", "--string",
-        action="store_const", const='string', dest="type",
-        help="Assume arg to be a sequence of characters")
-parser.add_option("-a", "--auto",
-        action="store_const", const=None, dest="type",
-        help="Try to guess arg type (default)")
-parser.add_option("-m", "--max",
-        action="store", default=10, dest="maxcount", type="int",
-        help="Maximum number of codepoints to display, default: 10; 0=unlimited")
-parser.add_option("-i", "--io",
-        action="store", default=iocharsetguess, dest="iocharset", type="string",
-        help="I/O character set, I am guessing %s" % iocharsetguess)
-parser.add_option("--fcp", "--fromcp",
-        action="store", default='', dest="fromcp", type="string",
-        help="Convert numerical arguments from this encoding, default: no conversion")
-parser.add_option("-c", "--charset-add",
-        action="store", dest="addcharset", type="string",
-        help="Show hexadecimal representation in this additional charset")
-parser.add_option("-C", "--colour",
-        action="store", dest="use_colour", type="string",
-        default="auto",
-        help="Use colours: on, off or auto")
-parser.add_option('', "--color",
-        action="store", dest="use_colour", type="string",
-        default="auto",
-        help="Synonym for --colour")
-parser.add_option("-v", "--verbose",
-        action="count", dest="verbosity",
-        default=0,
-        help="Increase verbosity (reads Unihan properties - slow!)")
-parser.add_option("-w", "--wikipedia",
-        action="count", dest="query_wiki",
dest="query_wiki", - default=0, - help="Query wikipedia for the character") -parser.add_option("--list", - action="store_const", dest="list_all_encodings", - const=True, - help="List (approximately) all known encodings") - - -(options, arguments) = parser.parse_args() - -linecache = {} -do_init() - - -if options.list_all_encodings: - all_encodings = os.listdir(os.path.dirname(encodings.__file__)) - all_encodings = set([os.path.splitext(x)[0] for x in all_encodings]) - all_encodings = list(all_encodings) - all_encodings.sort() - print (textwrap.fill(' '.join(all_encodings))) - sys.exit() - -if len(arguments)==0: - parser.print_help() - sys.exit() - - -if options.use_colour.lower() in ("on", "1", "true", "yes"): - use_colour = True -elif options.use_colour.lower() in ("off", "0", "false", "no"): - use_colour = False -else: - use_colour = sys.stdout.isatty() - if sys.platform == 'win32': - use_colour = False - - -l_args = [] # list of non range arguments to process -for argum in arguments: - is_r = is_range(argum, options.type) - if is_r: - print_blocks(is_r) - else: - l_args.append(argum) - -if l_args: - unihan_fs = [] - if options.verbosity>0: - unihan_fs = get_unihan_files() # list of file names for Unihan data file(s), empty if not available - if not unihan_fs: - out( """ -Unihan_*.txt files not found. In order to view Unihan properties, -please place the file into /usr/share/unidata/, -/usr/share/unicode/, ~/.unicode/ -or current working directory (optionally you can gzip or bzip2 them). -You can get the files by unpacking ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip -Warning, listing UniHan Properties is rather slow. - -""") - options.verbosity = 0 - try: - print_characters(process(l_args, options.type, options.fromcp), options.maxcount, options.query_wiki) - except IOError: # e.g. broken pipe - pass - + print u''.join(map(l, sys.argv[1:])).encode('utf-8') +except KeyError: + os.execvp("multicode", sys.argv[1:]) diff -r 038fcc1f45ea -r b53ae6f7abbb bin/unicode.old --- a/bin/unicode.old Fri Apr 25 03:31:28 2014 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- -import re -import sys -import unicodedata -def l(c): m = re.match('(?:U[+])?([0-9a-f]{1,5})$', c, re.I); return unicodedata.lookup(c) if m is None else unichr(int(m.group(1),16)) -try: - print u''.join(map(l, sys.argv[1:])).encode('utf-8') -except KeyError: - print u'Unknown character.'