changeset 4610:b53ae6f7abbb

<Jafet> mv bin/{un,mult}icode && (head -n -1 bin/unicode.old | sed 's/import sys/import sys, os/'; echo ' os.execvp("multicode", sys.argv[1:])') > bin/unicode && rm bin/unicode.old
author HackBot
date Fri, 25 Apr 2014 17:56:09 +0000
parents 038fcc1f45ea
children ea2809528ca0
files bin/multicode bin/unicode bin/unicode.old
diffstat 3 files changed, 824 insertions(+), 824 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/multicode	Fri Apr 25 17:56:09 2014 +0000
@@ -0,0 +1,815 @@
+#!/usr/bin/python
+
+
+import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings
+import urllib, webbrowser, textwrap
+
+# bz2 was introduced in Python 2.3; we want this to work with earlier versions too
+try:
+    import bz2
+except ImportError:
+    bz2 = None
+
+# for python3
+try:
+    unicode
+except NameError:
+    unicode = str
+
+# 'any' and 'all' were introduced in python2.5
+# dummy replacement for older versions
+try:
+    all
+except NameError:
+    all = lambda x: False
+
+PY3 = sys.version_info[0] >= 3
+if PY3:
+    import subprocess as cmd
+
+    def is_ascii(s):
+        "test is string s consists completely of ascii characters (python 3)"
+        try:
+            s.encode('ascii')
+        except UnicodeEncodeError:
+            return False
+        return True
+
+    def out(*args):
+        "pring args, converting them to output charset"
+        for i in args:
+            sys.stdout.flush()
+            sys.stdout.buffer.write(i.encode(options.iocharset, 'replace'))
+
+    # in python3, indexing a byte string already yields integers, so ord23 is the identity
+    ord23 = lambda x: x
+
+    # unichr is not in python3
+    unichr = chr
+
+else: # python2
+
+    # getoutput() and getstatusoutput() have moved from the commands
+    # module to the subprocess module in Python 3
+    import commands as cmd
+
+    def is_ascii(s):
+        "test is string s consists completely of ascii characters (python 2)"
+        try:
+            unicode(s, 'ascii')
+        except UnicodeDecodeError:
+            return False
+        return True
+
+    def out(*args):
+        "pring args, converting them to output charset"
+        for i in args:
+            sys.stdout.write(i.encode(options.iocharset, 'replace'))
+
+    ord23 = ord
+
+
+
+from optparse import OptionParser
+
+VERSION='0.9.7'
+
+
+# list of terminals that support bidi
+biditerms = ['mlterm']
+
+try:
+    locale.setlocale(locale.LC_ALL, '')
+except locale.Error:
+    pass
+
+# guess terminal charset
+try:
+    iocharsetguess = locale.nl_langinfo(locale.CODESET) or "ascii"
+except locale.Error:
+    iocharsetguess = "ascii"
+
+if os.environ.get('TERM') in biditerms and iocharsetguess.lower().startswith('utf'):
+    LTR = u'\u202d' # left to right override
+else:
+    LTR = ''
+
+
+colours = {
+            'none'       :    "",
+            'default'    :    "\033[0m",
+            'bold'       :    "\033[1m",
+            'underline'  :    "\033[4m",
+            'blink'      :    "\033[5m",
+            'reverse'    :    "\033[7m",
+            'concealed'  :    "\033[8m",
+
+            'black'      :    "\033[30m",
+            'red'        :    "\033[31m",
+            'green'      :    "\033[32m",
+            'yellow'     :    "\033[33m",
+            'blue'       :    "\033[34m",
+            'magenta'    :    "\033[35m",
+            'cyan'       :    "\033[36m",
+            'white'      :    "\033[37m",
+
+            'on_black'   :    "\033[40m",
+            'on_red'     :    "\033[41m",
+            'on_green'   :    "\033[42m",
+            'on_yellow'  :    "\033[43m",
+            'on_blue'    :    "\033[44m",
+            'on_magenta' :    "\033[45m",
+            'on_cyan'    :    "\033[46m",
+            'on_white'   :    "\033[47m",
+
+            'beep'       :    "\007",
+            }
+
+
+general_category = {
+      'Lu':  'Letter, Uppercase',
+      'Ll':  'Letter, Lowercase',
+      'Lt':  'Letter, Titlecase',
+      'Lm':  'Letter, Modifier',
+      'Lo':  'Letter, Other',
+      'Mn':  'Mark, Non-Spacing',
+      'Mc':  'Mark, Spacing Combining',
+      'Me':  'Mark, Enclosing',
+      'Nd':  'Number, Decimal Digit',
+      'Nl':  'Number, Letter',
+      'No':  'Number, Other',
+      'Pc':  'Punctuation, Connector',
+      'Pd':  'Punctuation, Dash',
+      'Ps':  'Punctuation, Open',
+      'Pe':  'Punctuation, Close',
+      'Pi':  'Punctuation, Initial quote',
+      'Pf':  'Punctuation, Final quote',
+      'Po':  'Punctuation, Other',
+      'Sm':  'Symbol, Math',
+      'Sc':  'Symbol, Currency',
+      'Sk':  'Symbol, Modifier',
+      'So':  'Symbol, Other',
+      'Zs':  'Separator, Space',
+      'Zl':  'Separator, Line',
+      'Zp':  'Separator, Paragraph',
+      'Cc':  'Other, Control',
+      'Cf':  'Other, Format',
+      'Cs':  'Other, Surrogate',
+      'Co':  'Other, Private Use',
+      'Cn':  'Other, Not Assigned',
+}
+
+bidi_category = {
+     'L'   : 'Left-to-Right',
+     'LRE' : 'Left-to-Right Embedding',
+     'LRO' : 'Left-to-Right Override',
+     'R'   : 'Right-to-Left',
+     'AL'  : 'Right-to-Left Arabic',
+     'RLE' : 'Right-to-Left Embedding',
+     'RLO' : 'Right-to-Left Override',
+     'PDF' : 'Pop Directional Format',
+     'EN'  : 'European Number',
+     'ES'  : 'European Number Separator',
+     'ET'  : 'European Number Terminator',
+     'AN'  : 'Arabic Number',
+     'CS'  : 'Common Number Separator',
+     'NSM' : 'Non-Spacing Mark',
+     'BN'  : 'Boundary Neutral',
+     'B'   : 'Paragraph Separator',
+     'S'   : 'Segment Separator',
+     'WS'  : 'Whitespace',
+     'ON'  : 'Other Neutrals',
+}
+
+comb_classes = {
+        0: 'Spacing, split, enclosing, reordrant, and Tibetan subjoined',
+        1: 'Overlays and interior',
+        7: 'Nuktas',
+        8: 'Hiragana/Katakana voicing marks',
+        9: 'Viramas',
+       10: 'Start of fixed position classes',
+      199: 'End of fixed position classes',
+      200: 'Below left attached',
+      202: 'Below attached',
+      204: 'Below right attached',
+      208: 'Left attached (reordrant around single base character)',
+      210: 'Right attached',
+      212: 'Above left attached',
+      214: 'Above attached',
+      216: 'Above right attached',
+      218: 'Below left',
+      220: 'Below',
+      222: 'Below right',
+      224: 'Left (reordrant around single base character)',
+      226: 'Right',
+      228: 'Above left',
+      230: 'Above',
+      232: 'Above right',
+      233: 'Double below',
+      234: 'Double above',
+      240: 'Below (iota subscript)',
+}
+
+
+
+def get_unicode_properties(ch):
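+    # linecache maps characters to raw UnicodeData.txt lines; fields are
+    # semicolon-separated, e.g. for U+0041:
+    # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;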
+    properties = {}
+    if ch in linecache:
+        fields = linecache[ch].strip().split(';')
+        proplist = ['codepoint', 'name', 'category', 'combining', 'bidi', 'decomposition', 'dummy', 'digit_value', 'numeric_value', 'mirrored', 'unicode1name', 'iso_comment', 'uppercase', 'lowercase', 'titlecase']
+        for i, prop in enumerate(proplist):
+            if prop!='dummy':
+                properties[prop] = fields[i]
+
+        if properties['lowercase']:
+            properties['lowercase'] = unichr(int(properties['lowercase'], 16))
+        if properties['uppercase']:
+            properties['uppercase'] = unichr(int(properties['uppercase'], 16))
+        if properties['titlecase']:
+            properties['titlecase'] = unichr(int(properties['titlecase'], 16))
+
+        properties['combining'] = int(properties['combining'])
+        properties['mirrored'] = properties['mirrored']=='Y'
+    else:
+        properties['codepoint'] = '%04X' % ord(ch)
+        properties['name'] = unicodedata.name(ch, '')
+        properties['category'] = unicodedata.category(ch)
+        properties['combining'] = unicodedata.combining(ch)
+        properties['bidi'] = unicodedata.bidirectional(ch)
+        properties['decomposition'] = unicodedata.decomposition(ch)
+        properties['digit_value'] = unicodedata.digit(ch, '')
+        properties['numeric_value'] = unicodedata.numeric(ch, '')
+        properties['mirrored'] = unicodedata.mirrored(ch)
+        properties['unicode1name'] = ''
+        properties['iso_comment'] = ''
+        properties['uppercase'] = ch.upper()
+        properties['lowercase'] = ch.lower()
+        properties['titlecase'] = ''
+    return properties
+
+
+def do_init():
+    HomeDir = os.path.expanduser('~/.unicode')
+    HomeUnicodeData = os.path.join(HomeDir, "UnicodeData.txt")
+    global UnicodeDataFileNames
+    UnicodeDataFileNames = [HomeUnicodeData, '/usr/share/unicode/UnicodeData.txt', '/usr/share/unidata/UnicodeData.txt', '/hackenv/bin/UnicodeData.txt'] + \
+        glob.glob('/usr/share/unidata/UnicodeData*.txt') + \
+        glob.glob('/usr/share/perl/*/unicore/UnicodeData.txt') + \
+        glob.glob('/System/Library/Perl/*/unicore/UnicodeData.txt') # for MacOSX
+
+    HomeUnihanData = os.path.join(HomeDir, "Unihan*")
+    global UnihanDataGlobs
+    UnihanDataGlobs = [HomeUnihanData, '/usr/share/unidata/Unihan*', '/usr/share/unicode/Unihan*', './Unihan*']
+
+
+def get_unihan_files():
+    fos = [] # list of file names for Unihan data file(s)
+    for gl in UnihanDataGlobs:
+        fnames = glob.glob(gl)
+        fos += fnames
+    return fos
+
+def get_unihan_properties_internal(ch):
+    properties = {}
+    ch = ord(ch)
+    global unihan_fs
+    for f in unihan_fs:
+        fo = OpenGzip(f)
+        for l in fo:
+            if l.startswith('#'):
+                continue
+            line = l.strip()
+            if not line:
+                continue
+            char, key, value = line.strip().split('\t')
+            if int(char[2:], 16) == ch:
+                properties[key] = unicode(value, 'utf-8')
+            elif int(char[2:], 16)>ch:
+                break
+    return properties
+
+def get_unihan_properties_zgrep(ch):
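+    # shell out to grep/zgrep/bzgrep for lines starting with 'U+XXXX' plus a
+    # word boundary, rather than scanning the files in python
+    # (see get_unihan_properties_internal)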
+    properties = {}
+    global unihan_fs
+    ch = ord(ch)
+    chs = 'U+%X' % ch
+    for f in unihan_fs:
+        if f.endswith('.gz'):
+            grepcmd = 'zgrep'
+        elif f.endswith('.bz2'):
+            grepcmd = 'bzgrep'
+        else:
+            grepcmd = 'grep'
+        cmdline = grepcmd+' ^'+chs+r'\\b '+f
+        status, output = cmd.getstatusoutput(cmdline)
+        output = output.split('\n')
+        for l in output:
+            if not l:
+                continue
+            char, key, value = l.strip().split('\t')
+            if int(char[2:], 16) == ch:
+                if PY3:
+                    properties[key] = value
+                else:
+                    properties[key] = unicode(value, 'utf-8')
+            elif int(char[2:], 16)>ch:
+                break
+    return properties
+
+# basic sanity check, if e.g. you run this on MS Windows...
+if os.path.exists('/bin/grep'):
+    get_unihan_properties = get_unihan_properties_zgrep
+else:
+    get_unihan_properties = get_unihan_properties_internal
+
+
+def error(txt):
+    out(txt)
+    out('\n')
+    sys.exit(1)
+
+def get_gzip_filename(fname):
+    "return fname, if it does not exist, return fname+.gz, if neither that, fname+bz2, if neither that, return None"
+    if os.path.exists(fname):
+        return fname
+    if os.path.exists(fname+'.gz'):
+        return fname+'.gz'
+    if os.path.exists(fname+'.bz2') and bz2 is not None:
+        return fname+'.bz2'
+    return None
+
+
+def OpenGzip(fname):
+    "open fname, try fname.gz or fname.bz2 if fname does not exist, return file object or GzipFile or BZ2File object"
+    if os.path.exists(fname) and not (fname.endswith('.gz') or fname.endswith('.bz2')):
+        return open(fname)
+    if os.path.exists(fname+'.gz'):
+        fname = fname+'.gz'
+    elif os.path.exists(fname+'.bz2') and bz2 is not None:
+        fname = fname+'.bz2'
+    if fname.endswith('.gz'):
+        return gzip.GzipFile(fname)
+    elif fname.endswith('.bz2'):
+        return bz2.BZ2File(fname)
+    return None
+
+def GrepInNames(pattern, fillcache=False):
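+    # generator: yields every character whose UnicodeData.txt line matches
+    # 'pattern' (case-insensitively), caching the raw line in linecache;
+    # with fillcache=True it only fills the cache and yields nothing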
+    p = re.compile(pattern, re.I)
+    f = None
+    for name in UnicodeDataFileNames:
+        f = OpenGzip(name)
+        if f != None:
+            break
+    if not fillcache:
+        if not f:
+            out( """
+Cannot find UnicodeData.txt, please place it into 
+/usr/share/unidata/UnicodeData.txt,
+/usr/share/unicode/UnicodeData.txt, ~/.unicode/ or current 
+working directory (optionally you can gzip it).
+Without the file, searching will be much slower.
+
+""" )
+            for i in xrange(sys.maxunicode):
+                try:
+                    name = unicodedata.name(unichr(i))
+                    if re.search(p, name):
+                        yield myunichr(i)
+                except ValueError:
+                    pass
+        else:
+            for l in f:
+                if re.search(p, l):
+                    r = myunichr(int(l.split(';')[0], 16))
+                    linecache[r] = l
+                    yield r
+            f.close()
+    else:
+        if f:
+            for l in f:
+                if re.search(p, l):
+                    r = myunichr(int(l.split(';')[0], 16))
+                    linecache[r] = l
+            f.close()
+ 
+
+def valfromcp(n, cp=None):
+    "if fromcp is defined, then the 'n' is considered to be from that codepage and is converted accordingly"
+    if cp:
+        xh = '%x' %n
+        if len(xh) % 2: # pad hexadecimal representation with a zero
+            xh = '0'+xh
+        cps = ( [xh[i:i+2] for i in range(0,len(xh),2)] )
+        cps = ( chr(int(i, 16)) for i in cps)
+        cps = ''.join(cps)
+        """
+        if 0 <= n <= 255:
+            s = chr(n)
+        elif 256 <= n <= 65535:
+            s = struct.pack('>H', n)
+        elif 65536 <= n <= sys.maxint:
+            s = struct.pack('>H', n)
+        else: # bad character code, either negative or too big
+            raise ValueError("Bad character code %s" %n)
+        print 'ee',`s`
+        n = unicode(s, cp)
+        """
+        s = unicode(cps, cp)
+        ns = [ord(x) for x in s]
+        return ns
+    else:
+        return [n]
+
+def myunichr(n):
+    try:
+        r = unichr(n)
+        return r
+    except OverflowError:
+        traceback.print_exc()
+        error("The codepoint is too big - it does not fit into an int.")
+    except ValueError:
+        traceback.print_exc()
+        err = "The codepoint is too big."
+        if sys.maxunicode <= 0xffff:
+            err += "\nPerhaps your python interpreter is not compiled with wide unicode characters."
+        error(err)
+
+
+def guesstype(arg):
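+    # e.g. guesstype('00e9') -> ('hexadecimal', '00e9'),
+    # guesstype('U+1F4A9') -> ('hexadecimal', '1F4A9') on a wide unicode
+    # build, guesstype('LATIN') -> ('regexp', 'LATIN')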
+    if not arg: # empty string
+        return 'empty string', arg
+    elif not is_ascii(arg):
+        return 'string', arg
+    elif arg[:2]=='U+' or arg[:2]=='u+': # it is hexadecimal number
+        try:
+            val = int(arg[2:], 16)
+            if val>sys.maxunicode:
+                return 'regexp', arg
+            else:
+                return 'hexadecimal', arg[2:]
+        except ValueError:
+            return 'regexp', arg
+    elif arg[0] in "Uu" and len(arg)>4:
+        try:
+            val = int(arg[1:], 16)
+            if val>sys.maxunicode:
+                return 'regexp', arg
+            else:
+                return 'hexadecimal', arg
+        except ValueError:
+            return 'regexp', arg
+    elif len(arg)>=4:
+        if len(arg) in (8, 16, 24, 32):
+            if all(x in '01' for x in arg):
+                val = int(arg, 2)
+                if val<=sys.maxunicode:
+                    return 'binary', arg
+        try:
+            val = int(arg, 16)
+            if val>sys.maxunicode:
+                return 'regexp', arg
+            else:
+                return 'hexadecimal', arg
+        except ValueError:
+            return 'regexp', arg
+    else:
+        return 'string', arg
+
+def process(arglist, t, fromcp=None):
+    # build a list of values, so that we can combine queries like
+    # LATIN ALPHA and search for LATIN.*ALPHA and not names that
+    # contain either LATIN or ALPHA
+    result = []
+    names_query = [] # reserved for queries in names - i.e. -r
+    for arg_i in arglist:
+        if t==None:
+            tp, arg = guesstype(arg_i)
+            if tp == 'regexp':
+                # if the first argument is guessed to be a regexp, add
+                # all the following arguments to the regular expression -
+                # this is probably what you wanted, e.g. 
+                # 'unicode cyrillic be' will now search for the 'cyrillic.*be' regular expression
+                t = 'regexp'
+        else:
+            tp, arg = t, arg_i
+        if tp=='hexadecimal':
+            val = int(arg, 16)
+            vals = valfromcp(val, fromcp)
+            for val in vals:
+                r = myunichr(val)
+                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
+                result.append(r)
+        elif tp=='decimal':
+            val = int(arg, 10)
+            vals = valfromcp(val, fromcp)
+            for val in vals:
+                r = myunichr(val)
+                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
+                result.append(r)
+        elif tp=='octal':
+            val = int(arg, 8)
+            vals = valfromcp(val, fromcp)
+            for val in vals:
+                r = myunichr(val)
+                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
+                result.append(r)
+        elif tp=='binary':
+            val = int(arg, 2)
+            vals = valfromcp(val, fromcp)
+            for val in vals:
+                r = myunichr(val)
+                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
+                result.append(r)
+        elif tp=='regexp':
+            names_query.append(arg)
+        elif tp=='string':
+            try:
+                if PY3: # argv is automatically decoded into unicode; undecodable bytes are replaced with bogus (surrogate) characters
+                    unirepr = arg
+                else:
+                    unirepr = unicode(arg, options.iocharset)
+            except UnicodeDecodeError:
+                error ("Sequence %s is not valid in charset '%s'." % (repr(arg),  options.iocharset))
+            unilist = ['%04X'%ord(x) for x in unirepr]
+            unireg = '|'.join(unilist)
+            list(GrepInNames(unireg, fillcache=True))
+            for r in unirepr:
+                result.append(r)
+        elif tp=='empty string':
+            pass # do not do anything for an empty string
+    if names_query:
+        query = '.*'.join(names_query)
+        for r in GrepInNames(query):
+            result.append(r)
+    return result
+
+def maybe_colours(colour):
+    if use_colour:
+        return colours[colour]
+    else:
+        return ""
+
+# format key and value
+def printkv(*l):
+    for i in range(0, len(l), 2):
+        if i<len(l)-2:
+            sep = "  "
+        else:
+            sep = "\n"
+        k, v = l[i], l[i+1]
+        out(maybe_colours('green'))
+        out(k)
+        out(": ")
+        out(maybe_colours('default'))
+        out(unicode(v))
+        out(sep)
+
+def print_characters(clist, maxcount, query_wiki=0):
+    """query_wiki - 0 - don't
+                    1 - spawn browser
+    """
+    counter = 0
+    for c in clist:
+
+        if query_wiki:
+            ch = urllib.quote(c.encode('utf-8')) # wikipedia uses UTF-8 in names
+            wiki_url = 'http://en.wikipedia.org/wiki/'+ch
+            webbrowser.open(wiki_url)
+            query_wiki = 0 # query only the very first character
+
+
+        if maxcount:
+            counter += 1
+        if counter > options.maxcount:
+            out("\nToo many characters to display, more than %s, use --max option to change it\n" % options.maxcount)
+            return
+        properties = get_unicode_properties(c)
+        out(maybe_colours('bold'))
+        out('U+%04X '% ord(c)) 
+        if properties['name']:
+            out(properties['name'])
+        else:
+            out(maybe_colours('default'))
+            out(" - No such unicode character name in database")
+        out(maybe_colours('default'))
+        out('\n')
+
+        ar = ["UTF-8", ' '.join([("%02x" % ord23(x)) for x in c.encode('utf-8')]) ,
+              "UTF-16BE", ''.join([("%02x" % ord23(x)) for x in c.encode('utf-16be')]),
+              "Decimal", "&#%s;" % ord(c) ]
+        if options.addcharset:
+            try:
+                rep = ' '.join([("%02x" % ord(x)) for x in c.encode(options.addcharset)] )
+            except UnicodeError:
+                rep = "NONE"
+            ar.extend( [options.addcharset, rep] )
+        printkv(*ar)
+
+
+        if properties['combining']:
+            pc = " "+c
+        else:
+            pc = c
+        out(pc)
+        uppercase = properties['uppercase']
+        lowercase = properties['lowercase']
+        if uppercase:
+            out(" (%s)" % uppercase)
+            out('\n')
+            printkv( "Uppercase", 'U+%04X'% ord(properties['uppercase']) )
+        elif lowercase:
+            out(" (%s)" % properties['lowercase'])
+            out('\n')
+            printkv( "Lowercase", 'U+%04X'% ord(properties['lowercase']) )
+        else:
+            out('\n')
+        printkv( 'Category', properties['category']+ " (%s)" % general_category[properties['category']] )
+
+        if properties['numeric_value']:
+            printkv( 'Numeric value',  properties['numeric_value'])
+        if properties['digit_value']:
+            printkv( 'Digit value',  properties['digit_value'])
+
+        bidi = properties['bidi']
+        if bidi:
+            printkv( 'Bidi', bidi+ " (%s)" % bidi_category[bidi] )
+        mirrored = properties['mirrored']
+        if mirrored:
+            out('Character is mirrored\n')
+        comb = properties['combining']
+        if comb:
+            printkv( 'Combining', str(comb)+ " (%s)" % (comb_classes.get(comb, '?')) )
+        decomp = properties['decomposition']
+        if decomp:
+            printkv( 'Decomposition', decomp )
+        if options.verbosity>0:
+            uhp = get_unihan_properties(c)
+            for key in uhp:
+                printkv(key, uhp[key])
+        out('\n')
+
+
+def print_block(block):
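+    # a "block" is 256 consecutive codepoints: block n is printed as a
+    # 16x16 grid of unichr(n*256) .. unichr(n*256+255) with hexadecimal
+    # row and column headers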
+    #header
+    out(" "*10)
+    for i in range(16):
+        out(".%X " % i)
+    out('\n')
+    #body
+    for i in range(block*16, block*16+16):
+        hexi = "%X" % i
+        if len(hexi)>3:
+            hexi = "%07X" % i
+            hexi = hexi[:4]+" "+hexi[4:]
+        else:
+            hexi = "     %03X" % i
+        out(LTR+hexi+".  ")
+        for j in range(16):
+            c = unichr(i*16+j)
+            if unicodedata.combining(c):
+                c = " "+c
+            out(c)
+            out('  ')
+        out('\n')
+    out('\n')
+
+def print_blocks(blocks):
+    for block in blocks:
+        print_block(block)
+
+def is_range(s, typ):
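+    # ranges like '0400..04FF' select whole 256-codepoint blocks for
+    # print_blocks; e.g. is_range('0400..04FF', None) -> range(4, 5)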
+    sp = s.split('..')
+    if len(sp)!=2:
+        return False
+    if not sp[1]:
+        sp[1] = sp[0]
+    elif not sp[0]:
+        sp[0] = sp[1]
+    if not sp[0]:
+        return False
+    low = list(process([sp[0]], typ)) # intentionally no fromcp here, ranges are only of unicode characters
+    high = list(process([sp[1]], typ))
+    if len(low)!=1 or len(high)!=1:
+        return False
+    low = ord(low[0])
+    high = ord(high[0])
+    low = low // 256
+    high = high // 256 + 1
+    return range(low, high)
+
+
+
+parser = OptionParser(usage="usage: %prog [options] arg")
+parser.add_option("-x", "--hexadecimal",
+      action="store_const", const='hexadecimal', dest="type", 
+      help="Assume arg to be hexadecimal number")
+parser.add_option("-o", "--octal",
+      action="store_const", const='octal', dest="type", 
+      help="Assume arg to be octal number")
+parser.add_option("-b", "--binary",
+      action="store_const", const='binary', dest="type", 
+      help="Assume arg to be binary number")
+parser.add_option("-d", "--decimal",
+      action="store_const", const='decimal', dest="type",
+      help="Assume arg to be decimal number")
+parser.add_option("-r", "--regexp",
+      action="store_const", const='regexp', dest="type",
+      help="Assume arg to be regular expression")
+parser.add_option("-s", "--string",
+      action="store_const", const='string', dest="type",
+      help="Assume arg to be a sequence of characters")
+parser.add_option("-a", "--auto",
+      action="store_const", const=None, dest="type",
+      help="Try to guess arg type (default)")
+parser.add_option("-m", "--max",
+      action="store", default=10, dest="maxcount", type="int",
+      help="Maximal number of codepoints to display, default: 10; 0=unlimited")
+parser.add_option("-i", "--io",
+      action="store", default=iocharsetguess, dest="iocharset", type="string",
+      help="I/O character set, I am guessing %s" % iocharsetguess)
+parser.add_option("--fcp", "--fromcp",
+      action="store", default='', dest="fromcp", type="string",
+      help="Convert numerical arguments from this encoding, default: no conversion")
+parser.add_option("-c", "--charset-add",
+      action="store", dest="addcharset", type="string",
+      help="Show hexadecimal reprezentation in this additional charset")
+parser.add_option("-C", "--colour",
+      action="store", dest="use_colour", type="string",
+      default="auto",
+      help="Use colours, on, off or auto")
+parser.add_option('', "--color",
+      action="store", dest="use_colour", type="string",
+      default="auto",
+      help="synonym for --colour")
+parser.add_option("-v", "--verbose",
+      action="count", dest="verbosity",
+      default=0,
+      help="Increase verbosity (reads Unihan properties - slow!)")
+parser.add_option("-w", "--wikipedia",
+      action="count", dest="query_wiki",
+      default=0,
+      help="Query wikipedia for the character")
+parser.add_option("--list",
+      action="store_const", dest="list_all_encodings",
+      const=True,
+      help="List (approximately) all known encodings")
+
+
+(options, arguments) = parser.parse_args()
+
+linecache = {}
+do_init()
+
+
+if options.list_all_encodings:
+    all_encodings = os.listdir(os.path.dirname(encodings.__file__))
+    all_encodings = set([os.path.splitext(x)[0] for x in all_encodings])
+    all_encodings = list(all_encodings)
+    all_encodings.sort()
+    print (textwrap.fill(' '.join(all_encodings)))
+    sys.exit()
+
+if len(arguments)==0:
+    parser.print_help()
+    sys.exit()
+
+
+if options.use_colour.lower() in ("on", "1", "true", "yes"):
+    use_colour = True
+elif options.use_colour.lower() in ("off", "0", "false", "no"):
+    use_colour = False
+else:
+    use_colour = sys.stdout.isatty()
+    if sys.platform == 'win32':
+        use_colour = False
+
+
+l_args = [] # list of non range arguments to process
+for argum in arguments:
+    is_r = is_range(argum, options.type)
+    if is_r:
+        print_blocks(is_r)
+    else:
+        l_args.append(argum)
+
+if l_args:
+    unihan_fs = []
+    if options.verbosity>0:
+        unihan_fs = get_unihan_files() # list of file names for Unihan data file(s), empty if not available
+        if not unihan_fs:
+            out( """
+Unihan_*.txt files not found. In order to view Unihan properties,
+please place the files into /usr/share/unidata/,
+/usr/share/unicode/, ~/.unicode/
+or the current working directory (optionally you can gzip or bzip2 them).
+You can get the files by unpacking ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
+Warning: listing Unihan properties is rather slow.
+
+""")
+            options.verbosity = 0
+    try:
+        print_characters(process(l_args, options.type, options.fromcp), options.maxcount, options.query_wiki)
+    except IOError: # e.g. broken pipe
+        pass
+
--- a/bin/unicode	Fri Apr 25 03:31:28 2014 +0000
+++ b/bin/unicode	Fri Apr 25 17:56:09 2014 +0000
@@ -1,815 +1,10 @@
-#!/usr/bin/python
-
-
-import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings
-import urllib, webbrowser, textwrap
-
-# bz2 was introduced in Python 2.3; we want this to work with earlier versions too
-try:
-    import bz2
-except ImportError:
-    bz2 = None
-
-# for python3
-try:
-    unicode
-except NameError:
-    unicode = str
-
-# 'any' and 'all' were introduced in python2.5
-# dummy replacement for older versions
-try:
-    all
-except NameError:
-    all = lambda x: False
-
-PY3 = sys.version_info[0] >= 3
-if PY3:
-    import subprocess as cmd
-
-    def is_ascii(s):
-        "test is string s consists completely of ascii characters (python 3)"
-        try:
-            s.encode('ascii')
-        except UnicodeEncodeError:
-            return False
-        return True
-
-    def out(*args):
-        "pring args, converting them to output charset"
-        for i in args:
-            sys.stdout.flush()
-            sys.stdout.buffer.write(i.encode(options.iocharset, 'replace'))
-
-    # in python3, indexing a byte string already yields integers, so ord23 is the identity
-    ord23 = lambda x: x
-
-    # unichr is not in python3
-    unichr = chr
-
-else: # python2
-
-    # getoutput() and getstatusoutput() have moved from the commands
-    # module to the subprocess module in Python 3
-    import commands as cmd
-
-    def is_ascii(s):
-        "test is string s consists completely of ascii characters (python 2)"
-        try:
-            unicode(s, 'ascii')
-        except UnicodeDecodeError:
-            return False
-        return True
-
-    def out(*args):
-        "pring args, converting them to output charset"
-        for i in args:
-            sys.stdout.write(i.encode(options.iocharset, 'replace'))
-
-    ord23 = ord
-
-
-
-from optparse import OptionParser
-
-VERSION='0.9.7'
-
-
-# list of terminals that support bidi
-biditerms = ['mlterm']
-
-try:
-    locale.setlocale(locale.LC_ALL, '')
-except locale.Error:
-    pass
-
-# guess terminal charset
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+import re
+import sys, os
+import unicodedata
+def l(c): m = re.match('(?:U[+])?([0-9a-f]{1,5})$', c, re.I); return unicodedata.lookup(c) if m is None else unichr(int(m.group(1),16))
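+# l('263a') and l('U+263A') both give u'\u263a'; anything that does not look
+# like a hex codepoint is passed to unicodedata.lookup() as a character name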
 try:
-    iocharsetguess = locale.nl_langinfo(locale.CODESET) or "ascii"
-except locale.Error:
-    iocharsetguess = "ascii"
-
-if os.environ.get('TERM') in biditerms and iocharsetguess.lower().startswith('utf'):
-    LTR = u'\u202d' # left to right override
-else:
-    LTR = ''
-
-
-colours = {
-            'none'       :    "",
-            'default'    :    "\033[0m",
-            'bold'       :    "\033[1m",
-            'underline'  :    "\033[4m",
-            'blink'      :    "\033[5m",
-            'reverse'    :    "\033[7m",
-            'concealed'  :    "\033[8m",
-
-            'black'      :    "\033[30m",
-            'red'        :    "\033[31m",
-            'green'      :    "\033[32m",
-            'yellow'     :    "\033[33m",
-            'blue'       :    "\033[34m",
-            'magenta'    :    "\033[35m",
-            'cyan'       :    "\033[36m",
-            'white'      :    "\033[37m",
-
-            'on_black'   :    "\033[40m",
-            'on_red'     :    "\033[41m",
-            'on_green'   :    "\033[42m",
-            'on_yellow'  :    "\033[43m",
-            'on_blue'    :    "\033[44m",
-            'on_magenta' :    "\033[45m",
-            'on_cyan'    :    "\033[46m",
-            'on_white'   :    "\033[47m",
-
-            'beep'       :    "\007",
-            }
-
-
-general_category = {
-      'Lu':  'Letter, Uppercase',
-      'Ll':  'Letter, Lowercase',
-      'Lt':  'Letter, Titlecase',
-      'Lm':  'Letter, Modifier',
-      'Lo':  'Letter, Other',
-      'Mn':  'Mark, Non-Spacing',
-      'Mc':  'Mark, Spacing Combining',
-      'Me':  'Mark, Enclosing',
-      'Nd':  'Number, Decimal Digit',
-      'Nl':  'Number, Letter',
-      'No':  'Number, Other',
-      'Pc':  'Punctuation, Connector',
-      'Pd':  'Punctuation, Dash',
-      'Ps':  'Punctuation, Open',
-      'Pe':  'Punctuation, Close',
-      'Pi':  'Punctuation, Initial quote',
-      'Pf':  'Punctuation, Final quote',
-      'Po':  'Punctuation, Other',
-      'Sm':  'Symbol, Math',
-      'Sc':  'Symbol, Currency',
-      'Sk':  'Symbol, Modifier',
-      'So':  'Symbol, Other',
-      'Zs':  'Separator, Space',
-      'Zl':  'Separator, Line',
-      'Zp':  'Separator, Paragraph',
-      'Cc':  'Other, Control',
-      'Cf':  'Other, Format',
-      'Cs':  'Other, Surrogate',
-      'Co':  'Other, Private Use',
-      'Cn':  'Other, Not Assigned',
-}
-
-bidi_category = {
-     'L'   : 'Left-to-Right',
-     'LRE' : 'Left-to-Right Embedding',
-     'LRO' : 'Left-to-Right Override',
-     'R'   : 'Right-to-Left',
-     'AL'  : 'Right-to-Left Arabic',
-     'RLE' : 'Right-to-Left Embedding',
-     'RLO' : 'Right-to-Left Override',
-     'PDF' : 'Pop Directional Format',
-     'EN'  : 'European Number',
-     'ES'  : 'European Number Separator',
-     'ET'  : 'European Number Terminator',
-     'AN'  : 'Arabic Number',
-     'CS'  : 'Common Number Separator',
-     'NSM' : 'Non-Spacing Mark',
-     'BN'  : 'Boundary Neutral',
-     'B'   : 'Paragraph Separator',
-     'S'   : 'Segment Separator',
-     'WS'  : 'Whitespace',
-     'ON'  : 'Other Neutrals',
-}
-
-comb_classes = {
-        0: 'Spacing, split, enclosing, reordrant, and Tibetan subjoined',
-        1: 'Overlays and interior',
-        7: 'Nuktas',
-        8: 'Hiragana/Katakana voicing marks',
-        9: 'Viramas',
-       10: 'Start of fixed position classes',
-      199: 'End of fixed position classes',
-      200: 'Below left attached',
-      202: 'Below attached',
-      204: 'Below right attached',
-      208: 'Left attached (reordrant around single base character)',
-      210: 'Right attached',
-      212: 'Above left attached',
-      214: 'Above attached',
-      216: 'Above right attached',
-      218: 'Below left',
-      220: 'Below',
-      222: 'Below right',
-      224: 'Left (reordrant around single base character)',
-      226: 'Right',
-      228: 'Above left',
-      230: 'Above',
-      232: 'Above right',
-      233: 'Double below',
-      234: 'Double above',
-      240: 'Below (iota subscript)',
-}
-
-
-
-def get_unicode_properties(ch):
-    properties = {}
-    if ch in linecache:
-        fields = linecache[ch].strip().split(';')
-        proplist = ['codepoint', 'name', 'category', 'combining', 'bidi', 'decomposition', 'dummy', 'digit_value', 'numeric_value', 'mirrored', 'unicode1name', 'iso_comment', 'uppercase', 'lowercase', 'titlecase']
-        for i, prop in enumerate(proplist):
-            if prop!='dummy':
-                properties[prop] = fields[i]
-
-        if properties['lowercase']:
-            properties['lowercase'] = unichr(int(properties['lowercase'], 16))
-        if properties['uppercase']:
-            properties['uppercase'] = unichr(int(properties['uppercase'], 16))
-        if properties['titlecase']:
-            properties['titlecase'] = unichr(int(properties['titlecase'], 16))
-
-        properties['combining'] = int(properties['combining'])
-        properties['mirrored'] = properties['mirrored']=='Y'
-    else:
-        properties['codepoint'] = '%04X' % ord(ch)
-        properties['name'] = unicodedata.name(ch, '')
-        properties['category'] = unicodedata.category(ch)
-        properties['combining'] = unicodedata.combining(ch)
-        properties['bidi'] = unicodedata.bidirectional(ch)
-        properties['decomposition'] = unicodedata.decomposition(ch)
-        properties['digit_value'] = unicodedata.digit(ch, '')
-        properties['numeric_value'] = unicodedata.numeric(ch, '')
-        properties['mirrored'] = unicodedata.mirrored(ch)
-        properties['unicode1name'] = ''
-        properties['iso_comment'] = ''
-        properties['uppercase'] = ch.upper()
-        properties['lowercase'] = ch.lower()
-        properties['titlecase'] = ''
-    return properties
-
-
-def do_init():
-    HomeDir = os.path.expanduser('~/.unicode')
-    HomeUnicodeData = os.path.join(HomeDir, "UnicodeData.txt")
-    global UnicodeDataFileNames
-    UnicodeDataFileNames = [HomeUnicodeData, '/usr/share/unicode/UnicodeData.txt', '/usr/share/unidata/UnicodeData.txt', '/hackenv/bin/UnicodeData.txt'] + \
-        glob.glob('/usr/share/unidata/UnicodeData*.txt') + \
-        glob.glob('/usr/share/perl/*/unicore/UnicodeData.txt') + \
-        glob.glob('/System/Library/Perl/*/unicore/UnicodeData.txt') # for MacOSX
-
-    HomeUnihanData = os.path.join(HomeDir, "Unihan*")
-    global UnihanDataGlobs
-    UnihanDataGlobs = [HomeUnihanData, '/usr/share/unidata/Unihan*', '/usr/share/unicode/Unihan*', './Unihan*']
-
-
-def get_unihan_files():
-    fos = [] # list of file names for Unihan data file(s)
-    for gl in UnihanDataGlobs:
-        fnames = glob.glob(gl)
-        fos += fnames
-    return fos
-
-def get_unihan_properties_internal(ch):
-    properties = {}
-    ch = ord(ch)
-    global unihan_fs
-    for f in unihan_fs:
-        fo = OpenGzip(f)
-        for l in fo:
-            if l.startswith('#'):
-                continue
-            line = l.strip()
-            if not line:
-                continue
-            char, key, value = line.strip().split('\t')
-            if int(char[2:], 16) == ch:
-                properties[key] = unicode(value, 'utf-8')
-            elif int(char[2:], 16)>ch:
-                break
-    return properties
-
-def get_unihan_properties_zgrep(ch):
-    properties = {}
-    global unihan_fs
-    ch = ord(ch)
-    chs = 'U+%X' % ch
-    for f in unihan_fs:
-        if f.endswith('.gz'):
-            grepcmd = 'zgrep'
-        elif f.endswith('.bz2'):
-            grepcmd = 'bzgrep'
-        else:
-            grepcmd = 'grep'
-        cmdline = grepcmd+' ^'+chs+r'\\b '+f
-        status, output = cmd.getstatusoutput(cmdline)
-        output = output.split('\n')
-        for l in output:
-            if not l:
-                continue
-            char, key, value = l.strip().split('\t')
-            if int(char[2:], 16) == ch:
-                if PY3:
-                    properties[key] = value
-                else:
-                    properties[key] = unicode(value, 'utf-8')
-            elif int(char[2:], 16)>ch:
-                break
-    return properties
-
-# basic sanity check, if e.g. you run this on MS Windows...
-if os.path.exists('/bin/grep'):
-    get_unihan_properties = get_unihan_properties_zgrep
-else:
-    get_unihan_properties = get_unihan_properties_internal
-
-
-def error(txt):
-    out(txt)
-    out('\n')
-    sys.exit(1)
-
-def get_gzip_filename(fname):
-    "return fname, if it does not exist, return fname+.gz, if neither that, fname+bz2, if neither that, return None"
-    if os.path.exists(fname):
-        return fname
-    if os.path.exists(fname+'.gz'):
-        return fname+'.gz'
-    if os.path.exists(fname+'.bz2') and bz2 is not None:
-        return fname+'.bz2'
-    return None
-
-
-def OpenGzip(fname):
-    "open fname, try fname.gz or fname.bz2 if fname does not exist, return file object or GzipFile or BZ2File object"
-    if os.path.exists(fname) and not (fname.endswith('.gz') or fname.endswith('.bz2')):
-        return open(fname)
-    if os.path.exists(fname+'.gz'):
-        fname = fname+'.gz'
-    elif os.path.exists(fname+'.bz2') and bz2 is not None:
-        fname = fname+'.bz2'
-    if fname.endswith('.gz'):
-        return gzip.GzipFile(fname)
-    elif fname.endswith('.bz2'):
-        return bz2.BZ2File(fname)
-    return None
-
-def GrepInNames(pattern, fillcache=False):
-    p = re.compile(pattern, re.I)
-    f = None
-    for name in UnicodeDataFileNames:
-        f = OpenGzip(name)
-        if f != None:
-            break
-    if not fillcache:
-        if not f:
-            out( """
-Cannot find UnicodeData.txt, please place it into 
-/usr/share/unidata/UnicodeData.txt,
-/usr/share/unicode/UnicodeData.txt, ~/.unicode/ or current 
-working directory (optionally you can gzip it).
-Without the file, searching will be much slower.
-
-""" )
-            for i in xrange(sys.maxunicode):
-                try:
-                    name = unicodedata.name(unichr(i))
-                    if re.search(p, name):
-                        yield myunichr(i)
-                except ValueError:
-                    pass
-        else:
-            for l in f:
-                if re.search(p, l):
-                    r = myunichr(int(l.split(';')[0], 16))
-                    linecache[r] = l
-                    yield r
-            f.close()
-    else:
-        if f:
-            for l in f:
-                if re.search(p, l):
-                    r = myunichr(int(l.split(';')[0], 16))
-                    linecache[r] = l
-            f.close()
- 
-
-def valfromcp(n, cp=None):
-    "if fromcp is defined, then the 'n' is considered to be from that codepage and is converted accordingly"
-    if cp:
-        xh = '%x' %n
-        if len(xh) % 2: # pad hexadecimal representation with a zero
-            xh = '0'+xh
-        cps = ( [xh[i:i+2] for i in range(0,len(xh),2)] )
-        cps = ( chr(int(i, 16)) for i in cps)
-        cps = ''.join(cps)
-        """
-        if 0 <= n <= 255:
-            s = chr(n)
-        elif 256 <= n <= 65535:
-            s = struct.pack('>H', n)
-        elif 65536 <= n <= sys.maxint:
-            s = struct.pack('>H', n)
-        else: # bad character code, either negative or too big
-            raise ValueError("Bad character code %s" %n)
-        print 'ee',`s`
-        n = unicode(s, cp)
-        """
-        s = unicode(cps, cp)
-        ns = [ord(x) for x in s]
-        return ns
-    else:
-        return [n]
-
-def myunichr(n):
-    try:
-        r = unichr(n)
-        return r
-    except OverflowError:
-        traceback.print_exc()
-        error("The codepoint is too big - it does not fit into an int.")
-    except ValueError:
-        traceback.print_exc()
-        err = "The codepoint is too big."
-        if sys.maxunicode <= 0xffff:
-            err += "\nPerhaps your python interpreter is not compiled with wide unicode characters."
-        error(err)
-
-
-def guesstype(arg):
-    if not arg: # empty string
-        return 'empty string', arg
-    elif not is_ascii(arg):
-        return 'string', arg
-    elif arg[:2]=='U+' or arg[:2]=='u+': # it is hexadecimal number
-        try:
-            val = int(arg[2:], 16)
-            if val>sys.maxunicode:
-                return 'regexp', arg
-            else:
-                return 'hexadecimal', arg[2:]
-        except ValueError:
-            return 'regexp', arg
-    elif arg[0] in "Uu" and len(arg)>4:
-        try:
-            val = int(arg[1:], 16)
-            if val>sys.maxunicode:
-                return 'regexp', arg
-            else:
-                return 'hexadecimal', arg
-        except ValueError:
-            return 'regexp', arg
-    elif len(arg)>=4:
-        if len(arg) in (8, 16, 24, 32):
-            if all(x in '01' for x in arg):
-                val = int(arg, 2)
-                if val<=sys.maxunicode:
-                    return 'binary', arg
-        try:
-            val = int(arg, 16)
-            if val>sys.maxunicode:
-                return 'regexp', arg
-            else:
-                return 'hexadecimal', arg
-        except ValueError:
-            return 'regexp', arg
-    else:
-        return 'string', arg
-
-def process(arglist, t, fromcp=None):
-    # build a list of values, so that we can combine queries like
-    # LATIN ALPHA and search for LATIN.*ALPHA and not names that
-    # contain either LATIN or ALPHA
-    result = []
-    names_query = [] # reserved for queries in names - i.e. -r
-    for arg_i in arglist:
-        if t==None:
-            tp, arg = guesstype(arg_i)
-            if tp == 'regexp':
-                # if the first argument is guessed to be a regexp, add
-                # all the following arguments to the regular expression -
-                # this is probably what you wanted, e.g. 
-                # 'unicode cyrillic be' will now search for the 'cyrillic.*be' regular expression
-                t = 'regexp'
-        else:
-            tp, arg = t, arg_i
-        if tp=='hexadecimal':
-            val = int(arg, 16)
-            vals = valfromcp(val, fromcp)
-            for val in vals:
-                r = myunichr(val)
-                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
-                result.append(r)
-        elif tp=='decimal':
-            val = int(arg, 10)
-            vals = valfromcp(val, fromcp)
-            for val in vals:
-                r = myunichr(val)
-                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
-                result.append(r)
-        elif tp=='octal':
-            val = int(arg, 8)
-            vals = valfromcp(val, fromcp)
-            for val in vals:
-                r = myunichr(val)
-                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
-                result.append(r)
-        elif tp=='binary':
-            val = int(arg, 2)
-            vals = valfromcp(val, fromcp)
-            for val in vals:
-                r = myunichr(val)
-                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
-                result.append(r)
-        elif tp=='regexp':
-            names_query.append(arg)
-        elif tp=='string':
-            try:
-                if PY3: # argv is automatically decoded into unicode; undecodable bytes are replaced with bogus (surrogate) characters
-                    unirepr = arg
-                else:
-                    unirepr = unicode(arg, options.iocharset)
-            except UnicodeDecodeError:
-                error ("Sequence %s is not valid in charset '%s'." % (repr(arg),  options.iocharset))
-            unilist = ['%04X'%ord(x) for x in unirepr]
-            unireg = '|'.join(unilist)
-            list(GrepInNames(unireg, fillcache=True))
-            for r in unirepr:
-                result.append(r)
-        elif tp=='empty string':
-            pass # do not do anything for an empty string
-    if names_query:
-        query = '.*'.join(names_query)
-        for r in GrepInNames(query):
-            result.append(r)
-    return result
-
-def maybe_colours(colour):
-    if use_colour:
-        return colours[colour]
-    else:
-        return ""
-
-# format key and value
-def printkv(*l):
-    for i in range(0, len(l), 2):
-        if i<len(l)-2:
-            sep = "  "
-        else:
-            sep = "\n"
-        k, v = l[i], l[i+1]
-        out(maybe_colours('green'))
-        out(k)
-        out(": ")
-        out(maybe_colours('default'))
-        out(unicode(v))
-        out(sep)
-
-def print_characters(clist, maxcount, query_wiki=0):
-    """query_wiki - 0 - don't
-                    1 - spawn browser
-    """
-    counter = 0
-    for c in clist:
-
-        if query_wiki:
-            ch = urllib.quote(c.encode('utf-8')) # wikipedia uses UTF-8 in names
-            wiki_url = 'http://en.wikipedia.org/wiki/'+ch
-            webbrowser.open(wiki_url)
-            query_wiki = 0 # query only the very first character
-
-
-        if maxcount:
-            counter += 1
-        if counter > options.maxcount:
-            out("\nToo many characters to display, more than %s, use --max option to change it\n" % options.maxcount)
-            return
-        properties = get_unicode_properties(c)
-        out(maybe_colours('bold'))
-        out('U+%04X '% ord(c)) 
-        if properties['name']:
-            out(properties['name'])
-        else:
-            out(maybe_colours('default'))
-            out(" - No such unicode character name in database")
-        out(maybe_colours('default'))
-        out('\n')
-
-        ar = ["UTF-8", ' '.join([("%02x" % ord23(x)) for x in c.encode('utf-8')]) ,
-              "UTF-16BE", ''.join([("%02x" % ord23(x)) for x in c.encode('utf-16be')]),
-              "Decimal", "&#%s;" % ord(c) ]
-        if options.addcharset:
-            try:
-                rep = ' '.join([("%02x" % ord(x)) for x in c.encode(options.addcharset)] )
-            except UnicodeError:
-                rep = "NONE"
-            ar.extend( [options.addcharset, rep] )
-        printkv(*ar)
-
-
-        if properties['combining']:
-            pc = " "+c
-        else:
-            pc = c
-        out(pc)
-        uppercase = properties['uppercase']
-        lowercase = properties['lowercase']
-        if uppercase:
-            out(" (%s)" % uppercase)
-            out('\n')
-            printkv( "Uppercase", 'U+%04X'% ord(properties['uppercase']) )
-        elif lowercase:
-            out(" (%s)" % properties['lowercase'])
-            out('\n')
-            printkv( "Lowercase", 'U+%04X'% ord(properties['lowercase']) )
-        else:
-            out('\n')
-        printkv( 'Category', properties['category']+ " (%s)" % general_category[properties['category']] )
-
-        if properties['numeric_value']:
-            printkv( 'Numeric value',  properties['numeric_value'])
-        if properties['digit_value']:
-            printkv( 'Digit value',  properties['digit_value'])
-
-        bidi = properties['bidi']
-        if bidi:
-            printkv( 'Bidi', bidi+ " (%s)" % bidi_category[bidi] )
-        mirrored = properties['mirrored']
-        if mirrored:
-            out('Character is mirrored\n')
-        comb = properties['combining']
-        if comb:
-            printkv( 'Combining', str(comb)+ " (%s)" % (comb_classes.get(comb, '?')) )
-        decomp = properties['decomposition']
-        if decomp:
-            printkv( 'Decomposition', decomp )
-        if options.verbosity>0:
-            uhp = get_unihan_properties(c)
-            for key in uhp:
-                printkv(key, uhp[key])
-        out('\n')
-
-
-def print_block(block):
-    #header
-    out(" "*10)
-    for i in range(16):
-        out(".%X " % i)
-    out('\n')
-    #body
-    for i in range(block*16, block*16+16):
-        hexi = "%X" % i
-        if len(hexi)>3:
-            hexi = "%07X" % i
-            hexi = hexi[:4]+" "+hexi[4:]
-        else:
-            hexi = "     %03X" % i
-        out(LTR+hexi+".  ")
-        for j in range(16):
-            c = unichr(i*16+j)
-            if unicodedata.combining(c):
-                c = " "+c
-            out(c)
-            out('  ')
-        out('\n')
-    out('\n')
-
-def print_blocks(blocks):
-    for block in blocks:
-        print_block(block)
-
-def is_range(s, typ):
-    sp = s.split('..')
-    if len(sp)!=2:
-        return False
-    if not sp[1]:
-        sp[1] = sp[0]
-    elif not sp[0]:
-        sp[0] = sp[1]
-    if not sp[0]:
-        return False
-    low = list(process([sp[0]], typ)) # intentionally no fromcp here, ranges are only of unicode characters
-    high = list(process([sp[1]], typ))
-    if len(low)!=1 or len(high)!=1:
-        return False
-    low = ord(low[0])
-    high = ord(high[0])
-    low = low // 256
-    high = high // 256 + 1
-    return range(low, high)
-
-
-
-parser = OptionParser(usage="usage: %prog [options] arg")
-parser.add_option("-x", "--hexadecimal",
-      action="store_const", const='hexadecimal', dest="type", 
-      help="Assume arg to be hexadecimal number")
-parser.add_option("-o", "--octal",
-      action="store_const", const='octal', dest="type", 
-      help="Assume arg to be octal number")
-parser.add_option("-b", "--binary",
-      action="store_const", const='binary', dest="type", 
-      help="Assume arg to be binary number")
-parser.add_option("-d", "--decimal",
-      action="store_const", const='decimal', dest="type",
-      help="Assume arg to be decimal number")
-parser.add_option("-r", "--regexp",
-      action="store_const", const='regexp', dest="type",
-      help="Assume arg to be regular expression")
-parser.add_option("-s", "--string",
-      action="store_const", const='string', dest="type",
-      help="Assume arg to be a sequence of characters")
-parser.add_option("-a", "--auto",
-      action="store_const", const=None, dest="type",
-      help="Try to guess arg type (default)")
-parser.add_option("-m", "--max",
-      action="store", default=10, dest="maxcount", type="int",
-      help="Maximal number of codepoints to display, default: 10; 0=unlimited")
-parser.add_option("-i", "--io",
-      action="store", default=iocharsetguess, dest="iocharset", type="string",
-      help="I/O character set, I am guessing %s" % iocharsetguess)
-parser.add_option("--fcp", "--fromcp",
-      action="store", default='', dest="fromcp", type="string",
-      help="Convert numerical arguments from this encoding, default: no conversion")
-parser.add_option("-c", "--charset-add",
-      action="store", dest="addcharset", type="string",
-      help="Show hexadecimal reprezentation in this additional charset")
-parser.add_option("-C", "--colour",
-      action="store", dest="use_colour", type="string",
-      default="auto",
-      help="Use colours, on, off or auto")
-parser.add_option('', "--color",
-      action="store", dest="use_colour", type="string",
-      default="auto",
-      help="synonym for --colour")
-parser.add_option("-v", "--verbose",
-      action="count", dest="verbosity",
-      default=0,
-      help="Increase verbosity (reads Unihan properties - slow!)")
-parser.add_option("-w", "--wikipedia",
-      action="count", dest="query_wiki",
-      default=0,
-      help="Query wikipedia for the character")
-parser.add_option("--list",
-      action="store_const", dest="list_all_encodings",
-      const=True,
-      help="List (approximately) all known encodings")
-
-
-(options, arguments) = parser.parse_args()
-
-linecache = {}
-do_init()
-
-
-if options.list_all_encodings:
-    all_encodings = os.listdir(os.path.dirname(encodings.__file__))
-    all_encodings = set([os.path.splitext(x)[0] for x in all_encodings])
-    all_encodings = list(all_encodings)
-    all_encodings.sort()
-    print (textwrap.fill(' '.join(all_encodings)))
-    sys.exit()
-
-if len(arguments)==0:
-    parser.print_help()
-    sys.exit()
-
-
-if options.use_colour.lower() in ("on", "1", "true", "yes"):
-    use_colour = True
-elif options.use_colour.lower() in ("off", "0", "false", "no"):
-    use_colour = False
-else:
-    use_colour = sys.stdout.isatty()
-    if sys.platform == 'win32':
-        use_colour = False
-
-
-l_args = [] # list of non range arguments to process
-for argum in arguments:
-    is_r = is_range(argum, options.type)
-    if is_r:
-        print_blocks(is_r)
-    else:
-        l_args.append(argum)
-
-if l_args:
-    unihan_fs = []
-    if options.verbosity>0:
-        unihan_fs = get_unihan_files() # list of file names for Unihan data file(s), empty if not available
-        if not unihan_fs:
-            out( """
-Unihan_*.txt files not found. In order to view Unihan properties,
-please place the files into /usr/share/unidata/,
-/usr/share/unicode/, ~/.unicode/
-or the current working directory (optionally you can gzip or bzip2 them).
-You can get the files by unpacking ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
-Warning: listing Unihan properties is rather slow.
-
-""")
-            options.verbosity = 0
-    try:
-        print_characters(process(l_args, options.type, options.fromcp), options.maxcount, options.query_wiki)
-    except IOError: # e.g. broken pipe
-        pass
-
+    print u''.join(map(l, sys.argv[1:])).encode('utf-8')
+except KeyError:
+    os.execvp("multicode", sys.argv[1:])
--- a/bin/unicode.old	Fri Apr 25 03:31:28 2014 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-import re
-import sys
-import unicodedata
-def l(c): m = re.match('(?:U[+])?([0-9a-f]{1,5})$', c, re.I); return unicodedata.lookup(c) if m is None else unichr(int(m.group(1),16))
-try:
-    print u''.join(map(l, sys.argv[1:])).encode('utf-8')
-except KeyError:
-    print u'Unknown character.'