Mercurial > repo
view bin/multicode @ 9994:d8734db325b9
<hppavilion[1]> le/rn Rogue One//Any regular who gives the slightest Rogue One spoiler shall be hunted down in real life and have their intestines removed through their eye sockets. Members would not be exempt if they existed, which they don't.
author | HackBot |
---|---|
date | Sat, 17 Dec 2016 23:40:13 +0000 |
parents | c989a1669243 |
children |
line wrap: on
line source
#!/usr/bin/python
"""Command line utility to display information about unicode characters.

Arguments may be characters, codepoint numbers (hex/decimal/octal/binary),
regular expressions matched against character names, or codepoint ranges
(``low..high``) which print whole 256-character blocks.  Works on both
Python 2 and Python 3.
"""

import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings
import urllib, webbrowser, textwrap

# bz2 was introduced in 2.3, we want this to work also with earlier versions
try:
    import bz2
except ImportError:
    bz2 = None

# for python3: 'unicode' the type is gone, str is always unicode
try:
    unicode
except NameError:
    unicode = str

# 'any' and 'all' were introduced in python2.5
# dummy replacement for older versions (degrades binary-number detection only)
try:
    all
except NameError:
    all = lambda x: False

PY3 = sys.version_info[0] >= 3

if PY3:
    import subprocess as cmd
    import urllib.parse
    # FIX: Python 3 has no 'xrange'; GrepInNames() falls back to
    # xrange(sys.maxunicode) when UnicodeData.txt is missing and would
    # otherwise die with NameError.
    xrange = range
    # FIX: urllib.quote moved to urllib.parse.quote in Python 3;
    # print_characters() needs it for the wikipedia URL (-w option).
    urlquote = urllib.parse.quote

    def is_ascii(s):
        "test if string s consists completely of ascii characters (python 3)"
        try:
            s.encode('ascii')
        except UnicodeEncodeError:
            return False
        return True

    def out(*args):
        "print args, converting them to output charset"
        for i in args:
            sys.stdout.flush()
            sys.stdout.buffer.write(i.encode(options.iocharset, 'replace'))

    # ord23 is used to convert elements of byte array in python3, which are integers
    ord23 = lambda x: x
    # unichr is not in python3
    unichr = chr
else: # python2
    # getoutput() and getstatusoutput() methods have
    # been moved from commands to the subprocess module
    # with Python >= 3.x
    import commands as cmd
    urlquote = urllib.quote

    def is_ascii(s):
        "test if string s consists completely of ascii characters (python 2)"
        try:
            unicode(s, 'ascii')
        except UnicodeDecodeError:
            return False
        return True

    def out(*args):
        "print args, converting them to output charset"
        for i in args:
            sys.stdout.write(i.encode(options.iocharset, 'replace'))

    ord23 = ord

from optparse import OptionParser

VERSION = '0.9.7'

# list of terminals that support bidi
biditerms = ['mlterm']

try:
    locale.setlocale(locale.LC_ALL, '')
except locale.Error:
    pass

# guess terminal charset
try:
    iocharsetguess = locale.nl_langinfo(locale.CODESET) or "ascii"
except locale.Error:
    iocharsetguess = "ascii"

if os.environ.get('TERM') in biditerms and iocharsetguess.lower().startswith('utf'):
    LTR = u'\u202d' # left to right override
else:
    LTR = ''

# ANSI escape sequences, looked up by maybe_colours()
colours = {
    'none'       : "",
    'default'    : "\033[0m",
    'bold'       : "\033[1m",
    'underline'  : "\033[4m",
    'blink'      : "\033[5m",
    'reverse'    : "\033[7m",
    'concealed'  : "\033[8m",
    'black'      : "\033[30m",
    'red'        : "\033[31m",
    'green'      : "\033[32m",
    'yellow'     : "\033[33m",
    'blue'       : "\033[34m",
    'magenta'    : "\033[35m",
    'cyan'       : "\033[36m",
    'white'      : "\033[37m",
    'on_black'   : "\033[40m",
    'on_red'     : "\033[41m",
    'on_green'   : "\033[42m",
    'on_yellow'  : "\033[43m",
    'on_blue'    : "\033[44m",
    'on_magenta' : "\033[45m",
    'on_cyan'    : "\033[46m",
    'on_white'   : "\033[47m",
    'beep'       : "\007",
}

# human readable expansions of the two-letter Unicode general categories
general_category = {
    'Lu': 'Letter, Uppercase',
    'Ll': 'Letter, Lowercase',
    'Lt': 'Letter, Titlecase',
    'Lm': 'Letter, Modifier',
    'Lo': 'Letter, Other',
    'Mn': 'Mark, Non-Spacing',
    'Mc': 'Mark, Spacing Combining',
    'Me': 'Mark, Enclosing',
    'Nd': 'Number, Decimal Digit',
    'Nl': 'Number, Letter',
    'No': 'Number, Other',
    'Pc': 'Punctuation, Connector',
    'Pd': 'Punctuation, Dash',
    'Ps': 'Punctuation, Open',
    'Pe': 'Punctuation, Close',
    'Pi': 'Punctuation, Initial quote',
    'Pf': 'Punctuation, Final quote',
    'Po': 'Punctuation, Other',
    'Sm': 'Symbol, Math',
    'Sc': 'Symbol, Currency',
    'Sk': 'Symbol, Modifier',
    'So': 'Symbol, Other',
    'Zs': 'Separator, Space',
    'Zl': 'Separator, Line',
    'Zp': 'Separator, Paragraph',
    'Cc': 'Other, Control',
    'Cf': 'Other, Format',
    'Cs': 'Other, Surrogate',
    'Co': 'Other, Private Use',
    'Cn': 'Other, Not Assigned',
}

# human readable expansions of the bidirectional category codes
bidi_category = {
    'L'   : 'Left-to-Right',
    'LRE' : 'Left-to-Right Embedding',
    'LRO' : 'Left-to-Right Override',
    'R'   : 'Right-to-Left',
    'AL'  : 'Right-to-Left Arabic',
    'RLE' : 'Right-to-Left Embedding',
    'RLO' : 'Right-to-Left Override',
    'PDF' : 'Pop Directional Format',
    'EN'  : 'European Number',
    'ES'  : 'European Number Separator',
    'ET'  : 'European Number Terminator',
    'AN'  : 'Arabic Number',
    'CS'  : 'Common Number Separator',
    'NSM' : 'Non-Spacing Mark',
    'BN'  : 'Boundary Neutral',
    'B'   : 'Paragraph Separator',
    'S'   : 'Segment Separator',
    'WS'  : 'Whitespace',
    'ON'  : 'Other Neutrals',
}

# canonical combining class numbers -> descriptions
comb_classes = {
    0:   'Spacing, split, enclosing, reordrant, and Tibetan subjoined',
    1:   'Overlays and interior',
    7:   'Nuktas',
    8:   'Hiragana/Katakana voicing marks',
    9:   'Viramas',
    10:  'Start of fixed position classes',
    199: 'End of fixed position classes',
    200: 'Below left attached',
    202: 'Below attached',
    204: 'Below right attached',
    208: 'Left attached (reordrant around single base character)',
    210: 'Right attached',
    212: 'Above left attached',
    214: 'Above attached',
    216: 'Above right attached',
    218: 'Below left',
    220: 'Below',
    222: 'Below right',
    224: 'Left (reordrant around single base character)',
    226: 'Right',
    228: 'Above left',
    230: 'Above',
    232: 'Above right',
    233: 'Double below',
    234: 'Double above',
    240: 'Below (iota subscript)',
}


def get_unicode_properties(ch):
    """Return a dict of properties for the single character ch.

    Prefers the cached UnicodeData.txt line (filled by GrepInNames),
    falling back to the unicodedata module otherwise.
    """
    properties = {}
    if ch in linecache:
        # parse the semicolon-separated UnicodeData.txt record
        fields = linecache[ch].strip().split(';')
        proplist = ['codepoint', 'name', 'category', 'combining', 'bidi',
                    'decomposition', 'dummy', 'digit_value', 'numeric_value',
                    'mirrored', 'unicode1name', 'iso_comment', 'uppercase',
                    'lowercase', 'titlecase']
        for i, prop in enumerate(proplist):
            if prop != 'dummy':
                properties[prop] = fields[i]
        # case mappings are stored as hex codepoints; convert to characters
        if properties['lowercase']:
            properties['lowercase'] = unichr(int(properties['lowercase'], 16))
        if properties['uppercase']:
            properties['uppercase'] = unichr(int(properties['uppercase'], 16))
        if properties['titlecase']:
            properties['titlecase'] = unichr(int(properties['titlecase'], 16))
        properties['combining'] = int(properties['combining'])
        properties['mirrored'] = properties['mirrored'] == 'Y'
    else:
        properties['codepoint'] = '%04X' % ord(ch)
        properties['name'] = unicodedata.name(ch, '')
        properties['category'] = unicodedata.category(ch)
        properties['combining'] = unicodedata.combining(ch)
        properties['bidi'] = unicodedata.bidirectional(ch)
        properties['decomposition'] = unicodedata.decomposition(ch)
        properties['digit_value'] = unicodedata.digit(ch, '')
        properties['numeric_value'] = unicodedata.numeric(ch, '')
        properties['mirrored'] = unicodedata.mirrored(ch)
        properties['unicode1name'] = ''
        properties['iso_comment'] = ''
        properties['uppercase'] = ch.upper()
        properties['lowercase'] = ch.lower()
        properties['titlecase'] = ''
    return properties


def do_init():
    "initialise the global lists of candidate UnicodeData/Unihan file locations"
    HomeDir = os.path.expanduser('~/.unicode')
    HomeUnicodeData = os.path.join(HomeDir, "UnicodeData.txt")
    global UnicodeDataFileNames
    UnicodeDataFileNames = [HomeUnicodeData,
                            '/usr/share/unicode/UnicodeData.txt',
                            '/usr/share/unidata/UnicodeData.txt',
                            '/hackenv/share/UnicodeData.txt'] + \
        glob.glob('/usr/share/unidata/UnicodeData*.txt') + \
        glob.glob('/usr/share/perl/*/unicore/UnicodeData.txt') + \
        glob.glob('/System/Library/Perl/*/unicore/UnicodeData.txt') # for MacOSX
    HomeUnihanData = os.path.join(HomeDir, "Unihan*")
    global UnihanDataGlobs
    UnihanDataGlobs = [HomeUnihanData,
                       '/usr/share/unidata/Unihan*',
                       '/usr/share/unicode/Unihan*',
                       './Unihan*']


def get_unihan_files():
    "return list of file names for Unihan data file(s), may be empty"
    fos = []
    for gl in UnihanDataGlobs:
        fnames = glob.glob(gl)
        fos += fnames
    return fos


def get_unihan_properties_internal(ch):
    "scan the Unihan files in pure python; slow fallback when grep is unavailable"
    properties = {}
    ch = ord(ch)
    global unihan_fs
    for f in unihan_fs:
        fo = OpenGzip(f)
        for l in fo:
            if l.startswith('#'):
                continue
            line = l.strip()
            if not line:
                continue
            char, key, value = line.strip().split('\t')
            if int(char[2:], 16) == ch:
                properties[key] = unicode(value, 'utf-8')
            elif int(char[2:], 16) > ch:
                # records are sorted by codepoint, no point reading further
                break
    return properties


def get_unihan_properties_zgrep(ch):
    "look up Unihan properties by shelling out to (z|bz)grep - much faster"
    properties = {}
    global unihan_fs
    ch = ord(ch)
    chs = 'U+%X' % ch
    for f in unihan_fs:
        if f.endswith('.gz'):
            grepcmd = 'zgrep'
        elif f.endswith('.bz2'):
            grepcmd = 'bzgrep'
        else:
            grepcmd = 'grep'
        cmdline = grepcmd+' ^'+chs+r'\\b '+f
        status, output = cmd.getstatusoutput(cmdline)
        output = output.split('\n')
        for l in output:
            if not l:
                continue
            char, key, value = l.strip().split('\t')
            if int(char[2:], 16) == ch:
                if PY3:
                    properties[key] = value
                else:
                    properties[key] = unicode(value, 'utf-8')
            elif int(char[2:], 16) > ch:
                break
    return properties

# basic sanity check, if e.g. you run this on MS Windows...
if os.path.exists('/bin/grep'):
    get_unihan_properties = get_unihan_properties_zgrep
else:
    get_unihan_properties = get_unihan_properties_internal


def error(txt):
    "print txt and terminate with exit status 1"
    out(txt)
    out('\n')
    sys.exit(1)


def get_gzip_filename(fname):
    "return fname, if it does not exist, return fname+.gz, if neither that, fname+bz2, if neither that, return None"
    if os.path.exists(fname):
        return fname
    if os.path.exists(fname+'.gz'):
        return fname+'.gz'
    if os.path.exists(fname+'.bz2') and bz2 is not None:
        return fname+'.bz2'
    return None


def OpenGzip(fname):
    "open fname, try fname.gz or fname.bz2 if fname does not exist, return file object or GzipFile or BZ2File object"
    if os.path.exists(fname) and not (fname.endswith('.gz') or fname.endswith('.bz2')):
        return open(fname)
    if os.path.exists(fname+'.gz'):
        fname = fname+'.gz'
    elif os.path.exists(fname+'.bz2') and bz2 is not None:
        fname = fname+'.bz2'
    if fname.endswith('.gz'):
        return gzip.GzipFile(fname)
    elif fname.endswith('.bz2'):
        return bz2.BZ2File(fname)
    return None


def GrepInNames(pattern, fillcache=False):
    """Yield characters whose UnicodeData.txt line matches pattern (case
    insensitive), caching the matching lines in the global linecache.

    With fillcache=True nothing is yielded; the cache is just populated.
    NOTE(review): under Python 3, GzipFile/BZ2File iteration yields bytes
    while the pattern is str - confirm behaviour with compressed data files.
    """
    p = re.compile(pattern, re.I)
    f = None
    for name in UnicodeDataFileNames:
        f = OpenGzip(name)
        if f != None:
            break
    if not fillcache:
        if not f:
            out( """
Cannot find UnicodeData.txt, please place it into
/usr/share/unidata/UnicodeData.txt,
/usr/share/unicode/UnicodeData.txt, ~/.unicode/
or current working directory (optionally you can gzip it).
Without the file, searching will be much slower.

""" )
            # slow path: enumerate every codepoint via the unicodedata module
            for i in xrange(sys.maxunicode):
                try:
                    name = unicodedata.name(unichr(i))
                    if re.search(p, name):
                        yield myunichr(i)
                except ValueError:
                    pass
        else:
            for l in f:
                if re.search(p, l):
                    r = myunichr(int(l.split(';')[0], 16))
                    linecache[r] = l
                    yield r
            f.close()
    else:
        if f:
            for l in f:
                if re.search(p, l):
                    r = myunichr(int(l.split(';')[0], 16))
                    linecache[r] = l
            f.close()


def valfromcp(n, cp=None):
    "if fromcp is defined, then the 'n' is considered to be from that codepage and is converted accordingly"
    if cp:
        xh = '%x' % n
        if len(xh) % 2: # pad hexadecimal representation with a zero
            xh = '0'+xh
        cps = ( [xh[i:i+2] for i in range(0,len(xh),2)] )
        cps = ( chr(int(i, 16)) for i in cps)
        cps = ''.join(cps)
        """
        if 0 <= n <= 255:
            s = chr(n)
        elif 256 <= n <= 65535:
            s = struct.pack('>H', n)
        elif 65536 <= n <= sys.maxint:
            s = struct.pack('>H', n)
        else: # bad character code, either negative or too big
            raise ValueError("Bad character code %s" %n)
        print 'ee',`s`
        n = unicode(s, cp)
        """
        s = unicode(cps, cp)
        ns = [ord(x) for x in s]
        return ns
    else:
        return [n]


def myunichr(n):
    "like unichr(), but exits with a friendly message when the codepoint is out of range"
    try:
        r = unichr(n)
        return r
    except OverflowError:
        traceback.print_exc()
        error("The codepoint is too big - it does not fit into an int.")
    except ValueError:
        traceback.print_exc()
        err = "The codepoint is too big."
        if sys.maxunicode <= 0xffff:
            err += "\nPerhaps your python interpreter is not compiled with wide unicode characters."
        error(err)


def guesstype(arg):
    """Guess how to interpret the command line argument arg.

    Returns (type, normalised_argument) where type is one of
    'empty string', 'string', 'hexadecimal', 'binary' or 'regexp'.
    """
    if not arg: # empty string
        return 'empty string', arg
    elif not is_ascii(arg):
        return 'string', arg
    elif arg[:2]=='U+' or arg[:2]=='u+': # it is hexadecimal number
        try:
            val = int(arg[2:], 16)
            if val > sys.maxunicode:
                return 'regexp', arg
            else:
                return 'hexadecimal', arg[2:]
        except ValueError:
            return 'regexp', arg
    elif arg[0] in "Uu" and len(arg) > 4:
        try:
            val = int(arg[1:], 16)
            if val > sys.maxunicode:
                return 'regexp', arg
            else:
                # FIX: strip the leading U/u, as the U+ branch does; returning
                # the raw arg made process() crash on int(arg, 16)
                return 'hexadecimal', arg[1:]
        except ValueError:
            return 'regexp', arg
    elif len(arg) >= 4:
        # a run of 8/16/24/32 zeros and ones is taken as a binary number
        if len(arg) in (8, 16, 24, 32):
            if all(x in '01' for x in arg):
                val = int(arg, 2)
                if val <= sys.maxunicode:
                    return 'binary', arg
        try:
            val = int(arg, 16)
            if val > sys.maxunicode:
                return 'regexp', arg
            else:
                return 'hexadecimal', arg
        except ValueError:
            return 'regexp', arg
    else:
        return 'string', arg


def process(arglist, t, fromcp=None):
    """Resolve the command line arguments into a list of characters.

    t is a forced argument type (or None to guess each argument);
    fromcp optionally converts numeric arguments from a legacy codepage.
    """
    # build a list of values, so that we can combine queries like
    # LATIN ALPHA and search for LATIN.*ALPHA and not names that
    # contain either LATIN or ALPHA
    result = []
    names_query = [] # reserved for queries in names - i.e. -r
    for arg_i in arglist:
        if t == None:
            tp, arg = guesstype(arg_i)
            if tp == 'regexp':
                # if the first argument is guessed to be a regexp, add
                # all the following arguments to the regular expression -
                # this is probably what you wanted, e.g.
                # 'unicode cyrillic be' will now search for the 'cyrillic.*be' regular expression
                t = 'regexp'
        else:
            tp, arg = t, arg_i
        if tp == 'hexadecimal':
            val = int(arg, 16)
            vals = valfromcp(val, fromcp)
            for val in vals:
                r = myunichr(val)
                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
                result.append(r)
        elif tp == 'decimal':
            val = int(arg, 10)
            vals = valfromcp(val, fromcp)
            for val in vals:
                r = myunichr(val)
                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
                result.append(r)
        elif tp == 'octal':
            val = int(arg, 8)
            vals = valfromcp(val, fromcp)
            for val in vals:
                r = myunichr(val)
                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
                result.append(r)
        elif tp == 'binary':
            val = int(arg, 2)
            vals = valfromcp(val, fromcp)
            for val in vals:
                r = myunichr(val)
                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
                result.append(r)
        elif tp == 'regexp':
            names_query.append(arg)
        elif tp == 'string':
            try:
                if PY3:
                    # argv is automatically decoded into unicode, even padded with bogus character if it is not encodable
                    unirepr = arg
                else:
                    unirepr = unicode(arg, options.iocharset)
            except UnicodeDecodeError:
                error ("Sequence %s is not valid in charset '%s'."
                       % (repr(arg), options.iocharset))
            unilist = ['%04X'%ord(x) for x in unirepr]
            unireg = '|'.join(unilist)
            list(GrepInNames(unireg, fillcache=True))
            for r in unirepr:
                result.append(r)
        elif tp == 'empty string':
            pass # do not do anything for an empty string
    if names_query:
        query = '.*'.join(names_query)
        for r in GrepInNames(query):
            result.append(r)
    return result


def maybe_colours(colour):
    "return the ANSI sequence for colour, or '' when colour output is disabled"
    if use_colour:
        return colours[colour]
    else:
        return ""


# format key and value
def printkv(*l):
    "print alternating key/value arguments, keys in green; last pair ends the line"
    for i in range(0, len(l), 2):
        if i < len(l)-2:
            sep = "  "
        else:
            sep = "\n"
        k, v = l[i], l[i+1]
        out(maybe_colours('green'))
        out(k)
        out(": ")
        out(maybe_colours('default'))
        out(unicode(v))
        out(sep)


def print_characters(clist, maxcount, query_wiki=0):
    """query_wiki - 0 - don't
                    1 - spawn browser
    """
    counter = 0
    for c in clist:
        if query_wiki:
            ch = urlquote(c.encode('utf-8')) # wikipedia uses UTF-8 in names
            wiki_url = 'http://en.wikipedia.org/wiki/'+ch
            webbrowser.open(wiki_url)
            query_wiki = 0 # query only the very first character
        if maxcount:
            counter += 1
            if counter > options.maxcount:
                out("\nToo many characters to display, more than %s, use --max option to change it\n" % options.maxcount)
                return
        properties = get_unicode_properties(c)
        out(maybe_colours('bold'))
        out('U+%04X '% ord(c))
        if properties['name']:
            out(properties['name'])
        else:
            out(maybe_colours('default'))
            out(" - No such unicode character name in database")
        out(maybe_colours('default'))
        out('\n')
        ar = ["UTF-8", ' '.join([("%02x" % ord23(x)) for x in c.encode('utf-8')]),
              "UTF-16BE", ''.join([("%02x" % ord23(x)) for x in c.encode('utf-16be')]),
              "Decimal", "&#%s;" % ord(c)]
        if options.addcharset:
            try:
                rep = ' '.join([("%02x" % ord(x)) for x in c.encode(options.addcharset)])
            except UnicodeError:
                rep = "NONE"
            ar.extend([options.addcharset, rep])
        printkv(*ar)
        # prepend a space so a combining mark has something to attach to
        if properties['combining']:
            pc = " "+c
        else:
            pc = c
        out(pc)
        uppercase = properties['uppercase']
        lowercase = properties['lowercase']
        if uppercase:
            out(" (%s)" % uppercase)
            out('\n')
            printkv( "Uppercase", 'U+%04X'% ord(properties['uppercase']) )
        elif lowercase:
            out(" (%s)" % properties['lowercase'])
            out('\n')
            printkv( "Lowercase", 'U+%04X'% ord(properties['lowercase']) )
        else:
            out('\n')
        printkv( 'Category', properties['category']+ " (%s)" % general_category[properties['category']] )
        if properties['numeric_value']:
            printkv( 'Numeric value', properties['numeric_value'])
        if properties['digit_value']:
            printkv( 'Digit value', properties['digit_value'])
        bidi = properties['bidi']
        if bidi:
            printkv( 'Bidi', bidi+ " (%s)" % bidi_category[bidi] )
        mirrored = properties['mirrored']
        if mirrored:
            out('Character is mirrored\n')
        comb = properties['combining']
        if comb:
            printkv( 'Combining', str(comb)+ " (%s)" % (comb_classes.get(comb, '?')) )
        decomp = properties['decomposition']
        if decomp:
            printkv( 'Decomposition', decomp )
        if options.verbosity > 0:
            uhp = get_unihan_properties(c)
            for key in uhp:
                printkv(key, uhp[key])
        out('\n')


def print_block(block):
    "print a 16x16 block of characters, block is the codepoint divided by 256"
    #header
    out(" "*10)
    for i in range(16):
        out(".%X " % i)
    out('\n')
    #body
    for i in range(block*16, block*16+16):
        hexi = "%X" % i
        if len(hexi) > 3:
            hexi = "%07X" % i
            hexi = hexi[:4]+" "+hexi[4:]
        else:
            hexi = " %03X" % i
        out(LTR+hexi+". ")
        for j in range(16):
            c = unichr(i*16+j)
            if unicodedata.combining(c):
                c = " "+c
            out(c)
            out(' ')
        out('\n')
    out('\n')


def print_blocks(blocks):
    "print each 256-character block in blocks"
    for block in blocks:
        print_block(block)


def is_range(s, typ):
    """If s is a 'low..high' range, return the range of 256-char blocks it
    spans (a range object / list of ints); otherwise return False."""
    sp = s.split('..')
    if len(sp) != 2:
        return False
    if not sp[1]:
        sp[1] = sp[0]
    elif not sp[0]:
        sp[0] = sp[1]
    if not sp[0]:
        return False
    low = list(process([sp[0]], typ)) # intentionally no fromcp here, ranges are only of unicode characters
    high = list(process([sp[1]], typ))
    if len(low)!=1 or len(high)!=1:
        return False
    low = ord(low[0])
    high = ord(high[0])
    low = low // 256
    high = high // 256 + 1
    return range(low, high)


parser = OptionParser(usage="usage: %prog [options] arg")
parser.add_option("-x", "--hexadecimal", action="store_const", const='hexadecimal', dest="type",
                  help="Assume arg to be hexadecimal number")
parser.add_option("-o", "--octal", action="store_const", const='octal', dest="type",
                  help="Assume arg to be octal number")
parser.add_option("-b", "--binary", action="store_const", const='binary', dest="type",
                  help="Assume arg to be binary number")
parser.add_option("-d", "--decimal", action="store_const", const='decimal', dest="type",
                  help="Assume arg to be decimal number")
parser.add_option("-r", "--regexp", action="store_const", const='regexp', dest="type",
                  help="Assume arg to be regular expression")
parser.add_option("-s", "--string", action="store_const", const='string', dest="type",
                  help="Assume arg to be a sequence of characters")
parser.add_option("-a", "--auto", action="store_const", const=None, dest="type",
                  help="Try to guess arg type (default)")
parser.add_option("-m", "--max", action="store", default=10, dest="maxcount", type="int",
                  help="Maximal number of codepoints to display, default: 10; 0=unlimited")
parser.add_option("-i", "--io", action="store", default=iocharsetguess, dest="iocharset", type="string",
                  help="I/O character set, I am guessing %s" % iocharsetguess)
parser.add_option("--fcp", "--fromcp", action="store", default='',
                  dest="fromcp", type="string",
                  help="Convert numerical arguments from this encoding, default: no conversion")
parser.add_option("-c", "--charset-add", action="store", dest="addcharset", type="string",
                  help="Show hexadecimal reprezentation in this additional charset")
parser.add_option("-C", "--colour", action="store", dest="use_colour", type="string", default="auto",
                  help="Use colours, on, off or auto")
parser.add_option('', "--color", action="store", dest="use_colour", type="string", default="auto",
                  help="synonym for --colour")
parser.add_option("-v", "--verbose", action="count", dest="verbosity", default=0,
                  help="Increase verbosity (reads Unihan properties - slow!)")
parser.add_option("-w", "--wikipedia", action="count", dest="query_wiki", default=0,
                  help="Query wikipedia for the character")
parser.add_option("--list", action="store_const", dest="list_all_encodings", const=True,
                  help="List (approximately) all known encodings")

(options, arguments) = parser.parse_args()

# cache of UnicodeData.txt lines, keyed by character
linecache = {}

do_init()

if options.list_all_encodings:
    all_encodings = os.listdir(os.path.dirname(encodings.__file__))
    all_encodings = set([os.path.splitext(x)[0] for x in all_encodings])
    all_encodings = list(all_encodings)
    all_encodings.sort()
    print (textwrap.fill(' '.join(all_encodings)))
    sys.exit()

if len(arguments) == 0:
    parser.print_help()
    sys.exit()

if options.use_colour.lower() in ("on", "1", "true", "yes"):
    use_colour = True
elif options.use_colour.lower() in ("off", "0", "false", "no"):
    use_colour = False
else:
    use_colour = sys.stdout.isatty()
    if sys.platform == 'win32':
        use_colour = False

l_args = [] # list of non range arguments to process
for argum in arguments:
    is_r = is_range(argum, options.type)
    if is_r:
        print_blocks(is_r)
    else:
        l_args.append(argum)

if l_args:
    unihan_fs = []
    if options.verbosity > 0:
        unihan_fs = get_unihan_files() # list of file names for Unihan data file(s), empty if not available
        if not unihan_fs:
            out( """
Unihan_*.txt files not found. In order to view Unihan properties,
please place the file into /usr/share/unidata/,
/usr/share/unicode/, ~/.unicode/ or current working directory
(optionally you can gzip or bzip2 them).
You can get the files by unpacking
ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
Warning, listing UniHan Properties is rather slow.

""")
            options.verbosity = 0
    try:
        print_characters(process(l_args, options.type, options.fromcp),
                         options.maxcount, options.query_wiki)
    except IOError: # e.g. broken pipe
        pass