Mercurial > repo
view bin/multicode @ 12521:1298a4f734a6 draft default tip
<int-e> learn The password of the month is 99.964%
author | HackEso <hackeso@esolangs.org> |
---|---|
date | Sun, 02 Feb 2025 02:05:24 +0000 |
parents | c989a1669243 |
children |
line wrap: on
line source
#!/usr/bin/python
"""Look up Unicode characters by code point, by name (regular expression)
or by literal string, and print their properties.

The script runs on both Python 2 and Python 3.  Character data is read from
UnicodeData.txt when one can be found (see do_init()); otherwise the
interpreter's own unicodedata module is used as a slower fallback.
"""

import os
import glob
import sys
import unicodedata
import locale
import gzip
import re
import traceback
import encodings
import urllib
import webbrowser
import textwrap

# bz2 was introduced in 2.3, we want this to work also with earlier versions
try:
    import bz2
except ImportError:
    bz2 = None

# for python3
try:
    unicode
except NameError:
    unicode = str

# xrange does not exist in python3, where range is already lazy
try:
    xrange
except NameError:
    xrange = range

# urllib.quote moved to urllib.parse.quote in python3
try:
    from urllib import quote as url_quote
except ImportError:
    from urllib.parse import quote as url_quote

# 'any' and 'all' were introduced in python2.5
# dummy replacement for older versions
try:
    all
except NameError:
    all = lambda x: False

PY3 = sys.version_info[0] >= 3

if PY3:
    import subprocess as cmd

    def is_ascii(s):
        "test if string s consists completely of ascii characters (python 3)"
        try:
            s.encode('ascii')
        except UnicodeEncodeError:
            return False
        return True

    def out(*args):
        "print args, converting them to output charset"
        for i in args:
            sys.stdout.flush()
            sys.stdout.buffer.write(i.encode(options.iocharset, 'replace'))

    # ord23 is used to convert elements of byte array in python3, which are integers
    ord23 = lambda x: x

    # unichr is not in python3
    unichr = chr

else: # python2

    # getoutput() and getstatusoutput() methods have
    # been moved from commands to the subprocess module
    # with Python >= 3.x
    import commands as cmd

    def is_ascii(s):
        "test if string s consists completely of ascii characters (python 2)"
        try:
            unicode(s, 'ascii')
        except UnicodeDecodeError:
            return False
        return True

    def out(*args):
        "print args, converting them to output charset"
        for i in args:
            sys.stdout.write(i.encode(options.iocharset, 'replace'))

    ord23 = ord

from optparse import OptionParser

VERSION='0.9.7'

# list of terminals that support bidi
biditerms = ['mlterm']

try:
    locale.setlocale(locale.LC_ALL, '')
except locale.Error:
    pass

# guess terminal charset
try:
    iocharsetguess = locale.nl_langinfo(locale.CODESET) or "ascii"
except (locale.Error, AttributeError):
    # AttributeError: nl_langinfo/CODESET are not available on every platform
    iocharsetguess = "ascii"

if os.environ.get('TERM') in biditerms and iocharsetguess.lower().startswith('utf'):
    LTR = u'\u202d' # left to right override
else:
    LTR = ''

# Module-level state shared by the functions below; main() fills it in.
options = None     # parsed command line options
linecache = {}     # character -> its raw UnicodeData.txt line
use_colour = False # whether ANSI colour escapes are emitted
unihan_fs = []     # names of the Unihan data files in use

colours = {
    'none'       :    "",
    'default'    :    "\033[0m",
    'bold'       :    "\033[1m",
    'underline'  :    "\033[4m",
    'blink'      :    "\033[5m",
    'reverse'    :    "\033[7m",
    'concealed'  :    "\033[8m",

    'black'      :    "\033[30m",
    'red'        :    "\033[31m",
    'green'      :    "\033[32m",
    'yellow'     :    "\033[33m",
    'blue'       :    "\033[34m",
    'magenta'    :    "\033[35m",
    'cyan'       :    "\033[36m",
    'white'      :    "\033[37m",

    'on_black'   :    "\033[40m",
    'on_red'     :    "\033[41m",
    'on_green'   :    "\033[42m",
    'on_yellow'  :    "\033[43m",
    'on_blue'    :    "\033[44m",
    'on_magenta' :    "\033[45m",
    'on_cyan'    :    "\033[46m",
    'on_white'   :    "\033[47m",

    'beep'       :    "\007",
    }

general_category = {
    'Lu':  'Letter, Uppercase',
    'Ll':  'Letter, Lowercase',
    'Lt':  'Letter, Titlecase',
    'Lm':  'Letter, Modifier',
    'Lo':  'Letter, Other',
    'Mn':  'Mark, Non-Spacing',
    'Mc':  'Mark, Spacing Combining',
    'Me':  'Mark, Enclosing',
    'Nd':  'Number, Decimal Digit',
    'Nl':  'Number, Letter',
    'No':  'Number, Other',
    'Pc':  'Punctuation, Connector',
    'Pd':  'Punctuation, Dash',
    'Ps':  'Punctuation, Open',
    'Pe':  'Punctuation, Close',
    'Pi':  'Punctuation, Initial quote',
    'Pf':  'Punctuation, Final quote',
    'Po':  'Punctuation, Other',
    'Sm':  'Symbol, Math',
    'Sc':  'Symbol, Currency',
    'Sk':  'Symbol, Modifier',
    'So':  'Symbol, Other',
    'Zs':  'Separator, Space',
    'Zl':  'Separator, Line',
    'Zp':  'Separator, Paragraph',
    'Cc':  'Other, Control',
    'Cf':  'Other, Format',
    'Cs':  'Other, Surrogate',
    'Co':  'Other, Private Use',
    'Cn':  'Other, Not Assigned',
    }

bidi_category = {
    'L'   : 'Left-to-Right',
    'LRE' : 'Left-to-Right Embedding',
    'LRO' : 'Left-to-Right Override',
    'R'   : 'Right-to-Left',
    'AL'  : 'Right-to-Left Arabic',
    'RLE' : 'Right-to-Left Embedding',
    'RLO' : 'Right-to-Left Override',
    'PDF' : 'Pop Directional Format',
    'EN'  : 'European Number',
    'ES'  : 'European Number Separator',
    'ET'  : 'European Number Terminator',
    'AN'  : 'Arabic Number',
    'CS'  : 'Common Number Separator',
    'NSM' : 'Non-Spacing Mark',
    'BN'  : 'Boundary Neutral',
    'B'   : 'Paragraph Separator',
    'S'   : 'Segment Separator',
    'WS'  : 'Whitespace',
    'ON'  : 'Other Neutrals',
    # directional isolates, used by U+2066..U+2069
    'LRI' : 'Left-to-Right Isolate',
    'RLI' : 'Right-to-Left Isolate',
    'FSI' : 'First Strong Isolate',
    'PDI' : 'Pop Directional Isolate',
    }

comb_classes = {
    0: 'Spacing, split, enclosing, reordrant, and Tibetan subjoined',
    1: 'Overlays and interior',
    7: 'Nuktas',
    8: 'Hiragana/Katakana voicing marks',
    9: 'Viramas',
    10: 'Start of fixed position classes',
    199: 'End of fixed position classes',
    200: 'Below left attached',
    202: 'Below attached',
    204: 'Below right attached',
    208: 'Left attached (reordrant around single base character)',
    210: 'Right attached',
    212: 'Above left attached',
    214: 'Above attached',
    216: 'Above right attached',
    218: 'Below left',
    220: 'Below',
    222: 'Below right',
    224: 'Left (reordrant around single base character)',
    226: 'Right',
    228: 'Above left',
    230: 'Above',
    232: 'Above right',
    233: 'Double below',
    234: 'Double above',
    240: 'Below (iota subscript)',
    }

# Order of the semicolon separated fields of one UnicodeData.txt line;
# 'dummy' marks a field that is not stored.
_UNICODEDATA_FIELDS = ['codepoint', 'name', 'category', 'combining', 'bidi',
                       'decomposition', 'dummy', 'digit_value',
                       'numeric_value', 'mirrored', 'unicode1name',
                       'iso_comment', 'uppercase', 'lowercase', 'titlecase']

_MISSING_UNICODEDATA_MSG = """
Cannot find UnicodeData.txt, please place it into
/usr/share/unidata/UnicodeData.txt,
/usr/share/unicode/UnicodeData.txt, ~/.unicode/ or current
working directory (optionally you can gzip it).
Without the file, searching will be much slower.

"""

_MISSING_UNIHAN_MSG = """
Unihan_*.txt files not found. In order to view Unihan properties,
please place the file into /usr/share/unidata/,
/usr/share/unicode/, ~/.unicode/
or current working directory (optionally you can gzip or bzip2 them).
You can get the files by unpacking ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
Warning, listing UniHan Properties is rather slow.

"""


def _case_mapping(ch, mapped):
    "return mapped if it is a single character different from ch, else ''"
    if len(mapped) == 1 and mapped != ch:
        return mapped
    return ''


def get_unicode_properties(ch):
    """Return a dictionary of properties of the character ch.

    The cached UnicodeData.txt line (see GrepInNames) is used when there is
    one, otherwise the values come from the unicodedata module.  Both paths
    produce the same keys: the case mappings are a single character or '',
    'combining' is an int, 'mirrored' is truthy for mirrored characters and
    the digit/numeric values are strings ('' when not applicable).
    """
    properties = {}
    if ch in linecache:
        fields = linecache[ch].strip().split(';')
        for i, prop in enumerate(_UNICODEDATA_FIELDS):
            if prop != 'dummy':
                properties[prop] = fields[i]
        # the file stores case mappings as hexadecimal code points
        for prop in ('lowercase', 'uppercase', 'titlecase'):
            if properties[prop]:
                properties[prop] = unichr(int(properties[prop], 16))
        properties['combining'] = int(properties['combining'])
        properties['mirrored'] = properties['mirrored'] == 'Y'
    else:
        properties['codepoint'] = '%04X' % ord(ch)
        properties['name'] = unicodedata.name(ch, '')
        properties['category'] = unicodedata.category(ch)
        properties['combining'] = unicodedata.combining(ch)
        properties['bidi'] = unicodedata.bidirectional(ch)
        properties['decomposition'] = unicodedata.decomposition(ch)
        # kept as strings like in the file, so that a value of 0 is still shown
        properties['digit_value'] = str(unicodedata.digit(ch, ''))
        properties['numeric_value'] = str(unicodedata.numeric(ch, ''))
        properties['mirrored'] = unicodedata.mirrored(ch)
        properties['unicode1name'] = ''
        properties['iso_comment'] = ''
        # upper()/lower() return ch itself when there is no mapping and may
        # return several characters; neither is a simple case mapping
        properties['uppercase'] = _case_mapping(ch, ch.upper())
        properties['lowercase'] = _case_mapping(ch, ch.lower())
        properties['titlecase'] = ''
    return properties


def do_init():
    "set up the global lists of candidate UnicodeData.txt and Unihan file locations"
    HomeDir = os.path.expanduser('~/.unicode')
    HomeUnicodeData = os.path.join(HomeDir, "UnicodeData.txt")
    global UnicodeDataFileNames
    UnicodeDataFileNames = [HomeUnicodeData, '/usr/share/unicode/UnicodeData.txt', '/usr/share/unidata/UnicodeData.txt', '/hackenv/share/UnicodeData.txt'] + \
        glob.glob('/usr/share/unidata/UnicodeData*.txt') + \
        glob.glob('/usr/share/perl/*/unicore/UnicodeData.txt') + \
        glob.glob('/System/Library/Perl/*/unicore/UnicodeData.txt') # for MacOSX

    HomeUnihanData = os.path.join(HomeDir, "Unihan*")
    global UnihanDataGlobs
    UnihanDataGlobs = [HomeUnihanData, '/usr/share/unidata/Unihan*', '/usr/share/unicode/Unihan*', './Unihan*']


def get_unihan_files():
    "return the list of file names for Unihan data file(s), empty if there are none"
    fos = []
    for gl in UnihanDataGlobs:
        fos += glob.glob(gl)
    return fos


def _decode_utf8(value):
    "return value as a unicode string (python 2 reads the data as utf-8 bytes)"
    if PY3:
        return value
    return unicode(value, 'utf-8')


def _parse_unihan_line(line):
    "split one Unihan data line into (codepoint, key, value); None if it is not a data line"
    line = line.strip()
    if not line or line.startswith('#'):
        return None
    fields = line.split('\t')
    if len(fields) != 3:
        return None
    char, key, value = fields
    try:
        code = int(char[2:], 16) # the first field looks like U+4E00
    except ValueError:
        return None
    return code, key, value


def _shell_quote(s):
    "quote s so that a POSIX shell passes it on as one literal word"
    return "'" + s.replace("'", "'\\''") + "'"


def get_unihan_properties_internal(ch):
    "return a dictionary of Unihan properties of ch, reading the data files in python"
    properties = {}
    ch = ord(ch)
    for f in unihan_fs:
        fo = OpenGzip(f)
        if fo is None:
            continue
        try:
            for l in fo:
                parsed = _parse_unihan_line(l)
                if parsed is None:
                    continue
                code, key, value = parsed
                if code == ch:
                    properties[key] = _decode_utf8(value)
                elif code > ch:
                    # the files are sorted by code point, nothing more to find
                    break
        finally:
            fo.close()
    return properties


def get_unihan_properties_zgrep(ch):
    "return a dictionary of Unihan properties of ch, using external (z/bz)grep"
    properties = {}
    ch = ord(ch)
    chs = 'U+%X' % ch
    for f in unihan_fs:
        if f.endswith('.gz'):
            grepcmd = 'zgrep'
        elif f.endswith('.bz2'):
            grepcmd = 'bzgrep'
        else:
            grepcmd = 'grep'
        # the shell turns the doubled backslash into \b (word boundary)
        cmdline = grepcmd+' ^'+chs+r'\\b '+_shell_quote(f)
        status, output = cmd.getstatusoutput(cmdline)
        for l in output.split('\n'):
            # anything that is not a data line (e.g. an error message) is skipped
            parsed = _parse_unihan_line(l)
            if parsed is None:
                continue
            code, key, value = parsed
            if code == ch:
                properties[key] = _decode_utf8(value)
            elif code > ch:
                break
    return properties


# basic sanity check, if e.g. you run this on MS Windows...
if os.path.exists('/bin/grep'):
    get_unihan_properties = get_unihan_properties_zgrep
else:
    get_unihan_properties = get_unihan_properties_internal


def error(txt):
    "print txt followed by a newline and exit with status 1"
    out(txt)
    out('\n')
    sys.exit(1)


def get_gzip_filename(fname):
    "return fname, if it does not exist, return fname+.gz, if neither that, fname+bz2, if neither that, return None"
    if os.path.exists(fname):
        return fname
    if os.path.exists(fname+'.gz'):
        return fname+'.gz'
    if os.path.exists(fname+'.bz2') and bz2 is not None:
        return fname+'.bz2'
    return None


def OpenGzip(fname):
    """Open fname for reading, try fname.gz or fname.bz2 if fname does not exist.

    Returns an iterable file object, or None when nothing usable exists.
    On python 3 the file is always opened in text mode (utf-8), so that the
    lines are str no matter whether the data is compressed; on python 2 the
    lines are byte strings as they always were.
    """
    is_compressed = fname.endswith('.gz') or fname.endswith('.bz2')
    if os.path.exists(fname) and not is_compressed:
        if PY3:
            return open(fname, encoding='utf-8', errors='replace')
        return open(fname)
    if os.path.exists(fname+'.gz'):
        fname = fname+'.gz'
    elif os.path.exists(fname+'.bz2') and bz2 is not None:
        fname = fname+'.bz2'
    if not os.path.exists(fname):
        return None
    if fname.endswith('.gz'):
        if PY3:
            return gzip.open(fname, 'rt', encoding='utf-8', errors='replace')
        return gzip.GzipFile(fname)
    elif fname.endswith('.bz2') and bz2 is not None:
        if PY3:
            return bz2.open(fname, 'rt', encoding='utf-8', errors='replace')
        return bz2.BZ2File(fname)
    return None


def GrepInNames(pattern, fillcache=False):
    """Generator searching the character database for the regexp pattern.

    The pattern is matched case insensitively against whole lines of the
    first UnicodeData.txt that can be opened; every matching line is stored
    in linecache.  With fillcache false the matching characters are yielded
    (falling back to a slow scan of unicodedata names when there is no
    file).  With fillcache true nothing is yielded, the cache is just
    filled - the generator still has to be consumed, e.g. by list().
    """
    p = re.compile(pattern, re.I)
    f = None
    for name in UnicodeDataFileNames:
        f = OpenGzip(name)
        if f is not None:
            break
    if not fillcache:
        if f is None:
            out(_MISSING_UNICODEDATA_MSG)
            for i in xrange(sys.maxunicode):
                try:
                    name = unicodedata.name(unichr(i))
                    if re.search(p, name):
                        yield myunichr(i)
                except ValueError:
                    # no name for this code point
                    pass
        else:
            for l in f:
                if re.search(p, l):
                    r = myunichr(int(l.split(';')[0], 16))
                    linecache[r] = l
                    yield r
            f.close()
    else:
        if f is not None:
            for l in f:
                if re.search(p, l):
                    r = myunichr(int(l.split(';')[0], 16))
                    linecache[r] = l
            f.close()


def valfromcp(n, cp=None):
    """Return the list of code points the number n stands for.

    If cp is given, n is considered to be a (possibly multibyte) character
    code in that codepage: its big endian bytes are decoded from cp and the
    code points of the decoded text are returned.  Without cp the result is
    simply [n].
    """
    if not cp:
        return [n]
    xh = '%x' % n
    if len(xh) % 2: # pad hexadecimal representation with a zero
        xh = '0'+xh
    byte_values = [int(xh[i:i+2], 16) for i in range(0, len(xh), 2)]
    if PY3:
        raw = bytes(byte_values)
    else:
        raw = ''.join([chr(b) for b in byte_values])
    return [ord(x) for x in raw.decode(cp)]


def myunichr(n):
    "return the character with code point n, exit with an error message if there is none"
    try:
        r = unichr(n)
        return r
    except OverflowError:
        traceback.print_exc()
        error("The codepoint is too big - it does not fit into an int.")
    except ValueError:
        traceback.print_exc()
        err = "The codepoint is too big."
        if sys.maxunicode <= 0xffff:
            err += "\nPerhaps your python interpreter is not compiled with wide unicode characters."
        error(err)


def guesstype(arg):
    """Guess what kind of query the command line argument arg is.

    Returns a tuple (type, value); type is one of 'empty string', 'string',
    'hexadecimal', 'binary' or 'regexp' and value is arg with any U+ / U
    prefix removed, ready to be converted by process().
    """
    if not arg: # empty string
        return 'empty string', arg
    elif not is_ascii(arg):
        return 'string', arg
    elif arg[:2]=='U+' or arg[:2]=='u+': # it is hexadecimal number
        try:
            val = int(arg[2:], 16)
            if val>sys.maxunicode:
                return 'regexp', arg
            else:
                return 'hexadecimal', arg[2:]
        except ValueError:
            return 'regexp', arg
    elif arg[0] in "Uu" and len(arg)>4:
        try:
            val = int(arg[1:], 16)
            if val>sys.maxunicode:
                return 'regexp', arg
            else:
                # drop the prefix, process() parses the value with int(.., 16)
                return 'hexadecimal', arg[1:]
        except ValueError:
            return 'regexp', arg
    elif len(arg)>=4:
        if len(arg) in (8, 16, 24, 32):
            if all(x in '01' for x in arg):
                val = int(arg, 2)
                if val<=sys.maxunicode:
                    return 'binary', arg
        try:
            val = int(arg, 16)
            if val>sys.maxunicode:
                return 'regexp', arg
            else:
                return 'hexadecimal', arg
        except ValueError:
            return 'regexp', arg
    else:
        return 'string', arg


# bases of the numeric argument types
_NUMERIC_BASES = {'hexadecimal': 16, 'decimal': 10, 'octal': 8, 'binary': 2}


def process(arglist, t, fromcp=None):
    """Convert the command line arguments into a list of characters.

    arglist - the arguments
    t - type of the arguments ('hexadecimal', 'decimal', 'octal', 'binary',
        'regexp', 'string') or None to guess the type of each one
    fromcp - if set, numerical arguments are converted from this encoding
    """
    # build a list of values, so that we can combine queries like
    # LATIN ALPHA and search for LATIN.*ALPHA and not names that
    # contain either LATIN or ALPHA
    result = []
    names_query = [] # reserved for queries in names - i.e. -r
    for arg_i in arglist:
        if t is None:
            tp, arg = guesstype(arg_i)
            if tp == 'regexp':
                # if the first argument is guessed to be a regexp, add
                # all the following arguments to the regular expression -
                # this is probably what you wanted, e.g.
                # 'unicode cyrillic be' will now search for the 'cyrillic.*be' regular expression
                t = 'regexp'
        else:
            tp, arg = t, arg_i
        if tp in _NUMERIC_BASES:
            val = int(arg, _NUMERIC_BASES[tp])
            for val in valfromcp(val, fromcp):
                r = myunichr(val)
                list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties
                result.append(r)
        elif tp=='regexp':
            names_query.append(arg)
        elif tp=='string':
            try:
                if PY3: # argv is automatically decoded into unicode, even padded with bogus character if it is not encodable
                    unirepr = arg
                else:
                    unirepr = unicode(arg, options.iocharset)
            except UnicodeDecodeError:
                error ("Sequence %s is not valid in charset '%s'." % (repr(arg),  options.iocharset))
            unilist = ['%04X'%ord(x) for x in unirepr]
            unireg = '|'.join(unilist)
            list(GrepInNames(unireg, fillcache=True))
            for r in unirepr:
                result.append(r)
        elif tp=='empty string':
            pass # do not do anything for an empty string
    if names_query:
        query = '.*'.join(names_query)
        for r in GrepInNames(query):
            result.append(r)
    return result


def maybe_colours(colour):
    "return the escape sequence for colour, or an empty string if colours are off"
    if use_colour:
        return colours[colour]
    else:
        return ""


# format key and value
def printkv(*l):
    "print the arguments as 'key: value' pairs; l is key1, value1, key2, value2..."
    for i in range(0, len(l), 2):
        if i<len(l)-2:
            sep = " "
        else:
            sep = "\n"
        k, v = l[i], l[i+1]
        out(maybe_colours('green'))
        out(k)
        out(": ")
        out(maybe_colours('default'))
        out(unicode(v))
        out(sep)


def print_characters(clist, maxcount, query_wiki=0):
    """Print the properties of each character from clist.

    maxcount - stop after that many characters, 0 means no limit
    query_wiki - 0 - don't
                 1 - spawn browser
    """
    counter = 0
    for c in clist:

        if query_wiki:
            ch = url_quote(c.encode('utf-8')) # wikipedia uses UTF-8 in names
            wiki_url = 'http://en.wikipedia.org/wiki/'+ch
            webbrowser.open(wiki_url)
            query_wiki = 0 # query only the very first character

        if maxcount:
            counter += 1
            if counter > maxcount:
                out("\nToo many characters to display, more than %s, use --max option to change it\n" % maxcount)
                return
        properties = get_unicode_properties(c)
        out(maybe_colours('bold'))
        out('U+%04X '% ord(c))
        if properties['name']:
            out(properties['name'])
        else:
            out(maybe_colours('default'))
            out(" - No such unicode character name in database")
        out(maybe_colours('default'))
        out('\n')

        ar = ["UTF-8", ' '.join([("%02x" % ord23(x)) for x in c.encode('utf-8')]) ,
              "UTF-16BE", ''.join([("%02x" % ord23(x)) for x in c.encode('utf-16be')]),
              "Decimal", "&#%s;" % ord(c) ]
        if options.addcharset:
            try:
                # ord23, not ord: iterating over bytes gives ints in python3
                rep = ' '.join([("%02x" % ord23(x)) for x in c.encode(options.addcharset)] )
            except UnicodeError:
                rep = "NONE"
            ar.extend( [options.addcharset, rep] )
        printkv(*ar)

        if properties['combining']:
            pc = " "+c
        else:
            pc = c
        out(pc)
        uppercase = properties['uppercase']
        lowercase = properties['lowercase']
        if uppercase:
            out(" (%s)" % uppercase)
            out('\n')
            printkv( "Uppercase", 'U+%04X'% ord(uppercase) )
        elif lowercase:
            out(" (%s)" % lowercase)
            out('\n')
            printkv( "Lowercase", 'U+%04X'% ord(lowercase) )
        else:
            out('\n')
        category = properties['category']
        printkv( 'Category', category + " (%s)" % general_category.get(category, '?') )

        if properties['numeric_value']:
            printkv( 'Numeric value', properties['numeric_value'])
        if properties['digit_value']:
            printkv( 'Digit value', properties['digit_value'])

        bidi = properties['bidi']
        if bidi:
            # the data may know bidi classes newer than our table
            printkv( 'Bidi', bidi + " (%s)" % bidi_category.get(bidi, '?') )
        mirrored = properties['mirrored']
        if mirrored:
            out('Character is mirrored\n')
        comb = properties['combining']
        if comb:
            printkv( 'Combining', str(comb)+ " (%s)" % (comb_classes.get(comb, '?')) )
        decomp = properties['decomposition']
        if decomp:
            printkv( 'Decomposition', decomp )
        if options.verbosity>0:
            uhp = get_unihan_properties(c)
            for key in uhp:
                printkv(key, uhp[key])
        out('\n')


def print_block(block):
    "print the table of the 256 characters U+<block>00..U+<block>FF, 16 per row"
    #header
    out(" "*10)
    for i in range(16):
        out(".%X " % i)
    out('\n')
    #body
    for i in range(block*16, block*16+16):
        hexi = "%X" % i
        if len(hexi)>3:
            hexi = "%07X" % i
            hexi = hexi[:4]+" "+hexi[4:]
        else:
            # 8 columns like the long form, plus ". " matches the header indent
            hexi = "%8s" % ("%03X" % i)
        out(LTR+hexi+". ")
        for j in range(16):
            c = unichr(i*16+j)
            if unicodedata.combining(c):
                c = " "+c
            out(c)
            out('  ') # a cell is as wide as a header cell
        out('\n')
    out('\n')


def print_blocks(blocks):
    "print the character table of each block from blocks"
    for block in blocks:
        print_block(block)


def is_range(s, typ):
    """Return the range of 256-character blocks covered by 'low..high'.

    s is the command line argument and typ the type of its two ends (None
    to guess).  One of the ends may be left out.  Returns False if s is not
    a range of exactly one character to exactly one character.
    """
    sp = s.split('..')
    if len(sp)!=2:
        return False
    if not sp[1]:
        sp[1] = sp[0]
    elif not sp[0]:
        sp[0] = sp[1]
    if not sp[0]:
        return False
    low = list(process([sp[0]], typ)) # intentionally no fromcp here, ranges are only of unicode characters
    high = list(process([sp[1]], typ))
    if len(low)!=1 or len(high)!=1:
        return False
    low = ord(low[0])
    high = ord(high[0])
    low = low // 256
    high = high // 256 + 1
    return range(low, high)


def _build_option_parser():
    "return the OptionParser describing the command line interface"
    parser = OptionParser(usage="usage: %prog [options] arg")
    parser.add_option("-x", "--hexadecimal",
          action="store_const", const='hexadecimal', dest="type",
          help="Assume arg to be hexadecimal number")
    parser.add_option("-o", "--octal",
          action="store_const", const='octal', dest="type",
          help="Assume arg to be octal number")
    parser.add_option("-b", "--binary",
          action="store_const", const='binary', dest="type",
          help="Assume arg to be binary number")
    parser.add_option("-d", "--decimal",
          action="store_const", const='decimal', dest="type",
          help="Assume arg to be decimal number")
    parser.add_option("-r", "--regexp",
          action="store_const", const='regexp', dest="type",
          help="Assume arg to be regular expression")
    parser.add_option("-s", "--string",
          action="store_const", const='string', dest="type",
          help="Assume arg to be a sequence of characters")
    parser.add_option("-a", "--auto",
          action="store_const", const=None, dest="type",
          help="Try to guess arg type (default)")
    parser.add_option("-m", "--max",
          action="store", default=10, dest="maxcount", type="int",
          help="Maximal number of codepoints to display, default: 10; 0=unlimited")
    parser.add_option("-i", "--io",
          action="store", default=iocharsetguess, dest="iocharset", type="string",
          help="I/O character set, I am guessing %s" % iocharsetguess)
    parser.add_option("--fcp", "--fromcp",
          action="store", default='', dest="fromcp", type="string",
          help="Convert numerical arguments from this encoding, default: no conversion")
    parser.add_option("-c", "--charset-add",
          action="store", dest="addcharset", type="string",
          help="Show hexadecimal reprezentation in this additional charset")
    parser.add_option("-C", "--colour",
          action="store", dest="use_colour", type="string",
          default="auto",
          help="Use colours, on, off or auto")
    parser.add_option('', "--color",
          action="store", dest="use_colour", type="string",
          default="auto",
          help="synonym for --colour")
    parser.add_option("-v", "--verbose",
          action="count", dest="verbosity",
          default=0,
          help="Increase verbosity (reads Unihan properties - slow!)")
    parser.add_option("-w", "--wikipedia",
          action="count", dest="query_wiki",
          default=0,
          help="Query wikipedia for the character")
    parser.add_option("--list",
          action="store_const", dest="list_all_encodings",
          const=True,
          help="List (approximately) all known encodings")
    return parser


def main():
    "parse the command line and print what was asked for"
    global options, use_colour, unihan_fs

    parser = _build_option_parser()
    (options, arguments) = parser.parse_args()

    do_init()

    if options.list_all_encodings:
        all_encodings = os.listdir(os.path.dirname(encodings.__file__))
        all_encodings = set([os.path.splitext(x)[0] for x in all_encodings])
        all_encodings = list(all_encodings)
        all_encodings.sort()
        print (textwrap.fill(' '.join(all_encodings)))
        sys.exit()

    if len(arguments)==0:
        parser.print_help()
        sys.exit()

    if options.use_colour.lower() in ("on", "1", "true", "yes"):
        use_colour = True
    elif options.use_colour.lower() in ("off", "0", "false", "no"):
        use_colour = False
    else:
        use_colour = sys.stdout.isatty()
        if sys.platform == 'win32':
            use_colour = False

    l_args = [] # list of non range arguments to process
    for argum in arguments:
        is_r = is_range(argum, options.type)
        if is_r:
            print_blocks(is_r)
        else:
            l_args.append(argum)

    if l_args:
        unihan_fs = []
        if options.verbosity>0:
            unihan_fs = get_unihan_files() # list of file names for Unihan data file(s), empty if not available
            if not unihan_fs:
                out(_MISSING_UNIHAN_MSG)
                options.verbosity = 0
        try:
            print_characters(process(l_args, options.type, options.fromcp), options.maxcount, options.query_wiki)
        except IOError: # e.g. broken pipe
            pass


if __name__ == '__main__':
    main()