Mercurial > repo
comparison bin/multicode @ 9075:c989a1669243
<fizzie> revert 58b9ee8f97a7
author | HackBot |
---|---|
date | Sun, 25 Sep 2016 20:31:46 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
9074:560a73f4f0a4 | 9075:c989a1669243 |
---|---|
1 #!/usr/bin/python | |
2 | |
3 | |
4 import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings | |
5 import urllib, webbrowser, textwrap | |
6 | |
7 # bz2 was introduced in 2.3, we want this to work also with earlier versions | |
8 try: | |
9 import bz2 | |
10 except ImportError: | |
11 bz2 = None | |
12 | |
13 # for python3 | |
14 try: | |
15 unicode | |
16 except NameError: | |
17 unicode = str | |
18 | |
19 # 'any' and 'all' were introduced in python2.5 | |
20 # dummy replacement for older versions | |
21 try: | |
22 all | |
23 except NameError: | |
24 all = lambda x: False | |
25 | |
# Python 2/3 compatibility layer: choose implementations once, at import time.
PY3 = sys.version_info[0] >= 3

if PY3:
    # getoutput()/getstatusoutput() moved from commands to subprocess in py3
    import subprocess as cmd

    def is_ascii(s):
        "Return True when string s consists entirely of ASCII characters (python 3)."
        try:
            s.encode('ascii')
            return True
        except UnicodeEncodeError:
            return False

    def out(*args):
        "Print args, converting them to the output charset."
        write = sys.stdout.buffer.write
        for chunk in args:
            sys.stdout.flush()
            write(chunk.encode(options.iocharset, 'replace'))

    # iterating bytes already yields integers on python 3
    ord23 = lambda x: x

    # unichr does not exist on python 3
    unichr = chr

else:  # python 2

    # on python 2 getoutput()/getstatusoutput() live in the commands module
    import commands as cmd

    def is_ascii(s):
        "Return True when string s consists entirely of ASCII characters (python 2)."
        try:
            unicode(s, 'ascii')
            return True
        except UnicodeDecodeError:
            return False

    def out(*args):
        "Print args, converting them to the output charset."
        for chunk in args:
            sys.stdout.write(chunk.encode(options.iocharset, 'replace'))

    # iterating a byte string yields 1-char strings on python 2
    ord23 = ord
72 | |
73 | |
from optparse import OptionParser

VERSION = '0.9.7'


# terminals known to render bidirectional text themselves
biditerms = ['mlterm']

# honour the user's locale settings; a misconfigured locale is not fatal
try:
    locale.setlocale(locale.LC_ALL, '')
except locale.Error:
    pass

# guess the terminal charset from the locale, defaulting to ascii
try:
    iocharsetguess = locale.nl_langinfo(locale.CODESET) or "ascii"
except locale.Error:
    iocharsetguess = "ascii"

# emit an explicit LEFT-TO-RIGHT OVERRIDE only for terminals that both
# understand bidi and run in a UTF locale; otherwise use no marker at all
LTR = (u'\u202d'
       if os.environ.get('TERM') in biditerms
       and iocharsetguess.lower().startswith('utf')
       else '')
97 | |
98 | |
# ANSI escape sequences keyed by a human-readable name.  Built
# programmatically: attribute codes, then foreground 30-37 and the
# matching background 40-47 for the standard 8-colour palette.
colours = {'none': "", 'beep': "\007"}
for _name, _code in (('default', 0), ('bold', 1), ('underline', 4),
                     ('blink', 5), ('reverse', 7), ('concealed', 8)):
    colours[_name] = "\033[%dm" % _code
_palette = ('black', 'red', 'green', 'yellow',
            'blue', 'magenta', 'cyan', 'white')
for _i in range(len(_palette)):
    colours[_palette[_i]] = "\033[%dm" % (30 + _i)
    colours['on_' + _palette[_i]] = "\033[%dm" % (40 + _i)
del _name, _code, _palette, _i


# Unicode "General Category" two-letter codes -> human-readable names.
general_category = {
    # letters
    'Lu': 'Letter, Uppercase', 'Ll': 'Letter, Lowercase',
    'Lt': 'Letter, Titlecase', 'Lm': 'Letter, Modifier',
    'Lo': 'Letter, Other',
    # marks
    'Mn': 'Mark, Non-Spacing', 'Mc': 'Mark, Spacing Combining',
    'Me': 'Mark, Enclosing',
    # numbers
    'Nd': 'Number, Decimal Digit', 'Nl': 'Number, Letter',
    'No': 'Number, Other',
    # punctuation
    'Pc': 'Punctuation, Connector', 'Pd': 'Punctuation, Dash',
    'Ps': 'Punctuation, Open', 'Pe': 'Punctuation, Close',
    'Pi': 'Punctuation, Initial quote', 'Pf': 'Punctuation, Final quote',
    'Po': 'Punctuation, Other',
    # symbols
    'Sm': 'Symbol, Math', 'Sc': 'Symbol, Currency',
    'Sk': 'Symbol, Modifier', 'So': 'Symbol, Other',
    # separators
    'Zs': 'Separator, Space', 'Zl': 'Separator, Line',
    'Zp': 'Separator, Paragraph',
    # other
    'Cc': 'Other, Control', 'Cf': 'Other, Format',
    'Cs': 'Other, Surrogate', 'Co': 'Other, Private Use',
    'Cn': 'Other, Not Assigned',
}

# Unicode bidirectional category codes -> human-readable names.
bidi_category = {
    'L': 'Left-to-Right',
    'LRE': 'Left-to-Right Embedding',
    'LRO': 'Left-to-Right Override',
    'R': 'Right-to-Left',
    'AL': 'Right-to-Left Arabic',
    'RLE': 'Right-to-Left Embedding',
    'RLO': 'Right-to-Left Override',
    'PDF': 'Pop Directional Format',
    'EN': 'European Number',
    'ES': 'European Number Separator',
    'ET': 'European Number Terminator',
    'AN': 'Arabic Number',
    'CS': 'Common Number Separator',
    'NSM': 'Non-Spacing Mark',
    'BN': 'Boundary Neutral',
    'B': 'Paragraph Separator',
    'S': 'Segment Separator',
    'WS': 'Whitespace',
    'ON': 'Other Neutrals',
}

# Canonical combining class numbers -> human-readable descriptions.
comb_classes = {
    0: 'Spacing, split, enclosing, reordrant, and Tibetan subjoined',
    1: 'Overlays and interior',
    7: 'Nuktas',
    8: 'Hiragana/Katakana voicing marks',
    9: 'Viramas',
    10: 'Start of fixed position classes',
    199: 'End of fixed position classes',
    200: 'Below left attached',
    202: 'Below attached',
    204: 'Below right attached',
    208: 'Left attached (reordrant around single base character)',
    210: 'Right attached',
    212: 'Above left attached',
    214: 'Above attached',
    216: 'Above right attached',
    218: 'Below left',
    220: 'Below',
    222: 'Below right',
    224: 'Left (reordrant around single base character)',
    226: 'Right',
    228: 'Above left',
    230: 'Above',
    232: 'Above right',
    233: 'Double below',
    234: 'Double above',
    240: 'Below (iota subscript)',
}
213 | |
214 | |
215 | |
def get_unicode_properties(ch):
    """Return a dict of properties for character ch.

    Prefers the cached UnicodeData.txt record in the module-level
    linecache; falls back to the unicodedata module otherwise.
    """
    properties = {}
    if ch in linecache:
        # parse the semicolon-separated UnicodeData.txt record
        fields = linecache[ch].strip().split(';')
        proplist = ['codepoint', 'name', 'category', 'combining', 'bidi',
                    'decomposition', 'dummy', 'digit_value', 'numeric_value',
                    'mirrored', 'unicode1name', 'iso_comment', 'uppercase',
                    'lowercase', 'titlecase']
        for i, prop in enumerate(proplist):
            if prop != 'dummy':
                properties[prop] = fields[i]

        # case mappings are stored as hex codepoints; convert to characters
        for case_key in ('lowercase', 'uppercase', 'titlecase'):
            if properties[case_key]:
                properties[case_key] = unichr(int(properties[case_key], 16))

        properties['combining'] = int(properties['combining'])
        properties['mirrored'] = properties['mirrored'] == 'Y'
    else:
        # no cached record: ask (a possibly older) unicodedata module
        properties['codepoint'] = '%04X' % ord(ch)
        properties['name'] = unicodedata.name(ch, '')
        properties['category'] = unicodedata.category(ch)
        properties['combining'] = unicodedata.combining(ch)
        properties['bidi'] = unicodedata.bidirectional(ch)
        properties['decomposition'] = unicodedata.decomposition(ch)
        properties['digit_value'] = unicodedata.digit(ch, '')
        properties['numeric_value'] = unicodedata.numeric(ch, '')
        properties['mirrored'] = unicodedata.mirrored(ch)
        properties['unicode1name'] = ''
        properties['iso_comment'] = ''
        properties['uppercase'] = ch.upper()
        properties['lowercase'] = ch.lower()
        properties['titlecase'] = ''
    return properties
250 | |
251 | |
def do_init():
    "Compute the candidate locations of the UnicodeData.txt and Unihan files."
    home = os.path.expanduser('~/.unicode')

    global UnicodeDataFileNames
    UnicodeDataFileNames = [os.path.join(home, "UnicodeData.txt"),
                            '/usr/share/unicode/UnicodeData.txt',
                            '/usr/share/unidata/UnicodeData.txt',
                            '/hackenv/share/UnicodeData.txt']
    UnicodeDataFileNames += glob.glob('/usr/share/unidata/UnicodeData*.txt')
    UnicodeDataFileNames += glob.glob('/usr/share/perl/*/unicore/UnicodeData.txt')
    # MacOS X ships a copy inside the system Perl tree
    UnicodeDataFileNames += glob.glob('/System/Library/Perl/*/unicore/UnicodeData.txt')

    global UnihanDataGlobs
    UnihanDataGlobs = [os.path.join(home, "Unihan*"),
                       '/usr/share/unidata/Unihan*',
                       '/usr/share/unicode/Unihan*',
                       './Unihan*']
264 | |
265 | |
def get_unihan_files():
    "Expand UnihanDataGlobs into a list of existing Unihan data file names."
    found = []
    for pattern in UnihanDataGlobs:
        found.extend(glob.glob(pattern))
    return found
272 | |
def get_unihan_properties_internal(ch):
    """Scan the Unihan data files directly for properties of character ch.

    Pure-Python fallback used when no grep binary is available.  Returns a
    dict mapping Unihan property names to their (unicode) values.
    """
    properties = {}
    ch = ord(ch)
    global unihan_fs
    for f in unihan_fs:
        fo = OpenGzip(f)
        if fo is None:
            # file disappeared or unsupported compression - skip it
            continue
        for l in fo:
            if l.startswith('#'):
                continue
            line = l.strip()
            if not line:
                continue
            # records are "U+XXXX<TAB>key<TAB>value"
            char, key, value = line.split('\t')
            if int(char[2:], 16) == ch:
                # python 3 text files already yield str; only python 2
                # needs the explicit UTF-8 decode (mirrors the zgrep variant)
                if PY3:
                    properties[key] = value
                else:
                    properties[key] = unicode(value, 'utf-8')
            elif int(char[2:], 16) > ch:
                # the files are sorted by codepoint - stop reading early
                break
    return properties
291 | |
def get_unihan_properties_zgrep(ch):
    "Look up Unihan properties of ch by shelling out to grep/zgrep/bzgrep."
    properties = {}
    global unihan_fs
    ch = ord(ch)
    chs = 'U+%X' % ch
    for f in unihan_fs:
        # pick the grep flavour matching the file's compression
        if f.endswith('.gz'):
            grepcmd = 'zgrep'
        elif f.endswith('.bz2'):
            grepcmd = 'bzgrep'
        else:
            grepcmd = 'grep'
        # anchor at line start and require a word boundary after the codepoint
        cmdline = ' '.join([grepcmd, '^' + chs + r'\\b', f])
        status, output = cmd.getstatusoutput(cmdline)
        for l in output.split('\n'):
            if not l:
                continue
            char, key, value = l.strip().split('\t')
            codepoint = int(char[2:], 16)
            if codepoint == ch:
                properties[key] = value if PY3 else unicode(value, 'utf-8')
            elif codepoint > ch:
                break
    return properties
319 | |
# Prefer the external-grep implementation when a grep binary exists;
# basic sanity check for systems without one (e.g. MS Windows).
get_unihan_properties = (get_unihan_properties_zgrep
                         if os.path.exists('/bin/grep')
                         else get_unihan_properties_internal)
325 | |
326 | |
def error(txt):
    "Print txt followed by a newline and terminate with exit status 1."
    out(txt)
    out('\n')
    sys.exit(1)
331 | |
def get_gzip_filename(fname):
    """Return the first existing variant of fname.

    Tries fname itself, then fname.gz, then (only when the bz2 module is
    available) fname.bz2; returns None when none of them exist.
    """
    candidates = [fname, fname + '.gz']
    if bz2 is not None:
        candidates.append(fname + '.bz2')
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None
341 | |
342 | |
def OpenGzip(fname):
    """Open fname for reading, transparently trying fname.gz / fname.bz2
    when fname itself does not exist.

    Returns a file object yielding text lines, or None when no suitable
    file is found.  On Python 3 compressed files are opened in text mode
    ('rt') so that callers can match str regexps against the lines, the
    same as for the plain-file branch; on Python 2 the raw GzipFile /
    BZ2File byte streams are kept for backward compatibility.
    """
    if os.path.exists(fname) and not (fname.endswith('.gz') or fname.endswith('.bz2')):
        return open(fname)
    if os.path.exists(fname + '.gz'):
        fname = fname + '.gz'
    elif os.path.exists(fname + '.bz2') and bz2 is not None:
        fname = fname + '.bz2'
    if fname.endswith('.gz'):
        if not os.path.exists(fname):
            return None  # avoid FileNotFoundError from the gzip constructor
        if sys.version_info[0] >= 3:
            return gzip.open(fname, 'rt')
        return gzip.GzipFile(fname)
    elif fname.endswith('.bz2'):
        if not os.path.exists(fname) or bz2 is None:
            return None  # missing file or no bz2 support on this python
        if sys.version_info[0] >= 3:
            return bz2.open(fname, 'rt')
        return bz2.BZ2File(fname)
    return None
356 | |
def GrepInNames(pattern, fillcache=False):
    """Yield characters whose UnicodeData.txt line matches pattern
    (case-insensitively), caching each matching line in linecache.

    With fillcache=True nothing is yielded and only the cache is filled
    (callers drive the generator with list(...)).  When no UnicodeData.txt
    can be found, falls back to scanning unicodedata.name() over the whole
    codepoint range, which is much slower.
    """
    p = re.compile(pattern, re.I)
    f = None
    for name in UnicodeDataFileNames:
        f = OpenGzip(name)
        if f is not None:
            break
    if fillcache:
        if f:
            for l in f:
                if p.search(l):
                    r = myunichr(int(l.split(';')[0], 16))
                    linecache[r] = l
            f.close()
        return
    if not f:
        out("""
Cannot find UnicodeData.txt, please place it into
/usr/share/unidata/UnicodeData.txt,
/usr/share/unicode/UnicodeData.txt, ~/.unicode/ or current
working directory (optionally you can gzip it).
Without the file, searching will be much slower.

""")
        # xrange does not exist on python 3; pick the lazy range once so
        # the loop never materialises a sys.maxunicode-element list
        lazy_range = range if PY3 else xrange
        for i in lazy_range(sys.maxunicode):
            try:
                name = unicodedata.name(unichr(i))
                if p.search(name):
                    yield myunichr(i)
            except ValueError:
                pass
    else:
        for l in f:
            if p.search(l):
                r = myunichr(int(l.split(';')[0], 16))
                linecache[r] = l
                yield r
        f.close()
395 | |
396 | |
def valfromcp(n, cp=None):
    """Interpret the number n as a byte sequence in codepage cp and return
    the list of Unicode codepoints it decodes to.

    With cp unset/empty (the default) no conversion is done and [n] is
    returned.  Raises UnicodeDecodeError when the bytes are not valid in
    cp, and LookupError for an unknown codepage name.
    """
    if not cp:
        return [n]
    xh = '%x' % n
    if len(xh) % 2:  # pad hexadecimal representation to whole bytes
        xh = '0' + xh
    # turn the hex string into the raw byte string it spells; bytearray ->
    # bytes works identically on python 2 (str) and python 3 (bytes),
    # unlike the old unicode(str, cp) call which breaks on python 3
    raw = bytes(bytearray(int(xh[i:i + 2], 16)
                          for i in range(0, len(xh), 2)))
    decoded = raw.decode(cp)
    return [ord(x) for x in decoded]
423 | |
def myunichr(n):
    """Return the character for codepoint n, exiting via error() with a
    helpful message when n does not fit this interpreter's range."""
    try:
        return unichr(n)
    except OverflowError:
        traceback.print_exc()
        error("The codepoint is too big - it does not fit into an int.")
    except ValueError:
        traceback.print_exc()
        err = "The codepoint is too big."
        if sys.maxunicode <= 0xffff:
            # narrow (UCS-2) python build cannot represent astral characters
            err += "\nPerhaps your python interpreter is not compiled with wide unicode characters."
        error(err)
437 | |
438 | |
def guesstype(arg):
    """Guess how the command-line argument arg should be interpreted.

    Returns a (type, value) pair where type is one of 'empty string',
    'string', 'hexadecimal', 'binary' or 'regexp'.  For hexadecimal
    arguments value is the bare hex digits (any U+/u+/U/u prefix is
    stripped so later int(value, 16) calls succeed).
    """
    if not arg:  # empty string
        return 'empty string', arg
    elif not is_ascii(arg):
        return 'string', arg
    elif arg[:2] == 'U+' or arg[:2] == 'u+':  # explicit codepoint notation
        try:
            val = int(arg[2:], 16)
            if val > sys.maxunicode:
                return 'regexp', arg
            else:
                return 'hexadecimal', arg[2:]
        except ValueError:
            return 'regexp', arg
    elif arg[0] in "Uu" and len(arg) > 4:
        try:
            val = int(arg[1:], 16)
            if val > sys.maxunicode:
                return 'regexp', arg
            else:
                # strip the leading U/u: returning the full arg would make
                # process() crash on int(arg, 16)
                return 'hexadecimal', arg[1:]
        except ValueError:
            return 'regexp', arg
    elif len(arg) >= 4:
        # a string of exactly 1-4 bytes' worth of 0/1 digits may be binary
        if len(arg) in (8, 16, 24, 32):
            if all(x in '01' for x in arg):
                val = int(arg, 2)
                if val <= sys.maxunicode:
                    return 'binary', arg
        try:
            val = int(arg, 16)
            if val > sys.maxunicode:
                return 'regexp', arg
            else:
                return 'hexadecimal', arg
        except ValueError:
            return 'regexp', arg
    else:
        return 'string', arg
478 | |
def _append_numeric(arg, base, fromcp, result):
    "Parse arg as a number in the given base and append its character(s) to result."
    val = int(arg, base)
    for v in valfromcp(val, fromcp):
        r = myunichr(v)
        # fill the linecache with this character's properties
        list(GrepInNames('%04X' % v, fillcache=True))
        result.append(r)


def process(arglist, t, fromcp=None):
    """Turn the command-line arguments into the list of characters to show.

    t forces the argument type (None = guess each argument); fromcp
    optionally converts numeric arguments from that codepage.  We build a
    combined list of name-query values so that queries like LATIN ALPHA
    search for LATIN.*ALPHA and not for names that contain either LATIN
    or ALPHA.
    """
    result = []
    names_query = []  # reserved for queries in names - i.e. -r
    bases = {'hexadecimal': 16, 'decimal': 10, 'octal': 8, 'binary': 2}
    for arg_i in arglist:
        if t is None:
            tp, arg = guesstype(arg_i)
            if tp == 'regexp':
                # once one argument is guessed to be a regexp, treat all the
                # following ones as part of the same name query - this is
                # probably what you wanted, e.g. 'unicode cyrillic be'
                # searches for the 'cyrillic.*be' regular expression
                t = 'regexp'
        else:
            tp, arg = t, arg_i
        if tp in bases:
            _append_numeric(arg, bases[tp], fromcp, result)
        elif tp == 'regexp':
            names_query.append(arg)
        elif tp == 'string':
            try:
                if PY3:
                    # argv is automatically decoded into unicode, even
                    # padded with bogus characters if it is not encodable
                    unirepr = arg
                else:
                    unirepr = unicode(arg, options.iocharset)
            except UnicodeDecodeError:
                error("Sequence %s is not valid in charset '%s'." % (repr(arg), options.iocharset))
            # cache properties of every character of the string at once
            unireg = '|'.join('%04X' % ord(x) for x in unirepr)
            list(GrepInNames(unireg, fillcache=True))
            result.extend(unirepr)
        elif tp == 'empty string':
            pass  # do not do anything for an empty string
    if names_query:
        for r in GrepInNames('.*'.join(names_query)):
            result.append(r)
    return result
546 | |
def maybe_colours(colour):
    "Return the ANSI escape for colour when colour output is enabled, else ''."
    return colours[colour] if use_colour else ""
552 | |
# format and print key/value pairs taken from the flat argument list
def printkv(*l):
    """Print (key, value) pairs from the flat list l, colouring the keys
    green; pairs are separated by spaces and the last one ends the line."""
    pairs = list(zip(l[::2], l[1::2]))
    last = len(pairs) - 1
    for idx, (k, v) in enumerate(pairs):
        sep = "\n" if idx == last else " "
        out(maybe_colours('green'))
        out(k)
        out(": ")
        out(maybe_colours('default'))
        out(unicode(v))
        out(sep)
567 | |
def print_characters(clist, maxcount, query_wiki=0):
    """Print a detailed report for every character in clist.

    maxcount   - stop (with a notice) after options.maxcount characters;
                 0 means unlimited
    query_wiki - 0 - don't
                 1 - spawn browser for the very first character
    """
    counter = 0
    for c in clist:

        if query_wiki:
            # wikipedia uses UTF-8 in names; urllib.quote moved to
            # urllib.parse.quote in python 3
            if PY3:
                from urllib.parse import quote as url_quote
            else:
                url_quote = urllib.quote
            wiki_url = 'http://en.wikipedia.org/wiki/' + url_quote(c.encode('utf-8'))
            webbrowser.open(wiki_url)
            query_wiki = 0  # query only the very first character

        if maxcount:
            counter += 1
            if counter > options.maxcount:
                out("\nToo many characters to display, more than %s, use --max option to change it\n" % options.maxcount)
                return
        properties = get_unicode_properties(c)
        out(maybe_colours('bold'))
        out('U+%04X ' % ord(c))
        if properties['name']:
            out(properties['name'])
        else:
            out(maybe_colours('default'))
            out(" - No such unicode character name in database")
        out(maybe_colours('default'))
        out('\n')

        ar = ["UTF-8", ' '.join([("%02x" % ord23(x)) for x in c.encode('utf-8')]),
              "UTF-16BE", ''.join([("%02x" % ord23(x)) for x in c.encode('utf-16be')]),
              "Decimal", "&#%s;" % ord(c)]
        if options.addcharset:
            try:
                # ord23, not ord: python 3 yields ints when iterating bytes
                rep = ' '.join([("%02x" % ord23(x)) for x in c.encode(options.addcharset)])
            except UnicodeError:
                rep = "NONE"
            ar.extend([options.addcharset, rep])
        printkv(*ar)

        # prepend a space so a combining mark has something to attach to
        if properties['combining']:
            pc = " " + c
        else:
            pc = c
        out(pc)
        uppercase = properties['uppercase']
        lowercase = properties['lowercase']
        if uppercase:
            out(" (%s)" % uppercase)
            out('\n')
            printkv("Uppercase", 'U+%04X' % ord(properties['uppercase']))
        elif lowercase:
            out(" (%s)" % properties['lowercase'])
            out('\n')
            printkv("Lowercase", 'U+%04X' % ord(properties['lowercase']))
        else:
            out('\n')
        printkv('Category', properties['category'] + " (%s)" % general_category[properties['category']])

        if properties['numeric_value']:
            printkv('Numeric value', properties['numeric_value'])
        if properties['digit_value']:
            printkv('Digit value', properties['digit_value'])

        bidi = properties['bidi']
        if bidi:
            printkv('Bidi', bidi + " (%s)" % bidi_category[bidi])
        mirrored = properties['mirrored']
        if mirrored:
            out('Character is mirrored\n')
        comb = properties['combining']
        if comb:
            printkv('Combining', str(comb) + " (%s)" % (comb_classes.get(comb, '?')))
        decomp = properties['decomposition']
        if decomp:
            printkv('Decomposition', decomp)
        if options.verbosity > 0:
            # Unihan lookup is slow, only done with -v
            uhp = get_unihan_properties(c)
            for key in uhp:
                printkv(key, uhp[key])
        out('\n')
651 | |
652 | |
def print_block(block):
    "Print a 16x16 table of the 256 characters in the given block number."
    # header row: the low nibble of each column
    out(" " * 10)
    for col in range(16):
        out(".%X " % col)
    out('\n')
    # body: one row of 16 characters per line
    for row in range(block * 16, block * 16 + 16):
        hexi = "%X" % row
        if len(hexi) > 3:
            # wide codepoints: split the 7-digit form for readability
            hexi = "%07X" % row
            hexi = hexi[:4] + " " + hexi[4:]
        else:
            hexi = " %03X" % row
        out(LTR + hexi + ". ")
        for col in range(16):
            ch = unichr(row * 16 + col)
            if unicodedata.combining(ch):
                # give combining marks a base character to attach to
                ch = " " + ch
            out(ch)
            out(' ')
        out('\n')
    out('\n')
676 | |
def print_blocks(blocks):
    "Print every 256-character block in the iterable blocks."
    for blk in blocks:
        print_block(blk)
680 | |
def is_range(s, typ):
    """If s is a character range like 'a..b', return the range of
    256-character block numbers it covers, otherwise False.

    Either end may be omitted and then defaults to the other one.
    Intentionally no fromcp conversion here - ranges are only of unicode
    characters.
    """
    parts = s.split('..')
    if len(parts) != 2:
        return False
    lo_s, hi_s = parts
    if not hi_s:
        hi_s = lo_s
    elif not lo_s:
        lo_s = hi_s
    if not lo_s:
        return False
    low = list(process([lo_s], typ))
    high = list(process([hi_s], typ))
    if len(low) != 1 or len(high) != 1:
        return False
    return range(ord(low[0]) // 256, ord(high[0]) // 256 + 1)
700 | |
701 | |
702 | |
parser = OptionParser(usage="usage: %prog [options] arg")

# the type-forcing flags all store their constant into options.type
for _short, _long, _const, _help in [
        ("-x", "--hexadecimal", 'hexadecimal', "Assume arg to be hexadecimal number"),
        ("-o", "--octal", 'octal', "Assume arg to be octal number"),
        ("-b", "--binary", 'binary', "Assume arg to be binary number"),
        ("-d", "--decimal", 'decimal', "Assume arg to be decimal number"),
        ("-r", "--regexp", 'regexp', "Assume arg to be regular expression"),
        ("-s", "--string", 'string', "Assume arg to be a sequence of characters"),
        ("-a", "--auto", None, "Try to guess arg type (default)")]:
    parser.add_option(_short, _long, action="store_const", const=_const,
                      dest="type", help=_help)
del _short, _long, _const, _help

parser.add_option("-m", "--max",
                  action="store", default=10, dest="maxcount", type="int",
                  help="Maximal number of codepoints to display, default: 10; 0=unlimited")
parser.add_option("-i", "--io",
                  action="store", default=iocharsetguess, dest="iocharset", type="string",
                  help="I/O character set, I am guessing %s" % iocharsetguess)
parser.add_option("--fcp", "--fromcp",
                  action="store", default='', dest="fromcp", type="string",
                  help="Convert numerical arguments from this encoding, default: no conversion")
parser.add_option("-c", "--charset-add",
                  action="store", dest="addcharset", type="string",
                  help="Show hexadecimal reprezentation in this additional charset")
parser.add_option("-C", "--colour",
                  action="store", dest="use_colour", type="string", default="auto",
                  help="Use colours, on, off or auto")
parser.add_option('', "--color",
                  action="store", dest="use_colour", type="string", default="auto",
                  help="synonym for --colour")
parser.add_option("-v", "--verbose",
                  action="count", dest="verbosity", default=0,
                  help="Increase verbosity (reads Unihan properties - slow!)")
parser.add_option("-w", "--wikipedia",
                  action="count", dest="query_wiki", default=0,
                  help="Query wikipedia for the character")
parser.add_option("--list",
                  action="store_const", dest="list_all_encodings", const=True,
                  help="List (approximately) all known encodings")
758 | |
(options, arguments) = parser.parse_args()

linecache = {}
do_init()


# --list: dump the names of the bundled encoding modules and quit
if options.list_all_encodings:
    all_encodings = sorted(set(os.path.splitext(x)[0]
                               for x in os.listdir(os.path.dirname(encodings.__file__))))
    print(textwrap.fill(' '.join(all_encodings)))
    sys.exit()

if not arguments:
    parser.print_help()
    sys.exit()


# resolve the colour mode; anything other than an explicit on/off means
# "only when stdout is a terminal", and never on a plain win32 console
_colour_choice = options.use_colour.lower()
if _colour_choice in ("on", "1", "true", "yes"):
    use_colour = True
elif _colour_choice in ("off", "0", "false", "no"):
    use_colour = False
else:
    use_colour = sys.stdout.isatty()
if sys.platform == 'win32':
    use_colour = False


l_args = []  # list of non range arguments to process
for argum in arguments:
    block_range = is_range(argum, options.type)
    if block_range:
        print_blocks(block_range)
    else:
        l_args.append(argum)

if l_args:
    unihan_fs = []
    if options.verbosity > 0:
        # list of file names for Unihan data file(s), empty if not available
        unihan_fs = get_unihan_files()
        if not unihan_fs:
            out("""
Unihan_*.txt files not found. In order to view Unihan properties,
please place the file into /usr/share/unidata/,
/usr/share/unicode/, ~/.unicode/
or current working directory (optionally you can gzip or bzip2 them).
You can get the files by unpacking ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
Warning, listing UniHan Properties is rather slow.

""")
            options.verbosity = 0
    try:
        print_characters(process(l_args, options.type, options.fromcp),
                         options.maxcount, options.query_wiki)
    except IOError:  # e.g. broken pipe
        pass
815 |