9075
|
#!/usr/bin/python


import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings
import urllib, webbrowser, textwrap

# bz2 was introduced in 2.3, we want this to work also with earlier versions
try:
    import bz2
except ImportError:
    bz2 = None  # sentinel: .bz2 support is simply skipped elsewhere

# for python3: alias the removed 'unicode' builtin to str
try:
    unicode
except NameError:
    unicode = str

# 'any' and 'all' were introduced in python2.5
# dummy replacement for older versions
# NOTE(review): this dummy always returns False, which silently disables the
# binary-number guess in guesstype() on pythons older than 2.5
try:
    all
except NameError:
    all = lambda x: False

PY3 = sys.version_info[0] >= 3
if PY3:
    # getstatusoutput() lives in subprocess on python3
    import subprocess as cmd

    def is_ascii(s):
        "test if string s consists completely of ascii characters (python 3)"
        try:
            s.encode('ascii')
        except UnicodeEncodeError:
            return False
        return True

    def out(*args):
        "print args, converting them to output charset (reads global options.iocharset)"
        for i in args:
            sys.stdout.flush()
            # write raw bytes, bypassing sys.stdout's own encoder
            sys.stdout.buffer.write(i.encode(options.iocharset, 'replace'))

    # ord23 is used to convert elements of byte array in python3, which are integers
    ord23 = lambda x: x

    # unichr is not in python3
    unichr = chr

else: # python2

    # getoutput() and getstatusoutput() methods have
    # been moved from commands to the subprocess module
    # with Python >= 3.x
    import commands as cmd

    def is_ascii(s):
        "test if string s consists completely of ascii characters (python 2)"
        try:
            unicode(s, 'ascii')
        except UnicodeDecodeError:
            return False
        return True

    def out(*args):
        "print args, converting them to output charset (reads global options.iocharset)"
        for i in args:
            sys.stdout.write(i.encode(options.iocharset, 'replace'))

    # byte-string elements are 1-char strings on python2; take their ordinal
    ord23 = ord




from optparse import OptionParser

VERSION='0.9.7'


# list of terminals that support bidi
biditerms = ['mlterm']

try:
    locale.setlocale(locale.LC_ALL, '')
except locale.Error:
    pass

# guess terminal charset
try:
    iocharsetguess = locale.nl_langinfo(locale.CODESET) or "ascii"
except locale.Error:
    iocharsetguess = "ascii"

# on bidi-capable UTF terminals, force left-to-right output of the tables
if os.environ.get('TERM') in biditerms and iocharsetguess.lower().startswith('utf'):
    LTR = u'\u202d' # left to right override
else:
    LTR = ''
|
|
97
|
|
98
|
|
# ANSI terminal escape sequences, keyed by human-readable name;
# looked up through maybe_colours() so they can be disabled wholesale
colours = {
    'none'       :    "",
    'default'    :    "\033[0m",
    'bold'       :    "\033[1m",
    'underline'  :    "\033[4m",
    'blink'      :    "\033[5m",
    'reverse'    :    "\033[7m",
    'concealed'  :    "\033[8m",

    'black'      :    "\033[30m",
    'red'        :    "\033[31m",
    'green'      :    "\033[32m",
    'yellow'     :    "\033[33m",
    'blue'       :    "\033[34m",
    'magenta'    :    "\033[35m",
    'cyan'       :    "\033[36m",
    'white'      :    "\033[37m",

    'on_black'   :    "\033[40m",
    'on_red'     :    "\033[41m",
    'on_green'   :    "\033[42m",
    'on_yellow'  :    "\033[43m",
    'on_blue'    :    "\033[44m",
    'on_magenta' :    "\033[45m",
    'on_cyan'    :    "\033[46m",
    'on_white'   :    "\033[47m",

    'beep'       :    "\007",
}
|
|
128
|
|
129
|
|
# Unicode general category abbreviations -> human readable descriptions
general_category = {
    'Lu': 'Letter, Uppercase',
    'Ll': 'Letter, Lowercase',
    'Lt': 'Letter, Titlecase',
    'Lm': 'Letter, Modifier',
    'Lo': 'Letter, Other',
    'Mn': 'Mark, Non-Spacing',
    'Mc': 'Mark, Spacing Combining',
    'Me': 'Mark, Enclosing',
    'Nd': 'Number, Decimal Digit',
    'Nl': 'Number, Letter',
    'No': 'Number, Other',
    'Pc': 'Punctuation, Connector',
    'Pd': 'Punctuation, Dash',
    'Ps': 'Punctuation, Open',
    'Pe': 'Punctuation, Close',
    'Pi': 'Punctuation, Initial quote',
    'Pf': 'Punctuation, Final quote',
    'Po': 'Punctuation, Other',
    'Sm': 'Symbol, Math',
    'Sc': 'Symbol, Currency',
    'Sk': 'Symbol, Modifier',
    'So': 'Symbol, Other',
    'Zs': 'Separator, Space',
    'Zl': 'Separator, Line',
    'Zp': 'Separator, Paragraph',
    'Cc': 'Other, Control',
    'Cf': 'Other, Format',
    'Cs': 'Other, Surrogate',
    'Co': 'Other, Private Use',
    'Cn': 'Other, Not Assigned',
}
|
|
162
|
|
# Unicode bidirectional category abbreviations -> human readable descriptions
bidi_category = {
    'L'   : 'Left-to-Right',
    'LRE' : 'Left-to-Right Embedding',
    'LRO' : 'Left-to-Right Override',
    'R'   : 'Right-to-Left',
    'AL'  : 'Right-to-Left Arabic',
    'RLE' : 'Right-to-Left Embedding',
    'RLO' : 'Right-to-Left Override',
    'PDF' : 'Pop Directional Format',
    'EN'  : 'European Number',
    'ES'  : 'European Number Separator',
    'ET'  : 'European Number Terminator',
    'AN'  : 'Arabic Number',
    'CS'  : 'Common Number Separator',
    'NSM' : 'Non-Spacing Mark',
    'BN'  : 'Boundary Neutral',
    'B'   : 'Paragraph Separator',
    'S'   : 'Segment Separator',
    'WS'  : 'Whitespace',
    'ON'  : 'Other Neutrals',
}
|
|
184
|
|
# canonical combining class numbers -> human readable descriptions
comb_classes = {
    0: 'Spacing, split, enclosing, reordrant, and Tibetan subjoined',
    1: 'Overlays and interior',
    7: 'Nuktas',
    8: 'Hiragana/Katakana voicing marks',
    9: 'Viramas',
    10: 'Start of fixed position classes',
    199: 'End of fixed position classes',
    200: 'Below left attached',
    202: 'Below attached',
    204: 'Below right attached',
    208: 'Left attached (reordrant around single base character)',
    210: 'Right attached',
    212: 'Above left attached',
    214: 'Above attached',
    216: 'Above right attached',
    218: 'Below left',
    220: 'Below',
    222: 'Below right',
    224: 'Left (reordrant around single base character)',
    226: 'Right',
    228: 'Above left',
    230: 'Above',
    232: 'Above right',
    233: 'Double below',
    234: 'Double above',
    240: 'Below (iota subscript)',
}
|
|
213
|
|
214
|
|
215
|
|
def get_unicode_properties(ch):
    """Return a dict of character properties for the single character ch.

    When ch is present in the global linecache (filled by GrepInNames from
    UnicodeData.txt), the properties are parsed from the cached line;
    otherwise they come from the unicodedata module, which lacks
    unicode1name/iso_comment and the titlecase mapping.
    """
    properties = {}
    if ch in linecache:
        # parse the semicolon-separated fields of a UnicodeData.txt line
        fields = linecache[ch].strip().split(';')
        proplist = ['codepoint', 'name', 'category', 'combining', 'bidi', 'decomposition', 'dummy', 'digit_value', 'numeric_value', 'mirrored', 'unicode1name', 'iso_comment', 'uppercase', 'lowercase', 'titlecase']
        for i, prop in enumerate(proplist):
            if prop!='dummy':
                properties[prop] = fields[i]

        # case mappings are stored as hexadecimal codepoints;
        # convert to actual characters (empty string = no mapping)
        if properties['lowercase']:
            properties['lowercase'] = unichr(int(properties['lowercase'], 16))
        if properties['uppercase']:
            properties['uppercase'] = unichr(int(properties['uppercase'], 16))
        if properties['titlecase']:
            properties['titlecase'] = unichr(int(properties['titlecase'], 16))

        properties['combining'] = int(properties['combining'])
        properties['mirrored'] = properties['mirrored']=='Y'
    else:
        # fall back to the data compiled into the python interpreter
        properties['codepoint'] = '%04X' % ord(ch)
        properties['name'] = unicodedata.name(ch, '')
        properties['category'] = unicodedata.category(ch)
        properties['combining'] = unicodedata.combining(ch)
        properties['bidi'] = unicodedata.bidirectional(ch)
        properties['decomposition'] = unicodedata.decomposition(ch)
        properties['digit_value'] = unicodedata.digit(ch, '')
        properties['numeric_value'] = unicodedata.numeric(ch, '')
        properties['mirrored'] = unicodedata.mirrored(ch)
        # not available through the unicodedata module
        properties['unicode1name'] = ''
        properties['iso_comment'] = ''
        properties['uppercase'] = ch.upper()
        properties['lowercase'] = ch.lower()
        properties['titlecase'] = ''
    return properties
|
|
250
|
|
251
|
|
def do_init():
    "fill the global lists of candidate UnicodeData/Unihan data file locations"
    home_dir = os.path.expanduser('~/.unicode')

    global UnicodeDataFileNames
    # fixed candidate paths first, then whatever the globs discover
    UnicodeDataFileNames = [
        os.path.join(home_dir, "UnicodeData.txt"),
        '/usr/share/unicode/UnicodeData.txt',
        '/usr/share/unidata/UnicodeData.txt',
        '/hackenv/share/UnicodeData.txt',
    ]
    UnicodeDataFileNames += glob.glob('/usr/share/unidata/UnicodeData*.txt')
    UnicodeDataFileNames += glob.glob('/usr/share/perl/*/unicore/UnicodeData.txt')
    # for MacOSX
    UnicodeDataFileNames += glob.glob('/System/Library/Perl/*/unicore/UnicodeData.txt')

    global UnihanDataGlobs
    UnihanDataGlobs = [
        os.path.join(home_dir, "Unihan*"),
        '/usr/share/unidata/Unihan*',
        '/usr/share/unicode/Unihan*',
        './Unihan*',
    ]
|
|
264
|
|
265
|
|
def get_unihan_files():
    "return the list of Unihan data file names matching any of the global UnihanDataGlobs patterns"
    found = []
    for pattern in UnihanDataGlobs:
        found.extend(glob.glob(pattern))
    return found
|
|
272
|
|
def get_unihan_properties_internal(ch):
    """Look up Unihan properties of character ch by scanning the Unihan
    data files line by line (slow fallback when grep is unavailable).

    Returns a dict mapping Unihan property names to text values.
    Relies on the global unihan_fs list (filled by get_unihan_files) and
    on Unihan files being sorted by codepoint, which allows an early break.
    """
    properties = {}
    ch = ord(ch)
    global unihan_fs
    for f in unihan_fs:
        fo = OpenGzip(f)
        try:
            for l in fo:
                # gzip/bz2 files yield bytes on python3; normalise to text
                # (the original crashed on them and on unicode(value) below)
                if not isinstance(l, str):
                    l = l.decode('utf-8')
                if l.startswith('#'):
                    continue
                line = l.strip()
                if not line:
                    continue
                char, key, value = line.split('\t')
                if int(char[2:], 16) == ch:
                    # decode byte strings (python2); text stays as is,
                    # matching the PY3 handling in get_unihan_properties_zgrep
                    if isinstance(value, bytes):
                        value = value.decode('utf-8')
                    properties[key] = value
                elif int(char[2:], 16) > ch:
                    # files are sorted by codepoint - no need to read further
                    break
        finally:
            # the original leaked the file handle on every iteration
            fo.close()
    return properties
|
|
291
|
|
def get_unihan_properties_zgrep(ch):
    """Look up Unihan properties of character ch by running (z|bz)grep on
    the Unihan data files; much faster than the pure-python scan.

    Returns a dict mapping Unihan property names to text values.
    Relies on the global unihan_fs list of Unihan file names.
    """
    properties = {}
    global unihan_fs
    ch = ord(ch)
    chs = 'U+%X' % ch
    for f in unihan_fs:
        # pick the grep flavour matching the file's compression
        if f.endswith('.gz'):
            grepcmd = 'zgrep'
        elif f.endswith('.bz2'):
            grepcmd = 'bzgrep'
        else:
            grepcmd = 'grep'
        # NOTE(review): r'\\b' sends a literal backslash-backslash-b through
        # the shell, which grep then sees as the \b word-boundary anchor
        cmdline = grepcmd+' ^'+chs+r'\\b '+f
        status, output = cmd.getstatusoutput(cmdline)
        output = output.split('\n')
        for l in output:
            if not l:
                continue
            char, key, value = l.strip().split('\t')
            if int(char[2:], 16) == ch:
                if PY3:
                    properties[key] = value
                else:
                    properties[key] = unicode(value, 'utf-8')
            elif int(char[2:], 16)>ch:
                # Unihan files are sorted by codepoint - stop early
                break
    return properties
|
|
319
|
|
# basic sanity check, if e.g. you run this on MS Windows...
# prefer the external (z)grep binaries for speed when available,
# otherwise fall back to the pure-python scan
if os.path.exists('/bin/grep'):
    get_unihan_properties = get_unihan_properties_zgrep
else:
    get_unihan_properties = get_unihan_properties_internal
|
|
325
|
|
326
|
|
def error(txt):
    "print txt followed by a newline on the converted output, then terminate with exit status 1"
    out(txt)
    out('\n')
    sys.exit(1)
|
|
331
|
|
def get_gzip_filename(fname):
    """Return the first existing variant of fname: the plain name, then
    fname+'.gz', then fname+'.bz2' (the latter only when the bz2 module
    is available); None when none of them exists."""
    candidates = [fname, fname + '.gz']
    if bz2 is not None:
        candidates.append(fname + '.bz2')
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None
|
|
341
|
|
342
|
|
def OpenGzip(fname):
    """Open fname, trying fname.gz or fname.bz2 if fname does not exist;
    return a file object, GzipFile or BZ2File object, or None when nothing
    suitable is found.

    NOTE(review): on python3, GzipFile/BZ2File yield bytes while a plain
    open() yields str - callers have to cope with both.
    """
    # a plain, uncompressed file that exists wins outright
    if os.path.exists(fname) and not (fname.endswith('.gz') or fname.endswith('.bz2')):
        return open(fname)
    if os.path.exists(fname+'.gz'):
        fname = fname+'.gz'
    elif os.path.exists(fname+'.bz2') and bz2 is not None:
        fname = fname+'.bz2'
    if fname.endswith('.gz'):
        return gzip.GzipFile(fname)
    elif fname.endswith('.bz2'):
        return bz2.BZ2File(fname)
    return None
|
|
356
|
|
def GrepInNames(pattern, fillcache=False):
    """Generator: yield the characters whose UnicodeData.txt line matches
    pattern (case-insensitive), caching each matching line in the global
    linecache for get_unicode_properties().

    With fillcache=True nothing is yielded - the cache is only filled.
    When no UnicodeData.txt file can be found, fall back to scanning
    unicodedata.name() over all codepoints (much slower; cache stays empty).
    """
    p = re.compile(pattern, re.I)
    f = None
    for name in UnicodeDataFileNames:
        f = OpenGzip(name)
        if f is not None:
            break
    if f is None:
        # slow path: no data file; only useful when we really need to yield
        if not fillcache:
            out( """
Cannot find UnicodeData.txt, please place it into
/usr/share/unidata/UnicodeData.txt,
/usr/share/unicode/UnicodeData.txt, ~/.unicode/ or current
working directory (optionally you can gzip it).
Without the file, searching will be much slower.

""" )
            # xrange does not exist on python3 - the original raised
            # NameError here; fall back to range
            try:
                codepoints = xrange(sys.maxunicode)
            except NameError:
                codepoints = range(sys.maxunicode)
            for i in codepoints:
                try:
                    name = unicodedata.name(unichr(i))
                    if re.search(p, name):
                        yield myunichr(i)
                except ValueError:
                    pass
    else:
        for l in f:
            # compressed files yield bytes on python3; normalise to text so
            # the str pattern and the str-keyed linecache keep working
            if not isinstance(l, str):
                l = l.decode('utf-8')
            if re.search(p, l):
                r = myunichr(int(l.split(';')[0], 16))
                linecache[r] = l
                if not fillcache:
                    yield r
        f.close()
|
|
395
|
|
396
|
|
def valfromcp(n, cp=None):
    """Interpret the number n as a byte sequence in codepage cp and return
    the list of unicode codepoints it decodes to.

    When cp is None or empty, no conversion is done and [n] is returned.
    Example: valfromcp(0xC3A9, 'utf-8') == [0xE9]  (utf-8 bytes of U+00E9)
    """
    if not cp:
        return [n]
    xh = '%x' % n
    if len(xh) % 2: # pad hexadecimal representation with a zero
        xh = '0' + xh
    # split the hex string into byte values
    byte_values = [int(xh[i:i + 2], 16) for i in range(0, len(xh), 2)]
    if sys.version_info[0] >= 3:
        # build real bytes and decode; the original called unicode(str, cp)
        # which is a TypeError on python3
        s = bytes(byte_values).decode(cp)
    else:
        s = unicode(''.join(chr(b) for b in byte_values), cp)
    return [ord(x) for x in s]
|
|
423
|
|
def myunichr(n):
    "unichr() wrapper: return the character for codepoint n, or die with a friendly message when it is out of range"
    try:
        return unichr(n)
    except OverflowError:
        traceback.print_exc()
        error("The codepoint is too big - it does not fit into an int.")
    except ValueError:
        traceback.print_exc()
        msg = "The codepoint is too big."
        if sys.maxunicode <= 0xffff:
            msg += "\nPerhaps your python interpreter is not compiled with wide unicode characters."
        error(msg)
|
|
437
|
|
438
|
|
def guesstype(arg):
    """Classify a command line argument; return (type, normalised argument).

    type is one of 'empty string', 'string', 'hexadecimal', 'binary'
    or 'regexp'.
    """
    if not arg:
        return 'empty string', arg
    if not is_ascii(arg):
        # non-ascii input can only be a literal character sequence
        return 'string', arg
    if arg[:2] in ('U+', 'u+'):
        # explicit U+XXXX notation
        try:
            if int(arg[2:], 16) > sys.maxunicode:
                return 'regexp', arg
            return 'hexadecimal', arg[2:]
        except ValueError:
            return 'regexp', arg
    if arg[0] in "Uu" and len(arg) > 4:
        # uXXXX notation
        try:
            if int(arg[1:], 16) > sys.maxunicode:
                return 'regexp', arg
            return 'hexadecimal', arg
        except ValueError:
            return 'regexp', arg
    if len(arg) >= 4:
        # strings of 0/1 in byte-multiples of length may be binary numbers
        if len(arg) in (8, 16, 24, 32) and all(x in '01' for x in arg):
            if int(arg, 2) <= sys.maxunicode:
                return 'binary', arg
        try:
            if int(arg, 16) > sys.maxunicode:
                return 'regexp', arg
            return 'hexadecimal', arg
        except ValueError:
            return 'regexp', arg
    return 'string', arg
|
|
478
|
|
def process(arglist, t, fromcp=None):
    """Convert the list of string arguments into a list of characters to
    display.

    t      - forced argument type (None = guess each argument)
    fromcp - optional source encoding for numeric arguments

    Regexp arguments are joined with '.*' so that e.g. 'unicode cyrillic be'
    searches for the 'cyrillic.*be' regular expression and not for names
    that contain either 'cyrillic' or 'be'.
    """
    result = []
    names_query = [] # reserved for queries in names - i.e. -r
    # numeric bases for the typed numeric arguments; the original spelled
    # these four branches out as copy-pasted code
    bases = {'hexadecimal': 16, 'decimal': 10, 'octal': 8, 'binary': 2}
    for arg_i in arglist:
        if t is None:
            tp, arg = guesstype(arg_i)
            if tp == 'regexp':
                # if the first argument is guessed to be a regexp, add
                # all the following arguments to the regular expression -
                # this is probably what you wanted
                t = 'regexp'
        else:
            tp, arg = t, arg_i
        if tp in bases:
            val = int(arg, bases[tp])
            for val in valfromcp(val, fromcp):
                r = myunichr(val)
                # fill the table with character properties
                list(GrepInNames('%04X' % val, fillcache=True))
                result.append(r)
        elif tp == 'regexp':
            names_query.append(arg)
        elif tp == 'string':
            try:
                if PY3: # argv is automatically decoded into unicode, even padded with bogus character if it is not encodable
                    unirepr = arg
                else:
                    unirepr = unicode(arg, options.iocharset)
            except UnicodeDecodeError:
                error ("Sequence %s is not valid in charset '%s'." % (repr(arg), options.iocharset))
            # pre-fill the cache for every character of the string at once
            unireg = '|'.join('%04X' % ord(x) for x in unirepr)
            list(GrepInNames(unireg, fillcache=True))
            result.extend(unirepr)
        elif tp == 'empty string':
            pass # do not do anything for an empty string
    if names_query:
        query = '.*'.join(names_query)
        for r in GrepInNames(query):
            result.append(r)
    return result
|
|
546
|
|
def maybe_colours(colour):
    "return the ANSI escape sequence for colour, or '' when colours are disabled (global use_colour)"
    return colours[colour] if use_colour else ""
|
|
552
|
|
# format key and value
def printkv(*l):
    "print alternating key/value arguments: keys in green, values in default colour; a newline follows the last pair, spaces separate the others"
    pairs = list(zip(l[::2], l[1::2]))
    last = len(pairs) - 1
    for idx, (k, v) in enumerate(pairs):
        sep = "\n" if idx == last else " "
        out(maybe_colours('green'))
        out(k)
        out(": ")
        out(maybe_colours('default'))
        out(unicode(v))
        out(sep)
|
|
567
|
|
def print_characters(clist, maxcount, query_wiki=0):
    """Print a detailed report for every character in clist.

    maxcount   - truthy = limit the number of characters displayed
                 (the limit itself is read from the global options.maxcount)
    query_wiki - 0 - don't
                 1 - spawn browser for the very first character
    """
    counter = 0
    for c in clist:

        if query_wiki:
            # wikipedia uses UTF-8 in names; urllib.quote moved to
            # urllib.parse.quote on python3 (the original crashed here)
            try:
                quote = urllib.quote
            except AttributeError:
                from urllib.parse import quote
            ch = quote(c.encode('utf-8'))
            wiki_url = 'http://en.wikipedia.org/wiki/'+ch
            webbrowser.open(wiki_url)
            query_wiki = 0 # query only the very first character


        if maxcount:
            counter += 1
            if counter > options.maxcount:
                out("\nToo many characters to display, more than %s, use --max option to change it\n" % options.maxcount)
                return
        properties = get_unicode_properties(c)
        # header: codepoint and character name in bold
        out(maybe_colours('bold'))
        out('U+%04X '% ord(c))
        if properties['name']:
            out(properties['name'])
        else:
            out(maybe_colours('default'))
            out(" - No such unicode character name in database")
        out(maybe_colours('default'))
        out('\n')

        # encodings of the character
        ar = ["UTF-8", ' '.join([("%02x" % ord23(x)) for x in c.encode('utf-8')]) ,
              "UTF-16BE", ''.join([("%02x" % ord23(x)) for x in c.encode('utf-16be')]),
              "Decimal", "&#%s;" % ord(c) ]
        if options.addcharset:
            try:
                rep = ' '.join([("%02x" % ord(x)) for x in c.encode(options.addcharset)] )
            except UnicodeError:
                rep = "NONE" # not encodable in the requested charset
            ar.extend( [options.addcharset, rep] )
        printkv(*ar)


        # the character itself; combining marks get a space to attach to
        if properties['combining']:
            pc = " "+c
        else:
            pc = c
        out(pc)
        uppercase = properties['uppercase']
        lowercase = properties['lowercase']
        if uppercase:
            out(" (%s)" % uppercase)
            out('\n')
            printkv( "Uppercase", 'U+%04X'% ord(properties['uppercase']) )
        elif lowercase:
            out(" (%s)" % properties['lowercase'])
            out('\n')
            printkv( "Lowercase", 'U+%04X'% ord(properties['lowercase']) )
        else:
            out('\n')
        printkv( 'Category', properties['category']+ " (%s)" % general_category[properties['category']] )

        if properties['numeric_value']:
            printkv( 'Numeric value', properties['numeric_value'])
        if properties['digit_value']:
            printkv( 'Digit value', properties['digit_value'])

        bidi = properties['bidi']
        if bidi:
            printkv( 'Bidi', bidi+ " (%s)" % bidi_category[bidi] )
        mirrored = properties['mirrored']
        if mirrored:
            out('Character is mirrored\n')
        comb = properties['combining']
        if comb:
            printkv( 'Combining', str(comb)+ " (%s)" % (comb_classes.get(comb, '?')) )
        decomp = properties['decomposition']
        if decomp:
            printkv( 'Decomposition', decomp )
        if options.verbosity>0:
            # slow: scans/greps the Unihan data files
            uhp = get_unihan_properties(c)
            for key in uhp:
                printkv(key, uhp[key])
        out('\n')
|
|
651
|
|
652
|
|
def print_block(block):
    """Print a 16x16 table of the 256 characters in the given block
    (block = codepoint // 256), with hexadecimal row and column headers."""
    #header
    out(" "*10)
    for i in range(16):
        out(".%X " % i)
    out('\n')
    #body
    for i in range(block*16, block*16+16):
        # row label: the high part of the codepoint in hex
        hexi = "%X" % i
        if len(hexi)>3:
            # codepoints above U+FFFF: pad to 7 digits and split into groups
            hexi = "%07X" % i
            hexi = hexi[:4]+" "+hexi[4:]
        else:
            hexi = " %03X" % i
        # LTR is the left-to-right override on bidi terminals, '' otherwise
        out(LTR+hexi+". ")
        for j in range(16):
            c = unichr(i*16+j)
            if unicodedata.combining(c):
                # prepend a space so the combining mark has a base to attach to
                c = " "+c
            out(c)
            out(' ')
        out('\n')
    out('\n')
|
|
676
|
|
def print_blocks(blocks):
    "print the character table of every 256-character block in the list"
    for block_no in blocks:
        print_block(block_no)
|
|
680
|
|
def is_range(s, typ):
    """If s looks like a range 'start..end', return the range of
    256-character block numbers it covers; otherwise return False.

    A one-sided range ('x..' or '..x') degenerates to the block of x.
    """
    parts = s.split('..')
    if len(parts) != 2:
        return False
    lo_s, hi_s = parts
    # one-sided ranges: copy the present endpoint to the missing one
    if not hi_s:
        hi_s = lo_s
    if not lo_s:
        lo_s = hi_s
    if not lo_s:
        return False
    # intentionally no fromcp here, ranges are only of unicode characters
    lo_chars = list(process([lo_s], typ))
    hi_chars = list(process([hi_s], typ))
    if len(lo_chars) != 1 or len(hi_chars) != 1:
        # each endpoint must resolve to exactly one character
        return False
    first_block = ord(lo_chars[0]) // 256
    last_block = ord(hi_chars[0]) // 256 + 1
    return range(first_block, last_block)
|
|
700
|
|
701
|
|
702
|
|
# command line interface

parser = OptionParser(usage="usage: %prog [options] arg")
parser.add_option("-x", "--hexadecimal",
      action="store_const", const='hexadecimal', dest="type",
      help="Assume arg to be hexadecimal number")
parser.add_option("-o", "--octal",
      action="store_const", const='octal', dest="type",
      help="Assume arg to be octal number")
parser.add_option("-b", "--binary",
      action="store_const", const='binary', dest="type",
      help="Assume arg to be binary number")
parser.add_option("-d", "--decimal",
      action="store_const", const='decimal', dest="type",
      help="Assume arg to be decimal number")
parser.add_option("-r", "--regexp",
      action="store_const", const='regexp', dest="type",
      help="Assume arg to be regular expression")
parser.add_option("-s", "--string",
      action="store_const", const='string', dest="type",
      help="Assume arg to be a sequence of characters")
parser.add_option("-a", "--auto",
      action="store_const", const=None, dest="type",
      help="Try to guess arg type (default)")
parser.add_option("-m", "--max",
      action="store", default=10, dest="maxcount", type="int",
      help="Maximal number of codepoints to display, default: 10; 0=unlimited")
parser.add_option("-i", "--io",
      action="store", default=iocharsetguess, dest="iocharset", type="string",
      help="I/O character set, I am guessing %s" % iocharsetguess)
parser.add_option("--fcp", "--fromcp",
      action="store", default='', dest="fromcp", type="string",
      help="Convert numerical arguments from this encoding, default: no conversion")
parser.add_option("-c", "--charset-add",
      action="store", dest="addcharset", type="string",
      help="Show hexadecimal reprezentation in this additional charset")
parser.add_option("-C", "--colour",
      action="store", dest="use_colour", type="string",
      default="auto",
      help="Use colours, on, off or auto")
parser.add_option('', "--color",
      action="store", dest="use_colour", type="string",
      default="auto",
      help="synonym for --colour")
parser.add_option("-v", "--verbose",
      action="count", dest="verbosity",
      default=0,
      help="Increase verbosity (reads Unihan properties - slow!)")
parser.add_option("-w", "--wikipedia",
      action="count", dest="query_wiki",
      default=0,
      help="Query wikipedia for the character")
parser.add_option("--list",
      action="store_const", dest="list_all_encodings",
      const=True,
      help="List (approximately) all known encodings")


(options, arguments) = parser.parse_args()

# global cache of UnicodeData.txt lines, keyed by character;
# filled by GrepInNames, read by get_unicode_properties
linecache = {}
do_init()


if options.list_all_encodings:
    # derive the list of encodings from the modules in the encodings package
    all_encodings = os.listdir(os.path.dirname(encodings.__file__))
    all_encodings = set([os.path.splitext(x)[0] for x in all_encodings])
    all_encodings = list(all_encodings)
    all_encodings.sort()
    print (textwrap.fill(' '.join(all_encodings)))
    sys.exit()

if len(arguments)==0:
    parser.print_help()
    sys.exit()


# decide whether ANSI colours are used (explicit on/off, otherwise
# only on a tty and never on win32)
if options.use_colour.lower() in ("on", "1", "true", "yes"):
    use_colour = True
elif options.use_colour.lower() in ("off", "0", "false", "no"):
    use_colour = False
else:
    use_colour = sys.stdout.isatty()
    if sys.platform == 'win32':
        use_colour = False


# ranges ('x..y') are printed as block tables right away,
# everything else is collected for detailed per-character output
l_args = [] # list of non range arguments to process
for argum in arguments:
    is_r = is_range(argum, options.type)
    if is_r:
        print_blocks(is_r)
    else:
        l_args.append(argum)

if l_args:
    unihan_fs = []
    if options.verbosity>0:
        unihan_fs = get_unihan_files() # list of file names for Unihan data file(s), empty if not available
        if not unihan_fs:
            out( """
Unihan_*.txt files not found. In order to view Unihan properties,
please place the file into /usr/share/unidata/,
/usr/share/unicode/, ~/.unicode/
or current working directory (optionally you can gzip or bzip2 them).
You can get the files by unpacking ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
Warning, listing UniHan Properties is rather slow.

""")
            # no Unihan data - silently drop back to non-verbose mode
            options.verbosity = 0
    try:
        print_characters(process(l_args, options.type, options.fromcp), options.maxcount, options.query_wiki)
    except IOError: # e.g. broken pipe
        pass
|