996
|
# Characters that always stand alone as single-character tokens.
symbols = "()\\.,=;"
|
|
2
|
|
class TokenizerException(Exception):
    """Raised when the tokenizer meets a character it cannot classify."""
|
|
5
|
|
6
|
|
def tokenize(s, symbols='()\\.,=;'):
    """Split source string *s* into a flat list of string tokens.

    Token kinds produced:
      * names:    leading letter, then letters/digits/'_'/'-'
      * strings:  double-quoted; the returned token keeps both quotes
      * specials: '#' followed by letters (e.g. '#foo')
      * symbols:  any single character in *symbols*
    '{ ... }' comments and whitespace are discarded.

    *symbols* defaults to the module's symbol set; passing a different
    string generalizes the single-character token alphabet.

    Raises TokenizerException on any character it cannot classify.
    NOTE(review): an unterminated string or comment at end of input is
    silently dropped (original behavior, preserved).
    """
    tokens = []
    current = ''          # characters accumulated for the token in progress
    state = 'begin'
    index = 0
    s = s + ' '  # a cheesy way to simplify token termination conditions
    while index < len(s):
        c = s[index]
        index += 1
        if state == 'begin':
            current = c
            if c.isspace():
                continue
            if c.isalpha():
                state = 'name'
                continue
            if c == '{':
                state = 'comment'
                continue
            if c == '"':
                state = 'string'
                current = ''
                continue
            if c == '#':  # a "special function"
                state = 'special'
                continue
            if c in symbols:  # guaranteed single-character token
                tokens.append(c)
                continue
            raise TokenizerException("Tokenizer can't comprehend '" + c + "'")
        if state == 'name':
            if c.isalpha() or c.isdigit() or c == '_' or c == '-':
                current += c
            else:
                tokens.append(current)
                state = 'begin'
                index -= 1  # re-scan the terminator from the 'begin' state
            continue
        if state == 'string':
            current += c
            if c == '"':
                # BUG FIX: the original guard was
                #   c == '"' and current != '"'
                # which made the empty string literal '""' impossible to
                # terminate — the tokenizer stayed in 'string' state and
                # silently mis-tokenized everything after it. Any closing
                # quote ends the string.
                tokens.append('"' + current)
                state = 'begin'
        if state == 'comment':
            if c == '}':
                state = 'begin'
        if state == 'special':
            # NOTE(review): the `c == '#' and current == ''` clause looks
            # unreachable — 'begin' always sets current = c = '#' before
            # entering this state — so '##name' yields ['#', '#name'].
            # Kept byte-identical; confirm intent before changing.
            if (c == '#' and current == '') or c.isalpha():
                current += c
            else:
                tokens.append(current)
                state = 'begin'
                index -= 1  # re-scan the terminator from the 'begin' state
    return tokens
|