diff interps/lambda/tokenizer.py @ 996:859f9b4339e6

<Gregor> tar xf egobot.tar.xz
author HackBot
date Sun, 09 Dec 2012 19:30:08 +0000
parents
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interps/lambda/tokenizer.py	Sun Dec 09 19:30:08 2012 +0000
@@ -0,0 +1,60 @@
+symbols = '()\.,=;'  # characters that form single-character tokens
+
+class TokenizerException(Exception):
+    pass
+
+
+def tokenize(s):
+    tokens = []
+    current = ''
+    state = 'begin'  # one of: 'begin', 'name', 'string', 'comment', 'special'
+    index = 0
+    s = s + ' ' # a cheesy way to simplify token termination conditions
+    while index < len(s):
+        c = s[index]
+        index += 1
+        if state == 'begin':
+            current = c
+            if c.isspace():
+                continue
+            if c.isalpha():
+                state = 'name'
+                continue
+            if c == '{':
+                state = 'comment'
+                continue
+            if c == '"':
+                state = 'string'
+                current = ''
+                continue
+            if c == '#':  # start of a "special function" token
+                state = 'special'
+                continue
+            if c in symbols:  # guaranteed single-character token
+                tokens.append(c)
+                continue
+            raise TokenizerException("Tokenizer can't comprehend '" + c + "'")
+        if state == 'name':
+            if c.isalpha() or c.isdigit() or c == '_' or c == '-':
+                current += c
+            else:
+                tokens.append(current)
+                state = 'begin'
+                index -= 1  # re-read the terminating character in the 'begin' state
+            continue
+        if state == 'string':
+            current += c  # accumulate the body, including the closing quote
+            if c == '"' and current != '"':  # note: an empty string "" never closes here
+                tokens.append('"' + current)  # re-attach the opening quote
+                state = 'begin'
+        if state == 'comment':  # skip everything up to the first '}'
+            if c == '}':
+                state = 'begin'
+        if state == 'special':  # '#' followed by a run of letters
+            if (c == '#' and current == '') or c.isalpha():
+                current += c
+            else:
+                tokens.append(current)
+                state = 'begin'
+                index -= 1
+    return tokens
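
A rough usage sketch (not part of this changeset), assuming the file above is importable as tokenizer from the same directory; the '#print' name is only illustrative:

    from tokenizer import tokenize

    # names, symbols, a comment, a string literal, and a '#' special token
    toks = tokenize('f = \\x. (g x); {a comment} "hi" #print')
    print(toks)
    # ['f', '=', '\\', 'x', '.', '(', 'g', 'x', ')', ';', '"hi"', '#print']

Comments are dropped entirely, string tokens keep both quotes, and each character from the symbols set becomes its own token.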