view interps/lambda/tokenizer.py @ 12253:ad5c5d1b7d04 draft

<oerjan> t sled lib/karma//s/egrep.*>/egrep -x \'<[^>]*>/
author HackEso <hackeso@esolangs.org>
date Fri, 06 Dec 2019 07:53:22 +0000
parents 859f9b4339e6
children

symbols = '()\\.,=;'  # characters that each form a guaranteed single-character token

class TokenizerException(Exception):
    pass


def tokenize(s):
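    """Split s into a flat list of token strings.

    Recognized tokens: names (a letter followed by letters, digits, '_'
    or '-'), double-quoted strings (both quotes kept in the token),
    '#'-prefixed "special function" names, and the single characters in
    symbols.  {Braced} text is a comment and produces no token.
    """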
    tokens = []
    current = ''
    state = 'begin'
    index = 0
    s = s + ' ' # a cheesy way to simplify token termination conditions
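    # A character-at-a-time state machine: 'begin' dispatches on the first
    # character of each token; the name and special states accumulate into
    # current and push the terminating character back with index -= 1 so
    # that 'begin' sees it again.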
    while index < len(s):
        c = s[index]
        index += 1
        if state == 'begin':
            current = c
            if c.isspace():
                continue
            if c.isalpha():
                state = 'name'
                continue
            if c == '{':
                state = 'comment'
                continue
            if c == '"':
                state = 'string'
                current = ''
                continue
            if c == '#':  # start of a "special function" token
                state = 'special'
                continue
            if c in symbols:  # guaranteed single-character token
                tokens.append(c)
                continue
            raise TokenizerException("Tokenizer can't comprehend '" + c + "'")
        if state == 'name':
            if c.isalpha() or c.isdigit() or c == '_' or c == '-':
                current += c
            else:
                tokens.append(current)
                state = 'begin'
                index -= 1  # push c back so 'begin' reprocesses it
            continue
        if state == 'string':
            current += c
            # a quote immediately after the opening quote is kept as
            # content, so an empty "" literal never closes here
            if c == '"' and current != '"':
                tokens.append('"' + current)  # opening quote plus content ending in the closing quote
                state = 'begin'
            continue
        if state == 'comment':
            if c == '}':  # comments are discarded, producing no token
                state = 'begin'
            continue
        if state == 'special':
            # current is '#' on entry, so the current == '' arm below can
            # never match as written; in practice only '#' followed by
            # letters is accepted
            if (c == '#' and current == '') or c.isalpha():
                current += c
            else:
                tokens.append(current)
                state = 'begin'
                index -= 1  # push c back so 'begin' reprocesses it
    return tokens
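

# A minimal usage sketch; the sample input is only an assumption about the
# surface syntax this lambda interpreter accepts.
if __name__ == '__main__':
    print(tokenize('id = \\x. x; {a comment} "hello" #two'))
    # -> ['id', '=', '\\', 'x', '.', 'x', ';', '"hello"', '#two']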