Mercurial > repo

diff interps/clc-intercal/CLC-INTERCAL-Base-1.-94.-2/INTERCAL/Parser.pm @ 996:859f9b4339e6
<Gregor> tar xf egobot.tar.xz
author: HackBot
date: Sun, 09 Dec 2012 19:30:08 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interps/clc-intercal/CLC-INTERCAL-Base-1.-94.-2/INTERCAL/Parser.pm	Sun Dec 09 19:30:08 2012 +0000
@@ -0,0 +1,1017 @@
+package Language::INTERCAL::Parser;
+
+# Parser/code generator/etc
+
+# This file is part of CLC-INTERCAL
+
+# Copyright (c) 2006-2008 Claudio Calvelli, all rights reserved.
+
+# CLC-INTERCAL is copyrighted software. However, permission to use, modify,
+# and distribute it is granted provided that the conditions set out in the
+# licence agreement are met. See files README and COPYING in the distribution.
+
+use strict;
+use vars qw($VERSION $PERVERSION);
+($VERSION) = ($PERVERSION = "CLC-INTERCAL/Base INTERCAL/Parser.pm 1.-94.-2") =~ /\s(\S+)$/;
+
+use Carp;
+use Language::INTERCAL::Exporter '1.-94.-2';
+use Language::INTERCAL::Splats '1.-94.-2', qw(:SP);
+use Language::INTERCAL::ByteCode '1.-94.-2', qw(:BC);
+use Language::INTERCAL::SymbolTable '1.-94.-2';
+use Language::INTERCAL::Reggrim '1.-94.-2';
+
+# for some reason this sort of things works faster than regexes here
+my $digits = '';
+vec($digits, ord($_), 1) = 1 for (0..9);
+my $alphalist ='abcdefghijklmnopqrstuvwxyz_ABCDEFGHIJKLMNOPQRSTUVWXYZ';
+my $alphabet = $digits;
+for (my $i = 0; $i < length $alphalist; $i++) {
+    vec($alphabet, ord(substr($alphalist, $i, 1)), 1) = 1;
+}
+my $anything = '';
+vec($anything, $_, 1) = 1 for (0..255);
+my $spaces = '';
+vec($spaces, ord($_), 1) = 1 for (" ", "\t", "\012", "\015");
+my $nonspaces = $anything;
+vec($spaces, ord($_), 1) = 0 for (" ", "\t", "\012");
+
+my @parser_predefined = (
+    # NAME         PARSE                BAD GENCODE           STARTS      EMPTY COMPLETE
+    ["CONSTANT",   \&_parse_constant,   0,  \&_code_constant, $digits,    0,    \&_complete_constant],
+    ["SYMBOL",     \&_parse_symbol,     0,  \&_code_symbol,   $alphabet,  0,    \&_complete_list],
+    ["JUNK",       \&_parse_junk,       1,  \&_code_junk,     $anything,  1,    \&_complete_none],
+    ["SPACE",      \&_parse_space,      0,  sub { () },       $spaces,    1,    \&_complete_list],
+    ["BLACKSPACE", \&_parse_blackspace, 0,  sub { () },       $nonspaces, 1,    \&_complete_list],
+    ["ANYTHING",   \&_parse_anything,   0,  sub { () },       $anything,  0,    \&_complete_none],
+);
+
+sub _parse_constant {
+    my ($src, $pos, $grammar) = @_;
+    pos($src) = $$pos;
+    return () unless $src =~ /\G0*(\d{1,5})/go;
+    $$pos = pos($src);
+    my $con = $1 + 0;
+    return $con if $con < 65536;
+    $$pos--;
+    return int($con / 10);
+}
+
+sub _code_constant {
+    my ($number) = @_;
+    my @code = BC($number);
+    (pack('C*', @code), scalar(@code));
+}
+
+sub _complete_constant {
+    my ($src, $pos, $grammar, $pf) = @_;
+    my $con = substr($src, $pos);
+    return [0..9] if $con eq '' || $con < 6553;
+    return [0..5] if $con == 6553;
+    return [];
+}
+
+sub _complete_list {
+    my ($src, $pos, $grammar, $pf) = @_;
+    my $vec = $pf->[4];
+    my @cpl = ();
+    for (my $sym = 0; $sym < 8 * length $vec; $sym++) {
+	push @cpl, chr($sym) if vec($vec, $sym, 1);
+    }
+    return \@cpl;
+}
+
+sub _complete_none {
+    my ($src, $pos, $grammar, $pf) = @_;
+    return [];
+}
+
+sub _parse_symbol {
+    my ($src, $pos, $grammar) = @_;
+    pos($src) = $$pos;
+    return () unless $src =~ /\G(\w+)/go;
+    $$pos = pos($src);
+    return $1;
+}
+
+sub _code_symbol {
+    my ($string) = @_;
+    (pack('C*', BC_STR, BC(length $string)) . $string, 1);
+}
+
+sub _parse_junk {
+    my ($src, $pos, $grammar) = @_;
+    my $junk = $grammar->{junk_symbol};
+    return () unless $junk && $junk <= @{$grammar->{productions}};
+    my $cspace = $grammar->{cspace};
+    if (! exists $grammar->{junk_cache}{$$pos}) {
+	# XXX this could be made more efficient, for now we'll leave it at this
+	my $end = undef;
+	$grammar->{junk_symbol} = 0;
+	for (my $p = $$pos + 1; $p < length($src); $p++) {
+	    $cspace->(\$p);
+	    my $t = _compile($grammar, $junk, $src, $p, $cspace, 0, 1);
+	    next unless @$t;
+	    $end = $p;
+	    last;
+	}
+	$grammar->{junk_symbol} = $junk;
+	$end = length($src) if ! defined $end;
+	for (my $p = $$pos; $p < $end; $p++) {
+	    $grammar->{junk_cache}{$p} = $end;
+	}
+    }
+    my $ej = $grammar->{junk_cache}{$$pos};
+    my $res = substr($src, $$pos, $ej - $$pos);
+    $$pos = $ej;
+    $res;
+}
+
+sub _code_junk {
+    my ($string) = @_;
+    $string =~ s/^\s+//o;
+    $string =~ s/\s+$//o;
+    (pack('C*', BC_STR, BC(length $string)) . $string, 1);
+}
+
+sub _parse_space {
+    my ($src, $pos, $grammar) = @_;
+    pos($src) = $$pos;
+    return () unless $src =~ /\G([ \t\012\015]+)/go;
+    $$pos = pos($src);
+    $1;
+}
+
+sub _parse_blackspace {
+    my ($src, $pos, $grammar) = @_;
+    pos($src) = $$pos;
+    return () unless $src =~ /\G([^ \t\012]+)/go;
+    $$pos = pos($src);
+    $1;
+}
+
+sub _parse_anything {
+    my ($src, $pos, $grammar) = @_;
+    my $p = $$pos;
+    return () if $p >= length $src;
+    $$pos = $p + 1;
+    substr($src, $p, 1);
+}
+
+# precompile provides optimised access to _parse_space etc to be used when
+# using the compiler's SPACE symbol - this saves quite a lot of compile time
+sub _precompile {
+    my ($grammar, $source, $space) = @_;
+    my $predefs = $grammar->{predefined};
+    if (exists $predefs->{$space}) {
+	if ($predefs->{$space}[1] == \&_parse_space) {
+	    return (
+		sub {
+		    my ($pos) = @_;
+		    pos($source) = $$pos;
+		    return unless $source =~ /\G[ \t\012\015]+/go;
+		    $$pos = pos($source);
+		},
+		[' ', "\t", "\012", "\015"],
+	    );
+	}
+	if ($predefs->{$space}[1] == \&_parse_blackspace) {
+	    return (
+		sub {
+		    my ($pos) = @_;
+		    pos($source) = $$pos;
+		    return unless $source =~ /\G[^ \t\012]+/go;
+		    $$pos = pos($source);
+		},
+		[grep { ! /^[ \t\012]/ } map { chr } (0..255)],
+	    );
+	}
+	my $sub = $predefs->{$space}[1];
+	my $start = $predefs->{$space}[4];
+	return (
+	    sub {
+		my ($pos) = @_;
+		$sub->($source, $pos, $grammar);
+	    },
+	    [map { chr } grep { vec($start, $_, 1) } (0..255)],
+	);
+    }
+    return (sub {}, []) unless $space && $space <= @{$grammar->{productions}};
+    my $start = '';
+    for my $prod (@{$grammar->{productions}[$space]}) {
+	$start |= $prod->[2];
+    }
+    return (
+	sub {
+	    my ($pos) = @_;
+	    my $p = _compile($grammar, $space, $source, $$pos, sub {}, 0, 0);
+	    # now find the longest matching result
+	    for my $e (@$p) {
+		my ($start, $end) = @$e;
+		$$pos = $end if $$pos < $end;
+	    }
+	},
+	[map { chr } grep { vec($start, $_, 1) } (0..255)],
+    );
+}
+
+sub new {
+    @_ == 2 or croak "Usage: new Language::INTERCAL::Parser(SYMBOLTABLE)";
+    my ($class, $symboltable) = @_;
+    my %predefined = ();
+    for my $pf (@parser_predefined) {
+	my $sn = $symboltable->find($pf->[0]);
+	$predefined{$sn} = $pf;
+    }
+    bless {
+	productions => [],
+	converted => 1,
+	rule_count => 0,
+	symboltable => $symboltable,
+	predefined => \%predefined,
+	optimise => {},
+    }, $class;
+}
+
+sub forall {
+    @_ == 2 or croak "Usage: GRAMMAR->forall(CODE)";
+    my ($grammar, $code) = @_;
+    my $p = $grammar->{productions};
+    my $s = $grammar->{symboltable};
+    my @prod = ();
+    for (my $sym = 0; $sym < @$p; $sym++) {
+	next unless $p->[$sym];
+	for my $prod (@{$p->[$sym]}) {
+	    my ($left, $right, $_1, $_2, $_3, $prodnum) = @$prod;
+	    push @prod, [$prodnum, $sym, $left, $right];
+	}
+    }
+    for my $prod (sort { $a->[0] <=> $b->[0] } @prod) {
+	my ($prodnum, $sym, $left, $right) = @$prod;
+	$right = _unconvert_right($right, $left);
+	$code->($grammar, $s, $prodnum, $sym, $left, $right);
+    }
+}
+
+sub read {
+    @_ == 2 or croak "Usage: GRAMMAR->read(FILEHANDLE)";
+    my ($grammar, $fh) = @_;
+
+    # make it faster to run next time
+    _convert_grammar($grammar);
+
+    my $plist = $grammar->{productions};
+    $fh->read_binary(pack('vv', $grammar->{rule_count}, scalar @$plist));
+    for (my $symbol = 1; $symbol < @$plist; $symbol++) {
+	my $gp = $plist->[$symbol] || [];
+	$fh->read_binary(pack('v', scalar @$gp));
+	for my $prod (@$gp) {
+	    my ($left, $right, $initial, $startmap, $empty, $prodnum) = @$prod;
+	    _read_left($fh, $left);
+	    _read_right($fh, $right);
+	    $fh->read_binary(pack('vvCv', length($initial), length($startmap),
+					  $empty ? 1 : 0, $prodnum));
+	    $fh->read_binary($initial);
+	    $fh->read_binary($startmap);
+	}
+    }
+
+    $grammar;
+}
+
+sub _read_left {
+    my ($fh, $left) = @_;
+    $fh->read_binary(pack('v', scalar(@$left)));
+    for my $element (@$left) {
+	my ($type, $e, $c, @e) = @$element;
+	$fh->read_binary($type);
+	if ($type eq 's') {
+	    $fh->read_binary(pack('v', $e));
+	} elsif ($type eq 'r') {
+	    my $code = @e ? $e->[0]->save() : '';
+	    $fh->read_binary(pack('vva*a*', length($e), length($code),
+					    $e, $code));
+	} else {
+	    $fh->read_binary(pack('v/a*', $e));
+	}
+	$fh->read_binary(pack('v', $c));
+    }
+}
+
+sub _read_right {
+    my ($fh, $right) = @_;
+    $fh->read_binary(pack('v', scalar(@$right)));
+    for my $element (@$right) {
+	my $type = $element->[0];
+	my $e = $element->[1];
+	$fh->read_binary($type);
+	if ($type eq 'b') {
+	    $fh->read_binary(pack('v/a*', $e));
+	} elsif ($type ne '*') {
+	    $fh->read_binary(pack('v', $e));
+	}
+    }
+}
+
+sub write {
+    @_ == 3 or croak "Usage: write " .
+		     "Language::INTERCAL::Parser(FILEHANDLE, SYMBOLS)";
+    my ($class, $fh, $symboltable) = @_;
+
+    my ($rule_count, $nsymbols) = unpack('vv', $fh->write_binary(4));
+    my @productions = ();
+    for (my $symbol = 1; $symbol < $nsymbols; $symbol++) {
+	my $nprod = unpack('v', $fh->write_binary(2));
+	my @prod = ();
+	while (@prod < $nprod) {
+	    my $left = _write_left($fh);
+	    my $right = _write_right($fh);
+	    my ($ninit, $mapsize, $empty, $prodnum) =
+		unpack('vvCv', $fh->write_binary(7));
+	    my $initial = $fh->write_binary($ninit);
+	    my $startmap = $fh->write_binary($mapsize);
+	    push @prod,
+		[$left, $right, $initial, $startmap, $empty, $prodnum];
+	}
+	$productions[$symbol] = \@prod;
+    }
+
+    my %predefined = ();
+    for my $pf (@parser_predefined) {
+	my $sn = $symboltable->find($pf->[0]);
+	$predefined{$sn} = $pf;
+    }
+    my $grammar = bless {
+	symboltable => $symboltable,
+	productions => \@productions,
+	converted => 1,
+	rule_count => $rule_count,
+	predefined => \%predefined,
+    }, $class;
+
+    $grammar;
+}
+
+sub _write_left {
+    my ($fh) = @_;
+    my $elems = unpack('v', $fh->write_binary(2));
+    my @left = ();
+    while ($elems-- > 0) {
+	my $type = $fh->write_binary(1);
+	my $data = '';
+	my @comp = ();
+	if ($type eq 's') {
+	    $data = unpack('v', $fh->write_binary(2));
+	} elsif ($type eq 'r') {
+	    my ($rsize, $csize) = unpack('vv', $fh->write_binary(4));
+	    $data = $fh->write_binary($rsize);
+	    my $comp;
+	    if ($csize > 0) {
+		# restore reggrim from saved state
+		$comp = $fh->write_binary($csize);
+		$comp = Language::INTERCAL::Reggrim->restore($comp);
+	    } else {
+		# try to compile reggrim
+		$comp = Language::INTERCAL::Reggrim->compile($data);
+	    }
+	    @comp = ($comp);
+	} else {
+	    my $size = unpack('v', $fh->write_binary(2));
+	    $data = uc($fh->write_binary($size));
+	    @comp = (length($data));
+	}
+	my $count = unpack('v', $fh->write_binary(2));
+	push @left, [$type, $data, $count, @comp];
+    }
+    \@left;
+}
+
+sub _write_right {
+    my ($fh) = @_;
+    my $elems = unpack('v', $fh->write_binary(2));
+    my @right = ();
+    while ($elems-- > 0) {
+	my $type = $fh->write_binary(1);
+	my $data = '';
+	if ($type eq 'b') {
+	    my $len = unpack('v', $fh->write_binary(2));
+	    $data = $fh->write_binary($len);
+	} elsif ($type ne '*') {
+	    $data = unpack('v', $fh->write_binary(2));
+	}
+	push @right, [$type, $data];
+    }
+    \@right;
+}
+
+sub _convert_left {
+    my ($left) = @_;
+    [map {
+	$_->[0] eq 'r'
+	    ? [$_->[0], $_->[1], $_->[2],
+	       Language::INTERCAL::Reggrim->compile($_->[1])]
+	    : $_->[0] eq 'c' && $_->[1] eq ''
+		? ()
+		: $_->[0] eq 'c'
+		    ? [$_->[0], uc($_->[1]), $_->[2], length($_->[1])]
+		    : [$_->[0], $_->[1], $_->[2]];
+    } @$left];
+}
+
+sub _find_right {
+    my ($grammar, $left, $type, $number, $data) = @_;
+    for (my $lp = 0; $lp < @$left; $lp++) {
+	my $l = $left->[$lp];
+	next if $l->[0] ne $type;
+	next if $l->[0] eq 's' && $l->[1] != $data;
+	next if $l->[0] ne 's' && $l->[1] ne $data;
+	$number--;
+	return $lp if $number < 1;
+    }
+    if ($type eq 's') {
+	faint(SP_CREATION, "Symbol " .
+			   $grammar->{symboltable}->symbol($data) .
+			   " not found");
+    } elsif ($type eq 'c') {
+	my @data = unpack('C*', $data);
+	faint(SP_CREATION, "Block (@data) not found");
+    } elsif ($type eq 'r') {
+	faint(SP_CREATION, "Reggrim ($data) not found");
+    }
+    faint(SP_CREATION, "Internal error");
+}
+
+sub _convert_right {
+    my ($right, $left, $grammar) = @_;
+    [map {
+	$_->[0] eq 'c' && $_->[2] eq ''
+	    ? ()
+	    : $_->[0] =~ /^[scr]$/o
+		? [$_->[0], _find_right($grammar, $left, $_->[0], $_->[1], $_->[2])]
+		: $_->[0] eq 'n'
+		    ? [$_->[0], _find_right($grammar, $left, 's', $_->[1], $_->[2])]
+		    : $_->[0] eq '*'
+			? [$_->[0]]
+			: [$_->[0], $_->[1]];
+    } @$right];
+}
+
+sub _unconvert_right {
+    my ($right, $left) = @_;
+    [map {
+	$_->[0] =~ /^[scr]$/o ?
+	    [$_->[0], _count_left($_->[0], $_->[1], $left)] :
+	$_->[0] eq 'n' ?
+	    [$_->[0], _count_left('s', $_->[1], $left)] :
+	[$_->[0], $_->[1]];
+    } @$right];
+}
+
+sub _count_left {
+    my ($type, $number, $left) = @_;
+    my $count = 0;
+    my $data = $left->[$number][1];
+    for (my $lp = 0; $lp <= $number; $lp++) {
+	my $l = $left->[$lp];
+	next if $l->[0] ne $type;
+	next if $l->[0] eq 's' && $l->[1] != $data;
+	next if $l->[0] ne 's' && $l->[1] ne $data;
+	$count++;
+    }
+    ($count, $data);
+}
+
+# compile_top works similarly to compile but is useful for some types of
+# top-level symbols, as it avoids compile's potentially exponential
+# behaviour. Returns a list of generated code fragments (no completion
+# is attempted)
+
+sub compile_top {
+    @_ == 7 || @_ == 8
+	or croak "Usage: GRAMMAR->compile_top(TOP, INT, SOURCE, " .
+		 "POS, SPACE, JUNK [, VERBOSE])";
+    my ($grammar, $tsymb, $isymb, $source, $ipos, $space, $junk, $verb) = @_;
+    _convert_grammar($grammar);
+    my @result = ();
+    $grammar->{junk_cache} = {};
+    $grammar->{junk_symbol} = $junk;
+    my ($cspace, $sspace) = _precompile($grammar, $source, $space);
+    $grammar->{cspace} = $cspace;
+    $grammar->{sspace} = $sspace;
+    my $started_pos = $ipos;
+    my $started_time = time;
+    my $reported_time = $started_time;
+    while ($ipos < length $source) {
+	if ($verb) {
+	    my $now = time;
+	    if ($now - $reported_time > 60) {
+		my $s = substr($source, $ipos);
+		$s =~ s/\s+/ /go;
+		$s =~ s/^ //o;
+		$s = substr($s, 0, 28);
+		my $fraction = ($ipos - $started_pos)
+			     / (length($source) - $started_pos);
+		my $eta = '';
+		if ($fraction > .1) {
+		    $eta = $started_time + ($now - $started_time) / $fraction;
+		    my @eta = localtime($eta);
+		    $eta = sprintf " ETA: %02d:%02d:%02d", @eta[2, 1, 0];
+		}
+		my @now = localtime($now);
+		my $d = length(length $source);
+		printf STDERR
+		    "\n    %02d:%02d:%02d: done to %${d}d %-30s %5.1f%%%s",
+		    @now[2,1,0], $ipos, "[$s]", 100 * $fraction, $eta;
+		$reported_time = $now;
+	    }
+	}
+	my $pos = $ipos++;
+	defined _parse_junk($source, \$ipos, $grammar)
+	    or $ipos = length $source;
+	my $pp = _compile($grammar, $tsymb, $source, $pos, $cspace, 0, 0);
+	if ($isymb) {
+	    for my $p (@$pp) {
+		my ($ps, $pe, $pj, $pc, $pn, @pu) = @$p;
+		if ($pe < length($source)) {
+		    my $ip =
+			_compile($grammar, $isymb, $source, $pe, $cspace, 0, 0);
+		    if (@$ip) {
+			for my $i (@$ip) {
+			    my ($is, $ie, $ij, $ic, $in, @iu) = @$i;
+			    push @result, $pc . $ic;
+			}
+		    } else {
+			push @result, $pc;
+		    }
+		} else {
+		    push @result, $pc;
+		}
+	    }
+	} else {
+	    push @result, map { $_->[3] } @$pp;
+	}
+    }
+    @result;
+}
+
+# compile attempts to generate code; returns two ARRAYREFs, a list of
+# generated code with elements [start, end, uses_junk?, code, count, @prods],
+# and a list of possible completion if source is a prefix of a parseable string
+# both lists will be empty if nothing can be parsed
+
+sub compile {
+    @_ == 6
+	or croak "Usage: GRAMMAR->compile(SYMBOL, SOURCE, POS, SPACE, JUNK)";
+    my ($grammar, $isymb, $source, $start, $space, $junk) = @_;
+    _convert_grammar($grammar);
+    $grammar->{junk_cache} = {};
+    $grammar->{junk_symbol} = $junk;
+    my ($cspace, $sspace) = _precompile($grammar, $source, $space);
+    $grammar->{cspace} = $cspace;
+    $grammar->{sspace} = $sspace;
+    my %complete = ();
+    my $r = _compile($grammar, $isymb, $source, $start, $cspace, \%complete, 0);
+    ($r, [keys %complete]);
+}
+
+sub _compile {
+    my ($grammar, $isymb, $source, $start, $cspace, $complete, $any) = @_;
+    return [] if $isymb < 1;
+    my $productions = $grammar->{productions};
+    my $pos = $start;
+    $cspace->(\$pos);
+    return [] if $pos >= length $source && ! $complete;
+    my $predefs = $grammar->{predefined};
+    my @result = ();
+    # special case out of the main loop (they should not normally do this)
+    if (exists $predefs->{$isymb}) {
+	my $pf = $predefs->{$isymb};
+	if ($pos >= length $source && ! $pf->[5]) {
+	    if ($complete) {
+		my $cpl = $pf->[6]->($source, $pos, $grammar, $pf);
+		$complete->{$_} = 1 for @$cpl;
+	    }
+	} else {
+	    my $end = $pos;
+	    my @ok = $pf->[1]->($source, \$end, $grammar);
+	    my $bad = $pf->[2] ? $end - $start : 0;
+	    if ($end >= length $source && $complete) {
+		my $cpl = $pf->[6]->($source, $end, $grammar, $pf);
+		$complete->{$_} = 1 for @$cpl, @{$grammar->{sspace}};
+	    }
+	    if (@ok) {
+		my ($code, $count) = $pf->[3]->(@ok);
+		$cspace->(\$end);
+		push @result, [$start, $end, $bad, $code, $count];
+	    }
+	}
+	return \@result;
+    }
+    # normal case, parsing on user-defined symbols
+    return [] if $isymb >= @$productions;
+    my $iprod = $productions->[$isymb];
+    return [] if ! $iprod || ! @$iprod;
+    # prepare a list of states which look promising
+    my @state = ();
+    my $nxc = $pos < length($source) ? ord(substr($source, $pos, 1)) : undef;
+    for (my $prodnum = @$iprod - 1; $prodnum >= 0; $prodnum--) {
+	next unless $iprod->[$prodnum][4]
+		 || ! defined $nxc
+		 || vec($iprod->[$prodnum][2], $nxc, 1);
+	push @state, [$isymb, $prodnum, 0, $pos, 0, []];
+    }
+    my $cpspace = 0;
+    STATE: while (@state) {
+	my ($symb, $prodnum, $prodelem, $place, $bad, $stack, @tree) =
+	    @{pop @state};
+	my $sprod = $productions->[$symb][$prodnum];
+	my $left = $sprod->[0];
+	ELEM: while ($prodelem < @$left) {
+	    my ($type, $data, $count, $aux) = @{$left->[$prodelem]};
+	    $prodelem++;
+	    if ($type eq 's') {
+		if (exists $predefs->{$data}) {
+		    # predefined symbol - we can just run its code here
+		    my $pf = $predefs->{$data};
+		    if ($place < length $source || $pf->[5]) {
+			my $end = $place;
+			my @ok = $pf->[1]->($source, \$end, $grammar);
+			if ($end >= length $source && $complete) {
+			    my $cpl =
+				$pf->[6]->($source, $place, $grammar, $pf);
+			    $complete->{$_} = 1 for @$cpl;
+			    $cpspace = 1;
+			}
+			next STATE unless @ok;
+			$bad += $end - $place if $pf->[2];
+			push @tree, [$pf->[3]->(@ok)];
+			$cspace->(\$end);
+			$place = $end;
+			next ELEM;
+		    } elsif ($complete) {
+			my $cpl = $pf->[6]->($source, $place, $grammar, $pf);
+			$complete->{$_} = 1 for @$cpl;
+		    }
+		    next STATE;
+		} else {
+		    # user defined symbol - we need to push the current
+		    # state onto the stack and add new states to @state
+		    next STATE if $data >= @$productions;
+		    my $prod = $productions->[$data];
+		    next STATE if ! $prod || ! @$prod;
+		    pos($source) = $place;
+		    push @$stack, [$symb, $prodnum, $prodelem, $bad, @tree];
+		    my $nxc = length $source < $place
+			    ? ord(substr($source, $place, 1))
+			    : undef;
+		    for (my $pn = @$prod - 1; $pn >= 0; $pn--) {
+			next unless $prod->[$pn][4]
+				 || ! defined $nxc
+				 || vec($prod->[$pn][2], $nxc, 1);
+			push @state, [$data, $pn, 0, $place, 0, [@$stack]];
+		    }
+		    next STATE;
+		}
+	    } elsif ($type eq 'c') {
+		# constant - just check if the required string is there
+		my $look = uc(substr($source, $place, $aux));
+		if ($data eq $look) {
+		    # yep, it's there - add the place to the current tree
+		    # in case the code generator wants it
+		    push @tree, [$place, $aux];
+		    $place += $aux;
+		    $cspace->(\$place);
+		    $cpspace = 1 if $place >= length $source;
+		    next ELEM;
+		} elsif ($complete &&
+			 length($look) < length($data) &&
+			 substr($data, 0, length($look)) eq $look)
+		{
+		    # a substring of the wanted string is there - so this could
+		    # be an incomplete source
+		    $complete->{substr($data, length($look))} = 1;
+		}
+		next STATE;
+	    } elsif ($type eq 'r') {
+		# regular grimace - run it and see what we get
+		my ($type, $length, $follows) = $aux->match($source, $place);
+		if ($type) {
+		    # a match - add its place and length to the parse tree
+		    push @tree, [$place, $length];
+		    $place += $length;
+		    $cspace->(\$place);
+		    if ($complete) {
+			$complete->{$_} = 1 for @$follows;
+			$cpspace = 1 if $place >= length $source;
+		    }
+		    next ELEM;
+		}
+		if ($complete) {
+		    $complete->{$_} = 1 for @$follows;
+		    $cpspace = 1 if $place >= length $source;
+		}
+		next STATE;
+	    }
+	}
+	# end of production - generate code
+	my %uses = ( $sprod->[5] => 1 );
+	for my $t (@tree) {
+	    my ($x, $c, @u) = @$t;
+	    $uses{$_} = 1 for @u;
+	}
+	my @uses = sort { $a <=> $b } keys %uses;
+	my ($code, $count) = _gencode($source, $left, $sprod->[1], \@tree,
+				      $start, $place - $start, $bad, \@uses);
+	if (@$stack) {
+	    # we were called by another nonterminal
+	    my ($nsym, $nprd, $nelm, $nbad, @ntree) = @{pop @$stack};
+	    $nbad += $bad;
+	    push @state, [$nsym, $nprd, $nelm, $place, $nbad,
+			  $stack, @ntree, [$code, $count, @uses]];
+	} else {
+	    # top level symbol, in other words a (possibly partial) result
+	    push @result, [$start, $place, $bad, $code, $count, @uses];
+	    return \@result if $any;
+	}
+    }
+    if ($complete && $cpspace) {
+	$complete->{$_} = 1 for @{$grammar->{sspace}};
+    }
+    return \@result;
+}
+
+sub _gencode {
+    my ($source, $left, $right, $tree, $start, $length, $junk, $uses) = @_;
+    my $code = '';
+    for my $rp (@$right) {
+	my ($type, $value) = @$rp;
+	if ($type eq 'b') {
+	    $code .= $value;
+	    next;
+	}
+	if ($type eq 's') {
+	    $code .= $tree->[$value][0];
+	    next;
+	}
+	if ($type eq 'n') {
+	    $code .= pack('C*', BC($tree->[$value][1]));
+	    next;
+	}
+	if ($type eq 'c' || $type eq 'r') {
+	    my ($place, $len) = @{$tree->[$value]};
+	    my $const = substr($source, $place, $len);
+	    my @v = unpack('C*', $const);
+	    $code .= pack('C*', BC_MUL, map { BC($_) } scalar(@v), @v);
+	    next;
+	}
+	if ($type eq '*') {
+	    $code .= pack('C*', map { BC($_) } $start, $length, $junk,
+					       scalar @$uses, @$uses);
+	    next;
+	}
+    }
+    my $count = 0;
+    for (my $lp = 0; $lp < @$left; $lp++) {
+	my $lc = $left->[$lp][2];
+	$count += $lc == 0xffff ? $tree->[$lp][1] : $lc;
+    }
+    ($code, $count);
+}
+
+sub _find_starts {
+    my ($grammar) = @_;
+
+    # first find if any symbol can expand (directly) to empty strings
+    # or (directly) to another symbol; since information about each of
+    # these changes our idea of the other, we keep repeating until we
+    # cannot make any more changes
+    my $empty = '';
+
+    my $found = 0;
+    for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) {
+	my $plist = $grammar->{productions}[$symb];
+	next unless $plist;
+	for my $prod (@$plist) {
+	    $prod->[2] = '';
+	    $prod->[3] = '';
+	    $prod->[4] = 0;
+	}
+	$found++;
+    }
+    return $empty unless $found;
+
+    my $continue = 1;
+    while ($continue) {
+	$continue = 0;
+	SYMB: for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) {
+	    my $plist = $grammar->{productions}[$symb];
+	    next unless $plist;
+	    PROD: for my $prod (@$plist) {
+		# look at the first element of the production, if there's one
+		ELEM: for my $p (@{$prod->[0]}) {
+		    if ($p->[0] eq 's') {
+			# that means we can access this particular symbol
+			$continue = 1 if ! vec($prod->[3], $p->[1], 1);
+			vec($prod->[3], $p->[1], 1) = 1;
+			# if we know the symbol can parse the empty string,
+			# we also need to check the next element
+			next if vec($empty, $p->[1], 1);
+		    }
+		    next if ($p->[0] eq 'c' && $p->[1] eq '') ||
+			    ($p->[0] eq 'r' && $p->[3]->can_empty());
+		    next PROD;
+		}
+		# if we get here, all productions are empty, so...
+		$continue = 1 if ! vec($empty, $symb, 1);
+		vec($empty, $symb, 1) = 1;
+		$prod->[4] = 1;
+	    }
+	}
+    }
+
+    for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) {
+	my $plist = $grammar->{productions}[$symb];
+	next unless $plist;
+	for my $prod (@$plist) {
+	    faint(SP_CIRCULAR, $grammar->{symboltable}->symbol($symb))
+		if vec($prod->[3], $symb, 1);
+	}
+    }
+
+    return $empty;
+}
+
+sub _convert_grammar {
+    my ($grammar) = @_;
+
+    return if exists $grammar->{converted};
+    my $empty = _find_starts($grammar);
+    my @i_total = map { '' } @{$grammar->{productions}};
+    my $predefs = $grammar->{predefined};
+
+    # first find the "direct" initials;
+    SYMB: for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) {
+	my $plist = $grammar->{productions}[$symb];
+	next unless $plist;
+	PROD: for my $prod (@$plist) {
+	    next unless $prod;
+	    $prod->[2] = '';
+	    # look at the first element of the production, if there's one
+	    ELEM: for my $p (@{$prod->[0]}) {
+		if ($p->[0] eq 'c') {
+		    next if $p->[1] eq '';
+		    my $l = ord(lc(substr($p->[1], 0, 1)));
+		    my $u = ord(uc(substr($p->[1], 0, 1)));
+		    vec($i_total[$symb], $l, 1) = 1;
+		    vec($i_total[$symb], $u, 1) = 1;
+		    vec($prod->[2], $l, 1) = 1;
+		    vec($prod->[2], $u, 1) = 1;
+		    next PROD;
+		}
+		if ($p->[0] eq 'r') {
+		    my @i = $p->[3]->can_start();
+		    for my $s (@i) {
+			vec($i_total[$symb], $s, 1) = 1;
+			vec($prod->[2], $s, 1) = 1;
+		    }
+		    next if $p->[3]->can_empty();
+		    next PROD;
+		}
+		if ($p->[0] eq 's' && exists $predefs->{$p->[1]}) {
+		    my $pf = $predefs->{$p->[1]};
+		    my $st = $pf->[4];
+		    my $em = $pf->[5];
+		    $i_total[$symb] |= $st;
+		    $prod->[2] |= $st;
+		    next if $em;
+		    next PROD;
+		}
+		next if ($p->[0] eq 's' && vec($empty, $p->[1], 1));
+		next PROD;
+	    }
+	}
+    }
+    
+    # now propagate %i_... using %starts
+    my $continue = 1;
+    while ($continue) {
+	$continue = 0;
+	for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) {
+	    my $plist = $grammar->{productions}[$symb];
+	    next unless $plist;
+	    for my $prod (@$plist) {
+		for (my $other = 0; $other < 8 * length($prod->[3]); $other++) {
+		    next unless vec($prod->[3], $other, 1);
+		    my $init = $i_total[$other];
+		    my $np = $prod->[2] | $init;
+		    $continue = 1
+			if $np ne substr($prod->[2], 0, length($np));
+		    $prod->[2] |= $init;
+		    $i_total[$symb] |= $init;
+		}
+	    }
+	}
+    }
+
+    my $mask = '';
+    for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) {
+	if (exists $predefs->{$symb}) {
+	    vec($mask, $symb, 1) = 1;
+	    next;
+	}
+	my $plist = $grammar->{productions}[$symb];
+	next unless $plist;
+	for my $prod (@$plist) {
+	    $prod->[3] = substr($prod->[3], 0, 1) & $mask;
+	}
+    }
+
+    $grammar->{converted} = 1;
+}
+
+sub _left_equal {
+    my ($l1, $l2) = @_;
+    return 0 if @$l1 != @$l2;
+    for (my $c = 0; $c < @$l1; $c++) {
+	my ($t1, $d1, $c1, $a1) = @{$l1->[$c]};
+	my ($t2, $d2, $c2, $a2) = @{$l2->[$c]};
+	return 0 if $t1 ne $t2;
+	return 0 if $c1 != $c2;
+	if ($t1 eq 's') {
+	    return 0 if $d1 != $d2;
+	} elsif ($t1 eq 'c') {
+	    return 0 if $d1 ne $d2;
+	} elsif ($t1 eq 'r') {
+	    return 0 if ! $a1->is_equal($a2);
+	} else {
+	    return 0;
+	}
+    }
+    return 1;
+}
+
+sub _right_equal {
+    my ($r1, $r2) = @_;
+    return 0 if @$r1 != @$r2;
+    for (my $c = 0; $c < @$r1; $c++) {
+	my ($t1, $v1) = @{$r1->[$c]};
+	my ($t2, $v2) = @{$r2->[$c]};
+	return 0 if $t1 ne $t2;
+	if ($t1 eq 'b') {
+	    return 0 if $v1 ne $v2;
+	} elsif ($t1 eq 's' || $t1 eq 'c' || $t1 eq 'n' || $t1 eq 'r') {
+	    return 0 if $v1 != $v2;
+	} elsif ($t1 ne '*') {
+	    return 0;
+	}
+    }
+    return 1;
+}
+
+sub _find_rule {
+    my ($grammar, $symb, $left, $right) = @_;
+    return () if $symb >= @{$grammar->{productions}};
+    my $prods = $grammar->{productions}[$symb];
+    return () unless $prods;
+    my @found = ();
+    SYMB: for (my $pp = 0; $pp < @$prods; $pp++) {
+	my ($l, $r, $i, $s, $e, $c) = @{$prods->[$pp]};
+	# see if this production is same as $left and (if provided) $right
+	next SYMB if ! _left_equal($l, $left);
+	next SYMB if $right && ! _right_equal($r, $right);
+	push @found, $c;
+    }
+    return @found;
+}
+
+sub add {
+    @_ == 4 or croak "Usage: GRANMAR->add(SYMBOL, LEFT, RIGHT)";
+    my ($grammar, $symb, $left, $right) = @_;
+    $left = _convert_left($left);
+    $right = _convert_right($right, $left, $grammar);
+    # do we already have this production?
+    PROD: for my $prod (@{$grammar->{productions}[$symb]}) {
+	my ($l, $r, $i, $s, $e, $c) = @$prod;
+	next PROD if ! _left_equal($left, $l);
+	next PROD if ! _right_equal($right, $r);
+	# we have it, no need to add anything or make any changes
+	return -$c;
+    }
+    my $prodnum = ++$grammar->{rule_count};
+    push @{$grammar->{productions}[$symb]},
+	[$left, $right, '', '', 0, $prodnum];
+    delete $grammar->{converted};
+    $prodnum;
+}
+
+sub find_rule {
+    @_ == 3 || @_ == 4
+	or croak "Usage: GRANMAR->find_rule(SYMBOL, LEFT [, RIGHT])";
+    my ($grammar, $symb, $left, $right) = @_;
+    $left = _convert_left($left);
+    $right = _convert_right($right, $left, $grammar) if $right;
+    my @rules = _find_rule($grammar, $symb, $left, $right);
+    wantarray ? @rules : $rules[0];
+}
+
+1;
author	HackBot
date	Sun, 09 Dec 2012 19:30:08 +0000
parents
children