Mercurial > repo
diff interps/clc-intercal/CLC-INTERCAL-Base-1.-94.-2/INTERCAL/Parser.pm @ 996:859f9b4339e6
<Gregor> tar xf egobot.tar.xz
author | HackBot |
---|---|
date | Sun, 09 Dec 2012 19:30:08 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interps/clc-intercal/CLC-INTERCAL-Base-1.-94.-2/INTERCAL/Parser.pm Sun Dec 09 19:30:08 2012 +0000 @@ -0,0 +1,1017 @@ +package Language::INTERCAL::Parser; + +# Parser/code generator/etc + +# This file is part of CLC-INTERCAL + +# Copyright (c) 2006-2008 Claudio Calvelli, all rights reserved. + +# CLC-INTERCAL is copyrighted software. However, permission to use, modify, +# and distribute it is granted provided that the conditions set out in the +# licence agreement are met. See files README and COPYING in the distribution. + +use strict; +use vars qw($VERSION $PERVERSION); +($VERSION) = ($PERVERSION = "CLC-INTERCAL/Base INTERCAL/Parser.pm 1.-94.-2") =~ /\s(\S+)$/; + +use Carp; +use Language::INTERCAL::Exporter '1.-94.-2'; +use Language::INTERCAL::Splats '1.-94.-2', qw(:SP); +use Language::INTERCAL::ByteCode '1.-94.-2', qw(:BC); +use Language::INTERCAL::SymbolTable '1.-94.-2'; +use Language::INTERCAL::Reggrim '1.-94.-2'; + +# for some reason this sort of things works faster than regexes here +my $digits = ''; +vec($digits, ord($_), 1) = 1 for (0..9); +my $alphalist ='abcdefghijklmnopqrstuvwxyz_ABCDEFGHIJKLMNOPQRSTUVWXYZ'; +my $alphabet = $digits; +for (my $i = 0; $i < length $alphalist; $i++) { + vec($alphabet, ord(substr($alphalist, $i, 1)), 1) = 1; +} +my $anything = ''; +vec($anything, $_, 1) = 1 for (0..255); +my $spaces = ''; +vec($spaces, ord($_), 1) = 1 for (" ", "\t", "\012", "\015"); +my $nonspaces = $anything; +vec($spaces, ord($_), 1) = 0 for (" ", "\t", "\012"); + +my @parser_predefined = ( + # NAME PARSE BAD GENCODE STARTS EMPTY COMPLETE + ["CONSTANT", \&_parse_constant, 0, \&_code_constant, $digits, 0, \&_complete_constant], + ["SYMBOL", \&_parse_symbol, 0, \&_code_symbol, $alphabet, 0, \&_complete_list], + ["JUNK", \&_parse_junk, 1, \&_code_junk, $anything, 1, \&_complete_none], + ["SPACE", \&_parse_space, 0, sub { () }, $spaces, 1, \&_complete_list], + ["BLACKSPACE", \&_parse_blackspace, 0, sub { () }, $nonspaces, 1, \&_complete_list], + ["ANYTHING", \&_parse_anything, 0, sub { () }, $anything, 0, \&_complete_none], +); + +sub _parse_constant { + my ($src, $pos, $grammar) = @_; + pos($src) = $$pos; + return () unless $src =~ /\G0*(\d{1,5})/go; + $$pos = pos($src); + my $con = $1 + 0; + return $con if $con < 65536; + $$pos--; + return int($con / 10); +} + +sub _code_constant { + my ($number) = @_; + my @code = BC($number); + (pack('C*', @code), scalar(@code)); +} + +sub _complete_constant { + my ($src, $pos, $grammar, $pf) = @_; + my $con = substr($src, $pos); + return [0..9] if $con eq '' || $con < 6553; + return [0..5] if $con == 6553; + return []; +} + +sub _complete_list { + my ($src, $pos, $grammar, $pf) = @_; + my $vec = $pf->[4]; + my @cpl = (); + for (my $sym = 0; $sym < 8 * length $vec; $sym++) { + push @cpl, chr($sym) if vec($vec, $sym, 1); + } + return \@cpl; +} + +sub _complete_none { + my ($src, $pos, $grammar, $pf) = @_; + return []; +} + +sub _parse_symbol { + my ($src, $pos, $grammar) = @_; + pos($src) = $$pos; + return () unless $src =~ /\G(\w+)/go; + $$pos = pos($src); + return $1; +} + +sub _code_symbol { + my ($string) = @_; + (pack('C*', BC_STR, BC(length $string)) . $string, 1); +} + +sub _parse_junk { + my ($src, $pos, $grammar) = @_; + my $junk = $grammar->{junk_symbol}; + return () unless $junk && $junk <= @{$grammar->{productions}}; + my $cspace = $grammar->{cspace}; + if (! exists $grammar->{junk_cache}{$$pos}) { + # XXX this could be made more efficient, for now we'll leave it at this + my $end = undef; + $grammar->{junk_symbol} = 0; + for (my $p = $$pos + 1; $p < length($src); $p++) { + $cspace->(\$p); + my $t = _compile($grammar, $junk, $src, $p, $cspace, 0, 1); + next unless @$t; + $end = $p; + last; + } + $grammar->{junk_symbol} = $junk; + $end = length($src) if ! defined $end; + for (my $p = $$pos; $p < $end; $p++) { + $grammar->{junk_cache}{$p} = $end; + } + } + my $ej = $grammar->{junk_cache}{$$pos}; + my $res = substr($src, $$pos, $ej - $$pos); + $$pos = $ej; + $res; +} + +sub _code_junk { + my ($string) = @_; + $string =~ s/^\s+//o; + $string =~ s/\s+$//o; + (pack('C*', BC_STR, BC(length $string)) . $string, 1); +} + +sub _parse_space { + my ($src, $pos, $grammar) = @_; + pos($src) = $$pos; + return () unless $src =~ /\G([ \t\012\015]+)/go; + $$pos = pos($src); + $1; +} + +sub _parse_blackspace { + my ($src, $pos, $grammar) = @_; + pos($src) = $$pos; + return () unless $src =~ /\G([^ \t\012]+)/go; + $$pos = pos($src); + $1; +} + +sub _parse_anything { + my ($src, $pos, $grammar) = @_; + my $p = $$pos; + return () if $p >= length $src; + $$pos = $p + 1; + substr($src, $p, 1); +} + +# precompile provides optimised access to _parse_space etc to be used when +# using the compiler's SPACE symbol - this saves quite a lot of compile time +sub _precompile { + my ($grammar, $source, $space) = @_; + my $predefs = $grammar->{predefined}; + if (exists $predefs->{$space}) { + if ($predefs->{$space}[1] == \&_parse_space) { + return ( + sub { + my ($pos) = @_; + pos($source) = $$pos; + return unless $source =~ /\G[ \t\012\015]+/go; + $$pos = pos($source); + }, + [' ', "\t", "\012", "\015"], + ); + } + if ($predefs->{$space}[1] == \&_parse_blackspace) { + return ( + sub { + my ($pos) = @_; + pos($source) = $$pos; + return unless $source =~ /\G[^ \t\012]+/go; + $$pos = pos($source); + }, + [grep { ! /^[ \t\012]/ } map { chr } (0..255)], + ); + } + my $sub = $predefs->{$space}[1]; + my $start = $predefs->{$space}[4]; + return ( + sub { + my ($pos) = @_; + $sub->($source, $pos, $grammar); + }, + [map { chr } grep { vec($start, $_, 1) } (0..255)], + ); + } + return (sub {}, []) unless $space && $space <= @{$grammar->{productions}}; + my $start = ''; + for my $prod (@{$grammar->{productions}[$space]}) { + $start |= $prod->[2]; + } + return ( + sub { + my ($pos) = @_; + my $p = _compile($grammar, $space, $source, $$pos, sub {}, 0, 0); + # now find the longest matching result + for my $e (@$p) { + my ($start, $end) = @$e; + $$pos = $end if $$pos < $end; + } + }, + [map { chr } grep { vec($start, $_, 1) } (0..255)], + ); +} + +sub new { + @_ == 2 or croak "Usage: new Language::INTERCAL::Parser(SYMBOLTABLE)"; + my ($class, $symboltable) = @_; + my %predefined = (); + for my $pf (@parser_predefined) { + my $sn = $symboltable->find($pf->[0]); + $predefined{$sn} = $pf; + } + bless { + productions => [], + converted => 1, + rule_count => 0, + symboltable => $symboltable, + predefined => \%predefined, + optimise => {}, + }, $class; +} + +sub forall { + @_ == 2 or croak "Usage: GRAMMAR->forall(CODE)"; + my ($grammar, $code) = @_; + my $p = $grammar->{productions}; + my $s = $grammar->{symboltable}; + my @prod = (); + for (my $sym = 0; $sym < @$p; $sym++) { + next unless $p->[$sym]; + for my $prod (@{$p->[$sym]}) { + my ($left, $right, $_1, $_2, $_3, $prodnum) = @$prod; + push @prod, [$prodnum, $sym, $left, $right]; + } + } + for my $prod (sort { $a->[0] <=> $b->[0] } @prod) { + my ($prodnum, $sym, $left, $right) = @$prod; + $right = _unconvert_right($right, $left); + $code->($grammar, $s, $prodnum, $sym, $left, $right); + } +} + +sub read { + @_ == 2 or croak "Usage: GRAMMAR->read(FILEHANDLE)"; + my ($grammar, $fh) = @_; + + # make it faster to run next time + _convert_grammar($grammar); + + my $plist = $grammar->{productions}; + $fh->read_binary(pack('vv', $grammar->{rule_count}, scalar @$plist)); + for (my $symbol = 1; $symbol < @$plist; $symbol++) { + my $gp = $plist->[$symbol] || []; + $fh->read_binary(pack('v', scalar @$gp)); + for my $prod (@$gp) { + my ($left, $right, $initial, $startmap, $empty, $prodnum) = @$prod; + _read_left($fh, $left); + _read_right($fh, $right); + $fh->read_binary(pack('vvCv', length($initial), length($startmap), + $empty ? 1 : 0, $prodnum)); + $fh->read_binary($initial); + $fh->read_binary($startmap); + } + } + + $grammar; +} + +sub _read_left { + my ($fh, $left) = @_; + $fh->read_binary(pack('v', scalar(@$left))); + for my $element (@$left) { + my ($type, $e, $c, @e) = @$element; + $fh->read_binary($type); + if ($type eq 's') { + $fh->read_binary(pack('v', $e)); + } elsif ($type eq 'r') { + my $code = @e ? $e->[0]->save() : ''; + $fh->read_binary(pack('vva*a*', length($e), length($code), + $e, $code)); + } else { + $fh->read_binary(pack('v/a*', $e)); + } + $fh->read_binary(pack('v', $c)); + } +} + +sub _read_right { + my ($fh, $right) = @_; + $fh->read_binary(pack('v', scalar(@$right))); + for my $element (@$right) { + my $type = $element->[0]; + my $e = $element->[1]; + $fh->read_binary($type); + if ($type eq 'b') { + $fh->read_binary(pack('v/a*', $e)); + } elsif ($type ne '*') { + $fh->read_binary(pack('v', $e)); + } + } +} + +sub write { + @_ == 3 or croak "Usage: write " . + "Language::INTERCAL::Parser(FILEHANDLE, SYMBOLS)"; + my ($class, $fh, $symboltable) = @_; + + my ($rule_count, $nsymbols) = unpack('vv', $fh->write_binary(4)); + my @productions = (); + for (my $symbol = 1; $symbol < $nsymbols; $symbol++) { + my $nprod = unpack('v', $fh->write_binary(2)); + my @prod = (); + while (@prod < $nprod) { + my $left = _write_left($fh); + my $right = _write_right($fh); + my ($ninit, $mapsize, $empty, $prodnum) = + unpack('vvCv', $fh->write_binary(7)); + my $initial = $fh->write_binary($ninit); + my $startmap = $fh->write_binary($mapsize); + push @prod, + [$left, $right, $initial, $startmap, $empty, $prodnum]; + } + $productions[$symbol] = \@prod; + } + + my %predefined = (); + for my $pf (@parser_predefined) { + my $sn = $symboltable->find($pf->[0]); + $predefined{$sn} = $pf; + } + my $grammar = bless { + symboltable => $symboltable, + productions => \@productions, + converted => 1, + rule_count => $rule_count, + predefined => \%predefined, + }, $class; + + $grammar; +} + +sub _write_left { + my ($fh) = @_; + my $elems = unpack('v', $fh->write_binary(2)); + my @left = (); + while ($elems-- > 0) { + my $type = $fh->write_binary(1); + my $data = ''; + my @comp = (); + if ($type eq 's') { + $data = unpack('v', $fh->write_binary(2)); + } elsif ($type eq 'r') { + my ($rsize, $csize) = unpack('vv', $fh->write_binary(4)); + $data = $fh->write_binary($rsize); + my $comp; + if ($csize > 0) { + # restore reggrim from saved state + $comp = $fh->write_binary($csize); + $comp = Language::INTERCAL::Reggrim->restore($comp); + } else { + # try to compile reggrim + $comp = Language::INTERCAL::Reggrim->compile($data); + } + @comp = ($comp); + } else { + my $size = unpack('v', $fh->write_binary(2)); + $data = uc($fh->write_binary($size)); + @comp = (length($data)); + } + my $count = unpack('v', $fh->write_binary(2)); + push @left, [$type, $data, $count, @comp]; + } + \@left; +} + +sub _write_right { + my ($fh) = @_; + my $elems = unpack('v', $fh->write_binary(2)); + my @right = (); + while ($elems-- > 0) { + my $type = $fh->write_binary(1); + my $data = ''; + if ($type eq 'b') { + my $len = unpack('v', $fh->write_binary(2)); + $data = $fh->write_binary($len); + } elsif ($type ne '*') { + $data = unpack('v', $fh->write_binary(2)); + } + push @right, [$type, $data]; + } + \@right; +} + +sub _convert_left { + my ($left) = @_; + [map { + $_->[0] eq 'r' + ? [$_->[0], $_->[1], $_->[2], + Language::INTERCAL::Reggrim->compile($_->[1])] + : $_->[0] eq 'c' && $_->[1] eq '' + ? () + : $_->[0] eq 'c' + ? [$_->[0], uc($_->[1]), $_->[2], length($_->[1])] + : [$_->[0], $_->[1], $_->[2]]; + } @$left]; +} + +sub _find_right { + my ($grammar, $left, $type, $number, $data) = @_; + for (my $lp = 0; $lp < @$left; $lp++) { + my $l = $left->[$lp]; + next if $l->[0] ne $type; + next if $l->[0] eq 's' && $l->[1] != $data; + next if $l->[0] ne 's' && $l->[1] ne $data; + $number--; + return $lp if $number < 1; + } + if ($type eq 's') { + faint(SP_CREATION, "Symbol " . + $grammar->{symboltable}->symbol($data) . + " not found"); + } elsif ($type eq 'c') { + my @data = unpack('C*', $data); + faint(SP_CREATION, "Block (@data) not found"); + } elsif ($type eq 'r') { + faint(SP_CREATION, "Reggrim ($data) not found"); + } + faint(SP_CREATION, "Internal error"); +} + +sub _convert_right { + my ($right, $left, $grammar) = @_; + [map { + $_->[0] eq 'c' && $_->[2] eq '' + ? () + : $_->[0] =~ /^[scr]$/o + ? [$_->[0], _find_right($grammar, $left, $_->[0], $_->[1], $_->[2])] + : $_->[0] eq 'n' + ? [$_->[0], _find_right($grammar, $left, 's', $_->[1], $_->[2])] + : $_->[0] eq '*' + ? [$_->[0]] + : [$_->[0], $_->[1]]; + } @$right]; +} + +sub _unconvert_right { + my ($right, $left) = @_; + [map { + $_->[0] =~ /^[scr]$/o ? + [$_->[0], _count_left($_->[0], $_->[1], $left)] : + $_->[0] eq 'n' ? + [$_->[0], _count_left('s', $_->[1], $left)] : + [$_->[0], $_->[1]]; + } @$right]; +} + +sub _count_left { + my ($type, $number, $left) = @_; + my $count = 0; + my $data = $left->[$number][1]; + for (my $lp = 0; $lp <= $number; $lp++) { + my $l = $left->[$lp]; + next if $l->[0] ne $type; + next if $l->[0] eq 's' && $l->[1] != $data; + next if $l->[0] ne 's' && $l->[1] ne $data; + $count++; + } + ($count, $data); +} + +# compile_top works similarly to compile but is useful for some types of +# top-level symbols, as it avoids compile's potentially exponential +# behaviour. Returns a list of generated code fragments (no completion +# is attempted) + +sub compile_top { + @_ == 7 || @_ == 8 + or croak "Usage: GRAMMAR->compile_top(TOP, INT, SOURCE, " . + "POS, SPACE, JUNK [, VERBOSE])"; + my ($grammar, $tsymb, $isymb, $source, $ipos, $space, $junk, $verb) = @_; + _convert_grammar($grammar); + my @result = (); + $grammar->{junk_cache} = {}; + $grammar->{junk_symbol} = $junk; + my ($cspace, $sspace) = _precompile($grammar, $source, $space); + $grammar->{cspace} = $cspace; + $grammar->{sspace} = $sspace; + my $started_pos = $ipos; + my $started_time = time; + my $reported_time = $started_time; + while ($ipos < length $source) { + if ($verb) { + my $now = time; + if ($now - $reported_time > 60) { + my $s = substr($source, $ipos); + $s =~ s/\s+/ /go; + $s =~ s/^ //o; + $s = substr($s, 0, 28); + my $fraction = ($ipos - $started_pos) + / (length($source) - $started_pos); + my $eta = ''; + if ($fraction > .1) { + $eta = $started_time + ($now - $started_time) / $fraction; + my @eta = localtime($eta); + $eta = sprintf " ETA: %02d:%02d:%02d", @eta[2, 1, 0]; + } + my @now = localtime($now); + my $d = length(length $source); + printf STDERR + "\n %02d:%02d:%02d: done to %${d}d %-30s %5.1f%%%s", + @now[2,1,0], $ipos, "[$s]", 100 * $fraction, $eta; + $reported_time = $now; + } + } + my $pos = $ipos++; + defined _parse_junk($source, \$ipos, $grammar) + or $ipos = length $source; + my $pp = _compile($grammar, $tsymb, $source, $pos, $cspace, 0, 0); + if ($isymb) { + for my $p (@$pp) { + my ($ps, $pe, $pj, $pc, $pn, @pu) = @$p; + if ($pe < length($source)) { + my $ip = + _compile($grammar, $isymb, $source, $pe, $cspace, 0, 0); + if (@$ip) { + for my $i (@$ip) { + my ($is, $ie, $ij, $ic, $in, @iu) = @$i; + push @result, $pc . $ic; + } + } else { + push @result, $pc; + } + } else { + push @result, $pc; + } + } + } else { + push @result, map { $_->[3] } @$pp; + } + } + @result; +} + +# compile attempts to generate code; returns two ARRAYREFs, a list of +# generated code with elements [start, end, uses_junk?, code, count, @prods], +# and a list of possible completion if source is a prefix of a parseable string +# both lists will be empty if nothing can be parsed + +sub compile { + @_ == 6 + or croak "Usage: GRAMMAR->compile(SYMBOL, SOURCE, POS, SPACE, JUNK)"; + my ($grammar, $isymb, $source, $start, $space, $junk) = @_; + _convert_grammar($grammar); + $grammar->{junk_cache} = {}; + $grammar->{junk_symbol} = $junk; + my ($cspace, $sspace) = _precompile($grammar, $source, $space); + $grammar->{cspace} = $cspace; + $grammar->{sspace} = $sspace; + my %complete = (); + my $r = _compile($grammar, $isymb, $source, $start, $cspace, \%complete, 0); + ($r, [keys %complete]); +} + +sub _compile { + my ($grammar, $isymb, $source, $start, $cspace, $complete, $any) = @_; + return [] if $isymb < 1; + my $productions = $grammar->{productions}; + my $pos = $start; + $cspace->(\$pos); + return [] if $pos >= length $source && ! $complete; + my $predefs = $grammar->{predefined}; + my @result = (); + # special case out of the main loop (they should not normally do this) + if (exists $predefs->{$isymb}) { + my $pf = $predefs->{$isymb}; + if ($pos >= length $source && ! $pf->[5]) { + if ($complete) { + my $cpl = $pf->[6]->($source, $pos, $grammar, $pf); + $complete->{$_} = 1 for @$cpl; + } + } else { + my $end = $pos; + my @ok = $pf->[1]->($source, \$end, $grammar); + my $bad = $pf->[2] ? $end - $start : 0; + if ($end >= length $source && $complete) { + my $cpl = $pf->[6]->($source, $end, $grammar, $pf); + $complete->{$_} = 1 for @$cpl, @{$grammar->{sspace}}; + } + if (@ok) { + my ($code, $count) = $pf->[3]->(@ok); + $cspace->(\$end); + push @result, [$start, $end, $bad, $code, $count]; + } + } + return \@result; + } + # normal case, parsing on user-defined symbols + return [] if $isymb >= @$productions; + my $iprod = $productions->[$isymb]; + return [] if ! $iprod || ! @$iprod; + # prepare a list of states which look promising + my @state = (); + my $nxc = $pos < length($source) ? ord(substr($source, $pos, 1)) : undef; + for (my $prodnum = @$iprod - 1; $prodnum >= 0; $prodnum--) { + next unless $iprod->[$prodnum][4] + || ! defined $nxc + || vec($iprod->[$prodnum][2], $nxc, 1); + push @state, [$isymb, $prodnum, 0, $pos, 0, []]; + } + my $cpspace = 0; + STATE: while (@state) { + my ($symb, $prodnum, $prodelem, $place, $bad, $stack, @tree) = + @{pop @state}; + my $sprod = $productions->[$symb][$prodnum]; + my $left = $sprod->[0]; + ELEM: while ($prodelem < @$left) { + my ($type, $data, $count, $aux) = @{$left->[$prodelem]}; + $prodelem++; + if ($type eq 's') { + if (exists $predefs->{$data}) { + # predefined symbol - we can just run its code here + my $pf = $predefs->{$data}; + if ($place < length $source || $pf->[5]) { + my $end = $place; + my @ok = $pf->[1]->($source, \$end, $grammar); + if ($end >= length $source && $complete) { + my $cpl = + $pf->[6]->($source, $place, $grammar, $pf); + $complete->{$_} = 1 for @$cpl; + $cpspace = 1; + } + next STATE unless @ok; + $bad += $end - $place if $pf->[2]; + push @tree, [$pf->[3]->(@ok)]; + $cspace->(\$end); + $place = $end; + next ELEM; + } elsif ($complete) { + my $cpl = $pf->[6]->($source, $place, $grammar, $pf); + $complete->{$_} = 1 for @$cpl; + } + next STATE; + } else { + # user defined symbol - we need to push the current + # state onto the stack and add new states to @state + next STATE if $data >= @$productions; + my $prod = $productions->[$data]; + next STATE if ! $prod || ! @$prod; + pos($source) = $place; + push @$stack, [$symb, $prodnum, $prodelem, $bad, @tree]; + my $nxc = length $source < $place + ? ord(substr($source, $place, 1)) + : undef; + for (my $pn = @$prod - 1; $pn >= 0; $pn--) { + next unless $prod->[$pn][4] + || ! defined $nxc + || vec($prod->[$pn][2], $nxc, 1); + push @state, [$data, $pn, 0, $place, 0, [@$stack]]; + } + next STATE; + } + } elsif ($type eq 'c') { + # constant - just check if the required string is there + my $look = uc(substr($source, $place, $aux)); + if ($data eq $look) { + # yep, it's there - add the place to the current tree + # in case the code generator wants it + push @tree, [$place, $aux]; + $place += $aux; + $cspace->(\$place); + $cpspace = 1 if $place >= length $source; + next ELEM; + } elsif ($complete && + length($look) < length($data) && + substr($data, 0, length($look)) eq $look) + { + # a substring of the wanted string is there - so this could + # be an incomplete source + $complete->{substr($data, length($look))} = 1; + } + next STATE; + } elsif ($type eq 'r') { + # regular grimace - run it and see what we get + my ($type, $length, $follows) = $aux->match($source, $place); + if ($type) { + # a match - add its place and length to the parse tree + push @tree, [$place, $length]; + $place += $length; + $cspace->(\$place); + if ($complete) { + $complete->{$_} = 1 for @$follows; + $cpspace = 1 if $place >= length $source; + } + next ELEM; + } + if ($complete) { + $complete->{$_} = 1 for @$follows; + $cpspace = 1 if $place >= length $source; + } + next STATE; + } + } + # end of production - generate code + my %uses = ( $sprod->[5] => 1 ); + for my $t (@tree) { + my ($x, $c, @u) = @$t; + $uses{$_} = 1 for @u; + } + my @uses = sort { $a <=> $b } keys %uses; + my ($code, $count) = _gencode($source, $left, $sprod->[1], \@tree, + $start, $place - $start, $bad, \@uses); + if (@$stack) { + # we were called by another nonterminal + my ($nsym, $nprd, $nelm, $nbad, @ntree) = @{pop @$stack}; + $nbad += $bad; + push @state, [$nsym, $nprd, $nelm, $place, $nbad, + $stack, @ntree, [$code, $count, @uses]]; + } else { + # top level symbol, in other words a (possibly partial) result + push @result, [$start, $place, $bad, $code, $count, @uses]; + return \@result if $any; + } + } + if ($complete && $cpspace) { + $complete->{$_} = 1 for @{$grammar->{sspace}}; + } + return \@result; +} + +sub _gencode { + my ($source, $left, $right, $tree, $start, $length, $junk, $uses) = @_; + my $code = ''; + for my $rp (@$right) { + my ($type, $value) = @$rp; + if ($type eq 'b') { + $code .= $value; + next; + } + if ($type eq 's') { + $code .= $tree->[$value][0]; + next; + } + if ($type eq 'n') { + $code .= pack('C*', BC($tree->[$value][1])); + next; + } + if ($type eq 'c' || $type eq 'r') { + my ($place, $len) = @{$tree->[$value]}; + my $const = substr($source, $place, $len); + my @v = unpack('C*', $const); + $code .= pack('C*', BC_MUL, map { BC($_) } scalar(@v), @v); + next; + } + if ($type eq '*') { + $code .= pack('C*', map { BC($_) } $start, $length, $junk, + scalar @$uses, @$uses); + next; + } + } + my $count = 0; + for (my $lp = 0; $lp < @$left; $lp++) { + my $lc = $left->[$lp][2]; + $count += $lc == 0xffff ? $tree->[$lp][1] : $lc; + } + ($code, $count); +} + +sub _find_starts { + my ($grammar) = @_; + + # first find if any symbol can expand (directly) to empty strings + # or (directly) to another symbol; since information about each of + # these changes our idea of the other, we keep repeating until we + # cannot make any more changes + my $empty = ''; + + my $found = 0; + for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) { + my $plist = $grammar->{productions}[$symb]; + next unless $plist; + for my $prod (@$plist) { + $prod->[2] = ''; + $prod->[3] = ''; + $prod->[4] = 0; + } + $found++; + } + return $empty unless $found; + + my $continue = 1; + while ($continue) { + $continue = 0; + SYMB: for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) { + my $plist = $grammar->{productions}[$symb]; + next unless $plist; + PROD: for my $prod (@$plist) { + # look at the first element of the production, if there's one + ELEM: for my $p (@{$prod->[0]}) { + if ($p->[0] eq 's') { + # that means we can access this particular symbol + $continue = 1 if ! vec($prod->[3], $p->[1], 1); + vec($prod->[3], $p->[1], 1) = 1; + # if we know the symbol can parse the empty string, + # we also need to check the next element + next if vec($empty, $p->[1], 1); + } + next if ($p->[0] eq 'c' && $p->[1] eq '') || + ($p->[0] eq 'r' && $p->[3]->can_empty()); + next PROD; + } + # if we get here, all productions are empty, so... + $continue = 1 if ! vec($empty, $symb, 1); + vec($empty, $symb, 1) = 1; + $prod->[4] = 1; + } + } + } + + for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) { + my $plist = $grammar->{productions}[$symb]; + next unless $plist; + for my $prod (@$plist) { + faint(SP_CIRCULAR, $grammar->{symboltable}->symbol($symb)) + if vec($prod->[3], $symb, 1); + } + } + + return $empty; +} + +sub _convert_grammar { + my ($grammar) = @_; + + return if exists $grammar->{converted}; + my $empty = _find_starts($grammar); + my @i_total = map { '' } @{$grammar->{productions}}; + my $predefs = $grammar->{predefined}; + + # first find the "direct" initials; + SYMB: for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) { + my $plist = $grammar->{productions}[$symb]; + next unless $plist; + PROD: for my $prod (@$plist) { + next unless $prod; + $prod->[2] = ''; + # look at the first element of the production, if there's one + ELEM: for my $p (@{$prod->[0]}) { + if ($p->[0] eq 'c') { + next if $p->[1] eq ''; + my $l = ord(lc(substr($p->[1], 0, 1))); + my $u = ord(uc(substr($p->[1], 0, 1))); + vec($i_total[$symb], $l, 1) = 1; + vec($i_total[$symb], $u, 1) = 1; + vec($prod->[2], $l, 1) = 1; + vec($prod->[2], $u, 1) = 1; + next PROD; + } + if ($p->[0] eq 'r') { + my @i = $p->[3]->can_start(); + for my $s (@i) { + vec($i_total[$symb], $s, 1) = 1; + vec($prod->[2], $s, 1) = 1; + } + next if $p->[3]->can_empty(); + next PROD; + } + if ($p->[0] eq 's' && exists $predefs->{$p->[1]}) { + my $pf = $predefs->{$p->[1]}; + my $st = $pf->[4]; + my $em = $pf->[5]; + $i_total[$symb] |= $st; + $prod->[2] |= $st; + next if $em; + next PROD; + } + next if ($p->[0] eq 's' && vec($empty, $p->[1], 1)); + next PROD; + } + } + } + + # now propagate %i_... using %starts + my $continue = 1; + while ($continue) { + $continue = 0; + for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) { + my $plist = $grammar->{productions}[$symb]; + next unless $plist; + for my $prod (@$plist) { + for (my $other = 0; $other < 8 * length($prod->[3]); $other++) { + next unless vec($prod->[3], $other, 1); + my $init = $i_total[$other]; + my $np = $prod->[2] | $init; + $continue = 1 + if $np ne substr($prod->[2], 0, length($np)); + $prod->[2] |= $init; + $i_total[$symb] |= $init; + } + } + } + } + + my $mask = ''; + for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) { + if (exists $predefs->{$symb}) { + vec($mask, $symb, 1) = 1; + next; + } + my $plist = $grammar->{productions}[$symb]; + next unless $plist; + for my $prod (@$plist) { + $prod->[3] = substr($prod->[3], 0, 1) & $mask; + } + } + + $grammar->{converted} = 1; +} + +sub _left_equal { + my ($l1, $l2) = @_; + return 0 if @$l1 != @$l2; + for (my $c = 0; $c < @$l1; $c++) { + my ($t1, $d1, $c1, $a1) = @{$l1->[$c]}; + my ($t2, $d2, $c2, $a2) = @{$l2->[$c]}; + return 0 if $t1 ne $t2; + return 0 if $c1 != $c2; + if ($t1 eq 's') { + return 0 if $d1 != $d2; + } elsif ($t1 eq 'c') { + return 0 if $d1 ne $d2; + } elsif ($t1 eq 'r') { + return 0 if ! $a1->is_equal($a2); + } else { + return 0; + } + } + return 1; +} + +sub _right_equal { + my ($r1, $r2) = @_; + return 0 if @$r1 != @$r2; + for (my $c = 0; $c < @$r1; $c++) { + my ($t1, $v1) = @{$r1->[$c]}; + my ($t2, $v2) = @{$r2->[$c]}; + return 0 if $t1 ne $t2; + if ($t1 eq 'b') { + return 0 if $v1 ne $v2; + } elsif ($t1 eq 's' || $t1 eq 'c' || $t1 eq 'n' || $t1 eq 'r') { + return 0 if $v1 != $v2; + } elsif ($t1 ne '*') { + return 0; + } + } + return 1; +} + +sub _find_rule { + my ($grammar, $symb, $left, $right) = @_; + return () if $symb >= @{$grammar->{productions}}; + my $prods = $grammar->{productions}[$symb]; + return () unless $prods; + my @found = (); + SYMB: for (my $pp = 0; $pp < @$prods; $pp++) { + my ($l, $r, $i, $s, $e, $c) = @{$prods->[$pp]}; + # see if this production is same as $left and (if provided) $right + next SYMB if ! _left_equal($l, $left); + next SYMB if $right && ! _right_equal($r, $right); + push @found, $c; + } + return @found; +} + +sub add { + @_ == 4 or croak "Usage: GRANMAR->add(SYMBOL, LEFT, RIGHT)"; + my ($grammar, $symb, $left, $right) = @_; + $left = _convert_left($left); + $right = _convert_right($right, $left, $grammar); + # do we already have this production? + PROD: for my $prod (@{$grammar->{productions}[$symb]}) { + my ($l, $r, $i, $s, $e, $c) = @$prod; + next PROD if ! _left_equal($left, $l); + next PROD if ! _right_equal($right, $r); + # we have it, no need to add anything or make any changes + return -$c; + } + my $prodnum = ++$grammar->{rule_count}; + push @{$grammar->{productions}[$symb]}, + [$left, $right, '', '', 0, $prodnum]; + delete $grammar->{converted}; + $prodnum; +} + +sub find_rule { + @_ == 3 || @_ == 4 + or croak "Usage: GRANMAR->find_rule(SYMBOL, LEFT [, RIGHT])"; + my ($grammar, $symb, $left, $right) = @_; + $left = _convert_left($left); + $right = _convert_right($right, $left, $grammar) if $right; + my @rules = _find_rule($grammar, $symb, $left, $right); + wantarray ? @rules : $rules[0]; +} + +1;