Mercurial > repo
view interps/clc-intercal/CLC-INTERCAL-Base-1.-94.-2/INTERCAL/Parser.pm @ 3553:a2c0fbb7c2b1
<Roujo> revert
author | HackBot |
---|---|
date | Thu, 29 Aug 2013 20:30:48 +0000 |
parents | 859f9b4339e6 |
children |
line wrap: on
line source
package Language::INTERCAL::Parser; # Parser/code generator/etc # This file is part of CLC-INTERCAL # Copyright (c) 2006-2008 Claudio Calvelli, all rights reserved. # CLC-INTERCAL is copyrighted software. However, permission to use, modify, # and distribute it is granted provided that the conditions set out in the # licence agreement are met. See files README and COPYING in the distribution. use strict; use vars qw($VERSION $PERVERSION); ($VERSION) = ($PERVERSION = "CLC-INTERCAL/Base INTERCAL/Parser.pm 1.-94.-2") =~ /\s(\S+)$/; use Carp; use Language::INTERCAL::Exporter '1.-94.-2'; use Language::INTERCAL::Splats '1.-94.-2', qw(:SP); use Language::INTERCAL::ByteCode '1.-94.-2', qw(:BC); use Language::INTERCAL::SymbolTable '1.-94.-2'; use Language::INTERCAL::Reggrim '1.-94.-2'; # for some reason this sort of things works faster than regexes here my $digits = ''; vec($digits, ord($_), 1) = 1 for (0..9); my $alphalist ='abcdefghijklmnopqrstuvwxyz_ABCDEFGHIJKLMNOPQRSTUVWXYZ'; my $alphabet = $digits; for (my $i = 0; $i < length $alphalist; $i++) { vec($alphabet, ord(substr($alphalist, $i, 1)), 1) = 1; } my $anything = ''; vec($anything, $_, 1) = 1 for (0..255); my $spaces = ''; vec($spaces, ord($_), 1) = 1 for (" ", "\t", "\012", "\015"); my $nonspaces = $anything; vec($spaces, ord($_), 1) = 0 for (" ", "\t", "\012"); my @parser_predefined = ( # NAME PARSE BAD GENCODE STARTS EMPTY COMPLETE ["CONSTANT", \&_parse_constant, 0, \&_code_constant, $digits, 0, \&_complete_constant], ["SYMBOL", \&_parse_symbol, 0, \&_code_symbol, $alphabet, 0, \&_complete_list], ["JUNK", \&_parse_junk, 1, \&_code_junk, $anything, 1, \&_complete_none], ["SPACE", \&_parse_space, 0, sub { () }, $spaces, 1, \&_complete_list], ["BLACKSPACE", \&_parse_blackspace, 0, sub { () }, $nonspaces, 1, \&_complete_list], ["ANYTHING", \&_parse_anything, 0, sub { () }, $anything, 0, \&_complete_none], ); sub _parse_constant { my ($src, $pos, $grammar) = @_; pos($src) = $$pos; return () unless $src =~ /\G0*(\d{1,5})/go; $$pos = pos($src); my $con = $1 + 0; return $con if $con < 65536; $$pos--; return int($con / 10); } sub _code_constant { my ($number) = @_; my @code = BC($number); (pack('C*', @code), scalar(@code)); } sub _complete_constant { my ($src, $pos, $grammar, $pf) = @_; my $con = substr($src, $pos); return [0..9] if $con eq '' || $con < 6553; return [0..5] if $con == 6553; return []; } sub _complete_list { my ($src, $pos, $grammar, $pf) = @_; my $vec = $pf->[4]; my @cpl = (); for (my $sym = 0; $sym < 8 * length $vec; $sym++) { push @cpl, chr($sym) if vec($vec, $sym, 1); } return \@cpl; } sub _complete_none { my ($src, $pos, $grammar, $pf) = @_; return []; } sub _parse_symbol { my ($src, $pos, $grammar) = @_; pos($src) = $$pos; return () unless $src =~ /\G(\w+)/go; $$pos = pos($src); return $1; } sub _code_symbol { my ($string) = @_; (pack('C*', BC_STR, BC(length $string)) . $string, 1); } sub _parse_junk { my ($src, $pos, $grammar) = @_; my $junk = $grammar->{junk_symbol}; return () unless $junk && $junk <= @{$grammar->{productions}}; my $cspace = $grammar->{cspace}; if (! exists $grammar->{junk_cache}{$$pos}) { # XXX this could be made more efficient, for now we'll leave it at this my $end = undef; $grammar->{junk_symbol} = 0; for (my $p = $$pos + 1; $p < length($src); $p++) { $cspace->(\$p); my $t = _compile($grammar, $junk, $src, $p, $cspace, 0, 1); next unless @$t; $end = $p; last; } $grammar->{junk_symbol} = $junk; $end = length($src) if ! defined $end; for (my $p = $$pos; $p < $end; $p++) { $grammar->{junk_cache}{$p} = $end; } } my $ej = $grammar->{junk_cache}{$$pos}; my $res = substr($src, $$pos, $ej - $$pos); $$pos = $ej; $res; } sub _code_junk { my ($string) = @_; $string =~ s/^\s+//o; $string =~ s/\s+$//o; (pack('C*', BC_STR, BC(length $string)) . $string, 1); } sub _parse_space { my ($src, $pos, $grammar) = @_; pos($src) = $$pos; return () unless $src =~ /\G([ \t\012\015]+)/go; $$pos = pos($src); $1; } sub _parse_blackspace { my ($src, $pos, $grammar) = @_; pos($src) = $$pos; return () unless $src =~ /\G([^ \t\012]+)/go; $$pos = pos($src); $1; } sub _parse_anything { my ($src, $pos, $grammar) = @_; my $p = $$pos; return () if $p >= length $src; $$pos = $p + 1; substr($src, $p, 1); } # precompile provides optimised access to _parse_space etc to be used when # using the compiler's SPACE symbol - this saves quite a lot of compile time sub _precompile { my ($grammar, $source, $space) = @_; my $predefs = $grammar->{predefined}; if (exists $predefs->{$space}) { if ($predefs->{$space}[1] == \&_parse_space) { return ( sub { my ($pos) = @_; pos($source) = $$pos; return unless $source =~ /\G[ \t\012\015]+/go; $$pos = pos($source); }, [' ', "\t", "\012", "\015"], ); } if ($predefs->{$space}[1] == \&_parse_blackspace) { return ( sub { my ($pos) = @_; pos($source) = $$pos; return unless $source =~ /\G[^ \t\012]+/go; $$pos = pos($source); }, [grep { ! /^[ \t\012]/ } map { chr } (0..255)], ); } my $sub = $predefs->{$space}[1]; my $start = $predefs->{$space}[4]; return ( sub { my ($pos) = @_; $sub->($source, $pos, $grammar); }, [map { chr } grep { vec($start, $_, 1) } (0..255)], ); } return (sub {}, []) unless $space && $space <= @{$grammar->{productions}}; my $start = ''; for my $prod (@{$grammar->{productions}[$space]}) { $start |= $prod->[2]; } return ( sub { my ($pos) = @_; my $p = _compile($grammar, $space, $source, $$pos, sub {}, 0, 0); # now find the longest matching result for my $e (@$p) { my ($start, $end) = @$e; $$pos = $end if $$pos < $end; } }, [map { chr } grep { vec($start, $_, 1) } (0..255)], ); } sub new { @_ == 2 or croak "Usage: new Language::INTERCAL::Parser(SYMBOLTABLE)"; my ($class, $symboltable) = @_; my %predefined = (); for my $pf (@parser_predefined) { my $sn = $symboltable->find($pf->[0]); $predefined{$sn} = $pf; } bless { productions => [], converted => 1, rule_count => 0, symboltable => $symboltable, predefined => \%predefined, optimise => {}, }, $class; } sub forall { @_ == 2 or croak "Usage: GRAMMAR->forall(CODE)"; my ($grammar, $code) = @_; my $p = $grammar->{productions}; my $s = $grammar->{symboltable}; my @prod = (); for (my $sym = 0; $sym < @$p; $sym++) { next unless $p->[$sym]; for my $prod (@{$p->[$sym]}) { my ($left, $right, $_1, $_2, $_3, $prodnum) = @$prod; push @prod, [$prodnum, $sym, $left, $right]; } } for my $prod (sort { $a->[0] <=> $b->[0] } @prod) { my ($prodnum, $sym, $left, $right) = @$prod; $right = _unconvert_right($right, $left); $code->($grammar, $s, $prodnum, $sym, $left, $right); } } sub read { @_ == 2 or croak "Usage: GRAMMAR->read(FILEHANDLE)"; my ($grammar, $fh) = @_; # make it faster to run next time _convert_grammar($grammar); my $plist = $grammar->{productions}; $fh->read_binary(pack('vv', $grammar->{rule_count}, scalar @$plist)); for (my $symbol = 1; $symbol < @$plist; $symbol++) { my $gp = $plist->[$symbol] || []; $fh->read_binary(pack('v', scalar @$gp)); for my $prod (@$gp) { my ($left, $right, $initial, $startmap, $empty, $prodnum) = @$prod; _read_left($fh, $left); _read_right($fh, $right); $fh->read_binary(pack('vvCv', length($initial), length($startmap), $empty ? 1 : 0, $prodnum)); $fh->read_binary($initial); $fh->read_binary($startmap); } } $grammar; } sub _read_left { my ($fh, $left) = @_; $fh->read_binary(pack('v', scalar(@$left))); for my $element (@$left) { my ($type, $e, $c, @e) = @$element; $fh->read_binary($type); if ($type eq 's') { $fh->read_binary(pack('v', $e)); } elsif ($type eq 'r') { my $code = @e ? $e->[0]->save() : ''; $fh->read_binary(pack('vva*a*', length($e), length($code), $e, $code)); } else { $fh->read_binary(pack('v/a*', $e)); } $fh->read_binary(pack('v', $c)); } } sub _read_right { my ($fh, $right) = @_; $fh->read_binary(pack('v', scalar(@$right))); for my $element (@$right) { my $type = $element->[0]; my $e = $element->[1]; $fh->read_binary($type); if ($type eq 'b') { $fh->read_binary(pack('v/a*', $e)); } elsif ($type ne '*') { $fh->read_binary(pack('v', $e)); } } } sub write { @_ == 3 or croak "Usage: write " . "Language::INTERCAL::Parser(FILEHANDLE, SYMBOLS)"; my ($class, $fh, $symboltable) = @_; my ($rule_count, $nsymbols) = unpack('vv', $fh->write_binary(4)); my @productions = (); for (my $symbol = 1; $symbol < $nsymbols; $symbol++) { my $nprod = unpack('v', $fh->write_binary(2)); my @prod = (); while (@prod < $nprod) { my $left = _write_left($fh); my $right = _write_right($fh); my ($ninit, $mapsize, $empty, $prodnum) = unpack('vvCv', $fh->write_binary(7)); my $initial = $fh->write_binary($ninit); my $startmap = $fh->write_binary($mapsize); push @prod, [$left, $right, $initial, $startmap, $empty, $prodnum]; } $productions[$symbol] = \@prod; } my %predefined = (); for my $pf (@parser_predefined) { my $sn = $symboltable->find($pf->[0]); $predefined{$sn} = $pf; } my $grammar = bless { symboltable => $symboltable, productions => \@productions, converted => 1, rule_count => $rule_count, predefined => \%predefined, }, $class; $grammar; } sub _write_left { my ($fh) = @_; my $elems = unpack('v', $fh->write_binary(2)); my @left = (); while ($elems-- > 0) { my $type = $fh->write_binary(1); my $data = ''; my @comp = (); if ($type eq 's') { $data = unpack('v', $fh->write_binary(2)); } elsif ($type eq 'r') { my ($rsize, $csize) = unpack('vv', $fh->write_binary(4)); $data = $fh->write_binary($rsize); my $comp; if ($csize > 0) { # restore reggrim from saved state $comp = $fh->write_binary($csize); $comp = Language::INTERCAL::Reggrim->restore($comp); } else { # try to compile reggrim $comp = Language::INTERCAL::Reggrim->compile($data); } @comp = ($comp); } else { my $size = unpack('v', $fh->write_binary(2)); $data = uc($fh->write_binary($size)); @comp = (length($data)); } my $count = unpack('v', $fh->write_binary(2)); push @left, [$type, $data, $count, @comp]; } \@left; } sub _write_right { my ($fh) = @_; my $elems = unpack('v', $fh->write_binary(2)); my @right = (); while ($elems-- > 0) { my $type = $fh->write_binary(1); my $data = ''; if ($type eq 'b') { my $len = unpack('v', $fh->write_binary(2)); $data = $fh->write_binary($len); } elsif ($type ne '*') { $data = unpack('v', $fh->write_binary(2)); } push @right, [$type, $data]; } \@right; } sub _convert_left { my ($left) = @_; [map { $_->[0] eq 'r' ? [$_->[0], $_->[1], $_->[2], Language::INTERCAL::Reggrim->compile($_->[1])] : $_->[0] eq 'c' && $_->[1] eq '' ? () : $_->[0] eq 'c' ? [$_->[0], uc($_->[1]), $_->[2], length($_->[1])] : [$_->[0], $_->[1], $_->[2]]; } @$left]; } sub _find_right { my ($grammar, $left, $type, $number, $data) = @_; for (my $lp = 0; $lp < @$left; $lp++) { my $l = $left->[$lp]; next if $l->[0] ne $type; next if $l->[0] eq 's' && $l->[1] != $data; next if $l->[0] ne 's' && $l->[1] ne $data; $number--; return $lp if $number < 1; } if ($type eq 's') { faint(SP_CREATION, "Symbol " . $grammar->{symboltable}->symbol($data) . " not found"); } elsif ($type eq 'c') { my @data = unpack('C*', $data); faint(SP_CREATION, "Block (@data) not found"); } elsif ($type eq 'r') { faint(SP_CREATION, "Reggrim ($data) not found"); } faint(SP_CREATION, "Internal error"); } sub _convert_right { my ($right, $left, $grammar) = @_; [map { $_->[0] eq 'c' && $_->[2] eq '' ? () : $_->[0] =~ /^[scr]$/o ? [$_->[0], _find_right($grammar, $left, $_->[0], $_->[1], $_->[2])] : $_->[0] eq 'n' ? [$_->[0], _find_right($grammar, $left, 's', $_->[1], $_->[2])] : $_->[0] eq '*' ? [$_->[0]] : [$_->[0], $_->[1]]; } @$right]; } sub _unconvert_right { my ($right, $left) = @_; [map { $_->[0] =~ /^[scr]$/o ? [$_->[0], _count_left($_->[0], $_->[1], $left)] : $_->[0] eq 'n' ? [$_->[0], _count_left('s', $_->[1], $left)] : [$_->[0], $_->[1]]; } @$right]; } sub _count_left { my ($type, $number, $left) = @_; my $count = 0; my $data = $left->[$number][1]; for (my $lp = 0; $lp <= $number; $lp++) { my $l = $left->[$lp]; next if $l->[0] ne $type; next if $l->[0] eq 's' && $l->[1] != $data; next if $l->[0] ne 's' && $l->[1] ne $data; $count++; } ($count, $data); } # compile_top works similarly to compile but is useful for some types of # top-level symbols, as it avoids compile's potentially exponential # behaviour. Returns a list of generated code fragments (no completion # is attempted) sub compile_top { @_ == 7 || @_ == 8 or croak "Usage: GRAMMAR->compile_top(TOP, INT, SOURCE, " . "POS, SPACE, JUNK [, VERBOSE])"; my ($grammar, $tsymb, $isymb, $source, $ipos, $space, $junk, $verb) = @_; _convert_grammar($grammar); my @result = (); $grammar->{junk_cache} = {}; $grammar->{junk_symbol} = $junk; my ($cspace, $sspace) = _precompile($grammar, $source, $space); $grammar->{cspace} = $cspace; $grammar->{sspace} = $sspace; my $started_pos = $ipos; my $started_time = time; my $reported_time = $started_time; while ($ipos < length $source) { if ($verb) { my $now = time; if ($now - $reported_time > 60) { my $s = substr($source, $ipos); $s =~ s/\s+/ /go; $s =~ s/^ //o; $s = substr($s, 0, 28); my $fraction = ($ipos - $started_pos) / (length($source) - $started_pos); my $eta = ''; if ($fraction > .1) { $eta = $started_time + ($now - $started_time) / $fraction; my @eta = localtime($eta); $eta = sprintf " ETA: %02d:%02d:%02d", @eta[2, 1, 0]; } my @now = localtime($now); my $d = length(length $source); printf STDERR "\n %02d:%02d:%02d: done to %${d}d %-30s %5.1f%%%s", @now[2,1,0], $ipos, "[$s]", 100 * $fraction, $eta; $reported_time = $now; } } my $pos = $ipos++; defined _parse_junk($source, \$ipos, $grammar) or $ipos = length $source; my $pp = _compile($grammar, $tsymb, $source, $pos, $cspace, 0, 0); if ($isymb) { for my $p (@$pp) { my ($ps, $pe, $pj, $pc, $pn, @pu) = @$p; if ($pe < length($source)) { my $ip = _compile($grammar, $isymb, $source, $pe, $cspace, 0, 0); if (@$ip) { for my $i (@$ip) { my ($is, $ie, $ij, $ic, $in, @iu) = @$i; push @result, $pc . $ic; } } else { push @result, $pc; } } else { push @result, $pc; } } } else { push @result, map { $_->[3] } @$pp; } } @result; } # compile attempts to generate code; returns two ARRAYREFs, a list of # generated code with elements [start, end, uses_junk?, code, count, @prods], # and a list of possible completion if source is a prefix of a parseable string # both lists will be empty if nothing can be parsed sub compile { @_ == 6 or croak "Usage: GRAMMAR->compile(SYMBOL, SOURCE, POS, SPACE, JUNK)"; my ($grammar, $isymb, $source, $start, $space, $junk) = @_; _convert_grammar($grammar); $grammar->{junk_cache} = {}; $grammar->{junk_symbol} = $junk; my ($cspace, $sspace) = _precompile($grammar, $source, $space); $grammar->{cspace} = $cspace; $grammar->{sspace} = $sspace; my %complete = (); my $r = _compile($grammar, $isymb, $source, $start, $cspace, \%complete, 0); ($r, [keys %complete]); } sub _compile { my ($grammar, $isymb, $source, $start, $cspace, $complete, $any) = @_; return [] if $isymb < 1; my $productions = $grammar->{productions}; my $pos = $start; $cspace->(\$pos); return [] if $pos >= length $source && ! $complete; my $predefs = $grammar->{predefined}; my @result = (); # special case out of the main loop (they should not normally do this) if (exists $predefs->{$isymb}) { my $pf = $predefs->{$isymb}; if ($pos >= length $source && ! $pf->[5]) { if ($complete) { my $cpl = $pf->[6]->($source, $pos, $grammar, $pf); $complete->{$_} = 1 for @$cpl; } } else { my $end = $pos; my @ok = $pf->[1]->($source, \$end, $grammar); my $bad = $pf->[2] ? $end - $start : 0; if ($end >= length $source && $complete) { my $cpl = $pf->[6]->($source, $end, $grammar, $pf); $complete->{$_} = 1 for @$cpl, @{$grammar->{sspace}}; } if (@ok) { my ($code, $count) = $pf->[3]->(@ok); $cspace->(\$end); push @result, [$start, $end, $bad, $code, $count]; } } return \@result; } # normal case, parsing on user-defined symbols return [] if $isymb >= @$productions; my $iprod = $productions->[$isymb]; return [] if ! $iprod || ! @$iprod; # prepare a list of states which look promising my @state = (); my $nxc = $pos < length($source) ? ord(substr($source, $pos, 1)) : undef; for (my $prodnum = @$iprod - 1; $prodnum >= 0; $prodnum--) { next unless $iprod->[$prodnum][4] || ! defined $nxc || vec($iprod->[$prodnum][2], $nxc, 1); push @state, [$isymb, $prodnum, 0, $pos, 0, []]; } my $cpspace = 0; STATE: while (@state) { my ($symb, $prodnum, $prodelem, $place, $bad, $stack, @tree) = @{pop @state}; my $sprod = $productions->[$symb][$prodnum]; my $left = $sprod->[0]; ELEM: while ($prodelem < @$left) { my ($type, $data, $count, $aux) = @{$left->[$prodelem]}; $prodelem++; if ($type eq 's') { if (exists $predefs->{$data}) { # predefined symbol - we can just run its code here my $pf = $predefs->{$data}; if ($place < length $source || $pf->[5]) { my $end = $place; my @ok = $pf->[1]->($source, \$end, $grammar); if ($end >= length $source && $complete) { my $cpl = $pf->[6]->($source, $place, $grammar, $pf); $complete->{$_} = 1 for @$cpl; $cpspace = 1; } next STATE unless @ok; $bad += $end - $place if $pf->[2]; push @tree, [$pf->[3]->(@ok)]; $cspace->(\$end); $place = $end; next ELEM; } elsif ($complete) { my $cpl = $pf->[6]->($source, $place, $grammar, $pf); $complete->{$_} = 1 for @$cpl; } next STATE; } else { # user defined symbol - we need to push the current # state onto the stack and add new states to @state next STATE if $data >= @$productions; my $prod = $productions->[$data]; next STATE if ! $prod || ! @$prod; pos($source) = $place; push @$stack, [$symb, $prodnum, $prodelem, $bad, @tree]; my $nxc = length $source < $place ? ord(substr($source, $place, 1)) : undef; for (my $pn = @$prod - 1; $pn >= 0; $pn--) { next unless $prod->[$pn][4] || ! defined $nxc || vec($prod->[$pn][2], $nxc, 1); push @state, [$data, $pn, 0, $place, 0, [@$stack]]; } next STATE; } } elsif ($type eq 'c') { # constant - just check if the required string is there my $look = uc(substr($source, $place, $aux)); if ($data eq $look) { # yep, it's there - add the place to the current tree # in case the code generator wants it push @tree, [$place, $aux]; $place += $aux; $cspace->(\$place); $cpspace = 1 if $place >= length $source; next ELEM; } elsif ($complete && length($look) < length($data) && substr($data, 0, length($look)) eq $look) { # a substring of the wanted string is there - so this could # be an incomplete source $complete->{substr($data, length($look))} = 1; } next STATE; } elsif ($type eq 'r') { # regular grimace - run it and see what we get my ($type, $length, $follows) = $aux->match($source, $place); if ($type) { # a match - add its place and length to the parse tree push @tree, [$place, $length]; $place += $length; $cspace->(\$place); if ($complete) { $complete->{$_} = 1 for @$follows; $cpspace = 1 if $place >= length $source; } next ELEM; } if ($complete) { $complete->{$_} = 1 for @$follows; $cpspace = 1 if $place >= length $source; } next STATE; } } # end of production - generate code my %uses = ( $sprod->[5] => 1 ); for my $t (@tree) { my ($x, $c, @u) = @$t; $uses{$_} = 1 for @u; } my @uses = sort { $a <=> $b } keys %uses; my ($code, $count) = _gencode($source, $left, $sprod->[1], \@tree, $start, $place - $start, $bad, \@uses); if (@$stack) { # we were called by another nonterminal my ($nsym, $nprd, $nelm, $nbad, @ntree) = @{pop @$stack}; $nbad += $bad; push @state, [$nsym, $nprd, $nelm, $place, $nbad, $stack, @ntree, [$code, $count, @uses]]; } else { # top level symbol, in other words a (possibly partial) result push @result, [$start, $place, $bad, $code, $count, @uses]; return \@result if $any; } } if ($complete && $cpspace) { $complete->{$_} = 1 for @{$grammar->{sspace}}; } return \@result; } sub _gencode { my ($source, $left, $right, $tree, $start, $length, $junk, $uses) = @_; my $code = ''; for my $rp (@$right) { my ($type, $value) = @$rp; if ($type eq 'b') { $code .= $value; next; } if ($type eq 's') { $code .= $tree->[$value][0]; next; } if ($type eq 'n') { $code .= pack('C*', BC($tree->[$value][1])); next; } if ($type eq 'c' || $type eq 'r') { my ($place, $len) = @{$tree->[$value]}; my $const = substr($source, $place, $len); my @v = unpack('C*', $const); $code .= pack('C*', BC_MUL, map { BC($_) } scalar(@v), @v); next; } if ($type eq '*') { $code .= pack('C*', map { BC($_) } $start, $length, $junk, scalar @$uses, @$uses); next; } } my $count = 0; for (my $lp = 0; $lp < @$left; $lp++) { my $lc = $left->[$lp][2]; $count += $lc == 0xffff ? $tree->[$lp][1] : $lc; } ($code, $count); } sub _find_starts { my ($grammar) = @_; # first find if any symbol can expand (directly) to empty strings # or (directly) to another symbol; since information about each of # these changes our idea of the other, we keep repeating until we # cannot make any more changes my $empty = ''; my $found = 0; for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) { my $plist = $grammar->{productions}[$symb]; next unless $plist; for my $prod (@$plist) { $prod->[2] = ''; $prod->[3] = ''; $prod->[4] = 0; } $found++; } return $empty unless $found; my $continue = 1; while ($continue) { $continue = 0; SYMB: for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) { my $plist = $grammar->{productions}[$symb]; next unless $plist; PROD: for my $prod (@$plist) { # look at the first element of the production, if there's one ELEM: for my $p (@{$prod->[0]}) { if ($p->[0] eq 's') { # that means we can access this particular symbol $continue = 1 if ! vec($prod->[3], $p->[1], 1); vec($prod->[3], $p->[1], 1) = 1; # if we know the symbol can parse the empty string, # we also need to check the next element next if vec($empty, $p->[1], 1); } next if ($p->[0] eq 'c' && $p->[1] eq '') || ($p->[0] eq 'r' && $p->[3]->can_empty()); next PROD; } # if we get here, all productions are empty, so... $continue = 1 if ! vec($empty, $symb, 1); vec($empty, $symb, 1) = 1; $prod->[4] = 1; } } } for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) { my $plist = $grammar->{productions}[$symb]; next unless $plist; for my $prod (@$plist) { faint(SP_CIRCULAR, $grammar->{symboltable}->symbol($symb)) if vec($prod->[3], $symb, 1); } } return $empty; } sub _convert_grammar { my ($grammar) = @_; return if exists $grammar->{converted}; my $empty = _find_starts($grammar); my @i_total = map { '' } @{$grammar->{productions}}; my $predefs = $grammar->{predefined}; # first find the "direct" initials; SYMB: for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) { my $plist = $grammar->{productions}[$symb]; next unless $plist; PROD: for my $prod (@$plist) { next unless $prod; $prod->[2] = ''; # look at the first element of the production, if there's one ELEM: for my $p (@{$prod->[0]}) { if ($p->[0] eq 'c') { next if $p->[1] eq ''; my $l = ord(lc(substr($p->[1], 0, 1))); my $u = ord(uc(substr($p->[1], 0, 1))); vec($i_total[$symb], $l, 1) = 1; vec($i_total[$symb], $u, 1) = 1; vec($prod->[2], $l, 1) = 1; vec($prod->[2], $u, 1) = 1; next PROD; } if ($p->[0] eq 'r') { my @i = $p->[3]->can_start(); for my $s (@i) { vec($i_total[$symb], $s, 1) = 1; vec($prod->[2], $s, 1) = 1; } next if $p->[3]->can_empty(); next PROD; } if ($p->[0] eq 's' && exists $predefs->{$p->[1]}) { my $pf = $predefs->{$p->[1]}; my $st = $pf->[4]; my $em = $pf->[5]; $i_total[$symb] |= $st; $prod->[2] |= $st; next if $em; next PROD; } next if ($p->[0] eq 's' && vec($empty, $p->[1], 1)); next PROD; } } } # now propagate %i_... using %starts my $continue = 1; while ($continue) { $continue = 0; for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) { my $plist = $grammar->{productions}[$symb]; next unless $plist; for my $prod (@$plist) { for (my $other = 0; $other < 8 * length($prod->[3]); $other++) { next unless vec($prod->[3], $other, 1); my $init = $i_total[$other]; my $np = $prod->[2] | $init; $continue = 1 if $np ne substr($prod->[2], 0, length($np)); $prod->[2] |= $init; $i_total[$symb] |= $init; } } } } my $mask = ''; for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) { if (exists $predefs->{$symb}) { vec($mask, $symb, 1) = 1; next; } my $plist = $grammar->{productions}[$symb]; next unless $plist; for my $prod (@$plist) { $prod->[3] = substr($prod->[3], 0, 1) & $mask; } } $grammar->{converted} = 1; } sub _left_equal { my ($l1, $l2) = @_; return 0 if @$l1 != @$l2; for (my $c = 0; $c < @$l1; $c++) { my ($t1, $d1, $c1, $a1) = @{$l1->[$c]}; my ($t2, $d2, $c2, $a2) = @{$l2->[$c]}; return 0 if $t1 ne $t2; return 0 if $c1 != $c2; if ($t1 eq 's') { return 0 if $d1 != $d2; } elsif ($t1 eq 'c') { return 0 if $d1 ne $d2; } elsif ($t1 eq 'r') { return 0 if ! $a1->is_equal($a2); } else { return 0; } } return 1; } sub _right_equal { my ($r1, $r2) = @_; return 0 if @$r1 != @$r2; for (my $c = 0; $c < @$r1; $c++) { my ($t1, $v1) = @{$r1->[$c]}; my ($t2, $v2) = @{$r2->[$c]}; return 0 if $t1 ne $t2; if ($t1 eq 'b') { return 0 if $v1 ne $v2; } elsif ($t1 eq 's' || $t1 eq 'c' || $t1 eq 'n' || $t1 eq 'r') { return 0 if $v1 != $v2; } elsif ($t1 ne '*') { return 0; } } return 1; } sub _find_rule { my ($grammar, $symb, $left, $right) = @_; return () if $symb >= @{$grammar->{productions}}; my $prods = $grammar->{productions}[$symb]; return () unless $prods; my @found = (); SYMB: for (my $pp = 0; $pp < @$prods; $pp++) { my ($l, $r, $i, $s, $e, $c) = @{$prods->[$pp]}; # see if this production is same as $left and (if provided) $right next SYMB if ! _left_equal($l, $left); next SYMB if $right && ! _right_equal($r, $right); push @found, $c; } return @found; } sub add { @_ == 4 or croak "Usage: GRANMAR->add(SYMBOL, LEFT, RIGHT)"; my ($grammar, $symb, $left, $right) = @_; $left = _convert_left($left); $right = _convert_right($right, $left, $grammar); # do we already have this production? PROD: for my $prod (@{$grammar->{productions}[$symb]}) { my ($l, $r, $i, $s, $e, $c) = @$prod; next PROD if ! _left_equal($left, $l); next PROD if ! _right_equal($right, $r); # we have it, no need to add anything or make any changes return -$c; } my $prodnum = ++$grammar->{rule_count}; push @{$grammar->{productions}[$symb]}, [$left, $right, '', '', 0, $prodnum]; delete $grammar->{converted}; $prodnum; } sub find_rule { @_ == 3 || @_ == 4 or croak "Usage: GRANMAR->find_rule(SYMBOL, LEFT [, RIGHT])"; my ($grammar, $symb, $left, $right) = @_; $left = _convert_left($left); $right = _convert_right($right, $left, $grammar) if $right; my @rules = _find_rule($grammar, $symb, $left, $right); wantarray ? @rules : $rules[0]; } 1;