996
|
1 package Language::INTERCAL::Parser;
|
|
2
|
|
3 # Parser/code generator/etc
|
|
4
|
|
5 # This file is part of CLC-INTERCAL
|
|
6
|
|
7 # Copyright (c) 2006-2008 Claudio Calvelli, all rights reserved.
|
|
8
|
|
9 # CLC-INTERCAL is copyrighted software. However, permission to use, modify,
|
|
10 # and distribute it is granted provided that the conditions set out in the
|
|
11 # licence agreement are met. See files README and COPYING in the distribution.
|
|
12
|
|
13 use strict;
|
|
14 use vars qw($VERSION $PERVERSION);
|
|
15 ($VERSION) = ($PERVERSION = "CLC-INTERCAL/Base INTERCAL/Parser.pm 1.-94.-2") =~ /\s(\S+)$/;
|
|
16
|
|
17 use Carp;
|
|
18 use Language::INTERCAL::Exporter '1.-94.-2';
|
|
19 use Language::INTERCAL::Splats '1.-94.-2', qw(:SP);
|
|
20 use Language::INTERCAL::ByteCode '1.-94.-2', qw(:BC);
|
|
21 use Language::INTERCAL::SymbolTable '1.-94.-2';
|
|
22 use Language::INTERCAL::Reggrim '1.-94.-2';
|
|
23
|
|
24 # for some reason this sort of things works faster than regexes here
|
|
25 my $digits = '';
|
|
26 vec($digits, ord($_), 1) = 1 for (0..9);
|
|
27 my $alphalist ='abcdefghijklmnopqrstuvwxyz_ABCDEFGHIJKLMNOPQRSTUVWXYZ';
|
|
28 my $alphabet = $digits;
|
|
29 for (my $i = 0; $i < length $alphalist; $i++) {
|
|
30 vec($alphabet, ord(substr($alphalist, $i, 1)), 1) = 1;
|
|
31 }
|
|
32 my $anything = '';
|
|
33 vec($anything, $_, 1) = 1 for (0..255);
|
|
34 my $spaces = '';
|
|
35 vec($spaces, ord($_), 1) = 1 for (" ", "\t", "\012", "\015");
|
|
36 my $nonspaces = $anything;
|
|
37 vec($spaces, ord($_), 1) = 0 for (" ", "\t", "\012");
|
|
38
|
|
39 my @parser_predefined = (
|
|
40 # NAME PARSE BAD GENCODE STARTS EMPTY COMPLETE
|
|
41 ["CONSTANT", \&_parse_constant, 0, \&_code_constant, $digits, 0, \&_complete_constant],
|
|
42 ["SYMBOL", \&_parse_symbol, 0, \&_code_symbol, $alphabet, 0, \&_complete_list],
|
|
43 ["JUNK", \&_parse_junk, 1, \&_code_junk, $anything, 1, \&_complete_none],
|
|
44 ["SPACE", \&_parse_space, 0, sub { () }, $spaces, 1, \&_complete_list],
|
|
45 ["BLACKSPACE", \&_parse_blackspace, 0, sub { () }, $nonspaces, 1, \&_complete_list],
|
|
46 ["ANYTHING", \&_parse_anything, 0, sub { () }, $anything, 0, \&_complete_none],
|
|
47 );
|
|
48
|
|
49 sub _parse_constant {
|
|
50 my ($src, $pos, $grammar) = @_;
|
|
51 pos($src) = $$pos;
|
|
52 return () unless $src =~ /\G0*(\d{1,5})/go;
|
|
53 $$pos = pos($src);
|
|
54 my $con = $1 + 0;
|
|
55 return $con if $con < 65536;
|
|
56 $$pos--;
|
|
57 return int($con / 10);
|
|
58 }
|
|
59
|
|
60 sub _code_constant {
|
|
61 my ($number) = @_;
|
|
62 my @code = BC($number);
|
|
63 (pack('C*', @code), scalar(@code));
|
|
64 }
|
|
65
|
|
66 sub _complete_constant {
|
|
67 my ($src, $pos, $grammar, $pf) = @_;
|
|
68 my $con = substr($src, $pos);
|
|
69 return [0..9] if $con eq '' || $con < 6553;
|
|
70 return [0..5] if $con == 6553;
|
|
71 return [];
|
|
72 }
|
|
73
|
|
74 sub _complete_list {
|
|
75 my ($src, $pos, $grammar, $pf) = @_;
|
|
76 my $vec = $pf->[4];
|
|
77 my @cpl = ();
|
|
78 for (my $sym = 0; $sym < 8 * length $vec; $sym++) {
|
|
79 push @cpl, chr($sym) if vec($vec, $sym, 1);
|
|
80 }
|
|
81 return \@cpl;
|
|
82 }
|
|
83
|
|
84 sub _complete_none {
|
|
85 my ($src, $pos, $grammar, $pf) = @_;
|
|
86 return [];
|
|
87 }
|
|
88
|
|
89 sub _parse_symbol {
|
|
90 my ($src, $pos, $grammar) = @_;
|
|
91 pos($src) = $$pos;
|
|
92 return () unless $src =~ /\G(\w+)/go;
|
|
93 $$pos = pos($src);
|
|
94 return $1;
|
|
95 }
|
|
96
|
|
97 sub _code_symbol {
|
|
98 my ($string) = @_;
|
|
99 (pack('C*', BC_STR, BC(length $string)) . $string, 1);
|
|
100 }
|
|
101
|
|
102 sub _parse_junk {
|
|
103 my ($src, $pos, $grammar) = @_;
|
|
104 my $junk = $grammar->{junk_symbol};
|
|
105 return () unless $junk && $junk <= @{$grammar->{productions}};
|
|
106 my $cspace = $grammar->{cspace};
|
|
107 if (! exists $grammar->{junk_cache}{$$pos}) {
|
|
108 # XXX this could be made more efficient, for now we'll leave it at this
|
|
109 my $end = undef;
|
|
110 $grammar->{junk_symbol} = 0;
|
|
111 for (my $p = $$pos + 1; $p < length($src); $p++) {
|
|
112 $cspace->(\$p);
|
|
113 my $t = _compile($grammar, $junk, $src, $p, $cspace, 0, 1);
|
|
114 next unless @$t;
|
|
115 $end = $p;
|
|
116 last;
|
|
117 }
|
|
118 $grammar->{junk_symbol} = $junk;
|
|
119 $end = length($src) if ! defined $end;
|
|
120 for (my $p = $$pos; $p < $end; $p++) {
|
|
121 $grammar->{junk_cache}{$p} = $end;
|
|
122 }
|
|
123 }
|
|
124 my $ej = $grammar->{junk_cache}{$$pos};
|
|
125 my $res = substr($src, $$pos, $ej - $$pos);
|
|
126 $$pos = $ej;
|
|
127 $res;
|
|
128 }
|
|
129
|
|
130 sub _code_junk {
|
|
131 my ($string) = @_;
|
|
132 $string =~ s/^\s+//o;
|
|
133 $string =~ s/\s+$//o;
|
|
134 (pack('C*', BC_STR, BC(length $string)) . $string, 1);
|
|
135 }
|
|
136
|
|
137 sub _parse_space {
|
|
138 my ($src, $pos, $grammar) = @_;
|
|
139 pos($src) = $$pos;
|
|
140 return () unless $src =~ /\G([ \t\012\015]+)/go;
|
|
141 $$pos = pos($src);
|
|
142 $1;
|
|
143 }
|
|
144
|
|
145 sub _parse_blackspace {
|
|
146 my ($src, $pos, $grammar) = @_;
|
|
147 pos($src) = $$pos;
|
|
148 return () unless $src =~ /\G([^ \t\012]+)/go;
|
|
149 $$pos = pos($src);
|
|
150 $1;
|
|
151 }
|
|
152
|
|
153 sub _parse_anything {
|
|
154 my ($src, $pos, $grammar) = @_;
|
|
155 my $p = $$pos;
|
|
156 return () if $p >= length $src;
|
|
157 $$pos = $p + 1;
|
|
158 substr($src, $p, 1);
|
|
159 }
|
|
160
|
|
161 # precompile provides optimised access to _parse_space etc to be used when
|
|
162 # using the compiler's SPACE symbol - this saves quite a lot of compile time
|
|
163 sub _precompile {
|
|
164 my ($grammar, $source, $space) = @_;
|
|
165 my $predefs = $grammar->{predefined};
|
|
166 if (exists $predefs->{$space}) {
|
|
167 if ($predefs->{$space}[1] == \&_parse_space) {
|
|
168 return (
|
|
169 sub {
|
|
170 my ($pos) = @_;
|
|
171 pos($source) = $$pos;
|
|
172 return unless $source =~ /\G[ \t\012\015]+/go;
|
|
173 $$pos = pos($source);
|
|
174 },
|
|
175 [' ', "\t", "\012", "\015"],
|
|
176 );
|
|
177 }
|
|
178 if ($predefs->{$space}[1] == \&_parse_blackspace) {
|
|
179 return (
|
|
180 sub {
|
|
181 my ($pos) = @_;
|
|
182 pos($source) = $$pos;
|
|
183 return unless $source =~ /\G[^ \t\012]+/go;
|
|
184 $$pos = pos($source);
|
|
185 },
|
|
186 [grep { ! /^[ \t\012]/ } map { chr } (0..255)],
|
|
187 );
|
|
188 }
|
|
189 my $sub = $predefs->{$space}[1];
|
|
190 my $start = $predefs->{$space}[4];
|
|
191 return (
|
|
192 sub {
|
|
193 my ($pos) = @_;
|
|
194 $sub->($source, $pos, $grammar);
|
|
195 },
|
|
196 [map { chr } grep { vec($start, $_, 1) } (0..255)],
|
|
197 );
|
|
198 }
|
|
199 return (sub {}, []) unless $space && $space <= @{$grammar->{productions}};
|
|
200 my $start = '';
|
|
201 for my $prod (@{$grammar->{productions}[$space]}) {
|
|
202 $start |= $prod->[2];
|
|
203 }
|
|
204 return (
|
|
205 sub {
|
|
206 my ($pos) = @_;
|
|
207 my $p = _compile($grammar, $space, $source, $$pos, sub {}, 0, 0);
|
|
208 # now find the longest matching result
|
|
209 for my $e (@$p) {
|
|
210 my ($start, $end) = @$e;
|
|
211 $$pos = $end if $$pos < $end;
|
|
212 }
|
|
213 },
|
|
214 [map { chr } grep { vec($start, $_, 1) } (0..255)],
|
|
215 );
|
|
216 }
|
|
217
|
|
218 sub new {
|
|
219 @_ == 2 or croak "Usage: new Language::INTERCAL::Parser(SYMBOLTABLE)";
|
|
220 my ($class, $symboltable) = @_;
|
|
221 my %predefined = ();
|
|
222 for my $pf (@parser_predefined) {
|
|
223 my $sn = $symboltable->find($pf->[0]);
|
|
224 $predefined{$sn} = $pf;
|
|
225 }
|
|
226 bless {
|
|
227 productions => [],
|
|
228 converted => 1,
|
|
229 rule_count => 0,
|
|
230 symboltable => $symboltable,
|
|
231 predefined => \%predefined,
|
|
232 optimise => {},
|
|
233 }, $class;
|
|
234 }
|
|
235
|
|
236 sub forall {
|
|
237 @_ == 2 or croak "Usage: GRAMMAR->forall(CODE)";
|
|
238 my ($grammar, $code) = @_;
|
|
239 my $p = $grammar->{productions};
|
|
240 my $s = $grammar->{symboltable};
|
|
241 my @prod = ();
|
|
242 for (my $sym = 0; $sym < @$p; $sym++) {
|
|
243 next unless $p->[$sym];
|
|
244 for my $prod (@{$p->[$sym]}) {
|
|
245 my ($left, $right, $_1, $_2, $_3, $prodnum) = @$prod;
|
|
246 push @prod, [$prodnum, $sym, $left, $right];
|
|
247 }
|
|
248 }
|
|
249 for my $prod (sort { $a->[0] <=> $b->[0] } @prod) {
|
|
250 my ($prodnum, $sym, $left, $right) = @$prod;
|
|
251 $right = _unconvert_right($right, $left);
|
|
252 $code->($grammar, $s, $prodnum, $sym, $left, $right);
|
|
253 }
|
|
254 }
|
|
255
|
|
256 sub read {
|
|
257 @_ == 2 or croak "Usage: GRAMMAR->read(FILEHANDLE)";
|
|
258 my ($grammar, $fh) = @_;
|
|
259
|
|
260 # make it faster to run next time
|
|
261 _convert_grammar($grammar);
|
|
262
|
|
263 my $plist = $grammar->{productions};
|
|
264 $fh->read_binary(pack('vv', $grammar->{rule_count}, scalar @$plist));
|
|
265 for (my $symbol = 1; $symbol < @$plist; $symbol++) {
|
|
266 my $gp = $plist->[$symbol] || [];
|
|
267 $fh->read_binary(pack('v', scalar @$gp));
|
|
268 for my $prod (@$gp) {
|
|
269 my ($left, $right, $initial, $startmap, $empty, $prodnum) = @$prod;
|
|
270 _read_left($fh, $left);
|
|
271 _read_right($fh, $right);
|
|
272 $fh->read_binary(pack('vvCv', length($initial), length($startmap),
|
|
273 $empty ? 1 : 0, $prodnum));
|
|
274 $fh->read_binary($initial);
|
|
275 $fh->read_binary($startmap);
|
|
276 }
|
|
277 }
|
|
278
|
|
279 $grammar;
|
|
280 }
|
|
281
|
|
282 sub _read_left {
|
|
283 my ($fh, $left) = @_;
|
|
284 $fh->read_binary(pack('v', scalar(@$left)));
|
|
285 for my $element (@$left) {
|
|
286 my ($type, $e, $c, @e) = @$element;
|
|
287 $fh->read_binary($type);
|
|
288 if ($type eq 's') {
|
|
289 $fh->read_binary(pack('v', $e));
|
|
290 } elsif ($type eq 'r') {
|
|
291 my $code = @e ? $e->[0]->save() : '';
|
|
292 $fh->read_binary(pack('vva*a*', length($e), length($code),
|
|
293 $e, $code));
|
|
294 } else {
|
|
295 $fh->read_binary(pack('v/a*', $e));
|
|
296 }
|
|
297 $fh->read_binary(pack('v', $c));
|
|
298 }
|
|
299 }
|
|
300
|
|
301 sub _read_right {
|
|
302 my ($fh, $right) = @_;
|
|
303 $fh->read_binary(pack('v', scalar(@$right)));
|
|
304 for my $element (@$right) {
|
|
305 my $type = $element->[0];
|
|
306 my $e = $element->[1];
|
|
307 $fh->read_binary($type);
|
|
308 if ($type eq 'b') {
|
|
309 $fh->read_binary(pack('v/a*', $e));
|
|
310 } elsif ($type ne '*') {
|
|
311 $fh->read_binary(pack('v', $e));
|
|
312 }
|
|
313 }
|
|
314 }
|
|
315
|
|
316 sub write {
|
|
317 @_ == 3 or croak "Usage: write " .
|
|
318 "Language::INTERCAL::Parser(FILEHANDLE, SYMBOLS)";
|
|
319 my ($class, $fh, $symboltable) = @_;
|
|
320
|
|
321 my ($rule_count, $nsymbols) = unpack('vv', $fh->write_binary(4));
|
|
322 my @productions = ();
|
|
323 for (my $symbol = 1; $symbol < $nsymbols; $symbol++) {
|
|
324 my $nprod = unpack('v', $fh->write_binary(2));
|
|
325 my @prod = ();
|
|
326 while (@prod < $nprod) {
|
|
327 my $left = _write_left($fh);
|
|
328 my $right = _write_right($fh);
|
|
329 my ($ninit, $mapsize, $empty, $prodnum) =
|
|
330 unpack('vvCv', $fh->write_binary(7));
|
|
331 my $initial = $fh->write_binary($ninit);
|
|
332 my $startmap = $fh->write_binary($mapsize);
|
|
333 push @prod,
|
|
334 [$left, $right, $initial, $startmap, $empty, $prodnum];
|
|
335 }
|
|
336 $productions[$symbol] = \@prod;
|
|
337 }
|
|
338
|
|
339 my %predefined = ();
|
|
340 for my $pf (@parser_predefined) {
|
|
341 my $sn = $symboltable->find($pf->[0]);
|
|
342 $predefined{$sn} = $pf;
|
|
343 }
|
|
344 my $grammar = bless {
|
|
345 symboltable => $symboltable,
|
|
346 productions => \@productions,
|
|
347 converted => 1,
|
|
348 rule_count => $rule_count,
|
|
349 predefined => \%predefined,
|
|
350 }, $class;
|
|
351
|
|
352 $grammar;
|
|
353 }
|
|
354
|
|
355 sub _write_left {
|
|
356 my ($fh) = @_;
|
|
357 my $elems = unpack('v', $fh->write_binary(2));
|
|
358 my @left = ();
|
|
359 while ($elems-- > 0) {
|
|
360 my $type = $fh->write_binary(1);
|
|
361 my $data = '';
|
|
362 my @comp = ();
|
|
363 if ($type eq 's') {
|
|
364 $data = unpack('v', $fh->write_binary(2));
|
|
365 } elsif ($type eq 'r') {
|
|
366 my ($rsize, $csize) = unpack('vv', $fh->write_binary(4));
|
|
367 $data = $fh->write_binary($rsize);
|
|
368 my $comp;
|
|
369 if ($csize > 0) {
|
|
370 # restore reggrim from saved state
|
|
371 $comp = $fh->write_binary($csize);
|
|
372 $comp = Language::INTERCAL::Reggrim->restore($comp);
|
|
373 } else {
|
|
374 # try to compile reggrim
|
|
375 $comp = Language::INTERCAL::Reggrim->compile($data);
|
|
376 }
|
|
377 @comp = ($comp);
|
|
378 } else {
|
|
379 my $size = unpack('v', $fh->write_binary(2));
|
|
380 $data = uc($fh->write_binary($size));
|
|
381 @comp = (length($data));
|
|
382 }
|
|
383 my $count = unpack('v', $fh->write_binary(2));
|
|
384 push @left, [$type, $data, $count, @comp];
|
|
385 }
|
|
386 \@left;
|
|
387 }
|
|
388
|
|
389 sub _write_right {
|
|
390 my ($fh) = @_;
|
|
391 my $elems = unpack('v', $fh->write_binary(2));
|
|
392 my @right = ();
|
|
393 while ($elems-- > 0) {
|
|
394 my $type = $fh->write_binary(1);
|
|
395 my $data = '';
|
|
396 if ($type eq 'b') {
|
|
397 my $len = unpack('v', $fh->write_binary(2));
|
|
398 $data = $fh->write_binary($len);
|
|
399 } elsif ($type ne '*') {
|
|
400 $data = unpack('v', $fh->write_binary(2));
|
|
401 }
|
|
402 push @right, [$type, $data];
|
|
403 }
|
|
404 \@right;
|
|
405 }
|
|
406
|
|
407 sub _convert_left {
|
|
408 my ($left) = @_;
|
|
409 [map {
|
|
410 $_->[0] eq 'r'
|
|
411 ? [$_->[0], $_->[1], $_->[2],
|
|
412 Language::INTERCAL::Reggrim->compile($_->[1])]
|
|
413 : $_->[0] eq 'c' && $_->[1] eq ''
|
|
414 ? ()
|
|
415 : $_->[0] eq 'c'
|
|
416 ? [$_->[0], uc($_->[1]), $_->[2], length($_->[1])]
|
|
417 : [$_->[0], $_->[1], $_->[2]];
|
|
418 } @$left];
|
|
419 }
|
|
420
|
|
421 sub _find_right {
|
|
422 my ($grammar, $left, $type, $number, $data) = @_;
|
|
423 for (my $lp = 0; $lp < @$left; $lp++) {
|
|
424 my $l = $left->[$lp];
|
|
425 next if $l->[0] ne $type;
|
|
426 next if $l->[0] eq 's' && $l->[1] != $data;
|
|
427 next if $l->[0] ne 's' && $l->[1] ne $data;
|
|
428 $number--;
|
|
429 return $lp if $number < 1;
|
|
430 }
|
|
431 if ($type eq 's') {
|
|
432 faint(SP_CREATION, "Symbol " .
|
|
433 $grammar->{symboltable}->symbol($data) .
|
|
434 " not found");
|
|
435 } elsif ($type eq 'c') {
|
|
436 my @data = unpack('C*', $data);
|
|
437 faint(SP_CREATION, "Block (@data) not found");
|
|
438 } elsif ($type eq 'r') {
|
|
439 faint(SP_CREATION, "Reggrim ($data) not found");
|
|
440 }
|
|
441 faint(SP_CREATION, "Internal error");
|
|
442 }
|
|
443
|
|
444 sub _convert_right {
|
|
445 my ($right, $left, $grammar) = @_;
|
|
446 [map {
|
|
447 $_->[0] eq 'c' && $_->[2] eq ''
|
|
448 ? ()
|
|
449 : $_->[0] =~ /^[scr]$/o
|
|
450 ? [$_->[0], _find_right($grammar, $left, $_->[0], $_->[1], $_->[2])]
|
|
451 : $_->[0] eq 'n'
|
|
452 ? [$_->[0], _find_right($grammar, $left, 's', $_->[1], $_->[2])]
|
|
453 : $_->[0] eq '*'
|
|
454 ? [$_->[0]]
|
|
455 : [$_->[0], $_->[1]];
|
|
456 } @$right];
|
|
457 }
|
|
458
|
|
459 sub _unconvert_right {
|
|
460 my ($right, $left) = @_;
|
|
461 [map {
|
|
462 $_->[0] =~ /^[scr]$/o ?
|
|
463 [$_->[0], _count_left($_->[0], $_->[1], $left)] :
|
|
464 $_->[0] eq 'n' ?
|
|
465 [$_->[0], _count_left('s', $_->[1], $left)] :
|
|
466 [$_->[0], $_->[1]];
|
|
467 } @$right];
|
|
468 }
|
|
469
|
|
470 sub _count_left {
|
|
471 my ($type, $number, $left) = @_;
|
|
472 my $count = 0;
|
|
473 my $data = $left->[$number][1];
|
|
474 for (my $lp = 0; $lp <= $number; $lp++) {
|
|
475 my $l = $left->[$lp];
|
|
476 next if $l->[0] ne $type;
|
|
477 next if $l->[0] eq 's' && $l->[1] != $data;
|
|
478 next if $l->[0] ne 's' && $l->[1] ne $data;
|
|
479 $count++;
|
|
480 }
|
|
481 ($count, $data);
|
|
482 }
|
|
483
|
|
484 # compile_top works similarly to compile but is useful for some types of
|
|
485 # top-level symbols, as it avoids compile's potentially exponential
|
|
486 # behaviour. Returns a list of generated code fragments (no completion
|
|
487 # is attempted)
|
|
488
|
|
489 sub compile_top {
|
|
490 @_ == 7 || @_ == 8
|
|
491 or croak "Usage: GRAMMAR->compile_top(TOP, INT, SOURCE, " .
|
|
492 "POS, SPACE, JUNK [, VERBOSE])";
|
|
493 my ($grammar, $tsymb, $isymb, $source, $ipos, $space, $junk, $verb) = @_;
|
|
494 _convert_grammar($grammar);
|
|
495 my @result = ();
|
|
496 $grammar->{junk_cache} = {};
|
|
497 $grammar->{junk_symbol} = $junk;
|
|
498 my ($cspace, $sspace) = _precompile($grammar, $source, $space);
|
|
499 $grammar->{cspace} = $cspace;
|
|
500 $grammar->{sspace} = $sspace;
|
|
501 my $started_pos = $ipos;
|
|
502 my $started_time = time;
|
|
503 my $reported_time = $started_time;
|
|
504 while ($ipos < length $source) {
|
|
505 if ($verb) {
|
|
506 my $now = time;
|
|
507 if ($now - $reported_time > 60) {
|
|
508 my $s = substr($source, $ipos);
|
|
509 $s =~ s/\s+/ /go;
|
|
510 $s =~ s/^ //o;
|
|
511 $s = substr($s, 0, 28);
|
|
512 my $fraction = ($ipos - $started_pos)
|
|
513 / (length($source) - $started_pos);
|
|
514 my $eta = '';
|
|
515 if ($fraction > .1) {
|
|
516 $eta = $started_time + ($now - $started_time) / $fraction;
|
|
517 my @eta = localtime($eta);
|
|
518 $eta = sprintf " ETA: %02d:%02d:%02d", @eta[2, 1, 0];
|
|
519 }
|
|
520 my @now = localtime($now);
|
|
521 my $d = length(length $source);
|
|
522 printf STDERR
|
|
523 "\n %02d:%02d:%02d: done to %${d}d %-30s %5.1f%%%s",
|
|
524 @now[2,1,0], $ipos, "[$s]", 100 * $fraction, $eta;
|
|
525 $reported_time = $now;
|
|
526 }
|
|
527 }
|
|
528 my $pos = $ipos++;
|
|
529 defined _parse_junk($source, \$ipos, $grammar)
|
|
530 or $ipos = length $source;
|
|
531 my $pp = _compile($grammar, $tsymb, $source, $pos, $cspace, 0, 0);
|
|
532 if ($isymb) {
|
|
533 for my $p (@$pp) {
|
|
534 my ($ps, $pe, $pj, $pc, $pn, @pu) = @$p;
|
|
535 if ($pe < length($source)) {
|
|
536 my $ip =
|
|
537 _compile($grammar, $isymb, $source, $pe, $cspace, 0, 0);
|
|
538 if (@$ip) {
|
|
539 for my $i (@$ip) {
|
|
540 my ($is, $ie, $ij, $ic, $in, @iu) = @$i;
|
|
541 push @result, $pc . $ic;
|
|
542 }
|
|
543 } else {
|
|
544 push @result, $pc;
|
|
545 }
|
|
546 } else {
|
|
547 push @result, $pc;
|
|
548 }
|
|
549 }
|
|
550 } else {
|
|
551 push @result, map { $_->[3] } @$pp;
|
|
552 }
|
|
553 }
|
|
554 @result;
|
|
555 }
|
|
556
|
|
557 # compile attempts to generate code; returns two ARRAYREFs, a list of
|
|
558 # generated code with elements [start, end, uses_junk?, code, count, @prods],
|
|
559 # and a list of possible completion if source is a prefix of a parseable string
|
|
560 # both lists will be empty if nothing can be parsed
|
|
561
|
|
562 sub compile {
|
|
563 @_ == 6
|
|
564 or croak "Usage: GRAMMAR->compile(SYMBOL, SOURCE, POS, SPACE, JUNK)";
|
|
565 my ($grammar, $isymb, $source, $start, $space, $junk) = @_;
|
|
566 _convert_grammar($grammar);
|
|
567 $grammar->{junk_cache} = {};
|
|
568 $grammar->{junk_symbol} = $junk;
|
|
569 my ($cspace, $sspace) = _precompile($grammar, $source, $space);
|
|
570 $grammar->{cspace} = $cspace;
|
|
571 $grammar->{sspace} = $sspace;
|
|
572 my %complete = ();
|
|
573 my $r = _compile($grammar, $isymb, $source, $start, $cspace, \%complete, 0);
|
|
574 ($r, [keys %complete]);
|
|
575 }
|
|
576
|
|
577 sub _compile {
|
|
578 my ($grammar, $isymb, $source, $start, $cspace, $complete, $any) = @_;
|
|
579 return [] if $isymb < 1;
|
|
580 my $productions = $grammar->{productions};
|
|
581 my $pos = $start;
|
|
582 $cspace->(\$pos);
|
|
583 return [] if $pos >= length $source && ! $complete;
|
|
584 my $predefs = $grammar->{predefined};
|
|
585 my @result = ();
|
|
586 # special case out of the main loop (they should not normally do this)
|
|
587 if (exists $predefs->{$isymb}) {
|
|
588 my $pf = $predefs->{$isymb};
|
|
589 if ($pos >= length $source && ! $pf->[5]) {
|
|
590 if ($complete) {
|
|
591 my $cpl = $pf->[6]->($source, $pos, $grammar, $pf);
|
|
592 $complete->{$_} = 1 for @$cpl;
|
|
593 }
|
|
594 } else {
|
|
595 my $end = $pos;
|
|
596 my @ok = $pf->[1]->($source, \$end, $grammar);
|
|
597 my $bad = $pf->[2] ? $end - $start : 0;
|
|
598 if ($end >= length $source && $complete) {
|
|
599 my $cpl = $pf->[6]->($source, $end, $grammar, $pf);
|
|
600 $complete->{$_} = 1 for @$cpl, @{$grammar->{sspace}};
|
|
601 }
|
|
602 if (@ok) {
|
|
603 my ($code, $count) = $pf->[3]->(@ok);
|
|
604 $cspace->(\$end);
|
|
605 push @result, [$start, $end, $bad, $code, $count];
|
|
606 }
|
|
607 }
|
|
608 return \@result;
|
|
609 }
|
|
610 # normal case, parsing on user-defined symbols
|
|
611 return [] if $isymb >= @$productions;
|
|
612 my $iprod = $productions->[$isymb];
|
|
613 return [] if ! $iprod || ! @$iprod;
|
|
614 # prepare a list of states which look promising
|
|
615 my @state = ();
|
|
616 my $nxc = $pos < length($source) ? ord(substr($source, $pos, 1)) : undef;
|
|
617 for (my $prodnum = @$iprod - 1; $prodnum >= 0; $prodnum--) {
|
|
618 next unless $iprod->[$prodnum][4]
|
|
619 || ! defined $nxc
|
|
620 || vec($iprod->[$prodnum][2], $nxc, 1);
|
|
621 push @state, [$isymb, $prodnum, 0, $pos, 0, []];
|
|
622 }
|
|
623 my $cpspace = 0;
|
|
624 STATE: while (@state) {
|
|
625 my ($symb, $prodnum, $prodelem, $place, $bad, $stack, @tree) =
|
|
626 @{pop @state};
|
|
627 my $sprod = $productions->[$symb][$prodnum];
|
|
628 my $left = $sprod->[0];
|
|
629 ELEM: while ($prodelem < @$left) {
|
|
630 my ($type, $data, $count, $aux) = @{$left->[$prodelem]};
|
|
631 $prodelem++;
|
|
632 if ($type eq 's') {
|
|
633 if (exists $predefs->{$data}) {
|
|
634 # predefined symbol - we can just run its code here
|
|
635 my $pf = $predefs->{$data};
|
|
636 if ($place < length $source || $pf->[5]) {
|
|
637 my $end = $place;
|
|
638 my @ok = $pf->[1]->($source, \$end, $grammar);
|
|
639 if ($end >= length $source && $complete) {
|
|
640 my $cpl =
|
|
641 $pf->[6]->($source, $place, $grammar, $pf);
|
|
642 $complete->{$_} = 1 for @$cpl;
|
|
643 $cpspace = 1;
|
|
644 }
|
|
645 next STATE unless @ok;
|
|
646 $bad += $end - $place if $pf->[2];
|
|
647 push @tree, [$pf->[3]->(@ok)];
|
|
648 $cspace->(\$end);
|
|
649 $place = $end;
|
|
650 next ELEM;
|
|
651 } elsif ($complete) {
|
|
652 my $cpl = $pf->[6]->($source, $place, $grammar, $pf);
|
|
653 $complete->{$_} = 1 for @$cpl;
|
|
654 }
|
|
655 next STATE;
|
|
656 } else {
|
|
657 # user defined symbol - we need to push the current
|
|
658 # state onto the stack and add new states to @state
|
|
659 next STATE if $data >= @$productions;
|
|
660 my $prod = $productions->[$data];
|
|
661 next STATE if ! $prod || ! @$prod;
|
|
662 pos($source) = $place;
|
|
663 push @$stack, [$symb, $prodnum, $prodelem, $bad, @tree];
|
|
664 my $nxc = length $source < $place
|
|
665 ? ord(substr($source, $place, 1))
|
|
666 : undef;
|
|
667 for (my $pn = @$prod - 1; $pn >= 0; $pn--) {
|
|
668 next unless $prod->[$pn][4]
|
|
669 || ! defined $nxc
|
|
670 || vec($prod->[$pn][2], $nxc, 1);
|
|
671 push @state, [$data, $pn, 0, $place, 0, [@$stack]];
|
|
672 }
|
|
673 next STATE;
|
|
674 }
|
|
675 } elsif ($type eq 'c') {
|
|
676 # constant - just check if the required string is there
|
|
677 my $look = uc(substr($source, $place, $aux));
|
|
678 if ($data eq $look) {
|
|
679 # yep, it's there - add the place to the current tree
|
|
680 # in case the code generator wants it
|
|
681 push @tree, [$place, $aux];
|
|
682 $place += $aux;
|
|
683 $cspace->(\$place);
|
|
684 $cpspace = 1 if $place >= length $source;
|
|
685 next ELEM;
|
|
686 } elsif ($complete &&
|
|
687 length($look) < length($data) &&
|
|
688 substr($data, 0, length($look)) eq $look)
|
|
689 {
|
|
690 # a substring of the wanted string is there - so this could
|
|
691 # be an incomplete source
|
|
692 $complete->{substr($data, length($look))} = 1;
|
|
693 }
|
|
694 next STATE;
|
|
695 } elsif ($type eq 'r') {
|
|
696 # regular grimace - run it and see what we get
|
|
697 my ($type, $length, $follows) = $aux->match($source, $place);
|
|
698 if ($type) {
|
|
699 # a match - add its place and length to the parse tree
|
|
700 push @tree, [$place, $length];
|
|
701 $place += $length;
|
|
702 $cspace->(\$place);
|
|
703 if ($complete) {
|
|
704 $complete->{$_} = 1 for @$follows;
|
|
705 $cpspace = 1 if $place >= length $source;
|
|
706 }
|
|
707 next ELEM;
|
|
708 }
|
|
709 if ($complete) {
|
|
710 $complete->{$_} = 1 for @$follows;
|
|
711 $cpspace = 1 if $place >= length $source;
|
|
712 }
|
|
713 next STATE;
|
|
714 }
|
|
715 }
|
|
716 # end of production - generate code
|
|
717 my %uses = ( $sprod->[5] => 1 );
|
|
718 for my $t (@tree) {
|
|
719 my ($x, $c, @u) = @$t;
|
|
720 $uses{$_} = 1 for @u;
|
|
721 }
|
|
722 my @uses = sort { $a <=> $b } keys %uses;
|
|
723 my ($code, $count) = _gencode($source, $left, $sprod->[1], \@tree,
|
|
724 $start, $place - $start, $bad, \@uses);
|
|
725 if (@$stack) {
|
|
726 # we were called by another nonterminal
|
|
727 my ($nsym, $nprd, $nelm, $nbad, @ntree) = @{pop @$stack};
|
|
728 $nbad += $bad;
|
|
729 push @state, [$nsym, $nprd, $nelm, $place, $nbad,
|
|
730 $stack, @ntree, [$code, $count, @uses]];
|
|
731 } else {
|
|
732 # top level symbol, in other words a (possibly partial) result
|
|
733 push @result, [$start, $place, $bad, $code, $count, @uses];
|
|
734 return \@result if $any;
|
|
735 }
|
|
736 }
|
|
737 if ($complete && $cpspace) {
|
|
738 $complete->{$_} = 1 for @{$grammar->{sspace}};
|
|
739 }
|
|
740 return \@result;
|
|
741 }
|
|
742
|
|
743 sub _gencode {
|
|
744 my ($source, $left, $right, $tree, $start, $length, $junk, $uses) = @_;
|
|
745 my $code = '';
|
|
746 for my $rp (@$right) {
|
|
747 my ($type, $value) = @$rp;
|
|
748 if ($type eq 'b') {
|
|
749 $code .= $value;
|
|
750 next;
|
|
751 }
|
|
752 if ($type eq 's') {
|
|
753 $code .= $tree->[$value][0];
|
|
754 next;
|
|
755 }
|
|
756 if ($type eq 'n') {
|
|
757 $code .= pack('C*', BC($tree->[$value][1]));
|
|
758 next;
|
|
759 }
|
|
760 if ($type eq 'c' || $type eq 'r') {
|
|
761 my ($place, $len) = @{$tree->[$value]};
|
|
762 my $const = substr($source, $place, $len);
|
|
763 my @v = unpack('C*', $const);
|
|
764 $code .= pack('C*', BC_MUL, map { BC($_) } scalar(@v), @v);
|
|
765 next;
|
|
766 }
|
|
767 if ($type eq '*') {
|
|
768 $code .= pack('C*', map { BC($_) } $start, $length, $junk,
|
|
769 scalar @$uses, @$uses);
|
|
770 next;
|
|
771 }
|
|
772 }
|
|
773 my $count = 0;
|
|
774 for (my $lp = 0; $lp < @$left; $lp++) {
|
|
775 my $lc = $left->[$lp][2];
|
|
776 $count += $lc == 0xffff ? $tree->[$lp][1] : $lc;
|
|
777 }
|
|
778 ($code, $count);
|
|
779 }
|
|
780
|
|
781 sub _find_starts {
|
|
782 my ($grammar) = @_;
|
|
783
|
|
784 # first find if any symbol can expand (directly) to empty strings
|
|
785 # or (directly) to another symbol; since information about each of
|
|
786 # these changes our idea of the other, we keep repeating until we
|
|
787 # cannot make any more changes
|
|
788 my $empty = '';
|
|
789
|
|
790 my $found = 0;
|
|
791 for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) {
|
|
792 my $plist = $grammar->{productions}[$symb];
|
|
793 next unless $plist;
|
|
794 for my $prod (@$plist) {
|
|
795 $prod->[2] = '';
|
|
796 $prod->[3] = '';
|
|
797 $prod->[4] = 0;
|
|
798 }
|
|
799 $found++;
|
|
800 }
|
|
801 return $empty unless $found;
|
|
802
|
|
803 my $continue = 1;
|
|
804 while ($continue) {
|
|
805 $continue = 0;
|
|
806 SYMB: for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) {
|
|
807 my $plist = $grammar->{productions}[$symb];
|
|
808 next unless $plist;
|
|
809 PROD: for my $prod (@$plist) {
|
|
810 # look at the first element of the production, if there's one
|
|
811 ELEM: for my $p (@{$prod->[0]}) {
|
|
812 if ($p->[0] eq 's') {
|
|
813 # that means we can access this particular symbol
|
|
814 $continue = 1 if ! vec($prod->[3], $p->[1], 1);
|
|
815 vec($prod->[3], $p->[1], 1) = 1;
|
|
816 # if we know the symbol can parse the empty string,
|
|
817 # we also need to check the next element
|
|
818 next if vec($empty, $p->[1], 1);
|
|
819 }
|
|
820 next if ($p->[0] eq 'c' && $p->[1] eq '') ||
|
|
821 ($p->[0] eq 'r' && $p->[3]->can_empty());
|
|
822 next PROD;
|
|
823 }
|
|
824 # if we get here, all productions are empty, so...
|
|
825 $continue = 1 if ! vec($empty, $symb, 1);
|
|
826 vec($empty, $symb, 1) = 1;
|
|
827 $prod->[4] = 1;
|
|
828 }
|
|
829 }
|
|
830 }
|
|
831
|
|
832 for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) {
|
|
833 my $plist = $grammar->{productions}[$symb];
|
|
834 next unless $plist;
|
|
835 for my $prod (@$plist) {
|
|
836 faint(SP_CIRCULAR, $grammar->{symboltable}->symbol($symb))
|
|
837 if vec($prod->[3], $symb, 1);
|
|
838 }
|
|
839 }
|
|
840
|
|
841 return $empty;
|
|
842 }
|
|
843
|
|
844 sub _convert_grammar {
|
|
845 my ($grammar) = @_;
|
|
846
|
|
847 return if exists $grammar->{converted};
|
|
848 my $empty = _find_starts($grammar);
|
|
849 my @i_total = map { '' } @{$grammar->{productions}};
|
|
850 my $predefs = $grammar->{predefined};
|
|
851
|
|
852 # first find the "direct" initials;
|
|
853 SYMB: for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) {
|
|
854 my $plist = $grammar->{productions}[$symb];
|
|
855 next unless $plist;
|
|
856 PROD: for my $prod (@$plist) {
|
|
857 next unless $prod;
|
|
858 $prod->[2] = '';
|
|
859 # look at the first element of the production, if there's one
|
|
860 ELEM: for my $p (@{$prod->[0]}) {
|
|
861 if ($p->[0] eq 'c') {
|
|
862 next if $p->[1] eq '';
|
|
863 my $l = ord(lc(substr($p->[1], 0, 1)));
|
|
864 my $u = ord(uc(substr($p->[1], 0, 1)));
|
|
865 vec($i_total[$symb], $l, 1) = 1;
|
|
866 vec($i_total[$symb], $u, 1) = 1;
|
|
867 vec($prod->[2], $l, 1) = 1;
|
|
868 vec($prod->[2], $u, 1) = 1;
|
|
869 next PROD;
|
|
870 }
|
|
871 if ($p->[0] eq 'r') {
|
|
872 my @i = $p->[3]->can_start();
|
|
873 for my $s (@i) {
|
|
874 vec($i_total[$symb], $s, 1) = 1;
|
|
875 vec($prod->[2], $s, 1) = 1;
|
|
876 }
|
|
877 next if $p->[3]->can_empty();
|
|
878 next PROD;
|
|
879 }
|
|
880 if ($p->[0] eq 's' && exists $predefs->{$p->[1]}) {
|
|
881 my $pf = $predefs->{$p->[1]};
|
|
882 my $st = $pf->[4];
|
|
883 my $em = $pf->[5];
|
|
884 $i_total[$symb] |= $st;
|
|
885 $prod->[2] |= $st;
|
|
886 next if $em;
|
|
887 next PROD;
|
|
888 }
|
|
889 next if ($p->[0] eq 's' && vec($empty, $p->[1], 1));
|
|
890 next PROD;
|
|
891 }
|
|
892 }
|
|
893 }
|
|
894
|
|
895 # now propagate %i_... using %starts
|
|
896 my $continue = 1;
|
|
897 while ($continue) {
|
|
898 $continue = 0;
|
|
899 for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) {
|
|
900 my $plist = $grammar->{productions}[$symb];
|
|
901 next unless $plist;
|
|
902 for my $prod (@$plist) {
|
|
903 for (my $other = 0; $other < 8 * length($prod->[3]); $other++) {
|
|
904 next unless vec($prod->[3], $other, 1);
|
|
905 my $init = $i_total[$other];
|
|
906 my $np = $prod->[2] | $init;
|
|
907 $continue = 1
|
|
908 if $np ne substr($prod->[2], 0, length($np));
|
|
909 $prod->[2] |= $init;
|
|
910 $i_total[$symb] |= $init;
|
|
911 }
|
|
912 }
|
|
913 }
|
|
914 }
|
|
915
|
|
916 my $mask = '';
|
|
917 for (my $symb = 1; $symb < @{$grammar->{productions}}; $symb++) {
|
|
918 if (exists $predefs->{$symb}) {
|
|
919 vec($mask, $symb, 1) = 1;
|
|
920 next;
|
|
921 }
|
|
922 my $plist = $grammar->{productions}[$symb];
|
|
923 next unless $plist;
|
|
924 for my $prod (@$plist) {
|
|
925 $prod->[3] = substr($prod->[3], 0, 1) & $mask;
|
|
926 }
|
|
927 }
|
|
928
|
|
929 $grammar->{converted} = 1;
|
|
930 }
|
|
931
|
|
932 sub _left_equal {
|
|
933 my ($l1, $l2) = @_;
|
|
934 return 0 if @$l1 != @$l2;
|
|
935 for (my $c = 0; $c < @$l1; $c++) {
|
|
936 my ($t1, $d1, $c1, $a1) = @{$l1->[$c]};
|
|
937 my ($t2, $d2, $c2, $a2) = @{$l2->[$c]};
|
|
938 return 0 if $t1 ne $t2;
|
|
939 return 0 if $c1 != $c2;
|
|
940 if ($t1 eq 's') {
|
|
941 return 0 if $d1 != $d2;
|
|
942 } elsif ($t1 eq 'c') {
|
|
943 return 0 if $d1 ne $d2;
|
|
944 } elsif ($t1 eq 'r') {
|
|
945 return 0 if ! $a1->is_equal($a2);
|
|
946 } else {
|
|
947 return 0;
|
|
948 }
|
|
949 }
|
|
950 return 1;
|
|
951 }
|
|
952
|
|
953 sub _right_equal {
|
|
954 my ($r1, $r2) = @_;
|
|
955 return 0 if @$r1 != @$r2;
|
|
956 for (my $c = 0; $c < @$r1; $c++) {
|
|
957 my ($t1, $v1) = @{$r1->[$c]};
|
|
958 my ($t2, $v2) = @{$r2->[$c]};
|
|
959 return 0 if $t1 ne $t2;
|
|
960 if ($t1 eq 'b') {
|
|
961 return 0 if $v1 ne $v2;
|
|
962 } elsif ($t1 eq 's' || $t1 eq 'c' || $t1 eq 'n' || $t1 eq 'r') {
|
|
963 return 0 if $v1 != $v2;
|
|
964 } elsif ($t1 ne '*') {
|
|
965 return 0;
|
|
966 }
|
|
967 }
|
|
968 return 1;
|
|
969 }
|
|
970
|
|
971 sub _find_rule {
|
|
972 my ($grammar, $symb, $left, $right) = @_;
|
|
973 return () if $symb >= @{$grammar->{productions}};
|
|
974 my $prods = $grammar->{productions}[$symb];
|
|
975 return () unless $prods;
|
|
976 my @found = ();
|
|
977 SYMB: for (my $pp = 0; $pp < @$prods; $pp++) {
|
|
978 my ($l, $r, $i, $s, $e, $c) = @{$prods->[$pp]};
|
|
979 # see if this production is same as $left and (if provided) $right
|
|
980 next SYMB if ! _left_equal($l, $left);
|
|
981 next SYMB if $right && ! _right_equal($r, $right);
|
|
982 push @found, $c;
|
|
983 }
|
|
984 return @found;
|
|
985 }
|
|
986
|
|
987 sub add {
|
|
988 @_ == 4 or croak "Usage: GRANMAR->add(SYMBOL, LEFT, RIGHT)";
|
|
989 my ($grammar, $symb, $left, $right) = @_;
|
|
990 $left = _convert_left($left);
|
|
991 $right = _convert_right($right, $left, $grammar);
|
|
992 # do we already have this production?
|
|
993 PROD: for my $prod (@{$grammar->{productions}[$symb]}) {
|
|
994 my ($l, $r, $i, $s, $e, $c) = @$prod;
|
|
995 next PROD if ! _left_equal($left, $l);
|
|
996 next PROD if ! _right_equal($right, $r);
|
|
997 # we have it, no need to add anything or make any changes
|
|
998 return -$c;
|
|
999 }
|
|
1000 my $prodnum = ++$grammar->{rule_count};
|
|
1001 push @{$grammar->{productions}[$symb]},
|
|
1002 [$left, $right, '', '', 0, $prodnum];
|
|
1003 delete $grammar->{converted};
|
|
1004 $prodnum;
|
|
1005 }
|
|
1006
|
|
1007 sub find_rule {
|
|
1008 @_ == 3 || @_ == 4
|
|
1009 or croak "Usage: GRANMAR->find_rule(SYMBOL, LEFT [, RIGHT])";
|
|
1010 my ($grammar, $symb, $left, $right) = @_;
|
|
1011 $left = _convert_left($left);
|
|
1012 $right = _convert_right($right, $left, $grammar) if $right;
|
|
1013 my @rules = _find_rule($grammar, $symb, $left, $right);
|
|
1014 wantarray ? @rules : $rules[0];
|
|
1015 }
|
|
1016
|
|
1017 1;
|