996
|
1 package Language::INTERCAL::Reggrim;
|
|
2
|
|
3 # Regular grimages - INTERCAL's answer to regular expressions
|
|
4
|
|
5 # This file is part of CLC-INTERCAL
|
|
6
|
|
7 # Copyright (c) 2007-2008 Claudio Calvelli, all rights reserved.
|
|
8
|
|
9 # CLC-INTERCAL is copyrighted software. However, permission to use, modify,
|
|
10 # and distribute it is granted provided that the conditions set out in the
|
|
11 # licence agreement are met. See files README and COPYING in the distribution.
|
|
12
|
|
13 use strict;
|
|
14 use vars qw($VERSION $PERVERSION);
|
|
15 ($VERSION) = ($PERVERSION = "CLC-INTERCAL/Base INTERCAL/Reggrim.pm 1.-94.-2") =~ /\s(\S+)$/;
|
|
16
|
|
17 use Carp;
|
|
18 use Language::INTERCAL::Exporter '1.-94.-2';
|
|
19 use Language::INTERCAL::Splats '1.-94.-2', qw(:SP);
|
|
20
|
|
21 my %backslash = (
|
|
22 '<' => ['s', 'func', qr/^\w/],
|
|
23 '=' => ['c', 'func', qr/^\w/],
|
|
24 '>' => ['e', 'func', qr/^\w/],
|
|
25 'b' => ['r', 'list', "\b"],
|
|
26 'd' => ['r', 'func', qr/^\d/],
|
|
27 'D' => ['r', 'func', qr/^\D/],
|
|
28 'e' => ['r', 'list', "\e"],
|
|
29 'f' => ['r', 'list', "\f"],
|
|
30 'n' => ['r', 'list', "\012"],
|
|
31 'r' => ['r', 'list', "\015"],
|
|
32 's' => ['r', 'func', qr/^\s/],
|
|
33 'S' => ['r', 'func', qr/^\S/],
|
|
34 't' => ['r', 'list', "\t"],
|
|
35 'w' => ['r', 'func', qr/^\w/],
|
|
36 'W' => ['r', 'func', qr/^\W/],
|
|
37 );
|
|
38
|
|
39 sub compile {
|
|
40 @_ == 2 or croak "Usage: Language::INTERCAL::Reggrim->compile(TEXT)";
|
|
41 my ($class, $text) = @_;
|
|
42 my $code = _compile($text, 0);
|
|
43 my $cnv = _convert($code);
|
|
44 bless $cnv, $class;
|
|
45 }
|
|
46
|
|
47 sub restore {
|
|
48 @_ == 2 or croak "Usage: Language::INTERCAL::Reggrim->restore(DATA)";
|
|
49 my ($class, $data) = @_;
|
|
50 faint(SP_TODO, "reglar grimaces"); # XXX
|
|
51 }
|
|
52
|
|
53 sub save {
|
|
54 @_ == 1 or croak "Usage: REGGRIM->save";
|
|
55 my ($rg) = @_;
|
|
56 faint(SP_TODO, "reglar grimaces"); # XXX
|
|
57 }
|
|
58
|
|
59 sub match {
|
|
60 @_ == 3 or croak "Usage: REGGRIM->match(STRING, PLACE)";
|
|
61 my ($rg, $string, $place) = @_;
|
|
62 faint(SP_TODO, "reglar grimaces"); # XXX
|
|
63 }
|
|
64
|
|
65 sub can_start {
|
|
66 @_ == 1 or croak "Usage: REGGRIM->can_start";
|
|
67 my ($rg) = @_;
|
|
68 faint(SP_TODO, "reglar grimaces"); # XXX
|
|
69 }
|
|
70
|
|
71 sub can_empty {
|
|
72 @_ == 1 or croak "Usage: REGGRIM->can_empty";
|
|
73 my ($rg) = @_;
|
|
74 faint(SP_TODO, "reglar grimaces"); # XXX
|
|
75 }
|
|
76
|
|
77 # private methods follow
|
|
78
|
|
79 use constant _RANGE => 256;
|
|
80
|
|
81 sub _compile {
|
|
82 my ($text, $in_group) = @_;
|
|
83 my $m = undef;
|
|
84 my $alt = undef;
|
|
85 while ($text ne '') {
|
|
86 my $c = substr($text, 0, 1, '');
|
|
87 if ($c eq '.') {
|
|
88 my $r = _full_range();
|
|
89 my $u = _make_range($r);
|
|
90 $u = _test_repeat($u, \$text);
|
|
91 $m = _sequence($m, $u);
|
|
92 next;
|
|
93 }
|
|
94 if ($c eq '\\') {
|
|
95 my ($m, $r, $x) = _backslash(\$text);
|
|
96 my $u;
|
|
97 if ($m eq 'r') {
|
|
98 $u = _make_range($r);
|
|
99 } elsif ($m eq 's') {
|
|
100 $u = _start_of($r);
|
|
101 } elsif ($m eq 'e') {
|
|
102 $u = _end_of($r);
|
|
103 } elsif ($m eq 'c') {
|
|
104 $u = _change($r);
|
|
105 } elsif ($m eq 'i') {
|
|
106 $u = _inside($r);
|
|
107 } elsif ($m eq 'o') {
|
|
108 $u = _inside(~ $r);
|
|
109 } else {
|
|
110 faint(SP_INTERNAL, "Invalid range type $m");
|
|
111 }
|
|
112 $u = _test_repeat($u, \$text);
|
|
113 $m = _sequence($m, $u);
|
|
114 next;
|
|
115 }
|
|
116 if ($c eq '[') {
|
|
117 my $r = empty_range();
|
|
118 my $neg = 0;
|
|
119 if ($$text =~ s/^\^//) {
|
|
120 $neg = 1;
|
|
121 }
|
|
122 if ($$text =~ s/^-//) {
|
|
123 vec($r, ord('-'), 1) = 1;
|
|
124 }
|
|
125 if ($$text =~ s/^\]//) {
|
|
126 vec($r, ord(']'), 1) = 1;
|
|
127 }
|
|
128 while (1) {
|
|
129 $$text eq '' and faint(SP_REGGRIM, "Invalid range");
|
|
130 $c = substr($$text, 0, 1, '');
|
|
131 if ($c eq ']') {
|
|
132 last;
|
|
133 }
|
|
134 if ($c eq '\\') {
|
|
135 my ($m, $b, $x) = _backslash(\$text);
|
|
136 if ($m ne 'r') {
|
|
137 faint(SP_INTERNAL, "Invalid escape in range");
|
|
138 }
|
|
139 if (! defined $x) {
|
|
140 $r |= $b;
|
|
141 next;
|
|
142 }
|
|
143 }
|
|
144 if ($$text =~ s/^-//) {
|
|
145 $$text eq '' and faint(SP_REGGRIM, "Invalid range");
|
|
146 my $d = substr($$text, 0, 1, '');
|
|
147 if ($d eq '\\') {
|
|
148 my ($m, $b, $x) = _backslash(\$text);
|
|
149 if ($m ne 'r' || ! defined $x) {
|
|
150 faint(SP_INTERNAL, "Invalid escape in range");
|
|
151 }
|
|
152 $d = $x;
|
|
153 }
|
|
154 $c = ord($c);
|
|
155 $d = ord($d);
|
|
156 $c <= $d or faint(SP_REGGRIM, "Time goes backwards");
|
|
157 while ($c <= $d) {
|
|
158 vec($r, ord(lc($c)), 1) = 1;
|
|
159 vec($r, ord(uc($c)), 1) = 1;
|
|
160 $c++;
|
|
161 }
|
|
162 } else {
|
|
163 vec($r, ord(lc($c)), 1) = 1;
|
|
164 vec($r, ord(uc($c)), 1) = 1;
|
|
165 }
|
|
166 }
|
|
167 $r = ~ $r if $neg;
|
|
168 my $u = _make_range($r);
|
|
169 $u = _test_repeat($u, \$text);
|
|
170 $m = _sequence($m, $u);
|
|
171 next;
|
|
172 }
|
|
173 if ($c eq '(') {
|
|
174 my $u = _compile($text, 1);
|
|
175 $u = _test_repeat($u, \$text);
|
|
176 $m = _sequence($m, $u);
|
|
177 next;
|
|
178 }
|
|
179 if ($c eq ')') {
|
|
180 return if $in_group;
|
|
181 faint(SP_REGGRIM, "Invalid group");
|
|
182 }
|
|
183 if ($c eq '|') {
|
|
184 $alt = _alternative($alt, $m);
|
|
185 $m = undef;
|
|
186 next;
|
|
187 }
|
|
188 if ($c eq '?' || $c eq '*' || $c eq '+' || $c eq '{') {
|
|
189 faint(SP_REGGRIM, "Misplaced $c");
|
|
190 }
|
|
191 my $r = _empty_range();
|
|
192 vec($r, ord(lc($c)), 1) = 1;
|
|
193 vec($r, ord(uc($c)), 1) = 1;
|
|
194 my $u = _make_range($r);
|
|
195 $u = _test_repeat($u, \$text);
|
|
196 $m = _sequence($m, $u);
|
|
197 next;
|
|
198 }
|
|
199 }
|
|
200
|
|
201 sub _empty_range {
|
|
202 my $r = '';
|
|
203 vec($r, _RANGE - 1, 1) = 0;
|
|
204 $r;
|
|
205 }
|
|
206
|
|
207 sub _full_range {
|
|
208 ~ _empty_range();
|
|
209 }
|
|
210
|
|
211 sub _make_range {
|
|
212 my ($r) = @_;
|
|
213 return {
|
|
214 initial => [[0, _full_range()]],
|
|
215 final => [[1, _full_range()]],
|
|
216 trans => [[1, $r], []],
|
|
217 };
|
|
218 }
|
|
219
|
|
220 sub _start_of {
|
|
221 my ($r) = @_;
|
|
222 return {
|
|
223 initial => [[0, ~ $r]],
|
|
224 final => [[0, $r]],
|
|
225 trans => [[]],
|
|
226 };
|
|
227 }
|
|
228
|
|
229 sub _end_of {
|
|
230 my ($r) = @_;
|
|
231 return {
|
|
232 initial => [[0, $r]],
|
|
233 final => [[0, ~ $r]],
|
|
234 trans => [[]],
|
|
235 };
|
|
236 }
|
|
237
|
|
238 sub _change {
|
|
239 my ($r) = @_;
|
|
240 return {
|
|
241 initial => [[0, ~ $r], [1, $r]],
|
|
242 final => [[0, $r], [1, ~ $r]],
|
|
243 trans => [[], []],
|
|
244 };
|
|
245 }
|
|
246
|
|
247 sub _inside {
|
|
248 my ($r) = @_;
|
|
249 return {
|
|
250 initial => [[0, $r]],
|
|
251 final => [[0, $r]],
|
|
252 trans => [[]],
|
|
253 };
|
|
254 }
|
|
255
|
|
256 sub _test_repeat {
|
|
257 my ($u, $text) = @_;
|
|
258 my $c = substr($$text, 0, 1, '');
|
|
259 $c eq '?' and return _repeat($u, 0, 1);
|
|
260 $c eq '*' and return _repeat($u, 0, undef);
|
|
261 $c eq '+' and return _repeat($u, 1, undef);
|
|
262 if ($c ne '}') { $$text = $c . $$text; return $u }
|
|
263 my ($min, $max) = (0, undef);
|
|
264 $$text =~ s/^(\d+)// and $min = $1;
|
|
265 $$text =~ s/^,(\d+)// and $max = $1;
|
|
266 $$text =~ s/^\}// or faint(SP_REGGRIM, "Missing }");
|
|
267 _repeat($u, $min, $max);
|
|
268 }
|
|
269
|
|
270 sub _sequence {
|
|
271 my ($m1, $m2) = @_;
|
|
272 defined $m1 or return $m2;
|
|
273 defined $m2 or return $m1;
|
|
274 _sequence_or_star($m1, $m2);
|
|
275 }
|
|
276
|
|
277 sub _star {
|
|
278 my ($m) = @_;
|
|
279 defined $m or return $m;
|
|
280 _sequence_or_star($m, undef);
|
|
281 }
|
|
282
|
|
283 sub _repeat {
|
|
284 my ($m, $min, $max) = @_;
|
|
285 defined $max && $min > $max and faint(SP_REGGRIM, "Time goes backwards");
|
|
286 my $m1;
|
|
287 if ($min > 0) {
|
|
288 $m1 = _repeat_exactly($m, $min);
|
|
289 $max -= $min if defined $max;
|
|
290 } else {
|
|
291 $m1 = $m;
|
|
292 }
|
|
293 if (! defined $max) {
|
|
294 return _sequence($m1, _star($m));
|
|
295 }
|
|
296 if ($max == 0) {
|
|
297 return $m1;
|
|
298 }
|
|
299 my $m2 = _alternative($m, undef); # zero or one times
|
|
300 while ($max > 0) {
|
|
301 $m1 = _sequence($m1, $m2);
|
|
302 $max--;
|
|
303 }
|
|
304 $m1;
|
|
305 }
|
|
306
|
|
307 sub _repeat_exactly {
|
|
308 my ($m, $num) = @_;
|
|
309 $num < 1 and return undef;
|
|
310 defined $m or return $m;
|
|
311 $num == 1 and return $m;
|
|
312 my $num2 = int($num / 2);
|
|
313 my $m1 = _repeat_exactly($m, $num2);
|
|
314 my $m2 = _repeat_exactly($m, $num - $num2);
|
|
315 _sequence($m1, $m2);
|
|
316 }
|
|
317
|
|
318 sub _alternative {
|
|
319 my ($m1, $m2) = @_;
|
|
320 defined $m1 or return $m2;
|
|
321 defined $m2 or return _repeat($m1, 0, 1); # m1|(empty)
|
|
322 my @i = @{$m1->{initial}};
|
|
323 my @f = @{$m1->{final}};
|
|
324 my @t = @{$m1->{trans}};
|
|
325 my $diff = @t;
|
|
326 for my $i (@{$m2->{initial}}) {
|
|
327 my ($to, $range) = @$i;
|
|
328 push @i, [$to + $diff, $range];
|
|
329 }
|
|
330 for my $f (@{$m2->{final}}) {
|
|
331 my ($from, $range) = @$f;
|
|
332 push @f, [$from + $diff, $range];
|
|
333 }
|
|
334 for my $t (@{$m2->{trans}}) {
|
|
335 my @state = ();
|
|
336 for my $s (@$t) {
|
|
337 my ($from, $to, $range) = @$s;
|
|
338 push @state, [$from + $diff, $to + $diff, $range];
|
|
339 }
|
|
340 push @t, \@state;
|
|
341 }
|
|
342 return {
|
|
343 initial => \@i,
|
|
344 final => \@f,
|
|
345 trans => \@t,
|
|
346 };
|
|
347 }
|
|
348
|
|
349 sub _backslash {
|
|
350 my ($text) = @_;
|
|
351 $$text eq '' and faint(SP_REGGRIM, "There is no escape!");
|
|
352 my $c = substr($$text, 0, 1, '');
|
|
353 if (exists $backslash{$c}) {
|
|
354 my ($m, $t, $f, $z) = @{$backslash{$c}};
|
|
355 if ($t eq 'func') {
|
|
356 if (! defined $z) {
|
|
357 $z = '';
|
|
358 for (my $i = 0; $i < _RANGE; $i++) {
|
|
359 my $d = chr($i);
|
|
360 $z .= $d if $d =~ $f;
|
|
361 }
|
|
362 $backslash{$c}[3] = $z;
|
|
363 }
|
|
364 $f = $z;
|
|
365 } elsif ($t ne 'list') {
|
|
366 faint(SP_INTERNAL, "Invalid escape in reggrim tables");
|
|
367 }
|
|
368 my $x = length($f) == 1 ? $f : undef;
|
|
369 my $r = _empty_range();
|
|
370 while ($f ne '') {
|
|
371 my $d = substr($f, 0, 1, '');
|
|
372 vec($r, ord(lc($d)), 1) = 1;
|
|
373 vec($r, ord(uc($d)), 1) = 1;
|
|
374 }
|
|
375 return ($m, $r, $x);
|
|
376 } else {
|
|
377 my $r = _empty_range();
|
|
378 vec($r, ord(lc($c)), 1) = 1;
|
|
379 vec($r, ord(uc($c)), 1) = 1;
|
|
380 return ('r', $r, $c);
|
|
381 }
|
|
382 }
|
|
383
|
|
384 # XXX sub _sequence_or_star
|
|
385 # XXX sub _convert
|
|
386
|
|
387 1;
|