annotate share/construct_grams.pl @ 12257:1924fe176291 draft

<fizzie> ` sed -e \'s|wisdom|bin|\' < ../bin/cwlprits > ../bin/cblprits; chmod a+x ../bin/cblprits
author HackEso <hackeso@esolangs.org>
date Sat, 07 Dec 2019 23:36:53 +0000
parents e037173e0012
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e037173e0012 Initial import.
HackBot
parents:
diff changeset
1 #!/usr/bin/perl
e037173e0012 Initial import.
HackBot
parents:
diff changeset
2 use v5.10;
e037173e0012 Initial import.
HackBot
parents:
diff changeset
3 use strict; use warnings;
e037173e0012 Initial import.
HackBot
parents:
diff changeset
4 use utf8;
e037173e0012 Initial import.
HackBot
parents:
diff changeset
5 use Storable 'store';
e037173e0012 Initial import.
HackBot
parents:
diff changeset
6 use Getopt::Long;
e037173e0012 Initial import.
HackBot
parents:
diff changeset
7
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Order of the n-gram model: every gram is $n characters long, split by parse
# into an ($n - 1)-character context and the single character that follows it.
my $n = 4;

# $grams{$context}{$next_char} = number of times $next_char followed $context.
my %grams;

# $fs{$length} = number of accepted words having that length.
my %fs;

# $seen{$word} = how many times the (lower-cased) word has been read so far;
# used by parse to count each distinct word only once.
my %seen;

# Encoding layer applied to input files; main lets -e/--encoding override it.
my $encoding = 'UTF-8';

# A word is accepted only when it consists entirely of alphabetic characters,
# dash punctuation, connector punctuation and apostrophes.  main lets
# -f/--filter replace this pattern.
my $filter = qr/^[\p{Alphabetic}\p{Dash_Punctuation}\p{Connector_Punctuation}']+$/;
e037173e0012 Initial import.
HackBot
parents:
diff changeset
16
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Read a word list from an open filehandle and add it to the model tables.
#
# Argument:
#   $f - a readable filehandle (the "_" prototype makes it default to $_).
#
# The word on each line is its first field, where fields are separated by any
# single whitespace character other than U+00A0 (octal \240).  The word is
# lower-cased, then skipped if it has been seen before or does not match
# $filter.  For every accepted word:
#   * $fs{length of word} is incremented;
#   * the word is padded with $n - 1 leading spaces and one trailing space,
#     and for every $n-character window of the padded word,
#     $grams{first $n - 1 characters}{last character} is incremented.
#
# Returns nothing meaningful; the results are the updates to %grams, %fs and
# %seen.
sub parse(_) {
    my ($f) = @_;

    while (my $line = <$f>) {
        # The slice is undef when split returns no fields (a blank line), so
        # default to '' rather than warning inside lc.  The split pattern
        # also consumes the line terminator, so no chomp is required.
        my $word = lc((split /[^\S\240]/, $line)[0] // '');

        next if $seen{$word}++ || $word !~ $filter;

        $fs{length $word}++;

        my $padded = ' ' x ($n - 1) . "$word ";

        # Walk every full-width window using a lexical index and a lexical
        # gram, so the caller's $_ (which may alias an @ARGV element) is left
        # untouched and a gram that happens to be false cannot end the loop.
        for my $start (0 .. length($padded) - $n) {
            my $gram    = substr $padded, $start, $n;
            my $context = substr $gram, 0, $n - 1;
            my $next    = substr $gram, $n - 1, 1;
            $grams{$context}{$next}++;
        }
    }

    return;
}
e037173e0012 Initial import.
HackBot
parents:
diff changeset
31
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Command-line entry point: build the n-gram dataset from the word-list files
# named in @ARGV and serialise it with Storable.
#
# Options:
#   -m, --module=NAME    dataset name (default "Default").  Each character at
#                        the start of the name or after "-", "_" or " " is
#                        upper-cased and the separator removed; the result is
#                        used as the output file path.
#   -e, --encoding=ENC   encoding of the input files (default from $encoding).
#   -f, --filter=REGEX   pattern a word must match to be accepted; it is
#                        compiled case-insensitively.
#
# Exits with status 1 when option parsing fails, and dies when an input file
# cannot be opened or the dataset cannot be written.
sub main {
    my $target_mod = "Default";

    GetOptions(
        'm|module=s'   => \$target_mod,
        'e|encoding=s' => \$encoding,
        'f|filter=s'   => \$filter,
    ) or exit 1;

    $target_mod =~ s/(^|[-_ ])(.)/\u$2/g;
    $filter = qr/$filter/i;

    print "Constructing $target_mod dataset from $encoding\n";
    print "Filter: $filter\n";

    for my $path (@ARGV) {
        print "Reading $path...\n";

        # Fail loudly: an unreadable input would otherwise contribute nothing
        # and the incomplete dataset would still be stored.
        open my $f, "<:encoding($encoding)", $path
            or die "Cannot open $path: $!\n";
        parse($f);
        close $f;
    }

    # store returns undef on I/O errors, so check it instead of assuming the
    # dataset reached the disk.
    store([\%grams, \%fs], $target_mod)
        or die "Cannot store dataset to $target_mod: $!\n";

    return;
}
e037173e0012 Initial import.
HackBot
parents:
diff changeset
51
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Run main only when this file is executed directly; when it is loaded from
# other code (require/do), caller is true and nothing runs at load time.
main() unless caller;