annotate share/construct_grams.pl @ 11340:77399ae45cb1

<wob_jonas> slashlearn peace witch//Peace witches do alchemy: they turn mundane building material to gold. They\'re in the same universe where Bowser turned peaceful citizens of the Mushroom Kingdom to building material.
author HackBot
date Tue, 06 Feb 2018 23:37:00 +0000
parents e037173e0012
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e037173e0012 Initial import.
HackBot
parents:
diff changeset
1 #!/usr/bin/perl
e037173e0012 Initial import.
HackBot
parents:
diff changeset
2 use v5.10;
e037173e0012 Initial import.
HackBot
parents:
diff changeset
3 use strict; use warnings;
e037173e0012 Initial import.
HackBot
parents:
diff changeset
4 use utf8;
e037173e0012 Initial import.
HackBot
parents:
diff changeset
5 use Storable 'store';
e037173e0012 Initial import.
HackBot
parents:
diff changeset
6 use Getopt::Long;
e037173e0012 Initial import.
HackBot
parents:
diff changeset
7
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Gram size: each gram is $n characters, i.e. a context of $n-1 characters
# followed by the single character that comes after it.
my $n = 4;

# Tables filled in by parse() and written out by main().
my (
    %grams,    # context string => { following character => count }
    %fs,       # word length => number of accepted words of that length
    %seen,     # lowercased first field of a line => times encountered
);

# Defaults; main() lets -e/--encoding and -f/--filter override them.
my $encoding = "UTF-8";
my $filter = qr/^[\p{Alphabetic}\p{Dash_Punctuation}\p{Connector_Punctuation}']+$/;
e037173e0012 Initial import.
HackBot
parents:
diff changeset
16
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Read a word list from filehandle $f and fold it into the gram tables.
#
# Each line contributes its first field: the text before the first whitespace
# character other than U+00A0, lowercased. A word is skipped when it has been
# seen before or when it does not match $filter. For each accepted word:
#   * $fs{ length of word } is incremented;
#   * the word is padded with $n-1 leading spaces and one trailing space, and
#     for every window of $n characters the count
#     $grams{ first $n-1 chars }{ last char } is incremented.
#
# Parameters: $f - readable filehandle; the (_) prototype makes a call with
#                  no argument use the caller's $_ instead.
# Returns:    nothing; results accumulate in %grams, %fs and %seen.
sub parse(_) {
    my ($f) = @_;

    while (my $line = <$f>) {
        # A blank line splits to an empty list, so default to '' instead of
        # handing undef to lc(). The split already drops the line ending, so
        # no chomp is needed.
        my $word = lc( (split /[^\S\240]/, $line)[0] // '' );

        # The seen-counter is bumped for every line, including rejected ones.
        next if $seen{$word}++ || $word !~ $filter;

        $fs{ length $word }++;

        my $padded = ' ' x ($n - 1) . "$word ";

        # Walk the full-width windows with a lexical index. The global $_ is
        # left alone because callers may have it aliased to their own data
        # (e.g. an enclosing "for (@ARGV)").
        for my $start (0 .. length($padded) - $n) {
            my $context = substr($padded, $start, $n - 1);
            my $next    = substr($padded, $start + $n - 1, 1);
            $grams{$context}{$next}++;
        }
    }

    return;
}
e037173e0012 Initial import.
HackBot
parents:
diff changeset
31
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Command-line entry point: build the gram tables from the input files and
# write them to disk.
#
# Options (parsed with Getopt::Long; a parse failure exits with status 1):
#   -m, --module=NAME    dataset name, default "Default". It is normalized by
#                        deleting each '-', '_' or ' ' separator and
#                        upper-casing the character that follows it, as well
#                        as the first character ("old-english" -> "OldEnglish").
#   -e, --encoding=NAME  encoding layer used to read the input files.
#   -f, --filter=REGEX   pattern a word must match to be included; compiled
#                        case-insensitively.
#
# Every remaining argument is read as a word-list file and passed to parse().
# The result, [\%grams, \%fs], is written with Storable::store to the path
# given by the normalized module name.
#
# Dies if an input file cannot be opened or the dataset cannot be stored.
# Returns the true value returned by store().
sub main {
    my $target_mod = "Default";
    GetOptions(
        'm|module=s'   => \$target_mod,
        'e|encoding=s' => \$encoding,
        'f|filter=s'   => \$filter,
    ) or exit 1;

    $target_mod =~ s/(^|[-_ ])(.)/\u$2/g;
    $filter = qr/$filter/i;

    print "Constructing $target_mod dataset from $encoding\n";
    print "Filter: $filter\n";

    # Use a named lexical rather than the implicit $_: in a bare
    # "for (@ARGV)" loop $_ aliases the array element, so any callee that
    # assigns to $_ would overwrite the argument list.
    for my $path (@ARGV) {
        print "Reading $path...\n";

        # Fail loudly instead of silently producing a partial dataset.
        open my $fh, "<:encoding($encoding)", $path
            or die "Cannot open $path: $!\n";
        parse($fh);
        close $fh;
    }

    # store() returns undef on I/O errors rather than dying.
    my $stored = store([\%grams, \%fs], "$target_mod")
        or die "Cannot store dataset to $target_mod: $!\n";
    return $stored;
}
e037173e0012 Initial import.
HackBot
parents:
diff changeset
51
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Run main() only when this file is executed directly; when it is loaded from
# other code, caller() is true and nothing runs.
main() unless caller;