0
|
1 #!/usr/bin/perl
|
|
2 use v5.10;
|
|
3 use strict; use warnings;
|
|
4 use utf8;
|
|
5 use Storable 'store';
|
|
6 use Getopt::Long;
|
|
7
|
|
# Size of each n-gram in characters: parse() keys the table on the first
# $n-1 characters and counts the character that follows them.
my $n = 4;

# Accumulators filled by parse():
#   %grams - $grams{prefix}{next_char} => number of occurrences
#   %fs    - $fs{word_length}          => number of accepted words
#   %seen  - $seen{word}               => times the word was encountered,
#                                         so repeats are only counted once
my (%grams, %fs, %seen);

# Defaults that main() lets the user override from the command line.
my $encoding = "UTF-8";
my $filter = qr/^[\p{Alphabetic}\p{Dash_Punctuation}\p{Connector_Punctuation}']+$/;
|
|
16
|
|
# Read a word list from an open filehandle and add its words to the
# file-level accumulators %grams, %fs and %seen.
#
# Argument:
#   $f - a readable filehandle.  The `_` prototype is kept for backward
#        compatibility: called without an argument, $_ is used instead.
#
# For every line, the first field is taken as the word and lower-cased.
# Fields are delimited by any whitespace character except NO-BREAK SPACE
# (\240), so a word may contain a no-break space but never a newline.
# A word is skipped when it has been seen before or does not match $filter.
# For each accepted word:
#   * $fs{length of word} is incremented;
#   * the word is padded with $n-1 leading spaces and one trailing space, and
#     for every $n-character window of the padded word,
#     $grams{first $n-1 characters}{last character} is incremented.
#
# The global $_ is left untouched, so callers iterating with an implicit
# `for (...)` do not get their list elements overwritten.
sub parse(_) {
    my ($f) = @_;

    while (my $line = <$f>) {
        # A line consisting only of whitespace yields no fields at all;
        # fall back to the empty string instead of passing undef to lc.
        my $word = lc((split /[^\S\240]/, $line)[0] // '');

        next if $seen{$word}++ || $word !~ $filter;
        $fs{ length $word }++;

        # The padding makes the first window end on the first letter of the
        # word and the last window end on the trailing space.
        my $padded = ' ' x ($n - 1) . "$word ";

        # Iterate over window start positions explicitly rather than testing
        # the window for truth, which would stop early on a window of "0".
        for my $i (0 .. length($padded) - $n) {
            my $gram = substr $padded, $i, $n;
            $grams{ substr $gram, 0, $n - 1 }{ substr $gram, $n - 1, 1 }++;
        }
    }
}
|
|
31
|
|
# Command-line entry point: build an n-gram dataset from the word-list files
# named in @ARGV and write it out with Storable.
#
# Options:
#   -m, --module NAME    dataset name (default "Default").  The first
#                        character, and every character following a '-',
#                        '_' or space, is upper-cased and the separator is
#                        removed ("foo-bar" becomes "FooBar").  The result
#                        is used as the output file name.
#   -e, --encoding ENC   encoding of the input files (default from $encoding)
#   -f, --filter REGEX   pattern a word must match to be counted
#                        (default from $filter); always applied
#                        case-insensitively
#
# Exits with status 1 when option parsing fails.  An input file that cannot
# be opened is reported on STDERR and skipped.  Dies when the dataset cannot
# be written.
sub main {
    my $target_mod = "Default";

    GetOptions(
        'm|module=s'   => \$target_mod,
        'e|encoding=s' => \$encoding,
        'f|filter=s'   => \$filter,
    ) or exit 1;

    $target_mod =~ s/(^|[-_ ])(.)/\u$2/g;
    $filter = qr/$filter/i;

    print "Constructing $target_mod dataset from $encoding\n";
    print "Filter: $filter\n";

    for my $path (@ARGV) {
        print "Reading $path...\n";

        my $f;
        if (!open $f, "<:encoding($encoding)", $path) {
            # Keep processing the remaining files, but say clearly why this
            # one contributed nothing.
            warn "Cannot open $path: $!\n";
            next;
        }
        parse($f);
        close $f;
    }

    # store() returns undef on I/O failure; do not let that pass silently.
    store [\%grams, \%fs], $target_mod
        or die "Cannot store dataset to $target_mod: $!\n";
}
|
|
51
|
|
# Run main() only when this file is executed directly; when it is loaded
# from other code, caller() is true and nothing runs.
main() if !caller;
|