comparison share/construct_grams.pl @ 0:e037173e0012

Initial import.
author HackBot
date Thu, 16 Feb 2012 19:42:32 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e037173e0012
1 #!/usr/bin/perl
2 use v5.10;
3 use strict; use warnings;
4 use utf8;
5 use Storable 'store';
6 use Getopt::Long;
7
# --- Settings that main() may override from the command line -------------

# Encoding layer applied when opening each input file (-e / --encoding).
my $encoding = "UTF-8";

# A word is counted only if it matches this pattern (-f / --filter).
# Default: one or more alphabetic characters, dash punctuation, connector
# punctuation or apostrophes, and nothing else.
my $filter = qr/^[\p{Alphabetic}\p{Dash_Punctuation}\p{Connector_Punctuation}']+$/;

# --- Fixed parameters ----------------------------------------------------

# Width, in characters, of the sliding window parse() moves over each word:
# the first $n-1 characters are the key, the last one is the counted successor.
my $n = 4;

# --- Accumulators filled by parse() --------------------------------------

# $grams{prefix}{next_char} = number of times next_char followed prefix.
my %grams;

# $fs{word_length} = number of accepted words of that length.
my %fs;

# Lowercased first fields already encountered, so duplicates are skipped.
my %seen;
16
# Read a word list from the filehandle $f and accumulate n-gram statistics
# into the file-level %grams and %fs hashes.
#
# Only the first whitespace-delimited field of each line is used; U+00A0
# (NO-BREAK SPACE) is deliberately not treated as a delimiter. The field is
# lowercased and skipped when it was already seen (tracked in %seen) or does
# not match $filter.
#
# For every accepted word:
#   * $fs{length of word} is incremented;
#   * the word is padded with $n-1 leading spaces and one trailing space, and
#     for each $n-character window over the padded string,
#     $grams{first $n-1 chars}{last char} is incremented.
#
# The (_) prototype means that a call without arguments operates on $_.
# Returns nothing.
sub parse(_) {
    my ($f) = @_;
    while (my $line = <$f>) {
        # A blank line splits into an empty list, so the slice is undef;
        # fall back to '' to keep lc() from warning. The empty word is then
        # rejected by the checks below exactly as before.
        my $word = lc((split /[^\S\240]/, $line)[0] // '');
        chomp $word;
        next if $seen{$word}++ || $word !~ $filter;
        $fs{length $word}++;

        my $padded = ' ' x ($n - 1) . "$word ";

        # Use lexical variables for the window. Assigning to the global $_
        # here would overwrite whatever the caller currently has aliased to
        # it (for example the element of a surrounding foreach loop).
        for my $i (0 .. length($padded) - $n) {
            my $gram = substr($padded, $i, $n);
            $grams{substr($gram, 0, $n - 1)}{substr($gram, $n - 1, 1)}++;
        }
    }
    return;
}
31
# Command-line entry point: build an n-gram dataset from word-list files.
#
# Options:
#   -m, --module=NAME    dataset name, also used as the output file name
#                        (default "Default"). The first character and every
#                        character following '-', '_' or ' ' is uppercased,
#                        and those separators are removed.
#   -e, --encoding=ENC   encoding of the input files (overrides $encoding).
#   -f, --filter=REGEX   pattern a word must match to be counted (overrides
#                        $filter); always compiled case-insensitively.
#
# Every remaining argument is opened with the chosen encoding and fed to
# parse(); the accumulated [\%grams, \%fs] pair is then written with
# Storable's store().
#
# Exits with status 1 on invalid options. Dies if an input file cannot be
# opened or the dataset cannot be written, rather than silently producing an
# incomplete or missing dataset.
sub main {
    my $target_mod = "Default";
    GetOptions(
        'm|module=s'   => \$target_mod,
        'e|encoding=s' => \$encoding,
        'f|filter=s'   => \$filter,
    ) or exit 1;

    $target_mod =~ s/(^|[-_ ])(.)/\u$2/g;
    $filter = qr/$filter/i;

    print "Constructing $target_mod dataset from $encoding\n";
    print "Filter: $filter\n";

    # A named lexical loop variable keeps @ARGV from being aliased to the
    # global $_ while parse() runs.
    for my $path (@ARGV) {
        print "Reading $path...\n";
        open my $f, "<:encoding($encoding)", $path
            or die "Cannot open $path: $!\n";
        parse($f);
        close $f;
    }

    # store() returns undef on I/O errors; do not report success silently.
    store [\%grams, \%fs], $target_mod
        or die "Cannot write $target_mod: $!\n";
    return;
}

# Run main() only when this file is executed directly, not when it is loaded
# from another file (in which case caller() is true).
main() unless caller;