Mercurial > repo
comparison share/construct_grams.pl @ 0:e037173e0012
Initial import.
author | HackBot |
---|---|
date | Thu, 16 Feb 2012 19:42:32 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e037173e0012 |
---|---|
1 #!/usr/bin/perl | |
2 use v5.10; | |
3 use strict; use warnings; | |
4 use utf8; | |
5 use Storable 'store'; | |
6 use Getopt::Long; | |
7 | |
# Size of each n-gram: parse() records an ($n-1)-character prefix together
# with the single character that follows it.
my $n = 4;

# Accumulators filled by parse():
#   %grams - prefix => { following character => occurrence count }
#   %fs    - word length => number of accepted words of that length
#   %seen  - words already encountered, so each one is processed only once
my (%grams, %fs, %seen);

# Defaults that main() lets the user override from the command line.
my $filter = qr/^[\p{Alphabetic}\p{Dash_Punctuation}\p{Connector_Punctuation}']+$/;
my $encoding = "UTF-8";
16 | |
# parse FILEHANDLE
#
# Reads a word list from the given filehandle, one entry per line, and
# updates the file-level accumulators:
#   %seen  - every candidate word is marked, so repeats are skipped
#   %fs    - count of accepted words, keyed by word length
#   %grams - for each $n-character window of the padded word, the count of
#            the last character keyed by the preceding $n-1 characters
#
# Only the first whitespace-delimited field of each line is used, lowercased.
# Words that do not match $filter are ignored. Returns nothing useful.
#
# The (_) prototype is kept so existing `parse $fh;` call sites still parse.
sub parse(_) {
    my ($f) = @_;

    while (my $line = <$f>) {
        chomp $line;

        # Split on whitespace other than NBSP (\240). A blank line yields
        # no fields at all, so fall back to '' rather than lc(undef).
        my $word = lc((split /[^\S\240]/, $line)[0] // '');

        # Mark the word as seen even if the filter rejects it.
        next if $seen{$word}++ || $word !~ $filter;

        $fs{length $word}++;

        # Pad with $n-1 leading spaces and one trailing space so that the
        # start and the end of the word are captured as contexts too.
        my $padded = ' ' x ($n - 1) . "$word ";

        # Slide an $n-wide window over the padded word. Lexical loop
        # variables are used on purpose: assigning to the global $_ here
        # would overwrite whatever the caller has aliased to it (for
        # example the current element of a `for (@ARGV)` loop).
        for my $start (0 .. length($padded) - $n) {
            my $gram = substr($padded, $start, $n);
            $grams{substr($gram, 0, $n - 1)}->{substr($gram, $n - 1, 1)}++;
        }
    }
}
31 | |
# main
#
# Command-line entry point. Options:
#   -m, --module   NAME   name of the dataset to write (default "Default")
#   -e, --encoding ENC    encoding used to read the input files
#   -f, --filter   REGEX  pattern a word must match to be counted
# Every remaining argument is an input file that is fed to parse(); the
# resulting [\%grams, \%fs] pair is then written with Storable::store to a
# file named after the (normalised) module name.
#
# Exits with status 1 on bad options. Input files that cannot be opened are
# reported and skipped; failure to write the dataset is fatal.
sub main {
    my $target_mod = "Default";
    GetOptions(
        'm|module=s'   => \$target_mod,
        'e|encoding=s' => \$encoding,
        'f|filter=s'   => \$filter
    ) or exit 1;

    # Drop '-', '_' and ' ' separators and upper-case the character that
    # follows each one (and the first character): "my-mod" becomes "MyMod".
    $target_mod =~ s/(^|[-_ ])(.)/\u$2/g;

    # Recompile the filter (default or user-supplied) case-insensitively.
    $filter = qr/$filter/i;

    print "Constructing $target_mod dataset from $encoding\n";
    print "Filter: $filter\n";

    # A named lexical is used instead of $_ so that nothing called inside
    # the loop can modify @ARGV through the loop alias.
    for my $path (@ARGV) {
        print "Reading $path...\n";
        my $f;
        unless (open $f, "<:encoding($encoding)", $path) {
            warn "Cannot open $path: $!\n";
            next;
        }
        parse($f);
        close $f;
    }

    # store() returns undef on I/O failure; do not let that pass silently.
    store([\%grams, \%fs], "$target_mod")
        or die "Cannot store dataset to $target_mod: $!\n";
}
51 | |
# Run main() only when this file is executed directly; when it is loaded
# from other code, caller() is true and nothing is run.
unless (caller) {
    main();
}