Mercurial > repo
diff share/construct_grams.pl @ 0:e037173e0012
Initial import.
author | HackBot |
---|---|
date | Thu, 16 Feb 2012 19:42:32 +0000 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/perl
# share/construct_grams.pl
#
# Reads one or more word-list files, counts character n-grams over the first
# whitespace-delimited field of every line, and serialises the resulting
# tables with Storable.
#
# Options (see main):
#   -m, --module    name of the dataset / output file (default "Default")
#   -e, --encoding  encoding of the input files       (default "UTF-8")
#   -f, --filter    regex a word must match to be counted
use v5.10;
use strict;
use warnings;
use utf8;
use Storable 'store';
use Getopt::Long;

# Length of each gram: ($n - 1) characters of context plus the one character
# that follows them.
my $n = 4;

# $grams{$context}{$next} = how many times $next followed $context.
my %grams;
# $fs{$length} = number of distinct accepted words with that length.
my %fs;
# Words that have already been counted; duplicates are skipped, across
# lines and across input files.
my %seen;

my $encoding = "UTF-8";
my $filter = qr/^[\p{Alphabetic}\p{Dash_Punctuation}\p{Connector_Punctuation}']+$/;

# parse($fh)
#
# Reads $fh line by line and folds every new, filter-matching word into
# %grams and %fs. Only the first field of each line is used; fields are
# separated by any whitespace character except U+00A0 (octal \240).
# Returns nothing useful; the results live in the file-level hashes.
sub parse(_) {
    my ($f) = @_;
    while (my $line = <$f>) {
        # A blank line yields no fields at all, so default to '' instead of
        # handing undef to lc (which would warn). The separator class already
        # covers "\n" and "\r", so no chomp is needed.
        my ($first) = split /[^\S\240]/, $line;
        my $word = lc($first // '');

        # Test the filter first so rejected words are never stored in %seen.
        next if $word !~ $filter || $seen{$word}++;

        $fs{length $word}++;

        # Pad with ($n - 1) leading spaces and one trailing space so that the
        # start and the end of the word get grams of their own.
        my $padded = ' ' x ($n - 1) . "$word ";

        # Lexical loop variables only: assigning to the global $_ here would
        # overwrite whatever the caller currently has aliased to $_.
        for my $i (0 .. length($padded) - $n) {
            my $gram    = substr($padded, $i, $n);
            my $context = substr($gram, 0, $n - 1);
            my $next    = substr($gram, $n - 1, 1);
            $grams{$context}{$next}++;
        }
    }
    return;
}

# main()
#
# Parses the command line, feeds every remaining argument (a file name) to
# parse, and stores [\%grams, \%fs] in a file named after the normalised
# module name in the current directory. Exits with status 1 on bad options
# and dies if an input file cannot be opened or the dataset cannot be stored.
sub main {
    my $target_mod = "Default";
    GetOptions(
        'm|module=s'   => \$target_mod,
        'e|encoding=s' => \$encoding,
        'f|filter=s'   => \$filter,
    ) or exit 1;

    # "foo-bar_baz qux" -> "FooBarBazQux": drop the separators and upper-case
    # the character that follows each of them (and the very first one).
    $target_mod =~ s/(^|[-_ ])(.)/\u$2/g;
    $filter = qr/$filter/i;

    say "Constructing $target_mod dataset from $encoding";
    say "Filter: $filter";

    for my $path (@ARGV) {
        say "Reading $path...";
        open my $f, "<:encoding($encoding)", $path
            or die "Cannot open $path: $!\n";
        parse($f);
        close $f;
    }

    # store returns undef on I/O errors; do not report success silently.
    store([\%grams, \%fs], $target_mod)
        or die "Cannot store dataset to $target_mod: $!\n";
    return;
}

main() unless caller;