Mercurial > repo
view share/construct_grams.pl @ 2012:c4f83ae4e77e
<olsner> addquote <fungot> but when she saw him fnord and fnord. and then there\'s the fnord, as well as fnord reading sauce with fish, or fnord, that alice quite fnord for it hadn\'t spoken before.
author | HackBot |
---|---|
date | Tue, 05 Feb 2013 23:27:58 +0000 |
parents | e037173e0012 |
children |
line wrap: on
line source
#!/usr/bin/perl use v5.10; use strict; use warnings; use utf8; use Storable 'store'; use Getopt::Long; my $n = 4; my %grams; my %fs; my %seen; my $encoding = "UTF-8"; my $filter = qr/^[\p{Alphabetic}\p{Dash_Punctuation}\p{Connector_Punctuation}']+$/; sub parse(_) { my ($f) = @_; while(my $line = <$f>) { my $word = lc ((split /[^\S\240]/, $line)[0]); chomp $word; next if $seen{$word}++ || $word !~ $filter; $fs{length $word}++; $word = ' ' x ($n-1) . "$word "; for(my $i = 0; $_ = substr($word, $i, $n); $i++) { last unless length == $n; $grams{substr($_, 0, $n-1)}->{substr($_, $n-1, 1)}++; } } } sub main { my $target_mod = "Default"; GetOptions( 'm|module=s' => \$target_mod, 'e|encoding=s' => \$encoding, 'f|filter=s' => \$filter ) or exit 1; $target_mod =~ s/(^|[-_ ])(.)/\u$2/g; $filter = qr/$filter/i; print "Constructing $target_mod dataset from $encoding\n"; print "Filter: $filter\n"; for (@ARGV) { print "Reading $_...\n"; open my $f, "<:encoding($encoding)", $_; parse $f; close $f; } store [\%grams, \%fs], "$target_mod"; } main unless caller;