Mercurial > repo
view share/construct_grams.pl @ 11182:9d5983817909
<wob_jonas> learn Sauron is the eponymous protagonist of the Lord of the Rings series. He serves primarily as narrator and the main driver of the plot. His heroic exploits include the resurrection of the Kings of Men and the conquest of the racists of Gondor. He now leads the Illuminati from his pyramid fort /\xea\x99\xa9\\ .
author | HackBot |
---|---|
date | Sat, 02 Sep 2017 18:01:47 +0000 |
parents | e037173e0012 |
children |
line wrap: on
line source
#!/usr/bin/perl
# Build a character n-gram frequency table and a word-length histogram from
# one or more word-list files, then save both with Storable.
#
# Usage: construct_grams.pl [-m MODULE] [-e ENCODING] [-f FILTER] FILE...
use v5.10;
use strict;
use warnings;
use utf8;
use Storable 'store';
use Getopt::Long;

my $n = 4;    # n-gram size: ($n - 1) characters of context plus one successor
my %grams;    # context string => { successor character => occurrence count }
my %fs;       # word length => number of accepted words of that length
my %seen;     # every word encountered so far, so repeats are processed once
my $encoding = "UTF-8";
my $filter = qr/^[\p{Alphabetic}\p{Dash_Punctuation}\p{Connector_Punctuation}']+$/;

# parse FILEHANDLE
#
# Reads FILEHANDLE line by line. The first field of each line (fields are
# separated by any whitespace character other than U+00A0) is lower-cased and,
# if it is new and matches $filter, counted in %fs by length and broken into
# $n-character windows that are tallied in %grams.
sub parse(_) {
    my ($f) = @_;

    while (my $line = <$f>) {
        my $first = (split /[^\S\240]/, $line)[0];

        # A blank line yields no fields at all; skip it rather than feeding
        # undef to lc(), which would warn.
        next unless defined $first;

        my $word = lc $first;
        chomp $word;
        next if $seen{$word}++ || $word !~ $filter;

        $fs{length $word}++;

        # Pad with ($n - 1) leading spaces so the first letters have a
        # context, and one trailing space that acts as the end-of-word
        # successor.
        my $padded = ' ' x ($n - 1) . "$word ";

        # Lexical loop variables only: the caller iterates @ARGV through the
        # global $_, so assigning to $_ here would overwrite its file names.
        for my $i (0 .. length($padded) - $n) {
            my $gram = substr $padded, $i, $n;
            $grams{ substr $gram, 0, $n - 1 }{ substr $gram, $n - 1, 1 }++;
        }
    }

    return;
}

# main
#
# Parses the command-line options, feeds every remaining argument (a file
# path) through parse(), and stores [\%grams, \%fs] in a file named after the
# normalized module name. Dies if an input file cannot be opened or the
# dataset cannot be written.
sub main {
    my $target_mod = "Default";

    GetOptions(
        'm|module=s'   => \$target_mod,
        'e|encoding=s' => \$encoding,
        'f|filter=s'   => \$filter,
    ) or exit 1;

    # Turn "some-module name" into "SomeModuleName".
    $target_mod =~ s/(^|[-_ ])(.)/\u$2/g;
    $filter = qr/$filter/i;

    print "Constructing $target_mod dataset from $encoding\n";
    print "Filter: $filter\n";

    for my $path (@ARGV) {
        print "Reading $path...\n";
        open my $f, "<:encoding($encoding)", $path
            or die "Cannot open $path: $!\n";
        parse($f);
        close $f;
    }

    # store() returns undef on I/O failure instead of dying.
    store [\%grams, \%fs], "$target_mod"
        or die "Cannot write $target_mod: $!\n";

    return;
}

main unless caller;