view share/construct_grams.pl @ 10176:00ca58f2763c

<oerjan> learn The `words dictionary framework was designed by H\xc3\xa5lgar Oslekk, Bick Noffrey, Guiston Degra\xc3\xaeme, Myyntti Raatalla, G\xc3\xb6lrika Rosenskild, Waslomir Siwovich, Gy\xc5\x91rvan S\xc3\xa1rbik, and Pastronella Gattrovezzi.
author HackBot
date Sat, 28 Jan 2017 18:46:09 +0000
parents e037173e0012
children
line wrap: on
line source

#!/usr/bin/perl
use v5.10;
use strict; use warnings;
use utf8;
use Storable 'store';
use Getopt::Long;

my $n = 4;

my %grams;
my %fs;
my %seen;

my $encoding = "UTF-8";
my $filter = qr/^[\p{Alphabetic}\p{Dash_Punctuation}\p{Connector_Punctuation}']+$/;

sub parse(_) {
    my ($f) = @_;
    while(my $line = <$f>) {
        my $word = lc ((split /[^\S\240]/, $line)[0]);
        chomp $word;
        next if $seen{$word}++ || $word !~ $filter;
        $fs{length $word}++;
        $word = ' ' x ($n-1) . "$word ";
        for(my $i = 0; $_ = substr($word, $i, $n); $i++) {
            last unless length == $n;
            $grams{substr($_, 0, $n-1)}->{substr($_, $n-1, 1)}++;
        }
    }
}

sub main {
    my $target_mod = "Default";
    GetOptions(
               'm|module=s' => \$target_mod,
               'e|encoding=s' => \$encoding,
               'f|filter=s' => \$filter
              ) or exit 1;
    $target_mod =~ s/(^|[-_ ])(.)/\u$2/g;
    $filter = qr/$filter/i;
    print "Constructing $target_mod dataset from $encoding\n";
    print "Filter: $filter\n";
    for (@ARGV) {
        print "Reading $_...\n";
        open my $f, "<:encoding($encoding)", $_;
        parse $f;
        close $f;
    }
    store [\%grams, \%fs], "$target_mod";
}

main unless caller;