view share/construct_grams.pl @ 11340:77399ae45cb1

<wob_jonas> slashlearn peace witch//Peace witches do alchemy: they turn mundane building material to gold. They\'re in the same universe where Bowser turned peaceful citizens of the Mushroom Kingdom to building material.
author HackBot
date Tue, 06 Feb 2018 23:37:00 +0000
parents e037173e0012
children
line wrap: on
line source

#!/usr/bin/perl
use v5.10;
use strict; use warnings;
use utf8;
use Storable 'store';
use Getopt::Long;

my $n = 4;

my %grams;
my %fs;
my %seen;

my $encoding = "UTF-8";
my $filter = qr/^[\p{Alphabetic}\p{Dash_Punctuation}\p{Connector_Punctuation}']+$/;

sub parse(_) {
    my ($f) = @_;
    while(my $line = <$f>) {
        my $word = lc ((split /[^\S\240]/, $line)[0]);
        chomp $word;
        next if $seen{$word}++ || $word !~ $filter;
        $fs{length $word}++;
        $word = ' ' x ($n-1) . "$word ";
        for(my $i = 0; $_ = substr($word, $i, $n); $i++) {
            last unless length == $n;
            $grams{substr($_, 0, $n-1)}->{substr($_, $n-1, 1)}++;
        }
    }
}

sub main {
    my $target_mod = "Default";
    GetOptions(
               'm|module=s' => \$target_mod,
               'e|encoding=s' => \$encoding,
               'f|filter=s' => \$filter
              ) or exit 1;
    $target_mod =~ s/(^|[-_ ])(.)/\u$2/g;
    $filter = qr/$filter/i;
    print "Constructing $target_mod dataset from $encoding\n";
    print "Filter: $filter\n";
    for (@ARGV) {
        print "Reading $_...\n";
        open my $f, "<:encoding($encoding)", $_;
        parse $f;
        close $f;
    }
    store [\%grams, \%fs], "$target_mod";
}

main unless caller;