view share/construct_grams.pl @ 2651:4c617ce2bbe1

<Jafet> echo -e \'\\x7fELF\\x02\\x01\\x01\\0\\0\\0\\0\\0\\0\\0\\0\\0\\x02\\0>\\0\\x01\\0\\0\\0x\\0@\\0\\0\\0\\0\\0@\\0\\0\\0\\0\\0\\0\\0\\xb0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0@\\08\\0\\x01\\0@\\0\\x03\\0\\x02\\0\\x01\\0\\0\\0\\x05\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0@\\0\\0\\0\\0\\0\\0\\0@\\0\\0\\0\\0\\0\\x9a\\0\\0\\0\\0\\0\\0\\0\\x9a\\0\\0\\0\\0\\0\\0\\0\\0\\0 \\0\\0\\0\\0\\0H\\xc7\\xc7\\x01\\0\\0\\0H\\xc7\\xc6\\x98\\0@\\0H\\xc7\\xc2\\x02\\0\\0\\0H\\xc7\\xc0\\x01\\0\\0\\0\\x0f\\x05\\xeb\\xf5y\' > bin/y && chmod +x bin/y
author HackBot
date Sat, 13 Apr 2013 22:29:43 +0000
parents e037173e0012
children
line wrap: on
line source

#!/usr/bin/perl
use v5.10;
use strict; use warnings;
use utf8;
use Storable 'store';
use Getopt::Long;

# Width, in characters, of the sliding window taken over each word:
# $n-1 characters of context plus the one character that follows them.
my $n = 4;

# $grams{$context}{$char}: how many times $char followed the ($n-1)-character
# string $context.
my %grams;
# $fs{$length}: how many accepted words had that length.
my %fs;
# $seen{$word}: how many times a lower-cased first field has been read so far;
# used so that each distinct word is processed at most once.
my %seen;

# Encoding layer applied to input files; can be overridden with -e/--encoding.
my $encoding = "UTF-8";
# Pattern a word must match to be counted; can be overridden with -f/--filter.
my $filter = qr/^[\p{Alphabetic}\p{Dash_Punctuation}\p{Connector_Punctuation}']+$/;

# parse FILEHANDLE
#
# Reads a word list from FILEHANDLE, one entry per line, and accumulates
# statistics into the file-level tables:
#   %seen  - every lower-cased first field encountered (for de-duplication)
#   %fs    - number of accepted words per word length
#   %grams - for each ($n-1)-character context, how often each character
#            followed it
# The (_) prototype means the handle defaults to $_ when no argument is given.
# Returns nothing useful; all results are side effects on those tables.
sub parse(_) {
    my ($f) = @_;
    while (my $line = <$f>) {
        # The word is the first field of the line.  Fields are separated by
        # any whitespace character other than \240 (NO-BREAK SPACE), which is
        # deliberately not treated as a separator.  A blank line yields no
        # fields at all, so fall back to '' instead of passing undef to lc.
        my $word = lc((split /[^\S\240]/, $line)[0] // '');
        chomp $word;

        # Skip repeats (every first field is recorded, accepted or not) and
        # anything the filter rejects.
        next if $seen{$word}++ || $word !~ $filter;

        $fs{length $word}++;

        # Pad with $n-1 leading spaces and one trailing space so that the
        # start and the end of the word are represented in the table too.
        $word = ' ' x ($n-1) . "$word ";

        # Slide a window of $n characters over the padded word.  A lexical
        # index and window are used rather than $_, because callers may have
        # $_ aliased to their own data (e.g. inside a foreach loop).
        for my $i (0 .. length($word) - $n) {
            my $gram = substr($word, $i, $n);
            $grams{substr($gram, 0, $n-1)}{substr($gram, $n-1, 1)}++;
        }
    }
}

# main
#
# Command-line entry point.  Options:
#   -m, --module NAME     dataset name (default "Default"); the first character
#                         and every character following '-', '_' or ' ' is
#                         upper-cased and the separator removed, e.g.
#                         "my-data_set" becomes "MyDataSet"
#   -e, --encoding ENC    encoding of the input files (default "UTF-8")
#   -f, --filter REGEX    pattern a word must match to be counted
# Every remaining argument is a word-list file fed to parse().  The result,
# [\%grams, \%fs], is written with Storable::store to a file named after the
# normalised module name.
#
# Exits with status 1 on bad options, warns and skips any file that cannot be
# opened, and dies if the dataset cannot be written.
sub main {
    my $target_mod = "Default";
    GetOptions(
               'm|module=s' => \$target_mod,
               'e|encoding=s' => \$encoding,
               'f|filter=s' => \$filter
              ) or exit 1;
    $target_mod =~ s/(^|[-_ ])(.)/\u$2/g;
    # Recompile the filter (default or user-supplied) with the /i flag.
    $filter = qr/$filter/i;
    print "Constructing $target_mod dataset from $encoding\n";
    print "Filter: $filter\n";
    # A named loop variable keeps the file name safe from anything parse()
    # might do to $_.
    for my $path (@ARGV) {
        print "Reading $path...\n";
        my $f;
        if (!open $f, "<:encoding($encoding)", $path) {
            # Report the problem and carry on with the remaining files
            # instead of reading from an unopened handle.
            warn "Cannot open $path: $!\n";
            next;
        }
        parse($f);
        close $f;
    }
    # store() returns undef on I/O errors; do not let that pass silently.
    store([\%grams, \%fs], $target_mod)
        or die "Cannot store dataset in $target_mod: $!\n";
}

# Run main() only when this file is executed directly; when it is loaded from
# other code (require/do), caller is true and nothing runs automatically.
main() unless caller;