view bin/words @ 11293:a7899ef2d7b6

<wob_jonas> learn Aristotle said that every illness can be cured by balancing the four vitreous humors, and everyone believed him for two thousand years, even though people still died of illnesses. It wasn\'t until the 20th century that Szent-Gy\xc3\xb6rgyi Albert realized that Aristotle didn\'t find fifth kind of vitreous humor, vitamin C, because the Greek alphabet
author HackBot
date Mon, 01 Jan 2018 17:57:43 +0000
parents e037173e0012
children 128e8ef9d583
line wrap: on
line source

#!/usr/bin/perl
use strict; use warnings;
use v5.10;
use open qw( :encoding(UTF-8) :std);
use File::Basename 'dirname';
use Storable 'retrieve';
use List::Util qw(sum min);
use Getopt::Long qw(:config gnu_getopt);
BEGIN {
    eval {
        require Math::Random::MT::Perl; Math::Random::MT::Perl->import('rand');
    };
#warn "Optional module Math::Random::MT::Perl not found.\n" if $@;
}

#constants
my @options = qw(eng-1M eng-all eng-fiction eng-gb eng-us french german hebrew russian spanish irish german-medical bulgarian catalan swedish brazilian canadian-english-insane manx italian ogerman portuguese polish gaelic finnish norwegian esolangs);
my $n = 4;
my $default_opt     = "--eng-1M";
(my $default_dataset = $default_opt) =~ s/(^|-+)([^-])/\u$2/g;

#help info
my $help_text = <<END
Usage: words [-dhNo] [DATASETS...] [NUMBER_OF_WORDS]

options:
  -l, --list             list valid datasets
  -d, --debug            debugging output
  -N, --dont-normalize   don't normalize frequencies when combining
                         multiple Markov models; this has the effect
                         of making larger datasets more influential
  -o, --target-offset    change the target length offset used in the
                         word generation algorithm; use negative integers
                         for best results
END
;

my $list_text = <<END
valid datasets: --@{[join ' --', @options]}
default: $default_opt
END
;

#data from loaded files
my @loaded_data;

#data after normalizing and combining datasets
my $grams;
my $freqs;

#some command line options
my $debug_mode;
my $target_offset = -4; #needs testing;
my $dont_normalize;

sub pick(%) {
    my ($f) = @_;
    my @c = keys %$f;
    my @w = map { $f->{$_} } @c;
    my $r = rand(sum(@w));
    for(0..$#c) {
        return $c[$_] if $r < $w[$_];
        $r -= $w[$_];
    }
    print "end of pick loop reached. returned $c[$#w]\n" if $debug_mode;
    return $c[$#w];
}

sub get_gram {
    my ($key) = @_;
    ##Lazily interpolate the gram table on the fly
    ##then cache the results
    unless (defined $grams->{$key}) {
        for(@loaded_data) {
            my $data = $_->[0];
            my $g = $data->{$key} or next;
            my $sum = $dont_normalize || sum(values %$g);
            while( my ($c, $v) = each %$g ) {
                $grams->{$key}->{$c} += $v/$sum;
            }
        }
    }
    return $grams->{$key};
}

sub generate {
    my $target = pick($freqs) + $target_offset;
    my $word = ' ' x ($n-1);
    my $c;
    do {
        my $len = (length $word) - ($n-1);
        my %ftable = %{get_gram substr($word, -$n+1, $n-1)};
        ($ftable{' '} //= 0) *= 2**($len-$target);
        $c = pick \%ftable;
        $word .= $c;
    } while $c ne ' ';
    $word =~ s/\s//g;
    $word = "$word (L-T: @{[length($word) - $target]})" if $debug_mode;
    return $word;
}

sub load_dataset {
    my ($mod) = @_;
    push @loaded_data, retrieve ("share/WordData/$mod") or die "Unable to load $mod";
}

sub main {
    #if (my $d = dirname $0) { chdir $d }
    ##Option handling
    my ($help_mode, $list_mode);
    @ARGV = split /\s+/, $ARGV[0] if @ARGV == 1;
    GetOptions (
                'd|debug'            => \$debug_mode,
                'h|help'             => \$help_mode,
                'l|list'             => \$list_mode,
                'N|dont-normalize'   => \$dont_normalize,
                'o|target-offset=s'  => \$target_offset,
                map {
                    my $mod=$_;
                    $mod =~ s/(^|-)(.)/\u$2/g;
                    $_, sub { load_dataset $mod };
                } @options
               ) or exit 1;
    return print $help_text if $help_mode;
    return print $list_text if $list_mode;
    ##Use the default dataset if no others were specified
    load_dataset $default_dataset unless @loaded_data;
    ##In the case of 1 dataset, skip normalization by copying everything
    ##into the tables
    if (@loaded_data == 1) {
        ($grams, $freqs) = @{$loaded_data[0]};
    }
    ##Otherwise, normalize and combine the length histograms.
    ##The gram tables will be normalized lazily as needed (see: get_gram)
    else {
        for (@loaded_data) {
            my $fdata = $_->[1];
            my $sum = $dont_normalize || sum(values %$fdata);
            while ( my ($len, $f) = each %$fdata ) {
                $freqs->{$len} += $f/$sum;
            }
        }
    }

    ##Run word generator and print results
    {
        local $\ = ' ';
        print generate for 1..min(25, int($ARGV[0]||1));
    }
    print "\n";
    return 0;
}

exit main unless caller;
1;