Mercurial > repo
view paste/paste.27827 @ 6894:1041408d241c
<oerjan> le/rn soviet union/In ancient history, the Soviet Union used to be the THEM. They believed in absurd principles like "Better Red than Dead". Then Ronald Reagan invented Star Wars to destroy it, after which there seemed to be no the THEM for a while.
author | HackBot |
---|---|
date | Tue, 16 Feb 2016 21:39:22 +0000 |
parents | ccd3c0d8e14c |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; use warnings; use v5.10; use open qw( :encoding(UTF-8) :std); use File::Basename 'dirname'; use Storable 'retrieve'; use List::Util qw(sum min); use Getopt::Long qw(:config gnu_getopt); BEGIN { eval { require Math::Random::MT::Perl; Math::Random::MT::Perl->import('rand'); }; #warn "Optional module Math::Random::MT::Perl not found.\n" if $@; } #constants my @options = qw(eng-1M eng-all eng-fiction eng-gb eng-us french german hebrew russian spanish irish german-medical bulgarian catalan swedish brazilian canadian-english-insane manx italian ogerman portuguese polish gaelic finnish norwegian esolangs); my $n = 4; my $default_opt = "--eng-1M"; (my $default_dataset = $default_opt) =~ s/(^|-+)([^-])/\u$2/g; #help info my $help_text = <<END Usage: words [-dhNo] [DATASETS...] [NUMBER_OF_WORDS] options: -l, --list list valid datasets -d, --debug debugging output -N, --dont-normalize don't normalize frequencies when combining multiple Markov models; this has the effect of making larger datasets more influential -o, --target-offset change the target length offset used in the word generation algorithm; use negative integers for best results END ; my $list_text = <<END valid datasets: --@{[join ' --', @options]} default: $default_opt END ; #data from loaded files my @loaded_data; #data after normalizing and combining datasets my $grams; my $freqs; #some command line options my $debug_mode; my $target_offset = -4; #needs testing; my $dont_normalize; sub pick(%) { my ($f) = @_; my @c = keys %$f; my @w = map { $f->{$_} } @c; my $r = rand(sum(@w)); for(0..$#c) { return $c[$_] if $r < $w[$_]; $r -= $w[$_]; } print "end of pick loop reached. returned $c[$#w]\n" if $debug_mode; return $c[$#w]; } sub get_gram { my ($key) = @_; ##Lazily interpolate the gram table on the fly ##then cache the results unless (defined $grams->{$key}) { for(@loaded_data) { my $data = $_->[0]; my $g = $data->{$key} or next; my $sum = $dont_normalize || sum(values %$g); while( my ($c, $v) = each %$g ) { $grams->{$key}->{$c} += $v/$sum; } } } return $grams->{$key}; } sub generate { my $target = pick($freqs) + $target_offset; my $word = ' ' x ($n-1); my $c; do { my $len = (length $word) - ($n-1); my %ftable = %{get_gram substr($word, -$n+1, $n-1)}; ($ftable{' '} //= 0) *= 2**($len-$target); $c = pick \%ftable; $word .= $c; } while $c ne ' '; $word =~ s/\s//g; $word = "$word (L-T: @{[length($word) - $target]})" if $debug_mode; return $word; } sub load_dataset { my ($mod) = @_; push @loaded_data, retrieve ("share/WordData/$mod") or die "Unable to load $mod"; } sub main { #if (my $d = dirname $0) { chdir $d } ##Option handling my ($help_mode, $list_mode); @ARGV = split /\s+/, $ARGV[0] if @ARGV == 1; GetOptions ( 'd|debug' => \$debug_mode, 'h|help' => \$help_mode, 'l|list' => \$list_mode, 'N|dont-normalize' => \$dont_normalize, 'o|target-offset=s' => \$target_offset, map { my $mod=$_; $mod =~ s/(^|-)(.)/\u$2/g; $_, sub { load_dataset $mod }; } @options ) or exit 1; return print $help_text if $help_mode; return print $list_text if $list_mode; ##Use the default dataset if no others were specified load_dataset $default_dataset unless @loaded_data; ##In the case of 1 dataset, skip normalization by copying everything ##into the tables if (@loaded_data == 1) { ($grams, $freqs) = @{$loaded_data[0]}; } ##Otherwise, normalize and combine the length histograms. ##The gram tables will be normalized lazily as needed (see: get_gram) else { for (@loaded_data) { my $fdata = $_->[1]; my $sum = $dont_normalize || sum(values %$fdata); while ( my ($len, $f) = each %$fdata ) { $freqs->{$len} += $f/$sum; } } } ##Run word generator and print results { local $\ = ' '; print generate for 1..min(25, int($ARGV[0]||1)); } print "\n"; return 0; } exit main unless caller; 1;