Mercurial > repo
view paste/paste.9578 @ 8427:1fc808cd5b1f
<b_jonas> learn can\'t is the most frequent word whose pronunciation varies between /\xc9\x91\xcb\x90/ and /\xc3\xa6/ depending on dialect. The list is: advance after answer ask aunt brass can\'t cast castle chance class command dance demand draft enhance example fast father glass graph grass half last laugh mask master nasty pass past path plant rather sample shan\'t staff task vast
author | HackBot |
---|---|
date | Thu, 09 Jun 2016 21:28:47 +0000 |
parents | dcf344dc99da |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; use warnings; use v5.10; use open qw( :encoding(UTF-8) :std); use File::Basename 'dirname'; use Storable 'retrieve'; use List::Util qw(sum min); use Getopt::Long qw(:config gnu_getopt); BEGIN { eval { require Math::Random::MT::Perl; Math::Random::MT::Perl->import('rand'); }; #warn "Optional module Math::Random::MT::Perl not found.\n" if $@; } #constants my @options = qw(eng-1M eng-all eng-fiction eng-gb eng-us french german hebrew russian spanish irish german-medical bulgarian catalan swedish brazilian canadian-english-insane manx italian ogerman portuguese polish gaelic finnish norwegian esolangs); my $n = 4; my $default_opt = "--eng-1M"; (my $default_dataset = $default_opt) =~ s/(^|-+)([^-])/\u$2/g; #help info my $help_text = <<END Usage: words [-dhNo] [DATASETS...] [NUMBER_OF_WORDS] options: -l, --list list valid datasets -d, --debug debugging output -N, --dont-normalize don't normalize frequencies when combining multiple Markov models; this has the effect of making larger datasets more influential -o, --target-offset change the target length offset used in the word generation algorithm; use negative integers for best results END ; my $list_text = <<END valid datasets: --@{[join ' --', @options]} default: $default_opt END ; #data from loaded files my @loaded_data; #data after normalizing and combining datasets my $grams; my $freqs; #some command line options my $debug_mode; my $target_offset = -4; #needs testing; my $dont_normalize; sub pick(%) { my ($f) = @_; my @c = keys %$f; my @w = map { $f->{$_} } @c; my $r = rand(sum(@w)); for(0..$#c) { return $c[$_] if $r < $w[$_]; $r -= $w[$_]; } print "end of pick loop reached. returned $c[$#w]\n" if $debug_mode; return $c[$#w]; } sub get_gram { my ($key) = @_; ##Lazily interpolate the gram table on the fly ##then cache the results unless (defined $grams->{$key}) { for(@loaded_data) { my $data = $_->[0]; my $g = $data->{$key} or next; my $sum = $dont_normalize || sum(values %$g); while( my ($c, $v) = each %$g ) { $grams->{$key}->{$c} += $v/$sum; } } } return $grams->{$key}; } sub generate { my $target = pick($freqs) + $target_offset; my $word = ' ' x ($n-1); my $c; do { my $len = (length $word) - ($n-1); my %ftable = %{get_gram substr($word, -$n+1, $n-1)}; ($ftable{' '} //= 0) *= 2**($len-$target); $c = pick \%ftable; $word .= $c; } while $c ne ' '; $word =~ s/\s//g; $word = "$word (L-T: @{[length($word) - $target]})" if $debug_mode; return $word; } sub load_dataset { my ($mod) = @_; push @loaded_data, retrieve ("share/WordData/$mod") or die "Unable to load $mod"; } sub main { #if (my $d = dirname $0) { chdir $d } ##Option handling my ($help_mode, $list_mode); @ARGV = split /\s+/, $ARGV[0] if @ARGV == 1; GetOptions ( 'd|debug' => \$debug_mode, 'h|help' => \$help_mode, 'l|list' => \$list_mode, 'N|dont-normalize' => \$dont_normalize, 'o|target-offset=s' => \$target_offset, map { my $mod=$_; $mod =~ s/(^|-)(.)/\u$2/g; $_, sub { load_dataset $mod }; } @options ) or exit 1; return print $help_text if $help_mode; return print $list_text if $list_mode; ##Use the default dataset if no others were specified load_dataset $default_dataset unless @loaded_data; ##In the case of 1 dataset, skip normalization by copying everything ##into the tables if (@loaded_data == 1) { ($grams, $freqs) = @{$loaded_data[0]}; } ##Otherwise, normalize and combine the length histograms. ##The gram tables will be normalized lazily as needed (see: get_gram) else { for (@loaded_data) { my $fdata = $_->[1]; my $sum = $dont_normalize || sum(values %$fdata); while ( my ($len, $f) = each %$fdata ) { $freqs->{$len} += $f/$sum; } } } ##Run word generator and print results { local $\ = ' '; print generate for 1..min(25, int($ARGV[0]||1)); } print "\n"; return 0; } exit main unless caller; 1;