annotate paste/paste.18049 @ 6894:1041408d241c

<oerjan> le/rn soviet union/In ancient history, the Soviet Union used to be the THEM. They believed in absurd principles like "Better Red than Dead". Then Ronald Reagan invented Star Wars to destroy it, after which there seemed to be no the THEM for a while.
author HackBot
date Tue, 16 Feb 2016 21:39:22 +0000
parents e037173e0012
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e037173e0012 Initial import.
HackBot
parents:
diff changeset
1 #!/usr/bin/perl
e037173e0012 Initial import.
HackBot
parents:
diff changeset
2 use strict; use warnings;
e037173e0012 Initial import.
HackBot
parents:
diff changeset
3 use v5.10;
e037173e0012 Initial import.
HackBot
parents:
diff changeset
4 use open qw( :encoding(UTF-8) :std);
e037173e0012 Initial import.
HackBot
parents:
diff changeset
5 use File::Basename 'dirname';
e037173e0012 Initial import.
HackBot
parents:
diff changeset
6 use Storable 'retrieve';
e037173e0012 Initial import.
HackBot
parents:
diff changeset
7 use List::Util qw(sum min);
e037173e0012 Initial import.
HackBot
parents:
diff changeset
8 use Getopt::Long qw(:config gnu_getopt);
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Optional dependency: when Math::Random::MT::Perl is installed, import its
# rand() in place of the built-in one.  Any failure to load or import the
# module is deliberately ignored, so the built-in rand() remains in effect.
BEGIN {
    eval {
        require Math::Random::MT::Perl;
        Math::Random::MT::Perl->import('rand');
    };
    # Uncomment to be told when the optional module is unavailable:
    #warn "Optional module Math::Random::MT::Perl not found.\n" if $@;
}
e037173e0012 Initial import.
HackBot
parents:
diff changeset
15
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Constants.

# Names of the selectable datasets; each one is accepted on the command line
# as --NAME.
my @options = qw(
    eng-1M eng-all eng-fiction eng-gb eng-us
    french german hebrew russian spanish
    irish german-medical bulgarian catalan swedish
    brazilian canadian-english-insane manx italian ogerman
    portuguese polish gaelic finnish norwegian
);

# Order of the n-gram model: each character is drawn from a table keyed by the
# previous $n-1 characters.
my $n = 4;

# Dataset used when none is requested, spelled as a command-line option.
my $default_opt = "--eng-1M";

# Name of the default dataset's data file: drop the dashes and upper-case the
# character following each run of them ("--eng-1M" becomes "Eng1M").
my $default_dataset = $default_opt;
$default_dataset =~ s/(^|-+)([^-])/\u$2/g;
e037173e0012 Initial import.
HackBot
parents:
diff changeset
21
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Help text printed for -h/--help.
# The heredoc is non-interpolating so that nothing in the text can ever be
# mistaken for a variable.  The synopsis lists every short switch, and every
# switch named in the synopsis is described below it.
my $help_text = <<'END';
Usage: words [-dhlNo] [DATASETS...] [NUMBER_OF_WORDS]

options:
    -h, --help              show this help text
    -l, --list              list valid datasets
    -d, --debug             debugging output
    -N, --dont-normalize    don't normalize frequencies when combining
                            multiple Markov models; this has the effect
                            of making larger datasets more influential
    -o, --target-offset     change the target length offset used in the
                            word generation algorithm; use negative integers
                            for best results
END
e037173e0012 Initial import.
HackBot
parents:
diff changeset
37
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Text printed for -l/--list: every dataset name spelled as a command-line
# option, followed by the option that is used by default.
my $list_text = 'valid datasets: --' . join(' --', @options) . "\n"
              . "default: $default_opt\n";
e037173e0012 Initial import.
HackBot
parents:
diff changeset
43
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Datasets read from disk, one entry per loaded dataset.  Element [0] of an
# entry is used as its gram table and element [1] as its word-length
# histogram.
my @loaded_data;

# Working tables after the loaded datasets have been normalized and combined:
# the gram table and the word-length histogram.
my ($grams, $freqs);

# Settings controlled from the command line.
my ($debug_mode, $dont_normalize);
my $target_offset = -4;    # needs testing
e037173e0012 Initial import.
HackBot
parents:
diff changeset
55
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Choose a random key from a table of weights.
#
# Takes a reference to a hash mapping candidates to non-negative weights and
# returns one key, selected with probability proportional to its weight.
#
# No prototype is declared: the sub takes a hash *reference*, so the former
# "(%)" prototype described an interface it never had.
#
# If the loop finishes without a winner (all weights are zero, or rounding
# left a sliver of $r over) the last key is returned.  For an empty table
# that is undef.
sub pick {
    my ($table) = @_;
    my @candidates = keys %$table;
    my @weights    = @{$table}{@candidates};

    # sum() of an empty list is undef; treat that as a total weight of zero.
    my $r = rand(sum(@weights) // 0);

    # Walk the candidates, consuming each one's weight until $r falls inside
    # the current candidate's share.
    for my $i (0 .. $#candidates) {
        return $candidates[$i] if $r < $weights[$i];
        $r -= $weights[$i];
    }

    print "end of pick loop reached. returned $candidates[-1]\n" if $debug_mode;
    return $candidates[-1];
}
e037173e0012 Initial import.
HackBot
parents:
diff changeset
68
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Return the frequency table of next characters for the context $key.
#
# The combined table is built on first use and cached in $grams: each loaded
# dataset that has an entry for $key contributes its counts, divided by that
# entry's total so every dataset weighs the same.  With $dont_normalize set
# the divisor is $dont_normalize itself, so raw counts are added when it is 1.
# Returns undef when no dataset has an entry for $key.
sub get_gram {
    my ($key) = @_;

    # Already combined (or supplied ready-made by a single dataset).
    return $grams->{$key} if defined $grams->{$key};

    for my $dataset (@loaded_data) {
        my $model  = $dataset->[0];
        my $counts = $model->{$key} or next;
        my $divisor = $dont_normalize || sum(values %$counts);
        for my $char (keys %$counts) {
            $grams->{$key}->{$char} += $counts->{$char} / $divisor;
        }
    }

    return $grams->{$key};
}
e037173e0012 Initial import.
HackBot
parents:
diff changeset
85
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Generate one random word.
#
# A target length is drawn from the length histogram $freqs and shifted by
# $target_offset.  Starting from a context of $n-1 spaces, characters are
# drawn one at a time from the gram table for the last $n-1 characters.  The
# weight of the terminating space is scaled by 2**(length - target), which
# suppresses it while the word is shorter than the target and boosts it once
# the word is longer.  Returns the word without spaces; in debug mode the
# difference between its length and the target is appended.
sub generate {
    my $context_len = $n - 1;
    my $target = pick($freqs) + $target_offset;
    my $word   = ' ' x $context_len;

    while (1) {
        my $len = length($word) - $context_len;

        # Work on a copy so the cached gram table is not modified.
        my %weights = %{ get_gram(substr($word, -$context_len, $context_len)) };
        $weights{' '} //= 0;
        $weights{' '} *= 2 ** ($len - $target);

        my $next = pick(\%weights);
        $word .= $next;
        last if $next eq ' ';
    }

    $word =~ s/\s//g;
    if ($debug_mode) {
        my $overshoot = length($word) - $target;
        $word = "$word (L-T: $overshoot)";
    }
    return $word;
}
e037173e0012 Initial import.
HackBot
parents:
diff changeset
101
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Load the dataset file share/WordData/$mod and append it to @loaded_data.
#
# Dies with "Unable to load $mod" when retrieve() returns nothing; errors that
# retrieve() raises itself propagate unchanged.  Returns the number of
# datasets loaded so far.
#
# The result of retrieve() is checked before it is pushed.  Writing
# "push @loaded_data, retrieve(...) or die ..." tests the return value of
# push, which is the (never zero) new array length, so a failed load would be
# stored as undef and the die could never trigger.
sub load_dataset {
    my ($mod) = @_;
    my $data = retrieve("share/WordData/$mod")
        or die "Unable to load $mod";
    push @loaded_data, $data;
    return scalar @loaded_data;
}
e037173e0012 Initial import.
HackBot
parents:
diff changeset
106
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# Program entry point: parse the command line, load the requested datasets,
# then print up to 25 generated words on one line.
#
# Returns the process exit status: 0 on success, including after printing the
# help or dataset list.  Invalid options terminate the program with status 1.
sub main {
    #if (my $d = dirname $0) { chdir $d }

    ##Option handling
    my ($help_mode, $list_mode);

    # A lone argument is treated as a whole command line and broken into
    # words.  split ' ' (unlike split /\s+/) skips leading whitespace, so no
    # empty first word can displace NUMBER_OF_WORDS.
    @ARGV = split ' ', $ARGV[0] if @ARGV == 1;

    # One --NAME switch per dataset; each loads the data file whose name is
    # the option name with its dashes removed and the following letters
    # upper-cased.
    my @dataset_specs = map {
        my $opt = $_;
        (my $mod = $opt) =~ s/(^|-)(.)/\u$2/g;
        ($opt => sub { load_dataset($mod) });
    } @options;

    GetOptions(
        'd|debug'           => \$debug_mode,
        'h|help'            => \$help_mode,
        'l|list'            => \$list_mode,
        'N|dont-normalize'  => \$dont_normalize,
        'o|target-offset=s' => \$target_offset,
        @dataset_specs,
    ) or exit 1;

    # Informational modes succeed with status 0.  ("return print ..." would
    # hand print's true value to exit and report a failure.)
    if ($help_mode) {
        print $help_text;
        return 0;
    }
    if ($list_mode) {
        print $list_text;
        return 0;
    }

    ##Use the default dataset if no others were specified
    load_dataset($default_dataset) unless @loaded_data;

    ##In the case of 1 dataset, skip normalization by copying everything
    ##into the tables
    if (@loaded_data == 1) {
        ($grams, $freqs) = @{ $loaded_data[0] };
    }
    ##Otherwise, normalize and combine the length histograms.
    ##The gram tables will be normalized lazily as needed (see: get_gram)
    else {
        for my $dataset (@loaded_data) {
            my $fdata = $dataset->[1];
            my $sum = $dont_normalize || sum(values %$fdata);
            while (my ($len, $f) = each %$fdata) {
                $freqs->{$len} += $f / $sum;
            }
        }
    }

    ##Run word generator and print results
    my $count = min(25, int($ARGV[0] || 1));
    {
        # Each word is followed by a space.  The separator is confined to
        # this block so the closing newline is not followed by one too.
        local $\ = ' ';
        for (1 .. $count) {
            my $word = generate();
            print $word;
        }
    }
    print "\n";
    return 0;
}
e037173e0012 Initial import.
HackBot
parents:
diff changeset
151
e037173e0012 Initial import.
HackBot
parents:
diff changeset
# When this file is executed directly (there is no caller), run main() and
# exit with the status it returns.  When it is loaded by another file, skip
# that and finish with a true value.
if (!caller) {
    exit main();
}
1;
e037173e0012 Initial import.
HackBot
parents:
diff changeset
154