0
|
1 #!/usr/bin/perl
|
|
2 use strict; use warnings;
|
|
3 use v5.10;
|
|
4 use open qw( :encoding(UTF-8) :std);
|
|
5 use Storable 'retrieve';
|
|
6 use List::Util 'sum';
|
|
7 use Getopt::Long qw(:config gnu_getopt);
|
|
# Optionally swap the core rand() for the pure-Perl Mersenne Twister
# implementation, which gives better-quality pseudo-random numbers.
# The require/import is wrapped in eval so the module stays optional:
# if it is not installed, the eval fails quietly and core rand() is used.
BEGIN {
    eval {
        require Math::Random::MT::Perl; Math::Random::MT::Perl->import('rand');
    };
    # warn "Optional module Math::Random::MT::Perl not found.\n" if $@;
}
|
|
14
|
|
#constants
# Names of the available datasets; each becomes a --<name> command-line option.
my @options = qw(eng-1M eng-all eng-fiction eng-gb eng-us french german hebrew russian spanish irish german-medical bulgarian catalan swedish brazilian canadian-english-insane manx italian ogerman portuguese polish gaelic finnish norwegian);
# Order of the Markov model: contexts are the last $n-1 characters.
my $n = 4;
my $default_opt = "--eng-1M";
# Derive the on-disk dataset name from the option string: drop leading and
# embedded dash runs and uppercase the character that follows each
# ("--eng-1M" -> "Eng1M"), matching the transform applied in main().
(my $default_dataset = $default_opt) =~ s/(^|-+)([^-])/\u$2/g;
|
|
20
|
|
#help info
# Usage text printed for -h/--help; interpolates the dataset option list
# and the default dataset. (Heredoc body is user-facing output — unchanged.)
my $help_text = <<END
Usage: words [-dhNo] [DATASETS...] [NUMBER_OF_WORDS]

valid datasets: --@{[join ' --', @options]}
default: $default_opt

options:
 -h, --help            this help text
 -d, --debug           debugging output
 -N, --dont-normalize  don't normalize frequencies when combining
                       multiple Markov models; this has the effect
                       of making larger datasets more influential
 -o, --target-offset   change the target length offset used in the
                       word generation algorithm; use negative integers
                       for best results
END
;
|
|
39
|
|
#data from loaded files
# Each element is the structure returned by Storable::retrieve for one
# dataset — presumably [gram_table, length_histogram], since element [0]
# is read by get_gram and [1] by main's histogram merge (TODO confirm
# against the files under share/WordData/).
my @loaded_data;

#data after normalizing and combining datasets
my $grams;  # context string => { next char => combined weight }; filled lazily by get_gram
my $freqs;  # word length => combined weight; used by generate to pick a target length

#some command line options
my $debug_mode;
my $target_offset = -4; #needs testing;
my $dont_normalize;
|
|
51
|
|
# Weighted random choice. Takes a hash REFERENCE mapping choices to
# non-negative weights and returns one key with probability proportional
# to its weight. If floating-point rounding leaves the random value just
# above the running total, the final key is returned as a fallback.
#
# Fixed: the sub was declared `sub pick(%)`. That prototype was both
# useless (prototypes do no argument validation) and misleading — every
# caller passes a hashref, which the % prototype merely passes through,
# while an actual hash argument would be flattened and silently broken by
# `my ($f) = @_`. The prototype is dropped; all existing call sites
# (`pick($freqs)`, `pick \%ftable`) behave identically.
sub pick {
    my ($f) = @_;
    my @c = keys %$f;
    my @w = map { $f->{$_} } @c;
    my $r = rand(sum(@w));
    for (0..$#c) {
        return $c[$_] if $r < $w[$_];
        $r -= $w[$_];
    }
    print "end of pick loop reached. returned $c[$#w]\n" if $debug_mode;
    return $c[$#w];
}
|
|
64
|
|
# Return the combined next-character frequency table for a context string.
# The table is interpolated across all loaded datasets the first time a
# context is requested, then served from the $grams cache thereafter.
sub get_gram {
    my ($key) = @_;
    if (!defined $grams->{$key}) {
        for my $dataset (@loaded_data) {
            my $gram_table = $dataset->[0];
            my $entry = $gram_table->{$key} or next;
            # When --dont-normalize is set the divisor is its true value (1),
            # leaving raw counts; otherwise each dataset's counts are scaled
            # by their total so every dataset contributes equally.
            my $divisor = $dont_normalize || sum(values %$entry);
            for my $char (keys %$entry) {
                $grams->{$key}{$char} += $entry->{$char} / $divisor;
            }
        }
    }
    return $grams->{$key};
}
|
|
81
|
|
# Generate one word by walking the Markov chain until a space (word
# terminator) is emitted. Returns the word, annotated with its
# length-minus-target delta in debug mode.
sub generate {
    # Target length drawn from the length histogram, shifted by the
    # (typically negative) --target-offset to tune the bias applied below.
    my $target = pick($freqs) + $target_offset;
    # Seed context: n-1 spaces act as the start-of-word marker.
    my $word = ' ' x ($n-1);
    my $c;
    do {
        # Number of real (non-seed) characters emitted so far.
        my $len = (length $word) - ($n-1);
        # Copy of the frequency table for the last n-1 characters.
        my %ftable = %{get_gram substr($word, -$n+1, $n-1)};
        # Scale the weight of the terminating space: below target length the
        # factor is < 1 (discourages stopping), past it the weight doubles
        # per extra character. //= 0 guards contexts with no space entry.
        ($ftable{' '} //= 0) *= 2**($len-$target);
        $c = pick \%ftable;
        $word .= $c;
    } while $c ne ' ';
    # Strip the seed spaces and the terminating space.
    $word =~ s/\s//g;
    $word = "$word (L-T: @{[length($word) - $target]})" if $debug_mode;
    return $word;
}
|
|
97
|
|
# Load one Storable-serialized dataset from share/WordData/ and append it
# to @loaded_data. Dies if the dataset cannot be retrieved.
#
# Fixed: the original `push @loaded_data, retrieve(...) or die ...` parsed
# as `push(...) or die` — push returns the element count, which is always
# true, so the die was unreachable and a retrieve() that returned undef
# would silently push undef. Check the retrieve result before pushing.
sub load_dataset {
    my ($mod) = @_;
    my $data = retrieve("share/WordData/$mod")
        or die "Unable to load $mod";
    push @loaded_data, $data;
}
|
|
102
|
|
# Entry point: parse options, load and combine datasets, print words.
# Returns 0 on success (value used as the process exit status).
sub main {
    ##Option handling
    ##If the whole command line arrived as one string (e.g. via a wrapper
    ##that didn't word-split), break it apart on whitespace first.
    @ARGV = split /\s+/, $ARGV[0] if @ARGV == 1;
    my $help_mode;
    GetOptions (
        'd|debug' => \$debug_mode,
        'h|help' => \$help_mode,
        'N|dont-normalize' => \$dont_normalize,
        'o|target-offset=s' => \$target_offset,
        ##Each dataset name becomes a --option whose handler loads the
        ##matching Storable file ("eng-1M" -> "Eng1M"); the map emits
        ##name => coderef pairs into the GetOptions spec list.
        map {
            my $mod=$_;
            $mod =~ s/(^|-)(.)/\u$2/g;
            $_, sub { load_dataset $mod };
        } @options
    ) or exit 1;
    return print $help_text if $help_mode;

    ##Use the default dataset if no others were specified
    load_dataset $default_dataset unless @loaded_data;
    ##In the case of 1 dataset, skip normalization by copying everything
    ##into the tables
    if (@loaded_data == 1) {
        ($grams, $freqs) = @{$loaded_data[0]};
    }
    ##Otherwise, normalize and combine the length histograms.
    ##The gram tables will be normalized lazily as needed (see: get_gram)
    else {
        for (@loaded_data) {
            my $fdata = $_->[1];
            ##With --dont-normalize the divisor is 1 (the flag's value).
            my $sum = $dont_normalize || sum(values %$fdata);
            while ( my ($len, $f) = each %$fdata ) {
                $freqs->{$len} += $f/$sum;
            }
        }
    }

    ##Run word generator and print results
    ##Any leftover non-option argument is the word count (default 1);
    ##$, joins the generated words with single spaces.
    local $, = ' ';
    print map {generate} 1..int($ARGV[0]||1);
    print "\n";
    return 0;
}
|
|
145
|
|
# Modulino pattern: run main() and exit with its status when executed as
# a script, but do nothing when the file is require'd (e.g. by tests).
exit main unless caller;
1;
|