Mercurial > repo
comparison paste/paste.27827 @ 579:ccd3c0d8e14c
<nortti_> run cat `which words` | paste
author | HackBot |
---|---|
date | Wed, 04 Jul 2012 23:55:02 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
578:3ba31771c9ab | 579:ccd3c0d8e14c |
---|---|
1 #!/usr/bin/perl | |
2 use strict; use warnings; | |
3 use v5.10; | |
4 use open qw( :encoding(UTF-8) :std); | |
5 use File::Basename 'dirname'; | |
6 use Storable 'retrieve'; | |
7 use List::Util qw(sum min); | |
8 use Getopt::Long qw(:config gnu_getopt); | |
9 BEGIN { | |
10 eval { | |
11 require Math::Random::MT::Perl; Math::Random::MT::Perl->import('rand'); | |
12 }; | |
13 #warn "Optional module Math::Random::MT::Perl not found.\n" if $@; | |
14 } | |
15 | |
16 #constants | |
17 my @options = qw(eng-1M eng-all eng-fiction eng-gb eng-us french german hebrew russian spanish irish german-medical bulgarian catalan swedish brazilian canadian-english-insane manx italian ogerman portuguese polish gaelic finnish norwegian esolangs); | |
18 my $n = 4; | |
19 my $default_opt = "--eng-1M"; | |
20 (my $default_dataset = $default_opt) =~ s/(^|-+)([^-])/\u$2/g; | |
21 | |
22 #help info | |
23 my $help_text = <<END | |
24 Usage: words [-dhNo] [DATASETS...] [NUMBER_OF_WORDS] | |
25 | |
26 options: | |
27 -l, --list list valid datasets | |
28 -d, --debug debugging output | |
29 -N, --dont-normalize don't normalize frequencies when combining | |
30 multiple Markov models; this has the effect | |
31 of making larger datasets more influential | |
32 -o, --target-offset change the target length offset used in the | |
33 word generation algorithm; use negative integers | |
34 for best results | |
35 END | |
36 ; | |
37 | |
38 my $list_text = <<END | |
39 valid datasets: --@{[join ' --', @options]} | |
40 default: $default_opt | |
41 END | |
42 ; | |
43 | |
44 #data from loaded files | |
45 my @loaded_data; | |
46 | |
47 #data after normalizing and combining datasets | |
48 my $grams; | |
49 my $freqs; | |
50 | |
51 #some command line options | |
52 my $debug_mode; | |
53 my $target_offset = -4; #needs testing; | |
54 my $dont_normalize; | |
55 | |
56 sub pick(%) { | |
57 my ($f) = @_; | |
58 my @c = keys %$f; | |
59 my @w = map { $f->{$_} } @c; | |
60 my $r = rand(sum(@w)); | |
61 for(0..$#c) { | |
62 return $c[$_] if $r < $w[$_]; | |
63 $r -= $w[$_]; | |
64 } | |
65 print "end of pick loop reached. returned $c[$#w]\n" if $debug_mode; | |
66 return $c[$#w]; | |
67 } | |
68 | |
69 sub get_gram { | |
70 my ($key) = @_; | |
71 ##Lazily interpolate the gram table on the fly | |
72 ##then cache the results | |
73 unless (defined $grams->{$key}) { | |
74 for(@loaded_data) { | |
75 my $data = $_->[0]; | |
76 my $g = $data->{$key} or next; | |
77 my $sum = $dont_normalize || sum(values %$g); | |
78 while( my ($c, $v) = each %$g ) { | |
79 $grams->{$key}->{$c} += $v/$sum; | |
80 } | |
81 } | |
82 } | |
83 return $grams->{$key}; | |
84 } | |
85 | |
86 sub generate { | |
87 my $target = pick($freqs) + $target_offset; | |
88 my $word = ' ' x ($n-1); | |
89 my $c; | |
90 do { | |
91 my $len = (length $word) - ($n-1); | |
92 my %ftable = %{get_gram substr($word, -$n+1, $n-1)}; | |
93 ($ftable{' '} //= 0) *= 2**($len-$target); | |
94 $c = pick \%ftable; | |
95 $word .= $c; | |
96 } while $c ne ' '; | |
97 $word =~ s/\s//g; | |
98 $word = "$word (L-T: @{[length($word) - $target]})" if $debug_mode; | |
99 return $word; | |
100 } | |
101 | |
102 sub load_dataset { | |
103 my ($mod) = @_; | |
104 push @loaded_data, retrieve ("share/WordData/$mod") or die "Unable to load $mod"; | |
105 } | |
106 | |
107 sub main { | |
108 #if (my $d = dirname $0) { chdir $d } | |
109 ##Option handling | |
110 my ($help_mode, $list_mode); | |
111 @ARGV = split /\s+/, $ARGV[0] if @ARGV == 1; | |
112 GetOptions ( | |
113 'd|debug' => \$debug_mode, | |
114 'h|help' => \$help_mode, | |
115 'l|list' => \$list_mode, | |
116 'N|dont-normalize' => \$dont_normalize, | |
117 'o|target-offset=s' => \$target_offset, | |
118 map { | |
119 my $mod=$_; | |
120 $mod =~ s/(^|-)(.)/\u$2/g; | |
121 $_, sub { load_dataset $mod }; | |
122 } @options | |
123 ) or exit 1; | |
124 return print $help_text if $help_mode; | |
125 return print $list_text if $list_mode; | |
126 ##Use the default dataset if no others were specified | |
127 load_dataset $default_dataset unless @loaded_data; | |
128 ##In the case of 1 dataset, skip normalization by copying everything | |
129 ##into the tables | |
130 if (@loaded_data == 1) { | |
131 ($grams, $freqs) = @{$loaded_data[0]}; | |
132 } | |
133 ##Otherwise, normalize and combine the length histograms. | |
134 ##The gram tables will be normalized lazily as needed (see: get_gram) | |
135 else { | |
136 for (@loaded_data) { | |
137 my $fdata = $_->[1]; | |
138 my $sum = $dont_normalize || sum(values %$fdata); | |
139 while ( my ($len, $f) = each %$fdata ) { | |
140 $freqs->{$len} += $f/$sum; | |
141 } | |
142 } | |
143 } | |
144 | |
145 ##Run word generator and print results | |
146 { | |
147 local $\ = ' '; | |
148 print generate for 1..min(25, int($ARGV[0]||1)); | |
149 } | |
150 print "\n"; | |
151 return 0; | |
152 } | |
153 | |
154 exit main unless caller; | |
155 1; |