annotate paste/paste.9578 @ 9285:8320c9c4620f

<oerjan> learn Umlaut is German for "hum aloud", an important feature of the German language. It is indicated by putting two dots over the vowel of the syllable.
author HackBot
date Sat, 15 Oct 2016 00:04:47 +0000
parents dcf344dc99da
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
580
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
1 #!/usr/bin/perl
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
2 use strict; use warnings;
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
3 use v5.10;
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
4 use open qw( :encoding(UTF-8) :std);
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
5 use File::Basename 'dirname';
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
6 use Storable 'retrieve';
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
7 use List::Util qw(sum min);
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
8 use Getopt::Long qw(:config gnu_getopt);
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
BEGIN {
    # Optional dependency: if Math::Random::MT::Perl can be loaded, import
    # its rand() into this package. Any failure inside the eval is swallowed
    # on purpose, so the script still runs when the module is not installed.
    my $rng_class = 'Math::Random::MT::Perl';
    eval {
        require Math::Random::MT::Perl;
        $rng_class->import('rand');
    };
    #warn "Optional module Math::Random::MT::Perl not found.\n" if $@;
}
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
15
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
# Constants

# Dataset names; each one is accepted on the command line as --NAME.
my @options = qw(
    eng-1M eng-all eng-fiction eng-gb eng-us
    french german hebrew russian spanish irish german-medical
    bulgarian catalan swedish brazilian canadian-english-insane
    manx italian ogerman portuguese polish gaelic finnish norwegian
    esolangs
);

# Gram size: generate() looks at the previous $n - 1 characters.
my $n = 4;

# Option used when no dataset is named on the command line.
my $default_opt = "--eng-1M";

# File name of the default dataset: the option without its dashes, with the
# first character of every dash-separated part upper-cased (--eng-1M -> Eng1M).
my $default_dataset = join '',
    map  { ucfirst }
    grep { length }
    split /-+/, $default_opt;
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
21
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
#help info
# Printed by main() for -h/--help. The usage line and the option list are
# kept in sync with the GetOptions() call: -l and -h are both accepted there,
# and -o takes a value.
my $help_text = <<"END";
Usage: words [-dhlNo] [DATASETS...] [NUMBER_OF_WORDS]

options:
    -h, --help             show this help text
    -l, --list             list valid datasets
    -d, --debug            debugging output
    -N, --dont-normalize   don't normalize frequencies when combining
                           multiple Markov models; this has the effect
                           of making larger datasets more influential
    -o, --target-offset N  change the target length offset used in the
                           word generation algorithm; use negative integers
                           for best results
END

# Printed by main() for -l/--list: every dataset flag plus the default one.
my $list_text = <<"END";
valid datasets: --@{[join ' --', @options]}
default: $default_opt
END
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
43
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
# Datasets read from disk, in load order. load_dataset() appends to this;
# elsewhere in the file element [0] of an entry is read as its gram tables
# and element [1] as its word-length histogram.
my @loaded_data = ();

# Working tables used by the generator after main() has set them up:
# $grams holds the gram tables (filled lazily by get_gram() when several
# datasets are combined) and $freqs the word-length histogram.
my ($grams, $freqs);

# Values controlled from the command line.
my ($debug_mode, $dont_normalize);
my $target_offset = -4;    # default value still needs testing
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
55
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
# Weighted random choice.
#
# Takes a reference to a hash mapping candidates to numeric weights and
# returns one key, chosen with probability proportional to its weight.
# If the running total never exceeds the random roll (e.g. because of
# floating point rounding), the last key is returned instead.
sub pick(%) {
    my ($table) = @_;
    my @candidates = keys %$table;
    my $roll = rand(sum(@{$table}{@candidates}));

    # Walk the candidates, consuming the roll one weight at a time.
    for my $candidate (@candidates) {
        my $weight = $table->{$candidate};
        return $candidate if $roll < $weight;
        $roll -= $weight;
    }

    my $fallback = $candidates[-1];
    print "end of pick loop reached. returned $fallback\n" if $debug_mode;
    return $fallback;
}
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
68
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
# Return the frequency table (hash ref: next character => weight) for the
# context string $key.
#
# The combined table is built on first request from every loaded dataset
# that knows the context, then cached in $grams. Each dataset's counts are
# divided by their sum unless $dont_normalize is set. Returns undef when no
# dataset has an entry for $key.
sub get_gram {
    my ($key) = @_;

    # Already cached (or copied in wholesale by main for a single dataset).
    return $grams->{$key} if defined $grams->{$key};

    for my $dataset (@loaded_data) {
        my $tables = $dataset->[0];
        my $counts = $tables->{$key} or next;

        # With --dont-normalize the divisor is the flag's own true value,
        # so the raw counts are added unchanged when that value is 1.
        my $divisor = $dont_normalize || sum(values %$counts);
        for my $char (keys %$counts) {
            $grams->{$key}->{$char} += $counts->{$char} / $divisor;
        }
    }

    return $grams->{$key};
}
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
85
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
# Generate one random word.
#
# A target length is drawn from $freqs and shifted by $target_offset. The
# word then grows one character at a time: the last $n - 1 characters select
# a frequency table via get_gram(), and pick() chooses the next character.
# The weight of ' ' (end of word) is scaled by 2**(current length - target),
# so ending becomes more likely the longer the word gets. Returns the word
# with all whitespace removed; in debug mode the difference between final
# length and target is appended.
sub generate {
    my $target  = pick($freqs) + $target_offset;
    my $context = $n - 1;

    # Start from an all-space context, i.e. "beginning of word".
    my $word = ' ' x $context;

    while (1) {
        my $len    = length($word) - $context;
        my %ftable = %{ get_gram(substr($word, -$context, $context)) };

        # Work on a copy so the cached table is not modified.
        $ftable{' '} = ($ftable{' '} // 0) * 2**($len - $target);

        my $c = pick(\%ftable);
        $word .= $c;
        last if $c eq ' ';
    }

    $word =~ s/\s+//g;
    if ($debug_mode) {
        my $delta = length($word) - $target;
        $word = "$word (L-T: $delta)";
    }
    return $word;
}
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
101
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
# Load one stored dataset and append it to @loaded_data.
#
# $mod is the dataset's file name inside share/WordData/ (resolved against
# the current working directory).
#
# Dies with "Unable to load $mod" when retrieve() returns a false value.
# NOTE(review): per the Storable documentation retrieve() may also die by
# itself on some errors; such exceptions are passed through unchanged.
#
# Returns the new number of loaded datasets (the result of push).
sub load_dataset {
    my ($mod) = @_;

    # The result has to be checked before it is pushed: written as
    # `push @list, retrieve(...) or die`, the `or` applies to push(), whose
    # return value (the element count) is always true, so the die could
    # never fire and a failed load ended up in @loaded_data.
    my $dataset = retrieve("share/WordData/$mod")
        or die "Unable to load $mod";

    return push @loaded_data, $dataset;
}
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
106
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
# Program entry point: parse the command line, load the requested datasets,
# then print up to 25 generated words separated by spaces.
#
# Returns the process exit status (0 on success, including --help and
# --list). Calls exit(1) directly when option parsing fails.
sub main {
    #if (my $d = dirname $0) { chdir $d }
    ##Option handling
    my ($help_mode, $list_mode);

    # When everything arrives as one string, split it into words here.
    # split ' ' skips leading whitespace; split /\s+/ would produce an empty
    # first field for " 5", leaving $ARGV[0] empty and the count ignored.
    @ARGV = split ' ', $ARGV[0] if @ARGV == 1;

    # Each dataset name becomes a flag whose handler loads the matching
    # file, e.g. --german-medical loads "GermanMedical".
    my @dataset_specs = map {
        my $mod = $_;
        $mod =~ s/(^|-)(.)/\u$2/g;
        ($_ => sub { load_dataset($mod) });
    } @options;

    GetOptions(
        'd|debug'           => \$debug_mode,
        'h|help'            => \$help_mode,
        'l|list'            => \$list_mode,
        'N|dont-normalize'  => \$dont_normalize,
        'o|target-offset=s' => \$target_offset,
        @dataset_specs,
    ) or exit 1;

    # Return 0 explicitly: print() returns a true value, which would become
    # a failing exit status once passed to exit().
    if ($help_mode) {
        print $help_text;
        return 0;
    }
    if ($list_mode) {
        print $list_text;
        return 0;
    }

    ##Use the default dataset if no others were specified
    load_dataset($default_dataset) unless @loaded_data;

    ##In the case of 1 dataset, skip normalization by copying everything
    ##into the tables
    if (@loaded_data == 1) {
        ($grams, $freqs) = @{$loaded_data[0]};
    }
    ##Otherwise, normalize and combine the length histograms.
    ##The gram tables will be normalized lazily as needed (see: get_gram)
    else {
        for my $dataset (@loaded_data) {
            my $fdata = $dataset->[1];
            my $sum = $dont_normalize || sum(values %$fdata);
            while ( my ($len, $f) = each %$fdata ) {
                $freqs->{$len} += $f/$sum;
            }
        }
    }

    ##Run word generator and print results
    {
        # Every print below is followed by a single space.
        local $\ = ' ';
        # First remaining argument is the word count (default 1, at most 25).
        my $count = min(25, int($ARGV[0]||1));
        print generate() for 1..$count;
    }
    print "\n";
    return 0;
}
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
153
dcf344dc99da <kallisti> run paste `which words`
HackBot
parents:
diff changeset
# Run main() and exit with its status only when this file is executed
# directly; when it is loaded by other code, just return a true value.
unless (caller) {
    exit main();
}
1;