annotate paste/paste.27827 @ 12493:885661512b17 draft

<int-e> le//rn schwartzian//In 1987, Yogurt introduced a better way to rank Schwartz users: Rather than holding an annual tournament, users would take a series of standardized tests administered by official Schwartz centers, and would then be ranked according to the results. This led to the Schwartzian transform because it allowed many more users to be ranked.
author HackEso <hackeso@esolangs.org>
date Fri, 12 Jan 2024 07:24:55 +0000
parents ccd3c0d8e14c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
579
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
1 #!/usr/bin/perl
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
2 use strict; use warnings;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
3 use v5.10;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
4 use open qw( :encoding(UTF-8) :std);
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
5 use File::Basename 'dirname';
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
6 use Storable 'retrieve';
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
7 use List::Util qw(sum min);
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
8 use Getopt::Long qw(:config gnu_getopt);
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
9 BEGIN {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
10 eval {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
11 require Math::Random::MT::Perl; Math::Random::MT::Perl->import('rand');
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
12 };
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
13 #warn "Optional module Math::Random::MT::Perl not found.\n" if $@;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
14 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
15
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
16 #constants
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
17 my @options = qw(eng-1M eng-all eng-fiction eng-gb eng-us french german hebrew russian spanish irish german-medical bulgarian catalan swedish brazilian canadian-english-insane manx italian ogerman portuguese polish gaelic finnish norwegian esolangs);
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
18 my $n = 4;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
19 my $default_opt = "--eng-1M";
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
20 (my $default_dataset = $default_opt) =~ s/(^|-+)([^-])/\u$2/g;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
21
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
22 #help info
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
23 my $help_text = <<END
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
24 Usage: words [-dhNo] [DATASETS...] [NUMBER_OF_WORDS]
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
25
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
26 options:
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
27 -l, --list list valid datasets
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
28 -d, --debug debugging output
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
29 -N, --dont-normalize don't normalize frequencies when combining
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
30 multiple Markov models; this has the effect
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
31 of making larger datasets more influential
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
32 -o, --target-offset change the target length offset used in the
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
33 word generation algorithm; use negative integers
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
34 for best results
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
35 END
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
36 ;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
37
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
38 my $list_text = <<END
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
39 valid datasets: --@{[join ' --', @options]}
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
40 default: $default_opt
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
41 END
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
42 ;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
43
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
44 #data from loaded files
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
45 my @loaded_data;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
46
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
47 #data after normalizing and combining datasets
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
48 my $grams;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
49 my $freqs;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
50
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
51 #some command line options
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
52 my $debug_mode;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
53 my $target_offset = -4; #needs testing;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
54 my $dont_normalize;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
55
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
56 sub pick(%) {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
57 my ($f) = @_;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
58 my @c = keys %$f;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
59 my @w = map { $f->{$_} } @c;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
60 my $r = rand(sum(@w));
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
61 for(0..$#c) {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
62 return $c[$_] if $r < $w[$_];
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
63 $r -= $w[$_];
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
64 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
65 print "end of pick loop reached. returned $c[$#w]\n" if $debug_mode;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
66 return $c[$#w];
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
67 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
68
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
69 sub get_gram {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
70 my ($key) = @_;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
71 ##Lazily interpolate the gram table on the fly
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
72 ##then cache the results
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
73 unless (defined $grams->{$key}) {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
74 for(@loaded_data) {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
75 my $data = $_->[0];
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
76 my $g = $data->{$key} or next;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
77 my $sum = $dont_normalize || sum(values %$g);
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
78 while( my ($c, $v) = each %$g ) {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
79 $grams->{$key}->{$c} += $v/$sum;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
80 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
81 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
82 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
83 return $grams->{$key};
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
84 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
85
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
86 sub generate {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
87 my $target = pick($freqs) + $target_offset;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
88 my $word = ' ' x ($n-1);
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
89 my $c;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
90 do {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
91 my $len = (length $word) - ($n-1);
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
92 my %ftable = %{get_gram substr($word, -$n+1, $n-1)};
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
93 ($ftable{' '} //= 0) *= 2**($len-$target);
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
94 $c = pick \%ftable;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
95 $word .= $c;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
96 } while $c ne ' ';
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
97 $word =~ s/\s//g;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
98 $word = "$word (L-T: @{[length($word) - $target]})" if $debug_mode;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
99 return $word;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
100 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
101
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
102 sub load_dataset {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
103 my ($mod) = @_;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
104 push @loaded_data, retrieve ("share/WordData/$mod") or die "Unable to load $mod";
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
105 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
106
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
107 sub main {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
108 #if (my $d = dirname $0) { chdir $d }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
109 ##Option handling
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
110 my ($help_mode, $list_mode);
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
111 @ARGV = split /\s+/, $ARGV[0] if @ARGV == 1;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
112 GetOptions (
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
113 'd|debug' => \$debug_mode,
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
114 'h|help' => \$help_mode,
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
115 'l|list' => \$list_mode,
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
116 'N|dont-normalize' => \$dont_normalize,
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
117 'o|target-offset=s' => \$target_offset,
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
118 map {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
119 my $mod=$_;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
120 $mod =~ s/(^|-)(.)/\u$2/g;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
121 $_, sub { load_dataset $mod };
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
122 } @options
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
123 ) or exit 1;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
124 return print $help_text if $help_mode;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
125 return print $list_text if $list_mode;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
126 ##Use the default dataset if no others were specified
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
127 load_dataset $default_dataset unless @loaded_data;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
128 ##In the case of 1 dataset, skip normalization by copying everything
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
129 ##into the tables
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
130 if (@loaded_data == 1) {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
131 ($grams, $freqs) = @{$loaded_data[0]};
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
132 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
133 ##Otherwise, normalize and combine the length histograms.
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
134 ##The gram tables will be normalized lazily as needed (see: get_gram)
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
135 else {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
136 for (@loaded_data) {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
137 my $fdata = $_->[1];
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
138 my $sum = $dont_normalize || sum(values %$fdata);
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
139 while ( my ($len, $f) = each %$fdata ) {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
140 $freqs->{$len} += $f/$sum;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
141 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
142 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
143 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
144
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
145 ##Run word generator and print results
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
146 {
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
147 local $\ = ' ';
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
148 print generate for 1..min(25, int($ARGV[0]||1));
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
149 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
150 print "\n";
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
151 return 0;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
152 }
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
153
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
154 exit main unless caller;
ccd3c0d8e14c <nortti_> run cat `which words` | paste
HackBot
parents:
diff changeset
155 1;