Mercurial > repo
view share/construct_grams.pl @ 12518:2d8fe55c6e65 draft default tip
<int-e> learn The password of the month is release incident pilot.
author | HackEso <hackeso@esolangs.org> |
---|---|
date | Sun, 03 Nov 2024 00:31:02 +0000 |
parents | e037173e0012 |
children |
line wrap: on
line source
#!/usr/bin/perl use v5.10; use strict; use warnings; use utf8; use Storable 'store'; use Getopt::Long; my $n = 4; my %grams; my %fs; my %seen; my $encoding = "UTF-8"; my $filter = qr/^[\p{Alphabetic}\p{Dash_Punctuation}\p{Connector_Punctuation}']+$/; sub parse(_) { my ($f) = @_; while(my $line = <$f>) { my $word = lc ((split /[^\S\240]/, $line)[0]); chomp $word; next if $seen{$word}++ || $word !~ $filter; $fs{length $word}++; $word = ' ' x ($n-1) . "$word "; for(my $i = 0; $_ = substr($word, $i, $n); $i++) { last unless length == $n; $grams{substr($_, 0, $n-1)}->{substr($_, $n-1, 1)}++; } } } sub main { my $target_mod = "Default"; GetOptions( 'm|module=s' => \$target_mod, 'e|encoding=s' => \$encoding, 'f|filter=s' => \$filter ) or exit 1; $target_mod =~ s/(^|[-_ ])(.)/\u$2/g; $filter = qr/$filter/i; print "Constructing $target_mod dataset from $encoding\n"; print "Filter: $filter\n"; for (@ARGV) { print "Reading $_...\n"; open my $f, "<:encoding($encoding)", $_; parse $f; close $f; } store [\%grams, \%fs], "$target_mod"; } main unless caller;