# HG changeset patch # User HackEso # Date 1563586200 0 # Node ID 766bdea3dbe936bb174597c3987c07440ac4e1a6 # Parent 844290bdd2ef167484250d2aad3b664f5602bbaf fetch http://nethack4.org/esolangs/7/7.pl diff -r 844290bdd2ef -r 766bdea3dbe9 7.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/7.pl Sat Jul 20 01:30:00 2019 +0000 @@ -0,0 +1,622 @@ +#!/usr/bin/env perl +use utf8; +q{ + +=pod + +=encoding UTF8 + +=head1 NAME + +7 - an esoteric programming language + +=head1 SYNOPSIS + +./7 program.7 # run +./7 -d program.7 # debug + +=head1 DESCRIPTION + +7 is an esoteric programming language, inspired by Underload (although +it has diverged somewhat), but aiming for small source code +representation. + +The language operates using two main pieces of internal state: the +I, which is a vaguely stack-like construct that holds data; and +the I. The frame consists of a sequence of commands +separated by I; the command list is also a sequence of commands. + +The program operates by repeatedly removing the first command from the +command list, then performing the action specified by that command on +the frame. Once the command list is empty, the portion of the frame +to the right of the last bar is copied to the start of the command +list; this operation is called "cycling". (Cycling exits the program +if the frame is completely empty, and is an error if the frame is +nonempty but has no bars; also, as a special case, it deletes the +rightmost bar if nothing is to its right, in order to avoid a trivial +infinite loop.) These are the only ways in which the command list can +be modified. + +The frame is a "working space" for the program, and so can be modified +in rather more interesting ways than the command list. However, +typically only the section to the right of the rightmost bar is +modified, so operations tend to be fairly local and self-contained. + +=head2 Frame operations + +There is a range of basic operations that can be performed on the +frame: + +=over 4 + +=item * + +You can I something to the frame, either a bar or a command. +This adds the bar or command in question to the right of the frame. + +=item * + +You can I the frame, removing the rightmost bar ("closing the +gap" where the bar was in the process). + +=item * + +You can I a section of the frame to the end. A section is the +region of the frame between two bars, or from a bar to an end of the +frame. When copying a section, a bar is placed to separate the copy +from the rest of the frame. + +=item * + +You can I a section of the frame to the end. This is like +copying, except that original section (together with one of the bars +between it and the adjacent sections) is removed. + +=item * + +You can I the end of the frame, again from the rightmost bar +onwards. This will delete the bar in the process. + +=item * + +You can I the end of the frame to the user. This outputs the +area strictly beyond the rightmost bar. (Bars themselves are never +output, because they have no representation as a sequence of bits.) + +=item * + +You can I the end of the frame, beyond the rightmost bar; this +is the most complex operation. To understand this, the first thing to +note is that the commands come in pairs; each I command is +paired with a I command. Additionally, some commands have +names, and some do not have names, We can then describe the process as +follows: + +=over 4 + +=item 1. + +The "passive regions" of the section are identified. These are +regions in which all the commands are named, and for which C<7> and +C<6> commands are correctly matched, as though they were parentheses; +additionally, the substring C<76> cannot appear in a passive region. +Passive regions are made as long as possible while respecting these +rules. + +=item 2. + +Each passive region is enclosed between a C<7> and C<6> command. + +=item 3. + +Each command outside any passive region (which must by definition be +active, as all anonymous commands are active, as are C<6> and C<7>) is +converted to the corresponding passive command. + +=back + +=back + +=head2 Commands + +Now that we've defined the basic frame operations, it's possible to +define the commands. + +Eight of the twelve commands have numerical names, used in the source +code representation: the six passive commands, and two active +commands. The other four active commands have no numerical name (and +cannot appear in the source), and are given names in English to +identify them. + +=over 4 + +=item C<6> (active), C<0> (passive): Pacify then disbar + +The C<6> command pacifies the rightmost section of the frame, and then +the frame is disbarred. The passive equivalent C<0> appends a C<6> +command to the frame. + +=item C<7> (active), C<1> (passive): Append a bar + +The C<7> command appends a bar to the frame. The passive equivalent +C<1> appends a C<7> command to the frame. + +=item "copy" (active), C<2> (passive): Copy the rightmost section + +The "copy" command copies the rightmost section of the frame to the +end of the frame. The passive equivalent C<2> appends a "copy" +command to the frame. + +=item "recycle" (active) C<5> (passive): Cycle early, then delete + +The "recycle" command cycles a copy of the rightmost section of the +frame onto the command list, even if the command list isn't empty. +After doing that, it deletes the rightmost section of the frame +(including the bar that separates it from the rest of the frame). The +passive equivalent C<5> appends a "recycle" command to the frame. + +=item "grab" (active), C<4> (passive): Swap, with extra bars + +The "grab" command appends an extra bar to the frame then moves the +new third-rightmost section (the original second-rightmost section) of +the frame to the end of the frame. The passive equivalent C<4> +appends a "grab" command to the frame. + +=item "eject" (active), C<3> (passive): Output, then delete twice + +The "eject" command outputs the end of the frame (beyond the rightmost +bar) to the user, then deletes the end of the frame from the +second-rightmost bar onwards (i.e. deletes the end of the frame +twice). Exception: if the section of the frame that would be output +contains any unnamed commands, it is pacified first (pacification +leaves a region with only named commands), and a C<7> is prepended to +the output. The passive equivalent C<3> appends an "eject" command to +the frame. + +This command is also capable of producing input in some circumstances +(via outputting special codes which mean "read input"). Although the +input can be in more than one form, it's always provided to the +program as a nonnegative integer, and affects the program like this: +after the deletions from the frame, the new rightmost section of the +frame (not including the bar to its left) has its content repeated a +number of times equal to the input (e.g. 0 would remove everything to +the right of the rightmost bar, 1 would do nothing, 2 would add an +extra copy of the text to the rightmost bar, and so on.) When reading +in integers from the user, any leading whitespace and commas are +skipped, then a decimal integer is read, with any stray characters +"after" the integer being left for future input commands; if there's +no integer available, 0 will be returned (this is also a valid +integer, so you may want to alternate between character and integer +reading to detect invalid input and EOF). When reading in characters, +they're converted to a character code, and 1 is added to them (EOF +counts as character code -1, and thus will produce 0 copies). + +The very first command output via this command does not produce +output, but rather specifies the format of future output: + +=over 4 + +=item Numerical (C<0>) + +In each command list output, a value is calculated like this: the +total number of C<7> and C<1> commands, minus the total number of C<6> +and C<0> commands. Exception: if the output was automatically +pacified (and thus had a C<7> prepended), that C<7> does not count in +the total count of C<7> commands. This integer is converted to +decimal and output. + +Formatting can be controlled via outputting commands other than +strings of C<0>, C<1>, C<6>, C<7>. The output will be preceded by a +space, unless it's the first output on the line. A C<2> at the end of +the string will output a trailing newline. A C<3> at the end of the +string will append a comma to it. C<4> and C<5> can be placed at +either end of the string, and will turn into opening and closing +square brackets respectively. + +Input can be obtained via attempting to output C<23> (numerical input) +or C<24> (character input). Outputting C<25> allows selection of a +new output format (basically by "resetting" the format, meaning that +the next command to be output will select a format again). + +=item Byte-per-command (C<1>) + +Each command is output as an individual byte (thus the only bytes +output will be the digits C<0> to C<7> inclusive, as those are the +only bytes that name commands). This format does not support input. + +=item Character code as number (C<2>) + +This is similar to the numerical output C<0> explained above, except +that 1 is subracted from the number, and it's then used as a character +code (rather than being output in decimal). By default, this is in +binary (i.e. octets mod 256), but this can be changed by attempting to +output a negative number; C<-1> selects binary, C<-2> selects UTF-8, +C<-3> inputs a number, C<-4> inputs a character, and C<-5> allows +selection of a new output format. Commands other than C<0>, C<1>, +C<6>, and C<7> aren't recognised here. + +=item Byte-per-three-commands (C<3>) + +Any C<6> and C<7> commands in the output are ignored. The others are +taken as triples and interpreted as base-6 numbers (as usual, in a +big-endian way), giving a value from 0 to 215 (decimal), 0 to 555 +(base 6). This is is encoded as follows: if the value is 127 +(decimal) or lower (that's 331 in base 6), it's output as an octet +directly; if the value is in the range 128 to 191 (decimal) inclusive +(that's 332 to 515 in base 6), then 64 (decimal), which is 144 (base +6), might or might not be added to it. Values above 192 (decimal) or +520 (base 6) control this addition; outputting 522 (the default) +requests the addition to happen in a way that makes the output look as +much like UTF-8 as possible; 520 specifies never adding (until this +condition is changed); 521 specifies always adding. You can also +request input, with 523 requesting an integer and 524 requesting a +character; 525 allows selection of a new output format. Other values +are currently reserved for future expansion and should not be used. + +=item Truncated octal (C<4>) + +The output is encoded using the octal source encoding. However, if it +does not come to a whole number of bytes, any spare bits are discarded +(as opposed to padding them out, as would for example happen with the +source format C<7> when the source were in binary). This format does +not support input. + +=item US-TTY (C<5>) + +Any C<6> and C<7> commands in the output are ignored. The others are +taken in pairs and interpreted as base-6 numbers (as usual, in a +big-endian way), giving a value from 0 to 35 (decimal), 0 to 55 (base +6). These are then in turn interpreted as being in an expanded +version of the US-TTY character set. This is a 5-bit character set +(thus using values from 0 to 31 (decimal), 0 to 51 (base 6)); the +remaining four codes are reserved for giving instructions to the 7 +interpreter itself. It can encode more than just 31 characters via +the use of shift states; it has five different states, notated here as +Ł, ł, Ŧ, Ø, ø, with Ł being the default. (Ł and Ŧ are identical, +except that immediately after outputting any character in state Ŧ, the +shift state becomes ł; other shift states are stable until changed +explicitly. State ø is an expansion based on CLC-INTERCAL's version of +Baudot, a very similar character set to US-TTY; US-TTY reserves the +state for use by expansions but does not specify what form those might +take. Note that the tables below are not I the same as +CLC-INTERCAL's, although they are very close; however, the only +differences are in the meanings of some shift codes, and in the fact +that C<#> appears in two places in order to improve compatibility with +existing US-TTY and CLC-INTERCAL-encoded text.) The character (or +control sequence) encoded depends on the shift state and the sequence +seen. + +Here's how the character set looks (shift state down the left side, +character encoding (first then second command) at the top, Ŧ encodes +the same characters as Ł): + + 00000011111122222233333344444455 + 01234501234501234501234501234501 + + Ł E A SIU DRJNFCKTZLWHYPQOBGØMXVł + ł e a siu drjnfcktzlwhypqobgØmxvŦ + Ø 3 - 87 $4',!:(5")2#6019?&ø./;Ł + ø ¢ + \#= *{~∀|^<[}>] @ £¬ Ø%_ Ł + +C<00> encodes NUL, C<02> encodes newline, C<04> encodes space, C<12> +encodes carriage return, C<05> in state Ø encodes BEL, C<32> in state +ø encodes backspace, C<41> in state ø encodes DEL, other blank squares +are currently undefined and should not be used. + +As for the remaining codes, C<53> requests input of an integer, C<54> +requests input of a character, and C<55> allows selection of a new +output format. C<52> is currently undefined and should not be used. + +=item Source (C<7>) + +The output is encoded the same way that the source was (and, if +necessary, will be padded with C<1> bits; note that this padding only +occurs if the program ends "naturally", not if it ends abruptly due to +ejecting beyond the bottom of the stack). + +=item Other values + +Other values are currently undefined and reserved for future extension. + +=back + +If an attempt is made to delete more sections than exist on the frame +via use of this command, the program exits immediately (although any +output will still be produced). (A program also exits if the frame +and command list are both empty.) + +=back + +=head2 Source Encoding + +A 7 source file is written via the use of the eight named commands; +the four anonymous commands cannot appear in the original +program. These form the initial command list; the initial frame +consists of two bars (with no commands before, after, or between +them). Additionally, it is not legal for a program to end with a C<7> +command. A file can either be written in ASCII, using one byte per +command, or in octal, using three bits per command. In the latter +case, when storing the program in a file, the commands are written in +such a way that the bit sequence of the commands and of the file are +the same when viewed in a big-endian way. If there's a need to pad +the end of the program, e.g. to make it round up to a multiple of 8 +bits, the program can be padded by appending any number of C<1> bits +(the fact that programs may not end with a C<7> command makes this +unambiguous). Similarly, any number of consecutive C<1> bits at the +end of the file can be deleted (they're implied into the file when a +partial command is seen). + +=cut + +} or exit; + +use warnings; +use strict; +use feature qw/state/; +use Encode qw/encode/; +use 5.012; + +my $debug; + +while (@ARGV && $ARGV[0] =~ /^-/) { + my $option = shift @ARGV; + $option eq '-d' ? $debug = 1 : + $option eq '-' ? last : + die "Unknown option $option"; +} + +my %usttytable = ( + # 0 0 0000 11 11 1122222 233 333344 444455 + # 0 1 2345 01 23 4501234 501 234501 234501 + 'Ł' => "\0"."E\nA S" ."IU\rD". "RJNFCKT".'ZLW'. "HYPQOB". "GØMXVł", + 'ł' => "\0"."e\na s" ."iu\rd". "rjnfckt".'zlw'. "hypqob". "gØmxvŦ", + 'Ø' => "\0"."3\n- \a"."87\r\$"."4',!:(5".'")2'. "#6019?". "&ø./;Ł", + 'ø' => "\0"."¢\n+ \\"."#=\r*". "{~∀|^<[".'}>]'."\b@—£¬\x7F"."—Ø%_—Ł"); +$usttytable{'Ŧ'} = $usttytable{'Ł'}; + +$| = 1; undef $/; +binmode STDIN; +# Note: we leave STDOUT in text mode until the user selects an output format +# that clearly requires binary. +my $program = <>; +my $program_was_binary = 0; +my $bits_per_byte = length unpack "B*", "\0"; + +if ($program =~ y/01234567 \n\r\t\f//c) { + # If the program contains anything other than digits, spaces, and + # \n,\r,\t,\f, then unpack it from binary. + my $binary = unpack "B*", $program; + $binary =~ s/1+$//; + $binary .= 1 while (length $binary) % 3; + $program = ""; + $program .= oct "0b$&" while $binary =~ s/^...//; + $program_was_binary = 1; +} else { + $program =~ s/\s//g; +} + +# Technically we should start with the program on the command list, but +# it's easier to start it on the frame and do an explicit-cycle in order +# to move it into place. +my $frame = "|||$program"; +my $delayed_frame_copy = 1; +my @commandlist = (); + +my $outformat = undef; + +sub pacify { + local $_ = shift; + s/((7(?2)+6|[0-5])+)/"7${1}6"=~y:67:89:r/ge; + y/67cegr89/0-7/; + $_; +} + +my $pending_octal = ""; +my $getc_buffer = undef; + +sub getin { + if ($getc_buffer) { + my $c = $getc_buffer; + undef $getc_buffer; + $debug and printf STDERR "\tRereading character code $c\n"; + return $c; + } else { + my $c = getc; + $c = defined $c ? (ord $c) + 1 : 0; + return $c; + } +} + +sub input_number { + my $c; + my $n = 0; + while (($c = getin) && (chr $c - 1) =~ /\s/) {} + if (!$c) { + $debug and print STDERR "\tRead EOF, expecting number\n"; + $delayed_frame_copy = 0; + return; + } + while ((chr $c - 1) =~ /(\d)/) { + $n *= 10; + $n += $1; + $c = getin; + !$c and last; + } + $getc_buffer = $c if $c; + $debug and print STDERR "\tRead integer value $n\n"; + $delayed_frame_copy = $n; +} + +sub input_char { + my $c = getin; + $debug and print STDERR (!$c ? "\tRead EOF, expecting character\n" : + "Read character code ".($c - 1)); + $delayed_frame_copy = $c; +} + +sub oprint { + local $_ = shift; + my $autopacify = /\D/; + state $after_newline = 1; + $autopacify and $_ = "7" . pacify $_; + length $_ or return; + + unless (defined $outformat) { + s/(.)//; + $outformat = $1; + $outformat == 7 and !$program_was_binary and $outformat = 1; + $debug and print STDERR "\tSelected output format $outformat\n"; + } + [ + sub { # 0: numerical + $_ eq '23' and input_number, return; + $_ eq '24' and input_char, return; + $_ eq '25' and (undef $outformat), return; + my $result = y/71//; + $result -= y/60//; + $result-- if $autopacify; + print " " unless $after_newline; + $after_newline = !!/2$/; + s/^[45]+// and print ($& =~ y/45/[]/r); + print $result; + s/[2345]+$// and print ($& =~ y/2345/\n,[]/r); + }, + sub { # 1: byte per command + print; + $after_newline = 0; + }, + sub { # 2: read numbers, write character codes + binmode STDOUT; + state $utf8mode = 0; + my $result = y/71//; + $result -= y/60//; + $result-- if $autopacify; + if ($result > 0) { + print $utf8mode ? encode("UTF-8", chr($result - 1)) : + encode("ISO-8859-1", chr ($result - 1)); + } else { + $result == -1 and ($utf8mode = 0), return; + $result == -2 and ($utf8mode = 1), return; + $result == -3 and input_number, return; + $result == -4 and input_char, return; + $result == -5 and (undef $outformat), return; + $result == 0 and die "Attempted to output EOF"; + die "Attempted to output negative character code $result"; + } + }, + sub { # 3: byte per three commands + state $partial = ""; + state $add64 = 194; + state $utf8bytes = 0; + binmode STDOUT; + s/[67]//g; + $partial .= $_; + while ($partial =~ s/^(.)(.)(.)//) { + my $v = $1*36 + $2 * 6 + $3; + if ($v < 128) { + print chr $v; + $utf8bytes = 0; + } elsif ($v < 192) { + if ($add64 == 192 || ($add64 == 194 && $utf8bytes)) { + print chr $v; + $utf8bytes and $utf8bytes--; + } elsif ($add64 == 193 || ($add64 == 194 && !$utf8bytes)) { + $v += 64; + print chr $v; + $utf8bytes = 1; # always at least 1 trailing byte + $v >= 0xE0 and ++$utf8bytes; + $v >= 0xF0 and ++$utf8bytes; + $v >= 0xF8 and ++$utf8bytes; + $v >= 0xFC and ++$utf8bytes; + } + } elsif ($v < 195) { + $add64 = $v; + } elsif ($v == 195) { + input_number; + } elsif ($v == 196) { + input_char; + } elsif ($v == 197) { + undef $outformat; + } else { + die "Unknown byte-per-three-commands value $v"; + } + } + }, + sub { # 4: octal + state $partial = ""; + binmode STDOUT; + $partial .= sprintf "%03b", $& while s/%.//; + print chr oct "0b$&" while $partial =~ s/^.{$bits_per_byte}//; + }, + sub { # 5: US-TTY + s/[67]//g; + state $partial = ""; + state $shiftstate = 'Ł'; + $partial .= $_; + while ($partial =~ s/^(.)(.)//) { + my $v = $1 * 6 + $2; + $v == 33 and input_number, return; + $v == 34 and input_char, return; + $v == 35 and (undef $outformat), return; + my $c = substr $usttytable{$shiftstate}, $v, 1; + if ($usttytable{$c}) { + $shiftstate = $c; + } elsif ($c eq '—') { # represents an invalid character + ...; + } else { + # TODO: Adapt to locale encoding? + print encode("UTF-8",$c); + $shiftstate eq 'Ŧ' and $shiftstate = 'ł'; + } + } + }, + sub { # 6: Unimplemented + die "Output format 6 is currently undefined"; + }, + sub { # 7: Like source; if we get here, it's octal encoding + binmode STDOUT; + $pending_octal .= sprintf "%03b", $& while s/^.//; + print chr oct "0b$&" while $pending_octal =~ s/^.{$bits_per_byte}//; + }, + ]->[$outformat]->(); + $debug and print STDERR "\n"; +} + +my $explicit = 1; +CYCLE: while ($frame ne '') { + $frame =~ /[|](\w*)$/ + or die "when cycling, frame is nonempty but has no bar"; + my $commandlist = $1; + $commandlist eq '' and $explicit = 1; + $explicit and $frame =~ s/[|](\w*)$//; + $explicit = 0; + unshift @commandlist, split /([0-57]*)/, $commandlist; + while (@commandlist) { + if ($delayed_frame_copy != 1) { + $frame =~ s/[|]\K(\w*)$/$1 x $delayed_frame_copy/e + || die "Attempt to read input with no bar in the frame"; + $delayed_frame_copy = 1; + } + $debug and print STDERR "\t$frame ", @commandlist, "\n"; + my $command = shift @commandlist; + length $command or next; + for ($command) { + /[0-57]/ and $frame .= $command =~ y/0-57/67cegr|/r; + /6/ and $frame =~ s/[|](\w*)$/pacify $1/e + || die "6 command run with no bar in the frame"; + /c/ and $frame =~ s/[|](\w*)$/|$1|$1/ + || die "'copy' command run with no bar in the frame"; + /e/ and $frame =~ s/[|]\w*[|](\w*)$/(oprint $1), ""/e + || $frame =~ s/(\w*)$/ (oprint $1), exit/e; + /g/ and $frame =~ s/(\w*)[|](\w*)$/$2||$1/ + || die "'grab' command run with no bar in the frame"; + /r/ and $explicit = 1 and next CYCLE; + } + } +} + +$pending_octal =~ s/1+$//; +if (length $pending_octal) { + $pending_octal .= 1 until length $pending_octal == $bits_per_byte; + print chr oct "0b$pending_octal"; +}