Mercurial > repo
view perl-5.22.2/regen/ebcdic.pl @ 8045:a16537d2fe07
<xfix> tar xf perl-5.22.2.tar.gz # Ah, whatever, I\'m doing it anyway
author | HackBot |
---|---|
date | Sat, 14 May 2016 14:54:38 +0000 |
parents | |
children |
line wrap: on
line source
use v5.16.0; use strict; use warnings; require 'regen/regen_lib.pl'; require 'regen/charset_translations.pl'; # Generates the EBCDIC translation tables that were formerly hard-coded into # utfebcdic.h my $out_fh = open_new('ebcdic_tables.h', '>', {style => '*', by => $0, }); sub output_table ($$) { my $table_ref = shift; my $name = shift; # Tables in hex easier to debug, but don't fit into 80 columns my $print_in_hex = 0; die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256; print $out_fh "EXTCONST U8 $name\[\] = {\n"; print $out_fh "/* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F */\n" if $print_in_hex; for my $i (0 .. 255) { if ($print_in_hex) { printf $out_fh "/* %X_ */ ", $i / 16 if $i % 16 == 0; printf $out_fh " 0x%02X", $table_ref->[$i]; } else { printf $out_fh "%4d", $table_ref->[$i]; } printf $out_fh " /* %X_ */", $i / 16 if $print_in_hex && $i % 16 == 15; print $out_fh ",", if $i < 255; print $out_fh "\n" if $i % 16 == 15; } print $out_fh "/* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F */\n" if $print_in_hex; print $out_fh "};\n\n"; } print $out_fh <<END; #ifndef H_EBCDIC_TABLES /* Guard against nested #includes */ #define H_EBCDIC_TABLES 1 /* This file contains definitions for various tables used in EBCDIC handling. * More info is in utfebcdic.h */ END my @charsets = get_supported_code_pages(); shift @charsets; # ASCII is the 0th, and we don't deal with that here. foreach my $charset (@charsets) { # we process the whole array several times, make a copy my @a2e = @{get_a2n($charset)}; print $out_fh "\n" . get_conditional_compile_line_start($charset); print $out_fh "\n"; print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n"; output_table(\@a2e, "PL_a2e"); { # Construct the inverse my @e2a; for my $i (0 .. 255) { $e2a[$a2e[$i]] = $i; } print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n"; output_table(\@e2a, "PL_e2a"); } my @i82utf = @{get_I8_2_utf($charset)}; print $out_fh <<END; /* (Confusingly named) Index is $charset I8 byte; value is * $charset UTF-EBCDIC equivalent */ END output_table(\@i82utf, "PL_utf2e"); { #Construct the inverse my @utf2i8; for my $i (0 .. 255) { $utf2i8[$i82utf[$i]] = $i; } print $out_fh <<END; /* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is * $charset I8 equivalent */ END output_table(\@utf2i8, "PL_e2utf"); } { my @utf8skip; # These are invariants or continuation bytes. for my $i (0 .. 0xBF) { $utf8skip[$i82utf[$i]] = 1; } # These are start bytes; The skip is the number of consecutive highest # order 1-bits (up to 7) for my $i (0xC0 .. 255) { my $count; if (($i & 0b11111110) == 0b11111110) { $count= 7; } elsif (($i & 0b11111100) == 0b11111100) { $count= 6; } elsif (($i & 0b11111000) == 0b11111000) { $count= 5; } elsif (($i & 0b11110000) == 0b11110000) { $count= 4; } elsif (($i & 0b11100000) == 0b11100000) { $count= 3; } elsif (($i & 0b11000000) == 0b11000000) { $count= 2; } else { die "Something wrong for UTF8SKIP calculation for $i"; } $utf8skip[$i82utf[$i]] = $count; } print $out_fh <<END; /* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes; * 1 for continuation. Adapted from the shadow flags table in tr16. The * entries marked 9 in tr16 are continuation bytes and are marked as length 1 * here so that we can recover. */ END output_table(\@utf8skip, "PL_utf8skip"); } use feature 'unicode_strings'; { my @lc; for my $i (0 .. 255) { $lc[$a2e[$i]] = $a2e[ord lc chr $i]; } print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n"; output_table(\@lc, "PL_latin1_lc"); } { my @uc; for my $i (0 .. 255) { my $uc = uc chr $i; if (length $uc > 1 || ord $uc > 255) { $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; } $uc[$a2e[$i]] = $a2e[ord $uc]; } print $out_fh <<END; /* Index is $charset code point; value is its uppercase equivalent. * The 'mod' in the name means that codepoints whose uppercase is above 255 or * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */ END output_table(\@uc, "PL_mod_latin1_uc"); } { # PL_fold my @ascii_fold; for my $i (0 .. 255) { # Initialise to identity map $ascii_fold[$i] = $i; } # Overwrite the entries that aren't identity for my $chr ('A' .. 'Z') { $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr]; } for my $chr ('a' .. 'z') { $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr]; } print $out_fh <<END; /* Index is $charset code point; For A-Z, value is a-z; for a-z, value * is A-Z; all other code points map to themselves */ END output_table(\@ascii_fold, "PL_fold"); } { my @latin1_fold; for my $i (0 .. 255) { my $char = chr $i; my $lc = lc $char; # lc and uc adequately proxy for fold-case pairs in this 0-255 # range my $uc = uc $char; $uc = $char if length $uc > 1 || ord $uc > 255; if ($lc ne $char) { $latin1_fold[$a2e[$i]] = $a2e[ord $lc]; } elsif ($uc ne $char) { $latin1_fold[$a2e[$i]] = $a2e[ord $uc]; } else { $latin1_fold[$a2e[$i]] = $a2e[$i]; } } print $out_fh <<END; /* Index is $charset code point; value is its other fold-pair equivalent * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is * the code point itself */ END output_table(\@latin1_fold, "PL_fold_latin1"); } print $out_fh get_conditional_compile_line_end(); } print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n"; read_only_bottom_close_and_rename($out_fh);