Mercurial > repo
comparison perl-5.22.2/regen/ebcdic.pl @ 8045:a16537d2fe07
<xfix> tar xf perl-5.22.2.tar.gz # Ah, whatever, I\'m doing it anyway
author | HackBot |
---|---|
date | Sat, 14 May 2016 14:54:38 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
8044:711c038a7dce | 8045:a16537d2fe07 |
---|---|
1 use v5.16.0; | |
2 use strict; | |
3 use warnings; | |
4 require 'regen/regen_lib.pl'; | |
5 require 'regen/charset_translations.pl'; | |
6 | |
7 # Generates the EBCDIC translation tables that were formerly hard-coded into | |
8 # utfebcdic.h | |
9 | |
10 my $out_fh = open_new('ebcdic_tables.h', '>', | |
11 {style => '*', by => $0, }); | |
12 | |
13 sub output_table ($$) { | |
14 my $table_ref = shift; | |
15 my $name = shift; | |
16 | |
17 # Tables in hex easier to debug, but don't fit into 80 columns | |
18 my $print_in_hex = 0; | |
19 | |
20 die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256; | |
21 | |
22 print $out_fh "EXTCONST U8 $name\[\] = {\n"; | |
23 | |
24 print $out_fh "/* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F */\n" if $print_in_hex; | |
25 for my $i (0 .. 255) { | |
26 if ($print_in_hex) { | |
27 printf $out_fh "/* %X_ */ ", $i / 16 if $i % 16 == 0; | |
28 printf $out_fh " 0x%02X", $table_ref->[$i]; | |
29 } | |
30 else { | |
31 printf $out_fh "%4d", $table_ref->[$i]; | |
32 } | |
33 printf $out_fh " /* %X_ */", $i / 16 if $print_in_hex && $i % 16 == 15; | |
34 print $out_fh ",", if $i < 255; | |
35 print $out_fh "\n" if $i % 16 == 15; | |
36 } | |
37 print $out_fh "/* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F */\n" if $print_in_hex; | |
38 print $out_fh "};\n\n"; | |
39 } | |
40 | |
41 print $out_fh <<END; | |
42 | |
43 #ifndef H_EBCDIC_TABLES /* Guard against nested #includes */ | |
44 #define H_EBCDIC_TABLES 1 | |
45 | |
46 /* This file contains definitions for various tables used in EBCDIC handling. | |
47 * More info is in utfebcdic.h */ | |
48 END | |
49 | |
50 my @charsets = get_supported_code_pages(); | |
51 shift @charsets; # ASCII is the 0th, and we don't deal with that here. | |
52 foreach my $charset (@charsets) { | |
53 # we process the whole array several times, make a copy | |
54 my @a2e = @{get_a2n($charset)}; | |
55 | |
56 print $out_fh "\n" . get_conditional_compile_line_start($charset); | |
57 print $out_fh "\n"; | |
58 | |
59 print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n"; | |
60 output_table(\@a2e, "PL_a2e"); | |
61 | |
62 { # Construct the inverse | |
63 my @e2a; | |
64 for my $i (0 .. 255) { | |
65 $e2a[$a2e[$i]] = $i; | |
66 } | |
67 print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n"; | |
68 output_table(\@e2a, "PL_e2a"); | |
69 } | |
70 | |
71 my @i82utf = @{get_I8_2_utf($charset)}; | |
72 print $out_fh <<END; | |
73 /* (Confusingly named) Index is $charset I8 byte; value is | |
74 * $charset UTF-EBCDIC equivalent */ | |
75 END | |
76 output_table(\@i82utf, "PL_utf2e"); | |
77 | |
78 { #Construct the inverse | |
79 my @utf2i8; | |
80 for my $i (0 .. 255) { | |
81 $utf2i8[$i82utf[$i]] = $i; | |
82 } | |
83 print $out_fh <<END; | |
84 /* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is | |
85 * $charset I8 equivalent */ | |
86 END | |
87 output_table(\@utf2i8, "PL_e2utf"); | |
88 } | |
89 | |
90 { | |
91 my @utf8skip; | |
92 | |
93 # These are invariants or continuation bytes. | |
94 for my $i (0 .. 0xBF) { | |
95 $utf8skip[$i82utf[$i]] = 1; | |
96 } | |
97 | |
98 # These are start bytes; The skip is the number of consecutive highest | |
99 # order 1-bits (up to 7) | |
100 for my $i (0xC0 .. 255) { | |
101 my $count; | |
102 if (($i & 0b11111110) == 0b11111110) { | |
103 $count= 7; | |
104 } | |
105 elsif (($i & 0b11111100) == 0b11111100) { | |
106 $count= 6; | |
107 } | |
108 elsif (($i & 0b11111000) == 0b11111000) { | |
109 $count= 5; | |
110 } | |
111 elsif (($i & 0b11110000) == 0b11110000) { | |
112 $count= 4; | |
113 } | |
114 elsif (($i & 0b11100000) == 0b11100000) { | |
115 $count= 3; | |
116 } | |
117 elsif (($i & 0b11000000) == 0b11000000) { | |
118 $count= 2; | |
119 } | |
120 else { | |
121 die "Something wrong for UTF8SKIP calculation for $i"; | |
122 } | |
123 $utf8skip[$i82utf[$i]] = $count; | |
124 } | |
125 | |
126 print $out_fh <<END; | |
127 /* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes; | |
128 * 1 for continuation. Adapted from the shadow flags table in tr16. The | |
129 * entries marked 9 in tr16 are continuation bytes and are marked as length 1 | |
130 * here so that we can recover. */ | |
131 END | |
132 output_table(\@utf8skip, "PL_utf8skip"); | |
133 } | |
134 | |
135 use feature 'unicode_strings'; | |
136 | |
137 { | |
138 my @lc; | |
139 for my $i (0 .. 255) { | |
140 $lc[$a2e[$i]] = $a2e[ord lc chr $i]; | |
141 } | |
142 print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n"; | |
143 output_table(\@lc, "PL_latin1_lc"); | |
144 } | |
145 | |
146 { | |
147 my @uc; | |
148 for my $i (0 .. 255) { | |
149 my $uc = uc chr $i; | |
150 if (length $uc > 1 || ord $uc > 255) { | |
151 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; | |
152 } | |
153 $uc[$a2e[$i]] = $a2e[ord $uc]; | |
154 } | |
155 print $out_fh <<END; | |
156 /* Index is $charset code point; value is its uppercase equivalent. | |
157 * The 'mod' in the name means that codepoints whose uppercase is above 255 or | |
158 * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */ | |
159 END | |
160 output_table(\@uc, "PL_mod_latin1_uc"); | |
161 } | |
162 | |
163 { # PL_fold | |
164 my @ascii_fold; | |
165 for my $i (0 .. 255) { # Initialise to identity map | |
166 $ascii_fold[$i] = $i; | |
167 } | |
168 | |
169 # Overwrite the entries that aren't identity | |
170 for my $chr ('A' .. 'Z') { | |
171 $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr]; | |
172 } | |
173 for my $chr ('a' .. 'z') { | |
174 $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr]; | |
175 } | |
176 print $out_fh <<END; | |
177 /* Index is $charset code point; For A-Z, value is a-z; for a-z, value | |
178 * is A-Z; all other code points map to themselves */ | |
179 END | |
180 output_table(\@ascii_fold, "PL_fold"); | |
181 } | |
182 | |
183 { | |
184 my @latin1_fold; | |
185 for my $i (0 .. 255) { | |
186 my $char = chr $i; | |
187 my $lc = lc $char; | |
188 | |
189 # lc and uc adequately proxy for fold-case pairs in this 0-255 | |
190 # range | |
191 my $uc = uc $char; | |
192 $uc = $char if length $uc > 1 || ord $uc > 255; | |
193 if ($lc ne $char) { | |
194 $latin1_fold[$a2e[$i]] = $a2e[ord $lc]; | |
195 } | |
196 elsif ($uc ne $char) { | |
197 $latin1_fold[$a2e[$i]] = $a2e[ord $uc]; | |
198 } | |
199 else { | |
200 $latin1_fold[$a2e[$i]] = $a2e[$i]; | |
201 } | |
202 } | |
203 print $out_fh <<END; | |
204 /* Index is $charset code point; value is its other fold-pair equivalent | |
205 * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is | |
206 * the code point itself */ | |
207 END | |
208 output_table(\@latin1_fold, "PL_fold_latin1"); | |
209 } | |
210 | |
211 print $out_fh get_conditional_compile_line_end(); | |
212 } | |
213 | |
214 print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n"; | |
215 | |
216 read_only_bottom_close_and_rename($out_fh); |