comparison perl-5.22.2/regen/unicode_constants.pl @ 8045:a16537d2fe07

<xfix> tar xf perl-5.22.2.tar.gz # Ah, whatever, I\'m doing it anyway
author HackBot
date Sat, 14 May 2016 14:54:38 +0000
parents
children
comparison
equal deleted inserted replaced
8044:711c038a7dce 8045:a16537d2fe07
1 use v5.16.0;
2 use strict;
3 use warnings;
4 require 'regen/regen_lib.pl';
5 require 'regen/charset_translations.pl';
6 use charnames qw(:loose);
7
8 my $out_fh = open_new('unicode_constants.h', '>',
9 {style => '*', by => $0,
10 from => "Unicode data"});
11
12 print $out_fh <<END;
13
14 #ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */
15 #define H_UNICODE_CONSTANTS 1
16
17 /* This file contains #defines for various Unicode code points. The values
18 * the macros expand to are the native Unicode code point, or all or portions
19 * of the UTF-8 encoding for the code point. In the former case, the macro
20 * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8".
21 *
22 * The macros that have the suffix "_UTF8" may have further suffixes, as
23 * follows:
24 * "_FIRST_BYTE" if the value is just the first byte of the UTF-8
25 * representation; the value will be a numeric constant.
26 * "_TAIL" if instead it represents all but the first byte. This, and
27 * with no additional suffix are both string constants */
28
29 END
30
31 # The data are at the end of this file. A blank line is output as-is.
32 # Comments (lines whose first non-blank is a '#') are converted to C-style,
33 # though empty comments are converted to blank lines. Otherwise, each line
34 # represents one #define, and begins with either a Unicode character name with
35 # the blanks and dashes in it squeezed out or replaced by underscores; or it
36 # may be a hexadecimal Unicode code point of the form U+xxxx. In the latter
37 # case, the name will be looked-up to use as the name of the macro. In either
38 # case, the macro name will have suffixes as listed above, and all blanks and
39 # dashes will be replaced by underscores.
40 #
41 # Each line may optionally have one of the following flags on it, separated by
42 # white space from the initial token.
43 # string indicates that the output is to be of the string form
44 # described in the comments above that are placed in the file.
45 # string_skip_ifundef is the same as 'string', but instead of dying if the
46 # code point doesn't exist, the line is just skipped: no output is
47 # generated for it
48 # first indicates that the output is to be of the FIRST_BYTE form.
49 # tail indicates that the output is of the _TAIL form.
50 # native indicates that the output is the code point, converted to the
51 # platform's native character set if applicable
52 #
53 # If the code point has no official name, the desired name may be appended
54 # after the flag, which will be ignored if there is an official name.
55 #
56 # This program is used to make it convenient to create compile time constants
57 # of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
58 # having to figure things out.
59
60 my @data = <DATA>;
61
62 foreach my $charset (get_supported_code_pages()) {
63 print $out_fh "\n" . get_conditional_compile_line_start($charset);
64
65 my @a2n = @{get_a2n($charset)};
66
67 for ( @data ) {
68 chomp;
69
70 # Convert any '#' comments to /* ... */; empty lines and comments are
71 # output as blank lines
72 if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
73 my $comment_body = $1 // "";
74 if ($comment_body ne "") {
75 print $out_fh "/* $comment_body */\n";
76 }
77 else {
78 print $out_fh "\n";
79 }
80 next;
81 }
82
83 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
84 (?: [\ ]+ ( [^ ]* ) )? # optional flag
85 (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required
86 /x)
87 {
88 die "Unexpected syntax at line $.: $_\n";
89 }
90
91 my $name_or_cp = $1;
92 my $flag = $2;
93 my $desired_name = $3;
94
95 my $name;
96 my $cp;
97 my $U_cp; # code point in Unicode (not-native) terms
98 my $undef_ok = $desired_name || $flag =~ /skip_if_undef/;
99
100 if ($name_or_cp =~ /^U\+(.*)/) {
101 $U_cp = hex $1;
102 $name = charnames::viacode($name_or_cp);
103 if (! defined $name) {
104 die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $undef_ok;
105 $name = "";
106 }
107 }
108 else {
109 $name = $name_or_cp;
110 die "Unknown name '$name' at line $.: $_\n" unless defined $name;
111 $U_cp = charnames::vianame($name =~ s/_/ /gr);
112 }
113
114 $cp = ($U_cp < 256)
115 ? $a2n[$U_cp]
116 : $U_cp;
117
118 $name = $desired_name if $name eq "" && $desired_name;
119 $name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes
120
121 my $str;
122 my $suffix;
123 if (defined $flag && $flag eq 'native') {
124 die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
125 $suffix = '_NATIVE';
126 $str = sprintf "0x%02X", $cp; # Is a numeric constant
127 }
128 else {
129 $str = join "", map { sprintf "\\x%02X", ord $_ } split //, cp_2_utfbytes($U_cp, $charset);
130
131 $suffix = '_UTF8';
132 if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) {
133 $str = "\"$str\""; # Will be a string constant
134 } elsif ($flag eq 'tail') {
135 $str =~ s/\\x..//; # Remove the first byte
136 $suffix .= '_TAIL';
137 $str = "\"$str\""; # Will be a string constant
138 }
139 elsif ($flag eq 'first') {
140 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
141 $suffix .= '_FIRST_BYTE';
142 $str = "0x$str"; # Is a numeric constant
143 }
144 else {
145 die "Unknown flag at line $.: $_\n";
146 }
147 }
148 printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp;
149 }
150
151 my $max_PRINT_A = 0;
152 for my $i (0x20 .. 0x7E) {
153 $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
154 }
155 printf $out_fh "# define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C 0x%02X /* The max code point that isPRINT_A */\n", $max_PRINT_A;
156
157 print $out_fh "\n" . get_conditional_compile_line_end();
158
159 }
160
161 use Unicode::UCD 'prop_invlist';
162
163 my $count = 0;
164 my @other_invlist = prop_invlist("Other");
165 for (my $i = 0; $i < @other_invlist; $i += 2) {
166 $count += ((defined $other_invlist[$i+1])
167 ? $other_invlist[$i+1]
168 : 0x110000)
169 - $other_invlist[$i];
170 }
171 printf $out_fh "\n/* The number of code points not matching \\pC */\n"
172 . "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C %d\n",
173 0x110000 - $count;
174
175 print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
176
177 read_only_bottom_close_and_rename($out_fh);
178
179 __DATA__
180 U+017F string
181
182 U+0300 string
183
184 U+0399 string
185 U+03BC string
186
187 U+1E9E string
188
189 U+FB05 string
190 U+FB06 string
191
192 U+2010 string
193 U+D800 first FIRST_SURROGATE
194 BOM first
195 BOM tail
196
197 NBSP native
198 NBSP string
199
200 DEL native
201 CR native
202 LF native
203 VT native
204 ESC native
205 U+00DF native
206 U+00E5 native
207 U+00C5 native
208 U+00FF native
209 U+00B5 native