Mercurial > repo
comparison perl-5.22.2/regen/unicode_constants.pl @ 8045:a16537d2fe07
<xfix> tar xf perl-5.22.2.tar.gz # Ah, whatever, I\'m doing it anyway
author | HackBot |
---|---|
date | Sat, 14 May 2016 14:54:38 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
8044:711c038a7dce | 8045:a16537d2fe07 |
---|---|
1 use v5.16.0; | |
2 use strict; | |
3 use warnings; | |
4 require 'regen/regen_lib.pl'; | |
5 require 'regen/charset_translations.pl'; | |
6 use charnames qw(:loose); | |
7 | |
8 my $out_fh = open_new('unicode_constants.h', '>', | |
9 {style => '*', by => $0, | |
10 from => "Unicode data"}); | |
11 | |
12 print $out_fh <<END; | |
13 | |
14 #ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */ | |
15 #define H_UNICODE_CONSTANTS 1 | |
16 | |
17 /* This file contains #defines for various Unicode code points. The values | |
18 * the macros expand to are the native Unicode code point, or all or portions | |
19 * of the UTF-8 encoding for the code point. In the former case, the macro | |
20 * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8". | |
21 * | |
22 * The macros that have the suffix "_UTF8" may have further suffixes, as | |
23 * follows: | |
24 * "_FIRST_BYTE" if the value is just the first byte of the UTF-8 | |
25 * representation; the value will be a numeric constant. | |
26 * "_TAIL" if instead it represents all but the first byte. This, and | |
27 * with no additional suffix are both string constants */ | |
28 | |
29 END | |
30 | |
31 # The data are at the end of this file. A blank line is output as-is. | |
32 # Comments (lines whose first non-blank is a '#') are converted to C-style, | |
33 # though empty comments are converted to blank lines. Otherwise, each line | |
34 # represents one #define, and begins with either a Unicode character name with | |
35 # the blanks and dashes in it squeezed out or replaced by underscores; or it | |
36 # may be a hexadecimal Unicode code point of the form U+xxxx. In the latter | |
37 # case, the name will be looked-up to use as the name of the macro. In either | |
38 # case, the macro name will have suffixes as listed above, and all blanks and | |
39 # dashes will be replaced by underscores. | |
40 # | |
41 # Each line may optionally have one of the following flags on it, separated by | |
42 # white space from the initial token. | |
43 # string indicates that the output is to be of the string form | |
44 # described in the comments above that are placed in the file. | |
45 # string_skip_ifundef is the same as 'string', but instead of dying if the | |
46 # code point doesn't exist, the line is just skipped: no output is | |
47 # generated for it | |
48 # first indicates that the output is to be of the FIRST_BYTE form. | |
49 # tail indicates that the output is of the _TAIL form. | |
50 # native indicates that the output is the code point, converted to the | |
51 # platform's native character set if applicable | |
52 # | |
53 # If the code point has no official name, the desired name may be appended | |
54 # after the flag, which will be ignored if there is an official name. | |
55 # | |
56 # This program is used to make it convenient to create compile time constants | |
57 # of UTF-8, and to generate proper EBCDIC as well as ASCII without manually | |
58 # having to figure things out. | |
59 | |
60 my @data = <DATA>; | |
61 | |
62 foreach my $charset (get_supported_code_pages()) { | |
63 print $out_fh "\n" . get_conditional_compile_line_start($charset); | |
64 | |
65 my @a2n = @{get_a2n($charset)}; | |
66 | |
67 for ( @data ) { | |
68 chomp; | |
69 | |
70 # Convert any '#' comments to /* ... */; empty lines and comments are | |
71 # output as blank lines | |
72 if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) { | |
73 my $comment_body = $1 // ""; | |
74 if ($comment_body ne "") { | |
75 print $out_fh "/* $comment_body */\n"; | |
76 } | |
77 else { | |
78 print $out_fh "\n"; | |
79 } | |
80 next; | |
81 } | |
82 | |
83 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token | |
84 (?: [\ ]+ ( [^ ]* ) )? # optional flag | |
85 (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required | |
86 /x) | |
87 { | |
88 die "Unexpected syntax at line $.: $_\n"; | |
89 } | |
90 | |
91 my $name_or_cp = $1; | |
92 my $flag = $2; | |
93 my $desired_name = $3; | |
94 | |
95 my $name; | |
96 my $cp; | |
97 my $U_cp; # code point in Unicode (not-native) terms | |
98 my $undef_ok = $desired_name || $flag =~ /skip_if_undef/; | |
99 | |
100 if ($name_or_cp =~ /^U\+(.*)/) { | |
101 $U_cp = hex $1; | |
102 $name = charnames::viacode($name_or_cp); | |
103 if (! defined $name) { | |
104 die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $undef_ok; | |
105 $name = ""; | |
106 } | |
107 } | |
108 else { | |
109 $name = $name_or_cp; | |
110 die "Unknown name '$name' at line $.: $_\n" unless defined $name; | |
111 $U_cp = charnames::vianame($name =~ s/_/ /gr); | |
112 } | |
113 | |
114 $cp = ($U_cp < 256) | |
115 ? $a2n[$U_cp] | |
116 : $U_cp; | |
117 | |
118 $name = $desired_name if $name eq "" && $desired_name; | |
119 $name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes | |
120 | |
121 my $str; | |
122 my $suffix; | |
123 if (defined $flag && $flag eq 'native') { | |
124 die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff; | |
125 $suffix = '_NATIVE'; | |
126 $str = sprintf "0x%02X", $cp; # Is a numeric constant | |
127 } | |
128 else { | |
129 $str = join "", map { sprintf "\\x%02X", ord $_ } split //, cp_2_utfbytes($U_cp, $charset); | |
130 | |
131 $suffix = '_UTF8'; | |
132 if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) { | |
133 $str = "\"$str\""; # Will be a string constant | |
134 } elsif ($flag eq 'tail') { | |
135 $str =~ s/\\x..//; # Remove the first byte | |
136 $suffix .= '_TAIL'; | |
137 $str = "\"$str\""; # Will be a string constant | |
138 } | |
139 elsif ($flag eq 'first') { | |
140 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte | |
141 $suffix .= '_FIRST_BYTE'; | |
142 $str = "0x$str"; # Is a numeric constant | |
143 } | |
144 else { | |
145 die "Unknown flag at line $.: $_\n"; | |
146 } | |
147 } | |
148 printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp; | |
149 } | |
150 | |
151 my $max_PRINT_A = 0; | |
152 for my $i (0x20 .. 0x7E) { | |
153 $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A; | |
154 } | |
155 printf $out_fh "# define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C 0x%02X /* The max code point that isPRINT_A */\n", $max_PRINT_A; | |
156 | |
157 print $out_fh "\n" . get_conditional_compile_line_end(); | |
158 | |
159 } | |
160 | |
161 use Unicode::UCD 'prop_invlist'; | |
162 | |
163 my $count = 0; | |
164 my @other_invlist = prop_invlist("Other"); | |
165 for (my $i = 0; $i < @other_invlist; $i += 2) { | |
166 $count += ((defined $other_invlist[$i+1]) | |
167 ? $other_invlist[$i+1] | |
168 : 0x110000) | |
169 - $other_invlist[$i]; | |
170 } | |
171 printf $out_fh "\n/* The number of code points not matching \\pC */\n" | |
172 . "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C %d\n", | |
173 0x110000 - $count; | |
174 | |
175 print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n"; | |
176 | |
177 read_only_bottom_close_and_rename($out_fh); | |
178 | |
179 __DATA__ | |
180 U+017F string | |
181 | |
182 U+0300 string | |
183 | |
184 U+0399 string | |
185 U+03BC string | |
186 | |
187 U+1E9E string | |
188 | |
189 U+FB05 string | |
190 U+FB06 string | |
191 | |
192 U+2010 string | |
193 U+D800 first FIRST_SURROGATE | |
194 BOM first | |
195 BOM tail | |
196 | |
197 NBSP native | |
198 NBSP string | |
199 | |
200 DEL native | |
201 CR native | |
202 LF native | |
203 VT native | |
204 ESC native | |
205 U+00DF native | |
206 U+00E5 native | |
207 U+00C5 native | |
208 U+00FF native | |
209 U+00B5 native |