Mercurial > repo
comparison perl-5.22.2/regen/mk_invlists.pl @ 8045:a16537d2fe07
<xfix> tar xf perl-5.22.2.tar.gz # Ah, whatever, I\'m doing it anyway
author | HackBot |
---|---|
date | Sat, 14 May 2016 14:54:38 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
8044:711c038a7dce | 8045:a16537d2fe07 |
---|---|
1 #!perl -w | |
2 use 5.015; | |
3 use strict; | |
4 use warnings; | |
5 use Unicode::UCD qw(prop_aliases | |
6 prop_values | |
7 prop_value_aliases | |
8 prop_invlist | |
9 prop_invmap search_invlist | |
10 ); | |
11 require 'regen/regen_lib.pl'; | |
12 require 'regen/charset_translations.pl'; | |
13 | |
14 # This program outputs charclass_invlists.h, which contains various inversion | |
15 # lists in the form of C arrays that are to be used as-is for inversion lists. | |
16 # Thus, the lists it contains are essentially pre-compiled, and need only a | |
17 # light-weight fast wrapper to make them usable at run-time. | |
18 | |
19 # As such, this code knows about the internal structure of these lists, and | |
20 # any change made to that has to be done here as well. A random number stored | |
21 # in the headers is used to minimize the possibility of things getting | |
22 # out-of-sync, or the wrong data structure being passed. Currently that | |
23 # random number is: | |
24 | |
25 # charclass_invlists.h now also has a partial implementation of inversion | |
26 # maps; enough to generate tables for the line break properties, such as GCB | |
27 | |
28 my $VERSION_DATA_STRUCTURE_TYPE = 148565664; | |
29 | |
30 # integer or float | |
31 my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax; | |
32 | |
33 # Matches valid C language enum names: begins with ASCII alphabetic, then any | |
34 # ASCII \w | |
35 my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax; | |
36 | |
37 my $out_fh = open_new('charclass_invlists.h', '>', | |
38 {style => '*', by => $0, | |
39 from => "Unicode::UCD"}); | |
40 | |
41 my $in_file_pound_if = 0; | |
42 | |
43 print $out_fh "/* See the generating file for comments */\n\n"; | |
44 | |
45 # The symbols generated by this program are all currently defined only in a | |
46 # single dot c each. The code knows where most of them go, but this hash | |
47 # gives overrides for the exceptions to the typical place | |
48 my %exceptions_to_where_to_define = | |
49 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C', | |
50 AboveLatin1 => 'PERL_IN_REGCOMP_C', | |
51 Latin1 => 'PERL_IN_REGCOMP_C', | |
52 UpperLatin1 => 'PERL_IN_REGCOMP_C', | |
53 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C', | |
54 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C', | |
55 _Perl_IDCont => 'PERL_IN_UTF8_C', | |
56 _Perl_IDStart => 'PERL_IN_UTF8_C', | |
57 ); | |
58 | |
59 # This hash contains the properties with enums that have hard-coded references | |
60 # to them in C code. Its only use is to make sure that if perl is compiled | |
61 # with an older Unicode data set, that all the enum values the code is | |
62 # expecting will still be in the enum typedef. Thus the code doesn't have to | |
63 # change. The Unicode version won't have any code points that have these enum | |
64 # values, so the code that handles them will not get exercised. This is far | |
65 # better than having to #ifdef things. | |
66 my %hard_coded_enums = | |
67 ( gcb => [ | |
68 'Control', | |
69 'CR', | |
70 'Extend', | |
71 'L', | |
72 'LF', | |
73 'LV', | |
74 'LVT', | |
75 'Other', | |
76 'Prepend', | |
77 'Regional_Indicator', | |
78 'SpacingMark', | |
79 'T', | |
80 'V', | |
81 ], | |
82 sb => [ | |
83 'ATerm', | |
84 'Close', | |
85 'CR', | |
86 'Extend', | |
87 'Format', | |
88 'LF', | |
89 'Lower', | |
90 'Numeric', | |
91 'OLetter', | |
92 'Other', | |
93 'SContinue', | |
94 'Sep', | |
95 'Sp', | |
96 'STerm', | |
97 'Upper', | |
98 ], | |
99 wb => [ | |
100 'ALetter', | |
101 'CR', | |
102 'Double_Quote', | |
103 'Extend', | |
104 'ExtendNumLet', | |
105 'Format', | |
106 'Hebrew_Letter', | |
107 'Katakana', | |
108 'LF', | |
109 'MidLetter', | |
110 'MidNum', | |
111 'MidNumLet', | |
112 'Newline', | |
113 'Numeric', | |
114 'Other', | |
115 'Regional_Indicator', | |
116 'Single_Quote', | |
117 'UNKNOWN', | |
118 ], | |
119 ); | |
120 | |
121 my @a2n; | |
122 | |
123 sub uniques { | |
124 # Returns non-duplicated input values. From "Perl Best Practices: | |
125 # Encapsulated Cleverness". p. 455 in first edition. | |
126 | |
127 my %seen; | |
128 return grep { ! $seen{$_}++ } @_; | |
129 } | |
130 | |
131 sub a2n($) { | |
132 my $cp = shift; | |
133 | |
134 # Returns the input Unicode code point translated to native. | |
135 | |
136 return $cp if $cp !~ $numeric_re || $cp > 255; | |
137 return $a2n[$cp]; | |
138 } | |
139 | |
140 sub end_file_pound_if { | |
141 if ($in_file_pound_if) { | |
142 print $out_fh "\n#endif\t/* $in_file_pound_if */\n"; | |
143 $in_file_pound_if = 0; | |
144 } | |
145 } | |
146 | |
147 sub switch_pound_if ($$) { | |
148 my $name = shift; | |
149 my $new_pound_if = shift; | |
150 | |
151 # Switch to new #if given by the 2nd argument. If there is an override | |
152 # for this, it instead switches to that. The 1st argument is the | |
153 # static's name, used to look up the overrides | |
154 | |
155 if (exists $exceptions_to_where_to_define{$name}) { | |
156 $new_pound_if = $exceptions_to_where_to_define{$name}; | |
157 } | |
158 | |
159 # Exit current #if if the new one is different from the old | |
160 if ($in_file_pound_if | |
161 && $in_file_pound_if !~ /$new_pound_if/) | |
162 { | |
163 end_file_pound_if; | |
164 } | |
165 | |
166 # Enter new #if, if not already in it. | |
167 if (! $in_file_pound_if) { | |
168 $in_file_pound_if = "defined($new_pound_if)"; | |
169 print $out_fh "\n#if $in_file_pound_if\n"; | |
170 } | |
171 } | |
172 | |
173 sub output_invlist ($$;$) { | |
174 my $name = shift; | |
175 my $invlist = shift; # Reference to inversion list array | |
176 my $charset = shift // ""; # name of character set for comment | |
177 | |
178 die "No inversion list for $name" unless defined $invlist | |
179 && ref $invlist eq 'ARRAY' | |
180 && @$invlist; | |
181 | |
182 # Output the inversion list $invlist using the name $name for it. | |
183 # It is output in the exact internal form for inversion lists. | |
184 | |
185 # Is the last element of the header 0, or 1 ? | |
186 my $zero_or_one = 0; | |
187 if ($invlist->[0] != 0) { | |
188 unshift @$invlist, 0; | |
189 $zero_or_one = 1; | |
190 } | |
191 my $count = @$invlist; | |
192 | |
193 switch_pound_if ($name, 'PERL_IN_PERL_C'); | |
194 | |
195 print $out_fh "\nstatic const UV ${name}_invlist[] = {"; | |
196 print $out_fh " /* for $charset */" if $charset; | |
197 print $out_fh "\n"; | |
198 | |
199 print $out_fh "\t$count,\t/* Number of elements */\n"; | |
200 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n"; | |
201 print $out_fh "\t", $zero_or_one, | |
202 ",\t/* 0 if the list starts at 0;", | |
203 "\n\t\t 1 if it starts at the element beyond 0 */\n"; | |
204 | |
205 # The main body are the UVs passed in to this routine. Do the final | |
206 # element separately | |
207 for my $i (0 .. @$invlist - 1) { | |
208 printf $out_fh "\t0x%X", $invlist->[$i]; | |
209 print $out_fh "," if $i < @$invlist - 1; | |
210 print $out_fh "\n"; | |
211 } | |
212 | |
213 print $out_fh "};\n"; | |
214 } | |
215 | |
216 sub output_invmap ($$$$$$$) { | |
217 my $name = shift; | |
218 my $invmap = shift; # Reference to inversion map array | |
219 my $prop_name = shift; | |
220 my $input_format = shift; # The inversion map's format | |
221 my $default = shift; # The property value for code points who | |
222 # otherwise don't have a value specified. | |
223 my $extra_enums = shift; # comma-separated list of our additions to the | |
224 # property's standard possible values | |
225 my $charset = shift // ""; # name of character set for comment | |
226 | |
227 # Output the inversion map $invmap for property $prop_name, but use $name | |
228 # as the actual data structure's name. | |
229 | |
230 my $count = @$invmap; | |
231 | |
232 my $output_format; | |
233 my $declaration_type; | |
234 my %enums; | |
235 my $name_prefix; | |
236 | |
237 if ($input_format eq 's') { | |
238 $prop_name = (prop_aliases($prop_name))[1]; # Get full name | |
239 my $short_name = (prop_aliases($prop_name))[0]; | |
240 my @enums = prop_values($prop_name); | |
241 if (! @enums) { | |
242 die "Only enum properties are currently handled; '$prop_name' isn't one"; | |
243 } | |
244 else { | |
245 | |
246 # Convert short names to long | |
247 @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums; | |
248 | |
249 my @expected_enums = @{$hard_coded_enums{lc $short_name}}; | |
250 die 'You need to update %hard_coded_enums to reflect new entries in this Unicode version' | |
251 if @expected_enums < @enums; | |
252 | |
253 # Remove the enums found in the input from the ones we expect | |
254 for (my $i = @expected_enums - 1; $i >= 0; $i--) { | |
255 splice(@expected_enums, $i, 1) | |
256 if grep { $expected_enums[$i] eq $_ } @enums; | |
257 } | |
258 | |
259 # The ones remaining must be because we're using an older | |
260 # Unicode version. Add them to the list. | |
261 push @enums, @expected_enums; | |
262 | |
263 # Add in the extra values coded into this program, and sort. | |
264 push @enums, split /,/, $extra_enums if $extra_enums ne ""; | |
265 @enums = sort @enums; | |
266 | |
267 # Assign a value to each element of the enum. The default | |
268 # value always gets 0; the others are arbitrarily assigned. | |
269 my $enum_val = 0; | |
270 $default = prop_value_aliases($prop_name, $default); | |
271 $enums{$default} = $enum_val++; | |
272 for my $enum (@enums) { | |
273 $enums{$enum} = $enum_val++ unless exists $enums{$enum}; | |
274 } | |
275 } | |
276 | |
277 # Inversion map stuff is currently used only by regexec | |
278 switch_pound_if($name, 'PERL_IN_REGEXEC_C'); | |
279 { | |
280 | |
281 # The short names tend to be two lower case letters, but it looks | |
282 # better for those if they are upper. XXX | |
283 $short_name = uc($short_name) if length($short_name) < 3 | |
284 || substr($short_name, 0, 1) =~ /[[:lower:]]/; | |
285 $name_prefix = "${short_name}_"; | |
286 my $enum_count = keys %enums; | |
287 print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n"; | |
288 | |
289 print $out_fh "\ntypedef enum {\n"; | |
290 print $out_fh "\t${name_prefix}$default = $enums{$default},\n"; | |
291 delete $enums{$default}; | |
292 foreach my $enum (sort { $a cmp $b } keys %enums) { | |
293 print $out_fh "\t${name_prefix}$enum = $enums{$enum}"; | |
294 print $out_fh "," if $enums{$enum} < $enum_count - 1; | |
295 print $out_fh "\n"; | |
296 } | |
297 $declaration_type = "${name_prefix}enum"; | |
298 print $out_fh "} $declaration_type;\n"; | |
299 | |
300 $output_format = "${name_prefix}%s"; | |
301 } | |
302 } | |
303 else { | |
304 die "'$input_format' invmap() format for '$prop_name' unimplemented"; | |
305 } | |
306 | |
307 die "No inversion map for $prop_name" unless defined $invmap | |
308 && ref $invmap eq 'ARRAY' | |
309 && $count; | |
310 | |
311 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {"; | |
312 print $out_fh " /* for $charset */" if $charset; | |
313 print $out_fh "\n"; | |
314 | |
315 # The main body are the scalars passed in to this routine. | |
316 for my $i (0 .. $count - 1) { | |
317 my $element = $invmap->[$i]; | |
318 $element = $name_prefix . prop_value_aliases($prop_name, $element); | |
319 print $out_fh "\t$element"; | |
320 print $out_fh "," if $i < $count - 1; | |
321 print $out_fh "\n"; | |
322 } | |
323 print $out_fh "};\n"; | |
324 } | |
325 | |
326 sub mk_invlist_from_sorted_cp_list { | |
327 | |
328 # Returns an inversion list constructed from the sorted input array of | |
329 # code points | |
330 | |
331 my $list_ref = shift; | |
332 | |
333 return unless @$list_ref; | |
334 | |
335 # Initialize to just the first element | |
336 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1); | |
337 | |
338 # For each succeeding element, if it extends the previous range, adjust | |
339 # up, otherwise add it. | |
340 for my $i (1 .. @$list_ref - 1) { | |
341 if ($invlist[-1] == $list_ref->[$i]) { | |
342 $invlist[-1]++; | |
343 } | |
344 else { | |
345 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1; | |
346 } | |
347 } | |
348 return @invlist; | |
349 } | |
350 | |
351 # Read in the Case Folding rules, and construct arrays of code points for the | |
352 # properties we need. | |
353 my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding"); | |
354 die "Could not find inversion map for Case_Folding" unless defined $format; | |
355 die "Incorrect format '$format' for Case_Folding inversion map" | |
356 unless $format eq 'al'; | |
357 my @has_multi_char_fold; | |
358 my @is_non_final_fold; | |
359 | |
360 for my $i (0 .. @$folds_ref - 1) { | |
361 next unless ref $folds_ref->[$i]; # Skip single-char folds | |
362 push @has_multi_char_fold, $cp_ref->[$i]; | |
363 | |
364 # Add to the non-finals list each code point that is in a non-final | |
365 # position | |
366 for my $j (0 .. @{$folds_ref->[$i]} - 2) { | |
367 push @is_non_final_fold, $folds_ref->[$i][$j] | |
368 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold; | |
369 } | |
370 } | |
371 | |
372 sub _Perl_Non_Final_Folds { | |
373 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold; | |
374 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold); | |
375 } | |
376 | |
377 sub prop_name_for_cmp ($) { # Sort helper | |
378 my $name = shift; | |
379 | |
380 # Returns the input lowercased, with non-alphas removed, as well as | |
381 # everything starting with a comma | |
382 | |
383 $name =~ s/,.*//; | |
384 $name =~ s/[[:^alpha:]]//g; | |
385 return lc $name; | |
386 } | |
387 | |
388 sub UpperLatin1 { | |
389 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]); | |
390 } | |
391 | |
392 output_invlist("Latin1", [ 0, 256 ]); | |
393 output_invlist("AboveLatin1", [ 256 ]); | |
394 | |
395 end_file_pound_if; | |
396 | |
397 # We construct lists for all the POSIX and backslash sequence character | |
398 # classes in two forms: | |
399 # 1) ones which match only in the ASCII range | |
400 # 2) ones which match either in the Latin1 range, or the entire Unicode range | |
401 # | |
402 # These get compiled in, and hence affect the memory footprint of every Perl | |
403 # program, even those not using Unicode. To minimize the size, currently | |
404 # the Latin1 version is generated for the beyond ASCII range except for those | |
405 # lists that are quite small for the entire range, such as for \s, which is 22 | |
406 # UVs long plus 4 UVs (currently) for the header. | |
407 # | |
408 # To save even more memory, the ASCII versions could be derived from the | |
409 # larger ones at runtime, saving some memory (minus the expense of the machine | |
410 # instructions to do so), but these are all small anyway, so their total is | |
411 # about 100 UVs. | |
412 # | |
413 # In the list of properties below that get generated, the L1 prefix is a fake | |
414 # property that means just the Latin1 range of the full property (whose name | |
415 # has an X prefix instead of L1). | |
416 # | |
417 # An initial & means to use the subroutine from this file instead of an | |
418 # official inversion list. | |
419 | |
420 for my $charset (get_supported_code_pages()) { | |
421 print $out_fh "\n" . get_conditional_compile_line_start($charset); | |
422 | |
423 @a2n = @{get_a2n($charset)}; | |
424 no warnings 'qw'; | |
425 # Ignore non-alpha in sort | |
426 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw( | |
427 ASCII | |
428 Cased | |
429 VertSpace | |
430 XPerlSpace | |
431 XPosixAlnum | |
432 XPosixAlpha | |
433 XPosixBlank | |
434 XPosixCntrl | |
435 XPosixDigit | |
436 XPosixGraph | |
437 XPosixLower | |
438 XPosixPrint | |
439 XPosixPunct | |
440 XPosixSpace | |
441 XPosixUpper | |
442 XPosixWord | |
443 XPosixXDigit | |
444 _Perl_Any_Folds | |
445 &NonL1_Perl_Non_Final_Folds | |
446 _Perl_Folds_To_Multi_Char | |
447 &UpperLatin1 | |
448 _Perl_IDStart | |
449 _Perl_IDCont | |
450 Grapheme_Cluster_Break,EDGE | |
451 Word_Break,EDGE,UNKNOWN | |
452 Sentence_Break,EDGE | |
453 ) | |
454 ) { | |
455 | |
456 # For the Latin1 properties, we change to use the eXtended version of the | |
457 # base property, then go through the result and get rid of everything not | |
458 # in Latin1 (above 255). Actually, we retain the element for the range | |
459 # that crosses the 255/256 boundary if it is one that matches the | |
460 # property. For example, in the Word property, there is a range of code | |
461 # points that start at U+00F8 and goes through U+02C1. Instead of | |
462 # artificially cutting that off at 256 because 256 is the first code point | |
463 # above Latin1, we let the range go to its natural ending. That gives us | |
464 # extra information with no added space taken. But if the range that | |
465 # crosses the boundary is one that doesn't match the property, we don't | |
466 # start a new range above 255, as that could be construed as going to | |
467 # infinity. For example, the Upper property doesn't include the character | |
468 # at 255, but does include the one at 256. We don't include the 256 one. | |
469 my $prop_name = $prop; | |
470 my $is_local_sub = $prop_name =~ s/^&//; | |
471 my $extra_enums = ""; | |
472 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x; | |
473 my $lookup_prop = $prop_name; | |
474 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ | |
475 or $lookup_prop =~ s/^L1//); | |
476 my $nonl1_only = 0; | |
477 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only; | |
478 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x; | |
479 | |
480 my @invlist; | |
481 my @invmap; | |
482 my $map_format; | |
483 my $map_default; | |
484 my $maps_to_code_point; | |
485 my $to_adjust; | |
486 if ($is_local_sub) { | |
487 @invlist = eval $lookup_prop; | |
488 } | |
489 else { | |
490 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok'); | |
491 if (! @invlist) { | |
492 my ($list_ref, $map_ref, $format, $default); | |
493 | |
494 ($list_ref, $map_ref, $format, $default) | |
495 = prop_invmap($lookup_prop, '_perl_core_internal_ok'); | |
496 die "Could not find inversion list for '$lookup_prop'" unless $list_ref; | |
497 @invlist = @$list_ref; | |
498 @invmap = @$map_ref; | |
499 $map_format = $format; | |
500 $map_default = $default; | |
501 $maps_to_code_point = $map_format =~ /x/; | |
502 $to_adjust = $map_format =~ /a/; | |
503 } | |
504 } | |
505 die "Could not find inversion list for '$lookup_prop'" unless @invlist; | |
506 | |
507 # Re-order the Unicode code points to native ones for this platform. | |
508 # This is only needed for code points below 256, because native code | |
509 # points are only in that range. For inversion maps of properties | |
510 # where the mappings are adjusted (format =~ /a/), this reordering | |
511 # could mess up the adjustment pattern that was in the input, so that | |
512 # has to be dealt with. | |
513 # | |
514 # And inversion maps that map to code points need to eventually have | |
515 # all those code points remapped to native, and it's better to do that | |
516 # here, going through the whole list not just those below 256. This | |
517 # is because some inversion maps have adjustments (format =~ /a/) | |
518 # which may be affected by the reordering. This code needs to be done | |
519 # both for when we are translating the inversion lists for < 256, and | |
520 # for the inversion maps for everything. By doing both in this loop, | |
521 # we can share that code. | |
522 # | |
523 # So, we go through everything for an inversion map to code points; | |
524 # otherwise, we can skip any remapping at all if we are going to | |
525 # output only the above-Latin1 values, or if the range spans the whole | |
526 # of 0..256, as the remap will also include all of 0..256 (256 not | |
527 # 255 because a re-ordering could cause 256 to need to be in the same | |
528 # range as 255.) | |
529 if ((@invmap && $maps_to_code_point) | |
530 || (! $nonl1_only || ($invlist[0] < 256 | |
531 && ! ($invlist[0] == 0 && $invlist[1] > 256)))) | |
532 { | |
533 | |
534 if (! @invmap) { # Straight inversion list | |
535 # Look at all the ranges that start before 257. | |
536 my @latin1_list; | |
537 while (@invlist) { | |
538 last if $invlist[0] > 256; | |
539 my $upper = @invlist > 1 | |
540 ? $invlist[1] - 1 # In range | |
541 | |
542 # To infinity. You may want to stop much much | |
543 # earlier; going this high may expose perl | |
544 # deficiencies with very large numbers. | |
545 : $Unicode::UCD::MAX_CP; | |
546 for my $j ($invlist[0] .. $upper) { | |
547 push @latin1_list, a2n($j); | |
548 } | |
549 | |
550 shift @invlist; # Shift off the range that's in the list | |
551 shift @invlist; # Shift off the range not in the list | |
552 } | |
553 | |
554 # Here @invlist contains all the ranges in the original that start | |
555 # at code points above 256, and @latin1_list contains all the | |
556 # native code points for ranges that start with a Unicode code | |
557 # point below 257. We sort the latter and convert it to inversion | |
558 # list format. Then simply prepend it to the list of the higher | |
559 # code points. | |
560 @latin1_list = sort { $a <=> $b } @latin1_list; | |
561 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list); | |
562 unshift @invlist, @latin1_list; | |
563 } | |
564 else { # Is an inversion map | |
565 | |
566 # This is a similar procedure as plain inversion list, but has | |
567 # multiple buckets. A plain inversion list just has two | |
568 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we | |
569 # pretty much can ignore the 2nd bucket, as it is completely | |
570 # defined by the 1st. But here, what we do is create buckets | |
571 # which contain the code points that map to each, translated | |
572 # to native and turned into an inversion list. Thus each | |
573 # bucket is an inversion list of native code points that map | |
574 # to it or don't map to it. We use these to create an | |
575 # inversion map for the whole property. | |
576 | |
577 # As mentioned earlier, we use this procedure to not just | |
578 # remap the inversion list to native values, but also the maps | |
579 # of code points to native ones. In the latter case we have | |
580 # to look at the whole of the inversion map (or at least to | |
581 # above Unicode; as the maps of code points above that should | |
582 # all be to the default). | |
583 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256; | |
584 | |
585 my %mapped_lists; # A hash whose keys are the buckets. | |
586 while (@invlist) { | |
587 last if $invlist[0] > $upper_limit; | |
588 | |
589 # This shouldn't actually happen, as prop_invmap() returns | |
590 # an extra element at the end that is beyond $upper_limit | |
591 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1; | |
592 | |
593 my $bucket; | |
594 | |
595 # A hash key can't be a ref (we are only expecting arrays | |
596 # of scalars here), so convert any such to a string that | |
597 # will be converted back later (using a vertical tab as | |
598 # the separator). Even if the mapping is to code points, | |
599 # we don't translate to native here because the code | |
600 # output_map() calls to output these arrays assumes the | |
601 # input is Unicode, not native. | |
602 if (ref $invmap[0]) { | |
603 $bucket = join "\cK", @{$invmap[0]}; | |
604 } | |
605 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) { | |
606 | |
607 # Do convert to native for maps to single code points. | |
608 # There are some properties that have a few outlier | |
609 # maps that aren't code points, so the above test | |
610 # skips those. | |
611 $bucket = a2n($invmap[0]); | |
612 } else { | |
613 $bucket = $invmap[0]; | |
614 } | |
615 | |
616 # We now have the bucket that all code points in the range | |
617 # map to, though possibly they need to be adjusted. Go | |
618 # through the range and put each translated code point in | |
619 # it into its bucket. | |
620 my $base_map = $invmap[0]; | |
621 for my $j ($invlist[0] .. $invlist[1] - 1) { | |
622 if ($to_adjust | |
623 # The 1st code point doesn't need adjusting | |
624 && $j > $invlist[0] | |
625 | |
626 # Skip any non-numeric maps: these are outliers | |
627 # that aren't code points. | |
628 && $base_map =~ $numeric_re | |
629 | |
630 # 'ne' because the default can be a string | |
631 && $base_map ne $map_default) | |
632 { | |
633 # We adjust, by incrementing each the bucket and | |
634 # the map. For code point maps, translate to | |
635 # native | |
636 $base_map++; | |
637 $bucket = ($maps_to_code_point) | |
638 ? a2n($base_map) | |
639 : $base_map; | |
640 } | |
641 | |
642 # Add the native code point to the bucket for the | |
643 # current map | |
644 push @{$mapped_lists{$bucket}}, a2n($j); | |
645 } # End of loop through all code points in the range | |
646 | |
647 # Get ready for the next range | |
648 shift @invlist; | |
649 shift @invmap; | |
650 } # End of loop through all ranges in the map. | |
651 | |
652 # Here, @invlist and @invmap retain all the ranges from the | |
653 # originals that start with code points above $upper_limit. | |
654 # Each bucket in %mapped_lists contains all the code points | |
655 # that map to that bucket. If the bucket is for a map to a | |
656 # single code point is a single code point, the bucket has | |
657 # been converted to native. If something else (including | |
658 # multiple code points), no conversion is done. | |
659 # | |
660 # Now we recreate the inversion map into %xlated, but this | |
661 # time for the native character set. | |
662 my %xlated; | |
663 foreach my $bucket (keys %mapped_lists) { | |
664 | |
665 # Sort and convert this bucket to an inversion list. The | |
666 # result will be that ranges that start with even-numbered | |
667 # indexes will be for code points that map to this bucket; | |
668 # odd ones map to some other bucket, and are discarded | |
669 # below. | |
670 @{$mapped_lists{$bucket}} | |
671 = sort{ $a <=> $b} @{$mapped_lists{$bucket}}; | |
672 @{$mapped_lists{$bucket}} | |
673 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}}); | |
674 | |
675 # Add each even-numbered range in the bucket to %xlated; | |
676 # so that the keys of %xlated become the range start code | |
677 # points, and the values are their corresponding maps. | |
678 while (@{$mapped_lists{$bucket}}) { | |
679 my $range_start = $mapped_lists{$bucket}->[0]; | |
680 if ($bucket =~ /\cK/) { | |
681 @{$xlated{$range_start}} = split /\cK/, $bucket; | |
682 } | |
683 else { | |
684 $xlated{$range_start} = $bucket; | |
685 } | |
686 shift @{$mapped_lists{$bucket}}; # Discard odd ranges | |
687 shift @{$mapped_lists{$bucket}}; # Get ready for next | |
688 # iteration | |
689 } | |
690 } # End of loop through all the buckets. | |
691 | |
692 # Here %xlated's keys are the range starts of all the code | |
693 # points in the inversion map. Construct an inversion list | |
694 # from them. | |
695 my @new_invlist = sort { $a <=> $b } keys %xlated; | |
696 | |
697 # If the list is adjusted, we want to munge this list so that | |
698 # we only have one entry for where consecutive code points map | |
699 # to consecutive values. We just skip the subsequent entries | |
700 # where this is the case. | |
701 if ($to_adjust) { | |
702 my @temp; | |
703 for my $i (0 .. @new_invlist - 1) { | |
704 next if $i > 0 | |
705 && $new_invlist[$i-1] + 1 == $new_invlist[$i] | |
706 && $xlated{$new_invlist[$i-1]} =~ $numeric_re | |
707 && $xlated{$new_invlist[$i]} =~ $numeric_re | |
708 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]}; | |
709 push @temp, $new_invlist[$i]; | |
710 } | |
711 @new_invlist = @temp; | |
712 } | |
713 | |
714 # The inversion map comes from %xlated's values. We can | |
715 # unshift each onto the front of the untouched portion, in | |
716 # reverse order of the portion we did process. | |
717 foreach my $start (reverse @new_invlist) { | |
718 unshift @invmap, $xlated{$start}; | |
719 } | |
720 | |
721 # Finally prepend the inversion list we have just constructed to the | |
722 # one that contains anything we didn't process. | |
723 unshift @invlist, @new_invlist; | |
724 } | |
725 } | |
726 | |
727 # prop_invmap() returns an extra final entry, which we can now | |
728 # discard. | |
729 if (@invmap) { | |
730 pop @invlist; | |
731 pop @invmap; | |
732 } | |
733 | |
734 if ($l1_only) { | |
735 die "Unimplemented to do a Latin-1 only inversion map" if @invmap; | |
736 for my $i (0 .. @invlist - 1 - 1) { | |
737 if ($invlist[$i] > 255) { | |
738 | |
739 # In an inversion list, even-numbered elements give the code | |
740 # points that begin ranges that match the property; | |
741 # odd-numbered give ones that begin ranges that don't match. | |
742 # If $i is odd, we are at the first code point above 255 that | |
743 # doesn't match, which means the range it is ending does | |
744 # match, and crosses the 255/256 boundary. We want to include | |
745 # this ending point, so increment $i, so the splice below | |
746 # includes it. Conversely, if $i is even, it is the first | |
747 # code point above 255 that matches, which means there was no | |
748 # matching range that crossed the boundary, and we don't want | |
749 # to include this code point, so splice before it. | |
750 $i++ if $i % 2 != 0; | |
751 | |
752 # Remove everything past this. | |
753 splice @invlist, $i; | |
754 splice @invmap, $i if @invmap; | |
755 last; | |
756 } | |
757 } | |
758 } | |
759 elsif ($nonl1_only) { | |
760 my $found_nonl1 = 0; | |
761 for my $i (0 .. @invlist - 1 - 1) { | |
762 next if $invlist[$i] < 256; | |
763 | |
764 # Here, we have the first element in the array that indicates an | |
765 # element above Latin1. Get rid of all previous ones. | |
766 splice @invlist, 0, $i; | |
767 splice @invmap, 0, $i if @invmap; | |
768 | |
769 # If this one's index is not divisible by 2, it means that this | |
770 # element is inverting away from being in the list, which means | |
771 # all code points from 256 to this one are in this list (or | |
772 # map to the default for inversion maps) | |
773 if ($i % 2 != 0) { | |
774 unshift @invlist, 256; | |
775 unshift @invmap, $map_default if @invmap; | |
776 } | |
777 $found_nonl1 = 1; | |
778 last; | |
779 } | |
780 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1; | |
781 } | |
782 | |
783 output_invlist($prop_name, \@invlist, $charset); | |
784 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap; | |
785 } | |
786 end_file_pound_if; | |
787 print $out_fh "\n" . get_conditional_compile_line_end(); | |
788 } | |
789 | |
790 my $sources_list = "lib/unicore/mktables.lst"; | |
791 my @sources = ($0, qw(lib/unicore/mktables lib/Unicode/UCD.pm)); | |
792 { | |
793 # Depend on mktables’ own sources. It’s a shorter list of files than | |
794 # those that Unicode::UCD uses. | |
795 if (! open my $mktables_list, $sources_list) { | |
796 | |
797 # This should force a rebuild once $sources_list exists | |
798 push @sources, $sources_list; | |
799 } | |
800 else { | |
801 while(<$mktables_list>) { | |
802 last if /===/; | |
803 chomp; | |
804 push @sources, "lib/unicore/$_" if /^[^#]/; | |
805 } | |
806 } | |
807 } | |
808 read_only_bottom_close_and_rename($out_fh, \@sources) |