comparison perl-5.22.2/regen/mk_invlists.pl @ 8045:a16537d2fe07

<xfix> tar xf perl-5.22.2.tar.gz # Ah, whatever, I\'m doing it anyway
author HackBot
date Sat, 14 May 2016 14:54:38 +0000
parents
children
comparison
equal deleted inserted replaced
8044:711c038a7dce 8045:a16537d2fe07
1 #!perl -w
2 use 5.015;
3 use strict;
4 use warnings;
5 use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
11 require 'regen/regen_lib.pl';
12 require 'regen/charset_translations.pl';
13
14 # This program outputs charclass_invlists.h, which contains various inversion
15 # lists in the form of C arrays that are to be used as-is for inversion lists.
16 # Thus, the lists it contains are essentially pre-compiled, and need only a
17 # light-weight fast wrapper to make them usable at run-time.
18
19 # As such, this code knows about the internal structure of these lists, and
20 # any change made to that has to be done here as well. A random number stored
21 # in the headers is used to minimize the possibility of things getting
22 # out-of-sync, or the wrong data structure being passed. Currently that
23 # random number is:
24
25 # charclass_invlists.h now also has a partial implementation of inversion
26 # maps; enough to generate tables for the line break properties, such as GCB
27
28 my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
29
30 # integer or float
31 my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
32
33 # Matches valid C language enum names: begins with ASCII alphabetic, then any
34 # ASCII \w
35 my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
37 my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
40
41 my $in_file_pound_if = 0;
42
43 print $out_fh "/* See the generating file for comments */\n\n";
44
45 # The symbols generated by this program are all currently defined only in a
46 # single dot c each. The code knows where most of them go, but this hash
47 # gives overrides for the exceptions to the typical place
48 my %exceptions_to_where_to_define =
49 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
50 AboveLatin1 => 'PERL_IN_REGCOMP_C',
51 Latin1 => 'PERL_IN_REGCOMP_C',
52 UpperLatin1 => 'PERL_IN_REGCOMP_C',
53 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
54 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
55 _Perl_IDCont => 'PERL_IN_UTF8_C',
56 _Perl_IDStart => 'PERL_IN_UTF8_C',
57 );
58
59 # This hash contains the properties with enums that have hard-coded references
60 # to them in C code. Its only use is to make sure that if perl is compiled
61 # with an older Unicode data set, that all the enum values the code is
62 # expecting will still be in the enum typedef. Thus the code doesn't have to
63 # change. The Unicode version won't have any code points that have these enum
64 # values, so the code that handles them will not get exercised. This is far
65 # better than having to #ifdef things.
66 my %hard_coded_enums =
67 ( gcb => [
68 'Control',
69 'CR',
70 'Extend',
71 'L',
72 'LF',
73 'LV',
74 'LVT',
75 'Other',
76 'Prepend',
77 'Regional_Indicator',
78 'SpacingMark',
79 'T',
80 'V',
81 ],
82 sb => [
83 'ATerm',
84 'Close',
85 'CR',
86 'Extend',
87 'Format',
88 'LF',
89 'Lower',
90 'Numeric',
91 'OLetter',
92 'Other',
93 'SContinue',
94 'Sep',
95 'Sp',
96 'STerm',
97 'Upper',
98 ],
99 wb => [
100 'ALetter',
101 'CR',
102 'Double_Quote',
103 'Extend',
104 'ExtendNumLet',
105 'Format',
106 'Hebrew_Letter',
107 'Katakana',
108 'LF',
109 'MidLetter',
110 'MidNum',
111 'MidNumLet',
112 'Newline',
113 'Numeric',
114 'Other',
115 'Regional_Indicator',
116 'Single_Quote',
117 'UNKNOWN',
118 ],
119 );
120
121 my @a2n;
122
123 sub uniques {
124 # Returns non-duplicated input values. From "Perl Best Practices:
125 # Encapsulated Cleverness". p. 455 in first edition.
126
127 my %seen;
128 return grep { ! $seen{$_}++ } @_;
129 }
130
131 sub a2n($) {
132 my $cp = shift;
133
134 # Returns the input Unicode code point translated to native.
135
136 return $cp if $cp !~ $numeric_re || $cp > 255;
137 return $a2n[$cp];
138 }
139
140 sub end_file_pound_if {
141 if ($in_file_pound_if) {
142 print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
143 $in_file_pound_if = 0;
144 }
145 }
146
147 sub switch_pound_if ($$) {
148 my $name = shift;
149 my $new_pound_if = shift;
150
151 # Switch to new #if given by the 2nd argument. If there is an override
152 # for this, it instead switches to that. The 1st argument is the
153 # static's name, used to look up the overrides
154
155 if (exists $exceptions_to_where_to_define{$name}) {
156 $new_pound_if = $exceptions_to_where_to_define{$name};
157 }
158
159 # Exit current #if if the new one is different from the old
160 if ($in_file_pound_if
161 && $in_file_pound_if !~ /$new_pound_if/)
162 {
163 end_file_pound_if;
164 }
165
166 # Enter new #if, if not already in it.
167 if (! $in_file_pound_if) {
168 $in_file_pound_if = "defined($new_pound_if)";
169 print $out_fh "\n#if $in_file_pound_if\n";
170 }
171 }
172
173 sub output_invlist ($$;$) {
174 my $name = shift;
175 my $invlist = shift; # Reference to inversion list array
176 my $charset = shift // ""; # name of character set for comment
177
178 die "No inversion list for $name" unless defined $invlist
179 && ref $invlist eq 'ARRAY'
180 && @$invlist;
181
182 # Output the inversion list $invlist using the name $name for it.
183 # It is output in the exact internal form for inversion lists.
184
185 # Is the last element of the header 0, or 1 ?
186 my $zero_or_one = 0;
187 if ($invlist->[0] != 0) {
188 unshift @$invlist, 0;
189 $zero_or_one = 1;
190 }
191 my $count = @$invlist;
192
193 switch_pound_if ($name, 'PERL_IN_PERL_C');
194
195 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
196 print $out_fh " /* for $charset */" if $charset;
197 print $out_fh "\n";
198
199 print $out_fh "\t$count,\t/* Number of elements */\n";
200 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
201 print $out_fh "\t", $zero_or_one,
202 ",\t/* 0 if the list starts at 0;",
203 "\n\t\t 1 if it starts at the element beyond 0 */\n";
204
205 # The main body are the UVs passed in to this routine. Do the final
206 # element separately
207 for my $i (0 .. @$invlist - 1) {
208 printf $out_fh "\t0x%X", $invlist->[$i];
209 print $out_fh "," if $i < @$invlist - 1;
210 print $out_fh "\n";
211 }
212
213 print $out_fh "};\n";
214 }
215
216 sub output_invmap ($$$$$$$) {
217 my $name = shift;
218 my $invmap = shift; # Reference to inversion map array
219 my $prop_name = shift;
220 my $input_format = shift; # The inversion map's format
221 my $default = shift; # The property value for code points who
222 # otherwise don't have a value specified.
223 my $extra_enums = shift; # comma-separated list of our additions to the
224 # property's standard possible values
225 my $charset = shift // ""; # name of character set for comment
226
227 # Output the inversion map $invmap for property $prop_name, but use $name
228 # as the actual data structure's name.
229
230 my $count = @$invmap;
231
232 my $output_format;
233 my $declaration_type;
234 my %enums;
235 my $name_prefix;
236
237 if ($input_format eq 's') {
238 $prop_name = (prop_aliases($prop_name))[1]; # Get full name
239 my $short_name = (prop_aliases($prop_name))[0];
240 my @enums = prop_values($prop_name);
241 if (! @enums) {
242 die "Only enum properties are currently handled; '$prop_name' isn't one";
243 }
244 else {
245
246 # Convert short names to long
247 @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums;
248
249 my @expected_enums = @{$hard_coded_enums{lc $short_name}};
250 die 'You need to update %hard_coded_enums to reflect new entries in this Unicode version'
251 if @expected_enums < @enums;
252
253 # Remove the enums found in the input from the ones we expect
254 for (my $i = @expected_enums - 1; $i >= 0; $i--) {
255 splice(@expected_enums, $i, 1)
256 if grep { $expected_enums[$i] eq $_ } @enums;
257 }
258
259 # The ones remaining must be because we're using an older
260 # Unicode version. Add them to the list.
261 push @enums, @expected_enums;
262
263 # Add in the extra values coded into this program, and sort.
264 push @enums, split /,/, $extra_enums if $extra_enums ne "";
265 @enums = sort @enums;
266
267 # Assign a value to each element of the enum. The default
268 # value always gets 0; the others are arbitrarily assigned.
269 my $enum_val = 0;
270 $default = prop_value_aliases($prop_name, $default);
271 $enums{$default} = $enum_val++;
272 for my $enum (@enums) {
273 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
274 }
275 }
276
277 # Inversion map stuff is currently used only by regexec
278 switch_pound_if($name, 'PERL_IN_REGEXEC_C');
279 {
280
281 # The short names tend to be two lower case letters, but it looks
282 # better for those if they are upper. XXX
283 $short_name = uc($short_name) if length($short_name) < 3
284 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
285 $name_prefix = "${short_name}_";
286 my $enum_count = keys %enums;
287 print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
288
289 print $out_fh "\ntypedef enum {\n";
290 print $out_fh "\t${name_prefix}$default = $enums{$default},\n";
291 delete $enums{$default};
292 foreach my $enum (sort { $a cmp $b } keys %enums) {
293 print $out_fh "\t${name_prefix}$enum = $enums{$enum}";
294 print $out_fh "," if $enums{$enum} < $enum_count - 1;
295 print $out_fh "\n";
296 }
297 $declaration_type = "${name_prefix}enum";
298 print $out_fh "} $declaration_type;\n";
299
300 $output_format = "${name_prefix}%s";
301 }
302 }
303 else {
304 die "'$input_format' invmap() format for '$prop_name' unimplemented";
305 }
306
307 die "No inversion map for $prop_name" unless defined $invmap
308 && ref $invmap eq 'ARRAY'
309 && $count;
310
311 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
312 print $out_fh " /* for $charset */" if $charset;
313 print $out_fh "\n";
314
315 # The main body are the scalars passed in to this routine.
316 for my $i (0 .. $count - 1) {
317 my $element = $invmap->[$i];
318 $element = $name_prefix . prop_value_aliases($prop_name, $element);
319 print $out_fh "\t$element";
320 print $out_fh "," if $i < $count - 1;
321 print $out_fh "\n";
322 }
323 print $out_fh "};\n";
324 }
325
326 sub mk_invlist_from_sorted_cp_list {
327
328 # Returns an inversion list constructed from the sorted input array of
329 # code points
330
331 my $list_ref = shift;
332
333 return unless @$list_ref;
334
335 # Initialize to just the first element
336 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
337
338 # For each succeeding element, if it extends the previous range, adjust
339 # up, otherwise add it.
340 for my $i (1 .. @$list_ref - 1) {
341 if ($invlist[-1] == $list_ref->[$i]) {
342 $invlist[-1]++;
343 }
344 else {
345 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
346 }
347 }
348 return @invlist;
349 }
350
351 # Read in the Case Folding rules, and construct arrays of code points for the
352 # properties we need.
353 my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
354 die "Could not find inversion map for Case_Folding" unless defined $format;
355 die "Incorrect format '$format' for Case_Folding inversion map"
356 unless $format eq 'al';
357 my @has_multi_char_fold;
358 my @is_non_final_fold;
359
360 for my $i (0 .. @$folds_ref - 1) {
361 next unless ref $folds_ref->[$i]; # Skip single-char folds
362 push @has_multi_char_fold, $cp_ref->[$i];
363
364 # Add to the non-finals list each code point that is in a non-final
365 # position
366 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
367 push @is_non_final_fold, $folds_ref->[$i][$j]
368 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
369 }
370 }
371
372 sub _Perl_Non_Final_Folds {
373 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
374 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
375 }
376
377 sub prop_name_for_cmp ($) { # Sort helper
378 my $name = shift;
379
380 # Returns the input lowercased, with non-alphas removed, as well as
381 # everything starting with a comma
382
383 $name =~ s/,.*//;
384 $name =~ s/[[:^alpha:]]//g;
385 return lc $name;
386 }
387
388 sub UpperLatin1 {
389 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
390 }
391
392 output_invlist("Latin1", [ 0, 256 ]);
393 output_invlist("AboveLatin1", [ 256 ]);
394
395 end_file_pound_if;
396
397 # We construct lists for all the POSIX and backslash sequence character
398 # classes in two forms:
399 # 1) ones which match only in the ASCII range
400 # 2) ones which match either in the Latin1 range, or the entire Unicode range
401 #
402 # These get compiled in, and hence affect the memory footprint of every Perl
403 # program, even those not using Unicode. To minimize the size, currently
404 # the Latin1 version is generated for the beyond ASCII range except for those
405 # lists that are quite small for the entire range, such as for \s, which is 22
406 # UVs long plus 4 UVs (currently) for the header.
407 #
408 # To save even more memory, the ASCII versions could be derived from the
409 # larger ones at runtime, saving some memory (minus the expense of the machine
410 # instructions to do so), but these are all small anyway, so their total is
411 # about 100 UVs.
412 #
413 # In the list of properties below that get generated, the L1 prefix is a fake
414 # property that means just the Latin1 range of the full property (whose name
415 # has an X prefix instead of L1).
416 #
417 # An initial & means to use the subroutine from this file instead of an
418 # official inversion list.
419
420 for my $charset (get_supported_code_pages()) {
421 print $out_fh "\n" . get_conditional_compile_line_start($charset);
422
423 @a2n = @{get_a2n($charset)};
424 no warnings 'qw';
425 # Ignore non-alpha in sort
426 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
427 ASCII
428 Cased
429 VertSpace
430 XPerlSpace
431 XPosixAlnum
432 XPosixAlpha
433 XPosixBlank
434 XPosixCntrl
435 XPosixDigit
436 XPosixGraph
437 XPosixLower
438 XPosixPrint
439 XPosixPunct
440 XPosixSpace
441 XPosixUpper
442 XPosixWord
443 XPosixXDigit
444 _Perl_Any_Folds
445 &NonL1_Perl_Non_Final_Folds
446 _Perl_Folds_To_Multi_Char
447 &UpperLatin1
448 _Perl_IDStart
449 _Perl_IDCont
450 Grapheme_Cluster_Break,EDGE
451 Word_Break,EDGE,UNKNOWN
452 Sentence_Break,EDGE
453 )
454 ) {
455
456 # For the Latin1 properties, we change to use the eXtended version of the
457 # base property, then go through the result and get rid of everything not
458 # in Latin1 (above 255). Actually, we retain the element for the range
459 # that crosses the 255/256 boundary if it is one that matches the
460 # property. For example, in the Word property, there is a range of code
461 # points that start at U+00F8 and goes through U+02C1. Instead of
462 # artificially cutting that off at 256 because 256 is the first code point
463 # above Latin1, we let the range go to its natural ending. That gives us
464 # extra information with no added space taken. But if the range that
465 # crosses the boundary is one that doesn't match the property, we don't
466 # start a new range above 255, as that could be construed as going to
467 # infinity. For example, the Upper property doesn't include the character
468 # at 255, but does include the one at 256. We don't include the 256 one.
469 my $prop_name = $prop;
470 my $is_local_sub = $prop_name =~ s/^&//;
471 my $extra_enums = "";
472 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
473 my $lookup_prop = $prop_name;
474 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
475 or $lookup_prop =~ s/^L1//);
476 my $nonl1_only = 0;
477 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
478 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
479
480 my @invlist;
481 my @invmap;
482 my $map_format;
483 my $map_default;
484 my $maps_to_code_point;
485 my $to_adjust;
486 if ($is_local_sub) {
487 @invlist = eval $lookup_prop;
488 }
489 else {
490 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
491 if (! @invlist) {
492 my ($list_ref, $map_ref, $format, $default);
493
494 ($list_ref, $map_ref, $format, $default)
495 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
496 die "Could not find inversion list for '$lookup_prop'" unless $list_ref;
497 @invlist = @$list_ref;
498 @invmap = @$map_ref;
499 $map_format = $format;
500 $map_default = $default;
501 $maps_to_code_point = $map_format =~ /x/;
502 $to_adjust = $map_format =~ /a/;
503 }
504 }
505 die "Could not find inversion list for '$lookup_prop'" unless @invlist;
506
507 # Re-order the Unicode code points to native ones for this platform.
508 # This is only needed for code points below 256, because native code
509 # points are only in that range. For inversion maps of properties
510 # where the mappings are adjusted (format =~ /a/), this reordering
511 # could mess up the adjustment pattern that was in the input, so that
512 # has to be dealt with.
513 #
514 # And inversion maps that map to code points need to eventually have
515 # all those code points remapped to native, and it's better to do that
516 # here, going through the whole list not just those below 256. This
517 # is because some inversion maps have adjustments (format =~ /a/)
518 # which may be affected by the reordering. This code needs to be done
519 # both for when we are translating the inversion lists for < 256, and
520 # for the inversion maps for everything. By doing both in this loop,
521 # we can share that code.
522 #
523 # So, we go through everything for an inversion map to code points;
524 # otherwise, we can skip any remapping at all if we are going to
525 # output only the above-Latin1 values, or if the range spans the whole
526 # of 0..256, as the remap will also include all of 0..256 (256 not
527 # 255 because a re-ordering could cause 256 to need to be in the same
528 # range as 255.)
529 if ((@invmap && $maps_to_code_point)
530 || (! $nonl1_only || ($invlist[0] < 256
531 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
532 {
533
534 if (! @invmap) { # Straight inversion list
535 # Look at all the ranges that start before 257.
536 my @latin1_list;
537 while (@invlist) {
538 last if $invlist[0] > 256;
539 my $upper = @invlist > 1
540 ? $invlist[1] - 1 # In range
541
542 # To infinity. You may want to stop much much
543 # earlier; going this high may expose perl
544 # deficiencies with very large numbers.
545 : $Unicode::UCD::MAX_CP;
546 for my $j ($invlist[0] .. $upper) {
547 push @latin1_list, a2n($j);
548 }
549
550 shift @invlist; # Shift off the range that's in the list
551 shift @invlist; # Shift off the range not in the list
552 }
553
554 # Here @invlist contains all the ranges in the original that start
555 # at code points above 256, and @latin1_list contains all the
556 # native code points for ranges that start with a Unicode code
557 # point below 257. We sort the latter and convert it to inversion
558 # list format. Then simply prepend it to the list of the higher
559 # code points.
560 @latin1_list = sort { $a <=> $b } @latin1_list;
561 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
562 unshift @invlist, @latin1_list;
563 }
564 else { # Is an inversion map
565
566 # This is a similar procedure as plain inversion list, but has
567 # multiple buckets. A plain inversion list just has two
568 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
569 # pretty much can ignore the 2nd bucket, as it is completely
570 # defined by the 1st. But here, what we do is create buckets
571 # which contain the code points that map to each, translated
572 # to native and turned into an inversion list. Thus each
573 # bucket is an inversion list of native code points that map
574 # to it or don't map to it. We use these to create an
575 # inversion map for the whole property.
576
577 # As mentioned earlier, we use this procedure to not just
578 # remap the inversion list to native values, but also the maps
579 # of code points to native ones. In the latter case we have
580 # to look at the whole of the inversion map (or at least to
581 # above Unicode; as the maps of code points above that should
582 # all be to the default).
583 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
584
585 my %mapped_lists; # A hash whose keys are the buckets.
586 while (@invlist) {
587 last if $invlist[0] > $upper_limit;
588
589 # This shouldn't actually happen, as prop_invmap() returns
590 # an extra element at the end that is beyond $upper_limit
591 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
592
593 my $bucket;
594
595 # A hash key can't be a ref (we are only expecting arrays
596 # of scalars here), so convert any such to a string that
597 # will be converted back later (using a vertical tab as
598 # the separator). Even if the mapping is to code points,
599 # we don't translate to native here because the code
600 # output_map() calls to output these arrays assumes the
601 # input is Unicode, not native.
602 if (ref $invmap[0]) {
603 $bucket = join "\cK", @{$invmap[0]};
604 }
605 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
606
607 # Do convert to native for maps to single code points.
608 # There are some properties that have a few outlier
609 # maps that aren't code points, so the above test
610 # skips those.
611 $bucket = a2n($invmap[0]);
612 } else {
613 $bucket = $invmap[0];
614 }
615
616 # We now have the bucket that all code points in the range
617 # map to, though possibly they need to be adjusted. Go
618 # through the range and put each translated code point in
619 # it into its bucket.
620 my $base_map = $invmap[0];
621 for my $j ($invlist[0] .. $invlist[1] - 1) {
622 if ($to_adjust
623 # The 1st code point doesn't need adjusting
624 && $j > $invlist[0]
625
626 # Skip any non-numeric maps: these are outliers
627 # that aren't code points.
628 && $base_map =~ $numeric_re
629
630 # 'ne' because the default can be a string
631 && $base_map ne $map_default)
632 {
633 # We adjust, by incrementing each the bucket and
634 # the map. For code point maps, translate to
635 # native
636 $base_map++;
637 $bucket = ($maps_to_code_point)
638 ? a2n($base_map)
639 : $base_map;
640 }
641
642 # Add the native code point to the bucket for the
643 # current map
644 push @{$mapped_lists{$bucket}}, a2n($j);
645 } # End of loop through all code points in the range
646
647 # Get ready for the next range
648 shift @invlist;
649 shift @invmap;
650 } # End of loop through all ranges in the map.
651
652 # Here, @invlist and @invmap retain all the ranges from the
653 # originals that start with code points above $upper_limit.
654 # Each bucket in %mapped_lists contains all the code points
655 # that map to that bucket. If the bucket is for a map to a
656 # single code point is a single code point, the bucket has
657 # been converted to native. If something else (including
658 # multiple code points), no conversion is done.
659 #
660 # Now we recreate the inversion map into %xlated, but this
661 # time for the native character set.
662 my %xlated;
663 foreach my $bucket (keys %mapped_lists) {
664
665 # Sort and convert this bucket to an inversion list. The
666 # result will be that ranges that start with even-numbered
667 # indexes will be for code points that map to this bucket;
668 # odd ones map to some other bucket, and are discarded
669 # below.
670 @{$mapped_lists{$bucket}}
671 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
672 @{$mapped_lists{$bucket}}
673 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
674
675 # Add each even-numbered range in the bucket to %xlated;
676 # so that the keys of %xlated become the range start code
677 # points, and the values are their corresponding maps.
678 while (@{$mapped_lists{$bucket}}) {
679 my $range_start = $mapped_lists{$bucket}->[0];
680 if ($bucket =~ /\cK/) {
681 @{$xlated{$range_start}} = split /\cK/, $bucket;
682 }
683 else {
684 $xlated{$range_start} = $bucket;
685 }
686 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
687 shift @{$mapped_lists{$bucket}}; # Get ready for next
688 # iteration
689 }
690 } # End of loop through all the buckets.
691
692 # Here %xlated's keys are the range starts of all the code
693 # points in the inversion map. Construct an inversion list
694 # from them.
695 my @new_invlist = sort { $a <=> $b } keys %xlated;
696
697 # If the list is adjusted, we want to munge this list so that
698 # we only have one entry for where consecutive code points map
699 # to consecutive values. We just skip the subsequent entries
700 # where this is the case.
701 if ($to_adjust) {
702 my @temp;
703 for my $i (0 .. @new_invlist - 1) {
704 next if $i > 0
705 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
706 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
707 && $xlated{$new_invlist[$i]} =~ $numeric_re
708 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
709 push @temp, $new_invlist[$i];
710 }
711 @new_invlist = @temp;
712 }
713
714 # The inversion map comes from %xlated's values. We can
715 # unshift each onto the front of the untouched portion, in
716 # reverse order of the portion we did process.
717 foreach my $start (reverse @new_invlist) {
718 unshift @invmap, $xlated{$start};
719 }
720
721 # Finally prepend the inversion list we have just constructed to the
722 # one that contains anything we didn't process.
723 unshift @invlist, @new_invlist;
724 }
725 }
726
727 # prop_invmap() returns an extra final entry, which we can now
728 # discard.
729 if (@invmap) {
730 pop @invlist;
731 pop @invmap;
732 }
733
734 if ($l1_only) {
735 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
736 for my $i (0 .. @invlist - 1 - 1) {
737 if ($invlist[$i] > 255) {
738
739 # In an inversion list, even-numbered elements give the code
740 # points that begin ranges that match the property;
741 # odd-numbered give ones that begin ranges that don't match.
742 # If $i is odd, we are at the first code point above 255 that
743 # doesn't match, which means the range it is ending does
744 # match, and crosses the 255/256 boundary. We want to include
745 # this ending point, so increment $i, so the splice below
746 # includes it. Conversely, if $i is even, it is the first
747 # code point above 255 that matches, which means there was no
748 # matching range that crossed the boundary, and we don't want
749 # to include this code point, so splice before it.
750 $i++ if $i % 2 != 0;
751
752 # Remove everything past this.
753 splice @invlist, $i;
754 splice @invmap, $i if @invmap;
755 last;
756 }
757 }
758 }
759 elsif ($nonl1_only) {
760 my $found_nonl1 = 0;
761 for my $i (0 .. @invlist - 1 - 1) {
762 next if $invlist[$i] < 256;
763
764 # Here, we have the first element in the array that indicates an
765 # element above Latin1. Get rid of all previous ones.
766 splice @invlist, 0, $i;
767 splice @invmap, 0, $i if @invmap;
768
769 # If this one's index is not divisible by 2, it means that this
770 # element is inverting away from being in the list, which means
771 # all code points from 256 to this one are in this list (or
772 # map to the default for inversion maps)
773 if ($i % 2 != 0) {
774 unshift @invlist, 256;
775 unshift @invmap, $map_default if @invmap;
776 }
777 $found_nonl1 = 1;
778 last;
779 }
780 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
781 }
782
783 output_invlist($prop_name, \@invlist, $charset);
784 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
785 }
786 end_file_pound_if;
787 print $out_fh "\n" . get_conditional_compile_line_end();
788 }
789
790 my $sources_list = "lib/unicore/mktables.lst";
791 my @sources = ($0, qw(lib/unicore/mktables lib/Unicode/UCD.pm));
792 {
793 # Depend on mktables’ own sources. It’s a shorter list of files than
794 # those that Unicode::UCD uses.
795 if (! open my $mktables_list, $sources_list) {
796
797 # This should force a rebuild once $sources_list exists
798 push @sources, $sources_list;
799 }
800 else {
801 while(<$mktables_list>) {
802 last if /===/;
803 chomp;
804 push @sources, "lib/unicore/$_" if /^[^#]/;
805 }
806 }
807 }
808 read_only_bottom_close_and_rename($out_fh, \@sources)