8043
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use 5.010;
|
|
5 use Encode qw/decode encode FB_CROAK LEAVE_SRC/;
|
|
6
|
|
# Print a count followed by a unit name, adding a plural "s" to the unit
# whenever the count is anything other than exactly one.
#
#   $count - number to report (compared numerically against 1)
#   $item  - singular name of the unit, e.g. 'codepoint'
#
# Writes one line to the currently selected output handle.
sub put {
    my ($count, $item) = @_;

    # The singular form always prints a literal "1", whatever textual form
    # the caller's count happened to have.
    my $text = $count == 1 ? "1 $item" : "$count ${item}s";

    say $text;
}
|
|
15
|
|
# Report the size of the command-line text in several units: grapheme
# clusters, code points, UTF-16 code units ("Java characters") and UTF-8
# bytes.  The code point count is always printed; every other unit is printed
# only when its count differs from the code point count.

# All arguments joined by single spaces; still undecoded bytes at this point.
my $line = "@ARGV";

# Decode strictly: FB_CROAK makes malformed input die, and LEAVE_SRC keeps
# decode() from overwriting $line, whose byte length is still needed below.
# The eval block ends in a true value so that success is read from eval's own
# result instead of from $@, which perls older than 5.14 may reset while the
# eval is unwinding.
my $unicode;
my $decoded_ok = eval {
    $unicode = decode('UTF-8', $line, FB_CROAK | LEAVE_SRC);
    1;
};

if (!$decoded_ok) {
    # Not valid UTF-8: the raw byte count is all that can be reported.
    my $modifier = length($line) == 1 ? '' : 's';
    say length($line) . " byte$modifier (UTF-8 not valid)";
}
else {
    my $codepoints = length $unicode;
    my $bytes      = length $line;

    # \X matches one grapheme cluster; the empty-list assignment counts the
    # matches without keeping them.
    my $graphemes = () = $unicode =~ /\X/g;

    # Code points above U+FFFF need a surrogate pair in UTF-16, so each of
    # them contributes one extra 16-bit unit.
    my $supplementary = () = $unicode =~ /[\x{10000}-\x{10FFFF}]/g;
    my $utf16_units   = $codepoints + $supplementary;

    put($graphemes, 'grapheme') if $graphemes != $codepoints;
    put($codepoints, 'codepoint');
    put($utf16_units, 'Java character') if $utf16_units != $codepoints;
    put($bytes, 'UTF-8 byte') if $bytes != $codepoints;
}