changeset 8040:3021689c6749

<xfix> fetch https://gist.githubusercontent.com/xfix/3c688ed7c63e1dde5777fa7cb268d613/raw/4f9b4b4d5ff0bf5da67788058081844f890d61fd/len.pl
author HackBot
date Sat, 14 May 2016 14:03:32 +0000
parents 4fd4022b9652
children 7ae5d5b9baca
files len.pl
diffstat 1 files changed, 41 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/len.pl	Sat May 14 14:03:32 2016 +0000
@@ -0,0 +1,41 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use 5.010;
+use Encode qw/decode encode FB_CROAK LEAVE_SRC/;
+
+# Print "$count $item" on its own line, pluralizing $item with a trailing "s"
+# unless $count is numerically equal to 1.
+sub put {
+    my ($count, $item) = @_;
+    # The singular form prints a literal 1, whatever the spelling of $count.
+    my $text = $count == 1 ? "1 $item" : "$count ${item}s";
+    say $text;
+}
+
+# Main: report the length of the command-line text in several units.
+my $line = "@ARGV";    # arguments joined by $" (a single space by default)
+
+# LEAVE_SRC keeps decode from modifying $line, whose length is reported below.
+# The trailing 1 marks success, so failure detection does not depend on $@.
+my $unicode;
+my $decode_ok = eval {
+    $unicode = decode "UTF-8", $line, FB_CROAK | LEAVE_SRC;
+    1;
+};
+if (!$decode_ok) {
+    # Not valid UTF-8: only the raw length of $line can be reported.
+    my $modifier = length($line) == 1 ? '' : 's';
+    say length($line)." byte$modifier (UTF-8 not valid)";
+} else {
+    my $codepoints = length $unicode;
+    my $graphemes  = () = $unicode =~ /\X/g;
+    # Each codepoint above U+FFFF is counted as one extra Java character.
+    my $astral     = () = $unicode =~ /[\x{10000}-\x{10FFFF}]/g;
+    my $java_chars = $codepoints + $astral;
+    # Units whose count equals the codepoint count are not printed.
+    put($graphemes, 'grapheme') if $graphemes != $codepoints;
+    put($codepoints, 'codepoint');
+    put($java_chars, 'Java character') if $java_chars != $codepoints;
+    put(length($line), 'UTF-8 byte') if $codepoints != length $line;
+}
\ No newline at end of file