Mercurial > repo
changeset 8042:b080f67e1919
<xfix> fetch https://gist.githubusercontent.com/xfix/3c688ed7c63e1dde5777fa7cb268d613/raw/4f9b4b4d5ff0bf5da67788058081844f890d61fd/len.pl
author | HackBot |
---|---|
date | Sat, 14 May 2016 14:05:35 +0000 |
parents | 7ae5d5b9baca |
children | fd90e11cc6b9 |
files | len.pl |
diffstat | 1 files changed, 41 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/perl
use strict;
use warnings;
use 5.010;
use Encode qw/decode encode FB_CROAK LEAVE_SRC/;

# len.pl - report the length of the command-line text in several units.
#
# The arguments are joined by array interpolation (a single space unless $"
# has been changed) and the result is treated as a string of bytes.  If those
# bytes are valid UTF-8 the script prints the codepoint count, plus the
# grapheme, UTF-16 ("Java character") and byte counts whenever they differ
# from the codepoint count.  Otherwise it prints only the byte count.

# Return "$count $item", appending "s" to $item unless $count is exactly 1.
sub count_phrase {
    my ($count, $item) = @_;
    return $count == 1 ? "1 $item" : "$count ${item}s";
}

# Build the report for a byte string.
#
# Parameters:
#   $line - the raw bytes to measure.
# Returns:
#   The list of output lines (without trailing newlines), in print order.
sub len_report {
    my ($line) = @_;
    my $byte_count = length $line;

    # FB_CROAK makes malformed input die instead of being patched up;
    # LEAVE_SRC stops decode from modifying $line in place.
    my $unicode = eval { decode('UTF-8', $line, FB_CROAK | LEAVE_SRC) };

    # Not valid UTF-8: the byte count is the only meaningful measure.
    if ($@) {
        return count_phrase($byte_count, 'byte') . ' (UTF-8 not valid)';
    }

    my $codepoints = length $unicode;

    # "() = ..." forces list context so the assignment yields the match count.
    my $graphemes = () = $unicode =~ /\X/g;
    my $astral    = () = $unicode =~ /[\x{10000}-\x{10FFFF}]/g;

    # Each codepoint above U+FFFF is counted twice (it needs a surrogate
    # pair in UTF-16), every other codepoint once.
    my $utf16_units = $codepoints + $astral;

    my @report;
    push @report, count_phrase($graphemes, 'grapheme')
        if $graphemes != $codepoints;
    push @report, count_phrase($codepoints, 'codepoint');
    push @report, count_phrase($utf16_units, 'Java character')
        if $utf16_units != $codepoints;
    push @report, count_phrase($byte_count, 'UTF-8 byte')
        if $codepoints != $byte_count;
    return @report;
}

say for len_report("@ARGV");