Authors: Tom Bishop (tbishop@wenlin.com) and Richard Cook (rscook@wenlin.com).
Updated October, 2009.
This is not a finalized specification. It is still at the "draft proposal" stage and may change.
The following Perl script implements conversion, in both directions, between code points and UTF-X encodings.
| Command-line option | Set | Maximum code point | 8-bit | 16-bit | 32-bit |
|---|---|---|---|---|---|
| -6 (default) | UCS-M | U+10FFFF | UTF-8 | UTF-16 | UTF-32 |
| -8 | UCS-G | U+7FFFFFFF | UTF-G-8 | UTF-G-16 | UTF-G-32 |
| -16 | UCS-E | U+7FFFFFFFFFFFFFFF | UTF-E-8 | UTF-E-16 | UTF-E-32 |
| -32, -64 or -128 | UCS-∞ | (maxNUD = 32, 64, or 128) | UTF-∞-8 | UTF-∞-16 | UTF-∞-32 |
In order to illustrate the conversion algorithm explicitly, this implementation intentionally avoids using Perl's built-in capabilities for processing Unicode. It does not require a 64-bit processor. It checks validity by verifying that the output of round-trip conversion is identical to the input. Some CGI and other details of the web-based version are not included here.
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(gettimeofday);
my $startTime = gettimeofday;
# Command line can have either "U+..." (e.g. "U+123456789"),
# or a UTF code in hexadecimal, with spaces between code units
# (e.g. "FE 84 A3 91 96 9E 89" for UTF-E-8,
# "DD24 DED1 DEB3 DF89" for UTF-E-16,
# or "F0000012 E3456789" for UTF-E-32).
my $maxNUD = 6; my $utfName = 'UTF';
my $usv = '?'; my $u8 = '?'; my $u16 = '?'; my $u32 = '?';
if ($ARGV[0] eq '-6') { # max U+10FFFF
shift; $maxNUD = 6; $utfName = 'UTF';
}
elsif ($ARGV[0] eq '-8') { # max U+7FFFFFFF
shift; $maxNUD = 8; $utfName = 'UTF-G';
}
elsif ($ARGV[0] eq '-16') { # max U+7FFFFFFFFFFFFFFF
shift; $maxNUD = 16; $utfName = 'UTF-E';
}
elsif ($ARGV[0] eq '-32' || $ARGV[0] eq '-64' || $ARGV[0] eq '-128') {
$maxNUD = shift; $maxNUD =~ s/^-//; $utfName = 'UTF-∞';
}
if ($ARGV[0] =~ /^U\+/) {
$usv = uc($ARGV[0]);
unless ($usv =~ /^U\+[0-9A-F]+$/) {
$usv = '?';
}
}
else {
my $utf = uc(join(' ', @ARGV));
if ($utf !~ /^[0-9A-F ]+$/) {
$usv = '?';
}
elsif ($utf =~ /^[0-9A-F]{6}/) {
$usv = ConvertUTFX32ToUSV($u32 = $utf, $maxNUD);
}
elsif ($utf =~ /^[0-9A-F]{4}/) {
$usv = ConvertUTFX16ToUSV($u16 = $utf, $maxNUD);
}
elsif ($utf =~ /^[0-9A-F]{2}/) {
$usv = ConvertUTFX8ToUSV($u8 = $utf, $maxNUD);
}
else {
$usv = '?';
}
}
if ($usv ne '?') {
$u8 = ConvertUSVToUTFX8($usv, $maxNUD) if ($u8 eq '?');
$u16 = ConvertUSVToUTFX16($usv, $maxNUD) if ($u16 eq '?');
$u32 = ConvertUSVToUTFX32($usv, $maxNUD) if ($u32 eq '?');
if ($u8 eq '?' || $u16 eq '?' || $u32 eq '?') {
$usv = '?';
}
}
if ($usv eq '?') {
print "Invalid input.\n";
}
else {
print "USV = $usv\n";
print "$utfName-8 = $u8\n";
print "$utfName-16 = $u16\n";
print "$utfName-32 = $u32\n";
}
printf("Finished in %.5f seconds.\n", gettimeofday() - $startTime);
###### High-level routines with round-trip checking ######
sub ConvertUSVToUTFX8 {
my ($usv, $maxNUD) = @_;
my $utf = ConvertUSVToUTFX8LL($usv, $maxNUD);
return '?' if ($utf eq '?');
return (ConvertUTFX8ToUSVLL($utf, $maxNUD) eq $usv) ? $utf : '?';
} # ConvertUSVToUTFX8
sub ConvertUSVToUTFX16 {
my ($usv, $maxNUD) = @_;
my $utf = ConvertUSVToUTFX16LL($usv, $maxNUD);
return '?' if ($utf eq '?');
return (ConvertUTFX16ToUSVLL($utf, $maxNUD) eq $usv) ? $utf : '?';
} # ConvertUSVToUTFX16
sub ConvertUSVToUTFX32 {
my ($usv, $maxNUD) = @_;
my $utf = ConvertUSVToUTFX32LL($usv, $maxNUD);
return '?' if ($utf eq '?');
return (ConvertUTFX32ToUSVLL($utf, $maxNUD) eq $usv) ? $utf : '?';
} # ConvertUSVToUTFX32
sub ConvertUTFX8ToUSV {
my ($utf, $maxNUD) = @_;
my $usv = ConvertUTFX8ToUSVLL($utf, $maxNUD);
return '?' if ($usv eq '?');
return (ConvertUSVToUTFX8LL($usv, $maxNUD) eq $utf) ? $usv : '?';
} # ConvertUTFX8ToUSV
sub ConvertUTFX16ToUSV {
my ($utf, $maxNUD) = @_;
my $usv = ConvertUTFX16ToUSVLL($utf, $maxNUD);
return '?' if ($usv eq '?');
return (ConvertUSVToUTFX16LL($usv, $maxNUD) eq $utf) ? $usv : '?';
} # ConvertUTFX16ToUSV
sub ConvertUTFX32ToUSV {
my ($utf, $maxNUD) = @_;
my $usv = ConvertUTFX32ToUSVLL($utf, $maxNUD);
return '?' if ($usv eq '?');
return (ConvertUSVToUTFX32LL($usv, $maxNUD) eq $utf) ? $usv : '?';
} # ConvertUTFX32ToUSV
# Following are low-level ("LL") subroutines.
####################### UTF-X-8 #######################
sub ConvertUSVToUTFX8LL {
my ($h, $NUD) = USVPrep(@_);
return '?' if ($h eq '?');
if ($NUD < 8 || ($NUD == 8 && $h =~ /^[1-7]/)) {
return ConvertUSVToUTF8SixByte(hex($h));
}
my $utf = ($NUD > 9) ? 'FF' : 'FE';
if ($NUD < 18 || ($NUD == 18 && $h =~ /^[1-7]/)) {
# Make a seven-byte or thirteen-byte code.
if ($NUD == 8) {
# Left-pad $h with a zero to make nine digits
$h = '0' . $h;
}
elsif ($NUD > 9 && $NUD < 18) {
# Left-pad $h with zeros to make eighteen digits
$h = ('0' x (18 - $NUD)) . $h;
}
}
else { # Make fourteen-byte or longer code.
my $NME = $NUD - 18;
my @nmeHexArray = split(//, sprintf('%X', $NME));
$utf .= (' B4' x (@nmeHexArray - 1));
foreach my $nmeHexDigit (@nmeHexArray) {
$utf .= ' A' . $nmeHexDigit;
}
# Left-pad $h with one or two zeros as needed to make multiple of 3
my $m = ($NUD % 3);
if ($m == 2) {
$h = '0' . $h;
}
elsif ($m == 1) {
$h = '00' . $h;
}
}
return PackThreeUdigitsPerBytePair($utf, $h);
} # ConvertUSVToUTFX8LL
sub ConvertUTFX8ToUSVLL {
my ($utf, $maxNUD) = @_;
@_ = split(/ /, $utf);
my $init = $_[0];
return ConvertUTF8SixByteToUSV(@_) unless ($init eq 'FE' || $init eq 'FF');
shift; # $init = FE or FF
my $USVStorageByteCount = 0;
if ($init eq 'FE') {
$USVStorageByteCount = 6; # seven-byte code
}
elsif ($_[0] =~ /^[89]/) {
$USVStorageByteCount = 12; # thirteen-byte code
}
else { # fourteen-byte or longer code
my $AxCount = 1;
my $lengthByte;
while (($lengthByte = shift) eq 'B4') {
$AxCount++;
}
my $NME = 0;
for (;;) {
my $Ax = hex($lengthByte);
return '?' unless (($Ax | 0x0F) == 0xAF);
$NME = ($NME << 4) + ($Ax & 0x0F);
return '?' if ($NME + 14 > $maxNUD);
last if (--$AxCount == 0);
$lengthByte = shift;
}
$USVStorageByteCount = int(($NME + 18 + 2) / 3) * 2;
}
return '?' unless ($USVStorageByteCount == @_);
return UnpackThreeUdigitsPerBytePair($maxNUD, @_);
} # ConvertUTFX8ToUSVLL
sub PackThreeUdigitsPerBytePair {
my ($utf, $usv) = @_;
while ($usv =~ s/^(.)(.)(.)//) {
my ($x, $y, $z) = (hex($1), hex($2), hex($3));
$utf .= sprintf(' %02X %02X', (0x80|($x<<2)|($y>>2)), (0x80|(($y&3)<<4)|$z));
}
return $utf;
} # PackThreeUdigitsPerBytePair
sub UnpackThreeUdigitsPerBytePair {
my $maxNUD = shift;
my $usv = '';
my $NUD = 0;
while (defined(my $u = shift) && defined(my $v = shift)) {
$u = hex($u) & 0x3F;
$v = hex($v) & 0x3F;
$usv .= sprintf('%X%X%X', ($u>>2), ((($u&3)<<2)|($v>>4)), ($v&0x0F));
$NUD += 3;
return '?' if ($NUD - 2 > $maxNUD); # minus 2 since leading zeros don't count
}
$usv =~ s/^0*//;
$NUD = length($usv);
return ($NUD > $maxNUD) ? '?' : "U+$usv";
} # UnpackThreeUdigitsPerBytePair
# The remainder of the UTF-X-8 code just implements the original
# form of UTF-8 up to six bytes for U+7FFFFFFF.
sub ConvertUSVToUTF8SixByte {
my $x = shift;
if ($x <= 0x7F) {
return sprintf("%02X", $x);
}
if ($x <= 0x7FF) {
return sprintf("%02X %02X",
0xC0|($x>>6),
0x80|($x&0x3F));
}
if ($x <= 0xFFFF) {
return sprintf("%02X %02X %02X",
0xE0|($x>>12),
0x80|(($x>>6)&0x3F),
0x80|($x&0x3F));
}
if ($x <= 0x1FFFFF) {
return sprintf("%02X %02X %02X %02X",
0xF0|($x>>18),
0x80|(($x>>12)&0x3F),
0x80|(($x>>6)&0x3F),
0x80|($x&0x3F));
}
if ($x <= 0x3FFFFFF) {
return sprintf("%02X %02X %02X %02X %02X",
0xF8|($x>>24),
0x80|(($x>>18)&0x3F),
0x80|(($x>>12)&0x3F),
0x80|(($x>>6)&0x3F),
0x80|($x&0x3F));
}
if ($x <= 0x7FFFFFFF) {
return sprintf("%02X %02X %02X %02X %02X %02X",
0xFC|($x>>30),
0x80|(($x>>24)&0x3F),
0x80|(($x>>18)&0x3F),
0x80|(($x>>12)&0x3F),
0x80|(($x>>6)&0x3F),
0x80|($x&0x3F));
}
return '?';
} # ConvertUSVToUTF8SixByte
sub ConvertUTF8SixByteToUSV {
my $i = ConvertUTF8SixByteToInteger(@_);
if ($i < 0 || ($i >= 0xD800 && $i <= 0xDFFF)) {
return '?';
}
return sprintf('U+%04X', $i);
} # ConvertUTF8SixByteToUSV
sub ConvertUTF8SixByteToInteger {
return -1 unless defined $_[0];
my $b0 = hex(shift);
if ($b0 <= 0x7F) {
return $b0;
}
return -1 unless defined $_[0];
my $b1 = hex(shift) & 0x3F;
if ($b0 <= 0xDF) {
return (($b0&0x1F)<<6)|$b1;
}
return -1 unless defined $_[0];
my $b2 = hex(shift) & 0x3F;
if ($b0 <= 0xEF) {
return (($b0&0xF)<<12)|($b1<<6)|$b2;
}
return -1 unless defined $_[0];
my $b3 = hex(shift) & 0x3F;
if ($b0 <= 0xF7) {
return (($b0&7)<<18)|($b1<<12)|($b2<<6)|$b3;
}
return -1 unless defined $_[0];
my $b4 = hex(shift) & 0x3F;
if ($b0 <= 0xFB) {
return (($b0&3)<<24)|($b1<<18)|($b2<<12)|($b3<<6)|$b4;
}
return -1 unless defined $_[0];
my $b5 = hex(shift) & 0x3F;
if ($b0 <= 0xFD) {
return (($b0&1)<<30)|($b1<<24)|($b2<<18)|($b3<<12)|($b4<<6)|$b5;
}
return -1;
} # ConvertUTF8SixByteToInteger
####################### UTF-X-16 #######################
sub ConvertUSVToUTFX16LL {
my ($h, $NUD) = USVPrep(@_);
return '?' if ($h eq '?');
if ($NUD <= 4) { # one-unit code (BMP)
return ('0' x (4 - $NUD)) . $h;
}
if ($NUD < 6 || ($NUD == 6 && hex($h) <= 0x10ffff)) {
my $n = hex($h); # two-unit code: surrogate pair
return sprintf('%04X %04X', (0xd800 | (($n - 0x10000) >> 10)), (0xdc00 | ($n & 0x03ff)));
}
my $utf = '';
my $nUSVUnits;
my @bin = split(//, HexToBin($h));
my $bitsNeeded = @bin; # Note: $bitsNeeded >= 21
if ($bitsNeeded <= 90) { # 3 to 11 units
my $nUnits = 3 + int(($bitsNeeded - 19) / 8);
my $unitBitsAvail = 11 - $nUnits;
my $paddingBits = $unitBitsAvail + 9 * ($nUnits - 1) - $bitsNeeded;
while ($paddingBits--) {
unshift(@bin, '0');
}
my $unitBits = '1101110' . ('1' x ($nUnits - 3)) . '0';
while ($unitBitsAvail--) {
$unitBits .= shift @bin;
}
$utf = BinToHex($unitBits);
$nUSVUnits = $nUnits - 1;
}
else { # 13-or-more-unit code
my $NMT = $NUD - 23;
my $beforeCount = ($NMT < 256) ? 0 : int(log($NMT)/log(256));
$utf = 'DDFF' . (' DFB4' x $beforeCount);
for (my $i = $beforeCount; $i >= 0; $i--) {
$utf .= sprintf(' DE%02X', ($NMT >> ($i << 3)) & 0xFF);
}
$nUSVUnits = int(($bitsNeeded + 8) / 9);
my $paddingBits = 9 * $nUSVUnits - $bitsNeeded; # < 9
my $unitBits = '1101111' . ('0' x $paddingBits);
for (my $i = 9 - $paddingBits; $i--; ) {
$unitBits .= shift @bin;
}
$utf .= ' ' . BinToHex($unitBits);
$nUSVUnits--;
}
while ($nUSVUnits--) {
my $unitBits = '1101111';
for (my $i = 9; $i--; ) {
$unitBits .= shift @bin;
}
$utf .= ' ' . BinToHex($unitBits)
}
return $utf;
} # ConvertUSVToUTFX16LL
sub ConvertUTFX16ToUSVLL {
my ($utf, $maxNUD) = @_;
@_ = split(/ /, $utf);
my $unit = shift;
my $h = hex($unit);
if ($h < 0xDC00 || $h > 0xDFFF) {
if ($h < 0xD800 || $h > 0xDFFF) {
return 'U+' . $unit; # 1-unit code
}
# 2-unit code; combine surrogates
return sprintf('U+%X', 0x10000 + ((($h - 0xd800)) << 10) + hex(shift) - 0xdc00);
}
my $usv = '';
my $nybble = 0;
my $nybbleBitCount = 0;
my $codeBitsAvail;
my $nUSVUnits;
if ($unit ne 'DDFF') { # 3 to 11-unit code
my $nUnits = 3;
my $mask = (1 << 8);
while (($h & $mask) != 0) {
++$nUnits;
$mask >>= 1;
}
my $unitBitsAvail = 11 - $nUnits;
$nUSVUnits = $nUnits - 1; # not including the 1st unit
my $unitsRemaining = @_;
if ($nUSVUnits != $unitsRemaining) {
return '?';
}
$codeBitsAvail = $unitBitsAvail + 9 * $nUSVUnits;
my $codeBitsMod4 = ($codeBitsAvail % 4);
$nybbleBitCount = $codeBitsMod4 ? (4 - $codeBitsMod4) : 0;
while ($unitBitsAvail--) {
$mask >>= 1;
$nybble = (($nybble << 1) | (($h & $mask) ? 1 : 0));
if (++$nybbleBitCount == 4) {
$usv .= sprintf('%X', $nybble);
$nybbleBitCount = $nybble = 0;
}
}
}
else { # 13-or-more-unit code
my $beforeCount = 0;
while (($unit = shift) eq 'DFB4') {
$beforeCount++;
}
my $NMT = hex($unit) & 0xff;
for (my $i = 0; $i < $beforeCount; $i++) {
$NMT = ($NMT << 8) | (hex(shift) & 0xff);
}
my $NUD = $NMT + 23;
return '?' if ($NUD > $maxNUD);
my $unitsRemaining = @_;
$codeBitsAvail = 9 * $unitsRemaining;
if ($codeBitsAvail < $NUD * 4 - 3) {
return '?';
}
my $codeBitsMod4 = ($codeBitsAvail % 4);
$nybbleBitCount = $codeBitsMod4 ? (4 - $codeBitsMod4) : 0;
$nUSVUnits = $unitsRemaining;
}
while ($nUSVUnits--) {
$h = hex(shift);
my $mask = (1 << 8);
while ($mask) {
$nybble = (($nybble << 1) | (($h & $mask) ? 1 : 0));
$mask >>= 1;
if (++$nybbleBitCount == 4) {
$usv .= sprintf('%X', $nybble);
$nybbleBitCount = $nybble = 0;
}
}
}
my $NUD; ($h, $NUD) = USVPrep($usv, $maxNUD);
return ($h eq '?') ? '?' : "U+$h";
} # ConvertUTFX16ToUSVLL
sub BinToHex {
# Input is 16-bit string like '1010101010101010'; output is like 'AAAA'.
return sprintf('%04X', unpack('n', pack('B16', shift)));
} # BinToHex
sub HexToBin {
my $s = shift; # Maybe very long hex string; don't convert it all to an integer.
$s =~ s/0/0000/g;
$s =~ s/1/0001/g;
$s =~ s/2/0010/g;
$s =~ s/3/0011/g;
$s =~ s/4/0100/g;
$s =~ s/5/0101/g;
$s =~ s/6/0110/g;
$s =~ s/7/0111/g;
$s =~ s/8/1000/g;
$s =~ s/9/1001/g;
$s =~ s/A/1010/g;
$s =~ s/B/1011/g;
$s =~ s/C/1100/g;
$s =~ s/D/1101/g;
$s =~ s/E/1110/g;
$s =~ s/F/1111/g;
$s =~ s/^0*//;
return $s;
} # HexToBin
####################### UTF-X-32 #######################
sub ConvertUSVToUTFX32LL {
my ($h, $NUD) = USVPrep(@_);
return '?' if ($h eq '?');
if ($NUD < 8 || ($NUD == 8 && $h !~ /^[EF]/)) { # one-unit code
return ('0' x (8 - $NUD)) . $h;
}
if ($NUD < 14 || ($NUD == 14 && $h !~ /^[EF]/)) { # two-unit code
$h = ('0' x (14 - $NUD)) . $h;
# insert F and E prefix nybbles and insert space between units.
$h =~ s/^(.......)/F$1 E/;
return $h;
}
my $unitsNeeded;
my $utf = 'FF';
if ($NUD < 20) { # 3-unit code
$unitsNeeded = 3;
$h = ('0' x (20 - $NUD)) . $h;
}
else { # 4-unit or longer code.
my $NM = $NUD - 20;
my $nmHex = ($NM <= 0) ? '0' : sprintf('%X', $NM);
my $beforeCount = length($nmHex) - 1;
my $nybblesNeeded = 1 + 2 * ($beforeCount + 1) + $NUD;
$unitsNeeded = int(($nybblesNeeded + 6) / 7);
my $padding = $unitsNeeded * 7 - $nybblesNeeded;
$h = ('B' x $beforeCount) . 'A' . $nmHex . ('0' x $padding) . $h;
}
$h =~ s/^(......)//;
$utf .= $1;
while (--$unitsNeeded) {
$h =~ s/^(.......)//;
$utf .= ' E' . $1;
}
return $utf;
} # ConvertUSVToUTFX32LL
sub ConvertUTFX32ToUSVLL {
my ($utf, $maxNUD) = @_;
@_ = split(/ /, $utf);
my $unit = shift; # initial unit
if ($unit !~ /^[EF]/) { # one unit
$unit =~ s/^0*//; # remove leading zeros
my $len = length($unit);
if ($len < 4) {
$unit = ('0' x (4 - $len)) . $unit;
}
return '?' if ($maxNUD == 8 && hex($unit) > 0x7FFFFFFF);
return 'U+' . $unit;
}
return '?' if ($maxNUD == 8);
if ($unit !~ /^FF/) { # two units
$unit =~ s/^.//; # remove prefix
my $unit2 = shift; # second unit
$unit2 =~ s/^.//; # remove prefix
my ($h, $NUD) = USVPrep($unit . $unit2, $maxNUD);
return ($h eq '?') ? '?' : "U+$h";
}
# three or more units
$unit =~ s/^..//; # remove prefix
my @a = split(//, $unit);
while (defined ($unit = shift)) {
$unit = $unit;
return '?' unless ($unit =~ /^E/);
$unit =~ s/^.//; # remove prefix
push(@a, split(//, $unit));
}
my $NUD;
if ($a[0] eq '0') { # 3 units
shift(@a);
return '?' unless (@a == 19);
$NUD = 19;
}
else { # 4 or more units
my $beforeCount = 0;
my $lengthNybble;
while (($lengthNybble = shift(@a)) eq 'B') {
$beforeCount++;
}
return '?' unless ($lengthNybble eq 'A');
my $nmHex = '';
for (my $i = 0; $i <= $beforeCount; $i++) {
$nmHex .= shift(@a);
}
$NUD = 20 + hex($nmHex);
}
while ($a[0] eq '0') {
shift(@a);
--$NUD if ($NUD < 20);
}
return '?' unless ($NUD <= $maxNUD && $NUD == @a);
return 'U+' . join('', @a);
} # ConvertUTFX32ToUSVLL
sub USVPrep {
# Input: $usv, $maxNUD = 6, 8, 16, or larger power of two.
# If $usv is out of range, return ('?', 0).
# If $usv is OK (in range), return ($h, $NUD),
# where $h is $usv without "U+" or leading zeros.
my ($h, $maxNUD) = @_;
$h =~ s/^U\+//; # remove "U+" (if present)
$h =~ s/^0*//; # remove any leading zeros (even if there was no U+)
my $NUD = length($h); # count how many udigits
if ($NUD > $maxNUD || ($NUD == $maxNUD &&
(($NUD == 6 && hex($h) > 0x10FFFF) || ($NUD >= 8 && $h !~ /^[1-7]/)))) {
return ('?', 0); # out of range
}
return ($h, $NUD); # in range
} # USVPrep
__END__