codepage/cptable.pl - platform/external/syslinux - Git at Google

 #!/usr/bin/perl
 #
 # Produce a codepage matching table.  For each 8-bit character, list
 # a primary and an alternate match (the latter used for case-insensitive
 # matching.)
 #
 # Usage:
 #	cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp
 #
 # Note: for the format of the UnicodeData file, see:
 # http://www.unicode.org/Public/UNIDATA/UCD.html
 #

 ($ucd, $cpco, $cpfs, $cpout) = @ARGV;

 if (!defined($cpout)) {
     die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n";
 }

 %ucase   = ();
 %lcase   = ();
 %tcase   = ();
 %decomp  = ();

 open(UCD, '<', $ucd)
     or die "$0: could not open unicode data: $ucd: $!\n";
 while (defined($line = <UCD>)) {
     chomp $line;
     @f = split(/;/, $line);
     $n = hex $f[0];
     $ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n;
     $lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n;
     $tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n;
     if ($f[5] =~ /^[0-9A-F\s]+$/) {
 	# This character has a canonical decomposition.
 	# The regular expression rejects angle brackets, so other
 	# decompositions aren't permitted.
 	$decomp{$n} = [];
 	foreach my $dch (split(' ', $f[5])) {
 	    push(@{$decomp{$n}}, hex $dch);
 	}
     }
 }
 close(UCD);

 #
 # Filesystem and console codepages.  The filesystem codepage is used
 # for FAT shortnames, whereas the console codepage is whatever is used
 # on the screen and keyboard.
 #
 @xtab = (undef) x 256;
 %tabx = ();
 open(CPFS, '<', $cpfs)
     or die "$0: could not open fs codepage: $cpfs: $!\n";
 while (defined($line = <CPFS>)) {
     $line =~ s/\s*(\#.*|)$//;
     @f = split(/\s+/, $line);
     next if (scalar @f != 2);
     next if (hex $f[0] > 255);
     $xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
     $tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
 }
 close(CPFS);

 @ytab = (undef) x 256;
 %taby = ();
 open(CPCO, '<', $cpco)
     or die "$0: could not open console codepage: $cpco: $!\n";
 while (defined($line = <CPCO>)) {
     $line =~ s/\s*(\#.*|)$//;
     @f = split(/\s+/, $line);
     next if (scalar @f != 2);
     next if (hex $f[0] > 255);
     $ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
     $taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
 }
 close(CPCO);

 open(CPOUT, '>', $cpout)
     or die "$0: could not open output file: $cpout: $!\n";
 #
 # Magic number, in anticipation of being able to load these
 # files dynamically...
 #
 print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1);

 # Header fields available for future use...
 print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0);

 #
 # Self (shortname) uppercase table.
 # This depends both on the console codepage and the filesystem codepage;
 # the logical transcoding operation is:
 #
 # $tabx{$ucase{$ytab[$i]}}
 #
 # ... where @ytab is console codepage -> Unicode and
 # %tabx is Unicode -> filesystem codepage.
 #
 @uctab = (undef) x 256;
 for ($i = 0; $i < 256; $i++) {
     $uuc = $ucase{$ytab[$i]};	# Unicode upper case
     if (defined($tabx{$uuc})) {
 	# Straight-forward conversion
 	$u = $tabx{$uuc};
     } elsif (defined($tabx{${$decomp{$uuc}}[0]})) {
 	# Upper case equivalent stripped of accents
 	$u = $tabx{${$decomp{$uuc}}[0]};
     } else {
 	# No equivalent at all found.  Assume it is a lower-case-only
 	# character, like greek alpha in CP437.
 	$u = $i;
     }
     $uctab[$i] = $u;
     print CPOUT pack("C", $u);
 }

 #
 # Self (shortname) lowercase table.
 # This depends both on the console codepage and the filesystem codepage;
 # the logical transcoding operation is:
 #
 # $taby{$lcase{$xtab[$i]}}
 #
 # ... where @ytab is console codepage -> Unicode and
 # %tabx is Unicode -> filesystem codepage.
 #
 @lctab = (undef) x 256;
 for ($i = 0; $i < 256; $i++) {
     $llc = $lcase{$xtab[$i]};	# Unicode lower case
     if (defined($l = $taby{$llc}) && $uctab[$l] == $i) {
 	# Straight-forward conversion
     } elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) {
 	# Lower case equivalent stripped of accents
     } else {
 	# No equivalent at all found.  Find *anything* that matches the
 	# bijection criterion...
 	for ($l = 0; $l < 256; $l++) {
 	    last if ($uctab[$l] == $i);
 	}
 	$l = $i if ($l == 256);	# If nothing, we're screwed anyway...
     }
     $lctab[$i] = $l;
     print CPOUT pack("C", $l);
 }

 #
 # Unicode (longname) matching table.
 # This only depends on the console codepage.
 #
 $pp0 = '';  $pp1 = '';
 for ($i = 0; $i < 256; $i++) {
     if (!defined($ytab[$i])) {
 	$p0 = $p1 = 0xffff;
     } else {
 	$p0 = $ytab[$i];
 	if ($ucase{$p0} != $p0) {
 	    $p1 = $ucase{$p0};
 	} elsif ($lcase{$p0} != $p0) {
 	    $p1 = $lcase{$p0};
 	} elsif ($tcase{$p0} != $p0) {
 	    $p1 = $tcase{$p0};
 	} else {
 	    $p1 = $p0;
 	}
     }
     # Only the BMP is supported...
     $p0 = 0xffff if ($p0 > 0xffff);
     $p1 = 0xffff if ($p1 > 0xffff);
     $pp0 .= pack("v", $p0);
     $pp1 .= pack("v", $p1);
 }
 print CPOUT $pp0, $pp1;
 close (CPOUT);
	#!/usr/bin/perl
	#
	# Produce a codepage matching table. For each 8-bit character, list
	# a primary and an alternate match (the latter used for case-insensitive
	# matching.)
	#
	# Usage:
	# cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp
	#
	# Note: for the format of the UnicodeData file, see:
	# http://www.unicode.org/Public/UNIDATA/UCD.html
	#

	($ucd, $cpco, $cpfs, $cpout) = @ARGV;

	if (!defined($cpout)) {
	die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n";
	}

	%ucase = ();
	%lcase = ();
	%tcase = ();
	%decomp = ();

	open(UCD, '<', $ucd)
	or die "$0: could not open unicode data: $ucd: $!\n";
	while (defined($line = <UCD>)) {
	chomp $line;
	@f = split(/;/, $line);
	$n = hex $f[0];
	$ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n;
	$lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n;
	$tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n;
	if ($f[5] =~ /^[0-9A-F\s]+$/) {
	# This character has a canonical decomposition.
	# The regular expression rejects angle brackets, so other
	# decompositions aren't permitted.
	$decomp{$n} = [];
	foreach my $dch (split(' ', $f[5])) {
	push(@{$decomp{$n}}, hex $dch);
	}
	}
	}
	close(UCD);

	#
	# Filesystem and console codepages. The filesystem codepage is used
	# for FAT shortnames, whereas the console codepage is whatever is used
	# on the screen and keyboard.
	#
	@xtab = (undef) x 256;
	%tabx = ();
	open(CPFS, '<', $cpfs)
	or die "$0: could not open fs codepage: $cpfs: $!\n";
	while (defined($line = <CPFS>)) {
	$line =~ s/\s(\#.\|)$//;
	@f = split(/\s+/, $line);
	next if (scalar @f != 2);
	next if (hex $f[0] > 255);
	$xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
	$tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
	}
	close(CPFS);

	@ytab = (undef) x 256;
	%taby = ();
	open(CPCO, '<', $cpco)
	or die "$0: could not open console codepage: $cpco: $!\n";
	while (defined($line = <CPCO>)) {
	$line =~ s/\s(\#.\|)$//;
	@f = split(/\s+/, $line);
	next if (scalar @f != 2);
	next if (hex $f[0] > 255);
	$ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
	$taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
	}
	close(CPCO);

	open(CPOUT, '>', $cpout)
	or die "$0: could not open output file: $cpout: $!\n";
	#
	# Magic number, in anticipation of being able to load these
	# files dynamically...
	#
	print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1);

	# Header fields available for future use...
	print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0);

	#
	# Self (shortname) uppercase table.
	# This depends both on the console codepage and the filesystem codepage;
	# the logical transcoding operation is:
	#
	# $tabx{$ucase{$ytab[$i]}}
	#
	# ... where @ytab is console codepage -> Unicode and
	# %tabx is Unicode -> filesystem codepage.
	#
	@uctab = (undef) x 256;
	for ($i = 0; $i < 256; $i++) {
	$uuc = $ucase{$ytab[$i]}; # Unicode upper case
	if (defined($tabx{$uuc})) {
	# Straight-forward conversion
	$u = $tabx{$uuc};
	} elsif (defined($tabx{${$decomp{$uuc}}[0]})) {
	# Upper case equivalent stripped of accents
	$u = $tabx{${$decomp{$uuc}}[0]};
	} else {
	# No equivalent at all found. Assume it is a lower-case-only
	# character, like greek alpha in CP437.
	$u = $i;
	}
	$uctab[$i] = $u;
	print CPOUT pack("C", $u);
	}

	#
	# Self (shortname) lowercase table.
	# This depends both on the console codepage and the filesystem codepage;
	# the logical transcoding operation is:
	#
	# $taby{$lcase{$xtab[$i]}}
	#
	# ... where @ytab is console codepage -> Unicode and
	# %tabx is Unicode -> filesystem codepage.
	#
	@lctab = (undef) x 256;
	for ($i = 0; $i < 256; $i++) {
	$llc = $lcase{$xtab[$i]}; # Unicode lower case
	if (defined($l = $taby{$llc}) && $uctab[$l] == $i) {
	# Straight-forward conversion
	} elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) {
	# Lower case equivalent stripped of accents
	} else {
	# No equivalent at all found. Find anything that matches the
	# bijection criterion...
	for ($l = 0; $l < 256; $l++) {
	last if ($uctab[$l] == $i);
	}
	$l = $i if ($l == 256); # If nothing, we're screwed anyway...
	}
	$lctab[$i] = $l;
	print CPOUT pack("C", $l);
	}

	#
	# Unicode (longname) matching table.
	# This only depends on the console codepage.
	#
	$pp0 = ''; $pp1 = '';
	for ($i = 0; $i < 256; $i++) {
	if (!defined($ytab[$i])) {
	$p0 = $p1 = 0xffff;
	} else {
	$p0 = $ytab[$i];
	if ($ucase{$p0} != $p0) {
	$p1 = $ucase{$p0};
	} elsif ($lcase{$p0} != $p0) {
	$p1 = $lcase{$p0};
	} elsif ($tcase{$p0} != $p0) {
	$p1 = $tcase{$p0};
	} else {
	$p1 = $p0;
	}
	}
	# Only the BMP is supported...
	$p0 = 0xffff if ($p0 > 0xffff);
	$p1 = 0xffff if ($p1 > 0xffff);
	$pp0 .= pack("v", $p0);
	$pp1 .= pack("v", $p1);
	}
	print CPOUT $pp0, $pp1;
	close (CPOUT);