#!/usr/bin/perl
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Generate table entries giving character ranges
# for POSIX/Perl character classes. Rather than
# figure out what the definition is, it is easier to ask
# Perl about each letter from 0-128 and write down
# its answer.
# POSIX bracket classes to tabulate.  Each base name contributes two
# entries, in this order: the class itself and its negated (^) form.
@posixclasses = ();
foreach my $base (qw(
    alnum alpha ascii blank cntrl digit graph
    lower print punct space upper word xdigit
)) {
    push @posixclasses, "[:${base}:]", "[:^${base}:]";
}
# Perl escape classes to tabulate: each class followed by its negation.
# Every name appears exactly once; a repeated name would produce a
# redundant code table and an inflated num_perl_groups count.
@perlclasses = (
    "\\d",
    "\\D",
    "\\s",
    "\\S",
    "\\w",
    "\\W",
);
# We extend the tables to Unicode by saying that
# 0x80-0x10FFFF matches if and only if byte 0x80 matches.
$Runemax = 0x10ffff;

# Membership answers that are pinned instead of being asked of Perl.
# Keys are "<class>:<code point>", values are 1 (member) or 0 (not).
# Perl 5.18 changed \s to also match vertical tab (0x0B); pinning the
# pre-5.18 answer keeps the generated tables independent of the Perl
# version that runs this script.
# NOTE(review): RE2's syntax reference lists \s as [\t\n\f\r ] -- confirm
# that the pre-5.18 table is the intended one.
my %overrides = (
    "\\s:11" => 0,
    "\\S:11" => 1,
);

# ComputeClass($class) -> list of [lo, hi] array refs.
#
# $class is the text that goes inside a bracket expression, e.g. "\\d" or
# "[:alpha:]".  Code points 0 through 0x80 are tested one at a time against
# /[$class]/ (or looked up in %overrides), and consecutive members are
# coalesced into inclusive ranges.  A range that is still open after 0x80
# is extended to $Runemax, per the rule stated above.  The ranges are
# returned in ascending order; the list is empty when nothing matches.
sub ComputeClass($) {
    my ($class) = @_;
    my $regexp = qr/[${class}]/;    # compile once instead of per code point
    my @ranges;
    my $start = -1;                 # first code point of the open range, or -1

    for my $i (0 .. 128) {
        my $key = "$class:$i";
        my $is_member = exists $overrides{$key}
                      ? $overrides{$key}
                      : (chr($i) =~ $regexp);
        if ($is_member) {
            $start = $i if $start < 0;
        }
        elsif ($start >= 0) {
            push @ranges, [$start, $i - 1];
            $start = -1;
        }
    }

    # An open range here necessarily includes 0x80, so it stands for the
    # whole non-ASCII tail.
    push @ranges, [$start, $Runemax] if $start >= 0;

    return @ranges;
}
# PrintClass($cname, $name, @ranges) -> UGroup initializer string.
#
# Writes a "static URange32 code<cname>[]" array definition to the
# currently selected output handle, one "{ lo, hi }" row per element of
# @ranges (each element is a two-item array ref), with $name shown in a
# trailing comment.  Returns the text of the matching UGroup table entry,
# in which the backslashes of $name are doubled so it can sit inside a
# C string literal.
sub PrintClass($$@) {
    my ($cname, $name, @ranges) = @_;

    # We could split the codes into URange16 and URange32 like
    # make_unicode_groups.py does, but these are too small to bother.
    print "static URange32 code${cname}[] = { /* $name */\n";
    foreach my $range (@ranges) {
        my ($lo, $hi) = @{$range};
        printf "\t{ 0x%x, 0x%x },\n", $lo, $hi;
    }
    print "};\n";

    (my $escname = $name) =~ s/\\/\\\\/g;
    my $n = scalar @ranges;
    return "{ \"$escname\", 0, 0, code$cname, $n }";
}
# Running counter used to give every emitted range array a distinct
# code<N> name; it is shared across all PrintClasses calls.
my $gen = 0;

# PrintClasses($cname, @classes)
#
# For each class name, computes its ranges with ComputeClass and emits the
# range array through PrintClass.  Afterwards prints the
# "UGroup <cname>_groups[]" table built from the entries PrintClass
# returned, followed by "int num_<cname>_groups" holding the entry count.
sub PrintClasses($@) {
    my ($cname, @classes) = @_;

    my @entries = map {
        my $cl = $_;
        PrintClass(++$gen, $cl, ComputeClass($cl));
    } @classes;

    print "UGroup ${cname}_groups[] = {\n";
    print map { "\t$_,\n" } @entries;
    print "};\n";

    my $count = scalar @entries;
    print "int num_${cname}_groups = $count;\n";
}
# Prologue of the generated file: provenance banner, the include line,
# and the opening of namespace re2.
print "// GENERATED BY make_perl_groups.pl; DO NOT EDIT.\n"
    . "// make_perl_groups.pl >perl_groups.cc\n"
    . "#include \"re2/unicode_groups.h\"\n"
    . "namespace re2 {\n";

# Perl escape classes first, then POSIX bracket classes; the code<N>
# numbering follows this order.
PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);

# Epilogue: close the namespace.
print "} // namespace re2\n";