mirror of
https://github.com/zebrajr/postgres.git
synced 2025-12-06 12:20:15 +01:00
Generate tables from Unicode SpecialCasing.txt to support more sophisticated case mapping behavior: * support case mappings to multiple codepoints, such as "ß" uppercasing to "SS" * support conditional case mappings, such as the "final sigma" * support titlecase variants, such as "dž" uppercasing to "DŽ" but titlecasing to "Dž" Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org Reviewed-by: Peter Eisentraut, Daniel Verite
288 lines
8.7 KiB
Perl
288 lines
8.7 KiB
Perl
#!/usr/bin/perl
|
|
#
|
|
# Generate Unicode character case mappings. Does not include tailoring
|
|
# or locale-specific mappings.
|
|
#
|
|
# Input: SpecialCasing.txt UnicodeData.txt
|
|
# Output: unicode_case_table.h
|
|
#
|
|
# Copyright (c) 2000-2025, PostgreSQL Global Development Group
|
|
|
|
use strict;
|
|
use warnings FATAL => 'all';
|
|
use Getopt::Long;
|
|
|
|
use FindBin;
|
|
use lib "$FindBin::RealBin/../../tools/";
|
|
|
|
my $output_path = '.';
|
|
|
|
GetOptions('outdir:s' => \$output_path);
|
|
|
|
my $output_table_file = "$output_path/unicode_case_table.h";
|
|
|
|
# The maximum number of codepoints that can result from case mapping
|
|
# of a single character. See Unicode section 5.18 "Case Mappings".
|
|
my $MAX_CASE_EXPANSION = 3;
|
|
|
|
my $FH;
|
|
|
|
my %simple = ();
|
|
|
|
open($FH, '<', "$output_path/UnicodeData.txt")
|
|
or die "Could not open $output_path/UnicodeData.txt: $!.";
|
|
while (my $line = <$FH>)
|
|
{
|
|
my @elts = split(';', $line);
|
|
my $code = hex($elts[0]);
|
|
my $simple_uppercase = hex($elts[12] =~ s/^\s+|\s+$//rg);
|
|
my $simple_lowercase = hex($elts[13] =~ s/^\s+|\s+$//rg);
|
|
my $simple_titlecase = hex($elts[14] =~ s/^\s+|\s+$//rg);
|
|
|
|
die "codepoint $code out of range" if $code > 0x10FFFF;
|
|
die "Simple_Lowercase $code out of range" if $simple_lowercase > 0x10FFFF;
|
|
die "Simple_Titlecase $code out of range" if $simple_titlecase > 0x10FFFF;
|
|
die "Simple_Uppercase $code out of range" if $simple_uppercase > 0x10FFFF;
|
|
|
|
if ($simple_lowercase || $simple_titlecase || $simple_uppercase)
|
|
{
|
|
$simple{$code} = {
|
|
Simple_Lowercase => ($simple_lowercase || $code),
|
|
Simple_Titlecase => ($simple_titlecase || $code),
|
|
Simple_Uppercase => ($simple_uppercase || $code)
|
|
};
|
|
}
|
|
}
|
|
close $FH;
|
|
|
|
# Map for special casing rules that aren't represented in the simple
|
|
# mapping. Language-sensitive mappings are not supported.
|
|
#
|
|
# See https://www.unicode.org/reports/tr44/#SpecialCasing.txt, or the
|
|
# SpecialCasing.txt file itself for details.
|
|
|
|
# for now, only Final_Sigma is supported
|
|
my %condition_map = (Final_Sigma => 'PG_U_FINAL_SIGMA');
|
|
|
|
my %special = ();
|
|
open($FH, '<', "$output_path/SpecialCasing.txt")
|
|
or die "Could not open $output_path/SpecialCasing.txt: $!.";
|
|
while (my $line = <$FH>)
|
|
{
|
|
# language-sensitive mappings not supported
|
|
last if $line =~ /\# Language-Sensitive Mappings/;
|
|
|
|
# remove comments
|
|
$line =~ s/^(.*?)#.*$/$1/s;
|
|
|
|
# ignore empty lines
|
|
next unless $line =~ /;/;
|
|
|
|
my @elts = split /;/, $line;
|
|
my $code = hex($elts[0]);
|
|
|
|
# Codepoint may map to multiple characters when converting
|
|
# case. Split each mapping on whitespace and extract the
|
|
# hexadecimal into an array of codepoints.
|
|
my @lower = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[1]));
|
|
my @title = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));
|
|
my @upper = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[3]));
|
|
my @conditions = map {
|
|
# supporting negated conditions may require storing a
|
|
# mask of relevant conditions for a given rule to differentiate
|
|
# between lack of a condition and a negated condition
|
|
die "negated conditions not supported" if /^Not_/;
|
|
$condition_map{$_} || die "unrecognized condition: $_"
|
|
} (grep /\w+/, (split /\s+/, $elts[4]));
|
|
|
|
my $cond_str = (join '|', @conditions) || '0';
|
|
|
|
# if empty, create a self-mapping
|
|
push @lower, $code if (scalar @lower == 0);
|
|
push @title, $code if (scalar @title == 0);
|
|
push @upper, $code if (scalar @upper == 0);
|
|
|
|
# none should map to more than 3 codepoints
|
|
die "lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'"
|
|
if (scalar @lower) > $MAX_CASE_EXPANSION;
|
|
die "titlecase expansion for 0x$elts[0] exceeds maximum: '$elts[2]'"
|
|
if (scalar @title) > $MAX_CASE_EXPANSION;
|
|
die "uppercase expansion for 0x$elts[0] exceeds maximum: '$elts[3]'"
|
|
if (scalar @upper) > $MAX_CASE_EXPANSION;
|
|
|
|
# pad arrays to a fixed length of 3
|
|
while (scalar @upper < $MAX_CASE_EXPANSION) { push @upper, 0x000000 }
|
|
while (scalar @lower < $MAX_CASE_EXPANSION) { push @lower, 0x000000 }
|
|
while (scalar @title < $MAX_CASE_EXPANSION) { push @title, 0x000000 }
|
|
|
|
# Characters with special mappings may not have simple mappings;
|
|
# ensure that an entry exists.
|
|
$simple{$code} ||= {
|
|
Simple_Lowercase => $code,
|
|
Simple_Titlecase => $code,
|
|
Simple_Uppercase => $code
|
|
};
|
|
|
|
# Multiple special case rules for a single codepoint could be
|
|
# supported by making several entries for each codepoint, and have
|
|
# the simple mapping point to the first entry. The caller could
|
|
# scan forward looking for an entry that matches the conditions,
|
|
# or fall back to the normal behavior.
|
|
die "multiple special case mappings not supported"
|
|
if defined $special{$code};
|
|
|
|
$special{$code} = {
|
|
Lowercase => \@lower,
|
|
Titlecase => \@title,
|
|
Uppercase => \@upper,
|
|
Conditions => $cond_str
|
|
};
|
|
}
|
|
close $FH;
|
|
|
|
# assign sequential array indexes to the special mappings
|
|
my $special_idx = 0;
|
|
foreach my $code (sort { $a <=> $b } (keys %special))
|
|
{
|
|
$special{$code}{Index} = $special_idx++;
|
|
}
|
|
|
|
# Start writing out the output files
|
|
open my $OT, '>', $output_table_file
|
|
or die "Could not open output file $output_table_file: $!\n";
|
|
|
|
# determine size of array given that codepoints <= 0x80 are dense and
|
|
# the rest of the entries are sparse
|
|
my $num_simple = 0x80;
|
|
foreach my $code (sort { $a <=> $b } (keys %simple))
|
|
{
|
|
$num_simple++ unless $code < 0x80;
|
|
}
|
|
|
|
my $num_special = scalar(keys %special) + 1;
|
|
|
|
print $OT <<"EOS";
|
|
/*-------------------------------------------------------------------------
|
|
*
|
|
* unicode_case_table.h
|
|
* Case mapping and information table.
|
|
*
|
|
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
* src/include/common/unicode_case_table.h
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
|
|
/*
|
|
* File auto-generated by src/common/unicode/generate-unicode_case_table.pl,
|
|
* do not edit. There is deliberately not an #ifndef PG_UNICODE_CASE_TABLE_H
|
|
* here.
|
|
*/
|
|
|
|
#include "common/unicode_case.h"
|
|
#include "mb/pg_wchar.h"
|
|
|
|
/*
|
|
* The maximum number of codepoints that can result from case mapping
|
|
* of a single character. See Unicode section 5.18 "Case Mappings".
|
|
*/
|
|
#define MAX_CASE_EXPANSION 3
|
|
|
|
/*
|
|
* Case mapping condition flags. For now, only Final_Sigma is supported.
|
|
*
|
|
* See Unicode Context Specification for Casing.
|
|
*/
|
|
#define PG_U_FINAL_SIGMA (1 << 0)
|
|
|
|
typedef enum
|
|
{
|
|
CaseLower = 0,
|
|
CaseTitle = 1,
|
|
CaseUpper = 2,
|
|
NCaseKind
|
|
} CaseKind;
|
|
|
|
typedef struct
|
|
{
|
|
pg_wchar codepoint; /* Unicode codepoint */
|
|
int16 conditions;
|
|
pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
|
|
} pg_special_case;
|
|
|
|
typedef struct
|
|
{
|
|
pg_wchar codepoint; /* Unicode codepoint */
|
|
pg_wchar simplemap[NCaseKind];
|
|
const pg_special_case *special_case;
|
|
} pg_case_map;
|
|
|
|
/*
|
|
* Special case mappings that aren't representable in the simple map.
|
|
* Entries are referenced from simple_case_map.
|
|
*/
|
|
static const pg_special_case special_case[$num_special] =
|
|
{
|
|
EOS
|
|
|
|
foreach my $code (sort { $a <=> $b } (keys %special))
|
|
{
|
|
die if scalar @{ $special{$code}{Lowercase} } != $MAX_CASE_EXPANSION;
|
|
die if scalar @{ $special{$code}{Titlecase} } != $MAX_CASE_EXPANSION;
|
|
die if scalar @{ $special{$code}{Uppercase} } != $MAX_CASE_EXPANSION;
|
|
my $lower = join ", ",
|
|
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Lowercase} });
|
|
my $title = join ", ",
|
|
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Titlecase} });
|
|
my $upper = join ", ",
|
|
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} });
|
|
printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions};
|
|
printf $OT "{{%s}, {%s}, {%s}}},\n", $lower, $title, $upper;
|
|
}
|
|
|
|
print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
|
|
print $OT <<"EOS";
|
|
};
|
|
|
|
/*
|
|
* Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
|
|
* sparse for higher codepoints (requiring scan or binary search).
|
|
*/
|
|
static const pg_case_map case_map[$num_simple] =
|
|
{
|
|
EOS
|
|
|
|
printf $OT "\t/* begin dense entries for codepoints < 0x80 */\n";
|
|
for (my $code = 0; $code < 0x80; $code++)
|
|
{
|
|
my $lc = ($simple{$code}{Simple_Lowercase} || $code);
|
|
my $tc = ($simple{$code}{Simple_Titlecase} || $code);
|
|
my $uc = ($simple{$code}{Simple_Uppercase} || $code);
|
|
die "unexpected special case for code $code"
|
|
if defined $special{$code};
|
|
printf $OT
|
|
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, NULL},\n",
|
|
$code, $lc, $tc, $uc;
|
|
}
|
|
printf $OT "\n";
|
|
|
|
printf $OT "\t/* begin sparse entries for codepoints >= 0x80 */\n";
|
|
foreach my $code (sort { $a <=> $b } (keys %simple))
|
|
{
|
|
next unless $code >= 0x80; # already output above
|
|
|
|
my $map = $simple{$code};
|
|
my $special_case = "NULL";
|
|
if (exists $special{$code})
|
|
{
|
|
$special_case = sprintf "&special_case[%d]", $special{$code}{Index};
|
|
}
|
|
printf $OT
|
|
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, %s},\n",
|
|
$code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
|
|
$map->{Simple_Uppercase}, $special_case;
|
|
}
|
|
print $OT "};\n";
|