From ebd914a38ec99e0f844cf12d8237634b3fdea8c1 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Mon, 8 Sep 2025 15:29:01 +0700 Subject: [PATCH v4 2/3] JCN changes --- .../utils/mb/Unicode/UCS_to_GB18030.pl | 38 ++++++------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl index 658e0d59e2c..084fdf66af1 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl @@ -4,17 +4,15 @@ # # src/backend/utils/mb/Unicode/UCS_to_GB18030.pl # - # Generate UTF-8 <--> GB18030 code conversion tables from -# "gb-18030-2000.ucm", a Unicode Character Mapping file (UCM) from ICU, -# obtained from https://github.com/unicode-org/icu-data/blob/d9d3a6ed27bb98a7106763e940258f0be8cd995b/charset/data/ucm/gb-18030-2000.ucm +# "gb-18030-2000.ucm", obtained from +# https://github.com/unicode-org/icu-data/tree/main/charset/data/ucm # # The lines we care about in the source file look like # <UXXXX> \xYY[\xYY...] |n -# where <UXXXX> is the Unicode code point in hex, +# where XXXX is the Unicode code point in hex, # and the \xYY... is the hex byte sequence for GB18030, # and n is a flag indicating the type of mapping. -# use strict; use warnings FATAL => 'all'; @@ -30,37 +28,23 @@ my $in_file = "gb-18030-2000.ucm"; open(my $in, '<', $in_file) || die("cannot open $in_file"); my @mapping; -my $in_charmap = 0; while (<$in>) { - chomp; - # Enter CHARMAP section - if (/^CHARMAP/) { - $in_charmap = 1; - next; - } - # Exit CHARMAP section - if (/^END CHARMAP/) { - $in_charmap = 0; - last; - } - next unless $in_charmap; - # Skip comments and empty lines - next if /^#/ || /^$/; + # Mappings may have been removed by commenting out + next if /^#/; - # Match lines like: <UXXXX> \xYY[\xYY...] 
|n - next if !/^<U([0-9A-Fa-f]+)>\s+((?:\\x[0-9A-Fa-f]{2})+)\s*\|(\d+)/; + next if !/^<U([0-9A-Fa-f]+)>\s+ + ((?:\\x[0-9A-Fa-f]{2})+)\s+ + \|(\d+)/x; my ($u, $c, $flag) = ($1, $2, $3); + $c =~ s/\\x//g; - # flag 0 means round-trip mapping, we only care about that + # We only want round-trip mappings next if ($flag ne '0'); my $ucs = hex($u); - # Remove \x and concatenate bytes - my $c_hex = $c; - $c_hex =~ s/\\x//g; - my $code = hex($c_hex); + my $code = hex($c); if ($code >= 0x80 && $ucs >= 0x0080) { push @mapping, -- 2.51.0