From 6a91ea12a3ec9257bf2b7b80c2dd4a3e483cb724 Mon Sep 17 00:00:00 2001 From: "Chao Li (Evan)" Date: Wed, 24 Sep 2025 17:03:13 +0800 Subject: [PATCH v3] Generate EUC_CN and UHC mappings from the Unicode Consortium's UCM file This is a follow-up change of cfa6cd2, so that we can delete the XML file from our repository. Author: Chao Li Discussion: https://postgr.es/m/966d9fc.169.198741fe60b.Coremail.jiaoshuntian%40highgo.com --- src/backend/utils/mb/Unicode/Makefile | 9 ++---- src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl | 32 ++++++++++++------- src/backend/utils/mb/Unicode/UCS_to_UHC.pl | 30 +++++++++++------ 3 files changed, 44 insertions(+), 27 deletions(-) diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile index 9f6cdcc96de..fe12d73be65 100644 --- a/src/backend/utils/mb/Unicode/Makefile +++ b/src/backend/utils/mb/Unicode/Makefile @@ -48,9 +48,9 @@ $(eval $(call map_rule,koi8u,UCS_to_most.pl,KOI8-U.TXT,KOI8U)) $(eval $(call map_rule,gbk,UCS_to_most.pl,CP936.TXT,GBK)) $(eval $(call map_rule,johab,UCS_to_JOHAB.pl,JOHAB.TXT)) -$(eval $(call map_rule,uhc,UCS_to_UHC.pl,windows-949-2000.xml)) +$(eval $(call map_rule,uhc,UCS_to_UHC.pl,windows-949-2000.ucm)) $(eval $(call map_rule,euc_jp,UCS_to_EUC_JP.pl,CP932.TXT JIS0212.TXT)) -$(eval $(call map_rule,euc_cn,UCS_to_EUC_CN.pl,gb-18030-2000.xml)) +$(eval $(call map_rule,euc_cn,UCS_to_EUC_CN.pl,gb18030-2022.ucm)) $(eval $(call map_rule,euc_kr,UCS_to_EUC_KR.pl,KSX1001.TXT)) $(eval $(call map_rule,euc_tw,UCS_to_EUC_TW.pl,CNS11643.TXT)) $(eval $(call map_rule,sjis,UCS_to_SJIS.pl,CP932.TXT)) @@ -75,10 +75,7 @@ BIG5.TXT CNS11643.TXT: euc-jis-2004-std.txt sjis-0213-2004-std.txt: $(DOWNLOAD) http://x0213.org/codetable/$(@F) -gb-18030-2000.xml windows-949-2000.xml: - $(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F) - -gb18030-2022.ucm: +gb18030-2022.ucm windows-949-2000.ucm: $(DOWNLOAD) 
https://raw.githubusercontent.com/unicode-org/icu/refs/heads/main/icu4c/source/data/mappings/$(@F) GB2312.TXT: diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl index f7776631e4c..ac39609e900 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl @@ -2,16 +2,17 @@ # # Copyright (c) 2007-2025, PostgreSQL Global Development Group # -# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl +# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl # -# Generate UTF-8 <--> GB18030 code conversion tables from -# "gb-18030-2000.xml", obtained from -# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ +# Generate UTF-8 <--> EUC_CN code conversion tables from +# "gb18030-2022.ucm", obtained from +# https://github.com/unicode-org/icu/blob/main/icu4c/source/data/mappings/ # # The lines we care about in the source file look like -# <a u="009A" b="81 30 83 36"/> -# where the "u" field is the Unicode code point in hex, -# and the "b" field is the hex byte sequence for GB18030 +# <UXXXX> \xYY[\xYY...] |n +# where XXXX is the Unicode code point in hex, +# and the \xYY... is the hex byte sequence for GB18030, +# and n is a flag indicating the type of mapping. 
use strict; use warnings FATAL => 'all'; @@ -22,7 +23,7 @@ my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl'; # Read the input -my $in_file = "gb-18030-2000.xml"; +my $in_file = "gb18030-2022.ucm"; open(my $in, '<', $in_file) || die("cannot open $in_file"); @@ -30,9 +31,18 @@ my @mapping; while (<$in>) { - next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/); - my ($u, $c) = ($1, $2); - $c =~ s/ //g; + # Mappings may have been removed by commenting out + next if /^#/; + + next if !/^<U([0-9A-Fa-f]+)>\s+ + ((?:\\x[0-9A-Fa-f]{2})+)\s+ + \|(\d+)/x; + my ($u, $c, $flag) = ($1, $2, $3); + $c =~ s/\\x//g; + + # We only want round-trip mappings + next if ($flag ne '0'); + my $ucs = hex($u); my $code = hex($c); diff --git a/src/backend/utils/mb/Unicode/UCS_to_UHC.pl b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl index c6087b5c382..e666c1839cf 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_UHC.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl @@ -2,16 +2,17 @@ # # Copyright (c) 2007-2025, PostgreSQL Global Development Group # -# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl +# src/backend/utils/mb/Unicode/UCS_to_UHC.pl # # Generate UTF-8 <--> UHC code conversion tables from -# "windows-949-2000.xml", obtained from -# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ +# "windows-949-2000.ucm", obtained from +# https://github.com/unicode-org/icu/blob/main/icu4c/source/data/mappings/ # # The lines we care about in the source file look like -# <a u="009A" b="81 30 83 36"/> -# where the "u" field is the Unicode code point in hex, -# and the "b" field is the hex byte sequence for UHC +# <UXXXX> \xYY[\xYY...] |n +# where XXXX is the Unicode code point in hex, +# and the \xYY... is the hex byte sequence for UHC, +# and n is a flag indicating the type of mapping. 
use strict; use warnings FATAL => 'all'; @@ -22,7 +23,7 @@ my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_UHC.pl'; # Read the input -my $in_file = "windows-949-2000.xml"; +my $in_file = "windows-949-2000.ucm"; open(my $in, '<', $in_file) || die("cannot open $in_file"); @@ -30,9 +31,18 @@ my @mapping; while (<$in>) { - next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/); - my ($u, $c) = ($1, $2); - $c =~ s/ //g; + # Mappings may have been removed by commenting out + next if /^#/; + + next if !/^<U([0-9A-Fa-f]+)>\s+ + ((?:\\x[0-9A-Fa-f]{2})+)\s+ + \|(\d+)/x; + my ($u, $c, $flag) = ($1, $2, $3); + $c =~ s/\\x//g; + + # We only want round-trip mappings + next if ($flag ne '0'); + my $ucs = hex($u); my $code = hex($c); -- 2.39.5 (Apple Git-154)