From 6ca50a54b9a03f7b6c53d1cc369d3bddbc58c54b Mon Sep 17 00:00:00 2001 From: John Naylor Date: Mon, 18 Dec 2017 13:11:16 +0700 Subject: [PATCH v4 07/12] Update catalog scripts to read data files. Teach genbki.pl, Gen_fmgrtab.pl, duplicate_oids, and unused_oids to read the data files, and arrange for genbki.pl to double-quote certain values so bootscanner.l can read them. Introduce Makefile dependencies on the data files. --- doc/src/sgml/bki.sgml | 5 +- src/backend/catalog/Makefile | 11 ++- src/backend/catalog/README | 82 +++++++++++++++++++---- src/backend/catalog/genbki.pl | 133 ++++++++++++++++++++++++++++--------- src/backend/utils/Gen_fmgrtab.pl | 32 ++++----- src/backend/utils/Makefile | 2 +- src/include/catalog/duplicate_oids | 6 +- src/include/catalog/unused_oids | 6 +- 8 files changed, 207 insertions(+), 70 deletions(-) diff --git a/doc/src/sgml/bki.sgml b/doc/src/sgml/bki.sgml index 33378b4..a3962c5 100644 --- a/doc/src/sgml/bki.sgml +++ b/doc/src/sgml/bki.sgml @@ -21,8 +21,9 @@ input file used by initdb is created as part of building and installing PostgreSQL by a program named genbki.pl, which reads some - specially formatted C header files in the src/include/catalog/ - directory of the source tree. The created BKI file + specially formatted C header files and data files in the + src/include/catalog/ directory of the source tree. + The created BKI file is called postgres.bki and is normally installed in the share subdirectory of the installation tree. 
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 30ca509..23858b8 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -49,6 +49,15 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\ toasting.h indexing.h \ ) +POSTGRES_BKI_DATA = $(addprefix $(top_srcdir)/src/include/catalog/,\ + pg_aggregate.dat pg_am.dat pg_amop.dat pg_amproc.dat pg_authid.dat \ + pg_cast.dat pg_class.dat pg_collation.dat pg_database.dat pg_language.dat \ + pg_namespace.dat pg_opclass.dat pg_operator.dat pg_opfamily.dat \ + pg_pltemplate.dat pg_proc.dat pg_range.dat pg_tablespace.dat \ + pg_ts_config.dat pg_ts_config_map.dat pg_ts_dict.dat pg_ts_parser.dat \ + pg_ts_template.dat pg_type.dat \ + ) + # location of Catalog.pm catalogdir = $(top_srcdir)/src/backend/catalog @@ -67,7 +76,7 @@ schemapg.h: postgres.bki ; # even in distribution tarballs. So this is cheating a bit, but it # will achieve the goal of updating the version number when it # changes. -postgres.bki: genbki.pl Catalog.pm $(POSTGRES_BKI_SRCS) $(top_srcdir)/configure $(top_srcdir)/src/include/catalog/duplicate_oids +postgres.bki: genbki.pl Catalog.pm $(POSTGRES_BKI_SRCS) $(POSTGRES_BKI_DATA) $(top_srcdir)/configure $(top_srcdir)/src/include/catalog/duplicate_oids cd $(top_srcdir)/src/include/catalog && $(PERL) ./duplicate_oids $(PERL) -I $(catalogdir) $< $(pg_includes) --set-version=$(MAJORVERSION) $(POSTGRES_BKI_SRCS) diff --git a/src/backend/catalog/README b/src/backend/catalog/README index 7e0ddf3..d1ba320 100644 --- a/src/backend/catalog/README +++ b/src/backend/catalog/README @@ -7,24 +7,73 @@ This directory contains .c files that manipulate the system catalogs; src/include/catalog contains the .h files that define the structure of the system catalogs. -When the compile-time scripts (Gen_fmgrtab.pl and genbki.pl) -execute, they grep the DATA statements out of the .h files and munge -these in order to generate the postgres.bki file. 
The .bki file is then +When the compile-time script genbki.pl executes, it parses the .h files +and .dat files in order to generate the postgres.* files. These are then used as input to initdb (which is just a wrapper around postgres running single-user in bootstrapping mode) in order to generate the initial (template) system catalog relation files. +backend/utils/Gen_fmgrtab.pl uses the same mechanism to generate .c and +.h files used by the function manager. + ----------------------------------------------------------------- -People who are going to hose around with the .h files should be aware -of the following facts: +The data file format and bootstrap data conventions -- It is very important that the DATA statements be properly formatted +- As far as the bootstrap code is concerned, it is very important +that the insert statements in postgres.bki be properly formatted (e.g., no broken lines, proper use of white-space and _null_). The scripts are line-oriented and break easily. In addition, the only documentation on the proper format for them is the code in the -bootstrap/ directory. Just be careful when adding new DATA -statements. +bootstrap/ directory. Fortunately, the source bootstrap data is much +more tolerant with respect to formatting, but it still pays to be +careful when adding new data. + +- The .dat files contain Perl data literals that are simply eval'd to +produce in-memory data structures. As such, the code reading them doesn't +care about ordering and layout, but in order to maintain a standard +appearance, src/include/catalog/rewrite_dat.pl should be run before +committing data changes. Each file contains an array of hash references, +which represent the data entries. 
The best examples are the existing +data files, but an altered subset of pg_database.dat will demonstrate +the key features: + +[ +# a comment +{ oid => '1', + datname => 'Berkeley\'s DB', datcollate => '"LC_COLLATE"', datacl => '_null_' }, +] + +-The layout is: open bracket, one or more sets of curly brackets containing +comma-separated key-value pairs, close bracket. +-All values are single-quoted. +-Single quotes within values must be escaped. +-If a value is a macro to be expanded by initdb.c, it must have double- +quotes, since we don't know what kind of characters will be substituted. +-Nulls are represented as "_null_". +-Comments must be on their own lines. +-The fields oid, descr, and shdescr are on their own line within the +hash. (This is done automatically during rewriting so don't worry about +their placement during development.) + +- Some techniques are used to keep the data representation compact. +These are automatically enforced by rewrite_dat.pl, but you should be +aware of them. pg_proc.dat uses all three of them in an attempt to keep +the file manageable: +1. If the .h file specifies a default value for a column, and a data entry +has that same value, it will be omitted from the data file. +2. Likewise, some values could be computed from other values, so are also +left out. +3. If the .h file specifies a column abbreviation, then it will be used as +the hash key in the data entry. + +- If you want to change a default value or abbreviation, you must +1. rewrite the data files into the expanded representation via +"perl -I ../../backend/catalog rewrite_dat.pl pg_foo.dat --expand". +2. change the relevant .h file to use the new default/abbreviation. +3. run rewrite_dat.pl without the expand parameter to write out the new +compact representation. The first step can be skipped if you are adding +a new default/abbreviation. - Some catalogs require that OIDs be preallocated to tuples because of cross-references from other pre-loaded tuples. 
For example, pg_type @@ -49,19 +98,26 @@ up #define's for the pg_class OIDs of system catalogs and indexes. For all the other system catalogs, you have to manually create any #define's you need. -- If you need to find a valid OID for a new predefined tuple, -use the unused_oids script. It generates inclusive ranges of +- If you need to find a valid OID for a new predefined tuple, use the +script src/include/catalog/unused_oids. It generates inclusive ranges of *unused* OIDs (e.g., the line "45-900" means OIDs 45 through 900 have not been allocated yet). Currently, OIDs 1-9999 are reserved for manual assignment; the unused_oids script simply looks through the include/catalog -headers to see which ones do not appear in "OID =" clauses in DATA lines. +headers and .dat files to see which ones do not appear. (As of Postgres 8.1, it also looks at CATALOG and DECLARE_INDEX lines.) -You can also use the duplicate_oids script to check for mistakes. +You can use the duplicate_oids script to check for mistakes. This script +is also run at compile time, and will stop the build if a duplicate is +found. - The OID counter starts at 10000 at bootstrap. If a catalog row is in a table that requires OIDs, but no OID was preassigned by an "OID =" clause, then it will receive an OID of 10000 or above. +----------------------------------------------------------------- + +People who are going to hose around with the .h files should be aware +of the following facts: + - To create a "BOOTSTRAP" table you have to do a lot of extra work: these tables are not created through a normal CREATE TABLE operation, but spring into existence when first written to during initdb. Therefore, you must @@ -98,7 +154,7 @@ catalog tuples that contain NULL attributes except in their variable-length portions! (The bootstrapping code is fairly good about marking NOT NULL each of the columns that can legally be referenced via C struct declarations ... 
but those markings won't be enforced against -DATA commands, so you must get it right in a DATA line.) +insert commands, so you must get it right in the data files.) - Modification of the catalogs must be performed with the proper updating of catalog indexes! That is, most catalogs have indexes diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl index cf6de28..00fc35d 100644 --- a/src/backend/catalog/genbki.pl +++ b/src/backend/catalog/genbki.pl @@ -4,8 +4,8 @@ # genbki.pl # Perl script that generates postgres.bki, postgres.description, # postgres.shdescription, and schemapg.h from specially formatted -# header files. The .bki files are used to initialize the postgres -# template database. +# header files and data files. The BKI files are used to initialize +# the postgres template database. # # Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group # Portions Copyright (c) 1994, Regents of the University of California @@ -93,8 +93,43 @@ my $PG_CATALOG_NAMESPACE = Catalog::FindDefinedSymbol('pg_namespace.h', \@include_path, 'PG_CATALOG_NAMESPACE'); -# Read all the input header files into internal data structures -my $catalogs = Catalog::Catalogs(@input_files); +# Read all the files into internal data structures. Not all catalogs +# will have a data file. 
+my @catnames; +my %catalogs; +my %catalog_data; +my @toast_decls; +my @index_decls; +foreach my $header (@input_files) +{ + $header =~ /(.+)\.h$/ + or die "Input files need to be header files.\n"; + my $datfile = "$1.dat"; + + my $catalog = Catalog::ParseHeader($header); + my $catname = $catalog->{catname}; + my $schema = $catalog->{columns}; + + if (defined $catname) + { + push @catnames, $catname; + $catalogs{$catname} = $catalog; + } + + if (-e $datfile) + { + $catalog_data{$catname} = Catalog::ParseData($datfile, $schema, 0); + } + + foreach my $toast_decl (@{ $catalog->{toasting} }) + { + push @toast_decls, $toast_decl; + } + foreach my $index_decl (@{ $catalog->{indexing} }) + { + push @index_decls, $index_decl; + } +} # Generate postgres.bki, postgres.description, and postgres.shdescription @@ -108,11 +143,11 @@ my %regprocoids; my @types; # produce output, one catalog at a time -foreach my $catname (@{ $catalogs->{names} }) +foreach my $catname (@catnames) { # .bki CREATE command for this catalog - my $catalog = $catalogs->{$catname}; + my $catalog = $catalogs{$catname}; print $bki "create $catname $catalog->{relation_oid}" . $catalog->{shared_relation} . $catalog->{bootstrap} @@ -156,17 +191,13 @@ foreach my $catname (@{ $catalogs->{names} }) print $bki "open $catname\n"; } - if (defined $catalog->{data}) + if (defined $catalog_data{$catname}) { - # Ordinary catalog with DATA line(s) - foreach my $row (@{ $catalog->{data} }) + # Ordinary catalog with a data file + foreach my $row (@{ $catalog_data{$catname} }) { - - # Split line into tokens without interpreting their meaning. 
- my %bki_values; - @bki_values{@attnames} = - Catalog::SplitDataLine($row->{bki_values}); + my %bki_values = %$row; # Perform required substitutions on fields foreach my $column (@$schema) @@ -200,7 +231,7 @@ foreach my $catname (@{ $catalogs->{names} }) } else { - $regprocoids{ $bki_values{proname} } = $row->{oid}; + $regprocoids{ $bki_values{proname} } = $bki_values{oid}; } } @@ -208,38 +239,38 @@ foreach my $catname (@{ $catalogs->{names} }) if ($catname eq 'pg_type') { my %type = %bki_values; - $type{oid} = $row->{oid}; push @types, \%type; } + # Add quotes where necessary. + quote_bki_values(\%bki_values, $schema); + # Write to postgres.bki - my $oid = $row->{oid} ? "OID = $row->{oid} " : ''; - printf $bki "insert %s( %s )\n", $oid, - join(' ', @bki_values{@attnames}); + bki_insert(\%bki_values, @attnames); # Write comments to postgres.description and # postgres.shdescription - if (defined $row->{descr}) + if (defined $bki_values{descr}) { printf $descr "%s\t%s\t0\t%s\n", - $row->{oid}, $catname, $row->{descr}; + $bki_values{oid}, $catname, $bki_values{descr}; } - if (defined $row->{shdescr}) + if (defined $bki_values{shdescr}) { printf $shdescr "%s\t%s\t%s\n", - $row->{oid}, $catname, $row->{shdescr}; + $bki_values{oid}, $catname, $bki_values{shdescr}; } } } if ($catname eq 'pg_attribute') { - # For pg_attribute.h, we generate DATA entries ourselves. + # For pg_attribute.h, we generate data entries ourselves. # NB: pg_type.h must come before pg_attribute.h in the input list # of catalog names, since we use info from pg_type.h here. - foreach my $table_name (@{ $catalogs->{names} }) + foreach my $table_name (@catnames) { - my $table = $catalogs->{$table_name}; + my $table = $catalogs{$table_name}; # Currently, all bootstrapped relations also need schemapg.h # entries, so skip if the relation isn't to be in schemapg.h. 
@@ -316,12 +347,12 @@ foreach my $catname (@{ $catalogs->{names} }) # (i.e., not contained in a header with a CATALOG() statement) comes here # Write out declare toast/index statements -foreach my $declaration (@{ $catalogs->{toasting}->{data} }) +foreach my $declaration (@toast_decls) { print $bki $declaration; } -foreach my $declaration (@{ $catalogs->{indexing}->{data} }) +foreach my $declaration (@index_decls) { print $bki $declaration; } @@ -379,6 +410,46 @@ exit 0; #################### Subroutines ######################## +# Supply quoting for a normal bki row. +# This allows us to keep most double quotes +# out of the catalog data files for readability. +sub quote_bki_values +{ + my $row = shift; + my $schema = shift; + + foreach my $column (@$schema) + { + my $attname = $column->{name}; + my $atttype = $column->{type}; + + if + ( + length($row->{$attname}) == 0 # Empty string + or $row->{$attname} =~ /\s/ # Whitespace + + # Quote strings that have special characters + # except for certain cases. See bootscanner.l + or ( $row->{$attname} =~ /\W/ + and $row->{$attname} !~ /^\\\d{3}$/ # octal + and $row->{$attname} !~ /^-\d*$/) # '-' or '-1' + + # XXX Not needed, but keeps the .bki diff down to a reasonable + # size during review + or $attname eq 'oprname' # Operator names + or $atttype eq 'oidvector' # Arrays etc. + or $atttype eq 'int2vector' + or $atttype =~ /\[\]$/ + ) + { + if ($row->{$attname} ne '_null_' and $row->{$attname} !~ /^"([^"])*"$/) + { + $row->{$attname} = q|"| . $row->{$attname} . q|"|; + } + } + } +} + # Given the schema of pg_attribute, generate an entry for it using information # about the attribute it describes. 
Any value that is not handled here @@ -452,7 +523,7 @@ sub emit_pgattr_row } } -# Write a pg_attribute entry to postgres.bki +# Write an entry to postgres.bki sub bki_insert { my $row = shift; @@ -522,8 +593,8 @@ Options: --set-version PostgreSQL version number for initdb cross-check genbki.pl generates BKI files from specially formatted -header files. These BKI files are used to initialize the -postgres template database. +header files and .dat files. These BKI files are used +to initialize the postgres template database. Report bugs to . EOM diff --git a/src/backend/utils/Gen_fmgrtab.pl b/src/backend/utils/Gen_fmgrtab.pl index 14c02f5..f30a9e7 100644 --- a/src/backend/utils/Gen_fmgrtab.pl +++ b/src/backend/utils/Gen_fmgrtab.pl @@ -3,7 +3,7 @@ # # Gen_fmgrtab.pl # Perl script that generates fmgroids.h, fmgrprotos.h, and fmgrtab.c -# from pg_proc.h +# from pg_proc.h and pg_proc.dat # # Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group # Portions Copyright (c) 1994, Regents of the University of California @@ -55,35 +55,35 @@ if ($output_path ne '' && substr($output_path, -1) ne '/') die "No input files.\n" if !$infile; die "No include path; you must specify -I at least once.\n" if !@include_path; +# We pass the pg_proc.h path as an argument and then look for a matching +# data file. +$infile =~ /(.+)\.h$/ + or die "Input file needs to be a header file.\n"; +my $datfile = "$1.dat"; +die "No data file\n" + if ! -e $datfile; + my $FirstBootstrapObjectId = Catalog::FindDefinedSymbol('access/transam.h', \@include_path, 'FirstBootstrapObjectId'); my $INTERNALlanguageId = Catalog::FindDefinedSymbol('catalog/pg_language.h', \@include_path, 'INTERNALlanguageId'); -# Read all the data from the include/catalog files. -my $catalogs = Catalog::Catalogs($infile); +# Read all the files into internal data structures. 
+my $catalog = Catalog::ParseHeader($infile); +my $data = Catalog::ParseData($datfile, $catalog->{columns}, 0); -# Collect the raw data from pg_proc.h. +# Collect certain fields from pg_proc.dat. my @fmgr = (); -my @attnames; -foreach my $column (@{ $catalogs->{pg_proc}->{columns} }) -{ - push @attnames, $column->{name}; -} -my $data = $catalogs->{pg_proc}->{data}; foreach my $row (@$data) { - - # Split line into tokens without interpreting their meaning. - my %bki_values; - @bki_values{@attnames} = Catalog::SplitDataLine($row->{bki_values}); + my %bki_values = %$row; # Select out just the rows for internal-language procedures. next if $bki_values{prolang} ne $INTERNALlanguageId; push @fmgr, - { oid => $row->{oid}, + { oid => $bki_values{oid}, strict => $bki_values{proisstrict}, retset => $bki_values{proretset}, nargs => $bki_values{pronargs}, @@ -284,7 +284,7 @@ sub usage Usage: perl -I [directory of Catalog.pm] Gen_fmgrtab.pl [path to pg_proc.h] Gen_fmgrtab.pl generates fmgroids.h, fmgrprotos.h, and fmgrtab.c from -pg_proc.h +pg_proc.h and pg_proc.dat Report bugs to . 
EOM diff --git a/src/backend/utils/Makefile b/src/backend/utils/Makefile index efb8b53..8ccfc3b 100644 --- a/src/backend/utils/Makefile +++ b/src/backend/utils/Makefile @@ -24,7 +24,7 @@ $(SUBDIRS:%=%-recursive): fmgroids.h fmgrprotos.h fmgrprotos.h: fmgroids.h ; fmgroids.h: fmgrtab.c ; -fmgrtab.c: Gen_fmgrtab.pl $(catalogdir)/Catalog.pm $(top_srcdir)/src/include/catalog/pg_proc.h +fmgrtab.c: Gen_fmgrtab.pl $(catalogdir)/Catalog.pm $(top_srcdir)/src/include/catalog/pg_proc.h $(top_srcdir)/src/include/catalog/pg_proc.dat $(PERL) -I $(catalogdir) $< -I $(top_srcdir)/src/include/ $(top_srcdir)/src/include/catalog/pg_proc.h errcodes.h: $(top_srcdir)/src/backend/utils/errcodes.txt generate-errcodes.pl diff --git a/src/include/catalog/duplicate_oids b/src/include/catalog/duplicate_oids index 7342d61..9732f61 100755 --- a/src/include/catalog/duplicate_oids +++ b/src/include/catalog/duplicate_oids @@ -5,7 +5,7 @@ use warnings; BEGIN { - @ARGV = (glob("pg_*.h"), qw(indexing.h toasting.h)); + @ARGV = (glob("pg_*.h"), glob("pg_*.dat"), qw(indexing.h toasting.h)); } my %oidcounts; @@ -14,7 +14,7 @@ while (<>) { next if /^CATALOG\(.*BKI_BOOTSTRAP/; next - unless /^DATA\(insert *OID *= *(\d+)/ + unless /\boid *=> *'(\d+)'/ || /^CATALOG\([^,]*, *(\d+).*BKI_ROWTYPE_OID\((\d+)\)/ || /^CATALOG\([^,]*, *(\d+)/ || /^DECLARE_INDEX\([^,]*, *(\d+)/ @@ -30,7 +30,7 @@ foreach my $oid (sort { $a <=> $b } keys %oidcounts) { next unless $oidcounts{$oid} > 1; $found = 1; - print "$oid\n"; + print "***Duplicate OID: $oid\n"; } exit $found; diff --git a/src/include/catalog/unused_oids b/src/include/catalog/unused_oids index 97769d3..a930560 100755 --- a/src/include/catalog/unused_oids +++ b/src/include/catalog/unused_oids @@ -25,11 +25,11 @@ export FIRSTOBJECTID # this part (down to the uniq step) should match the duplicate_oids script # note: we exclude BKI_BOOTSTRAP relations since they are expected to have -# matching DATA lines in pg_class.h and pg_type.h +# matching data lines in 
pg_class.dat and pg_type.dat -cat pg_*.h toasting.h indexing.h | \ +cat pg_*.h pg_*.dat toasting.h indexing.h | egrep -v -e '^CATALOG\(.*BKI_BOOTSTRAP' | \ -sed -n -e 's/^DATA(insert *OID *= *\([0-9][0-9]*\).*$/\1/p' \ +sed -n -e 's/.*\boid *=> *'\''\([0-9][0-9]*\)'\''.*$/\1/p' \ -e 's/^CATALOG([^,]*, *\([0-9][0-9]*\).*BKI_ROWTYPE_OID(\([0-9][0-9]*\)).*$/\1,\2/p' \ -e 's/^CATALOG([^,]*, *\([0-9][0-9]*\).*$/\1/p' \ -e 's/^DECLARE_INDEX([^,]*, *\([0-9][0-9]*\).*$/\1/p' \ -- 2.7.4