#!/usr/bin/perl

#created by Maxim Boguk (maxim.boguk@gmail.com) November 2010

#vacuum_table.pl connects to given table, and tries to compact given table, as described
#in this blogpost:
#http://blog.endpoint.com/2010/09/reducing-bloat-without-locking.html
#utility inspired by this post:
#http://www.depesz.com/index.php/2010/10/17/reduce-bloat-of-table-without-longexclusive-locks/


use strict;
use warnings;

use Getopt::Long;

#connect options
my $psql     = 'psql';
my $db_host  = 'localhost';
my $db_port  = 5432;
my $db_user  = `whoami`;
chomp $db_user;
my $database = $db_user;
my $schema   = 'public';
my $table;

#performance and load settings
my $pages_per_round  = 100;
my $pages_per_vacuum = 10000;
my $delay            = 0;

my $no_routine_vacuum = 0;
my $no_initial_vacuum = 0;
my $no_final_vacuum   = 0;

my $help;

my $result = GetOptions (
        "psql=s" => \$psql,
        "host=s" => \$db_host,
        "port=i" => \$db_port,
        "username=s" => \$db_user,
        "dbname=s" => \$database,
        "schema=s" => \$schema,
        "table=s" => \$table,
        "pages-per-round=i" => \$pages_per_round,
        "pages-per-vacuum=i" => \$pages_per_vacuum,
        "no-routine-vacuum" => \$no_routine_vacuum,
        "no-initial-vacuum" => \$no_initial_vacuum,
        "no-final-vacuum" => \$no_final_vacuum,
        "delay=i" => \$delay,
        "help" => \$help
);

if ($help or not($table) or not($result)) {
        print_help();
        exit(0);
}

#internal variables
my $field;
my $pages = 0;
my $round = 0;
my $position;
my $vacuum_position=0;
my $res;

$| = 1;

print "Using: host=$db_host, port=$db_port, user=$db_user, database=$database, schema=$schema, table=$table, pages_per_round=$pages_per_round, pages_per_vacuum=$pages_per_vacuum, delay=$delay\n";

#getting some table properties
$res = `$psql -h $db_host -p $db_port -U $db_user -d $database -t -c "
    SELECT
        --replation size in pages
        (SELECT pg_relation_size('$schema.$table') / current_setting('block_size')::int4) as relpages,
        --existing always or replica ON UPDATE triggers on relation
        EXISTS(SELECT * FROM pg_trigger WHERE pg_trigger.tgrelid=pg_class.oid AND tgtype&16=8 AND tgenabled IN ('A', 'R')) as has_conflicting_triggers
    FROM pg_class WHERE oid='$schema.$table'::regclass
"`;
if ($res =~ /^\s*(\d+)\s*\|\s*(t|f)\s*$/) {
        $position = $1;
        if ($2 eq 't') {
                die "can't work in presence of 'always' or 'replica' ON UPDATE triggers on table";
        }
} else {
        die "some error happened or table not found, database answer - '$res'\n";
}

#getting best field to perform update sequence
$res = `$psql -h $db_host -p $db_port -U $db_user -d $database -t -c "
SELECT
        quote_ident(attname)
FROM pg_attribute
WHERE
        --no system or dropped fields
        attnum>0 AND attisdropped IS FALSE AND
        attrelid='$schema.$table'::regclass
ORDER BY
        --variable legth attributes has least priority because chance of toast firing
        (attlen=-1),
        --preferly not indexed attributes
        (attnum::text in (select regexp_split_to_table(indkey::text, ' ') from pg_index where indrelid = '$schema.$table'::regclass)),
        --preferly small attributes
        attlen,
        attnum
LIMIT 1
"`;

if ($res =~ /^\s*(\S+)\s*$/) {
        $field = $1;
} else {
         die "can't find best candidate field for table... answer - '$res'\n";
}

#initial table vacuum
unless ($no_initial_vacuum) {
        print " ".(scalar localtime).": PERFORMING INITIAL VACUUM ==\n";
        system "/usr/bin/time $psql -h $db_host -p $db_port -U $db_user -d $database -q -c 'VACUUM \"$schema\".\"$table\"'";
}
#creating working function
system "$psql -h $db_host -p $db_port -U $db_user -d $database -q -f clear_table_tail.sql";

system "psql -h $db_host -p $db_port -U $db_user -d $database -t -c \"SELECT 'table size = '||(pg_relation_size('$schema.$table') / current_setting('block_size')::bigint)\"";
system "psql -h $db_host -p $db_port -U $db_user -d $database -t -c \"SELECT 'indexes size = '||sum(pg_relation_size('$schema.'||indexname))::bigint/current_setting('block_size')::bigint FROM pg_indexes WHERE schemaname='$schema' AND tablename='$table'\"";

print "begin real work...using field=$field\n";

while (1) {
        $round++;
        my $cmd = "$psql -h $db_host -p $db_port -U $db_user -d $database -t -c \"SELECT * from __clear_table_tail('$table', '$schema', '$field', $position, $pages_per_round)\"";
        $res = `$cmd`;
        if ($res =~ /^\s*(\S+)\s*$/m) {
                if (($1 eq '0') or ($1 eq 'NULL')) {
                        last;
                }
                $position = $1;
        } else {
                print "Some error happens... $res";
                last;
        }
        $pages += $pages_per_round;
        if (int($pages/$pages_per_vacuum) != $vacuum_position and !$no_routine_vacuum) {
                print " ".(scalar localtime).": PERFORMING ROUTINE VACUUM (ROUND=$round, PAGES DONE=$pages)\n";
                system "/usr/bin/time -h $psql -h $db_host -p $db_port -U $db_user -d $database -q -c 'VACUUM \"$schema\".\"$table\"'";
                system "$psql -h $db_host -p $db_port -U $db_user -d $database -t -c \"SELECT 'table size = '||(pg_relation_size('$schema.$table') / current_setting('block_size')::int4)\"";
                $vacuum_position = int($pages/$pages_per_vacuum);
        }
        sleep($delay);
}

#drop working function
system "$psql -h $db_host -p $db_port -U $db_user -d $database -q -c 'DROP FUNCTION __clear_table_tail(text,text,text,integer,integer)'";

unless ($no_final_vacuum) {
        print "\n ".(scalar localtime).": PERFORMING FINAL VACUUM (ROUND=$round, PAGES DONE=$pages)\n";
        system "/usr/bin/time -h $psql -h $db_host -p $db_port -U $db_user -d $database -q -c 'VACUUM \"$schema\".\"$table\";'";
}

system "psql -h $db_host -p $db_port -U $db_user -d $database -t -c \"SELECT 'table size = '||(pg_relation_size('$schema.$table') / current_setting('block_size')::bigint)\"";
system "psql -h $db_host -p $db_port -U $db_user -d $database -t -c \"SELECT 'indexes size = '||sum(pg_relation_size('$schema.'||indexname))::bigint/current_setting('block_size')::bigint FROM pg_indexes WHERE schemaname='$schema' AND tablename='$table'\"";


sub print_help {
    print <<_EO_HELP_;
Syntax:
    $0 [arguments] [options]

Mandatory arguments:
    --table=          : table name

Options:
    --username=         : database user name (default - current user as produced by `whoami`)
    --host=             : database server host or socket directory (default - localhost)
    --port=             : database server port (default - 5432)
    --dbname=           : database name to connect to (default - same as database user name)
    --psql=             : path to psql program (default psql)
    --help              : show this message
    --schema=           : namespace in which table exists (default - public)
    --pages-per-round=  : number of pages to try to free per round (default - 100)
    --pages-per-vacuum= : number pages freed before fire vacuum (default - 10000)
    --delay=            : pause --delay second after done with --pages-per-round pages (default - 0)
    --no-[initial|routine|final]-vacuum : turn off initial/routine/final vacuum (by default vacuums enabled)

Description:

$0 connects to given table, and tries to compact given table, as described
in this blogpost:
http://blog.endpoint.com/2010/09/reducing-bloat-without-locking.html
utility inspired by this post:
http://www.depesz.com/index.php/2010/10/17/reduce-bloat-of-table-without-longexclusive-locks/
_EO_HELP_
}
