diff --git a/doc/src/sgml/ref/pg_checksums.sgml b/doc/src/sgml/ref/pg_checksums.sgml
index 1f4d4ab8b4..5027809a59 100644
--- a/doc/src/sgml/ref/pg_checksums.sgml
+++ b/doc/src/sgml/ref/pg_checksums.sgml
@@ -37,12 +37,10 @@ PostgreSQL documentation
Description
pg_checksums checks, enables or disables data
- checksums in a PostgreSQL cluster. The server
- must be shut down cleanly before running
- pg_checksums. The exit status is zero if there
- are no checksum errors when checking them, and nonzero if at least one
- checksum failure is detected. If enabling or disabling checksums, the
- exit status is nonzero if the operation failed.
+ checksums in a PostgreSQL cluster. When checking
+ checksums, the exit status is zero if there are no checksum errors, and
+ nonzero if at least one checksum failure is detected. If enabling or
+ disabling checksums, the exit status is nonzero if the operation failed.
@@ -50,6 +48,12 @@ PostgreSQL documentation
the cluster, disabling checksums will only update the file
pg_control.
+
+
+ If the cluster is online, pages that have been (re-)written since the last
+ checkpoint are skipped rather than reported as checksum failures when they
+ cannot be verified correctly.
+
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 5c185ebed8..f62fdc90a4 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -1,8 +1,7 @@
/*-------------------------------------------------------------------------
*
* pg_checksums.c
- * Checks, enables or disables page level checksums for an offline
- * cluster
+ * Checks, enables or disables page level checksums for a cluster
*
* Copyright (c) 2010-2019, PostgreSQL Global Development Group
*
@@ -30,13 +29,20 @@
static int64 files = 0;
+static int64 skippedfiles = 0;
static int64 blocks = 0;
static int64 badblocks = 0;
+static int64 skippedblocks = 0;
static ControlFileData *ControlFile;
+static XLogRecPtr checkpointLSN;
+static int64 blocks_last_lsn_update = 0;
static char *only_relfilenode = NULL;
static bool do_sync = true;
static bool verbose = false;
+static bool online = false;
+
+static char *DataDir = NULL;
typedef enum
{
@@ -97,6 +103,23 @@ static const char *const skip[] = {
NULL,
};
+static void
+update_checkpoint_lsn(void)
+{
+ bool crc_ok;
+
+ ControlFile = get_controlfile(DataDir, progname, &crc_ok);
+ if (!crc_ok)
+ {
+ fprintf(stderr, _("%s: pg_control CRC value is incorrect\n"), progname);
+ exit(1);
+ }
+
+ /* Update checkpointLSN and blocks_last_lsn_update with the current value */
+ checkpointLSN = ControlFile->checkPoint;
+ blocks_last_lsn_update = blocks;
+}
+
static bool
skipfile(const char *fn)
{
@@ -116,6 +139,10 @@ scan_file(const char *fn, BlockNumber segmentno)
PageHeader header = (PageHeader) buf.data;
int f;
BlockNumber blockno;
+ int block_loc = 0;
+ bool block_retry = false;
+ bool all_zeroes;
+ size_t *pagebytes;
int flags;
Assert(mode == PG_MODE_ENABLE ||
@@ -126,6 +153,12 @@ scan_file(const char *fn, BlockNumber segmentno)
if (f < 0)
{
+ if (online && errno == ENOENT)
+ {
+ /* File was removed in the meantime */
+ return;
+ }
+
fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
progname, fn, strerror(errno));
exit(1);
@@ -136,27 +169,129 @@ scan_file(const char *fn, BlockNumber segmentno)
for (blockno = 0;; blockno++)
{
uint16 csum;
- int r = read(f, buf.data, BLCKSZ);
+ int r = read(f, buf.data + block_loc, BLCKSZ - block_loc);
if (r == 0)
+ {
+ /*
+ * We had a short read and got an EOF before we could read the
+ * whole block, so count this as a skipped block.
+ */
+ if (online && block_loc != 0)
+ skippedblocks++;
break;
- if (r != BLCKSZ)
+ }
+ if (r < 0)
{
- fprintf(stderr, _("%s: could not read block %u in file \"%s\": read %d of %d\n"),
- progname, blockno, fn, r, BLCKSZ);
- exit(1);
+ skippedfiles++;
+ fprintf(stderr, _("%s: could not read block %u in file \"%s\": %s\n"),
+ progname, blockno, fn, strerror(errno));
+ return;
+ }
+ if (r < (BLCKSZ - block_loc))
+ {
+ if (online)
+ {
+ /*
+ * We read only parts of the block, possibly due to a
+ * concurrent extension of the relation file. Increment
+ * block_loc by the amount that we read and try again.
+ */
+ block_loc += r;
+ continue;
+ }
+ else
+ {
+ skippedfiles++;
+ fprintf(stderr, _("%s: could not read block %u in file \"%s\": read %d of %d\n"),
+ progname, blockno, fn, r, BLCKSZ);
+ return;
+ }
}
+
blocks++;
/* New pages have no checksum yet */
if (PageIsNew(header))
+ {
+ /* Check for an all-zeroes page */
+ all_zeroes = true;
+ pagebytes = (size_t *) buf.data;
+ for (int i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
+ {
+ if (pagebytes[i] != 0)
+ {
+ all_zeroes = false;
+ break;
+ }
+ }
+ if (!all_zeroes)
+ {
+ fprintf(stderr, _("%s: checksum verification failed in file \"%s\", block %u: pd_upper is zero but block is not all-zero\n"),
+ progname, fn, blockno);
+ badblocks++;
+ }
+ else
+ {
+ skippedblocks++;
+ }
continue;
+ }
csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE);
if (mode == PG_MODE_CHECK)
{
if (csum != header->pd_checksum)
{
+ if (online)
+ {
+ /*
+ * Retry the block on the first failure if online. If the
+ * verification is done while the instance is online, it is
+ * possible that we read the first 4K page of the block just
+ * before postgres updated the entire block so it ends up
+ * looking torn to us. We only need to retry once because the
+ * LSN should be updated to something we can ignore on the next
+ * pass. If the error happens again then it is a true
+ * validation failure.
+ */
+ if (!block_retry)
+ {
+ /* Seek to the beginning of the failed block */
+ if (lseek(f, -BLCKSZ, SEEK_CUR) == -1)
+ {
+ skippedfiles++;
+ fprintf(stderr, _("%s: could not lseek in file \"%s\": %m\n"),
+ progname, fn);
+ return;
+ }
+
+ /* Set flag so we know a retry was attempted */
+ block_retry = true;
+
+ /* Reset loop to validate the block again */
+ blockno--;
+ blocks--;
+
+ continue;
+ }
+
+ /*
+ * The checksum verification failed on retry as well. Check if
+ * the page has been modified since the checkpoint and skip it
+ * in this case. As a sanity check, demand that the upper
+ * 32 bits of the LSN are identical in order to skip as a
+ * guard against a corrupted LSN in the pageheader.
+ */
+ if ((PageGetLSN(buf.data) > checkpointLSN) &&
+ (PageGetLSN(buf.data) >> 32 == checkpointLSN >> 32))
+ {
+ block_retry = false;
+ skippedblocks++;
+ continue;
+ }
+ }
+
if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION)
fprintf(stderr, _("%s: checksum verification failed in file \"%s\", block %u: calculated checksum %X but block contains %X\n"),
progname, fn, blockno, csum, header->pd_checksum);
@@ -183,6 +318,9 @@ scan_file(const char *fn, BlockNumber segmentno)
exit(1);
}
}
+
+ block_retry = false;
+ block_loc = 0;
}
if (verbose)
@@ -237,6 +375,12 @@ scan_directory(const char *basedir, const char *subdir)
snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name);
if (lstat(fn, &st) < 0)
{
+ if (online && errno == ENOENT)
+ {
+ /* File was removed in the meantime */
+ continue;
+ }
+
fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"),
progname, fn, strerror(errno));
exit(1);
@@ -280,6 +424,10 @@ scan_directory(const char *basedir, const char *subdir)
continue;
scan_file(fn, segmentno);
+
+ /* Update checkpointLSN every 1024 * 1024 blocks */
+ if (online && ((blocks - blocks_last_lsn_update) > (1024 * 1024)))
+ update_checkpoint_lsn();
}
#ifndef WIN32
else if (S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode))
@@ -304,7 +452,6 @@ main(int argc, char *argv[])
{NULL, 0, NULL, 0}
};
- char *DataDir = NULL;
int c;
int option_index;
bool crc_ok;
@@ -398,7 +545,7 @@ main(int argc, char *argv[])
exit(1);
}
- /* Check if cluster is running */
+ /* Check if checksums are enabled */
ControlFile = get_controlfile(DataDir, progname, &crc_ok);
if (!crc_ok)
{
@@ -422,12 +569,10 @@ main(int argc, char *argv[])
exit(1);
}
+ /* Check if cluster is running */
if (ControlFile->state != DB_SHUTDOWNED &&
ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
- {
- fprintf(stderr, _("%s: cluster must be shut down\n"), progname);
- exit(1);
- }
+ online = true;
if (ControlFile->data_checksum_version == 0 &&
mode == PG_MODE_CHECK)
@@ -450,6 +595,9 @@ main(int argc, char *argv[])
exit(1);
}
+ /* Get checkpoint LSN */
+ checkpointLSN = ControlFile->checkPoint;
+
/* Operate on all files if checking or enabling checksums */
if (mode == PG_MODE_CHECK || mode == PG_MODE_ENABLE)
{
@@ -459,7 +607,11 @@ main(int argc, char *argv[])
printf(_("Checksum operation completed\n"));
printf(_("Files scanned: %s\n"), psprintf(INT64_FORMAT, files));
+ if (skippedfiles > 0)
+ printf(_("Files skipped: %s\n"), psprintf(INT64_FORMAT, skippedfiles));
printf(_("Blocks scanned: %s\n"), psprintf(INT64_FORMAT, blocks));
+ if (skippedblocks > 0)
+ printf(_("Blocks skipped: %s\n"), psprintf(INT64_FORMAT, skippedblocks));
if (mode == PG_MODE_CHECK)
{
printf(_("Bad checksums: %s\n"), psprintf(INT64_FORMAT, badblocks));
@@ -497,5 +649,10 @@ main(int argc, char *argv[])
printf(_("Checksums disabled in cluster\n"));
}
+ /* skipped blocks or files are considered an error if offline */
+ if (!online)
+ if (skippedblocks > 0 || skippedfiles > 0)
+ return 1;
+
return 0;
}
diff --git a/src/bin/pg_checksums/t/002_actions.pl b/src/bin/pg_checksums/t/002_actions.pl
index 41575c5245..b6b6092da2 100644
--- a/src/bin/pg_checksums/t/002_actions.pl
+++ b/src/bin/pg_checksums/t/002_actions.pl
@@ -5,8 +5,7 @@ use strict;
use warnings;
use PostgresNode;
use TestLib;
-use Test::More tests => 62;
-
+use Test::More tests => 126;
# Utility routine to create and check a table with corrupted checksums
# on a wanted tablespace. Note that this stops and starts the node
@@ -17,6 +16,9 @@ sub check_relation_corruption
my $node = shift;
my $table = shift;
my $tablespace = shift;
+ my $offset = shift;
+ my $corrupted_data = shift;
+ my $description = shift;
my $pgdata = $node->data_dir;
$node->safe_psql('postgres',
@@ -31,21 +33,18 @@ sub check_relation_corruption
my $relfilenode_corrupted = $node->safe_psql('postgres',
"SELECT relfilenode FROM pg_class WHERE relname = '$table';");
- # Set page header and block size
- my $pageheader_size = 24;
- my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
$node->stop;
# Checksums are correct for single relfilenode as the table is not
# corrupted yet.
command_ok(['pg_checksums', '--check', '-D', $pgdata, '-r',
$relfilenode_corrupted],
- "succeeds for single relfilenode on tablespace $tablespace with offline cluster");
+ "succeeds for single relfilenode $description with offline cluster");
# Time to create some corruption
open my $file, '+<', "$pgdata/$file_corrupted";
- seek($file, $pageheader_size, 0);
- syswrite($file, "\0\0\0\0\0\0\0\0\0");
+ seek($file, $offset, 0);
+ syswrite($file, $corrupted_data);
close $file;
# Checksum checks on single relfilenode fail
@@ -54,23 +53,38 @@ sub check_relation_corruption
1,
[qr/Bad checksums:.*1/],
[qr/checksum verification failed/],
- "fails with corrupted data for single relfilenode on tablespace $tablespace");
+ "fails with corrupted data for single relfilenode $description");
# Global checksum checks fail as well
$node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata],
1,
[qr/Bad checksums:.*1/],
[qr/checksum verification failed/],
- "fails with corrupted data on tablespace $tablespace");
+ "fails with corrupted data $description");
- # Drop corrupted table again and make sure there is no more corruption.
+ # Now check online as well
$node->start;
+
+ # Checksum checks on single relfilenode fail
+ $node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata,
+ '-r', $relfilenode_corrupted],
+ 1,
+ [qr/Bad checksums:.*1/],
+ [qr/checksum verification failed/],
+ "fails with corrupted data for single relfilenode $description with online cluster");
+
+ # Global checksum checks fail as well
+ $node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata],
+ 1,
+ [qr/Bad checksums:.*1/],
+ [qr/checksum verification failed/],
+ "fails with corrupted data $description with online cluster");
+
+
+ # Drop corrupted table again and make sure there is no more corruption.
$node->safe_psql('postgres', "DROP TABLE $table;");
- $node->stop;
$node->command_ok(['pg_checksums', '--check', '-D', $pgdata],
"succeeds again after table drop on tablespace $tablespace");
-
- $node->start;
return;
}
@@ -143,6 +157,11 @@ command_ok(['pg_checksums', '--check', '-D', $pgdata],
command_ok(['pg_checksums', '-D', $pgdata],
"verifies checksums as default action");
+# Checksums pass on an online cluster
+$node->start;
+command_ok(['pg_checksums', '--check', '-D', $pgdata],
+ "succeeds with online cluster");
+
# Specific relation files cannot be requested when action is --disable
# or --enable.
command_fails(['pg_checksums', '--disable', '-r', '1234', '-D', $pgdata],
@@ -150,13 +169,12 @@ command_fails(['pg_checksums', '--disable', '-r', '1234', '-D', $pgdata],
command_fails(['pg_checksums', '--enable', '-r', '1234', '-D', $pgdata],
"fails when relfilenodes are requested and action is --enable");
-# Checks cannot happen with an online cluster
-$node->start;
-command_fails(['pg_checksums', '--check', '-D', $pgdata],
- "fails with online cluster");
+# Set page header and block size
+my $pageheader_size = 24;
+my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
# Check corruption of table on default tablespace.
-check_relation_corruption($node, 'corrupt1', 'pg_default');
+check_relation_corruption($node, 'corrupt1', 'pg_default', $pageheader_size, "\0\0\0\0\0\0\0\0\0", "on tablespace pg_default");
# Create tablespace to check corruptions in a non-default tablespace.
my $basedir = $node->basedir;
@@ -165,7 +183,15 @@ mkdir ($tablespace_dir);
$tablespace_dir = TestLib::real_dir($tablespace_dir);
$node->safe_psql('postgres',
"CREATE TABLESPACE ts_corrupt LOCATION '$tablespace_dir';");
-check_relation_corruption($node, 'corrupt2', 'ts_corrupt');
+check_relation_corruption($node, 'corrupt2', 'ts_corrupt', $pageheader_size, "\0\0\0\0\0\0\0\0\0", "on tablespace ts_corrupt");
+
+# Check corruption in the pageheader with random data in it
+my $random_data = join '', map { ("a".."z")[rand 26] } 1 .. $pageheader_size;
+check_relation_corruption($node, 'corrupt1', 'pg_default', 0, $random_data, "with random data in pageheader");
+
+# Check corruption when the pageheader has been zeroed-out completely
+my $zero_data = "\0"x$pageheader_size;
+check_relation_corruption($node, 'corrupt1', 'pg_default', 0, $zero_data, "with zeroed-out pageheader");
# Utility routine to check that pg_checksums is able to detect
# correctly-named relation files filled with some corrupted data.
@@ -179,23 +205,32 @@ sub fail_corrupt
my $file_name = "$pgdata/global/$file";
append_to_file $file_name, "foo";
+ $node->stop;
+ # If the instance is offline, the whole file is skipped and this is
+ # considered to be an error.
$node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata],
1,
- [qr/^$/],
+ [qr/Files skipped:.*1/],
[qr/could not read block 0 in file.*$file\":/],
- "fails for corrupted data in $file");
+ "skips corrupted data in $file");
+
+ $node->start;
+ # If the instance is online, the block is skipped and this is not
+ # considered to be an error
+ $node->command_checks_all([ 'pg_checksums', '--check', '-D', $pgdata],
+ 0,
+ [qr/Blocks skipped:.*1/],
+ [qr/^$/],
+ "skips corrupted data in $file");
# Remove file to prevent future lookup errors on conflicts.
unlink $file_name;
return;
}
-# Stop instance for the follow-up checks.
-$node->stop;
-
-# Authorized relation files filled with corrupted data cause the
-# checksum checks to fail. Make sure to use file names different
-# than the previous ones.
+# Authorized relation files filled with corrupted data cause the files to be
+# skipped and, if the instance is offline, a non-zero exit status. Make sure
+# to use file names different than the previous ones.
fail_corrupt($node, "99990");
fail_corrupt($node, "99990.123");
fail_corrupt($node, "99990_fsm");
@@ -204,3 +239,6 @@ fail_corrupt($node, "99990_vm");
fail_corrupt($node, "99990_init.123");
fail_corrupt($node, "99990_fsm.123");
fail_corrupt($node, "99990_vm.123");
+
+# Stop node again at the end of tests
+$node->stop;