/*-------------------------------------------------------------------------
 *
 * posix_shmem.c
 *	  Implement shared memory using POSIX facilities
 *
 * These routines represent a fairly thin layer on top of POSIX shared
 * memory functionality.
 *
 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 * Portions Copyright (c) 2007, Apple Inc.
 *
 * Permission to use, copy, modify, and distribute this software and its
 * documentation for any purpose, without fee, and without a written agreement
 * is hereby granted, provided that the above copyright notice and this
 * paragraph and the following two paragraphs appear in all copies.
 *
 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA OR APPLE INC. BE LIABLE TO
 * ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL
 * DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE
 * AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA OR APPLE INC.
 * HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * THE UNIVERSITY OF CALIFORNIA AND APPLE INC. SPECIFICALLY DISCLAIM ANY
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED
 * HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA OR
 * APPLE INC. HAVE NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
 * ENHANCEMENTS, OR MODIFICATIONS.
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <signal.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/types.h>
#ifdef HAVE_KERNEL_OS_H
#include <kernel/OS.h>
#endif

#include "miscadmin.h"
#include "storage/ipc.h"
#include "storage/pg_shmem.h"


/* Permission bits passed as the mode argument when creating the segment. */
#define IPCProtection	(0600)	/* access/modify by user only */

/*
 * Address at which the shared memory segment is mapped in this process,
 * or NULL when no segment is attached.  Set by PGSharedMemoryCreate and
 * cleared by PGSharedMemoryDetach.
 */
void	*UsedShmemSegAddr = NULL;
/* Never referenced in this file; presumably kept for callers that expect it. */
unsigned long UsedShmemSegID = 0;	/* Not used by the POSIX Shmem shim */

/* Forward declarations of file-local helpers. */
static char *GenerateIPCName();
static void *InternalIpcMemoryCreate(Size size);
static void IpcMemoryDetach(int status, Datum shmaddr);
static void IpcMemoryDelete(int status, Datum notUsed);

/* Only needed (and only defined) when backends re-attach after exec. */
#ifdef EXEC_BACKEND
static PGShmemHeader *PGSharedMemoryAttach(void);
#endif

/* EINTR-retrying wrappers around shm_open() and close(). */
static int shm_open_robust(const char *name, int flags, mode_t mode);
static int close_robust(int d);

/*
 *	GenerateIPCName()
 *
 * Returns the name of the POSIX shared memory object that belongs to the
 * data directory named by the global DataDir.  The name is built from the
 * directory's device and inode numbers, so every process that stat()s the
 * same directory computes the same name.
 *
 * The name is computed on the first call and cached in a static buffer;
 * later calls return the same pointer.  The caller must not free or modify
 * the result.  If DataDir cannot be stat()ed a FATAL error is reported.
 */
static char *
GenerateIPCName(void)
{
	static char	ipcName[100];
	static bool	initialized = false;

	if (!initialized)
	{
		struct stat statbuf;

		/* Get the data directory's device and inode */
		if (stat(DataDir, &statbuf) < 0)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not stat data directory \"%s\": %m",
							DataDir)));

		/*
		 * POSIX requires that shared memory names begin with a single slash
		 * and recommends that they contain no other slashes.  Apart from the
		 * '.' separators the name uses only a fixed prefix and lower-case
		 * hexadecimal digits, to stay within the broadest assumption of what
		 * is permitted in a filename.
		 *
		 * The 'j' length modifier with 'x' takes a uintmax_t argument, so
		 * cast to the unsigned type; for negative dev_t/ino_t values this
		 * yields the same digits as the two's-complement bit pattern.
		 */
		snprintf(ipcName, sizeof(ipcName), "/PostgreSQL.%jx.%jx",
				 (uintmax_t) statbuf.st_dev, (uintmax_t) statbuf.st_ino);

		initialized = true;
	}

	return ipcName;
}

/*
 *	InternalIpcMemoryCreate(size)
 *
 * Create a new shared memory segment of 'size' bytes, map it into the
 * current process and return the mapped address.
 *
 * Two on_shmem_exit callbacks are registered: IpcMemoryDelete (right after
 * the object is created) and IpcMemoryDetach (once the mapping exists).
 *
 * Any failure (creating, sizing or mapping the object) is reported as a
 * FATAL error, so this function only returns on success.
 */
static void *
InternalIpcMemoryCreate(Size size)
{
	const int	oflag = O_RDWR | O_CREAT | O_EXCL;
	int			fd;
	int			save_errno;
	void	   *shmaddr;
	struct stat statbuf;
	char	   *ipcName = GenerateIPCName();

	/*
	 * Create new shared memory segment. Fail if it already exists
	 * (or for any other reason) just to make sure that nothing else
	 * is using this segment.
	 * Do an extra unlink first for the degenerate case of the admin
	 * killing the process and then deleting the lockfile manually,
	 * since in that case PGSharedMemoryIsInUse() would not be called.
	 * The result of this unlink is deliberately ignored.
	 */
	shm_unlink(ipcName);
	fd = shm_open_robust(ipcName, oflag, IPCProtection);

	if (fd < 0)
	{
		/*
		 * Snapshot errno so the hint selection below cannot be affected by
		 * calls made while the report is being assembled.
		 */
		save_errno = errno;

		/* Complain and abort */
		ereport(FATAL,
				(errmsg("could not create shared memory segment: %m"),
				 errdetail("Failed system call was shm_open(name=%s, oflag=%lu, mode=%lu).",
						   ipcName, (unsigned long) oflag,
						   (unsigned long) IPCProtection),
				 (save_errno == EEXIST || save_errno == EACCES) ?
				 errhint("This error means that the shared memory segment for "
						 "this data directory is still in use. Is another "
						 "postgres running in data directory \"%s\"?", DataDir) : 0,
				 (save_errno == EMFILE) ?
				 errhint("This error means that the process has reached its limit "
						 "for open file descriptors.") : 0,
				 (save_errno == ENOSPC) ?
				 errhint("This error means the process has ran out of address "
						 "space.") : 0,
				 (save_errno == ENAMETOOLONG) ?
				 errhint("This error means that the shared memory segment name "
						 "is too long.") : 0));
	}

	/* Register on-exit routine to delete the new segment */
	on_shmem_exit(IpcMemoryDelete, 0);

	/*
	 * Grow the object to the desired length.  Failures must be caught here:
	 * mapping beyond the end of the object is not guaranteed to fail in
	 * mmap() and may only show up later as a fault on first access.
	 */
	if (fstat(fd, &statbuf) < 0)
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not stat shared memory segment \"%s\": %m",
						ipcName)));
	if (statbuf.st_size < (off_t) size && ftruncate(fd, (off_t) size) < 0)
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not resize shared memory segment \"%s\" to %lu bytes: %m",
						ipcName, (unsigned long) size)));

	/* OK, should be able to attach to the segment */
	shmaddr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	save_errno = errno;

	/* Close the file descriptor since we don't need it anymore. */
	close_robust(fd);

	if (shmaddr == MAP_FAILED)
	{
		/* close() may have overwritten errno; restore mmap's for %m */
		errno = save_errno;
		elog(FATAL, "mmap with size=%lu and fd=%d failed: %m",
			 (unsigned long) size, fd);
	}

	/* Register on-exit routine to detach new segment before deleting */
	on_shmem_exit(IpcMemoryDetach, PointerGetDatum(shmaddr));

	/*
	 * Record instance ID in lockfile for data directory. The arguments aren't
	 * used since the shared memory segment name is unique.
	 */
	RecordSharedMemoryInLockFile(0, 0);

	return shmaddr;
}

/*
 *	IpcMemoryDetach(status, shmaddr)
 *
 * Unmap the shared memory segment whose mapped address is carried in
 * 'shmaddr'.  The length to unmap is read from the totalsize field of the
 * PGShmemHeader at the start of the mapping.  A failure is only logged.
 *
 * Registered as an on_shmem_exit callback, hence the argument list;
 * 'status' is not used.
 */
static void
IpcMemoryDetach(int status, Datum shmaddr)
{
	void	   *segment = DatumGetPointer(shmaddr);
	PGShmemHeader *header = (PGShmemHeader *) segment;

	if (munmap(segment, header->totalsize) >= 0)
		return;

	elog(LOG, "munmap(%p) failed: %m", segment);
}

/*
 *	IpcMemoryDelete(status, notUsed)
 *
 * Unlink the shared memory object named by GenerateIPCName().  A failure
 * is only logged.
 *
 * Registered as an on_shmem_exit callback, hence the argument list;
 * neither argument is used.
 */
static void
IpcMemoryDelete(int status, Datum notUsed)
{
	char	   *segmentName = GenerateIPCName();
	int			rc = shm_unlink(segmentName);

	if (rc < 0)
		elog(LOG, "shm_unlink(%s) failed: %m", segmentName);
}

/*
 * PGSharedMemoryIsInUse
 *
 * Is a previously-existing shmem segment still existing and in use?
 *
 * The point of this exercise is to detect the case where a prior postmaster
 * crashed, but it left child backends that are still running.  Only the
 * segment belonging to the intended DataDir is examined; its name is the one
 * returned by GenerateIPCName(), which is built from the data directory's
 * device and inode numbers.  The id1 and id2 arguments are not used.
 *
 * Returns true if creating a fresh segment under that name collides with an
 * existing one (EEXIST or EACCES), false if the name was free.  Any other
 * failure is reported as a FATAL error.
 *
 * NOTE(review): POSIX specifies that shm_unlink() removes the name
 * immediately even while the object is still mapped, in which case the
 * exclusive create below would presumably always succeed.  This check
 * therefore seems to rely on platform behaviour where the name lingers
 * while the segment is in use -- confirm on each target platform.
 */
bool
PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
{
	const int	oflag = O_RDWR | O_CREAT | O_EXCL;
	int			fd;
	int			save_errno;
	char	   *ipcName = GenerateIPCName();

	/*
	 * First unlink any segment left over from a crashed or kill -9'd
	 * postmaster. Throw an error if there is a failure other than the
	 * segment not existing (which should be the most common case).
	 *
	 * If an orphaned backend is still running, this is expected to only mark
	 * the segment for removal once the backend terminates, so we shm_open it
	 * next just to make sure.
	 */
	if (shm_unlink(ipcName) == -1 && errno != ENOENT)
	{
		/* Snapshot errno so hint selection is unaffected by later calls */
		save_errno = errno;

		ereport(FATAL,
				(errmsg("could not unlink dead shared memory segment: %m"),
				 errdetail("Failed system call was shm_unlink(name=%s).",
						   ipcName),
				 (save_errno == EACCES) ?
				 errhint("This error means that permission was denied to "
						 "remove the segment.") : 0,
				 (save_errno == ENAMETOOLONG) ?
				 errhint("This error means that the shared memory segment name "
						 "is too long.") : 0));
	}

	/*
	 * Now try to atomically create a new segment. If this succeeds then there
	 * must be no orphaned backends running on this data dir.
	 */
	fd = shm_open_robust(ipcName, oflag, IPCProtection);

	if (fd < 0)
	{
		save_errno = errno;

		/*
		 * Return true if an error indicates a collision with existing segment.
		 * One would expect EEXIST, given that we said O_EXCL.
		 */
		if (save_errno == EEXIST || save_errno == EACCES)
			return true;

		/*
		 * Else complain and abort.  The detail reports the flags that were
		 * actually passed to shm_open().
		 */
		ereport(FATAL,
				(errmsg("could not create shared memory segment: %m"),
				 errdetail("Failed system call was shm_open(name=%s, oflag=%lu, mode=%lu).",
						   ipcName, (unsigned long) oflag,
						   (unsigned long) IPCProtection),
				 (save_errno == EMFILE) ?
				 errhint("This error means that the process has reached its limit "
						 "for open file descriptors.") : 0,
				 (save_errno == ENOSPC) ?
				 errhint("This error means the process has ran out of address "
						 "space.") : 0,
				 (save_errno == ENAMETOOLONG) ?
				 errhint("This error means that the shared memory segment name "
						 "is too long.") : 0));
	}

	/* Cleanup and release the probe segment we just created. */
	close_robust(fd);
	if (shm_unlink(ipcName) == -1)
	{
		ereport(FATAL,
				(errmsg("could not cleanup shared memory segment: %m"),
				 errdetail("Failed system call was shm_unlink(name=%s).",
						   ipcName)));
	}

	return false;
}


/*
 * PGSharedMemoryCreate
 *
 * Create a shared memory segment of the given size and initialize its
 * standard header.  The on_shmem_exit callbacks that release the storage
 * are registered by InternalIpcMemoryCreate.
 *
 * The segment name is derived from the data directory's device and inode
 * numbers (see GenerateIPCName), so it is specific to this data directory.
 *
 * makePrivate means to always create a new segment, rather than attach to
 * or recycle any existing segment. Currently, this value is ignored as
 * all segments are newly created. Port is similarly ignored, as this POSIX
 * layer bases its shmem segment name only on the data directory.
 *
 * Returns a pointer to the header at the start of the new segment and
 * stores the same address in UsedShmemSegAddr.  Failures are FATAL.
 */
PGShmemHeader *
PGSharedMemoryCreate(Size size, bool makePrivate, int port)
{
	void	   *shmaddr;
	PGShmemHeader *hdr;
	struct stat statbuf;

	/* Room for a header? */
	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));

	/* Make sure PGSharedMemoryAttach doesn't fail without need */
	UsedShmemSegAddr = NULL;

	/* Create the new segment */
	shmaddr = InternalIpcMemoryCreate(size);
	hdr = (PGShmemHeader *) shmaddr;

	/*
	 * Initialize space allocation status for segment.  Do this before
	 * anything that can fail: the detach callback registered by
	 * InternalIpcMemoryCreate reads totalsize to know how much to unmap, so
	 * it must already be valid if the stat() below raises FATAL.
	 */
	hdr->totalsize = size;
	hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));

	/* Mark the segment as created by this process. */
	hdr->creatorPID = getpid();
	hdr->magic = PGShmemMagic;

	/* Fill in the data directory ID info, too */
	if (stat(DataDir, &statbuf) < 0)
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not stat data directory \"%s\": %m",
						DataDir)));
	hdr->device = statbuf.st_dev;
	hdr->inode = statbuf.st_ino;

	/* Save address for possible future use */
	UsedShmemSegAddr = shmaddr;

	return hdr;
}

#ifdef EXEC_BACKEND

/*
 * PGSharedMemoryReAttach
 *
 * Re-attach to an already existing shared memory segment.	In the non
 * EXEC_BACKEND case this is not used, because postmaster children inherit
 * the shared memory segment attachment via fork().
 *
 * UsedShmemSegAddr is an implicit parameter to this routine.  The caller
 * must have already restored them to the postmaster's values.
 */
void
PGSharedMemoryReAttach(void)
{
	void   *hdr;
	void   *origUsedShmemSegAddr = UsedShmemSegAddr;

	Assert(UsedShmemSegAddr != NULL);
	Assert(IsUnderPostmaster);

#ifdef __CYGWIN__
	/* cygipc (currently) appears to not detach on exec. */
	PGSharedMemoryDetach();
	UsedShmemSegAddr = origUsedShmemSegAddr;
#endif

	elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
	hdr = (void *) PGSharedMemoryAttach();
	if (hdr == NULL)
		elog(FATAL, "could not reattach to shared memory (addr=%p): %m",
			 UsedShmemSegAddr);
	if (hdr != origUsedShmemSegAddr)
		elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
			 hdr, origUsedShmemSegAddr);

	UsedShmemSegAddr = hdr;		/* probably redundant */
}


/*
 * PGSharedMemoryAttach
 *
 * Attach to shared memory and make sure it has a Postgres header.
 *
 * The segment named by GenerateIPCName() is opened and its header mapped
 * first, using UsedShmemSegAddr as the requested address.  If the header
 * carries the Postgres magic number, the header mapping is dropped and the
 * whole segment (hdr->totalsize bytes) is mapped instead.
 *
 * Returns attach address if OK, else NULL.  On a NULL return errno
 * describes the failure, so that callers may report it with %m; a segment
 * without a valid header is reported as EINVAL.
 */
static PGShmemHeader *
PGSharedMemoryAttach(void)
{
	PGShmemHeader *hdr;
	Size		size;
	int			fd;
	int			save_errno;

	fd = shm_open_robust(GenerateIPCName(), O_RDWR, 0);
	if (fd < 0)
		return NULL;			/* errno is still that of shm_open */

	hdr = (PGShmemHeader *) mmap(UsedShmemSegAddr, sizeof(PGShmemHeader),
								 PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	if (hdr == (PGShmemHeader *) MAP_FAILED)
	{
		/* failed: this should never happen; keep mmap's errno across close */
		save_errno = errno;
		close_robust(fd);
		errno = save_errno;
		return NULL;
	}

	if (hdr->magic != PGShmemMagic)
	{
		/* segment belongs to a non-Postgres app, which should be impossible */
		munmap((void *) hdr, sizeof(PGShmemHeader));
		close_robust(fd);
		errno = EINVAL;			/* no system call failed; give %m something meaningful */
		return NULL;
	}

	/*
	 * Since the segment has a valid Postgres header, unmap and re-map it
	 * with the proper size.
	 */
	size = hdr->totalsize;
	munmap((void *) hdr, sizeof(PGShmemHeader));
	hdr = (PGShmemHeader *) mmap(UsedShmemSegAddr, size,
								 PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	save_errno = errno;
	close_robust(fd);

	if (hdr == (PGShmemHeader *) MAP_FAILED)	/* this shouldn't happen either */
	{
		errno = save_errno;
		return NULL;
	}

	return hdr;
}
#endif   /* EXEC_BACKEND */

/*
 * PGSharedMemoryDetach
 *
 * Unmap the shared memory segment recorded in UsedShmemSegAddr, if there is
 * one, and reset UsedShmemSegAddr to NULL.  The length to unmap is taken
 * from the totalsize field of the segment's header.  A munmap failure is
 * logged, and the global is cleared regardless.
 *
 * This is not intended for use by the process that originally created the
 * segment (it will have an on_shmem_exit callback registered to do that).
 * Rather, this is for subprocesses that have inherited an attachment and
 * want to get rid of it.
 */
void
PGSharedMemoryDetach(void)
{
	PGShmemHeader *header = (PGShmemHeader *) UsedShmemSegAddr;

	/* Nothing attached, nothing to do */
	if (header == NULL)
		return;

	if (munmap(UsedShmemSegAddr, header->totalsize) < 0)
		elog(LOG, "munmap(%p) failed: %m", UsedShmemSegAddr);

	UsedShmemSegAddr = NULL;
}

/*
 * shm_open_robust
 *
 * Call shm_open(name, flags, mode), repeating the call for as long as it
 * fails with EINTR.
 *
 * Returns whatever the final shm_open call returned: a descriptor on
 * success, or a negative value with errno set to something other than
 * EINTR.
 */
static int
shm_open_robust(const char *name, int flags, mode_t mode)
{
	for (;;)
	{
		int			fd = shm_open(name, flags, mode);

		/* Done on success, or on any failure that is not an interruption */
		if (fd >= 0 || errno != EINTR)
			return fd;
	}
}

/*
 * close_robust
 *
 * Call close(d), repeating the call for as long as it fails with EINTR.
 *
 * Returns the result of the final close call: 0 on success, or -1 with
 * errno set to something other than EINTR.
 *
 * NOTE(review): some platforms release the descriptor even when close()
 * reports EINTR, in which case the retry would operate on an already
 * closed descriptor -- confirm this is acceptable on the target platform.
 */
static int
close_robust(int d)
{
	for (;;)
	{
		int			rc = close(d);

		/* Stop unless the call failed specifically because of a signal */
		if (rc != -1 || errno != EINTR)
			return rc;
	}
}
