#include <math.h>
#include <stdio.h>
#include <stdlib.h>

typedef unsigned char uint8;
typedef long int64;
typedef unsigned long uint64;
typedef unsigned __int128 uint128;
typedef int64 (*permute_fn)(const int64, const int64, const int64);

#define PRP_PRIMES 16

static uint64 primes[PRP_PRIMES] = {
    8388617,
    8912921,
    9437189,
    9961487,
    10485767,
    11010059,
    11534351,
    12058679,
    12582917,
    13107229,
    13631489,
    14155777,
    14680067,
    15204391,
    15728681,
    16252967
};

#define PRP_ROUNDS 4

static uint64
compute_mask(uint64 n)
{
    n |= n >> 1;
    n |= n >> 2;
    n |= n >> 4;
    n |= n >> 8;
    n |= n >> 16;
    n |= n >> 32;
    return n;
}

static uint64
modular_multiply(uint64 x, uint64 y, const uint64 m)
{
    return (uint128) x * (uint128) y % (uint128) m;
}

#define DK_LCG_MUL 6364136223846793005L
#define DK_LCG_INC 1442695040888963407L

#define LCG_SHIFT 13

static int64
permute(const int64 data, const int64 isize, const int64 seed)
{
    uint64      size = (uint64) isize;
    uint64      v = (uint64) data % size;
    uint64      key = (uint64) seed;
    uint64      mask = compute_mask(size - 1) >> 1;

    if (isize == 1)
        return 0;

    for (unsigned int i = 0, p = key % PRP_PRIMES;
         i < PRP_ROUNDS; i++, p = (p + 1) % PRP_PRIMES)
    {
        uint64      t;

        key = key * DK_LCG_MUL + DK_LCG_INC;
        if (v <= mask)
            v ^= (key >> LCG_SHIFT) & mask;

        key = key * DK_LCG_MUL + DK_LCG_INC;
        t = size - 1 - v;
        if (t <= mask)
        {
            t ^= (key >> LCG_SHIFT) & mask;
            v = size - 1 - t;
        }

        while (size % primes[p] == 0)
            p = (p + 1) % PRP_PRIMES;

        key = key * DK_LCG_MUL + DK_LCG_INC;

        if ((v & 0xffffffffffL) == v)
            v = (primes[p] * v + (key >> LCG_SHIFT)) % size;
        else
            v = (modular_multiply(primes[p], v, size) +
                (key >> LCG_SHIFT)) % size;
    }

    return (int64) v;
}

static int64
permute2(const int64 data, const int64 isize, const int64 seed)
{
    unsigned short eseed[] = { (seed >> 32) & 0xffff,
                               (seed >> 16) &0xffff,
                               seed & 0xffff };
    uint64      size = (uint64) isize;
    uint64      v = (uint64) data % size;
    uint64      mask;
    uint64      top_bit;
    int         i;

    if (isize == 1)
        return 0;

    // This choice of mask satisfies size/2 <= mask <= size-1
    mask = compute_mask(size - 1);
    if (mask >= size) mask >>= 1;

    // Most significant bit of mask
    top_bit = (mask + 1) >> 1;

    for (i = 0; i < 4; i++)
    {
        uint64 m;
        uint64 r;
        uint64 t;

        m = (uint64) (erand48(eseed) * (mask + 1)) | 1;
        r = (uint64) (erand48(eseed) * (mask + 1));
        if (v <= mask)
        {
          v = ((v * m) ^ r) & mask;
          v = ((v << 1) & mask) | (v & top_bit ? 1 : 0);
        }

        r = (uint64) (erand48(eseed) * size);
        v = (v + r) % size;

        m = (uint64) (erand48(eseed) * (mask + 1)) | 1;
        r = (uint64) (erand48(eseed) * (mask + 1));
        t = size - 1 - v;
        if (t <= mask)
        {
          t = ((t * m) ^ r) & mask;
          t = ((t << 1) & mask) | (t & top_bit ? 1 : 0);
          v = size - 1 - t;
        }

        r = (uint64) (erand48(eseed) * size);
        v = (v + r) % size;
    }

    return (int64) v;
}

static uint64 perm_to_int(int64 *pvals, int64 size)
{
  int64 f = 1;
  int64 N = 0;
  int64 i;

  for (i = 1; i < size; i++)
  {
    N += pvals[i] * f;
    f *= size;
  }
  return N;
}

int main()
{
  permute_fn permute_fn = &permute2;
  unsigned short seed[3] = { 1234, 5678, 9012 };
  int s[] = { 2, 3, 4, 5, 6, 7, 8, 9, -1 };
  int x;
  int y;
  int64 size;
  int64 seed64;
  int i;

  for (x = 0, size = s[x]; size > 0; size = s[++x])
  {
    int64 *pvals = malloc(size * sizeof(int64));
    uint64 id = -1;
    int c = 0;

    for (y = 0; ; y++)
    {
      seed64 = erand48(seed) * (1L<<48);
      for (i = 0; i < size; i++)
        pvals[i] = (*permute_fn)(i, size, seed64);
      printf("size=%d, p=[", size);
      for (i = 0; i < size; i++)
        printf(" %ld", pvals[i]);
      printf(" ]\n");

      if (id == -1) id = perm_to_int(pvals, size);
      else if (perm_to_int(pvals, size) == id && ++c == 100) break;
    }
  }

  return 0;
}
