#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Low-order bits that must be clear in an address or length for long-wide stores. */
#define LONG_ALIGN_MASK (sizeof(long) - 1)
/* Largest byte count the inline loop accepts; a value of 0 disables the loop. */
#define MEMSET_LOOP_LIMIT 1024

/*
 * MemSet(start, val, len)
 *
 * Zero `len` bytes beginning at `start` by storing one long at a time.
 * The inline loop is taken only when all of the following hold:
 *   - `start` is aligned to sizeof(long),
 *   - `len` is a multiple of sizeof(long),
 *   - `val` is 0,
 *   - `len` does not exceed MEMSET_LOOP_LIMIT (and that limit is non-zero).
 * When any condition fails, this variant does not fall back to anything:
 * it writes a message to stderr and terminates the process via exit(-1).
 *
 * Each argument is evaluated exactly once.  A type named `Size` must be
 * visible wherever the macro is expanded.
 */
#define MemSet(start, val, len) \
	do \
	{ \
		/* kept as void * until the alignment test below has passed */ \
		void   *_ms_base = (void *) (start); \
		int		_ms_fill = (val); \
		Size	_ms_bytes = (len); \
\
		/* \
		 *	The first operand is a compile-time constant, so with \
		 *	MEMSET_LOOP_LIMIT == 0 the optimizer can reduce this \
		 *	whole test to "always reject". \
		 */ \
		if (MEMSET_LOOP_LIMIT == 0 || \
			_ms_bytes > MEMSET_LOOP_LIMIT || \
			_ms_fill != 0 || \
			(_ms_bytes & LONG_ALIGN_MASK) != 0 || \
			(((uintptr_t) _ms_base) & LONG_ALIGN_MASK) != 0) \
		{ \
			fprintf(stderr, "Macro is using builtin memset function\n"); \
			exit(-1); \
		} \
		else \
		{ \
			long *_ms_end = (long *) ((char *) _ms_base + _ms_bytes); \
			for (long *_ms_cur = (long *) _ms_base; _ms_cur < _ms_end; _ms_cur++) \
				*_ms_cur = 0; \
		} \
	} while (0)


/*
 * Fixed iteration count.  Nothing in the visible part of this file refers to
 * it; main() reads its loop count from the command line instead.
 */
#define LOOPS 100000000

/*
 * test_memset(s, loops)
 *
 * Time `loops` calls of the C library memset(), each zeroing the first `s`
 * bytes of the object named `a` that is in scope at the expansion site, then
 * print the elapsed processor time (clock() ticks divided by CLOCKS_PER_SEC)
 * to stdout.
 *
 *   s      byte count handed to memset(); it is expanded inside the loop and
 *          once more (converted to int) for the report line, so it should be
 *          free of side effects.
 *   loops  iteration count; evaluated exactly once, before timing starts.
 *
 * The macro's own locals use a tm_..._ spelling so that arguments written as
 * `i`, `start` or `end` refer to the caller's variables rather than being
 * captured by the macro's loop counter or timestamps.
 */
#define test_memset(s, loops) \
	do { \
		const int tm_loops_ = (loops); \
		clock_t tm_begin_; \
		clock_t tm_finish_; \
		tm_begin_ = clock(); \
		for (int tm_i_ = 0; tm_i_ < tm_loops_; tm_i_++) \
		{ \
			memset(&(a), 0, (s)); \
		} \
		tm_finish_ = clock(); \
		/* cast keeps the %d conversion valid even if s is not an int */ \
		printf("memset: size %d: %f seconds\n", (int) (s), (double) (tm_finish_ - tm_begin_) / CLOCKS_PER_SEC); \
	} while (0)

/*
 * test_MemSet(s, loops)
 *
 * Time `loops` expansions of the MemSet macro, each asked to zero the first
 * `s` bytes of the object named `a` that is in scope at the expansion site,
 * then print the elapsed processor time (clock() ticks divided by
 * CLOCKS_PER_SEC) to stdout.
 *
 *   s      byte count handed to MemSet; it is expanded inside the loop and
 *          once more (converted to int) for the report line, so it should be
 *          free of side effects.
 *   loops  iteration count; evaluated exactly once, before timing starts.
 *
 * The macro's own locals use a tM_..._ spelling so that arguments written as
 * `i`, `start` or `end` refer to the caller's variables rather than being
 * captured by the macro's loop counter or timestamps.
 */
#define test_MemSet(s, loops) \
	do { \
		const int tM_loops_ = (loops); \
		clock_t tM_begin_; \
		clock_t tM_finish_; \
		tM_begin_ = clock(); \
		for (int tM_i_ = 0; tM_i_ < tM_loops_; tM_i_++) \
		{ \
			MemSet(&(a), 0, (s)); \
		} \
		tM_finish_ = clock(); \
		/* cast keeps the %d conversion valid even if s is not an int */ \
		printf("MemSet: size %d: %f seconds\n", (int) (s), (double) (tM_finish_ - tM_begin_) / CLOCKS_PER_SEC); \
	} while (0)

/* Type name the MemSet macro uses for its local copy of the length argument. */
typedef size_t Size;

/*
 * Buffer cleared by both test_memset and test_MemSet, which refer to it by
 * this exact name.
 * NOTE(review): nothing here forces this char array onto a sizeof(long)
 * boundary, yet MemSet terminates the program when the address is not
 * long-aligned -- confirm the target toolchain places it suitably.
 */
char a[512];

/*
 * Benchmark driver.
 *
 * Expects one argument: the number of iterations per measurement, as a
 * non-negative base-10 integer that fits in an int.  Runs test_MemSet and
 * then test_memset for sizes 8 through 512 bytes (powers of two).
 *
 * Returns 0 on success and -1 when the argument is missing or is not a valid
 * iteration count.
 */
int main(int argc, char *argv[])
{
	const char *progname;
	char	   *endptr;
	long		parsed;
	int			loops;

	if (argc < 2)
	{
		/* argv[0] may be absent when the program is exec'd with an empty argv */
		progname = (argc > 0 && argv[0] != NULL) ? argv[0] : "<program>";
		fprintf(stderr, "Usage: %s <loops>\n", progname);
		return -1;
	}

	/*
	 * strtol instead of atoi: atoi cannot report garbage input (it yields 0,
	 * silently benchmarking nothing) and is undefined on overflow.
	 */
	errno = 0;
	parsed = strtol(argv[1], &endptr, 10);
	if (endptr == argv[1] || *endptr != '\0' || errno == ERANGE ||
		parsed < 0 || parsed > INT_MAX)
	{
		fprintf(stderr, "Invalid loop count: %s\n", argv[1]);
		return -1;
	}
	loops = (int) parsed;
	printf("Running %d loops\n", loops);

	/* hand-rolled macro first ... */
	test_MemSet(8, loops);
	test_MemSet(16, loops);
	test_MemSet(32, loops);
	test_MemSet(64, loops);
	test_MemSet(128, loops);
	test_MemSet(256, loops);
	test_MemSet(512, loops);

	/* ... then the library routine over the same sizes */
	test_memset(8, loops);
	test_memset(16, loops);
	test_memset(32, loops);
	test_memset(64, loops);
	test_memset(128, loops);
	test_memset(256, loops);
	test_memset(512, loops);

	return 0;
}