#include <sys/atomic.h>

#include <assert.h>
#include <err.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/*
 * BROKEN selects a deliberately weakened membar_dekker() so that the
 * effect of an insufficient barrier can be observed.  It is forced on
 * here; delete the #define (keeping the #undef) to build the proper
 * barriers instead.
 */
#undef	BROKEN
#define	BROKEN	1

/*
 * Per-architecture primitives used by lock()/unlock()/thread():
 *
 *	membar_acquire()  issued by lock() once the lock has been obtained
 *	membar_release()  issued by unlock() before the lock is given up
 *	membar_dekker()   issued by lock() between setting its own waiting[]
 *			  flag and reading the other thread's flag
 *	noop()		  spin-wait hint executed inside the critical section
 *
 * Every variant carries a "memory" clobber, so each one is at least a
 * compiler barrier.  __asm__/__volatile__ are spelled with underscores
 * so the file also builds in strict ISO modes (-std=c11), where the
 * plain `asm' keyword is not available.
 */
#if defined(__amd64__)
/* Compiler-only barriers: no fence instruction is emitted for these. */
#define	membar_acquire()	__asm__ __volatile__("" ::: "memory")
#define	membar_release()	__asm__ __volatile__("" ::: "memory")
#ifdef BROKEN	/* not really broken because atomic_swap implies seq_cst */
#define	membar_dekker()		__asm__ __volatile__("" ::: "memory")
#else
#define	membar_dekker()		__asm__ __volatile__("mfence" ::: "memory")
#endif
#define	noop()			__asm__ __volatile__("pause" ::: "memory")
#elif defined(__aarch64__)
#define	membar_acquire()	__asm__ __volatile__("dmb ishld" ::: "memory")
#define	membar_release()	__asm__ __volatile__("dmb ish" ::: "memory")
#ifdef BROKEN
/* Load-only barrier substituted for the full barrier used below. */
#define	membar_dekker()		__asm__ __volatile__("dmb ishld" ::: "memory")
#else
#define	membar_dekker()		__asm__ __volatile__("dmb ish" ::: "memory")
#endif
#define	noop()			__asm__ __volatile__("yield" ::: "memory")
#else
/* Fail early and clearly instead of with undeclared-identifier errors. */
#error "membar_*() and noop() are only implemented for amd64 and aarch64"
#endif

/*
 * State shared by the two threads.  All of it is volatile so that every
 * access in the source is an actual memory access; volatile by itself
 * provides no ordering between the two threads.
 */

/*
 * waiting[i] is nonzero while thread i is contending for, or holding,
 * the lock.  lock() sets it (and clears it temporarily when backing
 * off); unlock() clears it.
 */
volatile unsigned waiting[2];
/*
 * Thread index that gets priority when both threads want the lock.
 * unlock() hands it to the other thread.
 */
volatile unsigned turn;
/* Incremented twice per critical section in thread(); printed by main(). */
volatile unsigned counter;

/*
 * lock(me)
 *
 *	Acquire the two-thread lock on behalf of thread `me', busy-waiting
 *	until it is obtained.  `me' is expected to be 0 or 1: it indexes
 *	waiting[2], and the other thread is taken to be 1 - me.
 *
 *	The structure follows Dekker's algorithm: raise our own flag,
 *	then look at the other thread's flag, and use `turn' to decide
 *	who backs off when both are raised.
 */
static void
lock(unsigned me)
{

	/* Announce interest in the lock (old value is not needed). */
top:	(void)atomic_swap_uint(&waiting[me], 1);
	/*
	 * Barrier between publishing our flag above and reading the other
	 * thread's flag below.
	 */
	membar_dekker();
	while (waiting[1 - me]) {
		/* The other thread wants the lock too; defer if it has priority. */
		if (turn != me) {
			/* Withdraw our claim so the other thread can get in. */
			waiting[me] = 0;
			/* Spin until unlock() on the other side gives us the turn. */
			while (turn != me)
				continue;
			/* Raise the flag again and re-check from the start. */
			goto top;
		}
	}
	/* Lock obtained: barrier before entering the critical section. */
	membar_acquire();
}

/*
 * unlock(me)
 *
 *	Release the lock held by thread `me' (expected to be 0 or 1, the
 *	same value that was passed to lock()).
 */
static void
unlock(unsigned me)
{

	/* Barrier between the critical section and the stores below. */
	membar_release();
	/* Give priority to the other thread for the next contended round. */
	turn = 1 - me;
	/* Drop our flag; lock() in the other thread is polling it. */
	waiting[me] = 0;
}

/*
 * thread(cookie)
 *
 *	pthread start routine.  `cookie' carries this thread's index
 *	(cast from an integer by the creator).  Performs 10000000 rounds
 *	of: take the lock, bump `counter' twice with a noop() in between,
 *	drop the lock.  Always returns NULL.
 */
static void *
thread(void *cookie)
{
	const unsigned self = (uintptr_t)cookie;
	unsigned remaining = 10000000;

	/* Count down to zero; the body runs exactly `remaining' times. */
	while (remaining-- > 0) {
		lock(self);
		/* Two separate read-modify-writes inside the critical section. */
		counter++;
		noop();
		counter++;
		unlock(self);
	}

	return NULL;
}

/*
 * main()
 *
 *	Start two threads running thread() with indices 0 and 1, wait for
 *	both to finish, and print the final value of `counter'.  Each
 *	thread adds 2 per iteration over 10000000 iterations, so the
 *	printed value is 40000000 when no increment is lost.
 *
 *	Exits with status 1 (via errc) if a thread cannot be created or
 *	joined; otherwise returns the ferror() state of stdout.
 */
int
main(void)
{
	pthread_t workers[2];
	unsigned id;
	int rc;

	/* Launch both contenders, passing each its index as the cookie. */
	for (id = 0; id < 2; id++) {
		rc = pthread_create(&workers[id], NULL, &thread,
		    (void *)(uintptr_t)id);
		if (rc != 0)
			errc(1, rc, "pthread_create");
	}

	/* Wait for both before reading the shared counter. */
	for (id = 0; id < 2; id++) {
		rc = pthread_join(workers[id], NULL);
		if (rc != 0)
			errc(1, rc, "pthread_join");
	}

	printf("%u\n", counter);
	fflush(stdout);

	/* Nonzero exit status if writing the result failed. */
	return ferror(stdout);
}