From ebce7b8e8350ab8799bd13aa54bdb2a562365442 Mon Sep 17 00:00:00 2001 From: fef Date: Thu, 4 Nov 2021 05:56:40 +0100 Subject: [PATCH] mutex: fix lock race conditions --- arch/x86/include/amd64/latom.h | 24 +++--- arch/x86/include/arch/atom.h | 53 +++++++----- arch/x86/include/i386/latom.h | 9 +- cmake/config.cmake | 2 + include/gay/cdefs.h | 24 ++++++ include/gay/config.h.in | 3 + include/gay/mutex.h | 15 +++- include/gay/systm.h | 20 +++++ kernel/mutex.c | 150 ++++++++++++++++++++------------- 9 files changed, 202 insertions(+), 98 deletions(-) diff --git a/arch/x86/include/amd64/latom.h b/arch/x86/include/amd64/latom.h index e9c172e..90f7751 100644 --- a/arch/x86/include/amd64/latom.h +++ b/arch/x86/include/amd64/latom.h @@ -12,27 +12,27 @@ #error "__LP64__ must be defined on amd64" #endif +static inline void latom_init(latom_t *latom, long val) +{ + latom->_value = val; +} + static inline long latom_read(const latom_t *latom) { return latom->_value; } -static inline long latom_write(latom_t *latom, long val) +static inline long latom_xchg(latom_t *latom, long val) { - long rax; - + /* the intel manual says XCHG is always atomic, and you don't need a LOCK prefix */ __asm__ volatile( -" movq (%2), %0 \n" /* rax = atom->_value */ -"1: lock \n" -" cmpxchgq %1, (%2) \n" /* if (latom->_value == rax) latom->_value = val */ -" pause \n" /* intel says you're supposed to do this in spin loops */ -" jne 1b \n" /* else goto 1 (rax updated to new latom->_value) */ - : "=a"(rax) - : "r"(val), "r"(&latom->_value) - : "cc", "memory" +" xchgq %0, (%1) \n" + : "+r"(val) + : "r"(&latom->_value) + : "memory" ); - return rax; + return val; } static inline long latom_cmp_xchg(latom_t *latom, long compare, long val) diff --git a/arch/x86/include/arch/atom.h b/arch/x86/include/arch/atom.h index 74a0a98..37ce1d6 100644 --- a/arch/x86/include/arch/atom.h +++ b/arch/x86/include/arch/atom.h @@ -6,11 +6,24 @@ #include #include +/** + * @brief Initialize an atom non-atomically. + * This function is **only** for initializing an atom, you should never use it + * in code that requires actual atomicity (i.e. everything except initializers). + * + * @param atom Atom to initialize + * @param val Initial value for the atom + */ +static inline void atom_init(atom_t *atom, int val) +{ + atom->_value = val; +} + /** * @brief Read an atom's current value. * You usually shouldn't need this function because all the other atomic * primitives return the value before the operation, and we are only really - * interested in how values *compare* between operations. + * interested in how values *compare* between operations in most cases. * Don't use `atom_read()` followed by another atomic operation, it defeats the * whole purpose of using atomics in the first place. 
* @@ -29,22 +42,17 @@ static inline int atom_read(const atom_t *atom) * @param val New value * @return The value of `atom` *before* the operation */ -static inline int atom_write(atom_t *atom, int val) +static inline int atom_xchg(atom_t *atom, int val) { - int eax; - + /* the intel manual says XCHG is always atomic, and you don't need a LOCK prefix */ __asm__ volatile( -" movl (%2), %0 \n" /* eax = atom->_value */ -"1: lock \n" -" cmpxchgl %1, (%2) \n" /* if (atom->_value == eax) atom->_value = val */ -" pause \n" /* intel says you're supposed to do this in spin loops */ -" jne 1b \n" /* else goto 1 (eax updated to new atom->_value) */ - : "=a"(eax) - : "r"(val), "r"(&atom->_value) - : "cc", "memory" +" xchgl %0, (%1) \n" + : "+r"(val) + : "r"(&atom->_value) + : "memory" ); - return eax; + return val; } /** @@ -164,16 +172,15 @@ static inline bool atom_dec(atom_t *atom) */ static inline int atom_and(atom_t *atom, int val) { - int eax; + int eax = atom->_value; __asm__ volatile( -" movl (%2), %0 \n" /* eax = atom->_value */ "1: andl %0, %1 \n" /* val &= eax */ " lock \n" " cmpxchgl %1, (%2) \n" /* if (atom->_value == eax) atom->_value = val */ " pause \n" /* intel says you're supposed to do this in spin loops */ " jne 1b \n" /* else goto 1 (eax updated to new atom->_value) */ - : "=a"(eax), "+r"(val) + : "+a"(eax), "+r"(val) : "r"(&atom->_value) : "cc", "memory" ); @@ -190,16 +197,15 @@ static inline int atom_and(atom_t *atom, int val) */ static inline int atom_or(atom_t *atom, int val) { - int eax; + int eax = atom->_value; __asm__ volatile( -" movl (%2), %0 \n" /* eax = atom->_value */ "1: orl %0, %1 \n" /* val |= eax */ " lock \n" " cmpxchgl %1, (%2) \n" /* if (atom->_value == eax) atom->_value = val */ " pause \n" /* intel says you're supposed to do this in spin loops */ " jne 1b \n" /* else goto 1 (eax updated to new atom->_value) */ - : "=a"(eax), "+r"(val) + : "+a"(eax), "+r"(val) : "r"(&atom->_value) : "cc", "memory" ); @@ -276,14 +282,19 @@ static inline bool atom_clr_bit(atom_t *atom, int pos) #error "sizeof(long) is expected equal sizeof(void *) on x86" #endif +static inline void patom_init(patom_t *patom, void *val) +{ + patom->_ptr = val; +} + static inline void *patom_read(const patom_t *patom) { return patom->_ptr; } -static __always_inline void *patom_write(patom_t *patom, void *val) +static __always_inline void *patom_xchg(patom_t *patom, void *val) { - return (void *)latom_write((latom_t *)patom, (long)val); + return (void *)latom_xchg((latom_t *)patom, (long)val); } static __always_inline void *patom_cmp_xchg(patom_t *patom, void *compare, void *val) diff --git a/arch/x86/include/i386/latom.h b/arch/x86/include/i386/latom.h index 8075dee..506777a 100644 --- a/arch/x86/include/i386/latom.h +++ b/arch/x86/include/i386/latom.h @@ -16,14 +16,19 @@ #error "__ILP32__ must be defined on i386" #endif +static inline void latom_init(latom_t *latom, long val) +{ + latom->_value = val; +} + static inline long latom_read(const latom_t *latom) { return latom->_value; } -static __always_inline long latom_write(latom_t *latom, long val) +static __always_inline long latom_xchg(latom_t *latom, long val) { - return atom_write((atom_t *)latom, val); + return atom_xchg((atom_t *)latom, val); } static __always_inline long latom_cmp_xchg(latom_t *latom, long compare, long val) diff --git a/cmake/config.cmake b/cmake/config.cmake index 0fc1af3..1326fd5 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -20,6 +20,8 @@ set(CFG_MAX_CPU "64" CACHE STRING "Maximum number of logical 
processors") option(CFG_DEBUG_IRQ "Debug IRQs" ${DEBUG}) +option(CFG_DEBUG_MTX "Debug mutexes" ${DEBUG}) + option(CFG_DEBUG_CLIST "Sanitize clist operations" ${DEBUG}) option(CFG_DEBUG_PAGE_ALLOCS "Debug page frame allocations" OFF) diff --git a/include/gay/cdefs.h b/include/gay/cdefs.h index 6eaeb1c..9f55fe5 100644 --- a/include/gay/cdefs.h +++ b/include/gay/cdefs.h @@ -106,6 +106,30 @@ #define typeof(expr) __typeof(expr) #endif +/* + * These are hints for clang's branch optimizer which will try to arrange the + * code to yield the best performance when a condition is true or false. + * + * - Use them sparingly and only in performance critical places because they + * come with a sometimes very significant code size overhead due to branches + * being rearranged and aligned + * - Only use them if you know *for sure* that a particular branch is *very* + * unlikely to be hit, for example when + * + * Use it sparingly and only in performance critical places because the overhead + * from rearranging and aligning the individual instructions can quickly make + * the kernel image too big. + * Also, only use it if you actually know *for sure* that a particular branch + * is *very* unlikely to be hit. Most modern CPUs have become pretty good at + * speculative execution (~~and intel even managed to make it buggy~~) and + * probably do a better job optimizing themselves at runtime than you can here. + * Finally, you don't need this when doing resource allocation failure checks, + * because those functions are annotated with __malloc_like anyway. + */ + +#define __predict_true(x) __builtin_expect(x, 1) +#define __predict_false(x) __builtin_expect(x, 0) + /* * This file is part of GayBSD. * Copyright (c) 2021 fef . diff --git a/include/gay/config.h.in b/include/gay/config.h.in index c75b4b9..9e735ec 100644 --- a/include/gay/config.h.in +++ b/include/gay/config.h.in @@ -40,6 +40,9 @@ /** @brief Debug IRQs */ #cmakedefine01 CFG_DEBUG_IRQ +/** @brief Debug mutexes */ +#cmakedefine01 CFG_DEBUG_MTX + /** @brief Sanitize clist operations */ #cmakedefine01 CFG_DEBUG_CLIST diff --git a/include/gay/mutex.h b/include/gay/mutex.h index e3c7fb0..6ccbce6 100644 --- a/include/gay/mutex.h +++ b/include/gay/mutex.h @@ -19,7 +19,7 @@ typedef struct { void spin_init(spin_t *spin); void spin_lock(spin_t *spin); -int spin_trylock(spin_t *spin); +bool spin_trylock(spin_t *spin); void spin_unlock(spin_t *spin); struct lock_waiter { @@ -27,6 +27,15 @@ struct lock_waiter { struct task *task; }; +/** + * @brief Yielding mutual exclusion lock. 
+ * + * Several rules apply when using a mutex: + * + * - No access from irq context or within a critical section + * - Must be unlocked from the same thread that locked it + * - No more than one consecutive unlock allowed + */ struct mtx { atom_t lock; /* 1 = free, 0 = locked, < 0 = locked and other threads are waiting */ spin_t wait_queue_lock; @@ -34,7 +43,7 @@ struct mtx { }; #define MTX_DEFINE(name) { \ - .lock = ATOM_DEFINE(0), \ + .lock = ATOM_DEFINE(1), \ .wait_queue_lock = SPIN_DEFINE, \ .wait_queue = CLIST_DEFINE((name).wait_queue), \ } @@ -44,7 +53,7 @@ struct mtx { void mtx_init(struct mtx *mutex); void mtx_lock(struct mtx *mutex); void mtx_unlock(struct mtx *mutex); -int mtx_trylock(struct mtx *mutex); +bool mtx_trylock(struct mtx *mutex); struct sem { atom_t count; diff --git a/include/gay/systm.h b/include/gay/systm.h index 1600948..d1a3e88 100644 --- a/include/gay/systm.h +++ b/include/gay/systm.h @@ -3,6 +3,7 @@ #pragma once #include +#include #include #include @@ -30,14 +31,31 @@ void panic(const char *fmt, ...) __noreturn __printflike(1, 2); #define KASSERT(x) do { } while (0) #endif +/** + * @brief Enter a critical section, meaning preemption gets disabled. + * Critical sections should be kept as short as possible for performance. + * Cannot be called from irq context because irqs always run atomically. + * + * @see `critical_leave()` + */ static inline void critical_enter(void) { + KASSERT(!in_irq()); + struct task *tsk = current; tsk->critnest++; } +/** + * @brief Leave a critical section. + * Cannot be called from irq context. + * + * @see `critical_enter()` + */ static inline void critical_leave(void) { + KASSERT(!in_irq()); + struct task *tsk = current; tsk->critnest--; @@ -46,6 +64,8 @@ static inline void critical_leave(void) static inline bool in_critical(void) { + if (in_irq()) + return true; return current->critnest > 0; } diff --git a/kernel/mutex.c b/kernel/mutex.c index 717edbb..74e2577 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -4,56 +4,58 @@ #include #include -#include -#include #include #include +#include +#include -#include +#if CFG_DEBUG_MTX +#define MTX_ASSERT(x) KASSERT(x) +#else +#define MTX_ASSERT(x) ({}) +#endif void spin_init(spin_t *spin) { - atom_write(&spin->lock, 0); + atom_init(&spin->lock, 0); } void spin_lock(spin_t *spin) { + MTX_ASSERT(in_critical()); + spin_loop { - if (atom_cmp_xchg(&spin->lock, 0, 1) == 0) + if (atom_xchg(&spin->lock, 1) == 0) break; } } -int spin_trylock(spin_t *spin) +bool spin_trylock(spin_t *spin) { - if (atom_cmp_xchg(&spin->lock, 0, 1) != 0) - return -EAGAIN; - return 0; + MTX_ASSERT(in_critical()); + + return atom_xchg(&spin->lock, 1) == 0; } void spin_unlock(spin_t *spin) { - atom_write(&spin->lock, 0); + MTX_ASSERT(in_critical()); + + atom_init(&spin->lock, 0); } void mtx_init(struct mtx *mtx) { - atom_write(&mtx->lock, 1); + atom_init(&mtx->lock, 1); spin_init(&mtx->wait_queue_lock); clist_init(&mtx->wait_queue); } void mtx_lock(struct mtx *mtx) { -# ifdef DEBUG - if (in_irq()) { - kprintf("mtx_lock() called from irq context!\n"); - spin_loop { - if (atom_cmp_xchg(&mtx->lock, 1, 0) == 1) - return; - } - } -# endif + MTX_ASSERT(!in_critical()); + + critical_enter(); /* * When the mutex is locked, its lock value goes to 0. @@ -61,79 +63,92 @@ void mtx_lock(struct mtx *mtx) * nonzero, meaning the lock value has become negative. 
*/ if (atom_dec(&mtx->lock)) { - struct task *task = current; - /* - * It might not be the smartest idea to allocate this thing on - * the stack because it's gonna blow up if the task somehow dies - * before returning here. Let's see how this turns out. - */ + struct task *this_task = current; struct lock_waiter waiter = { - .task = task, + .task = this_task, }; spin_lock(&mtx->wait_queue_lock); - clist_add(&mtx->wait_queue, &waiter.clink); - spin_unlock(&mtx->wait_queue_lock); + if (atom_cmp_xchg(&mtx->lock, 1, 0) == 1) { + /* mutex was unlocked after we failed to claim it, but + * before the other thread claimed wait_queue_lock */ + spin_unlock(&mtx->wait_queue_lock); - task->state = TASK_BLOCKED; - schedule(); + critical_leave(); + } else { + this_task->state = TASK_BLOCKED; + clist_add(&mtx->wait_queue, &waiter.clink); + spin_unlock(&mtx->wait_queue_lock); + + critical_leave(); + + schedule(); + } + } else { + critical_leave(); } } -int mtx_trylock(struct mtx *mtx) +bool mtx_trylock(struct mtx *mtx) { - if (atom_cmp_xchg(&mtx->lock, 1, 0) != 1) - return -EAGAIN; - return 0; + MTX_ASSERT(!in_critical()); + + return atom_cmp_xchg(&mtx->lock, 1, 0) == 1; } void mtx_unlock(struct mtx *mtx) { + MTX_ASSERT(!in_critical()); + + critical_enter(); + if (atom_add(&mtx->lock, 1) < 0) { spin_lock(&mtx->wait_queue_lock); - struct lock_waiter *waiter = - clist_del_first_entry(&mtx->wait_queue, typeof(*waiter), clink); + if (!clist_is_empty(&mtx->wait_queue)) { + struct lock_waiter *waiter = clist_del_first_entry( + &mtx->wait_queue, + typeof(*waiter), + clink + ); + waiter->task->state = TASK_READY; + } spin_unlock(&mtx->wait_queue_lock); - waiter->task->state = TASK_READY; } + + critical_leave(); } void sem_init(struct sem *sem, int initial_count) { - atom_write(&sem->count, initial_count); + atom_init(&sem->count, initial_count); spin_init(&sem->wait_queue_lock); clist_init(&sem->wait_queue); } int sem_down(struct sem *sem) { -# ifdef DEBUG - if (in_irq()) { - kprintf("sem_down() called from IRQ context!\n"); - spin_loop { - int old = atom_sub(&sem->count, 1); - if (old >= 0) - return old; - atom_inc(&sem->count); - } - } -# endif + MTX_ASSERT(!in_critical()); - int ret = atom_sub(&sem->count, 1); + critical_enter(); + int ret = atom_sub(&sem->count, 1); if (ret < 0) { - struct task *task = current; + struct task *this_task = current; struct lock_waiter waiter = { - .task = task, + .task = this_task, }; + this_task->state = TASK_BLOCKED; spin_lock(&sem->wait_queue_lock); clist_add(&sem->wait_queue, &waiter.clink); spin_unlock(&sem->wait_queue_lock); - task->state = TASK_BLOCKED; + critical_leave(); + schedule(); ret = 0; + } else { + critical_leave(); } return ret; @@ -141,30 +156,45 @@ int sem_down(struct sem *sem) int sem_up(struct sem *sem) { - int ret = atom_add(&sem->count, 1); + MTX_ASSERT(!in_critical()); + + critical_enter(); + int ret = atom_add(&sem->count, 1); if (ret <= 0) { spin_lock(&sem->wait_queue_lock); - struct lock_waiter *waiter = - clist_del_first_entry(&sem->wait_queue, typeof(*waiter), clink); + if (!clist_is_empty(&sem->wait_queue)) { + struct lock_waiter *waiter = clist_del_first_entry( + &sem->wait_queue, + typeof(*waiter), + clink + ); + waiter->task->state = TASK_READY; + } spin_unlock(&sem->wait_queue_lock); - waiter->task->state = TASK_READY; ret = 0; } + critical_leave(); + return ret; } int sem_trydown(struct sem *sem) { - int ret = atom_sub(&sem->count, 1); + MTX_ASSERT(!in_critical()); + critical_enter(); + + int ret = atom_sub(&sem->count, 1); if (ret < 0) { 
atom_inc(&sem->count); - ret = -EAGAIN; + ret = -1; } + critical_leave(); + return ret; }
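
Usage notes (illustrative only, not part of the diff):

With the rules documented in include/gay/mutex.h and include/gay/systm.h above, the
intended split is: mutexes are taken from regular, preemptible thread context and may
sleep, while spinlocks are only taken inside a critical section and should only guard
very short, non-blocking work.  A minimal sketch of that pattern, assuming the
<gay/mutex.h> and <gay/systm.h> include paths; example_lock, example_count,
example_list_lock and the two functions are made-up names:

#include <gay/mutex.h>          /* assumed path, header lives at include/gay/mutex.h */
#include <gay/systm.h>          /* assumed path, header lives at include/gay/systm.h */

static struct mtx example_lock = MTX_DEFINE(example_lock);
static int example_count;

/* mutexes: thread context only, never from irq context or a critical section */
void example_bump(void)
{
        mtx_lock(&example_lock);        /* may block and call schedule() if contended */
        example_count++;
        mtx_unlock(&example_lock);      /* must be unlocked by the locking thread */
}

static spin_t example_list_lock = SPIN_DEFINE;

/* spinlocks: only with preemption disabled, and only for very short sections */
void example_poke(void)
{
        critical_enter();               /* spin_lock() expects in_critical() to hold */
        spin_lock(&example_list_lock);
        /* ... short, non-blocking work ... */
        spin_unlock(&example_list_lock);
        critical_leave();
}

With CFG_DEBUG_MTX enabled, the MTX_ASSERT() checks added by this patch should catch
violations of these rules at runtime.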
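
In the same spirit, a small sketch of the new __predict_false() hint from
include/gay/cdefs.h; struct example_superblock, sb_magic and EXAMPLE_SB_MAGIC are
invented for illustration, and panic() is the existing routine declared in
include/gay/systm.h:

#define EXAMPLE_SB_MAGIC 0x47425344u    /* made-up magic value */

struct example_superblock {
        unsigned int sb_magic;
        /* ... */
};

static void example_check_sb(const struct example_superblock *sb)
{
        /* hint that the corruption path is almost never taken */
        if (__predict_false(sb->sb_magic != EXAMPLE_SB_MAGIC))
                panic("superblock corrupted");
}

As the comment in cdefs.h says, this is only worth it on genuinely cold paths; most
branches are better left to the CPU's own branch predictor.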
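
Finally, a sketch of the semaphore API touched by this patch, used as a simple item
counter; example_items and the three functions are invented, and the actual item queue
is elided.  Like mutexes, sem_down() and sem_up() are meant for preemptible thread
context:

static struct sem example_items;

void example_items_init(void)
{
        sem_init(&example_items, 0);    /* start with nothing available */
}

void example_produce(void)
{
        /* ... enqueue an item somewhere ... */
        sem_up(&example_items);         /* wakes one waiter, if any */
}

void example_consume(void)
{
        sem_down(&example_items);       /* blocks while the count is zero */
        /* ... dequeue the item ... */
}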