mutex: avoid wait queue lock if possible
This also tidies up the atomic operations a little and adds a new atom_cmp_xchg() as well as the same APIs for longs and pointers.
parent c66b05216d
commit c36b03d97c
9 changed files with 446 additions and 225 deletions
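For orientation, here is a minimal sketch (not taken from the diff) of the fast path this commit is after: the mutex lock counter starts at 1, an uncontended acquire takes it straight to 0 with a single compare/exchange, and only contended acquires fall back to the wait queue and its spinlock. The helper name mtx_try_fast_lock is made up for illustration; see mtx_lock() in the diff below for the real implementation.

/*
 * Lock counter protocol after this change (illustrative only):
 *   lock == 1   mutex is free
 *   lock == 0   mutex is held, nobody is waiting
 *   lock  < 0   mutex is held and other threads sleep on the wait queue
 */
static inline bool mtx_try_fast_lock(struct mtx *mtx)
{
	/* 1 -> 0: uncontended acquire, the wait queue spinlock is never touched */
	return atom_cmp_xchg(&mtx->lock, 1, 0) == 1;
}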

arch/x86/include/amd64/latom.h (new file, 193 lines)

@@ -0,0 +1,193 @@
/* See the end of this file for copyright and license terms. */

#pragma once
#ifndef _ARCH_ATOM_H_
#error "This file is not meant to be included directly, use <arch/atom.h>"
#endif

#include <gay/cdefs.h>
#include <gay/types.h>

#ifndef __LP64__
#error "__LP64__ must be defined on amd64"
#endif

static inline long latom_read(const latom_t *latom)
{
	return latom->_value;
}

static inline long latom_write(latom_t *latom, long val)
{
	long rax;

	__asm__ volatile(
"	movq	(%2), %0	\n"	/* rax = latom->_value */
"1:	lock			\n"
"	cmpxchgq %1, (%2)	\n"	/* if (latom->_value == rax) latom->_value = val */
"	pause			\n"	/* intel says you're supposed to do this in spin loops */
"	jne	1b		\n"	/* else goto 1 (rax updated to new latom->_value) */
	: "=a"(rax)
	: "r"(val), "r"(&latom->_value)
	: "cc", "memory"
	);

	return rax;
}

static inline long latom_cmp_xchg(latom_t *latom, long compare, long val)
{
	long rax = compare;

	__asm__ volatile(
"	lock			\n"
"	cmpxchgq %1, (%2)	\n"	/* if ((rax = latom->_value) == compare) latom->_value = val */
	: "+a"(rax)
	: "r"(val), "r"(&latom->_value)
	: "cc", "memory"
	);

	return rax;
}

/**
 * @brief Perform an atomic load/add/store.
 *
 * @param latom Atom to add to
 * @param val Value to add
 * @return The value of `latom` *before* the operation
 */
static inline long latom_add(latom_t *latom, long val)
{
	__asm__ volatile(
"	lock			\n"
"	xaddq	%0, (%1)	\n"
	: "+r"(val)
	: "r"(&latom->_value)
	: "cc", "memory"
	);

	return val;
}

static inline long latom_sub(latom_t *latom, long val)
{
	return latom_add(latom, -val);
}

static inline bool latom_inc(latom_t *latom)
{
	bool nonzero = false;

	__asm__ volatile(
"	lock			\n"
"	incq	(%1)		\n"
"	setne	%0		\n"
	: "+r"(nonzero)	/* read+write to ensure the initial value isn't optimized out */
	: "r"(&latom->_value)
	: "cc", "memory"
	);

	return nonzero;
}

static inline bool latom_dec(latom_t *latom)
{
	bool nonzero = false;

	__asm__ volatile(
"	lock			\n"
"	decq	(%1)		\n"
"	setne	%0		\n"
	: "+r"(nonzero)	/* read+write to ensure the initializer isn't optimized out */
	: "r"(&latom->_value)
	: "cc", "memory"
	);

	return nonzero;
}
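The boolean return of latom_inc()/latom_dec() maps nicely onto reference counting; a brief sketch (not part of the diff, names are made up):

/* Hypothetical example: drop a reference, destroy the object on the last put. */
static void object_put(latom_t *refcount, void (*destroy)(void *), void *obj)
{
	/* latom_dec() returns false once the count has dropped to zero */
	if (!latom_dec(refcount))
		destroy(obj);
}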

static inline long latom_and(latom_t *latom, long val)
{
	long rax;

	__asm__ volatile(
"	movq	(%2), %0	\n"	/* rax = latom->_value */
"1:	andq	%0, %1		\n"	/* val &= rax */
"	lock			\n"
"	cmpxchgq %1, (%2)	\n"	/* if (latom->_value == rax) latom->_value = val */
"	pause			\n"	/* intel says you're supposed to do this in spin loops */
"	jne	1b		\n"	/* else goto 1 (rax updated to new latom->_value) */
	: "=a"(rax), "+r"(val)
	: "r"(&latom->_value)
	: "cc", "memory"
	);

	return rax;
}

static inline long latom_or(latom_t *latom, long val)
{
	long rax;

	__asm__ volatile(
"	movq	(%2), %0	\n"	/* rax = latom->_value */
"1:	orq	%0, %1		\n"	/* val |= rax */
"	lock			\n"
"	cmpxchgq %1, (%2)	\n"	/* if (latom->_value == rax) latom->_value = val */
"	pause			\n"	/* intel says you're supposed to do this in spin loops */
"	jne	1b		\n"	/* else goto 1 (rax updated to new latom->_value) */
	: "=a"(rax), "+r"(val)
	: "r"(&latom->_value)
	: "cc", "memory"
	);

	return rax;
}

static inline long latom_xor(latom_t *latom, long val)
{
	long rax;

	__asm__ volatile(
"	movq	(%2), %0	\n"	/* rax = latom->_value */
"1:	xorq	%0, %1		\n"	/* val ^= rax */
"	lock			\n"
"	cmpxchgq %1, (%2)	\n"	/* if (latom->_value == rax) latom->_value = val */
"	pause			\n"	/* intel says you're supposed to do this in spin loops */
"	jne	1b		\n"	/* else goto 1 (rax updated to new latom->_value) */
	: "=a"(rax), "+r"(val)
	: "r"(&latom->_value)
	: "cc", "memory"
	);

	return rax;
}
static inline bool latom_set_bit(latom_t *latom, int pos)
{
	long mask = 1L << pos;	/* long, so bit positions above 31 work on amd64 */
	long oldval = latom_or(latom, mask);
	return (oldval & mask) == 0;
}

static inline bool latom_clr_bit(latom_t *latom, int pos)
{
	long mask = 1L << pos;
	long oldval = latom_and(latom, ~mask);
	return (oldval & mask) != 0;
}

/*
 * This file is part of GayBSD.
 * Copyright (c) 2021 fef <owo@fef.moe>.
 *
 * GayBSD is nonviolent software: you may only use, redistribute, and/or
 * modify it under the terms of the Cooperative Nonviolent Public License
 * (CNPL) as found in the LICENSE file in the source code root directory
 * or at <https://git.pixie.town/thufie/npl-builder>; either version 7
 * of the license, or (at your option) any later version.
 *
 * GayBSD comes with ABSOLUTELY NO WARRANTY, to the extent
 * permitted by applicable law. See the CNPL for details.
 */

@@ -1,9 +1,24 @@
/* See the end of this file for copyright and license terms. */

#pragma once
#define _ARCH_ATOM_H_

#include <gay/cdefs.h>
#include <gay/types.h>

static __always_inline void __spin_loop(void)
{
	/*
	 * Intel says you're supposed to put this in tight spin loops as a
	 * workaround for their buggy memory order violation prediction or
	 * something.  They also claim that it significantly reduces power
	 * consumption, whatever, i don't care.
	 */
	__asm__ volatile("pause" ::: "memory");
}
/** @brief Use this macro to build spin-wait loops. */
#define spin_loop for (;; __spin_loop())
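A quick usage sketch of the spin_loop macro defined above (not from the diff): the loop body runs first, and __spin_loop() issues a pause before every retry.

/* Hypothetical example: spin until a flag atom becomes nonzero. */
static inline void wait_for_flag(const atom_t *flag)
{
	spin_loop {
		if (atom_read(flag) != 0)
			break;
	}
}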

/**
 * @brief Read an atom's current value.
 * You usually shouldn't need this function because all the other atomic

@@ -15,7 +30,7 @@
 * @param atom Atom to read the value of
 * @return The atom's "current" value (at the time of reading it)
 */
inline int atom_read(const atom_t *atom)
static inline int atom_read(const atom_t *atom)
{
	return atom->_value;
}

@@ -27,12 +42,12 @@ inline int atom_read(const atom_t *atom)
 * @param val New value
 * @return The value of `atom` *before* the operation
 */
inline int atom_write(atom_t *atom, int val)
static inline int atom_write(atom_t *atom, int val)
{
	int eax;

	__asm__ volatile(
"	mov	(%2), %0	\n"	/* eax = atom->_value */
"	movl	(%2), %0	\n"	/* eax = atom->_value */
"1:	lock			\n"
"	cmpxchgl %1, (%2)	\n"	/* if (atom->_value == eax) atom->_value = val */
"	pause			\n"	/* intel says you're supposed to do this in spin loops */

@@ -45,6 +60,38 @@ inline int atom_write(atom_t *atom, int val)
	return eax;
}

/**
 * @brief Perform an atomic compare/exchange.
 * The specified value will only be written out if the current value in the atom
 * matches `compare`.
 *
 * You can remember the order of the arguments by the name `atom_cmp_xchg`:
 * first the `atom` pointer,
 * then the value to `cmp` it with (i.e. `compare`),
 * then the value to `xchg` it with (i.e. `val`).
 *
 * @param atom Atom to write to
 * @param compare Expected current value
 * @param val New value
 * @return The value that was stored in the atom *before* the write.
 *	If this is equal to `compare`, the store was successful, otherwise the
 *	atom is unmodified.
 */
static inline int atom_cmp_xchg(atom_t *atom, int compare, int val)
{
	int eax = compare;

	__asm__ volatile(
"	lock			\n"
"	cmpxchgl %1, (%2)	\n"	/* if (atom->_value == eax) atom->_value = val */
	: "+a"(eax)
	: "r"(val), "r"(&atom->_value)
	: "cc", "memory"
	);

	return eax;
}
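As a usage sketch (not part of the diff), the argument order reads naturally in a try-lock: compare against the unlocked value, exchange in the locked one. This is essentially how spin_trylock() is reworked later in this commit.

/* Hypothetical helper: acquire only if the lock currently reads 0. */
static inline bool try_acquire(atom_t *lock)
{
	/* the old value is returned, so 0 means our 1 was stored */
	return atom_cmp_xchg(lock, 0, 1) == 0;
}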

/**
 * @brief Perform an atomic load/add/store.
 *

@@ -52,7 +99,7 @@ inline int atom_write(atom_t *atom, int val)
 * @param val Value to add
 * @return The value of `atom` *before* the operation
 */
inline int atom_add(atom_t *atom, int val)
static inline int atom_add(atom_t *atom, int val)
{
	__asm__ volatile(
"	lock			\n"

@@ -72,7 +119,7 @@ inline int atom_add(atom_t *atom, int val)
 * @param val Value to subtract
 * @return The value of `atom` *before* the operation
 */
inline int atom_sub(atom_t *atom, int val)
static inline int atom_sub(atom_t *atom, int val)
{
	return atom_add(atom, -val);
}

@@ -83,7 +130,7 @@ inline int atom_sub(atom_t *atom, int val)
 * @param atom Atom to increment
 * @return `true` if the value *after* the operation is nonzero
 */
inline bool atom_inc(atom_t *atom)
static inline bool atom_inc(atom_t *atom)
{
	bool nonzero = false;

@@ -105,7 +152,7 @@ inline bool atom_inc(atom_t *atom)
 * @param atom Atom to decrement
 * @return `true` if the value *after* the operation is nonzero
 */
inline bool atom_dec(atom_t *atom)
static inline bool atom_dec(atom_t *atom)
{
	bool nonzero = false;

@@ -128,7 +175,7 @@ inline bool atom_dec(atom_t *atom)
 * @param val Value to AND with
 * @return The value of `atom` *before* the operation
 */
inline int atom_and(atom_t *atom, int val)
static inline int atom_and(atom_t *atom, int val)
{
	int eax;

@@ -154,7 +201,7 @@ inline int atom_and(atom_t *atom, int val)
 * @param val Value to OR with
 * @return The value of `atom` *before* the operation
 */
inline int atom_or(atom_t *atom, int val)
static inline int atom_or(atom_t *atom, int val)
{
	int eax;

@@ -162,7 +209,7 @@ inline int atom_or(atom_t *atom, int val)
"	movl	(%2), %0	\n"	/* eax = atom->_value */
"1:	orl	%0, %1		\n"	/* val |= eax */
"	lock			\n"
"	cmpxchgl %1, (%2)	\n"	/* if (atom->_value == eax) atom->_value = eax */
"	cmpxchgl %1, (%2)	\n"	/* if (atom->_value == eax) atom->_value = val */
"	pause			\n"	/* intel says you're supposed to do this in spin loops */
"	jne	1b		\n"	/* else goto 1 (eax updated to new atom->_value) */
	: "=a"(eax), "+r"(val)

@@ -180,7 +227,7 @@ inline int atom_or(atom_t *atom, int val)
 * @param val Value to XOR with
 * @return The value of `atom` *before* the operation
 */
inline int atom_xor(atom_t *atom, int val)
static inline int atom_xor(atom_t *atom, int val)
{
	int eax;

@@ -188,7 +235,7 @@ inline int atom_xor(atom_t *atom, int val)
"	movl	(%2), %0	\n"	/* eax = atom->_value */
"1:	xorl	%0, %1		\n"	/* val ^= eax */
"	lock			\n"
"	cmpxchgl %1, (%2)	\n"	/* if (atom->_value == eax) atom->_value = eax */
"	cmpxchgl %1, (%2)	\n"	/* if (atom->_value == eax) atom->_value = val */
"	pause			\n"	/* intel says you're supposed to do this in spin loops */
"	jne	1b		\n"	/* else goto 1 (eax updated to new atom->_value) */
	: "=a"(eax), "+r"(val)

@@ -206,7 +253,7 @@ inline int atom_xor(atom_t *atom, int val)
 * @param pos Bit position (starting from 0 for the LSB)
 * @return `true` if the bit was clear *before* the operation
 */
inline bool atom_set_bit(atom_t *atom, int pos)
static inline bool atom_set_bit(atom_t *atom, int pos)
{
	int mask = 1 << pos;
	int oldval = atom_or(atom, mask);

@@ -220,13 +267,53 @@ inline bool atom_set_bit(atom_t *atom, int pos)
 * @param pos Bit position (starting from 0 for the LSB)
 * @return `true` if the bit was set *before* the operation
 */
inline bool atom_clr_bit(atom_t *atom, int pos)
static inline bool atom_clr_bit(atom_t *atom, int pos)
{
	int mask = 1 << pos;
	int oldval = atom_and(atom, ~mask);
	return (oldval & mask) != 0;
}

#ifdef __x86_64__
#include <amd64/latom.h>
#else
#include <i386/latom.h>
#endif

/*
 * we use ILP32 on i386 and LP64 on amd64, therefore a long is exactly the size
 * of a pointer on both platforms
 */

#if __SIZEOF_LONG__ != __SIZEOF_POINTER__
#error "sizeof(long) is expected to equal sizeof(void *) on x86"
#endif

static inline void *patom_read(const patom_t *patom)
{
	return patom->_ptr;
}

static __always_inline void *patom_write(patom_t *patom, void *val)
{
	return (void *)latom_write((latom_t *)patom, (long)val);
}

static __always_inline void *patom_cmp_xchg(patom_t *patom, void *compare, void *val)
{
	return (void *)latom_cmp_xchg((latom_t *)patom, (long)compare, (long)val);
}

static __always_inline void *patom_add(patom_t *patom, void *val)
{
	return (void *)latom_add((latom_t *)patom, (long)val);
}

static __always_inline void *patom_sub(patom_t *patom, void *val)
{
	return (void *)latom_sub((latom_t *)patom, (long)val);
}
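As an aside (not in the diff), patom_cmp_xchg() is enough to build simple lock-free structures; here is a sketch of a singly linked push onto a shared head pointer, with a made-up node type:

struct node {
	struct node *next;	/* hypothetical node type, for illustration only */
};

static void stack_push(patom_t *head, struct node *node)
{
	void *old;

	do {
		old = patom_read(head);
		node->next = old;
		/* the store only succeeds if head still points to old */
	} while (patom_cmp_xchg(head, old, node) != old);
}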

/*
 * This file is part of GayBSD.
 * Copyright (c) 2021 fef <owo@fef.moe>.

arch/x86/include/i386/latom.h (new file, 91 lines)

@@ -0,0 +1,91 @@
/* See the end of this file for copyright and license terms. */

#pragma once
#ifndef _ARCH_ATOM_H_
#error "This file is not meant to be included directly, use <arch/atom.h>"
#endif

#include <gay/cdefs.h>
#include <gay/types.h>

/*
 * we use ILP32 on i386, long is the same as int
 */

#ifndef __ILP32__
#error "__ILP32__ must be defined on i386"
#endif

static inline long latom_read(const latom_t *latom)
{
	return latom->_value;
}

static __always_inline long latom_write(latom_t *latom, long val)
{
	return atom_write((atom_t *)latom, val);
}

static __always_inline long latom_cmp_xchg(latom_t *latom, long compare, long val)
{
	return atom_cmp_xchg((atom_t *)latom, compare, val);
}

static __always_inline long latom_add(latom_t *latom, long val)
{
	return atom_add((atom_t *)latom, val);
}

static __always_inline long latom_sub(latom_t *latom, long val)
{
	return atom_sub((atom_t *)latom, val);
}

static __always_inline bool latom_inc(latom_t *latom)
{
	return atom_inc((atom_t *)latom);
}

static __always_inline bool latom_dec(latom_t *latom)
{
	return atom_dec((atom_t *)latom);
}

static __always_inline long latom_and(latom_t *latom, long val)
{
	return atom_and((atom_t *)latom, val);
}

static __always_inline long latom_or(latom_t *latom, long val)
{
	return atom_or((atom_t *)latom, val);
}

static __always_inline long latom_xor(latom_t *latom, long val)
{
	return atom_xor((atom_t *)latom, val);
}

static __always_inline bool latom_set_bit(latom_t *latom, int pos)
{
	return atom_set_bit((atom_t *)latom, pos);
}

static __always_inline bool latom_clr_bit(latom_t *latom, int pos)
{
	return atom_clr_bit((atom_t *)latom, pos);
}

/*
 * This file is part of GayBSD.
 * Copyright (c) 2021 fef <owo@fef.moe>.
 *
 * GayBSD is nonviolent software: you may only use, redistribute, and/or
 * modify it under the terms of the Cooperative Nonviolent Public License
 * (CNPL) as found in the LICENSE file in the source code root directory
 * or at <https://git.pixie.town/thufie/npl-builder>; either version 7
 * of the license, or (at your option) any later version.
 *
 * GayBSD comes with ABSOLUTELY NO WARRANTY, to the extent
 * permitted by applicable law. See the CNPL for details.
 */

@@ -1,7 +1,6 @@
# See the end of this file for copyright and license terms.

target_sources(gay_arch PRIVATE
	atom.S
	idt.S
	irq.S
	port.S
|
|||
/* See the end of this file for copyright and license terms. */
|
||||
|
||||
#include <asm/common.h>
|
||||
|
||||
/* int atom_read(const atom_t *atom) */
|
||||
ASM_ENTRY(atom_read)
|
||||
mov 4(%esp), %ecx
|
||||
mov (%ecx), %eax
|
||||
|
||||
ret
|
||||
ASM_END(atom_read)
|
||||
|
||||
/* int atom_write(atom_t *atom, int val) */
|
||||
ASM_ENTRY(atom_write)
|
||||
mov 4(%esp), %edx
|
||||
mov 8(%esp), %ecx
|
||||
mov (%edx), %eax
|
||||
|
||||
1: lock
|
||||
cmpxchg %ecx, (%edx)
|
||||
pause
|
||||
jne 1b
|
||||
|
||||
ret
|
||||
ASM_END(atom_write)
|
||||
|
||||
/* bool atom_inc(atom_t *atom) */
|
||||
ASM_ENTRY(atom_inc)
|
||||
mov 4(%esp), %edx
|
||||
|
||||
lock
|
||||
incl (%edx)
|
||||
xor %eax, %eax
|
||||
setne %al
|
||||
|
||||
ret
|
||||
ASM_END(atom_inc)
|
||||
|
||||
/* bool atom_dec(atom_t *atom) */
|
||||
ASM_ENTRY(atom_dec)
|
||||
mov 4(%esp), %edx
|
||||
|
||||
lock
|
||||
decl (%edx)
|
||||
xor %eax, %eax
|
||||
setne %al
|
||||
|
||||
ret
|
||||
ASM_END(atom_dec)
|
||||
|
||||
/* int atom_add(atom_t *atom, int val) */
|
||||
ASM_ENTRY(atom_add)
|
||||
mov 4(%esp), %edx
|
||||
mov 8(%esp), %eax
|
||||
|
||||
lock
|
||||
xadd %eax, (%edx)
|
||||
|
||||
ret
|
||||
ASM_END(atom_add)
|
||||
|
||||
/* int atom_sub(atom_t *atom, int val) */
|
||||
ASM_ENTRY(atom_sub)
|
||||
mov 4(%esp), %edx
|
||||
mov 8(%esp), %eax
|
||||
|
||||
/* there is no xsubl, so we add the two's complement */
|
||||
neg %eax
|
||||
lock
|
||||
xadd %eax, (%edx)
|
||||
|
||||
ret
|
||||
ASM_END(atom_sub)
|
||||
|
||||
/* int atom_and(atom_t *atom, int val) */
|
||||
ASM_ENTRY(atom_and)
|
||||
mov 4(%esp), %edx
|
||||
mov (%edx), %eax
|
||||
|
||||
1: mov %eax, %ecx
|
||||
and 8(%esp), %ecx
|
||||
lock
|
||||
cmpxchg %ecx, (%edx)
|
||||
pause
|
||||
jne 1b
|
||||
|
||||
ret
|
||||
ASM_END(atom_and)
|
||||
|
||||
/* int atom_or(atom_t *atom, int val) */
|
||||
ASM_ENTRY(atom_or)
|
||||
mov 4(%esp), %edx
|
||||
mov (%edx), %eax
|
||||
|
||||
1: mov %eax, %ecx
|
||||
and 8(%esp), %ecx
|
||||
lock
|
||||
cmpxchg %ecx, (%edx)
|
||||
pause
|
||||
jne 1b
|
||||
|
||||
ret
|
||||
ASM_END(atom_or)
|
||||
|
||||
/* int atom_xor(atom_t *atom, int val) */
|
||||
ASM_ENTRY(atom_xor)
|
||||
mov 4(%esp), %edx
|
||||
mov (%edx), %eax
|
||||
|
||||
1: mov %eax, %ecx
|
||||
xor 8(%esp), %ecx
|
||||
lock
|
||||
cmpxchg %ecx, (%edx)
|
||||
pause
|
||||
jne 1b
|
||||
|
||||
ret
|
||||
ASM_END(atom_xor)
|
||||
|
||||
/* bool atom_set_bit(atom_t *atom, int bit) */
|
||||
ASM_ENTRY(atom_set_bit)
|
||||
mov 4(%esp), %edx
|
||||
mov 8(%esp), %ecx
|
||||
|
||||
push %ebx
|
||||
mov $1, %ebx
|
||||
shl %cl, %ebx
|
||||
|
||||
mov (%edx), %eax
|
||||
|
||||
1: mov %eax, %ecx
|
||||
or %ebx, %ecx
|
||||
lock
|
||||
cmpxchg %ecx, (%edx)
|
||||
pause
|
||||
jne 1b
|
||||
|
||||
/* return true if bit was clear before */
|
||||
not %eax
|
||||
and %ebx, %eax
|
||||
shr %cl, %eax
|
||||
|
||||
pop %ebx
|
||||
ret
|
||||
ASM_END(atom_set_bit)
|
||||
|
||||
/* bool atom_clr_bit(atom_t *atom, int bit) */
|
||||
ASM_ENTRY(atom_clr_bit)
|
||||
mov 4(%esp), %edx
|
||||
mov 8(%esp), %ecx
|
||||
|
||||
push %ebx
|
||||
mov $0xfffffffe, %ebx
|
||||
rol %cl, %ebx
|
||||
mov (%edx), %eax
|
||||
|
||||
1: mov %eax, %ecx
|
||||
and %ebx, %ecx
|
||||
lock
|
||||
cmpxchg %ecx, (%edx)
|
||||
pause
|
||||
jne 1b
|
||||
|
||||
/* return true if bit was set before */
|
||||
not %ebx
|
||||
and %ebx, %eax
|
||||
shr %cl, %eax
|
||||
|
||||
pop %ebx
|
||||
ret
|
||||
ASM_END(atom_clr_bit)
|
||||
|
||||
/*
|
||||
* This file is part of GayBSD.
|
||||
* Copyright (c) 2021 fef <owo@fef.moe>.
|
||||
*
|
||||
* GayBSD is nonviolent software: you may only use, redistribute, and/or
|
||||
* modify it under the terms of the Cooperative Nonviolent Public License
|
||||
* (CNPL) as found in the LICENSE file in the source code root directory
|
||||
* or at <https://git.pixie.town/thufie/npl-builder>; either version 7
|
||||
* of the license, or (at your option) any later version.
|
||||
*
|
||||
* GayBSD comes with ABSOLUTELY NO WARRANTY, to the extent
|
||||
* permitted by applicable law. See the CNPL for details.
|
||||
*/
|
|

@@ -135,6 +135,11 @@ struct clist *clist_del_last(struct clist *head);
#define clist_first_entry(head, type, member) \
	clist_entry((head)->next, type, member)

#define clist_del_first_entry(head, type, member) ({ \
	struct clist *__first = clist_del_first(head); \
	clist_entry(__first, type, member); \
})

/**
 * @brief Get the last entry in a list.
 *

@@ -146,6 +151,11 @@ struct clist *clist_del_last(struct clist *head);
#define clist_last_entry(head, type, member) \
	clist_entry((head)->prev, type, member)

#define clist_del_last_entry(head, type, member) ({ \
	struct clist *__last = clist_del_last(head); \
	clist_entry(__last, type, member); \
})
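A usage sketch of the new pop-style helpers (not from the diff; struct lock_waiter and its clink member are borrowed from the mutex code later in this commit):

/* Hypothetical example: unlink and return the oldest waiter in one step. */
static struct lock_waiter *pop_waiter(struct clist *wait_queue)
{
	return clist_del_first_entry(wait_queue, struct lock_waiter, clink);
}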

/**
 * @brief Get the next entry in a clist.
 *

@@ -8,7 +8,7 @@
struct task;

typedef struct {
	atom_t lock;
	atom_t lock;	/* 0 = free, 1 = locked */
} spin_t;

#define SPIN_DEFINE { \
|
|||
};
|
||||
|
||||
struct mtx {
|
||||
atom_t lock;
|
||||
atom_t lock; /* 1 = free, 0 = locked, < 0 = locked and other threads are waiting */
|
||||
spin_t wait_queue_lock;
|
||||
struct clist wait_queue; /* -> struct lock_waiter::clink */
|
||||
};
|
||||
|
|
|

@@ -93,7 +93,7 @@ typedef __u_register_t u_register_t;

/**
 * @brief The primitive atomic integral type.
 * For use with the APIs defined in `arch/atomic.h`.
 * For use with the APIs defined in `arch/atom.h`.
 */
typedef struct {
	volatile int _value;

@@ -102,6 +102,26 @@ typedef struct {
/** @brief Atom definition body for static initialization of higher level components. */
#define ATOM_DEFINE(val) { ._value = (val) }

/**
 * @brief A long, but atomic.
 * For use with the APIs defined in `arch/atom.h`.
 */
typedef struct {
	volatile long _value;
} latom_t;

#define LATOM_DEFINE(val) { ._value = (val) }

/**
 * @brief A pointer, but atomic.
 * For use with the APIs defined in `arch/atom.h`.
 */
typedef struct {
	void *volatile _ptr;
} patom_t;

#define PATOM_DEFINE(ptr) { ._ptr = (ptr) }
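A short sketch (not in the diff) of how the new definition bodies are meant to be used for static initialization, mirroring ATOM_DEFINE():

/* Hypothetical examples only. */
static latom_t page_count = LATOM_DEFINE(0);
static patom_t free_list = PATOM_DEFINE(NULL);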

#ifndef _PID_T_DECLARED
#define _PID_T_DECLARED 1
typedef int pid_t;

@@ -3,6 +3,8 @@
#include <arch/atom.h>

#include <gay/clist.h>
#include <gay/irq.h>
#include <gay/kprintf.h>
#include <gay/mutex.h>
#include <gay/sched.h>

@@ -15,13 +17,15 @@ void spin_init(spin_t *spin)

void spin_lock(spin_t *spin)
{
	while (atom_write(&spin->lock, 1) != 0);
		/* nothing */
	spin_loop {
		if (atom_cmp_xchg(&spin->lock, 0, 1) == 0)
			break;
	}
}

int spin_trylock(spin_t *spin)
{
	if (atom_write(&spin->lock, 1) != 0)
	if (atom_cmp_xchg(&spin->lock, 0, 1) != 0)
		return -EAGAIN;
	return 0;
}

@@ -33,22 +37,29 @@ void spin_unlock(spin_t *spin)

void mtx_init(struct mtx *mtx)
{
	atom_write(&mtx->lock, 0);
	atom_write(&mtx->lock, 1);
	spin_init(&mtx->wait_queue_lock);
	clist_init(&mtx->wait_queue);
}

void mtx_lock(struct mtx *mtx)
{
#	ifdef DEBUG
	if (in_irq()) {
		kprintf("mtx_lock() called from irq context!\n");
	spin_loop {
		if (atom_cmp_xchg(&mtx->lock, 1, 0) == 1)
			return;
	}
	}
#	endif

	/*
	 * acquire a lock on the wait queue before trying to claim the
	 * mutex itself to make sure the mutex isn't released while we
	 * are inserting ourselves into the wait queue
	 * When the mutex is locked, its lock value goes to 0.
	 * atom_dec() returns true if the value *after* the decrement is
	 * nonzero, meaning the lock value has become negative.
	 */
	spin_lock(&mtx->wait_queue_lock);
	if (atom_write(&mtx->lock, 1) == 0) {
		spin_unlock(&mtx->wait_queue_lock);
	} else {
	if (atom_dec(&mtx->lock)) {
		struct task *task = current;
		/*
		 * It might not be the smartest idea to allocate this thing on
|
|||
struct lock_waiter waiter = {
|
||||
.task = task,
|
||||
};
|
||||
|
||||
spin_lock(&mtx->wait_queue_lock);
|
||||
clist_add(&mtx->wait_queue, &waiter.clink);
|
||||
spin_unlock(&mtx->wait_queue_lock);
|
||||
|
||||
task->state = TASK_BLOCKED;
|
||||
/*
|
||||
* This is only gonna return when the task currently owning the
|
||||
* lock releases it. In that case, it doesn't unlock the mutex
|
||||
* but merely switches back to us directly and thereby implicitly
|
||||
* transfers the ownership of the mutex to us (see mtx_unlock()).
|
||||
*/
|
||||
schedule();
|
||||
}
|
||||
}
|
||||
|
||||
int mtx_trylock(struct mtx *mtx)
|
||||
{
|
||||
if (atom_write(&mtx->lock, 1) != 0)
|
||||
if (atom_cmp_xchg(&mtx->lock, 1, 0) != 1)
|
||||
return -EAGAIN;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void mtx_unlock(struct mtx *mtx)
|
||||
{
|
||||
spin_lock(&mtx->wait_queue_lock);
|
||||
if (!clist_is_empty(&mtx->wait_queue)) {
|
||||
if (atom_add(&mtx->lock, 1) < 0) {
|
||||
spin_lock(&mtx->wait_queue_lock);
|
||||
struct lock_waiter *waiter =
|
||||
clist_first_entry(&mtx->wait_queue, typeof(*waiter), clink);
|
||||
clist_del(&waiter->clink);
|
||||
clist_del_first_entry(&mtx->wait_queue, typeof(*waiter), clink);
|
||||
spin_unlock(&mtx->wait_queue_lock);
|
||||
waiter->task->state = TASK_READY;
|
||||
switch_to(waiter->task, current);
|
||||
|
|