295 lines
8.6 KiB
Diff
295 lines
8.6 KiB
Diff
--- sys/amd64/amd64/exception.S.orig
|
|
+++ sys/amd64/amd64/exception.S
|
|
@@ -108,8 +108,6 @@
|
|
movq $0,TF_ADDR(%rsp) ; \
|
|
movq $0,TF_ERR(%rsp) ; \
|
|
jmp alltraps_noen
|
|
-IDTVEC(dbg)
|
|
- TRAP_NOEN(T_TRCTRAP)
|
|
IDTVEC(bpt)
|
|
TRAP_NOEN(T_BPTFLT)
|
|
#ifdef KDTRACE_HOOKS
|
|
@@ -436,6 +434,101 @@
|
|
sysret
|
|
|
|
/*
|
|
+ * DB# handler is very similar to NMI, because 'mov/pop %ss' delays
|
|
+ * generation of exception until the next instruction is executed,
|
|
+ * which might be a kernel entry. So we must execute the handler
|
|
+ * on IST stack and be ready for non-kernel GSBASE.
|
|
+ */
|
|
+IDTVEC(dbg)
|
|
+ subq $TF_RIP,%rsp
|
|
+ movl $(T_TRCTRAP),TF_TRAPNO(%rsp)
|
|
+ movq $0,TF_ADDR(%rsp)
|
|
+ movq $0,TF_ERR(%rsp)
|
|
+ movq %rdi,TF_RDI(%rsp)
|
|
+ movq %rsi,TF_RSI(%rsp)
|
|
+ movq %rdx,TF_RDX(%rsp)
|
|
+ movq %rcx,TF_RCX(%rsp)
|
|
+ movq %r8,TF_R8(%rsp)
|
|
+ movq %r9,TF_R9(%rsp)
|
|
+ movq %rax,TF_RAX(%rsp)
|
|
+ movq %rbx,TF_RBX(%rsp)
|
|
+ movq %rbp,TF_RBP(%rsp)
|
|
+ movq %r10,TF_R10(%rsp)
|
|
+ movq %r11,TF_R11(%rsp)
|
|
+ movq %r12,TF_R12(%rsp)
|
|
+ movq %r13,TF_R13(%rsp)
|
|
+ movq %r14,TF_R14(%rsp)
|
|
+ movq %r15,TF_R15(%rsp)
|
|
+ movw %fs,TF_FS(%rsp)
|
|
+ movw %gs,TF_GS(%rsp)
|
|
+ movw %es,TF_ES(%rsp)
|
|
+ movw %ds,TF_DS(%rsp)
|
|
+ movl $TF_HASSEGS,TF_FLAGS(%rsp)
|
|
+ cld
|
|
+ testb $SEL_RPL_MASK,TF_CS(%rsp)
|
|
+ jnz dbg_fromuserspace
|
|
+ /*
|
|
+ * We've interrupted the kernel. Preserve GS.base in %r12.
|
|
+ */
|
|
+ movl $MSR_GSBASE,%ecx
|
|
+ rdmsr
|
|
+ movq %rax,%r12
|
|
+ shlq $32,%rdx
|
|
+ orq %rdx,%r12
|
|
+ /* Retrieve and load the canonical value for GS.base. */
|
|
+ movq TF_SIZE(%rsp),%rdx
|
|
+ movl %edx,%eax
|
|
+ shrq $32,%rdx
|
|
+ wrmsr
|
|
+ FAKE_MCOUNT(TF_RIP(%rsp))
|
|
+ movq %rsp,%rdi
|
|
+ call trap
|
|
+ MEXITCOUNT
|
|
+ /*
|
|
+ * Put back the preserved MSR_GSBASE value.
|
|
+ */
|
|
+ movl $MSR_GSBASE,%ecx
|
|
+ movq %r12,%rdx
|
|
+ movl %edx,%eax
|
|
+ shrq $32,%rdx
|
|
+ wrmsr
|
|
+ movq TF_RDI(%rsp),%rdi
|
|
+ movq TF_RSI(%rsp),%rsi
|
|
+ movq TF_RDX(%rsp),%rdx
|
|
+ movq TF_RCX(%rsp),%rcx
|
|
+ movq TF_R8(%rsp),%r8
|
|
+ movq TF_R9(%rsp),%r9
|
|
+ movq TF_RAX(%rsp),%rax
|
|
+ movq TF_RBX(%rsp),%rbx
|
|
+ movq TF_RBP(%rsp),%rbp
|
|
+ movq TF_R10(%rsp),%r10
|
|
+ movq TF_R11(%rsp),%r11
|
|
+ movq TF_R12(%rsp),%r12
|
|
+ movq TF_R13(%rsp),%r13
|
|
+ movq TF_R14(%rsp),%r14
|
|
+ movq TF_R15(%rsp),%r15
|
|
+ addq $TF_RIP,%rsp
|
|
+ jmp doreti_iret
|
|
+dbg_fromuserspace:
|
|
+ /*
|
|
+ * Switch to kernel GSBASE and copy the frame
|
|
+ * from the IST stack to the normal kernel stack, since trap()
|
|
+ * re-enables interrupts, and since we might trap on DB# while
|
|
+ * in trap().
|
|
+ */
|
|
+ swapgs
|
|
+ movq PCPU(RSP0),%rax
|
|
+ movl $TF_SIZE,%ecx
|
|
+ subq %rcx,%rax
|
|
+ movq %rax,%rdi
|
|
+ movq %rsp,%rsi
|
|
+ rep;movsb
|
|
+ movq %rax,%rsp
|
|
+ movq PCPU(CURPCB),%rdi
|
|
+ orl $PCB_FULL_IRET,PCB_FLAGS(%rdi)
|
|
+ jmp calltrap
|
|
+
|
|
+/*
|
|
* NMI handling is special.
|
|
*
|
|
* First, NMIs do not respect the state of the processor's RFLAGS.IF
|
|
--- sys/amd64/amd64/machdep.c.orig
|
|
+++ sys/amd64/amd64/machdep.c
|
|
@@ -1023,6 +1023,7 @@
|
|
static char dblfault_stack[PAGE_SIZE] __aligned(16);
|
|
|
|
static char nmi0_stack[PAGE_SIZE] __aligned(16);
|
|
+static char dbg0_stack[PAGE_SIZE] __aligned(16);
|
|
CTASSERT(sizeof(struct nmi_pcpu) == 16);
|
|
|
|
struct amd64tss common_tss[MAXCPU];
|
|
@@ -1908,7 +1909,7 @@
|
|
for (x = 0; x < NIDT; x++)
|
|
setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
|
|
setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
|
|
- setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
|
|
+ setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
|
|
setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
|
|
setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
|
|
setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
|
|
@@ -1966,6 +1967,13 @@
|
|
np->np_pcpu = (register_t) pc;
|
|
common_tss[0].tss_ist2 = (long) np;
|
|
|
|
+ /*
|
|
+ * DB# stack, runs on ist4.
|
|
+ */
|
|
+ np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
|
|
+ np->np_pcpu = (register_t) pc;
|
|
+ common_tss[0].tss_ist4 = (long) np;
|
|
+
|
|
/* Set the IO permission bitmap (empty due to tss seg limit) */
|
|
common_tss[0].tss_iobase = sizeof(struct amd64tss) +
|
|
IOPAGES * PAGE_SIZE;
|
|
--- sys/amd64/amd64/mp_machdep.c.orig
|
|
+++ sys/amd64/amd64/mp_machdep.c
|
|
@@ -98,6 +98,7 @@
|
|
/* Temporary variables for init_secondary() */
|
|
char *doublefault_stack;
|
|
char *nmi_stack;
|
|
+char *dbg_stack;
|
|
void *dpcpu;
|
|
|
|
struct pcb stoppcbs[MAXCPU];
|
|
@@ -647,6 +648,10 @@
|
|
np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
|
|
common_tss[cpu].tss_ist2 = (long) np;
|
|
|
|
+ /* The DB# stack runs on IST4. */
|
|
+ np = ((struct nmi_pcpu *) &dbg_stack[PAGE_SIZE]) - 1;
|
|
+ common_tss[cpu].tss_ist4 = (long) np;
|
|
+
|
|
/* Prepare private GDT */
|
|
gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
|
|
for (x = 0; x < NGDT; x++) {
|
|
@@ -682,6 +687,10 @@
|
|
/* Save the per-cpu pointer for use by the NMI handler. */
|
|
np->np_pcpu = (register_t) pc;
|
|
|
|
+ /* Save the per-cpu pointer for use by the DB# handler. */
|
|
+ np = ((struct nmi_pcpu *) &dbg_stack[PAGE_SIZE]) - 1;
|
|
+ np->np_pcpu = (register_t) pc;
|
|
+
|
|
wrmsr(MSR_FSBASE, 0); /* User value */
|
|
wrmsr(MSR_GSBASE, (u_int64_t)pc);
|
|
wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */
|
|
@@ -970,6 +979,8 @@
|
|
PAGE_SIZE, M_WAITOK | M_ZERO);
|
|
nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
|
|
M_WAITOK | M_ZERO);
|
|
+ dbg_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
|
|
+ M_WAITOK | M_ZERO);
|
|
dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE,
|
|
M_WAITOK | M_ZERO);
|
|
|
|
--- sys/amd64/amd64/trap.c.orig
|
|
+++ sys/amd64/amd64/trap.c
|
|
@@ -45,6 +45,7 @@
|
|
*/
|
|
|
|
#include "opt_clock.h"
|
|
+#include "opt_compat.h"
|
|
#include "opt_cpu.h"
|
|
#include "opt_hwpmc_hooks.h"
|
|
#include "opt_isa.h"
|
|
@@ -98,6 +99,9 @@
|
|
#include <sys/dtrace_bsd.h>
|
|
#endif
|
|
|
|
+extern inthand_t IDTVEC(bpt), IDTVEC(dbg), IDTVEC(fast_syscall),
|
|
+ IDTVEC(fast_syscall32), IDTVEC(int0x80_syscall);
|
|
+
|
|
extern void trap(struct trapframe *frame);
|
|
extern void syscall(struct trapframe *frame);
|
|
void dblfault_handler(struct trapframe *frame);
|
|
@@ -549,7 +553,40 @@
|
|
load_dr6(rdr6() & 0xfffffff0);
|
|
goto out;
|
|
}
|
|
+
|
|
/*
|
|
+ * Malicious user code can configure a debug
|
|
+ * register watchpoint to trap on data access
|
|
+ * to the top of stack and then execute 'pop
|
|
+ * %ss; int 3'. Due to exception deferral for
|
|
+ * 'pop %ss', the CPU will not interrupt 'int
|
|
+ * 3' to raise the DB# exception for the debug
|
|
+ * register but will postpone the DB# until
|
|
+ * execution of the first instruction of the
|
|
+ * BP# handler (in kernel mode). Normally the
|
|
+ * previous check would ignore DB# exceptions
|
|
+ * for watchpoints on user addresses raised in
|
|
+ * kernel mode. However, some CPU errata
|
|
+ * include cases where DB# exceptions do not
|
|
+ * properly set bits in %dr6, e.g. Haswell
|
|
+ * HSD23 and Skylake-X SKZ24.
|
|
+ *
|
|
+ * A deferred DB# can also be raised on the
|
|
+ * first instructions of system call entry
|
|
+ * points or single-step traps via similar use
|
|
+ * of 'pop %ss' or 'mov xxx, %ss'.
|
|
+ */
|
|
+ if (frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall) ||
|
|
+#ifdef COMPAT_FREEBSD32
|
|
+ frame->tf_rip ==
|
|
+ (uintptr_t)IDTVEC(int0x80_syscall) ||
|
|
+#endif
|
|
+ frame->tf_rip == (uintptr_t)IDTVEC(bpt) ||
|
|
+ frame->tf_rip == (uintptr_t)IDTVEC(dbg) ||
|
|
+ /* Needed for AMD. */
|
|
+ frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall32))
|
|
+ return;
|
|
+ /*
|
|
* FALLTHROUGH (TRCTRAP kernel mode, kernel address)
|
|
*/
|
|
case T_BPTFLT:
|
|
--- sys/i386/i386/trap.c.orig
|
|
+++ sys/i386/i386/trap.c
|
|
@@ -116,6 +116,8 @@
|
|
|
|
extern inthand_t IDTVEC(lcall_syscall);
|
|
|
|
+extern inthand_t IDTVEC(bpt), IDTVEC(dbg), IDTVEC(int0x80_syscall);
|
|
+
|
|
#define MAX_TRAP_MSG 32
|
|
static char *trap_msg[] = {
|
|
"", /* 0 unused */
|
|
@@ -683,7 +685,35 @@
|
|
load_dr6(rdr6() & 0xfffffff0);
|
|
goto out;
|
|
}
|
|
+
|
|
/*
|
|
+ * Malicious user code can configure a debug
|
|
+ * register watchpoint to trap on data access
|
|
+ * to the top of stack and then execute 'pop
|
|
+ * %ss; int 3'. Due to exception deferral for
|
|
+ * 'pop %ss', the CPU will not interrupt 'int
|
|
+ * 3' to raise the DB# exception for the debug
|
|
+ * register but will postpone the DB# until
|
|
+ * execution of the first instruction of the
|
|
+ * BP# handler (in kernel mode). Normally the
|
|
+ * previous check would ignore DB# exceptions
|
|
+ * for watchpoints on user addresses raised in
|
|
+ * kernel mode. However, some CPU errata
|
|
+ * include cases where DB# exceptions do not
|
|
+ * properly set bits in %dr6, e.g. Haswell
|
|
+ * HSD23 and Skylake-X SKZ24.
|
|
+ *
|
|
+ * A deferred DB# can also be raised on the
|
|
+ * first instructions of system call entry
|
|
+ * points or single-step traps via similar use
|
|
+ * of 'pop %ss' or 'mov xxx, %ss'.
|
|
+ */
|
|
+ if (frame->tf_eip ==
|
|
+ (uintptr_t)IDTVEC(int0x80_syscall) ||
|
|
+ frame->tf_eip == (uintptr_t)IDTVEC(bpt) ||
|
|
+ frame->tf_eip == (uintptr_t)IDTVEC(dbg))
|
|
+ return;
|
|
+ /*
|
|
* FALLTHROUGH (TRCTRAP kernel mode, kernel address)
|
|
*/
|
|
case T_BPTFLT:
|