FreeBSD patch: handle deferred DB# exception caused by 'mov %ss'/'pop %ss'
(CVE-2018-8897).  The amd64 DB# handler is moved off the trap-gate path onto
its own IST stack (IST4) with explicit GSBASE/%cr3 handling, since the delayed
exception can fire on the first instruction of a kernel entry point; trap()
additionally ignores spurious kernel-mode DB# traps raised at known entry
points on both amd64 and i386.
--- sys/amd64/amd64/exception.S.orig
+++ sys/amd64/amd64/exception.S
@@ -116,7 +116,6 @@
 	jmp alltraps_noen
 	.endm
 
-	TRAP_NOEN	dbg, T_TRCTRAP
 	TRAP_NOEN	bpt, T_BPTFLT
 #ifdef KDTRACE_HOOKS
 	TRAP_NOEN	dtrace_ret, T_DTRACE_RET
@@ -509,6 +508,121 @@
 	sysret
 
 /*
+ * DB# handler is very similar to NM#, because 'mov/pop %ss' delay
+ * generation of exception until the next instruction is executed,
+ * which might be a kernel entry.  So we must execute the handler
+ * on IST stack and be ready for non-kernel GSBASE.
+ */
+IDTVEC(dbg)
+	subq	$TF_RIP,%rsp
+	movl	$(T_TRCTRAP),TF_TRAPNO(%rsp)
+	movq	$0,TF_ADDR(%rsp)
+	movq	$0,TF_ERR(%rsp)
+	movq	%rdi,TF_RDI(%rsp)
+	movq	%rsi,TF_RSI(%rsp)
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	movq	%r8,TF_R8(%rsp)
+	movq	%r9,TF_R9(%rsp)
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rbx,TF_RBX(%rsp)
+	movq	%rbp,TF_RBP(%rsp)
+	movq	%r10,TF_R10(%rsp)
+	movq	%r11,TF_R11(%rsp)
+	movq	%r12,TF_R12(%rsp)
+	movq	%r13,TF_R13(%rsp)
+	movq	%r14,TF_R14(%rsp)
+	movq	%r15,TF_R15(%rsp)
+	SAVE_SEGS
+	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
+	cld
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
+	jnz	dbg_fromuserspace
+	/*
+	 * We've interrupted the kernel.  Preserve GS.base in %r12,
+	 * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d.
+	 */
+	movl	$MSR_GSBASE,%ecx
+	rdmsr
+	movq	%rax,%r12
+	shlq	$32,%rdx
+	orq	%rdx,%r12
+	/* Retrieve and load the canonical value for GS.base. */
+	movq	TF_SIZE(%rsp),%rdx
+	movl	%edx,%eax
+	shrq	$32,%rdx
+	wrmsr
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	1f
+	movq	%rax,%cr3
+1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
+	je	2f
+	movl	$MSR_IA32_SPEC_CTRL,%ecx
+	rdmsr
+	movl	%eax,%r14d
+	call	handle_ibrs_entry
+2:	FAKE_MCOUNT(TF_RIP(%rsp))
+	movq	%rsp,%rdi
+	call	trap
+	MEXITCOUNT
+	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
+	je	3f
+	movl	%r14d,%eax
+	xorl	%edx,%edx
+	movl	$MSR_IA32_SPEC_CTRL,%ecx
+	wrmsr
+	/*
+	 * Put back the preserved MSR_GSBASE value.
+	 */
+3:	movl	$MSR_GSBASE,%ecx
+	movq	%r12,%rdx
+	movl	%edx,%eax
+	shrq	$32,%rdx
+	wrmsr
+	movq	%r13,%cr3
+	RESTORE_REGS
+	addq	$TF_RIP,%rsp
+	jmp	doreti_iret
+dbg_fromuserspace:
+	/*
+	 * Switch to kernel GSBASE and kernel page table, and copy frame
+	 * from the IST stack to the normal kernel stack, since trap()
+	 * re-enables interrupts, and since we might trap on DB# while
+	 * in trap().
+	 */
+	swapgs
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	1f
+	movq	%rax,%cr3
+1:	movq	PCPU(RSP0),%rax
+	movl	$TF_SIZE,%ecx
+	subq	%rcx,%rax
+	movq	%rax,%rdi
+	movq	%rsp,%rsi
+	rep;movsb
+	movq	%rax,%rsp
+	call	handle_ibrs_entry
+	movq	PCPU(CURPCB),%rdi
+	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
+	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
+	jz	3f
+	cmpw	$KUF32SEL,TF_FS(%rsp)
+	jne	2f
+	rdfsbase %rax
+	movq	%rax,PCB_FSBASE(%rdi)
+2:	cmpw	$KUG32SEL,TF_GS(%rsp)
+	jne	3f
+	movl	$MSR_KGSBASE,%ecx
+	rdmsr
+	shlq	$32,%rdx
+	orq	%rdx,%rax
+	movq	%rax,PCB_GSBASE(%rdi)
+3:	jmp	calltrap
+
+/*
  * NMI handling is special.
  *
  * First, NMIs do not respect the state of the processor's RFLAGS.IF
--- sys/amd64/amd64/machdep.c.orig
+++ sys/amd64/amd64/machdep.c
@@ -675,6 +675,7 @@
 static char dblfault_stack[PAGE_SIZE] __aligned(16);
 static char mce0_stack[PAGE_SIZE] __aligned(16);
 static char nmi0_stack[PAGE_SIZE] __aligned(16);
+static char dbg0_stack[PAGE_SIZE] __aligned(16);
 CTASSERT(sizeof(struct nmi_pcpu) == 16);
 
 struct amd64tss common_tss[MAXCPU];
@@ -827,7 +828,7 @@
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm), IDTVEC(dblfault),
-	IDTVEC(div_pti), IDTVEC(dbg_pti), IDTVEC(bpt_pti),
+	IDTVEC(div_pti), IDTVEC(bpt_pti),
 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
@@ -1637,8 +1638,7 @@
 	    SEL_KPL, 0);
 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
 	    SEL_KPL, 0);
-	setidt(IDT_DB, pti ? &IDTVEC(dbg_pti) : &IDTVEC(dbg), SDT_SYSIGT,
-	    SEL_KPL, 0);
+	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
 	    SEL_UPL, 0);
@@ -1720,6 +1720,13 @@
 	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
 	np->np_pcpu = (register_t) pc;
 	common_tss[0].tss_ist3 = (long) np;
+
+	/*
+	 * DB# stack, runs on ist4.
+	 */
+	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
+	np->np_pcpu = (register_t) pc;
+	common_tss[0].tss_ist4 = (long) np;
 	
 	/* Set the IO permission bitmap (empty due to tss seg limit) */
 	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
--- sys/amd64/amd64/mp_machdep.c.orig
+++ sys/amd64/amd64/mp_machdep.c
@@ -87,6 +87,7 @@
 char *doublefault_stack;
 char *mce_stack;
 char *nmi_stack;
+char *dbg_stack;
 
 /*
  * Local data and functions.
@@ -225,6 +226,10 @@
 	np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
 	common_tss[cpu].tss_ist3 = (long) np;
 
+	/* The DB# stack runs on IST4. */
+	np = ((struct nmi_pcpu *) &dbg_stack[PAGE_SIZE]) - 1;
+	common_tss[cpu].tss_ist4 = (long) np;
+
 	/* Prepare private GDT */
 	gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
 	for (x = 0; x < NGDT; x++) {
@@ -270,6 +275,10 @@
 	np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
 	np->np_pcpu = (register_t) pc;
 
+	/* Save the per-cpu pointer for use by the DB# handler. */
+	np = ((struct nmi_pcpu *) &dbg_stack[PAGE_SIZE]) - 1;
+	np->np_pcpu = (register_t) pc;
+
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, (u_int64_t)pc);	/* XXX User value while we're in the kernel */
@@ -368,6 +377,8 @@
 		    M_WAITOK | M_ZERO);
 		nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
 		    M_WAITOK | M_ZERO);
+		dbg_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
+		    M_WAITOK | M_ZERO);
 		dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE,
 		    M_WAITOK | M_ZERO);
 
--- sys/amd64/amd64/pmap.c.orig
+++ sys/amd64/amd64/pmap.c
@@ -7565,6 +7565,9 @@
 		/* MC# stack IST 3 */
 		va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu);
 		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
+		/* DB# stack IST 4 */
+		va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu);
+		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
 	}
 	pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
 	    (vm_offset_t)etext, true);
--- sys/amd64/amd64/trap.c.orig
+++ sys/amd64/amd64/trap.c
@@ -45,6 +45,7 @@
  */
 
 #include "opt_clock.h"
+#include "opt_compat.h"
 #include "opt_cpu.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_isa.h"
@@ -99,6 +100,11 @@
 #include <sys/dtrace_bsd.h>
 #endif
 
+extern inthand_t IDTVEC(bpt), IDTVEC(bpt_pti), IDTVEC(dbg),
+    IDTVEC(fast_syscall), IDTVEC(fast_syscall_pti), IDTVEC(fast_syscall32),
+    IDTVEC(int0x80_syscall_pti), IDTVEC(int0x80_syscall);
+
+
 extern void __noinline trap(struct trapframe *frame);
 extern void trap_check(struct trapframe *frame);
 extern void syscall(struct trapframe *frame);
@@ -536,7 +542,53 @@
 				load_dr6(rdr6() & ~0xf);
 				goto out;
 			}
+
 			/*
+			 * Malicious user code can configure a debug
+			 * register watchpoint to trap on data access
+			 * to the top of stack and then execute 'pop
+			 * %ss; int 3'.  Due to exception deferral for
+			 * 'pop %ss', the CPU will not interrupt 'int
+			 * 3' to raise the DB# exception for the debug
+			 * register but will postpone the DB# until
+			 * execution of the first instruction of the
+			 * BP# handler (in kernel mode).  Normally the
+			 * previous check would ignore DB# exceptions
+			 * for watchpoints on user addresses raised in
+			 * kernel mode.  However, some CPU errata
+			 * include cases where DB# exceptions do not
+			 * properly set bits in %dr6, e.g. Haswell
+			 * HSD23 and Skylake-X SKZ24.
+			 *
+			 * A deferred DB# can also be raised on the
+			 * first instructions of system call entry
+			 * points or single-step traps via similar use
+			 * of 'pop %ss' or 'mov xxx, %ss'.
+			 */
+			if (pti) {
+				if (frame->tf_rip ==
+				    (uintptr_t)IDTVEC(fast_syscall_pti) ||
+#ifdef COMPAT_FREEBSD32
+				    frame->tf_rip ==
+				    (uintptr_t)IDTVEC(int0x80_syscall_pti) ||
+#endif
+				    frame->tf_rip == (uintptr_t)IDTVEC(bpt_pti))
+					return;
+			} else {
+				if (frame->tf_rip ==
+				    (uintptr_t)IDTVEC(fast_syscall) ||
+#ifdef COMPAT_FREEBSD32
+				    frame->tf_rip ==
+				    (uintptr_t)IDTVEC(int0x80_syscall) ||
+#endif
+				    frame->tf_rip == (uintptr_t)IDTVEC(bpt))
+					return;
+			}
+			if (frame->tf_rip == (uintptr_t)IDTVEC(dbg) ||
+			    /* Needed for AMD. */
+			    frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall32))
+				return;
+			/*
 			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
 			 */
 		case T_BPTFLT:
--- sys/i386/i386/trap.c.orig
+++ sys/i386/i386/trap.c
@@ -116,6 +116,8 @@
 
 extern inthand_t IDTVEC(lcall_syscall);
 
+extern inthand_t IDTVEC(bpt), IDTVEC(dbg), IDTVEC(int0x80_syscall);
+
 #define MAX_TRAP_MSG		32
 static char *trap_msg[] = {
 	"",					/*  0 unused */
@@ -668,7 +670,35 @@
 				load_dr6(rdr6() & ~0xf);
 				goto out;
 			}
+
 			/*
+			 * Malicious user code can configure a debug
+			 * register watchpoint to trap on data access
+			 * to the top of stack and then execute 'pop
+			 * %ss; int 3'.  Due to exception deferral for
+			 * 'pop %ss', the CPU will not interrupt 'int
+			 * 3' to raise the DB# exception for the debug
+			 * register but will postpone the DB# until
+			 * execution of the first instruction of the
+			 * BP# handler (in kernel mode).  Normally the
+			 * previous check would ignore DB# exceptions
+			 * for watchpoints on user addresses raised in
+			 * kernel mode.  However, some CPU errata
+			 * include cases where DB# exceptions do not
+			 * properly set bits in %dr6, e.g. Haswell
+			 * HSD23 and Skylake-X SKZ24.
+			 *
+			 * A deferred DB# can also be raised on the
+			 * first instructions of system call entry
+			 * points or single-step traps via similar use
+			 * of 'pop %ss' or 'mov xxx, %ss'.
+			 */
+			if (frame->tf_eip ==
+			    (uintptr_t)IDTVEC(int0x80_syscall) ||
+			    frame->tf_eip == (uintptr_t)IDTVEC(bpt) ||
+			    frame->tf_eip == (uintptr_t)IDTVEC(dbg))
+				return;
+			/*
 			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
 			 */
 		case T_BPTFLT: