4618 lines
122 KiB
Diff
4618 lines
122 KiB
Diff
--- sys/amd64/amd64/apic_vector.S.orig
|
||
+++ sys/amd64/amd64/apic_vector.S
|
||
@@ -2,7 +2,13 @@
|
||
* Copyright (c) 1989, 1990 William F. Jolitz.
|
||
* Copyright (c) 1990 The Regents of the University of California.
|
||
* All rights reserved.
|
||
+ * Copyright (c) 2014-2018 The FreeBSD Foundation
|
||
+ * All rights reserved.
|
||
*
|
||
+ * Portions of this software were developed by
|
||
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
|
||
+ * the FreeBSD Foundation.
|
||
+ *
|
||
* Redistribution and use in source and binary forms, with or without
|
||
* modification, are permitted provided that the following conditions
|
||
* are met:
|
||
@@ -38,12 +44,12 @@
|
||
|
||
#include "opt_smp.h"
|
||
|
||
+#include "assym.s"
|
||
+
|
||
#include <machine/asmacros.h>
|
||
#include <machine/specialreg.h>
|
||
#include <x86/apicreg.h>
|
||
|
||
-#include "assym.s"
|
||
-
|
||
#ifdef SMP
|
||
#define LK lock ;
|
||
#else
|
||
@@ -73,30 +79,28 @@
|
||
* translates that into a vector, and passes the vector to the
|
||
* lapic_handle_intr() function.
|
||
*/
|
||
-#define ISR_VEC(index, vec_name) \
|
||
- .text ; \
|
||
- SUPERALIGN_TEXT ; \
|
||
-IDTVEC(vec_name) ; \
|
||
- PUSH_FRAME ; \
|
||
- FAKE_MCOUNT(TF_RIP(%rsp)) ; \
|
||
- cmpl $0,x2apic_mode ; \
|
||
- je 1f ; \
|
||
- movl $(MSR_APIC_ISR0 + index),%ecx ; \
|
||
- rdmsr ; \
|
||
- jmp 2f ; \
|
||
-1: ; \
|
||
- movq lapic_map, %rdx ; /* pointer to local APIC */ \
|
||
- movl LA_ISR + 16 * (index)(%rdx), %eax ; /* load ISR */ \
|
||
-2: ; \
|
||
- bsrl %eax, %eax ; /* index of highest set bit in ISR */ \
|
||
- jz 3f ; \
|
||
- addl $(32 * index),%eax ; \
|
||
- movq %rsp, %rsi ; \
|
||
- movl %eax, %edi ; /* pass the IRQ */ \
|
||
- call lapic_handle_intr ; \
|
||
-3: ; \
|
||
- MEXITCOUNT ; \
|
||
+ .macro ISR_VEC index, vec_name
|
||
+ INTR_HANDLER \vec_name
|
||
+ FAKE_MCOUNT(TF_RIP(%rsp))
|
||
+ cmpl $0,x2apic_mode
|
||
+ je 1f
|
||
+ movl $(MSR_APIC_ISR0 + \index),%ecx
|
||
+ rdmsr
|
||
+ jmp 2f
|
||
+1:
|
||
+ movq lapic_map, %rdx /* pointer to local APIC */
|
||
+ movl LA_ISR + 16 * (\index)(%rdx), %eax /* load ISR */
|
||
+2:
|
||
+ bsrl %eax, %eax /* index of highest set bit in ISR */
|
||
+ jz 3f
|
||
+ addl $(32 * \index),%eax
|
||
+ movq %rsp, %rsi
|
||
+ movl %eax, %edi /* pass the IRQ */
|
||
+ call lapic_handle_intr
|
||
+3:
|
||
+ MEXITCOUNT
|
||
jmp doreti
|
||
+ .endm
|
||
|
||
/*
|
||
* Handle "spurious INTerrupts".
|
||
@@ -108,26 +112,21 @@
|
||
.text
|
||
SUPERALIGN_TEXT
|
||
IDTVEC(spuriousint)
|
||
-
|
||
/* No EOI cycle used here */
|
||
-
|
||
jmp doreti_iret
|
||
|
||
- ISR_VEC(1, apic_isr1)
|
||
- ISR_VEC(2, apic_isr2)
|
||
- ISR_VEC(3, apic_isr3)
|
||
- ISR_VEC(4, apic_isr4)
|
||
- ISR_VEC(5, apic_isr5)
|
||
- ISR_VEC(6, apic_isr6)
|
||
- ISR_VEC(7, apic_isr7)
|
||
+ ISR_VEC 1, apic_isr1
|
||
+ ISR_VEC 2, apic_isr2
|
||
+ ISR_VEC 3, apic_isr3
|
||
+ ISR_VEC 4, apic_isr4
|
||
+ ISR_VEC 5, apic_isr5
|
||
+ ISR_VEC 6, apic_isr6
|
||
+ ISR_VEC 7, apic_isr7
|
||
|
||
/*
|
||
* Local APIC periodic timer handler.
|
||
*/
|
||
- .text
|
||
- SUPERALIGN_TEXT
|
||
-IDTVEC(timerint)
|
||
- PUSH_FRAME
|
||
+ INTR_HANDLER timerint
|
||
FAKE_MCOUNT(TF_RIP(%rsp))
|
||
movq %rsp, %rdi
|
||
call lapic_handle_timer
|
||
@@ -137,10 +136,7 @@
|
||
/*
|
||
* Local APIC CMCI handler.
|
||
*/
|
||
- .text
|
||
- SUPERALIGN_TEXT
|
||
-IDTVEC(cmcint)
|
||
- PUSH_FRAME
|
||
+ INTR_HANDLER cmcint
|
||
FAKE_MCOUNT(TF_RIP(%rsp))
|
||
call lapic_handle_cmc
|
||
MEXITCOUNT
|
||
@@ -149,10 +145,7 @@
|
||
/*
|
||
* Local APIC error interrupt handler.
|
||
*/
|
||
- .text
|
||
- SUPERALIGN_TEXT
|
||
-IDTVEC(errorint)
|
||
- PUSH_FRAME
|
||
+ INTR_HANDLER errorint
|
||
FAKE_MCOUNT(TF_RIP(%rsp))
|
||
call lapic_handle_error
|
||
MEXITCOUNT
|
||
@@ -163,10 +156,7 @@
|
||
* Xen event channel upcall interrupt handler.
|
||
* Only used when the hypervisor supports direct vector callbacks.
|
||
*/
|
||
- .text
|
||
- SUPERALIGN_TEXT
|
||
-IDTVEC(xen_intr_upcall)
|
||
- PUSH_FRAME
|
||
+ INTR_HANDLER xen_intr_upcall
|
||
FAKE_MCOUNT(TF_RIP(%rsp))
|
||
movq %rsp, %rdi
|
||
call xen_intr_handle_upcall
|
||
@@ -183,59 +173,59 @@
|
||
SUPERALIGN_TEXT
|
||
invltlb_ret:
|
||
call as_lapic_eoi
|
||
- POP_FRAME
|
||
- jmp doreti_iret
|
||
+ jmp ld_regs
|
||
|
||
SUPERALIGN_TEXT
|
||
-IDTVEC(invltlb)
|
||
- PUSH_FRAME
|
||
-
|
||
+ INTR_HANDLER invltlb
|
||
call invltlb_handler
|
||
jmp invltlb_ret
|
||
|
||
-IDTVEC(invltlb_pcid)
|
||
- PUSH_FRAME
|
||
-
|
||
+ INTR_HANDLER invltlb_pcid
|
||
call invltlb_pcid_handler
|
||
jmp invltlb_ret
|
||
|
||
-IDTVEC(invltlb_invpcid)
|
||
- PUSH_FRAME
|
||
-
|
||
+ INTR_HANDLER invltlb_invpcid_nopti
|
||
call invltlb_invpcid_handler
|
||
jmp invltlb_ret
|
||
|
||
+ INTR_HANDLER invltlb_invpcid_pti
|
||
+ call invltlb_invpcid_pti_handler
|
||
+ jmp invltlb_ret
|
||
+
|
||
/*
|
||
* Single page TLB shootdown
|
||
*/
|
||
- .text
|
||
+ INTR_HANDLER invlpg
|
||
+ call invlpg_handler
|
||
+ jmp invltlb_ret
|
||
|
||
- SUPERALIGN_TEXT
|
||
-IDTVEC(invlpg)
|
||
- PUSH_FRAME
|
||
+ INTR_HANDLER invlpg_invpcid
|
||
+ call invlpg_invpcid_handler
|
||
+ jmp invltlb_ret
|
||
|
||
- call invlpg_handler
|
||
+ INTR_HANDLER invlpg_pcid
|
||
+ call invlpg_pcid_handler
|
||
jmp invltlb_ret
|
||
|
||
/*
|
||
* Page range TLB shootdown.
|
||
*/
|
||
- .text
|
||
- SUPERALIGN_TEXT
|
||
-IDTVEC(invlrng)
|
||
- PUSH_FRAME
|
||
-
|
||
+ INTR_HANDLER invlrng
|
||
call invlrng_handler
|
||
jmp invltlb_ret
|
||
|
||
+ INTR_HANDLER invlrng_invpcid
|
||
+ call invlrng_invpcid_handler
|
||
+ jmp invltlb_ret
|
||
+
|
||
+ INTR_HANDLER invlrng_pcid
|
||
+ call invlrng_pcid_handler
|
||
+ jmp invltlb_ret
|
||
+
|
||
/*
|
||
* Invalidate cache.
|
||
*/
|
||
- .text
|
||
- SUPERALIGN_TEXT
|
||
-IDTVEC(invlcache)
|
||
- PUSH_FRAME
|
||
-
|
||
+ INTR_HANDLER invlcache
|
||
call invlcache_handler
|
||
jmp invltlb_ret
|
||
|
||
@@ -242,15 +232,9 @@
|
||
/*
|
||
* Handler for IPIs sent via the per-cpu IPI bitmap.
|
||
*/
|
||
- .text
|
||
- SUPERALIGN_TEXT
|
||
-IDTVEC(ipi_intr_bitmap_handler)
|
||
- PUSH_FRAME
|
||
-
|
||
+ INTR_HANDLER ipi_intr_bitmap_handler
|
||
call as_lapic_eoi
|
||
-
|
||
FAKE_MCOUNT(TF_RIP(%rsp))
|
||
-
|
||
call ipi_bitmap_handler
|
||
MEXITCOUNT
|
||
jmp doreti
|
||
@@ -258,13 +242,8 @@
|
||
/*
|
||
* Executed by a CPU when it receives an IPI_STOP from another CPU.
|
||
*/
|
||
- .text
|
||
- SUPERALIGN_TEXT
|
||
-IDTVEC(cpustop)
|
||
- PUSH_FRAME
|
||
-
|
||
+ INTR_HANDLER cpustop
|
||
call as_lapic_eoi
|
||
-
|
||
call cpustop_handler
|
||
jmp doreti
|
||
|
||
@@ -271,11 +250,7 @@
|
||
/*
|
||
* Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
|
||
*/
|
||
- .text
|
||
- SUPERALIGN_TEXT
|
||
-IDTVEC(cpususpend)
|
||
- PUSH_FRAME
|
||
-
|
||
+ INTR_HANDLER cpususpend
|
||
call cpususpend_handler
|
||
call as_lapic_eoi
|
||
jmp doreti
|
||
@@ -285,10 +260,7 @@
|
||
*
|
||
* - Calls the generic rendezvous action function.
|
||
*/
|
||
- .text
|
||
- SUPERALIGN_TEXT
|
||
-IDTVEC(rendezvous)
|
||
- PUSH_FRAME
|
||
+ INTR_HANDLER rendezvous
|
||
#ifdef COUNT_IPIS
|
||
movl PCPU(CPUID), %eax
|
||
movq ipi_rendezvous_counts(,%rax,8), %rax
|
||
@@ -328,4 +300,8 @@
|
||
popq %rax
|
||
jmp doreti_iret
|
||
|
||
+ INTR_HANDLER justreturn1
|
||
+ call as_lapic_eoi
|
||
+ jmp doreti
|
||
+
|
||
#endif /* SMP */
|
||
--- sys/amd64/amd64/atpic_vector.S.orig
|
||
+++ sys/amd64/amd64/atpic_vector.S
|
||
@@ -36,38 +36,35 @@
|
||
* master and slave interrupt controllers.
|
||
*/
|
||
|
||
+#include "assym.s"
|
||
#include <machine/asmacros.h>
|
||
|
||
-#include "assym.s"
|
||
-
|
||
/*
|
||
* Macros for interrupt entry, call to handler, and exit.
|
||
*/
|
||
-#define INTR(irq_num, vec_name) \
|
||
- .text ; \
|
||
- SUPERALIGN_TEXT ; \
|
||
-IDTVEC(vec_name) ; \
|
||
- PUSH_FRAME ; \
|
||
- FAKE_MCOUNT(TF_RIP(%rsp)) ; \
|
||
- movq %rsp, %rsi ; \
|
||
- movl $irq_num, %edi; /* pass the IRQ */ \
|
||
- call atpic_handle_intr ; \
|
||
- MEXITCOUNT ; \
|
||
+ .macro INTR irq_num, vec_name
|
||
+ INTR_HANDLER \vec_name
|
||
+ FAKE_MCOUNT(TF_RIP(%rsp))
|
||
+ movq %rsp, %rsi
|
||
+ movl $\irq_num, %edi /* pass the IRQ */
|
||
+ call atpic_handle_intr
|
||
+ MEXITCOUNT
|
||
jmp doreti
|
||
+ .endm
|
||
|
||
- INTR(0, atpic_intr0)
|
||
- INTR(1, atpic_intr1)
|
||
- INTR(2, atpic_intr2)
|
||
- INTR(3, atpic_intr3)
|
||
- INTR(4, atpic_intr4)
|
||
- INTR(5, atpic_intr5)
|
||
- INTR(6, atpic_intr6)
|
||
- INTR(7, atpic_intr7)
|
||
- INTR(8, atpic_intr8)
|
||
- INTR(9, atpic_intr9)
|
||
- INTR(10, atpic_intr10)
|
||
- INTR(11, atpic_intr11)
|
||
- INTR(12, atpic_intr12)
|
||
- INTR(13, atpic_intr13)
|
||
- INTR(14, atpic_intr14)
|
||
- INTR(15, atpic_intr15)
|
||
+ INTR 0, atpic_intr0
|
||
+ INTR 1, atpic_intr1
|
||
+ INTR 2, atpic_intr2
|
||
+ INTR 3, atpic_intr3
|
||
+ INTR 4, atpic_intr4
|
||
+ INTR 5, atpic_intr5
|
||
+ INTR 6, atpic_intr6
|
||
+ INTR 7, atpic_intr7
|
||
+ INTR 8, atpic_intr8
|
||
+ INTR 9, atpic_intr9
|
||
+ INTR 10, atpic_intr10
|
||
+ INTR 11, atpic_intr11
|
||
+ INTR 12, atpic_intr12
|
||
+ INTR 13, atpic_intr13
|
||
+ INTR 14, atpic_intr14
|
||
+ INTR 15, atpic_intr15
|
||
--- sys/amd64/amd64/cpu_switch.S.orig
|
||
+++ sys/amd64/amd64/cpu_switch.S
|
||
@@ -191,9 +191,11 @@
|
||
done_tss:
|
||
movq %r8,PCPU(RSP0)
|
||
movq %r8,PCPU(CURPCB)
|
||
- /* Update the TSS_RSP0 pointer for the next interrupt */
|
||
+ /* Update the COMMON_TSS_RSP0 pointer for the next interrupt */
|
||
+ cmpb $0,pti(%rip)
|
||
+ jne 1f
|
||
movq %r8,COMMON_TSS_RSP0(%rdx)
|
||
- movq %r12,PCPU(CURTHREAD) /* into next thread */
|
||
+1: movq %r12,PCPU(CURTHREAD) /* into next thread */
|
||
|
||
/* Test if debug registers should be restored. */
|
||
testl $PCB_DBREGS,PCB_FLAGS(%r8)
|
||
@@ -270,7 +272,12 @@
|
||
shrq $8,%rcx
|
||
movl %ecx,8(%rax)
|
||
movb $0x89,5(%rax) /* unset busy */
|
||
- movl $TSSSEL,%eax
|
||
+ cmpb $0,pti(%rip)
|
||
+ je 1f
|
||
+ movq PCPU(PRVSPACE),%rax
|
||
+ addq $PC_PTI_STACK+PC_PTI_STACK_SZ*8,%rax
|
||
+ movq %rax,COMMON_TSS_RSP0(%rdx)
|
||
+1: movl $TSSSEL,%eax
|
||
ltr %ax
|
||
jmp done_tss
|
||
|
||
--- sys/amd64/amd64/db_trace.c.orig
|
||
+++ sys/amd64/amd64/db_trace.c
|
||
@@ -200,6 +200,7 @@
|
||
if (name != NULL) {
|
||
if (strcmp(name, "calltrap") == 0 ||
|
||
strcmp(name, "fork_trampoline") == 0 ||
|
||
+ strcmp(name, "mchk_calltrap") == 0 ||
|
||
strcmp(name, "nmi_calltrap") == 0 ||
|
||
strcmp(name, "Xdblfault") == 0)
|
||
frame_type = TRAP;
|
||
--- sys/amd64/amd64/exception.S.orig
|
||
+++ sys/amd64/amd64/exception.S
|
||
@@ -1,12 +1,16 @@
|
||
/*-
|
||
* Copyright (c) 1989, 1990 William F. Jolitz.
|
||
* Copyright (c) 1990 The Regents of the University of California.
|
||
- * Copyright (c) 2007 The FreeBSD Foundation
|
||
+ * Copyright (c) 2007-2018 The FreeBSD Foundation
|
||
* All rights reserved.
|
||
*
|
||
* Portions of this software were developed by A. Joseph Koshy under
|
||
* sponsorship from the FreeBSD Foundation and Google, Inc.
|
||
*
|
||
+ * Portions of this software were developed by
|
||
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
|
||
+ * the FreeBSD Foundation.
|
||
+ *
|
||
* Redistribution and use in source and binary forms, with or without
|
||
* modification, are permitted provided that the following conditions
|
||
* are met:
|
||
@@ -38,13 +42,13 @@
|
||
#include "opt_compat.h"
|
||
#include "opt_hwpmc_hooks.h"
|
||
|
||
+#include "assym.s"
|
||
+
|
||
#include <machine/asmacros.h>
|
||
#include <machine/psl.h>
|
||
#include <machine/trap.h>
|
||
#include <machine/specialreg.h>
|
||
|
||
-#include "assym.s"
|
||
-
|
||
#ifdef KDTRACE_HOOKS
|
||
.bss
|
||
.globl dtrace_invop_jump_addr
|
||
@@ -100,69 +104,62 @@
|
||
MCOUNT_LABEL(user)
|
||
MCOUNT_LABEL(btrap)
|
||
|
||
-/* Traps that we leave interrupts disabled for.. */
|
||
-#define TRAP_NOEN(a) \
|
||
- subq $TF_RIP,%rsp; \
|
||
- movl $(a),TF_TRAPNO(%rsp) ; \
|
||
- movq $0,TF_ADDR(%rsp) ; \
|
||
- movq $0,TF_ERR(%rsp) ; \
|
||
+/* Traps that we leave interrupts disabled for. */
|
||
+ .macro TRAP_NOEN l, trapno
|
||
+ PTI_ENTRY \l,X\l
|
||
+ .globl X\l
|
||
+ .type X\l,@function
|
||
+X\l: subq $TF_RIP,%rsp
|
||
+ movl $\trapno,TF_TRAPNO(%rsp)
|
||
+ movq $0,TF_ADDR(%rsp)
|
||
+ movq $0,TF_ERR(%rsp)
|
||
jmp alltraps_noen
|
||
-IDTVEC(dbg)
|
||
- TRAP_NOEN(T_TRCTRAP)
|
||
-IDTVEC(bpt)
|
||
- TRAP_NOEN(T_BPTFLT)
|
||
+ .endm
|
||
+
|
||
+ TRAP_NOEN dbg, T_TRCTRAP
|
||
+ TRAP_NOEN bpt, T_BPTFLT
|
||
#ifdef KDTRACE_HOOKS
|
||
-IDTVEC(dtrace_ret)
|
||
- TRAP_NOEN(T_DTRACE_RET)
|
||
+ TRAP_NOEN dtrace_ret, T_DTRACE_RET
|
||
#endif
|
||
|
||
/* Regular traps; The cpu does not supply tf_err for these. */
|
||
-#define TRAP(a) \
|
||
- subq $TF_RIP,%rsp; \
|
||
- movl $(a),TF_TRAPNO(%rsp) ; \
|
||
- movq $0,TF_ADDR(%rsp) ; \
|
||
- movq $0,TF_ERR(%rsp) ; \
|
||
+ .macro TRAP l, trapno
|
||
+ PTI_ENTRY \l,X\l
|
||
+ .globl X\l
|
||
+ .type X\l,@function
|
||
+X\l:
|
||
+ subq $TF_RIP,%rsp
|
||
+ movl $\trapno,TF_TRAPNO(%rsp)
|
||
+ movq $0,TF_ADDR(%rsp)
|
||
+ movq $0,TF_ERR(%rsp)
|
||
jmp alltraps
|
||
-IDTVEC(div)
|
||
- TRAP(T_DIVIDE)
|
||
-IDTVEC(ofl)
|
||
- TRAP(T_OFLOW)
|
||
-IDTVEC(bnd)
|
||
- TRAP(T_BOUND)
|
||
-IDTVEC(ill)
|
||
- TRAP(T_PRIVINFLT)
|
||
-IDTVEC(dna)
|
||
- TRAP(T_DNA)
|
||
-IDTVEC(fpusegm)
|
||
- TRAP(T_FPOPFLT)
|
||
-IDTVEC(mchk)
|
||
- TRAP(T_MCHK)
|
||
-IDTVEC(rsvd)
|
||
- TRAP(T_RESERVED)
|
||
-IDTVEC(fpu)
|
||
- TRAP(T_ARITHTRAP)
|
||
-IDTVEC(xmm)
|
||
- TRAP(T_XMMFLT)
|
||
+ .endm
|
||
|
||
-/* This group of traps have tf_err already pushed by the cpu */
|
||
-#define TRAP_ERR(a) \
|
||
- subq $TF_ERR,%rsp; \
|
||
- movl $(a),TF_TRAPNO(%rsp) ; \
|
||
- movq $0,TF_ADDR(%rsp) ; \
|
||
+ TRAP div, T_DIVIDE
|
||
+ TRAP ofl, T_OFLOW
|
||
+ TRAP bnd, T_BOUND
|
||
+ TRAP ill, T_PRIVINFLT
|
||
+ TRAP dna, T_DNA
|
||
+ TRAP fpusegm, T_FPOPFLT
|
||
+ TRAP rsvd, T_RESERVED
|
||
+ TRAP fpu, T_ARITHTRAP
|
||
+ TRAP xmm, T_XMMFLT
|
||
+
|
||
+/* This group of traps have tf_err already pushed by the cpu. */
|
||
+ .macro TRAP_ERR l, trapno
|
||
+ PTI_ENTRY \l,X\l,has_err=1
|
||
+ .globl X\l
|
||
+ .type X\l,@function
|
||
+X\l:
|
||
+ subq $TF_ERR,%rsp
|
||
+ movl $\trapno,TF_TRAPNO(%rsp)
|
||
+ movq $0,TF_ADDR(%rsp)
|
||
jmp alltraps
|
||
-IDTVEC(tss)
|
||
- TRAP_ERR(T_TSSFLT)
|
||
-IDTVEC(missing)
|
||
- subq $TF_ERR,%rsp
|
||
- movl $T_SEGNPFLT,TF_TRAPNO(%rsp)
|
||
- jmp prot_addrf
|
||
-IDTVEC(stk)
|
||
- subq $TF_ERR,%rsp
|
||
- movl $T_STKFLT,TF_TRAPNO(%rsp)
|
||
- jmp prot_addrf
|
||
-IDTVEC(align)
|
||
- TRAP_ERR(T_ALIGNFLT)
|
||
+ .endm
|
||
|
||
+ TRAP_ERR tss, T_TSSFLT
|
||
+ TRAP_ERR align, T_ALIGNFLT
|
||
+
|
||
/*
|
||
* alltraps entry point. Use swapgs if this is the first time in the
|
||
* kernel from userland. Reenable interrupts if they were enabled
|
||
@@ -174,25 +171,24 @@
|
||
alltraps:
|
||
movq %rdi,TF_RDI(%rsp)
|
||
testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
|
||
- jz alltraps_testi /* already running with kernel GS.base */
|
||
+ jz 1f /* already running with kernel GS.base */
|
||
swapgs
|
||
movq PCPU(CURPCB),%rdi
|
||
andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi)
|
||
- movw %fs,TF_FS(%rsp)
|
||
- movw %gs,TF_GS(%rsp)
|
||
- movw %es,TF_ES(%rsp)
|
||
- movw %ds,TF_DS(%rsp)
|
||
-alltraps_testi:
|
||
- testl $PSL_I,TF_RFLAGS(%rsp)
|
||
- jz alltraps_pushregs_no_rdi
|
||
+1: SAVE_SEGS
|
||
+ movq %rdx,TF_RDX(%rsp)
|
||
+ movq %rax,TF_RAX(%rsp)
|
||
+ movq %rcx,TF_RCX(%rsp)
|
||
+ testb $SEL_RPL_MASK,TF_CS(%rsp)
|
||
+ jz 2f
|
||
+ call handle_ibrs_entry
|
||
+2: testl $PSL_I,TF_RFLAGS(%rsp)
|
||
+ jz alltraps_pushregs_no_rax
|
||
sti
|
||
-alltraps_pushregs_no_rdi:
|
||
+alltraps_pushregs_no_rax:
|
||
movq %rsi,TF_RSI(%rsp)
|
||
- movq %rdx,TF_RDX(%rsp)
|
||
- movq %rcx,TF_RCX(%rsp)
|
||
movq %r8,TF_R8(%rsp)
|
||
movq %r9,TF_R9(%rsp)
|
||
- movq %rax,TF_RAX(%rsp)
|
||
movq %rbx,TF_RBX(%rsp)
|
||
movq %rbp,TF_RBP(%rsp)
|
||
movq %r10,TF_R10(%rsp)
|
||
@@ -248,15 +244,18 @@
|
||
alltraps_noen:
|
||
movq %rdi,TF_RDI(%rsp)
|
||
testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
|
||
- jz 1f /* already running with kernel GS.base */
|
||
+ jz 1f /* already running with kernel GS.base */
|
||
swapgs
|
||
movq PCPU(CURPCB),%rdi
|
||
andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi)
|
||
-1: movw %fs,TF_FS(%rsp)
|
||
- movw %gs,TF_GS(%rsp)
|
||
- movw %es,TF_ES(%rsp)
|
||
- movw %ds,TF_DS(%rsp)
|
||
- jmp alltraps_pushregs_no_rdi
|
||
+1: SAVE_SEGS
|
||
+ movq %rdx,TF_RDX(%rsp)
|
||
+ movq %rax,TF_RAX(%rsp)
|
||
+ movq %rcx,TF_RCX(%rsp)
|
||
+ testb $SEL_RPL_MASK,TF_CS(%rsp)
|
||
+ jz alltraps_pushregs_no_rax
|
||
+ call handle_ibrs_entry
|
||
+ jmp alltraps_pushregs_no_rax
|
||
|
||
IDTVEC(dblfault)
|
||
subq $TF_ERR,%rsp
|
||
@@ -278,10 +277,7 @@
|
||
movq %r13,TF_R13(%rsp)
|
||
movq %r14,TF_R14(%rsp)
|
||
movq %r15,TF_R15(%rsp)
|
||
- movw %fs,TF_FS(%rsp)
|
||
- movw %gs,TF_GS(%rsp)
|
||
- movw %es,TF_ES(%rsp)
|
||
- movw %ds,TF_DS(%rsp)
|
||
+ SAVE_SEGS
|
||
movl $TF_HASSEGS,TF_FLAGS(%rsp)
|
||
cld
|
||
testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
|
||
@@ -288,31 +284,54 @@
|
||
jz 1f /* already running with kernel GS.base */
|
||
swapgs
|
||
1:
|
||
- movq %rsp,%rdi
|
||
+ movq PCPU(KCR3),%rax
|
||
+ cmpq $~0,%rax
|
||
+ je 2f
|
||
+ movq %rax,%cr3
|
||
+2: movq %rsp,%rdi
|
||
call dblfault_handler
|
||
-2:
|
||
- hlt
|
||
- jmp 2b
|
||
+3: hlt
|
||
+ jmp 3b
|
||
|
||
+ ALIGN_TEXT
|
||
+IDTVEC(page_pti)
|
||
+ testb $SEL_RPL_MASK,PTI_CS-2*8(%rsp)
|
||
+ jz Xpage
|
||
+ swapgs
|
||
+ pushq %rax
|
||
+ pushq %rdx
|
||
+ movq %cr3,%rax
|
||
+ movq %rax,PCPU(SAVED_UCR3)
|
||
+ PTI_UUENTRY has_err=1
|
||
+ subq $TF_ERR,%rsp
|
||
+ movq %rdi,TF_RDI(%rsp)
|
||
+ movq %rax,TF_RAX(%rsp)
|
||
+ movq %rdx,TF_RDX(%rsp)
|
||
+ movq %rcx,TF_RCX(%rsp)
|
||
+ jmp page_u
|
||
IDTVEC(page)
|
||
subq $TF_ERR,%rsp
|
||
- movl $T_PAGEFLT,TF_TRAPNO(%rsp)
|
||
- movq %rdi,TF_RDI(%rsp) /* free up a GP register */
|
||
+ movq %rdi,TF_RDI(%rsp) /* free up GP registers */
|
||
+ movq %rax,TF_RAX(%rsp)
|
||
+ movq %rdx,TF_RDX(%rsp)
|
||
+ movq %rcx,TF_RCX(%rsp)
|
||
testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
|
||
- jz 1f /* already running with kernel GS.base */
|
||
+ jz page_cr2 /* already running with kernel GS.base */
|
||
swapgs
|
||
- movq PCPU(CURPCB),%rdi
|
||
+page_u: movq PCPU(CURPCB),%rdi
|
||
andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi)
|
||
-1: movq %cr2,%rdi /* preserve %cr2 before .. */
|
||
+ movq PCPU(SAVED_UCR3),%rax
|
||
+ movq %rax,PCB_SAVED_UCR3(%rdi)
|
||
+ call handle_ibrs_entry
|
||
+page_cr2:
|
||
+ movq %cr2,%rdi /* preserve %cr2 before .. */
|
||
movq %rdi,TF_ADDR(%rsp) /* enabling interrupts. */
|
||
- movw %fs,TF_FS(%rsp)
|
||
- movw %gs,TF_GS(%rsp)
|
||
- movw %es,TF_ES(%rsp)
|
||
- movw %ds,TF_DS(%rsp)
|
||
+ SAVE_SEGS
|
||
+ movl $T_PAGEFLT,TF_TRAPNO(%rsp)
|
||
testl $PSL_I,TF_RFLAGS(%rsp)
|
||
- jz alltraps_pushregs_no_rdi
|
||
+ jz alltraps_pushregs_no_rax
|
||
sti
|
||
- jmp alltraps_pushregs_no_rdi
|
||
+ jmp alltraps_pushregs_no_rax
|
||
|
||
/*
|
||
* We have to special-case this one. If we get a trap in doreti() at
|
||
@@ -319,30 +338,71 @@
|
||
* the iretq stage, we'll reenter with the wrong gs state. We'll have
|
||
* to do a special the swapgs in this case even coming from the kernel.
|
||
* XXX linux has a trap handler for their equivalent of load_gs().
|
||
+ *
|
||
+ * On the stack, we have the hardware interrupt frame to return
|
||
+ * to usermode (faulted) and another frame with error code, for
|
||
+ * fault. For PTI, copy both frames to the main thread stack.
|
||
*/
|
||
-IDTVEC(prot)
|
||
+ .macro PROTF_ENTRY name,trapno
|
||
+\name\()_pti_doreti:
|
||
+ pushq %rax
|
||
+ pushq %rdx
|
||
+ swapgs
|
||
+ movq PCPU(KCR3),%rax
|
||
+ movq %rax,%cr3
|
||
+ movq PCPU(RSP0),%rax
|
||
+ subq $2*PTI_SIZE-3*8,%rax /* no err, %rax, %rdx in faulted frame */
|
||
+ MOVE_STACKS (PTI_SIZE / 4 - 3)
|
||
+ movq %rax,%rsp
|
||
+ popq %rdx
|
||
+ popq %rax
|
||
+ swapgs
|
||
+ jmp X\name
|
||
+IDTVEC(\name\()_pti)
|
||
+ cmpq $doreti_iret,PTI_RIP-2*8(%rsp)
|
||
+ je \name\()_pti_doreti
|
||
+ testb $SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */
|
||
+ jz X\name
|
||
+ PTI_UENTRY has_err=1
|
||
+ swapgs
|
||
+IDTVEC(\name)
|
||
subq $TF_ERR,%rsp
|
||
- movl $T_PROTFLT,TF_TRAPNO(%rsp)
|
||
+ movl $\trapno,TF_TRAPNO(%rsp)
|
||
+ jmp prot_addrf
|
||
+ .endm
|
||
+
|
||
+ PROTF_ENTRY missing, T_SEGNPFLT
|
||
+ PROTF_ENTRY stk, T_STKFLT
|
||
+ PROTF_ENTRY prot, T_PROTFLT
|
||
+
|
||
prot_addrf:
|
||
movq $0,TF_ADDR(%rsp)
|
||
movq %rdi,TF_RDI(%rsp) /* free up a GP register */
|
||
+ movq %rax,TF_RAX(%rsp)
|
||
+ movq %rdx,TF_RDX(%rsp)
|
||
+ movq %rcx,TF_RCX(%rsp)
|
||
+ movw %fs,TF_FS(%rsp)
|
||
+ movw %gs,TF_GS(%rsp)
|
||
leaq doreti_iret(%rip),%rdi
|
||
cmpq %rdi,TF_RIP(%rsp)
|
||
- je 1f /* kernel but with user gsbase!! */
|
||
+ je 5f /* kernel but with user gsbase!! */
|
||
testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
|
||
- jz 2f /* already running with kernel GS.base */
|
||
-1: swapgs
|
||
-2: movq PCPU(CURPCB),%rdi
|
||
+ jz 6f /* already running with kernel GS.base */
|
||
+ swapgs
|
||
+ movq PCPU(CURPCB),%rdi
|
||
+4: call handle_ibrs_entry
|
||
orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) /* always full iret from GPF */
|
||
- movw %fs,TF_FS(%rsp)
|
||
- movw %gs,TF_GS(%rsp)
|
||
movw %es,TF_ES(%rsp)
|
||
movw %ds,TF_DS(%rsp)
|
||
testl $PSL_I,TF_RFLAGS(%rsp)
|
||
- jz alltraps_pushregs_no_rdi
|
||
+ jz alltraps_pushregs_no_rax
|
||
sti
|
||
- jmp alltraps_pushregs_no_rdi
|
||
+ jmp alltraps_pushregs_no_rax
|
||
|
||
+5: swapgs
|
||
+6: movq PCPU(CURPCB),%rdi
|
||
+ jmp 4b
|
||
+
|
||
/*
|
||
* Fast syscall entry point. We enter here with just our new %cs/%ss set,
|
||
* and the new privilige level. We are still running on the old user stack
|
||
@@ -352,8 +412,18 @@
|
||
* We do not support invoking this from a custom %cs or %ss (e.g. using
|
||
* entries from an LDT).
|
||
*/
|
||
+ SUPERALIGN_TEXT
|
||
+IDTVEC(fast_syscall_pti)
|
||
+ swapgs
|
||
+ movq %rax,PCPU(SCRATCH_RAX)
|
||
+ movq PCPU(KCR3),%rax
|
||
+ movq %rax,%cr3
|
||
+ jmp fast_syscall_common
|
||
+ SUPERALIGN_TEXT
|
||
IDTVEC(fast_syscall)
|
||
swapgs
|
||
+ movq %rax,PCPU(SCRATCH_RAX)
|
||
+fast_syscall_common:
|
||
movq %rsp,PCPU(SCRATCH_RSP)
|
||
movq PCPU(RSP0),%rsp
|
||
/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
|
||
@@ -363,10 +433,11 @@
|
||
movq %rcx,TF_RIP(%rsp) /* %rcx original value is in %r10 */
|
||
movq PCPU(SCRATCH_RSP),%r11 /* %r11 already saved */
|
||
movq %r11,TF_RSP(%rsp) /* user stack pointer */
|
||
- movw %fs,TF_FS(%rsp)
|
||
- movw %gs,TF_GS(%rsp)
|
||
- movw %es,TF_ES(%rsp)
|
||
- movw %ds,TF_DS(%rsp)
|
||
+ movq PCPU(SCRATCH_RAX),%rax
|
||
+ movq %rax,TF_RAX(%rsp) /* syscall number */
|
||
+ movq %rdx,TF_RDX(%rsp) /* arg 3 */
|
||
+ SAVE_SEGS
|
||
+ call handle_ibrs_entry
|
||
movq PCPU(CURPCB),%r11
|
||
andl $~PCB_FULL_IRET,PCB_FLAGS(%r11)
|
||
sti
|
||
@@ -375,11 +446,9 @@
|
||
movq $2,TF_ERR(%rsp)
|
||
movq %rdi,TF_RDI(%rsp) /* arg 1 */
|
||
movq %rsi,TF_RSI(%rsp) /* arg 2 */
|
||
- movq %rdx,TF_RDX(%rsp) /* arg 3 */
|
||
movq %r10,TF_RCX(%rsp) /* arg 4 */
|
||
movq %r8,TF_R8(%rsp) /* arg 5 */
|
||
movq %r9,TF_R9(%rsp) /* arg 6 */
|
||
- movq %rax,TF_RAX(%rsp) /* syscall number */
|
||
movq %rbx,TF_RBX(%rsp) /* C preserved */
|
||
movq %rbp,TF_RBP(%rsp) /* C preserved */
|
||
movq %r12,TF_R12(%rsp) /* C preserved */
|
||
@@ -398,11 +467,12 @@
|
||
/* Disable interrupts before testing PCB_FULL_IRET. */
|
||
cli
|
||
testl $PCB_FULL_IRET,PCB_FLAGS(%rax)
|
||
- jnz 3f
|
||
+ jnz 4f
|
||
/* Check for and handle AST's on return to userland. */
|
||
movq PCPU(CURTHREAD),%rax
|
||
testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax)
|
||
- jne 2f
|
||
+ jne 3f
|
||
+ call handle_ibrs_exit
|
||
/* Restore preserved registers. */
|
||
MEXITCOUNT
|
||
movq TF_RDI(%rsp),%rdi /* bonus; preserve arg 1 */
|
||
@@ -412,16 +482,21 @@
|
||
movq TF_RFLAGS(%rsp),%r11 /* original %rflags */
|
||
movq TF_RIP(%rsp),%rcx /* original %rip */
|
||
movq TF_RSP(%rsp),%rsp /* user stack pointer */
|
||
- swapgs
|
||
+ cmpb $0,pti
|
||
+ je 2f
|
||
+ movq PCPU(UCR3),%r9
|
||
+ movq %r9,%cr3
|
||
+ xorl %r9d,%r9d
|
||
+2: swapgs
|
||
sysretq
|
||
|
||
-2: /* AST scheduled. */
|
||
+3: /* AST scheduled. */
|
||
sti
|
||
movq %rsp,%rdi
|
||
call ast
|
||
jmp 1b
|
||
|
||
-3: /* Requested full context restore, use doreti for that. */
|
||
+4: /* Requested full context restore, use doreti for that. */
|
||
MEXITCOUNT
|
||
jmp doreti
|
||
|
||
@@ -477,10 +552,7 @@
|
||
movq %r13,TF_R13(%rsp)
|
||
movq %r14,TF_R14(%rsp)
|
||
movq %r15,TF_R15(%rsp)
|
||
- movw %fs,TF_FS(%rsp)
|
||
- movw %gs,TF_GS(%rsp)
|
||
- movw %es,TF_ES(%rsp)
|
||
- movw %ds,TF_DS(%rsp)
|
||
+ SAVE_SEGS
|
||
movl $TF_HASSEGS,TF_FLAGS(%rsp)
|
||
cld
|
||
xorl %ebx,%ebx
|
||
@@ -487,7 +559,8 @@
|
||
testb $SEL_RPL_MASK,TF_CS(%rsp)
|
||
jnz nmi_fromuserspace
|
||
/*
|
||
- * We've interrupted the kernel. Preserve GS.base in %r12.
|
||
+ * We've interrupted the kernel. Preserve GS.base in %r12,
|
||
+ * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTRL in %r14d.
|
||
*/
|
||
movl $MSR_GSBASE,%ecx
|
||
rdmsr
|
||
@@ -499,10 +572,32 @@
|
||
movl %edx,%eax
|
||
shrq $32,%rdx
|
||
wrmsr
|
||
+ movq %cr3,%r13
|
||
+ movq PCPU(KCR3),%rax
|
||
+ cmpq $~0,%rax
|
||
+ je 1f
|
||
+ movq %rax,%cr3
|
||
+1: testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
|
||
+ je nmi_calltrap
|
||
+ movl $MSR_IA32_SPEC_CTRL,%ecx
|
||
+ rdmsr
|
||
+ movl %eax,%r14d
|
||
+ call handle_ibrs_entry
|
||
jmp nmi_calltrap
|
||
nmi_fromuserspace:
|
||
incl %ebx
|
||
swapgs
|
||
+ movq %cr3,%r13
|
||
+ movq PCPU(KCR3),%rax
|
||
+ cmpq $~0,%rax
|
||
+ je 1f
|
||
+ movq %rax,%cr3
|
||
+1: call handle_ibrs_entry
|
||
+ movq PCPU(CURPCB),%rdi
|
||
+ testq %rdi,%rdi
|
||
+ jz 3f
|
||
+ orl $PCB_FULL_IRET,PCB_FLAGS(%rdi)
|
||
+3:
|
||
/* Note: this label is also used by ddb and gdb: */
|
||
nmi_calltrap:
|
||
FAKE_MCOUNT(TF_RIP(%rsp))
|
||
@@ -525,14 +620,9 @@
|
||
movq PCPU(CURTHREAD),%rax
|
||
orq %rax,%rax /* curthread present? */
|
||
jz nocallchain
|
||
- testl $TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
|
||
- jz nocallchain
|
||
/*
|
||
- * A user callchain is to be captured, so:
|
||
- * - Move execution to the regular kernel stack, to allow for
|
||
- * nested NMI interrupts.
|
||
- * - Take the processor out of "NMI" mode by faking an "iret".
|
||
- * - Enable interrupts, so that copyin() can work.
|
||
+ * Move execution to the regular kernel stack, because we
|
||
+ * committed to return through doreti.
|
||
*/
|
||
movq %rsp,%rsi /* source stack pointer */
|
||
movq $TF_SIZE,%rcx
|
||
@@ -539,12 +629,20 @@
|
||
movq PCPU(RSP0),%rdx
|
||
subq %rcx,%rdx
|
||
movq %rdx,%rdi /* destination stack pointer */
|
||
-
|
||
shrq $3,%rcx /* trap frame size in long words */
|
||
cld
|
||
rep
|
||
movsq /* copy trapframe */
|
||
+ movq %rdx,%rsp /* we are on the regular kstack */
|
||
|
||
+ testl $TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
|
||
+ jz nocallchain
|
||
+ /*
|
||
+ * A user callchain is to be captured, so:
|
||
+ * - Take the processor out of "NMI" mode by faking an "iret",
|
||
+ * to allow for nested NMI interrupts.
|
||
+ * - Enable interrupts, so that copyin() can work.
|
||
+ */
|
||
movl %ss,%eax
|
||
pushq %rax /* tf_ss */
|
||
pushq %rdx /* tf_rsp (on kernel stack) */
|
||
@@ -574,33 +672,139 @@
|
||
cli
|
||
nocallchain:
|
||
#endif
|
||
- testl %ebx,%ebx
|
||
+ testl %ebx,%ebx /* %ebx != 0 => return to userland */
|
||
jnz doreti_exit
|
||
-nmi_kernelexit:
|
||
/*
|
||
+ * Restore speculation control MSR, if preserved.
|
||
+ */
|
||
+ testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
|
||
+ je 1f
|
||
+ movl %r14d,%eax
|
||
+ xorl %edx,%edx
|
||
+ movl $MSR_IA32_SPEC_CTRL,%ecx
|
||
+ wrmsr
|
||
+ /*
|
||
* Put back the preserved MSR_GSBASE value.
|
||
*/
|
||
+1: movl $MSR_GSBASE,%ecx
|
||
+ movq %r12,%rdx
|
||
+ movl %edx,%eax
|
||
+ shrq $32,%rdx
|
||
+ wrmsr
|
||
+ movq %r13,%cr3
|
||
+ RESTORE_REGS
|
||
+ addq $TF_RIP,%rsp
|
||
+ jmp doreti_iret
|
||
+
|
||
+/*
|
||
+ * MC# handling is similar to NMI.
|
||
+ *
|
||
+ * As with NMIs, machine check exceptions do not respect RFLAGS.IF and
|
||
+ * can occur at any time with a GS.base value that does not correspond
|
||
+ * to the privilege level in CS.
|
||
+ *
|
||
+ * Machine checks are not unblocked by iretq, but it is best to run
|
||
+ * the handler with interrupts disabled since the exception may have
|
||
+ * interrupted a critical section.
|
||
+ *
|
||
+ * The MC# handler runs on its own stack (tss_ist3). The canonical
|
||
+ * GS.base value for the processor is stored just above the bottom of
|
||
+ * its MC# stack. For exceptions taken from kernel mode, the current
|
||
+ * value in the processor's GS.base is saved at entry to C-preserved
|
||
+ * register %r12, the canonical value for GS.base is then loaded into
|
||
+ * the processor, and the saved value is restored at exit time. For
|
||
+ * exceptions taken from user mode, the cheaper 'SWAPGS' instructions
|
||
+ * are used for swapping GS.base.
|
||
+ */
|
||
+
|
||
+IDTVEC(mchk)
|
||
+ subq $TF_RIP,%rsp
|
||
+ movl $(T_MCHK),TF_TRAPNO(%rsp)
|
||
+ movq $0,TF_ADDR(%rsp)
|
||
+ movq $0,TF_ERR(%rsp)
|
||
+ movq %rdi,TF_RDI(%rsp)
|
||
+ movq %rsi,TF_RSI(%rsp)
|
||
+ movq %rdx,TF_RDX(%rsp)
|
||
+ movq %rcx,TF_RCX(%rsp)
|
||
+ movq %r8,TF_R8(%rsp)
|
||
+ movq %r9,TF_R9(%rsp)
|
||
+ movq %rax,TF_RAX(%rsp)
|
||
+ movq %rbx,TF_RBX(%rsp)
|
||
+ movq %rbp,TF_RBP(%rsp)
|
||
+ movq %r10,TF_R10(%rsp)
|
||
+ movq %r11,TF_R11(%rsp)
|
||
+ movq %r12,TF_R12(%rsp)
|
||
+ movq %r13,TF_R13(%rsp)
|
||
+ movq %r14,TF_R14(%rsp)
|
||
+ movq %r15,TF_R15(%rsp)
|
||
+ SAVE_SEGS
|
||
+ movl $TF_HASSEGS,TF_FLAGS(%rsp)
|
||
+ cld
|
||
+ xorl %ebx,%ebx
|
||
+ testb $SEL_RPL_MASK,TF_CS(%rsp)
|
||
+ jnz mchk_fromuserspace
|
||
+ /*
|
||
+ * We've interrupted the kernel. Preserve GS.base in %r12,
|
||
+ * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTRL in %r14d.
|
||
+ */
|
||
movl $MSR_GSBASE,%ecx
|
||
+ rdmsr
|
||
+ movq %rax,%r12
|
||
+ shlq $32,%rdx
|
||
+ orq %rdx,%r12
|
||
+ /* Retrieve and load the canonical value for GS.base. */
|
||
+ movq TF_SIZE(%rsp),%rdx
|
||
+ movl %edx,%eax
|
||
+ shrq $32,%rdx
|
||
+ wrmsr
|
||
+ movq %cr3,%r13
|
||
+ movq PCPU(KCR3),%rax
|
||
+ cmpq $~0,%rax
|
||
+ je 1f
|
||
+ movq %rax,%cr3
|
||
+1: testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
|
||
+ je mchk_calltrap
|
||
+ movl $MSR_IA32_SPEC_CTRL,%ecx
|
||
+ rdmsr
|
||
+ movl %eax,%r14d
|
||
+ call handle_ibrs_entry
|
||
+ jmp mchk_calltrap
|
||
+mchk_fromuserspace:
|
||
+ incl %ebx
|
||
+ swapgs
|
||
+ movq %cr3,%r13
|
||
+ movq PCPU(KCR3),%rax
|
||
+ cmpq $~0,%rax
|
||
+ je 1f
|
||
+ movq %rax,%cr3
|
||
+1: call handle_ibrs_entry
|
||
+/* Note: this label is also used by ddb and gdb: */
|
||
+mchk_calltrap:
|
||
+ FAKE_MCOUNT(TF_RIP(%rsp))
|
||
+ movq %rsp,%rdi
|
||
+ call mca_intr
|
||
+ MEXITCOUNT
|
||
+ testl %ebx,%ebx /* %ebx != 0 => return to userland */
|
||
+ jnz doreti_exit
|
||
+ /*
|
||
+ * Restore speculation control MSR, if preserved.
|
||
+ */
|
||
+ testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
|
||
+ je 1f
|
||
+ movl %r14d,%eax
|
||
+ xorl %edx,%edx
|
||
+ movl $MSR_IA32_SPEC_CTRL,%ecx
|
||
+ wrmsr
|
||
+ /*
|
||
+ * Put back the preserved MSR_GSBASE value.
|
||
+ */
|
||
+1: movl $MSR_GSBASE,%ecx
|
||
movq %r12,%rdx
|
||
movl %edx,%eax
|
||
shrq $32,%rdx
|
||
wrmsr
|
||
-nmi_restoreregs:
|
||
- movq TF_RDI(%rsp),%rdi
|
||
- movq TF_RSI(%rsp),%rsi
|
||
- movq TF_RDX(%rsp),%rdx
|
||
- movq TF_RCX(%rsp),%rcx
|
||
- movq TF_R8(%rsp),%r8
|
||
- movq TF_R9(%rsp),%r9
|
||
- movq TF_RAX(%rsp),%rax
|
||
- movq TF_RBX(%rsp),%rbx
|
||
- movq TF_RBP(%rsp),%rbp
|
||
- movq TF_R10(%rsp),%r10
|
||
- movq TF_R11(%rsp),%r11
|
||
- movq TF_R12(%rsp),%r12
|
||
- movq TF_R13(%rsp),%r13
|
||
- movq TF_R14(%rsp),%r14
|
||
- movq TF_R15(%rsp),%r15
|
||
+ movq %r13,%cr3
|
||
+ RESTORE_REGS
|
||
addq $TF_RIP,%rsp
|
||
jmp doreti_iret
|
||
|
||
@@ -767,27 +971,39 @@
|
||
ld_ds:
|
||
movw TF_DS(%rsp),%ds
|
||
ld_regs:
|
||
- movq TF_RDI(%rsp),%rdi
|
||
- movq TF_RSI(%rsp),%rsi
|
||
- movq TF_RDX(%rsp),%rdx
|
||
- movq TF_RCX(%rsp),%rcx
|
||
- movq TF_R8(%rsp),%r8
|
||
- movq TF_R9(%rsp),%r9
|
||
- movq TF_RAX(%rsp),%rax
|
||
- movq TF_RBX(%rsp),%rbx
|
||
- movq TF_RBP(%rsp),%rbp
|
||
- movq TF_R10(%rsp),%r10
|
||
- movq TF_R11(%rsp),%r11
|
||
- movq TF_R12(%rsp),%r12
|
||
- movq TF_R13(%rsp),%r13
|
||
- movq TF_R14(%rsp),%r14
|
||
- movq TF_R15(%rsp),%r15
|
||
+ RESTORE_REGS
|
||
testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
|
||
- jz 1f /* keep running with kernel GS.base */
|
||
+ jz 2f /* keep running with kernel GS.base */
|
||
cli
|
||
+ call handle_ibrs_exit_rs
|
||
+ cmpb $0,pti
|
||
+ je 1f
|
||
+ pushq %rdx
|
||
+ movq PCPU(PRVSPACE),%rdx
|
||
+ addq $PC_PTI_STACK+PC_PTI_STACK_SZ*8-PTI_SIZE,%rdx
|
||
+ movq %rax,PTI_RAX(%rdx)
|
||
+ popq %rax
|
||
+ movq %rax,PTI_RDX(%rdx)
|
||
+ movq TF_RIP(%rsp),%rax
|
||
+ movq %rax,PTI_RIP(%rdx)
|
||
+ movq TF_CS(%rsp),%rax
|
||
+ movq %rax,PTI_CS(%rdx)
|
||
+ movq TF_RFLAGS(%rsp),%rax
|
||
+ movq %rax,PTI_RFLAGS(%rdx)
|
||
+ movq TF_RSP(%rsp),%rax
|
||
+ movq %rax,PTI_RSP(%rdx)
|
||
+ movq TF_SS(%rsp),%rax
|
||
+ movq %rax,PTI_SS(%rdx)
|
||
+ movq PCPU(UCR3),%rax
|
||
swapgs
|
||
-1:
|
||
- addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */
|
||
+ movq %rdx,%rsp
|
||
+ movq %rax,%cr3
|
||
+ popq %rdx
|
||
+ popq %rax
|
||
+ addq $8,%rsp
|
||
+ jmp doreti_iret
|
||
+1: swapgs
|
||
+2: addq $TF_RIP,%rsp
|
||
.globl doreti_iret
|
||
doreti_iret:
|
||
iretq
|
||
@@ -811,22 +1027,20 @@
|
||
.globl doreti_iret_fault
|
||
doreti_iret_fault:
|
||
subq $TF_RIP,%rsp /* space including tf_err, tf_trapno */
|
||
- testl $PSL_I,TF_RFLAGS(%rsp)
|
||
+ movq %rax,TF_RAX(%rsp)
|
||
+ movq %rdx,TF_RDX(%rsp)
|
||
+ movq %rcx,TF_RCX(%rsp)
|
||
+ call handle_ibrs_entry
|
||
+ testb $SEL_RPL_MASK,TF_CS(%rsp)
|
||
jz 1f
|
||
sti
|
||
1:
|
||
- movw %fs,TF_FS(%rsp)
|
||
- movw %gs,TF_GS(%rsp)
|
||
- movw %es,TF_ES(%rsp)
|
||
- movw %ds,TF_DS(%rsp)
|
||
+ SAVE_SEGS
|
||
movl $TF_HASSEGS,TF_FLAGS(%rsp)
|
||
movq %rdi,TF_RDI(%rsp)
|
||
movq %rsi,TF_RSI(%rsp)
|
||
- movq %rdx,TF_RDX(%rsp)
|
||
- movq %rcx,TF_RCX(%rsp)
|
||
movq %r8,TF_R8(%rsp)
|
||
movq %r9,TF_R9(%rsp)
|
||
- movq %rax,TF_RAX(%rsp)
|
||
movq %rbx,TF_RBX(%rsp)
|
||
movq %rbp,TF_RBP(%rsp)
|
||
movq %r10,TF_R10(%rsp)
|
||
@@ -845,7 +1059,7 @@
|
||
.globl ds_load_fault
|
||
ds_load_fault:
|
||
movl $T_PROTFLT,TF_TRAPNO(%rsp)
|
||
- testl $PSL_I,TF_RFLAGS(%rsp)
|
||
+ testb $SEL_RPL_MASK,TF_CS(%rsp)
|
||
jz 1f
|
||
sti
|
||
1:
|
||
--- sys/amd64/amd64/genassym.c.orig
|
||
+++ sys/amd64/amd64/genassym.c
|
||
@@ -145,6 +145,7 @@
|
||
ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr));
|
||
ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
|
||
ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault));
|
||
+ASSYM(PCB_SAVED_UCR3, offsetof(struct pcb, pcb_saved_ucr3));
|
||
ASSYM(PCB_TSSP, offsetof(struct pcb, pcb_tssp));
|
||
ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
|
||
ASSYM(PCB_EFER, offsetof(struct pcb, pcb_efer));
|
||
@@ -190,6 +191,16 @@
|
||
ASSYM(TF_SIZE, sizeof(struct trapframe));
|
||
ASSYM(TF_HASSEGS, TF_HASSEGS);
|
||
|
||
+ASSYM(PTI_RDX, offsetof(struct pti_frame, pti_rdx));
|
||
+ASSYM(PTI_RAX, offsetof(struct pti_frame, pti_rax));
|
||
+ASSYM(PTI_ERR, offsetof(struct pti_frame, pti_err));
|
||
+ASSYM(PTI_RIP, offsetof(struct pti_frame, pti_rip));
|
||
+ASSYM(PTI_CS, offsetof(struct pti_frame, pti_cs));
|
||
+ASSYM(PTI_RFLAGS, offsetof(struct pti_frame, pti_rflags));
|
||
+ASSYM(PTI_RSP, offsetof(struct pti_frame, pti_rsp));
|
||
+ASSYM(PTI_SS, offsetof(struct pti_frame, pti_ss));
|
||
+ASSYM(PTI_SIZE, sizeof(struct pti_frame));
|
||
+
|
||
ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler));
|
||
ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc));
|
||
ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_rflags));
|
||
@@ -206,6 +217,7 @@
|
||
ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb));
|
||
ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
|
||
ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp));
|
||
+ASSYM(PC_SCRATCH_RAX, offsetof(struct pcpu, pc_scratch_rax));
|
||
ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap));
|
||
ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp));
|
||
ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0));
|
||
@@ -215,6 +227,12 @@
|
||
ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
|
||
ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
|
||
ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt));
|
||
+ASSYM(PC_KCR3, offsetof(struct pcpu, pc_kcr3));
|
||
+ASSYM(PC_UCR3, offsetof(struct pcpu, pc_ucr3));
|
||
+ASSYM(PC_SAVED_UCR3, offsetof(struct pcpu, pc_saved_ucr3));
|
||
+ASSYM(PC_PTI_STACK, offsetof(struct pcpu, pc_pti_stack));
|
||
+ASSYM(PC_PTI_STACK_SZ, PC_PTI_STACK_SZ);
|
||
+ASSYM(PC_IBPB_SET, offsetof(struct pcpu, pc_ibpb_set));
|
||
|
||
ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL);
|
||
ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL);
|
||
--- sys/amd64/amd64/initcpu.c.orig
|
||
+++ sys/amd64/amd64/initcpu.c
|
||
@@ -194,6 +194,7 @@
|
||
wrmsr(MSR_EFER, msr);
|
||
pg_nx = PG_NX;
|
||
}
|
||
+ hw_ibrs_recalculate();
|
||
switch (cpu_vendor_id) {
|
||
case CPU_VENDOR_AMD:
|
||
init_amd();
|
||
--- sys/amd64/amd64/machdep.c.orig
|
||
+++ sys/amd64/amd64/machdep.c
|
||
@@ -114,6 +114,7 @@
|
||
#include <machine/clock.h>
|
||
#include <machine/cpu.h>
|
||
#include <machine/cputypes.h>
|
||
+#include <machine/frame.h>
|
||
#include <machine/intr_machdep.h>
|
||
#include <x86/mca.h>
|
||
#include <machine/md_var.h>
|
||
@@ -149,6 +150,14 @@
|
||
/* Sanity check for __curthread() */
|
||
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
|
||
|
||
+/*
|
||
+ * The PTI trampoline stack needs enough space for a hardware trapframe and a
|
||
+ * couple of scratch registers, as well as the trapframe left behind after an
|
||
+ * iret fault.
|
||
+ */
|
||
+CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
|
||
+ offsetof(struct pti_frame, pti_rip));
|
||
+
|
||
extern u_int64_t hammer_time(u_int64_t, u_int64_t);
|
||
|
||
#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
|
||
@@ -180,12 +189,6 @@
|
||
.msi_init = msi_init,
|
||
};
|
||
|
||
-/*
|
||
- * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its value is
|
||
- * the physical address at which the kernel is loaded.
|
||
- */
|
||
-extern char kernphys[];
|
||
-
|
||
struct msgbuf *msgbufp;
|
||
|
||
/*
|
||
@@ -670,7 +673,7 @@
|
||
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
|
||
|
||
static char dblfault_stack[PAGE_SIZE] __aligned(16);
|
||
-
|
||
+static char mce0_stack[PAGE_SIZE] __aligned(16);
|
||
static char nmi0_stack[PAGE_SIZE] __aligned(16);
|
||
CTASSERT(sizeof(struct nmi_pcpu) == 16);
|
||
|
||
@@ -824,13 +827,20 @@
|
||
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
|
||
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
|
||
IDTVEC(xmm), IDTVEC(dblfault),
|
||
+ IDTVEC(div_pti), IDTVEC(dbg_pti), IDTVEC(bpt_pti),
|
||
+ IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
|
||
+ IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
|
||
+ IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
|
||
+ IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
|
||
+ IDTVEC(xmm_pti),
|
||
#ifdef KDTRACE_HOOKS
|
||
- IDTVEC(dtrace_ret),
|
||
+ IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
|
||
#endif
|
||
#ifdef XENHVM
|
||
- IDTVEC(xen_intr_upcall),
|
||
+ IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
|
||
#endif
|
||
- IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
|
||
+ IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
|
||
+ IDTVEC(fast_syscall_pti);
|
||
|
||
#ifdef DDB
|
||
/*
|
||
@@ -1523,6 +1533,23 @@
|
||
#endif
|
||
}
|
||
|
||
+/* Set up the fast syscall stuff */
|
||
+void
|
||
+amd64_conf_fast_syscall(void)
|
||
+{
|
||
+ uint64_t msr;
|
||
+
|
||
+ msr = rdmsr(MSR_EFER) | EFER_SCE;
|
||
+ wrmsr(MSR_EFER, msr);
|
||
+ wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
|
||
+ (u_int64_t)IDTVEC(fast_syscall));
|
||
+ wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
|
||
+ msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
|
||
+ ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
|
||
+ wrmsr(MSR_STAR, msr);
|
||
+ wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
|
||
+}
|
||
+
|
||
u_int64_t
|
||
hammer_time(u_int64_t modulep, u_int64_t physfree)
|
||
{
|
||
@@ -1531,7 +1558,7 @@
|
||
struct pcpu *pc;
|
||
struct nmi_pcpu *np;
|
||
struct xstate_hdr *xhdr;
|
||
- u_int64_t msr;
|
||
+ u_int64_t rsp0;
|
||
char *env;
|
||
size_t kstack0_sz;
|
||
int late_console;
|
||
@@ -1544,6 +1571,8 @@
|
||
|
||
kmdp = init_ops.parse_preload_data(modulep);
|
||
|
||
+ identify_cpu1();
|
||
+
|
||
/* Init basic tunables, hz etc */
|
||
init_param1();
|
||
|
||
@@ -1600,34 +1629,55 @@
|
||
mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
|
||
|
||
/* exceptions */
|
||
+ pti = pti_get_default();
|
||
+ TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
|
||
+
|
||
for (x = 0; x < NIDT; x++)
|
||
- setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
+ setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
+ setidt(IDT_DB, pti ? &IDTVEC(dbg_pti) : &IDTVEC(dbg), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
|
||
- setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
|
||
- setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
|
||
+ SEL_UPL, 0);
|
||
+ setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
+ setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
+ setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
+ setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
|
||
- setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
|
||
+ SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
+ setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
|
||
+ SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
+ setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
+ setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
+ setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
+ setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
+ setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
|
||
+ setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
#ifdef KDTRACE_HOOKS
|
||
- setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
|
||
+ setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
|
||
+ &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
|
||
#endif
|
||
#ifdef XENHVM
|
||
- setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0);
|
||
+ setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
|
||
+ &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
|
||
#endif
|
||
-
|
||
r_idt.rd_limit = sizeof(idt0) - 1;
|
||
r_idt.rd_base = (long) idt;
|
||
lidt(&r_idt);
|
||
@@ -1648,7 +1698,7 @@
|
||
!= NULL)
|
||
vty_set_preferred(VTY_VT);
|
||
|
||
- identify_cpu(); /* Final stage of CPU initialization */
|
||
+ finishidentcpu(); /* Final stage of CPU initialization */
|
||
initializecpu(); /* Initialize CPU registers */
|
||
initializecpucache();
|
||
|
||
@@ -1663,6 +1713,14 @@
|
||
np->np_pcpu = (register_t) pc;
|
||
common_tss[0].tss_ist2 = (long) np;
|
||
|
||
+ /*
|
||
+ * MC# stack, runs on ist3. The pcpu pointer is stored just
|
||
+ * above the start of the ist3 stack.
|
||
+ */
|
||
+ np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
|
||
+ np->np_pcpu = (register_t) pc;
|
||
+ common_tss[0].tss_ist3 = (long) np;
|
||
+
|
||
/* Set the IO permission bitmap (empty due to tss seg limit) */
|
||
common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
|
||
|
||
@@ -1669,15 +1727,7 @@
|
||
gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
|
||
ltr(gsel_tss);
|
||
|
||
- /* Set up the fast syscall stuff */
|
||
- msr = rdmsr(MSR_EFER) | EFER_SCE;
|
||
- wrmsr(MSR_EFER, msr);
|
||
- wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
|
||
- wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
|
||
- msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
|
||
- ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
|
||
- wrmsr(MSR_STAR, msr);
|
||
- wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
|
||
+ amd64_conf_fast_syscall();
|
||
|
||
/*
|
||
* Temporary forge some valid pointer to PCB, for exception
|
||
@@ -1749,10 +1799,12 @@
|
||
xhdr->xstate_bv = xsave_mask;
|
||
}
|
||
/* make an initial tss so cpu can get interrupt stack on syscall! */
|
||
- common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb;
|
||
+ rsp0 = (vm_offset_t)thread0.td_pcb;
|
||
/* Ensure the stack is aligned to 16 bytes */
|
||
- common_tss[0].tss_rsp0 &= ~0xFul;
|
||
- PCPU_SET(rsp0, common_tss[0].tss_rsp0);
|
||
+ rsp0 &= ~0xFul;
|
||
+ common_tss[0].tss_rsp0 = pti ? ((vm_offset_t)PCPU_PTR(pti_stack) +
|
||
+ PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : rsp0;
|
||
+ PCPU_SET(rsp0, rsp0);
|
||
PCPU_SET(curpcb, thread0.td_pcb);
|
||
|
||
/* transfer to user mode */
|
||
@@ -1782,6 +1834,8 @@
|
||
#endif
|
||
thread0.td_critnest = 0;
|
||
|
||
+ TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
|
||
+
|
||
/* Location of kernel stack for locore */
|
||
return ((u_int64_t)thread0.td_pcb);
|
||
}
|
||
--- sys/amd64/amd64/mp_machdep.c.orig
|
||
+++ sys/amd64/amd64/mp_machdep.c
|
||
@@ -85,10 +85,9 @@
|
||
|
||
/* Temporary variables for init_secondary() */
|
||
char *doublefault_stack;
|
||
+char *mce_stack;
|
||
char *nmi_stack;
|
||
|
||
-extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
|
||
-
|
||
/*
|
||
* Local data and functions.
|
||
*/
|
||
@@ -132,33 +131,50 @@
|
||
/* Install an inter-CPU IPI for TLB invalidation */
|
||
if (pmap_pcid_enabled) {
|
||
if (invpcid_works) {
|
||
- setidt(IPI_INVLTLB, IDTVEC(invltlb_invpcid),
|
||
- SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IPI_INVLTLB, pti ?
|
||
+ IDTVEC(invltlb_invpcid_pti_pti) :
|
||
+ IDTVEC(invltlb_invpcid_nopti), SDT_SYSIGT,
|
||
+ SEL_KPL, 0);
|
||
+ setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_invpcid_pti) :
|
||
+ IDTVEC(invlpg_invpcid), SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_invpcid_pti) :
|
||
+ IDTVEC(invlrng_invpcid), SDT_SYSIGT, SEL_KPL, 0);
|
||
} else {
|
||
- setidt(IPI_INVLTLB, IDTVEC(invltlb_pcid), SDT_SYSIGT,
|
||
- SEL_KPL, 0);
|
||
+ setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pcid_pti) :
|
||
+ IDTVEC(invltlb_pcid), SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pcid_pti) :
|
||
+ IDTVEC(invlpg_pcid), SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pcid_pti) :
|
||
+ IDTVEC(invlrng_pcid), SDT_SYSIGT, SEL_KPL, 0);
|
||
}
|
||
} else {
|
||
- setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pti) : IDTVEC(invltlb),
|
||
+ SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pti) : IDTVEC(invlpg),
|
||
+ SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pti) : IDTVEC(invlrng),
|
||
+ SDT_SYSIGT, SEL_KPL, 0);
|
||
}
|
||
- setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
|
||
- setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);
|
||
|
||
/* Install an inter-CPU IPI for cache invalidation. */
|
||
- setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IPI_INVLCACHE, pti ? IDTVEC(invlcache_pti) : IDTVEC(invlcache),
|
||
+ SDT_SYSIGT, SEL_KPL, 0);
|
||
|
||
/* Install an inter-CPU IPI for all-CPU rendezvous */
|
||
- setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IPI_RENDEZVOUS, pti ? IDTVEC(rendezvous_pti) :
|
||
+ IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);
|
||
|
||
/* Install generic inter-CPU IPI handler */
|
||
- setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
|
||
- SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IPI_BITMAP_VECTOR, pti ? IDTVEC(ipi_intr_bitmap_handler_pti) :
|
||
+ IDTVEC(ipi_intr_bitmap_handler), SDT_SYSIGT, SEL_KPL, 0);
|
||
|
||
/* Install an inter-CPU IPI for CPU stop/restart */
|
||
- setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop),
|
||
+ SDT_SYSIGT, SEL_KPL, 0);
|
||
|
||
/* Install an inter-CPU IPI for CPU suspend/resume */
|
||
- setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend),
|
||
+ SDT_SYSIGT, SEL_KPL, 0);
|
||
|
||
/* Set boot_cpu_id if needed. */
|
||
if (boot_cpu_id == -1) {
|
||
@@ -188,7 +204,7 @@
|
||
{
|
||
struct pcpu *pc;
|
||
struct nmi_pcpu *np;
|
||
- u_int64_t msr, cr0;
|
||
+ u_int64_t cr0;
|
||
int cpu, gsel_tss, x;
|
||
struct region_descriptor ap_gdt;
|
||
|
||
@@ -197,7 +213,6 @@
|
||
|
||
/* Init tss */
|
||
common_tss[cpu] = common_tss[0];
|
||
- common_tss[cpu].tss_rsp0 = 0; /* not used until after switch */
|
||
common_tss[cpu].tss_iobase = sizeof(struct amd64tss) +
|
||
IOPERM_BITMAP_SIZE;
|
||
common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];
|
||
@@ -206,6 +221,10 @@
|
||
np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
|
||
common_tss[cpu].tss_ist2 = (long) np;
|
||
|
||
+ /* The MC# stack runs on IST3. */
|
||
+ np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
|
||
+ common_tss[cpu].tss_ist3 = (long) np;
|
||
+
|
||
/* Prepare private GDT */
|
||
gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
|
||
for (x = 0; x < NGDT; x++) {
|
||
@@ -240,10 +259,17 @@
|
||
pc->pc_curpmap = kernel_pmap;
|
||
pc->pc_pcid_gen = 1;
|
||
pc->pc_pcid_next = PMAP_PCID_KERN + 1;
|
||
+ common_tss[cpu].tss_rsp0 = pti ? ((vm_offset_t)&pc->pc_pti_stack +
|
||
+ PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : 0;
|
||
|
||
/* Save the per-cpu pointer for use by the NMI handler. */
|
||
+ np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
|
||
np->np_pcpu = (register_t) pc;
|
||
|
||
+ /* Save the per-cpu pointer for use by the MC# handler. */
|
||
+ np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
|
||
+ np->np_pcpu = (register_t) pc;
|
||
+
|
||
wrmsr(MSR_FSBASE, 0); /* User value */
|
||
wrmsr(MSR_GSBASE, (u_int64_t)pc);
|
||
wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */
|
||
@@ -263,15 +289,7 @@
|
||
cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
|
||
load_cr0(cr0);
|
||
|
||
- /* Set up the fast syscall stuff */
|
||
- msr = rdmsr(MSR_EFER) | EFER_SCE;
|
||
- wrmsr(MSR_EFER, msr);
|
||
- wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
|
||
- wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
|
||
- msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
|
||
- ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
|
||
- wrmsr(MSR_STAR, msr);
|
||
- wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
|
||
+ amd64_conf_fast_syscall();
|
||
|
||
/* signal our startup to the BSP. */
|
||
mp_naps++;
|
||
@@ -346,6 +364,8 @@
|
||
kstack_pages * PAGE_SIZE, M_WAITOK | M_ZERO);
|
||
doublefault_stack = (char *)kmem_malloc(kernel_arena,
|
||
PAGE_SIZE, M_WAITOK | M_ZERO);
|
||
+ mce_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
|
||
+ M_WAITOK | M_ZERO);
|
||
nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
|
||
M_WAITOK | M_ZERO);
|
||
dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE,
|
||
@@ -428,9 +448,43 @@
|
||
}
|
||
|
||
void
|
||
+invltlb_invpcid_pti_handler(void)
|
||
+{
|
||
+ struct invpcid_descr d;
|
||
+ uint32_t generation;
|
||
+
|
||
+#ifdef COUNT_XINVLTLB_HITS
|
||
+ xhits_gbl[PCPU_GET(cpuid)]++;
|
||
+#endif /* COUNT_XINVLTLB_HITS */
|
||
+#ifdef COUNT_IPIS
|
||
+ (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
|
||
+#endif /* COUNT_IPIS */
|
||
+
|
||
+ generation = smp_tlb_generation;
|
||
+ d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
|
||
+ d.pad = 0;
|
||
+ d.addr = 0;
|
||
+ if (smp_tlb_pmap == kernel_pmap) {
|
||
+ /*
|
||
+ * This invalidation actually needs to clear kernel
|
||
+ * mappings from the TLB in the current pmap, but
|
||
+ * since we were asked for the flush in the kernel
|
||
+ * pmap, achieve it by performing a global flush.
|
||
+ */
|
||
+ invpcid(&d, INVPCID_CTXGLOB);
|
||
+ } else {
|
||
+ invpcid(&d, INVPCID_CTX);
|
||
+ d.pcid |= PMAP_PCID_USER_PT;
|
||
+ invpcid(&d, INVPCID_CTX);
|
||
+ }
|
||
+ PCPU_SET(smp_tlb_done, generation);
|
||
+}
|
||
+
|
||
+void
|
||
invltlb_pcid_handler(void)
|
||
{
|
||
- uint32_t generation;
|
||
+ uint64_t kcr3, ucr3;
|
||
+ uint32_t generation, pcid;
|
||
|
||
#ifdef COUNT_XINVLTLB_HITS
|
||
xhits_gbl[PCPU_GET(cpuid)]++;
|
||
@@ -451,9 +505,132 @@
|
||
* CPU.
|
||
*/
|
||
if (PCPU_GET(curpmap) == smp_tlb_pmap) {
|
||
- load_cr3(smp_tlb_pmap->pm_cr3 |
|
||
- smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid);
|
||
+ pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
|
||
+ kcr3 = smp_tlb_pmap->pm_cr3 | pcid;
|
||
+ ucr3 = smp_tlb_pmap->pm_ucr3;
|
||
+ if (ucr3 != PMAP_NO_CR3) {
|
||
+ ucr3 |= PMAP_PCID_USER_PT | pcid;
|
||
+ pmap_pti_pcid_invalidate(ucr3, kcr3);
|
||
+ } else
|
||
+ load_cr3(kcr3);
|
||
}
|
||
}
|
||
PCPU_SET(smp_tlb_done, generation);
|
||
}
|
||
+
|
||
+void
|
||
+invlpg_invpcid_handler(void)
|
||
+{
|
||
+ struct invpcid_descr d;
|
||
+ uint32_t generation;
|
||
+
|
||
+#ifdef COUNT_XINVLTLB_HITS
|
||
+ xhits_pg[PCPU_GET(cpuid)]++;
|
||
+#endif /* COUNT_XINVLTLB_HITS */
|
||
+#ifdef COUNT_IPIS
|
||
+ (*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
|
||
+#endif /* COUNT_IPIS */
|
||
+
|
||
+ generation = smp_tlb_generation; /* Overlap with serialization */
|
||
+ invlpg(smp_tlb_addr1);
|
||
+ if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) {
|
||
+ d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
|
||
+ PMAP_PCID_USER_PT;
|
||
+ d.pad = 0;
|
||
+ d.addr = smp_tlb_addr1;
|
||
+ invpcid(&d, INVPCID_ADDR);
|
||
+ }
|
||
+ PCPU_SET(smp_tlb_done, generation);
|
||
+}
|
||
+
|
||
+void
|
||
+invlpg_pcid_handler(void)
|
||
+{
|
||
+ uint64_t kcr3, ucr3;
|
||
+ uint32_t generation;
|
||
+ uint32_t pcid;
|
||
+
|
||
+#ifdef COUNT_XINVLTLB_HITS
|
||
+ xhits_pg[PCPU_GET(cpuid)]++;
|
||
+#endif /* COUNT_XINVLTLB_HITS */
|
||
+#ifdef COUNT_IPIS
|
||
+ (*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
|
||
+#endif /* COUNT_IPIS */
|
||
+
|
||
+ generation = smp_tlb_generation; /* Overlap with serialization */
|
||
+ invlpg(smp_tlb_addr1);
|
||
+ if (smp_tlb_pmap == PCPU_GET(curpmap) &&
|
||
+ (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) {
|
||
+ pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
|
||
+ kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
|
||
+ ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
|
||
+ pmap_pti_pcid_invlpg(ucr3, kcr3, smp_tlb_addr1);
|
||
+ }
|
||
+ PCPU_SET(smp_tlb_done, generation);
|
||
+}
|
||
+
|
||
+void
|
||
+invlrng_invpcid_handler(void)
|
||
+{
|
||
+ struct invpcid_descr d;
|
||
+ vm_offset_t addr, addr2;
|
||
+ uint32_t generation;
|
||
+
|
||
+#ifdef COUNT_XINVLTLB_HITS
|
||
+ xhits_rng[PCPU_GET(cpuid)]++;
|
||
+#endif /* COUNT_XINVLTLB_HITS */
|
||
+#ifdef COUNT_IPIS
|
||
+ (*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
|
||
+#endif /* COUNT_IPIS */
|
||
+
|
||
+ addr = smp_tlb_addr1;
|
||
+ addr2 = smp_tlb_addr2;
|
||
+ generation = smp_tlb_generation; /* Overlap with serialization */
|
||
+ do {
|
||
+ invlpg(addr);
|
||
+ addr += PAGE_SIZE;
|
||
+ } while (addr < addr2);
|
||
+ if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) {
|
||
+ d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
|
||
+ PMAP_PCID_USER_PT;
|
||
+ d.pad = 0;
|
||
+ d.addr = smp_tlb_addr1;
|
||
+ do {
|
||
+ invpcid(&d, INVPCID_ADDR);
|
||
+ d.addr += PAGE_SIZE;
|
||
+ } while (d.addr < addr2);
|
||
+ }
|
||
+ PCPU_SET(smp_tlb_done, generation);
|
||
+}
|
||
+
|
||
+void
|
||
+invlrng_pcid_handler(void)
|
||
+{
|
||
+ vm_offset_t addr, addr2;
|
||
+ uint64_t kcr3, ucr3;
|
||
+ uint32_t generation;
|
||
+ uint32_t pcid;
|
||
+
|
||
+#ifdef COUNT_XINVLTLB_HITS
|
||
+ xhits_rng[PCPU_GET(cpuid)]++;
|
||
+#endif /* COUNT_XINVLTLB_HITS */
|
||
+#ifdef COUNT_IPIS
|
||
+ (*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
|
||
+#endif /* COUNT_IPIS */
|
||
+
|
||
+ addr = smp_tlb_addr1;
|
||
+ addr2 = smp_tlb_addr2;
|
||
+ generation = smp_tlb_generation; /* Overlap with serialization */
|
||
+ do {
|
||
+ invlpg(addr);
|
||
+ addr += PAGE_SIZE;
|
||
+ } while (addr < addr2);
|
||
+ if (smp_tlb_pmap == PCPU_GET(curpmap) &&
|
||
+ (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) {
|
||
+ pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
|
||
+ kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
|
||
+ ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
|
||
+ pmap_pti_pcid_invlrng(ucr3, kcr3, smp_tlb_addr1, addr2);
|
||
+ }
|
||
+ PCPU_SET(smp_tlb_done, generation);
|
||
+}
|
||
--- sys/amd64/amd64/pmap.c.orig
|
||
+++ sys/amd64/amd64/pmap.c
|
||
@@ -9,11 +9,17 @@
|
||
* All rights reserved.
|
||
* Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
|
||
* All rights reserved.
|
||
+ * Copyright (c) 2014-2018 The FreeBSD Foundation
|
||
+ * All rights reserved.
|
||
*
|
||
* This code is derived from software contributed to Berkeley by
|
||
* the Systems Programming Group of the University of Utah Computer
|
||
* Science Department and William Jolitz of UUNET Technologies Inc.
|
||
*
|
||
+ * Portions of this software were developed by
|
||
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
|
||
+ * the FreeBSD Foundation.
|
||
+ *
|
||
* Redistribution and use in source and binary forms, with or without
|
||
* modification, are permitted provided that the following conditions
|
||
* are met:
|
||
@@ -147,6 +153,7 @@
|
||
#ifdef SMP
|
||
#include <machine/smp.h>
|
||
#endif
|
||
+#include <machine/tss.h>
|
||
|
||
static __inline boolean_t
|
||
pmap_type_guest(pmap_t pmap)
|
||
@@ -208,6 +215,8 @@
|
||
return (mask);
|
||
}
|
||
|
||
+static pt_entry_t pg_g;
|
||
+
|
||
static __inline pt_entry_t
|
||
pmap_global_bit(pmap_t pmap)
|
||
{
|
||
@@ -215,7 +224,7 @@
|
||
|
||
switch (pmap->pm_type) {
|
||
case PT_X86:
|
||
- mask = X86_PG_G;
|
||
+ mask = pg_g;
|
||
break;
|
||
case PT_RVI:
|
||
case PT_EPT:
|
||
@@ -405,6 +414,15 @@
|
||
SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
|
||
"Is the invpcid instruction available ?");
|
||
|
||
+int pti = 0;
|
||
+SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
|
||
+ &pti, 0,
|
||
+ "Page Table Isolation enabled");
|
||
+static vm_object_t pti_obj;
|
||
+static pml4_entry_t *pti_pml4;
|
||
+static vm_pindex_t pti_pg_idx;
|
||
+static bool pti_finalized;
|
||
+
|
||
static int
|
||
pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
|
||
{
|
||
@@ -622,6 +640,11 @@
|
||
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
|
||
vm_prot_t prot);
|
||
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
|
||
+static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
|
||
+ bool exec);
|
||
+static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
|
||
+static pd_entry_t *pmap_pti_pde(vm_offset_t va);
|
||
+static void pmap_pti_wire_pte(void *pte);
|
||
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
|
||
struct spglist *free, struct rwlock **lockp);
|
||
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
|
||
@@ -901,7 +924,7 @@
|
||
/* XXX not fully used, underneath 2M pages */
|
||
pt_p = (pt_entry_t *)KPTphys;
|
||
for (i = 0; ptoa(i) < *firstaddr; i++)
|
||
- pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
|
||
+ pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | pg_g;
|
||
|
||
/* Now map the page tables at their location within PTmap */
|
||
pd_p = (pd_entry_t *)KPDphys;
|
||
@@ -912,7 +935,7 @@
|
||
/* This replaces some of the KPTphys entries above */
|
||
for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
|
||
pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
|
||
- X86_PG_G;
|
||
+ pg_g;
|
||
|
||
/* And connect up the PD to the PDP (leaving room for L4 pages) */
|
||
pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
|
||
@@ -932,7 +955,7 @@
|
||
for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
|
||
pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
|
||
/* Preset PG_M and PG_A because demotion expects it. */
|
||
- pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
|
||
+ pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
|
||
X86_PG_M | X86_PG_A;
|
||
}
|
||
pdp_p = (pdp_entry_t *)DMPDPphys;
|
||
@@ -939,7 +962,7 @@
|
||
for (i = 0; i < ndm1g; i++) {
|
||
pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
|
||
/* Preset PG_M and PG_A because demotion expects it. */
|
||
- pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
|
||
+ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
|
||
X86_PG_M | X86_PG_A;
|
||
}
|
||
for (j = 0; i < ndmpdp; i++, j++) {
|
||
@@ -982,6 +1005,9 @@
|
||
pt_entry_t *pte;
|
||
int i;
|
||
|
||
+ if (!pti)
|
||
+ pg_g = X86_PG_G;
|
||
+
|
||
/*
|
||
* Create an initial set of page tables to run the kernel in.
|
||
*/
|
||
@@ -1014,6 +1040,7 @@
|
||
PMAP_LOCK_INIT(kernel_pmap);
|
||
kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
|
||
kernel_pmap->pm_cr3 = KPML4phys;
|
||
+ kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
|
||
CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
|
||
TAILQ_INIT(&kernel_pmap->pm_pvchunk);
|
||
kernel_pmap->pm_flags = pmap_flags;
|
||
@@ -1528,6 +1555,9 @@
|
||
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
|
||
{
|
||
cpuset_t *mask;
|
||
+ struct invpcid_descr d;
|
||
+ uint64_t kcr3, ucr3;
|
||
+ uint32_t pcid;
|
||
u_int cpuid, i;
|
||
|
||
if (pmap_type_guest(pmap)) {
|
||
@@ -1544,9 +1574,32 @@
|
||
mask = &all_cpus;
|
||
} else {
|
||
cpuid = PCPU_GET(cpuid);
|
||
- if (pmap == PCPU_GET(curpmap))
|
||
+ if (pmap == PCPU_GET(curpmap)) {
|
||
invlpg(va);
|
||
- else if (pmap_pcid_enabled)
|
||
+ if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) {
|
||
+ /*
|
||
+ * Disable context switching. pm_pcid
|
||
+ * is recalculated on switch, which
|
||
+ * might make us use the wrong pcid below.
|
||
+ */
|
||
+ critical_enter();
|
||
+ pcid = pmap->pm_pcids[cpuid].pm_pcid;
|
||
+
|
||
+ if (invpcid_works) {
|
||
+ d.pcid = pcid | PMAP_PCID_USER_PT;
|
||
+ d.pad = 0;
|
||
+ d.addr = va;
|
||
+ invpcid(&d, INVPCID_ADDR);
|
||
+ } else {
|
||
+ kcr3 = pmap->pm_cr3 | pcid |
|
||
+ CR3_PCID_SAVE;
|
||
+ ucr3 = pmap->pm_ucr3 | pcid |
|
||
+ PMAP_PCID_USER_PT | CR3_PCID_SAVE;
|
||
+ pmap_pti_pcid_invlpg(ucr3, kcr3, va);
|
||
+ }
|
||
+ critical_exit();
|
||
+ }
|
||
+ } else if (pmap_pcid_enabled)
|
||
pmap->pm_pcids[cpuid].pm_gen = 0;
|
||
if (pmap_pcid_enabled) {
|
||
CPU_FOREACH(i) {
|
||
@@ -1556,7 +1609,7 @@
|
||
}
|
||
mask = &pmap->pm_active;
|
||
}
|
||
- smp_masked_invlpg(*mask, va);
|
||
+ smp_masked_invlpg(*mask, va, pmap);
|
||
sched_unpin();
|
||
}
|
||
|
||
@@ -1567,7 +1620,10 @@
|
||
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
|
||
{
|
||
cpuset_t *mask;
|
||
+ struct invpcid_descr d;
|
||
vm_offset_t addr;
|
||
+ uint64_t kcr3, ucr3;
|
||
+ uint32_t pcid;
|
||
u_int cpuid, i;
|
||
|
||
if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
|
||
@@ -1593,6 +1649,26 @@
|
||
if (pmap == PCPU_GET(curpmap)) {
|
||
for (addr = sva; addr < eva; addr += PAGE_SIZE)
|
||
invlpg(addr);
|
||
+ if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) {
|
||
+ critical_enter();
|
||
+ pcid = pmap->pm_pcids[cpuid].pm_pcid;
|
||
+ if (invpcid_works) {
|
||
+ d.pcid = pcid | PMAP_PCID_USER_PT;
|
||
+ d.pad = 0;
|
||
+ d.addr = sva;
|
||
+ for (; d.addr < eva; d.addr +=
|
||
+ PAGE_SIZE)
|
||
+ invpcid(&d, INVPCID_ADDR);
|
||
+ } else {
|
||
+ kcr3 = pmap->pm_cr3 | pcid |
|
||
+ CR3_PCID_SAVE;
|
||
+ ucr3 = pmap->pm_ucr3 | pcid |
|
||
+ PMAP_PCID_USER_PT | CR3_PCID_SAVE;
|
||
+ pmap_pti_pcid_invlrng(ucr3, kcr3, sva,
|
||
+ eva);
|
||
+ }
|
||
+ critical_exit();
|
||
+ }
|
||
} else if (pmap_pcid_enabled) {
|
||
pmap->pm_pcids[cpuid].pm_gen = 0;
|
||
}
|
||
@@ -1604,7 +1680,7 @@
|
||
}
|
||
mask = &pmap->pm_active;
|
||
}
|
||
- smp_masked_invlpg_range(*mask, sva, eva);
|
||
+ smp_masked_invlpg_range(*mask, sva, eva, pmap);
|
||
sched_unpin();
|
||
}
|
||
|
||
@@ -1613,6 +1689,8 @@
|
||
{
|
||
cpuset_t *mask;
|
||
struct invpcid_descr d;
|
||
+ uint64_t kcr3, ucr3;
|
||
+ uint32_t pcid;
|
||
u_int cpuid, i;
|
||
|
||
if (pmap_type_guest(pmap)) {
|
||
@@ -1636,15 +1714,29 @@
|
||
cpuid = PCPU_GET(cpuid);
|
||
if (pmap == PCPU_GET(curpmap)) {
|
||
if (pmap_pcid_enabled) {
|
||
+ critical_enter();
|
||
+ pcid = pmap->pm_pcids[cpuid].pm_pcid;
|
||
if (invpcid_works) {
|
||
- d.pcid = pmap->pm_pcids[cpuid].pm_pcid;
|
||
+ d.pcid = pcid;
|
||
d.pad = 0;
|
||
d.addr = 0;
|
||
invpcid(&d, INVPCID_CTX);
|
||
+ if (pmap->pm_ucr3 != PMAP_NO_CR3) {
|
||
+ d.pcid |= PMAP_PCID_USER_PT;
|
||
+ invpcid(&d, INVPCID_CTX);
|
||
+ }
|
||
} else {
|
||
- load_cr3(pmap->pm_cr3 | pmap->pm_pcids
|
||
- [PCPU_GET(cpuid)].pm_pcid);
|
||
+ kcr3 = pmap->pm_cr3 | pcid;
|
||
+ ucr3 = pmap->pm_ucr3;
|
||
+ if (ucr3 != PMAP_NO_CR3) {
|
||
+ ucr3 |= pcid | PMAP_PCID_USER_PT;
|
||
+ pmap_pti_pcid_invalidate(ucr3,
|
||
+ kcr3);
|
||
+ } else {
|
||
+ load_cr3(kcr3);
|
||
+ }
|
||
}
|
||
+ critical_exit();
|
||
} else {
|
||
invltlb();
|
||
}
|
||
@@ -1749,6 +1841,9 @@
|
||
void
|
||
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
|
||
{
|
||
+ struct invpcid_descr d;
|
||
+ uint64_t kcr3, ucr3;
|
||
+ uint32_t pcid;
|
||
|
||
if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
|
||
pmap->pm_eptgen++;
|
||
@@ -1757,9 +1852,26 @@
|
||
KASSERT(pmap->pm_type == PT_X86,
|
||
("pmap_invalidate_range: unknown type %d", pmap->pm_type));
|
||
|
||
- if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
|
||
+ if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
|
||
invlpg(va);
|
||
- else if (pmap_pcid_enabled)
|
||
+ if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
|
||
+ pmap->pm_ucr3 != PMAP_NO_CR3) {
|
||
+ critical_enter();
|
||
+ pcid = pmap->pm_pcids[0].pm_pcid;
|
||
+ if (invpcid_works) {
|
||
+ d.pcid = pcid | PMAP_PCID_USER_PT;
|
||
+ d.pad = 0;
|
||
+ d.addr = va;
|
||
+ invpcid(&d, INVPCID_ADDR);
|
||
+ } else {
|
||
+ kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
|
||
+ ucr3 = pmap->pm_ucr3 | pcid |
|
||
+ PMAP_PCID_USER_PT | CR3_PCID_SAVE;
|
||
+ pmap_pti_pcid_invlpg(ucr3, kcr3, va);
|
||
+ }
|
||
+ critical_exit();
|
||
+ }
|
||
+ } else if (pmap_pcid_enabled)
|
||
pmap->pm_pcids[0].pm_gen = 0;
|
||
}
|
||
|
||
@@ -1766,7 +1878,9 @@
|
||
void
|
||
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
|
||
{
|
||
+ struct invpcid_descr d;
|
||
vm_offset_t addr;
|
||
+ uint64_t kcr3, ucr3;
|
||
|
||
if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
|
||
pmap->pm_eptgen++;
|
||
@@ -1778,6 +1892,25 @@
|
||
if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
|
||
for (addr = sva; addr < eva; addr += PAGE_SIZE)
|
||
invlpg(addr);
|
||
+ if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
|
||
+ pmap->pm_ucr3 != PMAP_NO_CR3) {
|
||
+ critical_enter();
|
||
+ if (invpcid_works) {
|
||
+ d.pcid = pmap->pm_pcids[0].pm_pcid |
|
||
+ PMAP_PCID_USER_PT;
|
||
+ d.pad = 0;
|
||
+ d.addr = sva;
|
||
+ for (; d.addr < eva; d.addr += PAGE_SIZE)
|
||
+ invpcid(&d, INVPCID_ADDR);
|
||
+ } else {
|
||
+ kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].
|
||
+ pm_pcid | CR3_PCID_SAVE;
|
||
+ ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0].
|
||
+ pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
|
||
+ pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
|
||
+ }
|
||
+ critical_exit();
|
||
+ }
|
||
} else if (pmap_pcid_enabled) {
|
||
pmap->pm_pcids[0].pm_gen = 0;
|
||
}
|
||
@@ -1787,6 +1920,7 @@
|
||
pmap_invalidate_all(pmap_t pmap)
|
||
{
|
||
struct invpcid_descr d;
|
||
+ uint64_t kcr3, ucr3;
|
||
|
||
if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
|
||
pmap->pm_eptgen++;
|
||
@@ -1804,15 +1938,26 @@
|
||
}
|
||
} else if (pmap == PCPU_GET(curpmap)) {
|
||
if (pmap_pcid_enabled) {
|
||
+ critical_enter();
|
||
if (invpcid_works) {
|
||
d.pcid = pmap->pm_pcids[0].pm_pcid;
|
||
d.pad = 0;
|
||
d.addr = 0;
|
||
invpcid(&d, INVPCID_CTX);
|
||
+ if (pmap->pm_ucr3 != PMAP_NO_CR3) {
|
||
+ d.pcid |= PMAP_PCID_USER_PT;
|
||
+ invpcid(&d, INVPCID_CTX);
|
||
+ }
|
||
} else {
|
||
- load_cr3(pmap->pm_cr3 | pmap->pm_pcids[0].
|
||
- pm_pcid);
|
||
+ kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid;
|
||
+ if (pmap->pm_ucr3 != PMAP_NO_CR3) {
|
||
+ ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[
|
||
+ 0].pm_pcid | PMAP_PCID_USER_PT;
|
||
+ pmap_pti_pcid_invalidate(ucr3, kcr3);
|
||
+ } else
|
||
+ load_cr3(kcr3);
|
||
}
|
||
+ critical_exit();
|
||
} else {
|
||
invltlb();
|
||
}
|
||
@@ -2094,7 +2239,7 @@
|
||
pt_entry_t *pte;
|
||
|
||
pte = vtopte(va);
|
||
- pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
|
||
+ pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g);
|
||
}
|
||
|
||
static __inline void
|
||
@@ -2105,7 +2250,7 @@
|
||
|
||
pte = vtopte(va);
|
||
cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
|
||
- pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
|
||
+ pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits);
|
||
}
|
||
|
||
/*
|
||
@@ -2165,7 +2310,7 @@
|
||
pa = VM_PAGE_TO_PHYS(m) | cache_bits;
|
||
if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
|
||
oldpte |= *pte;
|
||
- pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
|
||
+ pte_store(pte, pa | pg_g | X86_PG_RW | X86_PG_V);
|
||
}
|
||
pte++;
|
||
}
|
||
@@ -2284,6 +2429,10 @@
|
||
pml4_entry_t *pml4;
|
||
pml4 = pmap_pml4e(pmap, va);
|
||
*pml4 = 0;
|
||
+ if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
|
||
+ pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
|
||
+ *pml4 = 0;
|
||
+ }
|
||
} else if (m->pindex >= NUPDE) {
|
||
/* PD page */
|
||
pdp_entry_t *pdp;
|
||
@@ -2349,7 +2498,10 @@
|
||
|
||
PMAP_LOCK_INIT(pmap);
|
||
pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
|
||
+ pmap->pm_pml4u = NULL;
|
||
pmap->pm_cr3 = KPML4phys;
|
||
+ /* hack to keep pmap_pti_pcid_invalidate() alive */
|
||
+ pmap->pm_ucr3 = PMAP_NO_CR3;
|
||
pmap->pm_root.rt_root = 0;
|
||
CPU_ZERO(&pmap->pm_active);
|
||
TAILQ_INIT(&pmap->pm_pvchunk);
|
||
@@ -2358,6 +2510,8 @@
|
||
CPU_FOREACH(i) {
|
||
pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
|
||
pmap->pm_pcids[i].pm_gen = 0;
|
||
+ if (!pti)
|
||
+ __pcpu[i].pc_kcr3 = PMAP_NO_CR3;
|
||
}
|
||
PCPU_SET(curpmap, kernel_pmap);
|
||
pmap_activate(curthread);
|
||
@@ -2387,6 +2541,17 @@
|
||
X86_PG_A | X86_PG_M;
|
||
}
|
||
|
||
+static void
|
||
+pmap_pinit_pml4_pti(vm_page_t pml4pg)
|
||
+{
|
||
+ pml4_entry_t *pm_pml4;
|
||
+ int i;
|
||
+
|
||
+ pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
|
||
+ for (i = 0; i < NPML4EPG; i++)
|
||
+ pm_pml4[i] = pti_pml4[i];
|
||
+}
|
||
+
|
||
/*
|
||
* Initialize a preallocated and zeroed pmap structure,
|
||
* such as one in a vmspace structure.
|
||
@@ -2394,7 +2559,7 @@
|
||
int
|
||
pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
|
||
{
|
||
- vm_page_t pml4pg;
|
||
+ vm_page_t pml4pg, pml4pgu;
|
||
vm_paddr_t pml4phys;
|
||
int i;
|
||
|
||
@@ -2411,8 +2576,11 @@
|
||
pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
|
||
pmap->pm_pcids[i].pm_gen = 0;
|
||
}
|
||
- pmap->pm_cr3 = ~0; /* initialize to an invalid value */
|
||
+ pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */
|
||
+ pmap->pm_ucr3 = PMAP_NO_CR3;
|
||
+ pmap->pm_pml4u = NULL;
|
||
|
||
+ pmap->pm_type = pm_type;
|
||
if ((pml4pg->flags & PG_ZERO) == 0)
|
||
pagezero(pmap->pm_pml4);
|
||
|
||
@@ -2420,10 +2588,21 @@
|
||
* Do not install the host kernel mappings in the nested page
|
||
* tables. These mappings are meaningless in the guest physical
|
||
* address space.
|
||
+	 * Install minimal kernel mappings in the PTI case.
|
||
*/
|
||
- if ((pmap->pm_type = pm_type) == PT_X86) {
|
||
+ if (pm_type == PT_X86) {
|
||
pmap->pm_cr3 = pml4phys;
|
||
pmap_pinit_pml4(pml4pg);
|
||
+ if (pti) {
|
||
+ while ((pml4pgu = vm_page_alloc(NULL, 0,
|
||
+ VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED))
|
||
+ == NULL)
|
||
+ VM_WAIT;
|
||
+ pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
|
||
+ VM_PAGE_TO_PHYS(pml4pgu));
|
||
+ pmap_pinit_pml4_pti(pml4pgu);
|
||
+ pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
|
||
+ }
|
||
}
|
||
|
||
pmap->pm_root.rt_root = 0;
|
||
@@ -2495,7 +2674,7 @@
|
||
*/
|
||
|
||
if (ptepindex >= (NUPDE + NUPDPE)) {
|
||
- pml4_entry_t *pml4;
|
||
+ pml4_entry_t *pml4, *pml4u;
|
||
vm_pindex_t pml4index;
|
||
|
||
/* Wire up a new PDPE page */
|
||
@@ -2502,7 +2681,21 @@
|
||
pml4index = ptepindex - (NUPDE + NUPDPE);
|
||
pml4 = &pmap->pm_pml4[pml4index];
|
||
*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
|
||
+ if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) {
|
||
+ /*
|
||
+ * PTI: Make all user-space mappings in the
|
||
+ * kernel-mode page table no-execute so that
|
||
+ * we detect any programming errors that leave
|
||
+ * the kernel-mode page table active on return
|
||
+ * to user space.
|
||
+ */
|
||
+ *pml4 |= pg_nx;
|
||
|
||
+ pml4u = &pmap->pm_pml4u[pml4index];
|
||
+ *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
|
||
+ PG_A | PG_M;
|
||
+ }
|
||
+
|
||
} else if (ptepindex >= NUPDE) {
|
||
vm_pindex_t pml4index;
|
||
vm_pindex_t pdpindex;
|
||
@@ -2702,6 +2895,13 @@
|
||
m->wire_count--;
|
||
atomic_subtract_int(&vm_cnt.v_wire_count, 1);
|
||
vm_page_free_zero(m);
|
||
+
|
||
+ if (pmap->pm_pml4u != NULL) {
|
||
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u));
|
||
+ m->wire_count--;
|
||
+ atomic_subtract_int(&vm_cnt.v_wire_count, 1);
|
||
+ vm_page_free(m);
|
||
+ }
|
||
}
|
||
|
||
static int
|
||
@@ -6867,13 +7067,15 @@
|
||
|
||
CRITICAL_ASSERT(curthread);
|
||
gen = PCPU_GET(pcid_gen);
|
||
- if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN ||
|
||
- pmap->pm_pcids[cpuid].pm_gen == gen)
|
||
+ if (!pti && (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN ||
|
||
+ pmap->pm_pcids[cpuid].pm_gen == gen))
|
||
return (CR3_PCID_SAVE);
|
||
pcid_next = PCPU_GET(pcid_next);
|
||
- KASSERT(pcid_next <= PMAP_PCID_OVERMAX, ("cpu %d pcid_next %#x",
|
||
- cpuid, pcid_next));
|
||
- if (pcid_next == PMAP_PCID_OVERMAX) {
|
||
+ KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) ||
|
||
+ (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN),
|
||
+ ("cpu %d pcid_next %#x", cpuid, pcid_next));
|
||
+ if ((!pti && pcid_next == PMAP_PCID_OVERMAX) ||
|
||
+ (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) {
|
||
new_gen = gen + 1;
|
||
if (new_gen == 0)
|
||
new_gen = 1;
|
||
@@ -6892,7 +7094,8 @@
|
||
pmap_activate_sw(struct thread *td)
|
||
{
|
||
pmap_t oldpmap, pmap;
|
||
- uint64_t cached, cr3;
|
||
+ struct invpcid_descr d;
|
||
+ uint64_t cached, cr3, kcr3, ucr3;
|
||
register_t rflags;
|
||
u_int cpuid;
|
||
|
||
@@ -6948,11 +7151,41 @@
|
||
PCPU_INC(pm_save_cnt);
|
||
}
|
||
PCPU_SET(curpmap, pmap);
|
||
+ if (pti) {
|
||
+ kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
|
||
+ ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
|
||
+ PMAP_PCID_USER_PT;
|
||
+
|
||
+ /*
|
||
+ * Manually invalidate translations cached
|
||
+ * from the user page table, which are not
|
||
+ * flushed by reload of cr3 with the kernel
|
||
+ * page table pointer above.
|
||
+ */
|
||
+ if (pmap->pm_ucr3 != PMAP_NO_CR3) {
|
||
+ if (invpcid_works) {
|
||
+ d.pcid = PMAP_PCID_USER_PT |
|
||
+ pmap->pm_pcids[cpuid].pm_pcid;
|
||
+ d.pad = 0;
|
||
+ d.addr = 0;
|
||
+ invpcid(&d, INVPCID_CTX);
|
||
+ } else {
|
||
+ pmap_pti_pcid_invalidate(ucr3, kcr3);
|
||
+ }
|
||
+ }
|
||
+
|
||
+ PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
|
||
+ PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
|
||
+ }
|
||
if (!invpcid_works)
|
||
intr_restore(rflags);
|
||
} else if (cr3 != pmap->pm_cr3) {
|
||
load_cr3(pmap->pm_cr3);
|
||
PCPU_SET(curpmap, pmap);
|
||
+ if (pti) {
|
||
+ PCPU_SET(kcr3, pmap->pm_cr3);
|
||
+ PCPU_SET(ucr3, pmap->pm_ucr3);
|
||
+ }
|
||
}
|
||
#ifdef SMP
|
||
CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
|
||
@@ -7271,6 +7504,291 @@
|
||
mtx_unlock_spin(&qframe_mtx);
|
||
}
|
||
|
||
+static vm_page_t
|
||
+pmap_pti_alloc_page(void)
|
||
+{
|
||
+ vm_page_t m;
|
||
+
|
||
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
|
||
+ m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY |
|
||
+ VM_ALLOC_WIRED | VM_ALLOC_ZERO);
|
||
+ return (m);
|
||
+}
|
||
+
|
||
+static bool
|
||
+pmap_pti_free_page(vm_page_t m)
|
||
+{
|
||
+
|
||
+ KASSERT(m->wire_count > 0, ("page %p not wired", m));
|
||
+ m->wire_count--;
|
||
+ if (m->wire_count != 0)
|
||
+ return (false);
|
||
+ atomic_subtract_int(&vm_cnt.v_wire_count, 1);
|
||
+ vm_page_free_zero(m);
|
||
+ return (true);
|
||
+}
|
||
+
|
||
+static void
|
||
+pmap_pti_init(void)
|
||
+{
|
||
+ vm_page_t pml4_pg;
|
||
+ pdp_entry_t *pdpe;
|
||
+ vm_offset_t va;
|
||
+ int i;
|
||
+
|
||
+ if (!pti)
|
||
+ return;
|
||
+ pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
|
||
+ VM_OBJECT_WLOCK(pti_obj);
|
||
+ pml4_pg = pmap_pti_alloc_page();
|
||
+ pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
|
||
+ for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
|
||
+ va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
|
||
+ pdpe = pmap_pti_pdpe(va);
|
||
+ pmap_pti_wire_pte(pdpe);
|
||
+ }
|
||
+ pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
|
||
+ (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
|
||
+ pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt +
|
||
+ sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false);
|
||
+ pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
|
||
+ sizeof(struct gate_descriptor) * NIDT, false);
|
||
+ pmap_pti_add_kva_locked((vm_offset_t)common_tss,
|
||
+ (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false);
|
||
+ CPU_FOREACH(i) {
|
||
+ /* Doublefault stack IST 1 */
|
||
+ va = common_tss[i].tss_ist1;
|
||
+ pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
|
||
+ /* NMI stack IST 2 */
|
||
+ va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu);
|
||
+ pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
|
||
+ /* MC# stack IST 3 */
|
||
+ va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu);
|
||
+ pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
|
||
+ }
|
||
+ pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
|
||
+ (vm_offset_t)etext, true);
|
||
+ pti_finalized = true;
|
||
+ VM_OBJECT_WUNLOCK(pti_obj);
|
||
+}
|
||
+SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL);
|
||
+
|
||
+static pdp_entry_t *
|
||
+pmap_pti_pdpe(vm_offset_t va)
|
||
+{
|
||
+ pml4_entry_t *pml4e;
|
||
+ pdp_entry_t *pdpe;
|
||
+ vm_page_t m;
|
||
+ vm_pindex_t pml4_idx;
|
||
+ vm_paddr_t mphys;
|
||
+
|
||
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
|
||
+
|
||
+ pml4_idx = pmap_pml4e_index(va);
|
||
+ pml4e = &pti_pml4[pml4_idx];
|
||
+ m = NULL;
|
||
+ if (*pml4e == 0) {
|
||
+ if (pti_finalized)
|
||
+ panic("pml4 alloc after finalization\n");
|
||
+ m = pmap_pti_alloc_page();
|
||
+ if (*pml4e != 0) {
|
||
+ pmap_pti_free_page(m);
|
||
+ mphys = *pml4e & ~PAGE_MASK;
|
||
+ } else {
|
||
+ mphys = VM_PAGE_TO_PHYS(m);
|
||
+ *pml4e = mphys | X86_PG_RW | X86_PG_V;
|
||
+ }
|
||
+ } else {
|
||
+ mphys = *pml4e & ~PAGE_MASK;
|
||
+ }
|
||
+ pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va);
|
||
+ return (pdpe);
|
||
+}
|
||
+
|
||
+static void
|
||
+pmap_pti_wire_pte(void *pte)
|
||
+{
|
||
+ vm_page_t m;
|
||
+
|
||
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
|
||
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
|
||
+ m->wire_count++;
|
||
+}
|
||
+
|
||
+static void
|
||
+pmap_pti_unwire_pde(void *pde, bool only_ref)
|
||
+{
|
||
+ vm_page_t m;
|
||
+
|
||
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
|
||
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
|
||
+ MPASS(m->wire_count > 0);
|
||
+ MPASS(only_ref || m->wire_count > 1);
|
||
+ pmap_pti_free_page(m);
|
||
+}
|
||
+
|
||
+static void
|
||
+pmap_pti_unwire_pte(void *pte, vm_offset_t va)
|
||
+{
|
||
+ vm_page_t m;
|
||
+ pd_entry_t *pde;
|
||
+
|
||
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
|
||
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
|
||
+ MPASS(m->wire_count > 0);
|
||
+ if (pmap_pti_free_page(m)) {
|
||
+ pde = pmap_pti_pde(va);
|
||
+ MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
|
||
+ *pde = 0;
|
||
+ pmap_pti_unwire_pde(pde, false);
|
||
+ }
|
||
+}
|
||
+
|
||
+static pd_entry_t *
|
||
+pmap_pti_pde(vm_offset_t va)
|
||
+{
|
||
+ pdp_entry_t *pdpe;
|
||
+ pd_entry_t *pde;
|
||
+ vm_page_t m;
|
||
+ vm_pindex_t pd_idx;
|
||
+ vm_paddr_t mphys;
|
||
+
|
||
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
|
||
+
|
||
+ pdpe = pmap_pti_pdpe(va);
|
||
+ if (*pdpe == 0) {
|
||
+ m = pmap_pti_alloc_page();
|
||
+ if (*pdpe != 0) {
|
||
+ pmap_pti_free_page(m);
|
||
+ MPASS((*pdpe & X86_PG_PS) == 0);
|
||
+ mphys = *pdpe & ~PAGE_MASK;
|
||
+ } else {
|
||
+ mphys = VM_PAGE_TO_PHYS(m);
|
||
+ *pdpe = mphys | X86_PG_RW | X86_PG_V;
|
||
+ }
|
||
+ } else {
|
||
+ MPASS((*pdpe & X86_PG_PS) == 0);
|
||
+ mphys = *pdpe & ~PAGE_MASK;
|
||
+ }
|
||
+
|
||
+ pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
|
||
+ pd_idx = pmap_pde_index(va);
|
||
+ pde += pd_idx;
|
||
+ return (pde);
|
||
+}
|
||
+
|
||
+static pt_entry_t *
|
||
+pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
|
||
+{
|
||
+ pd_entry_t *pde;
|
||
+ pt_entry_t *pte;
|
||
+ vm_page_t m;
|
||
+ vm_paddr_t mphys;
|
||
+
|
||
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
|
||
+
|
||
+ pde = pmap_pti_pde(va);
|
||
+ if (unwire_pde != NULL) {
|
||
+ *unwire_pde = true;
|
||
+ pmap_pti_wire_pte(pde);
|
||
+ }
|
||
+ if (*pde == 0) {
|
||
+ m = pmap_pti_alloc_page();
|
||
+ if (*pde != 0) {
|
||
+ pmap_pti_free_page(m);
|
||
+ MPASS((*pde & X86_PG_PS) == 0);
|
||
+ mphys = *pde & ~(PAGE_MASK | pg_nx);
|
||
+ } else {
|
||
+ mphys = VM_PAGE_TO_PHYS(m);
|
||
+ *pde = mphys | X86_PG_RW | X86_PG_V;
|
||
+ if (unwire_pde != NULL)
|
||
+ *unwire_pde = false;
|
||
+ }
|
||
+ } else {
|
||
+ MPASS((*pde & X86_PG_PS) == 0);
|
||
+ mphys = *pde & ~(PAGE_MASK | pg_nx);
|
||
+ }
|
||
+
|
||
+ pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
|
||
+ pte += pmap_pte_index(va);
|
||
+
|
||
+ return (pte);
|
||
+}
|
||
+
|
||
+static void
|
||
+pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
|
||
+{
|
||
+ vm_paddr_t pa;
|
||
+ pd_entry_t *pde;
|
||
+ pt_entry_t *pte, ptev;
|
||
+ bool unwire_pde;
|
||
+
|
||
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
|
||
+
|
||
+ sva = trunc_page(sva);
|
||
+ MPASS(sva > VM_MAXUSER_ADDRESS);
|
||
+ eva = round_page(eva);
|
||
+ MPASS(sva < eva);
|
||
+ for (; sva < eva; sva += PAGE_SIZE) {
|
||
+ pte = pmap_pti_pte(sva, &unwire_pde);
|
||
+ pa = pmap_kextract(sva);
|
||
+ ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A |
|
||
+ (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
|
||
+ VM_MEMATTR_DEFAULT, FALSE);
|
||
+ if (*pte == 0) {
|
||
+ pte_store(pte, ptev);
|
||
+ pmap_pti_wire_pte(pte);
|
||
+ } else {
|
||
+ KASSERT(!pti_finalized,
|
||
+ ("pti overlap after fin %#lx %#lx %#lx",
|
||
+ sva, *pte, ptev));
|
||
+ KASSERT(*pte == ptev,
|
||
+ ("pti non-identical pte after fin %#lx %#lx %#lx",
|
||
+ sva, *pte, ptev));
|
||
+ }
|
||
+ if (unwire_pde) {
|
||
+ pde = pmap_pti_pde(sva);
|
||
+ pmap_pti_unwire_pde(pde, true);
|
||
+ }
|
||
+ }
|
||
+}
|
||
+
|
||
+void
|
||
+pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
|
||
+{
|
||
+
|
||
+ if (!pti)
|
||
+ return;
|
||
+ VM_OBJECT_WLOCK(pti_obj);
|
||
+ pmap_pti_add_kva_locked(sva, eva, exec);
|
||
+ VM_OBJECT_WUNLOCK(pti_obj);
|
||
+}
|
||
+
|
||
+void
|
||
+pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
|
||
+{
|
||
+ pt_entry_t *pte;
|
||
+ vm_offset_t va;
|
||
+
|
||
+ if (!pti)
|
||
+ return;
|
||
+ sva = rounddown2(sva, PAGE_SIZE);
|
||
+ MPASS(sva > VM_MAXUSER_ADDRESS);
|
||
+ eva = roundup2(eva, PAGE_SIZE);
|
||
+ MPASS(sva < eva);
|
||
+ VM_OBJECT_WLOCK(pti_obj);
|
||
+ for (va = sva; va < eva; va += PAGE_SIZE) {
|
||
+ pte = pmap_pti_pte(va, NULL);
|
||
+ KASSERT((*pte & X86_PG_V) != 0,
|
||
+ ("invalid pte va %#lx pte %#lx pt %#lx", va,
|
||
+ (u_long)pte, *pte));
|
||
+ pte_clear(pte);
|
||
+ pmap_pti_unwire_pte(pte, va);
|
||
+ }
|
||
+ pmap_invalidate_range(kernel_pmap, sva, eva);
|
||
+ VM_OBJECT_WUNLOCK(pti_obj);
|
||
+}
|
||
+
|
||
#include "opt_ddb.h"
|
||
#ifdef DDB
|
||
#include <ddb/ddb.h>
|
||
--- sys/amd64/amd64/support.S.orig
|
||
+++ sys/amd64/amd64/support.S
|
||
@@ -33,6 +33,7 @@
|
||
#include "opt_ddb.h"
|
||
|
||
#include <machine/asmacros.h>
|
||
+#include <machine/specialreg.h>
|
||
#include <machine/pmap.h>
|
||
|
||
#include "assym.s"
|
||
@@ -787,3 +788,115 @@
|
||
movl $EFAULT,%eax
|
||
POP_FRAME_POINTER
|
||
ret
|
||
+
|
||
+/*
|
||
+ * void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3);
|
||
+ * Invalidates address space addressed by ucr3, then returns to kcr3.
|
||
+ * Done in assembler to ensure no other memory accesses happen while
|
||
+ * on ucr3.
|
||
+ */
|
||
+ ALIGN_TEXT
|
||
+ENTRY(pmap_pti_pcid_invalidate)
|
||
+ pushfq
|
||
+ cli
|
||
+ movq %rdi,%cr3 /* to user page table */
|
||
+ movq %rsi,%cr3 /* back to kernel */
|
||
+ popfq
|
||
+ retq
|
||
+
|
||
+/*
|
||
+ * void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va);
|
||
+ * Invalidates virtual address va in address space ucr3, then returns to kcr3.
|
||
+ */
|
||
+ ALIGN_TEXT
|
||
+ENTRY(pmap_pti_pcid_invlpg)
|
||
+ pushfq
|
||
+ cli
|
||
+ movq %rdi,%cr3 /* to user page table */
|
||
+ invlpg (%rdx)
|
||
+ movq %rsi,%cr3 /* back to kernel */
|
||
+ popfq
|
||
+ retq
|
||
+
|
||
+/*
|
||
+ * void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
|
||
+ * vm_offset_t eva);
|
||
+ * Invalidates virtual addresses between sva and eva in address space ucr3,
|
||
+ * then returns to kcr3.
|
||
+ */
|
||
+ ALIGN_TEXT
|
||
+ENTRY(pmap_pti_pcid_invlrng)
|
||
+ pushfq
|
||
+ cli
|
||
+ movq %rdi,%cr3 /* to user page table */
|
||
+1: invlpg (%rdx)
|
||
+ addq $PAGE_SIZE,%rdx
|
||
+ cmpq %rdx,%rcx
|
||
+ ja 1b
|
||
+ movq %rsi,%cr3 /* back to kernel */
|
||
+ popfq
|
||
+ retq
|
||
+
|
||
+ .altmacro
|
||
+ .macro ibrs_seq_label l
|
||
+handle_ibrs_\l:
|
||
+ .endm
|
||
+ .macro ibrs_call_label l
|
||
+ call handle_ibrs_\l
|
||
+ .endm
|
||
+ .macro ibrs_seq count
|
||
+ ll=1
|
||
+ .rept \count
|
||
+ ibrs_call_label %(ll)
|
||
+ nop
|
||
+ ibrs_seq_label %(ll)
|
||
+ addq $8,%rsp
|
||
+ ll=ll+1
|
||
+ .endr
|
||
+ .endm
|
||
+
|
||
+/* all callers already saved %rax, %rdx, and %rcx */
|
||
+ENTRY(handle_ibrs_entry)
|
||
+ cmpb $0,hw_ibrs_active(%rip)
|
||
+ je 1f
|
||
+ movl $MSR_IA32_SPEC_CTRL,%ecx
|
||
+ movl $(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
|
||
+ movl $(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32,%edx
|
||
+ wrmsr
|
||
+ movb $1,PCPU(IBPB_SET)
|
||
+ testl $CPUID_STDEXT_SMEP,cpu_stdext_feature(%rip)
|
||
+ jne 1f
|
||
+ ibrs_seq 32
|
||
+1: ret
|
||
+END(handle_ibrs_entry)
|
||
+
|
||
+ENTRY(handle_ibrs_exit)
|
||
+ cmpb $0,PCPU(IBPB_SET)
|
||
+ je 1f
|
||
+ movl $MSR_IA32_SPEC_CTRL,%ecx
|
||
+ xorl %eax,%eax
|
||
+ xorl %edx,%edx
|
||
+ wrmsr
|
||
+ movb $0,PCPU(IBPB_SET)
|
||
+1: ret
|
||
+END(handle_ibrs_exit)
|
||
+
|
||
+/* registers-neutral version, but needs stack */
|
||
+ENTRY(handle_ibrs_exit_rs)
|
||
+ cmpb $0,PCPU(IBPB_SET)
|
||
+ je 1f
|
||
+ pushq %rax
|
||
+ pushq %rdx
|
||
+ pushq %rcx
|
||
+ movl $MSR_IA32_SPEC_CTRL,%ecx
|
||
+ xorl %eax,%eax
|
||
+ xorl %edx,%edx
|
||
+ wrmsr
|
||
+ popq %rcx
|
||
+ popq %rdx
|
||
+ popq %rax
|
||
+ movb $0,PCPU(IBPB_SET)
|
||
+1: ret
|
||
+END(handle_ibrs_exit_rs)
|
||
+
|
||
+ .noaltmacro
|
||
--- sys/amd64/amd64/sys_machdep.c.orig
|
||
+++ sys/amd64/amd64/sys_machdep.c
|
||
@@ -357,7 +357,9 @@
|
||
pcb = td->td_pcb;
|
||
if (pcb->pcb_tssp == NULL) {
|
||
tssp = (struct amd64tss *)kmem_malloc(kernel_arena,
|
||
- ctob(IOPAGES+1), M_WAITOK);
|
||
+ ctob(IOPAGES + 1), M_WAITOK);
|
||
+ pmap_pti_add_kva((vm_offset_t)tssp, (vm_offset_t)tssp +
|
||
+ ctob(IOPAGES + 1), false);
|
||
iomap = (char *)&tssp[1];
|
||
memset(iomap, 0xff, IOPERM_BITMAP_SIZE);
|
||
critical_enter();
|
||
@@ -452,6 +454,8 @@
|
||
struct proc_ldt *pldt, *new_ldt;
|
||
struct mdproc *mdp;
|
||
struct soft_segment_descriptor sldt;
|
||
+ vm_offset_t sva;
|
||
+ vm_size_t sz;
|
||
|
||
mtx_assert(&dt_lock, MA_OWNED);
|
||
mdp = &p->p_md;
|
||
@@ -459,13 +463,13 @@
|
||
return (mdp->md_ldt);
|
||
mtx_unlock(&dt_lock);
|
||
new_ldt = malloc(sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK);
|
||
- new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena,
|
||
- max_ldt_segment * sizeof(struct user_segment_descriptor),
|
||
- M_WAITOK | M_ZERO);
|
||
+ sz = max_ldt_segment * sizeof(struct user_segment_descriptor);
|
||
+ sva = kmem_malloc(kernel_arena, sz, M_WAITOK | M_ZERO);
|
||
+ new_ldt->ldt_base = (caddr_t)sva;
|
||
+ pmap_pti_add_kva(sva, sva + sz, false);
|
||
new_ldt->ldt_refcnt = 1;
|
||
- sldt.ssd_base = (uint64_t)new_ldt->ldt_base;
|
||
- sldt.ssd_limit = max_ldt_segment *
|
||
- sizeof(struct user_segment_descriptor) - 1;
|
||
+ sldt.ssd_base = sva;
|
||
+ sldt.ssd_limit = sz - 1;
|
||
sldt.ssd_type = SDT_SYSLDT;
|
||
sldt.ssd_dpl = SEL_KPL;
|
||
sldt.ssd_p = 1;
|
||
@@ -475,8 +479,8 @@
|
||
mtx_lock(&dt_lock);
|
||
pldt = mdp->md_ldt;
|
||
if (pldt != NULL && !force) {
|
||
- kmem_free(kernel_arena, (vm_offset_t)new_ldt->ldt_base,
|
||
- max_ldt_segment * sizeof(struct user_segment_descriptor));
|
||
+ pmap_pti_remove_kva(sva, sva + sz);
|
||
+ kmem_free(kernel_arena, sva, sz);
|
||
free(new_ldt, M_SUBPROC);
|
||
return (pldt);
|
||
}
|
||
@@ -518,10 +522,14 @@
|
||
static void
|
||
user_ldt_derefl(struct proc_ldt *pldt)
|
||
{
|
||
+ vm_offset_t sva;
|
||
+ vm_size_t sz;
|
||
|
||
if (--pldt->ldt_refcnt == 0) {
|
||
- kmem_free(kernel_arena, (vm_offset_t)pldt->ldt_base,
|
||
- max_ldt_segment * sizeof(struct user_segment_descriptor));
|
||
+ sva = (vm_offset_t)pldt->ldt_base;
|
||
+ sz = max_ldt_segment * sizeof(struct user_segment_descriptor);
|
||
+ pmap_pti_remove_kva(sva, sva + sz);
|
||
+ kmem_free(kernel_arena, sva, sz);
|
||
free(pldt, M_SUBPROC);
|
||
}
|
||
}
|
||
--- sys/amd64/amd64/trap.c.orig
|
||
+++ sys/amd64/amd64/trap.c
|
||
@@ -218,11 +218,6 @@
|
||
#endif
|
||
}
|
||
|
||
- if (type == T_MCHK) {
|
||
- mca_intr();
|
||
- goto out;
|
||
- }
|
||
-
|
||
if ((frame->tf_rflags & PSL_I) == 0) {
|
||
/*
|
||
* Buggy application or kernel code has disabled
|
||
@@ -452,9 +447,28 @@
|
||
* problem here and not have to check all the
|
||
* selectors and pointers when the user changes
|
||
* them.
|
||
+ *
|
||
+			 * In the PTI case, the IRETQ faulted while the
|
||
+			 * kernel was on the pti stack, and the exception
|
||
+			 * frame records an %rsp value pointing into that
|
||
+			 * stack.  If we returned normally to
|
||
+			 * doreti_iret_fault, the trapframe would be
|
||
+			 * reconstructed on the pti stack, and calltrap()
|
||
+			 * would run on it as well.  Because the pti stack
|
||
+			 * is very small, the kernel would not survive for
|
||
+			 * long.  Switch to the normal thread stack for
|
||
+			 * the trap handling.
|
||
+ *
|
||
+ * Magic '5' is the number of qwords occupied by
|
||
+ * the hardware trap frame.
|
||
*/
|
||
if (frame->tf_rip == (long)doreti_iret) {
|
||
frame->tf_rip = (long)doreti_iret_fault;
|
||
+ if (pti && frame->tf_rsp == (uintptr_t)PCPU_PTR(
|
||
+ pti_stack) + (PC_PTI_STACK_SZ - 5) *
|
||
+ sizeof(register_t))
|
||
+ frame->tf_rsp = PCPU_GET(rsp0) - 5 *
|
||
+ sizeof(register_t);
|
||
goto out;
|
||
}
|
||
if (frame->tf_rip == (long)ld_ds) {
|
||
@@ -694,6 +708,17 @@
|
||
}
|
||
|
||
/*
|
||
+	 * If NX protection of the usermode portion of the kernel page
|
||
+	 * tables caused the trap, panic.
|
||
+ */
|
||
+ if (pti && usermode && pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W |
|
||
+ PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) &&
|
||
+ (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK)==
|
||
+ (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK))
|
||
+ panic("PTI: pid %d comm %s tf_err %#lx\n", p->p_pid,
|
||
+ p->p_comm, frame->tf_err);
|
||
+
|
||
+ /*
|
||
* PGEX_I is defined only if the execute disable bit capability is
|
||
* supported and enabled.
|
||
*/
|
||
--- sys/amd64/amd64/vm_machdep.c.orig
|
||
+++ sys/amd64/amd64/vm_machdep.c
|
||
@@ -339,6 +339,8 @@
|
||
* Clean TSS/iomap
|
||
*/
|
||
if (pcb->pcb_tssp != NULL) {
|
||
+ pmap_pti_remove_kva((vm_offset_t)pcb->pcb_tssp,
|
||
+ (vm_offset_t)pcb->pcb_tssp + ctob(IOPAGES + 1));
|
||
kmem_free(kernel_arena, (vm_offset_t)pcb->pcb_tssp,
|
||
ctob(IOPAGES + 1));
|
||
pcb->pcb_tssp = NULL;
|
||
--- sys/amd64/ia32/ia32_exception.S.orig
|
||
+++ sys/amd64/ia32/ia32_exception.S
|
||
@@ -40,24 +40,27 @@
|
||
* that it originated in supervisor mode and skip the swapgs.
|
||
*/
|
||
SUPERALIGN_TEXT
|
||
+IDTVEC(int0x80_syscall_pti)
|
||
+ PTI_UENTRY has_err=0
|
||
+ jmp int0x80_syscall_common
|
||
+ SUPERALIGN_TEXT
|
||
IDTVEC(int0x80_syscall)
|
||
swapgs
|
||
+int0x80_syscall_common:
|
||
pushq $2 /* sizeof "int 0x80" */
|
||
subq $TF_ERR,%rsp /* skip over tf_trapno */
|
||
movq %rdi,TF_RDI(%rsp)
|
||
movq PCPU(CURPCB),%rdi
|
||
andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi)
|
||
- movw %fs,TF_FS(%rsp)
|
||
- movw %gs,TF_GS(%rsp)
|
||
- movw %es,TF_ES(%rsp)
|
||
- movw %ds,TF_DS(%rsp)
|
||
+ SAVE_SEGS
|
||
+ movq %rax,TF_RAX(%rsp)
|
||
+ movq %rdx,TF_RDX(%rsp)
|
||
+ movq %rcx,TF_RCX(%rsp)
|
||
+ call handle_ibrs_entry
|
||
sti
|
||
movq %rsi,TF_RSI(%rsp)
|
||
- movq %rdx,TF_RDX(%rsp)
|
||
- movq %rcx,TF_RCX(%rsp)
|
||
movq %r8,TF_R8(%rsp)
|
||
movq %r9,TF_R9(%rsp)
|
||
- movq %rax,TF_RAX(%rsp)
|
||
movq %rbx,TF_RBX(%rsp)
|
||
movq %rbp,TF_RBP(%rsp)
|
||
movq %r10,TF_R10(%rsp)
|
||
--- sys/amd64/ia32/ia32_syscall.c.orig
|
||
+++ sys/amd64/ia32/ia32_syscall.c
|
||
@@ -93,7 +93,8 @@
|
||
|
||
#define IDTVEC(name) __CONCAT(X,name)
|
||
|
||
-extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(rsvd);
|
||
+extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(int0x80_syscall_pti),
|
||
+ IDTVEC(rsvd), IDTVEC(rsvd_pti);
|
||
|
||
void ia32_syscall(struct trapframe *frame); /* Called from asm code */
|
||
|
||
@@ -205,7 +206,8 @@
|
||
ia32_syscall_enable(void *dummy)
|
||
{
|
||
|
||
- setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0);
|
||
+ setidt(IDT_SYSCALL, pti ? &IDTVEC(int0x80_syscall_pti) :
|
||
+ &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0);
|
||
}
|
||
|
||
static void
|
||
@@ -212,7 +214,8 @@
|
||
ia32_syscall_disable(void *dummy)
|
||
{
|
||
|
||
- setidt(IDT_SYSCALL, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
|
||
+ setidt(IDT_SYSCALL, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd),
|
||
+ SDT_SYSIGT, SEL_KPL, 0);
|
||
}
|
||
|
||
SYSINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_enable, NULL);
|
||
--- sys/amd64/include/asmacros.h.orig
|
||
+++ sys/amd64/include/asmacros.h
|
||
@@ -1,7 +1,15 @@
|
||
+/* -*- mode: asm -*- */
|
||
/*-
|
||
* Copyright (c) 1993 The Regents of the University of California.
|
||
* All rights reserved.
|
||
*
|
||
+ * Copyright (c) 2018 The FreeBSD Foundation
|
||
+ * All rights reserved.
|
||
+ *
|
||
+ * Portions of this software were developed by
|
||
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
|
||
+ * the FreeBSD Foundation.
|
||
+ *
|
||
* Redistribution and use in source and binary forms, with or without
|
||
* modification, are permitted provided that the following conditions
|
||
* are met:
|
||
@@ -144,70 +152,135 @@
|
||
|
||
#ifdef LOCORE
|
||
/*
|
||
+ * Access per-CPU data.
|
||
+ */
|
||
+#define PCPU(member) %gs:PC_ ## member
|
||
+#define PCPU_ADDR(member, reg) \
|
||
+ movq %gs:PC_PRVSPACE, reg ; \
|
||
+ addq $PC_ ## member, reg
|
||
+
|
||
+/*
|
||
* Convenience macro for declaring interrupt entry points.
|
||
*/
|
||
#define IDTVEC(name) ALIGN_TEXT; .globl __CONCAT(X,name); \
|
||
.type __CONCAT(X,name),@function; __CONCAT(X,name):
|
||
|
||
-/*
|
||
- * Macros to create and destroy a trap frame.
|
||
- */
|
||
-#define PUSH_FRAME \
|
||
- subq $TF_RIP,%rsp ; /* skip dummy tf_err and tf_trapno */ \
|
||
- testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \
|
||
- jz 1f ; /* Yes, dont swapgs again */ \
|
||
- swapgs ; \
|
||
-1: movq %rdi,TF_RDI(%rsp) ; \
|
||
- movq %rsi,TF_RSI(%rsp) ; \
|
||
- movq %rdx,TF_RDX(%rsp) ; \
|
||
- movq %rcx,TF_RCX(%rsp) ; \
|
||
- movq %r8,TF_R8(%rsp) ; \
|
||
- movq %r9,TF_R9(%rsp) ; \
|
||
- movq %rax,TF_RAX(%rsp) ; \
|
||
- movq %rbx,TF_RBX(%rsp) ; \
|
||
- movq %rbp,TF_RBP(%rsp) ; \
|
||
- movq %r10,TF_R10(%rsp) ; \
|
||
- movq %r11,TF_R11(%rsp) ; \
|
||
- movq %r12,TF_R12(%rsp) ; \
|
||
- movq %r13,TF_R13(%rsp) ; \
|
||
- movq %r14,TF_R14(%rsp) ; \
|
||
- movq %r15,TF_R15(%rsp) ; \
|
||
- movw %fs,TF_FS(%rsp) ; \
|
||
- movw %gs,TF_GS(%rsp) ; \
|
||
- movw %es,TF_ES(%rsp) ; \
|
||
- movw %ds,TF_DS(%rsp) ; \
|
||
- movl $TF_HASSEGS,TF_FLAGS(%rsp) ; \
|
||
+ .macro SAVE_SEGS
|
||
+ movw %fs,TF_FS(%rsp)
|
||
+ movw %gs,TF_GS(%rsp)
|
||
+ movw %es,TF_ES(%rsp)
|
||
+ movw %ds,TF_DS(%rsp)
|
||
+ .endm
|
||
+
|
||
+ .macro MOVE_STACKS qw
|
||
+ .L.offset=0
|
||
+ .rept \qw
|
||
+ movq .L.offset(%rsp),%rdx
|
||
+ movq %rdx,.L.offset(%rax)
|
||
+ .L.offset=.L.offset+8
|
||
+ .endr
|
||
+ .endm
|
||
+
|
||
+ .macro PTI_UUENTRY has_err
|
||
+ movq PCPU(KCR3),%rax
|
||
+ movq %rax,%cr3
|
||
+ movq PCPU(RSP0),%rax
|
||
+ subq $PTI_SIZE,%rax
|
||
+ MOVE_STACKS ((PTI_SIZE / 8) - 1 + \has_err)
|
||
+ movq %rax,%rsp
|
||
+ popq %rdx
|
||
+ popq %rax
|
||
+ .endm
|
||
+
|
||
+ .macro PTI_UENTRY has_err
|
||
+ swapgs
|
||
+ pushq %rax
|
||
+ pushq %rdx
|
||
+ PTI_UUENTRY \has_err
|
||
+ .endm
|
||
+
|
||
+ .macro PTI_ENTRY name, cont, has_err=0
|
||
+ ALIGN_TEXT
|
||
+ .globl X\name\()_pti
|
||
+ .type X\name\()_pti,@function
|
||
+X\name\()_pti:
|
||
+ /* %rax, %rdx and possibly err not yet pushed */
|
||
+ testb $SEL_RPL_MASK,PTI_CS-(2+1-\has_err)*8(%rsp)
|
||
+ jz \cont
|
||
+ PTI_UENTRY \has_err
|
||
+ swapgs
|
||
+ jmp \cont
|
||
+ .endm
|
||
+
|
||
+ .macro PTI_INTRENTRY vec_name
|
||
+ SUPERALIGN_TEXT
|
||
+ .globl X\vec_name\()_pti
|
||
+ .type X\vec_name\()_pti,@function
|
||
+X\vec_name\()_pti:
|
||
+ testb $SEL_RPL_MASK,PTI_CS-3*8(%rsp) /* err, %rax, %rdx not pushed */
|
||
+ jz \vec_name\()_u
|
||
+ PTI_UENTRY has_err=0
|
||
+ jmp \vec_name\()_u
|
||
+ .endm
|
||
+
|
||
+ .macro INTR_PUSH_FRAME vec_name
|
||
+ SUPERALIGN_TEXT
|
||
+ .globl X\vec_name
|
||
+ .type X\vec_name,@function
|
||
+X\vec_name:
|
||
+ testb $SEL_RPL_MASK,PTI_CS-3*8(%rsp) /* come from kernel? */
|
||
+ jz \vec_name\()_u /* Yes, dont swapgs again */
|
||
+ swapgs
|
||
+\vec_name\()_u:
|
||
+ subq $TF_RIP,%rsp /* skip dummy tf_err and tf_trapno */
|
||
+ movq %rdi,TF_RDI(%rsp)
|
||
+ movq %rsi,TF_RSI(%rsp)
|
||
+ movq %rdx,TF_RDX(%rsp)
|
||
+ movq %rcx,TF_RCX(%rsp)
|
||
+ movq %r8,TF_R8(%rsp)
|
||
+ movq %r9,TF_R9(%rsp)
|
||
+ movq %rax,TF_RAX(%rsp)
|
||
+ movq %rbx,TF_RBX(%rsp)
|
||
+ movq %rbp,TF_RBP(%rsp)
|
||
+ movq %r10,TF_R10(%rsp)
|
||
+ movq %r11,TF_R11(%rsp)
|
||
+ movq %r12,TF_R12(%rsp)
|
||
+ movq %r13,TF_R13(%rsp)
|
||
+ movq %r14,TF_R14(%rsp)
|
||
+ movq %r15,TF_R15(%rsp)
|
||
+ SAVE_SEGS
|
||
+ movl $TF_HASSEGS,TF_FLAGS(%rsp)
|
||
cld
|
||
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* come from kernel ? */
|
||
+ jz 1f /* yes, leave PCB_FULL_IRET alone */
|
||
+ movq PCPU(CURPCB),%r8
|
||
+ andl $~PCB_FULL_IRET,PCB_FLAGS(%r8)
|
||
+1:
|
||
+ .endm
|
||
|
||
-#define POP_FRAME \
|
||
- movq TF_RDI(%rsp),%rdi ; \
|
||
- movq TF_RSI(%rsp),%rsi ; \
|
||
- movq TF_RDX(%rsp),%rdx ; \
|
||
- movq TF_RCX(%rsp),%rcx ; \
|
||
- movq TF_R8(%rsp),%r8 ; \
|
||
- movq TF_R9(%rsp),%r9 ; \
|
||
- movq TF_RAX(%rsp),%rax ; \
|
||
- movq TF_RBX(%rsp),%rbx ; \
|
||
- movq TF_RBP(%rsp),%rbp ; \
|
||
- movq TF_R10(%rsp),%r10 ; \
|
||
- movq TF_R11(%rsp),%r11 ; \
|
||
- movq TF_R12(%rsp),%r12 ; \
|
||
- movq TF_R13(%rsp),%r13 ; \
|
||
- movq TF_R14(%rsp),%r14 ; \
|
||
- movq TF_R15(%rsp),%r15 ; \
|
||
- testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \
|
||
- jz 1f ; /* keep kernel GS.base */ \
|
||
- cli ; \
|
||
- swapgs ; \
|
||
-1: addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */
|
||
+ .macro INTR_HANDLER vec_name
|
||
+ .text
|
||
+ PTI_INTRENTRY \vec_name
|
||
+ INTR_PUSH_FRAME \vec_name
|
||
+ .endm
|
||
|
||
-/*
|
||
- * Access per-CPU data.
|
||
- */
|
||
-#define PCPU(member) %gs:PC_ ## member
|
||
-#define PCPU_ADDR(member, reg) \
|
||
- movq %gs:PC_PRVSPACE, reg ; \
|
||
- addq $PC_ ## member, reg
|
||
+ .macro RESTORE_REGS
|
||
+ movq TF_RDI(%rsp),%rdi
|
||
+ movq TF_RSI(%rsp),%rsi
|
||
+ movq TF_RDX(%rsp),%rdx
|
||
+ movq TF_RCX(%rsp),%rcx
|
||
+ movq TF_R8(%rsp),%r8
|
||
+ movq TF_R9(%rsp),%r9
|
||
+ movq TF_RAX(%rsp),%rax
|
||
+ movq TF_RBX(%rsp),%rbx
|
||
+ movq TF_RBP(%rsp),%rbp
|
||
+ movq TF_R10(%rsp),%r10
|
||
+ movq TF_R11(%rsp),%r11
|
||
+ movq TF_R12(%rsp),%r12
|
||
+ movq TF_R13(%rsp),%r13
|
||
+ movq TF_R14(%rsp),%r14
|
||
+ movq TF_R15(%rsp),%r15
|
||
+ .endm
|
||
|
||
#endif /* LOCORE */
|
||
|
||
--- sys/amd64/include/frame.h.orig
|
||
+++ sys/amd64/include/frame.h
|
||
@@ -1,6 +1,50 @@
|
||
/*-
|
||
- * This file is in the public domain.
|
||
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
||
+ *
|
||
+ * Copyright (c) 2018 The FreeBSD Foundation
|
||
+ * All rights reserved.
|
||
+ *
|
||
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
|
||
+ * under sponsorship from the FreeBSD Foundation.
|
||
+ *
|
||
+ * Redistribution and use in source and binary forms, with or without
|
||
+ * modification, are permitted provided that the following conditions
|
||
+ * are met:
|
||
+ * 1. Redistributions of source code must retain the above copyright
|
||
+ * notice, this list of conditions and the following disclaimer.
|
||
+ * 2. Redistributions in binary form must reproduce the above copyright
|
||
+ * notice, this list of conditions and the following disclaimer in the
|
||
+ * documentation and/or other materials provided with the distribution.
|
||
+ *
|
||
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||
+ * SUCH DAMAGE.
|
||
+ *
|
||
+ * $FreeBSD$
|
||
*/
|
||
-/* $FreeBSD: releng/11.1/sys/amd64/include/frame.h 247047 2013-02-20 17:39:52Z kib $ */
|
||
|
||
+#ifndef _AMD64_FRAME_H
|
||
+#define _AMD64_FRAME_H
|
||
+
|
||
#include <x86/frame.h>
|
||
+
|
||
+struct pti_frame {
|
||
+ register_t pti_rdx;
|
||
+ register_t pti_rax;
|
||
+ register_t pti_err;
|
||
+ register_t pti_rip;
|
||
+ register_t pti_cs;
|
||
+ register_t pti_rflags;
|
||
+ register_t pti_rsp;
|
||
+ register_t pti_ss;
|
||
+};
|
||
+
|
||
+#endif
|
||
--- sys/amd64/include/intr_machdep.h.orig
|
||
+++ sys/amd64/include/intr_machdep.h
|
||
@@ -136,7 +136,7 @@
|
||
|
||
/*
|
||
* The following data structure holds per-cpu data, and is placed just
|
||
- * above the top of the space used for the NMI stack.
|
||
+ * above the top of the space used for the NMI and MC# stacks.
|
||
*/
|
||
struct nmi_pcpu {
|
||
register_t np_pcpu;
|
||
--- sys/amd64/include/md_var.h.orig
|
||
+++ sys/amd64/include/md_var.h
|
||
@@ -35,9 +35,17 @@
|
||
#include <x86/x86_var.h>
|
||
|
||
extern uint64_t *vm_page_dump;
|
||
+extern int hw_ibrs_disable;
|
||
|
||
+/*
|
||
+ * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its
|
||
+ * value is the physical address at which the kernel is loaded.
|
||
+ */
|
||
+extern char kernphys[];
|
||
+
|
||
struct savefpu;
|
||
|
||
+void amd64_conf_fast_syscall(void);
|
||
void amd64_db_resume_dbreg(void);
|
||
void amd64_syscall(struct thread *td, int traced);
|
||
void doreti_iret(void) __asm(__STRING(doreti_iret));
|
||
--- sys/amd64/include/pcb.h.orig
|
||
+++ sys/amd64/include/pcb.h
|
||
@@ -90,7 +90,7 @@
|
||
/* copyin/out fault recovery */
|
||
caddr_t pcb_onfault;
|
||
|
||
- uint64_t pcb_pad0;
|
||
+ uint64_t pcb_saved_ucr3;
|
||
|
||
/* local tss, with i/o bitmap; NULL for common */
|
||
struct amd64tss *pcb_tssp;
|
||
--- sys/amd64/include/pcpu.h.orig
|
||
+++ sys/amd64/include/pcpu.h
|
||
@@ -33,6 +33,7 @@
|
||
#error "sys/cdefs.h is a prerequisite for this file"
|
||
#endif
|
||
|
||
+#define PC_PTI_STACK_SZ 16
|
||
/*
|
||
* The SMP parts are setup in pmap.c and locore.s for the BSP, and
|
||
* mp_machdep.c sets up the data for the AP's to "see" when they awake.
|
||
@@ -46,8 +47,12 @@
|
||
struct pmap *pc_curpmap; \
|
||
struct amd64tss *pc_tssp; /* TSS segment active on CPU */ \
|
||
struct amd64tss *pc_commontssp;/* Common TSS for the CPU */ \
|
||
+ uint64_t pc_kcr3; \
|
||
+ uint64_t pc_ucr3; \
|
||
+ uint64_t pc_saved_ucr3; \
|
||
register_t pc_rsp0; \
|
||
register_t pc_scratch_rsp; /* User %rsp in syscall */ \
|
||
+ register_t pc_scratch_rax; \
|
||
u_int pc_apic_id; \
|
||
u_int pc_acpi_id; /* ACPI CPU id */ \
|
||
/* Pointer to the CPU %fs descriptor */ \
|
||
@@ -61,12 +66,14 @@
|
||
uint64_t pc_pm_save_cnt; \
|
||
u_int pc_cmci_mask; /* MCx banks for CMCI */ \
|
||
uint64_t pc_dbreg[16]; /* ddb debugging regs */ \
|
||
+ uint64_t pc_pti_stack[PC_PTI_STACK_SZ]; \
|
||
int pc_dbreg_cmd; /* ddb debugging reg cmd */ \
|
||
u_int pc_vcpu_id; /* Xen vCPU ID */ \
|
||
uint32_t pc_pcid_next; \
|
||
uint32_t pc_pcid_gen; \
|
||
uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \
|
||
- char __pad[145] /* be divisor of PAGE_SIZE \
|
||
+ uint32_t pc_ibpb_set; \
|
||
+ char __pad[96] /* be divisor of PAGE_SIZE \
|
||
after cache alignment */
|
||
|
||
#define PC_DBREG_CMD_NONE 0
|
||
--- sys/amd64/include/pmap.h.orig
|
||
+++ sys/amd64/include/pmap.h
|
||
@@ -223,7 +223,11 @@
|
||
#define PMAP_PCID_NONE 0xffffffff
|
||
#define PMAP_PCID_KERN 0
|
||
#define PMAP_PCID_OVERMAX 0x1000
|
||
+#define PMAP_PCID_OVERMAX_KERN 0x800
|
||
+#define PMAP_PCID_USER_PT 0x800
|
||
|
||
+#define PMAP_NO_CR3 (~0UL)
|
||
+
|
||
#ifndef LOCORE
|
||
|
||
#include <sys/queue.h>
|
||
@@ -313,7 +317,9 @@
|
||
struct pmap {
|
||
struct mtx pm_mtx;
|
||
pml4_entry_t *pm_pml4; /* KVA of level 4 page table */
|
||
+ pml4_entry_t *pm_pml4u; /* KVA of user l4 page table */
|
||
uint64_t pm_cr3;
|
||
+ uint64_t pm_ucr3;
|
||
TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */
|
||
cpuset_t pm_active; /* active on cpus */
|
||
enum pmap_type pm_type; /* regular or nested tables */
|
||
@@ -419,6 +425,12 @@
|
||
void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num);
|
||
boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
|
||
void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
|
||
+void pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec);
|
||
+void pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva);
|
||
+void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3);
|
||
+void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va);
|
||
+void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
|
||
+ vm_offset_t eva);
|
||
#endif /* _KERNEL */
|
||
|
||
/* Return various clipped indexes for a given VA */
|
||
--- sys/amd64/include/smp.h.orig
|
||
+++ sys/amd64/include/smp.h
|
||
@@ -28,12 +28,36 @@
|
||
|
||
/* IPI handlers */
|
||
inthand_t
|
||
+ IDTVEC(justreturn), /* interrupt CPU with minimum overhead */
|
||
+ IDTVEC(justreturn1_pti),
|
||
+ IDTVEC(invltlb_pti),
|
||
+ IDTVEC(invltlb_pcid_pti),
|
||
IDTVEC(invltlb_pcid), /* TLB shootdowns - global, pcid */
|
||
- IDTVEC(invltlb_invpcid),/* TLB shootdowns - global, invpcid */
|
||
- IDTVEC(justreturn); /* interrupt CPU with minimum overhead */
|
||
+ IDTVEC(invltlb_invpcid_pti_pti),
|
||
+ IDTVEC(invltlb_invpcid_nopti),
|
||
+ IDTVEC(invlpg_pti),
|
||
+ IDTVEC(invlpg_invpcid_pti),
|
||
+ IDTVEC(invlpg_invpcid),
|
||
+ IDTVEC(invlpg_pcid_pti),
|
||
+ IDTVEC(invlpg_pcid),
|
||
+ IDTVEC(invlrng_pti),
|
||
+ IDTVEC(invlrng_invpcid_pti),
|
||
+ IDTVEC(invlrng_invpcid),
|
||
+ IDTVEC(invlrng_pcid_pti),
|
||
+ IDTVEC(invlrng_pcid),
|
||
+ IDTVEC(invlcache_pti),
|
||
+ IDTVEC(ipi_intr_bitmap_handler_pti),
|
||
+ IDTVEC(cpustop_pti),
|
||
+ IDTVEC(cpususpend_pti),
|
||
+ IDTVEC(rendezvous_pti);
|
||
|
||
void invltlb_pcid_handler(void);
|
||
void invltlb_invpcid_handler(void);
|
||
+void invltlb_invpcid_pti_handler(void);
|
||
+void invlpg_invpcid_handler(void);
|
||
+void invlpg_pcid_handler(void);
|
||
+void invlrng_invpcid_handler(void);
|
||
+void invlrng_pcid_handler(void);
|
||
int native_start_all_aps(void);
|
||
|
||
#endif /* !LOCORE */
|
||
--- sys/amd64/vmm/intel/vmx.c.orig
|
||
+++ sys/amd64/vmm/intel/vmx.c
|
||
@@ -693,7 +693,8 @@
|
||
MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
|
||
&tmp);
|
||
if (error == 0) {
|
||
- pirvec = lapic_ipi_alloc(&IDTVEC(justreturn));
|
||
+ pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
|
||
+ &IDTVEC(justreturn));
|
||
if (pirvec < 0) {
|
||
if (bootverbose) {
|
||
printf("vmx_init: unable to allocate "
|
||
--- sys/amd64/vmm/vmm.c.orig
|
||
+++ sys/amd64/vmm/vmm.c
|
||
@@ -55,6 +55,7 @@
|
||
#include <machine/cpu.h>
|
||
#include <machine/pcb.h>
|
||
#include <machine/smp.h>
|
||
+#include <machine/md_var.h>
|
||
#include <x86/psl.h>
|
||
#include <x86/apicreg.h>
|
||
|
||
@@ -325,7 +326,8 @@
|
||
|
||
vmm_host_state_init();
|
||
|
||
- vmm_ipinum = lapic_ipi_alloc(&IDTVEC(justreturn));
|
||
+ vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
|
||
+ &IDTVEC(justreturn));
|
||
if (vmm_ipinum < 0)
|
||
vmm_ipinum = IPI_AST;
|
||
|
||
--- sys/conf/Makefile.amd64.orig
|
||
+++ sys/conf/Makefile.amd64
|
||
@@ -39,6 +39,7 @@
|
||
|
||
ASM_CFLAGS.acpi_wakecode.S= ${CLANG_NO_IAS34}
|
||
ASM_CFLAGS.mpboot.S= ${CLANG_NO_IAS34}
|
||
+ASM_CFLAGS.support.S= ${CLANG_NO_IAS}
|
||
|
||
%BEFORE_DEPEND
|
||
|
||
--- sys/dev/cpuctl/cpuctl.c.orig
|
||
+++ sys/dev/cpuctl/cpuctl.c
|
||
@@ -71,6 +71,7 @@
|
||
struct thread *td);
|
||
static int cpuctl_do_cpuid_count(int cpu, cpuctl_cpuid_count_args_t *data,
|
||
struct thread *td);
|
||
+static int cpuctl_do_eval_cpu_features(int cpu, struct thread *td);
|
||
static int cpuctl_do_update(int cpu, cpuctl_update_args_t *data,
|
||
struct thread *td);
|
||
static int update_intel(int cpu, cpuctl_update_args_t *args,
|
||
@@ -157,7 +158,8 @@
|
||
}
|
||
/* Require write flag for "write" requests. */
|
||
if ((cmd == CPUCTL_MSRCBIT || cmd == CPUCTL_MSRSBIT ||
|
||
- cmd == CPUCTL_UPDATE || cmd == CPUCTL_WRMSR) &&
|
||
+ cmd == CPUCTL_UPDATE || cmd == CPUCTL_WRMSR ||
|
||
+ cmd == CPUCTL_EVAL_CPU_FEATURES) &&
|
||
(flags & FWRITE) == 0)
|
||
return (EPERM);
|
||
switch (cmd) {
|
||
@@ -185,6 +187,9 @@
|
||
ret = cpuctl_do_cpuid_count(cpu,
|
||
(cpuctl_cpuid_count_args_t *)data, td);
|
||
break;
|
||
+ case CPUCTL_EVAL_CPU_FEATURES:
|
||
+ ret = cpuctl_do_eval_cpu_features(cpu, td);
|
||
+ break;
|
||
default:
|
||
ret = EINVAL;
|
||
break;
|
||
@@ -502,6 +507,30 @@
|
||
return (ret);
|
||
}
|
||
|
||
+static int
|
||
+cpuctl_do_eval_cpu_features(int cpu, struct thread *td)
|
||
+{
|
||
+ int is_bound = 0;
|
||
+ int oldcpu;
|
||
+
|
||
+ KASSERT(cpu >= 0 && cpu <= mp_maxid,
|
||
+ ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
|
||
+
|
||
+#ifdef __i386__
|
||
+ if (cpu_id == 0)
|
||
+ return (ENODEV);
|
||
+#endif
|
||
+ oldcpu = td->td_oncpu;
|
||
+ is_bound = cpu_sched_is_bound(td);
|
||
+ set_cpu(cpu, td);
|
||
+ identify_cpu1();
|
||
+ identify_cpu2();
|
||
+ hw_ibrs_recalculate();
|
||
+ restore_cpu(oldcpu, is_bound, td);
|
||
+ printcpuinfo();
|
||
+ return (0);
|
||
+}
|
||
+
|
||
int
|
||
cpuctl_open(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
|
||
{
|
||
--- sys/dev/hyperv/vmbus/amd64/vmbus_vector.S.orig
|
||
+++ sys/dev/hyperv/vmbus/amd64/vmbus_vector.S
|
||
@@ -26,11 +26,11 @@
|
||
* $FreeBSD$
|
||
*/
|
||
|
||
+#include "assym.s"
|
||
+
|
||
#include <machine/asmacros.h>
|
||
#include <machine/specialreg.h>
|
||
|
||
-#include "assym.s"
|
||
-
|
||
/*
|
||
* This is the Hyper-V vmbus channel direct callback interrupt.
|
||
* Only used when it is running on Hyper-V.
|
||
@@ -37,8 +37,7 @@
|
||
*/
|
||
.text
|
||
SUPERALIGN_TEXT
|
||
-IDTVEC(vmbus_isr)
|
||
- PUSH_FRAME
|
||
+ INTR_HANDLER vmbus_isr
|
||
FAKE_MCOUNT(TF_RIP(%rsp))
|
||
movq %rsp, %rdi
|
||
call vmbus_handle_intr
|
||
--- sys/dev/hyperv/vmbus/i386/vmbus_vector.S.orig
|
||
+++ sys/dev/hyperv/vmbus/i386/vmbus_vector.S
|
||
@@ -37,6 +37,7 @@
|
||
*/
|
||
.text
|
||
SUPERALIGN_TEXT
|
||
+IDTVEC(vmbus_isr_pti)
|
||
IDTVEC(vmbus_isr)
|
||
PUSH_FRAME
|
||
SET_KERNEL_SREGS
|
||
--- sys/dev/hyperv/vmbus/vmbus.c.orig
|
||
+++ sys/dev/hyperv/vmbus/vmbus.c
|
||
@@ -46,6 +46,7 @@
|
||
|
||
#include <machine/bus.h>
|
||
#include <machine/intr_machdep.h>
|
||
+#include <machine/md_var.h>
|
||
#include <machine/resource.h>
|
||
#include <x86/include/apicvar.h>
|
||
|
||
@@ -128,7 +129,7 @@
|
||
|
||
static struct vmbus_softc *vmbus_sc;
|
||
|
||
-extern inthand_t IDTVEC(vmbus_isr);
|
||
+extern inthand_t IDTVEC(vmbus_isr), IDTVEC(vmbus_isr_pti);
|
||
|
||
static const uint32_t vmbus_version[] = {
|
||
VMBUS_VERSION_WIN8_1,
|
||
@@ -928,7 +929,8 @@
|
||
* All Hyper-V ISR required resources are setup, now let's find a
|
||
* free IDT vector for Hyper-V ISR and set it up.
|
||
*/
|
||
- sc->vmbus_idtvec = lapic_ipi_alloc(IDTVEC(vmbus_isr));
|
||
+ sc->vmbus_idtvec = lapic_ipi_alloc(pti ? IDTVEC(vmbus_isr_pti) :
|
||
+ IDTVEC(vmbus_isr));
|
||
if (sc->vmbus_idtvec < 0) {
|
||
device_printf(sc->vmbus_dev, "cannot find free IDT vector\n");
|
||
return ENXIO;
|
||
--- sys/i386/i386/apic_vector.s.orig
|
||
+++ sys/i386/i386/apic_vector.s
|
||
@@ -70,6 +70,7 @@
|
||
#define ISR_VEC(index, vec_name) \
|
||
.text ; \
|
||
SUPERALIGN_TEXT ; \
|
||
+IDTVEC(vec_name ## _pti) ; \
|
||
IDTVEC(vec_name) ; \
|
||
PUSH_FRAME ; \
|
||
SET_KERNEL_SREGS ; \
|
||
@@ -123,6 +124,7 @@
|
||
*/
|
||
.text
|
||
SUPERALIGN_TEXT
|
||
+IDTVEC(timerint_pti)
|
||
IDTVEC(timerint)
|
||
PUSH_FRAME
|
||
SET_KERNEL_SREGS
|
||
@@ -139,6 +141,7 @@
|
||
*/
|
||
.text
|
||
SUPERALIGN_TEXT
|
||
+IDTVEC(cmcint_pti)
|
||
IDTVEC(cmcint)
|
||
PUSH_FRAME
|
||
SET_KERNEL_SREGS
|
||
@@ -153,6 +156,7 @@
|
||
*/
|
||
.text
|
||
SUPERALIGN_TEXT
|
||
+IDTVEC(errorint_pti)
|
||
IDTVEC(errorint)
|
||
PUSH_FRAME
|
||
SET_KERNEL_SREGS
|
||
--- sys/i386/i386/atpic_vector.s.orig
|
||
+++ sys/i386/i386/atpic_vector.s
|
||
@@ -46,6 +46,7 @@
|
||
#define INTR(irq_num, vec_name) \
|
||
.text ; \
|
||
SUPERALIGN_TEXT ; \
|
||
+IDTVEC(vec_name ##_pti) ; \
|
||
IDTVEC(vec_name) ; \
|
||
PUSH_FRAME ; \
|
||
SET_KERNEL_SREGS ; \
|
||
--- sys/i386/i386/exception.s.orig
|
||
+++ sys/i386/i386/exception.s
|
||
@@ -133,6 +133,7 @@
|
||
TRAP(T_PAGEFLT)
|
||
IDTVEC(mchk)
|
||
pushl $0; TRAP(T_MCHK)
|
||
+IDTVEC(rsvd_pti)
|
||
IDTVEC(rsvd)
|
||
pushl $0; TRAP(T_RESERVED)
|
||
IDTVEC(fpu)
|
||
--- sys/i386/i386/machdep.c.orig
|
||
+++ sys/i386/i386/machdep.c
|
||
@@ -2577,7 +2577,7 @@
|
||
GSEL(GCODE_SEL, SEL_KPL));
|
||
#endif
|
||
#ifdef XENHVM
|
||
- setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_UPL,
|
||
+ setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_KPL,
|
||
GSEL(GCODE_SEL, SEL_KPL));
|
||
#endif
|
||
|
||
--- sys/i386/i386/pmap.c.orig
|
||
+++ sys/i386/i386/pmap.c
|
||
@@ -283,6 +283,8 @@
|
||
"Number of times pmap_pte_quick didn't change PMAP1");
|
||
static struct mtx PMAP2mutex;
|
||
|
||
+int pti;
|
||
+
|
||
static void free_pv_chunk(struct pv_chunk *pc);
|
||
static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
|
||
static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
|
||
@@ -1043,7 +1045,7 @@
|
||
CPU_AND(&other_cpus, &pmap->pm_active);
|
||
mask = &other_cpus;
|
||
}
|
||
- smp_masked_invlpg(*mask, va);
|
||
+ smp_masked_invlpg(*mask, va, pmap);
|
||
sched_unpin();
|
||
}
|
||
|
||
@@ -1077,7 +1079,7 @@
|
||
CPU_AND(&other_cpus, &pmap->pm_active);
|
||
mask = &other_cpus;
|
||
}
|
||
- smp_masked_invlpg_range(*mask, sva, eva);
|
||
+ smp_masked_invlpg_range(*mask, sva, eva, pmap);
|
||
sched_unpin();
|
||
}
|
||
|
||
--- sys/i386/i386/support.s.orig
|
||
+++ sys/i386/i386/support.s
|
||
@@ -830,3 +830,11 @@
|
||
movl $0,PCB_ONFAULT(%ecx)
|
||
movl $EFAULT,%eax
|
||
ret
|
||
+
|
||
+ENTRY(handle_ibrs_entry)
|
||
+ ret
|
||
+END(handle_ibrs_entry)
|
||
+
|
||
+ENTRY(handle_ibrs_exit)
|
||
+ ret
|
||
+END(handle_ibrs_exit)
|
||
--- sys/i386/i386/vm_machdep.c.orig
|
||
+++ sys/i386/i386/vm_machdep.c
|
||
@@ -795,7 +795,7 @@
|
||
CPU_NAND(&other_cpus, &sf->cpumask);
|
||
if (!CPU_EMPTY(&other_cpus)) {
|
||
CPU_OR(&sf->cpumask, &other_cpus);
|
||
- smp_masked_invlpg(other_cpus, sf->kva);
|
||
+ smp_masked_invlpg(other_cpus, sf->kva, kernel_pmap);
|
||
}
|
||
}
|
||
sched_unpin();
|
||
--- sys/sys/cpuctl.h.orig
|
||
+++ sys/sys/cpuctl.h
|
||
@@ -57,5 +57,6 @@
|
||
#define CPUCTL_MSRSBIT _IOWR('c', 5, cpuctl_msr_args_t)
|
||
#define CPUCTL_MSRCBIT _IOWR('c', 6, cpuctl_msr_args_t)
|
||
#define CPUCTL_CPUID_COUNT _IOWR('c', 7, cpuctl_cpuid_count_args_t)
|
||
+#define CPUCTL_EVAL_CPU_FEATURES _IO('c', 8)
|
||
|
||
#endif /* _CPUCTL_H_ */
|
||
--- sys/x86/include/apicvar.h.orig
|
||
+++ sys/x86/include/apicvar.h
|
||
@@ -179,7 +179,11 @@
|
||
IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
|
||
IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
|
||
IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
|
||
- IDTVEC(spuriousint), IDTVEC(timerint);
|
||
+ IDTVEC(spuriousint), IDTVEC(timerint),
|
||
+ IDTVEC(apic_isr1_pti), IDTVEC(apic_isr2_pti), IDTVEC(apic_isr3_pti),
|
||
+ IDTVEC(apic_isr4_pti), IDTVEC(apic_isr5_pti), IDTVEC(apic_isr6_pti),
|
||
+ IDTVEC(apic_isr7_pti), IDTVEC(cmcint_pti), IDTVEC(errorint_pti),
|
||
+ IDTVEC(spuriousint_pti), IDTVEC(timerint_pti);
|
||
|
||
extern vm_paddr_t lapic_paddr;
|
||
extern int apic_cpuids[];
|
||
--- sys/x86/include/specialreg.h.orig
|
||
+++ sys/x86/include/specialreg.h
|
||
@@ -374,6 +374,17 @@
|
||
#define CPUID_STDEXT2_SGXLC 0x40000000
|
||
|
||
/*
|
||
+ * CPUID instruction 7 Structured Extended Features, leaf 0 edx info
|
||
+ */
|
||
+#define CPUID_STDEXT3_IBPB 0x04000000
|
||
+#define CPUID_STDEXT3_STIBP 0x08000000
|
||
+#define CPUID_STDEXT3_ARCH_CAP 0x20000000
|
||
+
|
||
+/* MSR IA32_ARCH_CAP(ABILITIES) bits */
|
||
+#define IA32_ARCH_CAP_RDCL_NO 0x00000001
|
||
+#define IA32_ARCH_CAP_IBRS_ALL 0x00000002
|
||
+
|
||
+/*
|
||
* CPUID manufacturers identifiers
|
||
*/
|
||
#define AMD_VENDOR_ID "AuthenticAMD"
|
||
@@ -401,6 +412,8 @@
|
||
#define MSR_EBL_CR_POWERON 0x02a
|
||
#define MSR_TEST_CTL 0x033
|
||
#define MSR_IA32_FEATURE_CONTROL 0x03a
|
||
+#define MSR_IA32_SPEC_CTRL 0x048
|
||
+#define MSR_IA32_PRED_CMD 0x049
|
||
#define MSR_BIOS_UPDT_TRIG 0x079
|
||
#define MSR_BBL_CR_D0 0x088
|
||
#define MSR_BBL_CR_D1 0x089
|
||
@@ -413,6 +426,7 @@
|
||
#define MSR_APERF 0x0e8
|
||
#define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */
|
||
#define MSR_MTRRcap 0x0fe
|
||
+#define MSR_IA32_ARCH_CAP 0x10a
|
||
#define MSR_BBL_CR_ADDR 0x116
|
||
#define MSR_BBL_CR_DECC 0x118
|
||
#define MSR_BBL_CR_CTL 0x119
|
||
@@ -556,6 +570,17 @@
|
||
#define IA32_MISC_EN_XDD 0x0000000400000000ULL
|
||
|
||
/*
|
||
+ * IA32_SPEC_CTRL and IA32_PRED_CMD MSRs are described in the Intel
|
||
+ * document 336996-001 Speculative Execution Side Channel Mitigations.
|
||
+ */
|
||
+/* MSR IA32_SPEC_CTRL */
|
||
+#define IA32_SPEC_CTRL_IBRS 0x00000001
|
||
+#define IA32_SPEC_CTRL_STIBP 0x00000002
|
||
+
|
||
+/* MSR IA32_PRED_CMD */
|
||
+#define IA32_PRED_CMD_IBPB_BARRIER 0x0000000000000001ULL
|
||
+
|
||
+/*
|
||
* PAT modes.
|
||
*/
|
||
#define PAT_UNCACHEABLE 0x00
|
||
--- sys/x86/include/x86_smp.h.orig
|
||
+++ sys/x86/include/x86_smp.h
|
||
@@ -37,6 +37,7 @@
|
||
extern int cpu_cores;
|
||
extern volatile uint32_t smp_tlb_generation;
|
||
extern struct pmap *smp_tlb_pmap;
|
||
+extern vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
|
||
extern u_int xhits_gbl[];
|
||
extern u_int xhits_pg[];
|
||
extern u_int xhits_rng[];
|
||
@@ -95,9 +96,9 @@
|
||
u_int mp_bootaddress(u_int);
|
||
void set_interrupt_apic_ids(void);
|
||
void smp_cache_flush(void);
|
||
-void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr);
|
||
+void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, struct pmap *pmap);
|
||
void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva,
|
||
- vm_offset_t endva);
|
||
+ vm_offset_t endva, struct pmap *pmap);
|
||
void smp_masked_invltlb(cpuset_t mask, struct pmap *pmap);
|
||
void mem_range_AP_init(void);
|
||
void topo_probe(void);
|
||
--- sys/x86/include/x86_var.h.orig
|
||
+++ sys/x86/include/x86_var.h
|
||
@@ -50,6 +50,8 @@
|
||
extern u_int cpu_clflush_line_size;
|
||
extern u_int cpu_stdext_feature;
|
||
extern u_int cpu_stdext_feature2;
|
||
+extern u_int cpu_stdext_feature3;
|
||
+extern uint64_t cpu_ia32_arch_caps;
|
||
extern u_int cpu_fxsr;
|
||
extern u_int cpu_high;
|
||
extern u_int cpu_id;
|
||
@@ -78,6 +80,7 @@
|
||
extern int _ugssel;
|
||
extern int use_xsave;
|
||
extern uint64_t xsave_mask;
|
||
+extern int pti;
|
||
|
||
struct pcb;
|
||
struct thread;
|
||
@@ -115,7 +118,9 @@
|
||
void cpu_setregs(void);
|
||
void dump_add_page(vm_paddr_t);
|
||
void dump_drop_page(vm_paddr_t);
|
||
-void identify_cpu(void);
|
||
+void finishidentcpu(void);
|
||
+void identify_cpu1(void);
|
||
+void identify_cpu2(void);
|
||
void initializecpu(void);
|
||
void initializecpucache(void);
|
||
bool fix_cpuid(void);
|
||
@@ -122,11 +127,15 @@
|
||
void fillw(int /*u_short*/ pat, void *base, size_t cnt);
|
||
int is_physical_memory(vm_paddr_t addr);
|
||
int isa_nmi(int cd);
|
||
+void handle_ibrs_entry(void);
|
||
+void handle_ibrs_exit(void);
|
||
+void hw_ibrs_recalculate(void);
|
||
void nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame);
|
||
void nmi_call_kdb_smp(u_int type, struct trapframe *frame);
|
||
void nmi_handle_intr(u_int type, struct trapframe *frame);
|
||
void pagecopy(void *from, void *to);
|
||
void printcpuinfo(void);
|
||
+int pti_get_default(void);
|
||
int user_dbreg_trap(void);
|
||
int minidumpsys(struct dumperinfo *);
|
||
struct pcb *get_pcb_td(struct thread *td);
|
||
--- sys/x86/isa/atpic.c.orig
|
||
+++ sys/x86/isa/atpic.c
|
||
@@ -86,6 +86,16 @@
|
||
IDTVEC(atpic_intr9), IDTVEC(atpic_intr10), IDTVEC(atpic_intr11),
|
||
IDTVEC(atpic_intr12), IDTVEC(atpic_intr13), IDTVEC(atpic_intr14),
|
||
IDTVEC(atpic_intr15);
|
||
+/* XXXKIB i386 uses stubs until pti comes */
|
||
+inthand_t
|
||
+ IDTVEC(atpic_intr0_pti), IDTVEC(atpic_intr1_pti),
|
||
+ IDTVEC(atpic_intr2_pti), IDTVEC(atpic_intr3_pti),
|
||
+ IDTVEC(atpic_intr4_pti), IDTVEC(atpic_intr5_pti),
|
||
+ IDTVEC(atpic_intr6_pti), IDTVEC(atpic_intr7_pti),
|
||
+ IDTVEC(atpic_intr8_pti), IDTVEC(atpic_intr9_pti),
|
||
+ IDTVEC(atpic_intr10_pti), IDTVEC(atpic_intr11_pti),
|
||
+ IDTVEC(atpic_intr12_pti), IDTVEC(atpic_intr13_pti),
|
||
+ IDTVEC(atpic_intr14_pti), IDTVEC(atpic_intr15_pti);
|
||
|
||
#define IRQ(ap, ai) ((ap)->at_irqbase + (ai)->at_irq)
|
||
|
||
@@ -98,7 +108,7 @@
|
||
|
||
#define INTSRC(irq) \
|
||
{ { &atpics[(irq) / 8].at_pic }, IDTVEC(atpic_intr ## irq ), \
|
||
- (irq) % 8 }
|
||
+ IDTVEC(atpic_intr ## irq ## _pti), (irq) % 8 }
|
||
|
||
struct atpic {
|
||
struct pic at_pic;
|
||
@@ -110,7 +120,7 @@
|
||
|
||
struct atpic_intsrc {
|
||
struct intsrc at_intsrc;
|
||
- inthand_t *at_intr;
|
||
+ inthand_t *at_intr, *at_intr_pti;
|
||
int at_irq; /* Relative to PIC base. */
|
||
enum intr_trigger at_trigger;
|
||
u_long at_count;
|
||
@@ -435,7 +445,8 @@
|
||
ai->at_intsrc.is_count = &ai->at_count;
|
||
ai->at_intsrc.is_straycount = &ai->at_straycount;
|
||
setidt(((struct atpic *)ai->at_intsrc.is_pic)->at_intbase +
|
||
- ai->at_irq, ai->at_intr, SDT_ATPIC, SEL_KPL, GSEL_ATPIC);
|
||
+ ai->at_irq, pti ? ai->at_intr_pti : ai->at_intr, SDT_ATPIC,
|
||
+ SEL_KPL, GSEL_ATPIC);
|
||
}
|
||
|
||
#ifdef DEV_MCA
|
||
--- sys/x86/x86/cpu_machdep.c.orig
|
||
+++ sys/x86/x86/cpu_machdep.c
|
||
@@ -139,6 +139,12 @@
|
||
int *state;
|
||
|
||
/*
|
||
+ * A comment in the Linux patch claims that 'CPUs run faster with
|
||
+ * speculation protection disabled. All CPU threads in a core
|
||
+ * must disable speculation protection for it to be
|
||
+ * disabled. Disable it while we are idle so the other
|
||
+ * hyperthread can run fast.'
|
||
+ *
|
||
* XXXKIB. Software coordination mode should be supported,
|
||
* but all Intel CPUs provide hardware coordination.
|
||
*/
|
||
@@ -147,9 +153,11 @@
|
||
KASSERT(*state == STATE_SLEEPING,
|
||
("cpu_mwait_cx: wrong monitorbuf state"));
|
||
*state = STATE_MWAIT;
|
||
+ handle_ibrs_entry();
|
||
cpu_monitor(state, 0, 0);
|
||
if (*state == STATE_MWAIT)
|
||
cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
|
||
+ handle_ibrs_exit();
|
||
|
||
/*
|
||
* We should exit on any event that interrupts mwait, because
|
||
@@ -578,3 +586,47 @@
|
||
nmi_call_kdb(PCPU_GET(cpuid), type, frame);
|
||
#endif
|
||
}
|
||
+
|
||
+int hw_ibrs_active;
|
||
+int hw_ibrs_disable = 1;
|
||
+
|
||
+SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
|
||
+ "Indirect Branch Restricted Speculation active");
|
||
+
|
||
+void
|
||
+hw_ibrs_recalculate(void)
|
||
+{
|
||
+ uint64_t v;
|
||
+
|
||
+ if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
|
||
+ if (hw_ibrs_disable) {
|
||
+ v= rdmsr(MSR_IA32_SPEC_CTRL);
|
||
+ v &= ~(uint64_t)IA32_SPEC_CTRL_IBRS;
|
||
+ wrmsr(MSR_IA32_SPEC_CTRL, v);
|
||
+ } else {
|
||
+ v= rdmsr(MSR_IA32_SPEC_CTRL);
|
||
+ v |= IA32_SPEC_CTRL_IBRS;
|
||
+ wrmsr(MSR_IA32_SPEC_CTRL, v);
|
||
+ }
|
||
+ return;
|
||
+ }
|
||
+ hw_ibrs_active = (cpu_stdext_feature3 & CPUID_STDEXT3_IBPB) != 0 &&
|
||
+ !hw_ibrs_disable;
|
||
+}
|
||
+
|
||
+static int
|
||
+hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
|
||
+{
|
||
+ int error, val;
|
||
+
|
||
+ val = hw_ibrs_disable;
|
||
+ error = sysctl_handle_int(oidp, &val, 0, req);
|
||
+ if (error != 0 || req->newptr == NULL)
|
||
+ return (error);
|
||
+ hw_ibrs_disable = val != 0;
|
||
+ hw_ibrs_recalculate();
|
||
+ return (0);
|
||
+}
|
||
+SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
|
||
+ CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
|
||
+ "Disable Indirect Branch Restricted Speculation");
|
||
--- sys/x86/x86/identcpu.c.orig
|
||
+++ sys/x86/x86/identcpu.c
|
||
@@ -104,8 +104,10 @@
|
||
u_int cpu_fxsr; /* SSE enabled */
|
||
u_int cpu_mxcsr_mask; /* Valid bits in mxcsr */
|
||
u_int cpu_clflush_line_size = 32;
|
||
-u_int cpu_stdext_feature;
|
||
-u_int cpu_stdext_feature2;
|
||
+u_int cpu_stdext_feature; /* %ebx */
|
||
+u_int cpu_stdext_feature2; /* %ecx */
|
||
+u_int cpu_stdext_feature3; /* %edx */
|
||
+uint64_t cpu_ia32_arch_caps;
|
||
u_int cpu_max_ext_state_size;
|
||
u_int cpu_mon_mwait_flags; /* MONITOR/MWAIT flags (CPUID.05H.ECX) */
|
||
u_int cpu_mon_min_size; /* MONITOR minimum range size, bytes */
|
||
@@ -978,6 +980,16 @@
|
||
);
|
||
}
|
||
|
||
+ if (cpu_stdext_feature3 != 0) {
|
||
+ printf("\n Structured Extended Features3=0x%b",
|
||
+ cpu_stdext_feature3,
|
||
+ "\020"
|
||
+ "\033IBPB"
|
||
+ "\034STIBP"
|
||
+ "\036ARCH_CAP"
|
||
+ );
|
||
+ }
|
||
+
|
||
if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
|
||
cpuid_count(0xd, 0x1, regs);
|
||
if (regs[0] != 0) {
|
||
@@ -991,6 +1003,15 @@
|
||
}
|
||
}
|
||
|
||
+ if (cpu_ia32_arch_caps != 0) {
|
||
+ printf("\n IA32_ARCH_CAPS=0x%b",
|
||
+ (u_int)cpu_ia32_arch_caps,
|
||
+ "\020"
|
||
+ "\001RDCL_NO"
|
||
+ "\002IBRS_ALL"
|
||
+ );
|
||
+ }
|
||
+
|
||
if (via_feature_rng != 0 || via_feature_xcrypt != 0)
|
||
print_via_padlock_info();
|
||
|
||
@@ -1370,23 +1391,11 @@
|
||
return (false);
|
||
}
|
||
|
||
-/*
|
||
- * Final stage of CPU identification.
|
||
- */
|
||
-#ifdef __i386__
|
||
void
|
||
-finishidentcpu(void)
|
||
-#else
|
||
-void
|
||
-identify_cpu(void)
|
||
-#endif
|
||
+identify_cpu1(void)
|
||
{
|
||
- u_int regs[4], cpu_stdext_disable;
|
||
-#ifdef __i386__
|
||
- u_char ccr3;
|
||
-#endif
|
||
+ u_int regs[4];
|
||
|
||
-#ifdef __amd64__
|
||
do_cpuid(0, regs);
|
||
cpu_high = regs[0];
|
||
((u_int *)&cpu_vendor)[0] = regs[1];
|
||
@@ -1399,6 +1408,44 @@
|
||
cpu_procinfo = regs[1];
|
||
cpu_feature = regs[3];
|
||
cpu_feature2 = regs[2];
|
||
+}
|
||
+/* Read the CPUID leaf 7 (subleaf 0) feature words and, when advertised, the IA32_ARCH_CAP MSR. */
|
||
+void
|
||
+identify_cpu2(void)
|
||
+{
|
||
+ u_int regs[4], cpu_stdext_disable;
|
||
+
|
||
+ if (cpu_high >= 7) { /* leaf 7 is queried only when the highest basic leaf is at least 7 */
|
||
+ cpuid_count(7, 0, regs);
|
||
+ cpu_stdext_feature = regs[1]; /* %ebx */
|
||
+
|
||
+ /*
|
||
+ * Some hypervisors failed to filter out unsupported
|
||
+ * extended features. Allow disabling the
|
||
+ * extensions, activation of which requires setting a
|
||
+ * bit in CR4, and which VM monitors do not support.
|
||
+ */
|
||
+ cpu_stdext_disable = 0;
|
||
+ TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable); /* mask stays 0 unless the tunable supplies one */
|
||
+ cpu_stdext_feature &= ~cpu_stdext_disable;
|
||
+
|
||
+ cpu_stdext_feature2 = regs[2]; /* %ecx */
|
||
+ cpu_stdext_feature3 = regs[3]; /* %edx */
|
||
+
|
||
+ if ((cpu_stdext_feature3 & CPUID_STDEXT3_ARCH_CAP) != 0) /* read the MSR only when CPUID reports it */
|
||
+ cpu_ia32_arch_caps = rdmsr(MSR_IA32_ARCH_CAP);
|
||
+ }
|
||
+}
|
||
+
|
||
+/*
|
||
+ * Final stage of CPU identification.
|
||
+ */
|
||
+void
|
||
+finishidentcpu(void)
|
||
+{
|
||
+ u_int regs[4];
|
||
+#ifdef __i386__
|
||
+ u_char ccr3;
|
||
#endif
|
||
|
||
identify_hypervisor();
|
||
@@ -1416,26 +1463,8 @@
|
||
cpu_mon_max_size = regs[1] & CPUID5_MON_MAX_SIZE;
|
||
}
|
||
|
||
- if (cpu_high >= 7) {
|
||
- cpuid_count(7, 0, regs);
|
||
- cpu_stdext_feature = regs[1];
|
||
+ identify_cpu2();
|
||
|
||
- /*
|
||
- * Some hypervisors fail to filter out unsupported
|
||
- * extended features. For now, disable the
|
||
- * extensions, activation of which requires setting a
|
||
- * bit in CR4, and which VM monitors do not support.
|
||
- */
|
||
- if (cpu_feature2 & CPUID2_HV) {
|
||
- cpu_stdext_disable = CPUID_STDEXT_FSGSBASE |
|
||
- CPUID_STDEXT_SMEP;
|
||
- } else
|
||
- cpu_stdext_disable = 0;
|
||
- TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable);
|
||
- cpu_stdext_feature &= ~cpu_stdext_disable;
|
||
- cpu_stdext_feature2 = regs[2];
|
||
- }
|
||
-
|
||
#ifdef __i386__
|
||
if (cpu_high > 0 &&
|
||
(cpu_vendor_id == CPU_VENDOR_INTEL ||
|
||
@@ -1563,6 +1592,17 @@
|
||
#endif
|
||
}
|
||
|
||
+int
|
||
+pti_get_default(void)
|
||
+{
|
||
+ /* Returns 0 when the vendor string equals AMD_VENDOR_ID or the RDCL_NO capability bit is set, 1 otherwise. */
|
||
+ if (strcmp(cpu_vendor, AMD_VENDOR_ID) == 0)
|
||
+ return (0);
|
||
+ if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) != 0)
|
||
+ return (0);
|
||
+ return (1); /* presumably "PTI on by default" -- confirm against the caller */
|
||
+}
|
||
+
|
||
static u_int
|
||
find_cpu_vendor_id(void)
|
||
{
|
||
--- sys/x86/x86/local_apic.c.orig
|
||
+++ sys/x86/x86/local_apic.c
|
||
@@ -166,6 +166,16 @@
|
||
IDTVEC(apic_isr7), /* 224 - 255 */
|
||
};
|
||
|
||
+static inthand_t *ioint_pti_handlers[] = { /* used in place of ioint_handlers when pti is set; indexed by vector / 32 */
|
||
+ NULL, /* 0 - 31 */
|
||
+ IDTVEC(apic_isr1_pti), /* 32 - 63 */
|
||
+ IDTVEC(apic_isr2_pti), /* 64 - 95 */
|
||
+ IDTVEC(apic_isr3_pti), /* 96 - 127 */
|
||
+ IDTVEC(apic_isr4_pti), /* 128 - 159 */
|
||
+ IDTVEC(apic_isr5_pti), /* 160 - 191 */
|
||
+ IDTVEC(apic_isr6_pti), /* 192 - 223 */
|
||
+ IDTVEC(apic_isr7_pti), /* 224 - 255 */
|
||
+};
|
||
|
||
static u_int32_t lapic_timer_divisors[] = {
|
||
APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16,
|
||
@@ -172,7 +182,7 @@
|
||
APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
|
||
};
|
||
|
||
-extern inthand_t IDTVEC(rsvd);
|
||
+extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd);
|
||
|
||
volatile char *lapic_map;
|
||
vm_paddr_t lapic_paddr;
|
||
@@ -489,15 +499,18 @@
|
||
PCPU_SET(apic_id, lapic_id());
|
||
|
||
/* Local APIC timer interrupt. */
|
||
- setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC);
|
||
+ setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint),
|
||
+ SDT_APIC, SEL_KPL, GSEL_APIC);
|
||
|
||
/* Local APIC error interrupt. */
|
||
- setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC);
|
||
+ setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint),
|
||
+ SDT_APIC, SEL_KPL, GSEL_APIC);
|
||
|
||
/* XXX: Thermal interrupt */
|
||
|
||
/* Local APIC CMCI. */
|
||
- setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC);
|
||
+ setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint),
|
||
+ SDT_APICT, SEL_KPL, GSEL_APIC);
|
||
|
||
if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) {
|
||
arat = 0;
|
||
@@ -1561,8 +1574,8 @@
|
||
KASSERT(vector != IDT_DTRACE_RET,
|
||
("Attempt to overwrite DTrace entry"));
|
||
#endif
|
||
- setidt(vector, ioint_handlers[vector / 32], SDT_APIC, SEL_KPL,
|
||
- GSEL_APIC);
|
||
+ setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32],
|
||
+ SDT_APIC, SEL_KPL, GSEL_APIC);
|
||
}
|
||
|
||
static void
|
||
@@ -1581,7 +1594,8 @@
|
||
* We can not currently clear the idt entry because other cpus
|
||
* may have a valid vector at this offset.
|
||
*/
|
||
- setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC);
|
||
+ setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
|
||
+ SEL_KPL, GSEL_APIC);
|
||
#endif
|
||
}
|
||
|
||
@@ -2084,7 +2098,8 @@
|
||
long func;
|
||
int idx, vector;
|
||
|
||
- KASSERT(ipifunc != &IDTVEC(rsvd), ("invalid ipifunc %p", ipifunc));
|
||
+ KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti),
|
||
+ ("invalid ipifunc %p", ipifunc));
|
||
|
||
vector = -1;
|
||
mtx_lock_spin(&icu_lock);
|
||
@@ -2091,7 +2106,8 @@
|
||
for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) {
|
||
ip = &idt[idx];
|
||
func = (ip->gd_hioffset << 16) | ip->gd_looffset;
|
||
- if (func == (uintptr_t)&IDTVEC(rsvd)) {
|
||
+ if ((!pti && func == (uintptr_t)&IDTVEC(rsvd)) ||
|
||
+ (pti && func == (uintptr_t)&IDTVEC(rsvd_pti))) {
|
||
vector = idx;
|
||
setidt(vector, ipifunc, SDT_APIC, SEL_KPL, GSEL_APIC);
|
||
break;
|
||
@@ -2113,8 +2129,10 @@
|
||
mtx_lock_spin(&icu_lock);
|
||
ip = &idt[vector];
|
||
func = (ip->gd_hioffset << 16) | ip->gd_looffset;
|
||
- KASSERT(func != (uintptr_t)&IDTVEC(rsvd),
|
||
+ KASSERT(func != (uintptr_t)&IDTVEC(rsvd) &&
|
||
+ func != (uintptr_t)&IDTVEC(rsvd_pti),
|
||
("invalid idtfunc %#lx", func));
|
||
- setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC);
|
||
+ setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
|
||
+ SEL_KPL, GSEL_APIC);
|
||
mtx_unlock_spin(&icu_lock);
|
||
}
|
||
--- sys/x86/x86/mp_x86.c.orig
|
||
+++ sys/x86/x86/mp_x86.c
|
||
@@ -1436,7 +1436,7 @@
|
||
*/
|
||
|
||
/* Variables needed for SMP tlb shootdown. */
|
||
-static vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
|
||
+vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
|
||
pmap_t smp_tlb_pmap;
|
||
volatile uint32_t smp_tlb_generation;
|
||
|
||
@@ -1509,11 +1509,11 @@
|
||
}
|
||
|
||
void
|
||
-smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
|
||
+smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap)
|
||
{
|
||
|
||
if (smp_started) {
|
||
- smp_targeted_tlb_shootdown(mask, IPI_INVLPG, NULL, addr, 0);
|
||
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
|
||
#ifdef COUNT_XINVLTLB_HITS
|
||
ipi_page++;
|
||
#endif
|
||
@@ -1521,11 +1521,12 @@
|
||
}
|
||
|
||
void
|
||
-smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
|
||
+smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
|
||
+ pmap_t pmap)
|
||
{
|
||
|
||
if (smp_started) {
|
||
- smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, NULL,
|
||
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap,
|
||
addr1, addr2);
|
||
#ifdef COUNT_XINVLTLB_HITS
|
||
ipi_range++;
|
||
--- sys/x86/xen/pv.c.orig
|
||
+++ sys/x86/xen/pv.c
|
||
@@ -97,6 +97,7 @@
|
||
#ifdef SMP
|
||
/* Variables used by amd64 mp_machdep to start APs */
|
||
extern char *doublefault_stack;
|
||
+extern char *mce_stack;
|
||
extern char *nmi_stack;
|
||
#endif
|
||
|
||
@@ -217,6 +218,8 @@
|
||
(void *)kmem_malloc(kernel_arena, stacksize, M_WAITOK | M_ZERO);
|
||
doublefault_stack =
|
||
(char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
|
||
+ mce_stack =
|
||
+ (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
|
||
nmi_stack =
|
||
(char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
|
||
dpcpu =
|
||
--- usr.sbin/cpucontrol/cpucontrol.8.orig
|
||
+++ usr.sbin/cpucontrol/cpucontrol.8
|
||
@@ -24,7 +24,7 @@
|
||
.\"
|
||
.\" $FreeBSD$
|
||
.\"
|
||
-.Dd June 30, 2009
|
||
+.Dd January 5, 2018
|
||
.Dt CPUCONTROL 8
|
||
.Os
|
||
.Sh NAME
|
||
@@ -36,44 +36,48 @@
|
||
.Nm
|
||
.Op Fl vh
|
||
.Fl m Ar msr
|
||
-.Bk
|
||
.Ar device
|
||
.Ek
|
||
+.Bk
|
||
.Nm
|
||
.Op Fl vh
|
||
.Fl m Ar msr Ns = Ns Ar value
|
||
-.Bk
|
||
.Ar device
|
||
.Ek
|
||
+.Bk
|
||
.Nm
|
||
.Op Fl vh
|
||
.Fl m Ar msr Ns &= Ns Ar mask
|
||
-.Bk
|
||
.Ar device
|
||
.Ek
|
||
+.Bk
|
||
.Nm
|
||
.Op Fl vh
|
||
.Fl m Ar msr Ns |= Ns Ar mask
|
||
-.Bk
|
||
.Ar device
|
||
.Ek
|
||
+.Bk
|
||
.Nm
|
||
.Op Fl vh
|
||
.Fl i Ar level
|
||
-.Bk
|
||
.Ar device
|
||
.Ek
|
||
+.Bk
|
||
.Nm
|
||
.Op Fl vh
|
||
.Fl i Ar level,level_type
|
||
-.Bk
|
||
.Ar device
|
||
.Ek
|
||
+.Bk
|
||
.Nm
|
||
.Op Fl vh
|
||
.Op Fl d Ar datadir
|
||
.Fl u
|
||
+.Ar device
|
||
+.Ek
|
||
.Bk
|
||
+.Nm
|
||
+.Fl e
|
||
.Ar device
|
||
.Ek
|
||
.Sh DESCRIPTION
|
||
@@ -129,6 +133,20 @@
|
||
.Nm
|
||
utility will walk through the configured data directories
|
||
and apply all firmware updates available for this CPU.
|
||
+.It Fl e
|
||
+Re-evaluate the kernel flags indicating the present CPU features.
|
||
+This command is typically executed after a firmware update was applied
|
||
+which changes information reported by the
|
||
+.Dv CPUID
|
||
+instruction.
|
||
+.Pp
|
||
+.Bf -symbolic
|
||
+Only execute the
|
||
+.Fl e
|
||
+command after the microcode update was applied to all CPUs in the system.
|
||
+The kernel does not operate correctly if the features of processors are
|
||
+not identical.
|
||
+.Ef
|
||
.It Fl v
|
||
Increase the verbosity level.
|
||
.It Fl h
|
||
--- usr.sbin/cpucontrol/cpucontrol.c.orig
|
||
+++ usr.sbin/cpucontrol/cpucontrol.c
|
||
@@ -60,6 +60,7 @@
|
||
#define FLAG_I 0x01
|
||
#define FLAG_M 0x02
|
||
#define FLAG_U 0x04
|
||
+#define FLAG_E 0x10
|
||
|
||
#define OP_INVAL 0x00
|
||
#define OP_READ 0x01
|
||
@@ -114,7 +115,7 @@
|
||
if (name == NULL)
|
||
name = "cpuctl";
|
||
fprintf(stderr, "Usage: %s [-vh] [-d datadir] [-m msr[=value] | "
|
||
- "-i level | -i level,level_type | -u] device\n", name);
|
||
+ "-i level | -i level,level_type | -e | -u] device\n", name);
|
||
exit(EX_USAGE);
|
||
}
|
||
|
||
@@ -338,6 +339,25 @@
|
||
}
|
||
|
||
static int
|
||
+do_eval_cpu_features(const char *dev)
|
||
+{ /* Opens dev and issues CPUCTL_EVAL_CPU_FEATURES on it; returns 0 on success, non-zero on failure. */
|
||
+ int fd, error;
|
||
+
|
||
+ assert(dev != NULL);
|
||
+
|
||
+ fd = open(dev, O_RDWR);
|
||
+ if (fd < 0) {
|
||
+ WARN(0, "error opening %s for writing", dev);
|
||
+ return (1); /* open failed: nothing to close */
|
||
+ }
|
||
+ error = ioctl(fd, CPUCTL_EVAL_CPU_FEATURES, NULL);
|
||
+ if (error < 0)
|
||
+ WARN(0, "ioctl(%s, CPUCTL_EVAL_CPU_FEATURES)", dev);
|
||
+ close(fd);
|
||
+ return (error); /* raw ioctl() result; main() maps any non-zero value to exit status 1 */
|
||
+}
|
||
+
|
||
+static int
|
||
do_update(const char *dev)
|
||
{
|
||
int fd;
|
||
@@ -431,11 +451,14 @@
|
||
* Add all default data dirs to the list first.
|
||
*/
|
||
datadir_add(DEFAULT_DATADIR);
|
||
- while ((c = getopt(argc, argv, "d:hi:m:uv")) != -1) {
|
||
+ while ((c = getopt(argc, argv, "d:ehi:m:uv")) != -1) {
|
||
switch (c) {
|
||
case 'd':
|
||
datadir_add(optarg);
|
||
break;
|
||
+ case 'e':
|
||
+ flags |= FLAG_E;
|
||
+ break;
|
||
case 'i':
|
||
flags |= FLAG_I;
|
||
cmdarg = optarg;
|
||
@@ -464,22 +487,25 @@
|
||
/* NOTREACHED */
|
||
}
|
||
dev = argv[0];
|
||
- c = flags & (FLAG_I | FLAG_M | FLAG_U);
|
||
+ c = flags & (FLAG_E | FLAG_I | FLAG_M | FLAG_U);
|
||
switch (c) {
|
||
- case FLAG_I:
|
||
- if (strstr(cmdarg, ",") != NULL)
|
||
- error = do_cpuid_count(cmdarg, dev);
|
||
- else
|
||
- error = do_cpuid(cmdarg, dev);
|
||
- break;
|
||
- case FLAG_M:
|
||
- error = do_msr(cmdarg, dev);
|
||
- break;
|
||
- case FLAG_U:
|
||
- error = do_update(dev);
|
||
- break;
|
||
- default:
|
||
- usage(); /* Only one command can be selected. */
|
||
+ case FLAG_I:
|
||
+ if (strstr(cmdarg, ",") != NULL)
|
||
+ error = do_cpuid_count(cmdarg, dev);
|
||
+ else
|
||
+ error = do_cpuid(cmdarg, dev);
|
||
+ break;
|
||
+ case FLAG_M:
|
||
+ error = do_msr(cmdarg, dev);
|
||
+ break;
|
||
+ case FLAG_U:
|
||
+ error = do_update(dev);
|
||
+ break;
|
||
+ case FLAG_E:
|
||
+ error = do_eval_cpu_features(dev);
|
||
+ break;
|
||
+ default:
|
||
+ usage(); /* Only one command can be selected. */
|
||
}
|
||
SLIST_FREE(&datadirs, next, free);
|
||
return (error == 0 ? 0 : 1);
|