author     Mario Preksavec <mario@slackware.hr>             2018-06-03 17:14:09 +0200
committer  Willy Sudiarto Raharjo <willysr@slackbuilds.org> 2018-06-09 07:15:40 +0700
commit     5d04b7d933ed8da40981ea7ae37c7e48271f8d6a (patch)
tree       f277f9f1b04c7789671be43a4c8948e028c53be4 /system/xen/patches
parent     c4ca04219c4725efcc78990abdd83c6c2e5b4658 (diff)
system/xen: XSA 263 update.
Signed-off-by: Mario Preksavec <mario@slackware.hr>
Diffstat (limited to 'system/xen/patches')
-rw-r--r--  system/xen/patches/xen-4.10.2-pre.patch  1631
1 file changed, 1631 insertions, 0 deletions
diff --git a/system/xen/patches/xen-4.10.2-pre.patch b/system/xen/patches/xen-4.10.2-pre.patch new file mode 100644 index 0000000000000..42477696e15fa --- /dev/null +++ b/system/xen/patches/xen-4.10.2-pre.patch @@ -0,0 +1,1631 @@ +diff --git a/tools/libacpi/Makefile b/tools/libacpi/Makefile +index a47a658a25..c17f3924cc 100644 +--- a/tools/libacpi/Makefile ++++ b/tools/libacpi/Makefile +@@ -43,7 +43,7 @@ all: $(C_SRC) $(H_SRC) + + $(H_SRC): $(ACPI_BUILD_DIR)/%.h: %.asl iasl + iasl -vs -p $(ACPI_BUILD_DIR)/$*.$(TMP_SUFFIX) -tc $< +- sed -e 's/AmlCode/$*/g' $(ACPI_BUILD_DIR)/$*.hex >$@ ++ sed -e 's/AmlCode/$*/g' -e 's/_aml_code//g' $(ACPI_BUILD_DIR)/$*.hex >$@ + rm -f $(addprefix $(ACPI_BUILD_DIR)/, $*.aml $*.hex) + + $(MK_DSDT): mk_dsdt.c +@@ -76,7 +76,7 @@ $(ACPI_BUILD_DIR)/dsdt_anycpu_arm.asl: $(MK_DSDT) + + $(C_SRC): $(ACPI_BUILD_DIR)/%.c: iasl $(ACPI_BUILD_DIR)/%.asl + iasl -vs -p $(ACPI_BUILD_DIR)/$*.$(TMP_SUFFIX) -tc $(ACPI_BUILD_DIR)/$*.asl +- sed -e 's/AmlCode/$*/g' $(ACPI_BUILD_DIR)/$*.hex > $@.$(TMP_SUFFIX) ++ sed -e 's/AmlCode/$*/g' -e 's/_aml_code//g' $(ACPI_BUILD_DIR)/$*.hex > $@.$(TMP_SUFFIX) + echo "int $*_len=sizeof($*);" >> $@.$(TMP_SUFFIX) + mv -f $@.$(TMP_SUFFIX) $@ + rm -f $(addprefix $(ACPI_BUILD_DIR)/, $*.aml $*.hex) +#diff --git a/xen/Makefile b/xen/Makefile +#index ecec297b9b..580af86931 100644 +#--- a/xen/Makefile +#+++ b/xen/Makefile +#@@ -2,7 +2,7 @@ +# # All other places this is stored (eg. compile.h) should be autogenerated. +# export XEN_VERSION = 4 +# export XEN_SUBVERSION = 10 +#-export XEN_EXTRAVERSION ?= .1$(XEN_VENDORVERSION) +#+export XEN_EXTRAVERSION ?= .2-pre$(XEN_VENDORVERSION) +# export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION) +# -include xen-version +# +diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c +index 1e4e5680a7..f7085d3c7b 100644 +--- a/xen/arch/x86/acpi/power.c ++++ b/xen/arch/x86/acpi/power.c +@@ -28,6 +28,7 @@ + #include <asm/tboot.h> + #include <asm/apic.h> + #include <asm/io_apic.h> ++#include <asm/spec_ctrl.h> + #include <acpi/cpufreq/cpufreq.h> + + uint32_t system_reset_counter = 1; +@@ -163,6 +164,7 @@ static int enter_state(u32 state) + { + unsigned long flags; + int error; ++ struct cpu_info *ci; + unsigned long cr4; + + if ( (state <= ACPI_STATE_S0) || (state > ACPI_S_STATES_MAX) ) +@@ -203,12 +205,18 @@ static int enter_state(u32 state) + printk(XENLOG_ERR "Some devices failed to power down."); + system_state = SYS_STATE_resume; + device_power_up(error); ++ console_end_sync(); + error = -EIO; + goto done; + } + else + error = 0; + ++ ci = get_cpu_info(); ++ spec_ctrl_enter_idle(ci); ++ /* Avoid NMI/#MC using MSR_SPEC_CTRL until we've reloaded microcode. */ ++ ci->bti_ist_info = 0; ++ + ACPI_FLUSH_CPU_CACHE(); + + switch ( state ) +@@ -243,17 +251,23 @@ static int enter_state(u32 state) + if ( (state == ACPI_STATE_S3) && error ) + tboot_s3_error(error); + ++ console_end_sync(); ++ ++ microcode_resume_cpu(0); ++ ++ /* Re-enabled default NMI/#MC use of MSR_SPEC_CTRL. 
*/ ++ ci->bti_ist_info = default_bti_ist_info; ++ spec_ctrl_exit_idle(ci); ++ + done: + spin_debug_enable(); + local_irq_restore(flags); +- console_end_sync(); + acpi_sleep_post(state); + if ( hvm_cpu_up() ) + BUG(); ++ cpufreq_add_cpu(0); + + enable_cpu: +- cpufreq_add_cpu(0); +- microcode_resume_cpu(0); + rcu_barrier(); + mtrr_aps_sync_begin(); + enable_nonboot_cpus(); +diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c +index fdb2bf1779..136adadb63 100644 +--- a/xen/arch/x86/cpu/common.c ++++ b/xen/arch/x86/cpu/common.c +@@ -747,6 +747,7 @@ void load_system_tables(void) + [IST_MCE - 1] = stack_top + IST_MCE * PAGE_SIZE, + [IST_DF - 1] = stack_top + IST_DF * PAGE_SIZE, + [IST_NMI - 1] = stack_top + IST_NMI * PAGE_SIZE, ++ [IST_DB - 1] = stack_top + IST_DB * PAGE_SIZE, + + [IST_MAX ... ARRAY_SIZE(tss->ist) - 1] = + 0x8600111111111111ul, +@@ -774,6 +775,7 @@ void load_system_tables(void) + set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF); + set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI); + set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE); ++ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB); + + /* + * Bottom-of-stack must be 16-byte aligned! +diff --git a/xen/arch/x86/hpet.c b/xen/arch/x86/hpet.c +index 8229c635e4..f18cbbd55a 100644 +--- a/xen/arch/x86/hpet.c ++++ b/xen/arch/x86/hpet.c +@@ -509,6 +509,8 @@ static void hpet_attach_channel(unsigned int cpu, + static void hpet_detach_channel(unsigned int cpu, + struct hpet_event_channel *ch) + { ++ unsigned int next; ++ + spin_lock_irq(&ch->lock); + + ASSERT(ch == per_cpu(cpu_bc_channel, cpu)); +@@ -517,7 +519,7 @@ static void hpet_detach_channel(unsigned int cpu, + + if ( cpu != ch->cpu ) + spin_unlock_irq(&ch->lock); +- else if ( cpumask_empty(ch->cpumask) ) ++ else if ( (next = cpumask_first(ch->cpumask)) >= nr_cpu_ids ) + { + ch->cpu = -1; + clear_bit(HPET_EVT_USED_BIT, &ch->flags); +@@ -525,7 +527,7 @@ static void hpet_detach_channel(unsigned int cpu, + } + else + { +- ch->cpu = cpumask_first(ch->cpumask); ++ ch->cpu = next; + set_channel_irq_affinity(ch); + local_irq_enable(); + } +diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c +index b282089e03..131480fdd9 100644 +--- a/xen/arch/x86/hvm/emulate.c ++++ b/xen/arch/x86/hvm/emulate.c +@@ -2113,22 +2113,20 @@ static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt, + + vio->mmio_retry = 0; + +- switch ( rc = x86_emulate(&hvmemul_ctxt->ctxt, ops) ) ++ rc = x86_emulate(&hvmemul_ctxt->ctxt, ops); ++ if ( rc == X86EMUL_OKAY && vio->mmio_retry ) ++ rc = X86EMUL_RETRY; ++ ++ if ( !hvm_vcpu_io_need_completion(vio) ) + { +- case X86EMUL_OKAY: +- if ( vio->mmio_retry ) +- rc = X86EMUL_RETRY; +- /* fall through */ +- default: + vio->mmio_cache_count = 0; + vio->mmio_insn_bytes = 0; +- break; +- +- case X86EMUL_RETRY: ++ } ++ else ++ { + BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf)); + vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes; + memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes); +- break; + } + + if ( hvmemul_ctxt->ctxt.retire.singlestep ) +diff --git a/xen/arch/x86/hvm/hpet.c b/xen/arch/x86/hvm/hpet.c +index f7aed7f69e..28377091ca 100644 +--- a/xen/arch/x86/hvm/hpet.c ++++ b/xen/arch/x86/hvm/hpet.c +@@ -264,13 +264,20 @@ static void hpet_set_timer(HPETState *h, unsigned int tn, + diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN)) + ? 
(uint32_t)diff : 0; + ++ destroy_periodic_time(&h->pt[tn]); + if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) ) ++ { + /* if LegacyReplacementRoute bit is set, HPET specification requires + timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC, + timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */ + irq = (tn == 0) ? 0 : 8; ++ h->pt[tn].source = PTSRC_isa; ++ } + else ++ { + irq = timer_int_route(h, tn); ++ h->pt[tn].source = PTSRC_ioapic; ++ } + + /* + * diff is the time from now when the timer should fire, for a periodic +diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c +index d5afe20cc8..25b2445429 100644 +--- a/xen/arch/x86/hvm/ioreq.c ++++ b/xen/arch/x86/hvm/ioreq.c +@@ -87,14 +87,17 @@ static void hvm_io_assist(struct hvm_ioreq_vcpu *sv, uint64_t data) + + static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p) + { ++ unsigned int prev_state = STATE_IOREQ_NONE; ++ + while ( sv->pending ) + { + unsigned int state = p->state; + +- rmb(); +- switch ( state ) ++ smp_rmb(); ++ ++ recheck: ++ if ( unlikely(state == STATE_IOREQ_NONE) ) + { +- case STATE_IOREQ_NONE: + /* + * The only reason we should see this case is when an + * emulator is dying and it races with an I/O being +@@ -102,14 +105,30 @@ static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p) + */ + hvm_io_assist(sv, ~0ul); + break; ++ } ++ ++ if ( unlikely(state < prev_state) ) ++ { ++ gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n", ++ prev_state, state); ++ sv->pending = false; ++ domain_crash(sv->vcpu->domain); ++ return false; /* bail */ ++ } ++ ++ switch ( prev_state = state ) ++ { + case STATE_IORESP_READY: /* IORESP_READY -> NONE */ + p->state = STATE_IOREQ_NONE; + hvm_io_assist(sv, p->data); + break; + case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */ + case STATE_IOREQ_INPROCESS: +- wait_on_xen_event_channel(sv->ioreq_evtchn, p->state != state); +- break; ++ wait_on_xen_event_channel(sv->ioreq_evtchn, ++ ({ state = p->state; ++ smp_rmb(); ++ state != prev_state; })); ++ goto recheck; + default: + gdprintk(XENLOG_ERR, "Weird HVM iorequest state %u\n", state); + sv->pending = false; +diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c +index f528e2d081..c85d004402 100644 +--- a/xen/arch/x86/hvm/irq.c ++++ b/xen/arch/x86/hvm/irq.c +@@ -41,6 +41,26 @@ static void assert_gsi(struct domain *d, unsigned ioapic_gsi) + vioapic_irq_positive_edge(d, ioapic_gsi); + } + ++int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level) ++{ ++ struct hvm_irq *hvm_irq = hvm_domain_irq(d); ++ int vector; ++ ++ if ( gsi >= hvm_irq->nr_gsis ) ++ { ++ ASSERT_UNREACHABLE(); ++ return -1; ++ } ++ ++ spin_lock(&d->arch.hvm_domain.irq_lock); ++ if ( !level || hvm_irq->gsi_assert_count[gsi]++ == 0 ) ++ assert_gsi(d, gsi); ++ vector = vioapic_get_vector(d, gsi); ++ spin_unlock(&d->arch.hvm_domain.irq_lock); ++ ++ return vector; ++} ++ + static void assert_irq(struct domain *d, unsigned ioapic_gsi, unsigned pic_irq) + { + assert_gsi(d, ioapic_gsi); +diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c +index dedec5752d..3b72b4dc2a 100644 +--- a/xen/arch/x86/hvm/svm/svm.c ++++ b/xen/arch/x86/hvm/svm/svm.c +@@ -1046,6 +1046,7 @@ static void svm_ctxt_switch_from(struct vcpu *v) + set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF); + set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI); + set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE); ++ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB); + } + + static void 
svm_ctxt_switch_to(struct vcpu *v) +@@ -1067,6 +1068,7 @@ static void svm_ctxt_switch_to(struct vcpu *v) + set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE); + set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE); + set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); ++ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE); + + svm_restore_dr(v); + +@@ -1836,6 +1838,25 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content) + struct vcpu *v = current; + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + ++ switch ( msr ) ++ { ++ /* ++ * Sync not needed while the cross-vendor logic is in unilateral effect. ++ case MSR_IA32_SYSENTER_CS: ++ case MSR_IA32_SYSENTER_ESP: ++ case MSR_IA32_SYSENTER_EIP: ++ */ ++ case MSR_STAR: ++ case MSR_LSTAR: ++ case MSR_CSTAR: ++ case MSR_SYSCALL_MASK: ++ case MSR_FS_BASE: ++ case MSR_GS_BASE: ++ case MSR_SHADOW_GS_BASE: ++ svm_sync_vmcb(v); ++ break; ++ } ++ + switch ( msr ) + { + case MSR_IA32_SYSENTER_CS: +@@ -1848,6 +1869,34 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content) + *msr_content = v->arch.hvm_svm.guest_sysenter_eip; + break; + ++ case MSR_STAR: ++ *msr_content = vmcb->star; ++ break; ++ ++ case MSR_LSTAR: ++ *msr_content = vmcb->lstar; ++ break; ++ ++ case MSR_CSTAR: ++ *msr_content = vmcb->cstar; ++ break; ++ ++ case MSR_SYSCALL_MASK: ++ *msr_content = vmcb->sfmask; ++ break; ++ ++ case MSR_FS_BASE: ++ *msr_content = vmcb->fs.base; ++ break; ++ ++ case MSR_GS_BASE: ++ *msr_content = vmcb->gs.base; ++ break; ++ ++ case MSR_SHADOW_GS_BASE: ++ *msr_content = vmcb->kerngsbase; ++ break; ++ + case MSR_IA32_MCx_MISC(4): /* Threshold register */ + case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3: + /* +@@ -1976,32 +2025,81 @@ static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content) + int ret, result = X86EMUL_OKAY; + struct vcpu *v = current; + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; +- int sync = 0; ++ bool sync = false; + + switch ( msr ) + { + case MSR_IA32_SYSENTER_CS: + case MSR_IA32_SYSENTER_ESP: + case MSR_IA32_SYSENTER_EIP: +- sync = 1; +- break; +- default: ++ case MSR_STAR: ++ case MSR_LSTAR: ++ case MSR_CSTAR: ++ case MSR_SYSCALL_MASK: ++ case MSR_FS_BASE: ++ case MSR_GS_BASE: ++ case MSR_SHADOW_GS_BASE: ++ sync = true; + break; + } + + if ( sync ) +- svm_sync_vmcb(v); ++ svm_sync_vmcb(v); + + switch ( msr ) + { ++ case MSR_IA32_SYSENTER_ESP: ++ case MSR_IA32_SYSENTER_EIP: ++ case MSR_LSTAR: ++ case MSR_CSTAR: ++ case MSR_FS_BASE: ++ case MSR_GS_BASE: ++ case MSR_SHADOW_GS_BASE: ++ if ( !is_canonical_address(msr_content) ) ++ goto gpf; ++ ++ switch ( msr ) ++ { ++ case MSR_IA32_SYSENTER_ESP: ++ vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content; ++ break; ++ ++ case MSR_IA32_SYSENTER_EIP: ++ vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = msr_content; ++ break; ++ ++ case MSR_LSTAR: ++ vmcb->lstar = msr_content; ++ break; ++ ++ case MSR_CSTAR: ++ vmcb->cstar = msr_content; ++ break; ++ ++ case MSR_FS_BASE: ++ vmcb->fs.base = msr_content; ++ break; ++ ++ case MSR_GS_BASE: ++ vmcb->gs.base = msr_content; ++ break; ++ ++ case MSR_SHADOW_GS_BASE: ++ vmcb->kerngsbase = msr_content; ++ break; ++ } ++ break; ++ + case MSR_IA32_SYSENTER_CS: + vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = msr_content; + break; +- case MSR_IA32_SYSENTER_ESP: +- vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content; ++ ++ case MSR_STAR: ++ vmcb->star = msr_content; + break; +- case MSR_IA32_SYSENTER_EIP: +- vmcb->sysenter_eip = 
v->arch.hvm_svm.guest_sysenter_eip = msr_content; ++ ++ case MSR_SYSCALL_MASK: ++ vmcb->sfmask = msr_content; + break; + + case MSR_IA32_DEBUGCTLMSR: +diff --git a/xen/arch/x86/hvm/svm/svmdebug.c b/xen/arch/x86/hvm/svm/svmdebug.c +index 89ef2db932..b5b946aa94 100644 +--- a/xen/arch/x86/hvm/svm/svmdebug.c ++++ b/xen/arch/x86/hvm/svm/svmdebug.c +@@ -131,9 +131,8 @@ bool svm_vmcb_isvalid(const char *from, const struct vmcb_struct *vmcb, + PRINTF("DR7: bits [63:32] are not zero (%#"PRIx64")\n", + vmcb_get_dr7(vmcb)); + +- if ( efer & ~(EFER_SCE | EFER_LME | EFER_LMA | EFER_NX | EFER_SVME | +- EFER_LMSLE | EFER_FFXSE) ) +- PRINTF("EFER: undefined bits are not zero (%#"PRIx64")\n", efer); ++ if ( efer & ~EFER_KNOWN_MASK ) ++ PRINTF("EFER: unknown bits are not zero (%#"PRIx64")\n", efer); + + if ( hvm_efer_valid(v, efer, -1) ) + PRINTF("EFER: %s (%"PRIx64")\n", hvm_efer_valid(v, efer, -1), efer); +diff --git a/xen/arch/x86/hvm/viridian.c b/xen/arch/x86/hvm/viridian.c +index f0fa59d7d5..b02a70d086 100644 +--- a/xen/arch/x86/hvm/viridian.c ++++ b/xen/arch/x86/hvm/viridian.c +@@ -245,7 +245,7 @@ void cpuid_viridian_leaves(const struct vcpu *v, uint32_t leaf, + }; + union { + HV_PARTITION_PRIVILEGE_MASK mask; +- uint32_t lo, hi; ++ struct { uint32_t lo, hi; }; + } u; + + if ( !(viridian_feature_mask(d) & HVMPV_no_freq) ) +@@ -966,12 +966,10 @@ int viridian_hypercall(struct cpu_user_regs *regs) + gprintk(XENLOG_WARNING, "unimplemented hypercall %04x\n", + input.call_code); + /* Fallthrough. */ +- case HvGetPartitionId: + case HvExtCallQueryCapabilities: + /* +- * These hypercalls seem to be erroneously issued by Windows +- * despite neither AccessPartitionId nor EnableExtendedHypercalls +- * being set in CPUID leaf 2. ++ * This hypercall seems to be erroneously issued by Windows ++ * despite EnableExtendedHypercalls not being set in CPUID leaf 2. + * Given that return a status of 'invalid code' has not so far + * caused any problems it's not worth logging. + */ +diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c +index 181f4cb631..04e3c2e15b 100644 +--- a/xen/arch/x86/hvm/vpt.c ++++ b/xen/arch/x86/hvm/vpt.c +@@ -107,31 +107,49 @@ static int pt_irq_vector(struct periodic_time *pt, enum hvm_intsrc src) + static int pt_irq_masked(struct periodic_time *pt) + { + struct vcpu *v = pt->vcpu; +- unsigned int gsi, isa_irq; +- int mask; +- uint8_t pic_imr; ++ unsigned int gsi = pt->irq; + +- if ( pt->source == PTSRC_lapic ) ++ switch ( pt->source ) ++ { ++ case PTSRC_lapic: + { + struct vlapic *vlapic = vcpu_vlapic(v); ++ + return (!vlapic_enabled(vlapic) || + (vlapic_get_reg(vlapic, APIC_LVTT) & APIC_LVT_MASKED)); + } + +- isa_irq = pt->irq; +- gsi = hvm_isa_irq_to_gsi(isa_irq); +- pic_imr = v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr; +- mask = vioapic_get_mask(v->domain, gsi); +- if ( mask < 0 ) ++ case PTSRC_isa: + { +- dprintk(XENLOG_WARNING, "d%u: invalid GSI (%u) for platform timer\n", +- v->domain->domain_id, gsi); +- domain_crash(v->domain); +- return -1; ++ uint8_t pic_imr = v->domain->arch.hvm_domain.vpic[pt->irq >> 3].imr; ++ ++ /* Check if the interrupt is unmasked in the PIC. */ ++ if ( !(pic_imr & (1 << (pt->irq & 7))) && vlapic_accept_pic_intr(v) ) ++ return 0; ++ ++ gsi = hvm_isa_irq_to_gsi(pt->irq); ++ } ++ ++ /* Fallthrough to check if the interrupt is masked on the IO APIC. 
*/ ++ case PTSRC_ioapic: ++ { ++ int mask = vioapic_get_mask(v->domain, gsi); ++ ++ if ( mask < 0 ) ++ { ++ dprintk(XENLOG_WARNING, ++ "d%d: invalid GSI (%u) for platform timer\n", ++ v->domain->domain_id, gsi); ++ domain_crash(v->domain); ++ return -1; ++ } ++ ++ return mask; ++ } + } + +- return (((pic_imr & (1 << (isa_irq & 7))) || !vlapic_accept_pic_intr(v)) && +- mask); ++ ASSERT_UNREACHABLE(); ++ return 1; + } + + static void pt_lock(struct periodic_time *pt) +@@ -252,7 +270,7 @@ int pt_update_irq(struct vcpu *v) + struct list_head *head = &v->arch.hvm_vcpu.tm_list; + struct periodic_time *pt, *temp, *earliest_pt; + uint64_t max_lag; +- int irq, is_lapic, pt_vector; ++ int irq, pt_vector = -1; + + spin_lock(&v->arch.hvm_vcpu.tm_lock); + +@@ -288,29 +306,26 @@ int pt_update_irq(struct vcpu *v) + + earliest_pt->irq_issued = 1; + irq = earliest_pt->irq; +- is_lapic = (earliest_pt->source == PTSRC_lapic); + + spin_unlock(&v->arch.hvm_vcpu.tm_lock); + +- /* +- * If periodic timer interrut is handled by lapic, its vector in +- * IRR is returned and used to set eoi_exit_bitmap for virtual +- * interrupt delivery case. Otherwise return -1 to do nothing. +- */ +- if ( is_lapic ) ++ switch ( earliest_pt->source ) + { ++ case PTSRC_lapic: ++ /* ++ * If periodic timer interrupt is handled by lapic, its vector in ++ * IRR is returned and used to set eoi_exit_bitmap for virtual ++ * interrupt delivery case. Otherwise return -1 to do nothing. ++ */ + vlapic_set_irq(vcpu_vlapic(v), irq, 0); + pt_vector = irq; +- } +- else +- { ++ break; ++ ++ case PTSRC_isa: + hvm_isa_irq_deassert(v->domain, irq); + if ( platform_legacy_irq(irq) && vlapic_accept_pic_intr(v) && + v->domain->arch.hvm_domain.vpic[irq >> 3].int_output ) +- { + hvm_isa_irq_assert(v->domain, irq, NULL); +- pt_vector = -1; +- } + else + { + pt_vector = hvm_isa_irq_assert(v->domain, irq, vioapic_get_vector); +@@ -321,6 +336,17 @@ int pt_update_irq(struct vcpu *v) + if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) ) + pt_vector = -1; + } ++ break; ++ ++ case PTSRC_ioapic: ++ /* ++ * NB: At the moment IO-APIC routed interrupts generated by vpt devices ++ * (HPET) are edge-triggered. 
++ */ ++ pt_vector = hvm_ioapic_assert(v->domain, irq, false); ++ if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) ) ++ pt_vector = -1; ++ break; + } + + return pt_vector; +@@ -418,7 +444,14 @@ void create_periodic_time( + struct vcpu *v, struct periodic_time *pt, uint64_t delta, + uint64_t period, uint8_t irq, time_cb *cb, void *data) + { +- ASSERT(pt->source != 0); ++ if ( !pt->source || ++ (pt->irq >= NR_ISAIRQS && pt->source == PTSRC_isa) || ++ (pt->irq >= hvm_domain_irq(v->domain)->nr_gsis && ++ pt->source == PTSRC_ioapic) ) ++ { ++ ASSERT_UNREACHABLE(); ++ return; ++ } + + destroy_periodic_time(pt); + +@@ -498,7 +531,7 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v) + { + int on_list; + +- ASSERT(pt->source == PTSRC_isa); ++ ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic); + + if ( pt->vcpu == NULL ) + return; +diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c +index 642ca312bf..c281936af0 100644 +--- a/xen/arch/x86/pv/emul-priv-op.c ++++ b/xen/arch/x86/pv/emul-priv-op.c +@@ -813,26 +813,6 @@ static int write_cr(unsigned int reg, unsigned long val, + return X86EMUL_UNHANDLEABLE; + } + +-static int read_dr(unsigned int reg, unsigned long *val, +- struct x86_emulate_ctxt *ctxt) +-{ +- unsigned long res = do_get_debugreg(reg); +- +- if ( IS_ERR_VALUE(res) ) +- return X86EMUL_UNHANDLEABLE; +- +- *val = res; +- +- return X86EMUL_OKAY; +-} +- +-static int write_dr(unsigned int reg, unsigned long val, +- struct x86_emulate_ctxt *ctxt) +-{ +- return do_set_debugreg(reg, val) == 0 +- ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE; +-} +- + static inline uint64_t guest_misc_enable(uint64_t val) + { + val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL | +@@ -906,9 +886,16 @@ static int read_msr(unsigned int reg, uint64_t *val, + return X86EMUL_OKAY; + + case MSR_EFER: +- *val = read_efer(); ++ /* Hide unknown bits, and unconditionally hide SVME from guests. */ ++ *val = read_efer() & EFER_KNOWN_MASK & ~EFER_SVME; ++ /* ++ * Hide the 64-bit features from 32-bit guests. SCE has ++ * vendor-dependent behaviour. ++ */ + if ( is_pv_32bit_domain(currd) ) +- *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE); ++ *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE | ++ (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ++ ? EFER_SCE : 0)); + return X86EMUL_OKAY; + + case MSR_K7_FID_VID_CTL: +@@ -1326,8 +1313,8 @@ static const struct x86_emulate_ops priv_op_ops = { + .read_segment = read_segment, + .read_cr = read_cr, + .write_cr = write_cr, +- .read_dr = read_dr, +- .write_dr = write_dr, ++ .read_dr = x86emul_read_dr, ++ .write_dr = x86emul_write_dr, + .read_msr = read_msr, + .write_msr = write_msr, + .cpuid = pv_emul_cpuid, +diff --git a/xen/arch/x86/pv/misc-hypercalls.c b/xen/arch/x86/pv/misc-hypercalls.c +index 5862130697..1619be7874 100644 +--- a/xen/arch/x86/pv/misc-hypercalls.c ++++ b/xen/arch/x86/pv/misc-hypercalls.c +@@ -30,22 +30,10 @@ long do_set_debugreg(int reg, unsigned long value) + + unsigned long do_get_debugreg(int reg) + { +- struct vcpu *curr = current; ++ unsigned long val; ++ int res = x86emul_read_dr(reg, &val, NULL); + +- switch ( reg ) +- { +- case 0 ... 3: +- case 6: +- return curr->arch.debugreg[reg]; +- case 7: +- return (curr->arch.debugreg[7] | +- curr->arch.debugreg[5]); +- case 4 ... 5: +- return ((curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ? +- curr->arch.debugreg[reg + 2] : 0); +- } +- +- return -EINVAL; ++ return res == X86EMUL_OKAY ? 
val : -ENODEV; + } + + long do_fpu_taskswitch(int set) +diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c +index e1d023428c..f81fc2ca65 100644 +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -968,6 +968,7 @@ static int cpu_smpboot_alloc(unsigned int cpu) + set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE); + set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE); + set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); ++ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE); + + for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1); + i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i ) +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 3c7447bfe6..fa67a0ffbd 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -97,12 +97,13 @@ static void __init print_details(enum ind_thunk thunk) + printk(XENLOG_DEBUG "Speculative mitigation facilities:\n"); + + /* Hardware features which pertain to speculative mitigations. */ +- printk(XENLOG_DEBUG " Hardware features:%s%s%s%s%s\n", ++ printk(XENLOG_DEBUG " Hardware features:%s%s%s%s%s%s\n", + (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "", + (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBPB)) ? " IBPB" : "", + (caps & ARCH_CAPABILITIES_IBRS_ALL) ? " IBRS_ALL" : "", +- (caps & ARCH_CAPABILITIES_RDCL_NO) ? " RDCL_NO" : ""); ++ (caps & ARCH_CAPABILITIES_RDCL_NO) ? " RDCL_NO" : "", ++ (caps & ARCH_CAPS_RSBA) ? " RSBA" : ""); + + /* Compiled-in support which pertains to BTI mitigations. */ + if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) ) +@@ -135,6 +136,20 @@ static bool __init retpoline_safe(void) + boot_cpu_data.x86 != 6 ) + return false; + ++ if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) ++ { ++ uint64_t caps; ++ ++ rdmsrl(MSR_ARCH_CAPABILITIES, caps); ++ ++ /* ++ * RBSA may be set by a hypervisor to indicate that we may move to a ++ * processor which isn't retpoline-safe. ++ */ ++ if ( caps & ARCH_CAPS_RSBA ) ++ return false; ++ } ++ + switch ( boot_cpu_data.x86_model ) + { + case 0x17: /* Penryn */ +@@ -161,18 +176,40 @@ static bool __init retpoline_safe(void) + * versions. + */ + case 0x3d: /* Broadwell */ +- return ucode_rev >= 0x28; ++ return ucode_rev >= 0x2a; + case 0x47: /* Broadwell H */ +- return ucode_rev >= 0x1b; ++ return ucode_rev >= 0x1d; + case 0x4f: /* Broadwell EP/EX */ +- return ucode_rev >= 0xb000025; ++ return ucode_rev >= 0xb000021; + case 0x56: /* Broadwell D */ +- return false; /* TBD. */ ++ switch ( boot_cpu_data.x86_mask ) ++ { ++ case 2: return ucode_rev >= 0x15; ++ case 3: return ucode_rev >= 0x7000012; ++ case 4: return ucode_rev >= 0xf000011; ++ case 5: return ucode_rev >= 0xe000009; ++ default: ++ printk("Unrecognised CPU stepping %#x - assuming not reptpoline safe\n", ++ boot_cpu_data.x86_mask); ++ return false; ++ } ++ break; + + /* +- * Skylake and later processors are not retpoline-safe. ++ * Skylake, Kabylake and Cannonlake processors are not retpoline-safe. 
+ */ ++ case 0x4e: ++ case 0x55: ++ case 0x5e: ++ case 0x66: ++ case 0x67: ++ case 0x8e: ++ case 0x9e: ++ return false; ++ + default: ++ printk("Unrecognised CPU model %#x - assuming not reptpoline safe\n", ++ boot_cpu_data.x86_model); + return false; + } + } +diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c +index 906124331b..e217b0d6e2 100644 +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -325,13 +325,13 @@ static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs) + /* + * Notes for get_stack_trace_bottom() and get_stack_dump_bottom() + * +- * Stack pages 0, 1 and 2: ++ * Stack pages 0 - 3: + * These are all 1-page IST stacks. Each of these stacks have an exception + * frame and saved register state at the top. The interesting bound for a + * trace is the word adjacent to this, while the bound for a dump is the + * very top, including the exception frame. + * +- * Stack pages 3, 4 and 5: ++ * Stack pages 4 and 5: + * None of these are particularly interesting. With MEMORY_GUARD, page 5 is + * explicitly not present, so attempting to dump or trace it is + * counterproductive. Without MEMORY_GUARD, it is possible for a call chain +@@ -352,12 +352,12 @@ unsigned long get_stack_trace_bottom(unsigned long sp) + { + switch ( get_stack_page(sp) ) + { +- case 0 ... 2: ++ case 0 ... 3: + return ROUNDUP(sp, PAGE_SIZE) - + offsetof(struct cpu_user_regs, es) - sizeof(unsigned long); + + #ifndef MEMORY_GUARD +- case 3 ... 5: ++ case 4 ... 5: + #endif + case 6 ... 7: + return ROUNDUP(sp, STACK_SIZE) - +@@ -372,11 +372,11 @@ unsigned long get_stack_dump_bottom(unsigned long sp) + { + switch ( get_stack_page(sp) ) + { +- case 0 ... 2: ++ case 0 ... 3: + return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long); + + #ifndef MEMORY_GUARD +- case 3 ... 5: ++ case 4 ... 5: + #endif + case 6 ... 7: + return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long); +@@ -1761,11 +1761,36 @@ static void ler_enable(void) + + void do_debug(struct cpu_user_regs *regs) + { ++ unsigned long dr6; + struct vcpu *v = current; + ++ /* Stash dr6 as early as possible. */ ++ dr6 = read_debugreg(6); ++ + if ( debugger_trap_entry(TRAP_debug, regs) ) + return; + ++ /* ++ * At the time of writing (March 2018), on the subject of %dr6: ++ * ++ * The Intel manual says: ++ * Certain debug exceptions may clear bits 0-3. The remaining contents ++ * of the DR6 register are never cleared by the processor. To avoid ++ * confusion in identifying debug exceptions, debug handlers should ++ * clear the register (except bit 16, which they should set) before ++ * returning to the interrupted task. ++ * ++ * The AMD manual says: ++ * Bits 15:13 of the DR6 register are not cleared by the processor and ++ * must be cleared by software after the contents have been read. ++ * ++ * Some bits are reserved set, some are reserved clear, and some bits ++ * which were previously reserved set are reused and cleared by hardware. ++ * For future compatibility, reset to the default value, which will allow ++ * us to spot any bit being changed by hardware to its non-default value. ++ */ ++ write_debugreg(6, X86_DR6_DEFAULT); ++ + if ( !guest_mode(regs) ) + { + if ( regs->eflags & X86_EFLAGS_TF ) +@@ -1784,21 +1809,50 @@ void do_debug(struct cpu_user_regs *regs) + regs->eflags &= ~X86_EFLAGS_TF; + } + } +- else ++ ++ /* ++ * Check for fault conditions. General Detect, and instruction ++ * breakpoints are faults rather than traps, at which point attempting ++ * to ignore and continue will result in a livelock. 
++ */ ++ if ( dr6 & DR_GENERAL_DETECT ) + { +- /* +- * We ignore watchpoints when they trigger within Xen. This may +- * happen when a buffer is passed to us which previously had a +- * watchpoint set on it. No need to bump EIP; the only faulting +- * trap is an instruction breakpoint, which can't happen to us. +- */ +- WARN_ON(!search_exception_table(regs)); ++ printk(XENLOG_ERR "Hit General Detect in Xen context\n"); ++ fatal_trap(regs, 0); ++ } ++ ++ if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) ) ++ { ++ unsigned int bp, dr7 = read_debugreg(7) >> DR_CONTROL_SHIFT; ++ ++ for ( bp = 0; bp < 4; ++bp ) ++ { ++ if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */ ++ ((dr7 & (3u << (bp * DR_CONTROL_SIZE))) == 0) /* Insn? */ ) ++ { ++ printk(XENLOG_ERR ++ "Hit instruction breakpoint in Xen context\n"); ++ fatal_trap(regs, 0); ++ } ++ } + } ++ ++ /* ++ * Whatever caused this #DB should be a trap. Note it and continue. ++ * Guests can trigger this in certain corner cases, so ensure the ++ * message is ratelimited. ++ */ ++ gprintk(XENLOG_WARNING, ++ "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n", ++ regs->cs, _p(regs->rip), _p(regs->rip), ++ regs->ss, _p(regs->rsp), dr6); ++ + goto out; + } + + /* Save debug status register where guest OS can peek at it */ +- v->arch.debugreg[6] = read_debugreg(6); ++ v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT); ++ v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT); + + ler_enable(); + pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC); +@@ -1917,6 +1971,7 @@ void __init init_idt_traps(void) + set_ist(&idt_table[TRAP_double_fault], IST_DF); + set_ist(&idt_table[TRAP_nmi], IST_NMI); + set_ist(&idt_table[TRAP_machine_check], IST_MCE); ++ set_ist(&idt_table[TRAP_debug], IST_DB); + + /* CPU0 uses the master IDT. */ + idt_tables[0] = idt_table; +@@ -1984,6 +2039,12 @@ void activate_debugregs(const struct vcpu *curr) + } + } + ++/* ++ * Used by hypercalls and the emulator. ++ * -ENODEV => #UD ++ * -EINVAL => #GP Invalid bit ++ * -EPERM => #GP Valid bit, but not permitted to use ++ */ + long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) + { + int i; +@@ -2015,7 +2076,17 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) + if ( v == curr ) + write_debugreg(3, value); + break; ++ ++ case 4: ++ if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE ) ++ return -ENODEV; ++ ++ /* Fallthrough */ + case 6: ++ /* The upper 32 bits are strictly reserved. */ ++ if ( value != (uint32_t)value ) ++ return -EINVAL; ++ + /* + * DR6: Bits 4-11,16-31 reserved (set to 1). + * Bit 12 reserved (set to 0). +@@ -2025,7 +2096,17 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) + if ( v == curr ) + write_debugreg(6, value); + break; ++ ++ case 5: ++ if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE ) ++ return -ENODEV; ++ ++ /* Fallthrough */ + case 7: ++ /* The upper 32 bits are strictly reserved. */ ++ if ( value != (uint32_t)value ) ++ return -EINVAL; ++ + /* + * DR7: Bit 10 reserved (set to 1). + * Bits 11-12,14-15 reserved (set to 0). +@@ -2038,6 +2119,10 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) + */ + if ( value & DR_GENERAL_DETECT ) + return -EPERM; ++ ++ /* Zero the IO shadow before recalculating the real %dr7 */ ++ v->arch.debugreg[5] = 0; ++ + /* DR7.{G,L}E = 0 => debugging disabled for this domain. 
*/ + if ( value & DR7_ACTIVE_MASK ) + { +@@ -2070,7 +2155,7 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) + write_debugreg(7, value); + break; + default: +- return -EINVAL; ++ return -ENODEV; + } + + v->arch.debugreg[reg] = value; +diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S +index 75497bc292..a47cb9dc19 100644 +--- a/xen/arch/x86/x86_64/compat/entry.S ++++ b/xen/arch/x86/x86_64/compat/entry.S +@@ -39,6 +39,12 @@ ENTRY(compat_test_all_events) + leaq irq_stat+IRQSTAT_softirq_pending(%rip),%rcx + cmpl $0,(%rcx,%rax,1) + jne compat_process_softirqs ++ ++ /* Inject exception if pending. */ ++ lea VCPU_trap_bounce(%rbx), %rdx ++ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx) ++ jnz .Lcompat_process_trapbounce ++ + testb $1,VCPU_mce_pending(%rbx) + jnz compat_process_mce + .Lcompat_test_guest_nmi: +@@ -68,15 +74,24 @@ compat_process_softirqs: + call do_softirq + jmp compat_test_all_events + ++ ALIGN ++/* %rbx: struct vcpu, %rdx: struct trap_bounce */ ++.Lcompat_process_trapbounce: ++ sti ++.Lcompat_bounce_exception: ++ call compat_create_bounce_frame ++ movb $0, TRAPBOUNCE_flags(%rdx) ++ jmp compat_test_all_events ++ + ALIGN + /* %rbx: struct vcpu */ + compat_process_mce: + testb $1 << VCPU_TRAP_MCE,VCPU_async_exception_mask(%rbx) + jnz .Lcompat_test_guest_nmi + sti +- movb $0,VCPU_mce_pending(%rbx) +- call set_guest_machinecheck_trapbounce +- testl %eax,%eax ++ movb $0, VCPU_mce_pending(%rbx) ++ call set_guest_machinecheck_trapbounce ++ test %al, %al + jz compat_test_all_events + movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the + movb %dl,VCPU_mce_old_mask(%rbx) # iret hypercall +@@ -88,11 +103,11 @@ compat_process_mce: + /* %rbx: struct vcpu */ + compat_process_nmi: + testb $1 << VCPU_TRAP_NMI,VCPU_async_exception_mask(%rbx) +- jnz compat_test_guest_events ++ jnz compat_test_guest_events + sti +- movb $0,VCPU_nmi_pending(%rbx) ++ movb $0, VCPU_nmi_pending(%rbx) + call set_guest_nmi_trapbounce +- testl %eax,%eax ++ test %al, %al + jz compat_test_all_events + movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the + movb %dl,VCPU_nmi_old_mask(%rbx) # iret hypercall +@@ -189,15 +204,6 @@ ENTRY(cr4_pv32_restore) + xor %eax, %eax + ret + +-/* %rdx: trap_bounce, %rbx: struct vcpu */ +-ENTRY(compat_post_handle_exception) +- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) +- jz compat_test_all_events +-.Lcompat_bounce_exception: +- call compat_create_bounce_frame +- movb $0,TRAPBOUNCE_flags(%rdx) +- jmp compat_test_all_events +- + .section .text.entry, "ax", @progbits + + /* See lstar_enter for entry register state. */ +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index bdd33e727f..41d3ec21a1 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -42,6 +42,12 @@ test_all_events: + leaq irq_stat+IRQSTAT_softirq_pending(%rip), %rcx + cmpl $0, (%rcx, %rax, 1) + jne process_softirqs ++ ++ /* Inject exception if pending. 
*/ ++ lea VCPU_trap_bounce(%rbx), %rdx ++ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx) ++ jnz .Lprocess_trapbounce ++ + cmpb $0, VCPU_mce_pending(%rbx) + jne process_mce + .Ltest_guest_nmi: +@@ -69,6 +75,15 @@ process_softirqs: + call do_softirq + jmp test_all_events + ++ ALIGN ++/* %rbx: struct vcpu, %rdx struct trap_bounce */ ++.Lprocess_trapbounce: ++ sti ++.Lbounce_exception: ++ call create_bounce_frame ++ movb $0, TRAPBOUNCE_flags(%rdx) ++ jmp test_all_events ++ + ALIGN + /* %rbx: struct vcpu */ + process_mce: +@@ -77,7 +92,7 @@ process_mce: + sti + movb $0, VCPU_mce_pending(%rbx) + call set_guest_machinecheck_trapbounce +- test %eax, %eax ++ test %al, %al + jz test_all_events + movzbl VCPU_async_exception_mask(%rbx), %edx # save mask for the + movb %dl, VCPU_mce_old_mask(%rbx) # iret hypercall +@@ -93,7 +108,7 @@ process_nmi: + sti + movb $0, VCPU_nmi_pending(%rbx) + call set_guest_nmi_trapbounce +- test %eax, %eax ++ test %al, %al + jz test_all_events + movzbl VCPU_async_exception_mask(%rbx), %edx # save mask for the + movb %dl, VCPU_nmi_old_mask(%rbx) # iret hypercall +@@ -667,15 +682,9 @@ handle_exception_saved: + mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + testb $3,UREGS_cs(%rsp) + jz restore_all_xen +- leaq VCPU_trap_bounce(%rbx),%rdx + movq VCPU_domain(%rbx),%rax + testb $1,DOMAIN_is_32bit_pv(%rax) +- jnz compat_post_handle_exception +- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) +- jz test_all_events +-.Lbounce_exception: +- call create_bounce_frame +- movb $0,TRAPBOUNCE_flags(%rdx) ++ jnz compat_test_all_events + jmp test_all_events + + /* No special register assumptions. */ +@@ -730,7 +739,7 @@ ENTRY(device_not_available) + ENTRY(debug) + pushq $0 + movl $TRAP_debug,4(%rsp) +- jmp handle_exception ++ jmp handle_ist_exception + + ENTRY(int3) + pushq $0 +@@ -783,12 +792,14 @@ ENTRY(double_fault) + /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ + + mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rbx +- test %rbx, %rbx ++ neg %rbx + jz .Ldblf_cr3_okay + jns .Ldblf_cr3_load ++ mov %rbx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + neg %rbx + .Ldblf_cr3_load: + mov %rbx, %cr3 ++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + .Ldblf_cr3_okay: + + movq %rsp,%rdi +diff --git a/xen/arch/x86/x86_emulate.c b/xen/arch/x86/x86_emulate.c +index c7ba221d11..9125c67c9e 100644 +--- a/xen/arch/x86/x86_emulate.c ++++ b/xen/arch/x86/x86_emulate.c +@@ -14,6 +14,7 @@ + #include <asm/processor.h> /* current_cpu_info */ + #include <asm/xstate.h> + #include <asm/amd.h> /* cpu_has_amd_erratum() */ ++#include <asm/debugreg.h> + + /* Avoid namespace pollution. */ + #undef cmpxchg +@@ -41,3 +42,75 @@ + }) + + #include "x86_emulate/x86_emulate.c" ++ ++/* Called with NULL ctxt in hypercall context. */ ++int x86emul_read_dr(unsigned int reg, unsigned long *val, ++ struct x86_emulate_ctxt *ctxt) ++{ ++ struct vcpu *curr = current; ++ ++ /* HVM support requires a bit more plumbing before it will work. */ ++ ASSERT(is_pv_vcpu(curr)); ++ ++ switch ( reg ) ++ { ++ case 0 ... 3: ++ case 6: ++ *val = curr->arch.debugreg[reg]; ++ break; ++ ++ case 7: ++ *val = (curr->arch.debugreg[7] | ++ curr->arch.debugreg[5]); ++ break; ++ ++ case 4 ... 
5: ++ if ( !(curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ) ++ { ++ *val = curr->arch.debugreg[reg + 2]; ++ break; ++ } ++ ++ /* Fallthrough */ ++ default: ++ if ( ctxt ) ++ x86_emul_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC, ctxt); ++ ++ return X86EMUL_EXCEPTION; ++ } ++ ++ return X86EMUL_OKAY; ++} ++ ++int x86emul_write_dr(unsigned int reg, unsigned long val, ++ struct x86_emulate_ctxt *ctxt) ++{ ++ struct vcpu *curr = current; ++ ++ /* HVM support requires a bit more plumbing before it will work. */ ++ ASSERT(is_pv_vcpu(curr)); ++ ++ switch ( set_debugreg(curr, reg, val) ) ++ { ++ case 0: ++ return X86EMUL_OKAY; ++ ++ case -ENODEV: ++ x86_emul_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC, ctxt); ++ return X86EMUL_EXCEPTION; ++ ++ default: ++ x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt); ++ return X86EMUL_EXCEPTION; ++ } ++} ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/arch/x86/x86_emulate/x86_emulate.h b/xen/arch/x86/x86_emulate/x86_emulate.h +index 0c8c80ad5a..9c2bb8157c 100644 +--- a/xen/arch/x86/x86_emulate/x86_emulate.h ++++ b/xen/arch/x86/x86_emulate/x86_emulate.h +@@ -662,6 +662,11 @@ static inline void x86_emulate_free_state(struct x86_emulate_state *state) {} + void x86_emulate_free_state(struct x86_emulate_state *state); + #endif + ++int x86emul_read_dr(unsigned int reg, unsigned long *val, ++ struct x86_emulate_ctxt *ctxt); ++int x86emul_write_dr(unsigned int reg, unsigned long val, ++ struct x86_emulate_ctxt *ctxt); ++ + #endif + + static inline void x86_emul_hw_exception( +diff --git a/xen/common/schedule.c b/xen/common/schedule.c +index b7884263f2..f21c3e5a64 100644 +--- a/xen/common/schedule.c ++++ b/xen/common/schedule.c +@@ -436,14 +436,9 @@ void sched_destroy_domain(struct domain *d) + cpupool_rm_domain(d); + } + +-void vcpu_sleep_nosync(struct vcpu *v) ++void vcpu_sleep_nosync_locked(struct vcpu *v) + { +- unsigned long flags; +- spinlock_t *lock; +- +- TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); +- +- lock = vcpu_schedule_lock_irqsave(v, &flags); ++ ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock)); + + if ( likely(!vcpu_runnable(v)) ) + { +@@ -452,6 +447,18 @@ void vcpu_sleep_nosync(struct vcpu *v) + + SCHED_OP(vcpu_scheduler(v), sleep, v); + } ++} ++ ++void vcpu_sleep_nosync(struct vcpu *v) ++{ ++ unsigned long flags; ++ spinlock_t *lock; ++ ++ TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); ++ ++ lock = vcpu_schedule_lock_irqsave(v, &flags); ++ ++ vcpu_sleep_nosync_locked(v); + + vcpu_schedule_unlock_irqrestore(lock, flags, v); + } +@@ -567,13 +574,54 @@ static void vcpu_move_nosched(struct vcpu *v, unsigned int new_cpu) + sched_move_irqs(v); + } + +-static void vcpu_migrate(struct vcpu *v) ++/* ++ * Initiating migration ++ * ++ * In order to migrate, we need the vcpu in question to have stopped ++ * running and had SCHED_OP(sleep) called (to take it off any ++ * runqueues, for instance); and if it is currently running, it needs ++ * to be scheduled out. Finally, we need to hold the scheduling locks ++ * for both the processor we're migrating from, and the processor ++ * we're migrating to. ++ * ++ * In order to avoid deadlock while satisfying the final requirement, ++ * we must release any scheduling lock we hold, then try to grab both ++ * locks we want, then double-check to make sure that what we started ++ * to do hasn't been changed in the mean time. 
++ * ++ * These steps are encapsulated in the following two functions; they ++ * should be called like this: ++ * ++ * lock = vcpu_schedule_lock_irq(v); ++ * vcpu_migrate_start(v); ++ * vcpu_schedule_unlock_irq(lock, v) ++ * vcpu_migrate_finish(v); ++ * ++ * vcpu_migrate_finish() will do the work now if it can, or simply ++ * return if it can't (because v is still running); in that case ++ * vcpu_migrate_finish() will be called by context_saved(). ++ */ ++void vcpu_migrate_start(struct vcpu *v) ++{ ++ set_bit(_VPF_migrating, &v->pause_flags); ++ vcpu_sleep_nosync_locked(v); ++} ++ ++static void vcpu_migrate_finish(struct vcpu *v) + { + unsigned long flags; + unsigned int old_cpu, new_cpu; + spinlock_t *old_lock, *new_lock; + bool_t pick_called = 0; + ++ /* ++ * If the vcpu is currently running, this will be handled by ++ * context_saved(); and in any case, if the bit is cleared, then ++ * someone else has already done the work so we don't need to. ++ */ ++ if ( v->is_running || !test_bit(_VPF_migrating, &v->pause_flags) ) ++ return; ++ + old_cpu = new_cpu = v->processor; + for ( ; ; ) + { +@@ -653,14 +701,11 @@ void vcpu_force_reschedule(struct vcpu *v) + spinlock_t *lock = vcpu_schedule_lock_irq(v); + + if ( v->is_running ) +- set_bit(_VPF_migrating, &v->pause_flags); ++ vcpu_migrate_start(v); ++ + vcpu_schedule_unlock_irq(lock, v); + +- if ( v->pause_flags & VPF_migrating ) +- { +- vcpu_sleep_nosync(v); +- vcpu_migrate(v); +- } ++ vcpu_migrate_finish(v); + } + + void restore_vcpu_affinity(struct domain *d) +@@ -812,10 +857,10 @@ int cpu_disable_scheduler(unsigned int cpu) + * * the scheduler will always fine a suitable solution, or + * things would have failed before getting in here. + */ +- set_bit(_VPF_migrating, &v->pause_flags); ++ vcpu_migrate_start(v); + vcpu_schedule_unlock_irqrestore(lock, flags, v); +- vcpu_sleep_nosync(v); +- vcpu_migrate(v); ++ ++ vcpu_migrate_finish(v); + + /* + * The only caveat, in this case, is that if a vcpu active in +@@ -849,18 +894,14 @@ static int vcpu_set_affinity( + * Always ask the scheduler to re-evaluate placement + * when changing the affinity. 
+ */ +- set_bit(_VPF_migrating, &v->pause_flags); ++ vcpu_migrate_start(v); + } + + vcpu_schedule_unlock_irq(lock, v); + + domain_update_node_affinity(v->domain); + +- if ( v->pause_flags & VPF_migrating ) +- { +- vcpu_sleep_nosync(v); +- vcpu_migrate(v); +- } ++ vcpu_migrate_finish(v); + + return ret; + } +@@ -1088,7 +1129,6 @@ int vcpu_pin_override(struct vcpu *v, int cpu) + { + cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved); + v->affinity_broken = 0; +- set_bit(_VPF_migrating, &v->pause_flags); + ret = 0; + } + } +@@ -1101,20 +1141,18 @@ int vcpu_pin_override(struct vcpu *v, int cpu) + cpumask_copy(v->cpu_hard_affinity_saved, v->cpu_hard_affinity); + v->affinity_broken = 1; + cpumask_copy(v->cpu_hard_affinity, cpumask_of(cpu)); +- set_bit(_VPF_migrating, &v->pause_flags); + ret = 0; + } + } + ++ if ( ret == 0 ) ++ vcpu_migrate_start(v); ++ + vcpu_schedule_unlock_irq(lock, v); + + domain_update_node_affinity(v->domain); + +- if ( v->pause_flags & VPF_migrating ) +- { +- vcpu_sleep_nosync(v); +- vcpu_migrate(v); +- } ++ vcpu_migrate_finish(v); + + return ret; + } +@@ -1501,8 +1539,7 @@ void context_saved(struct vcpu *prev) + + SCHED_OP(vcpu_scheduler(prev), context_saved, prev); + +- if ( unlikely(prev->pause_flags & VPF_migrating) ) +- vcpu_migrate(prev); ++ vcpu_migrate_finish(prev); + } + + /* The scheduler timer: force a run through the scheduler */ +diff --git a/xen/include/asm-x86/debugreg.h b/xen/include/asm-x86/debugreg.h +index c57914efc6..b3b10eaf40 100644 +--- a/xen/include/asm-x86/debugreg.h ++++ b/xen/include/asm-x86/debugreg.h +@@ -24,6 +24,8 @@ + #define DR_STATUS_RESERVED_ZERO (~0xffffeffful) /* Reserved, read as zero */ + #define DR_STATUS_RESERVED_ONE 0xffff0ff0ul /* Reserved, read as one */ + ++#define X86_DR6_DEFAULT 0xffff0ff0ul /* Default %dr6 value. */ ++ + /* Now define a bunch of things for manipulating the control register. + The top two bytes of the control register consist of 4 fields of 4 + bits - each field corresponds to one of the four debug registers, +diff --git a/xen/include/asm-x86/hvm/irq.h b/xen/include/asm-x86/hvm/irq.h +index f756cb5a0d..1a52ec6045 100644 +--- a/xen/include/asm-x86/hvm/irq.h ++++ b/xen/include/asm-x86/hvm/irq.h +@@ -207,6 +207,9 @@ int hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq); + + int hvm_inject_msi(struct domain *d, uint64_t addr, uint32_t data); + ++/* Assert an IO APIC pin. 
*/ ++int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level); ++ + void hvm_maybe_deassert_evtchn_irq(void); + void hvm_assert_evtchn_irq(struct vcpu *v); + void hvm_set_callback_via(struct domain *d, uint64_t via); +diff --git a/xen/include/asm-x86/hvm/vpt.h b/xen/include/asm-x86/hvm/vpt.h +index 21166edd06..0eb5ff632e 100644 +--- a/xen/include/asm-x86/hvm/vpt.h ++++ b/xen/include/asm-x86/hvm/vpt.h +@@ -44,6 +44,7 @@ struct periodic_time { + bool_t warned_timeout_too_short; + #define PTSRC_isa 1 /* ISA time source */ + #define PTSRC_lapic 2 /* LAPIC time source */ ++#define PTSRC_ioapic 3 /* IOAPIC time source */ + u8 source; /* PTSRC_ */ + u8 irq; + struct vcpu *vcpu; /* vcpu timer interrupt delivers to */ +diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h +index a8ceecf3e2..68fae91567 100644 +--- a/xen/include/asm-x86/msr-index.h ++++ b/xen/include/asm-x86/msr-index.h +@@ -31,6 +31,9 @@ + #define EFER_LMSLE (1<<_EFER_LMSLE) + #define EFER_FFXSE (1<<_EFER_FFXSE) + ++#define EFER_KNOWN_MASK (EFER_SCE | EFER_LME | EFER_LMA | EFER_NX | \ ++ EFER_SVME | EFER_LMSLE | EFER_FFXSE) ++ + /* Speculation Controls. */ + #define MSR_SPEC_CTRL 0x00000048 + #define SPEC_CTRL_IBRS (_AC(1, ULL) << 0) +@@ -42,6 +45,7 @@ + #define MSR_ARCH_CAPABILITIES 0x0000010a + #define ARCH_CAPABILITIES_RDCL_NO (_AC(1, ULL) << 0) + #define ARCH_CAPABILITIES_IBRS_ALL (_AC(1, ULL) << 1) ++#define ARCH_CAPS_RSBA (_AC(1, ULL) << 2) + + /* Intel MSRs. Some also available on other CPUs */ + #define MSR_IA32_PERFCTR0 0x000000c1 +diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h +index 80f8411355..a152f1d413 100644 +--- a/xen/include/asm-x86/processor.h ++++ b/xen/include/asm-x86/processor.h +@@ -445,7 +445,8 @@ struct __packed __cacheline_aligned tss_struct { + #define IST_DF 1UL + #define IST_NMI 2UL + #define IST_MCE 3UL +-#define IST_MAX 3UL ++#define IST_DB 4UL ++#define IST_MAX 4UL + + /* Set the interrupt stack table used by a particular interrupt + * descriptor table entry. */ |