Diffstat (limited to 'system/xen/patches/xen-4.10.2-pre.patch')
-rw-r--r-- | system/xen/patches/xen-4.10.2-pre.patch | 1631
1 file changed, 0 insertions, 1631 deletions
diff --git a/system/xen/patches/xen-4.10.2-pre.patch b/system/xen/patches/xen-4.10.2-pre.patch deleted file mode 100644 index 42477696e15fa..0000000000000 --- a/system/xen/patches/xen-4.10.2-pre.patch +++ /dev/null @@ -1,1631 +0,0 @@ -diff --git a/tools/libacpi/Makefile b/tools/libacpi/Makefile -index a47a658a25..c17f3924cc 100644 ---- a/tools/libacpi/Makefile -+++ b/tools/libacpi/Makefile -@@ -43,7 +43,7 @@ all: $(C_SRC) $(H_SRC) - - $(H_SRC): $(ACPI_BUILD_DIR)/%.h: %.asl iasl - iasl -vs -p $(ACPI_BUILD_DIR)/$*.$(TMP_SUFFIX) -tc $< -- sed -e 's/AmlCode/$*/g' $(ACPI_BUILD_DIR)/$*.hex >$@ -+ sed -e 's/AmlCode/$*/g' -e 's/_aml_code//g' $(ACPI_BUILD_DIR)/$*.hex >$@ - rm -f $(addprefix $(ACPI_BUILD_DIR)/, $*.aml $*.hex) - - $(MK_DSDT): mk_dsdt.c -@@ -76,7 +76,7 @@ $(ACPI_BUILD_DIR)/dsdt_anycpu_arm.asl: $(MK_DSDT) - - $(C_SRC): $(ACPI_BUILD_DIR)/%.c: iasl $(ACPI_BUILD_DIR)/%.asl - iasl -vs -p $(ACPI_BUILD_DIR)/$*.$(TMP_SUFFIX) -tc $(ACPI_BUILD_DIR)/$*.asl -- sed -e 's/AmlCode/$*/g' $(ACPI_BUILD_DIR)/$*.hex > $@.$(TMP_SUFFIX) -+ sed -e 's/AmlCode/$*/g' -e 's/_aml_code//g' $(ACPI_BUILD_DIR)/$*.hex > $@.$(TMP_SUFFIX) - echo "int $*_len=sizeof($*);" >> $@.$(TMP_SUFFIX) - mv -f $@.$(TMP_SUFFIX) $@ - rm -f $(addprefix $(ACPI_BUILD_DIR)/, $*.aml $*.hex) -#diff --git a/xen/Makefile b/xen/Makefile -#index ecec297b9b..580af86931 100644 -#--- a/xen/Makefile -#+++ b/xen/Makefile -#@@ -2,7 +2,7 @@ -# # All other places this is stored (eg. compile.h) should be autogenerated. -# export XEN_VERSION = 4 -# export XEN_SUBVERSION = 10 -#-export XEN_EXTRAVERSION ?= .1$(XEN_VENDORVERSION) -#+export XEN_EXTRAVERSION ?= .2-pre$(XEN_VENDORVERSION) -# export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION) -# -include xen-version -# -diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c -index 1e4e5680a7..f7085d3c7b 100644 ---- a/xen/arch/x86/acpi/power.c -+++ b/xen/arch/x86/acpi/power.c -@@ -28,6 +28,7 @@ - #include <asm/tboot.h> - #include <asm/apic.h> - #include <asm/io_apic.h> -+#include <asm/spec_ctrl.h> - #include <acpi/cpufreq/cpufreq.h> - - uint32_t system_reset_counter = 1; -@@ -163,6 +164,7 @@ static int enter_state(u32 state) - { - unsigned long flags; - int error; -+ struct cpu_info *ci; - unsigned long cr4; - - if ( (state <= ACPI_STATE_S0) || (state > ACPI_S_STATES_MAX) ) -@@ -203,12 +205,18 @@ static int enter_state(u32 state) - printk(XENLOG_ERR "Some devices failed to power down."); - system_state = SYS_STATE_resume; - device_power_up(error); -+ console_end_sync(); - error = -EIO; - goto done; - } - else - error = 0; - -+ ci = get_cpu_info(); -+ spec_ctrl_enter_idle(ci); -+ /* Avoid NMI/#MC using MSR_SPEC_CTRL until we've reloaded microcode. */ -+ ci->bti_ist_info = 0; -+ - ACPI_FLUSH_CPU_CACHE(); - - switch ( state ) -@@ -243,17 +251,23 @@ static int enter_state(u32 state) - if ( (state == ACPI_STATE_S3) && error ) - tboot_s3_error(error); - -+ console_end_sync(); -+ -+ microcode_resume_cpu(0); -+ -+ /* Re-enabled default NMI/#MC use of MSR_SPEC_CTRL. 
*/ -+ ci->bti_ist_info = default_bti_ist_info; -+ spec_ctrl_exit_idle(ci); -+ - done: - spin_debug_enable(); - local_irq_restore(flags); -- console_end_sync(); - acpi_sleep_post(state); - if ( hvm_cpu_up() ) - BUG(); -+ cpufreq_add_cpu(0); - - enable_cpu: -- cpufreq_add_cpu(0); -- microcode_resume_cpu(0); - rcu_barrier(); - mtrr_aps_sync_begin(); - enable_nonboot_cpus(); -diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c -index fdb2bf1779..136adadb63 100644 ---- a/xen/arch/x86/cpu/common.c -+++ b/xen/arch/x86/cpu/common.c -@@ -747,6 +747,7 @@ void load_system_tables(void) - [IST_MCE - 1] = stack_top + IST_MCE * PAGE_SIZE, - [IST_DF - 1] = stack_top + IST_DF * PAGE_SIZE, - [IST_NMI - 1] = stack_top + IST_NMI * PAGE_SIZE, -+ [IST_DB - 1] = stack_top + IST_DB * PAGE_SIZE, - - [IST_MAX ... ARRAY_SIZE(tss->ist) - 1] = - 0x8600111111111111ul, -@@ -774,6 +775,7 @@ void load_system_tables(void) - set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF); - set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI); - set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE); -+ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB); - - /* - * Bottom-of-stack must be 16-byte aligned! -diff --git a/xen/arch/x86/hpet.c b/xen/arch/x86/hpet.c -index 8229c635e4..f18cbbd55a 100644 ---- a/xen/arch/x86/hpet.c -+++ b/xen/arch/x86/hpet.c -@@ -509,6 +509,8 @@ static void hpet_attach_channel(unsigned int cpu, - static void hpet_detach_channel(unsigned int cpu, - struct hpet_event_channel *ch) - { -+ unsigned int next; -+ - spin_lock_irq(&ch->lock); - - ASSERT(ch == per_cpu(cpu_bc_channel, cpu)); -@@ -517,7 +519,7 @@ static void hpet_detach_channel(unsigned int cpu, - - if ( cpu != ch->cpu ) - spin_unlock_irq(&ch->lock); -- else if ( cpumask_empty(ch->cpumask) ) -+ else if ( (next = cpumask_first(ch->cpumask)) >= nr_cpu_ids ) - { - ch->cpu = -1; - clear_bit(HPET_EVT_USED_BIT, &ch->flags); -@@ -525,7 +527,7 @@ static void hpet_detach_channel(unsigned int cpu, - } - else - { -- ch->cpu = cpumask_first(ch->cpumask); -+ ch->cpu = next; - set_channel_irq_affinity(ch); - local_irq_enable(); - } -diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c -index b282089e03..131480fdd9 100644 ---- a/xen/arch/x86/hvm/emulate.c -+++ b/xen/arch/x86/hvm/emulate.c -@@ -2113,22 +2113,20 @@ static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt, - - vio->mmio_retry = 0; - -- switch ( rc = x86_emulate(&hvmemul_ctxt->ctxt, ops) ) -+ rc = x86_emulate(&hvmemul_ctxt->ctxt, ops); -+ if ( rc == X86EMUL_OKAY && vio->mmio_retry ) -+ rc = X86EMUL_RETRY; -+ -+ if ( !hvm_vcpu_io_need_completion(vio) ) - { -- case X86EMUL_OKAY: -- if ( vio->mmio_retry ) -- rc = X86EMUL_RETRY; -- /* fall through */ -- default: - vio->mmio_cache_count = 0; - vio->mmio_insn_bytes = 0; -- break; -- -- case X86EMUL_RETRY: -+ } -+ else -+ { - BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf)); - vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes; - memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes); -- break; - } - - if ( hvmemul_ctxt->ctxt.retire.singlestep ) -diff --git a/xen/arch/x86/hvm/hpet.c b/xen/arch/x86/hvm/hpet.c -index f7aed7f69e..28377091ca 100644 ---- a/xen/arch/x86/hvm/hpet.c -+++ b/xen/arch/x86/hvm/hpet.c -@@ -264,13 +264,20 @@ static void hpet_set_timer(HPETState *h, unsigned int tn, - diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN)) - ? 
(uint32_t)diff : 0; - -+ destroy_periodic_time(&h->pt[tn]); - if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) ) -+ { - /* if LegacyReplacementRoute bit is set, HPET specification requires - timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC, - timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */ - irq = (tn == 0) ? 0 : 8; -+ h->pt[tn].source = PTSRC_isa; -+ } - else -+ { - irq = timer_int_route(h, tn); -+ h->pt[tn].source = PTSRC_ioapic; -+ } - - /* - * diff is the time from now when the timer should fire, for a periodic -diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c -index d5afe20cc8..25b2445429 100644 ---- a/xen/arch/x86/hvm/ioreq.c -+++ b/xen/arch/x86/hvm/ioreq.c -@@ -87,14 +87,17 @@ static void hvm_io_assist(struct hvm_ioreq_vcpu *sv, uint64_t data) - - static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p) - { -+ unsigned int prev_state = STATE_IOREQ_NONE; -+ - while ( sv->pending ) - { - unsigned int state = p->state; - -- rmb(); -- switch ( state ) -+ smp_rmb(); -+ -+ recheck: -+ if ( unlikely(state == STATE_IOREQ_NONE) ) - { -- case STATE_IOREQ_NONE: - /* - * The only reason we should see this case is when an - * emulator is dying and it races with an I/O being -@@ -102,14 +105,30 @@ static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p) - */ - hvm_io_assist(sv, ~0ul); - break; -+ } -+ -+ if ( unlikely(state < prev_state) ) -+ { -+ gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n", -+ prev_state, state); -+ sv->pending = false; -+ domain_crash(sv->vcpu->domain); -+ return false; /* bail */ -+ } -+ -+ switch ( prev_state = state ) -+ { - case STATE_IORESP_READY: /* IORESP_READY -> NONE */ - p->state = STATE_IOREQ_NONE; - hvm_io_assist(sv, p->data); - break; - case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */ - case STATE_IOREQ_INPROCESS: -- wait_on_xen_event_channel(sv->ioreq_evtchn, p->state != state); -- break; -+ wait_on_xen_event_channel(sv->ioreq_evtchn, -+ ({ state = p->state; -+ smp_rmb(); -+ state != prev_state; })); -+ goto recheck; - default: - gdprintk(XENLOG_ERR, "Weird HVM iorequest state %u\n", state); - sv->pending = false; -diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c -index f528e2d081..c85d004402 100644 ---- a/xen/arch/x86/hvm/irq.c -+++ b/xen/arch/x86/hvm/irq.c -@@ -41,6 +41,26 @@ static void assert_gsi(struct domain *d, unsigned ioapic_gsi) - vioapic_irq_positive_edge(d, ioapic_gsi); - } - -+int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level) -+{ -+ struct hvm_irq *hvm_irq = hvm_domain_irq(d); -+ int vector; -+ -+ if ( gsi >= hvm_irq->nr_gsis ) -+ { -+ ASSERT_UNREACHABLE(); -+ return -1; -+ } -+ -+ spin_lock(&d->arch.hvm_domain.irq_lock); -+ if ( !level || hvm_irq->gsi_assert_count[gsi]++ == 0 ) -+ assert_gsi(d, gsi); -+ vector = vioapic_get_vector(d, gsi); -+ spin_unlock(&d->arch.hvm_domain.irq_lock); -+ -+ return vector; -+} -+ - static void assert_irq(struct domain *d, unsigned ioapic_gsi, unsigned pic_irq) - { - assert_gsi(d, ioapic_gsi); -diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c -index dedec5752d..3b72b4dc2a 100644 ---- a/xen/arch/x86/hvm/svm/svm.c -+++ b/xen/arch/x86/hvm/svm/svm.c -@@ -1046,6 +1046,7 @@ static void svm_ctxt_switch_from(struct vcpu *v) - set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF); - set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI); - set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE); -+ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB); - } - - static void 
svm_ctxt_switch_to(struct vcpu *v) -@@ -1067,6 +1068,7 @@ static void svm_ctxt_switch_to(struct vcpu *v) - set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE); - set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE); - set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); -+ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE); - - svm_restore_dr(v); - -@@ -1836,6 +1838,25 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content) - struct vcpu *v = current; - struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - -+ switch ( msr ) -+ { -+ /* -+ * Sync not needed while the cross-vendor logic is in unilateral effect. -+ case MSR_IA32_SYSENTER_CS: -+ case MSR_IA32_SYSENTER_ESP: -+ case MSR_IA32_SYSENTER_EIP: -+ */ -+ case MSR_STAR: -+ case MSR_LSTAR: -+ case MSR_CSTAR: -+ case MSR_SYSCALL_MASK: -+ case MSR_FS_BASE: -+ case MSR_GS_BASE: -+ case MSR_SHADOW_GS_BASE: -+ svm_sync_vmcb(v); -+ break; -+ } -+ - switch ( msr ) - { - case MSR_IA32_SYSENTER_CS: -@@ -1848,6 +1869,34 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content) - *msr_content = v->arch.hvm_svm.guest_sysenter_eip; - break; - -+ case MSR_STAR: -+ *msr_content = vmcb->star; -+ break; -+ -+ case MSR_LSTAR: -+ *msr_content = vmcb->lstar; -+ break; -+ -+ case MSR_CSTAR: -+ *msr_content = vmcb->cstar; -+ break; -+ -+ case MSR_SYSCALL_MASK: -+ *msr_content = vmcb->sfmask; -+ break; -+ -+ case MSR_FS_BASE: -+ *msr_content = vmcb->fs.base; -+ break; -+ -+ case MSR_GS_BASE: -+ *msr_content = vmcb->gs.base; -+ break; -+ -+ case MSR_SHADOW_GS_BASE: -+ *msr_content = vmcb->kerngsbase; -+ break; -+ - case MSR_IA32_MCx_MISC(4): /* Threshold register */ - case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3: - /* -@@ -1976,32 +2025,81 @@ static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content) - int ret, result = X86EMUL_OKAY; - struct vcpu *v = current; - struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; -- int sync = 0; -+ bool sync = false; - - switch ( msr ) - { - case MSR_IA32_SYSENTER_CS: - case MSR_IA32_SYSENTER_ESP: - case MSR_IA32_SYSENTER_EIP: -- sync = 1; -- break; -- default: -+ case MSR_STAR: -+ case MSR_LSTAR: -+ case MSR_CSTAR: -+ case MSR_SYSCALL_MASK: -+ case MSR_FS_BASE: -+ case MSR_GS_BASE: -+ case MSR_SHADOW_GS_BASE: -+ sync = true; - break; - } - - if ( sync ) -- svm_sync_vmcb(v); -+ svm_sync_vmcb(v); - - switch ( msr ) - { -+ case MSR_IA32_SYSENTER_ESP: -+ case MSR_IA32_SYSENTER_EIP: -+ case MSR_LSTAR: -+ case MSR_CSTAR: -+ case MSR_FS_BASE: -+ case MSR_GS_BASE: -+ case MSR_SHADOW_GS_BASE: -+ if ( !is_canonical_address(msr_content) ) -+ goto gpf; -+ -+ switch ( msr ) -+ { -+ case MSR_IA32_SYSENTER_ESP: -+ vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content; -+ break; -+ -+ case MSR_IA32_SYSENTER_EIP: -+ vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = msr_content; -+ break; -+ -+ case MSR_LSTAR: -+ vmcb->lstar = msr_content; -+ break; -+ -+ case MSR_CSTAR: -+ vmcb->cstar = msr_content; -+ break; -+ -+ case MSR_FS_BASE: -+ vmcb->fs.base = msr_content; -+ break; -+ -+ case MSR_GS_BASE: -+ vmcb->gs.base = msr_content; -+ break; -+ -+ case MSR_SHADOW_GS_BASE: -+ vmcb->kerngsbase = msr_content; -+ break; -+ } -+ break; -+ - case MSR_IA32_SYSENTER_CS: - vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = msr_content; - break; -- case MSR_IA32_SYSENTER_ESP: -- vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content; -+ -+ case MSR_STAR: -+ vmcb->star = msr_content; - break; -- case MSR_IA32_SYSENTER_EIP: -- vmcb->sysenter_eip = 
v->arch.hvm_svm.guest_sysenter_eip = msr_content; -+ -+ case MSR_SYSCALL_MASK: -+ vmcb->sfmask = msr_content; - break; - - case MSR_IA32_DEBUGCTLMSR: -diff --git a/xen/arch/x86/hvm/svm/svmdebug.c b/xen/arch/x86/hvm/svm/svmdebug.c -index 89ef2db932..b5b946aa94 100644 ---- a/xen/arch/x86/hvm/svm/svmdebug.c -+++ b/xen/arch/x86/hvm/svm/svmdebug.c -@@ -131,9 +131,8 @@ bool svm_vmcb_isvalid(const char *from, const struct vmcb_struct *vmcb, - PRINTF("DR7: bits [63:32] are not zero (%#"PRIx64")\n", - vmcb_get_dr7(vmcb)); - -- if ( efer & ~(EFER_SCE | EFER_LME | EFER_LMA | EFER_NX | EFER_SVME | -- EFER_LMSLE | EFER_FFXSE) ) -- PRINTF("EFER: undefined bits are not zero (%#"PRIx64")\n", efer); -+ if ( efer & ~EFER_KNOWN_MASK ) -+ PRINTF("EFER: unknown bits are not zero (%#"PRIx64")\n", efer); - - if ( hvm_efer_valid(v, efer, -1) ) - PRINTF("EFER: %s (%"PRIx64")\n", hvm_efer_valid(v, efer, -1), efer); -diff --git a/xen/arch/x86/hvm/viridian.c b/xen/arch/x86/hvm/viridian.c -index f0fa59d7d5..b02a70d086 100644 ---- a/xen/arch/x86/hvm/viridian.c -+++ b/xen/arch/x86/hvm/viridian.c -@@ -245,7 +245,7 @@ void cpuid_viridian_leaves(const struct vcpu *v, uint32_t leaf, - }; - union { - HV_PARTITION_PRIVILEGE_MASK mask; -- uint32_t lo, hi; -+ struct { uint32_t lo, hi; }; - } u; - - if ( !(viridian_feature_mask(d) & HVMPV_no_freq) ) -@@ -966,12 +966,10 @@ int viridian_hypercall(struct cpu_user_regs *regs) - gprintk(XENLOG_WARNING, "unimplemented hypercall %04x\n", - input.call_code); - /* Fallthrough. */ -- case HvGetPartitionId: - case HvExtCallQueryCapabilities: - /* -- * These hypercalls seem to be erroneously issued by Windows -- * despite neither AccessPartitionId nor EnableExtendedHypercalls -- * being set in CPUID leaf 2. -+ * This hypercall seems to be erroneously issued by Windows -+ * despite EnableExtendedHypercalls not being set in CPUID leaf 2. - * Given that return a status of 'invalid code' has not so far - * caused any problems it's not worth logging. - */ -diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c -index 181f4cb631..04e3c2e15b 100644 ---- a/xen/arch/x86/hvm/vpt.c -+++ b/xen/arch/x86/hvm/vpt.c -@@ -107,31 +107,49 @@ static int pt_irq_vector(struct periodic_time *pt, enum hvm_intsrc src) - static int pt_irq_masked(struct periodic_time *pt) - { - struct vcpu *v = pt->vcpu; -- unsigned int gsi, isa_irq; -- int mask; -- uint8_t pic_imr; -+ unsigned int gsi = pt->irq; - -- if ( pt->source == PTSRC_lapic ) -+ switch ( pt->source ) -+ { -+ case PTSRC_lapic: - { - struct vlapic *vlapic = vcpu_vlapic(v); -+ - return (!vlapic_enabled(vlapic) || - (vlapic_get_reg(vlapic, APIC_LVTT) & APIC_LVT_MASKED)); - } - -- isa_irq = pt->irq; -- gsi = hvm_isa_irq_to_gsi(isa_irq); -- pic_imr = v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr; -- mask = vioapic_get_mask(v->domain, gsi); -- if ( mask < 0 ) -+ case PTSRC_isa: - { -- dprintk(XENLOG_WARNING, "d%u: invalid GSI (%u) for platform timer\n", -- v->domain->domain_id, gsi); -- domain_crash(v->domain); -- return -1; -+ uint8_t pic_imr = v->domain->arch.hvm_domain.vpic[pt->irq >> 3].imr; -+ -+ /* Check if the interrupt is unmasked in the PIC. */ -+ if ( !(pic_imr & (1 << (pt->irq & 7))) && vlapic_accept_pic_intr(v) ) -+ return 0; -+ -+ gsi = hvm_isa_irq_to_gsi(pt->irq); -+ } -+ -+ /* Fallthrough to check if the interrupt is masked on the IO APIC. 
*/ -+ case PTSRC_ioapic: -+ { -+ int mask = vioapic_get_mask(v->domain, gsi); -+ -+ if ( mask < 0 ) -+ { -+ dprintk(XENLOG_WARNING, -+ "d%d: invalid GSI (%u) for platform timer\n", -+ v->domain->domain_id, gsi); -+ domain_crash(v->domain); -+ return -1; -+ } -+ -+ return mask; -+ } - } - -- return (((pic_imr & (1 << (isa_irq & 7))) || !vlapic_accept_pic_intr(v)) && -- mask); -+ ASSERT_UNREACHABLE(); -+ return 1; - } - - static void pt_lock(struct periodic_time *pt) -@@ -252,7 +270,7 @@ int pt_update_irq(struct vcpu *v) - struct list_head *head = &v->arch.hvm_vcpu.tm_list; - struct periodic_time *pt, *temp, *earliest_pt; - uint64_t max_lag; -- int irq, is_lapic, pt_vector; -+ int irq, pt_vector = -1; - - spin_lock(&v->arch.hvm_vcpu.tm_lock); - -@@ -288,29 +306,26 @@ int pt_update_irq(struct vcpu *v) - - earliest_pt->irq_issued = 1; - irq = earliest_pt->irq; -- is_lapic = (earliest_pt->source == PTSRC_lapic); - - spin_unlock(&v->arch.hvm_vcpu.tm_lock); - -- /* -- * If periodic timer interrut is handled by lapic, its vector in -- * IRR is returned and used to set eoi_exit_bitmap for virtual -- * interrupt delivery case. Otherwise return -1 to do nothing. -- */ -- if ( is_lapic ) -+ switch ( earliest_pt->source ) - { -+ case PTSRC_lapic: -+ /* -+ * If periodic timer interrupt is handled by lapic, its vector in -+ * IRR is returned and used to set eoi_exit_bitmap for virtual -+ * interrupt delivery case. Otherwise return -1 to do nothing. -+ */ - vlapic_set_irq(vcpu_vlapic(v), irq, 0); - pt_vector = irq; -- } -- else -- { -+ break; -+ -+ case PTSRC_isa: - hvm_isa_irq_deassert(v->domain, irq); - if ( platform_legacy_irq(irq) && vlapic_accept_pic_intr(v) && - v->domain->arch.hvm_domain.vpic[irq >> 3].int_output ) -- { - hvm_isa_irq_assert(v->domain, irq, NULL); -- pt_vector = -1; -- } - else - { - pt_vector = hvm_isa_irq_assert(v->domain, irq, vioapic_get_vector); -@@ -321,6 +336,17 @@ int pt_update_irq(struct vcpu *v) - if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) ) - pt_vector = -1; - } -+ break; -+ -+ case PTSRC_ioapic: -+ /* -+ * NB: At the moment IO-APIC routed interrupts generated by vpt devices -+ * (HPET) are edge-triggered. 
-+ */ -+ pt_vector = hvm_ioapic_assert(v->domain, irq, false); -+ if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) ) -+ pt_vector = -1; -+ break; - } - - return pt_vector; -@@ -418,7 +444,14 @@ void create_periodic_time( - struct vcpu *v, struct periodic_time *pt, uint64_t delta, - uint64_t period, uint8_t irq, time_cb *cb, void *data) - { -- ASSERT(pt->source != 0); -+ if ( !pt->source || -+ (pt->irq >= NR_ISAIRQS && pt->source == PTSRC_isa) || -+ (pt->irq >= hvm_domain_irq(v->domain)->nr_gsis && -+ pt->source == PTSRC_ioapic) ) -+ { -+ ASSERT_UNREACHABLE(); -+ return; -+ } - - destroy_periodic_time(pt); - -@@ -498,7 +531,7 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v) - { - int on_list; - -- ASSERT(pt->source == PTSRC_isa); -+ ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic); - - if ( pt->vcpu == NULL ) - return; -diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c -index 642ca312bf..c281936af0 100644 ---- a/xen/arch/x86/pv/emul-priv-op.c -+++ b/xen/arch/x86/pv/emul-priv-op.c -@@ -813,26 +813,6 @@ static int write_cr(unsigned int reg, unsigned long val, - return X86EMUL_UNHANDLEABLE; - } - --static int read_dr(unsigned int reg, unsigned long *val, -- struct x86_emulate_ctxt *ctxt) --{ -- unsigned long res = do_get_debugreg(reg); -- -- if ( IS_ERR_VALUE(res) ) -- return X86EMUL_UNHANDLEABLE; -- -- *val = res; -- -- return X86EMUL_OKAY; --} -- --static int write_dr(unsigned int reg, unsigned long val, -- struct x86_emulate_ctxt *ctxt) --{ -- return do_set_debugreg(reg, val) == 0 -- ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE; --} -- - static inline uint64_t guest_misc_enable(uint64_t val) - { - val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL | -@@ -906,9 +886,16 @@ static int read_msr(unsigned int reg, uint64_t *val, - return X86EMUL_OKAY; - - case MSR_EFER: -- *val = read_efer(); -+ /* Hide unknown bits, and unconditionally hide SVME from guests. */ -+ *val = read_efer() & EFER_KNOWN_MASK & ~EFER_SVME; -+ /* -+ * Hide the 64-bit features from 32-bit guests. SCE has -+ * vendor-dependent behaviour. -+ */ - if ( is_pv_32bit_domain(currd) ) -- *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE); -+ *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE | -+ (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL -+ ? EFER_SCE : 0)); - return X86EMUL_OKAY; - - case MSR_K7_FID_VID_CTL: -@@ -1326,8 +1313,8 @@ static const struct x86_emulate_ops priv_op_ops = { - .read_segment = read_segment, - .read_cr = read_cr, - .write_cr = write_cr, -- .read_dr = read_dr, -- .write_dr = write_dr, -+ .read_dr = x86emul_read_dr, -+ .write_dr = x86emul_write_dr, - .read_msr = read_msr, - .write_msr = write_msr, - .cpuid = pv_emul_cpuid, -diff --git a/xen/arch/x86/pv/misc-hypercalls.c b/xen/arch/x86/pv/misc-hypercalls.c -index 5862130697..1619be7874 100644 ---- a/xen/arch/x86/pv/misc-hypercalls.c -+++ b/xen/arch/x86/pv/misc-hypercalls.c -@@ -30,22 +30,10 @@ long do_set_debugreg(int reg, unsigned long value) - - unsigned long do_get_debugreg(int reg) - { -- struct vcpu *curr = current; -+ unsigned long val; -+ int res = x86emul_read_dr(reg, &val, NULL); - -- switch ( reg ) -- { -- case 0 ... 3: -- case 6: -- return curr->arch.debugreg[reg]; -- case 7: -- return (curr->arch.debugreg[7] | -- curr->arch.debugreg[5]); -- case 4 ... 5: -- return ((curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ? -- curr->arch.debugreg[reg + 2] : 0); -- } -- -- return -EINVAL; -+ return res == X86EMUL_OKAY ? 
val : -ENODEV; - } - - long do_fpu_taskswitch(int set) -diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c -index e1d023428c..f81fc2ca65 100644 ---- a/xen/arch/x86/smpboot.c -+++ b/xen/arch/x86/smpboot.c -@@ -968,6 +968,7 @@ static int cpu_smpboot_alloc(unsigned int cpu) - set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE); - set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE); - set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); -+ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE); - - for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1); - i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i ) -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 3c7447bfe6..fa67a0ffbd 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -97,12 +97,13 @@ static void __init print_details(enum ind_thunk thunk) - printk(XENLOG_DEBUG "Speculative mitigation facilities:\n"); - - /* Hardware features which pertain to speculative mitigations. */ -- printk(XENLOG_DEBUG " Hardware features:%s%s%s%s%s\n", -+ printk(XENLOG_DEBUG " Hardware features:%s%s%s%s%s%s\n", - (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "", - (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBPB)) ? " IBPB" : "", - (caps & ARCH_CAPABILITIES_IBRS_ALL) ? " IBRS_ALL" : "", -- (caps & ARCH_CAPABILITIES_RDCL_NO) ? " RDCL_NO" : ""); -+ (caps & ARCH_CAPABILITIES_RDCL_NO) ? " RDCL_NO" : "", -+ (caps & ARCH_CAPS_RSBA) ? " RSBA" : ""); - - /* Compiled-in support which pertains to BTI mitigations. */ - if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) ) -@@ -135,6 +136,20 @@ static bool __init retpoline_safe(void) - boot_cpu_data.x86 != 6 ) - return false; - -+ if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) -+ { -+ uint64_t caps; -+ -+ rdmsrl(MSR_ARCH_CAPABILITIES, caps); -+ -+ /* -+ * RBSA may be set by a hypervisor to indicate that we may move to a -+ * processor which isn't retpoline-safe. -+ */ -+ if ( caps & ARCH_CAPS_RSBA ) -+ return false; -+ } -+ - switch ( boot_cpu_data.x86_model ) - { - case 0x17: /* Penryn */ -@@ -161,18 +176,40 @@ static bool __init retpoline_safe(void) - * versions. - */ - case 0x3d: /* Broadwell */ -- return ucode_rev >= 0x28; -+ return ucode_rev >= 0x2a; - case 0x47: /* Broadwell H */ -- return ucode_rev >= 0x1b; -+ return ucode_rev >= 0x1d; - case 0x4f: /* Broadwell EP/EX */ -- return ucode_rev >= 0xb000025; -+ return ucode_rev >= 0xb000021; - case 0x56: /* Broadwell D */ -- return false; /* TBD. */ -+ switch ( boot_cpu_data.x86_mask ) -+ { -+ case 2: return ucode_rev >= 0x15; -+ case 3: return ucode_rev >= 0x7000012; -+ case 4: return ucode_rev >= 0xf000011; -+ case 5: return ucode_rev >= 0xe000009; -+ default: -+ printk("Unrecognised CPU stepping %#x - assuming not reptpoline safe\n", -+ boot_cpu_data.x86_mask); -+ return false; -+ } -+ break; - - /* -- * Skylake and later processors are not retpoline-safe. -+ * Skylake, Kabylake and Cannonlake processors are not retpoline-safe. 
- */ -+ case 0x4e: -+ case 0x55: -+ case 0x5e: -+ case 0x66: -+ case 0x67: -+ case 0x8e: -+ case 0x9e: -+ return false; -+ - default: -+ printk("Unrecognised CPU model %#x - assuming not reptpoline safe\n", -+ boot_cpu_data.x86_model); - return false; - } - } -diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c -index 906124331b..e217b0d6e2 100644 ---- a/xen/arch/x86/traps.c -+++ b/xen/arch/x86/traps.c -@@ -325,13 +325,13 @@ static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs) - /* - * Notes for get_stack_trace_bottom() and get_stack_dump_bottom() - * -- * Stack pages 0, 1 and 2: -+ * Stack pages 0 - 3: - * These are all 1-page IST stacks. Each of these stacks have an exception - * frame and saved register state at the top. The interesting bound for a - * trace is the word adjacent to this, while the bound for a dump is the - * very top, including the exception frame. - * -- * Stack pages 3, 4 and 5: -+ * Stack pages 4 and 5: - * None of these are particularly interesting. With MEMORY_GUARD, page 5 is - * explicitly not present, so attempting to dump or trace it is - * counterproductive. Without MEMORY_GUARD, it is possible for a call chain -@@ -352,12 +352,12 @@ unsigned long get_stack_trace_bottom(unsigned long sp) - { - switch ( get_stack_page(sp) ) - { -- case 0 ... 2: -+ case 0 ... 3: - return ROUNDUP(sp, PAGE_SIZE) - - offsetof(struct cpu_user_regs, es) - sizeof(unsigned long); - - #ifndef MEMORY_GUARD -- case 3 ... 5: -+ case 4 ... 5: - #endif - case 6 ... 7: - return ROUNDUP(sp, STACK_SIZE) - -@@ -372,11 +372,11 @@ unsigned long get_stack_dump_bottom(unsigned long sp) - { - switch ( get_stack_page(sp) ) - { -- case 0 ... 2: -+ case 0 ... 3: - return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long); - - #ifndef MEMORY_GUARD -- case 3 ... 5: -+ case 4 ... 5: - #endif - case 6 ... 7: - return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long); -@@ -1761,11 +1761,36 @@ static void ler_enable(void) - - void do_debug(struct cpu_user_regs *regs) - { -+ unsigned long dr6; - struct vcpu *v = current; - -+ /* Stash dr6 as early as possible. */ -+ dr6 = read_debugreg(6); -+ - if ( debugger_trap_entry(TRAP_debug, regs) ) - return; - -+ /* -+ * At the time of writing (March 2018), on the subject of %dr6: -+ * -+ * The Intel manual says: -+ * Certain debug exceptions may clear bits 0-3. The remaining contents -+ * of the DR6 register are never cleared by the processor. To avoid -+ * confusion in identifying debug exceptions, debug handlers should -+ * clear the register (except bit 16, which they should set) before -+ * returning to the interrupted task. -+ * -+ * The AMD manual says: -+ * Bits 15:13 of the DR6 register are not cleared by the processor and -+ * must be cleared by software after the contents have been read. -+ * -+ * Some bits are reserved set, some are reserved clear, and some bits -+ * which were previously reserved set are reused and cleared by hardware. -+ * For future compatibility, reset to the default value, which will allow -+ * us to spot any bit being changed by hardware to its non-default value. -+ */ -+ write_debugreg(6, X86_DR6_DEFAULT); -+ - if ( !guest_mode(regs) ) - { - if ( regs->eflags & X86_EFLAGS_TF ) -@@ -1784,21 +1809,50 @@ void do_debug(struct cpu_user_regs *regs) - regs->eflags &= ~X86_EFLAGS_TF; - } - } -- else -+ -+ /* -+ * Check for fault conditions. General Detect, and instruction -+ * breakpoints are faults rather than traps, at which point attempting -+ * to ignore and continue will result in a livelock. 
-+ */ -+ if ( dr6 & DR_GENERAL_DETECT ) - { -- /* -- * We ignore watchpoints when they trigger within Xen. This may -- * happen when a buffer is passed to us which previously had a -- * watchpoint set on it. No need to bump EIP; the only faulting -- * trap is an instruction breakpoint, which can't happen to us. -- */ -- WARN_ON(!search_exception_table(regs)); -+ printk(XENLOG_ERR "Hit General Detect in Xen context\n"); -+ fatal_trap(regs, 0); -+ } -+ -+ if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) ) -+ { -+ unsigned int bp, dr7 = read_debugreg(7) >> DR_CONTROL_SHIFT; -+ -+ for ( bp = 0; bp < 4; ++bp ) -+ { -+ if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */ -+ ((dr7 & (3u << (bp * DR_CONTROL_SIZE))) == 0) /* Insn? */ ) -+ { -+ printk(XENLOG_ERR -+ "Hit instruction breakpoint in Xen context\n"); -+ fatal_trap(regs, 0); -+ } -+ } - } -+ -+ /* -+ * Whatever caused this #DB should be a trap. Note it and continue. -+ * Guests can trigger this in certain corner cases, so ensure the -+ * message is ratelimited. -+ */ -+ gprintk(XENLOG_WARNING, -+ "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n", -+ regs->cs, _p(regs->rip), _p(regs->rip), -+ regs->ss, _p(regs->rsp), dr6); -+ - goto out; - } - - /* Save debug status register where guest OS can peek at it */ -- v->arch.debugreg[6] = read_debugreg(6); -+ v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT); -+ v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT); - - ler_enable(); - pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC); -@@ -1917,6 +1971,7 @@ void __init init_idt_traps(void) - set_ist(&idt_table[TRAP_double_fault], IST_DF); - set_ist(&idt_table[TRAP_nmi], IST_NMI); - set_ist(&idt_table[TRAP_machine_check], IST_MCE); -+ set_ist(&idt_table[TRAP_debug], IST_DB); - - /* CPU0 uses the master IDT. */ - idt_tables[0] = idt_table; -@@ -1984,6 +2039,12 @@ void activate_debugregs(const struct vcpu *curr) - } - } - -+/* -+ * Used by hypercalls and the emulator. -+ * -ENODEV => #UD -+ * -EINVAL => #GP Invalid bit -+ * -EPERM => #GP Valid bit, but not permitted to use -+ */ - long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) - { - int i; -@@ -2015,7 +2076,17 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) - if ( v == curr ) - write_debugreg(3, value); - break; -+ -+ case 4: -+ if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE ) -+ return -ENODEV; -+ -+ /* Fallthrough */ - case 6: -+ /* The upper 32 bits are strictly reserved. */ -+ if ( value != (uint32_t)value ) -+ return -EINVAL; -+ - /* - * DR6: Bits 4-11,16-31 reserved (set to 1). - * Bit 12 reserved (set to 0). -@@ -2025,7 +2096,17 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) - if ( v == curr ) - write_debugreg(6, value); - break; -+ -+ case 5: -+ if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE ) -+ return -ENODEV; -+ -+ /* Fallthrough */ - case 7: -+ /* The upper 32 bits are strictly reserved. */ -+ if ( value != (uint32_t)value ) -+ return -EINVAL; -+ - /* - * DR7: Bit 10 reserved (set to 1). - * Bits 11-12,14-15 reserved (set to 0). -@@ -2038,6 +2119,10 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) - */ - if ( value & DR_GENERAL_DETECT ) - return -EPERM; -+ -+ /* Zero the IO shadow before recalculating the real %dr7 */ -+ v->arch.debugreg[5] = 0; -+ - /* DR7.{G,L}E = 0 => debugging disabled for this domain. 
*/ - if ( value & DR7_ACTIVE_MASK ) - { -@@ -2070,7 +2155,7 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) - write_debugreg(7, value); - break; - default: -- return -EINVAL; -+ return -ENODEV; - } - - v->arch.debugreg[reg] = value; -diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S -index 75497bc292..a47cb9dc19 100644 ---- a/xen/arch/x86/x86_64/compat/entry.S -+++ b/xen/arch/x86/x86_64/compat/entry.S -@@ -39,6 +39,12 @@ ENTRY(compat_test_all_events) - leaq irq_stat+IRQSTAT_softirq_pending(%rip),%rcx - cmpl $0,(%rcx,%rax,1) - jne compat_process_softirqs -+ -+ /* Inject exception if pending. */ -+ lea VCPU_trap_bounce(%rbx), %rdx -+ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx) -+ jnz .Lcompat_process_trapbounce -+ - testb $1,VCPU_mce_pending(%rbx) - jnz compat_process_mce - .Lcompat_test_guest_nmi: -@@ -68,15 +74,24 @@ compat_process_softirqs: - call do_softirq - jmp compat_test_all_events - -+ ALIGN -+/* %rbx: struct vcpu, %rdx: struct trap_bounce */ -+.Lcompat_process_trapbounce: -+ sti -+.Lcompat_bounce_exception: -+ call compat_create_bounce_frame -+ movb $0, TRAPBOUNCE_flags(%rdx) -+ jmp compat_test_all_events -+ - ALIGN - /* %rbx: struct vcpu */ - compat_process_mce: - testb $1 << VCPU_TRAP_MCE,VCPU_async_exception_mask(%rbx) - jnz .Lcompat_test_guest_nmi - sti -- movb $0,VCPU_mce_pending(%rbx) -- call set_guest_machinecheck_trapbounce -- testl %eax,%eax -+ movb $0, VCPU_mce_pending(%rbx) -+ call set_guest_machinecheck_trapbounce -+ test %al, %al - jz compat_test_all_events - movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the - movb %dl,VCPU_mce_old_mask(%rbx) # iret hypercall -@@ -88,11 +103,11 @@ compat_process_mce: - /* %rbx: struct vcpu */ - compat_process_nmi: - testb $1 << VCPU_TRAP_NMI,VCPU_async_exception_mask(%rbx) -- jnz compat_test_guest_events -+ jnz compat_test_guest_events - sti -- movb $0,VCPU_nmi_pending(%rbx) -+ movb $0, VCPU_nmi_pending(%rbx) - call set_guest_nmi_trapbounce -- testl %eax,%eax -+ test %al, %al - jz compat_test_all_events - movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the - movb %dl,VCPU_nmi_old_mask(%rbx) # iret hypercall -@@ -189,15 +204,6 @@ ENTRY(cr4_pv32_restore) - xor %eax, %eax - ret - --/* %rdx: trap_bounce, %rbx: struct vcpu */ --ENTRY(compat_post_handle_exception) -- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) -- jz compat_test_all_events --.Lcompat_bounce_exception: -- call compat_create_bounce_frame -- movb $0,TRAPBOUNCE_flags(%rdx) -- jmp compat_test_all_events -- - .section .text.entry, "ax", @progbits - - /* See lstar_enter for entry register state. */ -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index bdd33e727f..41d3ec21a1 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -42,6 +42,12 @@ test_all_events: - leaq irq_stat+IRQSTAT_softirq_pending(%rip), %rcx - cmpl $0, (%rcx, %rax, 1) - jne process_softirqs -+ -+ /* Inject exception if pending. 
*/ -+ lea VCPU_trap_bounce(%rbx), %rdx -+ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx) -+ jnz .Lprocess_trapbounce -+ - cmpb $0, VCPU_mce_pending(%rbx) - jne process_mce - .Ltest_guest_nmi: -@@ -69,6 +75,15 @@ process_softirqs: - call do_softirq - jmp test_all_events - -+ ALIGN -+/* %rbx: struct vcpu, %rdx struct trap_bounce */ -+.Lprocess_trapbounce: -+ sti -+.Lbounce_exception: -+ call create_bounce_frame -+ movb $0, TRAPBOUNCE_flags(%rdx) -+ jmp test_all_events -+ - ALIGN - /* %rbx: struct vcpu */ - process_mce: -@@ -77,7 +92,7 @@ process_mce: - sti - movb $0, VCPU_mce_pending(%rbx) - call set_guest_machinecheck_trapbounce -- test %eax, %eax -+ test %al, %al - jz test_all_events - movzbl VCPU_async_exception_mask(%rbx), %edx # save mask for the - movb %dl, VCPU_mce_old_mask(%rbx) # iret hypercall -@@ -93,7 +108,7 @@ process_nmi: - sti - movb $0, VCPU_nmi_pending(%rbx) - call set_guest_nmi_trapbounce -- test %eax, %eax -+ test %al, %al - jz test_all_events - movzbl VCPU_async_exception_mask(%rbx), %edx # save mask for the - movb %dl, VCPU_nmi_old_mask(%rbx) # iret hypercall -@@ -667,15 +682,9 @@ handle_exception_saved: - mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) - testb $3,UREGS_cs(%rsp) - jz restore_all_xen -- leaq VCPU_trap_bounce(%rbx),%rdx - movq VCPU_domain(%rbx),%rax - testb $1,DOMAIN_is_32bit_pv(%rax) -- jnz compat_post_handle_exception -- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) -- jz test_all_events --.Lbounce_exception: -- call create_bounce_frame -- movb $0,TRAPBOUNCE_flags(%rdx) -+ jnz compat_test_all_events - jmp test_all_events - - /* No special register assumptions. */ -@@ -730,7 +739,7 @@ ENTRY(device_not_available) - ENTRY(debug) - pushq $0 - movl $TRAP_debug,4(%rsp) -- jmp handle_exception -+ jmp handle_ist_exception - - ENTRY(int3) - pushq $0 -@@ -783,12 +792,14 @@ ENTRY(double_fault) - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rbx -- test %rbx, %rbx -+ neg %rbx - jz .Ldblf_cr3_okay - jns .Ldblf_cr3_load -+ mov %rbx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) - neg %rbx - .Ldblf_cr3_load: - mov %rbx, %cr3 -+ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%r14) - .Ldblf_cr3_okay: - - movq %rsp,%rdi -diff --git a/xen/arch/x86/x86_emulate.c b/xen/arch/x86/x86_emulate.c -index c7ba221d11..9125c67c9e 100644 ---- a/xen/arch/x86/x86_emulate.c -+++ b/xen/arch/x86/x86_emulate.c -@@ -14,6 +14,7 @@ - #include <asm/processor.h> /* current_cpu_info */ - #include <asm/xstate.h> - #include <asm/amd.h> /* cpu_has_amd_erratum() */ -+#include <asm/debugreg.h> - - /* Avoid namespace pollution. */ - #undef cmpxchg -@@ -41,3 +42,75 @@ - }) - - #include "x86_emulate/x86_emulate.c" -+ -+/* Called with NULL ctxt in hypercall context. */ -+int x86emul_read_dr(unsigned int reg, unsigned long *val, -+ struct x86_emulate_ctxt *ctxt) -+{ -+ struct vcpu *curr = current; -+ -+ /* HVM support requires a bit more plumbing before it will work. */ -+ ASSERT(is_pv_vcpu(curr)); -+ -+ switch ( reg ) -+ { -+ case 0 ... 3: -+ case 6: -+ *val = curr->arch.debugreg[reg]; -+ break; -+ -+ case 7: -+ *val = (curr->arch.debugreg[7] | -+ curr->arch.debugreg[5]); -+ break; -+ -+ case 4 ... 
5: -+ if ( !(curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ) -+ { -+ *val = curr->arch.debugreg[reg + 2]; -+ break; -+ } -+ -+ /* Fallthrough */ -+ default: -+ if ( ctxt ) -+ x86_emul_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC, ctxt); -+ -+ return X86EMUL_EXCEPTION; -+ } -+ -+ return X86EMUL_OKAY; -+} -+ -+int x86emul_write_dr(unsigned int reg, unsigned long val, -+ struct x86_emulate_ctxt *ctxt) -+{ -+ struct vcpu *curr = current; -+ -+ /* HVM support requires a bit more plumbing before it will work. */ -+ ASSERT(is_pv_vcpu(curr)); -+ -+ switch ( set_debugreg(curr, reg, val) ) -+ { -+ case 0: -+ return X86EMUL_OKAY; -+ -+ case -ENODEV: -+ x86_emul_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC, ctxt); -+ return X86EMUL_EXCEPTION; -+ -+ default: -+ x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt); -+ return X86EMUL_EXCEPTION; -+ } -+} -+ -+/* -+ * Local variables: -+ * mode: C -+ * c-file-style: "BSD" -+ * c-basic-offset: 4 -+ * tab-width: 4 -+ * indent-tabs-mode: nil -+ * End: -+ */ -diff --git a/xen/arch/x86/x86_emulate/x86_emulate.h b/xen/arch/x86/x86_emulate/x86_emulate.h -index 0c8c80ad5a..9c2bb8157c 100644 ---- a/xen/arch/x86/x86_emulate/x86_emulate.h -+++ b/xen/arch/x86/x86_emulate/x86_emulate.h -@@ -662,6 +662,11 @@ static inline void x86_emulate_free_state(struct x86_emulate_state *state) {} - void x86_emulate_free_state(struct x86_emulate_state *state); - #endif - -+int x86emul_read_dr(unsigned int reg, unsigned long *val, -+ struct x86_emulate_ctxt *ctxt); -+int x86emul_write_dr(unsigned int reg, unsigned long val, -+ struct x86_emulate_ctxt *ctxt); -+ - #endif - - static inline void x86_emul_hw_exception( -diff --git a/xen/common/schedule.c b/xen/common/schedule.c -index b7884263f2..f21c3e5a64 100644 ---- a/xen/common/schedule.c -+++ b/xen/common/schedule.c -@@ -436,14 +436,9 @@ void sched_destroy_domain(struct domain *d) - cpupool_rm_domain(d); - } - --void vcpu_sleep_nosync(struct vcpu *v) -+void vcpu_sleep_nosync_locked(struct vcpu *v) - { -- unsigned long flags; -- spinlock_t *lock; -- -- TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); -- -- lock = vcpu_schedule_lock_irqsave(v, &flags); -+ ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock)); - - if ( likely(!vcpu_runnable(v)) ) - { -@@ -452,6 +447,18 @@ void vcpu_sleep_nosync(struct vcpu *v) - - SCHED_OP(vcpu_scheduler(v), sleep, v); - } -+} -+ -+void vcpu_sleep_nosync(struct vcpu *v) -+{ -+ unsigned long flags; -+ spinlock_t *lock; -+ -+ TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); -+ -+ lock = vcpu_schedule_lock_irqsave(v, &flags); -+ -+ vcpu_sleep_nosync_locked(v); - - vcpu_schedule_unlock_irqrestore(lock, flags, v); - } -@@ -567,13 +574,54 @@ static void vcpu_move_nosched(struct vcpu *v, unsigned int new_cpu) - sched_move_irqs(v); - } - --static void vcpu_migrate(struct vcpu *v) -+/* -+ * Initiating migration -+ * -+ * In order to migrate, we need the vcpu in question to have stopped -+ * running and had SCHED_OP(sleep) called (to take it off any -+ * runqueues, for instance); and if it is currently running, it needs -+ * to be scheduled out. Finally, we need to hold the scheduling locks -+ * for both the processor we're migrating from, and the processor -+ * we're migrating to. -+ * -+ * In order to avoid deadlock while satisfying the final requirement, -+ * we must release any scheduling lock we hold, then try to grab both -+ * locks we want, then double-check to make sure that what we started -+ * to do hasn't been changed in the mean time. 
-+ * -+ * These steps are encapsulated in the following two functions; they -+ * should be called like this: -+ * -+ * lock = vcpu_schedule_lock_irq(v); -+ * vcpu_migrate_start(v); -+ * vcpu_schedule_unlock_irq(lock, v) -+ * vcpu_migrate_finish(v); -+ * -+ * vcpu_migrate_finish() will do the work now if it can, or simply -+ * return if it can't (because v is still running); in that case -+ * vcpu_migrate_finish() will be called by context_saved(). -+ */ -+void vcpu_migrate_start(struct vcpu *v) -+{ -+ set_bit(_VPF_migrating, &v->pause_flags); -+ vcpu_sleep_nosync_locked(v); -+} -+ -+static void vcpu_migrate_finish(struct vcpu *v) - { - unsigned long flags; - unsigned int old_cpu, new_cpu; - spinlock_t *old_lock, *new_lock; - bool_t pick_called = 0; - -+ /* -+ * If the vcpu is currently running, this will be handled by -+ * context_saved(); and in any case, if the bit is cleared, then -+ * someone else has already done the work so we don't need to. -+ */ -+ if ( v->is_running || !test_bit(_VPF_migrating, &v->pause_flags) ) -+ return; -+ - old_cpu = new_cpu = v->processor; - for ( ; ; ) - { -@@ -653,14 +701,11 @@ void vcpu_force_reschedule(struct vcpu *v) - spinlock_t *lock = vcpu_schedule_lock_irq(v); - - if ( v->is_running ) -- set_bit(_VPF_migrating, &v->pause_flags); -+ vcpu_migrate_start(v); -+ - vcpu_schedule_unlock_irq(lock, v); - -- if ( v->pause_flags & VPF_migrating ) -- { -- vcpu_sleep_nosync(v); -- vcpu_migrate(v); -- } -+ vcpu_migrate_finish(v); - } - - void restore_vcpu_affinity(struct domain *d) -@@ -812,10 +857,10 @@ int cpu_disable_scheduler(unsigned int cpu) - * * the scheduler will always fine a suitable solution, or - * things would have failed before getting in here. - */ -- set_bit(_VPF_migrating, &v->pause_flags); -+ vcpu_migrate_start(v); - vcpu_schedule_unlock_irqrestore(lock, flags, v); -- vcpu_sleep_nosync(v); -- vcpu_migrate(v); -+ -+ vcpu_migrate_finish(v); - - /* - * The only caveat, in this case, is that if a vcpu active in -@@ -849,18 +894,14 @@ static int vcpu_set_affinity( - * Always ask the scheduler to re-evaluate placement - * when changing the affinity. 
- */ -- set_bit(_VPF_migrating, &v->pause_flags); -+ vcpu_migrate_start(v); - } - - vcpu_schedule_unlock_irq(lock, v); - - domain_update_node_affinity(v->domain); - -- if ( v->pause_flags & VPF_migrating ) -- { -- vcpu_sleep_nosync(v); -- vcpu_migrate(v); -- } -+ vcpu_migrate_finish(v); - - return ret; - } -@@ -1088,7 +1129,6 @@ int vcpu_pin_override(struct vcpu *v, int cpu) - { - cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved); - v->affinity_broken = 0; -- set_bit(_VPF_migrating, &v->pause_flags); - ret = 0; - } - } -@@ -1101,20 +1141,18 @@ int vcpu_pin_override(struct vcpu *v, int cpu) - cpumask_copy(v->cpu_hard_affinity_saved, v->cpu_hard_affinity); - v->affinity_broken = 1; - cpumask_copy(v->cpu_hard_affinity, cpumask_of(cpu)); -- set_bit(_VPF_migrating, &v->pause_flags); - ret = 0; - } - } - -+ if ( ret == 0 ) -+ vcpu_migrate_start(v); -+ - vcpu_schedule_unlock_irq(lock, v); - - domain_update_node_affinity(v->domain); - -- if ( v->pause_flags & VPF_migrating ) -- { -- vcpu_sleep_nosync(v); -- vcpu_migrate(v); -- } -+ vcpu_migrate_finish(v); - - return ret; - } -@@ -1501,8 +1539,7 @@ void context_saved(struct vcpu *prev) - - SCHED_OP(vcpu_scheduler(prev), context_saved, prev); - -- if ( unlikely(prev->pause_flags & VPF_migrating) ) -- vcpu_migrate(prev); -+ vcpu_migrate_finish(prev); - } - - /* The scheduler timer: force a run through the scheduler */ -diff --git a/xen/include/asm-x86/debugreg.h b/xen/include/asm-x86/debugreg.h -index c57914efc6..b3b10eaf40 100644 ---- a/xen/include/asm-x86/debugreg.h -+++ b/xen/include/asm-x86/debugreg.h -@@ -24,6 +24,8 @@ - #define DR_STATUS_RESERVED_ZERO (~0xffffeffful) /* Reserved, read as zero */ - #define DR_STATUS_RESERVED_ONE 0xffff0ff0ul /* Reserved, read as one */ - -+#define X86_DR6_DEFAULT 0xffff0ff0ul /* Default %dr6 value. */ -+ - /* Now define a bunch of things for manipulating the control register. - The top two bytes of the control register consist of 4 fields of 4 - bits - each field corresponds to one of the four debug registers, -diff --git a/xen/include/asm-x86/hvm/irq.h b/xen/include/asm-x86/hvm/irq.h -index f756cb5a0d..1a52ec6045 100644 ---- a/xen/include/asm-x86/hvm/irq.h -+++ b/xen/include/asm-x86/hvm/irq.h -@@ -207,6 +207,9 @@ int hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq); - - int hvm_inject_msi(struct domain *d, uint64_t addr, uint32_t data); - -+/* Assert an IO APIC pin. 
*/ -+int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level); -+ - void hvm_maybe_deassert_evtchn_irq(void); - void hvm_assert_evtchn_irq(struct vcpu *v); - void hvm_set_callback_via(struct domain *d, uint64_t via); -diff --git a/xen/include/asm-x86/hvm/vpt.h b/xen/include/asm-x86/hvm/vpt.h -index 21166edd06..0eb5ff632e 100644 ---- a/xen/include/asm-x86/hvm/vpt.h -+++ b/xen/include/asm-x86/hvm/vpt.h -@@ -44,6 +44,7 @@ struct periodic_time { - bool_t warned_timeout_too_short; - #define PTSRC_isa 1 /* ISA time source */ - #define PTSRC_lapic 2 /* LAPIC time source */ -+#define PTSRC_ioapic 3 /* IOAPIC time source */ - u8 source; /* PTSRC_ */ - u8 irq; - struct vcpu *vcpu; /* vcpu timer interrupt delivers to */ -diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h -index a8ceecf3e2..68fae91567 100644 ---- a/xen/include/asm-x86/msr-index.h -+++ b/xen/include/asm-x86/msr-index.h -@@ -31,6 +31,9 @@ - #define EFER_LMSLE (1<<_EFER_LMSLE) - #define EFER_FFXSE (1<<_EFER_FFXSE) - -+#define EFER_KNOWN_MASK (EFER_SCE | EFER_LME | EFER_LMA | EFER_NX | \ -+ EFER_SVME | EFER_LMSLE | EFER_FFXSE) -+ - /* Speculation Controls. */ - #define MSR_SPEC_CTRL 0x00000048 - #define SPEC_CTRL_IBRS (_AC(1, ULL) << 0) -@@ -42,6 +45,7 @@ - #define MSR_ARCH_CAPABILITIES 0x0000010a - #define ARCH_CAPABILITIES_RDCL_NO (_AC(1, ULL) << 0) - #define ARCH_CAPABILITIES_IBRS_ALL (_AC(1, ULL) << 1) -+#define ARCH_CAPS_RSBA (_AC(1, ULL) << 2) - - /* Intel MSRs. Some also available on other CPUs */ - #define MSR_IA32_PERFCTR0 0x000000c1 -diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h -index 80f8411355..a152f1d413 100644 ---- a/xen/include/asm-x86/processor.h -+++ b/xen/include/asm-x86/processor.h -@@ -445,7 +445,8 @@ struct __packed __cacheline_aligned tss_struct { - #define IST_DF 1UL - #define IST_NMI 2UL - #define IST_MCE 3UL --#define IST_MAX 3UL -+#define IST_DB 4UL -+#define IST_MAX 4UL - - /* Set the interrupt stack table used by a particular interrupt - * descriptor table entry. */ |