diff options
-rw-r--r-- | Makefile.target | 3 | ||||
-rw-r--r-- | hw/apic.c | 126 | ||||
-rw-r--r-- | hw/apic_common.c | 64 | ||||
-rw-r--r-- | hw/apic_internal.h | 27 | ||||
-rw-r--r-- | hw/kvm/apic.c | 32 | ||||
-rw-r--r-- | hw/kvmvapic.c | 805 |
6 files changed, 1043 insertions, 14 deletions
diff --git a/Makefile.target b/Makefile.target index 29fde6e982..4b460fdcab 100644 --- a/Makefile.target +++ b/Makefile.target @@ -236,7 +236,8 @@ obj-y += device-hotplug.o # Hardware support obj-i386-y += mc146818rtc.o pc.o -obj-i386-y += sga.o apic_common.o apic.o ioapic_common.o ioapic.o piix_pci.o +obj-i386-y += apic_common.o apic.o kvmvapic.o +obj-i386-y += sga.o ioapic_common.o ioapic.o piix_pci.o obj-i386-y += vmport.o obj-i386-y += pci-hotplug.o smbios.o wdt_ib700.o obj-i386-y += debugcon.o multiboot.o @@ -35,6 +35,10 @@ #define MSI_ADDR_DEST_ID_SHIFT 12 #define MSI_ADDR_DEST_ID_MASK 0x00ffff0 +#define SYNC_FROM_VAPIC 0x1 +#define SYNC_TO_VAPIC 0x2 +#define SYNC_ISR_IRR_TO_VAPIC 0x4 + static APICCommonState *local_apics[MAX_APICS + 1]; static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode); @@ -78,6 +82,70 @@ static inline int get_bit(uint32_t *tab, int index) return !!(tab[i] & mask); } +/* return -1 if no bit is set */ +static int get_highest_priority_int(uint32_t *tab) +{ + int i; + for (i = 7; i >= 0; i--) { + if (tab[i] != 0) { + return i * 32 + fls_bit(tab[i]); + } + } + return -1; +} + +static void apic_sync_vapic(APICCommonState *s, int sync_type) +{ + VAPICState vapic_state; + size_t length; + off_t start; + int vector; + + if (!s->vapic_paddr) { + return; + } + if (sync_type & SYNC_FROM_VAPIC) { + cpu_physical_memory_rw(s->vapic_paddr, (void *)&vapic_state, + sizeof(vapic_state), 0); + s->tpr = vapic_state.tpr; + } + if (sync_type & (SYNC_TO_VAPIC | SYNC_ISR_IRR_TO_VAPIC)) { + start = offsetof(VAPICState, isr); + length = offsetof(VAPICState, enabled) - offsetof(VAPICState, isr); + + if (sync_type & SYNC_TO_VAPIC) { + assert(qemu_cpu_is_self(s->cpu_env)); + + vapic_state.tpr = s->tpr; + vapic_state.enabled = 1; + start = 0; + length = sizeof(VAPICState); + } + + vector = get_highest_priority_int(s->isr); + if (vector < 0) { + vector = 0; + } + vapic_state.isr = vector & 0xf0; + + vapic_state.zero = 0; + + vector = get_highest_priority_int(s->irr); + if (vector < 0) { + vector = 0; + } + vapic_state.irr = vector & 0xff; + + cpu_physical_memory_write_rom(s->vapic_paddr + start, + ((void *)&vapic_state) + start, length); + } +} + +static void apic_vapic_base_update(APICCommonState *s) +{ + apic_sync_vapic(s, SYNC_TO_VAPIC); +} + static void apic_local_deliver(APICCommonState *s, int vector) { uint32_t lvt = s->lvt[vector]; @@ -239,20 +307,17 @@ static void apic_set_base(APICCommonState *s, uint64_t val) static void apic_set_tpr(APICCommonState *s, uint8_t val) { - s->tpr = (val & 0x0f) << 4; - apic_update_irq(s); + /* Updates from cr8 are ignored while the VAPIC is active */ + if (!s->vapic_paddr) { + s->tpr = val << 4; + apic_update_irq(s); + } } -/* return -1 if no bit is set */ -static int get_highest_priority_int(uint32_t *tab) +static uint8_t apic_get_tpr(APICCommonState *s) { - int i; - for(i = 7; i >= 0; i--) { - if (tab[i] != 0) { - return i * 32 + fls_bit(tab[i]); - } - } - return -1; + apic_sync_vapic(s, SYNC_FROM_VAPIC); + return s->tpr >> 4; } static int apic_get_ppr(APICCommonState *s) @@ -312,6 +377,14 @@ static void apic_update_irq(APICCommonState *s) } } +void apic_poll_irq(DeviceState *d) +{ + APICCommonState *s = APIC_COMMON(d); + + apic_sync_vapic(s, SYNC_FROM_VAPIC); + apic_update_irq(s); +} + static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode) { apic_report_irq_delivered(!get_bit(s->irr, vector_num)); @@ -321,6 +394,16 @@ static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode) set_bit(s->tmr, vector_num); else reset_bit(s->tmr, vector_num); + if (s->vapic_paddr) { + apic_sync_vapic(s, SYNC_ISR_IRR_TO_VAPIC); + /* + * The vcpu thread needs to see the new IRR before we pull its current + * TPR value. That way, if we miss a lowering of the TRP, the guest + * has the chance to notice the new IRR and poll for IRQs on its own. + */ + smp_wmb(); + apic_sync_vapic(s, SYNC_FROM_VAPIC); + } apic_update_irq(s); } @@ -334,6 +417,7 @@ static void apic_eoi(APICCommonState *s) if (!(s->spurious_vec & APIC_SV_DIRECTED_IO) && get_bit(s->tmr, isrv)) { ioapic_eoi_broadcast(isrv); } + apic_sync_vapic(s, SYNC_FROM_VAPIC | SYNC_TO_VAPIC); apic_update_irq(s); } @@ -471,15 +555,19 @@ int apic_get_interrupt(DeviceState *d) if (!(s->spurious_vec & APIC_SV_ENABLE)) return -1; + apic_sync_vapic(s, SYNC_FROM_VAPIC); intno = apic_irq_pending(s); if (intno == 0) { + apic_sync_vapic(s, SYNC_TO_VAPIC); return -1; } else if (intno < 0) { + apic_sync_vapic(s, SYNC_TO_VAPIC); return s->spurious_vec & 0xff; } reset_bit(s->irr, intno); set_bit(s->isr, intno); + apic_sync_vapic(s, SYNC_TO_VAPIC); apic_update_irq(s); return intno; } @@ -576,6 +664,10 @@ static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr) val = 0x11 | ((APIC_LVT_NB - 1) << 16); /* version 0x11 */ break; case 0x08: + apic_sync_vapic(s, SYNC_FROM_VAPIC); + if (apic_report_tpr_access) { + cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_READ); + } val = s->tpr; break; case 0x09: @@ -675,7 +767,11 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val) case 0x03: break; case 0x08: + if (apic_report_tpr_access) { + cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_WRITE); + } s->tpr = val; + apic_sync_vapic(s, SYNC_TO_VAPIC); apic_update_irq(s); break; case 0x09: @@ -737,6 +833,11 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val) } } +static void apic_pre_save(APICCommonState *s) +{ + apic_sync_vapic(s, SYNC_FROM_VAPIC); +} + static void apic_post_load(APICCommonState *s) { if (s->timer_expiry != -1) { @@ -770,7 +871,10 @@ static void apic_class_init(ObjectClass *klass, void *data) k->init = apic_init; k->set_base = apic_set_base; k->set_tpr = apic_set_tpr; + k->get_tpr = apic_get_tpr; + k->vapic_base_update = apic_vapic_base_update; k->external_nmi = apic_external_nmi; + k->pre_save = apic_pre_save; k->post_load = apic_post_load; } diff --git a/hw/apic_common.c b/hw/apic_common.c index 340d793c70..60b82596e7 100644 --- a/hw/apic_common.c +++ b/hw/apic_common.c @@ -20,8 +20,10 @@ #include "apic.h" #include "apic_internal.h" #include "trace.h" +#include "kvm.h" static int apic_irq_delivered; +bool apic_report_tpr_access; void cpu_set_apic_base(DeviceState *d, uint64_t val) { @@ -63,14 +65,45 @@ void cpu_set_apic_tpr(DeviceState *d, uint8_t val) uint8_t cpu_get_apic_tpr(DeviceState *d) { + APICCommonState *s; + APICCommonClass *info; + + if (!d) { + return 0; + } + + s = APIC_COMMON(d); + info = APIC_COMMON_GET_CLASS(s); + + return info->get_tpr(s); +} + +void apic_enable_tpr_access_reporting(DeviceState *d, bool enable) +{ + APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); + APICCommonClass *info = APIC_COMMON_GET_CLASS(s); + + apic_report_tpr_access = enable; + if (info->enable_tpr_reporting) { + info->enable_tpr_reporting(s, enable); + } +} + +void apic_enable_vapic(DeviceState *d, target_phys_addr_t paddr) +{ APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); + APICCommonClass *info = APIC_COMMON_GET_CLASS(s); - return s ? s->tpr >> 4 : 0; + s->vapic_paddr = paddr; + info->vapic_base_update(s); } void apic_handle_tpr_access_report(DeviceState *d, target_ulong ip, TPRAccess access) { + APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); + + vapic_report_tpr_access(s->vapic, s->cpu_env, ip, access); } void apic_report_irq_delivered(int delivered) @@ -171,12 +204,16 @@ void apic_init_reset(DeviceState *d) static void apic_reset_common(DeviceState *d) { APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); + APICCommonClass *info = APIC_COMMON_GET_CLASS(s); bool bsp; bsp = cpu_is_bsp(s->cpu_env); s->apicbase = 0xfee00000 | (bsp ? MSR_IA32_APICBASE_BSP : 0) | MSR_IA32_APICBASE_ENABLE; + s->vapic_paddr = 0; + info->vapic_base_update(s); + apic_init_reset(d); if (bsp) { @@ -239,6 +276,7 @@ static int apic_init_common(SysBusDevice *dev) { APICCommonState *s = APIC_COMMON(dev); APICCommonClass *info; + static DeviceState *vapic; static int apic_no; if (apic_no >= MAX_APICS) { @@ -249,10 +287,29 @@ static int apic_init_common(SysBusDevice *dev) info = APIC_COMMON_GET_CLASS(s); info->init(s); - sysbus_init_mmio(&s->busdev, &s->io_memory); + sysbus_init_mmio(dev, &s->io_memory); + + if (!vapic && s->vapic_control & VAPIC_ENABLE_MASK) { + vapic = sysbus_create_simple("kvmvapic", -1, NULL); + } + s->vapic = vapic; + if (apic_report_tpr_access && info->enable_tpr_reporting) { + info->enable_tpr_reporting(s, true); + } + return 0; } +static void apic_dispatch_pre_save(void *opaque) +{ + APICCommonState *s = APIC_COMMON(opaque); + APICCommonClass *info = APIC_COMMON_GET_CLASS(s); + + if (info->pre_save) { + info->pre_save(s); + } +} + static int apic_dispatch_post_load(void *opaque, int version_id) { APICCommonState *s = APIC_COMMON(opaque); @@ -270,6 +327,7 @@ static const VMStateDescription vmstate_apic_common = { .minimum_version_id = 3, .minimum_version_id_old = 1, .load_state_old = apic_load_old, + .pre_save = apic_dispatch_pre_save, .post_load = apic_dispatch_post_load, .fields = (VMStateField[]) { VMSTATE_UINT32(apicbase, APICCommonState), @@ -299,6 +357,8 @@ static const VMStateDescription vmstate_apic_common = { static Property apic_properties_common[] = { DEFINE_PROP_UINT8("id", APICCommonState, id, -1), DEFINE_PROP_PTR("cpu_env", APICCommonState, cpu_env), + DEFINE_PROP_BIT("vapic", APICCommonState, vapic_control, VAPIC_ENABLE_BIT, + true), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/apic_internal.h b/hw/apic_internal.h index 0cab010717..60a6a8bdae 100644 --- a/hw/apic_internal.h +++ b/hw/apic_internal.h @@ -61,6 +61,9 @@ #define APIC_SV_DIRECTED_IO (1<<12) #define APIC_SV_ENABLE (1<<8) +#define VAPIC_ENABLE_BIT 0 +#define VAPIC_ENABLE_MASK (1 << VAPIC_ENABLE_BIT) + #define MAX_APICS 255 #define MSI_SPACE_SIZE 0x100000 @@ -82,7 +85,11 @@ typedef struct APICCommonClass void (*init)(APICCommonState *s); void (*set_base)(APICCommonState *s, uint64_t val); void (*set_tpr)(APICCommonState *s, uint8_t val); + uint8_t (*get_tpr)(APICCommonState *s); + void (*enable_tpr_reporting)(APICCommonState *s, bool enable); + void (*vapic_base_update)(APICCommonState *s); void (*external_nmi)(APICCommonState *s); + void (*pre_save)(APICCommonState *s); void (*post_load)(APICCommonState *s); } APICCommonClass; @@ -114,9 +121,29 @@ struct APICCommonState { int64_t timer_expiry; int sipi_vector; int wait_for_sipi; + + uint32_t vapic_control; + DeviceState *vapic; + target_phys_addr_t vapic_paddr; /* note: persistence via kvmvapic */ }; +typedef struct VAPICState { + uint8_t tpr; + uint8_t isr; + uint8_t zero; + uint8_t irr; + uint8_t enabled; +} QEMU_PACKED VAPICState; + +extern bool apic_report_tpr_access; + void apic_report_irq_delivered(int delivered); bool apic_next_timer(APICCommonState *s, int64_t current_time); +void apic_enable_tpr_access_reporting(DeviceState *d, bool enable); +void apic_enable_vapic(DeviceState *d, target_phys_addr_t paddr); +void apic_poll_irq(DeviceState *d); + +void vapic_report_tpr_access(DeviceState *dev, void *cpu, target_ulong ip, + TPRAccess access); #endif /* !QEMU_APIC_INTERNAL_H */ diff --git a/hw/kvm/apic.c b/hw/kvm/apic.c index 5bb0a4b9fd..9ca68f81aa 100644 --- a/hw/kvm/apic.c +++ b/hw/kvm/apic.c @@ -92,6 +92,35 @@ static void kvm_apic_set_tpr(APICCommonState *s, uint8_t val) s->tpr = (val & 0x0f) << 4; } +static uint8_t kvm_apic_get_tpr(APICCommonState *s) +{ + return s->tpr >> 4; +} + +static void kvm_apic_enable_tpr_reporting(APICCommonState *s, bool enable) +{ + struct kvm_tpr_access_ctl ctl = { + .enabled = enable + }; + + kvm_vcpu_ioctl(s->cpu_env, KVM_TPR_ACCESS_REPORTING, &ctl); +} + +static void kvm_apic_vapic_base_update(APICCommonState *s) +{ + struct kvm_vapic_addr vapid_addr = { + .vapic_addr = s->vapic_paddr, + }; + int ret; + + ret = kvm_vcpu_ioctl(s->cpu_env, KVM_SET_VAPIC_ADDR, &vapid_addr); + if (ret < 0) { + fprintf(stderr, "KVM: setting VAPIC address failed (%s)\n", + strerror(-ret)); + abort(); + } +} + static void do_inject_external_nmi(void *data) { APICCommonState *s = data; @@ -129,6 +158,9 @@ static void kvm_apic_class_init(ObjectClass *klass, void *data) k->init = kvm_apic_init; k->set_base = kvm_apic_set_base; k->set_tpr = kvm_apic_set_tpr; + k->get_tpr = kvm_apic_get_tpr; + k->enable_tpr_reporting = kvm_apic_enable_tpr_reporting; + k->vapic_base_update = kvm_apic_vapic_base_update; k->external_nmi = kvm_apic_external_nmi; } diff --git a/hw/kvmvapic.c b/hw/kvmvapic.c new file mode 100644 index 0000000000..36ccfbcdbd --- /dev/null +++ b/hw/kvmvapic.c @@ -0,0 +1,805 @@ +/* + * TPR optimization for 32-bit Windows guests (XP and Server 2003) + * + * Copyright (C) 2007-2008 Qumranet Technologies + * Copyright (C) 2012 Jan Kiszka, Siemens AG + * + * This work is licensed under the terms of the GNU GPL version 2, or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ +#include "sysemu.h" +#include "cpus.h" +#include "kvm.h" +#include "apic_internal.h" + +#define APIC_DEFAULT_ADDRESS 0xfee00000 + +#define VAPIC_IO_PORT 0x7e + +#define VAPIC_CPU_SHIFT 7 + +#define ROM_BLOCK_SIZE 512 +#define ROM_BLOCK_MASK (~(ROM_BLOCK_SIZE - 1)) + +typedef enum VAPICMode { + VAPIC_INACTIVE = 0, + VAPIC_ACTIVE = 1, + VAPIC_STANDBY = 2, +} VAPICMode; + +typedef struct VAPICHandlers { + uint32_t set_tpr; + uint32_t set_tpr_eax; + uint32_t get_tpr[8]; + uint32_t get_tpr_stack; +} QEMU_PACKED VAPICHandlers; + +typedef struct GuestROMState { + char signature[8]; + uint32_t vaddr; + uint32_t fixup_start; + uint32_t fixup_end; + uint32_t vapic_vaddr; + uint32_t vapic_size; + uint32_t vcpu_shift; + uint32_t real_tpr_addr; + VAPICHandlers up; + VAPICHandlers mp; +} QEMU_PACKED GuestROMState; + +typedef struct VAPICROMState { + SysBusDevice busdev; + MemoryRegion io; + MemoryRegion rom; + uint32_t state; + uint32_t rom_state_paddr; + uint32_t rom_state_vaddr; + uint32_t vapic_paddr; + uint32_t real_tpr_addr; + GuestROMState rom_state; + size_t rom_size; + bool rom_mapped_writable; +} VAPICROMState; + +#define TPR_INSTR_ABS_MODRM 0x1 +#define TPR_INSTR_MATCH_MODRM_REG 0x2 + +typedef struct TPRInstruction { + uint8_t opcode; + uint8_t modrm_reg; + unsigned int flags; + TPRAccess access; + size_t length; + off_t addr_offset; +} TPRInstruction; + +/* must be sorted by length, shortest first */ +static const TPRInstruction tpr_instr[] = { + { /* mov abs to eax */ + .opcode = 0xa1, + .access = TPR_ACCESS_READ, + .length = 5, + .addr_offset = 1, + }, + { /* mov eax to abs */ + .opcode = 0xa3, + .access = TPR_ACCESS_WRITE, + .length = 5, + .addr_offset = 1, + }, + { /* mov r32 to r/m32 */ + .opcode = 0x89, + .flags = TPR_INSTR_ABS_MODRM, + .access = TPR_ACCESS_WRITE, + .length = 6, + .addr_offset = 2, + }, + { /* mov r/m32 to r32 */ + .opcode = 0x8b, + .flags = TPR_INSTR_ABS_MODRM, + .access = TPR_ACCESS_READ, + .length = 6, + .addr_offset = 2, + }, + { /* push r/m32 */ + .opcode = 0xff, + .modrm_reg = 6, + .flags = TPR_INSTR_ABS_MODRM | TPR_INSTR_MATCH_MODRM_REG, + .access = TPR_ACCESS_READ, + .length = 6, + .addr_offset = 2, + }, + { /* mov imm32, r/m32 (c7/0) */ + .opcode = 0xc7, + .modrm_reg = 0, + .flags = TPR_INSTR_ABS_MODRM | TPR_INSTR_MATCH_MODRM_REG, + .access = TPR_ACCESS_WRITE, + .length = 10, + .addr_offset = 2, + }, +}; + +static void read_guest_rom_state(VAPICROMState *s) +{ + cpu_physical_memory_rw(s->rom_state_paddr, (void *)&s->rom_state, + sizeof(GuestROMState), 0); +} + +static void write_guest_rom_state(VAPICROMState *s) +{ + cpu_physical_memory_rw(s->rom_state_paddr, (void *)&s->rom_state, + sizeof(GuestROMState), 1); +} + +static void update_guest_rom_state(VAPICROMState *s) +{ + read_guest_rom_state(s); + + s->rom_state.real_tpr_addr = cpu_to_le32(s->real_tpr_addr); + s->rom_state.vcpu_shift = cpu_to_le32(VAPIC_CPU_SHIFT); + + write_guest_rom_state(s); +} + +static int find_real_tpr_addr(VAPICROMState *s, CPUState *env) +{ + target_phys_addr_t paddr; + target_ulong addr; + + if (s->state == VAPIC_ACTIVE) { + return 0; + } + /* + * If there is no prior TPR access instruction we could analyze (which is + * the case after resume from hibernation), we need to scan the possible + * virtual address space for the APIC mapping. + */ + for (addr = 0xfffff000; addr >= 0x80000000; addr -= TARGET_PAGE_SIZE) { + paddr = cpu_get_phys_page_debug(env, addr); + if (paddr != APIC_DEFAULT_ADDRESS) { + continue; + } + s->real_tpr_addr = addr + 0x80; + update_guest_rom_state(s); + return 0; + } + return -1; +} + +static uint8_t modrm_reg(uint8_t modrm) +{ + return (modrm >> 3) & 7; +} + +static bool is_abs_modrm(uint8_t modrm) +{ + return (modrm & 0xc7) == 0x05; +} + +static bool opcode_matches(uint8_t *opcode, const TPRInstruction *instr) +{ + return opcode[0] == instr->opcode && + (!(instr->flags & TPR_INSTR_ABS_MODRM) || is_abs_modrm(opcode[1])) && + (!(instr->flags & TPR_INSTR_MATCH_MODRM_REG) || + modrm_reg(opcode[1]) == instr->modrm_reg); +} + +static int evaluate_tpr_instruction(VAPICROMState *s, CPUState *env, + target_ulong *pip, TPRAccess access) +{ + const TPRInstruction *instr; + target_ulong ip = *pip; + uint8_t opcode[2]; + uint32_t real_tpr_addr; + int i; + + if ((ip & 0xf0000000ULL) != 0x80000000ULL && + (ip & 0xf0000000ULL) != 0xe0000000ULL) { + return -1; + } + + /* + * Early Windows 2003 SMP initialization contains a + * + * mov imm32, r/m32 + * + * instruction that is patched by TPR optimization. The problem is that + * RSP, used by the patched instruction, is zero, so the guest gets a + * double fault and dies. + */ + if (env->regs[R_ESP] == 0) { + return -1; + } + + if (kvm_enabled() && !kvm_irqchip_in_kernel()) { + /* + * KVM without kernel-based TPR access reporting will pass an IP that + * points after the accessing instruction. So we need to look backward + * to find the reason. + */ + for (i = 0; i < ARRAY_SIZE(tpr_instr); i++) { + instr = &tpr_instr[i]; + if (instr->access != access) { + continue; + } + if (cpu_memory_rw_debug(env, ip - instr->length, opcode, + sizeof(opcode), 0) < 0) { + return -1; + } + if (opcode_matches(opcode, instr)) { + ip -= instr->length; + goto instruction_ok; + } + } + return -1; + } else { + if (cpu_memory_rw_debug(env, ip, opcode, sizeof(opcode), 0) < 0) { + return -1; + } + for (i = 0; i < ARRAY_SIZE(tpr_instr); i++) { + instr = &tpr_instr[i]; + if (opcode_matches(opcode, instr)) { + goto instruction_ok; + } + } + return -1; + } + +instruction_ok: + /* + * Grab the virtual TPR address from the instruction + * and update the cached values. + */ + if (cpu_memory_rw_debug(env, ip + instr->addr_offset, + (void *)&real_tpr_addr, + sizeof(real_tpr_addr), 0) < 0) { + return -1; + } + real_tpr_addr = le32_to_cpu(real_tpr_addr); + if ((real_tpr_addr & 0xfff) != 0x80) { + return -1; + } + s->real_tpr_addr = real_tpr_addr; + update_guest_rom_state(s); + + *pip = ip; + return 0; +} + +static int update_rom_mapping(VAPICROMState *s, CPUState *env, target_ulong ip) +{ + target_phys_addr_t paddr; + uint32_t rom_state_vaddr; + uint32_t pos, patch, offset; + + /* nothing to do if already activated */ + if (s->state == VAPIC_ACTIVE) { + return 0; + } + + /* bail out if ROM init code was not executed (missing ROM?) */ + if (s->state == VAPIC_INACTIVE) { + return -1; + } + + /* find out virtual address of the ROM */ + rom_state_vaddr = s->rom_state_paddr + (ip & 0xf0000000); + paddr = cpu_get_phys_page_debug(env, rom_state_vaddr); + if (paddr == -1) { + return -1; + } + paddr += rom_state_vaddr & ~TARGET_PAGE_MASK; + if (paddr != s->rom_state_paddr) { + return -1; + } + read_guest_rom_state(s); + if (memcmp(s->rom_state.signature, "kvm aPiC", 8) != 0) { + return -1; + } + s->rom_state_vaddr = rom_state_vaddr; + + /* fixup addresses in ROM if needed */ + if (rom_state_vaddr == le32_to_cpu(s->rom_state.vaddr)) { + return 0; + } + for (pos = le32_to_cpu(s->rom_state.fixup_start); + pos < le32_to_cpu(s->rom_state.fixup_end); + pos += 4) { + cpu_physical_memory_rw(paddr + pos - s->rom_state.vaddr, + (void *)&offset, sizeof(offset), 0); + offset = le32_to_cpu(offset); + cpu_physical_memory_rw(paddr + offset, (void *)&patch, + sizeof(patch), 0); + patch = le32_to_cpu(patch); + patch += rom_state_vaddr - le32_to_cpu(s->rom_state.vaddr); + patch = cpu_to_le32(patch); + cpu_physical_memory_rw(paddr + offset, (void *)&patch, + sizeof(patch), 1); + } + read_guest_rom_state(s); + s->vapic_paddr = paddr + le32_to_cpu(s->rom_state.vapic_vaddr) - + le32_to_cpu(s->rom_state.vaddr); + + return 0; +} + +/* + * Tries to read the unique processor number from the Kernel Processor Control + * Region (KPCR) of 32-bit Windows XP and Server 2003. Returns -1 if the KPCR + * cannot be accessed or is considered invalid. This also ensures that we are + * not patching the wrong guest. + */ +static int get_kpcr_number(CPUState *env) +{ + struct kpcr { + uint8_t fill1[0x1c]; + uint32_t self; + uint8_t fill2[0x31]; + uint8_t number; + } QEMU_PACKED kpcr; + + if (cpu_memory_rw_debug(env, env->segs[R_FS].base, + (void *)&kpcr, sizeof(kpcr), 0) < 0 || + kpcr.self != env->segs[R_FS].base) { + return -1; + } + return kpcr.number; +} + +static int vapic_enable(VAPICROMState *s, CPUState *env) +{ + int cpu_number = get_kpcr_number(env); + target_phys_addr_t vapic_paddr; + static const uint8_t enabled = 1; + + if (cpu_number < 0) { + return -1; + } + vapic_paddr = s->vapic_paddr + + (((target_phys_addr_t)cpu_number) << VAPIC_CPU_SHIFT); + cpu_physical_memory_rw(vapic_paddr + offsetof(VAPICState, enabled), + (void *)&enabled, sizeof(enabled), 1); + apic_enable_vapic(env->apic_state, vapic_paddr); + + s->state = VAPIC_ACTIVE; + + return 0; +} + +static void patch_byte(CPUState *env, target_ulong addr, uint8_t byte) +{ + cpu_memory_rw_debug(env, addr, &byte, 1, 1); +} + +static void patch_call(VAPICROMState *s, CPUState *env, target_ulong ip, + uint32_t target) +{ + uint32_t offset; + + offset = cpu_to_le32(target - ip - 5); + patch_byte(env, ip, 0xe8); /* call near */ + cpu_memory_rw_debug(env, ip + 1, (void *)&offset, sizeof(offset), 1); +} + +static void patch_instruction(VAPICROMState *s, CPUState *env, target_ulong ip) +{ + target_phys_addr_t paddr; + VAPICHandlers *handlers; + uint8_t opcode[2]; + uint32_t imm32; + + if (smp_cpus == 1) { + handlers = &s->rom_state.up; + } else { + handlers = &s->rom_state.mp; + } + + pause_all_vcpus(); + + cpu_memory_rw_debug(env, ip, opcode, sizeof(opcode), 0); + + switch (opcode[0]) { + case 0x89: /* mov r32 to r/m32 */ + patch_byte(env, ip, 0x50 + modrm_reg(opcode[1])); /* push reg */ + patch_call(s, env, ip + 1, handlers->set_tpr); + break; + case 0x8b: /* mov r/m32 to r32 */ + patch_byte(env, ip, 0x90); + patch_call(s, env, ip + 1, handlers->get_tpr[modrm_reg(opcode[1])]); + break; + case 0xa1: /* mov abs to eax */ + patch_call(s, env, ip, handlers->get_tpr[0]); + break; + case 0xa3: /* mov eax to abs */ + patch_call(s, env, ip, handlers->set_tpr_eax); + break; + case 0xc7: /* mov imm32, r/m32 (c7/0) */ + patch_byte(env, ip, 0x68); /* push imm32 */ + cpu_memory_rw_debug(env, ip + 6, (void *)&imm32, sizeof(imm32), 0); + cpu_memory_rw_debug(env, ip + 1, (void *)&imm32, sizeof(imm32), 1); + patch_call(s, env, ip + 5, handlers->set_tpr); + break; + case 0xff: /* push r/m32 */ + patch_byte(env, ip, 0x50); /* push eax */ + patch_call(s, env, ip + 1, handlers->get_tpr_stack); + break; + default: + abort(); + } + + resume_all_vcpus(); + + paddr = cpu_get_phys_page_debug(env, ip); + paddr += ip & ~TARGET_PAGE_MASK; + tb_invalidate_phys_page_range(paddr, paddr + 1, 1); +} + +void vapic_report_tpr_access(DeviceState *dev, void *cpu, target_ulong ip, + TPRAccess access) +{ + VAPICROMState *s = DO_UPCAST(VAPICROMState, busdev.qdev, dev); + CPUState *env = cpu; + + cpu_synchronize_state(env); + + if (evaluate_tpr_instruction(s, env, &ip, access) < 0) { + if (s->state == VAPIC_ACTIVE) { + vapic_enable(s, env); + } + return; + } + if (update_rom_mapping(s, env, ip) < 0) { + return; + } + if (vapic_enable(s, env) < 0) { + return; + } + patch_instruction(s, env, ip); +} + +typedef struct VAPICEnableTPRReporting { + DeviceState *apic; + bool enable; +} VAPICEnableTPRReporting; + +static void vapic_do_enable_tpr_reporting(void *data) +{ + VAPICEnableTPRReporting *info = data; + + apic_enable_tpr_access_reporting(info->apic, info->enable); +} + +static void vapic_enable_tpr_reporting(bool enable) +{ + VAPICEnableTPRReporting info = { + .enable = enable, + }; + CPUState *env; + + for (env = first_cpu; env != NULL; env = env->next_cpu) { + info.apic = env->apic_state; + run_on_cpu(env, vapic_do_enable_tpr_reporting, &info); + } +} + +static void vapic_reset(DeviceState *dev) +{ + VAPICROMState *s = DO_UPCAST(VAPICROMState, busdev.qdev, dev); + + if (s->state == VAPIC_ACTIVE) { + s->state = VAPIC_STANDBY; + } + vapic_enable_tpr_reporting(false); +} + +/* + * Set the IRQ polling hypercalls to the supported variant: + * - vmcall if using KVM in-kernel irqchip + * - 32-bit VAPIC port write otherwise + */ +static int patch_hypercalls(VAPICROMState *s) +{ + target_phys_addr_t rom_paddr = s->rom_state_paddr & ROM_BLOCK_MASK; + static const uint8_t vmcall_pattern[] = { /* vmcall */ + 0xb8, 0x1, 0, 0, 0, 0xf, 0x1, 0xc1 + }; + static const uint8_t outl_pattern[] = { /* nop; outl %eax,0x7e */ + 0xb8, 0x1, 0, 0, 0, 0x90, 0xe7, 0x7e + }; + uint8_t alternates[2]; + const uint8_t *pattern; + const uint8_t *patch; + int patches = 0; + off_t pos; + uint8_t *rom; + + rom = g_malloc(s->rom_size); + cpu_physical_memory_rw(rom_paddr, rom, s->rom_size, 0); + + for (pos = 0; pos < s->rom_size - sizeof(vmcall_pattern); pos++) { + if (kvm_irqchip_in_kernel()) { + pattern = outl_pattern; + alternates[0] = outl_pattern[7]; + alternates[1] = outl_pattern[7]; + patch = &vmcall_pattern[5]; + } else { + pattern = vmcall_pattern; + alternates[0] = vmcall_pattern[7]; + alternates[1] = 0xd9; /* AMD's VMMCALL */ + patch = &outl_pattern[5]; + } + if (memcmp(rom + pos, pattern, 7) == 0 && + (rom[pos + 7] == alternates[0] || rom[pos + 7] == alternates[1])) { + cpu_physical_memory_rw(rom_paddr + pos + 5, (uint8_t *)patch, + 3, 1); + /* + * Don't flush the tb here. Under ordinary conditions, the patched + * calls are miles away from the current IP. Under malicious + * conditions, the guest could trick us to crash. + */ + } + } + + g_free(rom); + + if (patches != 0 && patches != 2) { + return -1; + } + + return 0; +} + +/* + * For TCG mode or the time KVM honors read-only memory regions, we need to + * enable write access to the option ROM so that variables can be updated by + * the guest. + */ +static void vapic_map_rom_writable(VAPICROMState *s) +{ + target_phys_addr_t rom_paddr = s->rom_state_paddr & ROM_BLOCK_MASK; + MemoryRegionSection section; + MemoryRegion *as; + size_t rom_size; + uint8_t *ram; + + as = sysbus_address_space(&s->busdev); + + if (s->rom_mapped_writable) { + memory_region_del_subregion(as, &s->rom); + memory_region_destroy(&s->rom); + } + + /* grab RAM memory region (region @rom_paddr may still be pc.rom) */ + section = memory_region_find(as, 0, 1); + + /* read ROM size from RAM region */ + ram = memory_region_get_ram_ptr(section.mr); + rom_size = ram[rom_paddr + 2] * ROM_BLOCK_SIZE; + s->rom_size = rom_size; + + /* We need to round up to avoid creating subpages + * from which we cannot run code. */ + rom_size = TARGET_PAGE_ALIGN(rom_size); + + memory_region_init_alias(&s->rom, "kvmvapic-rom", section.mr, rom_paddr, + rom_size); + memory_region_add_subregion_overlap(as, rom_paddr, &s->rom, 1000); + s->rom_mapped_writable = true; +} + +static int vapic_prepare(VAPICROMState *s) +{ + vapic_map_rom_writable(s); + + if (patch_hypercalls(s) < 0) { + return -1; + } + + vapic_enable_tpr_reporting(true); + + return 0; +} + +static void vapic_write(void *opaque, target_phys_addr_t addr, uint64_t data, + unsigned int size) +{ + CPUState *env = cpu_single_env; + target_phys_addr_t rom_paddr; + VAPICROMState *s = opaque; + + cpu_synchronize_state(env); + + /* + * The VAPIC supports two PIO-based hypercalls, both via port 0x7E. + * o 16-bit write access: + * Reports the option ROM initialization to the hypervisor. Written + * value is the offset of the state structure in the ROM. + * o 8-bit write access: + * Reactivates the VAPIC after a guest hibernation, i.e. after the + * option ROM content has been re-initialized by a guest power cycle. + * o 32-bit write access: + * Poll for pending IRQs, considering the current VAPIC state. + */ + switch (size) { + case 2: + if (s->state == VAPIC_INACTIVE) { + rom_paddr = (env->segs[R_CS].base + env->eip) & ROM_BLOCK_MASK; + s->rom_state_paddr = rom_paddr + data; + + s->state = VAPIC_STANDBY; + } + if (vapic_prepare(s) < 0) { + s->state = VAPIC_INACTIVE; + break; + } + break; + case 1: + if (kvm_enabled()) { + /* + * Disable triggering instruction in ROM by writing a NOP. + * + * We cannot do this in TCG mode as the reported IP is not + * accurate. + */ + pause_all_vcpus(); + patch_byte(env, env->eip - 2, 0x66); + patch_byte(env, env->eip - 1, 0x90); + resume_all_vcpus(); + } + + if (s->state == VAPIC_ACTIVE) { + break; + } + if (update_rom_mapping(s, env, env->eip) < 0) { + break; + } + if (find_real_tpr_addr(s, env) < 0) { + break; + } + vapic_enable(s, env); + break; + default: + case 4: + if (!kvm_irqchip_in_kernel()) { + apic_poll_irq(env->apic_state); + } + break; + } +} + +static const MemoryRegionOps vapic_ops = { + .write = vapic_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static int vapic_init(SysBusDevice *dev) +{ + VAPICROMState *s = FROM_SYSBUS(VAPICROMState, dev); + + memory_region_init_io(&s->io, &vapic_ops, s, "kvmvapic", 2); + sysbus_add_io(dev, VAPIC_IO_PORT, &s->io); + sysbus_init_ioports(dev, VAPIC_IO_PORT, 2); + + option_rom[nb_option_roms].name = "kvmvapic.bin"; + option_rom[nb_option_roms].bootindex = -1; + nb_option_roms++; + + return 0; +} + +static void do_vapic_enable(void *data) +{ + VAPICROMState *s = data; + + vapic_enable(s, first_cpu); +} + +static int vapic_post_load(void *opaque, int version_id) +{ + VAPICROMState *s = opaque; + uint8_t *zero; + + /* + * The old implementation of qemu-kvm did not provide the state + * VAPIC_STANDBY. Reconstruct it. + */ + if (s->state == VAPIC_INACTIVE && s->rom_state_paddr != 0) { + s->state = VAPIC_STANDBY; + } + + if (s->state != VAPIC_INACTIVE) { + if (vapic_prepare(s) < 0) { + return -1; + } + } + if (s->state == VAPIC_ACTIVE) { + if (smp_cpus == 1) { + run_on_cpu(first_cpu, do_vapic_enable, s); + } else { + zero = g_malloc0(s->rom_state.vapic_size); + cpu_physical_memory_rw(s->vapic_paddr, zero, + s->rom_state.vapic_size, 1); + g_free(zero); + } + } + + return 0; +} + +static const VMStateDescription vmstate_handlers = { + .name = "kvmvapic-handlers", + .version_id = 1, + .minimum_version_id = 1, + .minimum_version_id_old = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(set_tpr, VAPICHandlers), + VMSTATE_UINT32(set_tpr_eax, VAPICHandlers), + VMSTATE_UINT32_ARRAY(get_tpr, VAPICHandlers, 8), + VMSTATE_UINT32(get_tpr_stack, VAPICHandlers), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_guest_rom = { + .name = "kvmvapic-guest-rom", + .version_id = 1, + .minimum_version_id = 1, + .minimum_version_id_old = 1, + .fields = (VMStateField[]) { + VMSTATE_UNUSED(8), /* signature */ + VMSTATE_UINT32(vaddr, GuestROMState), + VMSTATE_UINT32(fixup_start, GuestROMState), + VMSTATE_UINT32(fixup_end, GuestROMState), + VMSTATE_UINT32(vapic_vaddr, GuestROMState), + VMSTATE_UINT32(vapic_size, GuestROMState), + VMSTATE_UINT32(vcpu_shift, GuestROMState), + VMSTATE_UINT32(real_tpr_addr, GuestROMState), + VMSTATE_STRUCT(up, GuestROMState, 0, vmstate_handlers, VAPICHandlers), + VMSTATE_STRUCT(mp, GuestROMState, 0, vmstate_handlers, VAPICHandlers), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_vapic = { + .name = "kvm-tpr-opt", /* compatible with qemu-kvm VAPIC */ + .version_id = 1, + .minimum_version_id = 1, + .minimum_version_id_old = 1, + .post_load = vapic_post_load, + .fields = (VMStateField[]) { + VMSTATE_STRUCT(rom_state, VAPICROMState, 0, vmstate_guest_rom, + GuestROMState), + VMSTATE_UINT32(state, VAPICROMState), + VMSTATE_UINT32(real_tpr_addr, VAPICROMState), + VMSTATE_UINT32(rom_state_vaddr, VAPICROMState), + VMSTATE_UINT32(vapic_paddr, VAPICROMState), + VMSTATE_UINT32(rom_state_paddr, VAPICROMState), + VMSTATE_END_OF_LIST() + } +}; + +static void vapic_class_init(ObjectClass *klass, void *data) +{ + SysBusDeviceClass *sc = SYS_BUS_DEVICE_CLASS(klass); + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->no_user = 1; + dc->reset = vapic_reset; + dc->vmsd = &vmstate_vapic; + sc->init = vapic_init; +} + +static TypeInfo vapic_type = { + .name = "kvmvapic", + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(VAPICROMState), + .class_init = vapic_class_init, +}; + +static void vapic_register(void) +{ + type_register_static(&vapic_type); +} + +type_init(vapic_register); |