diff options
-rw-r--r-- | Makefile.objs | 1 | ||||
-rw-r--r-- | arch_init.c | 246 | ||||
-rwxr-xr-x | configure | 14 | ||||
-rw-r--r-- | cpus.c | 3 | ||||
-rw-r--r-- | cutils.c | 42 | ||||
-rw-r--r-- | default-configs/arm-softmmu.mak | 18 | ||||
-rw-r--r-- | docs/xbzrle.txt | 128 | ||||
-rw-r--r-- | hmp-commands.hx | 38 | ||||
-rw-r--r-- | hmp.c | 103 | ||||
-rw-r--r-- | hmp.h | 4 | ||||
-rw-r--r-- | hw/Makefile.objs | 20 | ||||
-rw-r--r-- | hw/arm/Makefile.objs | 15 | ||||
-rw-r--r-- | hw/armv7m.c | 5 | ||||
-rw-r--r-- | hw/armv7m_nvic.c | 21 | ||||
-rw-r--r-- | hw/kvm/i8259.c | 2 | ||||
-rw-r--r-- | hw/kvm/ioapic.c | 2 | ||||
-rw-r--r-- | hw/pc.c | 1 | ||||
-rw-r--r-- | hw/sd.c | 72 | ||||
-rw-r--r-- | hw/sd.h | 6 | ||||
-rw-r--r-- | hw/ssd0323.c | 4 | ||||
-rw-r--r-- | hw/virtio-pci.c | 4 | ||||
-rw-r--r-- | include/qemu/page_cache.h | 79 | ||||
-rw-r--r-- | kvm-all.c | 54 | ||||
-rw-r--r-- | kvm-stub.c | 9 | ||||
-rw-r--r-- | kvm.h | 60 | ||||
-rw-r--r-- | migration.c | 112 | ||||
-rw-r--r-- | migration.h | 21 | ||||
-rw-r--r-- | monitor.c | 14 | ||||
-rw-r--r-- | page_cache.c | 218 | ||||
-rw-r--r-- | qapi-schema.json | 119 | ||||
-rw-r--r-- | qemu-common.h | 21 | ||||
-rw-r--r-- | qmp-commands.hx | 159 | ||||
-rw-r--r-- | savevm.c | 159 | ||||
-rw-r--r-- | target-i386/Makefile.objs | 1 | ||||
-rw-r--r-- | target-i386/kvm-stub.c | 18 | ||||
-rw-r--r-- | target-i386/kvm.c | 13 | ||||
-rw-r--r-- | target-i386/kvm_i386.h | 16 |
37 files changed, 1725 insertions, 97 deletions
diff --git a/Makefile.objs b/Makefile.objs index 8454b536cf..309d066286 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -77,6 +77,7 @@ common-obj-y += qemu-char.o #aio.o common-obj-y += block-migration.o iohandler.o common-obj-y += pflib.o common-obj-y += bitmap.o bitops.o +common-obj-y += page_cache.o common-obj-$(CONFIG_POSIX) += migration-exec.o migration-unix.o migration-fd.o common-obj-$(CONFIG_WIN32) += version.o diff --git a/arch_init.c b/arch_init.c index 7b65c4888b..9b46bfcaa5 100644 --- a/arch_init.c +++ b/arch_init.c @@ -43,6 +43,7 @@ #include "hw/smbios.h" #include "exec-memory.h" #include "hw/pcspk.h" +#include "qemu/page_cache.h" #ifdef DEBUG_ARCH_INIT #define DPRINTF(fmt, ...) \ @@ -106,6 +107,7 @@ const uint32_t arch_type = QEMU_ARCH; #define RAM_SAVE_FLAG_PAGE 0x08 #define RAM_SAVE_FLAG_EOS 0x10 #define RAM_SAVE_FLAG_CONTINUE 0x20 +#define RAM_SAVE_FLAG_XBZRLE 0x40 #ifdef __ALTIVEC__ #include <altivec.h> @@ -173,6 +175,92 @@ static int is_dup_page(uint8_t *page) return 1; } +/* struct contains XBZRLE cache and a static page + used by the compression */ +static struct { + /* buffer used for XBZRLE encoding */ + uint8_t *encoded_buf; + /* buffer for storing page content */ + uint8_t *current_buf; + /* buffer used for XBZRLE decoding */ + uint8_t *decoded_buf; + /* Cache for XBZRLE */ + PageCache *cache; +} XBZRLE = { + .encoded_buf = NULL, + .current_buf = NULL, + .decoded_buf = NULL, + .cache = NULL, +}; + + +int64_t xbzrle_cache_resize(int64_t new_size) +{ + if (XBZRLE.cache != NULL) { + return cache_resize(XBZRLE.cache, new_size / TARGET_PAGE_SIZE) * + TARGET_PAGE_SIZE; + } + return pow2floor(new_size); +} + +/* accounting for migration statistics */ +typedef struct AccountingInfo { + uint64_t dup_pages; + uint64_t norm_pages; + uint64_t iterations; + uint64_t xbzrle_bytes; + uint64_t xbzrle_pages; + uint64_t xbzrle_cache_miss; + uint64_t xbzrle_overflows; +} AccountingInfo; + +static AccountingInfo acct_info; + +static void acct_clear(void) +{ + memset(&acct_info, 0, sizeof(acct_info)); +} + +uint64_t dup_mig_bytes_transferred(void) +{ + return acct_info.dup_pages * TARGET_PAGE_SIZE; +} + +uint64_t dup_mig_pages_transferred(void) +{ + return acct_info.dup_pages; +} + +uint64_t norm_mig_bytes_transferred(void) +{ + return acct_info.norm_pages * TARGET_PAGE_SIZE; +} + +uint64_t norm_mig_pages_transferred(void) +{ + return acct_info.norm_pages; +} + +uint64_t xbzrle_mig_bytes_transferred(void) +{ + return acct_info.xbzrle_bytes; +} + +uint64_t xbzrle_mig_pages_transferred(void) +{ + return acct_info.xbzrle_pages; +} + +uint64_t xbzrle_mig_pages_cache_miss(void) +{ + return acct_info.xbzrle_cache_miss; +} + +uint64_t xbzrle_mig_pages_overflow(void) +{ + return acct_info.xbzrle_overflows; +} + static void save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset, int cont, int flag) { @@ -185,6 +273,61 @@ static void save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset, } +#define ENCODING_FLAG_XBZRLE 0x1 + +static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data, + ram_addr_t current_addr, RAMBlock *block, + ram_addr_t offset, int cont, bool last_stage) +{ + int encoded_len = 0, bytes_sent = -1; + uint8_t *prev_cached_page; + + if (!cache_is_cached(XBZRLE.cache, current_addr)) { + if (!last_stage) { + cache_insert(XBZRLE.cache, current_addr, + g_memdup(current_data, TARGET_PAGE_SIZE)); + } + acct_info.xbzrle_cache_miss++; + return -1; + } + + prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); + + /* save current buffer into memory */ + memcpy(XBZRLE.current_buf, current_data, TARGET_PAGE_SIZE); + + /* XBZRLE encoding (if there is no overflow) */ + encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, + TARGET_PAGE_SIZE, XBZRLE.encoded_buf, + TARGET_PAGE_SIZE); + if (encoded_len == 0) { + DPRINTF("Skipping unmodified page\n"); + return 0; + } else if (encoded_len == -1) { + DPRINTF("Overflow\n"); + acct_info.xbzrle_overflows++; + /* update data in the cache */ + memcpy(prev_cached_page, current_data, TARGET_PAGE_SIZE); + return -1; + } + + /* we need to update the data in the cache, in order to get the same data */ + if (!last_stage) { + memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); + } + + /* Send XBZRLE based compressed page */ + save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_XBZRLE); + qemu_put_byte(f, ENCODING_FLAG_XBZRLE); + qemu_put_be16(f, encoded_len); + qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len); + bytes_sent = encoded_len + 1 + 2; + acct_info.xbzrle_pages++; + acct_info.xbzrle_bytes += bytes_sent; + + return bytes_sent; +} + static RAMBlock *last_block; static ram_addr_t last_offset; @@ -196,12 +339,13 @@ static ram_addr_t last_offset; * n: the amount of bytes written in other case */ -static int ram_save_block(QEMUFile *f) +static int ram_save_block(QEMUFile *f, bool last_stage) { RAMBlock *block = last_block; ram_addr_t offset = last_offset; int bytes_sent = -1; MemoryRegion *mr; + ram_addr_t current_addr; if (!block) block = QLIST_FIRST(&ram_list.blocks); @@ -219,16 +363,31 @@ static int ram_save_block(QEMUFile *f) p = memory_region_get_ram_ptr(mr) + offset; if (is_dup_page(p)) { + acct_info.dup_pages++; save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_COMPRESS); qemu_put_byte(f, *p); bytes_sent = 1; - } else { + } else if (migrate_use_xbzrle()) { + current_addr = block->offset + offset; + bytes_sent = save_xbzrle_page(f, p, current_addr, block, + offset, cont, last_stage); + if (!last_stage) { + p = get_cached_data(XBZRLE.cache, current_addr); + } + } + + /* either we didn't send yet (we may have had XBZRLE overflow) */ + if (bytes_sent == -1) { save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE); qemu_put_buffer(f, p, TARGET_PAGE_SIZE); bytes_sent = TARGET_PAGE_SIZE; + acct_info.norm_pages++; } - break; + /* if page is unmodified, continue to the next */ + if (bytes_sent != 0) { + break; + } } offset += TARGET_PAGE_SIZE; @@ -306,6 +465,15 @@ static void sort_ram_list(void) static void migration_end(void) { memory_global_dirty_log_stop(); + + if (migrate_use_xbzrle()) { + cache_fini(XBZRLE.cache); + g_free(XBZRLE.cache); + g_free(XBZRLE.encoded_buf); + g_free(XBZRLE.current_buf); + g_free(XBZRLE.decoded_buf); + XBZRLE.cache = NULL; + } } static void ram_migration_cancel(void *opaque) @@ -325,6 +493,19 @@ static int ram_save_setup(QEMUFile *f, void *opaque) last_offset = 0; sort_ram_list(); + if (migrate_use_xbzrle()) { + XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() / + TARGET_PAGE_SIZE, + TARGET_PAGE_SIZE); + if (!XBZRLE.cache) { + DPRINTF("Error creating cache\n"); + return -1; + } + XBZRLE.encoded_buf = g_malloc0(TARGET_PAGE_SIZE); + XBZRLE.current_buf = g_malloc(TARGET_PAGE_SIZE); + acct_clear(); + } + /* Make sure all dirty bits are set */ QLIST_FOREACH(block, &ram_list.blocks, next) { for (addr = 0; addr < block->length; addr += TARGET_PAGE_SIZE) { @@ -365,12 +546,13 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) while ((ret = qemu_file_rate_limit(f)) == 0) { int bytes_sent; - bytes_sent = ram_save_block(f); + bytes_sent = ram_save_block(f, false); /* no more blocks to sent */ if (bytes_sent < 0) { break; } bytes_transferred += bytes_sent; + acct_info.iterations++; /* we want to check in the 1st loop, just in case it was the 1st time and we had to sync the dirty bitmap. qemu_get_clock_ns() is a bit expensive, so we only check each some @@ -426,7 +608,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) while (true) { int bytes_sent; - bytes_sent = ram_save_block(f); + bytes_sent = ram_save_block(f, true); /* no more blocks to sent */ if (bytes_sent < 0) { break; @@ -440,6 +622,47 @@ static int ram_save_complete(QEMUFile *f, void *opaque) return 0; } +static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) +{ + int ret, rc = 0; + unsigned int xh_len; + int xh_flags; + + if (!XBZRLE.decoded_buf) { + XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); + } + + /* extract RLE header */ + xh_flags = qemu_get_byte(f); + xh_len = qemu_get_be16(f); + + if (xh_flags != ENCODING_FLAG_XBZRLE) { + fprintf(stderr, "Failed to load XBZRLE page - wrong compression!\n"); + return -1; + } + + if (xh_len > TARGET_PAGE_SIZE) { + fprintf(stderr, "Failed to load XBZRLE page - len overflow!\n"); + return -1; + } + /* load data and decode */ + qemu_get_buffer(f, XBZRLE.decoded_buf, xh_len); + + /* decode RLE */ + ret = xbzrle_decode_buffer(XBZRLE.decoded_buf, xh_len, host, + TARGET_PAGE_SIZE); + if (ret == -1) { + fprintf(stderr, "Failed to load XBZRLE page - decode error!\n"); + rc = -1; + } else if (ret > TARGET_PAGE_SIZE) { + fprintf(stderr, "Failed to load XBZRLE page - size %d exceeds %d!\n", + ret, TARGET_PAGE_SIZE); + abort(); + } + + return rc; +} + static inline void *host_from_stream_offset(QEMUFile *f, ram_addr_t offset, int flags) @@ -553,6 +776,19 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) } qemu_get_buffer(f, host, TARGET_PAGE_SIZE); + } else if (flags & RAM_SAVE_FLAG_XBZRLE) { + if (!migrate_use_xbzrle()) { + return -EINVAL; + } + void *host = host_from_stream_offset(f, addr, flags); + if (!host) { + return -EINVAL; + } + + if (load_xbzrle(f, addr, host) < 0) { + ret = -EINVAL; + goto done; + } } error = qemu_file_get_error(f); if (error) { @@ -3553,15 +3553,23 @@ if test "$linux" = "yes" ; then mkdir -p linux-headers case "$cpu" in i386|x86_64) - symlink "$source_path/linux-headers/asm-x86" linux-headers/asm + linux_arch=x86 ;; ppcemb|ppc|ppc64) - symlink "$source_path/linux-headers/asm-powerpc" linux-headers/asm + linux_arch=powerpc ;; s390x) - symlink "$source_path/linux-headers/asm-s390" linux-headers/asm + linux_arch=s390 + ;; + *) + # For most CPUs the kernel architecture name and QEMU CPU name match. + linux_arch="$cpu" ;; esac + # For non-KVM architectures we will not have asm headers + if [ -e "$source_path/linux-headers/asm-$linux_arch" ]; then + symlink "$source_path/linux-headers/asm-$linux_arch" linux-headers/asm + fi fi for target in $target_list; do @@ -70,7 +70,8 @@ static bool cpu_thread_is_idle(CPUArchState *env) if (env->stopped || !runstate_is_running()) { return true; } - if (!env->halted || qemu_cpu_has_work(env) || kvm_irqchip_in_kernel()) { + if (!env->halted || qemu_cpu_has_work(env) || + kvm_async_interrupts_enabled()) { return false; } return true; @@ -382,3 +382,45 @@ int qemu_parse_fd(const char *param) } return fd; } + +/* round down to the nearest power of 2*/ +int64_t pow2floor(int64_t value) +{ + if (!is_power_of_2(value)) { + value = 0x8000000000000000ULL >> clz64(value); + } + return value; +} + +/* + * Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128) + * Input is limited to 14-bit numbers + */ +int uleb128_encode_small(uint8_t *out, uint32_t n) +{ + g_assert(n <= 0x3fff); + if (n < 0x80) { + *out++ = n; + return 1; + } else { + *out++ = (n & 0x7f) | 0x80; + *out++ = n >> 7; + return 2; + } +} + +int uleb128_decode_small(const uint8_t *in, uint32_t *n) +{ + if (!(*in & 0x80)) { + *n = *in++; + return 1; + } else { + *n = *in++ & 0x7f; + /* we exceed 14 bit number */ + if (*in & 0x80) { + return -1; + } + *n |= *in++ << 7; + return 2; + } +} diff --git a/default-configs/arm-softmmu.mak b/default-configs/arm-softmmu.mak index e542b4f8ee..f335a725f0 100644 --- a/default-configs/arm-softmmu.mak +++ b/default-configs/arm-softmmu.mak @@ -27,3 +27,21 @@ CONFIG_SMC91C111=y CONFIG_DS1338=y CONFIG_PFLASH_CFI01=y CONFIG_PFLASH_CFI02=y + +CONFIG_ARM_TIMER=y +CONFIG_PL011=y +CONFIG_PL022=y +CONFIG_PL031=y +CONFIG_PL041=y +CONFIG_PL050=y +CONFIG_PL061=y +CONFIG_PL080=y +CONFIG_PL110=y +CONFIG_PL181=y +CONFIG_PL190=y +CONFIG_PL310=y +CONFIG_CADENCE=y +CONFIG_XGMAC=y + +CONFIG_VERSATILE_PCI=y +CONFIG_VERSATILE_I2C=y diff --git a/docs/xbzrle.txt b/docs/xbzrle.txt new file mode 100644 index 0000000000..cc3a26a91d --- /dev/null +++ b/docs/xbzrle.txt @@ -0,0 +1,128 @@ +XBZRLE (Xor Based Zero Run Length Encoding) +=========================================== + +Using XBZRLE (Xor Based Zero Run Length Encoding) allows for the reduction +of VM downtime and the total live-migration time of Virtual machines. +It is particularly useful for virtual machines running memory write intensive +workloads that are typical of large enterprise applications such as SAP ERP +Systems, and generally speaking for any application that uses a sparse memory +update pattern. + +Instead of sending the changed guest memory page this solution will send a +compressed version of the updates, thus reducing the amount of data sent during +live migration. +In order to be able to calculate the update, the previous memory pages need to +be stored on the source. Those pages are stored in a dedicated cache +(hash table) and are accessed by their address. +The larger the cache size the better the chances are that the page has already +been stored in the cache. +A small cache size will result in high cache miss rate. +Cache size can be changed before and during migration. + +Format +======= + +The compression format performs a XOR between the previous and current content +of the page, where zero represents an unchanged value. +The page data delta is represented by zero and non zero runs. +A zero run is represented by its length (in bytes). +A non zero run is represented by its length (in bytes) and the new data. +The run length is encoded using ULEB128 (http://en.wikipedia.org/wiki/LEB128) + +There can be more than one valid encoding, the sender may send a longer encoding +for the benefit of reducing computation cost. + +page = zrun nzrun + | zrun nzrun page + +zrun = length + +nzrun = length byte... + +length = uleb128 encoded integer + +On the sender side XBZRLE is used as a compact delta encoding of page updates, +retrieving the old page content from the cache (default size of 512 MB). The +receiving side uses the existing page's content and XBZRLE to decode the new +page's content. + +This work was originally based on research results published +VEE 2011: Evaluation of Delta Compression Techniques for Efficient Live +Migration of Large Virtual Machines by Benoit, Svard, Tordsson and Elmroth. +Additionally the delta encoder XBRLE was improved further using the XBZRLE +instead. + +XBZRLE has a sustained bandwidth of 2-2.5 GB/s for typical workloads making it +ideal for in-line, real-time encoding such as is needed for live-migration. + +Example +old buffer: +1001 zeros +05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 68 00 00 6b 00 6d +3074 zeros + +new buffer: +1001 zeros +01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 68 00 00 67 00 69 +3074 zeros + +encoded buffer: + +encoded length 24 +e9 07 0f 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 03 01 67 01 01 69 + +Usage +====================== +1. Verify the destination QEMU version is able to decode the new format. + {qemu} info migrate_capabilities + {qemu} xbzrle: off , ... + +2. Activate xbzrle on both source and destination: + {qemu} migrate_set_capability xbzrle on + +3. Set the XBZRLE cache size - the cache size is in MBytes and should be a +power of 2. The cache default value is 64MBytes. (on source only) + {qemu} migrate_set_cache_size 256m + +4. Start outgoing migration + {qemu} migrate -d tcp:destination.host:4444 + {qemu} info migrate + capabilities: xbzrle: on + Migration status: active + transferred ram: A kbytes + remaining ram: B kbytes + total ram: C kbytes + total time: D milliseconds + duplicate: E pages + normal: F pages + normal bytes: G kbytes + cache size: H bytes + xbzrle transferred: I kbytes + xbzrle pages: J pages + xbzrle cache miss: K + xbzrle overflow : L + +xbzrle cache-miss: the number of cache misses to date - high cache-miss rate +indicates that the cache size is set too low. +xbzrle overflow: the number of overflows in the decoding which where the delta +could not be compressed. This can happen if the changes in the pages are too +large or there are many short changes; for example, changing every second byte +(half a page). + +Testing: Testing indicated that live migration with XBZRLE was completed in 110 +seconds, whereas without it would not be able to complete. + +A simple synthetic memory r/w load generator: +.. include <stdlib.h> +.. include <stdio.h> +.. int main() +.. { +.. char *buf = (char *) calloc(4096, 4096); +.. while (1) { +.. int i; +.. for (i = 0; i < 4096 * 4; i++) { +.. buf[i * 4096 / 4]++; +.. } +.. printf("."); +.. } +.. } diff --git a/hmp-commands.hx b/hmp-commands.hx index 9bbc7f7555..f6104b099f 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -829,6 +829,26 @@ STEXI @item migrate_cancel @findex migrate_cancel Cancel the current VM migration. + +ETEXI + + { + .name = "migrate_set_cache_size", + .args_type = "value:o", + .params = "value", + .help = "set cache size (in bytes) for XBZRLE migrations," + "the cache size will be rounded down to the nearest " + "power of 2.\n" + "The cache size affects the number of cache misses." + "In case of a high cache miss ratio you need to increase" + " the cache size", + .mhandler.cmd = hmp_migrate_set_cache_size, + }, + +STEXI +@item migrate_set_cache_size @var{value} +@findex migrate_set_cache_size +Set cache size to @var{value} (in bytes) for xbzrle migrations. ETEXI { @@ -861,6 +881,20 @@ Set maximum tolerated downtime (in seconds) for migration. ETEXI { + .name = "migrate_set_capability", + .args_type = "capability:s,state:b", + .params = "capability state", + .help = "Enable/Disable the usage of a capability for migration", + .mhandler.cmd = hmp_migrate_set_capability, + }, + +STEXI +@item migrate_set_capability @var{capability} @var{state} +@findex migrate_set_capability +Enable/Disable the usage of a capability @var{capability} for migration. +ETEXI + + { .name = "client_migrate_info", .args_type = "protocol:s,hostname:s,port:i?,tls-port:i?,cert-subject:s?", .params = "protocol hostname port tls-port cert-subject", @@ -1417,6 +1451,10 @@ show CPU statistics show user network stack connection states @item info migrate show migration status +@item info migrate_capabilities +show current migration capabilities +@item info migrate_cache_size +show current migration XBZRLE cache size @item info balloon show balloon information @item info qtree @@ -131,8 +131,21 @@ void hmp_info_mice(Monitor *mon) void hmp_info_migrate(Monitor *mon) { MigrationInfo *info; + MigrationCapabilityStatusList *caps, *cap; info = qmp_query_migrate(NULL); + caps = qmp_query_migrate_capabilities(NULL); + + /* do not display parameters during setup */ + if (info->has_status && caps) { + monitor_printf(mon, "capabilities: "); + for (cap = caps; cap; cap = cap->next) { + monitor_printf(mon, "%s: %s ", + MigrationCapability_lookup[cap->value->capability], + cap->value->state ? "on" : "off"); + } + monitor_printf(mon, "\n"); + } if (info->has_status) { monitor_printf(mon, "Migration status: %s\n", info->status); @@ -147,6 +160,12 @@ void hmp_info_migrate(Monitor *mon) info->ram->total >> 10); monitor_printf(mon, "total time: %" PRIu64 " milliseconds\n", info->ram->total_time); + monitor_printf(mon, "duplicate: %" PRIu64 " pages\n", + info->ram->duplicate); + monitor_printf(mon, "normal: %" PRIu64 " pages\n", + info->ram->normal); + monitor_printf(mon, "normal bytes: %" PRIu64 " kbytes\n", + info->ram->normal_bytes >> 10); } if (info->has_disk) { @@ -158,7 +177,46 @@ void hmp_info_migrate(Monitor *mon) info->disk->total >> 10); } + if (info->has_xbzrle_cache) { + monitor_printf(mon, "cache size: %" PRIu64 " bytes\n", + info->xbzrle_cache->cache_size); + monitor_printf(mon, "xbzrle transferred: %" PRIu64 " kbytes\n", + info->xbzrle_cache->bytes >> 10); + monitor_printf(mon, "xbzrle pages: %" PRIu64 " pages\n", + info->xbzrle_cache->pages); + monitor_printf(mon, "xbzrle cache miss: %" PRIu64 "\n", + info->xbzrle_cache->cache_miss); + monitor_printf(mon, "xbzrle overflow : %" PRIu64 "\n", + info->xbzrle_cache->overflow); + } + qapi_free_MigrationInfo(info); + qapi_free_MigrationCapabilityStatusList(caps); +} + +void hmp_info_migrate_capabilities(Monitor *mon) +{ + MigrationCapabilityStatusList *caps, *cap; + + caps = qmp_query_migrate_capabilities(NULL); + + if (caps) { + monitor_printf(mon, "capabilities: "); + for (cap = caps; cap; cap = cap->next) { + monitor_printf(mon, "%s: %s ", + MigrationCapability_lookup[cap->value->capability], + cap->value->state ? "on" : "off"); + } + monitor_printf(mon, "\n"); + } + + qapi_free_MigrationCapabilityStatusList(caps); +} + +void hmp_info_migrate_cache_size(Monitor *mon) +{ + monitor_printf(mon, "xbzrel cache size: %" PRId64 " kbytes\n", + qmp_query_migrate_cache_size(NULL) >> 10); } void hmp_info_cpus(Monitor *mon) @@ -732,12 +790,57 @@ void hmp_migrate_set_downtime(Monitor *mon, const QDict *qdict) qmp_migrate_set_downtime(value, NULL); } +void hmp_migrate_set_cache_size(Monitor *mon, const QDict *qdict) +{ + int64_t value = qdict_get_int(qdict, "value"); + Error *err = NULL; + + qmp_migrate_set_cache_size(value, &err); + if (err) { + monitor_printf(mon, "%s\n", error_get_pretty(err)); + error_free(err); + return; + } +} + void hmp_migrate_set_speed(Monitor *mon, const QDict *qdict) { int64_t value = qdict_get_int(qdict, "value"); qmp_migrate_set_speed(value, NULL); } +void hmp_migrate_set_capability(Monitor *mon, const QDict *qdict) +{ + const char *cap = qdict_get_str(qdict, "capability"); + bool state = qdict_get_bool(qdict, "state"); + Error *err = NULL; + MigrationCapabilityStatusList *caps = g_malloc0(sizeof(*caps)); + int i; + + for (i = 0; i < MIGRATION_CAPABILITY_MAX; i++) { + if (strcmp(cap, MigrationCapability_lookup[i]) == 0) { + caps->value = g_malloc0(sizeof(*caps->value)); + caps->value->capability = i; + caps->value->state = state; + caps->next = NULL; + qmp_migrate_set_capabilities(caps, &err); + break; + } + } + + if (i == MIGRATION_CAPABILITY_MAX) { + error_set(&err, QERR_INVALID_PARAMETER, cap); + } + + qapi_free_MigrationCapabilityStatusList(caps); + + if (err) { + monitor_printf(mon, "migrate_set_parameter: %s\n", + error_get_pretty(err)); + error_free(err); + } +} + void hmp_set_password(Monitor *mon, const QDict *qdict) { const char *protocol = qdict_get_str(qdict, "protocol"); @@ -26,6 +26,8 @@ void hmp_info_uuid(Monitor *mon); void hmp_info_chardev(Monitor *mon); void hmp_info_mice(Monitor *mon); void hmp_info_migrate(Monitor *mon); +void hmp_info_migrate_capabilities(Monitor *mon); +void hmp_info_migrate_cache_size(Monitor *mon); void hmp_info_cpus(Monitor *mon); void hmp_info_block(Monitor *mon); void hmp_info_blockstats(Monitor *mon); @@ -52,6 +54,8 @@ void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict); void hmp_migrate_cancel(Monitor *mon, const QDict *qdict); void hmp_migrate_set_downtime(Monitor *mon, const QDict *qdict); void hmp_migrate_set_speed(Monitor *mon, const QDict *qdict); +void hmp_migrate_set_capability(Monitor *mon, const QDict *qdict); +void hmp_migrate_set_cache_size(Monitor *mon, const QDict *qdict); void hmp_set_password(Monitor *mon, const QDict *qdict); void hmp_expire_password(Monitor *mon, const QDict *qdict); void hmp_eject(Monitor *mon, const QDict *qdict); diff --git a/hw/Makefile.objs b/hw/Makefile.objs index 6eee9a074c..7f57ed58e2 100644 --- a/hw/Makefile.objs +++ b/hw/Makefile.objs @@ -74,6 +74,26 @@ hw-obj-$(CONFIG_PUV3) += puv3_gpio.o hw-obj-$(CONFIG_PUV3) += puv3_pm.o hw-obj-$(CONFIG_PUV3) += puv3_dma.o +# ARM devices +hw-obj-$(CONFIG_ARM_TIMER) += arm_timer.o +hw-obj-$(CONFIG_PL011) += pl011.o +hw-obj-$(CONFIG_PL022) += pl022.o +hw-obj-$(CONFIG_PL031) += pl031.o +hw-obj-$(CONFIG_PL041) += pl041.o lm4549.o +hw-obj-$(CONFIG_PL050) += pl050.o +hw-obj-$(CONFIG_PL061) += pl061.o +hw-obj-$(CONFIG_PL080) += pl080.o +hw-obj-$(CONFIG_PL110) += pl110.o +hw-obj-$(CONFIG_PL181) += pl181.o +hw-obj-$(CONFIG_PL190) += pl190.o +hw-obj-$(CONFIG_PL310) += arm_l2x0.o +hw-obj-$(CONFIG_VERSATILE_PCI) += versatile_pci.o +hw-obj-$(CONFIG_VERSATILE_I2C) += versatile_i2c.o +hw-obj-$(CONFIG_CADENCE) += cadence_uart.o +hw-obj-$(CONFIG_CADENCE) += cadence_ttc.o +hw-obj-$(CONFIG_CADENCE) += cadence_gem.o +hw-obj-$(CONFIG_XGMAC) += xgmac.o + # PCI watchdog devices hw-obj-$(CONFIG_PCI) += wdt_i6300esb.o diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs index c413780784..2b39fb3c85 100644 --- a/hw/arm/Makefile.objs +++ b/hw/arm/Makefile.objs @@ -1,10 +1,5 @@ -obj-y = integratorcp.o versatilepb.o arm_pic.o arm_timer.o -obj-y += arm_boot.o pl011.o pl031.o pl050.o pl080.o pl110.o pl181.o pl190.o -obj-y += versatile_pci.o -obj-y += versatile_i2c.o -obj-y += cadence_uart.o -obj-y += cadence_ttc.o -obj-y += cadence_gem.o +obj-y = integratorcp.o versatilepb.o arm_pic.o +obj-y += arm_boot.o obj-y += xilinx_zynq.o zynq_slcr.o obj-y += arm_gic.o arm_gic_common.o obj-y += realview_gic.o realview.o arm_sysctl.o arm11mpcore.o a9mpcore.o @@ -12,12 +7,9 @@ obj-y += exynos4210_gic.o exynos4210_combiner.o exynos4210.o obj-y += exynos4_boards.o exynos4210_uart.o exynos4210_pwm.o obj-y += exynos4210_pmu.o exynos4210_mct.o exynos4210_fimd.o obj-y += exynos4210_rtc.o exynos4210_i2c.o -obj-y += arm_l2x0.o obj-y += arm_mptimer.o a15mpcore.o -obj-y += armv7m.o armv7m_nvic.o stellaris.o pl022.o stellaris_enet.o +obj-y += armv7m.o armv7m_nvic.o stellaris.o stellaris_enet.o obj-y += highbank.o -obj-y += pl061.o -obj-y += xgmac.o obj-y += pxa2xx.o pxa2xx_pic.o pxa2xx_gpio.o pxa2xx_timer.o pxa2xx_dma.o obj-y += pxa2xx_lcd.o pxa2xx_mmci.o pxa2xx_pcmcia.o pxa2xx_keypad.o obj-y += gumstix.o @@ -37,7 +29,6 @@ obj-y += strongarm.o obj-y += collie.o obj-y += imx_serial.o imx_ccm.o imx_timer.o imx_avic.o obj-y += kzm.o -obj-y += pl041.o lm4549.o obj-$(CONFIG_FDT) += ../device_tree.o obj-y := $(addprefix ../,$(obj-y)) diff --git a/hw/armv7m.c b/hw/armv7m.c index 8cec78db96..9f66667c6d 100644 --- a/hw/armv7m.c +++ b/hw/armv7m.c @@ -227,6 +227,11 @@ qemu_irq *armv7m_init(MemoryRegion *address_space_mem, big_endian = 0; #endif + if (!kernel_filename) { + fprintf(stderr, "Guest image must be specified (using -kernel)\n"); + exit(1); + } + image_size = load_elf(kernel_filename, NULL, NULL, &entry, &lowaddr, NULL, big_endian, ELF_MACHINE, 1); if (image_size < 0) { diff --git a/hw/armv7m_nvic.c b/hw/armv7m_nvic.c index 4867c1d5fa..6a0832eb3f 100644 --- a/hw/armv7m_nvic.c +++ b/hw/armv7m_nvic.c @@ -467,7 +467,7 @@ static int armv7m_nvic_init(SysBusDevice *dev) s->gic.num_cpu = 1; /* Tell the common code we're an NVIC */ s->gic.revision = 0xffffffff; - s->gic.num_irq = s->num_irq; + s->num_irq = s->gic.num_irq; nc->parent_init(dev); gic_init_irqs_and_distributor(&s->gic, s->num_irq); /* The NVIC and system controller register area looks like this: @@ -498,14 +498,21 @@ static int armv7m_nvic_init(SysBusDevice *dev) return 0; } -static Property armv7m_nvic_properties[] = { +static void armv7m_nvic_instance_init(Object *obj) +{ + /* We have a different default value for the num-irq property + * than our superclass. This function runs after qdev init + * has set the defaults from the Property array and before + * any user-specified property setting, so just modify the + * value in the gic_state struct. + */ + gic_state *s = ARM_GIC_COMMON(obj); /* The ARM v7m may have anything from 0 to 496 external interrupt * IRQ lines. We default to 64. Other boards may differ and should - * set this property appropriately. + * set the num-irq property appropriately. */ - DEFINE_PROP_UINT32("num-irq", nvic_state, num_irq, 64), - DEFINE_PROP_END_OF_LIST(), -}; + s->num_irq = 64; +} static void armv7m_nvic_class_init(ObjectClass *klass, void *data) { @@ -518,12 +525,12 @@ static void armv7m_nvic_class_init(ObjectClass *klass, void *data) sdc->init = armv7m_nvic_init; dc->vmsd = &vmstate_nvic; dc->reset = armv7m_nvic_reset; - dc->props = armv7m_nvic_properties; } static TypeInfo armv7m_nvic_info = { .name = TYPE_NVIC, .parent = TYPE_ARM_GIC_COMMON, + .instance_init = armv7m_nvic_instance_init, .instance_size = sizeof(nvic_state), .class_init = armv7m_nvic_class_init, .class_size = sizeof(NVICClass), diff --git a/hw/kvm/i8259.c b/hw/kvm/i8259.c index 94d1b9aa95..1e24cd4f36 100644 --- a/hw/kvm/i8259.c +++ b/hw/kvm/i8259.c @@ -94,7 +94,7 @@ static void kvm_pic_set_irq(void *opaque, int irq, int level) { int delivered; - delivered = kvm_irqchip_set_irq(kvm_state, irq, level); + delivered = kvm_set_irq(kvm_state, irq, level); apic_report_irq_delivered(delivered); } diff --git a/hw/kvm/ioapic.c b/hw/kvm/ioapic.c index 3ae3175403..6c3b8fe39a 100644 --- a/hw/kvm/ioapic.c +++ b/hw/kvm/ioapic.c @@ -82,7 +82,7 @@ static void kvm_ioapic_set_irq(void *opaque, int irq, int level) KVMIOAPICState *s = opaque; int delivered; - delivered = kvm_irqchip_set_irq(kvm_state, s->kvm_gsi_base + irq, level); + delivered = kvm_set_irq(kvm_state, s->kvm_gsi_base + irq, level); apic_report_irq_delivered(delivered); } @@ -42,6 +42,7 @@ #include "sysbus.h" #include "sysemu.h" #include "kvm.h" +#include "kvm_i386.h" #include "xen.h" #include "blockdev.h" #include "hw/block-common.h" @@ -32,6 +32,7 @@ #include "hw.h" #include "block.h" #include "sd.h" +#include "bitmap.h" //#define DEBUG_SD 1 @@ -80,8 +81,8 @@ struct SDState { uint32_t card_status; uint8_t sd_status[64]; uint32_t vhs; - int wp_switch; - int *wp_groups; + bool wp_switch; + unsigned long *wp_groups; uint64_t size; int blk_len; uint32_t erase_start; @@ -90,12 +91,12 @@ struct SDState { int pwd_len; int function_group[6]; - int spi; + bool spi; int current_cmd; /* True if we will handle the next command as an ACMD. Note that this does * *not* track the APP_CMD status bit! */ - int expecting_acmd; + bool expecting_acmd; int blk_written; uint64_t data_start; uint32_t data_offset; @@ -105,7 +106,7 @@ struct SDState { BlockDriverState *bdrv; uint8_t *buf; - int enable; + bool enable; }; static void sd_set_mode(SDState *sd) @@ -387,6 +388,11 @@ static void sd_response_r7_make(SDState *sd, uint8_t *response) response[3] = (sd->vhs >> 0) & 0xff; } +static inline uint64_t sd_addr_to_wpnum(uint64_t addr) +{ + return addr >> (HWBLOCK_SHIFT + SECTOR_SHIFT + WPGROUP_SHIFT); +} + static void sd_reset(SDState *sd, BlockDriverState *bdrv) { uint64_t size; @@ -399,7 +405,7 @@ static void sd_reset(SDState *sd, BlockDriverState *bdrv) } size = sect << 9; - sect = (size >> (HWBLOCK_SHIFT + SECTOR_SHIFT + WPGROUP_SHIFT)) + 1; + sect = sd_addr_to_wpnum(size) + 1; sd->state = sd_idle_state; sd->rca = 0x0000; @@ -414,15 +420,15 @@ static void sd_reset(SDState *sd, BlockDriverState *bdrv) if (sd->wp_groups) g_free(sd->wp_groups); - sd->wp_switch = bdrv ? bdrv_is_read_only(bdrv) : 0; - sd->wp_groups = (int *) g_malloc0(sizeof(int) * sect); + sd->wp_switch = bdrv ? bdrv_is_read_only(bdrv) : false; + sd->wp_groups = bitmap_new(sect); memset(sd->function_group, 0, sizeof(int) * 6); sd->erase_start = 0; sd->erase_end = 0; sd->size = size; sd->blk_len = 0x200; sd->pwd_len = 0; - sd->expecting_acmd = 0; + sd->expecting_acmd = false; } static void sd_cardchange(void *opaque, bool load) @@ -444,14 +450,14 @@ static const BlockDevOps sd_block_ops = { whether card should be in SSI or MMC/SD mode. It is also up to the board to ensure that ssi transfers only occur when the chip select is asserted. */ -SDState *sd_init(BlockDriverState *bs, int is_spi) +SDState *sd_init(BlockDriverState *bs, bool is_spi) { SDState *sd; sd = (SDState *) g_malloc0(sizeof(SDState)); sd->buf = qemu_blockalign(bs, 512); sd->spi = is_spi; - sd->enable = 1; + sd->enable = true; sd_reset(sd, bs); if (sd->bdrv) { bdrv_attach_dev_nofail(sd->bdrv, sd); @@ -476,17 +482,17 @@ static void sd_erase(SDState *sd) return; } - start = sd->erase_start >> - (HWBLOCK_SHIFT + SECTOR_SHIFT + WPGROUP_SHIFT); - end = sd->erase_end >> - (HWBLOCK_SHIFT + SECTOR_SHIFT + WPGROUP_SHIFT); + start = sd_addr_to_wpnum(sd->erase_start); + end = sd_addr_to_wpnum(sd->erase_end); sd->erase_start = 0; sd->erase_end = 0; sd->csd[14] |= 0x40; - for (i = start; i <= end; i ++) - if (sd->wp_groups[i]) + for (i = start; i <= end; i++) { + if (test_bit(i, sd->wp_groups)) { sd->card_status |= WP_ERASE_SKIP; + } + } } static uint32_t sd_wpbits(SDState *sd, uint64_t addr) @@ -494,11 +500,13 @@ static uint32_t sd_wpbits(SDState *sd, uint64_t addr) uint32_t i, wpnum; uint32_t ret = 0; - wpnum = addr >> (HWBLOCK_SHIFT + SECTOR_SHIFT + WPGROUP_SHIFT); + wpnum = sd_addr_to_wpnum(addr); - for (i = 0; i < 32; i ++, wpnum ++, addr += WPGROUP_SIZE) - if (addr < sd->size && sd->wp_groups[wpnum]) + for (i = 0; i < 32; i++, wpnum++, addr += WPGROUP_SIZE) { + if (addr < sd->size && test_bit(wpnum, sd->wp_groups)) { ret |= (1 << i); + } + } return ret; } @@ -534,10 +542,9 @@ static void sd_function_switch(SDState *sd, uint32_t arg) sd->data[66] = crc & 0xff; } -static inline int sd_wp_addr(SDState *sd, uint32_t addr) +static inline bool sd_wp_addr(SDState *sd, uint64_t addr) { - return sd->wp_groups[addr >> - (HWBLOCK_SHIFT + SECTOR_SHIFT + WPGROUP_SHIFT)]; + return test_bit(sd_addr_to_wpnum(addr), sd->wp_groups); } static void sd_lock_command(SDState *sd) @@ -560,8 +567,7 @@ static void sd_lock_command(SDState *sd) sd->card_status |= LOCK_UNLOCK_FAILED; return; } - memset(sd->wp_groups, 0, sizeof(int) * (sd->size >> - (HWBLOCK_SHIFT + SECTOR_SHIFT + WPGROUP_SHIFT))); + bitmap_zero(sd->wp_groups, sd_addr_to_wpnum(sd->size) + 1); sd->csd[14] &= ~0x10; sd->card_status &= ~CARD_IS_LOCKED; sd->pwd_len = 0; @@ -1007,8 +1013,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, } sd->state = sd_programming_state; - sd->wp_groups[addr >> (HWBLOCK_SHIFT + - SECTOR_SHIFT + WPGROUP_SHIFT)] = 1; + set_bit(sd_addr_to_wpnum(addr), sd->wp_groups); /* Bzzzzzzztt .... Operation complete. */ sd->state = sd_transfer_state; return sd_r1b; @@ -1027,8 +1032,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, } sd->state = sd_programming_state; - sd->wp_groups[addr >> (HWBLOCK_SHIFT + - SECTOR_SHIFT + WPGROUP_SHIFT)] = 0; + clear_bit(sd_addr_to_wpnum(addr), sd->wp_groups); /* Bzzzzzzztt .... Operation complete. */ sd->state = sd_transfer_state; return sd_r1b; @@ -1125,7 +1129,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, if (sd->rca != rca) return sd_r0; - sd->expecting_acmd = 1; + sd->expecting_acmd = true; sd->card_status |= APP_CMD; return sd_r1; @@ -1307,7 +1311,7 @@ int sd_do_command(SDState *sd, SDRequest *req, if (sd->card_status & CARD_IS_LOCKED) { if (!cmd_valid_while_locked(sd, req)) { sd->card_status |= ILLEGAL_COMMAND; - sd->expecting_acmd = 0; + sd->expecting_acmd = false; fprintf(stderr, "SD: Card is locked\n"); rtype = sd_illegal; goto send_response; @@ -1318,7 +1322,7 @@ int sd_do_command(SDState *sd, SDRequest *req, sd_set_mode(sd); if (sd->expecting_acmd) { - sd->expecting_acmd = 0; + sd->expecting_acmd = false; rtype = sd_app_command(sd, *req); } else { rtype = sd_normal_command(sd, *req); @@ -1699,12 +1703,12 @@ uint8_t sd_read_data(SDState *sd) return ret; } -int sd_data_ready(SDState *sd) +bool sd_data_ready(SDState *sd) { return sd->state == sd_sendingdata_state; } -void sd_enable(SDState *sd, int enable) +void sd_enable(SDState *sd, bool enable) { sd->enable = enable; } @@ -67,13 +67,13 @@ typedef struct { typedef struct SDState SDState; -SDState *sd_init(BlockDriverState *bs, int is_spi); +SDState *sd_init(BlockDriverState *bs, bool is_spi); int sd_do_command(SDState *sd, SDRequest *req, uint8_t *response); void sd_write_data(SDState *sd, uint8_t value); uint8_t sd_read_data(SDState *sd); void sd_set_cb(SDState *sd, qemu_irq readonly, qemu_irq insert); -int sd_data_ready(SDState *sd); -void sd_enable(SDState *sd, int enable); +bool sd_data_ready(SDState *sd); +void sd_enable(SDState *sd, bool enable); #endif /* __hw_sd_h */ diff --git a/hw/ssd0323.c b/hw/ssd0323.c index b0b2e94a81..b101c5112c 100644 --- a/hw/ssd0323.c +++ b/hw/ssd0323.c @@ -19,7 +19,9 @@ #define DPRINTF(fmt, ...) \ do { printf("ssd0323: " fmt , ## __VA_ARGS__); } while (0) #define BADF(fmt, ...) \ -do { fprintf(stderr, "ssd0323: error: " fmt , ## __VA_ARGS__); exit(1);} while (0) +do { \ + fprintf(stderr, "ssd0323: error: " fmt , ## __VA_ARGS__); abort(); \ +} while (0) #else #define DPRINTF(fmt, ...) do {} while(0) #define BADF(fmt, ...) \ diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c index 125eded9ca..5e6e09efb7 100644 --- a/hw/virtio-pci.c +++ b/hw/virtio-pci.c @@ -627,7 +627,7 @@ static int virtio_pci_set_guest_notifiers(void *opaque, bool assign) int r, n; /* Must unset vector notifier while guest notifier is still assigned */ - if (kvm_irqchip_in_kernel() && !assign) { + if (kvm_msi_via_irqfd_enabled() && !assign) { msix_unset_vector_notifiers(&proxy->pci_dev); g_free(proxy->vector_irqfd); proxy->vector_irqfd = NULL; @@ -645,7 +645,7 @@ static int virtio_pci_set_guest_notifiers(void *opaque, bool assign) } /* Must set vector notifier after guest notifier has been assigned */ - if (kvm_irqchip_in_kernel() && assign) { + if (kvm_msi_via_irqfd_enabled() && assign) { proxy->vector_irqfd = g_malloc0(sizeof(*proxy->vector_irqfd) * msix_nr_vectors_allocated(&proxy->pci_dev)); diff --git a/include/qemu/page_cache.h b/include/qemu/page_cache.h new file mode 100644 index 0000000000..3839ac7726 --- /dev/null +++ b/include/qemu/page_cache.h @@ -0,0 +1,79 @@ +/* + * Page cache for QEMU + * The cache is base on a hash of the page address + * + * Copyright 2012 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Orit Wasserman <owasserm@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef PAGE_CACHE_H +#define PAGE_CACHE_H + +/* Page cache for storing guest pages */ +typedef struct PageCache PageCache; + +/** + * cache_init: Initialize the page cache + * + * + * Returns new allocated cache or NULL on error + * + * @cache pointer to the PageCache struct + * @num_pages: cache maximal number of cached pages + * @page_size: cache page size + */ +PageCache *cache_init(int64_t num_pages, unsigned int page_size); + +/** + * cache_fini: free all cache resources + * @cache pointer to the PageCache struct + */ +void cache_fini(PageCache *cache); + +/** + * cache_is_cached: Checks to see if the page is cached + * + * Returns %true if page is cached + * + * @cache pointer to the PageCache struct + * @addr: page addr + */ +bool cache_is_cached(const PageCache *cache, uint64_t addr); + +/** + * get_cached_data: Get the data cached for an addr + * + * Returns pointer to the data cached or NULL if not cached + * + * @cache pointer to the PageCache struct + * @addr: page addr + */ +uint8_t *get_cached_data(const PageCache *cache, uint64_t addr); + +/** + * cache_insert: insert the page into the cache. the previous value will be overwritten + * + * @cache pointer to the PageCache struct + * @addr: page address + * @pdata: pointer to the page + */ +void cache_insert(PageCache *cache, uint64_t addr, uint8_t *pdata); + +/** + * cache_resize: resize the page cache. In case of size reduction the extra + * pages will be freed + * + * Returns -1 on error new cache size on success + * + * @cache pointer to the PageCache struct + * @num_pages: new page cache size (in pages) + */ +int64_t cache_resize(PageCache *cache, int64_t num_pages); + +#endif @@ -100,6 +100,10 @@ struct KVMState KVMState *kvm_state; bool kvm_kernel_irqchip; +bool kvm_async_interrupts_allowed; +bool kvm_irqfds_allowed; +bool kvm_msi_via_irqfd_allowed; +bool kvm_gsi_routing_allowed; static const KVMCapabilityInfo kvm_required_capabilites[] = { KVM_CAP_INFO(USER_MEMORY), @@ -852,18 +856,18 @@ static void kvm_handle_interrupt(CPUArchState *env, int mask) } } -int kvm_irqchip_set_irq(KVMState *s, int irq, int level) +int kvm_set_irq(KVMState *s, int irq, int level) { struct kvm_irq_level event; int ret; - assert(kvm_irqchip_in_kernel()); + assert(kvm_async_interrupts_enabled()); event.level = level; event.irq = irq; ret = kvm_vm_ioctl(s, s->irqchip_inject_ioctl, &event); if (ret < 0) { - perror("kvm_set_irqchip_line"); + perror("kvm_set_irq"); abort(); } @@ -1088,7 +1092,7 @@ int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) assert(route->kroute.type == KVM_IRQ_ROUTING_MSI); - return kvm_irqchip_set_irq(s, route->kroute.gsi, 1); + return kvm_set_irq(s, route->kroute.gsi, 1); } int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg) @@ -1096,7 +1100,7 @@ int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg) struct kvm_irq_routing_entry kroute; int virq; - if (!kvm_irqchip_in_kernel()) { + if (!kvm_gsi_routing_enabled()) { return -ENOSYS; } @@ -1125,7 +1129,7 @@ static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign) .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN, }; - if (!kvm_irqchip_in_kernel()) { + if (!kvm_irqfds_enabled()) { return -ENOSYS; } @@ -1201,12 +1205,36 @@ static int kvm_irqchip_create(KVMState *s) s->irqchip_inject_ioctl = KVM_IRQ_LINE_STATUS; } kvm_kernel_irqchip = true; + /* If we have an in-kernel IRQ chip then we must have asynchronous + * interrupt delivery (though the reverse is not necessarily true) + */ + kvm_async_interrupts_allowed = true; kvm_init_irq_routing(s); return 0; } +static int kvm_max_vcpus(KVMState *s) +{ + int ret; + + /* Find number of supported CPUs using the recommended + * procedure from the kernel API documentation to cope with + * older kernels that may be missing capabilities. + */ + ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS); + if (ret) { + return ret; + } + ret = kvm_check_extension(s, KVM_CAP_NR_VCPUS); + if (ret) { + return ret; + } + + return 4; +} + int kvm_init(void) { static const char upgrade_note[] = @@ -1216,6 +1244,7 @@ int kvm_init(void) const KVMCapabilityInfo *missing_cap; int ret; int i; + int max_vcpus; s = g_malloc0(sizeof(KVMState)); @@ -1256,6 +1285,14 @@ int kvm_init(void) goto err; } + max_vcpus = kvm_max_vcpus(s); + if (smp_cpus > max_vcpus) { + ret = -EINVAL; + fprintf(stderr, "Number of SMP cpus requested (%d) exceeds max cpus " + "supported by KVM (%d)\n", smp_cpus, max_vcpus); + goto err; + } + s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0); if (s->vmfd < 0) { #ifdef TARGET_S390X @@ -1667,11 +1704,6 @@ int kvm_has_gsi_routing(void) #endif } -int kvm_allows_irq0_override(void) -{ - return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing(); -} - void *kvm_vmalloc(ram_addr_t size) { #ifdef TARGET_S390X diff --git a/kvm-stub.c b/kvm-stub.c index d23b11c020..94c9ea15b0 100644 --- a/kvm-stub.c +++ b/kvm-stub.c @@ -19,6 +19,10 @@ KVMState *kvm_state; bool kvm_kernel_irqchip; +bool kvm_async_interrupts_allowed; +bool kvm_irqfds_allowed; +bool kvm_msi_via_irqfd_allowed; +bool kvm_gsi_routing_allowed; int kvm_init_vcpu(CPUArchState *env) { @@ -71,11 +75,6 @@ int kvm_has_many_ioeventfds(void) return 0; } -int kvm_allows_irq0_override(void) -{ - return 1; -} - int kvm_has_pit_state2(void) { return 0; @@ -24,13 +24,69 @@ extern int kvm_allowed; extern bool kvm_kernel_irqchip; +extern bool kvm_async_interrupts_allowed; +extern bool kvm_irqfds_allowed; +extern bool kvm_msi_via_irqfd_allowed; +extern bool kvm_gsi_routing_allowed; #if defined CONFIG_KVM || !defined NEED_CPU_H #define kvm_enabled() (kvm_allowed) +/** + * kvm_irqchip_in_kernel: + * + * Returns: true if the user asked us to create an in-kernel + * irqchip via the "kernel_irqchip=on" machine option. + * What this actually means is architecture and machine model + * specific: on PC, for instance, it means that the LAPIC, + * IOAPIC and PIT are all in kernel. This function should never + * be used from generic target-independent code: use one of the + * following functions or some other specific check instead. + */ #define kvm_irqchip_in_kernel() (kvm_kernel_irqchip) + +/** + * kvm_async_interrupts_enabled: + * + * Returns: true if we can deliver interrupts to KVM + * asynchronously (ie by ioctl from any thread at any time) + * rather than having to do interrupt delivery synchronously + * (where the vcpu must be stopped at a suitable point first). + */ +#define kvm_async_interrupts_enabled() (kvm_async_interrupts_allowed) + +/** + * kvm_irqfds_enabled: + * + * Returns: true if we can use irqfds to inject interrupts into + * a KVM CPU (ie the kernel supports irqfds and we are running + * with a configuration where it is meaningful to use them). + */ +#define kvm_irqfds_enabled() (kvm_irqfds_allowed) + +/** + * kvm_msi_via_irqfd_enabled: + * + * Returns: true if we can route a PCI MSI (Message Signaled Interrupt) + * to a KVM CPU via an irqfd. This requires that the kernel supports + * this and that we're running in a configuration that permits it. + */ +#define kvm_msi_via_irqfd_enabled() (kvm_msi_via_irqfd_allowed) + +/** + * kvm_gsi_routing_enabled: + * + * Returns: true if GSI routing is enabled (ie the kernel supports + * it and we're running in a configuration that permits it). + */ +#define kvm_gsi_routing_enabled() (kvm_gsi_routing_allowed) + #else #define kvm_enabled() (0) #define kvm_irqchip_in_kernel() (false) +#define kvm_async_interrupts_enabled() (false) +#define kvm_irqfds_enabled() (false) +#define kvm_msi_via_irqfd_enabled() (false) +#define kvm_gsi_routing_allowed() (false) #endif struct kvm_run; @@ -62,8 +118,6 @@ int kvm_has_pit_state2(void); int kvm_has_many_ioeventfds(void); int kvm_has_gsi_routing(void); -int kvm_allows_irq0_override(void); - #ifdef NEED_CPU_H int kvm_init_vcpu(CPUArchState *env); @@ -133,7 +187,7 @@ int kvm_arch_on_sigbus(int code, void *addr); void kvm_arch_init_irq_routing(KVMState *s); -int kvm_irqchip_set_irq(KVMState *s, int irq, int level); +int kvm_set_irq(KVMState *s, int irq, int level); int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg); void kvm_irqchip_add_irq_route(KVMState *s, int gsi, int irqchip, int pin); diff --git a/migration.c b/migration.c index 8db1b433f0..653a3c1a88 100644 --- a/migration.c +++ b/migration.c @@ -43,6 +43,9 @@ enum { #define MAX_THROTTLE (32 << 20) /* Migration speed throttling */ +/* Migration XBZRLE default cache size */ +#define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024) + static NotifierList migration_state_notifiers = NOTIFIER_LIST_INITIALIZER(migration_state_notifiers); @@ -55,6 +58,7 @@ static MigrationState *migrate_get_current(void) static MigrationState current_migration = { .state = MIG_STATE_SETUP, .bandwidth_limit = MAX_THROTTLE, + .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE, }; return ¤t_migration; @@ -113,6 +117,43 @@ uint64_t migrate_max_downtime(void) return max_downtime; } +MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp) +{ + MigrationCapabilityStatusList *head = NULL; + MigrationCapabilityStatusList *caps; + MigrationState *s = migrate_get_current(); + int i; + + for (i = 0; i < MIGRATION_CAPABILITY_MAX; i++) { + if (head == NULL) { + head = g_malloc0(sizeof(*caps)); + caps = head; + } else { + caps->next = g_malloc0(sizeof(*caps)); + caps = caps->next; + } + caps->value = + g_malloc(sizeof(*caps->value)); + caps->value->capability = i; + caps->value->state = s->enabled_capabilities[i]; + } + + return head; +} + +static void get_xbzrle_cache_stats(MigrationInfo *info) +{ + if (migrate_use_xbzrle()) { + info->has_xbzrle_cache = true; + info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache)); + info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size(); + info->xbzrle_cache->bytes = xbzrle_mig_bytes_transferred(); + info->xbzrle_cache->pages = xbzrle_mig_pages_transferred(); + info->xbzrle_cache->cache_miss = xbzrle_mig_pages_cache_miss(); + info->xbzrle_cache->overflow = xbzrle_mig_pages_overflow(); + } +} + MigrationInfo *qmp_query_migrate(Error **errp) { MigrationInfo *info = g_malloc0(sizeof(*info)); @@ -133,6 +174,9 @@ MigrationInfo *qmp_query_migrate(Error **errp) info->ram->total = ram_bytes_total(); info->ram->total_time = qemu_get_clock_ms(rt_clock) - s->total_time; + info->ram->duplicate = dup_mig_pages_transferred(); + info->ram->normal = norm_mig_pages_transferred(); + info->ram->normal_bytes = norm_mig_bytes_transferred(); if (blk_mig_active()) { info->has_disk = true; @@ -141,8 +185,12 @@ MigrationInfo *qmp_query_migrate(Error **errp) info->disk->remaining = blk_mig_bytes_remaining(); info->disk->total = blk_mig_bytes_total(); } + + get_xbzrle_cache_stats(info); break; case MIG_STATE_COMPLETED: + get_xbzrle_cache_stats(info); + info->has_status = true; info->status = g_strdup("completed"); @@ -152,6 +200,9 @@ MigrationInfo *qmp_query_migrate(Error **errp) info->ram->remaining = 0; info->ram->total = ram_bytes_total(); info->ram->total_time = s->total_time; + info->ram->duplicate = dup_mig_pages_transferred(); + info->ram->normal = norm_mig_pages_transferred(); + info->ram->normal_bytes = norm_mig_bytes_transferred(); break; case MIG_STATE_ERROR: info->has_status = true; @@ -166,6 +217,22 @@ MigrationInfo *qmp_query_migrate(Error **errp) return info; } +void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, + Error **errp) +{ + MigrationState *s = migrate_get_current(); + MigrationCapabilityStatusList *cap; + + if (s->state == MIG_STATE_ACTIVE) { + error_set(errp, QERR_MIGRATION_ACTIVE); + return; + } + + for (cap = params; cap; cap = cap->next) { + s->enabled_capabilities[cap->value->capability] = cap->value->state; + } +} + /* shared migration helpers */ static int migrate_fd_cleanup(MigrationState *s) @@ -375,10 +442,18 @@ static MigrationState *migrate_init(const MigrationParams *params) { MigrationState *s = migrate_get_current(); int64_t bandwidth_limit = s->bandwidth_limit; + bool enabled_capabilities[MIGRATION_CAPABILITY_MAX]; + int64_t xbzrle_cache_size = s->xbzrle_cache_size; + + memcpy(enabled_capabilities, s->enabled_capabilities, + sizeof(enabled_capabilities)); memset(s, 0, sizeof(*s)); s->bandwidth_limit = bandwidth_limit; s->params = *params; + memcpy(s->enabled_capabilities, enabled_capabilities, + sizeof(enabled_capabilities)); + s->xbzrle_cache_size = xbzrle_cache_size; s->bandwidth_limit = bandwidth_limit; s->state = MIG_STATE_SETUP; @@ -459,6 +534,25 @@ void qmp_migrate_cancel(Error **errp) migrate_fd_cancel(migrate_get_current()); } +void qmp_migrate_set_cache_size(int64_t value, Error **errp) +{ + MigrationState *s = migrate_get_current(); + + /* Check for truncation */ + if (value != (size_t)value) { + error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", + "exceeding address space"); + return; + } + + s->xbzrle_cache_size = xbzrle_cache_resize(value); +} + +int64_t qmp_query_migrate_cache_size(Error **errp) +{ + return migrate_xbzrle_cache_size(); +} + void qmp_migrate_set_speed(int64_t value, Error **errp) { MigrationState *s; @@ -478,3 +572,21 @@ void qmp_migrate_set_downtime(double value, Error **errp) value = MAX(0, MIN(UINT64_MAX, value)); max_downtime = (uint64_t)value; } + +int migrate_use_xbzrle(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE]; +} + +int64_t migrate_xbzrle_cache_size(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->xbzrle_cache_size; +} diff --git a/migration.h b/migration.h index 57572a61e9..a9852fcae0 100644 --- a/migration.h +++ b/migration.h @@ -19,6 +19,7 @@ #include "notify.h" #include "error.h" #include "vmstate.h" +#include "qapi-types.h" struct MigrationParams { bool blk; @@ -39,6 +40,8 @@ struct MigrationState void *opaque; MigrationParams params; int64_t total_time; + bool enabled_capabilities[MIGRATION_CAPABILITY_MAX]; + int64_t xbzrle_cache_size; }; void process_incoming_migration(QEMUFile *f); @@ -84,6 +87,15 @@ uint64_t ram_bytes_total(void); extern SaveVMHandlers savevm_ram_handlers; +uint64_t dup_mig_bytes_transferred(void); +uint64_t dup_mig_pages_transferred(void); +uint64_t norm_mig_bytes_transferred(void); +uint64_t norm_mig_pages_transferred(void); +uint64_t xbzrle_mig_bytes_transferred(void); +uint64_t xbzrle_mig_pages_transferred(void); +uint64_t xbzrle_mig_pages_overflow(void); +uint64_t xbzrle_mig_pages_cache_miss(void); + /** * @migrate_add_blocker - prevent migration from proceeding * @@ -98,4 +110,13 @@ void migrate_add_blocker(Error *reason); */ void migrate_del_blocker(Error *reason); +int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen, + uint8_t *dst, int dlen); +int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen); + +int migrate_use_xbzrle(void); +int64_t migrate_xbzrle_cache_size(void); + +int64_t xbzrle_cache_resize(int64_t new_size); + #endif @@ -2632,6 +2632,20 @@ static mon_cmd_t info_cmds[] = { .mhandler.info = hmp_info_migrate, }, { + .name = "migrate_capabilities", + .args_type = "", + .params = "", + .help = "show current migration capabilities", + .mhandler.info = hmp_info_migrate_capabilities, + }, + { + .name = "migrate_cache_size", + .args_type = "", + .params = "", + .help = "show current migration xbzrle cache size", + .mhandler.info = hmp_info_migrate_cache_size, + }, + { .name = "balloon", .args_type = "", .params = "", diff --git a/page_cache.c b/page_cache.c new file mode 100644 index 0000000000..0294f7e9f6 --- /dev/null +++ b/page_cache.c @@ -0,0 +1,218 @@ +/* + * Page cache for QEMU + * The cache is base on a hash of the page address + * + * Copyright 2012 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Orit Wasserman <owasserm@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <string.h> +#include <sys/time.h> +#include <sys/types.h> +#include <stdbool.h> +#include <glib.h> +#include <strings.h> + +#include "qemu-common.h" +#include "qemu/page_cache.h" + +#ifdef DEBUG_CACHE +#define DPRINTF(fmt, ...) \ + do { fprintf(stdout, "cache: " fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) \ + do { } while (0) +#endif + +typedef struct CacheItem CacheItem; + +struct CacheItem { + uint64_t it_addr; + uint64_t it_age; + uint8_t *it_data; +}; + +struct PageCache { + CacheItem *page_cache; + unsigned int page_size; + int64_t max_num_items; + uint64_t max_item_age; + int64_t num_items; +}; + +PageCache *cache_init(int64_t num_pages, unsigned int page_size) +{ + int64_t i; + + PageCache *cache; + + if (num_pages <= 0) { + DPRINTF("invalid number of pages\n"); + return NULL; + } + + cache = g_malloc(sizeof(*cache)); + + /* round down to the nearest power of 2 */ + if (!is_power_of_2(num_pages)) { + num_pages = pow2floor(num_pages); + DPRINTF("rounding down to %" PRId64 "\n", num_pages); + } + cache->page_size = page_size; + cache->num_items = 0; + cache->max_item_age = 0; + cache->max_num_items = num_pages; + + DPRINTF("Setting cache buckets to %" PRId64 "\n", cache->max_num_items); + + cache->page_cache = g_malloc((cache->max_num_items) * + sizeof(*cache->page_cache)); + + for (i = 0; i < cache->max_num_items; i++) { + cache->page_cache[i].it_data = NULL; + cache->page_cache[i].it_age = 0; + cache->page_cache[i].it_addr = -1; + } + + return cache; +} + +void cache_fini(PageCache *cache) +{ + int64_t i; + + g_assert(cache); + g_assert(cache->page_cache); + + for (i = 0; i < cache->max_num_items; i++) { + g_free(cache->page_cache[i].it_data); + } + + g_free(cache->page_cache); + cache->page_cache = NULL; +} + +static size_t cache_get_cache_pos(const PageCache *cache, + uint64_t address) +{ + size_t pos; + + g_assert(cache->max_num_items); + pos = (address / cache->page_size) & (cache->max_num_items - 1); + return pos; +} + +bool cache_is_cached(const PageCache *cache, uint64_t addr) +{ + size_t pos; + + g_assert(cache); + g_assert(cache->page_cache); + + pos = cache_get_cache_pos(cache, addr); + + return (cache->page_cache[pos].it_addr == addr); +} + +static CacheItem *cache_get_by_addr(const PageCache *cache, uint64_t addr) +{ + size_t pos; + + g_assert(cache); + g_assert(cache->page_cache); + + pos = cache_get_cache_pos(cache, addr); + + return &cache->page_cache[pos]; +} + +uint8_t *get_cached_data(const PageCache *cache, uint64_t addr) +{ + return cache_get_by_addr(cache, addr)->it_data; +} + +void cache_insert(PageCache *cache, uint64_t addr, uint8_t *pdata) +{ + + CacheItem *it = NULL; + + g_assert(cache); + g_assert(cache->page_cache); + + /* actual update of entry */ + it = cache_get_by_addr(cache, addr); + + if (!it->it_data) { + cache->num_items++; + } + + it->it_data = pdata; + it->it_age = ++cache->max_item_age; + it->it_addr = addr; +} + +int64_t cache_resize(PageCache *cache, int64_t new_num_pages) +{ + PageCache *new_cache; + int64_t i; + + CacheItem *old_it, *new_it; + + g_assert(cache); + + /* cache was not inited */ + if (cache->page_cache == NULL) { + return -1; + } + + /* same size */ + if (pow2floor(new_num_pages) == cache->max_num_items) { + return cache->max_num_items; + } + + new_cache = cache_init(new_num_pages, cache->page_size); + if (!(new_cache)) { + DPRINTF("Error creating new cache\n"); + return -1; + } + + /* move all data from old cache */ + for (i = 0; i < cache->max_num_items; i++) { + old_it = &cache->page_cache[i]; + if (old_it->it_addr != -1) { + /* check for collision, if there is, keep MRU page */ + new_it = cache_get_by_addr(new_cache, old_it->it_addr); + if (new_it->it_data) { + /* keep the MRU page */ + if (new_it->it_age >= old_it->it_age) { + g_free(old_it->it_data); + } else { + g_free(new_it->it_data); + new_it->it_data = old_it->it_data; + new_it->it_age = old_it->it_age; + new_it->it_addr = old_it->it_addr; + } + } else { + cache_insert(new_cache, old_it->it_addr, old_it->it_data); + } + } + } + + cache->page_cache = new_cache->page_cache; + cache->max_num_items = new_cache->max_num_items; + cache->num_items = new_cache->num_items; + + g_free(new_cache); + + return cache->max_num_items; +} diff --git a/qapi-schema.json b/qapi-schema.json index 9347b6a499..53bbe46e4d 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -290,15 +290,43 @@ # # @total: total amount of bytes involved in the migration process # -# @total_time: tota0l amount of ms since migration started. If +# @total-time: total amount of ms since migration started. If # migration has ended, it returns the total migration # time. (since 1.2) # -# Since: 0.14.0. +# @duplicate: number of duplicate pages (since 1.2) +# +# @normal : number of normal pages (since 1.2) +# +# @normal-bytes : number of normal bytes sent (since 1.2) +# +# Since: 0.14.0 ## { 'type': 'MigrationStats', 'data': {'transferred': 'int', 'remaining': 'int', 'total': 'int' , - 'total_time': 'int' } } + 'total-time': 'int', 'duplicate': 'int', 'normal': 'int', + 'normal-bytes': 'int' } } + +## +# @XBZRLECacheStats +# +# Detailed XBZRLE migration cache statistics +# +# @cache-size: XBZRLE cache size +# +# @bytes: amount of bytes already transferred to the target VM +# +# @pages: amount of pages transferred to the target VM +# +# @cache-miss: number of cache miss +# +# @overflow: number of overflows +# +# Since: 1.2 +## +{ 'type': 'XBZRLECacheStats', + 'data': {'cache-size': 'int', 'bytes': 'int', 'pages': 'int', + 'cache-miss': 'int', 'overflow': 'int' } } ## # @MigrationInfo @@ -318,11 +346,16 @@ # status, only returned if status is 'active' and it is a block # migration # +# @xbzrle-cache: #optional @XBZRLECacheStats containing detailed XBZRLE +# migration statistics, only returned if XBZRLE feature is on and +# status is 'active' or 'completed' (since 1.2) +# # Since: 0.14.0 ## { 'type': 'MigrationInfo', 'data': {'*status': 'str', '*ram': 'MigrationStats', - '*disk': 'MigrationStats'} } + '*disk': 'MigrationStats', + '*xbzrle-cache': 'XBZRLECacheStats'} } ## # @query-migrate @@ -336,6 +369,57 @@ { 'command': 'query-migrate', 'returns': 'MigrationInfo' } ## +# @MigrationCapability +# +# Migration capabilities enumeration +# +# @xbzrle: Migration supports xbzrle (Xor Based Zero Run Length Encoding). +# This feature allows us to minimize migration traffic for certain work +# loads, by sending compressed difference of the pages +# +# Since: 1.2 +## +{ 'enum': 'MigrationCapability', + 'data': ['xbzrle'] } + +## +# @MigrationCapabilityStatus +# +# Migration capability information +# +# @capability: capability enum +# +# @state: capability state bool +# +# Since: 1.2 +## +{ 'type': 'MigrationCapabilityStatus', + 'data': { 'capability' : 'MigrationCapability', 'state' : 'bool' } } + +## +# @migrate-set-capabilities +# +# Enable/Disable the following migration capabilities (like xbzrle) +# +# @capabilities: json array of capability modifications to make +# +# Since: 1.2 +## +{ 'command': 'migrate-set-capabilities', + 'data': { 'capabilities': ['MigrationCapabilityStatus'] } } + +## +# @query-migrate-capabilities +# +# Returns information about the current migration capabilities status +# +# Returns: @MigrationCapabilitiesStatus +# +# Since: 1.2 +## +{ 'command': 'query-migrate-capabilities', 'returns': ['MigrationCapabilityStatus']} + +## # @MouseInfo: # # Information about a mouse device. @@ -1358,6 +1442,33 @@ { 'command': 'migrate_set_speed', 'data': {'value': 'int'} } ## +# @migrate-set-cache-size +# +# Set XBZRLE cache size +# +# @value: cache size in bytes +# +# The size will be rounded down to the nearest power of 2. +# The cache size can be modified before and during ongoing migration +# +# Returns: nothing on success +# +# Since: 1.2 +## +{ 'command': 'migrate-set-cache-size', 'data': {'value': 'int'} } + +## +# @query-migrate-cache-size +# +# query XBZRLE cache size +# +# Returns: XBZRLE cache size in bytes +# +# Since: 1.2 +## +{ 'command': 'query-migrate-cache-size', 'returns': 'int' } + +## # @ObjectPropertyInfo: # # @name: the name of the property diff --git a/qemu-common.h b/qemu-common.h index f9deca6f86..095e28d89a 100644 --- a/qemu-common.h +++ b/qemu-common.h @@ -1,3 +1,4 @@ + /* Common header file that is included by all of qemu. */ #ifndef QEMU_COMMON_H #define QEMU_COMMON_H @@ -429,6 +430,26 @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) /* Round number up to multiple */ #define QEMU_ALIGN_UP(n, m) QEMU_ALIGN_DOWN((n) + (m) - 1, (m)) +static inline bool is_power_of_2(uint64_t value) +{ + if (!value) { + return 0; + } + + return !(value & (value - 1)); +} + +/* round down to the nearest power of 2*/ +int64_t pow2floor(int64_t value); + #include "module.h" +/* + * Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128) + * Input is limited to 14-bit numbers + */ + +int uleb128_encode_small(uint8_t *out, uint32_t n); +int uleb128_decode_small(const uint8_t *in, uint32_t *n); + #endif diff --git a/qmp-commands.hx b/qmp-commands.hx index 521da1055f..527b9f7c24 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -520,6 +520,50 @@ Example: <- { "return": {} } EQMP +{ + .name = "migrate-set-cache-size", + .args_type = "value:o", + .mhandler.cmd_new = qmp_marshal_input_migrate_set_cache_size, + }, + +SQMP +migrate-set-cache-size +--------------------- + +Set cache size to be used by XBZRLE migration, the cache size will be rounded +down to the nearest power of 2 + +Arguments: + +- "value": cache size in bytes (json-int) + +Example: + +-> { "execute": "migrate-set-cache-size", "arguments": { "value": 536870912 } } +<- { "return": {} } + +EQMP + { + .name = "query-migrate-cache-size", + .args_type = "", + .mhandler.cmd_new = qmp_marshal_input_query_migrate_cache_size, + }, + +SQMP +query-migrate-cache-size +--------------------- + +Show cache size to be used by XBZRLE migration + +returns a json-object with the following information: +- "size" : json-int + +Example: + +-> { "execute": "query-migrate-cache-size" } +<- { "return": 67108864 } + +EQMP { .name = "migrate_set_speed", @@ -2078,12 +2122,24 @@ The main json-object contains the following: - "transferred": amount transferred (json-int) - "remaining": amount remaining (json-int) - "total": total (json-int) + - "total-time": total amount of ms since migration started. If + migration has ended, it returns the total migration time + (json-int) + - "duplicate": number of duplicated pages (json-int) + - "normal" : number of normal pages transferred (json-int) + - "normal-bytes" : number of normal bytes transferred (json-int) - "disk": only present if "status" is "active" and it is a block migration, it is a json-object with the following disk information (in bytes): - "transferred": amount transferred (json-int) - "remaining": amount remaining (json-int) - "total": total (json-int) - +- "xbzrle-cache": only present if XBZRLE is active. + It is a json-object with the following XBZRLE information: + - "cache-size": XBZRLE cache size + - "bytes": total XBZRLE bytes transferred + - "pages": number of XBZRLE compressed pages + - "cache-miss": number of cache misses + - "overflow": number of XBZRLE overflows Examples: 1. Before the first migration @@ -2094,7 +2150,19 @@ Examples: 2. Migration is done and has succeeded -> { "execute": "query-migrate" } -<- { "return": { "status": "completed" } } +<- { "return": { + "status": "completed", + "ram":{ + "transferred":123, + "remaining":123, + "total":246, + "total-time":12345, + "duplicate":123, + "normal":123, + "normal-bytes":123456 + } + } + } 3. Migration is done and has failed @@ -2110,7 +2178,11 @@ Examples: "ram":{ "transferred":123, "remaining":123, - "total":246 + "total":246, + "total-time":12345, + "duplicate":123, + "normal":123, + "normal-bytes":123456 } } } @@ -2124,7 +2196,11 @@ Examples: "ram":{ "total":1057024, "remaining":1053304, - "transferred":3720 + "transferred":3720, + "total-time":12345, + "duplicate":123, + "normal":123, + "normal-bytes":123456 }, "disk":{ "total":20971520, @@ -2134,6 +2210,32 @@ Examples: } } +6. Migration is being performed and XBZRLE is active: + +-> { "execute": "query-migrate" } +<- { + "return":{ + "status":"active", + "capabilities" : [ { "capability": "xbzrle", "state" : true } ], + "ram":{ + "total":1057024, + "remaining":1053304, + "transferred":3720, + "total-time":12345, + "duplicate":10, + "normal":3333, + "normal-bytes":3412992 + }, + "xbzrle-cache":{ + "cache-size":67108864, + "bytes":20971520, + "pages":2444343, + "cache-miss":2244, + "overflow":34434 + } + } + } + EQMP { @@ -2143,6 +2245,55 @@ EQMP }, SQMP +migrate-set-capabilities +------- + +Enable/Disable migration capabilities + +- "xbzrle": xbzrle support + +Arguments: + +Example: + +-> { "execute": "migrate-set-capabilities" , "arguments": + { "capabilities": [ { "capability": "xbzrle", "state": true } ] } } + +EQMP + + { + .name = "migrate-set-capabilities", + .args_type = "capabilities:O", + .params = "capability:s,state:b", + .mhandler.cmd_new = qmp_marshal_input_migrate_set_capabilities, + }, +SQMP +query-migrate-capabilities +------- + +Query current migration capabilities + +- "capabilities": migration capabilities state + - "xbzrle" : XBZRLE state (json-bool) + +Arguments: + +Example: + +-> { "execute": "query-migrate-capabilities" } +<- { "return": { + "capabilities" : [ { "capability" : "xbzrle", "state" : false } ] + } + } +EQMP + + { + .name = "query-migrate-capabilities", + .args_type = "", + .mhandler.cmd_new = qmp_marshal_input_query_migrate_capabilities, + }, + +SQMP query-balloon ------------- @@ -2392,3 +2392,162 @@ void vmstate_register_ram_global(MemoryRegion *mr) { vmstate_register_ram(mr, NULL); } + +/* + page = zrun nzrun + | zrun nzrun page + + zrun = length + + nzrun = length byte... + + length = uleb128 encoded integer + */ +int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen, + uint8_t *dst, int dlen) +{ + uint32_t zrun_len = 0, nzrun_len = 0; + int d = 0, i = 0; + long res, xor; + uint8_t *nzrun_start = NULL; + + g_assert(!(((uintptr_t)old_buf | (uintptr_t)new_buf | slen) % + sizeof(long))); + + while (i < slen) { + /* overflow */ + if (d + 2 > dlen) { + return -1; + } + + /* not aligned to sizeof(long) */ + res = (slen - i) % sizeof(long); + while (res && old_buf[i] == new_buf[i]) { + zrun_len++; + i++; + res--; + } + + /* word at a time for speed */ + if (!res) { + while (i < slen && + (*(long *)(old_buf + i)) == (*(long *)(new_buf + i))) { + i += sizeof(long); + zrun_len += sizeof(long); + } + + /* go over the rest */ + while (i < slen && old_buf[i] == new_buf[i]) { + zrun_len++; + i++; + } + } + + /* buffer unchanged */ + if (zrun_len == slen) { + return 0; + } + + /* skip last zero run */ + if (i == slen) { + return d; + } + + d += uleb128_encode_small(dst + d, zrun_len); + + zrun_len = 0; + nzrun_start = new_buf + i; + + /* overflow */ + if (d + 2 > dlen) { + return -1; + } + /* not aligned to sizeof(long) */ + res = (slen - i) % sizeof(long); + while (res && old_buf[i] != new_buf[i]) { + i++; + nzrun_len++; + res--; + } + + /* word at a time for speed, use of 32-bit long okay */ + if (!res) { + /* truncation to 32-bit long okay */ + long mask = 0x0101010101010101ULL; + while (i < slen) { + xor = *(long *)(old_buf + i) ^ *(long *)(new_buf + i); + if ((xor - mask) & ~xor & (mask << 7)) { + /* found the end of an nzrun within the current long */ + while (old_buf[i] != new_buf[i]) { + nzrun_len++; + i++; + } + break; + } else { + i += sizeof(long); + nzrun_len += sizeof(long); + } + } + } + + d += uleb128_encode_small(dst + d, nzrun_len); + /* overflow */ + if (d + nzrun_len > dlen) { + return -1; + } + memcpy(dst + d, nzrun_start, nzrun_len); + d += nzrun_len; + nzrun_len = 0; + } + + return d; +} + +int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen) +{ + int i = 0, d = 0; + int ret; + uint32_t count = 0; + + while (i < slen) { + + /* zrun */ + if ((slen - i) < 2) { + return -1; + } + + ret = uleb128_decode_small(src + i, &count); + if (ret < 0 || (i && !count)) { + return -1; + } + i += ret; + d += count; + + /* overflow */ + if (d > dlen) { + return -1; + } + + /* nzrun */ + if ((slen - i) < 2) { + return -1; + } + + ret = uleb128_decode_small(src + i, &count); + if (ret < 0 || !count) { + return -1; + } + i += ret; + + /* overflow */ + if (d + count > dlen || i + count > slen) { + return -1; + } + + memcpy(dst + d, src + i, count); + d += count; + i += count; + } + + return d; +} diff --git a/target-i386/Makefile.objs b/target-i386/Makefile.objs index 683fd59af9..0715f58a8e 100644 --- a/target-i386/Makefile.objs +++ b/target-i386/Makefile.objs @@ -3,6 +3,7 @@ obj-y += excp_helper.o fpu_helper.o cc_helper.o int_helper.o svm_helper.o obj-y += smm_helper.o misc_helper.o mem_helper.o seg_helper.o obj-$(CONFIG_SOFTMMU) += machine.o arch_memory_mapping.o arch_dump.o obj-$(CONFIG_KVM) += kvm.o hyperv.o +obj-$(CONFIG_NO_KVM) += kvm-stub.o obj-$(CONFIG_LINUX_USER) += ioport-user.o obj-$(CONFIG_BSD_USER) += ioport-user.o diff --git a/target-i386/kvm-stub.c b/target-i386/kvm-stub.c new file mode 100644 index 0000000000..11429c461e --- /dev/null +++ b/target-i386/kvm-stub.c @@ -0,0 +1,18 @@ +/* + * QEMU KVM x86 specific function stubs + * + * Copyright Linaro Limited 2012 + * + * Author: Peter Maydell <peter.maydell@linaro.org> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ +#include "qemu-common.h" +#include "kvm_i386.h" + +bool kvm_allows_irq0_override(void) +{ + return 1; +} diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 4cfb3faf01..696b14a04a 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -23,6 +23,7 @@ #include "qemu-common.h" #include "sysemu.h" #include "kvm.h" +#include "kvm_i386.h" #include "cpu.h" #include "gdbstub.h" #include "host-utils.h" @@ -65,6 +66,11 @@ static bool has_msr_async_pf_en; static bool has_msr_misc_enable; static int lm_capable_kernel; +bool kvm_allows_irq0_override(void) +{ + return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing(); +} + static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max) { struct kvm_cpuid2 *cpuid; @@ -2041,4 +2047,11 @@ void kvm_arch_init_irq_routing(KVMState *s) */ no_hpet = 1; } + /* We know at this point that we're using the in-kernel + * irqchip, so we can use irqfds, and on x86 we know + * we can use msi via irqfd and GSI routing. + */ + kvm_irqfds_allowed = true; + kvm_msi_via_irqfd_allowed = true; + kvm_gsi_routing_allowed = true; } diff --git a/target-i386/kvm_i386.h b/target-i386/kvm_i386.h new file mode 100644 index 0000000000..b82bbf401e --- /dev/null +++ b/target-i386/kvm_i386.h @@ -0,0 +1,16 @@ +/* + * QEMU KVM support -- x86 specific functions. + * + * Copyright (c) 2012 Linaro Limited + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_KVM_I386_H +#define QEMU_KVM_I386_H + +bool kvm_allows_irq0_override(void); + +#endif |