diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2020-09-25 14:46:18 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2020-09-25 14:46:18 +0100 |
commit | 8d16e72f2d4df2c9e631393adf1669a1da7efe8a (patch) | |
tree | badeaa53bb34f24540bf03dc00e536483eaa61f1 | |
parent | 8c1c07929feae876202ba26f07a540c5115c18cd (diff) | |
parent | e12a0edafeb5019aac74114b62a4703f79c5c693 (diff) |
Merge remote-tracking branch 'remotes/dgilbert/tags/pull-migration-20200925a' into staging
Migration and virtiofsd pull
Chuan Zheng's Dirtyrate and TLS changes, with small fixes from Dov and
Laurent.
Small virtiofs changes from Harry, Stefan, Vivek and Jiachen.
One HMP/monitor rework from me.
# gpg: Signature made Fri 25 Sep 2020 13:03:50 BST
# gpg: using RSA key 45F5C71B4A0CB7FB977A9FA90516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>" [full]
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A 9FA9 0516 331E BC5B FDE7
* remotes/dgilbert/tags/pull-migration-20200925a: (26 commits)
virtiofsd: Add -o allow_direct_io|no_allow_direct_io options
virtiofsd: Used glib "shared" thread pool
virtiofsd: document cache=auto default
monitor: Use LOCK_GUARD macros
migration/tls: add trace points for multifd-tls
migration/tls: add support for multifd tls-handshake
migration/tls: extract cleanup function for common-use
migration/tls: add tls_hostname into MultiFDSendParams
migration/tls: extract migration_tls_client_create for common-use
migration/tls: save hostname into MigrationState
migration: increase max-bandwidth to 128 MiB/s (1 Gib/s)
migration: Truncate state file in xen-save-devices-state
migration/dirtyrate: Add trace_calls to make it easier to debug
migration/dirtyrate: Implement qmp_cal_dirty_rate()/qmp_get_dirty_rate() function
migration/dirtyrate: Implement calculate_dirtyrate() function
migration/dirtyrate: Implement set_sample_page_period() and is_sample_period_valid()
migration/dirtyrate: skip sampling ramblock with size below MIN_RAMBLOCK_SIZE
migration/dirtyrate: Compare page hash results for recorded sampled page
migration/dirtyrate: Record hash results for each sampled page
migration/dirtyrate: move RAMBLOCK_FOREACH_MIGRATABLE into ram.h
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- | docs/tools/virtiofsd.rst | 1 | ||||
-rw-r--r-- | migration/channel.c | 1 | ||||
-rw-r--r-- | migration/dirtyrate.c | 426 | ||||
-rw-r--r-- | migration/dirtyrate.h | 69 | ||||
-rw-r--r-- | migration/meson.build | 2 | ||||
-rw-r--r-- | migration/migration.c | 36 | ||||
-rw-r--r-- | migration/migration.h | 9 | ||||
-rw-r--r-- | migration/multifd.c | 124 | ||||
-rw-r--r-- | migration/multifd.h | 2 | ||||
-rw-r--r-- | migration/postcopy-ram.c | 24 | ||||
-rw-r--r-- | migration/ram.c | 11 | ||||
-rw-r--r-- | migration/ram.h | 10 | ||||
-rw-r--r-- | migration/savevm.c | 3 | ||||
-rw-r--r-- | migration/tls.c | 28 | ||||
-rw-r--r-- | migration/tls.h | 6 | ||||
-rw-r--r-- | migration/trace-events | 12 | ||||
-rw-r--r-- | monitor/misc.c | 44 | ||||
-rw-r--r-- | qapi/migration.json | 67 | ||||
-rw-r--r-- | tools/virtiofsd/fuse_virtio.c | 2 | ||||
-rw-r--r-- | tools/virtiofsd/helper.c | 4 | ||||
-rw-r--r-- | tools/virtiofsd/passthrough_ll.c | 20 | ||||
-rw-r--r-- | tools/virtiofsd/passthrough_seccomp.c | 2 |
22 files changed, 797 insertions, 106 deletions
diff --git a/docs/tools/virtiofsd.rst b/docs/tools/virtiofsd.rst index 7fe6a87291..ae02938a95 100644 --- a/docs/tools/virtiofsd.rst +++ b/docs/tools/virtiofsd.rst @@ -103,6 +103,7 @@ Options forbids the FUSE client from caching to achieve best coherency at the cost of performance. ``auto`` acts similar to NFS with a 1 second metadata cache timeout. ``always`` sets a long cache lifetime at the expense of coherency. + The default is ``auto``. Examples -------- diff --git a/migration/channel.c b/migration/channel.c index 20e4c8e2dc..8a783baa0b 100644 --- a/migration/channel.c +++ b/migration/channel.c @@ -90,5 +90,6 @@ void migration_channel_connect(MigrationState *s, } } migrate_fd_connect(s, error); + g_free(s->hostname); error_free(error); } diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c new file mode 100644 index 0000000000..68577ef250 --- /dev/null +++ b/migration/dirtyrate.c @@ -0,0 +1,426 @@ +/* + * Dirtyrate implement code + * + * Copyright (c) 2020 HUAWEI TECHNOLOGIES CO.,LTD. + * + * Authors: + * Chuan Zheng <zhengchuan@huawei.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include <zlib.h> +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "cpu.h" +#include "qemu/config-file.h" +#include "exec/memory.h" +#include "exec/ramblock.h" +#include "exec/target_page.h" +#include "qemu/rcu_queue.h" +#include "qapi/qapi-commands-migration.h" +#include "migration.h" +#include "ram.h" +#include "trace.h" +#include "dirtyrate.h" + +static int CalculatingState = DIRTY_RATE_STATUS_UNSTARTED; +static struct DirtyRateStat DirtyStat; + +static int64_t set_sample_page_period(int64_t msec, int64_t initial_time) +{ + int64_t current_time; + + current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + if ((current_time - initial_time) >= msec) { + msec = current_time - initial_time; + } else { + g_usleep((msec + initial_time - current_time) * 1000); + } + + return msec; +} + +static bool is_sample_period_valid(int64_t sec) +{ + if (sec < MIN_FETCH_DIRTYRATE_TIME_SEC || + sec > MAX_FETCH_DIRTYRATE_TIME_SEC) { + return false; + } + + return true; +} + +static int dirtyrate_set_state(int *state, int old_state, int new_state) +{ + assert(new_state < DIRTY_RATE_STATUS__MAX); + trace_dirtyrate_set_state(DirtyRateStatus_str(new_state)); + if (qatomic_cmpxchg(state, old_state, new_state) == old_state) { + return 0; + } else { + return -1; + } +} + +static struct DirtyRateInfo *query_dirty_rate_info(void) +{ + int64_t dirty_rate = DirtyStat.dirty_rate; + struct DirtyRateInfo *info = g_malloc0(sizeof(DirtyRateInfo)); + + if (qatomic_read(&CalculatingState) == DIRTY_RATE_STATUS_MEASURED) { + info->dirty_rate = dirty_rate; + } else { + info->dirty_rate = -1; + } + + info->status = CalculatingState; + info->start_time = DirtyStat.start_time; + info->calc_time = DirtyStat.calc_time; + + trace_query_dirty_rate_info(DirtyRateStatus_str(CalculatingState)); + + return info; +} + +static void reset_dirtyrate_stat(void) +{ + DirtyStat.total_dirty_samples = 0; + DirtyStat.total_sample_count = 0; + DirtyStat.total_block_mem_MB = 0; + DirtyStat.dirty_rate = -1; 
+ DirtyStat.start_time = 0; + DirtyStat.calc_time = 0; +} + +static void update_dirtyrate_stat(struct RamblockDirtyInfo *info) +{ + DirtyStat.total_dirty_samples += info->sample_dirty_count; + DirtyStat.total_sample_count += info->sample_pages_count; + /* size of total pages in MB */ + DirtyStat.total_block_mem_MB += (info->ramblock_pages * + TARGET_PAGE_SIZE) >> 20; +} + +static void update_dirtyrate(uint64_t msec) +{ + uint64_t dirtyrate; + uint64_t total_dirty_samples = DirtyStat.total_dirty_samples; + uint64_t total_sample_count = DirtyStat.total_sample_count; + uint64_t total_block_mem_MB = DirtyStat.total_block_mem_MB; + + dirtyrate = total_dirty_samples * total_block_mem_MB * + 1000 / (total_sample_count * msec); + + DirtyStat.dirty_rate = dirtyrate; +} + +/* + * get hash result for the sampled memory with length of TARGET_PAGE_SIZE + * in ramblock, which starts from ramblock base address. + */ +static uint32_t get_ramblock_vfn_hash(struct RamblockDirtyInfo *info, + uint64_t vfn) +{ + uint32_t crc; + + crc = crc32(0, (info->ramblock_addr + + vfn * TARGET_PAGE_SIZE), TARGET_PAGE_SIZE); + + trace_get_ramblock_vfn_hash(info->idstr, vfn, crc); + return crc; +} + +static bool save_ramblock_hash(struct RamblockDirtyInfo *info) +{ + unsigned int sample_pages_count; + int i; + GRand *rand; + + sample_pages_count = info->sample_pages_count; + + /* ramblock size less than one page, return success to skip this ramblock */ + if (unlikely(info->ramblock_pages == 0 || sample_pages_count == 0)) { + return true; + } + + info->hash_result = g_try_malloc0_n(sample_pages_count, + sizeof(uint32_t)); + if (!info->hash_result) { + return false; + } + + info->sample_page_vfn = g_try_malloc0_n(sample_pages_count, + sizeof(uint64_t)); + if (!info->sample_page_vfn) { + g_free(info->hash_result); + return false; + } + + rand = g_rand_new(); + for (i = 0; i < sample_pages_count; i++) { + info->sample_page_vfn[i] = g_rand_int_range(rand, 0, + info->ramblock_pages - 1); + 
info->hash_result[i] = get_ramblock_vfn_hash(info, + info->sample_page_vfn[i]); + } + g_rand_free(rand); + + return true; +} + +static void get_ramblock_dirty_info(RAMBlock *block, + struct RamblockDirtyInfo *info, + struct DirtyRateConfig *config) +{ + uint64_t sample_pages_per_gigabytes = config->sample_pages_per_gigabytes; + + /* Right shift 30 bits to calc ramblock size in GB */ + info->sample_pages_count = (qemu_ram_get_used_length(block) * + sample_pages_per_gigabytes) >> 30; + /* Right shift TARGET_PAGE_BITS to calc page count */ + info->ramblock_pages = qemu_ram_get_used_length(block) >> + TARGET_PAGE_BITS; + info->ramblock_addr = qemu_ram_get_host_addr(block); + strcpy(info->idstr, qemu_ram_get_idstr(block)); +} + +static void free_ramblock_dirty_info(struct RamblockDirtyInfo *infos, int count) +{ + int i; + + if (!infos) { + return; + } + + for (i = 0; i < count; i++) { + g_free(infos[i].sample_page_vfn); + g_free(infos[i].hash_result); + } + g_free(infos); +} + +static bool skip_sample_ramblock(RAMBlock *block) +{ + /* + * Sample only blocks larger than MIN_RAMBLOCK_SIZE. 
+ */ + if (qemu_ram_get_used_length(block) < (MIN_RAMBLOCK_SIZE << 10)) { + trace_skip_sample_ramblock(block->idstr, + qemu_ram_get_used_length(block)); + return true; + } + + return false; +} + +static bool record_ramblock_hash_info(struct RamblockDirtyInfo **block_dinfo, + struct DirtyRateConfig config, + int *block_count) +{ + struct RamblockDirtyInfo *info = NULL; + struct RamblockDirtyInfo *dinfo = NULL; + RAMBlock *block = NULL; + int total_count = 0; + int index = 0; + bool ret = false; + + RAMBLOCK_FOREACH_MIGRATABLE(block) { + if (skip_sample_ramblock(block)) { + continue; + } + total_count++; + } + + dinfo = g_try_malloc0_n(total_count, sizeof(struct RamblockDirtyInfo)); + if (dinfo == NULL) { + goto out; + } + + RAMBLOCK_FOREACH_MIGRATABLE(block) { + if (skip_sample_ramblock(block)) { + continue; + } + if (index >= total_count) { + break; + } + info = &dinfo[index]; + get_ramblock_dirty_info(block, info, &config); + if (!save_ramblock_hash(info)) { + goto out; + } + index++; + } + ret = true; + +out: + *block_count = index; + *block_dinfo = dinfo; + return ret; +} + +static void calc_page_dirty_rate(struct RamblockDirtyInfo *info) +{ + uint32_t crc; + int i; + + for (i = 0; i < info->sample_pages_count; i++) { + crc = get_ramblock_vfn_hash(info, info->sample_page_vfn[i]); + if (crc != info->hash_result[i]) { + trace_calc_page_dirty_rate(info->idstr, crc, info->hash_result[i]); + info->sample_dirty_count++; + } + } +} + +static struct RamblockDirtyInfo * +find_block_matched(RAMBlock *block, int count, + struct RamblockDirtyInfo *infos) +{ + int i; + struct RamblockDirtyInfo *matched; + + for (i = 0; i < count; i++) { + if (!strcmp(infos[i].idstr, qemu_ram_get_idstr(block))) { + break; + } + } + + if (i == count) { + return NULL; + } + + if (infos[i].ramblock_addr != qemu_ram_get_host_addr(block) || + infos[i].ramblock_pages != + (qemu_ram_get_used_length(block) >> TARGET_PAGE_BITS)) { + trace_find_page_matched(block->idstr); + return NULL; + } + + matched 
= &infos[i]; + + return matched; +} + +static bool compare_page_hash_info(struct RamblockDirtyInfo *info, + int block_count) +{ + struct RamblockDirtyInfo *block_dinfo = NULL; + RAMBlock *block = NULL; + + RAMBLOCK_FOREACH_MIGRATABLE(block) { + if (skip_sample_ramblock(block)) { + continue; + } + block_dinfo = find_block_matched(block, block_count, info); + if (block_dinfo == NULL) { + continue; + } + calc_page_dirty_rate(block_dinfo); + update_dirtyrate_stat(block_dinfo); + } + + if (DirtyStat.total_sample_count == 0) { + return false; + } + + return true; +} + +static void calculate_dirtyrate(struct DirtyRateConfig config) +{ + struct RamblockDirtyInfo *block_dinfo = NULL; + int block_count = 0; + int64_t msec = 0; + int64_t initial_time; + + rcu_register_thread(); + reset_dirtyrate_stat(); + rcu_read_lock(); + initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + if (!record_ramblock_hash_info(&block_dinfo, config, &block_count)) { + goto out; + } + rcu_read_unlock(); + + msec = config.sample_period_seconds * 1000; + msec = set_sample_page_period(msec, initial_time); + DirtyStat.start_time = initial_time / 1000; + DirtyStat.calc_time = msec / 1000; + + rcu_read_lock(); + if (!compare_page_hash_info(block_dinfo, block_count)) { + goto out; + } + + update_dirtyrate(msec); + +out: + rcu_read_unlock(); + free_ramblock_dirty_info(block_dinfo, block_count); + rcu_unregister_thread(); +} + +void *get_dirtyrate_thread(void *arg) +{ + struct DirtyRateConfig config = *(struct DirtyRateConfig *)arg; + int ret; + + ret = dirtyrate_set_state(&CalculatingState, DIRTY_RATE_STATUS_UNSTARTED, + DIRTY_RATE_STATUS_MEASURING); + if (ret == -1) { + error_report("change dirtyrate state failed."); + return NULL; + } + + calculate_dirtyrate(config); + + ret = dirtyrate_set_state(&CalculatingState, DIRTY_RATE_STATUS_MEASURING, + DIRTY_RATE_STATUS_MEASURED); + if (ret == -1) { + error_report("change dirtyrate state failed."); + } + return NULL; +} + +void qmp_calc_dirty_rate(int64_t 
calc_time, Error **errp) +{ + static struct DirtyRateConfig config; + QemuThread thread; + int ret; + + /* + * If the dirty rate is already being measured, don't attempt to start. + */ + if (qatomic_read(&CalculatingState) == DIRTY_RATE_STATUS_MEASURING) { + error_setg(errp, "the dirty rate is already being measured."); + return; + } + + if (!is_sample_period_valid(calc_time)) { + error_setg(errp, "calc-time is out of range[%d, %d].", + MIN_FETCH_DIRTYRATE_TIME_SEC, + MAX_FETCH_DIRTYRATE_TIME_SEC); + return; + } + + /* + * Init calculation state as unstarted. + */ + ret = dirtyrate_set_state(&CalculatingState, CalculatingState, + DIRTY_RATE_STATUS_UNSTARTED); + if (ret == -1) { + error_setg(errp, "init dirty rate calculation state failed."); + return; + } + + config.sample_period_seconds = calc_time; + config.sample_pages_per_gigabytes = DIRTYRATE_DEFAULT_SAMPLE_PAGES; + qemu_thread_create(&thread, "get_dirtyrate", get_dirtyrate_thread, + (void *)&config, QEMU_THREAD_DETACHED); +} + +struct DirtyRateInfo *qmp_query_dirty_rate(Error **errp) +{ + return query_dirty_rate_info(); +} diff --git a/migration/dirtyrate.h b/migration/dirtyrate.h new file mode 100644 index 0000000000..6ec429534d --- /dev/null +++ b/migration/dirtyrate.h @@ -0,0 +1,69 @@ +/* + * Dirtyrate common functions + * + * Copyright (c) 2020 HUAWEI TECHNOLOGIES CO., LTD. + * + * Authors: + * Chuan Zheng <zhengchuan@huawei.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_MIGRATION_DIRTYRATE_H +#define QEMU_MIGRATION_DIRTYRATE_H + +/* + * Sample 512 pages per GB as default. + * TODO: Make it configurable. + */ +#define DIRTYRATE_DEFAULT_SAMPLE_PAGES 512 + +/* + * Record ramblock idstr + */ +#define RAMBLOCK_INFO_MAX_LEN 256 + +/* + * Minimum RAMBlock size to sample, in megabytes. 
+ */ +#define MIN_RAMBLOCK_SIZE 128 + +/* + * Take 1s as minimum time for calculation duration + */ +#define MIN_FETCH_DIRTYRATE_TIME_SEC 1 +#define MAX_FETCH_DIRTYRATE_TIME_SEC 60 + +struct DirtyRateConfig { + uint64_t sample_pages_per_gigabytes; /* sample pages per GB */ + int64_t sample_period_seconds; /* time duration between two sampling */ +}; + +/* + * Store dirtypage info for each ramblock. + */ +struct RamblockDirtyInfo { + char idstr[RAMBLOCK_INFO_MAX_LEN]; /* idstr for each ramblock */ + uint8_t *ramblock_addr; /* base address of ramblock we measure */ + uint64_t ramblock_pages; /* ramblock size in TARGET_PAGE_SIZE */ + uint64_t *sample_page_vfn; /* relative offset address for sampled page */ + uint64_t sample_pages_count; /* count of sampled pages */ + uint64_t sample_dirty_count; /* count of dirty pages we measure */ + uint32_t *hash_result; /* array of hash result for sampled pages */ +}; + +/* + * Store calculation statistics for each measure. + */ +struct DirtyRateStat { + uint64_t total_dirty_samples; /* total dirty sampled page */ + uint64_t total_sample_count; /* total sampled pages */ + uint64_t total_block_mem_MB; /* size of total sampled pages in MB */ + int64_t dirty_rate; /* dirty rate in MB/s */ + int64_t start_time; /* calculation start time in units of second */ + int64_t calc_time; /* time duration of two sampling in units of second */ +}; + +void *get_dirtyrate_thread(void *arg); +#endif diff --git a/migration/meson.build b/migration/meson.build index ac8ff1419f..b5b71c8060 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -37,4 +37,4 @@ softmmu_ss.add(when: ['CONFIG_RDMA', rdma], if_true: files('rdma.c')) softmmu_ss.add(when: 'CONFIG_LIVE_BLOCK_MIGRATION', if_true: files('block.c')) softmmu_ss.add(when: 'CONFIG_ZSTD', if_true: [files('multifd-zstd.c'), zstd]) -specific_ss.add(when: 'CONFIG_SOFTMMU', if_true: files('ram.c')) +specific_ss.add(when: 'CONFIG_SOFTMMU', if_true: files('dirtyrate.c', 'ram.c')) diff --git 
a/migration/migration.c b/migration/migration.c index d9d1e0b190..838ca79f57 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -57,7 +57,7 @@ #include "qemu/queue.h" #include "multifd.h" -#define MAX_THROTTLE (32 << 20) /* Migration transfer speed throttling */ +#define MAX_THROTTLE (128 << 20) /* Migration transfer speed throttling */ /* Amount of time to allocate to each "chunk" of bandwidth-throttled * data. */ @@ -238,12 +238,15 @@ void migration_incoming_state_destroy(void) mis->postcopy_remote_fds = NULL; } - qemu_event_reset(&mis->main_thread_load_event); - if (mis->socket_address_list) { qapi_free_SocketAddressList(mis->socket_address_list); mis->socket_address_list = NULL; } + + qemu_event_destroy(&mis->main_thread_load_event); + qemu_sem_destroy(&mis->postcopy_pause_sem_dst); + qemu_sem_destroy(&mis->postcopy_pause_sem_fault); + qemu_mutex_destroy(&mis->rp_mutex); } static void migrate_generate_event(int new_state) @@ -311,25 +314,35 @@ error: return ret; } -/* Request a range of pages from the source VM at the given - * start address. - * rbname: Name of the RAMBlock to request the page in, if NULL it's the same - * as the last request (a name must have been given previously) +/* Request one page from the source VM at the given start address. 
+ * rb: the RAMBlock to request the page in * Start: Address offset within the RB * Len: Length in bytes required - must be a multiple of pagesize */ -int migrate_send_rp_req_pages(MigrationIncomingState *mis, const char *rbname, - ram_addr_t start, size_t len) +int migrate_send_rp_req_pages(MigrationIncomingState *mis, RAMBlock *rb, + ram_addr_t start) { uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname up to 256 */ size_t msglen = 12; /* start + len */ + size_t len = qemu_ram_pagesize(rb); enum mig_rp_message_type msg_type; + const char *rbname; + int rbname_len; *(uint64_t *)bufc = cpu_to_be64((uint64_t)start); *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len); - if (rbname) { - int rbname_len = strlen(rbname); + /* + * We maintain the last ramblock that we requested for page. Note that we + * don't need locking because this function will only be called within the + * postcopy ram fault thread. + */ + if (rb != mis->last_rb) { + mis->last_rb = rb; + + rbname = qemu_ram_get_idstr(rb); + rbname_len = strlen(rbname); + assert(rbname_len < 256); bufc[msglen++] = rbname_len; @@ -1883,6 +1896,7 @@ void migrate_init(MigrationState *s) s->migration_thread_running = false; error_free(s->error); s->error = NULL; + s->hostname = NULL; migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP); diff --git a/migration/migration.h b/migration/migration.h index bdc7450da3..deb411aaad 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -259,6 +259,11 @@ struct MigrationState * (which is in 4M chunk). 
*/ uint8_t clear_bitmap_shift; + + /* + * This save hostname when out-going migration starts + */ + char *hostname; }; void migrate_set_state(int *state, int old_state, int new_state); @@ -326,8 +331,8 @@ void migrate_send_rp_shut(MigrationIncomingState *mis, uint32_t value); void migrate_send_rp_pong(MigrationIncomingState *mis, uint32_t value); -int migrate_send_rp_req_pages(MigrationIncomingState *mis, const char* rbname, - ram_addr_t start, size_t len); +int migrate_send_rp_req_pages(MigrationIncomingState *mis, RAMBlock *rb, + ram_addr_t start); void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis, char *block_name); void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value); diff --git a/migration/multifd.c b/migration/multifd.c index fd57378db8..776f963436 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -20,6 +20,7 @@ #include "ram.h" #include "migration.h" #include "socket.h" +#include "tls.h" #include "qemu-file.h" #include "trace.h" #include "multifd.h" @@ -548,6 +549,8 @@ void multifd_save_cleanup(void) qemu_sem_destroy(&p->sem_sync); g_free(p->name); p->name = NULL; + g_free(p->tls_hostname); + p->tls_hostname = NULL; multifd_pages_clear(p->pages); p->pages = NULL; p->packet_len = 0; @@ -717,6 +720,102 @@ out: return NULL; } +static bool multifd_channel_connect(MultiFDSendParams *p, + QIOChannel *ioc, + Error *error); + +static void multifd_tls_outgoing_handshake(QIOTask *task, + gpointer opaque) +{ + MultiFDSendParams *p = opaque; + QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); + Error *err = NULL; + + if (qio_task_propagate_error(task, &err)) { + trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); + } else { + trace_multifd_tls_outgoing_handshake_complete(ioc); + } + multifd_channel_connect(p, ioc, err); +} + +static void multifd_tls_channel_connect(MultiFDSendParams *p, + QIOChannel *ioc, + Error **errp) +{ + MigrationState *s = migrate_get_current(); + const char *hostname = 
p->tls_hostname; + QIOChannelTLS *tioc; + + tioc = migration_tls_client_create(s, ioc, hostname, errp); + if (!tioc) { + return; + } + + trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); + qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); + qio_channel_tls_handshake(tioc, + multifd_tls_outgoing_handshake, + p, + NULL, + NULL); + +} + +static bool multifd_channel_connect(MultiFDSendParams *p, + QIOChannel *ioc, + Error *error) +{ + MigrationState *s = migrate_get_current(); + + trace_multifd_set_outgoing_channel( + ioc, object_get_typename(OBJECT(ioc)), p->tls_hostname, error); + + if (!error) { + if (s->parameters.tls_creds && + *s->parameters.tls_creds && + !object_dynamic_cast(OBJECT(ioc), + TYPE_QIO_CHANNEL_TLS)) { + multifd_tls_channel_connect(p, ioc, &error); + if (!error) { + /* + * tls_channel_connect will call back to this + * function after the TLS handshake, + * so we mustn't call multifd_send_thread until then + */ + return false; + } else { + return true; + } + } else { + /* update for tls qio channel */ + p->c = ioc; + qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, + QEMU_THREAD_JOINABLE); + } + return false; + } + + return true; +} + +static void multifd_new_send_channel_cleanup(MultiFDSendParams *p, + QIOChannel *ioc, Error *err) +{ + migrate_set_error(migrate_get_current(), err); + /* Error happen, we need to tell who pay attention to me */ + qemu_sem_post(&multifd_send_state->channels_ready); + qemu_sem_post(&p->sem_sync); + /* + * Although multifd_send_thread is not created, but main migration + * thread neet to judge whether it is running, so we need to mark + * its status. 
+ */ + p->quit = true; + object_unref(OBJECT(ioc)); + error_free(err); +} + static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) { MultiFDSendParams *p = opaque; @@ -725,25 +824,19 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) trace_multifd_new_send_channel_async(p->id); if (qio_task_propagate_error(task, &local_err)) { - migrate_set_error(migrate_get_current(), local_err); - /* Error happen, we need to tell who pay attention to me */ - qemu_sem_post(&multifd_send_state->channels_ready); - qemu_sem_post(&p->sem_sync); - /* - * Although multifd_send_thread is not created, but main migration - * thread needs to judge whether it is running, so we need to mark - * its status. - */ - p->quit = true; - object_unref(OBJECT(sioc)); - error_free(local_err); + goto cleanup; } else { p->c = QIO_CHANNEL(sioc); qio_channel_set_delay(p->c, false); p->running = true; - qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, - QEMU_THREAD_JOINABLE); + if (multifd_channel_connect(p, sioc, local_err)) { + goto cleanup; + } + return; } + +cleanup: + multifd_new_send_channel_cleanup(p, sioc, local_err); } int multifd_save_setup(Error **errp) @@ -751,10 +844,12 @@ int multifd_save_setup(Error **errp) int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); uint8_t i; + MigrationState *s; if (!migrate_use_multifd()) { return 0; } + s = migrate_get_current(); thread_count = migrate_multifd_channels(); multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); @@ -779,6 +874,7 @@ int multifd_save_setup(Error **errp) p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); p->packet->version = cpu_to_be32(MULTIFD_VERSION); p->name = g_strdup_printf("multifdsend_%d", i); + p->tls_hostname = g_strdup(s->hostname); socket_send_channel_create(multifd_new_send_channel_async, p); } diff --git a/migration/multifd.h b/migration/multifd.h index 
448a03d89a..8d6751f5ed 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -71,6 +71,8 @@ typedef struct { uint8_t id; /* channel thread name */ char *name; + /* tls hostname */ + char *tls_hostname; /* channel thread id */ QemuThread thread; /* communication channel */ diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 1654ff11a5..0a2f88a87d 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -684,14 +684,7 @@ int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, qemu_ram_get_idstr(rb), rb_offset); return postcopy_wake_shared(pcfd, client_addr, rb); } - if (rb != mis->last_rb) { - mis->last_rb = rb; - migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), - aligned_rbo, pagesize); - } else { - /* Save some space */ - migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize); - } + migrate_send_rp_req_pages(mis, rb, aligned_rbo); return 0; } @@ -986,20 +979,7 @@ retry: * Send the request to the source - we want to request one * of our host page sizes (which is >= TPS) */ - if (rb != mis->last_rb) { - mis->last_rb = rb; - ret = migrate_send_rp_req_pages(mis, - qemu_ram_get_idstr(rb), - rb_offset, - qemu_ram_pagesize(rb)); - } else { - /* Save some space */ - ret = migrate_send_rp_req_pages(mis, - NULL, - rb_offset, - qemu_ram_pagesize(rb)); - } - + ret = migrate_send_rp_req_pages(mis, rb, rb_offset); if (ret) { /* May be network failure, try to wait for recovery */ if (ret == -EIO && postcopy_pause_fault_thread(mis)) { diff --git a/migration/ram.c b/migration/ram.c index c5f36aeae5..433489d633 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -158,21 +158,12 @@ out: return ret; } -static bool ramblock_is_ignored(RAMBlock *block) +bool ramblock_is_ignored(RAMBlock *block) { return !qemu_ram_is_migratable(block) || (migrate_ignore_shared() && qemu_ram_is_shared(block)); } -/* Should be holding either ram_list.mutex, or the RCU lock. 
*/ -#define RAMBLOCK_FOREACH_NOT_IGNORED(block) \ - INTERNAL_RAMBLOCK_FOREACH(block) \ - if (ramblock_is_ignored(block)) {} else - -#define RAMBLOCK_FOREACH_MIGRATABLE(block) \ - INTERNAL_RAMBLOCK_FOREACH(block) \ - if (!qemu_ram_is_migratable(block)) {} else - #undef RAMBLOCK_FOREACH int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) diff --git a/migration/ram.h b/migration/ram.h index 2eeaacfa13..011e85414e 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -37,6 +37,16 @@ extern MigrationStats ram_counters; extern XBZRLECacheStats xbzrle_counters; extern CompressionStats compression_counters; +bool ramblock_is_ignored(RAMBlock *block); +/* Should be holding either ram_list.mutex, or the RCU lock. */ +#define RAMBLOCK_FOREACH_NOT_IGNORED(block) \ + INTERNAL_RAMBLOCK_FOREACH(block) \ + if (ramblock_is_ignored(block)) {} else + +#define RAMBLOCK_FOREACH_MIGRATABLE(block) \ + INTERNAL_RAMBLOCK_FOREACH(block) \ + if (!qemu_ram_is_migratable(block)) {} else + int xbzrle_cache_resize(int64_t new_size, Error **errp); uint64_t ram_bytes_remaining(void); uint64_t ram_bytes_total(void); diff --git a/migration/savevm.c b/migration/savevm.c index ee21e981ba..34e4b71052 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2803,7 +2803,8 @@ void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live, vm_stop(RUN_STATE_SAVE_VM); global_state_store_running(); - ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT, 0660, errp); + ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT | O_TRUNC, + 0660, errp); if (!ioc) { goto the_end; } diff --git a/migration/tls.c b/migration/tls.c index 7a02ec8656..66c6f43221 100644 --- a/migration/tls.c +++ b/migration/tls.c @@ -22,7 +22,6 @@ #include "channel.h" #include "migration.h" #include "tls.h" -#include "io/channel-tls.h" #include "crypto/tlscreds.h" #include "qemu/error-report.h" #include "qapi/error.h" @@ -125,11 +124,10 @@ static void 
migration_tls_outgoing_handshake(QIOTask *task, object_unref(OBJECT(ioc)); } - -void migration_tls_channel_connect(MigrationState *s, - QIOChannel *ioc, - const char *hostname, - Error **errp) +QIOChannelTLS *migration_tls_client_create(MigrationState *s, + QIOChannel *ioc, + const char *hostname, + Error **errp) { QCryptoTLSCreds *creds; QIOChannelTLS *tioc; @@ -137,7 +135,7 @@ void migration_tls_channel_connect(MigrationState *s, creds = migration_tls_get_creds( s, QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT, errp); if (!creds) { - return; + return NULL; } if (s->parameters.tls_hostname && *s->parameters.tls_hostname) { @@ -145,15 +143,29 @@ void migration_tls_channel_connect(MigrationState *s, } if (!hostname) { error_setg(errp, "No hostname available for TLS"); - return; + return NULL; } tioc = qio_channel_tls_new_client( ioc, creds, hostname, errp); + + return tioc; +} + +void migration_tls_channel_connect(MigrationState *s, + QIOChannel *ioc, + const char *hostname, + Error **errp) +{ + QIOChannelTLS *tioc; + + tioc = migration_tls_client_create(s, ioc, hostname, errp); if (!tioc) { return; } + /* Save hostname into MigrationState for handshake */ + s->hostname = g_strdup(hostname); trace_migration_tls_outgoing_handshake_start(hostname); qio_channel_set_name(QIO_CHANNEL(tioc), "migration-tls-outgoing"); qio_channel_tls_handshake(tioc, diff --git a/migration/tls.h b/migration/tls.h index cdd70001ed..0cfbe368ba 100644 --- a/migration/tls.h +++ b/migration/tls.h @@ -22,11 +22,17 @@ #define QEMU_MIGRATION_TLS_H #include "io/channel.h" +#include "io/channel-tls.h" void migration_tls_channel_process_incoming(MigrationState *s, QIOChannel *ioc, Error **errp); +QIOChannelTLS *migration_tls_client_create(MigrationState *s, + QIOChannel *ioc, + const char *hostname, + Error **errp); + void migration_tls_channel_connect(MigrationState *s, QIOChannel *ioc, const char *hostname, diff --git a/migration/trace-events b/migration/trace-events index 7ba2fa6644..338f38b3dd 100644 --- 
a/migration/trace-events +++ b/migration/trace-events @@ -129,6 +129,10 @@ multifd_send_sync_main_wait(uint8_t id) "channel %d" multifd_send_terminate_threads(bool error) "error %d" multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %d packets %" PRIu64 " pages %" PRIu64 multifd_send_thread_start(uint8_t id) "%d" +multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" +multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s" +multifd_tls_outgoing_handshake_complete(void *ioc) "ioc=%p" +multifd_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname, void *err) "ioc=%p ioctype=%s hostname=%s err=%p" # migration.c await_return_path_close_on_source_close(void) "" @@ -313,3 +317,11 @@ dirty_bitmap_load_bits_zeroes(void) "" dirty_bitmap_load_header(uint32_t flags) "flags 0x%x" dirty_bitmap_load_enter(void) "" dirty_bitmap_load_success(void) "" + +# dirtyrate.c +dirtyrate_set_state(const char *new_state) "new state %s" +query_dirty_rate_info(const char *new_state) "current state %s" +get_ramblock_vfn_hash(const char *idstr, uint64_t vfn, uint32_t crc) "ramblock name: %s, vfn: %"PRIu64 ", crc: %" PRIu32 +calc_page_dirty_rate(const char *idstr, uint32_t new_crc, uint32_t old_crc) "ramblock name: %s, new crc: %" PRIu32 ", old crc: %" PRIu32 +skip_sample_ramblock(const char *idstr, uint64_t ramblock_size) "ramblock name: %s, ramblock size: %" PRIu64 +find_page_matched(const char *idstr) "ramblock %s addr or size changed" diff --git a/monitor/misc.c b/monitor/misc.c index 262f2bd951..6e0da0cb96 100644 --- a/monitor/misc.c +++ b/monitor/misc.c @@ -141,13 +141,13 @@ char *qmp_human_monitor_command(const char *command_line, bool has_cpu_index, handle_hmp_command(&hmp, command_line); cur_mon = old_mon; - qemu_mutex_lock(&hmp.common.mon_lock); - if (qstring_get_length(hmp.common.outbuf) > 0) { - output = g_strdup(qstring_get_str(hmp.common.outbuf)); - } 
else { - output = g_strdup(""); + WITH_QEMU_LOCK_GUARD(&hmp.common.mon_lock) { + if (qstring_get_length(hmp.common.outbuf) > 0) { + output = g_strdup(qstring_get_str(hmp.common.outbuf)); + } else { + output = g_strdup(""); + } } - qemu_mutex_unlock(&hmp.common.mon_lock); out: monitor_data_destroy(&hmp.common); @@ -1248,7 +1248,7 @@ void qmp_getfd(const char *fdname, Error **errp) return; } - qemu_mutex_lock(&cur_mon->mon_lock); + QEMU_LOCK_GUARD(&cur_mon->mon_lock); QLIST_FOREACH(monfd, &cur_mon->fds, next) { if (strcmp(monfd->name, fdname) != 0) { continue; @@ -1256,7 +1256,6 @@ void qmp_getfd(const char *fdname, Error **errp) tmp_fd = monfd->fd; monfd->fd = fd; - qemu_mutex_unlock(&cur_mon->mon_lock); /* Make sure close() is outside critical section */ close(tmp_fd); return; @@ -1267,7 +1266,6 @@ void qmp_getfd(const char *fdname, Error **errp) monfd->fd = fd; QLIST_INSERT_HEAD(&cur_mon->fds, monfd, next); - qemu_mutex_unlock(&cur_mon->mon_lock); } void qmp_closefd(const char *fdname, Error **errp) @@ -1299,7 +1297,7 @@ int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp) { mon_fd_t *monfd; - qemu_mutex_lock(&mon->mon_lock); + QEMU_LOCK_GUARD(&mon->mon_lock); QLIST_FOREACH(monfd, &mon->fds, next) { int fd; @@ -1313,12 +1311,10 @@ int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp) QLIST_REMOVE(monfd, next); g_free(monfd->name); g_free(monfd); - qemu_mutex_unlock(&mon->mon_lock); return fd; } - qemu_mutex_unlock(&mon->mon_lock); error_setg(errp, "File descriptor named '%s' has not been found", fdname); return -1; } @@ -1350,11 +1346,10 @@ void monitor_fdsets_cleanup(void) MonFdset *mon_fdset; MonFdset *mon_fdset_next; - qemu_mutex_lock(&mon_fdsets_lock); + QEMU_LOCK_GUARD(&mon_fdsets_lock); QLIST_FOREACH_SAFE(mon_fdset, &mon_fdsets, next, mon_fdset_next) { monitor_fdset_cleanup(mon_fdset); } - qemu_mutex_unlock(&mon_fdsets_lock); } AddfdInfo *qmp_add_fd(bool has_fdset_id, int64_t fdset_id, bool has_opaque, @@ -1389,7 +1384,7 @@ void 
qmp_remove_fd(int64_t fdset_id, bool has_fd, int64_t fd, Error **errp) MonFdsetFd *mon_fdset_fd; char fd_str[60]; - qemu_mutex_lock(&mon_fdsets_lock); + QEMU_LOCK_GUARD(&mon_fdsets_lock); QLIST_FOREACH(mon_fdset, &mon_fdsets, next) { if (mon_fdset->id != fdset_id) { continue; @@ -1409,12 +1404,10 @@ void qmp_remove_fd(int64_t fdset_id, bool has_fd, int64_t fd, Error **errp) goto error; } monitor_fdset_cleanup(mon_fdset); - qemu_mutex_unlock(&mon_fdsets_lock); return; } error: - qemu_mutex_unlock(&mon_fdsets_lock); if (has_fd) { snprintf(fd_str, sizeof(fd_str), "fdset-id:%" PRId64 ", fd:%" PRId64, fdset_id, fd); @@ -1430,7 +1423,7 @@ FdsetInfoList *qmp_query_fdsets(Error **errp) MonFdsetFd *mon_fdset_fd; FdsetInfoList *fdset_list = NULL; - qemu_mutex_lock(&mon_fdsets_lock); + QEMU_LOCK_GUARD(&mon_fdsets_lock); QLIST_FOREACH(mon_fdset, &mon_fdsets, next) { FdsetInfoList *fdset_info = g_malloc0(sizeof(*fdset_info)); FdsetFdInfoList *fdsetfd_list = NULL; @@ -1460,7 +1453,6 @@ FdsetInfoList *qmp_query_fdsets(Error **errp) fdset_info->next = fdset_list; fdset_list = fdset_info; } - qemu_mutex_unlock(&mon_fdsets_lock); return fdset_list; } @@ -1554,7 +1546,7 @@ int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags) #else MonFdset *mon_fdset; - qemu_mutex_lock(&mon_fdsets_lock); + QEMU_LOCK_GUARD(&mon_fdsets_lock); QLIST_FOREACH(mon_fdset, &mon_fdsets, next) { MonFdsetFd *mon_fdset_fd; MonFdsetFd *mon_fdset_fd_dup; @@ -1569,7 +1561,6 @@ int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags) QLIST_FOREACH(mon_fdset_fd, &mon_fdset->fds, next) { mon_fd_flags = fcntl(mon_fdset_fd->fd, F_GETFL); if (mon_fd_flags == -1) { - qemu_mutex_unlock(&mon_fdsets_lock); return -1; } @@ -1580,25 +1571,21 @@ int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags) } if (fd == -1) { - qemu_mutex_unlock(&mon_fdsets_lock); errno = EACCES; return -1; } dup_fd = qemu_dup_flags(fd, flags); if (dup_fd == -1) { - qemu_mutex_unlock(&mon_fdsets_lock); return -1; } mon_fdset_fd_dup = 
g_malloc0(sizeof(*mon_fdset_fd_dup)); mon_fdset_fd_dup->fd = dup_fd; QLIST_INSERT_HEAD(&mon_fdset->dup_fds, mon_fdset_fd_dup, next); - qemu_mutex_unlock(&mon_fdsets_lock); return dup_fd; } - qemu_mutex_unlock(&mon_fdsets_lock); errno = ENOENT; return -1; #endif @@ -1609,7 +1596,7 @@ static int64_t monitor_fdset_dup_fd_find_remove(int dup_fd, bool remove) MonFdset *mon_fdset; MonFdsetFd *mon_fdset_fd_dup; - qemu_mutex_lock(&mon_fdsets_lock); + QEMU_LOCK_GUARD(&mon_fdsets_lock); QLIST_FOREACH(mon_fdset, &mon_fdsets, next) { QLIST_FOREACH(mon_fdset_fd_dup, &mon_fdset->dup_fds, next) { if (mon_fdset_fd_dup->fd == dup_fd) { @@ -1619,17 +1606,14 @@ static int64_t monitor_fdset_dup_fd_find_remove(int dup_fd, bool remove) if (QLIST_EMPTY(&mon_fdset->dup_fds)) { monitor_fdset_cleanup(mon_fdset); } - goto err; + return -1; } else { - qemu_mutex_unlock(&mon_fdsets_lock); return mon_fdset->id; } } } } -err: - qemu_mutex_unlock(&mon_fdsets_lock); return -1; } diff --git a/qapi/migration.json b/qapi/migration.json index 675f70bb67..ce2216cfea 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -1720,3 +1720,70 @@ ## { 'event': 'UNPLUG_PRIMARY', 'data': { 'device-id': 'str' } } + +## +# @DirtyRateStatus: +# +# An enumeration of dirtyrate status. +# +# @unstarted: the dirtyrate thread has not been started. +# +# @measuring: the dirtyrate thread is measuring. +# +# @measured: the dirtyrate thread has measured and results are available. +# +# Since: 5.2 +# +## +{ 'enum': 'DirtyRateStatus', + 'data': [ 'unstarted', 'measuring', 'measured'] } + +## +# @DirtyRateInfo: +# +# Information about current dirty page rate of vm. +# +# @dirty-rate: @dirtyrate describing the dirty page rate of vm +# in units of MB/s. +# If this field returns '-1', it means querying has not +# yet started or completed. 
+# +# @status: status containing dirtyrate query status includes +# 'unstarted' or 'measuring' or 'measured' +# +# @start-time: start time in units of second for calculation +# +# @calc-time: time in units of second for sample dirty pages +# +# Since: 5.2 +# +## +{ 'struct': 'DirtyRateInfo', + 'data': {'dirty-rate': 'int64', + 'status': 'DirtyRateStatus', + 'start-time': 'int64', + 'calc-time': 'int64'} } + +## +# @calc-dirty-rate: +# +# start calculating dirty page rate for vm +# +# @calc-time: time in units of second for sample dirty pages +# +# Since: 5.2 +# +# Example: +# {"command": "calc-dirty-rate", "data": {"calc-time": 1} } +# +## +{ 'command': 'calc-dirty-rate', 'data': {'calc-time': 'int64'} } + +## +# @query-dirty-rate: +# +# query dirty page rate in units of MB/s for vm +# +# Since: 5.2 +## +{ 'command': 'query-dirty-rate', 'returns': 'DirtyRateInfo' } diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c index 9e5537506c..d5c8e98253 100644 --- a/tools/virtiofsd/fuse_virtio.c +++ b/tools/virtiofsd/fuse_virtio.c @@ -588,7 +588,7 @@ static void *fv_queue_thread(void *opaque) struct fuse_session *se = qi->virtio_dev->se; GThreadPool *pool; - pool = g_thread_pool_new(fv_queue_worker, qi, se->thread_pool_size, TRUE, + pool = g_thread_pool_new(fv_queue_worker, qi, se->thread_pool_size, FALSE, NULL); if (!pool) { fuse_log(FUSE_LOG_ERR, "%s: g_thread_pool_new failed\n", __func__); diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c index 7bc5d7dc5a..85770d63f1 100644 --- a/tools/virtiofsd/helper.c +++ b/tools/virtiofsd/helper.c @@ -178,6 +178,10 @@ void fuse_cmdline_help(void) " (0 leaves rlimit unchanged)\n" " default: min(1000000, fs.file-max - 16384)\n" " if the current rlimit is lower\n" + " -o allow_direct_io|no_allow_direct_io\n" + " retain/discard O_DIRECT flags passed down\n" + " to virtiofsd from guest applications.\n" + " default: no_allow_direct_io\n" ); } diff --git a/tools/virtiofsd/passthrough_ll.c 
b/tools/virtiofsd/passthrough_ll.c index 784330e0e4..0b229ebd57 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -151,6 +151,7 @@ struct lo_data { int timeout_set; int readdirplus_set; int readdirplus_clear; + int allow_direct_io; struct lo_inode root; GHashTable *inodes; /* protected by lo->mutex */ struct lo_map ino_map; /* protected by lo->mutex */ @@ -179,6 +180,8 @@ static const struct fuse_opt lo_opts[] = { { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS }, { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 }, { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 }, + { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 }, + { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 }, FUSE_OPT_END }; static bool use_syslog = false; @@ -1516,7 +1519,8 @@ static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, fuse_reply_err(req, 0); } -static void update_open_flags(int writeback, struct fuse_file_info *fi) +static void update_open_flags(int writeback, int allow_direct_io, + struct fuse_file_info *fi) { /* * With writeback cache, kernel may send read requests even @@ -1541,10 +1545,13 @@ static void update_open_flags(int writeback, struct fuse_file_info *fi) /* * O_DIRECT in guest should not necessarily mean bypassing page - * cache on host as well. If somebody needs that behavior, it - * probably should be a configuration knob in daemon. + * cache on host as well. Therefore, we discard it by default + * ('-o no_allow_direct_io'). If somebody needs that behavior, + * the '-o allow_direct_io' option should be set. 
*/ - fi->flags &= ~O_DIRECT; + if (!allow_direct_io) { + fi->flags &= ~O_DIRECT; + } } static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, @@ -1576,7 +1583,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, goto out; } - update_open_flags(lo->writeback, fi); + update_open_flags(lo->writeback, lo->allow_direct_io, fi); fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, mode); @@ -1786,7 +1793,7 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, fi->flags); - update_open_flags(lo->writeback, fi); + update_open_flags(lo->writeback, lo->allow_direct_io, fi); sprintf(buf, "%i", lo_fd(req, ino)); fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); @@ -2823,6 +2830,7 @@ int main(int argc, char *argv[]) .debug = 0, .writeback = 0, .posix_lock = 0, + .allow_direct_io = 0, .proc_self_fd = -1, }; struct lo_map_elem *root_elem; diff --git a/tools/virtiofsd/passthrough_seccomp.c b/tools/virtiofsd/passthrough_seccomp.c index 19fee60011..eb9af8265f 100644 --- a/tools/virtiofsd/passthrough_seccomp.c +++ b/tools/virtiofsd/passthrough_seccomp.c @@ -93,6 +93,8 @@ static const int syscall_whitelist[] = { SCMP_SYS(rt_sigaction), SCMP_SYS(rt_sigprocmask), SCMP_SYS(rt_sigreturn), + SCMP_SYS(sched_getattr), + SCMP_SYS(sched_setattr), SCMP_SYS(sendmsg), SCMP_SYS(setresgid), SCMP_SYS(setresuid), |