diff options
213 files changed, 5352 insertions, 2250 deletions
diff --git a/.gitignore b/.gitignore index 8aab671265..61fa39967b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,13 @@ /GNUmakefile /build/ /.cache/ +/.vscode/ *.pyc .sdk .stgit-* .git-submodule-status +.clang-format +.gdb_history cscope.* tags TAGS diff --git a/MAINTAINERS b/MAINTAINERS index e3d5b7e09c..64893e36bc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1915,7 +1915,7 @@ SSI M: Alistair Francis <alistair@alistair23.me> S: Maintained F: hw/ssi/* -F: hw/block/m25p80.c +F: hw/block/m25p80* F: include/hw/ssi/ssi.h X: hw/ssi/xilinx_* F: tests/qtest/m25p80-test.c @@ -2510,6 +2510,7 @@ F: block* F: block/ F: hw/block/ F: include/block/ +F: include/sysemu/block-*.h F: qemu-img* F: docs/tools/qemu-img.rst F: qemu-io* @@ -3415,6 +3416,12 @@ L: qemu-block@nongnu.org S: Maintained F: block/vdi.c +blkio +M: Stefan Hajnoczi <stefanha@redhat.com> +L: qemu-block@nongnu.org +S: Maintained +F: block/blkio.c + iSCSI M: Ronnie Sahlberg <ronniesahlberg@gmail.com> M: Paolo Bonzini <pbonzini@redhat.com> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index f9e5cc9ba0..82b06c1824 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -187,13 +187,14 @@ static bool tb_lookup_cmp(const void *p, const void *d) const struct tb_desc *desc = d; if ((TARGET_TB_PCREL || tb_pc(tb) == desc->pc) && - tb->page_addr[0] == desc->page_addr0 && + tb_page_addr0(tb) == desc->page_addr0 && tb->cs_base == desc->cs_base && tb->flags == desc->flags && tb->trace_vcpu_dstate == desc->trace_vcpu_dstate && tb_cflags(tb) == desc->cflags) { /* check next page if needed */ - if (tb->page_addr[1] == -1) { + tb_page_addr_t tb_phys_page1 = tb_page_addr1(tb); + if (tb_phys_page1 == -1) { return true; } else { tb_page_addr_t phys_page1; @@ -210,7 +211,7 @@ static bool tb_lookup_cmp(const void *p, const void *d) */ virt_page1 = TARGET_PAGE_ALIGN(desc->pc); phys_page1 = get_page_addr_code(desc->env, virt_page1); - if (tb->page_addr[1] == phys_page1) { + if (tb_phys_page1 == phys_page1) { return true; } } @@ -304,16 +305,12 @@ static void log_cpu_exec(target_ulong pc, CPUState *cpu, } } -static bool check_for_breakpoints(CPUState *cpu, target_ulong pc, - uint32_t *cflags) +static bool check_for_breakpoints_slow(CPUState *cpu, target_ulong pc, + uint32_t *cflags) { CPUBreakpoint *bp; bool match_page = false; - if (likely(QTAILQ_EMPTY(&cpu->breakpoints))) { - return false; - } - /* * Singlestep overrides breakpoints. * This requirement is visible in the record-replay tests, where @@ -374,6 +371,13 @@ static bool check_for_breakpoints(CPUState *cpu, target_ulong pc, return false; } +static inline bool check_for_breakpoints(CPUState *cpu, target_ulong pc, + uint32_t *cflags) +{ + return unlikely(!QTAILQ_EMPTY(&cpu->breakpoints)) && + check_for_breakpoints_slow(cpu, pc, cflags); +} + /** * helper_lookup_tb_ptr: quick check for next tb * @env: current cpu state @@ -1016,7 +1020,7 @@ int cpu_exec(CPUState *cpu) * direct jump to a TB spanning two pages because the mapping * for the second page can change. */ - if (tb->page_addr[1] != -1) { + if (tb_page_addr1(tb) != -1) { last_tb = NULL; } #endif diff --git a/accel/tcg/internal.h b/accel/tcg/internal.h index dc800fd485..1227bb69bd 100644 --- a/accel/tcg/internal.h +++ b/accel/tcg/internal.h @@ -11,12 +11,103 @@ #include "exec/exec-all.h" +/* + * Access to the various translations structures need to be serialised + * via locks for consistency. In user-mode emulation access to the + * memory related structures are protected with mmap_lock. + * In !user-mode we use per-page locks. + */ +#ifdef CONFIG_SOFTMMU +#define assert_memory_lock() +#else +#define assert_memory_lock() tcg_debug_assert(have_mmap_lock()) +#endif + +typedef struct PageDesc { + /* list of TBs intersecting this ram page */ + uintptr_t first_tb; +#ifdef CONFIG_USER_ONLY + unsigned long flags; + void *target_data; +#endif +#ifdef CONFIG_SOFTMMU + QemuSpin lock; +#endif +} PageDesc; + +/* Size of the L2 (and L3, etc) page tables. */ +#define V_L2_BITS 10 +#define V_L2_SIZE (1 << V_L2_BITS) + +/* + * L1 Mapping properties + */ +extern int v_l1_size; +extern int v_l1_shift; +extern int v_l2_levels; + +/* + * The bottom level has pointers to PageDesc, and is indexed by + * anything from 4 to (V_L2_BITS + 3) bits, depending on target page size. + */ +#define V_L1_MIN_BITS 4 +#define V_L1_MAX_BITS (V_L2_BITS + 3) +#define V_L1_MAX_SIZE (1 << V_L1_MAX_BITS) + +extern void *l1_map[V_L1_MAX_SIZE]; + +PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc); + +static inline PageDesc *page_find(tb_page_addr_t index) +{ + return page_find_alloc(index, false); +} + +/* list iterators for lists of tagged pointers in TranslationBlock */ +#define TB_FOR_EACH_TAGGED(head, tb, n, field) \ + for (n = (head) & 1, tb = (TranslationBlock *)((head) & ~1); \ + tb; tb = (TranslationBlock *)tb->field[n], n = (uintptr_t)tb & 1, \ + tb = (TranslationBlock *)((uintptr_t)tb & ~1)) + +#define PAGE_FOR_EACH_TB(pagedesc, tb, n) \ + TB_FOR_EACH_TAGGED((pagedesc)->first_tb, tb, n, page_next) + +#define TB_FOR_EACH_JMP(head_tb, tb, n) \ + TB_FOR_EACH_TAGGED((head_tb)->jmp_list_head, tb, n, jmp_list_next) + +/* In user-mode page locks aren't used; mmap_lock is enough */ +#ifdef CONFIG_USER_ONLY +#define assert_page_locked(pd) tcg_debug_assert(have_mmap_lock()) +static inline void page_lock(PageDesc *pd) { } +static inline void page_unlock(PageDesc *pd) { } +#else +#ifdef CONFIG_DEBUG_TCG +void do_assert_page_locked(const PageDesc *pd, const char *file, int line); +#define assert_page_locked(pd) do_assert_page_locked(pd, __FILE__, __LINE__) +#else +#define assert_page_locked(pd) +#endif +void page_lock(PageDesc *pd); +void page_unlock(PageDesc *pd); +#endif +#if !defined(CONFIG_USER_ONLY) && defined(CONFIG_DEBUG_TCG) +void assert_no_pages_locked(void); +#else +static inline void assert_no_pages_locked(void) { } +#endif + TranslationBlock *tb_gen_code(CPUState *cpu, target_ulong pc, target_ulong cs_base, uint32_t flags, int cflags); G_NORETURN void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr); void page_init(void); void tb_htable_init(void); +void tb_reset_jump(TranslationBlock *tb, int n); +TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc, + tb_page_addr_t phys_page2); +bool tb_invalidate_phys_page_unwind(tb_page_addr_t addr, uintptr_t pc); +int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb, + uintptr_t searched_pc, bool reset_icount); /* Return the current PC from CPU, which may be cached in TB. */ static inline target_ulong log_pc(CPUState *cpu, const TranslationBlock *tb) diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build index 7a0a79d731..75e1dffb4d 100644 --- a/accel/tcg/meson.build +++ b/accel/tcg/meson.build @@ -3,6 +3,7 @@ tcg_ss.add(files( 'tcg-all.c', 'cpu-exec-common.c', 'cpu-exec.c', + 'tb-maint.c', 'tcg-runtime-gvec.c', 'tcg-runtime.c', 'translate-all.c', diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c new file mode 100644 index 0000000000..c8e921089d --- /dev/null +++ b/accel/tcg/tb-maint.c @@ -0,0 +1,704 @@ +/* + * Translation Block Maintaince + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "exec/cputlb.h" +#include "exec/log.h" +#include "exec/exec-all.h" +#include "exec/translate-all.h" +#include "sysemu/tcg.h" +#include "tcg/tcg.h" +#include "tb-hash.h" +#include "tb-context.h" +#include "internal.h" + + +static bool tb_cmp(const void *ap, const void *bp) +{ + const TranslationBlock *a = ap; + const TranslationBlock *b = bp; + + return ((TARGET_TB_PCREL || tb_pc(a) == tb_pc(b)) && + a->cs_base == b->cs_base && + a->flags == b->flags && + (tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) && + a->trace_vcpu_dstate == b->trace_vcpu_dstate && + tb_page_addr0(a) == tb_page_addr0(b) && + tb_page_addr1(a) == tb_page_addr1(b)); +} + +void tb_htable_init(void) +{ + unsigned int mode = QHT_MODE_AUTO_RESIZE; + + qht_init(&tb_ctx.htable, tb_cmp, CODE_GEN_HTABLE_SIZE, mode); +} + +/* Set to NULL all the 'first_tb' fields in all PageDescs. */ +static void page_flush_tb_1(int level, void **lp) +{ + int i; + + if (*lp == NULL) { + return; + } + if (level == 0) { + PageDesc *pd = *lp; + + for (i = 0; i < V_L2_SIZE; ++i) { + page_lock(&pd[i]); + pd[i].first_tb = (uintptr_t)NULL; + page_unlock(&pd[i]); + } + } else { + void **pp = *lp; + + for (i = 0; i < V_L2_SIZE; ++i) { + page_flush_tb_1(level - 1, pp + i); + } + } +} + +static void page_flush_tb(void) +{ + int i, l1_sz = v_l1_size; + + for (i = 0; i < l1_sz; i++) { + page_flush_tb_1(v_l2_levels, l1_map + i); + } +} + +/* flush all the translation blocks */ +static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count) +{ + bool did_flush = false; + + mmap_lock(); + /* If it is already been done on request of another CPU, just retry. */ + if (tb_ctx.tb_flush_count != tb_flush_count.host_int) { + goto done; + } + did_flush = true; + + CPU_FOREACH(cpu) { + tcg_flush_jmp_cache(cpu); + } + + qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE); + page_flush_tb(); + + tcg_region_reset_all(); + /* XXX: flush processor icache at this point if cache flush is expensive */ + qatomic_mb_set(&tb_ctx.tb_flush_count, tb_ctx.tb_flush_count + 1); + +done: + mmap_unlock(); + if (did_flush) { + qemu_plugin_flush_cb(); + } +} + +void tb_flush(CPUState *cpu) +{ + if (tcg_enabled()) { + unsigned tb_flush_count = qatomic_mb_read(&tb_ctx.tb_flush_count); + + if (cpu_in_exclusive_context(cpu)) { + do_tb_flush(cpu, RUN_ON_CPU_HOST_INT(tb_flush_count)); + } else { + async_safe_run_on_cpu(cpu, do_tb_flush, + RUN_ON_CPU_HOST_INT(tb_flush_count)); + } + } +} + +/* + * user-mode: call with mmap_lock held + * !user-mode: call with @pd->lock held + */ +static inline void tb_page_remove(PageDesc *pd, TranslationBlock *tb) +{ + TranslationBlock *tb1; + uintptr_t *pprev; + unsigned int n1; + + assert_page_locked(pd); + pprev = &pd->first_tb; + PAGE_FOR_EACH_TB(pd, tb1, n1) { + if (tb1 == tb) { + *pprev = tb1->page_next[n1]; + return; + } + pprev = &tb1->page_next[n1]; + } + g_assert_not_reached(); +} + +/* remove @orig from its @n_orig-th jump list */ +static inline void tb_remove_from_jmp_list(TranslationBlock *orig, int n_orig) +{ + uintptr_t ptr, ptr_locked; + TranslationBlock *dest; + TranslationBlock *tb; + uintptr_t *pprev; + int n; + + /* mark the LSB of jmp_dest[] so that no further jumps can be inserted */ + ptr = qatomic_or_fetch(&orig->jmp_dest[n_orig], 1); + dest = (TranslationBlock *)(ptr & ~1); + if (dest == NULL) { + return; + } + + qemu_spin_lock(&dest->jmp_lock); + /* + * While acquiring the lock, the jump might have been removed if the + * destination TB was invalidated; check again. + */ + ptr_locked = qatomic_read(&orig->jmp_dest[n_orig]); + if (ptr_locked != ptr) { + qemu_spin_unlock(&dest->jmp_lock); + /* + * The only possibility is that the jump was unlinked via + * tb_jump_unlink(dest). Seeing here another destination would be a bug, + * because we set the LSB above. + */ + g_assert(ptr_locked == 1 && dest->cflags & CF_INVALID); + return; + } + /* + * We first acquired the lock, and since the destination pointer matches, + * we know for sure that @orig is in the jmp list. + */ + pprev = &dest->jmp_list_head; + TB_FOR_EACH_JMP(dest, tb, n) { + if (tb == orig && n == n_orig) { + *pprev = tb->jmp_list_next[n]; + /* no need to set orig->jmp_dest[n]; setting the LSB was enough */ + qemu_spin_unlock(&dest->jmp_lock); + return; + } + pprev = &tb->jmp_list_next[n]; + } + g_assert_not_reached(); +} + +/* + * Reset the jump entry 'n' of a TB so that it is not chained to another TB. + */ +void tb_reset_jump(TranslationBlock *tb, int n) +{ + uintptr_t addr = (uintptr_t)(tb->tc.ptr + tb->jmp_reset_offset[n]); + tb_set_jmp_target(tb, n, addr); +} + +/* remove any jumps to the TB */ +static inline void tb_jmp_unlink(TranslationBlock *dest) +{ + TranslationBlock *tb; + int n; + + qemu_spin_lock(&dest->jmp_lock); + + TB_FOR_EACH_JMP(dest, tb, n) { + tb_reset_jump(tb, n); + qatomic_and(&tb->jmp_dest[n], (uintptr_t)NULL | 1); + /* No need to clear the list entry; setting the dest ptr is enough */ + } + dest->jmp_list_head = (uintptr_t)NULL; + + qemu_spin_unlock(&dest->jmp_lock); +} + +static void tb_jmp_cache_inval_tb(TranslationBlock *tb) +{ + CPUState *cpu; + + if (TARGET_TB_PCREL) { + /* A TB may be at any virtual address */ + CPU_FOREACH(cpu) { + tcg_flush_jmp_cache(cpu); + } + } else { + uint32_t h = tb_jmp_cache_hash_func(tb_pc(tb)); + + CPU_FOREACH(cpu) { + CPUJumpCache *jc = cpu->tb_jmp_cache; + + if (qatomic_read(&jc->array[h].tb) == tb) { + qatomic_set(&jc->array[h].tb, NULL); + } + } + } +} + +/* + * In user-mode, call with mmap_lock held. + * In !user-mode, if @rm_from_page_list is set, call with the TB's pages' + * locks held. + */ +static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list) +{ + PageDesc *p; + uint32_t h; + tb_page_addr_t phys_pc; + uint32_t orig_cflags = tb_cflags(tb); + + assert_memory_lock(); + + /* make sure no further incoming jumps will be chained to this TB */ + qemu_spin_lock(&tb->jmp_lock); + qatomic_set(&tb->cflags, tb->cflags | CF_INVALID); + qemu_spin_unlock(&tb->jmp_lock); + + /* remove the TB from the hash list */ + phys_pc = tb_page_addr0(tb); + h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)), + tb->flags, orig_cflags, tb->trace_vcpu_dstate); + if (!qht_remove(&tb_ctx.htable, tb, h)) { + return; + } + + /* remove the TB from the page list */ + if (rm_from_page_list) { + p = page_find(phys_pc >> TARGET_PAGE_BITS); + tb_page_remove(p, tb); + phys_pc = tb_page_addr1(tb); + if (phys_pc != -1) { + p = page_find(phys_pc >> TARGET_PAGE_BITS); + tb_page_remove(p, tb); + } + } + + /* remove the TB from the hash list */ + tb_jmp_cache_inval_tb(tb); + + /* suppress this TB from the two jump lists */ + tb_remove_from_jmp_list(tb, 0); + tb_remove_from_jmp_list(tb, 1); + + /* suppress any remaining jumps to this TB */ + tb_jmp_unlink(tb); + + qatomic_set(&tb_ctx.tb_phys_invalidate_count, + tb_ctx.tb_phys_invalidate_count + 1); +} + +static void tb_phys_invalidate__locked(TranslationBlock *tb) +{ + qemu_thread_jit_write(); + do_tb_phys_invalidate(tb, true); + qemu_thread_jit_execute(); +} + +static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1, + PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc) +{ + PageDesc *p1, *p2; + tb_page_addr_t page1; + tb_page_addr_t page2; + + assert_memory_lock(); + g_assert(phys1 != -1); + + page1 = phys1 >> TARGET_PAGE_BITS; + page2 = phys2 >> TARGET_PAGE_BITS; + + p1 = page_find_alloc(page1, alloc); + if (ret_p1) { + *ret_p1 = p1; + } + if (likely(phys2 == -1)) { + page_lock(p1); + return; + } else if (page1 == page2) { + page_lock(p1); + if (ret_p2) { + *ret_p2 = p1; + } + return; + } + p2 = page_find_alloc(page2, alloc); + if (ret_p2) { + *ret_p2 = p2; + } + if (page1 < page2) { + page_lock(p1); + page_lock(p2); + } else { + page_lock(p2); + page_lock(p1); + } +} + +#ifdef CONFIG_USER_ONLY +static inline void page_lock_tb(const TranslationBlock *tb) { } +static inline void page_unlock_tb(const TranslationBlock *tb) { } +#else +/* lock the page(s) of a TB in the correct acquisition order */ +static void page_lock_tb(const TranslationBlock *tb) +{ + page_lock_pair(NULL, tb_page_addr0(tb), NULL, tb_page_addr1(tb), false); +} + +static void page_unlock_tb(const TranslationBlock *tb) +{ + PageDesc *p1 = page_find(tb_page_addr0(tb) >> TARGET_PAGE_BITS); + + page_unlock(p1); + if (unlikely(tb_page_addr1(tb) != -1)) { + PageDesc *p2 = page_find(tb_page_addr1(tb) >> TARGET_PAGE_BITS); + + if (p2 != p1) { + page_unlock(p2); + } + } +} +#endif + +/* + * Invalidate one TB. + * Called with mmap_lock held in user-mode. + */ +void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr) +{ + if (page_addr == -1 && tb_page_addr0(tb) != -1) { + page_lock_tb(tb); + do_tb_phys_invalidate(tb, true); + page_unlock_tb(tb); + } else { + do_tb_phys_invalidate(tb, false); + } +} + +/* + * Add the tb in the target page and protect it if necessary. + * Called with mmap_lock held for user-mode emulation. + * Called with @p->lock held in !user-mode. + */ +static inline void tb_page_add(PageDesc *p, TranslationBlock *tb, + unsigned int n, tb_page_addr_t page_addr) +{ +#ifndef CONFIG_USER_ONLY + bool page_already_protected; +#endif + + assert_page_locked(p); + + tb->page_next[n] = p->first_tb; +#ifndef CONFIG_USER_ONLY + page_already_protected = p->first_tb != (uintptr_t)NULL; +#endif + p->first_tb = (uintptr_t)tb | n; + +#if defined(CONFIG_USER_ONLY) + /* translator_loop() must have made all TB pages non-writable */ + assert(!(p->flags & PAGE_WRITE)); +#else + /* + * If some code is already present, then the pages are already + * protected. So we handle the case where only the first TB is + * allocated in a physical page. + */ + if (!page_already_protected) { + tlb_protect_code(page_addr); + } +#endif +} + +/* + * Add a new TB and link it to the physical page tables. phys_page2 is + * (-1) to indicate that only one page contains the TB. + * + * Called with mmap_lock held for user-mode emulation. + * + * Returns a pointer @tb, or a pointer to an existing TB that matches @tb. + * Note that in !user-mode, another thread might have already added a TB + * for the same block of guest code that @tb corresponds to. In that case, + * the caller should discard the original @tb, and use instead the returned TB. + */ +TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc, + tb_page_addr_t phys_page2) +{ + PageDesc *p; + PageDesc *p2 = NULL; + void *existing_tb = NULL; + uint32_t h; + + assert_memory_lock(); + tcg_debug_assert(!(tb->cflags & CF_INVALID)); + + /* + * Add the TB to the page list, acquiring first the pages's locks. + * We keep the locks held until after inserting the TB in the hash table, + * so that if the insertion fails we know for sure that the TBs are still + * in the page descriptors. + * Note that inserting into the hash table first isn't an option, since + * we can only insert TBs that are fully initialized. + */ + page_lock_pair(&p, phys_pc, &p2, phys_page2, true); + tb_page_add(p, tb, 0, phys_pc); + if (p2) { + tb_page_add(p2, tb, 1, phys_page2); + } + + /* add in the hash table */ + h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)), + tb->flags, tb->cflags, tb->trace_vcpu_dstate); + qht_insert(&tb_ctx.htable, tb, h, &existing_tb); + + /* remove TB from the page(s) if we couldn't insert it */ + if (unlikely(existing_tb)) { + tb_page_remove(p, tb); + if (p2) { + tb_page_remove(p2, tb); + } + tb = existing_tb; + } + + if (p2 && p2 != p) { + page_unlock(p2); + } + page_unlock(p); + return tb; +} + +/* + * @p must be non-NULL. + * user-mode: call with mmap_lock held. + * !user-mode: call with all @pages locked. + */ +static void +tb_invalidate_phys_page_range__locked(struct page_collection *pages, + PageDesc *p, tb_page_addr_t start, + tb_page_addr_t end, + uintptr_t retaddr) +{ + TranslationBlock *tb; + tb_page_addr_t tb_start, tb_end; + int n; +#ifdef TARGET_HAS_PRECISE_SMC + CPUState *cpu = current_cpu; + bool current_tb_not_found = retaddr != 0; + bool current_tb_modified = false; + TranslationBlock *current_tb = NULL; +#endif /* TARGET_HAS_PRECISE_SMC */ + + assert_page_locked(p); + + /* + * We remove all the TBs in the range [start, end[. + * XXX: see if in some cases it could be faster to invalidate all the code + */ + PAGE_FOR_EACH_TB(p, tb, n) { + assert_page_locked(p); + /* NOTE: this is subtle as a TB may span two physical pages */ + if (n == 0) { + /* NOTE: tb_end may be after the end of the page, but + it is not a problem */ + tb_start = tb_page_addr0(tb); + tb_end = tb_start + tb->size; + } else { + tb_start = tb_page_addr1(tb); + tb_end = tb_start + ((tb_page_addr0(tb) + tb->size) + & ~TARGET_PAGE_MASK); + } + if (!(tb_end <= start || tb_start >= end)) { +#ifdef TARGET_HAS_PRECISE_SMC + if (current_tb_not_found) { + current_tb_not_found = false; + /* now we have a real cpu fault */ + current_tb = tcg_tb_lookup(retaddr); + } + if (current_tb == tb && + (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) { + /* + * If we are modifying the current TB, we must stop + * its execution. We could be more precise by checking + * that the modification is after the current PC, but it + * would require a specialized function to partially + * restore the CPU state. + */ + current_tb_modified = true; + cpu_restore_state_from_tb(cpu, current_tb, retaddr, true); + } +#endif /* TARGET_HAS_PRECISE_SMC */ + tb_phys_invalidate__locked(tb); + } + } +#if !defined(CONFIG_USER_ONLY) + /* if no code remaining, no need to continue to use slow writes */ + if (!p->first_tb) { + tlb_unprotect_code(start); + } +#endif +#ifdef TARGET_HAS_PRECISE_SMC + if (current_tb_modified) { + page_collection_unlock(pages); + /* Force execution of one insn next time. */ + cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(cpu); + mmap_unlock(); + cpu_loop_exit_noexc(cpu); + } +#endif +} + +/* + * Invalidate all TBs which intersect with the target physical + * address page @addr. + * + * Called with mmap_lock held for user-mode emulation + */ +void tb_invalidate_phys_page(tb_page_addr_t addr) +{ + struct page_collection *pages; + tb_page_addr_t start, end; + PageDesc *p; + + assert_memory_lock(); + + p = page_find(addr >> TARGET_PAGE_BITS); + if (p == NULL) { + return; + } + + start = addr & TARGET_PAGE_MASK; + end = start + TARGET_PAGE_SIZE; + pages = page_collection_lock(start, end); + tb_invalidate_phys_page_range__locked(pages, p, start, end, 0); + page_collection_unlock(pages); +} + +/* + * Invalidate all TBs which intersect with the target physical address range + * [start;end[. NOTE: start and end may refer to *different* physical pages. + * 'is_cpu_write_access' should be true if called from a real cpu write + * access: the virtual CPU will exit the current TB if code is modified inside + * this TB. + * + * Called with mmap_lock held for user-mode emulation. + */ +void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end) +{ + struct page_collection *pages; + tb_page_addr_t next; + + assert_memory_lock(); + + pages = page_collection_lock(start, end); + for (next = (start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE; + start < end; + start = next, next += TARGET_PAGE_SIZE) { + PageDesc *pd = page_find(start >> TARGET_PAGE_BITS); + tb_page_addr_t bound = MIN(next, end); + + if (pd == NULL) { + continue; + } + tb_invalidate_phys_page_range__locked(pages, pd, start, bound, 0); + } + page_collection_unlock(pages); +} + +#ifdef CONFIG_SOFTMMU +/* + * len must be <= 8 and start must be a multiple of len. + * Called via softmmu_template.h when code areas are written to with + * iothread mutex not held. + * + * Call with all @pages in the range [@start, @start + len[ locked. + */ +void tb_invalidate_phys_page_fast(struct page_collection *pages, + tb_page_addr_t start, int len, + uintptr_t retaddr) +{ + PageDesc *p; + + assert_memory_lock(); + + p = page_find(start >> TARGET_PAGE_BITS); + if (!p) { + return; + } + + assert_page_locked(p); + tb_invalidate_phys_page_range__locked(pages, p, start, start + len, + retaddr); +} +#else +/* + * Called with mmap_lock held. If pc is not 0 then it indicates the + * host PC of the faulting store instruction that caused this invalidate. + * Returns true if the caller needs to abort execution of the current + * TB (because it was modified by this store and the guest CPU has + * precise-SMC semantics). + */ +bool tb_invalidate_phys_page_unwind(tb_page_addr_t addr, uintptr_t pc) +{ + TranslationBlock *tb; + PageDesc *p; + int n; +#ifdef TARGET_HAS_PRECISE_SMC + TranslationBlock *current_tb = NULL; + CPUState *cpu = current_cpu; + bool current_tb_modified = false; +#endif + + assert_memory_lock(); + + addr &= TARGET_PAGE_MASK; + p = page_find(addr >> TARGET_PAGE_BITS); + if (!p) { + return false; + } + +#ifdef TARGET_HAS_PRECISE_SMC + if (p->first_tb && pc != 0) { + current_tb = tcg_tb_lookup(pc); + } +#endif + assert_page_locked(p); + PAGE_FOR_EACH_TB(p, tb, n) { +#ifdef TARGET_HAS_PRECISE_SMC + if (current_tb == tb && + (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) { + /* + * If we are modifying the current TB, we must stop its execution. + * We could be more precise by checking that the modification is + * after the current PC, but it would require a specialized + * function to partially restore the CPU state. + */ + current_tb_modified = true; + cpu_restore_state_from_tb(cpu, current_tb, pc, true); + } +#endif /* TARGET_HAS_PRECISE_SMC */ + tb_phys_invalidate(tb, addr); + } + p->first_tb = (uintptr_t)NULL; +#ifdef TARGET_HAS_PRECISE_SMC + if (current_tb_modified) { + /* Force execution of one insn next time. */ + cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(cpu); + return true; + } +#endif + + return false; +} +#endif diff --git a/accel/tcg/tcg-accel-ops-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c index ba997f6cfe..d50239e0e2 100644 --- a/accel/tcg/tcg-accel-ops-mttcg.c +++ b/accel/tcg/tcg-accel-ops-mttcg.c @@ -70,8 +70,6 @@ static void *mttcg_cpu_thread_fn(void *arg) assert(tcg_enabled()); g_assert(!icount_enabled()); - tcg_cpu_init_cflags(cpu, current_machine->smp.max_cpus > 1); - rcu_register_thread(); force_rcu.notifier.notify = mttcg_force_rcu; force_rcu.cpu = cpu; @@ -141,6 +139,9 @@ void mttcg_start_vcpu_thread(CPUState *cpu) { char thread_name[VCPU_THREAD_NAME_SIZE]; + g_assert(tcg_enabled()); + tcg_cpu_init_cflags(cpu, current_machine->smp.max_cpus > 1); + cpu->thread = g_new0(QemuThread, 1); cpu->halt_cond = g_malloc0(sizeof(QemuCond)); qemu_cond_init(cpu->halt_cond); diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c index cc8adc2380..290833a37f 100644 --- a/accel/tcg/tcg-accel-ops-rr.c +++ b/accel/tcg/tcg-accel-ops-rr.c @@ -51,7 +51,7 @@ void rr_kick_vcpu_thread(CPUState *unused) * * The kick timer is responsible for moving single threaded vCPU * emulation on to the next vCPU. If more than one vCPU is running a - * timer event with force a cpu->exit so the next vCPU can get + * timer event we force a cpu->exit so the next vCPU can get * scheduled. * * The timer is removed if all vCPUs are idle and restarted again once @@ -152,9 +152,7 @@ static void *rr_cpu_thread_fn(void *arg) Notifier force_rcu; CPUState *cpu = arg; - g_assert(tcg_enabled()); - tcg_cpu_init_cflags(cpu, false); - + assert(tcg_enabled()); rcu_register_thread(); force_rcu.notify = rr_force_rcu; rcu_add_force_rcu_notifier(&force_rcu); @@ -277,6 +275,9 @@ void rr_start_vcpu_thread(CPUState *cpu) static QemuCond *single_tcg_halt_cond; static QemuThread *single_tcg_cpu_thread; + g_assert(tcg_enabled()); + tcg_cpu_init_cflags(cpu, false); + if (!single_tcg_cpu_thread) { cpu->thread = g_new0(QemuThread, 1); cpu->halt_cond = g_new0(QemuCond, 1); diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index 4ed75a13e1..f185356a36 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -63,57 +63,7 @@ #include "tb-context.h" #include "internal.h" -/* #define DEBUG_TB_INVALIDATE */ -/* #define DEBUG_TB_FLUSH */ /* make various TB consistency checks */ -/* #define DEBUG_TB_CHECK */ - -#ifdef DEBUG_TB_INVALIDATE -#define DEBUG_TB_INVALIDATE_GATE 1 -#else -#define DEBUG_TB_INVALIDATE_GATE 0 -#endif - -#ifdef DEBUG_TB_FLUSH -#define DEBUG_TB_FLUSH_GATE 1 -#else -#define DEBUG_TB_FLUSH_GATE 0 -#endif - -#if !defined(CONFIG_USER_ONLY) -/* TB consistency checks only implemented for usermode emulation. */ -#undef DEBUG_TB_CHECK -#endif - -#ifdef DEBUG_TB_CHECK -#define DEBUG_TB_CHECK_GATE 1 -#else -#define DEBUG_TB_CHECK_GATE 0 -#endif - -/* Access to the various translations structures need to be serialised via locks - * for consistency. - * In user-mode emulation access to the memory related structures are protected - * with mmap_lock. - * In !user-mode we use per-page locks. - */ -#ifdef CONFIG_SOFTMMU -#define assert_memory_lock() -#else -#define assert_memory_lock() tcg_debug_assert(have_mmap_lock()) -#endif - -typedef struct PageDesc { - /* list of TBs intersecting this ram page */ - uintptr_t first_tb; -#ifdef CONFIG_USER_ONLY - unsigned long flags; - void *target_data; -#endif -#ifdef CONFIG_SOFTMMU - QemuSpin lock; -#endif -} PageDesc; /** * struct page_entry - page descriptor entry @@ -159,18 +109,6 @@ struct page_collection { struct page_entry *max; }; -/* list iterators for lists of tagged pointers in TranslationBlock */ -#define TB_FOR_EACH_TAGGED(head, tb, n, field) \ - for (n = (head) & 1, tb = (TranslationBlock *)((head) & ~1); \ - tb; tb = (TranslationBlock *)tb->field[n], n = (uintptr_t)tb & 1, \ - tb = (TranslationBlock *)((uintptr_t)tb & ~1)) - -#define PAGE_FOR_EACH_TB(pagedesc, tb, n) \ - TB_FOR_EACH_TAGGED((pagedesc)->first_tb, tb, n, page_next) - -#define TB_FOR_EACH_JMP(head_tb, tb, n) \ - TB_FOR_EACH_TAGGED((head_tb)->jmp_list_head, tb, n, jmp_list_next) - /* * In system mode we want L1_MAP to be based on ram offsets, * while in user mode we want it to be based on virtual addresses. @@ -188,10 +126,6 @@ struct page_collection { # define L1_MAP_ADDR_SPACE_BITS MIN(HOST_LONG_BITS, TARGET_ABI_BITS) #endif -/* Size of the L2 (and L3, etc) page tables. */ -#define V_L2_BITS 10 -#define V_L2_SIZE (1 << V_L2_BITS) - /* Make sure all possible CPU event bits fit in tb->trace_vcpu_dstate */ QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS > sizeof_field(TranslationBlock, trace_vcpu_dstate) @@ -200,18 +134,11 @@ QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS > /* * L1 Mapping properties */ -static int v_l1_size; -static int v_l1_shift; -static int v_l2_levels; +int v_l1_size; +int v_l1_shift; +int v_l2_levels; -/* The bottom level has pointers to PageDesc, and is indexed by - * anything from 4 to (V_L2_BITS + 3) bits, depending on target page size. - */ -#define V_L1_MIN_BITS 4 -#define V_L1_MAX_BITS (V_L2_BITS + 3) -#define V_L1_MAX_SIZE (1 << V_L1_MAX_BITS) - -static void *l1_map[V_L1_MAX_SIZE]; +void *l1_map[V_L1_MAX_SIZE]; TBContext tb_ctx; @@ -324,12 +251,11 @@ static int encode_search(TranslationBlock *tb, uint8_t *block) * When reset_icount is true, current TB will be interrupted and * icount should be recalculated. */ -static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb, - uintptr_t searched_pc, bool reset_icount) +int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb, + uintptr_t searched_pc, bool reset_icount) { - target_ulong data[TARGET_INSN_START_WORDS]; + uint64_t data[TARGET_INSN_START_WORDS]; uintptr_t host_pc = (uintptr_t)tb->tc.ptr; - CPUArchState *env = cpu->env_ptr; const uint8_t *p = tb->tc.ptr + tb->tc.size; int i, j, num_insns = tb->icount; #ifdef CONFIG_PROFILER @@ -368,7 +294,8 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb, and shift if to the number of actually executed instructions */ cpu_neg(cpu)->icount_decr.u16.low += num_insns - i; } - restore_state_to_opc(env, tb, data); + + cpu->cc->tcg_ops->restore_state_to_opc(cpu, tb, data); #ifdef CONFIG_PROFILER qatomic_set(&prof->restore_time, @@ -381,6 +308,14 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb, bool cpu_restore_state(CPUState *cpu, uintptr_t host_pc, bool will_exit) { /* + * The pc update associated with restore without exit will + * break the relative pc adjustments performed by TARGET_TB_PCREL. + */ + if (TARGET_TB_PCREL) { + assert(will_exit); + } + + /* * The host_pc has to be in the rx region of the code buffer. * If it is not we will not be able to resolve it here. * The two cases where host_pc will not be correct are: @@ -471,7 +406,7 @@ void page_init(void) #endif } -static PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc) +PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc) { PageDesc *pd; void **lp; @@ -537,31 +472,8 @@ static PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc) return pd + (index & (V_L2_SIZE - 1)); } -static inline PageDesc *page_find(tb_page_addr_t index) -{ - return page_find_alloc(index, false); -} - -static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1, - PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc); - /* In user-mode page locks aren't used; mmap_lock is enough */ #ifdef CONFIG_USER_ONLY - -#define assert_page_locked(pd) tcg_debug_assert(have_mmap_lock()) - -static inline void page_lock(PageDesc *pd) -{ } - -static inline void page_unlock(PageDesc *pd) -{ } - -static inline void page_lock_tb(const TranslationBlock *tb) -{ } - -static inline void page_unlock_tb(const TranslationBlock *tb) -{ } - struct page_collection * page_collection_lock(tb_page_addr_t start, tb_page_addr_t end) { @@ -610,8 +522,7 @@ static void page_unlock__debug(const PageDesc *pd) g_assert(removed); } -static void -do_assert_page_locked(const PageDesc *pd, const char *file, int line) +void do_assert_page_locked(const PageDesc *pd, const char *file, int line) { if (unlikely(!page_is_locked(pd))) { error_report("assert_page_lock: PageDesc %p not locked @ %s:%d", @@ -620,8 +531,6 @@ do_assert_page_locked(const PageDesc *pd, const char *file, int line) } } -#define assert_page_locked(pd) do_assert_page_locked(pd, __FILE__, __LINE__) - void assert_no_pages_locked(void) { ht_pages_locked_debug_init(); @@ -630,50 +539,23 @@ void assert_no_pages_locked(void) #else /* !CONFIG_DEBUG_TCG */ -#define assert_page_locked(pd) - -static inline void page_lock__debug(const PageDesc *pd) -{ -} - -static inline void page_unlock__debug(const PageDesc *pd) -{ -} +static inline void page_lock__debug(const PageDesc *pd) { } +static inline void page_unlock__debug(const PageDesc *pd) { } #endif /* CONFIG_DEBUG_TCG */ -static inline void page_lock(PageDesc *pd) +void page_lock(PageDesc *pd) { page_lock__debug(pd); qemu_spin_lock(&pd->lock); } -static inline void page_unlock(PageDesc *pd) +void page_unlock(PageDesc *pd) { qemu_spin_unlock(&pd->lock); page_unlock__debug(pd); } -/* lock the page(s) of a TB in the correct acquisition order */ -static inline void page_lock_tb(const TranslationBlock *tb) -{ - page_lock_pair(NULL, tb->page_addr[0], NULL, tb->page_addr[1], false); -} - -static inline void page_unlock_tb(const TranslationBlock *tb) -{ - PageDesc *p1 = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS); - - page_unlock(p1); - if (unlikely(tb->page_addr[1] != -1)) { - PageDesc *p2 = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS); - - if (p2 != p1) { - page_unlock(p2); - } - } -} - static inline struct page_entry * page_entry_new(PageDesc *pd, tb_page_addr_t index) { @@ -824,9 +706,9 @@ page_collection_lock(tb_page_addr_t start, tb_page_addr_t end) } assert_page_locked(pd); PAGE_FOR_EACH_TB(pd, tb, n) { - if (page_trylock_add(set, tb->page_addr[0]) || - (tb->page_addr[1] != -1 && - page_trylock_add(set, tb->page_addr[1]))) { + if (page_trylock_add(set, tb_page_addr0(tb)) || + (tb_page_addr1(tb) != -1 && + page_trylock_add(set, tb_page_addr1(tb)))) { /* drop all locks, and reacquire in order */ g_tree_foreach(set->tree, page_entry_unlock, NULL); goto retry; @@ -845,509 +727,6 @@ void page_collection_unlock(struct page_collection *set) #endif /* !CONFIG_USER_ONLY */ -static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1, - PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc) -{ - PageDesc *p1, *p2; - tb_page_addr_t page1; - tb_page_addr_t page2; - - assert_memory_lock(); - g_assert(phys1 != -1); - - page1 = phys1 >> TARGET_PAGE_BITS; - page2 = phys2 >> TARGET_PAGE_BITS; - - p1 = page_find_alloc(page1, alloc); - if (ret_p1) { - *ret_p1 = p1; - } - if (likely(phys2 == -1)) { - page_lock(p1); - return; - } else if (page1 == page2) { - page_lock(p1); - if (ret_p2) { - *ret_p2 = p1; - } - return; - } - p2 = page_find_alloc(page2, alloc); - if (ret_p2) { - *ret_p2 = p2; - } - if (page1 < page2) { - page_lock(p1); - page_lock(p2); - } else { - page_lock(p2); - page_lock(p1); - } -} - -static bool tb_cmp(const void *ap, const void *bp) -{ - const TranslationBlock *a = ap; - const TranslationBlock *b = bp; - - return ((TARGET_TB_PCREL || tb_pc(a) == tb_pc(b)) && - a->cs_base == b->cs_base && - a->flags == b->flags && - (tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) && - a->trace_vcpu_dstate == b->trace_vcpu_dstate && - a->page_addr[0] == b->page_addr[0] && - a->page_addr[1] == b->page_addr[1]); -} - -void tb_htable_init(void) -{ - unsigned int mode = QHT_MODE_AUTO_RESIZE; - - qht_init(&tb_ctx.htable, tb_cmp, CODE_GEN_HTABLE_SIZE, mode); -} - -/* Set to NULL all the 'first_tb' fields in all PageDescs. */ -static void page_flush_tb_1(int level, void **lp) -{ - int i; - - if (*lp == NULL) { - return; - } - if (level == 0) { - PageDesc *pd = *lp; - - for (i = 0; i < V_L2_SIZE; ++i) { - page_lock(&pd[i]); - pd[i].first_tb = (uintptr_t)NULL; - page_unlock(&pd[i]); - } - } else { - void **pp = *lp; - - for (i = 0; i < V_L2_SIZE; ++i) { - page_flush_tb_1(level - 1, pp + i); - } - } -} - -static void page_flush_tb(void) -{ - int i, l1_sz = v_l1_size; - - for (i = 0; i < l1_sz; i++) { - page_flush_tb_1(v_l2_levels, l1_map + i); - } -} - -static gboolean tb_host_size_iter(gpointer key, gpointer value, gpointer data) -{ - const TranslationBlock *tb = value; - size_t *size = data; - - *size += tb->tc.size; - return false; -} - -/* flush all the translation blocks */ -static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count) -{ - bool did_flush = false; - - mmap_lock(); - /* If it is already been done on request of another CPU, - * just retry. - */ - if (tb_ctx.tb_flush_count != tb_flush_count.host_int) { - goto done; - } - did_flush = true; - - if (DEBUG_TB_FLUSH_GATE) { - size_t nb_tbs = tcg_nb_tbs(); - size_t host_size = 0; - - tcg_tb_foreach(tb_host_size_iter, &host_size); - printf("qemu: flush code_size=%zu nb_tbs=%zu avg_tb_size=%zu\n", - tcg_code_size(), nb_tbs, nb_tbs > 0 ? host_size / nb_tbs : 0); - } - - CPU_FOREACH(cpu) { - tcg_flush_jmp_cache(cpu); - } - - qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE); - page_flush_tb(); - - tcg_region_reset_all(); - /* XXX: flush processor icache at this point if cache flush is - expensive */ - qatomic_mb_set(&tb_ctx.tb_flush_count, tb_ctx.tb_flush_count + 1); - -done: - mmap_unlock(); - if (did_flush) { - qemu_plugin_flush_cb(); - } -} - -void tb_flush(CPUState *cpu) -{ - if (tcg_enabled()) { - unsigned tb_flush_count = qatomic_mb_read(&tb_ctx.tb_flush_count); - - if (cpu_in_exclusive_context(cpu)) { - do_tb_flush(cpu, RUN_ON_CPU_HOST_INT(tb_flush_count)); - } else { - async_safe_run_on_cpu(cpu, do_tb_flush, - RUN_ON_CPU_HOST_INT(tb_flush_count)); - } - } -} - -/* - * Formerly ifdef DEBUG_TB_CHECK. These debug functions are user-mode-only, - * so in order to prevent bit rot we compile them unconditionally in user-mode, - * and let the optimizer get rid of them by wrapping their user-only callers - * with if (DEBUG_TB_CHECK_GATE). - */ -#ifdef CONFIG_USER_ONLY - -static void do_tb_invalidate_check(void *p, uint32_t hash, void *userp) -{ - TranslationBlock *tb = p; - target_ulong addr = *(target_ulong *)userp; - - if (!(addr + TARGET_PAGE_SIZE <= tb_pc(tb) || - addr >= tb_pc(tb) + tb->size)) { - printf("ERROR invalidate: address=" TARGET_FMT_lx - " PC=%08lx size=%04x\n", addr, (long)tb_pc(tb), tb->size); - } -} - -/* verify that all the pages have correct rights for code - * - * Called with mmap_lock held. - */ -static void tb_invalidate_check(target_ulong address) -{ - address &= TARGET_PAGE_MASK; - qht_iter(&tb_ctx.htable, do_tb_invalidate_check, &address); -} - -static void do_tb_page_check(void *p, uint32_t hash, void *userp) -{ - TranslationBlock *tb = p; - int flags1, flags2; - - flags1 = page_get_flags(tb_pc(tb)); - flags2 = page_get_flags(tb_pc(tb) + tb->size - 1); - if ((flags1 & PAGE_WRITE) || (flags2 & PAGE_WRITE)) { - printf("ERROR page flags: PC=%08lx size=%04x f1=%x f2=%x\n", - (long)tb_pc(tb), tb->size, flags1, flags2); - } -} - -/* verify that all the pages have correct rights for code */ -static void tb_page_check(void) -{ - qht_iter(&tb_ctx.htable, do_tb_page_check, NULL); -} - -#endif /* CONFIG_USER_ONLY */ - -/* - * user-mode: call with mmap_lock held - * !user-mode: call with @pd->lock held - */ -static inline void tb_page_remove(PageDesc *pd, TranslationBlock *tb) -{ - TranslationBlock *tb1; - uintptr_t *pprev; - unsigned int n1; - - assert_page_locked(pd); - pprev = &pd->first_tb; - PAGE_FOR_EACH_TB(pd, tb1, n1) { - if (tb1 == tb) { - *pprev = tb1->page_next[n1]; - return; - } - pprev = &tb1->page_next[n1]; - } - g_assert_not_reached(); -} - -/* remove @orig from its @n_orig-th jump list */ -static inline void tb_remove_from_jmp_list(TranslationBlock *orig, int n_orig) -{ - uintptr_t ptr, ptr_locked; - TranslationBlock *dest; - TranslationBlock *tb; - uintptr_t *pprev; - int n; - - /* mark the LSB of jmp_dest[] so that no further jumps can be inserted */ - ptr = qatomic_or_fetch(&orig->jmp_dest[n_orig], 1); - dest = (TranslationBlock *)(ptr & ~1); - if (dest == NULL) { - return; - } - - qemu_spin_lock(&dest->jmp_lock); - /* - * While acquiring the lock, the jump might have been removed if the - * destination TB was invalidated; check again. - */ - ptr_locked = qatomic_read(&orig->jmp_dest[n_orig]); - if (ptr_locked != ptr) { - qemu_spin_unlock(&dest->jmp_lock); - /* - * The only possibility is that the jump was unlinked via - * tb_jump_unlink(dest). Seeing here another destination would be a bug, - * because we set the LSB above. - */ - g_assert(ptr_locked == 1 && dest->cflags & CF_INVALID); - return; - } - /* - * We first acquired the lock, and since the destination pointer matches, - * we know for sure that @orig is in the jmp list. - */ - pprev = &dest->jmp_list_head; - TB_FOR_EACH_JMP(dest, tb, n) { - if (tb == orig && n == n_orig) { - *pprev = tb->jmp_list_next[n]; - /* no need to set orig->jmp_dest[n]; setting the LSB was enough */ - qemu_spin_unlock(&dest->jmp_lock); - return; - } - pprev = &tb->jmp_list_next[n]; - } - g_assert_not_reached(); -} - -/* reset the jump entry 'n' of a TB so that it is not chained to - another TB */ -static inline void tb_reset_jump(TranslationBlock *tb, int n) -{ - uintptr_t addr = (uintptr_t)(tb->tc.ptr + tb->jmp_reset_offset[n]); - tb_set_jmp_target(tb, n, addr); -} - -/* remove any jumps to the TB */ -static inline void tb_jmp_unlink(TranslationBlock *dest) -{ - TranslationBlock *tb; - int n; - - qemu_spin_lock(&dest->jmp_lock); - - TB_FOR_EACH_JMP(dest, tb, n) { - tb_reset_jump(tb, n); - qatomic_and(&tb->jmp_dest[n], (uintptr_t)NULL | 1); - /* No need to clear the list entry; setting the dest ptr is enough */ - } - dest->jmp_list_head = (uintptr_t)NULL; - - qemu_spin_unlock(&dest->jmp_lock); -} - -static void tb_jmp_cache_inval_tb(TranslationBlock *tb) -{ - CPUState *cpu; - - if (TARGET_TB_PCREL) { - /* A TB may be at any virtual address */ - CPU_FOREACH(cpu) { - tcg_flush_jmp_cache(cpu); - } - } else { - uint32_t h = tb_jmp_cache_hash_func(tb_pc(tb)); - - CPU_FOREACH(cpu) { - CPUJumpCache *jc = cpu->tb_jmp_cache; - - if (qatomic_read(&jc->array[h].tb) == tb) { - qatomic_set(&jc->array[h].tb, NULL); - } - } - } -} - -/* - * In user-mode, call with mmap_lock held. - * In !user-mode, if @rm_from_page_list is set, call with the TB's pages' - * locks held. - */ -static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list) -{ - PageDesc *p; - uint32_t h; - tb_page_addr_t phys_pc; - uint32_t orig_cflags = tb_cflags(tb); - - assert_memory_lock(); - - /* make sure no further incoming jumps will be chained to this TB */ - qemu_spin_lock(&tb->jmp_lock); - qatomic_set(&tb->cflags, tb->cflags | CF_INVALID); - qemu_spin_unlock(&tb->jmp_lock); - - /* remove the TB from the hash list */ - phys_pc = tb->page_addr[0]; - h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)), - tb->flags, orig_cflags, tb->trace_vcpu_dstate); - if (!qht_remove(&tb_ctx.htable, tb, h)) { - return; - } - - /* remove the TB from the page list */ - if (rm_from_page_list) { - p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS); - tb_page_remove(p, tb); - if (tb->page_addr[1] != -1) { - p = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS); - tb_page_remove(p, tb); - } - } - - /* remove the TB from the hash list */ - tb_jmp_cache_inval_tb(tb); - - /* suppress this TB from the two jump lists */ - tb_remove_from_jmp_list(tb, 0); - tb_remove_from_jmp_list(tb, 1); - - /* suppress any remaining jumps to this TB */ - tb_jmp_unlink(tb); - - qatomic_set(&tb_ctx.tb_phys_invalidate_count, - tb_ctx.tb_phys_invalidate_count + 1); -} - -static void tb_phys_invalidate__locked(TranslationBlock *tb) -{ - qemu_thread_jit_write(); - do_tb_phys_invalidate(tb, true); - qemu_thread_jit_execute(); -} - -/* invalidate one TB - * - * Called with mmap_lock held in user-mode. - */ -void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr) -{ - if (page_addr == -1 && tb->page_addr[0] != -1) { - page_lock_tb(tb); - do_tb_phys_invalidate(tb, true); - page_unlock_tb(tb); - } else { - do_tb_phys_invalidate(tb, false); - } -} - -/* add the tb in the target page and protect it if necessary - * - * Called with mmap_lock held for user-mode emulation. - * Called with @p->lock held in !user-mode. - */ -static inline void tb_page_add(PageDesc *p, TranslationBlock *tb, - unsigned int n, tb_page_addr_t page_addr) -{ -#ifndef CONFIG_USER_ONLY - bool page_already_protected; -#endif - - assert_page_locked(p); - - tb->page_addr[n] = page_addr; - tb->page_next[n] = p->first_tb; -#ifndef CONFIG_USER_ONLY - page_already_protected = p->first_tb != (uintptr_t)NULL; -#endif - p->first_tb = (uintptr_t)tb | n; - -#if defined(CONFIG_USER_ONLY) - /* translator_loop() must have made all TB pages non-writable */ - assert(!(p->flags & PAGE_WRITE)); -#else - /* if some code is already present, then the pages are already - protected. So we handle the case where only the first TB is - allocated in a physical page */ - if (!page_already_protected) { - tlb_protect_code(page_addr); - } -#endif -} - -/* - * Add a new TB and link it to the physical page tables. phys_page2 is - * (-1) to indicate that only one page contains the TB. - * - * Called with mmap_lock held for user-mode emulation. - * - * Returns a pointer @tb, or a pointer to an existing TB that matches @tb. - * Note that in !user-mode, another thread might have already added a TB - * for the same block of guest code that @tb corresponds to. In that case, - * the caller should discard the original @tb, and use instead the returned TB. - */ -static TranslationBlock * -tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc, - tb_page_addr_t phys_page2) -{ - PageDesc *p; - PageDesc *p2 = NULL; - void *existing_tb = NULL; - uint32_t h; - - assert_memory_lock(); - tcg_debug_assert(!(tb->cflags & CF_INVALID)); - - /* - * Add the TB to the page list, acquiring first the pages's locks. - * We keep the locks held until after inserting the TB in the hash table, - * so that if the insertion fails we know for sure that the TBs are still - * in the page descriptors. - * Note that inserting into the hash table first isn't an option, since - * we can only insert TBs that are fully initialized. - */ - page_lock_pair(&p, phys_pc, &p2, phys_page2, true); - tb_page_add(p, tb, 0, phys_pc); - if (p2) { - tb_page_add(p2, tb, 1, phys_page2); - } else { - tb->page_addr[1] = -1; - } - - /* add in the hash table */ - h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)), - tb->flags, tb->cflags, tb->trace_vcpu_dstate); - qht_insert(&tb_ctx.htable, tb, h, &existing_tb); - - /* remove TB from the page(s) if we couldn't insert it */ - if (unlikely(existing_tb)) { - tb_page_remove(p, tb); - if (p2) { - tb_page_remove(p2, tb); - } - tb = existing_tb; - } - - if (p2 && p2 != p) { - page_unlock(p2); - } - page_unlock(p); - -#ifdef CONFIG_USER_ONLY - if (DEBUG_TB_CHECK_GATE) { - tb_page_check(); - } -#endif - return tb; -} - /* Called with mmap_lock held for user mode emulation. */ TranslationBlock *tb_gen_code(CPUState *cpu, target_ulong pc, target_ulong cs_base, @@ -1400,8 +779,8 @@ TranslationBlock *tb_gen_code(CPUState *cpu, tb->flags = flags; tb->cflags = cflags; tb->trace_vcpu_dstate = *cpu->trace_dstate; - tb->page_addr[0] = phys_pc; - tb->page_addr[1] = -1; + tb_set_page_addr0(tb, phys_pc); + tb_set_page_addr1(tb, -1); tcg_ctx->tb_cflags = cflags; tb_overflow: @@ -1599,7 +978,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu, * a temporary one-insn TB, and we have nothing left to do. Return early * before attempting to link to other TBs or add to the lookup table. */ - if (tb->page_addr[0] == -1) { + if (tb_page_addr0(tb) == -1) { return tb; } @@ -1614,7 +993,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu, * No explicit memory barrier is required -- tb_link_page() makes the * TB visible in a consistent state. */ - existing_tb = tb_link_page(tb, tb->page_addr[0], tb->page_addr[1]); + existing_tb = tb_link_page(tb, tb_page_addr0(tb), tb_page_addr1(tb)); /* if the TB already exists, discard what we just translated */ if (unlikely(existing_tb != tb)) { uintptr_t orig_aligned = (uintptr_t)gen_code_buf; @@ -1627,251 +1006,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu, return tb; } -/* - * @p must be non-NULL. - * user-mode: call with mmap_lock held. - * !user-mode: call with all @pages locked. - */ -static void -tb_invalidate_phys_page_range__locked(struct page_collection *pages, - PageDesc *p, tb_page_addr_t start, - tb_page_addr_t end, - uintptr_t retaddr) -{ - TranslationBlock *tb; - tb_page_addr_t tb_start, tb_end; - int n; -#ifdef TARGET_HAS_PRECISE_SMC - CPUState *cpu = current_cpu; - CPUArchState *env = NULL; - bool current_tb_not_found = retaddr != 0; - bool current_tb_modified = false; - TranslationBlock *current_tb = NULL; - target_ulong current_pc = 0; - target_ulong current_cs_base = 0; - uint32_t current_flags = 0; -#endif /* TARGET_HAS_PRECISE_SMC */ - - assert_page_locked(p); - -#if defined(TARGET_HAS_PRECISE_SMC) - if (cpu != NULL) { - env = cpu->env_ptr; - } -#endif - - /* we remove all the TBs in the range [start, end[ */ - /* XXX: see if in some cases it could be faster to invalidate all - the code */ - PAGE_FOR_EACH_TB(p, tb, n) { - assert_page_locked(p); - /* NOTE: this is subtle as a TB may span two physical pages */ - if (n == 0) { - /* NOTE: tb_end may be after the end of the page, but - it is not a problem */ - tb_start = tb->page_addr[0]; - tb_end = tb_start + tb->size; - } else { - tb_start = tb->page_addr[1]; - tb_end = tb_start + ((tb->page_addr[0] + tb->size) - & ~TARGET_PAGE_MASK); - } - if (!(tb_end <= start || tb_start >= end)) { -#ifdef TARGET_HAS_PRECISE_SMC - if (current_tb_not_found) { - current_tb_not_found = false; - /* now we have a real cpu fault */ - current_tb = tcg_tb_lookup(retaddr); - } - if (current_tb == tb && - (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) { - /* - * If we are modifying the current TB, we must stop - * its execution. We could be more precise by checking - * that the modification is after the current PC, but it - * would require a specialized function to partially - * restore the CPU state. - */ - current_tb_modified = true; - cpu_restore_state_from_tb(cpu, current_tb, retaddr, true); - cpu_get_tb_cpu_state(env, ¤t_pc, ¤t_cs_base, - ¤t_flags); - } -#endif /* TARGET_HAS_PRECISE_SMC */ - tb_phys_invalidate__locked(tb); - } - } -#if !defined(CONFIG_USER_ONLY) - /* if no code remaining, no need to continue to use slow writes */ - if (!p->first_tb) { - tlb_unprotect_code(start); - } -#endif -#ifdef TARGET_HAS_PRECISE_SMC - if (current_tb_modified) { - page_collection_unlock(pages); - /* Force execution of one insn next time. */ - cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(cpu); - mmap_unlock(); - cpu_loop_exit_noexc(cpu); - } -#endif -} - -/* - * Invalidate all TBs which intersect with the target physical address range - * [start;end[. NOTE: start and end must refer to the *same* physical page. - * 'is_cpu_write_access' should be true if called from a real cpu write - * access: the virtual CPU will exit the current TB if code is modified inside - * this TB. - * - * Called with mmap_lock held for user-mode emulation - */ -void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end) -{ - struct page_collection *pages; - PageDesc *p; - - assert_memory_lock(); - - p = page_find(start >> TARGET_PAGE_BITS); - if (p == NULL) { - return; - } - pages = page_collection_lock(start, end); - tb_invalidate_phys_page_range__locked(pages, p, start, end, 0); - page_collection_unlock(pages); -} - -/* - * Invalidate all TBs which intersect with the target physical address range - * [start;end[. NOTE: start and end may refer to *different* physical pages. - * 'is_cpu_write_access' should be true if called from a real cpu write - * access: the virtual CPU will exit the current TB if code is modified inside - * this TB. - * - * Called with mmap_lock held for user-mode emulation. - */ -#ifdef CONFIG_SOFTMMU -void tb_invalidate_phys_range(ram_addr_t start, ram_addr_t end) -#else -void tb_invalidate_phys_range(target_ulong start, target_ulong end) -#endif -{ - struct page_collection *pages; - tb_page_addr_t next; - - assert_memory_lock(); - - pages = page_collection_lock(start, end); - for (next = (start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE; - start < end; - start = next, next += TARGET_PAGE_SIZE) { - PageDesc *pd = page_find(start >> TARGET_PAGE_BITS); - tb_page_addr_t bound = MIN(next, end); - - if (pd == NULL) { - continue; - } - tb_invalidate_phys_page_range__locked(pages, pd, start, bound, 0); - } - page_collection_unlock(pages); -} - -#ifdef CONFIG_SOFTMMU -/* len must be <= 8 and start must be a multiple of len. - * Called via softmmu_template.h when code areas are written to with - * iothread mutex not held. - * - * Call with all @pages in the range [@start, @start + len[ locked. - */ -void tb_invalidate_phys_page_fast(struct page_collection *pages, - tb_page_addr_t start, int len, - uintptr_t retaddr) -{ - PageDesc *p; - - assert_memory_lock(); - - p = page_find(start >> TARGET_PAGE_BITS); - if (!p) { - return; - } - - assert_page_locked(p); - tb_invalidate_phys_page_range__locked(pages, p, start, start + len, - retaddr); -} -#else -/* Called with mmap_lock held. If pc is not 0 then it indicates the - * host PC of the faulting store instruction that caused this invalidate. - * Returns true if the caller needs to abort execution of the current - * TB (because it was modified by this store and the guest CPU has - * precise-SMC semantics). - */ -static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc) -{ - TranslationBlock *tb; - PageDesc *p; - int n; -#ifdef TARGET_HAS_PRECISE_SMC - TranslationBlock *current_tb = NULL; - CPUState *cpu = current_cpu; - CPUArchState *env = NULL; - int current_tb_modified = 0; - target_ulong current_pc = 0; - target_ulong current_cs_base = 0; - uint32_t current_flags = 0; -#endif - - assert_memory_lock(); - - addr &= TARGET_PAGE_MASK; - p = page_find(addr >> TARGET_PAGE_BITS); - if (!p) { - return false; - } - -#ifdef TARGET_HAS_PRECISE_SMC - if (p->first_tb && pc != 0) { - current_tb = tcg_tb_lookup(pc); - } - if (cpu != NULL) { - env = cpu->env_ptr; - } -#endif - assert_page_locked(p); - PAGE_FOR_EACH_TB(p, tb, n) { -#ifdef TARGET_HAS_PRECISE_SMC - if (current_tb == tb && - (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) { - /* If we are modifying the current TB, we must stop - its execution. We could be more precise by checking - that the modification is after the current PC, but it - would require a specialized function to partially - restore the CPU state */ - - current_tb_modified = 1; - cpu_restore_state_from_tb(cpu, current_tb, pc, true); - cpu_get_tb_cpu_state(env, ¤t_pc, ¤t_cs_base, - ¤t_flags); - } -#endif /* TARGET_HAS_PRECISE_SMC */ - tb_phys_invalidate(tb, addr); - } - p->first_tb = (uintptr_t)NULL; -#ifdef TARGET_HAS_PRECISE_SMC - if (current_tb_modified) { - /* Force execution of one insn next time. */ - cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(cpu); - return true; - } -#endif - - return false; -} -#endif - /* user-mode: call with mmap_lock held */ void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr) { @@ -2014,7 +1148,7 @@ static gboolean tb_tree_stats_iter(gpointer key, gpointer value, gpointer data) if (tb->size > tst->max_target_size) { tst->max_target_size = tb->size; } - if (tb->page_addr[1] != -1) { + if (tb_page_addr1(tb) != -1) { tst->cross_page++; } if (tb->jmp_reset_offset[0] != TB_JMP_RESET_OFFSET_INVALID) { @@ -2226,7 +1360,7 @@ int page_get_flags(target_ulong address) void page_set_flags(target_ulong start, target_ulong end, int flags) { target_ulong addr, len; - bool reset_target_data; + bool reset, inval_tb = false; /* This function should never be called with addresses outside the guest address space. If this assert fires, it probably indicates @@ -2243,7 +1377,10 @@ void page_set_flags(target_ulong start, target_ulong end, int flags) if (flags & PAGE_WRITE) { flags |= PAGE_WRITE_ORG; } - reset_target_data = !(flags & PAGE_VALID) || (flags & PAGE_RESET); + reset = !(flags & PAGE_VALID) || (flags & PAGE_RESET); + if (reset) { + page_reset_target_data(start, end); + } flags &= ~PAGE_RESET; for (addr = start, len = end - start; @@ -2251,68 +1388,23 @@ void page_set_flags(target_ulong start, target_ulong end, int flags) len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) { PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, true); - /* If the write protection bit is set, then we invalidate - the code inside. */ - if (!(p->flags & PAGE_WRITE) && - (flags & PAGE_WRITE) && - p->first_tb) { - tb_invalidate_phys_page(addr, 0); - } - if (reset_target_data) { - g_free(p->target_data); - p->target_data = NULL; - p->flags = flags; - } else { - /* Using mprotect on a page does not change sticky bits. */ - p->flags = (p->flags & PAGE_STICKY) | flags; + /* + * If the page was executable, but is reset, or is no longer + * executable, or has become writable, then invalidate any code. + */ + if ((p->flags & PAGE_EXEC) + && (reset || + !(flags & PAGE_EXEC) || + (flags & ~p->flags & PAGE_WRITE))) { + inval_tb = true; } + /* Using mprotect on a page does not change sticky bits. */ + p->flags = (reset ? 0 : p->flags & PAGE_STICKY) | flags; } -} - -void page_reset_target_data(target_ulong start, target_ulong end) -{ - target_ulong addr, len; - - /* - * This function should never be called with addresses outside the - * guest address space. If this assert fires, it probably indicates - * a missing call to h2g_valid. - */ - assert(end - 1 <= GUEST_ADDR_MAX); - assert(start < end); - assert_memory_lock(); - - start = start & TARGET_PAGE_MASK; - end = TARGET_PAGE_ALIGN(end); - - for (addr = start, len = end - start; - len != 0; - len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) { - PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, 1); - - g_free(p->target_data); - p->target_data = NULL; - } -} -void *page_get_target_data(target_ulong address) -{ - PageDesc *p = page_find(address >> TARGET_PAGE_BITS); - return p ? p->target_data : NULL; -} - -void *page_alloc_target_data(target_ulong address, size_t size) -{ - PageDesc *p = page_find(address >> TARGET_PAGE_BITS); - void *ret = NULL; - - if (p->flags & PAGE_VALID) { - ret = p->target_data; - if (!ret) { - p->target_data = ret = g_malloc0(size); - } + if (inval_tb) { + tb_invalidate_phys_range(start, end); } - return ret; } int page_check_range(target_ulong start, target_ulong len, int flags) @@ -2396,9 +1488,6 @@ void page_protect(tb_page_addr_t page_addr) } mprotect(g2h_untagged(page_addr), qemu_host_page_size, (prot & PAGE_BITS) & ~PAGE_WRITE); - if (DEBUG_TB_INVALIDATE_GATE) { - printf("protecting code page: 0x" TB_PAGE_ADDR_FMT "\n", page_addr); - } } } @@ -2453,12 +1542,8 @@ int page_unprotect(target_ulong address, uintptr_t pc) /* and since the content will be modified, we must invalidate the corresponding translated code. */ - current_tb_invalidated |= tb_invalidate_phys_page(addr, pc); -#ifdef CONFIG_USER_ONLY - if (DEBUG_TB_CHECK_GATE) { - tb_invalidate_check(addr); - } -#endif + current_tb_invalidated |= + tb_invalidate_phys_page_unwind(addr, pc); } mprotect((void *)g2h_untagged(host_start), qemu_host_page_size, prot & PAGE_BITS); diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c index 8e78fd7a9c..061519691f 100644 --- a/accel/tcg/translator.c +++ b/accel/tcg/translator.c @@ -157,7 +157,7 @@ static void *translator_access(CPUArchState *env, DisasContextBase *db, tb = db->tb; /* Use slow path if first page is MMIO. */ - if (unlikely(tb->page_addr[0] == -1)) { + if (unlikely(tb_page_addr0(tb) == -1)) { return NULL; } @@ -169,13 +169,14 @@ static void *translator_access(CPUArchState *env, DisasContextBase *db, host = db->host_addr[1]; base = TARGET_PAGE_ALIGN(db->pc_first); if (host == NULL) { - tb->page_addr[1] = + tb_page_addr_t phys_page = get_page_addr_code_hostp(env, base, &db->host_addr[1]); + /* We cannot handle MMIO as second page. */ + assert(phys_page != -1); + tb_set_page_addr1(tb, phys_page); #ifdef CONFIG_USER_ONLY page_protect(end); #endif - /* We cannot handle MMIO as second page. */ - assert(tb->page_addr[1] != -1); host = db->host_addr[1]; } diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c index 521aa8b61e..fb7d6ee9e9 100644 --- a/accel/tcg/user-exec.c +++ b/accel/tcg/user-exec.c @@ -210,6 +210,48 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr, return addr; } +void page_reset_target_data(target_ulong start, target_ulong end) +{ +#ifdef TARGET_PAGE_DATA_SIZE + target_ulong addr, len; + + /* + * This function should never be called with addresses outside the + * guest address space. If this assert fires, it probably indicates + * a missing call to h2g_valid. + */ + assert(end - 1 <= GUEST_ADDR_MAX); + assert(start < end); + assert_memory_lock(); + + start = start & TARGET_PAGE_MASK; + end = TARGET_PAGE_ALIGN(end); + + for (addr = start, len = end - start; + len != 0; + len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) { + PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, 1); + + g_free(p->target_data); + p->target_data = NULL; + } +#endif +} + +#ifdef TARGET_PAGE_DATA_SIZE +void *page_get_target_data(target_ulong address) +{ + PageDesc *p = page_find(address >> TARGET_PAGE_BITS); + void *ret = p->target_data; + + if (!ret) { + ret = g_malloc0(TARGET_PAGE_DATA_SIZE); + p->target_data = ret; + } + return ret; +} +#endif + /* The softmmu versions of these helpers are in cputlb.c. */ /* @@ -1641,6 +1641,20 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, goto open_failed; } + assert(!(bs->supported_read_flags & ~BDRV_REQ_MASK)); + assert(!(bs->supported_write_flags & ~BDRV_REQ_MASK)); + + /* + * Always allow the BDRV_REQ_REGISTERED_BUF optimization hint. This saves + * drivers that pass read/write requests through to a child the trouble of + * declaring support explicitly. + * + * Drivers must not propagate this flag accidentally when they initiate I/O + * to a bounce buffer. That case should be rare though. + */ + bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF; + bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF; + ret = refresh_total_sectors(bs, bs->total_sectors); if (ret < 0) { error_setg_errno(errp, -ret, "Could not refresh total sector count"); diff --git a/block/blkio.c b/block/blkio.c new file mode 100644 index 0000000000..82f26eedd2 --- /dev/null +++ b/block/blkio.c @@ -0,0 +1,1008 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/* + * libblkio BlockDriver + * + * Copyright Red Hat, Inc. + * + * Author: + * Stefan Hajnoczi <stefanha@redhat.com> + */ + +#include "qemu/osdep.h" +#include <blkio.h> +#include "block/block_int.h" +#include "exec/memory.h" +#include "exec/cpu-common.h" /* for qemu_ram_get_fd() */ +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qapi/qmp/qdict.h" +#include "qemu/module.h" +#include "exec/memory.h" /* for ram_block_discard_disable() */ + +/* + * Keep the QEMU BlockDriver names identical to the libblkio driver names. + * Using macros instead of typing out the string literals avoids typos. + */ +#define DRIVER_IO_URING "io_uring" +#define DRIVER_NVME_IO_URING "nvme-io_uring" +#define DRIVER_VIRTIO_BLK_VHOST_USER "virtio-blk-vhost-user" +#define DRIVER_VIRTIO_BLK_VHOST_VDPA "virtio-blk-vhost-vdpa" + +/* + * Allocated bounce buffers are kept in a list sorted by buffer address. + */ +typedef struct BlkioBounceBuf { + QLIST_ENTRY(BlkioBounceBuf) next; + + /* The bounce buffer */ + struct iovec buf; +} BlkioBounceBuf; + +typedef struct { + /* + * libblkio is not thread-safe so this lock protects ->blkio and + * ->blkioq. + */ + QemuMutex blkio_lock; + struct blkio *blkio; + struct blkioq *blkioq; /* make this multi-queue in the future... */ + int completion_fd; + + /* + * Polling fetches the next completion into this field. + * + * No lock is necessary since only one thread calls aio_poll() and invokes + * fd and poll handlers. + */ + struct blkio_completion poll_completion; + + /* + * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available. + * + * Lock ordering: ->bounce_lock before ->blkio_lock. + */ + CoMutex bounce_lock; + + /* Bounce buffer pool */ + struct blkio_mem_region bounce_pool; + + /* Sorted list of allocated bounce buffers */ + QLIST_HEAD(, BlkioBounceBuf) bounce_bufs; + + /* Queue for coroutines waiting for bounce buffer space */ + CoQueue bounce_available; + + /* The value of the "mem-region-alignment" property */ + size_t mem_region_alignment; + + /* Can we skip adding/deleting blkio_mem_regions? */ + bool needs_mem_regions; + + /* Are file descriptors necessary for blkio_mem_regions? */ + bool needs_mem_region_fd; + + /* Are madvise(MADV_DONTNEED)-style operations unavailable? */ + bool may_pin_mem_regions; +} BDRVBlkioState; + +/* Called with s->bounce_lock held */ +static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes) +{ + /* There can be no allocated bounce buffers during resize */ + assert(QLIST_EMPTY(&s->bounce_bufs)); + + /* Pad size to reduce frequency of resize calls */ + bytes += 128 * 1024; + + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { + int ret; + + if (s->bounce_pool.addr) { + blkio_unmap_mem_region(s->blkio, &s->bounce_pool); + blkio_free_mem_region(s->blkio, &s->bounce_pool); + memset(&s->bounce_pool, 0, sizeof(s->bounce_pool)); + } + + /* Automatically freed when s->blkio is destroyed */ + ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes); + if (ret < 0) { + return ret; + } + + ret = blkio_map_mem_region(s->blkio, &s->bounce_pool); + if (ret < 0) { + blkio_free_mem_region(s->blkio, &s->bounce_pool); + memset(&s->bounce_pool, 0, sizeof(s->bounce_pool)); + return ret; + } + } + + return 0; +} + +/* Called with s->bounce_lock held */ +static bool +blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce, + int64_t bytes) +{ + void *addr = s->bounce_pool.addr; + BlkioBounceBuf *cur = NULL; + BlkioBounceBuf *prev = NULL; + ptrdiff_t space; + + /* + * This is just a linear search over the holes between requests. An + * efficient allocator would be nice. + */ + QLIST_FOREACH(cur, &s->bounce_bufs, next) { + space = cur->buf.iov_base - addr; + if (bytes <= space) { + QLIST_INSERT_BEFORE(cur, bounce, next); + bounce->buf.iov_base = addr; + bounce->buf.iov_len = bytes; + return true; + } + + addr = cur->buf.iov_base + cur->buf.iov_len; + prev = cur; + } + + /* Is there space after the last request? */ + space = s->bounce_pool.addr + s->bounce_pool.len - addr; + if (bytes > space) { + return false; + } + if (prev) { + QLIST_INSERT_AFTER(prev, bounce, next); + } else { + QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next); + } + bounce->buf.iov_base = addr; + bounce->buf.iov_len = bytes; + return true; +} + +static int coroutine_fn +blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce, + int64_t bytes) +{ + /* + * Ensure fairness: first time around we join the back of the queue, + * subsequently we join the front so we don't lose our place. + */ + CoQueueWaitFlags wait_flags = 0; + + QEMU_LOCK_GUARD(&s->bounce_lock); + + /* Ensure fairness: don't even try if other requests are already waiting */ + if (!qemu_co_queue_empty(&s->bounce_available)) { + qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock, + wait_flags); + wait_flags = CO_QUEUE_WAIT_FRONT; + } + + while (true) { + if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) { + /* Kick the next queued request since there may be space */ + qemu_co_queue_next(&s->bounce_available); + return 0; + } + + /* + * If there are no in-flight requests then the pool was simply too + * small. + */ + if (QLIST_EMPTY(&s->bounce_bufs)) { + bool ok; + int ret; + + ret = blkio_resize_bounce_pool(s, bytes); + if (ret < 0) { + /* Kick the next queued request since that may fail too */ + qemu_co_queue_next(&s->bounce_available); + return ret; + } + + ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes); + assert(ok); /* must have space this time */ + return 0; + } + + qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock, + wait_flags); + wait_flags = CO_QUEUE_WAIT_FRONT; + } +} + +static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s, + BlkioBounceBuf *bounce) +{ + QEMU_LOCK_GUARD(&s->bounce_lock); + + QLIST_REMOVE(bounce, next); + + /* Wake up waiting coroutines since space may now be available */ + qemu_co_queue_next(&s->bounce_available); +} + +/* For async to .bdrv_co_*() conversion */ +typedef struct { + Coroutine *coroutine; + int ret; +} BlkioCoData; + +static void blkio_completion_fd_read(void *opaque) +{ + BlockDriverState *bs = opaque; + BDRVBlkioState *s = bs->opaque; + uint64_t val; + int ret; + + /* Polling may have already fetched a completion */ + if (s->poll_completion.user_data != NULL) { + BlkioCoData *cod = s->poll_completion.user_data; + cod->ret = s->poll_completion.ret; + + /* Clear it in case aio_co_wake() enters a nested event loop */ + s->poll_completion.user_data = NULL; + + aio_co_wake(cod->coroutine); + } + + /* Reset completion fd status */ + ret = read(s->completion_fd, &val, sizeof(val)); + + /* Ignore errors, there's nothing we can do */ + (void)ret; + + /* + * Reading one completion at a time makes nested event loop re-entrancy + * simple. Change this loop to get multiple completions in one go if it + * becomes a performance bottleneck. + */ + while (true) { + struct blkio_completion completion; + + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { + ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL); + } + if (ret != 1) { + break; + } + + BlkioCoData *cod = completion.user_data; + cod->ret = completion.ret; + aio_co_wake(cod->coroutine); + } +} + +static bool blkio_completion_fd_poll(void *opaque) +{ + BlockDriverState *bs = opaque; + BDRVBlkioState *s = bs->opaque; + int ret; + + /* Just in case we already fetched a completion */ + if (s->poll_completion.user_data != NULL) { + return true; + } + + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { + ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL); + } + return ret == 1; +} + +static void blkio_completion_fd_poll_ready(void *opaque) +{ + blkio_completion_fd_read(opaque); +} + +static void blkio_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVBlkioState *s = bs->opaque; + + aio_set_fd_handler(new_context, + s->completion_fd, + false, + blkio_completion_fd_read, + NULL, + blkio_completion_fd_poll, + blkio_completion_fd_poll_ready, + bs); +} + +static void blkio_detach_aio_context(BlockDriverState *bs) +{ + BDRVBlkioState *s = bs->opaque; + + aio_set_fd_handler(bdrv_get_aio_context(bs), + s->completion_fd, + false, NULL, NULL, NULL, NULL, NULL); +} + +/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */ +static void blkio_submit_io(BlockDriverState *bs) +{ + if (qatomic_read(&bs->io_plugged) == 0) { + BDRVBlkioState *s = bs->opaque; + + blkioq_do_io(s->blkioq, NULL, 0, 0, NULL); + } +} + +static int coroutine_fn +blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) +{ + BDRVBlkioState *s = bs->opaque; + BlkioCoData cod = { + .coroutine = qemu_coroutine_self(), + }; + + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { + blkioq_discard(s->blkioq, offset, bytes, &cod, 0); + blkio_submit_io(bs); + } + + qemu_coroutine_yield(); + return cod.ret; +} + +static int coroutine_fn +blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) +{ + BlkioCoData cod = { + .coroutine = qemu_coroutine_self(), + }; + BDRVBlkioState *s = bs->opaque; + bool use_bounce_buffer = + s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF); + BlkioBounceBuf bounce; + struct iovec *iov = qiov->iov; + int iovcnt = qiov->niov; + + if (use_bounce_buffer) { + int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes); + if (ret < 0) { + return ret; + } + + iov = &bounce.buf; + iovcnt = 1; + } + + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { + blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0); + blkio_submit_io(bs); + } + + qemu_coroutine_yield(); + + if (use_bounce_buffer) { + if (cod.ret == 0) { + qemu_iovec_from_buf(qiov, 0, + bounce.buf.iov_base, + bounce.buf.iov_len); + } + + blkio_free_bounce_buffer(s, &bounce); + } + + return cod.ret; +} + +static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) +{ + uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0; + BlkioCoData cod = { + .coroutine = qemu_coroutine_self(), + }; + BDRVBlkioState *s = bs->opaque; + bool use_bounce_buffer = + s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF); + BlkioBounceBuf bounce; + struct iovec *iov = qiov->iov; + int iovcnt = qiov->niov; + + if (use_bounce_buffer) { + int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes); + if (ret < 0) { + return ret; + } + + qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes); + iov = &bounce.buf; + iovcnt = 1; + } + + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { + blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags); + blkio_submit_io(bs); + } + + qemu_coroutine_yield(); + + if (use_bounce_buffer) { + blkio_free_bounce_buffer(s, &bounce); + } + + return cod.ret; +} + +static int coroutine_fn blkio_co_flush(BlockDriverState *bs) +{ + BDRVBlkioState *s = bs->opaque; + BlkioCoData cod = { + .coroutine = qemu_coroutine_self(), + }; + + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { + blkioq_flush(s->blkioq, &cod, 0); + blkio_submit_io(bs); + } + + qemu_coroutine_yield(); + return cod.ret; +} + +static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int64_t bytes, BdrvRequestFlags flags) +{ + BDRVBlkioState *s = bs->opaque; + BlkioCoData cod = { + .coroutine = qemu_coroutine_self(), + }; + uint32_t blkio_flags = 0; + + if (flags & BDRV_REQ_FUA) { + blkio_flags |= BLKIO_REQ_FUA; + } + if (!(flags & BDRV_REQ_MAY_UNMAP)) { + blkio_flags |= BLKIO_REQ_NO_UNMAP; + } + if (flags & BDRV_REQ_NO_FALLBACK) { + blkio_flags |= BLKIO_REQ_NO_FALLBACK; + } + + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { + blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags); + blkio_submit_io(bs); + } + + qemu_coroutine_yield(); + return cod.ret; +} + +static void blkio_io_unplug(BlockDriverState *bs) +{ + BDRVBlkioState *s = bs->opaque; + + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { + blkio_submit_io(bs); + } +} + +typedef enum { + BMRR_OK, + BMRR_SKIP, + BMRR_FAIL, +} BlkioMemRegionResult; + +/* + * Produce a struct blkio_mem_region for a given address and size. + * + * This function produces identical results when called multiple times with the + * same arguments. This property is necessary because blkio_unmap_mem_region() + * must receive the same struct blkio_mem_region field values that were passed + * to blkio_map_mem_region(). + */ +static BlkioMemRegionResult +blkio_mem_region_from_host(BlockDriverState *bs, + void *host, size_t size, + struct blkio_mem_region *region, + Error **errp) +{ + BDRVBlkioState *s = bs->opaque; + int fd = -1; + ram_addr_t fd_offset = 0; + + if (((uintptr_t)host | size) % s->mem_region_alignment) { + error_setg(errp, "unaligned buf %p with size %zu", host, size); + return BMRR_FAIL; + } + + /* Attempt to find the fd for the underlying memory */ + if (s->needs_mem_region_fd) { + RAMBlock *ram_block; + RAMBlock *end_block; + ram_addr_t offset; + + /* + * bdrv_register_buf() is called with the BQL held so mr lives at least + * until this function returns. + */ + ram_block = qemu_ram_block_from_host(host, false, &fd_offset); + if (ram_block) { + fd = qemu_ram_get_fd(ram_block); + } + if (fd == -1) { + /* + * Ideally every RAMBlock would have an fd. pc-bios and other + * things don't. Luckily they are usually not I/O buffers and we + * can just ignore them. + */ + return BMRR_SKIP; + } + + /* Make sure the fd covers the entire range */ + end_block = qemu_ram_block_from_host(host + size - 1, false, &offset); + if (ram_block != end_block) { + error_setg(errp, "registered buffer at %p with size %zu extends " + "beyond RAMBlock", host, size); + return BMRR_FAIL; + } + } + + *region = (struct blkio_mem_region){ + .addr = host, + .len = size, + .fd = fd, + .fd_offset = fd_offset, + }; + return BMRR_OK; +} + +static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size, + Error **errp) +{ + BDRVBlkioState *s = bs->opaque; + struct blkio_mem_region region; + BlkioMemRegionResult region_result; + int ret; + + /* + * Mapping memory regions conflicts with RAM discard (virtio-mem) when + * there is pinning, so only do it when necessary. + */ + if (!s->needs_mem_regions && s->may_pin_mem_regions) { + return true; + } + + region_result = blkio_mem_region_from_host(bs, host, size, ®ion, errp); + if (region_result == BMRR_SKIP) { + return true; + } else if (region_result != BMRR_OK) { + return false; + } + + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { + ret = blkio_map_mem_region(s->blkio, ®ion); + } + + if (ret < 0) { + error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s", + host, size, blkio_get_error_msg()); + return false; + } + return true; +} + +static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size) +{ + BDRVBlkioState *s = bs->opaque; + struct blkio_mem_region region; + + /* See blkio_register_buf() */ + if (!s->needs_mem_regions && s->may_pin_mem_regions) { + return; + } + + if (blkio_mem_region_from_host(bs, host, size, ®ion, NULL) != BMRR_OK) { + return; + } + + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { + blkio_unmap_mem_region(s->blkio, ®ion); + } +} + +static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + const char *filename = qdict_get_str(options, "filename"); + BDRVBlkioState *s = bs->opaque; + int ret; + + ret = blkio_set_str(s->blkio, "path", filename); + qdict_del(options, "filename"); + if (ret < 0) { + error_setg_errno(errp, -ret, "failed to set path: %s", + blkio_get_error_msg()); + return ret; + } + + if (flags & BDRV_O_NOCACHE) { + ret = blkio_set_bool(s->blkio, "direct", true); + if (ret < 0) { + error_setg_errno(errp, -ret, "failed to set direct: %s", + blkio_get_error_msg()); + return ret; + } + } + + return 0; +} + +static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + const char *filename = qdict_get_str(options, "filename"); + BDRVBlkioState *s = bs->opaque; + int ret; + + ret = blkio_set_str(s->blkio, "path", filename); + qdict_del(options, "filename"); + if (ret < 0) { + error_setg_errno(errp, -ret, "failed to set path: %s", + blkio_get_error_msg()); + return ret; + } + + if (!(flags & BDRV_O_NOCACHE)) { + error_setg(errp, "cache.direct=off is not supported"); + return -EINVAL; + } + + return 0; +} + +static int blkio_virtio_blk_common_open(BlockDriverState *bs, + QDict *options, int flags, Error **errp) +{ + const char *path = qdict_get_try_str(options, "path"); + BDRVBlkioState *s = bs->opaque; + int ret; + + if (!path) { + error_setg(errp, "missing 'path' option"); + return -EINVAL; + } + + ret = blkio_set_str(s->blkio, "path", path); + qdict_del(options, "path"); + if (ret < 0) { + error_setg_errno(errp, -ret, "failed to set path: %s", + blkio_get_error_msg()); + return ret; + } + + if (!(flags & BDRV_O_NOCACHE)) { + error_setg(errp, "cache.direct=off is not supported"); + return -EINVAL; + } + return 0; +} + +static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + const char *blkio_driver = bs->drv->protocol_name; + BDRVBlkioState *s = bs->opaque; + int ret; + + ret = blkio_create(blkio_driver, &s->blkio); + if (ret < 0) { + error_setg_errno(errp, -ret, "blkio_create failed: %s", + blkio_get_error_msg()); + return ret; + } + + if (strcmp(blkio_driver, DRIVER_IO_URING) == 0) { + ret = blkio_io_uring_open(bs, options, flags, errp); + } else if (strcmp(blkio_driver, DRIVER_NVME_IO_URING) == 0) { + ret = blkio_nvme_io_uring(bs, options, flags, errp); + } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_USER) == 0) { + ret = blkio_virtio_blk_common_open(bs, options, flags, errp); + } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_VDPA) == 0) { + ret = blkio_virtio_blk_common_open(bs, options, flags, errp); + } else { + g_assert_not_reached(); + } + if (ret < 0) { + blkio_destroy(&s->blkio); + return ret; + } + + if (!(flags & BDRV_O_RDWR)) { + ret = blkio_set_bool(s->blkio, "read-only", true); + if (ret < 0) { + error_setg_errno(errp, -ret, "failed to set read-only: %s", + blkio_get_error_msg()); + blkio_destroy(&s->blkio); + return ret; + } + } + + ret = blkio_connect(s->blkio); + if (ret < 0) { + error_setg_errno(errp, -ret, "blkio_connect failed: %s", + blkio_get_error_msg()); + blkio_destroy(&s->blkio); + return ret; + } + + ret = blkio_get_bool(s->blkio, + "needs-mem-regions", + &s->needs_mem_regions); + if (ret < 0) { + error_setg_errno(errp, -ret, + "failed to get needs-mem-regions: %s", + blkio_get_error_msg()); + blkio_destroy(&s->blkio); + return ret; + } + + ret = blkio_get_bool(s->blkio, + "needs-mem-region-fd", + &s->needs_mem_region_fd); + if (ret < 0) { + error_setg_errno(errp, -ret, + "failed to get needs-mem-region-fd: %s", + blkio_get_error_msg()); + blkio_destroy(&s->blkio); + return ret; + } + + ret = blkio_get_uint64(s->blkio, + "mem-region-alignment", + &s->mem_region_alignment); + if (ret < 0) { + error_setg_errno(errp, -ret, + "failed to get mem-region-alignment: %s", + blkio_get_error_msg()); + blkio_destroy(&s->blkio); + return ret; + } + + ret = blkio_get_bool(s->blkio, + "may-pin-mem-regions", + &s->may_pin_mem_regions); + if (ret < 0) { + /* Be conservative (assume pinning) if the property is not supported */ + s->may_pin_mem_regions = s->needs_mem_regions; + } + + /* + * Notify if libblkio drivers pin memory and prevent features like + * virtio-mem from working. + */ + if (s->may_pin_mem_regions) { + ret = ram_block_discard_disable(true); + if (ret < 0) { + error_setg_errno(errp, -ret, "ram_block_discard_disable() failed"); + blkio_destroy(&s->blkio); + return ret; + } + } + + ret = blkio_start(s->blkio); + if (ret < 0) { + error_setg_errno(errp, -ret, "blkio_start failed: %s", + blkio_get_error_msg()); + blkio_destroy(&s->blkio); + if (s->may_pin_mem_regions) { + ram_block_discard_disable(false); + } + return ret; + } + + bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF; + bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | + BDRV_REQ_NO_FALLBACK; + + qemu_mutex_init(&s->blkio_lock); + qemu_co_mutex_init(&s->bounce_lock); + qemu_co_queue_init(&s->bounce_available); + QLIST_INIT(&s->bounce_bufs); + s->blkioq = blkio_get_queue(s->blkio, 0); + s->completion_fd = blkioq_get_completion_fd(s->blkioq); + + blkio_attach_aio_context(bs, bdrv_get_aio_context(bs)); + return 0; +} + +static void blkio_close(BlockDriverState *bs) +{ + BDRVBlkioState *s = bs->opaque; + + /* There is no destroy() API for s->bounce_lock */ + + qemu_mutex_destroy(&s->blkio_lock); + blkio_detach_aio_context(bs); + blkio_destroy(&s->blkio); + + if (s->may_pin_mem_regions) { + ram_block_discard_disable(false); + } +} + +static int64_t blkio_getlength(BlockDriverState *bs) +{ + BDRVBlkioState *s = bs->opaque; + uint64_t capacity; + int ret; + + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { + ret = blkio_get_uint64(s->blkio, "capacity", &capacity); + } + if (ret < 0) { + return -ret; + } + + return capacity; +} + +static int blkio_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + return 0; +} + +static void blkio_refresh_limits(BlockDriverState *bs, Error **errp) +{ + BDRVBlkioState *s = bs->opaque; + QEMU_LOCK_GUARD(&s->blkio_lock); + int value; + int ret; + + ret = blkio_get_int(s->blkio, "request-alignment", &value); + if (ret < 0) { + error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s", + blkio_get_error_msg()); + return; + } + bs->bl.request_alignment = value; + if (bs->bl.request_alignment < 1 || + bs->bl.request_alignment >= INT_MAX || + !is_power_of_2(bs->bl.request_alignment)) { + error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", " + "must be a power of 2 less than INT_MAX", + bs->bl.request_alignment); + return; + } + + ret = blkio_get_int(s->blkio, "optimal-io-size", &value); + if (ret < 0) { + error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s", + blkio_get_error_msg()); + return; + } + bs->bl.opt_transfer = value; + if (bs->bl.opt_transfer > INT_MAX || + (bs->bl.opt_transfer % bs->bl.request_alignment)) { + error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must " + "be a multiple of %" PRIu32, bs->bl.opt_transfer, + bs->bl.request_alignment); + return; + } + + ret = blkio_get_int(s->blkio, "max-transfer", &value); + if (ret < 0) { + error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s", + blkio_get_error_msg()); + return; + } + bs->bl.max_transfer = value; + if ((bs->bl.max_transfer % bs->bl.request_alignment) || + (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) { + error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be " + "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)", + bs->bl.max_transfer, bs->bl.request_alignment, + bs->bl.opt_transfer); + return; + } + + ret = blkio_get_int(s->blkio, "buf-alignment", &value); + if (ret < 0) { + error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s", + blkio_get_error_msg()); + return; + } + if (value < 1) { + error_setg(errp, "invalid \"buf-alignment\" value %d, must be " + "positive", value); + return; + } + bs->bl.min_mem_alignment = value; + + ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value); + if (ret < 0) { + error_setg_errno(errp, -ret, + "failed to get \"optimal-buf-alignment\": %s", + blkio_get_error_msg()); + return; + } + if (value < 1) { + error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, " + "must be positive", value); + return; + } + bs->bl.opt_mem_alignment = value; + + ret = blkio_get_int(s->blkio, "max-segments", &value); + if (ret < 0) { + error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s", + blkio_get_error_msg()); + return; + } + if (value < 1) { + error_setg(errp, "invalid \"max-segments\" value %d, must be positive", + value); + return; + } + bs->bl.max_iov = value; +} + +/* + * TODO + * Missing libblkio APIs: + * - block_status + * - co_invalidate_cache + * + * Out of scope? + * - create + * - truncate + */ + +#define BLKIO_DRIVER(name, ...) \ + { \ + .format_name = name, \ + .protocol_name = name, \ + .instance_size = sizeof(BDRVBlkioState), \ + .bdrv_file_open = blkio_file_open, \ + .bdrv_close = blkio_close, \ + .bdrv_getlength = blkio_getlength, \ + .bdrv_get_info = blkio_get_info, \ + .bdrv_attach_aio_context = blkio_attach_aio_context, \ + .bdrv_detach_aio_context = blkio_detach_aio_context, \ + .bdrv_co_pdiscard = blkio_co_pdiscard, \ + .bdrv_co_preadv = blkio_co_preadv, \ + .bdrv_co_pwritev = blkio_co_pwritev, \ + .bdrv_co_flush_to_disk = blkio_co_flush, \ + .bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \ + .bdrv_io_unplug = blkio_io_unplug, \ + .bdrv_refresh_limits = blkio_refresh_limits, \ + .bdrv_register_buf = blkio_register_buf, \ + .bdrv_unregister_buf = blkio_unregister_buf, \ + __VA_ARGS__ \ + } + +static BlockDriver bdrv_io_uring = BLKIO_DRIVER( + DRIVER_IO_URING, + .bdrv_needs_filename = true, +); + +static BlockDriver bdrv_nvme_io_uring = BLKIO_DRIVER( + DRIVER_NVME_IO_URING, + .bdrv_needs_filename = true, +); + +static BlockDriver bdrv_virtio_blk_vhost_user = BLKIO_DRIVER( + DRIVER_VIRTIO_BLK_VHOST_USER +); + +static BlockDriver bdrv_virtio_blk_vhost_vdpa = BLKIO_DRIVER( + DRIVER_VIRTIO_BLK_VHOST_VDPA +); + +static void bdrv_blkio_init(void) +{ + bdrv_register(&bdrv_io_uring); + bdrv_register(&bdrv_nvme_io_uring); + bdrv_register(&bdrv_virtio_blk_vhost_user); + bdrv_register(&bdrv_virtio_blk_vhost_vdpa); +} + +block_init(bdrv_blkio_init); diff --git a/block/blkverify.c b/block/blkverify.c index 020b1ae7b6..f36fd6aeb2 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -235,8 +235,8 @@ blkverify_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, qemu_iovec_init(&raw_qiov, qiov->niov); qemu_iovec_clone(&raw_qiov, qiov, buf); - ret = blkverify_co_prwv(bs, &r, offset, bytes, qiov, &raw_qiov, flags, - false); + ret = blkverify_co_prwv(bs, &r, offset, bytes, qiov, &raw_qiov, + flags & ~BDRV_REQ_REGISTERED_BUF, false); cmp_offset = qemu_iovec_compare(qiov, &raw_qiov); if (cmp_offset != -1) { diff --git a/block/block-backend.c b/block/block-backend.c index aa4adf06ae..4f59664397 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2545,16 +2545,16 @@ static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter) } } -void blk_register_buf(BlockBackend *blk, void *host, size_t size) +bool blk_register_buf(BlockBackend *blk, void *host, size_t size, Error **errp) { GLOBAL_STATE_CODE(); - bdrv_register_buf(blk_bs(blk), host, size); + return bdrv_register_buf(blk_bs(blk), host, size, errp); } -void blk_unregister_buf(BlockBackend *blk, void *host) +void blk_unregister_buf(BlockBackend *blk, void *host, size_t size) { GLOBAL_STATE_CODE(); - bdrv_unregister_buf(blk_bs(blk), host); + bdrv_unregister_buf(blk_bs(blk), host, size); } int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, diff --git a/block/block-ram-registrar.c b/block/block-ram-registrar.c new file mode 100644 index 0000000000..25dbafa789 --- /dev/null +++ b/block/block-ram-registrar.c @@ -0,0 +1,58 @@ +/* + * BlockBackend RAM Registrar + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "sysemu/block-backend.h" +#include "sysemu/block-ram-registrar.h" +#include "qapi/error.h" + +static void ram_block_added(RAMBlockNotifier *n, void *host, size_t size, + size_t max_size) +{ + BlockRAMRegistrar *r = container_of(n, BlockRAMRegistrar, notifier); + Error *err = NULL; + + if (!r->ok) { + return; /* don't try again if we've already failed */ + } + + if (!blk_register_buf(r->blk, host, max_size, &err)) { + error_report_err(err); + ram_block_notifier_remove(&r->notifier); + r->ok = false; + } +} + +static void ram_block_removed(RAMBlockNotifier *n, void *host, size_t size, + size_t max_size) +{ + BlockRAMRegistrar *r = container_of(n, BlockRAMRegistrar, notifier); + blk_unregister_buf(r->blk, host, max_size); +} + +void blk_ram_registrar_init(BlockRAMRegistrar *r, BlockBackend *blk) +{ + r->blk = blk; + r->notifier = (RAMBlockNotifier){ + .ram_block_added = ram_block_added, + .ram_block_removed = ram_block_removed, + + /* + * .ram_block_resized() is not necessary because we use the max_size + * value that does not change across resize. + */ + }; + r->ok = true; + + ram_block_notifier_add(&r->notifier); +} + +void blk_ram_registrar_destroy(BlockRAMRegistrar *r) +{ + if (r->ok) { + ram_block_notifier_remove(&r->notifier); + } +} diff --git a/block/crypto.c b/block/crypto.c index 7a57774b76..c7365598a7 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -410,7 +410,6 @@ block_crypto_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, uint64_t sector_size = qcrypto_block_get_sector_size(crypto->block); uint64_t payload_offset = qcrypto_block_get_payload_offset(crypto->block); - assert(!flags); assert(payload_offset < INT64_MAX); assert(QEMU_IS_ALIGNED(offset, sector_size)); assert(QEMU_IS_ALIGNED(bytes, sector_size)); @@ -473,7 +472,8 @@ block_crypto_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, uint64_t sector_size = qcrypto_block_get_sector_size(crypto->block); uint64_t payload_offset = qcrypto_block_get_payload_offset(crypto->block); - assert(!(flags & ~BDRV_REQ_FUA)); + flags &= ~BDRV_REQ_REGISTERED_BUF; + assert(payload_offset < INT64_MAX); assert(QEMU_IS_ALIGNED(offset, sector_size)); assert(QEMU_IS_ALIGNED(bytes, sector_size)); diff --git a/block/file-posix.c b/block/file-posix.c index 23acffb9a4..b9647c5ffc 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2133,7 +2133,6 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { - assert(flags == 0); return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE); } diff --git a/block/gluster.c b/block/gluster.c index bb1144cf6a..7c90f7ba4b 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -1236,7 +1236,6 @@ static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs, QEMUIOVector *qiov, int flags) { - assert(!flags); return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1); } diff --git a/block/io.c b/block/io.c index d30073036e..a9673465dd 100644 --- a/block/io.c +++ b/block/io.c @@ -1130,8 +1130,7 @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, int ret; bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); - assert(!(flags & ~BDRV_REQ_MASK)); - assert(!(flags & BDRV_REQ_NO_FALLBACK)); + assert(!(flags & ~bs->supported_read_flags)); if (!drv) { return -ENOMEDIUM; @@ -1195,23 +1194,29 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; + bool emulate_fua = false; int64_t sector_num; unsigned int nb_sectors; QEMUIOVector local_qiov; int ret; bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); - assert(!(flags & ~BDRV_REQ_MASK)); - assert(!(flags & BDRV_REQ_NO_FALLBACK)); if (!drv) { return -ENOMEDIUM; } + if ((flags & BDRV_REQ_FUA) && + (~bs->supported_write_flags & BDRV_REQ_FUA)) { + flags &= ~BDRV_REQ_FUA; + emulate_fua = true; + } + + flags &= bs->supported_write_flags; + if (drv->bdrv_co_pwritev_part) { ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset, - flags & bs->supported_write_flags); - flags &= ~bs->supported_write_flags; + flags); goto emulate_flags; } @@ -1221,9 +1226,7 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, } if (drv->bdrv_co_pwritev) { - ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, - flags & bs->supported_write_flags); - flags &= ~bs->supported_write_flags; + ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags); goto emulate_flags; } @@ -1233,10 +1236,8 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, .coroutine = qemu_coroutine_self(), }; - acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, - flags & bs->supported_write_flags, + acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, flags, bdrv_co_io_em_complete, &co); - flags &= ~bs->supported_write_flags; if (acb == NULL) { ret = -EIO; } else { @@ -1254,12 +1255,10 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, assert(bytes <= BDRV_REQUEST_MAX_BYTES); assert(drv->bdrv_co_writev); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, - flags & bs->supported_write_flags); - flags &= ~bs->supported_write_flags; + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, flags); emulate_flags: - if (ret == 0 && (flags & BDRV_REQ_FUA)) { + if (ret == 0 && emulate_fua) { ret = bdrv_co_flush(bs); } @@ -1487,11 +1486,14 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), align); - /* TODO: We would need a per-BDS .supported_read_flags and + /* + * TODO: We would need a per-BDS .supported_read_flags and * potential fallback support, if we ever implement any read flags * to pass through to drivers. For now, there aren't any - * passthrough flags. */ - assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH))); + * passthrough flags except the BDRV_REQ_REGISTERED_BUF optimization hint. + */ + assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH | + BDRV_REQ_REGISTERED_BUF))); /* Handle Copy on Read and associated serialisation */ if (flags & BDRV_REQ_COPY_ON_READ) { @@ -1532,7 +1534,7 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, goto out; } - assert(!(flags & ~bs->supported_read_flags)); + assert(!(flags & ~(bs->supported_read_flags | BDRV_REQ_REGISTERED_BUF))); max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); if (bytes <= max_bytes && bytes <= max_transfer) { @@ -1721,7 +1723,8 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad) static int bdrv_pad_request(BlockDriverState *bs, QEMUIOVector **qiov, size_t *qiov_offset, int64_t *offset, int64_t *bytes, - BdrvRequestPadding *pad, bool *padded) + BdrvRequestPadding *pad, bool *padded, + BdrvRequestFlags *flags) { int ret; @@ -1749,6 +1752,10 @@ static int bdrv_pad_request(BlockDriverState *bs, if (padded) { *padded = true; } + if (flags) { + /* Can't use optimization hint with bounce buffer */ + *flags &= ~BDRV_REQ_REGISTERED_BUF; + } return 0; } @@ -1803,7 +1810,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, } ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, - NULL); + NULL, &flags); if (ret < 0) { goto fail; } @@ -1848,6 +1855,11 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, return -ENOTSUP; } + /* By definition there is no user buffer so this flag doesn't make sense */ + if (flags & BDRV_REQ_REGISTERED_BUF) { + return -EINVAL; + } + /* Invalidate the cached block-status data range if this write overlaps */ bdrv_bsc_invalidate_range(bs, offset, bytes); @@ -2133,6 +2145,9 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, bool padding; BdrvRequestPadding pad; + /* This flag doesn't make sense for padding or zero writes */ + flags &= ~BDRV_REQ_REGISTERED_BUF; + padding = bdrv_init_padding(bs, offset, bytes, &pad); if (padding) { assert(!(flags & BDRV_REQ_NO_WAIT)); @@ -2250,7 +2265,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, * alignment only if there is no ZERO flag. */ ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, - &padded); + &padded, &flags); if (ret < 0) { return ret; } @@ -3262,29 +3277,57 @@ void bdrv_io_unplug(BlockDriverState *bs) } } -void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size) +/* Helper that undoes bdrv_register_buf() when it fails partway through */ +static void bdrv_register_buf_rollback(BlockDriverState *bs, + void *host, + size_t size, + BdrvChild *final_child) +{ + BdrvChild *child; + + QLIST_FOREACH(child, &bs->children, next) { + if (child == final_child) { + break; + } + + bdrv_unregister_buf(child->bs, host, size); + } + + if (bs->drv && bs->drv->bdrv_unregister_buf) { + bs->drv->bdrv_unregister_buf(bs, host, size); + } +} + +bool bdrv_register_buf(BlockDriverState *bs, void *host, size_t size, + Error **errp) { BdrvChild *child; GLOBAL_STATE_CODE(); if (bs->drv && bs->drv->bdrv_register_buf) { - bs->drv->bdrv_register_buf(bs, host, size); + if (!bs->drv->bdrv_register_buf(bs, host, size, errp)) { + return false; + } } QLIST_FOREACH(child, &bs->children, next) { - bdrv_register_buf(child->bs, host, size); + if (!bdrv_register_buf(child->bs, host, size, errp)) { + bdrv_register_buf_rollback(bs, host, size, child); + return false; + } } + return true; } -void bdrv_unregister_buf(BlockDriverState *bs, void *host) +void bdrv_unregister_buf(BlockDriverState *bs, void *host, size_t size) { BdrvChild *child; GLOBAL_STATE_CODE(); if (bs->drv && bs->drv->bdrv_unregister_buf) { - bs->drv->bdrv_unregister_buf(bs, host); + bs->drv->bdrv_unregister_buf(bs, host, size); } QLIST_FOREACH(child, &bs->children, next) { - bdrv_unregister_buf(child->bs, host); + bdrv_unregister_buf(child->bs, host, size); } } diff --git a/block/meson.build b/block/meson.build index 60bc305597..b7c68b83a3 100644 --- a/block/meson.build +++ b/block/meson.build @@ -46,6 +46,7 @@ block_ss.add(files( ), zstd, zlib, gnutls) softmmu_ss.add(when: 'CONFIG_TCG', if_true: files('blkreplay.c')) +softmmu_ss.add(files('block-ram-registrar.c')) if get_option('qcow1').allowed() block_ss.add(files('qcow.c')) @@ -92,6 +93,7 @@ block_modules = {} modsrc = [] foreach m : [ + [blkio, 'blkio', files('blkio.c')], [curl, 'curl', files('curl.c')], [glusterfs, 'gluster', files('gluster.c')], [libiscsi, 'iscsi', [files('iscsi.c'), libm]], diff --git a/block/mirror.c b/block/mirror.c index 80c0109d39..bed089d2e0 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1486,6 +1486,8 @@ static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs, qemu_iovec_init(&bounce_qiov, 1); qemu_iovec_add(&bounce_qiov, bounce_buf, bytes); qiov = &bounce_qiov; + + flags &= ~BDRV_REQ_REGISTERED_BUF; } ret = bdrv_mirror_top_do_write(bs, MIRROR_METHOD_COPY, offset, bytes, qiov, diff --git a/block/nbd.c b/block/nbd.c index 494b9d683e..7d485c86d2 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -1222,7 +1222,6 @@ static int coroutine_fn nbd_client_co_preadv(BlockDriverState *bs, int64_t offse }; assert(bytes <= NBD_MAX_BUFFER_SIZE); - assert(!flags); if (!bytes) { return 0; diff --git a/block/nvme.c b/block/nvme.c index 2b24f95164..656624c585 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -1587,22 +1587,22 @@ static void nvme_aio_unplug(BlockDriverState *bs) } } -static void nvme_register_buf(BlockDriverState *bs, void *host, size_t size) +static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size, + Error **errp) { int ret; - Error *local_err = NULL; BDRVNVMeState *s = bs->opaque; - ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL, &local_err); - if (ret) { - /* FIXME: we may run out of IOVA addresses after repeated - * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap - * doesn't reclaim addresses for fixed mappings. */ - error_reportf_err(local_err, "nvme_register_buf failed: "); - } + /* + * FIXME: we may run out of IOVA addresses after repeated + * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap + * doesn't reclaim addresses for fixed mappings. + */ + ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL, errp); + return ret == 0; } -static void nvme_unregister_buf(BlockDriverState *bs, void *host) +static void nvme_unregister_buf(BlockDriverState *bs, void *host, size_t size) { BDRVNVMeState *s = bs->opaque; diff --git a/block/parallels.c b/block/parallels.c index c1523e7dab..dd15a44100 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -329,7 +329,6 @@ static coroutine_fn int parallels_co_writev(BlockDriverState *bs, QEMUIOVector hd_qiov; int ret = 0; - assert(!flags); qemu_iovec_init(&hd_qiov, qiov->niov); while (nb_sectors > 0) { diff --git a/block/qcow.c b/block/qcow.c index 311aaa8705..e9180c7b61 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -628,7 +628,6 @@ static coroutine_fn int qcow_co_preadv(BlockDriverState *bs, int64_t offset, uint8_t *buf; void *orig_buf; - assert(!flags); if (qiov->niov > 1) { buf = orig_buf = qemu_try_blockalign(bs, qiov->size); if (buf == NULL) { @@ -725,7 +724,6 @@ static coroutine_fn int qcow_co_pwritev(BlockDriverState *bs, int64_t offset, uint8_t *buf; void *orig_buf; - assert(!flags); s->cluster_cache_offset = -1; /* disable compressed cache */ /* We must always copy the iov when encrypting, so we diff --git a/block/qed.c b/block/qed.c index bda00e6257..99a9ec9b57 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1395,7 +1395,6 @@ static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags) { - assert(!flags); return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE); } diff --git a/block/raw-format.c b/block/raw-format.c index f337ac7569..c8dc9bc850 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -258,6 +258,8 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset, qemu_iovec_add(&local_qiov, buf, 512); qemu_iovec_concat(&local_qiov, qiov, 512, qiov->size - 512); qiov = &local_qiov; + + flags &= ~BDRV_REQ_REGISTERED_BUF; } ret = raw_adjust_offset(bs, &offset, bytes, true); diff --git a/block/replication.c b/block/replication.c index c67f931f37..13f1d39571 100644 --- a/block/replication.c +++ b/block/replication.c @@ -261,7 +261,6 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs, int ret; int64_t n; - assert(!flags); ret = replication_get_io_status(s); if (ret < 0) { goto out; diff --git a/block/ssh.c b/block/ssh.c index a2dc646536..a3cddc392c 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -1196,7 +1196,6 @@ static coroutine_fn int ssh_co_writev(BlockDriverState *bs, BDRVSSHState *s = bs->opaque; int ret; - assert(!flags); qemu_co_mutex_lock(&s->lock); ret = ssh_write(s, bs, sector_num * BDRV_SECTOR_SIZE, nb_sectors * BDRV_SECTOR_SIZE, qiov); diff --git a/block/vhdx.c b/block/vhdx.c index e10e78ebfd..e2344ee0b7 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1342,7 +1342,6 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, uint64_t bat_prior_offset = 0; bool bat_update = false; - assert(!flags); qemu_iovec_init(&hd_qiov, qiov->niov); qemu_co_mutex_lock(&s->lock); diff --git a/bsd-user/mmap.c b/bsd-user/mmap.c index e54e26de17..d6c5a344c9 100644 --- a/bsd-user/mmap.c +++ b/bsd-user/mmap.c @@ -663,7 +663,6 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int prot, page_dump(stdout); printf("\n"); #endif - tb_invalidate_phys_range(start, start + len); mmap_unlock(); return start; fail: @@ -769,7 +768,6 @@ int target_munmap(abi_ulong start, abi_ulong len) if (ret == 0) { page_set_flags(start, start + len, 0); - tb_invalidate_phys_range(start, start + len); } mmap_unlock(); return ret; diff --git a/contrib/elf2dmp/main.c b/contrib/elf2dmp/main.c index b9fc6d230c..d77b8f98f7 100644 --- a/contrib/elf2dmp/main.c +++ b/contrib/elf2dmp/main.c @@ -125,6 +125,7 @@ static KDDEBUGGER_DATA64 *get_kdbg(uint64_t KernBase, struct pdb_reader *pdb, if (va_space_rw(vs, KdDebuggerDataBlock, kdbg, kdbg_hdr.Size, 0)) { eprintf("Failed to extract entire KDBG\n"); + free(kdbg); return NULL; } @@ -277,7 +277,7 @@ void list_cpus(const char *optarg) void tb_invalidate_phys_addr(target_ulong addr) { mmap_lock(); - tb_invalidate_phys_page_range(addr, addr + 1); + tb_invalidate_phys_page(addr); mmap_unlock(); } #else @@ -298,7 +298,7 @@ void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs) return; } ram_addr = memory_region_get_ram_addr(mr) + addr; - tb_invalidate_phys_page_range(ram_addr, ram_addr + 1); + tb_invalidate_phys_page(ram_addr); } #endif diff --git a/docs/devel/reset.rst b/docs/devel/reset.rst index abea1102dc..7cc6a6b314 100644 --- a/docs/devel/reset.rst +++ b/docs/devel/reset.rst @@ -210,9 +210,11 @@ Polling the reset state Resettable interface provides the ``resettable_is_in_reset()`` function. This function returns true if the object parameter is currently under reset. -An object is under reset from the beginning of the *init* phase to the end of -the *exit* phase. During all three phases, the function will return that the -object is in reset. +An object is under reset from the beginning of the *enter* phase (before +either its children or its own enter method is called) to the *exit* +phase. During *enter* and *hold* phase only, the function will return that the +object is in reset. The state is changed after the *exit* is propagated to +its children and just before calling the object's own *exit* method. This function may be used if the object behavior has to be adapted while in reset state. For example if a device has an irq input, diff --git a/docs/system/arm/emulation.rst b/docs/system/arm/emulation.rst index cfb4b0768b..e3af79bb8c 100644 --- a/docs/system/arm/emulation.rst +++ b/docs/system/arm/emulation.rst @@ -24,6 +24,7 @@ the following architecture extensions: - FEAT_Debugv8p4 (Debug changes for v8.4) - FEAT_DotProd (Advanced SIMD dot product instructions) - FEAT_DoubleFault (Double Fault Extension) +- FEAT_E0PD (Preventing EL0 access to halves of address maps) - FEAT_ETS (Enhanced Translation Synchronization) - FEAT_FCMA (Floating-point complex number instructions) - FEAT_FHM (Floating-point half-precision multiplication instructions) @@ -32,6 +33,7 @@ the following architecture extensions: - FEAT_FlagM (Flag manipulation instructions v2) - FEAT_FlagM2 (Enhancements to flag manipulation instructions) - FEAT_GTG (Guest translation granule size) +- FEAT_HAFDBS (Hardware management of the access flag and dirty bit state) - FEAT_HCX (Support for the HCRX_EL2 register) - FEAT_HPDS (Hierarchical permission disables) - FEAT_I8MM (AArch64 Int8 matrix multiplication instructions) diff --git a/dump/dump.c b/dump/dump.c index 236559b03a..df117c847f 100644 --- a/dump/dump.c +++ b/dump/dump.c @@ -103,6 +103,7 @@ static int dump_cleanup(DumpState *s) memory_mapping_list_free(&s->list); close(s->fd); g_free(s->guest_note); + g_array_unref(s->string_table_buf); s->guest_note = NULL; if (s->resume) { if (s->detached) { @@ -152,11 +153,10 @@ static void prepare_elf64_header(DumpState *s, Elf64_Ehdr *elf_header) elf_header->e_phoff = cpu_to_dump64(s, s->phdr_offset); elf_header->e_phentsize = cpu_to_dump16(s, sizeof(Elf64_Phdr)); elf_header->e_phnum = cpu_to_dump16(s, phnum); - if (s->shdr_num) { - elf_header->e_shoff = cpu_to_dump64(s, s->shdr_offset); - elf_header->e_shentsize = cpu_to_dump16(s, sizeof(Elf64_Shdr)); - elf_header->e_shnum = cpu_to_dump16(s, s->shdr_num); - } + elf_header->e_shoff = cpu_to_dump64(s, s->shdr_offset); + elf_header->e_shentsize = cpu_to_dump16(s, sizeof(Elf64_Shdr)); + elf_header->e_shnum = cpu_to_dump16(s, s->shdr_num); + elf_header->e_shstrndx = cpu_to_dump16(s, s->shdr_num - 1); } static void prepare_elf32_header(DumpState *s, Elf32_Ehdr *elf_header) @@ -180,11 +180,10 @@ static void prepare_elf32_header(DumpState *s, Elf32_Ehdr *elf_header) elf_header->e_phoff = cpu_to_dump32(s, s->phdr_offset); elf_header->e_phentsize = cpu_to_dump16(s, sizeof(Elf32_Phdr)); elf_header->e_phnum = cpu_to_dump16(s, phnum); - if (s->shdr_num) { - elf_header->e_shoff = cpu_to_dump32(s, s->shdr_offset); - elf_header->e_shentsize = cpu_to_dump16(s, sizeof(Elf32_Shdr)); - elf_header->e_shnum = cpu_to_dump16(s, s->shdr_num); - } + elf_header->e_shoff = cpu_to_dump32(s, s->shdr_offset); + elf_header->e_shentsize = cpu_to_dump16(s, sizeof(Elf32_Shdr)); + elf_header->e_shnum = cpu_to_dump16(s, s->shdr_num); + elf_header->e_shstrndx = cpu_to_dump16(s, s->shdr_num - 1); } static void write_elf_header(DumpState *s, Error **errp) @@ -195,6 +194,8 @@ static void write_elf_header(DumpState *s, Error **errp) void *header_ptr; int ret; + /* The NULL header and the shstrtab are always defined */ + assert(s->shdr_num >= 2); if (dump_is_64bit(s)) { prepare_elf64_header(s, &elf64_header); header_size = sizeof(elf64_header); @@ -380,30 +381,136 @@ static void write_elf_phdr_note(DumpState *s, Error **errp) } } -static void write_elf_section(DumpState *s, int type, Error **errp) +static void prepare_elf_section_hdr_zero(DumpState *s) { - Elf32_Shdr shdr32; - Elf64_Shdr shdr64; + if (dump_is_64bit(s)) { + Elf64_Shdr *shdr64 = s->elf_section_hdrs; + + shdr64->sh_info = cpu_to_dump32(s, s->phdr_num); + } else { + Elf32_Shdr *shdr32 = s->elf_section_hdrs; + + shdr32->sh_info = cpu_to_dump32(s, s->phdr_num); + } +} + +static void prepare_elf_section_hdr_string(DumpState *s, void *buff) +{ + uint64_t index = s->string_table_buf->len; + const char strtab[] = ".shstrtab"; + Elf32_Shdr shdr32 = {}; + Elf64_Shdr shdr64 = {}; int shdr_size; void *shdr; - int ret; - if (type == 0) { - shdr_size = sizeof(Elf32_Shdr); - memset(&shdr32, 0, shdr_size); - shdr32.sh_info = cpu_to_dump32(s, s->phdr_num); - shdr = &shdr32; - } else { + g_array_append_vals(s->string_table_buf, strtab, sizeof(strtab)); + if (dump_is_64bit(s)) { shdr_size = sizeof(Elf64_Shdr); - memset(&shdr64, 0, shdr_size); - shdr64.sh_info = cpu_to_dump32(s, s->phdr_num); + shdr64.sh_type = SHT_STRTAB; + shdr64.sh_offset = s->section_offset + s->elf_section_data_size; + shdr64.sh_name = index; + shdr64.sh_size = s->string_table_buf->len; shdr = &shdr64; + } else { + shdr_size = sizeof(Elf32_Shdr); + shdr32.sh_type = SHT_STRTAB; + shdr32.sh_offset = s->section_offset + s->elf_section_data_size; + shdr32.sh_name = index; + shdr32.sh_size = s->string_table_buf->len; + shdr = &shdr32; + } + memcpy(buff, shdr, shdr_size); +} + +static bool prepare_elf_section_hdrs(DumpState *s, Error **errp) +{ + size_t len, sizeof_shdr; + void *buff_hdr; + + /* + * Section ordering: + * - HDR zero + * - Arch section hdrs + * - String table hdr + */ + sizeof_shdr = dump_is_64bit(s) ? sizeof(Elf64_Shdr) : sizeof(Elf32_Shdr); + len = sizeof_shdr * s->shdr_num; + s->elf_section_hdrs = g_malloc0(len); + buff_hdr = s->elf_section_hdrs; + + /* + * The first section header is ALWAYS a special initial section + * header. + * + * The header should be 0 with one exception being that if + * phdr_num is PN_XNUM then the sh_info field contains the real + * number of segment entries. + * + * As we zero allocate the buffer we will only need to modify + * sh_info for the PN_XNUM case. + */ + if (s->phdr_num >= PN_XNUM) { + prepare_elf_section_hdr_zero(s); } + buff_hdr += sizeof_shdr; + + /* Add architecture defined section headers */ + if (s->dump_info.arch_sections_write_hdr_fn + && s->shdr_num > 2) { + buff_hdr += s->dump_info.arch_sections_write_hdr_fn(s, buff_hdr); + + if (s->shdr_num >= SHN_LORESERVE) { + error_setg_errno(errp, EINVAL, + "dump: too many architecture defined sections"); + return false; + } + } + + /* + * String table is the last section since strings are added via + * arch_sections_write_hdr(). + */ + prepare_elf_section_hdr_string(s, buff_hdr); + return true; +} - ret = fd_write_vmcore(shdr, shdr_size, s); +static void write_elf_section_headers(DumpState *s, Error **errp) +{ + size_t sizeof_shdr = dump_is_64bit(s) ? sizeof(Elf64_Shdr) : sizeof(Elf32_Shdr); + int ret; + + if (!prepare_elf_section_hdrs(s, errp)) { + return; + } + + ret = fd_write_vmcore(s->elf_section_hdrs, s->shdr_num * sizeof_shdr, s); if (ret < 0) { - error_setg_errno(errp, -ret, - "dump: failed to write section header table"); + error_setg_errno(errp, -ret, "dump: failed to write section headers"); + } + + g_free(s->elf_section_hdrs); +} + +static void write_elf_sections(DumpState *s, Error **errp) +{ + int ret; + + if (s->elf_section_data_size) { + /* Write architecture section data */ + ret = fd_write_vmcore(s->elf_section_data, + s->elf_section_data_size, s); + if (ret < 0) { + error_setg_errno(errp, -ret, + "dump: failed to write architecture section data"); + return; + } + } + + /* Write string table */ + ret = fd_write_vmcore(s->string_table_buf->data, + s->string_table_buf->len, s); + if (ret < 0) { + error_setg_errno(errp, -ret, "dump: failed to write string table data"); } } @@ -554,6 +661,8 @@ static void dump_begin(DumpState *s, Error **errp) * -------------- * | elf header | * -------------- + * | sctn_hdr | + * -------------- * | PT_NOTE | * -------------- * | PT_LOAD | @@ -562,8 +671,6 @@ static void dump_begin(DumpState *s, Error **errp) * -------------- * | PT_LOAD | * -------------- - * | sec_hdr | - * -------------- * | elf note | * -------------- * | memory | @@ -579,6 +686,12 @@ static void dump_begin(DumpState *s, Error **errp) return; } + /* write section headers to vmcore */ + write_elf_section_headers(s, errp); + if (*errp) { + return; + } + /* write PT_NOTE to vmcore */ write_elf_phdr_note(s, errp); if (*errp) { @@ -591,21 +704,13 @@ static void dump_begin(DumpState *s, Error **errp) return; } - /* write section to vmcore */ - if (s->shdr_num) { - write_elf_section(s, 1, errp); - if (*errp) { - return; - } - } - /* write notes to vmcore */ write_elf_notes(s, errp); } -static int64_t dump_filtered_memblock_size(GuestPhysBlock *block, - int64_t filter_area_start, - int64_t filter_area_length) +int64_t dump_filtered_memblock_size(GuestPhysBlock *block, + int64_t filter_area_start, + int64_t filter_area_length) { int64_t size, left, right; @@ -623,9 +728,9 @@ static int64_t dump_filtered_memblock_size(GuestPhysBlock *block, return size; } -static int64_t dump_filtered_memblock_start(GuestPhysBlock *block, - int64_t filter_area_start, - int64_t filter_area_length) +int64_t dump_filtered_memblock_start(GuestPhysBlock *block, + int64_t filter_area_start, + int64_t filter_area_length) { if (filter_area_length) { /* return -1 if the block is not within filter area */ @@ -665,6 +770,31 @@ static void dump_iterate(DumpState *s, Error **errp) } } +static void dump_end(DumpState *s, Error **errp) +{ + int rc; + ERRP_GUARD(); + + if (s->elf_section_data_size) { + s->elf_section_data = g_malloc0(s->elf_section_data_size); + } + + /* Adds the architecture defined section data to s->elf_section_data */ + if (s->dump_info.arch_sections_write_fn && + s->elf_section_data_size) { + rc = s->dump_info.arch_sections_write_fn(s, s->elf_section_data); + if (rc) { + error_setg_errno(errp, rc, + "dump: failed to get arch section data"); + g_free(s->elf_section_data); + return; + } + } + + /* write sections to vmcore */ + write_elf_sections(s, errp); +} + static void create_vmcore(DumpState *s, Error **errp) { ERRP_GUARD(); @@ -674,7 +804,14 @@ static void create_vmcore(DumpState *s, Error **errp) return; } + /* Iterate over memory and dump it to file */ dump_iterate(s, errp); + if (*errp) { + return; + } + + /* Write the section data */ + dump_end(s, errp); } static int write_start_flat_header(int fd) @@ -1684,6 +1821,14 @@ static void dump_init(DumpState *s, int fd, bool has_format, s->filter_area_begin = begin; s->filter_area_length = length; + /* First index is 0, it's the special null name */ + s->string_table_buf = g_array_new(FALSE, TRUE, 1); + /* + * Allocate the null name, due to the clearing option set to true + * it will be 0. + */ + g_array_set_size(s->string_table_buf, 1); + memory_mapping_list_init(&s->list); guest_phys_blocks_init(&s->guest_phys_blocks); @@ -1820,38 +1965,53 @@ static void dump_init(DumpState *s, int fd, bool has_format, } /* - * calculate phdr_num + * The first section header is always a special one in which most + * fields are 0. The section header string table is also always + * set. + */ + s->shdr_num = 2; + + /* + * Adds the number of architecture sections to shdr_num and sets + * elf_section_data_size so we know the offsets and sizes of all + * parts. + */ + if (s->dump_info.arch_sections_add_fn) { + s->dump_info.arch_sections_add_fn(s); + } + + /* + * calculate shdr_num so we know the offsets and sizes of all + * parts. + * Calculate phdr_num * - * the type of ehdr->e_phnum is uint16_t, so we should avoid overflow + * The absolute maximum amount of phdrs is UINT32_MAX - 1 as + * sh_info is 32 bit. There's special handling once we go over + * UINT16_MAX - 1 but that is handled in the ehdr and section + * code. */ - s->phdr_num = 1; /* PT_NOTE */ - if (s->list.num < UINT16_MAX - 2) { - s->shdr_num = 0; + s->phdr_num = 1; /* Reserve PT_NOTE */ + if (s->list.num <= UINT32_MAX - 1) { s->phdr_num += s->list.num; } else { - /* sh_info of section 0 holds the real number of phdrs */ - s->shdr_num = 1; - - /* the type of shdr->sh_info is uint32_t, so we should avoid overflow */ - if (s->list.num <= UINT32_MAX - 1) { - s->phdr_num += s->list.num; - } else { - s->phdr_num = UINT32_MAX; - } + s->phdr_num = UINT32_MAX; } + /* + * Now that the number of section and program headers is known we + * can calculate the offsets of the headers and data. + */ if (dump_is_64bit(s)) { - s->phdr_offset = sizeof(Elf64_Ehdr); - s->shdr_offset = s->phdr_offset + sizeof(Elf64_Phdr) * s->phdr_num; - s->note_offset = s->shdr_offset + sizeof(Elf64_Shdr) * s->shdr_num; - s->memory_offset = s->note_offset + s->note_size; + s->shdr_offset = sizeof(Elf64_Ehdr); + s->phdr_offset = s->shdr_offset + sizeof(Elf64_Shdr) * s->shdr_num; + s->note_offset = s->phdr_offset + sizeof(Elf64_Phdr) * s->phdr_num; } else { - - s->phdr_offset = sizeof(Elf32_Ehdr); - s->shdr_offset = s->phdr_offset + sizeof(Elf32_Phdr) * s->phdr_num; - s->note_offset = s->shdr_offset + sizeof(Elf32_Shdr) * s->shdr_num; - s->memory_offset = s->note_offset + s->note_size; + s->shdr_offset = sizeof(Elf32_Ehdr); + s->phdr_offset = s->shdr_offset + sizeof(Elf32_Shdr) * s->shdr_num; + s->note_offset = s->phdr_offset + sizeof(Elf32_Phdr) * s->phdr_num; } + s->memory_offset = s->note_offset + s->note_size; + s->section_offset = s->memory_offset + s->total_size; return; diff --git a/dump/win_dump.c b/dump/win_dump.c index fd91350fbb..f20b6051b6 100644 --- a/dump/win_dump.c +++ b/dump/win_dump.c @@ -273,6 +273,13 @@ static void patch_and_save_context(WinDumpHeader *h, bool x64, uint64_t Context; WinContext ctx; + if (i >= WIN_DUMP_FIELD(NumberProcessors)) { + warn_report("win-dump: number of QEMU CPUs is bigger than" + " NumberProcessors (%u) in guest Windows", + WIN_DUMP_FIELD(NumberProcessors)); + return; + } + if (cpu_read_ptr(x64, first_cpu, KiProcessorBlock + i * win_dump_ptr_size(x64), &Prcb)) { diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index 9bf13133e5..072cf67956 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -1803,7 +1803,7 @@ static void coroutine_fn v9fs_walk(void *opaque) err = pdu_unmarshal(pdu, offset, "ddw", &fid, &newfid, &nwnames); if (err < 0) { pdu_complete(pdu, err); - return ; + return; } offset += err; diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c index bc3ecdb619..55f114ef72 100644 --- a/hw/arm/aspeed.c +++ b/hw/arm/aspeed.c @@ -1099,7 +1099,7 @@ static void aspeed_machine_palmetto_class_init(ObjectClass *oc, void *data) amc->soc_name = "ast2400-a1"; amc->hw_strap1 = PALMETTO_BMC_HW_STRAP1; amc->fmc_model = "n25q256a"; - amc->spi_model = "mx25l25635e"; + amc->spi_model = "mx25l25635f"; amc->num_cs = 1; amc->i2c_init = palmetto_bmc_i2c_init; mc->default_ram_size = 256 * MiB; @@ -1150,7 +1150,7 @@ static void aspeed_machine_ast2500_evb_class_init(ObjectClass *oc, void *data) amc->soc_name = "ast2500-a1"; amc->hw_strap1 = AST2500_EVB_HW_STRAP1; amc->fmc_model = "mx25l25635e"; - amc->spi_model = "mx25l25635e"; + amc->spi_model = "mx25l25635f"; amc->num_cs = 1; amc->i2c_init = ast2500_evb_i2c_init; mc->default_ram_size = 512 * MiB; @@ -1200,7 +1200,7 @@ static void aspeed_machine_witherspoon_class_init(ObjectClass *oc, void *data) mc->desc = "OpenPOWER Witherspoon BMC (ARM1176)"; amc->soc_name = "ast2500-a1"; amc->hw_strap1 = WITHERSPOON_BMC_HW_STRAP1; - amc->fmc_model = "mx25l25635e"; + amc->fmc_model = "mx25l25635f"; amc->spi_model = "mx66l1g45g"; amc->num_cs = 2; amc->i2c_init = witherspoon_bmc_i2c_init; @@ -1330,6 +1330,13 @@ static void aspeed_machine_fuji_class_init(ObjectClass *oc, void *data) aspeed_soc_num_cpus(amc->soc_name); }; +/* On 32-bit hosts, lower RAM to 1G because of the 2047 MB limit */ +#if HOST_LONG_BITS == 32 +#define BLETCHLEY_BMC_RAM_SIZE (1 * GiB) +#else +#define BLETCHLEY_BMC_RAM_SIZE (2 * GiB) +#endif + static void aspeed_machine_bletchley_class_init(ObjectClass *oc, void *data) { MachineClass *mc = MACHINE_CLASS(oc); @@ -1344,17 +1351,17 @@ static void aspeed_machine_bletchley_class_init(ObjectClass *oc, void *data) amc->num_cs = 2; amc->macs_mask = ASPEED_MAC2_ON; amc->i2c_init = bletchley_bmc_i2c_init; - mc->default_ram_size = 512 * MiB; + mc->default_ram_size = BLETCHLEY_BMC_RAM_SIZE; mc->default_cpus = mc->min_cpus = mc->max_cpus = aspeed_soc_num_cpus(amc->soc_name); } -static void fby35_reset(MachineState *state) +static void fby35_reset(MachineState *state, ShutdownCause reason) { AspeedMachineState *bmc = ASPEED_MACHINE(state); AspeedGPIOState *gpio = &bmc->soc.gpio; - qemu_devices_reset(); + qemu_devices_reset(reason); /* Board ID: 7 (Class-1, 4 slots) */ object_property_set_bool(OBJECT(gpio), "gpioV4", true, &error_fatal); diff --git a/hw/arm/aspeed_ast2600.c b/hw/arm/aspeed_ast2600.c index aa2cd90bec..cd75465c2b 100644 --- a/hw/arm/aspeed_ast2600.c +++ b/hw/arm/aspeed_ast2600.c @@ -307,6 +307,8 @@ static void aspeed_soc_ast2600_realize(DeviceState *dev, Error **errp) object_property_set_int(OBJECT(&s->cpu[i]), "cntfrq", 1125000000, &error_abort); + object_property_set_bool(OBJECT(&s->cpu[i]), "neon", false, + &error_abort); object_property_set_link(OBJECT(&s->cpu[i]), "memory", OBJECT(s->memory), &error_abort); diff --git a/hw/arm/boot.c b/hw/arm/boot.c index b0b92af188..b106f31468 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -683,6 +683,8 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo, * the DTB is copied again upon reset, even if addr points into RAM. */ rom_add_blob_fixed_as("dtb", fdt, size, addr, as); + qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds, + rom_ptr_for_as(as, addr, size)); g_free(fdt); diff --git a/hw/arm/mps2-tz.c b/hw/arm/mps2-tz.c index 394192b9b2..284c09c91d 100644 --- a/hw/arm/mps2-tz.c +++ b/hw/arm/mps2-tz.c @@ -1239,7 +1239,7 @@ static void mps2_set_remap(Object *obj, const char *value, Error **errp) } } -static void mps2_machine_reset(MachineState *machine) +static void mps2_machine_reset(MachineState *machine, ShutdownCause reason) { MPS2TZMachineState *mms = MPS2TZ_MACHINE(machine); @@ -1249,7 +1249,7 @@ static void mps2_machine_reset(MachineState *machine) * reset see the correct mapping. */ remap_memory(mms, mms->remap); - qemu_devices_reset(); + qemu_devices_reset(reason); } static void mps2tz_class_init(ObjectClass *oc, void *data) diff --git a/hw/arm/nseries.c b/hw/arm/nseries.c index 692c94ceb4..b151113c27 100644 --- a/hw/arm/nseries.c +++ b/hw/arm/nseries.c @@ -702,7 +702,7 @@ static uint32_t mipid_txrx(void *opaque, uint32_t cmd, int len) static void *mipid_init(void) { - struct mipid_s *s = (struct mipid_s *) g_malloc0(sizeof(*s)); + struct mipid_s *s = g_malloc0(sizeof(*s)); s->id = 0x838f03; mipid_reset(s); @@ -1300,7 +1300,7 @@ static int n810_atag_setup(const struct arm_boot_info *info, void *p) static void n8x0_init(MachineState *machine, struct arm_boot_info *binfo, int model) { - struct n800_s *s = (struct n800_s *) g_malloc0(sizeof(*s)); + struct n800_s *s = g_malloc0(sizeof(*s)); MachineClass *mc = MACHINE_GET_CLASS(machine); if (machine->ram_size != mc->default_ram_size) { diff --git a/hw/arm/virt.c b/hw/arm/virt.c index cda9defe8f..b871350856 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -1371,14 +1371,15 @@ static void create_smmu(const VirtMachineState *vms, static void create_virtio_iommu_dt_bindings(VirtMachineState *vms) { - const char compat[] = "virtio,pci-iommu"; + const char compat[] = "virtio,pci-iommu\0pci1af4,1057"; uint16_t bdf = vms->virtio_iommu_bdf; MachineState *ms = MACHINE(vms); char *node; vms->iommu_phandle = qemu_fdt_alloc_phandle(ms->fdt); - node = g_strdup_printf("%s/virtio_iommu@%d", vms->pciehb_nodename, bdf); + node = g_strdup_printf("%s/virtio_iommu@%x,%x", vms->pciehb_nodename, + PCI_SLOT(bdf), PCI_FUNC(bdf)); qemu_fdt_add_subnode(ms->fdt, node); qemu_fdt_setprop(ms->fdt, node, "compatible", compat, sizeof(compat)); qemu_fdt_setprop_sized_cells(ms->fdt, node, "reg", diff --git a/hw/block/m25p80.c b/hw/block/m25p80.c index a8d2519141..02adc87527 100644 --- a/hw/block/m25p80.c +++ b/hw/block/m25p80.c @@ -35,6 +35,7 @@ #include "qapi/error.h" #include "trace.h" #include "qom/object.h" +#include "m25p80_sfdp.h" /* 16 MiB max in 3 byte address mode */ #define MAX_3BYTES_SIZE 0x1000000 @@ -72,6 +73,7 @@ typedef struct FlashPartInfo { * This field inform how many die is in the chip. */ uint8_t die_cnt; + uint8_t (*sfdp_read)(uint32_t sfdp_addr); } FlashPartInfo; /* adapted from linux */ @@ -230,12 +232,16 @@ static const FlashPartInfo known_devices[] = { { INFO("mx25l6405d", 0xc22017, 0, 64 << 10, 128, 0) }, { INFO("mx25l12805d", 0xc22018, 0, 64 << 10, 256, 0) }, { INFO("mx25l12855e", 0xc22618, 0, 64 << 10, 256, 0) }, - { INFO6("mx25l25635e", 0xc22019, 0xc22019, 64 << 10, 512, 0) }, + { INFO6("mx25l25635e", 0xc22019, 0xc22019, 64 << 10, 512, + ER_4K | ER_32K), .sfdp_read = m25p80_sfdp_mx25l25635e }, + { INFO6("mx25l25635f", 0xc22019, 0xc22019, 64 << 10, 512, + ER_4K | ER_32K), .sfdp_read = m25p80_sfdp_mx25l25635f }, { INFO("mx25l25655e", 0xc22619, 0, 64 << 10, 512, 0) }, { INFO("mx66l51235f", 0xc2201a, 0, 64 << 10, 1024, ER_4K | ER_32K) }, { INFO("mx66u51235f", 0xc2253a, 0, 64 << 10, 1024, ER_4K | ER_32K) }, { INFO("mx66u1g45g", 0xc2253b, 0, 64 << 10, 2048, ER_4K | ER_32K) }, - { INFO("mx66l1g45g", 0xc2201b, 0, 64 << 10, 2048, ER_4K | ER_32K) }, + { INFO("mx66l1g45g", 0xc2201b, 0, 64 << 10, 2048, ER_4K | ER_32K), + .sfdp_read = m25p80_sfdp_mx66l1g45g }, /* Micron */ { INFO("n25q032a11", 0x20bb16, 0, 64 << 10, 64, ER_4K) }, @@ -245,13 +251,15 @@ static const FlashPartInfo known_devices[] = { { INFO("n25q128a11", 0x20bb18, 0, 64 << 10, 256, ER_4K) }, { INFO("n25q128a13", 0x20ba18, 0, 64 << 10, 256, ER_4K) }, { INFO("n25q256a11", 0x20bb19, 0, 64 << 10, 512, ER_4K) }, - { INFO("n25q256a13", 0x20ba19, 0, 64 << 10, 512, ER_4K) }, + { INFO("n25q256a13", 0x20ba19, 0, 64 << 10, 512, ER_4K), + .sfdp_read = m25p80_sfdp_n25q256a }, { INFO("n25q512a11", 0x20bb20, 0, 64 << 10, 1024, ER_4K) }, { INFO("n25q512a13", 0x20ba20, 0, 64 << 10, 1024, ER_4K) }, { INFO("n25q128", 0x20ba18, 0, 64 << 10, 256, 0) }, { INFO("n25q256a", 0x20ba19, 0, 64 << 10, 512, - ER_4K | HAS_SR_BP3_BIT6 | HAS_SR_TB) }, - { INFO("n25q512a", 0x20ba20, 0, 64 << 10, 1024, ER_4K) }, + ER_4K | HAS_SR_BP3_BIT6 | HAS_SR_TB), + .sfdp_read = m25p80_sfdp_n25q256a }, + { INFO("n25q512a", 0x20ba20, 0, 64 << 10, 1024, ER_4K) }, { INFO("n25q512ax3", 0x20ba20, 0x1000, 64 << 10, 1024, ER_4K) }, { INFO("mt25ql512ab", 0x20ba20, 0x1044, 64 << 10, 1024, ER_4K | ER_32K) }, { INFO_STACKED("mt35xu01g", 0x2c5b1b, 0x104100, 128 << 10, 1024, @@ -337,9 +345,12 @@ static const FlashPartInfo known_devices[] = { { INFO("w25q64", 0xef4017, 0, 64 << 10, 128, ER_4K) }, { INFO("w25q80", 0xef5014, 0, 64 << 10, 16, ER_4K) }, { INFO("w25q80bl", 0xef4014, 0, 64 << 10, 16, ER_4K) }, - { INFO("w25q256", 0xef4019, 0, 64 << 10, 512, ER_4K) }, - { INFO("w25q512jv", 0xef4020, 0, 64 << 10, 1024, ER_4K) }, - { INFO("w25q01jvq", 0xef4021, 0, 64 << 10, 2048, ER_4K) }, + { INFO("w25q256", 0xef4019, 0, 64 << 10, 512, ER_4K), + .sfdp_read = m25p80_sfdp_w25q256 }, + { INFO("w25q512jv", 0xef4020, 0, 64 << 10, 1024, ER_4K), + .sfdp_read = m25p80_sfdp_w25q512jv }, + { INFO("w25q01jvq", 0xef4021, 0, 64 << 10, 2048, ER_4K), + .sfdp_read = m25p80_sfdp_w25q01jvq }, }; typedef enum { @@ -355,6 +366,7 @@ typedef enum { BULK_ERASE = 0xc7, READ_FSR = 0x70, RDCR = 0x15, + RDSFDP = 0x5a, READ = 0x03, READ4 = 0x13, @@ -421,6 +433,7 @@ typedef enum { STATE_COLLECTING_DATA, STATE_COLLECTING_VAR_LEN_DATA, STATE_READING_DATA, + STATE_READING_SFDP, } CMDState; typedef enum { @@ -679,6 +692,8 @@ static inline int get_addr_length(Flash *s) } switch (s->cmd_in_progress) { + case RDSFDP: + return 3; case PP4: case PP4_4: case QPP_4: @@ -823,6 +838,11 @@ static void complete_collecting_data(Flash *s) " by device\n"); } break; + + case RDSFDP: + s->state = STATE_READING_SFDP; + break; + default: break; } @@ -1431,6 +1451,16 @@ static void decode_new_cmd(Flash *s, uint32_t value) qemu_log_mask(LOG_GUEST_ERROR, "M25P80: Unknown cmd %x\n", value); } break; + case RDSFDP: + if (s->pi->sfdp_read) { + s->needed_bytes = get_addr_length(s) + 1; /* SFDP addr + dummy */ + s->pos = 0; + s->len = 0; + s->state = STATE_COLLECTING_DATA; + break; + } + /* Fallthrough */ + default: s->pos = 0; s->len = 1; @@ -1538,6 +1568,12 @@ static uint32_t m25p80_transfer8(SSIPeripheral *ss, uint32_t tx) } } break; + case STATE_READING_SFDP: + assert(s->pi->sfdp_read); + r = s->pi->sfdp_read(s->cur_addr); + trace_m25p80_read_sfdp(s, s->cur_addr, (uint8_t)r); + s->cur_addr = (s->cur_addr + 1) & (M25P80_SFDP_MAX_SIZE - 1); + break; default: case STATE_IDLE: diff --git a/hw/block/m25p80_sfdp.c b/hw/block/m25p80_sfdp.c new file mode 100644 index 0000000000..77615fa29e --- /dev/null +++ b/hw/block/m25p80_sfdp.c @@ -0,0 +1,332 @@ +/* + * M25P80 Serial Flash Discoverable Parameter (SFDP) + * + * Copyright (c) 2020, IBM Corporation. + * + * This code is licensed under the GPL version 2 or later. See the + * COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/host-utils.h" +#include "m25p80_sfdp.h" + +#define define_sfdp_read(model) \ + uint8_t m25p80_sfdp_##model(uint32_t addr) \ + { \ + assert(is_power_of_2(sizeof(sfdp_##model))); \ + return sfdp_##model[addr & (sizeof(sfdp_##model) - 1)]; \ + } + +/* + * Micron + */ +static const uint8_t sfdp_n25q256a[] = { + 0x53, 0x46, 0x44, 0x50, 0x00, 0x01, 0x00, 0xff, + 0x00, 0x00, 0x01, 0x09, 0x30, 0x00, 0x00, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xe5, 0x20, 0xfb, 0xff, 0xff, 0xff, 0xff, 0x0f, + 0x29, 0xeb, 0x27, 0x6b, 0x08, 0x3b, 0x27, 0xbb, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x27, 0xbb, + 0xff, 0xff, 0x29, 0xeb, 0x0c, 0x20, 0x10, 0xd8, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +}; +define_sfdp_read(n25q256a); + + +/* + * Matronix + */ + +/* mx25l25635e. No 4B opcodes */ +static const uint8_t sfdp_mx25l25635e[] = { + 0x53, 0x46, 0x44, 0x50, 0x00, 0x01, 0x01, 0xff, + 0x00, 0x00, 0x01, 0x09, 0x30, 0x00, 0x00, 0xff, + 0xc2, 0x00, 0x01, 0x04, 0x60, 0x00, 0x00, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xe5, 0x20, 0xf3, 0xff, 0xff, 0xff, 0xff, 0x0f, + 0x44, 0xeb, 0x08, 0x6b, 0x08, 0x3b, 0x04, 0xbb, + 0xee, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0xff, + 0xff, 0xff, 0x00, 0xff, 0x0c, 0x20, 0x0f, 0x52, + 0x10, 0xd8, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x36, 0x00, 0x27, 0xf7, 0x4f, 0xff, 0xff, + 0xd9, 0xc8, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +}; +define_sfdp_read(mx25l25635e) + +static const uint8_t sfdp_mx25l25635f[] = { + 0x53, 0x46, 0x44, 0x50, 0x00, 0x01, 0x01, 0xff, + 0x00, 0x00, 0x01, 0x09, 0x30, 0x00, 0x00, 0xff, + 0xc2, 0x00, 0x01, 0x04, 0x60, 0x00, 0x00, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xe5, 0x20, 0xf3, 0xff, 0xff, 0xff, 0xff, 0x0f, + 0x44, 0xeb, 0x08, 0x6b, 0x08, 0x3b, 0x04, 0xbb, + 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0xff, + 0xff, 0xff, 0x44, 0xeb, 0x0c, 0x20, 0x0f, 0x52, + 0x10, 0xd8, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x36, 0x00, 0x27, 0x9d, 0xf9, 0xc0, 0x64, + 0x85, 0xcb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xc2, 0xf5, 0x08, 0x0a, + 0x08, 0x04, 0x03, 0x06, 0x00, 0x00, 0x07, 0x29, + 0x17, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +}; +define_sfdp_read(mx25l25635f); + +static const uint8_t sfdp_mx66l1g45g[] = { + 0x53, 0x46, 0x44, 0x50, 0x06, 0x01, 0x02, 0xff, + 0x00, 0x06, 0x01, 0x10, 0x30, 0x00, 0x00, 0xff, + 0xc2, 0x00, 0x01, 0x04, 0x10, 0x01, 0x00, 0xff, + 0x84, 0x00, 0x01, 0x02, 0xc0, 0x00, 0x00, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xe5, 0x20, 0xfb, 0xff, 0xff, 0xff, 0xff, 0x3f, + 0x44, 0xeb, 0x08, 0x6b, 0x08, 0x3b, 0x04, 0xbb, + 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0xff, + 0xff, 0xff, 0x44, 0xeb, 0x0c, 0x20, 0x0f, 0x52, + 0x10, 0xd8, 0x00, 0xff, 0xd6, 0x49, 0xc5, 0x00, + 0x85, 0xdf, 0x04, 0xe3, 0x44, 0x03, 0x67, 0x38, + 0x30, 0xb0, 0x30, 0xb0, 0xf7, 0xbd, 0xd5, 0x5c, + 0x4a, 0x9e, 0x29, 0xff, 0xf0, 0x50, 0xf9, 0x85, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x7f, 0xef, 0xff, 0xff, 0x21, 0x5c, 0xdc, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x36, 0x00, 0x27, 0x9d, 0xf9, 0xc0, 0x64, + 0x85, 0xcb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xc2, 0xf5, 0x08, 0x00, 0x0c, 0x04, 0x08, 0x08, + 0x01, 0x00, 0x19, 0x0f, 0x01, 0x01, 0x06, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +}; +define_sfdp_read(mx66l1g45g); + +/* + * Windbond + */ + +static const uint8_t sfdp_w25q256[] = { + 0x53, 0x46, 0x44, 0x50, 0x00, 0x01, 0x00, 0xff, + 0x00, 0x00, 0x01, 0x09, 0x80, 0x00, 0x00, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xe5, 0x20, 0xf3, 0xff, 0xff, 0xff, 0xff, 0x0f, + 0x44, 0xeb, 0x08, 0x6b, 0x08, 0x3b, 0x42, 0xbb, + 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, + 0xff, 0xff, 0x21, 0xeb, 0x0c, 0x20, 0x0f, 0x52, + 0x10, 0xd8, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +}; +define_sfdp_read(w25q256); + +static const uint8_t sfdp_w25q512jv[] = { + 0x53, 0x46, 0x44, 0x50, 0x06, 0x01, 0x01, 0xff, + 0x00, 0x06, 0x01, 0x10, 0x80, 0x00, 0x00, 0xff, + 0x84, 0x00, 0x01, 0x02, 0xd0, 0x00, 0x00, 0xff, + 0x03, 0x00, 0x01, 0x02, 0xf0, 0x00, 0x00, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xe5, 0x20, 0xfb, 0xff, 0xff, 0xff, 0xff, 0x1f, + 0x44, 0xeb, 0x08, 0x6b, 0x08, 0x3b, 0x42, 0xbb, + 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, + 0xff, 0xff, 0x40, 0xeb, 0x0c, 0x20, 0x0f, 0x52, + 0x10, 0xd8, 0x00, 0x00, 0x36, 0x02, 0xa6, 0x00, + 0x82, 0xea, 0x14, 0xe2, 0xe9, 0x63, 0x76, 0x33, + 0x7a, 0x75, 0x7a, 0x75, 0xf7, 0xa2, 0xd5, 0x5c, + 0x19, 0xf7, 0x4d, 0xff, 0xe9, 0x70, 0xf9, 0xa5, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x0a, 0xf0, 0xff, 0x21, 0xff, 0xdc, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +}; +define_sfdp_read(w25q512jv); + +static const uint8_t sfdp_w25q01jvq[] = { + 0x53, 0x46, 0x44, 0x50, 0x06, 0x01, 0x01, 0xff, + 0x00, 0x06, 0x01, 0x10, 0x80, 0x00, 0x00, 0xff, + 0x84, 0x00, 0x01, 0x02, 0xd0, 0x00, 0x00, 0xff, + 0x03, 0x00, 0x01, 0x02, 0xf0, 0x00, 0x00, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xe5, 0x20, 0xfb, 0xff, 0xff, 0xff, 0xff, 0x3f, + 0x44, 0xeb, 0x08, 0x6b, 0x08, 0x3b, 0x42, 0xbb, + 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, + 0xff, 0xff, 0x40, 0xeb, 0x0c, 0x20, 0x0f, 0x52, + 0x10, 0xd8, 0x00, 0x00, 0x36, 0x02, 0xa6, 0x00, + 0x82, 0xea, 0x14, 0xe2, 0xe9, 0x63, 0x76, 0x33, + 0x7a, 0x75, 0x7a, 0x75, 0xf7, 0xa2, 0xd5, 0x5c, + 0x19, 0xf7, 0x4d, 0xff, 0xe9, 0x70, 0xf9, 0xa5, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x0a, 0xf0, 0xff, 0x21, 0xff, 0xdc, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +}; +define_sfdp_read(w25q01jvq); diff --git a/hw/block/m25p80_sfdp.h b/hw/block/m25p80_sfdp.h new file mode 100644 index 0000000000..df7adfb5ce --- /dev/null +++ b/hw/block/m25p80_sfdp.h @@ -0,0 +1,29 @@ +/* + * M25P80 SFDP + * + * Copyright (c) 2020, IBM Corporation. + * + * This code is licensed under the GPL version 2 or later. See the + * COPYING file in the top-level directory. + */ + +#ifndef HW_M25P80_SFDP_H +#define HW_M25P80_SFDP_H + +/* + * SFDP area has a 3 bytes address space. + */ +#define M25P80_SFDP_MAX_SIZE (1 << 24) + +uint8_t m25p80_sfdp_n25q256a(uint32_t addr); + +uint8_t m25p80_sfdp_mx25l25635e(uint32_t addr); +uint8_t m25p80_sfdp_mx25l25635f(uint32_t addr); +uint8_t m25p80_sfdp_mx66l1g45g(uint32_t addr); + +uint8_t m25p80_sfdp_w25q256(uint32_t addr); +uint8_t m25p80_sfdp_w25q512jv(uint32_t addr); + +uint8_t m25p80_sfdp_w25q01jvq(uint32_t addr); + +#endif diff --git a/hw/block/meson.build b/hw/block/meson.build index 1908abd45c..b434d5654c 100644 --- a/hw/block/meson.build +++ b/hw/block/meson.build @@ -12,6 +12,7 @@ softmmu_ss.add(when: 'CONFIG_ONENAND', if_true: files('onenand.c')) softmmu_ss.add(when: 'CONFIG_PFLASH_CFI01', if_true: files('pflash_cfi01.c')) softmmu_ss.add(when: 'CONFIG_PFLASH_CFI02', if_true: files('pflash_cfi02.c')) softmmu_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80.c')) +softmmu_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80_sfdp.c')) softmmu_ss.add(when: 'CONFIG_SWIM', if_true: files('swim.c')) softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c')) softmmu_ss.add(when: 'CONFIG_TC58128', if_true: files('tc58128.c')) diff --git a/hw/block/trace-events b/hw/block/trace-events index d86b53520c..2c45a62bd5 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -80,5 +80,6 @@ m25p80_page_program(void *s, uint32_t addr, uint8_t tx) "[%p] page program cur_a m25p80_transfer(void *s, uint8_t state, uint32_t len, uint8_t needed, uint32_t pos, uint32_t cur_addr, uint8_t t) "[%p] Transfer state 0x%"PRIx8" len 0x%"PRIx32" needed 0x%"PRIx8" pos 0x%"PRIx32" addr 0x%"PRIx32" tx 0x%"PRIx8 m25p80_read_byte(void *s, uint32_t addr, uint8_t v) "[%p] Read byte 0x%"PRIx32"=0x%"PRIx8 m25p80_read_data(void *s, uint32_t pos, uint8_t v) "[%p] Read data 0x%"PRIx32"=0x%"PRIx8 +m25p80_read_sfdp(void *s, uint32_t addr, uint8_t v) "[%p] Read SFDP 0x%"PRIx32"=0x%"PRIx8 m25p80_binding(void *s) "[%p] Binding to IF_MTD drive" m25p80_binding_no_bdrv(void *s) "[%p] No BDRV - binding to RAM" diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 8131ec2dbc..f717550fdc 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -21,6 +21,7 @@ #include "hw/block/block.h" #include "hw/qdev-properties.h" #include "sysemu/blockdev.h" +#include "sysemu/block-ram-registrar.h" #include "sysemu/sysemu.h" #include "sysemu/runstate.h" #include "hw/virtio/virtio-blk.h" @@ -362,12 +363,14 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req) } } -static inline void submit_requests(BlockBackend *blk, MultiReqBuffer *mrb, +static inline void submit_requests(VirtIOBlock *s, MultiReqBuffer *mrb, int start, int num_reqs, int niov) { + BlockBackend *blk = s->blk; QEMUIOVector *qiov = &mrb->reqs[start]->qiov; int64_t sector_num = mrb->reqs[start]->sector_num; bool is_write = mrb->is_write; + BdrvRequestFlags flags = 0; if (num_reqs > 1) { int i; @@ -398,12 +401,18 @@ static inline void submit_requests(BlockBackend *blk, MultiReqBuffer *mrb, num_reqs - 1); } + if (blk_ram_registrar_ok(&s->blk_ram_registrar)) { + flags |= BDRV_REQ_REGISTERED_BUF; + } + if (is_write) { - blk_aio_pwritev(blk, sector_num << BDRV_SECTOR_BITS, qiov, 0, - virtio_blk_rw_complete, mrb->reqs[start]); + blk_aio_pwritev(blk, sector_num << BDRV_SECTOR_BITS, qiov, + flags, virtio_blk_rw_complete, + mrb->reqs[start]); } else { - blk_aio_preadv(blk, sector_num << BDRV_SECTOR_BITS, qiov, 0, - virtio_blk_rw_complete, mrb->reqs[start]); + blk_aio_preadv(blk, sector_num << BDRV_SECTOR_BITS, qiov, + flags, virtio_blk_rw_complete, + mrb->reqs[start]); } } @@ -425,14 +434,14 @@ static int multireq_compare(const void *a, const void *b) } } -static void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb) +static void virtio_blk_submit_multireq(VirtIOBlock *s, MultiReqBuffer *mrb) { int i = 0, start = 0, num_reqs = 0, niov = 0, nb_sectors = 0; uint32_t max_transfer; int64_t sector_num = 0; if (mrb->num_reqs == 1) { - submit_requests(blk, mrb, 0, 1, -1); + submit_requests(s, mrb, 0, 1, -1); mrb->num_reqs = 0; return; } @@ -452,11 +461,11 @@ static void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb) * 3. merge would exceed maximum transfer length of backend device */ if (sector_num + nb_sectors != req->sector_num || - niov > blk_get_max_iov(blk) - req->qiov.niov || + niov > blk_get_max_iov(s->blk) - req->qiov.niov || req->qiov.size > max_transfer || nb_sectors > (max_transfer - req->qiov.size) / BDRV_SECTOR_SIZE) { - submit_requests(blk, mrb, start, num_reqs, niov); + submit_requests(s, mrb, start, num_reqs, niov); num_reqs = 0; } } @@ -472,7 +481,7 @@ static void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb) num_reqs++; } - submit_requests(blk, mrb, start, num_reqs, niov); + submit_requests(s, mrb, start, num_reqs, niov); mrb->num_reqs = 0; } @@ -487,7 +496,7 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb) * Make sure all outstanding writes are posted to the backing device. */ if (mrb->is_write && mrb->num_reqs > 0) { - virtio_blk_submit_multireq(s->blk, mrb); + virtio_blk_submit_multireq(s, mrb); } blk_aio_flush(s->blk, virtio_blk_flush_complete, req); } @@ -667,7 +676,7 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb) if (mrb->num_reqs > 0 && (mrb->num_reqs == VIRTIO_BLK_MAX_MERGE_REQS || is_write != mrb->is_write || !s->conf.request_merging)) { - virtio_blk_submit_multireq(s->blk, mrb); + virtio_blk_submit_multireq(s, mrb); } assert(mrb->num_reqs < VIRTIO_BLK_MAX_MERGE_REQS); @@ -774,7 +783,7 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) } while (!virtio_queue_empty(vq)); if (mrb.num_reqs) { - virtio_blk_submit_multireq(s->blk, &mrb); + virtio_blk_submit_multireq(s, &mrb); } blk_io_unplug(s->blk); @@ -823,7 +832,7 @@ void virtio_blk_process_queued_requests(VirtIOBlock *s, bool is_bh) } if (mrb.num_reqs) { - virtio_blk_submit_multireq(s->blk, &mrb); + virtio_blk_submit_multireq(s, &mrb); } if (is_bh) { blk_dec_in_flight(s->conf.conf.blk); @@ -1205,6 +1214,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) } s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s); + blk_ram_registrar_init(&s->blk_ram_registrar, s->blk); blk_set_dev_ops(s->blk, &virtio_block_ops, s); blk_iostatus_enable(s->blk); @@ -1230,6 +1240,7 @@ static void virtio_blk_device_unrealize(DeviceState *dev) virtio_del_queue(vdev, i); } qemu_coroutine_dec_pool_size(conf->num_queues * conf->queue_size / 2); + blk_ram_registrar_destroy(&s->blk_ram_registrar); qemu_del_vm_change_state_handler(s->change); blockdev_mark_auto_del(s->blk); virtio_cleanup(vdev); diff --git a/hw/char/exynos4210_uart.c b/hw/char/exynos4210_uart.c index addcd59b02..7b7c56b6ef 100644 --- a/hw/char/exynos4210_uart.c +++ b/hw/char/exynos4210_uart.c @@ -211,7 +211,7 @@ static void fifo_reset(Exynos4210UartFIFO *q) g_free(q->data); q->data = NULL; - q->data = (uint8_t *)g_malloc0(q->size); + q->data = g_malloc0(q->size); q->sp = 0; q->rp = 0; diff --git a/hw/core/numa.c b/hw/core/numa.c index 26d8e5f616..ea24a5fa8c 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -822,6 +822,19 @@ static int ram_block_notify_add_single(RAMBlock *rb, void *opaque) return 0; } +static int ram_block_notify_remove_single(RAMBlock *rb, void *opaque) +{ + const ram_addr_t max_size = qemu_ram_get_max_length(rb); + const ram_addr_t size = qemu_ram_get_used_length(rb); + void *host = qemu_ram_get_host_addr(rb); + RAMBlockNotifier *notifier = opaque; + + if (host) { + notifier->ram_block_removed(notifier, host, size, max_size); + } + return 0; +} + void ram_block_notifier_add(RAMBlockNotifier *n) { QLIST_INSERT_HEAD(&ram_list.ramblock_notifiers, n, next); @@ -835,13 +848,18 @@ void ram_block_notifier_add(RAMBlockNotifier *n) void ram_block_notifier_remove(RAMBlockNotifier *n) { QLIST_REMOVE(n, next); + + if (n->ram_block_removed) { + qemu_ram_foreach_block(ram_block_notify_remove_single, n); + } } void ram_block_notify_add(void *host, size_t size, size_t max_size) { RAMBlockNotifier *notifier; + RAMBlockNotifier *next; - QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) { + QLIST_FOREACH_SAFE(notifier, &ram_list.ramblock_notifiers, next, next) { if (notifier->ram_block_added) { notifier->ram_block_added(notifier, host, size, max_size); } @@ -851,8 +869,9 @@ void ram_block_notify_add(void *host, size_t size, size_t max_size) void ram_block_notify_remove(void *host, size_t size, size_t max_size) { RAMBlockNotifier *notifier; + RAMBlockNotifier *next; - QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) { + QLIST_FOREACH_SAFE(notifier, &ram_list.ramblock_notifiers, next, next) { if (notifier->ram_block_removed) { notifier->ram_block_removed(notifier, host, size, max_size); } @@ -862,8 +881,9 @@ void ram_block_notify_remove(void *host, size_t size, size_t max_size) void ram_block_notify_resize(void *host, size_t old_size, size_t new_size) { RAMBlockNotifier *notifier; + RAMBlockNotifier *next; - QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) { + QLIST_FOREACH_SAFE(notifier, &ram_list.ramblock_notifiers, next, next) { if (notifier->ram_block_resized) { notifier->ram_block_resized(notifier, host, old_size, new_size); } diff --git a/hw/core/reset.c b/hw/core/reset.c index 36be82c491..d3263b613e 100644 --- a/hw/core/reset.c +++ b/hw/core/reset.c @@ -33,6 +33,7 @@ typedef struct QEMUResetEntry { QTAILQ_ENTRY(QEMUResetEntry) entry; QEMUResetHandler *func; void *opaque; + bool skip_on_snapshot_load; } QEMUResetEntry; static QTAILQ_HEAD(, QEMUResetEntry) reset_handlers = @@ -47,6 +48,16 @@ void qemu_register_reset(QEMUResetHandler *func, void *opaque) QTAILQ_INSERT_TAIL(&reset_handlers, re, entry); } +void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque) +{ + QEMUResetEntry *re = g_new0(QEMUResetEntry, 1); + + re->func = func; + re->opaque = opaque; + re->skip_on_snapshot_load = true; + QTAILQ_INSERT_TAIL(&reset_handlers, re, entry); +} + void qemu_unregister_reset(QEMUResetHandler *func, void *opaque) { QEMUResetEntry *re; @@ -60,12 +71,16 @@ void qemu_unregister_reset(QEMUResetHandler *func, void *opaque) } } -void qemu_devices_reset(void) +void qemu_devices_reset(ShutdownCause reason) { QEMUResetEntry *re, *nre; /* reset all devices */ QTAILQ_FOREACH_SAFE(re, &reset_handlers, entry, nre) { + if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD && + re->skip_on_snapshot_load) { + continue; + } re->func(re->opaque); } } diff --git a/hw/core/resettable.c b/hw/core/resettable.c index 96a99ce39e..c3df75c6ba 100644 --- a/hw/core/resettable.c +++ b/hw/core/resettable.c @@ -201,12 +201,11 @@ static void resettable_phase_exit(Object *obj, void *opaque, ResetType type) resettable_child_foreach(rc, obj, resettable_phase_exit, NULL, type); assert(s->count > 0); - if (s->count == 1) { + if (--s->count == 0) { trace_resettable_phase_exit_exec(obj, obj_typename, !!rc->phases.exit); if (rc->phases.exit && !resettable_get_tr_func(rc, obj)) { rc->phases.exit(obj); } - s->count = 0; } s->exit_phase_in_progress = false; trace_resettable_phase_exit_end(obj, obj_typename, s->count); diff --git a/hw/core/sysbus-fdt.c b/hw/core/sysbus-fdt.c index edb0c49b19..eebcd28f9a 100644 --- a/hw/core/sysbus-fdt.c +++ b/hw/core/sysbus-fdt.c @@ -299,7 +299,8 @@ static int add_amd_xgbe_fdt_node(SysBusDevice *sbdev, void *opaque) void *guest_fdt = data->fdt, *host_fdt; const void *r; int i, prop_len; - uint32_t *irq_attr, *reg_attr, *host_clock_phandles; + uint32_t *irq_attr, *reg_attr; + const uint32_t *host_clock_phandles; uint64_t mmio_base, irq_number; uint32_t guest_clock_phandles[2]; @@ -339,7 +340,7 @@ static int add_amd_xgbe_fdt_node(SysBusDevice *sbdev, void *opaque) error_report("%s clocks property should contain 2 handles", __func__); exit(1); } - host_clock_phandles = (uint32_t *)r; + host_clock_phandles = r; guest_clock_phandles[0] = qemu_fdt_alloc_phandle(guest_fdt); guest_clock_phandles[1] = qemu_fdt_alloc_phandle(guest_fdt); diff --git a/hw/display/blizzard.c b/hw/display/blizzard.c index 105241577d..ebe230dd0a 100644 --- a/hw/display/blizzard.c +++ b/hw/display/blizzard.c @@ -1007,7 +1007,7 @@ static const GraphicHwOps blizzard_ops = { void *s1d13745_init(qemu_irq gpio_int) { - BlizzardState *s = (BlizzardState *) g_malloc0(sizeof(*s)); + BlizzardState *s = g_malloc0(sizeof(*s)); DisplaySurface *surface; s->fb = g_malloc(0x180000); diff --git a/hw/dma/pl330.c b/hw/dma/pl330.c index 08e5938ec7..e5d521c329 100644 --- a/hw/dma/pl330.c +++ b/hw/dma/pl330.c @@ -1328,7 +1328,7 @@ static void pl330_debug_exec(PL330State *s) } if (!insn) { pl330_fault(ch, PL330_FAULT_UNDEF_INSTR | PL330_FAULT_DBG_INSTR); - return ; + return; } ch->stall = 0; insn->exec(ch, opcode, args, insn->size - 1); diff --git a/hw/hppa/machine.c b/hw/hppa/machine.c index e53d5f0fa7..19ea7c2c66 100644 --- a/hw/hppa/machine.c +++ b/hw/hppa/machine.c @@ -411,12 +411,12 @@ static void machine_hppa_init(MachineState *machine) cpu[0]->env.gr[19] = FW_CFG_IO_BASE; } -static void hppa_machine_reset(MachineState *ms) +static void hppa_machine_reset(MachineState *ms, ShutdownCause reason) { unsigned int smp_cpus = ms->smp.cpus; int i; - qemu_devices_reset(); + qemu_devices_reset(reason); /* Start all CPUs at the firmware entry point. * Monarch CPU will initialize firmware, secondary CPUs diff --git a/hw/hyperv/hyperv.c b/hw/hyperv/hyperv.c index 4a1b59cb9d..57b402b956 100644 --- a/hw/hyperv/hyperv.c +++ b/hw/hyperv/hyperv.c @@ -157,7 +157,7 @@ void hyperv_synic_reset(CPUState *cs) SynICState *synic = get_synic(cs); if (synic) { - device_legacy_reset(DEVICE(synic)); + device_cold_reset(DEVICE(synic)); } } diff --git a/hw/i2c/aspeed_i2c.c b/hw/i2c/aspeed_i2c.c index 42c6d69b82..c166fd20fa 100644 --- a/hw/i2c/aspeed_i2c.c +++ b/hw/i2c/aspeed_i2c.c @@ -1131,7 +1131,9 @@ static int aspeed_i2c_bus_slave_event(I2CSlave *slave, enum i2c_event event) AspeedI2CBus *bus = ASPEED_I2C_BUS(qbus->parent); uint32_t reg_intr_sts = aspeed_i2c_bus_intr_sts_offset(bus); uint32_t reg_byte_buf = aspeed_i2c_bus_byte_buf_offset(bus); - uint32_t value; + uint32_t reg_dev_addr = aspeed_i2c_bus_dev_addr_offset(bus); + uint32_t dev_addr = SHARED_ARRAY_FIELD_EX32(bus->regs, reg_dev_addr, + SLAVE_DEV_ADDR1); if (aspeed_i2c_is_new_mode(bus->controller)) { return aspeed_i2c_bus_new_slave_event(bus, event); @@ -1139,8 +1141,8 @@ static int aspeed_i2c_bus_slave_event(I2CSlave *slave, enum i2c_event event) switch (event) { case I2C_START_SEND_ASYNC: - value = SHARED_ARRAY_FIELD_EX32(bus->regs, reg_byte_buf, TX_BUF); - SHARED_ARRAY_FIELD_DP32(bus->regs, reg_byte_buf, RX_BUF, value << 1); + /* Bit[0] == 0 indicates "send". */ + SHARED_ARRAY_FIELD_DP32(bus->regs, reg_byte_buf, RX_BUF, dev_addr << 1); ARRAY_FIELD_DP32(bus->regs, I2CD_INTR_STS, SLAVE_ADDR_RX_MATCH, 1); SHARED_ARRAY_FIELD_DP32(bus->regs, reg_intr_sts, RX_DONE, 1); diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c index 52f9aa9d8c..ffd1884100 100644 --- a/hw/i386/microvm.c +++ b/hw/i386/microvm.c @@ -467,7 +467,7 @@ static void microvm_machine_state_init(MachineState *machine) microvm_devices_init(mms); } -static void microvm_machine_reset(MachineState *machine) +static void microvm_machine_reset(MachineState *machine, ShutdownCause reason) { MicrovmMachineState *mms = MICROVM_MACHINE(machine); CPUState *cs; @@ -480,7 +480,7 @@ static void microvm_machine_reset(MachineState *machine) mms->kernel_cmdline_fixed = true; } - qemu_devices_reset(); + qemu_devices_reset(reason); CPU_FOREACH(cs) { cpu = X86_CPU(cs); diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 768982ae9a..3e86083db3 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1847,12 +1847,12 @@ static void pc_machine_initfn(Object *obj) cxl_machine_init(obj, &pcms->cxl_devices_state); } -static void pc_machine_reset(MachineState *machine) +static void pc_machine_reset(MachineState *machine, ShutdownCause reason) { CPUState *cs; X86CPU *cpu; - qemu_devices_reset(); + qemu_devices_reset(reason); /* Reset APIC after devices have been reset to cancel * any changes that qemu_devices_reset() might have done. @@ -1867,7 +1867,7 @@ static void pc_machine_reset(MachineState *machine) static void pc_machine_wakeup(MachineState *machine) { cpu_synchronize_all_states(); - pc_machine_reset(machine); + pc_machine_reset(machine, SHUTDOWN_CAUSE_NONE); cpu_synchronize_all_post_reset(); } diff --git a/hw/i386/x86.c b/hw/i386/x86.c index 1148f70c03..bd50a064a3 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -1111,7 +1111,7 @@ void x86_load_linux(X86MachineState *x86ms, setup_data->type = cpu_to_le32(SETUP_RNG_SEED); setup_data->len = cpu_to_le32(RNG_SEED_LENGTH); qemu_guest_getrandom_nofail(setup_data->data, RNG_SEED_LENGTH); - qemu_register_reset(reset_rng_seed, setup_data); + qemu_register_reset_nosnapshotload(reset_rng_seed, setup_data); fw_cfg_add_bytes_callback(fw_cfg, FW_CFG_KERNEL_DATA, reset_rng_seed, NULL, setup_data, kernel, kernel_size, true); } else { diff --git a/hw/m68k/q800.c b/hw/m68k/q800.c index e09e244ddc..9d52ca6613 100644 --- a/hw/m68k/q800.c +++ b/hw/m68k/q800.c @@ -321,27 +321,23 @@ static const TypeInfo glue_info = { }, }; -typedef struct { - M68kCPU *cpu; - struct bi_record *rng_seed; -} ResetInfo; - static void main_cpu_reset(void *opaque) { - ResetInfo *reset_info = opaque; - M68kCPU *cpu = reset_info->cpu; + M68kCPU *cpu = opaque; CPUState *cs = CPU(cpu); - if (reset_info->rng_seed) { - qemu_guest_getrandom_nofail((void *)reset_info->rng_seed->data + 2, - be16_to_cpu(*(uint16_t *)reset_info->rng_seed->data)); - } - cpu_reset(cs); cpu->env.aregs[7] = ldl_phys(cs->as, 0); cpu->env.pc = ldl_phys(cs->as, 4); } +static void rerandomize_rng_seed(void *opaque) +{ + struct bi_record *rng_seed = opaque; + qemu_guest_getrandom_nofail((void *)rng_seed->data + 2, + be16_to_cpu(*(uint16_t *)rng_seed->data)); +} + static uint8_t fake_mac_rom[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -397,7 +393,6 @@ static void q800_init(MachineState *machine) NubusBus *nubus; DeviceState *glue; DriveInfo *dinfo; - ResetInfo *reset_info; uint8_t rng_seed[32]; linux_boot = (kernel_filename != NULL); @@ -408,12 +403,9 @@ static void q800_init(MachineState *machine) exit(1); } - reset_info = g_new0(ResetInfo, 1); - /* init CPUs */ cpu = M68K_CPU(cpu_create(machine->cpu_type)); - reset_info->cpu = cpu; - qemu_register_reset(main_cpu_reset, reset_info); + qemu_register_reset(main_cpu_reset, cpu); /* RAM */ memory_region_add_subregion(get_system_memory(), 0, machine->ram); @@ -687,9 +679,10 @@ static void q800_init(MachineState *machine) BOOTINFO0(param_ptr, BI_LAST); rom_add_blob_fixed_as("bootinfo", param_blob, param_ptr - param_blob, parameters_base, cs->as); - reset_info->rng_seed = rom_ptr_for_as(cs->as, parameters_base, - param_ptr - param_blob) + - (param_rng_seed - param_blob); + qemu_register_reset_nosnapshotload(rerandomize_rng_seed, + rom_ptr_for_as(cs->as, parameters_base, + param_ptr - param_blob) + + (param_rng_seed - param_blob)); g_free(param_blob); } else { uint8_t *ptr; diff --git a/hw/m68k/virt.c b/hw/m68k/virt.c index 89c4108eb5..da5eafd275 100644 --- a/hw/m68k/virt.c +++ b/hw/m68k/virt.c @@ -89,7 +89,6 @@ typedef struct { M68kCPU *cpu; hwaddr initial_pc; hwaddr initial_stack; - struct bi_record *rng_seed; } ResetInfo; static void main_cpu_reset(void *opaque) @@ -98,16 +97,18 @@ static void main_cpu_reset(void *opaque) M68kCPU *cpu = reset_info->cpu; CPUState *cs = CPU(cpu); - if (reset_info->rng_seed) { - qemu_guest_getrandom_nofail((void *)reset_info->rng_seed->data + 2, - be16_to_cpu(*(uint16_t *)reset_info->rng_seed->data)); - } - cpu_reset(cs); cpu->env.aregs[7] = reset_info->initial_stack; cpu->env.pc = reset_info->initial_pc; } +static void rerandomize_rng_seed(void *opaque) +{ + struct bi_record *rng_seed = opaque; + qemu_guest_getrandom_nofail((void *)rng_seed->data + 2, + be16_to_cpu(*(uint16_t *)rng_seed->data)); +} + static void virt_init(MachineState *machine) { M68kCPU *cpu = NULL; @@ -289,9 +290,10 @@ static void virt_init(MachineState *machine) BOOTINFO0(param_ptr, BI_LAST); rom_add_blob_fixed_as("bootinfo", param_blob, param_ptr - param_blob, parameters_base, cs->as); - reset_info->rng_seed = rom_ptr_for_as(cs->as, parameters_base, - param_ptr - param_blob) + - (param_rng_seed - param_blob); + qemu_register_reset_nosnapshotload(rerandomize_rng_seed, + rom_ptr_for_as(cs->as, parameters_base, + param_ptr - param_blob) + + (param_rng_seed - param_blob)); g_free(param_blob); } } diff --git a/hw/mips/boston.c b/hw/mips/boston.c index d2ab9da1a0..cab63f43bf 100644 --- a/hw/mips/boston.c +++ b/hw/mips/boston.c @@ -41,6 +41,7 @@ #include "sysemu/sysemu.h" #include "sysemu/qtest.h" #include "sysemu/runstate.h" +#include "sysemu/reset.h" #include <libfdt.h> #include "qom/object.h" @@ -810,6 +811,8 @@ static void boston_mach_init(MachineState *machine) /* Calculate real fdt size after filter */ dt_size = fdt_totalsize(dtb_load_data); rom_add_blob_fixed("dtb", dtb_load_data, dt_size, dtb_paddr); + qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds, + rom_ptr(dtb_paddr, dt_size)); } else { /* Try to load file as FIT */ fit_err = load_fit(&boston_fit_loader, machine->kernel_filename, s); diff --git a/hw/mips/malta.c b/hw/mips/malta.c index 0e932988e0..7c3ad0974b 100644 --- a/hw/mips/malta.c +++ b/hw/mips/malta.c @@ -26,6 +26,7 @@ #include "qemu/units.h" #include "qemu/bitops.h" #include "qemu/datadir.h" +#include "qemu/guest-random.h" #include "hw/clock.h" #include "hw/southbridge/piix.h" #include "hw/isa/superio.h" @@ -1017,6 +1018,17 @@ static void G_GNUC_PRINTF(3, 4) prom_set(uint32_t *prom_buf, int index, va_end(ap); } +static void reinitialize_rng_seed(void *opaque) +{ + char *rng_seed_hex = opaque; + uint8_t rng_seed[32]; + + qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed)); + for (size_t i = 0; i < sizeof(rng_seed); ++i) { + sprintf(rng_seed_hex + i * 2, "%02x", rng_seed[i]); + } +} + /* Kernel */ static uint64_t load_kernel(void) { @@ -1028,6 +1040,9 @@ static uint64_t load_kernel(void) long prom_size; int prom_index = 0; uint64_t (*xlate_to_kseg0) (void *opaque, uint64_t addr); + uint8_t rng_seed[32]; + char rng_seed_hex[sizeof(rng_seed) * 2 + 1]; + size_t rng_seed_prom_offset; #if TARGET_BIG_ENDIAN big_endian = 1; @@ -1115,9 +1130,21 @@ static uint64_t load_kernel(void) prom_set(prom_buf, prom_index++, "modetty0"); prom_set(prom_buf, prom_index++, "38400n8r"); + + qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed)); + for (size_t i = 0; i < sizeof(rng_seed); ++i) { + sprintf(rng_seed_hex + i * 2, "%02x", rng_seed[i]); + } + prom_set(prom_buf, prom_index++, "rngseed"); + rng_seed_prom_offset = prom_index * ENVP_ENTRY_SIZE + + sizeof(uint32_t) * ENVP_NB_ENTRIES; + prom_set(prom_buf, prom_index++, "%s", rng_seed_hex); + prom_set(prom_buf, prom_index++, NULL); rom_add_blob_fixed("prom", prom_buf, prom_size, ENVP_PADDR); + qemu_register_reset_nosnapshotload(reinitialize_rng_seed, + rom_ptr(ENVP_PADDR, prom_size) + rng_seed_prom_offset); g_free(prom_buf); return kernel_entry; diff --git a/hw/misc/cbus.c b/hw/misc/cbus.c index 3c3721ad2d..653e8ddcd5 100644 --- a/hw/misc/cbus.c +++ b/hw/misc/cbus.c @@ -133,7 +133,7 @@ static void cbus_sel(void *opaque, int line, int level) CBus *cbus_init(qemu_irq dat) { - CBusPriv *s = (CBusPriv *) g_malloc0(sizeof(*s)); + CBusPriv *s = g_malloc0(sizeof(*s)); s->dat_out = dat; s->cbus.clk = qemu_allocate_irq(cbus_clk, s, 0); @@ -388,7 +388,7 @@ static void retu_io(void *opaque, int rw, int reg, uint16_t *val) void *retu_init(qemu_irq irq, int vilma) { - CBusRetu *s = (CBusRetu *) g_malloc0(sizeof(*s)); + CBusRetu *s = g_malloc0(sizeof(*s)); s->irq = irq; s->irqen = 0xffff; @@ -604,7 +604,7 @@ static void tahvo_io(void *opaque, int rw, int reg, uint16_t *val) void *tahvo_init(qemu_irq irq, int betty) { - CBusTahvo *s = (CBusTahvo *) g_malloc0(sizeof(*s)); + CBusTahvo *s = g_malloc0(sizeof(*s)); s->irq = irq; s->irqen = 0xffff; diff --git a/hw/net/can/can_sja1000.c b/hw/net/can/can_sja1000.c index e0f76d3eb3..73201f9139 100644 --- a/hw/net/can/can_sja1000.c +++ b/hw/net/can/can_sja1000.c @@ -431,7 +431,7 @@ void can_sja_mem_write(CanSJA1000State *s, hwaddr addr, uint64_t val, (unsigned long long)val, (unsigned int)addr); if (addr > CAN_SJA_MEM_SIZE) { - return ; + return; } if (s->clock & 0x80) { /* PeliCAN Mode */ diff --git a/hw/nvram/eeprom93xx.c b/hw/nvram/eeprom93xx.c index a1b9c78844..1081e2cc0d 100644 --- a/hw/nvram/eeprom93xx.c +++ b/hw/nvram/eeprom93xx.c @@ -315,7 +315,7 @@ eeprom_t *eeprom93xx_new(DeviceState *dev, uint16_t nwords) addrbits = 6; } - eeprom = (eeprom_t *)g_malloc0(sizeof(*eeprom) + nwords * 2); + eeprom = g_malloc0(sizeof(*eeprom) + nwords * 2); eeprom->size = nwords; eeprom->addrbits = addrbits; /* Output DO is tristate, read results in 1. */ diff --git a/hw/openrisc/boot.c b/hw/openrisc/boot.c index 128ccbcba2..007e80cd5a 100644 --- a/hw/openrisc/boot.c +++ b/hw/openrisc/boot.c @@ -14,6 +14,7 @@ #include "hw/openrisc/boot.h" #include "sysemu/device_tree.h" #include "sysemu/qtest.h" +#include "sysemu/reset.h" #include <libfdt.h> @@ -111,6 +112,8 @@ uint32_t openrisc_load_fdt(void *fdt, hwaddr load_start, rom_add_blob_fixed_as("fdt", fdt, fdtsize, fdt_addr, &address_space_memory); + qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds, + rom_ptr_for_as(&address_space_memory, fdt_addr, fdtsize)); return fdt_addr; } diff --git a/hw/ppc/pegasos2.c b/hw/ppc/pegasos2.c index ecf682b148..bb4d008ba9 100644 --- a/hw/ppc/pegasos2.c +++ b/hw/ppc/pegasos2.c @@ -248,14 +248,14 @@ static void pegasos2_pci_config_write(Pegasos2MachineState *pm, int bus, pegasos2_mv_reg_write(pm, pcicfg + 4, len, val); } -static void pegasos2_machine_reset(MachineState *machine) +static void pegasos2_machine_reset(MachineState *machine, ShutdownCause reason) { Pegasos2MachineState *pm = PEGASOS2_MACHINE(machine); void *fdt; uint64_t d[2]; int sz; - qemu_devices_reset(); + qemu_devices_reset(reason); if (!pm->vof) { return; /* Firmware should set up machine so nothing to do */ } diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c index 40bb573d1a..3d01e26f84 100644 --- a/hw/ppc/pnv.c +++ b/hw/ppc/pnv.c @@ -643,13 +643,13 @@ static void pnv_powerdown_notify(Notifier *n, void *opaque) } } -static void pnv_reset(MachineState *machine) +static void pnv_reset(MachineState *machine, ShutdownCause reason) { PnvMachineState *pnv = PNV_MACHINE(machine); IPMIBmc *bmc; void *fdt; - qemu_devices_reset(); + qemu_devices_reset(reason); /* * The machine should provide by default an internal BMC simulator. diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index f79ac85ca1..66b414d2e9 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1623,7 +1623,7 @@ void spapr_check_mmu_mode(bool guest_radix) } } -static void spapr_machine_reset(MachineState *machine) +static void spapr_machine_reset(MachineState *machine, ShutdownCause reason) { SpaprMachineState *spapr = SPAPR_MACHINE(machine); PowerPCCPU *first_ppc_cpu; @@ -1649,7 +1649,7 @@ static void spapr_machine_reset(MachineState *machine) spapr_setup_hpt(spapr); } - qemu_devices_reset(); + qemu_devices_reset(reason); spapr_ovec_cleanup(spapr->ov5_cas); spapr->ov5_cas = spapr_ovec_new(); diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c index e82bf27338..ebd351c840 100644 --- a/hw/riscv/boot.c +++ b/hw/riscv/boot.c @@ -30,6 +30,7 @@ #include "sysemu/device_tree.h" #include "sysemu/qtest.h" #include "sysemu/kvm.h" +#include "sysemu/reset.h" #include <libfdt.h> @@ -241,6 +242,8 @@ uint64_t riscv_load_fdt(hwaddr dram_base, uint64_t mem_size, void *fdt) rom_add_blob_fixed_as("fdt", fdt, fdtsize, fdt_addr, &address_space_memory); + qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds, + rom_ptr_for_as(&address_space_memory, fdt_addr, fdtsize)); return fdt_addr; } diff --git a/hw/rx/rx-gdbsim.c b/hw/rx/rx-gdbsim.c index 8ffe1b8035..47c17026c7 100644 --- a/hw/rx/rx-gdbsim.c +++ b/hw/rx/rx-gdbsim.c @@ -25,6 +25,7 @@ #include "hw/rx/rx62n.h" #include "sysemu/qtest.h" #include "sysemu/device_tree.h" +#include "sysemu/reset.h" #include "hw/boards.h" #include "qom/object.h" @@ -148,6 +149,8 @@ static void rx_gdbsim_init(MachineState *machine) dtb_offset = ROUND_DOWN(machine->ram_size - dtb_size, 16); rom_add_blob_fixed("dtb", dtb, dtb_size, SDRAM_BASE + dtb_offset); + qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds, + rom_ptr(SDRAM_BASE + dtb_offset, dtb_size)); /* Set dtb address to R1 */ RX_CPU(first_cpu)->env.regs[1] = SDRAM_BASE + dtb_offset; } diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c index 401b63d6cb..728ba24547 100644 --- a/hw/s390x/pv.c +++ b/hw/s390x/pv.c @@ -20,6 +20,11 @@ #include "exec/confidential-guest-support.h" #include "hw/s390x/ipl.h" #include "hw/s390x/pv.h" +#include "target/s390x/kvm/kvm_s390x.h" + +static bool info_valid; +static struct kvm_s390_pv_info_vm info_vm; +static struct kvm_s390_pv_info_dump info_dump; static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data) { @@ -56,6 +61,42 @@ static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data) } \ } +int s390_pv_query_info(void) +{ + struct kvm_s390_pv_info info = { + .header.id = KVM_PV_INFO_VM, + .header.len_max = sizeof(info.header) + sizeof(info.vm), + }; + int rc; + + /* Info API's first user is dump so they are bundled */ + if (!kvm_s390_get_protected_dump()) { + return 0; + } + + rc = s390_pv_cmd(KVM_PV_INFO, &info); + if (rc) { + error_report("KVM PV INFO cmd %x failed: %s", + info.header.id, strerror(-rc)); + return rc; + } + memcpy(&info_vm, &info.vm, sizeof(info.vm)); + + info.header.id = KVM_PV_INFO_DUMP; + info.header.len_max = sizeof(info.header) + sizeof(info.dump); + rc = s390_pv_cmd(KVM_PV_INFO, &info); + if (rc) { + error_report("KVM PV INFO cmd %x failed: %s", + info.header.id, strerror(-rc)); + return rc; + } + + memcpy(&info_dump, &info.dump, sizeof(info.dump)); + info_valid = true; + + return rc; +} + int s390_pv_vm_enable(void) { return s390_pv_cmd(KVM_PV_ENABLE, NULL); @@ -114,6 +155,77 @@ void s390_pv_inject_reset_error(CPUState *cs) env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV; } +uint64_t kvm_s390_pv_dmp_get_size_cpu(void) +{ + return info_dump.dump_cpu_buffer_len; +} + +uint64_t kvm_s390_pv_dmp_get_size_completion_data(void) +{ + return info_dump.dump_config_finalize_len; +} + +uint64_t kvm_s390_pv_dmp_get_size_mem_state(void) +{ + return info_dump.dump_config_mem_buffer_per_1m; +} + +bool kvm_s390_pv_info_basic_valid(void) +{ + return info_valid; +} + +static int s390_pv_dump_cmd(uint64_t subcmd, uint64_t uaddr, uint64_t gaddr, + uint64_t len) +{ + struct kvm_s390_pv_dmp dmp = { + .subcmd = subcmd, + .buff_addr = uaddr, + .buff_len = len, + .gaddr = gaddr, + }; + int ret; + + ret = s390_pv_cmd(KVM_PV_DUMP, (void *)&dmp); + if (ret) { + error_report("KVM DUMP command %ld failed", subcmd); + } + return ret; +} + +int kvm_s390_dump_cpu(S390CPU *cpu, void *buff) +{ + struct kvm_s390_pv_dmp dmp = { + .subcmd = KVM_PV_DUMP_CPU, + .buff_addr = (uint64_t)buff, + .gaddr = 0, + .buff_len = info_dump.dump_cpu_buffer_len, + }; + struct kvm_pv_cmd pv = { + .cmd = KVM_PV_DUMP, + .data = (uint64_t)&dmp, + }; + + return kvm_vcpu_ioctl(CPU(cpu), KVM_S390_PV_CPU_COMMAND, &pv); +} + +int kvm_s390_dump_init(void) +{ + return s390_pv_dump_cmd(KVM_PV_DUMP_INIT, 0, 0, 0); +} + +int kvm_s390_dump_mem_state(uint64_t gaddr, size_t len, void *dest) +{ + return s390_pv_dump_cmd(KVM_PV_DUMP_CONFIG_STOR_STATE, (uint64_t)dest, + gaddr, len); +} + +int kvm_s390_dump_completion_data(void *buff) +{ + return s390_pv_dump_cmd(KVM_PV_DUMP_COMPLETE, (uint64_t)buff, 0, + info_dump.dump_config_finalize_len); +} + #define TYPE_S390_PV_GUEST "s390-pv-guest" OBJECT_DECLARE_SIMPLE_TYPE(S390PVGuest, S390_PV_GUEST) diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index 03855c7231..806de32034 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -366,6 +366,12 @@ static int s390_machine_protect(S390CcwMachineState *ms) ms->pv = true; + /* Will return 0 if API is not available since it's not vital */ + rc = s390_pv_query_info(); + if (rc) { + goto out_err; + } + /* Set SE header and unpack */ rc = s390_ipl_prepare_pv_header(); if (rc) { @@ -405,7 +411,7 @@ static void s390_pv_prepare_reset(S390CcwMachineState *ms) s390_pv_prep_reset(); } -static void s390_machine_reset(MachineState *machine) +static void s390_machine_reset(MachineState *machine, ShutdownCause reason) { S390CcwMachineState *ms = S390_CCW_MACHINE(machine); enum s390_reset reset_type; @@ -427,7 +433,7 @@ static void s390_machine_reset(MachineState *machine) s390_machine_unprotect(ms); } - qemu_devices_reset(); + qemu_devices_reset(reason); s390_crypto_reset(); /* configure and start the ipl CPU only */ diff --git a/hw/ssi/aspeed_smc.c b/hw/ssi/aspeed_smc.c index faed7e0cbe..22df4be528 100644 --- a/hw/ssi/aspeed_smc.c +++ b/hw/ssi/aspeed_smc.c @@ -388,7 +388,7 @@ static inline int aspeed_smc_flash_cmd(const AspeedSMCFlash *fl) static inline int aspeed_smc_flash_addr_width(const AspeedSMCFlash *fl) { const AspeedSMCState *s = fl->controller; - AspeedSMCClass *asc = ASPEED_SMC_GET_CLASS(s); + AspeedSMCClass *asc = fl->asc; if (asc->addr_width) { return asc->addr_width(s); @@ -420,7 +420,7 @@ static uint32_t aspeed_smc_check_segment_addr(const AspeedSMCFlash *fl, uint32_t addr) { const AspeedSMCState *s = fl->controller; - AspeedSMCClass *asc = ASPEED_SMC_GET_CLASS(s); + AspeedSMCClass *asc = fl->asc; AspeedSegments seg; asc->reg_to_segment(s, s->regs[R_SEG_ADDR0 + fl->cs], &seg); @@ -1234,7 +1234,6 @@ static const TypeInfo aspeed_smc_info = { static void aspeed_smc_flash_realize(DeviceState *dev, Error **errp) { AspeedSMCFlash *s = ASPEED_SMC_FLASH(dev); - AspeedSMCClass *asc; g_autofree char *name = g_strdup_printf(TYPE_ASPEED_SMC_FLASH ".%d", s->cs); if (!s->controller) { @@ -1242,14 +1241,14 @@ static void aspeed_smc_flash_realize(DeviceState *dev, Error **errp) return; } - asc = ASPEED_SMC_GET_CLASS(s->controller); + s->asc = ASPEED_SMC_GET_CLASS(s->controller); /* * Use the default segment value to size the memory region. This * can be changed by FW at runtime. */ memory_region_init_io(&s->mmio, OBJECT(s), &aspeed_smc_flash_ops, - s, name, asc->segments[s->cs].size); + s, name, s->asc->segments[s->cs].size); sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->mmio); } diff --git a/hw/ssi/ssi.c b/hw/ssi/ssi.c index 003931fb50..d54a109bee 100644 --- a/hw/ssi/ssi.c +++ b/hw/ssi/ssi.c @@ -38,9 +38,8 @@ static void ssi_cs_default(void *opaque, int n, int level) bool cs = !!level; assert(n == 0); if (s->cs != cs) { - SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s); - if (ssc->set_cs) { - ssc->set_cs(s, cs); + if (s->spc->set_cs) { + s->spc->set_cs(s, cs); } } s->cs = cs; @@ -48,11 +47,11 @@ static void ssi_cs_default(void *opaque, int n, int level) static uint32_t ssi_transfer_raw_default(SSIPeripheral *dev, uint32_t val) { - SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev); + SSIPeripheralClass *ssc = dev->spc; if ((dev->cs && ssc->cs_polarity == SSI_CS_HIGH) || - (!dev->cs && ssc->cs_polarity == SSI_CS_LOW) || - ssc->cs_polarity == SSI_CS_NONE) { + (!dev->cs && ssc->cs_polarity == SSI_CS_LOW) || + ssc->cs_polarity == SSI_CS_NONE) { return ssc->transfer(dev, val); } return 0; @@ -67,6 +66,7 @@ static void ssi_peripheral_realize(DeviceState *dev, Error **errp) ssc->cs_polarity != SSI_CS_NONE) { qdev_init_gpio_in_named(dev, ssi_cs_default, SSI_GPIO_CS, 1); } + s->spc = ssc; ssc->realize(s, errp); } @@ -115,13 +115,11 @@ uint32_t ssi_transfer(SSIBus *bus, uint32_t val) { BusState *b = BUS(bus); BusChild *kid; - SSIPeripheralClass *ssc; uint32_t r = 0; QTAILQ_FOREACH(kid, &b->children, sibling) { - SSIPeripheral *peripheral = SSI_PERIPHERAL(kid->child); - ssc = SSI_PERIPHERAL_GET_CLASS(peripheral); - r |= ssc->transfer_raw(peripheral, val); + SSIPeripheral *p = SSI_PERIPHERAL(kid->child); + r |= p->spc->transfer_raw(p, val); } return r; diff --git a/hw/timer/imx_epit.c b/hw/timer/imx_epit.c index 2bf8c754b2..ec0fa440d7 100644 --- a/hw/timer/imx_epit.c +++ b/hw/timer/imx_epit.c @@ -275,10 +275,15 @@ static void imx_epit_write(void *opaque, hwaddr offset, uint64_t value, /* If IOVW bit is set then set the timer value */ ptimer_set_count(s->timer_reload, s->lr); } - + /* + * Commit the change to s->timer_reload, so it can propagate. Otherwise + * the timer interrupt may not fire properly. The commit must happen + * before calling imx_epit_reload_compare_timer(), which reads + * s->timer_reload internally again. + */ + ptimer_transaction_commit(s->timer_reload); imx_epit_reload_compare_timer(s); ptimer_transaction_commit(s->timer_cmp); - ptimer_transaction_commit(s->timer_reload); break; case 3: /* CMP */ diff --git a/hw/timer/renesas_cmt.c b/hw/timer/renesas_cmt.c index 2e0fd21a36..69eabc678a 100644 --- a/hw/timer/renesas_cmt.c +++ b/hw/timer/renesas_cmt.c @@ -57,7 +57,7 @@ static void update_events(RCMTState *cmt, int ch) if ((cmt->cmstr & (1 << ch)) == 0) { /* count disable, so not happened next event. */ - return ; + return; } next_time = cmt->cmcor[ch] - cmt->cmcnt[ch]; next_time *= NANOSECONDS_PER_SECOND; diff --git a/hw/timer/renesas_tmr.c b/hw/timer/renesas_tmr.c index d96002e1ee..c15f654738 100644 --- a/hw/timer/renesas_tmr.c +++ b/hw/timer/renesas_tmr.c @@ -67,18 +67,18 @@ static void update_events(RTMRState *tmr, int ch) int i, event; if (tmr->tccr[ch] == 0) { - return ; + return; } if (FIELD_EX8(tmr->tccr[ch], TCCR, CSS) == 0) { /* external clock mode */ /* event not happened */ - return ; + return; } if (FIELD_EX8(tmr->tccr[0], TCCR, CSS) == CSS_CASCADING) { /* cascading mode */ if (ch == 1) { tmr->next[ch] = none; - return ; + return; } diff[cmia] = concat_reg(tmr->tcora) - concat_reg(tmr->tcnt); diff[cmib] = concat_reg(tmr->tcorb) - concat_reg(tmr->tcnt); @@ -384,7 +384,7 @@ static void timer_events(RTMRState *tmr, int ch) tmr->tcorb[ch]) & 0xff; } else { if (ch == 1) { - return ; + return; } tcnt = issue_event(tmr, ch, 16, concat_reg(tmr->tcnt), diff --git a/hw/usb/ccid-card-emulated.c b/hw/usb/ccid-card-emulated.c index 1ddf7297f6..ee41a81801 100644 --- a/hw/usb/ccid-card-emulated.c +++ b/hw/usb/ccid-card-emulated.c @@ -140,7 +140,7 @@ static void emulated_apdu_from_guest(CCIDCardState *base, const uint8_t *apdu, uint32_t len) { EmulatedState *card = EMULATED_CCID_CARD(base); - EmulEvent *event = (EmulEvent *)g_malloc(sizeof(EmulEvent) + len); + EmulEvent *event = g_malloc(sizeof(EmulEvent) + len); assert(event); event->p.data.type = EMUL_GUEST_APDU; diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index e7d80242b7..34db51e241 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1675,7 +1675,7 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) if (virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM)) { error_setg(errp, "VIRTIO_F_IOMMU_PLATFORM was supported by" " neither legacy nor transitional device"); - return ; + return; } /* * Legacy and transitional devices use specific subsystem IDs. diff --git a/include/block/block-common.h b/include/block/block-common.h index fdb7306e78..061606e867 100644 --- a/include/block/block-common.h +++ b/include/block/block-common.h @@ -80,6 +80,15 @@ typedef enum { */ BDRV_REQ_MAY_UNMAP = 0x4, + /* + * An optimization hint when all QEMUIOVector elements are within + * previously registered bdrv_register_buf() memory ranges. + * + * Code that replaces the user's QEMUIOVector elements with bounce buffers + * must take care to clear this flag. + */ + BDRV_REQ_REGISTERED_BUF = 0x8, + BDRV_REQ_FUA = 0x10, BDRV_REQ_WRITE_COMPRESSED = 0x20, diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h index 21265e3966..eba4ed23b4 100644 --- a/include/block/block-global-state.h +++ b/include/block/block-global-state.h @@ -243,9 +243,15 @@ void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp); * Register/unregister a buffer for I/O. For example, VFIO drivers are * interested to know the memory areas that would later be used for I/O, so * that they can prepare IOMMU mapping etc., to get better performance. + * + * Buffers must not overlap and they must be unregistered with the same <host, + * size> values that they were registered with. + * + * Returns: true on success, false on failure */ -void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size); -void bdrv_unregister_buf(BlockDriverState *bs, void *host); +bool bdrv_register_buf(BlockDriverState *bs, void *host, size_t size, + Error **errp); +void bdrv_unregister_buf(BlockDriverState *bs, void *host, size_t size); void bdrv_cancel_in_flight(BlockDriverState *bs); diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 8947abab76..9c569be162 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -433,9 +433,12 @@ struct BlockDriver { * that it can do IOMMU mapping with VFIO etc., in order to get better * performance. In the case of VFIO drivers, this callback is used to do * DMA mapping for hot buffers. + * + * Returns: true on success, false on failure */ - void (*bdrv_register_buf)(BlockDriverState *bs, void *host, size_t size); - void (*bdrv_unregister_buf)(BlockDriverState *bs, void *host); + bool (*bdrv_register_buf)(BlockDriverState *bs, void *host, size_t size, + Error **errp); + void (*bdrv_unregister_buf)(BlockDriverState *bs, void *host, size_t size); /* * This field is modified only under the BQL, and is part of @@ -1051,7 +1054,7 @@ struct BlockDriverState { /* * Flags honored during pread */ - unsigned int supported_read_flags; + BdrvRequestFlags supported_read_flags; /* * Flags honored during pwrite (so far: BDRV_REQ_FUA, * BDRV_REQ_WRITE_UNCHANGED). @@ -1069,12 +1072,12 @@ struct BlockDriverState { * flag), or they have to explicitly take the WRITE permission for * their children. */ - unsigned int supported_write_flags; + BdrvRequestFlags supported_write_flags; /* * Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA, * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) */ - unsigned int supported_zero_flags; + BdrvRequestFlags supported_zero_flags; /* * Flags honoured during truncate (so far: BDRV_REQ_ZERO_WRITE). * @@ -1082,7 +1085,7 @@ struct BlockDriverState { * that any added space reads as all zeros. If this can't be guaranteed, * the operation must fail. */ - unsigned int supported_truncate_flags; + BdrvRequestFlags supported_truncate_flags; /* the following member gives a name to every node on the bs graph. */ char node_name[32]; diff --git a/include/elf.h b/include/elf.h index 3d6b9062c0..8bf1e72720 100644 --- a/include/elf.h +++ b/include/elf.h @@ -1650,6 +1650,8 @@ typedef struct elf64_shdr { #define NT_TASKSTRUCT 4 #define NT_AUXV 6 #define NT_PRXFPREG 0x46e62b7f /* copied from gdb5.1/include/elf/common.h */ +#define NT_S390_PV_CPU_DATA 0x30e /* s390 protvirt cpu dump data */ +#define NT_S390_RI_CB 0x30d /* s390 runtime instrumentation */ #define NT_S390_GS_CB 0x30b /* s390 guarded storage registers */ #define NT_S390_VXRS_HIGH 0x30a /* s390 vector registers 16-31 */ #define NT_S390_VXRS_LOW 0x309 /* s390 vector registers 0-15 (lower half) */ diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h index 16b7df41bf..2eb1176538 100644 --- a/include/exec/cpu-all.h +++ b/include/exec/cpu-all.h @@ -281,28 +281,18 @@ void page_reset_target_data(target_ulong start, target_ulong end); int page_check_range(target_ulong start, target_ulong len, int flags); /** - * page_alloc_target_data(address, size) + * page_get_target_data(address) * @address: guest virtual address - * @size: size of data to allocate * - * Allocate @size bytes of out-of-band data to associate with the - * guest page at @address. If the page is not mapped, NULL will - * be returned. If there is existing data associated with @address, - * no new memory will be allocated. + * Return TARGET_PAGE_DATA_SIZE bytes of out-of-band data to associate + * with the guest page at @address, allocating it if necessary. The + * caller should already have verified that the address is valid. * * The memory will be freed when the guest page is deallocated, * e.g. with the munmap system call. */ -void *page_alloc_target_data(target_ulong address, size_t size); - -/** - * page_get_target_data(address) - * @address: guest virtual address - * - * Return any out-of-bound memory assocated with the guest page - * at @address, as per page_alloc_target_data. - */ -void *page_get_target_data(target_ulong address); +void *page_get_target_data(target_ulong address) + __attribute__((returns_nonnull)); #endif CPUArchState *cpu_copy(CPUArchState *env); diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index c493510ee9..6feaa40ca7 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -92,6 +92,7 @@ void qemu_ram_set_uf_zeroable(RAMBlock *rb); bool qemu_ram_is_migratable(RAMBlock *rb); void qemu_ram_set_migratable(RAMBlock *rb); void qemu_ram_unset_migratable(RAMBlock *rb); +int qemu_ram_get_fd(RAMBlock *rb); size_t qemu_ram_pagesize(RAMBlock *block); size_t qemu_ram_pagesize_largest(void); diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index e5f8b224a5..e948992a80 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -39,9 +39,6 @@ typedef ram_addr_t tb_page_addr_t; #define TB_PAGE_ADDR_FMT RAM_ADDR_FMT #endif -void restore_state_to_opc(CPUArchState *env, TranslationBlock *tb, - target_ulong *data); - /** * cpu_restore_state: * @cpu: the vCPU state is to be restore to @@ -610,18 +607,40 @@ static inline uint32_t tb_cflags(const TranslationBlock *tb) return qatomic_read(&tb->cflags); } +static inline tb_page_addr_t tb_page_addr0(const TranslationBlock *tb) +{ + return tb->page_addr[0]; +} + +static inline tb_page_addr_t tb_page_addr1(const TranslationBlock *tb) +{ + return tb->page_addr[1]; +} + +static inline void tb_set_page_addr0(TranslationBlock *tb, + tb_page_addr_t addr) +{ + tb->page_addr[0] = addr; +} + +static inline void tb_set_page_addr1(TranslationBlock *tb, + tb_page_addr_t addr) +{ + tb->page_addr[1] = addr; +} + /* current cflags for hashing/comparison */ uint32_t curr_cflags(CPUState *cpu); /* TranslationBlock invalidate API */ #if defined(CONFIG_USER_ONLY) void tb_invalidate_phys_addr(target_ulong addr); -void tb_invalidate_phys_range(target_ulong start, target_ulong end); #else void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs); #endif void tb_flush(CPUState *cpu); void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr); +void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end); void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr); /* GETPC is the true target of the return instruction that we'll execute. */ @@ -642,14 +661,6 @@ extern __thread uintptr_t tci_tb_ptr; smaller than 4 bytes, so we don't worry about special-casing this. */ #define GETPC_ADJ 2 -#if !defined(CONFIG_USER_ONLY) && defined(CONFIG_DEBUG_TCG) -void assert_no_pages_locked(void); -#else -static inline void assert_no_pages_locked(void) -{ -} -#endif - #if !defined(CONFIG_USER_ONLY) /** diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index f3e0c78161..1500680458 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -147,8 +147,6 @@ static inline void qemu_ram_block_writeback(RAMBlock *block) #define DIRTY_CLIENTS_ALL ((1 << DIRTY_MEMORY_NUM) - 1) #define DIRTY_CLIENTS_NOCODE (DIRTY_CLIENTS_ALL & ~(1 << DIRTY_MEMORY_CODE)) -void tb_invalidate_phys_range(ram_addr_t start, ram_addr_t end); - static inline bool cpu_physical_memory_get_dirty(ram_addr_t start, ram_addr_t length, unsigned client) diff --git a/include/exec/translate-all.h b/include/exec/translate-all.h index 9f646389af..3e9cb91565 100644 --- a/include/exec/translate-all.h +++ b/include/exec/translate-all.h @@ -29,7 +29,7 @@ void page_collection_unlock(struct page_collection *set); void tb_invalidate_phys_page_fast(struct page_collection *pages, tb_page_addr_t start, int len, uintptr_t retaddr); -void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end); +void tb_invalidate_phys_page(tb_page_addr_t addr); void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr); #ifdef CONFIG_USER_ONLY diff --git a/include/hw/boards.h b/include/hw/boards.h index 311ed17e18..90f1dd3aeb 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -231,7 +231,7 @@ struct MachineClass { const char *deprecation_reason; void (*init)(MachineState *state); - void (*reset)(MachineState *state); + void (*reset)(MachineState *state, ShutdownCause reason); void (*wakeup)(MachineState *state); int (*kvm_type)(MachineState *machine, const char *arg); diff --git a/include/hw/core/tcg-cpu-ops.h b/include/hw/core/tcg-cpu-ops.h index 78c6c6635d..20e3c0ffbb 100644 --- a/include/hw/core/tcg-cpu-ops.h +++ b/include/hw/core/tcg-cpu-ops.h @@ -31,6 +31,17 @@ struct TCGCPUOps { * function to restore all the state, and register it here. */ void (*synchronize_from_tb)(CPUState *cpu, const TranslationBlock *tb); + /** + * @restore_state_to_opc: Synchronize state from INDEX_op_start_insn + * + * This is called when we unwind state in the middle of a TB, + * usually before raising an exception. Set all part of the CPU + * state which are tracked insn-by-insn in the target-specific + * arguments to start_insn, passed as @data. + */ + void (*restore_state_to_opc)(CPUState *cpu, const TranslationBlock *tb, + const uint64_t *data); + /** @cpu_exec_enter: Callback for cpu_exec preparation */ void (*cpu_exec_enter)(CPUState *cpu); /** @cpu_exec_exit: Callback for cpu_exec cleanup */ diff --git a/include/hw/elf_ops.h b/include/hw/elf_ops.h index 7c3b1d0f6c..fbe0b1e956 100644 --- a/include/hw/elf_ops.h +++ b/include/hw/elf_ops.h @@ -117,7 +117,7 @@ static void glue(load_symbols, SZ)(struct elfhdr *ehdr, int fd, int must_swab, shdr_table = load_at(fd, ehdr->e_shoff, sizeof(struct elf_shdr) * ehdr->e_shnum); if (!shdr_table) { - return ; + return; } if (must_swab) { diff --git a/include/hw/i2c/aspeed_i2c.h b/include/hw/i2c/aspeed_i2c.h index 300a89b343..adc904d6c1 100644 --- a/include/hw/i2c/aspeed_i2c.h +++ b/include/hw/i2c/aspeed_i2c.h @@ -130,6 +130,7 @@ REG32(I2CD_CMD, 0x14) /* I2CD Command/Status */ SHARED_FIELD(M_TX_CMD, 1, 1) SHARED_FIELD(M_START_CMD, 0, 1) REG32(I2CD_DEV_ADDR, 0x18) /* Slave Device Address */ + SHARED_FIELD(SLAVE_DEV_ADDR1, 0, 7) REG32(I2CD_POOL_CTRL, 0x1C) /* Pool Buffer Control */ SHARED_FIELD(RX_COUNT, 24, 5) SHARED_FIELD(RX_SIZE, 16, 5) diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h index 1f1f545bfc..9360aa1091 100644 --- a/include/hw/s390x/pv.h +++ b/include/hw/s390x/pv.h @@ -38,6 +38,7 @@ static inline bool s390_is_pv(void) return ccw->pv; } +int s390_pv_query_info(void); int s390_pv_vm_enable(void); void s390_pv_vm_disable(void); int s390_pv_set_sec_parms(uint64_t origin, uint64_t length); @@ -46,8 +47,17 @@ void s390_pv_prep_reset(void); int s390_pv_verify(void); void s390_pv_unshare(void); void s390_pv_inject_reset_error(CPUState *cs); +uint64_t kvm_s390_pv_dmp_get_size_cpu(void); +uint64_t kvm_s390_pv_dmp_get_size_mem_state(void); +uint64_t kvm_s390_pv_dmp_get_size_completion_data(void); +bool kvm_s390_pv_info_basic_valid(void); +int kvm_s390_dump_init(void); +int kvm_s390_dump_cpu(S390CPU *cpu, void *buff); +int kvm_s390_dump_mem_state(uint64_t addr, size_t len, void *dest); +int kvm_s390_dump_completion_data(void *buff); #else /* CONFIG_KVM */ static inline bool s390_is_pv(void) { return false; } +static inline int s390_pv_query_info(void) { return 0; } static inline int s390_pv_vm_enable(void) { return 0; } static inline void s390_pv_vm_disable(void) {} static inline int s390_pv_set_sec_parms(uint64_t origin, uint64_t length) { return 0; } @@ -56,6 +66,15 @@ static inline void s390_pv_prep_reset(void) {} static inline int s390_pv_verify(void) { return 0; } static inline void s390_pv_unshare(void) {} static inline void s390_pv_inject_reset_error(CPUState *cs) {}; +static inline uint64_t kvm_s390_pv_dmp_get_size_cpu(void) { return 0; } +static inline uint64_t kvm_s390_pv_dmp_get_size_mem_state(void) { return 0; } +static inline uint64_t kvm_s390_pv_dmp_get_size_completion_data(void) { return 0; } +static inline bool kvm_s390_pv_info_basic_valid(void) { return false; } +static inline int kvm_s390_dump_init(void) { return 0; } +static inline int kvm_s390_dump_cpu(S390CPU *cpu, void *buff) { return 0; } +static inline int kvm_s390_dump_mem_state(uint64_t addr, size_t len, + void *dest) { return 0; } +static inline int kvm_s390_dump_completion_data(void *buff) { return 0; } #endif /* CONFIG_KVM */ int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h index 3b1b3d278e..6ea4b64fe7 100644 --- a/include/hw/scsi/scsi.h +++ b/include/hw/scsi/scsi.h @@ -188,7 +188,6 @@ SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk, const char *serial, Error **errp); void scsi_bus_set_ua(SCSIBus *bus, SCSISense sense); void scsi_bus_legacy_handle_cmdline(SCSIBus *bus); -void scsi_legacy_handle_cmdline(void); SCSIRequest *scsi_req_alloc(const SCSIReqOps *reqops, SCSIDevice *d, uint32_t tag, uint32_t lun, void *hba_private); diff --git a/include/hw/ssi/aspeed_smc.h b/include/hw/ssi/aspeed_smc.h index 2d5f8f3d8f..8e1dda556b 100644 --- a/include/hw/ssi/aspeed_smc.h +++ b/include/hw/ssi/aspeed_smc.h @@ -30,6 +30,7 @@ #include "qom/object.h" struct AspeedSMCState; +struct AspeedSMCClass; #define TYPE_ASPEED_SMC_FLASH "aspeed.smc.flash" OBJECT_DECLARE_SIMPLE_TYPE(AspeedSMCFlash, ASPEED_SMC_FLASH) @@ -37,6 +38,7 @@ struct AspeedSMCFlash { SysBusDevice parent_obj; struct AspeedSMCState *controller; + struct AspeedSMCClass *asc; uint8_t cs; MemoryRegion mmio; diff --git a/include/hw/ssi/ssi.h b/include/hw/ssi/ssi.h index f411858ab0..6950f86810 100644 --- a/include/hw/ssi/ssi.h +++ b/include/hw/ssi/ssi.h @@ -59,6 +59,9 @@ struct SSIPeripheralClass { struct SSIPeripheral { DeviceState parent_obj; + /* cache the class */ + SSIPeripheralClass *spc; + /* Chip select state */ bool cs; }; diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h index d311c57cca..7f589b4146 100644 --- a/include/hw/virtio/virtio-blk.h +++ b/include/hw/virtio/virtio-blk.h @@ -19,6 +19,7 @@ #include "hw/block/block.h" #include "sysemu/iothread.h" #include "sysemu/block-backend.h" +#include "sysemu/block-ram-registrar.h" #include "qom/object.h" #define TYPE_VIRTIO_BLK "virtio-blk-device" @@ -64,6 +65,7 @@ struct VirtIOBlock { struct VirtIOBlockDataPlane *dataplane; uint64_t host_features; size_t config_size; + BlockRAMRegistrar blk_ram_registrar; }; typedef struct VirtIOBlockReq { diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h index 7e8fc8e7cd..874134fd19 100644 --- a/include/qemu/atomic.h +++ b/include/qemu/atomic.h @@ -133,7 +133,7 @@ #define qatomic_read(ptr) \ ({ \ - QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \ + qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \ qatomic_read__nocheck(ptr); \ }) @@ -141,7 +141,7 @@ __atomic_store_n(ptr, i, __ATOMIC_RELAXED) #define qatomic_set(ptr, i) do { \ - QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \ + qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \ qatomic_set__nocheck(ptr, i); \ } while(0) @@ -159,27 +159,27 @@ #define qatomic_rcu_read(ptr) \ ({ \ - QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \ + qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \ typeof_strip_qual(*ptr) _val; \ qatomic_rcu_read__nocheck(ptr, &_val); \ _val; \ }) #define qatomic_rcu_set(ptr, i) do { \ - QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \ + qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \ __atomic_store_n(ptr, i, __ATOMIC_RELEASE); \ } while(0) #define qatomic_load_acquire(ptr) \ ({ \ - QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \ + qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \ typeof_strip_qual(*ptr) _val; \ __atomic_load(ptr, &_val, __ATOMIC_ACQUIRE); \ _val; \ }) #define qatomic_store_release(ptr, i) do { \ - QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \ + qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \ __atomic_store_n(ptr, i, __ATOMIC_RELEASE); \ } while(0) @@ -191,7 +191,7 @@ }) #define qatomic_xchg(ptr, i) ({ \ - QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \ + qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \ qatomic_xchg__nocheck(ptr, i); \ }) @@ -204,7 +204,7 @@ }) #define qatomic_cmpxchg(ptr, old, new) ({ \ - QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \ + qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \ qatomic_cmpxchg__nocheck(ptr, old, new); \ }) diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index aae33cce17..608fe45dcf 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -198,14 +198,25 @@ typedef struct CoQueue { */ void qemu_co_queue_init(CoQueue *queue); +typedef enum { + /* + * Enqueue at front instead of back. Use this to re-queue a request when + * its wait condition is not satisfied after being woken up. + */ + CO_QUEUE_WAIT_FRONT = 0x1, +} CoQueueWaitFlags; + /** * Adds the current coroutine to the CoQueue and transfers control to the * caller of the coroutine. The mutex is unlocked during the wait and * locked again afterwards. */ #define qemu_co_queue_wait(queue, lock) \ - qemu_co_queue_wait_impl(queue, QEMU_MAKE_LOCKABLE(lock)) -void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock); + qemu_co_queue_wait_impl(queue, QEMU_MAKE_LOCKABLE(lock), 0) +#define qemu_co_queue_wait_flags(queue, lock, flags) \ + qemu_co_queue_wait_impl(queue, QEMU_MAKE_LOCKABLE(lock), (flags)) +void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock, + CoQueueWaitFlags flags); /** * Removes the next coroutine from the CoQueue, and queue it to run after diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index b1c161c035..2276094729 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -186,6 +186,14 @@ void QEMU_ERROR("code path is reachable") #define qemu_build_not_reached() g_assert_not_reached() #endif +/** + * qemu_build_assert() + * + * The compiler, during optimization, is expected to prove that the + * assertion is true. + */ +#define qemu_build_assert(test) while (!(test)) qemu_build_not_reached() + /* * According to waitpid man page: * WCOREDUMP diff --git a/include/qemu/thread.h b/include/qemu/thread.h index af19f2b3fc..20641e5844 100644 --- a/include/qemu/thread.h +++ b/include/qemu/thread.h @@ -227,7 +227,7 @@ struct QemuSpin { static inline void qemu_spin_init(QemuSpin *spin) { - __sync_lock_release(&spin->value); + qatomic_set(&spin->value, 0); #ifdef CONFIG_TSAN __tsan_mutex_create(spin, __tsan_mutex_not_static); #endif @@ -246,7 +246,7 @@ static inline void qemu_spin_lock(QemuSpin *spin) #ifdef CONFIG_TSAN __tsan_mutex_pre_lock(spin, 0); #endif - while (unlikely(__sync_lock_test_and_set(&spin->value, true))) { + while (unlikely(qatomic_xchg(&spin->value, 1))) { while (qatomic_read(&spin->value)) { cpu_relax(); } @@ -261,7 +261,7 @@ static inline bool qemu_spin_trylock(QemuSpin *spin) #ifdef CONFIG_TSAN __tsan_mutex_pre_lock(spin, __tsan_mutex_try_lock); #endif - bool busy = __sync_lock_test_and_set(&spin->value, true); + bool busy = qatomic_xchg(&spin->value, true); #ifdef CONFIG_TSAN unsigned flags = __tsan_mutex_try_lock; flags |= busy ? __tsan_mutex_try_lock_failed : 0; @@ -280,7 +280,7 @@ static inline void qemu_spin_unlock(QemuSpin *spin) #ifdef CONFIG_TSAN __tsan_mutex_pre_unlock(spin, 0); #endif - __sync_lock_release(&spin->value); + qatomic_store_release(&spin->value, 0); #ifdef CONFIG_TSAN __tsan_mutex_post_unlock(spin, 0); #endif diff --git a/include/sysemu/block-backend-global-state.h b/include/sysemu/block-backend-global-state.h index 415f0c91d7..6858e39cb6 100644 --- a/include/sysemu/block-backend-global-state.h +++ b/include/sysemu/block-backend-global-state.h @@ -106,8 +106,8 @@ void blk_io_limits_enable(BlockBackend *blk, const char *group); void blk_io_limits_update_group(BlockBackend *blk, const char *group); void blk_set_force_allow_inactivate(BlockBackend *blk); -void blk_register_buf(BlockBackend *blk, void *host, size_t size); -void blk_unregister_buf(BlockBackend *blk, void *host); +bool blk_register_buf(BlockBackend *blk, void *host, size_t size, Error **errp); +void blk_unregister_buf(BlockBackend *blk, void *host, size_t size); const BdrvChild *blk_root(BlockBackend *blk); diff --git a/include/sysemu/block-ram-registrar.h b/include/sysemu/block-ram-registrar.h new file mode 100644 index 0000000000..d8b2f7942b --- /dev/null +++ b/include/sysemu/block-ram-registrar.h @@ -0,0 +1,37 @@ +/* + * BlockBackend RAM Registrar + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef BLOCK_RAM_REGISTRAR_H +#define BLOCK_RAM_REGISTRAR_H + +#include "exec/ramlist.h" + +/** + * struct BlockRAMRegistrar: + * + * Keeps RAMBlock memory registered with a BlockBackend using + * blk_register_buf() including hotplugged memory. + * + * Emulated devices or other BlockBackend users initialize a BlockRAMRegistrar + * with blk_ram_registrar_init() before submitting I/O requests with the + * BDRV_REQ_REGISTERED_BUF flag set. + */ +typedef struct { + BlockBackend *blk; + RAMBlockNotifier notifier; + bool ok; +} BlockRAMRegistrar; + +void blk_ram_registrar_init(BlockRAMRegistrar *r, BlockBackend *blk); +void blk_ram_registrar_destroy(BlockRAMRegistrar *r); + +/* Have all RAMBlocks been registered successfully? */ +static inline bool blk_ram_registrar_ok(BlockRAMRegistrar *r) +{ + return r->ok; +} + +#endif /* BLOCK_RAM_REGISTRAR_H */ diff --git a/include/sysemu/device_tree.h b/include/sysemu/device_tree.h index e7c5441f56..ca5339beae 100644 --- a/include/sysemu/device_tree.h +++ b/include/sysemu/device_tree.h @@ -197,6 +197,15 @@ int qemu_fdt_setprop_sized_cells_from_array(void *fdt, qdt_tmp); \ }) + +/** + * qemu_fdt_randomize_seeds: + * @fdt: device tree blob + * + * Re-randomize all "rng-seed" properties with new seeds. + */ +void qemu_fdt_randomize_seeds(void *fdt); + #define FDT_PCI_RANGE_RELOCATABLE 0x80000000 #define FDT_PCI_RANGE_PREFETCHABLE 0x40000000 #define FDT_PCI_RANGE_ALIASED 0x20000000 diff --git a/include/sysemu/dump-arch.h b/include/sysemu/dump-arch.h index e25b02e990..59bbc9be38 100644 --- a/include/sysemu/dump-arch.h +++ b/include/sysemu/dump-arch.h @@ -21,6 +21,9 @@ typedef struct ArchDumpInfo { uint32_t page_size; /* The target's page size. If it's variable and * unknown, then this should be the maximum. */ uint64_t phys_base; /* The target's physmem base. */ + void (*arch_sections_add_fn)(DumpState *s); + uint64_t (*arch_sections_write_hdr_fn)(DumpState *s, uint8_t *buff); + int (*arch_sections_write_fn)(DumpState *s, uint8_t *buff); } ArchDumpInfo; struct GuestPhysBlockList; /* memory_mapping.h */ diff --git a/include/sysemu/dump.h b/include/sysemu/dump.h index b62513d87d..4ffed0b659 100644 --- a/include/sysemu/dump.h +++ b/include/sysemu/dump.h @@ -154,15 +154,8 @@ typedef struct DumpState { GuestPhysBlockList guest_phys_blocks; ArchDumpInfo dump_info; MemoryMappingList list; - uint32_t phdr_num; - uint32_t shdr_num; bool resume; bool detached; - ssize_t note_size; - hwaddr shdr_offset; - hwaddr phdr_offset; - hwaddr section_offset; - hwaddr note_offset; hwaddr memory_offset; int fd; @@ -177,6 +170,20 @@ typedef struct DumpState { int64_t filter_area_begin; /* Start address of partial guest memory area */ int64_t filter_area_length; /* Length of partial guest memory area */ + /* Elf dump related data */ + uint32_t phdr_num; + uint32_t shdr_num; + ssize_t note_size; + hwaddr shdr_offset; + hwaddr phdr_offset; + hwaddr section_offset; + hwaddr note_offset; + + void *elf_section_hdrs; /* Pointer to section header buffer */ + void *elf_section_data; /* Pointer to section data buffer */ + uint64_t elf_section_data_size; /* Size of section data */ + GArray *string_table_buf; /* String table data buffer */ + uint8_t *note_buf; /* buffer for notes */ size_t note_buf_offset; /* the writing place in note_buf */ uint32_t nr_cpus; /* number of guest's cpu */ @@ -208,4 +215,9 @@ typedef struct DumpState { uint16_t cpu_to_dump16(DumpState *s, uint16_t val); uint32_t cpu_to_dump32(DumpState *s, uint32_t val); uint64_t cpu_to_dump64(DumpState *s, uint64_t val); + +int64_t dump_filtered_memblock_size(GuestPhysBlock *block, int64_t filter_area_start, + int64_t filter_area_length); +int64_t dump_filtered_memblock_start(GuestPhysBlock *block, int64_t filter_area_start, + int64_t filter_area_length); #endif diff --git a/include/sysemu/reset.h b/include/sysemu/reset.h index 0b0d6d7598..609e4d50c2 100644 --- a/include/sysemu/reset.h +++ b/include/sysemu/reset.h @@ -1,10 +1,13 @@ #ifndef QEMU_SYSEMU_RESET_H #define QEMU_SYSEMU_RESET_H +#include "qapi/qapi-events-run-state.h" + typedef void QEMUResetHandler(void *opaque); void qemu_register_reset(QEMUResetHandler *func, void *opaque); +void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque); void qemu_unregister_reset(QEMUResetHandler *func, void *opaque); -void qemu_devices_reset(void); +void qemu_devices_reset(ShutdownCause reason); #endif diff --git a/linux-user/cpu_loop-common.h b/linux-user/cpu_loop-common.h index 36ff5b14f2..e644d2ef90 100644 --- a/linux-user/cpu_loop-common.h +++ b/linux-user/cpu_loop-common.h @@ -23,18 +23,9 @@ #include "exec/log.h" #include "special-errno.h" -#define EXCP_DUMP(env, fmt, ...) \ -do { \ - CPUState *cs = env_cpu(env); \ - fprintf(stderr, fmt , ## __VA_ARGS__); \ - fprintf(stderr, "Failing executable: %s\n", exec_path); \ - cpu_dump_state(cs, stderr, 0); \ - if (qemu_log_separate()) { \ - qemu_log(fmt, ## __VA_ARGS__); \ - qemu_log("Failing executable: %s\n", exec_path); \ - log_cpu_state(cs, 0); \ - } \ -} while (0) +void target_exception_dump(CPUArchState *env, const char *fmt, int code); +#define EXCP_DUMP(env, fmt, code) \ + target_exception_dump(env, fmt, code) void target_cpu_copy_regs(CPUArchState *env, struct target_pt_regs *regs); #endif diff --git a/linux-user/i386/cpu_loop.c b/linux-user/i386/cpu_loop.c index 42837399bc..865413c08f 100644 --- a/linux-user/i386/cpu_loop.c +++ b/linux-user/i386/cpu_loop.c @@ -201,7 +201,6 @@ void cpu_loop(CPUX86State *env) { CPUState *cs = env_cpu(env); int trapnr; - abi_ulong pc; abi_ulong ret; for(;;) { @@ -307,9 +306,8 @@ void cpu_loop(CPUX86State *env) cpu_exec_step_atomic(cs); break; default: - pc = env->segs[R_CS].base + env->eip; - EXCP_DUMP(env, "qemu: 0x%08lx: unhandled CPU exception 0x%x - aborting\n", - (long)pc, trapnr); + EXCP_DUMP(env, "qemu: unhandled CPU exception 0x%x - aborting\n", + trapnr); abort(); } process_pending_signals(env); diff --git a/linux-user/ioctls.h b/linux-user/ioctls.h index f182d40190..071f7ca253 100644 --- a/linux-user/ioctls.h +++ b/linux-user/ioctls.h @@ -96,9 +96,7 @@ IOCTL(BLKROGET, IOC_R, MK_PTR(TYPE_INT)) IOCTL(BLKRRPART, 0, TYPE_NULL) IOCTL(BLKGETSIZE, IOC_R, MK_PTR(TYPE_ULONG)) -#ifdef BLKGETSIZE64 IOCTL(BLKGETSIZE64, IOC_R, MK_PTR(TYPE_ULONGLONG)) -#endif IOCTL(BLKFLSBUF, 0, TYPE_NULL) IOCTL(BLKRASET, 0, TYPE_INT) IOCTL(BLKRAGET, IOC_R, MK_PTR(TYPE_LONG)) @@ -107,33 +105,15 @@ IOCTL_SPECIAL(BLKPG, IOC_W, do_ioctl_blkpg, MK_PTR(MK_STRUCT(STRUCT_blkpg_ioctl_arg))) -#ifdef BLKDISCARD IOCTL(BLKDISCARD, IOC_W, MK_PTR(MK_ARRAY(TYPE_ULONGLONG, 2))) -#endif -#ifdef BLKIOMIN IOCTL(BLKIOMIN, IOC_R, MK_PTR(TYPE_INT)) -#endif -#ifdef BLKIOOPT IOCTL(BLKIOOPT, IOC_R, MK_PTR(TYPE_INT)) -#endif -#ifdef BLKALIGNOFF IOCTL(BLKALIGNOFF, IOC_R, MK_PTR(TYPE_INT)) -#endif -#ifdef BLKPBSZGET IOCTL(BLKPBSZGET, IOC_R, MK_PTR(TYPE_INT)) -#endif -#ifdef BLKDISCARDZEROES IOCTL(BLKDISCARDZEROES, IOC_R, MK_PTR(TYPE_INT)) -#endif -#ifdef BLKSECDISCARD IOCTL(BLKSECDISCARD, IOC_W, MK_PTR(MK_ARRAY(TYPE_ULONGLONG, 2))) -#endif -#ifdef BLKROTATIONAL IOCTL(BLKROTATIONAL, IOC_R, MK_PTR(TYPE_SHORT)) -#endif -#ifdef BLKZEROOUT IOCTL(BLKZEROOUT, IOC_W, MK_PTR(MK_ARRAY(TYPE_ULONGLONG, 2))) -#endif IOCTL(FDMSGON, 0, TYPE_NULL) IOCTL(FDMSGOFF, 0, TYPE_NULL) @@ -149,17 +129,13 @@ IOCTL(FDTWADDLE, 0, TYPE_NULL) IOCTL(FDEJECT, 0, TYPE_NULL) -#ifdef FIBMAP IOCTL(FIBMAP, IOC_W | IOC_R, MK_PTR(TYPE_LONG)) -#endif #ifdef FICLONE IOCTL(FICLONE, IOC_W, TYPE_INT) IOCTL(FICLONERANGE, IOC_W, MK_PTR(MK_STRUCT(STRUCT_file_clone_range))) #endif -#ifdef FIGETBSZ IOCTL(FIGETBSZ, IOC_R, MK_PTR(TYPE_LONG)) -#endif #ifdef CONFIG_FIEMAP IOCTL_SPECIAL(FS_IOC_FIEMAP, IOC_W | IOC_R, do_ioctl_fs_ioc_fiemap, MK_PTR(MK_STRUCT(STRUCT_fiemap))) diff --git a/linux-user/mmap.c b/linux-user/mmap.c index 28f3bc85ed..10f5079331 100644 --- a/linux-user/mmap.c +++ b/linux-user/mmap.c @@ -182,7 +182,6 @@ int target_mprotect(abi_ulong start, abi_ulong len, int target_prot) } page_set_flags(start, start + len, page_flags); - tb_invalidate_phys_range(start, start + len); ret = 0; error: @@ -662,7 +661,6 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int target_prot, qemu_log_unlock(f); } } - tb_invalidate_phys_range(start, start + len); mmap_unlock(); return start; fail: @@ -766,7 +764,6 @@ int target_munmap(abi_ulong start, abi_ulong len) if (ret == 0) { page_set_flags(start, start + len, 0); - tb_invalidate_phys_range(start, start + len); } mmap_unlock(); return ret; @@ -856,7 +853,6 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size, page_set_flags(new_addr, new_addr + new_size, prot | PAGE_VALID | PAGE_RESET); } - tb_invalidate_phys_range(new_addr, new_addr + new_size); mmap_unlock(); return new_addr; } diff --git a/linux-user/strace.c b/linux-user/strace.c index 37bc96df9b..9ae5a812cd 100644 --- a/linux-user/strace.c +++ b/linux-user/strace.c @@ -1969,7 +1969,7 @@ print_execv(CPUArchState *cpu_env, const struct syscallname *name, } #endif -#ifdef TARGET_NR_faccessat +#if defined(TARGET_NR_faccessat) || defined(TARGET_NR_faccessat2) static void print_faccessat(CPUArchState *cpu_env, const struct syscallname *name, abi_long arg0, abi_long arg1, abi_long arg2, @@ -3383,10 +3383,10 @@ print_pidfd_send_signal(CPUArchState *cpu_env, const struct syscallname *name, unlock_user(p, arg2, 0); } else { - print_pointer(arg2, 1); + print_pointer(arg2, 0); } - print_raw_param("%u", arg3, 0); + print_raw_param("%u", arg3, 1); print_syscall_epilogue(name); } #endif diff --git a/linux-user/strace.list b/linux-user/strace.list index a87415bf3d..3df2184580 100644 --- a/linux-user/strace.list +++ b/linux-user/strace.list @@ -178,6 +178,9 @@ #ifdef TARGET_NR_faccessat { TARGET_NR_faccessat, "faccessat" , NULL, print_faccessat, NULL }, #endif +#ifdef TARGET_NR_faccessat2 +{ TARGET_NR_faccessat2, "faccessat2" , NULL, print_faccessat, NULL }, +#endif #ifdef TARGET_NR_fadvise64 { TARGET_NR_fadvise64, "fadvise64" , NULL, NULL, NULL }, #endif diff --git a/linux-user/syscall.c b/linux-user/syscall.c index 2e954d8dbd..8402c1399d 100644 --- a/linux-user/syscall.c +++ b/linux-user/syscall.c @@ -111,6 +111,31 @@ #define FS_IOC32_SETFLAGS _IOW('f', 2, int) #define FS_IOC32_GETVERSION _IOR('v', 1, int) #define FS_IOC32_SETVERSION _IOW('v', 2, int) + +#define BLKGETSIZE64 _IOR(0x12,114,size_t) +#define BLKDISCARD _IO(0x12,119) +#define BLKIOMIN _IO(0x12,120) +#define BLKIOOPT _IO(0x12,121) +#define BLKALIGNOFF _IO(0x12,122) +#define BLKPBSZGET _IO(0x12,123) +#define BLKDISCARDZEROES _IO(0x12,124) +#define BLKSECDISCARD _IO(0x12,125) +#define BLKROTATIONAL _IO(0x12,126) +#define BLKZEROOUT _IO(0x12,127) + +#define FIBMAP _IO(0x00,1) +#define FIGETBSZ _IO(0x00,2) + +struct file_clone_range { + __s64 src_fd; + __u64 src_offset; + __u64 src_length; + __u64 dest_offset; +}; + +#define FICLONE _IOW(0x94, 9, int) +#define FICLONERANGE _IOW(0x94, 13, struct file_clone_range) + #else #include <linux/fs.h> #endif @@ -158,6 +183,7 @@ #include "qapi/error.h" #include "fd-trans.h" #include "tcg/tcg.h" +#include "cpu_loop-common.h" #ifndef CLONE_IO #define CLONE_IO 0x80000000 /* Clone io context */ @@ -8144,6 +8170,33 @@ static int is_proc_myself(const char *filename, const char *entry) return 0; } +static void excp_dump_file(FILE *logfile, CPUArchState *env, + const char *fmt, int code) +{ + if (logfile) { + CPUState *cs = env_cpu(env); + + fprintf(logfile, fmt, code); + fprintf(logfile, "Failing executable: %s\n", exec_path); + cpu_dump_state(cs, logfile, 0); + open_self_maps(env, fileno(logfile)); + } +} + +void target_exception_dump(CPUArchState *env, const char *fmt, int code) +{ + /* dump to console */ + excp_dump_file(stderr, env, fmt, code); + + /* dump to log file */ + if (qemu_log_separate()) { + FILE *logfile = qemu_log_trylock(); + + excp_dump_file(logfile, env, fmt, code); + qemu_log_unlock(logfile); + } +} + #if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN || \ defined(TARGET_SPARC) || defined(TARGET_M68K) || defined(TARGET_HPPA) static int is_proc(const char *filename, const char *entry) @@ -8251,8 +8304,7 @@ static int do_openat(CPUArchState *cpu_env, int dirfd, const char *pathname, int }; if (is_proc_myself(pathname, "exe")) { - int execfd = qemu_getauxval(AT_EXECFD); - return execfd ? execfd : safe_openat(dirfd, exec_path, flags, mode); + return safe_openat(dirfd, exec_path, flags, mode); } for (fake_open = fakes; fake_open->filename; fake_open++) { @@ -8679,16 +8731,21 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1, #if defined(__NR_pidfd_send_signal) && defined(TARGET_NR_pidfd_send_signal) case TARGET_NR_pidfd_send_signal: { - siginfo_t uinfo; + siginfo_t uinfo, *puinfo; - p = lock_user(VERIFY_READ, arg3, sizeof(target_siginfo_t), 1); - if (!p) { - return -TARGET_EFAULT; + if (arg3) { + p = lock_user(VERIFY_READ, arg3, sizeof(target_siginfo_t), 1); + if (!p) { + return -TARGET_EFAULT; + } + target_to_host_siginfo(&uinfo, p); + unlock_user(p, arg3, 0); + puinfo = &uinfo; + } else { + puinfo = NULL; } - target_to_host_siginfo(&uinfo, p); - unlock_user(p, arg3, 0); ret = get_errno(pidfd_send_signal(arg1, target_to_host_signal(arg2), - &uinfo, arg4)); + puinfo, arg4)); } return ret; #endif @@ -8855,7 +8912,11 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1, * before the execve completes and makes it the other * program's problem. */ - ret = get_errno(safe_execve(p, argp, envp)); + if (is_proc_myself(p, "exe")) { + ret = get_errno(safe_execve(exec_path, argp, envp)); + } else { + ret = get_errno(safe_execve(p, argp, envp)); + } unlock_user(p, arg1, 0); goto execve_end; @@ -9110,6 +9171,15 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1, unlock_user(p, arg2, 0); return ret; #endif +#if defined(TARGET_NR_faccessat2) + case TARGET_NR_faccessat2: + if (!(p = lock_user_string(arg2))) { + return -TARGET_EFAULT; + } + ret = get_errno(faccessat(arg1, p, arg3, arg4)); + unlock_user(p, arg2, 0); + return ret; +#endif #ifdef TARGET_NR_nice /* not on alpha */ case TARGET_NR_nice: return get_errno(nice(arg1)); @@ -11793,7 +11863,7 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1, return -host_to_target_errno(ret); #endif -#if TARGET_ABI_BITS == 32 +#if TARGET_ABI_BITS == 32 && !defined(TARGET_ABI_MIPSN32) #ifdef TARGET_NR_fadvise64_64 case TARGET_NR_fadvise64_64: @@ -11920,7 +11990,7 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1, return get_errno(sys_gettid()); #ifdef TARGET_NR_readahead case TARGET_NR_readahead: -#if TARGET_ABI_BITS == 32 +#if TARGET_ABI_BITS == 32 && !defined(TARGET_ABI_MIPSN32) if (regpairs_aligned(cpu_env, num)) { arg2 = arg3; arg3 = arg4; @@ -12612,7 +12682,7 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1, #endif /* CONFIG_EVENTFD */ #if defined(CONFIG_FALLOCATE) && defined(TARGET_NR_fallocate) case TARGET_NR_fallocate: -#if TARGET_ABI_BITS == 32 +#if TARGET_ABI_BITS == 32 && !defined(TARGET_ABI_MIPSN32) ret = get_errno(fallocate(arg1, arg2, target_offset64(arg3, arg4), target_offset64(arg5, arg6))); #else @@ -12623,7 +12693,7 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1, #if defined(CONFIG_SYNC_FILE_RANGE) #if defined(TARGET_NR_sync_file_range) case TARGET_NR_sync_file_range: -#if TARGET_ABI_BITS == 32 +#if TARGET_ABI_BITS == 32 && !defined(TARGET_ABI_MIPSN32) #if defined(TARGET_MIPS) ret = get_errno(sync_file_range(arg1, target_offset64(arg3, arg4), target_offset64(arg5, arg6), arg7)); @@ -12645,7 +12715,7 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1, case TARGET_NR_arm_sync_file_range: #endif /* This is like sync_file_range but the arguments are reordered */ -#if TARGET_ABI_BITS == 32 +#if TARGET_ABI_BITS == 32 && !defined(TARGET_ABI_MIPSN32) ret = get_errno(sync_file_range(arg1, target_offset64(arg3, arg4), target_offset64(arg5, arg6), arg2)); #else diff --git a/meson.build b/meson.build index 5f114c89d9..f06d465279 100644 --- a/meson.build +++ b/meson.build @@ -75,7 +75,7 @@ have_tools = get_option('tools') \ .allowed() have_ga = get_option('guest_agent') \ .disable_auto_if(not have_system and not have_tools) \ - .require(targetos in ['sunos', 'linux', 'windows'], + .require(targetos in ['sunos', 'linux', 'windows', 'freebsd'], error_message: 'unsupported OS for QEMU guest agent') \ .allowed() have_block = have_system or have_tools @@ -777,6 +777,13 @@ if not get_option('virglrenderer').auto() or have_system or have_vhost_user_gpu required: get_option('virglrenderer'), kwargs: static_kwargs) endif +blkio = not_found +if not get_option('blkio').auto() or have_block + blkio = dependency('blkio', + method: 'pkg-config', + required: get_option('blkio'), + kwargs: static_kwargs) +endif curl = not_found if not get_option('curl').auto() or have_block curl = dependency('libcurl', version: '>=7.29.0', @@ -1821,6 +1828,7 @@ config_host_data.set('CONFIG_LIBUDEV', libudev.found()) config_host_data.set('CONFIG_LZO', lzo.found()) config_host_data.set('CONFIG_MPATH', mpathpersist.found()) config_host_data.set('CONFIG_MPATH_NEW_API', mpathpersist_new_api) +config_host_data.set('CONFIG_BLKIO', blkio.found()) config_host_data.set('CONFIG_CURL', curl.found()) config_host_data.set('CONFIG_CURSES', curses.found()) config_host_data.set('CONFIG_GBM', gbm.found()) @@ -3878,6 +3886,7 @@ summary_info += {'PAM': pam} summary_info += {'iconv support': iconv} summary_info += {'curses support': curses} summary_info += {'virgl support': virgl} +summary_info += {'blkio support': blkio} summary_info += {'curl support': curl} summary_info += {'Multipath support': mpathpersist} summary_info += {'PNG support': png} diff --git a/meson_options.txt b/meson_options.txt index 79c6af18d5..66128178bf 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -117,6 +117,8 @@ option('bzip2', type : 'feature', value : 'auto', description: 'bzip2 support for DMG images') option('cap_ng', type : 'feature', value : 'auto', description: 'cap_ng support') +option('blkio', type : 'feature', value : 'auto', + description: 'libblkio block device driver') option('bpf', type : 'feature', value : 'auto', description: 'eBPF support') option('cocoa', type : 'feature', value : 'auto', diff --git a/migration/savevm.c b/migration/savevm.c index 48e85c052c..a0cdb714f7 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -3058,7 +3058,7 @@ bool load_snapshot(const char *name, const char *vmstate, goto err_drain; } - qemu_system_reset(SHUTDOWN_CAUSE_NONE); + qemu_system_reset(SHUTDOWN_CAUSE_SNAPSHOT_LOAD); mis->from_src_file = f; if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) { diff --git a/qapi/block-core.json b/qapi/block-core.json index 882b266532..cb5079e645 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2951,11 +2951,18 @@ 'file', 'snapshot-access', 'ftp', 'ftps', 'gluster', {'name': 'host_cdrom', 'if': 'HAVE_HOST_BLOCK_DEVICE' }, {'name': 'host_device', 'if': 'HAVE_HOST_BLOCK_DEVICE' }, - 'http', 'https', 'iscsi', - 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', - 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', + 'http', 'https', + { 'name': 'io_uring', 'if': 'CONFIG_BLKIO' }, + 'iscsi', + 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', + { 'name': 'nvme-io_uring', 'if': 'CONFIG_BLKIO' }, + 'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', + 'raw', 'rbd', { 'name': 'replication', 'if': 'CONFIG_REPLICATION' }, - 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] } + 'ssh', 'throttle', 'vdi', 'vhdx', + { 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' }, + { 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' }, + 'vmdk', 'vpc', 'vvfat' ] } ## # @BlockdevOptionsFile: @@ -3679,6 +3686,58 @@ '*logfile': 'str' } } ## +# @BlockdevOptionsIoUring: +# +# Driver specific block device options for the io_uring backend. +# +# @filename: path to the image file +# +# Since: 7.2 +## +{ 'struct': 'BlockdevOptionsIoUring', + 'data': { 'filename': 'str' }, + 'if': 'CONFIG_BLKIO' } + +## +# @BlockdevOptionsNvmeIoUring: +# +# Driver specific block device options for the nvme-io_uring backend. +# +# @filename: path to the image file +# +# Since: 7.2 +## +{ 'struct': 'BlockdevOptionsNvmeIoUring', + 'data': { 'filename': 'str' }, + 'if': 'CONFIG_BLKIO' } + +## +# @BlockdevOptionsVirtioBlkVhostUser: +# +# Driver specific block device options for the virtio-blk-vhost-user backend. +# +# @path: path to the vhost-user UNIX domain socket. +# +# Since: 7.2 +## +{ 'struct': 'BlockdevOptionsVirtioBlkVhostUser', + 'data': { 'path': 'str' }, + 'if': 'CONFIG_BLKIO' } + +## +# @BlockdevOptionsVirtioBlkVhostVdpa: +# +# Driver specific block device options for the virtio-blk-vhost-vdpa backend. +# +# @path: path to the vhost-vdpa character device. +# +# Since: 7.2 +## +{ 'struct': 'BlockdevOptionsVirtioBlkVhostVdpa', + 'data': { 'path': 'str' }, + 'if': 'CONFIG_BLKIO' } + +## # @IscsiTransport: # # An enumeration of libiscsi transport types @@ -4305,6 +4364,8 @@ 'if': 'HAVE_HOST_BLOCK_DEVICE' }, 'http': 'BlockdevOptionsCurlHttp', 'https': 'BlockdevOptionsCurlHttps', + 'io_uring': { 'type': 'BlockdevOptionsIoUring', + 'if': 'CONFIG_BLKIO' }, 'iscsi': 'BlockdevOptionsIscsi', 'luks': 'BlockdevOptionsLUKS', 'nbd': 'BlockdevOptionsNbd', @@ -4312,6 +4373,8 @@ 'null-aio': 'BlockdevOptionsNull', 'null-co': 'BlockdevOptionsNull', 'nvme': 'BlockdevOptionsNVMe', + 'nvme-io_uring': { 'type': 'BlockdevOptionsNvmeIoUring', + 'if': 'CONFIG_BLKIO' }, 'parallels': 'BlockdevOptionsGenericFormat', 'preallocate':'BlockdevOptionsPreallocate', 'qcow2': 'BlockdevOptionsQcow2', @@ -4327,6 +4390,12 @@ 'throttle': 'BlockdevOptionsThrottle', 'vdi': 'BlockdevOptionsGenericFormat', 'vhdx': 'BlockdevOptionsGenericFormat', + 'virtio-blk-vhost-user': + { 'type': 'BlockdevOptionsVirtioBlkVhostUser', + 'if': 'CONFIG_BLKIO' }, + 'virtio-blk-vhost-vdpa': + { 'type': 'BlockdevOptionsVirtioBlkVhostVdpa', + 'if': 'CONFIG_BLKIO' }, 'vmdk': 'BlockdevOptionsGenericCOWFormat', 'vpc': 'BlockdevOptionsGenericFormat', 'vvfat': 'BlockdevOptionsVVFAT' diff --git a/qapi/run-state.json b/qapi/run-state.json index 49989d30e6..419c188dd1 100644 --- a/qapi/run-state.json +++ b/qapi/run-state.json @@ -86,12 +86,16 @@ # ignores --no-reboot. This is useful for sanitizing # hypercalls on s390 that are used during kexec/kdump/boot # +# @snapshot-load: A snapshot is being loaded by the record & replay +# subsystem. This value is used only within QEMU. It +# doesn't occur in QMP. (since 7.2) +# ## { 'enum': 'ShutdownCause', # Beware, shutdown_caused_by_guest() depends on enumeration order 'data': [ 'none', 'host-error', 'host-qmp-quit', 'host-qmp-system-reset', 'host-signal', 'host-ui', 'guest-shutdown', 'guest-reset', - 'guest-panic', 'subsystem-reset'] } + 'guest-panic', 'subsystem-reset', 'snapshot-load'] } ## # @StatusInfo: diff --git a/qemu-img.c b/qemu-img.c index ace3adf8ae..a3b64c88af 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -4371,7 +4371,7 @@ static int img_bench(int argc, char **argv) struct timeval t1, t2; int i; bool force_share = false; - size_t buf_size; + size_t buf_size = 0; for (;;) { static const struct option long_options[] = { @@ -4570,7 +4570,7 @@ static int img_bench(int argc, char **argv) data.buf = blk_blockalign(blk, buf_size); memset(data.buf, pattern, data.nrreq * data.bufsize); - blk_register_buf(blk, data.buf, buf_size); + blk_register_buf(blk, data.buf, buf_size, &error_fatal); data.qiov = g_new(QEMUIOVector, data.nrreq); for (i = 0; i < data.nrreq; i++) { @@ -4593,7 +4593,7 @@ static int img_bench(int argc, char **argv) out: if (data.buf) { - blk_unregister_buf(blk, data.buf); + blk_unregister_buf(blk, data.buf, buf_size); } qemu_vfree(data.buf); blk_unref(blk); diff --git a/qga/channel-posix.c b/qga/channel-posix.c index 6796a02cff..0c5175d957 100644 --- a/qga/channel-posix.c +++ b/qga/channel-posix.c @@ -138,7 +138,7 @@ static gboolean ga_channel_open(GAChannel *c, const gchar *path, 0 ); if (fd == -1) { - error_setg_errno(errp, errno, "error opening channel"); + error_setg_errno(errp, errno, "error opening channel '%s'", path); return false; } #ifdef CONFIG_SOLARIS @@ -149,6 +149,25 @@ static gboolean ga_channel_open(GAChannel *c, const gchar *path, return false; } #endif +#ifdef __FreeBSD__ + /* + * In the default state channel sends echo of every command to a + * client. The client programm doesn't expect this and raises an + * error. Suppress echo by resetting ECHO terminal flag. + */ + struct termios tio; + if (tcgetattr(fd, &tio) < 0) { + error_setg_errno(errp, errno, "error getting channel termios attrs"); + close(fd); + return false; + } + tio.c_lflag &= ~ECHO; + if (tcsetattr(fd, TCSAFLUSH, &tio) < 0) { + error_setg_errno(errp, errno, "error setting channel termios attrs"); + close(fd); + return false; + } +#endif /* __FreeBSD__ */ ret = ga_channel_client_add(c, fd); if (ret) { error_setg(errp, "error adding channel to main loop"); @@ -163,7 +182,7 @@ static gboolean ga_channel_open(GAChannel *c, const gchar *path, assert(fd < 0); fd = qga_open_cloexec(path, O_RDWR | O_NOCTTY | O_NONBLOCK, 0); if (fd == -1) { - error_setg_errno(errp, errno, "error opening channel"); + error_setg_errno(errp, errno, "error opening channel '%s'", path); return false; } tcgetattr(fd, &tio); diff --git a/qga/commands-bsd.c b/qga/commands-bsd.c new file mode 100644 index 0000000000..15cade2d4c --- /dev/null +++ b/qga/commands-bsd.c @@ -0,0 +1,200 @@ +/* + * QEMU Guest Agent BSD-specific command implementations + * + * Copyright (c) Virtuozzo International GmbH. + * + * Authors: + * Alexander Ivanov <alexander.ivanov@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qga-qapi-commands.h" +#include "qapi/qmp/qerror.h" +#include "qapi/error.h" +#include "qemu/queue.h" +#include "commands-common.h" +#include <sys/ioctl.h> +#include <sys/param.h> +#include <sys/ucred.h> +#include <sys/mount.h> +#include <net/if_dl.h> +#include <net/ethernet.h> +#include <paths.h> + +#if defined(CONFIG_FSFREEZE) || defined(CONFIG_FSTRIM) +bool build_fs_mount_list(FsMountList *mounts, Error **errp) +{ + FsMount *mount; + struct statfs *mntbuf, *mntp; + struct stat statbuf; + int i, count, ret; + + count = getmntinfo(&mntbuf, MNT_NOWAIT); + if (count == 0) { + error_setg_errno(errp, errno, "getmntinfo failed"); + return false; + } + + for (i = 0; i < count; i++) { + mntp = &mntbuf[i]; + ret = stat(mntp->f_mntonname, &statbuf); + if (ret != 0) { + error_setg_errno(errp, errno, "stat failed on %s", + mntp->f_mntonname); + return false; + } + + mount = g_new0(FsMount, 1); + + mount->dirname = g_strdup(mntp->f_mntonname); + mount->devtype = g_strdup(mntp->f_fstypename); + mount->devmajor = major(mount->dev); + mount->devminor = minor(mount->dev); + mount->fsid = mntp->f_fsid; + mount->dev = statbuf.st_dev; + + QTAILQ_INSERT_TAIL(mounts, mount, next); + } + return true; +} +#endif /* CONFIG_FSFREEZE || CONFIG_FSTRIM */ + +#if defined(CONFIG_FSFREEZE) +static int ufssuspend_fd = -1; +static int ufssuspend_cnt; + +int64_t qmp_guest_fsfreeze_do_freeze_list(bool has_mountpoints, + strList *mountpoints, + FsMountList mounts, + Error **errp) +{ + int ret; + strList *list; + struct FsMount *mount; + + if (ufssuspend_fd != -1) { + error_setg(errp, "filesystems have already frozen"); + return -1; + } + + ufssuspend_cnt = 0; + ufssuspend_fd = qemu_open(_PATH_UFSSUSPEND, O_RDWR, errp); + if (ufssuspend_fd == -1) { + return -1; + } + + QTAILQ_FOREACH_REVERSE(mount, &mounts, next) { + /* + * To issue fsfreeze in the reverse order of mounts, check if the + * mount is listed in the list here + */ + if (has_mountpoints) { + for (list = mountpoints; list; list = list->next) { + if (g_str_equal(list->value, mount->dirname)) { + break; + } + } + if (!list) { + continue; + } + } + + /* Only UFS supports suspend */ + if (!g_str_equal(mount->devtype, "ufs")) { + continue; + } + + ret = ioctl(ufssuspend_fd, UFSSUSPEND, &mount->fsid); + if (ret == -1) { + /* + * ioctl returns EBUSY for all the FS except the first one + * that was suspended + */ + if (errno == EBUSY) { + continue; + } + error_setg_errno(errp, errno, "failed to freeze %s", + mount->dirname); + goto error; + } + ufssuspend_cnt++; + } + return ufssuspend_cnt; +error: + close(ufssuspend_fd); + ufssuspend_fd = -1; + return -1; + +} + +/* + * We don't need to call UFSRESUME ioctl because all the frozen FS + * are thawed on /dev/ufssuspend closing. + */ +int qmp_guest_fsfreeze_do_thaw(Error **errp) +{ + int ret = ufssuspend_cnt; + ufssuspend_cnt = 0; + if (ufssuspend_fd != -1) { + close(ufssuspend_fd); + ufssuspend_fd = -1; + } + return ret; +} + +GuestFilesystemInfoList *qmp_guest_get_fsinfo(Error **errp) +{ + error_setg(errp, QERR_UNSUPPORTED); + return NULL; +} + +GuestDiskInfoList *qmp_guest_get_disks(Error **errp) +{ + error_setg(errp, QERR_UNSUPPORTED); + return NULL; +} + +GuestDiskStatsInfoList *qmp_guest_get_diskstats(Error **errp) +{ + error_setg(errp, QERR_UNSUPPORTED); + return NULL; +} + +GuestCpuStatsList *qmp_guest_get_cpustats(Error **errp) +{ + error_setg(errp, QERR_UNSUPPORTED); + return NULL; +} +#endif /* CONFIG_FSFREEZE */ + +#ifdef HAVE_GETIFADDRS +/* + * Fill "buf" with MAC address by ifaddrs. Pointer buf must point to a + * buffer with ETHER_ADDR_LEN length at least. + * + * Returns false in case of an error, otherwise true. "obtained" arguument + * is true if a MAC address was obtained successful, otherwise false. + */ +bool guest_get_hw_addr(struct ifaddrs *ifa, unsigned char *buf, + bool *obtained, Error **errp) +{ + struct sockaddr_dl *sdp; + + *obtained = false; + + if (ifa->ifa_addr->sa_family != AF_LINK) { + /* We can get HW address only for AF_LINK family. */ + g_debug("failed to get MAC address of %s", ifa->ifa_name); + return true; + } + + sdp = (struct sockaddr_dl *)ifa->ifa_addr; + memcpy(buf, sdp->sdl_data + sdp->sdl_nlen, ETHER_ADDR_LEN); + *obtained = true; + + return true; +} +#endif /* HAVE_GETIFADDRS */ diff --git a/qga/commands-common.h b/qga/commands-common.h index d0e4a9696f..8c1c56aac9 100644 --- a/qga/commands-common.h +++ b/qga/commands-common.h @@ -10,6 +10,57 @@ #define QGA_COMMANDS_COMMON_H #include "qga-qapi-types.h" +#include "guest-agent-core.h" +#include "qemu/queue.h" + +#if defined(__linux__) +#include <linux/fs.h> +#ifdef FIFREEZE +#define CONFIG_FSFREEZE +#endif +#ifdef FITRIM +#define CONFIG_FSTRIM +#endif +#endif /* __linux__ */ + +#ifdef __FreeBSD__ +#include <ufs/ffs/fs.h> +#ifdef UFSSUSPEND +#define CONFIG_FSFREEZE +#endif +#endif /* __FreeBSD__ */ + +#if defined(CONFIG_FSFREEZE) || defined(CONFIG_FSTRIM) +typedef struct FsMount { + char *dirname; + char *devtype; + unsigned int devmajor, devminor; +#if defined(__FreeBSD__) + dev_t dev; + fsid_t fsid; +#endif + QTAILQ_ENTRY(FsMount) next; +} FsMount; + +typedef QTAILQ_HEAD(FsMountList, FsMount) FsMountList; + +bool build_fs_mount_list(FsMountList *mounts, Error **errp); +void free_fs_mount_list(FsMountList *mounts); +#endif /* CONFIG_FSFREEZE || CONFIG_FSTRIM */ + +#if defined(CONFIG_FSFREEZE) +int64_t qmp_guest_fsfreeze_do_freeze_list(bool has_mountpoints, + strList *mountpoints, + FsMountList mounts, + Error **errp); +int qmp_guest_fsfreeze_do_thaw(Error **errp); +#endif /* CONFIG_FSFREEZE */ + +#ifdef HAVE_GETIFADDRS +#include <ifaddrs.h> +bool guest_get_hw_addr(struct ifaddrs *ifa, unsigned char *buf, + bool *obtained, Error **errp); +#endif typedef struct GuestFileHandle GuestFileHandle; diff --git a/qga/commands-linux.c b/qga/commands-linux.c new file mode 100644 index 0000000000..214e408fcd --- /dev/null +++ b/qga/commands-linux.c @@ -0,0 +1,286 @@ +/* + * QEMU Guest Agent Linux-specific command implementations + * + * Copyright IBM Corp. 2011 + * + * Authors: + * Michael Roth <mdroth@linux.vnet.ibm.com> + * Michal Privoznik <mprivozn@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "commands-common.h" +#include "cutils.h" +#include <mntent.h> +#include <sys/ioctl.h> + +#if defined(CONFIG_FSFREEZE) || defined(CONFIG_FSTRIM) +static int dev_major_minor(const char *devpath, + unsigned int *devmajor, unsigned int *devminor) +{ + struct stat st; + + *devmajor = 0; + *devminor = 0; + + if (stat(devpath, &st) < 0) { + slog("failed to stat device file '%s': %s", devpath, strerror(errno)); + return -1; + } + if (S_ISDIR(st.st_mode)) { + /* It is bind mount */ + return -2; + } + if (S_ISBLK(st.st_mode)) { + *devmajor = major(st.st_rdev); + *devminor = minor(st.st_rdev); + return 0; + } + return -1; +} + +static bool build_fs_mount_list_from_mtab(FsMountList *mounts, Error **errp) +{ + struct mntent *ment; + FsMount *mount; + char const *mtab = "/proc/self/mounts"; + FILE *fp; + unsigned int devmajor, devminor; + + fp = setmntent(mtab, "r"); + if (!fp) { + error_setg(errp, "failed to open mtab file: '%s'", mtab); + return false; + } + + while ((ment = getmntent(fp))) { + /* + * An entry which device name doesn't start with a '/' is + * either a dummy file system or a network file system. + * Add special handling for smbfs and cifs as is done by + * coreutils as well. + */ + if ((ment->mnt_fsname[0] != '/') || + (strcmp(ment->mnt_type, "smbfs") == 0) || + (strcmp(ment->mnt_type, "cifs") == 0)) { + continue; + } + if (dev_major_minor(ment->mnt_fsname, &devmajor, &devminor) == -2) { + /* Skip bind mounts */ + continue; + } + + mount = g_new0(FsMount, 1); + mount->dirname = g_strdup(ment->mnt_dir); + mount->devtype = g_strdup(ment->mnt_type); + mount->devmajor = devmajor; + mount->devminor = devminor; + + QTAILQ_INSERT_TAIL(mounts, mount, next); + } + + endmntent(fp); + return true; +} + +static void decode_mntname(char *name, int len) +{ + int i, j = 0; + for (i = 0; i <= len; i++) { + if (name[i] != '\\') { + name[j++] = name[i]; + } else if (name[i + 1] == '\\') { + name[j++] = '\\'; + i++; + } else if (name[i + 1] >= '0' && name[i + 1] <= '3' && + name[i + 2] >= '0' && name[i + 2] <= '7' && + name[i + 3] >= '0' && name[i + 3] <= '7') { + name[j++] = (name[i + 1] - '0') * 64 + + (name[i + 2] - '0') * 8 + + (name[i + 3] - '0'); + i += 3; + } else { + name[j++] = name[i]; + } + } +} + +/* + * Walk the mount table and build a list of local file systems + */ +bool build_fs_mount_list(FsMountList *mounts, Error **errp) +{ + FsMount *mount; + char const *mountinfo = "/proc/self/mountinfo"; + FILE *fp; + char *line = NULL, *dash; + size_t n; + char check; + unsigned int devmajor, devminor; + int ret, dir_s, dir_e, type_s, type_e, dev_s, dev_e; + + fp = fopen(mountinfo, "r"); + if (!fp) { + return build_fs_mount_list_from_mtab(mounts, errp); + } + + while (getline(&line, &n, fp) != -1) { + ret = sscanf(line, "%*u %*u %u:%u %*s %n%*s%n%c", + &devmajor, &devminor, &dir_s, &dir_e, &check); + if (ret < 3) { + continue; + } + dash = strstr(line + dir_e, " - "); + if (!dash) { + continue; + } + ret = sscanf(dash, " - %n%*s%n %n%*s%n%c", + &type_s, &type_e, &dev_s, &dev_e, &check); + if (ret < 1) { + continue; + } + line[dir_e] = 0; + dash[type_e] = 0; + dash[dev_e] = 0; + decode_mntname(line + dir_s, dir_e - dir_s); + decode_mntname(dash + dev_s, dev_e - dev_s); + if (devmajor == 0) { + /* btrfs reports major number = 0 */ + if (strcmp("btrfs", dash + type_s) != 0 || + dev_major_minor(dash + dev_s, &devmajor, &devminor) < 0) { + continue; + } + } + + mount = g_new0(FsMount, 1); + mount->dirname = g_strdup(line + dir_s); + mount->devtype = g_strdup(dash + type_s); + mount->devmajor = devmajor; + mount->devminor = devminor; + + QTAILQ_INSERT_TAIL(mounts, mount, next); + } + free(line); + + fclose(fp); + return true; +} +#endif /* CONFIG_FSFREEZE || CONFIG_FSTRIM */ + +#ifdef CONFIG_FSFREEZE +/* + * Walk list of mounted file systems in the guest, and freeze the ones which + * are real local file systems. + */ +int64_t qmp_guest_fsfreeze_do_freeze_list(bool has_mountpoints, + strList *mountpoints, + FsMountList mounts, + Error **errp) +{ + struct FsMount *mount; + strList *list; + int fd, ret, i = 0; + + QTAILQ_FOREACH_REVERSE(mount, &mounts, next) { + /* To issue fsfreeze in the reverse order of mounts, check if the + * mount is listed in the list here */ + if (has_mountpoints) { + for (list = mountpoints; list; list = list->next) { + if (strcmp(list->value, mount->dirname) == 0) { + break; + } + } + if (!list) { + continue; + } + } + + fd = qga_open_cloexec(mount->dirname, O_RDONLY, 0); + if (fd == -1) { + error_setg_errno(errp, errno, "failed to open %s", mount->dirname); + return -1; + } + + /* we try to cull filesystems we know won't work in advance, but other + * filesystems may not implement fsfreeze for less obvious reasons. + * these will report EOPNOTSUPP. we simply ignore these when tallying + * the number of frozen filesystems. + * if a filesystem is mounted more than once (aka bind mount) a + * consecutive attempt to freeze an already frozen filesystem will + * return EBUSY. + * + * any other error means a failure to freeze a filesystem we + * expect to be freezable, so return an error in those cases + * and return system to thawed state. + */ + ret = ioctl(fd, FIFREEZE); + if (ret == -1) { + if (errno != EOPNOTSUPP && errno != EBUSY) { + error_setg_errno(errp, errno, "failed to freeze %s", + mount->dirname); + close(fd); + return -1; + } + } else { + i++; + } + close(fd); + } + return i; +} + +int qmp_guest_fsfreeze_do_thaw(Error **errp) +{ + int ret; + FsMountList mounts; + FsMount *mount; + int fd, i = 0, logged; + Error *local_err = NULL; + + QTAILQ_INIT(&mounts); + if (!build_fs_mount_list(&mounts, &local_err)) { + error_propagate(errp, local_err); + return -1; + } + + QTAILQ_FOREACH(mount, &mounts, next) { + logged = false; + fd = qga_open_cloexec(mount->dirname, O_RDONLY, 0); + if (fd == -1) { + continue; + } + /* we have no way of knowing whether a filesystem was actually unfrozen + * as a result of a successful call to FITHAW, only that if an error + * was returned the filesystem was *not* unfrozen by that particular + * call. + * + * since multiple preceding FIFREEZEs require multiple calls to FITHAW + * to unfreeze, continuing issuing FITHAW until an error is returned, + * in which case either the filesystem is in an unfreezable state, or, + * more likely, it was thawed previously (and remains so afterward). + * + * also, since the most recent successful call is the one that did + * the actual unfreeze, we can use this to provide an accurate count + * of the number of filesystems unfrozen by guest-fsfreeze-thaw, which + * may * be useful for determining whether a filesystem was unfrozen + * during the freeze/thaw phase by a process other than qemu-ga. + */ + do { + ret = ioctl(fd, FITHAW); + if (ret == 0 && !logged) { + i++; + logged = true; + } + } while (ret == 0); + close(fd); + } + + free_fs_mount_list(&mounts); + + return i; +} +#endif /* CONFIG_FSFREEZE */ diff --git a/qga/commands-posix.c b/qga/commands-posix.c index eea819cff0..32493d6383 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -16,11 +16,9 @@ #include <sys/utsname.h> #include <sys/wait.h> #include <dirent.h> -#include "guest-agent-core.h" #include "qga-qapi-commands.h" #include "qapi/error.h" #include "qapi/qmp/qerror.h" -#include "qemu/queue.h" #include "qemu/host-utils.h" #include "qemu/sockets.h" #include "qemu/base64.h" @@ -35,28 +33,20 @@ #if defined(__linux__) #include <mntent.h> -#include <linux/fs.h> #include <sys/statvfs.h> #include <linux/nvme_ioctl.h> #ifdef CONFIG_LIBUDEV #include <libudev.h> #endif - -#ifdef FIFREEZE -#define CONFIG_FSFREEZE -#endif -#ifdef FITRIM -#define CONFIG_FSTRIM -#endif #endif #ifdef HAVE_GETIFADDRS #include <arpa/inet.h> #include <sys/socket.h> #include <net/if.h> +#include <net/ethernet.h> #include <sys/types.h> -#include <ifaddrs.h> #ifdef CONFIG_SOLARIS #include <sys/sockio.h> #endif @@ -92,6 +82,10 @@ void qmp_guest_shutdown(bool has_mode, const char *mode, Error **errp) const char *powerdown_flag = "-i5"; const char *halt_flag = "-i0"; const char *reboot_flag = "-i6"; +#elif defined(CONFIG_BSD) + const char *powerdown_flag = "-p"; + const char *halt_flag = "-h"; + const char *reboot_flag = "-r"; #else const char *powerdown_flag = "-P"; const char *halt_flag = "-H"; @@ -122,6 +116,9 @@ void qmp_guest_shutdown(bool has_mode, const char *mode, Error **errp) #ifdef CONFIG_SOLARIS execl("/sbin/shutdown", "shutdown", shutdown_flag, "-g0", "-y", "hypervisor initiated shutdown", (char *)NULL); +#elif defined(CONFIG_BSD) + execl("/sbin/shutdown", "shutdown", shutdown_flag, "+0", + "hypervisor initiated shutdown", (char *)NULL); #else execl("/sbin/shutdown", "shutdown", "-h", shutdown_flag, "+0", "hypervisor initiated shutdown", (char *)NULL); @@ -617,20 +614,8 @@ void qmp_guest_file_flush(int64_t handle, Error **errp) } } -/* linux-specific implementations. avoid this if at all possible. */ -#if defined(__linux__) - #if defined(CONFIG_FSFREEZE) || defined(CONFIG_FSTRIM) -typedef struct FsMount { - char *dirname; - char *devtype; - unsigned int devmajor, devminor; - QTAILQ_ENTRY(FsMount) next; -} FsMount; - -typedef QTAILQ_HEAD(FsMountList, FsMount) FsMountList; - -static void free_fs_mount_list(FsMountList *mounts) +void free_fs_mount_list(FsMountList *mounts) { FsMount *mount, *temp; @@ -645,159 +630,158 @@ static void free_fs_mount_list(FsMountList *mounts) g_free(mount); } } +#endif + +#if defined(CONFIG_FSFREEZE) +typedef enum { + FSFREEZE_HOOK_THAW = 0, + FSFREEZE_HOOK_FREEZE, +} FsfreezeHookArg; + +static const char *fsfreeze_hook_arg_string[] = { + "thaw", + "freeze", +}; -static int dev_major_minor(const char *devpath, - unsigned int *devmajor, unsigned int *devminor) +static void execute_fsfreeze_hook(FsfreezeHookArg arg, Error **errp) { - struct stat st; + int status; + pid_t pid; + const char *hook; + const char *arg_str = fsfreeze_hook_arg_string[arg]; + Error *local_err = NULL; + + hook = ga_fsfreeze_hook(ga_state); + if (!hook) { + return; + } + if (access(hook, X_OK) != 0) { + error_setg_errno(errp, errno, "can't access fsfreeze hook '%s'", hook); + return; + } - *devmajor = 0; - *devminor = 0; + slog("executing fsfreeze hook with arg '%s'", arg_str); + pid = fork(); + if (pid == 0) { + setsid(); + reopen_fd_to_null(0); + reopen_fd_to_null(1); + reopen_fd_to_null(2); - if (stat(devpath, &st) < 0) { - slog("failed to stat device file '%s': %s", devpath, strerror(errno)); - return -1; + execl(hook, hook, arg_str, NULL); + _exit(EXIT_FAILURE); + } else if (pid < 0) { + error_setg_errno(errp, errno, "failed to create child process"); + return; } - if (S_ISDIR(st.st_mode)) { - /* It is bind mount */ - return -2; + + ga_wait_child(pid, &status, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; } - if (S_ISBLK(st.st_mode)) { - *devmajor = major(st.st_rdev); - *devminor = minor(st.st_rdev); - return 0; + + if (!WIFEXITED(status)) { + error_setg(errp, "fsfreeze hook has terminated abnormally"); + return; + } + + status = WEXITSTATUS(status); + if (status) { + error_setg(errp, "fsfreeze hook has failed with status %d", status); + return; } - return -1; } /* - * Walk the mount table and build a list of local file systems + * Return status of freeze/thaw */ -static bool build_fs_mount_list_from_mtab(FsMountList *mounts, Error **errp) +GuestFsfreezeStatus qmp_guest_fsfreeze_status(Error **errp) { - struct mntent *ment; - FsMount *mount; - char const *mtab = "/proc/self/mounts"; - FILE *fp; - unsigned int devmajor, devminor; - - fp = setmntent(mtab, "r"); - if (!fp) { - error_setg(errp, "failed to open mtab file: '%s'", mtab); - return false; + if (ga_is_frozen(ga_state)) { + return GUEST_FSFREEZE_STATUS_FROZEN; } - while ((ment = getmntent(fp))) { - /* - * An entry which device name doesn't start with a '/' is - * either a dummy file system or a network file system. - * Add special handling for smbfs and cifs as is done by - * coreutils as well. - */ - if ((ment->mnt_fsname[0] != '/') || - (strcmp(ment->mnt_type, "smbfs") == 0) || - (strcmp(ment->mnt_type, "cifs") == 0)) { - continue; - } - if (dev_major_minor(ment->mnt_fsname, &devmajor, &devminor) == -2) { - /* Skip bind mounts */ - continue; - } + return GUEST_FSFREEZE_STATUS_THAWED; +} + +int64_t qmp_guest_fsfreeze_freeze(Error **errp) +{ + return qmp_guest_fsfreeze_freeze_list(false, NULL, errp); +} + +int64_t qmp_guest_fsfreeze_freeze_list(bool has_mountpoints, + strList *mountpoints, + Error **errp) +{ + int ret; + FsMountList mounts; + Error *local_err = NULL; - mount = g_new0(FsMount, 1); - mount->dirname = g_strdup(ment->mnt_dir); - mount->devtype = g_strdup(ment->mnt_type); - mount->devmajor = devmajor; - mount->devminor = devminor; + slog("guest-fsfreeze called"); - QTAILQ_INSERT_TAIL(mounts, mount, next); + execute_fsfreeze_hook(FSFREEZE_HOOK_FREEZE, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return -1; } - endmntent(fp); - return true; -} + QTAILQ_INIT(&mounts); + if (!build_fs_mount_list(&mounts, &local_err)) { + error_propagate(errp, local_err); + return -1; + } -static void decode_mntname(char *name, int len) -{ - int i, j = 0; - for (i = 0; i <= len; i++) { - if (name[i] != '\\') { - name[j++] = name[i]; - } else if (name[i + 1] == '\\') { - name[j++] = '\\'; - i++; - } else if (name[i + 1] >= '0' && name[i + 1] <= '3' && - name[i + 2] >= '0' && name[i + 2] <= '7' && - name[i + 3] >= '0' && name[i + 3] <= '7') { - name[j++] = (name[i + 1] - '0') * 64 + - (name[i + 2] - '0') * 8 + - (name[i + 3] - '0'); - i += 3; - } else { - name[j++] = name[i]; - } + /* cannot risk guest agent blocking itself on a write in this state */ + ga_set_frozen(ga_state); + + ret = qmp_guest_fsfreeze_do_freeze_list(has_mountpoints, mountpoints, + mounts, errp); + + free_fs_mount_list(&mounts); + /* We may not issue any FIFREEZE here. + * Just unset ga_state here and ready for the next call. + */ + if (ret == 0) { + ga_unset_frozen(ga_state); + } else if (ret < 0) { + qmp_guest_fsfreeze_thaw(NULL); } + return ret; } -static bool build_fs_mount_list(FsMountList *mounts, Error **errp) +int64_t qmp_guest_fsfreeze_thaw(Error **errp) { - FsMount *mount; - char const *mountinfo = "/proc/self/mountinfo"; - FILE *fp; - char *line = NULL, *dash; - size_t n; - char check; - unsigned int devmajor, devminor; - int ret, dir_s, dir_e, type_s, type_e, dev_s, dev_e; + int ret; - fp = fopen(mountinfo, "r"); - if (!fp) { - return build_fs_mount_list_from_mtab(mounts, errp); + ret = qmp_guest_fsfreeze_do_thaw(errp); + if (ret >= 0) { + ga_unset_frozen(ga_state); + execute_fsfreeze_hook(FSFREEZE_HOOK_THAW, errp); + } else { + ret = 0; } - while (getline(&line, &n, fp) != -1) { - ret = sscanf(line, "%*u %*u %u:%u %*s %n%*s%n%c", - &devmajor, &devminor, &dir_s, &dir_e, &check); - if (ret < 3) { - continue; - } - dash = strstr(line + dir_e, " - "); - if (!dash) { - continue; - } - ret = sscanf(dash, " - %n%*s%n %n%*s%n%c", - &type_s, &type_e, &dev_s, &dev_e, &check); - if (ret < 1) { - continue; - } - line[dir_e] = 0; - dash[type_e] = 0; - dash[dev_e] = 0; - decode_mntname(line + dir_s, dir_e - dir_s); - decode_mntname(dash + dev_s, dev_e - dev_s); - if (devmajor == 0) { - /* btrfs reports major number = 0 */ - if (strcmp("btrfs", dash + type_s) != 0 || - dev_major_minor(dash + dev_s, &devmajor, &devminor) < 0) { - continue; - } - } + return ret; +} - mount = g_new0(FsMount, 1); - mount->dirname = g_strdup(line + dir_s); - mount->devtype = g_strdup(dash + type_s); - mount->devmajor = devmajor; - mount->devminor = devminor; +static void guest_fsfreeze_cleanup(void) +{ + Error *err = NULL; - QTAILQ_INSERT_TAIL(mounts, mount, next); + if (ga_is_frozen(ga_state) == GUEST_FSFREEZE_STATUS_FROZEN) { + qmp_guest_fsfreeze_thaw(&err); + if (err) { + slog("failed to clean up frozen filesystems: %s", + error_get_pretty(err)); + error_free(err); + } } - free(line); - - fclose(fp); - return true; } #endif +/* linux-specific implementations. avoid this if at all possible. */ +#if defined(__linux__) #if defined(CONFIG_FSFREEZE) static char *get_pci_driver(char const *syspath, int pathlen, Error **errp) @@ -1621,248 +1605,6 @@ GuestFilesystemInfoList *qmp_guest_get_fsinfo(Error **errp) free_fs_mount_list(&mounts); return ret; } - - -typedef enum { - FSFREEZE_HOOK_THAW = 0, - FSFREEZE_HOOK_FREEZE, -} FsfreezeHookArg; - -static const char *fsfreeze_hook_arg_string[] = { - "thaw", - "freeze", -}; - -static void execute_fsfreeze_hook(FsfreezeHookArg arg, Error **errp) -{ - int status; - pid_t pid; - const char *hook; - const char *arg_str = fsfreeze_hook_arg_string[arg]; - Error *local_err = NULL; - - hook = ga_fsfreeze_hook(ga_state); - if (!hook) { - return; - } - if (access(hook, X_OK) != 0) { - error_setg_errno(errp, errno, "can't access fsfreeze hook '%s'", hook); - return; - } - - slog("executing fsfreeze hook with arg '%s'", arg_str); - pid = fork(); - if (pid == 0) { - setsid(); - reopen_fd_to_null(0); - reopen_fd_to_null(1); - reopen_fd_to_null(2); - - execl(hook, hook, arg_str, NULL); - _exit(EXIT_FAILURE); - } else if (pid < 0) { - error_setg_errno(errp, errno, "failed to create child process"); - return; - } - - ga_wait_child(pid, &status, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - - if (!WIFEXITED(status)) { - error_setg(errp, "fsfreeze hook has terminated abnormally"); - return; - } - - status = WEXITSTATUS(status); - if (status) { - error_setg(errp, "fsfreeze hook has failed with status %d", status); - return; - } -} - -/* - * Return status of freeze/thaw - */ -GuestFsfreezeStatus qmp_guest_fsfreeze_status(Error **errp) -{ - if (ga_is_frozen(ga_state)) { - return GUEST_FSFREEZE_STATUS_FROZEN; - } - - return GUEST_FSFREEZE_STATUS_THAWED; -} - -int64_t qmp_guest_fsfreeze_freeze(Error **errp) -{ - return qmp_guest_fsfreeze_freeze_list(false, NULL, errp); -} - -/* - * Walk list of mounted file systems in the guest, and freeze the ones which - * are real local file systems. - */ -int64_t qmp_guest_fsfreeze_freeze_list(bool has_mountpoints, - strList *mountpoints, - Error **errp) -{ - int ret = 0, i = 0; - strList *list; - FsMountList mounts; - struct FsMount *mount; - Error *local_err = NULL; - int fd; - - slog("guest-fsfreeze called"); - - execute_fsfreeze_hook(FSFREEZE_HOOK_FREEZE, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return -1; - } - - QTAILQ_INIT(&mounts); - if (!build_fs_mount_list(&mounts, &local_err)) { - error_propagate(errp, local_err); - return -1; - } - - /* cannot risk guest agent blocking itself on a write in this state */ - ga_set_frozen(ga_state); - - QTAILQ_FOREACH_REVERSE(mount, &mounts, next) { - /* To issue fsfreeze in the reverse order of mounts, check if the - * mount is listed in the list here */ - if (has_mountpoints) { - for (list = mountpoints; list; list = list->next) { - if (strcmp(list->value, mount->dirname) == 0) { - break; - } - } - if (!list) { - continue; - } - } - - fd = qga_open_cloexec(mount->dirname, O_RDONLY, 0); - if (fd == -1) { - error_setg_errno(errp, errno, "failed to open %s", mount->dirname); - goto error; - } - - /* we try to cull filesystems we know won't work in advance, but other - * filesystems may not implement fsfreeze for less obvious reasons. - * these will report EOPNOTSUPP. we simply ignore these when tallying - * the number of frozen filesystems. - * if a filesystem is mounted more than once (aka bind mount) a - * consecutive attempt to freeze an already frozen filesystem will - * return EBUSY. - * - * any other error means a failure to freeze a filesystem we - * expect to be freezable, so return an error in those cases - * and return system to thawed state. - */ - ret = ioctl(fd, FIFREEZE); - if (ret == -1) { - if (errno != EOPNOTSUPP && errno != EBUSY) { - error_setg_errno(errp, errno, "failed to freeze %s", - mount->dirname); - close(fd); - goto error; - } - } else { - i++; - } - close(fd); - } - - free_fs_mount_list(&mounts); - /* We may not issue any FIFREEZE here. - * Just unset ga_state here and ready for the next call. - */ - if (i == 0) { - ga_unset_frozen(ga_state); - } - return i; - -error: - free_fs_mount_list(&mounts); - qmp_guest_fsfreeze_thaw(NULL); - return 0; -} - -/* - * Walk list of frozen file systems in the guest, and thaw them. - */ -int64_t qmp_guest_fsfreeze_thaw(Error **errp) -{ - int ret; - FsMountList mounts; - FsMount *mount; - int fd, i = 0, logged; - Error *local_err = NULL; - - QTAILQ_INIT(&mounts); - if (!build_fs_mount_list(&mounts, &local_err)) { - error_propagate(errp, local_err); - return 0; - } - - QTAILQ_FOREACH(mount, &mounts, next) { - logged = false; - fd = qga_open_cloexec(mount->dirname, O_RDONLY, 0); - if (fd == -1) { - continue; - } - /* we have no way of knowing whether a filesystem was actually unfrozen - * as a result of a successful call to FITHAW, only that if an error - * was returned the filesystem was *not* unfrozen by that particular - * call. - * - * since multiple preceding FIFREEZEs require multiple calls to FITHAW - * to unfreeze, continuing issuing FITHAW until an error is returned, - * in which case either the filesystem is in an unfreezable state, or, - * more likely, it was thawed previously (and remains so afterward). - * - * also, since the most recent successful call is the one that did - * the actual unfreeze, we can use this to provide an accurate count - * of the number of filesystems unfrozen by guest-fsfreeze-thaw, which - * may * be useful for determining whether a filesystem was unfrozen - * during the freeze/thaw phase by a process other than qemu-ga. - */ - do { - ret = ioctl(fd, FITHAW); - if (ret == 0 && !logged) { - i++; - logged = true; - } - } while (ret == 0); - close(fd); - } - - ga_unset_frozen(ga_state); - free_fs_mount_list(&mounts); - - execute_fsfreeze_hook(FSFREEZE_HOOK_THAW, errp); - - return i; -} - -static void guest_fsfreeze_cleanup(void) -{ - Error *err = NULL; - - if (ga_is_frozen(ga_state) == GUEST_FSFREEZE_STATUS_FROZEN) { - qmp_guest_fsfreeze_thaw(&err); - if (err) { - slog("failed to clean up frozen filesystems: %s", - error_get_pretty(err)); - error_free(err); - } - } -} #endif /* CONFIG_FSFREEZE */ #if defined(CONFIG_FSTRIM) @@ -2372,7 +2114,9 @@ int64_t qmp_guest_set_vcpus(GuestLogicalProcessorList *vcpus, Error **errp) return processed; } +#endif /* __linux__ */ +#if defined(__linux__) || defined(__FreeBSD__) void qmp_guest_set_user_password(const char *username, const char *password, bool crypted, @@ -2406,10 +2150,15 @@ void qmp_guest_set_user_password(const char *username, goto out; } +#ifdef __FreeBSD__ + chpasswddata = g_strdup(rawpasswddata); + passwd_path = g_find_program_in_path("pw"); +#else chpasswddata = g_strdup_printf("%s:%s\n", username, rawpasswddata); - chpasswdlen = strlen(chpasswddata); - passwd_path = g_find_program_in_path("chpasswd"); +#endif + + chpasswdlen = strlen(chpasswddata); if (!passwd_path) { error_setg(errp, "cannot find 'passwd' program in PATH"); @@ -2430,11 +2179,17 @@ void qmp_guest_set_user_password(const char *username, reopen_fd_to_null(1); reopen_fd_to_null(2); +#ifdef __FreeBSD__ + const char *h_arg; + h_arg = (crypted) ? "-H" : "-h"; + execl(passwd_path, "pw", "usermod", "-n", username, h_arg, "0", NULL); +#else if (crypted) { execl(passwd_path, "chpasswd", "-e", NULL); } else { execl(passwd_path, "chpasswd", NULL); } +#endif _exit(EXIT_FAILURE); } else if (pid < 0) { error_setg_errno(errp, errno, "failed to create child process"); @@ -2477,7 +2232,17 @@ out: close(datafd[1]); } } +#else /* __linux__ || __FreeBSD__ */ +void qmp_guest_set_user_password(const char *username, + const char *password, + bool crypted, + Error **errp) +{ + error_setg(errp, QERR_UNSUPPORTED); +} +#endif /* __linux__ || __FreeBSD__ */ +#ifdef __linux__ static void ga_read_sysfs_file(int dirfd, const char *pathname, char *buf, int size, Error **errp) { @@ -3014,14 +2779,6 @@ int64_t qmp_guest_set_vcpus(GuestLogicalProcessorList *vcpus, Error **errp) return -1; } -void qmp_guest_set_user_password(const char *username, - const char *password, - bool crypted, - Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); -} - GuestMemoryBlockList *qmp_guest_get_memory_blocks(Error **errp) { error_setg(errp, QERR_UNSUPPORTED); @@ -3124,6 +2881,57 @@ static int guest_get_network_stats(const char *name, return -1; } +#ifndef __FreeBSD__ +/* + * Fill "buf" with MAC address by ifaddrs. Pointer buf must point to a + * buffer with ETHER_ADDR_LEN length at least. + * + * Returns false in case of an error, otherwise true. "obtained" argument + * is true if a MAC address was obtained successful, otherwise false. + */ +bool guest_get_hw_addr(struct ifaddrs *ifa, unsigned char *buf, + bool *obtained, Error **errp) +{ + struct ifreq ifr; + int sock; + + *obtained = false; + + /* we haven't obtained HW address yet */ + sock = socket(PF_INET, SOCK_STREAM, 0); + if (sock == -1) { + error_setg_errno(errp, errno, "failed to create socket"); + return false; + } + + memset(&ifr, 0, sizeof(ifr)); + pstrcpy(ifr.ifr_name, IF_NAMESIZE, ifa->ifa_name); + if (ioctl(sock, SIOCGIFHWADDR, &ifr) == -1) { + /* + * We can't get the hw addr of this interface, but that's not a + * fatal error. + */ + if (errno == EADDRNOTAVAIL) { + /* The interface doesn't have a hw addr (e.g. loopback). */ + g_debug("failed to get MAC address of %s: %s", + ifa->ifa_name, strerror(errno)); + } else{ + g_warning("failed to get MAC address of %s: %s", + ifa->ifa_name, strerror(errno)); + } + } else { +#ifdef CONFIG_SOLARIS + memcpy(buf, &ifr.ifr_addr.sa_data, ETHER_ADDR_LEN); +#else + memcpy(buf, &ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); +#endif + *obtained = true; + } + close(sock); + return true; +} +#endif /* __FreeBSD__ */ + /* * Build information about guest interfaces */ @@ -3144,9 +2952,8 @@ GuestNetworkInterfaceList *qmp_guest_network_get_interfaces(Error **errp) GuestNetworkInterfaceStat *interface_stat = NULL; char addr4[INET_ADDRSTRLEN]; char addr6[INET6_ADDRSTRLEN]; - int sock; - struct ifreq ifr; - unsigned char *mac_addr; + unsigned char mac_addr[ETHER_ADDR_LEN]; + bool obtained; void *p; g_debug("Processing %s interface", ifa->ifa_name); @@ -3161,45 +2968,17 @@ GuestNetworkInterfaceList *qmp_guest_network_get_interfaces(Error **errp) } if (!info->has_hardware_address) { - /* we haven't obtained HW address yet */ - sock = socket(PF_INET, SOCK_STREAM, 0); - if (sock == -1) { - error_setg_errno(errp, errno, "failed to create socket"); + if (!guest_get_hw_addr(ifa, mac_addr, &obtained, errp)) { goto error; } - - memset(&ifr, 0, sizeof(ifr)); - pstrcpy(ifr.ifr_name, IF_NAMESIZE, info->name); - if (ioctl(sock, SIOCGIFHWADDR, &ifr) == -1) { - /* - * We can't get the hw addr of this interface, but that's not a - * fatal error. Don't set info->hardware_address, but keep - * going. - */ - if (errno == EADDRNOTAVAIL) { - /* The interface doesn't have a hw addr (e.g. loopback). */ - g_debug("failed to get MAC address of %s: %s", - ifa->ifa_name, strerror(errno)); - } else{ - g_warning("failed to get MAC address of %s: %s", - ifa->ifa_name, strerror(errno)); - } - - } else { -#ifdef CONFIG_SOLARIS - mac_addr = (unsigned char *) &ifr.ifr_addr.sa_data; -#else - mac_addr = (unsigned char *) &ifr.ifr_hwaddr.sa_data; -#endif + if (obtained) { info->hardware_address = g_strdup_printf("%02x:%02x:%02x:%02x:%02x:%02x", (int) mac_addr[0], (int) mac_addr[1], (int) mac_addr[2], (int) mac_addr[3], (int) mac_addr[4], (int) mac_addr[5]); - info->has_hardware_address = true; } - close(sock); } if (ifa->ifa_addr && diff --git a/qga/main.c b/qga/main.c index 5a9d8252e0..b3580508fa 100644 --- a/qga/main.c +++ b/qga/main.c @@ -37,17 +37,16 @@ #include "qga/service-win32.h" #include "qga/vss-win32.h" #endif -#ifdef __linux__ -#include <linux/fs.h> -#ifdef FIFREEZE -#define CONFIG_FSFREEZE -#endif -#endif +#include "commands-common.h" #ifndef _WIN32 +#ifdef __FreeBSD__ +#define QGA_VIRTIO_PATH_DEFAULT "/dev/vtcon/org.qemu.guest_agent.0" +#else /* __FreeBSD__ */ #define QGA_VIRTIO_PATH_DEFAULT "/dev/virtio-ports/org.qemu.guest_agent.0" -#define QGA_STATE_RELATIVE_DIR "run" +#endif /* __FreeBSD__ */ #define QGA_SERIAL_PATH_DEFAULT "/dev/ttyS0" +#define QGA_STATE_RELATIVE_DIR "run" #else #define QGA_VIRTIO_PATH_DEFAULT "\\\\.\\Global\\org.qemu.guest_agent.0" #define QGA_STATE_RELATIVE_DIR "qemu-ga" diff --git a/qga/meson.build b/qga/meson.build index a0ffd6d268..3cfb9166e5 100644 --- a/qga/meson.build +++ b/qga/meson.build @@ -72,6 +72,12 @@ qga_ss.add(when: 'CONFIG_POSIX', if_true: files( 'commands-posix.c', 'commands-posix-ssh.c', )) +qga_ss.add(when: 'CONFIG_LINUX', if_true: files( + 'commands-linux.c', +)) +qga_ss.add(when: 'CONFIG_BSD', if_true: files( + 'commands-bsd.c', +)) qga_ss.add(when: 'CONFIG_WIN32', if_true: files( 'channel-win32.c', 'commands-win32.c', diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index eb3267bef5..2cb0de5601 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -67,6 +67,7 @@ meson_options_help() { printf "%s\n" ' auth-pam PAM access control' printf "%s\n" ' avx2 AVX2 optimizations' printf "%s\n" ' avx512f AVX512F optimizations' + printf "%s\n" ' blkio libblkio block device driver' printf "%s\n" ' bochs bochs image format support' printf "%s\n" ' bpf eBPF support' printf "%s\n" ' brlapi brlapi character device driver' @@ -198,6 +199,8 @@ _meson_option_parse() { --disable-gcov) printf "%s" -Db_coverage=false ;; --enable-lto) printf "%s" -Db_lto=true ;; --disable-lto) printf "%s" -Db_lto=false ;; + --enable-blkio) printf "%s" -Dblkio=enabled ;; + --disable-blkio) printf "%s" -Dblkio=disabled ;; --block-drv-ro-whitelist=*) quote_sh "-Dblock_drv_ro_whitelist=$2" ;; --block-drv-rw-whitelist=*) quote_sh "-Dblock_drv_rw_whitelist=$2" ;; --enable-block-drv-whitelist-in-tools) printf "%s" -Dblock_drv_whitelist_in_tools=true ;; diff --git a/scripts/vmstate-static-checker.py b/scripts/vmstate-static-checker.py index b369388360..dfeee8231a 100755 --- a/scripts/vmstate-static-checker.py +++ b/scripts/vmstate-static-checker.py @@ -367,7 +367,6 @@ def check_machine_type(s, d): if s["Name"] != d["Name"]: print("Warning: checking incompatible machine types:", end=' ') print("\"" + s["Name"] + "\", \"" + d["Name"] + "\"") - return def main(): diff --git a/softmmu/device_tree.c b/softmmu/device_tree.c index ce74f3d48d..30aa3aea9f 100644 --- a/softmmu/device_tree.c +++ b/softmmu/device_tree.c @@ -22,6 +22,7 @@ #include "qemu/option.h" #include "qemu/bswap.h" #include "qemu/cutils.h" +#include "qemu/guest-random.h" #include "sysemu/device_tree.h" #include "hw/loader.h" #include "hw/boards.h" @@ -680,3 +681,23 @@ void hmp_dumpdtb(Monitor *mon, const QDict *qdict) info_report("dtb dumped to %s", filename); } + +void qemu_fdt_randomize_seeds(void *fdt) +{ + int noffset, poffset, len; + const char *name; + uint8_t *data; + + for (noffset = fdt_next_node(fdt, 0, NULL); + noffset >= 0; + noffset = fdt_next_node(fdt, noffset, NULL)) { + for (poffset = fdt_first_property_offset(fdt, noffset); + poffset >= 0; + poffset = fdt_next_property_offset(fdt, poffset)) { + data = (uint8_t *)fdt_getprop_by_offset(fdt, poffset, &name, &len); + if (!data || strcmp(name, "rng-seed")) + continue; + qemu_guest_getrandom_nofail(data, len); + } + } +} diff --git a/softmmu/physmem.c b/softmmu/physmem.c index 56e03e07b5..d9578ccfd4 100644 --- a/softmmu/physmem.c +++ b/softmmu/physmem.c @@ -1748,6 +1748,11 @@ void qemu_ram_unset_migratable(RAMBlock *rb) rb->flags &= ~RAM_MIGRATABLE; } +int qemu_ram_get_fd(RAMBlock *rb) +{ + return rb->fd; +} + /* Called with iothread lock held. */ void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev) { diff --git a/softmmu/runstate.c b/softmmu/runstate.c index 1e68680b9d..3dd83d5e5d 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -441,11 +441,16 @@ void qemu_system_reset(ShutdownCause reason) cpu_synchronize_all_states(); if (mc && mc->reset) { - mc->reset(current_machine); + mc->reset(current_machine, reason); } else { - qemu_devices_reset(); + qemu_devices_reset(reason); } - if (reason && reason != SHUTDOWN_CAUSE_SUBSYSTEM_RESET) { + switch (reason) { + case SHUTDOWN_CAUSE_NONE: + case SHUTDOWN_CAUSE_SUBSYSTEM_RESET: + case SHUTDOWN_CAUSE_SNAPSHOT_LOAD: + break; + default: qapi_event_send_reset(shutdown_caused_by_guest(reason), reason); } cpu_synchronize_all_post_reset(); diff --git a/stubs/meson.build b/stubs/meson.build index d8f3fd5c44..4314161f5f 100644 --- a/stubs/meson.build +++ b/stubs/meson.build @@ -29,6 +29,7 @@ stub_ss.add(files('migr-blocker.c')) stub_ss.add(files('module-opts.c')) stub_ss.add(files('monitor.c')) stub_ss.add(files('monitor-core.c')) +stub_ss.add(files('physmem.c')) stub_ss.add(files('qemu-timer-notify-cb.c')) stub_ss.add(files('qmp_memory_device.c')) stub_ss.add(files('qmp-command-available.c')) diff --git a/stubs/physmem.c b/stubs/physmem.c new file mode 100644 index 0000000000..1fc5f2df29 --- /dev/null +++ b/stubs/physmem.c @@ -0,0 +1,13 @@ +#include "qemu/osdep.h" +#include "exec/cpu-common.h" + +RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset, + ram_addr_t *offset) +{ + return NULL; +} + +int qemu_ram_get_fd(RAMBlock *rb) +{ + return -1; +} diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c index 979a629d59..270ae787b1 100644 --- a/target/alpha/cpu.c +++ b/target/alpha/cpu.c @@ -40,6 +40,14 @@ static vaddr alpha_cpu_get_pc(CPUState *cs) return cpu->env.pc; } +static void alpha_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + AlphaCPU *cpu = ALPHA_CPU(cs); + + cpu->env.pc = data[0]; +} static bool alpha_cpu_has_work(CPUState *cs) { @@ -226,6 +234,7 @@ static const struct SysemuCPUOps alpha_sysemu_ops = { static const struct TCGCPUOps alpha_tcg_ops = { .initialize = alpha_translate_init, + .restore_state_to_opc = alpha_restore_state_to_opc, #ifdef CONFIG_USER_ONLY .record_sigsegv = alpha_cpu_record_sigsegv, diff --git a/target/alpha/translate.c b/target/alpha/translate.c index 6766350f56..f9bcdeb717 100644 --- a/target/alpha/translate.c +++ b/target/alpha/translate.c @@ -3049,9 +3049,3 @@ void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int max_insns, DisasContext dc; translator_loop(cpu, tb, max_insns, pc, host_pc, &alpha_tr_ops, &dc.base); } - -void restore_state_to_opc(CPUAlphaState *env, TranslationBlock *tb, - target_ulong *data) -{ - env->pc = data[0]; -} diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 0bc5e9b125..a021df9e9e 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -90,6 +90,31 @@ void arm_cpu_synchronize_from_tb(CPUState *cs, } } } + +static void arm_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + CPUARMState *env = cs->env_ptr; + + if (is_a64(env)) { + if (TARGET_TB_PCREL) { + env->pc = (env->pc & TARGET_PAGE_MASK) | data[0]; + } else { + env->pc = data[0]; + } + env->condexec_bits = 0; + env->exception.syndrome = data[2] << ARM_INSN_START_WORD2_SHIFT; + } else { + if (TARGET_TB_PCREL) { + env->regs[15] = (env->regs[15] & TARGET_PAGE_MASK) | data[0]; + } else { + env->regs[15] = data[0]; + } + env->condexec_bits = data[1]; + env->exception.syndrome = data[2] << ARM_INSN_START_WORD2_SHIFT; + } +} #endif /* CONFIG_TCG */ static bool arm_cpu_has_work(CPUState *cs) @@ -562,14 +587,24 @@ static inline bool arm_excp_unmasked(CPUState *cs, unsigned int excp_idx, if ((target_el > cur_el) && (target_el != 1)) { /* Exceptions targeting a higher EL may not be maskable */ if (arm_feature(env, ARM_FEATURE_AARCH64)) { - /* - * 64-bit masking rules are simple: exceptions to EL3 - * can't be masked, and exceptions to EL2 can only be - * masked from Secure state. The HCR and SCR settings - * don't affect the masking logic, only the interrupt routing. - */ - if (target_el == 3 || !secure || (env->cp15.scr_el3 & SCR_EEL2)) { + switch (target_el) { + case 2: + /* + * According to ARM DDI 0487H.a, an interrupt can be masked + * when HCR_E2H and HCR_TGE are both set regardless of the + * current Security state. Note that we need to revisit this + * part again once we need to support NMI. + */ + if ((hcr_el2 & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE)) { + unmasked = true; + } + break; + case 3: + /* Interrupt cannot be masked when the target EL is 3 */ unmasked = true; + break; + default: + g_assert_not_reached(); } } else { /* @@ -2152,6 +2187,7 @@ static const struct TCGCPUOps arm_tcg_ops = { .initialize = arm_translate_init, .synchronize_from_tb = arm_cpu_synchronize_from_tb, .debug_excp_handler = arm_debug_excp_handler, + .restore_state_to_opc = arm_restore_state_to_opc, #ifdef CONFIG_USER_ONLY .record_sigsegv = arm_cpu_record_sigsegv, diff --git a/target/arm/cpu.h b/target/arm/cpu.h index 64fc03214c..9aeed3c848 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -3410,6 +3410,14 @@ extern const uint64_t pred_esz_masks[5]; #define PAGE_MTE PAGE_TARGET_2 #define PAGE_TARGET_STICKY PAGE_MTE +/* We associate one allocation tag per 16 bytes, the minimum. */ +#define LOG2_TAG_GRANULE 4 +#define TAG_GRANULE (1 << LOG2_TAG_GRANULE) + +#ifdef CONFIG_USER_ONLY +#define TARGET_PAGE_DATA_SIZE (TARGET_PAGE_SIZE >> (LOG2_TAG_GRANULE + 1)) +#endif + #ifdef TARGET_TAGGED_ADDRESSES /** * cpu_untagged_addr: @@ -4139,6 +4147,21 @@ static inline bool isar_feature_aa64_lva(const ARMISARegisters *id) return FIELD_EX64(id->id_aa64mmfr2, ID_AA64MMFR2, VARANGE) != 0; } +static inline bool isar_feature_aa64_e0pd(const ARMISARegisters *id) +{ + return FIELD_EX64(id->id_aa64mmfr2, ID_AA64MMFR2, E0PD) != 0; +} + +static inline bool isar_feature_aa64_hafs(const ARMISARegisters *id) +{ + return FIELD_EX64(id->id_aa64mmfr1, ID_AA64MMFR1, HAFDBS) != 0; +} + +static inline bool isar_feature_aa64_hdbs(const ARMISARegisters *id) +{ + return FIELD_EX64(id->id_aa64mmfr1, ID_AA64MMFR1, HAFDBS) >= 2; +} + static inline bool isar_feature_aa64_tts2uxn(const ARMISARegisters *id) { return FIELD_EX64(id->id_aa64mmfr1, ID_AA64MMFR1, XNX) != 0; diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 85e0d1daf1..3d74f134f5 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -1165,6 +1165,7 @@ static void aarch64_max_initfn(Object *obj) cpu->isar.id_aa64mmfr0 = t; t = cpu->isar.id_aa64mmfr1; + t = FIELD_DP64(t, ID_AA64MMFR1, HAFDBS, 2); /* FEAT_HAFDBS */ t = FIELD_DP64(t, ID_AA64MMFR1, VMIDBITS, 2); /* FEAT_VMID16 */ t = FIELD_DP64(t, ID_AA64MMFR1, VH, 1); /* FEAT_VHE */ t = FIELD_DP64(t, ID_AA64MMFR1, HPDS, 1); /* FEAT_HPDS */ @@ -1185,6 +1186,7 @@ static void aarch64_max_initfn(Object *obj) t = FIELD_DP64(t, ID_AA64MMFR2, FWB, 1); /* FEAT_S2FWB */ t = FIELD_DP64(t, ID_AA64MMFR2, TTL, 1); /* FEAT_TTL */ t = FIELD_DP64(t, ID_AA64MMFR2, BBM, 2); /* FEAT_BBM at level 2 */ + t = FIELD_DP64(t, ID_AA64MMFR2, E0PD, 1); /* FEAT_E0PD */ cpu->isar.id_aa64mmfr2 = t; t = cpu->isar.id_aa64zfr0; diff --git a/target/arm/helper.c b/target/arm/helper.c index c672903f43..b070a20f1a 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -10352,7 +10352,7 @@ int aa64_va_parameter_tbi(uint64_t tcr, ARMMMUIdx mmu_idx) { if (regime_has_2_ranges(mmu_idx)) { return extract64(tcr, 37, 2); - } else if (mmu_idx == ARMMMUIdx_Stage2 || mmu_idx == ARMMMUIdx_Stage2_S) { + } else if (regime_is_stage2(mmu_idx)) { return 0; /* VTCR_EL2 */ } else { /* Replicate the single TBI bit so we always have 2 bits. */ @@ -10364,7 +10364,7 @@ int aa64_va_parameter_tbid(uint64_t tcr, ARMMMUIdx mmu_idx) { if (regime_has_2_ranges(mmu_idx)) { return extract64(tcr, 51, 2); - } else if (mmu_idx == ARMMMUIdx_Stage2 || mmu_idx == ARMMMUIdx_Stage2_S) { + } else if (regime_is_stage2(mmu_idx)) { return 0; /* VTCR_EL2 */ } else { /* Replicate the single TBID bit so we always have 2 bits. */ @@ -10470,11 +10470,11 @@ ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va, ARMMMUIdx mmu_idx, bool data) { uint64_t tcr = regime_tcr(env, mmu_idx); - bool epd, hpd, tsz_oob, ds; + bool epd, hpd, tsz_oob, ds, ha, hd; int select, tsz, tbi, max_tsz, min_tsz, ps, sh; ARMGranuleSize gran; ARMCPU *cpu = env_archcpu(env); - bool stage2 = mmu_idx == ARMMMUIdx_Stage2 || mmu_idx == ARMMMUIdx_Stage2_S; + bool stage2 = regime_is_stage2(mmu_idx); if (!regime_has_2_ranges(mmu_idx)) { select = 0; @@ -10489,8 +10489,12 @@ ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va, epd = false; sh = extract32(tcr, 12, 2); ps = extract32(tcr, 16, 3); + ha = extract32(tcr, 21, 1) && cpu_isar_feature(aa64_hafs, cpu); + hd = extract32(tcr, 22, 1) && cpu_isar_feature(aa64_hdbs, cpu); ds = extract64(tcr, 32, 1); } else { + bool e0pd; + /* * Bit 55 is always between the two regions, and is canonical for * determining if address tagging is enabled. @@ -10502,15 +10506,24 @@ ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va, epd = extract32(tcr, 7, 1); sh = extract32(tcr, 12, 2); hpd = extract64(tcr, 41, 1); + e0pd = extract64(tcr, 55, 1); } else { tsz = extract32(tcr, 16, 6); gran = tg1_to_gran_size(extract32(tcr, 30, 2)); epd = extract32(tcr, 23, 1); sh = extract32(tcr, 28, 2); hpd = extract64(tcr, 42, 1); + e0pd = extract64(tcr, 56, 1); } ps = extract64(tcr, 32, 3); + ha = extract64(tcr, 39, 1) && cpu_isar_feature(aa64_hafs, cpu); + hd = extract64(tcr, 40, 1) && cpu_isar_feature(aa64_hdbs, cpu); ds = extract64(tcr, 59, 1); + + if (e0pd && cpu_isar_feature(aa64_e0pd, cpu) && + regime_is_user(env, mmu_idx)) { + epd = true; + } } gran = sanitize_gran_size(cpu, gran, stage2); @@ -10532,22 +10545,18 @@ ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va, } ds = false; } else if (ds) { - switch (mmu_idx) { - case ARMMMUIdx_Stage2: - case ARMMMUIdx_Stage2_S: + if (regime_is_stage2(mmu_idx)) { if (gran == Gran16K) { ds = cpu_isar_feature(aa64_tgran16_2_lpa2, cpu); } else { ds = cpu_isar_feature(aa64_tgran4_2_lpa2, cpu); } - break; - default: + } else { if (gran == Gran16K) { ds = cpu_isar_feature(aa64_tgran16_lpa2, cpu); } else { ds = cpu_isar_feature(aa64_tgran4_lpa2, cpu); } - break; } if (ds) { min_tsz = 12; @@ -10581,6 +10590,8 @@ ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va, .hpd = hpd, .tsz_oob = tsz_oob, .ds = ds, + .ha = ha, + .hd = ha && hd, .gran = gran, }; } diff --git a/target/arm/internals.h b/target/arm/internals.h index c3c3920ded..d9121d9ff8 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -338,6 +338,7 @@ typedef enum ARMFaultType { ARMFault_AsyncExternal, ARMFault_Debug, ARMFault_TLBConflict, + ARMFault_UnsuppAtomicUpdate, ARMFault_Lockdown, ARMFault_Exclusive, ARMFault_ICacheMaint, @@ -524,6 +525,9 @@ static inline uint32_t arm_fi_to_lfsc(ARMMMUFaultInfo *fi) case ARMFault_TLBConflict: fsc = 0x30; break; + case ARMFault_UnsuppAtomicUpdate: + fsc = 0x31; + break; case ARMFault_Lockdown: fsc = 0x34; break; @@ -673,6 +677,11 @@ static inline bool regime_is_pan(CPUARMState *env, ARMMMUIdx mmu_idx) } } +static inline bool regime_is_stage2(ARMMMUIdx mmu_idx) +{ + return mmu_idx == ARMMMUIdx_Stage2 || mmu_idx == ARMMMUIdx_Stage2_S; +} + /* Return the exception level which controls this address translation regime */ static inline uint32_t regime_el(CPUARMState *env, ARMMMUIdx mmu_idx) { @@ -707,6 +716,25 @@ static inline uint32_t regime_el(CPUARMState *env, ARMMMUIdx mmu_idx) } } +static inline bool regime_is_user(CPUARMState *env, ARMMMUIdx mmu_idx) +{ + switch (mmu_idx) { + case ARMMMUIdx_E20_0: + case ARMMMUIdx_Stage1_E0: + case ARMMMUIdx_MUser: + case ARMMMUIdx_MSUser: + case ARMMMUIdx_MUserNegPri: + case ARMMMUIdx_MSUserNegPri: + return true; + default: + return false; + case ARMMMUIdx_E10_0: + case ARMMMUIdx_E10_1: + case ARMMMUIdx_E10_1_PAN: + g_assert_not_reached(); + } +} + /* Return the SCTLR value which controls this address translation regime */ static inline uint64_t regime_sctlr(CPUARMState *env, ARMMMUIdx mmu_idx) { @@ -1041,6 +1069,8 @@ typedef struct ARMVAParameters { bool hpd : 1; bool tsz_oob : 1; /* tsz has been clamped to legal range */ bool ds : 1; + bool ha : 1; + bool hd : 1; ARMGranuleSize gran : 2; } ARMVAParameters; @@ -1164,10 +1194,6 @@ void arm_log_exception(CPUState *cs); */ #define GMID_EL1_BS 6 -/* We associate one allocation tag per 16 bytes, the minimum. */ -#define LOG2_TAG_GRANULE 4 -#define TAG_GRANULE (1 << LOG2_TAG_GRANULE) - /* * SVE predicates are 1/8 the size of SVE vectors, and cannot use * the same simd_desc() encoding due to restrictions on size. diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c index e85208339e..86b3754838 100644 --- a/target/arm/mte_helper.c +++ b/target/arm/mte_helper.c @@ -95,11 +95,6 @@ static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx, } tags = page_get_target_data(clean_ptr); - if (tags == NULL) { - size_t alloc_size = TARGET_PAGE_SIZE >> (LOG2_TAG_GRANULE + 1); - tags = page_alloc_target_data(clean_ptr, alloc_size); - assert(tags != NULL); - } index = extract32(ptr, LOG2_TAG_GRANULE + 1, TARGET_PAGE_BITS - LOG2_TAG_GRANULE - 1); diff --git a/target/arm/ptw.c b/target/arm/ptw.c index 6c5ed56a10..58a7bbda50 100644 --- a/target/arm/ptw.c +++ b/target/arm/ptw.c @@ -9,6 +9,7 @@ #include "qemu/osdep.h" #include "qemu/log.h" #include "qemu/range.h" +#include "qemu/main-loop.h" #include "exec/exec-all.h" #include "cpu.h" #include "internals.h" @@ -17,10 +18,13 @@ typedef struct S1Translate { ARMMMUIdx in_mmu_idx; + ARMMMUIdx in_ptw_idx; bool in_secure; bool in_debug; bool out_secure; + bool out_rw; bool out_be; + hwaddr out_virt; hwaddr out_phys; void *out_host; } S1Translate; @@ -104,25 +108,6 @@ static bool regime_translation_big_endian(CPUARMState *env, ARMMMUIdx mmu_idx) return (regime_sctlr(env, mmu_idx) & SCTLR_EE) != 0; } -static bool regime_is_user(CPUARMState *env, ARMMMUIdx mmu_idx) -{ - switch (mmu_idx) { - case ARMMMUIdx_E20_0: - case ARMMMUIdx_Stage1_E0: - case ARMMMUIdx_MUser: - case ARMMMUIdx_MSUser: - case ARMMMUIdx_MUserNegPri: - case ARMMMUIdx_MSUserNegPri: - return true; - default: - return false; - case ARMMMUIdx_E10_0: - case ARMMMUIdx_E10_1: - case ARMMMUIdx_E10_1_PAN: - g_assert_not_reached(); - } -} - /* Return the TTBR associated with this translation regime */ static uint64_t regime_ttbr(CPUARMState *env, ARMMMUIdx mmu_idx, int ttbrn) { @@ -233,33 +218,26 @@ static bool S1_ptw_translate(CPUARMState *env, S1Translate *ptw, { bool is_secure = ptw->in_secure; ARMMMUIdx mmu_idx = ptw->in_mmu_idx; - ARMMMUIdx s2_mmu_idx = is_secure ? ARMMMUIdx_Stage2_S : ARMMMUIdx_Stage2; - bool s2_phys = false; + ARMMMUIdx s2_mmu_idx = ptw->in_ptw_idx; uint8_t pte_attrs; bool pte_secure; - if (!arm_mmu_idx_is_stage1_of_2(mmu_idx) - || regime_translation_disabled(env, s2_mmu_idx, is_secure)) { - s2_mmu_idx = is_secure ? ARMMMUIdx_Phys_S : ARMMMUIdx_Phys_NS; - s2_phys = true; - } + ptw->out_virt = addr; if (unlikely(ptw->in_debug)) { /* * From gdbstub, do not use softmmu so that we don't modify the * state of the cpu at all, including softmmu tlb contents. */ - if (s2_phys) { - ptw->out_phys = addr; - pte_attrs = 0; - pte_secure = is_secure; - } else { + if (regime_is_stage2(s2_mmu_idx)) { S1Translate s2ptw = { .in_mmu_idx = s2_mmu_idx, + .in_ptw_idx = is_secure ? ARMMMUIdx_Phys_S : ARMMMUIdx_Phys_NS, .in_secure = is_secure, .in_debug = true, }; GetPhysAddrResult s2 = { }; + if (!get_phys_addr_lpae(env, &s2ptw, addr, MMU_DATA_LOAD, false, &s2, fi)) { goto fail; @@ -267,8 +245,14 @@ static bool S1_ptw_translate(CPUARMState *env, S1Translate *ptw, ptw->out_phys = s2.f.phys_addr; pte_attrs = s2.cacheattrs.attrs; pte_secure = s2.f.attrs.secure; + } else { + /* Regime is physical. */ + ptw->out_phys = addr; + pte_attrs = 0; + pte_secure = is_secure; } ptw->out_host = NULL; + ptw->out_rw = false; } else { CPUTLBEntryFull *full; int flags; @@ -283,11 +267,12 @@ static bool S1_ptw_translate(CPUARMState *env, S1Translate *ptw, goto fail; } ptw->out_phys = full->phys_addr; + ptw->out_rw = full->prot & PAGE_WRITE; pte_attrs = full->pte_attrs; pte_secure = full->attrs.secure; } - if (!s2_phys) { + if (regime_is_stage2(s2_mmu_idx)) { uint64_t hcr = arm_hcr_el2_eff_secstate(env, is_secure); if ((hcr & HCR_PTW) && S2_attrs_are_device(hcr, pte_attrs)) { @@ -322,24 +307,20 @@ static bool S1_ptw_translate(CPUARMState *env, S1Translate *ptw, } /* All loads done in the course of a page table walk go through here. */ -static uint32_t arm_ldl_ptw(CPUARMState *env, S1Translate *ptw, hwaddr addr, +static uint32_t arm_ldl_ptw(CPUARMState *env, S1Translate *ptw, ARMMMUFaultInfo *fi) { CPUState *cs = env_cpu(env); + void *host = ptw->out_host; uint32_t data; - if (!S1_ptw_translate(env, ptw, addr, fi)) { - /* Failure. */ - assert(fi->s1ptw); - return 0; - } - - if (likely(ptw->out_host)) { + if (likely(host)) { /* Page tables are in RAM, and we have the host address. */ + data = qatomic_read((uint32_t *)host); if (ptw->out_be) { - data = ldl_be_p(ptw->out_host); + data = be32_to_cpu(data); } else { - data = ldl_le_p(ptw->out_host); + data = le32_to_cpu(data); } } else { /* Page tables are in MMIO. */ @@ -361,25 +342,29 @@ static uint32_t arm_ldl_ptw(CPUARMState *env, S1Translate *ptw, hwaddr addr, return data; } -static uint64_t arm_ldq_ptw(CPUARMState *env, S1Translate *ptw, hwaddr addr, +static uint64_t arm_ldq_ptw(CPUARMState *env, S1Translate *ptw, ARMMMUFaultInfo *fi) { CPUState *cs = env_cpu(env); + void *host = ptw->out_host; uint64_t data; - if (!S1_ptw_translate(env, ptw, addr, fi)) { - /* Failure. */ - assert(fi->s1ptw); - return 0; - } - - if (likely(ptw->out_host)) { + if (likely(host)) { /* Page tables are in RAM, and we have the host address. */ +#ifdef CONFIG_ATOMIC64 + data = qatomic_read__nocheck((uint64_t *)host); + if (ptw->out_be) { + data = be64_to_cpu(data); + } else { + data = le64_to_cpu(data); + } +#else if (ptw->out_be) { - data = ldq_be_p(ptw->out_host); + data = ldq_be_p(host); } else { - data = ldq_le_p(ptw->out_host); + data = ldq_le_p(host); } +#endif } else { /* Page tables are in MMIO. */ MemTxAttrs attrs = { .secure = ptw->out_secure }; @@ -400,6 +385,91 @@ static uint64_t arm_ldq_ptw(CPUARMState *env, S1Translate *ptw, hwaddr addr, return data; } +static uint64_t arm_casq_ptw(CPUARMState *env, uint64_t old_val, + uint64_t new_val, S1Translate *ptw, + ARMMMUFaultInfo *fi) +{ + uint64_t cur_val; + void *host = ptw->out_host; + + if (unlikely(!host)) { + fi->type = ARMFault_UnsuppAtomicUpdate; + fi->s1ptw = true; + return 0; + } + + /* + * Raising a stage2 Protection fault for an atomic update to a read-only + * page is delayed until it is certain that there is a change to make. + */ + if (unlikely(!ptw->out_rw)) { + int flags; + void *discard; + + env->tlb_fi = fi; + flags = probe_access_flags(env, ptw->out_virt, MMU_DATA_STORE, + arm_to_core_mmu_idx(ptw->in_ptw_idx), + true, &discard, 0); + env->tlb_fi = NULL; + + if (unlikely(flags & TLB_INVALID_MASK)) { + assert(fi->type != ARMFault_None); + fi->s2addr = ptw->out_virt; + fi->stage2 = true; + fi->s1ptw = true; + fi->s1ns = !ptw->in_secure; + return 0; + } + + /* In case CAS mismatches and we loop, remember writability. */ + ptw->out_rw = true; + } + +#ifdef CONFIG_ATOMIC64 + if (ptw->out_be) { + old_val = cpu_to_be64(old_val); + new_val = cpu_to_be64(new_val); + cur_val = qatomic_cmpxchg__nocheck((uint64_t *)host, old_val, new_val); + cur_val = be64_to_cpu(cur_val); + } else { + old_val = cpu_to_le64(old_val); + new_val = cpu_to_le64(new_val); + cur_val = qatomic_cmpxchg__nocheck((uint64_t *)host, old_val, new_val); + cur_val = le64_to_cpu(cur_val); + } +#else + /* + * We can't support the full 64-bit atomic cmpxchg on the host. + * Because this is only used for FEAT_HAFDBS, which is only for AA64, + * we know that TCG_OVERSIZED_GUEST is set, which means that we are + * running in round-robin mode and could only race with dma i/o. + */ +#ifndef TCG_OVERSIZED_GUEST +# error "Unexpected configuration" +#endif + bool locked = qemu_mutex_iothread_locked(); + if (!locked) { + qemu_mutex_lock_iothread(); + } + if (ptw->out_be) { + cur_val = ldq_be_p(host); + if (cur_val == old_val) { + stq_be_p(host, new_val); + } + } else { + cur_val = ldq_le_p(host); + if (cur_val == old_val) { + stq_le_p(host, new_val); + } + } + if (!locked) { + qemu_mutex_unlock_iothread(); + } +#endif + + return cur_val; +} + static bool get_level1_table_address(CPUARMState *env, ARMMMUIdx mmu_idx, uint32_t *table, uint32_t address) { @@ -529,7 +599,10 @@ static bool get_phys_addr_v5(CPUARMState *env, S1Translate *ptw, fi->type = ARMFault_Translation; goto do_fault; } - desc = arm_ldl_ptw(env, ptw, table, fi); + if (!S1_ptw_translate(env, ptw, table, fi)) { + goto do_fault; + } + desc = arm_ldl_ptw(env, ptw, fi); if (fi->type != ARMFault_None) { goto do_fault; } @@ -567,7 +640,10 @@ static bool get_phys_addr_v5(CPUARMState *env, S1Translate *ptw, /* Fine pagetable. */ table = (desc & 0xfffff000) | ((address >> 8) & 0xffc); } - desc = arm_ldl_ptw(env, ptw, table, fi); + if (!S1_ptw_translate(env, ptw, table, fi)) { + goto do_fault; + } + desc = arm_ldl_ptw(env, ptw, fi); if (fi->type != ARMFault_None) { goto do_fault; } @@ -652,7 +728,10 @@ static bool get_phys_addr_v6(CPUARMState *env, S1Translate *ptw, fi->type = ARMFault_Translation; goto do_fault; } - desc = arm_ldl_ptw(env, ptw, table, fi); + if (!S1_ptw_translate(env, ptw, table, fi)) { + goto do_fault; + } + desc = arm_ldl_ptw(env, ptw, fi); if (fi->type != ARMFault_None) { goto do_fault; } @@ -705,7 +784,10 @@ static bool get_phys_addr_v6(CPUARMState *env, S1Translate *ptw, ns = extract32(desc, 3, 1); /* Lookup l2 entry. */ table = (desc & 0xfffffc00) | ((address >> 10) & 0x3fc); - desc = arm_ldl_ptw(env, ptw, table, fi); + if (!S1_ptw_translate(env, ptw, table, fi)) { + goto do_fault; + } + desc = arm_ldl_ptw(env, ptw, fi); if (fi->type != ARMFault_None) { goto do_fault; } @@ -842,8 +924,7 @@ static int get_S1prot(CPUARMState *env, ARMMMUIdx mmu_idx, bool is_aa64, bool have_wxn; int wxn = 0; - assert(mmu_idx != ARMMMUIdx_Stage2); - assert(mmu_idx != ARMMMUIdx_Stage2_S); + assert(!regime_is_stage2(mmu_idx)); user_rw = simple_ap_to_rw_prot_is_user(ap, true); if (is_user) { @@ -1067,15 +1148,13 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, ARMCPU *cpu = env_archcpu(env); ARMMMUIdx mmu_idx = ptw->in_mmu_idx; bool is_secure = ptw->in_secure; - /* Read an LPAE long-descriptor translation table. */ - ARMFaultType fault_type = ARMFault_Translation; uint32_t level; ARMVAParameters param; uint64_t ttbr; hwaddr descaddr, indexmask, indexmask_grainsize; uint32_t tableattrs; target_ulong page_size; - uint32_t attrs; + uint64_t attrs; int32_t stride; int addrsize, inputsize, outputsize; uint64_t tcr = regime_tcr(env, mmu_idx); @@ -1083,7 +1162,8 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, uint32_t el = regime_el(env, mmu_idx); uint64_t descaddrmask; bool aarch64 = arm_el_is_aa64(env, el); - bool guarded = false; + uint64_t descriptor, new_descriptor; + bool nstable; /* TODO: This code does not support shareability levels. */ if (aarch64) { @@ -1103,8 +1183,7 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, * so our choice is to always raise the fault. */ if (param.tsz_oob) { - fault_type = ARMFault_Translation; - goto do_fault; + goto do_translation_fault; } addrsize = 64 - 8 * param.tbi; @@ -1141,8 +1220,7 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, addrsize - inputsize); if (-top_bits != param.select) { /* The gap between the two regions is a Translation fault */ - fault_type = ARMFault_Translation; - goto do_fault; + goto do_translation_fault; } } @@ -1168,10 +1246,10 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, * Translation table walk disabled => Translation fault on TLB miss * Note: This is always 0 on 64-bit EL2 and EL3. */ - goto do_fault; + goto do_translation_fault; } - if (mmu_idx != ARMMMUIdx_Stage2 && mmu_idx != ARMMMUIdx_Stage2_S) { + if (!regime_is_stage2(mmu_idx)) { /* * The starting level depends on the virtual address size (which can * be up to 48 bits) and the translation granule size. It indicates @@ -1199,8 +1277,7 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, if (param.ds && stride == 9 && sl2) { if (sl0 != 0) { level = 0; - fault_type = ARMFault_Translation; - goto do_fault; + goto do_translation_fault; } startlevel = -1; } else if (!aarch64 || stride == 9) { @@ -1219,8 +1296,7 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, ok = check_s2_mmu_setup(cpu, aarch64, startlevel, inputsize, stride, outputsize); if (!ok) { - fault_type = ARMFault_Translation; - goto do_fault; + goto do_translation_fault; } level = startlevel; } @@ -1242,7 +1318,7 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, descaddr |= extract64(ttbr, 2, 4) << 48; } else if (descaddr >> outputsize) { level = 0; - fault_type = ARMFault_AddressSize; + fi->type = ARMFault_AddressSize; goto do_fault; } @@ -1276,120 +1352,173 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, * bits at each step. */ tableattrs = is_secure ? 0 : (1 << 4); - for (;;) { - uint64_t descriptor; - bool nstable; - - descaddr |= (address >> (stride * (4 - level))) & indexmask; - descaddr &= ~7ULL; - nstable = extract32(tableattrs, 4, 1); - ptw->in_secure = !nstable; - descriptor = arm_ldq_ptw(env, ptw, descaddr, fi); - if (fi->type != ARMFault_None) { - goto do_fault; - } - if (!(descriptor & 1) || - (!(descriptor & 2) && (level == 3))) { - /* Invalid, or the Reserved level 3 encoding */ - goto do_fault; + next_level: + descaddr |= (address >> (stride * (4 - level))) & indexmask; + descaddr &= ~7ULL; + nstable = extract32(tableattrs, 4, 1); + if (!nstable) { + /* + * Stage2_S -> Stage2 or Phys_S -> Phys_NS + * Assert that the non-secure idx are even, and relative order. + */ + QEMU_BUILD_BUG_ON((ARMMMUIdx_Phys_NS & 1) != 0); + QEMU_BUILD_BUG_ON((ARMMMUIdx_Stage2 & 1) != 0); + QEMU_BUILD_BUG_ON(ARMMMUIdx_Phys_NS + 1 != ARMMMUIdx_Phys_S); + QEMU_BUILD_BUG_ON(ARMMMUIdx_Stage2 + 1 != ARMMMUIdx_Stage2_S); + ptw->in_ptw_idx &= ~1; + ptw->in_secure = false; + } + if (!S1_ptw_translate(env, ptw, descaddr, fi)) { + goto do_fault; + } + descriptor = arm_ldq_ptw(env, ptw, fi); + if (fi->type != ARMFault_None) { + goto do_fault; + } + new_descriptor = descriptor; + + restart_atomic_update: + if (!(descriptor & 1) || (!(descriptor & 2) && (level == 3))) { + /* Invalid, or the Reserved level 3 encoding */ + goto do_translation_fault; + } + + descaddr = descriptor & descaddrmask; + + /* + * For FEAT_LPA and PS=6, bits [51:48] of descaddr are in [15:12] + * of descriptor. For FEAT_LPA2 and effective DS, bits [51:50] of + * descaddr are in [9:8]. Otherwise, if descaddr is out of range, + * raise AddressSizeFault. + */ + if (outputsize > 48) { + if (param.ds) { + descaddr |= extract64(descriptor, 8, 2) << 50; + } else { + descaddr |= extract64(descriptor, 12, 4) << 48; } + } else if (descaddr >> outputsize) { + fi->type = ARMFault_AddressSize; + goto do_fault; + } + + if ((descriptor & 2) && (level < 3)) { + /* + * Table entry. The top five bits are attributes which may + * propagate down through lower levels of the table (and + * which are all arranged so that 0 means "no effect", so + * we can gather them up by ORing in the bits at each level). + */ + tableattrs |= extract64(descriptor, 59, 5); + level++; + indexmask = indexmask_grainsize; + goto next_level; + } - descaddr = descriptor & descaddrmask; + /* + * Block entry at level 1 or 2, or page entry at level 3. + * These are basically the same thing, although the number + * of bits we pull in from the vaddr varies. Note that although + * descaddrmask masks enough of the low bits of the descriptor + * to give a correct page or table address, the address field + * in a block descriptor is smaller; so we need to explicitly + * clear the lower bits here before ORing in the low vaddr bits. + * + * Afterward, descaddr is the final physical address. + */ + page_size = (1ULL << ((stride * (4 - level)) + 3)); + descaddr &= ~(hwaddr)(page_size - 1); + descaddr |= (address & (page_size - 1)); + if (likely(!ptw->in_debug)) { /* - * For FEAT_LPA and PS=6, bits [51:48] of descaddr are in [15:12] - * of descriptor. For FEAT_LPA2 and effective DS, bits [51:50] of - * descaddr are in [9:8]. Otherwise, if descaddr is out of range, - * raise AddressSizeFault. + * Access flag. + * If HA is enabled, prepare to update the descriptor below. + * Otherwise, pass the access fault on to software. */ - if (outputsize > 48) { - if (param.ds) { - descaddr |= extract64(descriptor, 8, 2) << 50; + if (!(descriptor & (1 << 10))) { + if (param.ha) { + new_descriptor |= 1 << 10; /* AF */ } else { - descaddr |= extract64(descriptor, 12, 4) << 48; + fi->type = ARMFault_AccessFlag; + goto do_fault; } - } else if (descaddr >> outputsize) { - fault_type = ARMFault_AddressSize; - goto do_fault; } - if ((descriptor & 2) && (level < 3)) { - /* - * Table entry. The top five bits are attributes which may - * propagate down through lower levels of the table (and - * which are all arranged so that 0 means "no effect", so - * we can gather them up by ORing in the bits at each level). - */ - tableattrs |= extract64(descriptor, 59, 5); - level++; - indexmask = indexmask_grainsize; - continue; - } /* - * Block entry at level 1 or 2, or page entry at level 3. - * These are basically the same thing, although the number - * of bits we pull in from the vaddr varies. Note that although - * descaddrmask masks enough of the low bits of the descriptor - * to give a correct page or table address, the address field - * in a block descriptor is smaller; so we need to explicitly - * clear the lower bits here before ORing in the low vaddr bits. + * Dirty Bit. + * If HD is enabled, pre-emptively set/clear the appropriate AP/S2AP + * bit for writeback. The actual write protection test may still be + * overridden by tableattrs, to be merged below. */ - page_size = (1ULL << ((stride * (4 - level)) + 3)); - descaddr &= ~(hwaddr)(page_size - 1); - descaddr |= (address & (page_size - 1)); - /* Extract attributes from the descriptor */ - attrs = extract64(descriptor, 2, 10) - | (extract64(descriptor, 52, 12) << 10); - - if (mmu_idx == ARMMMUIdx_Stage2 || mmu_idx == ARMMMUIdx_Stage2_S) { - /* Stage 2 table descriptors do not include any attribute fields */ - break; - } - /* Merge in attributes from table descriptors */ - attrs |= nstable << 3; /* NS */ - guarded = extract64(descriptor, 50, 1); /* GP */ - if (param.hpd) { - /* HPD disables all the table attributes except NSTable. */ - break; + if (param.hd + && extract64(descriptor, 51, 1) /* DBM */ + && access_type == MMU_DATA_STORE) { + if (regime_is_stage2(mmu_idx)) { + new_descriptor |= 1ull << 7; /* set S2AP[1] */ + } else { + new_descriptor &= ~(1ull << 7); /* clear AP[2] */ + } } - attrs |= extract32(tableattrs, 0, 2) << 11; /* XN, PXN */ - /* - * The sense of AP[1] vs APTable[0] is reversed, as APTable[0] == 1 - * means "force PL1 access only", which means forcing AP[1] to 0. - */ - attrs &= ~(extract32(tableattrs, 2, 1) << 4); /* !APT[0] => AP[1] */ - attrs |= extract32(tableattrs, 3, 1) << 5; /* APT[1] => AP[2] */ - break; } + /* - * Here descaddr is the final physical address, and attributes - * are all in attrs. + * Extract attributes from the (modified) descriptor, and apply + * table descriptors. Stage 2 table descriptors do not include + * any attribute fields. HPD disables all the table attributes + * except NSTable. */ - fault_type = ARMFault_AccessFlag; - if ((attrs & (1 << 8)) == 0) { - /* Access flag */ - goto do_fault; + attrs = new_descriptor & (MAKE_64BIT_MASK(2, 10) | MAKE_64BIT_MASK(50, 14)); + if (!regime_is_stage2(mmu_idx)) { + attrs |= nstable << 5; /* NS */ + if (!param.hpd) { + attrs |= extract64(tableattrs, 0, 2) << 53; /* XN, PXN */ + /* + * The sense of AP[1] vs APTable[0] is reversed, as APTable[0] == 1 + * means "force PL1 access only", which means forcing AP[1] to 0. + */ + attrs &= ~(extract64(tableattrs, 2, 1) << 6); /* !APT[0] => AP[1] */ + attrs |= extract32(tableattrs, 3, 1) << 7; /* APT[1] => AP[2] */ + } } - ap = extract32(attrs, 4, 2); - - if (mmu_idx == ARMMMUIdx_Stage2 || mmu_idx == ARMMMUIdx_Stage2_S) { + ap = extract32(attrs, 6, 2); + if (regime_is_stage2(mmu_idx)) { ns = mmu_idx == ARMMMUIdx_Stage2; - xn = extract32(attrs, 11, 2); + xn = extract64(attrs, 53, 2); result->f.prot = get_S2prot(env, ap, xn, s1_is_el0); } else { - ns = extract32(attrs, 3, 1); - xn = extract32(attrs, 12, 1); - pxn = extract32(attrs, 11, 1); + ns = extract32(attrs, 5, 1); + xn = extract64(attrs, 54, 1); + pxn = extract64(attrs, 53, 1); result->f.prot = get_S1prot(env, mmu_idx, aarch64, ap, ns, xn, pxn); } - fault_type = ARMFault_Permission; if (!(result->f.prot & (1 << access_type))) { + fi->type = ARMFault_Permission; goto do_fault; } + /* If FEAT_HAFDBS has made changes, update the PTE. */ + if (new_descriptor != descriptor) { + new_descriptor = arm_casq_ptw(env, descriptor, new_descriptor, ptw, fi); + if (fi->type != ARMFault_None) { + goto do_fault; + } + /* + * I_YZSVV says that if the in-memory descriptor has changed, + * then we must use the information in that new value + * (which might include a different output address, different + * attributes, or generate a fault). + * Restart the handling of the descriptor value from scratch. + */ + if (new_descriptor != descriptor) { + descriptor = new_descriptor; + goto restart_atomic_update; + } + } + if (ns) { /* * The NS bit will (as required by the architecture) have no effect if @@ -1401,15 +1530,15 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, /* When in aarch64 mode, and BTI is enabled, remember GP in the TLB. */ if (aarch64 && cpu_isar_feature(aa64_bti, cpu)) { - result->f.guarded = guarded; + result->f.guarded = extract64(attrs, 50, 1); /* GP */ } - if (mmu_idx == ARMMMUIdx_Stage2 || mmu_idx == ARMMMUIdx_Stage2_S) { + if (regime_is_stage2(mmu_idx)) { result->cacheattrs.is_s2_format = true; - result->cacheattrs.attrs = extract32(attrs, 0, 4); + result->cacheattrs.attrs = extract32(attrs, 2, 4); } else { /* Index into MAIR registers for cache attributes */ - uint8_t attrindx = extract32(attrs, 0, 3); + uint8_t attrindx = extract32(attrs, 2, 3); uint64_t mair = env->cp15.mair_el[regime_el(env, mmu_idx)]; assert(attrindx <= 7); result->cacheattrs.is_s2_format = false; @@ -1424,19 +1553,19 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, if (param.ds) { result->cacheattrs.shareability = param.sh; } else { - result->cacheattrs.shareability = extract32(attrs, 6, 2); + result->cacheattrs.shareability = extract32(attrs, 8, 2); } result->f.phys_addr = descaddr; result->f.lg_page_size = ctz64(page_size); return false; -do_fault: - fi->type = fault_type; + do_translation_fault: + fi->type = ARMFault_Translation; + do_fault: fi->level = level; /* Tag the error as S2 for failed S1 PTW at S2 or ordinary S2. */ - fi->stage2 = fi->s1ptw || (mmu_idx == ARMMMUIdx_Stage2 || - mmu_idx == ARMMMUIdx_Stage2_S); + fi->stage2 = fi->s1ptw || regime_is_stage2(mmu_idx); fi->s1ns = mmu_idx == ARMMMUIdx_Stage2; return true; } @@ -2442,7 +2571,7 @@ static bool get_phys_addr_twostage(CPUARMState *env, S1Translate *ptw, ARMMMUFaultInfo *fi) { hwaddr ipa; - int s1_prot; + int s1_prot, s1_lgpgsz; bool is_secure = ptw->in_secure; bool ret, ipa_secure, s2walk_secure; ARMCacheAttrs cacheattrs1; @@ -2470,6 +2599,7 @@ static bool get_phys_addr_twostage(CPUARMState *env, S1Translate *ptw, is_el0 = ptw->in_mmu_idx == ARMMMUIdx_Stage1_E0; ptw->in_mmu_idx = s2walk_secure ? ARMMMUIdx_Stage2_S : ARMMMUIdx_Stage2; + ptw->in_ptw_idx = s2walk_secure ? ARMMMUIdx_Phys_S : ARMMMUIdx_Phys_NS; ptw->in_secure = s2walk_secure; /* @@ -2477,6 +2607,7 @@ static bool get_phys_addr_twostage(CPUARMState *env, S1Translate *ptw, * Save the stage1 results so that we may merge prot and cacheattrs later. */ s1_prot = result->f.prot; + s1_lgpgsz = result->f.lg_page_size; cacheattrs1 = result->cacheattrs; memset(result, 0, sizeof(*result)); @@ -2491,6 +2622,14 @@ static bool get_phys_addr_twostage(CPUARMState *env, S1Translate *ptw, return ret; } + /* + * Use the maximum of the S1 & S2 page size, so that invalidation + * of pages > TARGET_PAGE_SIZE works correctly. + */ + if (result->f.lg_page_size < s1_lgpgsz) { + result->f.lg_page_size = s1_lgpgsz; + } + /* Combine the S1 and S2 cache attributes. */ hcr = arm_hcr_el2_eff_secstate(env, is_secure); if (hcr & HCR_DC) { @@ -2529,10 +2668,32 @@ static bool get_phys_addr_with_struct(CPUARMState *env, S1Translate *ptw, ARMMMUFaultInfo *fi) { ARMMMUIdx mmu_idx = ptw->in_mmu_idx; - ARMMMUIdx s1_mmu_idx = stage_1_mmu_idx(mmu_idx); bool is_secure = ptw->in_secure; + ARMMMUIdx s1_mmu_idx; + + switch (mmu_idx) { + case ARMMMUIdx_Phys_S: + case ARMMMUIdx_Phys_NS: + /* Checking Phys early avoids special casing later vs regime_el. */ + return get_phys_addr_disabled(env, address, access_type, mmu_idx, + is_secure, result, fi); + + case ARMMMUIdx_Stage1_E0: + case ARMMMUIdx_Stage1_E1: + case ARMMMUIdx_Stage1_E1_PAN: + /* First stage lookup uses second stage for ptw. */ + ptw->in_ptw_idx = is_secure ? ARMMMUIdx_Stage2_S : ARMMMUIdx_Stage2; + break; - if (mmu_idx != s1_mmu_idx) { + case ARMMMUIdx_E10_0: + s1_mmu_idx = ARMMMUIdx_Stage1_E0; + goto do_twostage; + case ARMMMUIdx_E10_1: + s1_mmu_idx = ARMMMUIdx_Stage1_E1; + goto do_twostage; + case ARMMMUIdx_E10_1_PAN: + s1_mmu_idx = ARMMMUIdx_Stage1_E1_PAN; + do_twostage: /* * Call ourselves recursively to do the stage 1 and then stage 2 * translations if mmu_idx is a two-stage regime, and EL2 present. @@ -2543,6 +2704,12 @@ static bool get_phys_addr_with_struct(CPUARMState *env, S1Translate *ptw, return get_phys_addr_twostage(env, ptw, address, access_type, result, fi); } + /* fall through */ + + default: + /* Single stage and second stage uses physical for ptw. */ + ptw->in_ptw_idx = is_secure ? ARMMMUIdx_Phys_S : ARMMMUIdx_Phys_NS; + break; } /* diff --git a/target/arm/translate.c b/target/arm/translate.c index d1b868430e..74a903072f 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -9939,25 +9939,3 @@ void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int max_insns, translator_loop(cpu, tb, max_insns, pc, host_pc, ops, &dc.base); } - -void restore_state_to_opc(CPUARMState *env, TranslationBlock *tb, - target_ulong *data) -{ - if (is_a64(env)) { - if (TARGET_TB_PCREL) { - env->pc = (env->pc & TARGET_PAGE_MASK) | data[0]; - } else { - env->pc = data[0]; - } - env->condexec_bits = 0; - env->exception.syndrome = data[2] << ARM_INSN_START_WORD2_SHIFT; - } else { - if (TARGET_TB_PCREL) { - env->regs[15] = (env->regs[15] & TARGET_PAGE_MASK) | data[0]; - } else { - env->regs[15] = data[0]; - } - env->condexec_bits = data[1]; - env->exception.syndrome = data[2] << ARM_INSN_START_WORD2_SHIFT; - } -} diff --git a/target/avr/cpu.c b/target/avr/cpu.c index 0d2861179d..c7295b488d 100644 --- a/target/avr/cpu.c +++ b/target/avr/cpu.c @@ -57,6 +57,16 @@ static void avr_cpu_synchronize_from_tb(CPUState *cs, env->pc_w = tb_pc(tb) / 2; /* internally PC points to words */ } +static void avr_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + AVRCPU *cpu = AVR_CPU(cs); + CPUAVRState *env = &cpu->env; + + env->pc_w = data[0]; +} + static void avr_cpu_reset(DeviceState *ds) { CPUState *cs = CPU(ds); @@ -202,6 +212,7 @@ static const struct SysemuCPUOps avr_sysemu_ops = { static const struct TCGCPUOps avr_tcg_ops = { .initialize = avr_cpu_tcg_init, .synchronize_from_tb = avr_cpu_synchronize_from_tb, + .restore_state_to_opc = avr_restore_state_to_opc, .cpu_exec_interrupt = avr_cpu_exec_interrupt, .tlb_fill = avr_cpu_tlb_fill, .do_interrupt = avr_cpu_do_interrupt, diff --git a/target/avr/translate.c b/target/avr/translate.c index e65b6008c0..2bed56f135 100644 --- a/target/avr/translate.c +++ b/target/avr/translate.c @@ -3055,9 +3055,3 @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns, DisasContext dc = { }; translator_loop(cs, tb, max_insns, pc, host_pc, &avr_tr_ops, &dc.base); } - -void restore_state_to_opc(CPUAVRState *env, TranslationBlock *tb, - target_ulong *data) -{ - env->pc_w = data[0]; -} diff --git a/target/cris/cpu.c b/target/cris/cpu.c index 22f5c70f39..fb05dc6f9a 100644 --- a/target/cris/cpu.c +++ b/target/cris/cpu.c @@ -42,6 +42,15 @@ static vaddr cris_cpu_get_pc(CPUState *cs) return cpu->env.pc; } +static void cris_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + CRISCPU *cpu = CRIS_CPU(cs); + + cpu->env.pc = data[0]; +} + static bool cris_cpu_has_work(CPUState *cs) { return cs->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI); @@ -212,6 +221,7 @@ static const struct SysemuCPUOps cris_sysemu_ops = { static const struct TCGCPUOps crisv10_tcg_ops = { .initialize = cris_initialize_crisv10_tcg, + .restore_state_to_opc = cris_restore_state_to_opc, #ifndef CONFIG_USER_ONLY .tlb_fill = cris_cpu_tlb_fill, @@ -222,6 +232,7 @@ static const struct TCGCPUOps crisv10_tcg_ops = { static const struct TCGCPUOps crisv32_tcg_ops = { .initialize = cris_initialize_tcg, + .restore_state_to_opc = cris_restore_state_to_opc, #ifndef CONFIG_USER_ONLY .tlb_fill = cris_cpu_tlb_fill, diff --git a/target/cris/translate.c b/target/cris/translate.c index 73385b0b3c..fbc3fd5865 100644 --- a/target/cris/translate.c +++ b/target/cris/translate.c @@ -3392,9 +3392,3 @@ void cris_initialize_tcg(void) pregnames_v32[i]); } } - -void restore_state_to_opc(CPUCRISState *env, TranslationBlock *tb, - target_ulong *data) -{ - env->pc = data[0]; -} diff --git a/target/hexagon/cpu.c b/target/hexagon/cpu.c index fa6d722555..03221fbdc2 100644 --- a/target/hexagon/cpu.c +++ b/target/hexagon/cpu.c @@ -271,9 +271,13 @@ static bool hexagon_cpu_has_work(CPUState *cs) return true; } -void restore_state_to_opc(CPUHexagonState *env, TranslationBlock *tb, - target_ulong *data) +static void hexagon_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) { + HexagonCPU *cpu = HEXAGON_CPU(cs); + CPUHexagonState *env = &cpu->env; + env->gpr[HEX_REG_PC] = data[0]; } @@ -327,6 +331,7 @@ static void hexagon_cpu_init(Object *obj) static const struct TCGCPUOps hexagon_tcg_ops = { .initialize = hexagon_translate_init, .synchronize_from_tb = hexagon_cpu_synchronize_from_tb, + .restore_state_to_opc = hexagon_restore_state_to_opc, }; static void hexagon_cpu_class_init(ObjectClass *c, void *data) diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c index e677ca09d4..55c190280e 100644 --- a/target/hppa/cpu.c +++ b/target/hppa/cpu.c @@ -68,6 +68,24 @@ static void hppa_cpu_synchronize_from_tb(CPUState *cs, cpu->env.psw_n = (tb->flags & PSW_N) != 0; } +static void hppa_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + HPPACPU *cpu = HPPA_CPU(cs); + + cpu->env.iaoq_f = data[0]; + if (data[1] != (target_ureg)-1) { + cpu->env.iaoq_b = data[1]; + } + /* + * Since we were executing the instruction at IAOQ_F, and took some + * sort of action that provoked the cpu_restore_state, we can infer + * that the instruction was not nullified. + */ + cpu->env.psw_n = 0; +} + static bool hppa_cpu_has_work(CPUState *cs) { return cs->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI); @@ -153,6 +171,7 @@ static const struct SysemuCPUOps hppa_sysemu_ops = { static const struct TCGCPUOps hppa_tcg_ops = { .initialize = hppa_translate_init, .synchronize_from_tb = hppa_cpu_synchronize_from_tb, + .restore_state_to_opc = hppa_restore_state_to_opc, #ifndef CONFIG_USER_ONLY .tlb_fill = hppa_cpu_tlb_fill, diff --git a/target/hppa/translate.c b/target/hppa/translate.c index 8b861957e0..1af77473da 100644 --- a/target/hppa/translate.c +++ b/target/hppa/translate.c @@ -4346,16 +4346,3 @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns, DisasContext ctx; translator_loop(cs, tb, max_insns, pc, host_pc, &hppa_tr_ops, &ctx.base); } - -void restore_state_to_opc(CPUHPPAState *env, TranslationBlock *tb, - target_ulong *data) -{ - env->iaoq_f = data[0]; - if (data[1] != (target_ureg)-1) { - env->iaoq_b = data[1]; - } - /* Since we were executing the instruction at IAOQ_F, and took some - sort of action that provoked the cpu_restore_state, we can infer - that the instruction was not nullified. */ - env->psw_n = 0; -} diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index dac100c67c..4df0428089 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -2262,8 +2262,7 @@ static int kvm_get_supported_feature_msrs(KVMState *s) } assert(msr_list.nmsrs > 0); - kvm_feature_msrs = (struct kvm_msr_list *) \ - g_malloc0(sizeof(msr_list) + + kvm_feature_msrs = g_malloc0(sizeof(msr_list) + msr_list.nmsrs * sizeof(msr_list.indices[0])); kvm_feature_msrs->nmsrs = msr_list.nmsrs; diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c index 828244abe2..79ac5908f7 100644 --- a/target/i386/tcg/tcg-cpu.c +++ b/target/i386/tcg/tcg-cpu.c @@ -56,6 +56,24 @@ static void x86_cpu_synchronize_from_tb(CPUState *cs, } } +static void x86_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + X86CPU *cpu = X86_CPU(cs); + CPUX86State *env = &cpu->env; + int cc_op = data[1]; + + if (TARGET_TB_PCREL) { + env->eip = (env->eip & TARGET_PAGE_MASK) | data[0]; + } else { + env->eip = data[0] - tb->cs_base; + } + if (cc_op != CC_OP_DYNAMIC) { + env->cc_op = cc_op; + } +} + #ifndef CONFIG_USER_ONLY static bool x86_debug_check_breakpoint(CPUState *cs) { @@ -72,6 +90,7 @@ static bool x86_debug_check_breakpoint(CPUState *cs) static const struct TCGCPUOps x86_tcg_ops = { .initialize = tcg_x86_init, .synchronize_from_tb = x86_cpu_synchronize_from_tb, + .restore_state_to_opc = x86_restore_state_to_opc, .cpu_exec_enter = x86_cpu_exec_enter, .cpu_exec_exit = x86_cpu_exec_exit, #ifdef CONFIG_USER_ONLY diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 85be2e58c2..546c427c23 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -7023,18 +7023,3 @@ void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int max_insns, translator_loop(cpu, tb, max_insns, pc, host_pc, &i386_tr_ops, &dc.base); } - -void restore_state_to_opc(CPUX86State *env, TranslationBlock *tb, - target_ulong *data) -{ - int cc_op = data[1]; - - if (TARGET_TB_PCREL) { - env->eip = (env->eip & TARGET_PAGE_MASK) | data[0]; - } else { - env->eip = data[0] - tb->cs_base; - } - if (cc_op != CC_OP_DYNAMIC) { - env->cc_op = cc_op; - } -} diff --git a/target/i386/whpx/whpx-all.c b/target/i386/whpx/whpx-all.c index 8e4969edeb..e738d83e81 100644 --- a/target/i386/whpx/whpx-all.c +++ b/target/i386/whpx/whpx-all.c @@ -1164,9 +1164,8 @@ static void whpx_translate_cpu_breakpoints( (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0); struct whpx_breakpoint_collection *new_breakpoints = - (struct whpx_breakpoint_collection *)g_malloc0( - sizeof(struct whpx_breakpoint_collection) + - max_breakpoints * sizeof(struct whpx_breakpoint)); + g_malloc0(sizeof(struct whpx_breakpoint_collection) + + max_breakpoints * sizeof(struct whpx_breakpoint)); new_breakpoints->allocated = max_breakpoints; new_breakpoints->used = 0; diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 1722ed2a4d..49393d95d8 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -319,6 +319,16 @@ static void loongarch_cpu_synchronize_from_tb(CPUState *cs, env->pc = tb_pc(tb); } + +static void loongarch_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + env->pc = data[0]; +} #endif /* CONFIG_TCG */ static bool loongarch_cpu_has_work(CPUState *cs) @@ -651,6 +661,7 @@ void loongarch_cpu_dump_state(CPUState *cs, FILE *f, int flags) static struct TCGCPUOps loongarch_tcg_ops = { .initialize = loongarch_translate_init, .synchronize_from_tb = loongarch_cpu_synchronize_from_tb, + .restore_state_to_opc = loongarch_restore_state_to_opc, #ifndef CONFIG_USER_ONLY .tlb_fill = loongarch_cpu_tlb_fill, diff --git a/target/loongarch/translate.c b/target/loongarch/translate.c index 95b37ea180..6091772349 100644 --- a/target/loongarch/translate.c +++ b/target/loongarch/translate.c @@ -272,9 +272,3 @@ void loongarch_translate_init(void) cpu_llval = tcg_global_mem_new(cpu_env, offsetof(CPULoongArchState, llval), "llval"); } - -void restore_state_to_opc(CPULoongArchState *env, TranslationBlock *tb, - target_ulong *data) -{ - env->pc = data[0]; -} diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c index 1e902e1ef0..b67ddea2ae 100644 --- a/target/m68k/cpu.c +++ b/target/m68k/cpu.c @@ -38,6 +38,19 @@ static vaddr m68k_cpu_get_pc(CPUState *cs) return cpu->env.pc; } +static void m68k_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + M68kCPU *cpu = M68K_CPU(cs); + int cc_op = data[1]; + + cpu->env.pc = data[0]; + if (cc_op != CC_OP_DYNAMIC) { + cpu->env.cc_op = cc_op; + } +} + static bool m68k_cpu_has_work(CPUState *cs) { return cs->interrupt_request & CPU_INTERRUPT_HARD; @@ -524,6 +537,7 @@ static const struct SysemuCPUOps m68k_sysemu_ops = { static const struct TCGCPUOps m68k_tcg_ops = { .initialize = m68k_tcg_init, + .restore_state_to_opc = m68k_restore_state_to_opc, #ifndef CONFIG_USER_ONLY .tlb_fill = m68k_cpu_tlb_fill, diff --git a/target/m68k/translate.c b/target/m68k/translate.c index 9df17aa4b2..5cbde4be34 100644 --- a/target/m68k/translate.c +++ b/target/m68k/translate.c @@ -6479,13 +6479,3 @@ void m68k_cpu_dump_state(CPUState *cs, FILE *f, int flags) env->mmu.mmusr, env->mmu.ar); #endif } - -void restore_state_to_opc(CPUM68KState *env, TranslationBlock *tb, - target_ulong *data) -{ - int cc_op = data[1]; - env->pc = data[0]; - if (cc_op != CC_OP_DYNAMIC) { - env->cc_op = cc_op; - } -} diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c index c10b8ac029..89e493f3ff 100644 --- a/target/microblaze/cpu.c +++ b/target/microblaze/cpu.c @@ -100,6 +100,16 @@ static void mb_cpu_synchronize_from_tb(CPUState *cs, cpu->env.iflags = tb->flags & IFLAGS_TB_MASK; } +static void mb_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + MicroBlazeCPU *cpu = MICROBLAZE_CPU(cs); + + cpu->env.pc = data[0]; + cpu->env.iflags = data[1]; +} + static bool mb_cpu_has_work(CPUState *cs) { return cs->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI); @@ -373,6 +383,7 @@ static const struct SysemuCPUOps mb_sysemu_ops = { static const struct TCGCPUOps mb_tcg_ops = { .initialize = mb_tcg_init, .synchronize_from_tb = mb_cpu_synchronize_from_tb, + .restore_state_to_opc = mb_restore_state_to_opc, #ifndef CONFIG_USER_ONLY .tlb_fill = mb_cpu_tlb_fill, diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c index c5546f93aa..974f21eb31 100644 --- a/target/microblaze/translate.c +++ b/target/microblaze/translate.c @@ -1946,10 +1946,3 @@ void mb_tcg_init(void) cpu_res_addr = tcg_global_mem_new(cpu_env, offsetof(CPUMBState, res_addr), "res_addr"); } - -void restore_state_to_opc(CPUMBState *env, TranslationBlock *tb, - target_ulong *data) -{ - env->pc = data[0]; - env->iflags = data[1]; -} diff --git a/target/mips/cpu.c b/target/mips/cpu.c index da58eb8892..e997c1b9cb 100644 --- a/target/mips/cpu.c +++ b/target/mips/cpu.c @@ -538,6 +538,7 @@ static const struct SysemuCPUOps mips_sysemu_ops = { static const struct TCGCPUOps mips_tcg_ops = { .initialize = mips_tcg_init, .synchronize_from_tb = mips_cpu_synchronize_from_tb, + .restore_state_to_opc = mips_restore_state_to_opc, #if !defined(CONFIG_USER_ONLY) .tlb_fill = mips_cpu_tlb_fill, diff --git a/target/mips/tcg/tcg-internal.h b/target/mips/tcg/tcg-internal.h index 1d27fa2ff9..aef032c48d 100644 --- a/target/mips/tcg/tcg-internal.h +++ b/target/mips/tcg/tcg-internal.h @@ -21,6 +21,9 @@ void mips_cpu_synchronize_from_tb(CPUState *cs, const TranslationBlock *tb); G_NORETURN void mips_cpu_do_unaligned_access(CPUState *cpu, vaddr addr, MMUAccessType access_type, int mmu_idx, uintptr_t retaddr); +void mips_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data); const char *mips_exception_name(int32_t exception); diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c index c3f92ea652..2f2d707a12 100644 --- a/target/mips/tcg/translate.c +++ b/target/mips/tcg/translate.c @@ -16229,9 +16229,13 @@ void mips_tcg_init(void) } } -void restore_state_to_opc(CPUMIPSState *env, TranslationBlock *tb, - target_ulong *data) +void mips_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) { + MIPSCPU *cpu = MIPS_CPU(cs); + CPUMIPSState *env = &cpu->env; + env->active_tc.PC = data[0]; env->hflags &= ~MIPS_HFLAG_BMASK; env->hflags |= data[1]; diff --git a/target/nios2/cpu.c b/target/nios2/cpu.c index 2b28429c08..9a5351bc81 100644 --- a/target/nios2/cpu.c +++ b/target/nios2/cpu.c @@ -42,6 +42,16 @@ static vaddr nios2_cpu_get_pc(CPUState *cs) return env->pc; } +static void nios2_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + Nios2CPU *cpu = NIOS2_CPU(cs); + CPUNios2State *env = &cpu->env; + + env->pc = data[0]; +} + static bool nios2_cpu_has_work(CPUState *cs) { return cs->interrupt_request & CPU_INTERRUPT_HARD; @@ -346,6 +356,7 @@ static const struct SysemuCPUOps nios2_sysemu_ops = { static const struct TCGCPUOps nios2_tcg_ops = { .initialize = nios2_tcg_init, + .restore_state_to_opc = nios2_restore_state_to_opc, #ifndef CONFIG_USER_ONLY .tlb_fill = nios2_cpu_tlb_fill, diff --git a/target/nios2/translate.c b/target/nios2/translate.c index 8dc0a32c6c..4db8b47744 100644 --- a/target/nios2/translate.c +++ b/target/nios2/translate.c @@ -1110,9 +1110,3 @@ void nios2_tcg_init(void) cpu_pc = tcg_global_mem_new(cpu_env, offsetof(CPUNios2State, pc), "pc"); } - -void restore_state_to_opc(CPUNios2State *env, TranslationBlock *tb, - target_ulong *data) -{ - env->pc = data[0]; -} diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c index f6fd437785..de0176cd20 100644 --- a/target/openrisc/cpu.c +++ b/target/openrisc/cpu.c @@ -46,6 +46,18 @@ static void openrisc_cpu_synchronize_from_tb(CPUState *cs, cpu->env.pc = tb_pc(tb); } +static void openrisc_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + OpenRISCCPU *cpu = OPENRISC_CPU(cs); + + cpu->env.pc = data[0]; + cpu->env.dflag = data[1] & 1; + if (data[1] & 2) { + cpu->env.ppc = cpu->env.pc - 4; + } +} static bool openrisc_cpu_has_work(CPUState *cs) { @@ -203,6 +215,7 @@ static const struct SysemuCPUOps openrisc_sysemu_ops = { static const struct TCGCPUOps openrisc_tcg_ops = { .initialize = openrisc_translate_init, .synchronize_from_tb = openrisc_cpu_synchronize_from_tb, + .restore_state_to_opc = openrisc_restore_state_to_opc, #ifndef CONFIG_USER_ONLY .tlb_fill = openrisc_cpu_tlb_fill, diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c index 8154f9d744..2f3d7c5fd1 100644 --- a/target/openrisc/translate.c +++ b/target/openrisc/translate.c @@ -1726,13 +1726,3 @@ void openrisc_cpu_dump_state(CPUState *cs, FILE *f, int flags) (i % 4) == 3 ? '\n' : ' '); } } - -void restore_state_to_opc(CPUOpenRISCState *env, TranslationBlock *tb, - target_ulong *data) -{ - env->pc = data[0]; - env->dflag = data[1] & 1; - if (data[1] & 2) { - env->ppc = env->pc - 4; - } -} diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c index 763a8431be..335351c226 100644 --- a/target/ppc/cpu_init.c +++ b/target/ppc/cpu_init.c @@ -7221,6 +7221,15 @@ static vaddr ppc_cpu_get_pc(CPUState *cs) return cpu->env.nip; } +static void ppc_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + PowerPCCPU *cpu = POWERPC_CPU(cs); + + cpu->env.nip = data[0]; +} + static bool ppc_cpu_has_work(CPUState *cs) { PowerPCCPU *cpu = POWERPC_CPU(cs); @@ -7446,6 +7455,7 @@ static const struct SysemuCPUOps ppc_sysemu_ops = { static const struct TCGCPUOps ppc_tcg_ops = { .initialize = ppc_translate_init, + .restore_state_to_opc = ppc_restore_state_to_opc, #ifdef CONFIG_USER_ONLY .record_sigsegv = ppc_cpu_record_sigsegv, diff --git a/target/ppc/translate.c b/target/ppc/translate.c index e810842925..7228857e23 100644 --- a/target/ppc/translate.c +++ b/target/ppc/translate.c @@ -7739,9 +7739,3 @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns, translator_loop(cs, tb, max_insns, pc, host_pc, &ppc_tr_ops, &ctx.base); } - -void restore_state_to_opc(CPUPPCState *env, TranslationBlock *tb, - target_ulong *data) -{ - env->nip = data[0]; -} diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c index e6d9c706bb..d14e95c9dc 100644 --- a/target/riscv/cpu.c +++ b/target/riscv/cpu.c @@ -503,10 +503,14 @@ static bool riscv_cpu_has_work(CPUState *cs) #endif } -void restore_state_to_opc(CPURISCVState *env, TranslationBlock *tb, - target_ulong *data) +static void riscv_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) { + RISCVCPU *cpu = RISCV_CPU(cs); + CPURISCVState *env = &cpu->env; RISCVMXL xl = FIELD_EX32(tb->flags, TB_FLAGS, XL); + if (xl == MXL_RV32) { env->pc = (int32_t)data[0]; } else { @@ -1138,6 +1142,7 @@ static const struct SysemuCPUOps riscv_sysemu_ops = { static const struct TCGCPUOps riscv_tcg_ops = { .initialize = riscv_translate_init, .synchronize_from_tb = riscv_cpu_synchronize_from_tb, + .restore_state_to_opc = riscv_restore_state_to_opc, #ifndef CONFIG_USER_ONLY .tlb_fill = riscv_cpu_tlb_fill, diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index b94f809eb3..0020b9a95d 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -211,7 +211,7 @@ static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt, return; } if (tot - cnt == 0) { - return ; + return; } memset(base + cnt, -1, tot - cnt); } diff --git a/target/rx/cpu.c b/target/rx/cpu.c index 2f28099723..9003c6e9fe 100644 --- a/target/rx/cpu.c +++ b/target/rx/cpu.c @@ -47,6 +47,15 @@ static void rx_cpu_synchronize_from_tb(CPUState *cs, cpu->env.pc = tb_pc(tb); } +static void rx_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + RXCPU *cpu = RX_CPU(cs); + + cpu->env.pc = data[0]; +} + static bool rx_cpu_has_work(CPUState *cs) { return cs->interrupt_request & @@ -192,6 +201,7 @@ static const struct SysemuCPUOps rx_sysemu_ops = { static const struct TCGCPUOps rx_tcg_ops = { .initialize = rx_translate_init, .synchronize_from_tb = rx_cpu_synchronize_from_tb, + .restore_state_to_opc = rx_restore_state_to_opc, .tlb_fill = rx_cpu_tlb_fill, #ifndef CONFIG_USER_ONLY diff --git a/target/rx/op_helper.c b/target/rx/op_helper.c index 9ca32dcc82..acce650185 100644 --- a/target/rx/op_helper.c +++ b/target/rx/op_helper.c @@ -286,7 +286,7 @@ void helper_suntil(CPURXState *env, uint32_t sz) uint32_t tmp; tcg_debug_assert(sz < 3); if (env->regs[3] == 0) { - return ; + return; } do { tmp = cpu_ldufn[sz](env, env->regs[1], GETPC()); @@ -305,7 +305,7 @@ void helper_swhile(CPURXState *env, uint32_t sz) uint32_t tmp; tcg_debug_assert(sz < 3); if (env->regs[3] == 0) { - return ; + return; } do { tmp = cpu_ldufn[sz](env, env->regs[1], GETPC()); diff --git a/target/rx/translate.c b/target/rx/translate.c index ea5653bc95..87a3f54adb 100644 --- a/target/rx/translate.c +++ b/target/rx/translate.c @@ -2371,12 +2371,6 @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns, translator_loop(cs, tb, max_insns, pc, host_pc, &rx_tr_ops, &dc.base); } -void restore_state_to_opc(CPURXState *env, TranslationBlock *tb, - target_ulong *data) -{ - env->pc = data[0]; -} - #define ALLOC_REGISTER(sym, name) \ cpu_##sym = tcg_global_mem_new_i32(cpu_env, \ offsetof(CPURXState, sym), name) diff --git a/target/s390x/arch_dump.c b/target/s390x/arch_dump.c index f60a14920d..a2329141e8 100644 --- a/target/s390x/arch_dump.c +++ b/target/s390x/arch_dump.c @@ -12,11 +12,13 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "cpu.h" #include "s390x-internal.h" #include "elf.h" #include "sysemu/dump.h" - +#include "hw/s390x/pv.h" +#include "kvm/kvm_s390x.h" struct S390xUserRegsStruct { uint64_t psw[2]; @@ -76,9 +78,16 @@ typedef struct noteStruct { uint64_t todcmp; uint32_t todpreg; uint64_t ctrs[16]; + uint8_t dynamic[1]; /* + * Would be a flexible array member, if + * that was legal inside a union. Real + * size comes from PV info interface. + */ } contents; } QEMU_PACKED Note; +static bool pv_dump_initialized; + static void s390x_write_elf64_prstatus(Note *note, S390CPU *cpu, int id) { int i; @@ -177,28 +186,39 @@ static void s390x_write_elf64_prefix(Note *note, S390CPU *cpu, int id) note->contents.prefix = cpu_to_be32((uint32_t)(cpu->env.psa)); } +static void s390x_write_elf64_pv(Note *note, S390CPU *cpu, int id) +{ + note->hdr.n_type = cpu_to_be32(NT_S390_PV_CPU_DATA); + if (!pv_dump_initialized) { + return; + } + kvm_s390_dump_cpu(cpu, ¬e->contents.dynamic); +} typedef struct NoteFuncDescStruct { int contents_size; + uint64_t (*note_size_func)(void); /* NULL for non-dynamic sized contents */ void (*note_contents_func)(Note *note, S390CPU *cpu, int id); + bool pvonly; } NoteFuncDesc; static const NoteFuncDesc note_core[] = { - {sizeof_field(Note, contents.prstatus), s390x_write_elf64_prstatus}, - {sizeof_field(Note, contents.fpregset), s390x_write_elf64_fpregset}, - { 0, NULL} + {sizeof_field(Note, contents.prstatus), NULL, s390x_write_elf64_prstatus, false}, + {sizeof_field(Note, contents.fpregset), NULL, s390x_write_elf64_fpregset, false}, + { 0, NULL, NULL, false} }; static const NoteFuncDesc note_linux[] = { - {sizeof_field(Note, contents.prefix), s390x_write_elf64_prefix}, - {sizeof_field(Note, contents.ctrs), s390x_write_elf64_ctrs}, - {sizeof_field(Note, contents.timer), s390x_write_elf64_timer}, - {sizeof_field(Note, contents.todcmp), s390x_write_elf64_todcmp}, - {sizeof_field(Note, contents.todpreg), s390x_write_elf64_todpreg}, - {sizeof_field(Note, contents.vregslo), s390x_write_elf64_vregslo}, - {sizeof_field(Note, contents.vregshi), s390x_write_elf64_vregshi}, - {sizeof_field(Note, contents.gscb), s390x_write_elf64_gscb}, - { 0, NULL} + {sizeof_field(Note, contents.prefix), NULL, s390x_write_elf64_prefix, false}, + {sizeof_field(Note, contents.ctrs), NULL, s390x_write_elf64_ctrs, false}, + {sizeof_field(Note, contents.timer), NULL, s390x_write_elf64_timer, false}, + {sizeof_field(Note, contents.todcmp), NULL, s390x_write_elf64_todcmp, false}, + {sizeof_field(Note, contents.todpreg), NULL, s390x_write_elf64_todpreg, false}, + {sizeof_field(Note, contents.vregslo), NULL, s390x_write_elf64_vregslo, false}, + {sizeof_field(Note, contents.vregshi), NULL, s390x_write_elf64_vregshi, false}, + {sizeof_field(Note, contents.gscb), NULL, s390x_write_elf64_gscb, false}, + {0, kvm_s390_pv_dmp_get_size_cpu, s390x_write_elf64_pv, true}, + { 0, NULL, NULL, false} }; static int s390x_write_elf64_notes(const char *note_name, @@ -207,22 +227,41 @@ static int s390x_write_elf64_notes(const char *note_name, DumpState *s, const NoteFuncDesc *funcs) { - Note note; + Note note, *notep; const NoteFuncDesc *nf; - int note_size; + int note_size, content_size; int ret = -1; assert(strlen(note_name) < sizeof(note.name)); for (nf = funcs; nf->note_contents_func; nf++) { - memset(¬e, 0, sizeof(note)); - note.hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1); - note.hdr.n_descsz = cpu_to_be32(nf->contents_size); - g_strlcpy(note.name, note_name, sizeof(note.name)); - (*nf->note_contents_func)(¬e, cpu, id); + notep = ¬e; + if (nf->pvonly && !s390_is_pv()) { + continue; + } + + content_size = nf->note_size_func ? nf->note_size_func() : nf->contents_size; + note_size = sizeof(note) - sizeof(notep->contents) + content_size; + + /* Notes with dynamic sizes need to allocate a note */ + if (nf->note_size_func) { + notep = g_malloc(note_size); + } + + memset(notep, 0, sizeof(note)); - note_size = sizeof(note) - sizeof(note.contents) + nf->contents_size; - ret = f(¬e, note_size, s); + /* Setup note header data */ + notep->hdr.n_descsz = cpu_to_be32(content_size); + notep->hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1); + g_strlcpy(notep->name, note_name, sizeof(notep->name)); + + /* Get contents and write them out */ + (*nf->note_contents_func)(notep, cpu, id); + ret = f(notep, note_size, s); + + if (nf->note_size_func) { + g_free(notep); + } if (ret < 0) { return -1; @@ -247,13 +286,179 @@ int s390_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs, return s390x_write_elf64_notes("LINUX", f, cpu, cpuid, s, note_linux); } +/* PV dump section size functions */ +static uint64_t get_mem_state_size_from_len(uint64_t len) +{ + return (len / (MiB)) * kvm_s390_pv_dmp_get_size_mem_state(); +} + +static uint64_t get_size_mem_state(DumpState *s) +{ + return get_mem_state_size_from_len(s->total_size); +} + +static uint64_t get_size_completion_data(DumpState *s) +{ + return kvm_s390_pv_dmp_get_size_completion_data(); +} + +/* PV dump section data functions*/ +static int get_data_completion(DumpState *s, uint8_t *buff) +{ + int rc; + + if (!pv_dump_initialized) { + return 0; + } + rc = kvm_s390_dump_completion_data(buff); + if (!rc) { + pv_dump_initialized = false; + } + return rc; +} + +static int get_mem_state(DumpState *s, uint8_t *buff) +{ + int64_t memblock_size, memblock_start; + GuestPhysBlock *block; + uint64_t off; + int rc; + + QTAILQ_FOREACH(block, &s->guest_phys_blocks.head, next) { + memblock_start = dump_filtered_memblock_start(block, s->filter_area_begin, + s->filter_area_length); + if (memblock_start == -1) { + continue; + } + + memblock_size = dump_filtered_memblock_size(block, s->filter_area_begin, + s->filter_area_length); + + off = get_mem_state_size_from_len(block->target_start); + + rc = kvm_s390_dump_mem_state(block->target_start, + get_mem_state_size_from_len(memblock_size), + buff + off); + if (rc) { + return rc; + } + } + + return 0; +} + +static struct sections { + uint64_t (*sections_size_func)(DumpState *s); + int (*sections_contents_func)(DumpState *s, uint8_t *buff); + char sctn_str[12]; +} sections[] = { + { get_size_mem_state, get_mem_state, "pv_mem_meta"}, + { get_size_completion_data, get_data_completion, "pv_compl"}, + {NULL , NULL, ""} +}; + +static uint64_t arch_sections_write_hdr(DumpState *s, uint8_t *buff) +{ + Elf64_Shdr *shdr = (void *)buff; + struct sections *sctn = sections; + uint64_t off = s->section_offset; + + if (!pv_dump_initialized) { + return 0; + } + + for (; sctn->sections_size_func; off += shdr->sh_size, sctn++, shdr++) { + memset(shdr, 0, sizeof(*shdr)); + shdr->sh_type = SHT_PROGBITS; + shdr->sh_offset = off; + shdr->sh_size = sctn->sections_size_func(s); + shdr->sh_name = s->string_table_buf->len; + g_array_append_vals(s->string_table_buf, sctn->sctn_str, sizeof(sctn->sctn_str)); + } + + return (uintptr_t)shdr - (uintptr_t)buff; +} + + +/* Add arch specific number of sections and their respective sizes */ +static void arch_sections_add(DumpState *s) +{ + struct sections *sctn = sections; + + /* + * We only do a PV dump if we are running a PV guest, KVM supports + * the dump API and we got valid dump length information. + */ + if (!s390_is_pv() || !kvm_s390_get_protected_dump() || + !kvm_s390_pv_info_basic_valid()) { + return; + } + + /* + * Start the UV dump process by doing the initialize dump call via + * KVM as the proxy. + */ + if (!kvm_s390_dump_init()) { + pv_dump_initialized = true; + } else { + /* + * Dump init failed, maybe the guest owner disabled dumping. + * We'll continue the non-PV dump process since this is no + * reason to crash qemu. + */ + return; + } + + for (; sctn->sections_size_func; sctn++) { + s->shdr_num += 1; + s->elf_section_data_size += sctn->sections_size_func(s); + } +} + +/* + * After the PV dump has been initialized, the CPU data has been + * fetched and memory has been dumped, we need to grab the tweak data + * and the completion data. + */ +static int arch_sections_write(DumpState *s, uint8_t *buff) +{ + struct sections *sctn = sections; + int rc; + + if (!pv_dump_initialized) { + return -EINVAL; + } + + for (; sctn->sections_size_func; sctn++) { + rc = sctn->sections_contents_func(s, buff); + buff += sctn->sections_size_func(s); + if (rc) { + return rc; + } + } + return 0; +} + int cpu_get_dump_info(ArchDumpInfo *info, const struct GuestPhysBlockList *guest_phys_blocks) { info->d_machine = EM_S390; info->d_endian = ELFDATA2MSB; info->d_class = ELFCLASS64; - + /* + * This is evaluated for each dump so we can freely switch + * between PV and non-PV. + */ + if (s390_is_pv() && kvm_s390_get_protected_dump() && + kvm_s390_pv_info_basic_valid()) { + info->arch_sections_add_fn = *arch_sections_add; + info->arch_sections_write_hdr_fn = *arch_sections_write_hdr; + info->arch_sections_write_fn = *arch_sections_write; + } else { + info->arch_sections_add_fn = NULL; + info->arch_sections_write_hdr_fn = NULL; + info->arch_sections_write_fn = NULL; + } return 0; } @@ -261,7 +466,7 @@ ssize_t cpu_get_note_size(int class, int machine, int nr_cpus) { int name_size = 8; /* "LINUX" or "CORE" + pad */ size_t elf_note_size = 0; - int note_head_size; + int note_head_size, content_size; const NoteFuncDesc *nf; assert(class == ELFCLASS64); @@ -270,12 +475,15 @@ ssize_t cpu_get_note_size(int class, int machine, int nr_cpus) note_head_size = sizeof(Elf64_Nhdr); for (nf = note_core; nf->note_contents_func; nf++) { - elf_note_size = elf_note_size + note_head_size + name_size + - nf->contents_size; + elf_note_size = elf_note_size + note_head_size + name_size + nf->contents_size; } for (nf = note_linux; nf->note_contents_func; nf++) { + if (nf->pvonly && !s390_is_pv()) { + continue; + } + content_size = nf->contents_size ? nf->contents_size : nf->note_size_func(); elf_note_size = elf_note_size + note_head_size + name_size + - nf->contents_size; + content_size; } return (elf_note_size) * nr_cpus; diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c index df00040e95..96562c516d 100644 --- a/target/s390x/cpu.c +++ b/target/s390x/cpu.c @@ -272,6 +272,7 @@ static void s390_cpu_reset_full(DeviceState *dev) static const struct TCGCPUOps s390_tcg_ops = { .initialize = s390x_translate_init, + .restore_state_to_opc = s390x_restore_state_to_opc, #ifdef CONFIG_USER_ONLY .record_sigsegv = s390_cpu_record_sigsegv, diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c index 508c24cfec..3ac7ec9acf 100644 --- a/target/s390x/kvm/kvm.c +++ b/target/s390x/kvm/kvm.c @@ -158,6 +158,7 @@ static int cap_hpage_1m; static int cap_vcpu_resets; static int cap_protected; static int cap_zpci_op; +static int cap_protected_dump; static bool mem_op_storage_key_support; @@ -364,6 +365,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) cap_vcpu_resets = kvm_check_extension(s, KVM_CAP_S390_VCPU_RESETS); cap_protected = kvm_check_extension(s, KVM_CAP_S390_PROTECTED); cap_zpci_op = kvm_check_extension(s, KVM_CAP_S390_ZPCI_OP); + cap_protected_dump = kvm_check_extension(s, KVM_CAP_S390_PROTECTED_DUMP); kvm_vm_enable_cap(s, KVM_CAP_S390_USER_SIGP, 0); kvm_vm_enable_cap(s, KVM_CAP_S390_VECTOR_REGISTERS, 0); @@ -1033,7 +1035,7 @@ int kvm_arch_remove_hw_breakpoint(target_ulong addr, } size = nb_hw_breakpoints * sizeof(struct kvm_hw_breakpoint); hw_breakpoints = - (struct kvm_hw_breakpoint *)g_realloc(hw_breakpoints, size); + g_realloc(hw_breakpoints, size); } else { g_free(hw_breakpoints); hw_breakpoints = NULL; @@ -2045,6 +2047,11 @@ int kvm_s390_assign_subch_ioeventfd(EventNotifier *notifier, uint32_t sch, return kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick); } +int kvm_s390_get_protected_dump(void) +{ + return cap_protected_dump; +} + int kvm_s390_get_ri(void) { return cap_ri; diff --git a/target/s390x/kvm/kvm_s390x.h b/target/s390x/kvm/kvm_s390x.h index aaae8570de..f9785564d0 100644 --- a/target/s390x/kvm/kvm_s390x.h +++ b/target/s390x/kvm/kvm_s390x.h @@ -26,6 +26,7 @@ int kvm_s390_set_cpu_state(S390CPU *cpu, uint8_t cpu_state); void kvm_s390_vcpu_interrupt_pre_save(S390CPU *cpu); int kvm_s390_vcpu_interrupt_post_load(S390CPU *cpu); int kvm_s390_get_hpage_1m(void); +int kvm_s390_get_protected_dump(void); int kvm_s390_get_ri(void); int kvm_s390_get_zpci_op(void); int kvm_s390_get_clock(uint8_t *tod_high, uint64_t *tod_clock); diff --git a/target/s390x/kvm/meson.build b/target/s390x/kvm/meson.build index d1356356b1..aef52b6686 100644 --- a/target/s390x/kvm/meson.build +++ b/target/s390x/kvm/meson.build @@ -1,6 +1,8 @@ s390x_ss.add(when: 'CONFIG_KVM', if_true: files( 'kvm.c' +), if_false: files( + 'stubs.c' )) # Newer kernels on s390 check for an S390_PGSTE program header and diff --git a/target/s390x/kvm/stubs.c b/target/s390x/kvm/stubs.c new file mode 100644 index 0000000000..5fd63b9a7e --- /dev/null +++ b/target/s390x/kvm/stubs.c @@ -0,0 +1,12 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" + +#include "kvm_s390x.h" + +int kvm_s390_get_protected_dump(void) +{ + return false; +} diff --git a/target/s390x/s390x-internal.h b/target/s390x/s390x-internal.h index b5ae0ae364..5d4361d35b 100644 --- a/target/s390x/s390x-internal.h +++ b/target/s390x/s390x-internal.h @@ -398,7 +398,9 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, /* translate.c */ void s390x_translate_init(void); - +void s390x_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data); /* sigp.c */ int handle_sigp(CPUS390XState *env, uint8_t order, uint64_t r1, uint64_t r3); diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c index 1d2dddab1c..5798928473 100644 --- a/target/s390x/tcg/translate.c +++ b/target/s390x/tcg/translate.c @@ -6691,9 +6691,12 @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns, translator_loop(cs, tb, max_insns, pc, host_pc, &s390x_tr_ops, &dc.base); } -void restore_state_to_opc(CPUS390XState *env, TranslationBlock *tb, - target_ulong *data) +void s390x_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) { + S390CPU *cpu = S390_CPU(cs); + CPUS390XState *env = &cpu->env; int cc_op = data[1]; env->psw.addr = data[0]; diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c index 56c50530da..453268392b 100644 --- a/target/sh4/cpu.c +++ b/target/sh4/cpu.c @@ -50,6 +50,21 @@ static void superh_cpu_synchronize_from_tb(CPUState *cs, cpu->env.flags = tb->flags; } +static void superh_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + SuperHCPU *cpu = SUPERH_CPU(cs); + + cpu->env.pc = data[0]; + cpu->env.flags = data[1]; + /* + * Theoretically delayed_pc should also be restored. In practice the + * branch instruction is re-executed after exception, so the delayed + * branch target will be recomputed. + */ +} + #ifndef CONFIG_USER_ONLY static bool superh_io_recompile_replay_branch(CPUState *cs, const TranslationBlock *tb) @@ -243,6 +258,7 @@ static const struct SysemuCPUOps sh4_sysemu_ops = { static const struct TCGCPUOps superh_tcg_ops = { .initialize = sh4_translate_init, .synchronize_from_tb = superh_cpu_synchronize_from_tb, + .restore_state_to_opc = superh_restore_state_to_opc, #ifndef CONFIG_USER_ONLY .tlb_fill = superh_cpu_tlb_fill, diff --git a/target/sh4/translate.c b/target/sh4/translate.c index 26231b2a5a..7db3468b01 100644 --- a/target/sh4/translate.c +++ b/target/sh4/translate.c @@ -2381,13 +2381,3 @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns, translator_loop(cs, tb, max_insns, pc, host_pc, &sh4_tr_ops, &ctx.base); } - -void restore_state_to_opc(CPUSH4State *env, TranslationBlock *tb, - target_ulong *data) -{ - env->pc = data[0]; - env->flags = data[1]; - /* Theoretically delayed_pc should also be restored. In practice the - branch instruction is re-executed after exception, so the delayed - branch target will be recomputed. */ -} diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c index 1f9ef7afd8..4c3d08a875 100644 --- a/target/sparc/cpu.c +++ b/target/sparc/cpu.c @@ -872,6 +872,7 @@ static const struct SysemuCPUOps sparc_sysemu_ops = { static const struct TCGCPUOps sparc_tcg_ops = { .initialize = sparc_tcg_init, .synchronize_from_tb = sparc_cpu_synchronize_from_tb, + .restore_state_to_opc = sparc_restore_state_to_opc, #ifndef CONFIG_USER_ONLY .tlb_fill = sparc_cpu_tlb_fill, diff --git a/target/sparc/cpu.h b/target/sparc/cpu.h index f80ea2e8cf..e478c5eb16 100644 --- a/target/sparc/cpu.h +++ b/target/sparc/cpu.h @@ -600,6 +600,9 @@ int sparc_cpu_memory_rw_debug(CPUState *cpu, vaddr addr, /* translate.c */ void sparc_tcg_init(void); +void sparc_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data); /* cpu-exec.c */ diff --git a/target/sparc/translate.c b/target/sparc/translate.c index 2cbbe2396a..34858eb95f 100644 --- a/target/sparc/translate.c +++ b/target/sparc/translate.c @@ -6011,9 +6011,12 @@ void sparc_tcg_init(void) } } -void restore_state_to_opc(CPUSPARCState *env, TranslationBlock *tb, - target_ulong *data) +void sparc_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) { + SPARCCPU *cpu = SPARC_CPU(cs); + CPUSPARCState *env = &cpu->env; target_ulong pc = data[0]; target_ulong npc = data[1]; diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c index ab7a1e3a6d..2c54a2825f 100644 --- a/target/tricore/cpu.c +++ b/target/tricore/cpu.c @@ -58,6 +58,16 @@ static void tricore_cpu_synchronize_from_tb(CPUState *cs, env->PC = tb_pc(tb); } +static void tricore_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + TriCoreCPU *cpu = TRICORE_CPU(cs); + CPUTriCoreState *env = &cpu->env; + + env->PC = data[0]; +} + static void tricore_cpu_reset(DeviceState *dev) { CPUState *s = CPU(dev); @@ -161,6 +171,7 @@ static const struct SysemuCPUOps tricore_sysemu_ops = { static const struct TCGCPUOps tricore_tcg_ops = { .initialize = tricore_tcg_init, .synchronize_from_tb = tricore_cpu_synchronize_from_tb, + .restore_state_to_opc = tricore_restore_state_to_opc, .tlb_fill = tricore_cpu_tlb_fill, }; diff --git a/target/tricore/translate.c b/target/tricore/translate.c index a0558ead71..c5b7bfbf20 100644 --- a/target/tricore/translate.c +++ b/target/tricore/translate.c @@ -8886,12 +8886,6 @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns, &tricore_tr_ops, &ctx.base); } -void -restore_state_to_opc(CPUTriCoreState *env, TranslationBlock *tb, - target_ulong *data) -{ - env->PC = data[0]; -} /* * * Initialization diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c index cbbe0e84a2..09923301c4 100644 --- a/target/xtensa/cpu.c +++ b/target/xtensa/cpu.c @@ -51,6 +51,15 @@ static vaddr xtensa_cpu_get_pc(CPUState *cs) return cpu->env.pc; } +static void xtensa_restore_state_to_opc(CPUState *cs, + const TranslationBlock *tb, + const uint64_t *data) +{ + XtensaCPU *cpu = XTENSA_CPU(cs); + + cpu->env.pc = data[0]; +} + static bool xtensa_cpu_has_work(CPUState *cs) { #ifndef CONFIG_USER_ONLY @@ -215,6 +224,7 @@ static const struct SysemuCPUOps xtensa_sysemu_ops = { static const struct TCGCPUOps xtensa_tcg_ops = { .initialize = xtensa_translate_init, .debug_excp_handler = xtensa_breakpoint_handler, + .restore_state_to_opc = xtensa_restore_state_to_opc, #ifndef CONFIG_USER_ONLY .tlb_fill = xtensa_cpu_tlb_fill, diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c index bdd4690a5c..77bcd71030 100644 --- a/target/xtensa/translate.c +++ b/target/xtensa/translate.c @@ -1355,12 +1355,6 @@ void xtensa_cpu_dump_state(CPUState *cs, FILE *f, int flags) } } -void restore_state_to_opc(CPUXtensaState *env, TranslationBlock *tb, - target_ulong *data) -{ - env->pc = data[0]; -} - static void translate_abs(DisasContext *dc, const OpcodeArg arg[], const uint32_t par[]) { diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc index d997f7922a..344b63e20f 100644 --- a/tcg/aarch64/tcg-target.c.inc +++ b/tcg/aarch64/tcg-target.c.inc @@ -1916,24 +1916,21 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, break; case INDEX_op_goto_tb: - if (s->tb_jmp_insn_offset != NULL) { - /* TCG_TARGET_HAS_direct_jump */ - /* Ensure that ADRP+ADD are 8-byte aligned so that an atomic - write can be used to patch the target address. */ - if ((uintptr_t)s->code_ptr & 7) { - tcg_out32(s, NOP); - } - s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s); - /* actual branch destination will be patched by - tb_target_set_jmp_target later. */ - tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0); - tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0); - } else { - /* !TCG_TARGET_HAS_direct_jump */ - tcg_debug_assert(s->tb_jmp_target_addr != NULL); - intptr_t offset = tcg_pcrel_diff(s, (s->tb_jmp_target_addr + a0)) >> 2; - tcg_out_insn(s, 3305, LDR, offset, TCG_REG_TMP); + tcg_debug_assert(s->tb_jmp_insn_offset != NULL); + /* + * Ensure that ADRP+ADD are 8-byte aligned so that an atomic + * write can be used to patch the target address. + */ + if ((uintptr_t)s->code_ptr & 7) { + tcg_out32(s, NOP); } + s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s); + /* + * actual branch destination will be patched by + * tb_target_set_jmp_target later + */ + tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0); + tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0); tcg_out_insn(s, 3207, BR, TCG_REG_TMP); set_jmp_reset_offset(s, a0); break; diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc index a3debf6da7..d326e28740 100644 --- a/tcg/loongarch64/tcg-target.c.inc +++ b/tcg/loongarch64/tcg-target.c.inc @@ -1031,6 +1031,36 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args) #endif } +/* LoongArch uses `andi zero, zero, 0` as NOP. */ +#define NOP OPC_ANDI +static void tcg_out_nop(TCGContext *s) +{ + tcg_out32(s, NOP); +} + +void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx, + uintptr_t jmp_rw, uintptr_t addr) +{ + tcg_insn_unit i1, i2; + ptrdiff_t upper, lower; + ptrdiff_t offset = (ptrdiff_t)(addr - jmp_rx) >> 2; + + if (offset == sextreg(offset, 0, 26)) { + i1 = encode_sd10k16_insn(OPC_B, offset); + i2 = NOP; + } else { + tcg_debug_assert(offset == sextreg(offset, 0, 36)); + lower = (int16_t)offset; + upper = (offset - lower) >> 16; + + i1 = encode_dsj20_insn(OPC_PCADDU18I, TCG_REG_TMP0, upper); + i2 = encode_djsk16_insn(OPC_JIRL, TCG_REG_ZERO, TCG_REG_TMP0, lower); + } + uint64_t pair = ((uint64_t)i2 << 32) | i1; + qatomic_set((uint64_t *)jmp_rw, pair); + flush_idcache_range(jmp_rx, jmp_rw, 8); +} + /* * Entry-points */ @@ -1058,10 +1088,20 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, break; case INDEX_op_goto_tb: - assert(s->tb_jmp_insn_offset == 0); - /* indirect jump method */ - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO, - (uintptr_t)(s->tb_jmp_target_addr + a0)); + tcg_debug_assert(s->tb_jmp_insn_offset != NULL); + /* + * Ensure that patch area is 8-byte aligned so that an + * atomic write can be used to patch the target address. + */ + if ((uintptr_t)s->code_ptr & 7) { + tcg_out_nop(s); + } + s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s); + /* + * actual branch destination will be patched by + * tb_target_set_jmp_target later + */ + tcg_out_opc_pcaddu18i(s, TCG_REG_TMP0, 0); tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0); set_jmp_reset_offset(s, a0); break; diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h index d58a6162f2..a659c8d6fd 100644 --- a/tcg/loongarch64/tcg-target.h +++ b/tcg/loongarch64/tcg-target.h @@ -42,7 +42,11 @@ #define TCG_TARGET_INSN_UNIT_SIZE 4 #define TCG_TARGET_NB_REGS 32 -#define MAX_CODE_GEN_BUFFER_SIZE SIZE_MAX +/* + * PCADDU18I + JIRL sequence can give 20 + 16 + 2 = 38 bits + * signed offset, which is +/- 128 GiB. + */ +#define MAX_CODE_GEN_BUFFER_SIZE (128 * GiB) typedef enum { TCG_REG_ZERO, @@ -123,7 +127,7 @@ typedef enum { #define TCG_TARGET_HAS_clz_i32 1 #define TCG_TARGET_HAS_ctz_i32 1 #define TCG_TARGET_HAS_ctpop_i32 0 -#define TCG_TARGET_HAS_direct_jump 0 +#define TCG_TARGET_HAS_direct_jump 1 #define TCG_TARGET_HAS_brcond2 0 #define TCG_TARGET_HAS_setcond2 0 #define TCG_TARGET_HAS_qemu_st8_i32 0 @@ -166,7 +170,6 @@ typedef enum { #define TCG_TARGET_HAS_muluh_i64 1 #define TCG_TARGET_HAS_mulsh_i64 1 -/* not defined -- call should be eliminated at compile time */ void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t); #define TCG_TARGET_DEFAULT_MO (0) diff --git a/tests/avocado/machine_aspeed.py b/tests/avocado/machine_aspeed.py index 124649a24b..fba6527026 100644 --- a/tests/avocado/machine_aspeed.py +++ b/tests/avocado/machine_aspeed.py @@ -92,7 +92,7 @@ class AST2x00Machine(QemuSystemTest): self.do_test_arm_aspeed(image_path) - def do_test_arm_aspeed_buidroot_start(self, image, cpu_id): + def do_test_arm_aspeed_buildroot_start(self, image, cpu_id): self.require_netdev('user') self.vm.set_console() @@ -111,11 +111,11 @@ class AST2x00Machine(QemuSystemTest): exec_command(self, 'root') time.sleep(0.1) - def do_test_arm_aspeed_buidroot_poweroff(self): + def do_test_arm_aspeed_buildroot_poweroff(self): exec_command_and_wait_for_pattern(self, 'poweroff', 'reboot: System halted'); - def test_arm_ast2500_evb_builroot(self): + def test_arm_ast2500_evb_buildroot(self): """ :avocado: tags=arch:arm :avocado: tags=machine:ast2500-evb @@ -129,7 +129,7 @@ class AST2x00Machine(QemuSystemTest): self.vm.add_args('-device', 'tmp105,bus=aspeed.i2c.bus.3,address=0x4d,id=tmp-test'); - self.do_test_arm_aspeed_buidroot_start(image_path, '0x0') + self.do_test_arm_aspeed_buildroot_start(image_path, '0x0') exec_command_and_wait_for_pattern(self, 'echo lm75 0x4d > /sys/class/i2c-dev/i2c-3/device/new_device', @@ -141,9 +141,9 @@ class AST2x00Machine(QemuSystemTest): exec_command_and_wait_for_pattern(self, 'cat /sys/class/hwmon/hwmon1/temp1_input', '18000') - self.do_test_arm_aspeed_buidroot_poweroff() + self.do_test_arm_aspeed_buildroot_poweroff() - def test_arm_ast2600_evb_builroot(self): + def test_arm_ast2600_evb_buildroot(self): """ :avocado: tags=arch:arm :avocado: tags=machine:ast2600-evb @@ -159,7 +159,7 @@ class AST2x00Machine(QemuSystemTest): 'tmp105,bus=aspeed.i2c.bus.3,address=0x4d,id=tmp-test'); self.vm.add_args('-device', 'ds1338,bus=aspeed.i2c.bus.3,address=0x32'); - self.do_test_arm_aspeed_buidroot_start(image_path, '0xf00') + self.do_test_arm_aspeed_buildroot_start(image_path, '0xf00') exec_command_and_wait_for_pattern(self, 'echo lm75 0x4d > /sys/class/i2c-dev/i2c-3/device/new_device', @@ -177,7 +177,7 @@ class AST2x00Machine(QemuSystemTest): year = time.strftime("%Y") exec_command_and_wait_for_pattern(self, 'hwclock -f /dev/rtc1', year); - self.do_test_arm_aspeed_buidroot_poweroff() + self.do_test_arm_aspeed_buildroot_poweroff() class AST2x00MachineSDK(QemuSystemTest): diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index ef4427ff4d..aa1ba179fa 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2481,8 +2481,8 @@ int main(int argc, char **argv) tmpfs = g_dir_make_tmp("migration-test-XXXXXX", &err); if (!tmpfs) { - g_test_message("g_dir_make_tmp on path (%s): %s", tmpfs, - err->message); + g_test_message("Can't create temporary directory in %s: %s", + g_get_tmp_dir(), err->message); } g_assert(tmpfs); diff --git a/tests/qtest/modules-test.c b/tests/qtest/modules-test.c index 88217686e1..be2575ae6d 100644 --- a/tests/qtest/modules-test.c +++ b/tests/qtest/modules-test.c @@ -16,6 +16,9 @@ static void test_modules_load(const void *data) int main(int argc, char *argv[]) { const char *modules[] = { +#ifdef CONFIG_BLKIO + "block-", "blkio", +#endif #ifdef CONFIG_CURL "block-", "curl", #endif diff --git a/tests/qtest/vhost-user-test.c b/tests/qtest/vhost-user-test.c index e8d2da7228..bf9f7c4248 100644 --- a/tests/qtest/vhost-user-test.c +++ b/tests/qtest/vhost-user-test.c @@ -571,8 +571,8 @@ static TestServer *test_server_new(const gchar *name, tmpfs = g_dir_make_tmp("vhost-test-XXXXXX", &err); if (!tmpfs) { - g_test_message("g_dir_make_tmp on path (%s): %s", tmpfs, - err->message); + g_test_message("Can't create temporary directory in %s: %s", + g_get_tmp_dir(), err->message); g_error_free(err); } g_assert(tmpfs); diff --git a/ui/console.c b/ui/console.c index 49da6a91df..65c117874c 100644 --- a/ui/console.c +++ b/ui/console.c @@ -1297,7 +1297,7 @@ static void kbd_send_chars(QemuConsole *s) uint32_t size; buf = fifo8_pop_buf(&s->out_fifo, MIN(len, avail), &size); - qemu_chr_be_write(s->chr, (uint8_t *)buf, size); + qemu_chr_be_write(s->chr, buf, size); len = qemu_chr_be_can_write(s->chr); avail -= size; } @@ -1763,7 +1763,7 @@ static void gd_vc_send_chars(VirtualConsole *vc) uint32_t size; buf = fifo8_pop_buf(&vc->vte.out_fifo, MIN(len, avail), &size); - qemu_chr_be_write(vc->vte.chr, (uint8_t *)buf, size); + qemu_chr_be_write(vc->vte.chr, buf, size); len = qemu_chr_be_can_write(vc->vte.chr); avail -= size; } diff --git a/ui/vnc-enc-hextile.c b/ui/vnc-enc-hextile.c index 4215bd7daf..c763256f29 100644 --- a/ui/vnc-enc-hextile.c +++ b/ui/vnc-enc-hextile.c @@ -50,8 +50,8 @@ int vnc_hextile_send_framebuffer_update(VncState *vs, int x, int has_fg, has_bg; uint8_t *last_fg, *last_bg; - last_fg = (uint8_t *) g_malloc(VNC_SERVER_FB_BYTES); - last_bg = (uint8_t *) g_malloc(VNC_SERVER_FB_BYTES); + last_fg = g_malloc(VNC_SERVER_FB_BYTES); + last_bg = g_malloc(VNC_SERVER_FB_BYTES); has_fg = has_bg = 0; for (j = y; j < (y + h); j += 16) { for (i = x; i < (x + w); i += 16) { diff --git a/ui/vnc-jobs.c b/ui/vnc-jobs.c index 4562bf8928..886f9bf611 100644 --- a/ui/vnc-jobs.c +++ b/ui/vnc-jobs.c @@ -373,7 +373,7 @@ void vnc_start_worker_thread(void) VncJobQueue *q; if (vnc_worker_thread_running()) - return ; + return; q = vnc_queue_init(); qemu_thread_create(&q->thread, "vnc_worker", vnc_worker_thread, q, @@ -3085,7 +3085,7 @@ static void vnc_rect_updated(VncDisplay *vd, int x, int y, struct timeval * tv) rect = vnc_stat_rect(vd, x, y); if (rect->updated) { - return ; + return; } rect->times[rect->idx] = *tv; rect->idx = (rect->idx + 1) % ARRAY_SIZE(rect->times); diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c index 15c82d9348..45c6b57374 100644 --- a/util/qemu-coroutine-lock.c +++ b/util/qemu-coroutine-lock.c @@ -39,10 +39,15 @@ void qemu_co_queue_init(CoQueue *queue) QSIMPLEQ_INIT(&queue->entries); } -void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock) +void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock, + CoQueueWaitFlags flags) { Coroutine *self = qemu_coroutine_self(); - QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next); + if (flags & CO_QUEUE_WAIT_FRONT) { + QSIMPLEQ_INSERT_HEAD(&queue->entries, self, co_queue_next); + } else { + QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next); + } if (lock) { qemu_lockable_unlock(lock); diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c index 5ba01177bf..0d1520caac 100644 --- a/util/vfio-helpers.c +++ b/util/vfio-helpers.c @@ -847,10 +847,13 @@ void qemu_vfio_close(QEMUVFIOState *s) if (!s) { return; } + + ram_block_notifier_remove(&s->ram_notifier); + for (i = 0; i < s->nr_mappings; ++i) { qemu_vfio_undo_mapping(s, &s->mappings[i], NULL); } - ram_block_notifier_remove(&s->ram_notifier); + g_free(s->usable_iova_ranges); s->nb_iova_ranges = 0; qemu_vfio_reset(s); |