diff options
author | Richard Henderson <richard.henderson@linaro.org> | 2021-03-09 17:02:48 -0600 |
---|---|---|
committer | Richard Henderson <richard.henderson@linaro.org> | 2021-06-11 09:26:28 -0700 |
commit | c46184a90a5a0209960b7c0813aff4feb1e373e1 (patch) | |
tree | af5c0fba9d39fd31386b8e3cf47b18c4beab22a2 /tcg/region.c | |
parent | 324b9d462ea227f10a25c80421046e6187247116 (diff) |
accel/tcg: Move alloc_code_gen_buffer to tcg/region.c
Buffer management is integral to tcg. Do not leave the allocation
to code outside of tcg/. This is code movement, with further
cleanups to follow.
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Diffstat (limited to 'tcg/region.c')
-rw-r--r-- | tcg/region.c | 431 |
1 files changed, 426 insertions, 5 deletions
diff --git a/tcg/region.c b/tcg/region.c index 6e34fcf775..162b4d6486 100644 --- a/tcg/region.c +++ b/tcg/region.c @@ -23,6 +23,8 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" +#include "qapi/error.h" #include "exec/exec-all.h" #include "tcg/tcg.h" #if !defined(CONFIG_USER_ONLY) @@ -407,6 +409,418 @@ static size_t tcg_n_regions(void) #endif /* + * Minimum size of the code gen buffer. This number is randomly chosen, + * but not so small that we can't have a fair number of TB's live. + */ +#define MIN_CODE_GEN_BUFFER_SIZE (1 * MiB) + +/* + * Maximum size of the code gen buffer we'd like to use. Unless otherwise + * indicated, this is constrained by the range of direct branches on the + * host cpu, as used by the TCG implementation of goto_tb. + */ +#if defined(__x86_64__) +# define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB) +#elif defined(__sparc__) +# define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB) +#elif defined(__powerpc64__) +# define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB) +#elif defined(__powerpc__) +# define MAX_CODE_GEN_BUFFER_SIZE (32 * MiB) +#elif defined(__aarch64__) +# define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB) +#elif defined(__s390x__) + /* We have a +- 4GB range on the branches; leave some slop. */ +# define MAX_CODE_GEN_BUFFER_SIZE (3 * GiB) +#elif defined(__mips__) + /* + * We have a 256MB branch region, but leave room to make sure the + * main executable is also within that region. + */ +# define MAX_CODE_GEN_BUFFER_SIZE (128 * MiB) +#else +# define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1) +#endif + +#if TCG_TARGET_REG_BITS == 32 +#define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (32 * MiB) +#ifdef CONFIG_USER_ONLY +/* + * For user mode on smaller 32 bit systems we may run into trouble + * allocating big chunks of data in the right place. On these systems + * we utilise a static code generation buffer directly in the binary. + */ +#define USE_STATIC_CODE_GEN_BUFFER +#endif +#else /* TCG_TARGET_REG_BITS == 64 */ +#ifdef CONFIG_USER_ONLY +/* + * As user-mode emulation typically means running multiple instances + * of the translator don't go too nuts with our default code gen + * buffer lest we make things too hard for the OS. + */ +#define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (128 * MiB) +#else +/* + * We expect most system emulation to run one or two guests per host. + * Users running large scale system emulation may want to tweak their + * runtime setup via the tb-size control on the command line. + */ +#define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (1 * GiB) +#endif +#endif + +#define DEFAULT_CODE_GEN_BUFFER_SIZE \ + (DEFAULT_CODE_GEN_BUFFER_SIZE_1 < MAX_CODE_GEN_BUFFER_SIZE \ + ? DEFAULT_CODE_GEN_BUFFER_SIZE_1 : MAX_CODE_GEN_BUFFER_SIZE) + +static size_t size_code_gen_buffer(size_t tb_size) +{ + /* Size the buffer. */ + if (tb_size == 0) { + size_t phys_mem = qemu_get_host_physmem(); + if (phys_mem == 0) { + tb_size = DEFAULT_CODE_GEN_BUFFER_SIZE; + } else { + tb_size = MIN(DEFAULT_CODE_GEN_BUFFER_SIZE, phys_mem / 8); + } + } + if (tb_size < MIN_CODE_GEN_BUFFER_SIZE) { + tb_size = MIN_CODE_GEN_BUFFER_SIZE; + } + if (tb_size > MAX_CODE_GEN_BUFFER_SIZE) { + tb_size = MAX_CODE_GEN_BUFFER_SIZE; + } + return tb_size; +} + +#ifdef __mips__ +/* + * In order to use J and JAL within the code_gen_buffer, we require + * that the buffer not cross a 256MB boundary. + */ +static inline bool cross_256mb(void *addr, size_t size) +{ + return ((uintptr_t)addr ^ ((uintptr_t)addr + size)) & ~0x0ffffffful; +} + +/* + * We weren't able to allocate a buffer without crossing that boundary, + * so make do with the larger portion of the buffer that doesn't cross. + * Returns the new base of the buffer, and adjusts code_gen_buffer_size. + */ +static inline void *split_cross_256mb(void *buf1, size_t size1) +{ + void *buf2 = (void *)(((uintptr_t)buf1 + size1) & ~0x0ffffffful); + size_t size2 = buf1 + size1 - buf2; + + size1 = buf2 - buf1; + if (size1 < size2) { + size1 = size2; + buf1 = buf2; + } + + tcg_ctx->code_gen_buffer_size = size1; + return buf1; +} +#endif + +#ifdef USE_STATIC_CODE_GEN_BUFFER +static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE] + __attribute__((aligned(CODE_GEN_ALIGN))); + +static bool alloc_code_gen_buffer(size_t tb_size, int splitwx, Error **errp) +{ + void *buf, *end; + size_t size; + + if (splitwx > 0) { + error_setg(errp, "jit split-wx not supported"); + return false; + } + + /* page-align the beginning and end of the buffer */ + buf = static_code_gen_buffer; + end = static_code_gen_buffer + sizeof(static_code_gen_buffer); + buf = QEMU_ALIGN_PTR_UP(buf, qemu_real_host_page_size); + end = QEMU_ALIGN_PTR_DOWN(end, qemu_real_host_page_size); + + size = end - buf; + + /* Honor a command-line option limiting the size of the buffer. */ + if (size > tb_size) { + size = QEMU_ALIGN_DOWN(tb_size, qemu_real_host_page_size); + } + tcg_ctx->code_gen_buffer_size = size; + +#ifdef __mips__ + if (cross_256mb(buf, size)) { + buf = split_cross_256mb(buf, size); + size = tcg_ctx->code_gen_buffer_size; + } +#endif + + if (qemu_mprotect_rwx(buf, size)) { + error_setg_errno(errp, errno, "mprotect of jit buffer"); + return false; + } + qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE); + + tcg_ctx->code_gen_buffer = buf; + return true; +} +#elif defined(_WIN32) +static bool alloc_code_gen_buffer(size_t size, int splitwx, Error **errp) +{ + void *buf; + + if (splitwx > 0) { + error_setg(errp, "jit split-wx not supported"); + return false; + } + + buf = VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, + PAGE_EXECUTE_READWRITE); + if (buf == NULL) { + error_setg_win32(errp, GetLastError(), + "allocate %zu bytes for jit buffer", size); + return false; + } + + tcg_ctx->code_gen_buffer = buf; + tcg_ctx->code_gen_buffer_size = size; + return true; +} +#else +static bool alloc_code_gen_buffer_anon(size_t size, int prot, + int flags, Error **errp) +{ + void *buf; + + buf = mmap(NULL, size, prot, flags, -1, 0); + if (buf == MAP_FAILED) { + error_setg_errno(errp, errno, + "allocate %zu bytes for jit buffer", size); + return false; + } + tcg_ctx->code_gen_buffer_size = size; + +#ifdef __mips__ + if (cross_256mb(buf, size)) { + /* + * Try again, with the original still mapped, to avoid re-acquiring + * the same 256mb crossing. + */ + size_t size2; + void *buf2 = mmap(NULL, size, prot, flags, -1, 0); + switch ((int)(buf2 != MAP_FAILED)) { + case 1: + if (!cross_256mb(buf2, size)) { + /* Success! Use the new buffer. */ + munmap(buf, size); + break; + } + /* Failure. Work with what we had. */ + munmap(buf2, size); + /* fallthru */ + default: + /* Split the original buffer. Free the smaller half. */ + buf2 = split_cross_256mb(buf, size); + size2 = tcg_ctx->code_gen_buffer_size; + if (buf == buf2) { + munmap(buf + size2, size - size2); + } else { + munmap(buf, size - size2); + } + size = size2; + break; + } + buf = buf2; + } +#endif + + /* Request large pages for the buffer. */ + qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE); + + tcg_ctx->code_gen_buffer = buf; + return true; +} + +#ifndef CONFIG_TCG_INTERPRETER +#ifdef CONFIG_POSIX +#include "qemu/memfd.h" + +static bool alloc_code_gen_buffer_splitwx_memfd(size_t size, Error **errp) +{ + void *buf_rw = NULL, *buf_rx = MAP_FAILED; + int fd = -1; + +#ifdef __mips__ + /* Find space for the RX mapping, vs the 256MiB regions. */ + if (!alloc_code_gen_buffer_anon(size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS | + MAP_NORESERVE, errp)) { + return false; + } + /* The size of the mapping may have been adjusted. */ + size = tcg_ctx->code_gen_buffer_size; + buf_rx = tcg_ctx->code_gen_buffer; +#endif + + buf_rw = qemu_memfd_alloc("tcg-jit", size, 0, &fd, errp); + if (buf_rw == NULL) { + goto fail; + } + +#ifdef __mips__ + void *tmp = mmap(buf_rx, size, PROT_READ | PROT_EXEC, + MAP_SHARED | MAP_FIXED, fd, 0); + if (tmp != buf_rx) { + goto fail_rx; + } +#else + buf_rx = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0); + if (buf_rx == MAP_FAILED) { + goto fail_rx; + } +#endif + + close(fd); + tcg_ctx->code_gen_buffer = buf_rw; + tcg_ctx->code_gen_buffer_size = size; + tcg_splitwx_diff = buf_rx - buf_rw; + + /* Request large pages for the buffer and the splitwx. */ + qemu_madvise(buf_rw, size, QEMU_MADV_HUGEPAGE); + qemu_madvise(buf_rx, size, QEMU_MADV_HUGEPAGE); + return true; + + fail_rx: + error_setg_errno(errp, errno, "failed to map shared memory for execute"); + fail: + if (buf_rx != MAP_FAILED) { + munmap(buf_rx, size); + } + if (buf_rw) { + munmap(buf_rw, size); + } + if (fd >= 0) { + close(fd); + } + return false; +} +#endif /* CONFIG_POSIX */ + +#ifdef CONFIG_DARWIN +#include <mach/mach.h> + +extern kern_return_t mach_vm_remap(vm_map_t target_task, + mach_vm_address_t *target_address, + mach_vm_size_t size, + mach_vm_offset_t mask, + int flags, + vm_map_t src_task, + mach_vm_address_t src_address, + boolean_t copy, + vm_prot_t *cur_protection, + vm_prot_t *max_protection, + vm_inherit_t inheritance); + +static bool alloc_code_gen_buffer_splitwx_vmremap(size_t size, Error **errp) +{ + kern_return_t ret; + mach_vm_address_t buf_rw, buf_rx; + vm_prot_t cur_prot, max_prot; + + /* Map the read-write portion via normal anon memory. */ + if (!alloc_code_gen_buffer_anon(size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, errp)) { + return false; + } + + buf_rw = (mach_vm_address_t)tcg_ctx->code_gen_buffer; + buf_rx = 0; + ret = mach_vm_remap(mach_task_self(), + &buf_rx, + size, + 0, + VM_FLAGS_ANYWHERE, + mach_task_self(), + buf_rw, + false, + &cur_prot, + &max_prot, + VM_INHERIT_NONE); + if (ret != KERN_SUCCESS) { + /* TODO: Convert "ret" to a human readable error message. */ + error_setg(errp, "vm_remap for jit splitwx failed"); + munmap((void *)buf_rw, size); + return false; + } + + if (mprotect((void *)buf_rx, size, PROT_READ | PROT_EXEC) != 0) { + error_setg_errno(errp, errno, "mprotect for jit splitwx"); + munmap((void *)buf_rx, size); + munmap((void *)buf_rw, size); + return false; + } + + tcg_splitwx_diff = buf_rx - buf_rw; + return true; +} +#endif /* CONFIG_DARWIN */ +#endif /* CONFIG_TCG_INTERPRETER */ + +static bool alloc_code_gen_buffer_splitwx(size_t size, Error **errp) +{ +#ifndef CONFIG_TCG_INTERPRETER +# ifdef CONFIG_DARWIN + return alloc_code_gen_buffer_splitwx_vmremap(size, errp); +# endif +# ifdef CONFIG_POSIX + return alloc_code_gen_buffer_splitwx_memfd(size, errp); +# endif +#endif + error_setg(errp, "jit split-wx not supported"); + return false; +} + +static bool alloc_code_gen_buffer(size_t size, int splitwx, Error **errp) +{ + ERRP_GUARD(); + int prot, flags; + + if (splitwx) { + if (alloc_code_gen_buffer_splitwx(size, errp)) { + return true; + } + /* + * If splitwx force-on (1), fail; + * if splitwx default-on (-1), fall through to splitwx off. + */ + if (splitwx > 0) { + return false; + } + error_free_or_abort(errp); + } + + prot = PROT_READ | PROT_WRITE | PROT_EXEC; + flags = MAP_PRIVATE | MAP_ANONYMOUS; +#ifdef CONFIG_TCG_INTERPRETER + /* The tcg interpreter does not need execute permission. */ + prot = PROT_READ | PROT_WRITE; +#elif defined(CONFIG_DARWIN) + /* Applicable to both iOS and macOS (Apple Silicon). */ + if (!splitwx) { + flags |= MAP_JIT; + } +#endif + + return alloc_code_gen_buffer_anon(size, prot, flags, errp); +} +#endif /* USE_STATIC_CODE_GEN_BUFFER, WIN32, POSIX */ + +/* * Initializes region partitioning. * * Called at init time from the parent thread (i.e. the one calling @@ -434,16 +848,23 @@ static size_t tcg_n_regions(void) * in practice. Multi-threaded guests share most if not all of their translated * code, which makes parallel code generation less appealing than in softmmu. */ -void tcg_region_init(void) +void tcg_region_init(size_t tb_size, int splitwx) { - void *buf = tcg_init_ctx.code_gen_buffer; - void *aligned; - size_t size = tcg_init_ctx.code_gen_buffer_size; - size_t page_size = qemu_real_host_page_size; + void *buf, *aligned; + size_t size; + size_t page_size; size_t region_size; size_t n_regions; size_t i; + bool ok; + + ok = alloc_code_gen_buffer(size_code_gen_buffer(tb_size), + splitwx, &error_fatal); + assert(ok); + buf = tcg_init_ctx.code_gen_buffer; + size = tcg_init_ctx.code_gen_buffer_size; + page_size = qemu_real_host_page_size; n_regions = tcg_n_regions(); /* The first region will be 'aligned - buf' bytes larger than the others */ |