Diffstat (limited to 'accel')
-rw-r--r--   accel/stubs/tcg-stub.c          |   1
-rw-r--r--   accel/tcg/Makefile.objs         |   1
-rw-r--r--   accel/tcg/atomic_common.inc.c   |  54
-rw-r--r--   accel/tcg/atomic_template.h     |  94
-rw-r--r--   accel/tcg/cpu-exec.c            |   8
-rw-r--r--   accel/tcg/cputlb.c              |  60
-rw-r--r--   accel/tcg/plugin-gen.c          | 932
-rw-r--r--   accel/tcg/plugin-helpers.h      |   5
-rw-r--r--   accel/tcg/translate-all.c       |  15
-rw-r--r--   accel/tcg/translator.c          |  20
-rw-r--r--   accel/tcg/user-exec.c           |   3
11 files changed, 1149 insertions(+), 44 deletions(-)
diff --git a/accel/stubs/tcg-stub.c b/accel/stubs/tcg-stub.c index e2d23edafe..677191a69c 100644 --- a/accel/stubs/tcg-stub.c +++ b/accel/stubs/tcg-stub.c @@ -11,7 +11,6 @@ */ #include "qemu/osdep.h" -#include "qemu-common.h" #include "cpu.h" #include "tcg/tcg.h" #include "exec/exec-all.h" diff --git a/accel/tcg/Makefile.objs b/accel/tcg/Makefile.objs index d381a02f34..a92f2c454b 100644 --- a/accel/tcg/Makefile.objs +++ b/accel/tcg/Makefile.objs @@ -6,3 +6,4 @@ obj-y += translator.o obj-$(CONFIG_USER_ONLY) += user-exec.o obj-$(call lnot,$(CONFIG_SOFTMMU)) += user-exec-stub.o +obj-$(CONFIG_PLUGIN) += plugin-gen.o diff --git a/accel/tcg/atomic_common.inc.c b/accel/tcg/atomic_common.inc.c new file mode 100644 index 0000000000..344525b0bb --- /dev/null +++ b/accel/tcg/atomic_common.inc.c @@ -0,0 +1,54 @@ +/* + * Common Atomic Helper Functions + * + * This file should be included before the various instantiations of + * the atomic_template.h helpers. + * + * Copyright (c) 2019 Linaro + * Written by Alex Bennée <alex.bennee@linaro.org> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +static inline +void atomic_trace_rmw_pre(CPUArchState *env, target_ulong addr, uint16_t info) +{ + CPUState *cpu = env_cpu(env); + + trace_guest_mem_before_exec(cpu, addr, info); + trace_guest_mem_before_exec(cpu, addr, info | TRACE_MEM_ST); +} + +static inline void +atomic_trace_rmw_post(CPUArchState *env, target_ulong addr, uint16_t info) +{ + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, info); + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, info | TRACE_MEM_ST); +} + +static inline +void atomic_trace_ld_pre(CPUArchState *env, target_ulong addr, uint16_t info) +{ + trace_guest_mem_before_exec(env_cpu(env), addr, info); +} + +static inline +void atomic_trace_ld_post(CPUArchState *env, target_ulong addr, uint16_t info) +{ + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, info); +} + +static inline +void atomic_trace_st_pre(CPUArchState *env, target_ulong addr, uint16_t info) +{ + trace_guest_mem_before_exec(env_cpu(env), addr, info); +} + +static inline +void atomic_trace_st_post(CPUArchState *env, target_ulong addr, uint16_t info) +{ + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, info); +} diff --git a/accel/tcg/atomic_template.h b/accel/tcg/atomic_template.h index 287433d809..837676231f 100644 --- a/accel/tcg/atomic_template.h +++ b/accel/tcg/atomic_template.h @@ -18,6 +18,7 @@ * License along with this library; if not, see <http://www.gnu.org/licenses/>. */ +#include "qemu/plugin.h" #include "trace/mem.h" #if DATA_SIZE == 16 @@ -59,26 +60,6 @@ # define ABI_TYPE uint32_t #endif -#define ATOMIC_TRACE_RMW do { \ - uint8_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false); \ - \ - trace_guest_mem_before_exec(env_cpu(env), addr, info); \ - trace_guest_mem_before_exec(env_cpu(env), addr, \ - info | TRACE_MEM_ST); \ - } while (0) - -#define ATOMIC_TRACE_LD do { \ - uint8_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false); \ - \ - trace_guest_mem_before_exec(env_cpu(env), addr, info); \ - } while (0) - -# define ATOMIC_TRACE_ST do { \ - uint8_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, true); \ - \ - trace_guest_mem_before_exec(env_cpu(env), addr, info); \ - } while (0) - /* Define host-endian atomic operations. Note that END is used within the ATOMIC_NAME macro, and redefined below. 
*/ #if DATA_SIZE == 1 @@ -98,14 +79,17 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr, ATOMIC_MMU_DECLS; DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; DATA_TYPE ret; + uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false, + ATOMIC_MMU_IDX); - ATOMIC_TRACE_RMW; + atomic_trace_rmw_pre(env, addr, info); #if DATA_SIZE == 16 ret = atomic16_cmpxchg(haddr, cmpv, newv); #else ret = atomic_cmpxchg__nocheck(haddr, cmpv, newv); #endif ATOMIC_MMU_CLEANUP; + atomic_trace_rmw_post(env, addr, info); return ret; } @@ -115,10 +99,13 @@ ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS) { ATOMIC_MMU_DECLS; DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP; + uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false, + ATOMIC_MMU_IDX); - ATOMIC_TRACE_LD; + atomic_trace_ld_pre(env, addr, info); val = atomic16_read(haddr); ATOMIC_MMU_CLEANUP; + atomic_trace_ld_post(env, addr, info); return val; } @@ -127,10 +114,13 @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr, { ATOMIC_MMU_DECLS; DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; + uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, true, + ATOMIC_MMU_IDX); - ATOMIC_TRACE_ST; + atomic_trace_st_pre(env, addr, info); atomic16_set(haddr, val); ATOMIC_MMU_CLEANUP; + atomic_trace_st_post(env, addr, info); } #endif #else @@ -140,10 +130,13 @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ATOMIC_MMU_DECLS; DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; DATA_TYPE ret; + uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false, + ATOMIC_MMU_IDX); - ATOMIC_TRACE_RMW; + atomic_trace_rmw_pre(env, addr, info); ret = atomic_xchg__nocheck(haddr, val); ATOMIC_MMU_CLEANUP; + atomic_trace_rmw_post(env, addr, info); return ret; } @@ -154,10 +147,14 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \ ATOMIC_MMU_DECLS; \ DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \ DATA_TYPE ret; \ + uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, \ + false, \ + ATOMIC_MMU_IDX); \ \ - ATOMIC_TRACE_RMW; \ + atomic_trace_rmw_pre(env, addr, info); \ ret = atomic_##X(haddr, val); \ ATOMIC_MMU_CLEANUP; \ + atomic_trace_rmw_post(env, addr, info); \ return ret; \ } @@ -186,8 +183,11 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \ ATOMIC_MMU_DECLS; \ XDATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \ XDATA_TYPE cmp, old, new, val = xval; \ + uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, \ + false, \ + ATOMIC_MMU_IDX); \ \ - ATOMIC_TRACE_RMW; \ + atomic_trace_rmw_pre(env, addr, info); \ smp_mb(); \ cmp = atomic_read__nocheck(haddr); \ do { \ @@ -195,6 +195,7 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \ cmp = atomic_cmpxchg__nocheck(haddr, old, new); \ } while (cmp != old); \ ATOMIC_MMU_CLEANUP; \ + atomic_trace_rmw_post(env, addr, info); \ return RET; \ } @@ -232,14 +233,18 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr, ATOMIC_MMU_DECLS; DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; DATA_TYPE ret; + uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, + false, + ATOMIC_MMU_IDX); - ATOMIC_TRACE_RMW; + atomic_trace_rmw_pre(env, addr, info); #if DATA_SIZE == 16 ret = atomic16_cmpxchg(haddr, BSWAP(cmpv), BSWAP(newv)); #else ret = atomic_cmpxchg__nocheck(haddr, BSWAP(cmpv), BSWAP(newv)); #endif ATOMIC_MMU_CLEANUP; + atomic_trace_rmw_post(env, addr, info); return BSWAP(ret); } @@ -249,10 +254,14 @@ ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS) { ATOMIC_MMU_DECLS; DATA_TYPE val, *haddr = 
ATOMIC_MMU_LOOKUP; + uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, + false, + ATOMIC_MMU_IDX); - ATOMIC_TRACE_LD; + atomic_trace_ld_pre(env, addr, info); val = atomic16_read(haddr); ATOMIC_MMU_CLEANUP; + atomic_trace_ld_post(env, addr, info); return BSWAP(val); } @@ -261,11 +270,16 @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr, { ATOMIC_MMU_DECLS; DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; + uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, + true, + ATOMIC_MMU_IDX); - ATOMIC_TRACE_ST; + val = BSWAP(val); + atomic_trace_st_pre(env, addr, info); val = BSWAP(val); atomic16_set(haddr, val); ATOMIC_MMU_CLEANUP; + atomic_trace_st_post(env, addr, info); } #endif #else @@ -275,10 +289,14 @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ATOMIC_MMU_DECLS; DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; ABI_TYPE ret; + uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, + false, + ATOMIC_MMU_IDX); - ATOMIC_TRACE_RMW; + atomic_trace_rmw_pre(env, addr, info); ret = atomic_xchg__nocheck(haddr, BSWAP(val)); ATOMIC_MMU_CLEANUP; + atomic_trace_rmw_post(env, addr, info); return BSWAP(ret); } @@ -289,10 +307,14 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \ ATOMIC_MMU_DECLS; \ DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \ DATA_TYPE ret; \ + uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, \ + false, \ + ATOMIC_MMU_IDX); \ \ - ATOMIC_TRACE_RMW; \ + atomic_trace_rmw_pre(env, addr, info); \ ret = atomic_##X(haddr, BSWAP(val)); \ ATOMIC_MMU_CLEANUP; \ + atomic_trace_rmw_post(env, addr, info); \ return BSWAP(ret); \ } @@ -319,8 +341,11 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \ ATOMIC_MMU_DECLS; \ XDATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \ XDATA_TYPE ldo, ldn, old, new, val = xval; \ + uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, \ + false, \ + ATOMIC_MMU_IDX); \ \ - ATOMIC_TRACE_RMW; \ + atomic_trace_rmw_pre(env, addr, info); \ smp_mb(); \ ldn = atomic_read__nocheck(haddr); \ do { \ @@ -328,6 +353,7 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \ ldn = atomic_cmpxchg__nocheck(haddr, ldo, BSWAP(new)); \ } while (ldo != ldn); \ ATOMIC_MMU_CLEANUP; \ + atomic_trace_rmw_post(env, addr, info); \ return RET; \ } @@ -355,10 +381,6 @@ GEN_ATOMIC_HELPER_FN(add_fetch, ADD, DATA_TYPE, new) #undef MEND #endif /* DATA_SIZE > 1 */ -#undef ATOMIC_TRACE_ST -#undef ATOMIC_TRACE_LD -#undef ATOMIC_TRACE_RMW - #undef BSWAP #undef ABI_TYPE #undef DATA_TYPE diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index 48272c781b..c01f59c743 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -238,8 +238,6 @@ void cpu_exec_step_atomic(CPUState *cpu) uint32_t flags; uint32_t cflags = 1; uint32_t cf_mask = cflags & CF_HASH_MASK; - /* volatile because we modify it between setjmp and longjmp */ - volatile bool in_exclusive_region = false; if (sigsetjmp(cpu->jmp_env, 0) == 0) { tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask); @@ -253,7 +251,6 @@ void cpu_exec_step_atomic(CPUState *cpu) /* Since we got here, we know that parallel_cpus must be true. 
*/ parallel_cpus = false; - in_exclusive_region = true; cc->cpu_exec_enter(cpu); /* execute the generated code */ trace_exec_tb(tb, pc); @@ -271,9 +268,10 @@ void cpu_exec_step_atomic(CPUState *cpu) qemu_mutex_unlock_iothread(); } assert_no_pages_locked(); + qemu_plugin_disable_mem_helpers(cpu); } - if (in_exclusive_region) { + if (cpu_in_exclusive_context(cpu)) { /* We might longjump out of either the codegen or the * execution, so must make sure we only end the exclusive * region if we started it. @@ -704,6 +702,8 @@ int cpu_exec(CPUState *cpu) if (qemu_mutex_iothread_locked()) { qemu_mutex_unlock_iothread(); } + qemu_plugin_disable_mem_helpers(cpu); + assert_no_pages_locked(); } diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c index 5eebddcca8..68487dceb5 100644 --- a/accel/tcg/cputlb.c +++ b/accel/tcg/cputlb.c @@ -34,6 +34,9 @@ #include "qemu/atomic.h" #include "qemu/atomic128.h" #include "translate-all.h" +#ifdef CONFIG_PLUGIN +#include "qemu/plugin-memory.h" +#endif /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */ /* #define DEBUG_TLB */ @@ -1051,7 +1054,8 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index, * NOTE: This function will trigger an exception if the page is * not executable. */ -tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr) +tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr, + void **hostp) { uintptr_t mmu_idx = cpu_mmu_index(env, true); uintptr_t index = tlb_index(env, mmu_idx, addr); @@ -1077,13 +1081,24 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr) if (unlikely(entry->addr_code & TLB_MMIO)) { /* The region is not backed by RAM. */ + if (hostp) { + *hostp = NULL; + } return -1; } p = (void *)((uintptr_t)addr + entry->addend); + if (hostp) { + *hostp = p; + } return qemu_ram_addr_from_host_nofail(p); } +tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr) +{ + return get_page_addr_code_hostp(env, addr, NULL); +} + static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size, CPUIOTLBEntry *iotlbentry, uintptr_t retaddr) { @@ -1235,6 +1250,45 @@ void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr, return (void *)((uintptr_t)addr + entry->addend); } + +#ifdef CONFIG_PLUGIN +/* + * Perform a TLB lookup and populate the qemu_plugin_hwaddr structure. + * This should be a hot path as we will have just looked this path up + * in the softmmu lookup code (or helper). We don't handle re-fills or + * checking the victim table. This is purely informational. + * + * This should never fail as the memory access being instrumented + * should have just filled the TLB. + */ + +bool tlb_plugin_lookup(CPUState *cpu, target_ulong addr, int mmu_idx, + bool is_store, struct qemu_plugin_hwaddr *data) +{ + CPUArchState *env = cpu->env_ptr; + CPUTLBEntry *tlbe = tlb_entry(env, mmu_idx, addr); + uintptr_t index = tlb_index(env, mmu_idx, addr); + target_ulong tlb_addr = is_store ? 
tlb_addr_write(tlbe) : tlbe->addr_read; + + if (likely(tlb_hit(tlb_addr, addr))) { + /* We must have an iotlb entry for MMIO */ + if (tlb_addr & TLB_MMIO) { + CPUIOTLBEntry *iotlbentry; + iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index]; + data->is_io = true; + data->v.io.section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs); + data->v.io.offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr; + } else { + data->is_io = false; + data->v.ram.hostaddr = addr + tlbe->addend; + } + return true; + } + return false; +} + +#endif + /* Probe for a read-modify-write atomic operation. Do not allow unaligned * operations, or io operations to proceed. Return the host address. */ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr, @@ -1811,6 +1865,9 @@ void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val, #define ATOMIC_MMU_DECLS #define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, retaddr) #define ATOMIC_MMU_CLEANUP +#define ATOMIC_MMU_IDX get_mmuidx(oi) + +#include "atomic_common.inc.c" #define DATA_SIZE 1 #include "atomic_template.h" @@ -1853,6 +1910,7 @@ void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val, #define DATA_SIZE 8 #include "atomic_template.h" #endif +#undef ATOMIC_MMU_IDX /* Code access functions. */ diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c new file mode 100644 index 0000000000..51580d51a0 --- /dev/null +++ b/accel/tcg/plugin-gen.c @@ -0,0 +1,932 @@ +/* + * plugin-gen.c - TCG-related bits of plugin infrastructure + * + * Copyright (C) 2018, Emilio G. Cota <cota@braap.org> + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * We support instrumentation at an instruction granularity. That is, + * if a plugin wants to instrument the memory accesses performed by a + * particular instruction, it can just do that instead of instrumenting + * all memory accesses. Thus, in order to do this we first have to + * translate a TB, so that plugins can decide what/where to instrument. + * + * Injecting the desired instrumentation could be done with a second + * translation pass that combined the instrumentation requests, but that + * would be ugly and inefficient since we would decode the guest code twice. + * Instead, during TB translation we add "empty" instrumentation calls for all + * possible instrumentation events, and then once we collect the instrumentation + * requests from plugins, we either "fill in" those empty events or remove them + * if they have no requests. + * + * When "filling in" an event we first copy the empty callback's TCG ops. This + * might seem unnecessary, but it is done to support an arbitrary number + * of callbacks per event. Take for example a regular instruction callback. + * We first generate a callback to an empty helper function. Then, if two + * plugins register one callback each for this instruction, we make two copies + * of the TCG ops generated for the empty callback, substituting the function + * pointer that points to the empty helper function with the plugins' desired + * callback functions. After that we remove the empty callback's ops. + * + * Note that the location in TCGOp.args[] of the pointer to a helper function + * varies across different guest and host architectures. Instead of duplicating + * the logic that figures this out, we rely on the fact that the empty + * callbacks point to empty functions that are unique pointers in the program. 
+ * Thus, to find the right location we just have to look for a match in + * TCGOp.args[]. This is the main reason why we first copy an empty callback's + * TCG ops and then fill them in; regardless of whether we have one or many + * callbacks for that event, the logic to add all of them is the same. + * + * When generating more than one callback per event, we make a small + * optimization to avoid generating redundant operations. For instance, for the + * second and all subsequent callbacks of an event, we do not need to reload the + * CPU's index into a TCG temp, since the first callback did it already. + */ +#include "qemu/osdep.h" +#include "cpu.h" +#include "tcg/tcg.h" +#include "tcg/tcg-op.h" +#include "trace/mem.h" +#include "exec/exec-all.h" +#include "exec/plugin-gen.h" +#include "exec/translator.h" + +#ifdef CONFIG_SOFTMMU +# define CONFIG_SOFTMMU_GATE 1 +#else +# define CONFIG_SOFTMMU_GATE 0 +#endif + +/* + * plugin_cb_start TCG op args[]: + * 0: enum plugin_gen_from + * 1: enum plugin_gen_cb + * 2: set to 1 for mem callback that is a write, 0 otherwise. + */ + +enum plugin_gen_from { + PLUGIN_GEN_FROM_TB, + PLUGIN_GEN_FROM_INSN, + PLUGIN_GEN_FROM_MEM, + PLUGIN_GEN_AFTER_INSN, + PLUGIN_GEN_N_FROMS, +}; + +enum plugin_gen_cb { + PLUGIN_GEN_CB_UDATA, + PLUGIN_GEN_CB_INLINE, + PLUGIN_GEN_CB_MEM, + PLUGIN_GEN_ENABLE_MEM_HELPER, + PLUGIN_GEN_DISABLE_MEM_HELPER, + PLUGIN_GEN_N_CBS, +}; + +/* + * These helpers are stubs that get dynamically switched out for calls + * direct to the plugin if they are subscribed to. + */ +void HELPER(plugin_vcpu_udata_cb)(uint32_t cpu_index, void *udata) +{ } + +void HELPER(plugin_vcpu_mem_cb)(unsigned int vcpu_index, + qemu_plugin_meminfo_t info, uint64_t vaddr, + void *userdata) +{ } + +static void do_gen_mem_cb(TCGv vaddr, uint32_t info) +{ + TCGv_i32 cpu_index = tcg_temp_new_i32(); + TCGv_i32 meminfo = tcg_const_i32(info); + TCGv_i64 vaddr64 = tcg_temp_new_i64(); + TCGv_ptr udata = tcg_const_ptr(NULL); + + tcg_gen_ld_i32(cpu_index, cpu_env, + -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index)); + tcg_gen_extu_tl_i64(vaddr64, vaddr); + + gen_helper_plugin_vcpu_mem_cb(cpu_index, meminfo, vaddr64, udata); + + tcg_temp_free_ptr(udata); + tcg_temp_free_i64(vaddr64); + tcg_temp_free_i32(meminfo); + tcg_temp_free_i32(cpu_index); +} + +static void gen_empty_udata_cb(void) +{ + TCGv_i32 cpu_index = tcg_temp_new_i32(); + TCGv_ptr udata = tcg_const_ptr(NULL); /* will be overwritten later */ + + tcg_gen_ld_i32(cpu_index, cpu_env, + -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index)); + gen_helper_plugin_vcpu_udata_cb(cpu_index, udata); + + tcg_temp_free_ptr(udata); + tcg_temp_free_i32(cpu_index); +} + +/* + * For now we only support addi_i64. + * When we support more ops, we can generate one empty inline cb for each. + */ +static void gen_empty_inline_cb(void) +{ + TCGv_i64 val = tcg_temp_new_i64(); + TCGv_ptr ptr = tcg_const_ptr(NULL); /* overwritten later */ + + tcg_gen_ld_i64(val, ptr, 0); + /* pass an immediate != 0 so that it doesn't get optimized away */ + tcg_gen_addi_i64(val, val, 0xdeadface); + tcg_gen_st_i64(val, ptr, 0); + tcg_temp_free_ptr(ptr); + tcg_temp_free_i64(val); +} + +static void gen_empty_mem_cb(TCGv addr, uint32_t info) +{ + do_gen_mem_cb(addr, info); +} + +/* + * Share the same function for enable/disable. When enabling, the NULL + * pointer will be overwritten later. 
+ */ +static void gen_empty_mem_helper(void) +{ + TCGv_ptr ptr; + + ptr = tcg_const_ptr(NULL); + tcg_gen_st_ptr(ptr, cpu_env, offsetof(CPUState, plugin_mem_cbs) - + offsetof(ArchCPU, env)); + tcg_temp_free_ptr(ptr); +} + +static inline +void gen_plugin_cb_start(enum plugin_gen_from from, + enum plugin_gen_cb type, unsigned wr) +{ + TCGOp *op; + + tcg_gen_plugin_cb_start(from, type, wr); + op = tcg_last_op(); + QSIMPLEQ_INSERT_TAIL(&tcg_ctx->plugin_ops, op, plugin_link); +} + +static void gen_wrapped(enum plugin_gen_from from, + enum plugin_gen_cb type, void (*func)(void)) +{ + gen_plugin_cb_start(from, type, 0); + func(); + tcg_gen_plugin_cb_end(); +} + +static inline void plugin_gen_empty_callback(enum plugin_gen_from from) +{ + switch (from) { + case PLUGIN_GEN_AFTER_INSN: + gen_wrapped(from, PLUGIN_GEN_DISABLE_MEM_HELPER, + gen_empty_mem_helper); + break; + case PLUGIN_GEN_FROM_INSN: + /* + * Note: plugin_gen_inject() relies on ENABLE_MEM_HELPER being + * the first callback of an instruction + */ + gen_wrapped(from, PLUGIN_GEN_ENABLE_MEM_HELPER, + gen_empty_mem_helper); + /* fall through */ + case PLUGIN_GEN_FROM_TB: + gen_wrapped(from, PLUGIN_GEN_CB_UDATA, gen_empty_udata_cb); + gen_wrapped(from, PLUGIN_GEN_CB_INLINE, gen_empty_inline_cb); + break; + default: + g_assert_not_reached(); + } +} + +union mem_gen_fn { + void (*mem_fn)(TCGv, uint32_t); + void (*inline_fn)(void); +}; + +static void gen_mem_wrapped(enum plugin_gen_cb type, + const union mem_gen_fn *f, TCGv addr, + uint32_t info, bool is_mem) +{ + int wr = !!(info & TRACE_MEM_ST); + + gen_plugin_cb_start(PLUGIN_GEN_FROM_MEM, type, wr); + if (is_mem) { + f->mem_fn(addr, info); + } else { + f->inline_fn(); + } + tcg_gen_plugin_cb_end(); +} + +void plugin_gen_empty_mem_callback(TCGv addr, uint32_t info) +{ + union mem_gen_fn fn; + + fn.mem_fn = gen_empty_mem_cb; + gen_mem_wrapped(PLUGIN_GEN_CB_MEM, &fn, addr, info, true); + + fn.inline_fn = gen_empty_inline_cb; + gen_mem_wrapped(PLUGIN_GEN_CB_INLINE, &fn, 0, info, false); +} + +static TCGOp *find_op(TCGOp *op, TCGOpcode opc) +{ + while (op) { + if (op->opc == opc) { + return op; + } + op = QTAILQ_NEXT(op, link); + } + return NULL; +} + +static TCGOp *rm_ops_range(TCGOp *begin, TCGOp *end) +{ + TCGOp *ret = QTAILQ_NEXT(end, link); + + QTAILQ_REMOVE_SEVERAL(&tcg_ctx->ops, begin, end, link); + return ret; +} + +/* remove all ops until (and including) plugin_cb_end */ +static TCGOp *rm_ops(TCGOp *op) +{ + TCGOp *end_op = find_op(op, INDEX_op_plugin_cb_end); + + tcg_debug_assert(end_op); + return rm_ops_range(op, end_op); +} + +static TCGOp *copy_op_nocheck(TCGOp **begin_op, TCGOp *op) +{ + *begin_op = QTAILQ_NEXT(*begin_op, link); + tcg_debug_assert(*begin_op); + op = tcg_op_insert_after(tcg_ctx, op, (*begin_op)->opc); + memcpy(op->args, (*begin_op)->args, sizeof(op->args)); + return op; +} + +static TCGOp *copy_op(TCGOp **begin_op, TCGOp *op, TCGOpcode opc) +{ + op = copy_op_nocheck(begin_op, op); + tcg_debug_assert((*begin_op)->opc == opc); + return op; +} + +static TCGOp *copy_extu_i32_i64(TCGOp **begin_op, TCGOp *op) +{ + if (TCG_TARGET_REG_BITS == 32) { + /* mov_i32 */ + op = copy_op(begin_op, op, INDEX_op_mov_i32); + /* movi_i32 */ + op = copy_op(begin_op, op, INDEX_op_movi_i32); + } else { + /* extu_i32_i64 */ + op = copy_op(begin_op, op, INDEX_op_extu_i32_i64); + } + return op; +} + +static TCGOp *copy_mov_i64(TCGOp **begin_op, TCGOp *op) +{ + if (TCG_TARGET_REG_BITS == 32) { + /* 2x mov_i32 */ + op = copy_op(begin_op, op, INDEX_op_mov_i32); + op = copy_op(begin_op, op, 
INDEX_op_mov_i32); + } else { + /* mov_i64 */ + op = copy_op(begin_op, op, INDEX_op_mov_i64); + } + return op; +} + +static TCGOp *copy_movi_i64(TCGOp **begin_op, TCGOp *op, uint64_t v) +{ + if (TCG_TARGET_REG_BITS == 32) { + /* 2x movi_i32 */ + op = copy_op(begin_op, op, INDEX_op_movi_i32); + op->args[1] = v; + + op = copy_op(begin_op, op, INDEX_op_movi_i32); + op->args[1] = v >> 32; + } else { + /* movi_i64 */ + op = copy_op(begin_op, op, INDEX_op_movi_i64); + op->args[1] = v; + } + return op; +} + +static TCGOp *copy_const_ptr(TCGOp **begin_op, TCGOp *op, void *ptr) +{ + if (UINTPTR_MAX == UINT32_MAX) { + /* movi_i32 */ + op = copy_op(begin_op, op, INDEX_op_movi_i32); + op->args[1] = (uintptr_t)ptr; + } else { + /* movi_i64 */ + op = copy_movi_i64(begin_op, op, (uint64_t)(uintptr_t)ptr); + } + return op; +} + +static TCGOp *copy_const_i64(TCGOp **begin_op, TCGOp *op, uint64_t v) +{ + return copy_movi_i64(begin_op, op, v); +} + +static TCGOp *copy_extu_tl_i64(TCGOp **begin_op, TCGOp *op) +{ + if (TARGET_LONG_BITS == 32) { + /* extu_i32_i64 */ + op = copy_extu_i32_i64(begin_op, op); + } else { + /* mov_i64 */ + op = copy_mov_i64(begin_op, op); + } + return op; +} + +static TCGOp *copy_ld_i64(TCGOp **begin_op, TCGOp *op) +{ + if (TCG_TARGET_REG_BITS == 32) { + /* 2x ld_i32 */ + op = copy_op(begin_op, op, INDEX_op_ld_i32); + op = copy_op(begin_op, op, INDEX_op_ld_i32); + } else { + /* ld_i64 */ + op = copy_op(begin_op, op, INDEX_op_ld_i64); + } + return op; +} + +static TCGOp *copy_st_i64(TCGOp **begin_op, TCGOp *op) +{ + if (TCG_TARGET_REG_BITS == 32) { + /* 2x st_i32 */ + op = copy_op(begin_op, op, INDEX_op_st_i32); + op = copy_op(begin_op, op, INDEX_op_st_i32); + } else { + /* st_i64 */ + op = copy_op(begin_op, op, INDEX_op_st_i64); + } + return op; +} + +static TCGOp *copy_add_i64(TCGOp **begin_op, TCGOp *op) +{ + if (TCG_TARGET_REG_BITS == 32) { + /* all 32-bit backends must implement add2_i32 */ + g_assert(TCG_TARGET_HAS_add2_i32); + op = copy_op(begin_op, op, INDEX_op_add2_i32); + } else { + op = copy_op(begin_op, op, INDEX_op_add_i64); + } + return op; +} + +static TCGOp *copy_st_ptr(TCGOp **begin_op, TCGOp *op) +{ + if (UINTPTR_MAX == UINT32_MAX) { + /* st_i32 */ + op = copy_op(begin_op, op, INDEX_op_st_i32); + } else { + /* st_i64 */ + op = copy_st_i64(begin_op, op); + } + return op; +} + +static TCGOp *copy_call(TCGOp **begin_op, TCGOp *op, void *empty_func, + void *func, unsigned tcg_flags, int *cb_idx) +{ + /* copy all ops until the call */ + do { + op = copy_op_nocheck(begin_op, op); + } while (op->opc != INDEX_op_call); + + /* fill in the op call */ + op->param1 = (*begin_op)->param1; + op->param2 = (*begin_op)->param2; + tcg_debug_assert(op->life == 0); + if (*cb_idx == -1) { + int i; + + /* + * Instead of working out the position of the callback in args[], just + * look for @empty_func, since it should be a unique pointer. 
+ */ + for (i = 0; i < MAX_OPC_PARAM_ARGS; i++) { + if ((uintptr_t)(*begin_op)->args[i] == (uintptr_t)empty_func) { + *cb_idx = i; + break; + } + } + tcg_debug_assert(i < MAX_OPC_PARAM_ARGS); + } + op->args[*cb_idx] = (uintptr_t)func; + op->args[*cb_idx + 1] = tcg_flags; + + return op; +} + +static TCGOp *append_udata_cb(const struct qemu_plugin_dyn_cb *cb, + TCGOp *begin_op, TCGOp *op, int *cb_idx) +{ + /* const_ptr */ + op = copy_const_ptr(&begin_op, op, cb->userp); + + /* copy the ld_i32, but note that we only have to copy it once */ + begin_op = QTAILQ_NEXT(begin_op, link); + tcg_debug_assert(begin_op && begin_op->opc == INDEX_op_ld_i32); + if (*cb_idx == -1) { + op = tcg_op_insert_after(tcg_ctx, op, INDEX_op_ld_i32); + memcpy(op->args, begin_op->args, sizeof(op->args)); + } + + /* call */ + op = copy_call(&begin_op, op, HELPER(plugin_vcpu_udata_cb), + cb->f.vcpu_udata, cb->tcg_flags, cb_idx); + + return op; +} + +static TCGOp *append_inline_cb(const struct qemu_plugin_dyn_cb *cb, + TCGOp *begin_op, TCGOp *op, + int *unused) +{ + /* const_ptr */ + op = copy_const_ptr(&begin_op, op, cb->userp); + + /* ld_i64 */ + op = copy_ld_i64(&begin_op, op); + + /* const_i64 */ + op = copy_const_i64(&begin_op, op, cb->inline_insn.imm); + + /* add_i64 */ + op = copy_add_i64(&begin_op, op); + + /* st_i64 */ + op = copy_st_i64(&begin_op, op); + + return op; +} + +static TCGOp *append_mem_cb(const struct qemu_plugin_dyn_cb *cb, + TCGOp *begin_op, TCGOp *op, int *cb_idx) +{ + enum plugin_gen_cb type = begin_op->args[1]; + + tcg_debug_assert(type == PLUGIN_GEN_CB_MEM); + + /* const_i32 == movi_i32 ("info", so it remains as is) */ + op = copy_op(&begin_op, op, INDEX_op_movi_i32); + + /* const_ptr */ + op = copy_const_ptr(&begin_op, op, cb->userp); + + /* copy the ld_i32, but note that we only have to copy it once */ + begin_op = QTAILQ_NEXT(begin_op, link); + tcg_debug_assert(begin_op && begin_op->opc == INDEX_op_ld_i32); + if (*cb_idx == -1) { + op = tcg_op_insert_after(tcg_ctx, op, INDEX_op_ld_i32); + memcpy(op->args, begin_op->args, sizeof(op->args)); + } + + /* extu_tl_i64 */ + op = copy_extu_tl_i64(&begin_op, op); + + if (type == PLUGIN_GEN_CB_MEM) { + /* call */ + op = copy_call(&begin_op, op, HELPER(plugin_vcpu_mem_cb), + cb->f.vcpu_udata, cb->tcg_flags, cb_idx); + } + + return op; +} + +typedef TCGOp *(*inject_fn)(const struct qemu_plugin_dyn_cb *cb, + TCGOp *begin_op, TCGOp *op, int *intp); +typedef bool (*op_ok_fn)(const TCGOp *op, const struct qemu_plugin_dyn_cb *cb); + +static bool op_ok(const TCGOp *op, const struct qemu_plugin_dyn_cb *cb) +{ + return true; +} + +static bool op_rw(const TCGOp *op, const struct qemu_plugin_dyn_cb *cb) +{ + int w; + + w = op->args[2]; + return !!(cb->rw & (w + 1)); +} + +static inline +void inject_cb_type(const GArray *cbs, TCGOp *begin_op, inject_fn inject, + op_ok_fn ok) +{ + TCGOp *end_op; + TCGOp *op; + int cb_idx = -1; + int i; + + if (!cbs || cbs->len == 0) { + rm_ops(begin_op); + return; + } + + end_op = find_op(begin_op, INDEX_op_plugin_cb_end); + tcg_debug_assert(end_op); + + op = end_op; + for (i = 0; i < cbs->len; i++) { + struct qemu_plugin_dyn_cb *cb = + &g_array_index(cbs, struct qemu_plugin_dyn_cb, i); + + if (!ok(begin_op, cb)) { + continue; + } + op = inject(cb, begin_op, op, &cb_idx); + } + rm_ops_range(begin_op, end_op); +} + +static void +inject_udata_cb(const GArray *cbs, TCGOp *begin_op) +{ + inject_cb_type(cbs, begin_op, append_udata_cb, op_ok); +} + +static void +inject_inline_cb(const GArray *cbs, TCGOp *begin_op, op_ok_fn ok) +{ + 
inject_cb_type(cbs, begin_op, append_inline_cb, ok); +} + +static void +inject_mem_cb(const GArray *cbs, TCGOp *begin_op) +{ + inject_cb_type(cbs, begin_op, append_mem_cb, op_rw); +} + +/* we could change the ops in place, but we can reuse more code by copying */ +static void inject_mem_helper(TCGOp *begin_op, GArray *arr) +{ + TCGOp *orig_op = begin_op; + TCGOp *end_op; + TCGOp *op; + + end_op = find_op(begin_op, INDEX_op_plugin_cb_end); + tcg_debug_assert(end_op); + + /* const ptr */ + op = copy_const_ptr(&begin_op, end_op, arr); + + /* st_ptr */ + op = copy_st_ptr(&begin_op, op); + + rm_ops_range(orig_op, end_op); +} + +/* + * Tracking memory accesses performed from helpers requires extra work. + * If an instruction is emulated with helpers, we do two things: + * (1) copy the CB descriptors, and keep track of it so that they can be + * freed later on, and (2) point CPUState.plugin_mem_cbs to the descriptors, so + * that we can read them at run-time (i.e. when the helper executes). + * This run-time access is performed from qemu_plugin_vcpu_mem_cb. + * + * Note that plugin_gen_disable_mem_helpers undoes (2). Since it + * is possible that the code we generate after the instruction is + * dead, we also add checks before generating tb_exit etc. + */ +static void inject_mem_enable_helper(struct qemu_plugin_insn *plugin_insn, + TCGOp *begin_op) +{ + GArray *cbs[2]; + GArray *arr; + size_t n_cbs, i; + + cbs[0] = plugin_insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR]; + cbs[1] = plugin_insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE]; + + n_cbs = 0; + for (i = 0; i < ARRAY_SIZE(cbs); i++) { + n_cbs += cbs[i]->len; + } + + plugin_insn->mem_helper = plugin_insn->calls_helpers && n_cbs; + if (likely(!plugin_insn->mem_helper)) { + rm_ops(begin_op); + return; + } + + arr = g_array_sized_new(false, false, + sizeof(struct qemu_plugin_dyn_cb), n_cbs); + + for (i = 0; i < ARRAY_SIZE(cbs); i++) { + g_array_append_vals(arr, cbs[i]->data, cbs[i]->len); + } + + qemu_plugin_add_dyn_cb_arr(arr); + inject_mem_helper(begin_op, arr); +} + +static void inject_mem_disable_helper(struct qemu_plugin_insn *plugin_insn, + TCGOp *begin_op) +{ + if (likely(!plugin_insn->mem_helper)) { + rm_ops(begin_op); + return; + } + inject_mem_helper(begin_op, NULL); +} + +/* called before finishing a TB with exit_tb, goto_tb or goto_ptr */ +void plugin_gen_disable_mem_helpers(void) +{ + TCGv_ptr ptr; + + if (likely(tcg_ctx->plugin_insn == NULL || + !tcg_ctx->plugin_insn->mem_helper)) { + return; + } + ptr = tcg_const_ptr(NULL); + tcg_gen_st_ptr(ptr, cpu_env, offsetof(CPUState, plugin_mem_cbs) - + offsetof(ArchCPU, env)); + tcg_temp_free_ptr(ptr); + tcg_ctx->plugin_insn->mem_helper = false; +} + +static void plugin_gen_tb_udata(const struct qemu_plugin_tb *ptb, + TCGOp *begin_op) +{ + inject_udata_cb(ptb->cbs[PLUGIN_CB_REGULAR], begin_op); +} + +static void plugin_gen_tb_inline(const struct qemu_plugin_tb *ptb, + TCGOp *begin_op) +{ + inject_inline_cb(ptb->cbs[PLUGIN_CB_INLINE], begin_op, op_ok); +} + +static void plugin_gen_insn_udata(const struct qemu_plugin_tb *ptb, + TCGOp *begin_op, int insn_idx) +{ + struct qemu_plugin_insn *insn = g_ptr_array_index(ptb->insns, insn_idx); + + inject_udata_cb(insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR], begin_op); +} + +static void plugin_gen_insn_inline(const struct qemu_plugin_tb *ptb, + TCGOp *begin_op, int insn_idx) +{ + struct qemu_plugin_insn *insn = g_ptr_array_index(ptb->insns, insn_idx); + inject_inline_cb(insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE], + begin_op, op_ok); +} + +static void 
plugin_gen_mem_regular(const struct qemu_plugin_tb *ptb, + TCGOp *begin_op, int insn_idx) +{ + struct qemu_plugin_insn *insn = g_ptr_array_index(ptb->insns, insn_idx); + inject_mem_cb(insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR], begin_op); +} + +static void plugin_gen_mem_inline(const struct qemu_plugin_tb *ptb, + TCGOp *begin_op, int insn_idx) +{ + const GArray *cbs; + struct qemu_plugin_insn *insn = g_ptr_array_index(ptb->insns, insn_idx); + + cbs = insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE]; + inject_inline_cb(cbs, begin_op, op_rw); +} + +static void plugin_gen_enable_mem_helper(const struct qemu_plugin_tb *ptb, + TCGOp *begin_op, int insn_idx) +{ + struct qemu_plugin_insn *insn = g_ptr_array_index(ptb->insns, insn_idx); + inject_mem_enable_helper(insn, begin_op); +} + +static void plugin_gen_disable_mem_helper(const struct qemu_plugin_tb *ptb, + TCGOp *begin_op, int insn_idx) +{ + struct qemu_plugin_insn *insn = g_ptr_array_index(ptb->insns, insn_idx); + inject_mem_disable_helper(insn, begin_op); +} + +static void plugin_inject_cb(const struct qemu_plugin_tb *ptb, TCGOp *begin_op, + int insn_idx) +{ + enum plugin_gen_from from = begin_op->args[0]; + enum plugin_gen_cb type = begin_op->args[1]; + + switch (from) { + case PLUGIN_GEN_FROM_TB: + switch (type) { + case PLUGIN_GEN_CB_UDATA: + plugin_gen_tb_udata(ptb, begin_op); + return; + case PLUGIN_GEN_CB_INLINE: + plugin_gen_tb_inline(ptb, begin_op); + return; + default: + g_assert_not_reached(); + } + case PLUGIN_GEN_FROM_INSN: + switch (type) { + case PLUGIN_GEN_CB_UDATA: + plugin_gen_insn_udata(ptb, begin_op, insn_idx); + return; + case PLUGIN_GEN_CB_INLINE: + plugin_gen_insn_inline(ptb, begin_op, insn_idx); + return; + case PLUGIN_GEN_ENABLE_MEM_HELPER: + plugin_gen_enable_mem_helper(ptb, begin_op, insn_idx); + return; + default: + g_assert_not_reached(); + } + case PLUGIN_GEN_FROM_MEM: + switch (type) { + case PLUGIN_GEN_CB_MEM: + plugin_gen_mem_regular(ptb, begin_op, insn_idx); + return; + case PLUGIN_GEN_CB_INLINE: + plugin_gen_mem_inline(ptb, begin_op, insn_idx); + return; + default: + g_assert_not_reached(); + } + case PLUGIN_GEN_AFTER_INSN: + switch (type) { + case PLUGIN_GEN_DISABLE_MEM_HELPER: + plugin_gen_disable_mem_helper(ptb, begin_op, insn_idx); + return; + default: + g_assert_not_reached(); + } + default: + g_assert_not_reached(); + } +} + +/* #define DEBUG_PLUGIN_GEN_OPS */ +static void pr_ops(void) +{ +#ifdef DEBUG_PLUGIN_GEN_OPS + TCGOp *op; + int i = 0; + + QTAILQ_FOREACH(op, &tcg_ctx->ops, link) { + const char *name = ""; + const char *type = ""; + + if (op->opc == INDEX_op_plugin_cb_start) { + switch (op->args[0]) { + case PLUGIN_GEN_FROM_TB: + name = "tb"; + break; + case PLUGIN_GEN_FROM_INSN: + name = "insn"; + break; + case PLUGIN_GEN_FROM_MEM: + name = "mem"; + break; + case PLUGIN_GEN_AFTER_INSN: + name = "after insn"; + break; + default: + break; + } + switch (op->args[1]) { + case PLUGIN_GEN_CB_UDATA: + type = "udata"; + break; + case PLUGIN_GEN_CB_INLINE: + type = "inline"; + break; + case PLUGIN_GEN_CB_MEM: + type = "mem"; + break; + case PLUGIN_GEN_ENABLE_MEM_HELPER: + type = "enable mem helper"; + break; + case PLUGIN_GEN_DISABLE_MEM_HELPER: + type = "disable mem helper"; + break; + default: + break; + } + } + printf("op[%2i]: %s %s %s\n", i, tcg_op_defs[op->opc].name, name, type); + i++; + } +#endif +} + +static void plugin_gen_inject(const struct qemu_plugin_tb *plugin_tb) +{ + TCGOp *op; + int insn_idx; + + pr_ops(); + insn_idx = -1; + QSIMPLEQ_FOREACH(op, &tcg_ctx->plugin_ops, plugin_link) { + enum 
plugin_gen_from from = op->args[0]; + enum plugin_gen_cb type = op->args[1]; + + tcg_debug_assert(op->opc == INDEX_op_plugin_cb_start); + /* ENABLE_MEM_HELPER is the first callback of an instruction */ + if (from == PLUGIN_GEN_FROM_INSN && + type == PLUGIN_GEN_ENABLE_MEM_HELPER) { + insn_idx++; + } + plugin_inject_cb(plugin_tb, op, insn_idx); + } + pr_ops(); +} + +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb) +{ + struct qemu_plugin_tb *ptb = tcg_ctx->plugin_tb; + bool ret = false; + + if (test_bit(QEMU_PLUGIN_EV_VCPU_TB_TRANS, cpu->plugin_mask)) { + ret = true; + + QSIMPLEQ_INIT(&tcg_ctx->plugin_ops); + ptb->vaddr = tb->pc; + ptb->vaddr2 = -1; + get_page_addr_code_hostp(cpu->env_ptr, tb->pc, &ptb->haddr1); + ptb->haddr2 = NULL; + + plugin_gen_empty_callback(PLUGIN_GEN_FROM_TB); + } + return ret; +} + +void plugin_gen_insn_start(CPUState *cpu, const DisasContextBase *db) +{ + struct qemu_plugin_tb *ptb = tcg_ctx->plugin_tb; + struct qemu_plugin_insn *pinsn; + + pinsn = qemu_plugin_tb_insn_get(ptb); + tcg_ctx->plugin_insn = pinsn; + pinsn->vaddr = db->pc_next; + plugin_gen_empty_callback(PLUGIN_GEN_FROM_INSN); + + /* + * Detect page crossing to get the new host address. + * Note that we skip this when haddr1 == NULL, e.g. when we're + * fetching instructions from a region not backed by RAM. + */ + if (likely(ptb->haddr1 != NULL && ptb->vaddr2 == -1) && + unlikely((db->pc_next & TARGET_PAGE_MASK) != + (db->pc_first & TARGET_PAGE_MASK))) { + get_page_addr_code_hostp(cpu->env_ptr, db->pc_next, + &ptb->haddr2); + ptb->vaddr2 = db->pc_next; + } + if (likely(ptb->vaddr2 == -1)) { + pinsn->haddr = ptb->haddr1 + pinsn->vaddr - ptb->vaddr; + } else { + pinsn->haddr = ptb->haddr2 + pinsn->vaddr - ptb->vaddr2; + } +} + +void plugin_gen_insn_end(void) +{ + plugin_gen_empty_callback(PLUGIN_GEN_AFTER_INSN); +} + +void plugin_gen_tb_end(CPUState *cpu) +{ + struct qemu_plugin_tb *ptb = tcg_ctx->plugin_tb; + int i; + + /* collect instrumentation requests */ + qemu_plugin_tb_trans_cb(cpu, ptb); + + /* inject the instrumentation at the appropriate places */ + plugin_gen_inject(ptb); + + /* clean up */ + for (i = 0; i < PLUGIN_N_CB_SUBTYPES; i++) { + if (ptb->cbs[i]) { + g_array_set_size(ptb->cbs[i], 0); + } + } + ptb->n = 0; + tcg_ctx->plugin_insn = NULL; +} diff --git a/accel/tcg/plugin-helpers.h b/accel/tcg/plugin-helpers.h new file mode 100644 index 0000000000..1916ee7920 --- /dev/null +++ b/accel/tcg/plugin-helpers.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_PLUGIN +/* Note: no TCG flags because those are overwritten later */ +DEF_HELPER_2(plugin_vcpu_udata_cb, void, i32, ptr) +DEF_HELPER_4(plugin_vcpu_mem_cb, void, i32, i32, i64, ptr) +#endif diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index ae063b53f9..9f48da9472 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -1214,6 +1214,8 @@ static gboolean tb_host_size_iter(gpointer key, gpointer value, gpointer data) /* flush all the translation blocks */ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count) { + bool did_flush = false; + mmap_lock(); /* If it is already been done on request of another CPU, * just retry. 
@@ -1221,6 +1223,7 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count) if (tb_ctx.tb_flush_count != tb_flush_count.host_int) { goto done; } + did_flush = true; if (DEBUG_TB_FLUSH_GATE) { size_t nb_tbs = tcg_nb_tbs(); @@ -1245,14 +1248,22 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count) done: mmap_unlock(); + if (did_flush) { + qemu_plugin_flush_cb(); + } } void tb_flush(CPUState *cpu) { if (tcg_enabled()) { unsigned tb_flush_count = atomic_mb_read(&tb_ctx.tb_flush_count); - async_safe_run_on_cpu(cpu, do_tb_flush, - RUN_ON_CPU_HOST_INT(tb_flush_count)); + + if (cpu_in_exclusive_context(cpu)) { + do_tb_flush(cpu, RUN_ON_CPU_HOST_INT(tb_flush_count)); + } else { + async_safe_run_on_cpu(cpu, do_tb_flush, + RUN_ON_CPU_HOST_INT(tb_flush_count)); + } } } diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c index 70c66c538c..f977682be7 100644 --- a/accel/tcg/translator.c +++ b/accel/tcg/translator.c @@ -16,6 +16,7 @@ #include "exec/gen-icount.h" #include "exec/log.h" #include "exec/translator.h" +#include "exec/plugin-gen.h" /* Pairs with tcg_clear_temp_count. To be called by #TranslatorOps.{translate_insn,tb_stop} if @@ -34,6 +35,7 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db, CPUState *cpu, TranslationBlock *tb, int max_insns) { int bp_insn = 0; + bool plugin_enabled; /* Initialize DisasContext */ db->tb = tb; @@ -55,11 +57,17 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db, ops->tb_start(db, cpu); tcg_debug_assert(db->is_jmp == DISAS_NEXT); /* no early exit */ + plugin_enabled = plugin_gen_tb_start(cpu, tb); + while (true) { db->num_insns++; ops->insn_start(db, cpu); tcg_debug_assert(db->is_jmp == DISAS_NEXT); /* no early exit */ + if (plugin_enabled) { + plugin_gen_insn_start(cpu, db); + } + /* Pass breakpoint hits to target for further processing */ if (!db->singlestep_enabled && unlikely(!QTAILQ_EMPTY(&cpu->breakpoints))) { @@ -99,6 +107,14 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db, break; } + /* + * We can't instrument after instructions that change control + * flow although this only really affects post-load operations. + */ + if (plugin_enabled) { + plugin_gen_insn_end(); + } + /* Stop translation if the output buffer is full, or we have executed all of the allowed instructions. */ if (tcg_op_buf_full() || db->num_insns >= db->max_insns) { @@ -111,6 +127,10 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db, ops->tb_stop(db, cpu); gen_tb_end(db->tb, db->num_insns - bp_insn); + if (plugin_enabled) { + plugin_gen_tb_end(cpu); + } + /* The disas_log hook may use these values rather than recompute. */ db->tb->size = db->pc_next - db->pc_first; db->tb->icount = db->num_insns; diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c index 71c4bf6477..b09f7a1577 100644 --- a/accel/tcg/user-exec.c +++ b/accel/tcg/user-exec.c @@ -751,10 +751,13 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr, #define ATOMIC_MMU_DECLS do {} while (0) #define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, DATA_SIZE, GETPC()) #define ATOMIC_MMU_CLEANUP do { clear_helper_retaddr(); } while (0) +#define ATOMIC_MMU_IDX MMU_USER_IDX #define ATOMIC_NAME(X) HELPER(glue(glue(atomic_ ## X, SUFFIX), END)) #define EXTRA_ARGS +#include "atomic_common.inc.c" + #define DATA_SIZE 1 #include "atomic_template.h" |
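For orientation, the atomic_template.h changes above replace the ATOMIC_TRACE_* macros with paired pre/post calls, so plugins are notified after the access has actually completed. Below is a minimal, self-contained sketch of the shape each instantiated read-modify-write helper now takes; the function names, the zero info value, and the plain pointer argument are simplified stand-ins, not the real template expansion.

```c
#include <stdint.h>
#include <stdatomic.h>

/* Stand-ins for the hooks defined in atomic_common.inc.c. */
static void atomic_trace_rmw_pre(uint64_t addr, uint16_t info)  { /* trace before the access */ }
static void atomic_trace_rmw_post(uint64_t addr, uint16_t info) { /* notify plugins after it  */ }

/* Simplified shape of one generated RMW helper (e.g. a 4-byte fetch_add). */
static uint32_t atomic_fetch_add_sketch(_Atomic uint32_t *haddr,
                                        uint64_t guest_addr, uint32_t val)
{
    /* the real code builds this with trace_mem_build_info_no_se(SHIFT, false, ATOMIC_MMU_IDX) */
    uint16_t info = 0;
    uint32_t ret;

    atomic_trace_rmw_pre(guest_addr, info);
    ret = atomic_fetch_add(haddr, val);   /* the template uses atomic_##X on the looked-up host address */
    atomic_trace_rmw_post(guest_addr, info);
    return ret;
}
```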
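The long comment at the top of plugin-gen.c explains that, rather than duplicating per-architecture knowledge of where a call's helper pointer lives in TCGOp.args[], copy_call() searches the copied op for the unique empty-helper pointer and overwrites it. A stripped-down, hypothetical version of that substitution (plain arrays instead of TCGOp, invented names) looks roughly like this:

```c
#include <stdint.h>
#include <assert.h>

#define MAX_ARGS 10   /* stand-in for MAX_OPC_PARAM_ARGS */

/*
 * Locate the slot that currently holds @empty_func (done only once; the
 * index is cached in *cb_idx) and patch in the real plugin callback.
 */
static void patch_callback(uintptr_t args[MAX_ARGS], void *empty_func,
                           void *real_func, int *cb_idx)
{
    if (*cb_idx == -1) {
        int i;

        for (i = 0; i < MAX_ARGS; i++) {
            if (args[i] == (uintptr_t)empty_func) {
                *cb_idx = i;
                break;
            }
        }
        assert(i < MAX_ARGS);   /* the empty helper must be present */
    }
    args[*cb_idx] = (uintptr_t)real_func;
}
```

Because the empty helpers are unique function pointers in the program, the first match is guaranteed to be the callback slot, and subsequent callbacks for the same event can reuse the cached index instead of searching again.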