aboutsummaryrefslogtreecommitdiff
path: root/softmmu_template.h
diff options
context:
space:
mode:
authorXin Tong <trent.tong@gmail.com>2014-08-04 20:35:23 -0500
committerPeter Maydell <peter.maydell@linaro.org>2014-09-01 17:43:06 +0100
commit88e89a57f985296a6eeb416b2a875072e09d7faa (patch)
tree1b8b30a624e75bb0a488ae6a5a3c0a7192fc6cd1 /softmmu_template.h
parent44ea34309e38ce1bcb7d2c8816c6b0baaee7979c (diff)
implementing victim TLB for QEMU system emulated TLB
QEMU system mode page table walks are expensive. Measured by running QEMU qemu-system-x86_64 system mode under Intel PIN, a TLB miss and the resulting walk of a 4-level page table in a guest Linux OS take ~450 x86 instructions on average. The QEMU system mode TLB is implemented as a direct-mapped hash table. This structure suffers from conflict misses. Increasing the associativity of the TLB may not be the solution to conflict misses, as all the ways may have to be walked serially. A victim TLB is a TLB used to hold translations evicted from the primary TLB upon replacement. The victim TLB lies between the main TLB and its refill path. The victim TLB has greater associativity (it is fully associative in this patch). It takes longer to look up the victim TLB, but it's likely better than a full page table walk. The memory translation path is changed as follows: Before Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. TLB refill. 5. Do the memory access. 6. Return to code cache. After Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. Victim TLB lookup. 5. If victim TLB misses, TLB refill 6. Do the memory access. 7. Return to code cache. The advantage is that the victim TLB can offer more associativity to a direct-mapped TLB, and thus potentially fewer page table walks, while still keeping the time taken to flush within reasonable limits. However, placing a victim TLB before the refill path lengthens the TLB refill path, as the victim TLB is consulted before the TLB refill. The performance results demonstrate that the pros outweigh the cons. Some performance results taken on SPECINT2006 train datasets, kernel boot and the qemu configure script on an Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine are shown in the Google Doc link below.
https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing In summary, the victim TLB improves the performance of qemu-system-x86_64 by 11% on average on SPECINT2006, kernel boot and the qemu configure script, with the highest improvement of 26% in 456.hmmer. The victim TLB does not result in any performance degradation in any of the measured benchmarks. Furthermore, the implemented victim TLB is architecture independent and is expected to benefit other architectures in QEMU as well. Although there are measurement fluctuations, the performance improvement is very significant and by no means in the range of noise. Signed-off-by: Xin Tong <trent.tong@gmail.com> Message-id: 1407202523-23553-1-git-send-email-trent.tong@gmail.com Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'softmmu_template.h')
-rw-r--r--softmmu_template.h43
1 files changed, 39 insertions, 4 deletions
diff --git a/softmmu_template.h b/softmmu_template.h
index 5a07f991a1..88e33900b6 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -116,6 +116,31 @@
# define helper_te_st_name helper_le_st_name
#endif
+/* VICTIM_TLB_HIT(ty): look up addr's page in the victim TLB, comparing entry field `ty`; reads env, mmu_idx, index and addr from the expansion site. */
+#define VICTIM_TLB_HIT(ty) \
+({ \
+ /* Used right before falling back to tlb_fill (the page table walk \
+ * path): scan every victim TLB slot for this mmu_idx first, so that \
+ * a match can be reused in place of a full page table walk. */ \
+ int vidx; \
+ hwaddr tmpiotlb; \
+ CPUTLBEntry tmptlb; \
+ for (vidx = CPU_VTLB_SIZE-1; vidx >= 0; --vidx) { \
+ if (env->tlb_v_table[mmu_idx][vidx].ty == (addr & TARGET_PAGE_MASK)) {\
+ /* hit: exchange the main-TLB and victim entries, and their iotlb values, so neither is lost */ \
+ tmptlb = env->tlb_table[mmu_idx][index]; \
+ env->tlb_table[mmu_idx][index] = env->tlb_v_table[mmu_idx][vidx]; \
+ env->tlb_v_table[mmu_idx][vidx] = tmptlb; \
+ tmpiotlb = env->iotlb[mmu_idx][index]; \
+ env->iotlb[mmu_idx][index] = env->iotlb_v[mmu_idx][vidx]; \
+ env->iotlb_v[mmu_idx][vidx] = tmpiotlb; \
+ break; \
+ } \
+ } \
+ /* value of the macro: vidx ends below 0 only if no slot matched, so this is true on a victim TLB hit */ \
+ vidx >= 0; \
+})
+
#ifndef SOFTMMU_CODE_ACCESS
static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env,
hwaddr physaddr,
@@ -161,7 +186,10 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
mmu_idx, retaddr);
}
#endif
- tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
+ if (!VICTIM_TLB_HIT(ADDR_READ)) {
+ tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+ mmu_idx, retaddr);
+ }
tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
}
@@ -246,7 +274,10 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
mmu_idx, retaddr);
}
#endif
- tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
+ if (!VICTIM_TLB_HIT(ADDR_READ)) {
+ tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+ mmu_idx, retaddr);
+ }
tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
}
@@ -368,7 +399,9 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
cpu_unaligned_access(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
}
#endif
- tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
+ if (!VICTIM_TLB_HIT(addr_write)) {
+ tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
+ }
tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
}
@@ -444,7 +477,9 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
cpu_unaligned_access(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
}
#endif
- tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
+ if (!VICTIM_TLB_HIT(addr_write)) {
+ tlb_fill(ENV_GET_CPU(env), addr, 1, mmu_idx, retaddr);
+ }
tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
}