author     Peter Maydell <peter.maydell@linaro.org>  2018-06-15 15:27:48 +0100
committer  Peter Maydell <peter.maydell@linaro.org>  2018-06-15 15:27:48 +0100
commit     81d386479640879d87ab9661c8fb44d586c965ec
tree       d406d297e04eb0b86fd16ac974aaf34673110110
parent     2702c2d3eb74e3908c0c5dbf3a71c8987595a86e
parent     14120108f87b3f9e1beacdf0a6096e464e62bb65
Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20180615' into staging
target-arm and miscellaneous queue:
* fix KVM state save/restore for GICv3 priority registers for high IRQ numbers
* hw/arm/mps2-tz: Put ethernet controller behind PPC
* hw/sh4/sh7750: Convert away from old_mmio
* hw/m68k/mcf5206: Convert away from old_mmio
* hw/block/pflash_cfi02: Convert away from old_mmio
* hw/watchdog/wdt_i6300esb: Convert away from old_mmio
* hw/input/pckbd: Convert away from old_mmio
* hw/char/parallel: Convert away from old_mmio
* armv7m: refactor to get rid of armv7m_init() function
* arm: Don't crash if user tries to use a Cortex-M CPU without an NVIC
* hw/core/or-irq: Support more than 16 inputs to an OR gate
* cpu-defs.h: Document CPUIOTLBEntry 'addr' field
* cputlb: Pass cpu_transaction_failed() the correct physaddr
* CODING_STYLE: Define our preferred form for multiline comments
* Add and use new stn_*_p() and ldn_*_p() memory access functions
* target/arm: More parts of the upcoming SVE support
* aspeed_scu: Implement RNG register
* m25p80: add support for two bytes WRSR for Macronix chips
* exec.c: Handle IOMMUs being in the path of TCG CPU memory accesses
* target/arm: Allow ARMv6-M Thumb2 instructions
# gpg: Signature made Fri 15 Jun 2018 15:24:03 BST
# gpg: using RSA key 3C2525ED14360CDE
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>"
# gpg: aka "Peter Maydell <pmaydell@gmail.com>"
# gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>"
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE
* remotes/pmaydell/tags/pull-target-arm-20180615: (43 commits)
target/arm: Allow ARMv6-M Thumb2 instructions
exec.c: Handle IOMMUs in address_space_translate_for_iotlb()
iommu: Add IOMMU index argument to translate method
iommu: Add IOMMU index argument to notifier APIs
iommu: Add IOMMU index concept to IOMMU API
m25p80: add support for two bytes WRSR for Macronix chips
aspeed_scu: Implement RNG register
target/arm: Implement SVE Floating Point Arithmetic - Unpredicated Group
target/arm: Implement SVE Integer Wide Immediate - Unpredicated Group
target/arm: Implement FDUP/DUP
target/arm: Implement SVE Integer Compare - Scalars Group
target/arm: Implement SVE Predicate Count Group
target/arm: Implement SVE Partition Break Group
target/arm: Implement SVE Integer Compare - Immediate Group
target/arm: Implement SVE Integer Compare - Vectors Group
target/arm: Implement SVE Select Vectors Group
target/arm: Implement SVE vector splice (predicated)
target/arm: Implement SVE reverse within elements
target/arm: Implement SVE copy to vector (predicated)
target/arm: Implement SVE conditionally broadcast/extract element
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
48 files changed, 4114 insertions, 363 deletions
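
Two of the API changes in this pull request are easiest to follow with short usage sketches before the diff itself. First, the series adds size-parameterised accessors stn{endian}_p() and ldn{endian}_p() (defined in include/qemu/bswap.h in the diff below). The snippet here is an illustrative sketch, not code from the series: the function names demo_read/demo_write and the choice of little-endian are invented for the example, and it assumes it is built inside the QEMU tree so the usual headers are available.

/* Illustrative only: assumes QEMU's build environment and headers. */
#include "qemu/osdep.h"
#include "qemu/bswap.h"

static uint64_t demo_read(const void *buf, unsigned size)
{
    /* Load 'size' bytes (1, 2, 4 or 8) from buf as a little-endian value */
    return ldn_le_p(buf, size);
}

static void demo_write(void *buf, unsigned size, uint64_t val)
{
    /* Store the low 'size' bytes of val to buf in little-endian order */
    stn_le_p(buf, size, val);
}

This replaces the switch-on-size blocks that exec.c previously open-coded, as the hunks below show.

Second, the IOMMU index concept: an IOMMU memory region whose translation depends on transaction attributes can now implement attrs_to_index() and num_indexes(), and translate() receives the selected index. The following is a minimal sketch of what a hypothetical IOMMU model keeping separate secure and non-secure translation tables might add; all the demo_* names are invented for illustration and are not part of this series.

/* Hypothetical IOMMU model: index 1 for secure transactions, index 0
 * for non-secure, matching the example given in the memory.h comment
 * added by this series.
 */
static int demo_iommu_attrs_to_index(IOMMUMemoryRegion *iommu, MemTxAttrs attrs)
{
    return attrs.secure ? 1 : 0;
}

static int demo_iommu_num_indexes(IOMMUMemoryRegion *iommu)
{
    return 2;
}

static void demo_iommu_memory_region_class_init(ObjectClass *klass, void *data)
{
    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);

    imrc->attrs_to_index = demo_iommu_attrs_to_index;
    imrc->num_indexes = demo_iommu_num_indexes;
    /* imrc->translate must then honour the iommu_idx argument it is passed */
}

IOMMUs that ignore transaction attributes need no changes beyond the extra iommu_idx parameter on their translate method, as the mechanical updates to typhoon, smmuv3, rc4030 and the other IOMMU models below illustrate.
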
diff --git a/CODING_STYLE b/CODING_STYLE index 12ba58ee29..ec075dedc4 100644 --- a/CODING_STYLE +++ b/CODING_STYLE @@ -124,6 +124,23 @@ We use traditional C-style /* */ comments and avoid // comments. Rationale: The // form is valid in C99, so this is purely a matter of consistency of style. The checkpatch script will warn you about this. +Multiline comment blocks should have a row of stars on the left, +and the initial /* and terminating */ both on their own lines: + /* + * like + * this + */ +This is the same format required by the Linux kernel coding style. + +(Some of the existing comments in the codebase use the GNU Coding +Standards form which does not have stars on the left, or other +variations; avoid these when writing new comments, but don't worry +about converting to the preferred form unless you're editing that +comment anyway.) + +Rationale: Consistency, and ease of visually picking out a multiline +comment from the surrounding code. + 8. trace-events style 8.1 0x prefix diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c index 05439039e9..0a721bb9c4 100644 --- a/accel/tcg/cputlb.c +++ b/accel/tcg/cputlb.c @@ -632,7 +632,8 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr, } sz = size; - section = address_space_translate_for_iotlb(cpu, asidx, paddr, &xlat, &sz); + section = address_space_translate_for_iotlb(cpu, asidx, paddr, &xlat, &sz, + attrs, &prot); assert(sz >= TARGET_PAGE_SIZE); tlb_debug("vaddr=" TARGET_FMT_lx " paddr=0x" TARGET_FMT_plx @@ -664,6 +665,18 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr, env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index]; /* refill the tlb */ + /* + * At this point iotlb contains a physical section number in the lower + * TARGET_PAGE_BITS, and either + * + the ram_addr_t of the page base of the target RAM (if NOTDIRTY or ROM) + * + the offset within section->mr of the page base (otherwise) + * We subtract the vaddr (which is page aligned and thus won't + * disturb the low bits) to give an offset which can be added to the + * (non-page-aligned) vaddr of the eventual memory access to get + * the MemoryRegion offset for the access. Note that the vaddr we + * subtract here is that of the page base, and not the same as the + * vaddr we add back in io_readx()/io_writex()/get_page_addr_code(). 
+ */ env->iotlb[mmu_idx][index].addr = iotlb - vaddr; env->iotlb[mmu_idx][index].attrs = attrs; @@ -765,13 +778,16 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry, target_ulong addr, uintptr_t retaddr, int size) { CPUState *cpu = ENV_GET_CPU(env); - hwaddr physaddr = iotlbentry->addr; - MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs); + hwaddr mr_offset; + MemoryRegionSection *section; + MemoryRegion *mr; uint64_t val; bool locked = false; MemTxResult r; - physaddr = (physaddr & TARGET_PAGE_MASK) + addr; + section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs); + mr = section->mr; + mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr; cpu->mem_io_pc = retaddr; if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) { cpu_io_recompile(cpu, retaddr); @@ -783,9 +799,13 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry, qemu_mutex_lock_iothread(); locked = true; } - r = memory_region_dispatch_read(mr, physaddr, + r = memory_region_dispatch_read(mr, mr_offset, &val, size, iotlbentry->attrs); if (r != MEMTX_OK) { + hwaddr physaddr = mr_offset + + section->offset_within_address_space - + section->offset_within_region; + cpu_transaction_failed(cpu, physaddr, addr, size, MMU_DATA_LOAD, mmu_idx, iotlbentry->attrs, r, retaddr); } @@ -802,12 +822,15 @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry, uintptr_t retaddr, int size) { CPUState *cpu = ENV_GET_CPU(env); - hwaddr physaddr = iotlbentry->addr; - MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs); + hwaddr mr_offset; + MemoryRegionSection *section; + MemoryRegion *mr; bool locked = false; MemTxResult r; - physaddr = (physaddr & TARGET_PAGE_MASK) + addr; + section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs); + mr = section->mr; + mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr; if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) { cpu_io_recompile(cpu, retaddr); } @@ -818,9 +841,13 @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry, qemu_mutex_lock_iothread(); locked = true; } - r = memory_region_dispatch_write(mr, physaddr, + r = memory_region_dispatch_write(mr, mr_offset, val, size, iotlbentry->attrs); if (r != MEMTX_OK) { + hwaddr physaddr = mr_offset + + section->offset_within_address_space - + section->offset_within_region; + cpu_transaction_failed(cpu, physaddr, addr, size, MMU_DATA_STORE, mmu_idx, iotlbentry->attrs, r, retaddr); } @@ -868,12 +895,13 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index, */ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr) { - int mmu_idx, index, pd; + int mmu_idx, index; void *p; MemoryRegion *mr; + MemoryRegionSection *section; CPUState *cpu = ENV_GET_CPU(env); CPUIOTLBEntry *iotlbentry; - hwaddr physaddr; + hwaddr physaddr, mr_offset; index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); mmu_idx = cpu_mmu_index(env, true); @@ -884,8 +912,8 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr) } } iotlbentry = &env->iotlb[mmu_idx][index]; - pd = iotlbentry->addr & ~TARGET_PAGE_MASK; - mr = iotlb_to_region(cpu, pd, iotlbentry->attrs); + section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs); + mr = section->mr; if (memory_region_is_unassigned(mr)) { qemu_mutex_lock_iothread(); if (memory_region_request_mmio_ptr(mr, addr)) { @@ -906,7 +934,10 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr) * and use the 
MemTXResult it produced). However it is the * simplest place we have currently available for the check. */ - physaddr = (iotlbentry->addr & TARGET_PAGE_MASK) + addr; + mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr; + physaddr = mr_offset + + section->offset_within_address_space - + section->offset_within_region; cpu_transaction_failed(cpu, physaddr, addr, 0, MMU_INST_FETCH, mmu_idx, iotlbentry->attrs, MEMTX_DECODE_ERROR, 0); diff --git a/docs/devel/loads-stores.rst b/docs/devel/loads-stores.rst index 6a990cc243..57d8c524bf 100644 --- a/docs/devel/loads-stores.rst +++ b/docs/devel/loads-stores.rst @@ -53,9 +53,24 @@ The ``_{endian}`` infix is omitted for target-endian accesses. The target endian accessors are only available to source files which are built per-target. +There are also functions which take the size as an argument: + +load: ``ldn{endian}_p(ptr, sz)`` + +which performs an unsigned load of ``sz`` bytes from ``ptr`` +as an ``{endian}`` order value and returns it in a uint64_t. + +store: ``stn{endian}_p(ptr, sz, val)`` + +which stores ``val`` to ``ptr`` as an ``{endian}`` order value +of size ``sz`` bytes. + + Regexes for git grep - ``\<ldf\?[us]\?[bwlq]\(_[hbl]e\)\?_p\>`` - ``\<stf\?[bwlq]\(_[hbl]e\)\?_p\>`` + - ``\<ldn_\([hbl]e\)?_p\>`` + - ``\<stn_\([hbl]e\)?_p\>`` ``cpu_{ld,st}_*`` ~~~~~~~~~~~~~~~~~ @@ -501,8 +501,15 @@ static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iomm do { hwaddr addr = *xlat; IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr); - IOMMUTLBEntry iotlb = imrc->translate(iommu_mr, addr, is_write ? - IOMMU_WO : IOMMU_RO); + int iommu_idx = 0; + IOMMUTLBEntry iotlb; + + if (imrc->attrs_to_index) { + iommu_idx = imrc->attrs_to_index(iommu_mr, attrs); + } + + iotlb = imrc->translate(iommu_mr, addr, is_write ? + IOMMU_WO : IOMMU_RO, iommu_idx); if (!(iotlb.perm & (1 << is_write))) { goto unassigned; @@ -646,18 +653,144 @@ MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat, return mr; } +typedef struct TCGIOMMUNotifier { + IOMMUNotifier n; + MemoryRegion *mr; + CPUState *cpu; + int iommu_idx; + bool active; +} TCGIOMMUNotifier; + +static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) +{ + TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n); + + if (!notifier->active) { + return; + } + tlb_flush(notifier->cpu); + notifier->active = false; + /* We leave the notifier struct on the list to avoid reallocating it later. + * Generally the number of IOMMUs a CPU deals with will be small. + * In any case we can't unregister the iommu notifier from a notify + * callback. + */ +} + +static void tcg_register_iommu_notifier(CPUState *cpu, + IOMMUMemoryRegion *iommu_mr, + int iommu_idx) +{ + /* Make sure this CPU has an IOMMU notifier registered for this + * IOMMU/IOMMU index combination, so that we can flush its TLB + * when the IOMMU tells us the mappings we've cached have changed. 
+ */ + MemoryRegion *mr = MEMORY_REGION(iommu_mr); + TCGIOMMUNotifier *notifier; + int i; + + for (i = 0; i < cpu->iommu_notifiers->len; i++) { + notifier = &g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier, i); + if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) { + break; + } + } + if (i == cpu->iommu_notifiers->len) { + /* Not found, add a new entry at the end of the array */ + cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1); + notifier = &g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier, i); + + notifier->mr = mr; + notifier->iommu_idx = iommu_idx; + notifier->cpu = cpu; + /* Rather than trying to register interest in the specific part + * of the iommu's address space that we've accessed and then + * expand it later as subsequent accesses touch more of it, we + * just register interest in the whole thing, on the assumption + * that iommu reconfiguration will be rare. + */ + iommu_notifier_init(¬ifier->n, + tcg_iommu_unmap_notify, + IOMMU_NOTIFIER_UNMAP, + 0, + HWADDR_MAX, + iommu_idx); + memory_region_register_iommu_notifier(notifier->mr, ¬ifier->n); + } + + if (!notifier->active) { + notifier->active = true; + } +} + +static void tcg_iommu_free_notifier_list(CPUState *cpu) +{ + /* Destroy the CPU's notifier list */ + int i; + TCGIOMMUNotifier *notifier; + + for (i = 0; i < cpu->iommu_notifiers->len; i++) { + notifier = &g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier, i); + memory_region_unregister_iommu_notifier(notifier->mr, ¬ifier->n); + } + g_array_free(cpu->iommu_notifiers, true); +} + /* Called from RCU critical section */ MemoryRegionSection * address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr, - hwaddr *xlat, hwaddr *plen) + hwaddr *xlat, hwaddr *plen, + MemTxAttrs attrs, int *prot) { MemoryRegionSection *section; + IOMMUMemoryRegion *iommu_mr; + IOMMUMemoryRegionClass *imrc; + IOMMUTLBEntry iotlb; + int iommu_idx; AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch); - section = address_space_translate_internal(d, addr, xlat, plen, false); + for (;;) { + section = address_space_translate_internal(d, addr, &addr, plen, false); + + iommu_mr = memory_region_get_iommu(section->mr); + if (!iommu_mr) { + break; + } + + imrc = memory_region_get_iommu_class_nocheck(iommu_mr); + + iommu_idx = imrc->attrs_to_index(iommu_mr, attrs); + tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx); + /* We need all the permissions, so pass IOMMU_NONE so the IOMMU + * doesn't short-cut its translation table walk. + */ + iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx); + addr = ((iotlb.translated_addr & ~iotlb.addr_mask) + | (addr & iotlb.addr_mask)); + /* Update the caller's prot bits to remove permissions the IOMMU + * is giving us a failure response for. If we get down to no + * permissions left at all we can give up now. 
+ */ + if (!(iotlb.perm & IOMMU_RO)) { + *prot &= ~(PAGE_READ | PAGE_EXEC); + } + if (!(iotlb.perm & IOMMU_WO)) { + *prot &= ~PAGE_WRITE; + } + + if (!*prot) { + goto translate_fail; + } + + d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as)); + } assert(!memory_region_is_iommu(section->mr)); + *xlat = addr; return section; + +translate_fail: + return &d->map.sections[PHYS_SECTION_UNASSIGNED]; } #endif @@ -816,6 +949,9 @@ void cpu_exec_unrealizefn(CPUState *cpu) if (qdev_get_vmsd(DEVICE(cpu)) == NULL) { vmstate_unregister(NULL, &vmstate_cpu_common, cpu); } +#ifndef CONFIG_USER_ONLY + tcg_iommu_free_notifier_list(cpu); +#endif } Property cpu_common_props[] = { @@ -863,6 +999,8 @@ void cpu_exec_realizefn(CPUState *cpu, Error **errp) if (cc->vmsd != NULL) { vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu); } + + cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier)); #endif } @@ -2544,22 +2682,7 @@ static void notdirty_mem_write(void *opaque, hwaddr ram_addr, memory_notdirty_write_prepare(&ndi, current_cpu, current_cpu->mem_io_vaddr, ram_addr, size); - switch (size) { - case 1: - stb_p(qemu_map_ram_ptr(NULL, ram_addr), val); - break; - case 2: - stw_p(qemu_map_ram_ptr(NULL, ram_addr), val); - break; - case 4: - stl_p(qemu_map_ram_ptr(NULL, ram_addr), val); - break; - case 8: - stq_p(qemu_map_ram_ptr(NULL, ram_addr), val); - break; - default: - abort(); - } + stn_p(qemu_map_ram_ptr(NULL, ram_addr), size, val); memory_notdirty_write_complete(&ndi); } @@ -2739,22 +2862,8 @@ static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data, if (res) { return res; } - switch (len) { - case 1: - *data = ldub_p(buf); - return MEMTX_OK; - case 2: - *data = lduw_p(buf); - return MEMTX_OK; - case 4: - *data = ldl_p(buf); - return MEMTX_OK; - case 8: - *data = ldq_p(buf); - return MEMTX_OK; - default: - abort(); - } + *data = ldn_p(buf, len); + return MEMTX_OK; } static MemTxResult subpage_write(void *opaque, hwaddr addr, @@ -2768,22 +2877,7 @@ static MemTxResult subpage_write(void *opaque, hwaddr addr, " value %"PRIx64"\n", __func__, subpage, len, addr, value); #endif - switch (len) { - case 1: - stb_p(buf, value); - break; - case 2: - stw_p(buf, value); - break; - case 4: - stl_p(buf, value); - break; - case 8: - stq_p(buf, value); - break; - default: - abort(); - } + stn_p(buf, len, value); return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len); } @@ -2897,14 +2991,15 @@ static const MemoryRegionOps readonly_mem_ops = { }, }; -MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index, MemTxAttrs attrs) +MemoryRegionSection *iotlb_to_section(CPUState *cpu, + hwaddr index, MemTxAttrs attrs) { int asidx = cpu_asidx_from_attrs(cpu, attrs); CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx]; AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch); MemoryRegionSection *sections = d->map.sections; - return sections[index & ~TARGET_PAGE_MASK].mr; + return §ions[index & ~TARGET_PAGE_MASK]; } static void io_mem_init(void) @@ -3128,34 +3223,8 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, l = memory_access_size(mr, l, addr1); /* XXX: could force current_cpu to NULL to avoid potential bugs */ - switch (l) { - case 8: - /* 64 bit write access */ - val = ldq_p(buf); - result |= memory_region_dispatch_write(mr, addr1, val, 8, - attrs); - break; - case 4: - /* 32 bit write access */ - val = (uint32_t)ldl_p(buf); - result |= memory_region_dispatch_write(mr, addr1, val, 4, - attrs); - break; - case 2: - /* 16 bit write 
access */ - val = lduw_p(buf); - result |= memory_region_dispatch_write(mr, addr1, val, 2, - attrs); - break; - case 1: - /* 8 bit write access */ - val = ldub_p(buf); - result |= memory_region_dispatch_write(mr, addr1, val, 1, - attrs); - break; - default: - abort(); - } + val = ldn_p(buf, l); + result |= memory_region_dispatch_write(mr, addr1, val, l, attrs); } else { /* RAM case */ ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false); @@ -3216,34 +3285,8 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, /* I/O case */ release_lock |= prepare_mmio_access(mr); l = memory_access_size(mr, l, addr1); - switch (l) { - case 8: - /* 64 bit read access */ - result |= memory_region_dispatch_read(mr, addr1, &val, 8, - attrs); - stq_p(buf, val); - break; - case 4: - /* 32 bit read access */ - result |= memory_region_dispatch_read(mr, addr1, &val, 4, - attrs); - stl_p(buf, val); - break; - case 2: - /* 16 bit read access */ - result |= memory_region_dispatch_read(mr, addr1, &val, 2, - attrs); - stw_p(buf, val); - break; - case 1: - /* 8 bit read access */ - result |= memory_region_dispatch_read(mr, addr1, &val, 1, - attrs); - stb_p(buf, val); - break; - default: - abort(); - } + result |= memory_region_dispatch_read(mr, addr1, &val, l, attrs); + stn_p(buf, l, val); } else { /* RAM case */ ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false); diff --git a/hw/alpha/typhoon.c b/hw/alpha/typhoon.c index 6a40869488..d3ed7cdbe8 100644 --- a/hw/alpha/typhoon.c +++ b/hw/alpha/typhoon.c @@ -666,7 +666,8 @@ static bool window_translate(TyphoonWindow *win, hwaddr addr, Pchip and generate a machine check interrupt. */ static IOMMUTLBEntry typhoon_translate_iommu(IOMMUMemoryRegion *iommu, hwaddr addr, - IOMMUAccessFlags flag) + IOMMUAccessFlags flag, + int iommu_idx) { TyphoonPchip *pchip = container_of(iommu, TyphoonPchip, iommu); IOMMUTLBEntry ret; diff --git a/hw/arm/armv7m.c b/hw/arm/armv7m.c index f123cc7d3d..9e00d4037c 100644 --- a/hw/arm/armv7m.c +++ b/hw/arm/armv7m.c @@ -178,6 +178,12 @@ static void armv7m_realize(DeviceState *dev, Error **errp) return; } } + + /* Tell the CPU where the NVIC is; it will fail realize if it doesn't + * have one. + */ + s->cpu->env.nvic = &s->nvic; + object_property_set_bool(OBJECT(s->cpu), true, "realized", &err); if (err != NULL) { error_propagate(errp, err); @@ -202,7 +208,6 @@ static void armv7m_realize(DeviceState *dev, Error **errp) sbd = SYS_BUS_DEVICE(&s->nvic); sysbus_connect_irq(sbd, 0, qdev_get_gpio_in(DEVICE(s->cpu), ARM_CPU_IRQ)); - s->cpu->env.nvic = &s->nvic; memory_region_add_subregion(&s->container, 0xe000e000, sysbus_mmio_get_region(sbd, 0)); @@ -261,27 +266,6 @@ static void armv7m_reset(void *opaque) cpu_reset(CPU(cpu)); } -/* Init CPU and memory for a v7-M based board. - mem_size is in bytes. - Returns the ARMv7M device. 
*/ - -DeviceState *armv7m_init(MemoryRegion *system_memory, int mem_size, int num_irq, - const char *kernel_filename, const char *cpu_type) -{ - DeviceState *armv7m; - - armv7m = qdev_create(NULL, TYPE_ARMV7M); - qdev_prop_set_uint32(armv7m, "num-irq", num_irq); - qdev_prop_set_string(armv7m, "cpu-type", cpu_type); - object_property_set_link(OBJECT(armv7m), OBJECT(get_system_memory()), - "memory", &error_abort); - /* This will exit with an error if the user passed us a bad cpu_type */ - qdev_init_nofail(armv7m); - - armv7m_load_kernel(ARM_CPU(first_cpu), kernel_filename, mem_size); - return armv7m; -} - void armv7m_load_kernel(ARMCPU *cpu, const char *kernel_filename, int mem_size) { int image_size; diff --git a/hw/arm/mps2-tz.c b/hw/arm/mps2-tz.c index 8dc8bfd4ab..c5ef95e4cc 100644 --- a/hw/arm/mps2-tz.c +++ b/hw/arm/mps2-tz.c @@ -74,12 +74,13 @@ typedef struct { UnimplementedDeviceState spi[5]; UnimplementedDeviceState i2c[4]; UnimplementedDeviceState i2s_audio; - UnimplementedDeviceState gpio[5]; + UnimplementedDeviceState gpio[4]; UnimplementedDeviceState dma[4]; UnimplementedDeviceState gfx; CMSDKAPBUART uart[5]; SplitIRQ sec_resp_splitter; qemu_or_irq uart_irq_orgate; + DeviceState *lan9118; } MPS2TZMachineState; #define TYPE_MPS2TZ_MACHINE "mps2tz" @@ -224,6 +225,26 @@ static MemoryRegion *make_fpgaio(MPS2TZMachineState *mms, void *opaque, return sysbus_mmio_get_region(SYS_BUS_DEVICE(fpgaio), 0); } +static MemoryRegion *make_eth_dev(MPS2TZMachineState *mms, void *opaque, + const char *name, hwaddr size) +{ + SysBusDevice *s; + DeviceState *iotkitdev = DEVICE(&mms->iotkit); + NICInfo *nd = &nd_table[0]; + + /* In hardware this is a LAN9220; the LAN9118 is software compatible + * except that it doesn't support the checksum-offload feature. + */ + qemu_check_nic_model(nd, "lan9118"); + mms->lan9118 = qdev_create(NULL, "lan9118"); + qdev_set_nic_properties(mms->lan9118, nd); + qdev_init_nofail(mms->lan9118); + + s = SYS_BUS_DEVICE(mms->lan9118); + sysbus_connect_irq(s, 0, qdev_get_gpio_in_named(iotkitdev, "EXP_IRQ", 16)); + return sysbus_mmio_get_region(s, 0); +} + static void mps2tz_common_init(MachineState *machine) { MPS2TZMachineState *mms = MPS2TZ_MACHINE(machine); @@ -363,7 +384,7 @@ static void mps2tz_common_init(MachineState *machine) { "gpio1", make_unimp_dev, &mms->gpio[1], 0x40101000, 0x1000 }, { "gpio2", make_unimp_dev, &mms->gpio[2], 0x40102000, 0x1000 }, { "gpio3", make_unimp_dev, &mms->gpio[3], 0x40103000, 0x1000 }, - { "gpio4", make_unimp_dev, &mms->gpio[4], 0x40104000, 0x1000 }, + { "eth", make_eth_dev, NULL, 0x42000000, 0x100000 }, }, }, { .name = "ahb_ppcexp1", @@ -447,13 +468,6 @@ static void mps2tz_common_init(MachineState *machine) "cfg_sec_resp", 0)); } - /* In hardware this is a LAN9220; the LAN9118 is software compatible - * except that it doesn't support the checksum-offload feature. - * The ethernet controller is not behind a PPC. 
- */ - lan9118_init(&nd_table[0], 0x42000000, - qdev_get_gpio_in_named(iotkitdev, "EXP_IRQ", 16)); - create_unimplemented_device("FPGA NS PC", 0x48007000, 0x1000); armv7m_load_kernel(ARM_CPU(first_cpu), machine->kernel_filename, 0x400000); diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 42dc521c13..978330900d 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -538,7 +538,7 @@ static int smmuv3_decode_config(IOMMUMemoryRegion *mr, SMMUTransCfg *cfg, } static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, - IOMMUAccessFlags flag) + IOMMUAccessFlags flag, int iommu_idx) { SMMUDevice *sdev = container_of(mr, SMMUDevice, iommu); SMMUv3State *s = sdev->smmu; diff --git a/hw/arm/stellaris.c b/hw/arm/stellaris.c index 502a20842c..a8f1f6a912 100644 --- a/hw/arm/stellaris.c +++ b/hw/arm/stellaris.c @@ -20,6 +20,7 @@ #include "qemu/log.h" #include "exec/address-spaces.h" #include "sysemu/sysemu.h" +#include "hw/arm/armv7m.h" #include "hw/char/pl011.h" #include "hw/misc/unimp.h" #include "cpu.h" @@ -1298,8 +1299,13 @@ static void stellaris_init(MachineState *ms, stellaris_board_info *board) &error_fatal); memory_region_add_subregion(system_memory, 0x20000000, sram); - nvic = armv7m_init(system_memory, flash_size, NUM_IRQ_LINES, - ms->kernel_filename, ms->cpu_type); + nvic = qdev_create(NULL, TYPE_ARMV7M); + qdev_prop_set_uint32(nvic, "num-irq", NUM_IRQ_LINES); + qdev_prop_set_string(nvic, "cpu-type", ms->cpu_type); + object_property_set_link(OBJECT(nvic), OBJECT(get_system_memory()), + "memory", &error_abort); + /* This will exit with an error if the user passed us a bad cpu_type */ + qdev_init_nofail(nvic); qdev_connect_gpio_out_named(nvic, "SYSRESETREQ", 0, qemu_allocate_irq(&do_sys_reset, NULL, 0)); @@ -1431,6 +1437,8 @@ static void stellaris_init(MachineState *ms, stellaris_board_info *board) create_unimplemented_device("analogue-comparator", 0x4003c000, 0x1000); create_unimplemented_device("hibernation", 0x400fc000, 0x1000); create_unimplemented_device("flash-control", 0x400fd000, 0x1000); + + armv7m_load_kernel(ARM_CPU(first_cpu), ms->kernel_filename, flash_size); } /* FIXME: Figure out how to generate these from stellaris_boards. 
*/ diff --git a/hw/block/m25p80.c b/hw/block/m25p80.c index a5ccffb4aa..b0ed8fa418 100644 --- a/hw/block/m25p80.c +++ b/hw/block/m25p80.c @@ -698,6 +698,7 @@ static void complete_collecting_data(Flash *s) case MAN_MACRONIX: s->quad_enable = extract32(s->data[0], 6, 1); if (s->len > 1) { + s->volatile_cfg = s->data[1]; s->four_bytes_address_mode = extract32(s->data[1], 5, 1); } break; diff --git a/hw/block/pflash_cfi02.c b/hw/block/pflash_cfi02.c index a8b3f7f978..6c18e5e578 100644 --- a/hw/block/pflash_cfi02.c +++ b/hw/block/pflash_cfi02.c @@ -493,102 +493,41 @@ static void pflash_write (pflash_t *pfl, hwaddr offset, pfl->cmd = 0; } - -static uint32_t pflash_readb_be(void *opaque, hwaddr addr) -{ - return pflash_read(opaque, addr, 1, 1); -} - -static uint32_t pflash_readb_le(void *opaque, hwaddr addr) +static uint64_t pflash_be_readfn(void *opaque, hwaddr addr, unsigned size) { - return pflash_read(opaque, addr, 1, 0); + return pflash_read(opaque, addr, size, 1); } -static uint32_t pflash_readw_be(void *opaque, hwaddr addr) +static void pflash_be_writefn(void *opaque, hwaddr addr, + uint64_t value, unsigned size) { - pflash_t *pfl = opaque; - - return pflash_read(pfl, addr, 2, 1); -} - -static uint32_t pflash_readw_le(void *opaque, hwaddr addr) -{ - pflash_t *pfl = opaque; - - return pflash_read(pfl, addr, 2, 0); -} - -static uint32_t pflash_readl_be(void *opaque, hwaddr addr) -{ - pflash_t *pfl = opaque; - - return pflash_read(pfl, addr, 4, 1); -} - -static uint32_t pflash_readl_le(void *opaque, hwaddr addr) -{ - pflash_t *pfl = opaque; - - return pflash_read(pfl, addr, 4, 0); + pflash_write(opaque, addr, value, size, 1); } -static void pflash_writeb_be(void *opaque, hwaddr addr, - uint32_t value) +static uint64_t pflash_le_readfn(void *opaque, hwaddr addr, unsigned size) { - pflash_write(opaque, addr, value, 1, 1); + return pflash_read(opaque, addr, size, 0); } -static void pflash_writeb_le(void *opaque, hwaddr addr, - uint32_t value) +static void pflash_le_writefn(void *opaque, hwaddr addr, + uint64_t value, unsigned size) { - pflash_write(opaque, addr, value, 1, 0); -} - -static void pflash_writew_be(void *opaque, hwaddr addr, - uint32_t value) -{ - pflash_t *pfl = opaque; - - pflash_write(pfl, addr, value, 2, 1); -} - -static void pflash_writew_le(void *opaque, hwaddr addr, - uint32_t value) -{ - pflash_t *pfl = opaque; - - pflash_write(pfl, addr, value, 2, 0); -} - -static void pflash_writel_be(void *opaque, hwaddr addr, - uint32_t value) -{ - pflash_t *pfl = opaque; - - pflash_write(pfl, addr, value, 4, 1); -} - -static void pflash_writel_le(void *opaque, hwaddr addr, - uint32_t value) -{ - pflash_t *pfl = opaque; - - pflash_write(pfl, addr, value, 4, 0); + pflash_write(opaque, addr, value, size, 0); } static const MemoryRegionOps pflash_cfi02_ops_be = { - .old_mmio = { - .read = { pflash_readb_be, pflash_readw_be, pflash_readl_be, }, - .write = { pflash_writeb_be, pflash_writew_be, pflash_writel_be, }, - }, + .read = pflash_be_readfn, + .write = pflash_be_writefn, + .valid.min_access_size = 1, + .valid.max_access_size = 4, .endianness = DEVICE_NATIVE_ENDIAN, }; static const MemoryRegionOps pflash_cfi02_ops_le = { - .old_mmio = { - .read = { pflash_readb_le, pflash_readw_le, pflash_readl_le, }, - .write = { pflash_writeb_le, pflash_writew_le, pflash_writel_le, }, - }, + .read = pflash_le_readfn, + .write = pflash_le_writefn, + .valid.min_access_size = 1, + .valid.max_access_size = 4, .endianness = DEVICE_NATIVE_ENDIAN, }; diff --git a/hw/char/parallel.c b/hw/char/parallel.c index 
1542d62201..35748e6c1b 100644 --- a/hw/char/parallel.c +++ b/hw/char/parallel.c @@ -554,56 +554,28 @@ static void parallel_isa_realizefn(DeviceState *dev, Error **errp) } /* Memory mapped interface */ -static uint32_t parallel_mm_readb (void *opaque, hwaddr addr) +static uint64_t parallel_mm_readfn(void *opaque, hwaddr addr, unsigned size) { ParallelState *s = opaque; - return parallel_ioport_read_sw(s, addr >> s->it_shift) & 0xFF; + return parallel_ioport_read_sw(s, addr >> s->it_shift) & + MAKE_64BIT_MASK(0, size * 8); } -static void parallel_mm_writeb (void *opaque, - hwaddr addr, uint32_t value) +static void parallel_mm_writefn(void *opaque, hwaddr addr, + uint64_t value, unsigned size) { ParallelState *s = opaque; - parallel_ioport_write_sw(s, addr >> s->it_shift, value & 0xFF); -} - -static uint32_t parallel_mm_readw (void *opaque, hwaddr addr) -{ - ParallelState *s = opaque; - - return parallel_ioport_read_sw(s, addr >> s->it_shift) & 0xFFFF; -} - -static void parallel_mm_writew (void *opaque, - hwaddr addr, uint32_t value) -{ - ParallelState *s = opaque; - - parallel_ioport_write_sw(s, addr >> s->it_shift, value & 0xFFFF); -} - -static uint32_t parallel_mm_readl (void *opaque, hwaddr addr) -{ - ParallelState *s = opaque; - - return parallel_ioport_read_sw(s, addr >> s->it_shift); -} - -static void parallel_mm_writel (void *opaque, - hwaddr addr, uint32_t value) -{ - ParallelState *s = opaque; - - parallel_ioport_write_sw(s, addr >> s->it_shift, value); + parallel_ioport_write_sw(s, addr >> s->it_shift, + value & MAKE_64BIT_MASK(0, size * 8)); } static const MemoryRegionOps parallel_mm_ops = { - .old_mmio = { - .read = { parallel_mm_readb, parallel_mm_readw, parallel_mm_readl }, - .write = { parallel_mm_writeb, parallel_mm_writew, parallel_mm_writel }, - }, + .read = parallel_mm_readfn, + .write = parallel_mm_writefn, + .valid.min_access_size = 1, + .valid.max_access_size = 4, .endianness = DEVICE_NATIVE_ENDIAN, }; diff --git a/hw/core/or-irq.c b/hw/core/or-irq.c index f9d76c4641..a86901b673 100644 --- a/hw/core/or-irq.c +++ b/hw/core/or-irq.c @@ -66,14 +66,49 @@ static void or_irq_init(Object *obj) qdev_init_gpio_out(DEVICE(obj), &s->out_irq, 1); } +/* The original version of this device had a fixed 16 entries in its + * VMState array; devices with more inputs than this need to + * migrate the extra lines via a subsection. + * The subsection migrates as much of the levels[] array as is needed + * (including repeating the first 16 elements), to avoid the awkwardness + * of splitting it in two to meet the requirements of VMSTATE_VARRAY_UINT16. 
+ */ +#define OLD_MAX_OR_LINES 16 +#if MAX_OR_LINES < OLD_MAX_OR_LINES +#error MAX_OR_LINES must be at least 16 for migration compatibility +#endif + +static bool vmstate_extras_needed(void *opaque) +{ + qemu_or_irq *s = OR_IRQ(opaque); + + return s->num_lines >= OLD_MAX_OR_LINES; +} + +static const VMStateDescription vmstate_or_irq_extras = { + .name = "or-irq-extras", + .version_id = 1, + .minimum_version_id = 1, + .needed = vmstate_extras_needed, + .fields = (VMStateField[]) { + VMSTATE_VARRAY_UINT16_UNSAFE(levels, qemu_or_irq, num_lines, 0, + vmstate_info_bool, bool), + VMSTATE_END_OF_LIST(), + }, +}; + static const VMStateDescription vmstate_or_irq = { .name = TYPE_OR_IRQ, .version_id = 1, .minimum_version_id = 1, .fields = (VMStateField[]) { - VMSTATE_BOOL_ARRAY(levels, qemu_or_irq, MAX_OR_LINES), + VMSTATE_BOOL_SUB_ARRAY(levels, qemu_or_irq, 0, OLD_MAX_OR_LINES), VMSTATE_END_OF_LIST(), - } + }, + .subsections = (const VMStateDescription*[]) { + &vmstate_or_irq_extras, + NULL + }, }; static Property or_irq_properties[] = { diff --git a/hw/dma/rc4030.c b/hw/dma/rc4030.c index 5d4833eeca..ccd8612888 100644 --- a/hw/dma/rc4030.c +++ b/hw/dma/rc4030.c @@ -491,7 +491,7 @@ static const MemoryRegionOps jazzio_ops = { }; static IOMMUTLBEntry rc4030_dma_translate(IOMMUMemoryRegion *iommu, hwaddr addr, - IOMMUAccessFlags flag) + IOMMUAccessFlags flag, int iommu_idx) { rc4030State *s = container_of(iommu, rc4030State, dma_mr); IOMMUTLBEntry ret = { diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 63d46ff6ee..1fd669fef8 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -991,7 +991,7 @@ static inline bool amdvi_is_interrupt_addr(hwaddr addr) } static IOMMUTLBEntry amdvi_translate(IOMMUMemoryRegion *iommu, hwaddr addr, - IOMMUAccessFlags flag) + IOMMUAccessFlags flag, int iommu_idx) { AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); AMDVIState *s = as->iommu_state; diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index b5a09b7908..0a8cd4e9cc 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -1023,7 +1023,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, void *private) { - memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); + memory_region_notify_iommu((IOMMUMemoryRegion *)private, 0, *entry); return 0; } @@ -1581,7 +1581,7 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, .addr_mask = size - 1, .perm = IOMMU_NONE, }; - memory_region_notify_iommu(&vtd_as->iommu, entry); + memory_region_notify_iommu(&vtd_as->iommu, 0, entry); } } } @@ -2015,7 +2015,7 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, entry.iova = addr; entry.perm = IOMMU_NONE; entry.translated_addr = 0; - memory_region_notify_iommu(&vtd_dev_as->iommu, entry); + memory_region_notify_iommu(&vtd_dev_as->iommu, 0, entry); done: return true; @@ -2471,7 +2471,7 @@ static void vtd_mem_write(void *opaque, hwaddr addr, } static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr, - IOMMUAccessFlags flag) + IOMMUAccessFlags flag, int iommu_idx) { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); IntelIOMMUState *s = vtd_as->iommu_state; diff --git a/hw/input/pckbd.c b/hw/input/pckbd.c index f17f18e51b..f33e3fc63d 100644 --- a/hw/input/pckbd.c +++ b/hw/input/pckbd.c @@ -434,7 +434,7 @@ static const VMStateDescription vmstate_kbd = { }; /* Memory mapped interface */ -static uint32_t kbd_mm_readb (void 
*opaque, hwaddr addr) +static uint64_t kbd_mm_readfn(void *opaque, hwaddr addr, unsigned size) { KBDState *s = opaque; @@ -444,7 +444,8 @@ static uint32_t kbd_mm_readb (void *opaque, hwaddr addr) return kbd_read_data(s, 0, 1) & 0xff; } -static void kbd_mm_writeb (void *opaque, hwaddr addr, uint32_t value) +static void kbd_mm_writefn(void *opaque, hwaddr addr, + uint64_t value, unsigned size) { KBDState *s = opaque; @@ -454,12 +455,13 @@ static void kbd_mm_writeb (void *opaque, hwaddr addr, uint32_t value) kbd_write_data(s, 0, value & 0xff, 1); } + static const MemoryRegionOps i8042_mmio_ops = { + .read = kbd_mm_readfn, + .write = kbd_mm_writefn, + .valid.min_access_size = 1, + .valid.max_access_size = 4, .endianness = DEVICE_NATIVE_ENDIAN, - .old_mmio = { - .read = { kbd_mm_readb, kbd_mm_readb, kbd_mm_readb }, - .write = { kbd_mm_writeb, kbd_mm_writeb, kbd_mm_writeb }, - }, }; void i8042_mm_init(qemu_irq kbd_irq, qemu_irq mouse_irq, diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c index 5649cac46e..d8d3b25403 100644 --- a/hw/intc/arm_gicv3_kvm.c +++ b/hw/intc/arm_gicv3_kvm.c @@ -135,7 +135,14 @@ static void kvm_dist_get_priority(GICv3State *s, uint32_t offset, uint8_t *bmp) uint32_t reg, *field; int irq; - field = (uint32_t *)bmp; + /* For the KVM GICv3, affinity routing is always enabled, and the first 8 + * GICD_IPRIORITYR<n> registers are always RAZ/WI. The corresponding + * functionality is replaced by GICR_IPRIORITYR<n>. It doesn't need to + * sync them. So it needs to skip the field of GIC_INTERNAL irqs in bmp and + * offset. + */ + field = (uint32_t *)(bmp + GIC_INTERNAL); + offset += (GIC_INTERNAL * 8) / 8; for_each_dist_irq_reg(irq, s->num_irq, 8) { kvm_gicd_access(s, offset, ®, false); *field = reg; @@ -149,7 +156,14 @@ static void kvm_dist_put_priority(GICv3State *s, uint32_t offset, uint8_t *bmp) uint32_t reg, *field; int irq; - field = (uint32_t *)bmp; + /* For the KVM GICv3, affinity routing is always enabled, and the first 8 + * GICD_IPRIORITYR<n> registers are always RAZ/WI. The corresponding + * functionality is replaced by GICR_IPRIORITYR<n>. It doesn't need to + * sync them. So it needs to skip the field of GIC_INTERNAL irqs in bmp and + * offset. 
+ */ + field = (uint32_t *)(bmp + GIC_INTERNAL); + offset += (GIC_INTERNAL * 8) / 8; for_each_dist_irq_reg(irq, s->num_irq, 8) { reg = *field; kvm_gicd_access(s, offset, ®, true); diff --git a/hw/intc/armv7m_nvic.c b/hw/intc/armv7m_nvic.c index c51151fa8a..661be8878a 100644 --- a/hw/intc/armv7m_nvic.c +++ b/hw/intc/armv7m_nvic.c @@ -2183,7 +2183,11 @@ static void armv7m_nvic_realize(DeviceState *dev, Error **errp) int regionlen; s->cpu = ARM_CPU(qemu_get_cpu(0)); - assert(s->cpu); + + if (!s->cpu || !arm_feature(&s->cpu->env, ARM_FEATURE_M)) { + error_setg(errp, "The NVIC can only be used with a Cortex-M CPU"); + return; + } if (s->num_irq > NVIC_MAX_IRQ) { error_setg(errp, "num-irq %d exceeds NVIC maximum", s->num_irq); diff --git a/hw/m68k/mcf5206.c b/hw/m68k/mcf5206.c index 7abd84ac47..d7f26d6810 100644 --- a/hw/m68k/mcf5206.c +++ b/hw/m68k/mcf5206.c @@ -512,19 +512,43 @@ static void m5206_mbar_writel(void *opaque, hwaddr offset, m5206_mbar_write(s, offset, value, 4); } +static uint64_t m5206_mbar_readfn(void *opaque, hwaddr addr, unsigned size) +{ + switch (size) { + case 1: + return m5206_mbar_readb(opaque, addr); + case 2: + return m5206_mbar_readw(opaque, addr); + case 4: + return m5206_mbar_readl(opaque, addr); + default: + g_assert_not_reached(); + } +} + +static void m5206_mbar_writefn(void *opaque, hwaddr addr, + uint64_t value, unsigned size) +{ + switch (size) { + case 1: + m5206_mbar_writeb(opaque, addr, value); + break; + case 2: + m5206_mbar_writew(opaque, addr, value); + break; + case 4: + m5206_mbar_writel(opaque, addr, value); + break; + default: + g_assert_not_reached(); + } +} + static const MemoryRegionOps m5206_mbar_ops = { - .old_mmio = { - .read = { - m5206_mbar_readb, - m5206_mbar_readw, - m5206_mbar_readl, - }, - .write = { - m5206_mbar_writeb, - m5206_mbar_writew, - m5206_mbar_writel, - }, - }, + .read = m5206_mbar_readfn, + .write = m5206_mbar_writefn, + .valid.min_access_size = 1, + .valid.max_access_size = 4, .endianness = DEVICE_NATIVE_ENDIAN, }; diff --git a/hw/misc/aspeed_scu.c b/hw/misc/aspeed_scu.c index 5e6d5744ee..59315010db 100644 --- a/hw/misc/aspeed_scu.c +++ b/hw/misc/aspeed_scu.c @@ -16,6 +16,7 @@ #include "qapi/visitor.h" #include "qemu/bitops.h" #include "qemu/log.h" +#include "crypto/random.h" #include "trace.h" #define TO_REG(offset) ((offset) >> 2) @@ -154,6 +155,19 @@ static const uint32_t ast2500_a1_resets[ASPEED_SCU_NR_REGS] = { [BMC_DEV_ID] = 0x00002402U }; +static uint32_t aspeed_scu_get_random(void) +{ + Error *err = NULL; + uint32_t num; + + if (qcrypto_random_bytes((uint8_t *)&num, sizeof(num), &err)) { + error_report_err(err); + exit(1); + } + + return num; +} + static uint64_t aspeed_scu_read(void *opaque, hwaddr offset, unsigned size) { AspeedSCUState *s = ASPEED_SCU(opaque); @@ -167,6 +181,12 @@ static uint64_t aspeed_scu_read(void *opaque, hwaddr offset, unsigned size) } switch (reg) { + case RNG_DATA: + /* On hardware, RNG_DATA works regardless of + * the state of the enable bit in RNG_CTRL + */ + s->regs[RNG_DATA] = aspeed_scu_get_random(); + break; case WAKEUP_EN: qemu_log_mask(LOG_GUEST_ERROR, "%s: Read of write-only offset 0x%" HWADDR_PRIx "\n", diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index aaa6010d5c..1b0880ac9e 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -112,7 +112,8 @@ static void spapr_tce_free_table(uint64_t *table, int fd, uint32_t nb_table) /* Called from RCU critical section */ static IOMMUTLBEntry spapr_tce_translate_iommu(IOMMUMemoryRegion *iommu, hwaddr addr, - 
IOMMUAccessFlags flag) + IOMMUAccessFlags flag, + int iommu_idx) { sPAPRTCETable *tcet = container_of(iommu, sPAPRTCETable, iommu); uint64_t tce; @@ -428,7 +429,7 @@ static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba, entry.translated_addr = tce & page_mask; entry.addr_mask = ~page_mask; entry.perm = spapr_tce_iommu_access_flags(tce); - memory_region_notify_iommu(&tcet->iommu, entry); + memory_region_notify_iommu(&tcet->iommu, 0, entry); return H_SUCCESS; } diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 10da87458e..e3e0ebb7f6 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -484,7 +484,7 @@ uint16_t s390_guest_io_table_walk(uint64_t g_iota, hwaddr addr, } static IOMMUTLBEntry s390_translate_iommu(IOMMUMemoryRegion *mr, hwaddr addr, - IOMMUAccessFlags flag) + IOMMUAccessFlags flag, int iommu_idx) { S390PCIIOMMU *iommu = container_of(mr, S390PCIIOMMU, iommu_mr); S390IOTLBEntry *entry; diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index d1a5f79678..7b61367ee3 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -589,7 +589,7 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) } notify.perm = IOMMU_NONE; - memory_region_notify_iommu(&iommu->iommu_mr, notify); + memory_region_notify_iommu(&iommu->iommu_mr, 0, notify); notify.perm = entry->perm; } @@ -601,7 +601,7 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) g_hash_table_replace(iommu->iotlb, &cache->iova, cache); } - memory_region_notify_iommu(&iommu->iommu_mr, notify); + memory_region_notify_iommu(&iommu->iommu_mr, 0, notify); } int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) diff --git a/hw/sh4/sh7750.c b/hw/sh4/sh7750.c index 2dc07a904b..2fb6e618d9 100644 --- a/hw/sh4/sh7750.c +++ b/hw/sh4/sh7750.c @@ -450,15 +450,43 @@ static void sh7750_mem_writel(void *opaque, hwaddr addr, } } +static uint64_t sh7750_mem_readfn(void *opaque, hwaddr addr, unsigned size) +{ + switch (size) { + case 1: + return sh7750_mem_readb(opaque, addr); + case 2: + return sh7750_mem_readw(opaque, addr); + case 4: + return sh7750_mem_readl(opaque, addr); + default: + g_assert_not_reached(); + } +} + +static void sh7750_mem_writefn(void *opaque, hwaddr addr, + uint64_t value, unsigned size) +{ + switch (size) { + case 1: + sh7750_mem_writeb(opaque, addr, value); + break; + case 2: + sh7750_mem_writew(opaque, addr, value); + break; + case 4: + sh7750_mem_writel(opaque, addr, value); + break; + default: + g_assert_not_reached(); + } +} + static const MemoryRegionOps sh7750_mem_ops = { - .old_mmio = { - .read = {sh7750_mem_readb, - sh7750_mem_readw, - sh7750_mem_readl }, - .write = {sh7750_mem_writeb, - sh7750_mem_writew, - sh7750_mem_writel }, - }, + .read = sh7750_mem_readfn, + .write = sh7750_mem_writefn, + .valid.min_access_size = 1, + .valid.max_access_size = 4, .endianness = DEVICE_NATIVE_ENDIAN, }; diff --git a/hw/sparc/sun4m_iommu.c b/hw/sparc/sun4m_iommu.c index b677601fc6..7ca1e3fce4 100644 --- a/hw/sparc/sun4m_iommu.c +++ b/hw/sparc/sun4m_iommu.c @@ -282,7 +282,8 @@ static void iommu_bad_addr(IOMMUState *s, hwaddr addr, /* Called from RCU critical section */ static IOMMUTLBEntry sun4m_translate_iommu(IOMMUMemoryRegion *iommu, hwaddr addr, - IOMMUAccessFlags flags) + IOMMUAccessFlags flags, + int iommu_idx) { IOMMUState *is = container_of(iommu, IOMMUState, iommu); hwaddr page, pa; diff --git a/hw/sparc64/sun4u_iommu.c b/hw/sparc64/sun4u_iommu.c index eb3aaa87e6..1ef7645ba5 
100644 --- a/hw/sparc64/sun4u_iommu.c +++ b/hw/sparc64/sun4u_iommu.c @@ -73,7 +73,7 @@ /* Called from RCU critical section */ static IOMMUTLBEntry sun4u_translate_iommu(IOMMUMemoryRegion *iommu, hwaddr addr, - IOMMUAccessFlags flag) + IOMMUAccessFlags flag, int iommu_idx) { IOMMUState *is = container_of(iommu, IOMMUState, iommu); hwaddr baseaddr, offset; diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 8e57265edf..fb396cf00a 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -507,6 +507,7 @@ static void vfio_listener_region_add(MemoryListener *listener, if (memory_region_is_iommu(section->mr)) { VFIOGuestIOMMU *giommu; IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); + int iommu_idx; trace_vfio_listener_region_add_iommu(iova, end); /* @@ -523,10 +524,13 @@ static void vfio_listener_region_add(MemoryListener *listener, llend = int128_add(int128_make64(section->offset_within_region), section->size); llend = int128_sub(llend, int128_one()); + iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, + MEMTXATTRS_UNSPECIFIED); iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, IOMMU_NOTIFIER_ALL, section->offset_within_region, - int128_get64(llend)); + int128_get64(llend), + iommu_idx); QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); memory_region_register_iommu_notifier(section->mr, &giommu->n); diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 96175b214d..b129cb9ddd 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -662,6 +662,8 @@ static void vhost_iommu_region_add(MemoryListener *listener, iommu_listener); struct vhost_iommu *iommu; Int128 end; + int iommu_idx; + IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); if (!memory_region_is_iommu(section->mr)) { return; @@ -671,10 +673,13 @@ static void vhost_iommu_region_add(MemoryListener *listener, end = int128_add(int128_make64(section->offset_within_region), section->size); end = int128_sub(end, int128_one()); + iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, + MEMTXATTRS_UNSPECIFIED); iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify, IOMMU_NOTIFIER_UNMAP, section->offset_within_region, - int128_get64(end)); + int128_get64(end), + iommu_idx); iommu->mr = section->mr; iommu->iommu_offset = section->offset_within_address_space - section->offset_within_region; diff --git a/hw/watchdog/wdt_i6300esb.c b/hw/watchdog/wdt_i6300esb.c index e596b0804d..7b59469888 100644 --- a/hw/watchdog/wdt_i6300esb.c +++ b/hw/watchdog/wdt_i6300esb.c @@ -361,19 +361,43 @@ static void i6300esb_mem_writel(void *vp, hwaddr addr, uint32_t val) } } +static uint64_t i6300esb_mem_readfn(void *opaque, hwaddr addr, unsigned size) +{ + switch (size) { + case 1: + return i6300esb_mem_readb(opaque, addr); + case 2: + return i6300esb_mem_readw(opaque, addr); + case 4: + return i6300esb_mem_readl(opaque, addr); + default: + g_assert_not_reached(); + } +} + +static void i6300esb_mem_writefn(void *opaque, hwaddr addr, + uint64_t value, unsigned size) +{ + switch (size) { + case 1: + i6300esb_mem_writeb(opaque, addr, value); + break; + case 2: + i6300esb_mem_writew(opaque, addr, value); + break; + case 4: + i6300esb_mem_writel(opaque, addr, value); + break; + default: + g_assert_not_reached(); + } +} + static const MemoryRegionOps i6300esb_ops = { - .old_mmio = { - .read = { - i6300esb_mem_readb, - i6300esb_mem_readw, - i6300esb_mem_readl, - }, - .write = { - i6300esb_mem_writeb, - i6300esb_mem_writew, - i6300esb_mem_writel, - }, - }, + .read = i6300esb_mem_readfn, + .write = 
i6300esb_mem_writefn, + .valid.min_access_size = 1, + .valid.max_access_size = 4, .endianness = DEVICE_LITTLE_ENDIAN, }; diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h index a635f532f9..7fa726b8e3 100644 --- a/include/exec/cpu-all.h +++ b/include/exec/cpu-all.h @@ -133,6 +133,8 @@ static inline void tswap64s(uint64_t *s) #define stq_p(p, v) stq_be_p(p, v) #define stfl_p(p, v) stfl_be_p(p, v) #define stfq_p(p, v) stfq_be_p(p, v) +#define ldn_p(p, sz) ldn_be_p(p, sz) +#define stn_p(p, sz, v) stn_be_p(p, sz, v) #else #define lduw_p(p) lduw_le_p(p) #define ldsw_p(p) ldsw_le_p(p) @@ -145,6 +147,8 @@ static inline void tswap64s(uint64_t *s) #define stq_p(p, v) stq_le_p(p, v) #define stfl_p(p, v) stfl_le_p(p, v) #define stfq_p(p, v) stfq_le_p(p, v) +#define ldn_p(p, sz) ldn_le_p(p, sz) +#define stn_p(p, sz, v) stn_le_p(p, sz, v) #endif /* MMU memory access macros */ diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h index e43ff8346b..a171ffc1a4 100644 --- a/include/exec/cpu-defs.h +++ b/include/exec/cpu-defs.h @@ -127,6 +127,15 @@ QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS)); * structs into one.) */ typedef struct CPUIOTLBEntry { + /* + * @addr contains: + * - in the lower TARGET_PAGE_BITS, a physical section number + * - with the lower TARGET_PAGE_BITS masked off, an offset which + * must be added to the virtual address to obtain: + * + the ram_addr_t of the target RAM (if the physical section + * number is PHYS_SECTION_NOTDIRTY or PHYS_SECTION_ROM) + * + the offset within the target MemoryRegion (otherwise) + */ hwaddr addr; MemTxAttrs attrs; } CPUIOTLBEntry; diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 4d09eaba72..8bbea787a9 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -437,8 +437,17 @@ void tb_lock_reset(void); #if !defined(CONFIG_USER_ONLY) -struct MemoryRegion *iotlb_to_region(CPUState *cpu, - hwaddr index, MemTxAttrs attrs); +/** + * iotlb_to_section: + * @cpu: CPU performing the access + * @index: TCG CPU IOTLB entry + * + * Given a TCG CPU IOTLB entry, return the MemoryRegionSection that + * it refers to. @index will have been initially created and returned + * by memory_region_section_get_iotlb(). 
+ */ +struct MemoryRegionSection *iotlb_to_section(CPUState *cpu, + hwaddr index, MemTxAttrs attrs); void tlb_fill(CPUState *cpu, target_ulong addr, int size, MMUAccessType access_type, int mmu_idx, uintptr_t retaddr); @@ -469,7 +478,8 @@ void tb_flush_jmp_cache(CPUState *cpu, target_ulong addr); MemoryRegionSection * address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr, - hwaddr *xlat, hwaddr *plen); + hwaddr *xlat, hwaddr *plen, + MemTxAttrs attrs, int *prot); hwaddr memory_region_section_get_iotlb(CPUState *cpu, MemoryRegionSection *section, target_ulong vaddr, diff --git a/include/exec/memory.h b/include/exec/memory.h index eb2ba06519..050323f532 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -98,18 +98,21 @@ struct IOMMUNotifier { /* Notify for address space range start <= addr <= end */ hwaddr start; hwaddr end; + int iommu_idx; QLIST_ENTRY(IOMMUNotifier) node; }; typedef struct IOMMUNotifier IOMMUNotifier; static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, IOMMUNotifierFlag flags, - hwaddr start, hwaddr end) + hwaddr start, hwaddr end, + int iommu_idx) { n->notify = fn; n->notifier_flags = flags; n->start = start; n->end = end; + n->iommu_idx = iommu_idx; } /* @@ -206,6 +209,20 @@ enum IOMMUMemoryRegionAttr { * to report whenever mappings are changed, by calling * memory_region_notify_iommu() (or, if necessary, by calling * memory_region_notify_one() for each registered notifier). + * + * Conceptually an IOMMU provides a mapping from input address + * to an output TLB entry. If the IOMMU is aware of memory transaction + * attributes and the output TLB entry depends on the transaction + * attributes, we represent this using IOMMU indexes. Each index + * selects a particular translation table that the IOMMU has: + * @attrs_to_index returns the IOMMU index for a set of transaction attributes + * @translate takes an input address and an IOMMU index + * and the mapping returned can only depend on the input address and the + * IOMMU index. + * + * Most IOMMUs don't care about the transaction attributes and support + * only a single IOMMU index. A more complex IOMMU might have one index + * for secure transactions and one for non-secure transactions. */ typedef struct IOMMUMemoryRegionClass { /* private */ @@ -234,9 +251,10 @@ typedef struct IOMMUMemoryRegionClass { * @iommu: the IOMMUMemoryRegion * @hwaddr: address to be translated within the memory region * @flag: requested access permissions + * @iommu_idx: IOMMU index for the translation */ IOMMUTLBEntry (*translate)(IOMMUMemoryRegion *iommu, hwaddr addr, - IOMMUAccessFlags flag); + IOMMUAccessFlags flag, int iommu_idx); /* Returns minimum supported page size in bytes. * If this method is not provided then the minimum is assumed to * be TARGET_PAGE_SIZE. @@ -290,6 +308,29 @@ typedef struct IOMMUMemoryRegionClass { */ int (*get_attr)(IOMMUMemoryRegion *iommu, enum IOMMUMemoryRegionAttr attr, void *data); + + /* Return the IOMMU index to use for a given set of transaction attributes. + * + * Optional method: if an IOMMU only supports a single IOMMU index then + * the default implementation of memory_region_iommu_attrs_to_index() + * will return 0. + * + * The indexes supported by an IOMMU must be contiguous, starting at 0. + * + * @iommu: the IOMMUMemoryRegion + * @attrs: memory transaction attributes + */ + int (*attrs_to_index)(IOMMUMemoryRegion *iommu, MemTxAttrs attrs); + + /* Return the number of IOMMU indexes this IOMMU supports. 
+ * + * Optional method: if this method is not provided, then + * memory_region_iommu_num_indexes() will return 1, indicating that + * only a single IOMMU index is supported. + * + * @iommu: the IOMMUMemoryRegion + */ + int (*num_indexes)(IOMMUMemoryRegion *iommu); } IOMMUMemoryRegionClass; typedef struct CoalescedMemoryRange CoalescedMemoryRange; @@ -971,11 +1012,13 @@ uint64_t memory_region_iommu_get_min_page_size(IOMMUMemoryRegion *iommu_mr); * should be notified with an UNMAP followed by a MAP. * * @iommu_mr: the memory region that was changed + * @iommu_idx: the IOMMU index for the translation table which has changed * @entry: the new entry in the IOMMU translation table. The entry * replaces all old entries for the same virtual I/O address range. * Deleted entries have .@perm == 0. */ void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + int iommu_idx, IOMMUTLBEntry entry); /** @@ -1055,6 +1098,24 @@ int memory_region_iommu_get_attr(IOMMUMemoryRegion *iommu_mr, void *data); /** + * memory_region_iommu_attrs_to_index: return the IOMMU index to + * use for translations with the given memory transaction attributes. + * + * @iommu_mr: the memory region + * @attrs: the memory transaction attributes + */ +int memory_region_iommu_attrs_to_index(IOMMUMemoryRegion *iommu_mr, + MemTxAttrs attrs); + +/** + * memory_region_iommu_num_indexes: return the total number of IOMMU + * indexes that this IOMMU supports. + * + * @iommu_mr: the memory region + */ +int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr); + +/** * memory_region_name: get a memory region's name * * Returns the string that was used to initialize the memory region. diff --git a/include/hw/arm/arm.h b/include/hw/arm/arm.h index 70fa2287e2..ffed39252d 100644 --- a/include/hw/arm/arm.h +++ b/include/hw/arm/arm.h @@ -23,9 +23,6 @@ typedef enum { ARM_ENDIANNESS_BE32, } arm_endianness; -/* armv7m.c */ -DeviceState *armv7m_init(MemoryRegion *system_memory, int mem_size, int num_irq, - const char *kernel_filename, const char *cpu_type); /** * armv7m_load_kernel: * @cpu: CPU @@ -33,9 +30,8 @@ DeviceState *armv7m_init(MemoryRegion *system_memory, int mem_size, int num_irq, * @mem_size: mem_size: maximum image size to load * * Load the guest image for an ARMv7M system. This must be called by - * any ARMv7M board, either directly or via armv7m_init(). (This is - * necessary to ensure that the CPU resets correctly on system reset, - * as well as for kernel loading.) + * any ARMv7M board. (This is necessary to ensure that the CPU resets + * correctly on system reset, as well as for kernel loading.) */ void armv7m_load_kernel(ARMCPU *cpu, const char *kernel_filename, int mem_size); diff --git a/include/hw/or-irq.h b/include/hw/or-irq.h index 3f6fc1b58a..5a31e5a188 100644 --- a/include/hw/or-irq.h +++ b/include/hw/or-irq.h @@ -31,7 +31,10 @@ #define TYPE_OR_IRQ "or-irq" -#define MAX_OR_LINES 16 +/* This can safely be increased if necessary without breaking + * migration compatibility (as long as it remains greater than 15). + */ +#define MAX_OR_LINES 32 typedef struct OrIRQState qemu_or_irq; diff --git a/include/qemu/bswap.h b/include/qemu/bswap.h index 3f28f661b1..a684c1a7a2 100644 --- a/include/qemu/bswap.h +++ b/include/qemu/bswap.h @@ -290,6 +290,15 @@ typedef union { * For accessors that take a guest address rather than a * host address, see the cpu_{ld,st}_* accessors defined in * cpu_ldst.h. 
+ * + * For cases where the size to be used is not fixed at compile time, + * there are + * stn{endian}_p(ptr, sz, val) + * which stores @val to @ptr as an @endian-order number @sz bytes in size + * and + * ldn{endian}_p(ptr, sz) + * which loads @sz bytes from @ptr as an unsigned @endian-order number + * and returns it in a uint64_t. */ static inline int ldub_p(const void *ptr) @@ -495,6 +504,49 @@ static inline unsigned long leul_to_cpu(unsigned long v) #endif } +/* Store v to p as a sz byte value in host order */ +#define DO_STN_LDN_P(END) \ + static inline void stn_## END ## _p(void *ptr, int sz, uint64_t v) \ + { \ + switch (sz) { \ + case 1: \ + stb_p(ptr, v); \ + break; \ + case 2: \ + stw_ ## END ## _p(ptr, v); \ + break; \ + case 4: \ + stl_ ## END ## _p(ptr, v); \ + break; \ + case 8: \ + stq_ ## END ## _p(ptr, v); \ + break; \ + default: \ + g_assert_not_reached(); \ + } \ + } \ + static inline uint64_t ldn_## END ## _p(const void *ptr, int sz) \ + { \ + switch (sz) { \ + case 1: \ + return ldub_p(ptr); \ + case 2: \ + return lduw_ ## END ## _p(ptr); \ + case 4: \ + return (uint32_t)ldl_ ## END ## _p(ptr); \ + case 8: \ + return ldq_ ## END ## _p(ptr); \ + default: \ + g_assert_not_reached(); \ + } \ + } + +DO_STN_LDN_P(he) +DO_STN_LDN_P(le) +DO_STN_LDN_P(be) + +#undef DO_STN_LDN_P + #undef le_bswap #undef be_bswap #undef le_bswaps diff --git a/include/qom/cpu.h b/include/qom/cpu.h index 9d3afc6c75..cce2fd6acc 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -429,6 +429,9 @@ struct CPUState { uint16_t pending_tlb_flush; int hvf_fd; + + /* track IOMMUs whose translations we've cached in the TCG TLB */ + GArray *iommu_notifiers; }; QTAILQ_HEAD(CPUTailQ, CPUState); @@ -1799,6 +1799,9 @@ void memory_region_register_iommu_notifier(MemoryRegion *mr, iommu_mr = IOMMU_MEMORY_REGION(mr); assert(n->notifier_flags != IOMMU_NOTIFIER_NONE); assert(n->start <= n->end); + assert(n->iommu_idx >= 0 && + n->iommu_idx < memory_region_iommu_num_indexes(iommu_mr)); + QLIST_INSERT_HEAD(&iommu_mr->iommu_notify, n, node); memory_region_update_iommu_notify_flags(iommu_mr); } @@ -1829,7 +1832,7 @@ void memory_region_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) granularity = memory_region_iommu_get_min_page_size(iommu_mr); for (addr = 0; addr < memory_region_size(mr); addr += granularity) { - iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE); + iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, n->iommu_idx); if (iotlb.perm != IOMMU_NONE) { n->notify(n, &iotlb); } @@ -1891,6 +1894,7 @@ void memory_region_notify_one(IOMMUNotifier *notifier, } void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + int iommu_idx, IOMMUTLBEntry entry) { IOMMUNotifier *iommu_notifier; @@ -1898,7 +1902,9 @@ void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, assert(memory_region_is_iommu(MEMORY_REGION(iommu_mr))); IOMMU_NOTIFIER_FOREACH(iommu_notifier, iommu_mr) { - memory_region_notify_one(iommu_notifier, &entry); + if (iommu_notifier->iommu_idx == iommu_idx) { + memory_region_notify_one(iommu_notifier, &entry); + } } } @@ -1915,6 +1921,29 @@ int memory_region_iommu_get_attr(IOMMUMemoryRegion *iommu_mr, return imrc->get_attr(iommu_mr, attr, data); } +int memory_region_iommu_attrs_to_index(IOMMUMemoryRegion *iommu_mr, + MemTxAttrs attrs) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); + + if (!imrc->attrs_to_index) { + return 0; + } + + return imrc->attrs_to_index(iommu_mr, attrs); +} + +int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr) +{ + 
IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); + + if (!imrc->num_indexes) { + return 1; + } + + return imrc->num_indexes(iommu_mr); +} + void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) { uint8_t mask = 1 << client; diff --git a/target/arm/cpu.c b/target/arm/cpu.c index ab047b9402..e1de45e904 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -767,6 +767,24 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) return; } +#ifndef CONFIG_USER_ONLY + /* The NVIC and M-profile CPU are two halves of a single piece of + * hardware; trying to use one without the other is a command line + * error and will result in segfaults if not caught here. + */ + if (arm_feature(env, ARM_FEATURE_M)) { + if (!env->nvic) { + error_setg(errp, "This board cannot be used with Cortex-M CPUs"); + return; + } + } else { + if (env->nvic) { + error_setg(errp, "This board can only be used with Cortex-M CPUs"); + return; + } + } +#endif + cpu_exec_realizefn(cs, &local_err); if (local_err != NULL) { error_propagate(errp, local_err); diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h index 94f4356ce9..2e76084992 100644 --- a/target/arm/helper-sve.h +++ b/target/arm/helper-sve.h @@ -195,6 +195,15 @@ DEF_HELPER_FLAGS_5(sve_lsl_zpzz_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_5(sve_lsl_zpzz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sel_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sel_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sel_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sel_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + DEF_HELPER_FLAGS_5(sve_asr_zpzw_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sve_asr_zpzw_h, TCG_CALL_NO_RWG, @@ -416,6 +425,230 @@ DEF_HELPER_FLAGS_4(sve_cpy_z_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) DEF_HELPER_FLAGS_4(sve_ext, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_insr_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_insr_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_insr_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_insr_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_3(sve_rev_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_rev_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_rev_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_rev_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_tbl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_tbl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_tbl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_tbl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sve_sunpk_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_sunpk_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_sunpk_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sve_uunpk_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_uunpk_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_uunpk_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_zip_p, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_uzp_p, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_trn_p, TCG_CALL_NO_RWG, void, 
ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_rev_p, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_punpk_p, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_zip_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_zip_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_zip_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_zip_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_uzp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_uzp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_uzp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_uzp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_trn_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_trn_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_trn_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_trn_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_compact_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_compact_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_2(sve_last_active_element, TCG_CALL_NO_RWG, s32, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_revb_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_revb_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_revb_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_revh_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_revh_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_revw_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_rbit_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_rbit_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_rbit_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_rbit_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_splice, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_cmpeq_ppzz_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpne_ppzz_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpge_ppzz_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpgt_ppzz_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphi_ppzz_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphs_ppzz_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_cmpeq_ppzz_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpne_ppzz_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpge_ppzz_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpgt_ppzz_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphi_ppzz_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphs_ppzz_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_cmpeq_ppzz_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpne_ppzz_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpge_ppzz_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpgt_ppzz_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) 
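(As background on how the declarations in this hunk are consumed: a DEF_HELPER_FLAGS_* line in QEMU both declares the out-of-line C function that sve_helper.c implements and generates a gen_helper_*() emitter that the SVE translator can call. The sketch below illustrates that mechanism rather than code added by this series; it assumes the usual dh_ctype mapping of i32 to uint32_t and ptr to void *, and the t_* names are hypothetical temporaries.)

    /* What one of the declarations above roughly amounts to: */
    uint32_t helper_sve_cmpeq_ppzz_b(void *vd, void *vn, void *vm,
                                     void *vg, uint32_t desc);

    /* ...and in the translator, the generated wrapper emits the call,
     * with the helper's i32 return value (the PREDTEST flags) written
     * into the first argument.  t_pd, t_zn, t_zm and t_pg would be
     * TCGv_ptr temporaries pointing at the register file; t_desc is a
     * TCGv_i32 descriptor (all hypothetical names):
     */
    TCGv_i32 t_flags = tcg_temp_new_i32();
    gen_helper_sve_cmpeq_ppzz_b(t_flags, t_pd, t_zn, t_zm, t_pg, t_desc);
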
+DEF_HELPER_FLAGS_5(sve_cmphi_ppzz_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphs_ppzz_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_cmpeq_ppzz_d, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpne_ppzz_d, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpge_ppzz_d, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpgt_ppzz_d, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphi_ppzz_d, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphs_ppzz_d, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_cmpeq_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpne_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpge_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpgt_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphi_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphs_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmple_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmplt_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmplo_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpls_ppzw_b, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_cmpeq_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpne_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpge_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpgt_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphi_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphs_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmple_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmplt_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmplo_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpls_ppzw_h, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_cmpeq_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpne_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpge_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpgt_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphi_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmphs_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmple_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmplt_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmplo_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_cmpls_ppzw_s, TCG_CALL_NO_RWG, + i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_cmpeq_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpne_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpgt_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, 
i32) +DEF_HELPER_FLAGS_4(sve_cmpge_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplt_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmple_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphs_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphi_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplo_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpls_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_cmpeq_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpne_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpgt_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpge_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplt_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmple_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphs_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphi_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplo_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpls_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_cmpeq_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpne_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpgt_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpge_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplt_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmple_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphs_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphi_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplo_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpls_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_cmpeq_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpne_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpgt_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpge_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplt_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmple_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphs_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmphi_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmplo_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_cmpls_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) + DEF_HELPER_FLAGS_5(sve_and_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sve_bic_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sve_eor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) @@ -425,3 +658,64 @@ DEF_HELPER_FLAGS_5(sve_orn_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sve_nor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sve_nand_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_brkpa, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, 
ptr, i32) +DEF_HELPER_FLAGS_5(sve_brkpb, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_brkpas, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_brkpbs, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_brka_z, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_brkb_z, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_brka_m, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_brkb_m, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_brkas_z, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_brkbs_z, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_brkas_m, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_brkbs_m, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_brkn, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_brkns, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sve_cntp, TCG_CALL_NO_RWG, i64, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sve_while, TCG_CALL_NO_RWG, i32, ptr, i32, i32) + +DEF_HELPER_FLAGS_4(sve_subri_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_subri_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_subri_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_subri_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_4(sve_smaxi_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_smaxi_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_smaxi_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_smaxi_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_4(sve_smini_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_smini_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_smini_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_smini_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_4(sve_umaxi_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_umaxi_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_umaxi_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_umaxi_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_4(sve_umini_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_umini_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_umini_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(sve_umini_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_5(gvec_recps_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_recps_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_recps_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_rsqrts_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_rsqrts_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_rsqrts_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) diff --git a/target/arm/helper.h b/target/arm/helper.h index 0c6a144458..879a7229e9 100644 --- a/target/arm/helper.h +++ b/target/arm/helper.h @@ -601,6 +601,25 @@ DEF_HELPER_FLAGS_5(gvec_fcmlas_idx, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_5(gvec_fcmlad, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32) +DEF_HELPER_FLAGS_5(gvec_fadd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_fsub_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fsub_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fsub_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_fmul_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fmul_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fmul_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_ftsmul_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + #ifdef TARGET_AARCH64 #include "helper-a64.h" #include "helper-sve.h" diff --git a/target/arm/sve.decode b/target/arm/sve.decode index 4761d1921e..6f436f9096 100644 --- a/target/arm/sve.decode +++ b/target/arm/sve.decode @@ -24,6 +24,7 @@ %imm4_16_p1 16:4 !function=plus1 %imm6_22_5 22:1 5:5 +%imm7_22_16 22:2 16:5 %imm8_16_10 16:5 10:3 %imm9_16_10 16:s6 10:3 @@ -41,6 +42,8 @@ # Signed 8-bit immediate, optionally shifted left by 8. %sh8_i8s 5:9 !function=expand_imm_sh8s +# Unsigned 8-bit immediate, optionally shifted left by 8. +%sh8_i8u 5:9 !function=expand_imm_sh8u # Either a copy of rd (at bit 0), or a different source # as propagated via the MOVPRFX instruction. @@ -58,6 +61,7 @@ &rri_esz rd rn imm esz &rrr_esz rd rn rm esz &rpr_esz rd pg rn esz +&rpr_s rd pg rn s &rprr_s rd pg rn rm s &rprr_esz rd pg rn rm esz &rprrr_esz rd pg rn rm ra esz @@ -65,6 +69,8 @@ &ptrue rd esz pat s &incdec_cnt rd pat esz imm d u &incdec2_cnt rd rn pat esz imm d u +&incdec_pred rd pg esz d u +&incdec2_pred rd rn pg esz d u ########################################################################### # Named instruction formats. These are generally used to @@ -77,6 +83,9 @@ @pd_pn ........ esz:2 .. .... ....... rn:4 . rd:4 &rr_esz @rd_rn ........ esz:2 ...... ...... rn:5 rd:5 &rr_esz +# Two operand with governing predicate, flags setting +@pd_pg_pn_s ........ . s:1 ...... .. pg:4 . rn:4 . rd:4 &rpr_s + # Three operand with unused vector element size @rd_rn_rm_e0 ........ ... rm:5 ... ... rn:5 rd:5 &rrr_esz esz=0 @@ -85,6 +94,15 @@ # Three operand, vector element size @rd_rn_rm ........ esz:2 . rm:5 ... ... rn:5 rd:5 &rrr_esz +@pd_pn_pm ........ esz:2 .. rm:4 ....... rn:4 . rd:4 &rrr_esz +@rdn_rm ........ esz:2 ...... ...... rm:5 rd:5 \ + &rrr_esz rn=%reg_movprfx +@rdn_sh_i8u ........ esz:2 ...... ...... ..... rd:5 \ + &rri_esz rn=%reg_movprfx imm=%sh8_i8u +@rdn_i8u ........ esz:2 ...... ... imm:8 rd:5 \ + &rri_esz rn=%reg_movprfx +@rdn_i8s ........ esz:2 ...... ... imm:s8 rd:5 \ + &rri_esz rn=%reg_movprfx # Three operand with "memory" size, aka immediate left shift @rd_rn_msz_rm ........ ... rm:5 .... imm:2 rn:5 rd:5 &rrri @@ -94,6 +112,8 @@ &rprr_esz rn=%reg_movprfx @rdm_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 \ &rprr_esz rm=%reg_movprfx +@rd_pg4_rn_rm ........ esz:2 . rm:5 .. pg:4 rn:5 rd:5 &rprr_esz +@pd_pg_rn_rm ........ esz:2 . rm:5 ... pg:3 rn:5 . rd:4 &rprr_esz # Three register operand, with governing predicate, vector element size @rda_pg_rn_rm ........ esz:2 . rm:5 ... pg:3 rn:5 rd:5 \ @@ -103,6 +123,7 @@ # One register operand, with governing predicate, vector element size @rd_pg_rn ........ 
esz:2 ... ... ... pg:3 rn:5 rd:5 &rpr_esz +@rd_pg4_pn ........ esz:2 ... ... .. pg:4 . rn:4 rd:5 &rpr_esz # Two register operands with a 6-bit signed immediate. @rd_rn_i6 ........ ... rn:5 ..... imm:s6 rd:5 &rri @@ -125,6 +146,11 @@ @rdn_dbm ........ .. .... dbm:13 rd:5 \ &rr_dbm rn=%reg_movprfx +# Predicate output, vector and immediate input, +# controlling predicate, element size. +@pd_pg_rn_i7 ........ esz:2 . imm:7 . pg:3 rn:5 . rd:4 &rpri_esz +@pd_pg_rn_i5 ........ esz:2 . imm:s5 ... pg:3 rn:5 . rd:4 &rpri_esz + # Basic Load/Store with 9-bit immediate offset @pd_rn_i9 ........ ........ ...... rn:5 . rd:4 \ &rri imm=%imm9_16_10 @@ -138,6 +164,12 @@ @incdec2_cnt ........ esz:2 .. .... ...... pat:5 rd:5 \ &incdec2_cnt imm=%imm4_16_p1 rn=%reg_movprfx +# One register, predicate. +# User must fill in U and D. +@incdec_pred ........ esz:2 .... .. ..... .. pg:4 rd:5 &incdec_pred +@incdec2_pred ........ esz:2 .... .. ..... .. pg:4 rd:5 \ + &incdec2_pred rn=%reg_movprfx + ########################################################################### # Instruction patterns. Grouped according to the SVE encodingindex.xhtml. @@ -369,6 +401,145 @@ CPY_z_i 00000101 .. 01 .... 00 . ........ ..... @rdn_pg4 imm=%sh8_i8s EXT 00000101 001 ..... 000 ... rm:5 rd:5 \ &rrri rn=%reg_movprfx imm=%imm8_16_10 +### SVE Permute - Unpredicated Group + +# SVE broadcast general register +DUP_s 00000101 .. 1 00000 001110 ..... ..... @rd_rn + +# SVE broadcast indexed element +DUP_x 00000101 .. 1 ..... 001000 rn:5 rd:5 \ + &rri imm=%imm7_22_16 + +# SVE insert SIMD&FP scalar register +INSR_f 00000101 .. 1 10100 001110 ..... ..... @rdn_rm + +# SVE insert general register +INSR_r 00000101 .. 1 00100 001110 ..... ..... @rdn_rm + +# SVE reverse vector elements +REV_v 00000101 .. 1 11000 001110 ..... ..... @rd_rn + +# SVE vector table lookup +TBL 00000101 .. 1 ..... 001100 ..... ..... @rd_rn_rm + +# SVE unpack vector elements +UNPK 00000101 esz:2 1100 u:1 h:1 001110 rn:5 rd:5 + +### SVE Permute - Predicates Group + +# SVE permute predicate elements +ZIP1_p 00000101 .. 10 .... 010 000 0 .... 0 .... @pd_pn_pm +ZIP2_p 00000101 .. 10 .... 010 001 0 .... 0 .... @pd_pn_pm +UZP1_p 00000101 .. 10 .... 010 010 0 .... 0 .... @pd_pn_pm +UZP2_p 00000101 .. 10 .... 010 011 0 .... 0 .... @pd_pn_pm +TRN1_p 00000101 .. 10 .... 010 100 0 .... 0 .... @pd_pn_pm +TRN2_p 00000101 .. 10 .... 010 101 0 .... 0 .... @pd_pn_pm + +# SVE reverse predicate elements +REV_p 00000101 .. 11 0100 010 000 0 .... 0 .... @pd_pn + +# SVE unpack predicate elements +PUNPKLO 00000101 00 11 0000 010 000 0 .... 0 .... @pd_pn_e0 +PUNPKHI 00000101 00 11 0001 010 000 0 .... 0 .... @pd_pn_e0 + +### SVE Permute - Interleaving Group + +# SVE permute vector elements +ZIP1_z 00000101 .. 1 ..... 011 000 ..... ..... @rd_rn_rm +ZIP2_z 00000101 .. 1 ..... 011 001 ..... ..... @rd_rn_rm +UZP1_z 00000101 .. 1 ..... 011 010 ..... ..... @rd_rn_rm +UZP2_z 00000101 .. 1 ..... 011 011 ..... ..... @rd_rn_rm +TRN1_z 00000101 .. 1 ..... 011 100 ..... ..... @rd_rn_rm +TRN2_z 00000101 .. 1 ..... 011 101 ..... ..... @rd_rn_rm + +### SVE Permute - Predicated Group + +# SVE compress active elements +# Note esz >= 2 +COMPACT 00000101 .. 100001 100 ... ..... ..... @rd_pg_rn + +# SVE conditionally broadcast element to vector +CLASTA_z 00000101 .. 10100 0 100 ... ..... ..... @rdn_pg_rm +CLASTB_z 00000101 .. 10100 1 100 ... ..... ..... @rdn_pg_rm + +# SVE conditionally copy element to SIMD&FP scalar +CLASTA_v 00000101 .. 10101 0 100 ... ..... ..... @rd_pg_rn +CLASTB_v 00000101 .. 10101 1 100 ... ..... 
..... @rd_pg_rn + +# SVE conditionally copy element to general register +CLASTA_r 00000101 .. 11000 0 101 ... ..... ..... @rd_pg_rn +CLASTB_r 00000101 .. 11000 1 101 ... ..... ..... @rd_pg_rn + +# SVE copy element to SIMD&FP scalar register +LASTA_v 00000101 .. 10001 0 100 ... ..... ..... @rd_pg_rn +LASTB_v 00000101 .. 10001 1 100 ... ..... ..... @rd_pg_rn + +# SVE copy element to general register +LASTA_r 00000101 .. 10000 0 101 ... ..... ..... @rd_pg_rn +LASTB_r 00000101 .. 10000 1 101 ... ..... ..... @rd_pg_rn + +# SVE copy element from SIMD&FP scalar register +CPY_m_v 00000101 .. 100000 100 ... ..... ..... @rd_pg_rn + +# SVE copy element from general register to vector (predicated) +CPY_m_r 00000101 .. 101000 101 ... ..... ..... @rd_pg_rn + +# SVE reverse within elements +# Note esz >= operation size +REVB 00000101 .. 1001 00 100 ... ..... ..... @rd_pg_rn +REVH 00000101 .. 1001 01 100 ... ..... ..... @rd_pg_rn +REVW 00000101 .. 1001 10 100 ... ..... ..... @rd_pg_rn +RBIT 00000101 .. 1001 11 100 ... ..... ..... @rd_pg_rn + +# SVE vector splice (predicated) +SPLICE 00000101 .. 101 100 100 ... ..... ..... @rdn_pg_rm + +### SVE Select Vectors Group + +# SVE select vector elements (predicated) +SEL_zpzz 00000101 .. 1 ..... 11 .... ..... ..... @rd_pg4_rn_rm + +### SVE Integer Compare - Vectors Group + +# SVE integer compare_vectors +CMPHS_ppzz 00100100 .. 0 ..... 000 ... ..... 0 .... @pd_pg_rn_rm +CMPHI_ppzz 00100100 .. 0 ..... 000 ... ..... 1 .... @pd_pg_rn_rm +CMPGE_ppzz 00100100 .. 0 ..... 100 ... ..... 0 .... @pd_pg_rn_rm +CMPGT_ppzz 00100100 .. 0 ..... 100 ... ..... 1 .... @pd_pg_rn_rm +CMPEQ_ppzz 00100100 .. 0 ..... 101 ... ..... 0 .... @pd_pg_rn_rm +CMPNE_ppzz 00100100 .. 0 ..... 101 ... ..... 1 .... @pd_pg_rn_rm + +# SVE integer compare with wide elements +# Note these require esz != 3. +CMPEQ_ppzw 00100100 .. 0 ..... 001 ... ..... 0 .... @pd_pg_rn_rm +CMPNE_ppzw 00100100 .. 0 ..... 001 ... ..... 1 .... @pd_pg_rn_rm +CMPGE_ppzw 00100100 .. 0 ..... 010 ... ..... 0 .... @pd_pg_rn_rm +CMPGT_ppzw 00100100 .. 0 ..... 010 ... ..... 1 .... @pd_pg_rn_rm +CMPLT_ppzw 00100100 .. 0 ..... 011 ... ..... 0 .... @pd_pg_rn_rm +CMPLE_ppzw 00100100 .. 0 ..... 011 ... ..... 1 .... @pd_pg_rn_rm +CMPHS_ppzw 00100100 .. 0 ..... 110 ... ..... 0 .... @pd_pg_rn_rm +CMPHI_ppzw 00100100 .. 0 ..... 110 ... ..... 1 .... @pd_pg_rn_rm +CMPLO_ppzw 00100100 .. 0 ..... 111 ... ..... 0 .... @pd_pg_rn_rm +CMPLS_ppzw 00100100 .. 0 ..... 111 ... ..... 1 .... @pd_pg_rn_rm + +### SVE Integer Compare - Unsigned Immediate Group + +# SVE integer compare with unsigned immediate +CMPHS_ppzi 00100100 .. 1 ....... 0 ... ..... 0 .... @pd_pg_rn_i7 +CMPHI_ppzi 00100100 .. 1 ....... 0 ... ..... 1 .... @pd_pg_rn_i7 +CMPLO_ppzi 00100100 .. 1 ....... 1 ... ..... 0 .... @pd_pg_rn_i7 +CMPLS_ppzi 00100100 .. 1 ....... 1 ... ..... 1 .... @pd_pg_rn_i7 + +### SVE Integer Compare - Signed Immediate Group + +# SVE integer compare with signed immediate +CMPGE_ppzi 00100101 .. 0 ..... 000 ... ..... 0 .... @pd_pg_rn_i5 +CMPGT_ppzi 00100101 .. 0 ..... 000 ... ..... 1 .... @pd_pg_rn_i5 +CMPLT_ppzi 00100101 .. 0 ..... 001 ... ..... 0 .... @pd_pg_rn_i5 +CMPLE_ppzi 00100101 .. 0 ..... 001 ... ..... 1 .... @pd_pg_rn_i5 +CMPEQ_ppzi 00100101 .. 0 ..... 100 ... ..... 0 .... @pd_pg_rn_i5 +CMPNE_ppzi 00100101 .. 0 ..... 100 ... ..... 1 .... @pd_pg_rn_i5 + ### SVE Predicate Logical Operations Group # SVE predicate logical operations @@ -410,6 +581,83 @@ PFIRST 00100101 01 011 000 11000 00 .... 0 .... @pd_pn_e0 # SVE predicate next active PNEXT 00100101 .. 
011 001 11000 10 .... 0 .... @pd_pn +### SVE Partition Break Group + +# SVE propagate break from previous partition +BRKPA 00100101 0. 00 .... 11 .... 0 .... 0 .... @pd_pg_pn_pm_s +BRKPB 00100101 0. 00 .... 11 .... 0 .... 1 .... @pd_pg_pn_pm_s + +# SVE partition break condition +BRKA_z 00100101 0. 01000001 .... 0 .... 0 .... @pd_pg_pn_s +BRKB_z 00100101 1. 01000001 .... 0 .... 0 .... @pd_pg_pn_s +BRKA_m 00100101 0. 01000001 .... 0 .... 1 .... @pd_pg_pn_s +BRKB_m 00100101 1. 01000001 .... 0 .... 1 .... @pd_pg_pn_s + +# SVE propagate break to next partition +BRKN 00100101 0. 01100001 .... 0 .... 0 .... @pd_pg_pn_s + +### SVE Predicate Count Group + +# SVE predicate count +CNTP 00100101 .. 100 000 10 .... 0 .... ..... @rd_pg4_pn + +# SVE inc/dec register by predicate count +INCDECP_r 00100101 .. 10110 d:1 10001 00 .... ..... @incdec_pred u=1 + +# SVE inc/dec vector by predicate count +INCDECP_z 00100101 .. 10110 d:1 10000 00 .... ..... @incdec2_pred u=1 + +# SVE saturating inc/dec register by predicate count +SINCDECP_r_32 00100101 .. 1010 d:1 u:1 10001 00 .... ..... @incdec_pred +SINCDECP_r_64 00100101 .. 1010 d:1 u:1 10001 10 .... ..... @incdec_pred + +# SVE saturating inc/dec vector by predicate count +SINCDECP_z 00100101 .. 1010 d:1 u:1 10000 00 .... ..... @incdec2_pred + +### SVE Integer Compare - Scalars Group + +# SVE conditionally terminate scalars +CTERM 00100101 1 sf:1 1 rm:5 001000 rn:5 ne:1 0000 + +# SVE integer compare scalar count and limit +WHILE 00100101 esz:2 1 rm:5 000 sf:1 u:1 1 rn:5 eq:1 rd:4 + +### SVE Integer Wide Immediate - Unpredicated Group + +# SVE broadcast floating-point immediate (unpredicated) +FDUP 00100101 esz:2 111 00 1110 imm:8 rd:5 + +# SVE broadcast integer immediate (unpredicated) +DUP_i 00100101 esz:2 111 00 011 . ........ rd:5 imm=%sh8_i8s + +# SVE integer add/subtract immediate (unpredicated) +ADD_zzi 00100101 .. 100 000 11 . ........ ..... @rdn_sh_i8u +SUB_zzi 00100101 .. 100 001 11 . ........ ..... @rdn_sh_i8u +SUBR_zzi 00100101 .. 100 011 11 . ........ ..... @rdn_sh_i8u +SQADD_zzi 00100101 .. 100 100 11 . ........ ..... @rdn_sh_i8u +UQADD_zzi 00100101 .. 100 101 11 . ........ ..... @rdn_sh_i8u +SQSUB_zzi 00100101 .. 100 110 11 . ........ ..... @rdn_sh_i8u +UQSUB_zzi 00100101 .. 100 111 11 . ........ ..... @rdn_sh_i8u + +# SVE integer min/max immediate (unpredicated) +SMAX_zzi 00100101 .. 101 000 110 ........ ..... @rdn_i8s +UMAX_zzi 00100101 .. 101 001 110 ........ ..... @rdn_i8u +SMIN_zzi 00100101 .. 101 010 110 ........ ..... @rdn_i8s +UMIN_zzi 00100101 .. 101 011 110 ........ ..... @rdn_i8u + +# SVE integer multiply immediate (unpredicated) +MUL_zzi 00100101 .. 110 000 110 ........ ..... @rdn_i8s + +### SVE Floating Point Arithmetic - Unpredicated Group + +# SVE floating-point arithmetic (unpredicated) +FADD_zzz 01100101 .. 0 ..... 000 000 ..... ..... @rd_rn_rm +FSUB_zzz 01100101 .. 0 ..... 000 001 ..... ..... @rd_rn_rm +FMUL_zzz 01100101 .. 0 ..... 000 010 ..... ..... @rd_rn_rm +FTSMUL 01100101 .. 0 ..... 000 011 ..... ..... @rd_rn_rm +FRECPS 01100101 .. 0 ..... 000 110 ..... ..... @rd_rn_rm +FRSQRTS 01100101 .. 0 ..... 000 111 ..... ..... 
@rd_rn_rm + ### SVE Memory - 32-bit Gather and Unsized Contiguous Group # SVE load predicate register diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c index b825e44cb5..128bbf9b04 100644 --- a/target/arm/sve_helper.c +++ b/target/arm/sve_helper.c @@ -74,6 +74,28 @@ static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) return flags; } +/* This is an iterative function, called for each Pd and Pg word + * moving backward. + */ +static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags) +{ + if (likely(g)) { + /* Compute C from first (i.e last) !(D & G). + Use bit 2 to signal first G bit seen. */ + if (!(flags & 4)) { + flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */ + flags |= (d & pow2floor(g)) == 0; + } + + /* Accumulate Z from each D & G. */ + flags |= ((d & g) != 0) << 1; + + /* Compute N from last (i.e first) D & G. Replace previous. */ + flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0); + } + return flags; +} + /* The same for a single word predicate. */ uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) { @@ -238,6 +260,26 @@ static inline uint64_t expand_pred_s(uint8_t byte) return word[byte & 0x11]; } +/* Swap 16-bit words within a 32-bit word. */ +static inline uint32_t hswap32(uint32_t h) +{ + return rol32(h, 16); +} + +/* Swap 16-bit words within a 64-bit word. */ +static inline uint64_t hswap64(uint64_t h) +{ + uint64_t m = 0x0000ffff0000ffffull; + h = rol64(h, 32); + return ((h & m) << 16) | ((h >> 16) & m); +} + +/* Swap 32-bit words within a 64-bit word. */ +static inline uint64_t wswap64(uint64_t h) +{ + return rol64(h, 32); +} + #define LOGICAL_PPPP(NAME, FUNC) \ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ { \ @@ -616,6 +658,20 @@ DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) +DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) +DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) +DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) + +DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) +DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) + +DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) + +DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) +DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) +DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) +DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) + /* Three-operand expander, unpredicated, in which the third operand is "wide". */ #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ @@ -748,6 +804,46 @@ DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) #undef DO_VPZ #undef DO_VPZ_D +/* Two vector operand, one scalar operand, unpredicated. 
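The scalar operand arrives as a uint64_t and is truncated to the element type, so the one expander below covers every element size.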
*/ +#define DO_ZZI(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ + TYPE s = s64, *d = vd, *n = vn; \ + for (i = 0; i < opr_sz; ++i) { \ + d[i] = OP(n[i], s); \ + } \ +} + +#define DO_SUBR(X, Y) (Y - X) + +DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) +DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) +DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) +DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) + +DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) +DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) +DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) +DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) + +DO_ZZI(sve_smini_b, int8_t, DO_MIN) +DO_ZZI(sve_smini_h, int16_t, DO_MIN) +DO_ZZI(sve_smini_s, int32_t, DO_MIN) +DO_ZZI(sve_smini_d, int64_t, DO_MIN) + +DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) +DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) +DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) +DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) + +DO_ZZI(sve_umini_b, uint8_t, DO_MIN) +DO_ZZI(sve_umini_h, uint16_t, DO_MIN) +DO_ZZI(sve_umini_s, uint32_t, DO_MIN) +DO_ZZI(sve_umini_d, uint64_t, DO_MIN) + +#undef DO_ZZI + #undef DO_AND #undef DO_ORR #undef DO_EOR @@ -762,6 +858,7 @@ DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) #undef DO_ASR #undef DO_LSR #undef DO_LSL +#undef DO_SUBR /* Similar to the ARM LastActiveElement pseudocode function, except the result is multiplied by the element size. This includes the not found @@ -1560,3 +1657,1156 @@ void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) memcpy(vd + n_siz, &tmp, n_ofs); } } + +#define DO_INSR(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ + *(TYPE *)(vd + H(0)) = val; \ +} + +DO_INSR(sve_insr_b, uint8_t, H1) +DO_INSR(sve_insr_h, uint16_t, H1_2) +DO_INSR(sve_insr_s, uint32_t, H1_4) +DO_INSR(sve_insr_d, uint64_t, ) + +#undef DO_INSR + +void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = bswap64(b); + *(uint64_t *)(vd + j) = bswap64(f); + } +} + +void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = hswap64(b); + *(uint64_t *)(vd + j) = hswap64(f); + } +} + +void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = rol64(b, 32); + *(uint64_t *)(vd + j) = rol64(f, 32); + } +} + +void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = b; + *(uint64_t *)(vd + j) = f; + } +} + +#define DO_TBL(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + uintptr_t elem = opr_sz / sizeof(TYPE); \ + TYPE *d = vd, *n = vn, *m = vm; \ + ARMVectorReg tmp; \ + if (unlikely(vd == vn)) { \ + 
n = memcpy(&tmp, vn, opr_sz); \ + } \ + for (i = 0; i < elem; i++) { \ + TYPE j = m[H(i)]; \ + d[H(i)] = j < elem ? n[H(j)] : 0; \ + } \ +} + +DO_TBL(sve_tbl_b, uint8_t, H1) +DO_TBL(sve_tbl_h, uint16_t, H2) +DO_TBL(sve_tbl_s, uint32_t, H4) +DO_TBL(sve_tbl_d, uint64_t, ) + +#undef TBL + +#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPED *d = vd; \ + TYPES *n = vn; \ + ARMVectorReg tmp; \ + if (unlikely(vn - vd < opr_sz)) { \ + n = memcpy(&tmp, n, opr_sz / 2); \ + } \ + for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ + d[HD(i)] = n[HS(i)]; \ + } \ +} + +DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) +DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) +DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4) + +DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) +DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) +DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4) + +#undef DO_UNPK + +/* Mask of bits included in the even numbered predicates of width esz. + * We also use this for expand_bits/compress_bits, and so extend the + * same pattern out to 16-bit units. + */ +static const uint64_t even_bit_esz_masks[5] = { + 0x5555555555555555ull, + 0x3333333333333333ull, + 0x0f0f0f0f0f0f0f0full, + 0x00ff00ff00ff00ffull, + 0x0000ffff0000ffffull, +}; + +/* Zero-extend units of 2**N bits to units of 2**(N+1) bits. + * For N==0, this corresponds to the operation that in qemu/bitops.h + * we call half_shuffle64; this algorithm is from Hacker's Delight, + * section 7-2 Shuffling Bits. + */ +static uint64_t expand_bits(uint64_t x, int n) +{ + int i; + + x &= 0xffffffffu; + for (i = 4; i >= n; i--) { + int sh = 1 << i; + x = ((x << sh) | x) & even_bit_esz_masks[i]; + } + return x; +} + +/* Compress units of 2**(N+1) bits to units of 2**N bits. + * For N==0, this corresponds to the operation that in qemu/bitops.h + * we call half_unshuffle64; this algorithm is from Hacker's Delight, + * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. + */ +static uint64_t compress_bits(uint64_t x, int n) +{ + int i; + + for (i = n; i <= 4; i++) { + int sh = 1 << i; + x &= even_bit_esz_masks[i]; + x = (x >> sh) | x; + } + return x & 0xffffffffu; +} + +void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1); + uint64_t *d = vd; + intptr_t i; + + if (oprsz <= 8) { + uint64_t nn = *(uint64_t *)vn; + uint64_t mm = *(uint64_t *)vm; + int half = 4 * oprsz; + + nn = extract64(nn, high * half, half); + mm = extract64(mm, high * half, half); + nn = expand_bits(nn, esz); + mm = expand_bits(mm, esz); + d[0] = nn + (mm << (1 << esz)); + } else { + ARMPredicateReg tmp_n, tmp_m; + + /* We produce output faster than we consume input. + Therefore we must be mindful of possible overlap. 
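Copying an overlapping input into a stack temporary first keeps the expansion loops below simple.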
*/ + if ((vn - vd) < (uintptr_t)oprsz) { + vn = memcpy(&tmp_n, vn, oprsz); + } + if ((vm - vd) < (uintptr_t)oprsz) { + vm = memcpy(&tmp_m, vm, oprsz); + } + if (high) { + high = oprsz >> 1; + } + + if ((high & 3) == 0) { + uint32_t *n = vn, *m = vm; + high >>= 2; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { + uint64_t nn = n[H4(high + i)]; + uint64_t mm = m[H4(high + i)]; + + nn = expand_bits(nn, esz); + mm = expand_bits(mm, esz); + d[i] = nn + (mm << (1 << esz)); + } + } else { + uint8_t *n = vn, *m = vm; + uint16_t *d16 = vd; + + for (i = 0; i < oprsz / 2; i++) { + uint16_t nn = n[H1(high + i)]; + uint16_t mm = m[H1(high + i)]; + + nn = expand_bits(nn, esz); + mm = expand_bits(mm, esz); + d16[H2(i)] = nn + (mm << (1 << esz)); + } + } + } +} + +void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz; + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t l, h; + intptr_t i; + + if (oprsz <= 8) { + l = compress_bits(n[0] >> odd, esz); + h = compress_bits(m[0] >> odd, esz); + d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz); + } else { + ARMPredicateReg tmp_m; + intptr_t oprsz_16 = oprsz / 16; + + if ((vm - vd) < (uintptr_t)oprsz) { + m = memcpy(&tmp_m, vm, oprsz); + } + + for (i = 0; i < oprsz_16; i++) { + l = n[2 * i + 0]; + h = n[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + d[i] = l + (h << 32); + } + + /* For VL which is not a power of 2, the results from M do not + align nicely with the uint64_t for D. Put the aligned results + from M into TMP_M and then copy it into place afterward. */ + if (oprsz & 15) { + d[i] = compress_bits(n[2 * i] >> odd, esz); + + for (i = 0; i < oprsz_16; i++) { + l = m[2 * i + 0]; + h = m[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + tmp_m.p[i] = l + (h << 32); + } + tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz); + + swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); + } else { + for (i = 0; i < oprsz_16; i++) { + l = m[2 * i + 0]; + h = m[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + d[oprsz_16 + i] = l + (h << 32); + } + } + } +} + +void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1); + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t mask; + int shr, shl; + intptr_t i; + + shl = 1 << esz; + shr = 0; + mask = even_bit_esz_masks[esz]; + if (odd) { + mask <<= shl; + shr = shl; + shl = 0; + } + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { + uint64_t nn = (n[i] & mask) >> shr; + uint64_t mm = (m[i] & mask) << shl; + d[i] = nn + mm; + } +} + +/* Reverse units of 2**N bits. 
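That is, the order of the 2**N-bit groups within the word is reversed while the bits inside each group keep their order; N == 0 reverses every bit.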
*/ +static uint64_t reverse_bits_64(uint64_t x, int n) +{ + int i, sh; + + x = bswap64(x); + for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { + uint64_t mask = even_bit_esz_masks[i]; + x = ((x & mask) << sh) | ((x >> sh) & mask); + } + return x; +} + +static uint8_t reverse_bits_8(uint8_t x, int n) +{ + static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; + int i, sh; + + for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { + x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); + } + return x; +} + +void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + intptr_t i, oprsz_2 = oprsz / 2; + + if (oprsz <= 8) { + uint64_t l = *(uint64_t *)vn; + l = reverse_bits_64(l << (64 - 8 * oprsz), esz); + *(uint64_t *)vd = l; + } else if ((oprsz & 15) == 0) { + for (i = 0; i < oprsz_2; i += 8) { + intptr_t ih = oprsz - 8 - i; + uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); + uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); + *(uint64_t *)(vd + i) = h; + *(uint64_t *)(vd + ih) = l; + } + } else { + for (i = 0; i < oprsz_2; i += 1) { + intptr_t il = H1(i); + intptr_t ih = H1(oprsz - 1 - i); + uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); + uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); + *(uint8_t *)(vd + il) = h; + *(uint8_t *)(vd + ih) = l; + } + } +} + +void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1); + uint64_t *d = vd; + intptr_t i; + + if (oprsz <= 8) { + uint64_t nn = *(uint64_t *)vn; + int half = 4 * oprsz; + + nn = extract64(nn, high * half, half); + nn = expand_bits(nn, 0); + d[0] = nn; + } else { + ARMPredicateReg tmp_n; + + /* We produce output faster than we consume input. + Therefore we must be mindful of possible overlap. */ + if ((vn - vd) < (uintptr_t)oprsz) { + vn = memcpy(&tmp_n, vn, oprsz); + } + if (high) { + high = oprsz >> 1; + } + + if ((high & 3) == 0) { + uint32_t *n = vn; + high >>= 2; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { + uint64_t nn = n[H4(high + i)]; + d[i] = expand_bits(nn, 0); + } + } else { + uint16_t *d16 = vd; + uint8_t *n = vn; + + for (i = 0; i < oprsz / 2; i++) { + uint16_t nn = n[H1(high + i)]; + d16[H2(i)] = expand_bits(nn, 0); + } + } + } +} + +#define DO_ZIP(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + intptr_t i, oprsz_2 = oprsz / 2; \ + ARMVectorReg tmp_n, tmp_m; \ + /* We produce output faster than we consume input. \ + Therefore we must be mindful of possible overlap. 
*/ \ + if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ + vn = memcpy(&tmp_n, vn, oprsz_2); \ + } \ + if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ + vm = memcpy(&tmp_m, vm, oprsz_2); \ + } \ + for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ + *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \ + } \ +} + +DO_ZIP(sve_zip_b, uint8_t, H1) +DO_ZIP(sve_zip_h, uint16_t, H1_2) +DO_ZIP(sve_zip_s, uint32_t, H1_4) +DO_ZIP(sve_zip_d, uint64_t, ) + +#define DO_UZP(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + intptr_t oprsz_2 = oprsz / 2; \ + intptr_t odd_ofs = simd_data(desc); \ + intptr_t i; \ + ARMVectorReg tmp_m; \ + if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ + vm = memcpy(&tmp_m, vm, oprsz); \ + } \ + for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ + *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \ + } \ + for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ + *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \ + } \ +} + +DO_UZP(sve_uzp_b, uint8_t, H1) +DO_UZP(sve_uzp_h, uint16_t, H1_2) +DO_UZP(sve_uzp_s, uint32_t, H1_4) +DO_UZP(sve_uzp_d, uint64_t, ) + +#define DO_TRN(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + intptr_t odd_ofs = simd_data(desc); \ + intptr_t i; \ + for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ + TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ + TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ + *(TYPE *)(vd + H(i + 0)) = ae; \ + *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ + } \ +} + +DO_TRN(sve_trn_b, uint8_t, H1) +DO_TRN(sve_trn_h, uint16_t, H1_2) +DO_TRN(sve_trn_s, uint32_t, H1_4) +DO_TRN(sve_trn_d, uint64_t, ) + +#undef DO_ZIP +#undef DO_UZP +#undef DO_TRN + +void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; + uint32_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = j = 0; i < opr_sz; i++) { + if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { + d[H4(j)] = n[H4(i)]; + j++; + } + } + for (; j < opr_sz; j++) { + d[H4(j)] = 0; + } +} + +void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = j = 0; i < opr_sz; i++) { + if (pg[H1(i)] & 1) { + d[j] = n[i]; + j++; + } + } + for (; j < opr_sz; j++) { + d[j] = 0; + } +} + +/* Similar to the ARM LastActiveElement pseudocode function, except the + * result is multiplied by the element size. This includes the not found + * indication; e.g. not found for esz=3 is -8. + */ +int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + + return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz); +} + +void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) +{ + intptr_t opr_sz = simd_oprsz(desc) / 8; + int esz = simd_data(desc); + uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; + intptr_t i, first_i, last_i; + ARMVectorReg tmp; + + first_i = last_i = 0; + first_g = last_g = 0; + + /* Find the extent of the active elements within VG. 
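The backward scan below means last_g/last_i are taken from the highest active word, while first_g/first_i end up describing the lowest one.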
*/ + for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { + pg = *(uint64_t *)(vg + i) & mask; + if (pg) { + if (last_g == 0) { + last_g = pg; + last_i = i; + } + first_g = pg; + first_i = i; + } + } + + len = 0; + if (first_g != 0) { + first_i = first_i * 8 + ctz64(first_g); + last_i = last_i * 8 + 63 - clz64(last_g); + len = last_i - first_i + (1 << esz); + if (vd == vm) { + vm = memcpy(&tmp, vm, opr_sz * 8); + } + swap_memmove(vd, vn + first_i, len); + } + swap_memmove(vd + len, vm, opr_sz * 8 - len); +} + +void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + uint64_t pp = expand_pred_b(pg[H1(i)]); + d[i] = (nn & pp) | (mm & ~pp); + } +} + +void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + uint64_t pp = expand_pred_h(pg[H1(i)]); + d[i] = (nn & pp) | (mm & ~pp); + } +} + +void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + uint64_t pp = expand_pred_s(pg[H1(i)]); + d[i] = (nn & pp) | (mm & ~pp); + } +} + +void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + d[i] = (pg[H1(i)] & 1 ? nn : mm); + } +} + +/* Two operand comparison controlled by a predicate. + * ??? It is very tempting to want to be able to expand this inline + * with x86 instructions, e.g. + * + * vcmpeqw zm, zn, %ymm0 + * vpmovmskb %ymm0, %eax + * and $0x5555, %eax + * and pg, %eax + * + * or even aarch64, e.g. + * + * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 + * cmeq v0.8h, zn, zm + * and v0.8h, v0.8h, mask + * addv h0, v0.8h + * and v0.8b, pg + * + * However, coming up with an abstraction that allows vector inputs and + * a scalar output, and also handles the byte-ordering of sub-uint64_t + * scalar outputs, is tricky. 
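 * The portable loops below therefore work one 64-bit predicate word at a
 * time, accumulating the result mask in a scalar and folding each word
 * into the PREDTEST flags via iter_predtest_bwd().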
+ */ +#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + uint32_t flags = PREDTEST_INIT; \ + intptr_t i = opr_sz; \ + do { \ + uint64_t out = 0, pg; \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + out |= nn OP mm; \ + } while (i & 63); \ + pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ + out &= pg; \ + *(uint64_t *)(vd + (i >> 3)) = out; \ + flags = iter_predtest_bwd(out, pg, flags); \ + } while (i > 0); \ + return flags; \ +} + +#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) +#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) +#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) +#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull) + +DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) +DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) +DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) +DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) + +DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) +DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) +DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) +DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) + +DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) +DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) +DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) +DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) + +DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) +DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) +DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) +DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) + +DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) +DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) +DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) +DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) + +DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) +DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) +DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) +DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) + +#undef DO_CMP_PPZZ_B +#undef DO_CMP_PPZZ_H +#undef DO_CMP_PPZZ_S +#undef DO_CMP_PPZZ_D +#undef DO_CMP_PPZZ + +/* Similar, but the second source is "wide". 
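Each 64-bit element of the second operand is compared against every narrower element in the corresponding 64-bit chunk of the first operand.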
*/ +#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + uint32_t flags = PREDTEST_INIT; \ + intptr_t i = opr_sz; \ + do { \ + uint64_t out = 0, pg; \ + do { \ + TYPEW mm = *(TYPEW *)(vm + i - 8); \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + out |= nn OP mm; \ + } while (i & 7); \ + } while (i & 63); \ + pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ + out &= pg; \ + *(uint64_t *)(vd + (i >> 3)) = out; \ + flags = iter_predtest_bwd(out, pg, flags); \ + } while (i > 0); \ + return flags; \ +} + +#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ + DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) +#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ + DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) +#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ + DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) + +DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, uint8_t, uint64_t, ==) +DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, uint16_t, uint64_t, ==) +DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, uint32_t, uint64_t, ==) + +DO_CMP_PPZW_B(sve_cmpne_ppzw_b, uint8_t, uint64_t, !=) +DO_CMP_PPZW_H(sve_cmpne_ppzw_h, uint16_t, uint64_t, !=) +DO_CMP_PPZW_S(sve_cmpne_ppzw_s, uint32_t, uint64_t, !=) + +DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) +DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) +DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) + +DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) +DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) +DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) + +DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) +DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) +DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) + +DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) +DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) +DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) + +DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) +DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) +DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) + +DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) +DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) +DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) + +DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) +DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) +DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) + +DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) +DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) +DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) + +#undef DO_CMP_PPZW_B +#undef DO_CMP_PPZW_H +#undef DO_CMP_PPZW_S +#undef DO_CMP_PPZW + +/* Similar, but the second source is immediate. 
*/ +#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ +uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + uint32_t flags = PREDTEST_INIT; \ + TYPE mm = simd_data(desc); \ + intptr_t i = opr_sz; \ + do { \ + uint64_t out = 0, pg; \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + out |= nn OP mm; \ + } while (i & 63); \ + pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ + out &= pg; \ + *(uint64_t *)(vd + (i >> 3)) = out; \ + flags = iter_predtest_bwd(out, pg, flags); \ + } while (i > 0); \ + return flags; \ +} + +#define DO_CMP_PPZI_B(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) +#define DO_CMP_PPZI_H(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) +#define DO_CMP_PPZI_S(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) +#define DO_CMP_PPZI_D(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull) + +DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) +DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) +DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) +DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) + +DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) +DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) +DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) +DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) + +DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) +DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) +DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) +DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) + +DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) +DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) +DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) +DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) + +DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) +DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) +DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) +DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) + +DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) +DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) +DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) +DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) + +DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) +DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) +DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) +DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) + +DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) +DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) +DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) +DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) + +DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) +DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) +DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) +DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) + +DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) +DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) +DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) +DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) + +#undef DO_CMP_PPZI_B +#undef DO_CMP_PPZI_H +#undef DO_CMP_PPZI_S +#undef DO_CMP_PPZI_D +#undef DO_CMP_PPZI + +/* Similar to the ARM LastActive pseudocode function. */ +static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) +{ + intptr_t i; + + for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { + uint64_t pg = *(uint64_t *)(vg + i); + if (pg) { + return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; + } + } + return 0; +} + +/* Compute a mask into RETB that is true for all G, up to and including + * (if after) or excluding (if !after) the first G & N. + * Return true if BRK found. 
+ */ +static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, + bool brk, bool after) +{ + uint64_t b; + + if (brk) { + b = 0; + } else if ((g & n) == 0) { + /* For all G, no N are set; break not found. */ + b = g; + } else { + /* Break somewhere in N. Locate it. */ + b = g & n; /* guard true, pred true */ + b = b & -b; /* first such */ + if (after) { + b = b | (b - 1); /* break after same */ + } else { + b = b - 1; /* break before same */ + } + brk = true; + } + + *retb = b; + return brk; +} + +/* Compute a zeroing BRK. */ +static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, + intptr_t oprsz, bool after) +{ + bool brk = false; + intptr_t i; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { + uint64_t this_b, this_g = g[i]; + + brk = compute_brk(&this_b, n[i], this_g, brk, after); + d[i] = this_b & this_g; + } +} + +/* Likewise, but also compute flags. */ +static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, + intptr_t oprsz, bool after) +{ + uint32_t flags = PREDTEST_INIT; + bool brk = false; + intptr_t i; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { + uint64_t this_b, this_d, this_g = g[i]; + + brk = compute_brk(&this_b, n[i], this_g, brk, after); + d[i] = this_d = this_b & this_g; + flags = iter_predtest_fwd(this_d, this_g, flags); + } + return flags; +} + +/* Compute a merging BRK. */ +static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, + intptr_t oprsz, bool after) +{ + bool brk = false; + intptr_t i; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { + uint64_t this_b, this_g = g[i]; + + brk = compute_brk(&this_b, n[i], this_g, brk, after); + d[i] = (this_b & this_g) | (d[i] & ~this_g); + } +} + +/* Likewise, but also compute flags. */ +static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, + intptr_t oprsz, bool after) +{ + uint32_t flags = PREDTEST_INIT; + bool brk = false; + intptr_t i; + + for (i = 0; i < oprsz / 8; ++i) { + uint64_t this_b, this_d = d[i], this_g = g[i]; + + brk = compute_brk(&this_b, n[i], this_g, brk, after); + d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); + flags = iter_predtest_fwd(this_d, this_g, flags); + } + return flags; +} + +static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) +{ + /* It is quicker to zero the whole predicate than loop on OPRSZ. + * The compiler should turn this into 4 64-bit integer stores. 
+ */ + memset(d, 0, sizeof(ARMPredicateReg)); + return PREDTEST_INIT; +} + +void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + if (last_active_pred(vn, vg, oprsz)) { + compute_brk_z(vd, vm, vg, oprsz, true); + } else { + do_zero(vd, oprsz); + } +} + +uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + if (last_active_pred(vn, vg, oprsz)) { + return compute_brks_z(vd, vm, vg, oprsz, true); + } else { + return do_zero(vd, oprsz); + } +} + +void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + if (last_active_pred(vn, vg, oprsz)) { + compute_brk_z(vd, vm, vg, oprsz, false); + } else { + do_zero(vd, oprsz); + } +} + +uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + if (last_active_pred(vn, vg, oprsz)) { + return compute_brks_z(vd, vm, vg, oprsz, false); + } else { + return do_zero(vd, oprsz); + } +} + +void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + compute_brk_z(vd, vn, vg, oprsz, true); +} + +uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + return compute_brks_z(vd, vn, vg, oprsz, true); +} + +void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + compute_brk_z(vd, vn, vg, oprsz, false); +} + +uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + return compute_brks_z(vd, vn, vg, oprsz, false); +} + +void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + compute_brk_m(vd, vn, vg, oprsz, true); +} + +uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + return compute_brks_m(vd, vn, vg, oprsz, true); +} + +void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + compute_brk_m(vd, vn, vg, oprsz, false); +} + +uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + return compute_brks_m(vd, vn, vg, oprsz, false); +} + +void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + + if (!last_active_pred(vn, vg, oprsz)) { + do_zero(vd, oprsz); + } +} + +/* As if PredTest(Ones(PL), D, esz). 
*/ +static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, + uint64_t esz_mask) +{ + uint32_t flags = PREDTEST_INIT; + intptr_t i; + + for (i = 0; i < oprsz / 8; i++) { + flags = iter_predtest_fwd(d->p[i], esz_mask, flags); + } + if (oprsz & 7) { + uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); + flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); + } + return flags; +} + +uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + + if (last_active_pred(vn, vg, oprsz)) { + return predtest_ones(vd, oprsz, -1); + } else { + return do_zero(vd, oprsz); + } +} + +uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; + intptr_t i; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { + uint64_t t = n[i] & g[i] & mask; + sum += ctpop64(t); + } + return sum; +} + +uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc) +{ + uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2; + intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + uint64_t esz_mask = pred_esz_masks[esz]; + ARMPredicateReg *d = vd; + uint32_t flags; + intptr_t i; + + /* Begin with a zero predicate register. */ + flags = do_zero(d, oprsz); + if (count == 0) { + return flags; + } + + /* Scale from predicate element count to bits. */ + count <<= esz; + /* Bound to the bits in the predicate. */ + count = MIN(count, oprsz * 8); + + /* Set all of the requested bits. */ + for (i = 0; i < count / 64; ++i) { + d->p[i] = esz_mask; + } + if (count & 63) { + d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; + } + + return predtest_ones(d, oprsz, esz_mask); +} diff --git a/target/arm/translate-a64.h b/target/arm/translate-a64.h index dd9c09f89b..63d958cf50 100644 --- a/target/arm/translate-a64.h +++ b/target/arm/translate-a64.h @@ -67,18 +67,26 @@ static inline void assert_fp_access_checked(DisasContext *s) static inline int vec_reg_offset(DisasContext *s, int regno, int element, TCGMemOp size) { - int offs = 0; + int element_size = 1 << size; + int offs = element * element_size; #ifdef HOST_WORDS_BIGENDIAN /* This is complicated slightly because vfp.zregs[n].d[0] is - * still the low half and vfp.zregs[n].d[1] the high half - * of the 128 bit vector, even on big endian systems. - * Calculate the offset assuming a fully bigendian 128 bits, - * then XOR to account for the order of the two 64 bit halves. + * still the lowest and vfp.zregs[n].d[15] the highest of the + * 256 byte vector, even on big endian systems. + * + * Calculate the offset assuming fully little-endian, + * then XOR to account for the order of the 8-byte units. + * + * For 16 byte elements, the two 8 byte halves will not form a + * host int128 if the host is bigendian, since they're in the + * wrong order. However the only 16 byte operation we have is + * a move, so we can ignore this for the moment. More complicated + * operations will have to special case loading and storing from + * the zregs array. 
*/ - offs += (16 - ((element + 1) * (1 << size))); - offs ^= 8; -#else - offs += element * (1 << size); + if (element_size < 8) { + offs ^= 8 - element_size; + } #endif offs += offsetof(CPUARMState, vfp.zregs[regno]); assert_fp_access_checked(s); diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c index c48d4b530a..226c97579c 100644 --- a/target/arm/translate-sve.c +++ b/target/arm/translate-sve.c @@ -33,6 +33,15 @@ #include "trace-tcg.h" #include "translate-a64.h" + +typedef void GVecGen2sFn(unsigned, uint32_t, uint32_t, + TCGv_i64, uint32_t, uint32_t); + +typedef void gen_helper_gvec_flags_3(TCGv_i32, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); +typedef void gen_helper_gvec_flags_4(TCGv_i32, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_ptr, TCGv_i32); + /* * Helpers for extracting complex instruction fields. */ @@ -68,6 +77,11 @@ static inline int expand_imm_sh8s(int x) return (int8_t)x << (x & 0x100 ? 8 : 0); } +static inline int expand_imm_sh8u(int x) +{ + return (uint8_t)x << (x & 0x100 ? 8 : 0); +} + /* * Include the generated decoder. */ @@ -373,6 +387,8 @@ static bool trans_UDIV_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn) return do_zpzz_ool(s, a, fns[a->esz]); } +DO_ZPZZ(SEL, sel) + #undef DO_ZPZZ /* @@ -1957,6 +1973,1448 @@ static bool trans_EXT(DisasContext *s, arg_EXT *a, uint32_t insn) } /* + *** SVE Permute - Unpredicated Group + */ + +static bool trans_DUP_s(DisasContext *s, arg_DUP_s *a, uint32_t insn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_dup_i64(a->esz, vec_full_reg_offset(s, a->rd), + vsz, vsz, cpu_reg_sp(s, a->rn)); + } + return true; +} + +static bool trans_DUP_x(DisasContext *s, arg_DUP_x *a, uint32_t insn) +{ + if ((a->imm & 0x1f) == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + unsigned dofs = vec_full_reg_offset(s, a->rd); + unsigned esz, index; + + esz = ctz32(a->imm); + index = a->imm >> (esz + 1); + + if ((index << esz) < vsz) { + unsigned nofs = vec_reg_offset(s, a->rn, index, esz); + tcg_gen_gvec_dup_mem(esz, dofs, nofs, vsz, vsz); + } else { + tcg_gen_gvec_dup64i(dofs, vsz, vsz, 0); + } + } + return true; +} + +static void do_insr_i64(DisasContext *s, arg_rrr_esz *a, TCGv_i64 val) +{ + typedef void gen_insr(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32); + static gen_insr * const fns[4] = { + gen_helper_sve_insr_b, gen_helper_sve_insr_h, + gen_helper_sve_insr_s, gen_helper_sve_insr_d, + }; + unsigned vsz = vec_full_reg_size(s); + TCGv_i32 desc = tcg_const_i32(simd_desc(vsz, vsz, 0)); + TCGv_ptr t_zd = tcg_temp_new_ptr(); + TCGv_ptr t_zn = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn)); + + fns[a->esz](t_zd, t_zn, val, desc); + + tcg_temp_free_ptr(t_zd); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_i32(desc); +} + +static bool trans_INSR_f(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + TCGv_i64 t = tcg_temp_new_i64(); + tcg_gen_ld_i64(t, cpu_env, vec_reg_offset(s, a->rm, 0, MO_64)); + do_insr_i64(s, a, t); + tcg_temp_free_i64(t); + } + return true; +} + +static bool trans_INSR_r(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_insr_i64(s, a, cpu_reg(s, a->rm)); + } + return true; +} + +static bool trans_REV_v(DisasContext *s, arg_rr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_2 * const fns[4] = { + gen_helper_sve_rev_b, gen_helper_sve_rev_h, + gen_helper_sve_rev_s, 
gen_helper_sve_rev_d + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vsz, vsz, 0, fns[a->esz]); + } + return true; +} + +static bool trans_TBL(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_sve_tbl_b, gen_helper_sve_tbl_h, + gen_helper_sve_tbl_s, gen_helper_sve_tbl_d + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + vsz, vsz, 0, fns[a->esz]); + } + return true; +} + +static bool trans_UNPK(DisasContext *s, arg_UNPK *a, uint32_t insn) +{ + static gen_helper_gvec_2 * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_sve_sunpk_h, gen_helper_sve_uunpk_h }, + { gen_helper_sve_sunpk_s, gen_helper_sve_uunpk_s }, + { gen_helper_sve_sunpk_d, gen_helper_sve_uunpk_d }, + }; + + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn) + + (a->h ? vsz / 2 : 0), + vsz, vsz, 0, fns[a->esz][a->u]); + } + return true; +} + +/* + *** SVE Permute - Predicates Group + */ + +static bool do_perm_pred3(DisasContext *s, arg_rrr_esz *a, bool high_odd, + gen_helper_gvec_3 *fn) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + /* Predicate sizes may be smaller and cannot use simd_desc. + We cannot round up, as we do elsewhere, because we need + the exact size for ZIP2 and REV. We retain the style for + the other helpers for consistency. */ + TCGv_ptr t_d = tcg_temp_new_ptr(); + TCGv_ptr t_n = tcg_temp_new_ptr(); + TCGv_ptr t_m = tcg_temp_new_ptr(); + TCGv_i32 t_desc; + int desc; + + desc = vsz - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, a->esz); + desc = deposit32(desc, SIMD_DATA_SHIFT + 2, 2, high_odd); + + tcg_gen_addi_ptr(t_d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(t_m, cpu_env, pred_full_reg_offset(s, a->rm)); + t_desc = tcg_const_i32(desc); + + fn(t_d, t_n, t_m, t_desc); + + tcg_temp_free_ptr(t_d); + tcg_temp_free_ptr(t_n); + tcg_temp_free_ptr(t_m); + tcg_temp_free_i32(t_desc); + return true; +} + +static bool do_perm_pred2(DisasContext *s, arg_rr_esz *a, bool high_odd, + gen_helper_gvec_2 *fn) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + TCGv_ptr t_d = tcg_temp_new_ptr(); + TCGv_ptr t_n = tcg_temp_new_ptr(); + TCGv_i32 t_desc; + int desc; + + tcg_gen_addi_ptr(t_d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_n, cpu_env, pred_full_reg_offset(s, a->rn)); + + /* Predicate sizes may be smaller and cannot use simd_desc. + We cannot round up, as we do elsewhere, because we need + the exact size for ZIP2 and REV. We retain the style for + the other helpers for consistency. 
*/ + + desc = vsz - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, a->esz); + desc = deposit32(desc, SIMD_DATA_SHIFT + 2, 2, high_odd); + t_desc = tcg_const_i32(desc); + + fn(t_d, t_n, t_desc); + + tcg_temp_free_i32(t_desc); + tcg_temp_free_ptr(t_d); + tcg_temp_free_ptr(t_n); + return true; +} + +static bool trans_ZIP1_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 0, gen_helper_sve_zip_p); +} + +static bool trans_ZIP2_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 1, gen_helper_sve_zip_p); +} + +static bool trans_UZP1_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 0, gen_helper_sve_uzp_p); +} + +static bool trans_UZP2_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 1, gen_helper_sve_uzp_p); +} + +static bool trans_TRN1_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 0, gen_helper_sve_trn_p); +} + +static bool trans_TRN2_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 1, gen_helper_sve_trn_p); +} + +static bool trans_REV_p(DisasContext *s, arg_rr_esz *a, uint32_t insn) +{ + return do_perm_pred2(s, a, 0, gen_helper_sve_rev_p); +} + +static bool trans_PUNPKLO(DisasContext *s, arg_PUNPKLO *a, uint32_t insn) +{ + return do_perm_pred2(s, a, 0, gen_helper_sve_punpk_p); +} + +static bool trans_PUNPKHI(DisasContext *s, arg_PUNPKHI *a, uint32_t insn) +{ + return do_perm_pred2(s, a, 1, gen_helper_sve_punpk_p); +} + +/* + *** SVE Permute - Interleaving Group + */ + +static bool do_zip(DisasContext *s, arg_rrr_esz *a, bool high) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_sve_zip_b, gen_helper_sve_zip_h, + gen_helper_sve_zip_s, gen_helper_sve_zip_d, + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + unsigned high_ofs = high ? 
vsz / 2 : 0; + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn) + high_ofs, + vec_full_reg_offset(s, a->rm) + high_ofs, + vsz, vsz, 0, fns[a->esz]); + } + return true; +} + +static bool do_zzz_data_ool(DisasContext *s, arg_rrr_esz *a, int data, + gen_helper_gvec_3 *fn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + vsz, vsz, data, fn); + } + return true; +} + +static bool trans_ZIP1_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zip(s, a, false); +} + +static bool trans_ZIP2_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zip(s, a, true); +} + +static gen_helper_gvec_3 * const uzp_fns[4] = { + gen_helper_sve_uzp_b, gen_helper_sve_uzp_h, + gen_helper_sve_uzp_s, gen_helper_sve_uzp_d, +}; + +static bool trans_UZP1_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zzz_data_ool(s, a, 0, uzp_fns[a->esz]); +} + +static bool trans_UZP2_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zzz_data_ool(s, a, 1 << a->esz, uzp_fns[a->esz]); +} + +static gen_helper_gvec_3 * const trn_fns[4] = { + gen_helper_sve_trn_b, gen_helper_sve_trn_h, + gen_helper_sve_trn_s, gen_helper_sve_trn_d, +}; + +static bool trans_TRN1_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zzz_data_ool(s, a, 0, trn_fns[a->esz]); +} + +static bool trans_TRN2_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zzz_data_ool(s, a, 1 << a->esz, trn_fns[a->esz]); +} + +/* + *** SVE Permute Vector - Predicated Group + */ + +static bool trans_COMPACT(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + NULL, NULL, gen_helper_sve_compact_s, gen_helper_sve_compact_d + }; + return do_zpz_ool(s, a, fns[a->esz]); +} + +/* Call the helper that computes the ARM LastActiveElement pseudocode + * function, scaled by the element size. This includes the not found + * indication; e.g. not found for esz=3 is -8. + */ +static void find_last_active(DisasContext *s, TCGv_i32 ret, int esz, int pg) +{ + /* Predicate sizes may be smaller and cannot use simd_desc. We cannot + * round up, as we do elsewhere, because we need the exact size. + */ + TCGv_ptr t_p = tcg_temp_new_ptr(); + TCGv_i32 t_desc; + unsigned vsz = pred_full_reg_size(s); + unsigned desc; + + desc = vsz - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, esz); + + tcg_gen_addi_ptr(t_p, cpu_env, pred_full_reg_offset(s, pg)); + t_desc = tcg_const_i32(desc); + + gen_helper_sve_last_active_element(ret, t_p, t_desc); + + tcg_temp_free_i32(t_desc); + tcg_temp_free_ptr(t_p); +} + +/* Increment LAST to the offset of the next element in the vector, + * wrapping around to 0. + */ +static void incr_last_active(DisasContext *s, TCGv_i32 last, int esz) +{ + unsigned vsz = vec_full_reg_size(s); + + tcg_gen_addi_i32(last, last, 1 << esz); + if (is_power_of_2(vsz)) { + tcg_gen_andi_i32(last, last, vsz - 1); + } else { + TCGv_i32 max = tcg_const_i32(vsz); + TCGv_i32 zero = tcg_const_i32(0); + tcg_gen_movcond_i32(TCG_COND_GEU, last, last, max, zero, last); + tcg_temp_free_i32(max); + tcg_temp_free_i32(zero); + } +} + +/* If LAST < 0, set LAST to the offset of the last element in the vector. 
*/ +static void wrap_last_active(DisasContext *s, TCGv_i32 last, int esz) +{ + unsigned vsz = vec_full_reg_size(s); + + if (is_power_of_2(vsz)) { + tcg_gen_andi_i32(last, last, vsz - 1); + } else { + TCGv_i32 max = tcg_const_i32(vsz - (1 << esz)); + TCGv_i32 zero = tcg_const_i32(0); + tcg_gen_movcond_i32(TCG_COND_LT, last, last, zero, max, last); + tcg_temp_free_i32(max); + tcg_temp_free_i32(zero); + } +} + +/* Load an unsigned element of ESZ from BASE+OFS. */ +static TCGv_i64 load_esz(TCGv_ptr base, int ofs, int esz) +{ + TCGv_i64 r = tcg_temp_new_i64(); + + switch (esz) { + case 0: + tcg_gen_ld8u_i64(r, base, ofs); + break; + case 1: + tcg_gen_ld16u_i64(r, base, ofs); + break; + case 2: + tcg_gen_ld32u_i64(r, base, ofs); + break; + case 3: + tcg_gen_ld_i64(r, base, ofs); + break; + default: + g_assert_not_reached(); + } + return r; +} + +/* Load an unsigned element of ESZ from RM[LAST]. */ +static TCGv_i64 load_last_active(DisasContext *s, TCGv_i32 last, + int rm, int esz) +{ + TCGv_ptr p = tcg_temp_new_ptr(); + TCGv_i64 r; + + /* Convert offset into vector into offset into ENV. + * The final adjustment for the vector register base + * is added via constant offset to the load. + */ +#ifdef HOST_WORDS_BIGENDIAN + /* Adjust for element ordering. See vec_reg_offset. */ + if (esz < 3) { + tcg_gen_xori_i32(last, last, 8 - (1 << esz)); + } +#endif + tcg_gen_ext_i32_ptr(p, last); + tcg_gen_add_ptr(p, p, cpu_env); + + r = load_esz(p, vec_full_reg_offset(s, rm), esz); + tcg_temp_free_ptr(p); + + return r; +} + +/* Compute CLAST for a Zreg. */ +static bool do_clast_vector(DisasContext *s, arg_rprr_esz *a, bool before) +{ + TCGv_i32 last; + TCGLabel *over; + TCGv_i64 ele; + unsigned vsz, esz = a->esz; + + if (!sve_access_check(s)) { + return true; + } + + last = tcg_temp_local_new_i32(); + over = gen_new_label(); + + find_last_active(s, last, esz, a->pg); + + /* There is of course no movcond for a 2048-bit vector, + * so we must branch over the actual store. + */ + tcg_gen_brcondi_i32(TCG_COND_LT, last, 0, over); + + if (!before) { + incr_last_active(s, last, esz); + } + + ele = load_last_active(s, last, a->rm, esz); + tcg_temp_free_i32(last); + + vsz = vec_full_reg_size(s); + tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd), vsz, vsz, ele); + tcg_temp_free_i64(ele); + + /* If this insn used MOVPRFX, we may need a second move. */ + if (a->rd != a->rn) { + TCGLabel *done = gen_new_label(); + tcg_gen_br(done); + + gen_set_label(over); + do_mov_z(s, a->rd, a->rn); + + gen_set_label(done); + } else { + gen_set_label(over); + } + return true; +} + +static bool trans_CLASTA_z(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + return do_clast_vector(s, a, false); +} + +static bool trans_CLASTB_z(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + return do_clast_vector(s, a, true); +} + +/* Compute CLAST for a scalar. */ +static void do_clast_scalar(DisasContext *s, int esz, int pg, int rm, + bool before, TCGv_i64 reg_val) +{ + TCGv_i32 last = tcg_temp_new_i32(); + TCGv_i64 ele, cmp, zero; + + find_last_active(s, last, esz, pg); + + /* Extend the original value of last prior to incrementing. */ + cmp = tcg_temp_new_i64(); + tcg_gen_ext_i32_i64(cmp, last); + + if (!before) { + incr_last_active(s, last, esz); + } + + /* The conceit here is that while last < 0 indicates not found, after + * adjusting for cpu_env->vfp.zregs[rm], it is still a valid address + * from which we can load garbage. We then discard the garbage with + * a conditional move. 
+ */ + ele = load_last_active(s, last, rm, esz); + tcg_temp_free_i32(last); + + zero = tcg_const_i64(0); + tcg_gen_movcond_i64(TCG_COND_GE, reg_val, cmp, zero, ele, reg_val); + + tcg_temp_free_i64(zero); + tcg_temp_free_i64(cmp); + tcg_temp_free_i64(ele); +} + +/* Compute CLAST for a Vreg. */ +static bool do_clast_fp(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + int esz = a->esz; + int ofs = vec_reg_offset(s, a->rd, 0, esz); + TCGv_i64 reg = load_esz(cpu_env, ofs, esz); + + do_clast_scalar(s, esz, a->pg, a->rn, before, reg); + write_fp_dreg(s, a->rd, reg); + tcg_temp_free_i64(reg); + } + return true; +} + +static bool trans_CLASTA_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_clast_fp(s, a, false); +} + +static bool trans_CLASTB_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_clast_fp(s, a, true); +} + +/* Compute CLAST for a Xreg. */ +static bool do_clast_general(DisasContext *s, arg_rpr_esz *a, bool before) +{ + TCGv_i64 reg; + + if (!sve_access_check(s)) { + return true; + } + + reg = cpu_reg(s, a->rd); + switch (a->esz) { + case 0: + tcg_gen_ext8u_i64(reg, reg); + break; + case 1: + tcg_gen_ext16u_i64(reg, reg); + break; + case 2: + tcg_gen_ext32u_i64(reg, reg); + break; + case 3: + break; + default: + g_assert_not_reached(); + } + + do_clast_scalar(s, a->esz, a->pg, a->rn, before, reg); + return true; +} + +static bool trans_CLASTA_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_clast_general(s, a, false); +} + +static bool trans_CLASTB_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_clast_general(s, a, true); +} + +/* Compute LAST for a scalar. */ +static TCGv_i64 do_last_scalar(DisasContext *s, int esz, + int pg, int rm, bool before) +{ + TCGv_i32 last = tcg_temp_new_i32(); + TCGv_i64 ret; + + find_last_active(s, last, esz, pg); + if (before) { + wrap_last_active(s, last, esz); + } else { + incr_last_active(s, last, esz); + } + + ret = load_last_active(s, last, rm, esz); + tcg_temp_free_i32(last); + return ret; +} + +/* Compute LAST for a Vreg. */ +static bool do_last_fp(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + TCGv_i64 val = do_last_scalar(s, a->esz, a->pg, a->rn, before); + write_fp_dreg(s, a->rd, val); + tcg_temp_free_i64(val); + } + return true; +} + +static bool trans_LASTA_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_last_fp(s, a, false); +} + +static bool trans_LASTB_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_last_fp(s, a, true); +} + +/* Compute LAST for a Xreg. 
*/ +static bool do_last_general(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + TCGv_i64 val = do_last_scalar(s, a->esz, a->pg, a->rn, before); + tcg_gen_mov_i64(cpu_reg(s, a->rd), val); + tcg_temp_free_i64(val); + } + return true; +} + +static bool trans_LASTA_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_last_general(s, a, false); +} + +static bool trans_LASTB_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_last_general(s, a, true); +} + +static bool trans_CPY_m_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, cpu_reg_sp(s, a->rn)); + } + return true; +} + +static bool trans_CPY_m_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + int ofs = vec_reg_offset(s, a->rn, 0, a->esz); + TCGv_i64 t = load_esz(cpu_env, ofs, a->esz); + do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, t); + tcg_temp_free_i64(t); + } + return true; +} + +static bool trans_REVB(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + NULL, + gen_helper_sve_revb_h, + gen_helper_sve_revb_s, + gen_helper_sve_revb_d, + }; + return do_zpz_ool(s, a, fns[a->esz]); +} + +static bool trans_REVH(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + NULL, + NULL, + gen_helper_sve_revh_s, + gen_helper_sve_revh_d, + }; + return do_zpz_ool(s, a, fns[a->esz]); +} + +static bool trans_REVW(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ool(s, a, a->esz == 3 ? gen_helper_sve_revw_d : NULL); +} + +static bool trans_RBIT(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_sve_rbit_b, + gen_helper_sve_rbit_h, + gen_helper_sve_rbit_s, + gen_helper_sve_rbit_d, + }; + return do_zpz_ool(s, a, fns[a->esz]); +} + +static bool trans_SPLICE(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + pred_full_reg_offset(s, a->pg), + vsz, vsz, a->esz, gen_helper_sve_splice); + } + return true; +} + +/* + *** SVE Integer Compare - Vectors Group + */ + +static bool do_ppzz_flags(DisasContext *s, arg_rprr_esz *a, + gen_helper_gvec_flags_4 *gen_fn) +{ + TCGv_ptr pd, zn, zm, pg; + unsigned vsz; + TCGv_i32 t; + + if (gen_fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + t = tcg_const_i32(simd_desc(vsz, vsz, 0)); + pd = tcg_temp_new_ptr(); + zn = tcg_temp_new_ptr(); + zm = tcg_temp_new_ptr(); + pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(pd, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(zm, cpu_env, vec_full_reg_offset(s, a->rm)); + tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); + + gen_fn(t, pd, zn, zm, pg, t); + + tcg_temp_free_ptr(pd); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(zm); + tcg_temp_free_ptr(pg); + + do_pred_flags(t); + + tcg_temp_free_i32(t); + return true; +} + +#define DO_PPZZ(NAME, name) \ +static bool trans_##NAME##_ppzz(DisasContext *s, arg_rprr_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_flags_4 * const fns[4] = { \ + gen_helper_sve_##name##_ppzz_b, gen_helper_sve_##name##_ppzz_h, \ + gen_helper_sve_##name##_ppzz_s, gen_helper_sve_##name##_ppzz_d, \ + 
}; \ + return do_ppzz_flags(s, a, fns[a->esz]); \ +} + +DO_PPZZ(CMPEQ, cmpeq) +DO_PPZZ(CMPNE, cmpne) +DO_PPZZ(CMPGT, cmpgt) +DO_PPZZ(CMPGE, cmpge) +DO_PPZZ(CMPHI, cmphi) +DO_PPZZ(CMPHS, cmphs) + +#undef DO_PPZZ + +#define DO_PPZW(NAME, name) \ +static bool trans_##NAME##_ppzw(DisasContext *s, arg_rprr_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_flags_4 * const fns[4] = { \ + gen_helper_sve_##name##_ppzw_b, gen_helper_sve_##name##_ppzw_h, \ + gen_helper_sve_##name##_ppzw_s, NULL \ + }; \ + return do_ppzz_flags(s, a, fns[a->esz]); \ +} + +DO_PPZW(CMPEQ, cmpeq) +DO_PPZW(CMPNE, cmpne) +DO_PPZW(CMPGT, cmpgt) +DO_PPZW(CMPGE, cmpge) +DO_PPZW(CMPHI, cmphi) +DO_PPZW(CMPHS, cmphs) +DO_PPZW(CMPLT, cmplt) +DO_PPZW(CMPLE, cmple) +DO_PPZW(CMPLO, cmplo) +DO_PPZW(CMPLS, cmpls) + +#undef DO_PPZW + +/* + *** SVE Integer Compare - Immediate Groups + */ + +static bool do_ppzi_flags(DisasContext *s, arg_rpri_esz *a, + gen_helper_gvec_flags_3 *gen_fn) +{ + TCGv_ptr pd, zn, pg; + unsigned vsz; + TCGv_i32 t; + + if (gen_fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + t = tcg_const_i32(simd_desc(vsz, vsz, a->imm)); + pd = tcg_temp_new_ptr(); + zn = tcg_temp_new_ptr(); + pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(pd, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); + + gen_fn(t, pd, zn, pg, t); + + tcg_temp_free_ptr(pd); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(pg); + + do_pred_flags(t); + + tcg_temp_free_i32(t); + return true; +} + +#define DO_PPZI(NAME, name) \ +static bool trans_##NAME##_ppzi(DisasContext *s, arg_rpri_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_flags_3 * const fns[4] = { \ + gen_helper_sve_##name##_ppzi_b, gen_helper_sve_##name##_ppzi_h, \ + gen_helper_sve_##name##_ppzi_s, gen_helper_sve_##name##_ppzi_d, \ + }; \ + return do_ppzi_flags(s, a, fns[a->esz]); \ +} + +DO_PPZI(CMPEQ, cmpeq) +DO_PPZI(CMPNE, cmpne) +DO_PPZI(CMPGT, cmpgt) +DO_PPZI(CMPGE, cmpge) +DO_PPZI(CMPHI, cmphi) +DO_PPZI(CMPHS, cmphs) +DO_PPZI(CMPLT, cmplt) +DO_PPZI(CMPLE, cmple) +DO_PPZI(CMPLO, cmplo) +DO_PPZI(CMPLS, cmpls) + +#undef DO_PPZI + +/* + *** SVE Partition Break Group + */ + +static bool do_brk3(DisasContext *s, arg_rprr_s *a, + gen_helper_gvec_4 *fn, gen_helper_gvec_flags_4 *fn_s) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + /* Predicate sizes may be smaller and cannot use simd_desc. */ + TCGv_ptr d = tcg_temp_new_ptr(); + TCGv_ptr n = tcg_temp_new_ptr(); + TCGv_ptr m = tcg_temp_new_ptr(); + TCGv_ptr g = tcg_temp_new_ptr(); + TCGv_i32 t = tcg_const_i32(vsz - 2); + + tcg_gen_addi_ptr(d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(m, cpu_env, pred_full_reg_offset(s, a->rm)); + tcg_gen_addi_ptr(g, cpu_env, pred_full_reg_offset(s, a->pg)); + + if (a->s) { + fn_s(t, d, n, m, g, t); + do_pred_flags(t); + } else { + fn(d, n, m, g, t); + } + tcg_temp_free_ptr(d); + tcg_temp_free_ptr(n); + tcg_temp_free_ptr(m); + tcg_temp_free_ptr(g); + tcg_temp_free_i32(t); + return true; +} + +static bool do_brk2(DisasContext *s, arg_rpr_s *a, + gen_helper_gvec_3 *fn, gen_helper_gvec_flags_3 *fn_s) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + /* Predicate sizes may be smaller and cannot use simd_desc. 
*/ + TCGv_ptr d = tcg_temp_new_ptr(); + TCGv_ptr n = tcg_temp_new_ptr(); + TCGv_ptr g = tcg_temp_new_ptr(); + TCGv_i32 t = tcg_const_i32(vsz - 2); + + tcg_gen_addi_ptr(d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(g, cpu_env, pred_full_reg_offset(s, a->pg)); + + if (a->s) { + fn_s(t, d, n, g, t); + do_pred_flags(t); + } else { + fn(d, n, g, t); + } + tcg_temp_free_ptr(d); + tcg_temp_free_ptr(n); + tcg_temp_free_ptr(g); + tcg_temp_free_i32(t); + return true; +} + +static bool trans_BRKPA(DisasContext *s, arg_rprr_s *a, uint32_t insn) +{ + return do_brk3(s, a, gen_helper_sve_brkpa, gen_helper_sve_brkpas); +} + +static bool trans_BRKPB(DisasContext *s, arg_rprr_s *a, uint32_t insn) +{ + return do_brk3(s, a, gen_helper_sve_brkpb, gen_helper_sve_brkpbs); +} + +static bool trans_BRKA_m(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brka_m, gen_helper_sve_brkas_m); +} + +static bool trans_BRKB_m(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brkb_m, gen_helper_sve_brkbs_m); +} + +static bool trans_BRKA_z(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brka_z, gen_helper_sve_brkas_z); +} + +static bool trans_BRKB_z(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brkb_z, gen_helper_sve_brkbs_z); +} + +static bool trans_BRKN(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brkn, gen_helper_sve_brkns); +} + +/* + *** SVE Predicate Count Group + */ + +static void do_cntp(DisasContext *s, TCGv_i64 val, int esz, int pn, int pg) +{ + unsigned psz = pred_full_reg_size(s); + + if (psz <= 8) { + uint64_t psz_mask; + + tcg_gen_ld_i64(val, cpu_env, pred_full_reg_offset(s, pn)); + if (pn != pg) { + TCGv_i64 g = tcg_temp_new_i64(); + tcg_gen_ld_i64(g, cpu_env, pred_full_reg_offset(s, pg)); + tcg_gen_and_i64(val, val, g); + tcg_temp_free_i64(g); + } + + /* Reduce the pred_esz_masks value simply to reduce the + * size of the code generated here. 
+ */ + psz_mask = MAKE_64BIT_MASK(0, psz * 8); + tcg_gen_andi_i64(val, val, pred_esz_masks[esz] & psz_mask); + + tcg_gen_ctpop_i64(val, val); + } else { + TCGv_ptr t_pn = tcg_temp_new_ptr(); + TCGv_ptr t_pg = tcg_temp_new_ptr(); + unsigned desc; + TCGv_i32 t_desc; + + desc = psz - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, esz); + + tcg_gen_addi_ptr(t_pn, cpu_env, pred_full_reg_offset(s, pn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + t_desc = tcg_const_i32(desc); + + gen_helper_sve_cntp(val, t_pn, t_pg, t_desc); + tcg_temp_free_ptr(t_pn); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_i32(t_desc); + } +} + +static bool trans_CNTP(DisasContext *s, arg_CNTP *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_cntp(s, cpu_reg(s, a->rd), a->esz, a->rn, a->pg); + } + return true; +} + +static bool trans_INCDECP_r(DisasContext *s, arg_incdec_pred *a, + uint32_t insn) +{ + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + if (a->d) { + tcg_gen_sub_i64(reg, reg, val); + } else { + tcg_gen_add_i64(reg, reg, val); + } + tcg_temp_free_i64(val); + } + return true; +} + +static bool trans_INCDECP_z(DisasContext *s, arg_incdec2_pred *a, + uint32_t insn) +{ + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_i64 val = tcg_temp_new_i64(); + GVecGen2sFn *gvec_fn = a->d ? tcg_gen_gvec_subs : tcg_gen_gvec_adds; + + do_cntp(s, val, a->esz, a->pg, a->pg); + gvec_fn(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), val, vsz, vsz); + } + return true; +} + +static bool trans_SINCDECP_r_32(DisasContext *s, arg_incdec_pred *a, + uint32_t insn) +{ + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_32(reg, val, a->u, a->d); + } + return true; +} + +static bool trans_SINCDECP_r_64(DisasContext *s, arg_incdec_pred *a, + uint32_t insn) +{ + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_64(reg, val, a->u, a->d); + } + return true; +} + +static bool trans_SINCDECP_z(DisasContext *s, arg_incdec2_pred *a, + uint32_t insn) +{ + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 val = tcg_temp_new_i64(); + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_vec(s, a->esz, a->rd, a->rn, val, a->u, a->d); + } + return true; +} + +/* + *** SVE Integer Compare Scalars Group + */ + +static bool trans_CTERM(DisasContext *s, arg_CTERM *a, uint32_t insn) +{ + if (!sve_access_check(s)) { + return true; + } + + TCGCond cond = (a->ne ? TCG_COND_NE : TCG_COND_EQ); + TCGv_i64 rn = read_cpu_reg(s, a->rn, a->sf); + TCGv_i64 rm = read_cpu_reg(s, a->rm, a->sf); + TCGv_i64 cmp = tcg_temp_new_i64(); + + tcg_gen_setcond_i64(cond, cmp, rn, rm); + tcg_gen_extrl_i64_i32(cpu_NF, cmp); + tcg_temp_free_i64(cmp); + + /* VF = !NF & !CF. */ + tcg_gen_xori_i32(cpu_VF, cpu_NF, 1); + tcg_gen_andc_i32(cpu_VF, cpu_VF, cpu_CF); + + /* Both NF and VF actually look at bit 31. 
*/ + tcg_gen_neg_i32(cpu_NF, cpu_NF); + tcg_gen_neg_i32(cpu_VF, cpu_VF); + return true; +} + +static bool trans_WHILE(DisasContext *s, arg_WHILE *a, uint32_t insn) +{ + if (!sve_access_check(s)) { + return true; + } + + TCGv_i64 op0 = read_cpu_reg(s, a->rn, 1); + TCGv_i64 op1 = read_cpu_reg(s, a->rm, 1); + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i32 t2, t3; + TCGv_ptr ptr; + unsigned desc, vsz = vec_full_reg_size(s); + TCGCond cond; + + if (!a->sf) { + if (a->u) { + tcg_gen_ext32u_i64(op0, op0); + tcg_gen_ext32u_i64(op1, op1); + } else { + tcg_gen_ext32s_i64(op0, op0); + tcg_gen_ext32s_i64(op1, op1); + } + } + + /* For the helper, compress the different conditions into a computation + * of how many iterations for which the condition is true. + * + * This is slightly complicated by 0 <= UINT64_MAX, which is nominally + * 2**64 iterations, overflowing to 0. Of course, predicate registers + * aren't that large, so any value >= predicate size is sufficient. + */ + tcg_gen_sub_i64(t0, op1, op0); + + /* t0 = MIN(op1 - op0, vsz). */ + tcg_gen_movi_i64(t1, vsz); + tcg_gen_umin_i64(t0, t0, t1); + if (a->eq) { + /* Equality means one more iteration. */ + tcg_gen_addi_i64(t0, t0, 1); + } + + /* t0 = (condition true ? t0 : 0). */ + cond = (a->u + ? (a->eq ? TCG_COND_LEU : TCG_COND_LTU) + : (a->eq ? TCG_COND_LE : TCG_COND_LT)); + tcg_gen_movi_i64(t1, 0); + tcg_gen_movcond_i64(cond, t0, op0, op1, t0, t1); + + t2 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t2, t0); + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); + + desc = (vsz / 8) - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, a->esz); + t3 = tcg_const_i32(desc); + + ptr = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ptr, cpu_env, pred_full_reg_offset(s, a->rd)); + + gen_helper_sve_while(t2, ptr, t2, t3); + do_pred_flags(t2); + + tcg_temp_free_ptr(ptr); + tcg_temp_free_i32(t2); + tcg_temp_free_i32(t3); + return true; +} + +/* + *** SVE Integer Wide Immediate - Unpredicated Group + */ + +static bool trans_FDUP(DisasContext *s, arg_FDUP *a, uint32_t insn) +{ + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + int dofs = vec_full_reg_offset(s, a->rd); + uint64_t imm; + + /* Decode the VFP immediate. 
*/ + imm = vfp_expand_imm(a->esz, a->imm); + imm = dup_const(a->esz, imm); + + tcg_gen_gvec_dup64i(dofs, vsz, vsz, imm); + } + return true; +} + +static bool trans_DUP_i(DisasContext *s, arg_DUP_i *a, uint32_t insn) +{ + if (a->esz == 0 && extract32(insn, 13, 1)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + int dofs = vec_full_reg_offset(s, a->rd); + + tcg_gen_gvec_dup64i(dofs, vsz, vsz, dup_const(a->esz, a->imm)); + } + return true; +} + +static bool trans_ADD_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + if (a->esz == 0 && extract32(insn, 13, 1)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_addi(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), a->imm, vsz, vsz); + } + return true; +} + +static bool trans_SUB_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + a->imm = -a->imm; + return trans_ADD_zzi(s, a, insn); +} + +static bool trans_SUBR_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + static const GVecGen2s op[4] = { + { .fni8 = tcg_gen_vec_sub8_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_b, + .opc = INDEX_op_sub_vec, + .vece = MO_8, + .scalar_first = true }, + { .fni8 = tcg_gen_vec_sub16_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_h, + .opc = INDEX_op_sub_vec, + .vece = MO_16, + .scalar_first = true }, + { .fni4 = tcg_gen_sub_i32, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_s, + .opc = INDEX_op_sub_vec, + .vece = MO_32, + .scalar_first = true }, + { .fni8 = tcg_gen_sub_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_d, + .opc = INDEX_op_sub_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64, + .scalar_first = true } + }; + + if (a->esz == 0 && extract32(insn, 13, 1)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_i64 c = tcg_const_i64(a->imm); + tcg_gen_gvec_2s(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vsz, vsz, c, &op[a->esz]); + tcg_temp_free_i64(c); + } + return true; +} + +static bool trans_MUL_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_muli(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), a->imm, vsz, vsz); + } + return true; +} + +static bool do_zzi_sat(DisasContext *s, arg_rri_esz *a, uint32_t insn, + bool u, bool d) +{ + if (a->esz == 0 && extract32(insn, 13, 1)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 val = tcg_const_i64(a->imm); + do_sat_addsub_vec(s, a->esz, a->rd, a->rn, val, u, d); + tcg_temp_free_i64(val); + } + return true; +} + +static bool trans_SQADD_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + return do_zzi_sat(s, a, insn, false, false); +} + +static bool trans_UQADD_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + return do_zzi_sat(s, a, insn, true, false); +} + +static bool trans_SQSUB_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + return do_zzi_sat(s, a, insn, false, true); +} + +static bool trans_UQSUB_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + return do_zzi_sat(s, a, insn, true, true); +} + +static bool do_zzi_ool(DisasContext *s, arg_rri_esz *a, gen_helper_gvec_2i *fn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_i64 c = tcg_const_i64(a->imm); + + tcg_gen_gvec_2i_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + c, vsz, vsz, 0, 
fn); + tcg_temp_free_i64(c); + } + return true; +} + +#define DO_ZZI(NAME, name) \ +static bool trans_##NAME##_zzi(DisasContext *s, arg_rri_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_2i * const fns[4] = { \ + gen_helper_sve_##name##i_b, gen_helper_sve_##name##i_h, \ + gen_helper_sve_##name##i_s, gen_helper_sve_##name##i_d, \ + }; \ + return do_zzi_ool(s, a, fns[a->esz]); \ +} + +DO_ZZI(SMAX, smax) +DO_ZZI(UMAX, umax) +DO_ZZI(SMIN, smin) +DO_ZZI(UMIN, umin) + +#undef DO_ZZI + +/* + *** SVE Floating Point Arithmetic - Unpredicated Group + */ + +static bool do_zzz_fp(DisasContext *s, arg_rrr_esz *a, + gen_helper_gvec_3_ptr *fn) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); + } + return true; +} + + +#define DO_FP3(NAME, name) \ +static bool trans_##NAME(DisasContext *s, arg_rrr_esz *a, uint32_t insn) \ +{ \ + static gen_helper_gvec_3_ptr * const fns[4] = { \ + NULL, gen_helper_gvec_##name##_h, \ + gen_helper_gvec_##name##_s, gen_helper_gvec_##name##_d \ + }; \ + return do_zzz_fp(s, a, fns[a->esz]); \ +} + +DO_FP3(FADD_zzz, fadd) +DO_FP3(FSUB_zzz, fsub) +DO_FP3(FMUL_zzz, fmul) +DO_FP3(FTSMUL, ftsmul) +DO_FP3(FRECPS, recps) +DO_FP3(FRSQRTS, rsqrts) + +#undef DO_FP3 + +/* *** SVE Memory - 32-bit Gather and Unsized Contiguous Group */ diff --git a/target/arm/translate.c b/target/arm/translate.c index 0ff5edf2ce..f405c82fb2 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -9965,7 +9965,8 @@ static bool thumb_insn_is_16bit(DisasContext *s, uint32_t insn) * end up actually treating this as two 16-bit insns, though, * if it's half of a bl/blx pair that might span a page boundary. */ - if (arm_dc_feature(s, ARM_FEATURE_THUMB2)) { + if (arm_dc_feature(s, ARM_FEATURE_THUMB2) || + arm_dc_feature(s, ARM_FEATURE_M)) { /* Thumb2 cores (including all M profile ones) always treat * 32-bit insns as 32-bit. */ @@ -10085,10 +10086,38 @@ static void disas_thumb2_insn(DisasContext *s, uint32_t insn) int conds; int logic_cc; - /* The only 32 bit insn that's allowed for Thumb1 is the combined - * BL/BLX prefix and suffix. + /* + * ARMv6-M supports a limited subset of Thumb2 instructions. + * Other Thumb1 architectures allow only 32-bit + * combined BL/BLX prefix and suffix. */ - if ((insn & 0xf800e800) != 0xf000e800) { + if (arm_dc_feature(s, ARM_FEATURE_M) && + !arm_dc_feature(s, ARM_FEATURE_V7)) { + int i; + bool found = false; + const uint32_t armv6m_insn[] = {0xf3808000 /* msr */, + 0xf3b08040 /* dsb */, + 0xf3b08050 /* dmb */, + 0xf3b08060 /* isb */, + 0xf3e08000 /* mrs */, + 0xf000d000 /* bl */}; + const uint32_t armv6m_mask[] = {0xffe0d000, + 0xfff0d0f0, + 0xfff0d0f0, + 0xfff0d0f0, + 0xffe0d000, + 0xf800d000}; + + for (i = 0; i < ARRAY_SIZE(armv6m_insn); i++) { + if ((insn & armv6m_mask[i]) == armv6m_insn[i]) { + found = true; + break; + } + } + if (!found) { + goto illegal_op; + } + } else if ((insn & 0xf800e800) != 0xf000e800) { ARCH(6T2); } @@ -11009,7 +11038,11 @@ static void disas_thumb2_insn(DisasContext *s, uint32_t insn) } break; case 3: /* Special control operations. 
*/ - ARCH(7); + if (!arm_dc_feature(s, ARM_FEATURE_V7) && + !(arm_dc_feature(s, ARM_FEATURE_V6) && + arm_dc_feature(s, ARM_FEATURE_M))) { + goto illegal_op; + } op = (insn >> 4) & 0xf; switch (op) { case 2: /* clrex */ diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c index 25e209da31..f504dd53c8 100644 --- a/target/arm/vec_helper.c +++ b/target/arm/vec_helper.c @@ -426,3 +426,72 @@ void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, } clear_tail(d, opr_sz, simd_maxsz(desc)); } + +/* Floating-point trigonometric starting value. + * See the ARM ARM pseudocode function FPTrigSMul. + */ +static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) +{ + float16 result = float16_mul(op1, op1, stat); + if (!float16_is_any_nan(result)) { + result = float16_set_sign(result, op2 & 1); + } + return result; +} + +static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) +{ + float32 result = float32_mul(op1, op1, stat); + if (!float32_is_any_nan(result)) { + result = float32_set_sign(result, op2 & 1); + } + return result; +} + +static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) +{ + float64 result = float64_mul(op1, op1, stat); + if (!float64_is_any_nan(result)) { + result = float64_set_sign(result, op2 & 1); + } + return result; +} + +#define DO_3OP(NAME, FUNC, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(n[i], m[i], stat); \ + } \ +} + +DO_3OP(gvec_fadd_h, float16_add, float16) +DO_3OP(gvec_fadd_s, float32_add, float32) +DO_3OP(gvec_fadd_d, float64_add, float64) + +DO_3OP(gvec_fsub_h, float16_sub, float16) +DO_3OP(gvec_fsub_s, float32_sub, float32) +DO_3OP(gvec_fsub_d, float64_sub, float64) + +DO_3OP(gvec_fmul_h, float16_mul, float16) +DO_3OP(gvec_fmul_s, float32_mul, float32) +DO_3OP(gvec_fmul_d, float64_mul, float64) + +DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) +DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) +DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) + +#ifdef TARGET_AARCH64 + +DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) +DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) +DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) + +DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) +DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) +DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) + +#endif +#undef DO_3OP |
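As a rough illustration of the predicate-mask scheme used by the DO_CMP_PPZZ / DO_CMP_PPZW / DO_CMP_PPZI helpers in the sve_helper.c hunk above (one predicate bit per vector byte, with only the lowest bit of each element's byte group significant for wider element sizes), here is a minimal, self-contained C sketch. This is not QEMU code: the names pred_masks and apply_pred are invented for the example; only the mask constants and the "out &= pg" filtering step mirror what the patch does.

/* Standalone sketch, assuming the same per-element-size predicate masks
 * as the patch: esz 0 = bytes, 1 = halfwords, 2 = words, 3 = doublewords.
 */
#include <stdint.h>
#include <stdio.h>

static const uint64_t pred_masks[4] = {
    0xffffffffffffffffull,   /* every predicate bit significant (8-bit)  */
    0x5555555555555555ull,   /* every 2nd bit significant (16-bit elems) */
    0x1111111111111111ull,   /* every 4th bit significant (32-bit elems) */
    0x0101010101010101ull,   /* every 8th bit significant (64-bit elems) */
};

/* Keep only the comparison result bits that belong to active, significant
 * predicate positions -- the equivalent of "out &= pg & MASK" above.
 */
static uint64_t apply_pred(uint64_t cmp_bits, uint64_t pg, int esz)
{
    return cmp_bits & pg & pred_masks[esz];
}

int main(void)
{
    /* Pretend every 32-bit element compared equal (low bit of each 4-bit
     * group set), but the governing predicate only activates the first
     * two elements; the surviving bits are 0x11.
     */
    uint64_t cmp = 0x1111111111111111ull;
    uint64_t pg  = 0x0000000000000011ull;

    printf("result = 0x%016llx\n",
           (unsigned long long)apply_pred(cmp, pg, 2));
    return 0;
}

The design point this mirrors is that SVE predicates allocate one bit per vector byte regardless of element size, so the helpers can operate on 64 predicate bits (one uint64_t, i.e. 64 vector bytes) at a time and simply mask off the insignificant bit positions for wider elements.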