author    Peter Maydell <peter.maydell@linaro.org>  2021-06-24 15:00:33 +0100
committer Peter Maydell <peter.maydell@linaro.org>  2021-06-24 15:00:34 +0100
commit    ecba223da6215d6f6ce2d343b70b2e9a19bfb90b (patch)
tree      92f75f460810dd6aaa8620f9a62144fc34b9badd
parent    d0ac9a61474cf594d19082bc8976247e984ea9a3 (diff)
parent    90a76c6316cfe6416fc33814a838fb3928f746ee (diff)
Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20210624' into staging
target-arm queue:
 * Don't require 'virt' board to be compiled in for ACPI GHES code
 * docs: Document which architecture extensions we emulate
 * Fix bugs in M-profile FPCXT_NS accesses
 * First slice of MVE patches
 * Implement MTE3
 * docs/system: arm: Add nRF boards description

# gpg: Signature made Thu 24 Jun 2021 14:59:16 BST
# gpg:                using RSA key E1A5C593CD419DE28E8315CF3C2525ED14360CDE
# gpg:                issuer "peter.maydell@linaro.org"
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>" [ultimate]
# gpg:                 aka "Peter Maydell <pmaydell@gmail.com>" [ultimate]
# gpg:                 aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>" [ultimate]
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE

* remotes/pmaydell/tags/pull-target-arm-20210624: (57 commits)
  docs/system: arm: Add nRF boards description
  target/arm: Implement MTE3
  target/arm: Make VMOV scalar <-> gpreg beatwise for MVE
  target/arm: Implement MVE VADDV
  target/arm: Implement MVE VHCADD
  target/arm: Implement MVE VCADD
  target/arm: Implement MVE VADC, VSBC
  target/arm: Implement MVE VRHADD
  target/arm: Implement MVE VQDMULL (vector)
  target/arm: Implement MVE VQDMLSDH and VQRDMLSDH
  target/arm: Implement MVE VQDMLADH and VQRDMLADH
  target/arm: Implement MVE VRSHL
  target/arm: Implement MVE VSHL insn
  target/arm: Implement MVE VQRSHL
  target/arm: Implement MVE VQSHL (vector)
  target/arm: Implement MVE VQADD, VQSUB (vector)
  target/arm: Implement MVE VQDMULH, VQRDMULH (vector)
  target/arm: Implement MVE VQDMULL scalar
  target/arm: Implement MVE VQDMULH and VQRDMULH (scalar)
  target/arm: Implement MVE VQADD and VQSUB
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--  MAINTAINERS                     |    1
-rw-r--r--  docs/system/arm/emulation.rst   |  103
-rw-r--r--  docs/system/arm/nrf.rst         |   51
-rw-r--r--  docs/system/target-arm.rst      |    7
-rw-r--r--  hw/acpi/ghes-stub.c             |   22
-rw-r--r--  hw/acpi/ghes.c                  |   17
-rw-r--r--  hw/acpi/meson.build             |    6
-rw-r--r--  include/hw/acpi/ghes.h          |    9
-rw-r--r--  include/tcg/tcg-op.h            |    8
-rw-r--r--  include/tcg/tcg.h               |    1
-rw-r--r--  target/arm/cpu64.c              |    2
-rw-r--r--  target/arm/helper-mve.h         |  357
-rw-r--r--  target/arm/helper.h             |    2
-rw-r--r--  target/arm/internals.h          |   11
-rw-r--r--  target/arm/kvm64.c              |    6
-rw-r--r--  target/arm/m-nocp.decode        |   24
-rw-r--r--  target/arm/meson.build          |    1
-rw-r--r--  target/arm/mte_helper.c         |   82
-rw-r--r--  target/arm/mve.decode           |  240
-rw-r--r--  target/arm/mve_helper.c         | 1160
-rw-r--r--  target/arm/translate-a32.h      |    3
-rw-r--r--  target/arm/translate-m-nocp.c   |  550
-rw-r--r--  target/arm/translate-mve.c      |  759
-rw-r--r--  target/arm/translate-vfp.c      |  741
-rw-r--r--  target/arm/translate.h          |   10
-rw-r--r--  target/arm/vfp.decode           |   14
-rw-r--r--  tcg/tcg-op-gvec.c               |   20
27 files changed, 3578 insertions, 629 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 1a041eaf86..3443d2a5b5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1032,6 +1032,7 @@ F: hw/*/microbit*.c
F: include/hw/*/nrf51*.h
F: include/hw/*/microbit*.h
F: tests/qtest/microbit-test.c
+F: docs/system/arm/nrf.rst
AVR Machines
-------------
diff --git a/docs/system/arm/emulation.rst b/docs/system/arm/emulation.rst
new file mode 100644
index 0000000000..144dc491d9
--- /dev/null
+++ b/docs/system/arm/emulation.rst
@@ -0,0 +1,103 @@
+A-profile CPU architecture support
+==================================
+
+QEMU's TCG emulation includes support for the Armv5, Armv6, Armv7 and
+Armv8 versions of the A-profile architecture. It also has support for
+the following architecture extensions:
+
+- FEAT_AA32BF16 (AArch32 BFloat16 instructions)
+- FEAT_AA32HPD (AArch32 hierarchical permission disables)
+- FEAT_AA32I8MM (AArch32 Int8 matrix multiplication instructions)
+- FEAT_AES (AESD and AESE instructions)
+- FEAT_BF16 (AArch64 BFloat16 instructions)
+- FEAT_BTI (Branch Target Identification)
+- FEAT_DIT (Data Independent Timing instructions)
+- FEAT_DPB (DC CVAP instruction)
+- FEAT_DotProd (Advanced SIMD dot product instructions)
+- FEAT_FCMA (Floating-point complex number instructions)
+- FEAT_FHM (Floating-point half-precision multiplication instructions)
+- FEAT_FP16 (Half-precision floating-point data processing)
+- FEAT_FRINTTS (Floating-point to integer instructions)
+- FEAT_FlagM (Flag manipulation instructions v2)
+- FEAT_FlagM2 (Enhancements to flag manipulation instructions)
+- FEAT_HPDS (Hierarchical permission disables)
+- FEAT_I8MM (AArch64 Int8 matrix multiplication instructions)
+- FEAT_JSCVT (JavaScript conversion instructions)
+- FEAT_LOR (Limited ordering regions)
+- FEAT_LRCPC (Load-acquire RCpc instructions)
+- FEAT_LRCPC2 (Load-acquire RCpc instructions v2)
+- FEAT_LSE (Large System Extensions)
+- FEAT_MTE (Memory Tagging Extension)
+- FEAT_MTE2 (Memory Tagging Extension)
+- FEAT_MTE3 (MTE Asymmetric Fault Handling)
+- FEAT_PAN (Privileged access never)
+- FEAT_PAN2 (AT S1E1R and AT S1E1W instruction variants affected by PSTATE.PAN)
+- FEAT_PAuth (Pointer authentication)
+- FEAT_PMULL (PMULL, PMULL2 instructions)
+- FEAT_PMUv3p1 (PMU Extensions v3.1)
+- FEAT_PMUv3p4 (PMU Extensions v3.4)
+- FEAT_RDM (Advanced SIMD rounding double multiply accumulate instructions)
+- FEAT_RNG (Random number generator)
+- FEAT_SB (Speculation Barrier)
+- FEAT_SEL2 (Secure EL2)
+- FEAT_SHA1 (SHA1 instructions)
+- FEAT_SHA256 (SHA256 instructions)
+- FEAT_SHA3 (Advanced SIMD SHA3 instructions)
+- FEAT_SHA512 (Advanced SIMD SHA512 instructions)
+- FEAT_SM3 (Advanced SIMD SM3 instructions)
+- FEAT_SM4 (Advanced SIMD SM4 instructions)
+- FEAT_SPECRES (Speculation restriction instructions)
+- FEAT_SSBS (Speculative Store Bypass Safe)
+- FEAT_TLBIOS (TLB invalidate instructions in Outer Shareable domain)
+- FEAT_TLBIRANGE (TLB invalidate range instructions)
+- FEAT_TTCNP (Translation table Common not private translations)
+- FEAT_TTST (Small translation tables)
+- FEAT_UAO (Unprivileged Access Override control)
+- FEAT_VHE (Virtualization Host Extensions)
+- FEAT_VMID16 (16-bit VMID)
+- FEAT_XNX (Translation table stage 2 Unprivileged Execute-never)
+- SVE (The Scalable Vector Extension)
+- SVE2 (The Scalable Vector Extension v2)
+
+For information on the specifics of these extensions, please refer
+to the `Armv8-A Arm Architecture Reference Manual
+<https://developer.arm.com/documentation/ddi0487/latest>`_.
+
+When a specific named CPU is being emulated, only those features which
+are present in hardware for that CPU are emulated. (If a feature is
+not in the list above then it is not supported, even if the real
+hardware should have it.) The ``max`` CPU enables all features.
+
+R-profile CPU architecture support
+==================================
+
+QEMU's TCG emulation support for R-profile CPUs is currently limited.
+We emulate only the Cortex-R5 and Cortex-R5F CPUs.
+
+M-profile CPU architecture support
+==================================
+
+QEMU's TCG emulation includes support for Armv6-M, Armv7-M, Armv8-M, and
+Armv8.1-M versions of the M-profile architecture. It also has support
+for the following architecture extensions:
+
+- FP (Floating-point Extension)
+- FPCXT (FPCXT access instructions)
+- HP (Half-precision floating-point instructions)
+- LOB (Low Overhead loops and Branch future)
+- M (Main Extension)
+- MPU (Memory Protection Unit Extension)
+- PXN (Privileged Execute Never)
+- RAS (Reliability, Availability and Serviceability): "minimum RAS Extension" only
+- S (Security Extension)
+- ST (System Timer Extension)
+
+For information on the specifics of these extensions, please refer
+to the `Armv8-M Arm Architecture Reference Manual
+<https://developer.arm.com/documentation/ddi0553/latest>`_.
+
+When a specific named CPU is being emulated, only those features which
+are present in hardware for that CPU are emulated. (If a feature is
+not in the list above then it is not supported, even if the real
+hardware should have it.) There is no equivalent of the ``max`` CPU for
+M-profile.
diff --git a/docs/system/arm/nrf.rst b/docs/system/arm/nrf.rst
new file mode 100644
index 0000000000..eda87bd760
--- /dev/null
+++ b/docs/system/arm/nrf.rst
@@ -0,0 +1,51 @@
+Nordic nRF boards (``microbit``)
+================================
+
+The `Nordic nRF`_ chips are a family of ARM-based Systems-on-Chip that
+are designed for low-power, short-range wireless solutions.
+
+.. _Nordic nRF: https://www.nordicsemi.com/Products
+
+The nRF51 series is the first series for short-range wireless applications.
+It is superseded by the nRF52 series.
+The following machines are based on this chip:
+
+- ``microbit`` BBC micro:bit board with nRF51822 SoC
+
+There are other series such as nRF52, nRF53 and nRF91 which are currently not
+supported by QEMU.
+
+Supported devices
+-----------------
+
+ * ARM Cortex-M0 (ARMv6-M)
+ * Serial ports (UART)
+ * Clock controller
+ * Timers
+ * Random Number Generator (RNG)
+ * GPIO controller
+ * NVMC
+ * SWI
+
+Missing devices
+---------------
+
+ * Watchdog
+ * Real-Time Clock (RTC) controller
+ * TWI (i2c)
+ * SPI controller
+ * Analog to Digital Converter (ADC)
+ * Quadrature decoder
+ * Radio
+
+Boot options
+------------
+
+The Micro:bit machine can be started using the ``-device`` option to load a
+firmware in `ihex format`_. Example:
+
+.. _ihex format: https://en.wikipedia.org/wiki/Intel_HEX
+
+.. code-block:: bash
+
+ $ qemu-system-arm -M microbit -device loader,file=test.hex
diff --git a/docs/system/target-arm.rst b/docs/system/target-arm.rst
index edd013c7bb..13b3eeaf07 100644
--- a/docs/system/target-arm.rst
+++ b/docs/system/target-arm.rst
@@ -87,6 +87,7 @@ undocumented; you can get a complete list by running
arm/digic
arm/musicpal
arm/gumstix
+ arm/nrf
arm/nseries
arm/nuvoton
arm/orangepi
@@ -99,6 +100,12 @@ undocumented; you can get a complete list by running
arm/virt
arm/xlnx-versal-virt
+Emulated CPU architecture support
+=================================
+
+.. toctree::
+ arm/emulation
+
Arm CPU features
================
diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c
new file mode 100644
index 0000000000..c315de1802
--- /dev/null
+++ b/hw/acpi/ghes-stub.c
@@ -0,0 +1,22 @@
+/*
+ * Support for generating APEI tables and recording CPER for Guests:
+ * stub functions.
+ *
+ * Copyright (c) 2021 Linaro, Ltd
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/acpi/ghes.h"
+
+int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
+{
+ return -1;
+}
+
+bool acpi_ghes_present(void)
+{
+ return false;
+}
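The stub above lets code outside hw/acpi probe for GHES at runtime instead of compiling in a board dependency; the kvm64.c hunk further down uses exactly this guard. A minimal sketch of the intended call pattern, assuming the ghes.h declarations in this series (report_memory_error() is a hypothetical wrapper, not part of the patch; ACPI_HEST_SRC_ID_SEA is the source id declared in ghes.h):

    static void report_memory_error(uint64_t paddr)
    {
        if (!acpi_ghes_present()) {
            /* stub build, or no GHES on this board: nothing to record */
            return;
        }
        if (acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
            error_report("failed to record CPER for 0x%" PRIx64, paddr);
        }
    }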
diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index a4dac6bf15..a749b84d62 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -386,6 +386,8 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
/* Create a read-write fw_cfg file for Address */
fw_cfg_add_file_callback(s, ACPI_GHES_DATA_ADDR_FW_CFG_FILE, NULL, NULL,
NULL, &(ags->ghes_addr_le), sizeof(ags->ghes_addr_le), false);
+
+ ags->present = true;
}
int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
@@ -443,3 +445,18 @@ int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
return ret;
}
+
+bool acpi_ghes_present(void)
+{
+ AcpiGedState *acpi_ged_state;
+ AcpiGhesState *ags;
+
+ acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
+ NULL));
+
+ if (!acpi_ged_state) {
+ return false;
+ }
+ ags = &acpi_ged_state->ghes_state;
+ return ags->present;
+}
diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build
index dd69577212..9b7fa75719 100644
--- a/hw/acpi/meson.build
+++ b/hw/acpi/meson.build
@@ -13,13 +13,13 @@ acpi_ss.add(when: 'CONFIG_ACPI_PCI', if_true: files('pci.c'))
acpi_ss.add(when: 'CONFIG_ACPI_VMGENID', if_true: files('vmgenid.c'))
acpi_ss.add(when: 'CONFIG_ACPI_HW_REDUCED', if_true: files('generic_event_device.c'))
acpi_ss.add(when: 'CONFIG_ACPI_HMAT', if_true: files('hmat.c'))
-acpi_ss.add(when: 'CONFIG_ACPI_APEI', if_true: files('ghes.c'))
+acpi_ss.add(when: 'CONFIG_ACPI_APEI', if_true: files('ghes.c'), if_false: files('ghes-stub.c'))
acpi_ss.add(when: 'CONFIG_ACPI_X86', if_true: files('core.c', 'piix4.c', 'pcihp.c'), if_false: files('acpi-stub.c'))
acpi_ss.add(when: 'CONFIG_ACPI_X86_ICH', if_true: files('ich9.c', 'tco.c'))
acpi_ss.add(when: 'CONFIG_IPMI', if_true: files('ipmi.c'), if_false: files('ipmi-stub.c'))
acpi_ss.add(when: 'CONFIG_PC', if_false: files('acpi-x86-stub.c'))
acpi_ss.add(when: 'CONFIG_TPM', if_true: files('tpm.c'))
-softmmu_ss.add(when: 'CONFIG_ACPI', if_false: files('acpi-stub.c', 'aml-build-stub.c'))
+softmmu_ss.add(when: 'CONFIG_ACPI', if_false: files('acpi-stub.c', 'aml-build-stub.c', 'ghes-stub.c'))
softmmu_ss.add_all(when: 'CONFIG_ACPI', if_true: acpi_ss)
softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('acpi-stub.c', 'aml-build-stub.c',
- 'acpi-x86-stub.c', 'ipmi-stub.c'))
+ 'acpi-x86-stub.c', 'ipmi-stub.c', 'ghes-stub.c'))
diff --git a/include/hw/acpi/ghes.h b/include/hw/acpi/ghes.h
index 2ae8bc1ded..674f6958e9 100644
--- a/include/hw/acpi/ghes.h
+++ b/include/hw/acpi/ghes.h
@@ -64,6 +64,7 @@ enum {
typedef struct AcpiGhesState {
uint64_t ghes_addr_le;
+ bool present; /* True if GHES is present at all on this board */
} AcpiGhesState;
void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker);
@@ -72,4 +73,12 @@ void acpi_build_hest(GArray *table_data, BIOSLinker *linker,
void acpi_ghes_add_fw_cfg(AcpiGhesState *vms, FWCfgState *s,
GArray *hardware_errors);
int acpi_ghes_record_errors(uint8_t notify, uint64_t error_physical_addr);
+
+/**
+ * acpi_ghes_present: Report whether ACPI GHES table is present
+ *
+ * Returns: true if the system has an ACPI GHES table and it is
+ * safe to call acpi_ghes_record_errors() to record a memory error.
+ */
+bool acpi_ghes_present(void);
#endif
diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index ef8a008ea7..1a2ae93758 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -338,6 +338,9 @@ void tcg_gen_umin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
void tcg_gen_umax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
void tcg_gen_abs_i32(TCGv_i32, TCGv_i32);
+/* Replicate a value of size @vece from @in to all the lanes in @out */
+void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in);
+
static inline void tcg_gen_discard_i32(TCGv_i32 arg)
{
tcg_gen_op1_i32(INDEX_op_discard, arg);
@@ -534,6 +537,9 @@ void tcg_gen_umin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
void tcg_gen_umax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
void tcg_gen_abs_i64(TCGv_i64, TCGv_i64);
+/* Replicate a value of size @vece from @in to all the lanes in @out */
+void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in);
+
#if TCG_TARGET_REG_BITS == 64
static inline void tcg_gen_discard_i64(TCGv_i64 arg)
{
@@ -1127,6 +1133,7 @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
#define tcg_gen_atomic_smax_fetch_tl tcg_gen_atomic_smax_fetch_i64
#define tcg_gen_atomic_umax_fetch_tl tcg_gen_atomic_umax_fetch_i64
#define tcg_gen_dup_tl_vec tcg_gen_dup_i64_vec
+#define tcg_gen_dup_tl tcg_gen_dup_i64
#else
#define tcg_gen_movi_tl tcg_gen_movi_i32
#define tcg_gen_mov_tl tcg_gen_mov_i32
@@ -1241,6 +1248,7 @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
#define tcg_gen_atomic_smax_fetch_tl tcg_gen_atomic_smax_fetch_i32
#define tcg_gen_atomic_umax_fetch_tl tcg_gen_atomic_umax_fetch_i32
#define tcg_gen_dup_tl_vec tcg_gen_dup_i32_vec
+#define tcg_gen_dup_tl tcg_gen_dup_i32
#endif
#if UINTPTR_MAX == UINT32_MAX
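The new tcg_gen_dup_i32()/tcg_gen_dup_i64() ops replicate a @vece-sized value across all lanes of a scalar TCG value, mirroring the existing constant-folding helper dup_const() referenced in the tcg.h hunk below. A host-side sketch of the replication semantics only (not of the generated TCG ops); vece is log2 of the element size in bytes:

    static uint64_t dup_value(unsigned vece, uint64_t in)
    {
        switch (vece) {
        case 0: return (in & 0xff)       * 0x0101010101010101ull; /* 8-bit lanes  */
        case 1: return (in & 0xffff)     * 0x0001000100010001ull; /* 16-bit lanes */
        case 2: return (in & 0xffffffff) * 0x0000000100000001ull; /* 32-bit lanes */
        case 3: return in;                                        /* 64-bit lane  */
        default: g_assert_not_reached();
        }
    }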
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index 41a6c4bfe5..2dad364240 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -1264,7 +1264,6 @@ uint64_t dup_const(unsigned vece, uint64_t c);
: (qemu_build_not_reached_always(), 0)) \
: dup_const(VECE, C))
-
/*
* Memory helpers that will be used by TCG generated code.
*/
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index 1c23187d1a..c7a1626bec 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -683,7 +683,7 @@ static void aarch64_max_initfn(Object *obj)
* during realize if the board provides no tag memory, much like
* we do for EL2 with the virtualization=on property.
*/
- t = FIELD_DP64(t, ID_AA64PFR1, MTE, 2);
+ t = FIELD_DP64(t, ID_AA64PFR1, MTE, 3);
cpu->isar.id_aa64pfr1 = t;
t = cpu->isar.id_aa64mmfr0;
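Bumping ID_AA64PFR1.MTE from 2 to 3 is what advertises FEAT_MTE3 on the ``max`` CPU; the behavioural change lives in mte_helper.c below. FIELD_DP64() is QEMU's named-bitfield deposit. A sketch of the operation it performs, assuming the architected field position (ID_AA64PFR1_EL1.MTE is bits [11:8]):

    static uint64_t deposit64_sketch(uint64_t reg, int shift, int len,
                                     uint64_t val)
    {
        uint64_t mask = ((1ull << len) - 1) << shift;
        return (reg & ~mask) | ((val << shift) & mask);
    }

    /* FIELD_DP64(t, ID_AA64PFR1, MTE, 3) is roughly: */
    t = deposit64_sketch(t, 8, 4, 3);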
diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
new file mode 100644
index 0000000000..4bbb9b3ae2
--- /dev/null
+++ b/target/arm/helper-mve.h
@@ -0,0 +1,357 @@
+/*
+ * M-profile MVE specific helper definitions
+ *
+ * Copyright (c) 2021 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+DEF_HELPER_FLAGS_3(mve_vldrb, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vldrh, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vldrw, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vstrb, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vstrh, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vstrw, TCG_CALL_NO_WG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(mve_vldrb_sh, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vldrb_sw, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vldrb_uh, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vldrb_uw, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vldrh_sw, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vldrh_uw, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vstrb_h, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vstrb_w, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vstrh_w, TCG_CALL_NO_WG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(mve_vdup, TCG_CALL_NO_WG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(mve_vclsb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vclsh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vclsw, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vclzb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vclzh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vclzw, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vrev16b, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vrev32b, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vrev32h, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vrev64b, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vrev64h, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vrev64w, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vmvn, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vabsb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vabsh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vabsw, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vfabsh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vfabss, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vnegb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vnegh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vnegw, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vfnegh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vfnegs, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vand, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vbic, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vorr, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vorn, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_veor, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vaddb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vaddh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vaddw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vsubb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vsubh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vsubw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vmulb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmulh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmulw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vmulhsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmulhsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmulhsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmulhub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmulhuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmulhuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vrmulhsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrmulhsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrmulhsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrmulhub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrmulhuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrmulhuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vmaxsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmaxsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmaxsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmaxub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmaxuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmaxuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vminsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vminsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vminsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vminub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vminuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vminuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vabdsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vabdsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vabdsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vabdub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vabduh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vabduw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vhaddsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vhaddsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vhaddsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vhaddub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vhadduh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vhadduw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vhsubsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vhsubsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vhsubsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vhsubub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vhsubuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vhsubuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vmullbsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmullbsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmullbsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmullbub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmullbuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmullbuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vmulltsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmulltsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmulltsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmulltub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmulltuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmulltuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqdmulhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqdmulhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqdmulhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqrdmulhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqrdmulhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqrdmulhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqaddsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqaddsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqaddsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqaddub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqadduh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqadduw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqsubsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqsubsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqsubsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqsubub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqsubuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqsubuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vshlsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vshlsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vshlsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vrshlsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrshlsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrshlsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vrshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqshlsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqshlsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqshlsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqrshlsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqrshlsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqrshlsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqrshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqrshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqrshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqdmladhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqdmladhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqdmladhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqdmladhxb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqdmladhxh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqdmladhxw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqrdmladhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqrdmladhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqrdmladhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqrdmladhxb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqrdmladhxh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqrdmladhxw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqdmlsdhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqdmlsdhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqdmlsdhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqdmlsdhxb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqdmlsdhxh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqdmlsdhxw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqrdmlsdhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqrdmlsdhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqrdmlsdhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqrdmlsdhxb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqrdmlsdhxh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqrdmlsdhxw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vqdmullbh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqdmullbw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqdmullth, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vqdmulltw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vrhaddsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrhaddsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrhaddsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vrhaddub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrhadduh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrhadduw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vadc, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vadci, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vsbc, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vsbci, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vcadd90b, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcadd90h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcadd90w, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vcadd270b, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcadd270h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcadd270w, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vhcadd90b, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vhcadd90h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vhcadd90w, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vhcadd270b, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vhcadd270h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vhcadd270w, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vsub_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vsub_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vsub_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vmul_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmul_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmul_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vhadds_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vhadds_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vhadds_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vhaddu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vhaddu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vhaddu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vhsubs_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vhsubs_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vhsubs_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vhsubu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vhsubu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vhsubu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqadds_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqadds_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqadds_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqaddu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqaddu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqaddu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqsubs_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqsubs_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqsubs_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqsubu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqsubu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqsubu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqdmulh_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqdmulh_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqdmulh_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqrdmulh_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrdmulh_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrdmulh_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vbrsrb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vbrsrh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vbrsrw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqdmullb_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqdmullb_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqdmullt_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqdmullt_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vmlaldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+DEF_HELPER_FLAGS_4(mve_vmlaldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+DEF_HELPER_FLAGS_4(mve_vmlaldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+DEF_HELPER_FLAGS_4(mve_vmlaldavxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+
+DEF_HELPER_FLAGS_4(mve_vmlaldavuh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+DEF_HELPER_FLAGS_4(mve_vmlaldavuw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+
+DEF_HELPER_FLAGS_4(mve_vmlsldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+DEF_HELPER_FLAGS_4(mve_vmlsldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+DEF_HELPER_FLAGS_4(mve_vmlsldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+DEF_HELPER_FLAGS_4(mve_vmlsldavxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+
+DEF_HELPER_FLAGS_4(mve_vrmlaldavhsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+DEF_HELPER_FLAGS_4(mve_vrmlaldavhxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+
+DEF_HELPER_FLAGS_4(mve_vrmlaldavhuw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+
+DEF_HELPER_FLAGS_4(mve_vrmlsldavhsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+DEF_HELPER_FLAGS_4(mve_vrmlsldavhxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
+
+DEF_HELPER_FLAGS_3(mve_vaddvsb, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vaddvub, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vaddvsh, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vaddvuh, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vaddvsw, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vaddvuw, TCG_CALL_NO_WG, i32, env, ptr, i32)
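Each DEF_HELPER_FLAGS_n line is expanded by exec/helper-proto.h into an ordinary C prototype (env maps to CPUARMState *, ptr to void *, i32/i64 to uint32_t/uint64_t), and TCG_CALL_NO_WG tells TCG the helper does not write TCG globals, though it may still raise an exception. Roughly, for two representative entries:

    void helper_mve_vldrb(CPUARMState *env, void *vd, uint32_t addr);
    uint64_t helper_mve_vmlaldavsh(CPUARMState *env, void *vn, void *vm,
                                   uint64_t acc);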
diff --git a/target/arm/helper.h b/target/arm/helper.h
index dc6eb96d43..db87d7d537 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -1019,3 +1019,5 @@ DEF_HELPER_FLAGS_6(gvec_bfmlal_idx, TCG_CALL_NO_RWG,
#include "helper-a64.h"
#include "helper-sve.h"
#endif
+
+#include "helper-mve.h"
diff --git a/target/arm/internals.h b/target/arm/internals.h
index 886db56b58..3ba86e8af8 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -1202,4 +1202,15 @@ static inline uint64_t useronly_maybe_clean_ptr(uint32_t desc, uint64_t ptr)
return ptr;
}
+/* Values for M-profile PSR.ECI for MVE insns */
+enum MVEECIState {
+ ECI_NONE = 0, /* No completed beats */
+ ECI_A0 = 1, /* Completed: A0 */
+ ECI_A0A1 = 2, /* Completed: A0, A1 */
+ /* 3 is reserved */
+ ECI_A0A1A2 = 4, /* Completed: A0, A1, A2 */
+ ECI_A0A1A2B0 = 5, /* Completed: A0, A1, A2, B0 */
+ /* All other values reserved */
+};
+
#endif
diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index 37ceadd9a9..59982d470d 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -1410,14 +1410,10 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
{
ram_addr_t ram_addr;
hwaddr paddr;
- Object *obj = qdev_get_machine();
- VirtMachineState *vms = VIRT_MACHINE(obj);
- bool acpi_enabled = virt_is_acpi_enabled(vms);
assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
- if (acpi_enabled && addr &&
- object_property_get_bool(obj, "ras", NULL)) {
+ if (acpi_ghes_present() && addr) {
ram_addr = qemu_ram_addr_from_host(addr);
if (ram_addr != RAM_ADDR_INVALID &&
kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
diff --git a/target/arm/m-nocp.decode b/target/arm/m-nocp.decode
index 6699626d7c..b65c801c97 100644
--- a/target/arm/m-nocp.decode
+++ b/target/arm/m-nocp.decode
@@ -34,6 +34,14 @@
&nocp cp
+# M-profile VLDR/VSTR to sysreg
+%vldr_sysreg 22:1 13:3
+%imm7_0x4 0:7 !function=times_4
+
+&vldr_sysreg rn reg imm a w p
+@vldr_sysreg .... ... . a:1 . . . rn:4 ... . ... .. ....... \
+ reg=%vldr_sysreg imm=%imm7_0x4 &vldr_sysreg
+
{
# Special cases which do not take an early NOCP: VLLDM and VLSTM
VLLDM_VLSTM 1110 1100 001 l:1 rn:4 0000 1010 op:1 000 0000
@@ -41,6 +49,22 @@
VSCCLRM 1110 1100 1.01 1111 .... 1011 imm:7 0 vd=%vd_dp size=3
VSCCLRM 1110 1100 1.01 1111 .... 1010 imm:8 vd=%vd_sp size=2
+ # FP system register accesses: these are a special case because accesses
+ # to FPCXT_NS succeed even if the FPU is disabled. We therefore need
+ # to handle them before the big NOCP blocks. Note that within these
+ # insns NOCP still has higher priority than UNDEFs; this is implemented
+ # by their returning 'false' for UNDEF so as to fall through into the
+ # NOCP check (in contrast to VLLDM etc, which call unallocated_encoding()
+ # for the UNDEFs there that must take precedence over NOCP.)
+
+ VMSR_VMRS ---- 1110 111 l:1 reg:4 rt:4 1010 0001 0000
+
+ # P=0 W=0 is SEE "Related encodings", so split into two patterns
+ VLDR_sysreg ---- 110 1 . . w:1 1 .... ... 0 111 11 ....... @vldr_sysreg p=1
+ VLDR_sysreg ---- 110 0 . . 1 1 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1
+ VSTR_sysreg ---- 110 1 . . w:1 0 .... ... 0 111 11 ....... @vldr_sysreg p=1
+ VSTR_sysreg ---- 110 0 . . 1 0 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1
+
NOCP 111- 1110 ---- ---- ---- cp:4 ---- ---- &nocp
NOCP 111- 110- ---- ---- ---- cp:4 ---- ---- &nocp
# From v8.1M onwards this range will also NOCP:
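The %imm7_0x4 field above uses a decodetree !function transform: the raw 7-bit word offset is scaled to a byte offset before it reaches the trans_ function. A sketch of the shape such a transform takes (assuming it lives in the translator alongside the other field functions):

    static int times_4(DisasContext *s, int x)
    {
        return x * 4; /* imm7 counts 32-bit words; callers want bytes */
    }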
diff --git a/target/arm/meson.build b/target/arm/meson.build
index 2b50be3f86..25a02bf276 100644
--- a/target/arm/meson.build
+++ b/target/arm/meson.build
@@ -23,6 +23,7 @@ arm_ss.add(files(
'helper.c',
'iwmmxt_helper.c',
'm_helper.c',
+ 'mve_helper.c',
'neon_helper.c',
'op_helper.c',
'tlb_helper.c',
diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c
index 9e615cc513..724175210b 100644
--- a/target/arm/mte_helper.c
+++ b/target/arm/mte_helper.c
@@ -538,13 +538,50 @@ void HELPER(stzgm_tags)(CPUARMState *env, uint64_t ptr, uint64_t val)
}
}
+static void mte_sync_check_fail(CPUARMState *env, uint32_t desc,
+ uint64_t dirty_ptr, uintptr_t ra)
+{
+ int is_write, syn;
+
+ env->exception.vaddress = dirty_ptr;
+
+ is_write = FIELD_EX32(desc, MTEDESC, WRITE);
+ syn = syn_data_abort_no_iss(arm_current_el(env) != 0, 0, 0, 0, 0, is_write,
+ 0x11);
+ raise_exception_ra(env, EXCP_DATA_ABORT, syn, exception_target_el(env), ra);
+ g_assert_not_reached();
+}
+
+static void mte_async_check_fail(CPUARMState *env, uint64_t dirty_ptr,
+ uintptr_t ra, ARMMMUIdx arm_mmu_idx, int el)
+{
+ int select;
+
+ if (regime_has_2_ranges(arm_mmu_idx)) {
+ select = extract64(dirty_ptr, 55, 1);
+ } else {
+ select = 0;
+ }
+ env->cp15.tfsr_el[el] |= 1 << select;
+#ifdef CONFIG_USER_ONLY
+ /*
+ * Stand in for a timer irq, setting _TIF_MTE_ASYNC_FAULT,
+ * which then sends a SIGSEGV when the thread is next scheduled.
+ * This cpu will return to the main loop at the end of the TB,
+ * which is rather sooner than "normal". But the alternative
+ * is waiting until the next syscall.
+ */
+ qemu_cpu_kick(env_cpu(env));
+#endif
+}
+
/* Record a tag check failure. */
static void mte_check_fail(CPUARMState *env, uint32_t desc,
uint64_t dirty_ptr, uintptr_t ra)
{
int mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX);
ARMMMUIdx arm_mmu_idx = core_to_aa64_mmu_idx(mmu_idx);
- int el, reg_el, tcf, select, is_write, syn;
+ int el, reg_el, tcf;
uint64_t sctlr;
reg_el = regime_el(env, arm_mmu_idx);
@@ -564,14 +601,8 @@ static void mte_check_fail(CPUARMState *env, uint32_t desc,
switch (tcf) {
case 1:
/* Tag check fail causes a synchronous exception. */
- env->exception.vaddress = dirty_ptr;
-
- is_write = FIELD_EX32(desc, MTEDESC, WRITE);
- syn = syn_data_abort_no_iss(arm_current_el(env) != 0, 0, 0, 0, 0,
- is_write, 0x11);
- raise_exception_ra(env, EXCP_DATA_ABORT, syn,
- exception_target_el(env), ra);
- /* noreturn, but fall through to the assert anyway */
+ mte_sync_check_fail(env, desc, dirty_ptr, ra);
+ break;
case 0:
/*
@@ -583,30 +614,19 @@ static void mte_check_fail(CPUARMState *env, uint32_t desc,
case 2:
/* Tag check fail causes asynchronous flag set. */
- if (regime_has_2_ranges(arm_mmu_idx)) {
- select = extract64(dirty_ptr, 55, 1);
- } else {
- select = 0;
- }
- env->cp15.tfsr_el[el] |= 1 << select;
-#ifdef CONFIG_USER_ONLY
- /*
- * Stand in for a timer irq, setting _TIF_MTE_ASYNC_FAULT,
- * which then sends a SIGSEGV when the thread is next scheduled.
- * This cpu will return to the main loop at the end of the TB,
- * which is rather sooner than "normal". But the alternative
- * is waiting until the next syscall.
- */
- qemu_cpu_kick(env_cpu(env));
-#endif
+ mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el);
break;
- default:
- /* Case 3: Reserved. */
- qemu_log_mask(LOG_GUEST_ERROR,
- "Tag check failure with SCTLR_EL%d.TCF%s "
- "set to reserved value %d\n",
- reg_el, el ? "" : "0", tcf);
+ case 3:
+ /*
+ * Tag check fail causes asynchronous flag set for stores, or
+ * a synchronous exception for loads.
+ */
+ if (FIELD_EX32(desc, MTEDESC, WRITE)) {
+ mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el);
+ } else {
+ mte_sync_check_fail(env, desc, dirty_ptr, ra);
+ }
break;
}
}
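After this refactor the SCTLR_ELx.TCF handling reads as a four-way dispatch, where TCF==3, newly valid with FEAT_MTE3, is the asymmetric mode. A compact sketch of the resulting behaviour (sync_fail/async_fail stand in for the two helpers factored out above):

    switch (tcf) {
    case 0: /* tag check fail has no effect */            break;
    case 1: sync_fail();  /* synchronous data abort */    break;
    case 2: async_fail(); /* accumulate into TFSR_ELx */  break;
    case 3: is_write ? async_fail() : sync_fail();        break;
    }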
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index c8492bb576..d9ece7be5d 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -18,3 +18,243 @@
#
# This file is processed by scripts/decodetree.py
#
+
+%qd 22:1 13:3
+%qm 5:1 1:3
+%qn 7:1 17:3
+
+# VQDMULL has size in bit 28: 0 for 16 bit, 1 for 32 bit
+%size_28 28:1 !function=plus_1
+
+&vldr_vstr rn qd imm p a w size l u
+&1op qd qm size
+&2op qd qm qn size
+&2scalar qd qn rm size
+
+@vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0
+# Note that both Rn and Qd are 3 bits only (no D bit)
+@vldst_wn ... u:1 ... . . . . l:1 . rn:3 qd:3 . ... .. imm:7 &vldr_vstr
+
+@1op .... .... .... size:2 .. .... .... .... .... &1op qd=%qd qm=%qm
+@1op_nosz .... .... .... .... .... .... .... .... &1op qd=%qd qm=%qm size=0
+@2op .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn
+@2op_nosz .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn size=0
+@2op_sz28 .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn \
+ size=%size_28
+
+# The _rev suffix indicates that Vn and Vm are reversed. This is
+# the case for shifts. In the Arm ARM these insns are documented
+# with the Vm and Vn fields in their usual places, but in the
+# assembly the operands are listed "backwards", ie in the order
+# Qd, Qm, Qn where other insns use Qd, Qn, Qm. For QEMU we choose
+# to consider Vm and Vn as being in different fields in the insn.
+# This gives us consistency with A64 and Neon.
+@2op_rev .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qn qn=%qm
+
+@2scalar .... .... .. size:2 .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn
+@2scalar_nosz .... .... .... .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn
+
+# Vector loads and stores
+
+# Widening loads and narrowing stores:
+# for these P=0 W=0 is 'related encoding'; sz=11 is 'related encoding'
+# This means we need to expand out to multiple patterns for P, W, SZ.
+# For stores the U bit must be 0 but we catch that in the trans_ function.
+# The naming scheme here is "VLDSTB_H == in-memory byte load/store to/from
+# signed halfword element in register", etc.
+VLDSTB_H 111 . 110 0 a:1 0 1 . 0 ... ... 0 111 01 ....... @vldst_wn \
+ p=0 w=1 size=1
+VLDSTB_H 111 . 110 1 a:1 0 w:1 . 0 ... ... 0 111 01 ....... @vldst_wn \
+ p=1 size=1
+VLDSTB_W 111 . 110 0 a:1 0 1 . 0 ... ... 0 111 10 ....... @vldst_wn \
+ p=0 w=1 size=2
+VLDSTB_W 111 . 110 1 a:1 0 w:1 . 0 ... ... 0 111 10 ....... @vldst_wn \
+ p=1 size=2
+VLDSTH_W 111 . 110 0 a:1 0 1 . 1 ... ... 0 111 10 ....... @vldst_wn \
+ p=0 w=1 size=2
+VLDSTH_W 111 . 110 1 a:1 0 w:1 . 1 ... ... 0 111 10 ....... @vldst_wn \
+ p=1 size=2
+
+# Non-widening loads/stores (P=0 W=0 is 'related encoding')
+VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111100 ....... @vldr_vstr \
+ size=0 p=0 w=1
+VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111101 ....... @vldr_vstr \
+ size=1 p=0 w=1
+VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111110 ....... @vldr_vstr \
+ size=2 p=0 w=1
+VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111100 ....... @vldr_vstr \
+ size=0 p=1
+VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111101 ....... @vldr_vstr \
+ size=1 p=1
+VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111110 ....... @vldr_vstr \
+ size=2 p=1
+
+# Vector 2-op
+VAND 1110 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz
+VBIC 1110 1111 0 . 01 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz
+VORR 1110 1111 0 . 10 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz
+VORN 1110 1111 0 . 11 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz
+VEOR 1111 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz
+
+VADD 1110 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op
+VSUB 1111 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op
+VMUL 1110 1111 0 . .. ... 0 ... 0 1001 . 1 . 1 ... 0 @2op
+
+VMULH_S 111 0 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op
+VMULH_U 111 1 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op
+
+VRMULH_S 111 0 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op
+VRMULH_U 111 1 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op
+
+VMAX_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op
+VMAX_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op
+VMIN_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 1 ... 0 @2op
+VMIN_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 1 ... 0 @2op
+
+VABD_S 111 0 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op
+VABD_U 111 1 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op
+
+VHADD_S 111 0 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op
+VHADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op
+VHSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op
+VHSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op
+
+VMULL_BS 111 0 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op
+VMULL_BU 111 1 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op
+VMULL_TS 111 0 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op
+VMULL_TU 111 1 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op
+
+VQDMULH 1110 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op
+VQRDMULH 1111 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op
+
+VQADD_S 111 0 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op
+VQADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op
+VQSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op
+VQSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op
+
+VSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev
+VSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev
+
+VRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 0 ... 0 @2op_rev
+VRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 0 ... 0 @2op_rev
+
+VQSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev
+VQSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev
+
+VQRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev
+VQRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev
+
+VQDMLADH 1110 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op
+VQDMLADHX 1110 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op
+VQRDMLADH 1110 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op
+VQRDMLADHX 1110 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op
+
+VQDMLSDH 1111 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op
+VQDMLSDHX 1111 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op
+VQRDMLSDH 1111 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op
+VQRDMLSDHX 1111 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op
+
+VQDMULLB 111 . 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 1 @2op_sz28
+VQDMULLT 111 . 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 1 @2op_sz28
+
+VRHADD_S 111 0 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op
+VRHADD_U 111 1 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op
+
+{
+ VADC 1110 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz
+ VADCI 1110 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz
+ VHCADD90 1110 1110 0 . .. ... 0 ... 0 1111 . 0 . 0 ... 0 @2op
+ VHCADD270 1110 1110 0 . .. ... 0 ... 1 1111 . 0 . 0 ... 0 @2op
+}
+
+{
+ VSBC 1111 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz
+ VSBCI 1111 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz
+ VCADD90 1111 1110 0 . .. ... 0 ... 0 1111 . 0 . 0 ... 0 @2op
+ VCADD270 1111 1110 0 . .. ... 0 ... 1 1111 . 0 . 0 ... 0 @2op
+}
+
+# Vector miscellaneous
+
+VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op
+VCLZ 1111 1111 1 . 11 .. 00 ... 0 0100 11 . 0 ... 0 @1op
+
+VREV16 1111 1111 1 . 11 .. 00 ... 0 0001 01 . 0 ... 0 @1op
+VREV32 1111 1111 1 . 11 .. 00 ... 0 0000 11 . 0 ... 0 @1op
+VREV64 1111 1111 1 . 11 .. 00 ... 0 0000 01 . 0 ... 0 @1op
+
+VMVN 1111 1111 1 . 11 00 00 ... 0 0101 11 . 0 ... 0 @1op_nosz
+
+VABS 1111 1111 1 . 11 .. 01 ... 0 0011 01 . 0 ... 0 @1op
+VABS_fp 1111 1111 1 . 11 .. 01 ... 0 0111 01 . 0 ... 0 @1op
+VNEG 1111 1111 1 . 11 .. 01 ... 0 0011 11 . 0 ... 0 @1op
+VNEG_fp 1111 1111 1 . 11 .. 01 ... 0 0111 11 . 0 ... 0 @1op
+
+&vdup qd rt size
+# Qd is in the fields usually named Qn
+@vdup .... .... . . .. ... . rt:4 .... . . . . .... qd=%qn &vdup
+
+# B and E bits encode size, which we decode here to the usual size values
+VDUP 1110 1110 1 1 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=0
+VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 1 1 0000 @vdup size=1
+VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=2
+
+# multiply-add long dual accumulate
+# rdahi: bits [3:1] from insn, bit 0 is 1
+# rdalo: bits [3:1] from insn, bit 0 is 0
+%rdahi 20:3 !function=times_2_plus_1
+%rdalo 13:3 !function=times_2
+# size bit is 0 for 16 bit, 1 for 32 bit
+%size_16 16:1 !function=plus_1
+
+&vmlaldav rdahi rdalo size qn qm x a
+
+@vmlaldav .... .... . ... ... . ... . .... .... qm:3 . \
+ qn=%qn rdahi=%rdahi rdalo=%rdalo size=%size_16 &vmlaldav
+@vmlaldav_nosz .... .... . ... ... . ... . .... .... qm:3 . \
+ qn=%qn rdahi=%rdahi rdalo=%rdalo size=0 &vmlaldav
+VMLALDAV_S 1110 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 0 @vmlaldav
+VMLALDAV_U 1111 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 0 @vmlaldav
+
+VMLSLDAV 1110 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 1 @vmlaldav
+
+VRMLALDAVH_S 1110 1110 1 ... ... 0 ... x:1 1111 . 0 a:1 0 ... 0 @vmlaldav_nosz
+VRMLALDAVH_U 1111 1110 1 ... ... 0 ... x:1 1111 . 0 a:1 0 ... 0 @vmlaldav_nosz
+
+VRMLSLDAVH 1111 1110 1 ... ... 0 ... x:1 1110 . 0 a:1 0 ... 1 @vmlaldav_nosz
+
+# Scalar operations
+
+VADD_scalar 1110 1110 0 . .. ... 1 ... 0 1111 . 100 .... @2scalar
+VSUB_scalar 1110 1110 0 . .. ... 1 ... 1 1111 . 100 .... @2scalar
+VMUL_scalar 1110 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar
+VHADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar
+VHADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar
+VHSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar
+VHSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar
+
+{
+ VQADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar
+ VQADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar
+ VQDMULLB_scalar 111 . 1110 0 . 11 ... 0 ... 0 1111 . 110 .... @2scalar_nosz \
+ size=%size_28
+}
+
+{
+ VQSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar
+ VQSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar
+ VQDMULLT_scalar 111 . 1110 0 . 11 ... 0 ... 1 1111 . 110 .... @2scalar_nosz \
+ size=%size_28
+}
+
+VBRSR 1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar
+
+VQDMULH_scalar 1110 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
+VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
+
+# Vector add across vector
+VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo
+
+# Predicate operations
+%mask_22_13 22:1 13:3
+VPST 1111 1110 0 . 11 000 1 ... 0 1111 0100 1101 mask=%mask_22_13
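Split fields such as %qd, %qn and %mask_22_13 concatenate non-contiguous bit ranges, most-significant slice first. A sketch of the extraction decodetree generates for "%qd 22:1 13:3", using a hypothetical helper name:

    static int extract_qd(uint32_t insn)
    {
        /* bit 22 is the high bit; bits [15:13] are the low three bits */
        return ((insn >> 22) & 1) << 3 | ((insn >> 13) & 7);
    }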
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
new file mode 100644
index 0000000000..05552ce7ee
--- /dev/null
+++ b/target/arm/mve_helper.c
@@ -0,0 +1,1160 @@
+/*
+ * M-profile MVE Operations
+ *
+ * Copyright (c) 2021 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/int128.h"
+#include "cpu.h"
+#include "internals.h"
+#include "vec_internal.h"
+#include "exec/helper-proto.h"
+#include "exec/cpu_ldst.h"
+#include "exec/exec-all.h"
+#include "tcg/tcg.h"
+
+static uint16_t mve_element_mask(CPUARMState *env)
+{
+ /*
+ * Return the mask of which elements in the MVE vector should be
+ * updated. This is a combination of multiple things:
+ * (1) by default, we update every lane in the vector
+ * (2) VPT predication stores its state in the VPR register;
+ * (3) low-overhead-branch tail predication will mask out part of
+ * the vector on the final iteration of the loop
+ * (4) if EPSR.ECI is set then we must execute only some beats
+ * of the insn
+ * We combine all these into a 16-bit result with the same semantics
+ * as VPR.P0: 0 to mask the lane, 1 if it is active.
+ * 8-bit vector ops will look at all bits of the result;
+ * 16-bit ops will look at bits 0, 2, 4, ...;
+ * 32-bit ops will look at bits 0, 4, 8 and 12.
+ * Compare pseudocode GetCurInstrBeat(), though that only returns
+ * the 4-bit slice of the mask corresponding to a single beat.
+ */
+ uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);
+
+ if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) {
+ mask |= 0xff;
+ }
+ if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) {
+ mask |= 0xff00;
+ }
+
+ if (env->v7m.ltpsize < 4 &&
+ env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) {
+ /*
+ * Tail predication active, and this is the last loop iteration.
+ * The element size is (1 << ltpsize), and we only want to process
+ * loopcount elements, so we want to retain the least significant
+ * (loopcount * esize) predicate bits and zero out bits above that.
+ */
+ int masklen = env->regs[14] << env->v7m.ltpsize;
+ assert(masklen <= 16);
+ mask &= MAKE_64BIT_MASK(0, masklen);
+ }
+
+ if ((env->condexec_bits & 0xf) == 0) {
+ /*
+ * ECI bits indicate which beats are already executed;
+ * we handle this by effectively predicating them out.
+ */
+ int eci = env->condexec_bits >> 4;
+ switch (eci) {
+ case ECI_NONE:
+ break;
+ case ECI_A0:
+ mask &= 0xfff0;
+ break;
+ case ECI_A0A1:
+ mask &= 0xff00;
+ break;
+ case ECI_A0A1A2:
+ case ECI_A0A1A2B0:
+ mask &= 0xf000;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+
+ return mask;
+}
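
/*
 * Illustrative sketch (not part of the patch; the helper name is
 * hypothetical): the per-size loops below consume this mask by shifting
 * it right by ESIZE per element and testing bit 0, which is equivalent
 * to testing bit (e * esize) of the original mask directly:
 */
static inline bool mve_element_is_active(uint16_t mask, unsigned esize,
                                         unsigned e)
{
    return mask & (1u << (e * esize)); /* bits 0, 4, 8, 12 when esize == 4 */
}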
+
+static void mve_advance_vpt(CPUARMState *env)
+{
+ /* Advance the VPT and ECI state if necessary */
+ uint32_t vpr = env->v7m.vpr;
+ unsigned mask01, mask23;
+
+ if ((env->condexec_bits & 0xf) == 0) {
+ env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ?
+ (ECI_A0 << 4) : (ECI_NONE << 4);
+ }
+
+ if (!(vpr & (R_V7M_VPR_MASK01_MASK | R_V7M_VPR_MASK23_MASK))) {
+ /* VPT not enabled, nothing to do */
+ return;
+ }
+
+ mask01 = FIELD_EX32(vpr, V7M_VPR, MASK01);
+ mask23 = FIELD_EX32(vpr, V7M_VPR, MASK23);
+ if (mask01 > 8) {
+ /* high bit set, but not 0b1000: invert the relevant half of P0 */
+ vpr ^= 0xff;
+ }
+ if (mask23 > 8) {
+ /* high bit set, but not 0b1000: invert the relevant half of P0 */
+ vpr ^= 0xff00;
+ }
+ vpr = FIELD_DP32(vpr, V7M_VPR, MASK01, mask01 << 1);
+ vpr = FIELD_DP32(vpr, V7M_VPR, MASK23, mask23 << 1);
+ env->v7m.vpr = vpr;
+}
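
/*
 * Worked example (not part of the patch) of one advance of the MASK01
 * field in isolation: the value 0b1100 is "> 8 but not 0b1000", so
 * P0[7:0] is inverted this step; the left shift then truncates to the
 * 4-bit field, leaving 0b1000, i.e. one more inversion pending before
 * the VPT block ends.
 */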
+
+
+#define DO_VLDR(OP, MSIZE, LDTYPE, ESIZE, TYPE) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \
+ { \
+ TYPE *d = vd; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned b, e; \
+ /* \
+ * R_SXTM allows the dest reg to become UNKNOWN for abandoned \
+ * beats so we don't care if we update part of the dest and \
+ * then take an exception. \
+ */ \
+ for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \
+ if (mask & (1 << b)) { \
+ d[H##ESIZE(e)] = cpu_##LDTYPE##_data_ra(env, addr, GETPC()); \
+ } \
+ addr += MSIZE; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VSTR(OP, MSIZE, STTYPE, ESIZE, TYPE) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \
+ { \
+ TYPE *d = vd; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned b, e; \
+ for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \
+ if (mask & (1 << b)) { \
+ cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \
+ } \
+ addr += MSIZE; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+DO_VLDR(vldrb, 1, ldub, 1, uint8_t)
+DO_VLDR(vldrh, 2, lduw, 2, uint16_t)
+DO_VLDR(vldrw, 4, ldl, 4, uint32_t)
+
+DO_VSTR(vstrb, 1, stb, 1, uint8_t)
+DO_VSTR(vstrh, 2, stw, 2, uint16_t)
+DO_VSTR(vstrw, 4, stl, 4, uint32_t)
+
+DO_VLDR(vldrb_sh, 1, ldsb, 2, int16_t)
+DO_VLDR(vldrb_sw, 1, ldsb, 4, int32_t)
+DO_VLDR(vldrb_uh, 1, ldub, 2, uint16_t)
+DO_VLDR(vldrb_uw, 1, ldub, 4, uint32_t)
+DO_VLDR(vldrh_sw, 2, ldsw, 4, int32_t)
+DO_VLDR(vldrh_uw, 2, lduw, 4, uint32_t)
+
+DO_VSTR(vstrb_h, 1, stb, 2, int16_t)
+DO_VSTR(vstrb_w, 1, stb, 4, int32_t)
+DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
+
+#undef DO_VLDR
+#undef DO_VSTR
+
+/*
+ * The mergemask(D, R, M) macro performs the operation "*D = R" but
+ * storing only the bytes which correspond to 1 bits in M,
+ * leaving other bytes in *D unchanged. We use _Generic
+ * to select the correct implementation based on the type of D.
+ */
+
+static void mergemask_ub(uint8_t *d, uint8_t r, uint16_t mask)
+{
+ if (mask & 1) {
+ *d = r;
+ }
+}
+
+static void mergemask_sb(int8_t *d, int8_t r, uint16_t mask)
+{
+ mergemask_ub((uint8_t *)d, r, mask);
+}
+
+static void mergemask_uh(uint16_t *d, uint16_t r, uint16_t mask)
+{
+ uint16_t bmask = expand_pred_b_data[mask & 3];
+ *d = (*d & ~bmask) | (r & bmask);
+}
+
+static void mergemask_sh(int16_t *d, int16_t r, uint16_t mask)
+{
+ mergemask_uh((uint16_t *)d, r, mask);
+}
+
+static void mergemask_uw(uint32_t *d, uint32_t r, uint16_t mask)
+{
+ uint32_t bmask = expand_pred_b_data[mask & 0xf];
+ *d = (*d & ~bmask) | (r & bmask);
+}
+
+static void mergemask_sw(int32_t *d, int32_t r, uint16_t mask)
+{
+ mergemask_uw((uint32_t *)d, r, mask);
+}
+
+static void mergemask_uq(uint64_t *d, uint64_t r, uint16_t mask)
+{
+ uint64_t bmask = expand_pred_b_data[mask & 0xff];
+ *d = (*d & ~bmask) | (r & bmask);
+}
+
+static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask)
+{
+ mergemask_uq((uint64_t *)d, r, mask);
+}
+
+#define mergemask(D, R, M) \
+ _Generic(D, \
+ uint8_t *: mergemask_ub, \
+ int8_t *: mergemask_sb, \
+ uint16_t *: mergemask_uh, \
+ int16_t *: mergemask_sh, \
+ uint32_t *: mergemask_uw, \
+ int32_t *: mergemask_sw, \
+ uint64_t *: mergemask_uq, \
+ int64_t *: mergemask_sq)(D, R, M)
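
/*
 * Usage sketch (not part of the patch): _Generic dispatches on the
 * pointer type, so a 16-bit destination picks mergemask_uh and the low
 * two predicate bits select which bytes are written:
 *
 *   uint16_t d = 0xaabb;
 *   mergemask(&d, 0x1122, 0x1);   // only bit 0 set: d becomes 0xaa22
 *   mergemask(&d, 0x1122, 0x3);   // both bytes:     d becomes 0x1122
 */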
+
+void HELPER(mve_vdup)(CPUARMState *env, void *vd, uint32_t val)
+{
+ /*
+ * The generated code already replicated an 8 or 16 bit constant
+ * into the 32-bit value, so we only need to write the 32-bit
+ * value to all elements of the Qreg, allowing for predication.
+ */
+ uint32_t *d = vd;
+ uint16_t mask = mve_element_mask(env);
+ unsigned e;
+ for (e = 0; e < 16 / 4; e++, mask >>= 4) {
+ mergemask(&d[H4(e)], val, mask);
+ }
+ mve_advance_vpt(env);
+}
+
+#define DO_1OP(OP, ESIZE, TYPE, FN) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \
+ { \
+ TYPE *d = vd, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)]), mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_CLS_B(N) (clrsb32(N) - 24)
+#define DO_CLS_H(N) (clrsb32(N) - 16)
+
+DO_1OP(vclsb, 1, int8_t, DO_CLS_B)
+DO_1OP(vclsh, 2, int16_t, DO_CLS_H)
+DO_1OP(vclsw, 4, int32_t, clrsb32)
+
+#define DO_CLZ_B(N) (clz32(N) - 24)
+#define DO_CLZ_H(N) (clz32(N) - 16)
+
+DO_1OP(vclzb, 1, uint8_t, DO_CLZ_B)
+DO_1OP(vclzh, 2, uint16_t, DO_CLZ_H)
+DO_1OP(vclzw, 4, uint32_t, clz32)
+
+DO_1OP(vrev16b, 2, uint16_t, bswap16)
+DO_1OP(vrev32b, 4, uint32_t, bswap32)
+DO_1OP(vrev32h, 4, uint32_t, hswap32)
+DO_1OP(vrev64b, 8, uint64_t, bswap64)
+DO_1OP(vrev64h, 8, uint64_t, hswap64)
+DO_1OP(vrev64w, 8, uint64_t, wswap64)
+
+#define DO_NOT(N) (~(N))
+
+DO_1OP(vmvn, 8, uint64_t, DO_NOT)
+
+#define DO_ABS(N) ((N) < 0 ? -(N) : (N))
+#define DO_FABSH(N) ((N) & dup_const(MO_16, 0x7fff))
+#define DO_FABSS(N) ((N) & dup_const(MO_32, 0x7fffffff))
+
+DO_1OP(vabsb, 1, int8_t, DO_ABS)
+DO_1OP(vabsh, 2, int16_t, DO_ABS)
+DO_1OP(vabsw, 4, int32_t, DO_ABS)
+
+/* We can do these 64 bits at a time */
+DO_1OP(vfabsh, 8, uint64_t, DO_FABSH)
+DO_1OP(vfabss, 8, uint64_t, DO_FABSS)
+
+#define DO_NEG(N) (-(N))
+#define DO_FNEGH(N) ((N) ^ dup_const(MO_16, 0x8000))
+#define DO_FNEGS(N) ((N) ^ dup_const(MO_32, 0x80000000))
+
+DO_1OP(vnegb, 1, int8_t, DO_NEG)
+DO_1OP(vnegh, 2, int16_t, DO_NEG)
+DO_1OP(vnegw, 4, int32_t, DO_NEG)
+
+/* We can do these 64 bits at a time */
+DO_1OP(vfnegh, 8, uint64_t, DO_FNEGH)
+DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS)
+
+#define DO_2OP(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, \
+ void *vd, void *vn, void *vm) \
+ { \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ mergemask(&d[H##ESIZE(e)], \
+ FN(n[H##ESIZE(e)], m[H##ESIZE(e)]), mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+/* provide unsigned 2-op helpers for all sizes */
+#define DO_2OP_U(OP, FN) \
+ DO_2OP(OP##b, 1, uint8_t, FN) \
+ DO_2OP(OP##h, 2, uint16_t, FN) \
+ DO_2OP(OP##w, 4, uint32_t, FN)
+
+/* provide signed 2-op helpers for all sizes */
+#define DO_2OP_S(OP, FN) \
+ DO_2OP(OP##b, 1, int8_t, FN) \
+ DO_2OP(OP##h, 2, int16_t, FN) \
+ DO_2OP(OP##w, 4, int32_t, FN)
+
+/*
+ * "Long" operations where two half-sized inputs (taken from either the
+ * top or the bottom of the input vector) produce a double-width result.
+ * Here ESIZE, TYPE are for the input, and LESIZE, LTYPE for the output.
+ */
+#define DO_2OP_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
+ { \
+ LTYPE *d = vd; \
+ TYPE *n = vn, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned le; \
+ for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
+ LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], \
+ m[H##ESIZE(le * 2 + TOP)]); \
+ mergemask(&d[H##LESIZE(le)], r, mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
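
/*
 * Lane-mapping sketch (not part of the patch) for the 16->32 bit case:
 * output element le reads input element (le * 2 + TOP), i.e.
 *   VMULLB.s16: d[0] = n[0] * m[0], d[1] = n[2] * m[2], ...
 *   VMULLT.s16: d[0] = n[1] * m[1], d[1] = n[3] * m[3], ...
 */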
+
+#define DO_2OP_SAT(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
+ { \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ bool qc = false; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ bool sat = false; \
+ TYPE r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], &sat); \
+ mergemask(&d[H##ESIZE(e)], r, mask); \
+ qc |= sat & mask & 1; \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+/* provide unsigned 2-op helpers for all sizes */
+#define DO_2OP_SAT_U(OP, FN) \
+ DO_2OP_SAT(OP##b, 1, uint8_t, FN) \
+ DO_2OP_SAT(OP##h, 2, uint16_t, FN) \
+ DO_2OP_SAT(OP##w, 4, uint32_t, FN)
+
+/* provide signed 2-op helpers for all sizes */
+#define DO_2OP_SAT_S(OP, FN) \
+ DO_2OP_SAT(OP##b, 1, int8_t, FN) \
+ DO_2OP_SAT(OP##h, 2, int16_t, FN) \
+ DO_2OP_SAT(OP##w, 4, int32_t, FN)
+
+#define DO_AND(N, M) ((N) & (M))
+#define DO_BIC(N, M) ((N) & ~(M))
+#define DO_ORR(N, M) ((N) | (M))
+#define DO_ORN(N, M) ((N) | ~(M))
+#define DO_EOR(N, M) ((N) ^ (M))
+
+DO_2OP(vand, 8, uint64_t, DO_AND)
+DO_2OP(vbic, 8, uint64_t, DO_BIC)
+DO_2OP(vorr, 8, uint64_t, DO_ORR)
+DO_2OP(vorn, 8, uint64_t, DO_ORN)
+DO_2OP(veor, 8, uint64_t, DO_EOR)
+
+#define DO_ADD(N, M) ((N) + (M))
+#define DO_SUB(N, M) ((N) - (M))
+#define DO_MUL(N, M) ((N) * (M))
+
+DO_2OP_U(vadd, DO_ADD)
+DO_2OP_U(vsub, DO_SUB)
+DO_2OP_U(vmul, DO_MUL)
+
+DO_2OP_L(vmullbsb, 0, 1, int8_t, 2, int16_t, DO_MUL)
+DO_2OP_L(vmullbsh, 0, 2, int16_t, 4, int32_t, DO_MUL)
+DO_2OP_L(vmullbsw, 0, 4, int32_t, 8, int64_t, DO_MUL)
+DO_2OP_L(vmullbub, 0, 1, uint8_t, 2, uint16_t, DO_MUL)
+DO_2OP_L(vmullbuh, 0, 2, uint16_t, 4, uint32_t, DO_MUL)
+DO_2OP_L(vmullbuw, 0, 4, uint32_t, 8, uint64_t, DO_MUL)
+
+DO_2OP_L(vmulltsb, 1, 1, int8_t, 2, int16_t, DO_MUL)
+DO_2OP_L(vmulltsh, 1, 2, int16_t, 4, int32_t, DO_MUL)
+DO_2OP_L(vmulltsw, 1, 4, int32_t, 8, int64_t, DO_MUL)
+DO_2OP_L(vmulltub, 1, 1, uint8_t, 2, uint16_t, DO_MUL)
+DO_2OP_L(vmulltuh, 1, 2, uint16_t, 4, uint32_t, DO_MUL)
+DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL)
+
+/*
+ * Because the computation type is at least twice as large as required,
+ * these work for both signed and unsigned source types.
+ */
+static inline uint8_t do_mulh_b(int32_t n, int32_t m)
+{
+ return (n * m) >> 8;
+}
+
+static inline uint16_t do_mulh_h(int32_t n, int32_t m)
+{
+ return (n * m) >> 16;
+}
+
+static inline uint32_t do_mulh_w(int64_t n, int64_t m)
+{
+ return (n * m) >> 32;
+}
+
+static inline uint8_t do_rmulh_b(int32_t n, int32_t m)
+{
+ return (n * m + (1U << 7)) >> 8;
+}
+
+static inline uint16_t do_rmulh_h(int32_t n, int32_t m)
+{
+ return (n * m + (1U << 15)) >> 16;
+}
+
+static inline uint32_t do_rmulh_w(int64_t n, int64_t m)
+{
+ return (n * m + (1U << 31)) >> 32;
+}
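
/*
 * Worked example (not part of the patch): the implicit promotion to the
 * wider signed type preserves each source's value, so the same helper
 * is correct for both signednesses. For n == m == 0xff:
 *   as int8_t:  (-1) * (-1) = 1;      1 >> 8     = 0x00
 *   as uint8_t: 255 * 255   = 65025;  65025 >> 8 = 0xfe
 */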
+
+DO_2OP(vmulhsb, 1, int8_t, do_mulh_b)
+DO_2OP(vmulhsh, 2, int16_t, do_mulh_h)
+DO_2OP(vmulhsw, 4, int32_t, do_mulh_w)
+DO_2OP(vmulhub, 1, uint8_t, do_mulh_b)
+DO_2OP(vmulhuh, 2, uint16_t, do_mulh_h)
+DO_2OP(vmulhuw, 4, uint32_t, do_mulh_w)
+
+DO_2OP(vrmulhsb, 1, int8_t, do_rmulh_b)
+DO_2OP(vrmulhsh, 2, int16_t, do_rmulh_h)
+DO_2OP(vrmulhsw, 4, int32_t, do_rmulh_w)
+DO_2OP(vrmulhub, 1, uint8_t, do_rmulh_b)
+DO_2OP(vrmulhuh, 2, uint16_t, do_rmulh_h)
+DO_2OP(vrmulhuw, 4, uint32_t, do_rmulh_w)
+
+#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
+#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
+
+DO_2OP_S(vmaxs, DO_MAX)
+DO_2OP_U(vmaxu, DO_MAX)
+DO_2OP_S(vmins, DO_MIN)
+DO_2OP_U(vminu, DO_MIN)
+
+#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
+
+DO_2OP_S(vabds, DO_ABD)
+DO_2OP_U(vabdu, DO_ABD)
+
+static inline uint32_t do_vhadd_u(uint32_t n, uint32_t m)
+{
+ return ((uint64_t)n + m) >> 1;
+}
+
+static inline int32_t do_vhadd_s(int32_t n, int32_t m)
+{
+ return ((int64_t)n + m) >> 1;
+}
+
+static inline uint32_t do_vhsub_u(uint32_t n, uint32_t m)
+{
+ return ((uint64_t)n - m) >> 1;
+}
+
+static inline int32_t do_vhsub_s(int32_t n, int32_t m)
+{
+ return ((int64_t)n - m) >> 1;
+}
+
+DO_2OP_S(vhadds, do_vhadd_s)
+DO_2OP_U(vhaddu, do_vhadd_u)
+DO_2OP_S(vhsubs, do_vhsub_s)
+DO_2OP_U(vhsubu, do_vhsub_u)
+
+#define DO_VSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL)
+#define DO_VSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL)
+#define DO_VRSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL)
+#define DO_VRSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL)
+
+DO_2OP_S(vshls, DO_VSHLS)
+DO_2OP_U(vshlu, DO_VSHLU)
+DO_2OP_S(vrshls, DO_VRSHLS)
+DO_2OP_U(vrshlu, DO_VRSHLU)
+
+#define DO_RHADD_S(N, M) (((int64_t)(N) + (M) + 1) >> 1)
+#define DO_RHADD_U(N, M) (((uint64_t)(N) + (M) + 1) >> 1)
+
+DO_2OP_S(vrhadds, DO_RHADD_S)
+DO_2OP_U(vrhaddu, DO_RHADD_U)
+
+static void do_vadc(CPUARMState *env, uint32_t *d, uint32_t *n, uint32_t *m,
+ uint32_t inv, uint32_t carry_in, bool update_flags)
+{
+ uint16_t mask = mve_element_mask(env);
+ unsigned e;
+
+    /* If any lane's addition is actually performed, we will update flags. */
+ if (mask & 0x1111) {
+ update_flags = true;
+ }
+
+ for (e = 0; e < 16 / 4; e++, mask >>= 4) {
+ uint64_t r = carry_in;
+ r += n[H4(e)];
+ r += m[H4(e)] ^ inv;
+ if (mask & 1) {
+ carry_in = r >> 32;
+ }
+ mergemask(&d[H4(e)], r, mask);
+ }
+
+ if (update_flags) {
+ /* Store C, clear NZV. */
+ env->vfp.xregs[ARM_VFP_FPSCR] &= ~FPCR_NZCV_MASK;
+ env->vfp.xregs[ARM_VFP_FPSCR] |= carry_in * FPCR_C;
+ }
+ mve_advance_vpt(env);
+}
+
+void HELPER(mve_vadc)(CPUARMState *env, void *vd, void *vn, void *vm)
+{
+ bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C;
+ do_vadc(env, vd, vn, vm, 0, carry_in, false);
+}
+
+void HELPER(mve_vsbc)(CPUARMState *env, void *vd, void *vn, void *vm)
+{
+ bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C;
+ do_vadc(env, vd, vn, vm, -1, carry_in, false);
+}
+
+
+void HELPER(mve_vadci)(CPUARMState *env, void *vd, void *vn, void *vm)
+{
+ do_vadc(env, vd, vn, vm, 0, 0, true);
+}
+
+void HELPER(mve_vsbci)(CPUARMState *env, void *vd, void *vn, void *vm)
+{
+ do_vadc(env, vd, vn, vm, -1, 1, true);
+}
+
+#define DO_VCADD(OP, ESIZE, TYPE, FN0, FN1) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
+ { \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ TYPE r[16 / ESIZE]; \
+ /* Calculate all results first to avoid overwriting inputs */ \
+ for (e = 0; e < 16 / ESIZE; e++) { \
+ if (!(e & 1)) { \
+ r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)]); \
+ } else { \
+ r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]); \
+ } \
+ } \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ mergemask(&d[H##ESIZE(e)], r[e], mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VCADD_ALL(OP, FN0, FN1) \
+ DO_VCADD(OP##b, 1, int8_t, FN0, FN1) \
+ DO_VCADD(OP##h, 2, int16_t, FN0, FN1) \
+ DO_VCADD(OP##w, 4, int32_t, FN0, FN1)
+
+DO_VCADD_ALL(vcadd90, DO_SUB, DO_ADD)
+DO_VCADD_ALL(vcadd270, DO_ADD, DO_SUB)
+DO_VCADD_ALL(vhcadd90, do_vhsub_s, do_vhadd_s)
+DO_VCADD_ALL(vhcadd270, do_vhadd_s, do_vhsub_s)
+
+static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s)
+{
+ if (val > max) {
+ *s = true;
+ return max;
+ } else if (val < min) {
+ *s = true;
+ return min;
+ }
+ return val;
+}
+
+#define DO_SQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, INT8_MIN, INT8_MAX, s)
+#define DO_SQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, INT16_MIN, INT16_MAX, s)
+#define DO_SQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, INT32_MIN, INT32_MAX, s)
+
+#define DO_UQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT8_MAX, s)
+#define DO_UQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT16_MAX, s)
+#define DO_UQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT32_MAX, s)
+
+#define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s)
+#define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s)
+#define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s)
+
+#define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s)
+#define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s)
+#define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s)
+
+/*
+ * For QDMULH and QRDMULH we simplify "double and shift by esize" into
+ * "shift by esize-1", adjusting the QRDMULH rounding constant to match.
+ */
+#define DO_QDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m) >> 7, \
+ INT8_MIN, INT8_MAX, s)
+#define DO_QDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m) >> 15, \
+ INT16_MIN, INT16_MAX, s)
+#define DO_QDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m) >> 31, \
+ INT32_MIN, INT32_MAX, s)
+
+#define DO_QRDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 6)) >> 7, \
+ INT8_MIN, INT8_MAX, s)
+#define DO_QRDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 14)) >> 15, \
+ INT16_MIN, INT16_MAX, s)
+#define DO_QRDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 30)) >> 31, \
+ INT32_MIN, INT32_MAX, s)
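
/*
 * Equivalence sketch (not part of the patch), 8-bit case:
 *   architectural: sat((2*n*m + (1 << 7)) >> 8)
 *   simplified:    sat((n*m   + (1 << 6)) >> 7)
 * e.g. n = m = 0x40: (8192 + 128) >> 8 == (4096 + 64) >> 7 == 32.
 */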
+
+DO_2OP_SAT(vqdmulhb, 1, int8_t, DO_QDMULH_B)
+DO_2OP_SAT(vqdmulhh, 2, int16_t, DO_QDMULH_H)
+DO_2OP_SAT(vqdmulhw, 4, int32_t, DO_QDMULH_W)
+
+DO_2OP_SAT(vqrdmulhb, 1, int8_t, DO_QRDMULH_B)
+DO_2OP_SAT(vqrdmulhh, 2, int16_t, DO_QRDMULH_H)
+DO_2OP_SAT(vqrdmulhw, 4, int32_t, DO_QRDMULH_W)
+
+DO_2OP_SAT(vqaddub, 1, uint8_t, DO_UQADD_B)
+DO_2OP_SAT(vqadduh, 2, uint16_t, DO_UQADD_H)
+DO_2OP_SAT(vqadduw, 4, uint32_t, DO_UQADD_W)
+DO_2OP_SAT(vqaddsb, 1, int8_t, DO_SQADD_B)
+DO_2OP_SAT(vqaddsh, 2, int16_t, DO_SQADD_H)
+DO_2OP_SAT(vqaddsw, 4, int32_t, DO_SQADD_W)
+
+DO_2OP_SAT(vqsubub, 1, uint8_t, DO_UQSUB_B)
+DO_2OP_SAT(vqsubuh, 2, uint16_t, DO_UQSUB_H)
+DO_2OP_SAT(vqsubuw, 4, uint32_t, DO_UQSUB_W)
+DO_2OP_SAT(vqsubsb, 1, int8_t, DO_SQSUB_B)
+DO_2OP_SAT(vqsubsh, 2, int16_t, DO_SQSUB_H)
+DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W)
+
+/*
+ * This wrapper fixes up the impedance mismatch between do_sqrshl_bhs()
+ * and friends wanting a uint32_t* sat and our needing a bool*.
+ */
+#define WRAP_QRSHL_HELPER(FN, N, M, ROUND, satp) \
+ ({ \
+ uint32_t su32 = 0; \
+ typeof(N) r = FN(N, (int8_t)(M), sizeof(N) * 8, ROUND, &su32); \
+ if (su32) { \
+ *satp = true; \
+ } \
+ r; \
+ })
+
+#define DO_SQSHL_OP(N, M, satp) \
+ WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, false, satp)
+#define DO_UQSHL_OP(N, M, satp) \
+ WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, false, satp)
+#define DO_SQRSHL_OP(N, M, satp) \
+ WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, true, satp)
+#define DO_UQRSHL_OP(N, M, satp) \
+ WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, true, satp)
+
+DO_2OP_SAT_S(vqshls, DO_SQSHL_OP)
+DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP)
+DO_2OP_SAT_S(vqrshls, DO_SQRSHL_OP)
+DO_2OP_SAT_U(vqrshlu, DO_UQRSHL_OP)
+
+/*
+ * Multiply add dual returning high half
+ * The 'FN' here takes four inputs A, B, C, D, a 0/1 indicator of
+ * whether to add the rounding constant, and the pointer to the
+ * saturation flag, and should do "(A * B + C * D) * 2 + rounding constant",
+ * saturate to twice the input size and return the high half; or
+ * (A * B - C * D) etc for VQDMLSDH.
+ */
+#define DO_VQDMLADH_OP(OP, ESIZE, TYPE, XCHG, ROUND, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+ void *vm) \
+ { \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ bool qc = false; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ bool sat = false; \
+ if ((e & 1) == XCHG) { \
+ TYPE r = FN(n[H##ESIZE(e)], \
+ m[H##ESIZE(e - XCHG)], \
+ n[H##ESIZE(e + (1 - 2 * XCHG))], \
+ m[H##ESIZE(e + (1 - XCHG))], \
+ ROUND, &sat); \
+ mergemask(&d[H##ESIZE(e)], r, mask); \
+ qc |= sat & mask & 1; \
+ } \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+static int8_t do_vqdmladh_b(int8_t a, int8_t b, int8_t c, int8_t d,
+ int round, bool *sat)
+{
+ int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 7);
+ return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
+}
+
+static int16_t do_vqdmladh_h(int16_t a, int16_t b, int16_t c, int16_t d,
+ int round, bool *sat)
+{
+ int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 15);
+ return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
+}
+
+static int32_t do_vqdmladh_w(int32_t a, int32_t b, int32_t c, int32_t d,
+ int round, bool *sat)
+{
+ int64_t m1 = (int64_t)a * b;
+ int64_t m2 = (int64_t)c * d;
+ int64_t r;
+ /*
+ * Architecturally we should do the entire add, double, round
+ * and then check for saturation. We do three saturating adds,
+ * but we need to be careful about the order. If the first
+ * m1 + m2 saturates then it's impossible for the *2+rc to
+ * bring it back into the non-saturated range. However, if
+ * m1 + m2 is negative then it's possible that doing the doubling
+ * would take the intermediate result below INT64_MIN and the
+ * addition of the rounding constant then brings it back in range.
+ * So we add half the rounding constant before doubling rather
+ * than adding the rounding constant after the doubling.
+ */
+ if (sadd64_overflow(m1, m2, &r) ||
+ sadd64_overflow(r, (round << 30), &r) ||
+ sadd64_overflow(r, r, &r)) {
+ *sat = true;
+ return r < 0 ? INT32_MAX : INT32_MIN;
+ }
+ return r >> 32;
+}
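
/*
 * Numeric sketch (not part of the patch): with round == 1 and
 * m1 + m2 == -(1ll << 62) - 1, doubling first overflows int64
 * (2*(m1 + m2) == -(1ll << 63) - 2) even though the rounded result
 * -(1ll << 63) + (1ll << 31) - 2 is representable. Adding the halved
 * constant (1 << 30) before doubling keeps every step in range.
 */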
+
+static int8_t do_vqdmlsdh_b(int8_t a, int8_t b, int8_t c, int8_t d,
+ int round, bool *sat)
+{
+ int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7);
+ return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
+}
+
+static int16_t do_vqdmlsdh_h(int16_t a, int16_t b, int16_t c, int16_t d,
+ int round, bool *sat)
+{
+ int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15);
+ return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
+}
+
+static int32_t do_vqdmlsdh_w(int32_t a, int32_t b, int32_t c, int32_t d,
+ int round, bool *sat)
+{
+ int64_t m1 = (int64_t)a * b;
+ int64_t m2 = (int64_t)c * d;
+ int64_t r;
+ /* The same ordering issue as in do_vqdmladh_w applies here too */
+ if (ssub64_overflow(m1, m2, &r) ||
+ sadd64_overflow(r, (round << 30), &r) ||
+ sadd64_overflow(r, r, &r)) {
+ *sat = true;
+ return r < 0 ? INT32_MAX : INT32_MIN;
+ }
+ return r >> 32;
+}
+
+DO_VQDMLADH_OP(vqdmladhb, 1, int8_t, 0, 0, do_vqdmladh_b)
+DO_VQDMLADH_OP(vqdmladhh, 2, int16_t, 0, 0, do_vqdmladh_h)
+DO_VQDMLADH_OP(vqdmladhw, 4, int32_t, 0, 0, do_vqdmladh_w)
+DO_VQDMLADH_OP(vqdmladhxb, 1, int8_t, 1, 0, do_vqdmladh_b)
+DO_VQDMLADH_OP(vqdmladhxh, 2, int16_t, 1, 0, do_vqdmladh_h)
+DO_VQDMLADH_OP(vqdmladhxw, 4, int32_t, 1, 0, do_vqdmladh_w)
+
+DO_VQDMLADH_OP(vqrdmladhb, 1, int8_t, 0, 1, do_vqdmladh_b)
+DO_VQDMLADH_OP(vqrdmladhh, 2, int16_t, 0, 1, do_vqdmladh_h)
+DO_VQDMLADH_OP(vqrdmladhw, 4, int32_t, 0, 1, do_vqdmladh_w)
+DO_VQDMLADH_OP(vqrdmladhxb, 1, int8_t, 1, 1, do_vqdmladh_b)
+DO_VQDMLADH_OP(vqrdmladhxh, 2, int16_t, 1, 1, do_vqdmladh_h)
+DO_VQDMLADH_OP(vqrdmladhxw, 4, int32_t, 1, 1, do_vqdmladh_w)
+
+DO_VQDMLADH_OP(vqdmlsdhb, 1, int8_t, 0, 0, do_vqdmlsdh_b)
+DO_VQDMLADH_OP(vqdmlsdhh, 2, int16_t, 0, 0, do_vqdmlsdh_h)
+DO_VQDMLADH_OP(vqdmlsdhw, 4, int32_t, 0, 0, do_vqdmlsdh_w)
+DO_VQDMLADH_OP(vqdmlsdhxb, 1, int8_t, 1, 0, do_vqdmlsdh_b)
+DO_VQDMLADH_OP(vqdmlsdhxh, 2, int16_t, 1, 0, do_vqdmlsdh_h)
+DO_VQDMLADH_OP(vqdmlsdhxw, 4, int32_t, 1, 0, do_vqdmlsdh_w)
+
+DO_VQDMLADH_OP(vqrdmlsdhb, 1, int8_t, 0, 1, do_vqdmlsdh_b)
+DO_VQDMLADH_OP(vqrdmlsdhh, 2, int16_t, 0, 1, do_vqdmlsdh_h)
+DO_VQDMLADH_OP(vqrdmlsdhw, 4, int32_t, 0, 1, do_vqdmlsdh_w)
+DO_VQDMLADH_OP(vqrdmlsdhxb, 1, int8_t, 1, 1, do_vqdmlsdh_b)
+DO_VQDMLADH_OP(vqrdmlsdhxh, 2, int16_t, 1, 1, do_vqdmlsdh_h)
+DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w)
+
+#define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+ uint32_t rm) \
+ { \
+ TYPE *d = vd, *n = vn; \
+ TYPE m = rm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m), mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_2OP_SAT_SCALAR(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+ uint32_t rm) \
+ { \
+ TYPE *d = vd, *n = vn; \
+ TYPE m = rm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ bool qc = false; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ bool sat = false; \
+ mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m, &sat), \
+ mask); \
+ qc |= sat & mask & 1; \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+/* provide unsigned 2-op scalar helpers for all sizes */
+#define DO_2OP_SCALAR_U(OP, FN) \
+ DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \
+ DO_2OP_SCALAR(OP##h, 2, uint16_t, FN) \
+ DO_2OP_SCALAR(OP##w, 4, uint32_t, FN)
+#define DO_2OP_SCALAR_S(OP, FN) \
+ DO_2OP_SCALAR(OP##b, 1, int8_t, FN) \
+ DO_2OP_SCALAR(OP##h, 2, int16_t, FN) \
+ DO_2OP_SCALAR(OP##w, 4, int32_t, FN)
+
+DO_2OP_SCALAR_U(vadd_scalar, DO_ADD)
+DO_2OP_SCALAR_U(vsub_scalar, DO_SUB)
+DO_2OP_SCALAR_U(vmul_scalar, DO_MUL)
+DO_2OP_SCALAR_S(vhadds_scalar, do_vhadd_s)
+DO_2OP_SCALAR_U(vhaddu_scalar, do_vhadd_u)
+DO_2OP_SCALAR_S(vhsubs_scalar, do_vhsub_s)
+DO_2OP_SCALAR_U(vhsubu_scalar, do_vhsub_u)
+
+DO_2OP_SAT_SCALAR(vqaddu_scalarb, 1, uint8_t, DO_UQADD_B)
+DO_2OP_SAT_SCALAR(vqaddu_scalarh, 2, uint16_t, DO_UQADD_H)
+DO_2OP_SAT_SCALAR(vqaddu_scalarw, 4, uint32_t, DO_UQADD_W)
+DO_2OP_SAT_SCALAR(vqadds_scalarb, 1, int8_t, DO_SQADD_B)
+DO_2OP_SAT_SCALAR(vqadds_scalarh, 2, int16_t, DO_SQADD_H)
+DO_2OP_SAT_SCALAR(vqadds_scalarw, 4, int32_t, DO_SQADD_W)
+
+DO_2OP_SAT_SCALAR(vqsubu_scalarb, 1, uint8_t, DO_UQSUB_B)
+DO_2OP_SAT_SCALAR(vqsubu_scalarh, 2, uint16_t, DO_UQSUB_H)
+DO_2OP_SAT_SCALAR(vqsubu_scalarw, 4, uint32_t, DO_UQSUB_W)
+DO_2OP_SAT_SCALAR(vqsubs_scalarb, 1, int8_t, DO_SQSUB_B)
+DO_2OP_SAT_SCALAR(vqsubs_scalarh, 2, int16_t, DO_SQSUB_H)
+DO_2OP_SAT_SCALAR(vqsubs_scalarw, 4, int32_t, DO_SQSUB_W)
+
+DO_2OP_SAT_SCALAR(vqdmulh_scalarb, 1, int8_t, DO_QDMULH_B)
+DO_2OP_SAT_SCALAR(vqdmulh_scalarh, 2, int16_t, DO_QDMULH_H)
+DO_2OP_SAT_SCALAR(vqdmulh_scalarw, 4, int32_t, DO_QDMULH_W)
+DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B)
+DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H)
+DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W)
+
+/*
+ * Long saturating scalar ops. As with DO_2OP_L, ESIZE and TYPE are for
+ * the (smaller) input type and LESIZE and LTYPE for the (long) output type.
+ * SATMASK specifies which bits of the predicate mask matter for determining
+ * whether to propagate a saturation indication into FPSCR.QC -- for
+ * the 16x16->32 case we must check only the bit corresponding to the T or B
+ * half that we used, but for the 32x32->64 case we propagate if the mask
+ * bit is set for either half.
+ */
+#define DO_2OP_SAT_SCALAR_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+ uint32_t rm) \
+ { \
+ LTYPE *d = vd; \
+ TYPE *n = vn; \
+ TYPE m = rm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned le; \
+ bool qc = false; \
+ for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
+ bool sat = false; \
+ LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], m, &sat); \
+ mergemask(&d[H##LESIZE(le)], r, mask); \
+ qc |= sat && (mask & SATMASK); \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+static inline int32_t do_qdmullh(int16_t n, int16_t m, bool *sat)
+{
+ int64_t r = ((int64_t)n * m) * 2;
+ return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat);
+}
+
+static inline int64_t do_qdmullw(int32_t n, int32_t m, bool *sat)
+{
+ /* The multiply can't overflow, but the doubling might */
+ int64_t r = (int64_t)n * m;
+ if (r > INT64_MAX / 2) {
+ *sat = true;
+ return INT64_MAX;
+ } else if (r < INT64_MIN / 2) {
+ *sat = true;
+ return INT64_MIN;
+ } else {
+ return r * 2;
+ }
+}
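
/*
 * Boundary sketch (not part of the patch): n == m == INT32_MIN gives
 * r == 1ll << 62, which fits in int64_t but exceeds INT64_MAX / 2, so
 * the doubling would overflow and the helper saturates to INT64_MAX
 * instead.
 */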
+
+#define SATMASK16B 1
+#define SATMASK16T (1 << 2)
+#define SATMASK32 ((1 << 4) | 1)
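
/*
 * Mask-bit sketch (not part of the patch): the loop shifts the predicate
 * mask right by LESIZE per output element, so within each slice bit 0 is
 * the predicate bit for the B half of a 16-bit input pair and bit 2 for
 * the T half; for 32x32->64 either 4-bit half counts, hence (1 << 4) | 1.
 */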
+
+DO_2OP_SAT_SCALAR_L(vqdmullb_scalarh, 0, 2, int16_t, 4, int32_t, \
+ do_qdmullh, SATMASK16B)
+DO_2OP_SAT_SCALAR_L(vqdmullb_scalarw, 0, 4, int32_t, 8, int64_t, \
+ do_qdmullw, SATMASK32)
+DO_2OP_SAT_SCALAR_L(vqdmullt_scalarh, 1, 2, int16_t, 4, int32_t, \
+ do_qdmullh, SATMASK16T)
+DO_2OP_SAT_SCALAR_L(vqdmullt_scalarw, 1, 4, int32_t, 8, int64_t, \
+ do_qdmullw, SATMASK32)
+
+/*
+ * Long saturating ops
+ */
+#define DO_2OP_SAT_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+ void *vm) \
+ { \
+ LTYPE *d = vd; \
+ TYPE *n = vn, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned le; \
+ bool qc = false; \
+ for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
+ bool sat = false; \
+ LTYPE op1 = n[H##ESIZE(le * 2 + TOP)]; \
+ LTYPE op2 = m[H##ESIZE(le * 2 + TOP)]; \
+ mergemask(&d[H##LESIZE(le)], FN(op1, op2, &sat), mask); \
+ qc |= sat && (mask & SATMASK); \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+DO_2OP_SAT_L(vqdmullbh, 0, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16B)
+DO_2OP_SAT_L(vqdmullbw, 0, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32)
+DO_2OP_SAT_L(vqdmullth, 1, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16T)
+DO_2OP_SAT_L(vqdmulltw, 1, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32)
+
+static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m)
+{
+ m &= 0xff;
+ if (m == 0) {
+ return 0;
+ }
+ n = revbit8(n);
+ if (m < 8) {
+ n >>= 8 - m;
+ }
+ return n;
+}
+
+static inline uint32_t do_vbrsrh(uint32_t n, uint32_t m)
+{
+ m &= 0xff;
+ if (m == 0) {
+ return 0;
+ }
+ n = revbit16(n);
+ if (m < 16) {
+ n >>= 16 - m;
+ }
+ return n;
+}
+
+static inline uint32_t do_vbrsrw(uint32_t n, uint32_t m)
+{
+ m &= 0xff;
+ if (m == 0) {
+ return 0;
+ }
+ n = revbit32(n);
+ if (m < 32) {
+ n >>= 32 - m;
+ }
+ return n;
+}
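
/*
 * Worked example (not part of the patch): VBRSR reverses the low m bits
 * of each element and zeroes the rest. do_vbrsrb(0x06, 3):
 * revbit8(0x06) == 0x60, then >> (8 - 3) leaves 0x03, i.e. the low
 * three bits 0b110 reversed to 0b011.
 */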
+
+DO_2OP_SCALAR(vbrsrb, 1, uint8_t, do_vbrsrb)
+DO_2OP_SCALAR(vbrsrh, 2, uint16_t, do_vbrsrh)
+DO_2OP_SCALAR(vbrsrw, 4, uint32_t, do_vbrsrw)
+
+/*
+ * Multiply add long dual accumulate ops.
+ */
+#define DO_LDAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \
+ uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
+ void *vm, uint64_t a) \
+ { \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ TYPE *n = vn, *m = vm; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if (mask & 1) { \
+ if (e & 1) { \
+ a ODDACC \
+ (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \
+ } else { \
+ a EVENACC \
+ (int64_t)n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \
+ } \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ return a; \
+ }
+
+DO_LDAV(vmlaldavsh, 2, int16_t, false, +=, +=)
+DO_LDAV(vmlaldavxsh, 2, int16_t, true, +=, +=)
+DO_LDAV(vmlaldavsw, 4, int32_t, false, +=, +=)
+DO_LDAV(vmlaldavxsw, 4, int32_t, true, +=, +=)
+
+DO_LDAV(vmlaldavuh, 2, uint16_t, false, +=, +=)
+DO_LDAV(vmlaldavuw, 4, uint32_t, false, +=, +=)
+
+DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=)
+DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=)
+DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=)
+DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=)
+
+/*
+ * Rounding multiply add long dual accumulate high: we must keep
+ * a 72-bit internal accumulator value and return the top 64 bits.
+ */
+#define DO_LDAVH(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC, TO128) \
+ uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
+ void *vm, uint64_t a) \
+ { \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ TYPE *n = vn, *m = vm; \
+ Int128 acc = int128_lshift(TO128(a), 8); \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if (mask & 1) { \
+ if (e & 1) { \
+ acc = ODDACC(acc, TO128(n[H##ESIZE(e - 1 * XCHG)] * \
+ m[H##ESIZE(e)])); \
+ } else { \
+ acc = EVENACC(acc, TO128(n[H##ESIZE(e + 1 * XCHG)] * \
+ m[H##ESIZE(e)])); \
+ } \
+ acc = int128_add(acc, int128_make64(1 << 7)); \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ return int128_getlo(int128_rshift(acc, 8)); \
+ }
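
/*
 * Scaling sketch (not part of the patch): the incoming 64-bit
 * accumulator is the high part of the notional 72-bit value, so it is
 * shifted up by 8 before the full-precision products and the per-beat
 * rounding constant (1 << 7) are added in; the final right shift by 8
 * returns the top 64 bits again without losing the carries the
 * rounding increments generated.
 */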
+
+DO_LDAVH(vrmlaldavhsw, 4, int32_t, false, int128_add, int128_add, int128_makes64)
+DO_LDAVH(vrmlaldavhxsw, 4, int32_t, true, int128_add, int128_add, int128_makes64)
+
+DO_LDAVH(vrmlaldavhuw, 4, uint32_t, false, int128_add, int128_add, int128_make64)
+
+DO_LDAVH(vrmlsldavhsw, 4, int32_t, false, int128_add, int128_sub, int128_makes64)
+DO_LDAVH(vrmlsldavhxsw, 4, int32_t, true, int128_add, int128_sub, int128_makes64)
+
+/* Vector add across vector */
+#define DO_VADDV(OP, ESIZE, TYPE) \
+ uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
+ uint32_t ra) \
+ { \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ TYPE *m = vm; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if (mask & 1) { \
+ ra += m[H##ESIZE(e)]; \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ return ra; \
+    }
+
+DO_VADDV(vaddvsb, 1, int8_t)
+DO_VADDV(vaddvsh, 2, int16_t)
+DO_VADDV(vaddvsw, 4, int32_t)
+DO_VADDV(vaddvub, 1, uint8_t)
+DO_VADDV(vaddvuh, 2, uint16_t)
+DO_VADDV(vaddvuw, 4, uint32_t)
diff --git a/target/arm/translate-a32.h b/target/arm/translate-a32.h
index 0a0053949f..6dfcafe179 100644
--- a/target/arm/translate-a32.h
+++ b/target/arm/translate-a32.h
@@ -32,6 +32,7 @@ bool disas_neon_shared(DisasContext *s, uint32_t insn);
void load_reg_var(DisasContext *s, TCGv_i32 var, int reg);
void arm_gen_condlabel(DisasContext *s);
bool vfp_access_check(DisasContext *s);
+bool vfp_access_check_m(DisasContext *s, bool skip_context_update);
void read_neon_element32(TCGv_i32 dest, int reg, int ele, MemOp memop);
void read_neon_element64(TCGv_i64 dest, int reg, int ele, MemOp memop);
void write_neon_element32(TCGv_i32 src, int reg, int ele, MemOp memop);
@@ -46,6 +47,8 @@ long neon_full_reg_offset(unsigned reg);
long neon_element_offset(int reg, int element, MemOp memop);
void gen_rev16(TCGv_i32 dest, TCGv_i32 var);
void clear_eci_state(DisasContext *s);
+bool mve_eci_check(DisasContext *s);
+void mve_update_and_store_eci(DisasContext *s);
static inline TCGv_i32 load_cpu_offset(int offset)
{
diff --git a/target/arm/translate-m-nocp.c b/target/arm/translate-m-nocp.c
index 09b3be4ed3..5eab04832c 100644
--- a/target/arm/translate-m-nocp.c
+++ b/target/arm/translate-m-nocp.c
@@ -19,6 +19,7 @@
#include "qemu/osdep.h"
#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
#include "translate.h"
#include "translate-a32.h"
@@ -191,6 +192,555 @@ static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a)
return true;
}
+/*
+ * M-profile provides two different sets of instructions that can
+ * access floating point system registers: VMSR/VMRS (which move
+ * to/from a general purpose register) and VLDR/VSTR sysreg (which
+ * move directly to/from memory). In some cases there are also side
+ * effects which must happen after any write to memory (which could
+ * cause an exception). So we implement the common logic for the
+ * sysreg access in gen_M_fp_sysreg_write() and gen_M_fp_sysreg_read(),
+ * which take pointers to callback functions which will perform the
+ * actual "read/write general purpose register" and "read/write
+ * memory" operations.
+ */
+
+/*
+ * Emit code to store the sysreg to its final destination; frees the
+ * TCG temp 'value' it is passed. do_access is true to do the store,
+ * and false to skip it and only perform side-effects like base
+ * register writeback.
+ */
+typedef void fp_sysreg_storefn(DisasContext *s, void *opaque, TCGv_i32 value,
+ bool do_access);
+/*
+ * Emit code to load the value to be copied to the sysreg; returns
+ * a new TCG temporary. do_access is true to do the load,
+ * and false to skip it and only perform side-effects like base
+ * register writeback.
+ */
+typedef TCGv_i32 fp_sysreg_loadfn(DisasContext *s, void *opaque,
+ bool do_access);
+
+/* Common decode/access checks for fp sysreg read/write */
+typedef enum FPSysRegCheckResult {
+ FPSysRegCheckFailed, /* caller should return false */
+ FPSysRegCheckDone, /* caller should return true */
+ FPSysRegCheckContinue, /* caller should continue generating code */
+} FPSysRegCheckResult;
+
+static FPSysRegCheckResult fp_sysreg_checks(DisasContext *s, int regno)
+{
+ if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) {
+ return FPSysRegCheckFailed;
+ }
+
+ switch (regno) {
+ case ARM_VFP_FPSCR:
+ case QEMU_VFP_FPSCR_NZCV:
+ break;
+ case ARM_VFP_FPSCR_NZCVQC:
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ return FPSysRegCheckFailed;
+ }
+ break;
+ case ARM_VFP_FPCXT_S:
+ case ARM_VFP_FPCXT_NS:
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ return FPSysRegCheckFailed;
+ }
+ if (!s->v8m_secure) {
+ return FPSysRegCheckFailed;
+ }
+ break;
+ case ARM_VFP_VPR:
+ case ARM_VFP_P0:
+ if (!dc_isar_feature(aa32_mve, s)) {
+ return FPSysRegCheckFailed;
+ }
+ break;
+ default:
+ return FPSysRegCheckFailed;
+ }
+
+ /*
+ * FPCXT_NS is a special case: it has specific handling for
+ * "current FP state is inactive", and must do the PreserveFPState()
+ * but not the usual full set of actions done by ExecuteFPCheck().
+ * So we don't call vfp_access_check() and the callers must handle this.
+ */
+ if (regno != ARM_VFP_FPCXT_NS && !vfp_access_check(s)) {
+ return FPSysRegCheckDone;
+ }
+ return FPSysRegCheckContinue;
+}
+
+static void gen_branch_fpInactive(DisasContext *s, TCGCond cond,
+ TCGLabel *label)
+{
+ /*
+ * FPCXT_NS is a special case: it has specific handling for
+ * "current FP state is inactive", and must do the PreserveFPState()
+ * but not the usual full set of actions done by ExecuteFPCheck().
+ * We don't have a TB flag that matches the fpInactive check, so we
+ * do it at runtime as we don't expect FPCXT_NS accesses to be frequent.
+ *
+ * Emit code that checks fpInactive and does a conditional
+ * branch to label based on it:
+ * if cond is TCG_COND_NE then branch if fpInactive != 0 (ie if inactive)
+ * if cond is TCG_COND_EQ then branch if fpInactive == 0 (ie if active)
+ */
+ assert(cond == TCG_COND_EQ || cond == TCG_COND_NE);
+
+ /* fpInactive = FPCCR_NS.ASPEN == 1 && CONTROL.FPCA == 0 */
+ TCGv_i32 aspen, fpca;
+ aspen = load_cpu_field(v7m.fpccr[M_REG_NS]);
+ fpca = load_cpu_field(v7m.control[M_REG_S]);
+ tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK);
+ tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK);
+ tcg_gen_andi_i32(fpca, fpca, R_V7M_CONTROL_FPCA_MASK);
+ tcg_gen_or_i32(fpca, fpca, aspen);
+ tcg_gen_brcondi_i32(tcg_invert_cond(cond), fpca, 0, label);
+ tcg_temp_free_i32(aspen);
+ tcg_temp_free_i32(fpca);
+}
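
/*
 * Truth sketch (not part of the patch): after the and/xor/or sequence
 * the combined temporary is zero exactly when fpInactive is true:
 *
 *   ASPEN FPCA | aspen ^ ASPEN_MASK  fpca  or'd  fpInactive
 *     1     0  |         0             0     0      yes
 *     1     1  |         0           !=0   !=0      no
 *     0     x  |       !=0             -   !=0      no
 */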
+
+static bool gen_M_fp_sysreg_write(DisasContext *s, int regno,
+ fp_sysreg_loadfn *loadfn,
+ void *opaque)
+{
+ /* Do a write to an M-profile floating point system register */
+ TCGv_i32 tmp;
+ TCGLabel *lab_end = NULL;
+
+ switch (fp_sysreg_checks(s, regno)) {
+ case FPSysRegCheckFailed:
+ return false;
+ case FPSysRegCheckDone:
+ return true;
+ case FPSysRegCheckContinue:
+ break;
+ }
+
+ switch (regno) {
+ case ARM_VFP_FPSCR:
+ tmp = loadfn(s, opaque, true);
+ gen_helper_vfp_set_fpscr(cpu_env, tmp);
+ tcg_temp_free_i32(tmp);
+ gen_lookup_tb(s);
+ break;
+ case ARM_VFP_FPSCR_NZCVQC:
+ {
+ TCGv_i32 fpscr;
+ tmp = loadfn(s, opaque, true);
+ if (dc_isar_feature(aa32_mve, s)) {
+ /* QC is only present for MVE; otherwise RES0 */
+ TCGv_i32 qc = tcg_temp_new_i32();
+ tcg_gen_andi_i32(qc, tmp, FPCR_QC);
+ /*
+ * The 4 vfp.qc[] fields need only be "zero" vs "non-zero";
+ * here writing the same value into all elements is simplest.
+ */
+ tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUARMState, vfp.qc),
+ 16, 16, qc);
+ }
+ tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK);
+ fpscr = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]);
+ tcg_gen_andi_i32(fpscr, fpscr, ~FPCR_NZCV_MASK);
+ tcg_gen_or_i32(fpscr, fpscr, tmp);
+ store_cpu_field(fpscr, vfp.xregs[ARM_VFP_FPSCR]);
+ tcg_temp_free_i32(tmp);
+ break;
+ }
+ case ARM_VFP_FPCXT_NS:
+ {
+ TCGLabel *lab_active = gen_new_label();
+
+ lab_end = gen_new_label();
+ gen_branch_fpInactive(s, TCG_COND_EQ, lab_active);
+ /*
+ * fpInactive case: write is a NOP, so only do side effects
+ * like register writeback before we branch to end
+ */
+ loadfn(s, opaque, false);
+ tcg_gen_br(lab_end);
+
+ gen_set_label(lab_active);
+ /*
+ * !fpInactive: if FPU disabled, take NOCP exception;
+ * otherwise PreserveFPState(), and then FPCXT_NS writes
+ * behave the same as FPCXT_S writes.
+ */
+ if (!vfp_access_check_m(s, true)) {
+ /*
+ * This was only a conditional exception, so override
+ * gen_exception_insn()'s default to DISAS_NORETURN
+ */
+ s->base.is_jmp = DISAS_NEXT;
+ break;
+ }
+ }
+ /* fall through */
+ case ARM_VFP_FPCXT_S:
+ {
+ TCGv_i32 sfpa, control;
+ /*
+ * Set FPSCR and CONTROL.SFPA from value; the new FPSCR takes
+ * bits [27:0] from value and zeroes bits [31:28].
+ */
+ tmp = loadfn(s, opaque, true);
+ sfpa = tcg_temp_new_i32();
+ tcg_gen_shri_i32(sfpa, tmp, 31);
+ control = load_cpu_field(v7m.control[M_REG_S]);
+ tcg_gen_deposit_i32(control, control, sfpa,
+ R_V7M_CONTROL_SFPA_SHIFT, 1);
+ store_cpu_field(control, v7m.control[M_REG_S]);
+ tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK);
+ gen_helper_vfp_set_fpscr(cpu_env, tmp);
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(sfpa);
+ break;
+ }
+ case ARM_VFP_VPR:
+ /* Behaves as NOP if not privileged */
+ if (IS_USER(s)) {
+ loadfn(s, opaque, false);
+ break;
+ }
+ tmp = loadfn(s, opaque, true);
+ store_cpu_field(tmp, v7m.vpr);
+ break;
+ case ARM_VFP_P0:
+ {
+ TCGv_i32 vpr;
+ tmp = loadfn(s, opaque, true);
+ vpr = load_cpu_field(v7m.vpr);
+ tcg_gen_deposit_i32(vpr, vpr, tmp,
+ R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH);
+ store_cpu_field(vpr, v7m.vpr);
+ tcg_temp_free_i32(tmp);
+ break;
+ }
+ default:
+ g_assert_not_reached();
+ }
+ if (lab_end) {
+ gen_set_label(lab_end);
+ }
+ return true;
+}
+
+static bool gen_M_fp_sysreg_read(DisasContext *s, int regno,
+ fp_sysreg_storefn *storefn,
+ void *opaque)
+{
+ /* Do a read from an M-profile floating point system register */
+ TCGv_i32 tmp;
+ TCGLabel *lab_end = NULL;
+ bool lookup_tb = false;
+
+ switch (fp_sysreg_checks(s, regno)) {
+ case FPSysRegCheckFailed:
+ return false;
+ case FPSysRegCheckDone:
+ return true;
+ case FPSysRegCheckContinue:
+ break;
+ }
+
+ if (regno == ARM_VFP_FPSCR_NZCVQC && !dc_isar_feature(aa32_mve, s)) {
+ /* QC is RES0 without MVE, so NZCVQC simplifies to NZCV */
+ regno = QEMU_VFP_FPSCR_NZCV;
+ }
+
+ switch (regno) {
+ case ARM_VFP_FPSCR:
+ tmp = tcg_temp_new_i32();
+ gen_helper_vfp_get_fpscr(tmp, cpu_env);
+ storefn(s, opaque, tmp, true);
+ break;
+ case ARM_VFP_FPSCR_NZCVQC:
+ tmp = tcg_temp_new_i32();
+ gen_helper_vfp_get_fpscr(tmp, cpu_env);
+ tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK);
+ storefn(s, opaque, tmp, true);
+ break;
+ case QEMU_VFP_FPSCR_NZCV:
+ /*
+ * Read just NZCV; this is a special case to avoid the
+ * helper call for the "VMRS to CPSR.NZCV" insn.
+ */
+ tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]);
+ tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK);
+ storefn(s, opaque, tmp, true);
+ break;
+ case ARM_VFP_FPCXT_S:
+ {
+ TCGv_i32 control, sfpa, fpscr;
+ /* Bits [27:0] from FPSCR, bit [31] from CONTROL.SFPA */
+ tmp = tcg_temp_new_i32();
+ sfpa = tcg_temp_new_i32();
+ gen_helper_vfp_get_fpscr(tmp, cpu_env);
+ tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK);
+ control = load_cpu_field(v7m.control[M_REG_S]);
+ tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK);
+ tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT);
+ tcg_gen_or_i32(tmp, tmp, sfpa);
+ tcg_temp_free_i32(sfpa);
+ /*
+ * Store result before updating FPSCR etc, in case
+ * it is a memory write which causes an exception.
+ */
+ storefn(s, opaque, tmp, true);
+ /*
+ * Now we must reset FPSCR from FPDSCR_NS, and clear
+ * CONTROL.SFPA; so we'll end the TB here.
+ */
+ tcg_gen_andi_i32(control, control, ~R_V7M_CONTROL_SFPA_MASK);
+ store_cpu_field(control, v7m.control[M_REG_S]);
+ fpscr = load_cpu_field(v7m.fpdscr[M_REG_NS]);
+ gen_helper_vfp_set_fpscr(cpu_env, fpscr);
+ tcg_temp_free_i32(fpscr);
+ lookup_tb = true;
+ break;
+ }
+ case ARM_VFP_FPCXT_NS:
+ {
+ TCGv_i32 control, sfpa, fpscr, fpdscr, zero;
+ TCGLabel *lab_active = gen_new_label();
+
+ lookup_tb = true;
+
+ gen_branch_fpInactive(s, TCG_COND_EQ, lab_active);
+ /* fpInactive case: reads as FPDSCR_NS */
+ TCGv_i32 tmp = load_cpu_field(v7m.fpdscr[M_REG_NS]);
+ storefn(s, opaque, tmp, true);
+ lab_end = gen_new_label();
+ tcg_gen_br(lab_end);
+
+ gen_set_label(lab_active);
+ /*
+ * !fpInactive: if FPU disabled, take NOCP exception;
+ * otherwise PreserveFPState(), and then FPCXT_NS
+ * reads the same as FPCXT_S.
+ */
+ if (!vfp_access_check_m(s, true)) {
+ /*
+ * This was only a conditional exception, so override
+ * gen_exception_insn()'s default to DISAS_NORETURN
+ */
+ s->base.is_jmp = DISAS_NEXT;
+ break;
+ }
+ tmp = tcg_temp_new_i32();
+ sfpa = tcg_temp_new_i32();
+ fpscr = tcg_temp_new_i32();
+ gen_helper_vfp_get_fpscr(fpscr, cpu_env);
+ tcg_gen_andi_i32(tmp, fpscr, ~FPCR_NZCV_MASK);
+ control = load_cpu_field(v7m.control[M_REG_S]);
+ tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK);
+ tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT);
+ tcg_gen_or_i32(tmp, tmp, sfpa);
+ tcg_temp_free_i32(control);
+ /* Store result before updating FPSCR, in case it faults */
+ storefn(s, opaque, tmp, true);
+ /* If SFPA is zero then set FPSCR from FPDSCR_NS */
+ fpdscr = load_cpu_field(v7m.fpdscr[M_REG_NS]);
+ zero = tcg_const_i32(0);
+ tcg_gen_movcond_i32(TCG_COND_EQ, fpscr, sfpa, zero, fpdscr, fpscr);
+ gen_helper_vfp_set_fpscr(cpu_env, fpscr);
+ tcg_temp_free_i32(zero);
+ tcg_temp_free_i32(sfpa);
+ tcg_temp_free_i32(fpdscr);
+ tcg_temp_free_i32(fpscr);
+ break;
+ }
+ case ARM_VFP_VPR:
+ /* Behaves as NOP if not privileged */
+ if (IS_USER(s)) {
+ storefn(s, opaque, NULL, false);
+ break;
+ }
+ tmp = load_cpu_field(v7m.vpr);
+ storefn(s, opaque, tmp, true);
+ break;
+ case ARM_VFP_P0:
+ tmp = load_cpu_field(v7m.vpr);
+ tcg_gen_extract_i32(tmp, tmp, R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH);
+ storefn(s, opaque, tmp, true);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ if (lab_end) {
+ gen_set_label(lab_end);
+ }
+ if (lookup_tb) {
+ gen_lookup_tb(s);
+ }
+ return true;
+}
+
+static void fp_sysreg_to_gpr(DisasContext *s, void *opaque, TCGv_i32 value,
+ bool do_access)
+{
+ arg_VMSR_VMRS *a = opaque;
+
+ if (!do_access) {
+ return;
+ }
+
+ if (a->rt == 15) {
+ /* Set the 4 flag bits in the CPSR */
+ gen_set_nzcv(value);
+ tcg_temp_free_i32(value);
+ } else {
+ store_reg(s, a->rt, value);
+ }
+}
+
+static TCGv_i32 gpr_to_fp_sysreg(DisasContext *s, void *opaque, bool do_access)
+{
+ arg_VMSR_VMRS *a = opaque;
+
+ if (!do_access) {
+ return NULL;
+ }
+ return load_reg(s, a->rt);
+}
+
+static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a)
+{
+ /*
+ * Accesses to R15 are UNPREDICTABLE; we choose to undef.
+ * FPSCR -> r15 is a special case which writes to the PSR flags;
+ * set a->reg to a special value to tell gen_M_fp_sysreg_read()
+ * we only care about the top 4 bits of FPSCR there.
+ */
+ if (a->rt == 15) {
+ if (a->l && a->reg == ARM_VFP_FPSCR) {
+ a->reg = QEMU_VFP_FPSCR_NZCV;
+ } else {
+ return false;
+ }
+ }
+
+ if (a->l) {
+ /* VMRS, move FP system register to gp register */
+ return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_gpr, a);
+ } else {
+ /* VMSR, move gp register to FP system register */
+ return gen_M_fp_sysreg_write(s, a->reg, gpr_to_fp_sysreg, a);
+ }
+}
+
+static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value,
+ bool do_access)
+{
+ arg_vldr_sysreg *a = opaque;
+ uint32_t offset = a->imm;
+ TCGv_i32 addr;
+
+ if (!a->a) {
+ offset = -offset;
+ }
+
+ if (!do_access && !a->w) {
+ return;
+ }
+
+ addr = load_reg(s, a->rn);
+ if (a->p) {
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+
+ if (s->v8m_stackcheck && a->rn == 13 && a->w) {
+ gen_helper_v8m_stackcheck(cpu_env, addr);
+ }
+
+ if (do_access) {
+ gen_aa32_st_i32(s, value, addr, get_mem_index(s),
+ MO_UL | MO_ALIGN | s->be_data);
+ tcg_temp_free_i32(value);
+ }
+
+ if (a->w) {
+ /* writeback */
+ if (!a->p) {
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+ store_reg(s, a->rn, addr);
+ } else {
+ tcg_temp_free_i32(addr);
+ }
+}
+
+static TCGv_i32 memory_to_fp_sysreg(DisasContext *s, void *opaque,
+ bool do_access)
+{
+ arg_vldr_sysreg *a = opaque;
+ uint32_t offset = a->imm;
+ TCGv_i32 addr;
+ TCGv_i32 value = NULL;
+
+ if (!a->a) {
+ offset = -offset;
+ }
+
+ if (!do_access && !a->w) {
+ return NULL;
+ }
+
+ addr = load_reg(s, a->rn);
+ if (a->p) {
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+
+ if (s->v8m_stackcheck && a->rn == 13 && a->w) {
+ gen_helper_v8m_stackcheck(cpu_env, addr);
+ }
+
+ if (do_access) {
+ value = tcg_temp_new_i32();
+ gen_aa32_ld_i32(s, value, addr, get_mem_index(s),
+ MO_UL | MO_ALIGN | s->be_data);
+ }
+
+ if (a->w) {
+ /* writeback */
+ if (!a->p) {
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+ store_reg(s, a->rn, addr);
+ } else {
+ tcg_temp_free_i32(addr);
+ }
+ return value;
+}
+
+static bool trans_VLDR_sysreg(DisasContext *s, arg_vldr_sysreg *a)
+{
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ return false;
+ }
+ if (a->rn == 15) {
+ return false;
+ }
+ return gen_M_fp_sysreg_write(s, a->reg, memory_to_fp_sysreg, a);
+}
+
+static bool trans_VSTR_sysreg(DisasContext *s, arg_vldr_sysreg *a)
+{
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ return false;
+ }
+ if (a->rn == 15) {
+ return false;
+ }
+ return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_memory, a);
+}
+
static bool trans_NOCP(DisasContext *s, arg_nocp *a)
{
/*
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index e91f526a1a..67462bdf27 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -27,3 +27,762 @@
/* Include the generated decoder */
#include "decode-mve.c.inc"
+
+typedef void MVEGenLdStFn(TCGv_ptr, TCGv_ptr, TCGv_i32);
+typedef void MVEGenOneOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr);
+typedef void MVEGenTwoOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr);
+typedef void MVEGenTwoOpScalarFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+typedef void MVEGenDualAccOpFn(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64);
+typedef void MVEGenVADDVFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32);
+
+/* Return the offset of a Qn register (same semantics as aa32_vfp_qreg()) */
+static inline long mve_qreg_offset(unsigned reg)
+{
+ return offsetof(CPUARMState, vfp.zregs[reg].d[0]);
+}
+
+static TCGv_ptr mve_qreg_ptr(unsigned reg)
+{
+ TCGv_ptr ret = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(ret, cpu_env, mve_qreg_offset(reg));
+ return ret;
+}
+
+static bool mve_check_qreg_bank(DisasContext *s, int qmask)
+{
+ /*
+ * Check whether Qregs are in range. For v8.1M only Q0..Q7
+ * are supported, see VFPSmallRegisterBank().
+ */
+ return qmask < 8;
+}
+
+bool mve_eci_check(DisasContext *s)
+{
+ /*
+ * This is a beatwise insn: check that ECI is valid (not a
+ * reserved value) and note that we are handling it.
+ * Return true if OK, false if we generated an exception.
+ */
+ s->eci_handled = true;
+ switch (s->eci) {
+ case ECI_NONE:
+ case ECI_A0:
+ case ECI_A0A1:
+ case ECI_A0A1A2:
+ case ECI_A0A1A2B0:
+ return true;
+ default:
+ /* Reserved value: INVSTATE UsageFault */
+ gen_exception_insn(s, s->pc_curr, EXCP_INVSTATE, syn_uncategorized(),
+ default_exception_el(s));
+ return false;
+ }
+}
+
+static void mve_update_eci(DisasContext *s)
+{
+ /*
+ * The helper function will always update the CPUState field,
+ * so we only need to update the DisasContext field.
+ */
+ if (s->eci) {
+ s->eci = (s->eci == ECI_A0A1A2B0) ? ECI_A0 : ECI_NONE;
+ }
+}
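
/*
 * Transition sketch (not part of the patch): the only two advances are
 *   ECI_A0A1A2B0 -> ECI_A0    (B0 of the following insn already ran,
 *                              so that insn resumes with A0 done)
 *   anything else -> ECI_NONE
 * mirroring what mve_advance_vpt() does to env->condexec_bits.
 */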
+
+void mve_update_and_store_eci(DisasContext *s)
+{
+ /*
+ * For insns which don't call a helper function that will call
+ * mve_advance_vpt(), this version updates s->eci and also stores
+ * it out to the CPUState field.
+ */
+ if (s->eci) {
+ mve_update_eci(s);
+ store_cpu_field(tcg_constant_i32(s->eci << 4), condexec_bits);
+ }
+}
+
+static bool mve_skip_first_beat(DisasContext *s)
+{
+ /* Return true if PSR.ECI says we must skip the first beat of this insn */
+ switch (s->eci) {
+ case ECI_NONE:
+ return false;
+ case ECI_A0:
+ case ECI_A0A1:
+ case ECI_A0A1A2:
+ case ECI_A0A1A2B0:
+ return true;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static bool do_ldst(DisasContext *s, arg_VLDR_VSTR *a, MVEGenLdStFn *fn)
+{
+ TCGv_i32 addr;
+ uint32_t offset;
+ TCGv_ptr qreg;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd) ||
+ !fn) {
+ return false;
+ }
+
+ /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */
+ if (a->rn == 15 || (a->rn == 13 && a->w)) {
+ return false;
+ }
+
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ offset = a->imm << a->size;
+ if (!a->a) {
+ offset = -offset;
+ }
+ addr = load_reg(s, a->rn);
+ if (a->p) {
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+
+ qreg = mve_qreg_ptr(a->qd);
+ fn(cpu_env, qreg, addr);
+ tcg_temp_free_ptr(qreg);
+
+ /*
+ * Writeback always happens after the last beat of the insn,
+ * regardless of predication
+ */
+ if (a->w) {
+ if (!a->p) {
+ tcg_gen_addi_i32(addr, addr, offset);
+ }
+ store_reg(s, a->rn, addr);
+ } else {
+ tcg_temp_free_i32(addr);
+ }
+ mve_update_eci(s);
+ return true;
+}
+
+static bool trans_VLDR_VSTR(DisasContext *s, arg_VLDR_VSTR *a)
+{
+ static MVEGenLdStFn * const ldstfns[4][2] = {
+ { gen_helper_mve_vstrb, gen_helper_mve_vldrb },
+ { gen_helper_mve_vstrh, gen_helper_mve_vldrh },
+ { gen_helper_mve_vstrw, gen_helper_mve_vldrw },
+ { NULL, NULL }
+ };
+ return do_ldst(s, a, ldstfns[a->size][a->l]);
+}
+
+#define DO_VLDST_WIDE_NARROW(OP, SLD, ULD, ST) \
+ static bool trans_##OP(DisasContext *s, arg_VLDR_VSTR *a) \
+ { \
+ static MVEGenLdStFn * const ldstfns[2][2] = { \
+ { gen_helper_mve_##ST, gen_helper_mve_##SLD }, \
+ { NULL, gen_helper_mve_##ULD }, \
+ }; \
+ return do_ldst(s, a, ldstfns[a->u][a->l]); \
+ }
+
+DO_VLDST_WIDE_NARROW(VLDSTB_H, vldrb_sh, vldrb_uh, vstrb_h)
+DO_VLDST_WIDE_NARROW(VLDSTB_W, vldrb_sw, vldrb_uw, vstrb_w)
+DO_VLDST_WIDE_NARROW(VLDSTH_W, vldrh_sw, vldrh_uw, vstrh_w)
+
+static bool trans_VDUP(DisasContext *s, arg_VDUP *a)
+{
+ TCGv_ptr qd;
+ TCGv_i32 rt;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd)) {
+ return false;
+ }
+ if (a->rt == 13 || a->rt == 15) {
+ /* UNPREDICTABLE; we choose to UNDEF */
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qd = mve_qreg_ptr(a->qd);
+ rt = load_reg(s, a->rt);
+ tcg_gen_dup_i32(a->size, rt, rt);
+ gen_helper_mve_vdup(cpu_env, qd, rt);
+ tcg_temp_free_ptr(qd);
+ tcg_temp_free_i32(rt);
+ mve_update_eci(s);
+ return true;
+}
+
+static bool do_1op(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn)
+{
+ TCGv_ptr qd, qm;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd | a->qm) ||
+ !fn) {
+ return false;
+ }
+
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qd = mve_qreg_ptr(a->qd);
+ qm = mve_qreg_ptr(a->qm);
+ fn(cpu_env, qd, qm);
+ tcg_temp_free_ptr(qd);
+ tcg_temp_free_ptr(qm);
+ mve_update_eci(s);
+ return true;
+}
+
+#define DO_1OP(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_1op *a) \
+ { \
+ static MVEGenOneOpFn * const fns[] = { \
+ gen_helper_mve_##FN##b, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##w, \
+ NULL, \
+ }; \
+ return do_1op(s, a, fns[a->size]); \
+ }
+
+DO_1OP(VCLZ, vclz)
+DO_1OP(VCLS, vcls)
+DO_1OP(VABS, vabs)
+DO_1OP(VNEG, vneg)
+
+static bool trans_VREV16(DisasContext *s, arg_1op *a)
+{
+ static MVEGenOneOpFn * const fns[] = {
+ gen_helper_mve_vrev16b,
+ NULL,
+ NULL,
+ NULL,
+ };
+ return do_1op(s, a, fns[a->size]);
+}
+
+static bool trans_VREV32(DisasContext *s, arg_1op *a)
+{
+ static MVEGenOneOpFn * const fns[] = {
+ gen_helper_mve_vrev32b,
+ gen_helper_mve_vrev32h,
+ NULL,
+ NULL,
+ };
+ return do_1op(s, a, fns[a->size]);
+}
+
+static bool trans_VREV64(DisasContext *s, arg_1op *a)
+{
+ static MVEGenOneOpFn * const fns[] = {
+ gen_helper_mve_vrev64b,
+ gen_helper_mve_vrev64h,
+ gen_helper_mve_vrev64w,
+ NULL,
+ };
+ return do_1op(s, a, fns[a->size]);
+}
+
+static bool trans_VMVN(DisasContext *s, arg_1op *a)
+{
+ return do_1op(s, a, gen_helper_mve_vmvn);
+}
+
+static bool trans_VABS_fp(DisasContext *s, arg_1op *a)
+{
+ static MVEGenOneOpFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vfabsh,
+ gen_helper_mve_vfabss,
+ NULL,
+ };
+ if (!dc_isar_feature(aa32_mve_fp, s)) {
+ return false;
+ }
+ return do_1op(s, a, fns[a->size]);
+}
+
+static bool trans_VNEG_fp(DisasContext *s, arg_1op *a)
+{
+ static MVEGenOneOpFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vfnegh,
+ gen_helper_mve_vfnegs,
+ NULL,
+ };
+ if (!dc_isar_feature(aa32_mve_fp, s)) {
+ return false;
+ }
+ return do_1op(s, a, fns[a->size]);
+}
+
+static bool do_2op(DisasContext *s, arg_2op *a, MVEGenTwoOpFn fn)
+{
+ TCGv_ptr qd, qn, qm;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd | a->qn | a->qm) ||
+ !fn) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qd = mve_qreg_ptr(a->qd);
+ qn = mve_qreg_ptr(a->qn);
+ qm = mve_qreg_ptr(a->qm);
+ fn(cpu_env, qd, qn, qm);
+ tcg_temp_free_ptr(qd);
+ tcg_temp_free_ptr(qn);
+ tcg_temp_free_ptr(qm);
+ mve_update_eci(s);
+ return true;
+}
+
+#define DO_LOGIC(INSN, HELPER) \
+ static bool trans_##INSN(DisasContext *s, arg_2op *a) \
+ { \
+ return do_2op(s, a, HELPER); \
+ }
+
+DO_LOGIC(VAND, gen_helper_mve_vand)
+DO_LOGIC(VBIC, gen_helper_mve_vbic)
+DO_LOGIC(VORR, gen_helper_mve_vorr)
+DO_LOGIC(VORN, gen_helper_mve_vorn)
+DO_LOGIC(VEOR, gen_helper_mve_veor)
+
+#define DO_2OP(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_2op *a) \
+ { \
+ static MVEGenTwoOpFn * const fns[] = { \
+ gen_helper_mve_##FN##b, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##w, \
+ NULL, \
+ }; \
+ return do_2op(s, a, fns[a->size]); \
+ }
+
+DO_2OP(VADD, vadd)
+DO_2OP(VSUB, vsub)
+DO_2OP(VMUL, vmul)
+DO_2OP(VMULH_S, vmulhs)
+DO_2OP(VMULH_U, vmulhu)
+DO_2OP(VRMULH_S, vrmulhs)
+DO_2OP(VRMULH_U, vrmulhu)
+DO_2OP(VMAX_S, vmaxs)
+DO_2OP(VMAX_U, vmaxu)
+DO_2OP(VMIN_S, vmins)
+DO_2OP(VMIN_U, vminu)
+DO_2OP(VABD_S, vabds)
+DO_2OP(VABD_U, vabdu)
+DO_2OP(VHADD_S, vhadds)
+DO_2OP(VHADD_U, vhaddu)
+DO_2OP(VHSUB_S, vhsubs)
+DO_2OP(VHSUB_U, vhsubu)
+DO_2OP(VMULL_BS, vmullbs)
+DO_2OP(VMULL_BU, vmullbu)
+DO_2OP(VMULL_TS, vmullts)
+DO_2OP(VMULL_TU, vmulltu)
+DO_2OP(VQDMULH, vqdmulh)
+DO_2OP(VQRDMULH, vqrdmulh)
+DO_2OP(VQADD_S, vqadds)
+DO_2OP(VQADD_U, vqaddu)
+DO_2OP(VQSUB_S, vqsubs)
+DO_2OP(VQSUB_U, vqsubu)
+DO_2OP(VSHL_S, vshls)
+DO_2OP(VSHL_U, vshlu)
+DO_2OP(VRSHL_S, vrshls)
+DO_2OP(VRSHL_U, vrshlu)
+DO_2OP(VQSHL_S, vqshls)
+DO_2OP(VQSHL_U, vqshlu)
+DO_2OP(VQRSHL_S, vqrshls)
+DO_2OP(VQRSHL_U, vqrshlu)
+DO_2OP(VQDMLADH, vqdmladh)
+DO_2OP(VQDMLADHX, vqdmladhx)
+DO_2OP(VQRDMLADH, vqrdmladh)
+DO_2OP(VQRDMLADHX, vqrdmladhx)
+DO_2OP(VQDMLSDH, vqdmlsdh)
+DO_2OP(VQDMLSDHX, vqdmlsdhx)
+DO_2OP(VQRDMLSDH, vqrdmlsdh)
+DO_2OP(VQRDMLSDHX, vqrdmlsdhx)
+DO_2OP(VRHADD_S, vrhadds)
+DO_2OP(VRHADD_U, vrhaddu)
+/*
+ * VCADD Qd == Qm at size MO_32 is UNPREDICTABLE; we choose not to diagnose
+ * so we can reuse the DO_2OP macro. (Our implementation calculates the
+ * "expected" results in this case.) Similarly for VHCADD.
+ */
+DO_2OP(VCADD90, vcadd90)
+DO_2OP(VCADD270, vcadd270)
+DO_2OP(VHCADD90, vhcadd90)
+DO_2OP(VHCADD270, vhcadd270)
+
+static bool trans_VQDMULLB(DisasContext *s, arg_2op *a)
+{
+ static MVEGenTwoOpFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vqdmullbh,
+ gen_helper_mve_vqdmullbw,
+ NULL,
+ };
+ if (a->size == MO_32 && (a->qd == a->qm || a->qd == a->qn)) {
+        /* UNPREDICTABLE; we choose to UNDEF */
+ return false;
+ }
+ return do_2op(s, a, fns[a->size]);
+}
+
+static bool trans_VQDMULLT(DisasContext *s, arg_2op *a)
+{
+ static MVEGenTwoOpFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vqdmullth,
+ gen_helper_mve_vqdmulltw,
+ NULL,
+ };
+ if (a->size == MO_32 && (a->qd == a->qm || a->qd == a->qn)) {
+        /* UNPREDICTABLE; we choose to UNDEF */
+ return false;
+ }
+ return do_2op(s, a, fns[a->size]);
+}
+
+/*
+ * VADC and VSBC: these perform an add-with-carry or subtract-with-carry
+ * of the 32-bit elements in each lane of the input vectors, where the
+ * carry-out of each add is the carry-in of the next. The initial carry
+ * input is either fixed (0 for VADCI, 1 for VSBCI) or is from FPSCR.C
+ * (for VADC and VSBC); the carry out at the end is written back to FPSCR.C.
+ * These insns are subject to beat-wise execution. Partial execution
+ * of an I=1 (initial carry input fixed) insn which does not
+ * execute the first beat must start with the current FPSCR.C
+ * value, not the fixed constant input.
+ */
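+/*
+ * A concrete sketch of that chained carry (editor's note, following
+ * the description above): for VADC with 32-bit lanes and carry-in C
+ * taken from FPSCR.C,
+ *
+ *   t = (uint64_t)Qn[0] + Qm[0] + C;  Qd[0] = t;  C = t >> 32;
+ *   t = (uint64_t)Qn[1] + Qm[1] + C;  Qd[1] = t;  C = t >> 32;
+ *   ...and so on through lane 3, after which C is written to FPSCR.C.
+ */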
+static bool trans_VADC(DisasContext *s, arg_2op *a)
+{
+ return do_2op(s, a, gen_helper_mve_vadc);
+}
+
+static bool trans_VADCI(DisasContext *s, arg_2op *a)
+{
+ if (mve_skip_first_beat(s)) {
+ return trans_VADC(s, a);
+ }
+ return do_2op(s, a, gen_helper_mve_vadci);
+}
+
+static bool trans_VSBC(DisasContext *s, arg_2op *a)
+{
+ return do_2op(s, a, gen_helper_mve_vsbc);
+}
+
+static bool trans_VSBCI(DisasContext *s, arg_2op *a)
+{
+ if (mve_skip_first_beat(s)) {
+ return trans_VSBC(s, a);
+ }
+ return do_2op(s, a, gen_helper_mve_vsbci);
+}
+
+static bool do_2op_scalar(DisasContext *s, arg_2scalar *a,
+ MVEGenTwoOpScalarFn fn)
+{
+ TCGv_ptr qd, qn;
+ TCGv_i32 rm;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qd | a->qn) ||
+ !fn) {
+ return false;
+ }
+ if (a->rm == 13 || a->rm == 15) {
+ /* UNPREDICTABLE */
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qd = mve_qreg_ptr(a->qd);
+ qn = mve_qreg_ptr(a->qn);
+ rm = load_reg(s, a->rm);
+ fn(cpu_env, qd, qn, rm);
+ tcg_temp_free_i32(rm);
+ tcg_temp_free_ptr(qd);
+ tcg_temp_free_ptr(qn);
+ mve_update_eci(s);
+ return true;
+}
+
+#define DO_2OP_SCALAR(INSN, FN) \
+ static bool trans_##INSN(DisasContext *s, arg_2scalar *a) \
+ { \
+ static MVEGenTwoOpScalarFn * const fns[] = { \
+ gen_helper_mve_##FN##b, \
+ gen_helper_mve_##FN##h, \
+ gen_helper_mve_##FN##w, \
+ NULL, \
+ }; \
+ return do_2op_scalar(s, a, fns[a->size]); \
+ }
+
+DO_2OP_SCALAR(VADD_scalar, vadd_scalar)
+DO_2OP_SCALAR(VSUB_scalar, vsub_scalar)
+DO_2OP_SCALAR(VMUL_scalar, vmul_scalar)
+DO_2OP_SCALAR(VHADD_S_scalar, vhadds_scalar)
+DO_2OP_SCALAR(VHADD_U_scalar, vhaddu_scalar)
+DO_2OP_SCALAR(VHSUB_S_scalar, vhsubs_scalar)
+DO_2OP_SCALAR(VHSUB_U_scalar, vhsubu_scalar)
+DO_2OP_SCALAR(VQADD_S_scalar, vqadds_scalar)
+DO_2OP_SCALAR(VQADD_U_scalar, vqaddu_scalar)
+DO_2OP_SCALAR(VQSUB_S_scalar, vqsubs_scalar)
+DO_2OP_SCALAR(VQSUB_U_scalar, vqsubu_scalar)
+DO_2OP_SCALAR(VQDMULH_scalar, vqdmulh_scalar)
+DO_2OP_SCALAR(VQRDMULH_scalar, vqrdmulh_scalar)
+DO_2OP_SCALAR(VBRSR, vbrsr)
+
+static bool trans_VQDMULLB_scalar(DisasContext *s, arg_2scalar *a)
+{
+ static MVEGenTwoOpScalarFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vqdmullb_scalarh,
+ gen_helper_mve_vqdmullb_scalarw,
+ NULL,
+ };
+ if (a->qd == a->qn && a->size == MO_32) {
+        /* UNPREDICTABLE; we choose to UNDEF */
+ return false;
+ }
+ return do_2op_scalar(s, a, fns[a->size]);
+}
+
+static bool trans_VQDMULLT_scalar(DisasContext *s, arg_2scalar *a)
+{
+ static MVEGenTwoOpScalarFn * const fns[] = {
+ NULL,
+ gen_helper_mve_vqdmullt_scalarh,
+ gen_helper_mve_vqdmullt_scalarw,
+ NULL,
+ };
+ if (a->qd == a->qn && a->size == MO_32) {
+        /* UNPREDICTABLE; we choose to UNDEF */
+ return false;
+ }
+ return do_2op_scalar(s, a, fns[a->size]);
+}
+
+static bool do_long_dual_acc(DisasContext *s, arg_vmlaldav *a,
+ MVEGenDualAccOpFn *fn)
+{
+ TCGv_ptr qn, qm;
+ TCGv_i64 rda;
+ TCGv_i32 rdalo, rdahi;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !mve_check_qreg_bank(s, a->qn | a->qm) ||
+ !fn) {
+ return false;
+ }
+ /*
+ * rdahi == 13 is UNPREDICTABLE; rdahi == 15 is a related
+     * encoding; rdalo always has bit 0 clear so it cannot be 13 or 15.
+ */
+ if (a->rdahi == 13 || a->rdahi == 15) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ qn = mve_qreg_ptr(a->qn);
+ qm = mve_qreg_ptr(a->qm);
+
+ /*
+ * This insn is subject to beat-wise execution. Partial execution
+ * of an A=0 (no-accumulate) insn which does not execute the first
+ * beat must start with the current rda value, not 0.
+ */
+ if (a->a || mve_skip_first_beat(s)) {
+ rda = tcg_temp_new_i64();
+ rdalo = load_reg(s, a->rdalo);
+ rdahi = load_reg(s, a->rdahi);
+ tcg_gen_concat_i32_i64(rda, rdalo, rdahi);
+ tcg_temp_free_i32(rdalo);
+ tcg_temp_free_i32(rdahi);
+ } else {
+ rda = tcg_const_i64(0);
+ }
+
+ fn(rda, cpu_env, qn, qm, rda);
+ tcg_temp_free_ptr(qn);
+ tcg_temp_free_ptr(qm);
+
+ rdalo = tcg_temp_new_i32();
+ rdahi = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(rdalo, rda);
+ tcg_gen_extrh_i64_i32(rdahi, rda);
+ store_reg(s, a->rdalo, rdalo);
+ store_reg(s, a->rdahi, rdahi);
+ tcg_temp_free_i64(rda);
+ mve_update_eci(s);
+ return true;
+}
+
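+/*
+ * Editor's note: rdalo/rdahi hold the low and high halves of the
+ * 64-bit accumulator, so e.g. rdalo = 0xffffffff, rdahi = 0x1
+ * concatenate to rda = 0x1ffffffff before the helper runs, and the
+ * result is split back into the two registers the same way.
+ */
+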
+static bool trans_VMLALDAV_S(DisasContext *s, arg_vmlaldav *a)
+{
+ static MVEGenDualAccOpFn * const fns[4][2] = {
+ { NULL, NULL },
+ { gen_helper_mve_vmlaldavsh, gen_helper_mve_vmlaldavxsh },
+ { gen_helper_mve_vmlaldavsw, gen_helper_mve_vmlaldavxsw },
+ { NULL, NULL },
+ };
+ return do_long_dual_acc(s, a, fns[a->size][a->x]);
+}
+
+static bool trans_VMLALDAV_U(DisasContext *s, arg_vmlaldav *a)
+{
+ static MVEGenDualAccOpFn * const fns[4][2] = {
+ { NULL, NULL },
+ { gen_helper_mve_vmlaldavuh, NULL },
+ { gen_helper_mve_vmlaldavuw, NULL },
+ { NULL, NULL },
+ };
+ return do_long_dual_acc(s, a, fns[a->size][a->x]);
+}
+
+static bool trans_VMLSLDAV(DisasContext *s, arg_vmlaldav *a)
+{
+ static MVEGenDualAccOpFn * const fns[4][2] = {
+ { NULL, NULL },
+ { gen_helper_mve_vmlsldavsh, gen_helper_mve_vmlsldavxsh },
+ { gen_helper_mve_vmlsldavsw, gen_helper_mve_vmlsldavxsw },
+ { NULL, NULL },
+ };
+ return do_long_dual_acc(s, a, fns[a->size][a->x]);
+}
+
+static bool trans_VRMLALDAVH_S(DisasContext *s, arg_vmlaldav *a)
+{
+ static MVEGenDualAccOpFn * const fns[] = {
+ gen_helper_mve_vrmlaldavhsw, gen_helper_mve_vrmlaldavhxsw,
+ };
+ return do_long_dual_acc(s, a, fns[a->x]);
+}
+
+static bool trans_VRMLALDAVH_U(DisasContext *s, arg_vmlaldav *a)
+{
+ static MVEGenDualAccOpFn * const fns[] = {
+ gen_helper_mve_vrmlaldavhuw, NULL,
+ };
+ return do_long_dual_acc(s, a, fns[a->x]);
+}
+
+static bool trans_VRMLSLDAVH(DisasContext *s, arg_vmlaldav *a)
+{
+ static MVEGenDualAccOpFn * const fns[] = {
+ gen_helper_mve_vrmlsldavhsw, gen_helper_mve_vrmlsldavhxsw,
+ };
+ return do_long_dual_acc(s, a, fns[a->x]);
+}
+
+static bool trans_VPST(DisasContext *s, arg_VPST *a)
+{
+ TCGv_i32 vpr;
+
+ /* mask == 0 is a "related encoding" */
+ if (!dc_isar_feature(aa32_mve, s) || !a->mask) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+ /*
+ * Set the VPR mask fields. We take advantage of MASK01 and MASK23
+ * being adjacent fields in the register.
+ *
+ * This insn is not predicated, but it is subject to beat-wise
+ * execution, and the mask is updated on the odd-numbered beats.
+ * So if PSR.ECI says we should skip beat 1, we mustn't update the
+ * 01 mask field.
+ */
+ vpr = load_cpu_field(v7m.vpr);
+ switch (s->eci) {
+ case ECI_NONE:
+ case ECI_A0:
+ /* Update both 01 and 23 fields */
+ tcg_gen_deposit_i32(vpr, vpr,
+ tcg_constant_i32(a->mask | (a->mask << 4)),
+ R_V7M_VPR_MASK01_SHIFT,
+ R_V7M_VPR_MASK01_LENGTH + R_V7M_VPR_MASK23_LENGTH);
+ break;
+ case ECI_A0A1:
+ case ECI_A0A1A2:
+ case ECI_A0A1A2B0:
+ /* Update only the 23 mask field */
+ tcg_gen_deposit_i32(vpr, vpr,
+ tcg_constant_i32(a->mask),
+ R_V7M_VPR_MASK23_SHIFT, R_V7M_VPR_MASK23_LENGTH);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ store_cpu_field(vpr, v7m.vpr);
+ mve_update_and_store_eci(s);
+ return true;
+}
+
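+/*
+ * VPST deposit example (editor's note, using the field layout the
+ * code above relies on): for mask = 0b0100 with ECI_NONE, the value
+ * deposited is 0b01000100, setting both MASK01 and MASK23 to 0b0100
+ * with a single 8-bit deposit across the two adjacent 4-bit fields.
+ */
+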
+static bool trans_VADDV(DisasContext *s, arg_VADDV *a)
+{
+ /* VADDV: vector add across vector */
+ static MVEGenVADDVFn * const fns[4][2] = {
+ { gen_helper_mve_vaddvsb, gen_helper_mve_vaddvub },
+ { gen_helper_mve_vaddvsh, gen_helper_mve_vaddvuh },
+ { gen_helper_mve_vaddvsw, gen_helper_mve_vaddvuw },
+ { NULL, NULL }
+ };
+ TCGv_ptr qm;
+ TCGv_i32 rda;
+
+ if (!dc_isar_feature(aa32_mve, s) ||
+ a->size == 3) {
+ return false;
+ }
+ if (!mve_eci_check(s) || !vfp_access_check(s)) {
+ return true;
+ }
+
+ /*
+ * This insn is subject to beat-wise execution. Partial execution
+ * of an A=0 (no-accumulate) insn which does not execute the first
+ * beat must start with the current value of Rda, not zero.
+ */
+ if (a->a || mve_skip_first_beat(s)) {
+ /* Accumulate input from Rda */
+ rda = load_reg(s, a->rda);
+ } else {
+ /* Accumulate starting at zero */
+ rda = tcg_const_i32(0);
+ }
+
+ qm = mve_qreg_ptr(a->qm);
+ fns[a->size][a->u](rda, cpu_env, qm, rda);
+ store_reg(s, a->rda, rda);
+ tcg_temp_free_ptr(qm);
+
+ mve_update_eci(s);
+ return true;
+}
diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c
index 01e26a246d..b2991e21ec 100644
--- a/target/arm/translate-vfp.c
+++ b/target/arm/translate-vfp.c
@@ -132,32 +132,75 @@ static void gen_preserve_fp_state(DisasContext *s)
}
/*
- * Check that VFP access is enabled. If it is, do the necessary
- * M-profile lazy-FP handling and then return true.
- * If not, emit code to generate an appropriate exception and
- * return false.
+ * Generate code for M-profile FP context handling: update the
+ * ownership of the FP context, and create a new context if
+ * necessary. This corresponds to the parts of the pseudocode
+ * ExecuteFPCheck() after the initial PreserveFPState() call.
+ */
+static void gen_update_fp_context(DisasContext *s)
+{
+ /* Update ownership of FP context: set FPCCR.S to match current state */
+ if (s->v8m_fpccr_s_wrong) {
+ TCGv_i32 tmp;
+
+ tmp = load_cpu_field(v7m.fpccr[M_REG_S]);
+ if (s->v8m_secure) {
+ tcg_gen_ori_i32(tmp, tmp, R_V7M_FPCCR_S_MASK);
+ } else {
+ tcg_gen_andi_i32(tmp, tmp, ~R_V7M_FPCCR_S_MASK);
+ }
+ store_cpu_field(tmp, v7m.fpccr[M_REG_S]);
+ /* Don't need to do this for any further FP insns in this TB */
+ s->v8m_fpccr_s_wrong = false;
+ }
+
+ if (s->v7m_new_fp_ctxt_needed) {
+ /*
+ * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA,
+ * the FPSCR, and VPR.
+ */
+ TCGv_i32 control, fpscr;
+ uint32_t bits = R_V7M_CONTROL_FPCA_MASK;
+
+ fpscr = load_cpu_field(v7m.fpdscr[s->v8m_secure]);
+ gen_helper_vfp_set_fpscr(cpu_env, fpscr);
+ tcg_temp_free_i32(fpscr);
+ if (dc_isar_feature(aa32_mve, s)) {
+ TCGv_i32 z32 = tcg_const_i32(0);
+ store_cpu_field(z32, v7m.vpr);
+ }
+
+ /*
+ * We don't need to arrange to end the TB, because the only
+ * parts of FPSCR which we cache in the TB flags are the VECLEN
+ * and VECSTRIDE, and those don't exist for M-profile.
+ */
+
+ if (s->v8m_secure) {
+ bits |= R_V7M_CONTROL_SFPA_MASK;
+ }
+ control = load_cpu_field(v7m.control[M_REG_S]);
+ tcg_gen_ori_i32(control, control, bits);
+ store_cpu_field(control, v7m.control[M_REG_S]);
+ /* Don't need to do this for any further FP insns in this TB */
+ s->v7m_new_fp_ctxt_needed = false;
+ }
+}
+
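+/*
+ * Editor's sketch of the net effect above: on the first FP insn of a
+ * new context in Secure state, CONTROL.FPCA and CONTROL.SFPA get set,
+ * FPSCR is loaded from the active FPDSCR, and (with MVE) VPR is
+ * zeroed; none of this requires ending the TB.
+ */
+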
+/*
+ * Check that VFP access is enabled, A-profile specific version.
+ *
+ * If VFP is enabled, return true. If not, emit code to generate an
+ * appropriate exception and return false.
* The ignore_vfp_enabled argument specifies that we should ignore
- * whether VFP is enabled via FPEXC[EN]: this should be true for FMXR/FMRX
+ * whether VFP is enabled via FPEXC.EN: this should be true for FMXR/FMRX
* accesses to FPSID, FPEXC, MVFR0, MVFR1, MVFR2, and false for all other insns.
*/
-static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled)
+static bool vfp_access_check_a(DisasContext *s, bool ignore_vfp_enabled)
{
if (s->fp_excp_el) {
- if (arm_dc_feature(s, ARM_FEATURE_M)) {
- /*
- * M-profile mostly catches the "FPU disabled" case early, in
- * disas_m_nocp(), but a few insns (eg LCTP, WLSTP, DLSTP)
- * which do coprocessor-checks are outside the large ranges of
- * the encoding space handled by the patterns in m-nocp.decode,
- * and for them we may need to raise NOCP here.
- */
- gen_exception_insn(s, s->pc_curr, EXCP_NOCP,
- syn_uncategorized(), s->fp_excp_el);
- } else {
- gen_exception_insn(s, s->pc_curr, EXCP_UDEF,
- syn_fp_access_trap(1, 0xe, false),
- s->fp_excp_el);
- }
+ gen_exception_insn(s, s->pc_curr, EXCP_UDEF,
+ syn_fp_access_trap(1, 0xe, false), s->fp_excp_el);
return false;
}
@@ -166,59 +209,40 @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled)
unallocated_encoding(s);
return false;
}
+ return true;
+}
- if (arm_dc_feature(s, ARM_FEATURE_M)) {
- /* Handle M-profile lazy FP state mechanics */
-
- /* Trigger lazy-state preservation if necessary */
- gen_preserve_fp_state(s);
-
- /* Update ownership of FP context: set FPCCR.S to match current state */
- if (s->v8m_fpccr_s_wrong) {
- TCGv_i32 tmp;
-
- tmp = load_cpu_field(v7m.fpccr[M_REG_S]);
- if (s->v8m_secure) {
- tcg_gen_ori_i32(tmp, tmp, R_V7M_FPCCR_S_MASK);
- } else {
- tcg_gen_andi_i32(tmp, tmp, ~R_V7M_FPCCR_S_MASK);
- }
- store_cpu_field(tmp, v7m.fpccr[M_REG_S]);
- /* Don't need to do this for any further FP insns in this TB */
- s->v8m_fpccr_s_wrong = false;
- }
+/*
+ * Check that VFP access is enabled, M-profile specific version.
+ *
+ * If VFP is enabled, do the necessary M-profile lazy-FP handling and then
+ * return true. If not, emit code to generate an appropriate exception and
+ * return false.
+ * If skip_context_update is true, the "update FP context" part is skipped.
+ */
+bool vfp_access_check_m(DisasContext *s, bool skip_context_update)
+{
+ if (s->fp_excp_el) {
+ /*
+ * M-profile mostly catches the "FPU disabled" case early, in
+ * disas_m_nocp(), but a few insns (eg LCTP, WLSTP, DLSTP)
+ * which do coprocessor-checks are outside the large ranges of
+ * the encoding space handled by the patterns in m-nocp.decode,
+ * and for them we may need to raise NOCP here.
+ */
+ gen_exception_insn(s, s->pc_curr, EXCP_NOCP,
+ syn_uncategorized(), s->fp_excp_el);
+ return false;
+ }
- if (s->v7m_new_fp_ctxt_needed) {
- /*
- * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA,
- * the FPSCR, and VPR.
- */
- TCGv_i32 control, fpscr;
- uint32_t bits = R_V7M_CONTROL_FPCA_MASK;
-
- fpscr = load_cpu_field(v7m.fpdscr[s->v8m_secure]);
- gen_helper_vfp_set_fpscr(cpu_env, fpscr);
- tcg_temp_free_i32(fpscr);
- if (dc_isar_feature(aa32_mve, s)) {
- TCGv_i32 z32 = tcg_const_i32(0);
- store_cpu_field(z32, v7m.vpr);
- }
+ /* Handle M-profile lazy FP state mechanics */
- /*
- * We don't need to arrange to end the TB, because the only
- * parts of FPSCR which we cache in the TB flags are the VECLEN
- * and VECSTRIDE, and those don't exist for M-profile.
- */
+ /* Trigger lazy-state preservation if necessary */
+ gen_preserve_fp_state(s);
- if (s->v8m_secure) {
- bits |= R_V7M_CONTROL_SFPA_MASK;
- }
- control = load_cpu_field(v7m.control[M_REG_S]);
- tcg_gen_ori_i32(control, control, bits);
- store_cpu_field(control, v7m.control[M_REG_S]);
- /* Don't need to do this for any further FP insns in this TB */
- s->v7m_new_fp_ctxt_needed = false;
- }
+ if (!skip_context_update) {
+ /* Update ownership of FP context and create new FP context if needed */
+ gen_update_fp_context(s);
}
return true;
@@ -230,7 +254,11 @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled)
*/
bool vfp_access_check(DisasContext *s)
{
- return full_vfp_access_check(s, false);
+ if (arm_dc_feature(s, ARM_FEATURE_M)) {
+ return vfp_access_check_m(s, false);
+ } else {
+ return vfp_access_check_a(s, false);
+ }
}
static bool trans_VSEL(DisasContext *s, arg_VSEL *a)
@@ -553,6 +581,48 @@ static bool trans_VCVT(DisasContext *s, arg_VCVT *a)
return true;
}
+static bool mve_skip_vmov(DisasContext *s, int vn, int index, int size)
+{
+ /*
+ * In a CPU with MVE, the VMOV (vector lane to general-purpose register)
+ * and VMOV (general-purpose register to vector lane) insns are not
+     * predicated, but they are subject to beat-wise execution if they are
+ * not in an IT block.
+ *
+ * Since our implementation always executes all 4 beats in one tick,
+     * this means only that if PSR.ECI says we should not be executing
+     * the beat corresponding to the lane of the vector register being
+     * accessed then we should skip performing the move; we must still
+     * do the usual check for bad ECI state and the usual ECI advance.
+ *
+ * Note that if PSR.ECI is non-zero then we cannot be in an IT block.
+ *
+ * Return true if this VMOV scalar <-> gpreg should be skipped because
+ * the MVE PSR.ECI state says we skip the beat where the store happens.
+ */
+
+ /* Calculate the byte offset into Qn which we're going to access */
+ int ofs = (index << size) + ((vn & 1) * 8);
+
+ if (!dc_isar_feature(aa32_mve, s)) {
+ return false;
+ }
+
+ switch (s->eci) {
+ case ECI_NONE:
+ return false;
+ case ECI_A0:
+ return ofs < 4;
+ case ECI_A0A1:
+ return ofs < 8;
+ case ECI_A0A1A2:
+ case ECI_A0A1A2B0:
+ return ofs < 12;
+ default:
+ g_assert_not_reached();
+ }
+}
+
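+/*
+ * Worked example (editor's note): for vn = 3 (the high half of Q1),
+ * index = 1 and size = MO_16, ofs = (1 << 1) + ((3 & 1) * 8) = 10,
+ * i.e. beat 2: the move is skipped for ECI_A0A1A2 and ECI_A0A1A2B0
+ * (ofs < 12) but performed for ECI_A0 and ECI_A0A1.
+ */
+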
static bool trans_VMOV_to_gp(DisasContext *s, arg_VMOV_to_gp *a)
{
/* VMOV scalar to general purpose register */
@@ -575,14 +645,26 @@ static bool trans_VMOV_to_gp(DisasContext *s, arg_VMOV_to_gp *a)
return false;
}
+ if (dc_isar_feature(aa32_mve, s)) {
+ if (!mve_eci_check(s)) {
+ return true;
+ }
+ }
+
if (!vfp_access_check(s)) {
return true;
}
- tmp = tcg_temp_new_i32();
- read_neon_element32(tmp, a->vn, a->index, a->size | (a->u ? 0 : MO_SIGN));
- store_reg(s, a->rt, tmp);
+ if (!mve_skip_vmov(s, a->vn, a->index, a->size)) {
+ tmp = tcg_temp_new_i32();
+ read_neon_element32(tmp, a->vn, a->index,
+ a->size | (a->u ? 0 : MO_SIGN));
+ store_reg(s, a->rt, tmp);
+ }
+ if (dc_isar_feature(aa32_mve, s)) {
+ mve_update_and_store_eci(s);
+ }
return true;
}
@@ -608,14 +690,25 @@ static bool trans_VMOV_from_gp(DisasContext *s, arg_VMOV_from_gp *a)
return false;
}
+ if (dc_isar_feature(aa32_mve, s)) {
+ if (!mve_eci_check(s)) {
+ return true;
+ }
+ }
+
if (!vfp_access_check(s)) {
return true;
}
- tmp = load_reg(s, a->rt);
- write_neon_element32(tmp, a->vn, a->index, a->size);
- tcg_temp_free_i32(tmp);
+ if (!mve_skip_vmov(s, a->vn, a->index, a->size)) {
+ tmp = load_reg(s, a->rt);
+ write_neon_element32(tmp, a->vn, a->index, a->size);
+ tcg_temp_free_i32(tmp);
+ }
+ if (dc_isar_feature(aa32_mve, s)) {
+ mve_update_and_store_eci(s);
+ }
return true;
}
@@ -663,408 +756,14 @@ static bool trans_VDUP(DisasContext *s, arg_VDUP *a)
return true;
}
-/*
- * M-profile provides two different sets of instructions that can
- * access floating point system registers: VMSR/VMRS (which move
- * to/from a general purpose register) and VLDR/VSTR sysreg (which
- * move directly to/from memory). In some cases there are also side
- * effects which must happen after any write to memory (which could
- * cause an exception). So we implement the common logic for the
- * sysreg access in gen_M_fp_sysreg_write() and gen_M_fp_sysreg_read(),
- * which take pointers to callback functions which will perform the
- * actual "read/write general purpose register" and "read/write
- * memory" operations.
- */
-
-/*
- * Emit code to store the sysreg to its final destination; frees the
- * TCG temp 'value' it is passed.
- */
-typedef void fp_sysreg_storefn(DisasContext *s, void *opaque, TCGv_i32 value);
-/*
- * Emit code to load the value to be copied to the sysreg; returns
- * a new TCG temporary
- */
-typedef TCGv_i32 fp_sysreg_loadfn(DisasContext *s, void *opaque);
-
-/* Common decode/access checks for fp sysreg read/write */
-typedef enum FPSysRegCheckResult {
- FPSysRegCheckFailed, /* caller should return false */
- FPSysRegCheckDone, /* caller should return true */
- FPSysRegCheckContinue, /* caller should continue generating code */
-} FPSysRegCheckResult;
-
-static FPSysRegCheckResult fp_sysreg_checks(DisasContext *s, int regno)
-{
- if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) {
- return FPSysRegCheckFailed;
- }
-
- switch (regno) {
- case ARM_VFP_FPSCR:
- case QEMU_VFP_FPSCR_NZCV:
- break;
- case ARM_VFP_FPSCR_NZCVQC:
- if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
- return FPSysRegCheckFailed;
- }
- break;
- case ARM_VFP_FPCXT_S:
- case ARM_VFP_FPCXT_NS:
- if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
- return FPSysRegCheckFailed;
- }
- if (!s->v8m_secure) {
- return FPSysRegCheckFailed;
- }
- break;
- case ARM_VFP_VPR:
- case ARM_VFP_P0:
- if (!dc_isar_feature(aa32_mve, s)) {
- return FPSysRegCheckFailed;
- }
- break;
- default:
- return FPSysRegCheckFailed;
- }
-
- /*
- * FPCXT_NS is a special case: it has specific handling for
- * "current FP state is inactive", and must do the PreserveFPState()
- * but not the usual full set of actions done by ExecuteFPCheck().
- * So we don't call vfp_access_check() and the callers must handle this.
- */
- if (regno != ARM_VFP_FPCXT_NS && !vfp_access_check(s)) {
- return FPSysRegCheckDone;
- }
- return FPSysRegCheckContinue;
-}
-
-static void gen_branch_fpInactive(DisasContext *s, TCGCond cond,
- TCGLabel *label)
-{
- /*
- * FPCXT_NS is a special case: it has specific handling for
- * "current FP state is inactive", and must do the PreserveFPState()
- * but not the usual full set of actions done by ExecuteFPCheck().
- * We don't have a TB flag that matches the fpInactive check, so we
- * do it at runtime as we don't expect FPCXT_NS accesses to be frequent.
- *
- * Emit code that checks fpInactive and does a conditional
- * branch to label based on it:
- * if cond is TCG_COND_NE then branch if fpInactive != 0 (ie if inactive)
- * if cond is TCG_COND_EQ then branch if fpInactive == 0 (ie if active)
- */
- assert(cond == TCG_COND_EQ || cond == TCG_COND_NE);
-
- /* fpInactive = FPCCR_NS.ASPEN == 1 && CONTROL.FPCA == 0 */
- TCGv_i32 aspen, fpca;
- aspen = load_cpu_field(v7m.fpccr[M_REG_NS]);
- fpca = load_cpu_field(v7m.control[M_REG_S]);
- tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK);
- tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK);
- tcg_gen_andi_i32(fpca, fpca, R_V7M_CONTROL_FPCA_MASK);
- tcg_gen_or_i32(fpca, fpca, aspen);
- tcg_gen_brcondi_i32(tcg_invert_cond(cond), fpca, 0, label);
- tcg_temp_free_i32(aspen);
- tcg_temp_free_i32(fpca);
-}
-
-static bool gen_M_fp_sysreg_write(DisasContext *s, int regno,
-
- void *opaque)
-{
- /* Do a write to an M-profile floating point system register */
- TCGv_i32 tmp;
- TCGLabel *lab_end = NULL;
-
- switch (fp_sysreg_checks(s, regno)) {
- case FPSysRegCheckFailed:
- return false;
- case FPSysRegCheckDone:
- return true;
- case FPSysRegCheckContinue:
- break;
- }
-
- switch (regno) {
- case ARM_VFP_FPSCR:
- tmp = loadfn(s, opaque);
- gen_helper_vfp_set_fpscr(cpu_env, tmp);
- tcg_temp_free_i32(tmp);
- gen_lookup_tb(s);
- break;
- case ARM_VFP_FPSCR_NZCVQC:
- {
- TCGv_i32 fpscr;
- tmp = loadfn(s, opaque);
- if (dc_isar_feature(aa32_mve, s)) {
- /* QC is only present for MVE; otherwise RES0 */
- TCGv_i32 qc = tcg_temp_new_i32();
- tcg_gen_andi_i32(qc, tmp, FPCR_QC);
- /*
- * The 4 vfp.qc[] fields need only be "zero" vs "non-zero";
- * here writing the same value into all elements is simplest.
- */
- tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUARMState, vfp.qc),
- 16, 16, qc);
- }
- tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK);
- fpscr = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]);
- tcg_gen_andi_i32(fpscr, fpscr, ~FPCR_NZCV_MASK);
- tcg_gen_or_i32(fpscr, fpscr, tmp);
- store_cpu_field(fpscr, vfp.xregs[ARM_VFP_FPSCR]);
- tcg_temp_free_i32(tmp);
- break;
- }
- case ARM_VFP_FPCXT_NS:
- lab_end = gen_new_label();
- /* fpInactive case: write is a NOP, so branch to end */
- gen_branch_fpInactive(s, TCG_COND_NE, lab_end);
- /* !fpInactive: PreserveFPState(), and reads same as FPCXT_S */
- gen_preserve_fp_state(s);
- /* fall through */
- case ARM_VFP_FPCXT_S:
- {
- TCGv_i32 sfpa, control;
- /*
- * Set FPSCR and CONTROL.SFPA from value; the new FPSCR takes
- * bits [27:0] from value and zeroes bits [31:28].
- */
- tmp = loadfn(s, opaque);
- sfpa = tcg_temp_new_i32();
- tcg_gen_shri_i32(sfpa, tmp, 31);
- control = load_cpu_field(v7m.control[M_REG_S]);
- tcg_gen_deposit_i32(control, control, sfpa,
- R_V7M_CONTROL_SFPA_SHIFT, 1);
- store_cpu_field(control, v7m.control[M_REG_S]);
- tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK);
- gen_helper_vfp_set_fpscr(cpu_env, tmp);
- tcg_temp_free_i32(tmp);
- tcg_temp_free_i32(sfpa);
- break;
- }
- case ARM_VFP_VPR:
- /* Behaves as NOP if not privileged */
- if (IS_USER(s)) {
- break;
- }
- tmp = loadfn(s, opaque);
- store_cpu_field(tmp, v7m.vpr);
- break;
- case ARM_VFP_P0:
- {
- TCGv_i32 vpr;
- tmp = loadfn(s, opaque);
- vpr = load_cpu_field(v7m.vpr);
- tcg_gen_deposit_i32(vpr, vpr, tmp,
- R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH);
- store_cpu_field(vpr, v7m.vpr);
- tcg_temp_free_i32(tmp);
- break;
- }
- default:
- g_assert_not_reached();
- }
- if (lab_end) {
- gen_set_label(lab_end);
- }
- return true;
-}
-
-static bool gen_M_fp_sysreg_read(DisasContext *s, int regno,
- fp_sysreg_storefn *storefn,
- void *opaque)
-{
- /* Do a read from an M-profile floating point system register */
- TCGv_i32 tmp;
- TCGLabel *lab_end = NULL;
- bool lookup_tb = false;
-
- switch (fp_sysreg_checks(s, regno)) {
- case FPSysRegCheckFailed:
- return false;
- case FPSysRegCheckDone:
- return true;
- case FPSysRegCheckContinue:
- break;
- }
-
- if (regno == ARM_VFP_FPSCR_NZCVQC && !dc_isar_feature(aa32_mve, s)) {
- /* QC is RES0 without MVE, so NZCVQC simplifies to NZCV */
- regno = QEMU_VFP_FPSCR_NZCV;
- }
-
- switch (regno) {
- case ARM_VFP_FPSCR:
- tmp = tcg_temp_new_i32();
- gen_helper_vfp_get_fpscr(tmp, cpu_env);
- storefn(s, opaque, tmp);
- break;
- case ARM_VFP_FPSCR_NZCVQC:
- tmp = tcg_temp_new_i32();
- gen_helper_vfp_get_fpscr(tmp, cpu_env);
- tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK);
- storefn(s, opaque, tmp);
- break;
- case QEMU_VFP_FPSCR_NZCV:
- /*
- * Read just NZCV; this is a special case to avoid the
- * helper call for the "VMRS to CPSR.NZCV" insn.
- */
- tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]);
- tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK);
- storefn(s, opaque, tmp);
- break;
- case ARM_VFP_FPCXT_S:
- {
- TCGv_i32 control, sfpa, fpscr;
- /* Bits [27:0] from FPSCR, bit [31] from CONTROL.SFPA */
- tmp = tcg_temp_new_i32();
- sfpa = tcg_temp_new_i32();
- gen_helper_vfp_get_fpscr(tmp, cpu_env);
- tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK);
- control = load_cpu_field(v7m.control[M_REG_S]);
- tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK);
- tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT);
- tcg_gen_or_i32(tmp, tmp, sfpa);
- tcg_temp_free_i32(sfpa);
- /*
- * Store result before updating FPSCR etc, in case
- * it is a memory write which causes an exception.
- */
- storefn(s, opaque, tmp);
- /*
- * Now we must reset FPSCR from FPDSCR_NS, and clear
- * CONTROL.SFPA; so we'll end the TB here.
- */
- tcg_gen_andi_i32(control, control, ~R_V7M_CONTROL_SFPA_MASK);
- store_cpu_field(control, v7m.control[M_REG_S]);
- fpscr = load_cpu_field(v7m.fpdscr[M_REG_NS]);
- gen_helper_vfp_set_fpscr(cpu_env, fpscr);
- tcg_temp_free_i32(fpscr);
- lookup_tb = true;
- break;
- }
- case ARM_VFP_FPCXT_NS:
- {
- TCGv_i32 control, sfpa, fpscr, fpdscr, zero;
- TCGLabel *lab_active = gen_new_label();
-
- lookup_tb = true;
-
- gen_branch_fpInactive(s, TCG_COND_EQ, lab_active);
- /* fpInactive case: reads as FPDSCR_NS */
- TCGv_i32 tmp = load_cpu_field(v7m.fpdscr[M_REG_NS]);
- storefn(s, opaque, tmp);
- lab_end = gen_new_label();
- tcg_gen_br(lab_end);
-
- gen_set_label(lab_active);
- /* !fpInactive: Reads the same as FPCXT_S, but side effects differ */
- gen_preserve_fp_state(s);
- tmp = tcg_temp_new_i32();
- sfpa = tcg_temp_new_i32();
- fpscr = tcg_temp_new_i32();
- gen_helper_vfp_get_fpscr(fpscr, cpu_env);
- tcg_gen_andi_i32(tmp, fpscr, ~FPCR_NZCV_MASK);
- control = load_cpu_field(v7m.control[M_REG_S]);
- tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK);
- tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT);
- tcg_gen_or_i32(tmp, tmp, sfpa);
- tcg_temp_free_i32(control);
- /* Store result before updating FPSCR, in case it faults */
- storefn(s, opaque, tmp);
- /* If SFPA is zero then set FPSCR from FPDSCR_NS */
- fpdscr = load_cpu_field(v7m.fpdscr[M_REG_NS]);
- zero = tcg_const_i32(0);
- tcg_gen_movcond_i32(TCG_COND_EQ, fpscr, sfpa, zero, fpdscr, fpscr);
- gen_helper_vfp_set_fpscr(cpu_env, fpscr);
- tcg_temp_free_i32(zero);
- tcg_temp_free_i32(sfpa);
- tcg_temp_free_i32(fpdscr);
- tcg_temp_free_i32(fpscr);
- break;
- }
- case ARM_VFP_VPR:
- /* Behaves as NOP if not privileged */
- if (IS_USER(s)) {
- break;
- }
- tmp = load_cpu_field(v7m.vpr);
- storefn(s, opaque, tmp);
- break;
- case ARM_VFP_P0:
- tmp = load_cpu_field(v7m.vpr);
- tcg_gen_extract_i32(tmp, tmp, R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH);
- storefn(s, opaque, tmp);
- break;
- default:
- g_assert_not_reached();
- }
-
- if (lab_end) {
- gen_set_label(lab_end);
- }
- if (lookup_tb) {
- gen_lookup_tb(s);
- }
- return true;
-}
-
-static void fp_sysreg_to_gpr(DisasContext *s, void *opaque, TCGv_i32 value)
-{
- arg_VMSR_VMRS *a = opaque;
-
- if (a->rt == 15) {
- /* Set the 4 flag bits in the CPSR */
- gen_set_nzcv(value);
- tcg_temp_free_i32(value);
- } else {
- store_reg(s, a->rt, value);
- }
-}
-
-static TCGv_i32 gpr_to_fp_sysreg(DisasContext *s, void *opaque)
-{
- arg_VMSR_VMRS *a = opaque;
-
- return load_reg(s, a->rt);
-}
-
-static bool gen_M_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a)
-{
- /*
- * Accesses to R15 are UNPREDICTABLE; we choose to undef.
- * FPSCR -> r15 is a special case which writes to the PSR flags;
- * set a->reg to a special value to tell gen_M_fp_sysreg_read()
- * we only care about the top 4 bits of FPSCR there.
- */
- if (a->rt == 15) {
- if (a->l && a->reg == ARM_VFP_FPSCR) {
- a->reg = QEMU_VFP_FPSCR_NZCV;
- } else {
- return false;
- }
- }
-
- if (a->l) {
- /* VMRS, move FP system register to gp register */
- return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_gpr, a);
- } else {
- /* VMSR, move gp register to FP system register */
- return gen_M_fp_sysreg_write(s, a->reg, gpr_to_fp_sysreg, a);
- }
-}
-
static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a)
{
TCGv_i32 tmp;
bool ignore_vfp_enabled = false;
if (arm_dc_feature(s, ARM_FEATURE_M)) {
- return gen_M_VMSR_VMRS(s, a);
+        /* M-profile version was already handled in m-nocp.decode */
+ return false;
}
if (!dc_isar_feature(aa32_fpsp_v2, s)) {
@@ -1114,7 +813,11 @@ static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a)
return false;
}
- if (!full_vfp_access_check(s, ignore_vfp_enabled)) {
+ /*
+ * Call vfp_access_check_a() directly, because we need to tell
+ * it to ignore FPEXC.EN for some register accesses.
+ */
+ if (!vfp_access_check_a(s, ignore_vfp_enabled)) {
return true;
}
@@ -1200,96 +903,6 @@ static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a)
return true;
}
-static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value)
-{
- arg_vldr_sysreg *a = opaque;
- uint32_t offset = a->imm;
- TCGv_i32 addr;
-
- if (!a->a) {
- offset = - offset;
- }
-
- addr = load_reg(s, a->rn);
- if (a->p) {
- tcg_gen_addi_i32(addr, addr, offset);
- }
-
- if (s->v8m_stackcheck && a->rn == 13 && a->w) {
- gen_helper_v8m_stackcheck(cpu_env, addr);
- }
-
- gen_aa32_st_i32(s, value, addr, get_mem_index(s),
- MO_UL | MO_ALIGN | s->be_data);
- tcg_temp_free_i32(value);
-
- if (a->w) {
- /* writeback */
- if (!a->p) {
- tcg_gen_addi_i32(addr, addr, offset);
- }
- store_reg(s, a->rn, addr);
- } else {
- tcg_temp_free_i32(addr);
- }
-}
-
-static TCGv_i32 memory_to_fp_sysreg(DisasContext *s, void *opaque)
-{
- arg_vldr_sysreg *a = opaque;
- uint32_t offset = a->imm;
- TCGv_i32 addr;
- TCGv_i32 value = tcg_temp_new_i32();
-
- if (!a->a) {
- offset = - offset;
- }
-
- addr = load_reg(s, a->rn);
- if (a->p) {
- tcg_gen_addi_i32(addr, addr, offset);
- }
-
- if (s->v8m_stackcheck && a->rn == 13 && a->w) {
- gen_helper_v8m_stackcheck(cpu_env, addr);
- }
-
- gen_aa32_ld_i32(s, value, addr, get_mem_index(s),
- MO_UL | MO_ALIGN | s->be_data);
-
- if (a->w) {
- /* writeback */
- if (!a->p) {
- tcg_gen_addi_i32(addr, addr, offset);
- }
- store_reg(s, a->rn, addr);
- } else {
- tcg_temp_free_i32(addr);
- }
- return value;
-}
-
-static bool trans_VLDR_sysreg(DisasContext *s, arg_vldr_sysreg *a)
-{
- if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
- return false;
- }
- if (a->rn == 15) {
- return false;
- }
- return gen_M_fp_sysreg_write(s, a->reg, memory_to_fp_sysreg, a);
-}
-
-static bool trans_VSTR_sysreg(DisasContext *s, arg_vldr_sysreg *a)
-{
- if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
- return false;
- }
- if (a->rn == 15) {
- return false;
- }
- return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_memory, a);
-}
static bool trans_VMOV_half(DisasContext *s, arg_VMOV_single *a)
{
diff --git a/target/arm/translate.h b/target/arm/translate.h
index 2821b325e3..99c917c571 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -136,6 +136,11 @@ static inline int negate(DisasContext *s, int x)
return -x;
}
+static inline int plus_1(DisasContext *s, int x)
+{
+ return x + 1;
+}
+
static inline int plus_2(DisasContext *s, int x)
{
return x + 2;
@@ -151,6 +156,11 @@ static inline int times_4(DisasContext *s, int x)
return x * 4;
}
+static inline int times_2_plus_1(DisasContext *s, int x)
+{
+ return x * 2 + 1;
+}
+
static inline int arm_dc_feature(DisasContext *dc, int feature)
{
return (dc->features & (1ULL << feature)) != 0;
diff --git a/target/arm/vfp.decode b/target/arm/vfp.decode
index 52535d9b0b..5405e80197 100644
--- a/target/arm/vfp.decode
+++ b/target/arm/vfp.decode
@@ -84,20 +84,6 @@ VLDR_VSTR_hp ---- 1101 u:1 .0 l:1 rn:4 .... 1001 imm:8 vd=%vd_sp
VLDR_VSTR_sp ---- 1101 u:1 .0 l:1 rn:4 .... 1010 imm:8 vd=%vd_sp
VLDR_VSTR_dp ---- 1101 u:1 .0 l:1 rn:4 .... 1011 imm:8 vd=%vd_dp
-# M-profile VLDR/VSTR to sysreg
-%vldr_sysreg 22:1 13:3
-%imm7_0x4 0:7 !function=times_4
-
-&vldr_sysreg rn reg imm a w p
-@vldr_sysreg .... ... . a:1 . . . rn:4 ... . ... .. ....... \
- reg=%vldr_sysreg imm=%imm7_0x4 &vldr_sysreg
-
-# P=0 W=0 is SEE "Related encodings", so split into two patterns
-VLDR_sysreg ---- 110 1 . . w:1 1 .... ... 0 111 11 ....... @vldr_sysreg p=1
-VLDR_sysreg ---- 110 0 . . 1 1 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1
-VSTR_sysreg ---- 110 1 . . w:1 0 .... ... 0 111 11 ....... @vldr_sysreg p=1
-VSTR_sysreg ---- 110 0 . . 1 0 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1
-
# We split the load/store multiple up into two patterns to avoid
# overlap with other insns in the "Advanced SIMD load/store and 64-bit move"
# grouping:
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 498a959839..515db120cc 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -386,7 +386,7 @@ uint64_t (dup_const)(unsigned vece, uint64_t c)
}
/* Duplicate IN into OUT as per VECE. */
-static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
+void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
switch (vece) {
case MO_8:
@@ -404,7 +404,7 @@ static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
}
}
-static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
+void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
switch (vece) {
case MO_8:
@@ -578,15 +578,15 @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
&& (vece != MO_32 || !check_size_impl(oprsz, 4))) {
t_64 = tcg_temp_new_i64();
tcg_gen_extu_i32_i64(t_64, in_32);
- gen_dup_i64(vece, t_64, t_64);
+ tcg_gen_dup_i64(vece, t_64, t_64);
} else {
t_32 = tcg_temp_new_i32();
- gen_dup_i32(vece, t_32, in_32);
+ tcg_gen_dup_i32(vece, t_32, in_32);
}
} else if (in_64) {
/* We are given a 64-bit variable input. */
t_64 = tcg_temp_new_i64();
- gen_dup_i64(vece, t_64, in_64);
+ tcg_gen_dup_i64(vece, t_64, in_64);
} else {
/* We are given a constant input. */
/* For 64-bit hosts, use 64-bit constants for "simple" constants
@@ -1311,14 +1311,14 @@ void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
} else if (g->fni8 && check_size_impl(oprsz, 8)) {
TCGv_i64 t64 = tcg_temp_new_i64();
- gen_dup_i64(g->vece, t64, c);
+ tcg_gen_dup_i64(g->vece, t64, c);
expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
tcg_temp_free_i64(t64);
} else if (g->fni4 && check_size_impl(oprsz, 4)) {
TCGv_i32 t32 = tcg_temp_new_i32();
tcg_gen_extrl_i64_i32(t32, c);
- gen_dup_i32(g->vece, t32, t32);
+ tcg_gen_dup_i32(g->vece, t32, t32);
expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
tcg_temp_free_i32(t32);
} else {
@@ -2538,7 +2538,7 @@ void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
TCGv_i64 tmp = tcg_temp_new_i64();
- gen_dup_i64(vece, tmp, c);
+ tcg_gen_dup_i64(vece, tmp, c);
tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
tcg_temp_free_i64(tmp);
}
@@ -2562,7 +2562,7 @@ void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
TCGv_i64 tmp = tcg_temp_new_i64();
- gen_dup_i64(vece, tmp, c);
+ tcg_gen_dup_i64(vece, tmp, c);
tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
tcg_temp_free_i64(tmp);
}
@@ -2586,7 +2586,7 @@ void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
TCGv_i64 tmp = tcg_temp_new_i64();
- gen_dup_i64(vece, tmp, c);
+ tcg_gen_dup_i64(vece, tmp, c);
tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
tcg_temp_free_i64(tmp);
}