target-arm: Fix VLD of single element to all lanes

Fix several bugs in VLD of single element to all lanes: The "single element to all lanes" form of VLD1 differs from those for VLD2, VLD3 and VLD4 in that bit 5 indicates whether the loaded element should be written to one or two Dregs (rather than being a register stride). Handle this by special-casing VLD1 rather than trying to have one loop which deals with both VLD1 and 2/3/4. Handle VLD4.32 with 16 byte alignment specified, rather than UNDEFfing. UNDEF for the invalid size and alignment combinations. Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
author: Peter Maydell <peter.maydell@linaro.org> 2011-03-15 16:26:51 +0000
committer: Aurelien Jarno <aurelien@aurel32.net> 2011-04-01 22:33:47 +0200
commit: 8e18cde30b06d2e7411bf38091c4e30602f85cdd (patch)
tree: b1c36d400bf36f68b0d20a29492b9aa383c71d9b
parent: ac60cc18711a9786af9844d7e3d002276fbd85f3 (diff)
1 files changed, 59 insertions, 25 deletions
diff --git a/target-arm/translate.c b/target-arm/translate.c
index f69912f20c..1dfd4823f5 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -2648,6 +2648,28 @@ static void gen_neon_dup_high16(TCGv var)
     tcg_temp_free_i32(tmp);
 }
 
+static TCGv gen_load_and_replicate(DisasContext *s, TCGv addr, int size)
+{
+    /* Load a single Neon element and replicate into a 32 bit TCG reg */
+    TCGv tmp;
+    switch (size) {
+    case 0:
+        tmp = gen_ld8u(addr, IS_USER(s));
+        gen_neon_dup_u8(tmp, 0);
+        break;
+    case 1:
+        tmp = gen_ld16u(addr, IS_USER(s));
+        gen_neon_dup_low16(tmp);
+        break;
+    case 2:
+        tmp = gen_ld32(addr, IS_USER(s));
+        break;
+    default: /* Avoid compiler warnings.  */
+        abort();
+    }
+    return tmp;
+}
+
 /* Disassemble a VFP instruction.  Returns nonzero if an error occured
    (ie. an undefined instruction).  */
 static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn)
@@ -3890,36 +3912,48 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
         size = (insn >> 10) & 3;
         if (size == 3) {
             /* Load single element to all lanes.  */
-            if (!load)
+            int a = (insn >> 4) & 1;
+            if (!load) {
                 return 1;
+            }
             size = (insn >> 6) & 3;
             nregs = ((insn >> 8) & 3) + 1;
-            stride = (insn & (1 << 5)) ? 2 : 1;
-            load_reg_var(s, addr, rn);
-            for (reg = 0; reg < nregs; reg++) {
-                switch (size) {
-                case 0:
-                    tmp = gen_ld8u(addr, IS_USER(s));
-                    gen_neon_dup_u8(tmp, 0);
-                    break;
-                case 1:
-                    tmp = gen_ld16u(addr, IS_USER(s));
-                    gen_neon_dup_low16(tmp);
-                    break;
-                case 2:
-                    tmp = gen_ld32(addr, IS_USER(s));
-                    break;
-                case 3:
+
+            if (size == 3) {
+                if (nregs != 4 || a == 0) {
                     return 1;
-                default: /* Avoid compiler warnings.  */
-                    abort();
                 }
-                tcg_gen_addi_i32(addr, addr, 1 << size);
-                tmp2 = tcg_temp_new_i32();
-                tcg_gen_mov_i32(tmp2, tmp);
-                neon_store_reg(rd, 0, tmp2);
-                neon_store_reg(rd, 1, tmp);
-                rd += stride;
+                /* For VLD4 size==3 a == 1 means 32 bits at 16 byte alignment */
+                size = 2;
+            }
+            if (nregs == 1 && a == 1 && size == 0) {
+                return 1;
+            }
+            if (nregs == 3 && a == 1) {
+                return 1;
+            }
+            load_reg_var(s, addr, rn);
+            if (nregs == 1) {
+                /* VLD1 to all lanes: bit 5 indicates how many Dregs to write */
+                tmp = gen_load_and_replicate(s, addr, size);
+                tcg_gen_st_i32(tmp, cpu_env, neon_reg_offset(rd, 0));
+                tcg_gen_st_i32(tmp, cpu_env, neon_reg_offset(rd, 1));
+                if (insn & (1 << 5)) {
+                    tcg_gen_st_i32(tmp, cpu_env, neon_reg_offset(rd + 1, 0));
+                    tcg_gen_st_i32(tmp, cpu_env, neon_reg_offset(rd + 1, 1));
+                }
+                tcg_temp_free_i32(tmp);
+            } else {
+                /* VLD2/3/4 to all lanes: bit 5 indicates register stride */
+                stride = (insn & (1 << 5)) ? 2 : 1;
+                for (reg = 0; reg < nregs; reg++) {
+                    tmp = gen_load_and_replicate(s, addr, size);
+                    tcg_gen_st_i32(tmp, cpu_env, neon_reg_offset(rd, 0));
+                    tcg_gen_st_i32(tmp, cpu_env, neon_reg_offset(rd, 1));
+                    tcg_temp_free_i32(tmp);
+                    tcg_gen_addi_i32(addr, addr, 1 << size);
+                    rd += stride;
+                }
             }
             stride = (1 << size) * nregs;
         } else {
author	Peter Maydell <peter.maydell@linaro.org>	2011-03-15 16:26:51 +0000
committer	Aurelien Jarno <aurelien@aurel32.net>	2011-04-01 22:33:47 +0200
commit	8e18cde30b06d2e7411bf38091c4e30602f85cdd (patch)
tree	b1c36d400bf36f68b0d20a29492b9aa383c71d9b
parent	ac60cc18711a9786af9844d7e3d002276fbd85f3 (diff)