1 files changed, 565 insertions, 123 deletions
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 9c65474a8c..9109b813e0 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -39,8 +39,6 @@ typedef enum {
     TCG_TEMP_UNDEF = 0,
     TCG_TEMP_CONST,
     TCG_TEMP_COPY,
-    TCG_TEMP_HAS_COPY,
-    TCG_TEMP_ANY
 } tcg_temp_state;
 
 struct tcg_temp_info {
@@ -52,39 +50,19 @@ struct tcg_temp_info {
 
 static struct tcg_temp_info temps[TCG_MAX_TEMPS];
 
-/* Reset TEMP's state to TCG_TEMP_ANY.  If TEMP was a representative of some
-   class of equivalent temp's, a new representative should be chosen in this
-   class. */
-static void reset_temp(TCGArg temp, int nb_temps, int nb_globals)
+/* Reset TEMP's state to TCG_TEMP_UNDEF.  If TEMP only had one copy, remove
+   the copy flag from the left temp.  */
+static void reset_temp(TCGArg temp)
 {
-    int i;
-    TCGArg new_base = (TCGArg)-1;
-    if (temps[temp].state == TCG_TEMP_HAS_COPY) {
-        for (i = temps[temp].next_copy; i != temp; i = temps[i].next_copy) {
-            if (i >= nb_globals) {
-                temps[i].state = TCG_TEMP_HAS_COPY;
-                new_base = i;
-                break;
-            }
+    if (temps[temp].state == TCG_TEMP_COPY) {
+        if (temps[temp].prev_copy == temps[temp].next_copy) {
+            temps[temps[temp].next_copy].state = TCG_TEMP_UNDEF;
+        } else {
+            temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
+            temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
         }
-        for (i = temps[temp].next_copy; i != temp; i = temps[i].next_copy) {
-            if (new_base == (TCGArg)-1) {
-                temps[i].state = TCG_TEMP_ANY;
-            } else {
-                temps[i].val = new_base;
-            }
-        }
-        temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
-        temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
-    } else if (temps[temp].state == TCG_TEMP_COPY) {
-        temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
-        temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
-        new_base = temps[temp].val;
-    }
-    temps[temp].state = TCG_TEMP_ANY;
-    if (new_base != (TCGArg)-1 && temps[new_base].next_copy == new_base) {
-        temps[new_base].state = TCG_TEMP_ANY;
     }
+    temps[temp].state = TCG_TEMP_UNDEF;
 }
 
 static int op_bits(TCGOpcode op)
@@ -107,36 +85,83 @@ static TCGOpcode op_to_movi(TCGOpcode op)
     }
 }
 
-static void tcg_opt_gen_mov(TCGContext *s, TCGArg *gen_args, TCGArg dst,
-                            TCGArg src, int nb_temps, int nb_globals)
+static TCGArg find_better_copy(TCGContext *s, TCGArg temp)
+{
+    TCGArg i;
+
+    /* If this is already a global, we can't do better. */
+    if (temp < s->nb_globals) {
+        return temp;
+    }
+
+    /* Search for a global first. */
+    for (i = temps[temp].next_copy ; i != temp ; i = temps[i].next_copy) {
+        if (i < s->nb_globals) {
+            return i;
+        }
+    }
+
+    /* If it is a temp, search for a temp local. */
+    if (!s->temps[temp].temp_local) {
+        for (i = temps[temp].next_copy ; i != temp ; i = temps[i].next_copy) {
+            if (s->temps[i].temp_local) {
+                return i;
+            }
+        }
+    }
+
+    /* Failure to find a better representation, return the same temp. */
+    return temp;
+}
+
+static bool temps_are_copies(TCGArg arg1, TCGArg arg2)
+{
+    TCGArg i;
+
+    if (arg1 == arg2) {
+        return true;
+    }
+
+    if (temps[arg1].state != TCG_TEMP_COPY
+        || temps[arg2].state != TCG_TEMP_COPY) {
+        return false;
+    }
+
+    for (i = temps[arg1].next_copy ; i != arg1 ; i = temps[i].next_copy) {
+        if (i == arg2) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static void tcg_opt_gen_mov(TCGContext *s, TCGArg *gen_args,
+                            TCGArg dst, TCGArg src)
 {
-        reset_temp(dst, nb_temps, nb_globals);
-        assert(temps[src].state != TCG_TEMP_COPY);
-        /* Don't try to copy if one of temps is a global or either one
-           is local and another is register */
-        if (src >= nb_globals && dst >= nb_globals &&
-            tcg_arg_is_local(s, src) == tcg_arg_is_local(s, dst)) {
-            assert(temps[src].state != TCG_TEMP_CONST);
-            if (temps[src].state != TCG_TEMP_HAS_COPY) {
-                temps[src].state = TCG_TEMP_HAS_COPY;
+        reset_temp(dst);
+        assert(temps[src].state != TCG_TEMP_CONST);
+
+        if (s->temps[src].type == s->temps[dst].type) {
+            if (temps[src].state != TCG_TEMP_COPY) {
+                temps[src].state = TCG_TEMP_COPY;
                 temps[src].next_copy = src;
                 temps[src].prev_copy = src;
             }
             temps[dst].state = TCG_TEMP_COPY;
-            temps[dst].val = src;
             temps[dst].next_copy = temps[src].next_copy;
             temps[dst].prev_copy = src;
             temps[temps[dst].next_copy].prev_copy = dst;
             temps[src].next_copy = dst;
         }
+
         gen_args[0] = dst;
         gen_args[1] = src;
 }
 
-static void tcg_opt_gen_movi(TCGArg *gen_args, TCGArg dst, TCGArg val,
-                             int nb_temps, int nb_globals)
+static void tcg_opt_gen_movi(TCGArg *gen_args, TCGArg dst, TCGArg val)
 {
-        reset_temp(dst, nb_temps, nb_globals);
+        reset_temp(dst);
         temps[dst].state = TCG_TEMP_CONST;
         temps[dst].val = val;
         gen_args[0] = dst;
@@ -267,6 +292,179 @@ static TCGArg do_constant_folding(TCGOpcode op, TCGArg x, TCGArg y)
     return res;
 }
 
+static bool do_constant_folding_cond_32(uint32_t x, uint32_t y, TCGCond c)
+{
+    switch (c) {
+    case TCG_COND_EQ:
+        return x == y;
+    case TCG_COND_NE:
+        return x != y;
+    case TCG_COND_LT:
+        return (int32_t)x < (int32_t)y;
+    case TCG_COND_GE:
+        return (int32_t)x >= (int32_t)y;
+    case TCG_COND_LE:
+        return (int32_t)x <= (int32_t)y;
+    case TCG_COND_GT:
+        return (int32_t)x > (int32_t)y;
+    case TCG_COND_LTU:
+        return x < y;
+    case TCG_COND_GEU:
+        return x >= y;
+    case TCG_COND_LEU:
+        return x <= y;
+    case TCG_COND_GTU:
+        return x > y;
+    default:
+        tcg_abort();
+    }
+}
+
+static bool do_constant_folding_cond_64(uint64_t x, uint64_t y, TCGCond c)
+{
+    switch (c) {
+    case TCG_COND_EQ:
+        return x == y;
+    case TCG_COND_NE:
+        return x != y;
+    case TCG_COND_LT:
+        return (int64_t)x < (int64_t)y;
+    case TCG_COND_GE:
+        return (int64_t)x >= (int64_t)y;
+    case TCG_COND_LE:
+        return (int64_t)x <= (int64_t)y;
+    case TCG_COND_GT:
+        return (int64_t)x > (int64_t)y;
+    case TCG_COND_LTU:
+        return x < y;
+    case TCG_COND_GEU:
+        return x >= y;
+    case TCG_COND_LEU:
+        return x <= y;
+    case TCG_COND_GTU:
+        return x > y;
+    default:
+        tcg_abort();
+    }
+}
+
+static bool do_constant_folding_cond_eq(TCGCond c)
+{
+    switch (c) {
+    case TCG_COND_GT:
+    case TCG_COND_LTU:
+    case TCG_COND_LT:
+    case TCG_COND_GTU:
+    case TCG_COND_NE:
+        return 0;
+    case TCG_COND_GE:
+    case TCG_COND_GEU:
+    case TCG_COND_LE:
+    case TCG_COND_LEU:
+    case TCG_COND_EQ:
+        return 1;
+    default:
+        tcg_abort();
+    }
+}
+
+/* Return 2 if the condition can't be simplified, and the result
+   of the condition (0 or 1) if it can */
+static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
+                                       TCGArg y, TCGCond c)
+{
+    if (temps[x].state == TCG_TEMP_CONST && temps[y].state == TCG_TEMP_CONST) {
+        switch (op_bits(op)) {
+        case 32:
+            return do_constant_folding_cond_32(temps[x].val, temps[y].val, c);
+        case 64:
+            return do_constant_folding_cond_64(temps[x].val, temps[y].val, c);
+        default:
+            tcg_abort();
+        }
+    } else if (temps_are_copies(x, y)) {
+        return do_constant_folding_cond_eq(c);
+    } else if (temps[y].state == TCG_TEMP_CONST && temps[y].val == 0) {
+        switch (c) {
+        case TCG_COND_LTU:
+            return 0;
+        case TCG_COND_GEU:
+            return 1;
+        default:
+            return 2;
+        }
+    } else {
+        return 2;
+    }
+}
+
+/* Return 2 if the condition can't be simplified, and the result
+   of the condition (0 or 1) if it can */
+static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
+{
+    TCGArg al = p1[0], ah = p1[1];
+    TCGArg bl = p2[0], bh = p2[1];
+
+    if (temps[bl].state == TCG_TEMP_CONST
+        && temps[bh].state == TCG_TEMP_CONST) {
+        uint64_t b = ((uint64_t)temps[bh].val << 32) | (uint32_t)temps[bl].val;
+
+        if (temps[al].state == TCG_TEMP_CONST
+            && temps[ah].state == TCG_TEMP_CONST) {
+            uint64_t a;
+            a = ((uint64_t)temps[ah].val << 32) | (uint32_t)temps[al].val;
+            return do_constant_folding_cond_64(a, b, c);
+        }
+        if (b == 0) {
+            switch (c) {
+            case TCG_COND_LTU:
+                return 0;
+            case TCG_COND_GEU:
+                return 1;
+            default:
+                break;
+            }
+        }
+    }
+    if (temps_are_copies(al, bl) && temps_are_copies(ah, bh)) {
+        return do_constant_folding_cond_eq(c);
+    }
+    return 2;
+}
+
+static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
+{
+    TCGArg a1 = *p1, a2 = *p2;
+    int sum = 0;
+    sum += temps[a1].state == TCG_TEMP_CONST;
+    sum -= temps[a2].state == TCG_TEMP_CONST;
+
+    /* Prefer the constant in second argument, and then the form
+       op a, a, b, which is better handled on non-RISC hosts. */
+    if (sum > 0 || (sum == 0 && dest == a2)) {
+        *p1 = a2;
+        *p2 = a1;
+        return true;
+    }
+    return false;
+}
+
+static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
+{
+    int sum = 0;
+    sum += temps[p1[0]].state == TCG_TEMP_CONST;
+    sum += temps[p1[1]].state == TCG_TEMP_CONST;
+    sum -= temps[p2[0]].state == TCG_TEMP_CONST;
+    sum -= temps[p2[1]].state == TCG_TEMP_CONST;
+    if (sum > 0) {
+        TCGArg t;
+        t = p1[0], p1[0] = p2[0], p2[0] = t;
+        t = p1[1], p1[1] = p2[1], p2[1] = t;
+        return true;
+    }
+    return false;
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                                     TCGArg *args, TCGOpDef *tcg_op_defs)
@@ -276,28 +474,34 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
     const TCGOpDef *def;
     TCGArg *gen_args;
     TCGArg tmp;
+
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
-       If this temp is a copy of other ones then this equivalence class'
-       representative is kept in VALS' element.
-       If this temp is neither copy nor constant then corresponding VALS'
-       element is unused. */
+       If this temp is a copy of other ones then the other copies are
+       available through the doubly linked circular list. */
 
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
     memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
 
-    nb_ops = tcg_opc_ptr - gen_opc_buf;
+    nb_ops = tcg_opc_ptr - s->gen_opc_buf;
     gen_args = args;
     for (op_index = 0; op_index < nb_ops; op_index++) {
-        op = gen_opc_buf[op_index];
+        op = s->gen_opc_buf[op_index];
         def = &tcg_op_defs[op];
         /* Do copy propagation */
-        if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS))) {
-            assert(op != INDEX_op_call);
+        if (op == INDEX_op_call) {
+            int nb_oargs = args[0] >> 16;
+            int nb_iargs = args[0] & 0xffff;
+            for (i = nb_oargs + 1; i < nb_oargs + nb_iargs + 1; i++) {
+                if (temps[args[i]].state == TCG_TEMP_COPY) {
+                    args[i] = find_better_copy(s, args[i]);
+                }
+            }
+        } else {
             for (i = def->nb_oargs; i < def->nb_oargs + def->nb_iargs; i++) {
                 if (temps[args[i]].state == TCG_TEMP_COPY) {
-                    args[i] = temps[args[i]].val;
+                    args[i] = find_better_copy(s, args[i]);
                 }
             }
         }
@@ -312,17 +516,71 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         CASE_OP_32_64(eqv):
         CASE_OP_32_64(nand):
         CASE_OP_32_64(nor):
-            if (temps[args[1]].state == TCG_TEMP_CONST) {
-                tmp = args[1];
-                args[1] = args[2];
-                args[2] = tmp;
+            swap_commutative(args[0], &args[1], &args[2]);
+            break;
+        CASE_OP_32_64(brcond):
+            if (swap_commutative(-1, &args[0], &args[1])) {
+                args[2] = tcg_swap_cond(args[2]);
+            }
+            break;
+        CASE_OP_32_64(setcond):
+            if (swap_commutative(args[0], &args[1], &args[2])) {
+                args[3] = tcg_swap_cond(args[3]);
+            }
+            break;
+        CASE_OP_32_64(movcond):
+            if (swap_commutative(-1, &args[1], &args[2])) {
+                args[5] = tcg_swap_cond(args[5]);
+            }
+            /* For movcond, we canonicalize the "false" input reg to match
+               the destination reg so that the tcg backend can implement
+               a "move if true" operation.  */
+            if (swap_commutative(args[0], &args[4], &args[3])) {
+                args[5] = tcg_invert_cond(args[5]);
+            }
+            break;
+        case INDEX_op_add2_i32:
+            swap_commutative(args[0], &args[2], &args[4]);
+            swap_commutative(args[1], &args[3], &args[5]);
+            break;
+        case INDEX_op_mulu2_i32:
+            swap_commutative(args[0], &args[2], &args[3]);
+            break;
+        case INDEX_op_brcond2_i32:
+            if (swap_commutative2(&args[0], &args[2])) {
+                args[4] = tcg_swap_cond(args[4]);
+            }
+            break;
+        case INDEX_op_setcond2_i32:
+            if (swap_commutative2(&args[1], &args[3])) {
+                args[5] = tcg_swap_cond(args[5]);
+            }
+            break;
+        default:
+            break;
+        }
+
+        /* Simplify expressions for "shift/rot r, 0, a => movi r, 0" */
+        switch (op) {
+        CASE_OP_32_64(shl):
+        CASE_OP_32_64(shr):
+        CASE_OP_32_64(sar):
+        CASE_OP_32_64(rotl):
+        CASE_OP_32_64(rotr):
+            if (temps[args[1]].state == TCG_TEMP_CONST
+                && temps[args[1]].val == 0) {
+                s->gen_opc_buf[op_index] = op_to_movi(op);
+                tcg_opt_gen_movi(gen_args, args[0], 0);
+                args += 3;
+                gen_args += 2;
+                continue;
             }
             break;
         default:
             break;
         }
 
-        /* Simplify expression if possible. */
+        /* Simplify expression for "op r, a, 0 => mov r, a" cases */
         switch (op) {
         CASE_OP_32_64(add):
         CASE_OP_32_64(sub):
@@ -331,50 +589,75 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         CASE_OP_32_64(sar):
         CASE_OP_32_64(rotl):
         CASE_OP_32_64(rotr):
+        CASE_OP_32_64(or):
+        CASE_OP_32_64(xor):
             if (temps[args[1]].state == TCG_TEMP_CONST) {
                 /* Proceed with possible constant folding. */
                 break;
             }
             if (temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == 0) {
-                if ((temps[args[0]].state == TCG_TEMP_COPY
-                    && temps[args[0]].val == args[1])
-                    || args[0] == args[1]) {
-                    args += 3;
-                    gen_opc_buf[op_index] = INDEX_op_nop;
+                if (temps_are_copies(args[0], args[1])) {
+                    s->gen_opc_buf[op_index] = INDEX_op_nop;
                 } else {
-                    gen_opc_buf[op_index] = op_to_mov(op);
-                    tcg_opt_gen_mov(s, gen_args, args[0], args[1],
-                                    nb_temps, nb_globals);
+                    s->gen_opc_buf[op_index] = op_to_mov(op);
+                    tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
                     gen_args += 2;
-                    args += 3;
                 }
+                args += 3;
                 continue;
             }
             break;
+        default:
+            break;
+        }
+
+        /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
+        switch (op) {
+        CASE_OP_32_64(and):
         CASE_OP_32_64(mul):
             if ((temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == 0)) {
-                gen_opc_buf[op_index] = op_to_movi(op);
-                tcg_opt_gen_movi(gen_args, args[0], 0, nb_temps, nb_globals);
+                s->gen_opc_buf[op_index] = op_to_movi(op);
+                tcg_opt_gen_movi(gen_args, args[0], 0);
                 args += 3;
                 gen_args += 2;
                 continue;
             }
             break;
+        default:
+            break;
+        }
+
+        /* Simplify expression for "op r, a, a => mov r, a" cases */
+        switch (op) {
         CASE_OP_32_64(or):
         CASE_OP_32_64(and):
-            if (args[1] == args[2]) {
-                if (args[1] == args[0]) {
-                    args += 3;
-                    gen_opc_buf[op_index] = INDEX_op_nop;
+            if (temps_are_copies(args[1], args[2])) {
+                if (temps_are_copies(args[0], args[1])) {
+                    s->gen_opc_buf[op_index] = INDEX_op_nop;
                 } else {
-                    gen_opc_buf[op_index] = op_to_mov(op);
-                    tcg_opt_gen_mov(s, gen_args, args[0], args[1], nb_temps,
-                                    nb_globals);
+                    s->gen_opc_buf[op_index] = op_to_mov(op);
+                    tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
                     gen_args += 2;
-                    args += 3;
                 }
+                args += 3;
+                continue;
+            }
+            break;
+        default:
+            break;
+        }
+
+        /* Simplify expression for "op r, a, a => movi r, 0" cases */
+        switch (op) {
+        CASE_OP_32_64(sub):
+        CASE_OP_32_64(xor):
+            if (temps_are_copies(args[1], args[2])) {
+                s->gen_opc_buf[op_index] = op_to_movi(op);
+                tcg_opt_gen_movi(gen_args, args[0], 0);
+                gen_args += 2;
+                args += 3;
                 continue;
             }
             break;
@@ -387,16 +670,13 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
            allocator where needed and possible.  Also detect copies. */
         switch (op) {
         CASE_OP_32_64(mov):
-            if ((temps[args[1]].state == TCG_TEMP_COPY
-                && temps[args[1]].val == args[0])
-                || args[0] == args[1]) {
+            if (temps_are_copies(args[0], args[1])) {
                 args += 2;
-                gen_opc_buf[op_index] = INDEX_op_nop;
+                s->gen_opc_buf[op_index] = INDEX_op_nop;
                 break;
             }
             if (temps[args[1]].state != TCG_TEMP_CONST) {
-                tcg_opt_gen_mov(s, gen_args, args[0], args[1],
-                                nb_temps, nb_globals);
+                tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
                 gen_args += 2;
                 args += 2;
                 break;
@@ -404,14 +684,15 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             /* Source argument is constant.  Rewrite the operation and
                let movi case handle it. */
             op = op_to_movi(op);
-            gen_opc_buf[op_index] = op;
+            s->gen_opc_buf[op_index] = op;
             args[1] = temps[args[1]].val;
             /* fallthrough */
         CASE_OP_32_64(movi):
-            tcg_opt_gen_movi(gen_args, args[0], args[1], nb_temps, nb_globals);
+            tcg_opt_gen_movi(gen_args, args[0], args[1]);
             gen_args += 2;
             args += 2;
             break;
+
         CASE_OP_32_64(not):
         CASE_OP_32_64(neg):
         CASE_OP_32_64(ext8s):
@@ -421,20 +702,15 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         case INDEX_op_ext32s_i64:
         case INDEX_op_ext32u_i64:
             if (temps[args[1]].state == TCG_TEMP_CONST) {
-                gen_opc_buf[op_index] = op_to_movi(op);
+                s->gen_opc_buf[op_index] = op_to_movi(op);
                 tmp = do_constant_folding(op, temps[args[1]].val, 0);
-                tcg_opt_gen_movi(gen_args, args[0], tmp, nb_temps, nb_globals);
-                gen_args += 2;
-                args += 2;
-                break;
-            } else {
-                reset_temp(args[0], nb_temps, nb_globals);
-                gen_args[0] = args[0];
-                gen_args[1] = args[1];
+                tcg_opt_gen_movi(gen_args, args[0], tmp);
                 gen_args += 2;
                 args += 2;
                 break;
             }
+            goto do_default;
+
         CASE_OP_32_64(add):
         CASE_OP_32_64(sub):
         CASE_OP_32_64(mul):
@@ -453,31 +729,200 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         CASE_OP_32_64(nor):
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[2]].state == TCG_TEMP_CONST) {
-                gen_opc_buf[op_index] = op_to_movi(op);
+                s->gen_opc_buf[op_index] = op_to_movi(op);
                 tmp = do_constant_folding(op, temps[args[1]].val,
                                           temps[args[2]].val);
-                tcg_opt_gen_movi(gen_args, args[0], tmp, nb_temps, nb_globals);
+                tcg_opt_gen_movi(gen_args, args[0], tmp);
                 gen_args += 2;
                 args += 3;
                 break;
+            }
+            goto do_default;
+
+        CASE_OP_32_64(deposit):
+            if (temps[args[1]].state == TCG_TEMP_CONST
+                && temps[args[2]].state == TCG_TEMP_CONST) {
+                s->gen_opc_buf[op_index] = op_to_movi(op);
+                tmp = ((1ull << args[4]) - 1);
+                tmp = (temps[args[1]].val & ~(tmp << args[3]))
+                      | ((temps[args[2]].val & tmp) << args[3]);
+                tcg_opt_gen_movi(gen_args, args[0], tmp);
+                gen_args += 2;
+                args += 5;
+                break;
+            }
+            goto do_default;
+
+        CASE_OP_32_64(setcond):
+            tmp = do_constant_folding_cond(op, args[1], args[2], args[3]);
+            if (tmp != 2) {
+                s->gen_opc_buf[op_index] = op_to_movi(op);
+                tcg_opt_gen_movi(gen_args, args[0], tmp);
+                gen_args += 2;
+                args += 4;
+                break;
+            }
+            goto do_default;
+
+        CASE_OP_32_64(brcond):
+            tmp = do_constant_folding_cond(op, args[0], args[1], args[2]);
+            if (tmp != 2) {
+                if (tmp) {
+                    memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
+                    s->gen_opc_buf[op_index] = INDEX_op_br;
+                    gen_args[0] = args[3];
+                    gen_args += 1;
+                } else {
+                    s->gen_opc_buf[op_index] = INDEX_op_nop;
+                }
+                args += 4;
+                break;
+            }
+            goto do_default;
+
+        CASE_OP_32_64(movcond):
+            tmp = do_constant_folding_cond(op, args[1], args[2], args[5]);
+            if (tmp != 2) {
+                if (temps_are_copies(args[0], args[4-tmp])) {
+                    s->gen_opc_buf[op_index] = INDEX_op_nop;
+                } else if (temps[args[4-tmp]].state == TCG_TEMP_CONST) {
+                    s->gen_opc_buf[op_index] = op_to_movi(op);
+                    tcg_opt_gen_movi(gen_args, args[0], temps[args[4-tmp]].val);
+                    gen_args += 2;
+                } else {
+                    s->gen_opc_buf[op_index] = op_to_mov(op);
+                    tcg_opt_gen_mov(s, gen_args, args[0], args[4-tmp]);
+                    gen_args += 2;
+                }
+                args += 6;
+                break;
+            }
+            goto do_default;
+
+        case INDEX_op_add2_i32:
+        case INDEX_op_sub2_i32:
+            if (temps[args[2]].state == TCG_TEMP_CONST
+                && temps[args[3]].state == TCG_TEMP_CONST
+                && temps[args[4]].state == TCG_TEMP_CONST
+                && temps[args[5]].state == TCG_TEMP_CONST) {
+                uint32_t al = temps[args[2]].val;
+                uint32_t ah = temps[args[3]].val;
+                uint32_t bl = temps[args[4]].val;
+                uint32_t bh = temps[args[5]].val;
+                uint64_t a = ((uint64_t)ah << 32) | al;
+                uint64_t b = ((uint64_t)bh << 32) | bl;
+                TCGArg rl, rh;
+
+                if (op == INDEX_op_add2_i32) {
+                    a += b;
+                } else {
+                    a -= b;
+                }
+
+                /* We emit the extra nop when we emit the add2/sub2.  */
+                assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
+
+                rl = args[0];
+                rh = args[1];
+                s->gen_opc_buf[op_index] = INDEX_op_movi_i32;
+                s->gen_opc_buf[++op_index] = INDEX_op_movi_i32;
+                tcg_opt_gen_movi(&gen_args[0], rl, (uint32_t)a);
+                tcg_opt_gen_movi(&gen_args[2], rh, (uint32_t)(a >> 32));
+                gen_args += 4;
+                args += 6;
+                break;
+            }
+            goto do_default;
+
+        case INDEX_op_mulu2_i32:
+            if (temps[args[2]].state == TCG_TEMP_CONST
+                && temps[args[3]].state == TCG_TEMP_CONST) {
+                uint32_t a = temps[args[2]].val;
+                uint32_t b = temps[args[3]].val;
+                uint64_t r = (uint64_t)a * b;
+                TCGArg rl, rh;
+
+                /* We emit the extra nop when we emit the mulu2.  */
+                assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
+
+                rl = args[0];
+                rh = args[1];
+                s->gen_opc_buf[op_index] = INDEX_op_movi_i32;
+                s->gen_opc_buf[++op_index] = INDEX_op_movi_i32;
+                tcg_opt_gen_movi(&gen_args[0], rl, (uint32_t)r);
+                tcg_opt_gen_movi(&gen_args[2], rh, (uint32_t)(r >> 32));
+                gen_args += 4;
+                args += 4;
+                break;
+            }
+            goto do_default;
+
+        case INDEX_op_brcond2_i32:
+            tmp = do_constant_folding_cond2(&args[0], &args[2], args[4]);
+            if (tmp != 2) {
+                if (tmp) {
+                    memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
+                    s->gen_opc_buf[op_index] = INDEX_op_br;
+                    gen_args[0] = args[5];
+                    gen_args += 1;
+                } else {
+                    s->gen_opc_buf[op_index] = INDEX_op_nop;
+                }
+            } else if ((args[4] == TCG_COND_LT || args[4] == TCG_COND_GE)
+                       && temps[args[2]].state == TCG_TEMP_CONST
+                       && temps[args[3]].state == TCG_TEMP_CONST
+                       && temps[args[2]].val == 0
+                       && temps[args[3]].val == 0) {
+                /* Simplify LT/GE comparisons vs zero to a single compare
+                   vs the high word of the input.  */
+                memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
+                s->gen_opc_buf[op_index] = INDEX_op_brcond_i32;
+                gen_args[0] = args[1];
+                gen_args[1] = args[3];
+                gen_args[2] = args[4];
+                gen_args[3] = args[5];
+                gen_args += 4;
             } else {
-                reset_temp(args[0], nb_temps, nb_globals);
+                goto do_default;
+            }
+            args += 6;
+            break;
+
+        case INDEX_op_setcond2_i32:
+            tmp = do_constant_folding_cond2(&args[1], &args[3], args[5]);
+            if (tmp != 2) {
+                s->gen_opc_buf[op_index] = INDEX_op_movi_i32;
+                tcg_opt_gen_movi(gen_args, args[0], tmp);
+                gen_args += 2;
+            } else if ((args[5] == TCG_COND_LT || args[5] == TCG_COND_GE)
+                       && temps[args[3]].state == TCG_TEMP_CONST
+                       && temps[args[4]].state == TCG_TEMP_CONST
+                       && temps[args[3]].val == 0
+                       && temps[args[4]].val == 0) {
+                /* Simplify LT/GE comparisons vs zero to a single compare
+                   vs the high word of the input.  */
+                s->gen_opc_buf[op_index] = INDEX_op_setcond_i32;
                 gen_args[0] = args[0];
-                gen_args[1] = args[1];
-                gen_args[2] = args[2];
-                gen_args += 3;
-                args += 3;
-                break;
+                gen_args[1] = args[2];
+                gen_args[2] = args[4];
+                gen_args[3] = args[5];
+                gen_args += 4;
+            } else {
+                goto do_default;
             }
+            args += 6;
+            break;
+
         case INDEX_op_call:
             nb_call_args = (args[0] >> 16) + (args[0] & 0xffff);
-            if (!(args[nb_call_args + 1] & (TCG_CALL_CONST | TCG_CALL_PURE))) {
+            if (!(args[nb_call_args + 1] & (TCG_CALL_NO_READ_GLOBALS |
+                                            TCG_CALL_NO_WRITE_GLOBALS))) {
                 for (i = 0; i < nb_globals; i++) {
-                    reset_temp(i, nb_temps, nb_globals);
+                    reset_temp(i);
                 }
             }
             for (i = 0; i < (args[0] >> 16); i++) {
-                reset_temp(args[i + 1], nb_temps, nb_globals);
+                reset_temp(args[i + 1]);
             }
             i = nb_call_args + 3;
             while (i) {
@@ -487,22 +932,19 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                 i--;
             }
             break;
-        case INDEX_op_set_label:
-        case INDEX_op_jmp:
-        case INDEX_op_br:
-        CASE_OP_32_64(brcond):
-            memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
-            for (i = 0; i < def->nb_args; i++) {
-                *gen_args = *args;
-                args++;
-                gen_args++;
-            }
-            break;
+
         default:
-            /* Default case: we do know nothing about operation so no
-               propagation is done.  We only trash output args.  */
-            for (i = 0; i < def->nb_oargs; i++) {
-                reset_temp(args[i], nb_temps, nb_globals);
+        do_default:
+            /* Default case: we know nothing about operation (or were unable
+               to compute the operation result) so no propagation is done.
+               We trash everything if the operation is the end of a basic
+               block, otherwise we only trash the output args.  */
+            if (def->flags & TCG_OPF_BB_END) {
+                memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
+            } else {
+                for (i = 0; i < def->nb_oargs; i++) {
+                    reset_temp(args[i]);
+                }
             }
             for (i = 0; i < def->nb_args; i++) {
                 gen_args[i] = args[i];