Merge remote-tracking branch 'bonzini/split-main-loop-for-anthony' into staging

author: Anthony Liguori <aliguori@us.ibm.com> 2011-10-24 10:51:12 -0500
committer: Anthony Liguori <aliguori@us.ibm.com> 2011-10-24 10:51:12 -0500
commit: 952e849c150b4f1b89f8728cba00f925c1d6e75b (patch)
tree: 93e950b81e84d1d20a915d8b1da5c75d3911f553
parent: db418a0a7ef5887ea0f3d167584e6f500bb0c4c5 (diff)
parent: 99435906cc92a45d6c2f7e18221d31349836f90e (diff)
23 files changed, 1309 insertions, 1060 deletions
diff --git a/Makefile.objs b/Makefile.objs
index 9e2077833e..01587c8f8f 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -81,7 +81,7 @@ common-obj-y += $(oslib-obj-y)
 common-obj-$(CONFIG_WIN32) += os-win32.o
 common-obj-$(CONFIG_POSIX) += os-posix.o
 
-common-obj-y += tcg-runtime.o host-utils.o
+common-obj-y += tcg-runtime.o host-utils.o main-loop.o
 common-obj-y += irq.o input.o
 common-obj-$(CONFIG_PTIMER) += ptimer.o
 common-obj-$(CONFIG_MAX7310) += max7310.o
diff --git a/async.c b/async.c
index ca13962222..332d511ed5 100644
--- a/async.c
+++ b/async.c
@@ -24,6 +24,7 @@
 
 #include "qemu-common.h"
 #include "qemu-aio.h"
+#include "main-loop.h"
 
 /* Anchor of the list of Bottom Halves belonging to the context */
 static struct QEMUBH *first_bh;
diff --git a/cpus.c b/cpus.c
index 5f5b763e2c..79a76560cb 100644
--- a/cpus.c
+++ b/cpus.c
@@ -33,17 +33,12 @@
 
 #include "qemu-thread.h"
 #include "cpus.h"
+#include "main-loop.h"
 
 #ifndef _WIN32
 #include "compatfd.h"
 #endif
 
-#ifdef SIGRTMIN
-#define SIG_IPI (SIGRTMIN+4)
-#else
-#define SIG_IPI SIGUSR1
-#endif
-
 #ifdef CONFIG_LINUX
 
 #include <sys/prctl.h>
@@ -65,6 +60,281 @@
 static CPUState *next_cpu;
 
 /***********************************************************/
+/* guest cycle counter */
+
+/* Conversion factor from emulated instructions to virtual clock ticks.  */
+static int icount_time_shift;
+/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
+#define MAX_ICOUNT_SHIFT 10
+/* Compensate for varying guest execution speed.  */
+static int64_t qemu_icount_bias;
+static QEMUTimer *icount_rt_timer;
+static QEMUTimer *icount_vm_timer;
+static QEMUTimer *icount_warp_timer;
+static int64_t vm_clock_warp_start;
+static int64_t qemu_icount;
+
+typedef struct TimersState {
+    int64_t cpu_ticks_prev;
+    int64_t cpu_ticks_offset;
+    int64_t cpu_clock_offset;
+    int32_t cpu_ticks_enabled;
+    int64_t dummy;
+} TimersState;
+
+TimersState timers_state;
+
+/* Return the virtual CPU time, based on the instruction counter.  */
+int64_t cpu_get_icount(void)
+{
+    int64_t icount;
+    CPUState *env = cpu_single_env;;
+
+    icount = qemu_icount;
+    if (env) {
+        if (!can_do_io(env)) {
+            fprintf(stderr, "Bad clock read\n");
+        }
+        icount -= (env->icount_decr.u16.low + env->icount_extra);
+    }
+    return qemu_icount_bias + (icount << icount_time_shift);
+}
+
+/* return the host CPU cycle counter and handle stop/restart */
+int64_t cpu_get_ticks(void)
+{
+    if (use_icount) {
+        return cpu_get_icount();
+    }
+    if (!timers_state.cpu_ticks_enabled) {
+        return timers_state.cpu_ticks_offset;
+    } else {
+        int64_t ticks;
+        ticks = cpu_get_real_ticks();
+        if (timers_state.cpu_ticks_prev > ticks) {
+            /* Note: non increasing ticks may happen if the host uses
+               software suspend */
+            timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
+        }
+        timers_state.cpu_ticks_prev = ticks;
+        return ticks + timers_state.cpu_ticks_offset;
+    }
+}
+
+/* return the host CPU monotonic timer and handle stop/restart */
+int64_t cpu_get_clock(void)
+{
+    int64_t ti;
+    if (!timers_state.cpu_ticks_enabled) {
+        return timers_state.cpu_clock_offset;
+    } else {
+        ti = get_clock();
+        return ti + timers_state.cpu_clock_offset;
+    }
+}
+
+/* enable cpu_get_ticks() */
+void cpu_enable_ticks(void)
+{
+    if (!timers_state.cpu_ticks_enabled) {
+        timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
+        timers_state.cpu_clock_offset -= get_clock();
+        timers_state.cpu_ticks_enabled = 1;
+    }
+}
+
+/* disable cpu_get_ticks() : the clock is stopped. You must not call
+   cpu_get_ticks() after that.  */
+void cpu_disable_ticks(void)
+{
+    if (timers_state.cpu_ticks_enabled) {
+        timers_state.cpu_ticks_offset = cpu_get_ticks();
+        timers_state.cpu_clock_offset = cpu_get_clock();
+        timers_state.cpu_ticks_enabled = 0;
+    }
+}
+
+/* Correlation between real and virtual time is always going to be
+   fairly approximate, so ignore small variation.
+   When the guest is idle real and virtual time will be aligned in
+   the IO wait loop.  */
+#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
+
+static void icount_adjust(void)
+{
+    int64_t cur_time;
+    int64_t cur_icount;
+    int64_t delta;
+    static int64_t last_delta;
+    /* If the VM is not running, then do nothing.  */
+    if (!runstate_is_running()) {
+        return;
+    }
+    cur_time = cpu_get_clock();
+    cur_icount = qemu_get_clock_ns(vm_clock);
+    delta = cur_icount - cur_time;
+    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
+    if (delta > 0
+        && last_delta + ICOUNT_WOBBLE < delta * 2
+        && icount_time_shift > 0) {
+        /* The guest is getting too far ahead.  Slow time down.  */
+        icount_time_shift--;
+    }
+    if (delta < 0
+        && last_delta - ICOUNT_WOBBLE > delta * 2
+        && icount_time_shift < MAX_ICOUNT_SHIFT) {
+        /* The guest is getting too far behind.  Speed time up.  */
+        icount_time_shift++;
+    }
+    last_delta = delta;
+    qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
+}
+
+static void icount_adjust_rt(void *opaque)
+{
+    qemu_mod_timer(icount_rt_timer,
+                   qemu_get_clock_ms(rt_clock) + 1000);
+    icount_adjust();
+}
+
+static void icount_adjust_vm(void *opaque)
+{
+    qemu_mod_timer(icount_vm_timer,
+                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
+    icount_adjust();
+}
+
+static int64_t qemu_icount_round(int64_t count)
+{
+    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
+}
+
+static void icount_warp_rt(void *opaque)
+{
+    if (vm_clock_warp_start == -1) {
+        return;
+    }
+
+    if (runstate_is_running()) {
+        int64_t clock = qemu_get_clock_ns(rt_clock);
+        int64_t warp_delta = clock - vm_clock_warp_start;
+        if (use_icount == 1) {
+            qemu_icount_bias += warp_delta;
+        } else {
+            /*
+             * In adaptive mode, do not let the vm_clock run too
+             * far ahead of real time.
+             */
+            int64_t cur_time = cpu_get_clock();
+            int64_t cur_icount = qemu_get_clock_ns(vm_clock);
+            int64_t delta = cur_time - cur_icount;
+            qemu_icount_bias += MIN(warp_delta, delta);
+        }
+        if (qemu_clock_expired(vm_clock)) {
+            qemu_notify_event();
+        }
+    }
+    vm_clock_warp_start = -1;
+}
+
+void qemu_clock_warp(QEMUClock *clock)
+{
+    int64_t deadline;
+
+    /*
+     * There are too many global variables to make the "warp" behavior
+     * applicable to other clocks.  But a clock argument removes the
+     * need for if statements all over the place.
+     */
+    if (clock != vm_clock || !use_icount) {
+        return;
+    }
+
+    /*
+     * If the CPUs have been sleeping, advance the vm_clock timer now.  This
+     * ensures that the deadline for the timer is computed correctly below.
+     * This also makes sure that the insn counter is synchronized before the
+     * CPU starts running, in case the CPU is woken by an event other than
+     * the earliest vm_clock timer.
+     */
+    icount_warp_rt(NULL);
+    if (!all_cpu_threads_idle() || !qemu_clock_has_timers(vm_clock)) {
+        qemu_del_timer(icount_warp_timer);
+        return;
+    }
+
+    vm_clock_warp_start = qemu_get_clock_ns(rt_clock);
+    deadline = qemu_clock_deadline(vm_clock);
+    if (deadline > 0) {
+        /*
+         * Ensure the vm_clock proceeds even when the virtual CPU goes to
+         * sleep.  Otherwise, the CPU might be waiting for a future timer
+         * interrupt to wake it up, but the interrupt never comes because
+         * the vCPU isn't running any insns and thus doesn't advance the
+         * vm_clock.
+         *
+         * An extreme solution for this problem would be to never let VCPUs
+         * sleep in icount mode if there is a pending vm_clock timer; rather
+         * time could just advance to the next vm_clock event.  Instead, we
+         * do stop VCPUs and only advance vm_clock after some "real" time,
+         * (related to the time left until the next event) has passed.  This
+         * rt_clock timer will do this.  This avoids that the warps are too
+         * visible externally---for example, you will not be sending network
+         * packets continously instead of every 100ms.
+         */
+        qemu_mod_timer(icount_warp_timer, vm_clock_warp_start + deadline);
+    } else {
+        qemu_notify_event();
+    }
+}
+
+static const VMStateDescription vmstate_timers = {
+    .name = "timer",
+    .version_id = 2,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields      = (VMStateField[]) {
+        VMSTATE_INT64(cpu_ticks_offset, TimersState),
+        VMSTATE_INT64(dummy, TimersState),
+        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+void configure_icount(const char *option)
+{
+    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
+    if (!option) {
+        return;
+    }
+
+    icount_warp_timer = qemu_new_timer_ns(rt_clock, icount_warp_rt, NULL);
+    if (strcmp(option, "auto") != 0) {
+        icount_time_shift = strtol(option, NULL, 0);
+        use_icount = 1;
+        return;
+    }
+
+    use_icount = 2;
+
+    /* 125MIPS seems a reasonable initial guess at the guest speed.
+       It will be corrected fairly quickly anyway.  */
+    icount_time_shift = 3;
+
+    /* Have both realtime and virtual time triggers for speed adjustment.
+       The realtime trigger catches emulated time passing too slowly,
+       the virtual time trigger catches emulated time passing too fast.
+       Realtime triggers occur even when idle, so use them less frequently
+       than VM triggers.  */
+    icount_rt_timer = qemu_new_timer_ms(rt_clock, icount_adjust_rt, NULL);
+    qemu_mod_timer(icount_rt_timer,
+                   qemu_get_clock_ms(rt_clock) + 1000);
+    icount_vm_timer = qemu_new_timer_ns(vm_clock, icount_adjust_vm, NULL);
+    qemu_mod_timer(icount_vm_timer,
+                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
+}
+
+/***********************************************************/
 void hw_error(const char *fmt, ...)
 {
     va_list ap;
@@ -272,143 +542,10 @@ static void qemu_kvm_eat_signals(CPUState *env)
 #endif /* !CONFIG_LINUX */
 
 #ifndef _WIN32
-static int io_thread_fd = -1;
-
-static void qemu_event_increment(void)
-{
-    /* Write 8 bytes to be compatible with eventfd.  */
-    static const uint64_t val = 1;
-    ssize_t ret;
-
-    if (io_thread_fd == -1) {
-        return;
-    }
-    do {
-        ret = write(io_thread_fd, &val, sizeof(val));
-    } while (ret < 0 && errno == EINTR);
-
-    /* EAGAIN is fine, a read must be pending.  */
-    if (ret < 0 && errno != EAGAIN) {
-        fprintf(stderr, "qemu_event_increment: write() failed: %s\n",
-                strerror(errno));
-        exit (1);
-    }
-}
-
-static void qemu_event_read(void *opaque)
-{
-    int fd = (intptr_t)opaque;
-    ssize_t len;
-    char buffer[512];
-
-    /* Drain the notify pipe.  For eventfd, only 8 bytes will be read.  */
-    do {
-        len = read(fd, buffer, sizeof(buffer));
-    } while ((len == -1 && errno == EINTR) || len == sizeof(buffer));
-}
-
-static int qemu_event_init(void)
-{
-    int err;
-    int fds[2];
-
-    err = qemu_eventfd(fds);
-    if (err == -1) {
-        return -errno;
-    }
-    err = fcntl_setfl(fds[0], O_NONBLOCK);
-    if (err < 0) {
-        goto fail;
-    }
-    err = fcntl_setfl(fds[1], O_NONBLOCK);
-    if (err < 0) {
-        goto fail;
-    }
-    qemu_set_fd_handler2(fds[0], NULL, qemu_event_read, NULL,
-                         (void *)(intptr_t)fds[0]);
-
-    io_thread_fd = fds[1];
-    return 0;
-
-fail:
-    close(fds[0]);
-    close(fds[1]);
-    return err;
-}
-
 static void dummy_signal(int sig)
 {
 }
 
-/* If we have signalfd, we mask out the signals we want to handle and then
- * use signalfd to listen for them.  We rely on whatever the current signal
- * handler is to dispatch the signals when we receive them.
- */
-static void sigfd_handler(void *opaque)
-{
-    int fd = (intptr_t)opaque;
-    struct qemu_signalfd_siginfo info;
-    struct sigaction action;
-    ssize_t len;
-
-    while (1) {
-        do {
-            len = read(fd, &info, sizeof(info));
-        } while (len == -1 && errno == EINTR);
-
-        if (len == -1 && errno == EAGAIN) {
-            break;
-        }
-
-        if (len != sizeof(info)) {
-            printf("read from sigfd returned %zd: %m\n", len);
-            return;
-        }
-
-        sigaction(info.ssi_signo, NULL, &action);
-        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
-            action.sa_sigaction(info.ssi_signo,
-                                (siginfo_t *)&info, NULL);
-        } else if (action.sa_handler) {
-            action.sa_handler(info.ssi_signo);
-        }
-    }
-}
-
-static int qemu_signal_init(void)
-{
-    int sigfd;
-    sigset_t set;
-
-    /*
-     * SIG_IPI must be blocked in the main thread and must not be caught
-     * by sigwait() in the signal thread. Otherwise, the cpu thread will
-     * not catch it reliably.
-     */
-    sigemptyset(&set);
-    sigaddset(&set, SIG_IPI);
-    pthread_sigmask(SIG_BLOCK, &set, NULL);
-
-    sigemptyset(&set);
-    sigaddset(&set, SIGIO);
-    sigaddset(&set, SIGALRM);
-    sigaddset(&set, SIGBUS);
-    pthread_sigmask(SIG_BLOCK, &set, NULL);
-
-    sigfd = qemu_signalfd(&set);
-    if (sigfd == -1) {
-        fprintf(stderr, "failed to create signalfd\n");
-        return -errno;
-    }
-
-    fcntl_setfl(sigfd, O_NONBLOCK);
-
-    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
-                         (void *)(intptr_t)sigfd);
-
-    return 0;
-}
-
 static void qemu_kvm_init_cpu_signals(CPUState *env)
 {
     int r;
@@ -452,38 +589,6 @@ static void qemu_tcg_init_cpu_signals(void)
 }
 
 #else /* _WIN32 */
-
-HANDLE qemu_event_handle;
-
-static void dummy_event_handler(void *opaque)
-{
-}
-
-static int qemu_event_init(void)
-{
-    qemu_event_handle = CreateEvent(NULL, FALSE, FALSE, NULL);
-    if (!qemu_event_handle) {
-        fprintf(stderr, "Failed CreateEvent: %ld\n", GetLastError());
-        return -1;
-    }
-    qemu_add_wait_object(qemu_event_handle, dummy_event_handler, NULL);
-    return 0;
-}
-
-static void qemu_event_increment(void)
-{
-    if (!SetEvent(qemu_event_handle)) {
-        fprintf(stderr, "qemu_event_increment: SetEvent failed: %ld\n",
-                GetLastError());
-        exit (1);
-    }
-}
-
-static int qemu_signal_init(void)
-{
-    return 0;
-}
-
 static void qemu_kvm_init_cpu_signals(CPUState *env)
 {
     abort();
@@ -509,38 +614,16 @@ static QemuCond qemu_cpu_cond;
 static QemuCond qemu_pause_cond;
 static QemuCond qemu_work_cond;
 
-int qemu_init_main_loop(void)
+void qemu_init_cpu_loop(void)
 {
-    int ret;
-
     qemu_init_sigbus();
-
-    ret = qemu_signal_init();
-    if (ret) {
-        return ret;
-    }
-
-    /* Note eventfd must be drained before signalfd handlers run */
-    ret = qemu_event_init();
-    if (ret) {
-        return ret;
-    }
-
     qemu_cond_init(&qemu_cpu_cond);
     qemu_cond_init(&qemu_pause_cond);
     qemu_cond_init(&qemu_work_cond);
     qemu_cond_init(&qemu_io_proceeded_cond);
     qemu_mutex_init(&qemu_global_mutex);
-    qemu_mutex_lock(&qemu_global_mutex);
 
     qemu_thread_get_self(&io_thread);
-
-    return 0;
-}
-
-void qemu_main_loop_start(void)
-{
-    resume_all_vcpus();
 }
 
 void run_on_cpu(CPUState *env, void (*func)(void *data), void *data)
@@ -686,7 +769,7 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
 
     while (1) {
         cpu_exec_all();
-        if (use_icount && qemu_next_icount_deadline() <= 0) {
+        if (use_icount && qemu_clock_deadline(vm_clock) <= 0) {
             qemu_notify_event();
         }
         qemu_tcg_wait_io_event();
@@ -784,6 +867,7 @@ void pause_all_vcpus(void)
 {
     CPUState *penv = first_cpu;
 
+    qemu_clock_enable(vm_clock, false);
     while (penv) {
         penv->stop = 1;
         qemu_cpu_kick(penv);
@@ -858,11 +942,6 @@ void qemu_init_vcpu(void *_env)
     }
 }
 
-void qemu_notify_event(void)
-{
-    qemu_event_increment();
-}
-
 void cpu_stop_current(void)
 {
     if (cpu_single_env) {
@@ -914,7 +993,7 @@ static int tcg_cpu_exec(CPUState *env)
         qemu_icount -= (env->icount_decr.u16.low + env->icount_extra);
         env->icount_decr.u16.low = 0;
         env->icount_extra = 0;
-        count = qemu_icount_round(qemu_next_icount_deadline());
+        count = qemu_icount_round(qemu_clock_deadline(vm_clock));
         qemu_icount += count;
         decr = (count > 0xffff) ? 0xffff : count;
         count -= decr;
@@ -1006,22 +1085,6 @@ void set_cpu_log_filename(const char *optarg)
     cpu_set_log_filename(optarg);
 }
 
-/* Return the virtual CPU time, based on the instruction counter.  */
-int64_t cpu_get_icount(void)
-{
-    int64_t icount;
-    CPUState *env = cpu_single_env;;
-
-    icount = qemu_icount;
-    if (env) {
-        if (!can_do_io(env)) {
-            fprintf(stderr, "Bad clock read\n");
-        }
-        icount -= (env->icount_decr.u16.low + env->icount_extra);
-    }
-    return qemu_icount_bias + (icount << icount_time_shift);
-}
-
 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
 {
     /* XXX: implement xxx_cpu_list for targets that still miss it */
diff --git a/cpus.h b/cpus.h
index 58858855ff..3525375756 100644
--- a/cpus.h
+++ b/cpus.h
@@ -2,8 +2,7 @@
 #define QEMU_CPUS_H
 
 /* cpus.c */
-int qemu_init_main_loop(void);
-void qemu_main_loop_start(void);
+void qemu_init_cpu_loop(void);
 void resume_all_vcpus(void);
 void pause_all_vcpus(void);
 void cpu_stop_current(void);
diff --git a/exec-all.h b/exec-all.h
index 1120f84661..72ef246793 100644
--- a/exec-all.h
+++ b/exec-all.h
@@ -356,4 +356,18 @@ extern int singlestep;
 /* cpu-exec.c */
 extern volatile sig_atomic_t exit_request;
 
+/* Deterministic execution requires that IO only be performed on the last
+   instruction of a TB so that interrupts take effect immediately.  */
+static inline int can_do_io(CPUState *env)
+{
+    if (!use_icount) {
+        return 1;
+    }
+    /* If not executing code then assume we are ok.  */
+    if (!env->current_tb) {
+        return 1;
+    }
+    return env->can_do_io != 0;
+}
+
 #endif
diff --git a/exec.c b/exec.c
index d0cbf15822..9dc4edbf61 100644
--- a/exec.c
+++ b/exec.c
@@ -125,9 +125,6 @@ CPUState *cpu_single_env;
    1 = Precise instruction counting.
    2 = Adaptive rate instruction counting.  */
 int use_icount = 0;
-/* Current instruction counter.  While executing translated code this may
-   include some instructions that have not yet been executed.  */
-int64_t qemu_icount;
 
 typedef struct PageDesc {
     /* list of TBs intersecting this ram page */
diff --git a/hw/mac_dbdma.c b/hw/mac_dbdma.c
index 5affdd18a5..1791ec12e1 100644
--- a/hw/mac_dbdma.c
+++ b/hw/mac_dbdma.c
@@ -661,11 +661,6 @@ void DBDMA_register_channel(void *dbdma, int nchan, qemu_irq irq,
     ch->io.channel = ch;
 }
 
-void DBDMA_schedule(void)
-{
-    qemu_notify_event();
-}
-
 static void
 dbdma_control_write(DBDMA_channel *ch)
 {
diff --git a/hw/mac_dbdma.h b/hw/mac_dbdma.h
index 933e17c5b9..6d1abe6aae 100644
--- a/hw/mac_dbdma.h
+++ b/hw/mac_dbdma.h
@@ -41,5 +41,4 @@ struct DBDMA_io {
 void DBDMA_register_channel(void *dbdma, int nchan, qemu_irq irq,
                             DBDMA_rw rw, DBDMA_flush flush,
                             void *opaque);
-void DBDMA_schedule(void);
 void* DBDMA_init (MemoryRegion **dbdma_mem);
diff --git a/iohandler.c b/iohandler.c
index 4cc1c5ade6..5640d49388 100644
--- a/iohandler.c
+++ b/iohandler.c
@@ -26,6 +26,7 @@
 #include "qemu-common.h"
 #include "qemu-char.h"
 #include "qemu-queue.h"
+#include "main-loop.h"
 
 #ifndef _WIN32
 #include <sys/wait.h>
@@ -80,64 +81,12 @@ int qemu_set_fd_handler2(int fd,
     return 0;
 }
 
-typedef struct IOTrampoline
-{
-    GIOChannel *chan;
-    IOHandler *fd_read;
-    IOHandler *fd_write;
-    void *opaque;
-    guint tag;
-} IOTrampoline;
-
-static gboolean fd_trampoline(GIOChannel *chan, GIOCondition cond, gpointer opaque)
-{
-    IOTrampoline *tramp = opaque;
-
-    if ((cond & G_IO_IN) && tramp->fd_read) {
-        tramp->fd_read(tramp->opaque);
-    }
-
-    if ((cond & G_IO_OUT) && tramp->fd_write) {
-        tramp->fd_write(tramp->opaque);
-    }
-
-    return TRUE;
-}
-
 int qemu_set_fd_handler(int fd,
                         IOHandler *fd_read,
                         IOHandler *fd_write,
                         void *opaque)
 {
-    static IOTrampoline fd_trampolines[FD_SETSIZE];
-    IOTrampoline *tramp = &fd_trampolines[fd];
-
-    if (tramp->tag != 0) {
-        g_io_channel_unref(tramp->chan);
-        g_source_remove(tramp->tag);
-        tramp->tag = 0;
-    }
-
-    if (fd_read || fd_write || opaque) {
-        GIOCondition cond = 0;
-
-        tramp->fd_read = fd_read;
-        tramp->fd_write = fd_write;
-        tramp->opaque = opaque;
-
-        if (fd_read) {
-            cond |= G_IO_IN | G_IO_ERR;
-        }
-
-        if (fd_write) {
-            cond |= G_IO_OUT | G_IO_ERR;
-        }
-
-        tramp->chan = g_io_channel_unix_new(fd);
-        tramp->tag = g_io_add_watch(tramp->chan, cond, fd_trampoline, tramp);
-    }
-
-    return 0;
+    return qemu_set_fd_handler2(fd, NULL, fd_read, fd_write, opaque);
 }
 
 void qemu_iohandler_fill(int *pnfds, fd_set *readfds, fd_set *writefds, fd_set *xfds)
diff --git a/main-loop.c b/main-loop.c
new file mode 100644
index 0000000000..bfecdb7769
--- /dev/null
+++ b/main-loop.c
@@ -0,0 +1,495 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "config-host.h"
+#include <unistd.h>
+#include <signal.h>
+#include <time.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <stdbool.h>
+
+#ifdef _WIN32
+#include <windows.h>
+#include <winsock2.h>
+#include <ws2tcpip.h>
+#else
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <net/if.h>
+#include <arpa/inet.h>
+#include <sys/select.h>
+#include <sys/stat.h>
+#include "compatfd.h"
+#endif
+
+#include <glib.h>
+
+#include "main-loop.h"
+#include "qemu-timer.h"
+#include "slirp/libslirp.h"
+
+#ifndef _WIN32
+
+static int io_thread_fd = -1;
+
+void qemu_notify_event(void)
+{
+    /* Write 8 bytes to be compatible with eventfd.  */
+    static const uint64_t val = 1;
+    ssize_t ret;
+
+    if (io_thread_fd == -1) {
+        return;
+    }
+    do {
+        ret = write(io_thread_fd, &val, sizeof(val));
+    } while (ret < 0 && errno == EINTR);
+
+    /* EAGAIN is fine, a read must be pending.  */
+    if (ret < 0 && errno != EAGAIN) {
+        fprintf(stderr, "qemu_notify_event: write() failed: %s\n",
+                strerror(errno));
+        exit(1);
+    }
+}
+
+static void qemu_event_read(void *opaque)
+{
+    int fd = (intptr_t)opaque;
+    ssize_t len;
+    char buffer[512];
+
+    /* Drain the notify pipe.  For eventfd, only 8 bytes will be read.  */
+    do {
+        len = read(fd, buffer, sizeof(buffer));
+    } while ((len == -1 && errno == EINTR) || len == sizeof(buffer));
+}
+
+static int qemu_event_init(void)
+{
+    int err;
+    int fds[2];
+
+    err = qemu_eventfd(fds);
+    if (err == -1) {
+        return -errno;
+    }
+    err = fcntl_setfl(fds[0], O_NONBLOCK);
+    if (err < 0) {
+        goto fail;
+    }
+    err = fcntl_setfl(fds[1], O_NONBLOCK);
+    if (err < 0) {
+        goto fail;
+    }
+    qemu_set_fd_handler2(fds[0], NULL, qemu_event_read, NULL,
+                         (void *)(intptr_t)fds[0]);
+
+    io_thread_fd = fds[1];
+    return 0;
+
+fail:
+    close(fds[0]);
+    close(fds[1]);
+    return err;
+}
+
+/* If we have signalfd, we mask out the signals we want to handle and then
+ * use signalfd to listen for them.  We rely on whatever the current signal
+ * handler is to dispatch the signals when we receive them.
+ */
+static void sigfd_handler(void *opaque)
+{
+    int fd = (intptr_t)opaque;
+    struct qemu_signalfd_siginfo info;
+    struct sigaction action;
+    ssize_t len;
+
+    while (1) {
+        do {
+            len = read(fd, &info, sizeof(info));
+        } while (len == -1 && errno == EINTR);
+
+        if (len == -1 && errno == EAGAIN) {
+            break;
+        }
+
+        if (len != sizeof(info)) {
+            printf("read from sigfd returned %zd: %m\n", len);
+            return;
+        }
+
+        sigaction(info.ssi_signo, NULL, &action);
+        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
+            action.sa_sigaction(info.ssi_signo,
+                                (siginfo_t *)&info, NULL);
+        } else if (action.sa_handler) {
+            action.sa_handler(info.ssi_signo);
+        }
+    }
+}
+
+static int qemu_signal_init(void)
+{
+    int sigfd;
+    sigset_t set;
+
+    /*
+     * SIG_IPI must be blocked in the main thread and must not be caught
+     * by sigwait() in the signal thread. Otherwise, the cpu thread will
+     * not catch it reliably.
+     */
+    sigemptyset(&set);
+    sigaddset(&set, SIG_IPI);
+    pthread_sigmask(SIG_BLOCK, &set, NULL);
+
+    sigemptyset(&set);
+    sigaddset(&set, SIGIO);
+    sigaddset(&set, SIGALRM);
+    sigaddset(&set, SIGBUS);
+    pthread_sigmask(SIG_BLOCK, &set, NULL);
+
+    sigfd = qemu_signalfd(&set);
+    if (sigfd == -1) {
+        fprintf(stderr, "failed to create signalfd\n");
+        return -errno;
+    }
+
+    fcntl_setfl(sigfd, O_NONBLOCK);
+
+    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
+                         (void *)(intptr_t)sigfd);
+
+    return 0;
+}
+
+#else /* _WIN32 */
+
+HANDLE qemu_event_handle;
+
+static void dummy_event_handler(void *opaque)
+{
+}
+
+static int qemu_event_init(void)
+{
+    qemu_event_handle = CreateEvent(NULL, FALSE, FALSE, NULL);
+    if (!qemu_event_handle) {
+        fprintf(stderr, "Failed CreateEvent: %ld\n", GetLastError());
+        return -1;
+    }
+    qemu_add_wait_object(qemu_event_handle, dummy_event_handler, NULL);
+    return 0;
+}
+
+void qemu_notify_event(void)
+{
+    if (!SetEvent(qemu_event_handle)) {
+        fprintf(stderr, "qemu_notify_event: SetEvent failed: %ld\n",
+                GetLastError());
+        exit(1);
+    }
+}
+
+static int qemu_signal_init(void)
+{
+    return 0;
+}
+#endif
+
+int qemu_init_main_loop(void)
+{
+    int ret;
+
+    qemu_mutex_lock_iothread();
+    ret = qemu_signal_init();
+    if (ret) {
+        return ret;
+    }
+
+    /* Note eventfd must be drained before signalfd handlers run */
+    ret = qemu_event_init();
+    if (ret) {
+        return ret;
+    }
+
+    return 0;
+}
+
+
+static GPollFD poll_fds[1024 * 2]; /* this is probably overkill */
+static int n_poll_fds;
+static int max_priority;
+
+static void glib_select_fill(int *max_fd, fd_set *rfds, fd_set *wfds,
+                             fd_set *xfds, struct timeval *tv)
+{
+    GMainContext *context = g_main_context_default();
+    int i;
+    int timeout = 0, cur_timeout;
+
+    g_main_context_prepare(context, &max_priority);
+
+    n_poll_fds = g_main_context_query(context, max_priority, &timeout,
+                                      poll_fds, ARRAY_SIZE(poll_fds));
+    g_assert(n_poll_fds <= ARRAY_SIZE(poll_fds));
+
+    for (i = 0; i < n_poll_fds; i++) {
+        GPollFD *p = &poll_fds[i];
+
+        if ((p->events & G_IO_IN)) {
+            FD_SET(p->fd, rfds);
+            *max_fd = MAX(*max_fd, p->fd);
+        }
+        if ((p->events & G_IO_OUT)) {
+            FD_SET(p->fd, wfds);
+            *max_fd = MAX(*max_fd, p->fd);
+        }
+        if ((p->events & G_IO_ERR)) {
+            FD_SET(p->fd, xfds);
+            *max_fd = MAX(*max_fd, p->fd);
+        }
+    }
+
+    cur_timeout = (tv->tv_sec * 1000) + ((tv->tv_usec + 500) / 1000);
+    if (timeout >= 0 && timeout < cur_timeout) {
+        tv->tv_sec = timeout / 1000;
+        tv->tv_usec = (timeout % 1000) * 1000;
+    }
+}
+
+static void glib_select_poll(fd_set *rfds, fd_set *wfds, fd_set *xfds,
+                             bool err)
+{
+    GMainContext *context = g_main_context_default();
+
+    if (!err) {
+        int i;
+
+        for (i = 0; i < n_poll_fds; i++) {
+            GPollFD *p = &poll_fds[i];
+
+            if ((p->events & G_IO_IN) && FD_ISSET(p->fd, rfds)) {
+                p->revents |= G_IO_IN;
+            }
+            if ((p->events & G_IO_OUT) && FD_ISSET(p->fd, wfds)) {
+                p->revents |= G_IO_OUT;
+            }
+            if ((p->events & G_IO_ERR) && FD_ISSET(p->fd, xfds)) {
+                p->revents |= G_IO_ERR;
+            }
+        }
+    }
+
+    if (g_main_context_check(context, max_priority, poll_fds, n_poll_fds)) {
+        g_main_context_dispatch(context);
+    }
+}
+
+#ifdef _WIN32
+/***********************************************************/
+/* Polling handling */
+
+typedef struct PollingEntry {
+    PollingFunc *func;
+    void *opaque;
+    struct PollingEntry *next;
+} PollingEntry;
+
+static PollingEntry *first_polling_entry;
+
+int qemu_add_polling_cb(PollingFunc *func, void *opaque)
+{
+    PollingEntry **ppe, *pe;
+    pe = g_malloc0(sizeof(PollingEntry));
+    pe->func = func;
+    pe->opaque = opaque;
+    for(ppe = &first_polling_entry; *ppe != NULL; ppe = &(*ppe)->next);
+    *ppe = pe;
+    return 0;
+}
+
+void qemu_del_polling_cb(PollingFunc *func, void *opaque)
+{
+    PollingEntry **ppe, *pe;
+    for(ppe = &first_polling_entry; *ppe != NULL; ppe = &(*ppe)->next) {
+        pe = *ppe;
+        if (pe->func == func && pe->opaque == opaque) {
+            *ppe = pe->next;
+            g_free(pe);
+            break;
+        }
+    }
+}
+
+/***********************************************************/
+/* Wait objects support */
+typedef struct WaitObjects {
+    int num;
+    HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
+    WaitObjectFunc *func[MAXIMUM_WAIT_OBJECTS + 1];
+    void *opaque[MAXIMUM_WAIT_OBJECTS + 1];
+} WaitObjects;
+
+static WaitObjects wait_objects = {0};
+
+int qemu_add_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
+{
+    WaitObjects *w = &wait_objects;
+    if (w->num >= MAXIMUM_WAIT_OBJECTS) {
+        return -1;
+    }
+    w->events[w->num] = handle;
+    w->func[w->num] = func;
+    w->opaque[w->num] = opaque;
+    w->num++;
+    return 0;
+}
+
+void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
+{
+    int i, found;
+    WaitObjects *w = &wait_objects;
+
+    found = 0;
+    for (i = 0; i < w->num; i++) {
+        if (w->events[i] == handle) {
+            found = 1;
+        }
+        if (found) {
+            w->events[i] = w->events[i + 1];
+            w->func[i] = w->func[i + 1];
+            w->opaque[i] = w->opaque[i + 1];
+        }
+    }
+    if (found) {
+        w->num--;
+    }
+}
+
+static void os_host_main_loop_wait(int *timeout)
+{
+    int ret, ret2, i;
+    PollingEntry *pe;
+
+    /* XXX: need to suppress polling by better using win32 events */
+    ret = 0;
+    for (pe = first_polling_entry; pe != NULL; pe = pe->next) {
+        ret |= pe->func(pe->opaque);
+    }
+    if (ret == 0) {
+        int err;
+        WaitObjects *w = &wait_objects;
+
+        qemu_mutex_unlock_iothread();
+        ret = WaitForMultipleObjects(w->num, w->events, FALSE, *timeout);
+        qemu_mutex_lock_iothread();
+        if (WAIT_OBJECT_0 + 0 <= ret && ret <= WAIT_OBJECT_0 + w->num - 1) {
+            if (w->func[ret - WAIT_OBJECT_0]) {
+                w->func[ret - WAIT_OBJECT_0](w->opaque[ret - WAIT_OBJECT_0]);
+            }
+
+            /* Check for additional signaled events */
+            for (i = (ret - WAIT_OBJECT_0 + 1); i < w->num; i++) {
+                /* Check if event is signaled */
+                ret2 = WaitForSingleObject(w->events[i], 0);
+                if (ret2 == WAIT_OBJECT_0) {
+                    if (w->func[i]) {
+                        w->func[i](w->opaque[i]);
+                    }
+                } else if (ret2 != WAIT_TIMEOUT) {
+                    err = GetLastError();
+                    fprintf(stderr, "WaitForSingleObject error %d %d\n", i, err);
+                }
+            }
+        } else if (ret != WAIT_TIMEOUT) {
+            err = GetLastError();
+            fprintf(stderr, "WaitForMultipleObjects error %d %d\n", ret, err);
+        }
+    }
+
+    *timeout = 0;
+}
+#else
+static inline void os_host_main_loop_wait(int *timeout)
+{
+}
+#endif
+
+int main_loop_wait(int nonblocking)
+{
+    fd_set rfds, wfds, xfds;
+    int ret, nfds;
+    struct timeval tv;
+    int timeout;
+
+    if (nonblocking) {
+        timeout = 0;
+    } else {
+        timeout = qemu_calculate_timeout();
+        qemu_bh_update_timeout(&timeout);
+    }
+
+    os_host_main_loop_wait(&timeout);
+
+    tv.tv_sec = timeout / 1000;
+    tv.tv_usec = (timeout % 1000) * 1000;
+
+    /* poll any events */
+    /* XXX: separate device handlers from system ones */
+    nfds = -1;
+    FD_ZERO(&rfds);
+    FD_ZERO(&wfds);
+    FD_ZERO(&xfds);
+
+#ifdef CONFIG_SLIRP
+    slirp_select_fill(&nfds, &rfds, &wfds, &xfds);
+#endif
+    qemu_iohandler_fill(&nfds, &rfds, &wfds, &xfds);
+    glib_select_fill(&nfds, &rfds, &wfds, &xfds, &tv);
+
+    if (timeout > 0) {
+        qemu_mutex_unlock_iothread();
+    }
+
+    ret = select(nfds + 1, &rfds, &wfds, &xfds, &tv);
+
+    if (timeout > 0) {
+        qemu_mutex_lock_iothread();
+    }
+
+    glib_select_poll(&rfds, &wfds, &xfds, (ret < 0));
+    qemu_iohandler_poll(&rfds, &wfds, &xfds, ret);
+#ifdef CONFIG_SLIRP
+    slirp_select_poll(&rfds, &wfds, &xfds, (ret < 0));
+#endif
+
+    qemu_run_all_timers();
+
+    /* Check bottom-halves last in case any of the earlier events triggered
+       them.  */
+    qemu_bh_poll();
+
+    return ret;
+}
diff --git a/main-loop.h b/main-loop.h
new file mode 100644
index 0000000000..8a716b133f
--- /dev/null
+++ b/main-loop.h
@@ -0,0 +1,351 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_MAIN_LOOP_H
+#define QEMU_MAIN_LOOP_H 1
+
+#ifdef SIGRTMIN
+#define SIG_IPI (SIGRTMIN+4)
+#else
+#define SIG_IPI SIGUSR1
+#endif
+
+/**
+ * qemu_init_main_loop: Set up the process so that it can run the main loop.
+ *
+ * This includes setting up signal handlers.  It should be called before
+ * any other threads are created.  In addition, threads other than the
+ * main one should block signals that are trapped by the main loop.
+ * For simplicity, you can consider these signals to be safe: SIGUSR1,
+ * SIGUSR2, thread signals (SIGFPE, SIGILL, SIGSEGV, SIGBUS) and real-time
+ * signals if available.  Remember that Windows in practice does not have
+ * signals, though.
+ */
+int qemu_init_main_loop(void);
+
+/**
+ * main_loop_wait: Run one iteration of the main loop.
+ *
+ * If @nonblocking is true, poll for events, otherwise suspend until
+ * one actually occurs.  The main loop usually consists of a loop that
+ * repeatedly calls main_loop_wait(false).
+ *
+ * Main loop services include file descriptor callbacks, bottom halves
+ * and timers (defined in qemu-timer.h).  Bottom halves are similar to timers
+ * that execute immediately, but have a lower overhead and scheduling them
+ * is wait-free, thread-safe and signal-safe.
+ *
+ * It is sometimes useful to put a whole program in a coroutine.  In this
+ * case, the coroutine actually should be started from within the main loop,
+ * so that the main loop can run whenever the coroutine yields.  To do this,
+ * you can use a bottom half to enter the coroutine as soon as the main loop
+ * starts:
+ *
+ *     void enter_co_bh(void *opaque) {
+ *         QEMUCoroutine *co = opaque;
+ *         qemu_coroutine_enter(co, NULL);
+ *     }
+ *
+ *     ...
+ *     QEMUCoroutine *co = qemu_coroutine_create(coroutine_entry);
+ *     QEMUBH *start_bh = qemu_bh_new(enter_co_bh, co);
+ *     qemu_bh_schedule(start_bh);
+ *     while (...) {
+ *         main_loop_wait(false);
+ *     }
+ *
+ * (In the future we may provide a wrapper for this).
+ *
+ * @nonblocking: Whether the caller should block until an event occurs.
+ */
+int main_loop_wait(int nonblocking);
+
+/**
+ * qemu_notify_event: Force processing of pending events.
+ *
+ * Similar to signaling a condition variable, qemu_notify_event forces
+ * main_loop_wait to look at pending events and exit.  The caller of
+ * main_loop_wait will usually call it again very soon, so qemu_notify_event
+ * also has the side effect of recalculating the sets of file descriptors
+ * that the main loop waits for.
+ *
+ * Calling qemu_notify_event is rarely necessary, because main loop
+ * services (bottom halves and timers) call it themselves.  One notable
+ * exception occurs when using qemu_set_fd_handler2 (see below).
+ */
+void qemu_notify_event(void);
+
+#ifdef _WIN32
+/* return TRUE if no sleep should be done afterwards */
+typedef int PollingFunc(void *opaque);
+
+/**
+ * qemu_add_polling_cb: Register a Windows-specific polling callback
+ *
+ * Currently, under Windows some events are polled rather than waited for.
+ * Polling callbacks do not ensure that @func is called timely, because
+ * the main loop might wait for an arbitrarily long time.  If possible,
+ * you should instead create a separate thread that does a blocking poll
+ * and set a Win32 event object.  The event can then be passed to
+ * qemu_add_wait_object.
+ *
+ * Polling callbacks really have nothing Windows specific in them, but
+ * as they are a hack and are currenly not necessary under POSIX systems,
+ * they are only available when QEMU is running under Windows.
+ *
+ * @func: The function that does the polling, and returns 1 to force
+ * immediate completion of main_loop_wait.
+ * @opaque: A pointer-size value that is passed to @func.
+ */
+int qemu_add_polling_cb(PollingFunc *func, void *opaque);
+
+/**
+ * qemu_del_polling_cb: Unregister a Windows-specific polling callback
+ *
+ * This function removes a callback that was registered with
+ * qemu_add_polling_cb.
+ *
+ * @func: The function that was passed to qemu_add_polling_cb.
+ * @opaque: A pointer-size value that was passed to qemu_add_polling_cb.
+ */
+void qemu_del_polling_cb(PollingFunc *func, void *opaque);
+
+/* Wait objects handling */
+typedef void WaitObjectFunc(void *opaque);
+
+/**
+ * qemu_add_wait_object: Register a callback for a Windows handle
+ *
+ * Under Windows, the iohandler mechanism can only be used with sockets.
+ * QEMU must use the WaitForMultipleObjects API to wait on other handles.
+ * This function registers a #HANDLE with QEMU, so that it will be included
+ * in the main loop's calls to WaitForMultipleObjects.  When the handle
+ * is in a signaled state, QEMU will call @func.
+ *
+ * @handle: The Windows handle to be observed.
+ * @func: A function to be called when @handle is in a signaled state.
+ * @opaque: A pointer-size value that is passed to @func.
+ */
+int qemu_add_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque);
+
+/**
+ * qemu_del_wait_object: Unregister a callback for a Windows handle
+ *
+ * This function removes a callback that was registered with
+ * qemu_add_wait_object.
+ *
+ * @func: The function that was passed to qemu_add_wait_object.
+ * @opaque: A pointer-size value that was passed to qemu_add_wait_object.
+ */
+void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque);
+#endif
+
+/* async I/O support */
+
+typedef void IOReadHandler(void *opaque, const uint8_t *buf, int size);
+typedef int IOCanReadHandler(void *opaque);
+typedef void IOHandler(void *opaque);
+
+/**
+ * qemu_set_fd_handler2: Register a file descriptor with the main loop
+ *
+ * This function tells the main loop to wake up whenever one of the
+ * following conditions is true:
+ *
+ * 1) if @fd_write is not %NULL, when the file descriptor is writable;
+ *
+ * 2) if @fd_read is not %NULL, when the file descriptor is readable.
+ *
+ * @fd_read_poll can be used to disable the @fd_read callback temporarily.
+ * This is useful to avoid calling qemu_set_fd_handler2 every time the
+ * client becomes interested in reading (or dually, stops being interested).
+ * A typical example is when @fd is a listening socket and you want to bound
+ * the number of active clients.  Remember to call qemu_notify_event whenever
+ * the condition may change from %false to %true.
+ *
+ * The callbacks that are set up by qemu_set_fd_handler2 are level-triggered.
+ * If @fd_read does not read from @fd, or @fd_write does not write to @fd
+ * until its buffers are full, they will be called again on the next
+ * iteration.
+ *
+ * @fd: The file descriptor to be observed.  Under Windows it must be
+ * a #SOCKET.
+ *
+ * @fd_read_poll: A function that returns 1 if the @fd_read callback
+ * should be fired.  If the function returns 0, the main loop will not
+ * end its iteration even if @fd becomes readable.
+ *
+ * @fd_read: A level-triggered callback that is fired if @fd is readable
+ * at the beginning of a main loop iteration, or if it becomes readable
+ * during one.
+ *
+ * @fd_write: A level-triggered callback that is fired when @fd is writable
+ * at the beginning of a main loop iteration, or if it becomes writable
+ * during one.
+ *
+ * @opaque: A pointer-sized value that is passed to @fd_read_poll,
+ * @fd_read and @fd_write.
+ */
+int qemu_set_fd_handler2(int fd,
+                         IOCanReadHandler *fd_read_poll,
+                         IOHandler *fd_read,
+                         IOHandler *fd_write,
+                         void *opaque);
+
+/**
+ * qemu_set_fd_handler: Register a file descriptor with the main loop
+ *
+ * This function tells the main loop to wake up whenever one of the
+ * following conditions is true:
+ *
+ * 1) if @fd_write is not %NULL, when the file descriptor is writable;
+ *
+ * 2) if @fd_read is not %NULL, when the file descriptor is readable.
+ *
+ * The callbacks that are set up by qemu_set_fd_handler are level-triggered.
+ * If @fd_read does not read from @fd, or @fd_write does not write to @fd
+ * until its buffers are full, they will be called again on the next
+ * iteration.
+ *
+ * @fd: The file descriptor to be observed.  Under Windows it must be
+ * a #SOCKET.
+ *
+ * @fd_read: A level-triggered callback that is fired if @fd is readable
+ * at the beginning of a main loop iteration, or if it becomes readable
+ * during one.
+ *
+ * @fd_write: A level-triggered callback that is fired when @fd is writable
+ * at the beginning of a main loop iteration, or if it becomes writable
+ * during one.
+ *
+ * @opaque: A pointer-sized value that is passed to @fd_read and @fd_write.
+ */
+int qemu_set_fd_handler(int fd,
+                        IOHandler *fd_read,
+                        IOHandler *fd_write,
+                        void *opaque);
+
+typedef struct QEMUBH QEMUBH;
+typedef void QEMUBHFunc(void *opaque);
+
+/**
+ * qemu_bh_new: Allocate a new bottom half structure.
+ *
+ * Bottom halves are lightweight callbacks whose invocation is guaranteed
+ * to be wait-free, thread-safe and signal-safe.  The #QEMUBH structure
+ * is opaque and must be allocated prior to its use.
+ */
+QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque);
+
+/**
+ * qemu_bh_schedule: Schedule a bottom half.
+ *
+ * Scheduling a bottom half interrupts the main loop and causes the
+ * execution of the callback that was passed to qemu_bh_new.
+ *
+ * Bottom halves that are scheduled from a bottom half handler are instantly
+ * invoked.  This can create an infinite loop if a bottom half handler
+ * schedules itself.
+ *
+ * @bh: The bottom half to be scheduled.
+ */
+void qemu_bh_schedule(QEMUBH *bh);
+
+/**
+ * qemu_bh_cancel: Cancel execution of a bottom half.
+ *
+ * Canceling execution of a bottom half undoes the effect of calls to
+ * qemu_bh_schedule without freeing its resources yet.  While cancellation
+ * itself is also wait-free and thread-safe, it can of course race with the
+ * loop that executes bottom halves unless you are holding the iothread
+ * mutex.  This makes it mostly useless if you are not holding the mutex.
+ *
+ * @bh: The bottom half to be canceled.
+ */
+void qemu_bh_cancel(QEMUBH *bh);
+
+/**
+ *qemu_bh_delete: Cancel execution of a bottom half and free its resources.
+ *
+ * Deleting a bottom half frees the memory that was allocated for it by
+ * qemu_bh_new.  It also implies canceling the bottom half if it was
+ * scheduled.
+ *
+ * @bh: The bottom half to be deleted.
+ */
+void qemu_bh_delete(QEMUBH *bh);
+
+#ifdef CONFIG_POSIX
+/**
+ * qemu_add_child_watch: Register a child process for reaping.
+ *
+ * Under POSIX systems, a parent process must read the exit status of
+ * its child processes using waitpid, or the operating system will not
+ * free some of the resources attached to that process.
+ *
+ * This function directs the QEMU main loop to observe a child process
+ * and call waitpid as soon as it exits; the watch is then removed
+ * automatically.  It is useful whenever QEMU forks a child process
+ * but will find out about its termination by other means such as a
+ * "broken pipe".
+ *
+ * @pid: The pid that QEMU should observe.
+ */
+int qemu_add_child_watch(pid_t pid);
+#endif
+
+/**
+ * qemu_mutex_lock_iothread: Lock the main loop mutex.
+ *
+ * This function locks the main loop mutex.  The mutex is taken by
+ * qemu_init_main_loop and always taken except while waiting on
+ * external events (such as with select).  The mutex should be taken
+ * by threads other than the main loop thread when calling
+ * qemu_bh_new(), qemu_set_fd_handler() and basically all other
+ * functions documented in this file.
+ */
+void qemu_mutex_lock_iothread(void);
+
+/**
+ * qemu_mutex_unlock_iothread: Unlock the main loop mutex.
+ *
+ * This function unlocks the main loop mutex.  The mutex is taken by
+ * qemu_init_main_loop and always taken except while waiting on
+ * external events (such as with select).  The mutex should be unlocked
+ * as soon as possible by threads other than the main loop thread,
+ * because it prevents the main loop from processing callbacks,
+ * including timers and bottom halves.
+ */
+void qemu_mutex_unlock_iothread(void);
+
+/* internal interfaces */
+
+void qemu_iohandler_fill(int *pnfds, fd_set *readfds, fd_set *writefds, fd_set *xfds);
+void qemu_iohandler_poll(fd_set *readfds, fd_set *writefds, fd_set *xfds, int rc);
+
+void qemu_bh_schedule_idle(QEMUBH *bh);
+int qemu_bh_poll(void);
+void qemu_bh_update_timeout(int *timeout);
+
+#endif
diff --git a/os-win32.c b/os-win32.c
index f09f01fc49..79094016f1 100644
--- a/os-win32.c
+++ b/os-win32.c
@@ -48,129 +48,6 @@ int setenv(const char *name, const char *value, int overwrite)
     return result;
 }
 
-/***********************************************************/
-/* Polling handling */
-
-typedef struct PollingEntry {
-    PollingFunc *func;
-    void *opaque;
-    struct PollingEntry *next;
-} PollingEntry;
-
-static PollingEntry *first_polling_entry;
-
-int qemu_add_polling_cb(PollingFunc *func, void *opaque)
-{
-    PollingEntry **ppe, *pe;
-    pe = g_malloc0(sizeof(PollingEntry));
-    pe->func = func;
-    pe->opaque = opaque;
-    for(ppe = &first_polling_entry; *ppe != NULL; ppe = &(*ppe)->next);
-    *ppe = pe;
-    return 0;
-}
-
-void qemu_del_polling_cb(PollingFunc *func, void *opaque)
-{
-    PollingEntry **ppe, *pe;
-    for(ppe = &first_polling_entry; *ppe != NULL; ppe = &(*ppe)->next) {
-        pe = *ppe;
-        if (pe->func == func && pe->opaque == opaque) {
-            *ppe = pe->next;
-            g_free(pe);
-            break;
-        }
-    }
-}
-
-/***********************************************************/
-/* Wait objects support */
-typedef struct WaitObjects {
-    int num;
-    HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
-    WaitObjectFunc *func[MAXIMUM_WAIT_OBJECTS + 1];
-    void *opaque[MAXIMUM_WAIT_OBJECTS + 1];
-} WaitObjects;
-
-static WaitObjects wait_objects = {0};
-
-int qemu_add_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
-{
-    WaitObjects *w = &wait_objects;
-
-    if (w->num >= MAXIMUM_WAIT_OBJECTS)
-        return -1;
-    w->events[w->num] = handle;
-    w->func[w->num] = func;
-    w->opaque[w->num] = opaque;
-    w->num++;
-    return 0;
-}
-
-void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
-{
-    int i, found;
-    WaitObjects *w = &wait_objects;
-
-    found = 0;
-    for (i = 0; i < w->num; i++) {
-        if (w->events[i] == handle)
-            found = 1;
-        if (found) {
-            w->events[i] = w->events[i + 1];
-            w->func[i] = w->func[i + 1];
-            w->opaque[i] = w->opaque[i + 1];
-        }
-    }
-    if (found)
-        w->num--;
-}
-
-void os_host_main_loop_wait(int *timeout)
-{
-    int ret, ret2, i;
-    PollingEntry *pe;
-
-    /* XXX: need to suppress polling by better using win32 events */
-    ret = 0;
-    for(pe = first_polling_entry; pe != NULL; pe = pe->next) {
-        ret |= pe->func(pe->opaque);
-    }
-    if (ret == 0) {
-        int err;
-        WaitObjects *w = &wait_objects;
-
-        qemu_mutex_unlock_iothread();
-        ret = WaitForMultipleObjects(w->num, w->events, FALSE, *timeout);
-        qemu_mutex_lock_iothread();
-        if (WAIT_OBJECT_0 + 0 <= ret && ret <= WAIT_OBJECT_0 + w->num - 1) {
-            if (w->func[ret - WAIT_OBJECT_0])
-                w->func[ret - WAIT_OBJECT_0](w->opaque[ret - WAIT_OBJECT_0]);
-
-            /* Check for additional signaled events */
-            for(i = (ret - WAIT_OBJECT_0 + 1); i < w->num; i++) {
-
-                /* Check if event is signaled */
-                ret2 = WaitForSingleObject(w->events[i], 0);
-                if(ret2 == WAIT_OBJECT_0) {
-                    if (w->func[i])
-                        w->func[i](w->opaque[i]);
-                } else if (ret2 == WAIT_TIMEOUT) {
-                } else {
-                    err = GetLastError();
-                    fprintf(stderr, "WaitForSingleObject error %d %d\n", i, err);
-                }
-            }
-        } else if (ret == WAIT_TIMEOUT) {
-        } else {
-            err = GetLastError();
-            fprintf(stderr, "WaitForMultipleObjects error %d %d\n", ret, err);
-        }
-    }
-
-    *timeout = 0;
-}
-
 static BOOL WINAPI qemu_ctrl_handler(DWORD type)
 {
     exit(STATUS_CONTROL_C_EXIT);
diff --git a/qemu-char.h b/qemu-char.h
index eebbdd8f01..7efcf99f53 100644
--- a/qemu-char.h
+++ b/qemu-char.h
@@ -7,6 +7,7 @@
 #include "qemu-config.h"
 #include "qobject.h"
 #include "qstring.h"
+#include "main-loop.h"
 
 /* character device */
 
@@ -237,15 +238,4 @@ void qemu_chr_close_mem(CharDriverState *chr);
 QString *qemu_chr_mem_to_qs(CharDriverState *chr);
 size_t qemu_chr_mem_osize(const CharDriverState *chr);
 
-/* async I/O support */
-
-int qemu_set_fd_handler2(int fd,
-                         IOCanReadHandler *fd_read_poll,
-                         IOHandler *fd_read,
-                         IOHandler *fd_write,
-                         void *opaque);
-int qemu_set_fd_handler(int fd,
-                        IOHandler *fd_read,
-                        IOHandler *fd_write,
-                        void *opaque);
 #endif
diff --git a/qemu-common.h b/qemu-common.h
index 5e87bdf2f2..1c15cb17a7 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -13,7 +13,6 @@
 
 typedef struct QEMUTimer QEMUTimer;
 typedef struct QEMUFile QEMUFile;
-typedef struct QEMUBH QEMUBH;
 typedef struct DeviceState DeviceState;
 
 struct Monitor;
@@ -96,6 +95,10 @@ static inline char *realpath(const char *path, char *resolved_path)
 }
 #endif
 
+/* icount */
+void configure_icount(const char *option);
+extern int use_icount;
+
 /* FIXME: Remove NEED_CPU_H.  */
 #ifndef NEED_CPU_H
 
@@ -113,23 +116,6 @@ static inline char *realpath(const char *path, char *resolved_path)
 int qemu_main(int argc, char **argv, char **envp);
 #endif
 
-/* bottom halves */
-typedef void QEMUBHFunc(void *opaque);
-
-QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque);
-void qemu_bh_schedule(QEMUBH *bh);
-/* Bottom halfs that are scheduled from a bottom half handler are instantly
- * invoked.  This can create an infinite loop if a bottom half handler
- * schedules itself.  qemu_bh_schedule_idle() avoids this infinite loop by
- * ensuring that the bottom half isn't executed until the next main loop
- * iteration.
- */
-void qemu_bh_schedule_idle(QEMUBH *bh);
-void qemu_bh_cancel(QEMUBH *bh);
-void qemu_bh_delete(QEMUBH *bh);
-int qemu_bh_poll(void);
-void qemu_bh_update_timeout(int *timeout);
-
 void qemu_get_timedate(struct tm *tm, int offset);
 int qemu_timedate_diff(struct tm *tm);
 
@@ -183,16 +169,12 @@ const char *path(const char *pathname);
 
 void *qemu_oom_check(void *ptr);
 
-void qemu_mutex_lock_iothread(void);
-void qemu_mutex_unlock_iothread(void);
-
 int qemu_open(const char *name, int flags, ...);
 ssize_t qemu_write_full(int fd, const void *buf, size_t count)
     QEMU_WARN_UNUSED_RESULT;
 void qemu_set_cloexec(int fd);
 
 #ifndef _WIN32
-int qemu_add_child_watch(pid_t pid);
 int qemu_eventfd(int pipefd[2]);
 int qemu_pipe(int pipefd[2]);
 #endif
@@ -207,14 +189,6 @@ int qemu_pipe(int pipefd[2]);
 
 void QEMU_NORETURN hw_error(const char *fmt, ...) GCC_FMT_ATTR(1, 2);
 
-/* IO callbacks.  */
-typedef void IOReadHandler(void *opaque, const uint8_t *buf, int size);
-typedef int IOCanReadHandler(void *opaque);
-typedef void IOHandler(void *opaque);
-
-void qemu_iohandler_fill(int *pnfds, fd_set *readfds, fd_set *writefds, fd_set *xfds);
-void qemu_iohandler_poll(fd_set *readfds, fd_set *writefds, fd_set *xfds, int rc);
-
 struct ParallelIOArg {
     void *buffer;
     int count;
@@ -276,9 +250,6 @@ void cpu_exec_init_all(void);
 void cpu_save(QEMUFile *f, void *opaque);
 int cpu_load(QEMUFile *f, void *opaque, int version_id);
 
-/* Force QEMU to process pending events */
-void qemu_notify_event(void);
-
 /* Unblock cpu */
 void qemu_cpu_kick(void *env);
 void qemu_cpu_kick_self(void);
diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c
index 2a385a3bb8..6b58160058 100644
--- a/qemu-coroutine-lock.c
+++ b/qemu-coroutine-lock.c
@@ -26,6 +26,7 @@
 #include "qemu-coroutine.h"
 #include "qemu-coroutine-int.h"
 #include "qemu-queue.h"
+#include "main-loop.h"
 #include "trace.h"
 
 static QTAILQ_HEAD(, Coroutine) unlock_bh_queue =
diff --git a/qemu-os-posix.h b/qemu-os-posix.h
index 81fd9ab389..920499d836 100644
--- a/qemu-os-posix.h
+++ b/qemu-os-posix.h
@@ -26,10 +26,6 @@
 #ifndef QEMU_OS_POSIX_H
 #define QEMU_OS_POSIX_H
 
-static inline void os_host_main_loop_wait(int *timeout)
-{
-}
-
 void os_set_line_buffering(void);
 void os_set_proc_name(const char *s);
 void os_setup_signal_handling(void);
diff --git a/qemu-os-win32.h b/qemu-os-win32.h
index 8a069d7fb6..8eda4bdc20 100644
--- a/qemu-os-win32.h
+++ b/qemu-os-win32.h
@@ -28,26 +28,11 @@
 
 #include <windows.h>
 #include <winsock2.h>
+#include "main-loop.h"
 
 /* Declaration of ffs() is missing in MinGW's strings.h. */
 int ffs(int i);
 
-/* Polling handling */
-
-/* return TRUE if no sleep should be done afterwards */
-typedef int PollingFunc(void *opaque);
-
-int qemu_add_polling_cb(PollingFunc *func, void *opaque);
-void qemu_del_polling_cb(PollingFunc *func, void *opaque);
-
-/* Wait objects handling */
-typedef void WaitObjectFunc(void *opaque);
-
-int qemu_add_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque);
-void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque);
-
-void os_host_main_loop_wait(int *timeout);
-
 static inline void os_setup_signal_handling(void) {}
 static inline void os_daemonize(void) {}
 static inline void os_setup_post(void) {}
diff --git a/qemu-timer.c b/qemu-timer.c
index ad1fc8b871..f11a28dd03 100644
--- a/qemu-timer.c
+++ b/qemu-timer.c
@@ -46,82 +46,6 @@
 
 #include "qemu-timer.h"
 
-/* Conversion factor from emulated instructions to virtual clock ticks.  */
-int icount_time_shift;
-/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
-#define MAX_ICOUNT_SHIFT 10
-/* Compensate for varying guest execution speed.  */
-int64_t qemu_icount_bias;
-static QEMUTimer *icount_rt_timer;
-static QEMUTimer *icount_vm_timer;
-
-/***********************************************************/
-/* guest cycle counter */
-
-typedef struct TimersState {
-    int64_t cpu_ticks_prev;
-    int64_t cpu_ticks_offset;
-    int64_t cpu_clock_offset;
-    int32_t cpu_ticks_enabled;
-    int64_t dummy;
-} TimersState;
-
-TimersState timers_state;
-
-/* return the host CPU cycle counter and handle stop/restart */
-int64_t cpu_get_ticks(void)
-{
-    if (use_icount) {
-        return cpu_get_icount();
-    }
-    if (!timers_state.cpu_ticks_enabled) {
-        return timers_state.cpu_ticks_offset;
-    } else {
-        int64_t ticks;
-        ticks = cpu_get_real_ticks();
-        if (timers_state.cpu_ticks_prev > ticks) {
-            /* Note: non increasing ticks may happen if the host uses
-               software suspend */
-            timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
-        }
-        timers_state.cpu_ticks_prev = ticks;
-        return ticks + timers_state.cpu_ticks_offset;
-    }
-}
-
-/* return the host CPU monotonic timer and handle stop/restart */
-static int64_t cpu_get_clock(void)
-{
-    int64_t ti;
-    if (!timers_state.cpu_ticks_enabled) {
-        return timers_state.cpu_clock_offset;
-    } else {
-        ti = get_clock();
-        return ti + timers_state.cpu_clock_offset;
-    }
-}
-
-/* enable cpu_get_ticks() */
-void cpu_enable_ticks(void)
-{
-    if (!timers_state.cpu_ticks_enabled) {
-        timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
-        timers_state.cpu_clock_offset -= get_clock();
-        timers_state.cpu_ticks_enabled = 1;
-    }
-}
-
-/* disable cpu_get_ticks() : the clock is stopped. You must not call
-   cpu_get_ticks() after that.  */
-void cpu_disable_ticks(void)
-{
-    if (timers_state.cpu_ticks_enabled) {
-        timers_state.cpu_ticks_offset = cpu_get_ticks();
-        timers_state.cpu_clock_offset = cpu_get_clock();
-        timers_state.cpu_ticks_enabled = 0;
-    }
-}
-
 /***********************************************************/
 /* timers */
 
@@ -133,7 +57,7 @@ struct QEMUClock {
     int type;
     int enabled;
 
-    QEMUTimer *warp_timer;
+    QEMUTimer *active_timers;
 
     NotifierList reset_notifiers;
     int64_t last;
@@ -152,7 +76,7 @@ struct qemu_alarm_timer {
     char const *name;
     int (*start)(struct qemu_alarm_timer *t);
     void (*stop)(struct qemu_alarm_timer *t);
-    void (*rearm)(struct qemu_alarm_timer *t);
+    void (*rearm)(struct qemu_alarm_timer *t, int64_t nearest_delta_ns);
 #if defined(__linux__)
     int fd;
     timer_t timer;
@@ -180,12 +104,46 @@ static inline int alarm_has_dynticks(struct qemu_alarm_timer *t)
     return !!t->rearm;
 }
 
+static int64_t qemu_next_alarm_deadline(void)
+{
+    int64_t delta;
+    int64_t rtdelta;
+
+    if (!use_icount && vm_clock->active_timers) {
+        delta = vm_clock->active_timers->expire_time -
+                     qemu_get_clock_ns(vm_clock);
+    } else {
+        delta = INT32_MAX;
+    }
+    if (host_clock->active_timers) {
+        int64_t hdelta = host_clock->active_timers->expire_time -
+                 qemu_get_clock_ns(host_clock);
+        if (hdelta < delta) {
+            delta = hdelta;
+        }
+    }
+    if (rt_clock->active_timers) {
+        rtdelta = (rt_clock->active_timers->expire_time -
+                 qemu_get_clock_ns(rt_clock));
+        if (rtdelta < delta) {
+            delta = rtdelta;
+        }
+    }
+
+    return delta;
+}
+
 static void qemu_rearm_alarm_timer(struct qemu_alarm_timer *t)
 {
-    if (!alarm_has_dynticks(t))
+    int64_t nearest_delta_ns;
+    assert(alarm_has_dynticks(t));
+    if (!rt_clock->active_timers &&
+        !vm_clock->active_timers &&
+        !host_clock->active_timers) {
         return;
-
-    t->rearm(t);
+    }
+    nearest_delta_ns = qemu_next_alarm_deadline();
+    t->rearm(t, nearest_delta_ns);
 }
 
 /* TODO: MIN_TIMER_REARM_NS should be optimized */
@@ -195,83 +153,28 @@ static void qemu_rearm_alarm_timer(struct qemu_alarm_timer *t)
 
 static int mm_start_timer(struct qemu_alarm_timer *t);
 static void mm_stop_timer(struct qemu_alarm_timer *t);
-static void mm_rearm_timer(struct qemu_alarm_timer *t);
+static void mm_rearm_timer(struct qemu_alarm_timer *t, int64_t delta);
 
 static int win32_start_timer(struct qemu_alarm_timer *t);
 static void win32_stop_timer(struct qemu_alarm_timer *t);
-static void win32_rearm_timer(struct qemu_alarm_timer *t);
+static void win32_rearm_timer(struct qemu_alarm_timer *t, int64_t delta);
 
 #else
 
 static int unix_start_timer(struct qemu_alarm_timer *t);
 static void unix_stop_timer(struct qemu_alarm_timer *t);
-static void unix_rearm_timer(struct qemu_alarm_timer *t);
+static void unix_rearm_timer(struct qemu_alarm_timer *t, int64_t delta);
 
 #ifdef __linux__
 
 static int dynticks_start_timer(struct qemu_alarm_timer *t);
 static void dynticks_stop_timer(struct qemu_alarm_timer *t);
-static void dynticks_rearm_timer(struct qemu_alarm_timer *t);
+static void dynticks_rearm_timer(struct qemu_alarm_timer *t, int64_t delta);
 
 #endif /* __linux__ */
 
 #endif /* _WIN32 */
 
-/* Correlation between real and virtual time is always going to be
-   fairly approximate, so ignore small variation.
-   When the guest is idle real and virtual time will be aligned in
-   the IO wait loop.  */
-#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
-
-static void icount_adjust(void)
-{
-    int64_t cur_time;
-    int64_t cur_icount;
-    int64_t delta;
-    static int64_t last_delta;
-    /* If the VM is not running, then do nothing.  */
-    if (!runstate_is_running())
-        return;
-
-    cur_time = cpu_get_clock();
-    cur_icount = qemu_get_clock_ns(vm_clock);
-    delta = cur_icount - cur_time;
-    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
-    if (delta > 0
-        && last_delta + ICOUNT_WOBBLE < delta * 2
-        && icount_time_shift > 0) {
-        /* The guest is getting too far ahead.  Slow time down.  */
-        icount_time_shift--;
-    }
-    if (delta < 0
-        && last_delta - ICOUNT_WOBBLE > delta * 2
-        && icount_time_shift < MAX_ICOUNT_SHIFT) {
-        /* The guest is getting too far behind.  Speed time up.  */
-        icount_time_shift++;
-    }
-    last_delta = delta;
-    qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
-}
-
-static void icount_adjust_rt(void * opaque)
-{
-    qemu_mod_timer(icount_rt_timer,
-                   qemu_get_clock_ms(rt_clock) + 1000);
-    icount_adjust();
-}
-
-static void icount_adjust_vm(void * opaque)
-{
-    qemu_mod_timer(icount_vm_timer,
-                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
-    icount_adjust();
-}
-
-int64_t qemu_icount_round(int64_t count)
-{
-    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
-}
-
 static struct qemu_alarm_timer alarm_timers[] = {
 #ifndef _WIN32
 #ifdef __linux__
@@ -352,14 +255,10 @@ next:
     }
 }
 
-#define QEMU_NUM_CLOCKS 3
-
 QEMUClock *rt_clock;
 QEMUClock *vm_clock;
 QEMUClock *host_clock;
 
-static QEMUTimer *active_timers[QEMU_NUM_CLOCKS];
-
 static QEMUClock *qemu_new_clock(int type)
 {
     QEMUClock *clock;
@@ -367,101 +266,43 @@ static QEMUClock *qemu_new_clock(int type)
     clock = g_malloc0(sizeof(QEMUClock));
     clock->type = type;
     clock->enabled = 1;
+    clock->last = INT64_MIN;
     notifier_list_init(&clock->reset_notifiers);
-    /* required to detect & report backward jumps */
-    if (type == QEMU_CLOCK_HOST) {
-        clock->last = get_clock_realtime();
-    }
     return clock;
 }
 
 void qemu_clock_enable(QEMUClock *clock, int enabled)
 {
+    bool old = clock->enabled;
     clock->enabled = enabled;
+    if (enabled && !old) {
+        qemu_rearm_alarm_timer(alarm_timer);
+    }
 }
 
-static int64_t vm_clock_warp_start;
-
-static void icount_warp_rt(void *opaque)
+int64_t qemu_clock_has_timers(QEMUClock *clock)
 {
-    if (vm_clock_warp_start == -1) {
-        return;
-    }
-
-    if (runstate_is_running()) {
-        int64_t clock = qemu_get_clock_ns(rt_clock);
-        int64_t warp_delta = clock - vm_clock_warp_start;
-        if (use_icount == 1) {
-            qemu_icount_bias += warp_delta;
-        } else {
-            /*
-             * In adaptive mode, do not let the vm_clock run too
-             * far ahead of real time.
-             */
-            int64_t cur_time = cpu_get_clock();
-            int64_t cur_icount = qemu_get_clock_ns(vm_clock);
-            int64_t delta = cur_time - cur_icount;
-            qemu_icount_bias += MIN(warp_delta, delta);
-        }
-        if (qemu_timer_expired(active_timers[QEMU_CLOCK_VIRTUAL],
-                               qemu_get_clock_ns(vm_clock))) {
-            qemu_notify_event();
-        }
-    }
-    vm_clock_warp_start = -1;
+    return !!clock->active_timers;
 }
 
-void qemu_clock_warp(QEMUClock *clock)
+int64_t qemu_clock_expired(QEMUClock *clock)
 {
-    int64_t deadline;
+    return (clock->active_timers &&
+            clock->active_timers->expire_time < qemu_get_clock_ns(clock));
+}
 
-    if (!clock->warp_timer) {
-        return;
-    }
+int64_t qemu_clock_deadline(QEMUClock *clock)
+{
+    /* To avoid problems with overflow limit this to 2^32.  */
+    int64_t delta = INT32_MAX;
 
-    /*
-     * There are too many global variables to make the "warp" behavior
-     * applicable to other clocks.  But a clock argument removes the
-     * need for if statements all over the place.
-     */
-    assert(clock == vm_clock);
-
-    /*
-     * If the CPUs have been sleeping, advance the vm_clock timer now.  This
-     * ensures that the deadline for the timer is computed correctly below.
-     * This also makes sure that the insn counter is synchronized before the
-     * CPU starts running, in case the CPU is woken by an event other than
-     * the earliest vm_clock timer.
-     */
-    icount_warp_rt(NULL);
-    if (!all_cpu_threads_idle() || !active_timers[clock->type]) {
-        qemu_del_timer(clock->warp_timer);
-        return;
+    if (clock->active_timers) {
+        delta = clock->active_timers->expire_time - qemu_get_clock_ns(clock);
     }
-
-    vm_clock_warp_start = qemu_get_clock_ns(rt_clock);
-    deadline = qemu_next_icount_deadline();
-    if (deadline > 0) {
-        /*
-         * Ensure the vm_clock proceeds even when the virtual CPU goes to
-         * sleep.  Otherwise, the CPU might be waiting for a future timer
-         * interrupt to wake it up, but the interrupt never comes because
-         * the vCPU isn't running any insns and thus doesn't advance the
-         * vm_clock.
-         *
-         * An extreme solution for this problem would be to never let VCPUs
-         * sleep in icount mode if there is a pending vm_clock timer; rather
-         * time could just advance to the next vm_clock event.  Instead, we
-         * do stop VCPUs and only advance vm_clock after some "real" time,
-         * (related to the time left until the next event) has passed.  This
-         * rt_clock timer will do this.  This avoids that the warps are too
-         * visible externally---for example, you will not be sending network
-         * packets continously instead of every 100ms.
-         */
-        qemu_mod_timer(clock->warp_timer, vm_clock_warp_start + deadline);
-    } else {
-        qemu_notify_event();
+    if (delta < 0) {
+        delta = 0;
     }
+    return delta;
 }
 
 QEMUTimer *qemu_new_timer(QEMUClock *clock, int scale,
@@ -489,7 +330,7 @@ void qemu_del_timer(QEMUTimer *ts)
 
     /* NOTE: this code must be signal safe because
        qemu_timer_expired() can be called from a signal. */
-    pt = &active_timers[ts->clock->type];
+    pt = &ts->clock->active_timers;
     for(;;) {
         t = *pt;
         if (!t)
@@ -504,7 +345,7 @@ void qemu_del_timer(QEMUTimer *ts)
 
 /* modify the current timer so that it will be fired when current_time
    >= expire_time. The corresponding callback will be called. */
-static void qemu_mod_timer_ns(QEMUTimer *ts, int64_t expire_time)
+void qemu_mod_timer_ns(QEMUTimer *ts, int64_t expire_time)
 {
     QEMUTimer **pt, *t;
 
@@ -513,7 +354,7 @@ static void qemu_mod_timer_ns(QEMUTimer *ts, int64_t expire_time)
     /* add the timer in the sorted list */
     /* NOTE: this code must be signal safe because
        qemu_timer_expired() can be called from a signal. */
-    pt = &active_timers[ts->clock->type];
+    pt = &ts->clock->active_timers;
     for(;;) {
         t = *pt;
         if (!qemu_timer_expired_ns(t, expire_time)) {
@@ -526,7 +367,7 @@ static void qemu_mod_timer_ns(QEMUTimer *ts, int64_t expire_time)
     *pt = ts;
 
     /* Rearm if necessary  */
-    if (pt == &active_timers[ts->clock->type]) {
+    if (pt == &ts->clock->active_timers) {
         if (!alarm_timer->pending) {
             qemu_rearm_alarm_timer(alarm_timer);
         }
@@ -538,8 +379,6 @@ static void qemu_mod_timer_ns(QEMUTimer *ts, int64_t expire_time)
     }
 }
 
-/* modify the current timer so that it will be fired when current_time
-   >= expire_time. The corresponding callback will be called. */
 void qemu_mod_timer(QEMUTimer *ts, int64_t expire_time)
 {
     qemu_mod_timer_ns(ts, expire_time * ts->scale);
@@ -548,7 +387,7 @@ void qemu_mod_timer(QEMUTimer *ts, int64_t expire_time)
 int qemu_timer_pending(QEMUTimer *ts)
 {
     QEMUTimer *t;
-    for(t = active_timers[ts->clock->type]; t != NULL; t = t->next) {
+    for (t = ts->clock->active_timers; t != NULL; t = t->next) {
         if (t == ts)
             return 1;
     }
@@ -569,7 +408,7 @@ static void qemu_run_timers(QEMUClock *clock)
         return;
 
     current_time = qemu_get_clock_ns(clock);
-    ptimer_head = &active_timers[clock->type];
+    ptimer_head = &clock->active_timers;
     for(;;) {
         ts = *ptimer_head;
         if (!qemu_timer_expired_ns(ts, current_time)) {
@@ -624,79 +463,11 @@ void init_clocks(void)
     rt_clock = qemu_new_clock(QEMU_CLOCK_REALTIME);
     vm_clock = qemu_new_clock(QEMU_CLOCK_VIRTUAL);
     host_clock = qemu_new_clock(QEMU_CLOCK_HOST);
-
-    rtc_clock = host_clock;
 }
 
-/* save a timer */
-void qemu_put_timer(QEMUFile *f, QEMUTimer *ts)
+uint64_t qemu_timer_expire_time_ns(QEMUTimer *ts)
 {
-    uint64_t expire_time;
-
-    if (qemu_timer_pending(ts)) {
-        expire_time = ts->expire_time;
-    } else {
-        expire_time = -1;
-    }
-    qemu_put_be64(f, expire_time);
-}
-
-void qemu_get_timer(QEMUFile *f, QEMUTimer *ts)
-{
-    uint64_t expire_time;
-
-    expire_time = qemu_get_be64(f);
-    if (expire_time != -1) {
-        qemu_mod_timer_ns(ts, expire_time);
-    } else {
-        qemu_del_timer(ts);
-    }
-}
-
-static const VMStateDescription vmstate_timers = {
-    .name = "timer",
-    .version_id = 2,
-    .minimum_version_id = 1,
-    .minimum_version_id_old = 1,
-    .fields      = (VMStateField []) {
-        VMSTATE_INT64(cpu_ticks_offset, TimersState),
-        VMSTATE_INT64(dummy, TimersState),
-        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
-        VMSTATE_END_OF_LIST()
-    }
-};
-
-void configure_icount(const char *option)
-{
-    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
-    if (!option)
-        return;
-
-    vm_clock->warp_timer = qemu_new_timer_ns(rt_clock, icount_warp_rt, NULL);
-
-    if (strcmp(option, "auto") != 0) {
-        icount_time_shift = strtol(option, NULL, 0);
-        use_icount = 1;
-        return;
-    }
-
-    use_icount = 2;
-
-    /* 125MIPS seems a reasonable initial guess at the guest speed.
-       It will be corrected fairly quickly anyway.  */
-    icount_time_shift = 3;
-
-    /* Have both realtime and virtual time triggers for speed adjustment.
-       The realtime trigger catches emulated time passing too slowly,
-       the virtual time trigger catches emulated time passing too fast.
-       Realtime triggers occur even when idle, so use them less frequently
-       than VM triggers.  */
-    icount_rt_timer = qemu_new_timer_ms(rt_clock, icount_adjust_rt, NULL);
-    qemu_mod_timer(icount_rt_timer,
-                   qemu_get_clock_ms(rt_clock) + 1000);
-    icount_vm_timer = qemu_new_timer_ns(vm_clock, icount_adjust_vm, NULL);
-    qemu_mod_timer(icount_vm_timer,
-                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
+    return qemu_timer_pending(ts) ? ts->expire_time : -1;
 }
 
 void qemu_run_all_timers(void)
@@ -710,16 +481,11 @@ void qemu_run_all_timers(void)
     }
 
     /* vm time timers */
-    if (runstate_is_running()) {
-        qemu_run_timers(vm_clock);
-    }
-
+    qemu_run_timers(vm_clock);
     qemu_run_timers(rt_clock);
     qemu_run_timers(host_clock);
 }
 
-static int64_t qemu_next_alarm_deadline(void);
-
 #ifdef _WIN32
 static void CALLBACK host_alarm_handler(PVOID lpParam, BOOLEAN unused)
 #else
@@ -767,50 +533,6 @@ static void host_alarm_handler(int host_signum)
     }
 }
 
-int64_t qemu_next_icount_deadline(void)
-{
-    /* To avoid problems with overflow limit this to 2^32.  */
-    int64_t delta = INT32_MAX;
-
-    assert(use_icount);
-    if (active_timers[QEMU_CLOCK_VIRTUAL]) {
-        delta = active_timers[QEMU_CLOCK_VIRTUAL]->expire_time -
-                     qemu_get_clock_ns(vm_clock);
-    }
-
-    if (delta < 0)
-        delta = 0;
-
-    return delta;
-}
-
-static int64_t qemu_next_alarm_deadline(void)
-{
-    int64_t delta;
-    int64_t rtdelta;
-
-    if (!use_icount && active_timers[QEMU_CLOCK_VIRTUAL]) {
-        delta = active_timers[QEMU_CLOCK_VIRTUAL]->expire_time -
-                     qemu_get_clock_ns(vm_clock);
-    } else {
-        delta = INT32_MAX;
-    }
-    if (active_timers[QEMU_CLOCK_HOST]) {
-        int64_t hdelta = active_timers[QEMU_CLOCK_HOST]->expire_time -
-                 qemu_get_clock_ns(host_clock);
-        if (hdelta < delta)
-            delta = hdelta;
-    }
-    if (active_timers[QEMU_CLOCK_REALTIME]) {
-        rtdelta = (active_timers[QEMU_CLOCK_REALTIME]->expire_time -
-                 qemu_get_clock_ns(rt_clock));
-        if (rtdelta < delta)
-            delta = rtdelta;
-    }
-
-    return delta;
-}
-
 #if defined(__linux__)
 
 #include "compatfd.h"
@@ -863,20 +585,13 @@ static void dynticks_stop_timer(struct qemu_alarm_timer *t)
     timer_delete(host_timer);
 }
 
-static void dynticks_rearm_timer(struct qemu_alarm_timer *t)
+static void dynticks_rearm_timer(struct qemu_alarm_timer *t,
+                                 int64_t nearest_delta_ns)
 {
     timer_t host_timer = t->timer;
     struct itimerspec timeout;
-    int64_t nearest_delta_ns = INT64_MAX;
     int64_t current_ns;
 
-    assert(alarm_has_dynticks(t));
-    if (!active_timers[QEMU_CLOCK_REALTIME] &&
-        !active_timers[QEMU_CLOCK_VIRTUAL] &&
-        !active_timers[QEMU_CLOCK_HOST])
-        return;
-
-    nearest_delta_ns = qemu_next_alarm_deadline();
     if (nearest_delta_ns < MIN_TIMER_REARM_NS)
         nearest_delta_ns = MIN_TIMER_REARM_NS;
 
@@ -918,19 +633,12 @@ static int unix_start_timer(struct qemu_alarm_timer *t)
     return 0;
 }
 
-static void unix_rearm_timer(struct qemu_alarm_timer *t)
+static void unix_rearm_timer(struct qemu_alarm_timer *t,
+                             int64_t nearest_delta_ns)
 {
     struct itimerval itv;
-    int64_t nearest_delta_ns = INT64_MAX;
     int err;
 
-    assert(alarm_has_dynticks(t));
-    if (!active_timers[QEMU_CLOCK_REALTIME] &&
-        !active_timers[QEMU_CLOCK_VIRTUAL] &&
-        !active_timers[QEMU_CLOCK_HOST])
-        return;
-
-    nearest_delta_ns = qemu_next_alarm_deadline();
     if (nearest_delta_ns < MIN_TIMER_REARM_NS)
         nearest_delta_ns = MIN_TIMER_REARM_NS;
 
@@ -1017,23 +725,14 @@ static void mm_stop_timer(struct qemu_alarm_timer *t)
     timeEndPeriod(mm_period);
 }
 
-static void mm_rearm_timer(struct qemu_alarm_timer *t)
+static void mm_rearm_timer(struct qemu_alarm_timer *t, int64_t delta)
 {
-    int nearest_delta_ms;
-
-    assert(alarm_has_dynticks(t));
-    if (!active_timers[QEMU_CLOCK_REALTIME] &&
-        !active_timers[QEMU_CLOCK_VIRTUAL] &&
-        !active_timers[QEMU_CLOCK_HOST]) {
-        return;
-    }
-
-    timeKillEvent(mm_timer);
-
-    nearest_delta_ms = (qemu_next_alarm_deadline() + 999999) / 1000000;
+    int nearest_delta_ms = (delta + 999999) / 1000000;
     if (nearest_delta_ms < 1) {
         nearest_delta_ms = 1;
     }
+
+    timeKillEvent(mm_timer);
     mm_timer = timeSetEvent(nearest_delta_ms,
                             mm_period,
                             mm_alarm_handler,
@@ -1085,19 +784,14 @@ static void win32_stop_timer(struct qemu_alarm_timer *t)
     }
 }
 
-static void win32_rearm_timer(struct qemu_alarm_timer *t)
+static void win32_rearm_timer(struct qemu_alarm_timer *t,
+                              int64_t nearest_delta_ns)
 {
     HANDLE hTimer = t->timer;
     int nearest_delta_ms;
     BOOLEAN success;
 
-    assert(alarm_has_dynticks(t));
-    if (!active_timers[QEMU_CLOCK_REALTIME] &&
-        !active_timers[QEMU_CLOCK_VIRTUAL] &&
-        !active_timers[QEMU_CLOCK_HOST])
-        return;
-
-    nearest_delta_ms = (qemu_next_alarm_deadline() + 999999) / 1000000;
+    nearest_delta_ms = (nearest_delta_ns + 999999) / 1000000;
     if (nearest_delta_ms < 1) {
         nearest_delta_ms = 1;
     }
@@ -1116,11 +810,11 @@ static void win32_rearm_timer(struct qemu_alarm_timer *t)
 
 #endif /* _WIN32 */
 
-static void alarm_timer_on_change_state_rearm(void *opaque, int running,
-                                              RunState state)
+static void quit_timers(void)
 {
-    if (running)
-        qemu_rearm_alarm_timer((struct qemu_alarm_timer *) opaque);
+    struct qemu_alarm_timer *t = alarm_timer;
+    alarm_timer = NULL;
+    t->stop(t);
 }
 
 int init_timer_alarm(void)
@@ -1142,9 +836,9 @@ int init_timer_alarm(void)
     }
 
     /* first event is at time 0 */
+    atexit(quit_timers);
     t->pending = 1;
     alarm_timer = t;
-    qemu_add_vm_change_state_handler(alarm_timer_on_change_state_rearm, t);
 
     return 0;
 
@@ -1152,13 +846,6 @@ fail:
     return err;
 }
 
-void quit_timers(void)
-{
-    struct qemu_alarm_timer *t = alarm_timer;
-    alarm_timer = NULL;
-    t->stop(t);
-}
-
 int qemu_calculate_timeout(void)
 {
     return 1000;
diff --git a/qemu-timer.h b/qemu-timer.h
index 0a43469847..67ca72e045 100644
--- a/qemu-timer.h
+++ b/qemu-timer.h
@@ -2,6 +2,7 @@
 #define QEMU_TIMER_H
 
 #include "qemu-common.h"
+#include "main-loop.h"
 #include "notify.h"
 #include <time.h>
 #include <sys/time.h>
@@ -38,6 +39,9 @@ extern QEMUClock *vm_clock;
 extern QEMUClock *host_clock;
 
 int64_t qemu_get_clock_ns(QEMUClock *clock);
+int64_t qemu_clock_has_timers(QEMUClock *clock);
+int64_t qemu_clock_expired(QEMUClock *clock);
+int64_t qemu_clock_deadline(QEMUClock *clock);
 void qemu_clock_enable(QEMUClock *clock, int enabled);
 void qemu_clock_warp(QEMUClock *clock);
 
@@ -49,19 +53,18 @@ QEMUTimer *qemu_new_timer(QEMUClock *clock, int scale,
                           QEMUTimerCB *cb, void *opaque);
 void qemu_free_timer(QEMUTimer *ts);
 void qemu_del_timer(QEMUTimer *ts);
+void qemu_mod_timer_ns(QEMUTimer *ts, int64_t expire_time);
 void qemu_mod_timer(QEMUTimer *ts, int64_t expire_time);
 int qemu_timer_pending(QEMUTimer *ts);
 int qemu_timer_expired(QEMUTimer *timer_head, int64_t current_time);
+uint64_t qemu_timer_expire_time_ns(QEMUTimer *ts);
 
 void qemu_run_all_timers(void);
 int qemu_alarm_pending(void);
-int64_t qemu_next_icount_deadline(void);
 void configure_alarms(char const *opt);
-void configure_icount(const char *option);
 int qemu_calculate_timeout(void);
 void init_clocks(void);
 int init_timer_alarm(void);
-void quit_timers(void);
 
 int64_t cpu_get_ticks(void);
 void cpu_enable_ticks(void);
@@ -150,12 +153,8 @@ void ptimer_run(ptimer_state *s, int oneshot);
 void ptimer_stop(ptimer_state *s);
 
 /* icount */
-int64_t qemu_icount_round(int64_t count);
-extern int64_t qemu_icount;
-extern int use_icount;
-extern int icount_time_shift;
-extern int64_t qemu_icount_bias;
 int64_t cpu_get_icount(void);
+int64_t cpu_get_clock(void);
 
 /*******************************************/
 /* host CPU ticks (if available) */
@@ -311,22 +310,6 @@ static inline int64_t cpu_get_real_ticks (void)
 }
 #endif
 
-#ifdef NEED_CPU_H
-/* Deterministic execution requires that IO only be performed on the last
-   instruction of a TB so that interrupts take effect immediately.  */
-static inline int can_do_io(CPUState *env)
-{
-    if (!use_icount)
-        return 1;
-
-    /* If not executing code then assume we are ok.  */
-    if (!env->current_tb)
-        return 1;
-
-    return env->can_do_io != 0;
-}
-#endif
-
 #ifdef CONFIG_PROFILER
 static inline int64_t profile_getclock(void)
 {
diff --git a/savevm.c b/savevm.c
index cf79a56871..f01838fb2d 100644
--- a/savevm.c
+++ b/savevm.c
@@ -81,6 +81,7 @@
 #include "migration.h"
 #include "qemu_socket.h"
 #include "qemu-queue.h"
+#include "qemu-timer.h"
 #include "cpus.h"
 
 #define SELF_ANNOUNCE_ROUNDS 5
@@ -712,6 +713,30 @@ uint64_t qemu_get_be64(QEMUFile *f)
     return v;
 }
 
+
+/* timer */
+
+void qemu_put_timer(QEMUFile *f, QEMUTimer *ts)
+{
+    uint64_t expire_time;
+
+    expire_time = qemu_timer_expire_time_ns(ts);
+    qemu_put_be64(f, expire_time);
+}
+
+void qemu_get_timer(QEMUFile *f, QEMUTimer *ts)
+{
+    uint64_t expire_time;
+
+    expire_time = qemu_get_be64(f);
+    if (expire_time != -1) {
+        qemu_mod_timer_ns(ts, expire_time);
+    } else {
+        qemu_del_timer(ts);
+    }
+}
+
+
 /* bool */
 
 static int get_bool(QEMUFile *f, void *pv, size_t size)
diff --git a/slirp/libslirp.h b/slirp/libslirp.h
index a7551235e2..890fd86c3c 100644
--- a/slirp/libslirp.h
+++ b/slirp/libslirp.h
@@ -3,8 +3,6 @@
 
 #include "qemu-common.h"
 
-#ifdef CONFIG_SLIRP
-
 struct Slirp;
 typedef struct Slirp Slirp;
 
@@ -44,13 +42,4 @@ void slirp_socket_recv(Slirp *slirp, struct in_addr guest_addr,
 size_t slirp_socket_can_recv(Slirp *slirp, struct in_addr guest_addr,
                              int guest_port);
 
-#else /* !CONFIG_SLIRP */
-
-static inline void slirp_select_fill(int *pnfds, fd_set *readfds,
-                                     fd_set *writefds, fd_set *xfds) { }
-
-static inline void slirp_select_poll(fd_set *readfds, fd_set *writefds,
-                                     fd_set *xfds, int select_error) { }
-#endif /* !CONFIG_SLIRP */
-
 #endif
diff --git a/sysemu.h b/sysemu.h
index 7d288f865d..22cd720016 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -8,6 +8,7 @@
 #include "qemu-timer.h"
 #include "qapi-types.h"
 #include "notify.h"
+#include "main-loop.h"
 
 /* vl.c */
 
@@ -64,8 +65,6 @@ void do_info_snapshots(Monitor *mon);
 
 void qemu_announce_self(void);
 
-int main_loop_wait(int nonblocking);
-
 bool qemu_savevm_state_blocked(Monitor *mon);
 int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable,
                             int shared);
diff --git a/vl.c b/vl.c
index 66f70fb6ae..1ddb17bfd9 100644
--- a/vl.c
+++ b/vl.c
@@ -148,6 +148,7 @@ int main(int argc, char **argv)
 #include "qemu-objects.h"
 #include "qemu-options.h"
 #include "qmp-commands.h"
+#include "main-loop.h"
 #ifdef CONFIG_VIRTFS
 #include "fsdev/qemu-fsdev.h"
 #endif
@@ -1425,142 +1426,51 @@ void qemu_system_vmstop_request(RunState state)
     qemu_notify_event();
 }
 
-static GPollFD poll_fds[1024 * 2]; /* this is probably overkill */
-static int n_poll_fds;
-static int max_priority;
+qemu_irq qemu_system_powerdown;
 
-static void glib_select_fill(int *max_fd, fd_set *rfds, fd_set *wfds,
-                             fd_set *xfds, struct timeval *tv)
+static bool main_loop_should_exit(void)
 {
-    GMainContext *context = g_main_context_default();
-    int i;
-    int timeout = 0, cur_timeout;
-
-    g_main_context_prepare(context, &max_priority);
-
-    n_poll_fds = g_main_context_query(context, max_priority, &timeout,
-                                      poll_fds, ARRAY_SIZE(poll_fds));
-    g_assert(n_poll_fds <= ARRAY_SIZE(poll_fds));
-
-    for (i = 0; i < n_poll_fds; i++) {
-        GPollFD *p = &poll_fds[i];
-
-        if ((p->events & G_IO_IN)) {
-            FD_SET(p->fd, rfds);
-            *max_fd = MAX(*max_fd, p->fd);
-        }
-        if ((p->events & G_IO_OUT)) {
-            FD_SET(p->fd, wfds);
-            *max_fd = MAX(*max_fd, p->fd);
-        }
-        if ((p->events & G_IO_ERR)) {
-            FD_SET(p->fd, xfds);
-            *max_fd = MAX(*max_fd, p->fd);
+    RunState r;
+    if (qemu_debug_requested()) {
+        vm_stop(RUN_STATE_DEBUG);
+    }
+    if (qemu_shutdown_requested()) {
+        qemu_kill_report();
+        monitor_protocol_event(QEVENT_SHUTDOWN, NULL);
+        if (no_shutdown) {
+            vm_stop(RUN_STATE_SHUTDOWN);
+        } else {
+            return true;
         }
     }
-
-    cur_timeout = (tv->tv_sec * 1000) + ((tv->tv_usec + 500) / 1000);
-    if (timeout >= 0 && timeout < cur_timeout) {
-        tv->tv_sec = timeout / 1000;
-        tv->tv_usec = (timeout % 1000) * 1000;
-    }
-}
-
-static void glib_select_poll(fd_set *rfds, fd_set *wfds, fd_set *xfds,
-                             bool err)
-{
-    GMainContext *context = g_main_context_default();
-
-    if (!err) {
-        int i;
-
-        for (i = 0; i < n_poll_fds; i++) {
-            GPollFD *p = &poll_fds[i];
-
-            if ((p->events & G_IO_IN) && FD_ISSET(p->fd, rfds)) {
-                p->revents |= G_IO_IN;
-            }
-            if ((p->events & G_IO_OUT) && FD_ISSET(p->fd, wfds)) {
-                p->revents |= G_IO_OUT;
-            }
-            if ((p->events & G_IO_ERR) && FD_ISSET(p->fd, xfds)) {
-                p->revents |= G_IO_ERR;
-            }
+    if (qemu_reset_requested()) {
+        pause_all_vcpus();
+        cpu_synchronize_all_states();
+        qemu_system_reset(VMRESET_REPORT);
+        resume_all_vcpus();
+        if (runstate_check(RUN_STATE_INTERNAL_ERROR) ||
+            runstate_check(RUN_STATE_SHUTDOWN)) {
+            runstate_set(RUN_STATE_PAUSED);
         }
     }
-
-    if (g_main_context_check(context, max_priority, poll_fds, n_poll_fds)) {
-        g_main_context_dispatch(context);
-    }
-}
-
-int main_loop_wait(int nonblocking)
-{
-    fd_set rfds, wfds, xfds;
-    int ret, nfds;
-    struct timeval tv;
-    int timeout;
-
-    if (nonblocking)
-        timeout = 0;
-    else {
-        timeout = qemu_calculate_timeout();
-        qemu_bh_update_timeout(&timeout);
-    }
-
-    os_host_main_loop_wait(&timeout);
-
-    tv.tv_sec = timeout / 1000;
-    tv.tv_usec = (timeout % 1000) * 1000;
-
-    /* poll any events */
-    /* XXX: separate device handlers from system ones */
-    nfds = -1;
-    FD_ZERO(&rfds);
-    FD_ZERO(&wfds);
-    FD_ZERO(&xfds);
-
-    qemu_iohandler_fill(&nfds, &rfds, &wfds, &xfds);
-    slirp_select_fill(&nfds, &rfds, &wfds, &xfds);
-    glib_select_fill(&nfds, &rfds, &wfds, &xfds, &tv);
-
-    if (timeout > 0) {
-        qemu_mutex_unlock_iothread();
+    if (qemu_powerdown_requested()) {
+        monitor_protocol_event(QEVENT_POWERDOWN, NULL);
+        qemu_irq_raise(qemu_system_powerdown);
     }
-
-    ret = select(nfds + 1, &rfds, &wfds, &xfds, &tv);
-
-    if (timeout > 0) {
-        qemu_mutex_lock_iothread();
+    if (qemu_vmstop_requested(&r)) {
+        vm_stop(r);
     }
-
-    qemu_iohandler_poll(&rfds, &wfds, &xfds, ret);
-    slirp_select_poll(&rfds, &wfds, &xfds, (ret < 0));
-    glib_select_poll(&rfds, &wfds, &xfds, (ret < 0));
-
-    qemu_run_all_timers();
-
-    /* Check bottom-halves last in case any of the earlier events triggered
-       them.  */
-    qemu_bh_poll();
-
-    return ret;
+    return false;
 }
 
-qemu_irq qemu_system_powerdown;
-
 static void main_loop(void)
 {
     bool nonblocking;
-    int last_io __attribute__ ((unused)) = 0;
+    int last_io = 0;
 #ifdef CONFIG_PROFILER
     int64_t ti;
 #endif
-    RunState r;
-
-    qemu_main_loop_start();
-
-    for (;;) {
+    do {
         nonblocking = !kvm_enabled() && last_io > 0;
 #ifdef CONFIG_PROFILER
         ti = profile_getclock();
@@ -1569,38 +1479,7 @@ static void main_loop(void)
 #ifdef CONFIG_PROFILER
         dev_time += profile_getclock() - ti;
 #endif
-
-        if (qemu_debug_requested()) {
-            vm_stop(RUN_STATE_DEBUG);
-        }
-        if (qemu_shutdown_requested()) {
-            qemu_kill_report();
-            monitor_protocol_event(QEVENT_SHUTDOWN, NULL);
-            if (no_shutdown) {
-                vm_stop(RUN_STATE_SHUTDOWN);
-            } else
-                break;
-        }
-        if (qemu_reset_requested()) {
-            pause_all_vcpus();
-            cpu_synchronize_all_states();
-            qemu_system_reset(VMRESET_REPORT);
-            resume_all_vcpus();
-            if (runstate_check(RUN_STATE_INTERNAL_ERROR) ||
-                runstate_check(RUN_STATE_SHUTDOWN)) {
-                runstate_set(RUN_STATE_PAUSED);
-            }
-        }
-        if (qemu_powerdown_requested()) {
-            monitor_protocol_event(QEVENT_POWERDOWN, NULL);
-            qemu_irq_raise(qemu_system_powerdown);
-        }
-        if (qemu_vmstop_requested(&r)) {
-            vm_stop(r);
-        }
-    }
-    bdrv_close_all();
-    pause_all_vcpus();
+    } while (!main_loop_should_exit());
 }
 
 static void version(void)
@@ -2311,6 +2190,7 @@ int main(int argc, char **argv, char **envp)
     runstate_init();
 
     init_clocks();
+    rtc_clock = host_clock;
 
     qemu_cache_utils_init(envp);
 
@@ -3298,6 +3178,7 @@ int main(int argc, char **argv, char **envp)
 
     configure_accelerator();
 
+    qemu_init_cpu_loop();
     if (qemu_init_main_loop()) {
         fprintf(stderr, "qemu_init_main_loop failed\n");
         exit(1);
@@ -3564,8 +3445,10 @@ int main(int argc, char **argv, char **envp)
 
     os_setup_post();
 
+    resume_all_vcpus();
     main_loop();
-    quit_timers();
+    bdrv_close_all();
+    pause_all_vcpus();
     net_cleanup();
     res_free();
author	Anthony Liguori <aliguori@us.ibm.com>	2011-10-24 10:51:12 -0500
committer	Anthony Liguori <aliguori@us.ibm.com>	2011-10-24 10:51:12 -0500
commit	952e849c150b4f1b89f8728cba00f925c1d6e75b (patch)
tree	93e950b81e84d1d20a915d8b1da5c75d3911f553
parent	db418a0a7ef5887ea0f3d167584e6f500bb0c4c5 (diff)
parent	99435906cc92a45d6c2f7e18221d31349836f90e (diff)