4 files changed, 520 insertions, 56 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 0d8c2a4ab6..c34fd7cdc2 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -10,6 +10,7 @@ block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 block-obj-y += null.o mirror.o io.o
+block-obj-y += throttle-groups.o
 
 block-obj-y += nbd.o nbd-client.o sheepdog.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
diff --git a/block/io.c b/block/io.c
index e394d92626..bb4f78784e 100644
--- a/block/io.c
+++ b/block/io.c
@@ -23,9 +23,9 @@
  */
 
 #include "trace.h"
-#include "sysemu/qtest.h"
 #include "block/blockjob.h"
 #include "block/block_int.h"
+#include "block/throttle-groups.h"
 
 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
 
@@ -65,7 +65,7 @@ void bdrv_set_io_limits(BlockDriverState *bs,
 {
     int i;
 
-    throttle_config(&bs->throttle_state, cfg);
+    throttle_group_config(bs, cfg);
 
     for (i = 0; i < 2; i++) {
         qemu_co_enter_next(&bs->throttled_reqs[i]);
@@ -95,72 +95,33 @@ static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
 void bdrv_io_limits_disable(BlockDriverState *bs)
 {
     bs->io_limits_enabled = false;
-
     bdrv_start_throttled_reqs(bs);
-
-    throttle_destroy(&bs->throttle_state);
-}
-
-static void bdrv_throttle_read_timer_cb(void *opaque)
-{
-    BlockDriverState *bs = opaque;
-    qemu_co_enter_next(&bs->throttled_reqs[0]);
-}
-
-static void bdrv_throttle_write_timer_cb(void *opaque)
-{
-    BlockDriverState *bs = opaque;
-    qemu_co_enter_next(&bs->throttled_reqs[1]);
+    throttle_group_unregister_bs(bs);
 }
 
 /* should be called before bdrv_set_io_limits if a limit is set */
-void bdrv_io_limits_enable(BlockDriverState *bs)
+void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
 {
-    int clock_type = QEMU_CLOCK_REALTIME;
-
-    if (qtest_enabled()) {
-        /* For testing block IO throttling only */
-        clock_type = QEMU_CLOCK_VIRTUAL;
-    }
     assert(!bs->io_limits_enabled);
-    throttle_init(&bs->throttle_state,
-                  bdrv_get_aio_context(bs),
-                  clock_type,
-                  bdrv_throttle_read_timer_cb,
-                  bdrv_throttle_write_timer_cb,
-                  bs);
+    throttle_group_register_bs(bs, group);
     bs->io_limits_enabled = true;
 }
 
-/* This function makes an IO wait if needed
- *
- * @nb_sectors: the number of sectors of the IO
- * @is_write:   is the IO a write
- */
-static void bdrv_io_limits_intercept(BlockDriverState *bs,
-                                     unsigned int bytes,
-                                     bool is_write)
+void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
 {
-    /* does this io must wait */
-    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
-
-    /* if must wait or any request of this type throttled queue the IO */
-    if (must_wait ||
-        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
-        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+    /* this bs is not part of any group */
+    if (!bs->throttle_state) {
+        return;
     }
 
-    /* the IO will be executed, do the accounting */
-    throttle_account(&bs->throttle_state, is_write, bytes);
-
-
-    /* if the next request must wait -> do nothing */
-    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
+    /* this bs is already part of the group we want */
+    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
         return;
     }
 
-    /* else queue next request for execution */
-    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
+    /* need to change the group this bs belongs to */
+    bdrv_io_limits_disable(bs);
+    bdrv_io_limits_enable(bs, group);
 }
 
 void bdrv_setup_io_funcs(BlockDriver *bdrv)
@@ -967,7 +928,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
 
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, bytes, false);
+        throttle_group_co_io_limits_intercept(bs, bytes, false);
     }
 
     /* Align read if necessary by padding qiov */
@@ -1297,7 +1258,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
 
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, bytes, true);
+        throttle_group_co_io_limits_intercept(bs, bytes, true);
     }
 
     /*
diff --git a/block/qapi.c b/block/qapi.c
index 18d2b95f54..a738148bce 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -24,6 +24,7 @@
 
 #include "block/qapi.h"
 #include "block/block_int.h"
+#include "block/throttle-groups.h"
 #include "block/write-threshold.h"
 #include "qmp-commands.h"
 #include "qapi-visit.h"
@@ -65,7 +66,9 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp)
 
     if (bs->io_limits_enabled) {
         ThrottleConfig cfg;
-        throttle_get_config(&bs->throttle_state, &cfg);
+
+        throttle_group_get_config(bs, &cfg);
+
         info->bps     = cfg.buckets[THROTTLE_BPS_TOTAL].avg;
         info->bps_rd  = cfg.buckets[THROTTLE_BPS_READ].avg;
         info->bps_wr  = cfg.buckets[THROTTLE_BPS_WRITE].avg;
@@ -90,6 +93,9 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp)
 
         info->has_iops_size = cfg.op_size;
         info->iops_size = cfg.op_size;
+
+        info->has_group = true;
+        info->group = g_strdup(throttle_group_get_name(bs));
     }
 
     info->write_threshold = bdrv_write_threshold_get(bs);
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
new file mode 100644
index 0000000000..efc462fbc5
--- /dev/null
+++ b/block/throttle-groups.c
@@ -0,0 +1,496 @@
+/*
+ * QEMU block throttling group infrastructure
+ *
+ * Copyright (C) Nodalink, EURL. 2014
+ * Copyright (C) Igalia, S.L. 2015
+ *
+ * Authors:
+ *   Benoît Canet <benoit.canet@nodalink.com>
+ *   Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "block/throttle-groups.h"
+#include "qemu/queue.h"
+#include "qemu/thread.h"
+#include "sysemu/qtest.h"
+
+/* The ThrottleGroup structure (with its ThrottleState) is shared
+ * among different BlockDriverStates and it's independent from
+ * AioContext, so in order to use it from different threads it needs
+ * its own locking.
+ *
+ * This locking is however handled internally in this file, so it's
+ * mostly transparent to outside users (but see the documentation in
+ * throttle_group_lock()).
+ *
+ * The whole ThrottleGroup structure is private and invisible to
+ * outside users, who only use it through its ThrottleState.
+ *
+ * In addition to the ThrottleGroup structure, BlockDriverState has
+ * fields that need to be accessed by other members of the group and
+ * therefore also need to be protected by this lock. Once a BDS is
+ * registered in a group those fields can be accessed by other threads
+ * at any time.
+ *
+ * Again, all this is handled internally and is mostly transparent to
+ * the outside. The 'throttle_timers' field however has an additional
+ * constraint because it may be temporarily invalid (see for example
+ * bdrv_set_aio_context()). Therefore in this file a thread will
+ * access some other BDS's timers only after verifying that that BDS
+ * has throttled requests in the queue.
+ */
+typedef struct ThrottleGroup {
+    char *name; /* This is constant during the lifetime of the group */
+
+    QemuMutex lock; /* This lock protects the following four fields */
+    ThrottleState ts;
+    QLIST_HEAD(, BlockDriverState) head;
+    BlockDriverState *tokens[2];
+    bool any_timer_armed[2];
+
+    /* These two are protected by the global throttle_groups_lock */
+    unsigned refcount;
+    QTAILQ_ENTRY(ThrottleGroup) list;
+} ThrottleGroup;
+
+static QemuMutex throttle_groups_lock; /* protects throttle_groups */
+static QTAILQ_HEAD(, ThrottleGroup) throttle_groups =
+    QTAILQ_HEAD_INITIALIZER(throttle_groups);
+
+/* Increments the reference count of a ThrottleGroup given its name.
+ *
+ * If no ThrottleGroup is found with the given name a new one is
+ * created and added to the global list (throttle_groups_lock is held).
+ *
+ * @name: the name of the ThrottleGroup
+ * @ret:  the ThrottleGroup, either found or newly created
+ */
+static ThrottleGroup *throttle_group_incref(const char *name)
+{
+    ThrottleGroup *tg = NULL;
+    ThrottleGroup *iter;
+
+    qemu_mutex_lock(&throttle_groups_lock);
+
+    /* Look for an existing group with that name */
+    QTAILQ_FOREACH(iter, &throttle_groups, list) {
+        if (!strcmp(name, iter->name)) {
+            tg = iter;
+            break;
+        }
+    }
+
+    /* Create a new one if not found */
+    if (!tg) {
+        tg = g_new0(ThrottleGroup, 1);
+        tg->name = g_strdup(name);
+        qemu_mutex_init(&tg->lock);
+        throttle_init(&tg->ts);
+        QLIST_INIT(&tg->head);
+
+        QTAILQ_INSERT_TAIL(&throttle_groups, tg, list);
+    }
+
+    tg->refcount++;
+
+    qemu_mutex_unlock(&throttle_groups_lock);
+
+    return tg;
+}
+
+/* Decrease the reference count of a ThrottleGroup.
+ *
+ * When the reference count reaches zero the ThrottleGroup is removed
+ * from the global list and freed, together with its lock and name.
+ *
+ * @tg:  the ThrottleGroup to unref; must not be used after the last unref
+ */
+static void throttle_group_unref(ThrottleGroup *tg)
+{
+    qemu_mutex_lock(&throttle_groups_lock);
+    if (--tg->refcount == 0) {
+        QTAILQ_REMOVE(&throttle_groups, tg, list);
+        qemu_mutex_destroy(&tg->lock);
+        g_free(tg->name);
+        g_free(tg);
+    }
+    qemu_mutex_unlock(&throttle_groups_lock);
+}
+
+/* Get the name of a BlockDriverState's ThrottleGroup. The name (and
+ * the pointer) is guaranteed to remain constant during the lifetime
+ * of the group, which owns the string: the caller must not free it.
+ *
+ * @bs:   a BlockDriverState that is a member of a throttling group
+ * @ret:  the name of the group
+ */
+const char *throttle_group_get_name(BlockDriverState *bs)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    return tg->name;
+}
+
+/* Return the next BlockDriverState in the round-robin sequence,
+ * simulating a circular list: the last member is followed by the first.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs:  the current BlockDriverState
+ * @ret: the next BlockDriverState (bs itself if it is the only member)
+ */
+static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs)
+{
+    ThrottleState *ts = bs->throttle_state;
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+    BlockDriverState *next = QLIST_NEXT(bs, round_robin);
+
+    if (!next) {
+        return QLIST_FIRST(&tg->head);
+    }
+
+    return next;
+}
+
+/* Return the next BlockDriverState in the round-robin sequence with
+ * pending I/O requests, starting after the group's current token.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs:        the current BlockDriverState
+ * @is_write:  the type of operation (read/write)
+ * @ret:       the next BlockDriverState with pending requests, or bs
+ *             if there is none.
+ */
+static BlockDriverState *next_throttle_token(BlockDriverState *bs,
+                                             bool is_write)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    BlockDriverState *token, *start;
+
+    start = token = tg->tokens[is_write];
+
+    /* Get the next bs in round-robin order */
+    token = throttle_group_next_bs(token);
+    while (token != start && !token->pending_reqs[is_write]) {
+        token = throttle_group_next_bs(token);
+    }
+
+    /* If we went all the way round without finding queued I/O (not
+     * even on the starting token) then pick the current bs as the
+     * token, because chances are that its request gets queued.
+     */
+    if (token == start && !token->pending_reqs[is_write]) {
+        token = bs;
+    }
+
+    return token;
+}
+
+/* Check if the next I/O request for a BlockDriverState needs to be
+ * throttled or not. If no timer is armed in this group and the request
+ * must wait, a timer gets armed and bs becomes the current token.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs:         the current BlockDriverState
+ * @is_write:   the type of operation (read/write)
+ * @ret:        whether the I/O request needs to be throttled or not
+ */
+static bool throttle_group_schedule_timer(BlockDriverState *bs,
+                                          bool is_write)
+{
+    ThrottleState *ts = bs->throttle_state;
+    ThrottleTimers *tt = &bs->throttle_timers;
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+    bool must_wait;
+
+    /* Check if any of the timers in this group is already armed */
+    if (tg->any_timer_armed[is_write]) {
+        return true;
+    }
+
+    must_wait = throttle_schedule_timer(ts, tt, is_write);
+
+    /* If a timer just got armed, set bs as the current token */
+    if (must_wait) {
+        tg->tokens[is_write] = bs;
+        tg->any_timer_armed[is_write] = true;
+    }
+
+    return must_wait;
+}
+
+/* Look for the next pending I/O request and schedule it, either by
+ * queueing it for immediate execution or by arming a timer for it.
+ * This assumes that tg->lock is held.
+ *
+ * @bs:        the current BlockDriverState
+ * @is_write:  the type of operation (read/write)
+ */
+static void schedule_next_request(BlockDriverState *bs, bool is_write)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    bool must_wait;
+    BlockDriverState *token;
+
+    /* Check if there's any pending request to schedule next */
+    token = next_throttle_token(bs, is_write);
+    if (!token->pending_reqs[is_write]) {
+        return;
+    }
+
+    /* Set a timer for the request if it needs to be throttled */
+    must_wait = throttle_group_schedule_timer(token, is_write);
+
+    /* If it doesn't have to wait, queue it for immediate execution */
+    if (!must_wait) {
+        /* Give preference to requests from the current bs */
+        if (qemu_in_coroutine() &&
+            qemu_co_queue_next(&bs->throttled_reqs[is_write])) {
+            token = bs;
+        } else {
+            ThrottleTimers *tt = &token->throttle_timers;
+            int64_t now = qemu_clock_get_ns(tt->clock_type);
+            timer_mod(tt->timers[is_write], now + 1);
+            tg->any_timer_armed[is_write] = true;
+        }
+        tg->tokens[is_write] = token;
+    }
+}
+
+/* Check if an I/O request needs to be throttled, wait and set a timer
+ * if necessary, and schedule the next request using a round robin
+ * algorithm. tg->lock is taken here and released while waiting.
+ *
+ * @bs:        the current BlockDriverState
+ * @bytes:     the number of bytes for this I/O
+ * @is_write:  the type of operation (read/write)
+ */
+void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs,
+                                                        unsigned int bytes,
+                                                        bool is_write)
+{
+    bool must_wait;
+    BlockDriverState *token;
+
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    qemu_mutex_lock(&tg->lock);
+
+    /* First we check if this I/O has to be throttled. */
+    token = next_throttle_token(bs, is_write);
+    must_wait = throttle_group_schedule_timer(token, is_write);
+
+    /* Wait if there's a timer set or queued requests of this type */
+    if (must_wait || bs->pending_reqs[is_write]) {
+        bs->pending_reqs[is_write]++;
+        qemu_mutex_unlock(&tg->lock);
+        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+        qemu_mutex_lock(&tg->lock);
+        bs->pending_reqs[is_write]--;
+    }
+
+    /* The I/O will be executed, so do the accounting */
+    throttle_account(bs->throttle_state, is_write, bytes);
+
+    /* Schedule the next request */
+    schedule_next_request(bs, is_write);
+
+    qemu_mutex_unlock(&tg->lock);
+}
+
+/* Update the throttle configuration for a particular group. Similar
+ * to throttle_config(), but guarantees atomicity within the
+ * throttling group by doing the update with tg->lock held.
+ *
+ * @bs:  a BlockDriverState that is a member of the group
+ * @cfg: the configuration to set
+ */
+void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg)
+{
+    ThrottleTimers *tt = &bs->throttle_timers;
+    ThrottleState *ts = bs->throttle_state;
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+    qemu_mutex_lock(&tg->lock);
+    throttle_config(ts, tt, cfg);
+    /* throttle_config() cancels the timers, so clear the armed flags */
+    tg->any_timer_armed[0] = tg->any_timer_armed[1] = false;
+    qemu_mutex_unlock(&tg->lock);
+}
+
+/* Get the throttle configuration from a particular group. Similar to
+ * throttle_get_config(), but guarantees atomicity within the
+ * throttling group by reading it with tg->lock held.
+ *
+ * @bs:  a BlockDriverState that is a member of the group
+ * @cfg: the configuration will be written here
+ */
+void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg)
+{
+    ThrottleState *ts = bs->throttle_state;
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+    qemu_mutex_lock(&tg->lock);
+    throttle_get_config(ts, cfg);
+    qemu_mutex_unlock(&tg->lock);
+}
+
+/* ThrottleTimers callback. This wakes up a request that was waiting
+ * because it had been throttled.
+ *
+ * @bs:        the BlockDriverState whose request had been throttled
+ * @is_write:  the type of operation (read/write)
+ */
+static void timer_cb(BlockDriverState *bs, bool is_write)
+{
+    ThrottleState *ts = bs->throttle_state;
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+    bool empty_queue;
+
+    /* The timer has just been fired, so we can update the flag */
+    qemu_mutex_lock(&tg->lock);
+    tg->any_timer_armed[is_write] = false;
+    qemu_mutex_unlock(&tg->lock);
+
+    /* Run the request that was waiting for this timer (lock not held) */
+    empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]);
+
+    /* If the request queue was empty then we have to take care of
+     * scheduling the next one */
+    if (empty_queue) {
+        qemu_mutex_lock(&tg->lock);
+        schedule_next_request(bs, is_write);
+        qemu_mutex_unlock(&tg->lock);
+    }
+}
+
+static void read_timer_cb(void *opaque)
+{
+    timer_cb(opaque, false); /* is_write = false: read queue */
+}
+
+static void write_timer_cb(void *opaque)
+{
+    timer_cb(opaque, true); /* is_write = true: write queue */
+}
+
+/* Register a BlockDriverState in the throttling group, also
+ * initializing its timers and making its throttle_state pointer
+ * point to the group's ThrottleState. If a throttling group with
+ * that name does not exist yet, it will be created.
+ *
+ * @bs:        the BlockDriverState to insert
+ * @groupname: the name of the group (must not be NULL)
+ */
+void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
+{
+    int i;
+    ThrottleGroup *tg = throttle_group_incref(groupname);
+    int clock_type = QEMU_CLOCK_REALTIME;
+
+    if (qtest_enabled()) {
+        /* For testing block IO throttling only */
+        clock_type = QEMU_CLOCK_VIRTUAL;
+    }
+
+    bs->throttle_state = &tg->ts;
+
+    qemu_mutex_lock(&tg->lock);
+    /* If the ThrottleGroup is new set this BlockDriverState as the token */
+    for (i = 0; i < 2; i++) {
+        if (!tg->tokens[i]) {
+            tg->tokens[i] = bs;
+        }
+    }
+
+    QLIST_INSERT_HEAD(&tg->head, bs, round_robin);
+
+    throttle_timers_init(&bs->throttle_timers,
+                         bdrv_get_aio_context(bs),
+                         clock_type,
+                         read_timer_cb,
+                         write_timer_cb,
+                         bs);
+
+    qemu_mutex_unlock(&tg->lock);
+}
+
+/* Unregister a BlockDriverState from its group, removing it from the
+ * list, destroying the timers and setting the throttle_state pointer
+ * to NULL. A token held by bs is handed over to the next member.
+ *
+ * The group will be destroyed if it's empty after this operation.
+ *
+ * @bs: the BlockDriverState to remove (must be a member of a group)
+ */
+void throttle_group_unregister_bs(BlockDriverState *bs)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    int i;
+
+    qemu_mutex_lock(&tg->lock);
+    for (i = 0; i < 2; i++) {
+        if (tg->tokens[i] == bs) {
+            BlockDriverState *token = throttle_group_next_bs(bs);
+            /* Take care of the case where this is the last bs in the group */
+            if (token == bs) {
+                token = NULL;
+            }
+            tg->tokens[i] = token;
+        }
+    }
+
+    /* Remove the current bs from the list */
+    QLIST_REMOVE(bs, round_robin);
+    throttle_timers_destroy(&bs->throttle_timers);
+    qemu_mutex_unlock(&tg->lock);
+
+    throttle_group_unref(tg);
+    bs->throttle_state = NULL;
+}
+
+/* Acquire the lock of this throttling group.
+ *
+ * You won't normally need to use this. None of the functions from the
+ * ThrottleGroup API require you to acquire the lock since all of them
+ * deal with it internally.
+ *
+ * This should only be used in exceptional cases when you want to
+ * access the protected fields of a BlockDriverState directly
+ * (e.g. bdrv_swap()). Release it with throttle_group_unlock().
+ *
+ * @bs: a BlockDriverState that is a member of the group
+ */
+void throttle_group_lock(BlockDriverState *bs)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    qemu_mutex_lock(&tg->lock);
+}
+
+/* Release the lock of this throttling group, previously acquired with
+ * throttle_group_lock() (see the comments there).
+ * @bs: a BlockDriverState that is a member of the group
+ */
+void throttle_group_unlock(BlockDriverState *bs)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    qemu_mutex_unlock(&tg->lock);
+}
+
+static void throttle_groups_init(void)
+{
+    qemu_mutex_init(&throttle_groups_lock); /* lock for throttle_groups */
+}
+
+block_init(throttle_groups_init);