1/*
2 * QEMU System Emulator block driver
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24#include "config-host.h"
25#include "qemu-common.h"
26#include "monitor/monitor.h"
27#include "block/block_int.h"
28#include "qemu/iov.h"
29#include "qemu/module.h"
30//#include "qapi/qmp/types.h"
31#include "qapi/qmp/qjson.h"
32
33#ifdef CONFIG_BSD
34#include <sys/types.h>
35#include <sys/stat.h>
36#include <sys/ioctl.h>
37#include <sys/queue.h>
38#ifndef __DragonFly__
39#include <sys/disk.h>
40#endif
41#endif
42
43#ifdef _WIN32
44#include <windows.h>
45#endif
46
47static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
48        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
49        BlockDriverCompletionFunc *cb, void *opaque);
50static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
51        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
52        BlockDriverCompletionFunc *cb, void *opaque);
53static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
54        BlockDriverCompletionFunc *cb, void *opaque);
55static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
56        BlockDriverCompletionFunc *cb, void *opaque);
57static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
58                        uint8_t *buf, int nb_sectors);
59static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
60                         const uint8_t *buf, int nb_sectors);
61
62static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
63    QTAILQ_HEAD_INITIALIZER(bdrv_states);
64
65static QLIST_HEAD(, BlockDriver) bdrv_drivers =
66    QLIST_HEAD_INITIALIZER(bdrv_drivers);
67
68/* The device to use for VM snapshots */
69static BlockDriverState *bs_snapshots;
70
71/* If non-zero, use only whitelisted block drivers */
72static int use_bdrv_whitelist;
73
/*
 * Tell whether 'path' is absolute.
 *
 * Everything up to and including the first ':' is skipped as a prefix, so
 * "proto:/x" counts as absolute.  What remains must start with a directory
 * separator.  On Windows a leading '/' or '\\' is accepted right away and
 * both separators are recognised after the prefix.
 *
 * Returns 1 if the path is absolute, 0 otherwise.
 */
int _path_is_absolute(const char *path)
{
    const char *rest = path;
    const char *colon;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (path[0] == '/' || path[0] == '\\') {
        return 1;
    }
#endif

    colon = strchr(path, ':');
    if (colon != NULL) {
        rest = colon + 1;
    }

#ifdef _WIN32
    return (rest[0] == '/' || rest[0] == '\\');
#else
    return (rest[0] == '/');
#endif
}
93
/*
 * Store in 'dest' the location of 'filename', using 'dest_size' as the
 * bound for every write.
 *
 * An absolute filename is copied as is.  Otherwise it is appended to the
 * leading part of 'base_path': everything up to and including the last
 * path separator or, if that lies further right, the first ':' (so URLs
 * are supported).  Nothing is written when dest_size <= 0.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep, *prefix_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }
    if (_path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position just past an optional "protocol:" prefix */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* position just past the last directory separator */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');
        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* keep whichever of the two reaches further into base_path */
    prefix_end = (after_sep > after_colon) ? after_sep : after_colon;
    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
137
138void bdrv_register(BlockDriver *bdrv)
139{
140    if (!bdrv->bdrv_aio_readv) {
141        /* add AIO emulation layer */
142        bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
143        bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
144    } else if (!bdrv->bdrv_read) {
145        /* add synchronous IO emulation layer */
146        bdrv->bdrv_read = bdrv_read_em;
147        bdrv->bdrv_write = bdrv_write_em;
148    }
149
150    if (!bdrv->bdrv_aio_flush)
151        bdrv->bdrv_aio_flush = bdrv_aio_flush_em;
152
153    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
154}
155
156/* create a new block device (by default it is empty) */
157BlockDriverState *bdrv_new(const char *device_name)
158{
159    BlockDriverState *bs;
160
161    bs = g_malloc0(sizeof(BlockDriverState));
162    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
163    if (device_name[0] != '\0') {
164        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
165    }
166    return bs;
167}
168
169BlockDriver *bdrv_find_format(const char *format_name)
170{
171    BlockDriver *drv1;
172    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
173        if (!strcmp(drv1->format_name, format_name)) {
174            return drv1;
175        }
176    }
177    return NULL;
178}
179
180static int bdrv_is_whitelisted(BlockDriver *drv)
181{
182    static const char *whitelist[] = {
183        CONFIG_BDRV_WHITELIST
184    };
185    const char **p;
186
187    if (!whitelist[0])
188        return 1;               /* no whitelist, anything goes */
189
190    for (p = whitelist; *p; p++) {
191        if (!strcmp(drv->format_name, *p)) {
192            return 1;
193        }
194    }
195    return 0;
196}
197
198BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
199{
200    BlockDriver *drv = bdrv_find_format(format_name);
201    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
202}
203
204int bdrv_create(BlockDriver *drv, const char* filename,
205    QEMUOptionParameter *options)
206{
207    if (!drv->bdrv_create)
208        return -ENOTSUP;
209
210    return drv->bdrv_create(filename, options);
211}
212
213int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
214{
215    BlockDriver *drv;
216
217    drv = bdrv_find_protocol(filename);
218    if (drv == NULL) {
219        drv = bdrv_find_format("file");
220    }
221
222    return bdrv_create(drv, filename, options);
223}
224
#ifdef _WIN32
/*
 * Create a unique temporary file and store its name in 'filename', which
 * holds 'size' bytes.
 *
 * GetTempFileName() writes up to MAX_PATH characters regardless of the
 * caller's buffer, so the name is produced in a local MAX_PATH buffer and
 * then copied with the caller's bound.  If either Windows call fails,
 * 'filename' is left as an empty string.
 */
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];
    char temp_name[MAX_PATH];
    DWORD dir_len;

    if (size <= 0) {
        return;
    }
    filename[0] = '\0';

    dir_len = GetTempPath(MAX_PATH, temp_dir);
    if (dir_len == 0 || dir_len > MAX_PATH) {
        /* failure, or temp_dir was too small to receive the path */
        return;
    }
    if (GetTempFileName(temp_dir, "qem", 0, temp_name) == 0) {
        return;
    }
    pstrcpy(filename, size, temp_name);
}
#else
/*
 * Create a unique temporary file under $TMPDIR (or /tmp when TMPDIR is
 * unset) and store its name in 'filename', which holds 'size' bytes.
 *
 * The file is created by mkstemp() and closed again at once; only the
 * name is handed back.  If mkstemp() fails there is no descriptor to
 * close and 'filename' is left as mkstemp() left it.
 */
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    if (fd >= 0) {
        close(fd);
    }
}
#endif
247
248#ifdef _WIN32
/*
 * Return 1 if 'filename' begins with an ASCII letter followed by ':'
 * (e.g. "c:"), 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_letter = (letter >= 'a' && letter <= 'z') ||
                          (letter >= 'A' && letter <= 'Z');

    /* filename[1] is only inspected once the first character is a letter */
    return is_letter && filename[1] == ':';
}
255
/*
 * Return 1 if 'filename' is exactly a drive specification such as "d:",
 * or starts with "\\.\" or "//./"; return 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    /* a drive letter and colon with nothing after them */
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    return strstart(filename, "\\\\.\\", NULL) ||
           strstart(filename, "//./", NULL);
}
266#endif
267
268/*
269 * Detect host devices. By convention, /dev/cdrom[N] is always
270 * recognized as a host CDROM.
271 */
272static BlockDriver *find_hdev_driver(const char *filename)
273{
274    int score_max = 0, score;
275    BlockDriver *drv = NULL, *d;
276
277    QLIST_FOREACH(d, &bdrv_drivers, list) {
278        if (d->bdrv_probe_device) {
279            score = d->bdrv_probe_device(filename);
280            if (score > score_max) {
281                score_max = score;
282                drv = d;
283            }
284        }
285    }
286
287    return drv;
288}
289
290BlockDriver *bdrv_find_protocol(const char *filename)
291{
292    BlockDriver *drv1;
293    char protocol[128];
294    int len;
295    const char *p;
296
297    /* TODO Drivers without bdrv_file_open must be specified explicitly */
298
299    /*
300     * XXX(hch): we really should not let host device detection
301     * override an explicit protocol specification, but moving this
302     * later breaks access to device names with colons in them.
303     * Thanks to the brain-dead persistent naming schemes on udev-
304     * based Linux systems those actually are quite common.
305     */
306    drv1 = find_hdev_driver(filename);
307    if (drv1) {
308        return drv1;
309    }
310
311#ifdef _WIN32
312     if (is_windows_drive(filename) ||
313         is_windows_drive_prefix(filename))
314         return bdrv_find_format("file");
315#endif
316
317    p = strchr(filename, ':');
318    if (!p) {
319        return bdrv_find_format("file");
320    }
321    len = p - filename;
322    if (len > sizeof(protocol) - 1)
323        len = sizeof(protocol) - 1;
324    memcpy(protocol, filename, len);
325    protocol[len] = '\0';
326    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
327        if (drv1->protocol_name &&
328            !strcmp(drv1->protocol_name, protocol)) {
329            return drv1;
330        }
331    }
332    return NULL;
333}
334
335static int find_image_format(const char *filename, BlockDriver **pdrv)
336{
337    int ret, score, score_max;
338    BlockDriver *drv1, *drv;
339    uint8_t buf[2048];
340    BlockDriverState *bs;
341
342    ret = bdrv_file_open(&bs, filename, 0);
343    if (ret < 0) {
344        *pdrv = NULL;
345        return ret;
346    }
347
348    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
349    if (bs->sg || !bdrv_is_inserted(bs)) {
350        bdrv_delete(bs);
351        drv = bdrv_find_format("raw");
352        if (!drv) {
353            ret = -ENOENT;
354        }
355        *pdrv = drv;
356        return ret;
357    }
358
359    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
360    bdrv_delete(bs);
361    if (ret < 0) {
362        *pdrv = NULL;
363        return ret;
364    }
365
366    score_max = 0;
367    drv = NULL;
368    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
369        if (drv1->bdrv_probe) {
370            score = drv1->bdrv_probe(buf, ret, filename);
371            if (score > score_max) {
372                score_max = score;
373                drv = drv1;
374            }
375        }
376    }
377    if (!drv) {
378        ret = -ENOENT;
379    }
380    *pdrv = drv;
381    return ret;
382}
383
384/**
385 * Set the current 'total_sectors' value
386 */
387static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
388{
389    BlockDriver *drv = bs->drv;
390
391    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
392    if (bs->sg)
393        return 0;
394
395    /* query actual device if possible, otherwise just trust the hint */
396    if (drv->bdrv_getlength) {
397        int64_t length = drv->bdrv_getlength(bs);
398        if (length < 0) {
399            return length;
400        }
401        hint = length >> BDRV_SECTOR_BITS;
402    }
403
404    bs->total_sectors = hint;
405    return 0;
406}
407
408/*
409 * Common part for opening disk images and files
410 */
411static int bdrv_open_common(BlockDriverState *bs, const char *filename,
412    int flags, BlockDriver *drv)
413{
414    int ret, open_flags;
415
416    assert(drv != NULL);
417
418    bs->file = NULL;
419    bs->total_sectors = 0;
420    bs->encrypted = 0;
421    bs->valid_key = 0;
422    bs->open_flags = flags;
423    /* buffer_alignment defaulted to 512, drivers can change this value */
424    bs->buffer_alignment = 512;
425
426    pstrcpy(bs->filename, sizeof(bs->filename), filename);
427
428    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
429        return -ENOTSUP;
430    }
431
432    bs->drv = drv;
433    bs->opaque = g_malloc0(drv->instance_size);
434
435    /*
436     * Yes, BDRV_O_NOCACHE aka O_DIRECT means we have to present a
437     * write cache to the guest.  We do need the fdatasync to flush
438     * out transactions for block allocations, and we maybe have a
439     * volatile write cache in our backing device to deal with.
440     */
441    if (flags & (BDRV_O_CACHE_WB|BDRV_O_NOCACHE))
442        bs->enable_write_cache = 1;
443
444    /*
445     * Clear flags that are internal to the block layer before opening the
446     * image.
447     */
448    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
449
450    /*
451     * Snapshots should be writeable.
452     */
453    if (bs->is_temporary) {
454        open_flags |= BDRV_O_RDWR;
455    }
456
457    /* Open the image, either directly or using a protocol */
458    if (drv->bdrv_file_open) {
459        ret = drv->bdrv_file_open(bs, filename, open_flags);
460    } else {
461        ret = bdrv_file_open(&bs->file, filename, open_flags);
462        if (ret >= 0) {
463            ret = drv->bdrv_open(bs, open_flags);
464        }
465    }
466
467    if (ret < 0) {
468        goto free_and_fail;
469    }
470
471    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
472
473    ret = refresh_total_sectors(bs, bs->total_sectors);
474    if (ret < 0) {
475        goto free_and_fail;
476    }
477
478#ifndef _WIN32
479    if (bs->is_temporary) {
480        unlink(filename);
481    }
482#endif
483    return 0;
484
485free_and_fail:
486    if (bs->file) {
487        bdrv_delete(bs->file);
488        bs->file = NULL;
489    }
490    g_free(bs->opaque);
491    bs->opaque = NULL;
492    bs->drv = NULL;
493    return ret;
494}
495
496/*
497 * Opens a file using a protocol (file, host_device, nbd, ...)
498 */
499int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
500{
501    BlockDriverState *bs;
502    BlockDriver *drv;
503    int ret;
504
505    drv = bdrv_find_protocol(filename);
506    if (!drv) {
507        return -ENOENT;
508    }
509
510    bs = bdrv_new("");
511    ret = bdrv_open_common(bs, filename, flags, drv);
512    if (ret < 0) {
513        bdrv_delete(bs);
514        return ret;
515    }
516    bs->growable = 1;
517    *pbs = bs;
518    return 0;
519}
520
521/*
522 * Opens a disk image (raw, qcow2, vmdk, ...)
523 */
524int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
525              BlockDriver *drv)
526{
527    int ret;
528    int probed = 0;
529
530    if (flags & BDRV_O_SNAPSHOT) {
531        BlockDriverState *bs1;
532        int64_t total_size;
533        int is_protocol = 0;
534        BlockDriver *bdrv_qcow2;
535        QEMUOptionParameter *options;
536        char tmp_filename[PATH_MAX];
537        char backing_filename[PATH_MAX];
538
539        /* if snapshot, we create a temporary backing file and open it
540           instead of opening 'filename' directly */
541
542        /* if there is a backing file, use it */
543        bs1 = bdrv_new("");
544        ret = bdrv_open(bs1, filename, 0, drv);
545        if (ret < 0) {
546            bdrv_delete(bs1);
547            return ret;
548        }
549        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
550
551        if (bs1->drv && bs1->drv->protocol_name)
552            is_protocol = 1;
553
554        bdrv_delete(bs1);
555
556        get_tmp_filename(tmp_filename, sizeof(tmp_filename));
557
558        /* Real path is meaningless for protocols */
559        if (is_protocol)
560            snprintf(backing_filename, sizeof(backing_filename),
561                     "%s", filename);
562        else if (!realpath(filename, backing_filename))
563            return -errno;
564
565        bdrv_qcow2 = bdrv_find_format("qcow2");
566        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
567
568        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
569        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
570        if (drv) {
571            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
572                drv->format_name);
573        }
574
575        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
576        free_option_parameters(options);
577        if (ret < 0) {
578            return ret;
579        }
580
581        filename = tmp_filename;
582        drv = bdrv_qcow2;
583        bs->is_temporary = 1;
584    }
585
586    /* Find the right image format driver */
587    if (!drv) {
588        ret = find_image_format(filename, &drv);
589        probed = 1;
590    }
591
592    if (!drv) {
593        goto unlink_and_fail;
594    }
595
596    /* Open the image */
597    ret = bdrv_open_common(bs, filename, flags, drv);
598    if (ret < 0) {
599        goto unlink_and_fail;
600    }
601
602    bs->probed = probed;
603
604    /* If there is a backing file, use it */
605    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
606        char backing_filename[PATH_MAX];
607        int back_flags;
608        BlockDriver *back_drv = NULL;
609
610        bs->backing_hd = bdrv_new("");
611        path_combine(backing_filename, sizeof(backing_filename),
612                     filename, bs->backing_file);
613        if (bs->backing_format[0] != '\0')
614            back_drv = bdrv_find_format(bs->backing_format);
615
616        /* backing files always opened read-only */
617        back_flags =
618            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
619
620        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
621        if (ret < 0) {
622            bdrv_close(bs);
623            return ret;
624        }
625        if (bs->is_temporary) {
626            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
627        } else {
628            /* base image inherits from "parent" */
629            bs->backing_hd->keep_read_only = bs->keep_read_only;
630        }
631    }
632
633    if (!bdrv_key_required(bs)) {
634        /* call the change callback */
635        bs->media_changed = 1;
636        if (bs->change_cb)
637            bs->change_cb(bs->change_opaque);
638    }
639
640    return 0;
641
642unlink_and_fail:
643    if (bs->is_temporary) {
644        unlink(filename);
645    }
646    return ret;
647}
648
649void bdrv_close(BlockDriverState *bs)
650{
651    if (bs->drv) {
652        if (bs == bs_snapshots) {
653            bs_snapshots = NULL;
654        }
655        if (bs->backing_hd) {
656            bdrv_delete(bs->backing_hd);
657            bs->backing_hd = NULL;
658        }
659        bs->drv->bdrv_close(bs);
660        g_free(bs->opaque);
661#ifdef _WIN32
662        if (bs->is_temporary) {
663            unlink(bs->filename);
664        }
665#endif
666        bs->opaque = NULL;
667        bs->drv = NULL;
668
669        if (bs->file != NULL) {
670            bdrv_close(bs->file);
671        }
672
673        /* call the change callback */
674        bs->media_changed = 1;
675        if (bs->change_cb)
676            bs->change_cb(bs->change_opaque);
677    }
678}
679
680void bdrv_close_all(void)
681{
682    BlockDriverState *bs;
683
684    QTAILQ_FOREACH(bs, &bdrv_states, list) {
685        bdrv_close(bs);
686    }
687}
688
689void bdrv_delete(BlockDriverState *bs)
690{
691    assert(!bs->peer);
692
693    /* remove from list, if necessary */
694    if (bs->device_name[0] != '\0') {
695        QTAILQ_REMOVE(&bdrv_states, bs, list);
696    }
697
698    bdrv_close(bs);
699    if (bs->file != NULL) {
700        bdrv_delete(bs->file);
701    }
702
703    assert(bs != bs_snapshots);
704    g_free(bs);
705}
706
707int bdrv_attach(BlockDriverState *bs, DeviceState *qdev)
708{
709    if (bs->peer) {
710        return -EBUSY;
711    }
712    bs->peer = qdev;
713    return 0;
714}
715
716void bdrv_detach(BlockDriverState *bs, DeviceState *qdev)
717{
718    assert(bs->peer == qdev);
719    bs->peer = NULL;
720}
721
722DeviceState *bdrv_get_attached(BlockDriverState *bs)
723{
724    return bs->peer;
725}
726
727/*
728 * Run consistency checks on an image
729 *
730 * Returns 0 if the check could be completed (it doesn't mean that the image is
731 * free of errors) or -errno when an internal error occured. The results of the
732 * check are stored in res.
733 */
734int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
735{
736    if (bs->drv->bdrv_check == NULL) {
737        return -ENOTSUP;
738    }
739
740    memset(res, 0, sizeof(*res));
741    return bs->drv->bdrv_check(bs, res);
742}
743
744#define COMMIT_BUF_SECTORS 2048
745
746/* commit COW file into the raw image */
747int bdrv_commit(BlockDriverState *bs)
748{
749    BlockDriver *drv = bs->drv;
750    int64_t sector, total_sectors;
751    int n, ro, open_flags;
752    int ret = 0, rw_ret = 0;
753    uint8_t *buf;
754    char filename[1024];
755    BlockDriverState *bs_rw, *bs_ro;
756
757    if (!drv)
758        return -ENOMEDIUM;
759
760    if (!bs->backing_hd) {
761        return -ENOTSUP;
762    }
763
764    if (bs->backing_hd->keep_read_only) {
765        return -EACCES;
766    }
767
768    ro = bs->backing_hd->read_only;
769    strncpy(filename, bs->backing_hd->filename, sizeof(filename));
770    open_flags =  bs->backing_hd->open_flags;
771
772    if (ro) {
773        /* re-open as RW */
774        bdrv_delete(bs->backing_hd);
775        bs->backing_hd = NULL;
776        bs_rw = bdrv_new("");
777        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR, drv);
778        if (rw_ret < 0) {
779            bdrv_delete(bs_rw);
780            /* try to re-open read-only */
781            bs_ro = bdrv_new("");
782            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR, drv);
783            if (ret < 0) {
784                bdrv_delete(bs_ro);
785                /* drive not functional anymore */
786                bs->drv = NULL;
787                return ret;
788            }
789            bs->backing_hd = bs_ro;
790            return rw_ret;
791        }
792        bs->backing_hd = bs_rw;
793    }
794
795    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
796    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
797
798    for (sector = 0; sector < total_sectors; sector += n) {
799        if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
800
801            if (bdrv_read(bs, sector, buf, n) != 0) {
802                ret = -EIO;
803                goto ro_cleanup;
804            }
805
806            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
807                ret = -EIO;
808                goto ro_cleanup;
809            }
810        }
811    }
812
813    if (drv->bdrv_make_empty) {
814        ret = drv->bdrv_make_empty(bs);
815        bdrv_flush(bs);
816    }
817
818    /*
819     * Make sure all data we wrote to the backing device is actually
820     * stable on disk.
821     */
822    if (bs->backing_hd)
823        bdrv_flush(bs->backing_hd);
824
825ro_cleanup:
826    g_free(buf);
827
828    if (ro) {
829        /* re-open as RO */
830        bdrv_delete(bs->backing_hd);
831        bs->backing_hd = NULL;
832        bs_ro = bdrv_new("");
833        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR, drv);
834        if (ret < 0) {
835            bdrv_delete(bs_ro);
836            /* drive not functional anymore */
837            bs->drv = NULL;
838            return ret;
839        }
840        bs->backing_hd = bs_ro;
841        bs->backing_hd->keep_read_only = 0;
842    }
843
844    return ret;
845}
846
847void bdrv_commit_all(void)
848{
849    BlockDriverState *bs;
850
851    QTAILQ_FOREACH(bs, &bdrv_states, list) {
852        bdrv_commit(bs);
853    }
854}
855
856/*
857 * Return values:
858 * 0        - success
859 * -EINVAL  - backing format specified, but no file
860 * -ENOSPC  - can't update the backing file because no space is left in the
861 *            image file header
862 * -ENOTSUP - format driver doesn't support changing the backing file
863 */
864int bdrv_change_backing_file(BlockDriverState *bs,
865    const char *backing_file, const char *backing_fmt)
866{
867    BlockDriver *drv = bs->drv;
868
869    if (drv->bdrv_change_backing_file != NULL) {
870        return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
871    } else {
872        return -ENOTSUP;
873    }
874}
875
876static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
877                                   size_t size)
878{
879    int64_t len;
880
881    if (!bdrv_is_inserted(bs))
882        return -ENOMEDIUM;
883
884    if (bs->growable)
885        return 0;
886
887    len = bdrv_getlength(bs);
888
889    if (offset < 0)
890        return -EIO;
891
892    if ((offset > len) || (len - offset < size))
893        return -EIO;
894
895    return 0;
896}
897
898static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
899                              int nb_sectors)
900{
901    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
902                                   nb_sectors * BDRV_SECTOR_SIZE);
903}
904
905/* return < 0 if error. See bdrv_write() for the return codes */
906int bdrv_read(BlockDriverState *bs, int64_t sector_num,
907              uint8_t *buf, int nb_sectors)
908{
909    BlockDriver *drv = bs->drv;
910
911    if (!drv)
912        return -ENOMEDIUM;
913    if (bdrv_check_request(bs, sector_num, nb_sectors))
914        return -EIO;
915
916    return drv->bdrv_read(bs, sector_num, buf, nb_sectors);
917}
918
919static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
920                             int nb_sectors, int dirty)
921{
922    int64_t start, end;
923    unsigned long val, idx, bit;
924
925    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
926    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
927
928    for (; start <= end; start++) {
929        idx = start / (sizeof(unsigned long) * 8);
930        bit = start % (sizeof(unsigned long) * 8);
931        val = bs->dirty_bitmap[idx];
932        if (dirty) {
933            if (!(val & (1 << bit))) {
934                bs->dirty_count++;
935                val |= 1 << bit;
936            }
937        } else {
938            if (val & (1 << bit)) {
939                bs->dirty_count--;
940                val &= ~(1 << bit);
941            }
942        }
943        bs->dirty_bitmap[idx] = val;
944    }
945}
946
947/* Return < 0 if error. Important errors are:
948  -EIO         generic I/O error (may happen for all errors)
949  -ENOMEDIUM   No media inserted.
950  -EINVAL      Invalid sector number or nb_sectors
951  -EACCES      Trying to write a read-only device
952*/
953int bdrv_write(BlockDriverState *bs, int64_t sector_num,
954               const uint8_t *buf, int nb_sectors)
955{
956    BlockDriver *drv = bs->drv;
957    if (!bs->drv)
958        return -ENOMEDIUM;
959    if (bs->read_only)
960        return -EACCES;
961    if (bdrv_check_request(bs, sector_num, nb_sectors))
962        return -EIO;
963
964    if (bs->dirty_bitmap) {
965        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
966    }
967
968    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
969        bs->wr_highest_sector = sector_num + nb_sectors - 1;
970    }
971
972    return drv->bdrv_write(bs, sector_num, buf, nb_sectors);
973}
974
975int bdrv_pread(BlockDriverState *bs, int64_t offset,
976               void *buf, int count1)
977{
978    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
979    int len, nb_sectors, count;
980    int64_t sector_num;
981    int ret;
982
983    count = count1;
984    /* first read to align to sector start */
985    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
986    if (len > count)
987        len = count;
988    sector_num = offset >> BDRV_SECTOR_BITS;
989    if (len > 0) {
990        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
991            return ret;
992        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
993        count -= len;
994        if (count == 0)
995            return count1;
996        sector_num++;
997        buf += len;
998    }
999
1000    /* read the sectors "in place" */
1001    nb_sectors = count >> BDRV_SECTOR_BITS;
1002    if (nb_sectors > 0) {
1003        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1004            return ret;
1005        sector_num += nb_sectors;
1006        len = nb_sectors << BDRV_SECTOR_BITS;
1007        buf += len;
1008        count -= len;
1009    }
1010
1011    /* add data from the last sector */
1012    if (count > 0) {
1013        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1014            return ret;
1015        memcpy(buf, tmp_buf, count);
1016    }
1017    return count1;
1018}
1019
1020int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1021                const void *buf, int count1)
1022{
1023    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1024    int len, nb_sectors, count;
1025    int64_t sector_num;
1026    int ret;
1027
1028    count = count1;
1029    /* first write to align to sector start */
1030    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1031    if (len > count)
1032        len = count;
1033    sector_num = offset >> BDRV_SECTOR_BITS;
1034    if (len > 0) {
1035        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1036            return ret;
1037        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1038        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1039            return ret;
1040        count -= len;
1041        if (count == 0)
1042            return count1;
1043        sector_num++;
1044        buf += len;
1045    }
1046
1047    /* write the sectors "in place" */
1048    nb_sectors = count >> BDRV_SECTOR_BITS;
1049    if (nb_sectors > 0) {
1050        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1051            return ret;
1052        sector_num += nb_sectors;
1053        len = nb_sectors << BDRV_SECTOR_BITS;
1054        buf += len;
1055        count -= len;
1056    }
1057
1058    /* add data from the last sector */
1059    if (count > 0) {
1060        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1061            return ret;
1062        memcpy(tmp_buf, buf, count);
1063        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1064            return ret;
1065    }
1066    return count1;
1067}
1068
1069/*
1070 * Writes to the file and ensures that no writes are reordered across this
1071 * request (acts as a barrier)
1072 *
1073 * Returns 0 on success, -errno in error cases.
1074 */
1075int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1076    const void *buf, int count)
1077{
1078    int ret;
1079
1080    ret = bdrv_pwrite(bs, offset, buf, count);
1081    if (ret < 0) {
1082        return ret;
1083    }
1084
1085    /* No flush needed for cache=writethrough, it uses O_DSYNC */
1086    if ((bs->open_flags & BDRV_O_CACHE_MASK) != 0) {
1087        bdrv_flush(bs);
1088    }
1089
1090    return 0;
1091}
1092
1093/*
1094 * Writes to the file and ensures that no writes are reordered across this
1095 * request (acts as a barrier)
1096 *
1097 * Returns 0 on success, -errno in error cases.
1098 */
1099int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num,
1100    const uint8_t *buf, int nb_sectors)
1101{
1102    return bdrv_pwrite_sync(bs, BDRV_SECTOR_SIZE * sector_num,
1103        buf, BDRV_SECTOR_SIZE * nb_sectors);
1104}
1105
1106/**
1107 * Truncate file to 'offset' bytes (needed only for file protocols)
1108 */
1109int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1110{
1111    BlockDriver *drv = bs->drv;
1112    int ret;
1113    if (!drv)
1114        return -ENOMEDIUM;
1115    if (!drv->bdrv_truncate)
1116        return -ENOTSUP;
1117    if (bs->read_only)
1118        return -EACCES;
1119    ret = drv->bdrv_truncate(bs, offset);
1120    if (ret == 0) {
1121        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1122    }
1123    return ret;
1124}
1125
1126/**
1127 * Length of a file in bytes. Return < 0 if error or unknown.
1128 */
1129int64_t bdrv_getlength(BlockDriverState *bs)
1130{
1131    BlockDriver *drv = bs->drv;
1132    if (!drv)
1133        return -ENOMEDIUM;
1134
1135    /* Fixed size devices use the total_sectors value for speed instead of
1136       issuing a length query (like lseek) on each call.  Also, legacy block
1137       drivers don't provide a bdrv_getlength function and must use
1138       total_sectors. */
1139    if (!bs->growable || !drv->bdrv_getlength) {
1140        return bs->total_sectors * BDRV_SECTOR_SIZE;
1141    }
1142    return drv->bdrv_getlength(bs);
1143}
1144
1145/* return 0 as number of sectors if no device present or error */
1146void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1147{
1148    int64_t length;
1149    length = bdrv_getlength(bs);
1150    if (length < 0)
1151        length = 0;
1152    else
1153        length = length >> BDRV_SECTOR_BITS;
1154    *nb_sectors_ptr = length;
1155}
1156
/* One 16-byte entry of the MSDOS partition table; guess_disk_lchs() reads
   four of them starting at byte 0x1be of sector 0 */
struct partition {
    uint8_t  boot_ind;      /* 0x80 - active */
    uint8_t  head;          /* starting head */
    uint8_t  sector;        /* starting sector */
    uint8_t  cyl;           /* starting cylinder */
    uint8_t  sys_ind;       /* What partition type */
    uint8_t  end_head;      /* end head */
    uint8_t  end_sector;    /* end sector */
    uint8_t  end_cyl;       /* end cylinder */
    uint32_t start_sect;    /* starting sector counting from 0 */
    uint32_t nr_sects;      /* nr of sectors in partition */
} __attribute__((packed));
1169
1170/* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1171static int guess_disk_lchs(BlockDriverState *bs,
1172                           int *pcylinders, int *pheads, int *psectors)
1173{
1174    uint8_t buf[BDRV_SECTOR_SIZE];
1175    int ret, i, heads, sectors, cylinders;
1176    struct partition *p;
1177    uint32_t nr_sects;
1178    uint64_t nb_sectors;
1179
1180    bdrv_get_geometry(bs, &nb_sectors);
1181
1182    ret = bdrv_read(bs, 0, buf, 1);
1183    if (ret < 0)
1184        return -1;
1185    /* test msdos magic */
1186    if (buf[510] != 0x55 || buf[511] != 0xaa)
1187        return -1;
1188    for(i = 0; i < 4; i++) {
1189        p = ((struct partition *)(buf + 0x1be)) + i;
1190        nr_sects = le32_to_cpu(p->nr_sects);
1191        if (nr_sects && p->end_head) {
1192            /* We make the assumption that the partition terminates on
1193               a cylinder boundary */
1194            heads = p->end_head + 1;
1195            sectors = p->end_sector & 63;
1196            if (sectors == 0)
1197                continue;
1198            cylinders = nb_sectors / (heads * sectors);
1199            if (cylinders < 1 || cylinders > 16383)
1200                continue;
1201            *pheads = heads;
1202            *psectors = sectors;
1203            *pcylinders = cylinders;
1204#if 0
1205            printf("guessed geometry: LCHS=%d %d %d\n",
1206                   cylinders, heads, sectors);
1207#endif
1208            return 0;
1209        }
1210    }
1211    return -1;
1212}
1213
1214void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
1215{
1216    int translation, lba_detected = 0;
1217    int cylinders, heads, secs;
1218    uint64_t nb_sectors;
1219
1220    /* if a geometry hint is available, use it */
1221    bdrv_get_geometry(bs, &nb_sectors);
1222    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
1223    translation = bdrv_get_translation_hint(bs);
1224    if (cylinders != 0) {
1225        *pcyls = cylinders;
1226        *pheads = heads;
1227        *psecs = secs;
1228    } else {
1229        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
1230            if (heads > 16) {
1231                /* if heads > 16, it means that a BIOS LBA
1232                   translation was active, so the default
1233                   hardware geometry is OK */
1234                lba_detected = 1;
1235                goto default_geometry;
1236            } else {
1237                *pcyls = cylinders;
1238                *pheads = heads;
1239                *psecs = secs;
1240                /* disable any translation to be in sync with
1241                   the logical geometry */
1242                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
1243                    bdrv_set_translation_hint(bs,
1244                                              BIOS_ATA_TRANSLATION_NONE);
1245                }
1246            }
1247        } else {
1248        default_geometry:
1249            /* if no geometry, use a standard physical disk geometry */
1250            cylinders = nb_sectors / (16 * 63);
1251
1252            if (cylinders > 16383)
1253                cylinders = 16383;
1254            else if (cylinders < 2)
1255                cylinders = 2;
1256            *pcyls = cylinders;
1257            *pheads = 16;
1258            *psecs = 63;
1259            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
1260                if ((*pcyls * *pheads) <= 131072) {
1261                    bdrv_set_translation_hint(bs,
1262                                              BIOS_ATA_TRANSLATION_LARGE);
1263                } else {
1264                    bdrv_set_translation_hint(bs,
1265                                              BIOS_ATA_TRANSLATION_LBA);
1266                }
1267            }
1268        }
1269        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
1270    }
1271}
1272
1273void bdrv_set_geometry_hint(BlockDriverState *bs,
1274                            int cyls, int heads, int secs)
1275{
1276    bs->cyls = cyls;
1277    bs->heads = heads;
1278    bs->secs = secs;
1279}
1280
1281void bdrv_set_type_hint(BlockDriverState *bs, int type)
1282{
1283    bs->type = type;
1284    bs->removable = ((type == BDRV_TYPE_CDROM ||
1285                      type == BDRV_TYPE_FLOPPY));
1286}
1287
1288void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
1289{
1290    bs->translation = translation;
1291}
1292
1293void bdrv_get_geometry_hint(BlockDriverState *bs,
1294                            int *pcyls, int *pheads, int *psecs)
1295{
1296    *pcyls = bs->cyls;
1297    *pheads = bs->heads;
1298    *psecs = bs->secs;
1299}
1300
1301int bdrv_get_type_hint(BlockDriverState *bs)
1302{
1303    return bs->type;
1304}
1305
1306int bdrv_get_translation_hint(BlockDriverState *bs)
1307{
1308    return bs->translation;
1309}
1310
1311void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
1312                       BlockErrorAction on_write_error)
1313{
1314    bs->on_read_error = on_read_error;
1315    bs->on_write_error = on_write_error;
1316}
1317
1318BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
1319{
1320    return is_read ? bs->on_read_error : bs->on_write_error;
1321}
1322
1323void bdrv_set_removable(BlockDriverState *bs, int removable)
1324{
1325    bs->removable = removable;
1326    if (removable && bs == bs_snapshots) {
1327        bs_snapshots = NULL;
1328    }
1329}
1330
1331int bdrv_is_removable(BlockDriverState *bs)
1332{
1333    return bs->removable;
1334}
1335
1336int bdrv_is_read_only(BlockDriverState *bs)
1337{
1338    return bs->read_only;
1339}
1340
1341int bdrv_is_sg(BlockDriverState *bs)
1342{
1343    return bs->sg;
1344}
1345
1346int bdrv_enable_write_cache(BlockDriverState *bs)
1347{
1348    return bs->enable_write_cache;
1349}
1350
1351/* XXX: no longer used */
1352void bdrv_set_change_cb(BlockDriverState *bs,
1353                        void (*change_cb)(void *opaque), void *opaque)
1354{
1355    bs->change_cb = change_cb;
1356    bs->change_opaque = opaque;
1357}
1358
1359int bdrv_is_encrypted(BlockDriverState *bs)
1360{
1361    if (bs->backing_hd && bs->backing_hd->encrypted)
1362        return 1;
1363    return bs->encrypted;
1364}
1365
1366int bdrv_key_required(BlockDriverState *bs)
1367{
1368    BlockDriverState *backing_hd = bs->backing_hd;
1369
1370    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
1371        return 1;
1372    return (bs->encrypted && !bs->valid_key);
1373}
1374
1375int bdrv_set_key(BlockDriverState *bs, const char *key)
1376{
1377    int ret;
1378    if (bs->backing_hd && bs->backing_hd->encrypted) {
1379        ret = bdrv_set_key(bs->backing_hd, key);
1380        if (ret < 0)
1381            return ret;
1382        if (!bs->encrypted)
1383            return 0;
1384    }
1385    if (!bs->encrypted) {
1386        return -EINVAL;
1387    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
1388        return -ENOMEDIUM;
1389    }
1390    ret = bs->drv->bdrv_set_key(bs, key);
1391    if (ret < 0) {
1392        bs->valid_key = 0;
1393    } else if (!bs->valid_key) {
1394        bs->valid_key = 1;
1395        /* call the change callback now, we skipped it on open */
1396        bs->media_changed = 1;
1397        if (bs->change_cb)
1398            bs->change_cb(bs->change_opaque);
1399    }
1400    return ret;
1401}
1402
1403void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
1404{
1405    if (!bs->drv) {
1406        buf[0] = '\0';
1407    } else {
1408        pstrcpy(buf, buf_size, bs->drv->format_name);
1409    }
1410}
1411
1412void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
1413                         void *opaque)
1414{
1415    BlockDriver *drv;
1416
1417    QLIST_FOREACH(drv, &bdrv_drivers, list) {
1418        it(opaque, drv->format_name);
1419    }
1420}
1421
1422BlockDriverState *bdrv_find(const char *name)
1423{
1424    BlockDriverState *bs;
1425
1426    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1427        if (!strcmp(name, bs->device_name)) {
1428            return bs;
1429        }
1430    }
1431    return NULL;
1432}
1433
1434BlockDriverState *bdrv_next(BlockDriverState *bs)
1435{
1436    if (!bs) {
1437        return QTAILQ_FIRST(&bdrv_states);
1438    }
1439    return QTAILQ_NEXT(bs, list);
1440}
1441
1442void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
1443{
1444    BlockDriverState *bs;
1445
1446    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1447        it(opaque, bs);
1448    }
1449}
1450
1451const char *bdrv_get_device_name(BlockDriverState *bs)
1452{
1453    return bs->device_name;
1454}
1455
1456void bdrv_flush(BlockDriverState *bs)
1457{
1458    if (bs->open_flags & BDRV_O_NO_FLUSH) {
1459        return;
1460    }
1461
1462    if (bs->drv && bs->drv->bdrv_flush)
1463        bs->drv->bdrv_flush(bs);
1464}
1465
1466void bdrv_flush_all(void)
1467{
1468    BlockDriverState *bs;
1469
1470    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1471        if (bs->drv && !bdrv_is_read_only(bs) &&
1472            (!bdrv_is_removable(bs) || bdrv_is_inserted(bs))) {
1473            bdrv_flush(bs);
1474        }
1475    }
1476}
1477
1478int bdrv_has_zero_init(BlockDriverState *bs)
1479{
1480    assert(bs->drv);
1481
1482    if (bs->drv->bdrv_has_zero_init) {
1483        return bs->drv->bdrv_has_zero_init(bs);
1484    }
1485
1486    return 1;
1487}
1488
1489/*
1490 * Returns true iff the specified sector is present in the disk image. Drivers
1491 * not implementing the functionality are assumed to not support backing files,
1492 * hence all their sectors are reported as allocated.
1493 *
1494 * 'pnum' is set to the number of sectors (including and immediately following
1495 * the specified sector) that are known to be in the same
1496 * allocated/unallocated state.
1497 *
1498 * 'nb_sectors' is the max value 'pnum' should be set to.
1499 */
1500int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1501	int *pnum)
1502{
1503    int64_t n;
1504    if (!bs->drv->bdrv_is_allocated) {
1505        if (sector_num >= bs->total_sectors) {
1506            *pnum = 0;
1507            return 0;
1508        }
1509        n = bs->total_sectors - sector_num;
1510        *pnum = (n < nb_sectors) ? (n) : (nb_sectors);
1511        return 1;
1512    }
1513    return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum);
1514}
1515
1516void bdrv_mon_event(const BlockDriverState *bdrv,
1517                    BlockMonEventAction action, int is_read)
1518{
1519    QObject *data;
1520    const char *action_str;
1521
1522    switch (action) {
1523    case BDRV_ACTION_REPORT:
1524        action_str = "report";
1525        break;
1526    case BDRV_ACTION_IGNORE:
1527        action_str = "ignore";
1528        break;
1529    case BDRV_ACTION_STOP:
1530        action_str = "stop";
1531        break;
1532    default:
1533        abort();
1534    }
1535
1536    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1537                              bdrv->device_name,
1538                              action_str,
1539                              is_read ? "read" : "write");
1540    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1541
1542    qobject_decref(data);
1543}
1544
1545static void bdrv_print_dict(QObject *obj, void *opaque)
1546{
1547    QDict *bs_dict;
1548    Monitor *mon = opaque;
1549
1550    bs_dict = qobject_to_qdict(obj);
1551
1552    monitor_printf(mon, "%s: type=%s removable=%d",
1553                        qdict_get_str(bs_dict, "device"),
1554                        qdict_get_str(bs_dict, "type"),
1555                        qdict_get_bool(bs_dict, "removable"));
1556
1557    if (qdict_get_bool(bs_dict, "removable")) {
1558        monitor_printf(mon, " locked=%d", qdict_get_bool(bs_dict, "locked"));
1559    }
1560
1561    if (qdict_haskey(bs_dict, "inserted")) {
1562        QDict *qdict = qobject_to_qdict(qdict_get(bs_dict, "inserted"));
1563
1564        monitor_printf(mon, " file=");
1565        monitor_print_filename(mon, qdict_get_str(qdict, "file"));
1566        if (qdict_haskey(qdict, "backing_file")) {
1567            monitor_printf(mon, " backing_file=");
1568            monitor_print_filename(mon, qdict_get_str(qdict, "backing_file"));
1569        }
1570        monitor_printf(mon, " ro=%d drv=%s encrypted=%d",
1571                            qdict_get_bool(qdict, "ro"),
1572                            qdict_get_str(qdict, "drv"),
1573                            qdict_get_bool(qdict, "encrypted"));
1574    } else {
1575        monitor_printf(mon, " [not inserted]");
1576    }
1577
1578    monitor_printf(mon, "\n");
1579}
1580
1581void bdrv_info_print(Monitor *mon, const QObject *data)
1582{
1583    qlist_iter(qobject_to_qlist(data), bdrv_print_dict, mon);
1584}
1585
1586void bdrv_info(Monitor *mon, QObject **ret_data)
1587{
1588    QList *bs_list;
1589    BlockDriverState *bs;
1590
1591    bs_list = qlist_new();
1592
1593    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1594        QObject *bs_obj;
1595        const char *type = "unknown";
1596
1597        switch(bs->type) {
1598        case BDRV_TYPE_HD:
1599            type = "hd";
1600            break;
1601        case BDRV_TYPE_CDROM:
1602            type = "cdrom";
1603            break;
1604        case BDRV_TYPE_FLOPPY:
1605            type = "floppy";
1606            break;
1607        }
1608
1609        bs_obj = qobject_from_jsonf("{ 'device': %s, 'type': %s, "
1610                                    "'removable': %i, 'locked': %i }",
1611                                    bs->device_name, type, bs->removable,
1612                                    bs->locked);
1613
1614        if (bs->drv) {
1615            QObject *obj;
1616            QDict *bs_dict = qobject_to_qdict(bs_obj);
1617
1618            obj = qobject_from_jsonf("{ 'file': %s, 'ro': %i, 'drv': %s, "
1619                                     "'encrypted': %i }",
1620                                     bs->filename, bs->read_only,
1621                                     bs->drv->format_name,
1622                                     bdrv_is_encrypted(bs));
1623            if (bs->backing_file[0] != '\0') {
1624                QDict *qdict = qobject_to_qdict(obj);
1625                qdict_put(qdict, "backing_file",
1626                          qstring_from_str(bs->backing_file));
1627            }
1628
1629            qdict_put_obj(bs_dict, "inserted", obj);
1630        }
1631        qlist_append_obj(bs_list, bs_obj);
1632    }
1633
1634    *ret_data = QOBJECT(bs_list);
1635}
1636
1637static void bdrv_stats_iter(QObject *data, void *opaque)
1638{
1639    QDict *qdict;
1640    Monitor *mon = opaque;
1641
1642    qdict = qobject_to_qdict(data);
1643    monitor_printf(mon, "%s:", qdict_get_str(qdict, "device"));
1644
1645    qdict = qobject_to_qdict(qdict_get(qdict, "stats"));
1646    monitor_printf(mon, " rd_bytes=%" PRId64
1647                        " wr_bytes=%" PRId64
1648                        " rd_operations=%" PRId64
1649                        " wr_operations=%" PRId64
1650                        "\n",
1651                        qdict_get_int(qdict, "rd_bytes"),
1652                        qdict_get_int(qdict, "wr_bytes"),
1653                        qdict_get_int(qdict, "rd_operations"),
1654                        qdict_get_int(qdict, "wr_operations"));
1655}
1656
1657void bdrv_stats_print(Monitor *mon, const QObject *data)
1658{
1659    qlist_iter(qobject_to_qlist(data), bdrv_stats_iter, mon);
1660}
1661
1662static QObject* bdrv_info_stats_bs(BlockDriverState *bs)
1663{
1664    QObject *res;
1665    QDict *dict;
1666
1667    res = qobject_from_jsonf("{ 'stats': {"
1668                             "'rd_bytes': %" PRId64 ","
1669                             "'wr_bytes': %" PRId64 ","
1670                             "'rd_operations': %" PRId64 ","
1671                             "'wr_operations': %" PRId64 ","
1672                             "'wr_highest_offset': %" PRId64
1673                             "} }",
1674                             bs->rd_bytes, bs->wr_bytes,
1675                             bs->rd_ops, bs->wr_ops,
1676                             bs->wr_highest_sector *
1677                             (uint64_t)BDRV_SECTOR_SIZE);
1678    dict  = qobject_to_qdict(res);
1679
1680    if (*bs->device_name) {
1681        qdict_put(dict, "device", qstring_from_str(bs->device_name));
1682    }
1683
1684    if (bs->file) {
1685        QObject *parent = bdrv_info_stats_bs(bs->file);
1686        qdict_put_obj(dict, "parent", parent);
1687    }
1688
1689    return res;
1690}
1691
1692void bdrv_info_stats(Monitor *mon, QObject **ret_data)
1693{
1694    QObject *obj;
1695    QList *devices;
1696    BlockDriverState *bs;
1697
1698    devices = qlist_new();
1699
1700    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1701        obj = bdrv_info_stats_bs(bs);
1702        qlist_append_obj(devices, obj);
1703    }
1704
1705    *ret_data = QOBJECT(devices);
1706}
1707
1708const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
1709{
1710    if (bs->backing_hd && bs->backing_hd->encrypted)
1711        return bs->backing_file;
1712    else if (bs->encrypted)
1713        return bs->filename;
1714    else
1715        return NULL;
1716}
1717
1718void bdrv_get_backing_filename(BlockDriverState *bs,
1719                               char *filename, int filename_size)
1720{
1721    if (!bs->backing_file) {
1722        pstrcpy(filename, filename_size, "");
1723    } else {
1724        pstrcpy(filename, filename_size, bs->backing_file);
1725    }
1726}
1727
1728int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1729                          const uint8_t *buf, int nb_sectors)
1730{
1731    BlockDriver *drv = bs->drv;
1732    if (!drv)
1733        return -ENOMEDIUM;
1734    if (!drv->bdrv_write_compressed)
1735        return -ENOTSUP;
1736    if (bdrv_check_request(bs, sector_num, nb_sectors))
1737        return -EIO;
1738
1739    if (bs->dirty_bitmap) {
1740        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1741    }
1742
1743    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1744}
1745
1746int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1747{
1748    BlockDriver *drv = bs->drv;
1749    if (!drv)
1750        return -ENOMEDIUM;
1751    if (!drv->bdrv_get_info)
1752        return -ENOTSUP;
1753    memset(bdi, 0, sizeof(*bdi));
1754    return drv->bdrv_get_info(bs, bdi);
1755}
1756
1757int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1758                      int64_t pos, int size)
1759{
1760    BlockDriver *drv = bs->drv;
1761    if (!drv)
1762        return -ENOMEDIUM;
1763    if (drv->bdrv_save_vmstate)
1764        return drv->bdrv_save_vmstate(bs, buf, pos, size);
1765    if (bs->file)
1766        return bdrv_save_vmstate(bs->file, buf, pos, size);
1767    return -ENOTSUP;
1768}
1769
1770int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1771                      int64_t pos, int size)
1772{
1773    BlockDriver *drv = bs->drv;
1774    if (!drv)
1775        return -ENOMEDIUM;
1776    if (drv->bdrv_load_vmstate)
1777        return drv->bdrv_load_vmstate(bs, buf, pos, size);
1778    if (bs->file)
1779        return bdrv_load_vmstate(bs->file, buf, pos, size);
1780    return -ENOTSUP;
1781}
1782
1783void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
1784{
1785    BlockDriver *drv = bs->drv;
1786
1787    if (!drv || !drv->bdrv_debug_event) {
1788        return;
1789    }
1790
1791    return drv->bdrv_debug_event(bs, event);
1792
1793}
1794
1795/**************************************************************/
1796/* handling of snapshots */
1797
1798int bdrv_can_snapshot(BlockDriverState *bs)
1799{
1800    BlockDriver *drv = bs->drv;
1801    if (!drv || bdrv_is_removable(bs) || bdrv_is_read_only(bs)) {
1802        return 0;
1803    }
1804
1805    if (!drv->bdrv_snapshot_create) {
1806        if (bs->file != NULL) {
1807            return bdrv_can_snapshot(bs->file);
1808        }
1809        return 0;
1810    }
1811
1812    return 1;
1813}
1814
1815int bdrv_is_snapshot(BlockDriverState *bs)
1816{
1817    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
1818}
1819
1820BlockDriverState *bdrv_snapshots(void)
1821{
1822    BlockDriverState *bs;
1823
1824    if (bs_snapshots) {
1825        return bs_snapshots;
1826    }
1827
1828    bs = NULL;
1829    while ((bs = bdrv_next(bs))) {
1830        if (bdrv_can_snapshot(bs)) {
1831            bs_snapshots = bs;
1832            return bs;
1833        }
1834    }
1835    return NULL;
1836}
1837
1838int bdrv_snapshot_create(BlockDriverState *bs,
1839                         QEMUSnapshotInfo *sn_info)
1840{
1841    BlockDriver *drv = bs->drv;
1842    if (!drv)
1843        return -ENOMEDIUM;
1844    if (drv->bdrv_snapshot_create)
1845        return drv->bdrv_snapshot_create(bs, sn_info);
1846    if (bs->file)
1847        return bdrv_snapshot_create(bs->file, sn_info);
1848    return -ENOTSUP;
1849}
1850
1851int bdrv_snapshot_goto(BlockDriverState *bs,
1852                       const char *snapshot_id)
1853{
1854    BlockDriver *drv = bs->drv;
1855    int ret, open_ret;
1856
1857    if (!drv)
1858        return -ENOMEDIUM;
1859    if (drv->bdrv_snapshot_goto)
1860        return drv->bdrv_snapshot_goto(bs, snapshot_id);
1861
1862    if (bs->file) {
1863        drv->bdrv_close(bs);
1864        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
1865        open_ret = drv->bdrv_open(bs, bs->open_flags);
1866        if (open_ret < 0) {
1867            bdrv_delete(bs->file);
1868            bs->drv = NULL;
1869            return open_ret;
1870        }
1871        return ret;
1872    }
1873
1874    return -ENOTSUP;
1875}
1876
1877int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
1878{
1879    BlockDriver *drv = bs->drv;
1880    if (!drv)
1881        return -ENOMEDIUM;
1882    if (drv->bdrv_snapshot_delete)
1883        return drv->bdrv_snapshot_delete(bs, snapshot_id);
1884    if (bs->file)
1885        return bdrv_snapshot_delete(bs->file, snapshot_id);
1886    return -ENOTSUP;
1887}
1888
1889int bdrv_snapshot_list(BlockDriverState *bs,
1890                       QEMUSnapshotInfo **psn_info)
1891{
1892    BlockDriver *drv = bs->drv;
1893    if (!drv)
1894        return -ENOMEDIUM;
1895    if (drv->bdrv_snapshot_list)
1896        return drv->bdrv_snapshot_list(bs, psn_info);
1897    if (bs->file)
1898        return bdrv_snapshot_list(bs->file, psn_info);
1899    return -ENOTSUP;
1900}
1901
#define NB_SUFFIXES 4

/*
 * Format 'size' into 'buf' (at most 'buf_size' bytes) in a compact form
 * and return 'buf'.
 *
 * Values up to 999 are printed as plain integers.  Larger values are
 * scaled by powers of 1024 and tagged with K, M, G or T: one decimal is
 * shown while the scaled value is below 10, a rounded integer otherwise.
 * T is the largest unit, so very large values print as many T.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int idx = 0;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    /* pick the first unit under which the value needs fewer than four
       digits, stopping at the last suffix */
    while (idx < NB_SUFFIXES - 1 && size >= 1000 * unit) {
        unit *= 1024;
        idx++;
    }

    if (size < 10 * unit) {
        snprintf(buf, buf_size, "%0.1f%c",
                 (double)size / unit,
                 suffixes[idx]);
    } else {
        /* adding half a unit rounds to nearest */
        snprintf(buf, buf_size, "%" PRId64 "%c",
                 (size + (unit >> 1)) / unit,
                 suffixes[idx]);
    }
    return buf;
}
1931
1932char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
1933{
1934    char buf1[128], date_buf[128], clock_buf[128];
1935#ifdef _WIN32
1936    struct tm *ptm;
1937#else
1938    struct tm tm;
1939#endif
1940    time_t ti;
1941    int64_t secs;
1942
1943    if (!sn) {
1944        snprintf(buf, buf_size,
1945                 "%-10s%-20s%7s%20s%15s",
1946                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
1947    } else {
1948        ti = sn->date_sec;
1949#ifdef _WIN32
1950        ptm = localtime(&ti);
1951        strftime(date_buf, sizeof(date_buf),
1952                 "%Y-%m-%d %H:%M:%S", ptm);
1953#else
1954        localtime_r(&ti, &tm);
1955        strftime(date_buf, sizeof(date_buf),
1956                 "%Y-%m-%d %H:%M:%S", &tm);
1957#endif
1958        secs = sn->vm_clock_nsec / 1000000000;
1959        snprintf(clock_buf, sizeof(clock_buf),
1960                 "%02d:%02d:%02d.%03d",
1961                 (int)(secs / 3600),
1962                 (int)((secs / 60) % 60),
1963                 (int)(secs % 60),
1964                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
1965        snprintf(buf, buf_size,
1966                 "%-10s%-20s%7s%20s%15s",
1967                 sn->id_str, sn->name,
1968                 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
1969                 date_buf,
1970                 clock_buf);
1971    }
1972    return buf;
1973}
1974
1975
1976/**************************************************************/
1977/* async I/Os */
1978
1979BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
1980                                 QEMUIOVector *qiov, int nb_sectors,
1981                                 BlockDriverCompletionFunc *cb, void *opaque)
1982{
1983    BlockDriver *drv = bs->drv;
1984    BlockDriverAIOCB *ret;
1985
1986    if (!drv)
1987        return NULL;
1988    if (bdrv_check_request(bs, sector_num, nb_sectors))
1989        return NULL;
1990
1991    ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
1992                              cb, opaque);
1993
1994    if (ret) {
1995	/* Update stats even though technically transfer has not happened. */
1996	bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
1997	bs->rd_ops ++;
1998    }
1999
2000    return ret;
2001}
2002
2003BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2004                                  QEMUIOVector *qiov, int nb_sectors,
2005                                  BlockDriverCompletionFunc *cb, void *opaque)
2006{
2007    BlockDriver *drv = bs->drv;
2008    BlockDriverAIOCB *ret;
2009
2010    if (!drv)
2011        return NULL;
2012    if (bs->read_only)
2013        return NULL;
2014    if (bdrv_check_request(bs, sector_num, nb_sectors))
2015        return NULL;
2016
2017    if (bs->dirty_bitmap) {
2018        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2019    }
2020
2021    ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
2022                               cb, opaque);
2023
2024    if (ret) {
2025        /* Update stats even though technically transfer has not happened. */
2026        bs->wr_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
2027        bs->wr_ops ++;
2028        if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2029            bs->wr_highest_sector = sector_num + nb_sectors - 1;
2030        }
2031    }
2032
2033    return ret;
2034}
2035
2036
2037typedef struct MultiwriteCB {
2038    int error;
2039    int num_requests;
2040    int num_callbacks;
2041    struct {
2042        BlockDriverCompletionFunc *cb;
2043        void *opaque;
2044        QEMUIOVector *free_qiov;
2045        void *free_buf;
2046    } callbacks[];
2047} MultiwriteCB;
2048
2049static void multiwrite_user_cb(MultiwriteCB *mcb)
2050{
2051    int i;
2052
2053    for (i = 0; i < mcb->num_callbacks; i++) {
2054        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2055        if (mcb->callbacks[i].free_qiov) {
2056            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2057        }
2058        g_free(mcb->callbacks[i].free_qiov);
2059        qemu_vfree(mcb->callbacks[i].free_buf);
2060    }
2061}
2062
2063static void multiwrite_cb(void *opaque, int ret)
2064{
2065    MultiwriteCB *mcb = opaque;
2066
2067    if (ret < 0 && !mcb->error) {
2068        mcb->error = ret;
2069    }
2070
2071    mcb->num_requests--;
2072    if (mcb->num_requests == 0) {
2073        multiwrite_user_cb(mcb);
2074        g_free(mcb);
2075    }
2076}
2077
2078static int multiwrite_req_compare(const void *a, const void *b)
2079{
2080    const BlockRequest *req1 = a, *req2 = b;
2081
2082    /*
2083     * Note that we can't simply subtract req2->sector from req1->sector
2084     * here as that could overflow the return value.
2085     */
2086    if (req1->sector > req2->sector) {
2087        return 1;
2088    } else if (req1->sector < req2->sector) {
2089        return -1;
2090    } else {
2091        return 0;
2092    }
2093}
2094
2095/*
2096 * Takes a bunch of requests and tries to merge them. Returns the number of
2097 * requests that remain after merging.
2098 */
2099static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2100    int num_reqs, MultiwriteCB *mcb)
2101{
2102    int i, outidx;
2103
2104    // Sort requests by start sector
2105    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2106
2107    // Check if adjacent requests touch the same clusters. If so, combine them,
2108    // filling up gaps with zero sectors.
2109    outidx = 0;
2110    for (i = 1; i < num_reqs; i++) {
2111        int merge = 0;
2112        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2113
2114        // This handles the cases that are valid for all block drivers, namely
2115        // exactly sequential writes and overlapping writes.
2116        if (reqs[i].sector <= oldreq_last) {
2117            merge = 1;
2118        }
2119
2120        // The block driver may decide that it makes sense to combine requests
2121        // even if there is a gap of some sectors between them. In this case,
2122        // the gap is filled with zeros (therefore only applicable for yet
2123        // unused space in format like qcow2).
2124        if (!merge && bs->drv->bdrv_merge_requests) {
2125            merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
2126        }
2127
2128        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2129            merge = 0;
2130        }
2131
2132        if (merge) {
2133            size_t size;
2134            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
2135            qemu_iovec_init(qiov,
2136                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2137
2138            // Add the first request to the merged one. If the requests are
2139            // overlapping, drop the last sectors of the first request.
2140            size = (reqs[i].sector - reqs[outidx].sector) << 9;
2141            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
2142
2143            // We might need to add some zeros between the two requests
2144            if (reqs[i].sector > oldreq_last) {
2145                size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
2146                uint8_t *buf = qemu_blockalign(bs, zero_bytes);
2147                memset(buf, 0, zero_bytes);
2148                qemu_iovec_add(qiov, buf, zero_bytes);
2149                mcb->callbacks[i].free_buf = buf;
2150            }
2151
2152            // Add the second request
2153            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
2154
2155            reqs[outidx].nb_sectors = qiov->size >> 9;
2156            reqs[outidx].qiov = qiov;
2157
2158            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2159        } else {
2160            outidx++;
2161            reqs[outidx].sector     = reqs[i].sector;
2162            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2163            reqs[outidx].qiov       = reqs[i].qiov;
2164        }
2165    }
2166
2167    return outidx + 1;
2168}
2169
2170/*
2171 * Submit multiple AIO write requests at once.
2172 *
2173 * On success, the function returns 0 and all requests in the reqs array have
2174 * been submitted. In error case this function returns -1, and any of the
2175 * requests may or may not be submitted yet. In particular, this means that the
2176 * callback will be called for some of the requests, for others it won't. The
2177 * caller must check the error field of the BlockRequest to wait for the right
2178 * callbacks (if error != 0, no callback will be called).
2179 *
2180 * The implementation may modify the contents of the reqs array, e.g. to merge
2181 * requests. However, the fields opaque and error are left unmodified as they
2182 * are used to signal failure for a single request to the caller.
2183 */
2184int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
2185{
2186    BlockDriverAIOCB *acb;
2187    MultiwriteCB *mcb;
2188    int i;
2189
2190    if (num_reqs == 0) {
2191        return 0;
2192    }
2193
2194    // Create MultiwriteCB structure
2195    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
2196    mcb->num_requests = 0;
2197    mcb->num_callbacks = num_reqs;
2198
2199    for (i = 0; i < num_reqs; i++) {
2200        mcb->callbacks[i].cb = reqs[i].cb;
2201        mcb->callbacks[i].opaque = reqs[i].opaque;
2202    }
2203
2204    // Check for mergable requests
2205    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
2206
2207    /*
2208     * Run the aio requests. As soon as one request can't be submitted
2209     * successfully, fail all requests that are not yet submitted (we must
2210     * return failure for all requests anyway)
2211     *
2212     * num_requests cannot be set to the right value immediately: If
2213     * bdrv_aio_writev fails for some request, num_requests would be too high
2214     * and therefore multiwrite_cb() would never recognize the multiwrite
2215     * request as completed. We also cannot use the loop variable i to set it
2216     * when the first request fails because the callback may already have been
2217     * called for previously submitted requests. Thus, num_requests must be
2218     * incremented for each request that is submitted.
2219     *
2220     * The problem that callbacks may be called early also means that we need
2221     * to take care that num_requests doesn't become 0 before all requests are
2222     * submitted - multiwrite_cb() would consider the multiwrite request
2223     * completed. A dummy request that is "completed" by a manual call to
2224     * multiwrite_cb() takes care of this.
2225     */
2226    mcb->num_requests = 1;
2227
2228    for (i = 0; i < num_reqs; i++) {
2229        mcb->num_requests++;
2230        acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
2231            reqs[i].nb_sectors, multiwrite_cb, mcb);
2232
2233        if (acb == NULL) {
2234            // We can only fail the whole thing if no request has been
2235            // submitted yet. Otherwise we'll wait for the submitted AIOs to
2236            // complete and report the error in the callback.
2237            if (i == 0) {
2238                goto fail;
2239            } else {
2240                multiwrite_cb(mcb, -EIO);
2241                break;
2242            }
2243        }
2244    }
2245
2246    /* Complete the dummy request */
2247    multiwrite_cb(mcb, 0);
2248
2249    return 0;
2250
2251fail:
2252    for (i = 0; i < mcb->num_callbacks; i++) {
2253        reqs[i].error = -EIO;
2254    }
2255    g_free(mcb);
2256    return -1;
2257}
2258
2259BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2260        BlockDriverCompletionFunc *cb, void *opaque)
2261{
2262    BlockDriver *drv = bs->drv;
2263
2264    if (bs->open_flags & BDRV_O_NO_FLUSH) {
2265        return bdrv_aio_noop_em(bs, cb, opaque);
2266    }
2267
2268    if (!drv)
2269        return NULL;
2270    return drv->bdrv_aio_flush(bs, cb, opaque);
2271}
2272
2273void bdrv_aio_cancel(BlockDriverAIOCB *acb)
2274{
2275    acb->pool->cancel(acb);
2276}
2277
2278
2279/**************************************************************/
2280/* async block device emulation */
2281
/*
 * AIOCB used by the emulated asynchronous operations below: the work is done
 * synchronously at submission time and the completion callback is delivered
 * from a bottom half.
 */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;    /* generic part; recovered via container_of */
    QEMUBH *bh;                 /* bottom half that runs bdrv_aio_bh_cb() */
    int ret;                    /* result handed to the completion callback */
    /* vector translation state */
    QEMUIOVector *qiov;         /* caller's vector (NULL for flush/no-op) */
    uint8_t *bounce;            /* linear bounce buffer (NULL for flush/no-op) */
    int is_write;               /* non-zero: don't copy bounce back to qiov */
} BlockDriverAIOCBSync;
2291
2292static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
2293{
2294    BlockDriverAIOCBSync *acb =
2295        container_of(blockacb, BlockDriverAIOCBSync, common);
2296    qemu_bh_delete(acb->bh);
2297    acb->bh = NULL;
2298    qemu_aio_release(acb);
2299}
2300
2301static AIOPool bdrv_em_aio_pool = {
2302    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
2303    .cancel             = bdrv_aio_cancel_em,
2304};
2305
2306static void bdrv_aio_bh_cb(void *opaque)
2307{
2308    BlockDriverAIOCBSync *acb = opaque;
2309
2310    if (!acb->is_write)
2311        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
2312    qemu_vfree(acb->bounce);
2313    acb->common.cb(acb->common.opaque, acb->ret);
2314    qemu_bh_delete(acb->bh);
2315    acb->bh = NULL;
2316    qemu_aio_release(acb);
2317}
2318
2319static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2320                                            int64_t sector_num,
2321                                            QEMUIOVector *qiov,
2322                                            int nb_sectors,
2323                                            BlockDriverCompletionFunc *cb,
2324                                            void *opaque,
2325                                            int is_write)
2326
2327{
2328    BlockDriverAIOCBSync *acb;
2329
2330    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2331    acb->is_write = is_write;
2332    acb->qiov = qiov;
2333    acb->bounce = qemu_blockalign(bs, qiov->size);
2334
2335    if (!acb->bh)
2336        acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2337
2338    if (is_write) {
2339        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
2340        acb->ret = bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2341    } else {
2342        acb->ret = bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2343    }
2344
2345    qemu_bh_schedule(acb->bh);
2346
2347    return &acb->common;
2348}
2349
2350static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2351        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2352        BlockDriverCompletionFunc *cb, void *opaque)
2353{
2354    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2355}
2356
2357static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2358        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2359        BlockDriverCompletionFunc *cb, void *opaque)
2360{
2361    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2362}
2363
2364static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
2365        BlockDriverCompletionFunc *cb, void *opaque)
2366{
2367    BlockDriverAIOCBSync *acb;
2368
2369    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2370    acb->is_write = 1; /* don't bounce in the completion hadler */
2371    acb->qiov = NULL;
2372    acb->bounce = NULL;
2373    acb->ret = 0;
2374
2375    if (!acb->bh)
2376        acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2377
2378    bdrv_flush(bs);
2379    qemu_bh_schedule(acb->bh);
2380    return &acb->common;
2381}
2382
2383static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
2384        BlockDriverCompletionFunc *cb, void *opaque)
2385{
2386    BlockDriverAIOCBSync *acb;
2387
2388    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2389    acb->is_write = 1; /* don't bounce in the completion handler */
2390    acb->qiov = NULL;
2391    acb->bounce = NULL;
2392    acb->ret = 0;
2393
2394    if (!acb->bh) {
2395        acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2396    }
2397
2398    qemu_bh_schedule(acb->bh);
2399    return &acb->common;
2400}
2401
2402/**************************************************************/
2403/* sync block device emulation */
2404
/*
 * Completion callback for the synchronous emulation: stores the result in
 * the int that opaque points to.
 */
static void bdrv_rw_em_cb(void *opaque, int ret)
{
    int *result = opaque;

    *result = ret;
}
2409
2410#define NOT_DONE 0x7fffffff
2411
2412static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
2413                        uint8_t *buf, int nb_sectors)
2414{
2415    int async_ret;
2416    BlockDriverAIOCB *acb;
2417    struct iovec iov;
2418    QEMUIOVector qiov;
2419
2420    async_context_push();
2421
2422    async_ret = NOT_DONE;
2423    iov.iov_base = (void *)buf;
2424    iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
2425    qemu_iovec_init_external(&qiov, &iov, 1);
2426    acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors,
2427        bdrv_rw_em_cb, &async_ret);
2428    if (acb == NULL) {
2429        async_ret = -1;
2430        goto fail;
2431    }
2432
2433    while (async_ret == NOT_DONE) {
2434        qemu_aio_wait();
2435    }
2436
2437
2438fail:
2439    async_context_pop();
2440    return async_ret;
2441}
2442
2443static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
2444                         const uint8_t *buf, int nb_sectors)
2445{
2446    int async_ret;
2447    BlockDriverAIOCB *acb;
2448    struct iovec iov;
2449    QEMUIOVector qiov;
2450
2451    async_context_push();
2452
2453    async_ret = NOT_DONE;
2454    iov.iov_base = (void *)buf;
2455    iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
2456    qemu_iovec_init_external(&qiov, &iov, 1);
2457    acb = bdrv_aio_writev(bs, sector_num, &qiov, nb_sectors,
2458        bdrv_rw_em_cb, &async_ret);
2459    if (acb == NULL) {
2460        async_ret = -1;
2461        goto fail;
2462    }
2463    while (async_ret == NOT_DONE) {
2464        qemu_aio_wait();
2465    }
2466
2467fail:
2468    async_context_pop();
2469    return async_ret;
2470}
2471
/* Run the module init functions registered for MODULE_INIT_BLOCK. */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
2476
/* Same as bdrv_init(), but sets use_bdrv_whitelist to 1 beforehand. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
2482
2483void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
2484                   BlockDriverCompletionFunc *cb, void *opaque)
2485{
2486    BlockDriverAIOCB *acb;
2487
2488    if (pool->free_aiocb) {
2489        acb = pool->free_aiocb;
2490        pool->free_aiocb = acb->next;
2491    } else {
2492        acb = g_malloc0(pool->aiocb_size);
2493        acb->pool = pool;
2494    }
2495    acb->bs = bs;
2496    acb->cb = cb;
2497    acb->opaque = opaque;
2498    return acb;
2499}
2500
2501void qemu_aio_release(void *p)
2502{
2503    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
2504    AIOPool *pool = acb->pool;
2505    acb->next = pool->free_aiocb;
2506    pool->free_aiocb = acb;
2507}
2508
2509/**************************************************************/
2510/* removable device support */
2511
2512/**
2513 * Return TRUE if the media is present
2514 */
2515int bdrv_is_inserted(BlockDriverState *bs)
2516{
2517    BlockDriver *drv = bs->drv;
2518    int ret;
2519    if (!drv)
2520        return 0;
2521    if (!drv->bdrv_is_inserted)
2522        return !bs->tray_open;
2523    ret = drv->bdrv_is_inserted(bs);
2524    return ret;
2525}
2526
2527/**
2528 * Return TRUE if the media changed since the last call to this
2529 * function. It is currently only used for floppy disks
2530 */
2531int bdrv_media_changed(BlockDriverState *bs)
2532{
2533    BlockDriver *drv = bs->drv;
2534    int ret;
2535
2536    if (!drv || !drv->bdrv_media_changed)
2537        ret = -ENOTSUP;
2538    else
2539        ret = drv->bdrv_media_changed(bs);
2540    if (ret == -ENOTSUP)
2541        ret = bs->media_changed;
2542    bs->media_changed = 0;
2543    return ret;
2544}
2545
2546/**
2547 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
2548 */
2549int bdrv_eject(BlockDriverState *bs, int eject_flag)
2550{
2551    BlockDriver *drv = bs->drv;
2552    int ret;
2553
2554    if (bs->locked) {
2555        return -EBUSY;
2556    }
2557
2558    if (!drv || !drv->bdrv_eject) {
2559        ret = -ENOTSUP;
2560    } else {
2561        ret = drv->bdrv_eject(bs, eject_flag);
2562    }
2563    if (ret == -ENOTSUP) {
2564        ret = 0;
2565    }
2566    if (ret >= 0) {
2567        bs->tray_open = eject_flag;
2568    }
2569
2570    return ret;
2571}
2572
/* Return the locked flag of bs, as last stored by bdrv_set_locked(). */
int bdrv_is_locked(BlockDriverState *bs)
{
    return bs->locked;
}
2577
2578/**
2579 * Lock or unlock the media (if it is locked, the user won't be able
2580 * to eject it manually).
2581 */
2582void bdrv_set_locked(BlockDriverState *bs, int locked)
2583{
2584    BlockDriver *drv = bs->drv;
2585
2586    bs->locked = locked;
2587    if (drv && drv->bdrv_set_locked) {
2588        drv->bdrv_set_locked(bs, locked);
2589    }
2590}
2591
2592/* needed for generic scsi interface */
2593
2594int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2595{
2596    BlockDriver *drv = bs->drv;
2597
2598    if (drv && drv->bdrv_ioctl)
2599        return drv->bdrv_ioctl(bs, req, buf);
2600    return -ENOTSUP;
2601}
2602
2603BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2604        unsigned long int req, void *buf,
2605        BlockDriverCompletionFunc *cb, void *opaque)
2606{
2607    BlockDriver *drv = bs->drv;
2608
2609    if (drv && drv->bdrv_aio_ioctl)
2610        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
2611    return NULL;
2612}
2613
2614
2615
2616void *qemu_blockalign(BlockDriverState *bs, size_t size)
2617{
2618    return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
2619}
2620
2621void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
2622{
2623    int64_t bitmap_size;
2624
2625    bs->dirty_count = 0;
2626    if (enable) {
2627        if (!bs->dirty_bitmap) {
2628            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
2629                    BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
2630            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
2631
2632            bs->dirty_bitmap = g_malloc0(bitmap_size);
2633        }
2634    } else {
2635        if (bs->dirty_bitmap) {
2636            g_free(bs->dirty_bitmap);
2637            bs->dirty_bitmap = NULL;
2638        }
2639    }
2640}
2641
2642int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
2643{
2644    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
2645
2646    if (bs->dirty_bitmap &&
2647        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
2648        return bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
2649            (1 << (chunk % (sizeof(unsigned long) * 8)));
2650    } else {
2651        return 0;
2652    }
2653}
2654
/*
 * Call set_dirty_bitmap() with a dirty value of 0 for the nr_sectors
 * sectors starting at cur_sector.
 */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}
2660
/* Return the current value of the dirty counter of bs. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}
2665