1/*
2 * QEMU System Emulator block driver
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24#include "config-host.h"
25#include "qemu-common.h"
26#include "monitor.h"
27#include "block_int.h"
28#include "module.h"
29#include "qemu-objects.h"
30
31#ifdef CONFIG_BSD
32#include <sys/types.h>
33#include <sys/stat.h>
34#include <sys/ioctl.h>
35#include <sys/queue.h>
36#ifndef __DragonFly__
37#include <sys/disk.h>
38#endif
39#endif
40
41#ifdef _WIN32
42#include <windows.h>
43#endif
44
/*
 * Forward declarations of file-local emulation helpers.  bdrv_register()
 * installs several of them into drivers that leave the corresponding
 * callback unset.
 */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque);
static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
                        uint8_t *buf, int nb_sectors);
static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
                         const uint8_t *buf, int nb_sectors);

/* Block device states; bdrv_new() appends every state with a non-empty name */
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

/* Registered block drivers; bdrv_register() inserts at the head */
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
71
/*
 * Tell whether 'path' is absolute.
 *
 * Everything up to and including the first ':' (if any) is skipped, so
 * "proto:/dir/file" counts as absolute.  The character that follows must be
 * '/'; on Windows '\\' is accepted too, and a path that itself starts with
 * '/' or '\\' (e.g. "\\.\d:") is absolute regardless of any later colon.
 *
 * Returns 1 if absolute, 0 otherwise.
 */
int _path_is_absolute(const char *path)
{
    const char *colon = strchr(path, ':');
    const char *start = colon ? colon + 1 : path;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (path[0] == '/' || path[0] == '\\') {
        return 1;
    }
    if (*start == '\\') {
        return 1;
    }
#endif
    return *start == '/';
}
91
/*
 * Build the path of 'filename' relative to 'base_path' into 'dest'.
 *
 * An absolute 'filename' is copied as is.  Otherwise the prefix of
 * 'base_path' is kept up to whichever comes later: the character after its
 * first ':' (URL-style protocol prefix) or the character after its last
 * directory separator.  'filename' is then appended.  The result is
 * truncated to fit 'dest_size' bytes including the terminator; nothing is
 * written when 'dest_size' is not positive.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *sep, *after_proto, *after_dir, *cut;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (_path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* end of an optional "protocol:" prefix */
    sep = strchr(base_path, ':');
    after_proto = sep ? sep + 1 : base_path;

    /* end of the directory part */
    sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *bslash = strrchr(base_path, '\\');

        if (!sep || bslash > sep) {
            sep = bslash;
        }
    }
#endif
    after_dir = sep ? sep + 1 : base_path;

    cut = (after_dir > after_proto) ? after_dir : after_proto;

    prefix_len = cut - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
135
136void bdrv_register(BlockDriver *bdrv)
137{
138    if (!bdrv->bdrv_aio_readv) {
139        /* add AIO emulation layer */
140        bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
141        bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
142    } else if (!bdrv->bdrv_read) {
143        /* add synchronous IO emulation layer */
144        bdrv->bdrv_read = bdrv_read_em;
145        bdrv->bdrv_write = bdrv_write_em;
146    }
147
148    if (!bdrv->bdrv_aio_flush)
149        bdrv->bdrv_aio_flush = bdrv_aio_flush_em;
150
151    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
152}
153
154/* create a new block device (by default it is empty) */
155BlockDriverState *bdrv_new(const char *device_name)
156{
157    BlockDriverState *bs;
158
159    bs = qemu_mallocz(sizeof(BlockDriverState));
160    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
161    if (device_name[0] != '\0') {
162        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
163    }
164    return bs;
165}
166
167BlockDriver *bdrv_find_format(const char *format_name)
168{
169    BlockDriver *drv1;
170    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
171        if (!strcmp(drv1->format_name, format_name)) {
172            return drv1;
173        }
174    }
175    return NULL;
176}
177
178static int bdrv_is_whitelisted(BlockDriver *drv)
179{
180    static const char *whitelist[] = {
181        CONFIG_BDRV_WHITELIST
182    };
183    const char **p;
184
185    if (!whitelist[0])
186        return 1;               /* no whitelist, anything goes */
187
188    for (p = whitelist; *p; p++) {
189        if (!strcmp(drv->format_name, *p)) {
190            return 1;
191        }
192    }
193    return 0;
194}
195
196BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
197{
198    BlockDriver *drv = bdrv_find_format(format_name);
199    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
200}
201
202int bdrv_create(BlockDriver *drv, const char* filename,
203    QEMUOptionParameter *options)
204{
205    if (!drv->bdrv_create)
206        return -ENOTSUP;
207
208    return drv->bdrv_create(filename, options);
209}
210
211int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
212{
213    BlockDriver *drv;
214
215    drv = bdrv_find_protocol(filename);
216    if (drv == NULL) {
217        drv = bdrv_find_format("file");
218    }
219
220    return bdrv_create(drv, filename, options);
221}
222
#ifdef _WIN32
/*
 * Create a temporary file and store its name in 'filename'.
 *
 * 'size' is the capacity of 'filename' in bytes.  The name is first
 * generated in a local MAX_PATH buffer (GetTempFileName() requires that
 * much room) and only copied out if it fits, so the caller's buffer is
 * never overrun.  On any failure 'filename' is set to the empty string.
 */
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];
    char temp_name[MAX_PATH];
    DWORD dir_len;

    if (size <= 0) {
        return;
    }
    filename[0] = '\0';

    dir_len = GetTempPath(MAX_PATH, temp_dir);
    if (dir_len == 0 || dir_len > MAX_PATH) {
        return;
    }
    if (GetTempFileName(temp_dir, "qem", 0, temp_name) == 0) {
        return;
    }
    if (strlen(temp_name) >= (size_t)size) {
        /* name does not fit: do not leave the just-created file behind */
        DeleteFile(temp_name);
        return;
    }
    pstrcpy(filename, size, temp_name);
}
#else
/*
 * Create a temporary file under $TMPDIR (or /tmp) and store its name in
 * 'filename'.
 *
 * 'size' is the capacity of 'filename' in bytes.  On failure (name too
 * long for the buffer, or mkstemp() error) 'filename' is set to the empty
 * string rather than left holding the unexpanded template.
 */
void get_tmp_filename(char *filename, int size)
{
    int fd, n;
    const char *tmpdir;

    if (size <= 0) {
        return;
    }

    /* XXX: race condition possible (file is closed and re-opened by name) */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    n = snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    if (n < 0 || n >= size) {
        /* truncated template would make mkstemp() fail anyway */
        filename[0] = '\0';
        return;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        filename[0] = '\0';
        return;
    }
    close(fd);
}
#endif
245
246#ifdef _WIN32
/*
 * Return 1 if 'filename' starts with an ASCII letter followed by ':'
 * (e.g. "c:" or "D:\dir"), 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    int is_lower = (letter >= 'a' && letter <= 'z');
    int is_upper = (letter >= 'A' && letter <= 'Z');

    /* filename[1] is only inspected once filename[0] is known non-NUL */
    return (is_lower || is_upper) && filename[1] == ':';
}
253
/*
 * Return 1 if 'filename' names a Windows drive: either a bare drive
 * specification such as "d:", or a name starting with "\\.\" or "//./".
 * Returns 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    int bare_drive = is_windows_drive_prefix(filename) &&
                     filename[2] == '\0';

    if (bare_drive) {
        return 1;
    }
    return strstart(filename, "\\\\.\\", NULL) ||
           strstart(filename, "//./", NULL);
}
264#endif
265
266/*
267 * Detect host devices. By convention, /dev/cdrom[N] is always
268 * recognized as a host CDROM.
269 */
270static BlockDriver *find_hdev_driver(const char *filename)
271{
272    int score_max = 0, score;
273    BlockDriver *drv = NULL, *d;
274
275    QLIST_FOREACH(d, &bdrv_drivers, list) {
276        if (d->bdrv_probe_device) {
277            score = d->bdrv_probe_device(filename);
278            if (score > score_max) {
279                score_max = score;
280                drv = d;
281            }
282        }
283    }
284
285    return drv;
286}
287
288BlockDriver *bdrv_find_protocol(const char *filename)
289{
290    BlockDriver *drv1;
291    char protocol[128];
292    int len;
293    const char *p;
294
295    /* TODO Drivers without bdrv_file_open must be specified explicitly */
296
297    /*
298     * XXX(hch): we really should not let host device detection
299     * override an explicit protocol specification, but moving this
300     * later breaks access to device names with colons in them.
301     * Thanks to the brain-dead persistent naming schemes on udev-
302     * based Linux systems those actually are quite common.
303     */
304    drv1 = find_hdev_driver(filename);
305    if (drv1) {
306        return drv1;
307    }
308
309#ifdef _WIN32
310     if (is_windows_drive(filename) ||
311         is_windows_drive_prefix(filename))
312         return bdrv_find_format("file");
313#endif
314
315    p = strchr(filename, ':');
316    if (!p) {
317        return bdrv_find_format("file");
318    }
319    len = p - filename;
320    if (len > sizeof(protocol) - 1)
321        len = sizeof(protocol) - 1;
322    memcpy(protocol, filename, len);
323    protocol[len] = '\0';
324    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
325        if (drv1->protocol_name &&
326            !strcmp(drv1->protocol_name, protocol)) {
327            return drv1;
328        }
329    }
330    return NULL;
331}
332
333static int find_image_format(const char *filename, BlockDriver **pdrv)
334{
335    int ret, score, score_max;
336    BlockDriver *drv1, *drv;
337    uint8_t buf[2048];
338    BlockDriverState *bs;
339
340    ret = bdrv_file_open(&bs, filename, 0);
341    if (ret < 0) {
342        *pdrv = NULL;
343        return ret;
344    }
345
346    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
347    if (bs->sg || !bdrv_is_inserted(bs)) {
348        bdrv_delete(bs);
349        drv = bdrv_find_format("raw");
350        if (!drv) {
351            ret = -ENOENT;
352        }
353        *pdrv = drv;
354        return ret;
355    }
356
357    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
358    bdrv_delete(bs);
359    if (ret < 0) {
360        *pdrv = NULL;
361        return ret;
362    }
363
364    score_max = 0;
365    drv = NULL;
366    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
367        if (drv1->bdrv_probe) {
368            score = drv1->bdrv_probe(buf, ret, filename);
369            if (score > score_max) {
370                score_max = score;
371                drv = drv1;
372            }
373        }
374    }
375    if (!drv) {
376        ret = -ENOENT;
377    }
378    *pdrv = drv;
379    return ret;
380}
381
382/**
383 * Set the current 'total_sectors' value
384 */
385static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
386{
387    BlockDriver *drv = bs->drv;
388
389    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
390    if (bs->sg)
391        return 0;
392
393    /* query actual device if possible, otherwise just trust the hint */
394    if (drv->bdrv_getlength) {
395        int64_t length = drv->bdrv_getlength(bs);
396        if (length < 0) {
397            return length;
398        }
399        hint = length >> BDRV_SECTOR_BITS;
400    }
401
402    bs->total_sectors = hint;
403    return 0;
404}
405
/*
 * Common part for opening disk images and files
 *
 * Resets the per-open fields of 'bs', records 'filename' and 'flags',
 * allocates the driver's private state and opens the image with 'drv',
 * which must not be NULL.
 *
 * Returns 0 on success.  Returns -ENOTSUP when whitelisting is enabled and
 * 'drv' is not whitelisted, or the negative value reported by the open
 * callback or by refresh_total_sectors(); in the latter two cases bs->file,
 * bs->opaque and bs->drv are released and cleared.
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->open_flags = flags;
    /* buffer_alignment defaulted to 512, drivers can change this value */
    bs->buffer_alignment = 512;

    pstrcpy(bs->filename, sizeof(bs->filename), filename);

    /* checked before bs->drv/bs->opaque are set, so nothing to undo here */
    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = qemu_mallocz(drv->instance_size);

    /*
     * Yes, BDRV_O_NOCACHE aka O_DIRECT means we have to present a
     * write cache to the guest.  We do need the fdatasync to flush
     * out transactions for block allocations, and we maybe have a
     * volatile write cache in our backing device to deal with.
     */
    if (flags & (BDRV_O_CACHE_WB|BDRV_O_NOCACHE))
        bs->enable_write_cache = 1;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writeable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        /* format driver: open the underlying file first, then the format */
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    /* read-only state follows the flags actually used for the open */
    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    /* the temporary file is unlinked right after opening (non-Windows) */
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    /* undo everything acquired after the whitelist check */
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    qemu_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
493
494/*
495 * Opens a file using a protocol (file, host_device, nbd, ...)
496 */
497int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
498{
499    BlockDriverState *bs;
500    BlockDriver *drv;
501    int ret;
502
503    drv = bdrv_find_protocol(filename);
504    if (!drv) {
505        return -ENOENT;
506    }
507
508    bs = bdrv_new("");
509    ret = bdrv_open_common(bs, filename, flags, drv);
510    if (ret < 0) {
511        bdrv_delete(bs);
512        return ret;
513    }
514    bs->growable = 1;
515    *pbs = bs;
516    return 0;
517}
518
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * bs:       state to open the image into
 * filename: image to open
 * flags:    BDRV_O_* flags; BDRV_O_SNAPSHOT redirects the open to a
 *           temporary qcow2 overlay backed by 'filename', BDRV_O_NO_BACKING
 *           suppresses opening the backing file
 * drv:      format driver to use, or NULL to probe with find_image_format()
 *
 * Returns 0 on success, a negative errno-style value on failure.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    int probed = 0;

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char tmp_filename[PATH_MAX];
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        /* open the original image once, only to learn its size and whether
           its driver is a protocol driver */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        /* NOTE(review): the early returns below leave whatever file
           get_tmp_filename() created in place */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        /* NOTE(review): bdrv_find_format() can return NULL; the result is
           dereferenced without a check */
        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        /* from here on, open the temporary overlay instead of the image */
        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
        probed = 1;
    }

    /* reached with drv == NULL only when probing failed, so 'ret' is set */
    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    bs->probed = probed;

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");
        /* a relative backing file name is resolved against this image */
        path_combine(backing_filename, sizeof(backing_filename),
                     filename, bs->backing_file);
        if (bs->backing_format[0] != '\0')
            back_drv = bdrv_find_format(bs->backing_format);

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            /* bdrv_close() also deletes bs->backing_hd */
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            /* snapshot overlay: the base follows the caller's RDWR flag */
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        /* call the change callback */
        bs->media_changed = 1;
        if (bs->change_cb)
            bs->change_cb(bs->change_opaque);
    }

    return 0;

unlink_and_fail:
    /* remove the temporary snapshot overlay created above, if any */
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}
646
647void bdrv_close(BlockDriverState *bs)
648{
649    if (bs->drv) {
650        if (bs == bs_snapshots) {
651            bs_snapshots = NULL;
652        }
653        if (bs->backing_hd) {
654            bdrv_delete(bs->backing_hd);
655            bs->backing_hd = NULL;
656        }
657        bs->drv->bdrv_close(bs);
658        qemu_free(bs->opaque);
659#ifdef _WIN32
660        if (bs->is_temporary) {
661            unlink(bs->filename);
662        }
663#endif
664        bs->opaque = NULL;
665        bs->drv = NULL;
666
667        if (bs->file != NULL) {
668            bdrv_close(bs->file);
669        }
670
671        /* call the change callback */
672        bs->media_changed = 1;
673        if (bs->change_cb)
674            bs->change_cb(bs->change_opaque);
675    }
676}
677
678void bdrv_close_all(void)
679{
680    BlockDriverState *bs;
681
682    QTAILQ_FOREACH(bs, &bdrv_states, list) {
683        bdrv_close(bs);
684    }
685}
686
687void bdrv_delete(BlockDriverState *bs)
688{
689    assert(!bs->peer);
690
691    /* remove from list, if necessary */
692    if (bs->device_name[0] != '\0') {
693        QTAILQ_REMOVE(&bdrv_states, bs, list);
694    }
695
696    bdrv_close(bs);
697    if (bs->file != NULL) {
698        bdrv_delete(bs->file);
699    }
700
701    assert(bs != bs_snapshots);
702    qemu_free(bs);
703}
704
705int bdrv_attach(BlockDriverState *bs, DeviceState *qdev)
706{
707    if (bs->peer) {
708        return -EBUSY;
709    }
710    bs->peer = qdev;
711    return 0;
712}
713
714void bdrv_detach(BlockDriverState *bs, DeviceState *qdev)
715{
716    assert(bs->peer == qdev);
717    bs->peer = NULL;
718}
719
720DeviceState *bdrv_get_attached(BlockDriverState *bs)
721{
722    return bs->peer;
723}
724
725/*
726 * Run consistency checks on an image
727 *
728 * Returns 0 if the check could be completed (it doesn't mean that the image is
729 * free of errors) or -errno when an internal error occured. The results of the
730 * check are stored in res.
731 */
732int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
733{
734    if (bs->drv->bdrv_check == NULL) {
735        return -ENOTSUP;
736    }
737
738    memset(res, 0, sizeof(*res));
739    return bs->drv->bdrv_check(bs, res);
740}
741
742#define COMMIT_BUF_SECTORS 2048
743
744/* commit COW file into the raw image */
745int bdrv_commit(BlockDriverState *bs)
746{
747    BlockDriver *drv = bs->drv;
748    int64_t sector, total_sectors;
749    int n, ro, open_flags;
750    int ret = 0, rw_ret = 0;
751    uint8_t *buf;
752    char filename[1024];
753    BlockDriverState *bs_rw, *bs_ro;
754
755    if (!drv)
756        return -ENOMEDIUM;
757
758    if (!bs->backing_hd) {
759        return -ENOTSUP;
760    }
761
762    if (bs->backing_hd->keep_read_only) {
763        return -EACCES;
764    }
765
766    ro = bs->backing_hd->read_only;
767    strncpy(filename, bs->backing_hd->filename, sizeof(filename));
768    open_flags =  bs->backing_hd->open_flags;
769
770    if (ro) {
771        /* re-open as RW */
772        bdrv_delete(bs->backing_hd);
773        bs->backing_hd = NULL;
774        bs_rw = bdrv_new("");
775        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR, drv);
776        if (rw_ret < 0) {
777            bdrv_delete(bs_rw);
778            /* try to re-open read-only */
779            bs_ro = bdrv_new("");
780            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR, drv);
781            if (ret < 0) {
782                bdrv_delete(bs_ro);
783                /* drive not functional anymore */
784                bs->drv = NULL;
785                return ret;
786            }
787            bs->backing_hd = bs_ro;
788            return rw_ret;
789        }
790        bs->backing_hd = bs_rw;
791    }
792
793    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
794    buf = qemu_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
795
796    for (sector = 0; sector < total_sectors; sector += n) {
797        if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
798
799            if (bdrv_read(bs, sector, buf, n) != 0) {
800                ret = -EIO;
801                goto ro_cleanup;
802            }
803
804            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
805                ret = -EIO;
806                goto ro_cleanup;
807            }
808        }
809    }
810
811    if (drv->bdrv_make_empty) {
812        ret = drv->bdrv_make_empty(bs);
813        bdrv_flush(bs);
814    }
815
816    /*
817     * Make sure all data we wrote to the backing device is actually
818     * stable on disk.
819     */
820    if (bs->backing_hd)
821        bdrv_flush(bs->backing_hd);
822
823ro_cleanup:
824    qemu_free(buf);
825
826    if (ro) {
827        /* re-open as RO */
828        bdrv_delete(bs->backing_hd);
829        bs->backing_hd = NULL;
830        bs_ro = bdrv_new("");
831        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR, drv);
832        if (ret < 0) {
833            bdrv_delete(bs_ro);
834            /* drive not functional anymore */
835            bs->drv = NULL;
836            return ret;
837        }
838        bs->backing_hd = bs_ro;
839        bs->backing_hd->keep_read_only = 0;
840    }
841
842    return ret;
843}
844
845void bdrv_commit_all(void)
846{
847    BlockDriverState *bs;
848
849    QTAILQ_FOREACH(bs, &bdrv_states, list) {
850        bdrv_commit(bs);
851    }
852}
853
854/*
855 * Return values:
856 * 0        - success
857 * -EINVAL  - backing format specified, but no file
858 * -ENOSPC  - can't update the backing file because no space is left in the
859 *            image file header
860 * -ENOTSUP - format driver doesn't support changing the backing file
861 */
862int bdrv_change_backing_file(BlockDriverState *bs,
863    const char *backing_file, const char *backing_fmt)
864{
865    BlockDriver *drv = bs->drv;
866
867    if (drv->bdrv_change_backing_file != NULL) {
868        return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
869    } else {
870        return -ENOTSUP;
871    }
872}
873
874static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
875                                   size_t size)
876{
877    int64_t len;
878
879    if (!bdrv_is_inserted(bs))
880        return -ENOMEDIUM;
881
882    if (bs->growable)
883        return 0;
884
885    len = bdrv_getlength(bs);
886
887    if (offset < 0)
888        return -EIO;
889
890    if ((offset > len) || (len - offset < size))
891        return -EIO;
892
893    return 0;
894}
895
896static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
897                              int nb_sectors)
898{
899    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
900                                   nb_sectors * BDRV_SECTOR_SIZE);
901}
902
903/* return < 0 if error. See bdrv_write() for the return codes */
904int bdrv_read(BlockDriverState *bs, int64_t sector_num,
905              uint8_t *buf, int nb_sectors)
906{
907    BlockDriver *drv = bs->drv;
908
909    if (!drv)
910        return -ENOMEDIUM;
911    if (bdrv_check_request(bs, sector_num, nb_sectors))
912        return -EIO;
913
914    return drv->bdrv_read(bs, sector_num, buf, nb_sectors);
915}
916
917static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
918                             int nb_sectors, int dirty)
919{
920    int64_t start, end;
921    unsigned long val, idx, bit;
922
923    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
924    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
925
926    for (; start <= end; start++) {
927        idx = start / (sizeof(unsigned long) * 8);
928        bit = start % (sizeof(unsigned long) * 8);
929        val = bs->dirty_bitmap[idx];
930        if (dirty) {
931            if (!(val & (1 << bit))) {
932                bs->dirty_count++;
933                val |= 1 << bit;
934            }
935        } else {
936            if (val & (1 << bit)) {
937                bs->dirty_count--;
938                val &= ~(1 << bit);
939            }
940        }
941        bs->dirty_bitmap[idx] = val;
942    }
943}
944
945/* Return < 0 if error. Important errors are:
946  -EIO         generic I/O error (may happen for all errors)
947  -ENOMEDIUM   No media inserted.
948  -EINVAL      Invalid sector number or nb_sectors
949  -EACCES      Trying to write a read-only device
950*/
951int bdrv_write(BlockDriverState *bs, int64_t sector_num,
952               const uint8_t *buf, int nb_sectors)
953{
954    BlockDriver *drv = bs->drv;
955    if (!bs->drv)
956        return -ENOMEDIUM;
957    if (bs->read_only)
958        return -EACCES;
959    if (bdrv_check_request(bs, sector_num, nb_sectors))
960        return -EIO;
961
962    if (bs->dirty_bitmap) {
963        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
964    }
965
966    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
967        bs->wr_highest_sector = sector_num + nb_sectors - 1;
968    }
969
970    return drv->bdrv_write(bs, sector_num, buf, nb_sectors);
971}
972
973int bdrv_pread(BlockDriverState *bs, int64_t offset,
974               void *buf, int count1)
975{
976    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
977    int len, nb_sectors, count;
978    int64_t sector_num;
979    int ret;
980
981    count = count1;
982    /* first read to align to sector start */
983    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
984    if (len > count)
985        len = count;
986    sector_num = offset >> BDRV_SECTOR_BITS;
987    if (len > 0) {
988        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
989            return ret;
990        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
991        count -= len;
992        if (count == 0)
993            return count1;
994        sector_num++;
995        buf += len;
996    }
997
998    /* read the sectors "in place" */
999    nb_sectors = count >> BDRV_SECTOR_BITS;
1000    if (nb_sectors > 0) {
1001        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1002            return ret;
1003        sector_num += nb_sectors;
1004        len = nb_sectors << BDRV_SECTOR_BITS;
1005        buf += len;
1006        count -= len;
1007    }
1008
1009    /* add data from the last sector */
1010    if (count > 0) {
1011        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1012            return ret;
1013        memcpy(buf, tmp_buf, count);
1014    }
1015    return count1;
1016}
1017
1018int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1019                const void *buf, int count1)
1020{
1021    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1022    int len, nb_sectors, count;
1023    int64_t sector_num;
1024    int ret;
1025
1026    count = count1;
1027    /* first write to align to sector start */
1028    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1029    if (len > count)
1030        len = count;
1031    sector_num = offset >> BDRV_SECTOR_BITS;
1032    if (len > 0) {
1033        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1034            return ret;
1035        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1036        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1037            return ret;
1038        count -= len;
1039        if (count == 0)
1040            return count1;
1041        sector_num++;
1042        buf += len;
1043    }
1044
1045    /* write the sectors "in place" */
1046    nb_sectors = count >> BDRV_SECTOR_BITS;
1047    if (nb_sectors > 0) {
1048        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1049            return ret;
1050        sector_num += nb_sectors;
1051        len = nb_sectors << BDRV_SECTOR_BITS;
1052        buf += len;
1053        count -= len;
1054    }
1055
1056    /* add data from the last sector */
1057    if (count > 0) {
1058        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1059            return ret;
1060        memcpy(tmp_buf, buf, count);
1061        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1062            return ret;
1063    }
1064    return count1;
1065}
1066
1067/*
1068 * Writes to the file and ensures that no writes are reordered across this
1069 * request (acts as a barrier)
1070 *
1071 * Returns 0 on success, -errno in error cases.
1072 */
1073int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1074    const void *buf, int count)
1075{
1076    int ret;
1077
1078    ret = bdrv_pwrite(bs, offset, buf, count);
1079    if (ret < 0) {
1080        return ret;
1081    }
1082
1083    /* No flush needed for cache=writethrough, it uses O_DSYNC */
1084    if ((bs->open_flags & BDRV_O_CACHE_MASK) != 0) {
1085        bdrv_flush(bs);
1086    }
1087
1088    return 0;
1089}
1090
1091/*
1092 * Writes to the file and ensures that no writes are reordered across this
1093 * request (acts as a barrier)
1094 *
1095 * Returns 0 on success, -errno in error cases.
1096 */
1097int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num,
1098    const uint8_t *buf, int nb_sectors)
1099{
1100    return bdrv_pwrite_sync(bs, BDRV_SECTOR_SIZE * sector_num,
1101        buf, BDRV_SECTOR_SIZE * nb_sectors);
1102}
1103
1104/**
1105 * Truncate file to 'offset' bytes (needed only for file protocols)
1106 */
1107int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1108{
1109    BlockDriver *drv = bs->drv;
1110    int ret;
1111    if (!drv)
1112        return -ENOMEDIUM;
1113    if (!drv->bdrv_truncate)
1114        return -ENOTSUP;
1115    if (bs->read_only)
1116        return -EACCES;
1117    ret = drv->bdrv_truncate(bs, offset);
1118    if (ret == 0) {
1119        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1120    }
1121    return ret;
1122}
1123
1124/**
1125 * Length of a file in bytes. Return < 0 if error or unknown.
1126 */
1127int64_t bdrv_getlength(BlockDriverState *bs)
1128{
1129    BlockDriver *drv = bs->drv;
1130    if (!drv)
1131        return -ENOMEDIUM;
1132
1133    /* Fixed size devices use the total_sectors value for speed instead of
1134       issuing a length query (like lseek) on each call.  Also, legacy block
1135       drivers don't provide a bdrv_getlength function and must use
1136       total_sectors. */
1137    if (!bs->growable || !drv->bdrv_getlength) {
1138        return bs->total_sectors * BDRV_SECTOR_SIZE;
1139    }
1140    return drv->bdrv_getlength(bs);
1141}
1142
1143/* return 0 as number of sectors if no device present or error */
1144void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1145{
1146    int64_t length;
1147    length = bdrv_getlength(bs);
1148    if (length < 0)
1149        length = 0;
1150    else
1151        length = length >> BDRV_SECTOR_BITS;
1152    *nb_sectors_ptr = length;
1153}
1154
/* One entry of the MSDOS partition table as read from the boot sector
   (see guess_disk_lchs()); packed so it can be overlaid on the raw bytes. */
struct partition {
    uint8_t boot_ind;       /* 0x80 - active */
    uint8_t head;           /* starting head */
    uint8_t sector;         /* starting sector */
    uint8_t cyl;            /* starting cylinder */
    uint8_t sys_ind;        /* what partition type */
    uint8_t end_head;       /* end head */
    uint8_t end_sector;     /* end sector */
    uint8_t end_cyl;        /* end cylinder */
    uint32_t start_sect;    /* starting sector counting from 0 */
    uint32_t nr_sects;      /* nr of sectors in partition */
} __attribute__((packed));
1167
1168/* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1169static int guess_disk_lchs(BlockDriverState *bs,
1170                           int *pcylinders, int *pheads, int *psectors)
1171{
1172    uint8_t buf[BDRV_SECTOR_SIZE];
1173    int ret, i, heads, sectors, cylinders;
1174    struct partition *p;
1175    uint32_t nr_sects;
1176    uint64_t nb_sectors;
1177
1178    bdrv_get_geometry(bs, &nb_sectors);
1179
1180    ret = bdrv_read(bs, 0, buf, 1);
1181    if (ret < 0)
1182        return -1;
1183    /* test msdos magic */
1184    if (buf[510] != 0x55 || buf[511] != 0xaa)
1185        return -1;
1186    for(i = 0; i < 4; i++) {
1187        p = ((struct partition *)(buf + 0x1be)) + i;
1188        nr_sects = le32_to_cpu(p->nr_sects);
1189        if (nr_sects && p->end_head) {
1190            /* We make the assumption that the partition terminates on
1191               a cylinder boundary */
1192            heads = p->end_head + 1;
1193            sectors = p->end_sector & 63;
1194            if (sectors == 0)
1195                continue;
1196            cylinders = nb_sectors / (heads * sectors);
1197            if (cylinders < 1 || cylinders > 16383)
1198                continue;
1199            *pheads = heads;
1200            *psectors = sectors;
1201            *pcylinders = cylinders;
1202#if 0
1203            printf("guessed geometry: LCHS=%d %d %d\n",
1204                   cylinders, heads, sectors);
1205#endif
1206            return 0;
1207        }
1208    }
1209    return -1;
1210}
1211
1212void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
1213{
1214    int translation, lba_detected = 0;
1215    int cylinders, heads, secs;
1216    uint64_t nb_sectors;
1217
1218    /* if a geometry hint is available, use it */
1219    bdrv_get_geometry(bs, &nb_sectors);
1220    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
1221    translation = bdrv_get_translation_hint(bs);
1222    if (cylinders != 0) {
1223        *pcyls = cylinders;
1224        *pheads = heads;
1225        *psecs = secs;
1226    } else {
1227        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
1228            if (heads > 16) {
1229                /* if heads > 16, it means that a BIOS LBA
1230                   translation was active, so the default
1231                   hardware geometry is OK */
1232                lba_detected = 1;
1233                goto default_geometry;
1234            } else {
1235                *pcyls = cylinders;
1236                *pheads = heads;
1237                *psecs = secs;
1238                /* disable any translation to be in sync with
1239                   the logical geometry */
1240                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
1241                    bdrv_set_translation_hint(bs,
1242                                              BIOS_ATA_TRANSLATION_NONE);
1243                }
1244            }
1245        } else {
1246        default_geometry:
1247            /* if no geometry, use a standard physical disk geometry */
1248            cylinders = nb_sectors / (16 * 63);
1249
1250            if (cylinders > 16383)
1251                cylinders = 16383;
1252            else if (cylinders < 2)
1253                cylinders = 2;
1254            *pcyls = cylinders;
1255            *pheads = 16;
1256            *psecs = 63;
1257            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
1258                if ((*pcyls * *pheads) <= 131072) {
1259                    bdrv_set_translation_hint(bs,
1260                                              BIOS_ATA_TRANSLATION_LARGE);
1261                } else {
1262                    bdrv_set_translation_hint(bs,
1263                                              BIOS_ATA_TRANSLATION_LBA);
1264                }
1265            }
1266        }
1267        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
1268    }
1269}
1270
1271void bdrv_set_geometry_hint(BlockDriverState *bs,
1272                            int cyls, int heads, int secs)
1273{
1274    bs->cyls = cyls;
1275    bs->heads = heads;
1276    bs->secs = secs;
1277}
1278
1279void bdrv_set_type_hint(BlockDriverState *bs, int type)
1280{
1281    bs->type = type;
1282    bs->removable = ((type == BDRV_TYPE_CDROM ||
1283                      type == BDRV_TYPE_FLOPPY));
1284}
1285
1286void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
1287{
1288    bs->translation = translation;
1289}
1290
1291void bdrv_get_geometry_hint(BlockDriverState *bs,
1292                            int *pcyls, int *pheads, int *psecs)
1293{
1294    *pcyls = bs->cyls;
1295    *pheads = bs->heads;
1296    *psecs = bs->secs;
1297}
1298
1299int bdrv_get_type_hint(BlockDriverState *bs)
1300{
1301    return bs->type;
1302}
1303
1304int bdrv_get_translation_hint(BlockDriverState *bs)
1305{
1306    return bs->translation;
1307}
1308
1309void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
1310                       BlockErrorAction on_write_error)
1311{
1312    bs->on_read_error = on_read_error;
1313    bs->on_write_error = on_write_error;
1314}
1315
1316BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
1317{
1318    return is_read ? bs->on_read_error : bs->on_write_error;
1319}
1320
1321void bdrv_set_removable(BlockDriverState *bs, int removable)
1322{
1323    bs->removable = removable;
1324    if (removable && bs == bs_snapshots) {
1325        bs_snapshots = NULL;
1326    }
1327}
1328
1329int bdrv_is_removable(BlockDriverState *bs)
1330{
1331    return bs->removable;
1332}
1333
1334int bdrv_is_read_only(BlockDriverState *bs)
1335{
1336    return bs->read_only;
1337}
1338
1339int bdrv_is_sg(BlockDriverState *bs)
1340{
1341    return bs->sg;
1342}
1343
1344int bdrv_enable_write_cache(BlockDriverState *bs)
1345{
1346    return bs->enable_write_cache;
1347}
1348
1349/* XXX: no longer used */
1350void bdrv_set_change_cb(BlockDriverState *bs,
1351                        void (*change_cb)(void *opaque), void *opaque)
1352{
1353    bs->change_cb = change_cb;
1354    bs->change_opaque = opaque;
1355}
1356
1357int bdrv_is_encrypted(BlockDriverState *bs)
1358{
1359    if (bs->backing_hd && bs->backing_hd->encrypted)
1360        return 1;
1361    return bs->encrypted;
1362}
1363
1364int bdrv_key_required(BlockDriverState *bs)
1365{
1366    BlockDriverState *backing_hd = bs->backing_hd;
1367
1368    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
1369        return 1;
1370    return (bs->encrypted && !bs->valid_key);
1371}
1372
1373int bdrv_set_key(BlockDriverState *bs, const char *key)
1374{
1375    int ret;
1376    if (bs->backing_hd && bs->backing_hd->encrypted) {
1377        ret = bdrv_set_key(bs->backing_hd, key);
1378        if (ret < 0)
1379            return ret;
1380        if (!bs->encrypted)
1381            return 0;
1382    }
1383    if (!bs->encrypted) {
1384        return -EINVAL;
1385    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
1386        return -ENOMEDIUM;
1387    }
1388    ret = bs->drv->bdrv_set_key(bs, key);
1389    if (ret < 0) {
1390        bs->valid_key = 0;
1391    } else if (!bs->valid_key) {
1392        bs->valid_key = 1;
1393        /* call the change callback now, we skipped it on open */
1394        bs->media_changed = 1;
1395        if (bs->change_cb)
1396            bs->change_cb(bs->change_opaque);
1397    }
1398    return ret;
1399}
1400
1401void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
1402{
1403    if (!bs->drv) {
1404        buf[0] = '\0';
1405    } else {
1406        pstrcpy(buf, buf_size, bs->drv->format_name);
1407    }
1408}
1409
1410void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
1411                         void *opaque)
1412{
1413    BlockDriver *drv;
1414
1415    QLIST_FOREACH(drv, &bdrv_drivers, list) {
1416        it(opaque, drv->format_name);
1417    }
1418}
1419
1420BlockDriverState *bdrv_find(const char *name)
1421{
1422    BlockDriverState *bs;
1423
1424    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1425        if (!strcmp(name, bs->device_name)) {
1426            return bs;
1427        }
1428    }
1429    return NULL;
1430}
1431
1432BlockDriverState *bdrv_next(BlockDriverState *bs)
1433{
1434    if (!bs) {
1435        return QTAILQ_FIRST(&bdrv_states);
1436    }
1437    return QTAILQ_NEXT(bs, list);
1438}
1439
1440void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
1441{
1442    BlockDriverState *bs;
1443
1444    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1445        it(opaque, bs);
1446    }
1447}
1448
1449const char *bdrv_get_device_name(BlockDriverState *bs)
1450{
1451    return bs->device_name;
1452}
1453
1454void bdrv_flush(BlockDriverState *bs)
1455{
1456    if (bs->open_flags & BDRV_O_NO_FLUSH) {
1457        return;
1458    }
1459
1460    if (bs->drv && bs->drv->bdrv_flush)
1461        bs->drv->bdrv_flush(bs);
1462}
1463
1464void bdrv_flush_all(void)
1465{
1466    BlockDriverState *bs;
1467
1468    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1469        if (bs->drv && !bdrv_is_read_only(bs) &&
1470            (!bdrv_is_removable(bs) || bdrv_is_inserted(bs))) {
1471            bdrv_flush(bs);
1472        }
1473    }
1474}
1475
1476int bdrv_has_zero_init(BlockDriverState *bs)
1477{
1478    assert(bs->drv);
1479
1480    if (bs->drv->bdrv_has_zero_init) {
1481        return bs->drv->bdrv_has_zero_init(bs);
1482    }
1483
1484    return 1;
1485}
1486
1487/*
1488 * Returns true iff the specified sector is present in the disk image. Drivers
1489 * not implementing the functionality are assumed to not support backing files,
1490 * hence all their sectors are reported as allocated.
1491 *
1492 * 'pnum' is set to the number of sectors (including and immediately following
1493 * the specified sector) that are known to be in the same
1494 * allocated/unallocated state.
1495 *
1496 * 'nb_sectors' is the max value 'pnum' should be set to.
1497 */
1498int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1499	int *pnum)
1500{
1501    int64_t n;
1502    if (!bs->drv->bdrv_is_allocated) {
1503        if (sector_num >= bs->total_sectors) {
1504            *pnum = 0;
1505            return 0;
1506        }
1507        n = bs->total_sectors - sector_num;
1508        *pnum = (n < nb_sectors) ? (n) : (nb_sectors);
1509        return 1;
1510    }
1511    return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum);
1512}
1513
1514void bdrv_mon_event(const BlockDriverState *bdrv,
1515                    BlockMonEventAction action, int is_read)
1516{
1517    QObject *data;
1518    const char *action_str;
1519
1520    switch (action) {
1521    case BDRV_ACTION_REPORT:
1522        action_str = "report";
1523        break;
1524    case BDRV_ACTION_IGNORE:
1525        action_str = "ignore";
1526        break;
1527    case BDRV_ACTION_STOP:
1528        action_str = "stop";
1529        break;
1530    default:
1531        abort();
1532    }
1533
1534    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1535                              bdrv->device_name,
1536                              action_str,
1537                              is_read ? "read" : "write");
1538    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1539
1540    qobject_decref(data);
1541}
1542
1543static void bdrv_print_dict(QObject *obj, void *opaque)
1544{
1545    QDict *bs_dict;
1546    Monitor *mon = opaque;
1547
1548    bs_dict = qobject_to_qdict(obj);
1549
1550    monitor_printf(mon, "%s: type=%s removable=%d",
1551                        qdict_get_str(bs_dict, "device"),
1552                        qdict_get_str(bs_dict, "type"),
1553                        qdict_get_bool(bs_dict, "removable"));
1554
1555    if (qdict_get_bool(bs_dict, "removable")) {
1556        monitor_printf(mon, " locked=%d", qdict_get_bool(bs_dict, "locked"));
1557    }
1558
1559    if (qdict_haskey(bs_dict, "inserted")) {
1560        QDict *qdict = qobject_to_qdict(qdict_get(bs_dict, "inserted"));
1561
1562        monitor_printf(mon, " file=");
1563        monitor_print_filename(mon, qdict_get_str(qdict, "file"));
1564        if (qdict_haskey(qdict, "backing_file")) {
1565            monitor_printf(mon, " backing_file=");
1566            monitor_print_filename(mon, qdict_get_str(qdict, "backing_file"));
1567        }
1568        monitor_printf(mon, " ro=%d drv=%s encrypted=%d",
1569                            qdict_get_bool(qdict, "ro"),
1570                            qdict_get_str(qdict, "drv"),
1571                            qdict_get_bool(qdict, "encrypted"));
1572    } else {
1573        monitor_printf(mon, " [not inserted]");
1574    }
1575
1576    monitor_printf(mon, "\n");
1577}
1578
1579void bdrv_info_print(Monitor *mon, const QObject *data)
1580{
1581    qlist_iter(qobject_to_qlist(data), bdrv_print_dict, mon);
1582}
1583
1584void bdrv_info(Monitor *mon, QObject **ret_data)
1585{
1586    QList *bs_list;
1587    BlockDriverState *bs;
1588
1589    bs_list = qlist_new();
1590
1591    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1592        QObject *bs_obj;
1593        const char *type = "unknown";
1594
1595        switch(bs->type) {
1596        case BDRV_TYPE_HD:
1597            type = "hd";
1598            break;
1599        case BDRV_TYPE_CDROM:
1600            type = "cdrom";
1601            break;
1602        case BDRV_TYPE_FLOPPY:
1603            type = "floppy";
1604            break;
1605        }
1606
1607        bs_obj = qobject_from_jsonf("{ 'device': %s, 'type': %s, "
1608                                    "'removable': %i, 'locked': %i }",
1609                                    bs->device_name, type, bs->removable,
1610                                    bs->locked);
1611
1612        if (bs->drv) {
1613            QObject *obj;
1614            QDict *bs_dict = qobject_to_qdict(bs_obj);
1615
1616            obj = qobject_from_jsonf("{ 'file': %s, 'ro': %i, 'drv': %s, "
1617                                     "'encrypted': %i }",
1618                                     bs->filename, bs->read_only,
1619                                     bs->drv->format_name,
1620                                     bdrv_is_encrypted(bs));
1621            if (bs->backing_file[0] != '\0') {
1622                QDict *qdict = qobject_to_qdict(obj);
1623                qdict_put(qdict, "backing_file",
1624                          qstring_from_str(bs->backing_file));
1625            }
1626
1627            qdict_put_obj(bs_dict, "inserted", obj);
1628        }
1629        qlist_append_obj(bs_list, bs_obj);
1630    }
1631
1632    *ret_data = QOBJECT(bs_list);
1633}
1634
1635static void bdrv_stats_iter(QObject *data, void *opaque)
1636{
1637    QDict *qdict;
1638    Monitor *mon = opaque;
1639
1640    qdict = qobject_to_qdict(data);
1641    monitor_printf(mon, "%s:", qdict_get_str(qdict, "device"));
1642
1643    qdict = qobject_to_qdict(qdict_get(qdict, "stats"));
1644    monitor_printf(mon, " rd_bytes=%" PRId64
1645                        " wr_bytes=%" PRId64
1646                        " rd_operations=%" PRId64
1647                        " wr_operations=%" PRId64
1648                        "\n",
1649                        qdict_get_int(qdict, "rd_bytes"),
1650                        qdict_get_int(qdict, "wr_bytes"),
1651                        qdict_get_int(qdict, "rd_operations"),
1652                        qdict_get_int(qdict, "wr_operations"));
1653}
1654
1655void bdrv_stats_print(Monitor *mon, const QObject *data)
1656{
1657    qlist_iter(qobject_to_qlist(data), bdrv_stats_iter, mon);
1658}
1659
1660static QObject* bdrv_info_stats_bs(BlockDriverState *bs)
1661{
1662    QObject *res;
1663    QDict *dict;
1664
1665    res = qobject_from_jsonf("{ 'stats': {"
1666                             "'rd_bytes': %" PRId64 ","
1667                             "'wr_bytes': %" PRId64 ","
1668                             "'rd_operations': %" PRId64 ","
1669                             "'wr_operations': %" PRId64 ","
1670                             "'wr_highest_offset': %" PRId64
1671                             "} }",
1672                             bs->rd_bytes, bs->wr_bytes,
1673                             bs->rd_ops, bs->wr_ops,
1674                             bs->wr_highest_sector *
1675                             (uint64_t)BDRV_SECTOR_SIZE);
1676    dict  = qobject_to_qdict(res);
1677
1678    if (*bs->device_name) {
1679        qdict_put(dict, "device", qstring_from_str(bs->device_name));
1680    }
1681
1682    if (bs->file) {
1683        QObject *parent = bdrv_info_stats_bs(bs->file);
1684        qdict_put_obj(dict, "parent", parent);
1685    }
1686
1687    return res;
1688}
1689
1690void bdrv_info_stats(Monitor *mon, QObject **ret_data)
1691{
1692    QObject *obj;
1693    QList *devices;
1694    BlockDriverState *bs;
1695
1696    devices = qlist_new();
1697
1698    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1699        obj = bdrv_info_stats_bs(bs);
1700        qlist_append_obj(devices, obj);
1701    }
1702
1703    *ret_data = QOBJECT(devices);
1704}
1705
1706const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
1707{
1708    if (bs->backing_hd && bs->backing_hd->encrypted)
1709        return bs->backing_file;
1710    else if (bs->encrypted)
1711        return bs->filename;
1712    else
1713        return NULL;
1714}
1715
1716void bdrv_get_backing_filename(BlockDriverState *bs,
1717                               char *filename, int filename_size)
1718{
1719    if (!bs->backing_file) {
1720        pstrcpy(filename, filename_size, "");
1721    } else {
1722        pstrcpy(filename, filename_size, bs->backing_file);
1723    }
1724}
1725
1726int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1727                          const uint8_t *buf, int nb_sectors)
1728{
1729    BlockDriver *drv = bs->drv;
1730    if (!drv)
1731        return -ENOMEDIUM;
1732    if (!drv->bdrv_write_compressed)
1733        return -ENOTSUP;
1734    if (bdrv_check_request(bs, sector_num, nb_sectors))
1735        return -EIO;
1736
1737    if (bs->dirty_bitmap) {
1738        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1739    }
1740
1741    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1742}
1743
1744int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1745{
1746    BlockDriver *drv = bs->drv;
1747    if (!drv)
1748        return -ENOMEDIUM;
1749    if (!drv->bdrv_get_info)
1750        return -ENOTSUP;
1751    memset(bdi, 0, sizeof(*bdi));
1752    return drv->bdrv_get_info(bs, bdi);
1753}
1754
1755int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1756                      int64_t pos, int size)
1757{
1758    BlockDriver *drv = bs->drv;
1759    if (!drv)
1760        return -ENOMEDIUM;
1761    if (drv->bdrv_save_vmstate)
1762        return drv->bdrv_save_vmstate(bs, buf, pos, size);
1763    if (bs->file)
1764        return bdrv_save_vmstate(bs->file, buf, pos, size);
1765    return -ENOTSUP;
1766}
1767
1768int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1769                      int64_t pos, int size)
1770{
1771    BlockDriver *drv = bs->drv;
1772    if (!drv)
1773        return -ENOMEDIUM;
1774    if (drv->bdrv_load_vmstate)
1775        return drv->bdrv_load_vmstate(bs, buf, pos, size);
1776    if (bs->file)
1777        return bdrv_load_vmstate(bs->file, buf, pos, size);
1778    return -ENOTSUP;
1779}
1780
1781void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
1782{
1783    BlockDriver *drv = bs->drv;
1784
1785    if (!drv || !drv->bdrv_debug_event) {
1786        return;
1787    }
1788
1789    return drv->bdrv_debug_event(bs, event);
1790
1791}
1792
1793/**************************************************************/
1794/* handling of snapshots */
1795
1796int bdrv_can_snapshot(BlockDriverState *bs)
1797{
1798    BlockDriver *drv = bs->drv;
1799    if (!drv || bdrv_is_removable(bs) || bdrv_is_read_only(bs)) {
1800        return 0;
1801    }
1802
1803    if (!drv->bdrv_snapshot_create) {
1804        if (bs->file != NULL) {
1805            return bdrv_can_snapshot(bs->file);
1806        }
1807        return 0;
1808    }
1809
1810    return 1;
1811}
1812
1813int bdrv_is_snapshot(BlockDriverState *bs)
1814{
1815    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
1816}
1817
1818BlockDriverState *bdrv_snapshots(void)
1819{
1820    BlockDriverState *bs;
1821
1822    if (bs_snapshots) {
1823        return bs_snapshots;
1824    }
1825
1826    bs = NULL;
1827    while ((bs = bdrv_next(bs))) {
1828        if (bdrv_can_snapshot(bs)) {
1829            bs_snapshots = bs;
1830            return bs;
1831        }
1832    }
1833    return NULL;
1834}
1835
1836int bdrv_snapshot_create(BlockDriverState *bs,
1837                         QEMUSnapshotInfo *sn_info)
1838{
1839    BlockDriver *drv = bs->drv;
1840    if (!drv)
1841        return -ENOMEDIUM;
1842    if (drv->bdrv_snapshot_create)
1843        return drv->bdrv_snapshot_create(bs, sn_info);
1844    if (bs->file)
1845        return bdrv_snapshot_create(bs->file, sn_info);
1846    return -ENOTSUP;
1847}
1848
1849int bdrv_snapshot_goto(BlockDriverState *bs,
1850                       const char *snapshot_id)
1851{
1852    BlockDriver *drv = bs->drv;
1853    int ret, open_ret;
1854
1855    if (!drv)
1856        return -ENOMEDIUM;
1857    if (drv->bdrv_snapshot_goto)
1858        return drv->bdrv_snapshot_goto(bs, snapshot_id);
1859
1860    if (bs->file) {
1861        drv->bdrv_close(bs);
1862        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
1863        open_ret = drv->bdrv_open(bs, bs->open_flags);
1864        if (open_ret < 0) {
1865            bdrv_delete(bs->file);
1866            bs->drv = NULL;
1867            return open_ret;
1868        }
1869        return ret;
1870    }
1871
1872    return -ENOTSUP;
1873}
1874
1875int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
1876{
1877    BlockDriver *drv = bs->drv;
1878    if (!drv)
1879        return -ENOMEDIUM;
1880    if (drv->bdrv_snapshot_delete)
1881        return drv->bdrv_snapshot_delete(bs, snapshot_id);
1882    if (bs->file)
1883        return bdrv_snapshot_delete(bs->file, snapshot_id);
1884    return -ENOTSUP;
1885}
1886
1887int bdrv_snapshot_list(BlockDriverState *bs,
1888                       QEMUSnapshotInfo **psn_info)
1889{
1890    BlockDriver *drv = bs->drv;
1891    if (!drv)
1892        return -ENOMEDIUM;
1893    if (drv->bdrv_snapshot_list)
1894        return drv->bdrv_snapshot_list(bs, psn_info);
1895    if (bs->file)
1896        return bdrv_snapshot_list(bs->file, psn_info);
1897    return -ENOTSUP;
1898}
1899
#define NB_SUFFIXES 4

/*
 * Format 'size' into 'buf' (at most 'buf_size' bytes) in a short
 * human-readable form and return 'buf'.
 *
 * Values up to 999 are printed as a plain integer.  Larger values are
 * scaled by powers of 1024 and suffixed with K, M, G or T: one decimal is
 * shown while the scaled value is below 10, a rounded integer otherwise.
 * T is the largest unit, so bigger values are still expressed in T.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int idx;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (idx = 0; idx < NB_SUFFIXES; idx++, unit *= 1024) {
        if (size < 10 * unit) {
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit, suffixes[idx]);
            return buf;
        }
        if (size < 1000 * unit || idx == NB_SUFFIXES - 1) {
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     (size + (unit >> 1)) / unit, suffixes[idx]);
            return buf;
        }
    }
    return buf;
}
1929
1930char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
1931{
1932    char buf1[128], date_buf[128], clock_buf[128];
1933#ifdef _WIN32
1934    struct tm *ptm;
1935#else
1936    struct tm tm;
1937#endif
1938    time_t ti;
1939    int64_t secs;
1940
1941    if (!sn) {
1942        snprintf(buf, buf_size,
1943                 "%-10s%-20s%7s%20s%15s",
1944                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
1945    } else {
1946        ti = sn->date_sec;
1947#ifdef _WIN32
1948        ptm = localtime(&ti);
1949        strftime(date_buf, sizeof(date_buf),
1950                 "%Y-%m-%d %H:%M:%S", ptm);
1951#else
1952        localtime_r(&ti, &tm);
1953        strftime(date_buf, sizeof(date_buf),
1954                 "%Y-%m-%d %H:%M:%S", &tm);
1955#endif
1956        secs = sn->vm_clock_nsec / 1000000000;
1957        snprintf(clock_buf, sizeof(clock_buf),
1958                 "%02d:%02d:%02d.%03d",
1959                 (int)(secs / 3600),
1960                 (int)((secs / 60) % 60),
1961                 (int)(secs % 60),
1962                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
1963        snprintf(buf, buf_size,
1964                 "%-10s%-20s%7s%20s%15s",
1965                 sn->id_str, sn->name,
1966                 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
1967                 date_buf,
1968                 clock_buf);
1969    }
1970    return buf;
1971}
1972
1973
1974/**************************************************************/
1975/* async I/Os */
1976
1977BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
1978                                 QEMUIOVector *qiov, int nb_sectors,
1979                                 BlockDriverCompletionFunc *cb, void *opaque)
1980{
1981    BlockDriver *drv = bs->drv;
1982    BlockDriverAIOCB *ret;
1983
1984    if (!drv)
1985        return NULL;
1986    if (bdrv_check_request(bs, sector_num, nb_sectors))
1987        return NULL;
1988
1989    ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
1990                              cb, opaque);
1991
1992    if (ret) {
1993	/* Update stats even though technically transfer has not happened. */
1994	bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
1995	bs->rd_ops ++;
1996    }
1997
1998    return ret;
1999}
2000
2001BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2002                                  QEMUIOVector *qiov, int nb_sectors,
2003                                  BlockDriverCompletionFunc *cb, void *opaque)
2004{
2005    BlockDriver *drv = bs->drv;
2006    BlockDriverAIOCB *ret;
2007
2008    if (!drv)
2009        return NULL;
2010    if (bs->read_only)
2011        return NULL;
2012    if (bdrv_check_request(bs, sector_num, nb_sectors))
2013        return NULL;
2014
2015    if (bs->dirty_bitmap) {
2016        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2017    }
2018
2019    ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
2020                               cb, opaque);
2021
2022    if (ret) {
2023        /* Update stats even though technically transfer has not happened. */
2024        bs->wr_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
2025        bs->wr_ops ++;
2026        if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2027            bs->wr_highest_sector = sector_num + nb_sectors - 1;
2028        }
2029    }
2030
2031    return ret;
2032}
2033
2034
2035typedef struct MultiwriteCB {
2036    int error;
2037    int num_requests;
2038    int num_callbacks;
2039    struct {
2040        BlockDriverCompletionFunc *cb;
2041        void *opaque;
2042        QEMUIOVector *free_qiov;
2043        void *free_buf;
2044    } callbacks[];
2045} MultiwriteCB;
2046
2047static void multiwrite_user_cb(MultiwriteCB *mcb)
2048{
2049    int i;
2050
2051    for (i = 0; i < mcb->num_callbacks; i++) {
2052        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2053        if (mcb->callbacks[i].free_qiov) {
2054            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2055        }
2056        qemu_free(mcb->callbacks[i].free_qiov);
2057        qemu_vfree(mcb->callbacks[i].free_buf);
2058    }
2059}
2060
2061static void multiwrite_cb(void *opaque, int ret)
2062{
2063    MultiwriteCB *mcb = opaque;
2064
2065    if (ret < 0 && !mcb->error) {
2066        mcb->error = ret;
2067    }
2068
2069    mcb->num_requests--;
2070    if (mcb->num_requests == 0) {
2071        multiwrite_user_cb(mcb);
2072        qemu_free(mcb);
2073    }
2074}
2075
2076static int multiwrite_req_compare(const void *a, const void *b)
2077{
2078    const BlockRequest *req1 = a, *req2 = b;
2079
2080    /*
2081     * Note that we can't simply subtract req2->sector from req1->sector
2082     * here as that could overflow the return value.
2083     */
2084    if (req1->sector > req2->sector) {
2085        return 1;
2086    } else if (req1->sector < req2->sector) {
2087        return -1;
2088    } else {
2089        return 0;
2090    }
2091}
2092
2093/*
2094 * Takes a bunch of requests and tries to merge them. Returns the number of
2095 * requests that remain after merging.
2096 */
2097static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2098    int num_reqs, MultiwriteCB *mcb)
2099{
2100    int i, outidx;
2101
2102    // Sort requests by start sector
2103    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2104
2105    // Check if adjacent requests touch the same clusters. If so, combine them,
2106    // filling up gaps with zero sectors.
2107    outidx = 0;
2108    for (i = 1; i < num_reqs; i++) {
2109        int merge = 0;
2110        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2111
2112        // This handles the cases that are valid for all block drivers, namely
2113        // exactly sequential writes and overlapping writes.
2114        if (reqs[i].sector <= oldreq_last) {
2115            merge = 1;
2116        }
2117
2118        // The block driver may decide that it makes sense to combine requests
2119        // even if there is a gap of some sectors between them. In this case,
2120        // the gap is filled with zeros (therefore only applicable for yet
2121        // unused space in format like qcow2).
2122        if (!merge && bs->drv->bdrv_merge_requests) {
2123            merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
2124        }
2125
2126        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2127            merge = 0;
2128        }
2129
2130        if (merge) {
2131            size_t size;
2132            QEMUIOVector *qiov = qemu_mallocz(sizeof(*qiov));
2133            qemu_iovec_init(qiov,
2134                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2135
2136            // Add the first request to the merged one. If the requests are
2137            // overlapping, drop the last sectors of the first request.
2138            size = (reqs[i].sector - reqs[outidx].sector) << 9;
2139            qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2140
2141            // We might need to add some zeros between the two requests
2142            if (reqs[i].sector > oldreq_last) {
2143                size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
2144                uint8_t *buf = qemu_blockalign(bs, zero_bytes);
2145                memset(buf, 0, zero_bytes);
2146                qemu_iovec_add(qiov, buf, zero_bytes);
2147                mcb->callbacks[i].free_buf = buf;
2148            }
2149
2150            // Add the second request
2151            qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
2152
2153            reqs[outidx].nb_sectors = qiov->size >> 9;
2154            reqs[outidx].qiov = qiov;
2155
2156            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2157        } else {
2158            outidx++;
2159            reqs[outidx].sector     = reqs[i].sector;
2160            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2161            reqs[outidx].qiov       = reqs[i].qiov;
2162        }
2163    }
2164
2165    return outidx + 1;
2166}
2167
2168/*
2169 * Submit multiple AIO write requests at once.
2170 *
2171 * On success, the function returns 0 and all requests in the reqs array have
2172 * been submitted. In error case this function returns -1, and any of the
2173 * requests may or may not be submitted yet. In particular, this means that the
2174 * callback will be called for some of the requests, for others it won't. The
2175 * caller must check the error field of the BlockRequest to wait for the right
2176 * callbacks (if error != 0, no callback will be called).
2177 *
2178 * The implementation may modify the contents of the reqs array, e.g. to merge
2179 * requests. However, the fields opaque and error are left unmodified as they
2180 * are used to signal failure for a single request to the caller.
2181 */
2182int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
2183{
2184    BlockDriverAIOCB *acb;
2185    MultiwriteCB *mcb;
2186    int i;
2187
2188    if (num_reqs == 0) {
2189        return 0;
2190    }
2191
2192    // Create MultiwriteCB structure
2193    mcb = qemu_mallocz(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
2194    mcb->num_requests = 0;
2195    mcb->num_callbacks = num_reqs;
2196
2197    for (i = 0; i < num_reqs; i++) {
2198        mcb->callbacks[i].cb = reqs[i].cb;
2199        mcb->callbacks[i].opaque = reqs[i].opaque;
2200    }
2201
2202    // Check for mergable requests
2203    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
2204
2205    /*
2206     * Run the aio requests. As soon as one request can't be submitted
2207     * successfully, fail all requests that are not yet submitted (we must
2208     * return failure for all requests anyway)
2209     *
2210     * num_requests cannot be set to the right value immediately: If
2211     * bdrv_aio_writev fails for some request, num_requests would be too high
2212     * and therefore multiwrite_cb() would never recognize the multiwrite
2213     * request as completed. We also cannot use the loop variable i to set it
2214     * when the first request fails because the callback may already have been
2215     * called for previously submitted requests. Thus, num_requests must be
2216     * incremented for each request that is submitted.
2217     *
2218     * The problem that callbacks may be called early also means that we need
2219     * to take care that num_requests doesn't become 0 before all requests are
2220     * submitted - multiwrite_cb() would consider the multiwrite request
2221     * completed. A dummy request that is "completed" by a manual call to
2222     * multiwrite_cb() takes care of this.
2223     */
2224    mcb->num_requests = 1;
2225
2226    for (i = 0; i < num_reqs; i++) {
2227        mcb->num_requests++;
2228        acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
2229            reqs[i].nb_sectors, multiwrite_cb, mcb);
2230
2231        if (acb == NULL) {
2232            // We can only fail the whole thing if no request has been
2233            // submitted yet. Otherwise we'll wait for the submitted AIOs to
2234            // complete and report the error in the callback.
2235            if (i == 0) {
2236                goto fail;
2237            } else {
2238                multiwrite_cb(mcb, -EIO);
2239                break;
2240            }
2241        }
2242    }
2243
2244    /* Complete the dummy request */
2245    multiwrite_cb(mcb, 0);
2246
2247    return 0;
2248
2249fail:
2250    for (i = 0; i < mcb->num_callbacks; i++) {
2251        reqs[i].error = -EIO;
2252    }
2253    qemu_free(mcb);
2254    return -1;
2255}
2256
2257BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2258        BlockDriverCompletionFunc *cb, void *opaque)
2259{
2260    BlockDriver *drv = bs->drv;
2261
2262    if (bs->open_flags & BDRV_O_NO_FLUSH) {
2263        return bdrv_aio_noop_em(bs, cb, opaque);
2264    }
2265
2266    if (!drv)
2267        return NULL;
2268    return drv->bdrv_aio_flush(bs, cb, opaque);
2269}
2270
2271void bdrv_aio_cancel(BlockDriverAIOCB *acb)
2272{
2273    acb->pool->cancel(acb);
2274}
2275
2276
2277/**************************************************************/
2278/* async block device emulation */
2279
/*
 * AIOCB used by the emulation helpers below (bdrv_aio_rw_vector(),
 * bdrv_aio_flush_em(), bdrv_aio_noop_em()). Instances come from
 * bdrv_em_aio_pool.
 */
typedef struct BlockDriverAIOCBSync {
    /* generic AIOCB; the container_of() anchor in bdrv_aio_cancel_em() */
    BlockDriverAIOCB common;
    /* bottom half that runs bdrv_aio_bh_cb() for this request */
    QEMUBH *bh;
    /* result handed to the completion callback by bdrv_aio_bh_cb() */
    int ret;
    /* vector translation state */
    /* caller's I/O vector; NULL for flush/noop requests */
    QEMUIOVector *qiov;
    /* linear buffer used for the I/O; NULL for flush/noop requests */
    uint8_t *bounce;
    /* if zero, bdrv_aio_bh_cb() copies bounce back into qiov */
    int is_write;
} BlockDriverAIOCBSync;
2289
2290static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
2291{
2292    BlockDriverAIOCBSync *acb =
2293        container_of(blockacb, BlockDriverAIOCBSync, common);
2294    qemu_bh_delete(acb->bh);
2295    acb->bh = NULL;
2296    qemu_aio_release(acb);
2297}
2298
/*
 * AIOCB pool for the emulated AIO requests in this section: entries are
 * BlockDriverAIOCBSync sized and are cancelled via bdrv_aio_cancel_em().
 */
static AIOPool bdrv_em_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
2303
2304static void bdrv_aio_bh_cb(void *opaque)
2305{
2306    BlockDriverAIOCBSync *acb = opaque;
2307
2308    if (!acb->is_write)
2309        qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
2310    qemu_vfree(acb->bounce);
2311    acb->common.cb(acb->common.opaque, acb->ret);
2312    qemu_bh_delete(acb->bh);
2313    acb->bh = NULL;
2314    qemu_aio_release(acb);
2315}
2316
2317static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2318                                            int64_t sector_num,
2319                                            QEMUIOVector *qiov,
2320                                            int nb_sectors,
2321                                            BlockDriverCompletionFunc *cb,
2322                                            void *opaque,
2323                                            int is_write)
2324
2325{
2326    BlockDriverAIOCBSync *acb;
2327
2328    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2329    acb->is_write = is_write;
2330    acb->qiov = qiov;
2331    acb->bounce = qemu_blockalign(bs, qiov->size);
2332
2333    if (!acb->bh)
2334        acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2335
2336    if (is_write) {
2337        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
2338        acb->ret = bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2339    } else {
2340        acb->ret = bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2341    }
2342
2343    qemu_bh_schedule(acb->bh);
2344
2345    return &acb->common;
2346}
2347
2348static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2349        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2350        BlockDriverCompletionFunc *cb, void *opaque)
2351{
2352    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2353}
2354
2355static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2356        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2357        BlockDriverCompletionFunc *cb, void *opaque)
2358{
2359    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2360}
2361
2362static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
2363        BlockDriverCompletionFunc *cb, void *opaque)
2364{
2365    BlockDriverAIOCBSync *acb;
2366
2367    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2368    acb->is_write = 1; /* don't bounce in the completion hadler */
2369    acb->qiov = NULL;
2370    acb->bounce = NULL;
2371    acb->ret = 0;
2372
2373    if (!acb->bh)
2374        acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2375
2376    bdrv_flush(bs);
2377    qemu_bh_schedule(acb->bh);
2378    return &acb->common;
2379}
2380
2381static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
2382        BlockDriverCompletionFunc *cb, void *opaque)
2383{
2384    BlockDriverAIOCBSync *acb;
2385
2386    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2387    acb->is_write = 1; /* don't bounce in the completion handler */
2388    acb->qiov = NULL;
2389    acb->bounce = NULL;
2390    acb->ret = 0;
2391
2392    if (!acb->bh) {
2393        acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2394    }
2395
2396    qemu_bh_schedule(acb->bh);
2397    return &acb->common;
2398}
2399
2400/**************************************************************/
2401/* sync block device emulation */
2402
/*
 * Completion callback for the synchronous emulation: opaque points to the
 * int that receives the request's result.
 */
static void bdrv_rw_em_cb(void *opaque, int ret)
{
    int *result = opaque;

    *result = ret;
}
2407
/* Sentinel stored in async_ret by bdrv_read_em()/bdrv_write_em() until the
 * completion callback overwrites it with the real result */
#define NOT_DONE 0x7fffffff
2409
2410static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
2411                        uint8_t *buf, int nb_sectors)
2412{
2413    int async_ret;
2414    BlockDriverAIOCB *acb;
2415    struct iovec iov;
2416    QEMUIOVector qiov;
2417
2418    async_context_push();
2419
2420    async_ret = NOT_DONE;
2421    iov.iov_base = (void *)buf;
2422    iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
2423    qemu_iovec_init_external(&qiov, &iov, 1);
2424    acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors,
2425        bdrv_rw_em_cb, &async_ret);
2426    if (acb == NULL) {
2427        async_ret = -1;
2428        goto fail;
2429    }
2430
2431    while (async_ret == NOT_DONE) {
2432        qemu_aio_wait();
2433    }
2434
2435
2436fail:
2437    async_context_pop();
2438    return async_ret;
2439}
2440
2441static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
2442                         const uint8_t *buf, int nb_sectors)
2443{
2444    int async_ret;
2445    BlockDriverAIOCB *acb;
2446    struct iovec iov;
2447    QEMUIOVector qiov;
2448
2449    async_context_push();
2450
2451    async_ret = NOT_DONE;
2452    iov.iov_base = (void *)buf;
2453    iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
2454    qemu_iovec_init_external(&qiov, &iov, 1);
2455    acb = bdrv_aio_writev(bs, sector_num, &qiov, nb_sectors,
2456        bdrv_rw_em_cb, &async_ret);
2457    if (acb == NULL) {
2458        async_ret = -1;
2459        goto fail;
2460    }
2461    while (async_ret == NOT_DONE) {
2462        qemu_aio_wait();
2463    }
2464
2465fail:
2466    async_context_pop();
2467    return async_ret;
2468}
2469
/*
 * Initialize the block layer by running the module init functions
 * registered for MODULE_INIT_BLOCK.
 */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
2474
/*
 * Same as bdrv_init(), but sets use_bdrv_whitelist first.
 */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
2480
2481void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
2482                   BlockDriverCompletionFunc *cb, void *opaque)
2483{
2484    BlockDriverAIOCB *acb;
2485
2486    if (pool->free_aiocb) {
2487        acb = pool->free_aiocb;
2488        pool->free_aiocb = acb->next;
2489    } else {
2490        acb = qemu_mallocz(pool->aiocb_size);
2491        acb->pool = pool;
2492    }
2493    acb->bs = bs;
2494    acb->cb = cb;
2495    acb->opaque = opaque;
2496    return acb;
2497}
2498
2499void qemu_aio_release(void *p)
2500{
2501    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
2502    AIOPool *pool = acb->pool;
2503    acb->next = pool->free_aiocb;
2504    pool->free_aiocb = acb;
2505}
2506
2507/**************************************************************/
2508/* removable device support */
2509
2510/**
2511 * Return TRUE if the media is present
2512 */
2513int bdrv_is_inserted(BlockDriverState *bs)
2514{
2515    BlockDriver *drv = bs->drv;
2516    int ret;
2517    if (!drv)
2518        return 0;
2519    if (!drv->bdrv_is_inserted)
2520        return !bs->tray_open;
2521    ret = drv->bdrv_is_inserted(bs);
2522    return ret;
2523}
2524
2525/**
2526 * Return TRUE if the media changed since the last call to this
2527 * function. It is currently only used for floppy disks
2528 */
2529int bdrv_media_changed(BlockDriverState *bs)
2530{
2531    BlockDriver *drv = bs->drv;
2532    int ret;
2533
2534    if (!drv || !drv->bdrv_media_changed)
2535        ret = -ENOTSUP;
2536    else
2537        ret = drv->bdrv_media_changed(bs);
2538    if (ret == -ENOTSUP)
2539        ret = bs->media_changed;
2540    bs->media_changed = 0;
2541    return ret;
2542}
2543
2544/**
2545 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
2546 */
2547int bdrv_eject(BlockDriverState *bs, int eject_flag)
2548{
2549    BlockDriver *drv = bs->drv;
2550    int ret;
2551
2552    if (bs->locked) {
2553        return -EBUSY;
2554    }
2555
2556    if (!drv || !drv->bdrv_eject) {
2557        ret = -ENOTSUP;
2558    } else {
2559        ret = drv->bdrv_eject(bs, eject_flag);
2560    }
2561    if (ret == -ENOTSUP) {
2562        ret = 0;
2563    }
2564    if (ret >= 0) {
2565        bs->tray_open = eject_flag;
2566    }
2567
2568    return ret;
2569}
2570
/**
 * Return the lock state last stored in bs->locked (see bdrv_set_locked()).
 */
int bdrv_is_locked(BlockDriverState *bs)
{
    return bs->locked;
}
2575
2576/**
2577 * Lock or unlock the media (if it is locked, the user won't be able
2578 * to eject it manually).
2579 */
2580void bdrv_set_locked(BlockDriverState *bs, int locked)
2581{
2582    BlockDriver *drv = bs->drv;
2583
2584    bs->locked = locked;
2585    if (drv && drv->bdrv_set_locked) {
2586        drv->bdrv_set_locked(bs, locked);
2587    }
2588}
2589
2590/* needed for generic scsi interface */
2591
2592int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2593{
2594    BlockDriver *drv = bs->drv;
2595
2596    if (drv && drv->bdrv_ioctl)
2597        return drv->bdrv_ioctl(bs, req, buf);
2598    return -ENOTSUP;
2599}
2600
2601BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2602        unsigned long int req, void *buf,
2603        BlockDriverCompletionFunc *cb, void *opaque)
2604{
2605    BlockDriver *drv = bs->drv;
2606
2607    if (drv && drv->bdrv_aio_ioctl)
2608        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
2609    return NULL;
2610}
2611
2612
2613
2614void *qemu_blockalign(BlockDriverState *bs, size_t size)
2615{
2616    return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
2617}
2618
2619void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
2620{
2621    int64_t bitmap_size;
2622
2623    bs->dirty_count = 0;
2624    if (enable) {
2625        if (!bs->dirty_bitmap) {
2626            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
2627                    BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
2628            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
2629
2630            bs->dirty_bitmap = qemu_mallocz(bitmap_size);
2631        }
2632    } else {
2633        if (bs->dirty_bitmap) {
2634            qemu_free(bs->dirty_bitmap);
2635            bs->dirty_bitmap = NULL;
2636        }
2637    }
2638}
2639
2640int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
2641{
2642    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
2643
2644    if (bs->dirty_bitmap &&
2645        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
2646        return bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
2647            (1 << (chunk % (sizeof(unsigned long) * 8)));
2648    } else {
2649        return 0;
2650    }
2651}
2652
/*
 * Clear the dirty state of nr_sectors sectors starting at cur_sector by
 * calling set_dirty_bitmap() with a dirty value of 0.
 */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}
2658
/*
 * Return the current value of bs->dirty_count.
 */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}
2663