vmdk.c revision 5d8f37ad78fc66901af50c762029a501561f3b23
1/*
2 * Block driver for the VMDK format
3 *
4 * Copyright (c) 2004 Fabrice Bellard
5 * Copyright (c) 2005 Filip Navara
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
24 */
25
26#include "qemu-common.h"
27#include "block_int.h"
28#include "module.h"
29
/* File signatures, compared against the first four bytes of the image
 * after they have been read as a big-endian 32-bit value. */
#define VMDK3_MAGIC 0x434f5744 /* 'C' 'O' 'W' 'D' */
#define VMDK4_MAGIC 0x4b444d56 /* 'K' 'D' 'M' 'V' */
32
/*
 * On-disk header of a VMDK3 ("COWD") image, read from the file right
 * after the 4-byte magic. vmdk_open() decodes the fields it uses with
 * le32_to_cpu().
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint32_t disk_sectors;      /* becomes bs->total_sectors */
    uint32_t granularity;       /* sectors per cluster */
    uint32_t l1dir_offset;      /* L1 table position, in 512-byte sectors */
    uint32_t l1dir_size;
    uint32_t file_sectors;
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} VMDK3Header;
45
/*
 * On-disk header of a VMDK4 ("KDMV") image, read from the file right
 * after the 4-byte magic. The structure is packed so that it can be
 * transferred to and from the file as-is; vmdk_open() decodes the fields
 * it uses as little-endian values.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    int64_t capacity;           /* disk size in sectors */
    int64_t granularity;        /* sectors per grain (cluster) */
    int64_t desc_offset;        /* descriptor position, in sectors */
    int64_t desc_size;          /* descriptor length, in sectors */
    int32_t num_gtes_per_gte;   /* entries per grain table (L2 size) */
    int64_t rgd_offset;         /* in sectors; used as the L1 table */
    int64_t gd_offset;          /* in sectors; used as the backup L1 table */
    int64_t grain_offset;       /* in sectors; the file is sized up to here
                                 * when an image is created */
    char filler[1];
    char check_bytes[4];        /* vmdk_create() stores 0xa 0x20 0xd 0xa */
} __attribute__((packed)) VMDK4Header;
60
61#define L2_CACHE_SIZE 16
62
63typedef struct BDRVVmdkState {
64    BlockDriverState *hd;
65    int64_t l1_table_offset;
66    int64_t l1_backup_table_offset;
67    uint32_t *l1_table;
68    uint32_t *l1_backup_table;
69    unsigned int l1_size;
70    uint32_t l1_entry_sectors;
71
72    unsigned int l2_size;
73    uint32_t *l2_cache;
74    uint32_t l2_cache_offsets[L2_CACHE_SIZE];
75    uint32_t l2_cache_counts[L2_CACHE_SIZE];
76
77    unsigned int cluster_sectors;
78    uint32_t parent_cid;
79    int is_parent;
80} BDRVVmdkState;
81
/*
 * Description of a pending L2 table update, filled in by
 * get_cluster_offset() when it allocates a grain and consumed by
 * vmdk_L2update().
 */
typedef struct VmdkMetaData {
    uint32_t offset;            /* L2 entry value exactly as stored on disk */
    unsigned int l1_index;      /* index into the L1 table(s) */
    unsigned int l2_index;      /* index of the entry inside its L2 table */
    unsigned int l2_offset;     /* position of the L2 table, in sectors */
    int valid;                  /* non-zero when an update has to be written */
} VmdkMetaData;
89
90typedef struct ActiveBDRVState{
91    BlockDriverState *hd;            // active image handler
92    uint64_t cluster_offset;         // current write offset
93}ActiveBDRVState;
94
95static ActiveBDRVState activeBDRV;
96
97
98static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
99{
100    uint32_t magic;
101
102    if (buf_size < 4)
103        return 0;
104    magic = be32_to_cpu(*(uint32_t *)buf);
105    if (magic == VMDK3_MAGIC ||
106        magic == VMDK4_MAGIC)
107        return 100;
108    else
109        return 0;
110}
111
/* When defined, vmdk_is_cid_valid() compares CIDs against the parent. */
#define CHECK_CID 1

#define SECTOR_SIZE 512
/* Parenthesized so the macro stays a single operand inside expressions
 * such as "x % DESC_SIZE" or "DESC_SIZE + 1". */
#define DESC_SIZE (20 * SECTOR_SIZE)    /* 20 sectors of 512 bytes each */
#define HEADER_SIZE 512                 /* first sector of 512 bytes */
117
118static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
119{
120    BDRVVmdkState *s = bs->opaque;
121    char desc[DESC_SIZE];
122    uint32_t cid;
123    const char *p_name, *cid_str;
124    size_t cid_str_size;
125
126    /* the descriptor offset = 0x200 */
127    if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
128        return 0;
129
130    if (parent) {
131        cid_str = "parentCID";
132        cid_str_size = sizeof("parentCID");
133    } else {
134        cid_str = "CID";
135        cid_str_size = sizeof("CID");
136    }
137
138    if ((p_name = strstr(desc,cid_str)) != NULL) {
139        p_name += cid_str_size;
140        sscanf(p_name,"%x",&cid);
141    }
142
143    return cid;
144}
145
146static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
147{
148    BDRVVmdkState *s = bs->opaque;
149    char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
150    char *p_name, *tmp_str;
151
152    /* the descriptor offset = 0x200 */
153    if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
154        return -1;
155
156    tmp_str = strstr(desc,"parentCID");
157    pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
158    if ((p_name = strstr(desc,"CID")) != NULL) {
159        p_name += sizeof("CID");
160        snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid);
161        pstrcat(desc, sizeof(desc), tmp_desc);
162    }
163
164    if (bdrv_pwrite(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
165        return -1;
166    return 0;
167}
168
169static int vmdk_is_cid_valid(BlockDriverState *bs)
170{
171#ifdef CHECK_CID
172    BDRVVmdkState *s = bs->opaque;
173    BlockDriverState *p_bs = s->hd->backing_hd;
174    uint32_t cur_pcid;
175
176    if (p_bs) {
177        cur_pcid = vmdk_read_cid(p_bs,0);
178        if (s->parent_cid != cur_pcid)
179            // CID not valid
180            return 0;
181    }
182#endif
183    // CID valid
184    return 1;
185}
186
187static int vmdk_snapshot_create(const char *filename, const char *backing_file)
188{
189    int snp_fd, p_fd;
190    uint32_t p_cid;
191    char *p_name, *gd_buf, *rgd_buf;
192    const char *real_filename, *temp_str;
193    VMDK4Header header;
194    uint32_t gde_entries, gd_size;
195    int64_t gd_offset, rgd_offset, capacity, gt_size;
196    char p_desc[DESC_SIZE], s_desc[DESC_SIZE], hdr[HEADER_SIZE];
197    static const char desc_template[] =
198    "# Disk DescriptorFile\n"
199    "version=1\n"
200    "CID=%x\n"
201    "parentCID=%x\n"
202    "createType=\"monolithicSparse\"\n"
203    "parentFileNameHint=\"%s\"\n"
204    "\n"
205    "# Extent description\n"
206    "RW %u SPARSE \"%s\"\n"
207    "\n"
208    "# The Disk Data Base \n"
209    "#DDB\n"
210    "\n";
211
212    snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644);
213    if (snp_fd < 0)
214        return -1;
215    p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE);
216    if (p_fd < 0) {
217        close(snp_fd);
218        return -1;
219    }
220
221    /* read the header */
222    if (lseek(p_fd, 0x0, SEEK_SET) == -1)
223        goto fail;
224    if (read(p_fd, hdr, HEADER_SIZE) != HEADER_SIZE)
225        goto fail;
226
227    /* write the header */
228    if (lseek(snp_fd, 0x0, SEEK_SET) == -1)
229        goto fail;
230    if (write(snp_fd, hdr, HEADER_SIZE) == -1)
231        goto fail;
232
233    memset(&header, 0, sizeof(header));
234    memcpy(&header,&hdr[4], sizeof(header)); // skip the VMDK4_MAGIC
235
236    ftruncate(snp_fd, header.grain_offset << 9);
237    /* the descriptor offset = 0x200 */
238    if (lseek(p_fd, 0x200, SEEK_SET) == -1)
239        goto fail;
240    if (read(p_fd, p_desc, DESC_SIZE) != DESC_SIZE)
241        goto fail;
242
243    if ((p_name = strstr(p_desc,"CID")) != NULL) {
244        p_name += sizeof("CID");
245        sscanf(p_name,"%x",&p_cid);
246    }
247
248    real_filename = filename;
249    if ((temp_str = strrchr(real_filename, '\\')) != NULL)
250        real_filename = temp_str + 1;
251    if ((temp_str = strrchr(real_filename, '/')) != NULL)
252        real_filename = temp_str + 1;
253    if ((temp_str = strrchr(real_filename, ':')) != NULL)
254        real_filename = temp_str + 1;
255
256    snprintf(s_desc, sizeof(s_desc), desc_template, p_cid, p_cid, backing_file,
257             (uint32_t)header.capacity, real_filename);
258
259    /* write the descriptor */
260    if (lseek(snp_fd, 0x200, SEEK_SET) == -1)
261        goto fail;
262    if (write(snp_fd, s_desc, strlen(s_desc)) == -1)
263        goto fail;
264
265    gd_offset = header.gd_offset * SECTOR_SIZE;     // offset of GD table
266    rgd_offset = header.rgd_offset * SECTOR_SIZE;   // offset of RGD table
267    capacity = header.capacity * SECTOR_SIZE;       // Extent size
268    /*
269     * Each GDE span 32M disk, means:
270     * 512 GTE per GT, each GTE points to grain
271     */
272    gt_size = (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE;
273    if (!gt_size)
274        goto fail;
275    gde_entries = (uint32_t)(capacity / gt_size);  // number of gde/rgde
276    gd_size = gde_entries * sizeof(uint32_t);
277
278    /* write RGD */
279    rgd_buf = qemu_malloc(gd_size);
280    if (lseek(p_fd, rgd_offset, SEEK_SET) == -1)
281        goto fail_rgd;
282    if (read(p_fd, rgd_buf, gd_size) != gd_size)
283        goto fail_rgd;
284    if (lseek(snp_fd, rgd_offset, SEEK_SET) == -1)
285        goto fail_rgd;
286    if (write(snp_fd, rgd_buf, gd_size) == -1)
287        goto fail_rgd;
288    qemu_free(rgd_buf);
289
290    /* write GD */
291    gd_buf = qemu_malloc(gd_size);
292    if (lseek(p_fd, gd_offset, SEEK_SET) == -1)
293        goto fail_gd;
294    if (read(p_fd, gd_buf, gd_size) != gd_size)
295        goto fail_gd;
296    if (lseek(snp_fd, gd_offset, SEEK_SET) == -1)
297        goto fail_gd;
298    if (write(snp_fd, gd_buf, gd_size) == -1)
299        goto fail_gd;
300    qemu_free(gd_buf);
301
302    close(p_fd);
303    close(snp_fd);
304    return 0;
305
306    fail_gd:
307    qemu_free(gd_buf);
308    fail_rgd:
309    qemu_free(rgd_buf);
310    fail:
311    close(p_fd);
312    close(snp_fd);
313    return -1;
314}
315
316static void vmdk_parent_close(BlockDriverState *bs)
317{
318    if (bs->backing_hd)
319        bdrv_close(bs->backing_hd);
320}
321
322static int parent_open = 0;
323static int vmdk_parent_open(BlockDriverState *bs, const char * filename)
324{
325    BDRVVmdkState *s = bs->opaque;
326    char *p_name;
327    char desc[DESC_SIZE];
328    char parent_img_name[1024];
329
330    /* the descriptor offset = 0x200 */
331    if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
332        return -1;
333
334    if ((p_name = strstr(desc,"parentFileNameHint")) != NULL) {
335        char *end_name;
336        struct stat file_buf;
337
338        p_name += sizeof("parentFileNameHint") + 1;
339        if ((end_name = strchr(p_name,'\"')) == NULL)
340            return -1;
341        if ((end_name - p_name) > sizeof (s->hd->backing_file) - 1)
342            return -1;
343
344        pstrcpy(s->hd->backing_file, end_name - p_name + 1, p_name);
345        if (stat(s->hd->backing_file, &file_buf) != 0) {
346            path_combine(parent_img_name, sizeof(parent_img_name),
347                         filename, s->hd->backing_file);
348        } else {
349            pstrcpy(parent_img_name, sizeof(parent_img_name),
350                    s->hd->backing_file);
351        }
352
353        s->hd->backing_hd = bdrv_new("");
354        if (!s->hd->backing_hd) {
355            failure:
356            bdrv_close(s->hd);
357            return -1;
358        }
359        parent_open = 1;
360        if (bdrv_open(s->hd->backing_hd, parent_img_name, BDRV_O_RDONLY) < 0)
361            goto failure;
362        parent_open = 0;
363    }
364
365    return 0;
366}
367
368static int vmdk_open(BlockDriverState *bs, const char *filename, int flags)
369{
370    BDRVVmdkState *s = bs->opaque;
371    uint32_t magic;
372    int l1_size, i, ret;
373
374    if (parent_open)
375        // Parent must be opened as RO.
376        flags = BDRV_O_RDONLY;
377
378    ret = bdrv_file_open(&s->hd, filename, flags);
379    if (ret < 0)
380        return ret;
381    if (bdrv_pread(s->hd, 0, &magic, sizeof(magic)) != sizeof(magic))
382        goto fail;
383
384    magic = be32_to_cpu(magic);
385    if (magic == VMDK3_MAGIC) {
386        VMDK3Header header;
387
388        if (bdrv_pread(s->hd, sizeof(magic), &header, sizeof(header)) != sizeof(header))
389            goto fail;
390        s->cluster_sectors = le32_to_cpu(header.granularity);
391        s->l2_size = 1 << 9;
392        s->l1_size = 1 << 6;
393        bs->total_sectors = le32_to_cpu(header.disk_sectors);
394        s->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
395        s->l1_backup_table_offset = 0;
396        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
397    } else if (magic == VMDK4_MAGIC) {
398        VMDK4Header header;
399
400        if (bdrv_pread(s->hd, sizeof(magic), &header, sizeof(header)) != sizeof(header))
401            goto fail;
402        bs->total_sectors = le64_to_cpu(header.capacity);
403        s->cluster_sectors = le64_to_cpu(header.granularity);
404        s->l2_size = le32_to_cpu(header.num_gtes_per_gte);
405        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
406        if (s->l1_entry_sectors <= 0)
407            goto fail;
408        s->l1_size = (bs->total_sectors + s->l1_entry_sectors - 1)
409            / s->l1_entry_sectors;
410        s->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
411        s->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;
412
413        if (parent_open)
414            s->is_parent = 1;
415        else
416            s->is_parent = 0;
417
418        // try to open parent images, if exist
419        if (vmdk_parent_open(bs, filename) != 0)
420            goto fail;
421        // write the CID once after the image creation
422        s->parent_cid = vmdk_read_cid(bs,1);
423    } else {
424        goto fail;
425    }
426
427    /* read the L1 table */
428    l1_size = s->l1_size * sizeof(uint32_t);
429    s->l1_table = qemu_malloc(l1_size);
430    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, l1_size) != l1_size)
431        goto fail;
432    for(i = 0; i < s->l1_size; i++) {
433        le32_to_cpus(&s->l1_table[i]);
434    }
435
436    if (s->l1_backup_table_offset) {
437        s->l1_backup_table = qemu_malloc(l1_size);
438        if (bdrv_pread(s->hd, s->l1_backup_table_offset, s->l1_backup_table, l1_size) != l1_size)
439            goto fail;
440        for(i = 0; i < s->l1_size; i++) {
441            le32_to_cpus(&s->l1_backup_table[i]);
442        }
443    }
444
445    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
446    return 0;
447 fail:
448    qemu_free(s->l1_backup_table);
449    qemu_free(s->l1_table);
450    qemu_free(s->l2_cache);
451    bdrv_delete(s->hd);
452    return -1;
453}
454
455static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
456                                   uint64_t offset, int allocate);
457
458static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset,
459                             uint64_t offset, int allocate)
460{
461    uint64_t parent_cluster_offset;
462    BDRVVmdkState *s = bs->opaque;
463    uint8_t  whole_grain[s->cluster_sectors*512];        // 128 sectors * 512 bytes each = grain size 64KB
464
465    // we will be here if it's first write on non-exist grain(cluster).
466    // try to read from parent image, if exist
467    if (s->hd->backing_hd) {
468        BDRVVmdkState *ps = s->hd->backing_hd->opaque;
469
470        if (!vmdk_is_cid_valid(bs))
471            return -1;
472
473        parent_cluster_offset = get_cluster_offset(s->hd->backing_hd, NULL, offset, allocate);
474
475        if (parent_cluster_offset) {
476            BDRVVmdkState *act_s = activeBDRV.hd->opaque;
477
478            if (bdrv_pread(ps->hd, parent_cluster_offset, whole_grain, ps->cluster_sectors*512) != ps->cluster_sectors*512)
479                return -1;
480
481            //Write grain only into the active image
482            if (bdrv_pwrite(act_s->hd, activeBDRV.cluster_offset << 9, whole_grain, sizeof(whole_grain)) != sizeof(whole_grain))
483                return -1;
484        }
485    }
486    return 0;
487}
488
489static int vmdk_L2update(BlockDriverState *bs, VmdkMetaData *m_data)
490{
491    BDRVVmdkState *s = bs->opaque;
492
493    /* update L2 table */
494    if (bdrv_pwrite(s->hd, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)),
495                    &(m_data->offset), sizeof(m_data->offset)) != sizeof(m_data->offset))
496        return -1;
497    /* update backup L2 table */
498    if (s->l1_backup_table_offset != 0) {
499        m_data->l2_offset = s->l1_backup_table[m_data->l1_index];
500        if (bdrv_pwrite(s->hd, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)),
501                        &(m_data->offset), sizeof(m_data->offset)) != sizeof(m_data->offset))
502            return -1;
503    }
504
505    return 0;
506}
507
508static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
509                                   uint64_t offset, int allocate)
510{
511    BDRVVmdkState *s = bs->opaque;
512    unsigned int l1_index, l2_offset, l2_index;
513    int min_index, i, j;
514    uint32_t min_count, *l2_table, tmp = 0;
515    uint64_t cluster_offset;
516
517    if (m_data)
518        m_data->valid = 0;
519
520    l1_index = (offset >> 9) / s->l1_entry_sectors;
521    if (l1_index >= s->l1_size)
522        return 0;
523    l2_offset = s->l1_table[l1_index];
524    if (!l2_offset)
525        return 0;
526    for(i = 0; i < L2_CACHE_SIZE; i++) {
527        if (l2_offset == s->l2_cache_offsets[i]) {
528            /* increment the hit count */
529            if (++s->l2_cache_counts[i] == 0xffffffff) {
530                for(j = 0; j < L2_CACHE_SIZE; j++) {
531                    s->l2_cache_counts[j] >>= 1;
532                }
533            }
534            l2_table = s->l2_cache + (i * s->l2_size);
535            goto found;
536        }
537    }
538    /* not found: load a new entry in the least used one */
539    min_index = 0;
540    min_count = 0xffffffff;
541    for(i = 0; i < L2_CACHE_SIZE; i++) {
542        if (s->l2_cache_counts[i] < min_count) {
543            min_count = s->l2_cache_counts[i];
544            min_index = i;
545        }
546    }
547    l2_table = s->l2_cache + (min_index * s->l2_size);
548    if (bdrv_pread(s->hd, (int64_t)l2_offset * 512, l2_table, s->l2_size * sizeof(uint32_t)) !=
549                                                                        s->l2_size * sizeof(uint32_t))
550        return 0;
551
552    s->l2_cache_offsets[min_index] = l2_offset;
553    s->l2_cache_counts[min_index] = 1;
554 found:
555    l2_index = ((offset >> 9) / s->cluster_sectors) % s->l2_size;
556    cluster_offset = le32_to_cpu(l2_table[l2_index]);
557
558    if (!cluster_offset) {
559        if (!allocate)
560            return 0;
561        // Avoid the L2 tables update for the images that have snapshots.
562        if (!s->is_parent) {
563            cluster_offset = bdrv_getlength(s->hd);
564            bdrv_truncate(s->hd, cluster_offset + (s->cluster_sectors << 9));
565
566            cluster_offset >>= 9;
567            tmp = cpu_to_le32(cluster_offset);
568            l2_table[l2_index] = tmp;
569            // Save the active image state
570            activeBDRV.cluster_offset = cluster_offset;
571            activeBDRV.hd = bs;
572        }
573        /* First of all we write grain itself, to avoid race condition
574         * that may to corrupt the image.
575         * This problem may occur because of insufficient space on host disk
576         * or inappropriate VM shutdown.
577         */
578        if (get_whole_cluster(bs, cluster_offset, offset, allocate) == -1)
579            return 0;
580
581        if (m_data) {
582            m_data->offset = tmp;
583            m_data->l1_index = l1_index;
584            m_data->l2_index = l2_index;
585            m_data->l2_offset = l2_offset;
586            m_data->valid = 1;
587        }
588    }
589    cluster_offset <<= 9;
590    return cluster_offset;
591}
592
593static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
594                             int nb_sectors, int *pnum)
595{
596    BDRVVmdkState *s = bs->opaque;
597    int index_in_cluster, n;
598    uint64_t cluster_offset;
599
600    cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
601    index_in_cluster = sector_num % s->cluster_sectors;
602    n = s->cluster_sectors - index_in_cluster;
603    if (n > nb_sectors)
604        n = nb_sectors;
605    *pnum = n;
606    return (cluster_offset != 0);
607}
608
609static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
610                    uint8_t *buf, int nb_sectors)
611{
612    BDRVVmdkState *s = bs->opaque;
613    int index_in_cluster, n, ret;
614    uint64_t cluster_offset;
615
616    while (nb_sectors > 0) {
617        cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
618        index_in_cluster = sector_num % s->cluster_sectors;
619        n = s->cluster_sectors - index_in_cluster;
620        if (n > nb_sectors)
621            n = nb_sectors;
622        if (!cluster_offset) {
623            // try to read from parent image, if exist
624            if (s->hd->backing_hd) {
625                if (!vmdk_is_cid_valid(bs))
626                    return -1;
627                ret = bdrv_read(s->hd->backing_hd, sector_num, buf, n);
628                if (ret < 0)
629                    return -1;
630            } else {
631                memset(buf, 0, 512 * n);
632            }
633        } else {
634            if(bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
635                return -1;
636        }
637        nb_sectors -= n;
638        sector_num += n;
639        buf += n * 512;
640    }
641    return 0;
642}
643
644static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
645                     const uint8_t *buf, int nb_sectors)
646{
647    BDRVVmdkState *s = bs->opaque;
648    VmdkMetaData m_data;
649    int index_in_cluster, n;
650    uint64_t cluster_offset;
651    static int cid_update = 0;
652
653    if (sector_num > bs->total_sectors) {
654        fprintf(stderr,
655                "(VMDK) Wrong offset: sector_num=0x%" PRIx64
656                " total_sectors=0x%" PRIx64 "\n",
657                sector_num, bs->total_sectors);
658        return -1;
659    }
660
661    while (nb_sectors > 0) {
662        index_in_cluster = sector_num & (s->cluster_sectors - 1);
663        n = s->cluster_sectors - index_in_cluster;
664        if (n > nb_sectors)
665            n = nb_sectors;
666        cluster_offset = get_cluster_offset(bs, &m_data, sector_num << 9, 1);
667        if (!cluster_offset)
668            return -1;
669
670        if (bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
671            return -1;
672        if (m_data.valid) {
673            /* update L2 tables */
674            if (vmdk_L2update(bs, &m_data) == -1)
675                return -1;
676        }
677        nb_sectors -= n;
678        sector_num += n;
679        buf += n * 512;
680
681        // update CID on the first write every time the virtual disk is opened
682        if (!cid_update) {
683            vmdk_write_cid(bs, time(NULL));
684            cid_update++;
685        }
686    }
687    return 0;
688}
689
690static int vmdk_create(const char *filename, QEMUOptionParameter *options)
691{
692    int fd, i;
693    VMDK4Header header;
694    uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
695    static const char desc_template[] =
696        "# Disk DescriptorFile\n"
697        "version=1\n"
698        "CID=%x\n"
699        "parentCID=ffffffff\n"
700        "createType=\"monolithicSparse\"\n"
701        "\n"
702        "# Extent description\n"
703        "RW %" PRId64 " SPARSE \"%s\"\n"
704        "\n"
705        "# The Disk Data Base \n"
706        "#DDB\n"
707        "\n"
708        "ddb.virtualHWVersion = \"%d\"\n"
709        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
710        "ddb.geometry.heads = \"16\"\n"
711        "ddb.geometry.sectors = \"63\"\n"
712        "ddb.adapterType = \"ide\"\n";
713    char desc[1024];
714    const char *real_filename, *temp_str;
715    int64_t total_size = 0;
716    const char *backing_file = NULL;
717    int flags = 0;
718
719    // Read out options
720    while (options && options->name) {
721        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
722            total_size = options->value.n / 512;
723        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
724            backing_file = options->value.s;
725        } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
726            flags |= options->value.n ? BLOCK_FLAG_COMPAT6: 0;
727        }
728        options++;
729    }
730
731    /* XXX: add support for backing file */
732    if (backing_file) {
733        return vmdk_snapshot_create(filename, backing_file);
734    }
735
736    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
737              0644);
738    if (fd < 0)
739        return -1;
740    magic = cpu_to_be32(VMDK4_MAGIC);
741    memset(&header, 0, sizeof(header));
742    header.version = cpu_to_le32(1);
743    header.flags = cpu_to_le32(3); /* ?? */
744    header.capacity = cpu_to_le64(total_size);
745    header.granularity = cpu_to_le64(128);
746    header.num_gtes_per_gte = cpu_to_le32(512);
747
748    grains = (total_size + header.granularity - 1) / header.granularity;
749    gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
750    gt_count = (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
751    gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
752
753    header.desc_offset = 1;
754    header.desc_size = 20;
755    header.rgd_offset = header.desc_offset + header.desc_size;
756    header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
757    header.grain_offset =
758       ((header.gd_offset + gd_size + (gt_size * gt_count) +
759         header.granularity - 1) / header.granularity) *
760        header.granularity;
761
762    header.desc_offset = cpu_to_le64(header.desc_offset);
763    header.desc_size = cpu_to_le64(header.desc_size);
764    header.rgd_offset = cpu_to_le64(header.rgd_offset);
765    header.gd_offset = cpu_to_le64(header.gd_offset);
766    header.grain_offset = cpu_to_le64(header.grain_offset);
767
768    header.check_bytes[0] = 0xa;
769    header.check_bytes[1] = 0x20;
770    header.check_bytes[2] = 0xd;
771    header.check_bytes[3] = 0xa;
772
773    /* write all the data */
774    write(fd, &magic, sizeof(magic));
775    write(fd, &header, sizeof(header));
776
777    ftruncate(fd, header.grain_offset << 9);
778
779    /* write grain directory */
780    lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET);
781    for (i = 0, tmp = header.rgd_offset + gd_size;
782         i < gt_count; i++, tmp += gt_size)
783        write(fd, &tmp, sizeof(tmp));
784
785    /* write backup grain directory */
786    lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET);
787    for (i = 0, tmp = header.gd_offset + gd_size;
788         i < gt_count; i++, tmp += gt_size)
789        write(fd, &tmp, sizeof(tmp));
790
791    /* compose the descriptor */
792    real_filename = filename;
793    if ((temp_str = strrchr(real_filename, '\\')) != NULL)
794        real_filename = temp_str + 1;
795    if ((temp_str = strrchr(real_filename, '/')) != NULL)
796        real_filename = temp_str + 1;
797    if ((temp_str = strrchr(real_filename, ':')) != NULL)
798        real_filename = temp_str + 1;
799    snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL),
800             total_size, real_filename,
801             (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
802             total_size / (int64_t)(63 * 16));
803
804    /* write the descriptor */
805    lseek(fd, le64_to_cpu(header.desc_offset) << 9, SEEK_SET);
806    write(fd, desc, strlen(desc));
807
808    close(fd);
809    return 0;
810}
811
812static void vmdk_close(BlockDriverState *bs)
813{
814    BDRVVmdkState *s = bs->opaque;
815
816    qemu_free(s->l1_table);
817    qemu_free(s->l2_cache);
818    // try to close parent image, if exist
819    vmdk_parent_close(s->hd);
820    bdrv_delete(s->hd);
821}
822
823static void vmdk_flush(BlockDriverState *bs)
824{
825    BDRVVmdkState *s = bs->opaque;
826    bdrv_flush(s->hd);
827}
828
829
830static QEMUOptionParameter vmdk_create_options[] = {
831    {
832        .name = BLOCK_OPT_SIZE,
833        .type = OPT_SIZE,
834        .help = "Virtual disk size"
835    },
836    {
837        .name = BLOCK_OPT_BACKING_FILE,
838        .type = OPT_STRING,
839        .help = "File name of a base image"
840    },
841    {
842        .name = BLOCK_OPT_COMPAT6,
843        .type = OPT_FLAG,
844        .help = "VMDK version 6 image"
845    },
846    { NULL }
847};
848
849static BlockDriver bdrv_vmdk = {
850    .format_name	= "vmdk",
851    .instance_size	= sizeof(BDRVVmdkState),
852    .bdrv_probe		= vmdk_probe,
853    .bdrv_open		= vmdk_open,
854    .bdrv_read		= vmdk_read,
855    .bdrv_write		= vmdk_write,
856    .bdrv_close		= vmdk_close,
857    .bdrv_create	= vmdk_create,
858    .bdrv_flush		= vmdk_flush,
859    .bdrv_is_allocated	= vmdk_is_allocated,
860
861    .create_options = vmdk_create_options,
862};
863
864static void bdrv_vmdk_init(void)
865{
866    bdrv_register(&bdrv_vmdk);
867}
868
869block_init(bdrv_vmdk_init);
870