datagen.c revision 6b0c39b839b8343da195252a8c46e6d93138f3b8
1/*
2    datagen.c - compressible data generator test tool
3    Copyright (C) Yann Collet 2012-2015
4
5    GPL v2 License
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 2 of the License, or
10    (at your option) any later version.
11
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16
17    You should have received a copy of the GNU General Public License along
18    with this program; if not, write to the Free Software Foundation, Inc.,
19    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20
21    You can contact the author at :
22   - ZSTD source repository : https://github.com/Cyan4973/zstd
23   - Public forum : https://groups.google.com/forum/#!forum/lz4c
24*/
25
26/**************************************
27*  Includes
28**************************************/
29#include <stdlib.h>    /* malloc */
30#include <stdio.h>     /* FILE, fwrite */
31#include <string.h>    /* memcpy */
32
33
34/**************************************
35*  Basic Types
36**************************************/
37#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)   /* C99 */
38# include <stdint.h>
39  typedef  uint8_t BYTE;
40  typedef uint16_t U16;
41  typedef uint32_t U32;
42  typedef  int32_t S32;
43  typedef uint64_t U64;
44#else
45  typedef unsigned char       BYTE;
46  typedef unsigned short      U16;
47  typedef unsigned int        U32;
48  typedef   signed int        S32;
49  typedef unsigned long long  U64;
50#endif
51
52
53/**************************************
54*  OS-specific Includes
55**************************************/
56#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
57#  include <fcntl.h>   /* _O_BINARY */
58#  include <io.h>      /* _setmode, _isatty */
59#  define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY)
60#else
61#  define SET_BINARY_MODE(file)
62#endif
63
64
65/**************************************
66*  Constants
67**************************************/
68#define KB *(1 <<10)
69
70#define PRIME1   2654435761U
71#define PRIME2   2246822519U
72
73
74/*********************************************************
75*  Local Functions
76*********************************************************/
77#define RDG_rotl32(x,r) ((x << r) | (x >> (32 - r)))
78static unsigned int RDG_rand(U32* src)
79{
80    U32 rand32 = *src;
81    rand32 *= PRIME1;
82    rand32 ^= PRIME2;
83    rand32  = RDG_rotl32(rand32, 13);
84    *src = rand32;
85    return rand32;
86}
87
88
89#define LTSIZE 8192
90#define LTMASK (LTSIZE-1)
91static void* RDG_createLiteralDistrib(double ld)
92{
93    BYTE* lt = malloc(LTSIZE);
94    U32 i = 0;
95    BYTE character = '0';
96    BYTE firstChar = '(';
97    BYTE lastChar = '}';
98
99    if (ld==0.0)
100    {
101        character = 0;
102        firstChar = 0;
103        lastChar =255;
104    }
105    while (i<LTSIZE)
106    {
107        U32 weight = (U32)((double)(LTSIZE - i) * ld) + 1;
108        U32 end;
109        if (weight + i > LTSIZE) weight = LTSIZE-i;
110        end = i + weight;
111        while (i < end) lt[i++] = character;
112        character++;
113        if (character > lastChar) character = firstChar;
114    }
115    return lt;
116}
117
118static char RDG_genChar(U32* seed, const void* ltctx)
119{
120    const BYTE* lt = ltctx;
121    U32 id = RDG_rand(seed) & LTMASK;
122    return lt[id];
123}
124
125#define RDG_DICTSIZE    (32 KB)
126#define RDG_RAND15BITS  ((RDG_rand(seed) >> 3) & 32767)
127#define RDG_RANDLENGTH  ( ((RDG_rand(seed) >> 7) & 7) ? (RDG_rand(seed) & 15) : (RDG_rand(seed) & 511) + 15)
128void RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize, double matchProba, void* litTable, unsigned* seedPtr)
129{
130    BYTE* buffPtr = (BYTE*)buffer;
131    const U32 matchProba32 = (U32)(32768 * matchProba);
132    size_t pos = prefixSize;
133    void* ldctx = litTable;
134    U32* seed = seedPtr;
135
136    /* special case */
137    while (matchProba >= 1.0)
138    {
139        size_t size0 = RDG_rand(seed) & 3;
140        size0  = 1U << (16 + size0 * 2);
141        size0 += RDG_rand(seed) & (size0-1);   /* because size0 is power of 2*/
142        if (buffSize < pos + size0)
143        {
144            memset(buffPtr+pos, 0, buffSize-pos);
145            return;
146        }
147        memset(buffPtr+pos, 0, size0);
148        pos += size0;
149        buffPtr[pos-1] = RDG_genChar(seed, ldctx);
150    }
151
152    /* init */
153    if (pos==0) buffPtr[0] = RDG_genChar(seed, ldctx), pos=1;
154
155    /* Generate compressible data */
156    while (pos < buffSize)
157    {
158        /* Select : Literal (char) or Match (within 32K) */
159        if (RDG_RAND15BITS < matchProba32)
160        {
161            /* Copy (within 32K) */
162            int match;
163            U32 d;
164            int length = RDG_RANDLENGTH + 4;
165            U32 offset = RDG_RAND15BITS + 1;
166            if (offset > pos) offset = pos;
167            match = pos - offset;
168            d = pos + length;
169            if (d > buffSize) d = buffSize;
170            while (pos < d) buffPtr[pos++] = buffPtr[match++];
171        }
172        else
173        {
174            /* Literal (noise) */
175            size_t d;
176            size_t length = RDG_RANDLENGTH;
177            d = pos + length;
178            if (d > buffSize) d = buffSize;
179            while (pos < d) buffPtr[pos++] = RDG_genChar(seed, ldctx);
180        }
181    }
182}
183
184
/* RDG_genBuffer() :
 * Fills `buffer` with `size` bytes of generated data, starting from an empty
 * history (no prefix).
 * A `litProba` of exactly 0.0 means "derive it" : matchProba / 4.5 is used.
 * The literal table is created for this call only and released before returning.
 * `seed` is received by value, so the caller's own variable is not modified. */
void RDG_genBuffer(void* buffer, size_t size, double matchProba, double litProba, unsigned seed)
{
    double const effectiveLitProba = (litProba == 0.0) ? (matchProba / 4.5) : litProba;
    void* const literalTable = RDG_createLiteralDistrib(effectiveLitProba);

    RDG_genBlock(buffer, size, 0, matchProba, literalTable, &seed);

    free(literalTable);
}
193
194
195#define RDG_BLOCKSIZE (128 KB)
196void RDG_genOut(unsigned long long size, double matchProba, double litProba, unsigned seed)
197{
198    BYTE buff[RDG_DICTSIZE + RDG_BLOCKSIZE];
199    U64 total = 0;
200    size_t genBlockSize = RDG_BLOCKSIZE;
201    void* ldctx;
202
203    /* init */
204    if (litProba==0.0) litProba = matchProba / 4.5;
205    ldctx = RDG_createLiteralDistrib(litProba);
206    SET_BINARY_MODE(stdout);
207
208    /* Generate dict */
209    RDG_genBlock(buff, RDG_DICTSIZE, 0, matchProba, ldctx, &seed);
210
211    /* Generate compressible data */
212    while (total < size)
213    {
214        RDG_genBlock(buff, RDG_DICTSIZE+RDG_BLOCKSIZE, RDG_DICTSIZE, matchProba, ldctx, &seed);
215        if (size-total < RDG_BLOCKSIZE) genBlockSize = (size_t)(size-total);
216        total += genBlockSize;
217        fwrite(buff, 1, genBlockSize, stdout);
218        /* update dict */
219        memcpy(buff, buff + RDG_BLOCKSIZE, RDG_DICTSIZE);
220    }
221
222    free(ldctx);
223}
224