11e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet/*
21e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    datagen.c - compressible data generator test tool
38b233b228dbda484d1d545c5a3ecaf06aef3e930Yann Collet    Copyright (C) Yann Collet 2012-2016
495cc6cef6444b202a93ba414b7a9996eb2c72ca3Yann Collet
51e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    GPL v2 License
61e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
71e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    This program is free software; you can redistribute it and/or modify
81e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    it under the terms of the GNU General Public License as published by
91e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    the Free Software Foundation; either version 2 of the License, or
101e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    (at your option) any later version.
111e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
121e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    This program is distributed in the hope that it will be useful,
131e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    but WITHOUT ANY WARRANTY; without even the implied warranty of
141e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
151e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    GNU General Public License for more details.
161e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
171e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    You should have received a copy of the GNU General Public License along
181e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    with this program; if not, write to the Free Software Foundation, Inc.,
191e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
201e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
211e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    You can contact the author at :
2284cedb4632ab87fbb108b4f7ed6e9ec164b6a4d4Przemyslaw Skibinski   - LZ4 source repository : https://github.com/lz4/lz4
236b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet   - Public forum : https://groups.google.com/forum/#!forum/lz4c
241e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet*/
251e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
261e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet/**************************************
276b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet*  Includes
281e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet**************************************/
296adf05d1ed83053b8bb1f762494d2c10fdd8ac1dPrzemyslaw Skibinski#include "platform.h"  /* Compiler options, SET_BINARY_MODE */
309546ba62d01a9618aab91eafe77929120653d275Przemyslaw Skibinski#include "util.h"      /* U32 */
316b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet#include <stdlib.h>    /* malloc */
326b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet#include <stdio.h>     /* FILE, fwrite */
336b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet#include <string.h>    /* memcpy */
341e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
351e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
361e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet/**************************************
376b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet*  Constants
381e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet**************************************/
/* Postfix size multiplier : `32 KB` expands to `32 *(1 << 10)`, i.e. 32768. */
#define KB *(1 << 10)

/* Multiplier and xor mask used by the pseudo-random generator RDG_rand(). */
#define PRIME1   0x9E3779B1U   /* == 2654435761U */
#define PRIME2   0x85EBCA77U   /* == 2246822519U */
431e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
441e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
456fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet/**************************************
466fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet*  Local types
476fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet**************************************/
486fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet#define LTLOG 13
496fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet#define LTSIZE (1<<LTLOG)
506fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet#define LTMASK (LTSIZE-1)
516fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collettypedef BYTE litDistribTable[LTSIZE];
526fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet
536fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet
546fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet
551e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet/*********************************************************
566b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet*  Local Functions
571e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet*********************************************************/
/* Smaller of two values. Each argument may be evaluated twice :
 * do not pass expressions with side effects. */
#define MIN(a,b)   ( (a) < (b) ? (a) :(b) )

/* Rotate the 32-bit unsigned value `x` left by `r` bits (0 < r < 32).
 * Both arguments are fully parenthesized, so compound expressions such as
 * RDG_rotl32(a | b, n + 1) expand with the intended precedence.
 * Each argument is evaluated twice. */
#define RDG_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
606b0c39b839b8343da195252a8c46e6d93138f3b8Yann Colletstatic unsigned int RDG_rand(U32* src)
611e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet{
621e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    U32 rand32 = *src;
631e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    rand32 *= PRIME1;
646b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    rand32 ^= PRIME2;
656b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    rand32  = RDG_rotl32(rand32, 13);
661e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    *src = rand32;
671e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    return rand32;
681e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet}
691e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
701e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
716fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Colletstatic void RDG_fillLiteralDistrib(litDistribTable lt, double ld)
721e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet{
73d2be69b144d6c5fd9a3dcbc4133e93e710cda998Yann Collet    BYTE const firstChar = ld <= 0.0 ? 0 : '(';
74d2be69b144d6c5fd9a3dcbc4133e93e710cda998Yann Collet    BYTE const lastChar  = ld <= 0.0 ? 255 : '}';
75d2be69b144d6c5fd9a3dcbc4133e93e710cda998Yann Collet    BYTE character = ld <= 0.0 ? 0 : '0';
76d2be69b144d6c5fd9a3dcbc4133e93e710cda998Yann Collet    U32 u = 0;
77d2be69b144d6c5fd9a3dcbc4133e93e710cda998Yann Collet
78d2be69b144d6c5fd9a3dcbc4133e93e710cda998Yann Collet    while (u<LTSIZE) {
79d2be69b144d6c5fd9a3dcbc4133e93e710cda998Yann Collet        U32 const weight = (U32)((double)(LTSIZE - u) * ld) + 1;
80d2be69b144d6c5fd9a3dcbc4133e93e710cda998Yann Collet        U32 const end = MIN(u+weight, LTSIZE);
81d2be69b144d6c5fd9a3dcbc4133e93e710cda998Yann Collet        while (u < end) lt[u++] = character;
826b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        character++;
836b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        if (character > lastChar) character = firstChar;
846b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    }
856b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet}
866b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet
876fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet
886fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Colletstatic BYTE RDG_genChar(U32* seed, const litDistribTable lt)
896b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet{
906b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    U32 id = RDG_rand(seed) & LTMASK;
916fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet    return (lt[id]);
926b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet}
936b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet
946fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet
956b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet#define RDG_DICTSIZE    (32 KB)
966b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet#define RDG_RAND15BITS  ((RDG_rand(seed) >> 3) & 32767)
976b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet#define RDG_RANDLENGTH  ( ((RDG_rand(seed) >> 7) & 7) ? (RDG_rand(seed) & 15) : (RDG_rand(seed) & 511) + 15)
986fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Colletvoid RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize, double matchProba, litDistribTable lt, unsigned* seedPtr)
996b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet{
1006b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    BYTE* buffPtr = (BYTE*)buffer;
1016b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    const U32 matchProba32 = (U32)(32768 * matchProba);
1026b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    size_t pos = prefixSize;
1036b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    U32* seed = seedPtr;
1046b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet
1056b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    /* special case */
1066b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    while (matchProba >= 1.0)
1076b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    {
1086b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        size_t size0 = RDG_rand(seed) & 3;
10945a357fd1704e9c6d2d8037277bda62e8c86308eYann Collet        size0  = (size_t)1 << (16 + size0 * 2);
1106b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        size0 += RDG_rand(seed) & (size0-1);   /* because size0 is power of 2*/
1116b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        if (buffSize < pos + size0)
1126b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        {
1136b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet            memset(buffPtr+pos, 0, buffSize-pos);
1146b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet            return;
1156b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        }
1166b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        memset(buffPtr+pos, 0, size0);
1176b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        pos += size0;
1186fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet        buffPtr[pos-1] = RDG_genChar(seed, lt);
1196b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    }
1206b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet
1216b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    /* init */
1226fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet    if (pos==0) buffPtr[0] = RDG_genChar(seed, lt), pos=1;
1236b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet
1246b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    /* Generate compressible data */
1256b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    while (pos < buffSize)
1266b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    {
1276b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        /* Select : Literal (char) or Match (within 32K) */
1286b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        if (RDG_RAND15BITS < matchProba32)
129661e4ddb78ce89d5de3ad0824e6abb161044aa06Yann Collet        {
1306b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet            /* Copy (within 32K) */
13145a357fd1704e9c6d2d8037277bda62e8c86308eYann Collet            size_t match;
13245a357fd1704e9c6d2d8037277bda62e8c86308eYann Collet            size_t d;
1336b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet            int length = RDG_RANDLENGTH + 4;
1346b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet            U32 offset = RDG_RAND15BITS + 1;
13545a357fd1704e9c6d2d8037277bda62e8c86308eYann Collet            if (offset > pos) offset = (U32)pos;
1366b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet            match = pos - offset;
137661e4ddb78ce89d5de3ad0824e6abb161044aa06Yann Collet            d = pos + length;
1386b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet            if (d > buffSize) d = buffSize;
1396b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet            while (pos < d) buffPtr[pos++] = buffPtr[match++];
140661e4ddb78ce89d5de3ad0824e6abb161044aa06Yann Collet        }
141661e4ddb78ce89d5de3ad0824e6abb161044aa06Yann Collet        else
142661e4ddb78ce89d5de3ad0824e6abb161044aa06Yann Collet        {
1436b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet            /* Literal (noise) */
1446b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet            size_t d;
1456b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet            size_t length = RDG_RANDLENGTH;
1466b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet            d = pos + length;
1476b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet            if (d > buffSize) d = buffSize;
1486fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet            while (pos < d) buffPtr[pos++] = RDG_genChar(seed, lt);
1491e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet        }
1501e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    }
1511e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet}
1521e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
1531e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
1546b0c39b839b8343da195252a8c46e6d93138f3b8Yann Colletvoid RDG_genBuffer(void* buffer, size_t size, double matchProba, double litProba, unsigned seed)
1551e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet{
1566fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet    litDistribTable lt;
1576b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    if (litProba==0.0) litProba = matchProba / 4.5;
1586fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet    RDG_fillLiteralDistrib(lt, litProba);
1596fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet    RDG_genBlock(buffer, size, 0, matchProba, lt, &seed);
1601e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet}
1611e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
1621e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
1636b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet#define RDG_BLOCKSIZE (128 KB)
1646b0c39b839b8343da195252a8c46e6d93138f3b8Yann Colletvoid RDG_genOut(unsigned long long size, double matchProba, double litProba, unsigned seed)
1651e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet{
1666b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    BYTE buff[RDG_DICTSIZE + RDG_BLOCKSIZE];
1676b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    U64 total = 0;
1686b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    size_t genBlockSize = RDG_BLOCKSIZE;
1696fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet    litDistribTable lt;
1701e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
1716b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    /* init */
1726b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    if (litProba==0.0) litProba = matchProba / 4.5;
1736fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet    RDG_fillLiteralDistrib(lt, litProba);
1746b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    SET_BINARY_MODE(stdout);
1751e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
1766b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    /* Generate dict */
1776fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet    RDG_genBlock(buff, RDG_DICTSIZE, 0, matchProba, lt, &seed);
1781e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet
1796b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    /* Generate compressible data */
1806b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    while (total < size)
1816b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet    {
1826fe48b91832b9f8b5869a8a6d2a86b2b0d99988aYann Collet        RDG_genBlock(buff, RDG_DICTSIZE+RDG_BLOCKSIZE, RDG_DICTSIZE, matchProba, lt, &seed);
1836b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        if (size-total < RDG_BLOCKSIZE) genBlockSize = (size_t)(size-total);
1846b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        total += genBlockSize;
1856b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        fwrite(buff, 1, genBlockSize, stdout);
1866b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        /* update dict */
1876b0c39b839b8343da195252a8c46e6d93138f3b8Yann Collet        memcpy(buff, buff + RDG_BLOCKSIZE, RDG_DICTSIZE);
1881e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet    }
1891e053a290ab55af5d03eae209e9ee64f555670b7Yann Collet}
190