MappedHash.h revision 37bb8ddd443da172f42bb8657f15ec856a525c84
1//
2//  MappedHash.h
3//
4
5#ifndef liblldb_MappedHash_h_
6#define liblldb_MappedHash_h_
7
8#include <assert.h>
9#include <stdint.h>
10
11#include <map>
12#include <vector>
13
14#include "lldb/Core/DataExtractor.h"
15#include "lldb/Core/Stream.h"
16
17class MappedHash
18{
19public:
20
21    enum HashFunctionType
22    {
23        eHashFunctionDJB        = 0u    // Daniel J Bernstein hash function that is also used by the ELF GNU_HASH sections
24    };
25
26
27    static uint32_t
28    HashStringUsingDJB (const char *s)
29    {
30        uint32_t h = 5381;
31
32        for (unsigned char c = *s; c; c = *++s)
33            h = ((h << 5) + h) + c;
34
35        return h;
36    }
37
38    static uint32_t
39    HashString (const uint8_t hash_function, const char *s)
40    {
41        switch (hash_function)
42        {
43            case MappedHash::eHashFunctionDJB:
44                return HashStringUsingDJB (s);
45
46            default:
47                break;
48        }
49        assert (!"Invalid hash function index");
50        return 0;
51    }
52
53
54    static const uint32_t HASH_MAGIC = 0x48415348u;
55    static const uint32_t HASH_CIGAM = 0x48534148u;
56
57    template <typename T>
58    struct Header
59	{
60        typedef T HeaderData;
61
62        uint32_t    magic;             // HASH_MAGIC or HASH_CIGAM magic value to allow endian detection
63        uint16_t    version;           // Version number
64		uint16_t    hash_function;     // The hash function enumeration that was used
65		uint32_t    bucket_count;      // The number of buckets in this hash table
66		uint32_t    hashes_count;      // The total number of unique hash values and hash data offsets in this table
67        uint32_t    header_data_len;   // The size in bytes of the "header_data" template member below
68        HeaderData  header_data;       //
69
70		Header () :
71            magic (HASH_MAGIC),
72            version (1),
73            hash_function (eHashFunctionDJB),
74            bucket_count (0),
75            hashes_count (0),
76            header_data_len (sizeof(T)),
77            header_data ()
78		{
79		}
80
81        virtual
82        ~Header()
83        {
84        }
85
86        size_t
87        GetByteSize() const
88        {
89            return  sizeof(magic) +
90                    sizeof(version) +
91                    sizeof(hash_function) +
92                    sizeof(bucket_count) +
93                    sizeof(hashes_count) +
94                    sizeof(header_data_len) +
95                    header_data_len;
96        }
97
98        virtual size_t
99        GetByteSize (const HeaderData &header_data) = 0;
100
101        void
102        SetHeaderDataByteSize (uint32_t header_data_byte_size)
103        {
104            header_data_len = header_data_byte_size;
105        }
106
107        void
108        Dump (lldb_private::Stream &s)
109        {
110            s.Printf ("header.magic              = 0x%8.8x\n", magic);
111            s.Printf ("header.version            = 0x%4.4x\n", version);
112            s.Printf ("header.hash_function      = 0x%4.4x\n", hash_function);
113            s.Printf ("header.bucket_count       = 0x%8.8x %u\n", bucket_count, bucket_count);
114            s.Printf ("header.hashes_count       = 0x%8.8x %u\n", hashes_count, hashes_count);
115            s.Printf ("header.header_data_len    = 0x%8.8x %u\n", header_data_len, header_data_len);
116        }
117
118        virtual uint32_t
119        Read (lldb_private::DataExtractor &data, uint32_t offset)
120        {
121            if (data.ValidOffsetForDataOfSize (offset,
122                                               sizeof (magic) +
123                                               sizeof (version) +
124                                               sizeof (hash_function) +
125                                               sizeof (bucket_count) +
126                                               sizeof (hashes_count) +
127                                               sizeof (header_data_len)))
128            {
129                magic = data.GetU32 (&offset);
130                if (magic != HASH_MAGIC)
131                {
132                    if (magic == HASH_CIGAM)
133                    {
134                        switch (data.GetByteOrder())
135                        {
136                            case lldb::eByteOrderBig:
137                                data.SetByteOrder(lldb::eByteOrderLittle);
138                                break;
139                            case lldb::eByteOrderLittle:
140                                data.SetByteOrder(lldb::eByteOrderBig);
141                                break;
142                            default:
143                                return UINT32_MAX;
144                        }
145                    }
146                    else
147                    {
148                        // Magic bytes didn't match
149                        version = 0;
150                        return UINT32_MAX;
151                    }
152                }
153
154                version = data.GetU16 (&offset);
155                if (version != 1)
156                {
157                    // Unsupported version
158                    return UINT32_MAX;
159                }
160                hash_function       = data.GetU16 (&offset);
161                if (hash_function == 4)
162                    hash_function = 0; // Deal with pre-release version of this table...
163                bucket_count        = data.GetU32 (&offset);
164                hashes_count        = data.GetU32 (&offset);
165                header_data_len     = data.GetU32 (&offset);
166                return offset;
167            }
168            return UINT32_MAX;
169        }
170//
171//        // Returns a buffer that contains a serialized version of this table
172//        // that must be freed with free().
173//        virtual void *
174//        Write (int fd);
175	};
176
177    template <typename __KeyType, class __HeaderDataType, class __ValueType>
178    class ExportTable
179    {
180    public:
181        typedef __HeaderDataType HeaderDataType;
182        typedef Header<HeaderDataType> HeaderType;
183        typedef __KeyType KeyType;
184        typedef __ValueType ValueType;
185
186        struct Entry
187        {
188            uint32_t    hash;
189            KeyType     key;
190            ValueType   value;
191        };
192
193        typedef std::vector<ValueType> ValueArrayType;
194
195        typedef std::map<KeyType, ValueArrayType> HashData;
196        // Map a name hash to one or more name infos
197        typedef std::map<uint32_t, HashData> HashToHashData;
198
199        virtual KeyType
200        GetKeyForStringType (const char *cstr) const = 0;
201
202        virtual size_t
203        GetByteSize (const HashData &key_to_key_values) = 0;
204
205        virtual bool
206        WriteHashData (const HashData &hash_data,
207                       lldb_private::Stream &ostrm) = 0;
208//
209        void
210        AddEntry (const char *cstr, const ValueType &value)
211        {
212            Entry entry;
213            entry.hash = MappedHash::HashString (eHashFunctionDJB, cstr);
214            entry.key = GetKeyForStringType (cstr);
215            entry.value = value;
216            m_entries.push_back (entry);
217        }
218
219        void
220        Save (const HeaderDataType &header_data,
221              lldb_private::Stream &ostrm)
222        {
223            if (m_entries.empty())
224                return;
225
226            const uint32_t num_entries = m_entries.size();
227            uint32_t i = 0;
228
229            HeaderType header;
230
231            header.magic = HASH_MAGIC;
232            header.version = 1;
233            header.hash_function = eHashFunctionDJB;
234            header.bucket_count = 0;
235            header.hashes_count = 0;
236            header.prologue_length = header_data.GetByteSize();
237
238            // We need to figure out the number of unique hashes first before we can
239            // calculate the number of buckets we want to use.
240            typedef std::vector<uint32_t> hash_coll;
241            hash_coll unique_hashes;
242            unique_hashes.resize (num_entries);
243            for (i=0; i<num_entries; ++i)
244                unique_hashes[i] = m_entries[i].hash;
245            std::sort (unique_hashes.begin(), unique_hashes.end());
246            hash_coll::iterator pos = std::unique (unique_hashes.begin(), unique_hashes.end());
247            const size_t num_unique_hashes = std::distance (unique_hashes.begin(), pos);
248
249            if (num_unique_hashes > 1024)
250                header.bucket_count = num_unique_hashes/4;
251            else if (num_unique_hashes > 16)
252                header.bucket_count = num_unique_hashes/2;
253            else
254                header.bucket_count = num_unique_hashes;
255            if (header.bucket_count == 0)
256                header.bucket_count = 1;
257
258
259            std::vector<HashToHashData> hash_buckets;
260            std::vector<uint32_t> hash_indexes (header.bucket_count, 0);
261            std::vector<uint32_t> hash_values;
262            std::vector<uint32_t> hash_offsets;
263            hash_buckets.resize (header.bucket_count);
264            uint32_t bucket_entry_empties = 0;
265            //StreamString hash_file_data(Stream::eBinary, dwarf->GetObjectFile()->GetAddressByteSize(), dwarf->GetObjectFile()->GetByteSize());
266
267            // Push all of the hashes into their buckets and create all bucket
268            // entries all populated with data.
269            for (i=0; i<num_entries; ++i)
270            {
271                const uint32_t hash = m_entries[i].hash;
272                const uint32_t bucket_idx = hash % header.bucket_count;
273                const uint32_t strp_offset = m_entries[i].str_offset;
274                const dw_offset_t die_offset = m_entries[i].die_offset;
275                hash_buckets[bucket_idx][hash][strp_offset].push_back(die_offset);
276            }
277
278            // Now for each bucket we write the bucket value which is the
279            // number of hashes and the hash index encoded into a single
280            // 32 bit unsigned integer.
281            for (i=0; i<header.bucket_count; ++i)
282            {
283                HashToHashData &bucket_entry = hash_buckets[i];
284
285                if (bucket_entry.empty())
286                {
287                    // Empty bucket
288                    ++bucket_entry_empties;
289                    hash_indexes[i] = UINT32_MAX;
290                }
291                else
292                {
293                    const uint32_t hash_value_index = hash_values.size();
294                    uint32_t hash_count = 0;
295                    typename HashToHashData::const_iterator pos, end = bucket_entry.end();
296                    for (pos = bucket_entry.begin(); pos != end; ++pos)
297                    {
298                        hash_values.push_back (pos->first);
299                        hash_offsets.push_back (GetByteSize (pos->second));
300                        ++hash_count;
301                    }
302
303                    hash_indexes[i] = hash_value_index;
304                }
305            }
306            header.hashes_count = hash_values.size();
307
308            // Write the header out now that we have the hash_count
309            header.Write (ostrm);
310
311            // Now for each bucket we write the start index of the hashes
312            // for the current bucket, or UINT32_MAX if the bucket is empty
313            for (i=0; i<header.bucket_count; ++i)
314            {
315                ostrm.PutHex32(hash_indexes[i]);
316            }
317
318            // Now we need to write out all of the hash values
319            for (i=0; i<header.hashes_count; ++i)
320            {
321                ostrm.PutHex32(hash_values[i]);
322            }
323
324            // Now we need to write out all of the hash data offsets,
325            // there is an offset for each hash in the hashes array
326            // that was written out above
327            for (i=0; i<header.hashes_count; ++i)
328            {
329                ostrm.PutHex32(hash_offsets[i]);
330            }
331
332            // Now we write the data for each hash and verify we got the offset
333            // correct above...
334            for (i=0; i<header.bucket_count; ++i)
335            {
336                HashToHashData &bucket_entry = hash_buckets[i];
337
338                typename HashToHashData::const_iterator pos, end = bucket_entry.end();
339                for (pos = bucket_entry.begin(); pos != end; ++pos)
340                {
341                    if (!bucket_entry.empty())
342                    {
343                        WriteHashData (pos->second);
344                    }
345                }
346            }
347        }
348    protected:
349        typedef std::vector<Entry> collection;
350        collection m_entries;
351    };
352    // A class for reading and using a saved hash table from a block of data
353    // in memory
354    template <typename __KeyType, class __HeaderType, class __HashData>
355    class MemoryTable
356    {
357    public:
358        typedef __HeaderType HeaderType;
359        typedef __KeyType KeyType;
360        typedef __HashData HashData;
361
362        enum Result
363        {
364            eResultKeyMatch         = 0u, // The entry was found, key matched and "pair" was filled in successfully
365            eResultKeyMismatch      = 1u, // Bucket hash data collision, but key didn't match
366            eResultEndOfHashData    = 2u, // The chain of items for this hash data in this bucket is terminated, search no more
367            eResultError            = 3u  // Error parsing the hash data, abort
368        };
369
370        struct Pair
371        {
372            KeyType key;
373            HashData value;
374        };
375
376        MemoryTable (lldb_private::DataExtractor &data) :
377            m_header (),
378            m_hash_indexes (NULL),
379            m_hash_values (NULL),
380            m_hash_offsets (NULL)
381        {
382            uint32_t offset = m_header.Read (data, 0);
383            if (offset != UINT32_MAX && IsValid ())
384            {
385                m_hash_indexes = (uint32_t *)data.GetData (&offset, m_header.bucket_count * sizeof(uint32_t));
386                m_hash_values  = (uint32_t *)data.GetData (&offset, m_header.hashes_count * sizeof(uint32_t));
387                m_hash_offsets = (uint32_t *)data.GetData (&offset, m_header.hashes_count * sizeof(uint32_t));
388            }
389        }
390
391        virtual
392        ~MemoryTable ()
393        {
394        }
395
396        bool
397        IsValid () const
398        {
399            return m_header.version == 1 &&
400                   m_header.hash_function == eHashFunctionDJB &&
401                   m_header.bucket_count > 0 &&
402                   m_header.hashes_count > 0;
403        }
404
405        uint32_t
406        GetHashIndex (uint32_t bucket_idx) const
407        {
408            if (m_hash_indexes && bucket_idx < m_header.bucket_count)
409                return m_hash_indexes[bucket_idx];
410            return UINT32_MAX;
411        }
412
413        uint32_t
414        GetHashValue (uint32_t hash_idx) const
415        {
416            if (m_hash_values && hash_idx < m_header.hashes_count)
417                return m_hash_values[hash_idx];
418            return UINT32_MAX;
419        }
420
421        uint32_t
422        GetHashDataOffset (uint32_t hash_idx) const
423        {
424            if (m_hash_offsets && hash_idx < m_header.hashes_count)
425                return m_hash_offsets[hash_idx];
426            return UINT32_MAX;
427        }
428
429        bool
430        Find (const char *name, Pair &pair) const
431        {
432            if (IsValid ())
433            {
434                const uint32_t bucket_count = m_header.bucket_count;
435                const uint32_t hash_count = m_header.hashes_count;
436                const uint32_t hash_value = MappedHash::HashString (m_header.hash_function, name);
437                const uint32_t bucket_idx = hash_value % bucket_count;
438                uint32_t hash_idx = GetHashIndex (bucket_idx);
439                if (hash_idx < hash_count)
440                {
441                    for (; hash_idx < hash_count; ++hash_idx)
442                    {
443                        const uint32_t curr_hash_value = GetHashValue (hash_idx);
444                        if (curr_hash_value == hash_value)
445                        {
446                            uint32_t hash_data_offset = GetHashDataOffset (hash_idx);
447                            while (hash_data_offset != UINT32_MAX)
448                            {
449                                const uint32_t prev_hash_data_offset = hash_data_offset;
450                                Result hash_result = GetHashDataForName (name, &hash_data_offset, pair);
451                                // Check the result of getting our hash data
452                                switch (hash_result)
453                                {
454                                case eResultKeyMatch:
455                                    return true;
456
457                                case eResultKeyMismatch:
458                                    if (prev_hash_data_offset == hash_data_offset)
459                                        return false;
460                                    break;
461
462                                case eResultEndOfHashData:
463                                    // The last HashData for this key has been reached, stop searching
464                                    return false;
465                                case eResultError:
466                                    // Error parsing the hash data, abort
467                                    return false;
468                                }
469                            }
470                        }
471                        if ((curr_hash_value % bucket_count) != bucket_idx)
472                            break;
473                    }
474                }
475            }
476            return false;
477        }
478
479        // This method must be implemented in any subclasses.
480        // The KeyType is user specified and must somehow result in a string
481        // value. For example, the KeyType might be a string offset in a string
482        // table and subclasses can store their string table as a member of the
483        // subclass and return a valie "const char *" given a "key". The value
484        // could also be a C string pointer, in which case just returning "key"
485        // will suffice.
486
487        virtual const char *
488        GetStringForKeyType (KeyType key) const = 0;
489
490        // This method must be implemented in any subclasses and it must try to
491        // read one "Pair" at the offset pointed to by the "hash_data_offset_ptr"
492        // parameter. This offset should be updated as bytes are consumed and
493        // a value "Result" enum should be returned. If the "name" matches the
494        // full name for the "pair.key" (which must be filled in by this call),
495        // then the HashData in the pair ("pair.value") should be extracted and
496        // filled in and "eResultKeyMatch" should be returned. If "name" doesn't
497        // match this string for the key, then "eResultKeyMismatch" should be
498        // returned and all data for the current HashData must be consumed or
499        // skipped and the "hash_data_offset_ptr" offset needs to be updated to
500        // point to the next HashData. If the end of the HashData objects for
501        // a given hash value have been reached, then "eResultEndOfHashData"
502        // should be returned. If anything else goes wrong during parsing,
503        // return "eResultError" and the corresponding "Find()" function will
504        // be canceled and return false.
505
506        virtual Result
507        GetHashDataForName (const char *name,
508                            uint32_t* hash_data_offset_ptr,
509                            Pair &pair) const = 0;
510
511        const HeaderType &
512        GetHeader()
513        {
514            return m_header;
515        }
516
517    protected:
518        // Implementation agnostic information
519        HeaderType m_header;
520        uint32_t *m_hash_indexes;
521        uint32_t *m_hash_values;
522        uint32_t *m_hash_offsets;
523    };
524
525};
526
527#endif // #ifndef liblldb_MappedHash_h_
528