1d0825bca7fe65beaee391d30da42e937db621564Steve Block/*
2d0825bca7fe65beaee391d30da42e937db621564Steve Block * Copyright (C) 2005, 2006, 2008, 2010 Apple Inc. All rights reserved.
3bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
4d0825bca7fe65beaee391d30da42e937db621564Steve Block *
5d0825bca7fe65beaee391d30da42e937db621564Steve Block * This library is free software; you can redistribute it and/or
6d0825bca7fe65beaee391d30da42e937db621564Steve Block * modify it under the terms of the GNU Library General Public
7d0825bca7fe65beaee391d30da42e937db621564Steve Block * License as published by the Free Software Foundation; either
8d0825bca7fe65beaee391d30da42e937db621564Steve Block * version 2 of the License, or (at your option) any later version.
9d0825bca7fe65beaee391d30da42e937db621564Steve Block *
10d0825bca7fe65beaee391d30da42e937db621564Steve Block * This library is distributed in the hope that it will be useful,
11d0825bca7fe65beaee391d30da42e937db621564Steve Block * but WITHOUT ANY WARRANTY; without even the implied warranty of
12d0825bca7fe65beaee391d30da42e937db621564Steve Block * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13d0825bca7fe65beaee391d30da42e937db621564Steve Block * Library General Public License for more details.
14d0825bca7fe65beaee391d30da42e937db621564Steve Block *
15d0825bca7fe65beaee391d30da42e937db621564Steve Block * You should have received a copy of the GNU Library General Public License
16d0825bca7fe65beaee391d30da42e937db621564Steve Block * along with this library; see the file COPYING.LIB.  If not, write to
17d0825bca7fe65beaee391d30da42e937db621564Steve Block * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18d0825bca7fe65beaee391d30da42e937db621564Steve Block * Boston, MA 02110-1301, USA.
19d0825bca7fe65beaee391d30da42e937db621564Steve Block *
20d0825bca7fe65beaee391d30da42e937db621564Steve Block */
21a94275402997c11dd2e778633dacf4b7e630a35dBen Murdoch#ifndef WTF_StringHasher_h
22a94275402997c11dd2e778633dacf4b7e630a35dBen Murdoch#define WTF_StringHasher_h
23d0825bca7fe65beaee391d30da42e937db621564Steve Block
24d0825bca7fe65beaee391d30da42e937db621564Steve Block#include <wtf/unicode/Unicode.h>
25d0825bca7fe65beaee391d30da42e937db621564Steve Block
26d0825bca7fe65beaee391d30da42e937db621564Steve Blocknamespace WTF {
27d0825bca7fe65beaee391d30da42e937db621564Steve Block
28d0825bca7fe65beaee391d30da42e937db621564Steve Block// Golden ratio - arbitrary start value to avoid mapping all 0's to all 0's
29d0825bca7fe65beaee391d30da42e937db621564Steve Blockstatic const unsigned stringHashingStartValue = 0x9e3779b9U;
30d0825bca7fe65beaee391d30da42e937db621564Steve Block
31bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen// Paul Hsieh's SuperFastHash
32d0825bca7fe65beaee391d30da42e937db621564Steve Block// http://www.azillionmonkeys.com/qed/hash.html
33d0825bca7fe65beaee391d30da42e937db621564Steve Block// char* data is interpreted as latin-encoded (zero extended to 16 bits).
34bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsenclass StringHasher {
35bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsenpublic:
36bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    inline StringHasher()
37bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        : m_hash(stringHashingStartValue)
38bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        , m_hasPendingCharacter(false)
39bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        , m_pendingCharacter(0)
40bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    {
41bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    }
42d0825bca7fe65beaee391d30da42e937db621564Steve Block
43bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    inline void addCharacters(UChar a, UChar b)
44bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    {
45bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        ASSERT(!m_hasPendingCharacter);
46bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        addCharactersToHash(a, b);
47d0825bca7fe65beaee391d30da42e937db621564Steve Block    }
48d0825bca7fe65beaee391d30da42e937db621564Steve Block
49bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    inline void addCharacter(UChar ch)
50bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    {
51bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        if (m_hasPendingCharacter) {
52bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            addCharactersToHash(m_pendingCharacter, ch);
53bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            m_hasPendingCharacter = false;
54bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            return;
55bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        }
56bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
57bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        m_pendingCharacter = ch;
58bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        m_hasPendingCharacter = true;
59d0825bca7fe65beaee391d30da42e937db621564Steve Block    }
60d0825bca7fe65beaee391d30da42e937db621564Steve Block
61bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    inline unsigned hash() const
62bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    {
63bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        unsigned result = m_hash;
64bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
65bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        // Handle end case.
66bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        if (m_hasPendingCharacter) {
67bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            result += m_pendingCharacter;
68bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            result ^= result << 11;
69bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            result += result >> 17;
70bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        }
71bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
72bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        // Force "avalanching" of final 31 bits.
73bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        result ^= result << 3;
74bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        result += result >> 5;
75bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        result ^= result << 2;
76bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        result += result >> 15;
77bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        result ^= result << 10;
78bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
79bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        // First bit is used in UStringImpl for m_isIdentifier.
80bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        result &= 0x7fffffff;
81bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
82bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        // This avoids ever returning a hash code of 0, since that is used to
83bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        // signal "hash not computed yet", using a value that is likely to be
84bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        // effectively the same as 0 when the low bits are masked.
85bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        if (!result)
86bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            return 0x40000000;
87bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
88bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        return result;
89d0825bca7fe65beaee391d30da42e937db621564Steve Block    }
90d0825bca7fe65beaee391d30da42e937db621564Steve Block
912bde8e466a4451c7319e3a072d118917957d6554Steve Block    template<typename T, UChar Converter(T)> static inline unsigned computeHash(const T* data, unsigned length)
92bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    {
93bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        StringHasher hasher;
94bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        bool rem = length & 1;
95bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        length >>= 1;
96bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
97bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        while (length--) {
98bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            hasher.addCharacters(Converter(data[0]), Converter(data[1]));
99bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            data += 2;
100bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        }
101bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
102bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        if (rem)
103bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            hasher.addCharacter(Converter(*data));
104bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
105bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        return hasher.hash();
106d0825bca7fe65beaee391d30da42e937db621564Steve Block    }
107d0825bca7fe65beaee391d30da42e937db621564Steve Block
1082bde8e466a4451c7319e3a072d118917957d6554Steve Block    template<typename T, UChar Converter(T)> static inline unsigned computeHash(const T* data)
109bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    {
110bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        StringHasher hasher;
111bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
112bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        while (true) {
113bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            UChar b0 = Converter(*data++);
114bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            if (!b0)
115bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen                break;
116bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            UChar b1 = Converter(*data++);
117bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            if (!b1) {
118bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen                hasher.addCharacter(b0);
119bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen                break;
120bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            }
121bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
122bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen            hasher.addCharacters(b0, b1);
123d0825bca7fe65beaee391d30da42e937db621564Steve Block        }
124bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
125bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        return hasher.hash();
126bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    }
127bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
1282bde8e466a4451c7319e3a072d118917957d6554Steve Block    template<typename T> static inline unsigned computeHash(const T* data, unsigned length)
129bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    {
1302bde8e466a4451c7319e3a072d118917957d6554Steve Block        return computeHash<T, defaultCoverter>(data, length);
131d0825bca7fe65beaee391d30da42e937db621564Steve Block    }
132d0825bca7fe65beaee391d30da42e937db621564Steve Block
1332bde8e466a4451c7319e3a072d118917957d6554Steve Block    template<typename T> static inline unsigned computeHash(const T* data)
134bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    {
1352bde8e466a4451c7319e3a072d118917957d6554Steve Block        return computeHash<T, defaultCoverter>(data);
136bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    }
137d0825bca7fe65beaee391d30da42e937db621564Steve Block
1382bde8e466a4451c7319e3a072d118917957d6554Steve Block    template<size_t length> static inline unsigned hashMemory(const void* data)
139bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    {
140bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        COMPILE_ASSERT(!(length % 4), length_must_be_a_multible_of_four);
1412bde8e466a4451c7319e3a072d118917957d6554Steve Block        return computeHash<UChar>(static_cast<const UChar*>(data), length / sizeof(UChar));
142bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    }
143d0825bca7fe65beaee391d30da42e937db621564Steve Block
1442bde8e466a4451c7319e3a072d118917957d6554Steve Block    static inline unsigned hashMemory(const void* data, unsigned size)
1452fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    {
1462fc2651226baac27029e38c9d6ef883fa32084dbSteve Block        ASSERT(!(size % 2));
1472bde8e466a4451c7319e3a072d118917957d6554Steve Block        return computeHash<UChar>(static_cast<const UChar*>(data), size / sizeof(UChar));
1482fc2651226baac27029e38c9d6ef883fa32084dbSteve Block    }
1492fc2651226baac27029e38c9d6ef883fa32084dbSteve Block
150bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsenprivate:
151bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    static inline UChar defaultCoverter(UChar ch)
152bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    {
153bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        return ch;
154bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    }
155bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
156bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    static inline UChar defaultCoverter(char ch)
157bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    {
158bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        return static_cast<unsigned char>(ch);
159bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    }
160bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen
161bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    inline void addCharactersToHash(UChar a, UChar b)
162bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    {
163bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        m_hash += a;
164bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        unsigned tmp = (b << 11) ^ m_hash;
165bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        m_hash = (m_hash << 16) ^ tmp;
166bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen        m_hash += m_hash >> 11;
167bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    }
168d0825bca7fe65beaee391d30da42e937db621564Steve Block
169bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    unsigned m_hash;
170bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    bool m_hasPendingCharacter;
171bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen    UChar m_pendingCharacter;
172bec39347bb3bb5bf1187ccaf471d26247f28b585Kristian Monsen};
173d0825bca7fe65beaee391d30da42e937db621564Steve Block
174d0825bca7fe65beaee391d30da42e937db621564Steve Block} // namespace WTF
175d0825bca7fe65beaee391d30da42e937db621564Steve Block
1762bde8e466a4451c7319e3a072d118917957d6554Steve Blockusing WTF::StringHasher;
1772bde8e466a4451c7319e3a072d118917957d6554Steve Block
178a94275402997c11dd2e778633dacf4b7e630a35dBen Murdoch#endif // WTF_StringHasher_h
179