153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel/* stringlib: fastsearch implementation */ 253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#ifndef STRINGLIB_FASTSEARCH_H 453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#define STRINGLIB_FASTSEARCH_H 553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel/* fast search/count implementation, based on a mix between boyer- 753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel moore and horspool, with a few more bells and whistles on the top. 853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel for some more background, see: http://effbot.org/zone/stringlib.htm */ 953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 1053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel/* note: fastsearch may access s[n], which isn't a problem when using 1153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel Python's ordinary string types, but may cause problems if you're 1253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel using this code in other contexts. also, the count mode returns -1 1353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if there cannot possible be a match in the target string, and 0 if 1453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel it has actually checked for matches, but didn't find any. callers 1553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel beware! */ 1653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 1753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#define FAST_COUNT 0 1853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#define FAST_SEARCH 1 1953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#define FAST_RSEARCH 2 2053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 2153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#if LONG_BIT >= 128 2253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#define STRINGLIB_BLOOM_WIDTH 128 2353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#elif LONG_BIT >= 64 2453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#define STRINGLIB_BLOOM_WIDTH 64 2553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#elif LONG_BIT >= 32 2653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#define STRINGLIB_BLOOM_WIDTH 32 2753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#else 2853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#error "LONG_BIT is smaller than 32" 2953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#endif 3053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 3153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#define STRINGLIB_BLOOM_ADD(mask, ch) \ 3253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) 3353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#define STRINGLIB_BLOOM(mask, ch) \ 3453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) 3553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 3653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDanielPy_LOCAL_INLINE(Py_ssize_t) 3753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDanielfastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, 3853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel const STRINGLIB_CHAR* p, Py_ssize_t m, 3953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel Py_ssize_t maxcount, int mode) 4053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel{ 4153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel unsigned long mask; 4253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel Py_ssize_t skip, count = 0; 4353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel Py_ssize_t i, j, mlast, w; 4453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 4553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel w = n - m; 4653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 4753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) 4853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel return -1; 4953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 5053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* look for special cases */ 5153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (m <= 1) { 5253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (m <= 0) 5353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel return -1; 5453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* use special case for 1-character strings */ 5553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (mode == FAST_COUNT) { 5653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel for (i = 0; i < n; i++) 5753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (s[i] == p[0]) { 5853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel count++; 5953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (count == maxcount) 6053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel return maxcount; 6153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } 6253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel return count; 6353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } else if (mode == FAST_SEARCH) { 6453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel for (i = 0; i < n; i++) 6553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (s[i] == p[0]) 6653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel return i; 6753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } else { /* FAST_RSEARCH */ 6853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel for (i = n - 1; i > -1; i--) 6953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (s[i] == p[0]) 7053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel return i; 7153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } 7253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel return -1; 7353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } 7453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 7553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel mlast = m - 1; 7653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel skip = mlast - 1; 7753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel mask = 0; 7853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 7953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (mode != FAST_RSEARCH) { 8053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 8153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* create compressed boyer-moore delta 1 table */ 8253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 8353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* process pattern[:-1] */ 8453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel for (i = 0; i < mlast; i++) { 8553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel STRINGLIB_BLOOM_ADD(mask, p[i]); 8653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (p[i] == p[mlast]) 8753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel skip = mlast - i - 1; 8853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } 8953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* process pattern[-1] outside the loop */ 9053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel STRINGLIB_BLOOM_ADD(mask, p[mlast]); 9153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 9253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel for (i = 0; i <= w; i++) { 9353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* note: using mlast in the skip path slows things down on x86 */ 9453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (s[i+m-1] == p[m-1]) { 9553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* candidate match */ 9653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel for (j = 0; j < mlast; j++) 9753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (s[i+j] != p[j]) 9853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel break; 9953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (j == mlast) { 10053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* got a match! */ 10153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (mode != FAST_COUNT) 10253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel return i; 10353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel count++; 10453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (count == maxcount) 10553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel return maxcount; 10653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel i = i + mlast; 10753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel continue; 10853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } 10953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* miss: check if next character is part of pattern */ 11053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (!STRINGLIB_BLOOM(mask, s[i+m])) 11153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel i = i + m; 11253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel else 11353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel i = i + skip; 11453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } else { 11553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* skip: check if next character is part of pattern */ 11653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (!STRINGLIB_BLOOM(mask, s[i+m])) 11753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel i = i + m; 11853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } 11953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } 12053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } else { /* FAST_RSEARCH */ 12153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 12253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* create compressed boyer-moore delta 1 table */ 12353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 12453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* process pattern[0] outside the loop */ 12553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel STRINGLIB_BLOOM_ADD(mask, p[0]); 12653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* process pattern[:0:-1] */ 12753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel for (i = mlast; i > 0; i--) { 12853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel STRINGLIB_BLOOM_ADD(mask, p[i]); 12953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (p[i] == p[0]) 13053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel skip = i - 1; 13153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } 13253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 13353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel for (i = w; i >= 0; i--) { 13453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (s[i] == p[0]) { 13553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* candidate match */ 13653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel for (j = mlast; j > 0; j--) 13753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (s[i+j] != p[j]) 13853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel break; 13953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (j == 0) 14053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* got a match! */ 14153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel return i; 14253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* miss: check if previous character is part of pattern */ 14353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) 14453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel i = i - m; 14553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel else 14653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel i = i - skip; 14753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } else { 14853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel /* skip: check if previous character is part of pattern */ 14953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) 15053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel i = i - m; 15153b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } 15253b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } 15353b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel } 15453b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 15553b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel if (mode != FAST_COUNT) 15653b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel return -1; 15753b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel return count; 15853b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel} 15953b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel 16053b2ba5790b57b3dcdfbb9fa5835a979d94908faDaryl McDaniel#endif 161