1/* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
2 * licenses follows.
3 */
4
5/* LibHnj - a library for high quality hyphenation and justification
6 * Copyright (C) 1998 Raph Levien,
7 * 	     (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
8 *           (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
9 *           (C) 2006, 2007, 2008 László Németh (nemeth at OOo)
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Library General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19 * Library General Public License for more details.
20 *
21 * You should have received a copy of the GNU Library General Public
22 * License along with this library; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA  02111-1307  USA.
25 */
26
27/*
28 * The contents of this file are subject to the Mozilla Public License
29 * Version 1.0 (the "MPL"); you may not use this file except in
30 * compliance with the MPL.  You may obtain a copy of the MPL at
31 * http://www.mozilla.org/MPL/
32 *
33 * Software distributed under the MPL is distributed on an "AS IS" basis,
34 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
35 * for the specific language governing rights and limitations under the
36 * MPL.
37 *
38 */
39#include <fcntl.h>
40#include <sys/mman.h>
41#include <sys/stat.h>
42#include <stdlib.h> /* for NULL, malloc */
43#include <stdio.h>  /* for fprintf */
44#include <string.h> /* for strdup */
45#include <unistd.h> /* for close */
46
47#define noVERBOSE
48
49#include "hnjalloc.h"
50#include "hyphen.h"
51
52static char *
53hnj_strdup (const char *s)
54{
55    char *new;
56    int l;
57
58    l = strlen (s);
59    new = hnj_malloc (l + 1);
60    memcpy (new, s, l);
61    new[l] = 0;
62    return new;
63}
64
65/* remove cross-platform text line end characters */
66void hnj_strchomp(char * s)
67{
68    int k = strlen(s);
69    if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
70    if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
71}
72
73/* a little bit of a hash table implementation. This simply maps strings
74   to state numbers */
75
76typedef struct _HashTab HashTab;
77typedef struct _HashEntry HashEntry;
78
79/* A cheap, but effective, hack. */
80#define HASH_SIZE 31627
81
82struct _HashTab {
83    HashEntry *entries[HASH_SIZE];
84};
85
86struct _HashEntry {
87    HashEntry *next;
88    char *key;
89    int val;
90};
91
92/* a char* hash function from ASU - adapted from Gtk+ */
93static unsigned int
94hnj_string_hash (const char *s)
95{
96    const char *p;
97    unsigned int h=0, g;
98    for(p = s; *p != '\0'; p += 1) {
99        h = ( h << 4 ) + *p;
100        if ( ( g = h & 0xf0000000 ) ) {
101            h = h ^ (g >> 24);
102            h = h ^ g;
103        }
104    }
105    return h /* % M */;
106}
107
108static HashTab *
109hnj_hash_new (void)
110{
111    HashTab *hashtab;
112    int i;
113
114    hashtab = hnj_malloc (sizeof(HashTab));
115    for (i = 0; i < HASH_SIZE; i++)
116        hashtab->entries[i] = NULL;
117
118    return hashtab;
119}
120
121static void
122hnj_hash_free (HashTab *hashtab)
123{
124    int i;
125    HashEntry *e, *next;
126
127    for (i = 0; i < HASH_SIZE; i++)
128        for (e = hashtab->entries[i]; e; e = next)
129        {
130            next = e->next;
131            hnj_free (e->key);
132            hnj_free (e);
133        }
134
135    hnj_free (hashtab);
136}
137
138/* assumes that key is not already present! */
139static void
140hnj_hash_insert (HashTab *hashtab, const char *key, int val)
141{
142    int i;
143    HashEntry *e;
144
145    i = hnj_string_hash (key) % HASH_SIZE;
146    e = hnj_malloc (sizeof(HashEntry));
147    e->next = hashtab->entries[i];
148    e->key = hnj_strdup (key);
149    e->val = val;
150    hashtab->entries[i] = e;
151}
152
153/* return val if found, otherwise -1 */
154static int
155hnj_hash_lookup (HashTab *hashtab, const char *key)
156{
157    int i;
158    HashEntry *e;
159    i = hnj_string_hash (key) % HASH_SIZE;
160    for (e = hashtab->entries[i]; e; e = e->next)
161        if (!strcmp (key, e->key))
162            return e->val;
163    return -1;
164}
165
166/* Get the state number, allocating a new state if necessary. */
167static int
168hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
169{
170    int state_num;
171
172    state_num = hnj_hash_lookup (hashtab, string);
173
174    if (state_num >= 0)
175        return state_num;
176
177    hnj_hash_insert (hashtab, string, dict->num_states);
178    /* predicate is true if dict->num_states is a power of two */
179    if (!(dict->num_states & (dict->num_states - 1)))
180    {
181        dict->states = hnj_realloc (dict->states,
182            (dict->num_states << 1) *
183            sizeof(HyphenState));
184    }
185    dict->states[dict->num_states].match = NULL;
186    dict->states[dict->num_states].repl = NULL;
187    dict->states[dict->num_states].fallback_state = -1;
188    dict->states[dict->num_states].num_trans = 0;
189    dict->states[dict->num_states].trans = NULL;
190    return dict->num_states++;
191}
192
193/* add a transition from state1 to state2 through ch - assumes that the
194   transition does not already exist */
195static void
196hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
197{
198    int num_trans;
199
200    num_trans = dict->states[state1].num_trans;
201    if (num_trans == 0)
202    {
203        dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
204    }
205    else if (!(num_trans & (num_trans - 1)))
206    {
207        dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
208            (num_trans << 1) *
209            sizeof(HyphenTrans));
210    }
211    dict->states[state1].trans[num_trans].ch = ch;
212    dict->states[state1].trans[num_trans].new_state = state2;
213    dict->states[state1].num_trans++;
214}
215
216#ifdef VERBOSE
217HashTab *global;
218
219static char *
220get_state_str (int state)
221{
222    int i;
223    HashEntry *e;
224
225    for (i = 0; i < HASH_SIZE; i++)
226        for (e = global->entries[i]; e; e = e->next)
227            if (e->val == state)
228                return e->key;
229    return NULL;
230}
231#endif
232
233// Get a line from the dictionary contents.
234static char *
235get_line (char *s, int size, const char *dict_contents, int dict_length,
236    int *dict_ptr)
237{
238    int len = 0;
239    while (len < (size - 1) && *dict_ptr < dict_length) {
240        s[len++] = *(dict_contents + *dict_ptr);
241        (*dict_ptr)++;
242        if (s[len - 1] == '\n')
243            break;
244    }
245    s[len] = '\0';
246    if (len > 0) {
247        return s;
248    } else {
249        return NULL;
250    }
251}
252
253HyphenDict *
254hnj_hyphen_load (const char *fn)
255{
256    if (fn == NULL)
257        return NULL;
258    const int fd = open(fn, O_RDONLY);
259    if (fd == -1)
260        return NULL;
261    struct stat sb;
262    if (fstat(fd, &sb) == -1)  {  /* To obtain file size */
263        close(fd);
264        return NULL;
265    }
266
267    const char *addr = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
268    if (addr == MAP_FAILED) {
269        close(fd);
270        return NULL;
271    }
272    HyphenDict *dict = hnj_hyphen_load_from_buffer(addr, sb.st_size);
273    munmap((void *)addr, sb.st_size);
274    close(fd);
275
276    return dict;
277}
278
279HyphenDict *
280hnj_hyphen_load_from_buffer (const char *dict_contents, int dict_length)
281{
282    HyphenDict *dict[2];
283    HashTab *hashtab;
284    char buf[MAX_CHARS];
285    char word[MAX_CHARS];
286    char pattern[MAX_CHARS];
287    char * repl;
288    signed char replindex;
289    signed char replcut;
290    int state_num = 0, last_state;
291    int i, j, k;
292    char ch;
293    int found;
294    HashEntry *e;
295    int nextlevel = 0;
296
297    if (dict_contents == NULL)
298        return NULL;
299
300    int dict_ptr = 0;
301// loading one or two dictionaries (separated by NEXTLEVEL keyword)
302    for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
303        hashtab = hnj_hash_new ();
304#ifdef VERBOSE
305        global = hashtab;
306#endif
307        hnj_hash_insert (hashtab, "", 0);
308        dict[k] = hnj_malloc (sizeof(HyphenDict));
309        dict[k]->num_states = 1;
310        dict[k]->states = hnj_malloc (sizeof(HyphenState));
311        dict[k]->states[0].match = NULL;
312        dict[k]->states[0].repl = NULL;
313        dict[k]->states[0].fallback_state = -1;
314        dict[k]->states[0].num_trans = 0;
315        dict[k]->states[0].trans = NULL;
316        dict[k]->nextlevel = NULL;
317        dict[k]->lhmin = 0;
318        dict[k]->rhmin = 0;
319        dict[k]->clhmin = 0;
320        dict[k]->crhmin = 0;
321
322        /* read in character set info */
323        if (k == 0) {
324            for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
325            get_line(dict[k]->cset, sizeof(dict[k]->cset), dict_contents,
326                dict_length, &dict_ptr);
327            for (i=0;i<MAX_NAME;i++)
328                if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
329                    dict[k]->cset[i] = 0;
330            dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
331        } else {
332            strcpy(dict[k]->cset, dict[0]->cset);
333            dict[k]->utf8 = dict[0]->utf8;
334        }
335
336        while (get_line(buf, sizeof(buf), dict_contents, dict_length,
337                &dict_ptr) != NULL)
338        {
339            if (buf[0] != '%')
340            {
341                if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
342                    nextlevel = 1;
343                    break;
344                } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
345                    dict[k]->lhmin = atoi(buf + 13);
346                    continue;
347                } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
348                    dict[k]->rhmin = atoi(buf + 14);
349                    continue;
350                } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
351                    dict[k]->clhmin = atoi(buf + 21);
352                    continue;
353                } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
354                    dict[k]->crhmin = atoi(buf + 22);
355                    continue;
356                }
357                j = 0;
358                pattern[j] = '0';
359                repl = strchr(buf, '/');
360                replindex = 0;
361                replcut = 0;
362                if (repl) {
363                    char * index = strchr(repl + 1, ',');
364                    *repl = '\0';
365                    if (index) {
366                        char * index2 = strchr(index + 1, ',');
367                        *index = '\0';
368                        if (index2) {
369                            *index2 = '\0';
370                            replindex = (signed char) atoi(index + 1) - 1;
371                            replcut = (signed char) atoi(index2 + 1);
372                        }
373                    } else {
374                        hnj_strchomp(repl + 1);
375                        replindex = 0;
376                        replcut = strlen(buf);
377                    }
378                    repl = hnj_strdup(repl + 1);
379                }
380                for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
381                {
382                    if (buf[i] >= '0' && buf[i] <= '9')
383                        pattern[j] = buf[i];
384                    else
385                    {
386                        word[j] = buf[i];
387                        pattern[++j] = '0';
388                    }
389                }
390                word[j] = '\0';
391                pattern[j + 1] = '\0';
392
393                i = 0;
394                if (!repl) {
395                    /* Optimize away leading zeroes */
396                    for (; pattern[i] == '0'; i++);
397                } else {
398                    if (*word == '.') i++;
399                    /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
400                    if (dict[k]->utf8) {
401                        int pu = -1;        /* unicode character position */
402                        int ps = -1;        /* unicode start position (original replindex) */
403                        int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
404                        for (; pc < (strlen(word) + 1); pc++) {
405                            /* beginning of an UTF-8 character (not '10' start bits) */
406                            if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
407                            if ((ps < 0) && (replindex == pu)) {
408                                ps = replindex;
409                                replindex = pc;
410                            }
411                            if ((ps >= 0) && ((pu - ps) == replcut)) {
412                                replcut = (pc - replindex);
413                                break;
414                            }
415                        }
416                        if (*word == '.') replindex--;
417                    }
418                }
419
420#ifdef VERBOSE
421                printf ("word %s pattern %s, j = %d  repl: %s\n", word, pattern + i, j, repl);
422#endif
423                found = hnj_hash_lookup (hashtab, word);
424                state_num = hnj_get_state (dict[k], hashtab, word);
425                dict[k]->states[state_num].match = hnj_strdup (pattern + i);
426                dict[k]->states[state_num].repl = repl;
427                dict[k]->states[state_num].replindex = replindex;
428                if (!replcut) {
429                    dict[k]->states[state_num].replcut = strlen(word);
430                } else {
431                    dict[k]->states[state_num].replcut = replcut;
432                }
433
434                /* now, put in the prefix transitions */
435                for (; found < 0 ;j--)
436                {
437                    last_state = state_num;
438                    ch = word[j - 1];
439                    word[j - 1] = '\0';
440                    found = hnj_hash_lookup (hashtab, word);
441                    state_num = hnj_get_state (dict[k], hashtab, word);
442                    hnj_add_trans (dict[k], state_num, last_state, ch);
443                }
444            }
445        }
446
447        /* Could do unioning of matches here (instead of the preprocessor script).
448           If we did, the pseudocode would look something like this:
449
450           foreach state in the hash table
451           foreach i = [1..length(state) - 1]
452           state to check is substr (state, i)
453           look it up
454           if found, and if there is a match, union the match in.
455
456           It's also possible to avoid the quadratic blowup by doing the
457           search in order of increasing state string sizes - then you
458           can break the loop after finding the first match.
459
460           This step should be optional in any case - if there is a
461           preprocessed rule table, it's always faster to use that.
462
463        */
464
465        /* put in the fallback states */
466        for (i = 0; i < HASH_SIZE; i++)
467            for (e = hashtab->entries[i]; e; e = e->next)
468            {
469                if (*(e->key)) for (j = 1; 1; j++)
470                               {
471                                   state_num = hnj_hash_lookup (hashtab, e->key + j);
472                                   if (state_num >= 0)
473                                       break;
474                               }
475                /* KBH: FIXME state 0 fallback_state should always be -1? */
476                if (e->val)
477                    dict[k]->states[e->val].fallback_state = state_num;
478            }
479#ifdef VERBOSE
480        for (i = 0; i < HASH_SIZE; i++)
481            for (e = hashtab->entries[i]; e; e = e->next)
482            {
483                printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
484                    dict[k]->states[e->val].fallback_state);
485                for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
486                    printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
487                        dict[k]->states[e->val].trans[j].new_state);
488            }
489#endif
490
491#ifndef VERBOSE
492        hnj_hash_free (hashtab);
493#endif
494        state_num = 0;
495    }
496    if (k == 2) dict[0]->nextlevel = dict[1];
497    return dict[0];
498}
499
500void hnj_hyphen_free (HyphenDict *dict)
501{
502    int state_num;
503    HyphenState *hstate;
504
505    for (state_num = 0; state_num < dict->num_states; state_num++)
506    {
507        hstate = &dict->states[state_num];
508        if (hstate->match)
509            hnj_free (hstate->match);
510        if (hstate->repl)
511            hnj_free (hstate->repl);
512        if (hstate->trans)
513            hnj_free (hstate->trans);
514    }
515    if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
516
517    hnj_free (dict->states);
518
519    hnj_free (dict);
520}
521
522#define MAX_WORD 256
523
524int hnj_hyphen_hyphenate (HyphenDict *dict,
525    const char *word, int word_size,
526    char *hyphens)
527{
528    char prep_word_buf[MAX_WORD];
529    char *prep_word;
530    int i, j, k;
531    int state;
532    char ch;
533    HyphenState *hstate;
534    char *match;
535    int offset;
536
537    if (word_size + 3 < MAX_WORD)
538        prep_word = prep_word_buf;
539    else
540        prep_word = hnj_malloc (word_size + 3);
541
542    j = 0;
543    prep_word[j++] = '.';
544
545    for (i = 0; i < word_size; i++)
546        prep_word[j++] = word[i];
547
548    prep_word[j++] = '.';
549    prep_word[j] = '\0';
550
551    for (i = 0; i < j; i++)
552        hyphens[i] = '0';
553
554#ifdef VERBOSE
555    printf ("prep_word = %s\n", prep_word);
556#endif
557
558    /* now, run the finite state machine */
559    state = 0;
560    for (i = 0; i < j; i++)
561    {
562        ch = prep_word[i];
563        for (;;)
564        {
565
566            if (state == -1) {
567                /* return 1; */
568                /*  KBH: FIXME shouldn't this be as follows? */
569                state = 0;
570                goto try_next_letter;
571            }
572
573#ifdef VERBOSE
574            char *state_str;
575            state_str = get_state_str (state);
576
577            for (k = 0; k < i - strlen (state_str); k++)
578                putchar (' ');
579            printf ("%s", state_str);
580#endif
581
582            hstate = &dict->states[state];
583            for (k = 0; k < hstate->num_trans; k++)
584                if (hstate->trans[k].ch == ch)
585                {
586                    state = hstate->trans[k].new_state;
587                    goto found_state;
588                }
589            state = hstate->fallback_state;
590#ifdef VERBOSE
591            printf (" falling back, fallback_state %d\n", state);
592#endif
593        }
594      found_state:
595#ifdef VERBOSE
596        printf ("found state %d\n",state);
597#endif
598        /* Additional optimization is possible here - especially,
599           elimination of trailing zeroes from the match. Leading zeroes
600           have already been optimized. */
601        match = dict->states[state].match;
602        /* replacing rules not handled by hyphen_hyphenate() */
603        if (match && !dict->states[state].repl)
604        {
605            offset = i + 1 - strlen (match);
606#ifdef VERBOSE
607            for (k = 0; k < offset; k++)
608                putchar (' ');
609            printf ("%s\n", match);
610#endif
611            /* This is a linear search because I tried a binary search and
612               found it to be just a teeny bit slower. */
613            for (k = 0; match[k]; k++)
614                if (hyphens[offset + k] < match[k])
615                    hyphens[offset + k] = match[k];
616        }
617
618        /* KBH: we need this to make sure we keep looking in a word */
619        /* for patterns even if the current character is not known in state 0 */
620        /* since patterns for hyphenation may occur anywhere in the word */
621      try_next_letter: ;
622
623    }
624#ifdef VERBOSE
625    for (i = 0; i < j; i++)
626        putchar (hyphens[i]);
627    putchar ('\n');
628#endif
629
630    for (i = 0; i < j - 4; i++)
631#if 0
632        if (hyphens[i + 1] & 1)
633            hyphens[i] = '-';
634#else
635    hyphens[i] = hyphens[i + 1];
636#endif
637    hyphens[0] = '0';
638    for (; i < word_size; i++)
639        hyphens[i] = '0';
640    hyphens[word_size] = '\0';
641
642    if (prep_word != prep_word_buf)
643        hnj_free (prep_word);
644
645    return 0;
646}
647
648/* character length of the first n byte of the input word */
649int hnj_hyphen_strnlen(const char * word, int n, int utf8)
650{
651    int i = 0;
652    int j = 0;
653    while (j < n && word[j] != '\0') {
654        i++;
655        for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
656    }
657    return i;
658}
659
660int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
661	char *** rep, int ** pos, int ** cut, int lhmin)
662{
663    int i, j;
664    for (i = 1, j = 0; i < lhmin && word[j] != '\0'; i++) do {
665            // check length of the non-standard part
666            if (*rep && *pos && *cut && (*rep)[j]) {
667                char * rh = strchr((*rep)[j], '=');
668                if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
669                        hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
670                    free((*rep)[j]);
671                    (*rep)[j] = NULL;
672                    hyphens[j] = '0';
673                }
674            } else {
675                hyphens[j] = '0';
676            }
677            j++;
678        } while (utf8 && (word[j + 1] & 0xc0) == 0xc0);
679    return 0;
680}
681
682int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
683	char *** rep, int ** pos, int ** cut, int rhmin)
684{
685    int i;
686    int j = word_size - 2;
687    for (i = 1; i < rhmin && j > 0; j--) {
688        // check length of the non-standard part
689        if (*rep && *pos && *cut && (*rep)[j]) {
690            char * rh = strchr((*rep)[j], '=');
691            if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
692                    hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
693                free((*rep)[j]);
694                (*rep)[j] = NULL;
695                hyphens[j] = '0';
696            }
697        } else {
698            hyphens[j] = '0';
699        }
700        if (!utf8 || (word[j] & 0xc0) != 0xc0) i++;
701    }
702    return 0;
703}
704
705// recursive function for compound level hyphenation
706int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
707    char * hyphens, char *** rep, int ** pos, int ** cut,
708    int clhmin, int crhmin, int lend, int rend)
709{
710    char prep_word_buf[MAX_WORD];
711    char *prep_word;
712    int i, j, k;
713    int state;
714    char ch;
715    HyphenState *hstate;
716    char *match;
717    char *repl;
718    signed char replindex;
719    signed char replcut;
720    int offset;
721    int matchlen_buf[MAX_CHARS];
722    int matchindex_buf[MAX_CHARS];
723    char * matchrepl_buf[MAX_CHARS];
724    int * matchlen;
725    int * matchindex;
726    char ** matchrepl;
727    int isrepl = 0;
728    int nHyphCount;
729
730    if (word_size + 3 < MAX_CHARS) {
731        prep_word = prep_word_buf;
732        matchlen = matchlen_buf;
733        matchindex = matchindex_buf;
734        matchrepl = matchrepl_buf;
735    } else {
736        prep_word = hnj_malloc (word_size + 3);
737        matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
738        matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
739        matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
740    }
741
742    j = 0;
743    prep_word[j++] = '.';
744
745    for (i = 0; i < word_size; i++)
746        prep_word[j++] = word[i];
747
748    prep_word[j++] = '.';
749    prep_word[j] = '\0';
750
751    for (i = 0; i < j; i++)
752        hyphens[i] = '0';
753
754#ifdef VERBOSE
755    printf ("prep_word = %s\n", prep_word);
756#endif
757
758    /* now, run the finite state machine */
759    state = 0;
760    for (i = 0; i < j; i++)
761    {
762        ch = prep_word[i];
763        for (;;)
764        {
765
766            if (state == -1) {
767                /* return 1; */
768                /*  KBH: FIXME shouldn't this be as follows? */
769                state = 0;
770                goto try_next_letter;
771            }
772
773#ifdef VERBOSE
774            char *state_str;
775            state_str = get_state_str (state);
776
777            for (k = 0; k < i - strlen (state_str); k++)
778                putchar (' ');
779            printf ("%s", state_str);
780#endif
781
782            hstate = &dict->states[state];
783            for (k = 0; k < hstate->num_trans; k++)
784                if (hstate->trans[k].ch == ch)
785                {
786                    state = hstate->trans[k].new_state;
787                    goto found_state;
788                }
789            state = hstate->fallback_state;
790#ifdef VERBOSE
791            printf (" falling back, fallback_state %d\n", state);
792#endif
793        }
794      found_state:
795#ifdef VERBOSE
796        printf ("found state %d\n",state);
797#endif
798        /* Additional optimization is possible here - especially,
799           elimination of trailing zeroes from the match. Leading zeroes
800           have already been optimized. */
801        match = dict->states[state].match;
802        repl = dict->states[state].repl;
803        replindex = dict->states[state].replindex;
804        replcut = dict->states[state].replcut;
805        /* replacing rules not handled by hyphen_hyphenate() */
806        if (match)
807        {
808            offset = i + 1 - strlen (match);
809#ifdef VERBOSE
810            for (k = 0; k < offset; k++)
811                putchar (' ');
812            printf ("%s (%s)\n", match, repl);
813#endif
814            if (repl) {
815                if (!isrepl) for(; isrepl < word_size; isrepl++) {
816                        matchrepl[isrepl] = NULL;
817                        matchindex[isrepl] = -1;
818                    }
819                matchlen[offset + replindex] = replcut;
820            }
821            /* This is a linear search because I tried a binary search and
822               found it to be just a teeny bit slower. */
823            for (k = 0; match[k]; k++) {
824                if ((hyphens[offset + k] < match[k])) {
825                    hyphens[offset + k] = match[k];
826                    if (match[k]&1) {
827                        matchrepl[offset + k] = repl;
828                        if (repl && (k >= replindex) && (k <= replindex + replcut)) {
829                            matchindex[offset + replindex] = offset + k;
830                        }
831                    }
832                }
833            }
834
835        }
836
837        /* KBH: we need this to make sure we keep looking in a word */
838        /* for patterns even if the current character is not known in state 0 */
839        /* since patterns for hyphenation may occur anywhere in the word */
840      try_next_letter: ;
841
842    }
843#ifdef VERBOSE
844    for (i = 0; i < j; i++)
845        putchar (hyphens[i]);
846    putchar ('\n');
847#endif
848
849    for (i = 0; i < j - 3; i++)
850#if 0
851        if (hyphens[i + 1] & 1)
852            hyphens[i] = '-';
853#else
854    hyphens[i] = hyphens[i + 1];
855#endif
856    for (; i < word_size; i++)
857        hyphens[i] = '0';
858    hyphens[word_size] = '\0';
859
860    /* now create a new char string showing hyphenation positions */
861    /* count the hyphens and allocate space for the new hyphenated string */
862    nHyphCount = 0;
863    for (i = 0; i < word_size; i++)
864        if (hyphens[i]&1)
865            nHyphCount++;
866    j = 0;
867    for (i = 0; i < word_size; i++) {
868        if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
869            if (rep && pos && cut) {
870                if (!*rep && !*pos && !*cut) {
871                    int k;
872                    *rep = (char **) malloc(sizeof(char *) * word_size);
873                    *pos = (int *) malloc(sizeof(int) * word_size);
874                    *cut = (int *) malloc(sizeof(int) * word_size);
875                    for (k = 0; k < word_size; k++) {
876                        (*rep)[k] = NULL;
877                        (*pos)[k] = 0;
878                        (*cut)[k] = 0;
879                    }
880                }
881                (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
882                (*pos)[matchindex[i] - 1] = matchindex[i] - i;
883                (*cut)[matchindex[i] - 1] = matchlen[i];
884            }
885            j += strlen(matchrepl[matchindex[i]]);
886            i += matchlen[i] - 1;
887        }
888    }
889
890    if (matchrepl != matchrepl_buf) {
891        hnj_free (matchrepl);
892        hnj_free (matchlen);
893        hnj_free (matchindex);
894    }
895
896    // recursive hyphenation of the first (compound) level segments
897    if (dict->nextlevel) {
898        char * rep2_buf[MAX_WORD];
899        int pos2_buf[MAX_WORD];
900        int cut2_buf[MAX_WORD];
901        char hyphens2_buf[MAX_WORD];
902        char ** rep2;
903        int * pos2;
904        int * cut2;
905        char * hyphens2;
906        int begin = 0;
907        if (word_size < MAX_CHARS) {
908            rep2 = rep2_buf;
909            pos2 = pos2_buf;
910            cut2 = cut2_buf;
911            hyphens2 = hyphens2_buf;
912        } else {
913            rep2 = hnj_malloc (word_size * sizeof(char *));
914            pos2 = hnj_malloc (word_size * sizeof(int));
915            cut2 = hnj_malloc (word_size * sizeof(int));
916            hyphens2 = hnj_malloc (word_size);
917        }
918        for (i = 0; i < word_size; i++) rep2[i] = NULL;
919        for (i = 0; i < word_size; i++)
920            if (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
921                if (i - begin > 1) {
922                    int hyph = 0;
923                    prep_word[i + 2] = '\0';
924                    /* non-standard hyphenation at compound boundary (Schiffahrt) */
925                    if (*rep && *pos && *cut && (*rep)[i]) {
926                        char * l = strchr((*rep)[i], '=');
927                        strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);
928                        if (l) {
929                            hyph = (l - (*rep)[i]) - (*pos)[i];
930                            prep_word[2 + i + hyph] = '\0';
931                        }
932                    }
933                    hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
934                        hyphens2, &rep2, &pos2, &cut2, clhmin,
935                        crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
936                    for (j = 0; j < i - begin - 1; j++) {
937                        hyphens[begin + j] = hyphens2[j];
938                        if (rep2[j] && rep && pos && cut) {
939                            if (!*rep && !*pos && !*cut) {
940                                int k;
941                                *rep = (char **) malloc(sizeof(char *) * word_size);
942                                *pos = (int *) malloc(sizeof(int) * word_size);
943                                *cut = (int *) malloc(sizeof(int) * word_size);
944                                for (k = 0; k < word_size; k++) {
945                                    (*rep)[k] = NULL;
946                                    (*pos)[k] = 0;
947                                    (*cut)[k] = 0;
948                                }
949                            }
950                            (*rep)[begin + j] = rep2[j];
951                            (*pos)[begin + j] = pos2[j];
952                            (*cut)[begin + j] = cut2[j];
953                        }
954                    }
955                    prep_word[i + 2] = word[i + 1];
956                    if (*rep && *pos && *cut && (*rep)[i]) {
957                        strcpy(prep_word + 1, word);
958                    }
959                }
960                begin = i + 1;
961                for (j = 0; j < word_size; j++) rep2[j] = NULL;
962            }
963
964        // non-compound
965        if (begin == 0) {
966            hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
967                hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
968            if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
969                rep, pos, cut, clhmin);
970            if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
971                rep, pos, cut, crhmin);
972        }
973
974        if (rep2 != rep2_buf) {
975            free(rep2);
976            free(cut2);
977            free(pos2);
978            free(hyphens2);
979        }
980    }
981
982    if (prep_word != prep_word_buf) hnj_free (prep_word);
983    return 0;
984}
985
986/* UTF-8 normalization of hyphen and non-standard positions */
987int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
988	char *** rep, int ** pos, int ** cut)
989{
990    if ((((unsigned char) word[0]) >> 6) == 2) {
991        fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
992        return 1;
993    }
994
995    /* calculate UTF-8 character positions */
996    int i, j, k;
997    for (i = 0, j = -1; i < word_size; i++) {
998        /* beginning of an UTF-8 character (not '10' start bits) */
999        if ((((unsigned char) word[i]) >> 6) != 2) j++;
1000        hyphens[j] = hyphens[i];
1001        if (rep && pos && cut && *rep && *pos && *cut) {
1002            int l = (*pos)[i];
1003            (*pos)[j] = 0;
1004            for (k = 0; k < l; k++) {
1005                if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
1006            }
1007            k = i - l + 1;
1008            l = k + (*cut)[i];
1009            (*cut)[j] = 0;
1010            for (; k < l; k++) {
1011                if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
1012            }
1013            (*rep)[j] = (*rep)[i];
1014            if (j < i) {
1015                (*rep)[i] = NULL;
1016                (*pos)[i] = 0;
1017                (*cut)[i] = 0;
1018            }
1019        }
1020    }
1021    hyphens[j + 1] = '\0';
1022    return 0;
1023}
1024
1025/* get the word with all possible hyphenations (output: hyphword) */
1026void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
1027    char * hyphword, char *** rep, int ** pos, int ** cut)
1028{
1029    int i, j;
1030    for (i = 0, j = 0; i < l; i++, j++) {
1031        if (hyphens[i]&1) {
1032            hyphword[j] = word[i];
1033            if (*rep && *pos && *cut && (*rep)[i]) {
1034                strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]);
1035                j += strlen((*rep)[i]) - (*pos)[i];
1036                i += (*cut)[i] - (*pos)[i];
1037            } else hyphword[++j] = '=';
1038        } else hyphword[j] = word[i];
1039    }
1040    hyphword[j] = '\0';
1041}
1042
1043
1044/* main api function with default hyphenmin parameters */
1045int hnj_hyphen_hyphenate2 (HyphenDict *dict,
1046    const char *word, int word_size, char * hyphens,
1047    char *hyphword, char *** rep, int ** pos, int ** cut)
1048{
1049    hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1050        dict->clhmin, dict->crhmin, 1, 1);
1051    hnj_hyphen_lhmin(dict->utf8, word, word_size,
1052        hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
1053    hnj_hyphen_rhmin(dict->utf8, word, word_size,
1054        hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
1055    if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1056    if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1057    return 0;
1058}
1059
1060/* previous main api function with hyphenmin parameters */
1061int hnj_hyphen_hyphenate3 (HyphenDict *dict,
1062	const char *word, int word_size, char * hyphens,
1063	char *hyphword, char *** rep, int ** pos, int ** cut,
1064	int lhmin, int rhmin, int clhmin, int crhmin)
1065{
1066    lhmin = (lhmin > 0 ? lhmin : dict->lhmin);
1067    rhmin = (rhmin > 0 ? rhmin : dict->rhmin);
1068    hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1069        clhmin, crhmin, 1, 1);
1070    hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1071        rep, pos, cut, (lhmin > 0 ? lhmin : 2));
1072    hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1073        rep, pos, cut, (rhmin > 0 ? rhmin : 2));
1074    if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1075    if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1076    return 0;
1077}
1078