1// Copyright 2014 PDFium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#include "../../include/fpdfapi/fpdf_resource.h"
8#include "../../include/fpdfapi/fpdf_pageobj.h"
9#include "../../include/fpdftext/fpdf_text.h"
10#include "../../include/fpdfapi/fpdf_page.h"
11#include "../../include/fpdfapi/fpdf_module.h"
12#include <ctype.h>
13#include "text_int.h"
14FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar)
15{
16    if(curChar < 255 ) {
17        return FALSE;
18    }
19    if ( (curChar >= 0x0600 && curChar <= 0x06FF)
20            || (curChar >= 0xFE70 && curChar <= 0xFEFF)
21            || (curChar >= 0xFB50 && curChar <= 0xFDFF)
22            || (curChar >= 0x0400 && curChar <= 0x04FF)
23            || (curChar >= 0x0500 && curChar <= 0x052F)
24            || (curChar >= 0xA640 && curChar <= 0xA69F)
25            || (curChar >= 0x2DE0 && curChar <= 0x2DFF)
26            || curChar == 8467
27            || (curChar >= 0x2000 && curChar <= 0x206F)) {
28        return FALSE;
29    }
30    return TRUE;
31}
32CPDFText_ParseOptions::CPDFText_ParseOptions()
33    : m_bGetCharCodeOnly(FALSE), m_bNormalizeObjs(TRUE), m_bOutputHyphen(FALSE)
34{
35}
36IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions)
37{
38    CPDF_TextPage* pTextPageEx = FX_NEW CPDF_TextPage(pPage, ParserOptions);
39    return pTextPageEx;
40}
41IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, int flags)
42{
43    CPDF_TextPage* pTextPage = FX_NEW CPDF_TextPage(pPage, flags);
44    return	pTextPage;
45}
46IPDF_TextPage*	IPDF_TextPage::CreateTextPage(const CPDF_PageObjects* pObjs, int flags)
47{
48    CPDF_TextPage* pTextPage = FX_NEW CPDF_TextPage(pObjs, flags);
49    return	pTextPage;
50}
51IPDF_TextPageFind*	IPDF_TextPageFind::CreatePageFind(const IPDF_TextPage* pTextPage)
52{
53    if (!pTextPage) {
54        return NULL;
55    }
56    return FX_NEW CPDF_TextPageFind(pTextPage);
57}
58IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract()
59{
60    return FX_NEW CPDF_LinkExtract();
61}
// Wide-character constants for the separators used by the text extractor.
#define TEXT_BLANK_CHAR         L' '
#define TEXT_LINEFEED_CHAR      L'\n'
#define TEXT_RETURN_CHAR        L'\r'
// Wide-string counterparts of the separators above.
#define TEXT_EMPTY              L""
#define TEXT_BLANK              L" "
#define TEXT_RETURN_LINEFEED    L"\r\n"
#define TEXT_LINEFEED           L"\n"
// NOTE(review): not referenced in the visible part of this file; presumably a
// character-gap ratio threshold -- confirm at its use site.
#define TEXT_CHARRATIO_GAPDELTA 0.070
70CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, int flags)
71    : m_pPreTextObj(NULL),
72      m_IsParsered(FALSE),
73      m_charList(512),
74      m_TempCharList(50),
75      m_TextlineDir(-1),
76      m_CurlineRect(0, 0, 0, 0)
77{
78    m_pPage = pPage;
79    m_parserflag = flags;
80    m_TextBuf.EstimateSize(0, 10240);
81    pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int) pPage->GetPageWidth(), (int)pPage->GetPageHeight(), 0);
82}
83CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions)
84    : m_pPreTextObj(NULL)
85    , m_IsParsered(FALSE)
86    , m_charList(512)
87    , m_TempCharList(50)
88    , m_TextlineDir(-1)
89    , m_CurlineRect(0, 0, 0, 0)
90    , m_ParseOptions(ParserOptions)
91{
92    m_pPage = pPage;
93    m_parserflag = 0;
94    m_TextBuf.EstimateSize(0, 10240);
95    pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int) pPage->GetPageWidth(), (int)pPage->GetPageHeight(), 0);
96}
97CPDF_TextPage::CPDF_TextPage(const CPDF_PageObjects* pPage, int flags)
98    : m_pPreTextObj(NULL),
99      m_IsParsered(FALSE),
100      m_charList(512),
101      m_TempCharList(50),
102      m_TextlineDir(-1),
103      m_CurlineRect(0, 0, 0, 0)
104{
105    m_pPage = pPage;
106    m_parserflag = flags;
107    m_TextBuf.EstimateSize(0, 10240);
108    CFX_FloatRect pageRect = pPage->CalcBoundingBox();
109    m_DisplayMatrix = CFX_AffineMatrix(1, 0, 0, -1, pageRect.right, pageRect.top);
110}
111void CPDF_TextPage::NormalizeObjects(FX_BOOL bNormalize)
112{
113    m_ParseOptions.m_bNormalizeObjs = bNormalize;
114}
115FX_BOOL CPDF_TextPage::IsControlChar(PAGECHAR_INFO* pCharInfo)
116{
117    if(!pCharInfo) {
118        return FALSE;
119    }
120    switch(pCharInfo->m_Unicode) {
121        case 0x2:
122        case 0x3:
123        case 0x93:
124        case 0x94:
125        case 0x96:
126        case 0x97:
127        case 0x98:
128        case 0xfffe:
129            if(pCharInfo->m_Flag == FPDFTEXT_CHAR_HYPHEN) {
130                return FALSE;
131            } else {
132                return TRUE;
133            }
134        default:
135            return FALSE;
136    }
137}
138FX_BOOL CPDF_TextPage::ParseTextPage()
139{
140    if (!m_pPage) {
141        m_IsParsered = FALSE;
142        return FALSE;
143    }
144    m_IsParsered = FALSE;
145    m_TextBuf.Clear();
146    m_charList.RemoveAll();
147    m_pPreTextObj = NULL;
148    ProcessObject();
149    m_IsParsered = TRUE;
150    if(!m_ParseOptions.m_bGetCharCodeOnly) {
151        m_CharIndex.RemoveAll();
152        int nCount = m_charList.GetSize();
153        if(nCount) {
154            m_CharIndex.Add(0);
155        }
156        for(int i = 0; i < nCount; i++) {
157            int indexSize = m_CharIndex.GetSize();
158            FX_BOOL bNormal = FALSE;
159            PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(i);
160            if(charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) {
161                bNormal = TRUE;
162            }
163#ifdef FOXIT_CHROME_BUILD
164            else if(charinfo.m_Unicode == 0 || IsControlChar(&charinfo))
165#else
166            else if(charinfo.m_Unicode == 0)
167#endif
168                bNormal = FALSE;
169            else {
170                bNormal = TRUE;
171            }
172            if(bNormal) {
173                if(indexSize % 2) {
174                    m_CharIndex.Add(1);
175                } else {
176                    if(indexSize <= 0) {
177                        continue;
178                    }
179                    m_CharIndex.SetAt(indexSize - 1, m_CharIndex.GetAt(indexSize - 1) + 1);
180                }
181            } else {
182                if(indexSize % 2) {
183                    if(indexSize <= 0) {
184                        continue;
185                    }
186                    m_CharIndex.SetAt(indexSize - 1, i + 1);
187                } else {
188                    m_CharIndex.Add(i + 1);
189                }
190            }
191        }
192        int indexSize = m_CharIndex.GetSize();
193        if(indexSize % 2) {
194            m_CharIndex.RemoveAt(indexSize - 1);
195        }
196    }
197    return TRUE;
198}
199int	CPDF_TextPage::CountChars() const
200{
201    if(m_ParseOptions.m_bGetCharCodeOnly) {
202        return m_TextBuf.GetSize();
203    }
204    return m_charList.GetSize();
205}
206int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const
207{
208    int indexSize = m_CharIndex.GetSize();
209    int count = 0;
210    for(int i = 0; i < indexSize; i += 2) {
211        count += m_CharIndex.GetAt(i + 1);
212        if(count > TextIndex) {
213            return 	TextIndex - count + m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i);
214        }
215    }
216    return -1;
217}
218int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const
219{
220    int indexSize = m_CharIndex.GetSize();
221    int count = 0;
222    for(int i = 0; i < indexSize; i += 2) {
223        count += m_CharIndex.GetAt(i + 1);
224        if(m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i) > CharIndex) {
225            if(CharIndex - m_CharIndex.GetAt(i) < 0) {
226                return -1;
227            }
228            return 	CharIndex - m_CharIndex.GetAt(i) + count - m_CharIndex.GetAt(i + 1);
229        }
230    }
231    return -1;
232}
233void CPDF_TextPage::GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const
234{
235    if(m_ParseOptions.m_bGetCharCodeOnly) {
236        return;
237    }
238    if(start < 0 || nCount == 0) {
239        return;
240    }
241    if (!m_IsParsered)	{
242        return;
243    }
244    PAGECHAR_INFO		info_curchar;
245    CPDF_TextObject*	pCurObj = NULL;
246    CFX_FloatRect		rect;
247    int					curPos = start;
248    FX_BOOL				flagNewRect = TRUE;
249    if (nCount + start > m_charList.GetSize() || nCount == -1) {
250        nCount = m_charList.GetSize() - start;
251    }
252    while (nCount--) {
253        info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(curPos++);
254        if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) {
255            continue;
256        }
257        if(info_curchar.m_CharBox.Width() < 0.01 || info_curchar.m_CharBox.Height() < 0.01) {
258            continue;
259        }
260        if(!pCurObj) {
261            pCurObj = info_curchar.m_pTextObj;
262        }
263        if (pCurObj != info_curchar.m_pTextObj) {
264            rectArray.Add(rect);
265            pCurObj = info_curchar.m_pTextObj;
266            flagNewRect = TRUE;
267        }
268        if (flagNewRect) {
269            FX_FLOAT orgX = info_curchar.m_OriginX, orgY = info_curchar.m_OriginY;
270            CFX_AffineMatrix matrix, matrix_reverse;
271            info_curchar.m_pTextObj->GetTextMatrix(&matrix);
272            matrix.Concat(info_curchar.m_Matrix);
273            matrix_reverse.SetReverse(matrix);
274            matrix_reverse.Transform(orgX, orgY);
275            rect.left = info_curchar.m_CharBox.left;
276            rect.right = info_curchar.m_CharBox.right;
277            if (pCurObj->GetFont()->GetTypeDescent()) {
278                rect.bottom = orgY + pCurObj->GetFont()->GetTypeDescent() * pCurObj->GetFontSize() / 1000;
279                FX_FLOAT xPosTemp = orgX;
280                matrix.Transform(xPosTemp, rect.bottom);
281            } else {
282                rect.bottom = info_curchar.m_CharBox.bottom;
283            }
284            if (pCurObj->GetFont()->GetTypeAscent()) {
285                rect.top = orgY + pCurObj->GetFont()->GetTypeAscent() * pCurObj->GetFontSize() / 1000;
286                FX_FLOAT xPosTemp = orgX + GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) * pCurObj->GetFontSize() / 1000;
287                matrix.Transform(xPosTemp, rect.top);
288            } else {
289                rect.top = info_curchar.m_CharBox.top;
290            }
291            flagNewRect = FALSE;
292            rect = info_curchar.m_CharBox;
293            rect.Normalize();
294        } else {
295            info_curchar.m_CharBox.Normalize();
296            if (rect.left > info_curchar.m_CharBox.left) {
297                rect.left = info_curchar.m_CharBox.left;
298            }
299            if (rect.right < info_curchar.m_CharBox.right) {
300                rect.right = info_curchar.m_CharBox.right;
301            }
302            if ( rect.top < info_curchar.m_CharBox.top) {
303                rect.top = info_curchar.m_CharBox.top;
304            }
305            if (rect.bottom > info_curchar.m_CharBox.bottom) {
306                rect.bottom = info_curchar.m_CharBox.bottom;
307            }
308        }
309    }
310    rectArray.Add(rect);
311    return;
312}
313int CPDF_TextPage::GetIndexAtPos(CPDF_Point point , FX_FLOAT xTorelance, FX_FLOAT yTorelance) const
314{
315    if(m_ParseOptions.m_bGetCharCodeOnly) {
316        return -3;
317    }
318    if (!m_IsParsered)	{
319        return	-3;
320    }
321    FX_FLOAT distance = 0;
322    int pos = 0;
323    int NearPos = -1;
324    double xdif = 5000, ydif = 5000;
325    while(pos < m_charList.GetSize()) {
326        PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)(m_charList.GetAt(pos));
327        CFX_FloatRect charrect = charinfo.m_CharBox;
328        if (charrect.Contains(point.x, point.y)) {
329            break;
330        }
331        if (xTorelance > 0 || yTorelance > 0) {
332            CFX_FloatRect charRectExt;
333            charrect.Normalize();
334            charRectExt.left = charrect.left - xTorelance / 2;
335            charRectExt.right = charrect.right + xTorelance / 2;
336            charRectExt.top = charrect.top + yTorelance / 2;
337            charRectExt.bottom = charrect.bottom - yTorelance / 2;
338            if (charRectExt.Contains(point.x, point.y)) {
339                double curXdif, curYdif;
340                curXdif = FXSYS_fabs(point.x - charrect.left) < FXSYS_fabs(point.x - charrect.right) ? FXSYS_fabs(point.x - charrect.left) : FXSYS_fabs(point.x - charrect.right);
341                curYdif = FXSYS_fabs(point.y - charrect.bottom) < FXSYS_fabs(point.y - charrect.top	) ? FXSYS_fabs(point.y - charrect.bottom) : FXSYS_fabs(point.y - charrect.top);
342                if (curYdif + curXdif < xdif + ydif) {
343                    ydif = curYdif;
344                    xdif = curXdif;
345                    NearPos = pos;
346                }
347            }
348        }
349        ++pos;
350    }
351    if (pos >= m_charList.GetSize()) {
352        pos = NearPos;
353    }
354    return pos;
355}
356CFX_WideString CPDF_TextPage::GetTextByRect(CFX_FloatRect rect) const
357{
358    CFX_WideString strText;
359    if(m_ParseOptions.m_bGetCharCodeOnly || !m_IsParsered) {
360        return strText;
361    }
362    int nCount = m_charList.GetSize();
363    int pos = 0;
364    FX_FLOAT posy = 0;
365    FX_BOOL IsContainPreChar = FALSE;
366    FX_BOOL	ISAddLineFeed = FALSE;
367    while (pos < nCount) {
368        PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos++);
369        if (IsRectIntersect(rect, charinfo.m_CharBox)) {
370            if (FXSYS_fabs(posy - charinfo.m_OriginY) > 0 && !IsContainPreChar && ISAddLineFeed) {
371                posy = charinfo.m_OriginY;
372                if (strText.GetLength() > 0) {
373                    strText += L"\r\n";
374                }
375            }
376            IsContainPreChar = TRUE;
377            ISAddLineFeed = FALSE;
378            if (charinfo.m_Unicode) {
379                strText += charinfo.m_Unicode;
380            }
381        } else if (charinfo.m_Unicode == 32) {
382            if (IsContainPreChar && charinfo.m_Unicode) {
383                strText += charinfo.m_Unicode;
384                IsContainPreChar = FALSE;
385                ISAddLineFeed = FALSE;
386            }
387        } else {
388            IsContainPreChar = FALSE;
389            ISAddLineFeed = TRUE;
390        }
391    }
392    return strText;
393}
394void CPDF_TextPage::GetRectsArrayByRect(CFX_FloatRect rect, CFX_RectArray& resRectArray) const
395{
396    if(m_ParseOptions.m_bGetCharCodeOnly) {
397        return;
398    }
399    if (!m_IsParsered)	{
400        return;
401    }
402    CFX_FloatRect		curRect;
403    FX_BOOL				flagNewRect = TRUE;
404    CPDF_TextObject*	pCurObj = NULL;
405    int nCount = m_charList.GetSize();
406    int pos = 0;
407    while (pos < nCount) {
408        PAGECHAR_INFO info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(pos++);
409        if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) {
410            continue;
411        }
412        if(pos == 494) {
413            int a = 0;
414        }
415        if (IsRectIntersect(rect, info_curchar.m_CharBox)) {
416            if(!pCurObj) {
417                pCurObj = info_curchar.m_pTextObj;
418            }
419            if (pCurObj != info_curchar.m_pTextObj) {
420                resRectArray.Add(curRect);
421                pCurObj = info_curchar.m_pTextObj;
422                flagNewRect = TRUE;
423            }
424            if (flagNewRect) {
425                curRect = info_curchar.m_CharBox;
426                flagNewRect = FALSE;
427                curRect.Normalize();
428            } else {
429                info_curchar.m_CharBox.Normalize();
430                if (curRect.left > info_curchar.m_CharBox.left) {
431                    curRect.left = info_curchar.m_CharBox.left;
432                }
433                if (curRect.right < info_curchar.m_CharBox.right) {
434                    curRect.right = info_curchar.m_CharBox.right;
435                }
436                if ( curRect.top < info_curchar.m_CharBox.top) {
437                    curRect.top = info_curchar.m_CharBox.top;
438                }
439                if (curRect.bottom > info_curchar.m_CharBox.bottom) {
440                    curRect.bottom = info_curchar.m_CharBox.bottom;
441                }
442            }
443        }
444    }
445    resRectArray.Add(curRect);
446    return;
447}
448int	CPDF_TextPage::GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const
449{
450    if(m_ParseOptions.m_bGetCharCodeOnly) {
451        return -3;
452    }
453    CPDF_Point point(x, y);
454    return GetIndexAtPos(point, xTorelance, yTorelance);
455}
456int CPDF_TextPage::GetOrderByDirection(int order, int direction) const
457{
458    if(m_ParseOptions.m_bGetCharCodeOnly) {
459        return -3;
460    }
461    if (!m_IsParsered) {
462        return -3;
463    }
464    if (direction == FPDFTEXT_RIGHT || direction == FPDFTEXT_LEFT) {
465        order += direction;
466        while(order >= 0 && order < m_charList.GetSize()) {
467            PAGECHAR_INFO cinfo = *(PAGECHAR_INFO*)m_charList.GetAt(order);
468            if (cinfo.m_Flag != FPDFTEXT_CHAR_GENERATED) {
469                break;
470            } else {
471                if (cinfo.m_Unicode == TEXT_LINEFEED_CHAR || cinfo.m_Unicode == TEXT_RETURN_CHAR) {
472                    order += direction;
473                } else {
474                    break;
475                }
476            }
477        }
478        if (order >= m_charList.GetSize()) {
479            order = -2;
480        }
481        return order;
482    }
483    PAGECHAR_INFO charinfo;
484    charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(order);
485    CPDF_Point curPos(charinfo.m_OriginX, charinfo.m_OriginY);
486    FX_FLOAT difPosY = 0.0, minXdif = 1000;
487    int	minIndex = -2;
488    int index = order;
489    FX_FLOAT height = charinfo.m_CharBox.Height();
490    if (direction == FPDFTEXT_UP) {
491        minIndex = -1;
492        while (1) {
493            if (--index < 0)	{
494                return -1;
495            }
496            charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
497            if (FXSYS_fabs(charinfo.m_OriginY - curPos.y) > FX_MAX(height, charinfo.m_CharBox.Height()) / 2) {
498                difPosY = charinfo.m_OriginY;
499                minIndex = index;
500                break;
501            }
502        }
503        FX_FLOAT PreXdif = charinfo.m_OriginX - curPos.x;
504        minXdif = PreXdif;
505        if (PreXdif == 0)	{
506            return index;
507        }
508        FX_FLOAT curXdif = 0;
509        while (--index >= 0) {
510            charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
511            if (difPosY != charinfo.m_OriginY) {
512                break;
513            }
514            curXdif = charinfo.m_OriginX - curPos.x;
515            if (curXdif == 0) {
516                return index;
517            }
518            int signflag = 0;
519            if (curXdif > 0) {
520                signflag = 1;
521            } else {
522                signflag = -1;
523            }
524            if (signflag * PreXdif < 0) {
525                if (FXSYS_fabs(PreXdif) < FXSYS_fabs(curXdif)) {
526                    return index + 1;
527                } else {
528                    return index;
529                }
530            }
531            if (FXSYS_fabs(curXdif) < FXSYS_fabs(minXdif)) {
532                minIndex = index;
533                minXdif = curXdif;
534            }
535            PreXdif = curXdif;
536            if (difPosY != charinfo.m_OriginY) {
537                break;
538            }
539        }
540        return minIndex;
541    } else if(FPDFTEXT_DOWN) {
542        minIndex = -2;
543        while (1) {
544            if (++index > m_charList.GetSize() - 1)	{
545                return minIndex;
546            }
547            charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
548            if (FXSYS_fabs(charinfo.m_OriginY - curPos.y) > FX_MAX(height, charinfo.m_CharBox.Height()) / 2) {
549                difPosY = charinfo.m_OriginY;
550                minIndex = index;
551                break;
552            }
553        }
554        FX_FLOAT PreXdif = charinfo.m_OriginX - curPos.x;
555        minXdif = PreXdif;
556        if (PreXdif == 0)	{
557            return index;
558        }
559        FX_FLOAT curXdif = 0;
560        while (++index < m_charList.GetSize()) {
561            charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
562            if (difPosY != charinfo.m_OriginY) {
563                break;
564            }
565            curXdif = charinfo.m_OriginX - curPos.x;
566            if (curXdif == 0) {
567                return index;
568            }
569            int signflag = 0;
570            if (curXdif > 0) {
571                signflag = 1;
572            } else {
573                signflag = -1;
574            }
575            if (signflag * PreXdif < 0) {
576                if (FXSYS_fabs(PreXdif) < FXSYS_fabs(curXdif)) {
577                    return index - 1;
578                } else {
579                    return index;
580                }
581            }
582            if (FXSYS_fabs(curXdif) < FXSYS_fabs(minXdif)) {
583                minXdif = curXdif;
584                minIndex = index;
585            }
586            PreXdif = curXdif;
587        }
588        return minIndex;
589    }
590}
591void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO & info) const
592{
593    if(m_ParseOptions.m_bGetCharCodeOnly) {
594        return;
595    }
596    if (!m_IsParsered)	{
597        return;
598    }
599    if (index < 0 || index >= m_charList.GetSize())	{
600        return;
601    }
602    PAGECHAR_INFO charinfo;
603    charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
604    info.m_Charcode = charinfo.m_CharCode;
605    info.m_OriginX = charinfo.m_OriginX;
606    info.m_OriginY = charinfo.m_OriginY;
607    info.m_Unicode = charinfo.m_Unicode;
608    info.m_Flag = charinfo.m_Flag;
609    info.m_CharBox = charinfo.m_CharBox;
610    info.m_pTextObj = charinfo.m_pTextObj;
611    if (charinfo.m_pTextObj && charinfo.m_pTextObj->GetFont()) {
612        info.m_FontSize = charinfo.m_pTextObj->GetFontSize();
613    }
614    info.m_Matrix.Copy(charinfo.m_Matrix);
615    return;
616}
617void CPDF_TextPage::CheckMarkedContentObject(FX_INT32& start, FX_INT32& nCount) const
618{
619    PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start);
620    PAGECHAR_INFO charinfo2 = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1);
621    if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag && FPDFTEXT_CHAR_PIECE != charinfo2.m_Flag) {
622        return;
623    }
624    if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) {
625        PAGECHAR_INFO charinfo1 = charinfo;
626        int startIndex = start;
627        while(FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag && charinfo1.m_Index == charinfo.m_Index) {
628            startIndex--;
629            if (startIndex < 0)	{
630                break;
631            }
632            charinfo1 = *(PAGECHAR_INFO*)m_charList.GetAt(startIndex);
633        }
634        startIndex++;
635        start = startIndex;
636    }
637    if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) {
638        PAGECHAR_INFO charinfo3 = charinfo2;
639        int endIndex = start + nCount - 1;
640        while(FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag && charinfo3.m_Index == charinfo2.m_Index) {
641            endIndex++;
642            if (endIndex >= m_charList.GetSize())	{
643                break;
644            }
645            charinfo3 = *(PAGECHAR_INFO*)m_charList.GetAt(endIndex);
646        }
647        endIndex--;
648        nCount = endIndex - start + 1;
649    }
650}
651CFX_WideString CPDF_TextPage::GetPageText(int start , int nCount) const
652{
653    if (!m_IsParsered || nCount == 0) {
654        return L"";
655    }
656    if (start < 0) {
657        start = 0;
658    }
659    if	(nCount == -1) {
660        nCount = m_charList.GetSize() - start;
661        return m_TextBuf.GetWideString().Mid(start, m_TextBuf.GetWideString().GetLength());
662    }
663    if(nCount <= 0 || m_charList.GetSize() <= 0) {
664        return L"";
665    }
666    if(nCount + start > m_charList.GetSize() - 1) {
667        nCount = m_charList.GetSize() - start;
668    }
669    if (nCount <= 0) {
670        return L"";
671    }
672    CheckMarkedContentObject(start, nCount);
673    int startindex = 0;
674    PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start);
675    int startOffset = 0;
676    while(charinfo.m_Index == -1) {
677        startOffset++;
678        if (startOffset > nCount || start + startOffset >= m_charList.GetSize())	{
679            return L"";
680        }
681        charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + startOffset);
682    }
683    startindex = charinfo.m_Index;
684    charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1);
685    int nCountOffset = 0;
686    while (charinfo.m_Index == -1) {
687        nCountOffset++;
688        if (nCountOffset >= nCount) {
689            return L"";
690        }
691        charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - nCountOffset - 1);
692    }
693    nCount = start + nCount - nCountOffset - startindex;
694    if(nCount <= 0) {
695        return L"";
696    }
697    return m_TextBuf.GetWideString().Mid(startindex, nCount);
698}
699int CPDF_TextPage::CountRects(int start, int nCount)
700{
701    if(m_ParseOptions.m_bGetCharCodeOnly) {
702        return -1;
703    }
704    if (!m_IsParsered)	{
705        return -1;
706    }
707    if (start < 0) {
708        return -1;
709    }
710    if (nCount == -1 || nCount + start > m_charList.GetSize() ) {
711        nCount = m_charList.GetSize() - start;
712    }
713    m_SelRects.RemoveAll();
714    GetRectArray(start, nCount, m_SelRects);
715    return m_SelRects.GetSize();
716}
717void CPDF_TextPage::GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top, FX_FLOAT& right, FX_FLOAT &bottom) const
718{
719    if(m_ParseOptions.m_bGetCharCodeOnly) {
720        return ;
721    }
722    if (!m_IsParsered || rectIndex < 0 || rectIndex >= m_SelRects.GetSize()) {
723        return;
724    }
725    left = m_SelRects.GetAt(rectIndex).left;
726    top = m_SelRects.GetAt(rectIndex).top;
727    right = m_SelRects.GetAt(rectIndex).right;
728    bottom = m_SelRects.GetAt(rectIndex).bottom;
729}
730FX_BOOL CPDF_TextPage::GetBaselineRotate(int start, int end, int& Rotate)
731{
732    if(m_ParseOptions.m_bGetCharCodeOnly) {
733        return FALSE;
734    }
735    if(end == start) {
736        return FALSE;
737    }
738    FX_FLOAT dx, dy;
739    FPDF_CHAR_INFO info1, info2;
740    GetCharInfo(start, info1);
741    GetCharInfo(end, info2);
742    while(info2.m_CharBox.Width() == 0 || info2.m_CharBox.Height() == 0) {
743        end--;
744        if(end <= start) {
745            return FALSE;
746        }
747        GetCharInfo(end, info2);
748    }
749    dx = (info2.m_OriginX - info1.m_OriginX);
750    dy = (info2.m_OriginY - info1.m_OriginY);
751    if(dx == 0) {
752        if(dy > 0) {
753            Rotate = 90;
754        } else if (dy < 0) {
755            Rotate = 270;
756        } else {
757            Rotate = 0;
758        }
759    } else {
760        float a = FXSYS_atan2(dy, dx);
761        Rotate = (int)(a * 180 / FX_PI + 0.5);
762    }
763    if(Rotate < 0) {
764        Rotate = -Rotate;
765    } else if(Rotate > 0) {
766        Rotate = 360 - Rotate;
767    }
768    return TRUE;
769}
770FX_BOOL	CPDF_TextPage::GetBaselineRotate(CFX_FloatRect rect , int& Rotate)
771{
772    if(m_ParseOptions.m_bGetCharCodeOnly) {
773        return FALSE;
774    }
775    int start, end, count, n = CountBoundedSegments(rect.left, rect.top, rect.right, rect.bottom, TRUE);
776    if(n < 1) {
777        return FALSE;
778    }
779    if(n > 1) {
780        GetBoundedSegment(n - 1, start, count);
781        end = start + count - 1;
782        GetBoundedSegment(0, start, count);
783    } else {
784        GetBoundedSegment(0, start, count);
785        end = start + count - 1;
786    }
787    return GetBaselineRotate(start, end, Rotate);
788}
789FX_BOOL	CPDF_TextPage::GetBaselineRotate(int rectIndex, int& Rotate)
790{
791    if(m_ParseOptions.m_bGetCharCodeOnly) {
792        return FALSE;
793    }
794    if (!m_IsParsered || rectIndex < 0 || rectIndex > m_SelRects.GetSize()) {
795        return FALSE;
796    }
797    CFX_FloatRect rect = m_SelRects.GetAt(rectIndex);
798    return GetBaselineRotate(rect , Rotate);
799}
800int	CPDF_TextPage::CountBoundedSegments(FX_FLOAT left, FX_FLOAT top, FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains )
801{
802    if(m_ParseOptions.m_bGetCharCodeOnly) {
803        return -1;
804    }
805    m_Segment.RemoveAll();
806    if (!m_IsParsered)	{
807        return -1;
808    }
809    CFX_FloatRect rect(left, bottom, right, top);
810    rect.Normalize();
811    int nCount = m_charList.GetSize();
812    int pos = 0;
813    FPDF_SEGMENT	segment;
814    segment.m_Start = 0;
815    segment.m_nCount = 0;
816    FX_BOOL		segmentStatus = 0;
817    FX_BOOL		IsContainPreChar = FALSE;
818    while (pos < nCount) {
819        if(pos == 493) {
820            int a = 0;
821        }
822        PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos);
823        if(bContains && rect.Contains(charinfo.m_CharBox)) {
824            if (segmentStatus == 0 || segmentStatus == 2) {
825                segment.m_Start = pos;
826                segment.m_nCount = 1;
827                segmentStatus = 1;
828            } else if (segmentStatus == 1) {
829                segment.m_nCount++;
830            }
831            IsContainPreChar = TRUE;
832        } else if (!bContains && (IsRectIntersect(rect, charinfo.m_CharBox) || rect.Contains(charinfo.m_OriginX, charinfo.m_OriginY))) {
833            if (segmentStatus == 0 || segmentStatus == 2) {
834                segment.m_Start = pos;
835                segment.m_nCount = 1;
836                segmentStatus = 1;
837            } else if (segmentStatus == 1) {
838                segment.m_nCount++;
839            }
840            IsContainPreChar = TRUE;
841        } else if (charinfo.m_Unicode == 32) {
842            if (IsContainPreChar == TRUE) {
843                if (segmentStatus == 0 || segmentStatus == 2) {
844                    segment.m_Start = pos;
845                    segment.m_nCount = 1;
846                    segmentStatus = 1;
847                } else if (segmentStatus == 1) {
848                    segment.m_nCount++;
849                }
850                IsContainPreChar = FALSE;
851            } else {
852                if (segmentStatus == 1) {
853                    segmentStatus = 2;
854                    m_Segment.Add(segment);
855                    segment.m_Start = 0;
856                    segment.m_nCount = 0;
857                }
858            }
859        } else {
860            if (segmentStatus == 1) {
861                segmentStatus = 2;
862                m_Segment.Add(segment);
863                segment.m_Start = 0;
864                segment.m_nCount = 0;
865            }
866            IsContainPreChar = FALSE;
867        }
868        pos++;
869    }
870    if (segmentStatus == 1) {
871        segmentStatus = 2;
872        m_Segment.Add(segment);
873        segment.m_Start = 0;
874        segment.m_nCount = 0;
875    }
876    return m_Segment.GetSize();
877}
878void CPDF_TextPage::GetBoundedSegment(int index, int& start, int& count) const
879{
880    if(m_ParseOptions.m_bGetCharCodeOnly) {
881        return ;
882    }
883    if (index < 0 || index >= m_Segment.GetSize()) {
884        return;
885    }
886    start = m_Segment.GetAt(index).m_Start;
887    count = m_Segment.GetAt(index).m_nCount;
888}
889int CPDF_TextPage::GetWordBreak(int index, int direction) const
890{
891    if(m_ParseOptions.m_bGetCharCodeOnly) {
892        return -1;
893    }
894    if (!m_IsParsered)	{
895        return -1;
896    }
897    if (direction != FPDFTEXT_LEFT && direction != FPDFTEXT_RIGHT) {
898        return -1;
899    }
900    if (index < 0 || index >= m_charList.GetSize()) {
901        return -1;
902    }
903    PAGECHAR_INFO charinfo;
904    charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
905    if (charinfo.m_Index == -1 || charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED)	{
906        return index;
907    }
908    if (!IsLetter(charinfo.m_Unicode)) {
909        return index;
910    }
911    int breakPos = index;
912    if (direction == FPDFTEXT_LEFT) {
913        while (--breakPos > 0) {
914            charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos);
915            if (!IsLetter(charinfo.m_Unicode)) {
916                return breakPos;
917            }
918        }
919        return breakPos;
920    } else if (direction == FPDFTEXT_RIGHT) {
921        while (++breakPos < m_charList.GetSize()) {
922            charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos);
923            if (!IsLetter(charinfo.m_Unicode)) {
924                return breakPos;
925            }
926        }
927        return breakPos;
928    }
929    return breakPos;
930}
931FX_INT32 CPDF_TextPage::FindTextlineFlowDirection()
932{
933    if (!m_pPage)	{
934        return -1;
935    }
936    const FX_INT32 nPageWidth = (FX_INT32)((CPDF_Page*)m_pPage)->GetPageWidth();
937    const FX_INT32 nPageHeight = (FX_INT32)((CPDF_Page*)m_pPage)->GetPageHeight();
938    CFX_ByteArray nHorizontalMask;
939    if (!nHorizontalMask.SetSize(nPageWidth)) {
940        return -1;
941    }
942	FX_BYTE* pDataH = nHorizontalMask.GetData();
943    CFX_ByteArray nVerticalMask;
944    if (!nVerticalMask.SetSize(nPageHeight)) {
945        return -1;
946    }
947	FX_BYTE* pDataV = nVerticalMask.GetData();
948    FX_INT32 index = 0;
949    FX_FLOAT fLineHeight = 0.0f;
950    CPDF_PageObject* pPageObj = NULL;
951    FX_POSITION	pos = NULL;
952    pos = m_pPage->GetFirstObjectPosition();
953    if(!pos) {
954        return -1;
955    }
956    while(pos) {
957        pPageObj = m_pPage->GetNextObject(pos);
958        if(NULL == pPageObj) {
959            continue;
960        }
961        if(PDFPAGE_TEXT != pPageObj->m_Type) {
962            continue;
963        }
964		FX_INT32 minH = (FX_INT32)pPageObj->m_Left < 0 ? 0 : (FX_INT32)pPageObj->m_Left;
965		FX_INT32 maxH = (FX_INT32)pPageObj->m_Right > nPageWidth ? nPageWidth : (FX_INT32)pPageObj->m_Right;
966		FX_INT32 minV = (FX_INT32)pPageObj->m_Bottom < 0 ? 0 : (FX_INT32)pPageObj->m_Bottom;
967		FX_INT32 maxV = (FX_INT32)pPageObj->m_Top > nPageHeight ? nPageHeight : (FX_INT32)pPageObj->m_Top;
968		if (minH >= maxH || minV >= maxV){
969			continue;
970		}
971
972		FXSYS_memset8(pDataH + minH, 1, maxH - minH);
973		FXSYS_memset8(pDataV + minV, 1, maxV - minV);
974
975		if (fLineHeight <= 0.0f) {
976			fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom;
977		}
978
979		pPageObj = NULL;
980    }
981    FX_INT32 nStartH = 0;
982    FX_INT32 nEndH = 0;
983    FX_FLOAT nSumH = 0.0f;
984    for (index = 0; index < nPageWidth; index++)
985        if(1 == nHorizontalMask[index]) {
986            break;
987        }
988    nStartH = index;
989    for (index = nPageWidth; index > 0; index--)
990        if(1 == nHorizontalMask[index - 1]) {
991            break;
992        }
993    nEndH = index;
994    for (index = nStartH; index < nEndH; index++) {
995        nSumH += nHorizontalMask[index];
996    }
997    nSumH /= nEndH - nStartH;
998    FX_INT32 nStartV = 0;
999    FX_INT32 nEndV = 0;
1000    FX_FLOAT nSumV = 0.0f;
1001    for (index = 0; index < nPageHeight; index++)
1002        if(1 == nVerticalMask[index]) {
1003            break;
1004        }
1005    nStartV = index;
1006    for (index = nPageHeight; index > 0; index--)
1007        if(1 == nVerticalMask[index - 1]) {
1008            break;
1009        }
1010    nEndV = index;
1011    for (index = nStartV; index < nEndV; index++) {
1012        nSumV += nVerticalMask[index];
1013    }
1014    nSumV /= nEndV - nStartV;
1015    if ((nEndV - nStartV) < (FX_INT32)(2 * fLineHeight)) {
1016        return 0;
1017    }
1018    if ((nEndH - nStartH) < (FX_INT32)(2 * fLineHeight)) {
1019        return 1;
1020    }
1021    if (nSumH > 0.8f) {
1022        return 0;
1023    }
1024    if (nSumH - nSumV > 0.0f) {
1025        return 0;
1026    }
1027    if (nSumV - nSumH > 0.0f) {
1028        return 1;
1029    }
1030    return -1;
1031}
1032void CPDF_TextPage::ProcessObject()
1033{
1034    CPDF_PageObject*	pPageObj = NULL;
1035    if (!m_pPage)	{
1036        return;
1037    }
1038    FX_POSITION	pos;
1039    pos = m_pPage->GetFirstObjectPosition();
1040    if (!pos)	{
1041        return;
1042    }
1043    m_TextlineDir = FindTextlineFlowDirection();
1044    int nCount = 0;
1045    while (pos) {
1046        pPageObj = m_pPage->GetNextObject(pos);
1047        if(pPageObj) {
1048            if(pPageObj->m_Type == PDFPAGE_TEXT) {
1049                if (nCount == 3) {
1050                    nCount = nCount;
1051                }
1052                CFX_AffineMatrix matrix;
1053                ProcessTextObject((CPDF_TextObject*)pPageObj, matrix, pos);
1054                nCount++;
1055            } else if (pPageObj->m_Type == PDFPAGE_FORM) {
1056                CFX_AffineMatrix formMatrix(1, 0, 0, 1, 0, 0);
1057                ProcessFormObject((CPDF_FormObject*)pPageObj, formMatrix);
1058            }
1059        }
1060        pPageObj = NULL;
1061    }
1062    int count = m_LineObj.GetSize();
1063    for(int i = 0; i < count; i++) {
1064        ProcessTextObject(m_LineObj.GetAt(i));
1065    }
1066    m_LineObj.RemoveAll();
1067    CloseTempLine();
1068}
1069void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj, CFX_AffineMatrix formMatrix)
1070{
1071    CPDF_PageObject*	pPageObj = NULL;
1072    FX_POSITION	pos;
1073    if (!pFormObj)	{
1074        return;
1075    }
1076    pos = pFormObj->m_pForm->GetFirstObjectPosition();
1077    if (!pos)	{
1078        return;
1079    }
1080    CFX_AffineMatrix curFormMatrix;
1081    curFormMatrix.Copy(pFormObj->m_FormMatrix);
1082    curFormMatrix.Concat(formMatrix);
1083    while (pos) {
1084        pPageObj = pFormObj->m_pForm->GetNextObject(pos);
1085        if(pPageObj) {
1086            if(pPageObj->m_Type == PDFPAGE_TEXT) {
1087                ProcessTextObject((CPDF_TextObject*)pPageObj, curFormMatrix, pos);
1088            } else if (pPageObj->m_Type == PDFPAGE_FORM) {
1089                ProcessFormObject((CPDF_FormObject*)pPageObj, curFormMatrix);
1090            }
1091        }
1092        pPageObj = NULL;
1093    }
1094}
1095int CPDF_TextPage::GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const
1096{
1097    if(charCode == -1) {
1098        return 0;
1099    }
1100    int w = pFont->GetCharWidthF(charCode);
1101    if(w == 0) {
1102        CFX_ByteString str;
1103        pFont->AppendChar(str, charCode);
1104        w = pFont->GetStringWidth(str, 1);
1105        if(w == 0) {
1106            FX_RECT BBox;
1107            pFont->GetCharBBox(charCode, BBox);
1108            w = BBox.right - BBox.left;
1109        }
1110    }
1111    return w;
1112}
1113void CPDF_TextPage::OnPiece(IFX_BidiChar* pBidi, CFX_WideString& str)
1114{
1115    FX_INT32 start, count;
1116    FX_INT32 ret = pBidi->GetBidiInfo(start, count);
1117    if(ret == 2) {
1118        for(int i = start + count - 1; i >= start; i--) {
1119            m_TextBuf.AppendChar(str.GetAt(i));
1120            m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i));
1121        }
1122    } else {
1123        int end = start + count ;
1124        for(int i = start; i < end; i++) {
1125            m_TextBuf.AppendChar(str.GetAt(i));
1126            m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i));
1127        }
1128    }
1129}
1130void CPDF_TextPage::AddCharInfoByLRDirection(CFX_WideString& str, int i)
1131{
1132    PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i);
1133    FX_WCHAR wChar = str.GetAt(i);
1134#ifdef FOXIT_CHROME_BUILD
1135    if(!IsControlChar(&Info)) {
1136#else
1137    if(wChar != 0xfffe) {
1138#endif
1139        Info.m_Index = m_TextBuf.GetLength();
1140        if (wChar >= 0xFB00 && wChar <= 0xFB06) {
1141            FX_LPWSTR pDst = NULL;
1142            FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
1143            if (nCount >= 1) {
1144                pDst = FX_Alloc(FX_WCHAR, nCount);
1145                if (!pDst) {
1146                    return;
1147                }
1148                FX_Unicode_GetNormalization(wChar, pDst);
1149                for (int nIndex = 0; nIndex < nCount; nIndex++) {
1150                    PAGECHAR_INFO Info2 = Info;
1151                    Info2.m_Unicode = pDst[nIndex];
1152                    Info2.m_Flag = FPDFTEXT_CHAR_PIECE;
1153                    m_TextBuf.AppendChar(Info2.m_Unicode);
1154                    if( !m_ParseOptions.m_bGetCharCodeOnly) {
1155                        m_charList.Add(Info2);
1156                    }
1157                }
1158                FX_Free(pDst);
1159                return;
1160            }
1161        }
1162        m_TextBuf.AppendChar(wChar);
1163    } else {
1164        Info.m_Index = -1;
1165    }
1166    if( !m_ParseOptions.m_bGetCharCodeOnly) {
1167        m_charList.Add(Info);
1168    }
1169}
1170void CPDF_TextPage::AddCharInfoByRLDirection(CFX_WideString& str, int i)
1171{
1172    PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i);
1173#ifdef FOXIT_CHROME_BUILD
1174    if(!IsControlChar(&Info)) {
1175#else
1176    if(str.GetAt(i) != 0xfffe) {
1177#endif
1178        Info.m_Index = m_TextBuf.GetLength();
1179        FX_WCHAR wChar = FX_GetMirrorChar(str.GetAt(i), TRUE, FALSE);
1180        FX_LPWSTR pDst = NULL;
1181        FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
1182        if (nCount >= 1) {
1183            pDst = FX_Alloc(FX_WCHAR, nCount);
1184            if (!pDst) {
1185                return;
1186            }
1187            FX_Unicode_GetNormalization(wChar, pDst);
1188            for (int nIndex = 0; nIndex < nCount; nIndex++) {
1189                PAGECHAR_INFO Info2 = Info;
1190                Info2.m_Unicode = pDst[nIndex];
1191                Info2.m_Flag = FPDFTEXT_CHAR_PIECE;
1192                m_TextBuf.AppendChar(Info2.m_Unicode);
1193                if( !m_ParseOptions.m_bGetCharCodeOnly) {
1194                    m_charList.Add(Info2);
1195                }
1196            }
1197            FX_Free(pDst);
1198            return;
1199        } else {
1200            Info.m_Unicode = wChar;
1201        }
1202        m_TextBuf.AppendChar(Info.m_Unicode);
1203    } else {
1204        Info.m_Index = -1;
1205    }
1206    if( !m_ParseOptions.m_bGetCharCodeOnly) {
1207        m_charList.Add(Info);
1208    }
1209}
1210void CPDF_TextPage::CloseTempLine()
1211{
1212    int count1 = m_TempCharList.GetSize();
1213    if (count1 <= 0) {
1214        return;
1215    }
1216    IFX_BidiChar* BidiChar = IFX_BidiChar::Create();
1217    CFX_WideString str = m_TempTextBuf.GetWideString();
1218    CFX_WordArray order;
1219    FX_BOOL bR2L = FALSE;
1220    FX_INT32 start = 0, count = 0, i = 0;
1221    int nR2L = 0, nL2R = 0;
1222    FX_BOOL bPrevSpace = FALSE;
1223    for (i = 0; i < str.GetLength(); i++) {
1224        if(str.GetAt(i) == 32) {
1225            if(bPrevSpace) {
1226                m_TempTextBuf.Delete(i, 1);
1227                m_TempCharList.Delete(i);
1228                str.Delete(i);
1229                count1 --;
1230                i--;
1231                continue;
1232            }
1233            bPrevSpace = TRUE;
1234        } else {
1235            bPrevSpace = FALSE;
1236        }
1237        if(BidiChar && BidiChar->AppendChar(str.GetAt(i))) {
1238            FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
1239            order.Add(start);
1240            order.Add(count);
1241            order.Add(ret);
1242            if(!bR2L) {
1243                if(ret == 2) {
1244                    nR2L++;
1245                } else if (ret == 1) {
1246                    nL2R++;
1247                }
1248            }
1249        }
1250    }
1251    if(BidiChar && BidiChar->EndChar()) {
1252        FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
1253        order.Add(start);
1254        order.Add(count);
1255        order.Add(ret);
1256        if(!bR2L) {
1257            if(ret == 2) {
1258                nR2L++;
1259            } else if(ret == 1) {
1260                nL2R++;
1261            }
1262        }
1263    }
1264    if(nR2L > 0 && nR2L >= nL2R) {
1265        bR2L = TRUE;
1266    }
1267    if(this->m_parserflag == FPDFTEXT_RLTB || bR2L) {
1268        int count = order.GetSize();
1269        for(int j = count - 1; j > 0; j -= 3) {
1270            int ret = order.GetAt(j);
1271            int start = order.GetAt(j - 2);
1272            int count1 = order.GetAt(j - 1);
1273            if(ret == 2 || ret == 0) {
1274                for(int i = start + count1 - 1; i >= start; i--) {
1275                    AddCharInfoByRLDirection(str, i);
1276                }
1277            } else {
1278                i = j;
1279                FX_BOOL bSymbol = FALSE;
1280                while(i > 0 && order.GetAt(i) != 2) {
1281                    bSymbol = !order.GetAt(i);
1282                    i -= 3;
1283                }
1284                int end = start + count1 ;
1285                int n = 0;
1286                if(bSymbol) {
1287                    n = i + 6;
1288                } else {
1289                    n = i + 3;
1290                }
1291                if(n >= j) {
1292                    for(int m = start; m < end; m++) {
1293                        AddCharInfoByLRDirection(str, m);
1294                    }
1295                } else {
1296                    i = j;
1297                    j = n;
1298                    for(; n <= i; n += 3) {
1299                        int ret = order.GetAt(n);
1300                        int start = order.GetAt(n - 2);
1301                        int count1 = order.GetAt(n - 1);
1302                        int end = start + count1 ;
1303                        for(int m = start; m < end; m++) {
1304                            AddCharInfoByLRDirection(str, m);
1305                        }
1306                    }
1307                }
1308            }
1309        }
1310    } else {
1311        int count = order.GetSize();
1312        FX_BOOL bL2R = FALSE;
1313        for(int j = 0; j < count; j += 3) {
1314            int ret = order.GetAt(j + 2);
1315            int start = order.GetAt(j);
1316            int count1 = order.GetAt(j + 1);
1317            if(ret == 2 || (j == 0 && ret == 0 && !bL2R)) {
1318                int i = j + 3;
1319                while(bR2L && i < count) {
1320                    if(order.GetAt(i + 2) == 1) {
1321                        break;
1322                    } else {
1323                        i += 3;
1324                    }
1325                }
1326                if(i == 3) {
1327                    j = -3;
1328                    bL2R = TRUE;
1329                    continue;
1330                }
1331                int end = m_TempCharList.GetSize() - 1;
1332                if(i < count) {
1333                    end = order.GetAt(i) - 1;
1334                }
1335                j = i - 3;
1336                for(int n = end; n >= start; n--) {
1337                    AddCharInfoByRLDirection(str, n);
1338                }
1339            } else {
1340                int end = start + count1 ;
1341                for(int i = start; i < end; i++) {
1342                    AddCharInfoByLRDirection(str, i);
1343                }
1344            }
1345        }
1346    }
1347    int ntext = m_TextBuf.GetSize();
1348    ntext = m_charList.GetSize();
1349    order.RemoveAll();
1350    m_TempCharList.RemoveAll();
1351    m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength());
1352    BidiChar->Release();
1353}
1354void CPDF_TextPage::ProcessTextObject(CPDF_TextObject*	pTextObj, CFX_AffineMatrix formMatrix, FX_POSITION ObjPos)
1355{
1356    CFX_FloatRect re(pTextObj->m_Left, pTextObj->m_Bottom, pTextObj->m_Right, pTextObj->m_Top);
1357    if(FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f ) {
1358        return;
1359    }
1360    int count = m_LineObj.GetSize();
1361    PDFTEXT_Obj Obj;
1362    Obj.m_pTextObj = pTextObj;
1363    Obj.m_formMatrix = formMatrix;
1364    if(count == 0) {
1365        m_LineObj.Add(Obj);
1366        return;
1367    }
1368    if (IsSameAsPreTextObject(pTextObj, ObjPos)) {
1369        return;
1370    }
1371    PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(count - 1);
1372    CPDF_TextObjectItem item;
1373    int nItem = prev_Obj.m_pTextObj->CountItems();
1374    prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item);
1375    FX_FLOAT prev_width = GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) * prev_Obj.m_pTextObj->GetFontSize() / 1000;
1376    CFX_AffineMatrix prev_matrix;
1377    prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix);
1378    prev_width = FXSYS_fabs(prev_width);
1379    prev_matrix.Concat(prev_Obj.m_formMatrix);
1380    prev_width = prev_matrix.TransformDistance(prev_width);
1381    pTextObj->GetItemInfo(0, &item);
1382    FX_FLOAT this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) * pTextObj->GetFontSize() / 1000;
1383    this_width = FXSYS_fabs(this_width);
1384    CFX_AffineMatrix this_matrix;
1385    pTextObj->GetTextMatrix(&this_matrix);
1386    this_width = FXSYS_fabs(this_width);
1387    this_matrix.Concat(formMatrix);
1388    this_width = this_matrix.TransformDistance(this_width);
1389    FX_FLOAT threshold = prev_width > this_width ? prev_width / 4 : this_width / 4;
1390    FX_FLOAT prev_x = prev_Obj.m_pTextObj->GetPosX(), prev_y = prev_Obj.m_pTextObj->GetPosY();
1391    prev_Obj.m_formMatrix.Transform(prev_x, prev_y);
1392    m_DisplayMatrix.Transform(prev_x, prev_y);
1393    FX_FLOAT this_x = pTextObj->GetPosX(), this_y = pTextObj->GetPosY();
1394    formMatrix.Transform(this_x, this_y);
1395    m_DisplayMatrix.Transform(this_x, this_y);
1396    if (FXSYS_fabs(this_y - prev_y) > threshold * 2) {
1397        for(int i = 0; i < count; i++) {
1398            ProcessTextObject(m_LineObj.GetAt(i));
1399        }
1400        m_LineObj.RemoveAll();
1401        m_LineObj.Add(Obj);
1402        return;
1403    }
1404    int i = 0;
1405    if(m_ParseOptions.m_bNormalizeObjs) {
1406        for(i = count - 1; i >= 0; i--) {
1407            PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(i);
1408            CFX_AffineMatrix prev_matrix;
1409            prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix);
1410            FX_FLOAT Prev_x = prev_Obj.m_pTextObj->GetPosX(), Prev_y = prev_Obj.m_pTextObj->GetPosY();
1411            prev_Obj.m_formMatrix.Transform(Prev_x, Prev_y);
1412            m_DisplayMatrix.Transform(Prev_x, Prev_y);
1413            if(this_x >= Prev_x) {
1414                if(i == count - 1) {
1415                    m_LineObj.Add(Obj);
1416                } else {
1417                    m_LineObj.InsertAt(i + 1, Obj);
1418                }
1419                break;
1420            }
1421        }
1422        if(i < 0) {
1423            m_LineObj.InsertAt(0, Obj);
1424        }
1425    } else {
1426        m_LineObj.Add(Obj);
1427    }
1428}
1429FX_INT32 CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj)
1430{
1431    CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1432    CPDF_ContentMarkData* pMarkData = (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject();
1433    if(!pMarkData) {
1434        return FPDFTEXT_MC_PASS;
1435    }
1436    int nContentMark = pMarkData->CountItems();
1437    if (nContentMark < 1) {
1438        return FPDFTEXT_MC_PASS;
1439    }
1440    CFX_WideString actText;
1441    FX_BOOL bExist = FALSE;
1442    CPDF_Dictionary* pDict = NULL;
1443    int n = 0;
1444    for (n = 0; n < nContentMark; n++) {
1445        CPDF_ContentMarkItem& item = pMarkData->GetItem(n);
1446        CFX_ByteString tagStr = (CFX_ByteString)item.GetName();
1447        pDict = (CPDF_Dictionary*)item.GetParam();
1448        CPDF_String* temp = (CPDF_String*)pDict->GetElement(FX_BSTRC("ActualText"));
1449        if (temp) {
1450            bExist = TRUE;
1451            actText = temp->GetUnicodeText();
1452        }
1453    }
1454    if (!bExist) {
1455        return FPDFTEXT_MC_PASS;
1456    }
1457    if (m_pPreTextObj) {
1458        if (CPDF_ContentMarkData* pPreMarkData = (CPDF_ContentMarkData*)m_pPreTextObj->m_ContentMark.GetObject()) {
1459            if (pPreMarkData->CountItems() == n) {
1460                CPDF_ContentMarkItem& item = pPreMarkData->GetItem(n - 1);
1461                if (pDict == item.GetParam()) {
1462                    return FPDFTEXT_MC_DONE;
1463                }
1464            }
1465        }
1466    }
1467    CPDF_Font*	pFont = pTextObj->GetFont();
1468    FX_STRSIZE nItems = actText.GetLength();
1469    if (nItems < 1) {
1470        return FPDFTEXT_MC_PASS;
1471    }
1472    bExist = FALSE;
1473    for (FX_STRSIZE i = 0; i < nItems; i++) {
1474        FX_WCHAR wChar = actText.GetAt(i);
1475        if (-1 == pFont->CharCodeFromUnicode(wChar)) {
1476            continue;
1477        } else {
1478            bExist = TRUE;
1479            break;
1480        }
1481    }
1482    if (!bExist) {
1483        return FPDFTEXT_MC_PASS;
1484    }
1485    bExist = FALSE;
1486    for (FX_STRSIZE j = 0; j < nItems; j++) {
1487        FX_WCHAR wChar = actText.GetAt(j);
1488        if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) {
1489            bExist = TRUE;
1490            break;
1491        }
1492    }
1493    if (!bExist) {
1494        return FPDFTEXT_MC_DONE;
1495    }
1496    return FPDFTEXT_MC_DELAY;
1497}
1498void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj)
1499{
1500    CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1501    CPDF_ContentMarkData* pMarkData = (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject();
1502    if(!pMarkData) {
1503        return;
1504    }
1505    int nContentMark = pMarkData->CountItems();
1506    if (nContentMark < 1) {
1507        return;
1508    }
1509    CFX_WideString actText;
1510    CPDF_Dictionary* pDict = NULL;
1511    int n = 0;
1512    for (n = 0; n < nContentMark; n++) {
1513        CPDF_ContentMarkItem& item = pMarkData->GetItem(n);
1514        CFX_ByteString tagStr = (CFX_ByteString)item.GetName();
1515        pDict = (CPDF_Dictionary*)item.GetParam();
1516        CPDF_String* temp = (CPDF_String*)pDict->GetElement(FX_BSTRC("ActualText"));
1517        if (temp) {
1518            actText = temp->GetUnicodeText();
1519        }
1520    }
1521    FX_STRSIZE nItems = actText.GetLength();
1522    if (nItems < 1) {
1523        return;
1524    }
1525    CPDF_Font*	pFont = pTextObj->GetFont();
1526    CFX_AffineMatrix formMatrix = Obj.m_formMatrix;
1527    CFX_AffineMatrix matrix;
1528    pTextObj->GetTextMatrix(&matrix);
1529    matrix.Concat(formMatrix);
1530    FX_FLOAT fPosX = pTextObj->GetPosX();
1531    FX_FLOAT fPosY = pTextObj->GetPosY();
1532    int nCharInfoIndex = m_TextBuf.GetLength();
1533    CFX_FloatRect charBox;
1534    charBox.top = pTextObj->m_Top;
1535    charBox.left = pTextObj->m_Left;
1536    charBox.right = pTextObj->m_Right;
1537    charBox.bottom = pTextObj->m_Bottom;
1538    for (FX_STRSIZE k = 0; k < nItems; k++) {
1539        FX_WCHAR wChar = actText.GetAt(k);
1540        if (wChar <= 0x80 && !isprint(wChar)) {
1541            wChar = 0x20;
1542        }
1543        if (wChar >= 0xFFFD) {
1544            continue;
1545        }
1546        PAGECHAR_INFO charinfo;
1547        charinfo.m_OriginX = fPosX;
1548        charinfo.m_OriginY = fPosY;
1549        charinfo.m_Index = nCharInfoIndex;
1550        charinfo.m_Unicode = wChar;
1551        charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
1552        charinfo.m_Flag = FPDFTEXT_CHAR_PIECE;
1553        charinfo.m_pTextObj = pTextObj;
1554        charinfo.m_CharBox.top = charBox.top;
1555        charinfo.m_CharBox.left = charBox.left;
1556        charinfo.m_CharBox.right = charBox.right;
1557        charinfo.m_CharBox.bottom = charBox.bottom;
1558        charinfo.m_Matrix.Copy(matrix);
1559        m_TempTextBuf.AppendChar(wChar);
1560        m_TempCharList.Add(charinfo);
1561    }
1562}
1563void CPDF_TextPage::FindPreviousTextObject(void)
1564{
1565    if (m_TempCharList.GetSize() < 1 && m_charList.GetSize() < 1) {
1566        return;
1567    }
1568    PAGECHAR_INFO preChar;
1569    if (m_TempCharList.GetSize() >= 1) {
1570        preChar = *(PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1);
1571    } else {
1572        preChar = *(PAGECHAR_INFO*)m_charList.GetAt(m_charList.GetSize() - 1);
1573    }
1574    if (preChar.m_pTextObj) {
1575        m_pPreTextObj = preChar.m_pTextObj;
1576    }
1577}
1578void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj)
1579{
1580    CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1581    if(FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f ) {
1582        return;
1583    }
1584    CFX_AffineMatrix formMatrix = Obj.m_formMatrix;
1585    CPDF_Font*	pFont = pTextObj->GetFont();
1586    CFX_AffineMatrix matrix;
1587    pTextObj->GetTextMatrix(&matrix);
1588    matrix.Concat(formMatrix);
1589    FX_INT32 bPreMKC = PreMarkedContent(Obj);
1590    if (FPDFTEXT_MC_DONE == bPreMKC) {
1591        m_pPreTextObj = pTextObj;
1592        m_perMatrix.Copy(formMatrix);
1593        return;
1594    }
1595    int result = 0;
1596    if (m_pPreTextObj) {
1597        result = ProcessInsertObject(pTextObj, formMatrix);
1598        if (2 == result) {
1599            m_CurlineRect = CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom, Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top);
1600        } else {
1601            m_CurlineRect.Union(CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom, Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top));
1602        }
1603        PAGECHAR_INFO generateChar;
1604        if (result == 1) {
1605            if (GenerateCharInfo(TEXT_BLANK_CHAR, generateChar)) {
1606                if (!formMatrix.IsIdentity()) {
1607                    generateChar.m_Matrix.Copy(formMatrix);
1608                }
1609                m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR);
1610                m_TempCharList.Add(generateChar);
1611            }
1612        } else if(result == 2) {
1613            CloseTempLine();
1614            if(m_TextBuf.GetSize()) {
1615                if(m_ParseOptions.m_bGetCharCodeOnly) {
1616                    m_TextBuf.AppendChar(TEXT_RETURN_CHAR);
1617                    m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR);
1618                } else {
1619                    if(GenerateCharInfo(TEXT_RETURN_CHAR, generateChar)) {
1620                        m_TextBuf.AppendChar(TEXT_RETURN_CHAR);
1621                        if (!formMatrix.IsIdentity()) {
1622                            generateChar.m_Matrix.Copy(formMatrix);
1623                        }
1624                        m_charList.Add(generateChar);
1625                    }
1626                    if(GenerateCharInfo(TEXT_LINEFEED_CHAR, generateChar)) {
1627                        m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR);
1628                        if (!formMatrix.IsIdentity()) {
1629                            generateChar.m_Matrix.Copy(formMatrix);
1630                        }
1631                        m_charList.Add(generateChar);
1632                    }
1633                }
1634            }
1635        } else if (result == 3 && !m_ParseOptions.m_bOutputHyphen) {
1636            FX_INT32 nChars = pTextObj->CountChars();
1637            if (nChars == 1) {
1638                CPDF_TextObjectItem item;
1639                pTextObj->GetCharInfo(0, &item);
1640                CFX_WideString wstrItem = pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1641                if(wstrItem.IsEmpty()) {
1642                    wstrItem += (FX_WCHAR)item.m_CharCode;
1643                }
1644                FX_WCHAR curChar = wstrItem.GetAt(0);
1645                if (0x2D == curChar || 0xAD == curChar) {
1646                    return;
1647                }
1648            }
1649            while (m_TempTextBuf.GetSize() > 0 && m_TempTextBuf.GetWideString().GetAt(m_TempTextBuf.GetLength() - 1) == 0x20) {
1650                m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1651                m_TempCharList.Delete(m_TempCharList.GetSize() - 1);
1652            }
1653            PAGECHAR_INFO* cha = (PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1);
1654            m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1655#ifdef FOXIT_CHROME_BUILD
1656            cha->m_Unicode = 0x2;
1657            cha->m_Flag = FPDFTEXT_CHAR_HYPHEN;
1658            m_TempTextBuf.AppendChar(0xfffe);
1659#else
1660            cha->m_Unicode = 0;
1661            m_TempTextBuf.AppendChar(0xfffe);
1662#endif
1663        }
1664    } else {
1665        m_CurlineRect = CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom, Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top);
1666    }
1667    if (FPDFTEXT_MC_DELAY == bPreMKC) {
1668        ProcessMarkedContent(Obj);
1669        m_pPreTextObj = pTextObj;
1670        m_perMatrix.Copy(formMatrix);
1671        return;
1672    }
1673    m_pPreTextObj = pTextObj;
1674    m_perMatrix.Copy(formMatrix);
1675    int nItems = pTextObj->CountItems();
1676    FX_FLOAT spacing = 0;
1677    FX_FLOAT baseSpace = 0.0;
1678    FX_BOOL bAllChar = TRUE;
1679    if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) {
1680        spacing = matrix.TransformDistance(pTextObj->m_TextState.GetObject()->m_CharSpace);
1681        baseSpace = spacing;
1682        for (int i = 0; i < nItems; i++) {
1683            CPDF_TextObjectItem item;
1684            pTextObj->GetItemInfo(i, &item);
1685            if (item.m_CharCode == (FX_DWORD) - 1) {
1686                FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1687                FX_FLOAT kerning = -fontsize_h * item.m_OriginX / 1000;
1688                if(kerning + spacing < baseSpace) {
1689                    baseSpace = kerning + spacing;
1690                }
1691                bAllChar = FALSE;
1692            }
1693        }
1694        spacing = 0;
1695        if(baseSpace < 0.0 || (nItems == 3 && !bAllChar)) {
1696            baseSpace = 0.0;
1697        }
1698    }
1699    for (int i = 0; i < nItems; i++) {
1700        CPDF_TextObjectItem item;
1701        PAGECHAR_INFO charinfo;
1702        charinfo.m_OriginX = 0;
1703        charinfo.m_OriginY = 0;
1704        pTextObj->GetItemInfo(i, &item);
1705        if (item.m_CharCode == (FX_DWORD) - 1) {
1706            CFX_WideString str = m_TempTextBuf.GetWideString();
1707            if(str.IsEmpty()) {
1708                str = m_TextBuf.GetWideString();
1709            }
1710            if (str.IsEmpty() || str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) {
1711                continue;
1712            }
1713            FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1714            spacing = -fontsize_h * item.m_OriginX / 1000;
1715            continue;
1716        }
1717        FX_FLOAT charSpace = pTextObj->m_TextState.GetObject()->m_CharSpace;
1718        if (charSpace > 0.001) {
1719            spacing += matrix.TransformDistance(charSpace);
1720        } else if(charSpace < -0.001) {
1721            spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
1722        }
1723        spacing -= baseSpace;
1724        if (spacing && i > 0) {
1725            int last_width = 0;
1726            FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1727            FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
1728            FX_FLOAT threshold = 0;
1729            if (space_charcode != -1) {
1730                threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000 ;
1731            }
1732            if (threshold > fontsize_h / 3) {
1733                threshold = 0;
1734            } else {
1735                threshold /= 2;
1736            }
1737            if (threshold == 0) {
1738                threshold = fontsize_h;
1739                int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
1740                threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX_FLOAT)last_width;
1741                int nDivide = 6;
1742                if (threshold < 300) {
1743                    nDivide = 2;
1744                } else if (threshold < 500) {
1745                    nDivide = 4;
1746                } else if (threshold < 700) {
1747                    nDivide = 5;
1748                }
1749                threshold = threshold / nDivide;
1750                threshold = fontsize_h * threshold / 1000;
1751            }
1752            if (threshold && (spacing && spacing >= threshold) ) {
1753                charinfo.m_Unicode = TEXT_BLANK_CHAR;
1754                charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED;
1755                charinfo.m_pTextObj = pTextObj;
1756                charinfo.m_Index = m_TextBuf.GetLength();
1757                m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR);
1758                charinfo.m_CharCode = -1;
1759                charinfo.m_Matrix.Copy(formMatrix);
1760                matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX, charinfo.m_OriginY);
1761                charinfo.m_CharBox = CFX_FloatRect(charinfo.m_OriginX, charinfo.m_OriginY, charinfo.m_OriginX, charinfo.m_OriginY);
1762                m_TempCharList.Add(charinfo);
1763            }
1764            if (item.m_CharCode == (FX_DWORD) - 1) {
1765                continue;
1766            }
1767        }
1768        spacing = 0;
1769        CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1770        FX_BOOL bNoUnicode = FALSE;
1771        FX_WCHAR wChar = wstrItem.GetAt(0);
1772        if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) {
1773            if(wstrItem.IsEmpty()) {
1774                wstrItem += (FX_WCHAR)item.m_CharCode;
1775            } else {
1776                wstrItem.SetAt(0, (FX_WCHAR)item.m_CharCode);
1777            }
1778            bNoUnicode = TRUE;
1779        }
1780        charinfo.m_Index = -1;
1781        charinfo.m_CharCode = item.m_CharCode;
1782        if(bNoUnicode) {
1783            charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE;
1784        } else {
1785            charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL;
1786        }
1787        charinfo.m_pTextObj = pTextObj;
1788        charinfo.m_OriginX = 0, charinfo.m_OriginY = 0;
1789        matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX, charinfo.m_OriginY);
1790        FX_RECT rect(0, 0, 0, 0);
1791        rect.Intersect(0, 0, 0, 0);
1792        charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode, rect);
1793        charinfo.m_CharBox.top = rect.top * pTextObj->GetFontSize() / 1000 + item.m_OriginY;
1794        charinfo.m_CharBox.left = rect.left * pTextObj->GetFontSize() / 1000 + item.m_OriginX;
1795        charinfo.m_CharBox.right = rect.right * pTextObj->GetFontSize() / 1000 + item.m_OriginX;
1796        charinfo.m_CharBox.bottom = rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_OriginY;
1797        if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) {
1798            charinfo.m_CharBox.top = charinfo.m_CharBox.bottom + pTextObj->GetFontSize();
1799        }
1800        if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) {
1801            charinfo.m_CharBox.right = charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
1802        }
1803        matrix.TransformRect(charinfo.m_CharBox);
1804        charinfo.m_Matrix.Copy(matrix);
1805        if (wstrItem.IsEmpty()) {
1806            charinfo.m_Unicode = 0;
1807            m_TempCharList.Add(charinfo);
1808            m_TempTextBuf.AppendChar(0xfffe);
1809            continue;
1810        } else {
1811            int nTotal = wstrItem.GetLength();
1812            int n = 0;
1813            FX_BOOL bDel = FALSE;
1814            while (n < m_TempCharList.GetSize() && n < 7) {
1815                n++;
1816                PAGECHAR_INFO* charinfo1 = (PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - n);
1817                if(charinfo1->m_CharCode == charinfo.m_CharCode &&
1818                        charinfo1->m_pTextObj->GetFont() == charinfo.m_pTextObj->GetFont()  &&
1819                        FXSYS_fabs(charinfo1->m_OriginX - charinfo.m_OriginX) < TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize()  &&
1820                        FXSYS_fabs(charinfo1->m_OriginY - charinfo.m_OriginY) < TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize() ) {
1821                    bDel = TRUE;
1822                    break;
1823                }
1824            }
1825            if(!bDel) {
1826                for (int nIndex = 0; nIndex < nTotal; nIndex++) {
1827                    charinfo.m_Unicode = wstrItem.GetAt(nIndex);
1828                    if (charinfo.m_Unicode) {
1829                        charinfo.m_Index = m_TextBuf.GetLength();
1830                        m_TempTextBuf.AppendChar(charinfo.m_Unicode);
1831                    } else {
1832                        m_TempTextBuf.AppendChar(0xfffe);
1833                    }
1834                    m_TempCharList.Add(charinfo);
1835                }
1836            } else if(i == 0) {
1837                CFX_WideString str = m_TempTextBuf.GetWideString();
1838                if (!str.IsEmpty() && str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) {
1839                    m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1840                    m_TempCharList.Delete(m_TempCharList.GetSize() - 1);
1841                }
1842            }
1843        }
1844    }
1845}
1846FX_INT32 CPDF_TextPage::GetTextObjectWritingMode(const CPDF_TextObject* pTextObj)
1847{
1848    FX_INT32 nChars = pTextObj->CountChars();
1849    if (nChars == 1) {
1850        return m_TextlineDir;
1851    }
1852    CPDF_TextObjectItem first, last;
1853    pTextObj->GetCharInfo(0, &first);
1854    pTextObj->GetCharInfo(nChars - 1, &last);
1855    CFX_Matrix textMatrix;
1856    pTextObj->GetTextMatrix(&textMatrix);
1857    textMatrix.TransformPoint(first.m_OriginX, first.m_OriginY);
1858    textMatrix.TransformPoint(last.m_OriginX, last.m_OriginY);
1859    FX_FLOAT dX = FXSYS_fabs(last.m_OriginX - first.m_OriginX);
1860    FX_FLOAT dY = FXSYS_fabs(last.m_OriginY - first.m_OriginY);
1861    if (dX <= 0.0001f && dY <= 0.0001f) {
1862        return -1;
1863    }
1864    CFX_VectorF v;
1865    v.Set(dX, dY);
1866    v.Normalize();
1867    if (v.y <= 0.0872f) {
1868        if (v.x <= 0.0872f) {
1869            return m_TextlineDir;
1870        }
1871        return 0;
1872    } else if (v.x <= 0.0872f) {
1873        return 1;
1874    }
1875    return m_TextlineDir;
1876}
1877FX_BOOL CPDF_TextPage::IsHyphen(FX_WCHAR curChar)
1878{
1879    CFX_WideString strCurText = m_TempTextBuf.GetWideString();
1880    if(strCurText.GetLength() == 0) {
1881        strCurText = m_TextBuf.GetWideString();
1882    }
1883    FX_STRSIZE nCount = strCurText.GetLength();
1884    int nIndex = nCount - 1;
1885    FX_WCHAR wcTmp = strCurText.GetAt(nIndex);
1886    while(wcTmp == 0x20 && nIndex <= nCount - 1 && nIndex >= 0) {
1887        wcTmp = strCurText.GetAt(--nIndex);
1888    }
1889    if (0x2D == wcTmp || 0xAD == wcTmp) {
1890        if (--nIndex > 0) {
1891            FX_WCHAR preChar = strCurText.GetAt((nIndex));
1892            if (((preChar >= L'A' && preChar <= L'Z') || (preChar >= L'a' && preChar <= L'z'))
1893                    && ((curChar >= L'A' && curChar <= L'Z') || (curChar >= L'a' && curChar <= L'z'))) {
1894                return TRUE;
1895            }
1896        }
1897        int size = m_TempCharList.GetSize();
1898        PAGECHAR_INFO preChar;
1899        if (size) {
1900            preChar = (PAGECHAR_INFO)m_TempCharList[size - 1];
1901        } else {
1902            size = m_charList.GetSize();
1903            if(size == 0) {
1904                return FALSE;
1905            }
1906            preChar = (PAGECHAR_INFO)m_charList[size - 1];
1907        }
1908        if (FPDFTEXT_CHAR_PIECE == preChar.m_Flag)
1909            if (0xAD == preChar.m_Unicode || 0x2D == preChar.m_Unicode) {
1910                return TRUE;
1911            }
1912    }
1913    return FALSE;
1914}
1915int CPDF_TextPage::ProcessInsertObject(const CPDF_TextObject* pObj, CFX_AffineMatrix formMatrix)
1916{
1917    FindPreviousTextObject();
1918    FX_BOOL bNewline = FALSE;
1919    int WritingMode = GetTextObjectWritingMode(pObj);
1920    if(WritingMode == -1) {
1921        WritingMode = GetTextObjectWritingMode(m_pPreTextObj);
1922    }
1923    CFX_FloatRect this_rect(pObj->m_Left, pObj->m_Bottom, pObj->m_Right, pObj->m_Top);
1924    CFX_FloatRect prev_rect(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom, m_pPreTextObj->m_Right, m_pPreTextObj->m_Top);
1925    CPDF_TextObjectItem PrevItem, item;
1926    int nItem = m_pPreTextObj->CountItems();
1927    m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem);
1928    pObj->GetItemInfo(0, &item);
1929    CFX_WideString wstrItem = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1930    if(wstrItem.IsEmpty()) {
1931        wstrItem += (FX_WCHAR)item.m_CharCode;
1932    }
1933    FX_WCHAR curChar = wstrItem.GetAt(0);
1934    if(WritingMode == 0) {
1935        if(this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) {
1936            FX_FLOAT top = this_rect.top < prev_rect.top ? this_rect.top : prev_rect.top;
1937            FX_FLOAT bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bottom : prev_rect.bottom;
1938            if(bottom >= top) {
1939                if(IsHyphen(curChar)) {
1940                    return 3;
1941                }
1942                return 2;
1943            }
1944        }
1945    } else if (WritingMode == 1) {
1946        if(this_rect.Width() > pObj->GetFontSize() * 0.1f && prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) {
1947            FX_FLOAT left = this_rect.left > m_CurlineRect.left ? this_rect.left : m_CurlineRect.left;
1948            FX_FLOAT right = this_rect.right < m_CurlineRect.right ? this_rect.right : m_CurlineRect.right;
1949            if(right <= left) {
1950                if(IsHyphen(curChar)) {
1951                    return 3;
1952                }
1953                return 2;
1954            }
1955        }
1956    }
1957    FX_FLOAT last_pos = PrevItem.m_OriginX;
1958    int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont());
1959    FX_FLOAT last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000;
1960    last_width = FXSYS_fabs(last_width);
1961    int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
1962    FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
1963    this_width = FXSYS_fabs(this_width);
1964    FX_FLOAT threshold = last_width > this_width ? last_width / 4 : this_width / 4;
1965    CFX_AffineMatrix prev_matrix, prev_reverse;
1966    m_pPreTextObj->GetTextMatrix(&prev_matrix);
1967    prev_matrix.Concat(m_perMatrix);
1968    prev_reverse.SetReverse(prev_matrix);
1969    FX_FLOAT x = pObj->GetPosX();
1970    FX_FLOAT y = pObj->GetPosY();
1971    formMatrix.Transform(x, y);
1972    prev_reverse.Transform(x, y);
1973    if(last_width < this_width) {
1974        threshold = prev_reverse.TransformDistance(threshold);
1975    }
1976    CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom, m_pPreTextObj->m_Right, pObj->m_Top);
1977    CFX_FloatRect rect2(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom, m_pPreTextObj->m_Right, m_pPreTextObj->m_Top);
1978    CFX_FloatRect rect3 = rect1;
1979    rect1.Intersect(rect2);
1980    if (WritingMode == 0) {
1981        if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5)
1982                || ((y > threshold * 2 || y < threshold * -3) && (FXSYS_fabs(y) < 1 ? FXSYS_fabs(x) < FXSYS_fabs(y) : TRUE))) {
1983            bNewline = TRUE;
1984            if(nItem > 1 ) {
1985                CPDF_TextObjectItem tempItem;
1986                m_pPreTextObj->GetItemInfo(0, &tempItem);
1987                CFX_AffineMatrix m;
1988                m_pPreTextObj->GetTextMatrix(&m);
1989                if(PrevItem.m_OriginX > tempItem.m_OriginX &&
1990                        m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 &&
1991                        m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9
1992                        && m.b < 0.1 && m.c < 0.1 ) {
1993                    CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000, m_pPreTextObj->m_Top);
1994                    if(re.Contains(pObj->GetPosX(), pObj->GetPosY())) {
1995                        bNewline = FALSE;
1996                    } else {
1997                        CFX_FloatRect re(0, pObj->m_Bottom, 1000, pObj->m_Top);
1998                        if(re.Contains(m_pPreTextObj->GetPosX(), m_pPreTextObj->GetPosY())) {
1999                            bNewline = FALSE;
2000                        }
2001                    }
2002                }
2003            }
2004        }
2005    }
2006    if(bNewline) {
2007        if(IsHyphen(curChar)) {
2008            return 3;
2009        }
2010        return 2;
2011    }
2012    FX_INT32 nChars = pObj->CountChars();
2013    if (nChars == 1 && ( 0x2D == curChar || 0xAD == curChar))
2014        if (IsHyphen(curChar)) {
2015            return 3;
2016        }
2017    CFX_WideString PrevStr = m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode);
2018    FX_WCHAR preChar = PrevStr.GetAt(PrevStr.GetLength() - 1);
2019    CFX_AffineMatrix matrix;
2020    pObj->GetTextMatrix(&matrix);
2021    matrix.Concat(formMatrix);
2022    threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
2023    threshold = threshold > 400 ? (threshold < 700 ? threshold / 4 :  (threshold > 800 ? threshold / 6 : threshold / 5)) : (threshold / 2);
2024    if(nLastWidth >= nThisWidth) {
2025        threshold *= FXSYS_fabs(m_pPreTextObj->GetFontSize());
2026    } else {
2027        threshold *= FXSYS_fabs(pObj->GetFontSize());
2028        threshold = matrix.TransformDistance(threshold);
2029        threshold = prev_reverse.TransformDistance(threshold);
2030    }
2031    threshold /= 1000;
2032    if((threshold < 1.4881 && threshold > 1.4879)
2033            || (threshold < 1.39001 && threshold > 1.38999)) {
2034        threshold *= 1.5;
2035    }
2036    if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' && preChar != L' ')
2037        if (curChar != L' ' && preChar != L' ') {
2038            if((x - last_pos - last_width) > threshold || (last_pos - x - last_width) > threshold) {
2039                return 1;
2040            }
2041            if(x < 0 && (last_pos - x - last_width) > threshold) {
2042                return 1;
2043            }
2044            if((x - last_pos - last_width) > this_width || (x - last_pos - this_width) > last_width ) {
2045                return 1;
2046            }
2047        }
2048    return 0;
2049}
2050FX_BOOL CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2)
2051{
2052    if (!pTextObj1 || !pTextObj2) {
2053        return FALSE;
2054    }
2055    CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom, pTextObj2->m_Right, pTextObj2->m_Top);
2056    CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom, pTextObj1->m_Right, pTextObj1->m_Top);
2057    if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty() && !m_ParseOptions.m_bGetCharCodeOnly) {
2058        FX_FLOAT dbXdif = FXSYS_fabs(rcPreObj.left - rcCurObj.left);
2059        int nCount = m_charList.GetSize();
2060        if (nCount >= 2) {
2061            PAGECHAR_INFO perCharTemp = (PAGECHAR_INFO)m_charList[nCount - 2];
2062            FX_FLOAT dbSpace = perCharTemp.m_CharBox.Width();
2063            if (dbXdif > dbSpace) {
2064                return FALSE;
2065            }
2066        }
2067    }
2068    if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
2069        rcPreObj.Intersect(rcCurObj);
2070        if (rcPreObj.IsEmpty()) {
2071            return FALSE;
2072        }
2073        if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) > rcCurObj.Width() / 2) {
2074            return FALSE;
2075        }
2076        if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
2077            return FALSE;
2078        }
2079    }
2080    int nPreCount = pTextObj2->CountItems();
2081    int nCurCount = pTextObj1->CountItems();
2082    if (nPreCount != nCurCount) {
2083        return FALSE;
2084    }
2085    CPDF_TextObjectItem itemPer, itemCur;
2086    for (int i = 0; i < nPreCount; i++) {
2087        pTextObj2->GetItemInfo(i, &itemPer);
2088        pTextObj1->GetItemInfo(i, &itemCur);
2089        if (itemCur.m_CharCode != itemPer.m_CharCode) {
2090            return FALSE;
2091        }
2092    }
2093    if(FXSYS_fabs(pTextObj1->GetPosX() - pTextObj2->GetPosX()) > GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont())*pTextObj2->GetFontSize() / 1000 * 0.9 ||
2094            FXSYS_fabs(pTextObj1->GetPosY() - pTextObj2->GetPosY()) >
2095            FX_MAX(FX_MAX(rcPreObj.Height() , rcPreObj.Width()), pTextObj2->GetFontSize()) / 8) {
2096        return FALSE;
2097    }
2098    return TRUE;
2099}
2100FX_BOOL CPDF_TextPage::IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos)
2101{
2102    if (!pTextObj) {
2103        return FALSE;
2104    }
2105    int i = 0;
2106    if (!ObjPos) {
2107        ObjPos = m_pPage->GetLastObjectPosition();
2108    }
2109    CPDF_PageObject* pObj = m_pPage->GetPrevObject(ObjPos);
2110    while (i < 5 && ObjPos) {
2111        pObj = m_pPage->GetPrevObject(ObjPos);
2112        if(pObj == pTextObj) {
2113            continue;
2114        }
2115        if(pObj->m_Type != PDFPAGE_TEXT) {
2116            continue;
2117        }
2118        if(IsSameTextObject((CPDF_TextObject*)pObj, pTextObj)) {
2119            return TRUE;
2120        }
2121        i++;
2122    }
2123    return FALSE;
2124}
2125FX_BOOL CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info)
2126{
2127    int size = m_TempCharList.GetSize();
2128    PAGECHAR_INFO preChar;
2129    if (size) {
2130        preChar = (PAGECHAR_INFO)m_TempCharList[size - 1];
2131    } else {
2132        size = m_charList.GetSize();
2133        if(size == 0) {
2134            return FALSE;
2135        }
2136        preChar = (PAGECHAR_INFO)m_charList[size - 1];
2137    }
2138    info.m_Index = m_TextBuf.GetLength();
2139    info.m_Unicode = unicode;
2140    info.m_pTextObj = NULL;
2141    info.m_CharCode = -1;
2142    info.m_Flag = FPDFTEXT_CHAR_GENERATED;
2143    int preWidth = 0;
2144    if (preChar.m_pTextObj && preChar.m_CharCode != (FX_DWORD) - 1) {
2145        preWidth = GetCharWidth(preChar.m_CharCode, preChar.m_pTextObj->GetFont());
2146    }
2147    FX_FLOAT fs = 0;
2148    if(preChar.m_pTextObj) {
2149        fs = preChar.m_pTextObj->GetFontSize();
2150    } else {
2151        fs = preChar.m_CharBox.Height();
2152    }
2153    if(!fs) {
2154        fs = 1;
2155    }
2156    info.m_OriginX = preChar.m_OriginX + preWidth * (fs) / 1000;
2157    info.m_OriginY = preChar.m_OriginY;
2158    info.m_CharBox = CFX_FloatRect(info.m_OriginX, info.m_OriginY, info.m_OriginX, info.m_OriginY);
2159    return TRUE;
2160}
2161FX_BOOL CPDF_TextPage::IsRectIntersect(CFX_FloatRect rect1, CFX_FloatRect rect2)
2162{
2163    rect1.Intersect(rect2);
2164    if(rect1.IsEmpty()) {
2165        return FALSE;
2166    }
2167    return TRUE;
2168}
2169FX_BOOL	CPDF_TextPage::IsLetter(FX_WCHAR unicode)
2170{
2171    if (unicode < L'A') {
2172        return FALSE;
2173    }
2174    if (unicode > L'Z' && unicode < L'a') {
2175        return FALSE;
2176    }
2177    if (unicode > L'z') {
2178        return FALSE;
2179    }
2180    return TRUE;
2181}
2182CPDF_TextPageFind::CPDF_TextPageFind(const IPDF_TextPage* pTextPage)
2183    : m_IsFind(FALSE),
2184      m_pTextPage(NULL)
2185{
2186    if (!pTextPage) {
2187        return;
2188    }
2189    CPDF_ModuleMgr* pPDFModule = CPDF_ModuleMgr::Get();
2190    m_pTextPage = pTextPage;
2191    m_strText = m_pTextPage->GetPageText();
2192    int nCount = pTextPage->CountChars();
2193    if(nCount) {
2194        m_CharIndex.Add(0);
2195    }
2196    for(int i = 0; i < nCount; i++) {
2197        FPDF_CHAR_INFO info;
2198        pTextPage->GetCharInfo(i, info);
2199        int indexSize = m_CharIndex.GetSize();
2200        if(info.m_Flag == CHAR_NORMAL || info.m_Flag == CHAR_GENERATED) {
2201            if(indexSize % 2) {
2202                m_CharIndex.Add(1);
2203            } else {
2204                if(indexSize <= 0) {
2205                    continue;
2206                }
2207                m_CharIndex.SetAt(indexSize - 1, m_CharIndex.GetAt(indexSize - 1) + 1);
2208            }
2209        } else {
2210            if(indexSize % 2) {
2211                if(indexSize <= 0) {
2212                    continue;
2213                }
2214                m_CharIndex.SetAt(indexSize - 1, i + 1);
2215            } else {
2216                m_CharIndex.Add(i + 1);
2217            }
2218        }
2219    }
2220    int indexSize = m_CharIndex.GetSize();
2221    if(indexSize % 2) {
2222        m_CharIndex.RemoveAt(indexSize - 1);
2223    }
2224    m_resStart = 0;
2225    m_resEnd = -1;
2226}
2227int CPDF_TextPageFind::GetCharIndex(int index) const
2228{
2229    return m_pTextPage->CharIndexFromTextIndex(index);
2230    int indexSize = m_CharIndex.GetSize();
2231    int count = 0;
2232    for(int i = 0; i < indexSize; i += 2) {
2233        count += m_CharIndex.GetAt(i + 1);
2234        if(count > index) {
2235            return 	index - count + m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i);
2236        }
2237    }
2238    return -1;
2239}
2240FX_BOOL	CPDF_TextPageFind::FindFirst(CFX_WideString findwhat, int flags, int startPos)
2241{
2242    if (!m_pTextPage) {
2243        return FALSE;
2244    }
2245    if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) {
2246        m_strText = m_pTextPage->GetPageText();
2247    }
2248    m_findWhat = findwhat;
2249    m_flags = flags;
2250    m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
2251    if (m_strText.IsEmpty()) {
2252        m_IsFind = FALSE;
2253        return TRUE;
2254    }
2255    FX_STRSIZE len = findwhat.GetLength();
2256    if (!m_bMatchCase) {
2257        findwhat.MakeLower();
2258        m_strText.MakeLower();
2259    }
2260    m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD;
2261    m_findNextStart = startPos;
2262    if (startPos == -1) {
2263        m_findPreStart = m_strText.GetLength() - 1;
2264    } else {
2265        m_findPreStart = startPos;
2266    }
2267    m_csFindWhatArray.RemoveAll();
2268    int i = 0;
2269    while(i < len) {
2270        if(findwhat.GetAt(i) != ' ') {
2271            break;
2272        }
2273        i++;
2274    }
2275    if(i < len) {
2276        ExtractFindWhat(findwhat);
2277    } else {
2278        m_csFindWhatArray.Add(findwhat);
2279    }
2280    if(m_csFindWhatArray.GetSize() <= 0) {
2281        return FALSE;
2282    }
2283    m_IsFind = TRUE;
2284    m_resStart = 0;
2285    m_resEnd = -1;
2286    return TRUE;
2287}
2288FX_BOOL CPDF_TextPageFind::FindNext()
2289{
2290    if (!m_pTextPage) {
2291        return FALSE;
2292    }
2293    m_resArray.RemoveAll();
2294    if(m_findNextStart == -1) {
2295        return FALSE;
2296    }
2297    if(m_strText.IsEmpty()) {
2298        m_IsFind = FALSE;
2299        return m_IsFind;
2300    }
2301    int strLen = m_strText.GetLength();
2302    if (m_findNextStart > strLen - 1) {
2303        m_IsFind = FALSE;
2304        return m_IsFind;
2305    }
2306    int nCount = m_csFindWhatArray.GetSize();
2307    int nResultPos = 0;
2308    int	nStartPos = 0;
2309    nStartPos = m_findNextStart;
2310    FX_BOOL bSpaceStart = FALSE;
2311    for(int iWord = 0; iWord < nCount; iWord++) {
2312        CFX_WideString csWord = m_csFindWhatArray[iWord];
2313        if(csWord.IsEmpty()) {
2314            if(iWord == nCount - 1) {
2315                FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
2316                if(strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_BLANK_CHAR || strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
2317                    nResultPos = nStartPos + 1;
2318                    break;
2319                }
2320                iWord = -1;
2321            } else if(iWord == 0) {
2322                bSpaceStart = TRUE;
2323            }
2324            continue;
2325        }
2326        int endIndex;
2327        nResultPos = m_strText.Find(csWord, nStartPos);
2328        if (nResultPos == -1) {
2329            m_IsFind = FALSE;
2330            return m_IsFind;
2331        }
2332        endIndex = nResultPos + csWord.GetLength() - 1;
2333        if(iWord == 0) {
2334            m_resStart = nResultPos;
2335        }
2336        FX_BOOL bMatch = TRUE;
2337        if(iWord != 0 && !bSpaceStart) {
2338            int PreResEndPos = nStartPos;
2339            int curChar = csWord.GetAt(0);
2340            CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
2341            int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
2342            if(nStartPos == nResultPos && !(_IsIgnoreSpaceCharacter(lastChar) || _IsIgnoreSpaceCharacter(curChar))) {
2343                bMatch = FALSE;
2344            }
2345            for(int d = PreResEndPos; d < nResultPos; d++) {
2346                FX_WCHAR strInsert = m_strText.GetAt(d);
2347                if(strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR && strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
2348                    bMatch = FALSE;
2349                    break;
2350                }
2351            }
2352        } else if(bSpaceStart) {
2353            if(nResultPos > 0) {
2354                FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
2355                if(strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR && strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
2356                    bMatch = FALSE;
2357                    m_resStart = nResultPos;
2358                } else {
2359                    m_resStart = nResultPos - 1;
2360                }
2361            }
2362        }
2363        if(m_bMatchWholeWord && bMatch) {
2364            bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
2365        }
2366        nStartPos = endIndex + 1;
2367        if(!bMatch) {
2368            iWord = -1;
2369            if(bSpaceStart) {
2370                nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
2371            } else {
2372                nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
2373            }
2374        }
2375    }
2376    m_resEnd = nResultPos + m_csFindWhatArray[m_csFindWhatArray.GetSize() - 1].GetLength() - 1;
2377    m_IsFind = TRUE;
2378    int resStart = GetCharIndex(m_resStart);
2379    int resEnd = GetCharIndex(m_resEnd);
2380    m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1, m_resArray);
2381    if(m_flags & FPDFTEXT_CONSECUTIVE) {
2382        m_findNextStart = m_resStart + 1;
2383        m_findPreStart = m_resEnd - 1;
2384    } else {
2385        m_findNextStart = m_resEnd + 1;
2386        m_findPreStart = m_resStart - 1;
2387    }
2388    return m_IsFind;
2389}
2390FX_BOOL CPDF_TextPageFind::FindPrev()
2391{
2392    if (!m_pTextPage) {
2393        return FALSE;
2394    }
2395    m_resArray.RemoveAll();
2396    if(m_strText.IsEmpty() || m_findPreStart < 0) {
2397        m_IsFind = FALSE;
2398        return m_IsFind;
2399    }
2400    CPDF_TextPageFind findEngine(m_pTextPage);
2401    FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags);
2402    if(!ret) {
2403        m_IsFind = FALSE;
2404        return m_IsFind;
2405    }
2406    int	order = -1, MatchedCount = 0;
2407    while(ret) {
2408        ret = findEngine.FindNext();
2409        if(ret) {
2410            int order1 = findEngine.GetCurOrder() ;
2411            int	MatchedCount1 = findEngine.GetMatchedCount();
2412            if(((order1 + MatchedCount1) - 1) > m_findPreStart) {
2413                break;
2414            }
2415            order = order1;
2416            MatchedCount = MatchedCount1;
2417        }
2418    }
2419    if(order == -1) {
2420        m_IsFind = FALSE;
2421        return m_IsFind;
2422    }
2423    m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
2424    m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
2425    m_IsFind = TRUE;
2426    m_pTextPage->GetRectArray(order, MatchedCount, m_resArray);
2427    if(m_flags & FPDFTEXT_CONSECUTIVE) {
2428        m_findNextStart = m_resStart + 1;
2429        m_findPreStart = m_resEnd - 1;
2430    } else {
2431        m_findNextStart = m_resEnd + 1;
2432        m_findPreStart = m_resStart - 1;
2433    }
2434    return m_IsFind;
2435}
2436void CPDF_TextPageFind::ExtractFindWhat(CFX_WideString findwhat)
2437{
2438    if(findwhat.IsEmpty()) {
2439        return ;
2440    }
2441    int index = 0;
2442    while(1) {
2443        CFX_WideString csWord = TEXT_EMPTY;
2444        int ret = ExtractSubString(csWord, findwhat, index, TEXT_BLANK_CHAR);
2445        if(csWord.IsEmpty()) {
2446            if(ret) {
2447                m_csFindWhatArray.Add(CFX_WideString(L""));
2448                index++;
2449                continue;
2450            } else {
2451                break;
2452            }
2453        }
2454        int pos = 0;
2455        FX_BOOL bLastIgnore = FALSE;
2456        while(pos < csWord.GetLength()) {
2457            CFX_WideString curStr = csWord.Mid(pos, 1);
2458            FX_WCHAR curChar = csWord.GetAt(pos);
2459            if (_IsIgnoreSpaceCharacter(curChar)) {
2460                if (pos > 0 && curChar == 0x2019) {
2461                    pos++;
2462                    continue;
2463                }
2464                if (pos > 0 ) {
2465                    CFX_WideString preStr = csWord.Mid(0, pos);
2466                    m_csFindWhatArray.Add(preStr);
2467                }
2468                m_csFindWhatArray.Add(curStr);
2469                if (pos == csWord.GetLength() - 1) {
2470                    csWord.Empty();
2471                    break;
2472                }
2473                csWord = csWord.Right(csWord.GetLength() - pos - 1);
2474                pos = 0;
2475                bLastIgnore = TRUE;
2476                continue;
2477            } else {
2478                bLastIgnore = FALSE;
2479            }
2480            pos++;
2481        }
2482        if (!csWord.IsEmpty()) {
2483            m_csFindWhatArray.Add(csWord);
2484        }
2485        index++;
2486    }
2487    return;
2488}
2489FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(CFX_WideString csPageText, int startPos, int endPos)
2490{
2491    int char_left = 0;
2492    int char_right = 0;
2493    int char_count = endPos - startPos + 1;
2494    if(char_count < 1) {
2495        return FALSE;
2496    }
2497    if (char_count == 1 && csPageText.GetAt(startPos) > 255) {
2498        return TRUE;
2499    }
2500    if(startPos - 1 >= 0 ) {
2501        char_left = csPageText.GetAt(startPos - 1);
2502    }
2503    if(startPos + char_count < csPageText.GetLength()) {
2504        char_right = csPageText.GetAt(startPos + char_count);
2505    }
2506    if(char_left == 0x61) {
2507        int a = 0;
2508    }
2509    if ((char_left > 'A' && char_left < 'a') || (char_left > 'a' && char_left < 'z') || (char_left > 0xfb00 && char_left < 0xfb06) || (char_left >= '0' && char_left <= '9') ||
2510            (char_right > 'A' && char_right < 'a') || (char_right > 'a' && char_right < 'z') || (char_right > 0xfb00 && char_right < 0xfb06) || (char_right >= '0' && char_right <= '9')) {
2511        return FALSE;
2512    }
2513    if(!(('A' > char_left || char_left > 'Z')  && ('a' > char_left || char_left > 'z')
2514            && ('A' > char_right || char_right > 'Z')  && ('a' > char_right || char_right > 'z'))) {
2515        return FALSE;
2516    }
2517    if (char_count > 0) {
2518        if (csPageText.GetAt(startPos) >= L'0' && csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' && char_left <= L'9') {
2519            return FALSE;
2520        }
2521        if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' && char_right >= L'0' && char_right <= L'9') {
2522            return FALSE;
2523        }
2524    }
2525    return TRUE;
2526}
2527FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString, FX_LPCWSTR lpszFullString,
2528        int iSubString, FX_WCHAR chSep)
2529{
2530    if (lpszFullString == NULL) {
2531        return FALSE;
2532    }
2533    while (iSubString--) {
2534        lpszFullString = FXSYS_wcschr(lpszFullString, chSep);
2535        if (lpszFullString == NULL) {
2536            rString.Empty();
2537            return FALSE;
2538        }
2539        lpszFullString++;
2540        while(*lpszFullString == chSep) {
2541            lpszFullString++;
2542        }
2543    }
2544    FX_LPCWSTR lpchEnd = FXSYS_wcschr(lpszFullString, chSep);
2545    int nLen = (lpchEnd == NULL) ?
2546               (int)FXSYS_wcslen(lpszFullString) : (int)(lpchEnd - lpszFullString);
2547    ASSERT(nLen >= 0);
2548    FXSYS_memcpy32(rString.GetBuffer(nLen), lpszFullString, nLen * sizeof(FX_WCHAR));
2549    rString.ReleaseBuffer();
2550    return TRUE;
2551}
2552CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString str)
2553{
2554    CFX_WideString str2;
2555    str2.Empty();
2556    int nlen = str.GetLength();
2557    for(int i = nlen - 1; i >= 0; i--) {
2558        str2 += str.GetAt(i);
2559    }
2560    return str2;
2561}
// Copies the contents of m_resArray into the caller-supplied rects array.
void CPDF_TextPageFind::GetRectArray(CFX_RectArray& rects) const
{
    rects.Copy(m_resArray);
}
2566int	CPDF_TextPageFind::GetCurOrder() const
2567{
2568    return GetCharIndex(m_resStart);
2569}
2570int	CPDF_TextPageFind::GetMatchedCount()const
2571{
2572    int resStart = GetCharIndex(m_resStart);
2573    int resEnd = GetCharIndex(m_resEnd);
2574    return resEnd - resStart + 1;
2575}
// Constructs an extractor with no text page attached (m_pTextPage is NULL)
// and the parsed flag cleared.
CPDF_LinkExtract::CPDF_LinkExtract()
    : m_pTextPage(NULL),
      m_IsParserd(FALSE)
{
}
// Calls DeleteLinkList() so that the link records still held in m_LinkList
// are deleted together with the extractor.
CPDF_LinkExtract::~CPDF_LinkExtract()
{
    DeleteLinkList();
}
2585FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage)
2586{
2587    if (!pTextPage || !pTextPage->IsParsered()) {
2588        return FALSE;
2589    }
2590    m_pTextPage = (const CPDF_TextPage*)pTextPage;
2591    m_strPageText = m_pTextPage->GetPageText(0, -1);
2592    DeleteLinkList();
2593    if (m_strPageText.IsEmpty()) {
2594        return FALSE;
2595    }
2596    parserLink();
2597    m_IsParserd = TRUE;
2598    return TRUE;
2599}
2600void CPDF_LinkExtract::DeleteLinkList()
2601{
2602    while (m_LinkList.GetSize()) {
2603        CPDF_LinkExt* linkinfo = NULL;
2604        linkinfo = m_LinkList.GetAt(0);
2605        m_LinkList.RemoveAt(0);
2606        delete linkinfo;
2607    }
2608    m_LinkList.RemoveAll();
2609}
2610int CPDF_LinkExtract::CountLinks() const
2611{
2612    if (!m_IsParserd)	{
2613        return -1;
2614    }
2615    return m_LinkList.GetSize();
2616}
2617void CPDF_LinkExtract::parserLink()
2618{
2619    int start = 0, pos = 0;
2620    int TotalChar = m_pTextPage->CountChars();
2621    while (pos < TotalChar) {
2622        FPDF_CHAR_INFO pageChar;
2623        m_pTextPage->GetCharInfo(pos, pageChar);
2624        if (pageChar.m_Flag == CHAR_GENERATED || pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {
2625            int nCount = pos - start;
2626            if(pos == TotalChar - 1) {
2627                nCount++;
2628            }
2629            CFX_WideString strBeCheck;
2630            strBeCheck = m_pTextPage->GetPageText(start, nCount);
2631            if (strBeCheck.GetLength() > 5) {
2632                while(strBeCheck.GetLength() > 0) {
2633                    FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
2634                    if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
2635                        strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
2636                        nCount--;
2637                    } else {
2638                        break;
2639                    }
2640                }
2641                if (nCount > 5 && (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
2642                    if (!AppendToLinkList(start, nCount, strBeCheck)) {
2643                        break;
2644                    }
2645                }
2646            }
2647            start = ++pos;
2648        } else {
2649            pos++;
2650        }
2651    }
2652}
2653FX_BOOL CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck)
2654{
2655    CFX_WideString str = strBeCheck;
2656    str.MakeLower();
2657    if (str.Find(L"http://www.") != -1) {
2658        strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
2659        return TRUE;
2660    } else if (str.Find(L"http://") != -1) {
2661        strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
2662        return TRUE;
2663    } else if (str.Find(L"https://www.") != -1) {
2664        strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
2665        return TRUE;
2666    } else if (str.Find(L"https://") != -1) {
2667        strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
2668        return TRUE;
2669    } else if (str.Find(L"www.") != -1) {
2670        strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
2671        strBeCheck = L"http://" + strBeCheck;
2672        return TRUE;
2673    } else {
2674        return FALSE;
2675    }
2676}
// Checks whether str looks like an e-mail address and, if so, rewrites it in
// place as a "mailto:" URL.
//
// str is lower-cased before anything else, so the caller's string is modified
// even when FALSE is returned. TRUE is returned only if an '@' is found after
// at least one character, the text before it passes the checks below, str
// contains a '.', and every character between the '@' and the final
// dot-separated label is '-', '.', a-z or 0-9.
FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str)
{
    str.MakeLower();
    int aPos = str.Find(L'@');
    // Reject when there is no '@' or when it is the very first character.
    if (aPos < 1) {
        return FALSE;
    }
    // The character right before '@' must not be '.' or '_'.
    if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') {
        return FALSE;
    }
    int i;
    // Scan backwards from '@' over '_', '.', a-z and 0-9. At the first other
    // character, everything up to and including it is cut off the front.
    for (i = aPos - 1; i >= 0; i--) {
        FX_WCHAR ch = str.GetAt(i);
        if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') || (ch >= L'0' && ch <= L'9')) {
            continue;
        } else {
            // Nothing acceptable directly before '@'.
            if (i == aPos - 1) {
                return FALSE;
            }
            str = str.Right(str.GetLength() - i - 1);
            break;
        }
    }
    // str may have been shortened, so locate '@' again.
    aPos = str.Find(L'@');
    if (aPos < 1) {
        return FALSE;
    }
    CFX_WideString strtemp = L"";
    // Walk forward until the first a-z character before '@'.
    // NOTE(review): the count passed to Right() is GetLength() - i + 1, which
    // for i == 0 exceeds the string length; presumably this was meant to strip
    // the leading non-letter characters -- confirm the intended arithmetic.
    for (i = 0; i < aPos; i++) {
        FX_WCHAR wch = str.GetAt(i);
        if (wch >= L'a' && wch <= L'z') {
            break;
        } else {
            strtemp = str.Right(str.GetLength() - i + 1);
        }
    }
    if (strtemp != L"") {
        str = strtemp;
    }
    aPos = str.Find(L'@');
    if (aPos < 1) {
        return FALSE;
    }
    str.TrimRight(L'.');
    strtemp = str;
    // A '.' anywhere in str is required from here on.
    int ePos = str.Find(L'.');
    if (ePos == -1) {
        return FALSE;
    }
    // Repeatedly drop everything up to the next '.', leaving strtemp as the
    // text after the last '.' in str.
    while (ePos != -1) {
        strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1);
        ePos = strtemp.Find('.');
    }
    // From here ePos holds the length of that final label.
    ePos = strtemp.GetLength();
    // NOTE(review): this loop reads str.GetAt(i), i.e. the first ePos
    // characters of str, although ePos was derived from strtemp (the final
    // label); confirm whether strtemp.GetAt(i) was intended.
    for (i = 0; i < ePos; i++) {
        FX_WCHAR wch = str.GetAt(i);
        if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) {
            continue;
        } else {
            str = str.Left(str.GetLength() - ePos + i + 1);
            ePos = ePos - i - 1;
            break;
        }
    }
    int nLen = str.GetLength();
    // Validate the characters after '@' up to (not including) the last ePos
    // characters of str. aPos is the value found before TrimRight()/Left().
    for (i = aPos + 1; i < nLen - ePos; i++) {
        FX_WCHAR wch = str.GetAt(i);
        if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) {
            continue;
        } else {
            return FALSE;
        }
    }
    // Prepend the scheme unless "mailto:" already occurs somewhere in str.
    if (str.Find(L"mailto:") == -1) {
        str = L"mailto:" + str;
    }
    return TRUE;
}
2755FX_BOOL CPDF_LinkExtract::AppendToLinkList(int start, int count, CFX_WideString strUrl)
2756{
2757    CPDF_LinkExt* linkInfo = NULL;
2758    linkInfo = FX_NEW CPDF_LinkExt;
2759    if (!linkInfo) {
2760        return FALSE;
2761    }
2762    linkInfo->m_strUrl = strUrl;
2763    linkInfo->m_Start = start;
2764    linkInfo->m_Count = count;
2765    m_LinkList.Add(linkInfo);
2766    return TRUE;
2767}
2768CFX_WideString CPDF_LinkExtract::GetURL(int index) const
2769{
2770    if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) {
2771        return L"";
2772    }
2773    CPDF_LinkExt* link = NULL;
2774    link = m_LinkList.GetAt(index);
2775    if (!link) {
2776        return L"";
2777    }
2778    return link->m_strUrl;
2779}
2780void CPDF_LinkExtract::GetBoundedSegment(int index, int& start, int& count) const
2781{
2782    if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) {
2783        return ;
2784    }
2785    CPDF_LinkExt* link = NULL;
2786    link = m_LinkList.GetAt(index);
2787    if (!link) {
2788        return ;
2789    }
2790    start = link->m_Start;
2791    count = link->m_Count;
2792}
2793void CPDF_LinkExtract::GetRects(int index, CFX_RectArray& rects) const
2794{
2795    if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) {
2796        return;
2797    }
2798    CPDF_LinkExt* link = NULL;
2799    link = m_LinkList.GetAt(index);
2800    if (!link) {
2801        return ;
2802    }
2803    m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);
2804}
2805