1// Copyright 2014 PDFium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#include "../../include/fpdfapi/fpdf_pageobj.h"
8#include "../../include/fpdftext/fpdf_text.h"
9#include "../../include/fpdfapi/fpdf_page.h"
10class CPDF_TextStream
11{
12public:
13    CPDF_TextStream(CFX_WideTextBuf& buffer, FX_BOOL bUseLF, CFX_PtrArray* pObjArray);
14    ~CPDF_TextStream() {}
15    FX_BOOL ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine);
16    CFX_WideTextBuf&	m_Buffer;
17    FX_BOOL				m_bUseLF;
18    CFX_PtrArray*		m_pObjArray;
19    const CPDF_TextObject*	m_pLastObj;
20};
21CPDF_TextStream::CPDF_TextStream(CFX_WideTextBuf& buffer, FX_BOOL bUseLF, CFX_PtrArray* pObjArray) : m_Buffer(buffer)
22{
23    m_pLastObj = NULL;
24    m_bUseLF = bUseLF;
25    m_pObjArray = pObjArray;
26}
27FX_BOOL FPDFText_IsSameTextObject(const CPDF_TextObject* pTextObj1, const CPDF_TextObject* pTextObj2)
28{
29    if (!pTextObj1 || !pTextObj2) {
30        return FALSE;
31    }
32    CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom, pTextObj2->m_Right, pTextObj2->m_Top);
33    CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom, pTextObj1->m_Right, pTextObj1->m_Top);
34    if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) {
35        return TRUE;
36    }
37    if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
38        rcPreObj.Intersect(rcCurObj);
39        if (rcPreObj.IsEmpty()) {
40            return FALSE;
41        }
42        if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) > rcCurObj.Width() / 2) {
43            return FALSE;
44        }
45        if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
46            return FALSE;
47        }
48    }
49    int nPreCount = pTextObj2->CountItems();
50    int nCurCount = pTextObj1->CountItems();
51    if (nPreCount != nCurCount) {
52        return FALSE;
53    }
54    for (int i = 0; i < nPreCount; i++) {
55        CPDF_TextObjectItem itemPer, itemCur;
56        pTextObj2->GetItemInfo(i, &itemPer);
57        pTextObj1->GetItemInfo(i, &itemCur);
58        if (itemCur.m_CharCode != itemPer.m_CharCode) {
59            return FALSE;
60        }
61    }
62    return TRUE;
63}
64int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont)
65{
66    if(charCode == -1) {
67        return 0;
68    }
69    int w = pFont->GetCharWidthF(charCode);
70    if(w == 0) {
71        CFX_ByteString str;
72        pFont->AppendChar(str, charCode);
73        w = pFont->GetStringWidth(str, 1);
74        if(w == 0) {
75            FX_RECT BBox;
76            pFont->GetCharBBox(charCode, BBox);
77            w = BBox.right - BBox.left;
78        }
79    }
80    return w;
81}
82int FPDFText_ProcessInterObj(const CPDF_TextObject* pPrevObj, const CPDF_TextObject* pObj)
83{
84    if(FPDFText_IsSameTextObject(pPrevObj, pObj)) {
85        return -1;
86    }
87    CPDF_TextObjectItem item;
88    int nItem = pPrevObj->CountItems();
89    pPrevObj->GetItemInfo(nItem - 1, &item);
90    FX_WCHAR preChar = 0, curChar = 0;
91    CFX_WideString wstr = pPrevObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
92    if(wstr.GetLength()) {
93        preChar = wstr.GetAt(0);
94    }
95    FX_FLOAT last_pos = item.m_OriginX;
96    int nLastWidth = GetCharWidth(item.m_CharCode, pPrevObj->GetFont());
97    FX_FLOAT last_width = nLastWidth * pPrevObj->GetFontSize() / 1000;
98    last_width = FXSYS_fabs(last_width);
99    pObj->GetItemInfo(0, &item);
100    wstr = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
101    if(wstr.GetLength()) {
102        curChar = wstr.GetAt(0);
103    }
104    int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
105    FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
106    this_width = FXSYS_fabs(this_width);
107    FX_FLOAT threshold = last_width > this_width ? last_width / 4 : this_width / 4;
108    CFX_AffineMatrix prev_matrix, prev_reverse;
109    pPrevObj->GetTextMatrix(&prev_matrix);
110    prev_reverse.SetReverse(prev_matrix);
111    FX_FLOAT x = pObj->GetPosX(), y = pObj->GetPosY();
112    prev_reverse.Transform(x, y);
113    if (FXSYS_fabs(y) > threshold * 2) {
114        return 2;
115    }
116    threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
117    threshold = threshold > 400 ? (threshold < 700 ? threshold / 4 :  threshold / 5) : (threshold / 2);
118    threshold *= nLastWidth > nThisWidth ? FXSYS_fabs(pPrevObj->GetFontSize()) : FXSYS_fabs(pObj->GetFontSize());
119    threshold /= 1000;
120    if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' && preChar != L' ')
121        if(curChar != L' ' && preChar != L' ') {
122            if((x - last_pos - last_width) > threshold || (last_pos - x - last_width) > threshold) {
123                return 1;
124            }
125            if(x < 0 && (last_pos - x - last_width) > threshold) {
126                return 1;
127            }
128            if((x - last_pos - last_width) > this_width || (x - last_pos - this_width) > last_width ) {
129                return 1;
130            }
131        }
132    if(last_pos + last_width > x + this_width && curChar == L' ') {
133        return 3;
134    }
135    return 0;
136}
137FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine)
138{
139    CPDF_Font* pFont = pObj->GetFont();
140    CFX_AffineMatrix matrix;
141    pObj->GetTextMatrix(&matrix);
142    int item_index = 0;
143    if (m_pLastObj) {
144        int result = FPDFText_ProcessInterObj(m_pLastObj, pObj);
145        if (result == 2) {
146            int len = m_Buffer.GetLength();
147            if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') {
148                m_Buffer.Delete(len - 1, 1);
149                if (m_pObjArray) {
150                    m_pObjArray->RemoveAt((len - 1) * 2, 2);
151                }
152            } else {
153                if (bFirstLine) {
154                    return TRUE;
155                }
156                if (m_bUseLF) {
157                    m_Buffer.AppendChar(L'\r');
158                    m_Buffer.AppendChar(L'\n');
159                    if (m_pObjArray) {
160                        for (int i = 0; i < 4; i ++) {
161                            m_pObjArray->Add(NULL);
162                        }
163                    }
164                } else {
165                    m_Buffer.AppendChar(' ');
166                    if (m_pObjArray) {
167                        m_pObjArray->Add(NULL);
168                        m_pObjArray->Add(NULL);
169                    }
170                }
171            }
172        } else if (result == 1) {
173            m_Buffer.AppendChar(L' ');
174            if (m_pObjArray) {
175                m_pObjArray->Add(NULL);
176                m_pObjArray->Add(NULL);
177            }
178        } else if (result == -1) {
179            m_pLastObj = pObj;
180            return FALSE;
181        } else if (result == 3) {
182            item_index = 1;
183        }
184    }
185    m_pLastObj = pObj;
186    int nItems = pObj->CountItems();
187    FX_FLOAT Ignorekerning = 0;
188    for(int i = 1; i < nItems - 1; i += 2) {
189        CPDF_TextObjectItem item;
190        pObj->GetItemInfo(i, &item);
191        if (item.m_CharCode == (FX_DWORD) - 1) {
192            if(i == 1) {
193                Ignorekerning = item.m_OriginX;
194            } else if(Ignorekerning > item.m_OriginX) {
195                Ignorekerning = item.m_OriginX;
196            }
197        } else {
198            Ignorekerning = 0;
199            break;
200        }
201    }
202    FX_FLOAT spacing = 0;
203    for (; item_index < nItems; item_index ++) {
204        CPDF_TextObjectItem item;
205        pObj->GetItemInfo(item_index, &item);
206        if (item.m_CharCode == (FX_DWORD) - 1) {
207            CFX_WideString wstr = m_Buffer.GetWideString();
208            if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') {
209                continue;
210            }
211            FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
212            spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000;
213            continue;
214        }
215        FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace;
216        if(nItems > 3 && !spacing) {
217            charSpace = 0;
218        }
219        if((spacing || charSpace) && item_index > 0) {
220            int last_width = 0;
221            FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
222            FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
223            FX_FLOAT threshold = 0;
224            if (space_charcode != -1) {
225                threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000 ;
226            }
227            if(threshold > fontsize_h / 3) {
228                threshold = 0;
229            } else {
230                threshold /= 2;
231            }
232            if (threshold == 0) {
233                threshold = fontsize_h;
234                int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
235                threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX_FLOAT)last_width;
236                int nDivide = 6;
237                if (threshold < 300) {
238                    nDivide = 2;
239                } else if (threshold < 500) {
240                    nDivide = 4;
241                } else if (threshold < 700) {
242                    nDivide = 5;
243                }
244                threshold = threshold / nDivide;
245                threshold = fontsize_h * threshold / 1000;
246            }
247            if(charSpace > 0.001) {
248                spacing += matrix.TransformDistance(charSpace);
249            } else if(charSpace < -0.001) {
250                spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
251            }
252            if (threshold && (spacing && spacing >= threshold) ) {
253                m_Buffer.AppendChar(L' ');
254                if (m_pObjArray) {
255                    m_pObjArray->Add(NULL);
256                    m_pObjArray->Add(NULL);
257                }
258            }
259            if (item.m_CharCode == (FX_DWORD) - 1) {
260                continue;
261            }
262            spacing = 0;
263        }
264        CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode);
265        if (unicode_str.IsEmpty()) {
266            m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode);
267            if (m_pObjArray) {
268                m_pObjArray->Add((void*)pObj);
269                m_pObjArray->Add((void*)(FX_INTPTR)item_index);
270            }
271        } else {
272            m_Buffer << unicode_str;
273            if (m_pObjArray) {
274                for (int i = 0; i < unicode_str.GetLength(); i ++) {
275                    m_pObjArray->Add((void*)pObj);
276                    m_pObjArray->Add((void*)(FX_INTPTR)item_index);
277                }
278            }
279        }
280    }
281    return FALSE;
282}
283void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects* pPage, FX_BOOL bUseLF,
284                                CFX_PtrArray* pObjArray)
285{
286    CPDF_TextStream textstream(buffer, bUseLF, pObjArray);
287    FX_POSITION pos = pPage->GetFirstObjectPosition();
288    while (pos) {
289        CPDF_PageObject* pObject = pPage->GetNextObject(pos);
290        if (pObject == NULL) {
291            continue;
292        }
293        if (pObject->m_Type != PDFPAGE_TEXT) {
294            continue;
295        }
296        textstream.ProcessObject((CPDF_TextObject*)pObject, FALSE);
297    }
298}
299CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary* pPage)
300{
301    CFX_WideTextBuf buffer;
302    buffer.EstimateSize(0, 1024);
303    CPDF_Page page;
304    page.Load(pDoc, pPage);
305    CPDF_ParseOptions options;
306    options.m_bTextOnly = TRUE;
307    options.m_bSeparateForm = FALSE;
308    page.ParseContent(&options);
309    CPDF_TextStream textstream(buffer, FALSE, NULL);
310    FX_POSITION pos = page.GetFirstObjectPosition();
311    while (pos) {
312        CPDF_PageObject* pObject = page.GetNextObject(pos);
313        if (pObject->m_Type != PDFPAGE_TEXT) {
314            continue;
315        }
316        if (textstream.ProcessObject((CPDF_TextObject*)pObject, TRUE)) {
317            break;
318        }
319    }
320    return buffer.GetWideString();
321}
322