1// Copyright 2014 PDFium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#include "core/include/fpdfapi/fpdf_parser.h"
8
9#include "core/include/fxcrt/fx_ext.h"
10
// Lexical class of every 8-bit character code. Each entry is one of:
//   'W' - whitespace: NUL, TAB, LF, FF, CR, SPACE, 0x80, 0xff
//   'N' - numeric: 0123456789+-.
//   'D' - delimiter: %()/<>[]{}
//   'R' - regular: everything else.
// The table is laid out eight entries per row; the comment above each row
// names the character codes it covers.
const char PDF_CharType[256] = {
    // 0x00: NUL SOH STX ETX EOT ENQ ACK BEL
    'W', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0x08: BS  HT  LF  VT  FF  CR  SO  SI
    'R', 'W', 'W', 'R', 'W', 'W', 'R', 'R',
    // 0x10: DLE DC1 DC2 DC3 DC4 NAK SYN ETB
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0x18: CAN EM  SUB ESC FS  GS  RS  US
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0x20: SP  !   "   #   $   %   &   '
    'W', 'R', 'R', 'R', 'R', 'D', 'R', 'R',
    // 0x28: (   )   *   +   ,   -   .   /
    'D', 'D', 'R', 'N', 'R', 'N', 'N', 'D',
    // 0x30: 0   1   2   3   4   5   6   7
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    // 0x38: 8   9   :   ;   <   =   >   ?
    'N', 'N', 'R', 'R', 'D', 'R', 'D', 'R',
    // 0x40: @   A   B   C   D   E   F   G
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0x48: H   I   J   K   L   M   N   O
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0x50: P   Q   R   S   T   U   V   W
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0x58: X   Y   Z   [   (backslash)   ]   ^   _
    'R', 'R', 'R', 'D', 'R', 'D', 'R', 'R',
    // 0x60: `   a   b   c   d   e   f   g
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0x68: h   i   j   k   l   m   n   o
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0x70: p   q   r   s   t   u   v   w
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0x78: x   y   z   {   |   }   ~   DEL
    'R', 'R', 'R', 'D', 'R', 'D', 'R', 'R',
    // 0x80 - 0x87 (0x80 is treated as whitespace)
    'W', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0x88 - 0x8f
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0x90 - 0x97
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0x98 - 0x9f
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0xa0 - 0xa7
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0xa8 - 0xaf
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0xb0 - 0xb7
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0xb8 - 0xbf
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0xc0 - 0xc7
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0xc8 - 0xcf
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0xd0 - 0xd7
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0xd8 - 0xdf
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0xe0 - 0xe7
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0xe8 - 0xef
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0xf0 - 0xf7
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
    // 0xf8 - 0xff (0xff is treated as whitespace)
    'R', 'R', 'R', 'R', 'R', 'R', 'R', 'W'};
62
63CPDF_SimpleParser::CPDF_SimpleParser(const uint8_t* pData, FX_DWORD dwSize) {
64  m_pData = pData;
65  m_dwSize = dwSize;
66  m_dwCurPos = 0;
67}
68CPDF_SimpleParser::CPDF_SimpleParser(const CFX_ByteStringC& str) {
69  m_pData = str.GetPtr();
70  m_dwSize = str.GetLength();
71  m_dwCurPos = 0;
72}
73void CPDF_SimpleParser::ParseWord(const uint8_t*& pStart,
74                                  FX_DWORD& dwSize,
75                                  int& type) {
76  pStart = NULL;
77  dwSize = 0;
78  type = PDFWORD_EOF;
79  uint8_t ch;
80  while (1) {
81    if (m_dwSize <= m_dwCurPos)
82      return;
83    ch = m_pData[m_dwCurPos++];
84    while (PDFCharIsWhitespace(ch)) {
85      if (m_dwSize <= m_dwCurPos)
86        return;
87      ch = m_pData[m_dwCurPos++];
88    }
89
90    if (ch != '%')
91      break;
92
93    while (1) {
94      if (m_dwSize <= m_dwCurPos)
95        return;
96      ch = m_pData[m_dwCurPos++];
97      if (ch == '\r' || ch == '\n')
98        break;
99    }
100  }
101
102  FX_DWORD start_pos = m_dwCurPos - 1;
103  pStart = m_pData + start_pos;
104  if (PDFCharIsDelimiter(ch)) {
105    if (ch == '/') {
106      while (1) {
107        if (m_dwSize <= m_dwCurPos)
108          return;
109        ch = m_pData[m_dwCurPos++];
110        if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
111          m_dwCurPos--;
112          dwSize = m_dwCurPos - start_pos;
113          type = PDFWORD_NAME;
114          return;
115        }
116      }
117    } else {
118      type = PDFWORD_DELIMITER;
119      dwSize = 1;
120      if (ch == '<') {
121        if (m_dwSize <= m_dwCurPos)
122          return;
123        ch = m_pData[m_dwCurPos++];
124        if (ch == '<')
125          dwSize = 2;
126        else
127          m_dwCurPos--;
128      } else if (ch == '>') {
129        if (m_dwSize <= m_dwCurPos)
130          return;
131        ch = m_pData[m_dwCurPos++];
132        if (ch == '>')
133          dwSize = 2;
134        else
135          m_dwCurPos--;
136      }
137    }
138    return;
139  }
140
141  type = PDFWORD_NUMBER;
142  dwSize = 1;
143  while (1) {
144    if (!PDFCharIsNumeric(ch))
145      type = PDFWORD_TEXT;
146    if (m_dwSize <= m_dwCurPos)
147      return;
148    ch = m_pData[m_dwCurPos++];
149
150    if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
151      m_dwCurPos--;
152      break;
153    }
154    dwSize++;
155  }
156}
157CFX_ByteStringC CPDF_SimpleParser::GetWord() {
158  const uint8_t* pStart;
159  FX_DWORD dwSize;
160  int type;
161  ParseWord(pStart, dwSize, type);
162  if (dwSize == 1 && pStart[0] == '<') {
163    while (m_dwCurPos < m_dwSize && m_pData[m_dwCurPos] != '>') {
164      m_dwCurPos++;
165    }
166    if (m_dwCurPos < m_dwSize) {
167      m_dwCurPos++;
168    }
169    return CFX_ByteStringC(pStart,
170                           (FX_STRSIZE)(m_dwCurPos - (pStart - m_pData)));
171  }
172  if (dwSize == 1 && pStart[0] == '(') {
173    int level = 1;
174    while (m_dwCurPos < m_dwSize) {
175      if (m_pData[m_dwCurPos] == ')') {
176        level--;
177        if (level == 0) {
178          break;
179        }
180      }
181      if (m_pData[m_dwCurPos] == '\\') {
182        if (m_dwSize <= m_dwCurPos) {
183          break;
184        }
185        m_dwCurPos++;
186      } else if (m_pData[m_dwCurPos] == '(') {
187        level++;
188      }
189      if (m_dwSize <= m_dwCurPos) {
190        break;
191      }
192      m_dwCurPos++;
193    }
194    if (m_dwCurPos < m_dwSize) {
195      m_dwCurPos++;
196    }
197    return CFX_ByteStringC(pStart,
198                           (FX_STRSIZE)(m_dwCurPos - (pStart - m_pData)));
199  }
200  return CFX_ByteStringC(pStart, dwSize);
201}
202FX_BOOL CPDF_SimpleParser::SearchToken(const CFX_ByteStringC& token) {
203  int token_len = token.GetLength();
204  while (m_dwCurPos < m_dwSize - token_len) {
205    if (FXSYS_memcmp(m_pData + m_dwCurPos, token.GetPtr(), token_len) == 0) {
206      break;
207    }
208    m_dwCurPos++;
209  }
210  if (m_dwCurPos == m_dwSize - token_len) {
211    return FALSE;
212  }
213  m_dwCurPos += token_len;
214  return TRUE;
215}
216FX_BOOL CPDF_SimpleParser::SkipWord(const CFX_ByteStringC& token) {
217  while (1) {
218    CFX_ByteStringC word = GetWord();
219    if (word.IsEmpty()) {
220      return FALSE;
221    }
222    if (word == token) {
223      return TRUE;
224    }
225  }
226  return FALSE;
227}
228FX_BOOL CPDF_SimpleParser::FindTagPair(const CFX_ByteStringC& start_token,
229                                       const CFX_ByteStringC& end_token,
230                                       FX_DWORD& start_pos,
231                                       FX_DWORD& end_pos) {
232  if (!start_token.IsEmpty()) {
233    if (!SkipWord(start_token)) {
234      return FALSE;
235    }
236    start_pos = m_dwCurPos;
237  }
238  while (1) {
239    end_pos = m_dwCurPos;
240    CFX_ByteStringC word = GetWord();
241    if (word.IsEmpty()) {
242      return FALSE;
243    }
244    if (word == end_token) {
245      return TRUE;
246    }
247  }
248  return FALSE;
249}
250FX_BOOL CPDF_SimpleParser::FindTagParam(const CFX_ByteStringC& token,
251                                        int nParams) {
252  nParams++;
253  FX_DWORD* pBuf = FX_Alloc(FX_DWORD, nParams);
254  int buf_index = 0;
255  int buf_count = 0;
256  while (1) {
257    pBuf[buf_index++] = m_dwCurPos;
258    if (buf_index == nParams) {
259      buf_index = 0;
260    }
261    buf_count++;
262    if (buf_count > nParams) {
263      buf_count = nParams;
264    }
265    CFX_ByteStringC word = GetWord();
266    if (word.IsEmpty()) {
267      FX_Free(pBuf);
268      return FALSE;
269    }
270    if (word == token) {
271      if (buf_count < nParams) {
272        continue;
273      }
274      m_dwCurPos = pBuf[buf_index];
275      FX_Free(pBuf);
276      return TRUE;
277    }
278  }
279  return FALSE;
280}
281
282CFX_ByteString PDF_NameDecode(const CFX_ByteStringC& bstr) {
283  int size = bstr.GetLength();
284  const FX_CHAR* pSrc = bstr.GetCStr();
285  if (!FXSYS_memchr(pSrc, '#', size)) {
286    return bstr;
287  }
288  CFX_ByteString result;
289  FX_CHAR* pDestStart = result.GetBuffer(size);
290  FX_CHAR* pDest = pDestStart;
291  for (int i = 0; i < size; i++) {
292    if (pSrc[i] == '#' && i < size - 2) {
293      *pDest++ =
294          FXSYS_toHexDigit(pSrc[i + 1]) * 16 + FXSYS_toHexDigit(pSrc[i + 2]);
295      i += 2;
296    } else {
297      *pDest++ = pSrc[i];
298    }
299  }
300  result.ReleaseBuffer((FX_STRSIZE)(pDest - pDestStart));
301  return result;
302}
303CFX_ByteString PDF_NameDecode(const CFX_ByteString& orig) {
304  if (!FXSYS_memchr(orig.c_str(), '#', orig.GetLength())) {
305    return orig;
306  }
307  return PDF_NameDecode(CFX_ByteStringC(orig));
308}
309CFX_ByteString PDF_NameEncode(const CFX_ByteString& orig) {
310  uint8_t* src_buf = (uint8_t*)orig.c_str();
311  int src_len = orig.GetLength();
312  int dest_len = 0;
313  int i;
314  for (i = 0; i < src_len; i++) {
315    uint8_t ch = src_buf[i];
316    if (ch >= 0x80 || PDFCharIsWhitespace(ch) || ch == '#' ||
317        PDFCharIsDelimiter(ch)) {
318      dest_len += 3;
319    } else {
320      dest_len++;
321    }
322  }
323  if (dest_len == src_len)
324    return orig;
325
326  CFX_ByteString res;
327  FX_CHAR* dest_buf = res.GetBuffer(dest_len);
328  dest_len = 0;
329  for (i = 0; i < src_len; i++) {
330    uint8_t ch = src_buf[i];
331    if (ch >= 0x80 || PDFCharIsWhitespace(ch) || ch == '#' ||
332        PDFCharIsDelimiter(ch)) {
333      dest_buf[dest_len++] = '#';
334      dest_buf[dest_len++] = "0123456789ABCDEF"[ch / 16];
335      dest_buf[dest_len++] = "0123456789ABCDEF"[ch % 16];
336    } else {
337      dest_buf[dest_len++] = ch;
338    }
339  }
340  dest_buf[dest_len] = 0;
341  res.ReleaseBuffer();
342  return res;
343}
344CFX_ByteTextBuf& operator<<(CFX_ByteTextBuf& buf, const CPDF_Object* pObj) {
345  if (!pObj) {
346    buf << " null";
347    return buf;
348  }
349  switch (pObj->GetType()) {
350    case PDFOBJ_NULL:
351      buf << " null";
352      break;
353    case PDFOBJ_BOOLEAN:
354    case PDFOBJ_NUMBER:
355      buf << " " << pObj->GetString();
356      break;
357    case PDFOBJ_STRING:
358      buf << PDF_EncodeString(pObj->GetString(), pObj->AsString()->IsHex());
359      break;
360    case PDFOBJ_NAME: {
361      CFX_ByteString str = pObj->GetString();
362      buf << "/" << PDF_NameEncode(str);
363      break;
364    }
365    case PDFOBJ_REFERENCE: {
366      buf << " " << pObj->AsReference()->GetRefObjNum() << " 0 R ";
367      break;
368    }
369    case PDFOBJ_ARRAY: {
370      const CPDF_Array* p = pObj->AsArray();
371      buf << "[";
372      for (FX_DWORD i = 0; i < p->GetCount(); i++) {
373        CPDF_Object* pElement = p->GetElement(i);
374        if (pElement->GetObjNum()) {
375          buf << " " << pElement->GetObjNum() << " 0 R";
376        } else {
377          buf << pElement;
378        }
379      }
380      buf << "]";
381      break;
382    }
383    case PDFOBJ_DICTIONARY: {
384      const CPDF_Dictionary* p = pObj->AsDictionary();
385      buf << "<<";
386      for (const auto& it : *p) {
387        const CFX_ByteString& key = it.first;
388        CPDF_Object* pValue = it.second;
389        buf << "/" << PDF_NameEncode(key);
390        if (pValue && pValue->GetObjNum()) {
391          buf << " " << pValue->GetObjNum() << " 0 R ";
392        } else {
393          buf << pValue;
394        }
395      }
396      buf << ">>";
397      break;
398    }
399    case PDFOBJ_STREAM: {
400      const CPDF_Stream* p = pObj->AsStream();
401      buf << p->GetDict() << "stream\r\n";
402      CPDF_StreamAcc acc;
403      acc.LoadAllData(p, TRUE);
404      buf.AppendBlock(acc.GetData(), acc.GetSize());
405      buf << "\r\nendstream";
406      break;
407    }
408    default:
409      ASSERT(FALSE);
410      break;
411  }
412  return buf;
413}
414FX_FLOAT PDF_ClipFloat(FX_FLOAT f) {
415  if (f < 0) {
416    return 0;
417  }
418  if (f > 1.0f) {
419    return 1.0f;
420  }
421  return f;
422}
423static CPDF_Object* SearchNumberNode(CPDF_Dictionary* pNode, int num) {
424  CPDF_Array* pLimits = pNode->GetArray("Limits");
425  if (pLimits &&
426      (num < pLimits->GetInteger(0) || num > pLimits->GetInteger(1))) {
427    return NULL;
428  }
429  CPDF_Array* pNumbers = pNode->GetArray("Nums");
430  if (pNumbers) {
431    FX_DWORD dwCount = pNumbers->GetCount() / 2;
432    for (FX_DWORD i = 0; i < dwCount; i++) {
433      int index = pNumbers->GetInteger(i * 2);
434      if (num == index) {
435        return pNumbers->GetElementValue(i * 2 + 1);
436      }
437      if (index > num) {
438        break;
439      }
440    }
441    return NULL;
442  }
443  CPDF_Array* pKids = pNode->GetArray("Kids");
444  if (!pKids) {
445    return NULL;
446  }
447  for (FX_DWORD i = 0; i < pKids->GetCount(); i++) {
448    CPDF_Dictionary* pKid = pKids->GetDict(i);
449    if (!pKid) {
450      continue;
451    }
452    CPDF_Object* pFound = SearchNumberNode(pKid, num);
453    if (pFound) {
454      return pFound;
455    }
456  }
457  return NULL;
458}
459CPDF_Object* CPDF_NumberTree::LookupValue(int num) {
460  return SearchNumberNode(m_pRoot, num);
461}
462