1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4 **********************************************************************
5 *   Copyright (C) 2005-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_CONVERSION
13
14#include "inputext.h"
15
16#include "cmemory.h"
17#include "cstring.h"
18
19#include <string.h>
20
21U_NAMESPACE_BEGIN
22
// Capacity, in bytes, of the fInputBytes working buffer. MungeInput never
// stores more than this many bytes, whatever the raw input length is.
#define BUFFER_SIZE 8192

// Request storage for `count` elements of `type` from uprv_malloc and yield
// the result as a `type *`. Callers in this file check the result for NULL.
#define NEW_ARRAY(type,count) static_cast<type *>(uprv_malloc(sizeof(type) * (count)))

// Hand an array obtained through NEW_ARRAY back to uprv_free.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
27
28InputText::InputText(UErrorCode &status)
29    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
30                                                 //   removed if appropriate.
31      fByteStats(NEW_ARRAY(int16_t, 256)),       // byte frequency statistics for the input text.
32                                                 //   Value is percent, not absolute.
33      fDeclaredEncoding(0),
34      fRawInput(0),
35      fRawLength(0)
36{
37    if (fInputBytes == NULL || fByteStats == NULL) {
38        status = U_MEMORY_ALLOCATION_ERROR;
39    }
40}
41
42InputText::~InputText()
43{
44    DELETE_ARRAY(fDeclaredEncoding);
45    DELETE_ARRAY(fByteStats);
46    DELETE_ARRAY(fInputBytes);
47}
48
49void InputText::setText(const char *in, int32_t len)
50{
51    fInputLen  = 0;
52    fC1Bytes   = FALSE;
53    fRawInput  = (const uint8_t *) in;
54    fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
55}
56
57void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
58{
59    if(encoding) {
60        if (len == -1) {
61            len = (int32_t)uprv_strlen(encoding);
62        }
63
64        len += 1;     // to make place for the \0 at the end.
65        uprv_free(fDeclaredEncoding);
66        fDeclaredEncoding = NEW_ARRAY(char, len);
67        uprv_strncpy(fDeclaredEncoding, encoding, len);
68    }
69}
70
71UBool InputText::isSet() const
72{
73    return fRawInput != NULL;
74}
75
76/**
77*  MungeInput - after getting a set of raw input data to be analyzed, preprocess
78*               it by removing what appears to be html markup.
79*
80* @internal
81*/
82void InputText::MungeInput(UBool fStripTags) {
83    int     srci = 0;
84    int     dsti = 0;
85    uint8_t b;
86    bool    inMarkup = FALSE;
87    int32_t openTags = 0;
88    int32_t badTags  = 0;
89
90    //
91    //  html / xml markup stripping.
92    //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
93    //     discard everything within < brackets >
94    //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
95    //     guess as to whether the input was actually marked up at all.
96    // TODO: Think about how this interacts with EBCDIC charsets that are detected.
97    if (fStripTags) {
98        for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
99            b = fRawInput[srci];
100
101            if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
102                if (inMarkup) {
103                    badTags += 1;
104                }
105
106                inMarkup = TRUE;
107                openTags += 1;
108            }
109
110            if (! inMarkup) {
111                fInputBytes[dsti++] = b;
112            }
113
114            if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
115                inMarkup = FALSE;
116            }
117        }
118
119        fInputLen = dsti;
120    }
121
122    //
123    //  If it looks like this input wasn't marked up, or if it looks like it's
124    //    essentially nothing but markup abandon the markup stripping.
125    //    Detection will have to work on the unstripped input.
126    //
127    if (openTags<5 || openTags/5 < badTags ||
128        (fInputLen < 100 && fRawLength>600))
129    {
130        int32_t limit = fRawLength;
131
132        if (limit > BUFFER_SIZE) {
133            limit = BUFFER_SIZE;
134        }
135
136        for (srci=0; srci<limit; srci++) {
137            fInputBytes[srci] = fRawInput[srci];
138        }
139
140        fInputLen = srci;
141    }
142
143    //
144    // Tally up the byte occurence statistics.
145    // These are available for use by the various detectors.
146    //
147
148    uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
149
150    for (srci = 0; srci < fInputLen; srci += 1) {
151        fByteStats[fInputBytes[srci]] += 1;
152    }
153
154    for (int32_t i = 0x80; i <= 0x9F; i += 1) {
155        if (fByteStats[i] != 0) {
156            fC1Bytes = TRUE;
157            break;
158        }
159    }
160}
161
162U_NAMESPACE_END
163#endif
164
165