1c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott/*
2c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott **********************************************************************
3c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott *   Copyright (C) 2005-2008, International Business Machines
4c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott *   Corporation and others.  All Rights Reserved.
5c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott **********************************************************************
6c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott */
7c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
8c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/utypes.h"
9c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
10c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if !UCONFIG_NO_CONVERSION
11c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
12c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "inputext.h"
13c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
14c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "cmemory.h"
15c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "cstring.h"
16c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
17c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include <string.h>
18c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
19c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_NAMESPACE_BEGIN
20c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Size, in bytes, of the fInputBytes working buffer; also the upper bound on
// how much input MungeInput() copies into it.
#define BUFFER_SIZE 8192

// Number of elements in a true array (not a pointer). The argument is fully
// parenthesized so that any array-valued expression may be passed.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Thin wrappers around uprv_malloc()/uprv_free() for untyped heap arrays.
// NEW_ARRAY yields NULL when uprv_malloc() does; the whole expansion is
// parenthesized so it can be used safely inside a larger expression.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
27c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
28c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottInputText::InputText(UErrorCode &status)
29c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
30c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                                                 //   removed if appropriate.
31c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      fByteStats(NEW_ARRAY(int16_t, 256)),       // byte frequency statistics for the input text.
32c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                                                 //   Value is percent, not absolute.
33c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      fDeclaredEncoding(0),
34c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      fRawInput(0),
35c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      fRawLength(0)
36c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
37c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (fInputBytes == NULL || fByteStats == NULL) {
38c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        status = U_MEMORY_ALLOCATION_ERROR;
39c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
40c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
41c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
42c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottInputText::~InputText()
43c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
44c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    DELETE_ARRAY(fDeclaredEncoding);
45c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    DELETE_ARRAY(fByteStats);
46c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    DELETE_ARRAY(fInputBytes);
47c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
48c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
49c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid InputText::setText(const char *in, int32_t len)
50c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
51c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    fInputLen  = 0;
52c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    fC1Bytes   = FALSE;
53c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    fRawInput  = (const uint8_t *) in;
54c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    fRawLength = len == -1? uprv_strlen(in) : len;
55c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
56c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
57c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid InputText::setDeclaredEncoding(const char* encoding, int32_t len)
58c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
59c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if(encoding) {
60c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        if (len == -1) {
61c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            len = uprv_strlen(encoding);
62c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        }
63c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
64c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        len += 1;     // to make place for the \0 at the end.
65c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        uprv_free(fDeclaredEncoding);
66c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        fDeclaredEncoding = NEW_ARRAY(char, len);
67c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        uprv_strncpy(fDeclaredEncoding, encoding, len);
68c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
69c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
70c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
71c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottUBool InputText::isSet() const
72c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{
73c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return fRawInput != NULL;
74c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
75c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
76c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott/**
77c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott*  MungeInput - after getting a set of raw input data to be analyzed, preprocess
78c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott*               it by removing what appears to be html markup.
79c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott*
80c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott* @internal
81c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott*/
82c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid InputText::MungeInput(UBool fStripTags) {
83c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    int     srci = 0;
84c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    int     dsti = 0;
85c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    uint8_t b;
86c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    bool    inMarkup = FALSE;
87c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    int32_t openTags = 0;
88c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    int32_t badTags  = 0;
89c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
90c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //
91c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //  html / xml markup stripping.
92c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
93c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //     discard everything within < brackets >
94c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
95c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //     guess as to whether the input was actually marked up at all.
96c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // TODO: Think about how this interacts with EBCDIC charsets that are detected.
97c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (fStripTags) {
98c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
99c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            b = fRawInput[srci];
100c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
101c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
102c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                if (inMarkup) {
103c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                    badTags += 1;
104c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                }
105c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
106c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                inMarkup = TRUE;
107c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                openTags += 1;
108c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            }
109c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
110c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            if (! inMarkup) {
111c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                fInputBytes[dsti++] = b;
112c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            }
113c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
114c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
115c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                inMarkup = FALSE;
116c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            }
117c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        }
118c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
119c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        fInputLen = dsti;
120c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
121c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
122c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //
123c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //  If it looks like this input wasn't marked up, or if it looks like it's
124c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //    essentially nothing but markup abandon the markup stripping.
125c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //    Detection will have to work on the unstripped input.
126c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //
127c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (openTags<5 || openTags/5 < badTags ||
128c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        (fInputLen < 100 && fRawLength>600))
129c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    {
130c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        int32_t limit = fRawLength;
131c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
132c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        if (limit > BUFFER_SIZE) {
133c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            limit = BUFFER_SIZE;
134c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        }
135c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
136c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        for (srci=0; srci<limit; srci++) {
137c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            fInputBytes[srci] = fRawInput[srci];
138c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        }
139c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
140c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        fInputLen = srci;
141c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
142c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
143c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //
144c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Tally up the byte occurence statistics.
145c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // These are available for use by the various detectors.
146c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //
147c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
148c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
149c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
150c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    for (srci = 0; srci < fInputLen; srci += 1) {
151c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        fByteStats[fInputBytes[srci]] += 1;
152c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
153c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
154c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    for (int32_t i = 0x80; i <= 0x9F; i += 1) {
155c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        if (fByteStats[i] != 0) {
156c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            fC1Bytes = TRUE;
157c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            break;
158c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        }
159c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
160c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
161c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
162c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_NAMESPACE_END
163c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif
164c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
165