/*
 **********************************************************************
 *   Copyright (C) 2005-2009, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "unicode/utypes.h"
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
102a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#if !UCONFIG_NO_CONVERSION
11868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)
12e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch#include "inputext.h"
13e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "cmemory.h"
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "cstring.h"
162a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string.h>
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
194e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)U_NAMESPACE_BEGIN
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Capacity, in bytes, of the fInputBytes working buffer; at most this many
// bytes of the raw input are examined.
#define BUFFER_SIZE 8192

// Number of elements in a true array (must not be used on a pointer).
// The argument is parenthesized so that any expression is accepted.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocate uninitialized storage for `count` objects of `type` via
// uprv_malloc.  The whole expansion is parenthesized so that the cast cannot
// bind to an operator that follows the macro at the point of use.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))

// Release storage obtained from NEW_ARRAY via uprv_free.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)InputText::InputText(UErrorCode &status)
29e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
30e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch                                                 //   removed if appropriate.
31eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch      fByteStats(NEW_ARRAY(int16_t, 256)),       // byte frequency statistics for the input text.
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                                 //   Value is percent, not absolute.
332a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)      fDeclaredEncoding(0),
342a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)      fRawInput(0),
35116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      fRawLength(0)
361320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci{
37116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    if (fInputBytes == NULL || fByteStats == NULL) {
38116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        status = U_MEMORY_ALLOCATION_ERROR;
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)InputText::~InputText()
4303b57e008b61dfcb1fbad3aea950ae0e001748b0Torne (Richard Coles){
4403b57e008b61dfcb1fbad3aea950ae0e001748b0Torne (Richard Coles)    DELETE_ARRAY(fDeclaredEncoding);
4503b57e008b61dfcb1fbad3aea950ae0e001748b0Torne (Richard Coles)    DELETE_ARRAY(fByteStats);
4603b57e008b61dfcb1fbad3aea950ae0e001748b0Torne (Richard Coles)    DELETE_ARRAY(fInputBytes);
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
48ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void InputText::setText(const char *in, int32_t len)
502a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles){
512a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    fInputLen  = 0;
522a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    fC1Bytes   = FALSE;
532a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    fRawInput  = (const uint8_t *) in;
542a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
552a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)}
562a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
572a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
582a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles){
5903b57e008b61dfcb1fbad3aea950ae0e001748b0Torne (Richard Coles)    if(encoding) {
602a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)        if (len == -1) {
6103b57e008b61dfcb1fbad3aea950ae0e001748b0Torne (Richard Coles)            len = (int32_t)uprv_strlen(encoding);
622a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)        }
6303b57e008b61dfcb1fbad3aea950ae0e001748b0Torne (Richard Coles)
642a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)        len += 1;     // to make place for the \0 at the end.
65eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch        uprv_free(fDeclaredEncoding);
662a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)        fDeclaredEncoding = NEW_ARRAY(char, len);
675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        uprv_strncpy(fDeclaredEncoding, encoding, len);
682a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    }
692a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)}
702a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
71eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen MurdochUBool InputText::isSet() const
722a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles){
732a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    return fRawInput != NULL;
742a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)}
752a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)/**
775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)*  MungeInput - after getting a set of raw input data to be analyzed, preprocess
785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)*               it by removing what appears to be html markup.
795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)*
805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)* @internal
815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)*/
825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)void InputText::MungeInput(UBool fStripTags) {
835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    int     srci = 0;
845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    int     dsti = 0;
855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8_t b;
865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    bool    inMarkup = FALSE;
875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    int32_t openTags = 0;
885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    int32_t badTags  = 0;
895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    //
915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    //  html / xml markup stripping.
925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    //     discard everything within < brackets >
945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    //     guess as to whether the input was actually marked up at all.
965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    // TODO: Think about how this interacts with EBCDIC charsets that are detected.
975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    if (fStripTags) {
985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)        for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
99e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch            b = fRawInput[srci];
100e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch
101e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch            if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
102e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch                if (inMarkup) {
103e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch                    badTags += 1;
104e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch                }
105e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch
106e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch                inMarkup = TRUE;
107e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch                openTags += 1;
108e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch            }
109e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch
110e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch            if (! inMarkup) {
111e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch                fInputBytes[dsti++] = b;
112e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch            }
113e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch
114e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch            if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
115e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch                inMarkup = FALSE;
116e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch            }
117e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch        }
118e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch
119e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch        fInputLen = dsti;
120e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch    }
121e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch
122e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch    //
123e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch    //  If it looks like this input wasn't marked up, or if it looks like it's
124e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch    //    essentially nothing but markup abandon the markup stripping.
125e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch    //    Detection will have to work on the unstripped input.
126e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch    //
127e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch    if (openTags<5 || openTags/5 < badTags ||
128e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch        (fInputLen < 100 && fRawLength>600))
129e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch    {
130e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch        int32_t limit = fRawLength;
131e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch
132e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch        if (limit > BUFFER_SIZE) {
133e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch            limit = BUFFER_SIZE;
1342a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)        }
1352a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        for (srci=0; srci<limit; srci++) {
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            fInputBytes[srci] = fRawInput[srci];
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1404e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)        fInputLen = srci;
1414e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)    }
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1432a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    //
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Tally up the byte occurence statistics.
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // These are available for use by the various detectors.
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    //
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1482a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    for (srci = 0; srci < fInputLen; srci += 1) {
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        fByteStats[fInputBytes[srci]] += 1;
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    for (int32_t i = 0x80; i <= 0x9F; i += 1) {
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (fByteStats[i] != 0) {
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            fC1Bytes = TRUE;
157e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch            break;
158e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch        }
1595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
1605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
161e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch
162e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben MurdochU_NAMESPACE_END
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
165cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)