15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/* 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ********************************************************************** 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * Copyright (C) 2005-2009, International Business Machines 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * Corporation and others. All Rights Reserved. 52a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) ********************************************************************** 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "unicode/utypes.h" 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 102a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#if !UCONFIG_NO_CONVERSION 11868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles) 12e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch#include "inputext.h" 13e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "cmemory.h" 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "cstring.h" 162a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string.h> 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 194e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)U_NAMESPACE_BEGIN 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 21e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch#define BUFFER_SIZE 8192 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 241320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci 25e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define DELETE_ARRAY(array) uprv_free((void *) (array)) 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)InputText::InputText(UErrorCode &status) 29e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been 30e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch // removed if appropriate. 31eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text. 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Value is percent, not absolute. 332a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) fDeclaredEncoding(0), 342a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) fRawInput(0), 35116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch fRawLength(0) 361320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci{ 37116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch if (fInputBytes == NULL || fByteStats == NULL) { 38116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch status = U_MEMORY_ALLOCATION_ERROR; 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)InputText::~InputText() 4303b57e008b61dfcb1fbad3aea950ae0e001748b0Torne (Richard Coles){ 4403b57e008b61dfcb1fbad3aea950ae0e001748b0Torne (Richard Coles) DELETE_ARRAY(fDeclaredEncoding); 4503b57e008b61dfcb1fbad3aea950ae0e001748b0Torne (Richard Coles) DELETE_ARRAY(fByteStats); 4603b57e008b61dfcb1fbad3aea950ae0e001748b0Torne (Richard Coles) DELETE_ARRAY(fInputBytes); 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 48ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void InputText::setText(const char *in, int32_t len) 502a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles){ 512a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) fInputLen = 0; 522a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) fC1Bytes = FALSE; 532a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) fRawInput = (const uint8_t *) in; 542a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) fRawLength = len == -1? (int32_t)uprv_strlen(in) : len; 552a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)} 562a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 572a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)void InputText::setDeclaredEncoding(const char* encoding, int32_t len) 582a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles){ 5903b57e008b61dfcb1fbad3aea950ae0e001748b0Torne (Richard Coles) if(encoding) { 602a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) if (len == -1) { 6103b57e008b61dfcb1fbad3aea950ae0e001748b0Torne (Richard Coles) len = (int32_t)uprv_strlen(encoding); 622a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) } 6303b57e008b61dfcb1fbad3aea950ae0e001748b0Torne (Richard Coles) 642a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) len += 1; // to make place for the \0 at the end. 65eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch uprv_free(fDeclaredEncoding); 662a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) fDeclaredEncoding = NEW_ARRAY(char, len); 675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) uprv_strncpy(fDeclaredEncoding, encoding, len); 682a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) } 692a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)} 702a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 71eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen MurdochUBool InputText::isSet() const 722a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles){ 732a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) return fRawInput != NULL; 742a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)} 752a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)/** 775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)* MungeInput - after getting a set of raw input data to be analyzed, preprocess 785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)* it by removing what appears to be html markup. 795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)* 805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)* @internal 815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)*/ 825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)void InputText::MungeInput(UBool fStripTags) { 835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int srci = 0; 845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int dsti = 0; 855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8_t b; 865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) bool inMarkup = FALSE; 875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int32_t openTags = 0; 885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int32_t badTags = 0; 895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // 915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // html / xml markup stripping. 925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // quick and dirty, not 100% accurate, but hopefully good enough, statistically. 935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // discard everything within < brackets > 945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // Count how many total '<' and illegal (nested) '<' occur, so we can make some 955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // guess as to whether the input was actually marked up at all. 965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // TODO: Think about how this interacts with EBCDIC charsets that are detected. 975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) if (fStripTags) { 985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { 99e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch b = fRawInput[srci]; 100e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch 101e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ 102e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch if (inMarkup) { 103e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch badTags += 1; 104e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch } 105e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch 106e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch inMarkup = TRUE; 107e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch openTags += 1; 108e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch } 109e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch 110e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch if (! inMarkup) { 111e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch fInputBytes[dsti++] = b; 112e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch } 113e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch 114e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ 115e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch inMarkup = FALSE; 116e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch } 117e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch } 118e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch 119e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch fInputLen = dsti; 120e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch } 121e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch 122e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch // 123e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch // If it looks like this input wasn't marked up, or if it looks like it's 124e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch // essentially nothing but markup abandon the markup stripping. 125e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch // Detection will have to work on the unstripped input. 126e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch // 127e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch if (openTags<5 || openTags/5 < badTags || 128e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch (fInputLen < 100 && fRawLength>600)) 129e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch { 130e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch int32_t limit = fRawLength; 131e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch 132e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch if (limit > BUFFER_SIZE) { 133e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch limit = BUFFER_SIZE; 1342a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) } 1352a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (srci=0; srci<limit; srci++) { 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fInputBytes[srci] = fRawInput[srci]; 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1404e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) fInputLen = srci; 1414e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) } 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1432a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Tally up the byte occurence statistics. 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // These are available for use by the various detectors. 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1482a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256); 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (srci = 0; srci < fInputLen; srci += 1) { 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fByteStats[fInputBytes[srci]] += 1; 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int32_t i = 0x80; i <= 0x9F; i += 1) { 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (fByteStats[i] != 0) { 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fC1Bytes = TRUE; 157e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch break; 158e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch } 1595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 1605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 161e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben Murdoch 162e4256316f8b5e8d1ec0df1f7762771622a53fa63Ben MurdochU_NAMESPACE_END 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 165cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)