1c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott/* 2c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ********************************************************************** 3c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott * Copyright (C) 2005-2008, International Business Machines 4c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott * Corporation and others. All Rights Reserved. 5c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ********************************************************************** 6c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott */ 7c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 8c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/utypes.h" 9c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 10c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if !UCONFIG_NO_CONVERSION 11c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 12c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "inputext.h" 13c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 14c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "cmemory.h" 15c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "cstring.h" 16c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 17c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include <string.h> 18c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 19c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_NAMESPACE_BEGIN 20c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 21c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#define BUFFER_SIZE 8192 22c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 23c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 24c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 25c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 26c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#define DELETE_ARRAY(array) uprv_free((void *) (array)) 27c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 28c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottInputText::InputText(UErrorCode &status) 29c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been 30c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // removed if appropriate. 31c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text. 32c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Value is percent, not absolute. 33c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fDeclaredEncoding(0), 34c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fRawInput(0), 35c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fRawLength(0) 36c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 37c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (fInputBytes == NULL || fByteStats == NULL) { 38c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott status = U_MEMORY_ALLOCATION_ERROR; 39c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 40c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 41c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 42c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottInputText::~InputText() 43c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 44c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(fDeclaredEncoding); 45c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(fByteStats); 46c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(fInputBytes); 47c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 48c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 49c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid InputText::setText(const char *in, int32_t len) 50c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 51c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fInputLen = 0; 52c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fC1Bytes = FALSE; 53c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fRawInput = (const uint8_t *) in; 54c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fRawLength = len == -1? uprv_strlen(in) : len; 55c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 56c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 57c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid InputText::setDeclaredEncoding(const char* encoding, int32_t len) 58c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 59c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if(encoding) { 60c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (len == -1) { 61c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott len = uprv_strlen(encoding); 62c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 63c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 64c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott len += 1; // to make place for the \0 at the end. 65c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uprv_free(fDeclaredEncoding); 66c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fDeclaredEncoding = NEW_ARRAY(char, len); 67c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uprv_strncpy(fDeclaredEncoding, encoding, len); 68c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 69c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 70c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 71c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottUBool InputText::isSet() const 72c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 73c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return fRawInput != NULL; 74c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 75c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 76c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott/** 77c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott* MungeInput - after getting a set of raw input data to be analyzed, preprocess 78c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott* it by removing what appears to be html markup. 79c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott* 80c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott* @internal 81c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott*/ 82c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid InputText::MungeInput(UBool fStripTags) { 83c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int srci = 0; 84c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int dsti = 0; 85c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uint8_t b; 86c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott bool inMarkup = FALSE; 87c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t openTags = 0; 88c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t badTags = 0; 89c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 90c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // 91c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // html / xml markup stripping. 92c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // quick and dirty, not 100% accurate, but hopefully good enough, statistically. 93c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // discard everything within < brackets > 94c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Count how many total '<' and illegal (nested) '<' occur, so we can make some 95c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // guess as to whether the input was actually marked up at all. 96c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // TODO: Think about how this interacts with EBCDIC charsets that are detected. 97c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (fStripTags) { 98c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { 99c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott b = fRawInput[srci]; 100c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 101c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ 102c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (inMarkup) { 103c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott badTags += 1; 104c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 105c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 106c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott inMarkup = TRUE; 107c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott openTags += 1; 108c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 109c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 110c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (! inMarkup) { 111c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fInputBytes[dsti++] = b; 112c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 113c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 114c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ 115c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott inMarkup = FALSE; 116c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 117c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 118c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 119c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fInputLen = dsti; 120c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 121c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 122c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // 123c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // If it looks like this input wasn't marked up, or if it looks like it's 124c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // essentially nothing but markup abandon the markup stripping. 125c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Detection will have to work on the unstripped input. 126c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // 127c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (openTags<5 || openTags/5 < badTags || 128c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott (fInputLen < 100 && fRawLength>600)) 129c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott { 130c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t limit = fRawLength; 131c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 132c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (limit > BUFFER_SIZE) { 133c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott limit = BUFFER_SIZE; 134c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 135c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 136c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (srci=0; srci<limit; srci++) { 137c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fInputBytes[srci] = fRawInput[srci]; 138c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 139c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 140c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fInputLen = srci; 141c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 142c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 143c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // 144c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Tally up the byte occurence statistics. 145c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // These are available for use by the various detectors. 146c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // 147c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 148c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256); 149c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 150c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (srci = 0; srci < fInputLen; srci += 1) { 151c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fByteStats[fInputBytes[srci]] += 1; 152c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 153c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 154c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (int32_t i = 0x80; i <= 0x9F; i += 1) { 155c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (fByteStats[i] != 0) { 156c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fC1Bytes = TRUE; 157c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott break; 158c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 159c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 160c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 161c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 162c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_NAMESPACE_END 163c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif 164c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 165