streaming_utf8_validator.cc revision 5d1f7b1de12d16ceb2c938c56701a3e8bfa558f7
15d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Copyright 2014 The Chromium Authors. All rights reserved. 25d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// found in the LICENSE file. 45d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 55d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// This implementation doesn't use ICU. The ICU macros are oriented towards 65d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// character-at-a-time processing, whereas byte-at-a-time processing is easier 75d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// with streaming input. 85d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 95d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/i18n/streaming_utf8_validator.h" 105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/i18n/utf8_validator_tables.h" 125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/logging.h" 135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)namespace base { 155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)namespace { 165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 175d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)uint8 StateTableLookup(uint8 offset) { 185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Skip the bounds check on non-debug builds so that it isn't necessary to set 195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // LOGGING_IS_OFFICIAL_BUILD just to do a performance test. 205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (logging::DEBUG_MODE) 215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DCHECK_LT(offset, internal::kUtf8ValidatorTablesSize); 225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return internal::kUtf8ValidatorTables[offset]; 235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} // namespace 265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)StreamingUtf8Validator::State StreamingUtf8Validator::AddBytes(const char* data, 285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) size_t size) { 295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // Copy |state_| into a local variable so that the compiler doesn't have to be 305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // careful of aliasing. 315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) uint8 state = state_; 325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for (const char* p = data; p != data + size; ++p) { 335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if ((*p & 0x80) == 0) { 345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (state == 0) 355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) continue; 365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) state = internal::I18N_UTF8_VALIDATOR_INVALID_INDEX; 375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) break; 385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const uint8 shift_amount = StateTableLookup(state); 405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const uint8 shifted_char = (*p & 0x7F) >> shift_amount; 415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) state = StateTableLookup(state + shifted_char + 1); 425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // State may be INVALID here, but this code is optimised for the case of 435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // valid UTF-8 and it is more efficient (by about 2%) to not attempt an 445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // early loop exit unless we hit an ASCII character. 455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } 465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) state_ = state; 475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return state == 0 ? VALID_ENDPOINT 485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) : state == internal::I18N_UTF8_VALIDATOR_INVALID_INDEX 495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ? INVALID 505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) : VALID_MIDPOINT; 515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)void StreamingUtf8Validator::Reset() { 545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) state_ = 0u; 555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)bool StreamingUtf8Validator::Validate(const std::string& string) { 585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return StreamingUtf8Validator().AddBytes(string.data(), string.size()) == 595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) VALID_ENDPOINT; 605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} // namespace base 63