streaming_utf8_validator.cc revision 5d1f7b1de12d16ceb2c938c56701a3e8bfa558f7
15d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Copyright 2014 The Chromium Authors. All rights reserved.
25d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// found in the LICENSE file.
45d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
55d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// This implementation doesn't use ICU. The ICU macros are oriented towards
65d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// character-at-a-time processing, whereas byte-at-a-time processing is easier
75d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// with streaming input.
85d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
95d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/i18n/streaming_utf8_validator.h"
105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/i18n/utf8_validator_tables.h"
125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "base/logging.h"
135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)namespace base {
155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)namespace {
165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
175d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)uint8 StateTableLookup(uint8 offset) {
185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Skip the bounds check on non-debug builds so that it isn't necessary to set
195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // LOGGING_IS_OFFICIAL_BUILD just to do a performance test.
205d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  if (logging::DEBUG_MODE)
215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    DCHECK_LT(offset, internal::kUtf8ValidatorTablesSize);
225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  return internal::kUtf8ValidatorTables[offset];
235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
245d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}  // namespace
265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)StreamingUtf8Validator::State StreamingUtf8Validator::AddBytes(const char* data,
285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                                                               size_t size) {
295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // Copy |state_| into a local variable so that the compiler doesn't have to be
305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  // careful of aliasing.
315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  uint8 state = state_;
325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  for (const char* p = data; p != data + size; ++p) {
335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    if ((*p & 0x80) == 0) {
345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      if (state == 0)
355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        continue;
365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      state = internal::I18N_UTF8_VALIDATOR_INVALID_INDEX;
375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      break;
385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    const uint8 shift_amount = StateTableLookup(state);
405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    const uint8 shifted_char = (*p & 0x7F) >> shift_amount;
415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    state = StateTableLookup(state + shifted_char + 1);
425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    // State may be INVALID here, but this code is optimised for the case of
435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    // valid UTF-8 and it is more efficient (by about 2%) to not attempt an
445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    // early loop exit unless we hit an ASCII character.
455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  }
465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  state_ = state;
475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  return state == 0 ? VALID_ENDPOINT
485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      : state == internal::I18N_UTF8_VALIDATOR_INVALID_INDEX
495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      ? INVALID
505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      : VALID_MIDPOINT;
515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)void StreamingUtf8Validator::Reset() {
545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  state_ = 0u;
555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)bool StreamingUtf8Validator::Validate(const std::string& string) {
585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  return StreamingUtf8Validator().AddBytes(string.data(), string.size()) ==
595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)         VALID_ENDPOINT;
605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}  // namespace base
63