// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_
#define BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_

#include <stddef.h>

#include <string>
#include <vector>

#include "base/base_export.h"
#include "base/strings/string16.h"
#include "base/strings/string_piece.h"

namespace base {

170529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// A helper class and associated data structures to adjust offsets into a
180529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// string in response to various adjustments one might do to that string
190529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// (e.g., eliminating a range).  For details on offsets, see the comments by
200529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// the AdjustOffsets() function below.
210529e5d033099cbfc42635f6f6183833b09dff6eBen Murdochclass BASE_EXPORT OffsetAdjuster {
220529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch public:
230529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  struct BASE_EXPORT Adjustment {
240529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch    Adjustment(size_t original_offset,
250529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch               size_t original_length,
260529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch               size_t output_length);
270529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch
280529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch    size_t original_offset;
290529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch    size_t original_length;
300529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch    size_t output_length;
310529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  };
320529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  typedef std::vector<Adjustment> Adjustments;
330529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch
340529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // Adjusts all offsets in |offsets_for_adjustment| to reflect the adjustments
350529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // recorded in |adjustments|.
360529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  //
370529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // Offsets represents insertion/selection points between characters: if |src|
380529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // is "abcd", then 0 is before 'a', 2 is between 'b' and 'c', and 4 is at the
390529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // end of the string.  Valid input offsets range from 0 to |src_len|.  On
400529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // exit, each offset will have been modified to point at the same logical
410529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // position in the output string.  If an offset cannot be successfully
420529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // adjusted (e.g., because it points into the middle of a multibyte sequence),
430529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // it will be set to string16::npos.
440529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  static void AdjustOffsets(const Adjustments& adjustments,
450529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch                            std::vector<size_t>* offsets_for_adjustment);
460529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch
470529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // Adjusts the single |offset| to reflect the adjustments recorded in
480529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // |adjustments|.
490529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  static void AdjustOffset(const Adjustments& adjustments,
500529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch                           size_t* offset);
510529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch
525c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // Adjusts all offsets in |offsets_for_unadjustment| to reflect the reverse
535c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // of the adjustments recorded in |adjustments|.  In other words, the offsets
545c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // provided represent offsets into an adjusted string and the caller wants
555c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // to know the offsets they correspond to in the original string.  If an
565c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // offset cannot be successfully unadjusted (e.g., because it points into
575c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // the middle of a multibyte sequence), it will be set to string16::npos.
585c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  static void UnadjustOffsets(const Adjustments& adjustments,
595c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu                              std::vector<size_t>* offsets_for_unadjustment);
605c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu
615c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // Adjusts the single |offset| to reflect the reverse of the adjustments
625c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // recorded in |adjustments|.
635c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  static void UnadjustOffset(const Adjustments& adjustments,
645c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu                             size_t* offset);
655c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu
660529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // Combines two sequential sets of adjustments, storing the combined revised
670529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // adjustments in |adjustments_on_adjusted_string|.  That is, suppose a
680529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // string was altered in some way, with the alterations recorded as
690529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // adjustments in |first_adjustments|.  Then suppose the resulting string is
700529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // further altered, with the alterations recorded as adjustments scored in
710529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // |adjustments_on_adjusted_string|, with the offsets recorded in these
720529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // adjustments being with respect to the intermediate string.  This function
730529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // combines the two sets of adjustments into one, storing the result in
740529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // |adjustments_on_adjusted_string|, whose offsets are correct with respect
750529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // to the original string.
760529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  //
770529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // Assumes both parameters are sorted by increasing offset.
780529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  //
790529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // WARNING: Only supports |first_adjustments| that involve collapsing ranges
800529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  // of text, not expanding ranges.
810529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  static void MergeSequentialAdjustments(
820529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch      const Adjustments& first_adjustments,
830529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch      Adjustments* adjustments_on_adjusted_string);
840529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch};
850529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch
860529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// Like the conversions in utf_string_conversions.h, but also fills in an
870529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// |adjustments| parameter that reflects the alterations done to the string.
880529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// It may be NULL.
890529e5d033099cbfc42635f6f6183833b09dff6eBen MurdochBASE_EXPORT bool UTF8ToUTF16WithAdjustments(
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const char* src,
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    size_t src_len,
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    string16* output,
930529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch    base::OffsetAdjuster::Adjustments* adjustments);
940529e5d033099cbfc42635f6f6183833b09dff6eBen MurdochBASE_EXPORT string16 UTF8ToUTF16WithAdjustments(
950529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch    const base::StringPiece& utf8,
960529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch    base::OffsetAdjuster::Adjustments* adjustments);
970529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// As above, but instead internally examines the adjustments and applies them
980529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// to |offsets_for_adjustment|.  See comments by AdjustOffsets().
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BASE_EXPORT string16 UTF8ToUTF16AndAdjustOffsets(
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const base::StringPiece& utf8,
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    std::vector<size_t>* offsets_for_adjustment);
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BASE_EXPORT std::string UTF16ToUTF8AndAdjustOffsets(
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const base::StringPiece16& utf16,
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    std::vector<size_t>* offsets_for_adjustment);
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Limiting function callable by std::for_each which will replace any value
// which is greater than |limit| with npos.  Typically this is called with a
// string length so that offsets in [0, length] are kept (as opposed to
// [0, length); see the comments by OffsetAdjuster::AdjustOffsets()) and
// anything beyond the end of the string is invalidated.
//
// |T| is any type exposing a static |npos| member, e.g. a string type.
template <typename T>
struct LimitOffset {
  explicit LimitOffset(size_t limit)
    : limit_(limit) {}

  // Overwrites |offset| with T::npos when it exceeds the limit; otherwise
  // leaves it untouched.  Const because applying the limit never modifies the
  // functor itself, so const instances can be invoked too.
  void operator()(size_t& offset) const {
    if (offset > limit_)
      offset = T::npos;
  }

  // Largest offset that is still considered valid (inclusive).
  size_t limit_;
};

}  // namespace base

#endif  // BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_