1//
2//  ========================================================================
3//  Copyright (c) 1995-2014 Mort Bay Consulting Pty. Ltd.
4//  ------------------------------------------------------------------------
5//  All rights reserved. This program and the accompanying materials
6//  are made available under the terms of the Eclipse Public License v1.0
7//  and Apache License v2.0 which accompanies this distribution.
8//
9//      The Eclipse Public License is available at
10//      http://www.eclipse.org/legal/epl-v10.html
11//
12//      The Apache License v2.0 is available at
13//      http://www.opensource.org/licenses/apache2.0.php
14//
15//  You may elect to redistribute this code under either of these licenses.
16//  ========================================================================
17//
18
19package org.eclipse.jetty.util;
20
21import java.io.IOException;
22
23import org.eclipse.jetty.util.log.Log;
24import org.eclipse.jetty.util.log.Logger;
25
26/* ------------------------------------------------------------ */
27/**
28 * Utf8 Appendable abstract base class
29 *
30 * This abstract class wraps a standard {@link java.lang.Appendable} and provides methods to append UTF-8 encoded bytes, that are converted into characters.
31 *
32 * This class is stateful and up to 4 calls to {@link #append(byte)} may be needed before state a character is appended to the string buffer.
33 *
34 * The UTF-8 decoding is done by this class and no additional buffers or Readers are used. The UTF-8 code was inspired by
35 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
36 *
37 * License information for Bjoern Hoehrmann's code:
38 *
39 * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
40 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal
41 * in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
42 * copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
43 *
44 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
45 *
46 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
47 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
48 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
49 **/
50public abstract class Utf8Appendable
51{
52    protected static final Logger LOG = Log.getLogger(Utf8Appendable.class);
53    public static final char REPLACEMENT = '\ufffd';
54    private static final int UTF8_ACCEPT = 0;
55    private static final int UTF8_REJECT = 12;
56
57    protected final Appendable _appendable;
58    protected int _state = UTF8_ACCEPT;
59
60    private static final byte[] BYTE_TABLE =
61    {
62        // The first part of the table maps bytes to character classes that
63        // to reduce the size of the transition table and create bitmasks.
64         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
65         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
66         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
67         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
68         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
69         7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
70         8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
71        10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8
72    };
73
74    private static final byte[] TRANS_TABLE =
75    {
76        // The second part is a transition table that maps a combination
77        // of a state of the automaton and a character class to a state.
78         0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
79        12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
80        12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
81        12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
82        12,36,12,12,12,12,12,12,12,12,12,12
83    };
84
85    private int _codep;
86
87    public Utf8Appendable(Appendable appendable)
88    {
89        _appendable = appendable;
90    }
91
92    public abstract int length();
93
94    protected void reset()
95    {
96        _state = UTF8_ACCEPT;
97    }
98
99    public void append(byte b)
100    {
101        try
102        {
103            appendByte(b);
104        }
105        catch (IOException e)
106        {
107            throw new RuntimeException(e);
108        }
109    }
110
111    public void append(byte[] b, int offset, int length)
112    {
113        try
114        {
115            int end = offset + length;
116            for (int i = offset; i < end; i++)
117                appendByte(b[i]);
118        }
119        catch (IOException e)
120        {
121            throw new RuntimeException(e);
122        }
123    }
124
125    public boolean append(byte[] b, int offset, int length, int maxChars)
126    {
127        try
128        {
129            int end = offset + length;
130            for (int i = offset; i < end; i++)
131            {
132                if (length() > maxChars)
133                    return false;
134                appendByte(b[i]);
135            }
136            return true;
137        }
138        catch (IOException e)
139        {
140            throw new RuntimeException(e);
141        }
142    }
143
144    protected void appendByte(byte b) throws IOException
145    {
146
147        if (b > 0 && _state == UTF8_ACCEPT)
148        {
149            _appendable.append((char)(b & 0xFF));
150        }
151        else
152        {
153            int i = b & 0xFF;
154            int type = BYTE_TABLE[i];
155            _codep = _state == UTF8_ACCEPT ? (0xFF >> type) & i : (i & 0x3F) | (_codep << 6);
156            int next = TRANS_TABLE[_state + type];
157
158            switch(next)
159            {
160                case UTF8_ACCEPT:
161                    _state=next;
162                    if (_codep < Character.MIN_HIGH_SURROGATE)
163                    {
164                        _appendable.append((char)_codep);
165                    }
166                    else
167                    {
168                        for (char c : Character.toChars(_codep))
169                            _appendable.append(c);
170                    }
171                    break;
172
173                case UTF8_REJECT:
174                    String reason = "byte "+TypeUtil.toHexString(b)+" in state "+(_state/12);
175                    _codep=0;
176                    _state = UTF8_ACCEPT;
177                    _appendable.append(REPLACEMENT);
178                    throw new NotUtf8Exception(reason);
179
180                default:
181                    _state=next;
182
183            }
184        }
185    }
186
187    public boolean isUtf8SequenceComplete()
188    {
189        return _state == UTF8_ACCEPT;
190    }
191
192    public static class NotUtf8Exception extends IllegalArgumentException
193    {
194        public NotUtf8Exception(String reason)
195        {
196            super("Not valid UTF8! "+reason);
197        }
198    }
199
200    protected void checkState()
201    {
202        if (!isUtf8SequenceComplete())
203        {
204            _codep=0;
205            _state = UTF8_ACCEPT;
206            try
207            {
208                _appendable.append(REPLACEMENT);
209            }
210            catch(IOException e)
211            {
212                throw new RuntimeException(e);
213            }
214            throw new NotUtf8Exception("incomplete UTF8 sequence");
215        }
216    }
217
218    public String toReplacedString()
219    {
220        if (!isUtf8SequenceComplete())
221        {
222            _codep=0;
223            _state = UTF8_ACCEPT;
224            try
225            {
226                _appendable.append(REPLACEMENT);
227            }
228            catch(IOException e)
229            {
230                throw new RuntimeException(e);
231            }
232            Throwable th= new NotUtf8Exception("incomplete UTF8 sequence");
233            LOG.warn(th.toString());
234            LOG.debug(th);
235        }
236        return _appendable.toString();
237    }
238}
239