1/****************************************************************
2 * Licensed to the Apache Software Foundation (ASF) under one   *
3 * or more contributor license agreements.  See the NOTICE file *
4 * distributed with this work for additional information        *
5 * regarding copyright ownership.  The ASF licenses this file   *
6 * to you under the Apache License, Version 2.0 (the            *
7 * "License"); you may not use this file except in compliance   *
8 * with the License.  You may obtain a copy of the License at   *
9 *                                                              *
10 *   http://www.apache.org/licenses/LICENSE-2.0                 *
11 *                                                              *
12 * Unless required by applicable law or agreed to in writing,   *
13 * software distributed under the License is distributed on an  *
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15 * KIND, either express or implied.  See the License for the    *
16 * specific language governing permissions and limitations      *
17 * under the License.                                           *
18 ****************************************************************/
19
20package org.apache.james.mime4j.codec;
21
22import java.nio.ByteBuffer;
23import java.nio.charset.Charset;
24import java.util.BitSet;
25import java.util.Locale;
26
27import org.apache.james.mime4j.util.CharsetUtil;
28
29/**
30 * ANDROID:  THIS CLASS IS COPIED FROM A NEWER VERSION OF MIME4J
31 */
32
33/**
34 * Static methods for encoding header field values. This includes encoded-words
35 * as defined in <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a>
36 * or display-names of an e-mail address, for example.
37 *
38 */
39public class EncoderUtil {
40
41    // This array is a lookup table that translates 6-bit positive integer index
42    // values into their "Base64 Alphabet" equivalents as specified in Table 1
43    // of RFC 2045.
44    // ANDROID:  THIS TABLE IS COPIED FROM BASE64OUTPUTSTREAM
45    static final byte[] BASE64_TABLE = { 'A', 'B', 'C', 'D', 'E', 'F',
46            'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
47            'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
48            'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
49            't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5',
50            '6', '7', '8', '9', '+', '/' };
51
52    // Byte used to pad output.
53    private static final byte BASE64_PAD = '=';
54
55    private static final BitSet Q_REGULAR_CHARS = initChars("=_?");
56
57    private static final BitSet Q_RESTRICTED_CHARS = initChars("=_?\"#$%&'(),.:;<>@[\\]^`{|}~");
58
59    private static final int MAX_USED_CHARACTERS = 50;
60
61    private static final String ENC_WORD_PREFIX = "=?";
62    private static final String ENC_WORD_SUFFIX = "?=";
63
64    private static final int ENCODED_WORD_MAX_LENGTH = 75; // RFC 2047
65
66    private static final BitSet TOKEN_CHARS = initChars("()<>@,;:\\\"/[]?=");
67
68    private static final BitSet ATEXT_CHARS = initChars("()<>@.,;:\\\"[]");
69
70    private static BitSet initChars(String specials) {
71        BitSet bs = new BitSet(128);
72        for (char ch = 33; ch < 127; ch++) {
73            if (specials.indexOf(ch) == -1) {
74                bs.set(ch);
75            }
76        }
77        return bs;
78    }
79
80    /**
81     * Selects one of the two encodings specified in RFC 2047.
82     */
83    public enum Encoding {
84        /** The B encoding (identical to base64 defined in RFC 2045). */
85        B,
86        /** The Q encoding (similar to quoted-printable defined in RFC 2045). */
87        Q
88    }
89
90    /**
91     * Indicates the intended usage of an encoded word.
92     */
93    public enum Usage {
94        /**
95         * Encoded word is used to replace a 'text' token in any Subject or
96         * Comments header field.
97         */
98        TEXT_TOKEN,
99        /**
100         * Encoded word is used to replace a 'word' entity within a 'phrase',
101         * for example, one that precedes an address in a From, To, or Cc
102         * header.
103         */
104        WORD_ENTITY
105    }
106
107    private EncoderUtil() {
108    }
109
110    /**
111     * Encodes the display-name portion of an address. See <a
112     * href='http://www.faqs.org/rfcs/rfc5322.html'>RFC 5322</a> section 3.4
113     * and <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a> section
114     * 5.3. The specified string should not be folded.
115     *
116     * @param displayName
117     *            display-name to encode.
118     * @return encoded display-name.
119     */
120    public static String encodeAddressDisplayName(String displayName) {
121        // display-name = phrase
122        // phrase = 1*( encoded-word / word )
123        // word = atom / quoted-string
124        // atom = [CFWS] 1*atext [CFWS]
125        // CFWS = comment or folding white space
126
127        if (isAtomPhrase(displayName)) {
128            return displayName;
129        } else if (hasToBeEncoded(displayName, 0)) {
130            return encodeEncodedWord(displayName, Usage.WORD_ENTITY);
131        } else {
132            return quote(displayName);
133        }
134    }
135
136    /**
137     * Encodes the local part of an address specification as described in RFC
138     * 5322 section 3.4.1. Leading and trailing CFWS should have been removed
139     * before calling this method. The specified string should not contain any
140     * illegal (control or non-ASCII) characters.
141     *
142     * @param localPart
143     *            the local part to encode
144     * @return the encoded local part.
145     */
146    public static String encodeAddressLocalPart(String localPart) {
147        // local-part = dot-atom / quoted-string
148        // dot-atom = [CFWS] dot-atom-text [CFWS]
149        // CFWS = comment or folding white space
150
151        if (isDotAtomText(localPart)) {
152            return localPart;
153        } else {
154            return quote(localPart);
155        }
156    }
157
158    /**
159     * Encodes the specified strings into a header parameter as described in RFC
160     * 2045 section 5.1 and RFC 2183 section 2. The specified strings should not
161     * contain any illegal (control or non-ASCII) characters.
162     *
163     * @param name
164     *            parameter name.
165     * @param value
166     *            parameter value.
167     * @return encoded result.
168     */
169    public static String encodeHeaderParameter(String name, String value) {
170        name = name.toLowerCase(Locale.US);
171
172        // value := token / quoted-string
173        if (isToken(value)) {
174            return name + "=" + value;
175        } else {
176            return name + "=" + quote(value);
177        }
178    }
179
180    /**
181     * Shortcut method that encodes the specified text into an encoded-word if
182     * the text has to be encoded.
183     *
184     * @param text
185     *            text to encode.
186     * @param usage
187     *            whether the encoded-word is to be used to replace a text token
188     *            or a word entity (see RFC 822).
189     * @param usedCharacters
190     *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
191     * @return the specified text if encoding is not necessary or an encoded
192     *         word or a sequence of encoded words otherwise.
193     */
194    public static String encodeIfNecessary(String text, Usage usage,
195            int usedCharacters) {
196        if (hasToBeEncoded(text, usedCharacters))
197            return encodeEncodedWord(text, usage, usedCharacters);
198        else
199            return text;
200    }
201
202    /**
203     * Determines if the specified string has to encoded into an encoded-word.
204     * Returns <code>true</code> if the text contains characters that don't
205     * fall into the printable ASCII character set or if the text contains a
206     * 'word' (sequence of non-whitespace characters) longer than 77 characters
207     * (including characters already used up in the line).
208     *
209     * @param text
210     *            text to analyze.
211     * @param usedCharacters
212     *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
213     * @return <code>true</code> if the specified text has to be encoded into
214     *         an encoded-word, <code>false</code> otherwise.
215     */
216    public static boolean hasToBeEncoded(String text, int usedCharacters) {
217        if (text == null)
218            throw new IllegalArgumentException();
219        if (usedCharacters < 0 || usedCharacters > MAX_USED_CHARACTERS)
220            throw new IllegalArgumentException();
221
222        int nonWhiteSpaceCount = usedCharacters;
223
224        for (int idx = 0; idx < text.length(); idx++) {
225            char ch = text.charAt(idx);
226            if (ch == '\t' || ch == ' ') {
227                nonWhiteSpaceCount = 0;
228            } else {
229                nonWhiteSpaceCount++;
230                if (nonWhiteSpaceCount > 77) {
231                    // Line cannot be folded into multiple lines with no more
232                    // than 78 characters each. Encoding as encoded-words makes
233                    // that possible. One character has to be reserved for
234                    // folding white space; that leaves 77 characters.
235                    return true;
236                }
237
238                if (ch < 32 || ch >= 127) {
239                    // non-printable ascii character has to be encoded
240                    return true;
241                }
242            }
243        }
244
245        return false;
246    }
247
248    /**
249     * Encodes the specified text into an encoded word or a sequence of encoded
250     * words separated by space. The text is separated into a sequence of
251     * encoded words if it does not fit in a single one.
252     * <p>
253     * The charset to encode the specified text into a byte array and the
254     * encoding to use for the encoded-word are detected automatically.
255     * <p>
256     * This method assumes that zero characters have already been used up in the
257     * current line.
258     *
259     * @param text
260     *            text to encode.
261     * @param usage
262     *            whether the encoded-word is to be used to replace a text token
263     *            or a word entity (see RFC 822).
264     * @return the encoded word (or sequence of encoded words if the given text
265     *         does not fit in a single encoded word).
266     * @see #hasToBeEncoded(String, int)
267     */
268    public static String encodeEncodedWord(String text, Usage usage) {
269        return encodeEncodedWord(text, usage, 0, null, null);
270    }
271
272    /**
273     * Encodes the specified text into an encoded word or a sequence of encoded
274     * words separated by space. The text is separated into a sequence of
275     * encoded words if it does not fit in a single one.
276     * <p>
277     * The charset to encode the specified text into a byte array and the
278     * encoding to use for the encoded-word are detected automatically.
279     *
280     * @param text
281     *            text to encode.
282     * @param usage
283     *            whether the encoded-word is to be used to replace a text token
284     *            or a word entity (see RFC 822).
285     * @param usedCharacters
286     *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
287     * @return the encoded word (or sequence of encoded words if the given text
288     *         does not fit in a single encoded word).
289     * @see #hasToBeEncoded(String, int)
290     */
291    public static String encodeEncodedWord(String text, Usage usage,
292            int usedCharacters) {
293        return encodeEncodedWord(text, usage, usedCharacters, null, null);
294    }
295
296    /**
297     * Encodes the specified text into an encoded word or a sequence of encoded
298     * words separated by space. The text is separated into a sequence of
299     * encoded words if it does not fit in a single one.
300     *
301     * @param text
302     *            text to encode.
303     * @param usage
304     *            whether the encoded-word is to be used to replace a text token
305     *            or a word entity (see RFC 822).
306     * @param usedCharacters
307     *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
308     * @param charset
309     *            the Java charset that should be used to encode the specified
310     *            string into a byte array. A suitable charset is detected
311     *            automatically if this parameter is <code>null</code>.
312     * @param encoding
313     *            the encoding to use for the encoded-word (either B or Q). A
314     *            suitable encoding is automatically chosen if this parameter is
315     *            <code>null</code>.
316     * @return the encoded word (or sequence of encoded words if the given text
317     *         does not fit in a single encoded word).
318     * @see #hasToBeEncoded(String, int)
319     */
320    public static String encodeEncodedWord(String text, Usage usage,
321            int usedCharacters, Charset charset, Encoding encoding) {
322        if (text == null)
323            throw new IllegalArgumentException();
324        if (usedCharacters < 0 || usedCharacters > MAX_USED_CHARACTERS)
325            throw new IllegalArgumentException();
326
327        if (charset == null)
328            charset = determineCharset(text);
329
330        String mimeCharset = CharsetUtil.toMimeCharset(charset.name());
331        if (mimeCharset == null) {
332            // cannot happen if charset was originally null
333            throw new IllegalArgumentException("Unsupported charset");
334        }
335
336        byte[] bytes = encode(text, charset);
337
338        if (encoding == null)
339            encoding = determineEncoding(bytes, usage);
340
341        if (encoding == Encoding.B) {
342            String prefix = ENC_WORD_PREFIX + mimeCharset + "?B?";
343            return encodeB(prefix, text, usedCharacters, charset, bytes);
344        } else {
345            String prefix = ENC_WORD_PREFIX + mimeCharset + "?Q?";
346            return encodeQ(prefix, text, usage, usedCharacters, charset, bytes);
347        }
348    }
349
350    /**
351     * Encodes the specified byte array using the B encoding defined in RFC
352     * 2047.
353     *
354     * @param bytes
355     *            byte array to encode.
356     * @return encoded string.
357     */
358    public static String encodeB(byte[] bytes) {
359        StringBuilder sb = new StringBuilder();
360
361        int idx = 0;
362        final int end = bytes.length;
363        for (; idx < end - 2; idx += 3) {
364            int data = (bytes[idx] & 0xff) << 16 | (bytes[idx + 1] & 0xff) << 8
365                    | bytes[idx + 2] & 0xff;
366            sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
367            sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
368            sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]);
369            sb.append((char) BASE64_TABLE[data & 0x3f]);
370        }
371
372        if (idx == end - 2) {
373            int data = (bytes[idx] & 0xff) << 16 | (bytes[idx + 1] & 0xff) << 8;
374            sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
375            sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
376            sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]);
377            sb.append((char) BASE64_PAD);
378
379        } else if (idx == end - 1) {
380            int data = (bytes[idx] & 0xff) << 16;
381            sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
382            sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
383            sb.append((char) BASE64_PAD);
384            sb.append((char) BASE64_PAD);
385        }
386
387        return sb.toString();
388    }
389
390    /**
391     * Encodes the specified byte array using the Q encoding defined in RFC
392     * 2047.
393     *
394     * @param bytes
395     *            byte array to encode.
396     * @param usage
397     *            whether the encoded-word is to be used to replace a text token
398     *            or a word entity (see RFC 822).
399     * @return encoded string.
400     */
401    public static String encodeQ(byte[] bytes, Usage usage) {
402        BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
403                : Q_RESTRICTED_CHARS;
404
405        StringBuilder sb = new StringBuilder();
406
407        final int end = bytes.length;
408        for (int idx = 0; idx < end; idx++) {
409            int v = bytes[idx] & 0xff;
410            if (v == 32) {
411                sb.append('_');
412            } else if (!qChars.get(v)) {
413                sb.append('=');
414                sb.append(hexDigit(v >>> 4));
415                sb.append(hexDigit(v & 0xf));
416            } else {
417                sb.append((char) v);
418            }
419        }
420
421        return sb.toString();
422    }
423
424    /**
425     * Tests whether the specified string is a token as defined in RFC 2045
426     * section 5.1.
427     *
428     * @param str
429     *            string to test.
430     * @return <code>true</code> if the specified string is a RFC 2045 token,
431     *         <code>false</code> otherwise.
432     */
433    public static boolean isToken(String str) {
434        // token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, or tspecials>
435        // tspecials := "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "\" /
436        // <"> / "/" / "[" / "]" / "?" / "="
437        // CTL := 0.- 31., 127.
438
439        final int length = str.length();
440        if (length == 0)
441            return false;
442
443        for (int idx = 0; idx < length; idx++) {
444            char ch = str.charAt(idx);
445            if (!TOKEN_CHARS.get(ch))
446                return false;
447        }
448
449        return true;
450    }
451
452    private static boolean isAtomPhrase(String str) {
453        // atom = [CFWS] 1*atext [CFWS]
454
455        boolean containsAText = false;
456
457        final int length = str.length();
458        for (int idx = 0; idx < length; idx++) {
459            char ch = str.charAt(idx);
460            if (ATEXT_CHARS.get(ch)) {
461                containsAText = true;
462            } else if (!CharsetUtil.isWhitespace(ch)) {
463                return false;
464            }
465        }
466
467        return containsAText;
468    }
469
470    // RFC 5322 section 3.2.3
471    private static boolean isDotAtomText(String str) {
472        // dot-atom-text = 1*atext *("." 1*atext)
473        // atext = ALPHA / DIGIT / "!" / "#" / "$" / "%" / "&" / "'" / "*" /
474        // "+" / "-" / "/" / "=" / "?" / "^" / "_" / "`" / "{" / "|" / "}" / "~"
475
476        char prev = '.';
477
478        final int length = str.length();
479        if (length == 0)
480            return false;
481
482        for (int idx = 0; idx < length; idx++) {
483            char ch = str.charAt(idx);
484
485            if (ch == '.') {
486                if (prev == '.' || idx == length - 1)
487                    return false;
488            } else {
489                if (!ATEXT_CHARS.get(ch))
490                    return false;
491            }
492
493            prev = ch;
494        }
495
496        return true;
497    }
498
499    // RFC 5322 section 3.2.4
500    private static String quote(String str) {
501        // quoted-string = [CFWS] DQUOTE *([FWS] qcontent) [FWS] DQUOTE [CFWS]
502        // qcontent = qtext / quoted-pair
503        // qtext = %d33 / %d35-91 / %d93-126
504        // quoted-pair = ("\" (VCHAR / WSP))
505        // VCHAR = %x21-7E
506        // DQUOTE = %x22
507
508        String escaped = str.replaceAll("[\\\\\"]", "\\\\$0");
509        return "\"" + escaped + "\"";
510    }
511
512    private static String encodeB(String prefix, String text,
513            int usedCharacters, Charset charset, byte[] bytes) {
514        int encodedLength = bEncodedLength(bytes);
515
516        int totalLength = prefix.length() + encodedLength
517                + ENC_WORD_SUFFIX.length();
518        if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) {
519            return prefix + encodeB(bytes) + ENC_WORD_SUFFIX;
520        } else {
521            int splitOffset = text.offsetByCodePoints(text.length() / 2, -1);
522
523            String part1 = text.substring(0, splitOffset);
524            byte[] bytes1 = encode(part1, charset);
525            String word1 = encodeB(prefix, part1, usedCharacters, charset,
526                    bytes1);
527
528            String part2 = text.substring(splitOffset);
529            byte[] bytes2 = encode(part2, charset);
530            String word2 = encodeB(prefix, part2, 0, charset, bytes2);
531
532            return word1 + " " + word2;
533        }
534    }
535
536    private static int bEncodedLength(byte[] bytes) {
537        return (bytes.length + 2) / 3 * 4;
538    }
539
540    private static String encodeQ(String prefix, String text, Usage usage,
541            int usedCharacters, Charset charset, byte[] bytes) {
542        int encodedLength = qEncodedLength(bytes, usage);
543
544        int totalLength = prefix.length() + encodedLength
545                + ENC_WORD_SUFFIX.length();
546        if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) {
547            return prefix + encodeQ(bytes, usage) + ENC_WORD_SUFFIX;
548        } else {
549            int splitOffset = text.offsetByCodePoints(text.length() / 2, -1);
550
551            String part1 = text.substring(0, splitOffset);
552            byte[] bytes1 = encode(part1, charset);
553            String word1 = encodeQ(prefix, part1, usage, usedCharacters,
554                    charset, bytes1);
555
556            String part2 = text.substring(splitOffset);
557            byte[] bytes2 = encode(part2, charset);
558            String word2 = encodeQ(prefix, part2, usage, 0, charset, bytes2);
559
560            return word1 + " " + word2;
561        }
562    }
563
564    private static int qEncodedLength(byte[] bytes, Usage usage) {
565        BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
566                : Q_RESTRICTED_CHARS;
567
568        int count = 0;
569
570        for (int idx = 0; idx < bytes.length; idx++) {
571            int v = bytes[idx] & 0xff;
572            if (v == 32) {
573                count++;
574            } else if (!qChars.get(v)) {
575                count += 3;
576            } else {
577                count++;
578            }
579        }
580
581        return count;
582    }
583
584    private static byte[] encode(String text, Charset charset) {
585        ByteBuffer buffer = charset.encode(text);
586        byte[] bytes = new byte[buffer.limit()];
587        buffer.get(bytes);
588        return bytes;
589    }
590
591    private static Charset determineCharset(String text) {
592        // it is an important property of iso-8859-1 that it directly maps
593        // unicode code points 0000 to 00ff to byte values 00 to ff.
594        boolean ascii = true;
595        final int len = text.length();
596        for (int index = 0; index < len; index++) {
597            char ch = text.charAt(index);
598            if (ch > 0xff) {
599                return CharsetUtil.UTF_8;
600            }
601            if (ch > 0x7f) {
602                ascii = false;
603            }
604        }
605        return ascii ? CharsetUtil.US_ASCII : CharsetUtil.ISO_8859_1;
606    }
607
608    private static Encoding determineEncoding(byte[] bytes, Usage usage) {
609        if (bytes.length == 0)
610            return Encoding.Q;
611
612        BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
613                : Q_RESTRICTED_CHARS;
614
615        int qEncoded = 0;
616        for (int i = 0; i < bytes.length; i++) {
617            int v = bytes[i] & 0xff;
618            if (v != 32 && !qChars.get(v)) {
619                qEncoded++;
620            }
621        }
622
623        int percentage = qEncoded * 100 / bytes.length;
624        return percentage > 30 ? Encoding.B : Encoding.Q;
625    }
626
627    private static char hexDigit(int i) {
628        return i < 10 ? (char) (i + '0') : (char) (i - 10 + 'A');
629    }
630}
631