1f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling// =================================================================================================
2f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling// ADOBE SYSTEMS INCORPORATED
3f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling// Copyright 2006 Adobe Systems Incorporated
4f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling// All Rights Reserved
5f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling//
6f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling// NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
7f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling// of the Adobe license agreement accompanying it.
8f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling// =================================================================================================
9f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
10f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
11f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
12f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberlingpackage com.adobe.xmp.impl;
13f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
14f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberlingimport java.io.UnsupportedEncodingException;
15f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
16f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
17f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling/**
18f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling * @since   12.10.2006
19f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling */
20f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberlingpublic class Latin1Converter
21f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling{
22f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	/** */
23f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	private static final int STATE_START = 0;
24f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	/** */
25f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	private static final int STATE_UTF8CHAR = 11;
26f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
27f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
28f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	/**
29f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * Private constructor
30f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 */
31f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	private Latin1Converter()
32f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	{
33f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		// EMPTY
34f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	}
35f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
36f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
37f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	/**
38f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * A converter that processes a byte buffer containing a mix of UTF8 and Latin-1/Cp1252 chars.
39f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * The result is a buffer where those chars have been converted to UTF-8;
40f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * that means it contains only valid UTF-8 chars.
41f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * <p>
42f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * <em>Explanation of the processing:</em> First the encoding of the buffer is detected looking
43f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * at the first four bytes (that works only if the buffer starts with an ASCII-char,
44f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * like xmls &apos;&lt;&apos;). UTF-16/32 flavours do not require further proccessing.
45f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * <p>
46f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * In the case, UTF-8 is detected, it assumes wrong UTF8 chars to be a sequence of
47f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * Latin-1/Cp1252 encoded bytes and converts the chars to their corresponding UTF-8 byte
48f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * sequence.
49f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * <p>
50f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * The 0x80..0x9F range is undefined in Latin-1, but is defined in Windows code
51f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * page 1252. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined
52f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * by Windows 1252. These are in XML's RestrictedChar set, so we map them to a
53f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * space.
54f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * <p>
55f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * The official Latin-1 characters in the range 0xA0..0xFF are converted into
56f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * the Unicode Latin Supplement range U+00A0 - U+00FF.
57f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * <p>
58f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * <em>Example:</em> If an Euro-symbol (€) appears in the byte buffer (0xE2, 0x82, 0xAC),
59f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * it will be left as is. But if only the first two bytes are appearing,
60f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * followed by an ASCII char a (0xE2 - 0x82 - 0x41), it will be converted to
61f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * 0xC3, 0xA2 (â) - 0xE2, 0x80, 0x9A (‚) - 0x41 (a).
62f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 *
63f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * @param buffer a byte buffer contain
64f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * @return Returns a new buffer containing valid UTF-8
65f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 */
66f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	public static ByteBuffer convert(ByteBuffer buffer)
67f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	{
68f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		if ("UTF-8".equals(buffer.getEncoding()))
69f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		{
70f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			// the buffer containing one UTF-8 char (up to 8 bytes)
71f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			byte[] readAheadBuffer = new byte[8];
72f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			// the number of bytes read ahead.
73f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			int readAhead  = 0;
74f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			// expected UTF8 bytesto come
75f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			int expectedBytes = 0;
76f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			// output buffer with estimated length
77f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			ByteBuffer out = new ByteBuffer(buffer.length() * 4 / 3);
78f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
79f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			int state = STATE_START;
80f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			for (int i = 0; i < buffer.length(); i++)
81f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			{
82f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling				int b = buffer.charAt(i);
83f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
84f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling				switch (state)
85f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling				{
86f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling					default:
87f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling					case STATE_START:
88f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						if (b < 0x7F)
89f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						{
90f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							out.append((byte) b);
91f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						}
92f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						else if (b >= 0xC0)
93f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						{
94f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							// start of UTF8 sequence
95f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							expectedBytes = -1;
96f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							int test = b;
97f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							for (; expectedBytes < 8  &&  (test & 0x80) == 0x80; test = test << 1)
98f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							{
99f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling								expectedBytes++;
100f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							}
101f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							readAheadBuffer[readAhead++] = (byte) b;
102f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							state = STATE_UTF8CHAR;
103f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						}
104f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						else //  implicitly:  b >= 0x80  &&  b < 0xC0
105f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						{
106f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							// invalid UTF8 start char, assume to be Latin-1
107f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							byte[] utf8 = convertToUTF8((byte) b);
108f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							out.append(utf8);
109f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						}
110f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						break;
111f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
112f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling					case STATE_UTF8CHAR:
113f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						if (expectedBytes > 0  &&  (b & 0xC0) == 0x80)
114f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						{
115f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							// valid UTF8 char, add to readAheadBuffer
116f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							readAheadBuffer[readAhead++] = (byte) b;
117f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							expectedBytes--;
118f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
119f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							if (expectedBytes == 0)
120f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							{
121f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling								out.append(readAheadBuffer, 0, readAhead);
122f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling								readAhead = 0;
123f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
124f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling								state = STATE_START;
125f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							}
126f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						}
127f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						else
128f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						{
129f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							// invalid UTF8 char:
130f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							// 1. convert first of seq to UTF8
131f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							byte[] utf8 = convertToUTF8(readAheadBuffer[0]);
132f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							out.append(utf8);
133f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
134f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							// 2. continue processing at second byte of sequence
135f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							i = i - readAhead;
136f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							readAhead = 0;
137f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
138f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling							state = STATE_START;
139f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						}
140f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling						break;
141f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling				}
142f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			}
143f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
144f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			// loop ends with "half" Utf8 char --> assume that the bytes are Latin-1
145f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			if (state == STATE_UTF8CHAR)
146f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			{
147f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling				for (int j = 0; j < readAhead; j++)
148f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling				{
149f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling					byte b = readAheadBuffer[j];
150f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling					byte[] utf8 = convertToUTF8(b);
151f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling					out.append(utf8);
152f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling				}
153f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			}
154f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
155f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			return out;
156f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		}
157f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		else
158f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		{
159f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			// Latin-1 fixing applies only to UTF-8
160f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			return buffer;
161f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		}
162f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	}
163f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
164f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
165f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	/**
166f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * Converts a Cp1252 char (contains all Latin-1 chars above 0x80) into a
167f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * UTF-8 byte sequence. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are
168f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * formally undefined by Windows 1252 and therefore replaced by a space
169f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * (0x20).
170f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 *
171f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * @param ch
172f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 *            an Cp1252 / Latin-1 byte
173f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 * @return Returns a byte array containing a UTF-8 byte sequence.
174f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	 */
175f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	private static byte[] convertToUTF8(byte ch)
176f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	{
177f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		int c = ch & 0xFF;
178f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		try
179f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		{
180f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			if (c >= 0x80)
181f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			{
182f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling				if (c == 0x81  ||  c == 0x8D  ||  c == 0x8F  ||  c == 0x90  ||  c == 0x9D)
183f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling				{
184f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling					return new byte[] { 0x20 }; // space for undefined
185f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling				}
186f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling
187f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling				// interpret byte as Windows Cp1252 char
188f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling				return new String(new byte[] { ch }, "cp1252").getBytes("UTF-8");
189f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			}
190f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		}
191f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		catch (UnsupportedEncodingException e)
192f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		{
193f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling			// EMPTY
194f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		}
195f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling		return new byte[] { ch };
196f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling	}
197f12f744843a67c910ec325fc6dfa73988f67b97cSascha Haeberling}
198