// =================================================================================================
// ADOBE SYSTEMS INCORPORATED
// Copyright 2006 Adobe Systems Incorporated
// All Rights Reserved
//
// NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
// of the Adobe license agreement accompanying it.
// =================================================================================================



package com.adobe.xmp.impl;

import java.io.UnsupportedEncodingException;


/**
 * Repairs byte buffers that are labelled as UTF-8 but contain stray
 * Latin-1 / Windows Cp1252 bytes. The class only offers static methods
 * and cannot be instantiated.
 *
 * @since   12.10.2006
 */
public class Latin1Converter
{
	/** Parser state: the next byte starts a new character. */
	private static final int STATE_START = 0;
	/** Parser state: inside a multi-byte UTF-8 sequence, continuation bytes are expected. */
	private static final int STATE_UTF8CHAR = 11;


	/**
	 * Private constructor
	 */
	private Latin1Converter()
	{
		// EMPTY
	}


	/**
	 * A converter that processes a byte buffer containing a mix of UTF8 and Latin-1/Cp1252 chars.
	 * The result is a buffer where those chars have been converted to UTF-8;
	 * that means it contains only valid UTF-8 chars.
	 * <p>
	 * <em>Explanation of the processing:</em> First the encoding of the buffer is detected looking
	 * at the first four bytes (that works only if the buffer starts with an ASCII-char,
	 * like xmls '&lt;'). UTF-16/32 flavours do not require further processing.
	 * <p>
	 * In the case, UTF-8 is detected, it assumes wrong UTF8 chars to be a sequence of
	 * Latin-1/Cp1252 encoded bytes and converts the chars to their corresponding UTF-8 byte
	 * sequence.
	 * <p>
	 * The 0x80..0x9F range is undefined in Latin-1, but is defined in Windows code
	 * page 1252. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined
	 * by Windows 1252. These are in XML's RestrictedChar set, so we map them to a
	 * space.
	 * <p>
	 * The official Latin-1 characters in the range 0xA0..0xFF are converted into
	 * the Unicode Latin Supplement range U+00A0 - U+00FF.
	 * <p>
	 * <em>Example:</em> If an Euro-symbol (€) appears in the byte buffer (0xE2, 0x82, 0xAC),
	 * it will be left as is. But if only the first two bytes are appearing,
	 * followed by an ASCII char a (0xE2 - 0x82 - 0x41), it will be converted to
	 * 0xC3, 0xA2 (â) - 0xE2, 0x80, 0x9A (‚) - 0x41 (a).
	 *
	 * @param buffer a byte buffer containing UTF-8 that may be mixed with Latin-1/Cp1252 bytes
	 * @return Returns a new buffer containing valid UTF-8; if the encoding reported by
	 *         the buffer is not "UTF-8", the passed buffer itself is returned unchanged.
	 */
	public static ByteBuffer convert(ByteBuffer buffer)
	{
		if (!"UTF-8".equals(buffer.getEncoding()))
		{
			// Latin-1 fixing applies only to UTF-8
			return buffer;
		}

		// collects the bytes of one candidate UTF-8 char (lead byte + up to 7 followers)
		byte[] readAheadBuffer = new byte[8];
		// the number of bytes currently held in readAheadBuffer
		int readAhead = 0;
		// continuation bytes that are still missing for the current sequence
		int expectedBytes = 0;
		// output buffer with estimated length; same value as length * 4 / 3,
		// but without the overflow-prone intermediate product
		ByteBuffer out = new ByteBuffer(buffer.length() + buffer.length() / 3);

		int state = STATE_START;
		for (int i = 0; i < buffer.length(); i++)
		{
			// NOTE(review): the comparisons below rely on charAt() delivering
			// the byte as an unsigned value (0..255) - confirm against ByteBuffer
			int b = buffer.charAt(i);

			switch (state)
			{
				default:
				case STATE_START:
					if (b < 0x80)
					{
						// plain ASCII (including DEL, 0x7F) is copied verbatim
						out.append((byte) b);
					}
					else if (b >= 0xC0)
					{
						// start of UTF8 sequence: the number of leading 1-bits minus one
						// is the number of continuation bytes that have to follow
						// NOTE(review): lead bytes 0xC0, 0xC1 and 0xF5..0xFF are accepted
						// here although RFC 3629 never allows them - confirm whether
						// callers depend on that before tightening
						expectedBytes = -1;
						int test = b;
						for (; expectedBytes < 8  &&  (test & 0x80) == 0x80; test = test << 1)
						{
							expectedBytes++;
						}
						readAheadBuffer[readAhead++] = (byte) b;
						state = STATE_UTF8CHAR;
					}
					else // b >= 0x80 && b < 0xC0
					{
						// a continuation byte cannot start a char, assume it to be Latin-1
						out.append(convertToUTF8((byte) b));
					}
					break;

				case STATE_UTF8CHAR:
					if (expectedBytes > 0  &&  (b & 0xC0) == 0x80)
					{
						// valid UTF8 continuation byte, add to readAheadBuffer
						readAheadBuffer[readAhead++] = (byte) b;
						expectedBytes--;

						if (expectedBytes == 0)
						{
							// sequence is complete, copy it through unchanged
							out.append(readAheadBuffer, 0, readAhead);
							readAhead = 0;

							state = STATE_START;
						}
					}
					else
					{
						// invalid UTF8 char:
						// 1. convert first of seq to UTF8
						out.append(convertToUTF8(readAheadBuffer[0]));

						// 2. continue processing at second byte of sequence
						//    (the loop increment moves i from the lead byte to its successor)
						i = i - readAhead;
						readAhead = 0;

						state = STATE_START;
					}
					break;
			}
		}

		// loop ends with "half" Utf8 char --> assume that the bytes are Latin-1
		if (state == STATE_UTF8CHAR)
		{
			for (int j = 0; j < readAhead; j++)
			{
				out.append(convertToUTF8(readAheadBuffer[j]));
			}
		}

		return out;
	}


	/**
	 * Converts a Cp1252 char (contains all Latin-1 chars above 0x80) into a
	 * UTF-8 byte sequence. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are
	 * formally undefined by Windows 1252 and therefore replaced by a space
	 * (0x20). Bytes below 0x80 are returned unchanged.
	 * <p>
	 * If the runtime does not provide the "cp1252" charset, the result is
	 * still valid UTF-8: 0xA0..0xFF are encoded as U+00A0..U+00FF and the
	 * Cp1252-only range 0x80..0x9F is replaced by a space.
	 *
	 * @param ch
	 *            an Cp1252 / Latin-1 byte
	 * @return Returns a byte array containing a UTF-8 byte sequence.
	 */
	private static byte[] convertToUTF8(byte ch)
	{
		int c = ch & 0xFF;
		if (c < 0x80)
		{
			// ASCII is identical in UTF-8
			return new byte[] { ch };
		}

		if (c == 0x81  ||  c == 0x8D  ||  c == 0x8F  ||  c == 0x90  ||  c == 0x9D)
		{
			return new byte[] { 0x20 }; // space for undefined
		}

		try
		{
			// interpret byte as Windows Cp1252 char
			return new String(new byte[] { ch }, "cp1252").getBytes("UTF-8");
		}
		catch (UnsupportedEncodingException e)
		{
			// Never hand back the raw high byte: on its own it is not valid UTF-8.
			if (c >= 0xA0)
			{
				// Latin-1 and Unicode agree in 0xA0..0xFF, emit the two-byte form
				return new byte[] { (byte) (0xC0 | (c >> 6)), (byte) (0x80 | (c & 0x3F)) };
			}
			// 0x80..0x9F cannot be mapped without the Cp1252 table
			return new byte[] { 0x20 };
		}
	}
}