#!/bin/sh
# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# References:
#   http://encoding.spec.whatwg.org/#shift_jis

# Download the following file, run this script in the source/data/mappings
# directory and save the result to shift_jis-html5.ucm
#   http://encoding.spec.whatwg.org/index-jis0208.txt

# Print the fixed .ucm header: the copyright banner, the converter
# properties, the MBCS state table and the opening CHARMAP line.
# Takes no arguments and writes to stdout.
#
# The here-document delimiter is quoted so that the text (which contains
# backslash sequences such as \xFC) is emitted literally, with no shell
# expansion applied to it.
preamble() {
cat <<'PREAMBLE'
# ***************************************************************************
# *
# * Copyright (C) 1995-2014, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# * Generated per the algorithm for Shift_JIS
# * described at http://encoding.spec.whatwg.org/#shift_jis
# *
# ***************************************************************************
<code_set_name> "shift_jis-html5"
<char_name_mask> "AXXXX"
<mb_cur_max> 2
<mb_cur_min> 1
<uconv_class> "MBCS"
<subchar> \xFC\xFC
<subchar1> \x7F
<icu:charsetFamily> "ASCII"

<icu:state> 0-80, 81-9f:1, a1-df, e0-fc:1
<icu:state> 40-7e, 80-fc

CHARMAP
PREAMBLE
}

# The encoding spec for Shift_JIS says U+0080 has to be round-tripped with
# 0x80. So, this is one character more than ASCII up to 128 (0x80).
ascii() {
  # Emit one round-trip (|0) single-byte mapping for every value from
  # 0 through 128 (0x80) inclusive: U+00NN <-> \xNN.
  # A plain arithmetic loop is used instead of seq(1), which is not a
  # POSIX utility.
  i=0
  while [ "$i" -le 128 ]
  do
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}


# Map 0x[A1-DF] to U+FF61 to U+FF9F
half_width_kana() {
  # Emit one round-trip (|0) mapping per byte 0xA1..0xDF.
  # The code point is the byte value shifted by (0xFF61 - 0xA1).
  i=$((0xA1))
  while [ "$i" -le $((0xDF)) ]
  do
    # 65377 = 0xFF61, 161 = 0xA1
    printf '<U%04X> \\x%02X |0\n' $((i + 65377 - 161)) "$i"
    i=$((i + 1))
  done
}


# From http://encoding.spec.whatwg.org/#index-shift_jis-pointer
# The index shift_jis pointer for code point is the return value of
# these steps for the round-trip code points (tag = 0)
#
#   Let index be index jis0208 excluding all pointers in the range 8272 to 8835.
#   Return the index pointer for code point in index.
# For index ($1) outside the above range, it's for decoding only and tag
# is set to '3'.
# Besides, there are 24 more characters with multiple SJIS representations.
# Only the first of multiple is tagged with '0' (bi-directional mapping)
# while the rest is tagged with '3'.

# Convert every data line of index-jis0208.txt (read from the current
# directory) into a two-byte .ucm mapping line.
#   $1 of each input line: decimal pointer
#   $2 of each input line: code point written as 0xNNNN
# The awk program uses decimal constants only: hexadecimal literals in awk
# source text are not defined by POSIX and are not accepted by every awk
# implementation. The lead byte is computed with an explicit int() instead
# of relying on printf to truncate a floating-point quotient.
jis208() {
  awk '!/^#/ && !/^$/ {
         pointer = $1 + 0
         lead = int(pointer / 188)
         # 31 = 0x1F, 129 = 0x81, 193 = 0xC1
         lead_offset = (lead < 31) ? 129 : 193
         trail = pointer % 188
         # 63 = 0x3F, 64 = 0x40, 65 = 0x41
         trail_offset = (trail < 63) ? 64 : 65
         # Pointers 8272..8835 are excluded from the encoder index.
         is_in_range = (pointer < 8272 || pointer > 8835)
         # Only the first encodable occurrence of a code point round-trips.
         tag = (is_in_range && !($2 in has_seen)) ? 0 : 3
         printf("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),
                lead + lead_offset, trail + trail_offset, tag)
         if (is_in_range) has_seen[$2] = 1
       }' index-jis0208.txt
}

# EUDC (End User Defined Characters) is for decoding only
# (use '|3' to denote that).
# See http://encoding.spec.whatwg.org/#shift_jis-decoder - step 5
# This function is called twice with {0x40, 0x7E, 0x40} and {0x80, 0xFC, 0x41}
# to implement it.
#
# Arguments:
#   $1  first trail byte of the range (decimal or 0x-prefixed hex)
#   $2  last trail byte of the range, inclusive
#   $3  trail offset subtracted from the byte when computing the pointer
# Note: POSIX sh has no local variables, so the loop variables are global.
eudc() {
  # Normalise the arguments to decimal so the loops below do not depend on
  # any external tool accepting hexadecimal operands.
  first=$(($1))
  last=$(($2))
  offset=$(($3))

  # The upper bound for the lead byte is 0xF8 because each lead can
  # have 188 characters and the total # of characters in the EUDC
  # is 1692 = 188 * (0xF9 - 0xF0) = 10528 - 8836 (see Shift_JIS decoder
  # step 3.5 in the encoding spec.)
  lead=$((0xF0))
  while [ "$lead" -le $((0xF8)) ]
  do
    byte=$first
    while [ "$byte" -le "$last" ]
    do
      pointer=$(( (lead - 0xC1) * 188 + byte - offset ))
      unicode=$(( pointer - 8836 + 0xE000 ))
      printf '<U%4X> \\x%02X\\x%02X |3\n' "$unicode" "$lead" "$byte"
      byte=$((byte + 1))
    done
    lead=$((lead + 1))
  done
}

# Print every mapping line, in no particular order and possibly with
# duplicates; the caller sorts and de-duplicates the result.
unsorted_table() {
  ascii
  half_width_kana
  jis208
  eudc "0x40" "0x7E" "0x40"
  eudc "0x80" "0xFC" "0x41"
  # Fallback (|1) mappings. printf '%s\n' is used rather than echo because
  # echo's treatment of backslashes differs between shells.
  printf '%s\n' '<U00A5> \x5C |1'
  printf '%s\n' '<U203E> \x7E |1'
}

# Fail early instead of silently writing a table without the JIS X 0208 part.
if [ ! -r index-jis0208.txt ]; then
  echo "error: index-jis0208.txt not found in the current directory" >&2
  exit 1
fi

preamble
# LC_ALL=C makes the ordering byte-wise and therefore identical on every
# machine; -u drops exact duplicate lines.
unsorted_table | LC_ALL=C sort -u
echo 'END CHARMAP'