#!/bin/sh
# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# References:
#   http://encoding.spec.whatwg.org/#shift_jis

# Download the following file into the source/data/mappings directory,
# run this script from that directory and save its standard output to
# shift_jis-html5.ucm
#   http://encoding.spec.whatwg.org/index-jis0208.txt

# Functions are declared with the POSIX "name() { ... }" form and loops use
# shell arithmetic instead of seq(1), so the script runs under any /bin/sh
# (dash, ash, bash, ...), matching the shebang above.

# Print the fixed .ucm header (license banner, converter attributes and the
# MBCS state table) followed by the line that opens the CHARMAP section.
# The here-document delimiter is quoted so that the template is emitted
# verbatim, with no parameter or backslash processing by the shell.
preamble() {
cat <<'PREAMBLE'
# ***************************************************************************
# *
# * Copyright (C) 1995-2014, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# * Generated per the algorithm for Shift_JIS
# * described at http://encoding.spec.whatwg.org/#shift_jis
# *
# ***************************************************************************
<code_set_name> "shift_jis-html5"
<char_name_mask> "AXXXX"
<mb_cur_max> 2
<mb_cur_min> 1
<uconv_class> "MBCS"
<subchar> \xFC\xFC
<subchar1> \x7F
<icu:charsetFamily> "ASCII"

<icu:state> 0-80, 81-9f:1, a1-df, e0-fc:1
<icu:state> 40-7e, 80-fc

CHARMAP
PREAMBLE
}

# The encoding spec for Shift_JIS says U+0080 has to be round-tripped with
# 0x80. So, this is one character more than ASCII up to 128 (0x80).
# Emits one round-trip ('|0') mapping per byte for 0x00 through 0x80.
ascii() {
  i=0
  while [ "$i" -le 128 ]
  do
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}


# Map 0x[A1-DF] to U+FF61 to U+FF9F
# Emits one round-trip ('|0') mapping per single byte; the code point is the
# byte value shifted by the constant distance 0xFF61 - 0xA1.
half_width_kana() {
  byte=$((0xA1))
  while [ "$byte" -le $((0xDF)) ]
  do
    printf '<U%04X> \\x%02X |0\n' $((byte + 0xFF61 - 0xA1)) "$byte"
    byte=$((byte + 1))
  done
}


# From http://encoding.spec.whatwg.org/#index-shift_jis-pointer
# The index shift_jis pointer for code point is the return value of
# these steps for the round-trip code points (tag = 0)
#
#   Let index be index jis0208 excluding all pointers in the range 8272 to 8835.
#   Return the index pointer for code point in index.
# Pointers ($1 in the awk program below) in the range 8272 to 8835 are
# excluded from the index used for encoding, so their mappings are for
# decoding only and the tag is set to '3'.
# Besides, there are 24 more characters with multiple SJIS representations.
# Only the first of multiple is tagged with '0' (bi-directional mapping)
# while the rest is tagged with '3'.

# Convert every data line of index-jis0208.txt (read from the current
# directory) into a two-byte mapping line:
#   $1 = pointer (decimal), $2 = code point written as 0xXXXX.
# Comment lines (starting with '#') and empty lines are skipped.
# All constants inside the awk program are decimal: hexadecimal constants in
# awk program text are a gawk extension and other awk implementations may
# silently misread them. int() makes the integer division explicit instead of
# relying on printf %X truncating a fractional value.
jis208() {
  awk '
    !/^#/ && !/^$/ {
      pointer = $1
      lead = int(pointer / 188)
      lead_offset = (lead < 31) ? 129 : 193     # lead < 0x1F ? 0x81 : 0xC1
      trail = pointer % 188
      trail_offset = (trail < 63) ? 64 : 65     # trail < 0x3F ? 0x40 : 0x41
      # Pointers 8272..8835 never take part in encoding.
      round_trippable = (pointer < 8272 || pointer > 8835)
      # Only the first round-trippable occurrence of a code point gets tag 0.
      tag = (round_trippable && !($2 in seen)) ? 0 : 3
      printf("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),
             lead + lead_offset, trail + trail_offset, tag)
      if (round_trippable) seen[$2] = 1
    }
  ' index-jis0208.txt
}

# EUDC (End User Defined Characters) is for decoding only
# (use '|3' to denote that).
# See http://encoding.spec.whatwg.org/#shift_jis-decoder - step 5
# This function is called twice with {0x40, 0x7E, 0x40} and {0x80, 0xFC, 0x41}
# to implement it.
#
# Arguments (decimal or 0x-prefixed hexadecimal):
#   $1  first trail byte of the range
#   $2  last trail byte of the range (inclusive)
#   $3  trail offset subtracted from the trail byte to form the pointer
eudc() {
  first=$(($1))
  last=$(($2))
  offset=$(($3))
  # The upper bound for the lead byte is 0xF8 because each lead can
  # have 188 characters and the total # of characters in the EUDC
  # is 1692 = 188 * (0xF9 - 0xF0) = 10528 - 8836 (see Shift_JIS decoder
  # step 3.5 in the encoding spec.)
  lead=$((0xF0))
  while [ "$lead" -le $((0xF8)) ]
  do
    byte=$first
    while [ "$byte" -le "$last" ]
    do
      # 0xC1 is the lead offset that applies to these lead bytes.
      pointer=$(( (lead - 0xC1) * 188 + byte - offset ))
      unicode=$(( pointer - 8836 + 0xE000 ))
      printf '<U%04X> \\x%02X\\x%02X |3\n' "$unicode" "$lead" "$byte"
      byte=$((byte + 1))
    done
    lead=$((lead + 1))
  done
}

# Emit every mapping line, in no particular order; the caller sorts them.
# The last two lines are tagged '|1' (in the .ucm format: a fallback used only
# when converting from Unicode), so U+00A5 and U+203E are written as 0x5C and
# 0x7E while those bytes still decode to U+005C and U+007E.
# printf '%s\n' is used rather than echo because the behavior of echo on
# strings containing backslashes differs between shells.
unsorted_table() {
  ascii
  half_width_kana
  jis208
  eudc 0x40 0x7E 0x40
  eudc 0x80 0xFC 0x41
  printf '%s\n' '<U00A5> \x5C |1'
  printf '%s\n' '<U203E> \x7E |1'
}

# Refuse to run without the index file: otherwise awk would only print an
# error and an incomplete table would be written with a zero exit status.
if [ ! -r index-jis0208.txt ]
then
  echo "$0: index-jis0208.txt not found or not readable in the current directory" >&2
  exit 1
fi

preamble
# LC_ALL=C makes the ordering byte-wise and therefore identical on every
# machine; -u drops duplicate lines (same effect as piping through uniq).
unsorted_table | LC_ALL=C sort -u
echo 'END CHARMAP'