#!/bin/sh
# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# References:
#   http://encoding.spec.whatwg.org/#shift_jis

# Download the following file, run this script in the source/data/mappings
# directory and save the result to shift_jis-html5.ucm
#   http://encoding.spec.whatwg.org/index-jis0208.txt

# Print the fixed header of the generated .ucm file on stdout: the copyright
# banner, the converter properties (code set name, byte-length limits,
# substitution characters, state table) and the opening CHARMAP line.
# Takes no arguments.
preamble() {
  # The delimiter is quoted so the text is emitted verbatim; the header
  # contains backslash sequences (\xFC\xFC, \x7F) that must reach the output
  # untouched.
  cat <<'PREAMBLE'
# ***************************************************************************
# *
# *   Copyright (C) 1995-2014, International Business Machines
# *   Corporation and others.  All Rights Reserved.
# *
# *   Generated per the algorithm for Shift_JIS
# *   described at http://encoding.spec.whatwg.org/#shift_jis
# *
# ***************************************************************************
<code_set_name>               "shift_jis-html5"
<char_name_mask>              "AXXXX"
<mb_cur_max>                  2
<mb_cur_min>                  1
<uconv_class>                 "MBCS"
<subchar>                     \xFC\xFC
<subchar1>                    \x7F
<icu:charsetFamily>           "ASCII"

<icu:state>                   0-80, 81-9f:1, a1-df, e0-fc:1
<icu:state>                   40-7e, 80-fc

CHARMAP
PREAMBLE
}

# The encoding spec for Shift_JIS says U+0080 has to be round-tripped with
# 0x80, so this emits one mapping more than plain ASCII: every byte from
# 0x00 to 0x80 (0 to 128 inclusive) mapped to the code point of the same
# value and tagged |0 (round-trip).
# Takes no arguments; writes one "<Uxxxx> \xNN |0" line per byte to stdout.
ascii() {
  i=0
  while [ "$i" -le 128 ]; do
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}


# Map 0x[A1-DF] to U+FF61 to U+FF9F.
# Takes no arguments; writes one round-trip (|0) mapping line per byte to
# stdout.
half_width_kana() {
  i=$((0xA1))
  while [ "$i" -le "$((0xDF))" ]; do
    # The code point sits at a constant distance (0xFF61 - 0xA1) from the
    # byte value.
    printf '<U%04X> \\x%02X |0\n' "$((i + 0xFF61 - 0xA1))" "$i"
    i=$((i + 1))
  done
}


# From http://encoding.spec.whatwg.org/#index-shift_jis-pointer
# The index shift_jis pointer for code point is the return value of
# these steps for the round-trip code points (tag = 0)
#
#   Let index be index jis0208 excluding all pointers in the range 8272 to 8835.
#   Return the index pointer for code point in index.
#
# A pointer inside that excluded range is for decoding only, so its tag is
# set to '3'.
# Besides, there are 24 more characters with multiple SJIS representations.
# Only the first of multiple is tagged with '0' (bi-directional mapping)
# while the rest is tagged with '3'.
#
# Arguments: $1 - optional path of the index file; defaults to
#                 index-jis0208.txt in the current directory.
# Outputs:   one "<Uxxxx> \xLL\xTT |tag" line per index entry on stdout.
jis208() {
  # Only decimal constants are used inside the awk program because hex
  # literals are not part of POSIX awk.
  awk '
    !/^#/ && !/^$/ {
      pointer = $1 + 0
      # Each lead byte covers 188 trail bytes.
      lead = int(pointer / 188)
      trail = pointer % 188
      # Lead bytes: 0x81 upward for rows below 0x1F, else 0xC1 upward.
      lead_offset = (lead < 31) ? 129 : 193
      # Trail bytes: 0x40 upward below 0x3F, else 0x41 upward (skips 0x7F).
      trail_offset = (trail < 63) ? 64 : 65
      encodable = (pointer < 8272 || pointer > 8835)
      # Round-trip only for the first encodable occurrence of a code point.
      tag = (encodable && !($2 in seen)) ? 0 : 3
      # $2 looks like 0x3000; drop the leading 0x.
      printf("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),
             lead + lead_offset, trail + trail_offset, tag)
      if (encodable) seen[$2] = 1
    }' "${1:-index-jis0208.txt}"
}

# EUDC (End User Defined Characters) is for decoding only
# (use '|3' to denote that).
# See http://encoding.spec.whatwg.org/#shift_jis-decoder - step 5
# This function is called twice with {0x40, 0x7E, 0x40} and {0x80, 0xFC, 0x41}
# to implement it.
#
# Arguments: $1 - first trail byte of the range (decimal or 0x-prefixed hex)
#            $2 - last trail byte of the range, inclusive
#            $3 - trail offset subtracted from the byte to get its position
#                 within the lead byte's row
# Outputs:   one "<Uxxxx> \xLL\xTT |3" line per (lead, trail) pair on stdout.
# Returns:   2 (with a usage message on stderr) unless exactly three
#            arguments are given.
eudc() {
  if [ "$#" -ne 3 ]; then
    printf 'usage: eudc FIRST_TRAIL LAST_TRAIL TRAIL_OFFSET\n' >&2
    return 2
  fi

  # Normalise the arguments once so hex input works in the comparisons below.
  first=$(($1))
  last=$(($2))
  offset=$(($3))

  # The upper bound for the lead byte is 0xF8 because each lead can
  # have 188 characters and the total # of characters in the EUDC
  # is 1692 = 188 * (0xF9 - 0xF0) = 10528 - 8836 (see Shift_JIS decoder
  # step 3.5 in the encoding spec.)
  lead=$((0xF0))
  while [ "$lead" -le "$((0xF8))" ]; do
    byte=$first
    while [ "$byte" -le "$last" ]; do
      pointer=$(( (lead - 0xC1) * 188 + byte - offset ))
      # Pointer 8836 corresponds to U+E000, the start of the private use area.
      printf '<U%4X> \\x%02X\\x%02X |3\n' \
        "$((pointer - 8836 + 0xE000))" "$lead" "$byte"
      byte=$((byte + 1))
    done
    lead=$((lead + 1))
  done
}

# Emit every mapping line of the table, in generation order (not yet sorted
# or de-duplicated): the single-byte range, half-width kana, the JIS X 0208
# index, both EUDC trail-byte ranges, and finally two extra mappings for
# U+00A5 and U+203E tagged |1.
# Takes no arguments; writes to stdout.
unsorted_table() {
  ascii
  half_width_kana
  jis208
  eudc "0x40" "0x7E" "0x40"
  eudc "0x80" "0xFC" "0x41"
  # printf '%s\n' rather than echo: some echo implementations interpret
  # backslash escapes, which would corrupt the literal \x5C and \x7E.
  printf '%s\n' '<U00A5> \x5C |1'
  printf '%s\n' '<U203E> \x7E |1'
}

# Assemble the complete .ucm file on stdout: header, then the mapping lines
# sorted with duplicates removed, then the closing marker.
preamble
# LC_ALL=C gives a byte-wise ordering, so the generated file does not depend
# on the locale of whoever runs the script.
unsorted_table | LC_ALL=C sort -u
printf '%s\n' 'END CHARMAP'
123