hb-ot-shape-normalize.cc revision c311d852080b50ffc85e80168de62abb05a6be59
1/* 2 * Copyright © 2011 Google, Inc. 3 * 4 * This is part of HarfBuzz, a text shaping library. 5 * 6 * Permission is hereby granted, without written agreement and without 7 * license or royalty fees, to use, copy, modify, and distribute this 8 * software and its documentation for any purpose, provided that the 9 * above copyright notice and the following two paragraphs appear in 10 * all copies of this software. 11 * 12 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR 13 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 14 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN 15 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 16 * DAMAGE. 17 * 18 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, 19 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 20 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS 21 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO 22 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 23 * 24 * Google Author(s): Behdad Esfahbod 25 */ 26 27#include "hb-ot-shape-private.hh" 28#include "hb-ot-shape-complex-private.hh" 29 30HB_BEGIN_DECLS 31 32/* 33 * HIGHLEVEL DESIGN: 34 * 35 * This file exports one main function: _hb_ot_shape_normalize(). 36 * 37 * This function closely reflects the Unicode Normalization Algorithm, 38 * yet it's different. The shaper an either prefer decomposed (NFD) or 39 * composed (NFC). 40 * 41 * In general what happens is that: each grapheme is decomposed in a chain 42 * of 1:2 decompositions, marks reordered, and then recomposed if desires, 43 * so far it's like Unicode Normalization. However, the decomposition and 44 * recomposition only happens if the font supports the resulting characters. 45 * 46 * The goals are: 47 * 48 * - Try to render all canonically equivalent strings similarly. To really 49 * achieve this we have to always do the full decomposition and then 50 * selectively recompose from there. It's kinda too expensive though, so 51 * we skip some cases. For example, if composed is desired, we simply 52 * don't touch 1-character clusters that are supported by the font, even 53 * though their NFC may be different. 54 * 55 * - When a font has a precomposed character for a sequence but the 'ccmp' 56 * feature in the font is not adequate, form use the precomposed character 57 * which typically has better mark positioning. 58 * 59 * - When a font does not support a character but supports its decomposition, 60 * well, use the decomposition. 61 * 62 * - The Indic shaper requests decomposed output. This will handle splitting 63 * matra for the Indic shaper. 64 */ 65 66static void 67output_glyph (hb_ot_shape_context_t *c, 68 hb_codepoint_t glyph) 69{ 70 hb_buffer_t *buffer = c->buffer; 71 72 buffer->output_glyph (glyph); 73 hb_glyph_info_set_unicode_props (&buffer->out_info[buffer->out_len - 1], buffer->unicode); 74} 75 76static bool 77decompose (hb_ot_shape_context_t *c, 78 bool shortest, 79 hb_codepoint_t ab) 80{ 81 hb_codepoint_t a, b, glyph; 82 83 if (!hb_unicode_decompose (c->buffer->unicode, ab, &a, &b) || 84 (b && !hb_font_get_glyph (c->font, b, 0, &glyph))) 85 return FALSE; 86 87 bool has_a = hb_font_get_glyph (c->font, a, 0, &glyph); 88 if (shortest && has_a) { 89 /* Output a and b */ 90 output_glyph (c, a); 91 if (b) 92 output_glyph (c, b); 93 return TRUE; 94 } 95 96 if (decompose (c, shortest, a)) { 97 if (b) 98 output_glyph (c, b); 99 return TRUE; 100 } 101 102 if (has_a) { 103 output_glyph (c, a); 104 if (b) 105 output_glyph (c, b); 106 return TRUE; 107 } 108 109 return FALSE; 110} 111 112static void 113decompose_current_glyph (hb_ot_shape_context_t *c, 114 bool shortest) 115{ 116 if (decompose (c, shortest, c->buffer->info[c->buffer->idx].codepoint)) 117 c->buffer->skip_glyph (); 118 else 119 c->buffer->next_glyph (); 120} 121 122static void 123decompose_single_char_cluster (hb_ot_shape_context_t *c, 124 bool will_recompose) 125{ 126 hb_codepoint_t glyph; 127 128 /* If recomposing and font supports this, we're good to go */ 129 if (will_recompose && hb_font_get_glyph (c->font, c->buffer->info[c->buffer->idx].codepoint, 0, &glyph)) { 130 c->buffer->next_glyph (); 131 return; 132 } 133 134 decompose_current_glyph (c, will_recompose); 135} 136 137static void 138decompose_multi_char_cluster (hb_ot_shape_context_t *c, 139 unsigned int end) 140{ 141 /* TODO Currently if there's a variation-selector we give-up, it's just too hard. */ 142 for (unsigned int i = c->buffer->idx; i < end; i++) 143 if (unlikely (is_variation_selector (c->buffer->info[i].codepoint))) 144 return; 145 146 while (c->buffer->idx < end) 147 decompose_current_glyph (c, FALSE); 148} 149 150void 151_hb_ot_shape_normalize (hb_ot_shape_context_t *c) 152{ 153 hb_buffer_t *buffer = c->buffer; 154 bool recompose = !hb_ot_shape_complex_prefer_decomposed (c->plan->shaper); 155 bool has_multichar_clusters = FALSE; 156 unsigned int count; 157 158 /* We do a fairly straightforward yet custom normalization process in three 159 * separate rounds: decompose, reorder, recompose (if desired). Currently 160 * this makes two buffer swaps. We can make it faster by moving the last 161 * two rounds into the inner loop for the first round, but it's more readable 162 * this way. */ 163 164 165 /* First round, decompose */ 166 167 buffer->clear_output (); 168 count = buffer->len; 169 for (buffer->idx = 0; buffer->idx < count;) 170 { 171 unsigned int end; 172 for (end = buffer->idx + 1; end < count; end++) 173 if (buffer->info[buffer->idx].cluster != buffer->info[end].cluster) 174 break; 175 176 if (buffer->idx + 1 == end) 177 decompose_single_char_cluster (c, recompose); 178 else { 179 decompose_multi_char_cluster (c, end); 180 has_multichar_clusters = TRUE; 181 } 182 } 183 buffer->swap_buffers (); 184 185 186 /* Technically speaking, two characters with ccc=0 may combine. But all 187 * those cases are in languages that the indic module handles (which expects 188 * decomposed), or in Hangul jamo, which again, we want decomposed anyway. 189 * So we don't bother combining across cluster boundaries. 190 * 191 * TODO: Am I right about Hangul? If I am, we should add a Hangul module 192 * that requests decomposed. */ 193 194 if (!has_multichar_clusters) 195 return; /* Done! */ 196 197 198 /* Second round, reorder (inplace) */ 199 200 count = buffer->len; 201 for (unsigned int i = 0; i < count; i++) 202 { 203 if (buffer->info[i].combining_class() == 0) 204 continue; 205 206 unsigned int end; 207 for (end = i + 1; end < count; end++) 208 if (buffer->info[end].combining_class() == 0) 209 break; 210 211 /* We are going to do a bubble-sort. Only do this if the 212 * sequence is short. Doing it on long sequences can result 213 * in an O(n^2) DoS. */ 214 if (end - i > 10) { 215 i = end; 216 continue; 217 } 218 219 unsigned int k = end - i - 1; 220 do { 221 hb_glyph_info_t *pinfo = buffer->info + i; 222 unsigned int new_k = 0; 223 224 for (unsigned int j = 0; j < k; j++) 225 if (pinfo[j].combining_class() > pinfo[j+1].combining_class()) { 226 hb_glyph_info_t t; 227 t = pinfo[j]; 228 pinfo[j] = pinfo[j + 1]; 229 pinfo[j + 1] = t; 230 231 new_k = j; 232 } 233 k = new_k; 234 } while (k); 235 236 i = end; 237 } 238 239 240 if (!recompose) 241 return; 242 243 /* Third round, recompose */ 244 245 /* As noted in the comment earlier, we don't try to combine 246 * ccc=0 chars with their previous Starter. */ 247 248 buffer->clear_output (); 249 count = buffer->len; 250 unsigned int starter = 0; 251 buffer->next_glyph (); 252 while (buffer->idx < count) 253 { 254 if (buffer->info[buffer->idx].combining_class() == 0) { 255 starter = buffer->out_len; 256 buffer->next_glyph (); 257 continue; 258 } 259 260 hb_codepoint_t composed, glyph; 261 if ((buffer->out_info[buffer->out_len - 1].combining_class() >= 262 buffer->info[buffer->idx].combining_class()) || 263 !hb_unicode_compose (c->buffer->unicode, 264 buffer->out_info[starter].codepoint, 265 buffer->info[buffer->idx].codepoint, 266 &composed) || 267 !hb_font_get_glyph (c->font, composed, 0, &glyph)) 268 { 269 /* Blocked, or doesn't compose. */ 270 buffer->next_glyph (); 271 continue; 272 } 273 274 /* Composes. Modify starter and carry on. */ 275 buffer->out_info[starter].codepoint = composed; 276 hb_glyph_info_set_unicode_props (&buffer->out_info[starter], buffer->unicode); 277 278 buffer->skip_glyph (); 279 } 280 buffer->swap_buffers (); 281 282} 283 284HB_END_DECLS 285