1# Copyright (c) 2002-2010  International Business Machines Corporation and
2# others. All Rights Reserved.
3#
4#  file:  line.txt
5#
6#         Line Breaking Rules
7#         Implement default line breaking as defined by 
8#         Unicode Standard Annex #14 Revision 24 for Unicode 6.0
9#         http://www.unicode.org/reports/tr14/
10#
11#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
12#         This is only because of a limitation of ICU break engine implementation,
13#         not because the older behavior is desirable.
14
15#
16#  Character Classes defined by TR 14.
17#
18
19!!chain;                 # Option: turn on rule chaining for this rule set.
20!!LBCMNoChain;           # Option: NOTE(review) presumably what disables chaining on $CM (see the AL_FOLLOW comment) - confirm.
21
22
23!!lookAheadHardBreak;    # Option: described in the comment block that follows.
24#
25#  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
26#                          and only used for the line break rules.
27#
28#           It is used in the implementation of rule LB 10
29#           which says to treat any combining mark that is not attached to a base
30#           character as if it were of class AL  (alphabetic).
31#
32#           The problem occurs in the reverse rules.
33#
34#           Consider a sequence like, with correct breaks as shown
35#               LF  ID  CM  AL  AL
36#                  ^       ^       ^
37#           Then consider the sequence without the initial ID (ideographic)
38#                 LF  CM  AL  AL
39#                    ^           ^
40#           Our CM, which in the first example was attached to the ideograph,
41#           is now unattached, becomes an alpha, and joins in with the other
42#           alphas.
43#
44#           When iterating forwards, these sequences do not present any problems
45#           When iterating backwards, we need to look ahead when encountering
46#           a CM to see whether it attaches to something further on or not.
47#           (Look-ahead in a reverse rule is looking towards the start)
48#
49#           If the CM is unattached, we need to force a break.
50#
51#           !!lookAheadHardBreak forces the run time state machine to
52#           stop immediately when a look ahead rule ( '/' operator) matches,
53#           and set the match position to that of the look-ahead operator,
54#           no matter what other rules may be in play at the time.
55#
56#           See rule LB 19 for an example.
57#
58
59$AI = [:LineBreak =  Ambiguous:];            # folded into $ALPlus (rule LB1)
60$AL = [:LineBreak =  Alphabetic:];
61$BA = [:LineBreak =  Break_After:];
62$BB = [:LineBreak =  Break_Before:];
63$BK = [:LineBreak =  Mandatory_Break:];
64$B2 = [:LineBreak =  Break_Both:];
65$CB = [:LineBreak =  Contingent_Break:];
66$CL = [:LineBreak =  Close_Punctuation:];
67$CM = [:LineBreak =  Combining_Mark:];
68$CP = [:LineBreak =  Close_Parenthesis:];
69$CR = [:LineBreak =  Carriage_Return:];
70$EX = [:LineBreak =  Exclamation:];
71$GL = [:LineBreak =  Glue:];
72$HY = [:LineBreak =  Hyphen:];
73$H2 = [:LineBreak =  H2:];                   # used by the Korean syllable rules (LB 26, 27)
74$H3 = [:LineBreak =  H3:];                   # used by the Korean syllable rules (LB 26, 27)
75$ID = [:LineBreak =  Ideographic:];
76$IN = [:LineBreak =  Inseperable:];          # NOTE(review): "Inseperable" is misspelled; presumably accepted as an alias of Inseparable - confirm before renaming.
77$IS = [:LineBreak =  Infix_Numeric:];
78$JL = [:LineBreak =  JL:];                   # used by the Korean syllable rules (LB 26, 27)
79$JV = [:LineBreak =  JV:];                   # used by the Korean syllable rules (LB 26, 27)
80$JT = [:LineBreak =  JT:];                   # used by the Korean syllable rules (LB 26, 27)
81$LF = [:LineBreak =  Line_Feed:];
82$NL = [:LineBreak =  Next_Line:];
83$NS = [:LineBreak =  Nonstarter:];
84$NU = [:LineBreak =  Numeric:];
85$OP = [:LineBreak =  Open_Punctuation:];
86$PO = [:LineBreak =  Postfix_Numeric:];
87$PR = [:LineBreak =  Prefix_Numeric:];
88$QU = [:LineBreak =  Quotation:];
89$SA = [:LineBreak =  Complex_Context:];      # folded into $ALPlus (rule LB1); same set as $dictionary
90$SG = [:LineBreak =  Surrogate:];            # folded into $ALPlus (rule LB1)
91$SP = [:LineBreak =  Space:];
92$SY = [:LineBreak =  Break_Symbols:];
93$WJ = [:LineBreak =  Word_Joiner:];
94$XX = [:LineBreak =  Unknown:];              # folded into $ALPlus (rule LB1)
95$ZW = [:LineBreak =  ZWSpace:];
96
97#   Dictionary character set, for triggering language-based break engines. Currently
98#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
99#   5.0 or later as the definition of Complex_Context was corrected to include all
100#   characters requiring dictionary break.
101
102$dictionary = [:LineBreak = Complex_Context:];    # same property value as $SA
103
104#
105#  Rule LB1.  By default, treat AI  (characters with ambiguous East Asian width),
106#                               SA  (South East Asian: Thai, Lao, Khmer)
107#                               SG  (Unpaired Surrogates)
108#                               XX  (Unknown, unassigned)
109#                         as $AL  (Alphabetic)
110#  The rules below use $ALPlus wherever UAX 14 says AL.
111$ALPlus = [$AL $AI $SA $SG $XX];
112
113#
114#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB 9.
115#  Each $XXcm below is class XX followed by zero or more combining marks.
116$ALcm = $ALPlus $CM*;      # built on $ALPlus, not bare $AL
117$BAcm = $BA $CM*;
118$BBcm = $BB $CM*;
119$B2cm = $B2 $CM*;
120$CLcm = $CL $CM*;
121$CPcm = $CP $CM*;
122$EXcm = $EX $CM*;
123$GLcm = $GL $CM*;
124$HYcm = $HY $CM*;
125$H2cm = $H2 $CM*;
126$H3cm = $H3 $CM*;
127$IDcm = $ID $CM*;
128$INcm = $IN $CM*;
129$IScm = $IS $CM*;
130$JLcm = $JL $CM*;
131$JVcm = $JV $CM*;
132$JTcm = $JT $CM*;
133$NScm = $NS $CM*;
134$NUcm = $NU $CM*;
135$OPcm = $OP $CM*;
136$POcm = $PO $CM*;
137$PRcm = $PR $CM*;
138$QUcm = $QU $CM*;
139$SYcm = $SY $CM*;
140$WJcm = $WJ $CM*;
141
142## -------------------------------------------------
143
144!!forward;                 # Start of the forward rule section.
145
146#
147#  Each class of character can stand by itself as an unbroken token, with trailing combining marks.
148#  One rule per class that has an $XXcm definition; each requires at least one $CM.
149$ALPlus $CM+;
150$BA $CM+;
151$BB $CM+;
152$B2 $CM+;
153$CL $CM+;
154$CP $CM+;
155$EX $CM+;
156$GL $CM+;
157$HY $CM+;
158$H2 $CM+;
159$H3 $CM+;
160$ID $CM+;
161$IN $CM+;
162$IS $CM+;
163$JL $CM+;
164$JV $CM+;
165$JT $CM+;
166$NS $CM+;
167$NU $CM+;
168$OP $CM+;
169$PO $CM+;
170$PR $CM+;
171$QU $CM+;
172$SY $CM+;
173$WJ $CM+;
174
175#
176# CAN_CM  is the set of characters that may combine with CM combining chars.
177#         Note that Linebreak UAX 14's concept of a combining char and the rules
178#         for what they can combine with are _very_ different from the rest of Unicode.
179#         $CANT_CM is the exact complement of $CAN_CM.
180#         Note that $CM itself is left out of this set.  If CM is needed as a base
181#         it must be listed separately in the rule.
182#
183$CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
184$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs
185
186#
187# AL_FOLLOW  set of chars that can unconditionally follow an AL
188#            Needed in rules where stand-alone $CM s are treated as AL.
189#            Chaining is disabled with CM because it causes other failures,
190#            so for this one case we need to manually list out longer sequences.
191#
192$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];                                           # followers that are all in $CANT_CM
193$AL_FOLLOW_CM   = [$CL $CP $EX $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $ALPlus];   # followers that are all in $CAN_CM
194$AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];                                     # union of the two sets above
195
196
197#
198#  Rule LB 4, 5    Mandatory (Hard) breaks.
199#
200$LB4Breaks    = [$BK $CR $LF $NL];
201$LB4NonBreaks = [^$BK $CR $LF $NL];
202$CR $LF {100};                       # CR LF stays together; {100} tags the match with rule status 100.
203
204#
205#  LB 6    Do not break before hard line breaks.
206#
207$LB4NonBreaks?  $LB4Breaks {100};    # LB 6  do not break before hard breaks.
208$CAN_CM $CM*    $LB4Breaks {100};
209$CM+            $LB4Breaks {100};
210
211# LB 7         x SP
212#              x ZW
213$LB4NonBreaks [$SP $ZW];
214$CAN_CM $CM*  [$SP $ZW];
215$CM+          [$SP $ZW];
216
217#
218# LB 8         Break after zero width space
219#              TODO:  ZW SP* <break>
220#              An engine change is required to write the reverse rule for this.
221#              For now, leave the Unicode 5.2 rule, ZW <break>
222#
223$LB8Breaks    = [$LB4Breaks $ZW];               # hard breaks plus ZW
224$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];      # everything else
225
226
227# LB 9, 10 Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL $ZW
228#                                $CM not covered by the above needs to behave like $AL   
229#                                See definition of $CAN_CM.
230
231$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
232$CM+;                           #  A run of CMs with no base in front of it.
233
234#
235# LB 11  Do not break before or after WORD JOINER & related characters.
236#         x WJ    (first three rules)      WJ x    (last two rules)
237$CAN_CM $CM*  $WJcm;
238$LB8NonBreaks $WJcm;
239$CM+          $WJcm;            # stand-alone CM run in front of WJ
240
241$WJcm $CANT_CM;
242$WJcm $CAN_CM $CM*;
243
244#
245# LB 12  Do not break after NBSP and related characters.
246#         GL  x
247#
248$GLcm $CAN_CM $CM*;
249$GLcm $CANT_CM;
250 
251#
252# LB 12a  Do not break before NBSP and related characters ...
253#            [^SP BA HY] x GL
254#
255[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
256$CM+ $GLcm;                                    # by rule 10, stand-alone CM behaves as AL
257
258
259
260#
261# LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
262#         Same three rule shapes for each of CL, CP, EX, IS, SY.
263$LB8NonBreaks $CL;
264$CAN_CM $CM*  $CL;
265$CM+          $CL;              # by rule 10, stand-alone CM behaves as AL
266
267$LB8NonBreaks $CP;
268$CAN_CM $CM*  $CP;
269$CM+          $CP;              # by rule 10, stand-alone CM behaves as AL
270
271$LB8NonBreaks $EX;
272$CAN_CM $CM*  $EX;
273$CM+          $EX;              # by rule 10, stand-alone CM behaves as AL
274
275$LB8NonBreaks $IS;
276$CAN_CM $CM*  $IS;
277$CM+          $IS;              # by rule 10, stand-alone CM behaves as AL
278
279$LB8NonBreaks $SY;
280$CAN_CM $CM*  $SY;
281$CM+          $SY;              # by rule 10, stand-alone CM behaves as AL
282
283
284#
285# LB 14  Do not break after OP, even after spaces
286#         OP SP* x
287$OPcm $SP* $CAN_CM $CM*;
288$OPcm $SP* $CANT_CM;
289
290$OPcm $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
291
292# LB 15   QU SP* x OP
293$QUcm $SP* $OPcm;
294
295# LB 16   (CL | CP) SP* x NS
296($CLcm | $CPcm) $SP* $NScm;
297
298# LB 17   B2 SP* x B2
299$B2cm $SP* $B2cm;
300
301#
302# LB 18  Break after spaces.
303#
304$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
305$LB18Breaks    = [$LB8Breaks $SP];
306
307
308# LB 19
309#         x QU
310$LB18NonBreaks $CM* $QUcm;
311$CM+                $QUcm;    # stand-alone CM run in front of QU
312
313#         QU  x
314$QUcm .?;
315$QUcm $LB18NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
316                              #  TODO:  I don't think this rule is needed.
317
318
319# LB 20
320#        <break>  $CB
321#        $CB   <break>
322
323$LB20NonBreaks = [$LB18NonBreaks - $CB];       #  excludes BK CR LF NL ZW SP and CB
324
325# LB 21        x   (BA | HY | NS)
326#           BB x
327#
328$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); 
329
330$BBcm [^$CB];                                  #  $BB  x
331$BBcm $LB20NonBreaks $CM*;
332
333# LB 22   (AL | ID | IN | NU) x IN
334$ALcm    $INcm;
335$CM+     $INcm;     #  by rule 10, any otherwise unattached CM behaves as AL
336$IDcm    $INcm;
337$INcm    $INcm;
338$NUcm    $INcm;
339
340
341# LB 23   ID x PO,  AL x NU,  NU x AL
342$IDcm  $POcm;
343$ALcm  $NUcm;       # includes $LB19
344$CM+   $NUcm;       # Rule 10, any otherwise unattached CM behaves as AL
345$NUcm  $ALcm;
346
347#
348# LB 24   PR x ID,  PR x AL,  PO x AL
349#
350$PRcm $IDcm;
351$PRcm $ALcm;
352$POcm $ALcm;
353
354#
355# LB 25   Numbers.
356#         One rule: optional PR/PO, optional OP/HY, NU, any run of NU/SY/IS, optional CL/CP, optional PR/PO.
357($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
358
359# LB 26  Do not break a Korean syllable
360#
361$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
362($JVcm | $H2cm) ($JVcm | $JTcm);
363($JTcm | $H3cm) $JTcm;
364
365# LB 27  Treat Korean syllable block the same as ID  (don't break it)
366($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
367($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
368$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
369
370
371# LB 28   Do not break between alphabetics
372#
373$ALcm $ALcm;
374$CM+ $ALcm;      # The $CM+ is from rule 10, an unattached CM is treated as AL
375
376# LB 29   IS x AL
377$IScm $ALcm;
378
379# LB 30   (AL | NU) x OP,  CP x (AL | NU)
380($ALcm | $NUcm) $OPcm;
381$CM+ $OPcm;         # The $CM+ is from rule 10, an unattached CM is treated as AL.          
382$CPcm ($ALcm | $NUcm);
383
384
385#
386#  Reverse Rules.
387#
388## -------------------------------------------------
389
390!!reverse;          # Start of the reverse rule section; sequences are written back to front.
391
392$CM+ $ALPlus;       # Mirror of the forward stand-alone tokens: CM run first, then its base.
393$CM+ $BA;
394$CM+ $BB;
395$CM+ $B2;
396$CM+ $CL;
397$CM+ $CP;
398$CM+ $EX;
399$CM+ $GL;
400$CM+ $HY;
401$CM+ $H2;
402$CM+ $H3;
403$CM+ $ID;
404$CM+ $IN;
405$CM+ $IS;
406$CM+ $JL;
407$CM+ $JV;
408$CM+ $JT;
409$CM+ $NS;
410$CM+ $NU;
411$CM+ $OP;
412$CM+ $PO;
413$CM+ $PR;
414$CM+ $QU;
415$CM+ $SY;
416$CM+ $WJ;
417$CM+;               # A CM run with no base.
418
419
420#
421#  Sequences of the form  (shown forwards)
422#      [CANT_CM]  <break>  [CM]  [whatever]
423#  The CM needs to behave as an AL
424#
425$AL_FOLLOW $CM+ / (                            # '/' is the look-ahead point; see !!lookAheadHardBreak above.
426          [$BK $CR $LF $NL $ZW {eof}] |
427          $SP+ $CM+ $SP |
428          $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));   # if LB 14 will match, need to suppress this break.
429                                               #  LB14 says    OP SP* x .        
430                                               #    becomes    OP SP* x AL
431                                               #    becomes    OP SP* x CM+ AL_FOLLOW
432                                               #
433                                               # Further note:  the $AL in [$AL {eof}] is only to work around
434                                               #                a rule compiler bug which complains about
435                                               #                empty sets otherwise.
436          
437#
438#  Sequences of the form  (shown forwards)
439#      [CANT_CM]  <break> [CM]  <break>  [PR]
440#  The CM needs to behave as an AL
441#  This rule is concerned about getting the second of the two <breaks> in place.
442#
443
444[$PR   ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
445
446
447
448# LB 4, 5, 6
449
450$LB4Breaks [$LB4NonBreaks-$CM];
451$LB4Breaks $CM+ $CAN_CM;
452$LF $CR;                           # CR LF, written in reverse
453
454
455# LB 7         x SP
456#              x ZW
457[$SP $ZW] [$LB4NonBreaks-$CM];
458[$SP $ZW] $CM+ $CAN_CM;
459
460# LB 8 ZW SP* <break>
461#     TODO: to implement this, we need more than one look-ahead hard break in play at a time.
462#           Requires an engine enhancement.
463#   / $SP* $ZW
464
465# LB 9,10  Combining marks.
466#    X   $CM needs to behave like X, where X is not $SP or controls.
467#    $CM not covered by the above needs to behave like $AL
468# Stick together any combining sequences that don't match other rules.
469$CM+ $CAN_CM;
470
471
472# LB 11    x WJ  (first two rules)   and   WJ x  (last two rules), written in reverse
473$CM* $WJ $CM* $CAN_CM;
474$CM* $WJ      [$LB8NonBreaks-$CM];
475
476     $CANT_CM $CM* $WJ;
477$CM* $CAN_CM  $CM* $WJ;
478
479# LB 12a
480#      [^SP BA HY] x GL
481#
482$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
483
484# LB 12
485#     GL  x
486#
487$CANT_CM $CM* $GL;
488$CM* $CAN_CM $CM* $GL;
489
490
491# LB 13    x (CL | CP | EX | IS | SY), written in reverse
492$CL $CM+ $CAN_CM;
493$CP $CM+ $CAN_CM;
494$EX $CM+ $CAN_CM;
495$IS $CM+ $CAN_CM;
496$SY $CM+ $CAN_CM;
497
498$CL [$LB8NonBreaks-$CM];
499$CP [$LB8NonBreaks-$CM];
500$EX [$LB8NonBreaks-$CM];
501$IS [$LB8NonBreaks-$CM];
502$SY [$LB8NonBreaks-$CM];
503
504# Rule 13 & 14 taken together for an edge case.
505#   Match this, shown forward
506#     OP SP+  ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
507#   This really wants to chain at the $CM+ (which is acting as an $AL)
508#   except for $CM chaining being disabled.
509[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;  
510
511# LB 14    OP SP* x
512#
513$CM* $CAN_CM    $SP* $CM* $OP;
514     $CANT_CM   $SP* $CM* $OP;
515$AL_FOLLOW? $CM+  $SP $SP* $CM* $OP;     #  by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
516     
517     $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
518$CM* $AL_FOLLOW_CM   $CM+ $SP+ $CM* $OP;
519$SY $CM $SP+ $OP;   # TODO:  Experiment.  Remove.
520
521
522
523# LB 15    QU SP* x OP, written in reverse
524$CM* $OP $SP* $CM* $QU;
525
526# LB 16    (CL | CP) SP* x NS, written in reverse
527$CM* $NS $SP* $CM* ($CL | $CP);
528
529# LB 17    B2 SP* x B2, written in reverse
530$CM* $B2 $SP* $CM* $B2;
531
532# LB 18  break after spaces
533#        Nothing explicit needed here.
534
535
536#
537# LB 19
538#
539$CM* $QU $CM* $CAN_CM;                                #   . x QU
540$CM* $QU      $LB18NonBreaks;
541
542
543$CM* $CAN_CM  $CM* $QU;                               #   QU x .
544     $CANT_CM $CM* $QU;
545     
546#
547#  LB 20  Break before and after CB.
548#         nothing needed here.
549#
550
551# LB 21
552$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM];     #  . x (BA | HY | NS)
553
554$CM* [$LB20NonBreaks-$CM] $CM* $BB;                   #  BB x .
555[^$CB] $CM* $BB;                                      #  BB x [^CB]  (mirrors the forward rule  $BBcm [^$CB])
556
557
558
559# LB 22   (AL | ID | IN | NU) x IN, written in reverse
560$CM* $IN $CM* $ALPlus;
561$CM* $IN $CM* $ID;
562$CM* $IN $CM* $IN;
563$CM* $IN $CM* $NU;
564
565# LB 23   ID x PO,  AL x NU,  NU x AL, written in reverse
566$CM* $PO $CM* $ID;
567$CM* $NU $CM* $ALPlus;
568$CM* $ALPlus $CM* $NU;
569
570# LB 24   PR x ID,  PR x AL,  PO x AL, written in reverse
571$CM* $ID $CM* $PR;
572$CM* $ALPlus $CM* $PR;
573$CM* $ALPlus $CM* $PO;
574
575
576# LB 25   Numbers; the forward rule with its elements in the opposite order.
577($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
578
579# LB 26   Korean syllables, written in reverse
580$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
581$CM* ($JT | $JV) $CM* ($H2 | $JV);
582$CM* $JT $CM* ($H3 | $JT);
583
584# LB 27   Korean syllable blocks with IN, PO, PR, written in reverse
585$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
586$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
587$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
588
589# LB 28   AL x AL
590$CM* $ALPlus $CM* $ALPlus;
591
592
593# LB 29   IS x AL, written in reverse
594$CM* $ALPlus $CM* $IS;
595
596# LB 30   (AL | NU) x OP,  CP x (AL | NU), written in reverse
597$CM* $OP $CM* ($ALPlus | $NU);
598$CM* ($ALPlus | $NU) $CM* $CP;
599
600
601## -------------------------------------------------
602
603!!safe_reverse;     # Start of the safe-reverse rule section.
604
605# LB 9    CM run followed (in reverse) by a base that can take CMs
606$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
607$CM+ $SP / .;
608
609# LB 14   OP SP* x, written in reverse
610$SP+ $CM* $OP;
611
612# LB 15   QU SP* x OP, written in reverse
613$SP+ $CM* $QU;
614
615# LB 16   (CL | CP) SP* x NS, written in reverse
616$SP+ $CM* ($CL | $CP);
617
618# LB 17   B2 SP* x B2, written in reverse
619$SP+ $CM* $B2;
620
621# LB 25   Numbers
622($CM* ($IS | $SY))+ $CM* $NU;
623($CL | $CP) $CM* ($NU | $IS | $SY);
624
625# For dictionary-based break
626$dictionary $dictionary;
627
628## -------------------------------------------------
629
630!!safe_forward;     # Start of the safe-forward rule section.
631
632# Skip forward over all character classes that are involved in
633#   rules containing patterns with possibly more than one char
634#   of context.
635#
636#  It might be slightly more efficient to have specific rules
637#  instead of one generic one, but only if we could
638#  turn off rule chaining.  We don't want to move more
639#  than necessary.
640#
641[$CM $OP $QU $CL $CP $B2 $PR $HY $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $dictionary];
642$dictionary $dictionary;     # For dictionary-based break (same rule as in !!safe_reverse)
643
644