#
#   Copyright (C) 2002-2010, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  sent_el.txt
#
#   ICU Sentence Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX 29 Revision 16 for Unicode Version 6.0
#
#   NOTE(review): the "_el" suffix suggests this is the Greek-locale tailoring of the
#                 root sentence rules; the only visible tailoring is the extra
#                 characters added to $STerm below.  Confirm against the root rules.
#


#
# Character categories as defined in TR 29
#
$CR        = [\p{Sentence_Break = CR}];
$LF        = [\p{Sentence_Break = LF}];
$Extend    = [\p{Sentence_Break = Extend}];
$Sep       = [\p{Sentence_Break = Sep}];
$Format    = [\p{Sentence_Break = Format}];
$Sp        = [\p{Sentence_Break = Sp}];
$Lower     = [\p{Sentence_Break = Lower}];
$Upper     = [\p{Sentence_Break = Upper}];
$OLetter   = [\p{Sentence_Break = OLetter}];
$Numeric   = [\p{Sentence_Break = Numeric}];
$ATerm     = [\p{Sentence_Break = ATerm}];
$SContinue = [\p{Sentence_Break = SContinue}];
# In addition to the standard STerm set, U+003B (SEMICOLON) and U+037E (GREEK QUESTION MARK)
# are treated as sentence terminators.
$STerm     = [\p{Sentence_Break = STerm} [\u003B \u037E]];
$Close     = [\p{Sentence_Break = Close}];

#
# Define extended forms of the character classes,
#   incorporate trailing Extend or Format chars.
#   Rules 4 and 5.
#   Each $xxxEx matches one base character of class xxx followed by any number of
#   Extend or Format characters, so the rules below never split such a sequence.

$SpEx       = $Sp        ($Extend | $Format)*;
$LowerEx    = $Lower     ($Extend | $Format)*;
$UpperEx    = $Upper     ($Extend | $Format)*;
$OLetterEx  = $OLetter   ($Extend | $Format)*;
$NumericEx  = $Numeric   ($Extend | $Format)*;
$ATermEx    = $ATerm     ($Extend | $Format)*;
$SContinueEx= $SContinue ($Extend | $Format)*;
$STermEx    = $STerm     ($Extend | $Format)*;
$CloseEx    = $Close     ($Extend | $Format)*;


## -------------------------------------------------

!!chain;
!!forward;

# Rule 3 - break after separators.  Keep CR/LF together.
#
$CR $LF;


# Rule 4 - Break after $Sep.
# Rule 5 - Ignore $Format and $Extend
#          The leading character is optional, so a run of Extend/Format characters
#          with no base character in front of it is also matched as a unit.
#
[^$Sep $CR $LF]? ($Extend | $Format)*;


# Rule 6 - no break between ATerm and a following Numeric.
$ATermEx $NumericEx;

# Rule 7 - no break inside Upper ATerm Upper.
$UpperEx $ATermEx $UpperEx;

# Rule 8 - no break after ATerm (plus optional Close and Sp) when the next letter
#          reached, skipping over non-letter characters, is Lower.
$NotLettersEx = [^$OLetter $Upper $Lower $Sep $CR $LF $ATerm $STerm] ($Extend | $Format)*;
$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;

# Rule 8a - no break before SContinue, STerm or ATerm following a terminator sequence.
($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx);

# Rule 9, 10, 11 - a terminator keeps its trailing Close chars, spaces and at most one
#                  separator; the boundary falls after this whole sequence.
($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;

# Rule 12 - otherwise, do not break.
#   The second form matches text that runs into a separator or the end of input without
#   any terminator; {100} is the rule status value reported for a boundary found that way.
#   NOTE(review): 100 presumably corresponds to UBRK_SENTENCE_SEP in ubrk.h - confirm.
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};

## -------------------------------------------------

!!reverse;

# Reversed forms of the extended classes: when scanning backwards the trailing
# Extend / Format characters are encountered before the base character.
$SpEx_R       = ($Extend | $Format)* $Sp;
$ATermEx_R    = ($Extend | $Format)* $ATerm;
$STermEx_R    = ($Extend | $Format)* $STerm;
$CloseEx_R    = ($Extend | $Format)* $Close;

#
#  Reverse rules.
#     For now, use the old style inexact reverse rules, which are easier
#     to write, but less efficient.
#     TODO:  exact reverse rules.  It appears that exact reverse rules
#            may require improving support for look-ahead breaks in the
#            builder.  Needs more investigation.
#

[{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
#.*;

# Explanation for this rule:
#
#   It needs to back over
#       The $Sep at which we probably begin
#       All of the non $Sep chars leading to the preceding $Sep
#       The preceding $Sep, which will be the second one that the rule matches.
#       Any immediately preceding STerm or ATerm sequences.  We need to see these
#              to get the correct rule status when moving forwards again.
#
#   [{bof}]           inhibit rule chaining.  Without this, rule would loop on itself and match
#                     the entire string.
#
#   (.? | $LF $CR)    Match one $Sep instance.  Use .? rather than $Sep because position might be
#                     at the beginning of the string at this point, and we don't want to fail.
#                     Can only use {eof} once, and it is used later.
#