/// <summary>
/// Implements <see cref="M:IBM.ICU.Text.Transliterator.HandleTransliterate(IBM.ICU.Text.Replaceable, null, System.Boolean)"/>.
/// </summary>
/// <remarks>
/// Walks the range [offsets.start, offsets.limit) looking for "safe" code points
/// (combining class 0 and not contained in <c>unsafeStart</c>). Each time one is
/// found, the text from the previous safe point up to (but excluding) it is passed
/// to <c>Convert</c>. The remaining tail is converted unconditionally when not
/// incremental; in incremental mode it is converted with the <c>skippable</c> set
/// and only accepted when <c>Convert</c> does not return <c>Int32.MinValue</c>.
/// </remarks>
/// <param name="text">the text being modified in place</param>
/// <param name="offsets">range to process; start, limit and contextLimit are updated</param>
/// <param name="isIncremental">true when more text may still arrive after the limit</param>
protected internal override void HandleTransliterate(Replaceable text,
        Transliterator.Position offsets, bool isIncremental)
{
    int begin = offsets.start;
    int end = offsets.limit;
    if (begin >= end)
    {
        return;
    }

    // TODO: fix for surrogates
    // TODO: add QuickCheck, so we rarely convert OK stuff
    int totalDelta = 0;
    int safePoint = begin; // go back to start in any event

    int index = begin + 1;
    while (index < end)
    {
        int codePoint = text.Char32At(index);
        bool isSafe = IBM.ICU.Lang.UCharacter.GetCombiningClass(codePoint) == 0
                && !unsafeStart.Contains(codePoint);
        if (isSafe)
        {
            // Normalize everything between the previous safe point and this one,
            // then shift all indices by the resulting change in length.
            int change = Convert(text, safePoint, index, null);
            index += change;
            end += change;
            totalDelta += change;
            safePoint = index;
        }
        index += IBM.ICU.Text.UTF16.GetCharCount(codePoint);
    }

    if (isIncremental)
    {
        // We are incremental, so accept the last characters IF they turn
        // into skippables
        int tailChange = Convert(text, safePoint, end, skippable);
        if (tailChange != Int32.MinValue)
        {
            totalDelta += tailChange;
            safePoint = end + tailChange;
        }
    }
    else
    {
        // Not incremental: always process through to the end.
        int tailChange = Convert(text, safePoint, end, null);
        totalDelta += tailChange;
        safePoint = end + tailChange;
    }

    offsets.contextLimit += totalDelta;
    offsets.limit += totalDelta;
    offsets.start = safePoint;
}
/// <summary>
/// Tests whether the code point of <paramref name="s"/> at <c>p.index</c> is
/// contained in <c>source</c>.
/// </summary>
/// <param name="s">the string being matched</param>
/// <param name="p">current position; its index is advanced past the code point on success</param>
/// <returns>true when the code point is in <c>source</c>; otherwise false, after
/// calling <c>p.SetMax("codePoint")</c></returns>
protected internal override bool Match(String s, Pick.Position p)
{
    int codePoint = IBM.ICU.Text.UTF16.CharAt(s, p.index);
    if (!source.Contains(codePoint))
    {
        p.SetMax("codePoint");
        return false;
    }

    p.index += IBM.ICU.Text.UTF16.GetCharCount(codePoint);
    return true;
}
// private methods ------------------------------------------------------

/// <summary>
/// Gets the index of the next delimiter at or after <paramref name="offset"/>.
/// </summary>
/// <remarks>
/// When the <c>delims</c> lookup table is null, membership is tested against
/// <c>m_delimiters_</c>; otherwise the table is indexed by code point.
/// </remarks>
/// <param name="offset">offset into the source string at which to start searching</param>
/// <returns>offset of the next delimiter, or (- source string length - 1) when
/// <paramref name="offset"/> is negative or no delimiter is found before the end</returns>
private int GetNextDelimiter(int offset)
{
    if (offset < 0)
    {
        return -1 - m_length_;
    }

    int position = offset;
    do
    {
        int codePoint = IBM.ICU.Text.UTF16.CharAt(m_source_, position);
        bool isDelimiter = (delims == null)
                ? m_delimiters_.Contains(codePoint)
                : (codePoint < delims.Length && delims[codePoint]);
        if (isDelimiter)
        {
            break;
        }
        position++;
    }
    while (position < m_length_);

    return (position < m_length_) ? position : (-1 - m_length_);
}
/// <summary>
/// Checks the per-script metadata exposed by <c>UScript</c> (usage, sample string,
/// right-to-left, cased, breaks-between-letters) against sets built from Unicode
/// properties, for every script code below <c>UScript.CodeLimit</c>.
/// </summary>
public void TestScriptMetadata()
{
    UnicodeSet remainingRtl = new UnicodeSet("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]");
    // So far, sample characters are uppercase.
    // Georgian is special.
    UnicodeSet remainingCased = new UnicodeSet("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]");

    for (int script = 0; script < UScript.CodeLimit; ++script)
    {
        String shortName = UScript.GetShortName(script);
        ScriptUsage usage = UScript.GetUsage(script);
        String sample = UScript.GetSampleString(script);
        UnicodeSet members = new UnicodeSet();
        members.ApplyInt32PropertyValue(UProperty.Script, script);

        if (usage == ScriptUsage.NotEncoded)
        {
            // Java 6: sample.isEmpty()
            assertTrue(shortName + " not encoded, no sample", sample.Length == 0);
            assertFalse(shortName + " not encoded, not RTL", UScript.IsRightToLeft(script));
            assertFalse(shortName + " not encoded, not LB letters", UScript.BreaksBetweenLetters(script));
            assertFalse(shortName + " not encoded, not cased", UScript.IsCased(script));
            assertTrue(shortName + " not encoded, no characters", members.IsEmpty);
            continue;
        }

        // Java 6: sample.isEmpty()
        assertFalse(shortName + " encoded, has a sample character", sample.Length == 0);
        int sampleChar = sample.CodePointAt(0);
        int expectedScript = GetCharScript(script);
        assertEquals(shortName + " script(sample(script))", expectedScript, UScript.GetScript(sampleChar));
        assertEquals(shortName + " RTL vs. set", remainingRtl.Contains(sampleChar), UScript.IsRightToLeft(script));
        assertEquals(shortName + " cased vs. set", remainingCased.Contains(sampleChar), UScript.IsCased(script));
        assertEquals(shortName + " encoded, has characters", script == expectedScript, !members.IsEmpty);

        // Strip this script's characters from the working sets so that whatever
        // is left at the end belongs to no script flagged as RTL / cased.
        if (UScript.IsRightToLeft(script))
        {
            remainingRtl.RemoveAll(members);
        }
        if (UScript.IsCased(script))
        {
            remainingCased.RemoveAll(members);
        }
    }

    assertEquals("no remaining RTL characters", "[]", remainingRtl.ToPattern(true));
    assertEquals("no remaining cased characters", "[]", remainingCased.ToPattern(true));

    assertTrue("Hani breaks between letters", UScript.BreaksBetweenLetters(UScript.Han));
    assertTrue("Thai breaks between letters", UScript.BreaksBetweenLetters(UScript.Thai));
    assertFalse("Latn does not break between letters", UScript.BreaksBetweenLetters(UScript.Latin));
}
/// <summary>
/// Checks whether a token is a valid keyword: non-empty, first character in
/// <c>START_CHARS</c>, and every following character in <c>CONT_CHARS</c>.
/// </summary>
/// <param name="token">the token to be checked</param>
/// <returns>true if the token is a valid keyword.</returns>
private static bool IsValidKeyword(String token)
{
    if (token.Length == 0 || !START_CHARS.Contains(token[0]))
    {
        return false;
    }

    int index = 1;
    while (index < token.Length)
    {
        if (!CONT_CHARS.Contains(token[index]))
        {
            return false;
        }
        ++index;
    }
    return true;
}
/// <summary>
/// Returns a copy of <paramref name="source"/> without the code points that are
/// contained in <paramref name="removals"/>.
/// </summary>
/// <param name="source">the string to filter, read code point by code point</param>
/// <param name="removals">set of code points to drop</param>
/// <returns>the filtered string</returns>
public static String Remove(String source, UnicodeSet removals)
{
    StringBuilder kept = new StringBuilder();
    int index = 0;
    while (index < source.Length)
    {
        int codePoint = IBM.ICU.Text.UTF16.CharAt(source, index);
        if (!removals.Contains(codePoint))
        {
            IBM.ICU.Text.UTF16.Append(kept, codePoint);
        }
        index += IBM.ICU.Text.UTF16.GetCharCount(codePoint);
    }
    return kept.ToString();
}
/// <summary>
/// Appends one code point to <c>target</c>, escaped as needed.
/// </summary>
/// <remarks>
/// Code points in <c>toQuote</c> are written through <c>quoter</c> when one is set,
/// otherwise as a <c>\U</c> + 8 hex digit escape (above 0xFFFF) or a <c>\u</c> +
/// 4 hex digit escape. Any other code point is written literally, preceded by a
/// backslash when it is one of the listed syntax characters or is contained in
/// <c>patternWhitespace</c>.
/// </remarks>
/// <param name="codePoint">the code point to append</param>
/// <returns>this object, to allow chaining</returns>
internal PrettyPrinter AppendQuoted(int codePoint)
{
    if (toQuote.Contains(codePoint))
    {
        if (quoter != null)
        {
            target.Append(quoter.Transliterate(IBM.ICU.Text.UTF16.ValueOf(codePoint)));
        }
        else if (codePoint > 0xFFFF)
        {
            target.Append("\\U");
            target.Append(IBM.ICU.Impl.Utility.Hex(codePoint, 8));
        }
        else
        {
            target.Append("\\u");
            target.Append(IBM.ICU.Impl.Utility.Hex(codePoint, 4));
        }
        return this;
    }

    bool needsBackslash;
    switch (codePoint)
    {
        case '[':  // SET_OPEN
        case ']':  // SET_CLOSE
        case '-':  // HYPHEN
        case '^':  // COMPLEMENT
        case '&':  // INTERSECTION
        case '\\': // BACKSLASH
        case '{':
        case '}':
        case '$':
        case ':':
            needsBackslash = true;
            break;
        default:
            // Escape whitespace
            needsBackslash = patternWhitespace.Contains(codePoint);
            break;
    }

    if (needsBackslash)
    {
        target.Append('\\');
    }
    IBM.ICU.Text.UTF16.Append(target, codePoint);
    return this;
}
/// <summary>
/// Returns a copy of <paramref name="source"/> in which every code point contained
/// in <paramref name="set"/> is replaced by <paramref name="replacement"/>.
/// </summary>
/// <param name="source">the string to scan, code point by code point</param>
/// <param name="set">code points to be replaced</param>
/// <param name="replacement">text appended in place of each matching code point</param>
/// <returns>the resulting string</returns>
public static String ReplaceAll(String source, UnicodeSet set, String replacement)
{
    StringBuffer output = new StringBuffer();
    int index = 0;
    while (index < source.Length)
    {
        int codePoint = UTF16.CharAt(source, index);
        if (!set.Contains(codePoint))
        {
            UTF16.Append(output, codePoint);
        }
        else
        {
            output.Append(replacement);
        }
        index += UTF16.GetCharCount(codePoint);
    }
    return output.ToString();
}
/// <summary>Returns <c>true</c> if the current text represents emoji character or sequence.</summary>
/// <param name="current">offset of the first code unit, relative to <c>start</c></param>
/// <param name="next">offset just past the text to examine, relative to <c>start</c></param>
/// <returns>true when the first code point is in <c>EMOJI</c> and, if it is also in
/// <c>EMOJI_RK</c>, is directly followed (before the end) by U+FE0F or U+20E3</returns>
private bool IsEmoji(int current, int next)
{
    int first = start + current;
    int stop = start + next;
    int codePoint = UTF16.CharAt(text, 0, stop, first);

    if (!EMOJI.Contains(codePoint))
    {
        return false;
    }
    if (!EMOJI_RK.Contains(codePoint))
    {
        return true;
    }

    // A code point in EmojiRK is not treated as emoji unless there is evidence
    // that it forms an emoji sequence: an emoji presentation selector or a
    // keycap must follow.
    int following = first + Character.CharCount(codePoint);
    if (following >= stop)
    {
        return false;
    }
    return text[following] == 0xFE0F || text[following] == 0x20E3;
}
// ---------------------------------------------------------------------------------
//
// Parse RBBI rules. The state machine for rules parsing is here.
// The state tables are hand-written in the file rbbirpt.txt,
// and converted to the form used here by a perl
// script rbbicst.pl
//
// Starts in state 1 with the first input character already fetched, and runs
// until the state becomes 0 or DoParseActions() returns false. Afterwards a
// default reverse rule tree is installed if the rules did not supply one, and
// optional debug output is printed depending on fRB.fDebugEnv.
//
// ---------------------------------------------------------------------------------
internal void Parse()
{
    int state;
    RBBIRuleParseTable.RBBIRuleTableElement tableEl;

    state = 1;
    NextChar(fC);
    //
    // Main loop for the rule parsing state machine.
    // Runs once per state transition.
    // Each time through optionally performs, depending on the state table,
    // - an advance to the next input char
    // - an action to be performed.
    // - pushing or popping a state to/from the local state return stack.
    //
    for (;;)
    {
        // Quit if state == 0. This is the normal way to exit the state
        // machine.
        //
        if (state == 0)
        {
            break;
        }

        // Find the state table element that matches the input char from the
        // rule, or the class of the input character. Start with the first
        // table row for this state, then linearly scan forward until we find
        // a row that matches the character. The last row for each state
        // always matches all characters, so the search will stop there, if
        // not before.
        //
        tableEl = IBM.ICU.Text.RBBIRuleParseTable.gRuleParseStateTable[state];
        if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("scan") >= 0)
        {
            System.Console.Out.WriteLine("char, line, col = (\'" + (char) fC.fChar + "\', " + fLineNum + ", " + fCharNum + " state = " + tableEl.fStateName);
        }

        for (int tableRow = state;; tableRow++)
        {
            // loop over the state table rows associated with this state.
            tableEl = IBM.ICU.Text.RBBIRuleParseTable.gRuleParseStateTable[tableRow];
            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("scan") >= 0)
            {
                System.Console.Out.Write(".");
            }
            if (tableEl.fCharClass < 127 && fC.fEscaped == false && tableEl.fCharClass == fC.fChar)
            {
                // Table row specified an individual character, not a set,
                // and the input character is not escaped, and
                // the input character matched it.
                break;
            }
            if (tableEl.fCharClass == 255)
            {
                // Table row specified default, match anything character
                // class.
                break;
            }
            if (tableEl.fCharClass == 254 && fC.fEscaped)
            {
                // Table row specified "escaped" and the char was escaped.
                break;
            }
            if (tableEl.fCharClass == 253 && fC.fEscaped && (fC.fChar == 0x50 || fC.fChar == 0x70))
            {
                // Table row specified "escaped P" and the char is either
                // 'p' (0x70) or 'P' (0x50).
                break;
            }
            if (tableEl.fCharClass == 252 && fC.fChar == (int) -1)
            {
                // Table row specified eof and we hit eof on the input.
                break;
            }

            if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class &&
                    fC.fEscaped == false && // char is not escaped &&
                    fC.fChar != (int) -1)
            { // char is not EOF
                // Class numbers 128..239 index into fRuleSets after subtracting 128.
                UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
                if (uniset.Contains(fC.fChar))
                {
                    // Table row specified a character class, or set of
                    // characters, and the current char matches it.
                    break;
                }
            }
        }

        if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("scan") >= 0)
        {
            System.Console.Out.WriteLine("");
        }
        //
        // We've found the row of the state table that matches the current
        // input character from the rules string.
        // Perform any action specified by this row in the state table.
        if (DoParseActions(tableEl.fAction) == false)
        {
            // Break out of the state machine loop if the
            // action signalled some kind of error, or
            // the action was to exit, which occurs on normal end-of-rules-input.
            break;
        }

        if (tableEl.fPushState != 0)
        {
            fStackPtr++;
            if (fStackPtr >= kStackSize)
            {
                System.Console.Out.WriteLine("RBBIRuleScanner.parse() - state stack overflow.");
                Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
            }
            fStack[fStackPtr] = tableEl.fPushState;
        }

        if (tableEl.fNextChar)
        {
            NextChar(fC);
        }

        // Get the next state from the table entry, or from the
        // state stack if the next state was specified as "pop" (255).
        if (tableEl.fNextState != 255)
        {
            state = tableEl.fNextState;
        }
        else
        {
            state = fStack[fStackPtr];
            fStackPtr--;
            if (fStackPtr < 0)
            {
                System.Console.Out.WriteLine("RBBIRuleScanner.parse() - state stack underflow.");
                Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
            }
        }
    }

    //
    // If there were NO user specified reverse rules, set up the equivalent
    // of ".*;"
    //
    if (fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree] == null)
    {
        fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree] = PushNewNode(IBM.ICU.Text.RBBINode.opStar);
        RBBINode operand = PushNewNode(IBM.ICU.Text.RBBINode.setRef);
        FindSetFor(kAny, operand, null);
        fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
        operand.fParent = fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree];
        // Drop the two nodes pushed just above from the node stack.
        fNodeStackPtr -= 2;
    }

    //
    // Parsing of the input RBBI rules is complete.
    // We now have a parse tree for the rule expressions
    // and a list of all UnicodeSets that are referenced.
    //
    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("symbols") >= 0)
    {
        fSymbolTable.RbbiSymtablePrint();
    }
    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("ptree") >= 0)
    {
        System.Console.Out.WriteLine("Completed Forward Rules Parse Tree...");
        fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fForwardTree].PrintTree(true);
        System.Console.Out.WriteLine("\nCompleted Reverse Rules Parse Tree...");
        fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree].PrintTree(true);
        System.Console.Out.WriteLine("\nCompleted Safe Point Forward Rules Parse Tree...");
        // The safe-point trees may be absent, so guard before printing them.
        if (fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fSafeFwdTree] == null)
        {
            System.Console.Out.WriteLine(" -- null -- ");
        }
        else
        {
            fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fSafeFwdTree].PrintTree(true);
        }
        System.Console.Out.WriteLine("\nCompleted Safe Point Reverse Rules Parse Tree...");
        if (fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fSafeRevTree] == null)
        {
            System.Console.Out.WriteLine(" -- null -- ");
        }
        else
        {
            fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fSafeRevTree].PrintTree(true);
        }
    }
}
/// <summary>
/// Implements <see cref="M:IBM.ICU.Text.Transliterator.HandleTransliterate(IBM.ICU.Text.Replaceable, null, System.Boolean)"/>.
/// </summary>
/// <remarks>
/// Scans [offsets.start, offsets.limit) for sequences that start with
/// <c>OPEN_PAT</c> and end with <c>CLOSE_DELIM</c>, looks the enclosed text up
/// with <c>UCharacter.GetCharFromExtendedName</c>, and replaces the whole
/// sequence by the resulting code point when the lookup succeeds. Runs of rule
/// white space inside a name are collapsed to a single <c>SPACE</c>; leading and
/// trailing white space is ignored.
/// </remarks>
/// <param name="text">the text being modified in place</param>
/// <param name="offsets">range to process; start, limit and contextLimit are updated</param>
/// <param name="isIncremental">when true, <c>offsets.start</c> is only advanced up to
/// the last unresolved open-delimiter candidate</param>
protected internal override void HandleTransliterate(Replaceable text,
        Transliterator.Position offsets, bool isIncremental)
{
    // allow for temporary trailing space
    int maxLen = IBM.ICU.Impl.UCharacterName.GetInstance().GetMaxCharNameLength() + 1;

    StringBuilder name = new StringBuilder(maxLen);

    // Get the legal character set
    UnicodeSet legal = new UnicodeSet();
    IBM.ICU.Impl.UCharacterName.GetInstance().GetCharNameCharacters(legal);

    int cursor = offsets.start;
    int limit = offsets.limit;

    // Modes:
    // 0 - looking for open delimiter
    // 1 - after open delimiter
    int mode = 0;
    int openPos = -1; // open delim candidate pos

    int c;
    while (cursor < limit)
    {
        c = text.Char32At(cursor);

        switch (mode)
        {
            case 0: // looking for open delimiter
                if (c == OPEN_DELIM) // quick check first
                {
                    openPos = cursor;
                    int i = IBM.ICU.Impl.Utility.ParsePattern(OPEN_PAT, text, cursor, limit);
                    if (i >= 0 && i < limit)
                    {
                        mode = 1;
                        name.Length = 0;
                        cursor = i;
                        continue; // *** reprocess char32At(cursor)
                    }
                }
                break;

            case 1: // after open delimiter
                // Look for legal chars. If \s+ is found, convert it
                // to a single space. If closeDelimiter is found, exit
                // the loop. If any other character is found, exit the
                // loop. If the limit is reached, exit the loop.

                // Convert \s+ => SPACE. This assumes there are no
                // runs of >1 space characters in names.
                if (IBM.ICU.Impl.UCharacterProperty.IsRuleWhiteSpace(c))
                {
                    // Ignore leading whitespace
                    if (name.Length > 0 && name[name.Length - 1] != SPACE)
                    {
                        name.Append(SPACE);
                        // If we are too long then abort. maxLen includes
                        // temporary trailing space, so use '>'.
                        if (name.Length > maxLen)
                        {
                            mode = 0;
                        }
                    }
                    break;
                }

                if (c == CLOSE_DELIM)
                {
                    int len = name.Length;

                    // Delete trailing space, if any
                    if (len > 0 && name[len - 1] == SPACE)
                    {
                        name.Length = --len;
                    }

                    c = IBM.ICU.Lang.UCharacter.GetCharFromExtendedName(name.ToString());
                    if (c != -1)
                    {
                        // Lookup succeeded

                        // assert(UTF16.getCharCount(CLOSE_DELIM) == 1);
                        cursor++; // advance over CLOSE_DELIM

                        String str = IBM.ICU.Text.UTF16.ValueOf(c);
                        text.Replace(openPos, cursor, str);

                        // Adjust indices for the change in the length of
                        // the string. Do not assume that str.length() ==
                        // 1, in case of surrogates.
                        int delta = cursor - openPos - str.Length;
                        cursor -= delta;
                        limit -= delta;
                        // assert(cursor == openPos + str.length());
                    }
                    // If the lookup failed, we leave things as-is and
                    // still switch to mode 0 and continue.
                    mode = 0;
                    openPos = -1; // close off candidate
                    continue; // *** reprocess char32At(cursor)
                }

                if (legal.Contains(c))
                {
                    IBM.ICU.Text.UTF16.Append(name, c);
                    // If we go past the longest possible name then abort.
                    // maxLen includes temporary trailing space, so use '>='.
                    if (name.Length >= maxLen)
                    {
                        mode = 0;
                    }
                }
                else
                {
                    // Invalid character: abandon the name and reprocess this
                    // same character in mode 0. The cursor is deliberately left
                    // untouched; the previous "--cursor" followed by the
                    // unconditional advance below only restored the position for
                    // one-unit characters and landed on the trail surrogate of a
                    // supplementary code point.
                    mode = 0;
                    continue; // *** reprocess char32At(cursor)
                }
                break;
        }

        cursor += IBM.ICU.Text.UTF16.GetCharCount(c);
    }

    offsets.contextLimit += limit - offsets.limit;
    offsets.limit = limit;
    // In incremental mode, only advance the cursor up to the last
    // open delimiter candidate.
    offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
}
/// <summary>
/// Get the pattern for a particular set.
/// </summary>
/// <remarks>
/// Items are emitted between "[" and "]" in two groups: first every string and
/// every code point of <paramref name="uset"/> that is not in <c>sortAtEnd</c>,
/// sorted with <c>ordering</c>; then the code points of <paramref name="uset"/>
/// that are in <c>sortAtEnd</c>. The shared <c>target</c> buffer and the
/// <c>first</c> flag are reset as part of the call.
/// </remarks>
/// <param name="uset">the set to format</param>
/// <returns>formatted UnicodeSet</returns>
public String ToPattern(UnicodeSet uset)
{
    first = true;

    // remove all the unassigned gorp for now
    UnicodeSet deferred = new UnicodeSet(uset).RetainAll(sortAtEnd);

    // make sure that comparison separates all strings, even canonically
    // equivalent ones
    ILOG.J2CsMapping.Collections.ISet sortedItems = new SortedSet(ordering);

    UnicodeSetIterator ranges = new UnicodeSetIterator(uset);
    while (ranges.NextRange())
    {
        if (ranges.codepoint == IBM.ICU.Text.UnicodeSetIterator.IS_STRING)
        {
            ILOG.J2CsMapping.Collections.Generics.Collections.Add(sortedItems, ranges.str0);
            continue;
        }

        for (int codePoint = ranges.codepoint; codePoint <= ranges.codepointEnd; ++codePoint)
        {
            if (deferred.Contains(codePoint))
            {
                continue;
            }
            ILOG.J2CsMapping.Collections.Generics.Collections.Add(sortedItems, IBM.ICU.Text.UTF16.ValueOf(codePoint));
        }
    }

    target.Length = 0;
    target.Append("[");

    IIterator sortedIterator = new ILOG.J2CsMapping.Collections.IteratorAdapter(sortedItems.GetEnumerator());
    while (sortedIterator.HasNext())
    {
        AppendUnicodeSetItem((String)sortedIterator.Next());
    }

    // add back the unassigned gorp
    UnicodeSetIterator trailing = new UnicodeSetIterator(deferred);
    while (trailing.Next())
    {
        AppendUnicodeSetItem(trailing.codepoint);
    }

    FlushLast();
    target.Append("]");

    // NOTE: a round-trip double check (re-parsing the result into a UnicodeSet
    // and comparing it with uset) used to live here and is currently disabled.
    return target.ToString();
}
/// <summary>
/// Reads the next token of <c>pattern</c>, beginning at <c>start</c>, appends its
/// text to <paramref name="buffer"/> and advances <c>start</c> past what was consumed.
/// </summary>
/// <remarks>
/// Backslash escapes (<c>\uXXXX</c>, <c>\UXXXXXXXX</c>, and <c>\c</c> when
/// <c>usingSlash</c> is set) and single-quoted runs (when <c>usingQuote</c> is
/// set) are decoded into the buffer. Characters in <c>ignorableCharacters</c>
/// are skipped outside of quotes and escapes.
/// </remarks>
/// <param name="buffer">receives the decoded text of the token</param>
/// <returns>
/// <c>DONE</c> when <c>start</c> has already reached <c>limit</c>;
/// <c>SYNTAX</c> when the token is a single character from <c>syntaxCharacters</c>;
/// <c>BROKEN_ESCAPE</c> for a malformed or unterminated escape;
/// <c>BROKEN_QUOTE</c> for an unterminated quote;
/// otherwise the accumulated status (<c>LITERAL</c>, or <c>UNKNOWN</c> when only
/// ignorable characters were seen).
/// </returns>
public int Next(StringBuilder buffer)
{
    if (start >= limit)
    {
        return DONE;
    }

    int status = UNKNOWN;
    int lastQuote = UNKNOWN;
    int quoteStatus = NONE;
    int hexCount = 0;
    int hexValue = 0;
    int cp;

    // NOTE: every "continue" below advances to the next code point. The previous
    // translation used "goto main" to a label placed in front of this loop, which
    // restarted the scan at i = start with stale quoteStatus instead.
    for (int i = start; i < limit; i += IBM.ICU.Text.UTF16.GetCharCount(cp))
    {
        cp = IBM.ICU.Text.UTF16.CharAt(pattern, i);

        // if we are in a quote, then handle it.
        switch (quoteStatus)
        {
            case SLASH_START:
                switch (cp)
                {
                    case 'u':
                        quoteStatus = HEX;
                        hexCount = 4;
                        hexValue = 0;
                        continue;
                    case 'U':
                        quoteStatus = HEX;
                        hexCount = 8;
                        hexValue = 0;
                        continue;
                    default:
                        if (usingSlash)
                        {
                            IBM.ICU.Text.UTF16.Append(buffer, cp);
                            quoteStatus = NONE;
                            continue;
                        }
                        else
                        {
                            // Backslash is not an escape here: emit it literally
                            // and handle cp as an ordinary character below.
                            buffer.Append(BACK_SLASH);
                            quoteStatus = NONE;
                        }
                        break;
                }
                break; // fall through to NONE

            case HEX:
                // Accumulate one hex digit; the raw character value is added
                // first and then corrected by the digit's base below.
                hexValue <<= 4;
                hexValue += cp;
                switch (cp)
                {
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    case '8':
                    case '9':
                        hexValue -= '0';
                        break;
                    case 'a':
                    case 'b':
                    case 'c':
                    case 'd':
                    case 'e':
                    case 'f':
                        hexValue -= 'a' - 10;
                        break;
                    case 'A':
                    case 'B':
                    case 'C':
                    case 'D':
                    case 'E':
                    case 'F':
                        hexValue -= 'A' - 10;
                        break;
                    default:
                        start = i;
                        return BROKEN_ESCAPE;
                }
                --hexCount;
                if (hexCount == 0)
                {
                    quoteStatus = NONE;
                    IBM.ICU.Text.UTF16.Append(buffer, hexValue);
                }
                continue;

            case AFTER_QUOTE:
                // see if we get another quote character
                // if we just ended a quote BUT the following character is the
                // lastQuote character, then we have a situation like
                // '...''...', so we restart the quote
                if (cp == lastQuote)
                {
                    IBM.ICU.Text.UTF16.Append(buffer, cp);
                    quoteStatus = NORMAL_QUOTE;
                    continue;
                }
                quoteStatus = NONE;
                break; // fall through to NONE

            case START_QUOTE:
                // if we are at the very start of a quote, and we hit another
                // quote mark then we emit a literal quote character and end the
                // quote
                if (cp == lastQuote)
                {
                    IBM.ICU.Text.UTF16.Append(buffer, cp);
                    quoteStatus = NONE; // get out of quote, with no trace remaining
                    continue;
                }
                // otherwise get into quote
                IBM.ICU.Text.UTF16.Append(buffer, cp);
                quoteStatus = NORMAL_QUOTE;
                continue;

            case NORMAL_QUOTE:
                if (cp == lastQuote)
                {
                    quoteStatus = AFTER_QUOTE; // get out of quote
                    continue;
                }
                IBM.ICU.Text.UTF16.Append(buffer, cp);
                continue;
        }

        if (ignorableCharacters.Contains(cp))
        {
            continue;
        }

        // do syntax characters
        if (syntaxCharacters.Contains(cp))
        {
            if (status == UNKNOWN)
            {
                IBM.ICU.Text.UTF16.Append(buffer, cp);
                start = i + IBM.ICU.Text.UTF16.GetCharCount(cp);
                return SYNTAX;
            }
            else // LITERAL, so back up and break
            {
                start = i;
                return status;
            }
        }

        // otherwise it is a literal; keep on going
        status = LITERAL;
        if (cp == BACK_SLASH)
        {
            quoteStatus = SLASH_START;
            continue;
        }
        else if (usingQuote && cp == SINGLE_QUOTE)
        {
            lastQuote = cp;
            quoteStatus = START_QUOTE;
            continue;
        }

        // normal literals
        IBM.ICU.Text.UTF16.Append(buffer, cp);
    }

    // handle final cleanup
    start = limit;
    switch (quoteStatus)
    {
        case HEX:
            status = BROKEN_ESCAPE;
            break;
        case SLASH_START:
            if (usingSlash)
            {
                status = BROKEN_ESCAPE;
            }
            else
            {
                buffer.Append(BACK_SLASH);
            }
            break;
        case START_QUOTE:
        case NORMAL_QUOTE:
            status = BROKEN_QUOTE;
            break;
    }
    return status;
}
/// <summary>
/// Quote a literal string, using the available settings. Thus syntax
/// characters, quote characters, and ignorable characters will be put into
/// quotes.
/// </summary>
/// <remarks>
/// The set of characters that need quoting is built lazily on first use from
/// <c>syntaxCharacters</c>, <c>ignorableCharacters</c> and
/// <c>extraQuotingCharacters</c>, plus the backslash and/or single quote when
/// <c>usingSlash</c> / <c>usingQuote</c> are set. Characters in
/// <c>escapeCharacters</c> are always written through <c>AppendEscaped</c>.
/// </remarks>
/// <param name="str0">the literal text to quote</param>
/// <returns>the quoted form of <paramref name="str0"/></returns>
public String QuoteLiteral(String str0)
{
    if (needingQuoteCharacters == null)
    {
        // .addAll(quoteCharacters)
        needingQuoteCharacters = new UnicodeSet()
                .AddAll(syntaxCharacters)
                .AddAll(ignorableCharacters)
                .AddAll(extraQuotingCharacters);
        if (usingSlash)
        {
            needingQuoteCharacters.Add(BACK_SLASH);
        }
        if (usingQuote)
        {
            needingQuoteCharacters.Add(SINGLE_QUOTE);
        }
    }

    StringBuilder output = new StringBuilder();
    int quotedChar = NO_QUOTE;
    int index = 0;
    while (index < str0.Length)
    {
        int codePoint = IBM.ICU.Text.UTF16.CharAt(str0, index);
        index += IBM.ICU.Text.UTF16.GetCharCount(codePoint);

        if (escapeCharacters.Contains(codePoint))
        {
            // close any quote that is still open before emitting the escape
            if (quotedChar == IN_QUOTE)
            {
                output.Append(SINGLE_QUOTE);
                quotedChar = NO_QUOTE;
            }
            AppendEscaped(output, codePoint);
        }
        else if (!needingQuoteCharacters.Contains(codePoint))
        {
            // plain character: close any open quote, then copy it through
            if (quotedChar == IN_QUOTE)
            {
                output.Append(SINGLE_QUOTE);
                quotedChar = NO_QUOTE;
            }
            IBM.ICU.Text.UTF16.Append(output, codePoint);
        }
        else if (quotedChar == IN_QUOTE)
        {
            // already inside a quote: copy, doubling an embedded single quote
            IBM.ICU.Text.UTF16.Append(output, codePoint);
            if (usingQuote && codePoint == SINGLE_QUOTE)
            {
                output.Append(SINGLE_QUOTE);
            }
        }
        else if (usingSlash)
        {
            output.Append(BACK_SLASH);
            IBM.ICU.Text.UTF16.Append(output, codePoint);
        }
        else if (usingQuote)
        {
            if (codePoint == SINGLE_QUOTE)
            {
                // a lone quote is written doubled, without opening a quoted run
                output.Append(SINGLE_QUOTE);
                output.Append(SINGLE_QUOTE);
            }
            else
            {
                output.Append(SINGLE_QUOTE);
                IBM.ICU.Text.UTF16.Append(output, codePoint);
                quotedChar = IN_QUOTE;
            }
        }
        else
        {
            // we have no choice but to use \\u or \\U
            AppendEscaped(output, codePoint);
        }
    }

    // all done: close a quote that is still open
    if (quotedChar == IN_QUOTE)
    {
        output.Append(SINGLE_QUOTE);
    }
    return output.ToString();
}
/// <summary>
/// Compares the mapping <paramref name="ce32"/> of code point <paramref name="c"/>
/// in <c>data</c> with the mapping <paramref name="baseCE32"/> in <c>baseData</c>
/// and calls <c>Add(c)</c> when the two differ.
/// </summary>
/// <remarks>
/// Prefix and contraction mappings are unwrapped first (delegating their context
/// entries to the ComparePrefixes/AddPrefixes and CompareContractions/AddContractions
/// helpers); what remains are non-contextual mappings, which are compared by tag.
/// </remarks>
/// <param name="c">the code point being examined</param>
/// <param name="ce32">the CE32 for <paramref name="c"/> from <c>data</c></param>
/// <param name="baseCE32">the CE32 for <paramref name="c"/> from <c>baseData</c></param>
private void Compare(int c, int ce32, int baseCE32)
{
    // Step 1: unwrap prefix mappings on either side, replacing each CE32 by the
    // final CE32 read from the contexts at its index.
    if (Collation.IsPrefixCE32(ce32))
    {
        int dataIndex = Collation.IndexFromCE32(ce32);
        ce32 = data.GetFinalCE32(data.GetCE32FromContexts(dataIndex));
        if (Collation.IsPrefixCE32(baseCE32))
        {
            int baseIndex = Collation.IndexFromCE32(baseCE32);
            baseCE32 = baseData.GetFinalCE32(baseData.GetCE32FromContexts(baseIndex));
            ComparePrefixes(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
        }
        else
        {
            AddPrefixes(data, c, data.contexts, dataIndex + 2);
        }
    }
    else if (Collation.IsPrefixCE32(baseCE32))
    {
        int baseIndex = Collation.IndexFromCE32(baseCE32);
        baseCE32 = baseData.GetFinalCE32(baseData.GetCE32FromContexts(baseIndex));
        AddPrefixes(baseData, c, baseData.contexts, baseIndex + 2);
    }

    // Step 2: unwrap contraction mappings on either side in the same way.
    if (Collation.IsContractionCE32(ce32))
    {
        int dataIndex = Collation.IndexFromCE32(ce32);
        if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0)
        {
            ce32 = Collation.NO_CE32;
        }
        else
        {
            ce32 = data.GetFinalCE32(data.GetCE32FromContexts(dataIndex));
        }
        if (Collation.IsContractionCE32(baseCE32))
        {
            int baseIndex = Collation.IndexFromCE32(baseCE32);
            if ((baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0)
            {
                baseCE32 = Collation.NO_CE32;
            }
            else
            {
                baseCE32 = baseData.GetFinalCE32(baseData.GetCE32FromContexts(baseIndex));
            }
            CompareContractions(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
        }
        else
        {
            AddContractions(c, data.contexts, dataIndex + 2);
        }
    }
    else if (Collation.IsContractionCE32(baseCE32))
    {
        // NOTE(review): unlike the branch above, this path does not test
        // CONTRACT_SINGLE_CP_NO_MATCH on baseCE32 - confirm that is intended.
        int baseIndex = Collation.IndexFromCE32(baseCE32);
        baseCE32 = baseData.GetFinalCE32(baseData.GetCE32FromContexts(baseIndex));
        AddContractions(c, baseData.contexts, baseIndex + 2);
    }

    // Step 3: determine the special tag of each side, or -1 when the CE32 is
    // not special.
    int tag;
    if (Collation.IsSpecialCE32(ce32))
    {
        tag = Collation.TagFromCE32(ce32);
        Debug.Assert(tag != Collation.PREFIX_TAG);
        Debug.Assert(tag != Collation.CONTRACTION_TAG);
        // Currently, the tailoring data builder does not write offset tags.
        // They might be useful for saving space,
        // but they would complicate the builder,
        // and in tailorings we assume that performance of tailored characters is more important.
        Debug.Assert(tag != Collation.OFFSET_TAG);
    }
    else
    {
        tag = -1;
    }
    int baseTag;
    if (Collation.IsSpecialCE32(baseCE32))
    {
        baseTag = Collation.TagFromCE32(baseCE32);
        Debug.Assert(baseTag != Collation.PREFIX_TAG);
        Debug.Assert(baseTag != Collation.CONTRACTION_TAG);
    }
    else
    {
        baseTag = -1;
    }

    // Non-contextual mappings, expansions, etc.
    if (baseTag == Collation.OFFSET_TAG)
    {
        // We might be comparing a tailoring CE which is a copy of
        // a base offset-tag CE, via the [optimize [set]] syntax
        // or when a single-character mapping was copied for tailored contractions.
        // Offset tags always result in long-primary CEs,
        // with common secondary/tertiary weights.
        if (!Collation.IsLongPrimaryCE32(ce32))
        {
            Add(c);
            return;
        }
        long dataCE = baseData.ces[Collation.IndexFromCE32(baseCE32)];
        long p = Collation.GetThreeBytePrimaryForOffsetData(c, dataCE);
        if (Collation.PrimaryFromLongPrimaryCE32(ce32) != p)
        {
            Add(c);
            return;
        }
    }

    if (tag != baseTag)
    {
        Add(c);
        return;
    }

    if (tag == Collation.EXPANSION32_TAG)
    {
        // Same tag: the expansions differ if their lengths differ or any
        // element of the ce32s arrays differs.
        int length = Collation.LengthFromCE32(ce32);
        int baseLength = Collation.LengthFromCE32(baseCE32);

        if (length != baseLength)
        {
            Add(c);
            return;
        }

        int idx0 = Collation.IndexFromCE32(ce32);
        int idx1 = Collation.IndexFromCE32(baseCE32);

        for (int i = 0; i < length; ++i)
        {
            if (data.ce32s[idx0 + i] != baseData.ce32s[idx1 + i])
            {
                Add(c);
                break;
            }
        }
    }
    else if (tag == Collation.EXPANSION_TAG)
    {
        // As above, but the elements live in the ces arrays.
        int length = Collation.LengthFromCE32(ce32);
        int baseLength = Collation.LengthFromCE32(baseCE32);

        if (length != baseLength)
        {
            Add(c);
            return;
        }

        int idx0 = Collation.IndexFromCE32(ce32);
        int idx1 = Collation.IndexFromCE32(baseCE32);

        for (int i = 0; i < length; ++i)
        {
            if (data.ces[idx0 + i] != baseData.ces[idx1 + i])
            {
                Add(c);
                break;
            }
        }
    }
    else if (tag == Collation.HANGUL_TAG)
    {
        // c is added when any of its decomposed Jamo (two, or three when
        // length == 3) is already in the tailored set.
        StringBuilder jamos = new StringBuilder();
        int length = Hangul.Decompose(c, jamos);
        if (tailored.Contains(jamos[0]) || tailored.Contains(jamos[1])
                || (length == 3 && tailored.Contains(jamos[2])))
        {
            Add(c);
        }
    }
    else if (ce32 != baseCE32)
    {
        Add(c);
    }
}
// ------------------------------------------------------------------------
//
// build Build the list of non-overlapping character ranges
// from the Unicode Sets.
//
// Steps performed, in order:
// 1. split the single initial range [0, 0x10ffff] so that no range
// straddles a boundary of any input UnicodeSet;
// 2. number the ranges so that ranges included in exactly the same
// sets share a group number;
// 3. add the reserved values 1 / 2 to sets containing the strings
// "eof" / "bof";
// 4. store each range's group number in fTrie.
//
// ------------------------------------------------------------------------
internal void Build()
{
    RBBINode usetNode;
    RBBISetBuilder.RangeDescriptor rlRange;

    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("usets") >= 0)
    {
        PrintSets();
    }

    // Initialize the process by creating a single range encompassing all
    // characters that is in no sets.
    //
    fRangeList = new RBBISetBuilder.RangeDescriptor();
    fRangeList.fStartChar = 0;
    fRangeList.fEndChar = 0x10ffff;

    //
    // Find the set of non-overlapping ranges of characters
    //
    IIterator ni = new ILOG.J2CsMapping.Collections.IteratorAdapter(fRB.fUSetNodes.GetEnumerator());
    while (ni.HasNext())
    {
        usetNode = (RBBINode)ni.Next();

        UnicodeSet inputSet = usetNode.fInputSet;
        int inputSetRangeCount = inputSet.GetRangeCount();
        int inputSetRangeIndex = 0;
        rlRange = fRangeList;

        // Walk the ranges of the input set and the range list in parallel.
        for (;;)
        {
            if (inputSetRangeIndex >= inputSetRangeCount)
            {
                break;
            }
            int inputSetRangeBegin = inputSet.GetRangeStart(inputSetRangeIndex);
            int inputSetRangeEnd = inputSet.GetRangeEnd(inputSetRangeIndex);

            // skip over ranges from the range list that are completely
            // below the current range from the input unicode set.
            while (rlRange.fEndChar < inputSetRangeBegin)
            {
                rlRange = rlRange.fNext;
            }

            // If the start of the range from the range list is before
            // the start of the range from the unicode set, split the range
            // list range in two, with one part being before (wholly outside
            // of) the unicode set and the other containing the rest.
            // Then continue the loop; the post-split current range will
            // then be skipped over
            if (rlRange.fStartChar < inputSetRangeBegin)
            {
                rlRange.Split(inputSetRangeBegin);
                continue;
            }

            // Same thing at the end of the ranges...
            // If the end of the range from the range list doesn't coincide
            // with the end of the range from the unicode set, split the
            // range list range in two. The first part of the split range
            // will be wholly inside the Unicode set.
            if (rlRange.fEndChar > inputSetRangeEnd)
            {
                rlRange.Split(inputSetRangeEnd + 1);
            }

            // The current rlRange is now entirely within the UnicodeSet
            // range.
            // Add this unicode set to the list of sets for this rlRange
            if (rlRange.fIncludesSets.IndexOf(usetNode) == -1)
            {
                ILOG.J2CsMapping.Collections.Generics.Collections.Add(rlRange.fIncludesSets, usetNode);
            }

            // Advance over ranges that we are finished with.
            if (inputSetRangeEnd == rlRange.fEndChar)
            {
                inputSetRangeIndex++;
            }
            rlRange = rlRange.fNext;
        }
    }

    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("range") >= 0)
    {
        PrintRanges();
    }

    //
    // Group the above ranges, with each group consisting of one or more
    // ranges that are in exactly the same set of original UnicodeSets.
    // The groups are numbered, and these group numbers are the set of
    // input symbols recognized by the run-time state machine.
    //
    // Numbering: # 0 (state table column 0) is unused.
    // # 1 is reserved - table column 1 is for end-of-input
    // # 2 is reserved - table column 2 is for beginning-in-input
    // # 3 is the first range list.
    //
    RBBISetBuilder.RangeDescriptor rlSearchRange;
    for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext)
    {
        // Look for an earlier range with the same set membership and reuse
        // its group number.
        for (rlSearchRange = fRangeList; rlSearchRange != rlRange; rlSearchRange = rlSearchRange.fNext)
        {
            if (rlRange.fIncludesSets.Equals(rlSearchRange.fIncludesSets))
            {
                rlRange.fNum = rlSearchRange.fNum;
                break;
            }
        }
        // fNum still 0 means no earlier range matched: start a new group.
        if (rlRange.fNum == 0)
        {
            fGroupCount++;
            rlRange.fNum = fGroupCount + 2;
            rlRange.SetDictionaryFlag();
            AddValToSets(rlRange.fIncludesSets, fGroupCount + 2);
        }
    }

    // Handle input sets that contain the special string {eof}.
    // Column 1 of the state table is reserved for EOF on input.
    // Column 2 is reserved for before-the-start-input.
    // (This column can be optimized away later if there are no rule
    // references to {bof}.)
    // Add this column value (1 or 2) to the equivalent expression
    // subtree for each UnicodeSet that contains the string {eof}
    // Because {bof} and {eof} are not characters in the normal sense,
    // they do not affect the computation of ranges or TRIE.

    String eofString = "eof";
    String bofString = "bof";

    ni = new ILOG.J2CsMapping.Collections.IteratorAdapter(fRB.fUSetNodes.GetEnumerator());
    while (ni.HasNext())
    {
        usetNode = (RBBINode)ni.Next();
        UnicodeSet inputSet_0 = usetNode.fInputSet;
        if (inputSet_0.Contains(eofString))
        {
            AddValToSet(usetNode, 1);
        }
        if (inputSet_0.Contains(bofString))
        {
            AddValToSet(usetNode, 2);
            fSawBOF = true;
        }
    }

    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("rgroup") >= 0)
    {
        PrintRangeGroups();
    }
    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("esets") >= 0)
    {
        PrintSets();
    }

    // IntTrieBuilder(int aliasdata[], int maxdatalength,
    // int initialvalue, int leadunitvalue,
    // boolean latin1linear)

    fTrie = new IntTrieBuilder(null, // Data array (utrie will allocate one)
            100000, // Max Data Length
            0, // Initial value for all code points
            0, // Lead Surrogate unit value,
            true); // Keep Latin 1 in separately.

    // Record each range's group number in the trie; the end argument passed
    // to SetRange is fEndChar + 1.
    for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext)
    {
        fTrie.SetRange(rlRange.fStartChar, rlRange.fEndChar + 1, rlRange.fNum, true);
    }
}