/// <summary>
/// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
/// </summary>
/// <remarks>
/// Walks the range [pos.Start, pos.Limit) and, at each position, tries every
/// escape form stored in <c>spec</c>. Each form is laid out as a five-value
/// header (prefix length, suffix length, radix, minimum digits, maximum digits)
/// followed by the prefix characters and then the suffix characters; the list
/// ends at <c>END</c>. A matching escape is replaced in <paramref name="text"/>
/// by the code point its digits denote.
/// </remarks>
/// <param name="text">The text being scanned; matched escapes are replaced in place.</param>
/// <param name="pos">Supplies the start/limit of the range and receives the updated
/// start, limit and context limit.</param>
/// <param name="isIncremental">When true, scanning stops as soon as the remaining
/// input could be the beginning of an escape (a partial match).</param>
protected override void HandleTransliterate(IReplaceable text, Position pos, bool isIncremental)
{
    int start = pos.Start;
    int limit = pos.Limit;

    while (start < limit)
    {
        // Try each form in spec[] at the current position. Leaving this loop
        // via 'break' means a form matched and the text was rewritten; a
        // partial match in incremental mode jumps straight to 'finished'.
        int specIndex = 0;
        while (spec[specIndex] != END)
        {
            // Header of the current form.
            int prefixLen = spec[specIndex++];
            int suffixLen = spec[specIndex++];
            int radix = spec[specIndex++];
            int minDigits = spec[specIndex++];
            int maxDigits = spec[specIndex++];

            // Remember where this form's prefix/suffix characters live, then
            // step the index to the next form so a failed match can simply 'continue'.
            int patternBase = specIndex;
            specIndex += prefixLen + suffixLen;

            // Scratch copy of 'start' that moves forward as characters are consumed.
            int cursor = start;

            // --- prefix ---
            bool prefixMatched = true;
            for (int k = 0; k < prefixLen; ++k)
            {
                if (cursor >= limit && k > 0)
                {
                    // Ran out of input after matching at least one prefix
                    // character: a partial match. Stop entirely when
                    // incremental, otherwise give up on this form.
                    if (isIncremental)
                    {
                        goto finished;
                    }
                    prefixMatched = false;
                    break;
                }
                char actual = text[cursor++];
                if (actual != spec[patternBase + k])
                {
                    prefixMatched = false;
                    break;
                }
            }
            if (!prefixMatched)
            {
                continue;
            }

            // --- digits ---
            int codePoint = 0;
            int digitCount = 0;
            while (true)
            {
                if (cursor >= limit)
                {
                    // Input exhausted while reading digits: partial match in incremental mode.
                    if (cursor > start && isIncremental)
                    {
                        goto finished;
                    }
                    break;
                }
                int ch = text.Char32At(cursor);
                int digit = UCharacter.Digit(ch, radix);
                if (digit < 0)
                {
                    break;
                }
                cursor += UTF16.GetCharCount(ch);
                codePoint = (codePoint * radix) + digit;
                ++digitCount;
                if (digitCount == maxDigits)
                {
                    break;
                }
            }
            if (digitCount < minDigits)
            {
                continue;
            }

            // --- suffix ---
            bool suffixMatched = true;
            for (int k = 0; k < suffixLen; ++k)
            {
                if (cursor >= limit)
                {
                    // Input exhausted inside the suffix: partial match in incremental mode.
                    if (cursor > start && isIncremental)
                    {
                        goto finished;
                    }
                    suffixMatched = false;
                    break;
                }
                char actual = text[cursor++];
                if (actual != spec[patternBase + prefixLen + k])
                {
                    suffixMatched = false;
                    break;
                }
            }
            if (!suffixMatched)
            {
                continue;
            }

            // Full match: substitute the escape with the character it denotes
            // and shrink/grow the limit by the change in length.
            string replacement = UTF16.ValueOf(codePoint);
            text.Replace(start, cursor, replacement);
            limit -= cursor - start - replacement.Length;

            // Done with the forms for this position; move on to the next input character.
            break;
        }

        if (start < limit)
        {
            start += UTF16.GetCharCount(text.Char32At(start));
        }
    }

finished:
    // ContextLimit must be adjusted before pos.Limit is overwritten, since the
    // adjustment is the difference between the new and the old limit.
    pos.ContextLimit += limit - pos.Limit;
    pos.Limit = limit;
    pos.Start = start;
}
/// <summary>
/// Reads the embedded resource <c>rbbitst.txt</c>, parses it with a small
/// state machine, and runs <c>executeTest</c> for every data block it contains.
/// </summary>
/// <remarks>
/// The parser recognizes, outside of data blocks: '#' comments, tags selecting a
/// built-in break iterator (word, char, line, sent, title), a locale tag, and
/// rules / badrules blocks holding custom rule source. Inside a data block it
/// recognizes the bullet character and numeric tags as expected break positions,
/// <c>\N{name}</c> named characters, backslash escapes, and backslash-newline
/// line continuations. Problems are reported through <c>Errln</c>; syntax errors
/// that make further parsing meaningless end the test early.
/// </remarks>
public void TestExtended()
{
    TestParams tp = new TestParams();

    //
    //  Open and read the test data file.
    //
    StringBuilder testFileBuf = new StringBuilder();
    try
    {
        // 'using' guarantees both the stream and the reader are released on every
        // path, including when an exception is thrown while opening or reading.
        using (Stream @is = typeof(RBBITestExtended).GetTypeInfo().Assembly.GetManifestResourceStream("ICU4N.Dev.Test.Rbbi.rbbitst.txt"))
        {
            if (@is == null)
            {
                Errln("Could not open test data file rbbitst.txt");
                return;
            }
            using (StreamReader isr = new StreamReader(@is, Encoding.UTF8))
            {
                int count = 0;
                for (; ; )
                {
                    int ch = isr.Read();
                    if (ch < 0)
                    {
                        break;
                    }
                    count++;
                    if (ch == 0xFEFF && count == 1)
                    {
                        // BOM in the test data file. Discard it.
                        continue;
                    }
                    testFileBuf.AppendCodePoint(ch);
                }
            }
        }
    }
    catch (IOException e)
    {
        Errln(e.ToString());
        return;
    }

    String testString = testFileBuf.ToString();

    const int PARSE_COMMENT = 1;
    const int PARSE_TAG = 2;
    const int PARSE_DATA = 3;
    const int PARSE_NUM = 4;
    const int PARSE_RULES = 5;

    int parseState = PARSE_TAG;
    int savedState = PARSE_TAG;

    int lineNum = 1;
    int colStart = 0;
    int column = 0;
    int charIdx = 0;

    int tagValue = 0;                          // The numeric value of a <nnn> tag.
    StringBuilder rules = new StringBuilder(); // Holds rules from a <rules> ... </rules> block
    int rulesFirstLine = 0;                    // Line number of the start of current <rules> block

    int len = testString.Length;

    // NOTE(review): charIdx advances by one UTF-16 unit even though c is read as a
    // code point, so a literal supplementary character would be visited twice.
    // Left as-is because every "charIdx - 1" tag check below relies on it.
    for (charIdx = 0; charIdx < len;)
    {
        int c = testString.CodePointAt(charIdx);
        charIdx++;
        if (c == '\r' && charIdx < len && testString[charIdx] == '\n')
        {
            // treat CRLF as a unit
            c = '\n';
            charIdx++;
        }
        if (c == '\n' || c == '\r')
        {
            lineNum++;
            colStart = charIdx;
        }
        column = charIdx - colStart + 1;

        switch (parseState)
        {
            case PARSE_COMMENT:
                // A comment runs to the end of the line, then the previous state resumes.
                if (c == 0x0a || c == 0x0d)
                {
                    parseState = savedState;
                }
                break;

            case PARSE_TAG:
                {
                    if (c == '#')
                    {
                        parseState = PARSE_COMMENT;
                        savedState = PARSE_TAG;
                        break;
                    }
                    if (UCharacter.IsWhitespace(c))
                    {
                        break;
                    }
                    // charIdx already points one past the '<', hence "charIdx - 1" for
                    // the match and "tag length - 1" for the skip.
                    if (testString.StartsWith("<word>", charIdx - 1))
                    {
                        tp.bi = BreakIterator.GetWordInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.StartsWith("<char>", charIdx - 1))
                    {
                        tp.bi = BreakIterator.GetCharacterInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.StartsWith("<line>", charIdx - 1))
                    {
                        tp.bi = BreakIterator.GetLineInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.StartsWith("<sent>", charIdx - 1))
                    {
                        tp.bi = BreakIterator.GetSentenceInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.StartsWith("<title>", charIdx - 1))
                    {
                        tp.bi = BreakIterator.GetTitleInstance(tp.currentLocale);
                        charIdx += 6;
                        break;
                    }
                    if (testString.StartsWith("<rules>", charIdx - 1) ||
                        testString.StartsWith("<badrules>", charIdx - 1))
                    {
                        charIdx = testString.IndexOf('>', charIdx) + 1;
                        parseState = PARSE_RULES;
                        rules.Length = 0;
                        rulesFirstLine = lineNum;
                        break;
                    }
                    if (testString.StartsWith("<locale ", charIdx - 1))
                    {
                        int closeIndex = testString.IndexOf(">", charIdx);
                        if (closeIndex < 0)
                        {
                            Errln("line" + lineNum + ": missing close on <locale tag.");
                            break;
                        }
                        String localeName = testString.Substring(charIdx + 6, closeIndex - (charIdx + 6)); // ICU4N: Corrected 2nd parameter
                        localeName = localeName.Trim();
                        tp.currentLocale = new ULocale(localeName);
                        charIdx = closeIndex + 1;
                        break;
                    }
                    if (testString.StartsWith("<data>", charIdx - 1))
                    {
                        parseState = PARSE_DATA;
                        charIdx += 5;
                        tp.dataToBreak.Length = 0;
                        Arrays.Fill(tp.expectedBreaks, 0);
                        Arrays.Fill(tp.srcCol, 0);
                        Arrays.Fill(tp.srcLine, 0);
                        break;
                    }

                    Errln("line" + lineNum + ": Tag expected in test file.");
                    return;
                }

            case PARSE_RULES:
                if (testString.StartsWith("</rules>", charIdx - 1))
                {
                    charIdx += 7;
                    parseState = PARSE_TAG;
                    try
                    {
                        tp.bi = new RuleBasedBreakIterator(rules.ToString());
                    }
                    catch (ArgumentException e)
                    {
                        Errln(String.Format("rbbitst.txt:{0}  Error creating break iterator from rules.  {1}", lineNum, e));
                    }
                }
                else if (testString.StartsWith("</badrules>", charIdx - 1))
                {
                    charIdx += 10;
                    parseState = PARSE_TAG;
                    bool goodRules = true;
                    try
                    {
                        new RuleBasedBreakIterator(rules.ToString());
                    }
                    catch (ArgumentException)
                    {
                        // Expected: a badrules block must fail to compile.
                        goodRules = false;
                    }
                    if (goodRules)
                    {
                        Errln(String.Format(
                                "rbbitst.txt:{0}  Expected, but did not get, a failure creating break iterator from rules.",
                                lineNum));
                    }
                }
                else
                {
                    rules.AppendCodePoint(c);
                }
                break;

            case PARSE_DATA:
                if (c == '•')
                {
                    RecordExpectedBreak(tp, -1, lineNum, column);
                    break;
                }

                if (testString.StartsWith("</data>", charIdx - 1))
                {
                    // Add final entry to mappings from break location to source file position.
                    //  Need one extra because last break position returned is after the
                    //    last char in the data, not at the last char.
                    int idx = tp.dataToBreak.Length;
                    tp.srcLine[idx] = lineNum;
                    tp.srcCol[idx] = column;

                    parseState = PARSE_TAG;
                    charIdx += 6;

                    // RUN THE TEST!
                    executeTest(tp);
                    break;
                }

                if (testString.StartsWith("\\N{", charIdx - 1))
                {
                    int nameEndIdx = testString.IndexOf('}', charIdx);
                    if (nameEndIdx == -1)
                    {
                        Errln("Error in named character in test file at line " + lineNum +
                                ", col " + column);
                        // Without a closing brace there is no name to extract; taking the
                        // substring below would throw, so stop parsing here.
                        return;
                    }
                    // Named character, e.g. \N{COMBINING GRAVE ACCENT}
                    // Get the code point from the name and insert it into the test data.
                    String charName = testString.Substring(charIdx + 2, nameEndIdx - (charIdx + 2)); // ICU4N: Corrected 2nd parameter
                    c = UCharacter.GetCharFromName(charName);
                    if (c == -1)
                    {
                        Errln("Error in named character in test file at line " + lineNum +
                                ", col " + column);
                    }
                    else
                    {
                        // Named code point was recognized.  Insert it
                        //   into the test data.
                        tp.dataToBreak.AppendCodePoint(c);
                        RecordSourcePosition(tp, lineNum, column);
                    }
                    if (nameEndIdx > charIdx)
                    {
                        charIdx = nameEndIdx + 1;
                    }
                    break;
                }

                if (testString.StartsWith("<>", charIdx - 1))
                {
                    charIdx++;
                    RecordExpectedBreak(tp, -1, lineNum, column);
                    break;
                }

                if (c == '<')
                {
                    tagValue = 0;
                    parseState = PARSE_NUM;
                    break;
                }

                if (c == '#' && column == 3)
                {   // TODO:  why is column off so far?
                    parseState = PARSE_COMMENT;
                    savedState = PARSE_DATA;
                    break;
                }

                if (c == '\\')
                {
                    if (charIdx >= len)
                    {
                        // The backslash is the last character of the file, so there is
                        // nothing to escape or continue. Drop it; the unterminated data
                        // block is reported after the loop.
                        break;
                    }

                    // Check for \ at end of line, a line continuation.
                    //     Advance over (discard) the newline
                    int cp = testString.CodePointAt(charIdx);
                    if (cp == '\r' && charIdx + 1 < len && testString.CodePointAt(charIdx + 1) == '\n')
                    {
                        // We have a CR LF
                        //  Need an extra increment of the input ptr to move over both of them
                        charIdx++;
                    }
                    if (cp == '\n' || cp == '\r')
                    {
                        lineNum++;
                        column = 0;
                        charIdx++;
                        colStart = charIdx;
                        break;
                    }

                    // Let unescape handle the back slash.
                    int[] charIdxAr = new int[1];
                    charIdxAr[0] = charIdx;
                    cp = Utility.UnescapeAt(testString, charIdxAr);
                    if (cp != -1)
                    {
                        // Escape sequence was recognized.  Insert the char
                        //   into the test data.
                        charIdx = charIdxAr[0];
                        tp.dataToBreak.AppendCodePoint(cp);
                        RecordSourcePosition(tp, lineNum, column);
                        break;
                    }

                    // Not a recognized backslash escape sequence.
                    // Take the next char as a literal.
                    //  TODO:  Should this be an error?
                    c = testString.CodePointAt(charIdx);
                    charIdx = testString.OffsetByCodePoints(charIdx, 1);
                }

                // Normal, non-escaped data char.
                tp.dataToBreak.AppendCodePoint(c);

                // Save the mapping from offset in the data to line/column numbers in
                //   the original input file.  Will be used for better error messages only.
                //   If there's an expected break before this char, the slot in the mapping
                //     vector will already be set for this char; don't overwrite it.
                RecordSourcePosition(tp, lineNum, column);
                break;

            case PARSE_NUM:
                // We are parsing an expected numeric tag value, like <1234>,
                //   within a chunk of data.
                if (UCharacter.IsWhitespace(c))
                {
                    break;
                }

                if (c == '>')
                {
                    // Finished the number.  Add the info to the expected break data,
                    //   and switch parse state back to doing plain data.
                    parseState = PARSE_DATA;
                    if (tagValue == 0)
                    {
                        tagValue = -1;
                    }
                    RecordExpectedBreak(tp, tagValue, lineNum, column);
                    break;
                }

                if (UCharacter.IsDigit(c))
                {
                    tagValue = tagValue * 10 + UCharacter.Digit(c);
                    break;
                }

                Errln(String.Format("Syntax Error in rbbitst.txt at line {0}, col {1}", lineNum, column));
                return;
        }
    }

    // Reached end of test file. Raise an error if parseState indicates that we are
    //   within a block that should have been terminated.
    if (parseState == PARSE_RULES)
    {
        Errln(String.Format("rbbitst.txt:{0} <rules> block beginning at line {1} is not closed.",
            lineNum, rulesFirstLine));
    }
    if (parseState == PARSE_DATA)
    {
        Errln(String.Format("rbbitst.txt:{0} <data> block not closed.", lineNum));
    }
}

/// <summary>
/// Marks an expected break at the current end of the accumulated test data and
/// remembers the source line/column it came from.
/// </summary>
private static void RecordExpectedBreak(TestParams tp, int breakValue, int lineNum, int column)
{
    int breakIdx = tp.dataToBreak.Length;
    tp.expectedBreaks[breakIdx] = breakValue;
    tp.srcLine[breakIdx] = lineNum;
    tp.srcCol[breakIdx] = column;
}

/// <summary>
/// Fills in the source line/column for the most recently appended data, walking
/// backwards from the last data index until a slot that is already set is found.
/// </summary>
private static void RecordSourcePosition(TestParams tp, int lineNum, int column)
{
    for (int i = tp.dataToBreak.Length - 1; i >= 0 && tp.srcLine[i] == 0; i--)
    {
        tp.srcLine[i] = lineNum;
        tp.srcCol[i] = column;
    }
}