/// <summary>
/// Return the number of bytes that Compress() would write.
/// </summary>
/// <remarks>
/// The loop mirrors <c>Compress</c> step for step: for every code point the
/// base value is derived from the previous code point, the difference is
/// measured, and only then is the iterator advanced. (Measuring after
/// advancing would skip the first code point and count the DONE sentinel
/// instead.)
/// </remarks>
/// <param name="source">text source string</param>
/// <returns>the length of the BOCU result</returns>
/// <seealso cref="M:IBM.ICU.Impl.BOCU.Compress(System.String, System.Byte[], System.Int32)"/>
public static int GetCompressionLength(String source)
{
    int prev = 0;
    int result = 0;
    UCharacterIterator iterator = IBM.ICU.Text.UCharacterIterator.GetInstance(source);
    int codepoint = iterator.NextCodePoint();
    while (codepoint != IBM.ICU.Text.UForwardCharacterIterator_Constants.DONE)
    {
        if (prev < 0x4e00 || prev >= 0xa000)
        {
            prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
        }
        else
        {
            // Unihan U+4e00..U+9fa5:
            // double-bytes down from the upper end
            prev = 0x9fff - SLOPE_REACH_POS_2_;
        }

        // Measure the current code point first, then advance -- the same
        // order Compress uses when it writes the difference.
        result += LengthOfDiff(codepoint - prev);
        prev = codepoint;
        codepoint = iterator.NextCodePoint();
    }
    return result;
}
// public constructors --------------------------------------------------

// public methods -------------------------------------------------------

/// <summary>
/// <p>
/// Encodes the code points of a string as a byte sequence that preserves
/// lexical order.
/// </p>
/// <p>
/// Use GetCompressionLength(String) to preflight the minimum buffer size
/// needed for the output.
/// </p>
/// </summary>
/// <param name="source">text source</param>
/// <param name="buffer">output buffer</param>
/// <param name="offset">index in <paramref name="buffer"/> at which writing starts</param>
/// <returns>end offset where the writing stopped</returns>
/// <seealso cref="M:IBM.ICU.Impl.BOCU.GetCompressionLength(System.String)"/>
/// <exception cref="IndexOutOfRangeException">
/// presumably raised by the write helper when <paramref name="buffer"/> is too
/// small for the output -- confirm against WriteDiff.
/// </exception>
public static int Compress(String source, byte[] buffer, int offset)
{
    UCharacterIterator iterator = IBM.ICU.Text.UCharacterIterator.GetInstance(source);
    int previous = 0;

    for (int current = iterator.NextCodePoint();
         current != IBM.ICU.Text.UForwardCharacterIterator_Constants.DONE;
         current = iterator.NextCodePoint())
    {
        // Unihan U+4e00..U+9fa5 uses a fixed base (double-bytes down from the
        // upper end); everything else uses a base derived from the previous
        // code point's 128-wide block.
        bool previousInUnihan = previous >= 0x4e00 && previous < 0xa000;
        int basePoint = previousInUnihan
            ? 0x9fff - SLOPE_REACH_POS_2_
            : (previous & ~0x7f) - SLOPE_REACH_NEG_1_;

        offset = WriteDiff(current - basePoint, buffer, offset);
        previous = current;
    }

    return offset;
}
/// <summary>
/// Regression test: iterating by code point over unpaired surrogates must
/// terminate (reach DONE) and return the lone surrogates as-is.
/// </summary>
public void TestJitterbug1952()
{
    // Source made of surrogates that do not form a pair at either end.
    char[] src = { '\uDC00', '\uD800', '\uDC01', '\uD802', '\uDC02', '\uDC03' };
    UCharacterIterator iter = UCharacterIterator.GetInstance(src);

    // Walk backwards from index 1. This must never become an infinite loop;
    // if it does, the iterator is broken.
    iter.Index = 1;
    for (int ch = iter.PreviousCodePoint();
         ch != UCharacterIterator.DONE;
         ch = iter.PreviousCodePoint())
    {
        if (ch != 0xDC00)
        {
            Errln("iter.PreviousCodePoint() failed");
        }
    }

    // Walk forwards from index 5 (the last code unit).
    iter.Index = 5;
    for (int ch = iter.NextCodePoint();
         ch != UCharacterIterator.DONE;
         ch = iter.NextCodePoint())
    {
        if (ch != 0xDC03)
        {
            Errln("iter.NextCodePoint() failed");
        }
    }
}
/// <summary>
/// Maps <paramref name="src"/> via <c>Map</c>, then validates the mapped text:
/// rejects prohibited code points (U+0020 is exempt) and enforces the two
/// BiDi rules checked below.
/// </summary>
/// <param name="src">text to prepare</param>
/// <param name="options">options forwarded to <c>Map</c></param>
/// <returns>a new buffer holding the mapped text</returns>
/// <exception cref="StringPrepParseException">
/// on a prohibited code point or a BiDi rule violation
/// </exception>
public StringBuffer Prepare(String src, StringPrepOptions options)
{
    String mapped = Map(src, options);
    UCharacterIterator iter = UCharacterIterator.GetInstance(mapped);

    // CharDirectionCount serves as the "not seen yet" sentinel.
    UCharacterDirection firstDirection = UCharacterDirectionExtensions.CharDirectionCount;
    UCharacterDirection lastDirection = UCharacterDirectionExtensions.CharDirectionCount;
    int lastRtlIndex = -1;
    int lastLtrIndex = -1;
    bool sawRtl = false;
    bool sawLtr = false;

    for (int ch = iter.NextCodePoint();
         ch != UCharacterIterator.Done;
         ch = iter.NextCodePoint())
    {
        if (transform.prohibitedSet.Contains(ch) && ch != 0x0020)
        {
            throw new StringPrepParseException(
                "A prohibited code point was found in the input",
                StringPrepErrorType.ProhibitedError,
                iter.GetText(),
                iter.Index);
        }

        lastDirection = UChar.GetDirection(ch);
        if (firstDirection == UCharacterDirectionExtensions.CharDirectionCount)
        {
            firstDirection = lastDirection;
        }

        if (lastDirection == UCharacterDirection.LeftToRight)
        {
            sawLtr = true;
            lastLtrIndex = iter.Index - 1;
        }
        else if (lastDirection == UCharacterDirection.RightToLeft
                 || lastDirection == UCharacterDirection.RightToLeftArabic)
        {
            sawRtl = true;
            lastRtlIndex = iter.Index - 1;
        }
    }

    bool startsRtl = firstDirection == UCharacterDirection.RightToLeft
                     || firstDirection == UCharacterDirection.RightToLeftArabic;
    bool endsRtl = lastDirection == UCharacterDirection.RightToLeft
                   || lastDirection == UCharacterDirection.RightToLeftArabic;

    // satisfy 2: left-to-right and right-to-left characters must not be mixed
    bool mixedDirections = sawLtr && sawRtl;
    // satisfy 3: text containing right-to-left characters must both start and
    // end with one
    bool rtlNotAtBothEnds = sawRtl && (!startsRtl || !endsRtl);

    if (mixedDirections || rtlNotAtBothEnds)
    {
        int errorIndex = (lastRtlIndex > lastLtrIndex) ? lastRtlIndex : lastLtrIndex;
        throw new StringPrepParseException(
            "The input does not conform to the rules for BiDi code points.",
            StringPrepErrorType.CheckBiDiError,
            iter.GetText(),
            errorIndex);
    }

    return new StringBuffer(mapped);
}
/// <summary>
/// Runs <paramref name="src"/> through <c>mapTransform</c> and, unless
/// unassigned code points are allowed by <paramref name="options"/>, rejects
/// any code point contained in <c>transform.unassignedSet</c>.
/// </summary>
/// <param name="src">text to map</param>
/// <param name="options">option flags; ALLOW_UNASSIGNED disables the unassigned check</param>
/// <returns>the transliterated text</returns>
/// <exception cref="StringPrepParseException">
/// when an unassigned code point is found and not allowed
/// </exception>
private String Map(String src, StringPrepOptions options)
{
    // map
    bool unassignedPermitted = (options & ALLOW_UNASSIGNED) > 0;
    String mapped = mapTransform.Transliterate(src);

    UCharacterIterator iter = UCharacterIterator.GetInstance(mapped);
    for (int ch = iter.NextCodePoint();
         ch != UCharacterIterator.Done;
         ch = iter.NextCodePoint())
    {
        if (transform.unassignedSet.Contains(ch) && !unassignedPermitted)
        {
            throw new StringPrepParseException(
                "An unassigned code point was found in the input",
                StringPrepErrorType.UnassignedError);
        }
    }

    return mapped;
}
/// <summary>
/// Converts <paramref name="input"/> to an array of code points and encodes
/// it with the array-based <c>Encode</c> overload, growing the output buffer
/// fourfold each time that overload reports <c>punycode_big_output</c>.
/// </summary>
/// <param name="input">text to encode</param>
/// <param name="case_flags">passed through unchanged to the array-based overload</param>
/// <returns>
/// the encoded text on success; otherwise whatever follows
/// <c>GetException</c> (an empty buffer, if that helper returns at all)
/// </returns>
public static StringBuffer Encode(StringBuffer input, char[] case_flags)
{
    // Collect the input as code points; there are at most input.Length of them.
    int[] codePoints = new int[input.Length];
    int codePointCount = 0;
    UCharacterIterator iter = UCharacterIterator.GetInstance(input);
    for (int ch = iter.NextCodePoint();
         ch != UCharacterIterator.Done;
         ch = iter.NextCodePoint())
    {
        codePoints[codePointCount++] = ch;
    }

    // Single-element array acts as an in/out capacity-and-length parameter.
    int[] outLen = new int[1];
    outLen[0] = input.Length * 4;
    char[] output = new char[outLen[0]];

    int rc = Encode(codePointCount, codePoints, case_flags, outLen, output);
    while (rc == punycode_big_output)
    {
        // Output did not fit: enlarge the buffer and convert again.
        outLen[0] = outLen[0] * 4;
        output = new char[outLen[0]];
        rc = Encode(codePointCount, codePoints, case_flags, outLen, output);
    }

    StringBuffer result = new StringBuffer();
    if (rc != punycode_success)
    {
        GetException(rc);
        return result;
    }

    return result.Append(output, 0, outLen[0]);
}
/// <summary>
/// Checks basic positioning (start, limit, length) and then verifies that
/// code-point iteration agrees with code-unit iteration in both directions
/// over ITERATION_STRING_, which is expected to hold one supplementary
/// character starting at ITERATION_SUPPLEMENTARY_INDEX.
/// </summary>
public void TestIteration()
{
    UCharacterIterator iterator = UCharacterIterator.GetInstance(ITERATION_STRING_);
    UCharacterIterator iterator2 = UCharacterIterator.GetInstance(ITERATION_STRING_);

    iterator.SetToStart();
    if (iterator.Current != ITERATION_STRING_[0])
    {
        Errln("Iterator failed retrieving first character");
    }
    iterator.SetToLimit();
    if (iterator.Previous() != ITERATION_STRING_[ITERATION_STRING_.Length - 1])
    {
        Errln("Iterator failed retrieving last character");
    }
    if (iterator.Length != ITERATION_STRING_.Length)
    {
        Errln("Iterator failed determining begin and end index");
    }

    // Forward: iterator2 steps by code point, iterator by code unit.
    iterator2.Index = 0;
    iterator.Index = 0;
    int ch = 0;
    while (ch != UCharacterIterator.DONE)
    {
        int index = iterator2.Index;
        ch = iterator2.NextCodePoint();
        if (index != ITERATION_SUPPLEMENTARY_INDEX)
        {
            if (ch != iterator.Next() && ch != UCharacterIterator.DONE)
            {
                Errln("Error mismatch in next() and nextCodePoint()");
            }
        }
        else
        {
            if (UTF16.GetLeadSurrogate(ch) != iterator.Next()
                || UTF16.GetTrailSurrogate(ch) != iterator.Next())
            {
                Errln("Error mismatch in next and nextCodePoint for "
                      + "supplementary characters");
            }
        }
    }

    // Backward: same comparison from the end of the text.
    iterator.Index = ITERATION_STRING_.Length;
    iterator2.Index = ITERATION_STRING_.Length;
    // The forward loop leaves ch == DONE; reset it or this loop never runs.
    ch = 0;
    while (ch != UCharacterIterator.DONE)
    {
        int index = iterator2.Index;
        ch = iterator2.PreviousCodePoint();
        // Going backward, the supplementary character is reached when the
        // index sits just past its two code units.
        if (index != ITERATION_SUPPLEMENTARY_INDEX + 2)
        {
            if (ch != iterator.Previous() && ch != UCharacterIterator.DONE)
            {
                Errln("Error mismatch in previous() and "
                      + "previousCodePoint()");
            }
        }
        else
        {
            // Code-unit iteration backward yields the trail surrogate first,
            // then the lead surrogate.
            if (UTF16.GetTrailSurrogate(ch) != iterator.Previous()
                || UTF16.GetLeadSurrogate(ch) != iterator.Previous())
            {
                Errln("Error mismatch in previous and "
                      + "previousCodePoint for supplementary characters");
            }
        }
    }
}
/// <summary>
/// Drives <paramref name="iter"/> and a reference iterator over the expected
/// code points through the same scripted sequence of previous/current/next
/// moves, reporting the first mismatch in returned code point or in position.
/// </summary>
/// <param name="iter">
/// iterator under test; its text is assumed to encode the four expected code
/// points in 6 UTF-16 code units (see <c>expectIndex</c>) -- confirm against callers
/// </param>
public void previousNext(UCharacterIterator iter)
{
    int[] expect = { 0x2f999, 0x1d15f, 0xc4, 0x1ed0 };

    // expected src indexes corresponding to expect indexes
    // (keyed by source index, value is the index into expect)
    int[] expectIndex = {
        0, 0,
        1, 1,
        2,
        3,
        4 //needed
    };

    // initial indexes into the src and expect strings
    int SRC_MIDDLE = 4;
    int EXPECT_MIDDLE = 2;

    // movement vector
    // - for previous(), 0 for current(), + for next()
    String moves = "0+0+0--0-0-+++0--+++++++0--------";

    UCharIterator iter32 = new UCharIterator(expect, expect.Length, EXPECT_MIDDLE);

    int c1, c2;
    char m;

    // initially set the indexes into the middle of the strings
    iter.Index = SRC_MIDDLE;

    // move around and compare the iteration code points with
    // the expected ones
    int movesIndex = 0;
    while (movesIndex < moves.Length)
    {
        m = moves[movesIndex++];
        if (m == '-')
        {
            c1 = iter.PreviousCodePoint();
            c2 = iter32.Previous();
        }
        else if (m == '0')
        {
            c1 = iter.CurrentCodePoint;
            c2 = iter32.Current;
        }
        else
        {
            // m == '+'
            c1 = iter.NextCodePoint();
            c2 = iter32.Next();
        }

        // compare results
        if (c1 != c2)
        {
            // the moves made so far, including the current one
            String history = moves.Substring(0, movesIndex);
            Errln("error: mismatch in Normalizer iteration at " + history + ": "
                  + "got c1= " + Hex(c1) + " != expected c2= " + Hex(c2));
            break;
        }

        // compare indexes
        if (expectIndex[iter.Index] != iter32.Index)
        {
            // the moves made so far, including the current one
            String history = moves.Substring(0, movesIndex);
            // Report the values that were actually compared: the source index
            // mapped through expectIndex versus the reference iterator's index.
            Errln("error: index mismatch in Normalizer iteration at " + history + " : "
                  + "Normalizer index " + iter.Index
                  + " (mapped to " + expectIndex[iter.Index] + ")"
                  + " expected " + iter32.Index);
            break;
        }
    }
}
/// <summary>
/// Exercises code-point oriented access on a string that mixes BMP and
/// supplementary characters: GetText, CurrentCodePoint, MoveCodePointIndex
/// in both directions, and a full forward pass with NextCodePoint, each
/// compared against <c>UTF16.CharAt</c> on the same text.
/// </summary>
public void TestIterationUChar32()
{
    String text = "\u0061\u0062\ud841\udc02\u20ac\ud7ff\ud842\udc06\ud801\udc00\u0061";
    int c;
    int i;

    UCharacterIterator iter = UCharacterIterator.GetInstance(text);

    String iterText = iter.GetText();
    if (!iterText.Equals(text))
    {
        Errln("iter.getText() failed");
    }

    iter.Index = 1;
    if (iter.CurrentCodePoint != UTF16.CharAt(text, 1))
    {
        Errln("Iterator didn't start out in the right place.");
    }

    iter.SetToStart();
    c = iter.CurrentCodePoint;
    i = iter.MoveCodePointIndex(1);
    c = iter.CurrentCodePoint;
    if (c != UTF16.CharAt(text, 1) || i != 1)
    {
        // "expected" is the reference value from the string, "got" is what
        // the iterator returned.
        Errln("moveCodePointIndex(1) didn't work correctly expected "
              + Hex(UTF16.CharAt(text, 1)) + " got " + Hex(c) + " i= " + i);
    }

    i = iter.MoveCodePointIndex(2);
    c = iter.CurrentCodePoint;
    if (c != UTF16.CharAt(text, 4) || i != 4)
    {
        Errln("moveCodePointIndex(2) didn't work correctly expected "
              + Hex(UTF16.CharAt(text, 4)) + " got " + Hex(c) + " i= " + i);
    }

    i = iter.MoveCodePointIndex(-2);
    c = iter.CurrentCodePoint;
    if (c != UTF16.CharAt(text, 1) || i != 1)
    {
        Errln("moveCodePointIndex(-2) didn't work correctly expected "
              + Hex(UTF16.CharAt(text, 1)) + " got " + Hex(c) + " i= " + i);
    }

    // Two code points back from the limit: 'a' plus one surrogate pair,
    // i.e. three code units.
    iter.SetToLimit();
    i = iter.MoveCodePointIndex(-2);
    c = iter.CurrentCodePoint;
    if (c != UTF16.CharAt(text, (text.Length - 3)) || i != (text.Length - 3))
    {
        Errln("moveCodePointIndex(-2) didn't work correctly expected "
              + Hex(UTF16.CharAt(text, (text.Length - 3))) + " got " + Hex(c) + " i= " + i);
    }

    iter.SetToStart();
    c = iter.CurrentCodePoint;

    //testing first32PostInc, nextCodePointPostInc, setTostart
    i = 0;
    iter.SetToStart();
    c = iter.Next();
    if (c != UTF16.CharAt(text, i))
    {
        Errln("first32PostInc failed. Expected->" + Hex(UTF16.CharAt(text, i)) + " Got-> " + Hex(c));
    }
    if (iter.Index != UTF16.GetCharCount(c) + i)
    {
        Errln("getIndex() after first32PostInc() failed");
    }

    iter.SetToStart();
    i = 0;
    if (iter.Index != 0)
    {
        Errln("setToStart failed");
    }

    Logln("Testing forward iteration...");
    do
    {
        if (c != UCharacterIterator.DONE)
        {
            c = iter.NextCodePoint();
        }

        if (c != UTF16.CharAt(text, i))
        {
            Errln("Character mismatch at position " + i + ", iterator has " + Hex(c)
                  + ", string has " + Hex(UTF16.CharAt(text, i)));
        }

        i += UTF16.GetCharCount(c);
        if (iter.Index != i)
        {
            Errln("getIndex() after nextCodePointPostInc() isn't working right");
        }

        // Peek at the next code point; DONE here ends the loop.
        c = iter.CurrentCodePoint;
        if (c != UCharacterIterator.DONE && c != UTF16.CharAt(text, i))
        {
            Errln("current() after nextCodePointPostInc() isn't working right");
        }
    } while (c != UCharacterIterator.DONE);

    // The iterator is at the end of the text, so one more step must yield DONE.
    c = iter.NextCodePoint();
    if (c != UCharacterIterator.DONE)
    {
        Errln("nextCodePointPostInc() didn't return DONE at the end");
    }
}
/// <summary>
/// Determines whether the text ending at position <paramref name="n"/>
/// matches a break exception: walks backwards through <c>backwardsTrie</c>
/// and, for a partial match, confirms it by walking forwards through
/// <c>forwardsPartialTrie</c>.
/// </summary>
/// <param name="n">The location of the possible break.</param>
/// <returns><c>true</c> if an exception applies at <paramref name="n"/>; otherwise <c>false</c>.</returns>
private bool BreakExceptionAt(int n)
{
    // Note: the C++ version of this function is SimpleFilteredSentenceBreakIterator::breakExceptionAt()
    int matchPosition = -1;
    int matchValue = -1;

    text.Index = n;
    backwardsTrie.Reset();

    // Assume a space is following the '.' (so we handle the case: "Mr. /Brown"):
    // step back over it, or undo the step when the character is not a space.
    // TODO: skip a class of chars here??
    // TODO only do this the 1st time?
    if (text.PreviousCodePoint() != ' ')
    {
        text.NextCodePoint();
    }

    // Walk backwards while there is text left and the trie can still advance.
    Result backwardState = Result.IntermediateValue;
    while (true)
    {
        int codePoint = text.PreviousCodePoint();
        if (codePoint == UCharacterIterator.Done)
        {
            break; // nothing more to consume backwards
        }

        backwardState = backwardsTrie.NextForCodePoint(codePoint);
        if (!backwardState.HasNext())
        {
            break; // nothing more in the trie
        }

        if (backwardState.HasValue())
        {
            // remember the best match so far
            matchPosition = text.Index;
            matchValue = backwardsTrie.GetValue();
        }
    }

    if (backwardState.Matches())
    {
        // exact match?
        matchValue = backwardsTrie.GetValue();
        matchPosition = text.Index;
    }

    if (matchPosition < 0)
    {
        return false; // No exception here.
    }

    if (matchValue == SimpleFilteredSentenceBreakIteratorBuilder.Match)
    {
        return true; // exact match: exception here.
    }

    if (matchValue != SimpleFilteredSentenceBreakIteratorBuilder.Partial
        || forwardsPartialTrie == null)
    {
        return false; // not a partial match, or no forward trie to confirm it
    }

    // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
    // to see if it matches something going forward.
    forwardsPartialTrie.Reset();
    Result forwardState = Result.IntermediateValue;
    text.Index = matchPosition; // hope that's close ..
    while (true)
    {
        int codePoint = text.NextCodePoint();
        if (codePoint == BreakIterator.Done)
        {
            break;
        }

        forwardState = forwardsPartialTrie.NextForCodePoint(codePoint);
        if (!forwardState.HasNext())
        {
            break;
        }
    }

    return forwardState.Matches();
}