/// <summary>
/// Runs <c>CharacterTest</c> over every code point below 0x10FFFF whose general
/// category is not unassigned, private-use or surrogate. Each code point is tested
/// twice: once alone and once followed by U+0345.
/// </summary>
public void TestExhaustive()
{
    // A single iterator instance is shared by every CharacterTest call.
    CanonicalIterator iterator = new CanonicalIterator("");
    int testedCount = 0;

    for (int codePoint = 0; codePoint < 0x10FFFF; ++codePoint)
    {
        // skip characters we know don't have decomps
        UUnicodeCategory category = UChar.GetUnicodeCategory(codePoint);
        bool knownToHaveNoDecomposition =
            category == UUnicodeCategory.OtherNotAssigned
            || category == UUnicodeCategory.PrivateUse
            || category == UUnicodeCategory.Surrogate;
        if (knownToHaveNoDecomposition)
        {
            continue;
        }

        // Emit a progress line once per 5000 code points actually tested.
        testedCount++;
        if (testedCount % 5000 == 0)
        {
            Logln("Testing " + Utility.Hex(codePoint, 0));
        }

        string single = UTF16.ValueOf(codePoint);
        CharacterTest(single, codePoint, iterator);
        CharacterTest(single + "\u0345", codePoint, iterator);
    }
}
/// <summary>
/// Advances to the next script run, if there is one.
/// </summary>
/// <remarks>
/// On success, <c>scriptStart</c> and <c>scriptLimit</c> delimit the run and
/// <c>scriptCode</c> holds the script resolved for it.
/// </remarks>
/// <returns>true if a run was found; false if the previous run already reached the limit.</returns>
public bool Next()
{
    // Nothing left once the previous run ended at (or beyond) the limit.
    if (scriptLimit >= limit)
    {
        return false;
    }

    // Each run begins where the last one ended and starts out as Common.
    scriptStart = scriptLimit;
    scriptCode = UScript.Common;

    while (index < limit)
    {
        int ch = UTF16.CharAt(text, start, limit, index - start);
        int sc = GetScript(ch);

        /*
         * From UTR #24: Implementations that determine the boundaries between
         * characters of given scripts should never break between a non-spacing
         * mark and its base character. Thus for boundary determinations and
         * similar sorts of processing, a non-spacing mark — whatever its script
         * value — should inherit the script value of its base character.
         */
        bool staysInRun = IsSameScript(scriptCode, sc)
            || UChar.GetUnicodeCategory(ch) == UUnicodeCategory.NonSpacingMark;
        if (!staysInRun)
        {
            break;
        }

        index += UTF16.GetCharCount(ch);

        /*
         * Inherited or Common becomes the script code of the surrounding text.
         */
        bool runIsUnresolved = scriptCode <= UScript.Inherited;
        if (runIsUnresolved && sc > UScript.Inherited)
        {
            scriptCode = sc;
        }
    }

    scriptLimit = index;
    return true;
}
/// <summary>
/// Builds a random test string made only of code points whose general category
/// is not <c>OtherNotAssigned</c>.
/// </summary>
/// <remarks>
/// The loop bound <c>random.Next(maxCharCount) + 1</c> is drawn again on every
/// iteration, so the resulting length is not a single uniform draw. This is kept
/// as-is so that the sequence of random calls (and therefore the output for a given
/// seed) is unchanged.
/// </remarks>
/// <returns>
/// The generated string. Presumably never empty, assuming <c>random.Next</c> returns
/// a non-negative value — verify against the declared type of <c>random</c>.
/// </returns>
internal String GetTestSource()
{
    if (random == null)
    {
        random = CreateRandom(); // use test framework's random seed
    }

    // Accumulate in a builder instead of re-allocating the whole string on each append.
    System.Text.StringBuilder source = new System.Text.StringBuilder();
    int i = 0;
    while (i < (random.Next(maxCharCount) + 1))
    {
        int codepoint = random.Next(maxCodePoint);

        // Eliminate unassigned characters: redraw until the code point is assigned.
        while (UChar.GetUnicodeCategory(codepoint) == UUnicodeCategory.OtherNotAssigned)
        {
            codepoint = random.Next(maxCodePoint);
        }

        source.Append(UTF16.ValueOf(codepoint));
        i++;
    }

    return source.ToString();
}
/// <summary>
/// Returns a mask with exactly one bit set, at the position given by the integer
/// value of the general category of <paramref name="c"/>.
/// </summary>
/// <param name="c">The code point whose general category is looked up.</param>
/// <returns><c>1</c> shifted left by the category's integer value.</returns>
private static int U_GET_GC_MASK(int c)
{
    int categoryIndex = UChar.GetUnicodeCategory(c).ToInt32();
    int mask = 1 << categoryIndex;
    return mask;
}
/// <summary>
/// Inserts <c>insertion</c> at every break-iterator boundary inside
/// [<c>pos.Start</c>, <c>pos.Limit</c>) where the code points on both sides of the
/// boundary have a general category contained in <c>LETTER_OR_MARK_MASK</c>.
/// </summary>
/// <param name="text">The text to scan and modify in place.</param>
/// <param name="pos">
/// The range to process. On return, <c>ContextLimit</c> and <c>Limit</c> have grown by
/// the total inserted length and <c>Start</c> has been advanced.
/// </param>
/// <param name="incremental">
/// When true, <c>pos.Start</c> ends just after the last recorded boundary (shifted by
/// the inserted length); otherwise it ends at the new <c>pos.Limit</c>.
/// </param>
protected override void HandleTransliterate(IReplaceable text, TransliterationPosition pos, bool incremental)
{
    lock (this)
    {
        boundaryCount = 0;
        GetBreakIterator(); // Lazy-create it if necessary
        bi.SetText(new ReplaceableCharacterIterator(text, pos.Start, pos.Limit, pos.Start));

        // To make things much easier, we stack the boundaries first and insert afterwards.
        // Generally we won't need too many, since we will be filtered.
        int candidate = bi.First();
        while (candidate != BreakIterator.Done && candidate < pos.Limit)
        {
            // A boundary at offset 0 has no preceding character to inspect.
            if (candidate != 0)
            {
                // HACK: check that the preceding item was a letter or mark...
                int categoryBefore = UChar.GetUnicodeCategory(UTF16.CharAt(text, candidate - 1)).ToInt32();
                if (((1 << categoryBefore) & LETTER_OR_MARK_MASK) != 0)
                {
                    // ...and that the following item is one as well.
                    int categoryAfter = UChar.GetUnicodeCategory(UTF16.CharAt(text, candidate)).ToInt32();
                    if (((1 << categoryAfter) & LETTER_OR_MARK_MASK) != 0)
                    {
                        if (boundaryCount >= boundaries.Length)
                        {
                            // Out of room: double the stack, keeping existing entries.
                            System.Array.Resize(ref boundaries, boundaries.Length * 2);
                        }
                        boundaries[boundaryCount++] = candidate;
                    }
                }
            }

            candidate = bi.Next();
        }

        int delta = 0;
        int lastBoundary = 0;

        if (boundaryCount != 0)
        {
            // We found something, so compute the adjustments before the stack is drained.
            delta = boundaryCount * insertion.Length;
            lastBoundary = boundaries[boundaryCount - 1];

            // Insert from the end backwards so the earlier offsets never need updating.
            while (boundaryCount > 0)
            {
                int insertAt = boundaries[--boundaryCount];
                text.Replace(insertAt, insertAt, insertion);
            }
        }

        // Now fix up the return values.
        pos.ContextLimit += delta;
        pos.Limit += delta;
        if (incremental)
        {
            pos.Start = lastBoundary + delta;
        }
        else
        {
            pos.Start = pos.Limit;
        }
    }
}