/// <summary>
/// Fast version of <see cref="UScript.GetScript(int)"/>. Code points that index into
/// the <c>basicLatin</c> table are resolved with an array lookup.
/// </summary>
/// <param name="codepoint">The code point whose script is requested.</param>
/// <returns>
/// The cached table entry for code points inside the table; otherwise the value of
/// <see cref="UScript.GetScript(int)"/>, remapped when <c>combineCJ</c> is set.
/// </returns>
private int GetScript(int codepoint)
{
    // Fast path: no property lookup for code points covered by the table.
    if (codepoint >= 0 && codepoint < basicLatin.Length)
    {
        return basicLatin[codepoint];
    }

    int script = UScript.GetScript(codepoint);
    if (!combineCJ)
    {
        return script;
    }

    // Han, Hiragana and Katakana are all reported as Japanese when combining.
    if (script == UScript.Han || script == UScript.Hiragana || script == UScript.Katakana)
    {
        return UScript.Japanese;
    }

    if (codepoint >= 0xFF10 && codepoint <= 0xFF19)
    {
        // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
        // they are treated as punctuation. we currently have no cleaner way to fix this!
        return UScript.Latin;
    }

    return script;
}
/// <summary>
/// Walks every code point from 0 through 0x10FFFF and reports an error when
/// <c>UScript.GetScript</c> yields <c>UScript.InvalidCode</c>, or when the long or
/// short name of the returned script contains an "invalid" marker.
/// </summary>
public void TestAllCodepoints()
{
    const int LastCodepoint = 0x10ffff;

    for (int codepoint = 0; codepoint <= LastCodepoint; codepoint++)
    {
        int script = UScript.GetScript(codepoint);
        if (script == UScript.InvalidCode)
        {
            Errln("UScript.getScript for codepoint 0x" + Hex(codepoint) + " failed");
        }

        // Long script name must not carry the "INVALID" marker.
        string longName = UScript.GetName(script);
        bool longNameValid = longName.IndexOf("INVALID", StringComparison.Ordinal) < 0;
        if (!longNameValid)
        {
            Errln("UScript.getScript for codepoint 0x" + Hex(codepoint) + " failed");
        }

        // Short script name must not carry the "INV" marker.
        string shortName = UScript.GetShortName(script);
        bool shortNameValid = shortName.IndexOf("INV", StringComparison.Ordinal) < 0;
        if (!shortNameValid)
        {
            Errln("UScript.getScript for codepoint 0x" + Hex(codepoint) + " failed");
        }
    }
}
/// <summary>
/// Fills every slot of the <c>basicLatin</c> table with the script code that
/// <c>UScript.GetScript</c> reports for the slot's index.
/// </summary>
static ScriptIterator()
{
    int codepoint = 0;
    while (codepoint < basicLatin.Length)
    {
        basicLatin[codepoint] = UScript.GetScript(codepoint);
        codepoint++;
    }
}
/// <summary>
/// Builds a 128-entry table whose element at index <c>i</c> is the script code
/// returned by <c>UScript.GetScript(i)</c>.
/// </summary>
/// <returns>The newly allocated and populated table.</returns>
private static int[] LoadBasicLatin() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
{
    const int TableSize = 128;

    int[] table = new int[TableSize];
    for (int codepoint = 0; codepoint < TableSize; codepoint++)
    {
        table[codepoint] = UScript.GetScript(codepoint);
    }

    return table;
}
/// <summary>
/// Advances to the next run. Returns <c>false</c> once the new start position
/// equals the text limit; otherwise returns <c>true</c>, after which the caller
/// should examine <c>ScriptCode</c>, <c>Start</c>, and <c>Limit</c>.
/// </summary>
/// <returns><c>true</c> if a run was produced; <c>false</c> if the text is exhausted.</returns>
public virtual bool Next()
{
    // Script of the new run is unknown until a non-neutral character is seen.
    ScriptCode = UScript.InvalidCode;
    Start = Limit;

    // Nothing left to scan.
    if (Start == textLimit)
    {
        return false;
    }

    // Pull the start backwards over adjacent COMMON / INHERITED characters.
    while (Start > textStart)
    {
        int previousScript = UScript.GetScript(text.Char32At(Start - 1));
        bool previousIsNeutral = previousScript == UScript.Common || previousScript == UScript.Inherited;
        if (!previousIsNeutral)
        {
            break;
        }

        --Start;
    }

    // Push the limit forwards over COMMON, INHERITED, and characters that
    // share the script of the first non-neutral character encountered.
    while (Limit < textLimit)
    {
        int nextScript = UScript.GetScript(text.Char32At(Limit));
        bool nextIsNeutral = nextScript == UScript.Common || nextScript == UScript.Inherited;
        if (!nextIsNeutral)
        {
            if (ScriptCode == UScript.InvalidCode)
            {
                // First real script seen: it defines this run.
                ScriptCode = nextScript;
            }
            else if (nextScript != ScriptCode)
            {
                // A different script starts here; the run ends before it.
                break;
            }
        }

        ++Limit;
    }

    // True is returned even when the entire run is COMMON / INHERITED, in
    // which case ScriptCode is still UScript.InvalidCode.
    return true;
}
/// <summary>
/// Checks <c>UScript.GetScript</c> for characters which have Script_Extensions:
/// U+0640 is expected to be Common, U+0650 Inherited, and U+FDF2 Arabic.
/// </summary>
public void TestGetScriptOfCharsWithScriptExtensions()
{
    /* test characters which have Script_Extensions */
    bool anyMismatch =
        UScript.GetScript(0x0640) != UScript.Common ||
        UScript.GetScript(0x0650) != UScript.Inherited ||
        UScript.GetScript(0xfdf2) != UScript.Arabic;

    if (anyMismatch)
    {
        Errln("UScript.getScript(character with Script_Extensions) failed");
    }
}
/// <summary>
/// Checks that <c>UScript.GetScript</c> returns the expected script code for a
/// single code point, and reports an error naming that code point when it does not.
/// </summary>
/// <param name="codepoint">The code point to look up.</param>
/// <param name="expected">The script code the lookup is expected to return.</param>
public void TestGetScript(int codepoint, int expected)
{
    int code = UScript.GetScript(codepoint);
    if (code != expected)
    {
        // The hex value has to be concatenated outside the literal; previously the
        // text "+ Hex(codepoint)" was printed verbatim and the failing code point
        // never appeared in the message.
        Errln("Error testing UScript.getScript(). Got: " + code + " Expected: " + expected
            + " for codepoint 0x" + Hex(codepoint) + ".");
    }
}
/// <summary>
/// API and code-coverage checks for the script metadata accessors on <c>UScript</c>:
/// sample strings, usage, right-to-left, breaks-between-letters, and cased.
/// </summary>
public void TestScriptMetadataAPI()
{
    /* API & code coverage. */

    // Latin's sample is a single character whose script is Latin.
    string latinSample = UScript.GetSampleString(UScript.Latin);
    bool latinSampleOk =
        latinSample.Length == 1 &&
        UScript.GetScript(latinSample[0]) == UScript.Latin;
    if (!latinSampleOk)
    {
        Errln("UScript.getSampleString(Latn) failed");
    }

    // An invalid script code has an empty sample.
    string invalidSample = UScript.GetSampleString(UScript.InvalidCode);
    if (invalidSample.Length != 0)
    {
        Errln("UScript.getSampleString(invalid) failed");
    }

    bool usageOk =
        UScript.GetUsage(UScript.Latin) == ScriptUsage.Recommended &&
        // Unicode 10 gives up on "aspirational".
        UScript.GetUsage(UScript.Yi) == ScriptUsage.LimitedUse &&
        UScript.GetUsage(UScript.Cherokee) == ScriptUsage.LimitedUse &&
        UScript.GetUsage(UScript.Coptic) == ScriptUsage.Excluded &&
        UScript.GetUsage(UScript.Cirth) == ScriptUsage.NotEncoded &&
        UScript.GetUsage(UScript.InvalidCode) == ScriptUsage.NotEncoded &&
        UScript.GetUsage(UScript.CodeLimit) == ScriptUsage.NotEncoded;
    if (!usageOk)
    {
        Errln("UScript.getUsage() failed");
    }

    bool rightToLeftOk =
        !UScript.IsRightToLeft(UScript.Latin) &&
        !UScript.IsRightToLeft(UScript.Cirth) &&
        UScript.IsRightToLeft(UScript.Arabic) &&
        UScript.IsRightToLeft(UScript.Hebrew);
    if (!rightToLeftOk)
    {
        Errln("UScript.isRightToLeft() failed");
    }

    bool letterBreaksOk =
        !UScript.BreaksBetweenLetters(UScript.Latin) &&
        !UScript.BreaksBetweenLetters(UScript.Cirth) &&
        UScript.BreaksBetweenLetters(UScript.Han) &&
        UScript.BreaksBetweenLetters(UScript.Thai);
    if (!letterBreaksOk)
    {
        Errln("UScript.breaksBetweenLetters() failed");
    }

    bool casedOk =
        !UScript.IsCased(UScript.Cirth) &&
        !UScript.IsCased(UScript.Han) &&
        UScript.IsCased(UScript.Latin) &&
        UScript.IsCased(UScript.Greek);
    if (!casedOk)
    {
        Errln("UScript.isCased() failed");
    }
}
/// <summary>
/// Cross-checks the per-script metadata on <c>UScript</c> (usage, sample string,
/// right-to-left, cased, breaks-between-letters) against <c>UnicodeSet</c> property
/// queries for every script code below <c>UScript.CodeLimit</c>.
/// </summary>
public void TestScriptMetadata()
{
    // Each set starts with all candidate characters; scripts that claim the
    // property remove their characters, so both sets must end up empty.
    UnicodeSet remainingRtl = new UnicodeSet("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]");
    // So far, sample characters are uppercase.
    // Georgian is special.
    UnicodeSet remainingCased = new UnicodeSet("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]");

    for (int script = 0; script < UScript.CodeLimit; ++script)
    {
        string shortName = UScript.GetShortName(script);
        ScriptUsage usage = UScript.GetUsage(script);
        string sample = UScript.GetSampleString(script);
        UnicodeSet scriptSet = new UnicodeSet();
        scriptSet.ApplyInt32PropertyValue(UProperty.Script, script);

        if (usage == ScriptUsage.NotEncoded)
        {
            // A script that is not encoded has no sample, no flags, and no characters.
            assertTrue(shortName + " not encoded, no sample", sample.Length == 0);
            assertFalse(shortName + " not encoded, not RTL", UScript.IsRightToLeft(script));
            assertFalse(shortName + " not encoded, not LB letters", UScript.BreaksBetweenLetters(script));
            assertFalse(shortName + " not encoded, not cased", UScript.IsCased(script));
            assertTrue(shortName + " not encoded, no characters", scriptSet.IsEmpty);
            continue;
        }

        assertFalse(shortName + " encoded, has a sample character", sample.Length == 0);
        int sampleCodepoint = sample.CodePointAt(0);
        int charScript = GetCharScript(script);
        assertEquals(shortName + " script(sample(script))",
            charScript, UScript.GetScript(sampleCodepoint));
        assertEquals(shortName + " RTL vs. set",
            remainingRtl.Contains(sampleCodepoint), UScript.IsRightToLeft(script));
        assertEquals(shortName + " cased vs. set",
            remainingCased.Contains(sampleCodepoint), UScript.IsCased(script));
        assertEquals(shortName + " encoded, has characters",
            script == charScript, !scriptSet.IsEmpty);

        if (UScript.IsRightToLeft(script))
        {
            remainingRtl.RemoveAll(scriptSet);
        }

        if (UScript.IsCased(script))
        {
            remainingCased.RemoveAll(scriptSet);
        }
    }

    assertEquals("no remaining RTL characters", "[]", remainingRtl.ToPattern(true));
    assertEquals("no remaining cased characters", "[]", remainingCased.ToPattern(true));

    assertTrue("Hani breaks between letters", UScript.BreaksBetweenLetters(UScript.Han));
    assertTrue("Thai breaks between letters", UScript.BreaksBetweenLetters(UScript.Thai));
    assertFalse("Latn does not break between letters", UScript.BreaksBetweenLetters(UScript.Latin));
}
/// <summary>
/// Walks every code point from 0 through 0x10FFFF and reports an error when
/// <c>UScript.GetScript</c> yields <c>UScript.InvalidCode</c>, or when
/// <c>UScript.TryGetName</c> / <c>UScript.TryGetShortName</c> fail or produce a
/// name containing an "invalid" marker.
/// </summary>
public void TestAllCodepointsUsingTry()
{
    const int LastCodepoint = 0x10ffff;

    for (int codepoint = 0; codepoint <= LastCodepoint; codepoint++)
    {
        int script = UScript.GetScript(codepoint);
        if (script == UScript.InvalidCode)
        {
            Errln("UScript.GetScript for codepoint 0x" + Hex(codepoint) + " failed");
        }

        // The long name must resolve and must not carry the "INVALID" marker.
        bool longNameValid =
            UScript.TryGetName(script, out string longName) &&
            longName.IndexOf("INVALID", StringComparison.Ordinal) < 0;
        if (!longNameValid)
        {
            Errln("UScript.GetScript for codepoint 0x" + Hex(codepoint) + " failed");
        }

        // The short name must resolve and must not carry the "INV" marker.
        bool shortNameValid =
            UScript.TryGetShortName(script, out string shortName) &&
            shortName.IndexOf("INV", StringComparison.Ordinal) < 0;
        if (!shortNameValid)
        {
            Errln("UScript.GetScript for codepoint 0x" + Hex(codepoint) + " failed");
        }
    }
}