Ejemplo n.º 1
0
 /// <summary>
 /// Fast version of <see cref="UScript.GetScript(int)"/>: code points that index into the
 /// <c>basicLatin</c> table are answered by array lookup, everything else is delegated.
 /// </summary>
 /// <param name="codepoint">The code point whose script is wanted.</param>
 /// <returns>The script code, adjusted for CJ combining when <c>combineCJ</c> is set.</returns>
 private int GetScript(int codepoint)
 {
     // Fast path: the code point falls inside the lookup table.
     if (codepoint >= 0 && codepoint < basicLatin.Length)
     {
         return basicLatin[codepoint];
     }

     int resolved = UScript.GetScript(codepoint);
     if (!combineCJ)
     {
         return resolved;
     }

     // Han, Hiragana and Katakana are all reported as Japanese when combining.
     if (resolved == UScript.Han || resolved == UScript.Hiragana || resolved == UScript.Katakana)
     {
         return UScript.Japanese;
     }

     // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
     // they are treated as punctuation. we currently have no cleaner way to fix this!
     bool isFullWidthDigit = codepoint >= 0xFF10 && codepoint <= 0xFF19;
     if (isFullWidthDigit)
     {
         return UScript.Latin;
     }

     return resolved;
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Walks every code point from 0 through 0x10ffff and reports an error when its script
        /// code, script name, or short script name looks invalid.
        /// </summary>
        public void TestAllCodepoints()
        {
            for (int codepoint = 0; codepoint <= 0x10ffff; codepoint++)
            {
                int script = UScript.GetScript(codepoint);
                if (script == UScript.InvalidCode)
                {
                    Errln("UScript.getScript for codepoint 0x" + Hex(codepoint) + " failed");
                }

                // The long name must not mention "INVALID".
                string name = UScript.GetName(script);
                bool nameLooksValid = name.IndexOf("INVALID", StringComparison.Ordinal) < 0;
                if (!nameLooksValid)
                {
                    Errln("UScript.getScript for codepoint 0x" + Hex(codepoint) + " failed");
                }

                // The short name must not mention "INV".
                string shortName = UScript.GetShortName(script);
                bool shortNameLooksValid = shortName.IndexOf("INV", StringComparison.Ordinal) < 0;
                if (!shortNameLooksValid)
                {
                    Errln("UScript.getScript for codepoint 0x" + Hex(codepoint) + " failed");
                }
            }
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Fills every slot of <c>basicLatin</c> with the script that
 /// <see cref="UScript.GetScript(int)"/> reports for the slot's index.
 /// </summary>
 static ScriptIterator()
 {
     int codepoint = 0;
     while (codepoint < basicLatin.Length)
     {
         basicLatin[codepoint] = UScript.GetScript(codepoint);
         codepoint++;
     }
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Builds a 128-entry table whose element at index <c>i</c> is the script reported by
        /// <see cref="UScript.GetScript(int)"/> for code point <c>i</c>.
        /// </summary>
        /// <returns>The newly allocated lookup table.</returns>
        private static int[] LoadBasicLatin() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
        {
            int[] scripts = new int[128];

            for (int codepoint = 0; codepoint < scripts.Length; codepoint++)
            {
                scripts[codepoint] = UScript.GetScript(codepoint);
            }

            return scripts;
        }
Ejemplo n.º 5
0
            /// <summary>
            /// Advances to the next run. <c>Start</c> is reset to the previous <c>Limit</c>; if that
            /// already equals the text limit there is nothing left and <c>false</c> is returned.
            /// Otherwise <c>ScriptCode</c>, <c>Start</c> and <c>Limit</c> are updated for the caller
            /// to examine and <c>true</c> is returned.
            /// </summary>
            /// <returns><c>true</c> if a run was produced; <c>false</c> if the text is exhausted.</returns>
            public virtual bool Next()
            {
                ScriptCode = UScript.InvalidCode; // script of this run is not known yet
                Start = Limit;

                // Nothing left to scan.
                if (Start == textLimit)
                {
                    return false;
                }

                // Pull Start backwards over adjacent Common / Inherited characters.
                while (Start > textStart)
                {
                    int lookBehind = text.Char32At(Start - 1);
                    int lookBehindScript = UScript.GetScript(lookBehind);
                    bool isNeutral = lookBehindScript == UScript.Common || lookBehindScript == UScript.Inherited;
                    if (!isNeutral)
                    {
                        break;
                    }

                    --Start;
                }

                // Push Limit forwards over Common, Inherited, and characters of the run's script.
                while (Limit < textLimit)
                {
                    int lookAhead = text.Char32At(Limit);
                    int lookAheadScript = UScript.GetScript(lookAhead);
                    bool isSpecific = lookAheadScript != UScript.Common && lookAheadScript != UScript.Inherited;
                    if (isSpecific)
                    {
                        if (ScriptCode == UScript.InvalidCode)
                        {
                            // The first specific script seen decides the script of this run.
                            ScriptCode = lookAheadScript;
                        }
                        else if (ScriptCode != lookAheadScript)
                        {
                            // A different specific script starts the next run.
                            break;
                        }
                    }

                    ++Limit;
                }

                // Still true when the whole span was Common / Inherited; ScriptCode then
                // remains UScript.InvalidCode.
                return true;
            }
Ejemplo n.º 6
0
 /// <summary>
 /// Checks <see cref="UScript.GetScript(int)"/> for three characters that have
 /// Script_Extensions: U+0640, U+0650 and U+FDF2.
 /// </summary>
 public void TestGetScriptOfCharsWithScriptExtensions()
 {
     /* test characters which have Script_Extensions */
     bool allAsExpected =
         UScript.GetScript(0x0640) == UScript.Common &&
         UScript.GetScript(0x0650) == UScript.Inherited &&
         UScript.GetScript(0xfdf2) == UScript.Arabic;

     if (!allAsExpected)
     {
         Errln("UScript.getScript(character with Script_Extensions) failed");
     }
 }
Ejemplo n.º 7
0
            /// <summary>
            /// Verifies that <see cref="UScript.GetScript(int)"/> returns <paramref name="expected"/>
            /// for <paramref name="codepoint"/> and reports an error otherwise.
            /// </summary>
            /// <param name="codepoint">The code point to look up.</param>
            /// <param name="expected">The script code the lookup is expected to return.</param>
            public void TestGetScript(int codepoint, int expected)
            {
                int code = UScript.GetScript(codepoint);

                if (code != expected)
                {
                    // The hex conversion used to sit inside the string literal, so the message
                    // printed the text "Hex(codepoint)" instead of the actual code point.
                    string hexCodepoint = codepoint.ToString("X4", System.Globalization.CultureInfo.InvariantCulture);
                    Errln("Error testing UScript.getScript(). Got: " + code + " Expected: " + expected
                          + " for codepoint 0x" + hexCodepoint + ".");
                }
            }
Ejemplo n.º 8
0
        /// <summary>
        /// API and code-coverage checks for the script metadata accessors on <c>UScript</c>:
        /// sample string, usage, right-to-left, breaks-between-letters and cased.
        /// </summary>
        public void TestScriptMetadataAPI()
        {
            // Sample string: Latin yields one Latin character, an invalid code yields nothing.
            string latinSample = UScript.GetSampleString(UScript.Latin);
            bool latinSampleOk =
                latinSample.Length == 1 &&
                UScript.GetScript(latinSample[0]) == UScript.Latin;
            if (!latinSampleOk)
            {
                Errln("UScript.getSampleString(Latn) failed");
            }

            string invalidSample = UScript.GetSampleString(UScript.InvalidCode);
            if (invalidSample.Length != 0)
            {
                Errln("UScript.getSampleString(invalid) failed");
            }

            // Usage categories.
            bool usageOk =
                UScript.GetUsage(UScript.Latin) == ScriptUsage.Recommended &&
                // Unicode 10 gives up on "aspirational".
                UScript.GetUsage(UScript.Yi) == ScriptUsage.LimitedUse &&
                UScript.GetUsage(UScript.Cherokee) == ScriptUsage.LimitedUse &&
                UScript.GetUsage(UScript.Coptic) == ScriptUsage.Excluded &&
                UScript.GetUsage(UScript.Cirth) == ScriptUsage.NotEncoded &&
                UScript.GetUsage(UScript.InvalidCode) == ScriptUsage.NotEncoded &&
                UScript.GetUsage(UScript.CodeLimit) == ScriptUsage.NotEncoded;
            if (!usageOk)
            {
                Errln("UScript.getUsage() failed");
            }

            // Directionality.
            bool rightToLeftOk =
                !UScript.IsRightToLeft(UScript.Latin) &&
                !UScript.IsRightToLeft(UScript.Cirth) &&
                UScript.IsRightToLeft(UScript.Arabic) &&
                UScript.IsRightToLeft(UScript.Hebrew);
            if (!rightToLeftOk)
            {
                Errln("UScript.isRightToLeft() failed");
            }

            // Line breaking between letters.
            bool breaksOk =
                !UScript.BreaksBetweenLetters(UScript.Latin) &&
                !UScript.BreaksBetweenLetters(UScript.Cirth) &&
                UScript.BreaksBetweenLetters(UScript.Han) &&
                UScript.BreaksBetweenLetters(UScript.Thai);
            if (!breaksOk)
            {
                Errln("UScript.breaksBetweenLetters() failed");
            }

            // Casing.
            bool casedOk =
                !UScript.IsCased(UScript.Cirth) &&
                !UScript.IsCased(UScript.Han) &&
                UScript.IsCased(UScript.Latin) &&
                UScript.IsCased(UScript.Greek);
            if (!casedOk)
            {
                Errln("UScript.isCased() failed");
            }
        }
Ejemplo n.º 9
0
        /// <summary>
        /// For every script code below <c>UScript.CodeLimit</c>, cross-checks the metadata returned
        /// by <c>UScript</c> (usage, sample string, RTL, cased, breaks-between-letters) against
        /// <see cref="UnicodeSet"/>s built from property patterns.
        /// </summary>
        public void TestScriptMetadata()
        {
            UnicodeSet rtlRemaining = new UnicodeSet("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]");
            // So far, sample characters are uppercase.
            // Georgian is special.
            UnicodeSet casedRemaining = new UnicodeSet("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]");

            for (int script = 0; script < UScript.CodeLimit; ++script)
            {
                string shortName = UScript.GetShortName(script);
                ScriptUsage usage = UScript.GetUsage(script);
                string sample = UScript.GetSampleString(script);
                UnicodeSet members = new UnicodeSet();
                members.ApplyInt32PropertyValue(UProperty.Script, script);

                if (usage == ScriptUsage.NotEncoded)
                {
                    // A script that is not encoded must have no sample, no flags, and no characters.
                    assertTrue(shortName + " not encoded, no sample", sample.Length == 0);  // Java 6: sample.isEmpty()
                    assertFalse(shortName + " not encoded, not RTL", UScript.IsRightToLeft(script));
                    assertFalse(shortName + " not encoded, not LB letters", UScript.BreaksBetweenLetters(script));
                    assertFalse(shortName + " not encoded, not cased", UScript.IsCased(script));
                    assertTrue(shortName + " not encoded, no characters", members.IsEmpty);
                    continue;
                }

                assertFalse(shortName + " encoded, has a sample character", sample.Length == 0);  // Java 6: sample.isEmpty()
                int sampleCodePoint = sample.CodePointAt(0);
                int expectedScript = GetCharScript(script);
                assertEquals(shortName + " script(sample(script))",
                             expectedScript, UScript.GetScript(sampleCodePoint));
                assertEquals(shortName + " RTL vs. set", rtlRemaining.Contains(sampleCodePoint), UScript.IsRightToLeft(script));
                assertEquals(shortName + " cased vs. set", casedRemaining.Contains(sampleCodePoint), UScript.IsCased(script));
                assertEquals(shortName + " encoded, has characters", script == expectedScript, !members.IsEmpty);

                // Strike this script's characters from the sets so that leftovers can be
                // detected once the loop is done.
                if (UScript.IsRightToLeft(script))
                {
                    rtlRemaining.RemoveAll(members);
                }
                if (UScript.IsCased(script))
                {
                    casedRemaining.RemoveAll(members);
                }
            }

            assertEquals("no remaining RTL characters", "[]", rtlRemaining.ToPattern(true));
            assertEquals("no remaining cased characters", "[]", casedRemaining.ToPattern(true));

            assertTrue("Hani breaks between letters", UScript.BreaksBetweenLetters(UScript.Han));
            assertTrue("Thai breaks between letters", UScript.BreaksBetweenLetters(UScript.Thai));
            assertFalse("Latn does not break between letters", UScript.BreaksBetweenLetters(UScript.Latin));
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Walks every code point from 0 through 0x10ffff and reports an error when its script
        /// code is invalid, or when <c>UScript.TryGetName</c> / <c>UScript.TryGetShortName</c>
        /// fail or return a name that looks invalid.
        /// </summary>
        public void TestAllCodepointsUsingTry()
        {
            for (int codepoint = 0; codepoint <= 0x10ffff; codepoint++)
            {
                int script = UScript.GetScript(codepoint);
                if (script == UScript.InvalidCode)
                {
                    Errln("UScript.GetScript for codepoint 0x" + Hex(codepoint) + " failed");
                }

                // The lookup must succeed and the long name must not mention "INVALID".
                bool nameOk =
                    UScript.TryGetName(script, out string name) &&
                    name.IndexOf("INVALID", StringComparison.Ordinal) < 0;
                if (!nameOk)
                {
                    Errln("UScript.GetScript for codepoint 0x" + Hex(codepoint) + " failed");
                }

                // The lookup must succeed and the short name must not mention "INV".
                bool shortNameOk =
                    UScript.TryGetShortName(script, out string shortName) &&
                    shortName.IndexOf("INV", StringComparison.Ordinal) < 0;
                if (!shortNameOk)
                {
                    Errln("UScript.GetScript for codepoint 0x" + Hex(codepoint) + " failed");
                }
            }
        }