示例#1
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Reports whether a character that ICU does not treat as word-forming is nevertheless
        /// eligible to be overridden so that it counts as word-forming.
        /// </summary>
        /// <param name="chr">The character to test, given as a string holding exactly one
        /// UTF-16 code unit. A null, empty, or longer string is never eligible.</param>
        /// <returns><c>true</c> if the character is the zero-width non-joiner (U+200C), the
        /// zero-width joiner (U+200D), or is a symbol or punctuation character according to
        /// ICU; <c>false</c> otherwise</returns>
        /// ------------------------------------------------------------------------------------
        public bool CanBeWordFormingOverride(string chr)
        {
            // Only strings made of a single code unit are considered.
            if (chr == null || chr.Length != 1)
            {
                return false;
            }

            int codePoint = chr[0];

            // The two special exceptions: zero-width non-joiner and zero-width joiner.
            bool isZeroWidthException = codePoint == 0x200C || codePoint == 0x200D;

            // Otherwise the character qualifies when ICU reports it as a symbol or as punctuation.
            return isZeroWidthException || Icu.IsSymbol(codePoint) || Icu.IsPunct(codePoint);
        }
示例#2
0
 /// <summary>
 /// Filters out every token that consists solely of characters ICU classifies as white
 /// space or punctuation. Empty tokens are dropped as well, since they contain no other
 /// kind of character.
 /// </summary>
 /// <param name="tokens">The tokens to filter</param>
 /// <returns>A lazily evaluated sequence of the tokens that contain at least one character
 /// which is neither white space nor punctuation</returns>
 private static IEnumerable <string> RemoveWhitespaceAndPunctTokens(IEnumerable <string> tokens)
 {
     // Keep a token as soon as one character is found that is neither space nor punctuation.
     return tokens.Where(token => token.Any(ch => !Icu.IsSpace(ch) && !Icu.IsPunct(ch)));
 }
示例#3
0
        /// <summary>
        /// Asserts the ICU properties expected for the four test characters once their
        /// definitions are in effect: kChar1 is a lowercase letter, kChar2 an uppercase letter,
        /// kChar3 an "other punctuation" character and kChar4 a decimal digit.
        /// </summary>
        private static void VerifyNewlyCreatedChars()
        {
            Icu.InitIcuDataDir();

            // Several property checks are disabled below because the underlying ICU call,
            // u_getIntPropertyValue(), doesn't work reliably with the limited number of data
            // files that we modify.

            // Alphabetic (disabled):
            //Assert.IsTrue(Icu.IsAlphabetic(kChar1));  // now true
            //Assert.IsTrue(Icu.IsAlphabetic(kChar2));  // now true
            //Assert.IsFalse(Icu.IsAlphabetic(kChar3));
            //Assert.IsFalse(Icu.IsAlphabetic(kChar4));

            // Control
            Assert.IsFalse(Icu.IsControl(kChar1));
            Assert.IsFalse(Icu.IsControl(kChar2));
            Assert.IsFalse(Icu.IsControl(kChar3));
            Assert.IsFalse(Icu.IsControl(kChar4));

            // Diacritic (disabled):
            //Assert.IsFalse(Icu.IsDiacritic(kChar1));
            //Assert.IsFalse(Icu.IsDiacritic(kChar2));
            //Assert.IsFalse(Icu.IsDiacritic(kChar3));
            //Assert.IsFalse(Icu.IsDiacritic(kChar4));

            // Ideographic (disabled):
            //Assert.IsFalse(Icu.IsIdeographic(kChar1));
            //Assert.IsFalse(Icu.IsIdeographic(kChar2));
            //Assert.IsFalse(Icu.IsIdeographic(kChar3));
            //Assert.IsFalse(Icu.IsIdeographic(kChar4));

            // Numeric (disabled):
            //Assert.IsFalse(Icu.IsNumeric(kChar1));
            //Assert.IsFalse(Icu.IsNumeric(kChar2));
            //Assert.IsFalse(Icu.IsNumeric(kChar3));
            //Assert.IsTrue(Icu.IsNumeric(kChar4));     // now true

            // Punctuation: only kChar3 is expected to be punctuation.
            Assert.IsFalse(Icu.IsPunct(kChar1));
            Assert.IsFalse(Icu.IsPunct(kChar2));
            Assert.IsTrue(Icu.IsPunct(kChar3));         // now true
            Assert.IsFalse(Icu.IsPunct(kChar4));

            // Space
            Assert.IsFalse(Icu.IsSpace(kChar1));
            Assert.IsFalse(Icu.IsSpace(kChar2));
            Assert.IsFalse(Icu.IsSpace(kChar3));
            Assert.IsFalse(Icu.IsSpace(kChar4));

            // Symbol
            Assert.IsFalse(Icu.IsSymbol(kChar1));
            Assert.IsFalse(Icu.IsSymbol(kChar2));
            Assert.IsFalse(Icu.IsSymbol(kChar3));
            Assert.IsFalse(Icu.IsSymbol(kChar4));

            // General category of each character.
            Assert.AreEqual(Icu.UCharCategory.U_LOWERCASE_LETTER, Icu.GetCharType(kChar1));
            Assert.AreEqual(Icu.UCharCategory.U_UPPERCASE_LETTER, Icu.GetCharType(kChar2));
            Assert.AreEqual(Icu.UCharCategory.U_OTHER_PUNCTUATION, Icu.GetCharType(kChar3));
            Assert.AreEqual(Icu.UCharCategory.U_DECIMAL_DIGIT_NUMBER, Icu.GetCharType(kChar4));

            // None of the characters is expected to have a decomposition type.
            Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar1).Description);
            Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar2).Description);
            Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar3).Description);
            Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar4).Description);

            // Numeric type is checked for the first three characters only (see below for kChar4).
            Assert.AreEqual("[none]", Icu.GetNumericType(kChar1).Description);
            Assert.AreEqual("[none]", Icu.GetNumericType(kChar2).Description);
            Assert.AreEqual("[none]", Icu.GetNumericType(kChar3).Description);

            // Current implementation (as of ICU50) is not overriding numeric type since we don't
            // use it anywhere. Enhance silmods.c in icu patch if needed.
            //numericType = Icu.GetNumericType(kChar4);
            //Assert.AreEqual("Decimal Digit", numericType.Description);

            // Current implementation (as of ICU50) is not overriding character names since we
            // don't use them anywhere. Enhance silmods.c in icu patch if needed.
            //var prettyName = Icu.GetPrettyICUCharName("\xE000");
            //Assert.AreEqual("My Special Character", prettyName);
            //prettyName = Icu.GetPrettyICUCharName("\xE001");
            //Assert.AreEqual("My Uppercase Character", prettyName);
            //prettyName = Icu.GetPrettyICUCharName(kChar3S);
            //Assert.AreEqual("New Punctuation Mark", prettyName);
            //var rawName = Icu.GetCharName(kChar4);    // can't pass large character code as 16-bit char.
            //Assert.AreEqual("NEW DIGIT NINE", rawName);
        }
示例#4
0
        /// <summary>
        /// Asserts that ICU reports no special properties for the four test characters:
        /// every boolean property is false, kChar1 and kChar2 are private-use characters,
        /// kChar3 and kChar4 are unassigned, and none of them has a decomposition type,
        /// a numeric type or a pretty name.
        /// </summary>
        private static void VerifyNonexistentChars()
        {
            Icu.InitIcuDataDir();

            // Alphabetic
            Assert.IsFalse(Icu.IsAlphabetic(kChar1));
            Assert.IsFalse(Icu.IsAlphabetic(kChar2));
            Assert.IsFalse(Icu.IsAlphabetic(kChar3));
            Assert.IsFalse(Icu.IsAlphabetic(kChar4));

            // Control
            Assert.IsFalse(Icu.IsControl(kChar1));
            Assert.IsFalse(Icu.IsControl(kChar2));
            Assert.IsFalse(Icu.IsControl(kChar3));
            Assert.IsFalse(Icu.IsControl(kChar4));

            // Diacritic
            Assert.IsFalse(Icu.IsDiacritic(kChar1));
            Assert.IsFalse(Icu.IsDiacritic(kChar2));
            Assert.IsFalse(Icu.IsDiacritic(kChar3));
            Assert.IsFalse(Icu.IsDiacritic(kChar4));

            // Ideographic
            Assert.IsFalse(Icu.IsIdeographic(kChar1));
            Assert.IsFalse(Icu.IsIdeographic(kChar2));
            Assert.IsFalse(Icu.IsIdeographic(kChar3));
            Assert.IsFalse(Icu.IsIdeographic(kChar4));

            // Numeric
            Assert.IsFalse(Icu.IsNumeric(kChar1));
            Assert.IsFalse(Icu.IsNumeric(kChar2));
            Assert.IsFalse(Icu.IsNumeric(kChar3));
            Assert.IsFalse(Icu.IsNumeric(kChar4));

            // Punctuation
            Assert.IsFalse(Icu.IsPunct(kChar1));
            Assert.IsFalse(Icu.IsPunct(kChar2));
            Assert.IsFalse(Icu.IsPunct(kChar3));
            Assert.IsFalse(Icu.IsPunct(kChar4));

            // Space
            Assert.IsFalse(Icu.IsSpace(kChar1));
            Assert.IsFalse(Icu.IsSpace(kChar2));
            Assert.IsFalse(Icu.IsSpace(kChar3));
            Assert.IsFalse(Icu.IsSpace(kChar4));

            // Symbol
            Assert.IsFalse(Icu.IsSymbol(kChar1));
            Assert.IsFalse(Icu.IsSymbol(kChar2));
            Assert.IsFalse(Icu.IsSymbol(kChar3));
            Assert.IsFalse(Icu.IsSymbol(kChar4));

            // General category: the first two are private use, the last two unassigned.
            Assert.AreEqual(Icu.UCharCategory.U_PRIVATE_USE_CHAR, Icu.GetCharType(kChar1));
            Assert.AreEqual(Icu.UCharCategory.U_PRIVATE_USE_CHAR, Icu.GetCharType(kChar2));
            Assert.AreEqual(Icu.UCharCategory.U_UNASSIGNED, Icu.GetCharType(kChar3));
            Assert.AreEqual(Icu.UCharCategory.U_UNASSIGNED, Icu.GetCharType(kChar4));

            // No decomposition type for any of the characters.
            Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar1).Description);
            Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar2).Description);
            Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar3).Description);
            Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar4).Description);

            // No numeric type for any of the characters.
            Assert.AreEqual("[none]", Icu.GetNumericType(kChar1).Description);
            Assert.AreEqual("[none]", Icu.GetNumericType(kChar2).Description);
            Assert.AreEqual("[none]", Icu.GetNumericType(kChar3).Description);
            Assert.AreEqual("[none]", Icu.GetNumericType(kChar4).Description);

            // No pretty name for any of the characters.
            Assert.IsNull(Icu.GetPrettyICUCharName("\xE000"));
            Assert.IsNull(Icu.GetPrettyICUCharName("\xE001"));
            Assert.IsNull(Icu.GetPrettyICUCharName(kChar3S));

            // NOTE(review): a C# \x escape consumes at most four hex digits, so this literal is
            // U+DDDD followed by the letter 'D', not the single code point U+DDDDD (which would
            // be written "\U000DDDDD"). Presumably U+DDDDD was intended - confirm before changing.
            Assert.IsNull(Icu.GetPrettyICUCharName("\xDDDDD"));
        }
示例#5
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Given a TS string and an index, finds the closest start of a word before that
        /// position (or after that position if the index is already at a word boundary or is
        /// in the last word of the string). If the index is in a run marked using one of the
        /// special styles, always returns the position at the end of that run. Runs having
        /// these special styles are also always regarded as word boundaries.
        /// </summary>
        /// <param name="tss">the structured string of the paragraph or translation</param>
        /// <param name="ich">the given index</param>
        /// <param name="specialStyles">The special styles.</param>
        /// <returns>adjusted character index</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="ich"/> is negative or
        /// greater than the length of <paramref name="tss"/></exception>
        /// ------------------------------------------------------------------------------------
        public static int FindWordBoundary(this ITsString tss, int ich, params string[] specialStyles)
        {
            if (ich < 0 || ich > tss.Length)
            {
                throw new ArgumentOutOfRangeException("ich");
            }

            // The very start and the very end of the string are returned unchanged.
            if (ich == 0 || ich == tss.Length)
            {
                return ich;
            }

            string contents = tss.Text;
            string styleHere = tss.StyleAt(ich);

            // ich is at least 1 at this point, so a preceding character always exists.
            string styleBefore = tss.StyleAt(ich - 1);

            // A style is tracked only when the character at ich has a special style and both it
            // and the preceding character are styled; the style tracked is that of the
            // preceding character.
            string trackedStyle = null;
            if (specialStyles.Contains(styleHere) && styleBefore != null && styleHere != null)
            {
                trackedStyle = styleBefore;
            }

            // Advance to the next word boundary if appropriate: step over separators, over
            // punctuation that directly follows a non-separator (word-final punctuation), and
            // over characters still carrying the tracked style.
            while (ich < contents.Length)
            {
                bool stepOver =
                    Icu.IsSeparator(contents[ich]) ||
                    (Icu.IsPunct(contents[ich]) && !Icu.IsSeparator(contents[ich - 1])) ||
                    (trackedStyle != null && tss.StyleAt(ich) == trackedStyle);

                if (!stepOver)
                {
                    break;
                }

                ich++;
            }

            // NEVER move backward if at the end of the paragraph.
            if (ich >= contents.Length)
            {
                return ich;
            }

            // While the insertion point is in the middle of a word, back up to the start of the
            // word or the start of the paragraph. A separator or a specially styled character
            // immediately before the position stops the scan.
            while (ich > 0)
            {
                int previous = ich - 1;

                if (Icu.IsSeparator(contents[previous]) || specialStyles.Contains(tss.StyleAt(previous)))
                {
                    break;
                }

                ich = previous;
            }

            return ich;
        }