/// ------------------------------------------------------------------------------------
/// <summary>
/// Reports whether the given character is eligible to be overridden so that it is
/// treated as word-forming.
/// </summary>
/// <param name="chr">The character to test, as a string. Only strings made up of
/// exactly one UTF-16 code unit can qualify.</param>
/// <returns><c>true</c> if <paramref name="chr"/> is a single code unit that is either
/// U+200C (zero-width non-joiner), U+200D (zero-width joiner), or a character that
/// <c>Icu.IsSymbol</c> or <c>Icu.IsPunct</c> accepts; <c>false</c> otherwise (including
/// for null, empty, or longer strings)</returns>
/// ------------------------------------------------------------------------------------
public bool CanBeWordFormingOverride(string chr)
{
    // Null, empty and multi-code-unit strings can never be overridden.
    if (chr == null || chr.Length != 1)
    {
        return false;
    }

    int codePoint = chr[0];

    // The zero-width (non-)joiners are the special exceptions; they are accepted
    // without consulting ICU.
    bool isZeroWidthJoinerOrNonJoiner = codePoint == 0x200C || codePoint == 0x200D;

    // Short-circuit order matches the checks: exceptions, then symbol, then punctuation.
    return isZeroWidthJoinerOrNonJoiner || Icu.IsSymbol(codePoint) || Icu.IsPunct(codePoint);
}
/// <summary>
/// Filters out every token that contains nothing except characters for which
/// <c>Icu.IsSpace</c> or <c>Icu.IsPunct</c> returns true. Empty tokens are filtered
/// out as well, because they contain no other kind of character.
/// </summary>
/// <param name="tokens">The tokens to filter</param>
/// <returns>A lazily evaluated sequence of the tokens that contain at least one
/// character that is neither space nor punctuation, in their original order</returns>
private static IEnumerable<string> RemoveWhitespaceAndPunctTokens(IEnumerable<string> tokens)
{
    // A token survives as soon as one of its characters is neither space nor punctuation.
    return from token in tokens
           where token.Any(ch => !Icu.IsSpace(ch) && !Icu.IsPunct(ch))
           select token;
}
/// <summary>
/// Asserts the ICU properties expected of the four test characters once they have
/// been defined: kChar1 is a lowercase letter, kChar2 an uppercase letter, kChar3
/// "other punctuation" and kChar4 a decimal digit. None of them has a decomposition
/// type, and kChar1 to kChar3 have no numeric type.
/// </summary>
private static void VerifyNewlyCreatedChars()
{
    Icu.InitIcuDataDir();

    // The checks that are commented out below rely on u_getIntPropertyValue(), which
    // doesn't work reliably with the limited number of data files that we modify.

    //Assert.IsTrue(Icu.IsAlphabetic(kChar1)); // now true
    //Assert.IsTrue(Icu.IsAlphabetic(kChar2)); // now true
    //Assert.IsFalse(Icu.IsAlphabetic(kChar3));
    //Assert.IsFalse(Icu.IsAlphabetic(kChar4));

    // None of the characters is a control character.
    Assert.IsFalse(Icu.IsControl(kChar1));
    Assert.IsFalse(Icu.IsControl(kChar2));
    Assert.IsFalse(Icu.IsControl(kChar3));
    Assert.IsFalse(Icu.IsControl(kChar4));

    //Assert.IsFalse(Icu.IsDiacritic(kChar1));
    //Assert.IsFalse(Icu.IsDiacritic(kChar2));
    //Assert.IsFalse(Icu.IsDiacritic(kChar3));
    //Assert.IsFalse(Icu.IsDiacritic(kChar4));

    //Assert.IsFalse(Icu.IsIdeographic(kChar1));
    //Assert.IsFalse(Icu.IsIdeographic(kChar2));
    //Assert.IsFalse(Icu.IsIdeographic(kChar3));
    //Assert.IsFalse(Icu.IsIdeographic(kChar4));

    //Assert.IsFalse(Icu.IsNumeric(kChar1));
    //Assert.IsFalse(Icu.IsNumeric(kChar2));
    //Assert.IsFalse(Icu.IsNumeric(kChar3));
    //Assert.IsTrue(Icu.IsNumeric(kChar4)); // now true

    // Only kChar3 is punctuation.
    Assert.IsFalse(Icu.IsPunct(kChar1));
    Assert.IsFalse(Icu.IsPunct(kChar2));
    Assert.IsTrue(Icu.IsPunct(kChar3)); // now true
    Assert.IsFalse(Icu.IsPunct(kChar4));

    // None of the characters is a space.
    Assert.IsFalse(Icu.IsSpace(kChar1));
    Assert.IsFalse(Icu.IsSpace(kChar2));
    Assert.IsFalse(Icu.IsSpace(kChar3));
    Assert.IsFalse(Icu.IsSpace(kChar4));

    // None of the characters is a symbol.
    Assert.IsFalse(Icu.IsSymbol(kChar1));
    Assert.IsFalse(Icu.IsSymbol(kChar2));
    Assert.IsFalse(Icu.IsSymbol(kChar3));
    Assert.IsFalse(Icu.IsSymbol(kChar4));

    // General category of each character.
    Assert.AreEqual(Icu.UCharCategory.U_LOWERCASE_LETTER, Icu.GetCharType(kChar1));
    Assert.AreEqual(Icu.UCharCategory.U_UPPERCASE_LETTER, Icu.GetCharType(kChar2));
    Assert.AreEqual(Icu.UCharCategory.U_OTHER_PUNCTUATION, Icu.GetCharType(kChar3));
    Assert.AreEqual(Icu.UCharCategory.U_DECIMAL_DIGIT_NUMBER, Icu.GetCharType(kChar4));

    // No character has a decomposition type.
    Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar1).Description);
    Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar2).Description);
    Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar3).Description);
    Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar4).Description);

    // The first three characters have no numeric type.
    Assert.AreEqual("[none]", Icu.GetNumericType(kChar1).Description);
    Assert.AreEqual("[none]", Icu.GetNumericType(kChar2).Description);
    Assert.AreEqual("[none]", Icu.GetNumericType(kChar3).Description);

    // Current implementation (as of ICU50) is not overriding numeric type since we
    // don't use it anywhere. Enhance silmods.c in icu patch if needed.
    //numericType = Icu.GetNumericType(kChar4);
    //Assert.AreEqual("Decimal Digit", numericType.Description);

    // Current implementation (as of ICU50) is not overriding character names since we
    // don't use them anywhere. Enhance silmods.c in icu patch if needed.
    //var prettyName = Icu.GetPrettyICUCharName("\xE000");
    //Assert.AreEqual("My Special Character", prettyName);
    //prettyName = Icu.GetPrettyICUCharName("\xE001");
    //Assert.AreEqual("My Uppercase Character", prettyName);
    //prettyName = Icu.GetPrettyICUCharName(kChar3S);
    //Assert.AreEqual("New Punctuation Mark", prettyName);
    //var rawName = Icu.GetCharName(kChar4); // can't pass large character code as 16-bit char.
    //Assert.AreEqual("NEW DIGIT NINE", rawName);
}
/// <summary>
/// Asserts that ICU reports no custom properties for the four test characters:
/// every boolean property check is false, kChar1 and kChar2 are private-use
/// characters, kChar3 and kChar4 are unassigned, there is no decomposition or
/// numeric type, and no pretty character name is returned.
/// </summary>
private static void VerifyNonexistentChars()
{
    Icu.InitIcuDataDir();

    // Every boolean property is expected to be false for all four characters.
    Assert.IsFalse(Icu.IsAlphabetic(kChar1));
    Assert.IsFalse(Icu.IsAlphabetic(kChar2));
    Assert.IsFalse(Icu.IsAlphabetic(kChar3));
    Assert.IsFalse(Icu.IsAlphabetic(kChar4));

    Assert.IsFalse(Icu.IsControl(kChar1));
    Assert.IsFalse(Icu.IsControl(kChar2));
    Assert.IsFalse(Icu.IsControl(kChar3));
    Assert.IsFalse(Icu.IsControl(kChar4));

    Assert.IsFalse(Icu.IsDiacritic(kChar1));
    Assert.IsFalse(Icu.IsDiacritic(kChar2));
    Assert.IsFalse(Icu.IsDiacritic(kChar3));
    Assert.IsFalse(Icu.IsDiacritic(kChar4));

    Assert.IsFalse(Icu.IsIdeographic(kChar1));
    Assert.IsFalse(Icu.IsIdeographic(kChar2));
    Assert.IsFalse(Icu.IsIdeographic(kChar3));
    Assert.IsFalse(Icu.IsIdeographic(kChar4));

    Assert.IsFalse(Icu.IsNumeric(kChar1));
    Assert.IsFalse(Icu.IsNumeric(kChar2));
    Assert.IsFalse(Icu.IsNumeric(kChar3));
    Assert.IsFalse(Icu.IsNumeric(kChar4));

    Assert.IsFalse(Icu.IsPunct(kChar1));
    Assert.IsFalse(Icu.IsPunct(kChar2));
    Assert.IsFalse(Icu.IsPunct(kChar3));
    Assert.IsFalse(Icu.IsPunct(kChar4));

    Assert.IsFalse(Icu.IsSpace(kChar1));
    Assert.IsFalse(Icu.IsSpace(kChar2));
    Assert.IsFalse(Icu.IsSpace(kChar3));
    Assert.IsFalse(Icu.IsSpace(kChar4));

    Assert.IsFalse(Icu.IsSymbol(kChar1));
    Assert.IsFalse(Icu.IsSymbol(kChar2));
    Assert.IsFalse(Icu.IsSymbol(kChar3));
    Assert.IsFalse(Icu.IsSymbol(kChar4));

    // General category: the first two are private use, the last two are unassigned.
    Assert.AreEqual(Icu.UCharCategory.U_PRIVATE_USE_CHAR, Icu.GetCharType(kChar1));
    Assert.AreEqual(Icu.UCharCategory.U_PRIVATE_USE_CHAR, Icu.GetCharType(kChar2));
    Assert.AreEqual(Icu.UCharCategory.U_UNASSIGNED, Icu.GetCharType(kChar3));
    Assert.AreEqual(Icu.UCharCategory.U_UNASSIGNED, Icu.GetCharType(kChar4));

    // No character has a decomposition type.
    Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar1).Description);
    Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar2).Description);
    Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar3).Description);
    Assert.AreEqual("[none]", Icu.GetDecompositionType(kChar4).Description);

    // No character has a numeric type.
    Assert.AreEqual("[none]", Icu.GetNumericType(kChar1).Description);
    Assert.AreEqual("[none]", Icu.GetNumericType(kChar2).Description);
    Assert.AreEqual("[none]", Icu.GetNumericType(kChar3).Description);
    Assert.AreEqual("[none]", Icu.GetNumericType(kChar4).Description);

    // No pretty name is available for any of the characters.
    Assert.IsNull(Icu.GetPrettyICUCharName("\xE000"));
    Assert.IsNull(Icu.GetPrettyICUCharName("\xE001"));
    Assert.IsNull(Icu.GetPrettyICUCharName(kChar3S));
    // NOTE(review): C# reads at most four hex digits after \x, so "\xDDDDD" is the
    // two-character string U+DDDD followed by 'D'. If the supplementary code point
    // U+DDDDD was intended, the literal would have to be "\U000DDDDD" - confirm.
    Assert.IsNull(Icu.GetPrettyICUCharName("\xDDDDD"));
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Given a TS string and a character index, finds a nearby word boundary. The index
/// is first moved forward over separators, over punctuation that directly follows a
/// non-separator character, and (when a special style applies at the index) over
/// characters carrying the style found just before the index. Unless that scan reached
/// the end of the text, the index is then moved backward to the start of the word,
/// stopping after a separator, after a character in one of the special styles, or at
/// the start of the text. An index at the very start or very end of the string is
/// returned unchanged.
/// </summary>
/// <param name="tss">the structured string of the paragraph or translation</param>
/// <param name="ich">the given index</param>
/// <param name="specialStyles">Names of the styles whose runs are treated as word
/// boundaries</param>
/// <returns>adjusted character index</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="ich"/> is negative or
/// greater than the length of <paramref name="tss"/></exception>
/// ------------------------------------------------------------------------------------
public static int FindWordBoundary(this ITsString tss, int ich, params string[] specialStyles)
{
    int cch = tss.Length;
    if (ich < 0 || ich > cch)
    {
        throw new ArgumentOutOfRangeException("ich");
    }

    // Both ends of the string are returned as they are.
    if (ich == 0 || ich == cch)
    {
        return ich;
    }

    string text = tss.Text;
    string styleAtIch = tss.StyleAt(ich);
    // ich is at least 1 from here on, so a preceding character always exists.
    string styleBeforeIch = tss.StyleAt(ich - 1);

    // Style whose characters are skipped by the forward scan. It is only set when the
    // style at ich is one of the special styles and both it and the preceding style
    // are non-null; in that case it is the style of the preceding character.
    string styleToSkip =
        specialStyles.Contains(styleAtIch) && styleBeforeIch != null && styleAtIch != null
            ? styleBeforeIch
            : null;

    // Advance to the next word boundary if appropriate.
    for (; ich < text.Length; ich++)
    {
        char ch = text[ich];
        bool canSkip =
            // separator (e.g. space)
            Icu.IsSeparator(ch)
            // word-final punctuation, i.e. punctuation not preceded by a separator
            || (Icu.IsPunct(ch) && !Icu.IsSeparator(text[ich - 1]))
            // still inside the run with the style being skipped
            || (styleToSkip != null && tss.StyleAt(ich) == styleToSkip);

        if (!canSkip)
        {
            break;
        }
    }

    // NEVER move backward if at the end of the paragraph.
    if (ich >= text.Length)
    {
        return ich;
    }

    // While the insertion point is in the middle of a word, back up to the start of
    // the word or the start of the paragraph.
    while (ich > 0)
    {
        if (Icu.IsSeparator(text[ich - 1]) || specialStyles.Contains(tss.StyleAt(ich - 1)))
        {
            break;
        }
        ich--;
    }

    return ich;
}