/// <summary>
/// Strips diacritical marks (accents) from a string.
/// </summary>
/// <param name="word">
/// The text to process.
/// </param>
/// <returns>
/// The compatibility-decomposed (FormKD) text with every non-spacing mark removed.
/// </returns>
public static String removeDiacritics(String word)
{
    StringBuilder stripped = new StringBuilder();

    // FormKD splits accented characters into a base character followed by combining marks.
    foreach (char ch in word.Normalize(NormalizationForm.FormKD))
    {
        if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark)
        {
            continue;
        }

        stripped.Append(ch);
    }

    return stripped.ToString();
}
/// <summary>
/// Removes the accents from a string.
/// </summary>
/// <param name="s">The text to process.</param>
/// <returns>The canonically decomposed (FormD) text without its non-spacing marks.</returns>
public static String RemoveDiacritics(String s)
{
    String decomposed = s.Normalize(NormalizationForm.FormD);
    StringBuilder result = new StringBuilder();

    foreach (char current in decomposed)
    {
        // Keep everything except the combining marks produced by the decomposition.
        bool isCombiningMark =
            CharUnicodeInfo.GetUnicodeCategory(current) == UnicodeCategory.NonSpacingMark;
        if (!isCombiningMark)
        {
            result.Append(current);
        }
    }

    return result.ToString();
}
/// <summary>
/// Extension method that removes diacritical marks from a string.
/// </summary>
/// <param name="s">The text to process.</param>
/// <returns>The canonically decomposed (FormD) text without its non-spacing marks.</returns>
public static System.String RemoveDiacritics(this System.String s)
{
    System.String decomposed = s.Normalize(NormalizationForm.FormD);
    StringBuilder output = new StringBuilder();

    for (int i = 0; i < decomposed.Length; i++)
    {
        char ch = decomposed[i];
        if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark)
        {
            // Combining mark left over from the decomposition: drop it.
            continue;
        }

        output.Append(ch);
    }

    return output.ToString();
}
/// <summary>
/// Reduces a string to characters in the range 0..127.
/// </summary>
/// <param name="text">The text to convert.</param>
/// <returns>
/// The text after special-case substitution and canonical decomposition, keeping only
/// characters whose code is at most 127.
/// </returns>
public static String asciify(String text)
{
    // Pass 1: compose (FormC) and substitute every character listed in specialCases.
    String composed = text.Normalize (System.Text.NormalizationForm.FormC);
    StringBuilder substituted = new StringBuilder ();
    for (int i = 0; i < composed.Length; i++)
    {
        char ch = composed[i];
        substituted.Append (specialCases.ContainsKey (ch) ? specialCases[ch] : ch.ToString ());
    }

    // Pass 2: decompose (FormD) and keep only characters up to code 127.
    String decomposed = substituted.ToString ().Normalize (System.Text.NormalizationForm.FormD);
    StringBuilder ascii = new StringBuilder ();
    for (int i = 0; i < decomposed.Length; i++)
    {
        char ch = decomposed[i];
        if (ch > 127)
        {
            continue;
        }

        ascii.Append (ch);
    }

    return ascii.ToString ();
}
/// <summary>
/// Removes diacritical marks from a string.
/// </summary>
/// <param name="s">The text to process.</param>
/// <returns>The canonically decomposed (FormD) text without its non-spacing marks.</returns>
public static string RemoveDiacritics(String s)
{
    // Separate the base characters from their modifiers (carons, acute accents, etc.).
    string decomposed = s.Normalize(System.Text.NormalizationForm.FormD);

    System.Text.StringBuilder builder = new System.Text.StringBuilder();
    foreach (char ch in decomposed)
    {
        // Copy every character except the modifiers.
        System.Globalization.UnicodeCategory category =
            System.Globalization.CharUnicodeInfo.GetUnicodeCategory(ch);
        if (category == System.Globalization.UnicodeCategory.NonSpacingMark)
        {
            continue;
        }

        builder.Append(ch);
    }

    // The accumulated text no longer contains diacritics.
    return builder.ToString();
}
/// <summary>
/// Computes the length of a tweet, counting every extracted URL at its fixed
/// shortened length instead of its literal length.
/// </summary>
/// <param name="text">The tweet text.</param>
/// <returns>
/// The number of text elements in the FormC-normalized text, minus the span of each
/// extracted URL, plus <c>ShortUrlLengthHttps</c> or <c>ShortUrlLength</c> per URL.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
public int GetTweetLength(String text)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    try
    {
        text = text.Normalize(NormalizationForm.FormC);
    }
    catch (ArgumentException)
    {
        // Best effort: text containing invalid Unicode cannot be normalized,
        // so it is measured as-is.
    }

    int length = new StringInfo(text).LengthInTextElements;

    // NOTE(review): Start/End are presumably UTF-16 indices while `length` counts
    // text elements; confirm against the extractor if non-BMP text must be exact.
    foreach (Extractor.Entity urlEntity in __Extractor.ExtractURLsWithIndices(text))
    {
        // Subtract the length of the original URL.
        length -= (urlEntity.End - urlEntity.Start);

        // URL schemes are not linguistic text, so compare ordinally and ignore case.
        bool isHttps = urlEntity.Value.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

        // Add the fixed shortened-URL length for the scheme.
        length += isHttps ? ShortUrlLengthHttps : ShortUrlLength;
    }

    return length;
}
/// <summary>
/// Removes accents by decomposing the text (FormKD) and discarding every character
/// that cannot be encoded as ASCII.
/// See http://msdn.microsoft.com/en-us/library/system.text.normalizationform.aspx
/// </summary>
/// <param name="Expr">The text to process.</param>
/// <returns>
/// The ASCII-only text, or <paramref name="Expr"/> itself when it is null or empty.
/// </returns>
public static String RemoveAccents(this String Expr)
{
    if (String.IsNullOrEmpty(Expr))
    {
        return Expr;
    }

    // FormC  full canonical decomposition, then recomposition where possible.
    // FormD  full canonical decomposition.
    // FormKC full compatibility decomposition, then recomposition where possible.
    // FormKD full compatibility decomposition.
    String decomposed = Expr.Normalize(NormalizationForm.FormKD);

    // An ASCII encoder with an empty replacement drops unencodable characters
    // instead of emitting '?'.
    Encoding asciiDropping = Encoding.GetEncoding(
        Encoding.ASCII.CodePage,
        new EncoderReplacementFallback(""),
        new DecoderReplacementFallback(""));

    return Encoding.ASCII.GetString(asciiDropping.GetBytes(decomposed));
}
/// <summary>
/// Removes diacritical marks from a string.
/// </summary>
/// <param name="s">The text to process.</param>
/// <returns>The canonically decomposed (FormD) text without its non-spacing marks.</returns>
private static string RemoveDiacritics(String s)
{
    // Separate the characters from their modifiers (carons, acute accents, etc.).
    string decomposed = s.Normalize(NormalizationForm.FormD);
    StringBuilder kept = new StringBuilder();

    int position = 0;
    while (position < decomposed.Length)
    {
        char current = decomposed[position];
        position++;

        // Append every character except the modifiers.
        if (CharUnicodeInfo.GetUnicodeCategory(current) != UnicodeCategory.NonSpacingMark)
        {
            kept.Append(current);
        }
    }

    // Return the string without diacritics.
    return kept.ToString();
}
/// <summary>
/// Normalizes a message and splits it on single space characters.
/// </summary>
/// <param name="messageContent">The raw message text.</param>
/// <returns>
/// The space-separated pieces of the normalized message; consecutive spaces yield
/// empty entries.
/// </returns>
public static string[] MessagePraser(String messageContent)
{
    string normalizedContent = messageContent.Normalize();
    return normalizedContent.Split(' ');
}
/// <summary>
/// Removes diacritics from a string.
/// </summary>
/// <param name="s">Text that may contain diacritics.</param>
/// <returns>The canonically decomposed (FormD) text without its non-spacing marks.</returns>
private string RemoveDiacritics(String s)
{
    // Decompose so that each accent becomes a separate combining character.
    string decomposed = s.Normalize(NormalizationForm.FormD);
    StringBuilder cleaned = new StringBuilder();

    foreach (char symbol in decomposed)
    {
        // Skip the combining marks; only the base characters are kept.
        if (CharUnicodeInfo.GetUnicodeCategory(symbol) == UnicodeCategory.NonSpacingMark)
        {
            continue;
        }

        cleaned.Append(symbol);
    }

    // The result no longer carries any diacritics.
    return cleaned.ToString();
}
/// <summary>
/// Removes every accent from the given word.
/// </summary>
/// <param name="value">The accented word.</param>
/// <returns>The word without accents.</returns>
internal static String RemoveAcento(String value)
{
    StringBuilder unaccented = new StringBuilder();

    // FormD separates each accent from its base letter, so the accents can be skipped.
    foreach (Char letter in value.Normalize(NormalizationForm.FormD))
    {
        UnicodeCategory category = CharUnicodeInfo.GetUnicodeCategory(letter);
        if (category != UnicodeCategory.NonSpacingMark)
        {
            unaccented.Append(letter);
        }
    }

    return unaccented.ToString();
}
/// <summary>
/// Checks that a non-normalized string is reported as such for every normalization
/// form, and that normalizing it to a form yields a string that is normalized in
/// that form.
/// </summary>
public static unsafe void NormalizationTest()
{
    // U+0063  LATIN SMALL LETTER C
    // U+0301  COMBINING ACUTE ACCENT
    // U+0327  COMBINING CEDILLA
    // U+00BE  VULGAR FRACTION THREE QUARTERS
    string s = "\u0063\u0301\u0327\u00BE";

    // The raw input must not be considered normalized in any form.
    Assert.False(s.IsNormalized(), "String should be not normalized when checking with the default which same as FormC");
    Assert.False(s.IsNormalized(NormalizationForm.FormC), "String should be not normalized when checking with FormC");
    Assert.False(s.IsNormalized(NormalizationForm.FormD), "String should be not normalized when checking with FormD");
    Assert.False(s.IsNormalized(NormalizationForm.FormKC), "String should be not normalized when checking with FormKC");
    Assert.False(s.IsNormalized(NormalizationForm.FormKD), "String should be not normalized when checking with FormKD");

    // Default overload (FormC).
    string byDefault = s.Normalize();
    Assert.True(byDefault.IsNormalized(), "Expected to have the normalized string with default form FormC");
    Assert.True(byDefault.IsNormalized(NormalizationForm.FormC), "Expected to have the normalized string with FormC");

    // Explicit FormC.
    string formC = s.Normalize(NormalizationForm.FormC);
    Assert.True(formC.IsNormalized(), "Expected to have the normalized string with default form FormC when using NormalizationForm.FormC");
    Assert.True(formC.IsNormalized(NormalizationForm.FormC), "Expected to have the normalized string with FormC when using NormalizationForm.FormC");

    // FormD.
    string formD = s.Normalize(NormalizationForm.FormD);
    Assert.True(formD.IsNormalized(NormalizationForm.FormD), "Expected to have the normalized string with FormD");

    // FormKC.
    string formKC = s.Normalize(NormalizationForm.FormKC);
    Assert.True(formKC.IsNormalized(NormalizationForm.FormKC), "Expected to have the normalized string with FormKC");

    // FormKD.
    string formKD = s.Normalize(NormalizationForm.FormKD);
    Assert.True(formKD.IsNormalized(NormalizationForm.FormKD), "Expected to have the normalized string with FormKD");
}
/// <summary>
/// Encodes a substring of a Unicode domain name into its ASCII (Punycode) form.
/// </summary>
/// <param name="unicode">The string that contains the text to convert.</param>
/// <param name="index">Zero-based offset in <paramref name="unicode"/> where conversion starts.</param>
/// <param name="count">Number of characters of <paramref name="unicode"/> to convert.</param>
/// <returns>
/// The OS-provided result on Windows 8 or above; otherwise the substring unchanged when
/// the ASCII validation succeeds, or the result of <c>punycode_encode</c> on the
/// normalized substring.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="unicode"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="index"/> or <paramref name="count"/> is negative, <paramref name="index"/>
/// is beyond the end of the string, or the requested range does not fit in the string.
/// </exception>
/// <exception cref="ArgumentException">
/// The substring ends in a character at or below U+001F, or normalization leaves a
/// trailing dot that was not present before.
/// </exception>
public String GetAscii(String unicode, int index, int count)
{
    if (unicode==null) throw new ArgumentNullException("unicode");
    if (index < 0 || count < 0)
        throw new ArgumentOutOfRangeException((index < 0) ? "index" : "count",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    // Report the offending parameter by its real name; this method has no "byteIndex".
    if (index > unicode.Length)
        throw new ArgumentOutOfRangeException("index",
            Environment.GetResourceString("ArgumentOutOfRange_Index"));
    if (index > unicode.Length - count)
        throw new ArgumentOutOfRangeException("unicode",
            Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
    Contract.EndContractBlock();

    // We're only using part of the string
    unicode = unicode.Substring(index, count);

    if (Environment.IsWindows8OrAbove)
    {
        return GetAsciiUsingOS(unicode);
    }

    // Check for ASCII only string, which will be unchanged
    if (ValidateStd3AndAscii(unicode, UseStd3AsciiRules, true))
    {
        return unicode;
    }

    // Cannot be null terminated (normalization won't help us with this one, and
    // may have returned false before checking the whole string above)
    Contract.Assert(unicode.Length >= 1, "[IdnMapping.GetAscii]Expected 0 length strings to fail before now.");
    if (unicode[unicode.Length - 1] <= 0x1f)
    {
        throw new ArgumentException(
            Environment.GetResourceString("Argument_InvalidCharSequence", unicode.Length-1 ),
            "unicode");
    }

    // Have to correctly IDNA normalize the string and Unassigned flags
    bool bHasLastDot = (unicode.Length > 0) && IsDot(unicode[unicode.Length - 1]);
    unicode = unicode.Normalize((NormalizationForm)(m_bAllowUnassigned ?
        ExtendedNormalizationForms.FormIdna : ExtendedNormalizationForms.FormIdnaDisallowUnassigned));

    // Make sure we didn't normalize away something after a last dot
    if ((!bHasLastDot) && unicode.Length > 0 && IsDot(unicode[unicode.Length - 1]))
    {
        throw new ArgumentException(Environment.GetResourceString(
            "Argument_IdnBadLabelSize"), "unicode");
    }

    // May need to check Std3 rules again for non-ascii
    if (UseStd3AsciiRules)
    {
        ValidateStd3AndAscii(unicode, true, false);
    }

    // Go ahead and encode it
    return punycode_encode(unicode);
}
/// <summary>
/// Converts text to the restricted character set used for the FIC Spectra display:
/// upper-case A-Z, digits and spaces.
/// </summary>
/// <param name="s">The text to convert.</param>
/// <returns>
/// The upper-cased text with diacritical marks removed and every other unsupported
/// character replaced by a single space.
/// </returns>
private static string RemoveDiacritics(String s)
{
    // Separate the base characters from their modifiers (carons, acute accents, etc.).
    s = s.Normalize(NormalizationForm.FormD);

    // Only upper case characters for FIC Spectra display. Invariant casing keeps the
    // result inside A-Z regardless of the current culture (e.g. Turkish dotted I).
    s = s.ToUpperInvariant();

    StringBuilder sb = new StringBuilder(s.Length);
    foreach (char c in s)
    {
        // Drop the combining marks produced by the decomposition; turning them into
        // spaces would split accented words ("C AJ" instead of "CAJ").
        if (System.Globalization.CharUnicodeInfo.GetUnicodeCategory(c)
            == System.Globalization.UnicodeCategory.NonSpacingMark)
        {
            continue;
        }

        bool isSupported = (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || (c == ' ');

        // Any other unsupported character is blanked out.
        sb.Append(isSupported ? c : ' ');
    }

    // Return the string without diacritics.
    return sb.ToString();
}
/// <summary>
/// Builds a URL by appending a slug derived from <paramref name="title"/> to
/// <paramref name="path"/>.
/// </summary>
/// <param name="path">The path that prefixes the slug.</param>
/// <param name="title">The title to turn into a lower-case, dash-separated slug.</param>
/// <returns>
/// <paramref name="path"/> followed by the slug, or an empty string when
/// <paramref name="title"/> is null or empty.
/// </returns>
public static String ReturnUrlFormated(String path, String title)
{
    if (String.IsNullOrEmpty(title)) return "";

    // Change diacritic characters to non-diacritic ones: decompose, then strip
    // everything outside the allowed set (this removes the combining marks).
    title = Regex.Replace(title.Normalize(NormalizationForm.FormD), @"[^A-Za-z 0-9 \.,\?'""!@#\$%\^&\*\(\)-_=\+;:<>\/\\\|\}\{\[\]`~]*", string.Empty).Trim();

    // Remove entities.
    title = Regex.Replace(title, @"&\w+;", "");

    // Remove anything that is not letters, numbers, dash, or space.
    title = Regex.Replace(title, @"[^A-Za-z0-9\-\s]", "");

    // Remove any leading or trailing spaces left over.
    title = title.Trim();

    // Replace spaces with single dash.
    title = Regex.Replace(title, @"\s+", "-");

    // If we end up with multiple dashes, collapse to single dash.
    title = Regex.Replace(title, @"\-{2,}", "-");

    // Make it all lower case. Invariant casing keeps the slug ASCII under every
    // culture (a Turkish culture would otherwise map 'I' to dotless 'ı').
    title = title.ToLowerInvariant();

    // Remove trailing dash, if there is one (ordinal: this is not linguistic text).
    if (title.EndsWith("-", StringComparison.Ordinal))
        title = title.Substring(0, title.Length - 1);

    return path + title;
}