Example #1
0
        /// <summary>
        /// Remove diacritics (i.e., accents) from String
        /// </summary>
        /// <param name="word">
        /// A <see cref="String"/>
        /// </param>
        /// <returns>
        /// A <see cref="String"/>
        /// </returns>
        public static String removeDiacritics(String word)
        {
            String kdform =  word.Normalize(NormalizationForm.FormKD);
            StringBuilder sb = new StringBuilder();

            for(int i = 0; i < kdform.Length; i++) {
                UnicodeCategory uc = CharUnicodeInfo.GetUnicodeCategory(kdform[i]);
                if(uc != UnicodeCategory.NonSpacingMark)
                    sb.Append(kdform[i]);
            }
            return sb.ToString();
        }
        /// <summary>
        /// Remove os acentos de uma string
        /// </summary>
        /// <param name="s"></param>
        /// <returns></returns>
        public static String RemoveDiacritics(String s)
        {
            var normalizedString = s.Normalize(NormalizationForm.FormD);
            var stringBuilder = new StringBuilder();

            for (var i = 0; i < normalizedString.Length; i++)
            {
                var c = normalizedString[i];
                if (CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
                    stringBuilder.Append(c);
            }

            return stringBuilder.ToString();
        }
Example #3
0
		public static String asciify(String text) {
			text = text.Normalize (System.Text.NormalizationForm.FormC);
			StringBuilder newText = new StringBuilder ();
			foreach (char c in text) {
				newText.Append (specialCases.ContainsKey(c) ? specialCases[c] : c.ToString ());
			}

			text = newText.ToString().Normalize (System.Text.NormalizationForm.FormD);
			newText = new StringBuilder ();
			foreach (char c in text) {
				if (c <= 127) {
					newText.Append (c);
				}
			}
			return newText.ToString ();
		}
Example #4
0
        public static string RemoveDiacritics(String s)
        {
            // oddělení znaků od modifikátorů (háčků, čárek, atd.)
            s = s.Normalize(System.Text.NormalizationForm.FormD);
            System.Text.StringBuilder sb = new System.Text.StringBuilder();

            for (int i = 0; i < s.Length; i++)
            {
                // do řetězce přidá všechny znaky kromě modifikátorů
                if (System.Globalization.CharUnicodeInfo.GetUnicodeCategory(s[i]) != System.Globalization.UnicodeCategory.NonSpacingMark)
                {
                    sb.Append(s[i]);
                }
            }

            // vrátí řetězec bez diakritiky
            return sb.ToString();
        }
Example #5
0
        /// <summary>
        /// 
        /// </summary>
        /// <param name="text"></param>
        /// <returns></returns>
        public int GetTweetLength(String text)
        {
            try
            {
                text = text.Normalize(NormalizationForm.FormC);
            }
            catch { }

            int length = new StringInfo(text).LengthInTextElements;
            foreach (Extractor.Entity urlEntity in __Extractor.ExtractURLsWithIndices(text))
            {
                // Subtract the length of the original URL
                length -= (urlEntity.End - urlEntity.Start);

                // Add `ShortUrlLengthHttps` characters for URL starting with https:// Otherwise add `ShortUrlLength` characters
                length += urlEntity.Value.ToLower().StartsWith("https://") ? ShortUrlLengthHttps : ShortUrlLength;
            }
            return length;
        }
Example #6
0
    private static string RemoveDiacritics(String s)
    {
      // oddìlení znakù od modifikátorù (háèkù, èárek, atd.)
      s = s.Normalize(NormalizationForm.FormD);
      StringBuilder sb = new StringBuilder();

      for (int i = 0; i < s.Length; i++)
      {
        // do øetìzce pøidá všechny znaky kromì modifikátorù
        if (CharUnicodeInfo.GetUnicodeCategory(s[i]) != UnicodeCategory.NonSpacingMark)
        {
          sb.Append(s[i]);
        }
      }

      // vrátí øetìzec bez diakritiky
      return sb.ToString();
    }
Example #7
0
 public static string[] MessagePraser(String messageContent)
 {
     string[] messagePrased;
     messagePrased = messageContent.Normalize().Split(' ');
     return messagePrased;
 }
        /// <summary>
        /// Removes diacritics
        /// </summary>
        /// <param name="s">text string possibly with diaqcritics</param>
        /// <returns></returns>
        private string RemoveDiacritics(String s)
        {
            //normalize string
            s = s.Normalize(NormalizationForm.FormD);
            StringBuilder sb = new StringBuilder();

            for (int i = 0; i < s.Length; i++)
            {
                //substitute characters with diacritics by characters without diacritics
                if (CharUnicodeInfo.GetUnicodeCategory(s[i]) != UnicodeCategory.NonSpacingMark)
                {
                    sb.Append(s[i]);
                }
            }
            //return string where diacritics have been removed
            return sb.ToString();
        }
Example #9
0
        /// <summary>
        /// Remove todos os acentos das palavras.
        /// </summary>
        /// <param name="value">palavra acentuada</param>
        /// <returns>palavra sem acento</returns>
        internal static String RemoveAcento(String value)
        {
            String normalizedString = value.Normalize(NormalizationForm.FormD);
            StringBuilder stringBuilder = new StringBuilder();

            for (int i = 0; i < normalizedString.Length; i++)
            {
                Char c = normalizedString[i];
                if (CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
                    stringBuilder.Append(c);
            }

            return stringBuilder.ToString();
        }
        public static unsafe void NormalizationTest()
        {
            // U+0063  LATIN SMALL LETTER C
            // U+0301  COMBINING ACUTE ACCENT
            // U+0327  COMBINING CEDILLA
            // U+00BE  VULGAR FRACTION THREE QUARTERS            
            string s = new String( new char[] {'\u0063', '\u0301', '\u0327', '\u00BE'});

            Assert.False(s.IsNormalized(), "String should be not normalized when checking with the default which same as FormC");
            Assert.False(s.IsNormalized(NormalizationForm.FormC), "String should be not normalized when checking with FormC");
            Assert.False(s.IsNormalized(NormalizationForm.FormD), "String should be not normalized when checking with FormD");
            Assert.False(s.IsNormalized(NormalizationForm.FormKC), "String should be not normalized when checking with FormKC");
            Assert.False(s.IsNormalized(NormalizationForm.FormKD), "String should be not normalized when checking with FormKD");

            string normalized = s.Normalize(); // FormC
            Assert.True(normalized.IsNormalized(), "Expected to have the normalized string with default form FormC");
            Assert.True(normalized.IsNormalized(NormalizationForm.FormC), "Expected to have the normalized string with FormC");
            
            normalized = s.Normalize(NormalizationForm.FormC);
            Assert.True(normalized.IsNormalized(), "Expected to have the normalized string with default form FormC when using NormalizationForm.FormC");
            Assert.True(normalized.IsNormalized(NormalizationForm.FormC), "Expected to have the normalized string with FormC when using NormalizationForm.FormC");

            normalized = s.Normalize(NormalizationForm.FormD);
            Assert.True(normalized.IsNormalized(NormalizationForm.FormD), "Expected to have the normalized string with FormD");

            normalized = s.Normalize(NormalizationForm.FormKC);
            Assert.True(normalized.IsNormalized(NormalizationForm.FormKC), "Expected to have the normalized string with FormKC");

            normalized = s.Normalize(NormalizationForm.FormKD);
            Assert.True(normalized.IsNormalized(NormalizationForm.FormKD), "Expected to have the normalized string with FormKD");
        }
Example #11
0
        public String GetAscii(String unicode, int index, int count)
        {
            if (unicode==null) throw new ArgumentNullException("unicode");
            if (index < 0 || count < 0)
                throw new ArgumentOutOfRangeException((index < 0) ? "index" : "count",
                      Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
            if (index > unicode.Length)
                throw new ArgumentOutOfRangeException("byteIndex",
                    Environment.GetResourceString("ArgumentOutOfRange_Index"));
            if (index > unicode.Length - count)
                throw new ArgumentOutOfRangeException("unicode",
                      Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
            Contract.EndContractBlock();

            // We're only using part of the string
            unicode = unicode.Substring(index, count);

            if (Environment.IsWindows8OrAbove)
            {
                return GetAsciiUsingOS(unicode);
            }

            // Check for ASCII only string, which will be unchanged
            if (ValidateStd3AndAscii(unicode, UseStd3AsciiRules, true))
            {
                return unicode;
            }

            // Cannot be null terminated (normalization won't help us with this one, and
            // may have returned false before checking the whole string above)
            Contract.Assert(unicode.Length >= 1, "[IdnMapping.GetAscii]Expected 0 length strings to fail before now.");
            if (unicode[unicode.Length - 1] <= 0x1f)
            {
                throw new ArgumentException(
                    Environment.GetResourceString("Argument_InvalidCharSequence", unicode.Length-1 ),
                    "unicode");
            }

            // Have to correctly IDNA normalize the string and Unassigned flags
            bool bHasLastDot = (unicode.Length > 0) && IsDot(unicode[unicode.Length - 1]);
            unicode = unicode.Normalize((NormalizationForm)(m_bAllowUnassigned ?
                ExtendedNormalizationForms.FormIdna : ExtendedNormalizationForms.FormIdnaDisallowUnassigned));

            // Make sure we didn't normalize away something after a last dot
            if ((!bHasLastDot) && unicode.Length > 0 && IsDot(unicode[unicode.Length - 1]))
            {
                throw new ArgumentException(Environment.GetResourceString(
                    "Argument_IdnBadLabelSize"), "unicode");
            }

            // May need to check Std3 rules again for non-ascii
            if (UseStd3AsciiRules)
            {
                ValidateStd3AndAscii(unicode, true, false);
            }

            // Go ahead and encode it
            return punycode_encode(unicode);
        }
Example #12
0
    private static string RemoveDiacritics(String s)
    {
      // oddìlení znakù od modifikátorù (háèkù, èárek, atd.)
      s = s.Normalize(NormalizationForm.FormD);
      // only upper case characters for FIC Spectra display
      s = s.ToUpper();
      StringBuilder sb = new StringBuilder();

      for (int i = 0; i < s.Length; i++)
      {
        if ((s[i] >= 'A' && s[i] <= 'Z') || (s[i] >= '0' && s[i] <= '9') || (s[i] == ' '))
        {
          sb.Append(s[i]);
        }
        else
        {
          sb.Append(' ');
        }
      }

      // vrátí øetìzec bez diakritiky
      return sb.ToString();
    }
Example #13
0
        /// <summary>
        /// Returns the URL formated.
        /// </summary>
        /// <param name="path">The path.</param>
        /// <param name="title">The title.</param>
        /// <returns></returns>
        public static String ReturnUrlFormated(String path, String title)
        {
            if (String.IsNullOrEmpty(title)) return "";
            //change diacritic characters to non-diacritic ones
            title = Regex.Replace(title.Normalize(NormalizationForm.FormD), @"[^A-Za-z 0-9 \.,\?'""[email protected]#\$%\^&\*\(\)-_=\+;:<>\/\\\|\}\{\[\]`~]*", string.Empty).Trim();
            // remove entities
            title = Regex.Replace(title, @"&\w+;", "");
            // remove anything that is not letters, numbers, dash, or space
            title = Regex.Replace(title, @"[^A-Za-z0-9\-\s]", "");
            // remove any leading or trailing spaces left over
            title = title.Trim();
            // replace spaces with single dash
            title = Regex.Replace(title, @"\s+", "-");
            // if we end up with multiple dashes, collapse to single dash            
            title = Regex.Replace(title, @"\-{2,}", "-");
            // make it all lower case
            title = title.ToLower();
            //// if it's too long, clip it
            //if (title.Length > 80)
            //    title = title.Substring(0, 79);
            // remove trailing dash, if there is one
            if (title.EndsWith("-"))
                title = title.Substring(0, title.Length - 1);

            return path + title;
        }