Example #1
        /// <summary>
        /// Removes diacritics (i.e., accents) from a string.
        /// </summary>
        /// <param name="word">
        /// The <see cref="String"/> to strip; must not be <c>null</c>.
        /// </param>
        /// <returns>
        /// The compatibility-decomposed (FormKD) form of <paramref name="word"/> with every
        /// non-spacing mark removed. The result is not recomposed afterwards.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="word"/> is <c>null</c>.</exception>
        public static String removeDiacritics(String word)
        {
            if (word == null) {
                throw new ArgumentNullException("word");
            }

            // FormKD splits an accented letter into its base letter followed by combining marks.
            String kdform = word.Normalize(NormalizationForm.FormKD);
            StringBuilder sb = new StringBuilder(kdform.Length);

            for (int i = 0; i < kdform.Length; i++) {
                UnicodeCategory uc = CharUnicodeInfo.GetUnicodeCategory(kdform[i]);
                // Keep everything except the combining marks produced by the decomposition.
                if (uc != UnicodeCategory.NonSpacingMark) {
                    sb.Append(kdform[i]);
                }
            }

            return sb.ToString();
        }
        /// <summary>
        /// Removes the accents from a string.
        /// </summary>
        /// <param name="s">The text to strip; must not be <c>null</c>.</param>
        /// <returns>
        /// The canonically decomposed (FormD) form of <paramref name="s"/> with every
        /// non-spacing mark removed.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="s"/> is <c>null</c>.</exception>
        public static String RemoveDiacritics(String s)
        {
            if (s == null)
            {
                throw new ArgumentNullException("s");
            }

            // FormD separates each base character from its combining marks.
            var normalizedString = s.Normalize(NormalizationForm.FormD);
            var stringBuilder = new StringBuilder(normalizedString.Length);

            for (var i = 0; i < normalizedString.Length; i++)
            {
                var c = normalizedString[i];
                if (CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
                {
                    stringBuilder.Append(c);
                }
            }

            return stringBuilder.ToString();
        }
Example #3
        /// <summary>
        /// Removes diacritics from a string.
        /// </summary>
        /// <param name="s">The text to strip; must not be <c>null</c>.</param>
        /// <returns>
        /// The canonically decomposed (FormD) form of <paramref name="s"/> with every
        /// non-spacing mark removed.
        /// </returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="s"/> is <c>null</c>.</exception>
        public static System.String RemoveDiacritics(this System.String s)
        {
            if (s == null)
            {
                throw new System.ArgumentNullException("s");
            }

            // FormD separates each base character from its combining marks.
            System.String normalizedString = s.Normalize(NormalizationForm.FormD);
            var           stringBuilder    = new StringBuilder(normalizedString.Length);

            foreach (char c in normalizedString)
            {
                if (CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
                {
                    stringBuilder.Append(c);
                }
            }

            return stringBuilder.ToString();
        }

Example #4
		/// <summary>
		/// Reduces <paramref name="text"/> to characters in the 0-127 range.
		/// </summary>
		/// <remarks>
		/// Works in two passes: first every character found in <c>specialCases</c> is replaced by
		/// its mapped value, then the result is canonically decomposed (FormD) and every
		/// character above 127 is dropped.
		/// </remarks>
		/// <param name="text">The text to convert.</param>
		/// <returns>The converted text.</returns>
		public static String asciify(String text) {
			// Compose first so that the lookup sees precomposed characters.
			text = text.Normalize (System.Text.NormalizationForm.FormC);
			StringBuilder newText = new StringBuilder ();
			foreach (char c in text) {
				newText.Append (specialCases.ContainsKey(c) ? specialCases[c] : c.ToString ());
			}

			// Decompose so that the base letters survive the filter below while the
			// combining marks (all above 127) are discarded.
			text = newText.ToString().Normalize (System.Text.NormalizationForm.FormD);
			newText = new StringBuilder ();
			foreach (char c in text) {
				if (c <= 127) {
					newText.Append (c);
				}
			}
			return newText.ToString ();
		}
Example #5
        /// <summary>
        /// Removes diacritics from a string.
        /// </summary>
        /// <param name="s">The text to strip; must not be <c>null</c>.</param>
        /// <returns>
        /// The canonically decomposed (FormD) form of <paramref name="s"/> with every
        /// non-spacing mark removed.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="s"/> is <c>null</c>.</exception>
        public static string RemoveDiacritics(String s)
        {
            if (s == null)
            {
                throw new ArgumentNullException("s");
            }

            // Separate the characters from their modifiers (carons, acute accents, etc.).
            s = s.Normalize(System.Text.NormalizationForm.FormD);
            System.Text.StringBuilder sb = new System.Text.StringBuilder(s.Length);

            for (int i = 0; i < s.Length; i++)
            {
                // Add every character except the modifiers to the result.
                if (System.Globalization.CharUnicodeInfo.GetUnicodeCategory(s[i]) != System.Globalization.UnicodeCategory.NonSpacingMark)
                {
                    sb.Append(s[i]);
                }
            }

            // Return the string without diacritics.
            return sb.ToString();
        }
Example #6
        /// <summary>
        /// Computes the length of a tweet, counting every URL found by the extractor as a
        /// fixed-length link instead of by its literal length.
        /// </summary>
        /// <param name="text">The tweet text to measure; must not be <c>null</c>.</param>
        /// <returns>
        /// The number of text elements in the FormC-normalized text, where each extracted URL
        /// contributes <c>ShortUrlLengthHttps</c> (when it starts with "https://") or
        /// <c>ShortUrlLength</c> instead of its own length.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
        public int GetTweetLength(String text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            try
            {
                text = text.Normalize(NormalizationForm.FormC);
            }
            catch (ArgumentException)
            {
                // Best effort: text containing invalid Unicode cannot be normalized,
                // so it is measured as-is.
            }

            int length = new StringInfo(text).LengthInTextElements;
            foreach (Extractor.Entity urlEntity in __Extractor.ExtractURLsWithIndices(text))
            {
                // Subtract the length of the original URL
                length -= (urlEntity.End - urlEntity.Start);

                // Add `ShortUrlLengthHttps` characters for URL starting with https:// Otherwise add `ShortUrlLength` characters
                bool isHttps = urlEntity.Value.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
                length += isHttps ? ShortUrlLengthHttps : ShortUrlLength;
            }

            return length;
        }
Example #7
        /// <summary>
        /// Strips accents by decomposing the text (FormKD) and then discarding every character
        /// that cannot be encoded as ASCII.
        /// See http://msdn.microsoft.com/en-us/library/system.text.normalizationform.aspx
        /// </summary>
        /// <remarks>
        /// Characters that have no ASCII base letter after decomposition are removed entirely,
        /// not transliterated.
        /// </remarks>
        /// <param name="Expr">The text to clean; may be <c>null</c> or empty.</param>
        /// <returns>
        /// <paramref name="Expr"/> itself when it is <c>null</c> or empty; otherwise the
        /// ASCII-only remainder of its FormKD decomposition.
        /// </returns>
        public static String RemoveAccents(this String Expr)
        {
            if (String.IsNullOrEmpty(Expr))
            {
                return Expr;
            }

            // FormC	full canonical decomposition, followed by the replacement of sequences with their primary composites, if possible.
            // FormD	full canonical decomposition.
            // FormKC	full compatibility decomposition, followed by the replacement of sequences with their primary composites, if possible.
            // FormKD	full compatibility decomposition.
            String normalizer = Expr.Normalize(NormalizationForm.FormKD);

            // An ASCII encoder whose fallback is the empty string silently drops every
            // character outside ASCII, which includes the combining marks.
            Encoding cleaner = Encoding.GetEncoding(Encoding.ASCII.CodePage, new EncoderReplacementFallback(""), new DecoderReplacementFallback(""));

            Byte[] bytes = cleaner.GetBytes(normalizer);
            return Encoding.ASCII.GetString(bytes);
        }

Example #8
    /// <summary>
    /// Removes diacritics from a string.
    /// </summary>
    /// <param name="s">The text to strip.</param>
    /// <returns>
    /// The canonically decomposed (FormD) form of <paramref name="s"/> with every
    /// non-spacing mark removed.
    /// </returns>
    private static string RemoveDiacritics(String s)
    {
      // Separate the characters from their modifiers (carons, acute accents, etc.).
      s = s.Normalize(NormalizationForm.FormD);
      StringBuilder sb = new StringBuilder(s.Length);

      for (int i = 0; i < s.Length; i++)
      {
        // Add every character except the modifiers to the result.
        if (CharUnicodeInfo.GetUnicodeCategory(s[i]) != UnicodeCategory.NonSpacingMark)
        {
          sb.Append(s[i]);
        }
      }

      // Return the string without diacritics.
      return sb.ToString();
    }
Example #9
 /// <summary>
 /// Splits a message into its space-separated parts.
 /// </summary>
 /// <param name="messageContent">The raw message text.</param>
 /// <returns>
 /// The parts of the normalized message, split on the space character. Consecutive
 /// spaces yield empty entries.
 /// </returns>
 public static string[] MessagePraser(String messageContent)
 {
     // Normalize() without arguments applies FormC.
     return messageContent.Normalize().Split(' ');
 }
        /// <summary>
        /// Removes diacritics.
        /// </summary>
        /// <param name="s">Text string possibly containing diacritics.</param>
        /// <returns>
        /// The canonically decomposed (FormD) form of <paramref name="s"/> with every
        /// non-spacing mark removed.
        /// </returns>
        private string RemoveDiacritics(String s)
        {
            // Decompose so that each accented character becomes a base character
            // followed by its combining marks.
            s = s.Normalize(NormalizationForm.FormD);
            StringBuilder sb = new StringBuilder(s.Length);

            for (int i = 0; i < s.Length; i++)
            {
                // Copy only the base characters; the combining marks are left out.
                if (CharUnicodeInfo.GetUnicodeCategory(s[i]) != UnicodeCategory.NonSpacingMark)
                {
                    sb.Append(s[i]);
                }
            }

            // Return the string with the diacritics removed.
            return sb.ToString();
        }
Example #11
        /// <summary>
        /// Removes every accent from the given word.
        /// </summary>
        /// <param name="value">The accented word; must not be <c>null</c>.</param>
        /// <returns>
        /// The word without accents: its canonically decomposed (FormD) form with every
        /// non-spacing mark removed.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="value"/> is <c>null</c>.</exception>
        internal static String RemoveAcento(String value)
        {
            if (value == null)
            {
                throw new ArgumentNullException("value");
            }

            // FormD separates each base character from its combining marks.
            String normalizedString = value.Normalize(NormalizationForm.FormD);
            StringBuilder stringBuilder = new StringBuilder(normalizedString.Length);

            for (int i = 0; i < normalizedString.Length; i++)
            {
                Char c = normalizedString[i];
                if (CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
                {
                    stringBuilder.Append(c);
                }
            }

            return stringBuilder.ToString();
        }
        /// <summary>
        /// Checks that a string which is unnormalized in every form reports itself as such, and
        /// that <c>Normalize</c> yields a string that <c>IsNormalized</c> accepts for each of
        /// FormC (also the default), FormD, FormKC and FormKD.
        /// </summary>
        public static unsafe void NormalizationTest()
        {
            // U+0063  LATIN SMALL LETTER C
            // U+0301  COMBINING ACUTE ACCENT
            // U+0327  COMBINING CEDILLA
            // U+00BE  VULGAR FRACTION THREE QUARTERS            
            string s = new String( new char[] {'\u0063', '\u0301', '\u0327', '\u00BE'});

            // The input must be rejected by every normalization form.
            Assert.False(s.IsNormalized(), "String should be not normalized when checking with the default which same as FormC");
            Assert.False(s.IsNormalized(NormalizationForm.FormC), "String should be not normalized when checking with FormC");
            Assert.False(s.IsNormalized(NormalizationForm.FormD), "String should be not normalized when checking with FormD");
            Assert.False(s.IsNormalized(NormalizationForm.FormKC), "String should be not normalized when checking with FormKC");
            Assert.False(s.IsNormalized(NormalizationForm.FormKD), "String should be not normalized when checking with FormKD");

            // The parameterless overload and an explicit FormC must agree.
            string normalized = s.Normalize(); // FormC
            Assert.True(normalized.IsNormalized(), "Expected to have the normalized string with default form FormC");
            Assert.True(normalized.IsNormalized(NormalizationForm.FormC), "Expected to have the normalized string with FormC");
            normalized = s.Normalize(NormalizationForm.FormC);
            Assert.True(normalized.IsNormalized(), "Expected to have the normalized string with default form FormC when using NormalizationForm.FormC");
            Assert.True(normalized.IsNormalized(NormalizationForm.FormC), "Expected to have the normalized string with FormC when using NormalizationForm.FormC");

            normalized = s.Normalize(NormalizationForm.FormD);
            Assert.True(normalized.IsNormalized(NormalizationForm.FormD), "Expected to have the normalized string with FormD");

            normalized = s.Normalize(NormalizationForm.FormKC);
            Assert.True(normalized.IsNormalized(NormalizationForm.FormKC), "Expected to have the normalized string with FormKC");

            normalized = s.Normalize(NormalizationForm.FormKD);
            Assert.True(normalized.IsNormalized(NormalizationForm.FormKD), "Expected to have the normalized string with FormKD");
        }
Example #13
        /// <summary>
        /// Produces the ASCII form of the <paramref name="count"/> characters of
        /// <paramref name="unicode"/> that start at <paramref name="index"/>.
        /// </summary>
        /// <param name="unicode">The source string; must not be <c>null</c>.</param>
        /// <param name="index">Start of the substring to convert; must not be negative or past the end of the string.</param>
        /// <param name="count">Length of the substring to convert; must not be negative or run past the end of the string.</param>
        /// <returns>
        /// The result of <c>GetAsciiUsingOS</c> when <c>Environment.IsWindows8OrAbove</c> is set;
        /// otherwise the substring itself when <c>ValidateStd3AndAscii</c> accepts it, or else
        /// the output of <c>punycode_encode</c> for the normalized substring.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="unicode"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="index"/> or <paramref name="count"/> is negative or lies outside the string.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// The substring ends in a character at or below 0x1f, or normalization leaves a
        /// trailing dot that was not there before.
        /// </exception>
        // NOTE(review): in this listing the method's braces and the closing argument lines of
        // several throw statements are missing; confirm against the complete source.
        public String GetAscii(String unicode, int index, int count)
            if (unicode==null) throw new ArgumentNullException("unicode");
            // Reject negative arguments, naming whichever one is at fault.
            // NOTE(review): the remaining argument(s) of this throw are not visible here.
            if (index < 0 || count < 0)
                throw new ArgumentOutOfRangeException((index < 0) ? "index" : "count",
            // NOTE(review): "byteIndex" does not match any parameter of this method, and the
            // remaining argument(s) of this throw are not visible here.
            if (index > unicode.Length)
                throw new ArgumentOutOfRangeException("byteIndex",
            // Written as a subtraction so that index + count is never computed.
            // NOTE(review): the remaining argument(s) of this throw are not visible here.
            if (index > unicode.Length - count)
                throw new ArgumentOutOfRangeException("unicode",

            // We're only using part of the string
            unicode = unicode.Substring(index, count);

            // On Windows 8 or above the conversion is handed to GetAsciiUsingOS.
            if (Environment.IsWindows8OrAbove)
                return GetAsciiUsingOS(unicode);

            // Check for ASCII only string, which will be unchanged
            if (ValidateStd3AndAscii(unicode, UseStd3AsciiRules, true))
                return unicode;

            // Cannot be null terminated (normalization won't help us with this one, and
            // may have returned false before checking the whole string above)
            Contract.Assert(unicode.Length >= 1, "[IdnMapping.GetAscii]Expected 0 length strings to fail before now.");
            // A final character at or below 0x1f is rejected.
            // NOTE(review): the closing argument(s) of this throw are not visible here.
            if (unicode[unicode.Length - 1] <= 0x1f)
                throw new ArgumentException(
                    Environment.GetResourceString("Argument_InvalidCharSequence", unicode.Length-1 ),

            // Have to correctly IDNA normalize the string and Unassigned flags
            // Remember whether the input already ended in a dot so that a dot which only
            // appears at the end after normalization can be detected below.
            bool bHasLastDot = (unicode.Length > 0) && IsDot(unicode[unicode.Length - 1]);
            unicode = unicode.Normalize((NormalizationForm)(m_bAllowUnassigned ?
                ExtendedNormalizationForms.FormIdna : ExtendedNormalizationForms.FormIdnaDisallowUnassigned));

            // Make sure we didn't normalize away something after a last dot
            if ((!bHasLastDot) && unicode.Length > 0 && IsDot(unicode[unicode.Length - 1]))
                throw new ArgumentException(Environment.GetResourceString(
                    "Argument_IdnBadLabelSize"), "unicode");

            // May need to check Std3 rules again for non-ascii
            // The return value is ignored here; presumably the call throws on a violation
            // - TODO confirm against ValidateStd3AndAscii.
            if (UseStd3AsciiRules)
                ValidateStd3AndAscii(unicode, true, false);

            // Go ahead and encode it
            return punycode_encode(unicode);
Example #14
    /// <summary>
    /// Prepares text for the FIC Spectra display: decomposes it (FormD), upper-cases it and
    /// then filters it character by character.
    /// </summary>
    /// <param name="s">The text to convert.</param>
    /// <returns>The contents of the builder filled by the filtering loop.</returns>
    // NOTE(review): the method's braces and part of the loop body appear to be missing from
    // this listing; confirm against the complete source before relying on the details below.
    private static string RemoveDiacritics(String s)
      // Separate the characters from their modifiers (carons, acute accents, etc.).
      s = s.Normalize(NormalizationForm.FormD);
      // only upper case characters for FIC Spectra display
      s = s.ToUpper();
      StringBuilder sb = new StringBuilder();

      for (int i = 0; i < s.Length; i++)
        // The test accepts 'A'-'Z', '0'-'9' and the space character.
        // NOTE(review): as shown, a space is appended for every accepted character and
        // s[i] itself is never appended. Presumably the accepted character was appended
        // here and the space belonged to a lost else-branch - TODO confirm.
        if ((s[i] >= 'A' && s[i] <= 'Z') || (s[i] >= '0' && s[i] <= '9') || (s[i] == ' '))
          sb.Append(' ');

      // Return the string without diacritics.
      return sb.ToString();
Example #15
        /// <summary>
        /// Builds a URL by appending a slug derived from <paramref name="title"/> to
        /// <paramref name="path"/>.
        /// </summary>
        /// <remarks>
        /// The slug is lower-case and consists only of ASCII letters, digits and single dashes;
        /// accented letters are reduced to their base letter. Lower-casing uses the invariant
        /// culture so that the result does not depend on the current culture (for example,
        /// a Turkish culture would otherwise turn 'I' into a non-ASCII dotless i).
        /// </remarks>
        /// <param name="path">The path that prefixes the slug.</param>
        /// <param name="title">The title to turn into a slug.</param>
        /// <returns>
        /// An empty string when <paramref name="title"/> is <c>null</c> or empty; otherwise
        /// <paramref name="path"/> followed by the slug.
        /// </returns>
        public static String ReturnUrlFormated(String path, String title)
        {
            if (String.IsNullOrEmpty(title)) return "";

            //change diacritic characters to non-diacritic ones
            // FormD splits accented letters into base letter + combining mark; the pattern
            // then deletes everything outside the listed ASCII characters. The unescaped
            // ")-_" forms a range, but every character in it is listed individually as well.
            title = Regex.Replace(title.Normalize(NormalizationForm.FormD), @"[^A-Za-z 0-9 \.,\?'""!@#\$%\^&\*\(\)-_=\+;:<>\/\\\|\}\{\[\]`~]*", string.Empty).Trim();
            // remove entities
            title = Regex.Replace(title, @"&\w+;", "");
            // remove anything that is not letters, numbers, dash, or space
            title = Regex.Replace(title, @"[^A-Za-z0-9\-\s]", "");
            // remove any leading or trailing spaces left over
            title = title.Trim();
            // replace spaces with single dash
            title = Regex.Replace(title, @"\s+", "-");
            // if we end up with multiple dashes, collapse to single dash            
            title = Regex.Replace(title, @"\-{2,}", "-");
            // make it all lower case
            title = title.ToLowerInvariant();
            // remove trailing dash, if there is one
            if (title.EndsWith("-", StringComparison.Ordinal))
            {
                title = title.Substring(0, title.Length - 1);
            }

            return path + title;
        }