RemoveDiacriticsAndPunctuation() public static method

Removes "accents" and punctuation from a string, transforming it to lowercase (culture invariant).
public static RemoveDiacriticsAndPunctuation ( string input, bool isSingleWord ) : string
input string The input string.
isSingleWord bool A value indicating whether the input string is a single word.
return string
示例#1
0
        /// <summary>
        /// Splits a string into normalized words, recording where each word starts.
        /// </summary>
        /// <param name="text">The text to tokenize.</param>
        /// <param name="location">The location of the words that are extracted.</param>
        /// <returns>The tokens, in the order they appear in <paramref name="text"/>.</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="text"/> is <c>null</c>.</exception>
        /// <remarks>
        /// Word positions are stored as <see cref="ushort"/> values, so only words that start
        /// before character index 65500 are extracted; the rest of the text is ignored.
        /// </remarks>
        public static WordInfo[] Tokenize(string text, WordLocation location)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            // Words starting at or after this index are not tokenized (positions must fit in a ushort).
            const int MaxWordStart = 65500;

            List<WordInfo> words = new List<WordInfo>(text.Length / 5); // Average 5 chars/word

            // Indexes are tracked as int: a ushort counter silently wraps to 0 when a word
            // runs past index 65535, which corrupts the scan (negative lengths or endless loops).
            // Skip all leading split chars
            int currentIndex = SkipSplitChars(0, text);
            int currentWordStart = currentIndex;

            while (currentIndex < text.Length && currentIndex < MaxWordStart)
            {
                // Advance to the end of the current word
                while (currentIndex < text.Length && !Tools.IsSplitChar(text[currentIndex]))
                {
                    currentIndex++;
                }

                string w = text.Substring(currentWordStart, currentIndex - currentWordStart);
                w = Tools.RemoveDiacriticsAndPunctuation(w, true);

                // Normalization may strip every character; such words are dropped
                if (!string.IsNullOrEmpty(w))
                {
                    // currentWordStart < MaxWordStart here, so the narrowing cast is lossless
                    words.Add(new WordInfo(w, (ushort)currentWordStart, (ushort)words.Count, location));
                }

                // currentIndex + 1 would not fit in a ushort (the cast would wrap to 0 and
                // restart the scan from the beginning); nothing past this point is tokenized anyway.
                if (currentIndex >= ushort.MaxValue)
                {
                    break;
                }

                currentIndex = SkipSplitChars((ushort)(currentIndex + 1), text);
                currentWordStart = currentIndex;
            }

            return words.ToArray();
        }
示例#2
0
        /// <summary>
        /// Creates a <see cref="WordInfo" /> describing one word and its position.
        /// </summary>
        /// <param name="text">The text of the word; it is normalized before being stored.</param>
        /// <param name="firstCharIndex">The index of the first character of the word in the document.</param>
        /// <param name="wordIndex">The index of the word in the document.</param>
        /// <param name="location">The location of the word in the document.</param>
        /// <exception cref="ArgumentNullException">If <paramref name="text"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException">If <paramref name="text"/> is empty.</exception>
        /// <remarks>
        /// The positional arguments are forwarded unchanged to the base constructor.
        /// </remarks>
        public WordInfo(string text, ushort firstCharIndex, ushort wordIndex, WordLocation location)
            : base(firstCharIndex, wordIndex, location)
        {
            if (string.IsNullOrEmpty(text))
            {
                // Distinguish the two failure modes so callers get the documented exception type
                if (text == null)
                {
                    throw new ArgumentNullException("text");
                }

                throw new ArgumentException("Invalid text", "text");
            }

            // Note: the normalized result is stored as-is, even if normalization leaves it empty.
            string normalized = Tools.RemoveDiacriticsAndPunctuation(text, true);
            this.text = normalized;
        }
示例#3
0
        /// <summary>
        ///     Creates a <see cref="Word" /> with the given ID, text and occurrences.
        /// </summary>
        /// <param name="id">The word ID.</param>
        /// <param name="text">The text of the word; it is normalized before being stored.</param>
        /// <param name="occurrences">The occurrences.</param>
        /// <exception cref="ArgumentNullException">If <paramref name="text" /> or <paramref name="occurrences" /> are <c>null</c>.</exception>
        /// <exception cref="ArgumentException">If <paramref name="text" /> is empty.</exception>
        public Word(uint id, string text, OccurrenceDictionary occurrences)
        {
            // The text checks run before the occurrences check, so an invalid text is reported first
            if (string.IsNullOrEmpty(text))
            {
                if (text == null)
                {
                    throw new ArgumentNullException("text");
                }

                throw new ArgumentException("Text must contain at least one character", "text");
            }

            if (occurrences == null)
            {
                throw new ArgumentNullException("occurrences");
            }

            this.id = id;
            this.occurrences = occurrences;

            // Note: the normalized result is stored as-is, even if normalization leaves it empty.
            string normalized = Tools.RemoveDiacriticsAndPunctuation(text, true);
            this.text = normalized;
        }
示例#4
0
        /// <summary>
        /// Normalizes a raw query string so it can be used for searching.
        /// </summary>
        /// <param name="query">The query.</param>
        /// <returns>The prepared query.</returns>
        private static string PrepareQuery(string query)
        {
            char[] buffer = new char[query.Length];

            // Pre-pass: every split character becomes a single space, everything else is kept.
            // This is intentionally slightly different from what RemoveDiacriticsAndPunctuation does.
            for (int i = 0; i < buffer.Length; i++)
            {
                char current = query[i];
                buffer[i] = ScrewTurn.Wiki.SearchEngine.Tools.IsSplitChar(current) ? ' ' : current;
            }

            // The query may contain several words, hence isSingleWord = false
            return Tools.RemoveDiacriticsAndPunctuation(new string(buffer), false);
        }