public static RemoveDiacriticsAndPunctuation ( string input, bool isSingleWord ) : string | ||
input | string | The input string. |
isSingleWord | bool | A value indicating whether the input string is a single word. |
return | string |
/// <summary>
/// Tokenizes a string.
/// </summary>
/// <remarks>
/// Only words that start before character index 65500 are extracted, because word positions
/// are passed to <see cref="WordInfo" /> as <see cref="ushort" /> values. Each extracted word is
/// normalized with <c>Tools.RemoveDiacriticsAndPunctuation</c> in single-word mode, and words
/// that are empty after normalization are skipped.
/// </remarks>
/// <param name="text">The text to tokenize.</param>
/// <param name="location">The location of the words that are extracted.</param>
/// <returns>The tokens.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="text"/> is <c>null</c>.</exception>
public static WordInfo[] Tokenize(string text, WordLocation location)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    // Scanning stops once the current position reaches this index, so that
    // every word start index fits in a ushort.
    const int MaxStartIndex = 65500;

    // Average 5 chars/word; only the scanned part of the text can produce words.
    List<WordInfo> words = new List<WordInfo>(Math.Min(text.Length, MaxStartIndex) / 5);

    // The indices are kept as int: a ushort counter wraps to 0 (or throws in a checked
    // build) when a word runs past index 65535, which made the scan loop forever or
    // produced a negative Substring length.

    // Skip all leading splitChars
    int currentIndex = SkipSplitChars(0, text);
    int currentWordStart = currentIndex;

    while (currentIndex < text.Length && currentIndex < MaxStartIndex)
    {
        // Advance to the end of the current word
        while (currentIndex < text.Length && !Tools.IsSplitChar(text[currentIndex]))
        {
            currentIndex++;
        }

        string w = text.Substring(currentWordStart, currentIndex - currentWordStart);
        w = Tools.RemoveDiacriticsAndPunctuation(w, true);
        if (!string.IsNullOrEmpty(w))
        {
            // currentWordStart < MaxStartIndex here, so the narrowing cast is lossless
            words.Add(new WordInfo(w, (ushort)currentWordStart, (ushort)words.Count, location));
        }

        // The loop would end after this iteration anyway; leaving now avoids
        // narrowing an index that may no longer fit in a ushort.
        if (currentIndex >= MaxStartIndex)
        {
            break;
        }

        currentIndex = SkipSplitChars((ushort)(currentIndex + 1), text);
        currentWordStart = currentIndex;
    }

    return words.ToArray();
}
/// <summary>
/// Initializes a new instance of the <see cref="WordInfo" /> class.
/// </summary>
/// <remarks>
/// The supplied text is normalized with <c>Tools.RemoveDiacriticsAndPunctuation</c>
/// (single-word mode) before it is stored.
/// </remarks>
/// <param name="text">The text of the word.</param>
/// <param name="firstCharIndex">The index of the first character of the word in the document.</param>
/// <param name="wordIndex">The index of the word in the document.</param>
/// <param name="location">The location of the word in the document.</param>
/// <exception cref="ArgumentNullException">If <paramref name="text"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">If <paramref name="text"/> is empty.</exception>
public WordInfo(string text, ushort firstCharIndex, ushort wordIndex, WordLocation location)
    : base(firstCharIndex, wordIndex, location)
{
    if (string.IsNullOrEmpty(text))
    {
        // Null and empty input are reported with different exception types.
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }

        throw new ArgumentException("Invalid text", "text");
    }

    // Note: the normalized value is stored as-is; it is not checked for emptiness again.
    this.text = Tools.RemoveDiacriticsAndPunctuation(text, true);
}
/// <summary>
/// Initializes a new instance of the <see cref="Word" /> class.
/// </summary>
/// <remarks>
/// The supplied text is normalized with <c>Tools.RemoveDiacriticsAndPunctuation</c>
/// (single-word mode) before it is stored.
/// </remarks>
/// <param name="id">The word ID.</param>
/// <param name="text">The text of the word (lowercase).</param>
/// <param name="occurrences">The occurrences.</param>
/// <exception cref="ArgumentNullException">If <paramref name="text" /> or <paramref name="occurrences" /> are <c>null</c>.</exception>
/// <exception cref="ArgumentException">If <paramref name="text" /> is empty.</exception>
public Word(uint id, string text, OccurrenceDictionary occurrences)
{
    // The text is validated before the occurrences, so a bad text is always reported first.
    if (string.IsNullOrEmpty(text))
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }

        throw new ArgumentException("Text must contain at least one character", "text");
    }

    if (occurrences == null)
    {
        throw new ArgumentNullException("occurrences");
    }

    this.id = id;
    this.occurrences = occurrences;

    // Note: the normalized value is stored as-is; it is not checked for emptiness again.
    this.text = Tools.RemoveDiacriticsAndPunctuation(text, true);
}
/// <summary>
/// Prepares a query for searching.
/// </summary>
/// <remarks>
/// Every split character in the query is replaced with a single space, then the result is
/// normalized with <c>Tools.RemoveDiacriticsAndPunctuation</c> (not in single-word mode).
/// Substituting spaces for split characters first is what makes this slightly different
/// from calling <c>RemoveDiacriticsAndPunctuation</c> directly.
/// </remarks>
/// <param name="query">The query.</param>
/// <returns>The prepared query.</returns>
private static string PrepareQuery(string query)
{
    char[] characters = query.ToCharArray();

    for (int i = 0; i < characters.Length; i++)
    {
        if (ScrewTurn.Wiki.SearchEngine.Tools.IsSplitChar(characters[i]))
        {
            characters[i] = ' ';
        }
    }

    return Tools.RemoveDiacriticsAndPunctuation(new string(characters), false);
}