/// <summary>
/// Tokenizes a string.
/// </summary>
/// <param name="text">The text to tokenize.</param>
/// <param name="location">The location of the words that are extracted.</param>
/// <returns>The tokens.</returns>
/// <remarks>Character positions are stored as <c>ushort</c> values: only words that start before
/// character index 65500 are extracted, and tokenization stops there.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="text"/> is <c>null</c>.</exception>
public static WordInfo[] Tokenize(string text, WordLocation location) {
    if (text == null) {
        throw new ArgumentNullException("text");
    }

    List<WordInfo> words = new List<WordInfo>(text.Length / 5); // Average 5 chars/word

    // Skip all leading split chars
    ushort currentWordStart = SkipSplitChars(0, text);

    while (currentWordStart < text.Length && currentWordStart < 65500) {
        // Scan for the end of the word with an int: a word that starts before the 65500 limit
        // may still run past ushort.MaxValue, and a ushort counter would silently wrap to zero
        int wordEnd = currentWordStart;
        while (wordEnd < text.Length && !Tools.IsSplitChar(text[wordEnd])) {
            wordEnd++;
        }

        string w = text.Substring(currentWordStart, wordEnd - currentWordStart);
        w = Tools.RemoveDiacriticsAndPunctuation(w, true);
        if (!string.IsNullOrEmpty(w)) {
            words.Add(new WordInfo(w, currentWordStart, (ushort)words.Count, location));
        }

        // The next start position must fit in a ushort; otherwise the cast below would wrap
        // around and tokenization would restart from the beginning of the text
        if (wordEnd >= ushort.MaxValue) {
            break;
        }

        currentWordStart = SkipSplitChars((ushort)(wordEnd + 1), text);
    }

    return words.ToArray();
}
/// <summary>
/// Stores a word in the catalog.
/// </summary>
/// <param name="wordText">The word to store (it is lowercased using the invariant culture before being looked up).</param>
/// <param name="document">The document the word occurs in.</param>
/// <param name="firstCharIndex">The index of the first character of the word in the document the word occurs at.</param>
/// <param name="wordIndex">The index of the word in the document.</param>
/// <param name="location">The location of the word.</param>
/// <param name="newWord">The newly-created word if <paramref name="wordText"/> was not in the catalog yet, or <c>null</c>.</param>
/// <param name="dumpedWord">The dumped data of the newly-created word, or <c>null</c> if the word already existed.</param>
/// <returns>The dumped word mapping data.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="wordText"/> or <paramref name="document"/> are <c>null</c>.</exception>
protected DumpedWordMapping StoreWord(string wordText, IDocument document, ushort firstCharIndex, ushort wordIndex,
    WordLocation location, out Word newWord, out DumpedWord dumpedWord) {

    // Validate before touching the catalog: a null document would otherwise be detected
    // only by AddOccurrence, after a new (orphan) word had already been added to the catalog
    if (wordText == null) {
        throw new ArgumentNullException("wordText");
    }
    if (document == null) {
        throw new ArgumentNullException("document");
    }

    wordText = wordText.ToLower(CultureInfo.InvariantCulture);

    lock (this) {
        Word word = null;

        if (!_catalog.TryGetValue(wordText, out word)) {
            // Use ZERO as initial ID, update when IndexStorer has stored the word
            // A reference to this newly-created word must be passed outside this method
            word = new Word(0, wordText);
            _catalog.Add(wordText, word);
            newWord = word;
            dumpedWord = new DumpedWord(word);
        }
        else {
            newWord = null;
            dumpedWord = null;
        }

        word.AddOccurrence(document, firstCharIndex, wordIndex, location);

        return new DumpedWordMapping(word.ID, document.ID, firstCharIndex, wordIndex, location.Location);
    }
}
/// <summary>
/// Initializes a new instance of the <see cref="WordInfo" /> class.
/// </summary>
/// <param name="text">The text of the word; it is stored after diacritics and punctuation have been removed.</param>
/// <param name="firstCharIndex">The index of the first character of the word in the document (forwarded to the base constructor).</param>
/// <param name="wordIndex">The index of the word in the document (forwarded to the base constructor).</param>
/// <param name="location">The location of the word in the document (forwarded to the base constructor).</param>
/// <exception cref="ArgumentNullException">If <paramref name="text"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">If <paramref name="text"/> is empty.</exception>
public WordInfo(string text, ushort firstCharIndex, ushort wordIndex, WordLocation location)
    : base(firstCharIndex, wordIndex, location) {

    if (null == text) {
        throw new ArgumentNullException("text");
    }

    if (text.Length < 1) {
        throw new ArgumentException("Invalid text.", "text");
    }

    // Store the normalized form of the word
    string normalizedText = Tools.RemoveDiacriticsAndPunctuation(text, true);
    Text = normalizedText;
}
/// <summary>
/// Initializes a new instance of the <see cref="BasicWordInfo" /> class.
/// </summary>
/// <param name="firstCharIndex">The index of the first character of the word in the document.</param>
/// <param name="wordIndex">The index of the word in the document.</param>
/// <param name="location">The location of the word in the document.</param>
/// <remarks>Both indexes are unsigned, so no range validation is necessary.</remarks>
public BasicWordInfo(ushort firstCharIndex, ushort wordIndex, WordLocation location) {
    // No "less than zero" checks: a ushort can never be negative, so such checks would be unreachable
    FirstCharIndex = firstCharIndex;
    WordIndex = wordIndex;
    Location = location;
}
/// <summary>
/// Stores an occurrence.
/// </summary>
/// <param name="document">The document the occurrence is referred to.</param>
/// <param name="firstCharIndex">The index of the first character of the word in the document.</param>
/// <param name="wordIndex">The index of the word in the document.</param>
/// <param name="location">The location of the word.</param>
/// <remarks>The occurrence is added to the set already stored for <paramref name="document"/>;
/// if there is none, a new set is created and registered for the document.
/// The whole operation is performed while holding a lock on the occurrences collection.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="document"/> is <c>null</c>.</exception>
public void AddOccurrence(IDocument document, ushort firstCharIndex, ushort wordIndex, WordLocation location) {
    if (document == null) {
        throw new ArgumentNullException("document");
    }

    // firstCharIndex and wordIndex are unsigned: "less than zero" checks would be unreachable

    BasicWordInfo occurrence = new BasicWordInfo(firstCharIndex, wordIndex, location);

    lock (_occurrences) {
        if (_occurrences.ContainsKey(document)) {
            // Existing document
            _occurrences[document].Add(occurrence);
        }
        else {
            // New document
            SortedBasicWordInfoSet set = new SortedBasicWordInfoSet();
            set.Add(occurrence);
            _occurrences.Add(document, set);
        }
    }
}
/// <summary>
/// Initializes index data by completely emptying the index catalog and storing the specified data.
/// </summary>
/// <param name="documents">The documents.</param>
/// <param name="words">The words.</param>
/// <param name="mappings">The mappings.</param>
/// <remarks>The method <b>does not</b> check the consistency of the data passed as arguments.
/// Documents for which the BuildDocument delegate returns <c>null</c> are considered missing:
/// mappings that refer to them, or to a word ID that is not present in <paramref name="words"/>,
/// are silently skipped.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="documents"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">If <see cref="M:SetBuildDocumentDelegate"/> was not called.</exception>
public void InitializeData(DumpedDocument[] documents, DumpedWord[] words, DumpedWordMapping[] mappings) {
    if (documents == null) {
        throw new ArgumentNullException("documents");
    }
    if (words == null) {
        throw new ArgumentNullException("words");
    }
    if (mappings == null) {
        throw new ArgumentNullException("mappings");
    }

    if (_buildDocument == null) {
        throw new InvalidOperationException("InitializeData can be invoked only when the BuildDocument delegate is set.");
    }

    lock (this) {
        _catalog.Clear();
        _catalog = new Dictionary<string, Word>(words.Length);

        // Contains the IDs of documents that are missing
        List<uint> missingDocuments = new List<uint>(50);

        // 1. Prepare a dictionary with all documents for use in the last step
        Dictionary<uint, IDocument> tempDocuments = new Dictionary<uint, IDocument>(documents.Length);
        foreach (DumpedDocument doc in documents) {
            IDocument builtDoc = _buildDocument(doc);
            // Null means that the document no longer exists - silently skip it
            if (builtDoc != null) {
                tempDocuments.Add(doc.ID, builtDoc);
            }
            else {
                missingDocuments.Add(doc.ID);
            }
        }

        // An ID reported as missing must never resolve to a document, even if the (unchecked) input
        // also contained a buildable entry with the same ID: removing such IDs here lets step 3
        // decide with a single dictionary lookup instead of scanning the list for every mapping
        foreach (uint missingId in missingDocuments) {
            tempDocuments.Remove(missingId);
        }

        // 2. Load words into the catalog, keeping track of them by ID in a dictionary for the next step
        Dictionary<ulong, Word> tempWords = new Dictionary<ulong, Word>(words.Length);
        foreach (DumpedWord w in words) {
            Word word = new Word(w.ID, w.Text);
            tempWords.Add(w.ID, word);
            _catalog.Add(w.Text, word);
        }

        // 3. Add mappings and documents
        foreach (DumpedWordMapping map in mappings) {
            // Skip mappings that refer to missing or unknown documents
            IDocument mappedDocument;
            if (!tempDocuments.TryGetValue(map.DocumentID, out mappedDocument)) {
                continue;
            }

            // Gracefully skip unknown words (explicit lookup rather than catching KeyNotFoundException)
            Word mappedWord;
            if (!tempWords.TryGetValue(map.WordID, out mappedWord)) {
                continue;
            }

            mappedWord.AddOccurrence(mappedDocument, map.FirstCharIndex, map.WordIndex,
                WordLocation.GetInstance(map.Location));
        }
    }
}