/// <summary>
/// Records an occurrence of a word in the catalog, creating the catalog entry first when the word is not yet known.
/// </summary>
/// <param name="wordText">The word text; it is lower-cased with the invariant culture before being looked up.</param>
/// <param name="document">The document containing the occurrence.</param>
/// <param name="firstCharIndex">The index, in the document, of the first character of the word.</param>
/// <param name="wordIndex">The index of the word in the document.</param>
/// <param name="location">The location of the word.</param>
/// <param name="newWord">The newly-created word (carrying a placeholder ID of zero), or <c>null</c> if the word was already in the catalog.</param>
/// <param name="dumpedWord">The dumped data of the newly-created word, or <c>null</c> if the word was already in the catalog.</param>
/// <returns>The dumped word mapping describing the stored occurrence.</returns>
/// <remarks>The lookup, the optional insertion and the registration of the occurrence all run
/// while holding the lock on this instance.</remarks>
protected DumpedWordMapping StoreWord(string wordText, IDocument document, ushort firstCharIndex, ushort wordIndex,
    WordLocation location, out Word newWord, out DumpedWord dumpedWord)
{
    string normalizedText = wordText.ToLower(CultureInfo.InvariantCulture);

    lock (this)
    {
        Word entry;
        bool alreadyKnown = catalog.TryGetValue(normalizedText, out entry);

        if (alreadyKnown)
        {
            // Nothing new to report to the caller
            newWord = null;
            dumpedWord = null;
        }
        else
        {
            // The ID is a ZERO placeholder: the caller receives the instance through newWord
            // so that the real ID can be set once the IndexStorer has stored the word
            entry = new Word(0, normalizedText);
            catalog.Add(normalizedText, entry);
            newWord = entry;
            dumpedWord = new DumpedWord(entry);
        }

        entry.AddOccurrence(document, firstCharIndex, wordIndex, location);

        DumpedWordMapping mapping = new DumpedWordMapping(entry.ID, document.ID, firstCharIndex, wordIndex, location.Location);
        return mapping;
    }
}
/// <summary>
/// Checks whether a list contains a <see cref="DumpedWord" /> with the same ID and text as the specified one.
/// </summary>
/// <param name="word">The word to look for.</param>
/// <param name="list">The list to scan.</param>
/// <returns><c>true</c> if an entry with the same ID and the same text is found, <c>false</c> otherwise.</returns>
protected static bool Find(DumpedWord word, IEnumerable<DumpedWord> list)
{
    bool found = false;

    foreach (DumpedWord candidate in list)
    {
        // Both the ID and the text must match; the ID is compared first
        if (candidate.ID != word.ID)
        {
            continue;
        }
        if (candidate.Text != word.Text)
        {
            continue;
        }

        found = true;
        break;
    }

    return found;
}
/// <summary>
/// Serializes a <see cref="DumpedWord" /> through a <see cref="BinaryWriter" />.
/// </summary>
/// <param name="writer">The <see cref="BinaryWriter" /> that receives the data.</param>
/// <param name="word">The <see cref="DumpedWord" /> to serialize.</param>
/// <remarks>The ID is written first, followed by the text; the text is not checked for emptiness.</remarks>
private static void WriteDumpedWord(BinaryWriter writer, DumpedWord word)
{
    // Field order matters: ID, then Text
    writer.Write(word.ID);
    writer.Write(word.Text);
}
/// <summary>
/// Reads the whole index (documents, words and word mappings) from the three data files.
/// </summary>
/// <param name="documents">When the method returns, the dumped documents read from the documents file.</param>
/// <param name="words">When the method returns, the dumped words read from the words file.</param>
/// <param name="mappings">When the method returns, the dumped word mappings read from the mappings file.</param>
/// <remarks>While loading, the highest document ID and the highest word ID are tracked and
/// <c>firstFreeDocumentId</c> and <c>firstFreeWordId</c> are set to the respective highest ID plus one.
/// Each file is opened read-only with <see cref="FileShare.None" />.</remarks>
protected override void LoadIndexInternal(out DumpedDocument[] documents, out DumpedWord[] words, out DumpedWordMapping[] mappings)
{
    // Phase 1: documents
    using (FileStream documentsStream = new FileStream(documentsFile, FileMode.Open, FileAccess.Read, FileShare.None))
    {
        int documentCount = ReadCount(documentsStream);
        BinaryReader documentsReader = new BinaryReader(documentsStream, Encoding.UTF8);
        documents = new DumpedDocument[documentCount];

        uint highestDocumentId = 0;
        int index = 0;
        while (index < documentCount)
        {
            documents[index] = ReadDumpedDocument(documentsReader);
            if (documents[index].ID > highestDocumentId)
            {
                highestDocumentId = documents[index].ID;
            }
            index++;
        }

        firstFreeDocumentId = highestDocumentId + 1;
    }

    // Phase 2: words
    using (FileStream wordsStream = new FileStream(wordsFile, FileMode.Open, FileAccess.Read, FileShare.None))
    {
        int wordCount = ReadCount(wordsStream);
        BinaryReader wordsReader = new BinaryReader(wordsStream, Encoding.UTF8);
        words = new DumpedWord[wordCount];

        uint highestWordId = 0;
        int index = 0;
        while (index < wordCount)
        {
            words[index] = ReadDumpedWord(wordsReader);
            if (words[index].ID > highestWordId)
            {
                highestWordId = words[index].ID;
            }
            index++;
        }

        firstFreeWordId = highestWordId + 1;
    }

    // Phase 3: mappings (no ID bookkeeping needed)
    using (FileStream mappingsStream = new FileStream(mappingsFile, FileMode.Open, FileAccess.Read, FileShare.None))
    {
        int mappingCount = ReadCount(mappingsStream);
        BinaryReader mappingsReader = new BinaryReader(mappingsStream, Encoding.UTF8);
        mappings = new DumpedWordMapping[mappingCount];

        int index = 0;
        while (index < mappingCount)
        {
            mappings[index] = ReadDumpedWordMapping(mappingsReader);
            index++;
        }
    }
}
/// <summary>
/// Registers a word occurrence in the catalog, adding the word itself when it is seen for the first time.
/// </summary>
/// <param name="wordText">The text of the word; it is converted to lower case (invariant culture) before use.</param>
/// <param name="document">The document in which the word occurs.</param>
/// <param name="firstCharIndex">The position, in the document, of the first character of the word.</param>
/// <param name="wordIndex">The position of the word in the document.</param>
/// <param name="location">The location of the word.</param>
/// <param name="newWord">The word created by this call (its ID is a zero placeholder), or <c>null</c> when the word already existed.</param>
/// <param name="dumpedWord">The dumped data for the word created by this call, or <c>null</c> when the word already existed.</param>
/// <returns>The dumped word mapping for the occurrence that was stored.</returns>
/// <remarks>All catalog access performed by this method happens inside a lock on this instance.</remarks>
protected DumpedWordMapping StoreWord(string wordText, IDocument document, ushort firstCharIndex, ushort wordIndex, WordLocation location, out Word newWord, out DumpedWord dumpedWord) {
    wordText = wordText.ToLower(CultureInfo.InvariantCulture);

    lock(this) {
        Word target;

        if(catalog.TryGetValue(wordText, out target)) {
            // Known word: no new data to hand back
            newWord = null;
            dumpedWord = null;
        }
        else {
            // Unknown word: create it with ID ZERO; the caller gets the reference
            // and is expected to fix the ID after the IndexStorer has stored the word
            target = new Word(0, wordText);
            catalog.Add(wordText, target);
            dumpedWord = new DumpedWord(target);
            newWord = target;
        }

        target.AddOccurrence(document, firstCharIndex, wordIndex, location);

        return new DumpedWordMapping(target.ID, document.ID, firstCharIndex, wordIndex, location.Location);
    }
}
/// <summary>
/// Initializes index data by completely emptying the index catalog and storing the specified data.
/// </summary>
/// <param name="documents">The documents.</param>
/// <param name="words">The words.</param>
/// <param name="mappings">The mappings.</param>
/// <remarks>The method <b>does not</b> check the consistency of the data passed as arguments.
/// Documents for which the BuildDocument delegate returns <c>null</c> are treated as no longer existing:
/// they are skipped, together with every mapping that refers to them. Mappings that refer to
/// an unknown word ID or an unknown document ID are skipped as well.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="documents"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">If <see cref="M:SetBuildDocumentDelegate"/> was not called.</exception>
public void InitializeData(DumpedDocument[] documents, DumpedWord[] words, DumpedWordMapping[] mappings) {
    if(documents == null) throw new ArgumentNullException("documents");
    if(words == null) throw new ArgumentNullException("words");
    if(mappings == null) throw new ArgumentNullException("mappings");

    if(buildDocument == null) throw new InvalidOperationException("InitializeData can be invoked only when the BuildDocument delegate is set");

    lock(this) {
        catalog.Clear();
        catalog = new Dictionary<string, Word>(words.Length);

        // 1. Prepare a dictionary with all documents that still exist, for use in the last step
        Dictionary<uint, IDocument> tempDocuments = new Dictionary<uint, IDocument>(documents.Length);
        foreach(DumpedDocument doc in documents) {
            IDocument builtDoc = buildDocument(doc);
            // Null means that the document no longer exists - silently skip it;
            // its mappings are dropped in step 3 because the ID is not in tempDocuments
            if(builtDoc != null) {
                tempDocuments.Add(doc.ID, builtDoc);
            }
        }

        // 2. Load words into the catalog, keeping track of them by ID in a dictionary for the next step
        Dictionary<ulong, Word> tempWords = new Dictionary<ulong, Word>(words.Length);
        foreach(DumpedWord w in words) {
            Word loadedWord = new Word(w.ID, w.Text);
            tempWords.Add(w.ID, loadedWord);
            catalog.Add(w.Text, loadedWord);
        }

        // 3. Add mappings and documents
        foreach(DumpedWordMapping map in mappings) {
            Word mappedWord;
            IDocument mappedDocument;

            // Gracefully skip mappings that refer to unknown words or to missing/unknown documents.
            // Explicit lookups are used instead of catching KeyNotFoundException, so that the skip
            // costs a hash lookup rather than an exception and unrelated failures are not hidden
            if(!tempWords.TryGetValue(map.WordID, out mappedWord)) continue;
            if(!tempDocuments.TryGetValue(map.DocumentID, out mappedDocument)) continue;

            mappedWord.AddOccurrence(mappedDocument, map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location));
        }
    }
}
/// <summary>
/// Loads the index from the data store the first time.
/// </summary>
/// <param name="documents">When the method returns, the dumped documents.</param>
/// <param name="words">When the method returns, the dumped words.</param>
/// <param name="mappings">When the method returns, the dumped word mappings.</param>
/// <remarks>This member is <c>abstract</c>: the loading logic is supplied by each concrete subclass.
/// Because all three parameters are <c>out</c>, an implementation must assign every array
/// before returning normally.</remarks>
protected abstract void LoadIndexInternal(out DumpedDocument[] documents, out DumpedWord[] words, out DumpedWordMapping[] mappings);
/// <summary>
/// Tells whether a sequence of <see cref="DumpedWord" /> contains an entry matching the specified word.
/// </summary>
/// <param name="word">The word to search for.</param>
/// <param name="list">The sequence to search in.</param>
/// <returns><c>true</c> if an entry has both the same ID and the same text as <paramref name="word"/>, <c>false</c> otherwise.</returns>
protected static bool Find(DumpedWord word, IEnumerable<DumpedWord> list) {
    using(IEnumerator<DumpedWord> cursor = list.GetEnumerator()) {
        while(cursor.MoveNext()) {
            DumpedWord current = cursor.Current;
            // A match requires the same ID and the same text
            bool sameWord = current.ID == word.ID && current.Text == word.Text;
            if(sameWord) return true;
        }
    }
    return false;
}
/// <summary>
/// Stores a document in the index.
/// </summary>
/// <param name="document">The document.</param>
/// <param name="keywords">The document keywords, if any, an empty array or <c>null</c> otherwise.</param>
/// <param name="content">The content of the document.</param>
/// <param name="state">A state object that is passed to the IndexStorer SaveData/DeleteData function.</param>
/// <returns>The number of indexed words (including duplicates): the content and title words that remain after
/// stop-word removal, plus the keywords that remain after cleanup. Zero is returned when the index-change
/// handler yields no result or no document ID.</returns>
/// <remarks>If the specified document was already in the index, all the old occurrences
/// are deleted from the index before the new ones are stored. New words are first given temporary IDs,
/// counting down from <see cref="uint.MaxValue"/>, and receive their final IDs from the result of the index-change notification.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="document"/> or <paramref name="content"/> are <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">If the index-change result carries no ID for one of the new words.</exception>
public int StoreDocument(IDocument document, string[] keywords, string content, object state)
{
    if (document == null)
    {
        throw new ArgumentNullException("document");
    }
    if (keywords == null)
    {
        keywords = new string[0];
    }
    if (content == null)
    {
        throw new ArgumentNullException("content");
    }

    // Drop any previous version of the document before re-indexing it
    lock (this)
    {
        DumpedChange removeChange = RemoveDocumentInternal(document);
        if (removeChange != null)
        {
            OnIndexChange(document, IndexChangeType.DocumentRemoved, removeChange, state);
        }
    }

    keywords = Tools.CleanupKeywords(keywords);

    // When the IndexStorer handles the IndexChanged event and a document is added, the storer generates a new ID and returns it
    // via the event handler, then the in-memory index is updated (the document instance is shared across all words) - the final ID
    // is generated by the actual IndexStorer implementation (SaveData properly populates the Result field in the args)

    List<DumpedWord> dw = new List<DumpedWord>(content.Length / 5);
    List<DumpedWordMapping> dm = new List<DumpedWordMapping>(content.Length / 5);
    List<Word> newWords = new List<Word>(50);

    int count = 0;

    // Temporary IDs for new words, handed out downwards from the top of the range
    uint sequentialWordId = uint.MaxValue;

    // Store content words
    WordInfo[] words = document.Tokenize(content);
    words = Tools.RemoveStopWords(words, stopWords);
    foreach (WordInfo info in words)
    {
        StoreWordAndTrack(info.Text, document, info.FirstCharIndex, info.WordIndex, WordLocation.Content, dw, dm, newWords, ref sequentialWordId);
    }
    count += words.Length;

    // Store title words
    words = document.Tokenize(document.Title);
    words = Tools.RemoveStopWords(words, stopWords);
    foreach (WordInfo info in words)
    {
        StoreWordAndTrack(info.Text, document, info.FirstCharIndex, info.WordIndex, WordLocation.Title, dw, dm, newWords, ref sequentialWordId);
    }
    count += words.Length;

    // Store keywords; tempCount is the running character offset, computed as if
    // the keywords were laid out one after another with a single separator character
    ushort tempCount = 0;
    for (ushort i = 0; i < (ushort)keywords.Length; i++)
    {
        StoreWordAndTrack(keywords[i], document, tempCount, i, WordLocation.Keywords, dw, dm, newWords, ref sequentialWordId);
        tempCount += (ushort)(1 + keywords[i].Length);
    }
    count += keywords.Length;

    IndexStorerResult result = OnIndexChange(document, IndexChangeType.DocumentAdded,
        new DumpedChange(new DumpedDocument(document), dw, dm), state);

    if (result == null || !result.DocumentID.HasValue)
    {
        // HACK: result is null -> index is corrupted, silently return
        return 0;
    }

    // Update document ID
    document.ID = result.DocumentID.Value;

    // Update word IDs in newWords
    if (newWords.Count > 0)
    {
        // Index the returned IDs by text once instead of rescanning them for every new word;
        // the first entry for a given text wins, as with a sequential search
        Dictionary<string, WordId> idsByText = new Dictionary<string, WordId>(newWords.Count);
        foreach (WordId id in result.WordIDs)
        {
            if (id.Text != null && !idsByText.ContainsKey(id.Text))
            {
                idsByText.Add(id.Text, id);
            }
        }

        foreach (Word word in newWords)
        {
            WordId assigned;
            if (!idsByText.TryGetValue(word.Text, out assigned))
            {
                throw new InvalidOperationException("No ID for new word");
            }
            word.ID = assigned.ID;
        }
    }

    return count;
}

/// <summary>
/// Stores one word occurrence through <c>StoreWord</c>, records its mapping and, when the word is new,
/// tags the word, its dumped data and the mapping with the next temporary ID.
/// </summary>
/// <param name="wordText">The text of the word.</param>
/// <param name="document">The document the word occurs in.</param>
/// <param name="firstCharIndex">The index of the first character of the word.</param>
/// <param name="wordIndex">The index of the word.</param>
/// <param name="location">The location of the word.</param>
/// <param name="dumpedWords">The list collecting the dumped data of new words.</param>
/// <param name="dumpedMappings">The list collecting the mappings of all stored occurrences.</param>
/// <param name="newWords">The list collecting the new in-memory words whose final ID is still to be set.</param>
/// <param name="nextTemporaryWordId">The next temporary word ID; decremented each time a new word consumes it.</param>
private void StoreWordAndTrack(string wordText, IDocument document, ushort firstCharIndex, ushort wordIndex, WordLocation location,
    List<DumpedWord> dumpedWords, List<DumpedWordMapping> dumpedMappings, List<Word> newWords, ref uint nextTemporaryWordId)
{
    Word createdWord;
    DumpedWord createdDumpedWord;

    dumpedMappings.Add(StoreWord(wordText, document, firstCharIndex, wordIndex, location, out createdWord, out createdDumpedWord));

    if (createdDumpedWord != null && createdWord != null)
    {
        // The mapping just added, the dumped word and the in-memory word must all carry the same temporary ID
        dumpedMappings[dumpedMappings.Count - 1].WordID = nextTemporaryWordId;
        createdDumpedWord.ID = nextTemporaryWordId;
        dumpedWords.Add(createdDumpedWord);
        createdWord.ID = nextTemporaryWordId;
        newWords.Add(createdWord);
        nextTemporaryWordId--;
    }
}