/// <summary>
/// Checks whether a sequence contains an entry equivalent to a given <see cref="DumpedWordMapping" />.
/// </summary>
/// <param name="mapping">The mapping to look for.</param>
/// <param name="list">The sequence of mappings to scan.</param>
/// <returns><c>true</c> if an entry with the same word ID, document ID, first character index,
/// word index and location is found, <c>false</c> otherwise.</returns>
protected static bool Find(DumpedWordMapping mapping, IEnumerable<DumpedWordMapping> list)
{
    bool found = false;

    foreach (DumpedWordMapping candidate in list)
    {
        // Reject the candidate as soon as one field differs (same order as the field list above)
        if (candidate.WordID != mapping.WordID) continue;
        if (candidate.DocumentID != mapping.DocumentID) continue;
        if (candidate.FirstCharIndex != mapping.FirstCharIndex) continue;
        if (candidate.WordIndex != mapping.WordIndex) continue;
        if (candidate.Location != mapping.Location) continue;

        // All five fields match: stop scanning
        found = true;
        break;
    }

    return found;
}
/// <summary>
/// Writes a <see cref="DumpedWordMapping" /> to a <see cref="BinaryWriter" />.
/// </summary>
/// <param name="writer">The <see cref="BinaryWriter" /> that receives the serialized fields.</param>
/// <param name="mapping">The <see cref="DumpedWordMapping" /> whose fields are written.</param>
/// <remarks>The fields are emitted in a fixed order: word ID, document ID, first character index,
/// word index, location. The width of each field is determined by the <c>Write</c> overload
/// selected for the corresponding property type.</remarks>
private static void WriteDumpedWordMapping(BinaryWriter writer, DumpedWordMapping mapping) {
    // NOTE(review): the order of these writes defines the on-disk record layout; presumably the
    // matching read routine consumes the fields in the same order - confirm before reordering.
    writer.Write(mapping.WordID);
    writer.Write(mapping.DocumentID);
    writer.Write(mapping.FirstCharIndex);
    writer.Write(mapping.WordIndex);
    writer.Write(mapping.Location);
}
/// <summary>
/// Performs the initial load of the index by reading the documents, words and mappings files.
/// </summary>
/// <param name="documents">Receives the dumped documents read from the documents file.</param>
/// <param name="words">Receives the dumped words read from the words file.</param>
/// <param name="mappings">Receives the dumped word mappings read from the mappings file.</param>
/// <remarks>As a side effect, the first free document ID and the first free word ID are set to
/// one more than the highest ID found in the respective file (1 when the file holds no items).</remarks>
protected override void LoadIndexInternal(out DumpedDocument[] documents, out DumpedWord[] words, out DumpedWordMapping[] mappings) {
    // Step 1: documents, remembering the highest ID encountered
    using(FileStream documentsStream = new FileStream(documentsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
        int documentCount = ReadCount(documentsStream);
        BinaryReader documentsReader = new BinaryReader(documentsStream, Encoding.UTF8);
        DumpedDocument[] loadedDocuments = new DumpedDocument[documentCount];
        uint highestDocumentId = 0;

        int index = 0;
        while(index < documentCount) {
            loadedDocuments[index] = ReadDumpedDocument(documentsReader);
            if(loadedDocuments[index].ID > highestDocumentId) {
                highestDocumentId = loadedDocuments[index].ID;
            }
            index++;
        }

        documents = loadedDocuments;
        firstFreeDocumentId = highestDocumentId + 1;
    }

    // Step 2: words, remembering the highest ID encountered
    using(FileStream wordsStream = new FileStream(wordsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
        int wordCount = ReadCount(wordsStream);
        BinaryReader wordsReader = new BinaryReader(wordsStream, Encoding.UTF8);
        DumpedWord[] loadedWords = new DumpedWord[wordCount];
        uint highestWordId = 0;

        int index = 0;
        while(index < wordCount) {
            loadedWords[index] = ReadDumpedWord(wordsReader);
            if(loadedWords[index].ID > highestWordId) {
                highestWordId = loadedWords[index].ID;
            }
            index++;
        }

        words = loadedWords;
        firstFreeWordId = highestWordId + 1;
    }

    // Step 3: mappings, which need no ID bookkeeping
    using(FileStream mappingsStream = new FileStream(mappingsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
        int mappingCount = ReadCount(mappingsStream);
        BinaryReader mappingsReader = new BinaryReader(mappingsStream, Encoding.UTF8);
        DumpedWordMapping[] loadedMappings = new DumpedWordMapping[mappingCount];

        int index = 0;
        while(index < mappingCount) {
            loadedMappings[index] = ReadDumpedWordMapping(mappingsReader);
            index++;
        }

        mappings = loadedMappings;
    }
}
/// <summary>
/// Initializes index data by completely emptying the index catalog and storing the specified data.
/// </summary>
/// <param name="documents">The documents.</param>
/// <param name="words">The words.</param>
/// <param name="mappings">The mappings.</param>
/// <remarks>The method <b>does not</b> check the consistency of the data passed as arguments.
/// Documents for which the BuildDocument delegate returns <c>null</c> are treated as missing.
/// Mappings that refer to a missing document, to an unknown document ID or to an unknown word ID
/// are silently skipped.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="documents"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">If <see cref="M:SetBuildDocumentDelegate"/> was not called.</exception>
/// <exception cref="ArgumentException">If two built documents share an ID, or two words share an ID or a text
/// (raised by the underlying dictionaries).</exception>
public void InitializeData(DumpedDocument[] documents, DumpedWord[] words, DumpedWordMapping[] mappings) {
    if(documents == null) throw new ArgumentNullException("documents");
    if(words == null) throw new ArgumentNullException("words");
    if(mappings == null) throw new ArgumentNullException("mappings");

    if(buildDocument == null) throw new InvalidOperationException("InitializeData can be invoked only when the BuildDocument delegate is set");

    // The lock target is kept as-is so that this method stays mutually exclusive
    // with any other member that synchronizes on the same instance
    lock(this) {
        catalog.Clear();
        catalog = new Dictionary<string, Word>(words.Length);

        // IDs of documents that are missing; a set gives constant-time lookups
        // when it is probed once per mapping in step 3
        HashSet<uint> missingDocuments = new HashSet<uint>();

        // 1. Prepare a dictionary with all documents for use in the last step
        Dictionary<uint, IDocument> tempDocuments = new Dictionary<uint, IDocument>(documents.Length);
        foreach(DumpedDocument doc in documents) {
            IDocument builtDoc = buildDocument(doc);
            // Null means that the document no longer exists - silently skip it
            if(builtDoc != null) {
                tempDocuments.Add(doc.ID, builtDoc);
            }
            else {
                missingDocuments.Add(doc.ID);
            }
        }

        // 2. Load words into the catalog, keeping track of them by ID in a dictionary for the next step
        Dictionary<ulong, Word> tempWords = new Dictionary<ulong, Word>(words.Length);
        foreach(DumpedWord w in words) {
            Word word = new Word(w.ID, w.Text);
            tempWords.Add(w.ID, word);
            catalog.Add(w.Text, word);
        }

        // 3. Add mappings and documents
        foreach(DumpedWordMapping map in mappings) {
            // Skip mappings that refer to missing documents
            if(missingDocuments.Contains(map.DocumentID)) continue;

            // Gracefully skip unknown words and unknown documents. Explicit lookups replace the
            // former catch of KeyNotFoundException, so failures raised while adding the occurrence
            // itself are no longer hidden.
            Word mappedWord;
            IDocument mappedDocument;
            if(!tempWords.TryGetValue(map.WordID, out mappedWord) ||
                !tempDocuments.TryGetValue(map.DocumentID, out mappedDocument)) {
                continue;
            }

            mappedWord.AddOccurrence(mappedDocument, map.FirstCharIndex, map.WordIndex,
                WordLocation.GetInstance(map.Location));
        }
    }
}
/// <summary>
/// Loads the index from the data store the first time.
/// </summary>
/// <param name="documents">Receives the dumped documents; must be assigned by the implementation.</param>
/// <param name="words">Receives the dumped words; must be assigned by the implementation.</param>
/// <param name="mappings">Receives the dumped word mappings; must be assigned by the implementation.</param>
/// <remarks>Abstract: each derived class supplies the loading logic for its own data store.</remarks>
protected abstract void LoadIndexInternal(out DumpedDocument[] documents, out DumpedWord[] words, out DumpedWordMapping[] mappings);
/// <summary>
/// Tells whether a list holds an entry that matches a given <see cref="DumpedWordMapping" />.
/// </summary>
/// <param name="mapping">The mapping to search for.</param>
/// <param name="list">The list to search in.</param>
/// <returns><c>true</c> if some entry has the same word ID, document ID, first character index,
/// word index and location as <paramref name="mapping"/>, <c>false</c> otherwise.</returns>
protected static bool Find(DumpedWordMapping mapping, IEnumerable<DumpedWordMapping> list) {
    foreach(DumpedWordMapping entry in list) {
        // Word and document must match first
        bool sameWordAndDocument = entry.WordID == mapping.WordID
            && entry.DocumentID == mapping.DocumentID;

        // The positional fields are only compared when the IDs already match
        if(sameWordAndDocument
            && entry.FirstCharIndex == mapping.FirstCharIndex
            && entry.WordIndex == mapping.WordIndex
            && entry.Location == mapping.Location) {
            return true;
        }
    }

    return false;
}