/// <summary>
/// Creates a <see cref="T:FileDocument" /> from previously dumped document data.
/// </summary>
/// <param name="doc">The dumped document. The first '|'-separated field of its name is stored as the provider.</param>
public FileDocument(DumpedDocument doc) {
    // The document name is pipe-delimited; only its first segment is needed here
    string[] nameParts = doc.Name.Split('|');

    id = doc.ID;
    name = doc.Name;
    title = doc.Title;
    dateTime = doc.DateTime;

    provider = nameParts[0];
}
/// <summary>
/// Creates a <see cref="T:PageAttachmentDocument" /> from previously dumped document data.
/// </summary>
/// <param name="doc">The dumped document. Its name is split on '|': the first field is stored as the provider,
/// the second is passed to <c>Pages.FindPage</c> to resolve the page.</param>
public PageAttachmentDocument(DumpedDocument doc) {
    // The document name is pipe-delimited: segment 0 is the provider, segment 1 identifies the page
    string[] nameParts = doc.Name.Split('|');

    id = doc.ID;
    name = doc.Name;
    title = doc.Title;
    dateTime = doc.DateTime;

    provider = nameParts[0];
    page = Pages.FindPage(nameParts[1]);
}
/// <summary>
/// Creates a <see cref="T:PageDocument" /> bound to a page, copying its metadata from dumped document data.
/// </summary>
/// <param name="pageInfo">The page (stored as-is, not validated).</param>
/// <param name="dumpedDocument">The dumped document data supplying ID, name, type tag, title and date/time.</param>
/// <param name="tokenizer">The tokenizer.</param>
/// <exception cref="ArgumentNullException">If <paramref name="dumpedDocument"/> or <paramref name="tokenizer"/> are <c>null</c>.</exception>
public PageDocument(PageInfo pageInfo, DumpedDocument dumpedDocument, Tokenizer tokenizer) {
    if(dumpedDocument == null) {
        throw new ArgumentNullException("dumpedDocument");
    }
    if(tokenizer == null) {
        throw new ArgumentNullException("tokenizer");
    }

    this.pageInfo = pageInfo;
    this.tokenizer = tokenizer;

    // Copy the metadata out of the dumped document
    id = dumpedDocument.ID;
    name = dumpedDocument.Name;
    typeTag = dumpedDocument.TypeTag;
    title = dumpedDocument.Title;
    dateTime = dumpedDocument.DateTime;
}
/// <summary>
/// Creates a new <see cref="DumpedChange" /> wrapping a dumped document with its words and mappings.
/// </summary>
/// <param name="document">The dumped document data.</param>
/// <param name="words">The list of dumped words data.</param>
/// <param name="mappings">The list of dumped mappings data (may be empty).</param>
/// <exception cref="ArgumentNullException">If <paramref name="document"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
public DumpedChange(DumpedDocument document, List<DumpedWord> words, List<DumpedWordMapping> mappings) {
    if(document == null) {
        throw new ArgumentNullException("document");
    }
    if(words == null) {
        throw new ArgumentNullException("words");
    }
    if(mappings == null) {
        throw new ArgumentNullException("mappings");
    }

    // An empty mappings list is deliberately accepted: a document
    // without any indexable content produces no mappings at all.

    this.document = document;
    this.words = words;
    this.mappings = mappings;
}
/// <summary>
/// Builds a <see cref="DumpedChange" /> from a dumped document and the words and mappings that belong to it.
/// </summary>
/// <param name="document">The dumped document data.</param>
/// <param name="words">The list of dumped words data.</param>
/// <param name="mappings">The list of dumped mappings data; an empty list is allowed.</param>
/// <exception cref="ArgumentNullException">If <paramref name="document"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
public DumpedChange(DumpedDocument document, List<DumpedWord> words, List<DumpedWordMapping> mappings) {
    if(document == null) throw new ArgumentNullException("document");
    if(words == null) throw new ArgumentNullException("words");
    if(mappings == null) throw new ArgumentNullException("mappings");

    // No emptiness check on mappings: documents with no indexable
    // content legitimately come with zero mappings.
    this.mappings = mappings;
    this.words = words;
    this.document = document;
}
/// <summary>
/// Indexes a message.
/// </summary>
/// <param name="page">The page.</param>
/// <param name="id">The message ID.</param>
/// <param name="subject">The subject; a leading "RE:" (any casing) is stripped before indexing.</param>
/// <param name="dateTime">The date/time.</param>
/// <param name="body">The body.</param>
/// <returns>The number of indexed words, including duplicates, or <c>0</c> if indexing failed
/// (the failure is logged and not rethrown).</returns>
private int IndexMessage(PageInfo page, int id, string subject, DateTime dateTime, string body) {
    lock(this) {
        try {
            // Trim "RE:" to avoid polluting the search engine index.
            // The prefix test is ordinal and case-insensitive so that it does not depend on
            // the current thread culture and does not allocate a lowered copy of the subject.
            if(subject.Length > 3 && subject.StartsWith("re:", StringComparison.OrdinalIgnoreCase)) {
                subject = subject.Substring(3).Trim();
            }

            string documentName = MessageDocument.GetDocumentName(page, id);

            DumpedDocument ddoc = new DumpedDocument(0, documentName, host.PrepareTitleForIndexing(null, subject),
                MessageDocument.StandardTypeTag, dateTime);

            // Store the document
            // The content should always be prepared using IHost.PrepareForSearchEngineIndexing()
            int count = index.StoreDocument(new MessageDocument(page, id, ddoc, TokenizeContent), null,
                host.PrepareContentForIndexing(null, body), null);

            // A non-empty body that yields no indexed words is suspicious: log a warning
            if(count == 0 && body.Length > 0) {
                host.LogEntry("Indexed 0 words for message " + page.FullName + ":" + id.ToString() + ": possible index corruption. Please report this error to the developers",
                    LogEntryType.Warning, null, this);
            }

            return count;
        }
        catch(Exception ex) {
            // Best-effort indexing: log the failure and skip the message instead of propagating
            host.LogEntry("Message indexing error for " + page.FullName + ":" + id.ToString() + " (skipping message): " + ex.ToString(),
                LogEntryType.Error, null, this);
            return 0;
        }
    }
}
/// <summary>
/// Serializes a <see cref="DumpedDocument" /> to a <see cref="BinaryWriter" />.
/// </summary>
/// <remarks>Fields are written in this order: ID, Name, Title, TypeTag, DateTime (in its binary form).</remarks>
/// <param name="writer">The <see cref="BinaryWriter" /> to write to.</param>
/// <param name="document">The <see cref="DumpedDocument" /> to write.</param>
private static void WriteDumpedDocument(BinaryWriter writer, DumpedDocument document) {
    writer.Write(document.ID);
    writer.Write(document.Name);
    writer.Write(document.Title);
    writer.Write(document.TypeTag);

    // The date/time is persisted as the 64-bit value produced by DateTime.ToBinary()
    long binaryDateTime = document.DateTime.ToBinary();
    writer.Write(binaryDateTime);
}
/// <summary>
/// Checks whether two <see cref="DumpedDocument" />s identify the same document.
/// </summary>
/// <remarks>Only ID, Name and TypeTag are compared; Title and DateTime are ignored.</remarks>
/// <param name="d1">The first document.</param>
/// <param name="d2">The second document.</param>
/// <returns><c>true</c> if the documents are equal, <c>false</c> otherwise.</returns>
private static bool EqualDumpedDocument(DumpedDocument d1, DumpedDocument d2) {
    // Bail out on the first differing identity field
    if(d1.ID != d2.ID) {
        return false;
    }
    if(d1.Name != d2.Name) {
        return false;
    }
    return d1.TypeTag == d2.TypeTag;
}
/// <summary>
/// Loads the index from the data store the first time.
/// </summary>
/// <remarks>
/// Reads the documents, words and mappings files in that order. While reading, the highest document ID
/// and word ID are tracked, and <c>firstFreeDocumentId</c> / <c>firstFreeWordId</c> are set to one past them.
/// </remarks>
/// <param name="documents">The dumped documents.</param>
/// <param name="words">The dumped words.</param>
/// <param name="mappings">The dumped word mappings.</param>
protected override void LoadIndexInternal(out DumpedDocument[] documents, out DumpedWord[] words, out DumpedWordMapping[] mappings) {
    // 1. Load Documents
    using(FileStream documentsStream = new FileStream(documentsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
        int documentCount = ReadCount(documentsStream);
        BinaryReader documentsReader = new BinaryReader(documentsStream, Encoding.UTF8);

        uint highestDocumentId = 0;
        documents = new DumpedDocument[documentCount];
        for(int index = 0; index < documentCount; index++) {
            DumpedDocument current = ReadDumpedDocument(documentsReader);
            documents[index] = current;
            if(current.ID > highestDocumentId) {
                highestDocumentId = current.ID;
            }
        }

        firstFreeDocumentId = highestDocumentId + 1;
    }

    // 2. Load Words
    using(FileStream wordsStream = new FileStream(wordsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
        int wordCount = ReadCount(wordsStream);
        BinaryReader wordsReader = new BinaryReader(wordsStream, Encoding.UTF8);

        uint highestWordId = 0;
        words = new DumpedWord[wordCount];
        for(int index = 0; index < wordCount; index++) {
            DumpedWord current = ReadDumpedWord(wordsReader);
            words[index] = current;
            if(current.ID > highestWordId) {
                highestWordId = current.ID;
            }
        }

        firstFreeWordId = highestWordId + 1;
    }

    // 3. Load Mappings (no ID bookkeeping is needed for these)
    using(FileStream mappingsStream = new FileStream(mappingsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
        int mappingCount = ReadCount(mappingsStream);
        BinaryReader mappingsReader = new BinaryReader(mappingsStream, Encoding.UTF8);

        mappings = new DumpedWordMapping[mappingCount];
        int index = 0;
        while(index < mappingCount) {
            mappings[index] = ReadDumpedWordMapping(mappingsReader);
            index++;
        }
    }
}
/// <summary>
/// Removes a page from the search engine index.
/// </summary>
/// <param name="content">The content of the page to remove.</param>
private void UnindexPage(PageContent content) {
    lock(this) {
        // Rebuild the descriptor of the page's document so the index can locate it
        string pageDocumentName = PageDocument.GetDocumentName(content.PageInfo);
        string preparedTitle = host.PrepareTitleForIndexing(content.PageInfo, content.Title);

        DumpedDocument descriptor = new DumpedDocument(0, pageDocumentName, preparedTitle,
            PageDocument.StandardTypeTag, content.LastModified);

        PageDocument target = new PageDocument(content.PageInfo, descriptor, TokenizeContent);
        index.RemoveDocument(target, null);
    }
}
/// <summary>
/// Removes a message from the search engine index.
/// </summary>
/// <param name="page">The page.</param>
/// <param name="id">The message ID.</param>
/// <param name="subject">The subject; a leading "RE:" (any casing) is stripped before use.</param>
/// <param name="dateTime">The date/time. Not used by this method.</param>
/// <param name="body">The body. Not used by this method.</param>
private void UnindexMessage(PageInfo page, int id, string subject, DateTime dateTime, string body) {
    lock(this) {
        // Trim "RE:" to avoid polluting the search engine index.
        // The prefix test is ordinal and case-insensitive so that it does not depend on
        // the current thread culture and does not allocate a lowered copy of the subject.
        if(subject.Length > 3 && subject.StartsWith("re:", StringComparison.OrdinalIgnoreCase)) {
            subject = subject.Substring(3).Trim();
        }

        string documentName = MessageDocument.GetDocumentName(page, id);

        // The descriptor is stamped with the current time rather than the dateTime argument
        DumpedDocument ddoc = new DumpedDocument(0, documentName, host.PrepareTitleForIndexing(null, subject),
            MessageDocument.StandardTypeTag, DateTime.Now);

        index.RemoveDocument(new MessageDocument(page, id, ddoc, TokenizeContent), null);
    }
}
/// <summary>
/// Indexes a page.
/// </summary>
/// <param name="content">The content of the page.</param>
/// <returns>The number of indexed words, including duplicates, or <c>0</c> when indexing fails
/// (the error is logged and the page is skipped).</returns>
private int IndexPage(PageContent content) {
    lock(this) {
        try {
            string pageDocumentName = PageDocument.GetDocumentName(content.PageInfo);
            string preparedTitle = host.PrepareTitleForIndexing(content.PageInfo, content.Title);

            DumpedDocument descriptor = new DumpedDocument(0, pageDocumentName, preparedTitle,
                PageDocument.StandardTypeTag, content.LastModified);

            // Store the document
            // The content should always be prepared using IHost.PrepareForSearchEngineIndexing()
            PageDocument pageDocument = new PageDocument(content.PageInfo, descriptor, TokenizeContent);
            string[] keywords = content.Keywords;
            string preparedContent = host.PrepareContentForIndexing(content.PageInfo, content.Content);

            int indexedWords = index.StoreDocument(pageDocument, keywords, preparedContent, null);

            // Non-empty content that produced no words is reported as a possible corruption
            bool nothingIndexed = indexedWords == 0;
            if(nothingIndexed && content.Content.Length > 0) {
                host.LogEntry("Indexed 0 words for page " + content.PageInfo.FullName + ": possible index corruption. Please report this error to the developers",
                    LogEntryType.Warning, null, this);
            }

            return indexedWords;
        }
        catch(Exception ex) {
            host.LogEntry("Page indexing error for " + content.PageInfo.FullName + " (skipping page): " + ex.ToString(),
                LogEntryType.Error, null, this);
            return 0;
        }
    }
}
/// <summary>
/// Loads the index from the data store the first time.
/// </summary>
/// <remarks>Abstract: each concrete subclass supplies the loading logic and must assign all three output arrays.</remarks>
/// <param name="documents">On return, the dumped documents.</param>
/// <param name="words">On return, the dumped words.</param>
/// <param name="mappings">On return, the dumped word mappings.</param>
protected abstract void LoadIndexInternal(out DumpedDocument[] documents, out DumpedWord[] words, out DumpedWordMapping[] mappings);
/// <summary>
/// Handles the construction of an <see cref="T:IDocument" /> for the search engine.
/// </summary>
/// <param name="dumpedDocument">The input dumped document.</param>
/// <returns>A <see cref="T:PageDocument" /> or <see cref="T:MessageDocument" /> depending on the type tag,
/// or <c>null</c> when the type tag is not recognized or the referenced page cannot be found.</returns>
private IDocument BuildDocumentHandler(DumpedDocument dumpedDocument) {
    // Page documents: the page name is derived from the document name
    if(dumpedDocument.TypeTag == PageDocument.StandardTypeTag) {
        string fullName = PageDocument.GetPageName(dumpedDocument.Name);

        PageInfo owner = FindPage(NameTools.GetNamespace(fullName), NameTools.GetLocalName(fullName), GetAllPages());
        if(owner == null) {
            return null;
        }
        return new PageDocument(owner, dumpedDocument, TokenizeContent);
    }

    // Message documents: the document name carries both the page name and the message ID
    if(dumpedDocument.TypeTag == MessageDocument.StandardTypeTag) {
        string ownerFullName;
        int messageId;
        MessageDocument.GetMessageDetails(dumpedDocument.Name, out ownerFullName, out messageId);

        PageInfo owner = FindPage(NameTools.GetNamespace(ownerFullName), NameTools.GetLocalName(ownerFullName), GetAllPages());
        if(owner == null) {
            return null;
        }
        return new MessageDocument(owner, messageId, dumpedDocument, TokenizeContent);
    }

    // Unknown type tag
    return null;
}
/// <summary>
/// Builds the document instance matching a dumped file or page attachment document.
/// </summary>
/// <param name="doc">The dumped document instance.</param>
/// <returns>A <see cref="T:FileDocument" /> or a <see cref="T:PageAttachmentDocument" />, depending on the type tag.</returns>
/// <exception cref="NotSupportedException">If the type tag is neither the file nor the page attachment tag.</exception>
private static IDocument DetectFileOrAttachment(DumpedDocument doc) {
    if(doc.TypeTag == FileDocument.StandardTypeTag) {
        return new FileDocument(doc);
    }

    if(doc.TypeTag == PageAttachmentDocument.StandardTypeTag) {
        return new PageAttachmentDocument(doc);
    }

    // Any other type tag cannot be handled here
    throw new NotSupportedException();
}
/// <summary>
/// Tries to load all data related to a word from the database.
/// </summary>
/// <remarks>
/// Mappings that refer to a document that cannot be loaded or built (no matching row, or
/// <c>BuildDocument</c> returns <c>null</c>) are skipped, so the returned word may carry
/// fewer occurrences than there are stored mappings.
/// </remarks>
/// <param name="text">The word text.</param>
/// <param name="word">The returned word, or <c>null</c> when the method returns <c>false</c>.</param>
/// <param name="connection">An open database connection.</param>
/// <returns><c>true</c> if the word is found and has at least one stored mapping, <c>false</c> otherwise.</returns>
private bool TryFindWord(string text, out Word word, DbConnection connection) {
    // 1. Find word - if not found, return
    // 2. Read all raw word mappings
    // 3. Read all documents (unique)
    // 4. Build result data structure

    ICommandBuilder builder = GetCommandBuilder();
    QueryBuilder queryBuilder = new QueryBuilder(builder);

    string query = queryBuilder.SelectFrom("IndexWord", new string[] { "Id" });
    query = queryBuilder.Where(query, "Text", WhereOperator.Equals, "Text");

    List<Parameter> parameters = new List<Parameter>(1);
    parameters.Add(new Parameter(ParameterType.String, "Text", text));

    DbCommand command = builder.GetCommand(connection, query, parameters);

    // -1 is the fallback value handed to ExecuteScalar and is treated as "word not found"
    int wordId = ExecuteScalar<int>(command, -1, false);

    if(wordId == -1) {
        word = null;
        return false;
    }

    // Read all raw mappings
    query = queryBuilder.SelectFrom("IndexWordMapping");
    query = queryBuilder.Where(query, "Word", WhereOperator.Equals, "WordId");

    parameters = new List<Parameter>(1);
    parameters.Add(new Parameter(ParameterType.Int32, "WordId", wordId));

    command = builder.GetCommand(connection, query, parameters);

    DbDataReader reader = ExecuteReader(command, false);

    List<DumpedWordMapping> mappings = new List<DumpedWordMapping>(2048);
    try {
        while(reader != null && reader.Read()) {
            mappings.Add(new DumpedWordMapping((uint)wordId,
                (uint)(int)reader["Document"],
                (ushort)(short)reader["FirstCharIndex"], (ushort)(short)reader["WordIndex"],
                (byte)reader["Location"]));
        }
    }
    finally {
        // Close the reader even if a read or a cast fails
        CloseReader(reader);
    }

    if(mappings.Count == 0) {
        word = null;
        return false;
    }

    // Find all documents
    query = queryBuilder.SelectFrom("IndexDocument");
    query = queryBuilder.Where(query, "Id", WhereOperator.Equals, "DocId");

    parameters = new List<Parameter>(1);
    parameters.Add(new Parameter(ParameterType.Int32, "DocId", 0));

    // Maps document ID -> built document; a null value records a document that could not be resolved
    Dictionary<uint, IDocument> documents = new Dictionary<uint, IDocument>(64);
    foreach(DumpedWordMapping map in mappings) {
        uint docId = map.DocumentID;
        if(documents.ContainsKey(docId)) continue;

        IDocument document = null;

        parameters[0].Value = (int)docId;
        command = builder.GetCommand(connection, query, parameters);

        reader = ExecuteReader(command, false);
        try {
            if(reader != null && reader.Read()) {
                DumpedDocument dumpedDoc = new DumpedDocument(docId,
                    reader["Name"] as string, reader["Title"] as string,
                    reader["TypeTag"] as string,
                    (DateTime)reader["DateTime"]);

                document = BuildDocument(dumpedDoc);
            }
        }
        finally {
            CloseReader(reader);
        }

        // Cache the outcome even when it is null, so an unresolved document
        // is not queried again for every one of its mappings
        documents.Add(docId, document);
    }

    OccurrenceDictionary occurrences = new OccurrenceDictionary(mappings.Count);
    foreach(DumpedWordMapping map in mappings) {
        // Skip mappings whose document could not be resolved instead of failing on a missing key
        IDocument document;
        if(!documents.TryGetValue(map.DocumentID, out document) || document == null) continue;

        if(!occurrences.ContainsKey(document)) {
            occurrences.Add(document, new SortedBasicWordInfoSet(2));
        }

        occurrences[document].Add(new BasicWordInfo(
            map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location)));
    }

    word = new Word((uint)wordId, text, occurrences);
    return true;
}
/// <summary>
/// Handles the construction of an <see cref="T:IDocument" /> for the search engine.
/// </summary>
/// <param name="dumpedDocument">The input dumped document.</param>
/// <returns>A <see cref="T:DummyDocument" /> copy when <c>alwaysGenerateDocument</c> is set; otherwise a
/// <see cref="T:PageDocument" /> or <see cref="T:MessageDocument" /> depending on the type tag, or <c>null</c>
/// when the type tag is not recognized or the referenced page cannot be found.</returns>
private IDocument BuildDocument(DumpedDocument dumpedDocument) {
    if(alwaysGenerateDocument) {
        // Produce a stand-in that mirrors the dumped data, without resolving any page
        DummyDocument placeholder = new DummyDocument();
        placeholder.ID = dumpedDocument.ID;
        placeholder.Name = dumpedDocument.Name;
        placeholder.Title = dumpedDocument.Title;
        placeholder.TypeTag = dumpedDocument.TypeTag;
        placeholder.DateTime = dumpedDocument.DateTime;
        return placeholder;
    }

    // Page documents: the page name is derived from the document name
    if(dumpedDocument.TypeTag == PageDocument.StandardTypeTag) {
        string fullName = PageDocument.GetPageName(dumpedDocument.Name);

        PageInfo owner = GetPage(fullName);
        if(owner == null) {
            return null;
        }
        return new PageDocument(owner, dumpedDocument, TokenizeContent);
    }

    // Message documents: the document name carries both the page name and the message ID
    if(dumpedDocument.TypeTag == MessageDocument.StandardTypeTag) {
        string ownerFullName;
        int messageId;
        MessageDocument.GetMessageDetails(dumpedDocument.Name, out ownerFullName, out messageId);

        PageInfo owner = GetPage(ownerFullName);
        if(owner == null) {
            return null;
        }
        return new MessageDocument(owner, messageId, dumpedDocument, TokenizeContent);
    }

    // Unknown type tag
    return null;
}
/// <summary>
/// Initializes index data by completely emptying the index catalog and storing the specified data.
/// </summary>
/// <param name="documents">The documents.</param>
/// <param name="words">The words.</param>
/// <param name="mappings">The mappings.</param>
/// <remarks>The method <b>does not</b> check the consistency of the data passed as arguments.
/// Documents for which the build delegate returns <c>null</c> are skipped, and mappings that refer
/// to a skipped or unknown document, or to an unknown word, are ignored.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="documents"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">If <see cref="M:SetBuildDocumentDelegate"/> was not called.</exception>
public void InitializeData(DumpedDocument[] documents, DumpedWord[] words, DumpedWordMapping[] mappings) {
    if(documents == null) throw new ArgumentNullException("documents");
    if(words == null) throw new ArgumentNullException("words");
    if(mappings == null) throw new ArgumentNullException("mappings");

    if(buildDocument == null) throw new InvalidOperationException("InitializeData can be invoked only when the BuildDocument delegate is set");

    lock(this) {
        catalog.Clear();
        catalog = new Dictionary<string, Word>(words.Length);

        // 1. Prepare a dictionary with all documents for use in the last step
        Dictionary<uint, IDocument> tempDocuments = new Dictionary<uint, IDocument>(documents.Length);
        foreach(DumpedDocument doc in documents) {
            IDocument builtDoc = buildDocument(doc);
            // Null means that the document no longer exists - silently skip it.
            // Its absence from tempDocuments is what makes step 3 ignore its mappings.
            if(builtDoc != null) {
                tempDocuments.Add(doc.ID, builtDoc);
            }
        }

        // 2. Load words into the catalog, keeping track of them by ID in a dictionary for the next step
        Dictionary<ulong, Word> tempWords = new Dictionary<ulong, Word>(words.Length);
        foreach(DumpedWord w in words) {
            Word word = new Word(w.ID, w.Text);
            tempWords.Add(w.ID, word);
            catalog.Add(w.Text, word);
        }

        // 3. Add mappings and documents
        foreach(DumpedWordMapping map in mappings) {
            // Skip mappings that refer to missing documents and gracefully skip unknown words.
            // Dictionary lookups replace the former linear scan of a missing-IDs list and the
            // swallowed KeyNotFoundException.
            IDocument mappedDocument;
            if(!tempDocuments.TryGetValue(map.DocumentID, out mappedDocument)) continue;

            Word mappedWord;
            if(!tempWords.TryGetValue(map.WordID, out mappedWord)) continue;

            mappedWord.AddOccurrence(mappedDocument,
                map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location));
        }
    }
}