/// <summary>
/// Creates a <see cref="DumpedWord" /> that copies the ID and the text of an existing word.
/// </summary>
/// <param name="word">The word whose <c>ID</c> and <c>Text</c> are copied.</param>
/// <exception cref="ArgumentNullException">If <paramref name="word"/> is <c>null</c>.</exception>
public DumpedWord(Word word)
{
    if(word == null)
    {
        throw new ArgumentNullException("word");
    }

    // Only the identifier and the text are captured from the source word
    id = word.ID;
    text = word.Text;
}
/// <summary>
/// Looks up a word by forwarding the request to the configured implementation,
/// together with the stored connection.
/// </summary>
/// <param name="text">The text of the word to look for.</param>
/// <param name="word">Receives whatever the implementation assigns to its <c>out</c> argument.</param>
/// <returns>The value returned by the implementation (<c>true</c> if the word is found, <c>false</c> otherwise).</returns>
public bool TryGetWord(string text, out Word word)
{
    bool found = implementation(text, out word, connection);
    return found;
}
/// <summary>
/// Tries to load all data related to a word from the database.
/// </summary>
/// <param name="text">The word text.</param>
/// <param name="word">The loaded word, or <c>null</c> if the word or its mappings are not found.</param>
/// <param name="connection">The database connection the commands are created against.</param>
/// <returns><c>true</c> if the word is found and has at least one mapping, <c>false</c> otherwise.</returns>
/// <remarks>Mappings that refer to a document that cannot be loaded or built are skipped.
/// Data readers are closed even if reading a row throws.</remarks>
private bool TryFindWord(string text, out Word word, DbConnection connection)
{
    ICommandBuilder builder = GetCommandBuilder();
    QueryBuilder queryBuilder = new QueryBuilder(builder);

    // 1. Find the word - if not found, return
    string query = queryBuilder.SelectFrom("IndexWord", new string[] { "Id" });
    query = queryBuilder.Where(query, "Text", WhereOperator.Equals, "Text");

    List<Parameter> parameters = new List<Parameter>(1);
    parameters.Add(new Parameter(ParameterType.String, "Text", text));

    DbCommand command = builder.GetCommand(connection, query, parameters);

    // -1 is the default passed to ExecuteScalar and is treated as "word not found"
    int wordId = ExecuteScalar<int>(command, -1, false);

    if(wordId == -1)
    {
        word = null;
        return false;
    }

    // 2. Read all raw word mappings
    query = queryBuilder.SelectFrom("IndexWordMapping");
    query = queryBuilder.Where(query, "Word", WhereOperator.Equals, "WordId");

    parameters = new List<Parameter>(1);
    parameters.Add(new Parameter(ParameterType.Int32, "WordId", wordId));

    command = builder.GetCommand(connection, query, parameters);

    List<DumpedWordMapping> mappings = new List<DumpedWordMapping>(2048);

    DbDataReader reader = ExecuteReader(command, false);
    try
    {
        while(reader != null && reader.Read())
        {
            mappings.Add(new DumpedWordMapping((uint)wordId,
                (uint)(int)reader["Document"],
                (ushort)(short)reader["FirstCharIndex"],
                (ushort)(short)reader["WordIndex"],
                (byte)reader["Location"]));
        }
    }
    finally
    {
        // Release the reader even when a row cannot be read or converted
        CloseReader(reader);
    }

    if(mappings.Count == 0)
    {
        word = null;
        return false;
    }

    // 3. Read all documents (each document is loaded only once)
    query = queryBuilder.SelectFrom("IndexDocument");
    query = queryBuilder.Where(query, "Id", WhereOperator.Equals, "DocId");

    // The same parameter list is reused; only the value changes for each document
    parameters = new List<Parameter>(1);
    parameters.Add(new Parameter(ParameterType.Int32, "DocId", 0));

    Dictionary<uint, IDocument> documents = new Dictionary<uint, IDocument>(64);
    foreach(DumpedWordMapping map in mappings)
    {
        uint docId = map.DocumentID;
        if(documents.ContainsKey(docId)) continue;

        parameters[0].Value = (int)docId;
        command = builder.GetCommand(connection, query, parameters);

        reader = ExecuteReader(command, false);
        try
        {
            if(reader != null && reader.Read())
            {
                DumpedDocument dumpedDoc = new DumpedDocument(docId,
                    reader["Name"] as string,
                    reader["Title"] as string,
                    reader["TypeTag"] as string,
                    (DateTime)reader["DateTime"]);

                // A null result means the document could not be built: it is left out of the dictionary
                IDocument document = BuildDocument(dumpedDoc);
                if(document != null) documents.Add(docId, document);
            }
        }
        finally
        {
            CloseReader(reader);
        }
    }

    // 4. Build result data structure
    OccurrenceDictionary occurrences = new OccurrenceDictionary(mappings.Count);
    foreach(DumpedWordMapping map in mappings)
    {
        // Documents that were not found or could not be built are absent from the dictionary:
        // skip their mappings instead of failing with KeyNotFoundException
        IDocument mappedDocument;
        if(!documents.TryGetValue(map.DocumentID, out mappedDocument)) continue;

        if(!occurrences.ContainsKey(mappedDocument))
        {
            occurrences.Add(mappedDocument, new SortedBasicWordInfoSet(2));
        }

        occurrences[mappedDocument].Add(new BasicWordInfo(
            map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location)));
    }

    word = new Word((uint)wordId, text, occurrences);
    return true;
}
/// <summary>
/// Looks up a word in the catalog by its text.
/// </summary>
/// <param name="text">The text of the word to look for.</param>
/// <param name="word">The word found in the catalog, or <c>null</c> when there is no entry for <paramref name="text"/>.</param>
/// <returns><c>true</c> if the catalog contains the word, <c>false</c> otherwise.</returns>
public bool TryGetWord(string text, out Word word)
{
    bool found;

    // The lookup is performed while holding the lock on the catalog instance
    lock(catalog)
    {
        found = catalog.TryGetValue(text, out word);
    }

    return found;
}
/// <summary>
/// Records an occurrence of a word in the catalog, creating the catalog entry when the word is not yet known.
/// </summary>
/// <param name="wordText">The word to store; it is lower-cased with the invariant culture before the lookup.</param>
/// <param name="document">The document the word occurs in.</param>
/// <param name="firstCharIndex">The index of the first character of the word in the document.</param>
/// <param name="wordIndex">The index of the word in the document.</param>
/// <param name="location">The location of the word.</param>
/// <param name="newWord">The newly created word, or <c>null</c> if the word was already in the catalog.</param>
/// <param name="dumpedWord">The dumped data of the newly created word, or <c>null</c> if the word was already in the catalog.</param>
/// <returns>The dumped word mapping data describing the stored occurrence.</returns>
protected DumpedWordMapping StoreWord(string wordText, IDocument document, ushort firstCharIndex, ushort wordIndex,
    WordLocation location, out Word newWord, out DumpedWord dumpedWord)
{
    string key = wordText.ToLower(CultureInfo.InvariantCulture);

    lock(this)
    {
        Word target = null;
        bool alreadyKnown = catalog.TryGetValue(key, out target);

        if(alreadyKnown)
        {
            newWord = null;
            dumpedWord = null;
        }
        else
        {
            // A brand-new word starts with ID zero; the caller receives the instance
            // through newWord so that the ID can be replaced once the word has been stored
            target = new Word(0, key);
            catalog.Add(key, target);
            newWord = target;
            dumpedWord = new DumpedWord(target);
        }

        target.AddOccurrence(document, firstCharIndex, wordIndex, location);

        DumpedWordMapping mapping = new DumpedWordMapping(target.ID, document.ID, firstCharIndex, wordIndex, location.Location);
        return mapping;
    }
}
/// <summary>
/// Initializes index data by completely emptying the index catalog and storing the specified data.
/// </summary>
/// <param name="documents">The documents.</param>
/// <param name="words">The words.</param>
/// <param name="mappings">The mappings.</param>
/// <remarks>The method <b>does not</b> check the consistency of the data passed as arguments.
/// Mappings that refer to a document that could not be built, or to an unknown word or document,
/// are silently skipped.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="documents"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">If <see cref="M:SetBuildDocumentDelegate"/> was not called.</exception>
public void InitializeData(DumpedDocument[] documents, DumpedWord[] words, DumpedWordMapping[] mappings)
{
    if(documents == null) throw new ArgumentNullException("documents");
    if(words == null) throw new ArgumentNullException("words");
    if(mappings == null) throw new ArgumentNullException("mappings");

    if(buildDocument == null) throw new InvalidOperationException("InitializeData can be invoked only when the BuildDocument delegate is set");

    lock(this)
    {
        catalog.Clear();
        catalog = new Dictionary<string, Word>(words.Length);

        // 1. Prepare a dictionary with all documents for use in the last step
        Dictionary<uint, IDocument> tempDocuments = new Dictionary<uint, IDocument>(documents.Length);
        foreach(DumpedDocument doc in documents)
        {
            IDocument builtDoc = buildDocument(doc);
            // Null means that the document no longer exists - leave it out so that
            // its mappings are skipped in step 3
            if(builtDoc != null)
            {
                tempDocuments.Add(doc.ID, builtDoc);
            }
        }

        // 2. Load words into the catalog, keeping track of them by ID in a dictionary for the next step
        // (word IDs are sequential, the old hashing-algorithm check is no longer performed)
        Dictionary<ulong, Word> tempWords = new Dictionary<ulong, Word>(words.Length);
        foreach(DumpedWord w in words)
        {
            Word word = new Word(w.ID, w.Text);
            tempWords.Add(w.ID, word);
            catalog.Add(w.Text, word);
        }

        // 3. Add mappings and documents
        foreach(DumpedWordMapping map in mappings)
        {
            // Skip mappings that refer to unknown words or to missing/unknown documents;
            // dictionary lookups replace the former catch(KeyNotFoundException) and list scan
            Word mappedWord;
            if(!tempWords.TryGetValue(map.WordID, out mappedWord)) continue;

            IDocument mappedDocument;
            if(!tempDocuments.TryGetValue(map.DocumentID, out mappedDocument)) continue;

            mappedWord.AddOccurrence(mappedDocument,
                map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location));
        }
    }
}
/// <summary>
/// Runs a search against the words supplied by a word fetcher.
/// </summary>
/// <param name="query">The search query; it is lower-cased and split on spaces into individual words.</param>
/// <param name="documentTypeTags">The document type tags to include in the search.</param>
/// <param name="filterDocumentType"><c>true</c> to keep only documents whose type tag is included in <paramref name="documentTypeTags"/>.</param>
/// <param name="options">The search options.</param>
/// <param name="fetcher">An object that is able to fetch words.</param>
/// <returns>The results, with their relevance finalized against the total relevance.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="query"/> or <paramref name="fetcher"/> are <c>null</c>.</exception>
/// <exception cref="ArgumentException">If <paramref name="query"/> is empty.</exception>
/// <exception cref="ArgumentNullException">If <paramref name="filterDocumentType"/> is <c>true</c> and <paramref name="documentTypeTags"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">If <paramref name="filterDocumentType"/> is <c>true</c> and <paramref name="documentTypeTags"/> is empty.</exception>
/// <exception cref="InvalidOperationException">If <paramref name="options"/> is not one of the supported values.</exception>
public static SearchResultCollection SearchInternal(string query, string[] documentTypeTags, bool filterDocumentType, SearchOptions options, IWordFetcher fetcher)
{
    if (query == null) throw new ArgumentNullException("query");
    if (query.Length == 0) throw new ArgumentException("Query cannot be empty", "query");

    if (filterDocumentType)
    {
        if (documentTypeTags == null) throw new ArgumentNullException("documentTypeTags");
        if (documentTypeTags.Length == 0) throw new ArgumentException("documentTypeTags cannot be empty", "documentTypeTags");
    }

    if (fetcher == null) throw new ArgumentNullException("fetcher");

    SearchResultCollection results = new SearchResultCollection();

    char[] separators = new char[] { ' ' };
    string[] queryWords = query.ToLowerInvariant().Split(separators, StringSplitOptions.RemoveEmptyEntries);

    float totalRelevance = 0;

    foreach (string term in queryWords)
    {
        Word found = null;
        if (!fetcher.TryGetWord(term, out found)) continue;

        foreach (IDocument doc in found.Occurrences.Keys)
        {
            // Skip documents with excluded tags
            bool excluded = filterDocumentType && !IsDocumentTypeTagIncluded(doc.TypeTag, documentTypeTags);
            if (excluded) continue;

            foreach (BasicWordInfo info in found.Occurrences[doc])
            {
                WordInfo match = new WordInfo(term, info.FirstCharIndex, info.WordIndex, info.Location);
                SearchResult existing = results.GetSearchResult(doc);

                if (existing != null)
                {
                    // The document already has a result: add the match unless it is a duplicate
                    // (happens when the query contains the same word multiple times)
                    if (!existing.Matches.ContainsOccurrence(match.Text, match.FirstCharIndex))
                    {
                        existing.Matches.Add(match);
                    }
                    existing.Relevance.SetValue(existing.Relevance.Value + info.Location.RelativeRelevance);
                }
                else
                {
                    // First match for this document: create its result object
                    SearchResult created = new SearchResult(doc);
                    created.Relevance.SetValue(info.Location.RelativeRelevance);
                    created.Matches.Add(match);
                    results.Add(created);
                }

                totalRelevance += info.Location.RelativeRelevance;
            }
        }
    }

    // The purge helpers return a value that is subtracted from the total relevance
    if (options == SearchOptions.AllWords)
    {
        totalRelevance -= PurgeResultsForAllWords(results, queryWords);
    }
    else if (options == SearchOptions.ExactPhrase)
    {
        totalRelevance -= PurgeResultsForExactPhrase(results, queryWords);
    }
    else if (options != SearchOptions.AtLeastOneWord)
    {
        throw new InvalidOperationException("Unsupported SearchOptions");
    }

    // Finalize relevance values
    for (int index = 0; index < results.Count; index++)
    {
        results[index].Relevance.Finalize(totalRelevance);
    }

    return results;
}
/// <summary>
/// Looks up a word in the catalog by its text.
/// </summary>
/// <param name="text">The text of the word to look for.</param>
/// <param name="word">The word found in the catalog, or <c>null</c> when there is no entry for <paramref name="text"/>.</param>
/// <returns><c>true</c> if the catalog contains the word, <c>false</c> otherwise.</returns>
public bool TryGetWord(string text, out Word word)
{
    bool found;

    // The lookup is performed while holding the lock on the catalog instance
    lock (catalog)
    {
        found = catalog.TryGetValue(text, out word);
    }

    return found;
}
/// <summary>
/// Stores a document in the index.
/// </summary>
/// <param name="document">The document.</param>
/// <param name="keywords">The document keywords, if any, an empty array or <c>null</c> otherwise.</param>
/// <param name="content">The content of the document.</param>
/// <param name="state">A state object that is passed to the IndexStorer SaveData/DeleteData function.</param>
/// <returns>The number of indexed words (including duplicates) in the document content, title and keywords,
/// or <c>0</c> if the index change handler returns no result or no document ID.</returns>
/// <remarks>Any existing entry for the document is removed first; when something was removed,
/// a <c>DocumentRemoved</c> index change is raised before the document is indexed again.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="document"/> or <paramref name="content"/> are <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">If the index change result carries no ID for a newly created word.</exception>
public int StoreDocument(IDocument document, string[] keywords, string content, object state)
{
    if (document == null)
    {
        throw new ArgumentNullException("document");
    }
    if (keywords == null)
    {
        keywords = new string[0];
    }
    if (content == null)
    {
        throw new ArgumentNullException("content");
    }

    lock (this)
    {
        DumpedChange removeChange = RemoveDocumentInternal(document);

        if (removeChange != null)
        {
            OnIndexChange(document, IndexChangeType.DocumentRemoved, removeChange, state);
        }
    }

    keywords = Tools.CleanupKeywords(keywords);

    // When the IndexStorer handles the IndexChanged event and a document is added, the storer generates a new ID and returns it
    // via the event handler, then the in-memory index is updated (the document instance is shared across all words) - the final ID
    // is generated by the actual IndexStorer implementation (SaveData properly populates the Result field in the args)

    List<DumpedWord> dw = new List<DumpedWord>(content.Length / 5);
    List<DumpedWordMapping> dm = new List<DumpedWordMapping>(content.Length / 5);
    List<Word> newWords = new List<Word>(50);
    Word tempWord = null;
    DumpedWord tempDumpedWord = null;

    int count = 0;

    // New words receive temporary IDs counting down from uint.MaxValue;
    // they are replaced below with the IDs returned by the index change handler
    uint sequentialWordId = uint.MaxValue;

    // Store content words
    WordInfo[] words = document.Tokenize(content);
    words = Tools.RemoveStopWords(words, stopWords);

    foreach (WordInfo info in words)
    {
        DumpedWordMapping mapping = StoreWord(info.Text, document, info.FirstCharIndex, info.WordIndex, WordLocation.Content, out tempWord, out tempDumpedWord);
        dm.Add(mapping);
        if (TrackNewWord(mapping, tempWord, tempDumpedWord, sequentialWordId, dw, newWords))
        {
            sequentialWordId--;
        }
    }
    count += words.Length;

    // Store title words
    words = document.Tokenize(document.Title);
    words = Tools.RemoveStopWords(words, stopWords);

    foreach (WordInfo info in words)
    {
        DumpedWordMapping mapping = StoreWord(info.Text, document, info.FirstCharIndex, info.WordIndex, WordLocation.Title, out tempWord, out tempDumpedWord);
        dm.Add(mapping);
        if (TrackNewWord(mapping, tempWord, tempDumpedWord, sequentialWordId, dw, newWords))
        {
            sequentialWordId--;
        }
    }
    count += words.Length;

    // Store keywords: the first-char index of each keyword is the running length
    // of the preceding keywords plus one separator each
    ushort tempCount = 0;
    for (ushort i = 0; i < (ushort)keywords.Length; i++)
    {
        DumpedWordMapping mapping = StoreWord(keywords[i], document, tempCount, i, WordLocation.Keywords, out tempWord, out tempDumpedWord);
        dm.Add(mapping);
        if (TrackNewWord(mapping, tempWord, tempDumpedWord, sequentialWordId, dw, newWords))
        {
            sequentialWordId--;
        }
        tempCount += (ushort)(1 + keywords[i].Length);
    }
    count += keywords.Length;

    IndexStorerResult result = OnIndexChange(document, IndexChangeType.DocumentAdded,
        new DumpedChange(new DumpedDocument(document), dw, dm), state);

    // Update document ID
    if (result != null && result.DocumentID.HasValue)
    {
        document.ID = result.DocumentID.Value;
    }
    else
    {
        // HACK: result is null -> index is corrupted, silently return
        return 0;
    }

    // Update word IDs in newWords with the first ID whose text matches
    foreach (Word word in newWords)
    {
        bool wordIdUpdated = false;
        foreach (WordId id in result.WordIDs)
        {
            if (id.Text == word.Text)
            {
                word.ID = id.ID;
                wordIdUpdated = true;
                break;
            }
        }
        if (!wordIdUpdated)
        {
            throw new InvalidOperationException("No ID for new word");
        }
    }

    return count;
}

/// <summary>
/// Registers a word reported as newly created by <c>StoreWord</c>, assigning a temporary ID
/// to the word, its dumped data and the mapping of its first occurrence.
/// </summary>
/// <param name="mapping">The mapping returned by <c>StoreWord</c> for the occurrence.</param>
/// <param name="newWord">The new word, or <c>null</c> if the word already existed.</param>
/// <param name="dumpedWord">The dumped data of the new word, or <c>null</c> if the word already existed.</param>
/// <param name="temporaryId">The temporary ID to assign.</param>
/// <param name="dumpedWords">The list that collects the dumped data of new words.</param>
/// <param name="newWords">The list that collects the new words.</param>
/// <returns><c>true</c> if the word was new and <paramref name="temporaryId"/> was used, <c>false</c> otherwise.</returns>
private static bool TrackNewWord(DumpedWordMapping mapping, Word newWord, DumpedWord dumpedWord, uint temporaryId,
    List<DumpedWord> dumpedWords, List<Word> newWords)
{
    if (dumpedWord == null || newWord == null)
    {
        return false;
    }

    mapping.WordID = temporaryId;
    dumpedWord.ID = temporaryId;
    dumpedWords.Add(dumpedWord);
    newWord.ID = temporaryId;
    newWords.Add(newWord);
    return true;
}
/// <summary>
/// Initializes index data by completely emptying the index catalog and storing the specified data.
/// </summary>
/// <param name="documents">The documents.</param>
/// <param name="words">The words.</param>
/// <param name="mappings">The mappings.</param>
/// <remarks>The method <b>does not</b> check the consistency of the data passed as arguments.
/// Mappings that refer to a document that could not be built, or to an unknown word or document,
/// are silently skipped.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="documents"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">If <see cref="M:SetBuildDocumentDelegate"/> was not called.</exception>
public void InitializeData(DumpedDocument[] documents, DumpedWord[] words, DumpedWordMapping[] mappings)
{
    if (documents == null)
    {
        throw new ArgumentNullException("documents");
    }
    if (words == null)
    {
        throw new ArgumentNullException("words");
    }
    if (mappings == null)
    {
        throw new ArgumentNullException("mappings");
    }

    if (buildDocument == null)
    {
        throw new InvalidOperationException("InitializeData can be invoked only when the BuildDocument delegate is set");
    }

    lock (this)
    {
        catalog.Clear();
        catalog = new Dictionary<string, Word>(words.Length);

        // 1. Prepare a dictionary with all documents for use in the last step
        Dictionary<uint, IDocument> tempDocuments = new Dictionary<uint, IDocument>(documents.Length);
        foreach (DumpedDocument doc in documents)
        {
            IDocument builtDoc = buildDocument(doc);
            // Null means that the document no longer exists - leave it out so that
            // its mappings are skipped in step 3
            if (builtDoc != null)
            {
                tempDocuments.Add(doc.ID, builtDoc);
            }
        }

        // 2. Load words into the catalog, keeping track of them by ID in a dictionary for the next step
        // (word IDs are sequential, the old hashing-algorithm check is no longer performed)
        Dictionary<ulong, Word> tempWords = new Dictionary<ulong, Word>(words.Length);
        foreach (DumpedWord w in words)
        {
            Word word = new Word(w.ID, w.Text);
            tempWords.Add(w.ID, word);
            catalog.Add(w.Text, word);
        }

        // 3. Add mappings and documents
        foreach (DumpedWordMapping map in mappings)
        {
            // Skip mappings that refer to unknown words or to missing/unknown documents;
            // dictionary lookups replace the former catch (KeyNotFoundException) and list scan
            Word mappedWord;
            if (!tempWords.TryGetValue(map.WordID, out mappedWord))
            {
                continue;
            }

            IDocument mappedDocument;
            if (!tempDocuments.TryGetValue(map.DocumentID, out mappedDocument))
            {
                continue;
            }

            mappedWord.AddOccurrence(mappedDocument,
                map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location));
        }
    }
}