Represents a word in a document.
All instance and static members are thread-safe.
Пример #1
0
        /// <summary>
        /// Creates a <see cref="DumpedWord" /> that captures the ID and the text of an existing word.
        /// </summary>
        /// <param name="word">The word whose <c>ID</c> and <c>Text</c> are copied.</param>
        /// <exception cref="ArgumentNullException">If <paramref name="word"/> is <c>null</c>.</exception>
        public DumpedWord(Word word)
        {
            // Fail fast: nothing can be copied from a missing word
            if(word == null) {
                throw new ArgumentNullException("word");
            }

            id = word.ID;
            text = word.Text;
        }
		/// <summary>
		/// Looks up a word by its text.
		/// </summary>
		/// <param name="text">The text of the word to look up.</param>
		/// <param name="word">Receives the word when it is found, <c>null</c> otherwise.</param>
		/// <returns><c>true</c> if the word is found, <c>false</c> otherwise.</returns>
		/// <remarks>The call is forwarded to <c>implementation</c>, together with <c>connection</c>.</remarks>
		public bool TryGetWord(string text, out Word word) {
			bool found = implementation(text, out word, connection);
			return found;
		}
        /// <summary>
        /// Tries to load all data related to a word from the database.
        /// </summary>
        /// <param name="text">The word text.</param>
        /// <param name="word">The returned word, or <c>null</c> when the method returns <c>false</c>.</param>
        /// <param name="connection">An open database connection.</param>
        /// <returns><c>true</c> if the word is found and has at least one mapping, <c>false</c> otherwise.</returns>
        /// <remarks>Mappings that refer to a document that cannot be loaded (no row in <c>IndexDocument</c>,
        /// or <c>BuildDocument</c> returning <c>null</c>) are skipped rather than causing a
        /// <see cref="KeyNotFoundException"/>. Data readers are closed even when reading fails.</remarks>
        private bool TryFindWord(string text, out Word word, DbConnection connection)
        {
            ICommandBuilder builder = GetCommandBuilder();
            QueryBuilder queryBuilder = new QueryBuilder(builder);

            // 1. Find the word ID - if not found, return
            string query = queryBuilder.SelectFrom("IndexWord", new string[] { "Id" });
            query = queryBuilder.Where(query, "Text", WhereOperator.Equals, "Text");

            List<Parameter> parameters = new List<Parameter>(1);
            parameters.Add(new Parameter(ParameterType.String, "Text", text));

            DbCommand command = builder.GetCommand(connection, query, parameters);

            // -1 is the fallback value handed to ExecuteScalar and is used here as the "not found" marker
            int wordId = ExecuteScalar<int>(command, -1, false);

            if(wordId == -1) {
                word = null;
                return false;
            }

            // 2. Read all raw word mappings
            query = queryBuilder.SelectFrom("IndexWordMapping");
            query = queryBuilder.Where(query, "Word", WhereOperator.Equals, "WordId");

            parameters = new List<Parameter>(1);
            parameters.Add(new Parameter(ParameterType.Int32, "WordId", wordId));

            command = builder.GetCommand(connection, query, parameters);

            DbDataReader reader = ExecuteReader(command, false);

            List<DumpedWordMapping> mappings = new List<DumpedWordMapping>(2048);
            try {
                while(reader != null && reader.Read()) {
                    mappings.Add(new DumpedWordMapping((uint)wordId,
                        (uint)(int)reader["Document"],
                        (ushort)(short)reader["FirstCharIndex"], (ushort)(short)reader["WordIndex"],
                        (byte)reader["Location"]));
                }
            }
            finally {
                // Release the reader even if a row cannot be converted
                CloseReader(reader);
            }

            if(mappings.Count == 0) {
                word = null;
                return false;
            }

            // 3. Read all documents (each document ID is looked up only once)
            query = queryBuilder.SelectFrom("IndexDocument");
            query = queryBuilder.Where(query, "Id", WhereOperator.Equals, "DocId");

            parameters = new List<Parameter>(1);
            parameters.Add(new Parameter(ParameterType.Int32, "DocId", 0));

            // A null value marks a document that was looked up but could not be loaded,
            // so that it is neither queried again nor dereferenced in the last step
            Dictionary<uint, IDocument> documents = new Dictionary<uint, IDocument>(64);
            foreach(DumpedWordMapping map in mappings) {
                uint docId = map.DocumentID;
                if(documents.ContainsKey(docId)) continue;

                IDocument document = null;

                parameters[0].Value = (int)docId;
                command = builder.GetCommand(connection, query, parameters);

                reader = ExecuteReader(command, false);
                try {
                    if(reader != null && reader.Read()) {
                        DumpedDocument dumpedDoc = new DumpedDocument(docId,
                            reader["Name"] as string, reader["Title"] as string,
                            reader["TypeTag"] as string,
                            (DateTime)reader["DateTime"]);

                        document = BuildDocument(dumpedDoc);
                    }
                }
                finally {
                    CloseReader(reader);
                }

                documents.Add(docId, document);
            }

            // 4. Build result data structure, skipping mappings whose document is unavailable
            OccurrenceDictionary occurrences = new OccurrenceDictionary(mappings.Count);
            foreach(DumpedWordMapping map in mappings) {
                IDocument document;
                if(!documents.TryGetValue(map.DocumentID, out document) || document == null) continue;

                if(!occurrences.ContainsKey(document)) {
                    occurrences.Add(document, new SortedBasicWordInfoSet(2));
                }

                occurrences[document].Add(new BasicWordInfo(
                    map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location)));
            }

            word = new Word((uint)wordId, text, occurrences);
            return true;
        }
Пример #4
0
 /// <summary>
 /// Looks up a word in the catalog by its text.
 /// </summary>
 /// <param name="text">The text of the word to look up.</param>
 /// <param name="word">Receives the word when it is found, <c>null</c> otherwise.</param>
 /// <returns><c>true</c> if the word is found, <c>false</c> otherwise.</returns>
 public bool TryGetWord(string text, out Word word)
 {
     // NOTE(review): the lock is taken on the catalog instance itself - confirm that
     // the code that modifies the catalog synchronizes on the same object
     bool found;
     lock(catalog) {
         found = catalog.TryGetValue(text, out word);
     }
     return found;
 }
Пример #5
0
        /// <summary>
        /// Records an occurrence of a word, adding the word to the catalog when it is not there yet.
        /// </summary>
        /// <param name="wordText">The text of the word; it is lower-cased with the invariant culture before use.</param>
        /// <param name="document">The document the word occurs in.</param>
        /// <param name="firstCharIndex">The index of the first character of the word in the document.</param>
        /// <param name="wordIndex">The index of the word in the document.</param>
        /// <param name="location">The location of the word.</param>
        /// <param name="newWord">The word that was created by this call, or <c>null</c> if the word was already in the catalog.</param>
        /// <param name="dumpedWord">The dumped data of the created word, or <c>null</c> if the word was already in the catalog.</param>
        /// <returns>The dumped word mapping data describing the stored occurrence.</returns>
        /// <remarks>A word created by this method gets <b>0</b> as its initial ID; the caller receives it
        /// through <paramref name="newWord"/> so that the ID can be updated afterwards.</remarks>
        protected DumpedWordMapping StoreWord(string wordText, IDocument document, ushort firstCharIndex, ushort wordIndex,
            WordLocation location, out Word newWord, out DumpedWord dumpedWord)
        {
            string normalizedText = wordText.ToLower(CultureInfo.InvariantCulture);

            lock(this) {
                Word entry;
                bool alreadyKnown = catalog.TryGetValue(normalizedText, out entry);

                if(alreadyKnown) {
                    // Nothing was created, so there is nothing to hand back to the caller
                    newWord = null;
                    dumpedWord = null;
                }
                else {
                    // ZERO is a placeholder ID, to be replaced once the word has been stored
                    entry = new Word(0, normalizedText);
                    catalog.Add(normalizedText, entry);
                    newWord = entry;
                    dumpedWord = new DumpedWord(entry);
                }

                entry.AddOccurrence(document, firstCharIndex, wordIndex, location);

                DumpedWordMapping mapping = new DumpedWordMapping(entry.ID, document.ID, firstCharIndex, wordIndex, location.Location);
                return mapping;
            }
        }
Пример #6
0
        /// <summary>
        /// Initializes index data by completely emptying the index catalog and storing the specified data.
        /// </summary>
        /// <param name="documents">The documents.</param>
        /// <param name="words">The words.</param>
        /// <param name="mappings">The mappings.</param>
        /// <remarks>The method <b>does not</b> check the consistency of the data passed as arguments.
        /// Documents for which the BuildDocument delegate returns <c>null</c> are left out, and mappings that
        /// refer to an unknown word or to an unavailable document are silently skipped.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="documents"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">If <see cref="M:SetBuildDocumentDelegate"/> was not called.</exception>
        public void InitializeData(DumpedDocument[] documents, DumpedWord[] words, DumpedWordMapping[] mappings)
        {
            if(documents == null) throw new ArgumentNullException("documents");
            if(words == null) throw new ArgumentNullException("words");
            if(mappings == null) throw new ArgumentNullException("mappings");

            if(buildDocument == null) throw new InvalidOperationException("InitializeData can be invoked only when the BuildDocument delegate is set");

            lock(this) {
                catalog.Clear();
                catalog = new Dictionary<string, Word>(words.Length);

                // 1. Prepare a dictionary with all available documents for use in the last step
                Dictionary<uint, IDocument> tempDocuments = new Dictionary<uint, IDocument>(documents.Length);
                foreach(DumpedDocument doc in documents) {
                    IDocument builtDoc = buildDocument(doc);
                    // Null means that the document no longer exists - leave it out, so that
                    // the mappings that refer to it are skipped in the last step
                    if(builtDoc != null) {
                        tempDocuments.Add(doc.ID, builtDoc);
                    }
                }

                // 2. Load words into the catalog, keeping track of them by ID in a dictionary for the next step
                Dictionary<ulong, Word> tempWords = new Dictionary<ulong, Word>(words.Length);
                foreach(DumpedWord w in words) {
                    Word word = new Word(w.ID, w.Text);
                    tempWords.Add(w.ID, word);
                    catalog.Add(w.Text, word);
                }

                // 3. Add mappings, skipping those that refer to an unknown word or to an unavailable document
                // (dictionary lookups replace the former linear list scan and exception-based skipping)
                foreach(DumpedWordMapping map in mappings) {
                    Word targetWord;
                    if(!tempWords.TryGetValue(map.WordID, out targetWord)) continue;

                    IDocument targetDocument;
                    if(!tempDocuments.TryGetValue(map.DocumentID, out targetDocument)) continue;

                    targetWord.AddOccurrence(targetDocument,
                        map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location));
                }
            }
        }
Пример #7
0
        /// <summary>
        /// Searches the index for the words contained in a query.
        /// </summary>
        /// <param name="query">The search query; it is lower-cased and split on spaces.</param>
        /// <param name="documentTypeTags">The document type tags to include in the search.</param>
        /// <param name="filterDocumentType"><c>true</c> to apply the filter on the document type.</param>
        /// <param name="options">The search options.</param>
        /// <param name="fetcher">An object that is able to fetch words.</param>
        /// <returns>The results.</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="query"/> or <paramref name="fetcher"/> are <c>null</c>.</exception>
        /// <exception cref="ArgumentException">If <paramref name="query"/> is empty.</exception>
        /// <exception cref="ArgumentNullException">If <paramref name="filterDocumentType"/> is <c>true</c> and <paramref name="documentTypeTags"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException">If <paramref name="filterDocumentType"/> is <c>true</c> and <paramref name="documentTypeTags"/> is empty.</exception>
        /// <exception cref="InvalidOperationException">If <paramref name="options"/> is not one of the supported values.</exception>
        public static SearchResultCollection SearchInternal(string query, string[] documentTypeTags, bool filterDocumentType, SearchOptions options, IWordFetcher fetcher)
        {
            if (query == null)
            {
                throw new ArgumentNullException("query");
            }
            if (query.Length == 0)
            {
                throw new ArgumentException("Query cannot be empty", "query");
            }

            // The tags are validated only when the filter is actually requested
            if (filterDocumentType)
            {
                if (documentTypeTags == null)
                {
                    throw new ArgumentNullException("documentTypeTags");
                }
                if (documentTypeTags.Length == 0)
                {
                    throw new ArgumentException("documentTypeTags cannot be empty", "documentTypeTags");
                }
            }

            if (fetcher == null)
            {
                throw new ArgumentNullException("fetcher");
            }

            SearchResultCollection results = new SearchResultCollection();

            string[] queryWords = query.ToLowerInvariant().Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // Sum of the relevance of every occurrence found, used to finalize the results at the end
            float totalRelevance = 0;

            foreach (string term in queryWords)
            {
                Word found;
                if (!fetcher.TryGetWord(term, out found))
                {
                    continue;
                }

                foreach (IDocument doc in found.Occurrences.Keys)
                {
                    // Skip documents with excluded tags
                    bool excluded = filterDocumentType && !IsDocumentTypeTagIncluded(doc.TypeTag, documentTypeTags);
                    if (excluded)
                    {
                        continue;
                    }

                    foreach (BasicWordInfo info in found.Occurrences[doc])
                    {
                        WordInfo match = new WordInfo(term, info.FirstCharIndex, info.WordIndex, info.Location);
                        SearchResult existing = results.GetSearchResult(doc);

                        if (existing != null)
                        {
                            // The document already has a result: extend it, avoiding duplicate matches
                            // (they happen when the query contains the same word multiple times)
                            if (!existing.Matches.ContainsOccurrence(match.Text, match.FirstCharIndex))
                            {
                                existing.Matches.Add(match);
                            }
                            existing.Relevance.SetValue(existing.Relevance.Value + info.Location.RelativeRelevance);
                        }
                        else
                        {
                            // First match for this document: create its result
                            SearchResult created = new SearchResult(doc);
                            created.Relevance.SetValue(info.Location.RelativeRelevance);
                            created.Matches.Add(match);
                            results.Add(created);
                        }

                        totalRelevance += info.Location.RelativeRelevance;
                    }
                }
            }

            // Apply the search options; AtLeastOneWord needs no further processing
            if (options == SearchOptions.AllWords)
            {
                totalRelevance -= PurgeResultsForAllWords(results, queryWords);
            }
            else if (options == SearchOptions.ExactPhrase)
            {
                totalRelevance -= PurgeResultsForExactPhrase(results, queryWords);
            }
            else if (options != SearchOptions.AtLeastOneWord)
            {
                throw new InvalidOperationException("Unsupported SearchOptions");
            }

            // Finalize relevance values
            for (int index = 0; index < results.Count; index++)
            {
                results[index].Relevance.Finalize(totalRelevance);
            }

            return results;
        }
Пример #8
0
 /// <summary>
 /// Looks up a word in the catalog by its text.
 /// </summary>
 /// <param name="text">The text of the word to look up.</param>
 /// <param name="word">Receives the word when it is found, <c>null</c> otherwise.</param>
 /// <returns><c>true</c> if the word is found, <c>false</c> otherwise.</returns>
 public bool TryGetWord(string text, out Word word)
 {
     // NOTE(review): the lock is taken on the catalog instance itself - confirm that
     // the code that modifies the catalog synchronizes on the same object
     bool found;
     lock (catalog) {
         found = catalog.TryGetValue(text, out word);
     }
     return found;
 }
Пример #9
0
        /// <summary>
        /// Stores a document in the index.
        /// </summary>
        /// <param name="document">The document.</param>
        /// <param name="keywords">The document keywords, if any, an empty array or <c>null</c> otherwise.</param>
        /// <param name="content">The content of the document.</param>
        /// <param name="state">A state object that is passed to the IndexStorer SaveData/DeleteData function.</param>
        /// <returns>The number of indexed words (including duplicates): content and title words left after
        /// stop-word removal, plus the cleaned-up keywords. <b>0</b> is returned when the index change
        /// produces no result or no document ID.</returns>
        /// <remarks>If the specified document was already in the index, all the old occurrences
        /// are deleted from the index before the new ones are stored.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="document"/> or <paramref name="content"/> are <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">If the index change result carries no ID for a newly-created word.</exception>
        public int StoreDocument(IDocument document, string[] keywords, string content, object state)
        {
            if (document == null)
            {
                throw new ArgumentNullException("document");
            }
            if (keywords == null)
            {
                keywords = new string[0];
            }
            if (content == null)
            {
                throw new ArgumentNullException("content");
            }

            lock (this) {
                DumpedChange removeChange = RemoveDocumentInternal(document);

                if (removeChange != null)
                {
                    OnIndexChange(document, IndexChangeType.DocumentRemoved, removeChange, state);
                }
            }

            keywords = Tools.CleanupKeywords(keywords);

            // When the IndexStorer handles the IndexChanged event and a document is added, the storer generates a new ID and returns it
            // via the event handler, then the in-memory index is updated (the document instance is shared across all words) - the final ID
            // is generated by the actual IndexStorer implementation (SaveData properly populates the Result field in the args)

            List<DumpedWord> dw = new List<DumpedWord>(content.Length / 5);
            List<DumpedWordMapping> dm = new List<DumpedWordMapping>(content.Length / 5);
            List<Word> newWords = new List<Word>(50);
            Word tempWord = null;
            DumpedWord tempDumpedWord = null;
            DumpedWordMapping mapping = null;

            int count = 0;

            // Temporary IDs for new words are handed out downwards from uint.MaxValue,
            // the final ones replace them after OnIndexChange has returned
            uint sequentialWordId = uint.MaxValue;

            // Store content words
            WordInfo[] words = document.Tokenize(content);
            words = Tools.RemoveStopWords(words, stopWords);

            foreach (WordInfo info in words)
            {
                mapping = StoreWord(info.Text, document, info.FirstCharIndex, info.WordIndex, WordLocation.Content, out tempWord, out tempDumpedWord);
                dm.Add(mapping);
                if (TrackNewWord(mapping, tempWord, tempDumpedWord, sequentialWordId, dw, newWords))
                {
                    sequentialWordId--;
                }
            }
            count += words.Length;

            // Store title words
            words = document.Tokenize(document.Title);
            words = Tools.RemoveStopWords(words, stopWords);

            foreach (WordInfo info in words)
            {
                mapping = StoreWord(info.Text, document, info.FirstCharIndex, info.WordIndex, WordLocation.Title, out tempWord, out tempDumpedWord);
                dm.Add(mapping);
                if (TrackNewWord(mapping, tempWord, tempDumpedWord, sequentialWordId, dw, newWords))
                {
                    sequentialWordId--;
                }
            }
            count += words.Length;

            // Store keywords: the first-char index of each keyword is computed as if
            // the keywords were laid out one after the other, one separator character apart
            ushort tempCount = 0;

            for (ushort i = 0; i < (ushort)keywords.Length; i++)
            {
                mapping = StoreWord(keywords[i], document, tempCount, i, WordLocation.Keywords, out tempWord, out tempDumpedWord);
                dm.Add(mapping);
                if (TrackNewWord(mapping, tempWord, tempDumpedWord, sequentialWordId, dw, newWords))
                {
                    sequentialWordId--;
                }
                tempCount += (ushort)(1 + keywords[i].Length);
            }
            count += keywords.Length;

            IndexStorerResult result = OnIndexChange(document, IndexChangeType.DocumentAdded,
                                                     new DumpedChange(new DumpedDocument(document), dw, dm), state);

            // Update document ID
            if (result != null && result.DocumentID.HasValue)
            {
                document.ID = result.DocumentID.Value;
            }
            else
            {
                // HACK: no result or no document ID -> index is corrupted, silently return
                return 0;
            }

            // Replace the temporary IDs of the new words with the final ones found in the result
            foreach (Word word in newWords)
            {
                bool wordIdUpdated = false;
                foreach (WordId id in result.WordIDs)
                {
                    if (id.Text == word.Text)
                    {
                        word.ID = id.ID;
                        wordIdUpdated = true;
                        break;
                    }
                }
                if (!wordIdUpdated)
                {
                    throw new InvalidOperationException("No ID for new word");
                }
            }

            return count;
        }

        /// <summary>
        /// Registers a word that has just been created by <c>StoreWord</c>: the temporary ID is assigned to the
        /// mapping, to the dumped word and to the word itself, and both objects are queued in the given lists.
        /// </summary>
        /// <param name="mapping">The mapping returned by <c>StoreWord</c> for the occurrence.</param>
        /// <param name="newWord">The created word, or <c>null</c> if the word already existed.</param>
        /// <param name="dumpedWord">The dumped data of the created word, or <c>null</c> if the word already existed.</param>
        /// <param name="temporaryId">The temporary ID to assign.</param>
        /// <param name="dumpedWords">The list that collects the dumped data of new words.</param>
        /// <param name="newWords">The list that collects the new words.</param>
        /// <returns><c>true</c> if the word was registered (the temporary ID is now in use), <c>false</c> if there was no new word.</returns>
        private static bool TrackNewWord(DumpedWordMapping mapping, Word newWord, DumpedWord dumpedWord, uint temporaryId,
            List<DumpedWord> dumpedWords, List<Word> newWords)
        {
            if (dumpedWord == null || newWord == null)
            {
                return false;
            }

            mapping.WordID = temporaryId;
            dumpedWord.ID = temporaryId;
            dumpedWords.Add(dumpedWord);
            newWord.ID = temporaryId;
            newWords.Add(newWord);
            return true;
        }
Пример #10
0
        /// <summary>
        /// Initializes index data by completely emptying the index catalog and storing the specified data.
        /// </summary>
        /// <param name="documents">The documents.</param>
        /// <param name="words">The words.</param>
        /// <param name="mappings">The mappings.</param>
        /// <remarks>The method <b>does not</b> check the consistency of the data passed as arguments.
        /// Documents for which the BuildDocument delegate returns <c>null</c> are left out, and mappings that
        /// refer to an unknown word or to an unavailable document are silently skipped.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="documents"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">If <see cref="M:SetBuildDocumentDelegate"/> was not called.</exception>
        public void InitializeData(DumpedDocument[] documents, DumpedWord[] words, DumpedWordMapping[] mappings)
        {
            if (documents == null)
            {
                throw new ArgumentNullException("documents");
            }
            if (words == null)
            {
                throw new ArgumentNullException("words");
            }
            if (mappings == null)
            {
                throw new ArgumentNullException("mappings");
            }

            if (buildDocument == null)
            {
                throw new InvalidOperationException("InitializeData can be invoked only when the BuildDocument delegate is set");
            }

            lock (this) {
                catalog.Clear();
                catalog = new Dictionary<string, Word>(words.Length);

                // 1. Prepare a dictionary with all available documents for use in the last step
                Dictionary<uint, IDocument> tempDocuments = new Dictionary<uint, IDocument>(documents.Length);
                foreach (DumpedDocument doc in documents)
                {
                    IDocument builtDoc = buildDocument(doc);

                    // Null means that the document no longer exists - leave it out, so that
                    // the mappings that refer to it are skipped in the last step
                    if (builtDoc != null)
                    {
                        tempDocuments.Add(doc.ID, builtDoc);
                    }
                }

                // 2. Load words into the catalog, keeping track of them by ID in a dictionary for the next step
                Dictionary<ulong, Word> tempWords = new Dictionary<ulong, Word>(words.Length);
                foreach (DumpedWord w in words)
                {
                    Word word = new Word(w.ID, w.Text);
                    tempWords.Add(w.ID, word);
                    catalog.Add(w.Text, word);
                }

                // 3. Add mappings, skipping those that refer to an unknown word or to an unavailable document
                // (dictionary lookups replace the former linear list scan and exception-based skipping)
                foreach (DumpedWordMapping map in mappings)
                {
                    Word targetWord;
                    if (!tempWords.TryGetValue(map.WordID, out targetWord))
                    {
                        continue;
                    }

                    IDocument targetDocument;
                    if (!tempDocuments.TryGetValue(map.DocumentID, out targetDocument))
                    {
                        continue;
                    }

                    targetWord.AddOccurrence(targetDocument,
                                             map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location));
                }
            }
        }