Represents a word structured for easy dumping to disk or database.
The class is not thread-safe.
        /// <summary>
        /// Records one occurrence of a word, adding the word to the catalog first if it is not there yet.
        /// </summary>
        /// <param name="wordText">The text of the word; it is lower-cased with the invariant culture before the lookup.</param>
        /// <param name="document">The document in which the word occurs.</param>
        /// <param name="firstCharIndex">The index, in the document, of the first character of this occurrence.</param>
        /// <param name="wordIndex">The index of the word in the document.</param>
        /// <param name="location">The location of the word.</param>
        /// <param name="newWord">Receives the freshly created word when the catalog did not contain it, <c>null</c> otherwise.</param>
        /// <param name="dumpedWord">Receives the dumped data of the freshly created word, or <c>null</c> when the word already existed.</param>
        /// <returns>The dumped mapping that describes this occurrence.</returns>
        /// <remarks>A word that is created here gets the placeholder ID <c>0</c>; the caller is expected to replace it
        /// once the word has been stored and its real ID is known. The catalog lookup, the optional insertion and the
        /// registration of the occurrence all run while holding the lock on this instance.</remarks>
        protected DumpedWordMapping StoreWord(string wordText, IDocument document, ushort firstCharIndex, ushort wordIndex,
                                              WordLocation location, out Word newWord, out DumpedWord dumpedWord)
        {
            string normalizedText = wordText.ToLower(CultureInfo.InvariantCulture);

            lock (this)
            {
                Word entry = null;

                if (catalog.TryGetValue(normalizedText, out entry))
                {
                    // Known word: nothing new to report to the caller
                    newWord    = null;
                    dumpedWord = null;
                }
                else
                {
                    // Unknown word: create it with ID zero (to be updated once the word has been stored)
                    // and hand the new instance back to the caller through the out parameters
                    entry = new Word(0, normalizedText);
                    catalog.Add(normalizedText, entry);
                    newWord    = entry;
                    dumpedWord = new DumpedWord(entry);
                }

                entry.AddOccurrence(document, firstCharIndex, wordIndex, location);

                return new DumpedWordMapping(entry.ID, document.ID, firstCharIndex, wordIndex, location.Location);
            }
        }
Exemple #2
0
 /// <summary>
 /// Checks whether a list holds an entry with the same ID and the same text as a given <see cref="DumpedWord" />.
 /// </summary>
 /// <param name="word">The word to look for.</param>
 /// <param name="list">The list to scan.</param>
 /// <returns><c>true</c> if an entry with matching ID and text is found, <c>false</c> otherwise.</returns>
 protected static bool Find(DumpedWord word, IEnumerable<DumpedWord> list)
 {
     foreach (DumpedWord candidate in list)
     {
         // The text is compared only when the IDs already agree
         if (candidate.ID != word.ID)
         {
             continue;
         }
         if (candidate.Text == word.Text)
         {
             return true;
         }
     }
     return false;
 }
Exemple #3
0
        /// <summary>
        /// Writes a <see cref="DumpedWord" /> to a <see cref="BinaryWriter" />.
        /// </summary>
        /// <param name="writer">The <see cref="BinaryWriter" /> that receives the data.</param>
        /// <param name="word">The <see cref="DumpedWord" /> to write.</param>
        /// <remarks>The ID is written first and the text second; whatever reads the data back
        /// must consume the two fields in that same order.</remarks>
        private static void WriteDumpedWord(BinaryWriter writer, DumpedWord word)
        {
            // Disabled guard: a word with empty text is currently accepted and written as-is
            //if(word.Text.Length == 0) throw new InvalidOperationException();

            writer.Write(word.ID);
            writer.Write(word.Text);
        }
Exemple #4
0
        /// <summary>
        /// Reads the dumped documents, words and word mappings from their backing files.
        /// </summary>
        /// <param name="documents">Receives the dumped documents.</param>
        /// <param name="words">Receives the dumped words.</param>
        /// <param name="mappings">Receives the dumped word mappings.</param>
        /// <remarks>While loading, the highest document ID and the highest word ID are tracked so that
        /// <c>firstFreeDocumentId</c> and <c>firstFreeWordId</c> can be set to the value right after them
        /// (<c>1</c> when the corresponding file holds no entries). Each file is opened read-only
        /// with <see cref="FileShare.None" />.</remarks>
        protected override void LoadIndexInternal(out DumpedDocument[] documents, out DumpedWord[] words, out DumpedWordMapping[] mappings)
        {
            // Step 1: documents
            using(FileStream documentsStream = new FileStream(documentsFile, FileMode.Open, FileAccess.Read, FileShare.None))
            {
                // The entry count is read straight from the stream, before a reader is layered on top of it
                int documentCount = ReadCount(documentsStream);
                BinaryReader documentsReader = new BinaryReader(documentsStream, Encoding.UTF8);
                uint highestDocumentId = 0;

                documents = new DumpedDocument[documentCount];
                for(int d = 0; d < documentCount; d++)
                {
                    documents[d] = ReadDumpedDocument(documentsReader);
                    if(highestDocumentId < documents[d].ID)
                    {
                        highestDocumentId = documents[d].ID;
                    }
                }

                firstFreeDocumentId = highestDocumentId + 1;
            }

            // Step 2: words
            using(FileStream wordsStream = new FileStream(wordsFile, FileMode.Open, FileAccess.Read, FileShare.None))
            {
                int wordCount = ReadCount(wordsStream);
                BinaryReader wordsReader = new BinaryReader(wordsStream, Encoding.UTF8);
                uint highestWordId = 0;

                words = new DumpedWord[wordCount];
                for(int w = 0; w < wordCount; w++)
                {
                    words[w] = ReadDumpedWord(wordsReader);
                    if(highestWordId < words[w].ID)
                    {
                        highestWordId = words[w].ID;
                    }
                }

                firstFreeWordId = highestWordId + 1;
            }

            // Step 3: mappings (no ID bookkeeping needed here)
            using(FileStream mappingsStream = new FileStream(mappingsFile, FileMode.Open, FileAccess.Read, FileShare.None))
            {
                int mappingCount = ReadCount(mappingsStream);
                BinaryReader mappingsReader = new BinaryReader(mappingsStream, Encoding.UTF8);

                mappings = new DumpedWordMapping[mappingCount];
                for(int m = 0; m < mappingCount; m++)
                {
                    mappings[m] = ReadDumpedWordMapping(mappingsReader);
                }
            }
        }
        /// <summary>
        /// Adds an occurrence of a word to the catalog, creating the catalog entry when the word is not known yet.
        /// </summary>
        /// <param name="wordText">The word to store; the catalog key is its invariant-culture lower-case form.</param>
        /// <param name="document">The document the word occurs in.</param>
        /// <param name="firstCharIndex">The index of the first character of the word in the document.</param>
        /// <param name="wordIndex">The index of the word in the document.</param>
        /// <param name="location">The location of the word.</param>
        /// <param name="newWord">Set to the created word if a catalog entry had to be created, to <c>null</c> otherwise.</param>
        /// <param name="dumpedWord">Set to the dumped data of the created word, or to <c>null</c> if no word was created.</param>
        /// <returns>The dumped word mapping data for the stored occurrence.</returns>
        /// <remarks>Created words carry the ID <c>0</c> until the caller assigns the definitive one.
        /// The whole catalog access is performed while holding the lock on this instance.</remarks>
        protected DumpedWordMapping StoreWord(string wordText, IDocument document, ushort firstCharIndex, ushort wordIndex,
            WordLocation location, out Word newWord, out DumpedWord dumpedWord)
        {
            string key = wordText.ToLower(CultureInfo.InvariantCulture);

            lock(this) {
                Word target = null;
                bool alreadyKnown = catalog.TryGetValue(key, out target);

                if(!alreadyKnown) {
                    // ZERO is only a placeholder ID: it is updated once the word has been stored,
                    // which is why the new instance is handed back to the caller below
                    target = new Word(0, key);
                    catalog.Add(key, target);
                }

                newWord = alreadyKnown ? null : target;
                dumpedWord = alreadyKnown ? null : new DumpedWord(target);

                target.AddOccurrence(document, firstCharIndex, wordIndex, location);

                DumpedWordMapping mapping = new DumpedWordMapping(target.ID, document.ID, firstCharIndex, wordIndex, location.Location);
                return mapping;
            }
        }
        /// <summary>
        /// Initializes index data by completely emptying the index catalog and storing the specified data.
        /// </summary>
        /// <param name="documents">The documents.</param>
        /// <param name="words">The words.</param>
        /// <param name="mappings">The mappings.</param>
        /// <remarks>The method <b>does not</b> check the consistency of the data passed as arguments.
        /// Documents for which the BuildDocument delegate returns <c>null</c> are treated as no longer existing.
        /// Mappings that refer to such documents, to documents that are not part of <paramref name="documents"/>
        /// or to words that are not part of <paramref name="words"/> are silently skipped.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="documents"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">If <see cref="M:SetBuildDocumentDelegate"/> was not called.</exception>
        public void InitializeData(DumpedDocument[] documents, DumpedWord[] words, DumpedWordMapping[] mappings)
        {
            if(documents == null) throw new ArgumentNullException("documents");
            if(words == null) throw new ArgumentNullException("words");
            if(mappings == null) throw new ArgumentNullException("mappings");

            if(buildDocument == null) throw new InvalidOperationException("InitializeData can be invoked only when the BuildDocument delegate is set");

            lock(this) {
                // The old dictionary is emptied before being replaced by a fresh, pre-sized one
                catalog.Clear();
                catalog = new Dictionary<string, Word>(words.Length);

                // IDs of documents that are missing; a hash set keeps the per-mapping membership test O(1)
                HashSet<uint> missingDocuments = new HashSet<uint>();

                // 1. Prepare a dictionary with all documents for use in the last step
                Dictionary<uint, IDocument> tempDocuments = new Dictionary<uint, IDocument>(documents.Length);
                foreach(DumpedDocument doc in documents) {
                    IDocument builtDoc = buildDocument(doc);
                    // Null means that the document no longer exists - silently skip it
                    if(builtDoc != null) {
                        tempDocuments.Add(doc.ID, builtDoc);
                    }
                    else {
                        missingDocuments.Add(doc.ID);
                    }
                }

                // 2. Load words into the catalog, keeping track of them by ID in a dictionary for the next step
                Dictionary<ulong, Word> tempWords = new Dictionary<ulong, Word>(words.Length);
                foreach(DumpedWord w in words) {
                    Word word = new Word(w.ID, w.Text);
                    tempWords.Add(w.ID, word);
                    catalog.Add(w.Text, word);
                }

                // 3. Add mappings and documents
                foreach(DumpedWordMapping map in mappings) {
                    // Skip mappings that refer to missing documents
                    if(missingDocuments.Contains(map.DocumentID)) continue;

                    // Gracefully skip unknown words and unknown documents; explicit lookups are used
                    // instead of catching KeyNotFoundException so that failures raised while adding
                    // the occurrence itself are no longer hidden
                    Word mappedWord;
                    if(!tempWords.TryGetValue(map.WordID, out mappedWord)) continue;

                    IDocument mappedDocument;
                    if(!tempDocuments.TryGetValue(map.DocumentID, out mappedDocument)) continue;

                    mappedWord.AddOccurrence(mappedDocument,
                        map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location));
                }
            }
        }
 /// <summary>
 /// Loads the index from the data store the first time.
 /// </summary>
 /// <param name="documents">Receives the dumped documents.</param>
 /// <param name="words">Receives the dumped words.</param>
 /// <param name="mappings">Receives the dumped word mappings.</param>
 /// <remarks>All three parameters are <c>out</c> parameters, so every implementation
 /// has to assign each of them before returning.</remarks>
 protected abstract void LoadIndexInternal(out DumpedDocument[] documents, out DumpedWord[] words, out DumpedWordMapping[] mappings);
 /// <summary>
 /// Tells whether a list contains a <see cref="DumpedWord" /> whose ID and text both equal those of the given word.
 /// </summary>
 /// <param name="word">The word to search for.</param>
 /// <param name="list">The list to search in.</param>
 /// <returns><c>true</c> if a matching word is contained in the list, <c>false</c> otherwise.</returns>
 protected static bool Find(DumpedWord word, IEnumerable<DumpedWord> list)
 {
     bool found = false;
     foreach(DumpedWord item in list) {
         // Stop scanning at the first entry that matches on both ID and text
         found = item.ID == word.ID && item.Text == word.Text;
         if(found) break;
     }
     return found;
 }
        /// <summary>
        /// Stores a document in the index.
        /// </summary>
        /// <param name="document">The document.</param>
        /// <param name="keywords">The document keywords, if any, an empty array or <c>null</c> otherwise.</param>
        /// <param name="content">The content of the document.</param>
        /// <param name="state">A state object that is passed along with the index-change notifications
        /// (IndexStorer SaveData/DeleteData).</param>
        /// <returns>The number of indexed words (including duplicates) in the document content, title and keywords,
        /// counted after stop-word removal and keyword cleanup; <c>0</c> if the index-change notification
        /// returned no result or no document ID.</returns>
        /// <remarks>If the specified document was already in the index, all the old occurrences
        /// are deleted from the index before the new ones are stored.
        /// Words that are new to the catalog receive temporary IDs (counting down from <see cref="uint.MaxValue" />)
        /// which are replaced, by matching on the word text, with the IDs returned by the index-change notification.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="document"/> or <paramref name="content"/> are <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">If the index-change result contains no ID for one of the new words.</exception>
        public int StoreDocument(IDocument document, string[] keywords, string content, object state)
        {
            if (document == null)
            {
                throw new ArgumentNullException("document");
            }
            if (keywords == null)
            {
                keywords = new string[0];
            }
            if (content == null)
            {
                throw new ArgumentNullException("content");
            }

            // Drop any previous version of the document and notify the removal
            lock (this) {
                DumpedChange removeChange = RemoveDocumentInternal(document);

                if (removeChange != null)
                {
                    OnIndexChange(document, IndexChangeType.DocumentRemoved, removeChange, state);
                }
            }

            keywords = Tools.CleanupKeywords(keywords);

            // When the IndexStorer handles the IndexChanged event and a document is added, the storer generates a new ID and returns it
            // via the event handler, then the in-memory index is updated (the document instance is shared across all words) - the final ID
            // is generated by the actual IndexStorer implementation (SaveData properly populates the Result field in the args)

            List<DumpedWord>        dw       = new List<DumpedWord>(content.Length / 5);
            List<DumpedWordMapping> dm       = new List<DumpedWordMapping>(content.Length / 5);
            List<Word>              newWords = new List<Word>(50);

            int  count            = 0;
            uint sequentialWordId = uint.MaxValue;

            // Store content words
            WordInfo[] words = document.Tokenize(content);
            words = Tools.RemoveStopWords(words, stopWords);

            foreach (WordInfo info in words)
            {
                StoreWordAndTrackNew(info.Text, document, info.FirstCharIndex, info.WordIndex, WordLocation.Content,
                                     dw, dm, newWords, ref sequentialWordId);
            }
            count += words.Length;

            // Store title words
            words = document.Tokenize(document.Title);
            words = Tools.RemoveStopWords(words, stopWords);

            foreach (WordInfo info in words)
            {
                StoreWordAndTrackNew(info.Text, document, info.FirstCharIndex, info.WordIndex, WordLocation.Title,
                                     dw, dm, newWords, ref sequentialWordId);
            }
            count += words.Length;

            // Store keywords; each keyword's first-char index is the running total of the
            // preceding keywords' lengths plus one per keyword
            ushort tempCount = 0;

            for (ushort i = 0; i < (ushort)keywords.Length; i++)
            {
                StoreWordAndTrackNew(keywords[i], document, tempCount, i, WordLocation.Keywords,
                                     dw, dm, newWords, ref sequentialWordId);
                tempCount += (ushort)(1 + keywords[i].Length);
            }
            count += keywords.Length;

            IndexStorerResult result = OnIndexChange(document, IndexChangeType.DocumentAdded,
                                                     new DumpedChange(new DumpedDocument(document), dw, dm), state);

            if (result == null || !result.DocumentID.HasValue)
            {
                // HACK: result is null -> index is corrupted, silently return
                return 0;
            }

            // Update document ID
            document.ID = result.DocumentID.Value;

            // Update word IDs in newWords: the first returned ID with the same text wins
            foreach (Word word in newWords)
            {
                bool wordIdUpdated = false;

                foreach (WordId id in result.WordIDs)
                {
                    if (id.Text == word.Text)
                    {
                        word.ID       = id.ID;
                        wordIdUpdated = true;
                        break;
                    }
                }

                if (!wordIdUpdated)
                {
                    throw new InvalidOperationException("No ID for new word");
                }
            }

            return count;
        }

        /// <summary>
        /// Stores one word occurrence through <c>StoreWord</c> and, when that created a new catalog word,
        /// tags the word, its dumped data and its mapping with the current temporary ID and records them.
        /// </summary>
        /// <param name="wordText">The word to store.</param>
        /// <param name="document">The document the word occurs in.</param>
        /// <param name="firstCharIndex">The index of the first character of the word.</param>
        /// <param name="wordIndex">The index of the word.</param>
        /// <param name="location">The location of the word.</param>
        /// <param name="dumpedWords">The list that collects the dumped data of new words.</param>
        /// <param name="dumpedMappings">The list that collects the mapping of every stored occurrence.</param>
        /// <param name="newWords">The list that collects the new catalog words whose ID must be finalized later.</param>
        /// <param name="temporaryWordId">The next temporary ID to hand out; decremented each time a new word is tracked.</param>
        private void StoreWordAndTrackNew(string wordText, IDocument document, ushort firstCharIndex, ushort wordIndex,
                                          WordLocation location, List<DumpedWord> dumpedWords,
                                          List<DumpedWordMapping> dumpedMappings, List<Word> newWords,
                                          ref uint temporaryWordId)
        {
            Word       createdWord;
            DumpedWord createdDumpedWord;

            DumpedWordMapping mapping = StoreWord(wordText, document, firstCharIndex, wordIndex, location,
                                                  out createdWord, out createdDumpedWord);
            dumpedMappings.Add(mapping);

            if (createdDumpedWord == null || createdWord == null)
            {
                // The word was already in the catalog: the mapping already carries its ID
                return;
            }

            mapping.WordID       = temporaryWordId;
            createdDumpedWord.ID = temporaryWordId;
            dumpedWords.Add(createdDumpedWord);
            createdWord.ID = temporaryWordId;
            newWords.Add(createdWord);
            temporaryWordId--;
        }