Contains word mapping data, structured for easy dumping to disk or to a database.
The class is not thread-safe.
Esempio n. 1
0
 /// <summary>
 /// Checks whether a list contains an entry that matches a <see cref="DumpedWordMapping" /> field by field.
 /// </summary>
 /// <param name="mapping">The mapping to look for.</param>
 /// <param name="list">The mappings to search.</param>
 /// <returns><c>true</c> if an entry with the same word ID, document ID, first character index,
 /// word index and location is found, <c>false</c> otherwise.</returns>
 protected static bool Find(DumpedWordMapping mapping, IEnumerable <DumpedWordMapping> list)
 {
     foreach (DumpedWordMapping candidate in list)
     {
         // Discard the candidate as soon as one field differs
         if (candidate.WordID != mapping.WordID) continue;
         if (candidate.DocumentID != mapping.DocumentID) continue;
         if (candidate.FirstCharIndex != mapping.FirstCharIndex) continue;
         if (candidate.WordIndex != mapping.WordIndex) continue;
         if (candidate.Location != mapping.Location) continue;

         // Every compared field matches
         return true;
     }

     return false;
 }
Esempio n. 2
0
 /// <summary>
 /// Serializes a <see cref="DumpedWordMapping" /> through a <see cref="BinaryWriter" />.
 /// </summary>
 /// <remarks>The fields are written in a fixed order: word ID, document ID, first character index,
 /// word index, location.</remarks>
 /// <param name="writer">The <see cref="BinaryWriter" /> that receives the data.</param>
 /// <param name="mapping">The <see cref="DumpedWordMapping" /> to write.</param>
 private static void WriteDumpedWordMapping(BinaryWriter writer, DumpedWordMapping mapping) {
     // Identifiers of the word and of the document it occurs in
     writer.Write(mapping.WordID);
     writer.Write(mapping.DocumentID);

     // Position of the occurrence
     writer.Write(mapping.FirstCharIndex);
     writer.Write(mapping.WordIndex);
     writer.Write(mapping.Location);
 }
Esempio n. 3
0
        /// <summary>
        /// Performs the initial load of the index from the documents, words and mappings files.
        /// </summary>
        /// <remarks>While loading, the highest document ID and word ID found are used to set
        /// <c>firstFreeDocumentId</c> and <c>firstFreeWordId</c> to the next value.</remarks>
        /// <param name="documents">Receives the dumped documents.</param>
        /// <param name="words">Receives the dumped words.</param>
        /// <param name="mappings">Receives the dumped word mappings.</param>
        protected override void LoadIndexInternal(out DumpedDocument[] documents, out DumpedWord[] words, out DumpedWordMapping[] mappings)
        {
            // 1. Documents
            using(FileStream documentsStream = new FileStream(documentsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
                int documentCount = ReadCount(documentsStream);
                BinaryReader documentsReader = new BinaryReader(documentsStream, Encoding.UTF8);
                documents = new DumpedDocument[documentCount];

                uint highestDocumentId = 0;
                for(int index = 0; index < documentCount; index++) {
                    DumpedDocument current = ReadDumpedDocument(documentsReader);
                    documents[index] = current;
                    if(current.ID > highestDocumentId) {
                        highestDocumentId = current.ID;
                    }
                }

                // The next ID to hand out follows the highest one in use
                firstFreeDocumentId = highestDocumentId + 1;
            }

            // 2. Words
            using(FileStream wordsStream = new FileStream(wordsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
                int wordCount = ReadCount(wordsStream);
                BinaryReader wordsReader = new BinaryReader(wordsStream, Encoding.UTF8);
                words = new DumpedWord[wordCount];

                uint highestWordId = 0;
                for(int index = 0; index < wordCount; index++) {
                    DumpedWord current = ReadDumpedWord(wordsReader);
                    words[index] = current;
                    if(current.ID > highestWordId) {
                        highestWordId = current.ID;
                    }
                }

                firstFreeWordId = highestWordId + 1;
            }

            // 3. Mappings (no ID bookkeeping needed here)
            using(FileStream mappingsStream = new FileStream(mappingsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
                int mappingCount = ReadCount(mappingsStream);
                BinaryReader mappingsReader = new BinaryReader(mappingsStream, Encoding.UTF8);
                mappings = new DumpedWordMapping[mappingCount];

                for(int index = 0; index < mappingCount; index++) {
                    mappings[index] = ReadDumpedWordMapping(mappingsReader);
                }
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Initializes index data by completely emptying the index catalog and storing the specified data.
        /// </summary>
        /// <param name="documents">The documents.</param>
        /// <param name="words">The words.</param>
        /// <param name="mappings">The mappings.</param>
        /// <remarks>The method <b>does not</b> check the consistency of the data passed as arguments.
        /// Documents for which the BuildDocument delegate returns <c>null</c> are treated as no longer
        /// existing; mappings that refer to such documents, or to an unknown document or word ID,
        /// are silently skipped.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="documents"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">If <see cref="M:SetBuildDocumentDelegate"/> was not called.</exception>
        public void InitializeData(DumpedDocument[] documents, DumpedWord[] words, DumpedWordMapping[] mappings)
        {
            if(documents == null) throw new ArgumentNullException("documents");
            if(words == null) throw new ArgumentNullException("words");
            if(mappings == null) throw new ArgumentNullException("mappings");

            if(buildDocument == null) throw new InvalidOperationException("InitializeData can be invoked only when the BuildDocument delegate is set");

            lock(this) {
                catalog.Clear();
                catalog = new Dictionary<string, Word>(words.Length);

                // IDs of the documents that the BuildDocument delegate reported as missing.
                // A set keeps the membership test in step 3 constant-time per mapping.
                HashSet<uint> missingDocuments = new HashSet<uint>();

                // 1. Prepare a dictionary with all documents for use in the last step
                Dictionary<uint, IDocument> tempDocuments = new Dictionary<uint, IDocument>(documents.Length);
                foreach(DumpedDocument doc in documents) {
                    IDocument builtDoc = buildDocument(doc);
                    // Null means that the document no longer exists - silently skip it
                    if(builtDoc != null) {
                        tempDocuments.Add(doc.ID, builtDoc);
                    }
                    else {
                        missingDocuments.Add(doc.ID);
                    }
                }

                // 2. Load words into the catalog, keeping track of them by ID in a dictionary for the next step
                Dictionary<ulong, Word> tempWords = new Dictionary<ulong, Word>(words.Length);
                foreach(DumpedWord w in words) {
                    Word word = new Word(w.ID, w.Text);
                    tempWords.Add(w.ID, word);
                    catalog.Add(w.Text, word);
                }

                // 3. Add mappings and documents
                foreach(DumpedWordMapping map in mappings) {
                    // Skip mappings that refer to missing documents
                    if(missingDocuments.Contains(map.DocumentID)) continue;

                    // Gracefully skip unknown words and unknown documents. Explicit lookups replace the
                    // former catch of KeyNotFoundException, so exceptions raised while adding the
                    // occurrence itself are no longer swallowed.
                    Word mappedWord;
                    if(!tempWords.TryGetValue(map.WordID, out mappedWord)) continue;

                    IDocument mappedDocument;
                    if(!tempDocuments.TryGetValue(map.DocumentID, out mappedDocument)) continue;

                    mappedWord.AddOccurrence(mappedDocument,
                        map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location));
                }
            }
        }
Esempio n. 5
0
 /// <summary>
 /// Loads the index from the data store the first time.
 /// </summary>
 /// <remarks>Implementations must assign all three <c>out</c> arrays before returning.</remarks>
 /// <param name="documents">Receives the dumped documents.</param>
 /// <param name="words">Receives the dumped words.</param>
 /// <param name="mappings">Receives the dumped word mappings.</param>
 protected abstract void LoadIndexInternal(out DumpedDocument[] documents, out DumpedWord[] words, out DumpedWordMapping[] mappings);
Esempio n. 6
0
 /// <summary>
 /// Tells whether a list holds an entry whose fields all equal those of a <see cref="DumpedWordMapping" />.
 /// </summary>
 /// <param name="mapping">The mapping to search for.</param>
 /// <param name="list">The list to scan.</param>
 /// <returns><c>true</c> if an entry with the same word ID, document ID, first character index,
 /// word index and location exists in the list, <c>false</c> otherwise.</returns>
 protected static bool Find(DumpedWordMapping mapping, IEnumerable<DumpedWordMapping> list)
 {
     foreach(DumpedWordMapping entry in list) {
         // An entry is a mismatch as soon as any single field differs
         bool differs =
             entry.WordID != mapping.WordID ||
             entry.DocumentID != mapping.DocumentID ||
             entry.FirstCharIndex != mapping.FirstCharIndex ||
             entry.WordIndex != mapping.WordIndex ||
             entry.Location != mapping.Location;

         if(!differs) {
             return true;
         }
     }

     return false;
 }