/// <summary>
/// Returns the fingerprints of the documents listed in the index entry for <paramref name="word"/>.
/// </summary>
/// <param name="word">
/// The word to look up.  It is lower-cased and, when use_make_reasonable_word is set, passed through
/// ReasonableWord.MakeReasonableWord before the lookup.  A null word is tolerated.
/// </param>
/// <returns>
/// A newly allocated set of document fingerprints, or EMPTY_DOCUMENT_SET when the word is null,
/// normalises to null, or has no entry in the index.
/// </returns>
public HashSet <string> GetDocumentsWithWord(string word)
        {
            // Guard before normalising: calling ToLower() on a null word would throw.
            // This matches the null handling in Search().
            if (null == word)
            {
                return(EMPTY_DOCUMENT_SET);
            }

            word = word.ToLower();
            if (use_make_reasonable_word)
            {
                word = ReasonableWord.MakeReasonableWord(word);
            }
            // The null check after normalisation covers MakeReasonableWord handing back null
            if (null == word)
            {
                return(EMPTY_DOCUMENT_SET);
            }

            lock (locker)
            {
                // Query only: do not create an index entry for an unknown word
                WordInWordIndex word_in_word_index = GetWordInWordIndex_LOCKER(word, false);
                if (null == word_in_word_index)
                {
                    return(EMPTY_DOCUMENT_SET);
                }

                List <int>       document_ids          = word_in_word_index.DocumentIds;
                HashSet <string> document_fingerprints = new HashSet <string>();
                foreach (int document_id in document_ids)
                {
                    // Document ids with no known fingerprint are silently skipped
                    string document_fingerprint;
                    if (document_id_to_fingerprints.TryGetValue(document_id, out document_fingerprint))
                    {
                        document_fingerprints.Add(document_fingerprint);
                    }
                }

                return(document_fingerprints);
            }
        }
        /// <summary>
        /// Reads the gang file that contains <paramref name="wiwi"/> and assigns the stored document id lists
        /// to every word of that gang.  Does nothing when the word is already loaded.
        /// If anything goes wrong the error is logged and <paramref name="wiwi"/> is given empty lists.
        /// NOTE(review): the _LOCK suffix suggests the caller must already hold the lock - confirm.
        /// </summary>
        /// <param name="wiwi">The word whose gang should be loaded.</param>
        void LoadWord_LOCK(WordInWordIndex wiwi)
        {
            if (wiwi.IsLoaded)
            {
                // Already in memory
                return;
            }

            try
            {
                string gang_filename = Filename_GangList(wiwi.WordId);
                using (FileStream gang_stream = File.OpenRead(gang_filename))
                {
                    bool any_count_mismatch = false;

                    List<WordEntry> loaded_entries = Serializer.Deserialize<List<WordEntry>>(gang_stream);

                    // The n-th entry in the file belongs to the n-th word of the gang
                    int first_index = GangStart(wiwi.WordId);
                    for (int offset = 0; offset < loaded_entries.Count; ++offset)
                    {
                        WordInWordIndex gang_member = word_in_word_indexes[first_index + offset];
                        WordEntry       entry       = loaded_entries[offset];

                        if (0 != String.Compare(gang_member.Word, entry.Word))
                        {
                            throw new Exception("The ordering of the word index is corrupt: words don't match");
                        }

                        // A gang member that already holds ids is unexpected, but only worth a warning
                        if (null != gang_member.DocumentIds)
                        {
                            Logging.Warn("The ordering of the word index is corrupt: document_ids should be null");
                        }

                        // SetDocumentIds reports whether it detected corruption; remember if any member did
                        any_count_mismatch |= gang_member.SetDocumentIds(entry.DocumentIds, entry.DocumentIdsCount);
                    }

                    if (any_count_mismatch)
                    {
                        Logging.Warn("The ordering of a word index in the gang is corrupt: doc counts don't match (the user probably exited before the gang was saved...)");
                    }
                }
            }
            catch (Exception ex)
            {
                Logging.Error(ex, "There was a problem loading the word document list for word {0}:{1}.  Assuming it was empty.", wiwi.WordId, wiwi.Word);

                // Fall back to empty lists for the requested word; the result flag is not needed here
                wiwi.SetDocumentIds(new List<int>(), new List<int>());
            }
        }
Exemple #3
0
        /// <summary>
        /// Reads the master list file and fills the document and word lookup tables from it while holding the lock.
        /// Any exception raised while reading is logged as a warning and swallowed.
        /// </summary>
        private void ReadMasterList()
        {
            Logging.Info("+ReadMasterList");

            Utilities.LockPerfTimer lock_timer = Utilities.LockPerfChecker.Start();
            lock (locker)
            {
                lock_timer.LockPerfTimerStop();
                try
                {
                    using (FileStream master_stream = File.OpenRead(GetFilename_MasterList()))
                    {
                        Headers master = Serializer.Deserialize<Headers>(master_stream);

                        // Documents: keep both directions of the fingerprint <-> id mapping
                        foreach (var document_header in master.documents)
                        {
                            fingerprint_to_document_ids[document_header.Fingerprint] = document_header.DocumentId;
                            document_id_to_fingerprints[document_header.DocumentId] = document_header.Fingerprint;
                        }

                        // Words: each word id must equal the position the word is about to take in the list
                        foreach (var word_header in master.words)
                        {
                            WordInWordIndex entry = new WordInWordIndex(word_header.Word, word_header.WordId, word_header.DocCount);

                            if (word_in_word_indexes.Count != entry.WordId)
                            {
                                throw new Exception("The ordering of the word index is corrupt");
                            }

                            word_in_word_indexes.Add(entry);
                            word_in_word_index_lookups[entry.Word] = entry;
                        }
                    }
                }
                catch (Exception ex)
                {
                    // NOTE(review): entries added before the failure are left in the tables - confirm that is intended
                    Logging.Warn(ex, "Unable to load index master list, so starting from scratch");
                }
            }

            Logging.Info("-ReadMasterList");
        }
Exemple #4
0
        /// <summary>
        /// Looks up <paramref name="word"/> in the index and reports, per document fingerprint, the count stored for it.
        /// </summary>
        /// <param name="word">
        /// The word to search for.  It is stored unmodified in the result; the lookup itself uses the lower-cased
        /// (and, when use_make_reasonable_word is set, MakeReasonableWord-processed) form.
        /// </param>
        /// <returns>
        /// A SearchResult whose word field is always set.  The counts are only filled in when the word is non-null,
        /// survives normalisation and is present in the index.
        /// </returns>
        public SearchResult Search(string word)
        {
            SearchResult result = new SearchResult { word = word };

            if (null == word)
            {
                return result;
            }

            string normalised_word = word.ToLower();
            if (use_make_reasonable_word)
            {
                normalised_word = ReasonableWord.MakeReasonableWord(normalised_word);
            }
            if (null == normalised_word)
            {
                return result;
            }

            Utilities.LockPerfTimer lock_timer = Utilities.LockPerfChecker.Start();
            lock (locker)
            {
                lock_timer.LockPerfTimerStop();

                // Query only: an unknown word must not create an index entry
                WordInWordIndex entry = GetWordInWordIndex_LOCKER(normalised_word, false);
                if (null == entry)
                {
                    return result;
                }

                result.doc_count = entry.DocumentCount;

                // DocumentIds and DocumentIdsCount are read at the same position
                List<int> ids = entry.DocumentIds;
                for (int position = 0; position < ids.Count; ++position)
                {
                    int id          = ids[position];
                    int occurrences = entry.DocumentIdsCount[position];

                    // Ids with no known fingerprint are left out of the result
                    string fingerprint;
                    if (document_id_to_fingerprints.TryGetValue(id, out fingerprint))
                    {
                        result.doc_counts[fingerprint] = occurrences;
                    }
                }

                return result;
            }
        }
Exemple #5
0
        /// <summary>
        /// Records one occurrence of <paramref name="word"/> for the document identified by
        /// <paramref name="document_fingerprint"/>, allocating a document id and an index entry where needed.
        /// </summary>
        /// <param name="document_fingerprint">Fingerprint of the document that contains the word.</param>
        /// <param name="word">
        /// The word to index.  It is lower-cased and, when use_make_reasonable_word is set, passed through
        /// ReasonableWord.MakeReasonableWord; if that yields null the call returns without indexing anything.
        /// </param>
        /// <exception cref="Exception">
        /// Thrown when <paramref name="word"/> is null or empty, or when no index entry could be obtained for it.
        /// </exception>
        public void AddDocumentWord(string document_fingerprint, string word)
        {
            if (String.IsNullOrEmpty(word))
            {
                throw new Exception("Can not index null word");
            }

            string normalised_word = word.ToLower();
            if (use_make_reasonable_word)
            {
                normalised_word = ReasonableWord.MakeReasonableWord(normalised_word);
            }
            if (null == normalised_word)
            {
                return;
            }

            Utilities.LockPerfTimer lock_timer = Utilities.LockPerfChecker.Start();
            lock (locker)
            {
                lock_timer.LockPerfTimerStop();

                // Resolve the document id, allocating the next free one for a new fingerprint
                int document_id;
                bool fingerprint_known = fingerprint_to_document_ids.TryGetValue(document_fingerprint, out document_id);
                if (!fingerprint_known)
                {
                    // Ids start at 1; afterwards the next id is one past the largest id in use
                    document_id = (0 == document_id_to_fingerprints.Count)
                        ? 1
                        : document_id_to_fingerprints.Keys.Max() + 1;

                    fingerprint_to_document_ids[document_fingerprint] = document_id;
                    document_id_to_fingerprints[document_id] = document_fingerprint;
                }

                // Fetch the index entry for modification, which creates it for a new word
                WordInWordIndex entry = GetWordInWordIndex_LOCKER(normalised_word, true);
                if (null == entry)
                {
                    throw new Exception("Not expecting to get back a null WordInWordIndex");
                }

                entry.TallyDocId(document_id);
            }
        }
        /// <summary>
        /// Returns the index entry for <paramref name="word"/>, loading it via LoadWord_LOCK if it is already known,
        /// or creating a new entry when it is unknown and <paramref name="for_modification"/> is true.
        /// NOTE(review): the _LOCKER suffix suggests the caller must already hold the lock - confirm.
        /// </summary>
        /// <param name="word">The word to look up, used as the key into the lookup table as given.</param>
        /// <param name="for_modification">
        /// True when the caller intends to change the entry: unknown words are then created and the entry is
        /// marked as needing a flush.  False for read-only queries.
        /// </param>
        /// <returns>The index entry, or null when the word is unknown and this is a read-only query.</returns>
        internal WordInWordIndex GetWordInWordIndex_LOCKER(string word, bool for_modification)
        {
            WordInWordIndex entry;
            bool already_known = word_in_word_index_lookups.TryGetValue(word, out entry);

            if (already_known)
            {
                // Known word: make sure its document lists are in memory
                LoadWord_LOCK(entry);
            }
            else
            {
                // Unknown word: read-only queries never create entries
                if (!for_modification)
                {
                    return null;
                }

                // Load the last existing word before appending the new one after it
                if (word_in_word_indexes.Count > 0)
                {
                    WordInWordIndex last_entry = word_in_word_indexes[word_in_word_indexes.Count - 1];
                    LoadWord_LOCK(last_entry);
                }

                // The new word's id is its position in the list
                entry = new WordInWordIndex(word, word_in_word_indexes.Count);

                word_in_word_indexes.Add(entry);
                word_in_word_index_lookups[entry.Word] = entry;
            }

            // Update the access bookkeeping on every fetch
            entry.last_accessed   = DateTime.UtcNow;
            entry.last_flushed    = DateTime.MinValue;
            entry.needs_flushing |= for_modification;

            return entry;
        }
        /// <summary>
        /// Flushes the record for the keyword (and its entire gang) out to disk.  You must call this from inside a lock.
        /// The gang is serialized to a temporary file which is then moved over the gang file, and every word of the
        /// gang is marked as flushed.  Nothing happens if the word is not loaded or does not need flushing.
        /// </summary>
        /// <param name="wiwi">Any word of the gang that should be written out.</param>
        /// <param name="create_directory_first">
        /// When true, the directory of the gang file is created before the temporary file is moved into place.
        /// When false and the flush fails for any reason, the flush is retried once with this set to true.
        /// </param>
        /// <exception cref="Exception">
        /// The original failure is rethrown when the flush fails with <paramref name="create_directory_first"/> set to true.
        /// </exception>
        void FlushKeyword_LOCK(WordInWordIndex wiwi, bool create_directory_first)
        {
            // If this is not loaded, there is nothing to do
            if (!wiwi.IsLoaded)
            {
                return;
            }

            // If this one doesnt need flushing, don't do it
            if (!wiwi.needs_flushing)
            {
                return;
            }

            // Declared outside the try so the failure path can remove the file.
            // Path.GetTempFileName() creates the file on disk, so it would otherwise be orphaned.
            string filename_temp = null;

            try
            {
                // Build up the gangs
                int gang_start = GangStart(wiwi.WordId);
                List <WordEntry> word_entrys = new List <WordEntry>();

                filename_temp = Path.GetTempFileName();
                using (FileStream fs = File.Open(filename_temp, FileMode.Create))
                {
                    for (int i = 0; i < GANG_SIZE; ++i)
                    {
                        // If this is the last gang, there may be too few words
                        if (word_in_word_indexes.Count <= gang_start + i)
                        {
                            break;
                        }

                        // Every word of the gang must be loaded, or its ids would be lost from the file
                        if (null == word_in_word_indexes[gang_start + i].DocumentIds)
                        {
                            throw new Exception("Document ids should not be null");
                        }

                        WordEntry word_entry = new WordEntry
                        {
                            Word             = word_in_word_indexes[gang_start + i].Word,
                            DocumentIds      = word_in_word_indexes[gang_start + i].DocumentIds,
                            DocumentIdsCount = word_in_word_indexes[gang_start + i].DocumentIdsCount
                        };

                        word_entrys.Add(word_entry);
                    }

                    Serializer.Serialize <List <WordEntry> >(fs, word_entrys);
                }

                // Move the temp file into place
                string filename = Filename_GangList(wiwi.WordId);

                // Create the directory for the file
                if (create_directory_first)
                {
                    Directory.CreateDirectory(Path.GetDirectoryName(filename));
                }

                FileTools.MoveSafelyWithOverwriting(filename_temp, filename);

                // Mark the gang as flushed
                for (int i = 0; i < GANG_SIZE; ++i)
                {
                    // If this is the last gang, there may be too few words
                    if (word_in_word_indexes.Count <= gang_start + i)
                    {
                        break;
                    }

                    word_in_word_indexes[gang_start + i].last_flushed   = DateTime.UtcNow;
                    word_in_word_indexes[gang_start + i].needs_flushing = false;
                }
            }

            catch (Exception)
            {
                // Remove the temp file of this failed attempt, as the retry below creates a fresh one.
                // This is best effort: a cleanup failure is logged and must not mask the original problem.
                if (null != filename_temp)
                {
                    try
                    {
                        File.Delete(filename_temp);
                    }
                    catch (Exception cleanup_ex)
                    {
                        Logging.Warn(cleanup_ex, "Unable to delete the temporary gang file after a failed flush");
                    }
                }

                //  If we have an exception, it is probably because we have not created the directory, so try that
                if (!create_directory_first)
                {
                    FlushKeyword_LOCK(wiwi, true);
                }
                else
                {
                    // If we have created the directory before, then there must be some other problem
                    throw;
                }
            }
        }
 /// <summary>
 /// Flushes the record for the keyword (and its entire gang) out to disk without creating the target directory up front.
 /// You must call this from inside a lock.
 /// </summary>
 /// <param name="wiwi">Any word of the gang that should be written out.</param>
 void FlushKeyword_LOCK(WordInWordIndex wiwi)
 {
     // Delegate to the two-argument overload with directory creation switched off
     FlushKeyword_LOCK(wiwi, create_directory_first: false);
 }