/// <summary>
/// Makes sure the document lists for the given word are in memory by reading the gang file that contains it.
/// Every entry in that file is applied to the matching word in the index.  You must call this from inside a lock.
/// </summary>
/// <param name="wiwi">The word whose gang should be loaded.  Nothing happens if it is already loaded.</param>
private void LoadWord_LOCK(WordInWordIndex wiwi)
{
    // Already in memory, so there is no work to do
    if (wiwi.IsLoaded)
    {
        return;
    }

    try
    {
        string gang_filename = Filename_GangList(wiwi.WordId);
        using (FileStream stream = File.OpenRead(gang_filename))
        {
            bool any_count_mismatch = false;
            List<WordEntry> entries = Serializer.Deserialize<List<WordEntry>>(stream);
            int first_index = GangStart(wiwi.WordId);

            for (int offset = 0; offset < entries.Count; ++offset)
            {
                WordInWordIndex target = word_in_word_indexes[first_index + offset];
                WordEntry entry = entries[offset];

                // The file and the in-memory index must list the same words in the same order
                if (String.Compare(target.Word, entry.Word) != 0)
                {
                    throw new Exception("The ordering of the word index is corrupt: words don't match");
                }

                // A word that already has document ids is unexpected here, but only worth a warning
                if (target.DocumentIds != null)
                {
                    Logging.Warn("The ordering of the word index is corrupt: document_ids should be null");
                }

                // SetDocumentIds reports whether it detected corruption for this word
                if (target.SetDocumentIds(entry.DocumentIds, entry.DocumentIdsCount))
                {
                    any_count_mismatch = true;
                }
            }

            if (any_count_mismatch)
            {
                Logging.Warn("The ordering of a word index in the gang is corrupt: doc counts don't match (the user probably exited before the gang was saved...)");
            }
        }
    }
    catch (Exception ex)
    {
        // Any failure is treated as "this word has no documents yet"
        Logging.Error(ex, "There was a problem loading the word document list for word {0}:{1}. Assuming it was empty.", wiwi.WordId, wiwi.Word);
        wiwi.SetDocumentIds(new List<int>(), new List<int>());
    }
}
/// <summary>
/// Reads the master list file and uses it to populate the fingerprint/document-id maps and the word index.
/// The load is all-or-nothing with respect to validation: the word headers are checked and staged first, and
/// the in-memory maps are only touched once every word has passed the ordering check.  If anything fails,
/// a warning is logged and the in-memory state is left as it was before the call.
/// </summary>
private void ReadMasterList()
{
    Logging.Info("+ReadMasterList");

    Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
    lock (locker)
    {
        l1_clk.LockPerfTimerStop();

        try
        {
            using (FileStream fs = File.OpenRead(GetFilename_MasterList()))
            {
                Headers headers = Serializer.Deserialize<Headers>(fs);

                // Stage the words without touching the index, so that a corrupt ordering
                // cannot leave us with documents loaded and only some of the words loaded
                List<WordInWordIndex> staged_words = new List<WordInWordIndex>();
                foreach (var header in headers.words)
                {
                    WordInWordIndex wiwi = new WordInWordIndex(header.Word, header.WordId, header.DocCount);

                    // Sanity check that they are in the right order: each word id must equal
                    // the position the word will occupy in the index once it is added
                    if (wiwi.WordId != word_in_word_indexes.Count + staged_words.Count)
                    {
                        throw new Exception("The ordering of the word index is corrupt");
                    }

                    staged_words.Add(wiwi);
                }

                // Validation passed, so commit: first the documents
                foreach (var header in headers.documents)
                {
                    fingerprint_to_document_ids[header.Fingerprint] = header.DocumentId;
                    document_id_to_fingerprints[header.DocumentId] = header.Fingerprint;
                }

                // Then the words
                foreach (WordInWordIndex wiwi in staged_words)
                {
                    word_in_word_indexes.Add(wiwi);
                    word_in_word_index_lookups[wiwi.Word] = wiwi;
                }
            }
        }
        catch (Exception ex)
        {
            Logging.Warn(ex, "Unable to load index master list, so starting from scratch");
        }
    }

    Logging.Info("-ReadMasterList");
}
/// <summary>
/// Looks up a word in the index and reports, per document fingerprint, how often the word was tallied.
/// The word is lower-cased (and optionally passed through ReasonableWord.MakeReasonableWord) before lookup.
/// </summary>
/// <param name="word">The word to search for.  May be null.</param>
/// <returns>
/// A SearchResult whose <c>word</c> is the word exactly as passed in.  The counts are only filled in
/// when the normalised word is found in the index.
/// </returns>
public SearchResult Search(string word)
{
    SearchResult result = new SearchResult();
    result.word = word;

    if (word == null)
    {
        return result;
    }

    // Normalise the query the same way words are normalised when they are indexed
    string normalised_word = word.ToLower();
    if (use_make_reasonable_word)
    {
        normalised_word = ReasonableWord.MakeReasonableWord(normalised_word);
    }

    if (normalised_word == null)
    {
        return result;
    }

    Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
    lock (locker)
    {
        l1_clk.LockPerfTimerStop();

        // This is a query only, so never create an entry for an unknown word
        WordInWordIndex entry = GetWordInWordIndex_LOCKER(normalised_word, false);
        if (entry != null)
        {
            result.doc_count = entry.DocumentCount;

            for (int position = 0; position < entry.DocumentIds.Count; ++position)
            {
                int id = entry.DocumentIds[position];
                int occurrences = entry.DocumentIdsCount[position];

                // Document ids without a known fingerprint are left out of the result
                string fingerprint;
                if (document_id_to_fingerprints.TryGetValue(id, out fingerprint))
                {
                    result.doc_counts[fingerprint] = occurrences;
                }
            }
        }

        return result;
    }
}
/// <summary>
/// Records one occurrence of a word in a document.  The document is given a new id if its fingerprint has
/// not been seen before, and the word is added to the index if it is not already there.
/// </summary>
/// <param name="document_fingerprint">The fingerprint identifying the document.</param>
/// <param name="word">The word to index.  Must not be null or empty.</param>
/// <exception cref="Exception">
/// Thrown when <paramref name="word"/> is null or empty, or when no index entry could be obtained for it.
/// </exception>
public void AddDocumentWord(string document_fingerprint, string word)
{
    if (String.IsNullOrEmpty(word))
    {
        throw new Exception("Can not index null word");
    }

    string normalised_word = word.ToLower();
    if (use_make_reasonable_word)
    {
        normalised_word = ReasonableWord.MakeReasonableWord(normalised_word);
    }

    // MakeReasonableWord can hand back null, in which case there is nothing to index
    if (normalised_word == null)
    {
        return;
    }

    Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
    lock (locker)
    {
        l1_clk.LockPerfTimerStop();

        // Resolve the document id, allocating the next free one for a new fingerprint
        int id;
        bool is_known_document = fingerprint_to_document_ids.TryGetValue(document_fingerprint, out id);
        if (!is_known_document)
        {
            // Ids start at 1; after that, one more than the largest id in use
            id = (document_id_to_fingerprints.Count == 0)
                ? 1
                : document_id_to_fingerprints.Keys.Max() + 1;

            fingerprint_to_document_ids[document_fingerprint] = id;
            document_id_to_fingerprints[id] = document_fingerprint;
        }

        // Fetch the word from the index, creating it if it does not exist
        WordInWordIndex entry = GetWordInWordIndex_LOCKER(normalised_word, true);
        if (entry == null)
        {
            throw new Exception("Not expecting to get back a null WordInWordIndex");
        }

        entry.TallyDocId(id);
    }
}
/// <summary>
/// Returns the index entry for a word, loading its gang from disk if necessary.
/// </summary>
/// <param name="word">The (already normalised) word to look up.</param>
/// <param name="for_modification">
/// When true, an entry is created for a word that is not yet in the index, and the returned entry is
/// marked as needing a flush.  When false, an unknown word yields null.
/// </param>
/// <returns>The entry for the word, or null if the word is unknown and this is only a query.</returns>
internal WordInWordIndex GetWordInWordIndex_LOCKER(string word, bool for_modification)
{
    WordInWordIndex entry;
    bool is_known_word = word_in_word_index_lookups.TryGetValue(word, out entry);

    if (is_known_word)
    {
        // We have seen this word before, so just make sure it is in memory
        LoadWord_LOCK(entry);
    }
    else
    {
        // A plain query must not create an entry for a word we have never seen
        if (!for_modification)
        {
            return null;
        }

        // Make sure the last page of words is loaded
        int word_count = word_in_word_indexes.Count;
        if (word_count > 0)
        {
            LoadWord_LOCK(word_in_word_indexes[word_count - 1]);
        }

        // The new word takes the next position in the index as its id
        entry = new WordInWordIndex(word, word_in_word_indexes.Count);

        word_in_word_indexes.Add(entry);
        word_in_word_index_lookups[entry.Word] = entry;
    }

    // Set some access properties
    entry.last_accessed = DateTime.UtcNow;
    entry.last_flushed = DateTime.MinValue;
    entry.needs_flushing |= for_modification;

    return entry;
}
/// <summary>
/// Returns the fingerprints of all documents in which the given word has been indexed.
/// The word is lower-cased (and optionally passed through ReasonableWord.MakeReasonableWord) before lookup.
/// </summary>
/// <param name="word">The word to look up.  A null word is treated as matching no documents.</param>
/// <returns>
/// A new set of document fingerprints, or EMPTY_DOCUMENT_SET when the word is null, normalises to null,
/// or is not in the index.
/// NOTE(review): EMPTY_DOCUMENT_SET looks like a shared instance, so callers presumably should not mutate the returned set - confirm.
/// </returns>
public HashSet<string> GetDocumentsWithWord(string word)
{
    // Search() tolerates a null word, so do the same here instead of failing on word.ToLower()
    if (null == word)
    {
        return EMPTY_DOCUMENT_SET;
    }

    word = word.ToLower();
    if (use_make_reasonable_word)
    {
        word = ReasonableWord.MakeReasonableWord(word);
    }

    if (null == word)
    {
        return EMPTY_DOCUMENT_SET;
    }

    Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
    lock (locker)
    {
        l1_clk.LockPerfTimerStop();

        // This is a query only, so never create an entry for an unknown word
        WordInWordIndex word_in_word_index = GetWordInWordIndex_LOCKER(word, false);
        if (null == word_in_word_index)
        {
            return EMPTY_DOCUMENT_SET;
        }

        List<int> document_ids = word_in_word_index.DocumentIds;
        HashSet<string> document_fingerprints = new HashSet<string>();
        foreach (int document_id in document_ids)
        {
            // Document ids without a known fingerprint are skipped
            string document_fingerprint;
            if (document_id_to_fingerprints.TryGetValue(document_id, out document_fingerprint))
            {
                document_fingerprints.Add(document_fingerprint);
            }
        }

        return document_fingerprints;
    }
}
/// <summary>
/// Flushes the record for the keyword (and its entire gang) out to disk.  You must call this from inside a lock.
/// The gang is serialised to a temporary file, which is then moved over the gang file.
/// </summary>
/// <param name="wiwi">A word in the gang to flush.  Nothing happens unless it is loaded and needs flushing.</param>
/// <param name="create_directory_first">
/// When true, the gang file's directory is created before the file is moved into place, and any failure is
/// rethrown.  When false, a failure triggers exactly one retry with this flag set to true.
/// </param>
private void FlushKeyword_LOCK(WordInWordIndex wiwi, bool create_directory_first)
{
    // If this is not loaded, there is nothing to do
    if (!wiwi.IsLoaded)
    {
        return;
    }

    // If this one doesnt need flushing, don't do it
    if (!wiwi.needs_flushing)
    {
        return;
    }

    // Declared outside the try so that a failed attempt can clean up after itself
    string filename_temp = null;

    try
    {
        // Build up the gangs
        int gang_start = GangStart(wiwi.WordId);
        List<WordEntry> word_entrys = new List<WordEntry>();

        filename_temp = Path.GetTempFileName();
        using (FileStream fs = File.Open(filename_temp, FileMode.Create))
        {
            for (int i = 0; i < GANG_SIZE; ++i)
            {
                // If this is the last gang, there may be too few words
                if (word_in_word_indexes.Count <= gang_start + i)
                {
                    break;
                }

                WordInWordIndex gang_member = word_in_word_indexes[gang_start + i];
                if (null == gang_member.DocumentIds)
                {
                    throw new Exception("Document ids should not be null");
                }

                WordEntry word_entry = new WordEntry
                {
                    Word = gang_member.Word,
                    DocumentIds = gang_member.DocumentIds,
                    DocumentIdsCount = gang_member.DocumentIdsCount
                };
                word_entrys.Add(word_entry);
            }

            Serializer.Serialize<List<WordEntry>>(fs, word_entrys);
        }

        // Move the temp file into place
        string filename = Filename_GangList(wiwi.WordId);

        // Create the directory for the file
        if (create_directory_first)
        {
            Directory.CreateDirectory(Path.GetDirectoryName(filename));
        }

        FileTools.MoveSafelyWithOverwriting(filename_temp, filename);

        // Mark the gang as flushed
        for (int i = 0; i < GANG_SIZE; ++i)
        {
            // If this is the last gang, there may be too few words
            if (word_in_word_indexes.Count <= gang_start + i)
            {
                break;
            }

            word_in_word_indexes[gang_start + i].last_flushed = DateTime.UtcNow;
            word_in_word_indexes[gang_start + i].needs_flushing = false;
        }
    }
    catch (Exception)
    {
        // Path.GetTempFileName() creates the file on disk, so a failed attempt would otherwise
        // leave it behind.  The cleanup is best-effort and must not mask the original failure.
        if (null != filename_temp)
        {
            try
            {
                File.Delete(filename_temp);
            }
            catch (Exception delete_ex)
            {
                Logging.Warn(delete_ex, "Unable to delete the temporary gang file after a failed flush");
            }
        }

        // If we have an exception, it is probably because we have not created the directory, so try that
        if (!create_directory_first)
        {
            FlushKeyword_LOCK(wiwi, true);
        }
        else
        {
            // If we have created the directory before, then there must be some other problem.
            // Rethrow with "throw;" so the original stack trace is preserved.
            throw;
        }
    }
}
/// <summary>
/// Flushes the record for the keyword (and its entire gang) out to disk, without creating the gang file's
/// directory up front.  You must call this from inside a lock.
/// </summary>
/// <param name="wiwi">A word in the gang to flush.</param>
private void FlushKeyword_LOCK(WordInWordIndex wiwi)
{
    // Delegate to the two-argument overload, with up-front directory creation switched off
    FlushKeyword_LOCK(wiwi, create_directory_first: false);
}