/// <summary>
/// One-time initialization of the indexer for the given descriptor: computes a
/// version hash of the descriptor and walks the existing index documents.
/// </summary>
/// <param name="descriptor">Descriptor whose string form identifies the index version.</param>
/// <returns>
/// <see cref="IndexerSetupResult.Okay"/> on first successful call;
/// <see cref="IndexerSetupResult.Failure"/> if setup already ran.
/// </returns>
public IndexerSetupResult Setup(IXDescriptor descriptor)
{
    // Guard: setup must only run once.
    if (setup)
    {
        return (IndexerSetupResult.Failure);
    }

    hashFactory = new System.Security.Cryptography.SHA256Managed();

    // Version stamp: base64(SHA-256) of the descriptor's string representation.
    string _v = Convert.ToBase64String(hashFactory.ComputeHash(System.Text.UTF8Encoding.UTF8.GetBytes(descriptor.ToString())));

    // NOTE(review): df is computed but never consumed — presumably meant to detect
    // whether this descriptor version already exists in the index. TODO confirm intent.
    int df = indexSearcher.DocFreq(new Term(indexerDocumentDescriptorVersion, _v));

    // set up searcher
    // NOTE(review): TermDocs is not closed/disposed here, and the docs list is
    // built but never used — verify whether enumeration is needed at all.
    TermDocs term = indexSearcher.IndexReader.TermDocs();
    List<Document> docs = new List<Document>();
    while (term.Next())
    {
        docs.Add(indexSearcher.Doc(term.Doc));
    }

    // Fix: mark setup as complete. Previously the flag was checked on entry but
    // never set, so the "run once" guard could never trigger.
    setup = true;
    return (IndexerSetupResult.Okay);
}
/// <summary>
/// Checks whether the given word is present in the spell-checker index.
/// </summary>
/// <param name="word">The word to look up.</param>
/// <throws> IOException </throws>
/// <returns> true iff at least one indexed document contains the word </returns>
public virtual bool Exist(System.String word)
{
    // ObtainSearcher calls ensureOpen, so the searcher is guaranteed usable here.
    IndexSearcher searcher = ObtainSearcher();
    try
    {
        // A positive document frequency means the word exists in the index.
        int docFreq = searcher.DocFreq(F_WORD_TERM.CreateTerm(word));
        return docFreq > 0;
    }
    finally
    {
        // Always hand the searcher back, even if DocFreq throws.
        ReleaseSearcher(searcher);
    }
}
/// <summary>
/// Tokenizes the document's value(s) for the configured field and adds every
/// previously-unseen word of length >= 3 that is missing from the gram index.
/// </summary>
/// <param name="doc">Source document whose field values are tokenized.</param>
/// <param name="analyzer">Analyzer used to produce the token stream.</param>
/// <param name="state">Index access state threaded through searcher/writer calls.</param>
public void AddDocument(global::Lucene.Net.Documents.Document doc, Analyzer analyzer, IState state)
{
    var fieldables = doc.GetFieldables(_field);
    if (fieldables == null)
    {
        return;
    }
    foreach (var fieldable in fieldables)
    {
        if (fieldable == null)
        {
            continue;
        }
        TextReader reader;
        var str = fieldable.StringValue(state);
        if (!string.IsNullOrEmpty(str))
        {
            // Plain string value: wrap it directly.
            reader = new StringReader(str);
        }
        else
        {
            // We are reusing the fieldable for indexing. Instead of recreating it, we just reset the underlying text reader.
            reader = fieldable.ReaderValue;
            if (reader is ReusableStringReader stringReader)
            {
                if (stringReader.Length == 0)
                {
                    continue;
                }
                stringReader.Reset();
            }
            else if (reader is StreamReader streamReader)
            {
                if (streamReader.BaseStream.Length == 0)
                {
                    continue;
                }
                // Rewind so tokenization sees the stream from the start.
                streamReader.BaseStream.Position = 0;
            }
            else
            {
                // Unknown reader type: nothing safe to reset, skip this fieldable.
                continue;
            }
        }
        var tokenStream = analyzer.ReusableTokenStream(_field, reader);
        while (tokenStream.IncrementToken())
        {
            var word = tokenStream.GetAttribute<ITermAttribute>().Term;
            // Index
            int len = word.Length;
            if (len < 3)
            {
                continue; // too short we bail but "too long" is fine...
            }
            // Early skip avoiding allocation of terms and searching.
            if (_alreadySeen.Contains(word))
            {
                continue;
            }
            // Lazily open the searcher on first use.
            _indexSearcher ??= new IndexSearcher(_directory, true, state);
            if (_indexSearcher.DocFreq(_fWordTerm.CreateTerm(word), state) <= 0)
            {
                // the word does not exist in the gramindex
                int min = GetMin(len);
                _indexWriter.AddDocument(CreateDocument(word, min, min + 1), state);
            }
            _alreadySeen.Add(word);
        }
    }
}
/// <summary>
/// Suggests alternative words for <paramref name="word"/> using an n-gram query
/// over the suggestion index, filtered by string distance (ported Lucene
/// SpellChecker algorithm).
/// </summary>
/// <param name="suggestionField">Field whose name gates index-frequency filtering.</param>
/// <param name="word">The misspelled/source word.</param>
/// <param name="options">Accuracy, page size, and sort mode (popularity vs. frequency).</param>
/// <param name="sd">Distance measure used to score candidate words.</param>
/// <returns>Up to PageSize suggestions ordered by the queue; empty array if none qualify.</returns>
private string[] QueryOverSingleWord<TDistance>(SuggestionField suggestionField, string word, SuggestionOptions options, TDistance sd) where TDistance : IStringDistance
{
    var min = options.Accuracy ?? SuggestionOptions.DefaultAccuracy;
    var field = suggestionField.Name;
    var pageSize = options.PageSize;
    var morePopular = options.SortMode == SuggestionSortMode.Popularity;
    int lengthWord = word.Length;
    var ir = _searcher.IndexReader;
    // Frequency of the original word in the user index (0 when no index/field).
    int freq = (ir != null && field != null) ? ir.DocFreq(new Term(FWord, word), _state) : 0;
    // In popularity mode, suggestions must be at least this frequent.
    int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
    // if the word exists in the real index and we don't care for word frequency, return the word itself
    if (!morePopular && freq > 0)
    {
        return (new[] { word });
    }
    var query = new BooleanQuery();
    var alreadySeen = new HashSet<string>();
    // Build an OR query of the word's n-grams for gram sizes ng..ng+1.
    int ng = GetMin(lengthWord);
    int max = ng + 1;
    var table = GramsTable;
    for (; ng <= max; ng++)
    {
        string[] grams = FormGrams(word, ng);
        if (grams.Length == 0)
        {
            continue; // hmm
        }
        if (BoostStart > 0)
        {
            // should we boost prefixes?
            Add(query, table[ng].Start, grams[0], BoostStart); // matches start of word
        }
        if (BoostEnd > 0)
        {
            // should we boost suffixes
            Add(query, table[ng].End, grams[grams.Length - 1], BoostEnd); // matches end of word
        }
        for (int i = 0; i < grams.Length; i++)
        {
            Add(query, table[ng].Gram, grams[i]);
        }
    }
    int maxHits = 10 * pageSize;
    // System.out.println("Q: " + query);
    ScoreDoc[] hits = _searcher.Search(query, null, maxHits, _state).ScoreDocs;
    // System.out.println("HITS: " + hits.length());
    var queue = new SuggestWordQueue(pageSize);
    // go thru more than 'maxr' matches in case the distance filter triggers
    int stop = Math.Min(hits.Length, maxHits);
    var suggestedWord = new SuggestWord();
    for (int i = 0; i < stop; i++)
    {
        suggestedWord.Term = _searcher.Doc(hits[i].Doc, _state).Get(FWord, _state); // get orig word
        // don't suggest a word for itself, that would be silly
        if (suggestedWord.Term.Equals(word, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }
        // edit distance
        suggestedWord.Score = sd.GetDistance(word, suggestedWord.Term);
        if (suggestedWord.Score < min)
        {
            continue;
        }
        if (ir != null && field != null)
        {
            // use the user index
            suggestedWord.Freq = _searcher.DocFreq(new Term(FWord, suggestedWord.Term), _state); // freq in the index
            // don't suggest a word that is not present in the field
            if ((morePopular && goalFreq > suggestedWord.Freq) || suggestedWord.Freq < 1)
            {
                continue;
            }
        }
        if (alreadySeen.Add(suggestedWord.Term) == false) // we already seen this word, no point returning it twice
        {
            continue;
        }
        queue.InsertWithOverflow(suggestedWord);
        if (queue.Size() == pageSize)
        {
            // if queue full, maintain the minScore score
            min = queue.Top().Score;
        }
        // Allocate a fresh holder: the queue now owns the inserted instance.
        suggestedWord = new SuggestWord();
    }
    int size = queue.Size();
    if (size == 0)
    {
        return (EmptyArray);
    }
    // convert to array string
    // Pop yields lowest score first, so fill the result array back-to-front.
    string[] list = new string[size];
    for (int i = size - 1; i >= 0; i--)
    {
        list[i] = queue.Pop().Term;
    }
    return (list);
}