/// <summary> /// Returns <see cref="Search.CollectionStatistics"/> for a field. /// <para/> /// This can be overridden, for example, to return a field's statistics /// across a distributed collection. /// <para/> /// @lucene.experimental /// </summary> public virtual CollectionStatistics CollectionStatistics(string field) { int docCount; long sumTotalTermFreq; long sumDocFreq; Debug.Assert(field != null); Terms terms = MultiFields.GetTerms(reader, field); if (terms == null) { docCount = 0; sumTotalTermFreq = 0; sumDocFreq = 0; } else { docCount = terms.DocCount; sumTotalTermFreq = terms.SumTotalTermFreq; sumDocFreq = terms.SumDocFreq; } return(new CollectionStatistics(field, reader.MaxDoc, docCount, sumTotalTermFreq, sumDocFreq)); }
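// A minimal sketch of the override the doc comment above suggests: aggregating a
// field's statistics across shards. IShardStatsSource is a hypothetical stand-in
// for whatever RPC layer a real distributed deployment would use; only the
// CollectionStatistics constructor and properties come from Lucene.NET. A
// production version would also need to handle the -1 "statistic unavailable"
// sentinel values before summing.
using Lucene.Net.Index;
using Lucene.Net.Search;

public interface IShardStatsSource
{
    // Hypothetical: fetches the per-shard statistics for a field.
    CollectionStatistics GetFieldStatistics(string field);
}

public class DistributedIndexSearcher : IndexSearcher
{
    private readonly IShardStatsSource[] shards;

    public DistributedIndexSearcher(IndexReader localReader, IShardStatsSource[] shards)
        : base(localReader)
    {
        this.shards = shards;
    }

    public override CollectionStatistics CollectionStatistics(string field)
    {
        // Start from the local reader's statistics...
        CollectionStatistics local = base.CollectionStatistics(field);
        long maxDoc = local.MaxDoc;
        long docCount = local.DocCount;
        long sumTotalTermFreq = local.SumTotalTermFreq;
        long sumDocFreq = local.SumDocFreq;

        // ...then fold in each remote shard so scoring sees global numbers.
        foreach (IShardStatsSource shard in shards)
        {
            CollectionStatistics remote = shard.GetFieldStatistics(field);
            maxDoc += remote.MaxDoc;
            docCount += remote.DocCount;
            sumTotalTermFreq += remote.SumTotalTermFreq;
            sumDocFreq += remote.SumDocFreq;
        }

        return new CollectionStatistics(field, maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
    }
}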
public void TestKeepsFirstFilter() { DuplicateFilter df = new DuplicateFilter(KEY_FIELD); df.KeepMode = (KeepMode.KM_USE_FIRST_OCCURRENCE); ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs; assertTrue("Filtered searching should have found some matches", hits.Length > 0); foreach (ScoreDoc hit in hits) { Document d = searcher.Doc(hit.Doc); string url = d.Get(KEY_FIELD); DocsEnum td = TestUtil.Docs(Random, reader, KEY_FIELD, new BytesRef(url), MultiFields.GetLiveDocs(reader), null, 0); int lastDoc = 0; td.NextDoc(); lastDoc = td.DocID; assertEquals("Duplicate urls should return first doc", lastDoc, hit.Doc); } }
public void AddWithAnalyzerSuccess() { TestObject t = new TestObject() { Number = 9876, String = "Test Object 9876", }; Assert.AreEqual(0, writer.NumDocs); writer.Add(t, new KeywordAnalyzer()); writer.Commit(); Assert.AreEqual(1, writer.NumDocs); DirectoryReader reader = DirectoryReader.Open(dir); int nTerms = 0; Fields fields = MultiFields.GetFields(reader); foreach (string field in fields) { Terms tms = fields.GetTerms(field); TermsEnum termsEnum = tms.GetIterator(null); BytesRef text = termsEnum.Next(); while (text != null) { if (String.Equals("String", field)) { string str = text.Utf8ToString(); Assert.AreEqual("Test Object 9876", str); nTerms++; } text = termsEnum.Next(); } } Assert.AreEqual(1, nTerms); }
public void UpdateWithKindWithAnalyzerSuccess() { TestObject t = new TestObject() { Number = 1234, String = "Test Object 1234", }; Assert.AreEqual(0, writer.NumDocs); writer.Add <object>(t, new KeywordAnalyzer()); writer.Commit(); Assert.AreEqual(1, writer.NumDocs); TestObject t2 = new TestObject() { Number = 2345, String = "Something Else 2345", }; writer.Update(t2, DocumentObjectTypeKind.Static, new TermQuery(new Term("String", "Test Object 1234")), new KeywordAnalyzer()); writer.Commit(); Assert.AreEqual(2, writer.NumDocs); writer.DeleteDocuments <object>(new MatchAllDocsQuery()); writer.Commit(); Assert.AreEqual(1, writer.NumDocs); TestObject t3 = new TestObject() { Number = 3456, String = "Completely Different 3456", }; writer.Update(t3, DocumentObjectTypeKind.Actual, new TermQuery(new Term("String", "Something Else 2345")), new KeywordAnalyzer()); writer.Commit(); Assert.AreEqual(1, writer.NumDocs); DirectoryReader reader = DirectoryReader.Open(dir); Fields fields = MultiFields.GetFields(reader); int nTerms = 0; foreach (string field in fields) { Terms tms = fields.GetTerms(field); TermsEnum termsEnum = tms.GetIterator(null); BytesRef text = termsEnum.Next(); while (text != null) { if (String.Equals("String", field)) { if (String.Equals("Completely Different 3456", text.Utf8ToString())) { nTerms++; } } text = termsEnum.Next(); } } Assert.AreEqual(1, nTerms); }
/// <summary> /// Provide spelling corrections based on several parameters. /// </summary> /// <param name="term"> The term to suggest spelling corrections for </param> /// <param name="numSug"> The maximum number of spelling corrections </param> /// <param name="ir"> The index reader to fetch the candidate spelling corrections from </param> /// <param name="docfreq"> The minimum document frequency a potential suggestion needs to have in order to be included </param> /// <param name="editDistance"> The maximum edit distance candidates are allowed to have </param> /// <param name="accuracy"> The minimum accuracy a suggested spelling correction needs to have in order to be included </param> /// <param name="spare"> a chars scratch </param> /// <returns> a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. </returns> /// <exception cref="System.IO.IOException"> If I/O related errors occur </exception> protected internal virtual IEnumerable <ScoreTerm> SuggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, CharsRef spare) { var atts = new AttributeSource(); IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>(); Terms terms = MultiFields.GetTerms(ir, term.Field); if (terms == null) { return(new List <ScoreTerm>()); } FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.Max(minPrefix, editDistance - 1), true); var stQueue = new Support.PriorityQueue <ScoreTerm>(); BytesRef queryTerm = new BytesRef(term.Text()); BytesRef candidateTerm; ScoreTerm st = new ScoreTerm(); IBoostAttribute boostAtt = e.Attributes.AddAttribute <IBoostAttribute>(); while ((candidateTerm = e.Next()) != null) { float boost = boostAtt.Boost; // ignore uncompetitive hits if (stQueue.Count >= numSug && boost <= stQueue.Peek().Boost) { continue; } // ignore exact match of the same term if (queryTerm.BytesEquals(candidateTerm)) { continue; } int df = e.DocFreq; // check docFreq if required if (df <= docfreq) { continue; } float score; string termAsString; if (distance == INTERNAL_LEVENSHTEIN) { // delay creating strings until the end termAsString = null; // undo FuzzyTermsEnum's scale factor for a real scaled lev score score = boost / e.ScaleFactor + e.MinSimilarity; } else { UnicodeUtil.UTF8toUTF16(candidateTerm, spare); termAsString = spare.ToString(); score = distance.GetDistance(term.Text(), termAsString); } if (score < accuracy) { continue; } // add new entry in PQ st.Term = BytesRef.DeepCopyOf(candidateTerm); st.Boost = boost; st.Docfreq = df; st.TermAsString = termAsString; st.Score = score; stQueue.Offer(st); // possibly drop entries from queue st = (stQueue.Count > numSug) ? stQueue.Poll() : new ScoreTerm(); maxBoostAtt.MaxNonCompetitiveBoost = (stQueue.Count >= numSug) ? stQueue.Peek().Boost : float.NegativeInfinity; } return(stQueue); }
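// A minimal usage sketch: the protected SuggestSimilar above is normally reached
// through DirectSpellChecker's public API, shown here. The directory, field name,
// misspelled word, and threshold values are illustrative placeholders.
using System;
using Lucene.Net.Index;
using Lucene.Net.Search.Spell;
using Lucene.Net.Store;

public static class SpellingExample
{
    public static void PrintSuggestions(Directory dir)
    {
        using (IndexReader reader = DirectoryReader.Open(dir))
        {
            DirectSpellChecker spellChecker = new DirectSpellChecker
            {
                MaxEdits = 2,    // maximum edit distance for candidate terms
                Accuracy = 0.5f  // minimum score a suggestion must reach
            };

            // Ask for up to 5 corrections for a term that is not in the index:
            SuggestWord[] suggestions = spellChecker.SuggestSimilar(
                new Term("body", "lucnee"), 5, reader,
                SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);

            foreach (SuggestWord suggestion in suggestions)
            {
                Console.WriteLine(suggestion.String + " (freq=" + suggestion.Freq + ", score=" + suggestion.Score + ")");
            }
        }
    }
}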
// randomly seeks to term that we know exists, then next's // from there private void DoTestSeekExists(Random r, IList <Term> fieldTerms, IndexReader reader) { IDictionary <string, TermsEnum> tes = new Dictionary <string, TermsEnum>(); // Test random seek to existing term, then enum: if (Verbose) { Console.WriteLine("\nTEST: top now seek"); } int num = AtLeast(100); for (int iter = 0; iter < num; iter++) { // pick random field+term int spot = r.Next(fieldTerms.Count); Term term = fieldTerms[spot]; string field = term.Field; if (Verbose) { Console.WriteLine("TEST: exist seek field=" + field + " term=" + UnicodeUtil.ToHexString(term.Text)); } // seek to it if (!tes.TryGetValue(field, out TermsEnum te)) { te = MultiFields.GetTerms(reader, field).GetEnumerator(); tes[field] = te; } if (Verbose) { Console.WriteLine(" done get enum"); } // seek should find the term Assert.AreEqual(TermsEnum.SeekStatus.FOUND, te.SeekCeil(term.Bytes)); // now .next() this many times: int ct = TestUtil.NextInt32(r, 5, 100); for (int i = 0; i < ct; i++) { if (Verbose) { Console.WriteLine("TEST: now next()"); } if (1 + spot + i >= fieldTerms.Count) { break; } term = fieldTerms[1 + spot + i]; if (!term.Field.Equals(field, StringComparison.Ordinal)) { Assert.IsFalse(te.MoveNext()); break; } else { Assert.IsTrue(te.MoveNext()); BytesRef t = te.Term; if (Verbose) { Console.WriteLine(" got term=" + (t == null ? null : UnicodeUtil.ToHexString(t.Utf8ToString()))); Console.WriteLine(" exp=" + UnicodeUtil.ToHexString(term.Text.ToString())); } Assert.AreEqual(term.Bytes, t); } } } }
/// <summary> /// a variant that uses pulsing, but uses a high TF to force pass thru to the underlying codec /// </summary> public virtual void Test10kNotPulsed() { // we always run this test with pulsing codec. int freqCutoff = TestUtil.NextInt32(Random, 1, 10); Codec cp = TestUtil.AlwaysPostingsFormat(new Pulsing41PostingsFormat(freqCutoff)); DirectoryInfo f = CreateTempDir("10knotpulsed"); BaseDirectoryWrapper dir = NewFSDirectory(f); dir.CheckIndexOnDispose = false; // we do this ourselves explicitly RandomIndexWriter iw = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetCodec(cp)); Document document = new Document(); FieldType ft = new FieldType(TextField.TYPE_STORED); switch (TestUtil.NextInt32(Random, 0, 2)) { case 0: ft.IndexOptions = IndexOptions.DOCS_ONLY; break; case 1: ft.IndexOptions = IndexOptions.DOCS_AND_FREQS; break; default: ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; break; } Field field = NewField("field", "", ft); document.Add(field); int freq = freqCutoff + 1; for (int i = 0; i < 10050; i++) { StringBuilder sb = new StringBuilder(); for (int j = 0; j < freq; j++) { sb.Append(i.ToString("00000", CultureInfo.InvariantCulture)); sb.Append(' '); // whitespace } field.SetStringValue(sb.ToString()); iw.AddDocument(document); } IndexReader ir = iw.GetReader(); iw.Dispose(); TermsEnum te = MultiFields.GetTerms(ir, "field").GetIterator(null); DocsEnum de = null; for (int i = 0; i < 10050; i++) { string expected = i.ToString("00000", CultureInfo.InvariantCulture); assertEquals(expected, te.Next().Utf8ToString()); de = TestUtil.Docs(Random, te, null, de, DocsFlags.NONE); assertTrue(de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.NextDoc()); } ir.Dispose(); TestUtil.CheckIndex(dir); dir.Dispose(); }
public virtual void Test() { Directory dir = NewDirectory(); MockAnalyzer analyzer = new MockAnalyzer(Random); analyzer.MaxTokenLength = TestUtil.NextInt32(Random, 1, IndexWriter.MAX_TERM_LENGTH); RandomIndexWriter w = new RandomIndexWriter( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif Random, dir, analyzer); LineFileDocs docs = new LineFileDocs(Random, DefaultCodecSupportsDocValues); int charsToIndex = AtLeast(100000); int charsIndexed = 0; //System.out.println("bytesToIndex=" + charsToIndex); while (charsIndexed < charsToIndex) { Document doc = docs.NextDoc(); charsIndexed += doc.Get("body").Length; w.AddDocument(doc); //System.out.println(" bytes=" + charsIndexed + " add: " + doc); } IndexReader r = w.GetReader(); //System.out.println("numDocs=" + r.NumDocs); w.Dispose(); IndexSearcher s = NewSearcher(r); Terms terms = MultiFields.GetFields(r).GetTerms("body"); int termCount = 0; TermsEnum termsEnum = terms.GetIterator(null); while (termsEnum.Next() != null) { termCount++; } Assert.IsTrue(termCount > 0); // Target ~10 terms to search: double chance = 10.0 / termCount; termsEnum = terms.GetIterator(termsEnum); IDictionary <BytesRef, TopDocs> answers = new Dictionary <BytesRef, TopDocs>(); while (termsEnum.Next() != null) { if (Random.NextDouble() <= chance) { BytesRef term = BytesRef.DeepCopyOf(termsEnum.Term); answers[term] = s.Search(new TermQuery(new Term("body", term)), 100); } } if (answers.Count > 0) { CountdownEvent startingGun = new CountdownEvent(1); int numThreads = TestUtil.NextInt32(Random, 2, 5); ThreadJob[] threads = new ThreadJob[numThreads]; for (int threadID = 0; threadID < numThreads; threadID++) { ThreadJob thread = new ThreadAnonymousInnerClassHelper(this, s, answers, startingGun); threads[threadID] = thread; thread.Start(); } startingGun.Signal(); foreach (ThreadJob thread in threads) { thread.Join(); } } r.Dispose(); dir.Dispose(); }
public override void Run() { if (VERBOSE) { Console.WriteLine(Thread.CurrentThread.Name + ": launch search thread"); } while (Environment.TickCount < stopTimeMS) { try { IndexSearcher s = outerInstance.GetCurrentSearcher(); try { // Verify 1) IW is correctly setting // diagnostics, and 2) segment warming for // merged segments is actually happening: foreach (AtomicReaderContext sub in s.IndexReader.Leaves) { SegmentReader segReader = (SegmentReader)sub.Reader; IDictionary <string, string> diagnostics = segReader.SegmentInfo.Info.Diagnostics; assertNotNull(diagnostics); string source; diagnostics.TryGetValue("source", out source); assertNotNull(source); if (source.Equals("merge", StringComparison.Ordinal)) { assertTrue("sub reader " + sub + " wasn't warmed: warmed=" + outerInstance.warmed + " diagnostics=" + diagnostics + " si=" + segReader.SegmentInfo, !outerInstance.m_assertMergedSegmentsWarmed || outerInstance.warmed.ContainsKey(segReader.core)); } } if (s.IndexReader.NumDocs > 0) { outerInstance.SmokeTestSearcher(s); Fields fields = MultiFields.GetFields(s.IndexReader); if (fields == null) { continue; } Terms terms = fields.GetTerms("body"); if (terms == null) { continue; } TermsEnum termsEnum = terms.GetIterator(null); int seenTermCount = 0; int shift; int trigger; if (totTermCount.Get() < 30) { shift = 0; trigger = 1; } else { trigger = totTermCount.Get() / 30; shift = Random.Next(trigger); } while (Environment.TickCount < stopTimeMS) { BytesRef term = termsEnum.Next(); if (term == null) { totTermCount.Set(seenTermCount); break; } seenTermCount++; // search 30 terms if ((seenTermCount + shift) % trigger == 0) { //if (VERBOSE) { //System.out.println(Thread.currentThread().getName() + " now search body:" + term.Utf8ToString()); //} totHits.AddAndGet(outerInstance.RunQuery(s, new TermQuery(new Term("body", term)))); } } //if (VERBOSE) { //System.out.println(Thread.currentThread().getName() + ": search done"); //} } } finally { outerInstance.ReleaseSearcher(s); } } catch (Exception t) { Console.WriteLine(Thread.CurrentThread.Name + ": hit exc"); outerInstance.m_failed.Set(true); Console.WriteLine(t.ToString()); throw new Exception(t.ToString(), t); } } }
/// <summary> /// Build the suggest index, using up to the specified /// amount of temporary RAM while building. Note that /// the weights for the suggestions are ignored. /// </summary> public virtual void Build(IInputIterator iterator, double ramBufferSizeMB) { if (iterator.HasPayloads) { throw new System.ArgumentException("this suggester doesn't support payloads"); } if (iterator.HasContexts) { throw new System.ArgumentException("this suggester doesn't support contexts"); } string prefix = this.GetType().Name; var directory = OfflineSorter.DefaultTempDir(); // TODO: messy ... java7 has Files.createTempDirectory // ... but 4.x is java6: DirectoryInfo tempIndexPath = null; Random random = new Random(); while (true) { tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + random.Next(int.MaxValue))); tempIndexPath.Create(); if (System.IO.Directory.Exists(tempIndexPath.FullName)) { break; } } using (Directory dir = FSDirectory.Open(tempIndexPath)) { #pragma warning disable 612, 618 IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer); #pragma warning restore 612, 618 iwc.SetOpenMode(OpenMode.CREATE); iwc.SetRAMBufferSizeMB(ramBufferSizeMB); IndexWriter writer = new IndexWriter(dir, iwc); var ft = new FieldType(TextField.TYPE_NOT_STORED); // TODO: if only we had IndexOptions.TERMS_ONLY... ft.IndexOptions = IndexOptions.DOCS_AND_FREQS; ft.OmitNorms = true; ft.Freeze(); Document doc = new Document(); Field field = new Field("body", "", ft); doc.Add(field); totTokens = 0; IndexReader reader = null; bool success = false; count = 0; try { while (true) { BytesRef surfaceForm = iterator.Next(); if (surfaceForm == null) { break; } field.SetStringValue(surfaceForm.Utf8ToString()); writer.AddDocument(doc); count++; } reader = DirectoryReader.Open(writer, false); Terms terms = MultiFields.GetTerms(reader, "body"); if (terms == null) { throw new System.ArgumentException("need at least one suggestion"); } // Move all ngrams into an FST: TermsEnum termsEnum = terms.GetIterator(null); Outputs <long?> outputs = PositiveInt32Outputs.Singleton; Builder <long?> builder = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs); Int32sRef scratchInts = new Int32sRef(); while (true) { BytesRef term = termsEnum.Next(); if (term == null) { break; } int ngramCount = CountGrams(term); if (ngramCount > grams) { throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams); } if (ngramCount == 1) { totTokens += termsEnum.TotalTermFreq; } builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq)); } fst = builder.Finish(); if (fst == null) { throw new System.ArgumentException("need at least one suggestion"); } //System.out.println("FST: " + fst.getNodeCount() + " nodes"); /* * PrintWriter pw = new PrintWriter("/x/tmp/out.dot"); * Util.toDot(fst, pw, true, true); * pw.close(); */ success = true; } finally { try { if (success) { IOUtils.Dispose(writer, reader); } else { IOUtils.DisposeWhileHandlingException(writer, reader); } } finally { foreach (string file in dir.ListAll()) { FileInfo path = new FileInfo(Path.Combine(tempIndexPath.FullName, file)); try { path.Delete(); } catch (Exception e) { throw new InvalidOperationException("failed to remove " + path, e); } } try { tempIndexPath.Delete(); } catch (Exception e) { throw new InvalidOperationException("failed to remove " + 
tempIndexPath, e); } } } } }
/// <summary> /// Make sure we skip wicked long terms. /// </summary> public virtual void TestWickedLongTerm() { RAMDirectory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT))); char[] chars = new char[IndexWriter.MAX_TERM_LENGTH]; Arrays.Fill(chars, 'x'); Document doc = new Document(); string bigTerm = new string(chars); // This produces a too-long term: string contents = "abc xyz x" + bigTerm + " another term"; doc.Add(new TextField("content", contents, Field.Store.NO)); writer.AddDocument(doc); // Make sure we can add another normal document doc = new Document(); doc.Add(new TextField("content", "abc bbb ccc", Field.Store.NO)); writer.AddDocument(doc); writer.Dispose(); IndexReader reader = DirectoryReader.Open(dir); // Make sure all terms < max size were indexed assertEquals(2, reader.DocFreq(new Term("content", "abc"))); assertEquals(1, reader.DocFreq(new Term("content", "bbb"))); assertEquals(1, reader.DocFreq(new Term("content", "term"))); assertEquals(1, reader.DocFreq(new Term("content", "another"))); // Make sure position is still incremented when // massive term is skipped: DocsAndPositionsEnum tps = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "content", new BytesRef("another")); assertTrue(tps.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(1, tps.Freq); assertEquals(3, tps.NextPosition()); // Make sure the doc that has the massive term is in // the index: assertEquals("document with wicked long term is not in the index!", 2, reader.NumDocs); reader.Dispose(); // Make sure we can add a document with exactly the // maximum length term, and search on that term: doc = new Document(); doc.Add(new TextField("content", bigTerm, Field.Store.NO)); ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT); sa.MaxTokenLength = 100000; writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa)); writer.AddDocument(doc); writer.Dispose(); reader = DirectoryReader.Open(dir); assertEquals(1, reader.DocFreq(new Term("content", bigTerm))); reader.Dispose(); dir.Dispose(); }
public virtual void TestCaching() { Directory dir = new RAMDirectory(); RandomIndexWriter writer = new RandomIndexWriter( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif Random, dir); Document doc = new Document(); TokenStream stream = new TokenStreamAnonymousInnerClassHelper(this); stream = new CachingTokenFilter(stream); doc.Add(new TextField("preanalyzed", stream)); // 1) we consume all tokens twice before we add the doc to the index CheckTokens(stream); stream.Reset(); CheckTokens(stream); // 2) now add the document to the index and verify if all tokens are indexed // don't reset the stream here, the DocumentWriter should do that implicitly writer.AddDocument(doc); IndexReader reader = writer.GetReader(); DocsAndPositionsEnum termPositions = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term1")); Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); Assert.AreEqual(1, termPositions.Freq); Assert.AreEqual(0, termPositions.NextPosition()); termPositions = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term2")); Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); Assert.AreEqual(2, termPositions.Freq); Assert.AreEqual(1, termPositions.NextPosition()); Assert.AreEqual(3, termPositions.NextPosition()); termPositions = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term3")); Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); Assert.AreEqual(1, termPositions.Freq); Assert.AreEqual(2, termPositions.NextPosition()); reader.Dispose(); writer.Dispose(); // 3) reset stream and consume tokens again stream.Reset(); CheckTokens(stream); dir.Dispose(); }
/// <summary> /// Creates an enumerator over term, weight and payload fields from the Lucene /// index. Setting <paramref name="hasPayloads"/> to <c>false</c> implies an enumerator /// over only term and weight. /// </summary> public DocumentInputEnumerator(DocumentDictionary documentDictionary, bool hasPayloads, bool hasContexts) { this.outerInstance = documentDictionary; this.hasPayloads = hasPayloads; this.hasContexts = hasContexts; docCount = documentDictionary.m_reader.MaxDoc - 1; weightValues = (documentDictionary.weightField != null) ? MultiDocValues.GetNumericValues(documentDictionary.m_reader, documentDictionary.weightField) : null; liveDocs = (documentDictionary.m_reader.Leaves.Count > 0) ? MultiFields.GetLiveDocs(documentDictionary.m_reader) : null; relevantFields = GetRelevantFields(new string[] { documentDictionary.field, documentDictionary.weightField, documentDictionary.m_payloadField, documentDictionary.m_contextsField }); }
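// A minimal sketch of driving this enumerator through DocumentDictionary. The
// reader and the "title"/"weight" field names are illustrative, and
// GetEntryEnumerator() is assumed to be the accessor exposed by the
// IInputEnumerator-based revision of this API.
using Lucene.Net.Index;
using Lucene.Net.Search.Suggest;

public static class DocumentDictionaryExample
{
    public static long SumWeights(IndexReader reader)
    {
        // Suggestions come from "title"; weights are read from "weight".
        DocumentDictionary dictionary = new DocumentDictionary(reader, "title", "weight");
        IInputEnumerator entries = dictionary.GetEntryEnumerator();

        long total = 0;
        while (entries.MoveNext())
        {
            // entries.Current is the suggestion text as a BytesRef.
            total += entries.Weight;
        }
        return total;
    }
}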
public override int DoLogic() { int res = 0; // open reader or use existing one IndexSearcher searcher = RunData.GetIndexSearcher(); IndexReader reader; bool closeSearcher; if (searcher == null) { // open our own reader Directory dir = RunData.Directory; reader = DirectoryReader.Open(dir); searcher = new IndexSearcher(reader); closeSearcher = true; } else { // use existing one; this passes +1 ref to us reader = searcher.IndexReader; closeSearcher = false; } // optionally warm and add num docs traversed to count if (WithWarm) { Document doc = null; IBits liveDocs = MultiFields.GetLiveDocs(reader); for (int m = 0; m < reader.MaxDoc; m++) { if (null == liveDocs || liveDocs.Get(m)) { doc = reader.Document(m); res += (doc == null ? 0 : 1); } } } if (WithSearch) { res++; Query q = queryMaker.MakeQuery(); Sort sort = Sort; TopDocs hits = null; int numHits = NumHits; if (numHits > 0) { if (WithCollector == false) { if (sort != null) { // TODO: instead of always passing false we // should detect based on the query; if we make // the IndexSearcher search methods that take // Weight public again, we can go back to // pulling the Weight ourselves: TopFieldCollector collector = TopFieldCollector.Create(sort, numHits, true, WithScore, WithMaxScore, false); searcher.Search(q, null, collector); hits = collector.GetTopDocs(); } else { hits = searcher.Search(q, numHits); } } else { ICollector collector = CreateCollector(); searcher.Search(q, null, collector); //hits = collector.topDocs(); } string printHitsField = RunData.Config.Get("print.hits.field", null); if (hits != null && printHitsField != null && printHitsField.Length > 0) { Console.WriteLine("totalHits = " + hits.TotalHits); Console.WriteLine("maxDoc() = " + reader.MaxDoc); Console.WriteLine("numDocs() = " + reader.NumDocs); for (int i = 0; i < hits.ScoreDocs.Length; i++) { int docID = hits.ScoreDocs[i].Doc; Document doc = reader.Document(docID); Console.WriteLine(" " + i + ": doc=" + docID + " score=" + hits.ScoreDocs[i].Score + " " + printHitsField + " =" + doc.Get(printHitsField)); } } if (WithTraverse) { ScoreDoc[] scoreDocs = hits.ScoreDocs; int traversalSize = Math.Min(scoreDocs.Length, TraversalSize); if (traversalSize > 0) { bool retrieve = WithRetrieve; int numHighlight = Math.Min(NumToHighlight, scoreDocs.Length); Analyzer analyzer = RunData.Analyzer; BenchmarkHighlighter highlighter = null; if (numHighlight > 0) { highlighter = GetBenchmarkHighlighter(q); } for (int m = 0; m < traversalSize; m++) { int id = scoreDocs[m].Doc; res++; if (retrieve) { Document document = RetrieveDoc(reader, id); res += document != null ? 1 : 0; if (numHighlight > 0 && m < numHighlight) { ICollection <string> fieldsToHighlight = GetFieldsToHighlight(document); foreach (string field in fieldsToHighlight) { string text = document.Get(field); res += highlighter.DoHighlight(reader, id, field, document, analyzer, text); } } } } } } } } if (closeSearcher) { reader.Dispose(); } else { // Release our +1 ref from above reader.DecRef(); } return(res); }
public virtual void TestMultipleDocument() { RAMDirectory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new KeywordAnalyzer())); Document doc = new Document(); doc.Add(new TextField("partnum", "Q36", Field.Store.YES)); writer.AddDocument(doc); doc = new Document(); doc.Add(new TextField("partnum", "Q37", Field.Store.YES)); writer.AddDocument(doc); writer.Dispose(); IndexReader reader = DirectoryReader.Open(dir); DocsEnum td = TestUtil.Docs(Random(), reader, "partnum", new BytesRef("Q36"), MultiFields.GetLiveDocs(reader), null, 0); assertTrue(td.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); td = TestUtil.Docs(Random(), reader, "partnum", new BytesRef("Q37"), MultiFields.GetLiveDocs(reader), null, 0); assertTrue(td.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); }
/// <summary> /// Build the suggest index, using up to the specified /// amount of temporary RAM while building. Note that /// the weights for the suggestions are ignored. /// </summary> public virtual void Build(IInputEnumerator enumerator, double ramBufferSizeMB) { if (enumerator.HasPayloads) { throw new ArgumentException("this suggester doesn't support payloads"); } if (enumerator.HasContexts) { throw new ArgumentException("this suggester doesn't support contexts"); } string prefix = this.GetType().Name; var directory = OfflineSorter.DefaultTempDir(); // LUCENENET specific - using GetRandomFileName() instead of picking a random int DirectoryInfo tempIndexPath = null; while (true) { tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + Path.GetFileNameWithoutExtension(Path.GetRandomFileName()))); tempIndexPath.Create(); if (System.IO.Directory.Exists(tempIndexPath.FullName)) { break; } } Directory dir = FSDirectory.Open(tempIndexPath); try { #pragma warning disable 612, 618 IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer); #pragma warning restore 612, 618 iwc.SetOpenMode(OpenMode.CREATE); iwc.SetRAMBufferSizeMB(ramBufferSizeMB); IndexWriter writer = new IndexWriter(dir, iwc); var ft = new FieldType(TextField.TYPE_NOT_STORED); // TODO: if only we had IndexOptions.TERMS_ONLY... ft.IndexOptions = IndexOptions.DOCS_AND_FREQS; ft.OmitNorms = true; ft.Freeze(); Document doc = new Document(); Field field = new Field("body", "", ft); doc.Add(field); totTokens = 0; IndexReader reader = null; bool success = false; count = 0; try { while (enumerator.MoveNext()) { BytesRef surfaceForm = enumerator.Current; field.SetStringValue(surfaceForm.Utf8ToString()); writer.AddDocument(doc); count++; } reader = DirectoryReader.Open(writer, false); Terms terms = MultiFields.GetTerms(reader, "body"); if (terms == null) { throw new ArgumentException("need at least one suggestion"); } // Move all ngrams into an FST: TermsEnum termsEnum = terms.GetEnumerator(null); Outputs <long?> outputs = PositiveInt32Outputs.Singleton; Builder <long?> builder = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs); Int32sRef scratchInts = new Int32sRef(); while (termsEnum.MoveNext()) { BytesRef term = termsEnum.Term; int ngramCount = CountGrams(term); if (ngramCount > grams) { throw new ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams); } if (ngramCount == 1) { totTokens += termsEnum.TotalTermFreq; } builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq)); } fst = builder.Finish(); if (fst == null) { throw new ArgumentException("need at least one suggestion"); } //System.out.println("FST: " + fst.getNodeCount() + " nodes"); /* * PrintWriter pw = new PrintWriter("/x/tmp/out.dot"); * Util.toDot(fst, pw, true, true); * pw.close(); */ success = true; } finally { if (success) { IOUtils.Dispose(writer, reader); } else { IOUtils.DisposeWhileHandlingException(writer, reader); } } } finally { try { IOUtils.Dispose(dir); } finally { // LUCENENET specific - since we are removing the entire directory anyway, // it doesn't make sense to first do a loop in order to remove the files. // Let the System.IO.Directory.Delete() method handle that. // We also need to dispose the Directory instance first before deleting from disk. 
try { System.IO.Directory.Delete(tempIndexPath.FullName, true); } catch (Exception e) { throw new InvalidOperationException("failed to remove " + tempIndexPath, e); } } } }
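// A minimal usage sketch for the Build overload above, assuming the
// FreeTextSuggester it belongs to. The analyzer, the "body" field name, the
// 64 MB buffer size, and the lookup prefix are illustrative choices, and
// GetEntryEnumerator() is assumed to match the IInputEnumerator-based API
// used by this method.
using System;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Index;
using Lucene.Net.Search.Suggest;
using Lucene.Net.Search.Suggest.Analyzing;
using Lucene.Net.Util;

public static class FreeTextSuggesterExample
{
    public static void BuildAndLookup(IndexReader reader)
    {
        var suggester = new FreeTextSuggester(new SimpleAnalyzer(LuceneVersion.LUCENE_48));

        // Feed raw text from the "body" field; Build ignores weights anyway.
        var dictionary = new DocumentDictionary(reader, "body", null);
        suggester.Build(dictionary.GetEntryEnumerator(), 64.0);

        foreach (Lookup.LookupResult result in suggester.Lookup("foo b", false, 5))
        {
            Console.WriteLine(result.Key + " => " + result.Value);
        }
    }
}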
/// <summary> /// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for all /// indexed fields from terms with a document frequency greater than the given /// <paramref name="maxDocFreq"/> /// </summary> /// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param> /// <param name="delegate"> <see cref="Analyzer"/> whose <see cref="TokenStream"/> will be filtered </param> /// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param> /// <param name="maxDocFreq"> The document frequency above which a term is considered a stop word </param> /// <exception cref="IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception> public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer @delegate, IndexReader indexReader, int maxDocFreq) : this(matchVersion, @delegate, indexReader, MultiFields.GetIndexedFields(indexReader), maxDocFreq) { }
public virtual void TestSetPosition() { Analyzer analyzer = new AnalyzerAnonymousClass(this); Directory store = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter(Random, store, analyzer); Document d = new Document(); d.Add(NewTextField("field", "bogus", Field.Store.YES)); writer.AddDocument(d); IndexReader reader = writer.GetReader(); writer.Dispose(); IndexSearcher searcher = NewSearcher(reader); DocsAndPositionsEnum pos = MultiFields.GetTermPositionsEnum(searcher.IndexReader, MultiFields.GetLiveDocs(searcher.IndexReader), "field", new BytesRef("1")); pos.NextDoc(); // first token should be at position 0 Assert.AreEqual(0, pos.NextPosition()); pos = MultiFields.GetTermPositionsEnum(searcher.IndexReader, MultiFields.GetLiveDocs(searcher.IndexReader), "field", new BytesRef("2")); pos.NextDoc(); // second token should be at position 2 Assert.AreEqual(2, pos.NextPosition()); PhraseQuery q; ScoreDoc[] hits; q = new PhraseQuery(); q.Add(new Term("field", "1")); q.Add(new Term("field", "2")); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(0, hits.Length); // same as previous, just specify positions explicitly. q = new PhraseQuery(); q.Add(new Term("field", "1"), 0); q.Add(new Term("field", "2"), 1); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(0, hits.Length); // specifying correct positions should find the phrase. q = new PhraseQuery(); q.Add(new Term("field", "1"), 0); q.Add(new Term("field", "2"), 2); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); q = new PhraseQuery(); q.Add(new Term("field", "2")); q.Add(new Term("field", "3")); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); q = new PhraseQuery(); q.Add(new Term("field", "3")); q.Add(new Term("field", "4")); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(0, hits.Length); // phrase query would find it when correct positions are specified. q = new PhraseQuery(); q.Add(new Term("field", "3"), 0); q.Add(new Term("field", "4"), 0); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); // phrase query should fail for a non-existing searched term // even if other searched terms exist in the same searched position. q = new PhraseQuery(); q.Add(new Term("field", "3"), 0); q.Add(new Term("field", "9"), 0); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(0, hits.Length); // multi-phrase query should succeed for a non-existing searched term // because other searched terms exist in the same searched position. MultiPhraseQuery mq = new MultiPhraseQuery(); mq.Add(new Term[] { new Term("field", "3"), new Term("field", "9") }, 0); hits = searcher.Search(mq, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); q = new PhraseQuery(); q.Add(new Term("field", "2")); q.Add(new Term("field", "4")); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); q = new PhraseQuery(); q.Add(new Term("field", "3")); q.Add(new Term("field", "5")); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); q = new PhraseQuery(); q.Add(new Term("field", "4")); q.Add(new Term("field", "5")); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); q = new PhraseQuery(); q.Add(new Term("field", "2")); q.Add(new Term("field", "5")); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(0, hits.Length); reader.Dispose(); store.Dispose(); }
public virtual void TestWickedLongTerm() { using (RAMDirectory dir = new RAMDirectory()) { char[] chars = new char[IndexWriter.MAX_TERM_LENGTH]; Arrays.Fill(chars, 'x'); string bigTerm = new string(chars); Document doc = new Document(); using (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT)))) { // This produces a too-long term: string contents = "abc xyz x" + bigTerm + " another term"; doc.Add(new TextField("content", contents, Field.Store.NO)); writer.AddDocument(doc); // Make sure we can add another normal document doc = new Document(); doc.Add(new TextField("content", "abc bbb ccc", Field.Store.NO)); writer.AddDocument(doc); } #pragma warning disable 612, 618 using (IndexReader reader = IndexReader.Open(dir)) #pragma warning restore 612, 618 { // Make sure all terms < max size were indexed assertEquals(2, reader.DocFreq(new Term("content", "abc"))); assertEquals(1, reader.DocFreq(new Term("content", "bbb"))); assertEquals(1, reader.DocFreq(new Term("content", "term"))); assertEquals(1, reader.DocFreq(new Term("content", "another"))); // Make sure position is still incremented when // massive term is skipped: DocsAndPositionsEnum tps = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "content", new BytesRef("another")); assertTrue(tps.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(1, tps.Freq); assertEquals(3, tps.NextPosition()); // Make sure the doc that has the massive term is in // the index: assertEquals("document with wicked long term is not in the index!", 2, reader.NumDocs); } // Make sure we can add a document with exactly the // maximum length term, and search on that term: doc = new Document(); doc.Add(new TextField("content", bigTerm, Field.Store.NO)); ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT); sa.MaxTokenLength = 100000; using (var writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa))) { writer.AddDocument(doc); } #pragma warning disable 612, 618 using (var reader = IndexReader.Open(dir)) #pragma warning restore 612, 618 { assertEquals(1, reader.DocFreq(new Term("content", bigTerm))); } } }
/// <summary> /// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for all /// indexed fields from terms with a document frequency percentage greater than /// the given <paramref name="maxPercentDocs"/> /// </summary> /// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param> /// <param name="delegate"> <see cref="Analyzer"/> whose <see cref="TokenStream"/> will be filtered </param> /// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param> /// <param name="maxPercentDocs"> The maximum percentage (between 0.0 and 1.0) of index documents which /// contain a term, after which the word is considered to be a stop word </param> /// <exception cref="IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception> public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, float maxPercentDocs) : this(matchVersion, @delegate, indexReader, MultiFields.GetIndexedFields(indexReader), maxPercentDocs) { }
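// A minimal usage sketch for the two constructors above. The wrapped
// StandardAnalyzer, the LUCENE_48 version, and the 0.4f threshold are
// illustrative; pass an int document count instead of a float to use the
// absolute-frequency overload rather than the percentage one.
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Query;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Util;

public static class StopWordAnalyzerExample
{
    public static Analyzer CreateQueryAnalyzer(IndexReader reader)
    {
        // Terms occurring in more than 40% of documents become stop words for
        // query-time analysis; index-time analysis should keep the base analyzer.
        return new QueryAutoStopWordAnalyzer(
            LuceneVersion.LUCENE_48,
            new StandardAnalyzer(LuceneVersion.LUCENE_48),
            reader,
            0.4f);
    }
}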
public override int GetOrdinal(FacetLabel cp) { EnsureOpen(); if (cp.Length == 0) { return(ROOT_ORDINAL); } // First try to find the answer in the LRU cache: // LUCENENET: Despite LRUHashMap being thread-safe, we get much better performance // if reads are separated from writes. ordinalCacheLock.EnterReadLock(); try { if (ordinalCache.TryGetValue(cp, out Int32Class res)) { if (res < indexReader.MaxDoc) { // Since the cache is shared with DTR instances allocated from // doOpenIfChanged, we need to ensure that the ordinal is one that // this DTR instance recognizes. return(res); } else { // if we get here, it means that the category was found in the cache, // but is not recognized by this TR instance. Therefore there's no // need to continue searching for the path on disk, because we won't find // it there either. return(TaxonomyReader.INVALID_ORDINAL); } } } finally { ordinalCacheLock.ExitReadLock(); } // If we're still here, we have a cache miss. We need to fetch the // value from disk, and then also put it in the cache: int ret = TaxonomyReader.INVALID_ORDINAL; DocsEnum docs = MultiFields.GetTermDocsEnum(indexReader, null, Consts.FULL, new BytesRef(FacetsConfig.PathToString(cp.Components, cp.Length)), 0); if (docs != null && docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS) { ret = docs.DocID; // we only store the fact that a category exists, not its nonexistence. // This is required because the caches are shared with new DTR instances // that are allocated from doOpenIfChanged. Therefore, if we only store // information about found categories, we cannot accidentally tell a new // generation of DTR that a category does not exist. ordinalCacheLock.EnterWriteLock(); try { ordinalCache[cp] = ret; } finally { ordinalCacheLock.ExitWriteLock(); } } return(ret); }
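// A minimal usage sketch for GetOrdinal. The taxonomy directory and the
// category path components are illustrative placeholders.
using System;
using Lucene.Net.Facet.Taxonomy;
using Lucene.Net.Facet.Taxonomy.Directory;
using Lucene.Net.Store;

public static class OrdinalLookupExample
{
    public static void PrintOrdinal(Directory taxoDir)
    {
        using (var taxoReader = new DirectoryTaxonomyReader(taxoDir))
        {
            // Resolves the category path "Author/Lisa" to its ordinal, hitting
            // the LRU cache first and falling back to the index on a miss.
            int ordinal = taxoReader.GetOrdinal(new FacetLabel("Author", "Lisa"));
            Console.WriteLine(ordinal == TaxonomyReader.INVALID_ORDINAL
                ? "category not found"
                : "ordinal=" + ordinal);
        }
    }
}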
public virtual void TestSimple() { int numNodes = TestUtil.NextInt(Random(), 1, 10); double runTimeSec = AtLeast(3); int minDocsToMakeTerms = TestUtil.NextInt(Random(), 5, 20); int maxSearcherAgeSeconds = TestUtil.NextInt(Random(), 1, 3); if (VERBOSE) { Console.WriteLine("TEST: numNodes=" + numNodes + " runTimeSec=" + runTimeSec + " maxSearcherAgeSeconds=" + maxSearcherAgeSeconds); } Start(numNodes, runTimeSec, maxSearcherAgeSeconds); List <PreviousSearchState> priorSearches = new List <PreviousSearchState>(); List <BytesRef> terms = null; while (Time.NanoTime() < endTimeNanos) { bool doFollowon = priorSearches.Count > 0 && Random().Next(7) == 1; // Pick a random node; we will run the query on this node: int myNodeID = Random().Next(numNodes); NodeState.ShardIndexSearcher localShardSearcher; PreviousSearchState prevSearchState; if (doFollowon) { // Pretend user issued a followon query: prevSearchState = priorSearches[Random().Next(priorSearches.Count)]; if (VERBOSE) { Console.WriteLine("\nTEST: follow-on query age=" + ((Time.NanoTime() - prevSearchState.SearchTimeNanos) / 1000000000.0)); } try { localShardSearcher = Nodes[myNodeID].Acquire(prevSearchState.Versions); } catch (SearcherExpiredException see) { // Expected, sometimes; in a "real" app we would // either forward this error to the user ("too // much time has passed; please re-run your // search") or sneakily just switch to newest // searcher w/o telling them... if (VERBOSE) { Console.WriteLine(" searcher expired during local shard searcher init: " + see); } priorSearches.Remove(prevSearchState); continue; } } else { if (VERBOSE) { Console.WriteLine("\nTEST: fresh query"); } // Do fresh query: localShardSearcher = Nodes[myNodeID].Acquire(); prevSearchState = null; } IndexReader[] subs = new IndexReader[numNodes]; PreviousSearchState searchState = null; try { // Mock: now make a single reader (MultiReader) from all node // searchers. In a real shard env you can't do this... we // do it to confirm results from the shard searcher // are correct: int docCount = 0; try { for (int nodeID = 0; nodeID < numNodes; nodeID++) { long subVersion = localShardSearcher.NodeVersions[nodeID]; IndexSearcher sub = Nodes[nodeID].Searchers.Acquire(subVersion); if (sub == null) { nodeID--; while (nodeID >= 0) { subs[nodeID].DecRef(); subs[nodeID] = null; nodeID--; } throw new SearcherExpiredException("nodeID=" + nodeID + " version=" + subVersion); } subs[nodeID] = sub.IndexReader; docCount += subs[nodeID].MaxDoc; } } catch (SearcherExpiredException see) { // Expected if (VERBOSE) { Console.WriteLine(" searcher expired during mock reader init: " + see); } continue; } IndexReader mockReader = new MultiReader(subs); IndexSearcher mockSearcher = new IndexSearcher(mockReader); Query query; Sort sort; if (prevSearchState != null) { query = prevSearchState.Query; sort = prevSearchState.Sort; } else { if (terms == null && docCount > minDocsToMakeTerms) { // TODO: try to "focus" on high freq terms sometimes too // TODO: maybe also periodically reset the terms...? 
TermsEnum termsEnum = MultiFields.GetTerms(mockReader, "body").GetIterator(null); terms = new List <BytesRef>(); while (termsEnum.Next() != null) { terms.Add(BytesRef.DeepCopyOf(termsEnum.Term)); } if (VERBOSE) { Console.WriteLine("TEST: init terms: " + terms.Count + " terms"); } if (terms.Count == 0) { terms = null; } } if (VERBOSE) { Console.WriteLine(" maxDoc=" + mockReader.MaxDoc); } if (terms != null) { if (Random().NextBoolean()) { query = new TermQuery(new Term("body", terms[Random().Next(terms.Count)])); } else { string t = terms[Random().Next(terms.Count)].Utf8ToString(); string prefix; if (t.Length <= 1) { prefix = t; } else { prefix = t.Substring(0, TestUtil.NextInt(Random(), 1, 2)); } query = new PrefixQuery(new Term("body", prefix)); } if (Random().NextBoolean()) { sort = null; } else { // TODO: sort by more than 1 field int what = Random().Next(3); if (what == 0) { sort = new Sort(SortField.FIELD_SCORE); } else if (what == 1) { // TODO: this sort doesn't merge // correctly... it's tricky because you // could have > 2.1B docs across all shards: //sort = new Sort(SortField.FIELD_DOC); sort = null; } else if (what == 2) { sort = new Sort(new SortField[] { new SortField("docid", SortFieldType.INT32, Random().NextBoolean()) }); } else { sort = new Sort(new SortField[] { new SortField("title", SortFieldType.STRING, Random().NextBoolean()) }); } } } else { query = null; sort = null; } } if (query != null) { try { searchState = AssertSame(mockSearcher, localShardSearcher, query, sort, prevSearchState); } catch (SearcherExpiredException see) { // Expected; in a "real" app we would // either forward this error to the user ("too // much time has passed; please re-run your // search") or sneakily just switch to newest // searcher w/o telling them... if (VERBOSE) { Console.WriteLine(" searcher expired during search: " + see); Console.Out.Write(see.StackTrace); } // We can't do this in general: on a very slow // computer it's possible the local searcher // expires before we can finish our search: // assert prevSearchState != null; if (prevSearchState != null) { priorSearches.Remove(prevSearchState); } } } } finally { Nodes[myNodeID].Release(localShardSearcher); foreach (IndexReader sub in subs) { if (sub != null) { sub.DecRef(); } } } if (searchState != null && searchState.SearchAfterLocal != null && Random().Next(5) == 3) { priorSearches.Add(searchState); if (priorSearches.Count > 200) { Collections.Shuffle(priorSearches); priorSearches.SubList(100, priorSearches.Count).Clear(); } } } Finish(); }
public virtual void Test10kPulsed() { // we always run this test with pulsing codec. Codec cp = TestUtil.AlwaysPostingsFormat(new Pulsing41PostingsFormat(1)); DirectoryInfo f = CreateTempDir("10kpulsed"); BaseDirectoryWrapper dir = NewFSDirectory(f); dir.CheckIndexOnDispose = false; // we do this ourselves explicitly RandomIndexWriter iw = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetCodec(cp)); Document document = new Document(); FieldType ft = new FieldType(TextField.TYPE_STORED); switch (TestUtil.NextInt32(Random, 0, 2)) { case 0: ft.IndexOptions = IndexOptions.DOCS_ONLY; break; case 1: ft.IndexOptions = IndexOptions.DOCS_AND_FREQS; break; default: ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; break; } Field field = NewField("field", "", ft); document.Add(field); //NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT)); for (int i = 0; i < 10050; i++) { //field.StringValue = df.format(i); field.SetStringValue(i.ToString("00000", CultureInfo.InvariantCulture)); iw.AddDocument(document); } IndexReader ir = iw.GetReader(); iw.Dispose(); TermsEnum te = MultiFields.GetTerms(ir, "field").GetIterator(null); DocsEnum de = null; for (int i = 0; i < 10050; i++) { //string expected = df.format(i); string expected = i.ToString("00000", CultureInfo.InvariantCulture); assertEquals(expected, te.Next().Utf8ToString()); de = TestUtil.Docs(Random, te, null, de, DocsFlags.NONE); assertTrue(de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.NextDoc()); } ir.Dispose(); TestUtil.CheckIndex(dir); dir.Dispose(); }
public virtual void Test10kPulsed() { // we always run this test with pulsing codec. Codec cp = TestUtil.AlwaysPostingsFormat(new Pulsing41PostingsFormat(1)); DirectoryInfo f = CreateTempDir("10kpulsed"); BaseDirectoryWrapper dir = NewFSDirectory(f); dir.CheckIndexOnDispose = false; // we do this ourselves explicitly RandomIndexWriter iw = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetCodec(cp)); Document document = new Document(); FieldType ft = new FieldType(TextField.TYPE_STORED); switch (TestUtil.NextInt32(Random, 0, 2)) { case 0: ft.IndexOptions = IndexOptions.DOCS_ONLY; break; case 1: ft.IndexOptions = IndexOptions.DOCS_AND_FREQS; break; default: ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; break; } Field field = NewField("field", "", ft); document.Add(field); for (int i = 0; i < 10050; i++) { field.SetStringValue(i.ToString("00000", CultureInfo.InvariantCulture)); iw.AddDocument(document); } IndexReader ir = iw.GetReader(); iw.Dispose(); TermsEnum te = MultiFields.GetTerms(ir, "field").GetIterator(null); DocsEnum de = null; for (int i = 0; i < 10050; i++) { string expected = i.ToString("00000", CultureInfo.InvariantCulture); assertEquals(expected, te.Next().Utf8ToString()); de = TestUtil.Docs(Random, te, null, de, DocsFlags.NONE); assertTrue(de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.NextDoc()); } ir.Dispose(); TestUtil.CheckIndex(dir); dir.Dispose(); }
public virtual void TestPhrasePrefix() { Directory indexStore = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter(Random(), indexStore); Add("blueberry pie", writer); Add("blueberry strudel", writer); Add("blueberry pizza", writer); Add("blueberry chewing gum", writer); Add("bluebird pizza", writer); Add("bluebird foobar pizza", writer); Add("piccadilly circus", writer); IndexReader reader = writer.Reader; IndexSearcher searcher = NewSearcher(reader); // search for "blueberry pi*": MultiPhraseQuery query1 = new MultiPhraseQuery(); // search for "strawberry pi*": MultiPhraseQuery query2 = new MultiPhraseQuery(); query1.Add(new Term("body", "blueberry")); query2.Add(new Term("body", "strawberry")); LinkedList <Term> termsWithPrefix = new LinkedList <Term>(); // this TermEnum gives "piccadilly", "pie" and "pizza". string prefix = "pi"; TermsEnum te = MultiFields.GetFields(reader).Terms("body").Iterator(null); te.SeekCeil(new BytesRef(prefix)); do { string s = te.Term().Utf8ToString(); if (s.StartsWith(prefix)) { termsWithPrefix.AddLast(new Term("body", s)); } else { break; } } while (te.Next() != null); query1.Add(termsWithPrefix.ToArray(/*new Term[0]*/)); Assert.AreEqual("body:\"blueberry (piccadilly pie pizza)\"", query1.ToString()); query2.Add(termsWithPrefix.ToArray(/*new Term[0]*/)); Assert.AreEqual("body:\"strawberry (piccadilly pie pizza)\"", query2.ToString()); ScoreDoc[] result; result = searcher.Search(query1, null, 1000).ScoreDocs; Assert.AreEqual(2, result.Length); result = searcher.Search(query2, null, 1000).ScoreDocs; Assert.AreEqual(0, result.Length); // search for "blue* pizza": MultiPhraseQuery query3 = new MultiPhraseQuery(); termsWithPrefix.Clear(); prefix = "blue"; te.SeekCeil(new BytesRef(prefix)); do { if (te.Term().Utf8ToString().StartsWith(prefix)) { termsWithPrefix.AddLast(new Term("body", te.Term().Utf8ToString())); } } while (te.Next() != null); query3.Add(termsWithPrefix.ToArray(/*new Term[0]*/)); query3.Add(new Term("body", "pizza")); result = searcher.Search(query3, null, 1000).ScoreDocs; Assert.AreEqual(2, result.Length); // blueberry pizza, bluebird pizza Assert.AreEqual("body:\"(blueberry bluebird) pizza\"", query3.ToString()); // test slop: query3.Slop = 1; result = searcher.Search(query3, null, 1000).ScoreDocs; // just make sure no exc: searcher.Explain(query3, 0); Assert.AreEqual(3, result.Length); // blueberry pizza, bluebird pizza, bluebird // foobar pizza MultiPhraseQuery query4 = new MultiPhraseQuery(); try { query4.Add(new Term("field1", "foo")); query4.Add(new Term("field2", "foobar")); Assert.Fail(); } catch (System.ArgumentException e) { // okay, all terms must belong to the same field } writer.Dispose(); reader.Dispose(); indexStore.Dispose(); }
private void DoTestSeekDoesNotExist(Random r, int numField, IList <Term> fieldTerms, Term[] fieldTermsArray, IndexReader reader) { IDictionary <string, TermsEnum> tes = new Dictionary <string, TermsEnum>(); if (Verbose) { Console.WriteLine("TEST: top random seeks"); } { int num = AtLeast(100); for (int iter = 0; iter < num; iter++) { // seek to random spot string field = ("f" + r.Next(numField)).Intern(); Term tx = new Term(field, GetRandomString(r)); int spot = Array.BinarySearch(fieldTermsArray, tx); if (spot < 0) { if (Verbose) { Console.WriteLine("TEST: non-exist seek to " + field + ":" + UnicodeUtil.ToHexString(tx.Text)); } // term does not exist: if (!tes.TryGetValue(field, out TermsEnum te)) { te = MultiFields.GetTerms(reader, field).GetEnumerator(); tes[field] = te; } if (Verbose) { Console.WriteLine(" got enum"); } spot = -spot - 1; if (spot == fieldTerms.Count || !fieldTerms[spot].Field.Equals(field, StringComparison.Ordinal)) { Assert.AreEqual(TermsEnum.SeekStatus.END, te.SeekCeil(tx.Bytes)); } else { Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, te.SeekCeil(tx.Bytes)); if (Verbose) { Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(te.Term.Utf8ToString())); Console.WriteLine(" exp term=" + UnicodeUtil.ToHexString(fieldTerms[spot].Text)); } Assert.AreEqual(fieldTerms[spot].Bytes, te.Term); // now .next() this many times: int ct = TestUtil.NextInt32(r, 5, 100); for (int i = 0; i < ct; i++) { if (Verbose) { Console.WriteLine("TEST: now next()"); } if (1 + spot + i >= fieldTerms.Count) { break; } Term term = fieldTerms[1 + spot + i]; if (!term.Field.Equals(field, StringComparison.Ordinal)) { Assert.IsFalse(te.MoveNext()); break; } else { Assert.IsTrue(te.MoveNext()); BytesRef t = te.Term; if (Verbose) { Console.WriteLine(" got term=" + (t == null ? null : UnicodeUtil.ToHexString(t.Utf8ToString()))); Console.WriteLine(" exp=" + UnicodeUtil.ToHexString(term.Text.ToString())); } Assert.AreEqual(term.Bytes, t); } } } } } } }
public virtual void TestPhrasePrefix() { Directory indexStore = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif Random, indexStore); Document doc1 = new Document(); Document doc2 = new Document(); Document doc3 = new Document(); Document doc4 = new Document(); Document doc5 = new Document(); doc1.Add(NewTextField("body", "blueberry pie", Field.Store.YES)); doc2.Add(NewTextField("body", "blueberry strudel", Field.Store.YES)); doc3.Add(NewTextField("body", "blueberry pizza", Field.Store.YES)); doc4.Add(NewTextField("body", "blueberry chewing gum", Field.Store.YES)); doc5.Add(NewTextField("body", "piccadilly circus", Field.Store.YES)); writer.AddDocument(doc1); writer.AddDocument(doc2); writer.AddDocument(doc3); writer.AddDocument(doc4); writer.AddDocument(doc5); IndexReader reader = writer.GetReader(); writer.Dispose(); IndexSearcher searcher = NewSearcher(reader); // PhrasePrefixQuery query1 = new PhrasePrefixQuery(); MultiPhraseQuery query1 = new MultiPhraseQuery(); // PhrasePrefixQuery query2 = new PhrasePrefixQuery(); MultiPhraseQuery query2 = new MultiPhraseQuery(); query1.Add(new Term("body", "blueberry")); query2.Add(new Term("body", "strawberry")); LinkedList <Term> termsWithPrefix = new LinkedList <Term>(); // this TermEnum gives "piccadilly", "pie" and "pizza". string prefix = "pi"; TermsEnum te = MultiFields.GetFields(reader).GetTerms("body").GetIterator(null); te.SeekCeil(new BytesRef(prefix)); do { string s = te.Term.Utf8ToString(); if (s.StartsWith(prefix, StringComparison.Ordinal)) { termsWithPrefix.AddLast(new Term("body", s)); } else { break; } } while (te.Next() != null); query1.Add(termsWithPrefix.ToArray(/*new Term[0]*/)); query2.Add(termsWithPrefix.ToArray(/*new Term[0]*/)); ScoreDoc[] result; result = searcher.Search(query1, null, 1000).ScoreDocs; Assert.AreEqual(2, result.Length); result = searcher.Search(query2, null, 1000).ScoreDocs; Assert.AreEqual(0, result.Length); reader.Dispose(); indexStore.Dispose(); }
private void AddTerms(IndexReader reader, FieldVals f) { if (f.queryString is null) { return; } Terms terms = MultiFields.GetTerms(reader, f.fieldName); if (terms is null) { return; } TokenStream ts = analyzer.GetTokenStream(f.fieldName, f.queryString); try { ICharTermAttribute termAtt = ts.AddAttribute <ICharTermAttribute>(); int corpusNumDocs = reader.NumDocs; ISet <string> processedTerms = new JCG.HashSet <string>(); ts.Reset(); while (ts.IncrementToken()) { string term = termAtt.ToString(); if (!processedTerms.Contains(term)) { processedTerms.Add(term); ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term float minScore = 0; Term startTerm = new Term(f.fieldName, term); AttributeSource atts = new AttributeSource(); IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>(); #pragma warning disable 612, 618 SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength); #pragma warning restore 612, 618 //store the df so all variants use same idf int df = reader.DocFreq(startTerm); int numVariants = 0; int totalVariantDocFreqs = 0; BytesRef possibleMatch; IBoostAttribute boostAtt = fe.Attributes.AddAttribute <IBoostAttribute>(); while (fe.MoveNext()) { possibleMatch = fe.Term; numVariants++; totalVariantDocFreqs += fe.DocFreq; float score = boostAtt.Boost; if (variantsQ.Count < MAX_VARIANTS_PER_TERM || score > minScore) { ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm); variantsQ.InsertWithOverflow(st); minScore = variantsQ.Top.Score; // maintain minScore } maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Count >= MAX_VARIANTS_PER_TERM ? minScore : float.NegativeInfinity; } if (numVariants > 0) { int avgDf = totalVariantDocFreqs / numVariants; if (df == 0) //no direct match we can use as df for all variants { df = avgDf; //use avg df of all variants } // take the top variants (scored by edit distance) and reset the score // to include an IDF factor then add to the global queue for ranking // overall top query terms int size = variantsQ.Count; for (int i = 0; i < size; i++) { ScoreTerm st = variantsQ.Pop(); st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs); q.InsertWithOverflow(st); } } } } ts.End(); } finally { IOUtils.DisposeWhileHandlingException(ts); } }
public override void Run() { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": launch search thread"); } while (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond < stopTimeMS) // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results { try { IndexSearcher s = outerInstance.GetCurrentSearcher(); try { // Verify 1) IW is correctly setting // diagnostics, and 2) segment warming for // merged segments is actually happening: foreach (AtomicReaderContext sub in s.IndexReader.Leaves) { SegmentReader segReader = (SegmentReader)sub.Reader; IDictionary <string, string> diagnostics = segReader.SegmentInfo.Info.Diagnostics; assertNotNull(diagnostics); diagnostics.TryGetValue("source", out string source); assertNotNull(source); if (source.Equals("merge", StringComparison.Ordinal)) { assertTrue("sub reader " + sub + " wasn't warmed: warmed=" + outerInstance.warmed + " diagnostics=" + diagnostics + " si=" + segReader.SegmentInfo, // LUCENENET: ConditionalWeakTable doesn't have ContainsKey, so we normalize to TryGetValue !outerInstance.m_assertMergedSegmentsWarmed || outerInstance.warmed.TryGetValue(segReader.core, out BooleanRef _)); } } if (s.IndexReader.NumDocs > 0) { outerInstance.SmokeTestSearcher(s); Fields fields = MultiFields.GetFields(s.IndexReader); if (fields == null) { continue; } Terms terms = fields.GetTerms("body"); if (terms == null) { continue; } TermsEnum termsEnum = terms.GetEnumerator(); int seenTermCount = 0; int shift; int trigger; if (totTermCount < 30) { shift = 0; trigger = 1; } else { trigger = totTermCount / 30; shift = Random.Next(trigger); } while (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond < stopTimeMS) // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results { if (!termsEnum.MoveNext()) { totTermCount.Value = seenTermCount; break; } seenTermCount++; // search 30 terms if ((seenTermCount + shift) % trigger == 0) { //if (VERBOSE) { //System.out.println(Thread.currentThread().getName() + " now search body:" + term.Utf8ToString()); //} totHits.AddAndGet(outerInstance.RunQuery(s, new TermQuery(new Term("body", termsEnum.Term)))); } } //if (VERBOSE) { //System.out.println(Thread.currentThread().getName() + ": search done"); //} } } finally { outerInstance.ReleaseSearcher(s); } } catch (Exception t) when(t.IsThrowable()) { Console.WriteLine(Thread.CurrentThread.Name + ": hit exc"); outerInstance.m_failed.Value = (true); Console.WriteLine(t.ToString()); throw RuntimeException.Create(t); } } }