internal void CollectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) {
    IndexReaderContext topReaderContext = reader.Context;
    IComparer<BytesRef> lastTermComp = null;
    foreach (AtomicReaderContext context in topReaderContext.Leaves) {
        Fields fields = context.AtomicReader.Fields;
        if (fields == null) { // reader has no fields
            continue;
        }
        Terms terms = fields.GetTerms(query.m_field);
        if (terms == null) { // field does not exist
            continue;
        }
        TermsEnum termsEnum = GetTermsEnum(query, terms, collector.Attributes);
        if (Debugging.AssertsEnabled) {
            Debugging.Assert(termsEnum != null);
        }
        if (termsEnum == TermsEnum.EMPTY) {
            continue;
        }
        // Check comparer compatibility:
        IComparer<BytesRef> newTermComp = termsEnum.Comparer;
        if (lastTermComp != null && newTermComp != null && newTermComp != lastTermComp) {
            throw new Exception("term comparer should not change between segments: " + lastTermComp + " != " + newTermComp);
        }
        lastTermComp = newTermComp;
        collector.SetReaderContext(topReaderContext, context);
        collector.SetNextEnum(termsEnum);
        BytesRef bytes;
        while ((bytes = termsEnum.Next()) != null) {
            if (!collector.Collect(bytes)) {
                return; // interrupt whole term collection, so also don't iterate other subReaders
            }
        }
    }
}
/// <summary>
/// Returns a <see cref="DocIdSet"/> with documents that should be permitted in search results.
/// </summary>
public override DocIdSet GetDocIdSet(AtomicReaderContext context, IBits acceptDocs) {
    SortedSetDocValues docTermOrds = FieldCache.DEFAULT.GetDocTermOrds(context.AtomicReader, m_query.m_field);
    // Cannot use FixedBitSet because we require long index (ord):
    Int64BitSet termSet = new Int64BitSet(docTermOrds.ValueCount);
    TermsEnum termsEnum = m_query.GetTermsEnum(new TermsAnonymousInnerClassHelper(this, docTermOrds));
    if (Debugging.AssertsEnabled) {
        Debugging.Assert(termsEnum != null);
    }
    if (termsEnum.Next() != null) {
        // fill into a bitset
        do {
            termSet.Set(termsEnum.Ord);
        } while (termsEnum.Next() != null);
    } else {
        return null;
    }
    return new FieldCacheDocIdSet(context.Reader.MaxDoc, acceptDocs, (doc) => {
        docTermOrds.SetDocument(doc);
        long ord;
        // TODO: we could track max bit set and early terminate (since they come in sorted order)
        while ((ord = docTermOrds.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
            if (termSet.Get(ord)) {
                return true;
            }
        }
        return false;
    });
}
/// <summary>
/// Returns a DocIdSet with documents that should be permitted in search results.
/// </summary>
public override DocIdSet GetDocIdSet(AtomicReaderContext context, IBits acceptDocs) {
    SortedDocValues fcsi = FieldCache.DEFAULT.GetTermsIndex(context.AtomicReader, m_query.m_field);
    // Cannot use FixedBitSet because we require long index (ord):
    Int64BitSet termSet = new Int64BitSet(fcsi.ValueCount);
    TermsEnum termsEnum = m_query.GetTermsEnum(new TermsAnonymousInnerClassHelper(fcsi));
    if (Debugging.AssertsEnabled) {
        Debugging.Assert(termsEnum != null);
    }
    if (termsEnum.Next() != null) {
        // fill into a bitset
        do {
            long ord = termsEnum.Ord;
            if (ord >= 0) {
                termSet.Set(ord);
            }
        } while (termsEnum.Next() != null);
    } else {
        return null;
    }
    return new FieldCacheDocIdSet(context.Reader.MaxDoc, acceptDocs, (doc) => {
        int ord = fcsi.GetOrd(doc);
        if (ord == -1) {
            return false;
        }
        return termSet.Get(ord);
    });
}
/// <summary>
/// Returns a <see cref="DocIdSet"/> with documents that should be permitted in search results.
/// </summary>
public override DocIdSet GetDocIdSet(AtomicReaderContext context, IBits acceptDocs) {
    SortedSetDocValues docTermOrds = FieldCache.DEFAULT.GetDocTermOrds(context.AtomicReader, m_query.m_field);
    // Cannot use FixedBitSet because we require long index (ord):
    Int64BitSet termSet = new Int64BitSet(docTermOrds.ValueCount);
    TermsEnum termsEnum = m_query.GetTermsEnum(new TermsAnonymousInnerClassHelper(this, docTermOrds));
    Debug.Assert(termsEnum != null);
    if (termsEnum.Next() != null) {
        // fill into a bitset
        do {
            termSet.Set(termsEnum.Ord);
        } while (termsEnum.Next() != null);
    } else {
        return null;
    }
    return new FieldCacheDocIdSetAnonymousInnerClassHelper(this, context.Reader.MaxDoc, acceptDocs, docTermOrds, termSet);
}
private FixedBitSet CorrectBits(AtomicReader reader, IBits acceptDocs) {
    FixedBitSet bits = new FixedBitSet(reader.MaxDoc); // assume all are invalid
    Terms terms = reader.Fields.GetTerms(fieldName);
    if (terms == null) {
        return bits;
    }
    TermsEnum termsEnum = terms.GetIterator(null);
    DocsEnum docs = null;
    while (true) {
        BytesRef currTerm = termsEnum.Next();
        if (currTerm == null) {
            break;
        } else {
            docs = termsEnum.Docs(acceptDocs, docs, DocsFlags.NONE);
            int doc = docs.NextDoc();
            if (doc != DocIdSetIterator.NO_MORE_DOCS) {
                if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
                    bits.Set(doc);
                } else {
                    int lastDoc = doc;
                    while (true) {
                        lastDoc = doc;
                        doc = docs.NextDoc();
                        if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                            break;
                        }
                    }
                    bits.Set(lastDoc);
                }
            }
        }
    }
    return bits;
}
public override void VisitMatchingTerms(IndexReader reader, string fieldName, IMatchingTermVisitor mtv) {
    /* inspired by PrefixQuery.rewrite(): */
    Terms terms = MultiFields.GetTerms(reader, fieldName);
    if (terms != null) {
        TermsEnum termsEnum = terms.GetIterator(null);
        bool skip = false;
        TermsEnum.SeekStatus status = termsEnum.SeekCeil(new BytesRef(Prefix));
        if (status == TermsEnum.SeekStatus.FOUND) {
            mtv.VisitMatchingTerm(GetLucenePrefixTerm(fieldName));
        } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
            if (StringHelper.StartsWith(termsEnum.Term, prefixRef)) {
                mtv.VisitMatchingTerm(new Term(fieldName, termsEnum.Term.Utf8ToString()));
            } else {
                skip = true;
            }
        } else {
            // EOF
            skip = true;
        }
        if (!skip) {
            while (true) {
                BytesRef text = termsEnum.Next();
                if (text != null && StringHelper.StartsWith(text, prefixRef)) {
                    mtv.VisitMatchingTerm(new Term(fieldName, text.Utf8ToString()));
                } else {
                    break;
                }
            }
        }
    }
}
protected void CompareTermVectors(Terms terms, Terms memTerms, string field_name) {
    TermsEnum termEnum = terms.GetIterator(null);
    TermsEnum memTermEnum = memTerms.GetIterator(null);
    while (termEnum.Next() != null) {
        assertNotNull(memTermEnum.Next());
        assertEquals(termEnum.TotalTermFreq, memTermEnum.TotalTermFreq);
        DocsAndPositionsEnum docsPosEnum = termEnum.DocsAndPositions(null, null, 0);
        DocsAndPositionsEnum memDocsPosEnum = memTermEnum.DocsAndPositions(null, null, 0);
        String currentTerm = termEnum.Term.Utf8ToString();
        assertEquals("Token mismatch for field: " + field_name, currentTerm, memTermEnum.Term.Utf8ToString());
        docsPosEnum.NextDoc();
        memDocsPosEnum.NextDoc();
        int freq = docsPosEnum.Freq;
        assertEquals(freq, memDocsPosEnum.Freq);
        for (int i = 0; i < freq; i++) {
            string failDesc = " (field:" + field_name + " term:" + currentTerm + ")";
            int memPos = memDocsPosEnum.NextPosition();
            int pos = docsPosEnum.NextPosition();
            assertEquals("Position test failed" + failDesc, memPos, pos);
            assertEquals("Start offset test failed" + failDesc, memDocsPosEnum.StartOffset, docsPosEnum.StartOffset);
            assertEquals("End offset test failed" + failDesc, memDocsPosEnum.EndOffset, docsPosEnum.EndOffset);
            assertEquals("Missing payload test failed" + failDesc, docsPosEnum.GetPayload(), null);
        }
    }
    assertNull("Still some tokens not processed", memTermEnum.Next());
}
public void AddWithAnalyzerSuccess() {
    TestObject t = new TestObject() {
        Number = 9876,
        String = "Test Object 9876",
    };
    Assert.AreEqual(0, writer.NumDocs);
    writer.Add(t, new KeywordAnalyzer());
    writer.Commit();
    Assert.AreEqual(1, writer.NumDocs);
    DirectoryReader reader = DirectoryReader.Open(dir);
    int nTerms = 0;
    Fields fields = MultiFields.GetFields(reader);
    foreach (string field in fields) {
        Terms tms = fields.GetTerms(field);
        TermsEnum termsEnum = tms.GetIterator(null);
        BytesRef text = termsEnum.Next();
        while (text != null) {
            if (String.Equals("String", field)) {
                string str = text.Utf8ToString();
                Assert.AreEqual("Test Object 9876", str);
                nTerms++;
            }
            text = termsEnum.Next();
        }
    }
    Assert.AreEqual(1, nTerms);
}
// single straight enum
private void DoTestStraightEnum(IList<Term> fieldTerms, IndexReader reader, int uniqueTermCount) {
    if (VERBOSE) {
        Console.WriteLine("\nTEST: top now enum reader=" + reader);
    }
    Fields fields = MultiFields.GetFields(reader);
    {
        // Test straight enum:
        int termCount = 0;
        foreach (string field in fields) {
            Terms terms = fields.GetTerms(field);
            Assert.IsNotNull(terms);
            TermsEnum termsEnum = terms.GetIterator(null);
            BytesRef text;
            BytesRef lastText = null;
            while ((text = termsEnum.Next()) != null) {
                Term exp = fieldTerms[termCount];
                if (VERBOSE) {
                    Console.WriteLine(" got term=" + field + ":" + UnicodeUtil.ToHexString(text.Utf8ToString()));
                    Console.WriteLine(" exp=" + exp.Field + ":" + UnicodeUtil.ToHexString(exp.Text()));
                    Console.WriteLine();
                }
                if (lastText == null) {
                    lastText = BytesRef.DeepCopyOf(text);
                } else {
                    Assert.IsTrue(lastText.CompareTo(text) < 0);
                    lastText.CopyBytes(text);
                }
                Assert.AreEqual(exp.Field, field);
                Assert.AreEqual(exp.Bytes, text);
                termCount++;
            }
            if (VERBOSE) {
                Console.WriteLine(" no more terms for field=" + field);
            }
        }
        Assert.AreEqual(uniqueTermCount, termCount);
    }
}
public BytesRef Next() {
    if (termsEnum != null) {
        BytesRef next;
        while ((next = termsEnum.Next()) != null) {
            if (IsFrequent(termsEnum.DocFreq())) {
                freq = termsEnum.DocFreq();
                spare.CopyBytes(next);
                return spare;
            }
        }
    }
    return null;
}
public virtual void TestEndOffsetPositionWithTeeSinkTokenFilter() {
    Store.Directory dir = NewDirectory();
    Analyzer analyzer = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false);
    IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    TokenStream tokenStream = analyzer.GetTokenStream("field", "abcd ");
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
    TokenStream sink = tee.NewSinkTokenStream();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.StoreTermVectors = true;
    ft.StoreTermVectorOffsets = true;
    ft.StoreTermVectorPositions = true;
    Field f1 = new Field("field", tee, ft);
    Field f2 = new Field("field", sink, ft);
    doc.Add(f1);
    doc.Add(f2);
    w.AddDocument(doc);
    w.Dispose();
    IndexReader r = DirectoryReader.Open(dir);
    Terms vector = r.GetTermVectors(0).GetTerms("field");
    assertEquals(1, vector.Count);
    TermsEnum termsEnum = vector.GetIterator(null);
    termsEnum.Next();
    assertEquals(2, termsEnum.TotalTermFreq);
    DocsAndPositionsEnum positions = termsEnum.DocsAndPositions(null, null);
    assertTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(2, positions.Freq);
    positions.NextPosition();
    assertEquals(0, positions.StartOffset);
    assertEquals(4, positions.EndOffset);
    positions.NextPosition();
    assertEquals(8, positions.StartOffset);
    assertEquals(12, positions.EndOffset);
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.NextDoc());
    r.Dispose();
    dir.Dispose();
}
public override void VisitMatchingTerms(IndexReader reader, string fieldName, SimpleTerm.IMatchingTermVisitor mtv) {
    int prefixLength = prefix.Length;
    Terms terms = MultiFields.GetTerms(reader, fieldName);
    if (terms != null) {
        TermsEnum termsEnum = terms.GetIterator(null);
        TermsEnum.SeekStatus status = termsEnum.SeekCeil(prefixRef);
        BytesRef text;
        if (status == TermsEnum.SeekStatus.FOUND) {
            text = prefixRef;
        } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
            text = termsEnum.Term;
        } else {
            text = null;
        }
        while (text != null) {
            if (text != null && StringHelper.StartsWith(text, prefixRef)) {
                string textString = text.Utf8ToString();
                Match matcher = pattern.Match(textString.Substring(prefixLength));
                if (matcher.Success) {
                    mtv.VisitMatchingTerm(new Term(fieldName, textString));
                }
            } else {
                break;
            }
            text = termsEnum.Next();
        }
    }
}
protected int GetNegativeValueCount(AtomicReader reader, string field) {
    int ret = 0;
    Terms terms = reader.GetTerms(field);
    if (terms == null) {
        return ret;
    }
    TermsEnum termsEnum = terms.GetIterator(null);
    BytesRef text;
    while ((text = termsEnum.Next()) != null) {
        if (!text.Utf8ToString().StartsWith("-")) {
            break;
        }
        ret++;
    }
    return ret;
}
/// <summary>
/// Adds terms and frequencies found in vector into the Map termFreqMap
/// </summary>
/// <param name="termFreqMap"> a Map of terms and their frequencies </param>
/// <param name="vector"> List of terms and their frequencies for a doc/field </param>
private void AddTermFrequencies(IDictionary<string, Int> termFreqMap, Terms vector) {
    TermsEnum termsEnum = vector.Iterator(null);
    CharsRef spare = new CharsRef();
    BytesRef text;
    while ((text = termsEnum.Next()) != null) {
        UnicodeUtil.UTF8toUTF16(text, spare);
        string term = spare.ToString();
        if (IsNoiseWord(term)) {
            continue;
        }
        int freq = (int)termsEnum.TotalTermFreq();
        // increment frequency
        Int cnt = termFreqMap[term];
        if (cnt == null) {
            cnt = new Int();
            termFreqMap[term] = cnt;
            cnt.x = freq;
        } else {
            cnt.x += freq;
        }
    }
}
private string[] BestTerms(string field, int numTerms) {
    Util.PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms);
    IndexReader ir = DirectoryReader.Open(dir);
    try {
        int threshold = ir.MaxDoc / 10; // ignore words too common.
        Terms terms = MultiFields.GetTerms(ir, field);
        if (terms != null) {
            TermsEnum termsEnum = terms.GetIterator(null);
            while (termsEnum.Next() != null) {
                int df = termsEnum.DocFreq;
                if (df < threshold) {
                    string ttxt = termsEnum.Term.Utf8ToString();
                    pq.InsertWithOverflow(new TermDf(ttxt, df));
                }
            }
        }
    } finally {
        ir.Dispose();
    }
    string[] res = new string[pq.Count];
    int i = 0;
    while (pq.Count > 0) {
        TermDf tdf = pq.Pop();
        res[i++] = tdf.word;
        Console.WriteLine(i + ". word: " + tdf.df + " " + tdf.word);
    }
    return res;
}
/// <summary> /// Default merge impl </summary> public virtual void Merge(MergeState mergeState, FieldInfo.IndexOptions? indexOptions, TermsEnum termsEnum) { BytesRef term; Debug.Assert(termsEnum != null); long sumTotalTermFreq = 0; long sumDocFreq = 0; long sumDFsinceLastAbortCheck = 0; FixedBitSet visitedDocs = new FixedBitSet(mergeState.SegmentInfo.DocCount); if (indexOptions == FieldInfo.IndexOptions.DOCS_ONLY) { if (DocsEnum == null) { DocsEnum = new MappingMultiDocsEnum(); } DocsEnum.MergeState = mergeState; MultiDocsEnum docsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: docsEnumIn = (MultiDocsEnum)termsEnum.Docs(null, docsEnumIn, Index.DocsEnum.FLAG_NONE); if (docsEnumIn != null) { DocsEnum.Reset(docsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, DocsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.DocFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } } else if (indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS) { if (DocsAndFreqsEnum == null) { DocsAndFreqsEnum = new MappingMultiDocsEnum(); } DocsAndFreqsEnum.MergeState = mergeState; MultiDocsEnum docsAndFreqsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: docsAndFreqsEnumIn = (MultiDocsEnum)termsEnum.Docs(null, docsAndFreqsEnumIn); Debug.Assert(docsAndFreqsEnumIn != null); DocsAndFreqsEnum.Reset(docsAndFreqsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, DocsAndFreqsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.TotalTermFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } else if (indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { if (PostingsEnum == null) { PostingsEnum = new MappingMultiDocsAndPositionsEnum(); } PostingsEnum.MergeState = mergeState; MultiDocsAndPositionsEnum postingsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: postingsEnumIn = (MultiDocsAndPositionsEnum)termsEnum.DocsAndPositions(null, postingsEnumIn, DocsAndPositionsEnum.FLAG_PAYLOADS); Debug.Assert(postingsEnumIn != null); PostingsEnum.Reset(postingsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, PostingsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.TotalTermFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } else { Debug.Assert(indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); if (PostingsEnum == null) { PostingsEnum = new MappingMultiDocsAndPositionsEnum(); } PostingsEnum.MergeState = mergeState; MultiDocsAndPositionsEnum 
postingsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: postingsEnumIn = (MultiDocsAndPositionsEnum)termsEnum.DocsAndPositions(null, postingsEnumIn); Debug.Assert(postingsEnumIn != null); PostingsEnum.Reset(postingsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, PostingsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.TotalTermFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } Finish(indexOptions == FieldInfo.IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq, sumDocFreq, visitedDocs.Cardinality()); }
private void VerifyVector(TermsEnum vector, int num) {
    StringBuilder temp = new StringBuilder();
    while (vector.Next() != null) {
        temp.Append(vector.Term().Utf8ToString());
    }
    if (!English.IntToEnglish(num).Trim().Equals(temp.ToString().Trim())) {
        Console.WriteLine("wrong term result");
    }
}
private void DuellReaders(CompositeReader other, AtomicReader memIndexReader) { AtomicReader competitor = SlowCompositeReaderWrapper.Wrap(other); Fields memFields = memIndexReader.Fields; foreach (string field in competitor.Fields) { Terms memTerms = memFields.GetTerms(field); Terms iwTerms = memIndexReader.GetTerms(field); if (iwTerms == null) { assertNull(memTerms); } else { NumericDocValues normValues = competitor.GetNormValues(field); NumericDocValues memNormValues = memIndexReader.GetNormValues(field); if (normValues != null) { // mem idx always computes norms on the fly assertNotNull(memNormValues); assertEquals(normValues.Get(0), memNormValues.Get(0)); } assertNotNull(memTerms); assertEquals(iwTerms.DocCount, memTerms.DocCount); assertEquals(iwTerms.SumDocFreq, memTerms.SumDocFreq); assertEquals(iwTerms.SumTotalTermFreq, memTerms.SumTotalTermFreq); TermsEnum iwTermsIter = iwTerms.GetIterator(null); TermsEnum memTermsIter = memTerms.GetIterator(null); if (iwTerms.HasPositions) { bool offsets = iwTerms.HasOffsets && memTerms.HasOffsets; while (iwTermsIter.Next() != null) { assertNotNull(memTermsIter.Next()); assertEquals(iwTermsIter.Term, memTermsIter.Term); DocsAndPositionsEnum iwDocsAndPos = iwTermsIter.DocsAndPositions(null, null); DocsAndPositionsEnum memDocsAndPos = memTermsIter.DocsAndPositions(null, null); while (iwDocsAndPos.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) { assertEquals(iwDocsAndPos.DocID, memDocsAndPos.NextDoc()); assertEquals(iwDocsAndPos.Freq, memDocsAndPos.Freq); for (int i = 0; i < iwDocsAndPos.Freq; i++) { assertEquals("term: " + iwTermsIter.Term.Utf8ToString(), iwDocsAndPos.NextPosition(), memDocsAndPos.NextPosition()); if (offsets) { assertEquals(iwDocsAndPos.StartOffset, memDocsAndPos.StartOffset); assertEquals(iwDocsAndPos.EndOffset, memDocsAndPos.EndOffset); } } } } } else { while (iwTermsIter.Next() != null) { assertEquals(iwTermsIter.Term, memTermsIter.Term); DocsEnum iwDocsAndPos = iwTermsIter.Docs(null, null); DocsEnum memDocsAndPos = memTermsIter.Docs(null, null); while (iwDocsAndPos.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) { assertEquals(iwDocsAndPos.DocID, memDocsAndPos.NextDoc()); assertEquals(iwDocsAndPos.Freq, memDocsAndPos.Freq); } } } } } }
public virtual void Test() { #pragma warning disable 612, 618 IFieldCache cache = FieldCache.DEFAULT; FieldCache.Doubles doubles = cache.GetDoubles(Reader, "theDouble", Random.NextBoolean()); Assert.AreSame(doubles, cache.GetDoubles(Reader, "theDouble", Random.NextBoolean()), "Second request to cache return same array"); Assert.AreSame(doubles, cache.GetDoubles(Reader, "theDouble", FieldCache.DEFAULT_DOUBLE_PARSER, Random.NextBoolean()), "Second request with explicit parser return same array"); for (int i = 0; i < NUM_DOCS; i++) { Assert.IsTrue(doubles.Get(i) == (double.MaxValue - i), doubles.Get(i) + " does not equal: " + (double.MaxValue - i)); } FieldCache.Int64s longs = cache.GetInt64s(Reader, "theLong", Random.NextBoolean()); Assert.AreSame(longs, cache.GetInt64s(Reader, "theLong", Random.NextBoolean()), "Second request to cache return same array"); Assert.AreSame(longs, cache.GetInt64s(Reader, "theLong", FieldCache.DEFAULT_INT64_PARSER, Random.NextBoolean()), "Second request with explicit parser return same array"); for (int i = 0; i < NUM_DOCS; i++) { Assert.IsTrue(longs.Get(i) == (long.MaxValue - i), longs.Get(i) + " does not equal: " + (long.MaxValue - i) + " i=" + i); } FieldCache.Bytes bytes = cache.GetBytes(Reader, "theByte", Random.NextBoolean()); Assert.AreSame(bytes, cache.GetBytes(Reader, "theByte", Random.NextBoolean()), "Second request to cache return same array"); Assert.AreSame(bytes, cache.GetBytes(Reader, "theByte", FieldCache.DEFAULT_BYTE_PARSER, Random.NextBoolean()), "Second request with explicit parser return same array"); for (int i = 0; i < NUM_DOCS; i++) { Assert.IsTrue((sbyte)bytes.Get(i) == (sbyte)(sbyte.MaxValue - i), (sbyte)bytes.Get(i) + " does not equal: " + (sbyte.MaxValue - i)); } FieldCache.Int16s shorts = cache.GetInt16s(Reader, "theShort", Random.NextBoolean()); Assert.AreSame(shorts, cache.GetInt16s(Reader, "theShort", Random.NextBoolean()), "Second request to cache return same array"); Assert.AreSame(shorts, cache.GetInt16s(Reader, "theShort", FieldCache.DEFAULT_INT16_PARSER, Random.NextBoolean()), "Second request with explicit parser return same array"); for (int i = 0; i < NUM_DOCS; i++) { Assert.IsTrue(shorts.Get(i) == (short)(short.MaxValue - i), shorts.Get(i) + " does not equal: " + (short.MaxValue - i)); } FieldCache.Int32s ints = cache.GetInt32s(Reader, "theInt", Random.NextBoolean()); Assert.AreSame(ints, cache.GetInt32s(Reader, "theInt", Random.NextBoolean()), "Second request to cache return same array"); Assert.AreSame(ints, cache.GetInt32s(Reader, "theInt", FieldCache.DEFAULT_INT32_PARSER, Random.NextBoolean()), "Second request with explicit parser return same array"); for (int i = 0; i < NUM_DOCS; i++) { Assert.IsTrue(ints.Get(i) == (int.MaxValue - i), ints.Get(i) + " does not equal: " + (int.MaxValue - i)); } FieldCache.Singles floats = cache.GetSingles(Reader, "theFloat", Random.NextBoolean()); Assert.AreSame(floats, cache.GetSingles(Reader, "theFloat", Random.NextBoolean()), "Second request to cache return same array"); Assert.AreSame(floats, cache.GetSingles(Reader, "theFloat", FieldCache.DEFAULT_SINGLE_PARSER, Random.NextBoolean()), "Second request with explicit parser return same array"); for (int i = 0; i < NUM_DOCS; i++) { Assert.IsTrue(floats.Get(i) == (float.MaxValue - i), floats.Get(i) + " does not equal: " + (float.MaxValue - i)); } #pragma warning restore 612, 618 IBits docsWithField = cache.GetDocsWithField(Reader, "theLong"); Assert.AreSame(docsWithField, cache.GetDocsWithField(Reader, "theLong"), "Second request to cache 
return same array"); Assert.IsTrue(docsWithField is Bits.MatchAllBits, "docsWithField(theLong) must be class Bits.MatchAllBits"); Assert.IsTrue(docsWithField.Length == NUM_DOCS, "docsWithField(theLong) Size: " + docsWithField.Length + " is not: " + NUM_DOCS); for (int i = 0; i < docsWithField.Length; i++) { Assert.IsTrue(docsWithField.Get(i)); } docsWithField = cache.GetDocsWithField(Reader, "sparse"); Assert.AreSame(docsWithField, cache.GetDocsWithField(Reader, "sparse"), "Second request to cache return same array"); Assert.IsFalse(docsWithField is Bits.MatchAllBits, "docsWithField(sparse) must not be class Bits.MatchAllBits"); Assert.IsTrue(docsWithField.Length == NUM_DOCS, "docsWithField(sparse) Size: " + docsWithField.Length + " is not: " + NUM_DOCS); for (int i = 0; i < docsWithField.Length; i++) { Assert.AreEqual(i % 2 == 0, docsWithField.Get(i)); } // getTermsIndex SortedDocValues termsIndex = cache.GetTermsIndex(Reader, "theRandomUnicodeString"); Assert.AreSame(termsIndex, cache.GetTermsIndex(Reader, "theRandomUnicodeString"), "Second request to cache return same array"); BytesRef br = new BytesRef(); for (int i = 0; i < NUM_DOCS; i++) { BytesRef term; int ord = termsIndex.GetOrd(i); if (ord == -1) { term = null; } else { termsIndex.LookupOrd(ord, br); term = br; } string s = term == null ? null : term.Utf8ToString(); Assert.IsTrue(UnicodeStrings[i] == null || UnicodeStrings[i].Equals(s, StringComparison.Ordinal), "for doc " + i + ": " + s + " does not equal: " + UnicodeStrings[i]); } int nTerms = termsIndex.ValueCount; TermsEnum tenum = termsIndex.GetTermsEnum(); BytesRef val = new BytesRef(); for (int i = 0; i < nTerms; i++) { BytesRef val1 = tenum.Next(); termsIndex.LookupOrd(i, val); // System.out.println("i="+i); Assert.AreEqual(val, val1); } // seek the enum around (note this isn't a great test here) int num = AtLeast(100); for (int i = 0; i < num; i++) { int k = Random.Next(nTerms); termsIndex.LookupOrd(k, val); Assert.AreEqual(TermsEnum.SeekStatus.FOUND, tenum.SeekCeil(val)); Assert.AreEqual(val, tenum.Term); } for (int i = 0; i < nTerms; i++) { termsIndex.LookupOrd(i, val); Assert.AreEqual(TermsEnum.SeekStatus.FOUND, tenum.SeekCeil(val)); Assert.AreEqual(val, tenum.Term); } // test bad field termsIndex = cache.GetTermsIndex(Reader, "bogusfield"); // getTerms BinaryDocValues terms = cache.GetTerms(Reader, "theRandomUnicodeString", true); Assert.AreSame(terms, cache.GetTerms(Reader, "theRandomUnicodeString", true), "Second request to cache return same array"); IBits bits = cache.GetDocsWithField(Reader, "theRandomUnicodeString"); for (int i = 0; i < NUM_DOCS; i++) { terms.Get(i, br); BytesRef term; if (!bits.Get(i)) { term = null; } else { term = br; } string s = term == null ? null : term.Utf8ToString(); Assert.IsTrue(UnicodeStrings[i] == null || UnicodeStrings[i].Equals(s, StringComparison.Ordinal), "for doc " + i + ": " + s + " does not equal: " + UnicodeStrings[i]); } // test bad field terms = cache.GetTerms(Reader, "bogusfield", false); // getDocTermOrds SortedSetDocValues termOrds = cache.GetDocTermOrds(Reader, "theRandomUnicodeMultiValuedField"); int numEntries = cache.GetCacheEntries().Length; // ask for it again, and check that we didnt create any additional entries: termOrds = cache.GetDocTermOrds(Reader, "theRandomUnicodeMultiValuedField"); Assert.AreEqual(numEntries, cache.GetCacheEntries().Length); for (int i = 0; i < NUM_DOCS; i++) { termOrds.SetDocument(i); // this will remove identical terms. 
A DocTermOrds doesn't return duplicate ords for a docId IList <BytesRef> values = MultiValued[i].Distinct().ToList(); foreach (BytesRef v in values) { if (v == null) { // why does this test use null values... instead of an empty list: confusing break; } long ord = termOrds.NextOrd(); Debug.Assert(ord != SortedSetDocValues.NO_MORE_ORDS); BytesRef scratch = new BytesRef(); termOrds.LookupOrd(ord, scratch); Assert.AreEqual(v, scratch); } Assert.AreEqual(SortedSetDocValues.NO_MORE_ORDS, termOrds.NextOrd()); } // test bad field termOrds = cache.GetDocTermOrds(Reader, "bogusfield"); Assert.IsTrue(termOrds.ValueCount == 0); FieldCache.DEFAULT.PurgeByCacheKey(Reader.CoreCacheKey); }
public void TestRandomIndex() { Directory dir = NewDirectory(); MockAnalyzer analyzer = new MockAnalyzer(Random); analyzer.MaxTokenLength = TestUtil.NextInt32(Random, 1, IndexWriter.MAX_TERM_LENGTH); RandomIndexWriter w = new RandomIndexWriter( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif Random, dir, analyzer); CreateRandomIndex(AtLeast(50), w, Random.NextInt64()); DirectoryReader reader = w.GetReader(); AtomicReader wrapper = SlowCompositeReaderWrapper.Wrap(reader); string field = @"body"; Terms terms = wrapper.GetTerms(field); var lowFreqQueue = new AnonymousPriorityQueue(this, 5); Util.PriorityQueue<TermAndFreq> highFreqQueue = new AnonymousPriorityQueue1(this, 5); try { TermsEnum iterator = terms.GetIterator(null); while (iterator.Next() != null) { if (highFreqQueue.Count < 5) { highFreqQueue.Add(new TermAndFreq(BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq)); lowFreqQueue.Add(new TermAndFreq(BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq)); } else { if (highFreqQueue.Top.freq < iterator.DocFreq) { highFreqQueue.Top.freq = iterator.DocFreq; highFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term); highFreqQueue.UpdateTop(); } if (lowFreqQueue.Top.freq > iterator.DocFreq) { lowFreqQueue.Top.freq = iterator.DocFreq; lowFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term); lowFreqQueue.UpdateTop(); } } } int lowFreq = lowFreqQueue.Top.freq; int highFreq = highFreqQueue.Top.freq; AssumeTrue(@"unlucky index", highFreq - 1 > lowFreq); List<TermAndFreq> highTerms = QueueToList(highFreqQueue); List<TermAndFreq> lowTerms = QueueToList(lowFreqQueue); IndexSearcher searcher = NewSearcher(reader); Occur lowFreqOccur = RandomOccur(Random); BooleanQuery verifyQuery = new BooleanQuery(); CommonTermsQuery cq = new CommonTermsQuery(RandomOccur(Random), lowFreqOccur, highFreq - 1, Random.NextBoolean()); foreach (TermAndFreq termAndFreq in lowTerms) { cq.Add(new Term(field, termAndFreq.term)); verifyQuery.Add(new BooleanClause(new TermQuery(new Term(field, termAndFreq.term)), lowFreqOccur)); } foreach (TermAndFreq termAndFreq in highTerms) { cq.Add(new Term(field, termAndFreq.term)); } TopDocs cqSearch = searcher.Search(cq, reader.MaxDoc); TopDocs verifySearch = searcher.Search(verifyQuery, reader.MaxDoc); assertEquals(verifySearch.TotalHits, cqSearch.TotalHits); var hits = new JCG.HashSet<int>(); foreach (ScoreDoc doc in verifySearch.ScoreDocs) { hits.Add(doc.Doc); } foreach (ScoreDoc doc in cqSearch.ScoreDocs) { assertTrue(hits.Remove(doc.Doc)); } assertTrue(hits.Count == 0); w.ForceMerge(1); DirectoryReader reader2 = w.GetReader(); QueryUtils.Check( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif Random, cq, NewSearcher(reader2)); reader2.Dispose(); } finally { reader.Dispose(); wrapper.Dispose(); w.Dispose(); dir.Dispose(); } }
public override void Run() { if (VERBOSE) { Console.WriteLine(Thread.CurrentThread.Name + ": launch search thread"); } while (Environment.TickCount < stopTimeMS) { try { IndexSearcher s = outerInstance.GetCurrentSearcher(); try { // Verify 1) IW is correctly setting // diagnostics, and 2) segment warming for // merged segments is actually happening: foreach (AtomicReaderContext sub in s.IndexReader.Leaves) { SegmentReader segReader = (SegmentReader)sub.Reader; IDictionary <string, string> diagnostics = segReader.SegmentInfo.Info.Diagnostics; assertNotNull(diagnostics); string source; diagnostics.TryGetValue("source", out source); assertNotNull(source); if (source.Equals("merge", StringComparison.Ordinal)) { assertTrue("sub reader " + sub + " wasn't warmed: warmed=" + outerInstance.warmed + " diagnostics=" + diagnostics + " si=" + segReader.SegmentInfo, !outerInstance.m_assertMergedSegmentsWarmed || outerInstance.warmed.ContainsKey(segReader.core)); } } if (s.IndexReader.NumDocs > 0) { outerInstance.SmokeTestSearcher(s); Fields fields = MultiFields.GetFields(s.IndexReader); if (fields == null) { continue; } Terms terms = fields.GetTerms("body"); if (terms == null) { continue; } TermsEnum termsEnum = terms.GetIterator(null); int seenTermCount = 0; int shift; int trigger; if (totTermCount.Get() < 30) { shift = 0; trigger = 1; } else { trigger = totTermCount.Get() / 30; shift = Random.Next(trigger); } while (Environment.TickCount < stopTimeMS) { BytesRef term = termsEnum.Next(); if (term == null) { totTermCount.Set(seenTermCount); break; } seenTermCount++; // search 30 terms if ((seenTermCount + shift) % trigger == 0) { //if (VERBOSE) { //System.out.println(Thread.currentThread().getName() + " now search body:" + term.Utf8ToString()); //} totHits.AddAndGet(outerInstance.RunQuery(s, new TermQuery(new Term("body", term)))); } } //if (VERBOSE) { //System.out.println(Thread.currentThread().getName() + ": search done"); //} } } finally { outerInstance.ReleaseSearcher(s); } } catch (Exception t) { Console.WriteLine(Thread.CurrentThread.Name + ": hit exc"); outerInstance.m_failed.Set(true); Console.WriteLine(t.ToString()); throw new Exception(t.ToString(), t); } } }
//public static void main( string[] args ) throws Exception { // Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT); // QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "f", analyzer ); // Query query = parser.parse( "a x:b" ); // FieldQuery fieldQuery = new FieldQuery( query, true, false ); // Directory dir = new RAMDirectory(); // IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)); // Document doc = new Document(); // FieldType ft = new FieldType(TextField.TYPE_STORED); // ft.setStoreTermVectors(true); // ft.setStoreTermVectorOffsets(true); // ft.setStoreTermVectorPositions(true); // doc.add( new Field( "f", ft, "a a a b b c a b b c d e f" ) ); // doc.add( new Field( "f", ft, "b a b a f" ) ); // writer.addDocument( doc ); // writer.close(); // IndexReader reader = IndexReader.open(dir1); // new FieldTermStack( reader, 0, "f", fieldQuery ); // reader.close(); //} /// <summary> /// a constructor. /// </summary> /// <param name="reader"><see cref="IndexReader"/> of the index</param> /// <param name="docId">document id to be highlighted</param> /// <param name="fieldName">field of the document to be highlighted</param> /// <param name="fieldQuery"><see cref="FieldQuery"/> object</param> /// <exception cref="System.IO.IOException">If there is a low-level I/O error</exception> public FieldTermStack(IndexReader reader, int docId, string fieldName, FieldQuery fieldQuery) { this.fieldName = fieldName; ISet <string> termSet = fieldQuery.GetTermSet(fieldName); // just return to make null snippet if un-matched fieldName specified when fieldMatch == true if (termSet == null) { return; } Fields vectors = reader.GetTermVectors(docId); if (vectors == null) { // null snippet return; } Terms vector = vectors.GetTerms(fieldName); if (vector == null) { // null snippet return; } CharsRef spare = new CharsRef(); TermsEnum termsEnum = vector.GetIterator(null); DocsAndPositionsEnum dpEnum = null; BytesRef text; int numDocs = reader.MaxDoc; while ((text = termsEnum.Next()) != null) { UnicodeUtil.UTF8toUTF16(text, spare); string term = spare.ToString(); if (!termSet.Contains(term)) { continue; } dpEnum = termsEnum.DocsAndPositions(null, dpEnum); if (dpEnum == null) { // null snippet return; } dpEnum.NextDoc(); // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html float weight = (float)(Math.Log(numDocs / (double)(reader.DocFreq(new Term(fieldName, text)) + 1)) + 1.0); int freq = dpEnum.Freq; for (int i = 0; i < freq; i++) { int pos = dpEnum.NextPosition(); if (dpEnum.StartOffset < 0) { return; // no offsets, null snippet } termList.Add(new TermInfo(term, dpEnum.StartOffset, dpEnum.EndOffset, pos, weight)); } } // sort by position CollectionUtil.TimSort(termList); // now look for dups at the same position, linking them together int currentPos = -1; TermInfo previous = null; TermInfo first = null; for (int i = 0; i < termList.Count;) { TermInfo current = termList[i]; if (current.Position == currentPos) { Debug.Assert(previous != null); previous.SetNext(current); previous = current; //iterator.Remove(); // LUCENENET NOTE: Remove, but don't advance the i position (since removing will advance to the next item) termList.RemoveAt(i); } else { if (previous != null) { previous.SetNext(first); } previous = first = current; currentPos = current.Position; // LUCENENET NOTE: Only increment the position if we don't do a delete. i++; } } if (previous != null) { previous.SetNext(first); } }
public override sealed BytesRef Next() {
    return Delegate.Next();
}
public virtual void TestPhrasePrefix() { Directory indexStore = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif Random, indexStore); Document doc1 = new Document(); Document doc2 = new Document(); Document doc3 = new Document(); Document doc4 = new Document(); Document doc5 = new Document(); doc1.Add(NewTextField("body", "blueberry pie", Field.Store.YES)); doc2.Add(NewTextField("body", "blueberry strudel", Field.Store.YES)); doc3.Add(NewTextField("body", "blueberry pizza", Field.Store.YES)); doc4.Add(NewTextField("body", "blueberry chewing gum", Field.Store.YES)); doc5.Add(NewTextField("body", "piccadilly circus", Field.Store.YES)); writer.AddDocument(doc1); writer.AddDocument(doc2); writer.AddDocument(doc3); writer.AddDocument(doc4); writer.AddDocument(doc5); IndexReader reader = writer.GetReader(); writer.Dispose(); IndexSearcher searcher = NewSearcher(reader); // PhrasePrefixQuery query1 = new PhrasePrefixQuery(); MultiPhraseQuery query1 = new MultiPhraseQuery(); // PhrasePrefixQuery query2 = new PhrasePrefixQuery(); MultiPhraseQuery query2 = new MultiPhraseQuery(); query1.Add(new Term("body", "blueberry")); query2.Add(new Term("body", "strawberry")); LinkedList <Term> termsWithPrefix = new LinkedList <Term>(); // this TermEnum gives "piccadilly", "pie" and "pizza". string prefix = "pi"; TermsEnum te = MultiFields.GetFields(reader).GetTerms("body").GetIterator(null); te.SeekCeil(new BytesRef(prefix)); do { string s = te.Term.Utf8ToString(); if (s.StartsWith(prefix, StringComparison.Ordinal)) { termsWithPrefix.AddLast(new Term("body", s)); } else { break; } } while (te.Next() != null); query1.Add(termsWithPrefix.ToArray(/*new Term[0]*/)); query2.Add(termsWithPrefix.ToArray(/*new Term[0]*/)); ScoreDoc[] result; result = searcher.Search(query1, null, 1000).ScoreDocs; Assert.AreEqual(2, result.Length); result = searcher.Search(query2, null, 1000).ScoreDocs; Assert.AreEqual(0, result.Length); reader.Dispose(); indexStore.Dispose(); }
private void AssertTermsSeeking(Terms leftTerms, Terms rightTerms) {
    TermsEnum leftEnum = null;
    TermsEnum rightEnum = null;
    // just an upper bound
    int numTests = AtLeast(20);
    Random random = Random;
    // collect this number of terms from the left side
    ISet<BytesRef> tests = new JCG.HashSet<BytesRef>();
    int numPasses = 0;
    while (numPasses < 10 && tests.Count < numTests) {
        leftEnum = leftTerms.GetIterator(leftEnum);
        BytesRef term = null;
        while ((term = leftEnum.Next()) != null) {
            int code = random.Next(10);
            if (code == 0) {
                // the term
                tests.Add(BytesRef.DeepCopyOf(term));
            } else if (code == 1) {
                // truncated subsequence of term
                term = BytesRef.DeepCopyOf(term);
                if (term.Length > 0) {
                    // truncate it
                    term.Length = random.Next(term.Length);
                }
            } else if (code == 2) {
                // term, but ensure a non-zero offset
                var newbytes = new byte[term.Length + 5];
                Array.Copy(term.Bytes, term.Offset, newbytes, 5, term.Length);
                tests.Add(new BytesRef(newbytes, 5, term.Length));
            }
        }
        numPasses++;
    }
    List<BytesRef> shuffledTests = new List<BytesRef>(tests);
    shuffledTests.Shuffle(Random);
    foreach (BytesRef b in shuffledTests) {
        leftEnum = leftTerms.GetIterator(leftEnum);
        rightEnum = rightTerms.GetIterator(rightEnum);
        Assert.AreEqual(leftEnum.SeekExact(b), rightEnum.SeekExact(b));
        Assert.AreEqual(leftEnum.SeekExact(b), rightEnum.SeekExact(b));
        SeekStatus leftStatus;
        SeekStatus rightStatus;
        leftStatus = leftEnum.SeekCeil(b);
        rightStatus = rightEnum.SeekCeil(b);
        Assert.AreEqual(leftStatus, rightStatus);
        if (leftStatus != SeekStatus.END) {
            Assert.AreEqual(leftEnum.Term, rightEnum.Term);
        }
        leftStatus = leftEnum.SeekCeil(b);
        rightStatus = rightEnum.SeekCeil(b);
        Assert.AreEqual(leftStatus, rightStatus);
        if (leftStatus != SeekStatus.END) {
            Assert.AreEqual(leftEnum.Term, rightEnum.Term);
        }
    }
}
public virtual void TestMixedVectrosVectors() { RandomIndexWriter writer = new RandomIndexWriter(Random(), Directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.SIMPLE, true)).SetOpenMode(OpenMode.CREATE)); Document doc = new Document(); FieldType ft2 = new FieldType(TextField.TYPE_STORED); ft2.StoreTermVectors = true; FieldType ft3 = new FieldType(TextField.TYPE_STORED); ft3.StoreTermVectors = true; ft3.StoreTermVectorPositions = true; FieldType ft4 = new FieldType(TextField.TYPE_STORED); ft4.StoreTermVectors = true; ft4.StoreTermVectorOffsets = true; FieldType ft5 = new FieldType(TextField.TYPE_STORED); ft5.StoreTermVectors = true; ft5.StoreTermVectorOffsets = true; ft5.StoreTermVectorPositions = true; doc.Add(NewTextField("field", "one", Field.Store.YES)); doc.Add(NewField("field", "one", ft2)); doc.Add(NewField("field", "one", ft3)); doc.Add(NewField("field", "one", ft4)); doc.Add(NewField("field", "one", ft5)); writer.AddDocument(doc); IndexReader reader = writer.Reader; writer.Dispose(); IndexSearcher searcher = NewSearcher(reader); Query query = new TermQuery(new Term("field", "one")); ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); Fields vectors = searcher.IndexReader.GetTermVectors(hits[0].Doc); Assert.IsNotNull(vectors); Assert.AreEqual(1, vectors.Size); Terms vector = vectors.Terms("field"); Assert.IsNotNull(vector); Assert.AreEqual(1, vector.Size()); TermsEnum termsEnum = vector.Iterator(null); Assert.IsNotNull(termsEnum.Next()); Assert.AreEqual("one", termsEnum.Term().Utf8ToString()); Assert.AreEqual(5, termsEnum.TotalTermFreq()); DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null); Assert.IsNotNull(dpEnum); Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); Assert.AreEqual(5, dpEnum.Freq()); for (int i = 0; i < 5; i++) { Assert.AreEqual(i, dpEnum.NextPosition()); } dpEnum = termsEnum.DocsAndPositions(null, dpEnum); Assert.IsNotNull(dpEnum); Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); Assert.AreEqual(5, dpEnum.Freq()); for (int i = 0; i < 5; i++) { dpEnum.NextPosition(); Assert.AreEqual(4 * i, dpEnum.StartOffset()); Assert.AreEqual(4 * i + 3, dpEnum.EndOffset()); } reader.Dispose(); }
/// <summary> /// checks the terms enum sequentially /// if deep is false, it does a 'shallow' test that doesnt go down to the docsenums /// </summary> public virtual void AssertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, bool deep) { BytesRef term; IBits randomBits = new RandomBits(MAXDOC, Random.NextDouble(), Random); DocsAndPositionsEnum leftPositions = null; DocsAndPositionsEnum rightPositions = null; DocsEnum leftDocs = null; DocsEnum rightDocs = null; while ((term = leftTermsEnum.Next()) != null) { Assert.AreEqual(term, rightTermsEnum.Next()); AssertTermStats(leftTermsEnum, rightTermsEnum); if (deep) { // with payloads + off AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions)); AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions)); // with payloads only AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.PAYLOADS)); AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.PAYLOADS)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.PAYLOADS)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.PAYLOADS)); // with offsets only AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.OFFSETS)); AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.OFFSETS)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.OFFSETS)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.OFFSETS)); // with positions only AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, 
DocsAndPositionsFlags.NONE), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.NONE)); AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.NONE), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.NONE)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.NONE), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.NONE)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.NONE), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.NONE)); // with freqs: AssertDocsEnum(leftDocs = leftTermsEnum.Docs(null, leftDocs), rightDocs = rightTermsEnum.Docs(null, rightDocs)); AssertDocsEnum(leftDocs = leftTermsEnum.Docs(randomBits, leftDocs), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs)); // w/o freqs: AssertDocsEnum(leftDocs = leftTermsEnum.Docs(null, leftDocs, DocsFlags.NONE), rightDocs = rightTermsEnum.Docs(null, rightDocs, DocsFlags.NONE)); AssertDocsEnum(leftDocs = leftTermsEnum.Docs(randomBits, leftDocs, DocsFlags.NONE), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs, DocsFlags.NONE)); // with freqs: AssertDocsSkipping(leftTermsEnum.DocFreq, leftDocs = leftTermsEnum.Docs(null, leftDocs), rightDocs = rightTermsEnum.Docs(null, rightDocs)); AssertDocsSkipping(leftTermsEnum.DocFreq, leftDocs = leftTermsEnum.Docs(randomBits, leftDocs), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs)); // w/o freqs: AssertDocsSkipping(leftTermsEnum.DocFreq, leftDocs = leftTermsEnum.Docs(null, leftDocs, DocsFlags.NONE), rightDocs = rightTermsEnum.Docs(null, rightDocs, DocsFlags.NONE)); AssertDocsSkipping(leftTermsEnum.DocFreq, leftDocs = leftTermsEnum.Docs(randomBits, leftDocs, DocsFlags.NONE), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs, DocsFlags.NONE)); } } Assert.IsNull(rightTermsEnum.Next()); }
public virtual void Test() { Directory dir = NewDirectory(); MockAnalyzer analyzer = new MockAnalyzer(Random); analyzer.MaxTokenLength = TestUtil.NextInt32(Random, 1, IndexWriter.MAX_TERM_LENGTH); RandomIndexWriter w = new RandomIndexWriter( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif Random, dir, analyzer); LineFileDocs docs = new LineFileDocs(Random, DefaultCodecSupportsDocValues); int charsToIndex = AtLeast(100000); int charsIndexed = 0; //System.out.println("bytesToIndex=" + charsToIndex); while (charsIndexed < charsToIndex) { Document doc = docs.NextDoc(); charsIndexed += doc.Get("body").Length; w.AddDocument(doc); //System.out.println(" bytes=" + charsIndexed + " add: " + doc); } IndexReader r = w.GetReader(); //System.out.println("numDocs=" + r.NumDocs); w.Dispose(); IndexSearcher s = NewSearcher(r); Terms terms = MultiFields.GetFields(r).GetTerms("body"); int termCount = 0; TermsEnum termsEnum = terms.GetIterator(null); while (termsEnum.Next() != null) { termCount++; } Assert.IsTrue(termCount > 0); // Target ~10 terms to search: double chance = 10.0 / termCount; termsEnum = terms.GetIterator(termsEnum); IDictionary <BytesRef, TopDocs> answers = new Dictionary <BytesRef, TopDocs>(); while (termsEnum.Next() != null) { if (Random.NextDouble() <= chance) { BytesRef term = BytesRef.DeepCopyOf(termsEnum.Term); answers[term] = s.Search(new TermQuery(new Term("body", term)), 100); } } if (answers.Count > 0) { CountdownEvent startingGun = new CountdownEvent(1); int numThreads = TestUtil.NextInt32(Random, 2, 5); ThreadJob[] threads = new ThreadJob[numThreads]; for (int threadID = 0; threadID < numThreads; threadID++) { ThreadJob thread = new ThreadAnonymousInnerClassHelper(this, s, answers, startingGun); threads[threadID] = thread; thread.Start(); } startingGun.Signal(); foreach (ThreadJob thread in threads) { thread.Join(); } } r.Dispose(); dir.Dispose(); }
public virtual void TestTransitionAPI() { Directory dir = NewDirectory(); RandomIndexWriter w = new RandomIndexWriter(Random(), dir, Similarity, TimeZone); Documents.Document doc = new Documents.Document(); doc.Add(new Field("stored", "abc", Field.Store.YES, Field.Index.NO)); doc.Add(new Field("stored_indexed", "abc xyz", Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.Add(new Field("stored_tokenized", "abc xyz", Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field("indexed", "abc xyz", Field.Store.NO, Field.Index.NOT_ANALYZED)); doc.Add(new Field("tokenized", "abc xyz", Field.Store.NO, Field.Index.ANALYZED)); doc.Add(new Field("tokenized_reader", new StringReader("abc xyz"))); doc.Add(new Field("tokenized_tokenstream", w.w.Analyzer.TokenStream("tokenized_tokenstream", new StringReader("abc xyz")))); doc.Add(new Field("binary", new byte[10])); doc.Add(new Field("tv", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)); doc.Add(new Field("tv_pos", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS)); doc.Add(new Field("tv_off", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_OFFSETS)); doc.Add(new Field("tv_pos_off", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); w.AddDocument(doc); IndexReader r = w.Reader; w.Dispose(); doc = r.Document(0); // 4 stored fields Assert.AreEqual(4, doc.Fields.Count); Assert.AreEqual("abc", doc.Get("stored")); Assert.AreEqual("abc xyz", doc.Get("stored_indexed")); Assert.AreEqual("abc xyz", doc.Get("stored_tokenized")); BytesRef br = doc.GetBinaryValue("binary"); Assert.IsNotNull(br); Assert.AreEqual(10, br.Length); IndexSearcher s = new IndexSearcher(r); Assert.AreEqual(1, s.Search(new TermQuery(new Term("stored_indexed", "abc xyz")), 1).TotalHits); Assert.AreEqual(1, s.Search(new TermQuery(new Term("stored_tokenized", "abc")), 1).TotalHits); Assert.AreEqual(1, s.Search(new TermQuery(new Term("stored_tokenized", "xyz")), 1).TotalHits); Assert.AreEqual(1, s.Search(new TermQuery(new Term("indexed", "abc xyz")), 1).TotalHits); Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized", "abc")), 1).TotalHits); Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized", "xyz")), 1).TotalHits); Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_reader", "abc")), 1).TotalHits); Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_reader", "xyz")), 1).TotalHits); Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_tokenstream", "abc")), 1).TotalHits); Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_tokenstream", "xyz")), 1).TotalHits); foreach (string field in new string[] { "tv", "tv_pos", "tv_off", "tv_pos_off" }) { Fields tvFields = r.GetTermVectors(0); Terms tvs = tvFields.Terms(field); Assert.IsNotNull(tvs); Assert.AreEqual(2, tvs.Size()); TermsEnum tvsEnum = tvs.Iterator(null); Assert.AreEqual(new BytesRef("abc"), tvsEnum.Next()); DocsAndPositionsEnum dpEnum = tvsEnum.DocsAndPositions(null, null); if (field.Equals("tv")) { Assert.IsNull(dpEnum); } else { Assert.IsNotNull(dpEnum); } Assert.AreEqual(new BytesRef("xyz"), tvsEnum.Next()); Assert.IsNull(tvsEnum.Next()); } r.Dispose(); dir.Dispose(); }
public virtual void TestSimple()
{
    int numNodes = TestUtil.NextInt(Random(), 1, 10);
    double runTimeSec = AtLeast(3);
    int minDocsToMakeTerms = TestUtil.NextInt(Random(), 5, 20);
    int maxSearcherAgeSeconds = TestUtil.NextInt(Random(), 1, 3);
    if (VERBOSE)
    {
        Console.WriteLine("TEST: numNodes=" + numNodes + " runTimeSec=" + runTimeSec + " maxSearcherAgeSeconds=" + maxSearcherAgeSeconds);
    }

    Start(numNodes, runTimeSec, maxSearcherAgeSeconds);

    List<PreviousSearchState> priorSearches = new List<PreviousSearchState>();
    List<BytesRef> terms = null;
    while (Time.NanoTime() < endTimeNanos)
    {
        bool doFollowon = priorSearches.Count > 0 && Random().Next(7) == 1;

        // Pick a random node; we will run the query on this node:
        int myNodeID = Random().Next(numNodes);

        NodeState.ShardIndexSearcher localShardSearcher;
        PreviousSearchState prevSearchState;

        if (doFollowon)
        {
            // Pretend user issued a followon query:
            prevSearchState = priorSearches[Random().Next(priorSearches.Count)];

            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: follow-on query age=" + ((Time.NanoTime() - prevSearchState.SearchTimeNanos) / 1000000000.0));
            }

            try
            {
                localShardSearcher = Nodes[myNodeID].Acquire(prevSearchState.Versions);
            }
            catch (SearcherExpiredException see)
            {
                // Expected, sometimes; in a "real" app we would
                // either forward this error to the user ("too
                // much time has passed; please re-run your
                // search") or sneakily just switch to newest
                // searcher w/o telling them...
                if (VERBOSE)
                {
                    Console.WriteLine(" searcher expired during local shard searcher init: " + see);
                }
                priorSearches.Remove(prevSearchState);
                continue;
            }
        }
        else
        {
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: fresh query");
            }
            // Do fresh query:
            localShardSearcher = Nodes[myNodeID].Acquire();
            prevSearchState = null;
        }

        IndexReader[] subs = new IndexReader[numNodes];
        PreviousSearchState searchState = null;

        try
        {
            // Mock: now make a single reader (MultiReader) from all node
            // searchers. In a real shard env you can't do this... we
            // do it to confirm results from the shard searcher
            // are correct:
            int docCount = 0;
            try
            {
                for (int nodeID = 0; nodeID < numNodes; nodeID++)
                {
                    long subVersion = localShardSearcher.NodeVersions[nodeID];
                    IndexSearcher sub = Nodes[nodeID].Searchers.Acquire(subVersion);
                    if (sub == null)
                    {
                        nodeID--;
                        while (nodeID >= 0)
                        {
                            subs[nodeID].DecRef();
                            subs[nodeID] = null;
                            nodeID--;
                        }
                        throw new SearcherExpiredException("nodeID=" + nodeID + " version=" + subVersion);
                    }
                    subs[nodeID] = sub.IndexReader;
                    docCount += subs[nodeID].MaxDoc;
                }
            }
            catch (SearcherExpiredException see)
            {
                // Expected
                if (VERBOSE)
                {
                    Console.WriteLine(" searcher expired during mock reader init: " + see);
                }
                continue;
            }

            IndexReader mockReader = new MultiReader(subs);
            IndexSearcher mockSearcher = new IndexSearcher(mockReader);

            Query query;
            Sort sort;

            if (prevSearchState != null)
            {
                query = prevSearchState.Query;
                sort = prevSearchState.Sort;
            }
            else
            {
                if (terms == null && docCount > minDocsToMakeTerms)
                {
                    // TODO: try to "focus" on high freq terms sometimes too
                    // TODO: maybe also periodically reset the terms...?
                    TermsEnum termsEnum = MultiFields.GetTerms(mockReader, "body").GetIterator(null);
                    terms = new List<BytesRef>();
                    while (termsEnum.Next() != null)
                    {
                        terms.Add(BytesRef.DeepCopyOf(termsEnum.Term));
                    }
                    if (VERBOSE)
                    {
                        Console.WriteLine("TEST: init terms: " + terms.Count + " terms");
                    }
                    if (terms.Count == 0)
                    {
                        terms = null;
                    }
                }

                if (VERBOSE)
                {
                    Console.WriteLine(" maxDoc=" + mockReader.MaxDoc);
                }

                if (terms != null)
                {
                    if (Random().NextBoolean())
                    {
                        query = new TermQuery(new Term("body", terms[Random().Next(terms.Count)]));
                    }
                    else
                    {
                        string t = terms[Random().Next(terms.Count)].Utf8ToString();
                        string prefix;
                        if (t.Length <= 1)
                        {
                            prefix = t;
                        }
                        else
                        {
                            prefix = t.Substring(0, TestUtil.NextInt(Random(), 1, 2));
                        }
                        query = new PrefixQuery(new Term("body", prefix));
                    }

                    if (Random().NextBoolean())
                    {
                        sort = null;
                    }
                    else
                    {
                        // TODO: sort by more than 1 field
                        int what = Random().Next(3);
                        if (what == 0)
                        {
                            sort = new Sort(SortField.FIELD_SCORE);
                        }
                        else if (what == 1)
                        {
                            // TODO: this sort doesn't merge
                            // correctly... it's tricky because you
                            // could have > 2.1B docs across all shards:
                            //sort = new Sort(SortField.FIELD_DOC);
                            sort = null;
                        }
                        else if (what == 2)
                        {
                            sort = new Sort(new SortField[] { new SortField("docid", SortFieldType.INT32, Random().NextBoolean()) });
                        }
                        else
                        {
                            sort = new Sort(new SortField[] { new SortField("title", SortFieldType.STRING, Random().NextBoolean()) });
                        }
                    }
                }
                else
                {
                    query = null;
                    sort = null;
                }
            }

            if (query != null)
            {
                try
                {
                    searchState = AssertSame(mockSearcher, localShardSearcher, query, sort, prevSearchState);
                }
                catch (SearcherExpiredException see)
                {
                    // Expected; in a "real" app we would
                    // either forward this error to the user ("too
                    // much time has passed; please re-run your
                    // search") or sneakily just switch to newest
                    // searcher w/o telling them...
                    if (VERBOSE)
                    {
                        Console.WriteLine(" searcher expired during search: " + see);
                        Console.Out.Write(see.StackTrace);
                    }
                    // We can't do this in general: on a very slow
                    // computer it's possible the local searcher
                    // expires before we can finish our search:
                    // assert prevSearchState != null;
                    if (prevSearchState != null)
                    {
                        priorSearches.Remove(prevSearchState);
                    }
                }
            }
        }
        finally
        {
            Nodes[myNodeID].Release(localShardSearcher);
            foreach (IndexReader sub in subs)
            {
                if (sub != null)
                {
                    sub.DecRef();
                }
            }
        }

        if (searchState != null && searchState.SearchAfterLocal != null && Random().Next(5) == 3)
        {
            priorSearches.Add(searchState);
            if (priorSearches.Count > 200)
            {
                Collections.Shuffle(priorSearches);
                priorSearches.SubList(100, priorSearches.Count).Clear();
            }
        }
    }

    Finish();
}
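// A minimal, hedged sketch of the term-harvesting pattern used above: walk a field's
// TermsEnum obtained via MultiFields and deep-copy each BytesRef, because the enum is
// allowed to reuse its internal buffer between calls to Next(). The helper name
// CollectFieldTerms is hypothetical.
private static List<BytesRef> CollectFieldTerms(IndexReader reader, string field)
{
    List<BytesRef> result = new List<BytesRef>();
    Terms terms = MultiFields.GetTerms(reader, field);
    if (terms == null)
    {
        return result; // field does not exist in this reader
    }
    TermsEnum termsEnum = terms.GetIterator(null);
    while (termsEnum.Next() != null)
    {
        result.Add(BytesRef.DeepCopyOf(termsEnum.Term));
    }
    return result;
}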
/// <summary>
/// Build the suggest index, using up to the specified
/// amount of temporary RAM while building. Note that
/// the weights for the suggestions are ignored.
/// </summary>
public virtual void Build(IInputIterator iterator, double ramBufferSizeMB)
{
    if (iterator.HasPayloads)
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }
    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();

    // TODO: messy ... java7 has Files.createTempDirectory
    // ... but 4.x is java6:
    DirectoryInfo tempIndexPath = null;
    Random random = new Random();
    while (true)
    {
        tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + random.Next(int.MaxValue)));
        tempIndexPath.Create();
        if (System.IO.Directory.Exists(tempIndexPath.FullName))
        {
            break;
        }
    }

    using (Directory dir = FSDirectory.Open(tempIndexPath))
    {
#pragma warning disable 612, 618
        IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer);
#pragma warning restore 612, 618
        iwc.SetOpenMode(OpenMode.CREATE);
        iwc.SetRAMBufferSizeMB(ramBufferSizeMB);
        IndexWriter writer = new IndexWriter(dir, iwc);

        var ft = new FieldType(TextField.TYPE_NOT_STORED);
        // TODO: if only we had IndexOptions.TERMS_ONLY...
        ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
        ft.OmitNorms = true;
        ft.Freeze();

        Document doc = new Document();
        Field field = new Field("body", "", ft);
        doc.Add(field);

        totTokens = 0;
        IndexReader reader = null;
        bool success = false;
        count = 0;
        try
        {
            while (true)
            {
                BytesRef surfaceForm = iterator.Next();
                if (surfaceForm == null)
                {
                    break;
                }
                field.SetStringValue(surfaceForm.Utf8ToString());
                writer.AddDocument(doc);
                count++;
            }

            reader = DirectoryReader.Open(writer, false);

            Terms terms = MultiFields.GetTerms(reader, "body");
            if (terms == null)
            {
                throw new System.ArgumentException("need at least one suggestion");
            }

            // Move all ngrams into an FST:
            TermsEnum termsEnum = terms.GetIterator(null);
            Outputs<long?> outputs = PositiveInt32Outputs.Singleton;
            Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
            Int32sRef scratchInts = new Int32sRef();
            while (true)
            {
                BytesRef term = termsEnum.Next();
                if (term == null)
                {
                    break;
                }
                int ngramCount = CountGrams(term);
                if (ngramCount > grams)
                {
                    throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
                }
                if (ngramCount == 1)
                {
                    totTokens += termsEnum.TotalTermFreq;
                }
                builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq));
            }

            fst = builder.Finish();
            if (fst == null)
            {
                throw new System.ArgumentException("need at least one suggestion");
            }
            //System.out.println("FST: " + fst.getNodeCount() + " nodes");

            /*
             * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
             * Util.toDot(fst, pw, true, true);
             * pw.close();
             */
            success = true;
        }
        finally
        {
            try
            {
                if (success)
                {
                    IOUtils.Dispose(writer, reader);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(writer, reader);
                }
            }
            finally
            {
                foreach (string file in dir.ListAll())
                {
                    FileInfo path = new FileInfo(Path.Combine(tempIndexPath.FullName, file));
                    try
                    {
                        path.Delete();
                    }
                    catch (Exception e)
                    {
                        throw new InvalidOperationException("failed to remove " + path, e);
                    }
                }

                try
                {
                    tempIndexPath.Delete();
                }
                catch (Exception e)
                {
                    throw new InvalidOperationException("failed to remove " + tempIndexPath, e);
                }
            }
        }
    }
}
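// Hedged usage sketch for the Build overload above. It assumes this Build lives on a
// FreeTextSuggester-style class and that the Lucene.Net.Search.Spell surface
// (LuceneDictionary, GetEntryIterator) is available; the helper name, the index path,
// and the "title" field are placeholders for this illustration.
private static void BuildTitleSuggester(string indexPath)
{
    var suggester = new FreeTextSuggester(new StandardAnalyzer(LuceneVersion.LUCENE_48));
    using (IndexReader reader = DirectoryReader.Open(FSDirectory.Open(new DirectoryInfo(indexPath))))
    {
        var dict = new LuceneDictionary(reader, "title");
        // Cap the temporary index's indexing buffer at roughly 64 MB while building:
        suggester.Build(dict.GetEntryIterator(), 64.0);
    }
}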
/// <summary>
/// Low level api. Returns a token stream generated from a <see cref="Terms"/>. This
/// can be used to feed the highlighter with a pre-parsed token
/// stream. The <see cref="Terms"/> must have offsets available.
/// <para/>
/// In my tests the speeds to recreate 1000 token streams using this method are:
/// <list type="bullet">
/// <item>
/// with TermVector offset only data stored - 420 milliseconds
/// </item>
/// <item>
/// with TermVector offset AND position data stored - 271 milliseconds
/// (NB: timings for TermVector with position data are based on a tokenizer with contiguous
/// positions - no overlaps or gaps)
/// </item>
/// <item>
/// The cost of not using TermPositionVector to store
/// pre-parsed content and using an analyzer to re-parse the original content
/// - re-analyzing the original content - 980 milliseconds
/// </item>
/// </list>
/// The re-analyze timings will typically vary depending on:
/// <list type="number">
/// <item>
/// The complexity of the analyzer code (timings above were using a
/// stemmer/lowercaser/stopword combo)
/// </item>
/// <item>
/// The number of other fields (Lucene reads ALL fields off the disk
/// when accessing just one document field - this can cost dearly!)
/// </item>
/// <item>
/// Use of compression on field storage - could be faster due to compression (less disk IO)
/// or slower (more CPU burn) depending on the content.
/// </item>
/// </list>
/// </summary>
/// <param name="tpv">The <see cref="Terms"/> (typically a term vector) to convert into a token stream.</param>
/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
/// to eke out the last drops of performance, set to true. If in doubt, set to false.</param>
/// <exception cref="ArgumentException">if no offsets are available</exception>
public static TokenStream GetTokenStream(Terms tpv, bool tokenPositionsGuaranteedContiguous)
{
    if (!tpv.HasOffsets)
    {
        throw new ArgumentException("Cannot create TokenStream from Terms without offsets");
    }

    if (!tokenPositionsGuaranteedContiguous && tpv.HasPositions)
    {
        return new TokenStreamFromTermPositionVector(tpv);
    }

    bool hasPayloads = tpv.HasPayloads;

    // code to reconstruct the original sequence of Tokens
    TermsEnum termsEnum = tpv.GetIterator(null);
    int totalTokens = 0;
    while (termsEnum.Next() != null)
    {
        totalTokens += (int)termsEnum.TotalTermFreq;
    }
    Token[] tokensInOriginalOrder = new Token[totalTokens];
    List<Token> unsortedTokens = null;
    termsEnum = tpv.GetIterator(null);
    BytesRef text;
    DocsAndPositionsEnum dpEnum = null;
    while ((text = termsEnum.Next()) != null)
    {
        dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
        if (dpEnum == null)
        {
            throw new ArgumentException("Required TermVector Offset information was not found");
        }
        string term = text.Utf8ToString();

        dpEnum.NextDoc();
        int freq = dpEnum.Freq;
        for (int posUpto = 0; posUpto < freq; posUpto++)
        {
            int pos = dpEnum.NextPosition();
            if (dpEnum.StartOffset < 0)
            {
                throw new ArgumentException("Required TermVector Offset information was not found");
            }
            Token token = new Token(term, dpEnum.StartOffset, dpEnum.EndOffset);
            if (hasPayloads)
            {
                // Must make a deep copy of the returned payload,
                // since D&PEnum API is allowed to re-use on every
                // call:
                token.Payload = BytesRef.DeepCopyOf(dpEnum.GetPayload());
            }

            if (tokenPositionsGuaranteedContiguous && pos != -1)
            {
                // We have positions stored and a guarantee that the token position
                // information is contiguous

                // This may be fast BUT won't work if Tokenizers used which create >1
                // token in same position or creates jumps in position numbers - this
                // code would fail under those circumstances

                // tokens stored with positions - can use this to index straight into
                // sorted array
                tokensInOriginalOrder[pos] = token;
            }
            else
            {
                // tokens NOT stored with positions or not guaranteed contiguous - must
                // add to list and sort later
                if (unsortedTokens == null)
                {
                    unsortedTokens = new List<Token>();
                }
                unsortedTokens.Add(token);
            }
        }
    }

    // If the field has been stored without position data we must perform a sort
    if (unsortedTokens != null)
    {
        tokensInOriginalOrder = unsortedTokens.ToArray();
        ArrayUtil.TimSort(tokensInOriginalOrder, new TokenComparer());
        //tokensInOriginalOrder = tokensInOriginalOrder
        //    .OrderBy(t => t, new TokenComparer())
        //    .ToArray();
    }
    return new StoredTokenStream(tokensInOriginalOrder);
}
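// Hedged usage sketch: rebuild a TokenStream from a document's stored term vector and feed
// it to the classic Highlighter. The helper name, the "content" field, and the surrounding
// variables are placeholders; the "content" field is assumed to have been indexed with term
// vectors that include offsets, and GetTokenStream refers to the method above.
private static string HighlightFromTermVector(IndexReader reader, int docId, Query query)
{
    Terms termVector = reader.GetTermVector(docId, "content");
    TokenStream ts = GetTokenStream(termVector, false);
    Highlighter highlighter = new Highlighter(new QueryScorer(query, "content"));
    string storedText = reader.Document(docId).Get("content");
    return highlighter.GetBestFragment(ts, storedText);
}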