/// <summary>
/// Checks term-level statistics.
/// </summary>
public virtual void AssertTermStats(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum)
{
    Assert.AreEqual(leftTermsEnum.DocFreq(), rightTermsEnum.DocFreq());
    // TotalTermFreq() returns -1 when the codec does not record it,
    // so only compare when both sides have a real value
    if (leftTermsEnum.TotalTermFreq() != -1 && rightTermsEnum.TotalTermFreq() != -1)
    {
        Assert.AreEqual(leftTermsEnum.TotalTermFreq(), rightTermsEnum.TotalTermFreq());
    }
}
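// --- Hedged usage sketch (not from the original source) ---
// A helper like this is typically driven by walking two TermsEnums in
// lockstep over the same field of two index views; the loop below is an
// assumed driver, not part of the original snippet.
BytesRef leftTerm;
while ((leftTerm = leftTermsEnum.Next()) != null)
{
    Assert.AreEqual(leftTerm, rightTermsEnum.Next()); // same term bytes
    AssertTermStats(leftTermsEnum, rightTermsEnum);   // same docFreq/totalTermFreq
}
Assert.IsNull(rightTermsEnum.Next()); // both enums must be exhausted together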
public virtual void TestEndOffsetPositionWithTeeSinkTokenFilter()
{
    Store.Directory dir = NewDirectory();
    Analyzer analyzer = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false);
    IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    TokenStream tokenStream = analyzer.TokenStream("field", "abcd ");
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
    TokenStream sink = tee.NewSinkTokenStream();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.StoreTermVectors = true;
    ft.StoreTermVectorOffsets = true;
    ft.StoreTermVectorPositions = true;
    // f1 (the tee) and f2 (the sink) index the same tokens, so "abcd"
    // ends up in the term vector twice, the second instance with offsets
    // shifted past the first field instance
    Field f1 = new Field("field", tee, ft);
    Field f2 = new Field("field", sink, ft);
    doc.Add(f1);
    doc.Add(f2);
    w.AddDocument(doc);
    w.Dispose();

    IndexReader r = DirectoryReader.Open(dir);
    Terms vector = r.GetTermVectors(0).Terms("field");
    assertEquals(1, vector.Size());
    TermsEnum termsEnum = vector.Iterator(null);
    termsEnum.Next();
    assertEquals(2, termsEnum.TotalTermFreq());
    DocsAndPositionsEnum positions = termsEnum.DocsAndPositions(null, null);
    assertTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(2, positions.Freq());
    positions.NextPosition();
    assertEquals(0, positions.StartOffset());
    assertEquals(4, positions.EndOffset());
    positions.NextPosition();
    assertEquals(8, positions.StartOffset());
    assertEquals(12, positions.EndOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.NextDoc());
    r.Dispose();
    dir.Dispose();
}
/// <summary>
/// Adds terms and frequencies found in vector into the Map termFreqMap
/// </summary>
/// <param name="termFreqMap"> a Map of terms and their frequencies </param>
/// <param name="vector"> List of terms and their frequencies for a doc/field </param>
private void AddTermFrequencies(IDictionary<string, Int> termFreqMap, Terms vector)
{
    TermsEnum termsEnum = vector.Iterator(null);
    CharsRef spare = new CharsRef();
    BytesRef text;
    while ((text = termsEnum.Next()) != null)
    {
        UnicodeUtil.UTF8toUTF16(text, spare);
        string term = spare.ToString();
        if (IsNoiseWord(term))
        {
            continue;
        }
        int freq = (int)termsEnum.TotalTermFreq();

        // increment frequency; TryGetValue avoids the KeyNotFoundException
        // that a plain indexer read would throw for an unseen term
        Int cnt;
        if (!termFreqMap.TryGetValue(term, out cnt))
        {
            cnt = new Int();
            termFreqMap[term] = cnt;
            cnt.x = freq;
        }
        else
        {
            cnt.x += freq;
        }
    }
}
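// --- Hedged usage sketch (not from the original source) ---
// A typical caller aggregates one document's term-vector terms into the
// map; reader, docNum, and the "body" field are assumed names, and Int
// is the snippet's own mutable int wrapper.
IDictionary<string, Int> termFreqMap = new Dictionary<string, Int>();
Fields fields = reader.GetTermVectors(docNum);
Terms vector = fields == null ? null : fields.Terms("body");
if (vector != null)
{
    AddTermFrequencies(termFreqMap, vector);
}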
protected void CompareTermVectors(Terms terms, Terms memTerms, string field_name)
{
    TermsEnum termEnum = terms.Iterator(null);
    TermsEnum memTermEnum = memTerms.Iterator(null);

    while (termEnum.Next() != null)
    {
        assertNotNull(memTermEnum.Next());
        assertEquals(termEnum.TotalTermFreq(), memTermEnum.TotalTermFreq());

        DocsAndPositionsEnum docsPosEnum = termEnum.DocsAndPositions(null, null, 0);
        DocsAndPositionsEnum memDocsPosEnum = memTermEnum.DocsAndPositions(null, null, 0);

        string currentTerm = termEnum.Term().Utf8ToString();
        assertEquals("Token mismatch for field: " + field_name, currentTerm, memTermEnum.Term().Utf8ToString());

        docsPosEnum.NextDoc();
        memDocsPosEnum.NextDoc();
        int freq = docsPosEnum.Freq();
        assertEquals(freq, memDocsPosEnum.Freq());

        for (int i = 0; i < freq; i++)
        {
            string failDesc = " (field:" + field_name + " term:" + currentTerm + ")";
            int memPos = memDocsPosEnum.NextPosition();
            int pos = docsPosEnum.NextPosition();
            assertEquals("Position test failed" + failDesc, memPos, pos);
            assertEquals("Start offset test failed" + failDesc, memDocsPosEnum.StartOffset(), docsPosEnum.StartOffset());
            assertEquals("End offset test failed" + failDesc, memDocsPosEnum.EndOffset(), docsPosEnum.EndOffset());
            assertEquals("Missing payload test failed" + failDesc, docsPosEnum.Payload, null);
        }
    }
    assertNull("Still some tokens not processed", memTermEnum.Next());
}
public virtual void TestMixedVectrosVectors()
{
    RandomIndexWriter writer = new RandomIndexWriter(Random(), Directory,
        NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.SIMPLE, true))
            .SetOpenMode(OpenMode.CREATE));
    Document doc = new Document();

    FieldType ft2 = new FieldType(TextField.TYPE_STORED);
    ft2.StoreTermVectors = true;

    FieldType ft3 = new FieldType(TextField.TYPE_STORED);
    ft3.StoreTermVectors = true;
    ft3.StoreTermVectorPositions = true;

    FieldType ft4 = new FieldType(TextField.TYPE_STORED);
    ft4.StoreTermVectors = true;
    ft4.StoreTermVectorOffsets = true;

    FieldType ft5 = new FieldType(TextField.TYPE_STORED);
    ft5.StoreTermVectors = true;
    ft5.StoreTermVectorOffsets = true;
    ft5.StoreTermVectorPositions = true;

    doc.Add(NewTextField("field", "one", Field.Store.YES));
    doc.Add(NewField("field", "one", ft2));
    doc.Add(NewField("field", "one", ft3));
    doc.Add(NewField("field", "one", ft4));
    doc.Add(NewField("field", "one", ft5));
    writer.AddDocument(doc);
    IndexReader reader = writer.Reader;
    writer.Dispose();

    IndexSearcher searcher = NewSearcher(reader);

    Query query = new TermQuery(new Term("field", "one"));
    ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);

    Fields vectors = searcher.IndexReader.GetTermVectors(hits[0].Doc);
    Assert.IsNotNull(vectors);
    Assert.AreEqual(1, vectors.Size);
    Terms vector = vectors.Terms("field");
    Assert.IsNotNull(vector);
    Assert.AreEqual(1, vector.Size());

    TermsEnum termsEnum = vector.Iterator(null);
    Assert.IsNotNull(termsEnum.Next());
    Assert.AreEqual("one", termsEnum.Term().Utf8ToString());
    Assert.AreEqual(5, termsEnum.TotalTermFreq());

    DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
    Assert.IsNotNull(dpEnum);
    Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(5, dpEnum.Freq());
    for (int i = 0; i < 5; i++)
    {
        Assert.AreEqual(i, dpEnum.NextPosition());
    }

    dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
    Assert.IsNotNull(dpEnum);
    Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(5, dpEnum.Freq());
    for (int i = 0; i < 5; i++)
    {
        dpEnum.NextPosition();
        Assert.AreEqual(4 * i, dpEnum.StartOffset());
        Assert.AreEqual(4 * i + 3, dpEnum.EndOffset());
    }
    reader.Dispose();
}
public virtual void CollectTermContext(IndexReader reader, IList<AtomicReaderContext> leaves, TermContext[] contextArray, Term[] queryTerms)
{
    TermsEnum termsEnum = null;
    foreach (AtomicReaderContext context in leaves)
    {
        Fields fields = context.AtomicReader.Fields;
        if (fields == null)
        {
            // reader has no fields
            continue;
        }
        for (int i = 0; i < queryTerms.Length; i++)
        {
            Term term = queryTerms[i];
            TermContext termContext = contextArray[i];
            Terms terms = fields.Terms(term.Field());
            if (terms == null)
            {
                // field does not exist
                continue;
            }
            termsEnum = terms.Iterator(termsEnum);
            Debug.Assert(termsEnum != null);

            if (termsEnum == TermsEnum.EMPTY)
            {
                continue;
            }
            if (termsEnum.SeekExact(term.Bytes()))
            {
                if (termContext == null)
                {
                    contextArray[i] = new TermContext(reader.Context, termsEnum.TermState(), context.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
                }
                else
                {
                    termContext.Register(termsEnum.TermState(), context.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
                }
            }
        }
    }
}
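// --- Hedged usage sketch (not from the original source) ---
// A plausible call site gathers per-segment statistics for each query
// term before rewriting into a combined query; queryTerms and reader
// are assumed, and the exact accessor for the leaf contexts may differ
// between versions of the port.
Term[] queryTerms = { new Term("body", "lucene"), new Term("body", "net") };
TermContext[] contexts = new TermContext[queryTerms.Length];
IList<AtomicReaderContext> leaves = reader.Leaves; // accessor name varies by version
CollectTermContext(reader, leaves, contexts, queryTerms);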
public override bool Collect(BytesRef bytes)
{
    float boost = boostAtt.Boost;

    // make sure within a single seg we always collect
    // terms in order
    Debug.Assert(CompareToLastTerm(bytes));

    //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);
    // ignore uncompetitive hits
    if (StQueue.Size() == MaxSize)
    {
        ScoreTerm t = StQueue.Top();
        if (boost < t.Boost)
        {
            return true;
        }
        if (boost == t.Boost && termComp.Compare(bytes, t.Bytes) > 0)
        {
            return true;
        }
    }

    ScoreTerm t2;
    TermState state = termsEnum.TermState();
    Debug.Assert(state != null);
    if (visitedTerms.TryGetValue(bytes, out t2))
    {
        // if the term is already in the PQ, only update docFreq of term in PQ
        Debug.Assert(t2.Boost == boost, "boost should be equal in all segment TermsEnums");
        t2.TermState.Register(state, ReaderContext.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
    }
    else
    {
        // add new entry in PQ, we must clone the term, else it may get overwritten!
        st.Bytes.CopyBytes(bytes);
        st.Boost = boost;
        visitedTerms[st.Bytes] = st;
        Debug.Assert(st.TermState.DocFreq == 0);
        st.TermState.Register(state, ReaderContext.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
        StQueue.Add(st);

        // possibly drop entries from queue
        if (StQueue.Size() > MaxSize)
        {
            st = StQueue.Pop();
            visitedTerms.Remove(st.Bytes);
            st.TermState.Clear(); // reset the termstate!
        }
        else
        {
            st = new ScoreTerm(termComp, new TermContext(TopReaderContext));
        }
        Debug.Assert(StQueue.Size() <= MaxSize, "the PQ size must be limited to maxSize");

        // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
        if (StQueue.Size() == MaxSize)
        {
            t2 = StQueue.Top();
            maxBoostAtt.MaxNonCompetitiveBoost = t2.Boost;
            maxBoostAtt.CompetitiveTerm = t2.Bytes;
        }
    }

    return true;
}
internal void Fill(string field, TermsEnum termsEnum)
{
    BytesRef term = null;
    while ((term = termsEnum.Next()) != null)
    {
        InsertWithOverflow(new TermStats(field, term, termsEnum.DocFreq(), termsEnum.TotalTermFreq()));
    }
}
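// --- Hedged usage sketch (not from the original source) ---
// Fill presumably belongs to a bounded priority queue of TermStats
// (InsertWithOverflow drops the smallest entry once full) and is fed one
// field at a time; reader and the "body" field are assumptions here.
Terms terms = MultiFields.GetTerms(reader, "body");
if (terms != null)
{
    Fill("body", terms.Iterator(null));
}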
/// <summary>
/// Build the suggest index, using up to the specified
/// amount of temporary RAM while building. Note that
/// the weights for the suggestions are ignored.
/// </summary>
public virtual void Build(InputIterator iterator, double ramBufferSizeMB)
{
    if (iterator.HasPayloads)
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();

    // TODO: messy ... java7 has Files.createTempDirectory
    // ... but 4.x is java6:
    File tempIndexPath = null;
    Random random = new Random();
    while (true)
    {
        tempIndexPath = new File(directory, prefix + ".index." + random.Next(int.MaxValue));
        if (tempIndexPath.mkdir())
        {
            break;
        }
    }

    Directory dir = FSDirectory.Open(tempIndexPath);

    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_CURRENT, indexAnalyzer);
    iwc.OpenMode = IndexWriterConfig.OpenMode_e.CREATE;
    iwc.RAMBufferSizeMB = ramBufferSizeMB;
    IndexWriter writer = new IndexWriter(dir, iwc);

    var ft = new FieldType(TextField.TYPE_NOT_STORED);
    // TODO: if only we had IndexOptions.TERMS_ONLY...
    ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS;
    ft.OmitNorms = true;
    ft.Freeze();

    Document doc = new Document();
    Field field = new Field("body", "", ft);
    doc.Add(field);

    totTokens = 0;
    IndexReader reader = null;

    bool success = false;
    count = 0;
    try
    {
        while (true)
        {
            BytesRef surfaceForm = iterator.Next();
            if (surfaceForm == null)
            {
                break;
            }
            field.StringValue = surfaceForm.Utf8ToString();
            writer.AddDocument(doc);
            count++;
        }

        reader = DirectoryReader.Open(writer, false);

        Terms terms = MultiFields.GetTerms(reader, "body");
        if (terms == null)
        {
            throw new System.ArgumentException("need at least one suggestion");
        }

        // Move all ngrams into an FST:
        TermsEnum termsEnum = terms.Iterator(null);
        Outputs<long?> outputs = PositiveIntOutputs.Singleton;
        Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);

        IntsRef scratchInts = new IntsRef();
        while (true)
        {
            BytesRef term = termsEnum.Next();
            if (term == null)
            {
                break;
            }
            int ngramCount = CountGrams(term);
            if (ngramCount > grams)
            {
                throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
            }
            if (ngramCount == 1)
            {
                totTokens += termsEnum.TotalTermFreq();
            }

            builder.Add(Lucene.Net.Util.Fst.Util.ToIntsRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq()));
        }

        fst = builder.Finish();
        if (fst == null)
        {
            throw new System.ArgumentException("need at least one suggestion");
        }
        //System.out.println("FST: " + fst.getNodeCount() + " nodes");

        /*
         * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
         * Util.toDot(fst, pw, true, true);
         * pw.close();
         */

        success = true;
    }
    finally
    {
        try
        {
            if (success)
            {
                IOUtils.Close(writer, reader);
            }
            else
            {
                IOUtils.CloseWhileHandlingException(writer, reader);
            }
        }
        finally
        {
            foreach (string file in dir.ListAll())
            {
                File path = new File(tempIndexPath, file);
                if (path.Delete() == false)
                {
                    throw new InvalidOperationException("failed to remove " + path);
                }
            }
            if (tempIndexPath.Delete() == false)
            {
                throw new InvalidOperationException("failed to remove " + tempIndexPath);
            }
            dir.Dispose();
        }
    }
}
public virtual void CollectTermContext(IndexReader reader, IList<AtomicReaderContext> leaves, TermContext[] contextArray, Term[] queryTerms)
{
    TermsEnum termsEnum = null;
    foreach (AtomicReaderContext context in leaves)
    {
        Fields fields = context.AtomicReader.Fields;
        if (fields == null)
        {
            // reader has no fields
            continue;
        }
        for (int i = 0; i < queryTerms.Length; i++)
        {
            Term term = queryTerms[i];
            TermContext termContext = contextArray[i];
            Terms terms = fields.Terms(term.Field);
            if (terms == null)
            {
                // field does not exist
                continue;
            }
            termsEnum = terms.Iterator(termsEnum);
            Debug.Assert(termsEnum != null);

            if (termsEnum == TermsEnum.EMPTY)
            {
                continue;
            }
            if (termsEnum.SeekExact(term.Bytes))
            {
                if (termContext == null)
                {
                    contextArray[i] = new TermContext(reader.Context, termsEnum.TermState(), context.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
                }
                else
                {
                    termContext.Register(termsEnum.TermState(), context.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
                }
            }
        }
    }
}
/// <summary>
/// Safe (but, slowish) default method to write every
/// vector field in the document.
/// </summary>
protected internal void AddAllDocVectors(Fields vectors, MergeState mergeState)
{
    if (vectors == null)
    {
        StartDocument(0);
        FinishDocument();
        return;
    }

    int numFields = vectors.Size;
    if (numFields == -1)
    {
        // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
        numFields = 0;
        //for (IEnumerator<string> it = vectors.Iterator(); it.hasNext();)
        foreach (string it in vectors)
        {
            numFields++;
        }
    }
    StartDocument(numFields);

    string lastFieldName = null;

    TermsEnum termsEnum = null;
    DocsAndPositionsEnum docsAndPositionsEnum = null;

    int fieldCount = 0;
    foreach (string fieldName in vectors)
    {
        fieldCount++;
        FieldInfo fieldInfo = mergeState.FieldInfos.FieldInfo(fieldName);

        Debug.Assert(lastFieldName == null || fieldName.CompareTo(lastFieldName) > 0, "lastFieldName=" + lastFieldName + " fieldName=" + fieldName);
        lastFieldName = fieldName;

        Terms terms = vectors.Terms(fieldName);
        if (terms == null)
        {
            // FieldsEnum shouldn't lie...
            continue;
        }

        bool hasPositions = terms.HasPositions();
        bool hasOffsets = terms.HasOffsets();
        bool hasPayloads = terms.HasPayloads();
        Debug.Assert(!hasPayloads || hasPositions);

        int numTerms = (int)terms.Size();
        if (numTerms == -1)
        {
            // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function
            numTerms = 0;
            termsEnum = terms.Iterator(termsEnum);
            while (termsEnum.Next() != null)
            {
                numTerms++;
            }
        }

        StartField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
        termsEnum = terms.Iterator(termsEnum);

        int termCount = 0;
        while (termsEnum.Next() != null)
        {
            termCount++;

            // in a single-document term vector, totalTermFreq is simply
            // the term's frequency within that document
            int freq = (int)termsEnum.TotalTermFreq();

            StartTerm(termsEnum.Term(), freq);

            if (hasPositions || hasOffsets)
            {
                docsAndPositionsEnum = termsEnum.DocsAndPositions(null, docsAndPositionsEnum);
                Debug.Assert(docsAndPositionsEnum != null);

                int docID = docsAndPositionsEnum.NextDoc();
                Debug.Assert(docID != DocIdSetIterator.NO_MORE_DOCS);
                Debug.Assert(docsAndPositionsEnum.Freq() == freq);

                for (int posUpto = 0; posUpto < freq; posUpto++)
                {
                    int pos = docsAndPositionsEnum.NextPosition();
                    int startOffset = docsAndPositionsEnum.StartOffset();
                    int endOffset = docsAndPositionsEnum.EndOffset();

                    BytesRef payload = docsAndPositionsEnum.Payload;

                    Debug.Assert(!hasPositions || pos >= 0);
                    AddPosition(pos, startOffset, endOffset, payload);
                }
            }
            FinishTerm();
        }
        Debug.Assert(termCount == numTerms);
        FinishField();
    }
    Debug.Assert(fieldCount == numFields);
    FinishDocument();
}
public override bool Collect(BytesRef bytes)
{
    int e = Terms.Add(bytes);
    TermState state = TermsEnum.TermState();
    Debug.Assert(state != null);
    if (e < 0)
    {
        // duplicate term: update docFreq
        int pos = (-e) - 1;
        Array.TermState[pos].Register(state, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
        Debug.Assert(Array.Boost[pos] == BoostAtt.Boost, "boost should be equal in all segment TermsEnums");
    }
    else
    {
        // new entry: we populate the entry initially
        Array.Boost[e] = BoostAtt.Boost;
        Array.TermState[e] = new TermContext(TopReaderContext, state, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
        OuterInstance.CheckMaxClauseCount(Terms.Size());
    }
    return true;
}
public override bool Collect(BytesRef bytes)
{
    int pos = PendingTerms.Add(bytes);
    DocVisitCount += TermsEnum.DocFreq();
    if (PendingTerms.Size() >= TermCountLimit || DocVisitCount >= DocCountCutoff)
    {
        HasCutOff = true;
        return false;
    }

    TermState termState = TermsEnum.TermState();
    Debug.Assert(termState != null);
    if (pos < 0)
    {
        pos = (-pos) - 1;
        Array.TermState[pos].Register(termState, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
    }
    else
    {
        Array.TermState[pos] = new TermContext(TopReaderContext, termState, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
    }
    return true;
}