Пример #1
0
        public virtual void TestTransitionAPI()
        {
            Directory         dir = NewDirectory();
            RandomIndexWriter w   = new RandomIndexWriter(Random(), dir);

            Documents.Document doc = new Documents.Document();
            doc.Add(new Field("stored", "abc", Field.Store.YES, Field.Index.NO));
            doc.Add(new Field("stored_indexed", "abc xyz", Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("stored_tokenized", "abc xyz", Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("indexed", "abc xyz", Field.Store.NO, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("tokenized", "abc xyz", Field.Store.NO, Field.Index.ANALYZED));
            doc.Add(new Field("tokenized_reader", new StringReader("abc xyz")));
            doc.Add(new Field("tokenized_tokenstream", w.w.Analyzer.TokenStream("tokenized_tokenstream", new StringReader("abc xyz"))));
            doc.Add(new Field("binary", new byte[10]));
            doc.Add(new Field("tv", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
            doc.Add(new Field("tv_pos", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS));
            doc.Add(new Field("tv_off", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_OFFSETS));
            doc.Add(new Field("tv_pos_off", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
            w.AddDocument(doc);
            IndexReader r = w.Reader;

            w.Dispose();

            doc = r.Document(0);
            // 4 stored fields
            Assert.AreEqual(4, doc.Fields.Count);
            Assert.AreEqual("abc", doc.Get("stored"));
            Assert.AreEqual("abc xyz", doc.Get("stored_indexed"));
            Assert.AreEqual("abc xyz", doc.Get("stored_tokenized"));
            BytesRef br = doc.GetBinaryValue("binary");

            Assert.IsNotNull(br);
            Assert.AreEqual(10, br.Length);

            IndexSearcher s = new IndexSearcher(r);

            Assert.AreEqual(1, s.Search(new TermQuery(new Term("stored_indexed", "abc xyz")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("stored_tokenized", "abc")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("stored_tokenized", "xyz")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("indexed", "abc xyz")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized", "abc")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized", "xyz")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_reader", "abc")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_reader", "xyz")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_tokenstream", "abc")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_tokenstream", "xyz")), 1).TotalHits);

            foreach (string field in new string[] { "tv", "tv_pos", "tv_off", "tv_pos_off" })
            {
                Fields tvFields = r.GetTermVectors(0);
                Terms  tvs      = tvFields.Terms(field);
                Assert.IsNotNull(tvs);
                Assert.AreEqual(2, tvs.Size());
                TermsEnum tvsEnum = tvs.Iterator(null);
                Assert.AreEqual(new BytesRef("abc"), tvsEnum.Next());
                DocsAndPositionsEnum dpEnum = tvsEnum.DocsAndPositions(null, null);
                if (field.Equals("tv"))
                {
                    Assert.IsNull(dpEnum);
                }
                else
                {
                    Assert.IsNotNull(dpEnum);
                }
                Assert.AreEqual(new BytesRef("xyz"), tvsEnum.Next());
                Assert.IsNull(tvsEnum.Next());
            }

            r.Dispose();
            dir.Dispose();
        }
Пример #2
0
        /// <summary>
        /// Default merge impl </summary>
        public virtual void Merge(MergeState mergeState, FieldInfo.IndexOptions? indexOptions, TermsEnum termsEnum)
        {
            BytesRef term;
            Debug.Assert(termsEnum != null);
            long sumTotalTermFreq = 0;
            long sumDocFreq = 0;
            long sumDFsinceLastAbortCheck = 0;
            FixedBitSet visitedDocs = new FixedBitSet(mergeState.SegmentInfo.DocCount);

            if (indexOptions == FieldInfo.IndexOptions.DOCS_ONLY)
            {
                if (DocsEnum == null)
                {
                    DocsEnum = new MappingMultiDocsEnum();
                }
                DocsEnum.MergeState = mergeState;

                MultiDocsEnum docsEnumIn = null;

                while ((term = termsEnum.Next()) != null)
                {
                    // We can pass null for liveDocs, because the
                    // mapping enum will skip the non-live docs:
                    docsEnumIn = (MultiDocsEnum)termsEnum.Docs(null, docsEnumIn, Index.DocsEnum.FLAG_NONE);
                    if (docsEnumIn != null)
                    {
                        DocsEnum.Reset(docsEnumIn);
                        PostingsConsumer postingsConsumer = StartTerm(term);
                        TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, DocsEnum, visitedDocs);
                        if (stats.DocFreq > 0)
                        {
                            FinishTerm(term, stats);
                            sumTotalTermFreq += stats.DocFreq;
                            sumDFsinceLastAbortCheck += stats.DocFreq;
                            sumDocFreq += stats.DocFreq;
                            if (sumDFsinceLastAbortCheck > 60000)
                            {
                                mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0);
                                sumDFsinceLastAbortCheck = 0;
                            }
                        }
                    }
                }
            }
            else if (indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS)
            {
                if (DocsAndFreqsEnum == null)
                {
                    DocsAndFreqsEnum = new MappingMultiDocsEnum();
                }
                DocsAndFreqsEnum.MergeState = mergeState;

                MultiDocsEnum docsAndFreqsEnumIn = null;

                while ((term = termsEnum.Next()) != null)
                {
                    // We can pass null for liveDocs, because the
                    // mapping enum will skip the non-live docs:
                    docsAndFreqsEnumIn = (MultiDocsEnum)termsEnum.Docs(null, docsAndFreqsEnumIn);
                    Debug.Assert(docsAndFreqsEnumIn != null);
                    DocsAndFreqsEnum.Reset(docsAndFreqsEnumIn);
                    PostingsConsumer postingsConsumer = StartTerm(term);
                    TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, DocsAndFreqsEnum, visitedDocs);
                    if (stats.DocFreq > 0)
                    {
                        FinishTerm(term, stats);
                        sumTotalTermFreq += stats.TotalTermFreq;
                        sumDFsinceLastAbortCheck += stats.DocFreq;
                        sumDocFreq += stats.DocFreq;
                        if (sumDFsinceLastAbortCheck > 60000)
                        {
                            mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0);
                            sumDFsinceLastAbortCheck = 0;
                        }
                    }
                }
            }
            else if (indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
            {
                if (PostingsEnum == null)
                {
                    PostingsEnum = new MappingMultiDocsAndPositionsEnum();
                }
                PostingsEnum.MergeState = mergeState;
                MultiDocsAndPositionsEnum postingsEnumIn = null;
                while ((term = termsEnum.Next()) != null)
                {
                    // We can pass null for liveDocs, because the
                    // mapping enum will skip the non-live docs:
                    postingsEnumIn = (MultiDocsAndPositionsEnum)termsEnum.DocsAndPositions(null, postingsEnumIn, DocsAndPositionsEnum.FLAG_PAYLOADS);
                    Debug.Assert(postingsEnumIn != null);
                    PostingsEnum.Reset(postingsEnumIn);

                    PostingsConsumer postingsConsumer = StartTerm(term);
                    TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, PostingsEnum, visitedDocs);
                    if (stats.DocFreq > 0)
                    {
                        FinishTerm(term, stats);
                        sumTotalTermFreq += stats.TotalTermFreq;
                        sumDFsinceLastAbortCheck += stats.DocFreq;
                        sumDocFreq += stats.DocFreq;
                        if (sumDFsinceLastAbortCheck > 60000)
                        {
                            mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0);
                            sumDFsinceLastAbortCheck = 0;
                        }
                    }
                }
            }
            else
            {
                Debug.Assert(indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
                if (PostingsEnum == null)
                {
                    PostingsEnum = new MappingMultiDocsAndPositionsEnum();
                }
                PostingsEnum.MergeState = mergeState;
                MultiDocsAndPositionsEnum postingsEnumIn = null;
                while ((term = termsEnum.Next()) != null)
                {
                    // We can pass null for liveDocs, because the
                    // mapping enum will skip the non-live docs:
                    postingsEnumIn = (MultiDocsAndPositionsEnum)termsEnum.DocsAndPositions(null, postingsEnumIn);
                    Debug.Assert(postingsEnumIn != null);
                    PostingsEnum.Reset(postingsEnumIn);

                    PostingsConsumer postingsConsumer = StartTerm(term);
                    TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, PostingsEnum, visitedDocs);
                    if (stats.DocFreq > 0)
                    {
                        FinishTerm(term, stats);
                        sumTotalTermFreq += stats.TotalTermFreq;
                        sumDFsinceLastAbortCheck += stats.DocFreq;
                        sumDocFreq += stats.DocFreq;
                        if (sumDFsinceLastAbortCheck > 60000)
                        {
                            mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0);
                            sumDFsinceLastAbortCheck = 0;
                        }
                    }
                }
            }
            Finish(indexOptions == FieldInfo.IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq, sumDocFreq, visitedDocs.Cardinality());
        }
Пример #3
0
        /// <summary>
        /// Default merge impl </summary>
        public virtual void Merge(MergeState mergeState, FieldInfo.IndexOptions?indexOptions, TermsEnum termsEnum)
        {
            BytesRef term;

            Debug.Assert(termsEnum != null);
            long        sumTotalTermFreq         = 0;
            long        sumDocFreq               = 0;
            long        sumDFsinceLastAbortCheck = 0;
            FixedBitSet visitedDocs              = new FixedBitSet(mergeState.SegmentInfo.DocCount);

            if (indexOptions == FieldInfo.IndexOptions.DOCS_ONLY)
            {
                if (DocsEnum == null)
                {
                    DocsEnum = new MappingMultiDocsEnum();
                }
                DocsEnum.MergeState = mergeState;

                MultiDocsEnum docsEnumIn = null;

                while ((term = termsEnum.Next()) != null)
                {
                    // We can pass null for liveDocs, because the
                    // mapping enum will skip the non-live docs:
                    docsEnumIn = (MultiDocsEnum)termsEnum.Docs(null, docsEnumIn, Index.DocsEnum.FLAG_NONE);
                    if (docsEnumIn != null)
                    {
                        DocsEnum.Reset(docsEnumIn);
                        PostingsConsumer postingsConsumer = StartTerm(term);
                        TermStats        stats            = postingsConsumer.Merge(mergeState, indexOptions, DocsEnum, visitedDocs);
                        if (stats.DocFreq > 0)
                        {
                            FinishTerm(term, stats);
                            sumTotalTermFreq         += stats.DocFreq;
                            sumDFsinceLastAbortCheck += stats.DocFreq;
                            sumDocFreq += stats.DocFreq;
                            if (sumDFsinceLastAbortCheck > 60000)
                            {
                                mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0);
                                sumDFsinceLastAbortCheck = 0;
                            }
                        }
                    }
                }
            }
            else if (indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS)
            {
                if (DocsAndFreqsEnum == null)
                {
                    DocsAndFreqsEnum = new MappingMultiDocsEnum();
                }
                DocsAndFreqsEnum.MergeState = mergeState;

                MultiDocsEnum docsAndFreqsEnumIn = null;

                while ((term = termsEnum.Next()) != null)
                {
                    // We can pass null for liveDocs, because the
                    // mapping enum will skip the non-live docs:
                    docsAndFreqsEnumIn = (MultiDocsEnum)termsEnum.Docs(null, docsAndFreqsEnumIn);
                    Debug.Assert(docsAndFreqsEnumIn != null);
                    DocsAndFreqsEnum.Reset(docsAndFreqsEnumIn);
                    PostingsConsumer postingsConsumer = StartTerm(term);
                    TermStats        stats            = postingsConsumer.Merge(mergeState, indexOptions, DocsAndFreqsEnum, visitedDocs);
                    if (stats.DocFreq > 0)
                    {
                        FinishTerm(term, stats);
                        sumTotalTermFreq         += stats.TotalTermFreq;
                        sumDFsinceLastAbortCheck += stats.DocFreq;
                        sumDocFreq += stats.DocFreq;
                        if (sumDFsinceLastAbortCheck > 60000)
                        {
                            mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0);
                            sumDFsinceLastAbortCheck = 0;
                        }
                    }
                }
            }
            else if (indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
            {
                if (PostingsEnum == null)
                {
                    PostingsEnum = new MappingMultiDocsAndPositionsEnum();
                }
                PostingsEnum.MergeState = mergeState;
                MultiDocsAndPositionsEnum postingsEnumIn = null;
                while ((term = termsEnum.Next()) != null)
                {
                    // We can pass null for liveDocs, because the
                    // mapping enum will skip the non-live docs:
                    postingsEnumIn = (MultiDocsAndPositionsEnum)termsEnum.DocsAndPositions(null, postingsEnumIn, DocsAndPositionsEnum.FLAG_PAYLOADS);
                    Debug.Assert(postingsEnumIn != null);
                    PostingsEnum.Reset(postingsEnumIn);

                    PostingsConsumer postingsConsumer = StartTerm(term);
                    TermStats        stats            = postingsConsumer.Merge(mergeState, indexOptions, PostingsEnum, visitedDocs);
                    if (stats.DocFreq > 0)
                    {
                        FinishTerm(term, stats);
                        sumTotalTermFreq         += stats.TotalTermFreq;
                        sumDFsinceLastAbortCheck += stats.DocFreq;
                        sumDocFreq += stats.DocFreq;
                        if (sumDFsinceLastAbortCheck > 60000)
                        {
                            mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0);
                            sumDFsinceLastAbortCheck = 0;
                        }
                    }
                }
            }
            else
            {
                Debug.Assert(indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
                if (PostingsEnum == null)
                {
                    PostingsEnum = new MappingMultiDocsAndPositionsEnum();
                }
                PostingsEnum.MergeState = mergeState;
                MultiDocsAndPositionsEnum postingsEnumIn = null;
                while ((term = termsEnum.Next()) != null)
                {
                    // We can pass null for liveDocs, because the
                    // mapping enum will skip the non-live docs:
                    postingsEnumIn = (MultiDocsAndPositionsEnum)termsEnum.DocsAndPositions(null, postingsEnumIn);
                    Debug.Assert(postingsEnumIn != null);
                    PostingsEnum.Reset(postingsEnumIn);

                    PostingsConsumer postingsConsumer = StartTerm(term);
                    TermStats        stats            = postingsConsumer.Merge(mergeState, indexOptions, PostingsEnum, visitedDocs);
                    if (stats.DocFreq > 0)
                    {
                        FinishTerm(term, stats);
                        sumTotalTermFreq         += stats.TotalTermFreq;
                        sumDFsinceLastAbortCheck += stats.DocFreq;
                        sumDocFreq += stats.DocFreq;
                        if (sumDFsinceLastAbortCheck > 60000)
                        {
                            mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0);
                            sumDFsinceLastAbortCheck = 0;
                        }
                    }
                }
            }
            Finish(indexOptions == FieldInfo.IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq, sumDocFreq, visitedDocs.Cardinality());
        }
Пример #4
0
            public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
            {
                Debug.Assert(outerInstance.termArrays.Count > 0);
                AtomicReader reader   = (context.AtomicReader);
                IBits        liveDocs = acceptDocs;

                PhraseQuery.PostingsAndFreq[] postingsFreqs = new PhraseQuery.PostingsAndFreq[outerInstance.termArrays.Count];

                Terms fieldTerms = reader.GetTerms(outerInstance.field);

                if (fieldTerms == null)
                {
                    return(null);
                }

                // Reuse single TermsEnum below:
                TermsEnum termsEnum = fieldTerms.GetIterator(null);

                for (int pos = 0; pos < postingsFreqs.Length; pos++)
                {
                    Term[] terms = outerInstance.termArrays[pos];

                    DocsAndPositionsEnum postingsEnum;
                    int docFreq;

                    if (terms.Length > 1)
                    {
                        postingsEnum = new UnionDocsAndPositionsEnum(liveDocs, context, terms, termContexts, termsEnum);

                        // coarse -- this overcounts since a given doc can
                        // have more than one term:
                        docFreq = 0;
                        for (int termIdx = 0; termIdx < terms.Length; termIdx++)
                        {
                            Term      term      = terms[termIdx];
                            TermState termState = termContexts[term].Get(context.Ord);
                            if (termState == null)
                            {
                                // Term not in reader
                                continue;
                            }
                            termsEnum.SeekExact(term.Bytes, termState);
                            docFreq += termsEnum.DocFreq;
                        }

                        if (docFreq == 0)
                        {
                            // None of the terms are in this reader
                            return(null);
                        }
                    }
                    else
                    {
                        Term      term      = terms[0];
                        TermState termState = termContexts[term].Get(context.Ord);
                        if (termState == null)
                        {
                            // Term not in reader
                            return(null);
                        }
                        termsEnum.SeekExact(term.Bytes, termState);
                        postingsEnum = termsEnum.DocsAndPositions(liveDocs, null, DocsAndPositionsFlags.NONE);

                        if (postingsEnum == null)
                        {
                            // term does exist, but has no positions
                            Debug.Assert(termsEnum.Docs(liveDocs, null, DocsFlags.NONE) != null, "termstate found but no term exists in reader");
                            throw new InvalidOperationException("field \"" + term.Field + "\" was indexed without position data; cannot run PhraseQuery (term=" + term.Text() + ")");
                        }

                        docFreq = termsEnum.DocFreq;
                    }

                    postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, (int)outerInstance.positions[pos], terms);
                }

                // sort by increasing docFreq order
                if (outerInstance.slop == 0)
                {
                    ArrayUtil.TimSort(postingsFreqs);
                }

                if (outerInstance.slop == 0)
                {
                    ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.GetSimScorer(stats, context));
                    if (s.noDocs)
                    {
                        return(null);
                    }
                    else
                    {
                        return(s);
                    }
                }
                else
                {
                    return(new SloppyPhraseScorer(this, postingsFreqs, outerInstance.slop, similarity.GetSimScorer(stats, context)));
                }
            }
Пример #5
0
        public override Spans GetSpans(AtomicReaderContext context, Bits acceptDocs, IDictionary <Term, TermContext> termContexts)
        {
            TermContext termContext;

            termContexts.TryGetValue(term, out termContext);
            TermState state;

            if (termContext == null)
            {
                // this happens with span-not query, as it doesn't include the NOT side in extractTerms()
                // so we seek to the term now in this segment..., this sucks because its ugly mostly!
                Fields fields = context.AtomicReader.Fields;
                if (fields != null)
                {
                    Terms terms = fields.Terms(term.Field);
                    if (terms != null)
                    {
                        TermsEnum termsEnum = terms.Iterator(null);
                        if (termsEnum.SeekExact(term.Bytes))
                        {
                            state = termsEnum.TermState();
                        }
                        else
                        {
                            state = null;
                        }
                    }
                    else
                    {
                        state = null;
                    }
                }
                else
                {
                    state = null;
                }
            }
            else
            {
                state = termContext.Get(context.Ord);
            }

            if (state == null) // term is not present in that reader
            {
                return(TermSpans.EMPTY_TERM_SPANS);
            }

            TermsEnum termsEnum_ = context.AtomicReader.Terms(term.Field).Iterator(null);

            termsEnum_.SeekExact(term.Bytes, state);

            DocsAndPositionsEnum postings = termsEnum_.DocsAndPositions(acceptDocs, null, DocsAndPositionsEnum.FLAG_PAYLOADS);

            if (postings != null)
            {
                return(new TermSpans(postings, term));
            }
            else
            {
                // term does exist, but has no positions
                throw new InvalidOperationException("field \"" + term.Field + "\" was indexed without position data; cannot run SpanTermQuery (term=" + term.Text() + ")");
            }
        }
Пример #6
0
        public virtual void TestMixedVectrosVectors()
        {
            RandomIndexWriter writer = new RandomIndexWriter(Random(), Directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.SIMPLE, true)).SetOpenMode(OpenMode.CREATE));
            Document          doc    = new Document();

            FieldType ft2 = new FieldType(TextField.TYPE_STORED);

            ft2.StoreTermVectors = true;

            FieldType ft3 = new FieldType(TextField.TYPE_STORED);

            ft3.StoreTermVectors         = true;
            ft3.StoreTermVectorPositions = true;

            FieldType ft4 = new FieldType(TextField.TYPE_STORED);

            ft4.StoreTermVectors       = true;
            ft4.StoreTermVectorOffsets = true;

            FieldType ft5 = new FieldType(TextField.TYPE_STORED);

            ft5.StoreTermVectors         = true;
            ft5.StoreTermVectorOffsets   = true;
            ft5.StoreTermVectorPositions = true;

            doc.Add(NewTextField("field", "one", Field.Store.YES));
            doc.Add(NewField("field", "one", ft2));
            doc.Add(NewField("field", "one", ft3));
            doc.Add(NewField("field", "one", ft4));
            doc.Add(NewField("field", "one", ft5));
            writer.AddDocument(doc);
            IndexReader reader = writer.Reader;

            writer.Dispose();

            IndexSearcher searcher = NewSearcher(reader);

            Query query = new TermQuery(new Term("field", "one"));

            ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);

            Fields vectors = searcher.IndexReader.GetTermVectors(hits[0].Doc);

            Assert.IsNotNull(vectors);
            Assert.AreEqual(1, vectors.Count);
            Terms vector = vectors.GetTerms("field");

            Assert.IsNotNull(vector);
            Assert.AreEqual(1, vector.Count);
            TermsEnum termsEnum = vector.GetIterator(null);

            Assert.IsNotNull(termsEnum.Next());
            Assert.AreEqual("one", termsEnum.Term.Utf8ToString());
            Assert.AreEqual(5, termsEnum.TotalTermFreq);
            DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);

            Assert.IsNotNull(dpEnum);
            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            Assert.AreEqual(5, dpEnum.Freq);
            for (int i = 0; i < 5; i++)
            {
                Assert.AreEqual(i, dpEnum.NextPosition());
            }

            dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
            Assert.IsNotNull(dpEnum);
            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            Assert.AreEqual(5, dpEnum.Freq);
            for (int i = 0; i < 5; i++)
            {
                dpEnum.NextPosition();
                Assert.AreEqual(4 * i, dpEnum.StartOffset);
                Assert.AreEqual(4 * i + 3, dpEnum.EndOffset);
            }
            reader.Dispose();
        }
Пример #7
0
            public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
            {
                Debug.Assert(outerInstance.terms.Count > 0);
                AtomicReader reader   = context.AtomicReader;
                IBits        liveDocs = acceptDocs;

                PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[outerInstance.terms.Count];

                Terms fieldTerms = reader.GetTerms(outerInstance.field);

                if (fieldTerms == null)
                {
                    return(null);
                }

                // Reuse single TermsEnum below:
                TermsEnum te = fieldTerms.GetIterator(null);

                for (int i = 0; i < outerInstance.terms.Count; i++)
                {
                    Term      t     = outerInstance.terms[i];
                    TermState state = states[i].Get(context.Ord);
                    if (state == null) // term doesnt exist in this segment
                    {
                        Debug.Assert(TermNotInReader(reader, t), "no termstate found but term exists in reader");
                        return(null);
                    }
                    te.SeekExact(t.Bytes, state);
                    DocsAndPositionsEnum postingsEnum = te.DocsAndPositions(liveDocs, null, DocsAndPositionsFlags.NONE);

                    // PhraseQuery on a field that did not index
                    // positions.
                    if (postingsEnum == null)
                    {
                        Debug.Assert(te.SeekExact(t.Bytes), "termstate found but no term exists in reader");
                        // term does exist, but has no positions
                        throw new InvalidOperationException("field \"" + t.Field + "\" was indexed without position data; cannot run PhraseQuery (term=" + t.Text() + ")");
                    }
                    postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.DocFreq, (int)outerInstance.positions[i], t);
                }

                // sort by increasing docFreq order
                if (outerInstance.slop == 0)
                {
                    ArrayUtil.TimSort(postingsFreqs);
                }

                if (outerInstance.slop == 0) // optimize exact case
                {
                    ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.GetSimScorer(stats, context));
                    if (s.noDocs)
                    {
                        return(null);
                    }
                    else
                    {
                        return(s);
                    }
                }
                else
                {
                    return(new SloppyPhraseScorer(this, postingsFreqs, outerInstance.slop, similarity.GetSimScorer(stats, context)));
                }
            }
Пример #8
0
        // algorithm: treat sentence snippets as miniature documents
        // we can intersect these with the postings lists via BreakIterator.preceding(offset),s
        // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
        private Passage[] HighlightDoc(string field, BytesRef[] terms, int contentLength, BreakIterator bi, int doc,
                                       TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n)
        {
            PassageScorer scorer = GetScorer(field);

            if (scorer == null)
            {
                throw new NullReferenceException("PassageScorer cannot be null");
            }
            JCG.PriorityQueue <OffsetsEnum> pq = new JCG.PriorityQueue <OffsetsEnum>();
            float[] weights = new float[terms.Length];
            // initialize postings
            for (int i = 0; i < terms.Length; i++)
            {
                DocsAndPositionsEnum de = postings[i];
                int pDoc;
                if (de == EMPTY)
                {
                    continue;
                }
                else if (de == null)
                {
                    postings[i] = EMPTY; // initially
                    if (!termsEnum.SeekExact(terms[i]))
                    {
                        continue; // term not found
                    }
                    de = postings[i] = termsEnum.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS);
                    if (de == null)
                    {
                        // no positions available
                        throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
                    }
                    pDoc = de.Advance(doc);
                }
                else
                {
                    pDoc = de.DocID;
                    if (pDoc < doc)
                    {
                        pDoc = de.Advance(doc);
                    }
                }

                if (doc == pDoc)
                {
                    weights[i] = scorer.Weight(contentLength, de.Freq);
                    de.NextPosition();
                    pq.Add(new OffsetsEnum(de, i));
                }
            }

            pq.Add(new OffsetsEnum(EMPTY, int.MaxValue)); // a sentinel for termination

            JCG.PriorityQueue <Passage> passageQueue = new JCG.PriorityQueue <Passage>(n, Comparer <Passage> .Create((left, right) =>
            {
                if (left.score < right.score)
                {
                    return(-1);
                }
                else if (left.score > right.score)
                {
                    return(1);
                }
                else
                {
                    return(left.startOffset - right.startOffset);
                }
            }));
            Passage current = new Passage();

            while (pq.TryDequeue(out OffsetsEnum off))
            {
                DocsAndPositionsEnum dp = off.dp;
                int start = dp.StartOffset;
                if (start == -1)
                {
                    throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
                }
                int end = dp.EndOffset;
                // LUCENE-5166: this hit would span the content limit... however more valid
                // hits may exist (they are sorted by start). so we pretend like we never
                // saw this term, it won't cause a passage to be added to passageQueue or anything.
                Debug.Assert(EMPTY.StartOffset == int.MaxValue);
                if (start < contentLength && end > contentLength)
                {
                    continue;
                }
                if (start >= current.endOffset)
                {
                    if (current.startOffset >= 0)
                    {
                        // finalize current
                        current.score *= scorer.Norm(current.startOffset);
                        // new sentence: first add 'current' to queue
                        if (passageQueue.Count == n && current.score < passageQueue.Peek().score)
                        {
                            current.Reset(); // can't compete, just reset it
                        }
                        else
                        {
                            passageQueue.Enqueue(current);
                            if (passageQueue.Count > n)
                            {
                                current = passageQueue.Dequeue();
                                current.Reset();
                            }
                            else
                            {
                                current = new Passage();
                            }
                        }
                    }
                    // if we exceed limit, we are done
                    if (start >= contentLength)
                    {
                        Passage[] passages = passageQueue.ToArray();
                        foreach (Passage p in passages)
                        {
                            p.Sort();
                        }
                        // sort in ascending order
                        ArrayUtil.TimSort(passages, Comparer <Passage> .Create((left, right) => left.startOffset - right.startOffset));
                        return(passages);
                    }
                    // advance breakiterator
                    Debug.Assert(BreakIterator.Done < 0);
                    current.startOffset = Math.Max(bi.Preceding(start + 1), 0);
                    current.endOffset   = Math.Min(bi.Next(), contentLength);
                }
                int tf = 0;
                while (true)
                {
                    tf++;
                    BytesRef term = terms[off.id];
                    if (term == null)
                    {
                        // multitermquery match, pull from payload
                        term = off.dp.GetPayload();
                        Debug.Assert(term != null);
                    }
                    current.AddMatch(start, end, term);
                    if (off.pos == dp.Freq)
                    {
                        break; // removed from pq
                    }
                    else
                    {
                        off.pos++;
                        dp.NextPosition();
                        start = dp.StartOffset;
                        end   = dp.EndOffset;
                    }
                    if (start >= current.endOffset || end > contentLength)
                    {
                        pq.Enqueue(off);
                        break;
                    }
                }
                current.score += weights[off.id] * scorer.Tf(tf, current.endOffset - current.startOffset);
            }

            // Dead code but compiler disagrees:
            Debug.Assert(false);
            return(null);
        }
Пример #9
0
        /// <summary>
        /// checks the terms enum sequentially
        /// if deep is false, it does a 'shallow' test that doesnt go down to the docsenums
        /// </summary>
        public virtual void AssertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, bool deep)
        {
            BytesRef             term;
            IBits                randomBits     = new RandomBits(MAXDOC, Random.NextDouble(), Random);
            DocsAndPositionsEnum leftPositions  = null;
            DocsAndPositionsEnum rightPositions = null;
            DocsEnum             leftDocs       = null;
            DocsEnum             rightDocs      = null;

            while ((term = leftTermsEnum.Next()) != null)
            {
                Assert.AreEqual(term, rightTermsEnum.Next());
                AssertTermStats(leftTermsEnum, rightTermsEnum);
                if (deep)
                {
                    // with payloads + off
                    AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions));
                    AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions));

                    AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions));
                    AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions));
                    // with payloads only
                    AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.PAYLOADS));
                    AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.PAYLOADS));

                    AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.PAYLOADS));
                    AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.PAYLOADS));

                    // with offsets only
                    AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.OFFSETS));
                    AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.OFFSETS));

                    AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.OFFSETS));
                    AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.OFFSETS));

                    // with positions only
                    AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.NONE), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.NONE));
                    AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.NONE), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.NONE));

                    AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.NONE), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.NONE));
                    AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.NONE), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.NONE));

                    // with freqs:
                    AssertDocsEnum(leftDocs = leftTermsEnum.Docs(null, leftDocs), rightDocs = rightTermsEnum.Docs(null, rightDocs));
                    AssertDocsEnum(leftDocs = leftTermsEnum.Docs(randomBits, leftDocs), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs));

                    // w/o freqs:
                    AssertDocsEnum(leftDocs = leftTermsEnum.Docs(null, leftDocs, DocsFlags.NONE), rightDocs = rightTermsEnum.Docs(null, rightDocs, DocsFlags.NONE));
                    AssertDocsEnum(leftDocs = leftTermsEnum.Docs(randomBits, leftDocs, DocsFlags.NONE), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs, DocsFlags.NONE));

                    // with freqs:
                    AssertDocsSkipping(leftTermsEnum.DocFreq, leftDocs = leftTermsEnum.Docs(null, leftDocs), rightDocs = rightTermsEnum.Docs(null, rightDocs));
                    AssertDocsSkipping(leftTermsEnum.DocFreq, leftDocs = leftTermsEnum.Docs(randomBits, leftDocs), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs));

                    // w/o freqs:
                    AssertDocsSkipping(leftTermsEnum.DocFreq, leftDocs = leftTermsEnum.Docs(null, leftDocs, DocsFlags.NONE), rightDocs = rightTermsEnum.Docs(null, rightDocs, DocsFlags.NONE));
                    AssertDocsSkipping(leftTermsEnum.DocFreq, leftDocs = leftTermsEnum.Docs(randomBits, leftDocs, DocsFlags.NONE), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs, DocsFlags.NONE));
                }
            }
            Assert.IsNull(rightTermsEnum.Next());
        }
Пример #10
0
        /// <summary>
        /// Safe (but, slowish) default method to write every
        /// vector field in the document.
        /// </summary>
        protected void AddAllDocVectors(Fields vectors, MergeState mergeState)
        {
            if (vectors == null)
            {
                StartDocument(0);
                FinishDocument();
                return;
            }

            int numFields = vectors.Count;

            if (numFields == -1)
            {
                // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
                numFields = 0;
                //for (IEnumerator<string> it = vectors.Iterator(); it.hasNext();)
                foreach (string it in vectors)
                {
                    numFields++;
                }
            }
            StartDocument(numFields);

            string lastFieldName = null;

            TermsEnum            termsEnum            = null;
            DocsAndPositionsEnum docsAndPositionsEnum = null;

            int fieldCount = 0;

            foreach (string fieldName in vectors)
            {
                fieldCount++;
                FieldInfo fieldInfo = mergeState.FieldInfos.FieldInfo(fieldName);

                Debug.Assert(lastFieldName == null || fieldName.CompareToOrdinal(lastFieldName) > 0, "lastFieldName=" + lastFieldName + " fieldName=" + fieldName);
                lastFieldName = fieldName;

                Terms terms = vectors.GetTerms(fieldName);
                if (terms == null)
                {
                    // FieldsEnum shouldn't lie...
                    continue;
                }

                bool hasPositions = terms.HasPositions;
                bool hasOffsets   = terms.HasOffsets;
                bool hasPayloads  = terms.HasPayloads;
                Debug.Assert(!hasPayloads || hasPositions);

                int numTerms = (int)terms.Count;
                if (numTerms == -1)
                {
                    // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function
                    numTerms  = 0;
                    termsEnum = terms.GetIterator(termsEnum);
                    while (termsEnum.Next() != null)
                    {
                        numTerms++;
                    }
                }

                StartField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
                termsEnum = terms.GetIterator(termsEnum);

                int termCount = 0;
                while (termsEnum.Next() != null)
                {
                    termCount++;

                    int freq = (int)termsEnum.TotalTermFreq;

                    StartTerm(termsEnum.Term, freq);

                    if (hasPositions || hasOffsets)
                    {
                        docsAndPositionsEnum = termsEnum.DocsAndPositions(null, docsAndPositionsEnum);
                        Debug.Assert(docsAndPositionsEnum != null);

                        int docID = docsAndPositionsEnum.NextDoc();
                        Debug.Assert(docID != DocIdSetIterator.NO_MORE_DOCS);
                        Debug.Assert(docsAndPositionsEnum.Freq == freq);

                        for (int posUpto = 0; posUpto < freq; posUpto++)
                        {
                            int pos         = docsAndPositionsEnum.NextPosition();
                            int startOffset = docsAndPositionsEnum.StartOffset;
                            int endOffset   = docsAndPositionsEnum.EndOffset;

                            BytesRef payload = docsAndPositionsEnum.GetPayload();

                            Debug.Assert(!hasPositions || pos >= 0);
                            AddPosition(pos, startOffset, endOffset, payload);
                        }
                    }
                    FinishTerm();
                }
                Debug.Assert(termCount == numTerms);
                FinishField();
            }
            Debug.Assert(fieldCount == numFields);
            FinishDocument();
        }
Пример #11
0
        ///<summary>Constructor</summary>
        /// <param name="vector">
        /// Terms that contains the data for
        /// creating the <see cref="TokenStream"/>. Must have positions and offsets.
        /// </param>
        public TokenStreamFromTermPositionVector(Terms vector)
        {
            termAttribute = AddAttribute <ICharTermAttribute>();
            positionIncrementAttribute = AddAttribute <IPositionIncrementAttribute>();
            offsetAttribute            = AddAttribute <IOffsetAttribute>();
            payloadAttribute           = AddAttribute <IPayloadAttribute>();

            bool                 hasOffsets  = vector.HasOffsets;
            bool                 hasPayloads = vector.HasPayloads;
            TermsEnum            termsEnum   = vector.GetEnumerator();
            BytesRef             text;
            DocsAndPositionsEnum dpEnum = null;

            while (termsEnum.MoveNext())
            {
                text   = termsEnum.Term;
                dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
                dpEnum.NextDoc();
                int freq = dpEnum.Freq;
                for (int j = 0; j < freq; j++)
                {
                    int   pos = dpEnum.NextPosition();
                    Token token;
                    if (hasOffsets)
                    {
                        token = new Token(text.Utf8ToString(),
                                          dpEnum.StartOffset,
                                          dpEnum.EndOffset);
                    }
                    else
                    {
                        token = new Token();
                        token.SetEmpty().Append(text.Utf8ToString());
                    }
                    if (hasPayloads)
                    {
                        // Must make a deep copy of the returned payload,
                        // since D&PEnum API is allowed to re-use on every
                        // call:
                        token.Payload = BytesRef.DeepCopyOf(dpEnum.GetPayload());
                    }

                    // Yes - this is the position, not the increment! This is for
                    // sorting. This value
                    // will be corrected before use.
                    token.PositionIncrement = pos;
                    this.positionedTokens.Add(token);
                }
            }

            CollectionUtil.TimSort(this.positionedTokens, tokenComparer);

            int lastPosition = -1;

            foreach (Token token in this.positionedTokens)
            {
                int thisPosition = token.PositionIncrement;
                token.PositionIncrement = thisPosition - lastPosition;
                lastPosition            = thisPosition;
            }
            this.tokensAtCurrentPosition = this.positionedTokens.GetEnumerator();
        }
Пример #12
0
        /// <summary>
        /// Low level api. Returns a token stream generated from a <see cref="Terms"/>. This
        /// can be used to feed the highlighter with a pre-parsed token
        /// stream.  The <see cref="Terms"/> must have offsets available.
        /// <para/>
        /// In my tests the speeds to recreate 1000 token streams using this method are:
        /// <list type="bullet">
        ///     <item><description>
        ///     with TermVector offset only data stored - 420  milliseconds
        ///     </description></item>
        ///     <item><description>
        ///     with TermVector offset AND position data stored - 271 milliseconds
        ///     (nb timings for TermVector with position data are based on a tokenizer with contiguous
        ///     positions - no overlaps or gaps)
        ///     </description></item>
        ///     <item><description>
        ///     The cost of not using TermPositionVector to store
        ///     pre-parsed content and using an analyzer to re-parse the original content:
        ///     - reanalyzing the original content - 980 milliseconds
        ///     </description></item>
        /// </list>
        ///
        /// The re-analyze timings will typically vary depending on -
        /// <list type="number">
        ///     <item><description>
        ///     The complexity of the analyzer code (timings above were using a
        ///     stemmer/lowercaser/stopword combo)
        ///     </description></item>
        ///     <item><description>
        ///     The  number of other fields (Lucene reads ALL fields off the disk
        ///     when accessing just one document field - can cost dear!)
        ///     </description></item>
        ///     <item><description>
        ///     Use of compression on field storage - could be faster due to compression (less disk IO)
        ///     or slower (more CPU burn) depending on the content.
        ///     </description></item>
        /// </list>
        /// </summary>
        /// <param name="tpv"></param>
        /// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
        /// to eek out the last drops of performance, set to true. If in doubt, set to false.</param>
        /// <exception cref="ArgumentException">if no offsets are available</exception>
        public static TokenStream GetTokenStream(Terms tpv,
                                                 bool tokenPositionsGuaranteedContiguous)
        {
            if (!tpv.HasOffsets)
            {
                throw new ArgumentException("Cannot create TokenStream from Terms without offsets");
            }

            if (!tokenPositionsGuaranteedContiguous && tpv.HasPositions)
            {
                return(new TokenStreamFromTermPositionVector(tpv));
            }

            bool hasPayloads = tpv.HasPayloads;

            // code to reconstruct the original sequence of Tokens
            TermsEnum termsEnum   = tpv.GetEnumerator();
            int       totalTokens = 0;

            while (termsEnum.MoveNext())
            {
                totalTokens += (int)termsEnum.TotalTermFreq;
            }
            Token[]      tokensInOriginalOrder = new Token[totalTokens];
            List <Token> unsortedTokens        = null;

            termsEnum = tpv.GetEnumerator();
            DocsAndPositionsEnum dpEnum = null;

            while (termsEnum.MoveNext())
            {
                dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
                if (dpEnum == null)
                {
                    throw new ArgumentException("Required TermVector Offset information was not found");
                }
                string term = termsEnum.Term.Utf8ToString();

                dpEnum.NextDoc();
                int freq = dpEnum.Freq;
                for (int posUpto = 0; posUpto < freq; posUpto++)
                {
                    int pos = dpEnum.NextPosition();
                    if (dpEnum.StartOffset < 0)
                    {
                        throw new ArgumentException("Required TermVector Offset information was not found");
                    }
                    Token token = new Token(term, dpEnum.StartOffset, dpEnum.EndOffset);
                    if (hasPayloads)
                    {
                        // Must make a deep copy of the returned payload,
                        // since D&PEnum API is allowed to re-use on every
                        // call:
                        token.Payload = BytesRef.DeepCopyOf(dpEnum.GetPayload());
                    }

                    if (tokenPositionsGuaranteedContiguous && pos != -1)
                    {
                        // We have positions stored and a guarantee that the token position
                        // information is contiguous

                        // This may be fast BUT wont work if Tokenizers used which create >1
                        // token in same position or
                        // creates jumps in position numbers - this code would fail under those
                        // circumstances

                        // tokens stored with positions - can use this to index straight into
                        // sorted array
                        tokensInOriginalOrder[pos] = token;
                    }
                    else
                    {
                        // tokens NOT stored with positions or not guaranteed contiguous - must
                        // add to list and sort later
                        if (unsortedTokens == null)
                        {
                            unsortedTokens = new List <Token>();
                        }
                        unsortedTokens.Add(token);
                    }
                }
            }

            // If the field has been stored without position data we must perform a sort
            if (unsortedTokens != null)
            {
                tokensInOriginalOrder = unsortedTokens.ToArray();
                ArrayUtil.TimSort(tokensInOriginalOrder, new TokenComparer());
                //tokensInOriginalOrder = tokensInOriginalOrder
                //    .OrderBy(t => t, new TokenComparer() )
                //    .ToArray();
            }
            return(new StoredTokenStream(tokensInOriginalOrder));
        }
Пример #13
0
        //public static void main( string[] args ) throws Exception {
        //  Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT);
        //  QueryParser parser = new QueryParser(Version.LUCENE_CURRENT,  "f", analyzer );
        //  Query query = parser.parse( "a x:b" );
        //  FieldQuery fieldQuery = new FieldQuery( query, true, false );

        //  Directory dir = new RAMDirectory();
        //  IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer));
        //  Document doc = new Document();
        //  IndexableFieldType ft = new IndexableFieldType(TextField.TYPE_STORED);
        //  ft.setStoreTermVectors(true);
        //  ft.setStoreTermVectorOffsets(true);
        //  ft.setStoreTermVectorPositions(true);
        //  doc.add( new Field( "f", ft, "a a a b b c a b b c d e f" ) );
        //  doc.add( new Field( "f", ft, "b a b a f" ) );
        //  writer.addDocument( doc );
        //  writer.close();

        //  IndexReader reader = IndexReader.open(dir1);
        //  new FieldTermStack( reader, 0, "f", fieldQuery );
        //  reader.close();
        //}

        /// <summary>
        /// a constructor.
        /// </summary>
        /// <param name="reader"><see cref="IndexReader"/> of the index</param>
        /// <param name="docId">document id to be highlighted</param>
        /// <param name="fieldName">field of the document to be highlighted</param>
        /// <param name="fieldQuery"><see cref="FieldQuery"/> object</param>
        /// <exception cref="IOException">If there is a low-level I/O error</exception>
        public FieldTermStack(IndexReader reader, int docId, string fieldName, FieldQuery fieldQuery)
        {
            this.fieldName = fieldName;

            ISet <string> termSet = fieldQuery.GetTermSet(fieldName);

            // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
            if (termSet is null)
            {
                return;
            }

            Fields vectors = reader.GetTermVectors(docId);

            if (vectors is null)
            {
                // null snippet
                return;
            }

            Terms vector = vectors.GetTerms(fieldName);

            if (vector is null)
            {
                // null snippet
                return;
            }

            CharsRef             spare     = new CharsRef();
            TermsEnum            termsEnum = vector.GetEnumerator();
            DocsAndPositionsEnum dpEnum    = null;
            BytesRef             text;

            int numDocs = reader.MaxDoc;

            while (termsEnum.MoveNext())
            {
                text = termsEnum.Term;
                UnicodeUtil.UTF8toUTF16(text, spare);
                string term = spare.ToString();
                if (!termSet.Contains(term))
                {
                    continue;
                }
                dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
                if (dpEnum is null)
                {
                    // null snippet
                    return;
                }

                dpEnum.NextDoc();

                // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
                float weight = (float)(Math.Log(numDocs / (double)(reader.DocFreq(new Term(fieldName, text)) + 1)) + 1.0);

                int freq = dpEnum.Freq;

                for (int i = 0; i < freq; i++)
                {
                    int pos = dpEnum.NextPosition();
                    if (dpEnum.StartOffset < 0)
                    {
                        return; // no offsets, null snippet
                    }
                    termList.Add(new TermInfo(term, dpEnum.StartOffset, dpEnum.EndOffset, pos, weight));
                }
            }

            // sort by position
            CollectionUtil.TimSort(termList);

            // now look for dups at the same position, linking them together
            int      currentPos = -1;
            TermInfo previous   = null;
            TermInfo first      = null;

            for (int i = 0; i < termList.Count;)
            {
                TermInfo current = termList[i];
                if (current.Position == currentPos)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(previous != null);
                    }
                    previous.SetNext(current);
                    previous = current;
                    //iterator.Remove();

                    // LUCENENET NOTE: Remove, but don't advance the i position (since removing will advance to the next item)
                    termList.RemoveAt(i);
                }
                else
                {
                    if (previous != null)
                    {
                        previous.SetNext(first);
                    }
                    previous   = first = current;
                    currentPos = current.Position;

                    // LUCENENET NOTE: Only increment the position if we don't do a delete.
                    i++;
                }
            }

            if (previous != null)
            {
                previous.SetNext(first);
            }
        }
Пример #14
0
        private void DuellReaders(CompositeReader other, AtomicReader memIndexReader)
        {
            AtomicReader competitor = SlowCompositeReaderWrapper.Wrap(other);
            Fields       memFields  = memIndexReader.Fields;

            foreach (string field in competitor.Fields)
            {
                Terms memTerms = memFields.GetTerms(field);
                Terms iwTerms  = memIndexReader.GetTerms(field);
                if (iwTerms == null)
                {
                    assertNull(memTerms);
                }
                else
                {
                    NumericDocValues normValues    = competitor.GetNormValues(field);
                    NumericDocValues memNormValues = memIndexReader.GetNormValues(field);
                    if (normValues != null)
                    {
                        // mem idx always computes norms on the fly
                        assertNotNull(memNormValues);
                        assertEquals(normValues.Get(0), memNormValues.Get(0));
                    }

                    assertNotNull(memTerms);
                    assertEquals(iwTerms.DocCount, memTerms.DocCount);
                    assertEquals(iwTerms.SumDocFreq, memTerms.SumDocFreq);
                    assertEquals(iwTerms.SumTotalTermFreq, memTerms.SumTotalTermFreq);
                    TermsEnum iwTermsIter  = iwTerms.GetIterator(null);
                    TermsEnum memTermsIter = memTerms.GetIterator(null);
                    if (iwTerms.HasPositions)
                    {
                        bool offsets = iwTerms.HasOffsets && memTerms.HasOffsets;

                        while (iwTermsIter.Next() != null)
                        {
                            assertNotNull(memTermsIter.Next());
                            assertEquals(iwTermsIter.Term, memTermsIter.Term);
                            DocsAndPositionsEnum iwDocsAndPos  = iwTermsIter.DocsAndPositions(null, null);
                            DocsAndPositionsEnum memDocsAndPos = memTermsIter.DocsAndPositions(null, null);
                            while (iwDocsAndPos.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS)
                            {
                                assertEquals(iwDocsAndPos.DocID, memDocsAndPos.NextDoc());
                                assertEquals(iwDocsAndPos.Freq, memDocsAndPos.Freq);
                                for (int i = 0; i < iwDocsAndPos.Freq; i++)
                                {
                                    assertEquals("term: " + iwTermsIter.Term.Utf8ToString(), iwDocsAndPos.NextPosition(), memDocsAndPos.NextPosition());
                                    if (offsets)
                                    {
                                        assertEquals(iwDocsAndPos.StartOffset, memDocsAndPos.StartOffset);
                                        assertEquals(iwDocsAndPos.EndOffset, memDocsAndPos.EndOffset);
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        while (iwTermsIter.Next() != null)
                        {
                            assertEquals(iwTermsIter.Term, memTermsIter.Term);
                            DocsEnum iwDocsAndPos  = iwTermsIter.Docs(null, null);
                            DocsEnum memDocsAndPos = memTermsIter.Docs(null, null);
                            while (iwDocsAndPos.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS)
                            {
                                assertEquals(iwDocsAndPos.DocID, memDocsAndPos.NextDoc());
                                assertEquals(iwDocsAndPos.Freq, memDocsAndPos.Freq);
                            }
                        }
                    }
                }
            }
        }
Пример #15
0
        public virtual void TestTransitionAPI()
        {
            Directory         dir = NewDirectory();
            RandomIndexWriter w   = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                Random, dir);

            Documents.Document doc = new Documents.Document();
#pragma warning disable 612, 618
            doc.Add(new Field("stored", "abc", Field.Store.YES, Field.Index.NO));
            doc.Add(new Field("stored_indexed", "abc xyz", Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("stored_tokenized", "abc xyz", Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("indexed", "abc xyz", Field.Store.NO, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("tokenized", "abc xyz", Field.Store.NO, Field.Index.ANALYZED));
            doc.Add(new Field("tokenized_reader", new StringReader("abc xyz")));
            doc.Add(new Field("tokenized_tokenstream", w.IndexWriter.Analyzer.GetTokenStream("tokenized_tokenstream", new StringReader("abc xyz"))));
            doc.Add(new Field("binary", new byte[10]));
            doc.Add(new Field("tv", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
            doc.Add(new Field("tv_pos", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS));
            doc.Add(new Field("tv_off", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_OFFSETS));
            doc.Add(new Field("tv_pos_off", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
#pragma warning restore 612, 618
            w.AddDocument(doc);
            IndexReader r = w.GetReader();
            w.Dispose();

            doc = r.Document(0);
            // 4 stored fields
            Assert.AreEqual(4, doc.Fields.Count);
            Assert.AreEqual("abc", doc.Get("stored"));
            Assert.AreEqual("abc xyz", doc.Get("stored_indexed"));
            Assert.AreEqual("abc xyz", doc.Get("stored_tokenized"));
            BytesRef br = doc.GetBinaryValue("binary");
            Assert.IsNotNull(br);
            Assert.AreEqual(10, br.Length);

            IndexSearcher s = new IndexSearcher(r);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("stored_indexed", "abc xyz")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("stored_tokenized", "abc")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("stored_tokenized", "xyz")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("indexed", "abc xyz")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized", "abc")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized", "xyz")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_reader", "abc")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_reader", "xyz")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_tokenstream", "abc")), 1).TotalHits);
            Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_tokenstream", "xyz")), 1).TotalHits);

            foreach (string field in new string[] { "tv", "tv_pos", "tv_off", "tv_pos_off" })
            {
                Fields tvFields = r.GetTermVectors(0);
                Terms  tvs      = tvFields.GetTerms(field);
                Assert.IsNotNull(tvs);
                Assert.AreEqual(2, tvs.Count);
                TermsEnum tvsEnum = tvs.GetEnumerator();
                Assert.IsTrue(tvsEnum.MoveNext());
                Assert.AreEqual(new BytesRef("abc"), tvsEnum.Term);
                DocsAndPositionsEnum dpEnum = tvsEnum.DocsAndPositions(null, null);
                if (field.Equals("tv", StringComparison.Ordinal))
                {
                    Assert.IsNull(dpEnum);
                }
                else
                {
                    Assert.IsNotNull(dpEnum);
                }
                Assert.IsTrue(tvsEnum.MoveNext());
                Assert.AreEqual(new BytesRef("xyz"), tvsEnum.Term);
                Assert.IsFalse(tvsEnum.MoveNext());
            }

            r.Dispose();
            dir.Dispose();
        }