Example No. 1
            private void AddPositions(DocsAndPositionsEnum @in, IndexOutput @out)
            {
                int freq = @in.Freq;

                @out.WriteVInt32(freq);
                int previousPosition  = 0;
                int previousEndOffset = 0;

                for (int i = 0; i < freq; i++)
                {
                    int      pos     = @in.NextPosition();
                    BytesRef payload = @in.GetPayload();
                    // The low-order bit of the token is set only if there is a payload;
                    // the remaining bits are the delta-encoded position.
                    int token = (pos - previousPosition) << 1 | (payload is null ? 0 : 1);
                    @out.WriteVInt32(token);
                    previousPosition = pos;
                    if (storeOffsets) // don't encode offsets if they are not stored
                    {
                        int startOffset = @in.StartOffset;
                        int endOffset   = @in.EndOffset;
                        @out.WriteVInt32(startOffset - previousEndOffset);
                        @out.WriteVInt32(endOffset - startOffset);
                        previousEndOffset = endOffset;
                    }
                    if (payload != null)
                    {
                        @out.WriteVInt32(payload.Length);
                        @out.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                    }
                }
            }
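A minimal sketch of the matching read side, assuming the same storeOffsets flag and the Lucene.NET IndexInput API; ReadPositions is a hypothetical helper, not part of the original class:

            // Hypothetical decoder for the token layout written above.
            private void ReadPositions(IndexInput @in, bool storeOffsets)
            {
                int freq = @in.ReadVInt32();
                int position  = 0;
                int endOffset = 0;
                for (int i = 0; i < freq; i++)
                {
                    int  token      = @in.ReadVInt32();
                    position       += token >> 1;        // upper bits: position delta
                    bool hasPayload = (token & 1) != 0;  // low-order bit: payload flag
                    if (storeOffsets)
                    {
                        int startOffset = endOffset + @in.ReadVInt32();
                        endOffset       = startOffset + @in.ReadVInt32();
                    }
                    if (hasPayload)
                    {
                        int    payloadLength = @in.ReadVInt32();
                        byte[] payloadBytes  = new byte[payloadLength];
                        @in.ReadBytes(payloadBytes, 0, payloadLength);
                    }
                }
            }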
Example No. 2
        protected virtual void LoadPayload(Term term)
        {
            DocsAndPositionsEnum dp = m_reader.GetTermPositionsEnum(term);
            if (dp == null)
            {
                return; // term does not occur in this reader
            }
            int docID = -1;

            while ((docID = dp.NextDoc()) != DocsEnum.NO_MORE_DOCS)
            {
                if (dp.Freq > 0)
                {
                    dp.NextPosition();
                    BytesRef payload = dp.GetPayload();
                    if (payload != null)
                    {
                        Add(docID, payload.Bytes, payload.Length);
                    }
                }
            }

            // save the last, possibly partially filled, page: pad the unused
            // slots with MISSING, then store an optimized copy of the page
            while (m_curSlot < MAX_SLOTS)
            {
                m_curPage[m_curSlot++] = MISSING;
            }
            m_list[m_curPageNo] = CopyPage(new int[m_curData]); // optimize the page to make getNumItems work
            m_curPage           = null;
        }
Example No. 3
            public override void Load()
            {
                DocsAndPositionsEnum docPosEnum = m_reader.GetTermPositionsEnum(m_sizeTerm);

                if (docPosEnum == null)
                {
                    return;
                }
                int docID = -1;

                while ((docID = docPosEnum.NextDoc()) != DocsEnum.NO_MORE_DOCS)
                {
                    if (docPosEnum.Freq > 0)
                    {
                        docPosEnum.NextPosition();
                        int len = BytesToInt32(docPosEnum.GetPayload().Bytes);
                        Allocate(docID, Math.Min(len, m_maxItems), true);
                    }
                }
            }
Example No. 4
 protected internal virtual void ProcessPayload(Similarity similarity)
 {
     if (termSpans.IsPayloadAvailable)
     {
         DocsAndPositionsEnum postings = termSpans.Postings;
         m_payload = postings.GetPayload();
         if (m_payload != null)
         {
              m_payloadScore = outerInstance.outerInstance.m_function.CurrentScore(
                  m_doc, outerInstance.outerInstance.Term.Field,
                  m_spans.Start, m_spans.End, m_payloadsSeen, m_payloadScore,
                  m_docScorer.ComputePayloadFactor(m_doc, m_spans.Start, m_spans.End, m_payload));
         }
         else
         {
              m_payloadScore = outerInstance.outerInstance.m_function.CurrentScore(
                  m_doc, outerInstance.outerInstance.Term.Field,
                  m_spans.Start, m_spans.End, m_payloadsSeen, m_payloadScore, 1F);
         }
         m_payloadsSeen++;
     }
     else
     {
         // zero out the payload?
     }
 }
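For context, m_function is a payload function that folds each payload factor into a running aggregate, which is later turned into a document score. A minimal sketch in the spirit of Lucene's AveragePayloadFunction, with signatures abbreviated for illustration (the real contract also receives the doc id, field, and span bounds):

     // Sketch only: an averaging payload function with abbreviated signatures.
     internal sealed class AveragePayloadSketch
     {
         public float CurrentScore(float currentScore, float currentPayloadScore)
         {
             return currentScore + currentPayloadScore; // accumulate each factor
         }

         public float DocScore(int payloadsSeen, float payloadScore)
         {
             return payloadsSeen > 0 ? payloadScore / payloadsSeen : 1.0f; // average
         }
     }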
Example No. 5
        protected void CompareTermVectors(Terms terms, Terms memTerms, string field_name)
        {
            TermsEnum termEnum    = terms.GetEnumerator();
            TermsEnum memTermEnum = memTerms.GetEnumerator();

            while (termEnum.MoveNext())
            {
                assertTrue(memTermEnum.MoveNext());

                assertEquals(termEnum.TotalTermFreq, memTermEnum.TotalTermFreq);

                DocsAndPositionsEnum docsPosEnum    = termEnum.DocsAndPositions(null, null, 0);
                DocsAndPositionsEnum memDocsPosEnum = memTermEnum.DocsAndPositions(null, null, 0);
                string currentTerm = termEnum.Term.Utf8ToString();

                assertEquals("Token mismatch for field: " + field_name, currentTerm, memTermEnum.Term.Utf8ToString());

                docsPosEnum.NextDoc();
                memDocsPosEnum.NextDoc();

                int freq = docsPosEnum.Freq;
                assertEquals(freq, memDocsPosEnum.Freq);
                for (int i = 0; i < freq; i++)
                {
                    string failDesc = " (field:" + field_name + " term:" + currentTerm + ")";
                    int    memPos   = memDocsPosEnum.NextPosition();
                    int    pos      = docsPosEnum.NextPosition();
                    assertEquals("Position test failed" + failDesc, memPos, pos);
                    assertEquals("Start offset test failed" + failDesc, memDocsPosEnum.StartOffset, docsPosEnum.StartOffset);
                    assertEquals("End offset test failed" + failDesc, memDocsPosEnum.EndOffset, docsPosEnum.EndOffset);
                    assertEquals("Missing payload test failed" + failDesc, docsPosEnum.GetPayload(), null);
                }
            }
            assertFalse("Still some tokens not processed", memTermEnum.MoveNext());
        }
Example No. 6
        /// <summary>
        /// Low level api. Returns a token stream generated from a <see cref="Terms"/>. This
        /// can be used to feed the highlighter with a pre-parsed token
        /// stream.  The <see cref="Terms"/> must have offsets available.
        /// <para/>
        /// In my tests the speeds to recreate 1000 token streams using this method are:
        /// <list type="bullet">
        ///     <item><description>
        ///     with TermVector offset only data stored - 420  milliseconds
        ///     </description></item>
        ///     <item><description>
        ///     with TermVector offset AND position data stored - 271 milliseconds
        ///     (nb timings for TermVector with position data are based on a tokenizer with contiguous
        ///     positions - no overlaps or gaps)
        ///     </description></item>
        ///     <item><description>
        ///     The cost of not using TermPositionVector to store
        ///     pre-parsed content and using an analyzer to re-parse the original content:
        ///     - reanalyzing the original content - 980 milliseconds
        ///     </description></item>
        /// </list>
        ///
        /// The re-analyze timings will typically vary depending on -
        /// <list type="number">
        ///     <item><description>
        ///     The complexity of the analyzer code (timings above were using a
        ///     stemmer/lowercaser/stopword combo)
        ///     </description></item>
        ///     <item><description>
        ///     The number of other fields (Lucene reads ALL fields off the disk
        ///     when accessing just one document field - can cost dear!)
        ///     </description></item>
        ///     <item><description>
        ///     Use of compression on field storage - could be faster due to compression (less disk IO)
        ///     or slower (more CPU burn) depending on the content.
        ///     </description></item>
        /// </list>
        /// </summary>
        /// <param name="tpv"></param>
        /// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
        /// to eek out the last drops of performance, set to true. If in doubt, set to false.</param>
        /// <exception cref="ArgumentException">if no offsets are available</exception>
        public static TokenStream GetTokenStream(Terms tpv,
                                                 bool tokenPositionsGuaranteedContiguous)
        {
            if (!tpv.HasOffsets)
            {
                throw new ArgumentException("Cannot create TokenStream from Terms without offsets");
            }

            if (!tokenPositionsGuaranteedContiguous && tpv.HasPositions)
            {
                return new TokenStreamFromTermPositionVector(tpv);
            }

            bool hasPayloads = tpv.HasPayloads;

            // code to reconstruct the original sequence of Tokens
            TermsEnum termsEnum   = tpv.GetEnumerator();
            int       totalTokens = 0;

            while (termsEnum.MoveNext())
            {
                totalTokens += (int)termsEnum.TotalTermFreq;
            }
            Token[]         tokensInOriginalOrder = new Token[totalTokens];
            JCG.List<Token> unsortedTokens        = null;
            termsEnum = tpv.GetEnumerator();
            DocsAndPositionsEnum dpEnum = null;

            while (termsEnum.MoveNext())
            {
                dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
                if (dpEnum == null)
                {
                    throw new ArgumentException("Required TermVector Offset information was not found");
                }
                string term = termsEnum.Term.Utf8ToString();

                dpEnum.NextDoc();
                int freq = dpEnum.Freq;
                for (int posUpto = 0; posUpto < freq; posUpto++)
                {
                    int pos = dpEnum.NextPosition();
                    if (dpEnum.StartOffset < 0)
                    {
                        throw new ArgumentException("Required TermVector Offset information was not found");
                    }
                    Token token = new Token(term, dpEnum.StartOffset, dpEnum.EndOffset);
                    if (hasPayloads)
                    {
                        // Must make a deep copy of the returned payload,
                        // since D&PEnum API is allowed to re-use on every
                        // call:
                        token.Payload = BytesRef.DeepCopyOf(dpEnum.GetPayload());
                    }

                    if (tokenPositionsGuaranteedContiguous && pos != -1)
                    {
                        // We have positions stored and a guarantee that the token position
                        // information is contiguous

                        // This may be fast BUT won't work if Tokenizers are used which
                        // create >1 token in the same position or create jumps in
                        // position numbers - this code would fail under those
                        // circumstances

                        // tokens stored with positions - can use this to index straight into
                        // sorted array
                        tokensInOriginalOrder[pos] = token;
                    }
                    else
                    {
                        // tokens NOT stored with positions or not guaranteed contiguous - must
                        // add to list and sort later
                        if (unsortedTokens == null)
                        {
                            unsortedTokens = new JCG.List<Token>();
                        }
                        unsortedTokens.Add(token);
                    }
                }
            }

            // If the field has been stored without position data we must perform a sort
            if (unsortedTokens != null)
            {
                tokensInOriginalOrder = unsortedTokens.ToArray();
                ArrayUtil.TimSort(tokensInOriginalOrder, new TokenComparer());
                //tokensInOriginalOrder = tokensInOriginalOrder
                //    .OrderBy(t => t, new TokenComparer() )
                //    .ToArray();
            }
            return new StoredTokenStream(tokensInOriginalOrder);
        }
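A minimal usage sketch, assuming an open IndexReader named reader, a hypothetical document id docId, and a "content" field indexed with term vectors that include offsets:

            // Hypothetical usage: rebuild a highlighter-ready token stream
            // from a stored term vector ("content" and docId are placeholders).
            Fields vectors = reader.GetTermVectors(docId);
            Terms  tpv     = vectors?.GetTerms("content");
            if (tpv != null && tpv.HasOffsets)
            {
                TokenStream ts = GetTokenStream(tpv, tokenPositionsGuaranteedContiguous: false);
                // feed ts to the highlighter...
            }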
Example No. 7
        public virtual void TestDocsAndPositionsEnum()
        {
            TermsEnum termsEnum = reader.GetTerms(DOC_POSITIONS_FIELD).GetIterator(null);

            assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef(DOC_POSITIONS_TERM)));
            DocsAndPositionsEnum sortedPositions = termsEnum.DocsAndPositions(null, null);
            int doc;

            // test nextDoc()
            while ((doc = sortedPositions.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
            {
                int freq = sortedPositions.Freq;
                assertEquals("incorrect freq for doc=" + doc, sortedValues[doc] / 10 + 1, freq);
                for (int i = 0; i < freq; i++)
                {
                    assertEquals("incorrect position for doc=" + doc, i, sortedPositions.NextPosition());
                    if (!DoesntSupportOffsets.Contains(TestUtil.GetPostingsFormat(DOC_POSITIONS_FIELD)))
                    {
                        assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.StartOffset);
                        assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.EndOffset);
                    }
                    assertEquals("incorrect payload for doc=" + doc, freq - i, int.Parse(sortedPositions.GetPayload().Utf8ToString(), CultureInfo.InvariantCulture));
                }
            }

            // test advance()
            DocsAndPositionsEnum reuse = sortedPositions;

            sortedPositions = termsEnum.DocsAndPositions(null, reuse);
            if (sortedPositions is SortingAtomicReader.SortingDocsAndPositionsEnum)
            {
                assertTrue(((SortingAtomicReader.SortingDocsAndPositionsEnum)sortedPositions).Reused(reuse)); // make sure reuse worked
            }
            doc = 0;
            while ((doc = sortedPositions.Advance(doc + TestUtil.NextInt32(Random, 1, 5))) != DocIdSetIterator.NO_MORE_DOCS)
            {
                int freq = sortedPositions.Freq;
                assertEquals("incorrect freq for doc=" + doc, sortedValues[doc] / 10 + 1, freq);
                for (int i = 0; i < freq; i++)
                {
                    assertEquals("incorrect position for doc=" + doc, i, sortedPositions.NextPosition());
                    if (!DoesntSupportOffsets.Contains(TestUtil.GetPostingsFormat(DOC_POSITIONS_FIELD)))
                    {
                        assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.StartOffset);
                        assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.EndOffset);
                    }
                    assertEquals("incorrect payload for doc=" + doc, freq - i, int.Parse(sortedPositions.GetPayload().Utf8ToString(), CultureInfo.InvariantCulture));
                }
            }
        }
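The NextDoc()/Advance() contract exercised by this test generalizes: Advance(target) positions the enum on the first document whose id is >= target. A minimal sketch of skipping to selected documents, assuming a DocsAndPositionsEnum named postings and a sorted candidate list targetDocs (both hypothetical):

            // Sketch: visit only selected docs with Advance(); targetDocs is
            // assumed to be sorted ascending.
            int doc = -1;
            foreach (int target in targetDocs)
            {
                if (target <= doc) continue;                   // already at or past it
                doc = postings.Advance(target);                // first doc >= target
                if (doc == DocIdSetIterator.NO_MORE_DOCS) break;
                if (doc == target)                             // exact hit
                {
                    for (int i = 0; i < postings.Freq; i++)
                    {
                        int pos = postings.NextPosition();     // consume its positions
                    }
                }
            }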
Example No. 8
        private readonly IPayloadAttribute payloadAttribute;                     // LUCENENET: marked readonly

        ///<summary>Constructor</summary>
        /// <param name="vector">
        /// Terms that contains the data for
        /// creating the <see cref="TokenStream"/>. Must have positions and offsets.
        /// </param>
        public TokenStreamFromTermPositionVector(Terms vector)
        {
            termAttribute              = AddAttribute<ICharTermAttribute>();
            positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>();
            offsetAttribute            = AddAttribute<IOffsetAttribute>();
            payloadAttribute           = AddAttribute<IPayloadAttribute>();

            bool                 hasOffsets  = vector.HasOffsets;
            bool                 hasPayloads = vector.HasPayloads;
            TermsEnum            termsEnum   = vector.GetEnumerator();
            BytesRef             text;
            DocsAndPositionsEnum dpEnum = null;

            while (termsEnum.MoveNext())
            {
                text   = termsEnum.Term;
                dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
                dpEnum.NextDoc();
                int freq = dpEnum.Freq;
                for (int j = 0; j < freq; j++)
                {
                    int   pos = dpEnum.NextPosition();
                    Token token;
                    if (hasOffsets)
                    {
                        token = new Token(text.Utf8ToString(),
                                          dpEnum.StartOffset,
                                          dpEnum.EndOffset);
                    }
                    else
                    {
                        token = new Token();
                        token.SetEmpty().Append(text.Utf8ToString());
                    }
                    if (hasPayloads)
                    {
                        // Must make a deep copy of the returned payload,
                        // since D&PEnum API is allowed to re-use on every
                        // call:
                        token.Payload = BytesRef.DeepCopyOf(dpEnum.GetPayload());
                    }

                    // Yes - this is the position, not the increment! This is for
                    // sorting. This value will be corrected before use.
                    token.PositionIncrement = pos;
                    this.positionedTokens.Add(token);
                }
            }

            CollectionUtil.TimSort(this.positionedTokens, tokenComparer);

            int lastPosition = -1;

            foreach (Token token in this.positionedTokens)
            {
                int thisPosition = token.PositionIncrement;
                token.PositionIncrement = thisPosition - lastPosition;
                lastPosition            = thisPosition;
            }
            this.tokensAtCurrentPosition = this.positionedTokens.GetEnumerator();
        }
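The increment rewrite at the end deserves a worked example: with lastPosition starting at -1, tokens at absolute positions 0, 0, 2 receive increments 1, 0, 2, matching the Lucene convention where an increment of 0 stacks a token on the previous one. The same rewrite on synthetic data:

            // Sketch: absolute positions [0, 0, 2] become increments [1, 0, 2].
            int[] positions  = { 0, 0, 2 };
            int[] increments = new int[positions.Length];
            int   last       = -1;
            for (int i = 0; i < positions.Length; i++)
            {
                increments[i] = positions[i] - last; // 0 = same position as previous token
                last          = positions[i];
            }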
Example No. 9
        /// <summary>
        /// Safe (but, slowish) default method to write every
        /// vector field in the document.
        /// </summary>
        protected void AddAllDocVectors(Fields vectors, MergeState mergeState)
        {
            if (vectors == null)
            {
                StartDocument(0);
                FinishDocument();
                return;
            }

            int numFields = vectors.Count;

            if (numFields == -1)
            {
                // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
                numFields = 0;
                //for (IEnumerator<string> it = vectors.Iterator(); it.hasNext();)
                foreach (string it in vectors)
                {
                    numFields++;
                }
            }
            StartDocument(numFields);

            string lastFieldName = null;

            TermsEnum            termsEnum            = null;
            DocsAndPositionsEnum docsAndPositionsEnum = null;

            int fieldCount = 0;

            foreach (string fieldName in vectors)
            {
                fieldCount++;
                FieldInfo fieldInfo = mergeState.FieldInfos.FieldInfo(fieldName);

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(lastFieldName == null || fieldName.CompareToOrdinal(lastFieldName) > 0, "lastFieldName={0} fieldName={1}", lastFieldName, fieldName);
                }
                lastFieldName = fieldName;

                Terms terms = vectors.GetTerms(fieldName);
                if (terms == null)
                {
                    // FieldsEnum shouldn't lie...
                    continue;
                }

                bool hasPositions = terms.HasPositions;
                bool hasOffsets   = terms.HasOffsets;
                bool hasPayloads  = terms.HasPayloads;
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(!hasPayloads || hasPositions);
                }

                int numTerms = (int)terms.Count;
                if (numTerms == -1)
                {
                    // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function
                    numTerms  = 0;
                    termsEnum = terms.GetEnumerator(termsEnum);
                    while (termsEnum.MoveNext())
                    {
                        numTerms++;
                    }
                }

                StartField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
                termsEnum = terms.GetEnumerator(termsEnum);

                int termCount = 0;
                while (termsEnum.MoveNext())
                {
                    termCount++;

                    int freq = (int)termsEnum.TotalTermFreq;

                    StartTerm(termsEnum.Term, freq);

                    if (hasPositions || hasOffsets)
                    {
                        docsAndPositionsEnum = termsEnum.DocsAndPositions(null, docsAndPositionsEnum);
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(docsAndPositionsEnum != null);
                        }

                        int docID = docsAndPositionsEnum.NextDoc();
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(docID != DocIdSetIterator.NO_MORE_DOCS);
                            Debugging.Assert(docsAndPositionsEnum.Freq == freq);
                        }

                        for (int posUpto = 0; posUpto < freq; posUpto++)
                        {
                            int pos         = docsAndPositionsEnum.NextPosition();
                            int startOffset = docsAndPositionsEnum.StartOffset;
                            int endOffset   = docsAndPositionsEnum.EndOffset;

                            BytesRef payload = docsAndPositionsEnum.GetPayload();

                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(!hasPositions || pos >= 0);
                            }
                            AddPosition(pos, startOffset, endOffset, payload);
                        }
                    }
                    FinishTerm();
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(termCount == numTerms);
                }
                FinishField();
            }
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(fieldCount == numFields);
            }
            FinishDocument();
        }
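Note the reuse idiom above: terms.GetEnumerator(termsEnum) and termsEnum.DocsAndPositions(null, docsAndPositionsEnum) both accept the previous enum so the codec can recycle it instead of allocating a new one per field or term. A minimal sketch of the same pattern, assuming a hypothetical IEnumerable<Terms> named fieldTerms:

            // Sketch: recycling enums across fields and terms; fieldTerms is
            // a hypothetical collection, and DocsAndPositions returns null
            // when positions were not indexed.
            TermsEnum            te  = null;
            DocsAndPositionsEnum dpe = null;
            foreach (Terms t in fieldTerms)
            {
                te = t.GetEnumerator(te);                 // codec may reuse the old enum
                while (te.MoveNext())
                {
                    dpe = te.DocsAndPositions(null, dpe); // ditto for the postings enum
                }
            }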
Example No. 10
 public override BytesRef GetPayload()
 {
     return current.GetPayload();
 }