예제 #1
0
        /// <summary>
        /// Merges the postings of a single term coming from several segments, all of
        /// which are positioned on that term. Every merged document (and, unless
        /// frequencies/positions are omitted, every position with its payload) is
        /// handed to the consumers obtained from <paramref name="termsConsumer"/>.
        /// </summary>
        /// <param name="termsConsumer">consumer that is given the term text of
        /// <c>smis[0]</c> and returns the per-document consumer used for the merge</param>
        /// <param name="smis">array of segments</param>
        /// <param name="n">number of cells in the array actually occupied</param>
        /// <returns>number of documents across all segments where this term was found</returns>
        /// <exception cref="CorruptIndexException">if the index is corrupt</exception>
        /// <exception cref="System.IO.IOException">if there is a low-level IO error</exception>
        private int AppendPostings(FormatPostingsTermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n)
        {
            FormatPostingsDocsConsumer docsSink = termsConsumer.AddTerm(smis[0].term.text);
            int docCount = 0;

            // Only the first n cells of smis are in use, so iterate by index.
            for (int segment = 0; segment < n; segment++)
            {
                SegmentMergeInfo mergeInfo     = smis[segment];
                TermPositions    termPositions = mergeInfo.GetPositions();
                System.Diagnostics.Debug.Assert(termPositions != null);
                int   docBase     = mergeInfo.base_Renamed;
                int[] deletionMap = mergeInfo.GetDocMap();
                termPositions.Seek(mergeInfo.termEnum);

                while (termPositions.Next())
                {
                    docCount++;

                    // Map around deletions (when a map exists), then shift the
                    // document number into the merged doc-id space.
                    int segmentDoc = termPositions.Doc();
                    int mergedDoc  = (deletionMap == null ? segmentDoc : deletionMap[segmentDoc]) + docBase;

                    int occurrences = termPositions.Freq();
                    FormatPostingsPositionsConsumer positionsSink = docsSink.AddDoc(mergedDoc, occurrences);

                    if (omitTermFreqAndPositions)
                    {
                        // Nothing beyond the document entry is written in this mode.
                        continue;
                    }

                    for (int remaining = occurrences; remaining > 0; remaining--)
                    {
                        int position      = termPositions.NextPosition();
                        int payloadLength = termPositions.GetPayloadLength();
                        if (payloadLength > 0)
                        {
                            // Grow the shared scratch buffer only when it is too small.
                            bool bufferTooSmall = payloadBuffer == null || payloadBuffer.Length < payloadLength;
                            if (bufferTooSmall)
                            {
                                payloadBuffer = new byte[payloadLength];
                            }
                            termPositions.GetPayload(payloadBuffer, 0);
                        }

                        // With payloadLength == 0 the buffer is passed unchanged
                        // (possibly null) together with a length of zero.
                        positionsSink.AddPosition(position, payloadBuffer, 0, payloadLength);
                    }
                    positionsSink.Finish();
                }
            }

            docsSink.Finish();
            return docCount;
        }
예제 #2
0
        /// <summary>
        /// Builds an index with payloads in the given Directory and performs
        /// different tests to verify the payload encoding: a full read-back of all
        /// payload bytes, lazy skipping of unread payloads, payload lengths at skip
        /// points, the rule that a payload may be fetched only once per position,
        /// and a single long payload.
        /// </summary>
        /// <param name="dir">directory the test index is written to and read from;
        /// the index is created from scratch twice during the test</param>
        private void  PerformTest(Directory dir)
        {
            PayloadAnalyzer analyzer = new PayloadAnalyzer();
            IndexWriter     writer   = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);

            // should be in sync with value in TermInfosWriter
            int skipInterval = 16;

            int numTerms = 5;

            System.String fieldName = "f1";

            int numDocs = skipInterval + 1;

            // create content for the test documents with just a few terms
            Term[] terms = GenerateTerms(fieldName, numTerms);
            System.Text.StringBuilder sb = new System.Text.StringBuilder();
            for (int i = 0; i < terms.Length; i++)
            {
                sb.Append(terms[i].text_ForNUnit);
                sb.Append(" ");
            }
            System.String content = sb.ToString();


            // Total amount by which "offset" advances in the two indexing loops below:
            // 2 * numDocs documents advancing by numTerms each, plus numDocs documents
            // advancing by i * numTerms for i = 0 .. numDocs - 1.
            int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;

            byte[] payloadData = GenerateRandomData(payloadDataLength);

            Document d = new Document();

            d.Add(new Field(fieldName, content, Field.Store.NO, Field.Index.ANALYZED));
            // add the same document multiple times to have the same payload lengths for all
            // occurrences within two consecutive skip intervals
            int offset = 0;

            for (int i = 0; i < 2 * numDocs; i++)
            {
                analyzer.SetPayloadData(fieldName, payloadData, offset, 1);
                offset += numTerms;
                writer.AddDocument(d);
            }

            // make sure we create more than one segment to test merging
            writer.Flush();

            // now make sure the payload lengths differ at the next skip point
            for (int i = 0; i < numDocs; i++)
            {
                analyzer.SetPayloadData(fieldName, payloadData, offset, i);
                offset += i * numTerms;
                writer.AddDocument(d);
            }

            writer.Optimize();
            // flush
            writer.Close();


            /*
             * Verify the index
             * first we test if all payloads are stored correctly
             */
            IndexReader reader = IndexReader.Open(dir);

            byte[] verifyPayloadData = new byte[payloadDataLength];
            offset = 0;
            TermPositions[] tps = new TermPositions[numTerms];
            for (int i = 0; i < numTerms; i++)
            {
                tps[i] = reader.TermPositions(terms[i]);
            }

            // Advance all term enumerations in lock step and concatenate every
            // payload into verifyPayloadData in the order it is read.
            while (tps[0].Next())
            {
                for (int i = 1; i < numTerms; i++)
                {
                    tps[i].Next();
                }
                int freq = tps[0].Freq();

                for (int i = 0; i < freq; i++)
                {
                    for (int j = 0; j < numTerms; j++)
                    {
                        tps[j].NextPosition();
                        tps[j].GetPayload(verifyPayloadData, offset);
                        offset += tps[j].GetPayloadLength();
                    }
                }
            }

            for (int i = 0; i < numTerms; i++)
            {
                tps[i].Close();
            }

            AssertByteArrayEquals(payloadData, verifyPayloadData);

            /*
             *  test lazy skipping
             */
            TermPositions tp = reader.TermPositions(terms[0]);

            tp.Next();
            tp.NextPosition();
            // now we don't read this payload
            tp.NextPosition();
            Assert.AreEqual(1, tp.GetPayloadLength(), "Wrong payload length.");
            byte[] payload = tp.GetPayload(null, 0);
            // expected value first, actual value second, like the other assertions here
            Assert.AreEqual(payloadData[numTerms], payload[0]);
            tp.NextPosition();

            // we don't read this payload and skip to a different document
            tp.SkipTo(5);
            tp.NextPosition();
            Assert.AreEqual(1, tp.GetPayloadLength(), "Wrong payload length.");
            payload = tp.GetPayload(null, 0);
            Assert.AreEqual(payloadData[5 * numTerms], payload[0]);


            /*
             * Test different lengths at skip points
             */
            tp.Seek(terms[1]);
            tp.Next();
            tp.NextPosition();
            Assert.AreEqual(1, tp.GetPayloadLength(), "Wrong payload length.");
            tp.SkipTo(skipInterval - 1);
            tp.NextPosition();
            Assert.AreEqual(1, tp.GetPayloadLength(), "Wrong payload length.");
            tp.SkipTo(2 * skipInterval - 1);
            tp.NextPosition();
            Assert.AreEqual(1, tp.GetPayloadLength(), "Wrong payload length.");
            tp.SkipTo(3 * skipInterval - 1);
            tp.NextPosition();
            Assert.AreEqual(3 * skipInterval - 2 * numDocs - 1, tp.GetPayloadLength(), "Wrong payload length.");

            /*
             * Test multiple calls of GetPayload()
             */
            tp.GetPayload(null, 0);
            bool secondCallThrew = false;
            try
            {
                // it is forbidden to call GetPayload() more than once
                // without calling NextPosition()
                tp.GetPayload(null, 0);
            }
            catch (System.Exception)
            {
                // expected exception
                secondCallThrew = true;
            }
            // The assertion lives outside the try block on purpose: a failing assertion
            // raised inside it would itself be caught by the catch above, so the test
            // could never report a missing exception.
            Assert.IsTrue(secondCallThrew, "Expected exception not thrown");

            reader.Close();

            // test long payload
            analyzer = new PayloadAnalyzer();
            writer   = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
            System.String singleTerm = "lucene";

            d = new Document();
            d.Add(new Field(fieldName, singleTerm, Field.Store.NO, Field.Index.ANALYZED));
            // add a payload whose length is greater than the buffer size of BufferedIndexOutput
            payloadData = GenerateRandomData(2000);
            analyzer.SetPayloadData(fieldName, payloadData, 100, 1500);
            writer.AddDocument(d);


            writer.Optimize();
            // flush
            writer.Close();

            reader = IndexReader.Open(dir);
            tp     = reader.TermPositions(new Term(fieldName, singleTerm));
            tp.Next();
            tp.NextPosition();

            verifyPayloadData = new byte[tp.GetPayloadLength()];
            tp.GetPayload(verifyPayloadData, 0);
            // The stored payload should equal the 1500-byte slice starting at offset 100.
            byte[] portion = new byte[1500];
            Array.Copy(payloadData, 100, portion, 0, 1500);

            AssertByteArrayEquals(portion, verifyPayloadData);
            reader.Close();
        }
예제 #3
0
        /// <summary>
        /// Merges the postings of a single term coming from several segments, all of
        /// which are positioned on that term. Document/frequency entries are written
        /// to <c>freqOutput</c>, position (and payload) entries to <c>proxOutput</c>,
        /// and a skip entry is buffered every <c>skipInterval</c> documents.
        /// </summary>
        /// <param name="smis">array of segments</param>
        /// <param name="n">number of cells in the array actually occupied</param>
        /// <returns>number of documents across all segments where this term was found</returns>
        /// <exception cref="CorruptIndexException">if the index is corrupt, e.g. the
        /// merged document numbers are negative or not strictly increasing</exception>
        /// <exception cref="System.IO.IOException">if there is a low-level IO error</exception>
        private int AppendPostings(SegmentMergeInfo[] smis, int n)
        {
            int previousDoc = 0;
            int docCount    = 0;    // number of docs w/ term

            skipListWriter.ResetSkip();
            bool withPayloads          = fieldInfos.FieldInfo(smis[0].term.field).storePayloads;
            int  previousPayloadLength = -1;    // ensures that we write the first length

            // Only the first n cells of smis are in use, so iterate by index.
            for (int segment = 0; segment < n; segment++)
            {
                SegmentMergeInfo mergeInfo     = smis[segment];
                TermPositions    termPositions = mergeInfo.GetPositions();
                System.Diagnostics.Debug.Assert(termPositions != null);
                int   docBase     = mergeInfo.base_Renamed;
                int[] deletionMap = mergeInfo.GetDocMap();
                termPositions.Seek(mergeInfo.termEnum);

                while (termPositions.Next())
                {
                    // Map around deletions (when a map exists), then shift the
                    // document number into the merged doc-id space.
                    int segmentDoc = termPositions.Doc();
                    int mergedDoc  = (deletionMap == null ? segmentDoc : deletionMap[segmentDoc]) + docBase;

                    bool outOfOrder = mergedDoc < 0 || (docCount > 0 && mergedDoc <= previousDoc);
                    if (outOfOrder)
                    {
                        throw new CorruptIndexException("docs out of order (" + mergedDoc + " <= " + previousDoc + " )");
                    }

                    docCount++;

                    // The skip entry is recorded with the previous document, so this
                    // must happen before previousDoc is advanced below.
                    if (docCount % skipInterval == 0)
                    {
                        skipListWriter.SetSkipData(previousDoc, withPayloads, previousPayloadLength);
                        skipListWriter.BufferSkip(docCount);
                    }

                    // Doc delta shifted left by one; the low bit flags freq == 1.
                    int docCode = (mergedDoc - previousDoc) << 1;
                    previousDoc = mergedDoc;

                    int occurrences = termPositions.Freq();
                    if (occurrences != 1)
                    {
                        freqOutput.WriteVInt(docCode);          // write doc
                        freqOutput.WriteVInt(occurrences);      // write frequency in doc
                    }
                    else
                    {
                        freqOutput.WriteVInt(docCode | 1);      // write doc & freq=1
                    }

                    // See DocumentWriter#writePostings(Posting[], String) for
                    // documentation about the encoding of positions and payloads.
                    int previousPosition = 0;                   // positions are written as deltas
                    for (int remaining = occurrences; remaining > 0; remaining--)
                    {
                        int position      = termPositions.NextPosition();
                        int positionDelta = position - previousPosition;
                        previousPosition  = position;

                        if (!withPayloads)
                        {
                            proxOutput.WriteVInt(positionDelta);
                            continue;
                        }

                        // With payloads the delta is doubled and the low bit signals
                        // that a new payload length follows.
                        int payloadLength = termPositions.GetPayloadLength();
                        if (payloadLength != previousPayloadLength)
                        {
                            proxOutput.WriteVInt(positionDelta * 2 + 1);
                            proxOutput.WriteVInt(payloadLength);
                            previousPayloadLength = payloadLength;
                        }
                        else
                        {
                            proxOutput.WriteVInt(positionDelta * 2);
                        }

                        if (payloadLength > 0)
                        {
                            // Grow the shared scratch buffer only when it is too small.
                            bool bufferTooSmall = payloadBuffer == null || payloadBuffer.Length < payloadLength;
                            if (bufferTooSmall)
                            {
                                payloadBuffer = new byte[payloadLength];
                            }
                            termPositions.GetPayload(payloadBuffer, 0);
                            proxOutput.WriteBytes(payloadBuffer, 0, payloadLength);
                        }
                    }
                }
            }

            return docCount;
        }