public virtual void  TestSeek()
        {
            Directory   directory = new RAMDirectory();
            IndexWriter writer    = new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);

            for (int i = 0; i < 10; i++)
            {
                Document doc = new Document();
                doc.Add(new Field(this.field, "a b", Field.Store.YES, Field.Index.ANALYZED));
                writer.AddDocument(doc);
            }

            writer.Close();
            IndexReader   reader = IndexReader.Open(directory);
            TermPositions tp     = reader.TermPositions();

            tp.Seek(new Term(this.field, "b"));
            for (int i = 0; i < 10; i++)
            {
                tp.Next();
                Assert.AreEqual(tp.Doc(), i);
                Assert.AreEqual(tp.NextPosition(), 1);
            }
            tp.Seek(new Term(this.field, "a"));
            for (int i = 0; i < 10; i++)
            {
                tp.Next();
                Assert.AreEqual(tp.Doc(), i);
                Assert.AreEqual(tp.NextPosition(), 0);
            }
        }
示例#2
0
        public virtual void  TestPreAnalyzedField()
        {
            IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
            Document    doc    = new Document();

            doc.Add(new Field("preanalyzed", new AnonymousClassTokenStream(this), TermVector.NO));

            writer.AddDocument(doc);
            writer.Flush();
            SegmentInfo info = writer.NewestSegment();

            writer.Close();
            SegmentReader reader = SegmentReader.Get(info);

            TermPositions termPositions = reader.TermPositions(new Term("preanalyzed", "term1"));

            Assert.IsTrue(termPositions.Next());
            Assert.AreEqual(1, termPositions.Freq());
            Assert.AreEqual(0, termPositions.NextPosition());

            termPositions.Seek(new Term("preanalyzed", "term2"));
            Assert.IsTrue(termPositions.Next());
            Assert.AreEqual(2, termPositions.Freq());
            Assert.AreEqual(1, termPositions.NextPosition());
            Assert.AreEqual(3, termPositions.NextPosition());

            termPositions.Seek(new Term("preanalyzed", "term3"));
            Assert.IsTrue(termPositions.Next());
            Assert.AreEqual(1, termPositions.Freq());
            Assert.AreEqual(2, termPositions.NextPosition());
        }
示例#3
0
        public virtual void  TestPositionIncrementGap()
        {
            Analyzer analyzer = new AnonymousClassAnalyzer(this);

            IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);

            Document doc = new Document();

            doc.Add(new Field("repeated", "repeated one", Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("repeated", "repeated two", Field.Store.YES, Field.Index.ANALYZED));

            writer.AddDocument(doc);
            writer.Flush();
            SegmentInfo info = writer.NewestSegment();

            writer.Close();
            SegmentReader reader = SegmentReader.Get(info);

            TermPositions termPositions = reader.TermPositions(new Term("repeated", "repeated"));

            Assert.IsTrue(termPositions.Next());
            int freq = termPositions.Freq();

            Assert.AreEqual(2, freq);
            Assert.AreEqual(0, termPositions.NextPosition());
            Assert.AreEqual(502, termPositions.NextPosition());
        }
示例#4
0
        public virtual void  TestTokenReuse()
        {
            Analyzer analyzer = new AnonymousClassAnalyzer1(this);

            IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);

            Document doc = new Document();

            doc.Add(new Field("f1", "a 5 a a", Field.Store.YES, Field.Index.ANALYZED));

            writer.AddDocument(doc);
            writer.Flush();
            SegmentInfo info = writer.NewestSegment();

            writer.Close();
            SegmentReader reader = SegmentReader.Get(info);

            TermPositions termPositions = reader.TermPositions(new Term("f1", "a"));

            Assert.IsTrue(termPositions.Next());
            int freq = termPositions.Freq();

            Assert.AreEqual(3, freq);
            Assert.AreEqual(0, termPositions.NextPosition());
            Assert.AreEqual(true, termPositions.IsPayloadAvailable());
            Assert.AreEqual(6, termPositions.NextPosition());
            Assert.AreEqual(false, termPositions.IsPayloadAvailable());
            Assert.AreEqual(7, termPositions.NextPosition());
            Assert.AreEqual(false, termPositions.IsPayloadAvailable());
        }
示例#5
0
        protected virtual void LoadPayload(Term term)
        {
            byte[]        payloadBuf = null;
            TermPositions tp         = _reader.TermPositions();

            tp.Seek(term);
            while (tp.Next())
            {
                if (tp.Freq > 0)
                {
                    tp.NextPosition();
                    if (tp.IsPayloadAvailable)
                    {
                        int len = tp.PayloadLength;
                        payloadBuf = tp.GetPayload(payloadBuf, 0);
                        Add(tp.Doc, payloadBuf, len);
                    }
                }
            }

            // save the last page

            while (_curSlot < MAX_SLOTS)
            {
                _curPage[_curSlot++] = MISSING;
            }
            _list[_curPageNo] = CopyPage(new int[_curData]); // optimize the page to make getNumItems work
            _curPage          = null;
        }
            public override void Load()
            {
                TermPositions tp = null;

                byte[] payloadBuffer = new byte[4]; // four bytes for an int
                try
                {
                    tp = _reader.TermPositions(_sizeTerm);

                    if (tp == null)
                    {
                        return;
                    }

                    while (tp.Next())
                    {
                        if (tp.Freq > 0)
                        {
                            tp.NextPosition();
                            tp.GetPayload(payloadBuffer, 0);
                            int len = BytesToInt(payloadBuffer);
                            Allocate(tp.Doc, Math.Min(len, _maxItems), true);
                        }
                    }
                }
                finally
                {
                    if (tp != null)
                    {
                        tp.Dispose();
                    }
                }
            }
        /// <summary>Process postings from multiple segments all positioned on the
        /// same term. Writes out merged entries into freqOutput and
        /// the proxOutput streams.
        ///
        /// </summary>
        /// <param name="smis">array of segments
        /// </param>
        /// <param name="n">number of cells in the array actually occupied
        /// </param>
        /// <returns> number of documents across all segments where this term was found
        /// </returns>
        private int AppendPostings(SegmentMergeInfo[] smis, int n)
        {
            int lastDoc = 0;
            int df      = 0; // number of docs w/ term

            ResetSkip();
            for (int i = 0; i < n; i++)
            {
                SegmentMergeInfo smi      = smis[i];
                TermPositions    postings = smi.postings;
                int   base_Renamed        = smi.base_Renamed;
                int[] docMap = smi.docMap;
                postings.Seek(smi.termEnum);
                while (postings.Next())
                {
                    int doc = postings.Doc();
                    if (docMap != null)
                    {
                        doc = docMap[doc]; // map around deletions
                    }
                    doc += base_Renamed;   // convert to merged space

                    if (doc < lastDoc)
                    {
                        throw new System.SystemException("docs out of order");
                    }

                    df++;

                    if ((df % skipInterval) == 0)
                    {
                        BufferSkip(lastDoc);
                    }

                    int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
                    lastDoc = doc;

                    int freq = postings.Freq();
                    if (freq == 1)
                    {
                        freqOutput.WriteVInt(docCode | 1); // write doc & freq=1
                    }
                    else
                    {
                        freqOutput.WriteVInt(docCode); // write doc
                        freqOutput.WriteVInt(freq);    // write frequency in doc
                    }

                    int lastPosition = 0; // write position deltas
                    for (int j = 0; j < freq; j++)
                    {
                        int position = postings.NextPosition();
                        proxOutput.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                }
            }
            return(df);
        }
        public virtual void  TestFilterIndexReader_Renamed()
        {
            RAMDirectory directory = new MockRAMDirectory();
            IndexWriter  writer    = new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);

            Document d1 = new Document();

            d1.Add(new Field("default", "one two", Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(d1);

            Document d2 = new Document();

            d2.Add(new Field("default", "one three", Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(d2);

            Document d3 = new Document();

            d3.Add(new Field("default", "two four", Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(d3);

            writer.Close();

            IndexReader reader = new TestReader(IndexReader.Open(directory));

            Assert.IsTrue(reader.IsOptimized());

            TermEnum terms = reader.Terms();

            while (terms.Next())
            {
                Assert.IsTrue(terms.Term().Text().IndexOf('e') != -1);
            }
            terms.Close();

            TermPositions positions = reader.TermPositions(new Term("default", "one"));

            while (positions.Next())
            {
                Assert.IsTrue((positions.Doc() % 2) == 1);
            }

            int NUM_DOCS = 3;

            TermDocs td = reader.TermDocs(null);

            for (int i = 0; i < NUM_DOCS; i++)
            {
                Assert.IsTrue(td.Next());
                Assert.AreEqual(i, td.Doc());
                Assert.AreEqual(1, td.Freq());
            }
            td.Close();
            reader.Close();
            directory.Close();
        }
 internal bool Next()
 {
     // increments to next doc
     if (!tp.Next())
     {
         tp.Close();                  // close stream
         doc = System.Int32.MaxValue; // sentinel value
         return(false);
     }
     doc      = tp.Doc;
     position = 0;
     return(true);
 }
示例#10
0
        /// <summary>Process postings from multiple segments all positioned on the
        /// same term. Writes out merged entries into freqOutput and
        /// the proxOutput streams.
        ///
        /// </summary>
        /// <param name="smis">array of segments
        /// </param>
        /// <param name="n">number of cells in the array actually occupied
        /// </param>
        /// <returns> number of documents across all segments where this term was found
        /// </returns>
        /// <throws>  CorruptIndexException if the index is corrupt </throws>
        /// <throws>  IOException if there is a low-level IO error </throws>
        private int AppendPostings(FormatPostingsTermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n)
        {
            FormatPostingsDocsConsumer docConsumer = termsConsumer.AddTerm(smis[0].term.Text);
            int df = 0;

            for (int i = 0; i < n; i++)
            {
                SegmentMergeInfo smi      = smis[i];
                TermPositions    postings = smi.GetPositions();
                System.Diagnostics.Debug.Assert(postings != null);
                int   base_Renamed = smi.base_Renamed;
                int[] docMap       = smi.GetDocMap();
                postings.Seek(smi.termEnum);

                while (postings.Next())
                {
                    df++;
                    int doc = postings.Doc;
                    if (docMap != null)
                    {
                        doc = docMap[doc]; // map around deletions
                    }
                    doc += base_Renamed;   // convert to merged space

                    int freq = postings.Freq;
                    FormatPostingsPositionsConsumer posConsumer = docConsumer.AddDoc(doc, freq);

                    if (!omitTermFreqAndPositions)
                    {
                        for (int j = 0; j < freq; j++)
                        {
                            int position      = postings.NextPosition();
                            int payloadLength = postings.PayloadLength;
                            if (payloadLength > 0)
                            {
                                if (payloadBuffer == null || payloadBuffer.Length < payloadLength)
                                {
                                    payloadBuffer = new byte[payloadLength];
                                }
                                postings.GetPayload(payloadBuffer, 0);
                            }
                            posConsumer.AddPosition(position, payloadBuffer, 0, payloadLength);
                        }
                        posConsumer.Finish();
                    }
                }
            }
            docConsumer.Finish();

            return(df);
        }
示例#11
0
            internal TermPositionsQueue(System.Collections.IList termPositions)
            {
                Initialize(termPositions.Count);

                System.Collections.IEnumerator i = termPositions.GetEnumerator();
                while (i.MoveNext())
                {
                    TermPositions tp = (TermPositions)i.Current;
                    if (tp.Next())
                    {
                        Put(tp);
                    }
                }
            }
示例#12
0
 public override bool Next()
 {
     if (count == freq)
     {
         if (!internalPositions.Next())
         {
             internalDoc = int.MaxValue;
             return(false);
         }
         internalDoc = internalPositions.Doc;
         freq        = internalPositions.Freq;
         count       = 0;
     }
     position = internalPositions.NextPosition();
     count++;
     return(true);
 }
示例#13
0
        /// <summary>
        /// Process postings from multiple segments without tf, all positioned on the same term.
        /// Writes out merged entries only into freqOutput, proxOut is not written.
        /// </summary>
        /// <param name="smis">smis array of segments</param>
        /// <param name="n">number of cells in the array actually occupied</param>
        /// <returns></returns>
        private int AppendPostingsNoTf(SegmentMergeInfo[] smis, int n)
        {
            int lastDoc = 0;
            int df      = 0;      // number of docs w/ term

            skipListWriter.ResetSkip();
            int lastPayloadLength = -1;   // ensures that we write the first length

            for (int i = 0; i < n; i++)
            {
                SegmentMergeInfo smi      = smis[i];
                TermPositions    postings = smi.GetPositions();
                System.Diagnostics.Debug.Assert(postings != null);
                int   base_Renamed = smi.base_Renamed;
                int[] docMap       = smi.GetDocMap();
                postings.Seek(smi.termEnum);
                while (postings.Next())
                {
                    int doc = postings.Doc();
                    if (docMap != null)
                    {
                        doc = docMap[doc];                      // map around deletions
                    }
                    doc += base_Renamed;                        // convert to merged space

                    if (doc < 0 || (df > 0 && doc <= lastDoc))
                    {
                        throw new CorruptIndexException("docs out of order (" + doc +
                                                        " <= " + lastDoc + " )");
                    }

                    df++;

                    if ((df % skipInterval) == 0)
                    {
                        skipListWriter.SetSkipData(lastDoc, false, lastPayloadLength);
                        skipListWriter.BufferSkip(df);
                    }

                    int docCode = (doc - lastDoc);
                    lastDoc = doc;
                    freqOutput.WriteVInt(docCode);    // write doc & freq=1
                }
            }
            return(df);
        }
示例#14
0
        public virtual void  TestThreadSafety()
        {
            rnd = NewRandom();
            int           numThreads = 5;
            int           numDocs    = 50;
            ByteArrayPool pool       = new ByteArrayPool(numThreads, 5);

            Directory   dir    = new RAMDirectory();
            IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED, null);

            System.String field = "test";

            ThreadClass[] ingesters = new ThreadClass[numThreads];
            for (int i = 0; i < numThreads; i++)
            {
                ingesters[i] = new AnonymousClassThread(numDocs, field, pool, writer, this);
                ingesters[i].Start();
            }

            for (int i = 0; i < numThreads; i++)
            {
                ingesters[i].Join();
            }
            writer.Close();
            IndexReader reader = IndexReader.Open(dir, true, null);
            TermEnum    terms  = reader.Terms(null);

            while (terms.Next(null))
            {
                TermPositions tp = reader.TermPositions(terms.Term, null);
                while (tp.Next(null))
                {
                    int freq = tp.Freq;
                    for (int i = 0; i < freq; i++)
                    {
                        tp.NextPosition(null);
                        Assert.AreEqual(pool.BytesToString(tp.GetPayload(new byte[5], 0, null)), terms.Term.Text);
                    }
                }
                tp.Close();
            }
            terms.Close();
            reader.Close();

            Assert.AreEqual(pool.Size(), numThreads);
        }
示例#15
0
文件: TestDoc.cs 项目: yonder/mono
        private void  PrintSegment(System.IO.StringWriter out_Renamed, System.String segment)
        {
            Directory     directory = FSDirectory.GetDirectory(indexDir, false);
            SegmentReader reader    = new SegmentReader(new SegmentInfo(segment, 1, directory));

            for (int i = 0; i < reader.NumDocs(); i++)
            {
                out_Renamed.WriteLine(reader.Document(i));
            }

            TermEnum tis = reader.Terms();

            while (tis.Next())
            {
                out_Renamed.Write(tis.Term());
                out_Renamed.WriteLine(" DF=" + tis.DocFreq());

                TermPositions positions = reader.TermPositions(tis.Term());
                try
                {
                    while (positions.Next())
                    {
                        out_Renamed.Write(" doc=" + positions.Doc());
                        out_Renamed.Write(" TF=" + positions.Freq());
                        out_Renamed.Write(" pos=");
                        out_Renamed.Write(positions.NextPosition());
                        for (int j = 1; j < positions.Freq(); j++)
                        {
                            out_Renamed.Write("," + positions.NextPosition());
                        }
                        out_Renamed.WriteLine("");
                    }
                }
                finally
                {
                    positions.Close();
                }
            }
            tis.Close();
            reader.Close();
            directory.Close();
        }
示例#16
0
文件: DocTest.cs 项目: raj581/Marvin
        internal static void  PrintSegment(System.String segment)
        {
            Directory     directory = FSDirectory.GetDirectory("test", false);
            SegmentReader reader    = new SegmentReader(new SegmentInfo(segment, 1, directory));

            for (int i = 0; i < reader.NumDocs(); i++)
            {
                System.Console.Out.WriteLine(reader.Document(i));
            }

            TermEnum tis = reader.Terms();

            while (tis.Next())
            {
                System.Console.Out.Write(tis.Term());
                System.Console.Out.WriteLine(" DF=" + tis.DocFreq());

                TermPositions positions = reader.TermPositions(tis.Term());
                try
                {
                    while (positions.Next())
                    {
                        System.Console.Out.Write(" doc=" + positions.Doc());
                        System.Console.Out.Write(" TF=" + positions.Freq());
                        System.Console.Out.Write(" pos=");
                        System.Console.Out.Write(positions.NextPosition());
                        for (int j = 1; j < positions.Freq(); j++)
                        {
                            System.Console.Out.Write("," + positions.NextPosition());
                        }
                        System.Console.Out.WriteLine("");
                    }
                }
                finally
                {
                    positions.Close();
                }
            }
            tis.Close();
            reader.Close();
            directory.Close();
        }
示例#17
0
        public virtual void  TestFilterIndexReader_()
        {
            RAMDirectory directory = new RAMDirectory();
            IndexWriter  writer    = new IndexWriter(directory, new WhitespaceAnalyzer(), true);

            Document d1 = new Document();

            d1.Add(Field.Text("default", "one two"));
            writer.AddDocument(d1);

            Document d2 = new Document();

            d2.Add(Field.Text("default", "one three"));
            writer.AddDocument(d2);

            Document d3 = new Document();

            d3.Add(Field.Text("default", "two four"));
            writer.AddDocument(d3);

            writer.Close();

            IndexReader reader = new TestReader(IndexReader.Open(directory));

            TermEnum terms = reader.Terms();

            while (terms.Next())
            {
                Assert.IsTrue(terms.Term().Text().IndexOf((System.Char) 'e') != -1);
            }
            terms.Close();

            TermPositions positions = reader.TermPositions(new Term("default", "one"));

            while (positions.Next())
            {
                Assert.IsTrue((positions.Doc() % 2) == 1);
            }

            reader.Close();
        }
示例#18
0
        private void  PrintSegment(System.IO.StreamWriter out_Renamed, SegmentInfo si)
        {
            SegmentReader reader = SegmentReader.Get(si);

            for (int i = 0; i < reader.NumDocs(); i++)
            {
                out_Renamed.WriteLine(reader.Document(i));
            }

            TermEnum tis = reader.Terms();

            while (tis.Next())
            {
                out_Renamed.Write(tis.Term());
                out_Renamed.WriteLine(" DF=" + tis.DocFreq());

                TermPositions positions = reader.TermPositions(tis.Term());
                try
                {
                    while (positions.Next())
                    {
                        out_Renamed.Write(" doc=" + positions.Doc());
                        out_Renamed.Write(" TF=" + positions.Freq());
                        out_Renamed.Write(" pos=");
                        out_Renamed.Write(positions.NextPosition());
                        for (int j = 1; j < positions.Freq(); j++)
                        {
                            out_Renamed.Write("," + positions.NextPosition());
                        }
                        out_Renamed.WriteLine("");
                    }
                }
                finally
                {
                    positions.Close();
                }
            }
            tis.Close();
            reader.Close();
        }
示例#19
0
        private void  PrintSegment(System.IO.StreamWriter out_Renamed, SegmentInfo si)
        {
            SegmentReader reader = SegmentReader.Get(true, si, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR, null);

            for (int i = 0; i < reader.NumDocs(); i++)
            {
                out_Renamed.WriteLine(reader.Document(i, null));
            }

            TermEnum tis = reader.Terms(null);

            while (tis.Next(null))
            {
                out_Renamed.Write(tis.Term);
                out_Renamed.WriteLine(" DF=" + tis.DocFreq());

                TermPositions positions = reader.TermPositions(tis.Term, null);
                try
                {
                    while (positions.Next(null))
                    {
                        out_Renamed.Write(" doc=" + positions.Doc);
                        out_Renamed.Write(" TF=" + positions.Freq);
                        out_Renamed.Write(" pos=");
                        out_Renamed.Write(positions.NextPosition(null));
                        for (int j = 1; j < positions.Freq; j++)
                        {
                            out_Renamed.Write("," + positions.NextPosition(null));
                        }
                        out_Renamed.WriteLine("");
                    }
                }
                finally
                {
                    positions.Close();
                }
            }
            tis.Close();
            reader.Close();
        }
示例#20
0
        /// <summary>Process postings from multiple segments all positioned on the
        /// same term. Writes out merged entries into freqOutput and
        /// the proxOutput streams.
        ///
        /// </summary>
        /// <param name="smis">array of segments
        /// </param>
        /// <param name="n">number of cells in the array actually occupied
        /// </param>
        /// <returns> number of documents across all segments where this term was found
        /// </returns>
        /// <throws>  CorruptIndexException if the index is corrupt </throws>
        /// <throws>  IOException if there is a low-level IO error </throws>
        private int AppendPostings(SegmentMergeInfo[] smis, int n)
        {
            int lastDoc = 0;
            int df      = 0;        // number of docs w/ term

            skipListWriter.ResetSkip();
            bool storePayloads     = fieldInfos.FieldInfo(smis[0].term.field).storePayloads;
            int  lastPayloadLength = -1;             // ensures that we write the first length

            for (int i = 0; i < n; i++)
            {
                SegmentMergeInfo smi      = smis[i];
                TermPositions    postings = smi.GetPositions();
                System.Diagnostics.Debug.Assert(postings != null);
                int   base_Renamed = smi.base_Renamed;
                int[] docMap       = smi.GetDocMap();
                postings.Seek(smi.termEnum);
                while (postings.Next())
                {
                    int doc = postings.Doc();
                    if (docMap != null)
                    {
                        doc = docMap[doc];                   // map around deletions
                    }
                    doc += base_Renamed;                     // convert to merged space

                    if (doc < 0 || (df > 0 && doc <= lastDoc))
                    {
                        throw new CorruptIndexException("docs out of order (" + doc + " <= " + lastDoc + " )");
                    }

                    df++;

                    if ((df % skipInterval) == 0)
                    {
                        skipListWriter.SetSkipData(lastDoc, storePayloads, lastPayloadLength);
                        skipListWriter.BufferSkip(df);
                    }

                    int docCode = (doc - lastDoc) << 1;                     // use low bit to flag freq=1
                    lastDoc = doc;

                    int freq = postings.Freq();
                    if (freq == 1)
                    {
                        freqOutput.WriteVInt(docCode | 1);                         // write doc & freq=1
                    }
                    else
                    {
                        freqOutput.WriteVInt(docCode);                      // write doc
                        freqOutput.WriteVInt(freq);                         // write frequency in doc
                    }

                    /** See {@link DocumentWriter#writePostings(Posting[], String) for
                     *  documentation about the encoding of positions and payloads
                     */
                    int lastPosition = 0;                     // write position deltas
                    for (int j = 0; j < freq; j++)
                    {
                        int position = postings.NextPosition();
                        int delta    = position - lastPosition;
                        if (storePayloads)
                        {
                            int payloadLength = postings.GetPayloadLength();
                            if (payloadLength == lastPayloadLength)
                            {
                                proxOutput.WriteVInt(delta * 2);
                            }
                            else
                            {
                                proxOutput.WriteVInt(delta * 2 + 1);
                                proxOutput.WriteVInt(payloadLength);
                                lastPayloadLength = payloadLength;
                            }
                            if (payloadLength > 0)
                            {
                                if (payloadBuffer == null || payloadBuffer.Length < payloadLength)
                                {
                                    payloadBuffer = new byte[payloadLength];
                                }
                                postings.GetPayload(payloadBuffer, 0);
                                proxOutput.WriteBytes(payloadBuffer, 0, payloadLength);
                            }
                        }
                        else
                        {
                            proxOutput.WriteVInt(delta);
                        }
                        lastPosition = position;
                    }
                }
            }
            return(df);
        }
示例#21
0
        /// <summary>Returns true if index is clean, else false.</summary>
        public static bool Check(Directory dir, bool doFix)
        {
            System.Globalization.NumberFormatInfo nf = System.Globalization.CultureInfo.CurrentCulture.NumberFormat;
            SegmentInfos sis = new SegmentInfos();

            try
            {
                sis.Read(dir);
            }
            catch (System.Exception t)
            {
                out_Renamed.WriteLine("ERROR: could not read any segments file in directory");
                out_Renamed.Write(t.StackTrace);
                out_Renamed.Flush();
                return(false);
            }

            int numSegments = sis.Count;

            System.String segmentsFileName = sis.GetCurrentSegmentFileName();
            IndexInput    input            = null;

            try
            {
                input = dir.OpenInput(segmentsFileName);
            }
            catch (System.Exception t)
            {
                out_Renamed.WriteLine("ERROR: could not open segments file in directory");
                out_Renamed.Write(t.StackTrace);
                out_Renamed.Flush();
                return(false);
            }
            int format = 0;

            try
            {
                format = input.ReadInt();
            }
            catch (System.Exception t)
            {
                out_Renamed.WriteLine("ERROR: could not read segment file version in directory");
                out_Renamed.Write(t.StackTrace);
                out_Renamed.Flush();
                return(false);
            }
            finally
            {
                if (input != null)
                {
                    input.Close();
                }
            }

            System.String sFormat = "";
            bool          skip    = false;

            if (format == SegmentInfos.FORMAT)
            {
                sFormat = "FORMAT [Lucene Pre-2.1]";
            }
            if (format == SegmentInfos.FORMAT_LOCKLESS)
            {
                sFormat = "FORMAT_LOCKLESS [Lucene 2.1]";
            }
            else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE)
            {
                sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]";
            }
            else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE)
            {
                sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]";
            }
            else if (format < SegmentInfos.FORMAT_SHARED_DOC_STORE)
            {
                sFormat = "int=" + format + " [newer version of Lucene than this tool]";
                skip    = true;
            }
            else
            {
                sFormat = format + " [Lucene 1.3 or prior]";
            }

            out_Renamed.WriteLine("Segments file=" + segmentsFileName + " numSegments=" + numSegments + " version=" + sFormat);

            if (skip)
            {
                out_Renamed.WriteLine("\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting");
                return(false);
            }

            SegmentInfos newSIS = (SegmentInfos)sis.Clone();

            newSIS.Clear();
            bool changed         = false;
            int  totLoseDocCount = 0;
            int  numBadSegments  = 0;

            for (int i = 0; i < numSegments; i++)
            {
                SegmentInfo info = sis.Info(i);
                out_Renamed.WriteLine("  " + (1 + i) + " of " + numSegments + ": name=" + info.name + " docCount=" + info.docCount);
                int toLoseDocCount = info.docCount;

                SegmentReader reader = null;

                try
                {
                    out_Renamed.WriteLine("    compound=" + info.GetUseCompoundFile());
                    out_Renamed.WriteLine("    numFiles=" + info.Files().Count);
                    out_Renamed.WriteLine(String.Format(nf, "    size (MB)={0:f}", new Object[] { (info.SizeInBytes() / (1024.0 * 1024.0)) }));
                    int docStoreOffset = info.GetDocStoreOffset();
                    if (docStoreOffset != -1)
                    {
                        out_Renamed.WriteLine("    docStoreOffset=" + docStoreOffset);
                        out_Renamed.WriteLine("    docStoreSegment=" + info.GetDocStoreSegment());
                        out_Renamed.WriteLine("    docStoreIsCompoundFile=" + info.GetDocStoreIsCompoundFile());
                    }
                    System.String delFileName = info.GetDelFileName();
                    if (delFileName == null)
                    {
                        out_Renamed.WriteLine("    no deletions");
                    }
                    else
                    {
                        out_Renamed.WriteLine("    has deletions [delFileName=" + delFileName + "]");
                    }
                    out_Renamed.Write("    test: open reader.........");
                    reader = SegmentReader.Get(info);
                    int numDocs = reader.NumDocs();
                    toLoseDocCount = numDocs;
                    if (reader.HasDeletions())
                    {
                        out_Renamed.WriteLine("OK [" + (info.docCount - numDocs) + " deleted docs]");
                    }
                    else
                    {
                        out_Renamed.WriteLine("OK");
                    }

                    out_Renamed.Write("    test: fields, norms.......");
                    System.Collections.IDictionary fieldNames = (System.Collections.IDictionary)reader.GetFieldNames(IndexReader.FieldOption.ALL);
                    System.Collections.IEnumerator it         = fieldNames.Keys.GetEnumerator();
                    while (it.MoveNext())
                    {
                        System.String fieldName = (System.String)it.Current;
                        byte[]        b         = reader.Norms(fieldName);
                        if (b.Length != info.docCount)
                        {
                            throw new System.SystemException("norms for field \"" + fieldName + "\" is length " + b.Length + " != maxDoc " + info.docCount);
                        }
                    }
                    out_Renamed.WriteLine("OK [" + fieldNames.Count + " fields]");

                    out_Renamed.Write("    test: terms, freq, prox...");
                    TermEnum      termEnum      = reader.Terms();
                    TermPositions termPositions = reader.TermPositions();

                    // Used only to count up # deleted docs for this
                    // term
                    MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader);

                    long termCount = 0;
                    long totFreq   = 0;
                    long totPos    = 0;
                    while (termEnum.Next())
                    {
                        termCount++;
                        Term term    = termEnum.Term();
                        int  docFreq = termEnum.DocFreq();
                        termPositions.Seek(term);
                        int lastDoc = -1;
                        int freq0   = 0;
                        totFreq += docFreq;
                        while (termPositions.Next())
                        {
                            freq0++;
                            int doc  = termPositions.Doc();
                            int freq = termPositions.Freq();
                            if (doc <= lastDoc)
                            {
                                throw new System.SystemException("term " + term + ": doc " + doc + " < lastDoc " + lastDoc);
                            }
                            lastDoc = doc;
                            if (freq <= 0)
                            {
                                throw new System.SystemException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
                            }

                            int lastPos = -1;
                            totPos += freq;
                            for (int j = 0; j < freq; j++)
                            {
                                int pos = termPositions.NextPosition();
                                if (pos < 0)
                                {
                                    throw new System.SystemException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
                                }
                                if (pos <= lastPos)
                                {
                                    throw new System.SystemException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
                                }
                            }
                        }

                        // Now count how many deleted docs occurred in
                        // this term:
                        int delCount;
                        if (reader.HasDeletions())
                        {
                            myTermDocs.Seek(term);
                            while (myTermDocs.Next())
                            {
                            }
                            delCount = myTermDocs.delCount;
                        }
                        else
                        {
                            delCount = 0;
                        }

                        if (freq0 + delCount != docFreq)
                        {
                            throw new System.SystemException("term " + term + " docFreq=" + docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount);
                        }
                    }

                    out_Renamed.WriteLine("OK [" + termCount + " terms; " + totFreq + " terms/docs pairs; " + totPos + " tokens]");

                    out_Renamed.Write("    test: stored fields.......");
                    int  docCount  = 0;
                    long totFields = 0;
                    for (int j = 0; j < info.docCount; j++)
                    {
                        if (!reader.IsDeleted(j))
                        {
                            docCount++;
                            Document doc = reader.Document(j);
                            totFields += doc.GetFields().Count;
                        }
                    }

                    if (docCount != reader.NumDocs())
                    {
                        throw new System.SystemException("docCount=" + docCount + " but saw " + docCount + " undeleted docs");
                    }

                    out_Renamed.WriteLine(String.Format(nf, "OK [{0:d} total field count; avg {1:f} fields per doc]", new Object[] { totFields, (((float)totFields) / docCount) }));

                    out_Renamed.Write("    test: term vectors........");
                    int totVectors = 0;
                    for (int j = 0; j < info.docCount; j++)
                    {
                        if (!reader.IsDeleted(j))
                        {
                            TermFreqVector[] tfv = reader.GetTermFreqVectors(j);
                            if (tfv != null)
                            {
                                totVectors += tfv.Length;
                            }
                        }
                    }

                    out_Renamed.WriteLine(String.Format(nf, "OK [{0:d} total vector count; avg {1:f} term/freq vector fields per doc]", new Object[] { totVectors, (((float)totVectors) / docCount) }));
                    out_Renamed.WriteLine("");
                }
                catch (System.Exception t)
                {
                    out_Renamed.WriteLine("FAILED");
                    System.String comment;
                    if (doFix)
                    {
                        comment = "will remove reference to this segment (-fix is specified)";
                    }
                    else
                    {
                        comment = "would remove reference to this segment (-fix was not specified)";
                    }
                    out_Renamed.WriteLine("    WARNING: " + comment + "; full exception:");
                    out_Renamed.Write(t.StackTrace);
                    out_Renamed.Flush();
                    out_Renamed.WriteLine("");
                    totLoseDocCount += toLoseDocCount;
                    numBadSegments++;
                    changed = true;
                    continue;
                }
                finally
                {
                    if (reader != null)
                    {
                        reader.Close();
                    }
                }

                // Keeper
                newSIS.Add(info.Clone());
            }

            if (!changed)
            {
                out_Renamed.WriteLine("No problems were detected with this index.\n");
                return(true);
            }
            else
            {
                out_Renamed.WriteLine("WARNING: " + numBadSegments + " broken segments detected");
                if (doFix)
                {
                    out_Renamed.WriteLine("WARNING: " + totLoseDocCount + " documents will be lost");
                }
                else
                {
                    out_Renamed.WriteLine("WARNING: " + totLoseDocCount + " documents would be lost if -fix were specified");
                }
                out_Renamed.WriteLine();
            }

            if (doFix)
            {
                out_Renamed.WriteLine("NOTE: will write new segments file in 5 seconds; this will remove " + totLoseDocCount + " docs from the index. THIS IS YOUR LAST CHANCE TO CTRL+C!");
                for (int i = 0; i < 5; i++)
                {
                    try
                    {
                        System.Threading.Thread.Sleep(new System.TimeSpan((System.Int64) 10000 * 1000));
                    }
                    catch (System.Threading.ThreadInterruptedException)
                    {
                        SupportClass.ThreadClass.Current().Interrupt();
                        i--;
                        continue;
                    }

                    out_Renamed.WriteLine("  " + (5 - i) + "...");
                }
                out_Renamed.Write("Writing...");
                try
                {
                    newSIS.Write(dir);
                }
                catch (System.Exception t)
                {
                    out_Renamed.WriteLine("FAILED; exiting");
                    out_Renamed.Write(t.StackTrace);
                    out_Renamed.Flush();
                    return(false);
                }
                out_Renamed.WriteLine("OK");
                out_Renamed.WriteLine("Wrote new segments file \"" + newSIS.GetCurrentSegmentFileName() + "\"");
            }
            else
            {
                out_Renamed.WriteLine("NOTE: would write new segments file [-fix was not specified]");
            }
            out_Renamed.WriteLine("");

            return(false);
        }
示例#22
0
        /// <summary> Test the term index.</summary>
        private Status.TermIndexStatus TestTermIndex(SegmentInfo info, SegmentReader reader)
        {
            Status.TermIndexStatus status = new Status.TermIndexStatus();

            try
            {
                if (infoStream != null)
                {
                    infoStream.Write("    test: terms, freq, prox...");
                }

                TermEnum      termEnum      = reader.Terms();
                TermPositions termPositions = reader.TermPositions();

                // Used only to count up # deleted docs for this term
                MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader);

                int maxDoc = reader.MaxDoc();

                while (termEnum.Next())
                {
                    status.termCount++;
                    Term term    = termEnum.Term();
                    int  docFreq = termEnum.DocFreq();
                    termPositions.Seek(term);
                    int lastDoc = -1;
                    int freq0   = 0;
                    status.totFreq += docFreq;
                    while (termPositions.Next())
                    {
                        freq0++;
                        int doc  = termPositions.Doc();
                        int freq = termPositions.Freq();
                        if (doc <= lastDoc)
                        {
                            throw new System.SystemException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
                        }
                        if (doc >= maxDoc)
                        {
                            throw new System.SystemException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);
                        }

                        lastDoc = doc;
                        if (freq <= 0)
                        {
                            throw new System.SystemException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
                        }

                        int lastPos = -1;
                        status.totPos += freq;
                        for (int j = 0; j < freq; j++)
                        {
                            int pos = termPositions.NextPosition();
                            if (pos < -1)
                            {
                                throw new System.SystemException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
                            }
                            if (pos < lastPos)
                            {
                                throw new System.SystemException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
                            }
                        }
                    }

                    // Now count how many deleted docs occurred in
                    // this term:
                    int delCount;
                    if (reader.HasDeletions())
                    {
                        myTermDocs.Seek(term);
                        while (myTermDocs.Next())
                        {
                        }
                        delCount = myTermDocs.delCount;
                    }
                    else
                    {
                        delCount = 0;
                    }

                    if (freq0 + delCount != docFreq)
                    {
                        throw new System.SystemException("term " + term + " docFreq=" + docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount);
                    }
                }

                Msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");
            }
            catch (System.Exception e)
            {
                Msg("ERROR [" + System.Convert.ToString(e.Message) + "]");
                status.error = e;
                if (infoStream != null)
                {
                    infoStream.WriteLine(e.StackTrace);
                }
            }

            return(status);
        }
示例#23
0
        // builds an index with payloads in the given Directory and performs
        // different tests to verify the payload encoding
        private void  PerformTest(Directory dir)
        {
            PayloadAnalyzer analyzer = new PayloadAnalyzer();
            IndexWriter     writer   = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED, null);

            // should be in sync with value in TermInfosWriter
            int skipInterval = 16;

            int numTerms = 5;

            System.String fieldName = "f1";

            int numDocs = skipInterval + 1;

            // create content for the test documents with just a few terms
            Term[] terms = GenerateTerms(fieldName, numTerms);
            System.Text.StringBuilder sb = new System.Text.StringBuilder();
            for (int i = 0; i < terms.Length; i++)
            {
                sb.Append(terms[i].Text);
                sb.Append(" ");
            }
            System.String content = sb.ToString();


            int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;

            byte[] payloadData = GenerateRandomData(payloadDataLength);

            Document d = new Document();

            d.Add(new Field(fieldName, content, Field.Store.NO, Field.Index.ANALYZED));
            // add the same document multiple times to have the same payload lengths for all
            // occurrences within two consecutive skip intervals
            int offset = 0;

            for (int i = 0; i < 2 * numDocs; i++)
            {
                analyzer.SetPayloadData(fieldName, payloadData, offset, 1);
                offset += numTerms;
                writer.AddDocument(d, null);
            }

            // make sure we create more than one segment to test merging
            writer.Commit(null);

            // now we make sure to have different payload lengths next at the next skip point
            for (int i = 0; i < numDocs; i++)
            {
                analyzer.SetPayloadData(fieldName, payloadData, offset, i);
                offset += i * numTerms;
                writer.AddDocument(d, null);
            }

            writer.Optimize(null);
            // flush
            writer.Close();


            /*
             * Verify the index
             * first we test if all payloads are stored correctly
             */
            IndexReader reader = IndexReader.Open(dir, true, null);

            byte[] verifyPayloadData = new byte[payloadDataLength];
            offset = 0;
            TermPositions[] tps = new TermPositions[numTerms];
            for (int i = 0; i < numTerms; i++)
            {
                tps[i] = reader.TermPositions(terms[i], null);
            }

            while (tps[0].Next(null))
            {
                for (int i = 1; i < numTerms; i++)
                {
                    tps[i].Next(null);
                }
                int freq = tps[0].Freq;

                for (int i = 0; i < freq; i++)
                {
                    for (int j = 0; j < numTerms; j++)
                    {
                        tps[j].NextPosition(null);
                        tps[j].GetPayload(verifyPayloadData, offset, null);
                        offset += tps[j].PayloadLength;
                    }
                }
            }

            for (int i = 0; i < numTerms; i++)
            {
                tps[i].Close();
            }

            AssertByteArrayEquals(payloadData, verifyPayloadData);

            /*
             *  test lazy skipping
             */
            TermPositions tp = reader.TermPositions(terms[0], null);

            tp.Next(null);
            tp.NextPosition(null);
            // now we don't read this payload
            tp.NextPosition(null);
            Assert.AreEqual(1, tp.PayloadLength, "Wrong payload length.");
            byte[] payload = tp.GetPayload(null, 0, null);
            Assert.AreEqual(payload[0], payloadData[numTerms]);
            tp.NextPosition(null);

            // we don't read this payload and skip to a different document
            tp.SkipTo(5, null);
            tp.NextPosition(null);
            Assert.AreEqual(1, tp.PayloadLength, "Wrong payload length.");
            payload = tp.GetPayload(null, 0, null);
            Assert.AreEqual(payload[0], payloadData[5 * numTerms]);


            /*
             * Test different lengths at skip points
             */
            tp.Seek(terms[1], null);
            tp.Next(null);
            tp.NextPosition(null);
            Assert.AreEqual(1, tp.PayloadLength, "Wrong payload length.");
            tp.SkipTo(skipInterval - 1, null);
            tp.NextPosition(null);
            Assert.AreEqual(1, tp.PayloadLength, "Wrong payload length.");
            tp.SkipTo(2 * skipInterval - 1, null);
            tp.NextPosition(null);
            Assert.AreEqual(1, tp.PayloadLength, "Wrong payload length.");
            tp.SkipTo(3 * skipInterval - 1, null);
            tp.NextPosition(null);
            Assert.AreEqual(3 * skipInterval - 2 * numDocs - 1, tp.PayloadLength, "Wrong payload length.");

            /*
             * Test multiple call of getPayload()
             */
            tp.GetPayload(null, 0, null);

            // it is forbidden to call getPayload() more than once
            // without calling nextPosition()
            Assert.Throws <IOException>(() => tp.GetPayload(null, 0, null), "Expected exception not thrown");

            reader.Close();

            // test long payload
            analyzer = new PayloadAnalyzer();
            writer   = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED, null);
            System.String singleTerm = "lucene";

            d = new Document();
            d.Add(new Field(fieldName, singleTerm, Field.Store.NO, Field.Index.ANALYZED));
            // add a payload whose length is greater than the buffer size of BufferedIndexOutput
            payloadData = GenerateRandomData(2000);
            analyzer.SetPayloadData(fieldName, payloadData, 100, 1500);
            writer.AddDocument(d, null);


            writer.Optimize(null);
            // flush
            writer.Close();

            reader = IndexReader.Open(dir, true, null);
            tp     = reader.TermPositions(new Term(fieldName, singleTerm), null);
            tp.Next(null);
            tp.NextPosition(null);

            verifyPayloadData = new byte[tp.PayloadLength];
            tp.GetPayload(verifyPayloadData, 0, null);
            byte[] portion = new byte[1500];
            Array.Copy(payloadData, 100, portion, 0, 1500);

            AssertByteArrayEquals(portion, verifyPayloadData);
            reader.Close();
        }