public virtual void TestSeek() { Directory directory = new RAMDirectory(); IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); for (int i = 0; i < 10; i++) { Document doc = new Document(); doc.Add(new Field(this.field, "a b", Field.Store.YES, Field.Index.ANALYZED)); writer.AddDocument(doc); } writer.Close(); IndexReader reader = IndexReader.Open(directory); TermPositions tp = reader.TermPositions(); tp.Seek(new Term(this.field, "b")); for (int i = 0; i < 10; i++) { tp.Next(); Assert.AreEqual(tp.Doc(), i); Assert.AreEqual(tp.NextPosition(), 1); } tp.Seek(new Term(this.field, "a")); for (int i = 0; i < 10; i++) { tp.Next(); Assert.AreEqual(tp.Doc(), i); Assert.AreEqual(tp.NextPosition(), 0); } }
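Several of the snippets below repeat the same seek-and-iterate pattern but leave the TermPositions enumerator open. A minimal sketch of that pattern with explicit cleanup, distilled from the tests and written against the method-style Lucene.NET 2.x API used above; WalkTerm and its arguments are illustrative, not part of any snippet here:

static void WalkTerm(IndexReader reader, string field, string text)
{
    TermPositions tp = reader.TermPositions();
    try
    {
        tp.Seek(new Term(field, text));   // position the enumerator on the term
        while (tp.Next())                 // advance document by document
        {
            int freq = tp.Freq();         // number of occurrences in this doc
            for (int i = 0; i < freq; i++)
            {
                // positions come back in increasing order within a document
                System.Console.Out.WriteLine("doc=" + tp.Doc() + " pos=" + tp.NextPosition());
            }
        }
    }
    finally
    {
        tp.Close();                       // release the underlying streams
    }
}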
public virtual void TestTokenReuse() { Analyzer analyzer = new AnonymousClassAnalyzer1(this); IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); Document doc = new Document(); doc.Add(new Field("f1", "a 5 a a", Field.Store.YES, Field.Index.ANALYZED)); writer.AddDocument(doc); writer.Flush(); SegmentInfo info = writer.NewestSegment(); writer.Close(); SegmentReader reader = SegmentReader.Get(info); TermPositions termPositions = reader.TermPositions(new Term("f1", "a")); Assert.IsTrue(termPositions.Next()); int freq = termPositions.Freq(); Assert.AreEqual(3, freq); Assert.AreEqual(0, termPositions.NextPosition()); Assert.AreEqual(true, termPositions.IsPayloadAvailable()); Assert.AreEqual(6, termPositions.NextPosition()); Assert.AreEqual(false, termPositions.IsPayloadAvailable()); Assert.AreEqual(7, termPositions.NextPosition()); Assert.AreEqual(false, termPositions.IsPayloadAvailable()); }
public virtual void TestPreAnalyzedField() { IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); Document doc = new Document(); doc.Add(new Field("preanalyzed", new AnonymousClassTokenStream(this), TermVector.NO)); writer.AddDocument(doc); writer.Flush(); SegmentInfo info = writer.NewestSegment(); writer.Close(); SegmentReader reader = SegmentReader.Get(info); TermPositions termPositions = reader.TermPositions(new Term("preanalyzed", "term1")); Assert.IsTrue(termPositions.Next()); Assert.AreEqual(1, termPositions.Freq()); Assert.AreEqual(0, termPositions.NextPosition()); termPositions.Seek(new Term("preanalyzed", "term2")); Assert.IsTrue(termPositions.Next()); Assert.AreEqual(2, termPositions.Freq()); Assert.AreEqual(1, termPositions.NextPosition()); Assert.AreEqual(3, termPositions.NextPosition()); termPositions.Seek(new Term("preanalyzed", "term3")); Assert.IsTrue(termPositions.Next()); Assert.AreEqual(1, termPositions.Freq()); Assert.AreEqual(2, termPositions.NextPosition()); }
public virtual void TestPositionIncrementGap() { Analyzer analyzer = new AnonymousClassAnalyzer(this); IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); Document doc = new Document(); doc.Add(new Field("repeated", "repeated one", Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field("repeated", "repeated two", Field.Store.YES, Field.Index.ANALYZED)); writer.AddDocument(doc); writer.Flush(); SegmentInfo info = writer.NewestSegment(); writer.Close(); SegmentReader reader = SegmentReader.Get(info); TermPositions termPositions = reader.TermPositions(new Term("repeated", "repeated")); Assert.IsTrue(termPositions.Next()); int freq = termPositions.Freq(); Assert.AreEqual(2, freq); Assert.AreEqual(0, termPositions.NextPosition()); Assert.AreEqual(502, termPositions.NextPosition()); }
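The 502 asserted above falls directly out of the analyzer's position increment gap: "repeated one" occupies positions 0 and 1, and the first token of the second field instance lands at lastPosition + gap + 1. The body of AnonymousClassAnalyzer is not shown; this is a hypothetical stand-in that would produce exactly that layout, assuming a gap of 500:

// Hypothetical stand-in for AnonymousClassAnalyzer: whitespace tokenization
// plus a position increment gap of 500 between instances of the same field.
// With "repeated one" at positions 0,1 the second instance starts at
// 1 + 500 + 1 = 502, matching the assertion above.
class GapAnalyzer : Analyzer
{
    private readonly Analyzer inner = new WhitespaceAnalyzer();

    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        return inner.TokenStream(fieldName, reader);
    }

    public override int GetPositionIncrementGap(System.String fieldName)
    {
        return 500; // applied between repeated instances of the same field
    }
}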
protected virtual void LoadPayload(Term term) { byte[] payloadBuf = null; TermPositions tp = _reader.TermPositions(); tp.Seek(term); while (tp.Next()) { if (tp.Freq > 0) { tp.NextPosition(); if (tp.IsPayloadAvailable) { int len = tp.PayloadLength; payloadBuf = tp.GetPayload(payloadBuf, 0); Add(tp.Doc, payloadBuf, len); } } } // save the last page while (_curSlot < MAX_SLOTS) { _curPage[_curSlot++] = MISSING; } _list[_curPageNo] = CopyPage(new int[_curData]); // optimize the page to make getNumItems work _curPage = null; }
public virtual void TestTerms() { TermEnum terms = reader.Terms(); Assert.IsTrue(terms != null); while (terms.Next() == true) { Term term = terms.Term(); Assert.IsTrue(term != null); //System.out.println("Term: " + term); System.String fieldValue = (System.String)DocHelper.nameValues[term.Field()]; Assert.IsTrue(fieldValue.IndexOf(term.Text()) != -1); } TermDocs termDocs = reader.TermDocs(); Assert.IsTrue(termDocs != null); termDocs.Seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field")); Assert.IsTrue(termDocs.Next() == true); termDocs.Seek(new Term(DocHelper.NO_NORMS_KEY, DocHelper.NO_NORMS_TEXT)); Assert.IsTrue(termDocs.Next() == true); TermPositions positions = reader.TermPositions(); positions.Seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field")); Assert.IsTrue(positions != null); Assert.IsTrue(positions.Doc() == 0); Assert.IsTrue(positions.NextPosition() >= 0); }
public override void Load() { TermPositions tp = null; byte[] payloadBuffer = new byte[4]; // four bytes for an int try { tp = _reader.TermPositions(_sizeTerm); if (tp == null) { return; } while (tp.Next()) { if (tp.Freq > 0) { tp.NextPosition(); tp.GetPayload(payloadBuffer, 0); int len = BytesToInt(payloadBuffer); Allocate(tp.Doc, Math.Min(len, _maxItems), true); } } } finally { if (tp != null) { tp.Dispose(); } } }
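Load above decodes a 4-byte payload into an int via BytesToInt, which is not shown in this snippet. A plausible encode/decode pair under a big-endian assumption; the byte order is a guess, since only consistency with whatever wrote the payloads actually matters:

// Hypothetical helpers matching the 4-byte payload read in Load above.
// Big-endian layout is an assumption; writer and reader just have to agree.
private static int BytesToInt(byte[] buf)
{
    return ((buf[0] & 0xFF) << 24)
         | ((buf[1] & 0xFF) << 16)
         | ((buf[2] & 0xFF) << 8)
         |  (buf[3] & 0xFF);
}

private static void IntToBytes(int value, byte[] buf)
{
    buf[0] = (byte)(value >> 24);
    buf[1] = (byte)(value >> 16);
    buf[2] = (byte)(value >> 8);
    buf[3] = (byte)value;
}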
public virtual void TestTerms() { try { TermEnum terms = reader.Terms(); Assert.IsTrue(terms != null); while (terms.Next() == true) { Term term = terms.Term(); Assert.IsTrue(term != null); //System.out.println("Term: " + term); System.String fieldValue = (System.String)DocHelper.nameValues[term.Field()]; Assert.IsTrue(fieldValue.IndexOf(term.Text()) != -1); } TermDocs termDocs = reader.TermDocs(); Assert.IsTrue(termDocs != null); termDocs.Seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "Field")); Assert.IsTrue(termDocs.Next() == true); TermPositions positions = reader.TermPositions(); positions.Seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "Field")); Assert.IsTrue(positions != null); Assert.IsTrue(positions.Doc() == 0); Assert.IsTrue(positions.NextPosition() >= 0); } catch (System.IO.IOException e) { System.Console.Error.WriteLine(e.StackTrace); Assert.Fail("Unexpected IOException: " + e.Message); } }
public virtual void TestTerms() { TermEnum terms = _reader.Terms(); Assert.IsTrue(terms != null); while (terms.Next() == true) { Term term = terms.Term; Assert.IsTrue(term != null); //System.out.println("Term: " + term); System.String fieldValue = (System.String)DocHelper.NameValues[term.Field]; Assert.IsTrue(fieldValue.IndexOf(term.Text) != -1); } TermDocs termDocs = _reader.TermDocs(); Assert.IsTrue(termDocs != null); termDocs.Seek(new Term(DocHelper.TextField1Key, "field")); Assert.IsTrue(termDocs.Next() == true); termDocs.Seek(new Term(DocHelper.NoNormsKey, DocHelper.NoNormsText)); Assert.IsTrue(termDocs.Next() == true); TermPositions positions = _reader.TermPositions(); positions.Seek(new Term(DocHelper.TextField1Key, "field")); Assert.IsTrue(positions != null); Assert.IsTrue(positions.Doc == 0); Assert.IsTrue(positions.NextPosition() >= 0); }
/// <summary>Process postings from multiple segments all positioned on the /// same term. Writes out merged entries into freqOutput and /// the proxOutput streams. /// /// </summary> /// <param name="smis">array of segments /// </param> /// <param name="n">number of cells in the array actually occupied /// </param> /// <returns> number of documents across all segments where this term was found /// </returns> private int AppendPostings(SegmentMergeInfo[] smis, int n) { int lastDoc = 0; int df = 0; // number of docs w/ term ResetSkip(); for (int i = 0; i < n; i++) { SegmentMergeInfo smi = smis[i]; TermPositions postings = smi.postings; int base_Renamed = smi.base_Renamed; int[] docMap = smi.docMap; postings.Seek(smi.termEnum); while (postings.Next()) { int doc = postings.Doc(); if (docMap != null) { doc = docMap[doc]; // map around deletions } doc += base_Renamed; // convert to merged space if (doc < lastDoc) { throw new System.SystemException("docs out of order"); } df++; if ((df % skipInterval) == 0) { BufferSkip(lastDoc); } int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1 lastDoc = doc; int freq = postings.Freq(); if (freq == 1) { freqOutput.WriteVInt(docCode | 1); // write doc & freq=1 } else { freqOutput.WriteVInt(docCode); // write doc freqOutput.WriteVInt(freq); // write frequency in doc } int lastPosition = 0; // write position deltas for (int j = 0; j < freq; j++) { int position = postings.NextPosition(); proxOutput.WriteVInt(position - lastPosition); lastPosition = position; } } } return(df); }
public override bool Next() { if (count == freq) { if (!internalPositions.Next()) { internalDoc = int.MaxValue; return(false); } internalDoc = internalPositions.Doc; freq = internalPositions.Freq; count = 0; } position = internalPositions.NextPosition(); count++; return(true); }
private void PrintSegment(System.IO.StringWriter out_Renamed, System.String segment) { Directory directory = FSDirectory.GetDirectory(indexDir, false); SegmentReader reader = new SegmentReader(new SegmentInfo(segment, 1, directory)); for (int i = 0; i < reader.NumDocs(); i++) { out_Renamed.WriteLine(reader.Document(i)); } TermEnum tis = reader.Terms(); while (tis.Next()) { out_Renamed.Write(tis.Term()); out_Renamed.WriteLine(" DF=" + tis.DocFreq()); TermPositions positions = reader.TermPositions(tis.Term()); try { while (positions.Next()) { out_Renamed.Write(" doc=" + positions.Doc()); out_Renamed.Write(" TF=" + positions.Freq()); out_Renamed.Write(" pos="); out_Renamed.Write(positions.NextPosition()); for (int j = 1; j < positions.Freq(); j++) { out_Renamed.Write("," + positions.NextPosition()); } out_Renamed.WriteLine(""); } } finally { positions.Close(); } } tis.Close(); reader.Close(); directory.Close(); }
internal static void PrintSegment(System.String segment) { Directory directory = FSDirectory.GetDirectory("test", false); SegmentReader reader = new SegmentReader(new SegmentInfo(segment, 1, directory)); for (int i = 0; i < reader.NumDocs(); i++) { System.Console.Out.WriteLine(reader.Document(i)); } TermEnum tis = reader.Terms(); while (tis.Next()) { System.Console.Out.Write(tis.Term()); System.Console.Out.WriteLine(" DF=" + tis.DocFreq()); TermPositions positions = reader.TermPositions(tis.Term()); try { while (positions.Next()) { System.Console.Out.Write(" doc=" + positions.Doc()); System.Console.Out.Write(" TF=" + positions.Freq()); System.Console.Out.Write(" pos="); System.Console.Out.Write(positions.NextPosition()); for (int j = 1; j < positions.Freq(); j++) { System.Console.Out.Write("," + positions.NextPosition()); } System.Console.Out.WriteLine(""); } } finally { positions.Close(); } } tis.Close(); reader.Close(); directory.Close(); }
public virtual void CheckSkipTo(TermPositions tp, int target, int maxCounter) { tp.SkipTo(target); Assert.Greater(maxCounter, counter, "Too many bytes read: " + counter); Assert.AreEqual(target, tp.Doc, "Wrong document " + tp.Doc + " after skipTo target " + target); Assert.AreEqual(1, tp.Freq, "Frequency is not 1: " + tp.Freq); tp.NextPosition(); byte[] b = new byte[1]; tp.GetPayload(b, 0); Assert.AreEqual((byte)target, b[0], "Wrong payload for the target " + target + ": " + b[0]); }
private void PrintSegment(System.IO.StreamWriter out_Renamed, SegmentInfo si) { SegmentReader reader = SegmentReader.Get(true, si, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR, null); for (int i = 0; i < reader.NumDocs(); i++) { out_Renamed.WriteLine(reader.Document(i, null)); } TermEnum tis = reader.Terms(null); while (tis.Next(null)) { out_Renamed.Write(tis.Term); out_Renamed.WriteLine(" DF=" + tis.DocFreq()); TermPositions positions = reader.TermPositions(tis.Term, null); try { while (positions.Next(null)) { out_Renamed.Write(" doc=" + positions.Doc); out_Renamed.Write(" TF=" + positions.Freq); out_Renamed.Write(" pos="); out_Renamed.Write(positions.NextPosition(null)); for (int j = 1; j < positions.Freq; j++) { out_Renamed.Write("," + positions.NextPosition(null)); } out_Renamed.WriteLine(""); } } finally { positions.Close(); } } tis.Close(); reader.Close(); }
private void PrintSegment(System.IO.StreamWriter out_Renamed, SegmentInfo si) { SegmentReader reader = SegmentReader.Get(si); for (int i = 0; i < reader.NumDocs(); i++) { out_Renamed.WriteLine(reader.Document(i)); } TermEnum tis = reader.Terms(); while (tis.Next()) { out_Renamed.Write(tis.Term()); out_Renamed.WriteLine(" DF=" + tis.DocFreq()); TermPositions positions = reader.TermPositions(tis.Term()); try { while (positions.Next()) { out_Renamed.Write(" doc=" + positions.Doc()); out_Renamed.Write(" TF=" + positions.Freq()); out_Renamed.Write(" pos="); out_Renamed.Write(positions.NextPosition()); for (int j = 1; j < positions.Freq(); j++) { out_Renamed.Write("," + positions.NextPosition()); } out_Renamed.WriteLine(""); } } finally { positions.Close(); } } tis.Close(); reader.Close(); }
/// <summary> Go to next location of this term current document, and set /// <c>position</c> as <c>location - offset</c>, so that a /// matching exact phrase is easily identified when all PhrasePositions /// have exactly the same <c>position</c>. /// </summary> internal bool NextPosition() { if (count-- > 0) { // read subsequent pos's position = tp.NextPosition() - offset; return(true); } else { return(false); } }
/// <summary>Process postings from multiple segments all positioned on the /// same term. Writes out merged entries into freqOutput and /// the proxOutput streams. /// /// </summary> /// <param name="smis">array of segments /// </param> /// <param name="n">number of cells in the array actually occupied /// </param> /// <returns> number of documents across all segments where this term was found /// </returns> /// <throws> CorruptIndexException if the index is corrupt </throws> /// <throws> IOException if there is a low-level IO error </throws> private int AppendPostings(FormatPostingsTermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n) { FormatPostingsDocsConsumer docConsumer = termsConsumer.AddTerm(smis[0].term.Text); int df = 0; for (int i = 0; i < n; i++) { SegmentMergeInfo smi = smis[i]; TermPositions postings = smi.GetPositions(); System.Diagnostics.Debug.Assert(postings != null); int base_Renamed = smi.base_Renamed; int[] docMap = smi.GetDocMap(); postings.Seek(smi.termEnum); while (postings.Next()) { df++; int doc = postings.Doc; if (docMap != null) { doc = docMap[doc]; // map around deletions } doc += base_Renamed; // convert to merged space int freq = postings.Freq; FormatPostingsPositionsConsumer posConsumer = docConsumer.AddDoc(doc, freq); if (!omitTermFreqAndPositions) { for (int j = 0; j < freq; j++) { int position = postings.NextPosition(); int payloadLength = postings.PayloadLength; if (payloadLength > 0) { if (payloadBuffer == null || payloadBuffer.Length < payloadLength) { payloadBuffer = new byte[payloadLength]; } postings.GetPayload(payloadBuffer, 0); } posConsumer.AddPosition(position, payloadBuffer, 0, payloadLength); } posConsumer.Finish(); } } } docConsumer.Finish(); return(df); }
public virtual void TestThreadSafety() { rnd = NewRandom(); int numThreads = 5; int numDocs = 50; ByteArrayPool pool = new ByteArrayPool(numThreads, 5); Directory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED, null); System.String field = "test"; ThreadClass[] ingesters = new ThreadClass[numThreads]; for (int i = 0; i < numThreads; i++) { ingesters[i] = new AnonymousClassThread(numDocs, field, pool, writer, this); ingesters[i].Start(); } for (int i = 0; i < numThreads; i++) { ingesters[i].Join(); } writer.Close(); IndexReader reader = IndexReader.Open(dir, true, null); TermEnum terms = reader.Terms(null); while (terms.Next(null)) { TermPositions tp = reader.TermPositions(terms.Term, null); while (tp.Next(null)) { int freq = tp.Freq; for (int i = 0; i < freq; i++) { tp.NextPosition(null); Assert.AreEqual(pool.BytesToString(tp.GetPayload(new byte[5], 0, null)), terms.Term.Text); } } tp.Close(); } terms.Close(); reader.Close(); Assert.AreEqual(pool.Size(), numThreads); }
/// <summary> Test the term index.</summary> private Status.TermIndexStatus TestTermIndex(SegmentInfo info, SegmentReader reader) { Status.TermIndexStatus status = new Status.TermIndexStatus(); try { if (infoStream != null) { infoStream.Write(" test: terms, freq, prox..."); } TermEnum termEnum = reader.Terms(); TermPositions termPositions = reader.TermPositions(); // Used only to count up # deleted docs for this term MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader); int maxDoc = reader.MaxDoc(); while (termEnum.Next()) { status.termCount++; Term term = termEnum.Term(); int docFreq = termEnum.DocFreq(); termPositions.Seek(term); int lastDoc = -1; int freq0 = 0; status.totFreq += docFreq; while (termPositions.Next()) { freq0++; int doc = termPositions.Doc(); int freq = termPositions.Freq(); if (doc <= lastDoc) { throw new System.SystemException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc); } if (doc >= maxDoc) { throw new System.SystemException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc); } lastDoc = doc; if (freq <= 0) { throw new System.SystemException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds"); } int lastPos = -1; status.totPos += freq; for (int j = 0; j < freq; j++) { int pos = termPositions.NextPosition(); if (pos < -1) { throw new System.SystemException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds"); } if (pos < lastPos) { throw new System.SystemException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos); } } } // Now count how many deleted docs occurred in // this term: int delCount; if (reader.HasDeletions()) { myTermDocs.Seek(term); while (myTermDocs.Next()) { } delCount = myTermDocs.delCount; } else { delCount = 0; } if (freq0 + delCount != docFreq) { throw new System.SystemException("term " + term + " docFreq=" + docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount); } } Msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]"); } catch (System.Exception e) { Msg("ERROR [" + System.Convert.ToString(e.Message) + "]"); status.error = e; if (infoStream != null) { infoStream.WriteLine(e.StackTrace); } } return(status); }
// builds an index with payloads in the given Directory and performs // different tests to verify the payload encoding private void PerformTest(Directory dir) { PayloadAnalyzer analyzer = new PayloadAnalyzer(); IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED, null); // should be in sync with value in TermInfosWriter int skipInterval = 16; int numTerms = 5; System.String fieldName = "f1"; int numDocs = skipInterval + 1; // create content for the test documents with just a few terms Term[] terms = GenerateTerms(fieldName, numTerms); System.Text.StringBuilder sb = new System.Text.StringBuilder(); for (int i = 0; i < terms.Length; i++) { sb.Append(terms[i].Text); sb.Append(" "); } System.String content = sb.ToString(); int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2; byte[] payloadData = GenerateRandomData(payloadDataLength); Document d = new Document(); d.Add(new Field(fieldName, content, Field.Store.NO, Field.Index.ANALYZED)); // add the same document multiple times to have the same payload lengths for all // occurrences within two consecutive skip intervals int offset = 0; for (int i = 0; i < 2 * numDocs; i++) { analyzer.SetPayloadData(fieldName, payloadData, offset, 1); offset += numTerms; writer.AddDocument(d, null); } // make sure we create more than one segment to test merging writer.Commit(null); // now make sure the payload lengths differ at the next skip point for (int i = 0; i < numDocs; i++) { analyzer.SetPayloadData(fieldName, payloadData, offset, i); offset += i * numTerms; writer.AddDocument(d, null); } writer.Optimize(null); // flush writer.Close(); /* * Verify the index * first we test if all payloads are stored correctly */ IndexReader reader = IndexReader.Open(dir, true, null); byte[] verifyPayloadData = new byte[payloadDataLength]; offset = 0; TermPositions[] tps = new TermPositions[numTerms]; for (int i = 0; i < numTerms; i++) { tps[i] = reader.TermPositions(terms[i], null); } while (tps[0].Next(null)) { for (int i = 1; i < numTerms; i++) { tps[i].Next(null); } int freq = tps[0].Freq; for (int i = 0; i < freq; i++) { for (int j = 0; j < numTerms; j++) { tps[j].NextPosition(null); tps[j].GetPayload(verifyPayloadData, offset, null); offset += tps[j].PayloadLength; } } } for (int i = 0; i < numTerms; i++) { tps[i].Close(); } AssertByteArrayEquals(payloadData, verifyPayloadData); /* * test lazy skipping */ TermPositions tp = reader.TermPositions(terms[0], null); tp.Next(null); tp.NextPosition(null); // now we don't read this payload tp.NextPosition(null); Assert.AreEqual(1, tp.PayloadLength, "Wrong payload length."); byte[] payload = tp.GetPayload(null, 0, null); Assert.AreEqual(payload[0], payloadData[numTerms]); tp.NextPosition(null); // we don't read this payload and skip to a different document tp.SkipTo(5, null); tp.NextPosition(null); Assert.AreEqual(1, tp.PayloadLength, "Wrong payload length."); payload = tp.GetPayload(null, 0, null); Assert.AreEqual(payload[0], payloadData[5 * numTerms]); /* * Test different lengths at skip points */ tp.Seek(terms[1], null); tp.Next(null); tp.NextPosition(null); Assert.AreEqual(1, tp.PayloadLength, "Wrong payload length."); tp.SkipTo(skipInterval - 1, null); tp.NextPosition(null); Assert.AreEqual(1, tp.PayloadLength, "Wrong payload length."); tp.SkipTo(2 * skipInterval - 1, null); tp.NextPosition(null); Assert.AreEqual(1, tp.PayloadLength, "Wrong payload length."); tp.SkipTo(3 * skipInterval - 1, null); tp.NextPosition(null); 
Assert.AreEqual(3 * skipInterval - 2 * numDocs - 1, tp.PayloadLength, "Wrong payload length."); /* * Test multiple call of getPayload() */ tp.GetPayload(null, 0, null); // it is forbidden to call getPayload() more than once // without calling nextPosition() Assert.Throws <IOException>(() => tp.GetPayload(null, 0, null), "Expected exception not thrown"); reader.Close(); // test long payload analyzer = new PayloadAnalyzer(); writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED, null); System.String singleTerm = "lucene"; d = new Document(); d.Add(new Field(fieldName, singleTerm, Field.Store.NO, Field.Index.ANALYZED)); // add a payload whose length is greater than the buffer size of BufferedIndexOutput payloadData = GenerateRandomData(2000); analyzer.SetPayloadData(fieldName, payloadData, 100, 1500); writer.AddDocument(d, null); writer.Optimize(null); // flush writer.Close(); reader = IndexReader.Open(dir, true, null); tp = reader.TermPositions(new Term(fieldName, singleTerm), null); tp.Next(null); tp.NextPosition(null); verifyPayloadData = new byte[tp.PayloadLength]; tp.GetPayload(verifyPayloadData, 0, null); byte[] portion = new byte[1500]; Array.Copy(payloadData, 100, portion, 0, 1500); AssertByteArrayEquals(portion, verifyPayloadData); reader.Close(); }
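The payloadDataLength expression in PerformTest is easy to verify by hand: the first loop adds 2 * numDocs documents with 1-byte payloads on each of numTerms positions, and the second adds numDocs documents where document i carries i-byte payloads on each of numTerms positions. A quick check with the test's own values:

// Sanity check of payloadDataLength with the values used in PerformTest.
static int ExpectedPayloadDataLength()
{
    int skipInterval = 16;                                    // kept in sync with TermInfosWriter
    int numTerms = 5;
    int numDocs = skipInterval + 1;                           // 17
    int firstPass = numTerms * numDocs * 2;                   // 2*numDocs docs, 1 byte per term: 170
    int secondPass = numTerms * numDocs * (numDocs - 1) / 2;  // doc i carries i-byte payloads: 680
    return firstPass + secondPass;                            // 850
}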
public virtual void CheckSkipTo(TermPositions tp, int target, int maxCounter) { tp.SkipTo(target); if (maxCounter < counter) { Assert.Fail("Too many bytes read: " + counter); } Assert.AreEqual(target, tp.Doc(), "Wrong document " + tp.Doc() + " after skipTo target " + target); Assert.AreEqual(1, tp.Freq(), "Frequency is not 1: " + tp.Freq()); tp.NextPosition(); byte[] b = new byte[1]; tp.GetPayload(b, 0); Assert.AreEqual((byte) target, b[0], "Wrong payload for the target " + target + ": " + b[0]); }
void BeginAsyncReconstruction(int docNum, Document document, Hashtable doc) { // get stored fields ArrayList sf = new ArrayList(); for (int i = 0; i < _indexFields.Length; i++) { Field[] f = document.GetFields(_indexFields[i]); if (f == null || f.Length == 0 || !f[0].IsStored()) { continue; } StringBuilder sb = new StringBuilder(); for (int k = 0; k < f.Length; k++) { if (k > 0) { sb.Append('\n'); } sb.Append(f[k].StringValue()); } Field field = Legacy.CreateField(_indexFields[i], sb.ToString(), f[0].IsStored(), f[0].IsIndexed(), f[0].IsTokenized(), f[0].IsTermVectorStored()); field.SetBoost(f[0].GetBoost()); doc[_indexFields[i]] = field; sf.Add(_indexFields[i]); } String term = null; GrowableStringArray terms = null; try { int i = 0; int delta = (int)Math.Ceiling(((double)_numTerms / 100)); TermEnum te = _luke.IndexReader.Terms(); TermPositions tp = _luke.IndexReader.TermPositions(); while (te.Next()) { if ((i++ % delta) == 0) { // update UI - async UpdateProgress(i / delta); } // skip stored fields if (sf.Contains(te.Term().Field())) { continue; } tp.Seek(te.Term()); if (!tp.SkipTo(docNum) || tp.Doc() != docNum) { // this term is not found in the doc continue; } term = te.Term().Text(); terms = (GrowableStringArray)doc[te.Term().Field()]; if (terms == null) { terms = new GrowableStringArray(); doc[te.Term().Field()] = terms; } for (int k = 0; k < tp.Freq(); k++) { int pos = tp.NextPosition(); terms.Set(pos, term); } } } catch (Exception exc) { // Update UI - async _luke.ShowStatus(exc.Message); } }
/// <summary>Process postings from multiple segments all positioned on the /// same term. Writes out merged entries into freqOutput and /// the proxOutput streams. /// /// </summary> /// <param name="smis">array of segments /// </param> /// <param name="n">number of cells in the array actually occupied /// </param> /// <returns> number of documents across all segments where this term was found /// </returns> /// <throws> CorruptIndexException if the index is corrupt </throws> /// <throws> IOException if there is a low-level IO error </throws> private int AppendPostings(SegmentMergeInfo[] smis, int n) { int lastDoc = 0; int df = 0; // number of docs w/ term skipListWriter.ResetSkip(); bool storePayloads = fieldInfos.FieldInfo(smis[0].term.field).storePayloads; int lastPayloadLength = -1; // ensures that we write the first length for (int i = 0; i < n; i++) { SegmentMergeInfo smi = smis[i]; TermPositions postings = smi.GetPositions(); System.Diagnostics.Debug.Assert(postings != null); int base_Renamed = smi.base_Renamed; int[] docMap = smi.GetDocMap(); postings.Seek(smi.termEnum); while (postings.Next()) { int doc = postings.Doc(); if (docMap != null) { doc = docMap[doc]; // map around deletions } doc += base_Renamed; // convert to merged space if (doc < 0 || (df > 0 && doc <= lastDoc)) { throw new CorruptIndexException("docs out of order (" + doc + " <= " + lastDoc + " )"); } df++; if ((df % skipInterval) == 0) { skipListWriter.SetSkipData(lastDoc, storePayloads, lastPayloadLength); skipListWriter.BufferSkip(df); } int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1 lastDoc = doc; int freq = postings.Freq(); if (freq == 1) { freqOutput.WriteVInt(docCode | 1); // write doc & freq=1 } else { freqOutput.WriteVInt(docCode); // write doc freqOutput.WriteVInt(freq); // write frequency in doc } /** See {@link DocumentWriter#writePostings(Posting[], String) for * documentation about the encoding of positions and payloads */ int lastPosition = 0; // write position deltas for (int j = 0; j < freq; j++) { int position = postings.NextPosition(); int delta = position - lastPosition; if (storePayloads) { int payloadLength = postings.GetPayloadLength(); if (payloadLength == lastPayloadLength) { proxOutput.WriteVInt(delta * 2); } else { proxOutput.WriteVInt(delta * 2 + 1); proxOutput.WriteVInt(payloadLength); lastPayloadLength = payloadLength; } if (payloadLength > 0) { if (payloadBuffer == null || payloadBuffer.Length < payloadLength) { payloadBuffer = new byte[payloadLength]; } postings.GetPayload(payloadBuffer, 0); proxOutput.WriteBytes(payloadBuffer, 0, payloadLength); } } else { proxOutput.WriteVInt(delta); } lastPosition = position; } } } return(df); }
/// <summary>Returns true if index is clean, else false.</summary> public static bool Check(Directory dir, bool doFix) { System.Globalization.NumberFormatInfo nf = System.Globalization.CultureInfo.CurrentCulture.NumberFormat; SegmentInfos sis = new SegmentInfos(); try { sis.Read(dir); } catch (System.Exception t) { out_Renamed.WriteLine("ERROR: could not read any segments file in directory"); out_Renamed.Write(t.StackTrace); out_Renamed.Flush(); return(false); } int numSegments = sis.Count; System.String segmentsFileName = sis.GetCurrentSegmentFileName(); IndexInput input = null; try { input = dir.OpenInput(segmentsFileName); } catch (System.Exception t) { out_Renamed.WriteLine("ERROR: could not open segments file in directory"); out_Renamed.Write(t.StackTrace); out_Renamed.Flush(); return(false); } int format = 0; try { format = input.ReadInt(); } catch (System.Exception t) { out_Renamed.WriteLine("ERROR: could not read segment file version in directory"); out_Renamed.Write(t.StackTrace); out_Renamed.Flush(); return(false); } finally { if (input != null) { input.Close(); } } System.String sFormat = ""; bool skip = false; if (format == SegmentInfos.FORMAT) { sFormat = "FORMAT [Lucene Pre-2.1]"; } if (format == SegmentInfos.FORMAT_LOCKLESS) { sFormat = "FORMAT_LOCKLESS [Lucene 2.1]"; } else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE) { sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]"; } else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE) { sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]"; } else if (format < SegmentInfos.FORMAT_SHARED_DOC_STORE) { sFormat = "int=" + format + " [newer version of Lucene than this tool]"; skip = true; } else { sFormat = format + " [Lucene 1.3 or prior]"; } out_Renamed.WriteLine("Segments file=" + segmentsFileName + " numSegments=" + numSegments + " version=" + sFormat); if (skip) { out_Renamed.WriteLine("\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting"); return(false); } SegmentInfos newSIS = (SegmentInfos)sis.Clone(); newSIS.Clear(); bool changed = false; int totLoseDocCount = 0; int numBadSegments = 0; for (int i = 0; i < numSegments; i++) { SegmentInfo info = sis.Info(i); out_Renamed.WriteLine(" " + (1 + i) + " of " + numSegments + ": name=" + info.name + " docCount=" + info.docCount); int toLoseDocCount = info.docCount; SegmentReader reader = null; try { out_Renamed.WriteLine(" compound=" + info.GetUseCompoundFile()); out_Renamed.WriteLine(" numFiles=" + info.Files().Count); out_Renamed.WriteLine(String.Format(nf, " size (MB)={0:f}", new Object[] { (info.SizeInBytes() / (1024.0 * 1024.0)) })); int docStoreOffset = info.GetDocStoreOffset(); if (docStoreOffset != -1) { out_Renamed.WriteLine(" docStoreOffset=" + docStoreOffset); out_Renamed.WriteLine(" docStoreSegment=" + info.GetDocStoreSegment()); out_Renamed.WriteLine(" docStoreIsCompoundFile=" + info.GetDocStoreIsCompoundFile()); } System.String delFileName = info.GetDelFileName(); if (delFileName == null) { out_Renamed.WriteLine(" no deletions"); } else { out_Renamed.WriteLine(" has deletions [delFileName=" + delFileName + "]"); } out_Renamed.Write(" test: open reader........."); reader = SegmentReader.Get(info); int numDocs = reader.NumDocs(); toLoseDocCount = numDocs; if (reader.HasDeletions()) { out_Renamed.WriteLine("OK [" + (info.docCount - numDocs) + " deleted docs]"); } else { out_Renamed.WriteLine("OK"); } out_Renamed.Write(" test: fields, norms......."); 
System.Collections.IDictionary fieldNames = (System.Collections.IDictionary)reader.GetFieldNames(IndexReader.FieldOption.ALL); System.Collections.IEnumerator it = fieldNames.Keys.GetEnumerator(); while (it.MoveNext()) { System.String fieldName = (System.String)it.Current; byte[] b = reader.Norms(fieldName); if (b.Length != info.docCount) { throw new System.SystemException("norms for field \"" + fieldName + "\" is length " + b.Length + " != maxDoc " + info.docCount); } } out_Renamed.WriteLine("OK [" + fieldNames.Count + " fields]"); out_Renamed.Write(" test: terms, freq, prox..."); TermEnum termEnum = reader.Terms(); TermPositions termPositions = reader.TermPositions(); // Used only to count up # deleted docs for this term MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader); long termCount = 0; long totFreq = 0; long totPos = 0; while (termEnum.Next()) { termCount++; Term term = termEnum.Term(); int docFreq = termEnum.DocFreq(); termPositions.Seek(term); int lastDoc = -1; int freq0 = 0; totFreq += docFreq; while (termPositions.Next()) { freq0++; int doc = termPositions.Doc(); int freq = termPositions.Freq(); if (doc <= lastDoc) { throw new System.SystemException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc); } lastDoc = doc; if (freq <= 0) { throw new System.SystemException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds"); } int lastPos = -1; totPos += freq; for (int j = 0; j < freq; j++) { int pos = termPositions.NextPosition(); if (pos < 0) { throw new System.SystemException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds"); } if (pos <= lastPos) { throw new System.SystemException("term " + term + ": doc " + doc + ": pos " + pos + " <= lastPos " + lastPos); } } } // Now count how many deleted docs occurred in this term: int delCount; if (reader.HasDeletions()) { myTermDocs.Seek(term); while (myTermDocs.Next()) { } delCount = myTermDocs.delCount; } else { delCount = 0; } if (freq0 + delCount != docFreq) { throw new System.SystemException("term " + term + " docFreq=" + docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount); } } out_Renamed.WriteLine("OK [" + termCount + " terms; " + totFreq + " terms/docs pairs; " + totPos + " tokens]"); out_Renamed.Write(" test: stored fields......."); int docCount = 0; long totFields = 0; for (int j = 0; j < info.docCount; j++) { if (!reader.IsDeleted(j)) { docCount++; Document doc = reader.Document(j); totFields += doc.GetFields().Count; } } if (docCount != reader.NumDocs()) { throw new System.SystemException("reader.NumDocs()=" + reader.NumDocs() + " but saw " + docCount + " undeleted docs"); } out_Renamed.WriteLine(String.Format(nf, "OK [{0:d} total field count; avg {1:f} fields per doc]", new Object[] { totFields, (((float)totFields) / docCount) })); out_Renamed.Write(" test: term vectors........"); int totVectors = 0; for (int j = 0; j < info.docCount; j++) { if (!reader.IsDeleted(j)) { TermFreqVector[] tfv = reader.GetTermFreqVectors(j); if (tfv != null) { totVectors += tfv.Length; } } } out_Renamed.WriteLine(String.Format(nf, "OK [{0:d} total vector count; avg {1:f} term/freq vector fields per doc]", new Object[] { totVectors, (((float)totVectors) / docCount) })); out_Renamed.WriteLine(""); } catch (System.Exception t) { out_Renamed.WriteLine("FAILED"); System.String comment; if (doFix) { comment = "will remove reference to this segment (-fix is specified)"; } else { comment = "would remove reference to this segment (-fix was not specified)"; } out_Renamed.WriteLine(" 
WARNING: " + comment + "; full exception:"); out_Renamed.Write(t.StackTrace); out_Renamed.Flush(); out_Renamed.WriteLine(""); totLoseDocCount += toLoseDocCount; numBadSegments++; changed = true; continue; } finally { if (reader != null) { reader.Close(); } } // Keeper newSIS.Add(info.Clone()); } if (!changed) { out_Renamed.WriteLine("No problems were detected with this index.\n"); return(true); } else { out_Renamed.WriteLine("WARNING: " + numBadSegments + " broken segments detected"); if (doFix) { out_Renamed.WriteLine("WARNING: " + totLoseDocCount + " documents will be lost"); } else { out_Renamed.WriteLine("WARNING: " + totLoseDocCount + " documents would be lost if -fix were specified"); } out_Renamed.WriteLine(); } if (doFix) { out_Renamed.WriteLine("NOTE: will write new segments file in 5 seconds; this will remove " + totLoseDocCount + " docs from the index. THIS IS YOUR LAST CHANCE TO CTRL+C!"); for (int i = 0; i < 5; i++) { try { System.Threading.Thread.Sleep(new System.TimeSpan((System.Int64) 10000 * 1000)); } catch (System.Threading.ThreadInterruptedException) { SupportClass.ThreadClass.Current().Interrupt(); i--; continue; } out_Renamed.WriteLine(" " + (5 - i) + "..."); } out_Renamed.Write("Writing..."); try { newSIS.Write(dir); } catch (System.Exception t) { out_Renamed.WriteLine("FAILED; exiting"); out_Renamed.Write(t.StackTrace); out_Renamed.Flush(); return(false); } out_Renamed.WriteLine("OK"); out_Renamed.WriteLine("Wrote new segments file \"" + newSIS.GetCurrentSegmentFileName() + "\""); } else { out_Renamed.WriteLine("NOTE: would write new segments file [-fix was not specified]"); } out_Renamed.WriteLine(""); return(false); }