Example #1
0
        /// <summary>
        /// Copies the term vectors of every non-deleted document, reader by reader,
        /// into a newly created <c>TermVectorsWriter</c> for <c>segment</c>.
        /// </summary>
        /// <exception cref="System.IO.IOException">May propagate from the readers or the writer.</exception>
        private void MergeVectors()
        {
            TermVectorsWriter vectorsOut = new TermVectorsWriter(directory, segment, fieldInfos);

            try
            {
                for (int readerIndex = 0; readerIndex < readers.Count; readerIndex++)
                {
                    IndexReader source = (IndexReader)readers[readerIndex];
                    int docLimit = source.MaxDoc();
                    for (int doc = 0; doc < docLimit; doc++)
                    {
                        // Deleted documents are left out of the merged output.
                        if (!source.IsDeleted(doc))
                        {
                            vectorsOut.AddAllDocVectors(source.GetTermFreqVectors(doc));

                            // checkAbort may be absent; report work only when it is set.
                            if (checkAbort != null)
                            {
                                checkAbort.Work(300);
                            }
                        }
                    }
                }
            }
            finally
            {
                // Close the writer on both the normal and the exceptional path.
                vectorsOut.Close();
            }
        }
Example #2
0
        /// <summary>
        /// Copies the term vectors of every document in <paramref name="reader"/> to
        /// <paramref name="termVectorsWriter"/> without consulting deletions.
        /// </summary>
        /// <param name="termVectorsWriter">Destination writer.</param>
        /// <param name="matchingVectorsReader">
        /// When non-null, documents are copied in raw batches of at most
        /// <c>MAX_RAW_MERGE_DOCS</c> through this reader; when null, each document's
        /// vectors are fetched from <paramref name="reader"/> and re-added one by one.
        /// </param>
        /// <param name="reader">Source reader; supplies the document count.</param>
        private void CopyVectorsNoDeletions(TermVectorsWriter termVectorsWriter, TermVectorsReader matchingVectorsReader, IndexReader reader)
        {
            int total = reader.MaxDoc;

            if (matchingVectorsReader == null)
            {
                // Per-document path.
                for (int doc = 0; doc < total; doc++)
                {
                    // NOTE: the vectors must be assigned to a local before being handed
                    // to termVectorsWriter.AddAllDocVectors; see LUCENE-1282
                    ITermFreqVector[] docVectors = reader.GetTermFreqVectors(doc);
                    termVectorsWriter.AddAllDocVectors(docVectors);
                    checkAbort.Work(300);
                }
                return;
            }

            // Bulk path: allowed because the fieldInfos are "congruent".
            for (int copied = 0; copied < total;)
            {
                int batch = System.Math.Min(MAX_RAW_MERGE_DOCS, total - copied);
                matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, copied, batch);
                termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, batch);
                copied += batch;
                checkAbort.Work(300 * batch);
            }
        }
Example #3
0
        /// <summary>
        /// Fixture setup: registers every test field in <c>fieldInfos</c>, sorts the test
        /// terms, then five times over writes a single document containing every field
        /// and every term through a fresh <c>TermVectorsWriter</c>.
        /// An <c>IOException</c> is printed to stderr and fails the test.
        /// </summary>
        protected virtual void SetUp()
        {
            foreach (System.String registered in testFields)
            {
                fieldInfos.Add(registered, true, true);
            }

            try
            {
                System.Array.Sort(testTerms);
                for (int pass = 0; pass < 5; pass++)
                {
                    writer = new TermVectorsWriter(dir, seg, fieldInfos);
                    writer.OpenDocument();

                    foreach (System.String fieldName in testFields)
                    {
                        writer.OpenField(fieldName);
                        // Each term is added with its index as the second argument.
                        for (int termIndex = 0; termIndex < testTerms.Length; termIndex++)
                        {
                            writer.AddTerm(testTerms[termIndex], termIndex);
                        }
                        writer.CloseField();
                    }

                    writer.CloseDocument();
                    writer.Close();
                }
            }
            catch (System.IO.IOException ioe)
            {
                System.Console.Error.WriteLine(ioe.StackTrace);
                Assert.IsTrue(false);
            }
        }
Example #4
0
		/// <summary>
		/// Fixture setup: adds each test field to <c>fieldInfos</c>, sorts the test terms,
		/// and then performs five write passes. Every pass creates a new
		/// <c>TermVectorsWriter</c>, writes one document holding all fields and terms,
		/// and closes the writer. An <c>IOException</c> is dumped to stderr and fails the test.
		/// </summary>
		protected virtual void SetUp()
		{
			foreach (System.String registered in testFields)
			{
				fieldInfos.Add(registered, true, true);
			}
			
			try
			{
				System.Array.Sort(testTerms);
				int pass = 0;
				while (pass < 5)
				{
					writer = new TermVectorsWriter(dir, seg, fieldInfos);
					writer.OpenDocument();
					
					foreach (System.String fieldName in testFields)
					{
						writer.OpenField(fieldName);
						// The term's index is passed as the second argument of AddTerm.
						for (int termIndex = 0; termIndex < testTerms.Length; termIndex++)
						{
							writer.AddTerm(testTerms[termIndex], termIndex);
						}
						writer.CloseField();
					}
					
					writer.CloseDocument();
					writer.Close();
					pass++;
				}
			}
			catch (System.IO.IOException ioe)
			{
				System.Console.Error.WriteLine(ioe.StackTrace);
				Assert.IsTrue(false);
			}
		}
Example #5
0
 /// <summary>
 /// Writes ten documents (each containing every test field) through a single
 /// <c>TermVectorsWriter</c>, then opens a <c>TermVectorsReader</c> on the same
 /// segment and repeatedly checks document 5 / first field and document 2 / third field.
 /// Any <c>IOException</c> is printed to stderr and fails the test.
 /// </summary>
 public virtual void TestMultipleDocuments()
 {
     // Phase 1: write the documents.
     try
     {
         TermVectorsWriter vectorsOut = new TermVectorsWriter(dir, seg, fieldInfos);
         Assert.IsTrue(vectorsOut != null);
         for (int written = 0; written < 10; written++)
         {
             WriteDocument(vectorsOut, testFields.Length);
         }
         vectorsOut.Close();
     }
     catch (System.IO.IOException writeError)
     {
         System.Console.Error.WriteLine(writeError.StackTrace);
         Assert.IsTrue(false);
     }

     // Phase 2: read back and run some arbitrary checks.
     try
     {
         TermVectorsReader vectorsIn = new TermVectorsReader(dir, seg, fieldInfos);
         for (int round = 0; round < 10; round++)
         {
             Assert.IsTrue(vectorsIn != null);
             CheckTermVector(vectorsIn, 5, testFields[0]);
             CheckTermVector(vectorsIn, 2, testFields[2]);
         }
     }
     catch (System.IO.IOException readError)
     {
         System.Console.Error.WriteLine(readError.StackTrace);
         Assert.IsTrue(false);
     }
 }
Example #6
0
 /// <summary>
 /// Writes one document with a single field, verifies the writer's open/closed
 /// state around it, checks that the TVD and TVX files exist in the directory,
 /// and finally reads document 0 back through a <c>TermVectorsReader</c>.
 /// Any <c>IOException</c> is printed to stderr and fails the test.
 /// </summary>
 public virtual void TestWriter()
 {
     try
     {
         TermVectorsWriter vectorsOut = new TermVectorsWriter(dir, seg, fieldInfos);
         vectorsOut.OpenDocument();
         Assert.IsTrue(vectorsOut.IsDocumentOpen());
         WriteField(vectorsOut, testFields[0]);
         vectorsOut.CloseDocument();
         vectorsOut.Close();
         Assert.IsTrue(!vectorsOut.IsDocumentOpen());

         // Both term-vector files must have been created.
         Assert.IsTrue(dir.FileExists(seg + TermVectorsWriter.TVD_EXTENSION));
         Assert.IsTrue(dir.FileExists(seg + TermVectorsWriter.TVX_EXTENSION));

         // Read the freshly written data back in.
         TermVectorsReader vectorsIn = new TermVectorsReader(dir, seg, fieldInfos);
         Assert.IsTrue(vectorsIn != null);
         CheckTermVector(vectorsIn, 0, testFields[0]);
     }
     catch (System.IO.IOException ioe)
     {
         System.Console.Error.WriteLine(ioe.StackTrace);
         Assert.IsTrue(false);
     }
 }
Example #7
0
        // LUCENENET specific - original was internal, but FreqProxTermsWriter requires public (little point, since both are internal classes)
        /// <summary>
        /// Finishes and closes the current term-vectors writer, if one is open, and then
        /// resets and shrinks the hash of every per-field consumer in
        /// <paramref name="fieldsToFlush"/>.
        /// </summary>
        /// <param name="fieldsToFlush">Per-field consumers; each value is cast to <c>TermVectorsConsumerPerField</c>.</param>
        /// <param name="state">Supplies the segment's document count and field infos.</param>
        public override void Flush(IDictionary<string, TermsHashConsumerPerField> fieldsToFlush, SegmentWriteState state)
        {
            if (writer != null)
            {
                // A non-null writer means at least one doc in this run had term vectors enabled.
                int docCount = state.SegmentInfo.DocCount;
                Debug.Assert(docCount > 0);
                try
                {
                    Fill(docCount);
                    Debug.Assert(state.SegmentInfo != null);
                    writer.Finish(state.FieldInfos, docCount);
                }
                finally
                {
                    // Close and clear the writer state even if Fill/Finish failed.
                    IOUtils.Close(writer);
                    writer = null;
                    lastDocID = 0;
                    hasVectors = false;
                }
            }

            foreach (TermsHashConsumerPerField consumer in fieldsToFlush.Values)
            {
                TermVectorsConsumerPerField vectorsField = (TermVectorsConsumerPerField)consumer;
                vectorsField.termsHashPerField.Reset();
                vectorsField.ShrinkHash();
            }
        }
Example #8
0
 /// <summary>
 /// Creates the term-vectors writer on first use and resets <c>lastDocID</c> to 0.
 /// Does nothing when a writer already exists.
 /// </summary>
 private void InitTermVectorsWriter()
 {
     if (writer != null)
     {
         return;
     }

     FlushInfo flushInfo = new FlushInfo(docWriter.NumDocsInRAM, docWriter.BytesUsed);
     IOContext context = new IOContext(flushInfo);
     writer = docWriter.codec.TermVectorsFormat.VectorsWriter(docWriter.directory, docWriter.SegmentInfo, context);
     lastDocID = 0;
 }
Example #9
0
 /// <summary>
 /// Lazily creates <c>Writer</c> from the codec's term-vectors format and resets
 /// <c>LastDocID</c> to 0. A no-op when <c>Writer</c> is already set.
 /// </summary>
 private void InitTermVectorsWriter()
 {
     if (Writer != null)
     {
         return;
     }

     FlushInfo flushInfo = new FlushInfo(DocWriter.NumDocsInRAM, DocWriter.BytesUsed());
     IOContext context = new IOContext(flushInfo);
     Writer = DocWriter.Codec.TermVectorsFormat().VectorsWriter(DocWriter.Directory, DocWriter.SegmentInfo, context);
     LastDocID = 0;
 }
Example #10
0
        /// <summary>
        /// Copies the term vectors of every non-deleted document in
        /// <paramref name="reader"/> to <paramref name="termVectorsWriter"/>.
        /// </summary>
        /// <param name="termVectorsWriter">Destination writer.</param>
        /// <param name="matchingVectorsReader">
        /// When non-null, runs of consecutive live documents (at most
        /// <c>MAX_RAW_MERGE_DOCS</c> per run) are copied as raw documents through this
        /// reader; when null, live documents are copied one at a time.
        /// </param>
        /// <param name="reader">Source reader; supplies the document count and deletion state.</param>
        private void CopyVectorsWithDeletions(TermVectorsWriter termVectorsWriter, TermVectorsReader matchingVectorsReader, IndexReader reader)
        {
            int docLimit = reader.MaxDoc;

            if (matchingVectorsReader == null)
            {
                for (int doc = 0; doc < docLimit; doc++)
                {
                    if (!reader.IsDeleted(doc))
                    {
                        // NOTE: the vectors must be assigned to a local before being handed
                        // to termVectorsWriter.AddAllDocVectors; see LUCENE-1282
                        ITermFreqVector[] docVectors = reader.GetTermFreqVectors(doc);
                        termVectorsWriter.AddAllDocVectors(docVectors);
                        checkAbort.Work(300);
                    }
                }
                return;
            }

            // Bulk path: allowed because the fieldInfos are "congruent".
            int cursor = 0;
            while (cursor < docLimit)
            {
                if (reader.IsDeleted(cursor))
                {
                    // skip deleted docs
                    cursor++;
                    continue;
                }

                // Collect a run of live documents starting at the cursor; the field
                // numbers are identical, so the run can be copied as raw bytes.
                int runStart = cursor;
                int runLength = 0;
                do
                {
                    cursor++;
                    runLength++;
                    if (cursor >= docLimit)
                    {
                        break;
                    }
                    if (reader.IsDeleted(cursor))
                    {
                        // The deleted doc ends the run; step over it as well.
                        cursor++;
                        break;
                    }
                } while (runLength < MAX_RAW_MERGE_DOCS);

                matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, runStart, runLength);
                termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, runLength);
                checkAbort.Work(300 * runLength);
            }
        }
Example #11
0
 /// <summary>
 /// Opens field <paramref name="f"/> on the writer, asserts that the writer reports
 /// a field as open, adds every test term (with its index as the second argument),
 /// and closes the field.
 /// </summary>
 /// <param name="writer">The writer to write to.</param>
 /// <param name="f">Name of the field to write.</param>
 /// <exception cref="System.IO.IOException">May propagate from the writer.</exception>
 private void WriteField(TermVectorsWriter writer, System.String f)
 {
     writer.OpenField(f);
     Assert.IsTrue(writer.IsFieldOpen());

     int termIndex = 0;
     foreach (System.String term in testTerms)
     {
         writer.AddTerm(term, termIndex);
         termIndex++;
     }

     writer.CloseField();
 }
Example #12
0
        /// <summary>
        /// Writes one document made of the first <paramref name="numFields"/> test fields,
        /// asserting that the writer reports the document as open after opening it and
        /// as closed after closing it.
        /// </summary>
        /// <param name="writer">The writer to write to.</param>
        /// <param name="numFields">Number of entries of <c>testFields</c> to write, starting at index 0.</param>
        private void WriteDocument(TermVectorsWriter writer, int numFields)
        {
            writer.OpenDocument();
            Assert.IsTrue(writer.IsDocumentOpen());

            int fieldIndex = 0;
            while (fieldIndex < numFields)
            {
                WriteField(writer, testFields[fieldIndex]);
                fieldIndex++;
            }

            writer.CloseDocument();
            Assert.IsTrue(!writer.IsDocumentOpen());
        }
Example #13
0
        /// <summary>
        /// Merges the term vectors of the segments described by <c>mergeState</c> into the
        /// new segment, using a writer obtained from the codec's term-vectors format.
        /// </summary>
        /// <returns>The value returned by <c>TermVectorsWriter.Merge</c>.</returns>
        /// <exception cref="IOException"> if there is a low-level IO error </exception>
        private int MergeVectors()
        {
            // The using block disposes the writer on both the normal and the exceptional path.
            using (TermVectorsWriter vectorsOut = codec.TermVectorsFormat.VectorsWriter(directory, mergeState.SegmentInfo, context))
            {
                return vectorsOut.Merge(mergeState);
            }
        }
Example #14
0
        /// <summary>
        /// Discards in-progress term-vector state: clears <c>hasVectors</c>, aborts and
        /// drops the writer if one is open, resets <c>lastDocID</c> and calls <c>Reset</c>.
        /// </summary>
        public override void Abort()
        {
            hasVectors = false;

            var openWriter = writer;
            if (openWriter != null)
            {
                openWriter.Abort();
                writer = null;
            }

            lastDocID = 0;
            Reset();
        }
Example #15
0
        /// <summary>
        /// Merges the term vectors of the segments described by <c>MergeState</c> into the
        /// new segment, using a writer obtained from the codec's term-vectors format.
        /// </summary>
        /// <returns>The value returned by <c>TermVectorsWriter.Merge</c>.</returns>
        /// <exception cref="IOException"> if there is a low-level IO error </exception>
        private int MergeVectors()
        {
            // Disposal of the writer is guaranteed by the using block, whether Merge returns or throws.
            using (TermVectorsWriter vectorsOut = Codec.TermVectorsFormat().VectorsWriter(Directory, MergeState.SegmentInfo, Context))
            {
                return vectorsOut.Merge(MergeState);
            }
        }
Example #16
0
        /// <summary>
        /// Discards in-progress term-vector state: clears <c>HasVectors</c>, aborts and
        /// drops <c>Writer</c> if it is set, resets <c>LastDocID</c> and calls <c>Reset</c>.
        /// </summary>
        public override void Abort()
        {
            HasVectors = false;

            var openWriter = Writer;
            if (openWriter != null)
            {
                openWriter.Abort();
                Writer = null;
            }

            LastDocID = 0;
            Reset();
        }
Example #17
0
        /// <summary>
        /// Merges the term vectors of every reader into a new <c>TermVectorsWriter</c>
        /// for <c>segment</c>, then verifies the size of the resulting vectors index file.
        /// </summary>
        /// <remarks>
        /// For each reader, a raw-capable vectors reader is taken from the corresponding
        /// entry of <c>matchingSegmentReaders</c> when available; the copy is then delegated
        /// to <c>CopyVectorsWithDeletions</c> or <c>CopyVectorsNoDeletions</c>.
        /// </remarks>
        /// <exception cref="System.IO.IOException">May propagate from the readers, writer or directory.</exception>
        /// <exception cref="System.SystemException">The vectors index file size is not 4 + 16 * mergedDocs.</exception>
        private void MergeVectors()
        {
            TermVectorsWriter vectorsOut = new TermVectorsWriter(directory, segment, fieldInfos);

            try
            {
                int readerIndex = 0;
                System.Collections.IEnumerator readerIter = readers.GetEnumerator();
                while (readerIter.MoveNext())
                {
                    SegmentReader matching = matchingSegmentReaders[readerIndex++];
                    TermVectorsReader rawSource = null;
                    if (matching != null)
                    {
                        TermVectorsReader candidate = matching.GetTermVectorsReaderOrig();

                        // TV* files in an older format cannot be read as raw docs.
                        bool usable = candidate != null && candidate.CanReadRawDocs();
                        rawSource = usable ? candidate : null;
                    }

                    IndexReader current = (IndexReader)readerIter.Current;
                    if (!current.HasDeletions())
                    {
                        CopyVectorsNoDeletions(vectorsOut, rawSource, current);
                    }
                    else
                    {
                        CopyVectorsWithDeletions(vectorsOut, rawSource, current);
                    }
                }
            }
            finally
            {
                vectorsOut.Close();
            }

            System.String tvxName = segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION;
            long actualSize = directory.FileLength(tvxName);
            long expectedSize = 4 + ((long)mergedDocs) * 16;

            if (expectedSize == actualSize)
            {
                return;
            }

            // This is most likely a bug in Sun JRE 1.6.0_04/_05; the mismatch is
            // detected here and an exception is thrown so the corruption does not
            // enter the index.  See LUCENE-1282 for details.
            throw new System.SystemException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + actualSize + " file=" + tvxName + " file exists?=" + directory.FileExists(tvxName) + "; now aborting this merge to prevent index corruption");
        }
Example #18
0
        /// <summary>
        /// Writes the postings collected for this field to the term-vectors writer,
        /// then resets the per-field hash and marks the field as storing term vectors.
        /// </summary>
        /// <remarks>
        /// Terms are emitted in the order produced by <c>SortPostings</c> using the
        /// writer's comparer. Position and offset data are forwarded only when
        /// <c>doVectorPositions</c> or <c>doVectorOffsets</c> is set.
        /// </remarks>
        internal void FinishDocument()
        {
            Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

            int postingCount = termsHashPerField.bytesHash.Count;

            BytesRef scratchTerm = termsWriter.flushTerm;

            Debug.Assert(postingCount >= 0);

            // Track the largest posting count seen so far.
            if (maxNumPostings < postingCount)
            {
                maxNumPostings = postingCount;
            }

            // Called once all occurrences of the field in the document have been
            // inverted; the hash contents are now handed to the writer.

            Debug.Assert(termsWriter.VectorFieldsInOrder(fieldInfo));

            TermVectorsPostingsArray postingData = (TermVectorsPostingsArray)termsHashPerField.postingsArray;
            TermVectorsWriter vectorsOut = termsWriter.writer;

            int[] sortedTermIDs = termsHashPerField.SortPostings(vectorsOut.Comparer);

            vectorsOut.StartField(fieldInfo, postingCount, doVectorPositions, doVectorOffsets, hasPayloads);

            // Slice readers stay null for data that is not being stored.
            ByteSliceReader positionsIn = null;
            if (doVectorPositions)
            {
                positionsIn = termsWriter.vectorSliceReaderPos;
            }
            ByteSliceReader offsetsIn = null;
            if (doVectorOffsets)
            {
                offsetsIn = termsWriter.vectorSliceReaderOff;
            }

            ByteBlockPool termBytes = termsHashPerField.termBytePool;

            // Only the first postingCount entries of sortedTermIDs are consumed.
            for (int slot = 0; slot < postingCount; slot++)
            {
                int termID = sortedTermIDs[slot];
                int termFreq = postingData.freqs[termID];

                // Point the shared BytesRef at this term's bytes.
                termBytes.SetBytesRef(scratchTerm, postingData.textStarts[termID]);
                vectorsOut.StartTerm(scratchTerm, termFreq);

                if (doVectorPositions || doVectorOffsets)
                {
                    // Stream 0 feeds the position reader, stream 1 the offset reader.
                    if (positionsIn != null)
                    {
                        termsHashPerField.InitReader(positionsIn, termID, 0);
                    }
                    if (offsetsIn != null)
                    {
                        termsHashPerField.InitReader(offsetsIn, termID, 1);
                    }
                    vectorsOut.AddProx(termFreq, positionsIn, offsetsIn);
                }

                vectorsOut.FinishTerm();
            }

            vectorsOut.FinishField();

            termsHashPerField.Reset();

            fieldInfo.SetStoreTermVectors();
        }
Example #19
0
		/// <summary>
		/// Copies the term vectors of every document in <paramref name="reader"/> to
		/// <paramref name="termVectorsWriter"/>; deletions are not consulted.
		/// </summary>
		/// <param name="termVectorsWriter">Destination writer.</param>
		/// <param name="matchingVectorsReader">
		/// When non-null, documents are copied as raw documents in batches of at most
		/// <c>MAX_RAW_MERGE_DOCS</c>; when null, vectors are read from
		/// <paramref name="reader"/> and re-added per document.
		/// </param>
		/// <param name="reader">Source reader; supplies the document count.</param>
		private void CopyVectorsNoDeletions(TermVectorsWriter termVectorsWriter, TermVectorsReader matchingVectorsReader, IndexReader reader)
		{
			int total = reader.MaxDoc();
			if (matchingVectorsReader == null)
			{
				// Per-document path.
				int doc = 0;
				while (doc < total)
				{
					// NOTE: the vectors must be assigned to a local before being handed
					// to termVectorsWriter.AddAllDocVectors; see LUCENE-1282
					TermFreqVector[] docVectors = reader.GetTermFreqVectors(doc);
					termVectorsWriter.AddAllDocVectors(docVectors);
					checkAbort.Work(300);
					doc++;
				}
				return;
			}
			
			// Bulk path: allowed because the fieldInfos are "congruent".
			for (int copied = 0; copied < total; )
			{
				int batch = System.Math.Min(MAX_RAW_MERGE_DOCS, total - copied);
				matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, copied, batch);
				termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, batch);
				copied += batch;
				checkAbort.Work(300 * batch);
			}
		}
Example #20
0
		/// <summary>
		/// Copies the term vectors of every non-deleted document in
		/// <paramref name="reader"/> to <paramref name="termVectorsWriter"/>.
		/// </summary>
		/// <param name="termVectorsWriter">Destination writer.</param>
		/// <param name="matchingVectorsReader">
		/// When non-null, runs of consecutive live documents (at most
		/// <c>MAX_RAW_MERGE_DOCS</c> per run) are copied as raw documents; when null,
		/// live documents are copied one at a time.
		/// </param>
		/// <param name="reader">Source reader; supplies the document count and deletion state.</param>
		private void CopyVectorsWithDeletions(TermVectorsWriter termVectorsWriter, TermVectorsReader matchingVectorsReader, IndexReader reader)
		{
			int docLimit = reader.MaxDoc();
			if (matchingVectorsReader == null)
			{
				for (int doc = 0; doc < docLimit; doc++)
				{
					if (!reader.IsDeleted(doc))
					{
						// NOTE: the vectors must be assigned to a local before being handed
						// to termVectorsWriter.AddAllDocVectors; see LUCENE-1282
						TermFreqVector[] docVectors = reader.GetTermFreqVectors(doc);
						termVectorsWriter.AddAllDocVectors(docVectors);
						checkAbort.Work(300);
					}
				}
				return;
			}
			
			// Bulk path: allowed because the fieldInfos are "congruent".
			int cursor = 0;
			while (cursor < docLimit)
			{
				if (reader.IsDeleted(cursor))
				{
					// skip deleted docs
					cursor++;
					continue;
				}
				
				// Gather a run of live documents beginning at the cursor; identical
				// field numbers allow the run to be copied as raw bytes.
				int runStart = cursor;
				int runLength = 0;
				do
				{
					cursor++;
					runLength++;
					if (cursor >= docLimit)
					{
						break;
					}
					if (reader.IsDeleted(cursor))
					{
						// The deleted doc ends the run; step over it as well.
						cursor++;
						break;
					}
				}
				while (runLength < MAX_RAW_MERGE_DOCS);
				
				matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, runStart, runLength);
				termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, runLength);
				checkAbort.Work(300 * runLength);
			}
		}
Example #21
0
		/// <summary>
		/// Merges the term vectors of every reader into a new <c>TermVectorsWriter</c>
		/// for <c>segment</c>, then verifies the size of the resulting vectors index file.
		/// </summary>
		/// <remarks>
		/// Per reader, a raw-capable vectors reader is taken from the corresponding entry
		/// of <c>matchingSegmentReaders</c> when available, and the copy is delegated to
		/// <c>CopyVectorsWithDeletions</c> or <c>CopyVectorsNoDeletions</c>.
		/// </remarks>
		/// <exception cref="System.IO.IOException">May propagate from the readers, writer or directory.</exception>
		/// <exception cref="System.SystemException">The vectors index file size is not 4 + 16 * mergedDocs.</exception>
		private void MergeVectors()
		{
			TermVectorsWriter vectorsOut = new TermVectorsWriter(directory, segment, fieldInfos);
			
			try
			{
				int readerIndex = 0;
				System.Collections.IEnumerator readerIter = readers.GetEnumerator();
				while (readerIter.MoveNext())
				{
					SegmentReader matching = matchingSegmentReaders[readerIndex++];
					TermVectorsReader rawSource = null;
					if (matching != null)
					{
						TermVectorsReader candidate = matching.GetTermVectorsReaderOrig();
						
						// TV* files in an older format cannot be read as raw docs.
						if (candidate != null && candidate.CanReadRawDocs())
						{
							rawSource = candidate;
						}
					}
					
					IndexReader current = (IndexReader) readerIter.Current;
					if (!current.HasDeletions())
					{
						CopyVectorsNoDeletions(vectorsOut, rawSource, current);
					}
					else
					{
						CopyVectorsWithDeletions(vectorsOut, rawSource, current);
					}
				}
			}
			finally
			{
				vectorsOut.Close();
			}
			
			System.String tvxName = segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION;
			long actualSize = directory.FileLength(tvxName);
			long expectedSize = 4 + ((long) mergedDocs) * 16;
			
			if (expectedSize != actualSize)
			{
				// This is most likely a bug in Sun JRE 1.6.0_04/_05; the mismatch is
				// detected here and an exception is thrown so the corruption does not
				// enter the index.  See LUCENE-1282 for details.
				throw new System.SystemException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + actualSize + " file=" + tvxName + " file exists?=" + directory.FileExists(tvxName) + "; now aborting this merge to prevent index corruption");
			}
		}
		/// <summary>
		/// Merges the term vectors of every non-deleted document of every reader into a
		/// new <c>TermVectorsWriter</c> for <c>segment</c>.
		/// </summary>
		/// <remarks>
		/// After the writer is closed, a debug-only assertion checks that the vectors
		/// index file is 4 + 8 * mergedDocs bytes long.
		/// </remarks>
		/// <exception cref="System.IO.IOException">May propagate from the readers, writer or directory.</exception>
		private void  MergeVectors()
		{
			TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos);
			
			try
			{
				for (int r = 0; r < readers.Count; r++)
				{
					IndexReader reader = (IndexReader) readers[r];
					int maxDoc = reader.MaxDoc();
					for (int docNum = 0; docNum < maxDoc; docNum++)
					{
						// skip deleted docs
						if (reader.IsDeleted(docNum))
						{
							continue;
						}
						termVectorsWriter.AddAllDocVectors(reader.GetTermFreqVectors(docNum));
						// checkAbort may be absent; report work only when it is set.
						if (checkAbort != null)
						{
							checkAbort.Work(300);
						}
					}
				}
			}
			finally
			{
				// Close the writer on both the normal and the exceptional path.
				termVectorsWriter.Close();
			}

			// The expected size is computed in 64-bit arithmetic: with plain int math,
			// mergedDocs * 8 could overflow before being compared with the long file
			// length. Everything stays inside Debug.Assert so release builds still
			// perform no extra directory access.
            System.Diagnostics.Debug.Assert(4 + ((long) mergedDocs) * 8 == directory.FileLength(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION),
                "after MergeVectors: tvx size mismatch: " + mergedDocs + " docs vs " +
                directory.FileLength(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION) +
                " length in bytes of " + segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
		}
Example #23
0
        /// <summary>
        /// Merges the term vectors of every reader into a new <c>TermVectorsWriter</c>
        /// for <c>segment</c>, skipping deleted documents.
        /// </summary>
        /// <remarks>
        /// For each reader, the corresponding entry of <c>matchingSegmentReaders</c> is
        /// consulted. If it provides a term vectors reader that can read raw docs, runs of
        /// consecutive live documents (at most <c>MAX_RAW_MERGE_DOCS</c> per run) are copied
        /// in bulk; otherwise each live document's vectors are read and re-added one at a time.
        /// </remarks>
        /// <exception cref="System.IO.IOException">May propagate from the readers, writer or directory.</exception>
        private void MergeVectors()
        {
            TermVectorsWriter termVectorsWriter =
                new TermVectorsWriter(directory, segment, fieldInfos);

            try
            {
                for (int r = 0; r < readers.Count; r++)
                {
                    // Decide whether this reader qualifies for the bulk (raw) copy path.
                    SegmentReader     matchingSegmentReader = matchingSegmentReaders[r];
                    TermVectorsReader matchingVectorsReader;
                    bool hasMatchingReader;
                    if (matchingSegmentReader != null)
                    {
                        matchingVectorsReader = matchingSegmentReader.termVectorsReaderOrig;

                        // If the TV* files are an older format then they
                        // cannot read raw docs:
                        if (matchingVectorsReader != null && !matchingVectorsReader.CanReadRawDocs())
                        {
                            matchingVectorsReader = null;
                            hasMatchingReader     = false;
                        }
                        else
                        {
                            hasMatchingReader = matchingVectorsReader != null;
                        }
                    }
                    else
                    {
                        hasMatchingReader     = false;
                        matchingVectorsReader = null;
                    }
                    IndexReader reader       = (IndexReader)readers[r];
                    bool        hasDeletions = reader.HasDeletions();
                    int         maxDoc       = reader.MaxDoc();
                    // docNum is advanced inside the body: by one for a skipped or singly
                    // copied document, or past a whole run on the bulk path.
                    for (int docNum = 0; docNum < maxDoc;)
                    {
                        // skip deleted docs
                        if (!hasDeletions || !reader.IsDeleted(docNum))
                        {
                            if (hasMatchingReader)
                            {
                                // We can optimize this case (doing a bulk
                                // byte copy) since the field numbers are
                                // identical
                                int start   = docNum;
                                int numDocs = 0;
                                // Extend the run until the end of the reader, a deleted
                                // document, or the per-batch limit is reached.
                                do
                                {
                                    docNum++;
                                    numDocs++;
                                    if (docNum >= maxDoc)
                                    {
                                        break;
                                    }
                                    if (hasDeletions && matchingSegmentReader.IsDeleted(docNum))
                                    {
                                        // The deleted doc ends the run; step over it as well.
                                        docNum++;
                                        break;
                                    }
                                } while (numDocs < MAX_RAW_MERGE_DOCS);

                                matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
                                termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
                                // checkAbort may be absent; report work only when it is set.
                                if (checkAbort != null)
                                {
                                    checkAbort.Work(300 * numDocs);
                                }
                            }
                            else
                            {
                                // NOTE: it's very important to first assign
                                // to vectors then pass it to
                                // termVectorsWriter.addAllDocVectors; see
                                // LUCENE-1282
                                TermFreqVector[] vectors = reader.GetTermFreqVectors(docNum);
                                termVectorsWriter.AddAllDocVectors(vectors);
                                docNum++;
                                if (checkAbort != null)
                                {
                                    checkAbort.Work(300);
                                }
                            }
                        }
                        else
                        {
                            // Deleted document: move on without copying anything.
                            docNum++;
                        }
                    }
                }
            }
            finally
            {
                // Close the writer on both the normal and the exceptional path.
                termVectorsWriter.Close();
            }

            // The length is still read here, but its only consumer is the disabled
            // check below, so the value is currently unused.
            long tvxSize = directory.FileLength(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);

            // {{dougsale-2.4.0}
            // this shouldn't be a problem for us - if it is,
            // then it's not a JRE bug
            //if (4 + mergedDocs * 16 != tvxSize)
            //  // This is most likely a bug in Sun JRE 1.6.0_04/_05;
            //  // we detect that the bug has struck, here, and
            //  // throw an exception to prevent the corruption from
            //  // entering the index.  See LUCENE-1282 for
            //  // details.
            //  throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + "; now aborting this merge to prevent index corruption");
        }
Example #24
0
        private void  WritePostings(Posting[] postings, System.String segment)
        {
            IndexOutput       freq = null, prox = null;
            TermInfosWriter   tis              = null;
            TermVectorsWriter termVectorWriter = null;

            try
            {
                //open files for inverse index storage
                freq = directory.CreateOutput(segment + ".frq");
                prox = directory.CreateOutput(segment + ".prx");
                tis  = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
                TermInfo      ti           = new TermInfo();
                System.String currentField = null;

                for (int i = 0; i < postings.Length; i++)
                {
                    Posting posting = postings[i];

                    // add an entry to the dictionary with pointers to prox and freq files
                    ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), -1);
                    tis.Add(posting.term, ti);

                    // add an entry to the freq file
                    int postingFreq = posting.freq;
                    if (postingFreq == 1)
                    {
                        // optimize freq=1
                        freq.WriteVInt(1);
                    }
                    // set low bit of doc num.
                    else
                    {
                        freq.WriteVInt(0);                         // the document number
                        freq.WriteVInt(postingFreq);               // frequency in doc
                    }

                    int   lastPosition = 0;                   // write positions
                    int[] positions    = posting.positions;
                    for (int j = 0; j < postingFreq; j++)
                    {
                        // use delta-encoding
                        int position = positions[j];
                        prox.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                    // check to see if we switched to a new field
                    System.String termField = posting.term.Field();
                    if (currentField != termField)
                    {
                        // changing field - see if there is something to save
                        currentField = termField;
                        FieldInfo fi = fieldInfos.FieldInfo(currentField);
                        if (fi.storeTermVector)
                        {
                            if (termVectorWriter == null)
                            {
                                termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
                                termVectorWriter.OpenDocument();
                            }
                            termVectorWriter.OpenField(currentField);
                        }
                        else if (termVectorWriter != null)
                        {
                            termVectorWriter.CloseField();
                        }
                    }
                    if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
                    {
                        termVectorWriter.AddTerm(posting.term.Text(), postingFreq, posting.positions, posting.offsets);
                    }
                }
                if (termVectorWriter != null)
                {
                    termVectorWriter.CloseDocument();
                }
            }
            finally
            {
                // make an effort to close all streams we can but remember and re-throw
                // the first exception encountered in this process
                System.IO.IOException keep = null;
                if (freq != null)
                {
                    try
                    {
                        freq.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                        {
                            keep = e;
                        }
                    }
                }
                if (prox != null)
                {
                    try
                    {
                        prox.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                        {
                            keep = e;
                        }
                    }
                }
                if (tis != null)
                {
                    try
                    {
                        tis.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                        {
                            keep = e;
                        }
                    }
                }
                if (termVectorWriter != null)
                {
                    try
                    {
                        termVectorWriter.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                        {
                            keep = e;
                        }
                    }
                }
                if (keep != null)
                {
                    throw new System.IO.IOException(keep.StackTrace);
                }
            }
        }
        /// <summary> Merge the TermVectors from each of the segments into the new one.
        /// For every reader, live documents are either bulk-copied as raw bytes
        /// (when a matching segment reader with raw-doc capable term vectors is
        /// available) or re-added one document at a time.</summary>
        /// <throws>  IOException </throws>
        private void MergeVectors()
        {
            TermVectorsWriter termVectorsWriter =
              new TermVectorsWriter(directory, segment, fieldInfos);

            try
            {
                for (int r = 0; r < readers.Count; r++)
                {
                    // matchingSegmentReaders[r] is null when no bulk-copy-compatible
                    // reader exists for readers[r] (see the null branch below).
                    SegmentReader matchingSegmentReader = matchingSegmentReaders[r];
                    TermVectorsReader matchingVectorsReader;
                    bool hasMatchingReader;
                    if (matchingSegmentReader != null)
                    {
                        matchingVectorsReader = matchingSegmentReader.termVectorsReaderOrig;

                        // If the TV* files are an older format then they
                        // cannot read raw docs:
                        if (matchingVectorsReader != null && !matchingVectorsReader.CanReadRawDocs())
                        {
                            matchingVectorsReader = null;
                            hasMatchingReader = false;
                        }
                        else
                            hasMatchingReader = matchingVectorsReader != null;

                    }
                    else
                    {
                        hasMatchingReader = false;
                        matchingVectorsReader = null;
                    }
                    IndexReader reader = (IndexReader)readers[r];
                    bool hasDeletions = reader.HasDeletions();
                    int maxDoc = reader.MaxDoc();
                    // docNum is advanced inside the body (by one doc, or by a whole
                    // bulk-copied run), hence the empty increment clause.
                    for (int docNum = 0; docNum < maxDoc; )
                    {
                        // skip deleted docs
                        if (!hasDeletions || !reader.IsDeleted(docNum))
                        {
                            if (hasMatchingReader)
                            {
                                // We can optimize this case (doing a bulk
                                // byte copy) since the field numbers are
                                // identical
                                int start = docNum;
                                int numDocs = 0;
                                // Grow the run [start, start + numDocs) of live docs until
                                // the end of the reader, the next deleted doc, or
                                // MAX_RAW_MERGE_DOCS docs have been collected.
                                do
                                {
                                    docNum++;
                                    numDocs++;
                                    if (docNum >= maxDoc)
                                        break;
                                    if (hasDeletions && matchingSegmentReader.IsDeleted(docNum))
                                    {
                                        // Step over the deleted doc so the outer loop
                                        // resumes at the document following it.
                                        docNum++;
                                        break;
                                    }
                                } while (numDocs < MAX_RAW_MERGE_DOCS);

                                // Read the lengths for the run into the two shared buffers,
                                // then hand the reader and those lengths to the writer.
                                matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
                                termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
                                if (checkAbort != null)
                                    checkAbort.Work(300 * numDocs);
                            }
                            else
                            {
                                // NOTE: it's very important to first assign
                                // to vectors then pass it to
                                // termVectorsWriter.addAllDocVectors; see
                                // LUCENE-1282
                                TermFreqVector[] vectors = reader.GetTermFreqVectors(docNum);
                                termVectorsWriter.AddAllDocVectors(vectors);
                                docNum++;
                                if (checkAbort != null)
                                    checkAbort.Work(300);
                            }
                        }
                        else
                            docNum++;
                    }
                }
            }
            finally
            {
                // Always release the writer, even when the merge fails part-way.
                termVectorsWriter.Close();
            }

            // NOTE(review): tvxSize is read but never used, because the consistency
            // check that consumed it is commented out below.
            long tvxSize = directory.FileLength(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);

            // {{dougsale-2.4.0}
            // this shouldn't be a problem for us - if it is,
            // then it's not a JRE bug
            //if (4 + mergedDocs * 16 != tvxSize)
            //  // This is most likely a bug in Sun JRE 1.6.0_04/_05;
            //  // we detect that the bug has struck, here, and
            //  // throw an exception to prevent the corruption from
            //  // entering the index.  See LUCENE-1282 for
            //  // details.
            //  throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + "; now aborting this merge to prevent index corruption");
        }
Example #26
0
        /// <summary> Copies the term vectors of every non-deleted document of each
        /// source reader into the term vectors writer of the new segment.</summary>
        /// <throws>  IOException </throws>
        private void MergeVectors()
        {
            TermVectorsWriter vectorsOut = new TermVectorsWriter(directory, segment, fieldInfos);

            try
            {
                int readerIndex = 0;
                while (readerIndex < readers.Count)
                {
                    IndexReader source = (IndexReader) readers[readerIndex];
                    int docLimit = source.MaxDoc();
                    for (int doc = 0; doc < docLimit; doc++)
                    {
                        // only live documents are carried over
                        if (!source.IsDeleted(doc))
                        {
                            vectorsOut.AddAllDocVectors(source.GetTermFreqVectors(doc));
                        }
                    }
                    readerIndex++;
                }
            }
            finally
            {
                // the writer is closed whether or not the copy succeeded
                vectorsOut.Close();
            }
        }
Example #27
0
		/// <summary> Writes a single document with one field, checks that the term
		/// vector files exist afterwards and reads the field back.</summary>
		public virtual void  TestWriter()
		{
			try
			{
				// Write one document holding the first test field.
				TermVectorsWriter vectorsWriter = new TermVectorsWriter(dir, seg, fieldInfos);
				vectorsWriter.OpenDocument();
				Assert.IsTrue(vectorsWriter.IsDocumentOpen());
				WriteField(vectorsWriter, testFields[0]);
				vectorsWriter.CloseDocument();
				vectorsWriter.Close();
				Assert.IsTrue(!vectorsWriter.IsDocumentOpen());

				// Both output files must have been created.
				bool dataFileExists = dir.FileExists(seg + TermVectorsWriter.TVD_EXTENSION);
				Assert.IsTrue(dataFileExists);
				bool indexFileExists = dir.FileExists(seg + TermVectorsWriter.TVX_EXTENSION);
				Assert.IsTrue(indexFileExists);

				// Read the vectors back and verify document 0.
				TermVectorsReader vectorsReader = new TermVectorsReader(dir, seg, fieldInfos);
				Assert.IsTrue(vectorsReader != null);
				CheckTermVector(vectorsReader, 0, testFields[0]);
			}
			catch (System.IO.IOException ioError)
			{
				// Any I/O problem fails the test after dumping the trace.
				System.Console.Error.WriteLine(ioError.StackTrace);
				Assert.IsTrue(false);
			}
		}
Example #28
0
		/// <summary> Writes ten documents through one writer, then repeatedly checks
		/// the term vectors of documents 5 and 2 through a fresh reader.</summary>
		public virtual void  TestMultipleDocuments()
		{
			// Phase 1: write the documents.
			try
			{
				TermVectorsWriter vectorsWriter = new TermVectorsWriter(dir, seg, fieldInfos);
				Assert.IsTrue(vectorsWriter != null);
				int written = 0;
				while (written < 10)
				{
					WriteDocument(vectorsWriter, testFields.Length);
					written++;
				}
				vectorsWriter.Close();
			}
			catch (System.IO.IOException writeError)
			{
				System.Console.Error.WriteLine(writeError.StackTrace);
				Assert.IsTrue(false);
			}

			// Phase 2: do some arbitrary spot checks on what was written.
			try
			{
				TermVectorsReader vectorsReader = new TermVectorsReader(dir, seg, fieldInfos);
				int round = 0;
				while (round < 10)
				{
					Assert.IsTrue(vectorsReader != null);
					CheckTermVector(vectorsReader, 5, testFields[0]);
					CheckTermVector(vectorsReader, 2, testFields[2]);
					round++;
				}
			}
			catch (System.IO.IOException readError)
			{
				System.Console.Error.WriteLine(readError.StackTrace);
				Assert.IsTrue(false);
			}
		}
Example #29
0
		/// <summary> Opens the given field on the writer, adds every entry of
		/// testTerms to it and closes the field again.</summary>
		/// <param name="writer">The writer to write to
		/// </param>
		/// <param name="f">The field passed to OpenField
		/// </param>
		/// <throws>  IOException </throws>
		private void  WriteField(TermVectorsWriter writer, System.String f)
		{
			writer.OpenField(f);
			Assert.IsTrue(writer.IsFieldOpen());

			int termIndex = 0;
			while (termIndex < testTerms.Length)
			{
				// the term's index is reused as the second AddTerm argument
				// (presumably the term frequency - confirm against TermVectorsWriter)
				writer.AddTerm(testTerms[termIndex], termIndex);
				termIndex++;
			}

			writer.CloseField();
		}
Example #30
0
		/// <summary> Writes the given postings to the segment's inverted index files:
		/// one term dictionary entry per posting, its frequency data to the
		/// ".frq" output and its delta-encoded positions to the ".prx" output.
		/// Postings of fields whose FieldInfo has storeTermVector set are also
		/// added to a lazily created TermVectorsWriter.</summary>
		/// <param name="postings">The postings to write. Field changes are detected by
		/// comparing each posting's field with the previous one, so postings of
		/// the same field are presumably expected to be adjacent (confirm with
		/// the caller).
		/// </param>
		/// <param name="segment">The segment name used as prefix of the created files
		/// </param>
		/// <throws>  IOException if closing one of the outputs fails; the first such
		/// failure is rethrown with the original exception attached as
		/// InnerException </throws>
		private void  WritePostings(Posting[] postings, System.String segment)
		{
			IndexOutput freq = null, prox = null;
			TermInfosWriter tis = null;
			TermVectorsWriter termVectorWriter = null;
			try
			{
				//open files for inverse index storage
				freq = directory.CreateOutput(segment + ".frq");
				prox = directory.CreateOutput(segment + ".prx");
				tis = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
				TermInfo ti = new TermInfo();
				System.String currentField = null;
				
				for (int i = 0; i < postings.Length; i++)
				{
					Posting posting = postings[i];
					
					// add an entry to the dictionary with pointers to prox and freq files
					ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), - 1);
					tis.Add(posting.term, ti);
					
					// add an entry to the freq file
					int postingFreq = posting.freq;
					if (postingFreq == 1)
					{
						// optimize freq=1: set low bit of doc num.
						freq.WriteVInt(1);
					}
					else
					{
						freq.WriteVInt(0); // the document number
						freq.WriteVInt(postingFreq); // frequency in doc
					}
					
					int lastPosition = 0; // write positions
					int[] positions = posting.positions;
					for (int j = 0; j < postingFreq; j++)
					{
						// use delta-encoding
						int position = positions[j];
						prox.WriteVInt(position - lastPosition);
						lastPosition = position;
					}
					// check to see if we switched to a new field
					System.String termField = posting.term.Field();
					if (currentField != termField)
					{
						// changing field - see if there is something to save
						currentField = termField;
						FieldInfo fi = fieldInfos.FieldInfo(currentField);
						if (fi.storeTermVector)
						{
							// the vectors writer is only created once the first
							// field that stores term vectors shows up
							if (termVectorWriter == null)
							{
								termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
								termVectorWriter.OpenDocument();
							}
							termVectorWriter.OpenField(currentField);
						}
						else if (termVectorWriter != null)
						{
							termVectorWriter.CloseField();
						}
					}
					if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
					{
						termVectorWriter.AddTerm(posting.term.Text(), postingFreq, posting.positions, posting.offsets);
					}
				}
				if (termVectorWriter != null)
				{
					termVectorWriter.CloseDocument();
				}
			}
			finally
			{
				// make an effort to close all streams we can but remember and re-throw
				// the first exception encountered in this process
				System.IO.IOException keep = null;
				if (freq != null)
				{
					try
					{
						freq.Close();
					}
					catch (System.IO.IOException e)
					{
						if (keep == null)
						{
							keep = e;
						}
					}
				}
				if (prox != null)
				{
					try
					{
						prox.Close();
					}
					catch (System.IO.IOException e)
					{
						if (keep == null)
						{
							keep = e;
						}
					}
				}
				if (tis != null)
				{
					try
					{
						tis.Close();
					}
					catch (System.IO.IOException e)
					{
						if (keep == null)
						{
							keep = e;
						}
					}
				}
				if (termVectorWriter != null)
				{
					try
					{
						termVectorWriter.Close();
					}
					catch (System.IO.IOException e)
					{
						if (keep == null)
						{
							keep = e;
						}
					}
				}
				if (keep != null)
				{
					// Keep the original message and attach the original exception as
					// InnerException, instead of using its stack trace text as the
					// message and dropping the exception itself.
					throw new System.IO.IOException(keep.Message, keep);
				}
			}
		}
Example #31
0
		/// <summary> Writes one document made of the first numFields entries of
		/// testFields, asserting the writer's document-open state before and
		/// after.</summary>
		/// <param name="writer">The writer to write to
		/// </param>
		/// <param name="numFields">How many entries of testFields to write
		/// </param>
		private void  WriteDocument(TermVectorsWriter writer, int numFields)
		{
			writer.OpenDocument();
			Assert.IsTrue(writer.IsDocumentOpen());

			int fieldIndex = 0;
			while (fieldIndex < numFields)
			{
				WriteField(writer, testFields[fieldIndex]);
				fieldIndex++;
			}

			writer.CloseDocument();
			Assert.IsTrue(!writer.IsDocumentOpen());
		}