GetTermFreqVectors() abstract public method

Return an array of term frequency vectors for the specified document. The array contains a vector for each vectorized field in the document. Each vector contains terms and frequencies for all terms in a given vectorized field. If no such fields existed, the method returns null. The term vectors that are returned may either be of type ITermFreqVector or of type TermPositionVector if positions or offsets have been stored.
abstract public GetTermFreqVectors ( int docNumber ) : ITermFreqVector[]
docNumber int document for which term frequency vectors are returned ///
return ITermFreqVector[]
Example #1
0
        private void  CopyVectorsNoDeletions(TermVectorsWriter termVectorsWriter, TermVectorsReader matchingVectorsReader, IndexReader reader)
        {
            int maxDoc = reader.MaxDoc;

            if (matchingVectorsReader != null)
            {
                // We can bulk-copy because the fieldInfos are "congruent"
                int docCount = 0;
                while (docCount < maxDoc)
                {
                    int len = System.Math.Min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
                    matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, docCount, len);
                    termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, len);
                    docCount += len;
                    checkAbort.Work(300 * len);
                }
            }
            else
            {
                for (int docNum = 0; docNum < maxDoc; docNum++)
                {
                    // NOTE: it's very important to first assign to vectors then pass it to
                    // termVectorsWriter.addAllDocVectors; see LUCENE-1282
                    ITermFreqVector[] vectors = reader.GetTermFreqVectors(docNum);
                    termVectorsWriter.AddAllDocVectors(vectors);
                    checkAbort.Work(300);
                }
            }
        }
Example #2
0
        /// <summary> Merge the TermVectors from each of the segments into the new one.</summary>
        /// <throws>  IOException </throws>
        private void  MergeVectors()
        {
            TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos);

            try
            {
                for (int r = 0; r < readers.Count; r++)
                {
                    IndexReader reader = (IndexReader)readers[r];
                    int         maxDoc = reader.MaxDoc();
                    for (int docNum = 0; docNum < maxDoc; docNum++)
                    {
                        // skip deleted docs
                        if (reader.IsDeleted(docNum))
                        {
                            continue;
                        }
                        termVectorsWriter.AddAllDocVectors(reader.GetTermFreqVectors(docNum));
                        if (checkAbort != null)
                        {
                            checkAbort.Work(300);
                        }
                    }
                }
            }
            finally
            {
                termVectorsWriter.Close();
            }
        }
        private void  TestTermVectors()
        {
            // check:
            int  numDocs = reader.NumDocs();
            long start   = 0L;

            for (int docId = 0; docId < numDocs; docId++)
            {
                start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                ITermFreqVector[] vectors = reader.GetTermFreqVectors(docId, null);
                timeElapsed += (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond) - start;

                // verify vectors result
                VerifyVectors(vectors, docId);

                start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                ITermFreqVector vector = reader.GetTermFreqVector(docId, "field", null);
                timeElapsed += (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond) - start;

                vectors    = new ITermFreqVector[1];
                vectors[0] = vector;

                VerifyVectors(vectors, docId);
            }
        }
Example #4
0
        private void  CopyVectorsWithDeletions(TermVectorsWriter termVectorsWriter, TermVectorsReader matchingVectorsReader, IndexReader reader)
        {
            int maxDoc = reader.MaxDoc;

            if (matchingVectorsReader != null)
            {
                // We can bulk-copy because the fieldInfos are "congruent"
                for (int docNum = 0; docNum < maxDoc;)
                {
                    if (reader.IsDeleted(docNum))
                    {
                        // skip deleted docs
                        ++docNum;
                        continue;
                    }
                    // We can optimize this case (doing a bulk byte copy) since the field
                    // numbers are identical
                    int start = docNum, numDocs = 0;
                    do
                    {
                        docNum++;
                        numDocs++;
                        if (docNum >= maxDoc)
                        {
                            break;
                        }
                        if (reader.IsDeleted(docNum))
                        {
                            docNum++;
                            break;
                        }
                    }while (numDocs < MAX_RAW_MERGE_DOCS);

                    matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
                    termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
                    checkAbort.Work(300 * numDocs);
                }
            }
            else
            {
                for (int docNum = 0; docNum < maxDoc; docNum++)
                {
                    if (reader.IsDeleted(docNum))
                    {
                        // skip deleted docs
                        continue;
                    }

                    // NOTE: it's very important to first assign to vectors then pass it to
                    // termVectorsWriter.addAllDocVectors; see LUCENE-1282
                    ITermFreqVector[] vectors = reader.GetTermFreqVectors(docNum);
                    termVectorsWriter.AddAllDocVectors(vectors);
                    checkAbort.Work(300);
                }
            }
        }
Example #5
0
        private void  TestHits(ScoreDoc[] hits, int expectedCount, IndexReader reader)
        {
            int hitCount = hits.Length;

            Assert.AreEqual(expectedCount, hitCount, "wrong number of hits");
            for (int i = 0; i < hitCount; i++)
            {
                reader.Document(hits[i].Doc);
                reader.GetTermFreqVectors(hits[i].Doc);
            }
        }
Example #6
0
 public override ITermFreqVector[] GetTermFreqVectors(int docNumber)
 {
     EnsureOpen();
     return(in_Renamed.GetTermFreqVectors(docNumber));
 }
Example #7
0
		private void  CopyVectorsNoDeletions(TermVectorsWriter termVectorsWriter, TermVectorsReader matchingVectorsReader, IndexReader reader)
		{
			int maxDoc = reader.MaxDoc();
			if (matchingVectorsReader != null)
			{
				// We can bulk-copy because the fieldInfos are "congruent"
				int docCount = 0;
				while (docCount < maxDoc)
				{
					int len = System.Math.Min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
					matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, docCount, len);
					termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, len);
					docCount += len;
					checkAbort.Work(300 * len);
				}
			}
			else
			{
				for (int docNum = 0; docNum < maxDoc; docNum++)
				{
					// NOTE: it's very important to first assign to vectors then pass it to
					// termVectorsWriter.addAllDocVectors; see LUCENE-1282
					TermFreqVector[] vectors = reader.GetTermFreqVectors(docNum);
					termVectorsWriter.AddAllDocVectors(vectors);
					checkAbort.Work(300);
				}
			}
		}
Example #8
0
		private void  CopyVectorsWithDeletions(TermVectorsWriter termVectorsWriter, TermVectorsReader matchingVectorsReader, IndexReader reader)
		{
			int maxDoc = reader.MaxDoc();
			if (matchingVectorsReader != null)
			{
				// We can bulk-copy because the fieldInfos are "congruent"
				for (int docNum = 0; docNum < maxDoc; )
				{
					if (reader.IsDeleted(docNum))
					{
						// skip deleted docs
						++docNum;
						continue;
					}
					// We can optimize this case (doing a bulk byte copy) since the field 
					// numbers are identical
					int start = docNum, numDocs = 0;
					do 
					{
						docNum++;
						numDocs++;
						if (docNum >= maxDoc)
							break;
						if (reader.IsDeleted(docNum))
						{
							docNum++;
							break;
						}
					}
					while (numDocs < MAX_RAW_MERGE_DOCS);
					
					matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
					termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
					checkAbort.Work(300 * numDocs);
				}
			}
			else
			{
				for (int docNum = 0; docNum < maxDoc; docNum++)
				{
					if (reader.IsDeleted(docNum))
					{
						// skip deleted docs
						continue;
					}
					
					// NOTE: it's very important to first assign to vectors then pass it to
					// termVectorsWriter.addAllDocVectors; see LUCENE-1282
					TermFreqVector[] vectors = reader.GetTermFreqVectors(docNum);
					termVectorsWriter.AddAllDocVectors(vectors);
					checkAbort.Work(300);
				}
			}
		}
Example #9
0
		public static void  VerifyEquals(IndexReader r1, IndexReader r2, System.String idField)
		{
			Assert.AreEqual(r1.NumDocs(), r2.NumDocs());
			bool hasDeletes = !(r1.MaxDoc() == r2.MaxDoc() && r1.NumDocs() == r1.MaxDoc());
			
			int[] r2r1 = new int[r2.MaxDoc()]; // r2 id to r1 id mapping
			
			TermDocs termDocs1 = r1.TermDocs();
			TermDocs termDocs2 = r2.TermDocs();
			
			// create mapping from id2 space to id2 based on idField
			idField = StringHelper.Intern(idField);
			TermEnum termEnum = r1.Terms(new Term(idField, ""));
			do 
			{
				Term term = termEnum.Term();
				if (term == null || (System.Object) term.Field() != (System.Object) idField)
					break;
				
				termDocs1.Seek(termEnum);
				if (!termDocs1.Next())
				{
					// This doc is deleted and wasn't replaced
					termDocs2.Seek(termEnum);
					Assert.IsFalse(termDocs2.Next());
					continue;
				}
				
				int id1 = termDocs1.Doc();
				Assert.IsFalse(termDocs1.Next());
				
				termDocs2.Seek(termEnum);
				Assert.IsTrue(termDocs2.Next());
				int id2 = termDocs2.Doc();
				Assert.IsFalse(termDocs2.Next());
				
				r2r1[id2] = id1;
				
				// verify stored fields are equivalent
				try
				{
					VerifyEquals(r1.Document(id1), r2.Document(id2));
				}
				catch (System.Exception t)
				{
					System.Console.Out.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2 + " term=" + term);
					System.Console.Out.WriteLine("  d1=" + r1.Document(id1));
					System.Console.Out.WriteLine("  d2=" + r2.Document(id2));
					throw t;
				}
				
				try
				{
					// verify term vectors are equivalent        
					VerifyEquals(r1.GetTermFreqVectors(id1), r2.GetTermFreqVectors(id2));
				}
				catch (System.Exception e)
				{
					System.Console.Out.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2);
					TermFreqVector[] tv1 = r1.GetTermFreqVectors(id1);
					System.Console.Out.WriteLine("  d1=" + tv1);
					if (tv1 != null)
						for (int i = 0; i < tv1.Length; i++)
						{
							System.Console.Out.WriteLine("    " + i + ": " + tv1[i]);
						}
					
					TermFreqVector[] tv2 = r2.GetTermFreqVectors(id2);
					System.Console.Out.WriteLine("  d2=" + tv2);
					if (tv2 != null)
						for (int i = 0; i < tv2.Length; i++)
						{
							System.Console.Out.WriteLine("    " + i + ": " + tv2[i]);
						}
					
					throw e;
				}
			}
			while (termEnum.Next());
			
			termEnum.Close();
			
			// Verify postings
			TermEnum termEnum1 = r1.Terms(new Term("", ""));
			TermEnum termEnum2 = r2.Terms(new Term("", ""));
			
			// pack both doc and freq into single element for easy sorting
			long[] info1 = new long[r1.NumDocs()];
			long[] info2 = new long[r2.NumDocs()];
			
			for (; ; )
			{
				Term term1, term2;
				
				// iterate until we get some docs
				int len1;
				for (; ; )
				{
					len1 = 0;
					term1 = termEnum1.Term();
					if (term1 == null)
						break;
					termDocs1.Seek(termEnum1);
					while (termDocs1.Next())
					{
						int d1 = termDocs1.Doc();
						int f1 = termDocs1.Freq();
						info1[len1] = (((long) d1) << 32) | f1;
						len1++;
					}
					if (len1 > 0)
						break;
					if (!termEnum1.Next())
						break;
				}
				
				// iterate until we get some docs
				int len2;
				for (; ; )
				{
					len2 = 0;
					term2 = termEnum2.Term();
					if (term2 == null)
						break;
					termDocs2.Seek(termEnum2);
					while (termDocs2.Next())
					{
						int d2 = termDocs2.Doc();
						int f2 = termDocs2.Freq();
						info2[len2] = (((long) r2r1[d2]) << 32) | f2;
						len2++;
					}
					if (len2 > 0)
						break;
					if (!termEnum2.Next())
						break;
				}
				
				if (!hasDeletes)
					Assert.AreEqual(termEnum1.DocFreq(), termEnum2.DocFreq());
				
				Assert.AreEqual(len1, len2);
				if (len1 == 0)
					break; // no more terms
				
				Assert.AreEqual(term1, term2);
				
				// sort info2 to get it into ascending docid
				System.Array.Sort(info2, 0, len2 - 0);
				
				// now compare
				for (int i = 0; i < len1; i++)
				{
					Assert.AreEqual(info1[i], info2[i]);
				}
				
				termEnum1.Next();
				termEnum2.Next();
			}
		}
Example #10
0
        public static void  VerifyEquals(IndexReader r1, IndexReader r2, System.String idField)
        {
            Assert.AreEqual(r1.NumDocs(), r2.NumDocs());
            bool hasDeletes = !(r1.MaxDoc() == r2.MaxDoc() && r1.NumDocs() == r1.MaxDoc());

            int[] r2r1 = new int[r2.MaxDoc()];             // r2 id to r1 id mapping

            TermDocs termDocs1 = r1.TermDocs();
            TermDocs termDocs2 = r2.TermDocs();

            // create mapping from id2 space to id2 based on idField
            idField = StringHelper.Intern(idField);
            TermEnum termEnum = r1.Terms(new Term(idField, ""));

            do
            {
                Term term = termEnum.Term();
                if (term == null || (System.Object)term.Field() != (System.Object)idField)
                {
                    break;
                }

                termDocs1.Seek(termEnum);
                if (!termDocs1.Next())
                {
                    // This doc is deleted and wasn't replaced
                    termDocs2.Seek(termEnum);
                    Assert.IsFalse(termDocs2.Next());
                    continue;
                }

                int id1 = termDocs1.Doc();
                Assert.IsFalse(termDocs1.Next());

                termDocs2.Seek(termEnum);
                Assert.IsTrue(termDocs2.Next());
                int id2 = termDocs2.Doc();
                Assert.IsFalse(termDocs2.Next());

                r2r1[id2] = id1;

                // verify stored fields are equivalent
                try
                {
                    VerifyEquals(r1.Document(id1), r2.Document(id2));
                }
                catch (System.Exception t)
                {
                    System.Console.Out.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2 + " term=" + term);
                    System.Console.Out.WriteLine("  d1=" + r1.Document(id1));
                    System.Console.Out.WriteLine("  d2=" + r2.Document(id2));
                    throw t;
                }

                try
                {
                    // verify term vectors are equivalent
                    VerifyEquals(r1.GetTermFreqVectors(id1), r2.GetTermFreqVectors(id2));
                }
                catch (System.Exception e)
                {
                    System.Console.Out.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2);
                    TermFreqVector[] tv1 = r1.GetTermFreqVectors(id1);
                    System.Console.Out.WriteLine("  d1=" + tv1);
                    if (tv1 != null)
                    {
                        for (int i = 0; i < tv1.Length; i++)
                        {
                            System.Console.Out.WriteLine("    " + i + ": " + tv1[i]);
                        }
                    }

                    TermFreqVector[] tv2 = r2.GetTermFreqVectors(id2);
                    System.Console.Out.WriteLine("  d2=" + tv2);
                    if (tv2 != null)
                    {
                        for (int i = 0; i < tv2.Length; i++)
                        {
                            System.Console.Out.WriteLine("    " + i + ": " + tv2[i]);
                        }
                    }

                    throw e;
                }
            }while (termEnum.Next());

            termEnum.Close();

            // Verify postings
            TermEnum termEnum1 = r1.Terms(new Term("", ""));
            TermEnum termEnum2 = r2.Terms(new Term("", ""));

            // pack both doc and freq into single element for easy sorting
            long[] info1 = new long[r1.NumDocs()];
            long[] info2 = new long[r2.NumDocs()];

            for (; ;)
            {
                Term term1, term2;

                // iterate until we get some docs
                int len1;
                for (; ;)
                {
                    len1  = 0;
                    term1 = termEnum1.Term();
                    if (term1 == null)
                    {
                        break;
                    }
                    termDocs1.Seek(termEnum1);
                    while (termDocs1.Next())
                    {
                        int d1 = termDocs1.Doc();
                        int f1 = termDocs1.Freq();
                        info1[len1] = (((long)d1) << 32) | f1;
                        len1++;
                    }
                    if (len1 > 0)
                    {
                        break;
                    }
                    if (!termEnum1.Next())
                    {
                        break;
                    }
                }

                // iterate until we get some docs
                int len2;
                for (; ;)
                {
                    len2  = 0;
                    term2 = termEnum2.Term();
                    if (term2 == null)
                    {
                        break;
                    }
                    termDocs2.Seek(termEnum2);
                    while (termDocs2.Next())
                    {
                        int d2 = termDocs2.Doc();
                        int f2 = termDocs2.Freq();
                        info2[len2] = (((long)r2r1[d2]) << 32) | f2;
                        len2++;
                    }
                    if (len2 > 0)
                    {
                        break;
                    }
                    if (!termEnum2.Next())
                    {
                        break;
                    }
                }

                if (!hasDeletes)
                {
                    Assert.AreEqual(termEnum1.DocFreq(), termEnum2.DocFreq());
                }

                Assert.AreEqual(len1, len2);
                if (len1 == 0)
                {
                    break;                     // no more terms
                }
                Assert.AreEqual(term1, term2);

                // sort info2 to get it into ascending docid
                System.Array.Sort(info2, 0, len2 - 0);

                // now compare
                for (int i = 0; i < len1; i++)
                {
                    Assert.AreEqual(info1[i], info2[i]);
                }

                termEnum1.Next();
                termEnum2.Next();
            }
        }
		private void  TestHits(ScoreDoc[] hits, int expectedCount, IndexReader reader)
		{
			int hitCount = hits.Length;
			Assert.AreEqual(expectedCount, hitCount, "wrong number of hits");
			for (int i = 0; i < hitCount; i++)
			{
				reader.Document(hits[i].doc);
				reader.GetTermFreqVectors(hits[i].doc);
			}
		}
Example #12
0
 public override TermFreqVector[] GetTermFreqVectors(int docNumber)
 {
     return(in_Renamed.GetTermFreqVectors(docNumber));
 }
Example #13
0
        /// <summary> Merge the TermVectors from each of the segments into the new one.</summary>
        /// <throws>  IOException </throws>
        private void MergeVectors()
        {
            TermVectorsWriter termVectorsWriter =
                new TermVectorsWriter(directory, segment, fieldInfos);

            try
            {
                for (int r = 0; r < readers.Count; r++)
                {
                    SegmentReader     matchingSegmentReader = matchingSegmentReaders[r];
                    TermVectorsReader matchingVectorsReader;
                    bool hasMatchingReader;
                    if (matchingSegmentReader != null)
                    {
                        matchingVectorsReader = matchingSegmentReader.termVectorsReaderOrig;

                        // If the TV* files are an older format then they
                        // cannot read raw docs:
                        if (matchingVectorsReader != null && !matchingVectorsReader.CanReadRawDocs())
                        {
                            matchingVectorsReader = null;
                            hasMatchingReader     = false;
                        }
                        else
                        {
                            hasMatchingReader = matchingVectorsReader != null;
                        }
                    }
                    else
                    {
                        hasMatchingReader     = false;
                        matchingVectorsReader = null;
                    }
                    IndexReader reader       = (IndexReader)readers[r];
                    bool        hasDeletions = reader.HasDeletions();
                    int         maxDoc       = reader.MaxDoc();
                    for (int docNum = 0; docNum < maxDoc;)
                    {
                        // skip deleted docs
                        if (!hasDeletions || !reader.IsDeleted(docNum))
                        {
                            if (hasMatchingReader)
                            {
                                // We can optimize this case (doing a bulk
                                // byte copy) since the field numbers are
                                // identical
                                int start   = docNum;
                                int numDocs = 0;
                                do
                                {
                                    docNum++;
                                    numDocs++;
                                    if (docNum >= maxDoc)
                                    {
                                        break;
                                    }
                                    if (hasDeletions && matchingSegmentReader.IsDeleted(docNum))
                                    {
                                        docNum++;
                                        break;
                                    }
                                } while (numDocs < MAX_RAW_MERGE_DOCS);

                                matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
                                termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
                                if (checkAbort != null)
                                {
                                    checkAbort.Work(300 * numDocs);
                                }
                            }
                            else
                            {
                                // NOTE: it's very important to first assign
                                // to vectors then pass it to
                                // termVectorsWriter.addAllDocVectors; see
                                // LUCENE-1282
                                TermFreqVector[] vectors = reader.GetTermFreqVectors(docNum);
                                termVectorsWriter.AddAllDocVectors(vectors);
                                docNum++;
                                if (checkAbort != null)
                                {
                                    checkAbort.Work(300);
                                }
                            }
                        }
                        else
                        {
                            docNum++;
                        }
                    }
                }
            }
            finally
            {
                termVectorsWriter.Close();
            }

            long tvxSize = directory.FileLength(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);

            // {{dougsale-2.4.0}
            // this shouldn't be a problem for us - if it is,
            // then it's not a JRE bug
            //if (4 + mergedDocs * 16 != tvxSize)
            //  // This is most likely a bug in Sun JRE 1.6.0_04/_05;
            //  // we detect that the bug has struck, here, and
            //  // throw an exception to prevent the corruption from
            //  // entering the index.  See LUCENE-1282 for
            //  // details.
            //  throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + "; now aborting this merge to prevent index corruption");
        }