示例#1
0
 private void  ReadTermVectors(System.String[] fields, long[] tvfPointers, TermVectorMapper mapper)
 {
     for (int i = 0; i < fields.Length; i++)
     {
         ReadTermVector(fields[i], tvfPointers[i], mapper);
     }
 }
示例#2
0
        public override void  GetTermFreqVector(int docNumber, TermVectorMapper mapper)
        {
            EnsureOpen();
            int i = ReaderIndex(docNumber);             // find segment num

            subReaders[i].GetTermFreqVector(docNumber - starts[i], mapper);
        }
示例#3
0
        public virtual void  Get(int docNumber, TermVectorMapper mapper)
        {
            // Check if no term vectors are available for this segment at all
            if (tvx != null)
            {
                //We need to offset by

                SeekTvx(docNumber);
                long tvdPosition = tvx.ReadLong();

                tvd.Seek(tvdPosition);
                int fieldCount = tvd.ReadVInt();

                // No fields are vectorized for this document
                if (fieldCount != 0)
                {
                    System.String[] fields      = ReadFields(fieldCount);
                    long[]          tvfPointers = ReadTvfPointers(fieldCount);
                    mapper.SetDocumentNumber(docNumber);
                    ReadTermVectors(fields, tvfPointers, mapper);
                }
            }
            else
            {
                //System.out.println("No tvx file");
            }
        }
示例#4
0
        public override void  GetTermFreqVector(int docNumber, System.String field, TermVectorMapper mapper)
        {
            EnsureOpen();
            IndexReader reader = ((IndexReader)fieldToReader[field]);

            if (reader != null)
            {
                reader.GetTermFreqVector(docNumber, field, mapper);
            }
        }
示例#5
0
        public override void  GetTermFreqVector(int docNumber, TermVectorMapper mapper)
        {
            EnsureOpen();

            System.Collections.IEnumerator i = new System.Collections.Hashtable(fieldToReader).GetEnumerator();
            while (i.MoveNext())
            {
                System.Collections.DictionaryEntry e = (System.Collections.DictionaryEntry)i.Current;
                System.String field  = (System.String)e.Key;
                IndexReader   reader = (IndexReader)e.Value;
                reader.GetTermFreqVector(docNumber, field, mapper);
            }
        }
示例#6
0
		/// <summary> </summary>
		/// <param name="field">The field to read in
		/// </param>
		/// <param name="tvfPointer">The pointer within the tvf file where we should start reading
		/// </param>
		/// <param name="mapper">The mapper used to map the TermVector
		/// </param>
		/// <throws>  IOException </throws>
		private void  ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
		{
			
			// Now read the data from specified position
			//We don't need to offset by the FORMAT here since the pointer already includes the offset
			tvf.Seek(tvfPointer);
			
			int numTerms = tvf.ReadVInt();
			//System.out.println("Num Terms: " + numTerms);
			// If no terms - return a constant empty termvector. However, this should never occur!
			if (numTerms == 0)
				return ;
			
			bool storePositions;
			bool storeOffsets;
			
			if (format >= FORMAT_VERSION)
			{
				byte bits = tvf.ReadByte();
				storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
				storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
			}
			else
			{
				tvf.ReadVInt();
				storePositions = false;
				storeOffsets = false;
			}
			mapper.SetExpectations(field, numTerms, storeOffsets, storePositions);
			int start = 0;
			int deltaLength = 0;
			int totalLength = 0;
			byte[] byteBuffer;
			char[] charBuffer;
			bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
			
			// init the buffers
			if (preUTF8)
			{
				charBuffer = new char[10];
				byteBuffer = null;
			}
			else
			{
				charBuffer = null;
				byteBuffer = new byte[20];
			}
			
			for (int i = 0; i < numTerms; i++)
			{
				start = tvf.ReadVInt();
				deltaLength = tvf.ReadVInt();
				totalLength = start + deltaLength;
				
				System.String term;
				
				if (preUTF8)
				{
					// Term stored as java chars
					if (charBuffer.Length < totalLength)
					{
						char[] newCharBuffer = new char[(int) (1.5 * totalLength)];
						Array.Copy(charBuffer, 0, newCharBuffer, 0, start);
						charBuffer = newCharBuffer;
					}
					tvf.ReadChars(charBuffer, start, deltaLength);
					term = new System.String(charBuffer, 0, totalLength);
				}
				else
				{
					// Term stored as utf8 bytes
					if (byteBuffer.Length < totalLength)
					{
						byte[] newByteBuffer = new byte[(int) (1.5 * totalLength)];
						Array.Copy(byteBuffer, 0, newByteBuffer, 0, start);
						byteBuffer = newByteBuffer;
					}
					tvf.ReadBytes(byteBuffer, start, deltaLength);
                    term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength);
				}
				int freq = tvf.ReadVInt();
				int[] positions = null;
				if (storePositions)
				{
					//read in the positions
					//does the mapper even care about positions?
					if (mapper.IsIgnoringPositions() == false)
					{
						positions = new int[freq];
						int prevPosition = 0;
						for (int j = 0; j < freq; j++)
						{
							positions[j] = prevPosition + tvf.ReadVInt();
							prevPosition = positions[j];
						}
					}
					else
					{
						//we need to skip over the positions.  Since these are VInts, I don't believe there is anyway to know for sure how far to skip
						//
						for (int j = 0; j < freq; j++)
						{
							tvf.ReadVInt();
						}
					}
				}
				TermVectorOffsetInfo[] offsets = null;
				if (storeOffsets)
				{
					//does the mapper even care about offsets?
					if (mapper.IsIgnoringOffsets() == false)
					{
						offsets = new TermVectorOffsetInfo[freq];
						int prevOffset = 0;
						for (int j = 0; j < freq; j++)
						{
							int startOffset = prevOffset + tvf.ReadVInt();
							int endOffset = startOffset + tvf.ReadVInt();
							offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
							prevOffset = endOffset;
						}
					}
					else
					{
						for (int j = 0; j < freq; j++)
						{
							tvf.ReadVInt();
							tvf.ReadVInt();
						}
					}
				}
				mapper.Map(term, freq, offsets, positions);
			}
		}
示例#7
0
        public virtual void  Get(int docNum, System.String field, TermVectorMapper mapper)
        {
            if (tvx != null)
            {
                int fieldNumber = fieldInfos.FieldNumber(field);
                //We need to account for the FORMAT_SIZE at when seeking in the tvx
                //We don't need to do this in other seeks because we already have the
                // file pointer
                //that was written in another file
                SeekTvx(docNum);
                //System.out.println("TVX Pointer: " + tvx.getFilePointer());
                long tvdPosition = tvx.ReadLong();

                tvd.Seek(tvdPosition);
                int fieldCount = tvd.ReadVInt();
                //System.out.println("Num Fields: " + fieldCount);
                // There are only a few fields per document. We opt for a full scan
                // rather then requiring that they be ordered. We need to read through
                // all of the fields anyway to get to the tvf pointers.
                int number = 0;
                int found  = -1;
                for (int i = 0; i < fieldCount; i++)
                {
                    if (format >= FORMAT_VERSION)
                    {
                        number = tvd.ReadVInt();
                    }
                    else
                    {
                        number += tvd.ReadVInt();
                    }

                    if (number == fieldNumber)
                    {
                        found = i;
                    }
                }

                // This field, although valid in the segment, was not found in this
                // document
                if (found != -1)
                {
                    // Compute position in the tvf file
                    long position;
                    if (format >= FORMAT_VERSION2)
                    {
                        position = tvx.ReadLong();
                    }
                    else
                    {
                        position = tvd.ReadVLong();
                    }
                    for (int i = 1; i <= found; i++)
                    {
                        position += tvd.ReadVLong();
                    }

                    mapper.SetDocumentNumber(docNum);
                    ReadTermVector(field, position, mapper);
                }
                else
                {
                    //System.out.println("Fieldable not found");
                }
            }
            else
            {
                //System.out.println("No tvx file");
            }
        }
示例#8
0
		public override void  GetTermFreqVector(int docNumber, TermVectorMapper mapper)
		{
			EnsureOpen();
			int i = ReaderIndex(docNumber); // find segment num
			subReaders[i].GetTermFreqVector(docNumber - starts[i], mapper);
		}
示例#9
0
		public override void  GetTermFreqVector(int docNumber, TermVectorMapper mapper)
		{
			EnsureOpen();

            System.Collections.IEnumerator i = new System.Collections.Hashtable(fieldToReader).GetEnumerator();
			while (i.MoveNext())
			{
				System.Collections.DictionaryEntry e = (System.Collections.DictionaryEntry) i.Current;
				System.String field = (System.String) e.Key;
				IndexReader reader = (IndexReader) e.Value;
				reader.GetTermFreqVector(docNumber, field, mapper);
			}
		}
示例#10
0
		public override void  GetTermFreqVector(int docNumber, System.String field, TermVectorMapper mapper)
		{
			EnsureOpen();
			IndexReader reader = ((IndexReader) fieldToReader[field]);
			if (reader != null)
			{
				reader.GetTermFreqVector(docNumber, field, mapper);
			}
		}
示例#11
0
		public override void  GetTermFreqVector(int docNumber, TermVectorMapper mapper)
		{
			EnsureOpen();
			
			TermVectorsReader termVectorsReader = GetTermVectorsReader();
			if (termVectorsReader == null)
				return ;
			
			termVectorsReader.Get(docNumber, mapper);
		}
示例#12
0
		public override void  GetTermFreqVector(int docNumber, System.String field, TermVectorMapper mapper)
		{
			EnsureOpen();
			FieldInfo fi = core.fieldInfos.FieldInfo(field);
			if (fi == null || !fi.storeTermVector)
				return ;
			
			TermVectorsReader termVectorsReader = GetTermVectorsReader();
			if (termVectorsReader == null)
			{
				return ;
			}
			
			
			termVectorsReader.Get(docNumber, field, mapper);
		}
示例#13
0
		public override void  GetTermFreqVector(int docNumber, TermVectorMapper mapper)
		{
			EnsureOpen();
			in_Renamed.GetTermFreqVector(docNumber, mapper);
		}
示例#14
0
		private void  ReadTermVectors(System.String[] fields, long[] tvfPointers, TermVectorMapper mapper)
		{
			for (int i = 0; i < fields.Length; i++)
			{
				ReadTermVector(fields[i], tvfPointers[i], mapper);
			}
		}
示例#15
0
 public override void  GetTermFreqVector(int docNumber, System.String field, TermVectorMapper mapper)
 {
     EnsureOpen();
     in_Renamed.GetTermFreqVector(docNumber, field, mapper);
 }
示例#16
0
		public virtual void  Get(int docNumber, TermVectorMapper mapper)
		{
			// Check if no term vectors are available for this segment at all
			if (tvx != null)
			{
				//We need to offset by
				
				SeekTvx(docNumber);
				long tvdPosition = tvx.ReadLong();
				
				tvd.Seek(tvdPosition);
				int fieldCount = tvd.ReadVInt();
				
				// No fields are vectorized for this document
				if (fieldCount != 0)
				{
					System.String[] fields = ReadFields(fieldCount);
					long[] tvfPointers = ReadTvfPointers(fieldCount);
					mapper.SetDocumentNumber(docNumber);
					ReadTermVectors(fields, tvfPointers, mapper);
				}
			}
			else
			{
				//System.out.println("No tvx file");
			}
		}
示例#17
0
		public virtual void  Get(int docNum, System.String field, TermVectorMapper mapper)
		{
			if (tvx != null)
			{
				int fieldNumber = fieldInfos.FieldNumber(field);
				//We need to account for the FORMAT_SIZE at when seeking in the tvx
				//We don't need to do this in other seeks because we already have the
				// file pointer
				//that was written in another file
				SeekTvx(docNum);
				//System.out.println("TVX Pointer: " + tvx.getFilePointer());
				long tvdPosition = tvx.ReadLong();
				
				tvd.Seek(tvdPosition);
				int fieldCount = tvd.ReadVInt();
				//System.out.println("Num Fields: " + fieldCount);
				// There are only a few fields per document. We opt for a full scan
				// rather then requiring that they be ordered. We need to read through
				// all of the fields anyway to get to the tvf pointers.
				int number = 0;
				int found = - 1;
				for (int i = 0; i < fieldCount; i++)
				{
					if (format >= FORMAT_VERSION)
						number = tvd.ReadVInt();
					else
						number += tvd.ReadVInt();
					
					if (number == fieldNumber)
						found = i;
				}
				
				// This field, although valid in the segment, was not found in this
				// document
				if (found != - 1)
				{
					// Compute position in the tvf file
					long position;
					if (format >= FORMAT_VERSION2)
						position = tvx.ReadLong();
					else
						position = tvd.ReadVLong();
					for (int i = 1; i <= found; i++)
						position += tvd.ReadVLong();
					
					mapper.SetDocumentNumber(docNum);
					ReadTermVector(field, position, mapper);
				}
				else
				{
					//System.out.println("Fieldable not found");
				}
			}
			else
			{
				//System.out.println("No tvx file");
			}
		}
示例#18
0
		/// <summary> Map all the term vectors for all fields in a Document</summary>
		/// <param name="docNumber">The number of the document to load the vector for
		/// </param>
		/// <param name="mapper">The {@link TermVectorMapper} to process the vector.  Must not be null
		/// </param>
		/// <throws>  IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified. </throws>
		abstract public void  GetTermFreqVector(int docNumber, TermVectorMapper mapper);
示例#19
0
		/// <summary> Load the Term Vector into a user-defined data structure instead of relying on the parallel arrays of
		/// the {@link TermFreqVector}.
		/// </summary>
		/// <param name="docNumber">The number of the document to load the vector for
		/// </param>
		/// <param name="field">The name of the field to load
		/// </param>
		/// <param name="mapper">The {@link TermVectorMapper} to process the vector.  Must not be null
		/// </param>
		/// <throws>  IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified. </throws>
		/// <summary> 
		/// </summary>
		abstract public void  GetTermFreqVector(int docNumber, System.String field, TermVectorMapper mapper);
示例#20
0
 public override void  GetTermFreqVector(int docNumber, TermVectorMapper mapper)
 {
     EnsureOpen();
     in_Renamed.GetTermFreqVector(docNumber, mapper);
 }
示例#21
0
        /// <summary> </summary>
        /// <param name="field">The field to read in
        /// </param>
        /// <param name="tvfPointer">The pointer within the tvf file where we should start reading
        /// </param>
        /// <param name="mapper">The mapper used to map the TermVector
        /// </param>
        /// <throws>  IOException </throws>
        private void  ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
        {
            // Now read the data from specified position
            //We don't need to offset by the FORMAT here since the pointer already includes the offset
            tvf.Seek(tvfPointer);

            int numTerms = tvf.ReadVInt();

            //System.out.println("Num Terms: " + numTerms);
            // If no terms - return a constant empty termvector. However, this should never occur!
            if (numTerms == 0)
            {
                return;
            }

            bool storePositions;
            bool storeOffsets;

            if (format >= FORMAT_VERSION)
            {
                byte bits = tvf.ReadByte();
                storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
                storeOffsets   = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
            }
            else
            {
                tvf.ReadVInt();
                storePositions = false;
                storeOffsets   = false;
            }
            mapper.SetExpectations(field, numTerms, storeOffsets, storePositions);
            int start       = 0;
            int deltaLength = 0;
            int totalLength = 0;

            byte[] byteBuffer;
            char[] charBuffer;
            bool   preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;

            // init the buffers
            if (preUTF8)
            {
                charBuffer = new char[10];
                byteBuffer = null;
            }
            else
            {
                charBuffer = null;
                byteBuffer = new byte[20];
            }

            for (int i = 0; i < numTerms; i++)
            {
                start       = tvf.ReadVInt();
                deltaLength = tvf.ReadVInt();
                totalLength = start + deltaLength;

                System.String term;

                if (preUTF8)
                {
                    // Term stored as java chars
                    if (charBuffer.Length < totalLength)
                    {
                        char[] newCharBuffer = new char[(int)(1.5 * totalLength)];
                        Array.Copy(charBuffer, 0, newCharBuffer, 0, start);
                        charBuffer = newCharBuffer;
                    }
                    tvf.ReadChars(charBuffer, start, deltaLength);
                    term = new System.String(charBuffer, 0, totalLength);
                }
                else
                {
                    // Term stored as utf8 bytes
                    if (byteBuffer.Length < totalLength)
                    {
                        byte[] newByteBuffer = new byte[(int)(1.5 * totalLength)];
                        Array.Copy(byteBuffer, 0, newByteBuffer, 0, start);
                        byteBuffer = newByteBuffer;
                    }
                    tvf.ReadBytes(byteBuffer, start, deltaLength);
                    term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength);
                }
                int   freq      = tvf.ReadVInt();
                int[] positions = null;
                if (storePositions)
                {
                    //read in the positions
                    //does the mapper even care about positions?
                    if (mapper.IsIgnoringPositions() == false)
                    {
                        positions = new int[freq];
                        int prevPosition = 0;
                        for (int j = 0; j < freq; j++)
                        {
                            positions[j] = prevPosition + tvf.ReadVInt();
                            prevPosition = positions[j];
                        }
                    }
                    else
                    {
                        //we need to skip over the positions.  Since these are VInts, I don't believe there is anyway to know for sure how far to skip
                        //
                        for (int j = 0; j < freq; j++)
                        {
                            tvf.ReadVInt();
                        }
                    }
                }
                TermVectorOffsetInfo[] offsets = null;
                if (storeOffsets)
                {
                    //does the mapper even care about offsets?
                    if (mapper.IsIgnoringOffsets() == false)
                    {
                        offsets = new TermVectorOffsetInfo[freq];
                        int prevOffset = 0;
                        for (int j = 0; j < freq; j++)
                        {
                            int startOffset = prevOffset + tvf.ReadVInt();
                            int endOffset   = startOffset + tvf.ReadVInt();
                            offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
                            prevOffset = endOffset;
                        }
                    }
                    else
                    {
                        for (int j = 0; j < freq; j++)
                        {
                            tvf.ReadVInt();
                            tvf.ReadVInt();
                        }
                    }
                }
                mapper.Map(term, freq, offsets, positions);
            }
        }
示例#22
0
		public override void  GetTermFreqVector(int docNumber, System.String field, TermVectorMapper mapper)
		{
			EnsureOpen();
			in_Renamed.GetTermFreqVector(docNumber, field, mapper);
		}