Example #1
0
		/// <summary> Creates an entry holding the term vector data of a single term. </summary>
		/// <param name="field">Value stored in this entry's field member
		/// </param>
		/// <param name="term">Value stored in this entry's term member
		/// </param>
		/// <param name="frequency">Value stored in this entry's frequency member
		/// </param>
		/// <param name="offsets">Offset array; the reference is stored as given (no copy is made)
		/// </param>
		/// <param name="positions">Position array; the reference is stored as given (no copy is made)
		/// </param>
		public TermVectorEntry(System.String field, System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
		{
			this.field = field;
			this.term = term;
			this.frequency = frequency;
			this.offsets = offsets;
			this.positions = positions;
		}
Example #2
0
 /// <summary> Records a term and, when an offsets collection exists, its offset info as well. </summary>
 /// <param name="term">The term appended to the terms collection
 /// </param>
 /// <param name="info">The offset info appended to the offsets collection; it is added exactly as given
 /// </param>
 internal virtual void  addTerm(System.String term, TermVectorOffsetInfo info)
 {
     terms.Add(term);

     // No offsets collection means there is nowhere to record the info.
     if (offsets == null)
     {
         return;
     }

     offsets.Add(info);
 }
		/// <summary> Callback for the TermVectorReader. Files the term under every position it
		/// occurs at, creating the per-position record the first time a position is seen. </summary>
		/// <param name="term">The term handed to each per-position record
		/// </param>
		/// <param name="frequency">Not used by this implementation
		/// </param>
		/// <param name="offsets">When non-null, offsets[i] is recorded together with positions[i]; when null, a null offset is passed on
		/// </param>
		/// <param name="positions">The positions of the term; dereferenced here without a null check
		/// </param>
		public override void  Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
		{
			for (int slot = 0; slot < positions.Length; slot++)
			{
				int position = positions[slot];

				// Look up the record for this position, creating and registering it on first use.
				TVPositionInfo bucket = (TVPositionInfo) currentPositions[position];
				if (bucket == null)
				{
					bucket = new TVPositionInfo(position, storeOffsets);
					currentPositions[position] = bucket;
				}

				TermVectorOffsetInfo offsetInfo = null;
				if (offsets != null)
				{
					offsetInfo = offsets[slot];
				}
				bucket.addTerm(term, offsetInfo);
			}
		}
Example #4
0
        /// <summary> Maps a term, combining it with any entry previously recorded for the same term:
        /// the frequency is accumulated and new offsets/positions are appended to the stored ones. </summary>
        /// <param name="term">The term to map
        /// </param>
        /// <param name="frequency">The frequency of the term; added to the stored frequency when the term was seen before
        /// </param>
        /// <param name="offsets">Offset information, may be null
        /// </param>
        /// <param name="positions">Position information, may be null
        /// </param>
        public override void  Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
        {
            TermVectorEntry known = (TermVectorEntry) termToTVE[term];

            if (known == null)
            {
                // First mention of the term: keep only the arrays this mapper stores.
                TermVectorOffsetInfo[] keptOffsets = storeOffsets ? offsets : null;
                int[] keptPositions = storePositions ? positions : null;
                TermVectorEntry created = new TermVectorEntry(ALL, term, frequency, keptOffsets, keptPositions);
                termToTVE[term] = created;
                currentSet.Add(created, created);
                return;
            }

            // Repeat mention: accumulate the frequency, then append whatever new data arrived.
            known.SetFrequency(known.GetFrequency() + frequency);

            if (storeOffsets)
            {
                TermVectorOffsetInfo[] priorOffsets = known.GetOffsets();
                // Null or empty incoming offsets leave the entry untouched.
                if (offsets != null && offsets.Length > 0)
                {
                    if (priorOffsets == null)
                    {
                        known.SetOffsets(offsets);
                    }
                    else
                    {
                        // Stored offsets first, incoming offsets after them.
                        TermVectorOffsetInfo[] mergedOffsets = new TermVectorOffsetInfo[priorOffsets.Length + offsets.Length];
                        Array.Copy(priorOffsets, 0, mergedOffsets, 0, priorOffsets.Length);
                        Array.Copy(offsets, 0, mergedOffsets, priorOffsets.Length, offsets.Length);
                        known.SetOffsets(mergedOffsets);
                    }
                }
            }

            if (storePositions)
            {
                int[] priorPositions = known.GetPositions();
                // Null or empty incoming positions leave the entry untouched.
                if (positions != null && positions.Length > 0)
                {
                    if (priorPositions == null)
                    {
                        known.SetPositions(positions);
                    }
                    else
                    {
                        // Stored positions first, incoming positions after them.
                        int[] mergedPositions = new int[priorPositions.Length + positions.Length];
                        Array.Copy(priorPositions, 0, mergedPositions, 0, priorPositions.Length);
                        Array.Copy(positions, 0, mergedPositions, priorPositions.Length, positions.Length);
                        known.SetPositions(mergedPositions);
                    }
                }
            }
        }
Example #5
0
		/// <summary> Maps a term. A term seen for the first time gets a fresh entry; a term seen
		/// before has its frequency increased and its offsets/positions extended. </summary>
		/// <param name="term">The term to map
		/// </param>
		/// <param name="frequency">The frequency of the term; summed with the stored frequency on repeat mentions
		/// </param>
		/// <param name="offsets">Offset information, may be null
		/// </param>
		/// <param name="positions">Position information, may be null
		/// </param>
		public override void  Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
		{
			TermVectorEntry existing = (TermVectorEntry) termToTVE[term];

			if (existing == null)
			{
				// New term: arrays this mapper does not store are replaced by null.
				TermVectorOffsetInfo[] initialOffsets = storeOffsets ? offsets : null;
				int[] initialPositions = storePositions ? positions : null;
				TermVectorEntry fresh = new TermVectorEntry(ALL, term, frequency, initialOffsets, initialPositions);
				termToTVE[term] = fresh;
				currentSet.Add(fresh, fresh);
				return;
			}

			existing.SetFrequency(existing.GetFrequency() + frequency);

			if (storeOffsets)
			{
				TermVectorOffsetInfo[] storedOffsets = existing.GetOffsets();
				bool haveNewOffsets = offsets != null && offsets.Length > 0;
				if (haveNewOffsets && storedOffsets != null)
				{
					// Append the incoming offsets behind the stored ones.
					TermVectorOffsetInfo[] combinedOffsets = new TermVectorOffsetInfo[storedOffsets.Length + offsets.Length];
					Array.Copy(storedOffsets, 0, combinedOffsets, 0, storedOffsets.Length);
					Array.Copy(offsets, 0, combinedOffsets, storedOffsets.Length, offsets.Length);
					existing.SetOffsets(combinedOffsets);
				}
				else if (haveNewOffsets)
				{
					// Nothing stored yet: adopt the incoming array directly.
					existing.SetOffsets(offsets);
				}
				// otherwise there is nothing new to record
			}

			if (storePositions)
			{
				int[] storedPositions = existing.GetPositions();
				bool haveNewPositions = positions != null && positions.Length > 0;
				if (haveNewPositions && storedPositions != null)
				{
					// Append the incoming positions behind the stored ones.
					int[] combinedPositions = new int[storedPositions.Length + positions.Length];
					Array.Copy(storedPositions, 0, combinedPositions, 0, storedPositions.Length);
					Array.Copy(positions, 0, combinedPositions, storedPositions.Length, positions.Length);
					existing.SetPositions(combinedPositions);
				}
				else if (haveNewPositions)
				{
					// Nothing stored yet: adopt the incoming array directly.
					existing.SetPositions(positions);
				}
			}
		}
Example #6
0
        /// <summary> Two TermVectorOffsetInfos are equal when both their start and end offsets match. </summary>
        /// <param name="o">The object to compare against
        /// </param>
        /// <returns> true if o is this same instance, or is a TermVectorOffsetInfo whose start offset
        /// and end offset both equal this instance's; false otherwise (including when o is null).
        /// </returns>
        public override bool Equals(System.Object o)
        {
            // Same instance: trivially equal.
            if (this == o)
            {
                return true;
            }

            // A null reference or an object of another type fails the type test.
            if (o is TermVectorOffsetInfo)
            {
                TermVectorOffsetInfo other = (TermVectorOffsetInfo) o;
                return endOffset == other.endOffset && startOffset == other.startOffset;
            }

            return false;
        }
Example #7
0
        /// <summary> Reads one term vector from the tvf stream, starting at the given pointer, and
        /// hands every term it contains to the mapper. </summary>
        /// <param name="field">The field to read in; forwarded to the mapper's SetExpectations
        /// </param>
        /// <param name="tvfPointer">The pointer within the tvf file where we should start reading
        /// </param>
        /// <param name="mapper">The mapper used to map the TermVector; its Map method is called once per term
        /// </param>
        /// <throws>  IOException </throws>
        private void  ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
        {
            // The pointer already includes the FORMAT offset, so seek to it directly.
            tvf.Seek(tvfPointer);

            int termCount = tvf.ReadVInt();
            if (termCount == 0)
            {
                // Should never occur; nothing is reported to the mapper in this case.
                return;
            }

            bool storePositions = false;
            bool storeOffsets = false;
            if (format < FORMAT_VERSION)
            {
                // Older format: one VInt is consumed and discarded; positions and offsets stay off.
                tvf.ReadVInt();
            }
            else
            {
                byte flags = tvf.ReadByte();
                storePositions = (flags & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
                storeOffsets = (flags & STORE_OFFSET_WITH_TERMVECTOR) != 0;
            }
            mapper.SetExpectations(field, termCount, storeOffsets, storePositions);

            bool termsStoredAsChars = format < FORMAT_UTF8_LENGTH_IN_BYTES;

            // Only the buffer matching the on-disk term encoding is allocated.
            char[] charBuffer = termsStoredAsChars ? new char[10] : null;
            byte[] byteBuffer = termsStoredAsChars ? null : new byte[20];

            for (int termIndex = 0; termIndex < termCount; termIndex++)
            {
                // The first prefixLength elements already in the buffer are reused;
                // only the suffixLength elements after them are read from the stream.
                int prefixLength = tvf.ReadVInt();
                int suffixLength = tvf.ReadVInt();
                int termLength = prefixLength + suffixLength;

                System.String term;
                if (termsStoredAsChars)
                {
                    // Term stored as java chars
                    if (termLength > charBuffer.Length)
                    {
                        // Grow by 1.5x, carrying over the reused prefix.
                        char[] grownChars = new char[(int) (1.5 * termLength)];
                        Array.Copy(charBuffer, 0, grownChars, 0, prefixLength);
                        charBuffer = grownChars;
                    }
                    tvf.ReadChars(charBuffer, prefixLength, suffixLength);
                    term = new System.String(charBuffer, 0, termLength);
                }
                else
                {
                    // Term stored as utf8 bytes
                    if (termLength > byteBuffer.Length)
                    {
                        // Grow by 1.5x, carrying over the reused prefix.
                        byte[] grownBytes = new byte[(int) (1.5 * termLength)];
                        Array.Copy(byteBuffer, 0, grownBytes, 0, prefixLength);
                        byteBuffer = grownBytes;
                    }
                    tvf.ReadBytes(byteBuffer, prefixLength, suffixLength);
                    term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, termLength);
                }

                int freq = tvf.ReadVInt();

                int[] positions = null;
                if (storePositions)
                {
                    if (mapper.IsIgnoringPositions())
                    {
                        // The values are VInts, so skipping them means reading and discarding each one.
                        for (int skipped = 0; skipped < freq; skipped++)
                        {
                            tvf.ReadVInt();
                        }
                    }
                    else
                    {
                        // Each stored value is added to the previous position.
                        positions = new int[freq];
                        int runningPosition = 0;
                        for (int p = 0; p < freq; p++)
                        {
                            runningPosition += tvf.ReadVInt();
                            positions[p] = runningPosition;
                        }
                    }
                }

                TermVectorOffsetInfo[] offsets = null;
                if (storeOffsets)
                {
                    if (mapper.IsIgnoringOffsets())
                    {
                        // Two VInts per occurrence are read and discarded.
                        for (int skipped = 0; skipped < freq; skipped++)
                        {
                            tvf.ReadVInt();
                            tvf.ReadVInt();
                        }
                    }
                    else
                    {
                        // Start is stored relative to the previous end; end relative to its own start.
                        offsets = new TermVectorOffsetInfo[freq];
                        int previousEnd = 0;
                        for (int k = 0; k < freq; k++)
                        {
                            int begin = previousEnd + tvf.ReadVInt();
                            previousEnd = begin + tvf.ReadVInt();
                            offsets[k] = new TermVectorOffsetInfo(begin, previousEnd);
                        }
                    }
                }

                mapper.Map(term, freq, offsets, positions);
            }
        }
Example #8
0
		/// <summary> Map the Term Vector information for a single term into your own structure.
		/// How the data is stored is left entirely to the implementing class.</summary>
		/// <param name="term">The term to add to the vector
		/// </param>
		/// <param name="frequency">The frequency of the term in the document
		/// </param>
		/// <param name="offsets">null if the offsets are not specified, otherwise the offsets of the term within the field
		/// </param>
		/// <param name="positions">null if the positions are not specified, otherwise the positions of the term within the field
		/// </param>
		public abstract void  Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions);
Example #9
0
		/// <summary> Replaces the stored offsets with the given array. The reference is kept as given; no copy is made. </summary>
		/// <param name="offsets">The new offsets array
		/// </param>
		internal virtual void  SetOffsets(TermVectorOffsetInfo[] offsets)
		{
			this.offsets = offsets;
		}
		/// <summary> Creates a term vector that additionally carries position and offset arrays. </summary>
		/// <param name="field">Forwarded to the base constructor
		/// </param>
		/// <param name="terms">Forwarded to the base constructor
		/// </param>
		/// <param name="termFreqs">Forwarded to the base constructor
		/// </param>
		/// <param name="positions">Stored as given (no copy); presumably one position array per term, TODO confirm against callers
		/// </param>
		/// <param name="offsets">Stored as given (no copy); presumably one offset array per term, TODO confirm against callers
		/// </param>
		public SegmentTermPositionVector(System.String field, System.String[] terms, int[] termFreqs, int[][] positions, TermVectorOffsetInfo[][] offsets):base(field, terms, termFreqs)
		{
			this.offsets = offsets;
			this.positions = positions;
		}
			/// <summary> Appends the term to the terms collection and, if an offsets collection
			/// is present, appends the offset info to it. </summary>
			/// <param name="term">The term to append
			/// </param>
			/// <param name="info">The offset info to append; it is added exactly as given
			/// </param>
			internal virtual void  addTerm(System.String term, TermVectorOffsetInfo info)
			{
				terms.Add(term);

				// Without an offsets collection there is nothing more to do.
				if (offsets == null)
				{
					return;
				}

				offsets.Add(info);
			}
		/// <summary> Wraps the term's data in a new entry tagged with the current field and adds
		/// that entry to the current set. </summary>
		/// <param name="term">The term stored in the new entry
		/// </param>
		/// <param name="frequency">The frequency stored in the new entry
		/// </param>
		/// <param name="offsets">The offsets stored in the new entry, passed through unchanged
		/// </param>
		/// <param name="positions">The positions stored in the new entry, passed through unchanged
		/// </param>
		public override void  Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
		{
			TermVectorEntry mapped = new TermVectorEntry(currentField, term, frequency, offsets, positions);

			// The same entry is supplied as both arguments of Add.
			currentSet.Add(mapped, mapped);
		}
Example #13
0
		/// <summary> Stores the term's data at the current position of the parallel arrays and
		/// then advances the current position by one. </summary>
		/// <param name="term">The term written to the terms array
		/// </param>
		/// <param name="frequency">The frequency written to the termFreqs array
		/// </param>
		/// <param name="offsets">Written to the offsets array only when offsets are being stored
		/// </param>
		/// <param name="positions">Written to the positions array only when positions are being stored
		/// </param>
		public override void  Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
		{
			int slot = currentPosition;

			terms[slot] = term;
			termFreqs[slot] = frequency;

			if (storingOffsets)
			{
				this.offsets[slot] = offsets;
			}

			if (storingPositions)
			{
				this.positions[slot] = positions;
			}

			// Advance only after every write for this slot has succeeded.
			currentPosition = slot + 1;
		}
Example #14
0
		/// <summary> Reads the term vector located at the given pointer of the tvf stream and
		/// passes each of its terms, with frequency and optional positions/offsets, to the mapper. </summary>
		/// <param name="field">The field to read in; forwarded to the mapper's SetExpectations
		/// </param>
		/// <param name="tvfPointer">The pointer within the tvf file where we should start reading
		/// </param>
		/// <param name="mapper">The mapper used to map the TermVector; its Map method is called once per term
		/// </param>
		/// <throws>  IOException </throws>
		private void  ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
		{
			// No FORMAT adjustment is needed: the pointer already includes that offset.
			tvf.Seek(tvfPointer);

			int termTotal = tvf.ReadVInt();
			if (termTotal == 0)
			{
				// Should never occur; the mapper is not notified in this case.
				return;
			}

			bool storePositions = false;
			bool storeOffsets = false;
			if (format < FORMAT_VERSION)
			{
				// Older format: a single VInt is read and thrown away; both flags remain false.
				tvf.ReadVInt();
			}
			else
			{
				byte flagBits = tvf.ReadByte();
				storePositions = (flagBits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
				storeOffsets = (flagBits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
			}
			mapper.SetExpectations(field, termTotal, storeOffsets, storePositions);

			bool legacyChars = format < FORMAT_UTF8_LENGTH_IN_BYTES;

			// Allocate just the buffer that matches how terms are encoded on disk.
			char[] charBuffer = legacyChars ? new char[10] : null;
			byte[] byteBuffer = legacyChars ? null : new byte[20];

			for (int n = 0; n < termTotal; n++)
			{
				// keep is how much of the buffer's current content is reused;
				// extra is how many new elements follow in the stream.
				int keep = tvf.ReadVInt();
				int extra = tvf.ReadVInt();
				int fullLength = keep + extra;

				System.String term;
				if (legacyChars)
				{
					// Term stored as java chars
					if (fullLength > charBuffer.Length)
					{
						// Enlarge to 1.5x the needed length, preserving the reused part.
						char[] biggerChars = new char[(int) (1.5 * fullLength)];
						Array.Copy(charBuffer, 0, biggerChars, 0, keep);
						charBuffer = biggerChars;
					}
					tvf.ReadChars(charBuffer, keep, extra);
					term = new System.String(charBuffer, 0, fullLength);
				}
				else
				{
					// Term stored as utf8 bytes
					if (fullLength > byteBuffer.Length)
					{
						// Enlarge to 1.5x the needed length, preserving the reused part.
						byte[] biggerBytes = new byte[(int) (1.5 * fullLength)];
						Array.Copy(byteBuffer, 0, biggerBytes, 0, keep);
						byteBuffer = biggerBytes;
					}
					tvf.ReadBytes(byteBuffer, keep, extra);
					term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, fullLength);
				}

				int freq = tvf.ReadVInt();

				int[] positions = null;
				if (storePositions)
				{
					if (mapper.IsIgnoringPositions())
					{
						// VInts have no fixed width, so each one is read and dropped to skip it.
						for (int s = 0; s < freq; s++)
						{
							tvf.ReadVInt();
						}
					}
					else
					{
						// Positions are stored as differences from the preceding position.
						positions = new int[freq];
						int current = 0;
						for (int p = 0; p < freq; p++)
						{
							current += tvf.ReadVInt();
							positions[p] = current;
						}
					}
				}

				TermVectorOffsetInfo[] offsets = null;
				if (storeOffsets)
				{
					if (mapper.IsIgnoringOffsets())
					{
						// Skip both VInts of every occurrence.
						for (int s = 0; s < freq; s++)
						{
							tvf.ReadVInt();
							tvf.ReadVInt();
						}
					}
					else
					{
						// Start is relative to the previous end offset; end is relative to its start.
						offsets = new TermVectorOffsetInfo[freq];
						int lastEnd = 0;
						for (int k = 0; k < freq; k++)
						{
							int first = lastEnd + tvf.ReadVInt();
							lastEnd = first + tvf.ReadVInt();
							offsets[k] = new TermVectorOffsetInfo(first, lastEnd);
						}
					}
				}

				mapper.Map(term, freq, offsets, positions);
			}
		}