Ejemplo n.º 1
0
        /// <summary>
        /// Builds a term vector from the current <c>field</c>, <c>terms</c> and <c>termFreqs</c> values,
        /// including <c>positions</c> and <c>offsets</c> when either is being stored.
        /// </summary>
        /// <returns>
        /// A <see cref="SegmentTermPositionVector"/> when positions or offsets are being stored,
        /// a plain <see cref="SegmentTermVector"/> otherwise, or <c>null</c> when either
        /// <c>field</c> or <c>terms</c> is <c>null</c>.
        /// </returns>
        public virtual TermFreqVector MaterializeVector()
        {
            // Without both a field and its terms there is nothing to build.
            if (field == null || terms == null)
            {
                return null;
            }

            // Neither positions nor offsets requested: the frequency-only vector suffices.
            if (!storingPositions && !storingOffsets)
            {
                return new SegmentTermVector(field, terms, termFreqs);
            }

            return new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Reads a single term vector from the <c>tvf</c> input, starting at the given position.
        /// </summary>
        /// <param name="field">The field to read in; passed through to the returned vector.</param>
        /// <param name="tvfPointer">The position within <c>tvf</c> to seek to before reading.</param>
        /// <returns>
        /// A <see cref="SegmentTermPositionVector"/> when the stored flags indicate positions or offsets,
        /// otherwise a <see cref="SegmentTermVector"/>. When the stored term count is zero, a
        /// <see cref="SegmentTermVector"/> constructed with <c>null</c> term and frequency arrays.
        /// </returns>
        /// <exception cref="System.IO.IOException">
        /// May propagate from reads on <c>tvf</c> (carried over from the original documentation;
        /// not verifiable from this method alone).
        /// </exception>
        private SegmentTermVector ReadTermVector(System.String field, long tvfPointer)
        {
            // The pointer already includes the format offset, so seek to it as-is.
            tvf.Seek(tvfPointer);

            int termCount = tvf.ReadVInt();
            if (termCount == 0)
            {
                // Not expected to happen: hand back an empty vector with null arrays.
                return new SegmentTermVector(field, null, null);
            }

            bool hasPositions = false;
            bool hasOffsets = false;
            if (tvfFormat != TermVectorsWriter.FORMAT_VERSION)
            {
                // Any other format: one VInt is consumed and ignored, and neither
                // positions nor offsets are read.
                tvf.ReadVInt();
            }
            else
            {
                // A single flag byte says whether positions and/or offsets follow each term.
                byte flags = tvf.ReadByte();
                hasPositions = (flags & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0;
                hasOffsets = (flags & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0;
            }

            string[] termTexts = new string[termCount];
            int[] frequencies = new int[termCount];

            // Only allocated when the corresponding data is present; otherwise left null.
            int[][] termPositions = hasPositions ? new int[termCount][] : null;
            TermVectorOffsetInfo[][] termOffsets = hasOffsets ? new TermVectorOffsetInfo[termCount][] : null;

            char[] chars = new char[10];        // working buffer, grown on demand
            char[] priorChars = new char[0];    // buffer that held the previous term

            for (int t = 0; t < termCount; t++)
            {
                // Each term is stored as: number of leading chars reused from the
                // previous term's buffer, then the number of new chars that follow.
                int prefixLength = tvf.ReadVInt();
                int suffixLength = tvf.ReadVInt();
                int termLength = prefixLength + suffixLength;

                if (termLength > chars.Length)
                {
                    // Buffer too small: allocate a larger one and carry the reused prefix over.
                    char[] grown = new char[termLength];
                    if (prefixLength > 0)
                    {
                        Array.Copy(priorChars, 0, grown, 0, prefixLength);
                    }
                    chars = grown;
                }

                tvf.ReadChars(chars, prefixLength, suffixLength);
                termTexts[t] = new string(chars, 0, termLength);
                priorChars = chars;

                int freq = tvf.ReadVInt();
                frequencies[t] = freq;

                if (hasPositions)
                {
                    // Positions are stored as deltas; accumulate them into absolute values.
                    int[] decoded = new int[freq];
                    termPositions[t] = decoded;
                    int running = 0;
                    for (int p = 0; p < freq; p++)
                    {
                        running += tvf.ReadVInt();
                        decoded[p] = running;
                    }
                }

                if (hasOffsets)
                {
                    // Each start is a delta from the previous end; each end is a delta from its start.
                    TermVectorOffsetInfo[] decoded = new TermVectorOffsetInfo[freq];
                    termOffsets[t] = decoded;
                    int cursor = 0;
                    for (int o = 0; o < freq; o++)
                    {
                        int begin = cursor + tvf.ReadVInt();
                        cursor = begin + tvf.ReadVInt();
                        decoded[o] = new TermVectorOffsetInfo(begin, cursor);
                    }
                }
            }

            if (!hasPositions && !hasOffsets)
            {
                return new SegmentTermVector(field, termTexts, frequencies);
            }

            return new SegmentTermPositionVector(field, termTexts, frequencies, termPositions, termOffsets);
        }
Ejemplo n.º 3
0
		/// <summary>
		/// Creates a term vector out of the current <c>field</c>, <c>terms</c> and <c>termFreqs</c>
		/// values, adding <c>positions</c> and <c>offsets</c> when either is being stored.
		/// </summary>
		/// <returns>
		/// <c>null</c> when <c>field</c> or <c>terms</c> is <c>null</c>; otherwise a
		/// <see cref="SegmentTermPositionVector"/> if positions or offsets are being stored,
		/// else a <see cref="SegmentTermVector"/>.
		/// </returns>
		public virtual TermFreqVector MaterializeVector()
		{
			// A vector can only be produced once both the field and its terms are known.
			if (field == null || terms == null)
			{
				return null;
			}

			bool needsPositionVector = storingPositions || storingOffsets;
			return needsPositionVector
				? new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets)
				: new SegmentTermVector(field, terms, termFreqs);
		}
Ejemplo n.º 4
0
        /// <summary>
        /// Seeks the <c>tvf</c> input to the given position and decodes the term vector stored there.
        /// </summary>
        /// <param name="field">The field to read in; handed unchanged to the resulting vector.</param>
        /// <param name="tvfPointer">The pointer within <c>tvf</c> where reading should start.</param>
        /// <returns>
        /// The term vector located at that position: a <see cref="SegmentTermPositionVector"/> when
        /// the stored flags indicate positions or offsets, otherwise a <see cref="SegmentTermVector"/>.
        /// A stored term count of zero yields a <see cref="SegmentTermVector"/> with <c>null</c> arrays.
        /// </returns>
        /// <exception cref="System.IO.IOException">
        /// May propagate from reads on <c>tvf</c> (carried over from the original documentation;
        /// not verifiable from this method alone).
        /// </exception>
        private SegmentTermVector ReadTermVector(System.String field, long tvfPointer)
        {
            // No extra adjustment for the format header: the pointer already includes it.
            tvf.Seek(tvfPointer);

            int totalTerms = tvf.ReadVInt();
            if (totalTerms == 0)
            {
                // Should never occur; answer with an empty vector (null arrays) if it does.
                return new SegmentTermVector(field, null, null);
            }

            bool withPositions = false;
            bool withOffsets = false;
            if (tvfFormat != TermVectorsWriter.FORMAT_VERSION)
            {
                // Any other format: a single VInt is read and discarded, and the
                // vector carries neither positions nor offsets.
                tvf.ReadVInt();
            }
            else
            {
                // The flag byte records which optional per-term data follows.
                byte storedBits = tvf.ReadByte();
                withPositions = (storedBits & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0;
                withOffsets = (storedBits & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0;
            }

            string[] termStrings = new string[totalTerms];
            int[] freqs = new int[totalTerms];

            // These stay null unless the flags say the data is present.
            int[][] positionTable = withPositions ? new int[totalTerms][] : null;
            TermVectorOffsetInfo[][] offsetTable = withOffsets ? new TermVectorOffsetInfo[totalTerms][] : null;

            char[] scratch = new char[10];          // starts with room for 10 characters
            char[] previousScratch = new char[0];   // buffer used for the preceding term

            for (int termIndex = 0; termIndex < totalTerms; termIndex++)
            {
                // Stored per term: how many leading chars are reused from the previous
                // term's buffer, followed by how many fresh chars to read.
                int reusedChars = tvf.ReadVInt();
                int freshChars = tvf.ReadVInt();
                int length = reusedChars + freshChars;

                if (length > scratch.Length)
                {
                    // Grow the buffer; the reused prefix has to be copied into the new array.
                    char[] larger = new char[length];
                    if (reusedChars > 0)
                    {
                        Array.Copy(previousScratch, 0, larger, 0, reusedChars);
                    }
                    scratch = larger;
                }

                tvf.ReadChars(scratch, reusedChars, freshChars);
                termStrings[termIndex] = new string(scratch, 0, length);
                previousScratch = scratch;

                int freq = tvf.ReadVInt();
                freqs[termIndex] = freq;

                if (withPositions)
                {
                    // Delta-encoded positions: keep a running total to recover absolute values.
                    int[] absolute = new int[freq];
                    positionTable[termIndex] = absolute;
                    int total = 0;
                    for (int k = 0; k < freq; k++)
                    {
                        total += tvf.ReadVInt();
                        absolute[k] = total;
                    }
                }

                if (withOffsets)
                {
                    // Start offsets are relative to the previous end; end offsets to their own start.
                    TermVectorOffsetInfo[] infos = new TermVectorOffsetInfo[freq];
                    offsetTable[termIndex] = infos;
                    int lastEnd = 0;
                    for (int k = 0; k < freq; k++)
                    {
                        int from = lastEnd + tvf.ReadVInt();
                        lastEnd = from + tvf.ReadVInt();
                        infos[k] = new TermVectorOffsetInfo(from, lastEnd);
                    }
                }
            }

            if (!withPositions && !withOffsets)
            {
                return new SegmentTermVector(field, termStrings, freqs);
            }

            return new SegmentTermPositionVector(field, termStrings, freqs, positionTable, offsetTable);
        }