/// <summary>
/// Builds the token, position and offset fixtures.
/// Term i (for every term except the last) receives 2*i+1 consecutive positions drawn
/// from the running numPositions counter and offsets (k, k+1). The last term is held
/// back and given the single position 0, i.e. the same position as "here".
/// </summary>
public override void  SetUp()
 {
     base.SetUp();

     tokens = new System.String[] { "here", "is", "some", "text", "to", "test", "extra" };
     int termCount = tokens.Length;
     int heldBack  = termCount - 1; // index of the term that reuses an existing position

     thePositions = new int[termCount][];
     offsets      = new TermVectorOffsetInfo[termCount][];
     numPositions = 0;

     for (int termIdx = 0; termIdx < heldBack; termIdx++)
     {
         int occurrences = 2 * termIdx + 1;
         int[] termPositions = new int[occurrences];
         TermVectorOffsetInfo[] termOffsets = new TermVectorOffsetInfo[occurrences];
         for (int k = 0; k < occurrences; k++)
         {
             termPositions[k] = numPositions++;
             // the concrete offset values are irrelevant to the tests
             termOffsets[k] = new TermVectorOffsetInfo(k, k + 1);
         }
         thePositions[termIdx] = termPositions;
         offsets[termIdx]      = termOffsets;
     }

     // The held-back term shares position 0 with "here".
     thePositions[heldBack] = new int[] { 0 };
     offsets[heldBack]      = new TermVectorOffsetInfo[] { new TermVectorOffsetInfo(0, 1) };
 }
		/// <summary>
		/// Prepares the fixture arrays: seven terms, where all but the final one are given
		/// 2*i+1 sequentially numbered positions (and matching dummy offsets), and the
		/// final term is pinned to position 0 so it overlaps with "here".
		/// </summary>
		public override void  SetUp()
		{
			base.SetUp();
			tokens = new System.String[]{"here", "is", "some", "text", "to", "test", "extra"};
			int last = tokens.Length - 1;
			thePositions = new int[tokens.Length][];
			offsets = new TermVectorOffsetInfo[tokens.Length][];
			numPositions = 0;
			// the last term is handled after the loop so its position is predictable
			int current = 0;
			while (current < last)
			{
				int size = 2 * current + 1;
				int[] positionsForTerm = new int[size];
				for (int p = 0; p < size; p++)
				{
					positionsForTerm[p] = numPositions++;
				}
				thePositions[current] = positionsForTerm;
				TermVectorOffsetInfo[] offsetsForTerm = new TermVectorOffsetInfo[size];
				for (int q = 0; q < size; q++)
				{
					// offset values are arbitrary
					offsetsForTerm[q] = new TermVectorOffsetInfo(q, q + 1);
				}
				offsets[current] = offsetsForTerm;
				current++;
			}
			int[] sharedPosition = new int[1];
			sharedPosition[0] = 0; // same position as "here"
			thePositions[last] = sharedPosition;
			TermVectorOffsetInfo[] sharedOffset = new TermVectorOffsetInfo[1];
			sharedOffset[0] = new TermVectorOffsetInfo(0, 1);
			offsets[last] = sharedOffset;
		}
        /// <summary>
        /// Reads the term vector of document 0 for the first test field and checks that the
        /// terms, positions and offsets returned by the reader match the expected fixtures
        /// (testTerms, this.positions, this.offsets).
        /// </summary>
        public virtual void  TestOffsetReader()
        {
            TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);

            Assert.IsNotNull(reader);
            TermPositionVector vector = (TermPositionVector)reader.Get(0, testFields[0]);

            Assert.IsNotNull(vector);
            System.String[] terms = vector.GetTerms();
            Assert.IsNotNull(terms);
            // AreEqual (rather than IsTrue(a == b)) reports both values when the check fails.
            Assert.AreEqual(testTerms.Length, terms.Length);
            for (int i = 0; i < terms.Length; i++)
            {
                Assert.AreEqual(testTerms[i], terms[i]);

                int[] positions = vector.GetTermPositions(i);
                Assert.IsNotNull(positions);
                Assert.AreEqual(this.positions[i].Length, positions.Length);
                for (int j = 0; j < positions.Length; j++)
                {
                    Assert.AreEqual(this.positions[i][j], positions[j]);
                }

                TermVectorOffsetInfo[] offset = vector.GetOffsets(i);
                Assert.IsNotNull(offset);
                Assert.AreEqual(this.offsets[i].Length, offset.Length);
                for (int j = 0; j < offset.Length; j++)
                {
                    // compared through TermVectorOffsetInfo.Equals
                    Assert.AreEqual(this.offsets[i][j], offset[j]);
                }
            }
        }
 /// <summary>Creates an entry that stores the given values as-is (the arrays are not copied).</summary>
 /// <param name="field">Value stored in the field member</param>
 /// <param name="term">Value stored in the term member</param>
 /// <param name="frequency">Value stored in the frequency member</param>
 /// <param name="offsets">Offset array stored by reference</param>
 /// <param name="positions">Position array stored by reference</param>
 public TermVectorEntry(System.String field, System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
 {
     // per-occurrence data
     this.positions = positions;
     this.offsets   = offsets;

     // term identity and count
     this.frequency = frequency;
     this.term      = term;
     this.field     = field;
 }
 /// <summary>
 /// Appends the term to the terms list and, when an offsets list exists,
 /// appends the supplied offset info to it as well.
 /// </summary>
 /// <param name="term">Term to append</param>
 /// <param name="info">Offset info appended only if the offsets list is non-null</param>
 internal virtual void  addTerm(System.String term, TermVectorOffsetInfo info)
 {
     terms.Add(term);

     bool tracksOffsets = offsets != null;
     if (!tracksOffsets)
     {
         return;
     }
     offsets.Add(info);
 }
 /// <summary>
 /// Records the term only when StringUtils.AnyTermMatch accepts it against _terms.
 /// Offsets and positions are recorded only when the corresponding store flag is set;
 /// the frequency is always recorded for an accepted term.
 /// </summary>
 public override void Map(string term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
 {
     bool accepted = StringUtils.AnyTermMatch(_terms, term);
     if (!accepted)
     {
         return;
     }

     _indexMap.Add(term);

     if (_storeOffsets)
     {
         _tvoi.Add(term, offsets);
     }

     if (_storePositions)
     {
         _positions.Add(term, positions);
     }

     _frequency.Add(term, frequency);
 }
	    /// <summary>
		/// Callback for the TermVectorReader. Registers the term under each of its
		/// positions, creating the per-position TVPositionInfo on first use.
		/// </summary>
		/// <param name="term">Term added to every position it occurs at
		/// </param>
		/// <param name="frequency">Not used by this implementation
		/// </param>
		/// <param name="offsets">Per-occurrence offsets; when null, TermVectorOffsetInfo.Null is recorded instead
		/// </param>
		/// <param name="positions">Positions of the term; dereferenced without a null check
		/// </param>
		public override void  Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
		{
			for (int idx = 0; idx < positions.Length; idx++)
			{
				int position = positions[idx];

				// look up (or lazily create) the bucket for this position
				TVPositionInfo bucket = currentPositions[position];
				if (bucket == null)
				{
					bucket = new TVPositionInfo(position, storeOffsets);
					currentPositions[position] = bucket;
				}

				TermVectorOffsetInfo occurrenceOffset;
				if (offsets == null)
				{
					occurrenceOffset = TermVectorOffsetInfo.Null;
				}
				else
				{
					occurrenceOffset = offsets[idx];
				}
				bucket.addTerm(term, occurrenceOffset);
			}
		}
Exemple #8
0
        /// <summary>
        /// Maps a term into the combined ("ALL") entry table. The first mention of a term
        /// creates a new entry; later mentions add to its frequency and append any new
        /// offsets/positions to those already stored.
        /// </summary>
        /// <param name="term">The term to map
        /// </param>
        /// <param name="frequency">The frequency of the term
        /// </param>
        /// <param name="offsets">Offset information, may be null
        /// </param>
        /// <param name="positions">Position information, may be null
        /// </param>
        public override void  Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
        {
            TermVectorEntry known = termToTVE[term];

            if (known == null)
            {
                // First mention: keep only the data this mapper was asked to store.
                TermVectorOffsetInfo[] keptOffsets = storeOffsets ? offsets : null;
                int[] keptPositions = storePositions ? positions : null;
                TermVectorEntry created = new TermVectorEntry(ALL, term, frequency, keptOffsets, keptPositions);
                termToTVE[term] = created;
                currentSet.Add(created);
                return;
            }

            // Repeat mention: fold the new data into the existing entry.
            known.Frequency += frequency;

            if (storeOffsets)
            {
                TermVectorOffsetInfo[] previousOffsets = known.GetOffsets();
                bool hasNewOffsets = offsets != null && offsets.Length > 0;
                if (hasNewOffsets)
                {
                    if (previousOffsets == null)
                    {
                        known.SetOffsets(offsets);
                    }
                    else
                    {
                        // existing offsets first, then the new ones
                        TermVectorOffsetInfo[] combined = new TermVectorOffsetInfo[previousOffsets.Length + offsets.Length];
                        Array.Copy(previousOffsets, 0, combined, 0, previousOffsets.Length);
                        Array.Copy(offsets, 0, combined, previousOffsets.Length, offsets.Length);
                        known.SetOffsets(combined);
                    }
                }
                // nothing new to add: leave the entry alone
            }

            if (storePositions)
            {
                int[] previousPositions = known.GetPositions();
                bool hasNewPositions = positions != null && positions.Length > 0;
                if (hasNewPositions)
                {
                    if (previousPositions == null)
                    {
                        known.SetPositions(positions);
                    }
                    else
                    {
                        int[] combined = new int[previousPositions.Length + positions.Length];
                        Array.Copy(previousPositions, 0, combined, 0, previousPositions.Length);
                        Array.Copy(positions, 0, combined, previousPositions.Length, positions.Length);
                        known.SetPositions(combined);
                    }
                }
            }
        }
        /// <summary> Two TermVectorOffsetInfos are equal when both their start and end offsets match.</summary>
        /// <param name="o">The object to compare against
        /// </param>
        /// <returns> true if o is this same instance, or is a TermVectorOffsetInfo with the same
        /// startOffset and endOffset; false otherwise (including when o is null or of another type).
        /// </returns>
        public override bool Equals(System.Object o)
        {
            if (this == o)
            {
                return true;
            }

            if (o is TermVectorOffsetInfo)
            {
                TermVectorOffsetInfo other = (TermVectorOffsetInfo) o;
                return endOffset == other.endOffset && startOffset == other.startOffset;
            }

            return false;
        }
Exemple #10
0
 /// <summary>
 /// Creates the posting for the first occurrence of a term: frequency 1, a one-element
 /// positions array, and a one-element offsets array (or null when no offset is given).
 /// </summary>
 /// <param name="t">The term this posting belongs to</param>
 /// <param name="position">Position of the first occurrence</param>
 /// <param name="offset">Offset of the first occurrence, or null</param>
 internal Posting(Term t, int position, TermVectorOffsetInfo offset)
 {
     term      = t;
     freq      = 1;
     positions = new int[] { position };
     offsets   = (offset == null) ? null : new TermVectorOffsetInfo[] { offset };
 }
		/// <summary>
		/// Combines every mention of a term into a single entry: a new entry is created for
		/// an unseen term, otherwise the frequency is increased and new offsets/positions
		/// are appended after the ones already held by the entry.
		/// </summary>
		/// <param name="term">The term to map
		/// </param>
		/// <param name="frequency">The frequency of the term
		/// </param>
		/// <param name="offsets">Offset information, may be null
		/// </param>
		/// <param name="positions">Position information, may be null
		/// </param>
		public override void  Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
		{
			TermVectorEntry seen = (TermVectorEntry) termToTVE[term];
			if (seen != null)
			{
				seen.SetFrequency(seen.GetFrequency() + frequency);
				if (storeOffsets)
				{
					TermVectorOffsetInfo[] held = seen.GetOffsets();
					// only act when this call actually brings offsets
					if (offsets != null && offsets.Length > 0)
					{
						if (held != null)
						{
							int heldCount = held.Length;
							TermVectorOffsetInfo[] joined = new TermVectorOffsetInfo[heldCount + offsets.Length];
							Array.Copy(held, 0, joined, 0, heldCount);
							Array.Copy(offsets, 0, joined, heldCount, offsets.Length);
							seen.SetOffsets(joined);
						}
						else
						{
							seen.SetOffsets(offsets);
						}
					}
				}
				if (storePositions)
				{
					int[] heldPositions = seen.GetPositions();
					// only act when this call actually brings positions
					if (positions != null && positions.Length > 0)
					{
						if (heldPositions != null)
						{
							int heldCount = heldPositions.Length;
							int[] joined = new int[heldCount + positions.Length];
							Array.Copy(heldPositions, 0, joined, 0, heldCount);
							Array.Copy(positions, 0, joined, heldCount, positions.Length);
							seen.SetPositions(joined);
						}
						else
						{
							seen.SetPositions(positions);
						}
					}
				}
			}
			else
			{
				// unseen term: store only what this mapper is configured to keep
				TermVectorOffsetInfo[] initialOffsets = storeOffsets ? offsets : null;
				int[] initialPositions = storePositions ? positions : null;
				TermVectorEntry fresh = new TermVectorEntry(ALL, term, frequency, initialOffsets, initialPositions);
				termToTVE[term] = fresh;
				currentSet.Add(fresh, fresh);
			}
		}
Exemple #12
0
        // Reused lookup key: AddPosition overwrites it on every call instead of allocating a new Term.
        private Term termBuffer = new Term("", "");

        /// <summary>
        /// Records one occurrence of a term. If the term already has a Posting in
        /// postingTable, the position (and offset, when supplied) is appended, doubling the
        /// backing arrays when they are full; otherwise a new Posting is created.
        /// </summary>
        /// <param name="field">Field name of the term</param>
        /// <param name="text">Text of the term</param>
        /// <param name="position">Position of this occurrence</param>
        /// <param name="offset">Offset of this occurrence, or null when offsets are not recorded</param>
        private void  AddPosition(System.String field, System.String text, int position, TermVectorOffsetInfo offset)
        {
            termBuffer.Set(field, text);
            Posting ti = (Posting)postingTable[termBuffer];

            if (ti != null)
            {
                // word seen before
                int freq = ti.freq;
                if (ti.positions.Length == freq)
                {
                    // positions array is full: double it and carry the old entries over
                    int[] newPositions = new int[freq * 2];
                    System.Array.Copy(ti.positions, 0, newPositions, 0, freq);
                    ti.positions = newPositions;
                }
                ti.positions[freq] = position;                 // add new position

                if (offset != null)
                {
                    // NOTE(review): ti.offsets is dereferenced without a null check; this assumes
                    // a term's occurrences either all carry offsets or none do - confirm with callers.
                    if (ti.offsets.Length == freq)
                    {
                        TermVectorOffsetInfo[] newOffsets = new TermVectorOffsetInfo[freq * 2];
                        System.Array.Copy(ti.offsets, 0, newOffsets, 0, freq);
                        ti.offsets = newOffsets;
                    }
                    ti.offsets[freq] = offset;
                }
                ti.freq = freq + 1;                 // update frequency
            }
            else
            {
                // word not seen before: termBuffer is mutable, so a fresh Term is used as the table key
                Term term = new Term(field, text, false);
                postingTable[term] = new Posting(term, position, offset);
            }
        }
Exemple #13
0
        /// <summary> Equality is defined by the start and end offsets only.</summary>
        /// <param name="o">The comparison object
        /// </param>
        /// <returns> true when o is this instance or another TermVectorOffsetInfo whose startOffset and
        /// endOffset both equal this one's; false for null, other types, or differing offsets.
        /// </returns>
        public override bool Equals(object o)
        {
            bool sameInstance = (this == o);
            if (!sameInstance)
            {
                if (!(o is TermVectorOffsetInfo))
                {
                    return false;
                }

                TermVectorOffsetInfo rhs = (TermVectorOffsetInfo)o;
                bool sameEnd   = (endOffset == rhs.endOffset);
                bool sameStart = (startOffset == rhs.startOffset);
                return sameEnd && sameStart;
            }

            return true;
        }
 /// <summary>
 /// Wraps the supplied term data in a TermVectorEntry for the current field and adds it
 /// to currentSet, using the entry as both key and value.
 /// </summary>
 public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
 {
     TermVectorEntry mapped = new TermVectorEntry(
         currentField,
         term,
         frequency,
         offsets,
         positions);

     currentSet.Add(mapped, mapped);
 }
Exemple #15
0
 /// <summary> Map the term vector information for a single term into your own structure.</summary>
 /// <param name="term">The term to add to the vector
 /// </param>
 /// <param name="frequency">The frequency of the term in the document
 /// </param>
 /// <param name="offsets">null if the offsets are not specified, otherwise the offsets of the term within the field
 /// </param>
 /// <param name="positions">null if the positions are not specified, otherwise the positions of the term within the field
 /// </param>
 public abstract void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions);
        /// <summary>
        /// Adds a term to the currently open field after checking that both a document and
        /// a field are open; the actual work is delegated to AddTermInternal.
        /// </summary>
        /// <param name="termText">Text of the term</param>
        /// <param name="freq">Frequency of the term</param>
        /// <param name="positions">Positions passed through to AddTermInternal</param>
        /// <param name="offsets">Offsets passed through to AddTermInternal</param>
        /// <exception cref="System.InvalidOperationException">
        /// No document or no field is open. InvalidOperationException derives from
        /// SystemException, so callers catching SystemException are unaffected.
        /// </exception>
        public void AddTerm(System.String termText, int freq, int[] positions, TermVectorOffsetInfo[] offsets)
        {
            if (!IsDocumentOpen())
            {
                throw new System.InvalidOperationException("Cannot add terms when document is not open");
            }
            if (!IsFieldOpen())
            {
                throw new System.InvalidOperationException("Cannot add terms when field is not open");
            }

            AddTermInternal(termText, freq, positions, offsets);
        }
 /// <summary>Replaces the stored offsets array with the supplied one (stored by reference, may be null).</summary>
 /// <param name="value">The new offsets array</param>
 internal virtual void SetOffsets(TermVectorOffsetInfo[] value)
 {
     this.offsets = value;
 }
 /// <summary>
 /// Adds the term to the terms list. The offset info is added to the offsets list only
 /// when that list is non-null.
 /// </summary>
 /// <param name="term">Term to record</param>
 /// <param name="info">Offset info recorded alongside the term when offsets are tracked</param>
 internal virtual void  addTerm(System.String term, TermVectorOffsetInfo info)
 {
     terms.Add(term);
     if (offsets == null)
     {
         // offsets are not tracked; nothing more to record
         return;
     }
     offsets.Add(info);
 }
		/// <summary>
		/// Builds the expected positions/offsets/tokens for every test term, then writes an
		/// index containing five identical documents (one field per entry of testFields,
		/// each with the term-vector options selected by testFieldsStorePos/testFieldsStoreOff)
		/// and loads the FieldInfos of the resulting segment.
		/// </summary>
		public override void  SetUp()
		{
			base.SetUp();
			/*
			for (int i = 0; i < testFields.length; i++) {
			fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
			}
			*/
			
			System.Array.Sort(testTerms);
			// One generator for the whole fixture instead of a new System.Random per value:
			// time-seeded instances created in quick succession can return identical values.
			System.Random random = new System.Random();
			int tokenUpto = 0;
			for (int i = 0; i < testTerms.Length; i++)
			{
				positions[i] = new int[TERM_FREQ];
				offsets[i] = new TermVectorOffsetInfo[TERM_FREQ];
				for (int j = 0; j < TERM_FREQ; j++)
				{
					// occurrence j lands in [j*10, j*10+10), so positions increase with j
					positions[i][j] = (int) (j * 10 + random.NextDouble() * 10);
					// offsets are always sorted in increasing order
					offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].Length);
					TestToken token = tokens[tokenUpto++] = new TestToken(this);
					token.text = testTerms[i];
					token.pos = positions[i][j];
					token.startOffset = offsets[i][j].GetStartOffset();
					token.endOffset = offsets[i][j].GetEndOffset();
				}
			}
			System.Array.Sort(tokens);
			
			IndexWriter writer = new IndexWriter(dir, new MyAnalyzer(this), true, IndexWriter.MaxFieldLength.LIMITED);
			writer.SetUseCompoundFile(false);
			Document doc = new Document();
			for (int i = 0; i < testFields.Length; i++)
			{
				// pick the term-vector mode matching the store-positions / store-offsets flags
				Field.TermVector tv;
				if (testFieldsStorePos[i] && testFieldsStoreOff[i])
				{
					tv = Field.TermVector.WITH_POSITIONS_OFFSETS;
				}
				else if (testFieldsStorePos[i] && !testFieldsStoreOff[i])
				{
					tv = Field.TermVector.WITH_POSITIONS;
				}
				else if (!testFieldsStorePos[i] && testFieldsStoreOff[i])
				{
					tv = Field.TermVector.WITH_OFFSETS;
				}
				else
				{
					tv = Field.TermVector.YES;
				}
				doc.Add(new Field(testFields[i], "", Field.Store.NO, Field.Index.ANALYZED, tv));
			}
			
			//Create 5 documents for testing, they all have the same
			//terms
			for (int j = 0; j < 5; j++)
			{
				writer.AddDocument(doc);
			}
			writer.Flush();
			seg = writer.NewestSegment().name;
			writer.Close();
			
			fieldInfos = new FieldInfos(dir, seg + "." + IndexFileNames.FIELD_INFOS_EXTENSION);
		}
 /// <summary>
 /// Packs the supplied values into a new TVTerm and appends it to the terms list.
 /// No validation is performed here.
 /// </summary>
 private void AddTermInternal(System.String termText, int freq, int[] positions, TermVectorOffsetInfo[] offsets)
 {
     TVTerm entry = new TVTerm();

     // per-occurrence data
     entry.offsets   = offsets;
     entry.positions = positions;

     // term text and count
     entry.freq     = freq;
     entry.termText = termText;

     terms.Add(entry);
 }
Exemple #21
0
		private Term termBuffer = new Term("", ""); // avoid consing
		
		/// <summary>
		/// Registers one occurrence of the term (field, text). A term not yet present in
		/// postingTable gets a new Posting; for a known term the position (and the offset,
		/// when one is supplied) is appended, doubling the arrays when they are full.
		/// </summary>
		private void  AddPosition(System.String field, System.String text, int position, TermVectorOffsetInfo offset)
		{
			termBuffer.Set(field, text);
			Posting posting = (Posting) postingTable[termBuffer];
			if (posting == null)
			{
				// first occurrence of this term
				Term key = new Term(field, text, false);
				postingTable[key] = new Posting(key, position, offset);
				return ;
			}
			
			// repeat occurrence
			int seen = posting.freq;
			if (posting.positions.Length == seen)
			{
				// no room left: grow to twice the size
				int[] grownPositions = new int[seen * 2];
				Array.Copy(posting.positions, 0, grownPositions, 0, seen);
				posting.positions = grownPositions;
			}
			posting.positions[seen] = position;
			
			if (offset != null)
			{
				if (posting.offsets.Length == seen)
				{
					TermVectorOffsetInfo[] grownOffsets = new TermVectorOffsetInfo[seen * 2];
					Array.Copy(posting.offsets, 0, grownOffsets, 0, seen);
					posting.offsets = grownOffsets;
				}
				posting.offsets[seen] = offset;
			}
			posting.freq = seen + 1;
		}
		/// <summary>
		/// Reads the term vector stored at tvfPointer in the tvf stream and hands each term,
		/// with its frequency and (when stored and not ignored by the mapper) its offsets
		/// and positions, to the mapper. Returns without calling the mapper when the
		/// stored term count is zero.
		/// </summary>
		/// <param name="field">The field to read in
		/// </param>
		/// <param name="tvfPointer">The pointer within the tvf file where we should start reading
		/// </param>
		/// <param name="mapper">The mapper used to map the TermVector
		/// </param>
		/// <throws>  IOException </throws>
		private void  ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
		{
			
			// Now read the data from specified position
			//We don't need to offset by the FORMAT here since the pointer already includes the offset
			tvf.Seek(tvfPointer);
			
			int numTerms = tvf.ReadVInt();
			//System.out.println("Num Terms: " + numTerms);
			// If no terms - nothing is mapped. However, this should never occur!
			if (numTerms == 0)
				return ;
			
			bool storePositions;
			bool storeOffsets;
			
			if (format >= FORMAT_VERSION)
			{
				// a flags byte tells whether positions and/or offsets were stored
				byte bits = tvf.ReadByte();
				storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
				storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
			}
			else
			{
				// older format: one VInt is read and discarded; neither positions nor offsets are read
				tvf.ReadVInt();
				storePositions = false;
				storeOffsets = false;
			}
			mapper.SetExpectations(field, numTerms, storeOffsets, storePositions);
			int start = 0;
			int deltaLength = 0;
			int totalLength = 0;
			byte[] byteBuffer;
			char[] charBuffer;
			bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
			
			// init the buffers: only the one matching the stored term encoding is allocated
			if (preUTF8)
			{
				charBuffer = new char[10];
				byteBuffer = null;
			}
			else
			{
				charBuffer = null;
				byteBuffer = new byte[20];
			}
			
			for (int i = 0; i < numTerms; i++)
			{
				// The first 'start' chars/bytes are reused from the previous term (still in the
				// buffer); only the following 'deltaLength' are read from the stream.
				start = tvf.ReadVInt();
				deltaLength = tvf.ReadVInt();
				totalLength = start + deltaLength;
				
				System.String term;
				
				if (preUTF8)
				{
					// Term stored as java chars
					if (charBuffer.Length < totalLength)
					{
						// grow the buffer, keeping the prefix shared with the previous term
						char[] newCharBuffer = new char[(int) (1.5 * totalLength)];
						Array.Copy(charBuffer, 0, newCharBuffer, 0, start);
						charBuffer = newCharBuffer;
					}
					tvf.ReadChars(charBuffer, start, deltaLength);
					term = new System.String(charBuffer, 0, totalLength);
				}
				else
				{
					// Term stored as utf8 bytes
					if (byteBuffer.Length < totalLength)
					{
						// grow the buffer, keeping the prefix shared with the previous term
						byte[] newByteBuffer = new byte[(int) (1.5 * totalLength)];
						Array.Copy(byteBuffer, 0, newByteBuffer, 0, start);
						byteBuffer = newByteBuffer;
					}
					tvf.ReadBytes(byteBuffer, start, deltaLength);
                    term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength);
				}
				int freq = tvf.ReadVInt();
				int[] positions = null;
				if (storePositions)
				{
					//read in the positions
					//does the mapper even care about positions?
					if (mapper.IsIgnoringPositions() == false)
					{
						// each stored value is a delta from the previous position
						positions = new int[freq];
						int prevPosition = 0;
						for (int j = 0; j < freq; j++)
						{
							positions[j] = prevPosition + tvf.ReadVInt();
							prevPosition = positions[j];
						}
					}
					else
					{
						//we need to skip over the positions.  Since these are VInts, I don't believe there is any way to know for sure how far to skip,
						//so each value is read and discarded
						for (int j = 0; j < freq; j++)
						{
							tvf.ReadVInt();
						}
					}
				}
				TermVectorOffsetInfo[] offsets = null;
				if (storeOffsets)
				{
					//does the mapper even care about offsets?
					if (mapper.IsIgnoringOffsets() == false)
					{
						// start offset is a delta from the previous end offset; end offset is a delta from the start
						offsets = new TermVectorOffsetInfo[freq];
						int prevOffset = 0;
						for (int j = 0; j < freq; j++)
						{
							int startOffset = prevOffset + tvf.ReadVInt();
							int endOffset = startOffset + tvf.ReadVInt();
							offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
							prevOffset = endOffset;
						}
					}
					else
					{
						// skip the two VInts stored per occurrence
						for (int j = 0; j < freq; j++)
						{
							tvf.ReadVInt();
							tvf.ReadVInt();
						}
					}
				}
				mapper.Map(term, freq, offsets, positions);
			}
		}
Exemple #23
0
        /// <summary>
        /// Reads the term vector stored at tvfPointer in the tvf stream and passes every term,
        /// together with its frequency and (when stored and not ignored by the mapper) its
        /// offsets and positions, to the mapper. Nothing is mapped when the stored term
        /// count is zero.
        /// </summary>
        /// <param name="field">The field to read in
        /// </param>
        /// <param name="tvfPointer">The pointer within the tvf file where we should start reading
        /// </param>
        /// <param name="mapper">The mapper used to map the TermVector
        /// </param>
        /// <throws>  IOException </throws>
        private void  ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
        {
            // Now read the data from specified position
            //We don't need to offset by the FORMAT here since the pointer already includes the offset
            tvf.Seek(tvfPointer);

            int numTerms = tvf.ReadVInt();

            //System.out.println("Num Terms: " + numTerms);
            // If no terms - nothing is mapped. However, this should never occur!
            if (numTerms == 0)
            {
                return;
            }

            bool storePositions;
            bool storeOffsets;

            if (format >= FORMAT_VERSION)
            {
                // a flags byte tells whether positions and/or offsets were stored
                byte bits = tvf.ReadByte();
                storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
                storeOffsets   = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
            }
            else
            {
                // older format: one VInt is read and discarded; neither positions nor offsets are read
                tvf.ReadVInt();
                storePositions = false;
                storeOffsets   = false;
            }
            mapper.SetExpectations(field, numTerms, storeOffsets, storePositions);
            int start       = 0;
            int deltaLength = 0;
            int totalLength = 0;

            byte[] byteBuffer;
            char[] charBuffer;
            bool   preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;

            // init the buffers: only the one matching the stored term encoding is allocated
            if (preUTF8)
            {
                charBuffer = new char[10];
                byteBuffer = null;
            }
            else
            {
                charBuffer = null;
                byteBuffer = new byte[20];
            }

            for (int i = 0; i < numTerms; i++)
            {
                // The first 'start' chars/bytes are reused from the previous term (still in the
                // buffer); only the following 'deltaLength' are read from the stream.
                start       = tvf.ReadVInt();
                deltaLength = tvf.ReadVInt();
                totalLength = start + deltaLength;

                System.String term;

                if (preUTF8)
                {
                    // Term stored as java chars
                    if (charBuffer.Length < totalLength)
                    {
                        // grow the buffer, keeping the prefix shared with the previous term
                        char[] newCharBuffer = new char[(int)(1.5 * totalLength)];
                        Array.Copy(charBuffer, 0, newCharBuffer, 0, start);
                        charBuffer = newCharBuffer;
                    }
                    tvf.ReadChars(charBuffer, start, deltaLength);
                    term = new System.String(charBuffer, 0, totalLength);
                }
                else
                {
                    // Term stored as utf8 bytes
                    if (byteBuffer.Length < totalLength)
                    {
                        // grow the buffer, keeping the prefix shared with the previous term
                        byte[] newByteBuffer = new byte[(int)(1.5 * totalLength)];
                        Array.Copy(byteBuffer, 0, newByteBuffer, 0, start);
                        byteBuffer = newByteBuffer;
                    }
                    tvf.ReadBytes(byteBuffer, start, deltaLength);
                    term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength);
                }
                int   freq      = tvf.ReadVInt();
                int[] positions = null;
                if (storePositions)
                {
                    //read in the positions
                    //does the mapper even care about positions?
                    if (mapper.IsIgnoringPositions() == false)
                    {
                        // each stored value is a delta from the previous position
                        positions = new int[freq];
                        int prevPosition = 0;
                        for (int j = 0; j < freq; j++)
                        {
                            positions[j] = prevPosition + tvf.ReadVInt();
                            prevPosition = positions[j];
                        }
                    }
                    else
                    {
                        //we need to skip over the positions.  Since these are VInts, I don't believe there is any way to know for sure how far to skip,
                        //so each value is read and discarded
                        for (int j = 0; j < freq; j++)
                        {
                            tvf.ReadVInt();
                        }
                    }
                }
                TermVectorOffsetInfo[] offsets = null;
                if (storeOffsets)
                {
                    //does the mapper even care about offsets?
                    if (mapper.IsIgnoringOffsets() == false)
                    {
                        // start offset is a delta from the previous end offset; end offset is a delta from the start
                        offsets = new TermVectorOffsetInfo[freq];
                        int prevOffset = 0;
                        for (int j = 0; j < freq; j++)
                        {
                            int startOffset = prevOffset + tvf.ReadVInt();
                            int endOffset   = startOffset + tvf.ReadVInt();
                            offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
                            prevOffset = endOffset;
                        }
                    }
                    else
                    {
                        // skip the two VInts stored per occurrence
                        for (int j = 0; j < freq; j++)
                        {
                            tvf.ReadVInt();
                            tvf.ReadVInt();
                        }
                    }
                }
                mapper.Map(term, freq, offsets, positions);
            }
        }
		/// <summary>
		/// Allocates the fixture arrays: one positions row and one offsets row per test term,
		/// and room for TERM_FREQ tokens per test term.
		/// </summary>
		private void  InitBlock()
		{
			int termCount = testTerms.Length;
			int tokenCount = termCount * TERM_FREQ;
			
			positions = new int[termCount][];
			offsets = new TermVectorOffsetInfo[termCount][];
			tokens = new TestToken[tokenCount];
		}
		/// <summary>Replaces the stored offsets with the supplied array (stored by reference, not copied).</summary>
		/// <param name="offsets">The new offsets array
		/// </param>
		internal virtual void  SetOffsets(TermVectorOffsetInfo[] offsets)
		{
			this.offsets = offsets;
		}
			/// <summary>
			/// Checks only that the document number has been set (is not -1) before terms are
			/// mapped; the term data itself is ignored.
			/// </summary>
			/// <exception cref="System.InvalidOperationException">
			/// documentNumber is still -1. InvalidOperationException derives from SystemException,
			/// so callers catching SystemException are unaffected.
			/// </exception>
			public override void  Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
			{
				if (documentNumber == - 1)
				{
					throw new System.InvalidOperationException("Documentnumber should be set at this point!");
				}
			}
Exemple #27
0
        /// <summary>
        /// Reads one field's term vector from the tvf stream and materialises it in memory.
        /// </summary>
        /// <param name="field">The field to read in</param>
        /// <param name="tvfPointer">The pointer within the tvf file where we should start reading</param>
        /// <returns>
        /// The TermVector located at that position: a <c>SegmentTermPositionVector</c> when positions
        /// and/or offsets were stored, otherwise a plain <c>SegmentTermVector</c>.
        /// </returns>
        /// <exception cref="System.IO.IOException">Declared by the original documentation; presumably raised by the tvf stream reads.</exception>
        private SegmentTermVector ReadTermVector(System.String field, long tvfPointer)
        {
            // The pointer already includes the FORMAT offset, so seek to it as-is.
            tvf.Seek(tvfPointer);

            int termCount = tvf.ReadVInt();
            if (termCount == 0)
            {
                // Not expected to happen; answer with an empty term vector.
                return new SegmentTermVector(field, null, null);
            }

            bool hasPositions = false;
            bool hasOffsets   = false;
            if (tvfFormat != TermVectorsWriter.FORMAT_VERSION)
            {
                // Other formats carry a VInt here; it is consumed and discarded.
                tvf.ReadVInt();
            }
            else
            {
                byte flags = tvf.ReadByte();
                hasPositions = (flags & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0;
                hasOffsets   = (flags & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0;
            }

            System.String[] termTexts   = new System.String[termCount];
            int[]           frequencies = new int[termCount];

            // Only allocated when the corresponding data is present in the stream.
            int[][] positions = hasPositions ? new int[termCount][] : null;
            TermVectorOffsetInfo[][] offsets = hasOffsets ? new TermVectorOffsetInfo[termCount][] : null;

            char[] chars     = new char[10]; // initial capacity of 10 characters
            char[] lastChars = new char[0];

            for (int termIndex = 0; termIndex < termCount; termIndex++)
            {
                // Each term is stored as (shared prefix length, suffix length, suffix chars).
                int prefixLength = tvf.ReadVInt();
                int suffixLength = tvf.ReadVInt();
                int termLength   = prefixLength + suffixLength;

                if (termLength > chars.Length)
                {
                    // Grow the buffer and carry the shared prefix over from the previous term.
                    chars = new char[termLength];
                    if (prefixLength > 0)
                    {
                        Array.Copy(lastChars, 0, chars, 0, prefixLength);
                    }
                }

                tvf.ReadChars(chars, prefixLength, suffixLength);
                termTexts[termIndex] = new System.String(chars, 0, termLength);
                lastChars = chars;

                int freq = tvf.ReadVInt();
                frequencies[termIndex] = freq;

                if (hasPositions)
                {
                    // Positions are delta-encoded; accumulate to recover absolute values.
                    int[] termPositions = new int[freq];
                    int   running       = 0;
                    for (int k = 0; k < freq; k++)
                    {
                        running += tvf.ReadVInt();
                        termPositions[k] = running;
                    }
                    positions[termIndex] = termPositions;
                }

                if (hasOffsets)
                {
                    // Start is a delta from the previous end; end is a delta from its own start.
                    TermVectorOffsetInfo[] termOffsets = new TermVectorOffsetInfo[freq];
                    int lastEnd = 0;
                    for (int k = 0; k < freq; k++)
                    {
                        int begin = lastEnd + tvf.ReadVInt();
                        int end   = begin + tvf.ReadVInt();
                        termOffsets[k] = new TermVectorOffsetInfo(begin, end);
                        lastEnd = end;
                    }
                    offsets[termIndex] = termOffsets;
                }
            }

            if (hasPositions || hasOffsets)
            {
                return new SegmentTermPositionVector(field, termTexts, frequencies, positions, offsets);
            }
            return new SegmentTermVector(field, termTexts, frequencies);
        }
		/// <summary>
		/// Reads one field's term vector from the tvf stream and hands each term to the mapper.
		/// </summary>
		/// <param name="field">The field to read in</param>
		/// <param name="tvfPointer">The pointer within the tvf file where we should start reading</param>
		/// <param name="mapper">The mapper used to map the TermVector</param>
		/// <exception cref="System.IO.IOException">Declared by the original documentation; presumably raised by the tvf stream reads.</exception>
		private void  ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
		{
			// The pointer already includes the FORMAT offset, so seek to it as-is.
			tvf.Seek(tvfPointer);

			int termCount = tvf.ReadVInt();
			if (termCount == 0)
			{
				// Not expected to happen; the mapper is never told about this field.
				return;
			}

			bool hasPositions = false;
			bool hasOffsets = false;
			if (tvfFormat != FORMAT_VERSION)
			{
				// Other formats carry a VInt here; it is consumed and discarded.
				tvf.ReadVInt();
			}
			else
			{
				byte flags = tvf.ReadByte();
				hasPositions = (flags & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
				hasOffsets = (flags & STORE_OFFSET_WITH_TERMVECTOR) != 0;
			}

			mapper.SetExpectations(field, termCount, hasOffsets, hasPositions);

			char[] chars = new char[10]; // initial capacity of 10 characters
			char[] lastChars = new char[0];

			for (int termIndex = 0; termIndex < termCount; termIndex++)
			{
				// Each term is stored as (shared prefix length, suffix length, suffix chars).
				int prefixLength = tvf.ReadVInt();
				int suffixLength = tvf.ReadVInt();
				int termLength = prefixLength + suffixLength;

				if (termLength > chars.Length)
				{
					// Grow the buffer and carry the shared prefix over from the previous term.
					chars = new char[termLength];
					if (prefixLength > 0)
					{
						Array.Copy(lastChars, 0, chars, 0, prefixLength);
					}
				}

				tvf.ReadChars(chars, prefixLength, suffixLength);
				System.String term = new System.String(chars, 0, termLength);
				lastChars = chars;

				int freq = tvf.ReadVInt();

				int[] positions = null;
				if (hasPositions)
				{
					if (mapper.IsIgnoringPositions())
					{
						// The mapper does not want positions, but the VInts are variable-length,
						// so each one still has to be read to get past them.
						for (int k = 0; k < freq; k++)
						{
							tvf.ReadVInt();
						}
					}
					else
					{
						// Positions are delta-encoded; accumulate to recover absolute values.
						positions = new int[freq];
						int running = 0;
						for (int k = 0; k < freq; k++)
						{
							running += tvf.ReadVInt();
							positions[k] = running;
						}
					}
				}

				TermVectorOffsetInfo[] offsets = null;
				if (hasOffsets)
				{
					if (mapper.IsIgnoringOffsets())
					{
						// Skip the (start delta, end delta) pair of every occurrence.
						for (int k = 0; k < freq; k++)
						{
							tvf.ReadVInt();
							tvf.ReadVInt();
						}
					}
					else
					{
						// Start is a delta from the previous end; end is a delta from its own start.
						offsets = new TermVectorOffsetInfo[freq];
						int lastEnd = 0;
						for (int k = 0; k < freq; k++)
						{
							int begin = lastEnd + tvf.ReadVInt();
							int end = begin + tvf.ReadVInt();
							offsets[k] = new TermVectorOffsetInfo(begin, end);
							lastEnd = end;
						}
					}
				}

				mapper.Map(term, freq, offsets, positions);
			}
		}
        /// <summary>
        /// Loads the term vector stored for a single field at the given tvf position.
        /// </summary>
        /// <param name="field">The field to read in</param>
        /// <param name="tvfPointer">The pointer within the tvf file where we should start reading</param>
        /// <returns>
        /// The TermVector located at that position; it carries positions/offsets
        /// (as a <c>SegmentTermPositionVector</c>) only when the stream flags say they were stored.
        /// </returns>
        /// <exception cref="System.IO.IOException">Declared by the original documentation; presumably raised by the tvf stream reads.</exception>
        private SegmentTermVector ReadTermVector(System.String field, long tvfPointer)
        {
            // No FORMAT adjustment needed: the pointer already includes that offset.
            tvf.Seek(tvfPointer);

            int totalTerms = tvf.ReadVInt();
            if (totalTerms == 0)
            {
                // Should never occur; return an empty term vector for the field.
                return new SegmentTermVector(field, null, null);
            }

            bool withPositions = false;
            bool withOffsets = false;
            if (tvfFormat == TermVectorsWriter.FORMAT_VERSION)
            {
                byte storeBits = tvf.ReadByte();
                withPositions = (storeBits & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0;
                withOffsets = (storeBits & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0;
            }
            else
            {
                // A VInt is present in the other format; read past it.
                tvf.ReadVInt();
            }

            System.String[] terms = new System.String[totalTerms];
            int[] termFreqs = new int[totalTerms];

            // Left null unless the stream actually contains the data.
            int[][] allPositions = null;
            if (withPositions)
            {
                allPositions = new int[totalTerms][];
            }
            TermVectorOffsetInfo[][] allOffsets = null;
            if (withOffsets)
            {
                allOffsets = new TermVectorOffsetInfo[totalTerms][];
            }

            char[] current = new char[10]; // start with room for 10 characters
            char[] previous = new char[0];

            for (int t = 0; t < totalTerms; t++)
            {
                // A term reuses the first 'shared' chars of the previous term and appends 'added' new ones.
                int shared = tvf.ReadVInt();
                int added = tvf.ReadVInt();
                int length = shared + added;

                if (current.Length < length)
                {
                    // Buffer too small: allocate a bigger one and restore the shared prefix.
                    current = new char[length];
                    if (shared > 0)
                    {
                        Array.Copy(previous, 0, current, 0, shared);
                    }
                }

                tvf.ReadChars(current, shared, added);
                terms[t] = new System.String(current, 0, length);
                previous = current;

                int freq = tvf.ReadVInt();
                termFreqs[t] = freq;

                if (withPositions)
                {
                    // Stored values are gaps between consecutive positions.
                    int[] decoded = new int[freq];
                    int last = 0;
                    for (int n = 0; n < freq; n++)
                    {
                        last += tvf.ReadVInt();
                        decoded[n] = last;
                    }
                    allPositions[t] = decoded;
                }

                if (withOffsets)
                {
                    // Start is relative to the previous end offset, end is relative to start.
                    TermVectorOffsetInfo[] decoded = new TermVectorOffsetInfo[freq];
                    int previousEnd = 0;
                    for (int n = 0; n < freq; n++)
                    {
                        int startOffset = previousEnd + tvf.ReadVInt();
                        int endOffset = startOffset + tvf.ReadVInt();
                        decoded[n] = new TermVectorOffsetInfo(startOffset, endOffset);
                        previousEnd = endOffset;
                    }
                    allOffsets[t] = decoded;
                }
            }

            return (withPositions || withOffsets)
                ? new SegmentTermPositionVector(field, terms, termFreqs, allPositions, allOffsets)
                : new SegmentTermVector(field, terms, termFreqs);
        }
		/// <summary>
		/// Stores one mapped term in the next free slot of the parallel result arrays
		/// and then advances the slot counter.
		/// </summary>
		/// <param name="term">Term text to record.</param>
		/// <param name="frequency">Frequency to record for the term.</param>
		/// <param name="offsets">Offsets for the term; kept only when <c>storingOffsets</c> is set.</param>
		/// <param name="positions">Positions for the term; kept only when <c>storingPositions</c> is set.</param>
		public override void  Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
		{
			int slot = currentPosition;

			terms[slot] = term;
			termFreqs[slot] = frequency;

			if (storingOffsets)
			{
				this.offsets[slot] = offsets;
			}

			if (storingPositions)
			{
				this.positions[slot] = positions;
			}

			// Advance only after every array has been written.
			currentPosition = slot + 1;
		}
        /// <summary>
        /// Builds the fixture: generates TERM_FREQ tokens for every test term (with increasing
        /// positions and offsets), indexes five identical documents whose fields use the
        /// term-vector options given by testFieldsStorePos/testFieldsStoreOff, and loads the
        /// resulting segment's field infos.
        /// </summary>
        public override void  SetUp()
        {
            base.SetUp();

            /*
             * for (int i = 0; i < testFields.length; i++) {
             * fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
             * }
             */

            System.Array.Sort(testTerms);
            int tokenUpto = 0;

            // Use a single generator for the whole fixture. Constructing a new System.Random
            // for every token can yield identical values on runtimes that seed from the clock,
            // because instances created in quick succession share the same seed.
            System.Random random = new System.Random();

            for (int i = 0; i < testTerms.Length; i++)
            {
                positions[i] = new int[TERM_FREQ];
                offsets[i]   = new TermVectorOffsetInfo[TERM_FREQ];
                for (int j = 0; j < TERM_FREQ; j++)
                {
                    // Each position falls in [j * 10, j * 10 + 10), so positions are strictly
                    // increasing with j (the first one is somewhere in 0..9).
                    positions[i][j] = (int)(j * 10 + random.NextDouble() * 10);
                    // offsets are always sorted in increasing order
                    offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].Length);
                    TestToken token = tokens[tokenUpto++] = new TestToken(this);
                    token.text        = testTerms[i];
                    token.pos         = positions[i][j];
                    token.startOffset = offsets[i][j].StartOffset;
                    token.endOffset   = offsets[i][j].EndOffset;
                }
            }
            System.Array.Sort(tokens);

            IndexWriter writer = new IndexWriter(dir, new MyAnalyzer(this), true, IndexWriter.MaxFieldLength.LIMITED);

            writer.UseCompoundFile = false;
            Document doc = new Document();

            for (int i = 0; i < testFields.Length; i++)
            {
                // Pick the term-vector mode matching this field's position/offset flags.
                Field.TermVector tv;
                if (testFieldsStorePos[i] && testFieldsStoreOff[i])
                {
                    tv = Field.TermVector.WITH_POSITIONS_OFFSETS;
                }
                else if (testFieldsStorePos[i])
                {
                    tv = Field.TermVector.WITH_POSITIONS;
                }
                else if (testFieldsStoreOff[i])
                {
                    tv = Field.TermVector.WITH_OFFSETS;
                }
                else
                {
                    tv = Field.TermVector.YES;
                }
                doc.Add(new Field(testFields[i], "", Field.Store.NO, Field.Index.ANALYZED, tv));
            }

            //Create 5 documents for testing, they all have the same
            //terms
            for (int j = 0; j < 5; j++)
            {
                writer.AddDocument(doc);
            }
            writer.Commit();
            seg = writer.NewestSegment().name;
            writer.Close();

            fieldInfos = new FieldInfos(dir, seg + "." + IndexFileNames.FIELD_INFOS_EXTENSION);
        }
Exemple #32
0
		/// <summary>
		/// Creates a posting for the first occurrence of a term: frequency 1, a single
		/// position, and a single offset when one is supplied.
		/// </summary>
		/// <param name="t">The term this posting belongs to.</param>
		/// <param name="position">Position of the first occurrence.</param>
		/// <param name="offset">Offset of the first occurrence, or null to leave <c>offsets</c> null.</param>
		internal Posting(Term t, int position, TermVectorOffsetInfo offset)
		{
			term = t;
			freq = 1;
			positions = new int[] { position };
			offsets = (offset != null) ? new TermVectorOffsetInfo[] { offset } : null;
		}
 /// <summary>
 /// Allocates the fixture arrays: one positions row and one offsets row per test term,
 /// and TERM_FREQ token slots per test term.
 /// </summary>
 private void  InitBlock()
 {
     int termCount = testTerms.Length;

     tokens    = new TestToken[termCount * TERM_FREQ];
     offsets   = new TermVectorOffsetInfo[termCount][];
     positions = new int[termCount][];
 }
 /// <summary>
 /// Creates a term vector that also keeps per-term positions and offsets.
 /// </summary>
 /// <param name="field">Passed through to the base constructor.</param>
 /// <param name="terms">Passed through to the base constructor.</param>
 /// <param name="termFreqs">Passed through to the base constructor.</param>
 /// <param name="positions">Per-term positions; stored as given (no copy is made).</param>
 /// <param name="offsets">Per-term offsets; stored as given (no copy is made).</param>
 public SegmentTermPositionVector(
     System.String field,
     System.String[] terms,
     int[] termFreqs,
     int[][] positions,
     TermVectorOffsetInfo[][] offsets)
     : base(field, terms, termFreqs)
 {
     this.positions = positions;
     this.offsets   = offsets;
 }
        /// <summary>
        /// Walks the term vector stored for a single field and reports every term to the mapper.
        /// </summary>
        /// <param name="field">The field to read in</param>
        /// <param name="tvfPointer">The pointer within the tvf file where we should start reading</param>
        /// <param name="mapper">The mapper used to map the TermVector</param>
        /// <exception cref="System.IO.IOException">Declared by the original documentation; presumably raised by the tvf stream reads.</exception>
        private void  ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
        {
            // No FORMAT adjustment needed: the pointer already includes that offset.
            tvf.Seek(tvfPointer);

            int totalTerms = tvf.ReadVInt();
            if (totalTerms == 0)
            {
                // Should never occur; nothing is reported to the mapper in that case.
                return;
            }

            bool withPositions = false;
            bool withOffsets   = false;
            if (tvfFormat == FORMAT_VERSION)
            {
                byte storeBits = tvf.ReadByte();
                withPositions = (storeBits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
                withOffsets   = (storeBits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
            }
            else
            {
                // A VInt is present in the other format; read past it.
                tvf.ReadVInt();
            }

            mapper.SetExpectations(field, totalTerms, withOffsets, withPositions);

            char[] current  = new char[10]; // start with room for 10 characters
            char[] previous = new char[0];

            for (int t = 0; t < totalTerms; t++)
            {
                // A term reuses the first 'shared' chars of the previous term and appends 'added' new ones.
                int shared = tvf.ReadVInt();
                int added  = tvf.ReadVInt();
                int length = shared + added;

                if (current.Length < length)
                {
                    // Buffer too small: allocate a bigger one and restore the shared prefix.
                    current = new char[length];
                    if (shared > 0)
                    {
                        Array.Copy(previous, 0, current, 0, shared);
                    }
                }

                tvf.ReadChars(current, shared, added);
                System.String term = new System.String(current, 0, length);
                previous = current;

                int freq = tvf.ReadVInt();

                int[] positions = null;
                if (withPositions)
                {
                    if (mapper.IsIgnoringPositions())
                    {
                        // Positions are unwanted, but VInts have no fixed width,
                        // so they must be read one by one to move past them.
                        for (int n = 0; n < freq; n++)
                        {
                            tvf.ReadVInt();
                        }
                    }
                    else
                    {
                        // Stored values are gaps between consecutive positions.
                        positions = new int[freq];
                        int last = 0;
                        for (int n = 0; n < freq; n++)
                        {
                            last += tvf.ReadVInt();
                            positions[n] = last;
                        }
                    }
                }

                TermVectorOffsetInfo[] offsets = null;
                if (withOffsets)
                {
                    if (mapper.IsIgnoringOffsets())
                    {
                        // Two VInts (start delta, end delta) per occurrence are skipped.
                        for (int n = 0; n < freq; n++)
                        {
                            tvf.ReadVInt();
                            tvf.ReadVInt();
                        }
                    }
                    else
                    {
                        // Start is relative to the previous end offset, end is relative to start.
                        offsets = new TermVectorOffsetInfo[freq];
                        int previousEnd = 0;
                        for (int n = 0; n < freq; n++)
                        {
                            int startOffset = previousEnd + tvf.ReadVInt();
                            int endOffset   = startOffset + tvf.ReadVInt();
                            offsets[n] = new TermVectorOffsetInfo(startOffset, endOffset);
                            previousEnd = endOffset;
                        }
                    }
                }

                mapper.Map(term, freq, offsets, positions);
            }
        }