/// <summary>
/// Builds the token fixture. Every token except the last receives a run of
/// consecutive positions (1, 3, 5, ... entries) drawn from the running
/// <c>numPositions</c> counter; the last token is placed at position 0.
/// </summary>
public override void SetUp()
{
    base.SetUp();
    tokens = new System.String[] { "here", "is", "some", "text", "to", "test", "extra" };
    thePositions = new int[tokens.Length][];
    offsets = new TermVectorOffsetInfo[tokens.Length][];
    numPositions = 0;

    // The final token is held back so it can share a position with an earlier
    // token in a predictable way.
    int lastIndex = tokens.Length - 1;
    for (int tokenIndex = 0; tokenIndex < lastIndex; tokenIndex++)
    {
        int occurrences = 2 * tokenIndex + 1;

        // Hand out the next 'occurrences' consecutive positions.
        int[] tokenPositions = new int[occurrences];
        for (int k = 0; k < occurrences; k++)
        {
            tokenPositions[k] = numPositions++;
        }
        thePositions[tokenIndex] = tokenPositions;

        // One offset per position; the concrete offset values are arbitrary.
        TermVectorOffsetInfo[] tokenOffsets = new TermVectorOffsetInfo[occurrences];
        for (int k = 0; k < occurrences; k++)
        {
            tokenOffsets[k] = new TermVectorOffsetInfo(k, k + 1);
        }
        offsets[tokenIndex] = tokenOffsets;
    }

    // Position 0 is the single position given to the first token ("here").
    thePositions[lastIndex] = new int[] { 0 };
    offsets[lastIndex] = new TermVectorOffsetInfo[] { new TermVectorOffsetInfo(0, 1) };
}
/// <summary>
/// Prepares tokens, positions and offsets for the mapper tests. Token <c>i</c>
/// (all but the last) gets <c>2 * i + 1</c> sequential positions; the last
/// token reuses position 0.
/// </summary>
public override void SetUp()
{
    base.SetUp();

    tokens = new System.String[] { "here", "is", "some", "text", "to", "test", "extra" };
    int tokenCount = tokens.Length;
    thePositions = new int[tokenCount][];
    offsets = new TermVectorOffsetInfo[tokenCount][];
    numPositions = 0;

    // Stop one short: the last token is added afterwards with a position that
    // collides with an earlier token.
    int idx = 0;
    while (idx < tokenCount - 1)
    {
        thePositions[idx] = new int[2 * idx + 1];
        offsets[idx] = new TermVectorOffsetInfo[thePositions[idx].Length];

        // Positions and offsets have the same length, so fill both in one pass.
        for (int slot = 0; slot < thePositions[idx].Length; slot++)
        {
            thePositions[idx][slot] = numPositions++;
            // The actual offset values do not matter to the tests.
            offsets[idx][slot] = new TermVectorOffsetInfo(slot, slot + 1);
        }
        idx++;
    }

    // Same position as the first token ("here").
    int[] sharedPosition = new int[1];
    sharedPosition[0] = 0;
    thePositions[tokenCount - 1] = sharedPosition;

    TermVectorOffsetInfo[] sharedOffset = new TermVectorOffsetInfo[1];
    sharedOffset[0] = new TermVectorOffsetInfo(0, 1);
    offsets[tokenCount - 1] = sharedOffset;
}
/// <summary>
/// Reads the term vector of document 0 for the first test field and checks
/// that terms, positions and offsets match the values held by the fixture.
/// </summary>
public virtual void TestOffsetReader()
{
    TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
    Assert.IsTrue(reader != null);

    TermPositionVector vector = (TermPositionVector) reader.Get(0, testFields[0]);
    Assert.IsTrue(vector != null);

    System.String[] terms = vector.GetTerms();
    Assert.IsTrue(terms != null);
    Assert.IsTrue(terms.Length == testTerms.Length);

    for (int termIndex = 0; termIndex < terms.Length; termIndex++)
    {
        // Term text must match the fixture's term at the same index.
        Assert.IsTrue(terms[termIndex].Equals(testTerms[termIndex]));

        // Positions: same count, same values, same order.
        int[] actualPositions = vector.GetTermPositions(termIndex);
        Assert.IsTrue(actualPositions != null);
        Assert.IsTrue(actualPositions.Length == this.positions[termIndex].Length);
        for (int k = 0; k < actualPositions.Length; k++)
        {
            Assert.IsTrue(actualPositions[k] == this.positions[termIndex][k]);
        }

        // Offsets: same count, each one equal to the expected offset.
        TermVectorOffsetInfo[] actualOffsets = vector.GetOffsets(termIndex);
        Assert.IsTrue(actualOffsets != null);
        Assert.IsTrue(actualOffsets.Length == this.offsets[termIndex].Length);
        for (int k = 0; k < actualOffsets.Length; k++)
        {
            Assert.IsTrue(actualOffsets[k].Equals(this.offsets[termIndex][k]));
        }
    }
}
/// <summary>
/// Creates an entry holding the given field name, term text, frequency,
/// offsets and positions. The arrays are stored by reference, not copied.
/// </summary>
/// <param name="field">Value stored in the entry's field member.</param>
/// <param name="term">Value stored in the entry's term member.</param>
/// <param name="frequency">Value stored in the entry's frequency member.</param>
/// <param name="offsets">Offset array stored as-is.</param>
/// <param name="positions">Position array stored as-is.</param>
public TermVectorEntry(
    System.String field,
    System.String term,
    int frequency,
    TermVectorOffsetInfo[] offsets,
    int[] positions)
{
    this.field = field;
    this.term = term;
    this.frequency = frequency;
    this.offsets = offsets;
    this.positions = positions;
}
/// <summary>
/// Appends a term, and its offset when an offsets collection is present.
/// </summary>
/// <param name="term">Term text to append to <c>terms</c>.</param>
/// <param name="info">Offset appended to <c>offsets</c>; ignored when <c>offsets</c> is null.</param>
internal virtual void addTerm(System.String term, TermVectorOffsetInfo info)
{
    terms.Add(term);

    // No offsets collection means offsets are not being collected.
    if (offsets == null)
    {
        return;
    }

    offsets.Add(info);
}
/// <summary>
/// Records a term's data when <c>StringUtils.AnyTermMatch</c> accepts it
/// against the configured terms; other terms are ignored.
/// </summary>
/// <param name="term">Term text being mapped.</param>
/// <param name="frequency">Frequency recorded for the term.</param>
/// <param name="offsets">Offsets recorded only when offset storage is enabled.</param>
/// <param name="positions">Positions recorded only when position storage is enabled.</param>
public override void Map(string term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
{
    if (!StringUtils.AnyTermMatch(_terms, term))
    {
        return;
    }

    _indexMap.Add(term);

    if (_storeOffsets)
    {
        _tvoi.Add(term, offsets);
    }

    if (_storePositions)
    {
        _positions.Add(term, positions);
    }

    _frequency.Add(term, frequency);
}
/// <summary>
/// Callback for the TermVectorReader. Files the term under each of its
/// positions, creating the per-position record the first time a position is seen.
/// </summary>
/// <param name="term">Term text to register at every position.</param>
/// <param name="frequency">Not used by this implementation.</param>
/// <param name="offsets">Offsets parallel to <paramref name="positions"/>; may be null.</param>
/// <param name="positions">Positions of the term; dereferenced without a null check.</param>
public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
{
    bool hasOffsets = offsets != null;

    for (int index = 0; index < positions.Length; index++)
    {
        int position = positions[index];

        // NOTE(review): relies on the indexer yielding null for an unseen
        // position - confirm against the declared type of currentPositions.
        TVPositionInfo info = currentPositions[position];
        if (info == null)
        {
            info = new TVPositionInfo(position, storeOffsets);
            currentPositions[position] = info;
        }

        // Without offsets, the Null placeholder is recorded instead.
        TermVectorOffsetInfo offset = hasOffsets ? offsets[index] : TermVectorOffsetInfo.Null;
        info.addTerm(term, offset);
    }
}
/// <summary>
/// Maps a term, combining it with any entry already recorded for the same
/// term text: frequencies are summed and offsets/positions are appended.
/// </summary>
/// <param name="term">The term to map.</param>
/// <param name="frequency">The frequency of the term.</param>
/// <param name="offsets">Offset information, may be null.</param>
/// <param name="positions">Position information, may be null.</param>
public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
{
    TermVectorEntry known = termToTVE[term];

    if (known == null)
    {
        // First mention: keep offsets/positions only when their storage flag is set.
        TermVectorOffsetInfo[] keptOffsets = storeOffsets ? offsets : null;
        int[] keptPositions = storePositions ? positions : null;

        known = new TermVectorEntry(ALL, term, frequency, keptOffsets, keptPositions);
        termToTVE[term] = known;
        currentSet.Add(known);
        return;
    }

    // Repeat mention: fold the new data into the existing entry.
    known.Frequency += frequency;

    if (storeOffsets)
    {
        TermVectorOffsetInfo[] previousOffsets = known.GetOffsets();
        // Null or empty incoming offsets leave the entry untouched.
        if (offsets != null && offsets.Length > 0)
        {
            if (previousOffsets == null)
            {
                known.SetOffsets(offsets);
            }
            else
            {
                TermVectorOffsetInfo[] mergedOffsets = new TermVectorOffsetInfo[previousOffsets.Length + offsets.Length];
                Array.Copy(previousOffsets, 0, mergedOffsets, 0, previousOffsets.Length);
                Array.Copy(offsets, 0, mergedOffsets, previousOffsets.Length, offsets.Length);
                known.SetOffsets(mergedOffsets);
            }
        }
    }

    if (storePositions)
    {
        int[] previousPositions = known.GetPositions();
        // Null or empty incoming positions leave the entry untouched.
        if (positions != null && positions.Length > 0)
        {
            if (previousPositions == null)
            {
                known.SetPositions(positions);
            }
            else
            {
                int[] mergedPositions = new int[previousPositions.Length + positions.Length];
                Array.Copy(previousPositions, 0, mergedPositions, 0, previousPositions.Length);
                Array.Copy(positions, 0, mergedPositions, previousPositions.Length, positions.Length);
                known.SetPositions(mergedPositions);
            }
        }
    }
}
/// <summary>
/// Two TermVectorOffsetInfos are equal when both their start and end offsets match.
/// </summary>
/// <param name="o">The object to compare against.</param>
/// <returns>
/// true when <paramref name="o"/> is this same instance, or is a
/// TermVectorOffsetInfo with identical start and end offsets; otherwise false.
/// </returns>
public override bool Equals(System.Object o)
{
    // Same instance: nothing further to compare.
    if (object.ReferenceEquals(this, o))
    {
        return true;
    }

    // Rejects null and objects of any other type.
    if (!(o is TermVectorOffsetInfo))
    {
        return false;
    }

    TermVectorOffsetInfo other = (TermVectorOffsetInfo) o;
    return endOffset == other.endOffset && startOffset == other.startOffset;
}
/// <summary>
/// Creates the posting for the first occurrence of a term: frequency 1, a
/// one-element positions array, and a one-element offsets array when an
/// offset is supplied (otherwise offsets is null).
/// </summary>
/// <param name="t">The term this posting belongs to.</param>
/// <param name="position">Position of the first occurrence.</param>
/// <param name="offset">Offset of the first occurrence, or null for none.</param>
internal Posting(Term t, int position, TermVectorOffsetInfo offset)
{
    term = t;
    freq = 1;
    positions = new int[] { position };
    offsets = (offset != null) ? new TermVectorOffsetInfo[] { offset } : null;
}
/// <summary>
/// Maps a term, combining it with any previous mention of the same term text:
/// the frequency is accumulated and offsets/positions are appended to those
/// already held by the entry.
/// </summary>
/// <param name="term">The term to map.</param>
/// <param name="frequency">The frequency of the term.</param>
/// <param name="offsets">Offset information, may be null.</param>
/// <param name="positions">Position information, may be null.</param>
public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
{
    TermVectorEntry existing = (TermVectorEntry) termToTVE[term];

    if (existing == null)
    {
        // New term: store offsets/positions only if the matching flag is on.
        TermVectorEntry created = new TermVectorEntry(
            ALL,
            term,
            frequency,
            storeOffsets ? offsets : null,
            storePositions ? positions : null);
        termToTVE[term] = created;
        currentSet.Add(created, created);
        return;
    }

    existing.SetFrequency(existing.GetFrequency() + frequency);

    if (storeOffsets)
    {
        TermVectorOffsetInfo[] heldOffsets = existing.GetOffsets();
        bool incomingHasOffsets = offsets != null && offsets.Length > 0;

        if (incomingHasOffsets && heldOffsets != null)
        {
            // Append the new offsets after the ones already held.
            TermVectorOffsetInfo[] combined = new TermVectorOffsetInfo[heldOffsets.Length + offsets.Length];
            Array.Copy(heldOffsets, 0, combined, 0, heldOffsets.Length);
            Array.Copy(offsets, 0, combined, heldOffsets.Length, offsets.Length);
            existing.SetOffsets(combined);
        }
        else if (incomingHasOffsets)
        {
            // Nothing held yet: adopt the incoming array directly.
            existing.SetOffsets(offsets);
        }
        // Otherwise there is nothing to add; leave the entry alone.
    }

    if (storePositions)
    {
        int[] heldPositions = existing.GetPositions();
        bool incomingHasPositions = positions != null && positions.Length > 0;

        if (incomingHasPositions && heldPositions != null)
        {
            int[] combined = new int[heldPositions.Length + positions.Length];
            Array.Copy(heldPositions, 0, combined, 0, heldPositions.Length);
            Array.Copy(positions, 0, combined, heldPositions.Length, positions.Length);
            existing.SetPositions(combined);
        }
        else if (incomingHasPositions)
        {
            existing.SetPositions(positions);
        }
    }
}
// Reusable lookup key, so probing the posting table does not allocate a Term per call.
private Term termBuffer = new Term("", "");

/// <summary>
/// Records one occurrence of a term. A term seen before has the position
/// (and the offset, when supplied) appended to its posting and its frequency
/// incremented; a new term gets a fresh posting.
/// </summary>
/// <param name="field">Field the term occurred in.</param>
/// <param name="text">Term text.</param>
/// <param name="position">Position of this occurrence.</param>
/// <param name="offset">Offset of this occurrence, or null when offsets are not recorded.</param>
private void AddPosition(System.String field, System.String text, int position, TermVectorOffsetInfo offset)
{
    termBuffer.Set(field, text);
    Posting ti = (Posting) postingTable[termBuffer];
    if (ti != null)
    {
        // Word seen before.
        int freq = ti.freq;
        if (ti.positions.Length == freq)
        {
            // Positions array is full: double it and carry the old entries over.
            int[] newPositions = new int[freq * 2];
            System.Array.Copy(ti.positions, 0, newPositions, 0, freq);
            ti.positions = newPositions;
        }
        ti.positions[freq] = position;

        if (offset != null)
        {
            // NOTE(review): if the posting was created without an offset,
            // ti.offsets is null here and this dereference throws - confirm
            // callers never mix null and non-null offsets for one term.
            if (ti.offsets.Length == freq)
            {
                TermVectorOffsetInfo[] newOffsets = new TermVectorOffsetInfo[freq * 2];
                System.Array.Copy(ti.offsets, 0, newOffsets, 0, freq);
                ti.offsets = newOffsets;
            }
            ti.offsets[freq] = offset;
        }
        ti.freq = freq + 1;
    }
    else
    {
        // Word not seen before: termBuffer is mutable, so store under a fresh key.
        Term term = new Term(field, text, false);
        postingTable[term] = new Posting(term, position, offset);
    }
}
/// <summary>
/// Compares by value: two TermVectorOffsetInfos are equal when their start
/// offsets match and their end offsets match.
/// </summary>
/// <param name="o">The comparison object.</param>
/// <returns>
/// true for the same instance or for a TermVectorOffsetInfo with the same
/// start and end offsets; false for null, other types, or differing offsets.
/// </returns>
public override bool Equals(object o)
{
    if (this == o)
    {
        return true;
    }

    if (o is TermVectorOffsetInfo)
    {
        TermVectorOffsetInfo that = (TermVectorOffsetInfo)o;
        return endOffset == that.endOffset && startOffset == that.startOffset;
    }

    return false;
}
/// <summary>
/// Wraps the term data in a <c>TermVectorEntry</c> for the current field and
/// adds it to the current set, using the entry as both key and value.
/// </summary>
/// <param name="term">Term text.</param>
/// <param name="frequency">Frequency of the term.</param>
/// <param name="offsets">Offsets passed through to the entry unchanged.</param>
/// <param name="positions">Positions passed through to the entry unchanged.</param>
public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
{
    TermVectorEntry mapped = new TermVectorEntry(
        currentField,
        term,
        frequency,
        offsets,
        positions);

    currentSet.Add(mapped, mapped);
}
/// <summary>
/// Maps the term vector information for one term into the implementer's own structure.
/// </summary>
/// <param name="term">The term to add to the vector.</param>
/// <param name="frequency">The frequency of the term in the document.</param>
/// <param name="offsets">
/// null if offsets are not specified; otherwise the offsets of the term within the field.
/// </param>
/// <param name="positions">
/// null if positions are not specified; otherwise the positions of the term within the field.
/// </param>
public abstract void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions);
/// <summary>
/// Adds a term with its frequency, positions and offsets. Both a document
/// and a field must be open.
/// </summary>
/// <param name="termText">Text of the term.</param>
/// <param name="freq">Frequency of the term.</param>
/// <param name="positions">Positions passed through to the stored term.</param>
/// <param name="offsets">Offsets passed through to the stored term.</param>
/// <exception cref="System.InvalidOperationException">
/// No document is open, or no field is open. The type derives from
/// <see cref="System.SystemException"/>, so handlers catching that still work.
/// </exception>
public void AddTerm(System.String termText, int freq, int[] positions, TermVectorOffsetInfo[] offsets)
{
    if (!IsDocumentOpen())
    {
        throw new System.InvalidOperationException("Cannot add terms when document is not open");
    }

    if (!IsFieldOpen())
    {
        throw new System.InvalidOperationException("Cannot add terms when field is not open");
    }

    AddTermInternal(termText, freq, positions, offsets);
}
/// <summary>
/// Replaces the stored offsets array with the given one (by reference, no copy).
/// </summary>
/// <param name="value">The new offsets array; null is stored as-is.</param>
internal virtual void SetOffsets(TermVectorOffsetInfo[] value)
{
    this.offsets = value;
}
/// <summary>
/// Builds the fixture: generates positions, offsets and tokens for every test
/// term, writes five identical documents to a new index, and loads the field
/// infos of the newest segment.
/// </summary>
public override void SetUp()
{
    base.SetUp();

    System.Array.Sort(testTerms);

    // One generator for the whole fixture. Creating a new System.Random per
    // iteration can reuse the same time-based seed on some runtimes and so
    // yield identical "random" values.
    System.Random random = new System.Random();

    int tokenUpto = 0;
    for (int i = 0; i < testTerms.Length; i++)
    {
        positions[i] = new int[TERM_FREQ];
        offsets[i] = new TermVectorOffsetInfo[TERM_FREQ];
        // NOTE(review): an earlier comment said the first position must be 0,
        // but the random component puts positions[i][0] anywhere in 0..9 - confirm intent.
        for (int j = 0; j < TERM_FREQ; j++)
        {
            // Positions increase with j: each lands in [j * 10, j * 10 + 9].
            positions[i][j] = (int) (j * 10 + random.NextDouble() * 10);
            // Offsets increase with j as well.
            offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].Length);
            TestToken token = tokens[tokenUpto++] = new TestToken(this);
            token.text = testTerms[i];
            token.pos = positions[i][j];
            token.startOffset = offsets[i][j].GetStartOffset();
            token.endOffset = offsets[i][j].GetEndOffset();
        }
    }
    System.Array.Sort(tokens);

    IndexWriter writer = new IndexWriter(dir, new MyAnalyzer(this), true, IndexWriter.MaxFieldLength.LIMITED);
    writer.SetUseCompoundFile(false);
    Document doc = new Document();
    for (int i = 0; i < testFields.Length; i++)
    {
        // Pick the term-vector mode matching the field's position/offset flags.
        Field.TermVector tv;
        if (testFieldsStorePos[i] && testFieldsStoreOff[i])
        {
            tv = Field.TermVector.WITH_POSITIONS_OFFSETS;
        }
        else if (testFieldsStorePos[i] && !testFieldsStoreOff[i])
        {
            tv = Field.TermVector.WITH_POSITIONS;
        }
        else if (!testFieldsStorePos[i] && testFieldsStoreOff[i])
        {
            tv = Field.TermVector.WITH_OFFSETS;
        }
        else
        {
            tv = Field.TermVector.YES;
        }
        doc.Add(new Field(testFields[i], "", Field.Store.NO, Field.Index.ANALYZED, tv));
    }

    // Create 5 documents for testing; they all have the same terms.
    for (int j = 0; j < 5; j++)
    {
        writer.AddDocument(doc);
    }

    writer.Flush();
    seg = writer.NewestSegment().name;
    writer.Close();

    fieldInfos = new FieldInfos(dir, seg + "." + IndexFileNames.FIELD_INFOS_EXTENSION);
}
/// <summary>
/// Packages the arguments into a new <c>TVTerm</c> and appends it to <c>terms</c>.
/// No validation is performed here.
/// </summary>
/// <param name="termText">Text of the term.</param>
/// <param name="freq">Frequency of the term.</param>
/// <param name="positions">Positions stored on the term by reference.</param>
/// <param name="offsets">Offsets stored on the term by reference.</param>
private void AddTermInternal(System.String termText, int freq, int[] positions, TermVectorOffsetInfo[] offsets)
{
    TVTerm pending = new TVTerm();

    pending.termText = termText;
    pending.freq = freq;
    pending.positions = positions;
    pending.offsets = offsets;

    terms.Add(pending);
}
// Shared lookup key; reused across calls to avoid allocating a Term per lookup.
private Term termBuffer = new Term("", "");

/// <summary>
/// Registers one occurrence of a term in the posting table, creating the
/// posting on first sight and otherwise appending the position (and the
/// offset, if given) and incrementing the frequency.
/// </summary>
/// <param name="field">Field containing the term.</param>
/// <param name="text">Term text.</param>
/// <param name="position">Position of the occurrence.</param>
/// <param name="offset">Offset of the occurrence; null means none is recorded.</param>
private void AddPosition(System.String field, System.String text, int position, TermVectorOffsetInfo offset)
{
    termBuffer.Set(field, text);
    Posting posting = (Posting) postingTable[termBuffer];

    if (posting == null)
    {
        // First occurrence: key the table with a fresh Term, not the shared buffer.
        Term term = new Term(field, text, false);
        postingTable[term] = new Posting(term, position, offset);
        return;
    }

    int count = posting.freq;

    // Grow the positions array by doubling once it is full.
    if (posting.positions.Length == count)
    {
        int[] grownPositions = new int[count * 2];
        Array.Copy(posting.positions, 0, grownPositions, 0, count);
        posting.positions = grownPositions;
    }
    posting.positions[count] = position;

    if (offset != null)
    {
        // Same doubling strategy for the offsets array.
        if (posting.offsets.Length == count)
        {
            TermVectorOffsetInfo[] grownOffsets = new TermVectorOffsetInfo[count * 2];
            Array.Copy(posting.offsets, 0, grownOffsets, 0, count);
            posting.offsets = grownOffsets;
        }
        posting.offsets[count] = offset;
    }

    posting.freq = count + 1;
}
/// <summary>
/// Decodes one field's term vector from the tvf stream, starting at
/// <paramref name="tvfPointer"/>, and hands each term to <paramref name="mapper"/>.
/// </summary>
/// <param name="field">The field to read in.</param>
/// <param name="tvfPointer">The pointer within the tvf file where reading starts.</param>
/// <param name="mapper">The mapper that receives the expectations and then every decoded term.</param>
/// <exception cref="System.IO.IOException">NOTE(review): carried over from the original doc; presumably raised by the stream reads - confirm.</exception>
private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
{
    // Now read the data from the specified position.
    // No FORMAT offset is applied here since the pointer already includes it.
    tvf.Seek(tvfPointer);
    int numTerms = tvf.ReadVInt();
    // No terms: nothing is mapped (the mapper is not even told what to expect).
    // This should never occur.
    if (numTerms == 0)
        return ;
    bool storePositions;
    bool storeOffsets;
    if (format >= FORMAT_VERSION)
    {
        // A flag byte says whether positions and/or offsets follow each term.
        byte bits = tvf.ReadByte();
        storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
        storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
    }
    else
    {
        // Older format: one VInt is read and discarded, and neither positions
        // nor offsets are present. NOTE(review): the meaning of the skipped
        // value is not visible here.
        tvf.ReadVInt();
        storePositions = false;
        storeOffsets = false;
    }
    mapper.SetExpectations(field, numTerms, storeOffsets, storePositions);
    int start = 0;
    int deltaLength = 0;
    int totalLength = 0;
    byte[] byteBuffer;
    char[] charBuffer;
    bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;

    // Init the buffers: exactly one of them is used, depending on the format.
    if (preUTF8)
    {
        charBuffer = new char[10];
        byteBuffer = null;
    }
    else
    {
        charBuffer = null;
        byteBuffer = new byte[20];
    }

    for (int i = 0; i < numTerms; i++)
    {
        // Each term reuses the first 'start' units of the previous term (still
        // in the buffer) and reads 'deltaLength' new units after them.
        start = tvf.ReadVInt();
        deltaLength = tvf.ReadVInt();
        totalLength = start + deltaLength;

        System.String term;

        if (preUTF8)
        {
            // Term stored as chars.
            if (charBuffer.Length < totalLength)
            {
                // Grow to 1.5x the needed size, keeping the shared prefix.
                char[] newCharBuffer = new char[(int) (1.5 * totalLength)];
                Array.Copy(charBuffer, 0, newCharBuffer, 0, start);
                charBuffer = newCharBuffer;
            }
            tvf.ReadChars(charBuffer, start, deltaLength);
            term = new System.String(charBuffer, 0, totalLength);
        }
        else
        {
            // Term stored as UTF-8 bytes.
            if (byteBuffer.Length < totalLength)
            {
                // Grow to 1.5x the needed size, keeping the shared prefix.
                byte[] newByteBuffer = new byte[(int) (1.5 * totalLength)];
                Array.Copy(byteBuffer, 0, newByteBuffer, 0, start);
                byteBuffer = newByteBuffer;
            }
            tvf.ReadBytes(byteBuffer, start, deltaLength);
            term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength);
        }
        int freq = tvf.ReadVInt();
        int[] positions = null;
        if (storePositions)
        {
            // Read in the positions - but does the mapper even care about them?
            if (mapper.IsIgnoringPositions() == false)
            {
                // Each stored value is the difference from the previous position.
                positions = new int[freq];
                int prevPosition = 0;
                for (int j = 0; j < freq; j++)
                {
                    positions[j] = prevPosition + tvf.ReadVInt();
                    prevPosition = positions[j];
                }
            }
            else
            {
                // The positions still have to be consumed: VInts are variable
                // length, so they are read one by one and discarded.
                for (int j = 0; j < freq; j++)
                {
                    tvf.ReadVInt();
                }
            }
        }
        TermVectorOffsetInfo[] offsets = null;
        if (storeOffsets)
        {
            // Does the mapper even care about offsets?
            if (mapper.IsIgnoringOffsets() == false)
            {
                // Start is stored relative to the previous end offset, end
                // relative to this start.
                offsets = new TermVectorOffsetInfo[freq];
                int prevOffset = 0;
                for (int j = 0; j < freq; j++)
                {
                    int startOffset = prevOffset + tvf.ReadVInt();
                    int endOffset = startOffset + tvf.ReadVInt();
                    offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
                    prevOffset = endOffset;
                }
            }
            else
            {
                // Consume and discard both values of every offset pair.
                for (int j = 0; j < freq; j++)
                {
                    tvf.ReadVInt();
                    tvf.ReadVInt();
                }
            }
        }
        mapper.Map(term, freq, offsets, positions);
    }
}
/// <summary>
/// Reads the term vector of one field from the tvf stream at
/// <paramref name="tvfPointer"/> and feeds every term, with its frequency and
/// any positions/offsets, to <paramref name="mapper"/>.
/// </summary>
/// <param name="field">The field to read in.</param>
/// <param name="tvfPointer">The pointer within the tvf file where reading starts.</param>
/// <param name="mapper">The mapper used to map the term vector.</param>
/// <exception cref="System.IO.IOException">NOTE(review): carried over from the original doc; presumably raised by the stream reads - confirm.</exception>
private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
{
    // The pointer already includes the FORMAT offset, so seek to it directly.
    tvf.Seek(tvfPointer);
    int numTerms = tvf.ReadVInt();
    // An empty vector should never occur; if it does, the mapper is not called at all.
    if (numTerms == 0)
    {
        return;
    }
    bool storePositions;
    bool storeOffsets;
    if (format >= FORMAT_VERSION)
    {
        // Bit flags for what was stored alongside each term.
        byte bits = tvf.ReadByte();
        storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
        storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
    }
    else
    {
        // Earlier format: skip one VInt; no positions or offsets are stored.
        // NOTE(review): what the skipped VInt represents is not shown here.
        tvf.ReadVInt();
        storePositions = false;
        storeOffsets = false;
    }
    mapper.SetExpectations(field, numTerms, storeOffsets, storePositions);
    int start = 0;
    int deltaLength = 0;
    int totalLength = 0;
    byte[] byteBuffer;
    char[] charBuffer;
    bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;

    // Only the buffer matching the on-disk term encoding is allocated.
    if (preUTF8)
    {
        charBuffer = new char[10];
        byteBuffer = null;
    }
    else
    {
        charBuffer = null;
        byteBuffer = new byte[20];
    }

    for (int i = 0; i < numTerms; i++)
    {
        // 'start' units are shared with the previous term and remain in the
        // buffer; 'deltaLength' new units are read in behind them.
        start = tvf.ReadVInt();
        deltaLength = tvf.ReadVInt();
        totalLength = start + deltaLength;

        System.String term;

        if (preUTF8)
        {
            // Term stored as chars.
            if (charBuffer.Length < totalLength)
            {
                // Over-allocate by 50% and preserve the shared prefix.
                char[] newCharBuffer = new char[(int)(1.5 * totalLength)];
                Array.Copy(charBuffer, 0, newCharBuffer, 0, start);
                charBuffer = newCharBuffer;
            }
            tvf.ReadChars(charBuffer, start, deltaLength);
            term = new System.String(charBuffer, 0, totalLength);
        }
        else
        {
            // Term stored as UTF-8 bytes.
            if (byteBuffer.Length < totalLength)
            {
                // Over-allocate by 50% and preserve the shared prefix.
                byte[] newByteBuffer = new byte[(int)(1.5 * totalLength)];
                Array.Copy(byteBuffer, 0, newByteBuffer, 0, start);
                byteBuffer = newByteBuffer;
            }
            tvf.ReadBytes(byteBuffer, start, deltaLength);
            term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength);
        }
        int freq = tvf.ReadVInt();
        int[] positions = null;
        if (storePositions)
        {
            // Materialize positions only when the mapper wants them.
            if (mapper.IsIgnoringPositions() == false)
            {
                // Values on disk are deltas from the previous position.
                positions = new int[freq];
                int prevPosition = 0;
                for (int j = 0; j < freq; j++)
                {
                    positions[j] = prevPosition + tvf.ReadVInt();
                    prevPosition = positions[j];
                }
            }
            else
            {
                // VInts are variable length, so skipping means reading each
                // one and throwing it away.
                for (int j = 0; j < freq; j++)
                {
                    tvf.ReadVInt();
                }
            }
        }
        TermVectorOffsetInfo[] offsets = null;
        if (storeOffsets)
        {
            // Materialize offsets only when the mapper wants them.
            if (mapper.IsIgnoringOffsets() == false)
            {
                // Start is a delta from the previous end; end is a delta from this start.
                offsets = new TermVectorOffsetInfo[freq];
                int prevOffset = 0;
                for (int j = 0; j < freq; j++)
                {
                    int startOffset = prevOffset + tvf.ReadVInt();
                    int endOffset = startOffset + tvf.ReadVInt();
                    offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
                    prevOffset = endOffset;
                }
            }
            else
            {
                // Skip both VInts of each offset pair.
                for (int j = 0; j < freq; j++)
                {
                    tvf.ReadVInt();
                    tvf.ReadVInt();
                }
            }
        }
        mapper.Map(term, freq, offsets, positions);
    }
}
/// <summary>
/// Allocates the fixture arrays: one positions row and one offsets row per
/// test term, and one token slot per term occurrence
/// (<c>testTerms.Length * TERM_FREQ</c>). The rows themselves are left null.
/// </summary>
private void InitBlock()
{
    int termCount = testTerms.Length;

    positions = new int[termCount][];
    offsets = new TermVectorOffsetInfo[termCount][];
    tokens = new TestToken[termCount * TERM_FREQ];
}
/// <summary>
/// Replaces the stored offsets array with the supplied one. The array is kept
/// by reference; no copy is made.
/// </summary>
/// <param name="offsets">The new offsets array; null is stored as-is.</param>
internal virtual void SetOffsets(TermVectorOffsetInfo[] offsets)
{
    // 'this.' is required: the parameter shadows the field of the same name.
    this.offsets = offsets;
}
/// <summary>
/// Verifies that a document number has been set before a term is mapped.
/// The term data itself is ignored.
/// </summary>
/// <param name="term">Not used.</param>
/// <param name="frequency">Not used.</param>
/// <param name="offsets">Not used.</param>
/// <param name="positions">Not used.</param>
/// <exception cref="System.InvalidOperationException">
/// <c>documentNumber</c> is still -1. The type derives from
/// <see cref="System.SystemException"/>, so handlers catching that still work.
/// </exception>
public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
{
    if (documentNumber == -1)
    {
        throw new System.InvalidOperationException("Documentnumber should be set at this point!");
    }
}
/// <summary>
/// Decodes one field's term vector from the tvf stream, starting at
/// <paramref name="tvfPointer"/>, into a term vector object.
/// </summary>
/// <param name="field">The field to read in.</param>
/// <param name="tvfPointer">The pointer within the tvf file where reading starts.</param>
/// <returns>
/// A <c>SegmentTermPositionVector</c> when positions or offsets were stored,
/// otherwise a <c>SegmentTermVector</c>; for a vector with no terms, a
/// <c>SegmentTermVector</c> built with null terms and frequencies.
/// </returns>
/// <exception cref="System.IO.IOException">NOTE(review): carried over from the original doc; presumably raised by the stream reads - confirm.</exception>
private SegmentTermVector ReadTermVector(System.String field, long tvfPointer)
{
    // Now read the data from the specified position.
    // No FORMAT offset is applied here since the pointer already includes it.
    tvf.Seek(tvfPointer);
    int numTerms = tvf.ReadVInt();
    // No terms: return an empty term vector. This should never occur.
    if (numTerms == 0)
    {
        return(new SegmentTermVector(field, null, null));
    }
    bool storePositions;
    bool storeOffsets;
    if (tvfFormat == TermVectorsWriter.FORMAT_VERSION)
    {
        // A flag byte says whether positions and/or offsets follow each term.
        byte bits = tvf.ReadByte();
        storePositions = (bits & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0;
        storeOffsets = (bits & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0;
    }
    else
    {
        // Other format: one VInt is read and discarded; no positions or offsets.
        // NOTE(review): the meaning of the skipped value is not visible here.
        tvf.ReadVInt();
        storePositions = false;
        storeOffsets = false;
    }
    System.String[] terms = new System.String[numTerms];
    int[] termFreqs = new int[numTerms];

    // These may not be needed, but declare them; allocated only when stored.
    int[][] positions = null;
    TermVectorOffsetInfo[][] offsets = null;
    if (storePositions)
    {
        positions = new int[numTerms][];
    }
    if (storeOffsets)
    {
        offsets = new TermVectorOffsetInfo[numTerms][];
    }

    int start = 0;
    int deltaLength = 0;
    int totalLength = 0;
    char[] buffer = new char[10]; // initial capacity of 10 characters
    char[] previousBuffer = new char[] {};

    for (int i = 0; i < numTerms; i++)
    {
        // Each term reuses the first 'start' chars of the previous term and
        // reads 'deltaLength' new chars after them.
        start = tvf.ReadVInt();
        deltaLength = tvf.ReadVInt();
        totalLength = start + deltaLength;
        if (buffer.Length < totalLength)
        {
            // Increase the buffer to exactly the needed length.
            buffer = null; // give a hint to the garbage collector
            buffer = new char[totalLength];

            if (start > 0)
            {
                // Carry the shared prefix over from the previous buffer.
                Array.Copy(previousBuffer, 0, buffer, 0, start);
            }
        }

        tvf.ReadChars(buffer, start, deltaLength);
        terms[i] = new System.String(buffer, 0, totalLength);
        // Remember the buffer holding this term so a later reallocation can
        // copy its prefix.
        previousBuffer = buffer;
        int freq = tvf.ReadVInt();
        termFreqs[i] = freq;

        if (storePositions)
        {
            // Read in the positions; each stored value is a delta from the previous one.
            int[] pos = new int[freq];
            positions[i] = pos;
            int prevPosition = 0;
            for (int j = 0; j < freq; j++)
            {
                pos[j] = prevPosition + tvf.ReadVInt();
                prevPosition = pos[j];
            }
        }

        if (storeOffsets)
        {
            // Start is stored relative to the previous end offset, end relative to this start.
            TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[freq];
            offsets[i] = offs;
            int prevOffset = 0;
            for (int j = 0; j < freq; j++)
            {
                int startOffset = prevOffset + tvf.ReadVInt();
                int endOffset = startOffset + tvf.ReadVInt();
                offs[j] = new TermVectorOffsetInfo(startOffset, endOffset);
                prevOffset = endOffset;
            }
        }
    }

    // Pick the richer vector type only when there is position/offset data to carry.
    SegmentTermVector tv;
    if (storePositions || storeOffsets)
    {
        tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
    }
    else
    {
        tv = new SegmentTermVector(field, terms, termFreqs);
    }
    return(tv);
}
/// <summary>
/// Decodes one field's term vector from the tvf stream, starting at
/// <paramref name="tvfPointer"/>, and passes each term to <paramref name="mapper"/>.
/// Nothing is returned; results are delivered through the mapper.
/// </summary>
/// <param name="field">The field to read in.</param>
/// <param name="tvfPointer">The pointer within the tvf file where reading starts.</param>
/// <param name="mapper">The mapper used to map the term vector.</param>
/// <exception cref="System.IO.IOException">NOTE(review): carried over from the original doc; presumably raised by the stream reads - confirm.</exception>
private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
{
    // Now read the data from the specified position.
    // No FORMAT offset is applied here since the pointer already includes it.
    tvf.Seek(tvfPointer);
    int numTerms = tvf.ReadVInt();
    // No terms: return without calling the mapper. This should never occur.
    if (numTerms == 0)
        return ;
    bool storePositions;
    bool storeOffsets;
    if (tvfFormat == FORMAT_VERSION)
    {
        // A flag byte says whether positions and/or offsets follow each term.
        byte bits = tvf.ReadByte();
        storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
        storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
    }
    else
    {
        // Other format: one VInt is read and discarded; no positions or offsets.
        // NOTE(review): the meaning of the skipped value is not visible here.
        tvf.ReadVInt();
        storePositions = false;
        storeOffsets = false;
    }
    mapper.SetExpectations(field, numTerms, storeOffsets, storePositions);
    int start = 0;
    int deltaLength = 0;
    int totalLength = 0;
    char[] buffer = new char[10]; // initial capacity of 10 characters
    char[] previousBuffer = new char[]{};

    for (int i = 0; i < numTerms; i++)
    {
        // Each term reuses the first 'start' chars of the previous term and
        // reads 'deltaLength' new chars after them.
        start = tvf.ReadVInt();
        deltaLength = tvf.ReadVInt();
        totalLength = start + deltaLength;
        if (buffer.Length < totalLength)
        {
            // Increase the buffer to exactly the needed length.
            buffer = null; // give a hint to the garbage collector
            buffer = new char[totalLength];

            if (start > 0)
                // Carry the shared prefix over from the previous buffer.
                Array.Copy(previousBuffer, 0, buffer, 0, start);
        }

        tvf.ReadChars(buffer, start, deltaLength);
        System.String term = new System.String(buffer, 0, totalLength);
        // Remember the buffer holding this term so a later reallocation can
        // copy its prefix.
        previousBuffer = buffer;
        int freq = tvf.ReadVInt();
        int[] positions = null;
        if (storePositions)
        {
            // Read in the positions - but does the mapper even care about them?
            if (mapper.IsIgnoringPositions() == false)
            {
                // Each stored value is a delta from the previous position.
                positions = new int[freq];
                int prevPosition = 0;
                for (int j = 0; j < freq; j++)
                {
                    positions[j] = prevPosition + tvf.ReadVInt();
                    prevPosition = positions[j];
                }
            }
            else
            {
                // The positions still have to be consumed: VInts are variable
                // length, so they are read one by one and discarded.
                for (int j = 0; j < freq; j++)
                {
                    tvf.ReadVInt();
                }
            }
        }
        TermVectorOffsetInfo[] offsets = null;
        if (storeOffsets)
        {
            // Does the mapper even care about offsets?
            if (mapper.IsIgnoringOffsets() == false)
            {
                // Start is stored relative to the previous end offset, end relative to this start.
                offsets = new TermVectorOffsetInfo[freq];
                int prevOffset = 0;
                for (int j = 0; j < freq; j++)
                {
                    int startOffset = prevOffset + tvf.ReadVInt();
                    int endOffset = startOffset + tvf.ReadVInt();
                    offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
                    prevOffset = endOffset;
                }
            }
            else
            {
                // Consume and discard both values of every offset pair.
                for (int j = 0; j < freq; j++)
                {
                    tvf.ReadVInt();
                    tvf.ReadVInt();
                }
            }
        }
        mapper.Map(term, freq, offsets, positions);
    }
}
/// <summary>
/// Reads the term vector of one field from the tvf stream at
/// <paramref name="tvfPointer"/> and returns it as a term vector object.
/// </summary>
/// <param name="field">The field to read in.</param>
/// <param name="tvfPointer">The pointer within the tvf file where reading starts.</param>
/// <returns>
/// A <c>SegmentTermPositionVector</c> when positions or offsets were stored,
/// otherwise a <c>SegmentTermVector</c>; for a vector with no terms, a
/// <c>SegmentTermVector</c> built with null terms and frequencies.
/// </returns>
/// <exception cref="System.IO.IOException">NOTE(review): carried over from the original doc; presumably raised by the stream reads - confirm.</exception>
private SegmentTermVector ReadTermVector(System.String field, long tvfPointer)
{
    // The pointer already includes the FORMAT offset, so seek to it directly.
    tvf.Seek(tvfPointer);
    int numTerms = tvf.ReadVInt();
    // An empty vector should never occur; if it does, return an empty term vector.
    if (numTerms == 0)
        return new SegmentTermVector(field, null, null);
    bool storePositions;
    bool storeOffsets;
    if (tvfFormat == TermVectorsWriter.FORMAT_VERSION)
    {
        // Bit flags for what was stored alongside each term.
        byte bits = tvf.ReadByte();
        storePositions = (bits & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0;
        storeOffsets = (bits & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0;
    }
    else
    {
        // Other format: skip one VInt; no positions or offsets are stored.
        // NOTE(review): what the skipped VInt represents is not shown here.
        tvf.ReadVInt();
        storePositions = false;
        storeOffsets = false;
    }
    System.String[] terms = new System.String[numTerms];
    int[] termFreqs = new int[numTerms];

    // Allocated only when the corresponding data was stored.
    int[][] positions = null;
    TermVectorOffsetInfo[][] offsets = null;
    if (storePositions)
        positions = new int[numTerms][];
    if (storeOffsets)
        offsets = new TermVectorOffsetInfo[numTerms][];

    int start = 0;
    int deltaLength = 0;
    int totalLength = 0;
    char[] buffer = new char[10]; // initial capacity of 10 characters
    char[] previousBuffer = new char[]{};

    for (int i = 0; i < numTerms; i++)
    {
        // 'start' chars are shared with the previous term; 'deltaLength' new
        // chars are read in behind them.
        start = tvf.ReadVInt();
        deltaLength = tvf.ReadVInt();
        totalLength = start + deltaLength;
        if (buffer.Length < totalLength)
        {
            // Grow the buffer to exactly the needed length.
            buffer = null; // give a hint to the garbage collector
            buffer = new char[totalLength];

            if (start > 0)
                // Preserve the shared prefix from the previous buffer.
                Array.Copy(previousBuffer, 0, buffer, 0, start);
        }

        tvf.ReadChars(buffer, start, deltaLength);
        terms[i] = new System.String(buffer, 0, totalLength);
        // Keep a handle on the buffer holding this term for the next prefix copy.
        previousBuffer = buffer;
        int freq = tvf.ReadVInt();
        termFreqs[i] = freq;

        if (storePositions)
        {
            // Values on disk are deltas from the previous position.
            int[] pos = new int[freq];
            positions[i] = pos;
            int prevPosition = 0;
            for (int j = 0; j < freq; j++)
            {
                pos[j] = prevPosition + tvf.ReadVInt();
                prevPosition = pos[j];
            }
        }

        if (storeOffsets)
        {
            // Start is a delta from the previous end; end is a delta from this start.
            TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[freq];
            offsets[i] = offs;
            int prevOffset = 0;
            for (int j = 0; j < freq; j++)
            {
                int startOffset = prevOffset + tvf.ReadVInt();
                int endOffset = startOffset + tvf.ReadVInt();
                offs[j] = new TermVectorOffsetInfo(startOffset, endOffset);
                prevOffset = endOffset;
            }
        }
    }

    // Use the position-aware vector type only when there is such data to carry.
    SegmentTermVector tv;
    if (storePositions || storeOffsets)
    {
        tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
    }
    else
    {
        tv = new SegmentTermVector(field, terms, termFreqs);
    }
    return tv;
}
/// <summary>
/// Stores the term and its frequency in the next free slot of the parallel
/// arrays, plus its offsets and/or positions when those are being stored,
/// then advances the slot counter.
/// </summary>
/// <param name="term">Term text written to <c>terms</c>.</param>
/// <param name="frequency">Frequency written to <c>termFreqs</c>.</param>
/// <param name="offsets">Written to the offsets table only when <c>storingOffsets</c> is set.</param>
/// <param name="positions">Written to the positions table only when <c>storingPositions</c> is set.</param>
public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
{
    int slot = currentPosition;

    terms[slot] = term;
    termFreqs[slot] = frequency;

    if (storingOffsets)
    {
        this.offsets[slot] = offsets;
    }

    if (storingPositions)
    {
        this.positions[slot] = positions;
    }

    currentPosition = slot + 1;
}
/// <summary>
/// Test fixture setup: builds the expected positions/offsets/tokens for every
/// test term, then writes five identical documents (one field per entry of
/// <c>testFields</c>, each with its configured term-vector option) into
/// <c>dir</c> and loads the resulting segment's field infos.
/// </summary>
public override void SetUp()
{
    base.SetUp();
    /*
    * for (int i = 0; i < testFields.length; i++) {
    * fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
    * }
    */

    System.Array.Sort(testTerms);

    // Use ONE generator for the whole setup. Creating a new System.Random for
    // every value can yield identical numbers on runtimes that seed from the
    // clock, and allocates needlessly.
    System.Random rnd = new System.Random();

    int tokenUpto = 0;
    for (int i = 0; i < testTerms.Length; i++)
    {
        positions[i] = new int[TERM_FREQ];
        offsets[i] = new TermVectorOffsetInfo[TERM_FREQ];

        for (int j = 0; j < TERM_FREQ; j++)
        {
            // positions are always sorted in increasing order:
            // occurrence j lands somewhere in [j * 10, j * 10 + 9]
            positions[i][j] = (int)(j * 10 + rnd.NextDouble() * 10);

            // offsets are always sorted in increasing order
            offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].Length);

            TestToken token = tokens[tokenUpto++] = new TestToken(this);
            token.text = testTerms[i];
            token.pos = positions[i][j];
            token.startOffset = offsets[i][j].StartOffset;
            token.endOffset = offsets[i][j].EndOffset;
        }
    }
    System.Array.Sort(tokens);

    IndexWriter writer = new IndexWriter(dir, new MyAnalyzer(this), true, IndexWriter.MaxFieldLength.LIMITED);
    writer.UseCompoundFile = false;

    Document doc = new Document();
    for (int i = 0; i < testFields.Length; i++)
    {
        // Pick the term-vector option matching the (positions, offsets) flags of this field.
        Field.TermVector tv;
        if (testFieldsStorePos[i] && testFieldsStoreOff[i])
        {
            tv = Field.TermVector.WITH_POSITIONS_OFFSETS;
        }
        else if (testFieldsStorePos[i])
        {
            tv = Field.TermVector.WITH_POSITIONS;
        }
        else if (testFieldsStoreOff[i])
        {
            tv = Field.TermVector.WITH_OFFSETS;
        }
        else
        {
            tv = Field.TermVector.YES;
        }
        // The field value is empty; the analyzer passed to the writer is MyAnalyzer.
        doc.Add(new Field(testFields[i], "", Field.Store.NO, Field.Index.ANALYZED, tv));
    }

    // Create 5 documents for testing, they all have the same terms
    for (int j = 0; j < 5; j++)
    {
        writer.AddDocument(doc);
    }
    writer.Commit();
    seg = writer.NewestSegment().name;
    writer.Close();

    fieldInfos = new FieldInfos(dir, seg + "." + IndexFileNames.FIELD_INFOS_EXTENSION);
}
/// <summary>
/// Creates a posting for the first occurrence of a term: frequency 1, a single
/// position, and (optionally) a single offset.
/// </summary>
/// <param name="t">The term this posting belongs to</param>
/// <param name="position">The position of the first occurrence</param>
/// <param name="offset">The offset of the first occurrence, or null to leave <c>offsets</c> null</param>
internal Posting(Term t, int position, TermVectorOffsetInfo offset)
{
    term = t;
    freq = 1;
    positions = new int[] { position };
    offsets = (offset != null) ? new TermVectorOffsetInfo[] { offset } : null;
}
/// <summary>
/// Creates a term vector that additionally carries per-term positions and offsets.
/// The field, terms and frequencies are forwarded to the base constructor; the
/// positions and offsets arrays are stored as given (not copied).
/// </summary>
/// <param name="field">Passed through to the base constructor</param>
/// <param name="terms">Passed through to the base constructor</param>
/// <param name="termFreqs">Passed through to the base constructor</param>
/// <param name="positions">Array of position arrays stored on this instance</param>
/// <param name="offsets">Array of offset arrays stored on this instance</param>
public SegmentTermPositionVector(
    System.String field,
    System.String[] terms,
    int[] termFreqs,
    int[][] positions,
    TermVectorOffsetInfo[][] offsets)
    : base(field, terms, termFreqs)
{
    this.positions = positions;
    this.offsets = offsets;
}
/// <summary>
/// Reads the term vector of a single field from the tvf stream, starting at the
/// given pointer, and feeds it term by term to the supplied mapper. Nothing is
/// returned; if the vector has no terms the mapper is never called.
/// </summary>
/// <param name="field">The field to read in</param>
/// <param name="tvfPointer">The pointer within the tvf file where we should start reading</param>
/// <param name="mapper">The mapper used to map the TermVector</param>
/// <exception cref="System.IO.IOException">Listed by the original documentation as the failure mode of this read.</exception>
private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
{
    // The pointer already includes the FORMAT offset, so we can seek to it directly.
    tvf.Seek(tvfPointer);

    int termCount = tvf.ReadVInt();
    if (termCount == 0)
    {
        // No terms: nothing to map. This should never occur in practice.
        return;
    }

    bool storePositions = false;
    bool storeOffsets = false;
    if (tvfFormat == FORMAT_VERSION)
    {
        // A single flag byte tells us which optional per-term data follows.
        byte flags = tvf.ReadByte();
        storePositions = (flags & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
        storeOffsets = (flags & STORE_OFFSET_WITH_TERMVECTOR) != 0;
    }
    else
    {
        // Any other format: one VInt is consumed and discarded, and neither
        // positions nor offsets are read.
        tvf.ReadVInt();
    }

    mapper.SetExpectations(field, termCount, storeOffsets, storePositions);

    char[] buffer = new char[10]; // initial capacity of 10 characters
    char[] previousBuffer = new char[0];

    for (int termIndex = 0; termIndex < termCount; termIndex++)
    {
        // Term text is stored as (shared prefix length, number of new chars, new chars).
        int start = tvf.ReadVInt();
        int deltaLength = tvf.ReadVInt();
        int totalLength = start + deltaLength;

        if (totalLength > buffer.Length)
        {
            // Grow the buffer; carry over the shared prefix from the previous term if there is one.
            buffer = new char[totalLength];
            if (start > 0)
            {
                Array.Copy(previousBuffer, 0, buffer, 0, start);
            }
        }

        tvf.ReadChars(buffer, start, deltaLength);
        System.String term = new System.String(buffer, 0, totalLength);
        previousBuffer = buffer;

        int freq = tvf.ReadVInt();

        int[] positions = null;
        if (storePositions)
        {
            if (mapper.IsIgnoringPositions())
            {
                // The mapper does not want positions, but they are VInts of unknown
                // byte length, so the only way to skip them is to read each one.
                for (int k = 0; k < freq; k++)
                {
                    tvf.ReadVInt();
                }
            }
            else
            {
                // Each stored value is a delta from the previous position.
                positions = new int[freq];
                int runningPosition = 0;
                for (int k = 0; k < freq; k++)
                {
                    runningPosition += tvf.ReadVInt();
                    positions[k] = runningPosition;
                }
            }
        }

        TermVectorOffsetInfo[] offsets = null;
        if (storeOffsets)
        {
            if (mapper.IsIgnoringOffsets())
            {
                // Skip the (start delta, end delta) pair of every occurrence.
                for (int k = 0; k < freq; k++)
                {
                    tvf.ReadVInt();
                    tvf.ReadVInt();
                }
            }
            else
            {
                // Start is a delta from the previous end offset; end is a delta from start.
                offsets = new TermVectorOffsetInfo[freq];
                int lastEnd = 0;
                for (int k = 0; k < freq; k++)
                {
                    int startOffset = lastEnd + tvf.ReadVInt();
                    int endOffset = startOffset + tvf.ReadVInt();
                    offsets[k] = new TermVectorOffsetInfo(startOffset, endOffset);
                    lastEnd = endOffset;
                }
            }
        }

        mapper.Map(term, freq, offsets, positions);
    }
}