public TermVectorEntry(System.String field, System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { this.field = field; this.term = term; this.frequency = frequency; this.offsets = offsets; this.positions = positions; }
internal virtual void addTerm(System.String term, TermVectorOffsetInfo info) { terms.Add(term); if (offsets != null) { offsets.Add(info); } }
/// <summary> Callback for the TermVectorReader. </summary> /// <param name="term"> /// </param> /// <param name="frequency"> /// </param> /// <param name="offsets"> /// </param> /// <param name="positions"> /// </param> public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { for (int i = 0; i < positions.Length; i++) { System.Int32 posVal = (System.Int32) positions[i]; TVPositionInfo pos = (TVPositionInfo) currentPositions[posVal]; if (pos == null) { pos = new TVPositionInfo(positions[i], storeOffsets); currentPositions[posVal] = pos; } pos.addTerm(term, offsets != null?offsets[i]:null); } }
/// <summary> </summary> /// <param name="term">The term to map /// </param> /// <param name="frequency">The frequency of the term /// </param> /// <param name="offsets">Offset information, may be null /// </param> /// <param name="positions">Position information, may be null /// </param> //We need to combine any previous mentions of the term public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { TermVectorEntry entry = (TermVectorEntry)termToTVE[term]; if (entry == null) { entry = new TermVectorEntry(ALL, term, frequency, storeOffsets == true?offsets:null, storePositions == true?positions:null); termToTVE[term] = entry; currentSet.Add(entry, entry); } else { entry.SetFrequency(entry.GetFrequency() + frequency); if (storeOffsets) { TermVectorOffsetInfo[] existingOffsets = entry.GetOffsets(); //A few diff. cases here: offsets is null, existing offsets is null, both are null, same for positions if (existingOffsets != null && offsets != null && offsets.Length > 0) { //copy over the existing offsets TermVectorOffsetInfo[] newOffsets = new TermVectorOffsetInfo[existingOffsets.Length + offsets.Length]; Array.Copy(existingOffsets, 0, newOffsets, 0, existingOffsets.Length); Array.Copy(offsets, 0, newOffsets, existingOffsets.Length, offsets.Length); entry.SetOffsets(newOffsets); } else if (existingOffsets == null && offsets != null && offsets.Length > 0) { entry.SetOffsets(offsets); } //else leave it alone } if (storePositions) { int[] existingPositions = entry.GetPositions(); if (existingPositions != null && positions != null && positions.Length > 0) { int[] newPositions = new int[existingPositions.Length + positions.Length]; Array.Copy(existingPositions, 0, newPositions, 0, existingPositions.Length); Array.Copy(positions, 0, newPositions, existingPositions.Length, positions.Length); entry.SetPositions(newPositions); } else if (existingPositions == null && positions != null && positions.Length > 0) { entry.SetPositions(positions); } } } }
/// <summary> </summary> /// <param name="term">The term to map /// </param> /// <param name="frequency">The frequency of the term /// </param> /// <param name="offsets">Offset information, may be null /// </param> /// <param name="positions">Position information, may be null /// </param> //We need to combine any previous mentions of the term public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { TermVectorEntry entry = (TermVectorEntry) termToTVE[term]; if (entry == null) { entry = new TermVectorEntry(ALL, term, frequency, storeOffsets == true?offsets:null, storePositions == true?positions:null); termToTVE[term] = entry; currentSet.Add(entry, entry); } else { entry.SetFrequency(entry.GetFrequency() + frequency); if (storeOffsets) { TermVectorOffsetInfo[] existingOffsets = entry.GetOffsets(); //A few diff. cases here: offsets is null, existing offsets is null, both are null, same for positions if (existingOffsets != null && offsets != null && offsets.Length > 0) { //copy over the existing offsets TermVectorOffsetInfo[] newOffsets = new TermVectorOffsetInfo[existingOffsets.Length + offsets.Length]; Array.Copy(existingOffsets, 0, newOffsets, 0, existingOffsets.Length); Array.Copy(offsets, 0, newOffsets, existingOffsets.Length, offsets.Length); entry.SetOffsets(newOffsets); } else if (existingOffsets == null && offsets != null && offsets.Length > 0) { entry.SetOffsets(offsets); } //else leave it alone } if (storePositions) { int[] existingPositions = entry.GetPositions(); if (existingPositions != null && positions != null && positions.Length > 0) { int[] newPositions = new int[existingPositions.Length + positions.Length]; Array.Copy(existingPositions, 0, newPositions, 0, existingPositions.Length); Array.Copy(positions, 0, newPositions, existingPositions.Length, positions.Length); entry.SetPositions(newPositions); } else if (existingPositions == null && positions != null && positions.Length > 0) { entry.SetPositions(positions); } } } }
/// <summary> Two TermVectorOffsetInfos are equals if both the start and end offsets are the same</summary> /// <param name="o">The comparison Object /// </param> /// <returns> true if both {@link #GetStartOffset()} and {@link #GetEndOffset()} are the same for both objects. /// </returns> public override bool Equals(System.Object o) { if (this == o) { return(true); } if (!(o is TermVectorOffsetInfo)) { return(false); } TermVectorOffsetInfo termVectorOffsetInfo = (TermVectorOffsetInfo)o; if (endOffset != termVectorOffsetInfo.endOffset) { return(false); } if (startOffset != termVectorOffsetInfo.startOffset) { return(false); } return(true); }
/// <summary> </summary> /// <param name="field">The field to read in /// </param> /// <param name="tvfPointer">The pointer within the tvf file where we should start reading /// </param> /// <param name="mapper">The mapper used to map the TermVector /// </param> /// <throws> IOException </throws> private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper) { // Now read the data from specified position //We don't need to offset by the FORMAT here since the pointer already includes the offset tvf.Seek(tvfPointer); int numTerms = tvf.ReadVInt(); //System.out.println("Num Terms: " + numTerms); // If no terms - return a constant empty termvector. However, this should never occur! if (numTerms == 0) { return; } bool storePositions; bool storeOffsets; if (format >= FORMAT_VERSION) { byte bits = tvf.ReadByte(); storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; } else { tvf.ReadVInt(); storePositions = false; storeOffsets = false; } mapper.SetExpectations(field, numTerms, storeOffsets, storePositions); int start = 0; int deltaLength = 0; int totalLength = 0; byte[] byteBuffer; char[] charBuffer; bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES; // init the buffers if (preUTF8) { charBuffer = new char[10]; byteBuffer = null; } else { charBuffer = null; byteBuffer = new byte[20]; } for (int i = 0; i < numTerms; i++) { start = tvf.ReadVInt(); deltaLength = tvf.ReadVInt(); totalLength = start + deltaLength; System.String term; if (preUTF8) { // Term stored as java chars if (charBuffer.Length < totalLength) { char[] newCharBuffer = new char[(int)(1.5 * totalLength)]; Array.Copy(charBuffer, 0, newCharBuffer, 0, start); charBuffer = newCharBuffer; } tvf.ReadChars(charBuffer, start, deltaLength); term = new System.String(charBuffer, 0, totalLength); } else { // Term stored as utf8 bytes if (byteBuffer.Length < totalLength) { byte[] newByteBuffer = new byte[(int)(1.5 * totalLength)]; Array.Copy(byteBuffer, 0, newByteBuffer, 0, start); byteBuffer = newByteBuffer; } tvf.ReadBytes(byteBuffer, start, deltaLength); term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength); } int freq = tvf.ReadVInt(); int[] positions = null; if (storePositions) { //read in the positions //does the mapper even care about positions? if (mapper.IsIgnoringPositions() == false) { positions = new int[freq]; int prevPosition = 0; for (int j = 0; j < freq; j++) { positions[j] = prevPosition + tvf.ReadVInt(); prevPosition = positions[j]; } } else { //we need to skip over the positions. Since these are VInts, I don't believe there is anyway to know for sure how far to skip // for (int j = 0; j < freq; j++) { tvf.ReadVInt(); } } } TermVectorOffsetInfo[] offsets = null; if (storeOffsets) { //does the mapper even care about offsets? if (mapper.IsIgnoringOffsets() == false) { offsets = new TermVectorOffsetInfo[freq]; int prevOffset = 0; for (int j = 0; j < freq; j++) { int startOffset = prevOffset + tvf.ReadVInt(); int endOffset = startOffset + tvf.ReadVInt(); offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset); prevOffset = endOffset; } } else { for (int j = 0; j < freq; j++) { tvf.ReadVInt(); tvf.ReadVInt(); } } } mapper.Map(term, freq, offsets, positions); } }
/// <summary> Map the Term Vector information into your own structure</summary> /// <param name="term">The term to add to the vector /// </param> /// <param name="frequency">The frequency of the term in the document /// </param> /// <param name="offsets">null if the offset is not specified, otherwise the offset into the field of the term /// </param> /// <param name="positions">null if the position is not specified, otherwise the position in the field of the term /// </param> public abstract void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions);
internal virtual void SetOffsets(TermVectorOffsetInfo[] offsets) { this.offsets = offsets; }
public SegmentTermPositionVector(System.String field, System.String[] terms, int[] termFreqs, int[][] positions, TermVectorOffsetInfo[][] offsets):base(field, terms, termFreqs) { this.offsets = offsets; this.positions = positions; }
public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { TermVectorEntry entry = new TermVectorEntry(currentField, term, frequency, offsets, positions); currentSet.Add(entry, entry); }
public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { terms[currentPosition] = term; termFreqs[currentPosition] = frequency; if (storingOffsets) { this.offsets[currentPosition] = offsets; } if (storingPositions) { this.positions[currentPosition] = positions; } currentPosition++; }
/// <summary> </summary> /// <param name="field">The field to read in /// </param> /// <param name="tvfPointer">The pointer within the tvf file where we should start reading /// </param> /// <param name="mapper">The mapper used to map the TermVector /// </param> /// <throws> IOException </throws> private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper) { // Now read the data from specified position //We don't need to offset by the FORMAT here since the pointer already includes the offset tvf.Seek(tvfPointer); int numTerms = tvf.ReadVInt(); //System.out.println("Num Terms: " + numTerms); // If no terms - return a constant empty termvector. However, this should never occur! if (numTerms == 0) return ; bool storePositions; bool storeOffsets; if (format >= FORMAT_VERSION) { byte bits = tvf.ReadByte(); storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; } else { tvf.ReadVInt(); storePositions = false; storeOffsets = false; } mapper.SetExpectations(field, numTerms, storeOffsets, storePositions); int start = 0; int deltaLength = 0; int totalLength = 0; byte[] byteBuffer; char[] charBuffer; bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES; // init the buffers if (preUTF8) { charBuffer = new char[10]; byteBuffer = null; } else { charBuffer = null; byteBuffer = new byte[20]; } for (int i = 0; i < numTerms; i++) { start = tvf.ReadVInt(); deltaLength = tvf.ReadVInt(); totalLength = start + deltaLength; System.String term; if (preUTF8) { // Term stored as java chars if (charBuffer.Length < totalLength) { char[] newCharBuffer = new char[(int) (1.5 * totalLength)]; Array.Copy(charBuffer, 0, newCharBuffer, 0, start); charBuffer = newCharBuffer; } tvf.ReadChars(charBuffer, start, deltaLength); term = new System.String(charBuffer, 0, totalLength); } else { // Term stored as utf8 bytes if (byteBuffer.Length < totalLength) { byte[] newByteBuffer = new byte[(int) (1.5 * totalLength)]; Array.Copy(byteBuffer, 0, newByteBuffer, 0, start); byteBuffer = newByteBuffer; } tvf.ReadBytes(byteBuffer, start, deltaLength); term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength); } int freq = tvf.ReadVInt(); int[] positions = null; if (storePositions) { //read in the positions //does the mapper even care about positions? if (mapper.IsIgnoringPositions() == false) { positions = new int[freq]; int prevPosition = 0; for (int j = 0; j < freq; j++) { positions[j] = prevPosition + tvf.ReadVInt(); prevPosition = positions[j]; } } else { //we need to skip over the positions. Since these are VInts, I don't believe there is anyway to know for sure how far to skip // for (int j = 0; j < freq; j++) { tvf.ReadVInt(); } } } TermVectorOffsetInfo[] offsets = null; if (storeOffsets) { //does the mapper even care about offsets? if (mapper.IsIgnoringOffsets() == false) { offsets = new TermVectorOffsetInfo[freq]; int prevOffset = 0; for (int j = 0; j < freq; j++) { int startOffset = prevOffset + tvf.ReadVInt(); int endOffset = startOffset + tvf.ReadVInt(); offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset); prevOffset = endOffset; } } else { for (int j = 0; j < freq; j++) { tvf.ReadVInt(); tvf.ReadVInt(); } } } mapper.Map(term, freq, offsets, positions); } }