private void ReadTermVectors(System.String[] fields, long[] tvfPointers, TermVectorMapper mapper) { for (int i = 0; i < fields.Length; i++) { ReadTermVector(fields[i], tvfPointers[i], mapper); } }
public override void GetTermFreqVector(int docNumber, TermVectorMapper mapper) { EnsureOpen(); int i = ReaderIndex(docNumber); // find segment num subReaders[i].GetTermFreqVector(docNumber - starts[i], mapper); }
public virtual void Get(int docNumber, TermVectorMapper mapper) { // Check if no term vectors are available for this segment at all if (tvx != null) { //We need to offset by SeekTvx(docNumber); long tvdPosition = tvx.ReadLong(); tvd.Seek(tvdPosition); int fieldCount = tvd.ReadVInt(); // No fields are vectorized for this document if (fieldCount != 0) { System.String[] fields = ReadFields(fieldCount); long[] tvfPointers = ReadTvfPointers(fieldCount); mapper.SetDocumentNumber(docNumber); ReadTermVectors(fields, tvfPointers, mapper); } } else { //System.out.println("No tvx file"); } }
public override void GetTermFreqVector(int docNumber, System.String field, TermVectorMapper mapper) { EnsureOpen(); IndexReader reader = ((IndexReader)fieldToReader[field]); if (reader != null) { reader.GetTermFreqVector(docNumber, field, mapper); } }
public override void GetTermFreqVector(int docNumber, TermVectorMapper mapper) { EnsureOpen(); System.Collections.IEnumerator i = new System.Collections.Hashtable(fieldToReader).GetEnumerator(); while (i.MoveNext()) { System.Collections.DictionaryEntry e = (System.Collections.DictionaryEntry)i.Current; System.String field = (System.String)e.Key; IndexReader reader = (IndexReader)e.Value; reader.GetTermFreqVector(docNumber, field, mapper); } }
/// <summary> </summary> /// <param name="field">The field to read in /// </param> /// <param name="tvfPointer">The pointer within the tvf file where we should start reading /// </param> /// <param name="mapper">The mapper used to map the TermVector /// </param> /// <throws> IOException </throws> private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper) { // Now read the data from specified position //We don't need to offset by the FORMAT here since the pointer already includes the offset tvf.Seek(tvfPointer); int numTerms = tvf.ReadVInt(); //System.out.println("Num Terms: " + numTerms); // If no terms - return a constant empty termvector. However, this should never occur! if (numTerms == 0) return ; bool storePositions; bool storeOffsets; if (format >= FORMAT_VERSION) { byte bits = tvf.ReadByte(); storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; } else { tvf.ReadVInt(); storePositions = false; storeOffsets = false; } mapper.SetExpectations(field, numTerms, storeOffsets, storePositions); int start = 0; int deltaLength = 0; int totalLength = 0; byte[] byteBuffer; char[] charBuffer; bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES; // init the buffers if (preUTF8) { charBuffer = new char[10]; byteBuffer = null; } else { charBuffer = null; byteBuffer = new byte[20]; } for (int i = 0; i < numTerms; i++) { start = tvf.ReadVInt(); deltaLength = tvf.ReadVInt(); totalLength = start + deltaLength; System.String term; if (preUTF8) { // Term stored as java chars if (charBuffer.Length < totalLength) { char[] newCharBuffer = new char[(int) (1.5 * totalLength)]; Array.Copy(charBuffer, 0, newCharBuffer, 0, start); charBuffer = newCharBuffer; } tvf.ReadChars(charBuffer, start, deltaLength); term = new System.String(charBuffer, 0, totalLength); } else { // Term stored as utf8 bytes if (byteBuffer.Length < totalLength) { byte[] newByteBuffer = new byte[(int) (1.5 * totalLength)]; Array.Copy(byteBuffer, 0, newByteBuffer, 0, start); byteBuffer = newByteBuffer; } tvf.ReadBytes(byteBuffer, start, deltaLength); term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength); } int freq = tvf.ReadVInt(); int[] positions = null; if (storePositions) { //read in the positions //does the mapper even care about positions? if (mapper.IsIgnoringPositions() == false) { positions = new int[freq]; int prevPosition = 0; for (int j = 0; j < freq; j++) { positions[j] = prevPosition + tvf.ReadVInt(); prevPosition = positions[j]; } } else { //we need to skip over the positions. Since these are VInts, I don't believe there is anyway to know for sure how far to skip // for (int j = 0; j < freq; j++) { tvf.ReadVInt(); } } } TermVectorOffsetInfo[] offsets = null; if (storeOffsets) { //does the mapper even care about offsets? if (mapper.IsIgnoringOffsets() == false) { offsets = new TermVectorOffsetInfo[freq]; int prevOffset = 0; for (int j = 0; j < freq; j++) { int startOffset = prevOffset + tvf.ReadVInt(); int endOffset = startOffset + tvf.ReadVInt(); offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset); prevOffset = endOffset; } } else { for (int j = 0; j < freq; j++) { tvf.ReadVInt(); tvf.ReadVInt(); } } } mapper.Map(term, freq, offsets, positions); } }
public virtual void Get(int docNum, System.String field, TermVectorMapper mapper) { if (tvx != null) { int fieldNumber = fieldInfos.FieldNumber(field); //We need to account for the FORMAT_SIZE at when seeking in the tvx //We don't need to do this in other seeks because we already have the // file pointer //that was written in another file SeekTvx(docNum); //System.out.println("TVX Pointer: " + tvx.getFilePointer()); long tvdPosition = tvx.ReadLong(); tvd.Seek(tvdPosition); int fieldCount = tvd.ReadVInt(); //System.out.println("Num Fields: " + fieldCount); // There are only a few fields per document. We opt for a full scan // rather then requiring that they be ordered. We need to read through // all of the fields anyway to get to the tvf pointers. int number = 0; int found = -1; for (int i = 0; i < fieldCount; i++) { if (format >= FORMAT_VERSION) { number = tvd.ReadVInt(); } else { number += tvd.ReadVInt(); } if (number == fieldNumber) { found = i; } } // This field, although valid in the segment, was not found in this // document if (found != -1) { // Compute position in the tvf file long position; if (format >= FORMAT_VERSION2) { position = tvx.ReadLong(); } else { position = tvd.ReadVLong(); } for (int i = 1; i <= found; i++) { position += tvd.ReadVLong(); } mapper.SetDocumentNumber(docNum); ReadTermVector(field, position, mapper); } else { //System.out.println("Fieldable not found"); } } else { //System.out.println("No tvx file"); } }
public override void GetTermFreqVector(int docNumber, TermVectorMapper mapper) { EnsureOpen(); System.Collections.IEnumerator i = new System.Collections.Hashtable(fieldToReader).GetEnumerator(); while (i.MoveNext()) { System.Collections.DictionaryEntry e = (System.Collections.DictionaryEntry) i.Current; System.String field = (System.String) e.Key; IndexReader reader = (IndexReader) e.Value; reader.GetTermFreqVector(docNumber, field, mapper); } }
public override void GetTermFreqVector(int docNumber, System.String field, TermVectorMapper mapper) { EnsureOpen(); IndexReader reader = ((IndexReader) fieldToReader[field]); if (reader != null) { reader.GetTermFreqVector(docNumber, field, mapper); } }
public override void GetTermFreqVector(int docNumber, TermVectorMapper mapper) { EnsureOpen(); TermVectorsReader termVectorsReader = GetTermVectorsReader(); if (termVectorsReader == null) return ; termVectorsReader.Get(docNumber, mapper); }
public override void GetTermFreqVector(int docNumber, System.String field, TermVectorMapper mapper) { EnsureOpen(); FieldInfo fi = core.fieldInfos.FieldInfo(field); if (fi == null || !fi.storeTermVector) return ; TermVectorsReader termVectorsReader = GetTermVectorsReader(); if (termVectorsReader == null) { return ; } termVectorsReader.Get(docNumber, field, mapper); }
public override void GetTermFreqVector(int docNumber, TermVectorMapper mapper) { EnsureOpen(); in_Renamed.GetTermFreqVector(docNumber, mapper); }
public override void GetTermFreqVector(int docNumber, System.String field, TermVectorMapper mapper) { EnsureOpen(); in_Renamed.GetTermFreqVector(docNumber, field, mapper); }
public virtual void Get(int docNum, System.String field, TermVectorMapper mapper) { if (tvx != null) { int fieldNumber = fieldInfos.FieldNumber(field); //We need to account for the FORMAT_SIZE at when seeking in the tvx //We don't need to do this in other seeks because we already have the // file pointer //that was written in another file SeekTvx(docNum); //System.out.println("TVX Pointer: " + tvx.getFilePointer()); long tvdPosition = tvx.ReadLong(); tvd.Seek(tvdPosition); int fieldCount = tvd.ReadVInt(); //System.out.println("Num Fields: " + fieldCount); // There are only a few fields per document. We opt for a full scan // rather then requiring that they be ordered. We need to read through // all of the fields anyway to get to the tvf pointers. int number = 0; int found = - 1; for (int i = 0; i < fieldCount; i++) { if (format >= FORMAT_VERSION) number = tvd.ReadVInt(); else number += tvd.ReadVInt(); if (number == fieldNumber) found = i; } // This field, although valid in the segment, was not found in this // document if (found != - 1) { // Compute position in the tvf file long position; if (format >= FORMAT_VERSION2) position = tvx.ReadLong(); else position = tvd.ReadVLong(); for (int i = 1; i <= found; i++) position += tvd.ReadVLong(); mapper.SetDocumentNumber(docNum); ReadTermVector(field, position, mapper); } else { //System.out.println("Fieldable not found"); } } else { //System.out.println("No tvx file"); } }
/// <summary> Map all the term vectors for all fields in a Document</summary> /// <param name="docNumber">The number of the document to load the vector for /// </param> /// <param name="mapper">The {@link TermVectorMapper} to process the vector. Must not be null /// </param> /// <throws> IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified. </throws> abstract public void GetTermFreqVector(int docNumber, TermVectorMapper mapper);
/// <summary> Load the Term Vector into a user-defined data structure instead of relying on the parallel arrays of /// the {@link TermFreqVector}. /// </summary> /// <param name="docNumber">The number of the document to load the vector for /// </param> /// <param name="field">The name of the field to load /// </param> /// <param name="mapper">The {@link TermVectorMapper} to process the vector. Must not be null /// </param> /// <throws> IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified. </throws> /// <summary> /// </summary> abstract public void GetTermFreqVector(int docNumber, System.String field, TermVectorMapper mapper);
/// <summary> </summary> /// <param name="field">The field to read in /// </param> /// <param name="tvfPointer">The pointer within the tvf file where we should start reading /// </param> /// <param name="mapper">The mapper used to map the TermVector /// </param> /// <throws> IOException </throws> private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper) { // Now read the data from specified position //We don't need to offset by the FORMAT here since the pointer already includes the offset tvf.Seek(tvfPointer); int numTerms = tvf.ReadVInt(); //System.out.println("Num Terms: " + numTerms); // If no terms - return a constant empty termvector. However, this should never occur! if (numTerms == 0) { return; } bool storePositions; bool storeOffsets; if (format >= FORMAT_VERSION) { byte bits = tvf.ReadByte(); storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; } else { tvf.ReadVInt(); storePositions = false; storeOffsets = false; } mapper.SetExpectations(field, numTerms, storeOffsets, storePositions); int start = 0; int deltaLength = 0; int totalLength = 0; byte[] byteBuffer; char[] charBuffer; bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES; // init the buffers if (preUTF8) { charBuffer = new char[10]; byteBuffer = null; } else { charBuffer = null; byteBuffer = new byte[20]; } for (int i = 0; i < numTerms; i++) { start = tvf.ReadVInt(); deltaLength = tvf.ReadVInt(); totalLength = start + deltaLength; System.String term; if (preUTF8) { // Term stored as java chars if (charBuffer.Length < totalLength) { char[] newCharBuffer = new char[(int)(1.5 * totalLength)]; Array.Copy(charBuffer, 0, newCharBuffer, 0, start); charBuffer = newCharBuffer; } tvf.ReadChars(charBuffer, start, deltaLength); term = new System.String(charBuffer, 0, totalLength); } else { // Term stored as utf8 bytes if (byteBuffer.Length < totalLength) { byte[] newByteBuffer = new byte[(int)(1.5 * totalLength)]; Array.Copy(byteBuffer, 0, newByteBuffer, 0, start); byteBuffer = newByteBuffer; } tvf.ReadBytes(byteBuffer, start, deltaLength); term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength); } int freq = tvf.ReadVInt(); int[] positions = null; if (storePositions) { //read in the positions //does the mapper even care about positions? if (mapper.IsIgnoringPositions() == false) { positions = new int[freq]; int prevPosition = 0; for (int j = 0; j < freq; j++) { positions[j] = prevPosition + tvf.ReadVInt(); prevPosition = positions[j]; } } else { //we need to skip over the positions. Since these are VInts, I don't believe there is anyway to know for sure how far to skip // for (int j = 0; j < freq; j++) { tvf.ReadVInt(); } } } TermVectorOffsetInfo[] offsets = null; if (storeOffsets) { //does the mapper even care about offsets? if (mapper.IsIgnoringOffsets() == false) { offsets = new TermVectorOffsetInfo[freq]; int prevOffset = 0; for (int j = 0; j < freq; j++) { int startOffset = prevOffset + tvf.ReadVInt(); int endOffset = startOffset + tvf.ReadVInt(); offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset); prevOffset = endOffset; } } else { for (int j = 0; j < freq; j++) { tvf.ReadVInt(); tvf.ReadVInt(); } } } mapper.Map(term, freq, offsets, positions); } }