private void WriteTerm(Term term)
{
    int start = StringHelper.StringDifference(lastTerm.text, term.text);
    int length = term.text.Length - start;
    output.WriteVInt(start);                               // write shared prefix length
    output.WriteVInt(length);                              // write delta length
    output.WriteChars(term.text, start, length);           // write delta chars
    output.WriteVInt(fieldInfos.FieldNumber(term.field));  // write field num
    lastTerm = term;
}
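// Illustrative sketch, not part of the original source: a standalone rework of
// the prefix-delta computation in WriteTerm above, so the encoding can be
// checked in isolation. The method name and Console output are hypothetical;
// only the shared-prefix/suffix scheme comes from WriteTerm.
private static void DemoTermDelta()
{
    string lastText = "apple";
    string text = "applied";

    // Equivalent of StringHelper.StringDifference: index of the first differing char.
    int start = 0;
    int limit = System.Math.Min(lastText.Length, text.Length);
    while (start < limit && lastText[start] == text[start])
    {
        start++;
    }

    int length = text.Length - start;
    // WriteTerm would emit: VInt(4), VInt(3), chars "ied", VInt(fieldNumber).
    System.Console.WriteLine("start={0} length={1} suffix={2}", start, length, text.Substring(start));
}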
internal void Add(Term term, TermInfo ti)
{
    int length = term.text.Length;
    if (termTextBuffer.Length < length)
    {
        // Grow by ~25% to amortize reallocations across terms.
        termTextBuffer = new char[(int) (length * 1.25)];
    }

    // Copy the term text into the reused buffer.
    int i = 0;
    System.Collections.Generic.IEnumerator<char> chars = term.text.GetEnumerator();
    while (chars.MoveNext())
    {
        termTextBuffer[i++] = chars.Current;
    }

    Add(fieldInfos.FieldNumber(term.field), termTextBuffer, 0, length, ti);
}
/// <summary> Add a complete document specified by all its term vectors. If document has no
/// term vectors, add value for tvx.
/// </summary>
/// <param name="vectors">
/// </param>
/// <throws> IOException </throws>
public void AddAllDocVectors(TermFreqVector[] vectors)
{
    tvx.WriteLong(tvd.GetFilePointer());
    tvx.WriteLong(tvf.GetFilePointer());

    if (vectors != null)
    {
        int numFields = vectors.Length;
        tvd.WriteVInt(numFields);

        long[] fieldPointers = new long[numFields];

        for (int i = 0; i < numFields; i++)
        {
            fieldPointers[i] = tvf.GetFilePointer();

            int fieldNumber = fieldInfos.FieldNumber(vectors[i].GetField());

            // 1st pass: write field numbers to tvd
            tvd.WriteVInt(fieldNumber);

            int numTerms = vectors[i].Size();
            tvf.WriteVInt(numTerms);

            TermPositionVector tpVector;
            byte bits;
            bool storePositions;
            bool storeOffsets;

            if (vectors[i] is TermPositionVector)
            {
                // May have positions & offsets
                tpVector = (TermPositionVector) vectors[i];
                storePositions = tpVector.Size() > 0 && tpVector.GetTermPositions(0) != null;
                storeOffsets = tpVector.Size() > 0 && tpVector.GetOffsets(0) != null;
                bits = (byte) ((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : (byte) 0)
                             + (storeOffsets ? TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : (byte) 0));
            }
            else
            {
                tpVector = null;
                bits = 0;
                storePositions = false;
                storeOffsets = false;
            }

            tvf.WriteVInt(bits);

            string[] terms = vectors[i].GetTerms();
            int[] freqs = vectors[i].GetTermFrequencies();

            int utf8Upto = 0;
            utf8Results[1].length = 0;

            for (int j = 0; j < numTerms; j++)
            {
                UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].Length, utf8Results[utf8Upto]);

                int start = StringHelper.bytesDifference(
                    utf8Results[1 - utf8Upto].result,
                    utf8Results[1 - utf8Upto].length,
                    utf8Results[utf8Upto].result,
                    utf8Results[utf8Upto].length);
                int length = utf8Results[utf8Upto].length - start;
                tvf.WriteVInt(start);   // write shared prefix length
                tvf.WriteVInt(length);  // write delta length
                tvf.WriteBytes(utf8Results[utf8Upto].result, start, length);  // write delta bytes
                utf8Upto = 1 - utf8Upto;

                int termFreq = freqs[j];
                tvf.WriteVInt(termFreq);

                if (storePositions)
                {
                    int[] positions = tpVector.GetTermPositions(j);
                    if (positions == null)
                    {
                        throw new System.SystemException("Trying to write positions that are null!");
                    }
                    System.Diagnostics.Debug.Assert(positions.Length == termFreq);

                    // use delta encoding for positions
                    int lastPosition = 0;
                    for (int k = 0; k < positions.Length; k++)
                    {
                        int position = positions[k];
                        tvf.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                }

                if (storeOffsets)
                {
                    TermVectorOffsetInfo[] offsets = tpVector.GetOffsets(j);
                    if (offsets == null)
                    {
                        throw new System.SystemException("Trying to write offsets that are null!");
                    }
                    System.Diagnostics.Debug.Assert(offsets.Length == termFreq);

                    // use delta encoding for offsets
                    int lastEndOffset = 0;
                    for (int k = 0; k < offsets.Length; k++)
                    {
                        int startOffset = offsets[k].GetStartOffset();
                        int endOffset = offsets[k].GetEndOffset();
                        tvf.WriteVInt(startOffset - lastEndOffset);
                        tvf.WriteVInt(endOffset - startOffset);
                        lastEndOffset = endOffset;
                    }
                }
            }
        }

        // 2nd pass: write field pointers to tvd
        long lastFieldPointer = fieldPointers[0];
        for (int i = 1; i < numFields; i++)
        {
            long fieldPointer = fieldPointers[i];
            tvd.WriteVLong(fieldPointer - lastFieldPointer);
            lastFieldPointer = fieldPointer;
        }
    }
    else
    {
        tvd.WriteVInt(0);
    }
}
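// Illustrative sketch, not part of the original source: the delta encodings
// AddAllDocVectors applies to positions and offsets, reproduced standalone so
// the emitted VInt values can be inspected. The method name and Console output
// are hypothetical; the encoding scheme itself comes from the loops above.
private static void DemoTermVectorDeltas()
{
    // Positions [3, 7, 12] are written as gaps from the previous position.
    int[] positions = { 3, 7, 12 };
    int lastPosition = 0;
    foreach (int position in positions)
    {
        System.Console.Write((position - lastPosition) + " "); // prints: 3 4 5
        lastPosition = position;
    }
    System.Console.WriteLine();

    // Offsets (0,5) and (6,11) are written as (start - lastEnd, end - start).
    int[,] offsets = { { 0, 5 }, { 6, 11 } };
    int lastEndOffset = 0;
    for (int k = 0; k < offsets.GetLength(0); k++)
    {
        int startOffset = offsets[k, 0];
        int endOffset = offsets[k, 1];
        System.Console.Write((startOffset - lastEndOffset) + "," + (endOffset - startOffset) + " "); // prints: 0,5 1,5
        lastEndOffset = endOffset;
    }
    System.Console.WriteLine();
}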
internal void Add(Term term, TermInfo ti)
{
    // Convert the term text to UTF-8 bytes, then add by field number.
    UnicodeUtil.UTF16toUTF8(term.Text, 0, term.Text.Length, utf8Result);
    Add(fieldInfos.FieldNumber(term.Field), utf8Result.result, utf8Result.length, ti);
}
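// Illustrative sketch, not part of the original source: why this variant
// measures term lengths in UTF-8 bytes rather than UTF-16 chars as WriteTerm
// does. System.Text.Encoding.UTF8 stands in for UnicodeUtil here, and the
// method name is hypothetical.
private static void DemoUtf8Length()
{
    string term = "café";
    byte[] bytes = System.Text.Encoding.UTF8.GetBytes(term);
    // 4 chars but 5 bytes: 'é' encodes as two bytes (0xC3 0xA9), so the
    // prefix/suffix lengths written to disk count bytes, not chars.
    System.Console.WriteLine(term.Length + " chars, " + bytes.Length + " bytes"); // 4 chars, 5 bytes
}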
internal void AddDocument(Document doc)
{
    indexStream.WriteLong(fieldsStream.GetFilePointer());

    int storedCount = 0;
    System.Collections.IEnumerator fieldIterator = doc.GetFields().GetEnumerator();
    while (fieldIterator.MoveNext())
    {
        Fieldable field = (Fieldable) fieldIterator.Current;
        if (field.IsStored())
        {
            storedCount++;
        }
    }
    fieldsStream.WriteVInt(storedCount);

    fieldIterator = doc.GetFields().GetEnumerator();
    while (fieldIterator.MoveNext())
    {
        Fieldable field = (Fieldable) fieldIterator.Current;

        // If the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
        // and field.BinaryValue() already returns the compressed value for a field
        // with IsCompressed()==true, so we disable compression in that case
        bool disableCompression = (field is FieldsReader.FieldForMerge);

        if (field.IsStored())
        {
            fieldsStream.WriteVInt(fieldInfos.FieldNumber(field.Name()));

            byte bits = 0;
            if (field.IsTokenized())
            {
                bits |= FieldsWriter.FIELD_IS_TOKENIZED;
            }
            if (field.IsBinary())
            {
                bits |= FieldsWriter.FIELD_IS_BINARY;
            }
            if (field.IsCompressed())
            {
                bits |= FieldsWriter.FIELD_IS_COMPRESSED;
            }
            fieldsStream.WriteByte(bits);

            if (field.IsCompressed())
            {
                // compression is enabled for the current field
                byte[] data = null;
                if (disableCompression)
                {
                    // optimized case for merging, the data is already compressed
                    data = field.BinaryValue();
                }
                else
                {
                    // check if it is a binary field
                    if (field.IsBinary())
                    {
                        data = Compress(field.BinaryValue());
                    }
                    else
                    {
                        data = Compress(System.Text.Encoding.GetEncoding("UTF-8").GetBytes(field.StringValue()));
                    }
                }
                int len = data.Length;
                fieldsStream.WriteVInt(len);
                fieldsStream.WriteBytes(data, len);
            }
            else
            {
                // compression is disabled for the current field
                if (field.IsBinary())
                {
                    byte[] data = field.BinaryValue();
                    int len = data.Length;
                    fieldsStream.WriteVInt(len);
                    fieldsStream.WriteBytes(data, len);
                }
                else
                {
                    fieldsStream.WriteString(field.StringValue());
                }
            }
        }
    }
}
internal void AddDocument(Document doc)
{
    indexStream.WriteLong(fieldsStream.GetFilePointer());

    int storedCount = 0;
    foreach (Field field in doc.Fields())
    {
        if (field.IsStored())
        {
            storedCount++;
        }
    }
    fieldsStream.WriteVInt(storedCount);

    foreach (Field field in doc.Fields())
    {
        if (field.IsStored())
        {
            fieldsStream.WriteVInt(fieldInfos.FieldNumber(field.Name()));

            byte bits = 0;
            if (field.IsTokenized())
            {
                bits |= FieldsWriter.FIELD_IS_TOKENIZED;
            }
            if (field.IsBinary())
            {
                bits |= FieldsWriter.FIELD_IS_BINARY;
            }
            if (field.IsCompressed())
            {
                bits |= FieldsWriter.FIELD_IS_COMPRESSED;
            }
            fieldsStream.WriteByte(bits);

            if (field.IsCompressed())
            {
                // compression is enabled for the current field
                byte[] data = null;
                // check if it is a binary field
                if (field.IsBinary())
                {
                    data = Compress(field.BinaryValue());
                }
                else
                {
                    data = Compress(System.Text.Encoding.GetEncoding("UTF-8").GetBytes(field.StringValue()));
                }
                int len = data.Length;
                fieldsStream.WriteVInt(len);
                fieldsStream.WriteBytes(data, len);
            }
            else
            {
                // compression is disabled for the current field
                if (field.IsBinary())
                {
                    byte[] data = field.BinaryValue();
                    int len = data.Length;
                    fieldsStream.WriteVInt(len);
                    fieldsStream.WriteBytes(data, len);
                }
                else
                {
                    fieldsStream.WriteString(field.StringValue());
                }
            }
        }
    }
}
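// Illustrative usage sketch, not part of the original source: building a
// Document whose stored fields exercise the branches above. Field.Store and
// Field.Index are the stock Lucene.Net enums; the COMPRESS value assumes the
// 2.x API, in which compressed stored fields were still supported.
private static Document MakeDemoDoc()
{
    Document doc = new Document();
    // Stored and tokenized: written via WriteString in the uncompressed branch.
    doc.Add(new Field("title", "Hello world", Field.Store.YES, Field.Index.TOKENIZED));
    // Stored compressed: written via the Compress(...) branch as length-prefixed bytes.
    doc.Add(new Field("body", "A longer body worth compressing...", Field.Store.COMPRESS, Field.Index.TOKENIZED));
    return doc;
}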
/// <summary> Retrieve the term vector for the given document and field</summary>
/// <param name="docNum">The document number to retrieve the vector for
/// </param>
/// <param name="field">The field within the document to retrieve
/// </param>
/// <returns> The TermFreqVector for the document and field or null if there is no termVector for this field.
/// </returns>
/// <throws> IOException if there is an error reading the term vector files </throws>
public /*internal*/ virtual TermFreqVector Get(int docNum, System.String field)
{
    // Check if no term vectors are available for this segment at all
    int fieldNumber = fieldInfos.FieldNumber(field);
    TermFreqVector result = null;
    if (tvx != null)
    {
        // We need to account for the FORMAT_SIZE when seeking in the tvx.
        // We don't need to do this in other seeks because we already have the
        // file pointer that was written in another file.
        tvx.Seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
        //System.out.println("TVX Pointer: " + tvx.getFilePointer());
        long position = tvx.ReadLong();

        tvd.Seek(position);
        int fieldCount = tvd.ReadVInt();
        //System.out.println("Num Fields: " + fieldCount);

        // There are only a few fields per document. We opt for a full scan
        // rather than requiring that they be ordered. We need to read through
        // all of the fields anyway to get to the tvf pointers.
        int number = 0;
        int found = -1;
        for (int i = 0; i < fieldCount; i++)
        {
            if (tvdFormat == TermVectorsWriter.FORMAT_VERSION)
            {
                number = tvd.ReadVInt();
            }
            else
            {
                number += tvd.ReadVInt();
            }
            if (number == fieldNumber)
            {
                found = i;
            }
        }

        if (found != -1)
        {
            // Compute position in the tvf file by summing the delta-encoded field pointers
            position = 0;
            for (int i = 0; i <= found; i++)
            {
                position += tvd.ReadVLong();
            }
            result = ReadTermVector(field, position);
        }
        else
        {
            // This field, although valid in the segment, was not found in this document
            //System.out.println("Field not found");
        }
    }
    else
    {
        //System.out.println("No tvx file");
    }
    return result;
}
public virtual void Get(int docNum, System.String field, TermVectorMapper mapper)
{
    if (tvx != null)
    {
        int fieldNumber = fieldInfos.FieldNumber(field);

        // We need to account for the FORMAT_SIZE when seeking in the tvx.
        // We don't need to do this in other seeks because we already have the
        // file pointer that was written in another file.
        SeekTvx(docNum);
        //System.out.println("TVX Pointer: " + tvx.getFilePointer());
        long tvdPosition = tvx.ReadLong();

        tvd.Seek(tvdPosition);
        int fieldCount = tvd.ReadVInt();
        //System.out.println("Num Fields: " + fieldCount);

        // There are only a few fields per document. We opt for a full scan
        // rather than requiring that they be ordered. We need to read through
        // all of the fields anyway to get to the tvf pointers.
        int number = 0;
        int found = -1;
        for (int i = 0; i < fieldCount; i++)
        {
            if (format >= FORMAT_VERSION)
            {
                number = tvd.ReadVInt();
            }
            else
            {
                number += tvd.ReadVInt();
            }
            if (number == fieldNumber)
            {
                found = i;
            }
        }

        if (found != -1)
        {
            // Compute position in the tvf file
            long position;
            if (format >= FORMAT_VERSION2)
            {
                position = tvx.ReadLong();
            }
            else
            {
                position = tvd.ReadVLong();
            }
            for (int i = 1; i <= found; i++)
            {
                position += tvd.ReadVLong();
            }

            mapper.SetDocumentNumber(docNum);
            ReadTermVector(field, position, mapper);
        }
        else
        {
            // This field, although valid in the segment, was not found in this document
            //System.out.println("Fieldable not found");
        }
    }
    else
    {
        //System.out.println("No tvx file");
    }
}
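// Illustrative usage sketch, not part of the original source: how callers
// typically reach the two Get overloads above, via IndexReader's term vector
// accessors in the 2.x API. The index path and field name are hypothetical.
private static void DemoReadVectors()
{
    IndexReader reader = IndexReader.Open("/path/to/index");
    try
    {
        // Returns a materialized vector, or null if the field stored no
        // term vector for this document.
        TermFreqVector vector = reader.GetTermFreqVector(0, "body");
        if (vector != null)
        {
            System.Console.WriteLine(vector.Size() + " unique terms");
        }
    }
    finally
    {
        reader.Close();
    }
}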
// Tokenizes the fields of a document into Postings.
private void InvertDocument(Document doc)
{
    foreach (Field field in doc.Fields())
    {
        System.String fieldName = field.Name();
        int fieldNumber = fieldInfos.FieldNumber(fieldName);

        int length = fieldLengths[fieldNumber];      // length of field
        int position = fieldPositions[fieldNumber];  // position in field
        if (length > 0)
        {
            position += analyzer.GetPositionIncrementGap(fieldName);
        }
        int offset = fieldOffsets[fieldNumber];      // offset in field

        if (field.IsIndexed())
        {
            if (!field.IsTokenized())
            {
                // un-tokenized field
                System.String stringValue = field.StringValue();
                if (field.IsStoreOffsetWithTermVector())
                {
                    AddPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.Length));
                }
                else
                {
                    AddPosition(fieldName, stringValue, position++, null);
                }
                offset += stringValue.Length;
                length++;
            }
            else
            {
                System.IO.TextReader reader; // find or make Reader
                if (field.ReaderValue() != null)
                {
                    reader = field.ReaderValue();
                }
                else if (field.StringValue() != null)
                {
                    reader = new System.IO.StringReader(field.StringValue());
                }
                else
                {
                    throw new System.ArgumentException("field must have either String or Reader value");
                }

                // Tokenize field and add to postingTable
                TokenStream stream = analyzer.TokenStream(fieldName, reader);
                try
                {
                    Token lastToken = null;
                    for (Token t = stream.Next(); t != null; t = stream.Next())
                    {
                        position += (t.GetPositionIncrement() - 1);

                        if (field.IsStoreOffsetWithTermVector())
                        {
                            AddPosition(fieldName, t.TermText(), position++, new TermVectorOffsetInfo(offset + t.StartOffset(), offset + t.EndOffset()));
                        }
                        else
                        {
                            AddPosition(fieldName, t.TermText(), position++, null);
                        }

                        lastToken = t;
                        if (++length > maxFieldLength)
                        {
                            if (infoStream != null)
                            {
                                infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
                            }
                            break;
                        }
                    }

                    if (lastToken != null)
                    {
                        offset += lastToken.EndOffset() + 1;
                    }
                }
                finally
                {
                    stream.Close();
                }
            }

            fieldLengths[fieldNumber] = length;      // save field length
            fieldPositions[fieldNumber] = position;  // save field position
            fieldBoosts[fieldNumber] *= field.GetBoost();
            fieldOffsets[fieldNumber] = offset;
        }
    }
}
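// Illustrative sketch, not part of the original source: the effect of the
// GetPositionIncrementGap call above when a document contains two Fields with
// the same name. The numbers assume a hypothetical gap of 100 and a position
// increment of 1 per token.
//
//   doc.Add(new Field("body", "red apple", ...));   // "red" at 0, "apple" at 1
//   doc.Add(new Field("body", "green pear", ...));  // "green" at 102, "pear" at 103 (2 + gap of 100)
//
// The gap keeps phrase and proximity queries from matching across the
// boundary between the two field instances.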