// Tokenizes the fields of a document into Postings.
private void InvertDocument(Document doc)
{
    foreach (Field field in doc.Fields())
    {
        System.String fieldName = field.Name();
        int fieldNumber = fieldInfos.FieldNumber(fieldName);

        int length = fieldLengths[fieldNumber]; // length of Field
        int position = fieldPositions[fieldNumber]; // position in Field

        if (field.IsIndexed())
        {
            if (!field.IsTokenized())
            {
                // un-tokenized Field
                AddPosition(fieldName, field.StringValue(), position++);
                length++;
            }
            else
            {
                System.IO.TextReader reader; // find or make Reader
                if (field.ReaderValue() != null)
                {
                    reader = field.ReaderValue();
                }
                else if (field.StringValue() != null)
                {
                    reader = new System.IO.StringReader(field.StringValue());
                }
                else
                {
                    throw new System.ArgumentException("Field must have either String or Reader value");
                }

                // Tokenize Field and add to postingTable
                TokenStream stream = analyzer.TokenStream(fieldName, reader);
                try
                {
                    for (Token t = stream.Next(); t != null; t = stream.Next())
                    {
                        position += (t.GetPositionIncrement() - 1);
                        AddPosition(fieldName, t.TermText(), position++);
                        if (++length > maxFieldLength)
                            break;
                    }
                }
                finally
                {
                    stream.Close();
                }
            }

            fieldLengths[fieldNumber] = length; // save Field length
            fieldPositions[fieldNumber] = position; // save Field position
            fieldBoosts[fieldNumber] *= field.GetBoost();
        }
    }
}
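// A minimal standalone sketch (assumed helper, not part of DocumentWriter) of the
// position arithmetic used above. The analyzer, field name, and text here are
// arbitrary; the point is that a token whose position increment is greater than
// one (e.g. a token that follows a removed stop word) leaves a gap in the recorded
// positions, exactly as InvertDocument does before each call to AddPosition.
private static void PrintTokenPositions(Analyzer analyzer, System.String text)
{
    TokenStream stream = analyzer.TokenStream("body", new System.IO.StringReader(text));
    int position = 0;
    try
    {
        for (Token t = stream.Next(); t != null; t = stream.Next())
        {
            position += (t.GetPositionIncrement() - 1); // skip slots for removed tokens
            System.Console.Out.WriteLine(t.TermText() + " at position " + position);
            position++; // advance past this token, as InvertDocument does
        }
    }
    finally
    {
        stream.Close();
    }
}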
internal void AddDocument(Document doc)
{
    // Record where this document's stored fields begin in the fields data file.
    indexStream.WriteLong(fieldsStream.GetFilePointer());

    // Count the stored fields first, so the count can be written ahead of the data.
    int storedCount = 0;
    foreach (Field field in doc.Fields())
    {
        if (field.IsStored())
            storedCount++;
    }
    fieldsStream.WriteVInt(storedCount);

    foreach (Field field in doc.Fields())
    {
        if (field.IsStored())
        {
            fieldsStream.WriteVInt(fieldInfos.FieldNumber(field.Name()));

            byte bits = 0;
            if (field.IsTokenized())
                bits |= 1; // low-order bit records whether the Field was tokenized
            fieldsStream.WriteByte(bits);

            fieldsStream.WriteString(field.StringValue());
        }
    }
}
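// A rough sketch of the inverse of AddDocument, assuming the same layout: a VInt
// count of stored fields, then per field a VInt field number, a bits byte, and the
// string value. This is illustrative only: the real reading side lives in
// FieldsReader, ReadStoredFields is a made-up name, and the IndexInput type (with
// ReadVInt/ReadByte/ReadString mirroring the writer calls above) may be named
// differently depending on the Lucene.NET version.
private static System.Collections.Hashtable ReadStoredFields(IndexInput fieldsStream, FieldInfos fieldInfos)
{
    System.Collections.Hashtable values = new System.Collections.Hashtable();
    int storedCount = fieldsStream.ReadVInt(); // number of stored fields that follow
    for (int i = 0; i < storedCount; i++)
    {
        int fieldNumber = fieldsStream.ReadVInt();
        byte bits = fieldsStream.ReadByte(); // bit 1 = field was tokenized
        System.String value = fieldsStream.ReadString();
        values[fieldInfos.FieldName(fieldNumber)] = value;
    }
    return values;
}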
private void WriteTerm(Term term)
{
    int start = StringHelper.StringDifference(lastTerm.text, term.text);
    int length = term.text.Length - start;

    output.WriteVInt(start); // write shared prefix length
    output.WriteVInt(length); // write delta length
    output.WriteChars(term.text, start, length); // write delta chars

    output.WriteVInt(fieldInfos.FieldNumber(term.field)); // write Field num

    lastTerm = term;
}
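// To make the prefix-delta encoding concrete: if the previous term was "apple" and
// the next is "apply", the shared prefix "appl" has length 4, so WriteTerm emits
// start=4, length=1, and the single char "y". A small self-contained sketch of that
// computation follows (ShowPrefixDelta is a hypothetical name, and the shared-prefix
// loop re-derives what StringHelper.StringDifference computes):
private static void ShowPrefixDelta(System.String last, System.String current)
{
    int start = 0;
    int limit = System.Math.Min(last.Length, current.Length);
    while (start < limit && last[start] == current[start])
        start++; // count the shared prefix chars

    int length = current.Length - start;
    System.Console.Out.WriteLine("prefix=" + start + " delta=\"" + current.Substring(start) + "\" (" + length + " chars)");
}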
/// <summary> Retrieve the term vector for the given document and Field</summary> /// <param name="docNum">The document number to retrieve the vector for /// </param> /// <param name="Field">The Field within the document to retrieve /// </param> /// <returns> The TermFreqVector for the document and Field or null /// </returns> public /*internal*/ virtual TermFreqVector Get(int docNum, System.String field) { lock (this) { // Check if no term vectors are available for this segment at all int fieldNumber = fieldInfos.FieldNumber(field); TermFreqVector result = null; if (tvx != null) { try { //We need to account for the FORMAT_SIZE at when seeking in the tvx //We don't need to do this in other seeks because we already have the file pointer //that was written in another file tvx.Seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE); //System.out.println("TVX Pointer: " + tvx.getFilePointer()); long position = tvx.ReadLong(); tvd.Seek(position); int fieldCount = tvd.ReadVInt(); //System.out.println("Num Fields: " + fieldCount); // There are only a few fields per document. We opt for a full scan // rather then requiring that they be ordered. We need to read through // all of the fields anyway to get to the tvf pointers. int number = 0; int found = -1; for (int i = 0; i < fieldCount; i++) { number += tvd.ReadVInt(); if (number == fieldNumber) { found = i; } } // This Field, although valid in the segment, was not found in this document if (found != -1) { // Compute position in the tvf file position = 0; for (int i = 0; i <= found; i++) { position += tvd.ReadVLong(); } result = ReadTermVector(field, position); } else { //System.out.println("Field not found"); } } catch (System.Exception e) { //System.Console.Out.WriteLine(e.StackTrace); } } else { System.Console.Out.WriteLine("No tvx file"); } return(result); } }