internal override void addTerm(Token t, RawPostingList p0) { System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start")); TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList)p0; p.freq++; if (doVectorOffsets) { int startOffset = fieldState.offset + t.StartOffset(); int endOffset = fieldState.offset + t.EndOffset(); termsHashPerField.writeVInt(1, startOffset - p.lastOffset); termsHashPerField.writeVInt(1, endOffset - startOffset); p.lastOffset = endOffset; } if (doVectorPositions) { termsHashPerField.writeVInt(0, fieldState.position - p.lastPosition); p.lastPosition = fieldState.position; } }
internal override void NewTerm(RawPostingList p0) { System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.newTerm start")); TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList)p0; p.freq = 1; if (doVectorOffsets) { int startOffset = fieldState.offset + offsetAttribute.StartOffset();; int endOffset = fieldState.offset + offsetAttribute.EndOffset(); termsHashPerField.WriteVInt(1, startOffset); termsHashPerField.WriteVInt(1, endOffset - startOffset); p.lastOffset = endOffset; } if (doVectorPositions) { termsHashPerField.WriteVInt(0, fieldState.position); p.lastPosition = fieldState.position; } }
/// <summary>Called once per field per document if term vectors /// are enabled, to write the vectors to /// RAMOutputStream, which is then quickly flushed to /// * the real term vectors files in the Directory. /// </summary> internal override void Finish() { System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start")); int numPostings = termsHashPerField.numPostings; System.Diagnostics.Debug.Assert(numPostings >= 0); if (!doVectors || numPostings == 0) { return; } if (numPostings > maxNumPostings) { maxNumPostings = numPostings; } IndexOutput tvf = perThread.doc.tvf; // This is called once, after inverting all occurences // of a given field in the doc. At this point we flush // our hash into the DocWriter. System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector); System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo)); perThread.doc.AddField(termsHashPerField.fieldInfo.number); RawPostingList[] postings = termsHashPerField.SortPostings(); tvf.WriteVInt(numPostings); byte bits = (byte)(0x0); if (doVectorPositions) { bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR; } if (doVectorOffsets) { bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR; } tvf.WriteByte(bits); int encoderUpto = 0; int lastTermBytesCount = 0; ByteSliceReader reader = perThread.vectorSliceReader; char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers; for (int j = 0; j < numPostings; j++) { TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList)postings[j]; int freq = posting.freq; char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK; // We swap between two encoders to save copying // last Term's byte array UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto]; // TODO: we could do this incrementally UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result); int termBytesCount = utf8Result.length; // TODO: UTF16toUTF8 could tell us this prefix // Compute common prefix between last term and // this term int prefix = 0; if (j > 0) { byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result; byte[] termBytes = perThread.utf8Results[encoderUpto].result; while (prefix < lastTermBytesCount && prefix < termBytesCount) { if (lastTermBytes[prefix] != termBytes[prefix]) { break; } prefix++; } } encoderUpto = 1 - encoderUpto; lastTermBytesCount = termBytesCount; int suffix = termBytesCount - prefix; tvf.WriteVInt(prefix); tvf.WriteVInt(suffix); tvf.WriteBytes(utf8Result.result, prefix, suffix); tvf.WriteVInt(freq); if (doVectorPositions) { termsHashPerField.InitReader(reader, posting, 0); reader.WriteTo(tvf); } if (doVectorOffsets) { termsHashPerField.InitReader(reader, posting, 1); reader.WriteTo(tvf); } } termsHashPerField.Reset(); perThread.termsHashPerThread.Reset(false); }