/// <summary> Just like {@link #getPositionIncrementGap}, except for
/// Token offsets instead.  By default this returns 1 for tokenized
/// fields, as if the fields were joined with an extra space character,
/// and 0 for un-tokenized fields.  This method is only called if the
/// field produced at least one token for indexing.
/// </summary>
/// <param name="field">the field just indexed</param>
/// <returns> offset gap, added to the next token emitted from {@link #TokenStream(String,Reader)}</returns>
public virtual int GetOffsetGap(Fieldable field)
{
    if (field.IsTokenized())
    {
        return 1;
    }
    else
    {
        return 0;
    }
}
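// Usage sketch (not part of the original source): GetOffsetGap only matters when a document
// contains several Fieldable instances sharing the same field name, since it decides how far
// apart their token offsets land. A minimal example of overriding it together with
// GetPositionIncrementGap, assuming the Lucene.NET 2.9-era Analyzer API; the GapAnalyzer name
// and the wrapped WhitespaceAnalyzer are illustrative choices, not taken from the code above.
using Lucene.Net.Analysis;
using Lucene.Net.Documents;

public class GapAnalyzer : Analyzer
{
    private readonly Analyzer inner = new WhitespaceAnalyzer();

    public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
    {
        // Delegate actual tokenization to the wrapped analyzer.
        return inner.TokenStream(fieldName, reader);
    }

    // Leave a hole of 10 positions between repeated instances of a field,
    // so phrase queries cannot match across the boundary.
    public override int GetPositionIncrementGap(string fieldName)
    {
        return 10;
    }

    // Treat repeated tokenized instances as if they were joined by 100
    // characters instead of the default single space.
    public override int GetOffsetGap(Fieldable field)
    {
        return field.IsTokenized() ? 100 : 0;
    }
}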
internal void WriteField(FieldInfo fi, Fieldable field)
{
    // If the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
    // and field.binaryValue() already returns the compressed value for a field
    // with isCompressed()==true, so we disable compression in that case.
    bool disableCompression = (field is FieldsReader.FieldForMerge);
    fieldsStream.WriteVInt(fi.number);
    byte bits = 0;
    if (field.IsTokenized())
    {
        bits |= FieldsWriter.FIELD_IS_TOKENIZED;
    }
    if (field.IsBinary())
    {
        bits |= FieldsWriter.FIELD_IS_BINARY;
    }
    if (field.IsCompressed())
    {
        bits |= FieldsWriter.FIELD_IS_COMPRESSED;
    }
    fieldsStream.WriteByte(bits);

    if (field.IsCompressed())
    {
        // Compression is enabled for the current field.
        byte[] data;
        int len;
        int offset;
        if (disableCompression)
        {
            // Optimized case for merging: the data is already compressed.
            data = field.GetBinaryValue();
            System.Diagnostics.Debug.Assert(data != null);
            len = field.GetBinaryLength();
            offset = field.GetBinaryOffset();
        }
        else
        {
            // Check if it is a binary field.
            if (field.IsBinary())
            {
                data = CompressionTools.Compress(field.GetBinaryValue(), field.GetBinaryOffset(), field.GetBinaryLength());
            }
            else
            {
                byte[] x = System.Text.Encoding.GetEncoding("UTF-8").GetBytes(field.StringValue());
                data = CompressionTools.Compress(x, 0, x.Length);
            }
            len = data.Length;
            offset = 0;
        }
        fieldsStream.WriteVInt(len);
        fieldsStream.WriteBytes(data, offset, len);
    }
    else
    {
        // Compression is disabled for the current field.
        if (field.IsBinary())
        {
            byte[] data = field.GetBinaryValue();
            int len = field.GetBinaryLength();
            int offset = field.GetBinaryOffset();
            fieldsStream.WriteVInt(len);
            fieldsStream.WriteBytes(data, offset, len);
        }
        else
        {
            fieldsStream.WriteString(field.StringValue());
        }
    }
}
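// Illustration (not part of the original source): WriteField packs three booleans into a
// single flag byte that precedes each stored field value. A minimal stand-alone sketch of
// decoding such a byte; the FieldFlagsDemo name is hypothetical, and the constant values
// are assumed to mirror the FieldsWriter.FIELD_IS_* constants of the 2.9-era stored-fields
// format rather than taken from the code above.
internal static class FieldFlagsDemo
{
    // Assumed flag values; in the real FieldsWriter these are its FIELD_IS_* constants.
    private const byte FIELD_IS_TOKENIZED = 0x1;
    private const byte FIELD_IS_BINARY = 0x2;
    private const byte FIELD_IS_COMPRESSED = 0x4;

    public static void Describe(byte bits)
    {
        // Each property of the stored field is tested independently, exactly as
        // WriteField ORs them together independently.
        bool tokenized = (bits & FIELD_IS_TOKENIZED) != 0;
        bool binary = (bits & FIELD_IS_BINARY) != 0;
        bool compressed = (bits & FIELD_IS_COMPRESSED) != 0;
        System.Console.WriteLine("tokenized={0} binary={1} compressed={2}", tokenized, binary, compressed);
    }
}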
public override void ProcessFields(Fieldable[] fields, int count)
{
    fieldState.Reset(docState.doc.GetBoost());

    int maxFieldLength = docState.maxFieldLength;
    bool doInvert = consumer.Start(fields, count);

    for (int i = 0; i < count; i++)
    {
        Fieldable field = fields[i];

        // TODO FI: this should be "genericized" to querying
        // consumer if it wants to see this particular field
        // tokenized.
        if (field.IsIndexed() && doInvert)
        {
            bool anyToken;

            if (fieldState.length > 0)
            {
                fieldState.position += docState.analyzer.GetPositionIncrementGap(fieldInfo.name);
            }

            if (!field.IsTokenized())
            {
                // un-tokenized field
                System.String stringValue = field.StringValue();
                int valueLength = stringValue.Length;
                perThread.singleTokenTokenStream.Reinit(stringValue, 0, valueLength);
                fieldState.attributeSource = perThread.singleTokenTokenStream;
                consumer.Start(field);

                bool success = false;
                try
                {
                    consumer.Add();
                    success = true;
                }
                finally
                {
                    if (!success)
                    {
                        docState.docWriter.SetAborting();
                    }
                }
                fieldState.offset += valueLength;
                fieldState.length++;
                fieldState.position++;
                anyToken = valueLength > 0;
            }
            else
            {
                // tokenized field
                TokenStream stream;
                TokenStream streamValue = field.TokenStreamValue();

                if (streamValue != null)
                {
                    stream = streamValue;
                }
                else
                {
                    // the field does not have a TokenStream,
                    // so we have to obtain one from the analyzer
                    System.IO.TextReader reader; // find or make Reader
                    System.IO.TextReader readerValue = field.ReaderValue();

                    if (readerValue != null)
                    {
                        reader = readerValue;
                    }
                    else
                    {
                        System.String stringValue = field.StringValue();
                        if (stringValue == null)
                        {
                            throw new System.ArgumentException("field must have either TokenStream, String or Reader value");
                        }
                        perThread.stringReader.Init(stringValue);
                        reader = perThread.stringReader;
                    }

                    // Tokenize field and add to postingTable
                    stream = docState.analyzer.ReusableTokenStream(fieldInfo.name, reader);
                }

                // reset the TokenStream to the first token
                stream.Reset();

                int startLength = fieldState.length;

                // deprecated
                bool allowMinus1Position = docState.allowMinus1Position;

                try
                {
                    int offsetEnd = fieldState.offset - 1;

                    bool hasMoreTokens = stream.IncrementToken();

                    fieldState.attributeSource = stream;

                    OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.AddAttribute(typeof(OffsetAttribute));
                    PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.AddAttribute(typeof(PositionIncrementAttribute));

                    consumer.Start(field);

                    for (; ;)
                    {
                        // If we hit an exception in stream.next below
                        // (which is fairly common, eg if analyzer
                        // chokes on a given document), then it's
                        // non-aborting and (above) this one document
                        // will be marked as deleted, but still
                        // consume a docID
                        if (!hasMoreTokens)
                        {
                            break;
                        }

                        int posIncr = posIncrAttribute.GetPositionIncrement();
                        fieldState.position += posIncr;
                        if (allowMinus1Position || fieldState.position > 0)
                        {
                            fieldState.position--;
                        }

                        if (posIncr == 0)
                        {
                            fieldState.numOverlap++;
                        }

                        bool success = false;
                        try
                        {
                            // If we hit an exception in here, we abort
                            // all buffered documents since the last
                            // flush, on the likelihood that the
                            // internal state of the consumer is now
                            // corrupt and should not be flushed to a
                            // new segment:
                            consumer.Add();
                            success = true;
                        }
                        finally
                        {
                            if (!success)
                            {
                                docState.docWriter.SetAborting();
                            }
                        }
                        fieldState.position++;
                        offsetEnd = fieldState.offset + offsetAttribute.EndOffset();
                        if (++fieldState.length >= maxFieldLength)
                        {
                            if (docState.infoStream != null)
                            {
                                docState.infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens");
                            }
                            break;
                        }

                        hasMoreTokens = stream.IncrementToken();
                    }

                    // trigger streams to perform end-of-stream operations
                    stream.End();

                    fieldState.offset += offsetAttribute.EndOffset();
                    anyToken = fieldState.length > startLength;
                }
                finally
                {
                    stream.Close();
                }
            }

            if (anyToken)
            {
                fieldState.offset += docState.analyzer.GetOffsetGap(field);
            }
            fieldState.boost *= field.GetBoost();
        }

        // LUCENE-2387: don't hang onto the field, so GC can
        // reclaim
        fields[i] = null;
    }

    consumer.Finish();
    endConsumer.Finish();
}