public override void ProcessFields(IFieldable[] fields, int count)
{
    fieldState.Reset(docState.doc.Boost);

    int maxFieldLength = docState.maxFieldLength;

    bool doInvert = consumer.Start(fields, count);

    for (int i = 0; i < count; i++)
    {
        IFieldable field = fields[i];

        // TODO FI: this should be "genericized" to querying
        // consumer if it wants to see this particular field
        // tokenized.
        if (field.IsIndexed && doInvert)
        {
            bool anyToken;

            if (fieldState.length > 0)
            {
                fieldState.position += docState.analyzer.GetPositionIncrementGap(fieldInfo.name);
            }

            if (!field.IsTokenized)
            {
                // un-tokenized field
                System.String stringValue = field.StringValue;
                int valueLength = stringValue.Length;
                perThread.singleToken.Reinit(stringValue, 0, valueLength);
                fieldState.attributeSource = perThread.singleToken;
                consumer.Start(field);

                bool success = false;
                try
                {
                    consumer.Add();
                    success = true;
                }
                finally
                {
                    if (!success)
                    {
                        docState.docWriter.SetAborting();
                    }
                }
                fieldState.offset += valueLength;
                fieldState.length++;
                fieldState.position++;
                anyToken = valueLength > 0;
            }
            else
            {
                // tokenized field
                TokenStream stream;
                TokenStream streamValue = field.TokenStreamValue;

                if (streamValue != null)
                {
                    stream = streamValue;
                }
                else
                {
                    // the field does not have a TokenStream,
                    // so we have to obtain one from the analyzer
                    System.IO.TextReader reader; // find or make Reader
                    System.IO.TextReader readerValue = field.ReaderValue;

                    if (readerValue != null)
                    {
                        reader = readerValue;
                    }
                    else
                    {
                        System.String stringValue = field.StringValue;
                        if (stringValue == null)
                        {
                            throw new System.ArgumentException("field must have either TokenStream, String or Reader value");
                        }
                        perThread.stringReader.Init(stringValue);
                        reader = perThread.stringReader;
                    }

                    // Tokenize field and add to postingTable
                    stream = docState.analyzer.ReusableTokenStream(fieldInfo.name, reader);
                }

                // reset the TokenStream to the first token
                stream.Reset();

                int startLength = fieldState.length;

                try
                {
                    int offsetEnd = fieldState.offset - 1;

                    bool hasMoreTokens = stream.IncrementToken();

                    fieldState.attributeSource = stream;

                    IOffsetAttribute offsetAttribute = fieldState.attributeSource.AddAttribute<IOffsetAttribute>();
                    IPositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.AddAttribute<IPositionIncrementAttribute>();

                    consumer.Start(field);

                    for (; ; )
                    {
                        // If we hit an exception in stream.next below
                        // (which is fairly common, eg if analyzer
                        // chokes on a given document), then it's
                        // non-aborting and (above) this one document
                        // will be marked as deleted, but still
                        // consume a docID

                        if (!hasMoreTokens)
                        {
                            break;
                        }

                        int posIncr = posIncrAttribute.PositionIncrement;
                        fieldState.position += posIncr;
                        if (fieldState.position > 0)
                        {
                            fieldState.position--;
                        }

                        if (posIncr == 0)
                        {
                            fieldState.numOverlap++;
                        }

                        bool success = false;
                        try
                        {
                            // If we hit an exception in here, we abort
                            // all buffered documents since the last
                            // flush, on the likelihood that the
                            // internal state of the consumer is now
                            // corrupt and should not be flushed to a
                            // new segment:
                            consumer.Add();
                            success = true;
                        }
                        finally
                        {
                            if (!success)
                            {
                                docState.docWriter.SetAborting();
                            }
                        }
                        fieldState.position++;
                        offsetEnd = fieldState.offset + offsetAttribute.EndOffset;
                        if (++fieldState.length >= maxFieldLength)
                        {
                            if (docState.infoStream != null)
                            {
                                docState.infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens");
                            }
                            break;
                        }

                        hasMoreTokens = stream.IncrementToken();
                    }

                    // trigger streams to perform end-of-stream operations
                    stream.End();

                    fieldState.offset += offsetAttribute.EndOffset;
                    anyToken = fieldState.length > startLength;
                }
                finally
                {
                    stream.Close();
                }
            }

            if (anyToken)
            {
                fieldState.offset += docState.analyzer.GetOffsetGap(field);
            }
            fieldState.boost *= field.Boost;
        }

        // LUCENE-2387: don't hang onto the field, so GC can
        // reclaim
        fields[i] = null;
    }

    consumer.Finish();
    endConsumer.Finish();
}
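The branch taken above hinges on Field.IsTokenized: an un-tokenized field is indexed as a single reused token, while a tokenized field is fed through the analyzer's TokenStream. A minimal sketch of how a caller produces each kind of field, assuming the Lucene.NET 3.0.3 Document/Field API; the field names and values are illustrative only:

using Lucene.Net.Documents;

// Illustrative document; only the Field.Index mode matters to ProcessFields.
var doc = new Document();

// IsTokenized == false: handled by the "un-tokenized field" branch, which indexes
// the whole value as one token via perThread.singleToken.
doc.Add(new Field("sku", "ABC-123", Field.Store.YES, Field.Index.NOT_ANALYZED));

// IsTokenized == true: handled by the "tokenized field" branch, which obtains a
// TokenStream (here from the analyzer via ReusableTokenStream) and iterates it.
doc.Add(new Field("body", "full text to be analyzed", Field.Store.NO, Field.Index.ANALYZED));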
public override void ProcessFields(IIndexableField[] fields, int count)
{
    fieldState.Reset();

    bool doInvert = consumer.Start(fields, count);

    for (int i = 0; i < count; i++)
    {
        IIndexableField field = fields[i];
        IIndexableFieldType fieldType = field.FieldType;

        // TODO FI: this should be "genericized" to querying
        // consumer if it wants to see this particular field
        // tokenized.
        if (fieldType.IsIndexed && doInvert)
        {
            bool analyzed = fieldType.IsTokenized && docState.analyzer != null;

            // if the field omits norms, the boost cannot be indexed.
            if (fieldType.OmitNorms && field.Boost != 1.0f)
            {
                throw new System.NotSupportedException("You cannot set an index-time boost: norms are omitted for field '" + field.Name + "'");
            }

            // only bother checking offsets if something will consume them.
            // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
            bool checkOffsets = fieldType.IndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
            int lastStartOffset = 0;

            if (i > 0)
            {
                fieldState.Position += analyzed ? docState.analyzer.GetPositionIncrementGap(fieldInfo.Name) : 0;
            }

            /*
             * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
             * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
             * but rather a finally that takes note of the problem.
             */
            bool succeededInProcessingField = false;

            TokenStream stream = field.GetTokenStream(docState.analyzer);
            // reset the TokenStream to the first token
            stream.Reset();

            try
            {
                bool hasMoreTokens = stream.IncrementToken();

                fieldState.AttributeSource = stream;

                IOffsetAttribute offsetAttribute = fieldState.AttributeSource.AddAttribute<IOffsetAttribute>();
                IPositionIncrementAttribute posIncrAttribute = fieldState.AttributeSource.AddAttribute<IPositionIncrementAttribute>();

                if (hasMoreTokens)
                {
                    consumer.Start(field);

                    do
                    {
                        // If we hit an exception in stream.next below
                        // (which is fairly common, eg if analyzer
                        // chokes on a given document), then it's
                        // non-aborting and (above) this one document
                        // will be marked as deleted, but still
                        // consume a docID

                        int posIncr = posIncrAttribute.PositionIncrement;
                        if (posIncr < 0)
                        {
                            throw new System.ArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.Name + "'");
                        }
                        if (fieldState.Position == 0 && posIncr == 0)
                        {
                            throw new System.ArgumentException("first position increment must be > 0 (got 0) for field '" + field.Name + "'");
                        }
                        int position = fieldState.Position + posIncr;
                        if (position > 0)
                        {
                            // NOTE: confusing: this "mirrors" the
                            // position++ we do below
                            position--;
                        }
                        else if (position < 0)
                        {
                            throw new System.ArgumentException("position overflow for field '" + field.Name + "'");
                        }

                        // position is legal, we can safely place it in fieldState now.
                        // not sure if anything will use fieldState after non-aborting exc...
                        fieldState.Position = position;

                        if (posIncr == 0)
                        {
                            fieldState.NumOverlap++;
                        }

                        if (checkOffsets)
                        {
                            int startOffset = fieldState.Offset + offsetAttribute.StartOffset;
                            int endOffset = fieldState.Offset + offsetAttribute.EndOffset;
                            if (startOffset < 0 || endOffset < startOffset)
                            {
                                throw new System.ArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.Name + "'");
                            }
                            if (startOffset < lastStartOffset)
                            {
                                throw new System.ArgumentException("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.Name + "'");
                            }
                            lastStartOffset = startOffset;
                        }

                        bool success = false;
                        try
                        {
                            // If we hit an exception in here, we abort
                            // all buffered documents since the last
                            // flush, on the likelihood that the
                            // internal state of the consumer is now
                            // corrupt and should not be flushed to a
                            // new segment:
                            consumer.Add();
                            success = true;
                        }
                        finally
                        {
                            if (!success)
                            {
                                docState.docWriter.SetAborting();
                            }
                        }
                        fieldState.Length++;
                        fieldState.Position++;
                    } while (stream.IncrementToken());
                }

                // trigger streams to perform end-of-stream operations
                stream.End();

                // TODO: maybe add some safety? then again, its already checked
                // when we come back around to the field...
                fieldState.Position += posIncrAttribute.PositionIncrement;
                fieldState.Offset += offsetAttribute.EndOffset;

                if (docState.maxTermPrefix != null)
                {
                    string msg = "Document contains at least one immense term in field=\"" + fieldInfo.Name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'";
                    if (docState.infoStream.IsEnabled("IW"))
                    {
                        docState.infoStream.Message("IW", "ERROR: " + msg);
                    }
                    docState.maxTermPrefix = null;
                    throw new System.ArgumentException(msg);
                }

                /* if success was false above there is an exception coming through and we won't get here.*/
                succeededInProcessingField = true;
            }
            finally
            {
                if (!succeededInProcessingField)
                {
                    IOUtils.CloseWhileHandlingException(stream);
                }
                else
                {
                    stream.Dispose();
                }
                if (!succeededInProcessingField && docState.infoStream.IsEnabled("DW"))
                {
                    docState.infoStream.Message("DW", "An exception was thrown while processing field " + fieldInfo.Name);
                }
            }

            fieldState.Offset += analyzed ? docState.analyzer.GetOffsetGap(fieldInfo.Name) : 0;
            fieldState.Boost *= field.Boost;
        }

        // LUCENE-2387: don't hang onto the field, so GC can
        // reclaim
        fields[i] = null;
    }

    consumer.Finish();
    endConsumer.Finish();
}
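Both versions consume the analysis chain through the same attribute-based TokenStream contract: acquire the attribute instances once, call Reset(), loop on IncrementToken() while reading the refreshed attribute values, then call End() and dispose the stream. A standalone sketch of that contract, assuming Lucene.NET 4.8-style names (StandardAnalyzer, LuceneVersion.LUCENE_48, the Lucene.Net.Analysis.TokenAttributes namespace); these may differ in other versions:

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

public static class TokenStreamDemo
{
    public static void Main()
    {
        // Assumption: Lucene.NET 4.8-style analyzer construction.
        Analyzer analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
        TokenStream stream = analyzer.GetTokenStream("body", new StringReader("Hello token stream world"));

        // Attributes are acquired once; IncrementToken() refills them for each token.
        ICharTermAttribute termAtt = stream.AddAttribute<ICharTermAttribute>();
        IOffsetAttribute offsetAtt = stream.AddAttribute<IOffsetAttribute>();
        IPositionIncrementAttribute posIncAtt = stream.AddAttribute<IPositionIncrementAttribute>();

        try
        {
            stream.Reset();  // must be called before the first IncrementToken()
            int position = -1;
            while (stream.IncrementToken())
            {
                // Accumulate positions the same way ProcessFields does, one increment per token.
                position += posIncAtt.PositionIncrement;
                Console.WriteLine("term={0} pos={1} offsets=[{2},{3}]",
                    termAtt.ToString(), position, offsetAtt.StartOffset, offsetAtt.EndOffset);
            }
            stream.End();    // records the final offset and position increment
        }
        finally
        {
            stream.Dispose();
        }
    }
}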