public override void ProcessFields(IFieldable[] fields, int count)
        {
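            // Reset the per-field inversion state, seeding the boost from the
            // document-level boost; per-field boosts are multiplied in below.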
            fieldState.Reset(docState.doc.Boost);

            int maxFieldLength = docState.maxFieldLength;

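            // Give the consumer first look at the whole field set; it reports
            // whether anything downstream actually wants these fields inverted.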
            bool doInvert = consumer.Start(fields, count);

            for (int i = 0; i < count; i++)
            {
                IFieldable field = fields[i];

                // TODO FI: this should be "genericized": ask the consumer
                // whether it wants to see this particular field tokenized.
                if (field.IsIndexed && doInvert)
                {
                    bool anyToken;

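                    // An earlier value of this same field has already been
                    // inverted; add the analyzer's position increment gap so
                    // phrase/span matches cannot cross the value boundary.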
                    if (fieldState.length > 0)
                    {
                        fieldState.position += docState.analyzer.GetPositionIncrementGap(fieldInfo.name);
                    }

                    if (!field.IsTokenized)
                    {
                        // un-tokenized field
                        System.String stringValue = field.StringValue;
                        int           valueLength = stringValue.Length;
                        perThread.singleToken.Reinit(stringValue, 0, valueLength);
                        fieldState.attributeSource = perThread.singleToken;
                        consumer.Start(field);

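                        // If Add() throws, mark the writer as aborting: the
                        // consumer's internal state may now be corrupt.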
                        bool success = false;
                        try
                        {
                            consumer.Add();
                            success = true;
                        }
                        finally
                        {
                            if (!success)
                            {
                                docState.docWriter.SetAborting();
                            }
                        }
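                        // Advance the cumulative offset/length/position so any
                        // following value of this field starts after this one.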
                        fieldState.offset += valueLength;
                        fieldState.length++;
                        fieldState.position++;
                        anyToken = valueLength > 0;
                    }
                    else
                    {
                        // tokenized field
                        TokenStream stream;
                        TokenStream streamValue = field.TokenStreamValue;

                        if (streamValue != null)
                        {
                            stream = streamValue;
                        }
                        else
                        {
                            // the field does not have a TokenStream,
                            // so we have to obtain one from the analyzer
                            System.IO.TextReader reader; // find or make Reader
                            System.IO.TextReader readerValue = field.ReaderValue;

                            if (readerValue != null)
                            {
                                reader = readerValue;
                            }
                            else
                            {
                                System.String stringValue = field.StringValue;
                                if (stringValue == null)
                                {
                                    throw new System.ArgumentException("field must have either TokenStream, String or Reader value");
                                }
                                perThread.stringReader.Init(stringValue);
                                reader = perThread.stringReader;
                            }

                            // Obtain a (reusable) TokenStream from the analyzer;
                            // the tokens are consumed in the loop below
                            stream = docState.analyzer.ReusableTokenStream(fieldInfo.name, reader);
                        }

                        // reset the TokenStream to the first token
                        stream.Reset();

                        int startLength = fieldState.length;

                        try
                        {
                            int offsetEnd = fieldState.offset - 1;

                            bool hasMoreTokens = stream.IncrementToken();

                            fieldState.attributeSource = stream;

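                            // Downstream consumers read term, offset and position
                            // data through fieldState.attributeSource.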
                            IOffsetAttribute            offsetAttribute  = fieldState.attributeSource.AddAttribute<IOffsetAttribute>();
                            IPositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.AddAttribute<IPositionIncrementAttribute>();

                            consumer.Start(field);

                            for (;;)
                            {
                                // If we hit an exception in stream.IncrementToken below
                                // (which is fairly common, e.g. if the analyzer
                                // chokes on a given document), it is non-aborting:
                                // as noted above, this one document will be marked
                                // as deleted, but it still consumes a docID

                                if (!hasMoreTokens)
                                {
                                    break;
                                }

                                int posIncr = posIncrAttribute.PositionIncrement;
                                fieldState.position += posIncr;
                                if (fieldState.position > 0)
                                {
                                    fieldState.position--;
                                }

                                if (posIncr == 0)
                                {
                                    fieldState.numOverlap++;
                                }

                                bool success = false;
                                try
                                {
                                    // If we hit an exception in here, we abort
                                    // all buffered documents since the last
                                    // flush, on the likelihood that the
                                    // internal state of the consumer is now
                                    // corrupt and should not be flushed to a
                                    // new segment:
                                    consumer.Add();
                                    success = true;
                                }
                                finally
                                {
                                    if (!success)
                                    {
                                        docState.docWriter.SetAborting();
                                    }
                                }
                                fieldState.position++;
                                offsetEnd = fieldState.offset + offsetAttribute.EndOffset;
                                if (++fieldState.length >= maxFieldLength)
                                {
                                    if (docState.infoStream != null)
                                    {
                                        docState.infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens");
                                    }
                                    break;
                                }

                                hasMoreTokens = stream.IncrementToken();
                            }
                            // trigger the stream to perform end-of-stream operations
                            stream.End();

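                            // After End(), the offset attribute holds the final
                            // offset; fold it into the field's cumulative offset.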
                            fieldState.offset += offsetAttribute.EndOffset;
                            anyToken           = fieldState.length > startLength;
                        }
                        finally
                        {
                            stream.Close();
                        }
                    }

                    if (anyToken)
                    {
                        fieldState.offset += docState.analyzer.GetOffsetGap(field);
                    }
                    fieldState.boost *= field.Boost;
                }

                // LUCENE-2387: don't hang onto the field, so GC can
                // reclaim
                fields[i] = null;
            }

            consumer.Finish();
            endConsumer.Finish();
        }
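
The GetPositionIncrementGap calls above are what keep phrase queries from matching across two values of a multi-valued field. As a minimal sketch of an analyzer that opts in (assuming the Lucene.NET 3.x Analyzer API; the class name GapAnalyzer and the gap value 100 are illustrative, not taken from the code above):

using Lucene.Net.Analysis;

public class GapAnalyzer : Analyzer
{
    public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
    {
        return new WhitespaceTokenizer(reader);
    }

    // DocInverterPerField adds this value to fieldState.position between
    // successive values of the same field, so a phrase such as "end start"
    // cannot straddle the boundary between value 1 and value 2.
    public override int GetPositionIncrementGap(string fieldName)
    {
        return 100;
    }
}
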
Example 2
        public override void ProcessFields(IIndexableField[] fields, int count)
        {
            fieldState.Reset();

            bool doInvert = consumer.Start(fields, count);

            for (int i = 0; i < count; i++)
            {
                IIndexableField     field     = fields[i];
                IIndexableFieldType fieldType = field.FieldType;

                // TODO FI: this should be "genericized": ask the consumer
                // whether it wants to see this particular field tokenized.
                if (fieldType.IsIndexed && doInvert)
                {
                    bool analyzed = fieldType.IsTokenized && docState.analyzer != null;

                    // if the field omits norms, the boost cannot be indexed.
                    if (fieldType.OmitNorms && field.Boost != 1.0f)
                    {
                        throw new System.NotSupportedException("You cannot set an index-time boost: norms are omitted for field '" + field.Name + "'");
                    }

                    // only bother checking offsets if something will consume them.
                    // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
                    bool checkOffsets    = fieldType.IndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
                    int  lastStartOffset = 0;

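                    // Between successive field instances (i > 0), add the
                    // analyzer's position increment gap; note this keys off the
                    // field index, where the 3.x code above checked fieldState.length.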
                    if (i > 0)
                    {
                        fieldState.Position += analyzed ? docState.analyzer.GetPositionIncrementGap(fieldInfo.Name) : 0;
                    }

                    /*
                     * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
                     * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
                     * but rather a finally that takes note of the problem.
                     */

                    bool succeededInProcessingField = false;

                    TokenStream stream = field.GetTokenStream(docState.analyzer);
                    // reset the TokenStream to the first token
                    stream.Reset();

                    try
                    {
                        bool hasMoreTokens = stream.IncrementToken();

                        fieldState.AttributeSource = stream;

                        IOffsetAttribute            offsetAttribute  = fieldState.AttributeSource.AddAttribute<IOffsetAttribute>();
                        IPositionIncrementAttribute posIncrAttribute = fieldState.AttributeSource.AddAttribute<IPositionIncrementAttribute>();

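                        // Only start the consumer if the stream produced at least
                        // one token (the 3.x version above starts it unconditionally).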
                        if (hasMoreTokens)
                        {
                            consumer.Start(field);

                            do
                            {
                                // If we hit an exception in stream.IncrementToken below
                                // (which is fairly common, e.g. if the analyzer
                                // chokes on a given document), it is non-aborting:
                                // as noted above, this one document will be marked
                                // as deleted, but it still consumes a docID

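                                // Enforce the TokenStream contract before trusting the
                                // increment: it must be >= 0, and the very first
                                // increment in the field must be > 0.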
                                int posIncr = posIncrAttribute.PositionIncrement;
                                if (posIncr < 0)
                                {
                                    throw new System.ArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.Name + "'");
                                }
                                if (fieldState.Position == 0 && posIncr == 0)
                                {
                                    throw new System.ArgumentException("first position increment must be > 0 (got 0) for field '" + field.Name + "'");
                                }
                                int position = fieldState.Position + posIncr;
                                if (position > 0)
                                {
                                    // NOTE: confusing: this "mirrors" the
                                    // position++ we do below
                                    position--;
                                }
                                else if (position < 0)
                                {
                                    throw new System.ArgumentException("position overflow for field '" + field.Name + "'");
                                }

                                // position is legal, so we can safely place it in fieldState now.
                                // not sure whether anything uses fieldState after a non-aborting exception...
                                fieldState.Position = position;

                                if (posIncr == 0)
                                {
                                    fieldState.NumOverlap++;
                                }

                                if (checkOffsets)
                                {
                                    int startOffset = fieldState.Offset + offsetAttribute.StartOffset;
                                    int endOffset   = fieldState.Offset + offsetAttribute.EndOffset;
                                    if (startOffset < 0 || endOffset < startOffset)
                                    {
                                        throw new System.ArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.Name + "'");
                                    }
                                    if (startOffset < lastStartOffset)
                                    {
                                        throw new System.ArgumentException("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.Name + "'");
                                    }
                                    lastStartOffset = startOffset;
                                }

                                bool success = false;
                                try
                                {
                                    // If we hit an exception in here, we abort
                                    // all buffered documents since the last
                                    // flush, on the likelihood that the
                                    // internal state of the consumer is now
                                    // corrupt and should not be flushed to a
                                    // new segment:
                                    consumer.Add();
                                    success = true;
                                }
                                finally
                                {
                                    if (!success)
                                    {
                                        docState.docWriter.SetAborting();
                                    }
                                }
                                fieldState.Length++;
                                fieldState.Position++;
                            } while (stream.IncrementToken());
                        }
                        // trigger the stream to perform end-of-stream operations
                        stream.End();
                        // TODO: maybe add some safety? Then again, it's already checked
                        // when we come back around to the field...
                        fieldState.Position += posIncrAttribute.PositionIncrement;
                        fieldState.Offset   += offsetAttribute.EndOffset;

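                        // docState.maxTermPrefix is set during inversion when a
                        // term's UTF-8 form exceeds MAX_TERM_LENGTH_UTF8; report
                        // it as a document-level error after logging it.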
                        if (docState.maxTermPrefix != null)
                        {
                            string msg = "Document contains at least one immense term in field=\"" + fieldInfo.Name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'";
                            if (docState.infoStream.IsEnabled("IW"))
                            {
                                docState.infoStream.Message("IW", "ERROR: " + msg);
                            }
                            docState.maxTermPrefix = null;
                            throw new System.ArgumentException(msg);
                        }

                        /* If success was false above, an exception is propagating and we won't get here. */
                        succeededInProcessingField = true;
                    }
                    finally
                    {
                        if (!succeededInProcessingField)
                        {
                            IOUtils.CloseWhileHandlingException(stream);
                        }
                        else
                        {
                            stream.Dispose();
                        }
                        if (!succeededInProcessingField && docState.infoStream.IsEnabled("DW"))
                        {
                            docState.infoStream.Message("DW", "An exception was thrown while processing field " + fieldInfo.Name);
                        }
                    }

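                    // Analogous to the position gap: bump the cumulative offset
                    // by the analyzer's offset gap after each value of this field.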
                    fieldState.Offset += analyzed ? docState.analyzer.GetOffsetGap(fieldInfo.Name) : 0;
                    fieldState.Boost  *= field.Boost;
                }

                // LUCENE-2387: don't hang onto the field, so GC can
                // reclaim
                fields[i] = null;
            }

            consumer.Finish();
            endConsumer.Finish();
        }
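
The contract checks in this 4.x version (position increment >= 0, first increment > 0, offsets non-negative and non-decreasing) reject broken analysis chains early. As a hedged sketch of the kind of stream they catch (assuming Lucene.NET 4.8-style attribute APIs; the class name BadFirstIncrementStream is invented for illustration), a stream whose first token carries a position increment of 0 would make ProcessFields throw the "first position increment must be > 0" ArgumentException:

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;

public sealed class BadFirstIncrementStream : TokenStream
{
    private readonly ICharTermAttribute termAtt;
    private readonly IPositionIncrementAttribute posIncrAtt;
    private bool exhausted;

    public BadFirstIncrementStream()
    {
        termAtt = AddAttribute<ICharTermAttribute>();
        posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    }

    public override bool IncrementToken()
    {
        if (exhausted)
        {
            return false;
        }
        exhausted = true;
        ClearAttributes();
        termAtt.Append("first");
        // Illegal: the first token in a field must have an increment > 0.
        posIncrAtt.PositionIncrement = 0;
        return true;
    }
}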