Beispiel #1
0
        /// <summary>
        /// Writes the stored fields of <paramref name="doc"/> to the fields stream and
        /// records, in the index stream, the fields-stream file pointer at which this
        /// document's data starts.
        /// </summary>
        /// <remarks>
        /// Per document the method writes a VInt count of stored fields, then for each
        /// stored field: its field number (VInt), one flag byte (tokenized / binary /
        /// compressed) and the value. Binary and compressed values are written as a VInt
        /// length followed by the bytes; plain text values are written with WriteString.
        /// </remarks>
        /// <param name="doc">Document to write; fields that are not stored are skipped.</param>
        internal void  AddDocument(Document doc)
        {
            indexStream.WriteLong(fieldsStream.GetFilePointer());

            // The stored-field count is written ahead of the field data, so the
            // field list has to be walked once just to count.
            int storedCount = 0;
            foreach (Fieldable field in doc.GetFields())
            {
                if (field.IsStored())
                {
                    storedCount++;
                }
            }
            fieldsStream.WriteVInt(storedCount);

            foreach (Fieldable field in doc.GetFields())
            {
                if (!field.IsStored())
                {
                    continue;
                }

                // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
                // and field.BinaryValue() already returns the compressed value for a field
                // with IsCompressed()==true, so we disable compression in that case
                bool disableCompression = (field is FieldsReader.FieldForMerge);

                fieldsStream.WriteVInt(fieldInfos.FieldNumber(field.Name()));

                byte bits = 0;
                if (field.IsTokenized())
                {
                    bits |= FieldsWriter.FIELD_IS_TOKENIZED;
                }
                if (field.IsBinary())
                {
                    bits |= FieldsWriter.FIELD_IS_BINARY;
                }
                if (field.IsCompressed())
                {
                    bits |= FieldsWriter.FIELD_IS_COMPRESSED;
                }

                fieldsStream.WriteByte(bits);

                if (field.IsCompressed())
                {
                    // compression is enabled for the current field
                    byte[] data;

                    if (disableCompression)
                    {
                        // optimized case for merging, the data
                        // is already compressed
                        data = field.BinaryValue();
                    }
                    else if (field.IsBinary())
                    {
                        data = Compress(field.BinaryValue());
                    }
                    else
                    {
                        // Encoding.UTF8 is the cached UTF-8 encoder; it avoids the
                        // by-name lookup of Encoding.GetEncoding("UTF-8") for every field.
                        data = Compress(System.Text.Encoding.UTF8.GetBytes(field.StringValue()));
                    }

                    int len = data.Length;
                    fieldsStream.WriteVInt(len);
                    fieldsStream.WriteBytes(data, len);
                }
                else if (field.IsBinary())
                {
                    // compression is disabled: raw bytes, length-prefixed
                    byte[] data = field.BinaryValue();
                    int    len  = data.Length;
                    fieldsStream.WriteVInt(len);
                    fieldsStream.WriteBytes(data, len);
                }
                else
                {
                    // compression is disabled: plain text value
                    fieldsStream.WriteString(field.StringValue());
                }
            }
        }
        /// <summary>
        /// Runs every indexed entry of <paramref name="fields"/> through the inverting
        /// consumer, advancing the shared field state (position, offset, length, boost)
        /// as tokens are added, then finishes both consumers.
        /// </summary>
        /// <param name="fields">Field instances to process; each processed slot is set to null.</param>
        /// <param name="count">Number of leading entries of <paramref name="fields"/> to process.</param>
        public override void  ProcessFields(Fieldable[] fields, int count)
        {
            fieldState.Reset(docState.doc.GetBoost());

            int  tokenLimit      = docState.maxFieldLength;
            bool invertRequested = consumer.Start(fields, count);

            for (int slot = 0; slot < count; slot++)
            {
                Fieldable current = fields[slot];

                // TODO FI: this should be "genericized" to querying
                // consumer if it wants to see this particular field
                // tokenized.
                if (current.IsIndexed() && invertRequested)
                {
                    bool sawToken;

                    // Tokens were already counted in this field state: apply the
                    // analyzer's position increment gap before continuing.
                    if (fieldState.length > 0)
                    {
                        fieldState.position += docState.analyzer.GetPositionIncrementGap(fieldInfo.name);
                    }

                    if (current.IsTokenized())
                    {
                        // tokenized field: use the field's own TokenStream if it has one,
                        // otherwise ask the analyzer for one over a Reader
                        TokenStream tokens = current.TokenStreamValue();

                        if (tokens == null)
                        {
                            System.IO.TextReader source = current.ReaderValue();

                            if (source == null)
                            {
                                System.String text = current.StringValue();
                                if (text == null)
                                {
                                    throw new System.ArgumentException("field must have either TokenStream, String or Reader value");
                                }
                                perThread.stringReader.Init(text);
                                source = perThread.stringReader;
                            }

                            // Tokenize field and add to postingTable
                            tokens = docState.analyzer.ReusableTokenStream(fieldInfo.name, source);
                        }

                        // reset the TokenStream to the first token
                        tokens.Reset();

                        int lengthBefore = fieldState.length;

                        // deprecated
                        bool minus1Allowed = docState.allowMinus1Position;

                        try
                        {
                            // NOTE: offsetEnd is assigned below but never read in this method.
                            int offsetEnd = fieldState.offset - 1;

                            bool more = tokens.IncrementToken();

                            fieldState.attributeSource = tokens;

                            OffsetAttribute            offsets    = (OffsetAttribute)fieldState.attributeSource.AddAttribute(typeof(OffsetAttribute));
                            PositionIncrementAttribute increments = (PositionIncrementAttribute)fieldState.attributeSource.AddAttribute(typeof(PositionIncrementAttribute));

                            consumer.Start(current);

                            // If we hit an exception in IncrementToken
                            // (which is fairly common, eg if analyzer
                            // chokes on a given document), then it's
                            // non-aborting and (above) this one document
                            // will be marked as deleted, but still
                            // consume a docID
                            while (more)
                            {
                                int step = increments.GetPositionIncrement();
                                fieldState.position += step;
                                if (minus1Allowed || fieldState.position > 0)
                                {
                                    fieldState.position--;
                                }

                                if (step == 0)
                                {
                                    fieldState.numOverlap++;
                                }

                                bool added = false;
                                try
                                {
                                    // If we hit an exception in here, we abort
                                    // all buffered documents since the last
                                    // flush, on the likelihood that the
                                    // internal state of the consumer is now
                                    // corrupt and should not be flushed to a
                                    // new segment:
                                    consumer.Add();
                                    added = true;
                                }
                                finally
                                {
                                    if (!added)
                                    {
                                        docState.docWriter.SetAborting();
                                    }
                                }

                                fieldState.position++;
                                offsetEnd = fieldState.offset + offsets.EndOffset();

                                fieldState.length++;
                                if (fieldState.length >= tokenLimit)
                                {
                                    if (docState.infoStream != null)
                                    {
                                        docState.infoStream.WriteLine("maxFieldLength " + tokenLimit + " reached for field " + fieldInfo.name + ", ignoring following tokens");
                                    }
                                    break;
                                }

                                more = tokens.IncrementToken();
                            }

                            // trigger streams to perform end-of-stream operations
                            tokens.End();

                            fieldState.offset += offsets.EndOffset();
                            sawToken           = fieldState.length > lengthBefore;
                        }
                        finally
                        {
                            tokens.Close();
                        }
                    }
                    else
                    {
                        // un-tokenized field: the whole string value is added as one token
                        System.String text     = current.StringValue();
                        int           textSize = text.Length;
                        perThread.singleTokenTokenStream.Reinit(text, 0, textSize);
                        fieldState.attributeSource = perThread.singleTokenTokenStream;
                        consumer.Start(current);

                        bool added = false;
                        try
                        {
                            consumer.Add();
                            added = true;
                        }
                        finally
                        {
                            // a failure inside the consumer flags the writer as aborting
                            if (!added)
                            {
                                docState.docWriter.SetAborting();
                            }
                        }

                        fieldState.offset += textSize;
                        fieldState.length++;
                        fieldState.position++;
                        sawToken = textSize > 0;
                    }

                    if (sawToken)
                    {
                        fieldState.offset += docState.analyzer.GetOffsetGap(current);
                    }
                    fieldState.boost *= current.GetBoost();
                }

                // LUCENE-2387: don't hang onto the field, so GC can
                // reclaim
                fields[slot] = null;
            }

            consumer.Finish();
            endConsumer.Finish();
        }
Beispiel #3
0
        /// <summary>
        /// Writes one stored field to the fields stream: the field number (VInt), a flag
        /// byte (tokenized / binary / compressed) and then the value.
        /// </summary>
        /// <remarks>
        /// Binary and compressed values are written as a VInt length followed by the
        /// bytes; uncompressed text values are written with WriteString.
        /// </remarks>
        /// <param name="fi">Field info whose <c>number</c> identifies the field in the stream.</param>
        /// <param name="field">Field whose flags and value are written.</param>
        internal void  WriteField(FieldInfo fi, Fieldable field)
        {
            // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
            // and field.BinaryValue() already returns the compressed value for a field
            // with IsCompressed()==true, so we disable compression in that case
            bool disableCompression = (field is FieldsReader.FieldForMerge);

            fieldsStream.WriteVInt(fi.number);
            byte bits = 0;

            if (field.IsTokenized())
            {
                bits |= FieldsWriter.FIELD_IS_TOKENIZED;
            }
            if (field.IsBinary())
            {
                bits |= FieldsWriter.FIELD_IS_BINARY;
            }
            if (field.IsCompressed())
            {
                bits |= FieldsWriter.FIELD_IS_COMPRESSED;
            }

            fieldsStream.WriteByte(bits);

            if (field.IsCompressed())
            {
                // compression is enabled for the current field
                byte[] data;

                if (disableCompression)
                {
                    // optimized case for merging, the data
                    // is already compressed
                    data = field.BinaryValue();
                }
                else if (field.IsBinary())
                {
                    data = Compress(field.BinaryValue());
                }
                else
                {
                    // Encoding.UTF8 is the cached UTF-8 encoder; it avoids the
                    // by-name lookup of Encoding.GetEncoding("UTF-8") on every call.
                    data = Compress(System.Text.Encoding.UTF8.GetBytes(field.StringValue()));
                }

                int len = data.Length;
                fieldsStream.WriteVInt(len);
                fieldsStream.WriteBytes(data, len);
            }
            else if (field.IsBinary())
            {
                // compression is disabled: raw bytes, length-prefixed
                byte[] data = field.BinaryValue();
                int    len  = data.Length;
                fieldsStream.WriteVInt(len);
                fieldsStream.WriteBytes(data, len);
            }
            else
            {
                // compression is disabled: plain text value
                fieldsStream.WriteString(field.StringValue());
            }
        }
		/// <summary>
		/// Writes one stored field to the fields stream: the field number (VInt), a flag
		/// byte (tokenized / binary / compressed) and then the value.
		/// </summary>
		/// <remarks>
		/// Binary and compressed values are written as a VInt length followed by
		/// <c>len</c> bytes taken from <c>data</c> starting at <c>offset</c>; uncompressed
		/// text values are written with WriteString.
		/// </remarks>
		/// <param name="fi">Field info whose <c>number</c> identifies the field in the stream.</param>
		/// <param name="field">Field whose flags and value are written.</param>
		internal void  WriteField(FieldInfo fi, Fieldable field)
		{
			// if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
			// and field.GetBinaryValue() already returns the compressed value for a field
			// with IsCompressed()==true, so we disable compression in that case
			bool disableCompression = (field is FieldsReader.FieldForMerge);
			fieldsStream.WriteVInt(fi.number);
			byte bits = 0;
			if (field.IsTokenized())
			{
				bits |= FieldsWriter.FIELD_IS_TOKENIZED;
			}
			if (field.IsBinary())
			{
				bits |= FieldsWriter.FIELD_IS_BINARY;
			}
			if (field.IsCompressed())
			{
				bits |= FieldsWriter.FIELD_IS_COMPRESSED;
			}
			
			fieldsStream.WriteByte(bits);
			
			if (field.IsCompressed())
			{
				// compression is enabled for the current field
				byte[] data;
				int len;
				int offset;
				if (disableCompression)
				{
					// optimized case for merging, the data
					// is already compressed
					data = field.GetBinaryValue();
					System.Diagnostics.Debug.Assert(data != null);
					len = field.GetBinaryLength();
					offset = field.GetBinaryOffset();
				}
				else
				{
					// check if it is a binary field
					if (field.IsBinary())
					{
						data = CompressionTools.Compress(field.GetBinaryValue(), field.GetBinaryOffset(), field.GetBinaryLength());
					}
					else
					{
						// Encoding.UTF8 is the cached UTF-8 encoder; it avoids the
						// by-name lookup of Encoding.GetEncoding("UTF-8") on every call.
						byte[] x = System.Text.Encoding.UTF8.GetBytes(field.StringValue());
						data = CompressionTools.Compress(x, 0, x.Length);
					}
					// freshly compressed output is written in full from its start
					len = data.Length;
					offset = 0;
				}
				
				fieldsStream.WriteVInt(len);
				fieldsStream.WriteBytes(data, offset, len);
			}
			else
			{
				// compression is disabled for the current field
				if (field.IsBinary())
				{
					byte[] data = field.GetBinaryValue();
					int len = field.GetBinaryLength();
					int offset = field.GetBinaryOffset();
					
					fieldsStream.WriteVInt(len);
					fieldsStream.WriteBytes(data, offset, len);
				}
				else
				{
					fieldsStream.WriteString(field.StringValue());
				}
			}
		}
				/// <summary>
				/// Inverts one occurrence of one field in the document: every token of the
				/// field is passed to AddPosition while position, offset, length and boost
				/// of this instance are advanced.
				/// </summary>
				/// <param name="field">Field occurrence to invert.</param>
				/// <param name="analyzer">Supplies the position increment gap and, when the field has no TokenStream of its own, the token stream.</param>
				/// <param name="maxFieldLength">Token count at which the remaining tokens of the field are ignored.</param>
				public void  InvertField(Fieldable field, Analyzer analyzer, int maxFieldLength)
				{
					// Tokens were already counted for this field: apply the analyzer's gap first.
					if (length > 0)
					{
						position += analyzer.GetPositionIncrementGap(fieldInfo.name);
					}
					
					if (field.IsTokenized())
					{
						// tokenized field: prefer the field's own TokenStream
						TokenStream tokens = field.TokenStreamValue();
						
						if (tokens == null)
						{
							// the field does not have a TokenStream,
							// so we have to obtain one from the analyzer
							System.IO.TextReader source = field.ReaderValue();
							
							if (source == null)
							{
								System.String text = field.StringValue();
								if (text == null)
								{
									throw new System.ArgumentException("field must have either TokenStream, String or Reader value");
								}
								Enclosing_Instance.stringReader.Init(text);
								source = Enclosing_Instance.stringReader;
							}
							
							// Tokenize field and add to postingTable
							tokens = analyzer.ReusableTokenStream(fieldInfo.name, source);
						}
						
						// reset the TokenStream to the first token
						tokens.Reset();
						
						try
						{
							// offsetEnd is a member declared outside this method;
							// presumably AddPosition updates it per token - confirm.
							offsetEnd = offset - 1;
							
							Token next;
							while ((next = tokens.Next(localToken)) != null)
							{
								position += (next.GetPositionIncrement() - 1);
								AddPosition(next);
								
								length++;
								if (length >= maxFieldLength)
								{
									if (Enclosing_Instance.Enclosing_Instance.infoStream != null)
									{
										Enclosing_Instance.Enclosing_Instance.infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens");
									}
									break;
								}
							}
							
							offset = offsetEnd + 1;
						}
						finally
						{
							tokens.Close();
						}
					}
					else
					{
						// un-tokenized field: the whole string value becomes a single token
						System.String text = field.StringValue();
						int textSize = text.Length;
						
						Token single = localToken;
						single.Clear();
						
						// make sure the token's term buffer can hold the whole value
						char[] buffer = single.TermBuffer();
						if (buffer.Length < textSize)
						{
							buffer = single.ResizeTermBuffer(textSize);
						}
						DocumentsWriter.GetCharsFromString(text, 0, textSize, buffer, 0);
						
						single.SetTermLength(textSize);
						single.SetStartOffset(offset);
						single.SetEndOffset(offset + textSize);
						AddPosition(single);
						
						offset += textSize;
						length++;
					}
					
					boost *= field.GetBoost();
				}
        /// <summary>
        /// Runs every indexed entry of <paramref name="fields"/> through the inverting
        /// consumer one token at a time, advancing the shared field state (position,
        /// offset, length, boost), then finishes both consumers.
        /// </summary>
        /// <param name="fields">Field instances to process.</param>
        /// <param name="count">Number of leading entries of <paramref name="fields"/> to process.</param>
        internal override void processFields(Fieldable[] fields, int count)
        {
            fieldState.reset(docState.doc.GetBoost());

            int  tokenLimit      = docState.maxFieldLength;
            bool invertRequested = consumer.start(fields, count);

            for (int slot = 0; slot < count; slot++)
            {
                Fieldable current = fields[slot];

                // TODO FI: this should be "genericized" to querying
                // consumer if it wants to see this particular field
                // tokenized.
                if (current.IsIndexed() && invertRequested)
                {
                    // Tokens were already counted in this field state: apply the
                    // analyzer's position increment gap before continuing.
                    if (fieldState.length > 0)
                    {
                        fieldState.position += docState.analyzer.GetPositionIncrementGap(fieldInfo.name);
                    }

                    if (current.IsTokenized())
                    {
                        // tokenized field: prefer the field's own TokenStream
                        TokenStream tokens = current.TokenStreamValue();

                        if (tokens == null)
                        {
                            // the field does not have a TokenStream,
                            // so we have to obtain one from the analyzer
                            System.IO.TextReader source = current.ReaderValue();

                            if (source == null)
                            {
                                string text = current.StringValue();
                                if (text == null)
                                {
                                    throw new System.ArgumentException("field must have either TokenStream, string or Reader value");
                                }
                                perThread.stringReader.Init(text);
                                source = perThread.stringReader;
                            }

                            // Tokenize field and add to postingTable
                            tokens = docState.analyzer.ReusableTokenStream(fieldInfo.name, source);
                        }

                        // reset the TokenStream to the first token
                        tokens.Reset();

                        try
                        {
                            int   offsetEnd = fieldState.offset - 1;
                            Token reusable  = perThread.localToken;

                            // If we hit an exception in tokens.Next below
                            // (which is fairly common, eg if analyzer
                            // chokes on a given document), then it's
                            // non-aborting and (above) this one document
                            // will be marked as deleted, but still
                            // consume a docID
                            Token next;
                            while ((next = tokens.Next(reusable)) != null)
                            {
                                fieldState.position += (next.GetPositionIncrement() - 1);

                                bool added = false;
                                try
                                {
                                    // If we hit an exception in here, we abort
                                    // all buffered documents since the last
                                    // flush, on the likelihood that the
                                    // internal state of the consumer is now
                                    // corrupt and should not be flushed to a
                                    // new segment:
                                    consumer.add(next);
                                    added = true;
                                }
                                finally
                                {
                                    if (!added)
                                    {
                                        docState.docWriter.SetAborting();
                                    }
                                }

                                fieldState.position++;
                                offsetEnd = fieldState.offset + next.EndOffset();

                                fieldState.length++;
                                if (fieldState.length >= tokenLimit)
                                {
                                    if (docState.infoStream != null)
                                    {
                                        docState.infoStream.WriteLine("maxFieldLength " + tokenLimit + " reached for field " + fieldInfo.name + ", ignoring following tokens");
                                    }
                                    break;
                                }
                            }

                            // continue just past the end offset of the last token added
                            fieldState.offset = offsetEnd + 1;
                        }
                        finally
                        {
                            tokens.Close();
                        }
                    }
                    else
                    {
                        // un-tokenized field: the whole string value is added as one token
                        string text     = current.StringValue();
                        int    textSize = text.Length;
                        Token  single   = perThread.localToken.Reinit(text, fieldState.offset, fieldState.offset + textSize);

                        bool added = false;
                        try
                        {
                            consumer.add(single);
                            added = true;
                        }
                        finally
                        {
                            // a failure inside the consumer flags the writer as aborting
                            if (!added)
                            {
                                docState.docWriter.SetAborting();
                            }
                        }

                        fieldState.offset += textSize;
                        fieldState.length++;
                        fieldState.position++;
                    }

                    fieldState.boost *= current.GetBoost();
                }
            }

            consumer.finish();
            endConsumer.finish();
        }
        /// <summary>
        /// Writes one stored field to the fields stream: the field number (VInt), a flag
        /// byte (tokenized / binary / compressed) and then the value.
        /// </summary>
        /// <remarks>
        /// Binary and compressed values are written as a VInt length followed by
        /// <c>len</c> bytes taken from <c>data</c> starting at <c>offset</c>; uncompressed
        /// text values are written with WriteString.
        /// </remarks>
        /// <param name="fi">Field info whose <c>number</c> identifies the field in the stream.</param>
        /// <param name="field">Field whose flags and value are written.</param>
        internal void  WriteField(FieldInfo fi, Fieldable field)
        {
            // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
            // and field.GetBinaryValue() already returns the compressed value for a field
            // with IsCompressed()==true, so we disable compression in that case
            bool disableCompression = (field is FieldsReader.FieldForMerge);

            fieldsStream.WriteVInt(fi.number);
            byte bits = 0;

            if (field.IsTokenized())
            {
                bits |= FieldsWriter.FIELD_IS_TOKENIZED;
            }
            if (field.IsBinary())
            {
                bits |= FieldsWriter.FIELD_IS_BINARY;
            }
            if (field.IsCompressed())
            {
                bits |= FieldsWriter.FIELD_IS_COMPRESSED;
            }

            fieldsStream.WriteByte(bits);

            if (field.IsCompressed())
            {
                // compression is enabled for the current field
                byte[] data;
                int    len;
                int    offset;

                if (disableCompression)
                {
                    // optimized case for merging, the data
                    // is already compressed
                    data = field.GetBinaryValue();
                    System.Diagnostics.Debug.Assert(data != null);
                    len    = field.GetBinaryLength();
                    offset = field.GetBinaryOffset();
                }
                else
                {
                    // check if it is a binary field
                    if (field.IsBinary())
                    {
                        data = Compress(field.GetBinaryValue(), field.GetBinaryOffset(), field.GetBinaryLength());
                    }
                    else
                    {
                        byte[] x = System.Text.Encoding.UTF8.GetBytes(field.StringValue());
                        data = Compress(x, 0, x.Length);
                    }
                    // freshly compressed output is written in full from its start
                    len    = data.Length;
                    offset = 0;
                }

                fieldsStream.WriteVInt(len);
                fieldsStream.WriteBytes(data, offset, len);
            }
            else
            {
                // compression is disabled for the current field
                if (field.IsBinary())
                {
                    // Read the buffer through GetBinaryValue() so that the offset and
                    // length below refer to the same array, exactly as the compressed
                    // branches above do.
                    // NOTE(review): BinaryValue() presumably returns a copy already
                    // trimmed to offset/length, so pairing it with GetBinaryOffset()
                    // would index the wrong bytes for a non-zero offset - confirm
                    // against the Fieldable implementation.
                    byte[] data   = field.GetBinaryValue();
                    int    len    = field.GetBinaryLength();
                    int    offset = field.GetBinaryOffset();

                    fieldsStream.WriteVInt(len);
                    fieldsStream.WriteBytes(data, offset, len);
                }
                else
                {
                    fieldsStream.WriteString(field.StringValue());
                }
            }
        }
        /// <summary>
        /// Tokenizes the indexed fields of a document into postings via AddPosition and
        /// updates the per-field-number length, position, boost and offset tables.
        /// </summary>
        /// <param name="doc">Document whose fields are inverted; fields that are not indexed leave the tables untouched.</param>
        private void  InvertDocument(Document doc)
        {
            foreach (Fieldable field in doc.GetFields())
            {
                System.String name   = field.Name();
                int           number = fieldInfos.FieldNumber(name);

                // Resume from the state accumulated by earlier fields with the same number.
                int tokenCount = fieldLengths[number];              // length of field
                int pos        = fieldPositions[number];            // position in field
                if (tokenCount > 0)
                {
                    pos += analyzer.GetPositionIncrementGap(name);
                }
                int charOffset = fieldOffsets[number];              // offset field

                if (!field.IsIndexed())
                {
                    continue;
                }

                if (field.IsTokenized())
                {
                    System.IO.TextReader source;                    // find or make Reader
                    if (field.ReaderValue() != null)
                    {
                        source = field.ReaderValue();
                    }
                    else if (field.StringValue() != null)
                    {
                        source = new System.IO.StringReader(field.StringValue());
                    }
                    else
                    {
                        throw new System.ArgumentException("field must have either String or Reader value");
                    }

                    // Tokenize field and add to postingTable
                    TokenStream tokens = analyzer.TokenStream(name, source);
                    try
                    {
                        Token last = null;
                        Token next;
                        while ((next = tokens.Next()) != null)
                        {
                            pos += (next.GetPositionIncrement() - 1);

                            if (field.IsStoreOffsetWithTermVector())
                            {
                                AddPosition(name, next.TermText(), pos++, new TermVectorOffsetInfo(charOffset + next.StartOffset(), charOffset + next.EndOffset()));
                            }
                            else
                            {
                                AddPosition(name, next.TermText(), pos++, null);
                            }

                            last = next;

                            tokenCount++;
                            if (tokenCount >= maxFieldLength)
                            {
                                if (infoStream != null)
                                {
                                    infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
                                }
                                break;
                            }
                        }

                        // advance past the end offset of the last token seen, plus one
                        if (last != null)
                        {
                            charOffset += last.EndOffset() + 1;
                        }
                    }
                    finally
                    {
                        tokens.Close();
                    }
                }
                else
                {
                    // un-tokenized field: the whole string value is one position
                    System.String text = field.StringValue();
                    if (field.IsStoreOffsetWithTermVector())
                    {
                        AddPosition(name, text, pos++, new TermVectorOffsetInfo(charOffset, charOffset + text.Length));
                    }
                    else
                    {
                        AddPosition(name, text, pos++, null);
                    }
                    charOffset += text.Length;
                    tokenCount++;
                }

                fieldLengths[number]   = tokenCount;                // save field length
                fieldPositions[number] = pos;                       // save field position
                fieldBoosts[number]   *= field.GetBoost();
                fieldOffsets[number]   = charOffset;
            }
        }