// Process all occurrences of a single field in one doc; // count is 1 if a given field occurs only once in the // Document, which is the "typical" case internal override void processFields(Fieldable[] fields, int count) { StoredFieldsWriter.PerDoc doc; if (perThread.doc == null) { doc = perThread.doc = perThread.storedFieldsWriter.getPerDoc(); doc.docID = docState.docID; perThread.localFieldsWriter.SetFieldsStream(doc.fdt); System.Diagnostics.Debug.Assert(doc.numStoredFields == 0, "doc.numStoredFields=" + doc.numStoredFields); System.Diagnostics.Debug.Assert(0 == doc.fdt.Length()); System.Diagnostics.Debug.Assert(0 == doc.fdt.GetFilePointer()); } else { doc = perThread.doc; System.Diagnostics.Debug.Assert(doc.docID == docState.docID, "doc.docID=" + doc.docID + " docState.docID=" + docState.docID); } for (int i = 0; i < count; i++) { Fieldable field = fields[i]; if (field.IsStored()) { perThread.localFieldsWriter.WriteField(fieldInfo, field); System.Diagnostics.Debug.Assert(docState.TestPoint("StoredFieldsWriterPerField.processFields.writeField")); doc.numStoredFields++; } } }
internal void AddDocument(Document doc) { indexStream.WriteLong(fieldsStream.GetFilePointer()); int storedCount = 0; System.Collections.IEnumerator fieldIterator = doc.GetFields().GetEnumerator(); while (fieldIterator.MoveNext()) { Fieldable field = (Fieldable)fieldIterator.Current; if (field.IsStored()) { storedCount++; } } fieldsStream.WriteVInt(storedCount); fieldIterator = doc.GetFields().GetEnumerator(); while (fieldIterator.MoveNext()) { Fieldable field = (Fieldable)fieldIterator.Current; if (field.IsStored()) { WriteField(fieldInfos.FieldInfo(field.Name()), field); } } }
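This two-pass AddDocument shows the stored-fields layout: the index stream records the data stream's current file pointer for each document, then the data stream receives a VInt count of stored fields followed by each field's payload. Below is a minimal sketch of that pointer-plus-count layout in plain C#, with BinaryWriter standing in for Lucene's IndexOutput and a fixed-width int in place of the VInt; the class and member names are illustrative, not Lucene's API.

using System.IO;

// Illustrates the two-file stored-fields layout: an index file records,
// per document, the byte offset of that document's entry in the data
// file; the data file holds a field count followed by the field values.
class StoredFieldsLayoutSketch
{
    private readonly BinaryWriter indexStream;  // plays the role of the .fdx index stream
    private readonly BinaryWriter fieldsStream; // plays the role of the .fdt data stream

    public StoredFieldsLayoutSketch(Stream index, Stream data)
    {
        indexStream = new BinaryWriter(index);
        fieldsStream = new BinaryWriter(data);
    }

    public void AddDocument(string[] storedValues)
    {
        // Remember where this document's stored fields begin in the data file.
        indexStream.Write(fieldsStream.BaseStream.Position);
        // Count first, then the payloads, mirroring WriteVInt + WriteField above.
        fieldsStream.Write(storedValues.Length);
        foreach (string value in storedValues)
        {
            fieldsStream.Write(value);
        }
    }
}

Random access then needs only the index file: to read document n, seek to the n-th pointer, follow it into the data file, and read count fields.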
internal override bool Start(Fieldable[] fields, int count) { for (int i = 0; i < count; i++) if (fields[i].IsIndexed()) return true; return false; }
/// <summary> Just like <see cref="GetPositionIncrementGap" />, except for /// Token offsets instead. By default this returns 1 for /// tokenized fields (as if the fields were joined with an /// extra space character) and 0 for un-tokenized fields. /// This method is only called if the field produced at /// least one token for indexing. /// /// </summary> /// <param name="field">the field just indexed /// </param> /// <returns> offset gap, added to the next token emitted from <see cref="TokenStream(String,System.IO.TextReader)" /> /// </returns> public virtual int GetOffsetGap(Fieldable field) { if (field.IsTokenized()) return 1; else return 0; }
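Because GetOffsetGap is virtual, an Analyzer can widen the gap so that character offsets of successive values of a multi-valued field do not run together (useful, for example, to keep highlighting from spanning value boundaries). A minimal sketch, assuming the Lucene.Net 2.x-era Analyzer API shown above; the gap of 100 is an arbitrary example value.

using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Documents;

// An Analyzer that leaves a wide offset gap between successive values
// of a multi-valued field.
class WideOffsetGapAnalyzer : Analyzer
{
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new WhitespaceTokenizer(reader);
    }

    public override int GetOffsetGap(Fieldable field)
    {
        // The default would be 1 for tokenized fields and 0 otherwise.
        return field.IsTokenized() ? 100 : 0;
    }
}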
internal void WriteField(FieldInfo fi, Fieldable field) { // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode // and field.BinaryValue() already returns the compressed value for a field // with IsCompressed()==true, so we disable compression in that case bool disableCompression = (field is FieldsReader.FieldForMerge); fieldsStream.WriteVInt(fi.number); byte bits = 0; if (field.IsTokenized()) bits |= FieldsWriter.FIELD_IS_TOKENIZED; if (field.IsBinary()) bits |= FieldsWriter.FIELD_IS_BINARY; if (field.IsCompressed()) bits |= FieldsWriter.FIELD_IS_COMPRESSED; fieldsStream.WriteByte(bits); if (field.IsCompressed()) { // compression is enabled for the current field byte[] data = null; if (disableCompression) { // optimized case for merging, the data // is already compressed data = field.BinaryValue(); } else { // check if it is a binary field if (field.IsBinary()) { data = Compress(field.BinaryValue()); } else { data = Compress(System.Text.Encoding.GetEncoding("UTF-8").GetBytes(field.StringValue())); } } int len = data.Length; fieldsStream.WriteVInt(len); fieldsStream.WriteBytes(data, len); } else { // compression is disabled for the current field if (field.IsBinary()) { byte[] data = field.BinaryValue(); int len = data.Length; fieldsStream.WriteVInt(len); fieldsStream.WriteBytes(data, len); } else { fieldsStream.WriteString(field.StringValue()); } } }
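The bits byte written above packs three boolean flags into a single byte. A small decoding sketch follows; the flag values are assumed here to be the usual FieldsWriter constants (0x1 tokenized, 0x2 binary, 0x4 compressed), and the class name is illustrative.

// Unpacks the per-field flags byte written by WriteField above.
static class FieldBitsSketch
{
    private const byte FIELD_IS_TOKENIZED = 0x1;  // assumed value
    private const byte FIELD_IS_BINARY = 0x2;     // assumed value
    private const byte FIELD_IS_COMPRESSED = 0x4; // assumed value

    public static string Describe(byte bits)
    {
        bool tokenized = (bits & FIELD_IS_TOKENIZED) != 0;
        bool binary = (bits & FIELD_IS_BINARY) != 0;
        bool compressed = (bits & FIELD_IS_COMPRESSED) != 0;
        return "tokenized=" + tokenized + ", binary=" + binary + ", compressed=" + compressed;
    }
}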
internal override void Start(Fieldable f) { termAtt = (TermAttribute)fieldState.attributeSource.AddAttribute(typeof(TermAttribute)); consumer.Start(f); if (nextPerField != null) { nextPerField.Start(f); } }
/// <summary>Adds field info for a Document. </summary> public void Add(Document doc) { System.Collections.IList fields = doc.GetFields(); System.Collections.IEnumerator fieldIterator = fields.GetEnumerator(); while (fieldIterator.MoveNext()) { Fieldable field = (Fieldable)fieldIterator.Current; Add(field.Name(), field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms()); } }
/// <summary> Return the offsetGap from the analyzer assigned to field </summary> public override int GetOffsetGap(Lucene.Net.Documents.Fieldable field) { Analyzer analyzer = (Analyzer)analyzerMap[field.Name()]; if (analyzer == null) { analyzer = defaultAnalyzer; } return(analyzer.GetOffsetGap(field)); }
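The override above delegates to whichever Analyzer is registered for the field's name, falling back to a default. A sketch of that delegation pattern, in the spirit of Lucene's PerFieldAnalyzerWrapper; the class and member names here are illustrative.

using System.Collections;
using Lucene.Net.Analysis;

// Maps field names to analyzers, with a default for unmapped fields.
class PerFieldDelegationSketch
{
    private readonly IDictionary analyzerMap = new Hashtable();
    private readonly Analyzer defaultAnalyzer;

    public PerFieldDelegationSketch(Analyzer defaultAnalyzer)
    {
        this.defaultAnalyzer = defaultAnalyzer;
    }

    public void AddAnalyzer(string fieldName, Analyzer analyzer)
    {
        analyzerMap[fieldName] = analyzer;
    }

    public Analyzer GetAnalyzer(string fieldName)
    {
        Analyzer analyzer = (Analyzer)analyzerMap[fieldName];
        return analyzer != null ? analyzer : defaultAnalyzer;
    }
}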
internal override void Start(Fieldable f) { if (fieldState.attributeSource.HasAttribute(typeof(PayloadAttribute))) { payloadAttribute = (PayloadAttribute)fieldState.attributeSource.GetAttribute(typeof(PayloadAttribute)); } else { payloadAttribute = null; } }
internal override void Start(Fieldable f) { if (doVectorOffsets) { offsetAttribute = (OffsetAttribute)fieldState.attributeSource.AddAttribute(typeof(OffsetAttribute)); } else { offsetAttribute = null; } }
/// <summary> Just like {@link #getPositionIncrementGap}, except for /// Token offsets instead. By default this returns 1 for /// tokenized fields (as if the fields were joined with an /// extra space character) and 0 for un-tokenized fields. /// This method is only called if the field produced at /// least one token for indexing. /// /// </summary> /// <param name="field">the field just indexed /// </param> /// <returns> offset gap, added to the next token emitted from {@link #TokenStream(String,Reader)} /// </returns> public virtual int GetOffsetGap(Fieldable field) { if (field.IsTokenized()) { return(1); } else { return(0); } }
public void AddField(Fieldable field, FieldInfo fieldInfo) { if (doc == null) { doc = storedFieldsWriter.GetPerDoc(); doc.docID = docState.docID; localFieldsWriter.SetFieldsStream(doc.fdt); System.Diagnostics.Debug.Assert(doc.numStoredFields == 0, "doc.numStoredFields=" + doc.numStoredFields); System.Diagnostics.Debug.Assert(0 == doc.fdt.Length()); System.Diagnostics.Debug.Assert(0 == doc.fdt.GetFilePointer()); } localFieldsWriter.WriteField(fieldInfo, field); System.Diagnostics.Debug.Assert(docState.TestPoint("StoredFieldsWriterPerThread.processFields.writeField")); doc.numStoredFields++; }
internal override bool Start(Fieldable[] fields, int count) { doVectors = false; doVectorPositions = false; doVectorOffsets = false; for (int i = 0; i < count; i++) { Fieldable field = fields[i]; if (field.IsIndexed() && field.IsTermVectorStored()) { doVectors = true; doVectorPositions |= field.IsStorePositionWithTermVector(); doVectorOffsets |= field.IsStoreOffsetWithTermVector(); } } if (doVectors) { if (perThread.doc == null) { perThread.doc = termsWriter.GetPerDoc(); perThread.doc.docID = docState.docID; System.Diagnostics.Debug.Assert(perThread.doc.numVectorFields == 0); System.Diagnostics.Debug.Assert(0 == perThread.doc.tvf.Length()); System.Diagnostics.Debug.Assert(0 == perThread.doc.tvf.GetFilePointer()); } else { System.Diagnostics.Debug.Assert(perThread.doc.docID == docState.docID); if (termsHashPerField.numPostings != 0) { // Only necessary if previous doc hit a // non-aborting exception while writing vectors in // this field: termsHashPerField.Reset(); } } } // TODO: only if needed for performance //perThread.postingsCount = 0; return(doVectors); }
public virtual void TestDocument() { Assert.IsTrue(reader.NumDocs() == 1); Assert.IsTrue(reader.MaxDoc() >= 1); Document result = reader.Document(0); Assert.IsTrue(result != null); //There are 2 unstored fields on the document that are not preserved across writing Assert.IsTrue(DocHelper.NumFields(result) == DocHelper.NumFields(testDoc) - DocHelper.unstored.Count); System.Collections.IList fields = result.GetFields(); for (System.Collections.IEnumerator iter = fields.GetEnumerator(); iter.MoveNext();) { Fieldable field = (Fieldable)iter.Current; Assert.IsTrue(field != null); Assert.IsTrue(DocHelper.nameValues.Contains(field.Name())); } }
internal override bool Start(Fieldable[] fields, int count) { doVectors = false; doVectorPositions = false; doVectorOffsets = false; for (int i = 0; i < count; i++) { Fieldable field = fields[i]; if (field.IsIndexed() && field.IsTermVectorStored()) { doVectors = true; doVectorPositions |= field.IsStorePositionWithTermVector(); doVectorOffsets |= field.IsStoreOffsetWithTermVector(); } } if (doVectors) { if (perThread.doc == null) { perThread.doc = termsWriter.GetPerDoc(); perThread.doc.docID = docState.docID; System.Diagnostics.Debug.Assert(perThread.doc.numVectorFields == 0); System.Diagnostics.Debug.Assert(0 == perThread.doc.tvf.Length()); System.Diagnostics.Debug.Assert(0 == perThread.doc.tvf.GetFilePointer()); } else { System.Diagnostics.Debug.Assert(perThread.doc.docID == docState.docID); if (termsHashPerField.numPostings != 0) // Only necessary if previous doc hit a // non-aborting exception while writing vectors in // this field: termsHashPerField.Reset(); } } // TODO: only if needed for performance //perThread.postingsCount = 0; return doVectors; }
public static void CheckNorms(IndexReader reader) { // test omit norms for (int i = 0; i < DocHelper.fields.Length; i++) { Fieldable f = DocHelper.fields[i]; if (f.IsIndexed()) { Assert.AreEqual(reader.HasNorms(f.Name()), !f.GetOmitNorms()); Assert.AreEqual(reader.HasNorms(f.Name()), !DocHelper.noNorms.Contains(f.Name())); if (!reader.HasNorms(f.Name())) { // test for fake norms of 1.0 or null depending on the flag byte[] norms = reader.Norms(f.Name()); byte norm1 = DefaultSimilarity.EncodeNorm(1.0f); if (reader.GetDisableFakeNorms()) { Assert.IsNull(norms); } else { Assert.AreEqual(norms.Length, reader.MaxDoc()); for (int j = 0; j < reader.MaxDoc(); j++) { Assert.AreEqual(norms[j], norm1); } } norms = new byte[reader.MaxDoc()]; reader.Norms(f.Name(), norms, 0); for (int j = 0; j < reader.MaxDoc(); j++) { Assert.AreEqual(norms[j], norm1); } } } } }
internal abstract void Start(Fieldable field);
public override void ProcessFields(Fieldable[] fields, int count) { fieldState.Reset(docState.doc.GetBoost()); int maxFieldLength = docState.maxFieldLength; bool doInvert = consumer.Start(fields, count); for (int i = 0; i < count; i++) { Fieldable field = fields[i]; // TODO FI: this should be "genericized" to querying // consumer if it wants to see this particular field // tokenized. if (field.IsIndexed() && doInvert) { bool anyToken; if (fieldState.length > 0) { fieldState.position += docState.analyzer.GetPositionIncrementGap(fieldInfo.name); } if (!field.IsTokenized()) { // un-tokenized field System.String stringValue = field.StringValue(); int valueLength = stringValue.Length; perThread.singleTokenTokenStream.Reinit(stringValue, 0, valueLength); fieldState.attributeSource = perThread.singleTokenTokenStream; consumer.Start(field); bool success = false; try { consumer.Add(); success = true; } finally { if (!success) { docState.docWriter.SetAborting(); } } fieldState.offset += valueLength; fieldState.length++; fieldState.position++; anyToken = valueLength > 0; } else { // tokenized field TokenStream stream; TokenStream streamValue = field.TokenStreamValue(); if (streamValue != null) { stream = streamValue; } else { // the field does not have a TokenStream, // so we have to obtain one from the analyzer System.IO.TextReader reader; // find or make Reader System.IO.TextReader readerValue = field.ReaderValue(); if (readerValue != null) { reader = readerValue; } else { System.String stringValue = field.StringValue(); if (stringValue == null) { throw new System.ArgumentException("field must have either TokenStream, String or Reader value"); } perThread.stringReader.Init(stringValue); reader = perThread.stringReader; } // Tokenize field and add to postingTable stream = docState.analyzer.ReusableTokenStream(fieldInfo.name, reader); } // reset the TokenStream to the first token stream.Reset(); int startLength = fieldState.length; // deprecated bool allowMinus1Position = docState.allowMinus1Position; try { int offsetEnd = fieldState.offset - 1; bool hasMoreTokens = stream.IncrementToken(); fieldState.attributeSource = stream; OffsetAttribute offsetAttribute = (OffsetAttribute)fieldState.attributeSource.AddAttribute(typeof(OffsetAttribute)); PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute)fieldState.attributeSource.AddAttribute(typeof(PositionIncrementAttribute)); consumer.Start(field); for (; ;) { // If we hit an exception in stream.next below // (which is fairly common, eg if analyzer // chokes on a given document), then it's // non-aborting and (above) this one document // will be marked as deleted, but still // consume a docID if (!hasMoreTokens) { break; } int posIncr = posIncrAttribute.GetPositionIncrement(); fieldState.position += posIncr; if (allowMinus1Position || fieldState.position > 0) { fieldState.position--; } if (posIncr == 0) { fieldState.numOverlap++; } bool success = false; try { // If we hit an exception in here, we abort // all buffered documents since the last // flush, on the likelihood that the // internal state of the consumer is now // corrupt and should not be flushed to a // new segment: consumer.Add(); success = true; } finally { if (!success) { docState.docWriter.SetAborting(); } } fieldState.position++; offsetEnd = fieldState.offset + offsetAttribute.EndOffset(); if (++fieldState.length >= maxFieldLength) { if (docState.infoStream != null) { docState.infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens"); } break; } hasMoreTokens = stream.IncrementToken(); } // trigger streams to perform end-of-stream operations stream.End(); fieldState.offset += offsetAttribute.EndOffset(); anyToken = fieldState.length > startLength; } finally { stream.Close(); } } if (anyToken) { fieldState.offset += docState.analyzer.GetOffsetGap(field); } fieldState.boost *= field.GetBoost(); } // LUCENE-2387: don't hang onto the field, so GC can // reclaim fields[i] = null; } consumer.Finish(); endConsumer.Finish(); }
/// <summary>Processes all occurrences of a single field </summary> public abstract void ProcessFields(Fieldable[] fields, int count);
// Called once per field, and is given all Fieldable // occurrences for this field in the document. Return // true if you wish to see inverted tokens for these // fields: internal abstract bool Start(Fieldable[] fields, int count);
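A minimal consumer honoring this contract scans the occurrences and opts in only when at least one of them is indexed, exactly as the one-line implementation earlier in this section does. A standalone sketch, assuming nothing beyond the Fieldable API; the class name is illustrative, and real consumers also set up per-field state here.

using Lucene.Net.Documents;

// A minimal sketch of the Start contract: request inverted tokens only
// when at least one occurrence of the field is indexed.
class IndexedOnlyConsumerSketch
{
    internal bool Start(Fieldable[] fields, int count)
    {
        for (int i = 0; i < count; i++)
        {
            if (fields[i].IsIndexed())
            {
                return true; // at least one occurrence wants inversion
            }
        }
        return false; // the whole field can skip tokenization
    }
}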
/// <summary>Returns the offsetGap from the analyzer assigned to field</summary> public override int GetOffsetGap(Lucene.Net.Documents.Fieldable field) { var analyzer = GetAnalyzer(field.Name()); return(analyzer.GetOffsetGap(field)); }
public override DocumentsWriter.DocWriter ProcessDocument() { consumer.StartDocument(); fieldsWriter.StartDocument(); Document doc = docState.doc; System.Diagnostics.Debug.Assert(docFieldProcessor.docWriter.writer.TestPoint("DocumentsWriter.ThreadState.init start")); fieldCount = 0; int thisFieldGen = fieldGen++; System.Collections.IList docFields = doc.GetFields(); int numDocFields = docFields.Count; // Absorb any new fields first seen in this document. // Also absorb any changes to fields we had already // seen before (eg suddenly turning on norms or // vectors, etc.): for (int i = 0; i < numDocFields; i++) { Fieldable field = (Fieldable)docFields[i]; System.String fieldName = field.Name(); // Make sure we have a PerField allocated int hashPos = fieldName.GetHashCode() & hashMask; DocFieldProcessorPerField fp = fieldHash[hashPos]; while (fp != null && !fp.fieldInfo.name.Equals(fieldName)) { fp = fp.next; } if (fp == null) { // TODO FI: we need to genericize the "flags" that a // field holds, and, how these flags are merged; it // needs to be more "pluggable" such that if I want // to have a new "thing" my Fields can do, I can // easily add it FieldInfo fi = fieldInfos.Add(fieldName, field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms(), false, field.GetOmitTf()); fp = new DocFieldProcessorPerField(this, fi); fp.next = fieldHash[hashPos]; fieldHash[hashPos] = fp; totalFieldCount++; if (totalFieldCount >= fieldHash.Length / 2) { Rehash(); } } else { fp.fieldInfo.Update(field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms(), false, field.GetOmitTf()); } if (thisFieldGen != fp.lastGen) { // First time we're seeing this field for this doc fp.fieldCount = 0; if (fieldCount == fields.Length) { int newSize = fields.Length * 2; DocFieldProcessorPerField[] newArray = new DocFieldProcessorPerField[newSize]; Array.Copy(fields, 0, newArray, 0, fieldCount); fields = newArray; } fields[fieldCount++] = fp; fp.lastGen = thisFieldGen; } if (fp.fieldCount == fp.fields.Length) { Fieldable[] newArray = new Fieldable[fp.fields.Length * 2]; Array.Copy(fp.fields, 0, newArray, 0, fp.fieldCount); fp.fields = newArray; } fp.fields[fp.fieldCount++] = field; if (field.IsStored()) { fieldsWriter.AddField(field, fp.fieldInfo); } } // If we are writing vectors then we must visit // fields in sorted order so they are written in // sorted order. TODO: we actually only need to // sort the subset of fields that have vectors // enabled; we could save [small amount of] CPU // here. QuickSort(fields, 0, fieldCount - 1); for (int i = 0; i < fieldCount; i++) { fields[i].consumer.ProcessFields(fields[i].fields, fields[i].fieldCount); } if (docState.maxTermPrefix != null && docState.infoStream != null) { docState.infoStream.WriteLine("WARNING: document contains at least one immense term (longer than the max length " + DocumentsWriter.MAX_TERM_LENGTH + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'"); docState.maxTermPrefix = null; } DocumentsWriter.DocWriter one = fieldsWriter.FinishDocument(); DocumentsWriter.DocWriter two = consumer.FinishDocument(); if (one == null) { return(two); } else if (two == null) { return(one); } else { PerDoc both = GetPerDoc(); both.docID = docState.docID; System.Diagnostics.Debug.Assert(one.docID == docState.docID); System.Diagnostics.Debug.Assert(two.docID == docState.docID); both.one = one; both.two = two; return(both); } }
public override void ProcessFields(Fieldable[] fields, int count) { fieldState.Reset(docState.doc.GetBoost()); int maxFieldLength = docState.maxFieldLength; bool doInvert = consumer.Start(fields, count); for (int i = 0; i < count; i++) { Fieldable field = fields[i]; // TODO FI: this should be "genericized" to querying // consumer if it wants to see this particular field // tokenized. if (field.IsIndexed() && doInvert) { bool anyToken; if (fieldState.length > 0) fieldState.position += docState.analyzer.GetPositionIncrementGap(fieldInfo.name); if (!field.IsTokenized()) { // un-tokenized field System.String stringValue = field.StringValue(); int valueLength = stringValue.Length; perThread.singleTokenTokenStream.Reinit(stringValue, 0, valueLength); fieldState.attributeSource = perThread.singleTokenTokenStream; consumer.Start(field); bool success = false; try { consumer.Add(); success = true; } finally { if (!success) docState.docWriter.SetAborting(); } fieldState.offset += valueLength; fieldState.length++; fieldState.position++; anyToken = valueLength > 0; } else { // tokenized field TokenStream stream; TokenStream streamValue = field.TokenStreamValue(); if (streamValue != null) stream = streamValue; else { // the field does not have a TokenStream, // so we have to obtain one from the analyzer System.IO.TextReader reader; // find or make Reader System.IO.TextReader readerValue = field.ReaderValue(); if (readerValue != null) reader = readerValue; else { System.String stringValue = field.StringValue(); if (stringValue == null) throw new System.ArgumentException("field must have either TokenStream, String or Reader value"); perThread.stringReader.Init(stringValue); reader = perThread.stringReader; } // Tokenize field and add to postingTable stream = docState.analyzer.ReusableTokenStream(fieldInfo.name, reader); } // reset the TokenStream to the first token stream.Reset(); int startLength = fieldState.length; // deprecated bool allowMinus1Position = docState.allowMinus1Position; try { int offsetEnd = fieldState.offset - 1; bool hasMoreTokens = stream.IncrementToken(); fieldState.attributeSource = stream; OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.AddAttribute(typeof(OffsetAttribute)); PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.AddAttribute(typeof(PositionIncrementAttribute)); consumer.Start(field); for (; ; ) { // If we hit an exception in stream.next below // (which is fairly common, eg if analyzer // chokes on a given document), then it's // non-aborting and (above) this one document // will be marked as deleted, but still // consume a docID if (!hasMoreTokens) break; int posIncr = posIncrAttribute.GetPositionIncrement(); fieldState.position += posIncr; if (allowMinus1Position || fieldState.position > 0) { fieldState.position--; } if (posIncr == 0) fieldState.numOverlap++; bool success = false; try { // If we hit an exception in here, we abort // all buffered documents since the last // flush, on the likelihood that the // internal state of the consumer is now // corrupt and should not be flushed to a // new segment: consumer.Add(); success = true; } finally { if (!success) docState.docWriter.SetAborting(); } fieldState.position++; offsetEnd = fieldState.offset + offsetAttribute.EndOffset(); if (++fieldState.length >= maxFieldLength) { if (docState.infoStream != null) docState.infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens"); break; } hasMoreTokens = stream.IncrementToken(); } // trigger streams to perform end-of-stream operations stream.End(); fieldState.offset += offsetAttribute.EndOffset(); anyToken = fieldState.length > startLength; } finally { stream.Close(); } } if (anyToken) fieldState.offset += docState.analyzer.GetOffsetGap(field); fieldState.boost *= field.GetBoost(); } } consumer.Finish(); endConsumer.Finish(); }
// Tokenizes the fields of a document into Postings. private void InvertDocument(Document doc) { System.Collections.IEnumerator fieldIterator = doc.GetFields().GetEnumerator(); while (fieldIterator.MoveNext()) { Fieldable field = (Fieldable)fieldIterator.Current; System.String fieldName = field.Name(); int fieldNumber = fieldInfos.FieldNumber(fieldName); int length = fieldLengths[fieldNumber]; // length of field int position = fieldPositions[fieldNumber]; // position in field if (length > 0) { position += analyzer.GetPositionIncrementGap(fieldName); } int offset = fieldOffsets[fieldNumber]; // offset field if (field.IsIndexed()) { if (!field.IsTokenized()) { // un-tokenized field System.String stringValue = field.StringValue(); if (field.IsStoreOffsetWithTermVector()) { AddPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.Length)); } else { AddPosition(fieldName, stringValue, position++, null); } offset += stringValue.Length; length++; } else { System.IO.TextReader reader; // find or make Reader if (field.ReaderValue() != null) { reader = field.ReaderValue(); } else if (field.StringValue() != null) { reader = new System.IO.StringReader(field.StringValue()); } else { throw new System.ArgumentException("field must have either String or Reader value"); } // Tokenize field and add to postingTable TokenStream stream = analyzer.TokenStream(fieldName, reader); try { Token lastToken = null; for (Token t = stream.Next(); t != null; t = stream.Next()) { position += (t.GetPositionIncrement() - 1); if (field.IsStoreOffsetWithTermVector()) { AddPosition(fieldName, t.TermText(), position++, new TermVectorOffsetInfo(offset + t.StartOffset(), offset + t.EndOffset())); } else { AddPosition(fieldName, t.TermText(), position++, null); } lastToken = t; if (++length >= maxFieldLength) { if (infoStream != null) { infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens"); } break; } } if (lastToken != null) { offset += lastToken.EndOffset() + 1; } } finally { stream.Close(); } } fieldLengths[fieldNumber] = length; // save field length fieldPositions[fieldNumber] = position; // save field position fieldBoosts[fieldNumber] *= field.GetBoost(); fieldOffsets[fieldNumber] = offset; } } }
public override DocumentsWriter.DocWriter ProcessDocument() { consumer.StartDocument(); fieldsWriter.StartDocument(); Document doc = docState.doc; System.Diagnostics.Debug.Assert(docFieldProcessor.docWriter.writer.TestPoint("DocumentsWriter.ThreadState.init start")); fieldCount = 0; int thisFieldGen = fieldGen++; System.Collections.IList docFields = doc.GetFields(); int numDocFields = docFields.Count; // Absorb any new fields first seen in this document. // Also absorb any changes to fields we had already // seen before (eg suddenly turning on norms or // vectors, etc.): for (int i = 0; i < numDocFields; i++) { Fieldable field = (Fieldable) docFields[i]; System.String fieldName = field.Name(); // Make sure we have a PerField allocated int hashPos = fieldName.GetHashCode() & hashMask; DocFieldProcessorPerField fp = fieldHash[hashPos]; while (fp != null && !fp.fieldInfo.name.Equals(fieldName)) fp = fp.next; if (fp == null) { // TODO FI: we need to genericize the "flags" that a // field holds, and, how these flags are merged; it // needs to be more "pluggable" such that if I want // to have a new "thing" my Fields can do, I can // easily add it FieldInfo fi = fieldInfos.Add(fieldName, field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms(), false, field.GetOmitTf()); fp = new DocFieldProcessorPerField(this, fi); fp.next = fieldHash[hashPos]; fieldHash[hashPos] = fp; totalFieldCount++; if (totalFieldCount >= fieldHash.Length / 2) Rehash(); } else fp.fieldInfo.Update(field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms(), false, field.GetOmitTf()); if (thisFieldGen != fp.lastGen) { // First time we're seeing this field for this doc fp.fieldCount = 0; if (fieldCount == fields.Length) { int newSize = fields.Length * 2; DocFieldProcessorPerField[] newArray = new DocFieldProcessorPerField[newSize]; Array.Copy(fields, 0, newArray, 0, fieldCount); fields = newArray; } fields[fieldCount++] = fp; fp.lastGen = thisFieldGen; } if (fp.fieldCount == fp.fields.Length) { Fieldable[] newArray = new Fieldable[fp.fields.Length * 2]; Array.Copy(fp.fields, 0, newArray, 0, fp.fieldCount); fp.fields = newArray; } fp.fields[fp.fieldCount++] = field; if (field.IsStored()) { fieldsWriter.AddField(field, fp.fieldInfo); } } // If we are writing vectors then we must visit // fields in sorted order so they are written in // sorted order. TODO: we actually only need to // sort the subset of fields that have vectors // enabled; we could save [small amount of] CPU // here. QuickSort(fields, 0, fieldCount - 1); for (int i = 0; i < fieldCount; i++) fields[i].consumer.ProcessFields(fields[i].fields, fields[i].fieldCount); if (docState.maxTermPrefix != null && docState.infoStream != null) { docState.infoStream.WriteLine("WARNING: document contains at least one immense term (longer than the max length " + DocumentsWriter.MAX_TERM_LENGTH + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'"); docState.maxTermPrefix = null; } DocumentsWriter.DocWriter one = fieldsWriter.FinishDocument(); DocumentsWriter.DocWriter two = consumer.FinishDocument(); if (one == null) { return two; } else if (two == null) { return one; } else { PerDoc both = GetPerDoc(); both.docID = docState.docID; System.Diagnostics.Debug.Assert(one.docID == docState.docID); System.Diagnostics.Debug.Assert(two.docID == docState.docID); both.one = one; both.two = two; return both; } }
internal void AddDocument(Document doc) { indexStream.WriteLong(fieldsStream.GetFilePointer()); int storedCount = 0; System.Collections.IEnumerator fieldIterator = doc.GetFields().GetEnumerator(); while (fieldIterator.MoveNext()) { Fieldable field = (Fieldable)fieldIterator.Current; if (field.IsStored()) { storedCount++; } } fieldsStream.WriteVInt(storedCount); fieldIterator = doc.GetFields().GetEnumerator(); while (fieldIterator.MoveNext()) { Fieldable field = (Fieldable)fieldIterator.Current; // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode // and field.BinaryValue() already returns the compressed value for a field // with IsCompressed()==true, so we disable compression in that case bool disableCompression = (field is FieldsReader.FieldForMerge); if (field.IsStored()) { fieldsStream.WriteVInt(fieldInfos.FieldNumber(field.Name())); byte bits = 0; if (field.IsTokenized()) { bits |= FieldsWriter.FIELD_IS_TOKENIZED; } if (field.IsBinary()) { bits |= FieldsWriter.FIELD_IS_BINARY; } if (field.IsCompressed()) { bits |= FieldsWriter.FIELD_IS_COMPRESSED; } fieldsStream.WriteByte(bits); if (field.IsCompressed()) { // compression is enabled for the current field byte[] data = null; if (disableCompression) { // optimized case for merging, the data // is already compressed data = field.BinaryValue(); } else { // check if it is a binary field if (field.IsBinary()) { data = Compress(field.BinaryValue()); } else { data = Compress(System.Text.Encoding.GetEncoding("UTF-8").GetBytes(field.StringValue())); } } int len = data.Length; fieldsStream.WriteVInt(len); fieldsStream.WriteBytes(data, len); } else { // compression is disabled for the current field if (field.IsBinary()) { byte[] data = field.BinaryValue(); int len = data.Length; fieldsStream.WriteVInt(len); fieldsStream.WriteBytes(data, len); } else { fieldsStream.WriteString(field.StringValue()); } } } } }
private static void Add(System.Collections.IDictionary map, Fieldable field) { map[field.Name()] = field; }
public override void ProcessFields(Fieldable[] fields, int count) { one.ProcessFields(fields, count); two.ProcessFields(fields, count); }
internal override void Start(Fieldable f) { if (doVectorOffsets) { offsetAttribute = (OffsetAttribute) fieldState.attributeSource.AddAttribute(typeof(OffsetAttribute)); } else { offsetAttribute = null; } }
internal override bool Start(Fieldable[] fields, int count) { doCall = consumer.Start(fields, count); if (nextPerField != null) doNextCall = nextPerField.Start(fields, count); return doCall || doNextCall; }
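This Start combines two verdicts: the consumer's own and that of the rest of the chain, so the field is inverted if any link wants tokens. A standalone sketch of that chain pattern follows; the class name and the Func-based verdict are illustrative, not Lucene's internals.

using System;
using Lucene.Net.Documents;

// Sketch of the consumer-chain pattern above: each link combines its own
// verdict with the next link's, and the field is inverted if any link
// wants to see tokens.
class ChainLinkSketch
{
    private readonly Func<Fieldable[], int, bool> wantsTokens;
    private readonly ChainLinkSketch nextPerField; // null at the end of the chain
    private bool doCall;
    private bool doNextCall;

    internal ChainLinkSketch(Func<Fieldable[], int, bool> wantsTokens, ChainLinkSketch next)
    {
        this.wantsTokens = wantsTokens;
        this.nextPerField = next;
    }

    internal bool Start(Fieldable[] fields, int count)
    {
        doCall = wantsTokens(fields, count);
        if (nextPerField != null)
        {
            doNextCall = nextPerField.Start(fields, count);
        }
        return doCall || doNextCall;
    }
}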
internal void WriteField(FieldInfo fi, Fieldable field) { // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode // and field.BinaryValue() already returns the compressed value for a field // with IsCompressed()==true, so we disable compression in that case bool disableCompression = (field is FieldsReader.FieldForMerge); fieldsStream.WriteVInt(fi.number); byte bits = 0; if (field.IsTokenized()) bits |= FieldsWriter.FIELD_IS_TOKENIZED; if (field.IsBinary()) bits |= FieldsWriter.FIELD_IS_BINARY; if (field.IsCompressed()) bits |= FieldsWriter.FIELD_IS_COMPRESSED; fieldsStream.WriteByte(bits); if (field.IsCompressed()) { // compression is enabled for the current field byte[] data; int len; int offset; if (disableCompression) { // optimized case for merging, the data // is already compressed data = field.GetBinaryValue(); System.Diagnostics.Debug.Assert(data != null); len = field.GetBinaryLength(); offset = field.GetBinaryOffset(); } else { // check if it is a binary field if (field.IsBinary()) { data = Compress(field.GetBinaryValue(), field.GetBinaryOffset(), field.GetBinaryLength()); } else { byte[] x = System.Text.Encoding.UTF8.GetBytes(field.StringValue()); data = Compress(x, 0, x.Length); } len = data.Length; offset = 0; } fieldsStream.WriteVInt(len); fieldsStream.WriteBytes(data, offset, len); } else { // compression is disabled for the current field if (field.IsBinary()) { int length = field.GetBinaryLength(); fieldsStream.WriteVInt(length); fieldsStream.WriteBytes(field.BinaryValue(), field.GetBinaryOffset(), length); } else { fieldsStream.WriteString(field.StringValue()); } } }
internal override void processFields(Fieldable[] fields, int count) { fieldState.reset(docState.doc.GetBoost()); int maxFieldLength = docState.maxFieldLength; bool doInvert = consumer.start(fields, count); for (int i = 0; i < count; i++) { Fieldable field = fields[i]; // TODO FI: this should be "genericized" to querying // consumer if it wants to see this particular field // tokenized. if (field.IsIndexed() && doInvert) { if (fieldState.length > 0) { fieldState.position += docState.analyzer.GetPositionIncrementGap(fieldInfo.name); } if (!field.IsTokenized()) { // un-tokenized field string stringValue = field.StringValue(); int valueLength = stringValue.Length; Token token = perThread.localToken.Reinit(stringValue, fieldState.offset, fieldState.offset + valueLength); bool success = false; try { consumer.add(token); success = true; } finally { if (!success) { docState.docWriter.SetAborting(); } } fieldState.offset += valueLength; fieldState.length++; fieldState.position++; } else { // tokenized field TokenStream stream; TokenStream streamValue = field.TokenStreamValue(); if (streamValue != null) { stream = streamValue; } else { // the field does not have a TokenStream, // so we have to obtain one from the analyzer System.IO.TextReader reader; // find or make Reader System.IO.TextReader readerValue = field.ReaderValue(); if (readerValue != null) { reader = readerValue; } else { string stringValue = field.StringValue(); if (stringValue == null) { throw new System.ArgumentException("field must have either TokenStream, string or Reader value"); } perThread.stringReader.Init(stringValue); reader = perThread.stringReader; } // Tokenize field and add to postingTable stream = docState.analyzer.ReusableTokenStream(fieldInfo.name, reader); } // reset the TokenStream to the first token stream.Reset(); try { int offsetEnd = fieldState.offset - 1; Token localToken = perThread.localToken; for (; ;) { // If we hit an exception in stream.next below // (which is fairly common, eg if analyzer // chokes on a given document), then it's // non-aborting and (above) this one document // will be marked as deleted, but still // consume a docID Token token = stream.Next(localToken); if (token == null) { break; } fieldState.position += (token.GetPositionIncrement() - 1); bool success = false; try { // If we hit an exception in here, we abort // all buffered documents since the last // flush, on the likelihood that the // internal state of the consumer is now // corrupt and should not be flushed to a // new segment: consumer.add(token); success = true; } finally { if (!success) { docState.docWriter.SetAborting(); } } fieldState.position++; offsetEnd = fieldState.offset + token.EndOffset(); if (++fieldState.length >= maxFieldLength) { if (docState.infoStream != null) { docState.infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens"); } break; } } fieldState.offset = offsetEnd + 1; } finally { stream.Close(); } } fieldState.boost *= field.GetBoost(); } } consumer.finish(); endConsumer.finish(); }
/// <summary>Initializes shared state for this new document </summary> internal void Init(Document doc, int docID) { System.Diagnostics.Debug.Assert(!isIdle); System.Diagnostics.Debug.Assert(Enclosing_Instance.writer.TestPoint("DocumentsWriter.ThreadState.init start")); this.docID = docID; docBoost = doc.GetBoost(); numStoredFields = 0; numFieldData = 0; numVectorFields = 0; maxTermPrefix = null; System.Diagnostics.Debug.Assert(0 == fdtLocal.Length()); System.Diagnostics.Debug.Assert(0 == fdtLocal.GetFilePointer()); System.Diagnostics.Debug.Assert(0 == tvfLocal.Length()); System.Diagnostics.Debug.Assert(0 == tvfLocal.GetFilePointer()); int thisFieldGen = fieldGen++; System.Collections.IList docFields = doc.GetFields(); int numDocFields = docFields.Count; bool docHasVectors = false; // Absorb any new fields first seen in this document. // Also absorb any changes to fields we had already // seen before (eg suddenly turning on norms or // vectors, etc.): for (int i = 0; i < numDocFields; i++) { Fieldable field = (Fieldable) docFields[i]; FieldInfo fi = Enclosing_Instance.fieldInfos.Add(field.Name(), field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms(), false); if (fi.isIndexed && !fi.omitNorms) { // Maybe grow our buffered norms if (Enclosing_Instance.norms.Length <= fi.number) { int newSize = (int) ((1 + fi.number) * 1.25); BufferedNorms[] newNorms = new BufferedNorms[newSize]; Array.Copy(Enclosing_Instance.norms, 0, newNorms, 0, Enclosing_Instance.norms.Length); Enclosing_Instance.norms = newNorms; } if (Enclosing_Instance.norms[fi.number] == null) Enclosing_Instance.norms[fi.number] = new BufferedNorms(); Enclosing_Instance.hasNorms = true; } // Make sure we have a FieldData allocated int hashPos = fi.name.GetHashCode() & fieldDataHashMask; FieldData fp = fieldDataHash[hashPos]; while (fp != null && !fp.fieldInfo.name.Equals(fi.name)) fp = fp.next; if (fp == null) { fp = new FieldData(this, fi); fp.next = fieldDataHash[hashPos]; fieldDataHash[hashPos] = fp; if (numAllFieldData == allFieldDataArray.Length) { int newSize = (int) (allFieldDataArray.Length * 1.5); int newHashSize = fieldDataHash.Length * 2; FieldData[] newArray = new FieldData[newSize]; FieldData[] newHashArray = new FieldData[newHashSize]; Array.Copy(allFieldDataArray, 0, newArray, 0, numAllFieldData); // Rehash fieldDataHashMask = newSize - 1; for (int j = 0; j < fieldDataHash.Length; j++) { FieldData fp0 = fieldDataHash[j]; while (fp0 != null) { hashPos = fp0.fieldInfo.name.GetHashCode() & fieldDataHashMask; FieldData nextFP0 = fp0.next; fp0.next = newHashArray[hashPos]; newHashArray[hashPos] = fp0; fp0 = nextFP0; } } allFieldDataArray = newArray; fieldDataHash = newHashArray; } allFieldDataArray[numAllFieldData++] = fp; } else { System.Diagnostics.Debug.Assert(fp.fieldInfo == fi); } if (thisFieldGen != fp.lastGen) { // First time we're seeing this field for this doc fp.lastGen = thisFieldGen; fp.fieldCount = 0; fp.doVectors = fp.doVectorPositions = fp.doVectorOffsets = false; fp.doNorms = fi.isIndexed && !fi.omitNorms; if (numFieldData == fieldDataArray.Length) { int newSize = fieldDataArray.Length * 2; FieldData[] newArray = new FieldData[newSize]; Array.Copy(fieldDataArray, 0, newArray, 0, numFieldData); fieldDataArray = newArray; } fieldDataArray[numFieldData++] = fp; } if (field.IsTermVectorStored()) { if (!fp.doVectors && numVectorFields++ == vectorFieldPointers.Length) { int newSize = (int) (numVectorFields * 1.5); vectorFieldPointers = new long[newSize]; vectorFieldNumbers = new int[newSize]; } fp.doVectors = true; docHasVectors = true; fp.doVectorPositions |= field.IsStorePositionWithTermVector(); fp.doVectorOffsets |= field.IsStoreOffsetWithTermVector(); } if (fp.fieldCount == fp.docFields.Length) { Fieldable[] newArray = new Fieldable[fp.docFields.Length * 2]; Array.Copy(fp.docFields, 0, newArray, 0, fp.docFields.Length); fp.docFields = newArray; } // Lazily allocate arrays for postings: if (field.IsIndexed() && fp.postingsHash == null) fp.InitPostingArrays(); fp.docFields[fp.fieldCount++] = field; } // Maybe init the local & global fieldsWriter if (localFieldsWriter == null) { if (Enclosing_Instance.fieldsWriter == null) { System.Diagnostics.Debug.Assert(Enclosing_Instance.docStoreSegment == null); System.Diagnostics.Debug.Assert(Enclosing_Instance.segment != null); Enclosing_Instance.docStoreSegment = Enclosing_Instance.segment; // If we hit an exception while init'ing the // fieldsWriter, we must abort this segment // because those files will be in an unknown // state: try { Enclosing_Instance.fieldsWriter = new FieldsWriter(Enclosing_Instance.directory, Enclosing_Instance.docStoreSegment, Enclosing_Instance.fieldInfos); } catch (System.Exception t) { throw new AbortException(t, Enclosing_Instance); } Enclosing_Instance.files = null; } localFieldsWriter = new FieldsWriter(null, fdtLocal, Enclosing_Instance.fieldInfos); } // First time we see a doc that has field(s) with // stored vectors, we init our tvx writer if (docHasVectors) { if (Enclosing_Instance.tvx == null) { System.Diagnostics.Debug.Assert(Enclosing_Instance.docStoreSegment != null); // If we hit an exception while init'ing the term // vector output files, we must abort this segment // because those files will be in an unknown // state: try { Enclosing_Instance.tvx = Enclosing_Instance.directory.CreateOutput(Enclosing_Instance.docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION); Enclosing_Instance.tvx.WriteInt(TermVectorsReader.FORMAT_VERSION); Enclosing_Instance.tvd = Enclosing_Instance.directory.CreateOutput(Enclosing_Instance.docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION); Enclosing_Instance.tvd.WriteInt(TermVectorsReader.FORMAT_VERSION); Enclosing_Instance.tvf = Enclosing_Instance.directory.CreateOutput(Enclosing_Instance.docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION); Enclosing_Instance.tvf.WriteInt(TermVectorsReader.FORMAT_VERSION); // We must "catch up" for all docs before us // that had no vectors: for (int i = 0; i < Enclosing_Instance.numDocsInStore; i++) { Enclosing_Instance.tvx.WriteLong(Enclosing_Instance.tvd.GetFilePointer()); Enclosing_Instance.tvd.WriteVInt(0); } } catch (System.Exception t) { throw new AbortException(t, Enclosing_Instance); } Enclosing_Instance.files = null; } numVectorFields = 0; } }
internal override void processFields(Fieldable[] fields, int count) { one.processFields(fields, count); two.processFields(fields, count); }
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using Fieldable = Lucene.Net.Documents.Fieldable; namespace Lucene.Net.Index { internal abstract class DocFieldConsumerPerField { /// <summary> /// Processes all occurrences of a single field /// </summary> /// <param name="fields"></param> /// <param name="count"></param> internal abstract void processFields(Fieldable[] fields, int count); internal abstract void abort(); } }
internal void WriteField(FieldInfo fi, Fieldable field) { // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode // and field.BinaryValue() already returns the compressed value for a field // with IsCompressed()==true, so we disable compression in that case bool disableCompression = (field is FieldsReader.FieldForMerge); fieldsStream.WriteVInt(fi.number); byte bits = 0; if (field.IsTokenized()) { bits |= FieldsWriter.FIELD_IS_TOKENIZED; } if (field.IsBinary()) { bits |= FieldsWriter.FIELD_IS_BINARY; } if (field.IsCompressed()) { bits |= FieldsWriter.FIELD_IS_COMPRESSED; } fieldsStream.WriteByte(bits); if (field.IsCompressed()) { // compression is enabled for the current field byte[] data; int len; int offset; if (disableCompression) { // optimized case for merging, the data // is already compressed data = field.GetBinaryValue(); System.Diagnostics.Debug.Assert(data != null); len = field.GetBinaryLength(); offset = field.GetBinaryOffset(); } else { // check if it is a binary field if (field.IsBinary()) { data = Compress(field.GetBinaryValue(), field.GetBinaryOffset(), field.GetBinaryLength()); } else { byte[] x = System.Text.Encoding.UTF8.GetBytes(field.StringValue()); data = Compress(x, 0, x.Length); } len = data.Length; offset = 0; } fieldsStream.WriteVInt(len); fieldsStream.WriteBytes(data, offset, len); } else { // compression is disabled for the current field if (field.IsBinary()) { int length = field.GetBinaryLength(); fieldsStream.WriteVInt(length); fieldsStream.WriteBytes(field.BinaryValue(), field.GetBinaryOffset(), length); } else { fieldsStream.WriteString(field.StringValue()); } } }
/* Invert one occurrence of one field in the document */ public void InvertField(Fieldable field, Analyzer analyzer, int maxFieldLength) { if (length > 0) position += analyzer.GetPositionIncrementGap(fieldInfo.name); if (!field.IsTokenized()) { // un-tokenized field System.String stringValue = field.StringValue(); int valueLength = stringValue.Length; Token token = localToken; token.Clear(); char[] termBuffer = token.TermBuffer(); if (termBuffer.Length < valueLength) termBuffer = token.ResizeTermBuffer(valueLength); DocumentsWriter.GetCharsFromString(stringValue, 0, valueLength, termBuffer, 0); token.SetTermLength(valueLength); token.SetStartOffset(offset); token.SetEndOffset(offset + stringValue.Length); AddPosition(token); offset += stringValue.Length; length++; } else { // tokenized field TokenStream stream; TokenStream streamValue = field.TokenStreamValue(); if (streamValue != null) stream = streamValue; else { // the field does not have a TokenStream, // so we have to obtain one from the analyzer System.IO.TextReader reader; // find or make Reader System.IO.TextReader readerValue = field.ReaderValue(); if (readerValue != null) reader = readerValue; else { System.String stringValue = field.StringValue(); if (stringValue == null) throw new System.ArgumentException("field must have either TokenStream, String or Reader value"); Enclosing_Instance.stringReader.Init(stringValue); reader = Enclosing_Instance.stringReader; } // Tokenize field and add to postingTable stream = analyzer.ReusableTokenStream(fieldInfo.name, reader); } // reset the TokenStream to the first token stream.Reset(); try { offsetEnd = offset - 1; for (; ; ) { Token token = stream.Next(localToken); if (token == null) break; position += (token.GetPositionIncrement() - 1); AddPosition(token); if (++length >= maxFieldLength) { if (Enclosing_Instance.Enclosing_Instance.infoStream != null) Enclosing_Instance.Enclosing_Instance.infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens"); break; } } offset = offsetEnd + 1; } finally { stream.Close(); } } boost *= field.GetBoost(); }
// Called before a field instance is being processed internal abstract void Start(Fieldable field);
internal void WriteField(FieldInfo fi, Fieldable field) { // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode // and field.BinaryValue() already returns the compressed value for a field // with IsCompressed()==true, so we disable compression in that case bool disableCompression = (field is FieldsReader.FieldForMerge); fieldsStream.WriteVInt(fi.number); byte bits = 0; if (field.IsTokenized()) { bits |= FieldsWriter.FIELD_IS_TOKENIZED; } if (field.IsBinary()) { bits |= FieldsWriter.FIELD_IS_BINARY; } if (field.IsCompressed()) { bits |= FieldsWriter.FIELD_IS_COMPRESSED; } fieldsStream.WriteByte(bits); if (field.IsCompressed()) { // compression is enabled for the current field byte[] data = null; if (disableCompression) { // optimized case for merging, the data // is already compressed data = field.BinaryValue(); } else { // check if it is a binary field if (field.IsBinary()) { data = Compress(field.BinaryValue()); } else { data = Compress(System.Text.Encoding.GetEncoding("UTF-8").GetBytes(field.StringValue())); } } int len = data.Length; fieldsStream.WriteVInt(len); fieldsStream.WriteBytes(data, len); } else { // compression is disabled for the current field if (field.IsBinary()) { byte[] data = field.BinaryValue(); int len = data.Length; fieldsStream.WriteVInt(len); fieldsStream.WriteBytes(data, len); } else { fieldsStream.WriteString(field.StringValue()); } } }
static DocHelper() { textField1 = new Field(TEXT_FIELD_1_KEY, FIELD_1_TEXT, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO); textField2 = new Field(TEXT_FIELD_2_KEY, FIELD_2_TEXT, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); compressedTextField2 = new Field(COMPRESSED_TEXT_FIELD_2_KEY, FIELD_2_COMPRESSED_TEXT, Field.Store.COMPRESS, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); textField3 = new Field(TEXT_FIELD_3_KEY, FIELD_3_TEXT, Field.Store.YES, Field.Index.ANALYZED); { textField3.SetOmitNorms(true); } keyField = new Field(KEYWORD_FIELD_KEY, KEYWORD_TEXT, Field.Store.YES, Field.Index.NOT_ANALYZED); noNormsField = new Field(NO_NORMS_KEY, NO_NORMS_TEXT, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); noTFField = new Field(NO_TF_KEY, NO_TF_TEXT, Field.Store.YES, Field.Index.ANALYZED); { noTFField.SetOmitTermFreqAndPositions(true); } unIndField = new Field(UNINDEXED_FIELD_KEY, UNINDEXED_FIELD_TEXT, Field.Store.YES, Field.Index.NO); unStoredField1 = new Field(UNSTORED_FIELD_1_KEY, UNSTORED_1_FIELD_TEXT, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO); unStoredField2 = new Field(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES); lazyField = new Field(LAZY_FIELD_KEY, LAZY_FIELD_TEXT, Field.Store.YES, Field.Index.ANALYZED); textUtfField1 = new Field(TEXT_FIELD_UTF1_KEY, FIELD_UTF1_TEXT, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO); textUtfField2 = new Field(TEXT_FIELD_UTF2_KEY, FIELD_UTF2_TEXT, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); fields = new Field[] { textField1, textField2, textField3, compressedTextField2, keyField, noNormsField, noTFField, unIndField, unStoredField1, unStoredField2, textUtfField1, textUtfField2, lazyField, lazyFieldBinary, largeLazyField }; { //Initialize the large Lazy Field System.Text.StringBuilder buffer = new System.Text.StringBuilder(); for (int i = 0; i < 10000; i++) { buffer.Append("Lazily loading lengths of language in lieu of laughing "); } try { LAZY_FIELD_BINARY_BYTES = System.Text.Encoding.UTF8.GetBytes("These are some binary field bytes"); } catch (System.IO.IOException) { /* Encoding.GetBytes performs no I/O; dead catch kept from the Java port */ } lazyFieldBinary = new Field(LAZY_FIELD_BINARY_KEY, LAZY_FIELD_BINARY_BYTES, Field.Store.YES); fields[fields.Length - 2] = lazyFieldBinary; LARGE_LAZY_FIELD_TEXT = buffer.ToString(); largeLazyField = new Field(LARGE_LAZY_FIELD_KEY, LARGE_LAZY_FIELD_TEXT, Field.Store.YES, Field.Index.ANALYZED); fields[fields.Length - 1] = largeLazyField; for (int i = 0; i < fields.Length; i++) { Fieldable f = fields[i]; Add(all, f); if (f.IsIndexed()) { Add(indexed, f); } else { Add(unindexed, f); } if (f.IsTermVectorStored()) { Add(termvector, f); } if (f.IsIndexed() && !f.IsTermVectorStored()) { Add(notermvector, f); } if (f.IsStored()) { Add(stored, f); } else { Add(unstored, f); } if (f.GetOmitNorms()) { Add(noNorms, f); } if (f.GetOmitTf()) { Add(noTf, f); } if (f.IsLazy()) { Add(lazy, f); } } } { nameValues = new System.Collections.Hashtable(); nameValues[TEXT_FIELD_1_KEY] = FIELD_1_TEXT; nameValues[TEXT_FIELD_2_KEY] = FIELD_2_TEXT; nameValues[COMPRESSED_TEXT_FIELD_2_KEY] = FIELD_2_COMPRESSED_TEXT; nameValues[TEXT_FIELD_3_KEY] = FIELD_3_TEXT; nameValues[KEYWORD_FIELD_KEY] = KEYWORD_TEXT; nameValues[NO_NORMS_KEY] = NO_NORMS_TEXT; nameValues[NO_TF_KEY] = NO_TF_TEXT; nameValues[UNINDEXED_FIELD_KEY] = UNINDEXED_FIELD_TEXT; nameValues[UNSTORED_FIELD_1_KEY] = UNSTORED_1_FIELD_TEXT; nameValues[UNSTORED_FIELD_2_KEY] = UNSTORED_2_FIELD_TEXT; nameValues[LAZY_FIELD_KEY] = LAZY_FIELD_TEXT; nameValues[LAZY_FIELD_BINARY_KEY] = LAZY_FIELD_BINARY_BYTES; nameValues[LARGE_LAZY_FIELD_KEY] = LARGE_LAZY_FIELD_TEXT; nameValues[TEXT_FIELD_UTF1_KEY] = FIELD_UTF1_TEXT; nameValues[TEXT_FIELD_UTF2_KEY] = FIELD_UTF2_TEXT; } }
internal override void Start(Fieldable f) { if (fieldState.attributeSource.HasAttribute(typeof(PayloadAttribute))) { payloadAttribute = (PayloadAttribute) fieldState.attributeSource.GetAttribute(typeof(PayloadAttribute)); } else { payloadAttribute = null; } }
internal override void Start(Fieldable f) { termAtt = (TermAttribute) fieldState.attributeSource.AddAttribute(typeof(TermAttribute)); consumer.Start(f); if (nextPerField != null) { nextPerField.Start(f); } }
internal override void processFields(Fieldable[] fields, int count) { fieldState.reset(docState.doc.GetBoost()); int maxFieldLength = docState.maxFieldLength; bool doInvert = consumer.start(fields, count); for (int i = 0; i < count; i++) { Fieldable field = fields[i]; // TODO FI: this should be "genericized" to querying // consumer if it wants to see this particular field // tokenized. if (field.IsIndexed() && doInvert) { if (fieldState.length > 0) fieldState.position += docState.analyzer.GetPositionIncrementGap(fieldInfo.name); if (!field.IsTokenized()) { // un-tokenized field string stringValue = field.StringValue(); int valueLength = stringValue.Length; Token token = perThread.localToken.Reinit(stringValue, fieldState.offset, fieldState.offset + valueLength); bool success = false; try { consumer.add(token); success = true; } finally { if (!success) docState.docWriter.SetAborting(); } fieldState.offset += valueLength; fieldState.length++; fieldState.position++; } else { // tokenized field TokenStream stream; TokenStream streamValue = field.TokenStreamValue(); if (streamValue != null) stream = streamValue; else { // the field does not have a TokenStream, // so we have to obtain one from the analyzer System.IO.TextReader reader; // find or make Reader System.IO.TextReader readerValue = field.ReaderValue(); if (readerValue != null) reader = readerValue; else { string stringValue = field.StringValue(); if (stringValue == null) throw new System.ArgumentException("field must have either TokenStream, string or Reader value"); perThread.stringReader.Init(stringValue); reader = perThread.stringReader; } // Tokenize field and add to postingTable stream = docState.analyzer.ReusableTokenStream(fieldInfo.name, reader); } // reset the TokenStream to the first token stream.Reset(); try { int offsetEnd = fieldState.offset - 1; Token localToken = perThread.localToken; for (; ; ) { // If we hit an exception in stream.next below // (which is fairly common, eg if analyzer // chokes on a given document), then it's // non-aborting and (above) this one document // will be marked as deleted, but still // consume a docID Token token = stream.Next(localToken); if (token == null) break; fieldState.position += (token.GetPositionIncrement() - 1); bool success = false; try { // If we hit an exception in here, we abort // all buffered documents since the last // flush, on the likelihood that the // internal state of the consumer is now // corrupt and should not be flushed to a // new segment: consumer.add(token); success = true; } finally { if (!success) docState.docWriter.SetAborting(); } fieldState.position++; offsetEnd = fieldState.offset + token.EndOffset(); if (++fieldState.length >= maxFieldLength) { if (docState.infoStream != null) docState.infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens"); break; } } fieldState.offset = offsetEnd + 1; } finally { stream.Close(); } } fieldState.boost *= field.GetBoost(); } } consumer.finish(); endConsumer.finish(); }
private static void Add(System.Collections.IDictionary map, Fieldable field) { if (field == null) System.Console.WriteLine("FIELD IS NULL!!!"); if (map == null) System.Console.WriteLine("MAP IS NULL!!!"); if (field.Name() == null) System.Console.WriteLine("FIELD NAME IS NULL!!!"); map[field.Name()] = field; }