protected override MonotonicBlockPackedReader GetOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) { data.Seek(entry.Offset); return new MonotonicBlockPackedReader((IndexInput)data.Clone(), entry.PackedIntsVersion, entry.BlockSize, entry.Count, true); }
public override void AddField(int docID, IndexableField field, FieldInfo fieldInfo) { DocValuesType_e? dvType = field.FieldType().DocValueType; if (dvType != null) { fieldInfo.DocValuesType = dvType; if (dvType == DocValuesType_e.BINARY) { AddBinaryField(fieldInfo, docID, field.BinaryValue()); } else if (dvType == DocValuesType_e.SORTED) { AddSortedField(fieldInfo, docID, field.BinaryValue()); } else if (dvType == DocValuesType_e.SORTED_SET) { AddSortedSetField(fieldInfo, docID, field.BinaryValue()); } else if (dvType == DocValuesType_e.NUMERIC) { if (!(field.NumericValue is long?)) { throw new System.ArgumentException("illegal type " + field.NumericValue.GetType() + ": DocValues types must be Long"); } AddNumericField(fieldInfo, docID, (long)field.NumericValue); } else { Debug.Assert(false, "unrecognized DocValues.Type: " + dvType); } } }
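For context, a minimal sketch of the document-side fields that would route through each branch of the dispatch above; the field names are made up, and the doc-values field classes are assumed to be the standard Lucene.NET ones.

// Illustrative only: each field class carries the DocValuesType that selects
// one branch of AddField above; the field names are hypothetical.
using Lucene.Net.Documents;
using Lucene.Net.Util;

var doc = new Document();
doc.Add(new NumericDocValuesField("price", 42L));                  // NUMERIC: value must be a long
doc.Add(new BinaryDocValuesField("thumb", new BytesRef("bytes"))); // BINARY
doc.Add(new SortedDocValuesField("brand", new BytesRef("acme")));  // SORTED
doc.Add(new SortedSetDocValuesField("tag", new BytesRef("sale"))); // SORTED_SET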
public NormsWriterPerField(DocInverterPerField docInverterPerField, NormsWriterPerThread perThread, FieldInfo fieldInfo) { this.perThread = perThread; this.fieldInfo = fieldInfo; docState = perThread.docState; fieldState = docInverterPerField.fieldState; }
internal void SetField(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; storePayloads = fieldInfo.storePayloads; posWriter.SetField(fieldInfo); }
internal void WriteField(FieldInfo fi, Fieldable field)
{
    // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
    // and field.BinaryValue() already returns the compressed value for a field
    // with IsCompressed()==true, so we disable compression in that case
    bool disableCompression = (field is FieldsReader.FieldForMerge);
    fieldsStream.WriteVInt(fi.number);
    byte bits = 0;
    if (field.IsTokenized())
        bits |= FieldsWriter.FIELD_IS_TOKENIZED;
    if (field.IsBinary())
        bits |= FieldsWriter.FIELD_IS_BINARY;
    if (field.IsCompressed())
        bits |= FieldsWriter.FIELD_IS_COMPRESSED;
    fieldsStream.WriteByte(bits);
    if (field.IsCompressed())
    {
        // compression is enabled for the current field
        byte[] data = null;
        if (disableCompression)
        {
            // optimized case for merging: the data is already compressed
            data = field.BinaryValue();
        }
        else
        {
            // check if it is a binary field
            if (field.IsBinary())
            {
                data = Compress(field.BinaryValue());
            }
            else
            {
                data = Compress(System.Text.Encoding.GetEncoding("UTF-8").GetBytes(field.StringValue()));
            }
        }
        int len = data.Length;
        fieldsStream.WriteVInt(len);
        fieldsStream.WriteBytes(data, len);
    }
    else
    {
        // compression is disabled for the current field
        if (field.IsBinary())
        {
            byte[] data = field.BinaryValue();
            int len = data.Length;
            fieldsStream.WriteVInt(len);
            fieldsStream.WriteBytes(data, len);
        }
        else
        {
            fieldsStream.WriteString(field.StringValue());
        }
    }
}
protected override MonotonicBlockPackedReader GetAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) { data.Seek(bytes.AddressesOffset); return new MonotonicBlockPackedReader((IndexInput)data.Clone(), bytes.PackedIntsVersion, bytes.BlockSize, bytes.Count, true); }
public DocInverterPerField(DocInverter parent, FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; DocState = parent.DocState; FieldState = new FieldInvertState(fieldInfo.Name); this.Consumer = parent.Consumer.AddField(this, fieldInfo); this.EndConsumer = parent.EndConsumer.AddField(this, fieldInfo); }
internal bool HasPayloads; // if enabled, and we actually saw any for this field

public TermVectorsConsumerPerField(TermsHashPerField termsHashPerField, TermVectorsConsumer termsWriter, FieldInfo fieldInfo)
{
    this.TermsHashPerField = termsHashPerField;
    this.TermsWriter = termsWriter;
    this.FieldInfo = fieldInfo;
    DocState = termsHashPerField.DocState;
    FieldState = termsHashPerField.FieldState;
}
public SortedDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { this.FieldInfo = fieldInfo; this.IwBytesUsed = iwBytesUsed; Hash = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)), BytesRefHash.DEFAULT_CAPACITY, new BytesRefHash.DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed)); Pending = new AppendingDeltaPackedLongBuffer(PackedInts.COMPACT); BytesUsed = Pending.RamBytesUsed(); iwBytesUsed.AddAndGet(BytesUsed); }
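The BytesRefHash built above is what deduplicates incoming values so the writer only needs to record one ord per document; a small self-contained sketch of that behavior (the allocator and capacity arguments from the constructor are omitted here for brevity):

// Sketch: BytesRefHash assigns a stable id to each distinct value and signals
// a duplicate with a negative return value of -(id + 1).
using Lucene.Net.Util;

var hash = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()));
int id1 = hash.Add(new BytesRef("acme")); // new value: returns its id (>= 0)
int id2 = hash.Add(new BytesRef("acme")); // duplicate: returns -(id1 + 1)
int id3 = hash.Add(new BytesRef("zeta")); // another new value, next id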
public NumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed, bool trackDocsWithField) { Pending = new AppendingDeltaPackedLongBuffer(PackedInts.COMPACT); DocsWithField = trackDocsWithField ? new FixedBitSet(64) : null; BytesUsed = Pending.RamBytesUsed() + DocsWithFieldBytesUsed(); this.FieldInfo = fieldInfo; this.IwBytesUsed = iwBytesUsed; iwBytesUsed.AddAndGet(BytesUsed); }
public DocInverterPerField(DocInverterPerThread perThread, FieldInfo fieldInfo) { this.perThread = perThread; this.fieldInfo = fieldInfo; docState = perThread.docState; fieldState = perThread.fieldState; this.consumer = perThread.consumer.addField(this, fieldInfo); this.endConsumer = perThread.endConsumer.addField(this, fieldInfo); }
public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriterPerThread perThread, FieldInfo fieldInfo) { this.termsHashPerField = termsHashPerField; this.perThread = perThread; this.fieldInfo = fieldInfo; docState = termsHashPerField.docState; fieldState = termsHashPerField.fieldState; omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; }
public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo) { this.termsHashPerField = termsHashPerField; this.perThread = perThread; this.termsWriter = perThread.termsWriter; this.fieldInfo = fieldInfo; docState = termsHashPerField.docState; fieldState = termsHashPerField.fieldState; }
public BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { this.FieldInfo = fieldInfo; this.Bytes = new PagedBytes(BLOCK_BITS); this.BytesOut = Bytes.DataOutput; this.Lengths = new AppendingDeltaPackedLongBuffer(PackedInts.COMPACT); this.IwBytesUsed = iwBytesUsed; this.DocsWithField = new FixedBitSet(64); this.BytesUsed = DocsWithFieldBytesUsed(); iwBytesUsed.AddAndGet(BytesUsed); }
public void AddField(Fieldable field, FieldInfo fieldInfo) { if (doc == null) { doc = storedFieldsWriter.GetPerDoc(); doc.docID = docState.docID; localFieldsWriter.SetFieldsStream(doc.fdt); System.Diagnostics.Debug.Assert(doc.numStoredFields == 0, "doc.numStoredFields=" + doc.numStoredFields); System.Diagnostics.Debug.Assert(0 == doc.fdt.Length()); System.Diagnostics.Debug.Assert(0 == doc.fdt.GetFilePointer()); } localFieldsWriter.WriteField(fieldInfo, field); System.Diagnostics.Debug.Assert(docState.TestPoint("StoredFieldsWriterPerThread.processFields.writeField")); doc.numStoredFields++; }
public TermsHashPerField(DocInverterPerField docInverterPerField, TermsHashPerThread perThread, TermsHashPerThread nextPerThread, FieldInfo fieldInfo) { this.perThread = perThread; intPool = perThread.intPool; charPool = perThread.charPool; bytePool = perThread.bytePool; docState = perThread.docState; fieldState = docInverterPerField.fieldState; this.consumer = perThread.consumer.addField(this, fieldInfo); streamCount = consumer.getStreamCount(); numPostingInt = 2 * streamCount; this.fieldInfo = fieldInfo; if (nextPerThread != null) nextPerField = (TermsHashPerField)nextPerThread.addField(docInverterPerField, fieldInfo); else nextPerField = null; }
/// <summary>
/// Creates an IndexableField whose value will be lazy loaded if and
/// when it is used.
/// <para>
/// <b>NOTE:</b> This method must be called once for each value of the field
/// name specified, in the sequence in which the values exist. It may not be
/// used to generate multiple lazy IndexableField instances referring to
/// the same underlying IndexableField instance.
/// </para>
/// <para>
/// The lazy loading of field values from all IndexableField instances
/// returned by this method is backed by a single Document fetch per
/// LazyDocument instance.
/// </para>
/// </summary>
public virtual IndexableField GetField(FieldInfo fieldInfo)
{
    fieldNames.Add(fieldInfo.Name);
    IList<LazyField> values = fields.ContainsKey(fieldInfo.Number) ? fields[fieldInfo.Number] : null;
    if (null == values)
    {
        values = new List<LazyField>();
        fields[fieldInfo.Number] = values;
    }
    LazyField value = new LazyField(this, fieldInfo.Name, fieldInfo.Number);
    values.Add(value);
    lock (this)
    {
        // edge case: if someone asks this LazyDoc for more LazyFields
        // after other LazyFields from the same LazyDoc have been
        // actualized, we need to force the doc to be re-fetched
        // so the new LazyFields are also populated.
        doc = null;
    }
    return value;
}
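A rough usage sketch, assuming the LazyDocument(reader, docID) constructor from the misc module; the field name and the helper method name are illustrative, not part of the snippet above.

// Sketch: no stored-field bytes are read until the returned field's value is
// first accessed, and all lazy fields of one LazyDocument share a single fetch.
using Lucene.Net.Documents;
using Lucene.Net.Index;

static IndexableField GetLazyField(AtomicReader reader, int docID, string fieldName)
{
    var lazyDoc = new LazyDocument(reader, docID);
    FieldInfo info = reader.FieldInfos.FieldInfo(fieldName);
    return lazyDoc.GetField(info); // the stored value loads lazily on first access
}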
public TermsHashPerField(DocInverterPerField docInverterPerField, TermsHash termsHash, TermsHash nextTermsHash, FieldInfo fieldInfo) { IntPool = termsHash.IntPool; BytePool = termsHash.BytePool; TermBytePool = termsHash.TermBytePool; DocState = termsHash.DocState; this.TermsHash = termsHash; BytesUsed = termsHash.BytesUsed; FieldState = docInverterPerField.FieldState; this.Consumer = termsHash.Consumer.AddField(this, fieldInfo); PostingsBytesStartArray byteStarts = new PostingsBytesStartArray(this, BytesUsed); BytesHash = new BytesRefHash(TermBytePool, HASH_INIT_SIZE, byteStarts); StreamCount = Consumer.StreamCount; NumPostingInt = 2 * StreamCount; this.FieldInfo = fieldInfo; if (nextTermsHash != null) { NextPerField = (TermsHashPerField)nextTermsHash.AddField(docInverterPerField, fieldInfo); } else { NextPerField = null; } }
private void AddField(Document doc, FieldInfo fi, bool binary, bool compressed, bool tokenize)
{
    // we have a binary stored field, and it may be compressed
    if (binary)
    {
        int toRead = fieldsStream.ReadVInt();
        var b = new byte[toRead];
        fieldsStream.ReadBytes(b, 0, b.Length);
        doc.Add(compressed
            ? new Field(fi.name, Uncompress(b), Field.Store.YES)
            : new Field(fi.name, b, Field.Store.YES));
    }
    else
    {
        const Field.Store store = Field.Store.YES;
        Field.Index index = FieldExtensions.ToIndex(fi.isIndexed, tokenize);
        Field.TermVector termVector = FieldExtensions.ToTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
        AbstractField f;
        if (compressed)
        {
            int toRead = fieldsStream.ReadVInt();
            var b = new byte[toRead];
            fieldsStream.ReadBytes(b, 0, b.Length);
            f = new Field(fi.name, false, System.Text.Encoding.GetEncoding("UTF-8").GetString(Uncompress(b)), store, index, termVector)
                { OmitTermFreqAndPositions = fi.omitTermFreqAndPositions, OmitNorms = fi.omitNorms };
        }
        else
        {
            f = new Field(fi.name, false, fieldsStream.ReadString(), store, index, termVector)
                { OmitTermFreqAndPositions = fi.omitTermFreqAndPositions, OmitNorms = fi.omitNorms };
        }
        doc.Add(f);
    }
}
internal override InvertedDocEndConsumerPerField AddField(DocInverterPerField docInverterPerField, FieldInfo fieldInfo) { return new NormsConsumerPerField(docInverterPerField, fieldInfo, this); }
private void AddFieldLazy(Document doc, FieldInfo fi, bool binary, bool compressed, bool tokenize)
{
    if (binary)
    {
        int toRead = fieldsStream.ReadVInt();
        long pointer = fieldsStream.FilePointer;
        //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES));
        doc.Add(new LazyField(this, fi.name, Field.Store.YES, toRead, pointer, binary, compressed));
        // Need to move the pointer ahead by toRead positions
        fieldsStream.Seek(pointer + toRead);
    }
    else
    {
        const Field.Store store = Field.Store.YES;
        Field.Index index = FieldExtensions.ToIndex(fi.isIndexed, tokenize);
        Field.TermVector termVector = FieldExtensions.ToTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
        AbstractField f;
        if (compressed)
        {
            int toRead = fieldsStream.ReadVInt();
            long pointer = fieldsStream.FilePointer;
            f = new LazyField(this, fi.name, store, toRead, pointer, binary, compressed);
            // skip over the part that we aren't loading
            fieldsStream.Seek(pointer + toRead);
            f.OmitNorms = fi.omitNorms;
            f.OmitTermFreqAndPositions = fi.omitTermFreqAndPositions;
        }
        else
        {
            int length = fieldsStream.ReadVInt();
            long pointer = fieldsStream.FilePointer;
            // Skip ahead of where we are by the length of what is stored
            if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
            {
                fieldsStream.Seek(pointer + length);
            }
            else
            {
                fieldsStream.SkipChars(length);
            }
            f = new LazyField(this, fi.name, store, index, termVector, length, pointer, binary, compressed)
                { OmitNorms = fi.omitNorms, OmitTermFreqAndPositions = fi.omitTermFreqAndPositions };
        }
        doc.Add(f);
    }
}
public override DocFieldConsumerPerField AddField(FieldInfo fi) { return new DocInverterPerField(this, fi); }
private FieldInfo AddInternal(System.String name, bool isIndexed, bool storeTermVector, bool storePositionWithTermVector, bool storeOffsetWithTermVector, bool omitNorms, bool storePayloads, bool omitTermFreqAndPositions) { name = StringHelper.Intern(name); FieldInfo fi = new FieldInfo(name, isIndexed, byNumber.Count, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); byNumber.Add(fi); byName[name] = fi; return fi; }
public override void AddField(int docID, IndexableField field, FieldInfo fieldInfo) { First.AddField(docID, field, fieldInfo); Second.AddField(docID, field, fieldInfo); }
abstract public TermsHashConsumerPerField AddField(TermsHashPerField termsHashPerField, FieldInfo fieldInfo);
public DocFieldProcessorPerField(DocFieldProcessorPerThread perThread, FieldInfo fieldInfo) { this.consumer = perThread.consumer.AddField(fieldInfo); this.fieldInfo = fieldInfo; }
internal void WriteField(FieldInfo fi, Fieldable field)
{
    // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
    // and field.GetBinaryValue() already returns the compressed value for a field
    // with IsCompressed()==true, so we disable compression in that case
    bool disableCompression = (field is FieldsReader.FieldForMerge);
    fieldsStream.WriteVInt(fi.number);
    byte bits = 0;
    if (field.IsTokenized())
    {
        bits |= FieldsWriter.FIELD_IS_TOKENIZED;
    }
    if (field.IsBinary())
    {
        bits |= FieldsWriter.FIELD_IS_BINARY;
    }
    if (field.IsCompressed())
    {
        bits |= FieldsWriter.FIELD_IS_COMPRESSED;
    }
    fieldsStream.WriteByte(bits);
    if (field.IsCompressed())
    {
        // compression is enabled for the current field
        byte[] data;
        int len;
        int offset;
        if (disableCompression)
        {
            // optimized case for merging: the data is already compressed
            data = field.GetBinaryValue();
            System.Diagnostics.Debug.Assert(data != null);
            len = field.GetBinaryLength();
            offset = field.GetBinaryOffset();
        }
        else
        {
            // check if it is a binary field
            if (field.IsBinary())
            {
                data = CompressionTools.Compress(field.GetBinaryValue(), field.GetBinaryOffset(), field.GetBinaryLength());
            }
            else
            {
                byte[] x = defaultEncoding.GetBytes(field.StringValue());
                data = CompressionTools.Compress(x, 0, x.Length);
            }
            len = data.Length;
            offset = 0;
        }
        fieldsStream.WriteVInt(len);
        fieldsStream.WriteBytes(data, offset, len);
    }
    else
    {
        // compression is disabled for the current field
        if (field.IsBinary())
        {
            byte[] data = field.GetBinaryValue();
            int len = field.GetBinaryLength();
            int offset = field.GetBinaryOffset();
            fieldsStream.WriteVInt(len);
            fieldsStream.WriteBytes(data, offset, len);
        }
        else
        {
            fieldsStream.WriteString(field.StringValue());
        }
    }
}
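The bits byte written by the two WriteField variants above is a plain bitwise OR of the FieldsWriter flag constants; a minimal reader-side sketch (the helper name is hypothetical):

// Sketch: recovering the three per-field flags from the bits byte.
static void DecodeFieldBits(byte bits, out bool tokenized, out bool binary, out bool compressed)
{
    tokenized = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
    binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
    compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
}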
public FieldForMerge(System.Object value_Renamed, FieldInfo fi, bool binary, bool compressed, bool tokenize) { this.isStored = true; this.fieldsData = value_Renamed; this.isCompressed = compressed; this.isBinary = binary; if (binary) binaryLength = ((byte[]) value_Renamed).Length; this.isTokenized = tokenize; this.name = StringHelper.Intern(fi.name); this.isIndexed = fi.isIndexed; this.omitNorms = fi.omitNorms; this.omitTermFreqAndPositions = fi.omitTermFreqAndPositions; this.storeOffsetWithTermVector = fi.storeOffsetWithTermVector; this.storePositionWithTermVector = fi.storePositionWithTermVector; this.storeTermVector = fi.storeTermVector; }
private Field.Index GetIndexType(FieldInfo fi, bool tokenize) { Field.Index index; if (fi.isIndexed && tokenize) index = Field.Index.ANALYZED; else if (fi.isIndexed && !tokenize) index = Field.Index.NOT_ANALYZED; else index = Field.Index.NO; return index; }
private Field.TermVector GetTermVectorType(FieldInfo fi) { Field.TermVector termVector = null; if (fi.storeTermVector) { if (fi.storeOffsetWithTermVector) { if (fi.storePositionWithTermVector) { termVector = Field.TermVector.WITH_POSITIONS_OFFSETS; } else { termVector = Field.TermVector.WITH_OFFSETS; } } else if (fi.storePositionWithTermVector) { termVector = Field.TermVector.WITH_POSITIONS; } else { termVector = Field.TermVector.YES; } } else { termVector = Field.TermVector.NO; } return termVector; }
/// <summary>
/// Call this only once (if you subclass!)
/// </summary>
protected virtual void Uninvert(AtomicReader reader, IBits liveDocs, BytesRef termPrefix)
{
    FieldInfo info = reader.FieldInfos.FieldInfo(m_field);
    if (info != null && info.HasDocValues)
    {
        throw new InvalidOperationException("Type mismatch: " + m_field + " was indexed as " + info.DocValuesType);
    }
    //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    long startTime = Environment.TickCount;
    m_prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix);

    int maxDoc = reader.MaxDoc;
    int[] index = new int[maxDoc];    // immediate term numbers, or the index into the byte[] representing the last number
    int[] lastTerm = new int[maxDoc]; // last term we saw for this document
    var bytes = new sbyte[maxDoc][];  // list of term numbers for the doc (delta encoded vInts)

    Fields fields = reader.Fields;
    if (fields == null)
    {
        // No terms
        return;
    }
    Terms terms = fields.GetTerms(m_field);
    if (terms == null)
    {
        // No terms
        return;
    }

    TermsEnum te = terms.GetEnumerator();
    BytesRef seekStart = termPrefix ?? new BytesRef();
    //System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END)
    {
        // No terms match
        return;
    }

    // If we need our "term index wrapper", these will be
    // init'd below:
    IList<BytesRef> indexedTerms = null;
    PagedBytes indexedTermsBytes = null;

    bool testedOrd = false;

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    var tempArr = new sbyte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs
    //
    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values. This requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    m_docsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ;)
    {
        BytesRef t = te.Term;
        if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix)))
        {
            break;
        }
        //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

        if (!testedOrd)
        {
            try
            {
                m_ordBase = (int)te.Ord;
                //System.out.println("got ordBase=" + ordBase);
            }
            catch (NotSupportedException) // LUCENENET: IDE0059: Remove unnecessary value assignment
            {
                // Reader cannot provide ord support, so we wrap
                // our own support by creating our own terms index:
                indexedTerms = new List<BytesRef>();
                indexedTermsBytes = new PagedBytes(15);
                //System.out.println("NO ORDS");
            }
            testedOrd = true;
        }

        VisitTerm(te, termNum);

        if (indexedTerms != null && (termNum & indexIntervalMask) == 0)
        {
            // Index this term
            m_sizeOfIndexedStrings += t.Length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.Copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.Add(indexedTerm);
        }

        int df = te.DocFreq;
        if (df <= m_maxTermDocFreq)
        {
            m_docsEnum = te.Docs(liveDocs, m_docsEnum, DocsFlags.NONE);

            // dF, but takes deletions into account
            int actualDF = 0;

            for (; ;)
            {
                int doc = m_docsEnum.NextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }
                //System.out.println(" chunk=" + chunk + " docs");
                actualDF++;
                m_termInstances++;

                //System.out.println(" docID=" + doc);
                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];

                if ((val & 0xff) == 1)
                {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = (int)((uint)val >> 8);
                    int ilen = VInt32Size(delta);
                    var arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.Length)
                    {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                        // TODO: figure out what array lengths we can round up to w/o actually using more memory
                        // (how much space does a byte[] take up? Is data preceded by a 32 bit length only?
                        // It should be safe to round up to the nearest 32 bits in any case.
                        int newLen = (newend + 3) & unchecked((int)0xfffffffc); // 4 byte alignment
                        var newarr = new sbyte[newLen];
                        Array.Copy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = WriteInt32(delta, arr, pos);
                    index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                }
                else
                {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0)
                    {
                        ipos = 0;
                    }
                    else if ((val & 0x0000ff80) == 0)
                    {
                        ipos = 1;
                    }
                    else if ((val & 0x00ff8000) == 0)
                    {
                        ipos = 2;
                    }
                    else if ((val & 0xff800000) == 0)
                    {
                        ipos = 3;
                    }
                    else
                    {
                        ipos = 4;
                    }

                    //System.out.println(" ipos=" + ipos);

                    int endPos = WriteInt32(delta, tempArr, ipos);
                    //System.out.println(" endpos=" + endPos);
                    if (endPos <= 4)
                    {
                        //System.out.println(" fits!");
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++)
                        {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    }
                    else
                    {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++)
                        {
                            tempArr[j] = (sbyte)val;
                            val = (int)((uint)val >> 8);
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        bytes[doc] = tempArr;
                        tempArr = new sbyte[12];
                    }
                }
            }
            SetActualDocFreq(termNum, actualDF);
        }

        termNum++;
        if (!te.MoveNext())
        {
            break;
        }
    }

    m_numTermsInField = termNum;

    long midPoint = Environment.TickCount;

    if (m_termInstances == 0)
    {
        // we didn't invert anything
        // lower memory consumption.
        m_tnums = null;
    }
    else
    {
        this.m_index = index;

        //
        // transform intermediate form into the final form, building a single byte[]
        // at a time, and releasing the intermediate byte[]s as we go to avoid
        // increasing the memory footprint.
        //
        for (int pass = 0; pass < 256; pass++)
        {
            var target = m_tnums[pass];
            var pos = 0; // end in target;
            if (target != null)
            {
                pos = target.Length;
            }
            else
            {
                target = new sbyte[4096];
            }

            // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
            // where pp is the pass (which array we are building), and xx is all values.
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24))
            {
                int lim = Math.Min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++)
                {
                    //System.out.println(" pass=" + pass + " process docID=" + doc);
                    int val = index[doc];
                    if ((val & 0xff) == 1)
                    {
                        int len = (int)((uint)val >> 8);
                        //System.out.println(" ptr pos=" + pos);
                        index[doc] = (pos << 8) | 1; // change index to point to start of array
                        if ((pos & 0xff000000) != 0)
                        {
                            // we only have 24 bits for the array index
                            throw new InvalidOperationException("Too many values for UnInvertedField faceting on field " + m_field);
                        }
                        var arr = bytes[doc];
                        /*
                         * for(byte b : arr) {
                         *   //System.out.println(" b=" + Integer.toHexString((int) b));
                         * }
                         */
                        bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                        if (target.Length <= pos + len)
                        {
                            int newlen = target.Length;

                            //* we don't have to worry about the array getting too large
                            // since the "pos" param will overflow first (only 24 bits available)
                            // if ((newlen<<1) <= 0) {
                            //   // overflow...
                            //   newlen = Integer.MAX_VALUE;
                            //   if (newlen <= pos + len) {
                            //     throw new SolrException(400,"Too many terms to uninvert field!");
                            //   }
                            // } else {
                            //   while (newlen <= pos + len) newlen<<=1; // doubling strategy
                            // }
                            //
                            while (newlen <= pos + len) // doubling strategy
                            {
                                newlen <<= 1;
                            }
                            var newtarget = new sbyte[newlen];
                            Array.Copy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        Array.Copy(arr, 0, target, pos, len);
                        pos += len + 1; // skip single byte at end and leave it 0 for terminator
                    }
                }
            }

            // shrink array
            if (pos < target.Length)
            {
                var newtarget = new sbyte[pos];
                Array.Copy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }

            m_tnums[pass] = target;

            if ((pass << 16) > maxDoc)
            {
                break;
            }
        }
    }
    if (indexedTerms != null)
    {
        m_indexedTermsArray = new BytesRef[indexedTerms.Count];
        indexedTerms.CopyTo(m_indexedTermsArray, 0);
    }

    long endTime = Environment.TickCount;

    m_total_time = (int)(endTime - startTime);
    m_phase1_time = (int)(midPoint - startTime);
}
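For reference, a loop-based stand-in for the delta encoding the comments above describe (the real helpers are VInt32Size and WriteInt32, which are unrolled; this sketch is illustrative only): deltas are written most-significant 7-bit group first, every byte except the last has its high bit set, and a byte with the high bit clear terminates a value, which is the property the "find the end" logic relies on.

// Illustrative stand-in for WriteInt32 (not the actual helper).
static int WriteDeltaSketch(int value, sbyte[] dest, int pos)
{
    for (int shift = 28; shift > 0; shift -= 7)
    {
        int part = (int)((uint)value >> shift);
        if (part != 0)
        {
            dest[pos++] = (sbyte)(part | 0x80); // continuation byte, high bit set
        }
    }
    dest[pos++] = (sbyte)(value & 0x7F);        // terminating byte, high bit clear
    return pos;                                 // new end offset in dest
}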
// Add the size of the field as a byte[] containing the 4 bytes of the integer byte size
// (high order byte first; char = 2 bytes).
// Read just the size -- the caller must skip the field content to continue reading fields.
// Return the size in bytes or chars, depending on field type.
private int AddFieldSize(Document doc, FieldInfo fi, bool binary, bool compressed)
{
    int size = fieldsStream.ReadVInt();
    int bytesize = binary || compressed ? size : 2 * size;
    var sizebytes = new byte[4];
    sizebytes[0] = (byte)(Number.URShift(bytesize, 24));
    sizebytes[1] = (byte)(Number.URShift(bytesize, 16));
    sizebytes[2] = (byte)(Number.URShift(bytesize, 8));
    sizebytes[3] = (byte)bytesize;
    doc.Add(new Field(fi.name, sizebytes, Field.Store.YES));
    return size;
}
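The four bytes stored above are just the size in big-endian (high byte first) order, so a consumer can rebuild the int directly; a minimal sketch (the helper name is hypothetical):

// Sketch: reversing the byte packing done by AddFieldSize.
static int DecodeFieldSize(byte[] sizebytes)
{
    return (sizebytes[0] << 24) | (sizebytes[1] << 16) | (sizebytes[2] << 8) | sizebytes[3];
}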
internal bool hasPayloads; // if enabled, and we actually saw any for this field

public TermVectorsConsumerPerField(TermsHashPerField termsHashPerField, TermVectorsConsumer termsWriter, FieldInfo fieldInfo)
{
    this.termsHashPerField = termsHashPerField;
    this.termsWriter = termsWriter;
    this.fieldInfo = fieldInfo;
    docState = termsHashPerField.docState;
    fieldState = termsHashPerField.fieldState;
}