protected internal override long Size(SegmentCommitInfo info)
{
    int hourOfDay = Calendar.Hour;
    if (hourOfDay < 6 || hourOfDay > 20 || Random.Next(23) == 5) // it's 5 o'clock somewhere
    {
        Drink.Drink_e[] values = Enum.GetValues(typeof(Drink.Drink_e)).Cast<Drink.Drink_e>().ToArray();
        // pick a random drink during the day; Next's upper bound is exclusive,
        // so passing values.Length keeps every drink eligible
        Drink.Drink_e drink = values[Random.Next(values.Length)];
        return (long)drink * info.SizeInBytes();
    }
    return info.SizeInBytes();
}
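// Hypothetical illustration (the actual Drink_e factors are defined elsewhere
// in the test framework): outside 06:00-20:00, a drink whose underlying enum
// value is 15 makes a 10 MB segment report 150 MB, randomly skewing merge
// selection at "night".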
/// <summary>
/// Return the byte size of the provided <see cref="SegmentCommitInfo"/>,
/// pro-rated by the percentage of non-deleted documents if
/// <see cref="SetCalibrateSizeByDeletes"/> is set.
/// </summary>
protected internal virtual long SizeBytes(SegmentCommitInfo info)
{
    if (CalibrateSizeByDeletes_Renamed)
    {
        return base.Size(info);
    }
    return info.SizeInBytes();
}
/// <summary>
/// Return the byte size of the provided <see cref="SegmentCommitInfo"/>,
/// pro-rated by the percentage of non-deleted documents.
/// </summary>
protected internal virtual long Size(SegmentCommitInfo info)
{
    long byteSize = info.SizeInBytes();
    int delCount = Writer.Get().NumDeletedDocs(info);
    double delRatio = info.Info.DocCount <= 0 ? 0.0f : (float)delCount / (float)info.Info.DocCount;
    Debug.Assert(delRatio <= 1.0);
    return info.Info.DocCount <= 0 ? byteSize : (long)(byteSize * (1.0 - delRatio));
}
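// A minimal worked example with hypothetical numbers: for byteSize = 1000,
// info.Info.DocCount = 100 and delCount = 25, delRatio = 0.25 and the
// pro-rated size is (long)(1000 * (1.0 - 0.25)) = 750 bytes.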
public override MergeSpecification FindMerges(MergeTrigger? mergeTrigger, SegmentInfos infos)
{
    if (Verbose())
    {
        Message("findMerges: " + infos.Size() + " segments");
    }
    if (infos.Size() == 0)
    {
        return null;
    }
    ICollection<SegmentCommitInfo> merging = Writer.Get().MergingSegments;
    ICollection<SegmentCommitInfo> toBeMerged = new HashSet<SegmentCommitInfo>();

    List<SegmentCommitInfo> infosSorted = new List<SegmentCommitInfo>(infos.AsList());
    infosSorted.Sort(new SegmentByteSizeDescending(this));

    // Compute total index bytes & print details about the index
    long totIndexBytes = 0;
    long minSegmentBytes = long.MaxValue;
    foreach (SegmentCommitInfo info in infosSorted)
    {
        long segBytes = Size(info);
        if (Verbose())
        {
            string extra = merging.Contains(info) ? " [merging]" : "";
            if (segBytes >= MaxMergedSegmentBytes / 2.0)
            {
                extra += " [skip: too large]";
            }
            else if (segBytes < FloorSegmentBytes)
            {
                extra += " [floored]";
            }
            Message("  seg=" + Writer.Get().SegString(info) + " size=" + string.Format(CultureInfo.InvariantCulture, "{0:0.00}", segBytes / 1024 / 1024.0) + " MB" + extra);
        }

        minSegmentBytes = Math.Min(segBytes, minSegmentBytes);
        // Accum total byte size
        totIndexBytes += segBytes;
    }

    // If we have too-large segments, grace them out
    // of the maxSegmentCount:
    int tooBigCount = 0;
    while (tooBigCount < infosSorted.Count && Size(infosSorted[tooBigCount]) >= MaxMergedSegmentBytes / 2.0)
    {
        totIndexBytes -= Size(infosSorted[tooBigCount]);
        tooBigCount++;
    }

    minSegmentBytes = FloorSize(minSegmentBytes);

    // Compute max allowed segs in the index
    long levelSize = minSegmentBytes;
    long bytesLeft = totIndexBytes;
    double allowedSegCount = 0;
    while (true)
    {
        double segCountLevel = bytesLeft / (double)levelSize;
        if (segCountLevel < SegsPerTier)
        {
            allowedSegCount += Math.Ceiling(segCountLevel);
            break;
        }
        allowedSegCount += SegsPerTier;
        bytesLeft -= (long)(SegsPerTier * levelSize);
        levelSize *= MaxMergeAtOnce_Renamed;
    }
    int allowedSegCountInt = (int)allowedSegCount;

    MergeSpecification spec = null;

    // Cycle to possibly select more than one merge:
    while (true)
    {
        long mergingBytes = 0;

        // Gather eligible segments for merging, ie segments
        // not already being merged and not already picked (by
        // prior iteration of this loop) for merging:
        IList<SegmentCommitInfo> eligible = new List<SegmentCommitInfo>();
        for (int idx = tooBigCount; idx < infosSorted.Count; idx++)
        {
            SegmentCommitInfo info = infosSorted[idx];
            if (merging.Contains(info))
            {
                mergingBytes += info.SizeInBytes();
            }
            else if (!toBeMerged.Contains(info))
            {
                eligible.Add(info);
            }
        }

        bool maxMergeIsRunning = mergingBytes >= MaxMergedSegmentBytes;

        if (Verbose())
        {
            Message("  allowedSegmentCount=" + allowedSegCountInt + " vs count=" + infosSorted.Count + " (eligible count=" + eligible.Count + ") tooBigCount=" + tooBigCount);
        }

        if (eligible.Count == 0)
        {
            return spec;
        }

        if (eligible.Count >= allowedSegCountInt)
        {
            // OK we are over budget -- find best merge!
            MergeScore bestScore = null;
            IList<SegmentCommitInfo> best = null;
            bool bestTooLarge = false;
            long bestMergeBytes = 0;

            // Consider all merge starts:
            for (int startIdx = 0; startIdx <= eligible.Count - MaxMergeAtOnce_Renamed; startIdx++)
            {
                long totAfterMergeBytes = 0;
                IList<SegmentCommitInfo> candidate = new List<SegmentCommitInfo>();
                bool hitTooLarge = false;
                for (int idx = startIdx; idx < eligible.Count && candidate.Count < MaxMergeAtOnce_Renamed; idx++)
                {
                    SegmentCommitInfo info = eligible[idx];
                    long segBytes = Size(info);

                    if (totAfterMergeBytes + segBytes > MaxMergedSegmentBytes)
                    {
                        hitTooLarge = true;
                        // NOTE: we continue, so that we can try
                        // "packing" smaller segments into this merge
                        // to see if we can get closer to the max
                        // size; this in general is not perfect since
                        // this is really "bin packing" and we'd have
                        // to try different permutations.
                        continue;
                    }
                    candidate.Add(info);
                    totAfterMergeBytes += segBytes;
                }

                MergeScore score = Score(candidate, hitTooLarge, mergingBytes);

                if (Verbose())
                {
                    Message("  maybe=" + Writer.Get().SegString(candidate) + " score=" + score.Score + " " + score.Explanation + " tooLarge=" + hitTooLarge + " size=" + string.Format(CultureInfo.InvariantCulture, "{0:0.000} MB", totAfterMergeBytes / 1024.0 / 1024.0));
                }

                // If we are already running a max sized merge
                // (maxMergeIsRunning), don't allow another max
                // sized merge to kick off:
                if ((bestScore == null || score.Score < bestScore.Score) && (!hitTooLarge || !maxMergeIsRunning))
                {
                    best = candidate;
                    bestScore = score;
                    bestTooLarge = hitTooLarge;
                    bestMergeBytes = totAfterMergeBytes;
                }
            }

            if (best != null)
            {
                if (spec == null)
                {
                    spec = new MergeSpecification();
                }
                OneMerge merge = new OneMerge(best);
                spec.Add(merge);
                foreach (SegmentCommitInfo info in merge.Segments)
                {
                    toBeMerged.Add(info);
                }

                if (Verbose())
                {
                    Message("  add merge=" + Writer.Get().SegString(merge.Segments) + " size=" + string.Format(CultureInfo.InvariantCulture, "{0:0.000} MB", bestMergeBytes / 1024.0 / 1024.0) + " score=" + string.Format(CultureInfo.InvariantCulture, "{0:0.000}", bestScore.Score) + " " + bestScore.Explanation + (bestTooLarge ? " [max merge]" : ""));
                }
            }
            else
            {
                return spec;
            }
        }
        else
        {
            return spec;
        }
    }
}
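// Hedged worked example of the "max allowed segs" budget computed above,
// using hypothetical numbers: with minSegmentBytes floored to 2 MB,
// totIndexBytes = 100 MB, SegsPerTier = 10 and MaxMergeAtOnce = 10:
//   tier 1: 100 MB / 2 MB = 50 >= 10  -> allowedSegCount = 10, bytesLeft = 80 MB, levelSize = 20 MB
//   tier 2: 80 MB / 20 MB = 4 < 10    -> allowedSegCount += ceil(4), giving 14
// so a merge is searched for whenever 14 or more eligible segments exist.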
/// <summary>
/// Flush all pending docs to a new segment.
/// </summary>
internal virtual FlushedSegment Flush()
{
    Debug.Assert(numDocsInRAM > 0);
    Debug.Assert(DeleteSlice.Empty, "all deletes must be applied in prepareFlush");
    SegmentInfo_Renamed.DocCount = numDocsInRAM;
    SegmentWriteState flushState = new SegmentWriteState(InfoStream, Directory, SegmentInfo_Renamed, FieldInfos.Finish(), IndexWriterConfig.TermIndexInterval, PendingUpdates, new IOContext(new FlushInfo(numDocsInRAM, BytesUsed())));
    double startMBUsed = BytesUsed() / 1024.0 / 1024.0;

    // Apply delete-by-docID now (delete-by-docID only
    // happens when an exception is hit processing that
    // doc, e.g. if the analyzer has some problem w/ the text):
    if (PendingUpdates.DocIDs.Count > 0)
    {
        flushState.LiveDocs = Codec.LiveDocsFormat().NewLiveDocs(numDocsInRAM);
        foreach (int delDocID in PendingUpdates.DocIDs)
        {
            flushState.LiveDocs.Clear(delDocID);
        }
        flushState.DelCountOnFlush = PendingUpdates.DocIDs.Count;
        PendingUpdates.BytesUsed.AddAndGet(-PendingUpdates.DocIDs.Count * BufferedUpdates.BYTES_PER_DEL_DOCID);
        PendingUpdates.DocIDs.Clear();
    }

    if (Aborting)
    {
        if (InfoStream.IsEnabled("DWPT"))
        {
            InfoStream.Message("DWPT", "flush: skip because aborting is set");
        }
        return null;
    }

    if (InfoStream.IsEnabled("DWPT"))
    {
        InfoStream.Message("DWPT", "flush postings as segment " + flushState.SegmentInfo.Name + " numDocs=" + numDocsInRAM);
    }

    bool success = false;
    try
    {
        Consumer.Flush(flushState);
        PendingUpdates.Terms.Clear();
        SegmentInfo_Renamed.Files = new HashSet<string>(Directory.CreatedFiles);

        SegmentCommitInfo segmentInfoPerCommit = new SegmentCommitInfo(SegmentInfo_Renamed, 0, -1L, -1L);
        if (InfoStream.IsEnabled("DWPT"))
        {
            InfoStream.Message("DWPT", "new segment has " + (flushState.LiveDocs == null ? 0 : flushState.DelCountOnFlush) + " deleted docs");
            InfoStream.Message("DWPT", "new segment has " + (flushState.FieldInfos.HasVectors() ? "vectors" : "no vectors") + "; " + (flushState.FieldInfos.HasNorms() ? "norms" : "no norms") + "; " + (flushState.FieldInfos.HasDocValues() ? "docValues" : "no docValues") + "; " + (flushState.FieldInfos.HasProx() ? "prox" : "no prox") + "; " + (flushState.FieldInfos.HasFreq() ? "freqs" : "no freqs"));
            InfoStream.Message("DWPT", "flushedFiles=" + segmentInfoPerCommit.Files());
            InfoStream.Message("DWPT", "flushed codec=" + Codec);
        }

        BufferedUpdates segmentDeletes;
        if (PendingUpdates.Queries.Count == 0 && PendingUpdates.NumericUpdates.Count == 0 && PendingUpdates.BinaryUpdates.Count == 0)
        {
            PendingUpdates.Clear();
            segmentDeletes = null;
        }
        else
        {
            segmentDeletes = PendingUpdates;
        }

        if (InfoStream.IsEnabled("DWPT"))
        {
            double newSegmentSize = segmentInfoPerCommit.SizeInBytes() / 1024.0 / 1024.0;
            InfoStream.Message("DWPT", "flushed: segment=" + SegmentInfo_Renamed.Name + " ramUsed=" + startMBUsed.ToString(Nf) + " MB" + " newFlushedSize(includes docstores)=" + newSegmentSize.ToString(Nf) + " MB" + " docs/MB=" + (flushState.SegmentInfo.DocCount / newSegmentSize).ToString(Nf));
        }

        Debug.Assert(SegmentInfo_Renamed != null);

        FlushedSegment fs = new FlushedSegment(segmentInfoPerCommit, flushState.FieldInfos, segmentDeletes, flushState.LiveDocs, flushState.DelCountOnFlush);
        SealFlushedSegment(fs);
        success = true;
        return fs;
    }
    finally
    {
        if (!success)
        {
            Abort(FilesToDelete);
        }
    }
}
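// A small hypothetical illustration of the delete-by-docID branch above:
// with numDocsInRAM = 5 and PendingUpdates.DocIDs = {1, 3}, the new live-docs
// bits end up as 1,0,1,0,1 and flushState.DelCountOnFlush = 2, so the flushed
// segment is sealed with two deleted documents.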
/// <summary>
/// Seals the <seealso cref="SegmentInfo"/> for the new flushed segment and persists
/// the deleted documents <seealso cref="MutableBits"/>.
/// </summary>
internal virtual void SealFlushedSegment(FlushedSegment flushedSegment)
{
    Debug.Assert(flushedSegment != null);

    SegmentCommitInfo newSegment = flushedSegment.SegmentInfo;

    IndexWriter.SetDiagnostics(newSegment.Info, IndexWriter.SOURCE_FLUSH);

    IOContext context = new IOContext(new FlushInfo(newSegment.Info.DocCount, newSegment.SizeInBytes()));

    bool success = false;
    try
    {
        if (IndexWriterConfig.UseCompoundFile)
        {
            CollectionsHelper.AddAll(FilesToDelete, IndexWriter.CreateCompoundFile(InfoStream, Directory, MergeState.CheckAbort.NONE, newSegment.Info, context));
            newSegment.Info.UseCompoundFile = true;
        }

        // Have codec write SegmentInfo.  Must do this after
        // creating CFS so that 1) .si isn't slurped into CFS,
        // and 2) .si reflects useCompoundFile=true change
        // above:
        Codec.SegmentInfoFormat().SegmentInfoWriter.Write(Directory, newSegment.Info, flushedSegment.FieldInfos, context);

        // TODO: ideally we would freeze newSegment here!!
        // because any changes after writing the .si will be
        // lost...

        // Must write deleted docs after the CFS so we don't
        // slurp the del file into CFS:
        if (flushedSegment.LiveDocs != null)
        {
            int delCount = flushedSegment.DelCount;
            Debug.Assert(delCount > 0);
            if (InfoStream.IsEnabled("DWPT"))
            {
                InfoStream.Message("DWPT", "flush: write " + delCount + " deletes gen=" + flushedSegment.SegmentInfo.DelGen);
            }

            // TODO: we should prune the segment if it's 100%
            // deleted... but merge will also catch it.

            // TODO: in the NRT case it'd be better to hand
            // this del vector over to the
            // shortly-to-be-opened SegmentReader and let it
            // carry the changes; there's no reason to use
            // filesystem as intermediary here.

            SegmentCommitInfo info = flushedSegment.SegmentInfo;
            Codec codec = info.Info.Codec;
            codec.LiveDocsFormat().WriteLiveDocs(flushedSegment.LiveDocs, Directory, info, delCount, context);
            newSegment.DelCount = delCount;
            newSegment.AdvanceDelGen();
        }

        success = true;
    }
    finally
    {
        if (!success)
        {
            if (InfoStream.IsEnabled("DWPT"))
            {
                InfoStream.Message("DWPT", "hit exception creating compound file for newly flushed segment " + newSegment.Info.Name);
            }
        }
    }
}
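// For illustration only (hypothetical segment name, following Lucene's usual
// file-naming conventions): sealing a flushed segment "_0" with
// UseCompoundFile = true leaves _0.cfs, _0.cfe and _0.si on disk, plus a
// generation'd live-docs file once WriteLiveDocs runs and AdvanceDelGen
// bumps the delete generation.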
public virtual void ListSegments()
{
    for (int x = 0; x < infos.Size(); x++)
    {
        SegmentCommitInfo info = infos.Info(x);
        string sizeStr = string.Format(CultureInfo.InvariantCulture, "{0:###,###.###}", info.SizeInBytes());
        Console.WriteLine(info.Info.Name + " " + sizeStr);
    }
}