// maxAllowed = the "highest" we can index, but we will still // randomly index at lower IndexOption private FieldsProducer BuildIndex(Directory dir, FieldInfo.IndexOptions maxAllowed, bool allowPayloads, bool alwaysTestMax) { Codec codec = Codec; SegmentInfo segmentInfo = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, "_0", MaxDoc, false, codec, null); int maxIndexOption = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToList().IndexOf(maxAllowed); if (VERBOSE) { Console.WriteLine("\nTEST: now build index"); } int maxIndexOptionNoOffsets = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToList().IndexOf(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); // TODO use allowPayloads var newFieldInfoArray = new FieldInfo[Fields.Count]; for (int fieldUpto = 0; fieldUpto < Fields.Count; fieldUpto++) { FieldInfo oldFieldInfo = FieldInfos.FieldInfo(fieldUpto); string pf = TestUtil.GetPostingsFormat(codec, oldFieldInfo.Name); int fieldMaxIndexOption; if (DoesntSupportOffsets.Contains(pf)) { fieldMaxIndexOption = Math.Min(maxIndexOptionNoOffsets, maxIndexOption); } else { fieldMaxIndexOption = maxIndexOption; } // Randomly picked the IndexOptions to index this // field with: FieldInfo.IndexOptions indexOptions = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToArray()[alwaysTestMax ? fieldMaxIndexOption : Random().Next(1 + fieldMaxIndexOption)]; bool doPayloads = indexOptions.CompareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads; newFieldInfoArray[fieldUpto] = new FieldInfo(oldFieldInfo.Name, true, fieldUpto, false, false, doPayloads, indexOptions, null, DocValuesType.NUMERIC, null); } FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray); // Estimate that flushed segment size will be 25% of // what we use in RAM: long bytes = TotalPostings * 8 + TotalPayloadBytes; SegmentWriteState writeState = new SegmentWriteState(null, dir, segmentInfo, newFieldInfos, 32, null, new IOContext(new FlushInfo(MaxDoc, bytes))); FieldsConsumer fieldsConsumer = codec.PostingsFormat().FieldsConsumer(writeState); foreach (KeyValuePair<string, SortedDictionary<BytesRef, long>> fieldEnt in Fields) { string field = fieldEnt.Key; IDictionary<BytesRef, long> terms = fieldEnt.Value; FieldInfo fieldInfo = newFieldInfos.FieldInfo(field); FieldInfo.IndexOptions? 
indexOptions = fieldInfo.FieldIndexOptions; if (VERBOSE) { Console.WriteLine("field=" + field + " indexOtions=" + indexOptions); } bool doFreq = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS; bool doPos = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; bool doPayloads = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && allowPayloads; bool doOffsets = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; TermsConsumer termsConsumer = fieldsConsumer.AddField(fieldInfo); long sumTotalTF = 0; long sumDF = 0; FixedBitSet seenDocs = new FixedBitSet(MaxDoc); foreach (KeyValuePair<BytesRef, long> termEnt in terms) { BytesRef term = termEnt.Key; SeedPostings postings = GetSeedPostings(term.Utf8ToString(), termEnt.Value, false, maxAllowed); if (VERBOSE) { Console.WriteLine(" term=" + field + ":" + term.Utf8ToString() + " docFreq=" + postings.DocFreq + " seed=" + termEnt.Value); } PostingsConsumer postingsConsumer = termsConsumer.StartTerm(term); long totalTF = 0; int docID = 0; while ((docID = postings.NextDoc()) != DocsEnum.NO_MORE_DOCS) { int freq = postings.Freq(); if (VERBOSE) { Console.WriteLine(" " + postings.Upto + ": docID=" + docID + " freq=" + postings.Freq_Renamed); } postingsConsumer.StartDoc(docID, doFreq ? postings.Freq_Renamed : -1); seenDocs.Set(docID); if (doPos) { totalTF += postings.Freq_Renamed; for (int posUpto = 0; posUpto < freq; posUpto++) { int pos = postings.NextPosition(); BytesRef payload = postings.Payload; if (VERBOSE) { if (doPayloads) { Console.WriteLine(" pos=" + pos + " payload=" + (payload == null ? "null" : payload.Length + " bytes")); } else { Console.WriteLine(" pos=" + pos); } } postingsConsumer.AddPosition(pos, doPayloads ? payload : null, doOffsets ? postings.StartOffset() : -1, doOffsets ? postings.EndOffset() : -1); } } else if (doFreq) { totalTF += freq; } else { totalTF++; } postingsConsumer.FinishDoc(); } termsConsumer.FinishTerm(term, new TermStats(postings.DocFreq, doFreq ? totalTF : -1)); sumTotalTF += totalTF; sumDF += postings.DocFreq; } termsConsumer.Finish(doFreq ? sumTotalTF : -1, sumDF, seenDocs.Cardinality()); } fieldsConsumer.Dispose(); if (VERBOSE) { Console.WriteLine("TEST: after indexing: files="); foreach (string file in dir.ListAll()) { Console.WriteLine(" " + file + ": " + dir.FileLength(file) + " bytes"); } } CurrentFieldInfos = newFieldInfos; SegmentReadState readState = new SegmentReadState(dir, segmentInfo, newFieldInfos, IOContext.READ, 1); return codec.PostingsFormat().FieldsProducer(readState); }
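// The following is a hypothetical usage sketch, not part of the original sources: it shows
// how a test might call BuildIndex at the maximum IndexOptions and then hand the resulting
// FieldsProducer to a verification step. The method name, "dir", and the commented-out
// verification call are assumptions; only BuildIndex's signature comes from the code above.
private void ExampleBuildIndexUsage()
{
    Directory dir = NewDirectory();
    FieldsProducer fieldsProducer = BuildIndex(dir,
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
        allowPayloads: true,
        alwaysTestMax: true);
    try
    {
        // VerifyFields(fieldsProducer); // hypothetical check against the SeedPostings model
    }
    finally
    {
        fieldsProducer.Dispose(); // the producer holds open files of the written segment
        dir.Dispose();
    }
}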
/// <summary>
/// Returns a new Directory instance, created using the specified random,
/// with contents copied from the provided directory. See
/// <seealso cref="NewDirectory()"/> for more information.
/// </summary>
public static BaseDirectoryWrapper NewDirectory(Random r, Directory d)
{
    Directory impl = NewDirectoryImpl(r, TEST_DIRECTORY);
    foreach (string file in d.ListAll())
    {
        d.Copy(impl, file, file, NewIOContext(r));
    }
    return WrapDirectory(r, impl, Rarely(r));
}
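// Hypothetical usage sketch, not part of LuceneTestCase: clone an already-populated
// directory into a fresh, randomly chosen Directory implementation so a second reader
// can be opened without touching the original. The method and variable names below
// ("ExampleCopyDirectory", "source", "copy") are illustrative only.
public static void ExampleCopyDirectory(Random random, Directory source)
{
    BaseDirectoryWrapper copy = NewDirectory(random, source);
    try
    {
        // ... open an IndexReader against "copy"; it starts as an exact copy of "source" ...
    }
    finally
    {
        copy.Dispose();
    }
}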
/// <summary>
/// Initialize the deleter: find all previous commits in
/// the Directory, incref the files they reference, call
/// the policy to let it delete commits. This will remove
/// any files not referenced by any of the commits.
/// </summary>
/// <exception cref="IOException"> if there is a low-level IO error </exception>
public IndexFileDeleter(Directory directory, IndexDeletionPolicy policy, SegmentInfos segmentInfos, InfoStream infoStream, IndexWriter writer, bool initialIndexExists)
{
    this.InfoStream = infoStream;
    this.Writer = writer;

    string currentSegmentsFile = segmentInfos.SegmentsFileName;

    if (infoStream.IsEnabled("IFD"))
    {
        infoStream.Message("IFD", "init: current segments file is \"" + currentSegmentsFile + "\"; deletionPolicy=" + policy);
    }

    this.Policy = policy;
    this.Directory = directory;

    // First pass: walk the files and initialize our ref
    // counts:
    long currentGen = segmentInfos.Generation;

    CommitPoint currentCommitPoint = null;
    string[] files = null;
    try
    {
        files = directory.ListAll();
    }
    catch (NoSuchDirectoryException)
    {
        // it means the directory is empty, so ignore it.
        files = new string[0];
    }

    if (currentSegmentsFile != null)
    {
        Regex r = IndexFileNames.CODEC_FILE_PATTERN;
        foreach (string fileName in files)
        {
            if (!fileName.EndsWith("write.lock") && !fileName.Equals(IndexFileNames.SEGMENTS_GEN) && (r.IsMatch(fileName) || fileName.StartsWith(IndexFileNames.SEGMENTS)))
            {
                // Add this file to refCounts with initial count 0:
                GetRefCount(fileName);

                if (fileName.StartsWith(IndexFileNames.SEGMENTS))
                {
                    // this is a commit (segments or segments_N), and
                    // it's valid (<= the max gen). Load it, then
                    // incref all files it refers to:
                    if (infoStream.IsEnabled("IFD"))
                    {
                        infoStream.Message("IFD", "init: load commit \"" + fileName + "\"");
                    }
                    SegmentInfos sis = new SegmentInfos();
                    try
                    {
                        sis.Read(directory, fileName);
                    }
                    catch (FileNotFoundException)
                    {
                        // LUCENE-948: on NFS (and maybe others), if
                        // you have writers switching back and forth
                        // between machines, it's very likely that the
                        // dir listing will be stale and will claim a
                        // file segments_X exists when in fact it
                        // doesn't. So, we catch this and handle it
                        // as if the file does not exist
                        if (infoStream.IsEnabled("IFD"))
                        {
                            infoStream.Message("IFD", "init: hit FileNotFoundException when loading commit \"" + fileName + "\"; skipping this commit point");
                        }
                        sis = null;
                    }
                    catch (IOException e)
                    {
                        if (SegmentInfos.GenerationFromSegmentsFileName(fileName) <= currentGen && directory.FileLength(fileName) > 0)
                        {
                            throw e;
                        }
                        else
                        {
                            // Most likely we are opening an index that
                            // has an aborted "future" commit, so suppress
                            // exc in this case
                            sis = null;
                        }
                    }
                    if (sis != null)
                    {
                        CommitPoint commitPoint = new CommitPoint(CommitsToDelete, directory, sis);
                        if (sis.Generation == segmentInfos.Generation)
                        {
                            currentCommitPoint = commitPoint;
                        }
                        Commits.Add(commitPoint);
                        IncRef(sis, true);

                        if (LastSegmentInfos_Renamed == null || sis.Generation > LastSegmentInfos_Renamed.Generation)
                        {
                            LastSegmentInfos_Renamed = sis;
                        }
                    }
                }
            }
        }
    }

    if (currentCommitPoint == null && currentSegmentsFile != null && initialIndexExists)
    {
        // We did not in fact see the segments_N file
        // corresponding to the segmentInfos that was passed
        // in. Yet, it must exist, because our caller holds
        // the write lock. This can happen when the directory
        // listing was stale (eg when index accessed via NFS
        // client with stale directory listing cache). So we
        // try now to explicitly open this commit point:
        SegmentInfos sis = new SegmentInfos();
        try
        {
            sis.Read(directory, currentSegmentsFile);
        }
        catch (IOException)
        {
            throw new CorruptIndexException("failed to locate current segments_N file \"" + currentSegmentsFile + "\"");
        }
        if (infoStream.IsEnabled("IFD"))
        {
            infoStream.Message("IFD", "forced open of current segments file " + segmentInfos.SegmentsFileName);
        }
        currentCommitPoint = new CommitPoint(CommitsToDelete, directory, sis);
        Commits.Add(currentCommitPoint);
        IncRef(sis, true);
    }

    // We keep commits list in sorted order (oldest to newest):
    CollectionUtil.TimSort(Commits);

    // Now delete anything with ref count at 0. These are
    // presumably abandoned files eg due to crash of
    // IndexWriter.
    foreach (KeyValuePair<string, RefCount> entry in RefCounts)
    {
        RefCount rc = entry.Value;
        string fileName = entry.Key;
        if (0 == rc.Count)
        {
            if (infoStream.IsEnabled("IFD"))
            {
                infoStream.Message("IFD", "init: removing unreferenced file \"" + fileName + "\"");
            }
            DeleteFile(fileName);
        }
    }

    // Finally, give policy a chance to remove things on
    // startup:
    Policy.OnInit(Commits);

    // Always protect the incoming segmentInfos since
    // sometimes it may not be the most recent commit
    Checkpoint(segmentInfos, false);

    StartingCommitDeleted = currentCommitPoint == null ? false : currentCommitPoint.Deleted;

    DeleteCommits();
}
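// Hypothetical sketch, not taken from IndexWriter: it illustrates how the constructor above
// is typically wired up when a writer opens an existing index. The helper name, the use of
// KeepOnlyLastCommitDeletionPolicy, and InfoStream.Default are assumptions for illustration;
// only the constructor signature and the StartingCommitDeleted flag come from the code above.
internal static IndexFileDeleter ExampleCreateDeleter(Directory directory, SegmentInfos segmentInfos, IndexWriter writer)
{
    IndexFileDeleter deleter = new IndexFileDeleter(directory,
        new KeepOnlyLastCommitDeletionPolicy(),
        segmentInfos,
        InfoStream.Default,
        writer,
        initialIndexExists: true);

    if (deleter.StartingCommitDeleted)
    {
        // The deletion policy removed the commit we opened from; the caller
        // (normally IndexWriter) must write a new segments_N before handing
        // out any readers.
    }
    return deleter;
}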