/// <summary>
/// Teardown hook for the counting-facets-aggregator test class: closes
/// <c>indexDir</c> and <c>taxoDir</c> through a single <c>IOUtils.Close</c> call.
/// </summary>
public static void AfterClassCountingFacetsAggregatorTest()
{
    IOUtils.Close(indexDir, taxoDir);
}
/// <summary>
/// Stress test: several threads add categories through one shared
/// <c>DirectoryTaxonomyWriter</c>. Afterwards the taxonomy is reopened and every
/// key of the dictionary handed to the worker threads is checked for presence
/// and for a correct parent ordinal at each level of its path.
/// </summary>
public virtual void TestConcurrency()
{
    int categoryCount = AtLeast(100000); // add many categories
    int selectionRange = categoryCount * 3; // affects the categories selection
    AtomicInteger remaining = new AtomicInteger(categoryCount);
    Directory dir = NewDirectory();
    var added = new ConcurrentDictionary<string, string>();

    // Pick the writer cache implementation at random.
    double cacheRoll = Random().NextDouble();
    TaxonomyWriterCache cache;
    if (cacheRoll < 0.7)
    {
        // this is the fastest, yet most memory consuming
        cache = new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
    }
    else if (TEST_NIGHTLY && cacheRoll > 0.98)
    {
        // this is the slowest, but tests the writer concurrency when no caching is done.
        // only pick it during NIGHTLY tests, and even then, with very low chances.
        cache = NO_OP_CACHE;
    }
    else
    {
        // this is slower than CL2O, but less memory consuming, and exercises finding categories on disk too.
        cache = new LruTaxonomyWriterCache(categoryCount / 10);
    }

    if (VERBOSE)
    {
        Console.WriteLine("TEST: use cache=" + cache);
    }

    var taxoWriter = new DirectoryTaxonomyWriter(dir, OpenMode.CREATE, cache);

    // Create all workers first, then start them all, then wait for them all.
    ThreadClass[] workers = new ThreadClass[AtLeast(4)];
    for (int w = 0; w < workers.Length; w++)
    {
        workers[w] = new ThreadAnonymousInnerClassHelper(this, selectionRange, remaining, added, taxoWriter);
    }
    for (int w = 0; w < workers.Length; w++)
    {
        workers[w].Start();
    }
    for (int w = 0; w < workers.Length; w++)
    {
        workers[w].Join();
    }
    taxoWriter.Dispose();

    DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);

    // +1 for root category
    int expectedSize = added.Count + 1;
    if (expectedSize != taxoReader.Size)
    {
        // Report every category the reader does not know before failing.
        foreach (string missingCandidate in added.Keys)
        {
            FacetLabel label = new FacetLabel(FacetsConfig.StringToPath(missingCandidate));
            if (taxoReader.GetOrdinal(label) == -1)
            {
                Console.WriteLine("FAIL: path=" + label + " not recognized");
            }
        }
        Fail("mismatch number of categories");
    }

    int[] parents = taxoReader.ParallelTaxonomyArrays.Parents();
    foreach (string cat in added.Keys)
    {
        FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(cat));
        Assert.True(taxoReader.GetOrdinal(cp) > 0, "category not found " + cp);

        // Walk the path one component at a time, checking each prefix's parent.
        int parentOrd = 0; // for root, parent is always virtual ROOT (ord=0)
        for (int depth = 1; depth <= cp.Length; depth++)
        {
            FacetLabel prefix = cp.Subpath(depth);
            int ord = taxoReader.GetOrdinal(prefix);
            Assert.AreEqual(parentOrd, parents[ord], "invalid parent for cp=" + prefix);
            parentOrd = ord; // next level should have this parent
        }
    }

    IOUtils.Close(taxoReader, dir);
}
/// <summary>
/// Builds both completion automata from the entries supplied by <paramref name="iterator"/>.
/// Each entry is written to a temporary file prefixed with its encoded weight, the file is
/// sorted offline, and each sorted entry is placed in a bucket derived from its line position
/// (entries with the same score as the previous line reuse the previous bucket).
/// </summary>
/// <param name="iterator">Source of entries and weights; must not report payloads or contexts.</param>
/// <exception cref="System.ArgumentException">
/// If <paramref name="iterator"/> reports payloads or contexts.
/// </exception>
public override void Build(InputIterator iterator)
{
    if (iterator.HasPayloads)
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    string tempPrefix = typeof(FSTCompletionLookup).Name;
    File unsortedFile = File.CreateTempFile(tempPrefix, ".input", OfflineSorter.defaultTempDir());
    File sortedFile = File.CreateTempFile(tempPrefix, ".sorted", OfflineSorter.defaultTempDir());

    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(unsortedFile);
    OfflineSorter.ByteSequencesReader reader = null;
    ExternalRefSorter sorter = null;

    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
    // If negative floats are allowed some trickery needs to be done to find their byte order.
    bool success = false;
    count = 0;
    try
    {
        // Phase 1: dump "<encoded weight><entry bytes>" records to the unsorted temp file.
        sbyte[] scratch = new sbyte[0];
        ByteArrayDataOutput scratchOut = new ByteArrayDataOutput(scratch);
        for (BytesRef entry = iterator.Next(); entry != null; entry = iterator.Next())
        {
            int required = entry.Length + 4; // 4 extra bytes for the int weight
            if (required >= scratch.Length)
            {
                scratch = ArrayUtil.Grow(scratch, required);
            }

            scratchOut.Reset(scratch);
            scratchOut.WriteInt(EncodeWeight(iterator.Weight));
            scratchOut.WriteBytes(entry.Bytes, entry.Offset, entry.Length);
            writer.Write(scratch, 0, scratchOut.Position);
        }
        writer.Dispose();

        // We don't know the distribution of scores and we need to bucket them, so we'll sort
        // and divide into equal buckets.
        OfflineSorter.SortInfo sortInfo = (new OfflineSorter()).Sort(unsortedFile, sortedFile);
        unsortedFile.Delete();

        sorter = new ExternalRefSorter(new OfflineSorter());
        FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter, sharedTailLength);

        // Phase 2: stream the sorted records back and feed them to the builder.
        int inputLines = sortInfo.Lines;
        reader = new OfflineSorter.ByteSequencesReader(sortedFile);
        long line = 0;
        int lastBucket = 0;
        int lastScore = 0;
        ByteArrayDataInput recordIn = new ByteArrayDataInput();
        BytesRef record = new BytesRef();
        BytesRef surface = new BytesRef();
        while (reader.Read(record))
        {
            recordIn.Reset(record.Bytes);
            int score = recordIn.ReadInt();

            // Equal consecutive scores stay in the same bucket; otherwise derive it from the line number.
            int bucket = (line > 0 && score == lastScore)
                ? lastBucket
                : (int)(line * buckets / inputLines);
            lastScore = score;
            lastBucket = bucket;

            // Only append the input, discard the weight.
            surface.Bytes = record.Bytes;
            surface.Offset = recordIn.Position;
            surface.Length = record.Length - recordIn.Position;
            builder.Add(surface, bucket);

            line++;
            count++;
        }

        // The two FSTCompletions share the same automaton.
        this.higherWeightsCompletion = builder.Build();
        this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);

        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Close(reader, writer, sorter);
        }
        else
        {
            IOUtils.CloseWhileHandlingException(reader, writer, sorter);
        }

        unsortedFile.Delete();
        sortedFile.Delete();
    }
}
/// <summary>
/// Build the suggest index, using up to the specified
/// amount of temporary RAM while building.  Note that
/// the weights for the suggestions are ignored.
/// </summary>
/// <remarks>
/// The suggestions are indexed into a temporary on-disk index, whose terms are then
/// moved into an FST. The temporary index files and directory are removed before
/// returning, and the temporary <c>Directory</c> is disposed even when that removal fails.
/// </remarks>
/// <param name="iterator">Source of suggestions; must not report payloads or contexts.</param>
/// <param name="ramBufferSizeMB">RAM buffer size, in MB, given to the temporary index writer.</param>
/// <exception cref="System.ArgumentException">
/// If the iterator reports payloads or contexts, if no suggestion produced any term,
/// or if a term contains more grams than <c>grams</c>.
/// </exception>
/// <exception cref="InvalidOperationException">
/// If a temporary index file or the temporary index directory could not be removed.
/// </exception>
public virtual void Build(InputIterator iterator, double ramBufferSizeMB)
{
    if (iterator.HasPayloads())
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts())
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();
    // TODO: messy ... java7 has Files.createTempDirectory
    // ... but 4.x is java6:
    File tempIndexPath = null;
    Random random = new Random();
    // Keep drawing random names until a fresh directory could be created.
    while (true)
    {
        tempIndexPath = new File(directory, prefix + ".index." + random.Next(int.MaxValue));
        if (tempIndexPath.mkdir())
        {
            break;
        }
    }

    Directory dir = FSDirectory.Open(tempIndexPath);

    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_CURRENT, indexAnalyzer);
    iwc.OpenMode = IndexWriterConfig.OpenMode.CREATE;
    iwc.RAMBufferSizeMB = ramBufferSizeMB;
    IndexWriter writer = new IndexWriter(dir, iwc);

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    // TODO: if only we had IndexOptions.TERMS_ONLY...
    ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS;
    ft.OmitNorms = true;
    ft.Freeze();

    // A single document/field pair is reused for every suggestion.
    Document doc = new Document();
    Field field = new Field("body", "", ft);
    doc.Add(field);

    totTokens = 0;
    IndexReader reader = null;

    bool success = false;
    count = 0;
    try
    {
        while (true)
        {
            BytesRef surfaceForm = iterator.Next();
            if (surfaceForm == null)
            {
                break;
            }
            field.StringValue = surfaceForm.Utf8ToString();
            writer.AddDocument(doc);
            count++;
        }
        reader = DirectoryReader.Open(writer, false);

        Terms terms = MultiFields.GetTerms(reader, "body");
        if (terms == null)
        {
            throw new System.ArgumentException("need at least one suggestion");
        }

        // Move all ngrams into an FST:
        TermsEnum termsEnum = terms.Iterator(null);

        Outputs<long?> outputs = PositiveIntOutputs.Singleton;
        Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);

        IntsRef scratchInts = new IntsRef();
        while (true)
        {
            BytesRef term = termsEnum.next();
            if (term == null)
            {
                break;
            }
            int ngramCount = countGrams(term);
            if (ngramCount > grams)
            {
                throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
            }
            if (ngramCount == 1)
            {
                // Only unigrams contribute to the total token count.
                totTokens += termsEnum.TotalTermFreq();
            }

            builder.Add(Util.ToIntsRef(term, scratchInts), encodeWeight(termsEnum.TotalTermFreq()));
        }

        fst = builder.Finish();
        if (fst == null)
        {
            throw new System.ArgumentException("need at least one suggestion");
        }
        //System.out.println("FST: " + fst.getNodeCount() + " nodes");

        /*
         * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
         * Util.toDot(fst, pw, true, true);
         * pw.close();
         */

        success = true;
    }
    finally
    {
        try
        {
            if (success)
            {
                IOUtils.Close(writer, reader);
            }
            else
            {
                IOUtils.CloseWhileHandlingException(writer, reader);
            }
        }
        finally
        {
            try
            {
                foreach (string file in dir.ListAll())
                {
                    File path = new File(tempIndexPath, file);
                    if (path.Delete() == false)
                    {
                        throw new InvalidOperationException("failed to remove " + path);
                    }
                }

                if (tempIndexPath.Delete() == false)
                {
                    throw new InvalidOperationException("failed to remove " + tempIndexPath);
                }
            }
            finally
            {
                // Dispose the directory even when a delete above threw; otherwise
                // a cleanup failure would leave the directory undisposed.
                dir.Dispose();
            }
        }
    }
}
/// <summary>
/// expert: instantiates a new reader.
/// <para>
/// Reads and validates the metadata file (header, per-field entries, then footer or EOF
/// depending on the format version), then opens the data file and checks that its header
/// version matches the metadata version.
/// </para>
/// </summary>
/// <param name="state">Segment read state supplying the directory, context, segment info and field infos.</param>
/// <param name="dataCodec">Codec name expected in the data file header.</param>
/// <param name="dataExtension">File extension of the data file.</param>
/// <param name="metaCodec">Codec name expected in the metadata file header.</param>
/// <param name="metaExtension">File extension of the metadata file.</param>
/// <exception cref="Lucene.Net.Index.CorruptIndexException">
/// If the data file's format version differs from the metadata file's.
/// </exception>
protected internal Lucene45DocValuesProducer(SegmentReadState state, string dataCodec, string dataExtension, string metaCodec, string metaExtension)
{
    string metaName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, metaExtension);
    // read in the entries from the metadata file.
    ChecksumIndexInput @in = state.Directory.OpenChecksumInput(metaName, state.Context);
    this.MaxDoc = state.SegmentInfo.DocCount;
    bool success = false;
    try
    {
        Version = CodecUtil.CheckHeader(@in, metaCodec, Lucene45DocValuesFormat.VERSION_START, Lucene45DocValuesFormat.VERSION_CURRENT);
        Numerics = new Dictionary<int, NumericEntry>();
        Ords = new Dictionary<int, NumericEntry>();
        OrdIndexes = new Dictionary<int, NumericEntry>();
        Binaries = new Dictionary<int, BinaryEntry>();
        SortedSets = new Dictionary<int, SortedSetEntry>();
        ReadFields(@in, state.FieldInfos);

        // Versions from VERSION_CHECKSUM on get a footer check; older ones only an EOF check.
        if (Version >= Lucene45DocValuesFormat.VERSION_CHECKSUM)
        {
            CodecUtil.CheckFooter(@in);
        }
        else
        {
            CodecUtil.CheckEOF(@in);
        }

        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Close(@in);
        }
        else
        {
            IOUtils.CloseWhileHandlingException(@in);
        }
    }

    success = false;
    try
    {
        string dataName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, dataExtension);
        Data = state.Directory.OpenInput(dataName, state.Context);
        int version2 = CodecUtil.CheckHeader(Data, dataCodec, Lucene45DocValuesFormat.VERSION_START, Lucene45DocValuesFormat.VERSION_CURRENT);
        if (Version != version2)
        {
            // Throw the specific corruption type rather than bare Exception,
            // so callers can tell a damaged index from an arbitrary failure.
            throw new Lucene.Net.Index.CorruptIndexException("Format versions mismatch");
        }

        success = true;
    }
    finally
    {
        // On failure the data input must not stay open; on success it is kept in Data.
        if (!success)
        {
            IOUtils.CloseWhileHandlingException(this.Data);
        }
    }

    RamBytesUsed_Renamed = new AtomicLong(RamUsageEstimator.ShallowSizeOfInstance(this.GetType()));
}
/// <summary>
/// Creates the producer: reads and validates the metadata file (header, per-field entries,
/// then footer or EOF depending on the format version), then opens the data file and checks
/// that its header version matches the metadata version.
/// </summary>
/// <param name="state">Segment read state supplying the directory, context, segment info and field infos.</param>
/// <param name="dataCodec">Codec name expected in the data file header.</param>
/// <param name="dataExtension">File extension of the data file.</param>
/// <param name="metaCodec">Codec name expected in the metadata file header.</param>
/// <param name="metaExtension">File extension of the metadata file.</param>
/// <exception cref="CorruptIndexException">
/// If the data file's format version differs from the metadata file's.
/// </exception>
internal MemoryDocValuesProducer(SegmentReadState state, string dataCodec, string dataExtension, string metaCodec, string metaExtension)
{
    maxDoc = state.SegmentInfo.DocCount;
    string metaName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, metaExtension);
    // read in the entries from the metadata file.
    ChecksumIndexInput @in = state.Directory.OpenChecksumInput(metaName, state.Context);
    bool success = false;
    try
    {
        version = CodecUtil.CheckHeader(@in, metaCodec, VERSION_START, VERSION_CURRENT);

        // C# has no diamond operator, so the type arguments must be spelled out.
        // NOTE(review): key/value types are assumed (int keys; NumericEntry, BinaryEntry,
        // FSTEntry values) -- confirm against the field declarations.
        numerics = new Dictionary<int, NumericEntry>();
        binaries = new Dictionary<int, BinaryEntry>();
        fsts = new Dictionary<int, FSTEntry>();
        ReadFields(@in, state.FieldInfos);

        // Versions from VERSION_CHECKSUM on get a footer check; older ones only an EOF check.
        if (version >= VERSION_CHECKSUM)
        {
            CodecUtil.CheckFooter(@in);
        }
        else
        {
            CodecUtil.CheckEOF(@in);
        }

        ramBytesUsed_Renamed = new AtomicLong(RamUsageEstimator.ShallowSizeOfInstance(this.GetType()));
        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Close(@in);
        }
        else
        {
            IOUtils.CloseWhileHandlingException(@in);
        }
    }

    success = false;
    try
    {
        string dataName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, dataExtension);
        data = state.Directory.OpenInput(dataName, state.Context);
        int version2 = CodecUtil.CheckHeader(data, dataCodec, VERSION_START, VERSION_CURRENT);
        if (version != version2)
        {
            throw new CorruptIndexException("Format versions mismatch");
        }

        success = true;
    }
    finally
    {
        // On failure the data input must not stay open; on success it is kept in the data field.
        if (!success)
        {
            IOUtils.CloseWhileHandlingException(this.data);
        }
    }
}