private GroupedFacetResult CreateExpectedFacetResult(string searchTerm, IndexContext context, int offset, int limit, int minCount, bool orderByCount, string facetPrefix) { JCG.Dictionary <string, ISet <string> > facetGroups; if (!context.searchTermToFacetGroups.TryGetValue(searchTerm, out facetGroups)) { facetGroups = new JCG.Dictionary <string, ISet <string> >(); } int totalCount = 0; int totalMissCount = 0; ISet <string> facetValues; if (facetPrefix != null) { facetValues = new JCG.HashSet <string>(); foreach (string facetValue in context.facetValues) { if (facetValue != null && facetValue.StartsWith(facetPrefix, StringComparison.Ordinal)) { facetValues.add(facetValue); } } } else { facetValues = context.facetValues; } List <TermGroupFacetCollector.FacetEntry> entries = new List <TermGroupFacetCollector.FacetEntry>(facetGroups.size()); // also includes facets with count 0 foreach (string facetValue in facetValues) { if (facetValue == null) { continue; } int count = facetGroups.TryGetValue(facetValue, out ISet <string> groups) && groups != null?groups.size() : 0; if (count >= minCount) { entries.Add(new TermGroupFacetCollector.FacetEntry(new BytesRef(facetValue), count)); } totalCount += count; } // Only include null count when no facet prefix is specified if (facetPrefix == null) { if (facetGroups.TryGetValue(null, out ISet <string> groups) && groups != null) { totalMissCount = groups.size(); } } entries.Sort(new ComparerAnonymousHelper2(orderByCount)); int endOffset = offset + limit; List <TermGroupFacetCollector.FacetEntry> entriesResult; if (offset >= entries.size()) { entriesResult = new List <TermGroupFacetCollector.FacetEntry>(); } else if (endOffset >= entries.size()) { entriesResult = entries.GetRange(offset, entries.size() - offset); } else { entriesResult = entries.GetRange(offset, endOffset - offset); } return(new GroupedFacetResult(totalCount, totalMissCount, entriesResult)); }
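// A minimal standalone sketch (hypothetical "Paging" helper, not part of the code above) of the same offset/limit slicing that CreateExpectedFacetResult applies to its sorted entry list: return an empty list when the offset is past the end, otherwise at most 'limit' entries starting at 'offset'.
using System.Collections.Generic;

internal static class Paging
{
    // Returns the sub-range [offset, offset + limit) of 'entries', clamped to the list bounds.
    public static List<T> Page<T>(List<T> entries, int offset, int limit)
    {
        int endOffset = offset + limit;
        if (offset >= entries.Count)
        {
            return new List<T>();
        }
        if (endOffset >= entries.Count)
        {
            return entries.GetRange(offset, entries.Count - offset);
        }
        return entries.GetRange(offset, endOffset - offset);
    }
}
// Example: Paging.Page(entries, offset: 2, limit: 3) yields entries[2..5], or fewer if the list is shorter.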
public virtual void TestSurrogatesOrder() { Directory dir = NewDirectory(); var config = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)); config.Codec = new PreFlexRWCodec(); RandomIndexWriter w = new RandomIndexWriter(Random, dir, config); int numField = TestUtil.NextInt32(Random, 2, 5); int uniqueTermCount = 0; int tc = 0; var fieldTerms = new List <Term>(); for (int f = 0; f < numField; f++) { string field = "f" + f; int numTerms = AtLeast(200); ISet <string> uniqueTerms = new JCG.HashSet <string>(); for (int i = 0; i < numTerms; i++) { string term = GetRandomString(Random) + "_ " + (tc++); uniqueTerms.Add(term); fieldTerms.Add(new Term(field, term)); Documents.Document doc = new Documents.Document(); doc.Add(NewStringField(field, term, Field.Store.NO)); w.AddDocument(doc); } uniqueTermCount += uniqueTerms.Count; } IndexReader reader = w.GetReader(); if (VERBOSE) { fieldTerms.Sort(TermAsUTF16Comparer); Console.WriteLine("\nTEST: UTF16 order"); foreach (Term t in fieldTerms) { Console.WriteLine(" " + ToHexString(t)); } } // sorts in code point order: fieldTerms.Sort(); if (VERBOSE) { Console.WriteLine("\nTEST: codepoint order"); foreach (Term t in fieldTerms) { Console.WriteLine(" " + ToHexString(t)); } } Term[] fieldTermsArray = fieldTerms.ToArray(); //SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms); //FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1)); //Assert.IsNotNull(fields); DoTestStraightEnum(fieldTerms, reader, uniqueTermCount); DoTestSeekExists(Random, fieldTerms, reader); DoTestSeekDoesNotExist(Random, numField, fieldTerms, fieldTermsArray, reader); reader.Dispose(); w.Dispose(); dir.Dispose(); }
/// <summary> /// Retrieve suggestions. /// </summary> public virtual IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, int num) { if (contexts != null) { throw new ArgumentException("this suggester doesn't support contexts"); } TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString()); try { ITermToBytesRefAttribute termBytesAtt = ts.AddAttribute <ITermToBytesRefAttribute>(); IOffsetAttribute offsetAtt = ts.AddAttribute <IOffsetAttribute>(); IPositionLengthAttribute posLenAtt = ts.AddAttribute <IPositionLengthAttribute>(); IPositionIncrementAttribute posIncAtt = ts.AddAttribute <IPositionIncrementAttribute>(); ts.Reset(); var lastTokens = new BytesRef[grams]; //System.out.println("lookup: key='" + key + "'"); // Run full analysis, but save only the // last 1gram, last 2gram, etc.: BytesRef tokenBytes = termBytesAtt.BytesRef; int maxEndOffset = -1; bool sawRealToken = false; while (ts.IncrementToken()) { termBytesAtt.FillBytesRef(); sawRealToken |= tokenBytes.Length > 0; // TODO: this is somewhat iffy; today, ShingleFilter // sets posLen to the gram count; maybe we should make // a separate dedicated att for this? int gramCount = posLenAtt.PositionLength; if (Debugging.AssertsEnabled) { Debugging.Assert(gramCount <= grams); } // Safety: make sure the recalculated count "agrees": if (CountGrams(tokenBytes) != gramCount) { throw new ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes)); } maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset); lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes); } ts.End(); if (!sawRealToken) { throw new ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings"); } // Carefully fill last tokens with _ tokens; // ShingleFilter apparently won't emit "only hole" // tokens: int endPosInc = posIncAtt.PositionIncrement; // Note this will also be true if input is the empty // string (in which case we saw no tokens and // maxEndOffset is still -1), which in fact works out OK // because we fill the unigram with an empty BytesRef // below: bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0; //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset); if (lastTokenEnded) { //System.out.println(" lastTokenEnded"); // If user hit space after the last token, then // "upgrade" all tokens.
This way "foo " will suggest // all bigrams starting w/ foo, and not any unigrams // starting with "foo": for (int i = grams - 1; i > 0; i--) { BytesRef token = lastTokens[i - 1]; if (token == null) { continue; } token.Grow(token.Length + 1); token.Bytes[token.Length] = separator; token.Length++; lastTokens[i] = token; } lastTokens[0] = new BytesRef(); } var arc = new FST.Arc <long?>(); var bytesReader = fst.GetBytesReader(); // Try highest order models first, and if they return // results, return that; else, fallback: double backoff = 1.0; List <LookupResult> results = new List <LookupResult>(num); // We only add a given suffix once, from the highest // order model that saw it; for subsequent lower order // models we skip it: var seen = new JCG.HashSet <BytesRef>(); for (int gram = grams - 1; gram >= 0; gram--) { BytesRef token = lastTokens[gram]; // Don't make unigram predictions from empty string: if (token == null || (token.Length == 0 && key.Length > 0)) { // Input didn't have enough tokens: //System.out.println(" gram=" + gram + ": skip: not enough input"); continue; } if (endPosInc > 0 && gram <= endPosInc) { // Skip hole-only predictions; in theory we // shouldn't have to do this, but we'd need to fix // ShingleFilter to produce only-hole tokens: //System.out.println(" break: only holes now"); break; } //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString()); // TODO: we could add fuzziness here // match the prefix portion exactly //Pair<Long,BytesRef> prefixOutput = null; long?prefixOutput = null; try { prefixOutput = LookupPrefix(fst, bytesReader, token, arc); } catch (IOException bogus) { throw new Exception(bogus.ToString(), bogus); } //System.out.println(" prefixOutput=" + prefixOutput); if (prefixOutput == null) { // This model never saw this prefix, e.g. the // trigram model never saw context "purple mushroom" backoff *= ALPHA; continue; } // TODO: we could do this division at build time, and // bake it into the FST? // Denominator for computing scores from current // model's predictions: long contextCount = totTokens; BytesRef lastTokenFragment = null; for (int i = token.Length - 1; i >= 0; i--) { if (token.Bytes[token.Offset + i] == separator) { BytesRef context = new BytesRef(token.Bytes, token.Offset, i); long? output = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef())); if (Debugging.AssertsEnabled) { Debugging.Assert(output != null); } contextCount = DecodeWeight(output); lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1); break; } } BytesRef finalLastToken; if (lastTokenFragment == null) { finalLastToken = BytesRef.DeepCopyOf(token); } else { finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment); } if (Debugging.AssertsEnabled) { Debugging.Assert(finalLastToken.Offset == 0); } CharsRef spare = new CharsRef(); // complete top-N Util.Fst.Util.TopResults <long?> completions = null; try { // Because we store multiple models in one FST // (1gram, 2gram, 3gram), we must restrict the // search so that it only considers the current // model. 
For highest order model, this is not // necessary since all completions in the FST // must be from this model, but for lower order // models we have to filter out the higher order // ones: // Must do num+seen.size() for queue depth because we may // reject up to seen.size() paths in acceptResult(): Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken); // since this search is initialized with a single start node // it is okay to start with an empty input path here searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef()); completions = searcher.Search(); if (Debugging.AssertsEnabled) { Debugging.Assert(completions.IsComplete); } } catch (IOException bogus) { throw new Exception(bogus.ToString(), bogus); } int prefixLength = token.Length; BytesRef suffix = new BytesRef(8); //System.out.println(" " + completions.length + " completions"); foreach (Util.Fst.Util.Result <long?> completion in completions) { token.Length = prefixLength; // append suffix Util.Fst.Util.ToBytesRef(completion.Input, suffix); token.Append(suffix); //System.out.println(" completion " + token.utf8ToString()); // Skip this path if a higher-order model already // saw/predicted its last token: BytesRef lastToken = token; for (int i = token.Length - 1; i >= 0; i--) { if (token.Bytes[token.Offset + i] == separator) { if (Debugging.AssertsEnabled) { Debugging.Assert(token.Length - i - 1 > 0); } lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1); break; } } if (seen.Contains(lastToken)) { //System.out.println(" skip dup " + lastToken.utf8ToString()); goto nextCompletionContinue; } seen.Add(BytesRef.DeepCopyOf(lastToken)); spare.Grow(token.Length); UnicodeUtil.UTF8toUTF16(token, spare); LookupResult result = new LookupResult(spare.ToString(), // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes // return numbers that are greater than long.MaxValue, which results in a negative long number. (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount)); results.Add(result); if (Debugging.AssertsEnabled) { Debugging.Assert(results.Count == seen.Count); } //System.out.println(" add result=" + result); nextCompletionContinue :; } backoff *= ALPHA; } results.Sort(Comparer <Lookup.LookupResult> .Create((a, b) => { if (a.Value > b.Value) { return(-1); } else if (a.Value < b.Value) { return(1); } else { // Tie break by UTF16 sort order: return(a.Key.CompareToOrdinal(b.Key)); } })); if (results.Count > num) { results.RemoveRange(num, results.Count - num); //results.SubList(num, results.Count).Clear(); } return(results); } finally { IOUtils.DisposeWhileHandlingException(ts); } }
private void AddTerms(IndexReader reader, FieldVals f) { if (f.queryString == null) { return; } Terms terms = MultiFields.GetTerms(reader, f.fieldName); if (terms == null) { return; } TokenStream ts = analyzer.GetTokenStream(f.fieldName, f.queryString); try { ICharTermAttribute termAtt = ts.AddAttribute <ICharTermAttribute>(); int corpusNumDocs = reader.NumDocs; ISet <string> processedTerms = new JCG.HashSet <string>(); ts.Reset(); while (ts.IncrementToken()) { string term = termAtt.ToString(); if (!processedTerms.Contains(term)) { processedTerms.Add(term); ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term float minScore = 0; Term startTerm = new Term(f.fieldName, term); AttributeSource atts = new AttributeSource(); IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>(); #pragma warning disable 612, 618 SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength); #pragma warning restore 612, 618 //store the df so all variants use same idf int df = reader.DocFreq(startTerm); int numVariants = 0; int totalVariantDocFreqs = 0; BytesRef possibleMatch; IBoostAttribute boostAtt = fe.Attributes.AddAttribute <IBoostAttribute>(); while (fe.MoveNext()) { possibleMatch = fe.Term; numVariants++; totalVariantDocFreqs += fe.DocFreq; float score = boostAtt.Boost; if (variantsQ.Count < MAX_VARIANTS_PER_TERM || score > minScore) { ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm); variantsQ.InsertWithOverflow(st); minScore = variantsQ.Top.Score; // maintain minScore } maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Count >= MAX_VARIANTS_PER_TERM ? minScore : float.NegativeInfinity; } if (numVariants > 0) { int avgDf = totalVariantDocFreqs / numVariants; if (df == 0) //no direct match we can use as df for all variants { df = avgDf; //use avg df of all variants } // take the top variants (scored by edit distance) and reset the score // to include an IDF factor then add to the global queue for ranking // overall top query terms int size = variantsQ.Count; for (int i = 0; i < size; i++) { ScoreTerm st = variantsQ.Pop(); st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs); q.InsertWithOverflow(st); } } } } ts.End(); } finally { IOUtils.DisposeWhileHandlingException(ts); } }
public virtual void TestIndexing() { DirectoryInfo tmpDir = CreateTempDir("TestNeverDelete"); BaseDirectoryWrapper d = NewFSDirectory(tmpDir); // We want to "see" files removed if Lucene removed // them. this is still worth running on Windows since // some files the IR opens and closes. if (d is MockDirectoryWrapper) { ((MockDirectoryWrapper)d).NoDeleteOpenFile = false; } RandomIndexWriter w = new RandomIndexWriter(Random, d, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetIndexDeletionPolicy(NoDeletionPolicy.INSTANCE)); w.IndexWriter.Config.SetMaxBufferedDocs(TestUtil.NextInt32(Random, 5, 30)); w.Commit(); ThreadJob[] indexThreads = new ThreadJob[Random.Next(4)]; long stopTime = Environment.TickCount + AtLeast(1000); for (int x = 0; x < indexThreads.Length; x++) { indexThreads[x] = new ThreadAnonymousInnerClassHelper(w, stopTime, NewStringField, NewTextField); indexThreads[x].Name = "Thread " + x; indexThreads[x].Start(); } ISet <string> allFiles = new JCG.HashSet <string>(); DirectoryReader r = DirectoryReader.Open(d); while (Environment.TickCount < stopTime) { IndexCommit ic = r.IndexCommit; if (Verbose) { Console.WriteLine("TEST: check files: " + ic.FileNames); } allFiles.UnionWith(ic.FileNames); // Make sure no old files were removed foreach (string fileName in allFiles) { Assert.IsTrue(SlowFileExists(d, fileName), "file " + fileName + " does not exist"); } DirectoryReader r2 = DirectoryReader.OpenIfChanged(r); if (r2 != null) { r.Dispose(); r = r2; } Thread.Sleep(1); } r.Dispose(); foreach (ThreadJob t in indexThreads) { t.Join(); } w.Dispose(); d.Dispose(); System.IO.Directory.Delete(tmpDir.FullName, true); }
private void CreateRandomIndexes() { dir1 = NewDirectory(); dir2 = NewDirectory(); int numDocs = AtLeast(150); int numTerms = TestUtil.NextInt32(Random, 1, numDocs / 5); ISet <string> randomTerms = new JCG.HashSet <string>(); while (randomTerms.size() < numTerms) { randomTerms.add(TestUtil.RandomSimpleString(Random)); } terms = new List <string>(randomTerms); long seed = Random.NextInt64(); IndexWriterConfig iwc1 = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(new Random((int)seed))); IndexWriterConfig iwc2 = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(new Random((int)seed))); iwc2.SetMergePolicy(NewSortingMergePolicy(sort)); RandomIndexWriter iw1 = new RandomIndexWriter(new Random((int)seed), dir1, iwc1); RandomIndexWriter iw2 = new RandomIndexWriter(new Random((int)seed), dir2, iwc2); for (int i = 0; i < numDocs; ++i) { if (Random.nextInt(5) == 0 && i != numDocs - 1) { string term = RandomPicks.RandomFrom(Random, terms); iw1.DeleteDocuments(new Term("s", term)); iw2.DeleteDocuments(new Term("s", term)); } Document doc = randomDocument(); iw1.AddDocument(doc); iw2.AddDocument(doc); if (Random.nextInt(8) == 0) { iw1.Commit(); iw2.Commit(); } } // Make sure we have something to merge iw1.Commit(); iw2.Commit(); Document doc2 = randomDocument(); // NOTE: don't use RIW.addDocument directly, since it sometimes commits // which may trigger a merge, in which case forceMerge may not do anything. // With field updates this is a problem, since the updates can go into the // single segment in the index, and therefore the index won't be sorted. // This hurts the assumption of the test later on, that the index is sorted // by SortingMP. iw1.IndexWriter.AddDocument(doc2); iw2.IndexWriter.AddDocument(doc2); if (DefaultCodecSupportsFieldUpdates) { // update NDV of docs belonging to one term (covers many documents) long value = Random.NextInt64(); string term = RandomPicks.RandomFrom(Random, terms); iw1.IndexWriter.UpdateNumericDocValue(new Term("s", term), "ndv", value); iw2.IndexWriter.UpdateNumericDocValue(new Term("s", term), "ndv", value); } iw1.ForceMerge(1); iw2.ForceMerge(1); iw1.Dispose(); iw2.Dispose(); reader = DirectoryReader.Open(dir1); sortedReader = DirectoryReader.Open(dir2); }
static TestRandomChains() { try { brokenConstructors[typeof(LimitTokenCountFilter).GetConstructor(new Type[] { typeof(TokenStream), typeof(int) })] = ALWAYS; brokenConstructors[typeof(LimitTokenCountFilter).GetConstructor(new Type[] { typeof(TokenStream), typeof(int), typeof(bool) })] = new PredicateAnonymousInnerClassHelper2(); brokenConstructors[typeof(LimitTokenPositionFilter).GetConstructor(new Type[] { typeof(TokenStream), typeof(int) })] = ALWAYS; brokenConstructors[typeof(LimitTokenPositionFilter).GetConstructor(new Type[] { typeof(TokenStream), typeof(int), typeof(bool) })] = new PredicateAnonymousInnerClassHelper3(); foreach (Type c in new Type[] { // TODO: can we promote some of these to be only // offsets offenders? // doesn't actually reset itself: typeof(CachingTokenFilter), // Not broken, simulates brokenness: typeof(CrankyTokenFilter), // Not broken: we forcefully add this, so we shouldn't // also randomly pick it: typeof(ValidatingTokenFilter) }) { foreach (ConstructorInfo ctor in c.GetConstructors()) { brokenConstructors[ctor] = ALWAYS; } } } catch (Exception e) { throw new Exception(e.Message, e); } try { foreach (Type c in new Type[] { typeof(ReversePathHierarchyTokenizer), typeof(PathHierarchyTokenizer), // TODO: it seems to mess up offsets!? typeof(WikipediaTokenizer), // TODO: doesn't handle graph inputs typeof(CJKBigramFilter), // TODO: doesn't handle graph inputs (or even look at positionIncrement) typeof(HyphenatedWordsFilter), // TODO: LUCENE-4983 typeof(CommonGramsFilter), // TODO: doesn't handle graph inputs typeof(CommonGramsQueryFilter), // TODO: probably doesn't handle graph inputs, too afraid to try typeof(WordDelimiterFilter) }) { foreach (ConstructorInfo ctor in c.GetConstructors()) { brokenOffsetsConstructors[ctor] = ALWAYS; } } } catch (Exception e) { throw new Exception(e.Message, e); } allowedTokenizerArgs = new JCG.HashSet <Type>(IdentityEqualityComparer <Type> .Default); allowedTokenizerArgs.addAll(argProducers.Keys); allowedTokenizerArgs.Add(typeof(TextReader)); allowedTokenizerArgs.Add(typeof(AttributeSource.AttributeFactory)); allowedTokenizerArgs.Add(typeof(AttributeSource)); allowedTokenFilterArgs = new JCG.HashSet <Type>(IdentityEqualityComparer <Type> .Default); allowedTokenFilterArgs.addAll(argProducers.Keys); allowedTokenFilterArgs.Add(typeof(TokenStream)); // TODO: fix this one, that's broken: allowedTokenFilterArgs.Add(typeof(CommonGramsFilter)); allowedCharFilterArgs = new JCG.HashSet <Type>(IdentityEqualityComparer <Type> .Default); allowedCharFilterArgs.addAll(argProducers.Keys); allowedCharFilterArgs.Add(typeof(TextReader)); }
public virtual void TestStressAdvance_Mem() { for (int iter = 0; iter < 3; iter++) { if (Verbose) { Console.WriteLine("\nTEST: iter=" + iter); } Directory dir = NewDirectory(); RandomIndexWriter w = new RandomIndexWriter( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif Random, dir); ISet <int> aDocs = new JCG.HashSet <int>(); Documents.Document doc = new Documents.Document(); Field f = NewStringField("field", "", Field.Store.NO); doc.Add(f); Field idField = NewStringField("id", "", Field.Store.YES); doc.Add(idField); int num = AtLeast(4097); if (Verbose) { Console.WriteLine("\nTEST: numDocs=" + num); } for (int id = 0; id < num; id++) { if (Random.Next(4) == 3) { f.SetStringValue("a"); aDocs.Add(id); } else { f.SetStringValue("b"); } idField.SetStringValue("" + id); w.AddDocument(doc); if (Verbose) { Console.WriteLine("\nTEST: doc upto " + id); } } w.ForceMerge(1); IList <int> aDocIDs = new JCG.List <int>(); IList <int> bDocIDs = new JCG.List <int>(); DirectoryReader r = w.GetReader(); int[] idToDocID = new int[r.MaxDoc]; for (int docID = 0; docID < idToDocID.Length; docID++) { int id = Convert.ToInt32(r.Document(docID).Get("id")); if (aDocs.Contains(id)) { aDocIDs.Add(docID); } else { bDocIDs.Add(docID); } } TermsEnum te = GetOnlySegmentReader(r).Fields.GetTerms("field").GetEnumerator(); DocsEnum de = null; for (int iter2 = 0; iter2 < 10; iter2++) { if (Verbose) { Console.WriteLine("\nTEST: iter=" + iter + " iter2=" + iter2); } Assert.AreEqual(TermsEnum.SeekStatus.FOUND, te.SeekCeil(new BytesRef("a"))); de = TestUtil.Docs(Random, te, null, de, DocsFlags.NONE); TestOne(de, aDocIDs); Assert.AreEqual(TermsEnum.SeekStatus.FOUND, te.SeekCeil(new BytesRef("b"))); de = TestUtil.Docs(Random, te, null, de, DocsFlags.NONE); TestOne(de, bDocIDs); } w.Dispose(); r.Dispose(); dir.Dispose(); } }
public void TestTerms() { Random random = Random; int num = AtLeast(10000); #pragma warning disable 612, 618 IComparer <BytesRef> comparer = random.nextBoolean() ? BytesRef.UTF8SortedAsUnicodeComparer : BytesRef.UTF8SortedAsUTF16Comparer; #pragma warning restore 612, 618 IDictionary <BytesRef, KeyValuePair <long, BytesRef> > sorted = new JCG.SortedDictionary <BytesRef, KeyValuePair <long, BytesRef> >(comparer); IDictionary <BytesRef, long> sortedWithoutPayload = new JCG.SortedDictionary <BytesRef, long>(comparer); IDictionary <BytesRef, KeyValuePair <long, ISet <BytesRef> > > sortedWithContext = new JCG.SortedDictionary <BytesRef, KeyValuePair <long, ISet <BytesRef> > >(comparer); IDictionary <BytesRef, KeyValuePair <long, KeyValuePair <BytesRef, ISet <BytesRef> > > > sortedWithPayloadAndContext = new JCG.SortedDictionary <BytesRef, KeyValuePair <long, KeyValuePair <BytesRef, ISet <BytesRef> > > >(comparer); Input[] unsorted = new Input[num]; Input[] unsortedWithoutPayload = new Input[num]; Input[] unsortedWithContexts = new Input[num]; Input[] unsortedWithPayloadAndContext = new Input[num]; ISet <BytesRef> ctxs; for (int i = 0; i < num; i++) { BytesRef key2; BytesRef payload; ctxs = new JCG.HashSet <BytesRef>(); do { key2 = new BytesRef(TestUtil.RandomUnicodeString(random)); payload = new BytesRef(TestUtil.RandomUnicodeString(random)); for (int j = 0; j < AtLeast(2); j++) { ctxs.add(new BytesRef(TestUtil.RandomUnicodeString(random))); } } while (sorted.ContainsKey(key2)); long value = random.Next(); sortedWithoutPayload.Put(key2, value); sorted.Put(key2, new KeyValuePair <long, BytesRef>(value, payload)); sortedWithContext.Put(key2, new KeyValuePair <long, ISet <BytesRef> >(value, ctxs)); sortedWithPayloadAndContext.Put(key2, new KeyValuePair <long, KeyValuePair <BytesRef, ISet <BytesRef> > >(value, new KeyValuePair <BytesRef, ISet <BytesRef> >(payload, ctxs))); unsorted[i] = new Input(key2, value, payload); unsortedWithoutPayload[i] = new Input(key2, value); unsortedWithContexts[i] = new Input(key2, value, ctxs); unsortedWithPayloadAndContext[i] = new Input(key2, value, payload, ctxs); } // test the sorted iterator wrapper with payloads IInputIterator wrapper = new SortedInputIterator(new InputArrayIterator(unsorted), comparer); IEnumerator <KeyValuePair <BytesRef, KeyValuePair <long, BytesRef> > > expected = sorted.GetEnumerator(); while (expected.MoveNext()) { KeyValuePair <BytesRef, KeyValuePair <long, BytesRef> > entry = expected.Current; assertEquals(entry.Key, wrapper.Next()); assertEquals(Convert.ToInt64(entry.Value.Key), wrapper.Weight); assertEquals(entry.Value.Value, wrapper.Payload); } assertNull(wrapper.Next()); // test the sorted iterator wrapper with contexts wrapper = new SortedInputIterator(new InputArrayIterator(unsortedWithContexts), comparer); IEnumerator <KeyValuePair <BytesRef, KeyValuePair <long, ISet <BytesRef> > > > actualEntries = sortedWithContext.GetEnumerator(); while (actualEntries.MoveNext()) { KeyValuePair <BytesRef, KeyValuePair <long, ISet <BytesRef> > > entry = actualEntries.Current; assertEquals(entry.Key, wrapper.Next()); assertEquals(Convert.ToInt64(entry.Value.Key), wrapper.Weight); ISet <BytesRef> actualCtxs = entry.Value.Value; assertEquals(actualCtxs, wrapper.Contexts); } assertNull(wrapper.Next()); // test the sorted iterator wrapper with contexts and payload wrapper = new SortedInputIterator(new InputArrayIterator(unsortedWithPayloadAndContext), comparer); IEnumerator <KeyValuePair <BytesRef, KeyValuePair <long, KeyValuePair <BytesRef, ISet 
<BytesRef> > > > > expectedPayloadContextEntries = sortedWithPayloadAndContext.GetEnumerator(); while (expectedPayloadContextEntries.MoveNext()) { KeyValuePair <BytesRef, KeyValuePair <long, KeyValuePair <BytesRef, ISet <BytesRef> > > > entry = expectedPayloadContextEntries.Current; assertEquals(entry.Key, wrapper.Next()); assertEquals(Convert.ToInt64(entry.Value.Key), wrapper.Weight); ISet <BytesRef> actualCtxs = entry.Value.Value.Value; assertEquals(actualCtxs, wrapper.Contexts); BytesRef actualPayload = entry.Value.Value.Key; assertEquals(actualPayload, wrapper.Payload); } assertNull(wrapper.Next()); // test the unsorted iterator wrapper with payloads wrapper = new UnsortedInputIterator(new InputArrayIterator(unsorted)); IDictionary <BytesRef, KeyValuePair <long, BytesRef> > actual = new JCG.SortedDictionary <BytesRef, KeyValuePair <long, BytesRef> >(); BytesRef key; while ((key = wrapper.Next()) != null) { long value = wrapper.Weight; BytesRef payload = wrapper.Payload; actual.Put(BytesRef.DeepCopyOf(key), new KeyValuePair <long, BytesRef>(value, BytesRef.DeepCopyOf(payload))); } assertEquals(sorted, actual, aggressive: false); // test the sorted iterator wrapper without payloads IInputIterator wrapperWithoutPayload = new SortedInputIterator(new InputArrayIterator(unsortedWithoutPayload), comparer); IEnumerator <KeyValuePair <BytesRef, long> > expectedWithoutPayload = sortedWithoutPayload.GetEnumerator(); while (expectedWithoutPayload.MoveNext()) { KeyValuePair <BytesRef, long> entry = expectedWithoutPayload.Current; assertEquals(entry.Key, wrapperWithoutPayload.Next()); assertEquals(Convert.ToInt64(entry.Value), wrapperWithoutPayload.Weight); assertNull(wrapperWithoutPayload.Payload); } assertNull(wrapperWithoutPayload.Next()); // test the unsorted iterator wrapper without payloads wrapperWithoutPayload = new UnsortedInputIterator(new InputArrayIterator(unsortedWithoutPayload)); IDictionary <BytesRef, long> actualWithoutPayload = new JCG.SortedDictionary <BytesRef, long>(); while ((key = wrapperWithoutPayload.Next()) != null) { long value = wrapperWithoutPayload.Weight; assertNull(wrapperWithoutPayload.Payload); actualWithoutPayload.Put(BytesRef.DeepCopyOf(key), value); } assertEquals(sortedWithoutPayload, actualWithoutPayload, aggressive: false); }
public virtual void TestRandomWithPrefix() { Directory dir = NewDirectory(); ISet <string> prefixes = new JCG.HashSet <string>(); int numPrefix = TestUtil.NextInt32(Random, 2, 7); if (Verbose) { Console.WriteLine("TEST: use " + numPrefix + " prefixes"); } while (prefixes.Count < numPrefix) { prefixes.Add(TestUtil.RandomRealisticUnicodeString(Random)); //prefixes.Add(TestUtil.RandomSimpleString(random)); } string[] prefixesArray = prefixes.ToArray(/*new string[prefixes.Count]*/); int NUM_TERMS = AtLeast(20); ISet <BytesRef> terms = new JCG.HashSet <BytesRef>(); while (terms.Count < NUM_TERMS) { string s = prefixesArray[Random.Next(prefixesArray.Length)] + TestUtil.RandomRealisticUnicodeString(Random); //final String s = prefixesArray[random.nextInt(prefixesArray.Length)] + TestUtil.RandomSimpleString(random); if (s.Length > 0) { terms.Add(new BytesRef(s)); } } BytesRef[] termsArray = terms.ToArray(); Array.Sort(termsArray); int NUM_DOCS = AtLeast(100); IndexWriterConfig conf = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)); // Sometimes swap in codec that impls ord(): if (Random.Next(10) == 7) { Codec codec = TestUtil.AlwaysPostingsFormat(PostingsFormat.ForName("Lucene41WithOrds")); conf.SetCodec(codec); } RandomIndexWriter w = new RandomIndexWriter(Random, dir, conf); int[][] idToOrds = new int[NUM_DOCS][]; ISet <int?> ordsForDocSet = new JCG.HashSet <int?>(); for (int id = 0; id < NUM_DOCS; id++) { Document doc = new Document(); doc.Add(new Int32Field("id", id, Field.Store.NO)); int termCount = TestUtil.NextInt32(Random, 0, 20 * RandomMultiplier); while (ordsForDocSet.Count < termCount) { ordsForDocSet.Add(Random.Next(termsArray.Length)); } int[] ordsForDoc = new int[termCount]; int upto = 0; if (Verbose) { Console.WriteLine("TEST: doc id=" + id); } foreach (int ord in ordsForDocSet) { ordsForDoc[upto++] = ord; Field field = NewStringField("field", termsArray[ord].Utf8ToString(), Field.Store.NO); if (Verbose) { Console.WriteLine(" f=" + termsArray[ord].Utf8ToString()); } doc.Add(field); } ordsForDocSet.Clear(); Array.Sort(ordsForDoc); idToOrds[id] = ordsForDoc; w.AddDocument(doc); } DirectoryReader r = w.GetReader(); w.Dispose(); if (Verbose) { Console.WriteLine("TEST: reader=" + r); } AtomicReader slowR = SlowCompositeReaderWrapper.Wrap(r); foreach (string prefix in prefixesArray) { BytesRef prefixRef = prefix == null ? null : new BytesRef(prefix); int[][] idToOrdsPrefix = new int[NUM_DOCS][]; for (int id = 0; id < NUM_DOCS; id++) { int[] docOrds = idToOrds[id]; IList <int?> newOrds = new List <int?>(); foreach (int ord in idToOrds[id]) { if (StringHelper.StartsWith(termsArray[ord], prefixRef)) { newOrds.Add(ord); } } int[] newOrdsArray = new int[newOrds.Count]; int upto = 0; foreach (int ord in newOrds) { newOrdsArray[upto++] = ord; } idToOrdsPrefix[id] = newOrdsArray; } foreach (AtomicReaderContext ctx in r.Leaves) { if (Verbose) { Console.WriteLine("\nTEST: sub=" + ctx.Reader); } Verify((AtomicReader)ctx.Reader, idToOrdsPrefix, termsArray, prefixRef); } // Also test top-level reader: its enum does not support // ord, so this forces the OrdWrapper to run: if (Verbose) { Console.WriteLine("TEST: top reader"); } Verify(slowR, idToOrdsPrefix, termsArray, prefixRef); } FieldCache.DEFAULT.PurgeByCacheKey(slowR.CoreCacheKey); r.Dispose(); dir.Dispose(); }
private IndexIterationContext CreateContext(int nDocs, RandomIndexWriter fromWriter, RandomIndexWriter toWriter, bool multipleValuesPerDocument, bool scoreDocsInOrder) { IndexIterationContext context = new IndexIterationContext(); int numRandomValues = nDocs / 2; context.RandomUniqueValues = new string[numRandomValues]; ISet <string> trackSet = new JCG.HashSet <string>(); context.RandomFrom = new bool[numRandomValues]; for (int i = 0; i < numRandomValues; i++) { string uniqueRandomValue; do { uniqueRandomValue = TestUtil.RandomRealisticUnicodeString(Random); // uniqueRandomValue = TestUtil.randomSimpleString(random); } while ("".Equals(uniqueRandomValue, StringComparison.Ordinal) || trackSet.Contains(uniqueRandomValue)); // Generate unique values and empty strings aren't allowed. trackSet.Add(uniqueRandomValue); context.RandomFrom[i] = Random.NextBoolean(); context.RandomUniqueValues[i] = uniqueRandomValue; } RandomDoc[] docs = new RandomDoc[nDocs]; for (int i = 0; i < nDocs; i++) { string id = Convert.ToString(i); int randomI = Random.Next(context.RandomUniqueValues.Length); string value = context.RandomUniqueValues[randomI]; Document document = new Document(); document.Add(NewTextField(Random, "id", id, Field.Store.NO)); document.Add(NewTextField(Random, "value", value, Field.Store.NO)); bool from = context.RandomFrom[randomI]; int numberOfLinkValues = multipleValuesPerDocument ? 2 + Random.Next(10) : 1; docs[i] = new RandomDoc(id, numberOfLinkValues, value, from); for (int j = 0; j < numberOfLinkValues; j++) { string linkValue = context.RandomUniqueValues[Random.Next(context.RandomUniqueValues.Length)]; docs[i].linkValues.Add(linkValue); if (from) { if (!context.FromDocuments.TryGetValue(linkValue, out IList <RandomDoc> fromDocs)) { context.FromDocuments[linkValue] = fromDocs = new List <RandomDoc>(); } if (!context.RandomValueFromDocs.TryGetValue(value, out IList <RandomDoc> randomValueFromDocs)) { context.RandomValueFromDocs[value] = randomValueFromDocs = new List <RandomDoc>(); } fromDocs.Add(docs[i]); randomValueFromDocs.Add(docs[i]); document.Add(NewTextField(Random, "from", linkValue, Field.Store.NO)); } else { if (!context.ToDocuments.TryGetValue(linkValue, out IList <RandomDoc> toDocuments)) { context.ToDocuments[linkValue] = toDocuments = new List <RandomDoc>(); } if (!context.RandomValueToDocs.TryGetValue(value, out IList <RandomDoc> randomValueToDocs)) { context.RandomValueToDocs[value] = randomValueToDocs = new List <RandomDoc>(); } toDocuments.Add(docs[i]); randomValueToDocs.Add(docs[i]); document.Add(NewTextField(Random, "to", linkValue, Field.Store.NO)); } } RandomIndexWriter w; if (from) { w = fromWriter; } else { w = toWriter; } w.AddDocument(document); if (Random.Next(10) == 4) { w.Commit(); } if (Verbose) { Console.WriteLine("Added document[" + docs[i].id + "]: " + document); } } // Pre-compute all possible hits for all unique random values. On top of this also compute all possible score for // any ScoreMode. 
IndexSearcher fromSearcher = NewSearcher(fromWriter.GetReader()); IndexSearcher toSearcher = NewSearcher(toWriter.GetReader()); for (int i = 0; i < context.RandomUniqueValues.Length; i++) { string uniqueRandomValue = context.RandomUniqueValues[i]; string fromField; string toField; IDictionary <string, IDictionary <int, JoinScore> > queryVals; if (context.RandomFrom[i]) { fromField = "from"; toField = "to"; queryVals = context.FromHitsToJoinScore; } else { fromField = "to"; toField = "from"; queryVals = context.ToHitsToJoinScore; } IDictionary <BytesRef, JoinScore> joinValueToJoinScores = new Dictionary <BytesRef, JoinScore>(); if (multipleValuesPerDocument) { fromSearcher.Search(new TermQuery(new Term("value", uniqueRandomValue)), new CollectorAnonymousInnerClassHelper3(this, context, fromField, joinValueToJoinScores)); } else { fromSearcher.Search(new TermQuery(new Term("value", uniqueRandomValue)), new CollectorAnonymousInnerClassHelper4(this, context, fromField, joinValueToJoinScores)); } IDictionary <int, JoinScore> docToJoinScore = new Dictionary <int, JoinScore>(); if (multipleValuesPerDocument) { if (scoreDocsInOrder) { AtomicReader slowCompositeReader = SlowCompositeReaderWrapper.Wrap(toSearcher.IndexReader); Terms terms = slowCompositeReader.GetTerms(toField); if (terms != null) { DocsEnum docsEnum = null; TermsEnum termsEnum = null; JCG.SortedSet <BytesRef> joinValues = new JCG.SortedSet <BytesRef>(BytesRef.UTF8SortedAsUnicodeComparer); joinValues.UnionWith(joinValueToJoinScores.Keys); foreach (BytesRef joinValue in joinValues) { termsEnum = terms.GetIterator(termsEnum); if (termsEnum.SeekExact(joinValue)) { docsEnum = termsEnum.Docs(slowCompositeReader.LiveDocs, docsEnum, DocsFlags.NONE); JoinScore joinScore = joinValueToJoinScores[joinValue]; for (int doc = docsEnum.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docsEnum.NextDoc()) { // First encountered join value determines the score. // Something to keep in mind for many-to-many relations. if (!docToJoinScore.ContainsKey(doc)) { docToJoinScore[doc] = joinScore; } } } } } } else { toSearcher.Search(new MatchAllDocsQuery(), new CollectorAnonymousInnerClassHelper5(this, context, toField, joinValueToJoinScores, docToJoinScore)); } } else { toSearcher.Search(new MatchAllDocsQuery(), new CollectorAnonymousInnerClassHelper6(this, toField, joinValueToJoinScores, docToJoinScore)); } queryVals[uniqueRandomValue] = docToJoinScore; } fromSearcher.IndexReader.Dispose(); toSearcher.IndexReader.Dispose(); return(context); }
public virtual void TestDefaults() { IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)); Assert.AreEqual(typeof(MockAnalyzer), conf.Analyzer.GetType()); Assert.IsNull(conf.IndexCommit); Assert.AreEqual(typeof(KeepOnlyLastCommitDeletionPolicy), conf.IndexDeletionPolicy.GetType()); #if !FEATURE_CONCURRENTMERGESCHEDULER Assert.AreEqual(typeof(TaskMergeScheduler), conf.MergeScheduler.GetType()); #else Assert.AreEqual(typeof(ConcurrentMergeScheduler), conf.MergeScheduler.GetType()); #endif Assert.AreEqual(OpenMode.CREATE_OR_APPEND, conf.OpenMode); // we don't need to assert this, it should be unspecified Assert.IsTrue(IndexSearcher.DefaultSimilarity == conf.Similarity); Assert.AreEqual(IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL, conf.TermIndexInterval); Assert.AreEqual(IndexWriterConfig.DefaultWriteLockTimeout, conf.WriteLockTimeout); Assert.AreEqual(IndexWriterConfig.WRITE_LOCK_TIMEOUT, IndexWriterConfig.DefaultWriteLockTimeout); Assert.AreEqual(IndexWriterConfig.DEFAULT_MAX_BUFFERED_DELETE_TERMS, conf.MaxBufferedDeleteTerms); Assert.AreEqual(IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB, conf.RAMBufferSizeMB, 0.0); Assert.AreEqual(IndexWriterConfig.DEFAULT_MAX_BUFFERED_DOCS, conf.MaxBufferedDocs); Assert.AreEqual(IndexWriterConfig.DEFAULT_READER_POOLING, conf.UseReaderPooling); Assert.IsTrue(DocumentsWriterPerThread.DefaultIndexingChain == conf.IndexingChain); Assert.IsNull(conf.MergedSegmentWarmer); Assert.AreEqual(IndexWriterConfig.DEFAULT_READER_TERMS_INDEX_DIVISOR, conf.ReaderTermsIndexDivisor); Assert.AreEqual(typeof(TieredMergePolicy), conf.MergePolicy.GetType()); Assert.AreEqual(typeof(DocumentsWriterPerThreadPool), conf.IndexerThreadPool.GetType()); Assert.AreEqual(typeof(FlushByRamOrCountsPolicy), conf.FlushPolicy.GetType()); Assert.AreEqual(IndexWriterConfig.DEFAULT_RAM_PER_THREAD_HARD_LIMIT_MB, conf.RAMPerThreadHardLimitMB); Assert.AreEqual(Codec.Default, conf.Codec); Assert.AreEqual((object)InfoStream.Default, conf.InfoStream); Assert.AreEqual(IndexWriterConfig.DEFAULT_USE_COMPOUND_FILE_SYSTEM, conf.UseCompoundFile); // Sanity check - validate that all getters are covered. ISet<string> getters = new JCG.HashSet<string>(); getters.Add("getAnalyzer"); getters.Add("getIndexCommit"); getters.Add("getIndexDeletionPolicy"); getters.Add("getMaxFieldLength"); getters.Add("getMergeScheduler"); getters.Add("getOpenMode"); getters.Add("getSimilarity"); getters.Add("getTermIndexInterval"); getters.Add("getWriteLockTimeout"); getters.Add("getDefaultWriteLockTimeout"); getters.Add("getMaxBufferedDeleteTerms"); getters.Add("getRAMBufferSizeMB"); getters.Add("getMaxBufferedDocs"); getters.Add("getIndexingChain"); getters.Add("getMergedSegmentWarmer"); getters.Add("getMergePolicy"); getters.Add("getMaxThreadStates"); getters.Add("getReaderPooling"); getters.Add("getIndexerThreadPool"); getters.Add("getReaderTermsIndexDivisor"); getters.Add("getFlushPolicy"); getters.Add("getRAMPerThreadHardLimitMB"); getters.Add("getCodec"); getters.Add("getInfoStream"); getters.Add("getUseCompoundFile"); foreach (MethodInfo m in typeof(IndexWriterConfig).GetMethods()) { if (m.DeclaringType == typeof(IndexWriterConfig) && m.Name.StartsWith("get", StringComparison.Ordinal) && !m.Name.StartsWith("get_", StringComparison.Ordinal)) { Assert.IsTrue(getters.Contains(m.Name), "method " + m.Name + " is not tested for defaults"); } } }
/// <summary> /// Tests a CacheEntry[] for indication of "insane" cache usage. /// <para> /// <b>NOTE:</b> FieldCache CreationPlaceholder objects are ignored. /// (:TODO: is this a bad idea? are we masking a real problem?) /// </para> /// </summary> public Insanity[] Check(params FieldCache.CacheEntry[] cacheEntries) { if (null == cacheEntries || 0 == cacheEntries.Length) { return(Arrays.Empty <Insanity>()); } if (estimateRam) { for (int i = 0; i < cacheEntries.Length; i++) { cacheEntries[i].EstimateSize(); } } // the indirect mapping lets MapOfSets dedup identical valIds for us // maps the (valId) identity hash code of cache values to // sets of CacheEntry instances MapOfSets <int, FieldCache.CacheEntry> valIdToItems = new MapOfSets <int, FieldCache.CacheEntry>(new Dictionary <int, ISet <FieldCache.CacheEntry> >(17)); // maps ReaderField keys to Sets of ValueIds MapOfSets <ReaderField, int> readerFieldToValIds = new MapOfSets <ReaderField, int>(new Dictionary <ReaderField, ISet <int> >(17)); // any keys that we know result in more than one valId ISet <ReaderField> valMismatchKeys = new JCG.HashSet <ReaderField>(); // iterate over all the cacheEntries to get the mappings we'll need for (int i = 0; i < cacheEntries.Length; i++) { FieldCache.CacheEntry item = cacheEntries[i]; object val = item.Value; // It's OK to have dup entries, where one is e.g. // float[] and the other is the Bits (from // getDocWithField()) if (val is IBits) { continue; } if (val is FieldCache.ICreationPlaceholder) { continue; } ReaderField rf = new ReaderField(item.ReaderKey, item.FieldName); int valId = RuntimeHelpers.GetHashCode(val); // indirect mapping, so the MapOfSets will dedup identical valIds for us valIdToItems.Put(valId, item); if (1 < readerFieldToValIds.Put(rf, valId)) { valMismatchKeys.Add(rf); } } List <Insanity> insanity = new List <Insanity>(valMismatchKeys.Count * 3); insanity.AddRange(CheckValueMismatch(valIdToItems, readerFieldToValIds, valMismatchKeys)); insanity.AddRange(CheckSubreaders(valIdToItems, readerFieldToValIds)); return(insanity.ToArray()); }
/// <summary> /// Fills a <see cref="T:IDictionary{string, WeightedSpanTerm}"/> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <see cref="SpanQuery"/>. /// </summary> /// <param name="terms"><see cref="T:IDictionary{string, WeightedSpanTerm}"/> to place created <see cref="WeightedSpanTerm"/>s in</param> /// <param name="spanQuery"><see cref="SpanQuery"/> to extract Terms from</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> protected virtual void ExtractWeightedSpanTerms(IDictionary <string, WeightedSpanTerm> terms, SpanQuery spanQuery) { ISet <string> fieldNames; if (fieldName == null) { fieldNames = new JCG.HashSet <string>(); CollectSpanQueryFields(spanQuery, fieldNames); } else { fieldNames = new JCG.HashSet <string> { fieldName }; } // To support the use of the default field name if (defaultField != null) { fieldNames.Add(defaultField); } IDictionary <string, SpanQuery> queries = new JCG.Dictionary <string, SpanQuery>(); var nonWeightedTerms = new JCG.HashSet <Term>(); bool mustRewriteQuery = MustRewriteQuery(spanQuery); if (mustRewriteQuery) { foreach (string field in fieldNames) { SpanQuery rewrittenQuery = (SpanQuery)spanQuery.Rewrite(GetLeafContext().Reader); queries[field] = rewrittenQuery; rewrittenQuery.ExtractTerms(nonWeightedTerms); } } else { spanQuery.ExtractTerms(nonWeightedTerms); } List <PositionSpan> spanPositions = new List <PositionSpan>(); foreach (string field in fieldNames) { SpanQuery q; q = mustRewriteQuery ? queries[field] : spanQuery; AtomicReaderContext context = GetLeafContext(); var termContexts = new JCG.Dictionary <Term, TermContext>(); ISet <Term> extractedTerms = new JCG.SortedSet <Term>(); q.ExtractTerms(extractedTerms); foreach (Term term in extractedTerms) { termContexts[term] = TermContext.Build(context, term); } IBits acceptDocs = context.AtomicReader.LiveDocs; Spans.Spans spans = q.GetSpans(context, acceptDocs, termContexts); // collect span positions while (spans.MoveNext()) { spanPositions.Add(new PositionSpan(spans.Start, spans.End - 1)); } } if (spanPositions.Count == 0) { // no spans found return; } foreach (Term queryTerm in nonWeightedTerms) { if (FieldNameComparer(queryTerm.Field)) { WeightedSpanTerm weightedSpanTerm; if (!terms.TryGetValue(queryTerm.Text(), out weightedSpanTerm) || weightedSpanTerm == null) { weightedSpanTerm = new WeightedSpanTerm(spanQuery.Boost, queryTerm.Text()); weightedSpanTerm.AddPositionSpans(spanPositions); weightedSpanTerm.IsPositionSensitive = true; terms[queryTerm.Text()] = weightedSpanTerm; } else { if (spanPositions.Count > 0) { weightedSpanTerm.AddPositionSpans(spanPositions); } } } } }
/// <summary> /// Reads from legacy 3.x segments_N. </summary> private SegmentCommitInfo ReadLegacySegmentInfo(Directory dir, int format, IndexInput input) { // check that it is a format we can understand if (format > Lucene3xSegmentInfoFormat.FORMAT_DIAGNOSTICS) { throw new IndexFormatTooOldException(input, format, Lucene3xSegmentInfoFormat.FORMAT_DIAGNOSTICS, Lucene3xSegmentInfoFormat.FORMAT_3_1); } if (format < Lucene3xSegmentInfoFormat.FORMAT_3_1) { throw new IndexFormatTooNewException(input, format, Lucene3xSegmentInfoFormat.FORMAT_DIAGNOSTICS, Lucene3xSegmentInfoFormat.FORMAT_3_1); } string version; if (format <= Lucene3xSegmentInfoFormat.FORMAT_3_1) { version = input.ReadString(); } else { version = null; } string name = input.ReadString(); int docCount = input.ReadInt32(); long delGen = input.ReadInt64(); int docStoreOffset = input.ReadInt32(); IDictionary <string, string> attributes = new Dictionary <string, string>(); // parse the docstore stuff and shove it into attributes string docStoreSegment; bool docStoreIsCompoundFile; if (docStoreOffset != -1) { docStoreSegment = input.ReadString(); docStoreIsCompoundFile = input.ReadByte() == SegmentInfo.YES; attributes[Lucene3xSegmentInfoFormat.DS_OFFSET_KEY] = Convert.ToString(docStoreOffset, CultureInfo.InvariantCulture); attributes[Lucene3xSegmentInfoFormat.DS_NAME_KEY] = docStoreSegment; attributes[Lucene3xSegmentInfoFormat.DS_COMPOUND_KEY] = Convert.ToString(docStoreIsCompoundFile, CultureInfo.InvariantCulture); } else { docStoreSegment = name; docStoreIsCompoundFile = false; } // pre-4.0 indexes write a byte if there is a single norms file byte b = input.ReadByte(); //System.out.println("version=" + version + " name=" + name + " docCount=" + docCount + " delGen=" + delGen + " dso=" + docStoreOffset + " dss=" + docStoreSegment + " dssCFs=" + docStoreIsCompoundFile + " b=" + b + " format=" + format); if (Debugging.AssertsEnabled) { Debugging.Assert(1 == b, "expected 1 but was: {0} format: {1}", b, format); } int numNormGen = input.ReadInt32(); IDictionary <int, long> normGen; if (numNormGen == SegmentInfo.NO) { normGen = null; } else { normGen = new Dictionary <int, long>(); for (int j = 0; j < numNormGen; j++) { normGen[j] = input.ReadInt64(); } } bool isCompoundFile = input.ReadByte() == SegmentInfo.YES; int delCount = input.ReadInt32(); if (Debugging.AssertsEnabled) { Debugging.Assert(delCount <= docCount); } //bool hasProx = input.ReadByte() == 1; input.ReadByte(); // LUCENENET: IDE0059: Remove unnecessary value assignment IDictionary <string, string> diagnostics = input.ReadStringStringMap(); if (format <= Lucene3xSegmentInfoFormat.FORMAT_HAS_VECTORS) { // NOTE: unused //int hasVectors = input.ReadByte(); input.ReadByte(); // LUCENENET: IDE0059: Remove unnecessary value assignment } // Replicate logic from 3.x's SegmentInfo.files(): ISet <string> files = new JCG.HashSet <string>(); if (isCompoundFile) { files.Add(IndexFileNames.SegmentFileName(name, "", IndexFileNames.COMPOUND_FILE_EXTENSION)); } else { AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xFieldInfosReader.FIELD_INFOS_EXTENSION)); AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xPostingsFormat.FREQ_EXTENSION)); AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xPostingsFormat.PROX_EXTENSION)); AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xPostingsFormat.TERMS_EXTENSION)); AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", 
Lucene3xPostingsFormat.TERMS_INDEX_EXTENSION)); AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xNormsProducer.NORMS_EXTENSION)); } if (docStoreOffset != -1) { if (docStoreIsCompoundFile) { files.Add(IndexFileNames.SegmentFileName(docStoreSegment, "", Lucene3xCodec.COMPOUND_FILE_STORE_EXTENSION)); } else { files.Add(IndexFileNames.SegmentFileName(docStoreSegment, "", Lucene3xStoredFieldsReader.FIELDS_INDEX_EXTENSION)); files.Add(IndexFileNames.SegmentFileName(docStoreSegment, "", Lucene3xStoredFieldsReader.FIELDS_EXTENSION)); AddIfExists(dir, files, IndexFileNames.SegmentFileName(docStoreSegment, "", Lucene3xTermVectorsReader.VECTORS_INDEX_EXTENSION)); AddIfExists(dir, files, IndexFileNames.SegmentFileName(docStoreSegment, "", Lucene3xTermVectorsReader.VECTORS_FIELDS_EXTENSION)); AddIfExists(dir, files, IndexFileNames.SegmentFileName(docStoreSegment, "", Lucene3xTermVectorsReader.VECTORS_DOCUMENTS_EXTENSION)); } } else if (!isCompoundFile) { files.Add(IndexFileNames.SegmentFileName(name, "", Lucene3xStoredFieldsReader.FIELDS_INDEX_EXTENSION)); files.Add(IndexFileNames.SegmentFileName(name, "", Lucene3xStoredFieldsReader.FIELDS_EXTENSION)); AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xTermVectorsReader.VECTORS_INDEX_EXTENSION)); AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xTermVectorsReader.VECTORS_FIELDS_EXTENSION)); AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xTermVectorsReader.VECTORS_DOCUMENTS_EXTENSION)); } // parse the normgen stuff and shove it into attributes if (normGen != null) { attributes[Lucene3xSegmentInfoFormat.NORMGEN_KEY] = Convert.ToString(numNormGen, CultureInfo.InvariantCulture); foreach (KeyValuePair <int, long> ent in normGen) { long gen = ent.Value; if (gen >= SegmentInfo.YES) { // Definitely a separate norm file, with generation: files.Add(IndexFileNames.FileNameFromGeneration(name, "s" + ent.Key, gen)); attributes[Lucene3xSegmentInfoFormat.NORMGEN_PREFIX + ent.Key] = Convert.ToString(gen, CultureInfo.InvariantCulture); } else if (gen == SegmentInfo.NO) { // No separate norm } else { // We should have already hit indexformat too old exception if (Debugging.AssertsEnabled) { Debugging.Assert(false); } } } } SegmentInfo info = new SegmentInfo(dir, version, name, docCount, isCompoundFile, null, diagnostics, attributes.AsReadOnly()); info.SetFiles(files); SegmentCommitInfo infoPerCommit = new SegmentCommitInfo(info, delCount, delGen, -1); return(infoPerCommit); }
public virtual void TestRandomStringSort() { Random random = new Random(Random.Next()); int NUM_DOCS = AtLeast(100); Directory dir = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif random, dir); bool allowDups = random.NextBoolean(); ISet <string> seen = new JCG.HashSet <string>(); int maxLength = TestUtil.NextInt32(random, 5, 100); if (VERBOSE) { Console.WriteLine("TEST: NUM_DOCS=" + NUM_DOCS + " maxLength=" + maxLength + " allowDups=" + allowDups); } int numDocs = 0; IList <BytesRef> docValues = new List <BytesRef>(); // TODO: deletions while (numDocs < NUM_DOCS) { Document doc = new Document(); // 10% of the time, the document is missing the value: BytesRef br; if (LuceneTestCase.Random.Next(10) != 7) { string s; if (random.NextBoolean()) { s = TestUtil.RandomSimpleString(random, maxLength); } else { s = TestUtil.RandomUnicodeString(random, maxLength); } if (!allowDups) { if (seen.Contains(s)) { continue; } seen.Add(s); } if (VERBOSE) { Console.WriteLine(" " + numDocs + ": s=" + s); } br = new BytesRef(s); if (DefaultCodecSupportsDocValues) { doc.Add(new SortedDocValuesField("stringdv", br)); doc.Add(new NumericDocValuesField("id", numDocs)); } else { doc.Add(NewStringField("id", Convert.ToString(numDocs), Field.Store.NO)); } doc.Add(NewStringField("string", s, Field.Store.NO)); docValues.Add(br); } else { br = null; if (VERBOSE) { Console.WriteLine(" " + numDocs + ": <missing>"); } docValues.Add(null); if (DefaultCodecSupportsDocValues) { doc.Add(new NumericDocValuesField("id", numDocs)); } else { doc.Add(NewStringField("id", Convert.ToString(numDocs), Field.Store.NO)); } } doc.Add(new StoredField("id", numDocs)); writer.AddDocument(doc); numDocs++; if (random.Next(40) == 17) { // force flush writer.GetReader().Dispose(); } } IndexReader r = writer.GetReader(); writer.Dispose(); if (VERBOSE) { Console.WriteLine(" reader=" + r); } IndexSearcher idxS = NewSearcher( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif r, false); int ITERS = AtLeast(100); for (int iter = 0; iter < ITERS; iter++) { bool reverse = random.NextBoolean(); TopFieldDocs hits; SortField sf; bool sortMissingLast; bool missingIsNull; if (DefaultCodecSupportsDocValues && random.NextBoolean()) { sf = new SortField("stringdv", SortFieldType.STRING, reverse); // Can only use sort missing if the DVFormat // supports docsWithField: sortMissingLast = DefaultCodecSupportsDocsWithField && Random.NextBoolean(); missingIsNull = DefaultCodecSupportsDocsWithField; } else { sf = new SortField("string", SortFieldType.STRING, reverse); sortMissingLast = Random.NextBoolean(); missingIsNull = true; } if (sortMissingLast) { sf.MissingValue = SortField.STRING_LAST; } Sort sort; if (random.NextBoolean()) { sort = new Sort(sf); } else { sort = new Sort(sf, SortField.FIELD_DOC); } int hitCount = TestUtil.NextInt32(random, 1, r.MaxDoc + 20); RandomFilter f = new RandomFilter(random, (float)random.NextDouble(), docValues); int queryType = random.Next(3); if (queryType == 0) { // force out of order BooleanQuery bq = new BooleanQuery(); // Add a Query with SHOULD, since bw.Scorer() returns BooleanScorer2 // which delegates to BS if there are no mandatory clauses. bq.Add(new MatchAllDocsQuery(), Occur.SHOULD); // Set minNrShouldMatch to 1 so that BQ will not optimize rewrite to return // the clause instead of BQ. 
bq.MinimumNumberShouldMatch = 1; hits = idxS.Search(bq, f, hitCount, sort, random.NextBoolean(), random.NextBoolean()); } else if (queryType == 1) { hits = idxS.Search(new ConstantScoreQuery(f), null, hitCount, sort, random.NextBoolean(), random.NextBoolean()); } else { hits = idxS.Search(new MatchAllDocsQuery(), f, hitCount, sort, random.NextBoolean(), random.NextBoolean()); } if (VERBOSE) { Console.WriteLine("\nTEST: iter=" + iter + " " + hits.TotalHits + " hits; topN=" + hitCount + "; reverse=" + reverse + "; sortMissingLast=" + sortMissingLast + " sort=" + sort); } // Compute expected results: var expected = f.MatchValues.ToList(); expected.Sort(new ComparerAnonymousInnerClassHelper(this, sortMissingLast)); if (reverse) { expected.Reverse(); } if (VERBOSE) { Console.WriteLine(" expected:"); for (int idx = 0; idx < expected.Count; idx++) { BytesRef br = expected[idx]; if (br == null && missingIsNull == false) { br = new BytesRef(); } Console.WriteLine(" " + idx + ": " + (br == null ? "<missing>" : br.Utf8ToString())); if (idx == hitCount - 1) { break; } } } if (VERBOSE) { Console.WriteLine(" actual:"); for (int hitIDX = 0; hitIDX < hits.ScoreDocs.Length; hitIDX++) { FieldDoc fd = (FieldDoc)hits.ScoreDocs[hitIDX]; BytesRef br = (BytesRef)fd.Fields[0]; Console.WriteLine(" " + hitIDX + ": " + (br == null ? "<missing>" : br.Utf8ToString()) + " id=" + idxS.Doc(fd.Doc).Get("id")); } } for (int hitIDX = 0; hitIDX < hits.ScoreDocs.Length; hitIDX++) { FieldDoc fd = (FieldDoc)hits.ScoreDocs[hitIDX]; BytesRef br = expected[hitIDX]; if (br == null && missingIsNull == false) { br = new BytesRef(); } // Normally, the old codecs (that don't support // docsWithField via doc values) will always return // an empty BytesRef for the missing case; however, // if all docs in a given segment were missing, in // that case it will return null! So we must map // null here, too: BytesRef br2 = (BytesRef)fd.Fields[0]; if (br2 == null && missingIsNull == false) { br2 = new BytesRef(); } Assert.AreEqual(br, br2, "hit=" + hitIDX + " has wrong sort value"); } } r.Dispose(); dir.Dispose(); }
public override void Run() { try { var seen = new JCG.HashSet <string>(); IList <string> paths = new List <string>(); while (true) { Document doc = new Document(); int numPaths = TestUtil.NextInt32(Random, 1, 5); for (int i = 0; i < numPaths; i++) { string path; if (paths.Count > 0 && Random.Next(5) != 4) { // Use previous path path = paths[Random.Next(paths.Count)]; } else { // Create new path path = null; while (true) { path = TestUtil.RandomRealisticUnicodeString(Random); if (path.Length != 0 && !seen.Contains(path)) { seen.Add(path); paths.Add(path); break; } } } doc.Add(new FacetField("field", path)); } try { w.AddDocument(config.Build(tw, doc)); if (mgr != null && Random.NextDouble() < 0.02) { w.Commit(); tw.Commit(); mgr.MaybeRefresh(); } } catch (IOException ioe) { throw new Exception(ioe.ToString(), ioe); } if (VERBOSE) { Console.WriteLine("TW size=" + tw.Count + " vs " + ordLimit); } if (tw.Count >= ordLimit) { break; } } } finally { stop.Value = true; } }
public virtual void Test() { MockDirectoryWrapper dir = NewMockFSDirectory(CreateTempDir("TestIndexWriterOutOfFileDescriptors")); dir.PreventDoubleWrite = false; double rate = Random.NextDouble() * 0.01; //System.out.println("rate=" + rate); dir.RandomIOExceptionRateOnOpen = rate; int iters = AtLeast(20); LineFileDocs docs = new LineFileDocs(Random, DefaultCodecSupportsDocValues); IndexReader r = null; DirectoryReader r2 = null; bool any = false; MockDirectoryWrapper dirCopy = null; int lastNumDocs = 0; for (int iter = 0; iter < iters; iter++) { IndexWriter w = null; if (VERBOSE) { Console.WriteLine("TEST: iter=" + iter); } try { MockAnalyzer analyzer = new MockAnalyzer(Random); analyzer.MaxTokenLength = TestUtil.NextInt32(Random, 1, IndexWriter.MAX_TERM_LENGTH); IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); if (VERBOSE) { // Do this ourselves instead of relying on LTC so // we see incrementing messageID: iwc.SetInfoStream(new TextWriterInfoStream(Console.Out)); } var ms = iwc.MergeScheduler; if (ms is IConcurrentMergeScheduler) { ((IConcurrentMergeScheduler)ms).SetSuppressExceptions(); } w = new IndexWriter(dir, iwc); if (r != null && Random.Next(5) == 3) { if (Random.NextBoolean()) { if (VERBOSE) { Console.WriteLine("TEST: addIndexes IR[]"); } w.AddIndexes(new IndexReader[] { r }); } else { if (VERBOSE) { Console.WriteLine("TEST: addIndexes Directory[]"); } w.AddIndexes(new Directory[] { dirCopy }); } } else { if (VERBOSE) { Console.WriteLine("TEST: addDocument"); } w.AddDocument(docs.NextDoc()); } dir.RandomIOExceptionRateOnOpen = 0.0; w.Dispose(); w = null; // NOTE: this is O(N^2)! Only enable for temporary debugging: //dir.setRandomIOExceptionRateOnOpen(0.0); //TestUtil.CheckIndex(dir); //dir.setRandomIOExceptionRateOnOpen(rate); // Verify numDocs only increases, to catch IndexWriter // accidentally deleting the index: dir.RandomIOExceptionRateOnOpen = 0.0; Assert.IsTrue(DirectoryReader.IndexExists(dir)); if (r2 == null) { r2 = DirectoryReader.Open(dir); } else { DirectoryReader r3 = DirectoryReader.OpenIfChanged(r2); if (r3 != null) { r2.Dispose(); r2 = r3; } } Assert.IsTrue(r2.NumDocs >= lastNumDocs, "before=" + lastNumDocs + " after=" + r2.NumDocs); lastNumDocs = r2.NumDocs; //System.out.println("numDocs=" + lastNumDocs); dir.RandomIOExceptionRateOnOpen = rate; any = true; if (VERBOSE) { Console.WriteLine("TEST: iter=" + iter + ": success"); } } catch (IOException ioe) { if (VERBOSE) { Console.WriteLine("TEST: iter=" + iter + ": exception"); Console.WriteLine(ioe.ToString()); Console.Write(ioe.StackTrace); } if (w != null) { // NOTE: leave random IO exceptions enabled here, // to verify that rollback does not try to write // anything: w.Rollback(); } } if (any && r == null && Random.NextBoolean()) { // Make a copy of a non-empty index so we can use // it to addIndexes later: dir.RandomIOExceptionRateOnOpen = 0.0; r = DirectoryReader.Open(dir); dirCopy = NewMockFSDirectory(CreateTempDir("TestIndexWriterOutOfFileDescriptors.copy")); ISet <string> files = new JCG.HashSet <string>(); foreach (string file in dir.ListAll()) { dir.Copy(dirCopy, file, file, IOContext.DEFAULT); files.Add(file); } dirCopy.Sync(files); // Have IW kiss the dir so we remove any leftover // files ... 
we can easily have leftover files at // the time we take a copy because we are holding // open a reader: (new IndexWriter(dirCopy, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)))).Dispose(); dirCopy.RandomIOExceptionRate = rate; dir.RandomIOExceptionRateOnOpen = rate; } } if (r2 != null) { r2.Dispose(); } if (r != null) { r.Dispose(); dirCopy.Dispose(); } dir.Dispose(); }
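// A minimal, standalone sketch of the fault-injection setup exercised above, assuming only the
// LuceneTestCase/MockDirectoryWrapper members already used in this test (NewMockFSDirectory,
// PreventDoubleWrite, RandomIOExceptionRateOnOpen, Rollback). The 5% rate and the single-document
// body are illustrative, not part of the test.
MockDirectoryWrapper faultyDir = NewMockFSDirectory(CreateTempDir("faultInjectionSketch"));
faultyDir.PreventDoubleWrite = false;
faultyDir.RandomIOExceptionRateOnOpen = 0.05; // ~5% of file opens throw IOException
IndexWriter sketchWriter = null;
try
{
    sketchWriter = new IndexWriter(faultyDir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)));
    sketchWriter.AddDocument(new Document());
    sketchWriter.Dispose();
}
catch (IOException)
{
    // An injected failure surfaced; roll back so nothing partial is written (faults stay enabled,
    // mirroring the test above, to verify that rollback itself writes nothing).
    if (sketchWriter != null) sketchWriter.Rollback();
}
faultyDir.RandomIOExceptionRateOnOpen = 0.0; // always disable faults before verifying the index
faultyDir.Dispose();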
public override void Build(IInputIterator iterator) { if (iterator.HasContexts) { throw new ArgumentException("this suggester doesn't support contexts"); } string prefix = this.GetType().Name; var directory = OfflineSorter.DefaultTempDir(); var tempInput = FileSupport.CreateTempFile(prefix, ".input", directory); var tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory); hasPayloads = iterator.HasPayloads; var writer = new OfflineSorter.ByteSequencesWriter(tempInput); OfflineSorter.ByteSequencesReader reader = null; var scratch = new BytesRef(); TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton(); bool success = false; count = 0; byte[] buffer = new byte[8]; try { var output = new ByteArrayDataOutput(buffer); BytesRef surfaceForm; while ((surfaceForm = iterator.Next()) != null) { ISet <Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a); maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count); foreach (Int32sRef path in paths) { Util.Fst.Util.ToBytesRef(path, scratch); // length of the analyzed text (FST input) if (scratch.Length > ushort.MaxValue - 2) { throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) + " in length (got " + scratch.Length + ")"); } ushort analyzedLength = (ushort)scratch.Length; // compute the required length: // analyzed sequence + weight (4) + surface + analyzedLength (short) int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2; BytesRef payload; if (hasPayloads) { if (surfaceForm.Length > (ushort.MaxValue - 2)) { throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) + " in length (got " + surfaceForm.Length + ")"); } payload = iterator.Payload; // payload + surfaceLength (short) requiredLength += payload.Length + 2; } else { payload = null; } buffer = ArrayUtil.Grow(buffer, requiredLength); output.Reset(buffer); output.WriteInt16((short)analyzedLength); output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length); output.WriteInt32(EncodeWeight(iterator.Weight)); if (hasPayloads) { for (int i = 0; i < surfaceForm.Length; i++) { if (surfaceForm.Bytes[i] == PAYLOAD_SEP) { throw new ArgumentException( "surface form cannot contain unit separator character U+001F; this character is reserved"); } } output.WriteInt16((short)surfaceForm.Length); output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length); output.WriteBytes(payload.Bytes, payload.Offset, payload.Length); } else { output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length); } Debug.Assert(output.Position == requiredLength, output.Position + " vs " + requiredLength); writer.Write(buffer, 0, output.Position); } count++; } writer.Dispose(); // Sort all input/output pairs (required by FST.Builder): (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted); // Free disk space: tempInput.Delete(); reader = new OfflineSorter.ByteSequencesReader(tempSorted); var outputs = new PairOutputs <long?, BytesRef>(PositiveInt32Outputs.Singleton, ByteSequenceOutputs.Singleton); var builder = new Builder <PairOutputs <long?, BytesRef> .Pair>(FST.INPUT_TYPE.BYTE1, outputs); // Build FST: BytesRef previousAnalyzed = null; BytesRef analyzed = new BytesRef(); BytesRef surface = new BytesRef(); Int32sRef scratchInts = new Int32sRef(); var input = new ByteArrayDataInput(); // Used to remove duplicate surface forms (but we // still index the hightest-weight one). 
We clear // this when we see a new analyzed form, so it cannot // grow unbounded (at most 256 entries): var seenSurfaceForms = new JCG.HashSet <BytesRef>(); var dedup = 0; while (reader.Read(scratch)) { input.Reset(scratch.Bytes, scratch.Offset, scratch.Length); ushort analyzedLength = (ushort)input.ReadInt16(); analyzed.Grow(analyzedLength + 2); input.ReadBytes(analyzed.Bytes, 0, analyzedLength); analyzed.Length = analyzedLength; long cost = input.ReadInt32(); surface.Bytes = scratch.Bytes; if (hasPayloads) { surface.Length = (ushort)input.ReadInt16(); surface.Offset = input.Position; } else { surface.Offset = input.Position; surface.Length = scratch.Length - surface.Offset; } if (previousAnalyzed == null) { previousAnalyzed = new BytesRef(); previousAnalyzed.CopyBytes(analyzed); seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface)); } else if (analyzed.Equals(previousAnalyzed)) { dedup++; if (dedup >= maxSurfaceFormsPerAnalyzedForm) { // More than maxSurfaceFormsPerAnalyzedForm // dups: skip the rest: continue; } if (seenSurfaceForms.Contains(surface)) { continue; } seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface)); } else { dedup = 0; previousAnalyzed.CopyBytes(analyzed); seenSurfaceForms.Clear(); seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface)); } // TODO: I think we can avoid the extra 2 bytes when // there is no dup (dedup==0), but we'd have to fix // the exactFirst logic ... which would be sort of // hairy because we'd need to special case the two // (dup/not dup)... // NOTE: must be byte 0 so we sort before whatever // is next analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0; analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup; analyzed.Length += 2; Util.Fst.Util.ToInt32sRef(analyzed, scratchInts); //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString()); if (!hasPayloads) { builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface))); } else { int payloadOffset = input.Position + surface.Length; int payloadLength = scratch.Length - payloadOffset; BytesRef br = new BytesRef(surface.Length + 1 + payloadLength); Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length); br.Bytes[surface.Length] = PAYLOAD_SEP; Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength); br.Length = br.Bytes.Length; builder.Add(scratchInts, outputs.NewPair(cost, br)); } } fst = builder.Finish(); //Util.dotToFile(fst, "/tmp/suggest.dot"); success = true; } finally { if (success) { IOUtils.Dispose(reader, writer); } else { IOUtils.DisposeWhileHandlingException(reader, writer); } tempInput.Delete(); tempSorted.Delete(); } }
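// Build() above packs each suggestion into one byte[] record for the OfflineSorter:
//   [analyzedLength: Int16] [analyzed bytes] [encoded weight: Int32] [surface bytes]                                          (no payloads)
//   [analyzedLength: Int16] [analyzed bytes] [encoded weight: Int32] [surfaceLength: Int16] [surface bytes] [payload bytes]   (with payloads)
// Writing the analyzed form first is what lets the sorter bring all surface forms for one analyzed
// form together, which the dedup pass over seenSurfaceForms depends on. A minimal sketch of the
// no-payload encoding, assuming nothing beyond the classes already used above; the sample strings
// and the weight 42 are made up (Build() writes EncodeWeight(iterator.Weight) there).
byte[] recordBuf = new byte[64];
var recordOut = new ByteArrayDataOutput(recordBuf);
BytesRef analyzedExample = new BytesRef("rock");    // analyzed form (the FST input)
BytesRef surfaceExample = new BytesRef("The Rock"); // surface form (what the user typed/sees)
recordOut.WriteInt16((short)analyzedExample.Length);
recordOut.WriteBytes(analyzedExample.Bytes, analyzedExample.Offset, analyzedExample.Length);
recordOut.WriteInt32(42);
recordOut.WriteBytes(surfaceExample.Bytes, surfaceExample.Offset, surfaceExample.Length);
// recordBuf[0 .. recordOut.Position) is one sorter record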
/// <summary> /// Detect repetition groups. Done once - for first doc. </summary> private IList <IList <PhrasePositions> > GatherRptGroups(JCG.LinkedDictionary <Term, int?> rptTerms) { PhrasePositions[] rpp = RepeatingPPs(rptTerms); IList <IList <PhrasePositions> > res = new List <IList <PhrasePositions> >(); if (!hasMultiTermRpts) { // simpler - no multi-terms - can base on positions in first doc for (int i = 0; i < rpp.Length; i++) { PhrasePositions pp = rpp[i]; if (pp.rptGroup >= 0) /* already marked as a repetition */ { continue; } int tpPos = TpPos(pp); for (int j = i + 1; j < rpp.Length; j++) { PhrasePositions pp2 = rpp[j]; if (pp2.rptGroup >= 0 /* already marked as a repetition */ || pp2.offset == pp.offset /* not a repetition: two PPs are originally in same offset in the query! */ || TpPos(pp2) != tpPos /* not a repetition */) { continue; } // a repetition int g = pp.rptGroup; if (g < 0) { g = res.Count; pp.rptGroup = g; List <PhrasePositions> rl = new List <PhrasePositions>(2); rl.Add(pp); res.Add(rl); } pp2.rptGroup = g; res[g].Add(pp2); } } } else { // more involved - has multi-terms IList <JCG.HashSet <PhrasePositions> > tmp = new List <JCG.HashSet <PhrasePositions> >(); IList <FixedBitSet> bb = PpTermsBitSets(rpp, rptTerms); UnionTermGroups(bb); IDictionary <Term, int> tg = TermGroups(rptTerms, bb); JCG.HashSet <int> distinctGroupIDs = new JCG.HashSet <int>(tg.Values); for (int i = 0; i < distinctGroupIDs.Count; i++) { tmp.Add(new JCG.HashSet <PhrasePositions>()); } foreach (PhrasePositions pp in rpp) { foreach (Term t in pp.terms) { if (rptTerms.ContainsKey(t)) { int g = tg[t]; tmp[g].Add(pp); Debug.Assert(pp.rptGroup == -1 || pp.rptGroup == g); pp.rptGroup = g; } } } foreach (JCG.HashSet <PhrasePositions> hs in tmp) { res.Add(new List <PhrasePositions>(hs)); } } return(res); }
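// The no-multi-term branch above is essentially "group array slots whose first-doc term position
// coincides". A self-contained analogue over plain ints, assuming nothing from the phrase classes;
// it only illustrates the pairwise pass that assigns group ids (slot i lazily opens a group, later
// matching slots join it).
static List<List<int>> GroupEqualPositions(int[] tpPos)
{
    int[] group = new int[tpPos.Length];
    for (int k = 0; k < group.Length; k++) group[k] = -1; // -1 = not yet part of a repetition group
    var res = new List<List<int>>();
    for (int i = 0; i < tpPos.Length; i++)
    {
        if (group[i] >= 0) continue; // already grouped
        for (int j = i + 1; j < tpPos.Length; j++)
        {
            if (group[j] >= 0 || tpPos[j] != tpPos[i]) continue; // not a repetition of slot i
            int g = group[i];
            if (g < 0) { g = res.Count; group[i] = g; res.Add(new List<int> { i }); } // lazily open the group
            group[j] = g;
            res[g].Add(j);
        }
    }
    return res;
}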
/// <summary> /// Adds to <paramref name="strings"/> the strings that can be produced from the given state, and returns /// <c>false</c> if more than <paramref name="limit"/> strings are found. /// <paramref name="limit"/> &lt; 0 means "infinite". /// </summary> private static bool GetFiniteStrings(State s, JCG.HashSet <State> pathstates, JCG.HashSet <Int32sRef> strings, Int32sRef path, int limit) { pathstates.Add(s); foreach (Transition t in s.GetTransitions()) { if (pathstates.Contains(t.to)) { return(false); } for (int n = t.min; n <= t.max; n++) { path.Grow(path.Length + 1); path.Int32s[path.Length] = n; path.Length++; if (t.to.accept) { strings.Add(Int32sRef.DeepCopyOf(path)); if (limit >= 0 && strings.Count > limit) { return(false); } } if (!GetFiniteStrings(t.to, pathstates, strings, path, limit)) { return(false); } path.Length--; } } pathstates.Remove(s); return(true); }
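// GetFiniteStrings above is a depth-first walk that fails fast on a cycle (a state revisited on the
// current path means infinitely many strings) or once more than limit strings are collected. A
// self-contained analogue over a plain adjacency list, assuming nothing from the automaton classes;
// edge labels stand in for transition code points.
static bool CollectPaths(int s, IList<(int Label, int To)>[] edges, bool[] accept,
                         ISet<int> onPath, ISet<string> results, StringBuilder path, int limit)
{
    onPath.Add(s);
    foreach (var (label, to) in edges[s])
    {
        if (onPath.Contains(to)) return false;  // cycle => not finite
        path.Append((char)label);
        if (accept[to])
        {
            results.Add(path.ToString());
            if (limit >= 0 && results.Count > limit) return false;
        }
        if (!CollectPaths(to, edges, accept, onPath, results, path, limit)) return false;
        path.Length--;                          // backtrack one label
    }
    onPath.Remove(s);
    return true;
}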
/// <summary> /// (non-Javadoc) /// @see org.apache.lucene.xmlparser.QueryObjectBuilder#process(org.w3c.dom.Element) /// </summary> public virtual Query GetQuery(XmlElement e) { string fieldsList = e.GetAttribute("fieldNames"); //a comma-delimited list of fields string[] fields = defaultFieldNames; if ((fieldsList != null) && (fieldsList.Trim().Length > 0)) { fields = fieldsList.Trim().Split(',').TrimEnd(); //trim the fieldnames for (int i = 0; i < fields.Length; i++) { fields[i] = fields[i].Trim(); } } //Parse any "stopWords" attribute //TODO MoreLikeThis needs to ideally have per-field stopWords lists - until then //I use all analyzers/fields to generate multi-field compatible stop list string stopWords = e.GetAttribute("stopWords"); ISet <string> stopWordsSet = null; if ((stopWords != null) && (fields != null)) { stopWordsSet = new JCG.HashSet <string>(); foreach (string field in fields) { TokenStream ts = null; try { ts = analyzer.GetTokenStream(field, stopWords); ICharTermAttribute termAtt = ts.AddAttribute <ICharTermAttribute>(); ts.Reset(); while (ts.IncrementToken()) { stopWordsSet.Add(termAtt.ToString()); } ts.End(); } catch (IOException ioe) { throw new ParserException("IoException parsing stop words list in " + GetType().Name + ":" + ioe.Message); } finally { IOUtils.DisposeWhileHandlingException(ts); } } } MoreLikeThisQuery mlt = new MoreLikeThisQuery(DOMUtils.GetText(e), fields, analyzer, fields[0]); mlt.MaxQueryTerms = DOMUtils.GetAttribute(e, "maxQueryTerms", DEFAULT_MAX_QUERY_TERMS); mlt.MinTermFrequency = DOMUtils.GetAttribute(e, "minTermFrequency", DEFAULT_MIN_TERM_FREQUENCY); mlt.PercentTermsToMatch = DOMUtils.GetAttribute(e, "percentTermsToMatch", DEFAULT_PERCENT_TERMS_TO_MATCH) / 100; mlt.StopWords = stopWordsSet; int minDocFreq = DOMUtils.GetAttribute(e, "minDocFreq", -1); if (minDocFreq >= 0) { mlt.MinDocFreq = minDocFreq; } mlt.Boost = DOMUtils.GetAttribute(e, "boost", 1.0f); return(mlt); }
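// A hedged sketch of the element shape GetQuery() above appears to consume, inferred solely from the
// attributes it reads (fieldNames, stopWords, maxQueryTerms, minTermFrequency, percentTermsToMatch,
// minDocFreq, boost) and from DOMUtils.GetText(e) supplying the seed text. The element name
// "LikeThisQuery" and every value below are illustrative, not a documented schema.
var likeThisXml = new XmlDocument();
likeThisXml.LoadXml(
    "<LikeThisQuery fieldNames=\"title,body\" stopWords=\"the a of\" " +
    "maxQueryTerms=\"20\" minTermFrequency=\"1\" percentTermsToMatch=\"30\" boost=\"1.0\">" +
    "sample seed text to find similar documents for" +
    "</LikeThisQuery>");
XmlElement likeThisElement = likeThisXml.DocumentElement;
// Query q = builder.GetQuery(likeThisElement); // 'builder' would be an instance of the class above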
public override void AddBinaryField(FieldInfo field, IEnumerable <BytesRef> values) { // examine the values to determine best type to use ISet <BytesRef> uniqueValues = new JCG.HashSet <BytesRef>(); int minLength = int.MaxValue; int maxLength = int.MinValue; foreach (var value in values) { BytesRef b = value; if (b is null) { b = new BytesRef(); // 4.0 doesnt distinguish } if (b.Length > Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH) { throw new ArgumentException("DocValuesField \"" + field.Name + "\" is too large, must be <= " + Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH); } minLength = Math.Min(minLength, b.Length); maxLength = Math.Max(maxLength, b.Length); if (uniqueValues != null) { if (uniqueValues.Add(BytesRef.DeepCopyOf(b))) { if (uniqueValues.Count > 256) { uniqueValues = null; } } } } int maxDoc = state.SegmentInfo.DocCount; bool @fixed = minLength == maxLength; bool dedup = uniqueValues != null && uniqueValues.Count * 2 < maxDoc; if (dedup) { // we will deduplicate and deref values bool success = false; IndexOutput data = null; IndexOutput index = null; string dataName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name + "_" + Convert.ToString(field.Number, CultureInfo.InvariantCulture), segmentSuffix, "dat"); string indexName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name + "_" + Convert.ToString(field.Number, CultureInfo.InvariantCulture), segmentSuffix, "idx"); try { data = dir.CreateOutput(dataName, state.Context); index = dir.CreateOutput(indexName, state.Context); if (@fixed) { AddFixedDerefBytesField(field, data, index, values, minLength); } else { AddVarDerefBytesField(field, data, index, values); } success = true; } finally { if (success) { IOUtils.Dispose(data, index); } else { IOUtils.DisposeWhileHandlingException(data, index); } } } else { // we dont deduplicate, just write values straight if (@fixed) { // fixed byte[] string fileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name + "_" + Convert.ToString(field.Number, CultureInfo.InvariantCulture), segmentSuffix, "dat"); IndexOutput data = dir.CreateOutput(fileName, state.Context); bool success = false; try { AddFixedStraightBytesField(field, data, values, minLength); success = true; } finally { if (success) { IOUtils.Dispose(data); } else { IOUtils.DisposeWhileHandlingException(data); } } } else { // variable byte[] bool success = false; IndexOutput data = null; IndexOutput index = null; string dataName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name + "_" + Convert.ToString(field.Number, CultureInfo.InvariantCulture), segmentSuffix, "dat"); string indexName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name + "_" + Convert.ToString(field.Number, CultureInfo.InvariantCulture), segmentSuffix, "idx"); try { data = dir.CreateOutput(dataName, state.Context); index = dir.CreateOutput(indexName, state.Context); AddVarStraightBytesField(field, data, index, values); success = true; } finally { if (success) { IOUtils.Dispose(data, index); } else { IOUtils.DisposeWhileHandlingException(data, index); } } } } }
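// The branching above reduces to two independent choices: deref (deduplicated) storage when at most
// 256 distinct values were seen (uniqueValues was not discarded) and they cover less than half the
// documents, and fixed-width storage when every value has the same length. A standalone helper that
// captures that decision under those assumptions; the enum and names are illustrative, not part of
// the codec API.
enum BinaryLayout { FixedDeref, VarDeref, FixedStraight, VarStraight }

static BinaryLayout ChooseLayout(int minLength, int maxLength, int uniqueCountOrNegative, int maxDoc)
{
    bool isFixed = minLength == maxLength;
    // uniqueCountOrNegative < 0 models "uniqueValues == null" above (more than 256 distinct values)
    bool dedup = uniqueCountOrNegative >= 0 && uniqueCountOrNegative * 2 < maxDoc;
    if (dedup) return isFixed ? BinaryLayout.FixedDeref : BinaryLayout.VarDeref;
    return isFixed ? BinaryLayout.FixedStraight : BinaryLayout.VarStraight;
}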
private static JCG.HashSet <Type> LoadTypes() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006) { var types = new JCG.HashSet <Type>(); var assembliesToExamine = Support.AssemblyUtils.GetReferencedAssemblies(); // LUCENENET NOTE: The following hack is not required because we are using abstract factories // and pure DI to ensure the order of the codecs are always correct during testing. //// LUCENENET HACK: //// Tests such as TestImpersonation.cs expect that the assemblies //// are probed in a certain order. NamedSPILoader, lines 68 - 75 adds //// the first item it sees with that name. So if you have multiple //// codecs, it may not add the right one, depending on the order of //// the assemblies that were examined. //// This results in many test failures if Types from Lucene.Net.Codecs //// are examined and added to NamedSPILoader first before //// Lucene.Net.TestFramework. //var testFrameworkAssembly = assembliesToExamine.FirstOrDefault(x => string.Equals(x.GetName().Name, "Lucene.Net.TestFramework", StringComparison.Ordinal)); //if (testFrameworkAssembly != null) //{ // //assembliesToExamine.Remove(testFrameworkAssembly); // //assembliesToExamine.Insert(0, testFrameworkAssembly); // assembliesToExamine = new Assembly[] { testFrameworkAssembly }.Concat(assembliesToExamine.Where(a => !testFrameworkAssembly.Equals(a))); //} foreach (var assembly in assembliesToExamine) { try { foreach (var type in assembly.GetTypes().Where(x => x.IsPublic)) { try { if (!IsInvokableSubclassOf <S>(type)) { continue; } // We are looking for types with a default ctor // (which is used in NamedSPILoader) or has a single parameter // of type IDictionary<string, string> (for AnalysisSPILoader) var matchingCtors = type.GetConstructors().Where(ctor => { var parameters = ctor.GetParameters(); switch (parameters.Length) { case 0: // default ctor return(false); // LUCENENET NOTE: Now that we have factored Codecs into Abstract Factories, we don't need default constructors here case 1: return(typeof(IDictionary <string, string>).IsAssignableFrom(parameters[0].ParameterType)); default: return(false); } }); if (matchingCtors.Any()) { types.Add(type); } } catch { // swallow } } } catch { // swallow } } return(types); }
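// Despite the older comment above mentioning a default constructor, the switch only accepts a single
// IDictionary<string, string> parameter (the parameterless case deliberately returns false now that
// codecs are created through abstract factories). A standalone version of just that predicate, for
// clarity; the method name is illustrative.
static bool HasSpiCompatibleConstructor(Type type)
{
    return type.GetConstructors().Any(ctor =>
    {
        var parameters = ctor.GetParameters();
        return parameters.Length == 1 &&
               typeof(IDictionary<string, string>).IsAssignableFrom(parameters[0].ParameterType);
    });
}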
private IndexContext CreateIndexContext() { Random random = Random; DocValuesType[] dvTypes = new DocValuesType[] { DocValuesType.BINARY, DocValuesType.SORTED }; Directory dir = NewDirectory(); RandomIndexWriter w = new RandomIndexWriter( random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).SetMergePolicy(NewLogMergePolicy()) ); bool canUseDV = !"Lucene3x".Equals(w.IndexWriter.Config.Codec.Name, StringComparison.Ordinal); DocValuesType dvType = canUseDV ? dvTypes[random.nextInt(dvTypes.Length)] : DocValuesType.NONE; int numDocs = 86 + random.nextInt(1087) * RandomMultiplier; string[] groupValues = new string[numDocs / 5]; string[] countValues = new string[numDocs / 10]; for (int i = 0; i < groupValues.Length; i++) { groupValues[i] = GenerateRandomNonEmptyString(); } for (int i = 0; i < countValues.Length; i++) { countValues[i] = GenerateRandomNonEmptyString(); } JCG.List <string> contentStrings = new JCG.List <string>(); IDictionary <string, IDictionary <string, ISet <string> > > searchTermToGroupCounts = new JCG.Dictionary <string, IDictionary <string, ISet <string> > >(); for (int i = 1; i <= numDocs; i++) { string groupValue = random.nextInt(23) == 14 ? null : groupValues[random.nextInt(groupValues.Length)]; string countValue = random.nextInt(21) == 13 ? null : countValues[random.nextInt(countValues.Length)]; string content = "random" + random.nextInt(numDocs / 20); if (!searchTermToGroupCounts.TryGetValue(content, out var groupToCounts)) { // Groups sort always DOCID asc... searchTermToGroupCounts.Add(content, groupToCounts = new JCG.LinkedDictionary <string, ISet <string> >()); contentStrings.Add(content); } if (!groupToCounts.TryGetValue(groupValue, out var countsVals)) { groupToCounts.Add(groupValue, countsVals = new JCG.HashSet <string>()); } countsVals.Add(countValue); Document doc = new Document(); doc.Add(new StringField("id", string.Format(CultureInfo.InvariantCulture, "{0:D9}", i), Field.Store.YES)); if (groupValue != null) { AddField(doc, groupField, groupValue, dvType); } if (countValue != null) { AddField(doc, countField, countValue, dvType); } doc.Add(new TextField("content", content, Field.Store.YES)); w.AddDocument(doc); } DirectoryReader reader = w.GetReader(); if (Verbose) { for (int docID = 0; docID < reader.MaxDoc; docID++) { Document doc = reader.Document(docID); Console.WriteLine("docID=" + docID + " id=" + doc.Get("id") + " content=" + doc.Get("content") + " author=" + doc.Get("author") + " publisher=" + doc.Get("publisher")); } } w.Dispose(); return(new IndexContext(dir, reader, dvType, searchTermToGroupCounts, contentStrings.ToArray(/*new String[contentStrings.size()]*/))); }
public SingleInstanceLock(JCG.HashSet <string> locks, string lockName) { this.locks = locks; this.lockName = lockName; }
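// SingleInstanceLock above only records the shared in-memory set and its name; obtaining and
// releasing effectively add and remove that name from the set, so the lock is scoped to the process.
// A hedged usage sketch through the public 4.x surface (SingleInstanceLockFactory,
// Directory.SetLockFactory, Directory.MakeLock); the lock name "example.lock" is arbitrary.
Directory ramDir = new RAMDirectory();
ramDir.SetLockFactory(new SingleInstanceLockFactory()); // all lock state lives in this process
using (Lock writeLock = ramDir.MakeLock("example.lock"))
{
    if (writeLock.Obtain())
    {
        // critical section: only one holder per lock name within this factory instance
    }
} // disposing the lock releases it
ramDir.Dispose();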
internal virtual void AddNumericField(FieldInfo field, IEnumerable <long?> values, bool optimizeStorage) { meta.WriteVInt32(field.Number); meta.WriteByte((byte)Lucene42DocValuesProducer.NUMBER); meta.WriteInt64(data.GetFilePointer()); long minValue = long.MaxValue; long maxValue = long.MinValue; long gcd = 0; // TODO: more efficient? ISet <long> uniqueValues = null; if (optimizeStorage) { uniqueValues = new JCG.HashSet <long>(); long count = 0; foreach (long?nv in values) { // TODO: support this as MemoryDVFormat (and be smart about missing maybe) long v = nv.GetValueOrDefault(); if (gcd != 1) { if (v < long.MinValue / 2 || v > long.MaxValue / 2) { // in that case v - minValue might overflow and make the GCD computation return // wrong results. Since these extreme values are unlikely, we just discard // GCD computation for them gcd = 1; } // minValue needs to be set first else if (count != 0) { gcd = MathUtil.Gcd(gcd, v - minValue); } } minValue = Math.Min(minValue, v); maxValue = Math.Max(maxValue, v); if (uniqueValues != null) { if (uniqueValues.Add(v)) { if (uniqueValues.Count > 256) { uniqueValues = null; } } } ++count; } if (Debugging.AssertsEnabled) { Debugging.Assert(count == maxDoc); } } if (uniqueValues != null) { // small number of unique values int bitsPerValue = PackedInt32s.BitsRequired(uniqueValues.Count - 1); FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio); if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue) { meta.WriteByte((byte)Lucene42DocValuesProducer.UNCOMPRESSED); // uncompressed foreach (long?nv in values) { data.WriteByte((byte)nv.GetValueOrDefault()); } } else { meta.WriteByte((byte)Lucene42DocValuesProducer.TABLE_COMPRESSED); // table-compressed var decode = new long[uniqueValues.Count]; uniqueValues.CopyTo(decode, 0); var encode = new Dictionary <long, int>(); data.WriteVInt32(decode.Length); for (int i = 0; i < decode.Length; i++) { data.WriteInt64(decode[i]); encode[decode[i]] = i; } meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); data.WriteVInt32(formatAndBits.Format.Id); data.WriteVInt32(formatAndBits.BitsPerValue); PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(data, formatAndBits.Format, maxDoc, formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE); foreach (long?nv in values) { writer.Add(encode[nv.GetValueOrDefault()]); } writer.Finish(); } } else if (gcd != 0 && gcd != 1) { meta.WriteByte((byte)Lucene42DocValuesProducer.GCD_COMPRESSED); meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); data.WriteInt64(minValue); data.WriteInt64(gcd); data.WriteVInt32(Lucene42DocValuesProducer.BLOCK_SIZE); BlockPackedWriter writer = new BlockPackedWriter(data, Lucene42DocValuesProducer.BLOCK_SIZE); foreach (long?nv in values) { writer.Add((nv.GetValueOrDefault() - minValue) / gcd); } writer.Finish(); } else { meta.WriteByte((byte)Lucene42DocValuesProducer.DELTA_COMPRESSED); // delta-compressed meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); data.WriteVInt32(Lucene42DocValuesProducer.BLOCK_SIZE); BlockPackedWriter writer = new BlockPackedWriter(data, Lucene42DocValuesProducer.BLOCK_SIZE); foreach (long?nv in values) { writer.Add(nv.GetValueOrDefault()); } writer.Finish(); } }
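// The GCD_COMPRESSED branch above stores (value - minValue) / gcd so that, say, values that are all
// multiples of 1000 need far fewer bits per entry; minValue and gcd go into the metadata so the
// reader can reverse the transform. A self-contained illustration of that arithmetic; GcdEncode and
// the sample numbers are illustrative only (the writer itself uses MathUtil.Gcd incrementally while
// scanning the values).
static long Gcd(long a, long b)
{
    while (b != 0) { long t = a % b; a = b; b = t; }
    return a;
}

// e.g. values = { 3000, 9000, 15000 }  ->  min = 3000, gcd = 6000, encoded = { 0, 1, 2 }
static long[] GcdEncode(long[] values, out long min, out long gcd)
{
    min = values.Min();
    gcd = 0;
    foreach (long v in values) gcd = Gcd(gcd, v - min); // Gcd(0, x) == x, so the first delta seeds it
    if (gcd == 0) gcd = 1;                              // all values equal
    long[] encoded = new long[values.Length];
    for (int i = 0; i < values.Length; i++) encoded[i] = (values[i] - min) / gcd;
    return encoded;
}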
public virtual void Test() { int NUM_DOCS = AtLeast(1000); Directory dir = NewDirectory(); RandomIndexWriter w = null; int docsLeftInthisSegment = 0; int docUpto = 0; while (docUpto < NUM_DOCS) { if (VERBOSE) { Console.WriteLine("TEST: " + docUpto + " of " + NUM_DOCS); } if (docsLeftInthisSegment == 0) { IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)); if (Random.NextBoolean()) { // Make sure we aggressively mix in SimpleText // since it has different impls for all codec // formats... iwc.SetCodec(Codec.ForName("Lucene46")); } if (w != null) { w.Dispose(); } w = new RandomIndexWriter(Random, dir, iwc); docsLeftInthisSegment = TestUtil.NextInt32(Random, 10, 100); } Document doc = new Document(); doc.Add(NewStringField("id", Convert.ToString(docUpto), Field.Store.YES)); w.AddDocument(doc); docUpto++; docsLeftInthisSegment--; } if (VERBOSE) { Console.WriteLine("\nTEST: now delete..."); } // Random delete half the docs: ISet <int?> deleted = new JCG.HashSet <int?>(); while (deleted.Count < NUM_DOCS / 2) { int?toDelete = Random.Next(NUM_DOCS); if (!deleted.Contains(toDelete)) { deleted.Add(toDelete); w.DeleteDocuments(new Term("id", Convert.ToString(toDelete))); if (Random.Next(17) == 6) { IndexReader r = w.GetReader(); Assert.AreEqual(NUM_DOCS - deleted.Count, r.NumDocs); r.Dispose(); } } } w.Dispose(); dir.Dispose(); }
public virtual void TestGetChildren() { Directory dir = NewDirectory(); var taxoWriter = new DirectoryTaxonomyWriter(dir); int numCategories = AtLeast(10); int numA = 0, numB = 0; Random random = Random; // add the two categories for which we'll also add children (so asserts are simpler) taxoWriter.AddCategory(new FacetLabel("a")); taxoWriter.AddCategory(new FacetLabel("b")); for (int i = 0; i < numCategories; i++) { if (random.NextBoolean()) { taxoWriter.AddCategory(new FacetLabel("a", Convert.ToString(i, CultureInfo.InvariantCulture))); ++numA; } else { taxoWriter.AddCategory(new FacetLabel("b", Convert.ToString(i, CultureInfo.InvariantCulture))); ++numB; } } // add category with no children taxoWriter.AddCategory(new FacetLabel("c")); taxoWriter.Dispose(); var taxoReader = new DirectoryTaxonomyReader(dir); // non existing category TaxonomyReader.ChildrenEnumerator it = taxoReader.GetChildren(taxoReader.GetOrdinal(new FacetLabel("invalid"))); Assert.AreEqual(false, it.MoveNext()); // a category with no children it = taxoReader.GetChildren(taxoReader.GetOrdinal(new FacetLabel("c"))); Assert.AreEqual(false, it.MoveNext()); // arbitrary negative ordinal it = taxoReader.GetChildren(-2); Assert.AreEqual(false, it.MoveNext()); // root's children var roots = new JCG.HashSet <string> { "a", "b", "c" }; it = taxoReader.GetChildren(TaxonomyReader.ROOT_ORDINAL); while (roots.Count > 0) { it.MoveNext(); FacetLabel root = taxoReader.GetPath(it.Current); Assert.AreEqual(1, root.Length); Assert.IsTrue(roots.Remove(root.Components[0])); } Assert.AreEqual(false, it.MoveNext()); for (int i = 0; i < 2; i++) { FacetLabel cp = i == 0 ? new FacetLabel("a") : new FacetLabel("b"); int ordinal = taxoReader.GetOrdinal(cp); it = taxoReader.GetChildren(ordinal); int numChildren = 0; int child; while (it.MoveNext()) { child = it.Current; FacetLabel path = taxoReader.GetPath(child); Assert.AreEqual(2, path.Length); Assert.AreEqual(path.Components[0], i == 0 ? "a" : "b"); ++numChildren; } int expected = i == 0 ? numA : numB; Assert.AreEqual(expected, numChildren, "invalid num children"); } taxoReader.Dispose(); dir.Dispose(); }
private IndexContext CreateIndexContext(bool multipleFacetValuesPerDocument) { Random random = Random; int numDocs = TestUtil.NextInt32(random, 138, 1145) * RANDOM_MULTIPLIER; int numGroups = TestUtil.NextInt32(random, 1, numDocs / 4); int numFacets = TestUtil.NextInt32(random, 1, numDocs / 6); if (VERBOSE) { Console.WriteLine("TEST: numDocs=" + numDocs + " numGroups=" + numGroups); } List <string> groups = new List <string>(); for (int i = 0; i < numGroups; i++) { groups.Add(GenerateRandomNonEmptyString()); } List <string> facetValues = new List <string>(); for (int i = 0; i < numFacets; i++) { facetValues.Add(GenerateRandomNonEmptyString()); } string[] contentBrs = new string[TestUtil.NextInt32(random, 2, 20)]; if (VERBOSE) { Console.WriteLine("TEST: create fake content"); } for (int contentIDX = 0; contentIDX < contentBrs.Length; contentIDX++) { contentBrs[contentIDX] = GenerateRandomNonEmptyString(); if (VERBOSE) { Console.WriteLine(" content=" + contentBrs[contentIDX]); } } Directory dir = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter( random, dir, NewIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random) ) ); bool canUseDV = !"Lucene3x".Equals(writer.IndexWriter.Config.Codec.Name, StringComparison.Ordinal); bool useDv = canUseDV && !multipleFacetValuesPerDocument && random.nextBoolean(); Document doc = new Document(); Document docNoGroup = new Document(); Document docNoFacet = new Document(); Document docNoGroupNoFacet = new Document(); Field group = NewStringField("group", "", Field.Store.NO); Field groupDc = new SortedDocValuesField("group_dv", new BytesRef()); if (useDv) { doc.Add(groupDc); docNoFacet.Add(groupDc); } doc.Add(group); docNoFacet.Add(group); Field[] facetFields; if (useDv) { Debug.Assert(!multipleFacetValuesPerDocument); facetFields = new Field[2]; facetFields[0] = NewStringField("facet", "", Field.Store.NO); doc.Add(facetFields[0]); docNoGroup.Add(facetFields[0]); facetFields[1] = new SortedDocValuesField("facet_dv", new BytesRef()); doc.Add(facetFields[1]); docNoGroup.Add(facetFields[1]); } else { facetFields = multipleFacetValuesPerDocument ? 
new Field[2 + random.nextInt(6)] : new Field[1]; for (int i = 0; i < facetFields.Length; i++) { facetFields[i] = NewStringField("facet", "", Field.Store.NO); doc.Add(facetFields[i]); docNoGroup.Add(facetFields[i]); } } Field content = NewStringField("content", "", Field.Store.NO); doc.Add(content); docNoGroup.Add(content); docNoFacet.Add(content); docNoGroupNoFacet.Add(content); ISet <string> uniqueFacetValues = new JCG.SortedSet <string>(new ComparerAnonymousHelper1()); // LUCENENET NOTE: Need JCG.Dictionary here because of null keys IDictionary <string, JCG.Dictionary <string, ISet <string> > > searchTermToFacetToGroups = new Dictionary <string, JCG.Dictionary <string, ISet <string> > >(); int facetWithMostGroups = 0; for (int i = 0; i < numDocs; i++) { string groupValue; if (random.nextInt(24) == 17) { // So we test the "doc doesn't have the group'd // field" case: if (useDv) { groupValue = ""; } else { groupValue = null; } } else { groupValue = groups[random.nextInt(groups.size())]; } string contentStr = contentBrs[random.nextInt(contentBrs.Length)]; if (!searchTermToFacetToGroups.TryGetValue(contentStr, out JCG.Dictionary <string, ISet <string> > facetToGroups)) { searchTermToFacetToGroups[contentStr] = facetToGroups = new JCG.Dictionary <string, ISet <string> >(); } List <string> facetVals = new List <string>(); if (useDv || random.nextInt(24) != 18) { if (useDv) { string facetValue = facetValues[random.nextInt(facetValues.size())]; uniqueFacetValues.Add(facetValue); if (!facetToGroups.TryGetValue(facetValue, out ISet <string> groupsInFacet)) { facetToGroups[facetValue] = groupsInFacet = new JCG.HashSet <string>(); } groupsInFacet.add(groupValue); if (groupsInFacet.size() > facetWithMostGroups) { facetWithMostGroups = groupsInFacet.size(); } facetFields[0].SetStringValue(facetValue); facetFields[1].SetBytesValue(new BytesRef(facetValue)); facetVals.Add(facetValue); } else { foreach (Field facetField in facetFields) { string facetValue = facetValues[random.nextInt(facetValues.size())]; uniqueFacetValues.Add(facetValue); if (!facetToGroups.TryGetValue(facetValue, out ISet <string> groupsInFacet)) { facetToGroups[facetValue] = groupsInFacet = new JCG.HashSet <string>(); } groupsInFacet.add(groupValue); if (groupsInFacet.size() > facetWithMostGroups) { facetWithMostGroups = groupsInFacet.size(); } facetField.SetStringValue(facetValue); facetVals.Add(facetValue); } } } else { uniqueFacetValues.Add(null); if (!facetToGroups.TryGetValue(null, out ISet <string> groupsInFacet)) { facetToGroups[null] = groupsInFacet = new JCG.HashSet <string>(); } groupsInFacet.add(groupValue); if (groupsInFacet.size() > facetWithMostGroups) { facetWithMostGroups = groupsInFacet.size(); } } if (VERBOSE) { Console.WriteLine(" doc content=" + contentStr + " group=" + (groupValue == null ? 
"null" : groupValue) + " facetVals=" + Collections.ToString(facetVals)); } if (groupValue != null) { if (useDv) { groupDc.SetBytesValue(new BytesRef(groupValue)); } group.SetStringValue(groupValue); } else if (useDv) { // DV cannot have missing values: groupDc.SetBytesValue(new BytesRef()); } content.SetStringValue(contentStr); if (groupValue == null && !facetVals.Any()) { writer.AddDocument(docNoGroupNoFacet); } else if (!facetVals.Any()) { writer.AddDocument(docNoFacet); } else if (groupValue == null) { writer.AddDocument(docNoGroup); } else { writer.AddDocument(doc); } } DirectoryReader reader = writer.GetReader(); writer.Dispose(); return(new IndexContext(searchTermToFacetToGroups, reader, numDocs, dir, facetWithMostGroups, numGroups, contentBrs, uniqueFacetValues, useDv)); }