/// <summary>
/// Internal helper method used by check that iterates over
/// <paramref name="valMismatchKeys"/> and generates a <see cref="ICollection{T}"/> of <see cref="Insanity"/>
/// instances accordingly. The <see cref="MapOfSets{TKey, TValue}"/> are used to populate
/// the <see cref="Insanity"/> objects.
/// </summary>
/// <seealso cref="InsanityType.VALUEMISMATCH"/>
private static ICollection<Insanity> CheckValueMismatch( // LUCENENET: CA1822: Mark members as static
    MapOfSets<int, FieldCache.CacheEntry> valIdToItems,
    MapOfSets<ReaderField, int> readerFieldToValIds,
    ISet<ReaderField> valMismatchKeys)
{
    JCG.List<Insanity> insanity = new JCG.List<Insanity>(valMismatchKeys.Count * 3);

    if (valMismatchKeys.Count != 0)
    {
        // We have multiple distinct value objects for some ReaderFields.
        IDictionary<ReaderField, ISet<int>> readerFieldMap = readerFieldToValIds.Map;
        IDictionary<int, ISet<FieldCache.CacheEntry>> valueIdMap = valIdToItems.Map;

        foreach (ReaderField readerField in valMismatchKeys)
        {
            // Gather every cache entry reachable through any of this field's value ids.
            IList<FieldCache.CacheEntry> offenders = new JCG.List<FieldCache.CacheEntry>(valMismatchKeys.Count * 2);
            foreach (int valueId in readerFieldMap[readerField])
            {
                foreach (FieldCache.CacheEntry entry in valueIdMap[valueId])
                {
                    offenders.Add(entry);
                }
            }

            FieldCache.CacheEntry[] badness = new FieldCache.CacheEntry[offenders.Count];
            offenders.CopyTo(badness, 0);

            insanity.Add(new Insanity(InsanityType.VALUEMISMATCH,
                "Multiple distinct value objects for " + readerField.ToString(),
                badness));
        }
    }
    return insanity;
}
/// <summary>
/// Rewrites this surround query to a plain Lucene query by visiting all matching
/// terms for <c>m_fieldName</c> in <paramref name="reader"/> and collecting the
/// sub-queries produced by the visitor.
/// </summary>
public override Search.Query Rewrite(IndexReader reader)
{
    var subQueries = new JCG.List<Search.Query>();
    m_srndQuery.VisitMatchingTerms(reader, m_fieldName, new SimpleTermRewriteMatchingTermVisitor(subQueries, m_qf));

    // No matches at all: use the shared empty query.
    if (subQueries.Count == 0)
    {
        return SrndQuery.TheEmptyLcnQuery;
    }

    // Single match: no boolean wrapper needed.
    if (subQueries.Count == 1)
    {
        return subQueries[0];
    }

    // Multiple matches: OR the subquery terms (all carry default weight).
    return SrndBooleanQuery.MakeBooleanQuery(subQueries, Occur.SHOULD);
}
// True when the operator is written between operands; false for a prefix operator.
public virtual bool IsOperatorInfix => operatorInfix;

/// <summary>
/// Builds a Lucene query for each subquery of this composite query against
/// field <paramref name="fn"/>, using <paramref name="qf"/> as the query factory.
/// </summary>
/// <returns>The per-subquery Lucene queries, in subquery order.</returns>
public virtual IList<Search.Query> MakeLuceneSubQueriesField(string fn, BasicQueryFactory qf)
{
    IList<Search.Query> result = new JCG.List<Search.Query>();
    using (IEnumerator<SrndQuery> subQueries = GetSubQueriesEnumerator())
    {
        while (subQueries.MoveNext())
        {
            result.Add(subQueries.Current.MakeLuceneQueryField(fn, qf));
        }
    }
    return result;
}
#pragma warning restore 612, 618
/// <summary>
/// Runs the fixed query <c>title:volume</c> against <paramref name="r"/>
/// <paramref name="times"/> times, materializing the top 100 stored documents
/// on each pass. Used to exercise the searcher; the fetched documents are
/// accumulated but not otherwise inspected.
/// </summary>
private static void Search(Index.IndexReader r, int times)
{
    var searcher = new Search.IndexSearcher(r);
    var fetched = new JCG.List<Documents.Document>(10000);
    for (int iteration = 0; iteration < times; iteration++)
    {
        var query = new Search.TermQuery(new Index.Term("title", "volume"));
        var hits = searcher.Search(query, 100).ScoreDocs;
        foreach (var hit in hits)
        {
            fetched.Add(searcher.Doc(hit.Doc));
        }
    }
}
/// <summary>
/// Collapses a run of consecutive scanner tokens of the same <paramref name="tokenType"/>
/// into a single untokenized token: the token texts are concatenated into one buffer with
/// spaces inserted to preserve the original character gaps between them, and the captured
/// attribute state for each constituent token is saved into <c>tokens</c>
/// (presumably for later replay — confirm against the caller).
/// </summary>
/// <param name="tokenType">Scanner token type to collapse; the run ends at the first token of a different type.</param>
/// <param name="type">Type string recorded on each saved token via <c>SetupSavedToken</c>.</param>
private void CollapseAndSaveTokens(int tokenType, string type)
{
    //collapse
    StringBuilder buffer = new StringBuilder(32);
    int numAdded = scanner.SetText(buffer);
    //TODO: how to know how much whitespace to add
    int theStart = scanner.YyChar;
    int lastPos = theStart + numAdded; // end position (exclusive) of the last token appended so far
    int tmpTokType;
    int numSeen = 0;
    IList<AttributeSource.State> tmp = new JCG.List<AttributeSource.State>();
    // Save the state for the first token of the run (position increment 0).
    SetupSavedToken(0, type);
    tmp.Add(CaptureState());
    //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
    while ((tmpTokType = scanner.GetNextToken()) != WikipediaTokenizerImpl.YYEOF
           && tmpTokType == tokenType
           && scanner.NumWikiTokensSeen > numSeen)
    {
        int currPos = scanner.YyChar;
        //append whitespace
        for (int i = 0; i < (currPos - lastPos); i++)
        {
            buffer.Append(' ');
        }
        numAdded = scanner.SetText(buffer);
        SetupSavedToken(scanner.PositionIncrement, type);
        tmp.Add(CaptureState());
        numSeen++;
        lastPos = currPos + numAdded;
    }
    //trim the buffer
    // TODO: this is inefficient
    string s = buffer.ToString().Trim();
    termAtt.SetEmpty().Append(s);
    // Offsets cover the whole collapsed run, corrected via CorrectOffset.
    offsetAtt.SetOffset(CorrectOffset(theStart), CorrectOffset(theStart + s.Length));
    flagsAtt.Flags = UNTOKENIZED_TOKEN_FLAG;
    //The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
    if (tmpTokType != WikipediaTokenizerImpl.YYEOF)
    {
        scanner.YyPushBack(scanner.YyLength);
    }
    tokens = tmp.GetEnumerator();
}
/// <summary>
/// Intersects <paramref name="compiled"/> with every sub-terms source, collecting the
/// non-null enumerators (tagged with their sub index) and merging them through a
/// <see cref="MultiTermsEnum"/>. Returns <see cref="TermsEnum.EMPTY"/> when no sub
/// produced an enumerator.
/// </summary>
public override TermsEnum Intersect(CompiledAutomaton compiled, BytesRef startTerm)
{
    IList<MultiTermsEnum.TermsEnumIndex> collected = new JCG.List<MultiTermsEnum.TermsEnumIndex>();
    for (int sub = 0; sub < subs.Length; sub++)
    {
        TermsEnum intersected = subs[sub].Intersect(compiled, startTerm);
        if (intersected != null)
        {
            collected.Add(new MultiTermsEnum.TermsEnumIndex(intersected, sub));
        }
    }

    if (collected.Count == 0)
    {
        return TermsEnum.EMPTY;
    }
    return new MultiTermsEnum(subSlices).Reset(collected.ToArray(/*MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY*/));
}
/// <summary>
/// Obtains an enumerator from every sub-terms source, keeps the non-null ones
/// (tagged with their sub index), and merges them through a <see cref="MultiTermsEnum"/>.
/// Returns <see cref="TermsEnum.EMPTY"/> when every sub returned null.
/// </summary>
public override TermsEnum GetEnumerator()
{
    IList<MultiTermsEnum.TermsEnumIndex> collected = new JCG.List<MultiTermsEnum.TermsEnumIndex>();
    for (int sub = 0; sub < subs.Length; sub++)
    {
        TermsEnum subEnum = subs[sub].GetEnumerator();
        if (subEnum != null)
        {
            collected.Add(new MultiTermsEnum.TermsEnumIndex(subEnum, sub));
        }
    }

    if (collected.Count == 0)
    {
        return TermsEnum.EMPTY;
    }
    return new MultiTermsEnum(subSlices).Reset(collected.ToArray(/*MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY*/));
}
/// <summary>
/// Builds the brute-force expected result for <paramref name="term"/>: up to
/// <paramref name="topN"/> groups (in the iteration order of the recorded group
/// counts), each paired with the set of its unique values converted to
/// <see cref="BytesRef"/> (null values are preserved as null).
/// </summary>
private IList<AbstractDistinctValuesCollector.IGroupCount<IComparable>> CreateExpectedResult(IndexContext context, string term, Sort groupSort, int topN)
{
    JCG.List<AbstractDistinctValuesCollector.IGroupCount<IComparable>> result =
        new JCG.List<AbstractDistinctValuesCollector.IGroupCount<IComparable>>();
    IDictionary<string, ISet<string>> groupCounts = context.searchTermToGroupCounts[term];
    int taken = 0;
    foreach (KeyValuePair<string, ISet<string>> pair in groupCounts)
    {
        // Stop once topN groups have been emitted.
        if (taken++ >= topN)
        {
            break;
        }
        ISet<BytesRef> uniqueValues = new JCG.HashSet<BytesRef>();
        foreach (string val in pair.Value)
        {
            uniqueValues.Add(val != null ? new BytesRef(val) : null);
        }
        result.Add(new GroupCount(pair.Key != null ? new BytesRef(pair.Key) : (BytesRef)null, uniqueValues));
    }
    return result;
}
/// <summary>
/// Randomized test for <c>WFSTCompletionLookup</c>: builds the suggester from random
/// (string, weight) inputs, then for every prefix of every key compares the suggester's
/// top-N lookups against a brute-force scan of a sorted dictionary of the same data.
/// </summary>
public void TestRandom()
{
    int numWords = AtLeast(1000);

    IDictionary<string, long> slowCompletor = new JCG.SortedDictionary<string, long>(StringComparer.Ordinal);
    ISet<string> allPrefixes = new JCG.SortedSet<string>(StringComparer.Ordinal);

    Input[] keys = new Input[numWords];

    for (int i = 0; i < numWords; i++)
    {
        String s;
        while (true)
        {
            // TODO: would be nice to fix this slowCompletor/comparer to
            // use full range, but we might lose some coverage too...
            s = TestUtil.RandomSimpleString(LuceneTestCase.Random);
            if (!slowCompletor.ContainsKey(s))
            {
                break;
            }
        }

        // Record every proper prefix of the key so lookups cover all of them.
        for (int j = 1; j < s.Length; j++)
        {
            allPrefixes.add(s.Substring(0, j));
        }
        // we can probably do Integer.MAX_VALUE here, but why worry.
        int weight = LuceneTestCase.Random.nextInt(1 << 24);
        slowCompletor.Put(s, (long)weight);
        keys[i] = new Input(s, weight);
    }

    WFSTCompletionLookup suggester = new WFSTCompletionLookup(false);
    suggester.Build(new InputArrayEnumerator(keys));

    assertEquals(numWords, suggester.Count);
    Random random = new Random(Random.Next());
    foreach (String prefix in allPrefixes)
    {
        int topN = TestUtil.NextInt32(random, 1, 10);
        IList<Lookup.LookupResult> r = suggester.DoLookup(TestUtil.StringToCharSequence(prefix, random).ToString(), false, topN);

        // 2. go thru whole treemap (slowCompletor) and check its actually the best suggestion
        JCG.List<Lookup.LookupResult> matches = new JCG.List<Lookup.LookupResult>();

        // TODO: could be faster... but its slowCompletor for a reason
        foreach (KeyValuePair<string, long> e in slowCompletor)
        {
            if (e.Key.StartsWith(prefix, StringComparison.Ordinal))
            {
                matches.Add(new Lookup.LookupResult(e.Key, e.Value));
            }
        }

        assertTrue(matches.size() > 0);
        matches.Sort(new TestRandomComparer());

        if (matches.size() > topN)
        {
            //matches.SubList(topN, matches.size()).clear();
            matches.RemoveRange(topN, matches.size() - topN); // LUCENENET: Converted end index to length
        }

        assertEquals(matches.size(), r.size());

        for (int hit = 0; hit < r.size(); hit++)
        {
            //System.out.println("  check hit " + hit);
            assertEquals(matches[hit].Key.toString(), r[hit].Key.toString());
            assertEquals(matches[hit].Value, r[hit].Value, 0f);
        }
    }
}
/// <summary>
/// Internal helper method used by check that iterates over
/// the keys of <paramref name="readerFieldToValIds"/> and generates a <see cref="ICollection{T}"/>
/// of <see cref="Insanity"/> instances whenever two (or more) <see cref="ReaderField"/> instances are
/// found that have an ancestry relationships.
/// </summary>
/// <seealso cref="InsanityType.SUBREADER"/>
private static ICollection<Insanity> CheckSubreaders(MapOfSets<int, FieldCache.CacheEntry> valIdToItems, MapOfSets<ReaderField, int> readerFieldToValIds) // LUCENENET: CA1822: Mark members as static
{
    JCG.List<Insanity> insanity = new JCG.List<Insanity>(23);

    Dictionary<ReaderField, ISet<ReaderField>> badChildren = new Dictionary<ReaderField, ISet<ReaderField>>(17);
    MapOfSets<ReaderField, ReaderField> badKids = new MapOfSets<ReaderField, ReaderField>(badChildren); // wrapper

    IDictionary<int, ISet<FieldCache.CacheEntry>> viToItemSets = valIdToItems.Map;
    IDictionary<ReaderField, ISet<int>> rfToValIdSets = readerFieldToValIds.Map;

    HashSet<ReaderField> seen = new HashSet<ReaderField>();

    // Pass 1: for each ReaderField, find descendant readers that also have cache
    // entries for the same field, and record them as "bad kids" of this parent.
    //IDictionary<ReaderField, ISet<int>>.KeyCollection readerFields = rfToValIdSets.Keys;
    foreach (ReaderField rf in rfToValIdSets.Keys)
    {
        if (seen.Contains(rf))
        {
            continue;
        }

        IList<object> kids = GetAllDescendantReaderKeys(rf.ReaderKey);
        foreach (object kidKey in kids)
        {
            ReaderField kid = new ReaderField(kidKey, rf.FieldName);

            // LUCENENET: Eliminated extra lookup by using TryGetValue instead of ContainsKey
            if (badChildren.TryGetValue(kid, out ISet<ReaderField> badKid))
            {
                // we've already process this kid as RF and found other problems
                // track those problems as our own
                badKids.Put(rf, kid);
                badKids.PutAll(rf, badKid);
                badChildren.Remove(kid);
            }
            else if (rfToValIdSets.ContainsKey(kid))
            {
                // we have cache entries for the kid
                badKids.Put(rf, kid);
            }
            seen.Add(kid);
        }
        seen.Add(rf);
    }

    // Pass 2: every mapping in badKids represents an Insanity — collect the parent's
    // cache entries first, then the entries of each offending descendant.
    foreach (ReaderField parent in badChildren.Keys)
    {
        ISet<ReaderField> kids = badChildren[parent];

        JCG.List<FieldCache.CacheEntry> badEntries = new JCG.List<FieldCache.CacheEntry>(kids.Count * 2);

        // put parent entr(ies) in first
        {
            foreach (int value in rfToValIdSets[parent])
            {
                badEntries.AddRange(viToItemSets[value]);
            }
        }

        // now the entries for the descendants
        foreach (ReaderField kid in kids)
        {
            foreach (int value in rfToValIdSets[kid])
            {
                badEntries.AddRange(viToItemSets[value]);
            }
        }

        FieldCache.CacheEntry[] badness = badEntries.ToArray();

        insanity.Add(new Insanity(InsanityType.SUBREADER, "Found caches for descendants of " + parent.ToString(), badness));
    }

    return insanity;
}
/// <summary>
/// Randomized end-to-end test: builds several random index contexts, and for each runs
/// many searches through a first-pass grouping collector followed by a distinct-values
/// collector, comparing the collected groups/unique values against the brute-force
/// expectation from <c>CreateExpectedResult</c>.
/// </summary>
public virtual void TestRandom()
{
    Random random = Random;
    int numberOfRuns = TestUtil.NextInt32(random, 3, 6);
    for (int indexIter = 0; indexIter < numberOfRuns; indexIter++)
    {
        IndexContext context = CreateIndexContext();
        for (int searchIter = 0; searchIter < 100; searchIter++)
        {
            IndexSearcher searcher = NewSearcher(context.indexReader);
            // Randomly toggle doc values usage when the context supports it.
            bool useDv = context.dvType != DocValuesType.NONE && random.nextBoolean();
            DocValuesType dvType = useDv ? context.dvType : DocValuesType.NONE;
            string term = context.contentStrings[random.nextInt(context.contentStrings.Length)];
            Sort groupSort = new Sort(new SortField("id", SortFieldType.STRING));
            int topN = 1 + random.nextInt(10);

            IList<AbstractDistinctValuesCollector.IGroupCount<IComparable>> expectedResult = CreateExpectedResult(context, term, groupSort, topN);

            IAbstractFirstPassGroupingCollector<IComparable> firstCollector = CreateRandomFirstPassCollector(dvType, groupSort, groupField, topN);
            searcher.Search(new TermQuery(new Term("content", term)), firstCollector);
            IAbstractDistinctValuesCollector<AbstractDistinctValuesCollector.IGroupCount<IComparable>> distinctValuesCollector =
                CreateDistinctCountCollector(firstCollector, groupField, countField, dvType);
            searcher.Search(new TermQuery(new Term("content", term)), distinctValuesCollector);

            // LUCENENET TODO: Try to work out how to do this without an O(n) operation
            JCG.List<AbstractDistinctValuesCollector.IGroupCount<IComparable>> actualResult =
                new JCG.List<AbstractDistinctValuesCollector.IGroupCount<IComparable>>(distinctValuesCollector.Groups);

            if (Verbose)
            {
                Console.WriteLine("Index iter=" + indexIter);
                Console.WriteLine("Search iter=" + searchIter);
                Console.WriteLine("1st pass collector class name=" + firstCollector.GetType().Name);
                Console.WriteLine("2nd pass collector class name=" + distinctValuesCollector.GetType().Name);
                Console.WriteLine("Search term=" + term);
                Console.WriteLine("DVType=" + dvType);
                Console.WriteLine("1st pass groups=" + firstCollector.GetTopGroups(0, false).toString());
                Console.WriteLine("Expected:");
                PrintGroups(expectedResult);
                Console.WriteLine("Actual:");
                PrintGroups(actualResult);
                Console.Out.Flush();
            }

            // Compare group-by-group: same group value and the same (sorted) unique values.
            assertEquals(expectedResult.Count, actualResult.Count);
            for (int i = 0; i < expectedResult.size(); i++)
            {
                AbstractDistinctValuesCollector.IGroupCount<IComparable> expected = expectedResult[i];
                AbstractDistinctValuesCollector.IGroupCount<IComparable> actual = actualResult[i];
                AssertValues(expected.GroupValue, actual.GroupValue);
                assertEquals(expected.UniqueValues.Count(), actual.UniqueValues.Count());
                JCG.List<IComparable> expectedUniqueValues = new JCG.List<IComparable>(expected.UniqueValues);
                expectedUniqueValues.Sort(nullComparer);
                JCG.List<IComparable> actualUniqueValues = new JCG.List<IComparable>(actual.UniqueValues);
                actualUniqueValues.Sort(nullComparer);
                for (int j = 0; j < expectedUniqueValues.size(); j++)
                {
                    AssertValues(expectedUniqueValues[j], actualUniqueValues[j]);
                }
            }
        }
        context.indexReader.Dispose();
        context.directory.Dispose();
    }
}
/// <summary>
/// Builds the brute-force expected facet result for <paramref name="searchTerm"/>:
/// counts grouped facet values (optionally filtered by <paramref name="facetPrefix"/>),
/// drops entries below <paramref name="minCount"/>, sorts by count and/or value, and
/// slices the requested <paramref name="offset"/>/<paramref name="limit"/> window.
/// The null facet value contributes to the miss count only when no prefix is given.
/// </summary>
private GroupedFacetResult CreateExpectedFacetResult(string searchTerm, IndexContext context, int offset, int limit, int minCount, bool orderByCount, string facetPrefix)
{
    if (!context.searchTermToFacetGroups.TryGetValue(searchTerm, out var facetGroups))
    {
        facetGroups = new JCG.Dictionary<string, ISet<string>>();
    }

    int totalCount = 0;
    int totalMissCount = 0;
    ISet<string> facetValues;
    if (facetPrefix != null)
    {
        // Restrict to the facet values matching the prefix (ordinal comparison).
        facetValues = new JCG.HashSet<string>();
        foreach (string facetValue in context.facetValues)
        {
            if (facetValue != null && facetValue.StartsWith(facetPrefix, StringComparison.Ordinal))
            {
                facetValues.add(facetValue);
            }
        }
    }
    else
    {
        facetValues = context.facetValues;
    }

    JCG.List<TermGroupFacetCollector.FacetEntry> entries = new JCG.List<TermGroupFacetCollector.FacetEntry>(facetGroups.size());

    // also includes facets with count 0
    foreach (string facetValue in facetValues)
    {
        if (facetValue == null)
        {
            continue;
        }

        int count = facetGroups.TryGetValue(facetValue, out ISet<string> groups) && groups != null ? groups.size() : 0;
        if (count >= minCount)
        {
            entries.Add(new TermGroupFacetCollector.FacetEntry(new BytesRef(facetValue), count));
        }
        totalCount += count;
    }

    // Only include null count when no facet prefix is specified
    if (facetPrefix == null)
    {
        if (facetGroups.TryGetValue(null, out ISet<string> groups) && groups != null)
        {
            totalMissCount = groups.size();
        }
    }

    // Sort by descending count first (when requested), tie-breaking on the value.
    entries.Sort(Comparer<TermGroupFacetCollector.FacetEntry>.Create((a, b) => {
        if (orderByCount)
        {
            int cmp = b.Count - a.Count;
            if (cmp != 0)
            {
                return cmp;
            }
        }
        return a.Value.CompareTo(b.Value);
    }));

    // Apply the offset/limit window via views over the entries list.
    int endOffset = offset + limit;
    IList<TermGroupFacetCollector.FacetEntry> entriesResult;
    if (offset >= entries.size())
    {
        entriesResult = Collections.EmptyList<TermGroupFacetCollector.FacetEntry>();
    }
    else if (endOffset >= entries.size())
    {
        entriesResult = entries.GetView(offset, entries.size() - offset); // LUCENENET: Converted end index to length
    }
    else
    {
        entriesResult = entries.GetView(offset, endOffset - offset); // LUCENENET: Converted end index to length
    }
    return new GroupedFacetResult(totalCount, totalMissCount, entriesResult);
}
/// <summary>
/// Randomized test for <c>FreeTextSuggester</c>: generates Zipf-distributed token docs,
/// builds the suggester, then independently builds brute-force n-gram count models and
/// computes expected suggestions with backoff weighting (backoff is multiplied by
/// <c>FreeTextSuggester.ALPHA</c> as each lower-order model is consulted), comparing
/// against the suggester's actual lookups.
/// </summary>
public void TestRandom()
{
    string[] terms = new string[TestUtil.NextInt32(Random, 2, 10)];
    ISet<string> seen = new JCG.HashSet<string>();
    // Fill terms with distinct random simple strings.
    while (seen.size() < terms.Length)
    {
        string token = TestUtil.RandomSimpleString(Random, 1, 5);
        if (!seen.contains(token))
        {
            terms[seen.size()] = token;
            seen.add(token);
        }
    }

    Analyzer a = new MockAnalyzer(Random);

    int numDocs = AtLeast(10);
    long totTokens = 0;
    string[][] docs = new string[numDocs][];
    for (int i = 0; i < numDocs; i++)
    {
        docs[i] = new string[AtLeast(100)];
        if (Verbose)
        {
            Console.Write("  doc " + i + ":");
        }
        for (int j = 0; j < docs[i].Length; j++)
        {
            docs[i][j] = GetZipfToken(terms);
            if (Verbose)
            {
                Console.Write(" " + docs[i][j]);
            }
        }
        if (Verbose)
        {
            Console.WriteLine();
        }
        totTokens += docs[i].Length;
    }

    int grams = TestUtil.NextInt32(Random, 1, 4);

    if (Verbose)
    {
        Console.WriteLine("TEST: " + terms.Length + " terms; " + numDocs + " docs; " + grams + " grams");
    }

    // Build suggester model:
    FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte)0x20);
    sug.Build(new TestRandomInputEnumerator(docs));

    // Build inefficient but hopefully correct model:
    IList<IDictionary<string, int?>> gramCounts = new JCG.List<IDictionary<string, int?>>(grams);
    for (int gram = 0; gram < grams; gram++)
    {
        if (Verbose)
        {
            Console.WriteLine("TEST: build model for gram=" + gram);
        }
        IDictionary<string, int?> model = new JCG.Dictionary<string, int?>();
        gramCounts.Add(model);
        foreach (string[] doc in docs)
        {
            for (int i = 0; i < doc.Length - gram; i++)
            {
                // Join gram+1 consecutive tokens with single spaces.
                StringBuilder b = new StringBuilder();
                for (int j = i; j <= i + gram; j++)
                {
                    if (j > i)
                    {
                        b.append(' ');
                    }
                    b.append(doc[j]);
                }
                string token = b.toString();
                if (!model.TryGetValue(token, out int? curCount) || curCount == null)
                {
                    model.Put(token, 1);
                }
                else
                {
                    model.Put(token, 1 + curCount);
                }
                if (Verbose)
                {
                    Console.WriteLine("  add '" + token + "' -> count=" + (model.TryGetValue(token, out int? count) ? (count.HasValue ? count.ToString() : "null") : ""));
                }
            }
        }
    }

    int lookups = AtLeast(100);
    for (int iter = 0; iter < lookups; iter++)
    {
        string[] tokens = new string[TestUtil.NextInt32(Random, 1, 5)];
        for (int i = 0; i < tokens.Length; i++)
        {
            tokens[i] = GetZipfToken(terms);
        }

        // Maybe trim last token; be sure not to create the
        // empty string:
        int trimStart;
        if (tokens.Length == 1)
        {
            trimStart = 1;
        }
        else
        {
            trimStart = 0;
        }
        int trimAt = TestUtil.NextInt32(Random, trimStart, tokens[tokens.Length - 1].Length);
        tokens[tokens.Length - 1] = tokens[tokens.Length - 1].Substring(0, trimAt - 0);

        int num = TestUtil.NextInt32(Random, 1, 100);

        // Build the query string by space-joining the tokens.
        StringBuilder b = new StringBuilder();
        foreach (string token in tokens)
        {
            b.append(' ');
            b.append(token);
        }
        string query = b.toString();
        query = query.Substring(1);

        if (Verbose)
        {
            Console.WriteLine("\nTEST: iter=" + iter + " query='" + query + "' num=" + num);
        }

        // Expected:
        JCG.List<Lookup.LookupResult> expected = new JCG.List<Lookup.LookupResult>();
        double backoff = 1.0;
        seen = new JCG.HashSet<string>();

        if (Verbose)
        {
            Console.WriteLine("  compute expected");
        }
        // Walk from the highest-order model down to unigrams.
        for (int i = grams - 1; i >= 0; i--)
        {
            if (Verbose)
            {
                Console.WriteLine("    grams=" + i);
            }

            if (tokens.Length < i + 1)
            {
                // Don't have enough tokens to use this model
                if (Verbose)
                {
                    Console.WriteLine("      skip");
                }
                continue;
            }

            if (i == 0 && tokens[tokens.Length - 1].Length == 0)
            {
                // Never suggest unigrams from empty string:
                if (Verbose)
                {
                    Console.WriteLine("      skip unigram priors only");
                }
                continue;
            }

            // Build up "context" ngram:
            b = new StringBuilder();
            for (int j = tokens.Length - i - 1; j < tokens.Length - 1; j++)
            {
                b.append(' ');
                b.append(tokens[j]);
            }
            string context = b.toString();
            if (context.Length > 0)
            {
                context = context.Substring(1);
            }
            if (Verbose)
            {
                Console.WriteLine("    context='" + context + "'");
            }
            long contextCount;
            if (context.Length == 0)
            {
                contextCount = totTokens;
            }
            else
            {
                //int? count = gramCounts.get(i - 1).get(context);
                var gramCount = gramCounts[i - 1];
                if (!gramCount.TryGetValue(context, out int? count) || count == null)
                {
                    // We never saw this context:
                    backoff *= FreeTextSuggester.ALPHA;
                    if (Verbose)
                    {
                        Console.WriteLine("      skip: never saw context");
                    }
                    continue;
                }
                contextCount = count.GetValueOrDefault();
            }
            if (Verbose)
            {
                Console.WriteLine("    contextCount=" + contextCount);
            }
            IDictionary<string, int?> model = gramCounts[i];

            // First pass, gather all predictions for this model:
            if (Verbose)
            {
                Console.WriteLine("    find terms w/ prefix=" + tokens[tokens.Length - 1]);
            }
            JCG.List<Lookup.LookupResult> tmp = new JCG.List<Lookup.LookupResult>();
            foreach (string term in terms)
            {
                if (term.StartsWith(tokens[tokens.Length - 1], StringComparison.Ordinal))
                {
                    if (Verbose)
                    {
                        Console.WriteLine("      term=" + term);
                    }
                    if (seen.contains(term))
                    {
                        if (Verbose)
                        {
                            Console.WriteLine("        skip seen");
                        }
                        continue;
                    }
                    string ngram = (context + " " + term).Trim();
                    //Integer count = model.get(ngram);
                    if (model.TryGetValue(ngram, out int? count) && count != null)
                    {
                        // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                        // return numbers that are greater than long.MaxValue, which results in a negative long number.
                        // This is also the way it is being done in the FreeTextSuggester to work around the issue.

                        // LUCENENET NOTE: The order of parentheses in the Java test didn't match the production code. This apparently doesn't affect the
                        // result in Java, but does in .NET, so we changed the test to match the production code.
                        //Lookup.LookupResult lr = new Lookup.LookupResult(ngram, (long)(long.MaxValue * ((decimal)backoff * (decimal)count / contextCount)));
                        Lookup.LookupResult lr = new Lookup.LookupResult(ngram, (long)(long.MaxValue * (decimal)backoff * ((decimal)count) / contextCount));
                        tmp.Add(lr);
                        if (Verbose)
                        {
                            Console.WriteLine("        add tmp key='" + lr.Key + "' score=" + lr.Value);
                        }
                    }
                }
            }

            // Second pass, trim to only top N, and fold those
            // into overall suggestions:
            tmp.Sort(byScoreThenKey);
            if (tmp.size() > num)
            {
                //tmp.subList(num, tmp.size()).clear();
                tmp.RemoveRange(num, tmp.size() - num); // LUCENENET: Converted end index to length
            }
            foreach (Lookup.LookupResult result in tmp)
            {
                string key = result.Key.toString();
                int idx = key.LastIndexOf(' ');
                string lastToken;
                if (idx != -1)
                {
                    lastToken = key.Substring(idx + 1);
                }
                else
                {
                    lastToken = key;
                }
                if (!seen.contains(lastToken))
                {
                    seen.add(lastToken);
                    expected.Add(result);
                    if (Verbose)
                    {
                        Console.WriteLine("      keep key='" + result.Key + "' score=" + result.Value);
                    }
                }
            }

            backoff *= FreeTextSuggester.ALPHA;
        }

        expected.Sort(byScoreThenKey);

        if (expected.size() > num)
        {
            expected.RemoveRange(num, expected.size() - num); // LUCENENET: Converted end index to length
        }

        // Actual:
        IList<Lookup.LookupResult> actual = sug.DoLookup(query, num);

        if (Verbose)
        {
            Console.WriteLine("  expected: " + expected);
            Console.WriteLine("  actual: " + actual);
        }

        assertEquals(expected.ToString(), actual.ToString());
    }
}
/// <summary>
/// Detect repetition groups. Done once - for first doc.
/// </summary>
/// <param name="rptTerms">The repeating terms (and their counts) detected for this query.</param>
/// <returns>Lists of <see cref="PhrasePositions"/>, one list per repetition group; each
/// member's <c>rptGroup</c> is set to its group index.</returns>
private IList<IList<PhrasePositions>> GatherRptGroups(JCG.LinkedDictionary<Term, int?> rptTerms)
{
    PhrasePositions[] rpp = RepeatingPPs(rptTerms);
    IList<IList<PhrasePositions>> res = new JCG.List<IList<PhrasePositions>>();
    if (!hasMultiTermRpts)
    {
        // simpler - no multi-terms - can base on positions in first doc
        for (int i = 0; i < rpp.Length; i++)
        {
            PhrasePositions pp = rpp[i];
            if (pp.rptGroup >= 0) // already marked as a repetition
            {
                continue;
            }
            int tpPos = TpPos(pp);
            // Any later pp at a different query offset but the same term position is a repetition.
            for (int j = i + 1; j < rpp.Length; j++)
            {
                PhrasePositions pp2 = rpp[j];
                if (pp2.rptGroup >= 0 || pp2.offset == pp.offset || TpPos(pp2) != tpPos) // not a repetition - not a repetition: two PPs are originally in same offset in the query! - already marked as a repetition
                {
                    continue;
                }
                // a repetition
                int g = pp.rptGroup;
                if (g < 0)
                {
                    // First repetition found for pp: open a new group containing it.
                    g = res.Count;
                    pp.rptGroup = g;
                    IList<PhrasePositions> rl = new JCG.List<PhrasePositions>(2) { pp };
                    res.Add(rl);
                }
                pp2.rptGroup = g;
                res[g].Add(pp2);
            }
        }
    }
    else
    {
        // more involved - has multi-terms
        IList<JCG.HashSet<PhrasePositions>> tmp = new JCG.List<JCG.HashSet<PhrasePositions>>();
        IList<FixedBitSet> bb = PpTermsBitSets(rpp, rptTerms);
        UnionTermGroups(bb);
        IDictionary<Term, int> tg = TermGroups(rptTerms, bb);
        JCG.HashSet<int> distinctGroupIDs = new JCG.HashSet<int>(tg.Values);
        for (int i = 0; i < distinctGroupIDs.Count; i++)
        {
            tmp.Add(new JCG.HashSet<PhrasePositions>());
        }
        // Assign each pp to the group of any repeating term it carries.
        foreach (PhrasePositions pp in rpp)
        {
            foreach (Term t in pp.terms)
            {
                if (rptTerms.ContainsKey(t))
                {
                    int g = tg[t];
                    tmp[g].Add(pp);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(pp.rptGroup == -1 || pp.rptGroup == g);
                    }
                    pp.rptGroup = g;
                }
            }
        }
        foreach (JCG.HashSet<PhrasePositions> hs in tmp)
        {
            res.Add(new JCG.List<PhrasePositions>(hs));
        }
    }
    return res;
}
/// <summary>
/// Indexes a small fixed corpus (7 docs across two segments; one doc has no group field)
/// and verifies the distinct count values per group for three queries:
/// content:random, content:some, and content:blob.
/// </summary>
public virtual void TestSimple()
{
    Random random = Random;
    DocValuesType[] dvTypes = new DocValuesType[] {
        DocValuesType.NUMERIC,
        DocValuesType.BINARY,
        DocValuesType.SORTED,
    };
    Directory dir = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(
        random,
        dir,
        NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).SetMergePolicy(NewLogMergePolicy()));
    // Lucene3x codec cannot use doc values; pick a random DV type otherwise.
    bool canUseDV = !"Lucene3x".Equals(w.IndexWriter.Config.Codec.Name, StringComparison.Ordinal);
    DocValuesType dvType = canUseDV ? dvTypes[random.nextInt(dvTypes.Length)] : DocValuesType.NONE;

    Document doc = new Document();
    AddField(doc, groupField, "1", dvType);
    AddField(doc, countField, "1", dvType);
    doc.Add(new TextField("content", "random text", Field.Store.NO));
    doc.Add(new StringField("id", "1", Field.Store.NO));
    w.AddDocument(doc);

    // 1
    doc = new Document();
    AddField(doc, groupField, "1", dvType);
    AddField(doc, countField, "1", dvType);
    doc.Add(new TextField("content", "some more random text blob", Field.Store.NO));
    doc.Add(new StringField("id", "2", Field.Store.NO));
    w.AddDocument(doc);

    // 2
    doc = new Document();
    AddField(doc, groupField, "1", dvType);
    AddField(doc, countField, "2", dvType);
    doc.Add(new TextField("content", "some more random textual data", Field.Store.NO));
    doc.Add(new StringField("id", "3", Field.Store.NO));
    w.AddDocument(doc);
    w.Commit(); // To ensure a second segment

    // 3
    doc = new Document();
    AddField(doc, groupField, "2", dvType);
    doc.Add(new TextField("content", "some random text", Field.Store.NO));
    doc.Add(new StringField("id", "4", Field.Store.NO));
    w.AddDocument(doc);

    // 4
    doc = new Document();
    AddField(doc, groupField, "3", dvType);
    AddField(doc, countField, "1", dvType);
    doc.Add(new TextField("content", "some more random text", Field.Store.NO));
    doc.Add(new StringField("id", "5", Field.Store.NO));
    w.AddDocument(doc);

    // 5
    doc = new Document();
    AddField(doc, groupField, "3", dvType);
    AddField(doc, countField, "1", dvType);
    doc.Add(new TextField("content", "random blob", Field.Store.NO));
    doc.Add(new StringField("id", "6", Field.Store.NO));
    w.AddDocument(doc);

    // 6 -- no author field
    doc = new Document();
    doc.Add(new TextField("content", "random word stuck in alot of other text", Field.Store.YES));
    AddField(doc, countField, "1", dvType);
    doc.Add(new StringField("id", "6", Field.Store.NO));
    w.AddDocument(doc);

    IndexSearcher indexSearcher = NewSearcher(w.GetReader());
    w.Dispose();

    // Orders groups by group value, with null groups sorting first.
    var cmp = Comparer<AbstractDistinctValuesCollector.IGroupCount<IComparable>>.Create((groupCount1, groupCount2) => {
        if (groupCount1.GroupValue == null)
        {
            if (groupCount2.GroupValue == null)
            {
                return 0;
            }
            return -1;
        }
        else if (groupCount2.GroupValue == null)
        {
            return 1;
        }
        else
        {
            return groupCount1.GroupValue.CompareTo(groupCount2.GroupValue);
        }
    });

    // === Search for content:random
    IAbstractFirstPassGroupingCollector<IComparable> firstCollector = CreateRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
    indexSearcher.Search(new TermQuery(new Term("content", "random")), firstCollector);
    IAbstractDistinctValuesCollector<AbstractDistinctValuesCollector.IGroupCount<IComparable>> distinctValuesCollector =
        CreateDistinctCountCollector(firstCollector, groupField, countField, dvType);
    indexSearcher.Search(new TermQuery(new Term("content", "random")), distinctValuesCollector);

    //var gcs = distinctValuesCollector.Groups as JCG.List<IGroupCount<IComparable>>;
    // LUCENENET TODO: Try to work out how to do this without an O(n) operation
    var gcs = new JCG.List<AbstractDistinctValuesCollector.IGroupCount<IComparable>>(distinctValuesCollector.Groups);
    gcs.Sort(cmp);
    assertEquals(4, gcs.Count);

    CompareNull(gcs[0].GroupValue);
    JCG.List<IComparable> countValues = new JCG.List<IComparable>(gcs[0].UniqueValues);
    assertEquals(1, countValues.size());
    Compare("1", countValues[0]);

    Compare("1", gcs[1].GroupValue);
    countValues = new JCG.List<IComparable>(gcs[1].UniqueValues);
    countValues.Sort(nullComparer);
    assertEquals(2, countValues.size());
    Compare("1", countValues[0]);
    Compare("2", countValues[1]);

    Compare("2", gcs[2].GroupValue);
    countValues = new JCG.List<IComparable>(gcs[2].UniqueValues);
    assertEquals(1, countValues.size());
    CompareNull(countValues[0]);

    Compare("3", gcs[3].GroupValue);
    countValues = new JCG.List<IComparable>(gcs[3].UniqueValues);
    assertEquals(1, countValues.size());
    Compare("1", countValues[0]);

    // === Search for content:some
    firstCollector = CreateRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
    indexSearcher.Search(new TermQuery(new Term("content", "some")), firstCollector);
    distinctValuesCollector = CreateDistinctCountCollector(firstCollector, groupField, countField, dvType);
    indexSearcher.Search(new TermQuery(new Term("content", "some")), distinctValuesCollector);

    // LUCENENET TODO: Try to work out how to do this without an O(n) operation
    //gcs = distinctValuesCollector.Groups as JCG.List<IGroupCount<IComparable>>;
    gcs = new JCG.List<AbstractDistinctValuesCollector.IGroupCount<IComparable>>(distinctValuesCollector.Groups);
    gcs.Sort(cmp);
    assertEquals(3, gcs.Count);

    Compare("1", gcs[0].GroupValue);
    countValues = new JCG.List<IComparable>(gcs[0].UniqueValues);
    assertEquals(2, countValues.size());
    countValues.Sort(nullComparer);
    Compare("1", countValues[0]);
    Compare("2", countValues[1]);

    Compare("2", gcs[1].GroupValue);
    countValues = new JCG.List<IComparable>(gcs[1].UniqueValues);
    assertEquals(1, countValues.size());
    CompareNull(countValues[0]);

    Compare("3", gcs[2].GroupValue);
    countValues = new JCG.List<IComparable>(gcs[2].UniqueValues);
    assertEquals(1, countValues.size());
    Compare("1", countValues[0]);

    // === Search for content:blob
    firstCollector = CreateRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
    indexSearcher.Search(new TermQuery(new Term("content", "blob")), firstCollector);
    distinctValuesCollector = CreateDistinctCountCollector(firstCollector, groupField, countField, dvType);
    indexSearcher.Search(new TermQuery(new Term("content", "blob")), distinctValuesCollector);

    // LUCENENET TODO: Try to work out how to do this without an O(n) operation
    //gcs = distinctValuesCollector.Groups as JCG.List<IGroupCount<IComparable>>;
    gcs = new JCG.List<AbstractDistinctValuesCollector.IGroupCount<IComparable>>(distinctValuesCollector.Groups);
    gcs.Sort(cmp);
    assertEquals(2, gcs.Count);

    Compare("1", gcs[0].GroupValue);
    countValues = new JCG.List<IComparable>(gcs[0].UniqueValues);
    // B/c the only one document matched with blob inside the author 1 group
    assertEquals(1, countValues.Count);
    Compare("1", countValues[0]);

    Compare("3", gcs[1].GroupValue);
    countValues = new JCG.List<IComparable>(gcs[1].UniqueValues);
    assertEquals(1, countValues.Count);
    Compare("1", countValues[0]);

    indexSearcher.IndexReader.Dispose();
    dir.Dispose();
}
/// <summary>
/// Dumps an <see cref="FST{T}"/> to a GraphViz's <c>dot</c> language description
/// for visualization. Example of use:
///
/// <code>
/// using (TextWriter sw = new StreamWriter("out.dot"))
/// {
///     Util.ToDot(fst, sw, true, true);
/// }
/// </code>
///
/// and then, from command line:
///
/// <code>
/// dot -Tpng -o out.png out.dot
/// </code>
///
/// <para/>
/// Note: larger FSTs (a few thousand nodes) won't even
/// render, don't bother. If the FST is > 2.1 GB in size
/// then this method will throw strange exceptions.
/// <para/>
/// See also <a href="http://www.graphviz.org/">http://www.graphviz.org/</a>.
/// </summary>
/// <param name="sameRank">
/// If <c>true</c>, the resulting <c>dot</c> file will try
/// to order states in layers of breadth-first traversal. This may
/// mess up arcs, but makes the output FST's structure a bit clearer.
/// </param>
/// <param name="labelStates">
/// If <c>true</c> states will have labels equal to their offsets in their
/// binary format. Expands the graph considerably.
/// </param>
public static void ToDot<T>(FST<T> fst, TextWriter @out, bool sameRank, bool labelStates)
{
    // Color for states whose target is stored "expanded" per fst.IsExpandedTarget
    // (see the per-state checks below).
    const string expandedNodeColor = "blue";

    // this is the start arc in the automaton (from the epsilon state to the first state
    // with outgoing transitions.
    FST.Arc<T> startArc = fst.GetFirstArc(new FST.Arc<T>());

    // A queue of transitions to consider for the next level.
    IList<FST.Arc<T>> thisLevelQueue = new JCG.List<FST.Arc<T>>();

    // A queue of transitions to consider when processing the next level.
    IList<FST.Arc<T>> nextLevelQueue = new JCG.List<FST.Arc<T>>();
    nextLevelQueue.Add(startArc);
    //System.out.println("toDot: startArc: " + startArc);

    // A list of states on the same level (for ranking).
    IList<int?> sameLevelStates = new JCG.List<int?>();

    // A bitset of already seen states (target offset).
    // NOTE(review): SafeSet/SafeGet presumably grow the BitArray on demand,
    // since targets routinely exceed the initial 32 bits — confirm against the
    // extension-method implementation.
    BitArray seen = new BitArray(32);
    seen.SafeSet((int)startArc.Target, true);

    // Shape for states.
    const string stateShape = "circle";
    const string finalStateShape = "doublecircle";

    // Emit DOT prologue.
    @out.Write("digraph FST {\n");
    @out.Write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");
    if (!labelStates)
    {
        @out.Write(" node [shape=circle, width=.2, height=.2, style=filled]\n");
    }

    EmitDotState(@out, "initial", "point", "white", "");

    T NO_OUTPUT = fst.Outputs.NoOutput;
    var r = fst.GetBytesReader();

    // final FST.Arc<T> scratchArc = new FST.Arc<>();

    // Emit the start state explicitly, including its final output (if any),
    // before walking the rest of the graph level by level.
    {
        string stateColor;
        if (fst.IsExpandedTarget(startArc, r))
        {
            stateColor = expandedNodeColor;
        }
        else
        {
            stateColor = null;
        }

        bool isFinal;
        T finalOutput;
        if (startArc.IsFinal)
        {
            isFinal = true;
            // NO_OUTPUT is rendered as an empty label, so blank it out here.
            finalOutput = startArc.NextFinalOutput.Equals(NO_OUTPUT) ? default(T) : startArc.NextFinalOutput;
        }
        else
        {
            isFinal = false;
            finalOutput = default(T);
        }

        EmitDotState(@out, Convert.ToString(startArc.Target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.Outputs.OutputToString(finalOutput));
    }

    @out.Write(" initial -> " + startArc.Target + "\n");

    int level = 0;

    // Breadth-first traversal: each pass over the outer loop emits one level
    // of states/transitions; newly discovered targets are deferred to
    // nextLevelQueue for the following pass.
    while (nextLevelQueue.Count > 0)
    {
        // we could double buffer here, but it doesn't matter probably.
        //System.out.println("next level=" + level);
        thisLevelQueue.AddRange(nextLevelQueue);
        nextLevelQueue.Clear();

        level++;
        @out.Write("\n // Transitions and states at level: " + level + "\n");
        while (thisLevelQueue.Count > 0)
        {
            // Pop from the end of the list (cheap removal, order within a
            // level does not matter for the rendered graph).
            FST.Arc<T> arc = thisLevelQueue[thisLevelQueue.Count - 1];
            thisLevelQueue.RemoveAt(thisLevelQueue.Count - 1);
            //System.out.println(" pop: " + arc);
            if (FST<T>.TargetHasArcs(arc))
            {
                // scan all target arcs
                //System.out.println(" readFirstTarget...");

                // Remember the source node: the reads below reuse 'arc' in
                // place while iterating that node's outgoing transitions.
                long node = arc.Target;

                fst.ReadFirstRealTargetArc(arc.Target, arc, r);

                //System.out.println(" firstTarget: " + arc);

                while (true)
                {
                    //System.out.println(" cycle arc=" + arc);
                    // Emit the unseen state and add it to the queue for the next level.
                    if (arc.Target >= 0 && !seen.SafeGet((int)arc.Target))
                    {
                        /*
                         * boolean isFinal = false;
                         * T finalOutput = null;
                         * fst.readFirstTargetArc(arc, scratchArc);
                         * if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) {
                         *   // target is final
                         *   isFinal = true;
                         *   finalOutput = scratchArc.output == NO_OUTPUT ? null : scratchArc.output;
                         *   System.out.println("dot hit final label=" + (char) scratchArc.label);
                         * }
                         */
                        string stateColor;
                        if (fst.IsExpandedTarget(arc, r))
                        {
                            stateColor = expandedNodeColor;
                        }
                        else
                        {
                            stateColor = null;
                        }

                        string finalOutput;
                        if (arc.NextFinalOutput != null && !arc.NextFinalOutput.Equals(NO_OUTPUT))
                        {
                            finalOutput = fst.Outputs.OutputToString(arc.NextFinalOutput);
                        }
                        else
                        {
                            finalOutput = "";
                        }

                        EmitDotState(@out, Convert.ToString(arc.Target), stateShape, stateColor, finalOutput);
                        // To see the node address, use this instead:
                        //emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target));
                        seen.SafeSet((int)arc.Target, true);
                        // Copy the arc before queueing: 'arc' itself is mutated
                        // by ReadNextRealArc below.
                        nextLevelQueue.Add((new FST.Arc<T>()).CopyFrom(arc));
                        sameLevelStates.Add((int)arc.Target);
                    }

                    // Edge label suffix: "/output" when the arc carries an output.
                    string outs;
                    if (!arc.Output.Equals(NO_OUTPUT))
                    {
                        outs = "/" + fst.Outputs.OutputToString(arc.Output);
                    }
                    else
                    {
                        outs = "";
                    }

                    if (!FST<T>.TargetHasArcs(arc) && arc.IsFinal && !arc.NextFinalOutput.Equals(NO_OUTPUT))
                    {
                        // Tricky special case: sometimes, due to
                        // pruning, the builder can [sillily] produce
                        // an FST with an arc into the final end state
                        // (-1) but also with a next final output; in
                        // this case we pull that output up onto this
                        // arc
                        outs = outs + "/[" + fst.Outputs.OutputToString(arc.NextFinalOutput) + "]";
                    }

                    // Highlight arcs whose target is the physically-next node
                    // in the byte array (BIT_TARGET_NEXT) in red.
                    string arcColor;
                    if (arc.Flag(FST.BIT_TARGET_NEXT))
                    {
                        arcColor = "red";
                    }
                    else
                    {
                        arcColor = "black";
                    }

                    Debug.Assert(arc.Label != FST.END_LABEL);
                    @out.Write(" " + node + " -> " + arc.Target + " [label=\"" + PrintableLabel(arc.Label) + outs + "\"" + (arc.IsFinal ? " style=\"bold\"" : "") + " color=\"" + arcColor + "\"]\n");

                    // Break the loop if we're on the last arc of this state.
                    if (arc.IsLast)
                    {
                        //System.out.println(" break");
                        break;
                    }
                    fst.ReadNextRealArc(arc, r);
                }
            }
        }

        // Emit state ranking information.
        if (sameRank && sameLevelStates.Count > 1)
        {
            @out.Write(" {rank=same; ");
            foreach (int state in sameLevelStates)
            {
                @out.Write(state + "; ");
            }
            @out.Write(" }\n");
        }
        sameLevelStates.Clear();
    }

    // Emit terminating state (always there anyway).
    @out.Write(" -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n");
    @out.Write(" {rank=sink; -1 }\n");

    @out.Write("}\n");
    @out.Flush();
}
/// <summary>
/// Verifies reader-closed-listener behavior when <see cref="IndexReader.Dispose()"/> fails:
/// whether the reader itself throws on close (<c>throwOnClose</c>) or a registered
/// <c>FaultyListener</c> throws, Dispose must surface an exception with the expected
/// message, the reader must afterwards report itself closed, a second Dispose must be
/// safe, and every registered <c>CountListener</c> must still have been notified
/// (the counter incremented per registration below returns to zero — presumably each
/// CountListener decrements it when fired; confirm against the listener class).
/// </summary>
public virtual void TestCloseUnderException()
{
    // LUCENENET: was Random.nextInt(20) (J2N extension); use Random.Next(20)
    // for consistency with the listenerCount draw below — same [0,20) range.
    int iters = 1000 + 1 + Random.Next(20);
    for (int j = 0; j < iters; j++)
    {
        Directory dir = NewDirectory();
        IndexWriter writer = new IndexWriter(dir, NewIndexWriterConfig(Random, TEST_VERSION_CURRENT, new MockAnalyzer(Random)));
        writer.Commit();
        writer.Dispose();
        DirectoryReader open = DirectoryReader.Open(dir);
        bool throwOnClose = !Rarely();
        AtomicReader wrap = SlowCompositeReaderWrapper.Wrap(open);
        FilterAtomicReader reader = new FilterAtomicReaderAnonymousClass(this, wrap, throwOnClose);
        // LUCENENET: removed an unused JCG.List<IndexReader.IReaderClosedListener>
        // local that was declared here but never read or passed anywhere.
        int listenerCount = Random.Next(20);
        AtomicInt32 count = new AtomicInt32();
        bool faultySet = false;
        for (int i = 0; i < listenerCount; i++)
        {
            if (Rarely())
            {
                faultySet = true;
                reader.AddReaderClosedListener(new FaultyListener());
            }
            else
            {
                // Balanced by the assertion at the bottom: count must be back to 0.
                count.IncrementAndGet();
                reader.AddReaderClosedListener(new CountListener(count));
            }
        }
        if (!faultySet && !throwOnClose)
        {
            // Guarantee at least one failure source so Dispose() below always throws.
            reader.AddReaderClosedListener(new FaultyListener());
        }
        try
        {
            reader.Dispose();
            Assert.Fail("expected Exception");
        }
        catch (Exception ex) when (ex.IsIllegalStateException())
        {
            if (throwOnClose)
            {
                // The reader's own DoClose failure wins over listener failures.
                Assert.AreEqual("BOOM!", ex.Message);
            }
            else
            {
                // Otherwise the FaultyListener's failure is surfaced.
                Assert.AreEqual("GRRRRRRRRRRRR!", ex.Message);
            }
        }
        try
        {
            // The reader must be closed even though Dispose() threw.
            var aaa = reader.Fields;
            Assert.Fail("we are closed");
        }
        catch (Exception ex) when (ex.IsAlreadyClosedException())
        {
            // expected
        }
        if (Random.NextBoolean())
        {
            reader.Dispose(); // call it again: double-Dispose must be a no-op
        }
        Assert.AreEqual(0, count);
        wrap.Dispose();
        dir.Dispose();
    }
}