/// <summary>
/// Builds the facet result that is expected for <paramref name="searchTerm"/>, computed
/// directly from the facet values and per-search-term group sets stored in
/// <paramref name="context"/>.
/// </summary>
/// <param name="searchTerm">Key used to look up the facet-value-to-groups map in the context.</param>
/// <param name="context">Holds the known facet values and the per-search-term facet groups.</param>
/// <param name="offset">Index of the first sorted entry to include in the result.</param>
/// <param name="limit">Maximum number of entries to include, starting at <paramref name="offset"/>.</param>
/// <param name="minCount">Entries with a count below this value are left out (they still add to the total count).</param>
/// <param name="orderByCount">If <c>true</c>, entries are ordered by descending count first, then by value; otherwise by value only.</param>
/// <param name="facetPrefix">If non-null, only facet values starting with this prefix (ordinal comparison) are considered, and the miss count stays 0.</param>
/// <returns>The expected total count, miss count and the requested window of facet entries.</returns>
private GroupedFacetResult CreateExpectedFacetResult(string searchTerm, IndexContext context, int offset, int limit, int minCount, bool orderByCount, string facetPrefix)
{
    // A search term without any recorded groups is treated like an empty map.
    if (!context.searchTermToFacetGroups.TryGetValue(searchTerm, out var groupsByFacet))
    {
        groupsByFacet = new JCG.Dictionary<string, ISet<string>>();
    }

    // Select the facet values to report: all of them, or only those matching the prefix.
    ISet<string> candidates;
    if (facetPrefix == null)
    {
        candidates = context.facetValues;
    }
    else
    {
        candidates = new JCG.HashSet<string>();
        foreach (string value in context.facetValues)
        {
            if (value == null || !value.StartsWith(facetPrefix, StringComparison.Ordinal))
            {
                continue;
            }

            candidates.add(value);
        }
    }

    int totalCount = 0;
    var entries = new JCG.List<TermGroupFacetCollector.FacetEntry>(groupsByFacet.size());

    // Facet values without any group are visited too; they simply count as 0.
    foreach (string value in candidates)
    {
        if (value == null)
        {
            continue;
        }

        int count = 0;
        if (groupsByFacet.TryGetValue(value, out ISet<string> groupsForValue) && groupsForValue != null)
        {
            count = groupsForValue.size();
        }

        totalCount += count;
        if (count >= minCount)
        {
            entries.Add(new TermGroupFacetCollector.FacetEntry(new BytesRef(facetValue: value), count));
        }
    }

    // The groups stored under the null key are only reported when no facet prefix is given.
    int totalMissCount = 0;
    if (facetPrefix == null && groupsByFacet.TryGetValue(null, out ISet<string> missingGroups) && missingGroups != null)
    {
        totalMissCount = missingGroups.size();
    }

    entries.Sort(Comparer<TermGroupFacetCollector.FacetEntry>.Create((x, y) =>
    {
        // Higher counts first when requested; ties (or value-only ordering) fall back to the value.
        int byCount = orderByCount ? y.Count - x.Count : 0;
        return byCount != 0 ? byCount : x.Value.CompareTo(y.Value);
    }));

    IList<TermGroupFacetCollector.FacetEntry> entriesResult;
    if (offset >= entries.size())
    {
        entriesResult = Collections.EmptyList<TermGroupFacetCollector.FacetEntry>();
    }
    else
    {
        // LUCENENET: GetView takes a length rather than an end index, so clamp the
        // window to the end of the list and convert it to a length.
        int endOffset = offset + limit;
        int length = endOffset >= entries.size() ? entries.size() - offset : endOffset - offset;
        entriesResult = entries.GetView(offset, length);
    }

    return new GroupedFacetResult(totalCount, totalMissCount, entriesResult);
}
/// <summary>
/// Randomized test: generates random documents over a small random vocabulary, builds a
/// <see cref="FreeTextSuggester"/> from them, and checks for many random queries that
/// its lookups match the results computed from a simple n-gram counting model built
/// in this method.
/// </summary>
public void TestRandom()
{
    // Pick a vocabulary of distinct random terms.
    string[] terms = new string[TestUtil.NextInt32(Random, 2, 10)];
    ISet<string> seen = new JCG.HashSet<string>();
    while (seen.size() < terms.Length)
    {
        string candidate = TestUtil.RandomSimpleString(Random, 1, 5);
        if (seen.contains(candidate))
        {
            continue;
        }

        terms[seen.size()] = candidate;
        seen.add(candidate);
    }

    Analyzer a = new MockAnalyzer(Random);

    // Generate the documents, remembering the total number of tokens.
    int numDocs = AtLeast(10);
    long totTokens = 0;
    string[][] docs = new string[numDocs][];
    for (int docIdx = 0; docIdx < numDocs; docIdx++)
    {
        string[] docTokens = new string[AtLeast(100)];
        docs[docIdx] = docTokens;
        if (Verbose)
        {
            Console.Write(" doc " + docIdx + ":");
        }

        for (int pos = 0; pos < docTokens.Length; pos++)
        {
            docTokens[pos] = GetZipfToken(terms);
            if (Verbose)
            {
                Console.Write(" " + docTokens[pos]);
            }
        }

        if (Verbose)
        {
            Console.WriteLine();
        }

        totTokens += docTokens.Length;
    }

    int grams = TestUtil.NextInt32(Random, 1, 4);
    if (Verbose)
    {
        Console.WriteLine("TEST: " + terms.Length + " terms; " + numDocs + " docs; " + grams + " grams");
    }

    // Build suggester model:
    FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte)0x20);
    sug.Build(new TestRandomInputEnumerator(docs));

    // Build inefficient but hopefully correct model: gramCounts[g] maps every
    // space-joined run of (g + 1) consecutive tokens to its number of occurrences.
    IList<IDictionary<string, int?>> gramCounts = new JCG.List<IDictionary<string, int?>>(grams);
    for (int gram = 0; gram < grams; gram++)
    {
        if (Verbose)
        {
            Console.WriteLine("TEST: build model for gram=" + gram);
        }

        IDictionary<string, int?> counts = new JCG.Dictionary<string, int?>();
        gramCounts.Add(counts);
        foreach (string[] doc in docs)
        {
            for (int start = 0; start < doc.Length - gram; start++)
            {
                string gramKey = string.Join(" ", doc, start, gram + 1);
                if (counts.TryGetValue(gramKey, out int? previous) && previous != null)
                {
                    counts.Put(gramKey, 1 + previous);
                }
                else
                {
                    counts.Put(gramKey, 1);
                }

                if (Verbose)
                {
                    string shown = "";
                    if (counts.TryGetValue(gramKey, out int? stored))
                    {
                        shown = stored.HasValue ? stored.ToString() : "null";
                    }

                    Console.WriteLine(" add '" + gramKey + "' -> count=" + shown);
                }
            }
        }
    }

    int lookups = AtLeast(100);
    for (int iter = 0; iter < lookups; iter++)
    {
        string[] tokens = new string[TestUtil.NextInt32(Random, 1, 5)];
        for (int k = 0; k < tokens.Length; k++)
        {
            tokens[k] = GetZipfToken(terms);
        }

        // Maybe trim last token; be sure not to create the
        // empty string (a single-token query keeps at least one character):
        int lastIdx = tokens.Length - 1;
        int trimStart = tokens.Length == 1 ? 1 : 0;
        int trimAt = TestUtil.NextInt32(Random, trimStart, tokens[lastIdx].Length);
        tokens[lastIdx] = tokens[lastIdx].Substring(0, trimAt);

        int num = TestUtil.NextInt32(Random, 1, 100);
        string query = string.Join(" ", tokens);

        if (Verbose)
        {
            Console.WriteLine("\nTEST: iter=" + iter + " query='" + query + "' num=" + num);
        }

        // Expected:
        JCG.List<Lookup.LookupResult> expected = new JCG.List<Lookup.LookupResult>();
        double backoff = 1.0;
        seen = new JCG.HashSet<string>();

        if (Verbose)
        {
            Console.WriteLine(" compute expected");
        }

        // Walk from the highest-order model down to unigrams, discounting by ALPHA each step.
        for (int order = grams - 1; order >= 0; order--)
        {
            if (Verbose)
            {
                Console.WriteLine(" grams=" + order);
            }

            if (tokens.Length < order + 1)
            {
                // Don't have enough tokens to use this model
                if (Verbose)
                {
                    Console.WriteLine(" skip");
                }

                continue;
            }

            if (order == 0 && tokens[lastIdx].Length == 0)
            {
                // Never suggest unigrams from empty string:
                if (Verbose)
                {
                    Console.WriteLine(" skip unigram priors only");
                }

                continue;
            }

            // Build up "context" ngram: the `order` tokens preceding the last one.
            string context = string.Join(" ", tokens, tokens.Length - order - 1, order);
            if (Verbose)
            {
                Console.WriteLine(" context='" + context + "'");
            }

            long contextCount;
            if (context.Length == 0)
            {
                contextCount = totTokens;
            }
            else
            {
                IDictionary<string, int?> contextModel = gramCounts[order - 1];
                if (!contextModel.TryGetValue(context, out int? seenContextCount) || seenContextCount == null)
                {
                    // We never saw this context:
                    backoff *= FreeTextSuggester.ALPHA;
                    if (Verbose)
                    {
                        Console.WriteLine(" skip: never saw context");
                    }

                    continue;
                }

                contextCount = seenContextCount.GetValueOrDefault();
            }

            if (Verbose)
            {
                Console.WriteLine(" contextCount=" + contextCount);
            }

            IDictionary<string, int?> orderModel = gramCounts[order];
            string prefix = tokens[lastIdx];

            // First pass, gather all predictions for this model:
            if (Verbose)
            {
                Console.WriteLine(" find terms w/ prefix=" + prefix);
            }

            JCG.List<Lookup.LookupResult> tmp = new JCG.List<Lookup.LookupResult>();
            foreach (string term in terms)
            {
                if (!term.StartsWith(prefix, StringComparison.Ordinal))
                {
                    continue;
                }

                if (Verbose)
                {
                    Console.WriteLine(" term=" + term);
                }

                if (seen.contains(term))
                {
                    if (Verbose)
                    {
                        Console.WriteLine(" skip seen");
                    }

                    continue;
                }

                string ngram = (context + " " + term).Trim();
                if (!orderModel.TryGetValue(ngram, out int? ngramCount) || ngramCount == null)
                {
                    continue;
                }

                // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                // return numbers that are greater than long.MaxValue, which results in a negative long number.
                // This is also the way it is being done in the FreeTextSuggester to work around the issue.
                // LUCENENET NOTE: The order of parentheses in the Java test didn't match the production code. This apparently doesn't affect the
                // result in Java, but does in .NET, so we changed the test to match the production code.
                Lookup.LookupResult lr = new Lookup.LookupResult(ngram, (long)(long.MaxValue * (decimal)backoff * ((decimal)ngramCount) / contextCount));
                tmp.Add(lr);
                if (Verbose)
                {
                    Console.WriteLine(" add tmp key='" + lr.Key + "' score=" + lr.Value);
                }
            }

            // Second pass, trim to only top N, and fold those
            // into overall suggestions:
            tmp.Sort(byScoreThenKey);
            if (tmp.size() > num)
            {
                tmp.RemoveRange(num, tmp.size() - num); // LUCENENET: Converted end index to length
            }

            foreach (Lookup.LookupResult result in tmp)
            {
                string key = result.Key.toString();
                int lastSpace = key.LastIndexOf(' ');
                string lastToken = lastSpace != -1 ? key.Substring(lastSpace + 1) : key;
                if (seen.contains(lastToken))
                {
                    continue;
                }

                seen.add(lastToken);
                expected.Add(result);
                if (Verbose)
                {
                    Console.WriteLine(" keep key='" + result.Key + "' score=" + result.Value);
                }
            }

            backoff *= FreeTextSuggester.ALPHA;
        }

        expected.Sort(byScoreThenKey);
        if (expected.size() > num)
        {
            expected.RemoveRange(num, expected.size() - num); // LUCENENET: Converted end index to length
        }

        // Actual:
        IList<Lookup.LookupResult> actual = sug.DoLookup(query, num);

        if (Verbose)
        {
            Console.WriteLine(" expected: " + expected);
            Console.WriteLine(" actual: " + actual);
        }

        assertEquals(expected.ToString(), actual.ToString());
    }
}
/// <summary>
/// Writes a GraphViz <c>dot</c> description of an <see cref="FST{T}"/> so that it
/// can be rendered for visualization. Example of use:
///
/// <code>
/// using (TextWriter sw = new StreamWriter("out.dot"))
/// {
///     Util.ToDot(fst, sw, true, true);
/// }
/// </code>
///
/// and then, from command line:
///
/// <code>
/// dot -Tpng -o out.png out.dot
/// </code>
///
/// <para/>
/// Note: larger FSTs (a few thousand nodes) won't even
/// render, don't bother. If the FST is &gt; 2.1 GB in size
/// then this method will throw strange exceptions (state addresses are cast to <c>int</c>).
/// <para/>
/// See also <a href="http://www.graphviz.org/">http://www.graphviz.org/</a>.
/// </summary>
/// <typeparam name="T">Output type of the FST.</typeparam>
/// <param name="fst">The FST to dump.</param>
/// <param name="out">Writer that receives the <c>dot</c> text; it is flushed but not closed.</param>
/// <param name="sameRank">
/// If <c>true</c>, the resulting <c>dot</c> file will try
/// to order states in layers of breadth-first traversal. This may
/// mess up arcs, but makes the output FST's structure a bit clearer.
/// </param>
/// <param name="labelStates">
/// If <c>true</c> states will have labels equal to their offsets in their
/// binary format. Expands the graph considerably.
/// </param>
public static void ToDot<T>(FST<T> fst, TextWriter @out, bool sameRank, bool labelStates)
{
    const string expandedNodeColor = "blue";

    // Shapes for regular and final states.
    const string stateShape = "circle";
    const string finalStateShape = "doublecircle";

    // The start arc of the automaton (from the epsilon state to the first state
    // with outgoing transitions).
    FST.Arc<T> startArc = fst.GetFirstArc(new FST.Arc<T>());

    // Arcs being expanded at the current level.
    IList<FST.Arc<T>> thisLevelQueue = new JCG.List<FST.Arc<T>>();

    // Arcs collected for the following level.
    IList<FST.Arc<T>> nextLevelQueue = new JCG.List<FST.Arc<T>>();
    nextLevelQueue.Add(startArc);

    // States discovered while processing one level (used for ranking).
    IList<int?> sameLevelStates = new JCG.List<int?>();

    // Target offsets of the states that were already emitted.
    BitSet seen = new BitSet();
    seen.Set((int)startArc.Target);

    // Emit DOT prologue.
    @out.Write("digraph FST {\n");
    @out.Write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");

    if (!labelStates)
    {
        @out.Write(" node [shape=circle, width=.2, height=.2, style=filled]\n");
    }

    EmitDotState(@out, "initial", "point", "white", "");

    T NO_OUTPUT = fst.Outputs.NoOutput;
    var r = fst.GetBytesReader();

    // Emit the state the start arc points to, together with the edge leading into it.
    string startColor = fst.IsExpandedTarget(startArc, r) ? expandedNodeColor : null;
    bool startIsFinal = startArc.IsFinal;
    T startFinalOutput = (startIsFinal && !startArc.NextFinalOutput.Equals(NO_OUTPUT))
        ? startArc.NextFinalOutput
        : default(T);

    EmitDotState(
        @out,
        Convert.ToString(startArc.Target),
        startIsFinal ? finalStateShape : stateShape,
        startColor,
        startFinalOutput == null ? "" : fst.Outputs.OutputToString(startFinalOutput));

    @out.Write(" initial -> " + startArc.Target + "\n");

    int level = 0;

    while (nextLevelQueue.Count > 0)
    {
        // We could double buffer here, but it doesn't matter probably.
        thisLevelQueue.AddRange(nextLevelQueue);
        nextLevelQueue.Clear();

        level++;
        @out.Write("\n // Transitions and states at level: " + level + "\n");

        while (thisLevelQueue.Count > 0)
        {
            // Pop the most recently queued arc.
            int top = thisLevelQueue.Count - 1;
            FST.Arc<T> arc = thisLevelQueue[top];
            thisLevelQueue.RemoveAt(top);

            if (!FST<T>.TargetHasArcs(arc))
            {
                continue;
            }

            // Scan all arcs leaving the target state; `arc` is reused as the cursor.
            long node = arc.Target;
            fst.ReadFirstRealTargetArc(arc.Target, arc, r);

            while (true)
            {
                // Emit the unseen state and add it to the queue for the next level.
                if (arc.Target >= 0 && !seen.Get((int)arc.Target))
                {
                    string targetColor = fst.IsExpandedTarget(arc, r) ? expandedNodeColor : null;

                    string targetFinalOutput = (arc.NextFinalOutput != null && !arc.NextFinalOutput.Equals(NO_OUTPUT))
                        ? fst.Outputs.OutputToString(arc.NextFinalOutput)
                        : "";

                    // To see the node address, pass Convert.ToString(arc.Target) as the label instead.
                    EmitDotState(@out, Convert.ToString(arc.Target), stateShape, targetColor, targetFinalOutput);

                    seen.Set((int)arc.Target);
                    nextLevelQueue.Add((new FST.Arc<T>()).CopyFrom(arc));
                    sameLevelStates.Add((int)arc.Target);
                }

                string outs = !arc.Output.Equals(NO_OUTPUT)
                    ? "/" + fst.Outputs.OutputToString(arc.Output)
                    : "";

                if (!FST<T>.TargetHasArcs(arc) && arc.IsFinal && !arc.NextFinalOutput.Equals(NO_OUTPUT))
                {
                    // Tricky special case: sometimes, due to
                    // pruning, the builder can [sillily] produce
                    // an FST with an arc into the final end state
                    // (-1) but also with a next final output; in
                    // this case we pull that output up onto this
                    // arc
                    outs = outs + "/[" + fst.Outputs.OutputToString(arc.NextFinalOutput) + "]";
                }

                string arcColor = arc.Flag(FST.BIT_TARGET_NEXT) ? "red" : "black";

                Debug.Assert(arc.Label != FST.END_LABEL);
                @out.Write(" " + node + " -> " + arc.Target + " [label=\"" + PrintableLabel(arc.Label) + outs + "\"" + (arc.IsFinal ? " style=\"bold\"" : "") + " color=\"" + arcColor + "\"]\n");

                // Break the loop if we're on the last arc of this state.
                if (arc.IsLast)
                {
                    break;
                }

                fst.ReadNextRealArc(arc, r);
            }
        }

        // Emit state ranking information.
        if (sameRank && sameLevelStates.Count > 1)
        {
            @out.Write(" {rank=same; ");
            foreach (int state in sameLevelStates)
            {
                @out.Write(state + "; ");
            }

            @out.Write(" }\n");
        }

        sameLevelStates.Clear();
    }

    // Emit terminating state (always there anyway).
    @out.Write(" -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n");
    @out.Write(" {rank=sink; -1 }\n");

    @out.Write("}\n");
    @out.Flush();
}
/// <summary>
/// Detects repetition groups among the repeating phrase positions and records each
/// member's group index in its <c>rptGroup</c> field. Done once - for first doc.
/// </summary>
/// <param name="rptTerms">The repeating terms, as passed on to the helper methods that compute the groups.</param>
/// <returns>One list of phrase positions per detected repetition group.</returns>
private IList<IList<PhrasePositions>> GatherRptGroups(JCG.LinkedDictionary<Term, int> rptTerms)
{
    PhrasePositions[] repeating = RepeatingPPs(rptTerms);
    IList<IList<PhrasePositions>> groups = new JCG.List<IList<PhrasePositions>>();

    if (hasMultiTermRpts)
    {
        // More involved - has multi-terms: group the PPs through the term groups
        // computed from the per-PP term bit sets.
        IList<JCG.HashSet<PhrasePositions>> members = new JCG.List<JCG.HashSet<PhrasePositions>>();
        IList<FixedBitSet> termBits = PpTermsBitSets(repeating, rptTerms);
        UnionTermGroups(termBits);
        IDictionary<Term, int> groupOfTerm = TermGroups(rptTerms, termBits);

        // One (initially empty) member set per distinct group id.
        JCG.HashSet<int> distinctGroupIDs = new JCG.HashSet<int>(groupOfTerm.Values);
        for (int n = 0; n < distinctGroupIDs.Count; n++)
        {
            members.Add(new JCG.HashSet<PhrasePositions>());
        }

        foreach (PhrasePositions candidate in repeating)
        {
            foreach (Term term in candidate.terms)
            {
                if (!rptTerms.ContainsKey(term))
                {
                    continue;
                }

                int groupId = groupOfTerm[term];
                members[groupId].Add(candidate);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(candidate.rptGroup == -1 || candidate.rptGroup == groupId);
                }

                candidate.rptGroup = groupId;
            }
        }

        foreach (JCG.HashSet<PhrasePositions> memberSet in members)
        {
            groups.Add(new JCG.List<PhrasePositions>(memberSet));
        }

        return groups;
    }

    // Simpler - no multi-terms - can base on positions in first doc.
    for (int first = 0; first < repeating.Length; first++)
    {
        PhrasePositions anchor = repeating[first];
        if (anchor.rptGroup >= 0)
        {
            // Already marked as a repetition.
            continue;
        }

        int anchorPos = TpPos(anchor);
        for (int second = first + 1; second < repeating.Length; second++)
        {
            PhrasePositions other = repeating[second];

            if (other.rptGroup >= 0)
            {
                // Already marked as a repetition.
                continue;
            }

            if (other.offset == anchor.offset)
            {
                // Not a repetition: the two PPs are originally in the same offset in the query!
                continue;
            }

            if (TpPos(other) != anchorPos)
            {
                // Not a repetition.
                continue;
            }

            // A repetition: open a new group for the anchor on its first match.
            if (anchor.rptGroup < 0)
            {
                anchor.rptGroup = groups.Count;
                groups.Add(new JCG.List<PhrasePositions>(2) { anchor });
            }

            int group = anchor.rptGroup;
            other.rptGroup = group;
            groups[group].Add(other);
        }
    }

    return groups;
}