public virtual void TestRead1waySynonymRules()
{
    SlowSynonymMap synMap;

    // (a)->[a]
    // (b)->[a]
    IList<string> rules = new JCG.List<string>();
    rules.Add("a,b");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", false, null);
    assertEquals(2, synMap.Submap.size());
    AssertTokIncludes(synMap, "a", "a");
    AssertTokIncludes(synMap, "b", "a");

    // (a)->[a]
    // (b)->[a]
    // (c)->[a]
    rules.Clear();
    rules.Add("a,b,c");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", false, null);
    assertEquals(3, synMap.Submap.size());
    AssertTokIncludes(synMap, "a", "a");
    AssertTokIncludes(synMap, "b", "a");
    AssertTokIncludes(synMap, "c", "a");

    // (a)->[a]
    // (b1)->(b2)->[a]
    rules.Clear();
    rules.Add("a,b1 b2");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", false, null);
    assertEquals(2, synMap.Submap.size());
    AssertTokIncludes(synMap, "a", "a");
    assertEquals(1, GetSubSynonymMap(synMap, "b1").Submap.size());
    AssertTokIncludes(GetSubSynonymMap(synMap, "b1"), "b2", "a");

    // (a1)->(a2)->[a1][a2]
    // (b)->[a1][a2]
    rules.Clear();
    rules.Add("a1 a2,b");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", false, null);
    assertEquals(2, synMap.Submap.size());
    assertEquals(1, GetSubSynonymMap(synMap, "a1").Submap.size());
    AssertTokIncludes(GetSubSynonymMap(synMap, "a1"), "a2", "a1");
    AssertTokIncludes(GetSubSynonymMap(synMap, "a1"), "a2", "a2");
    AssertTokIncludes(synMap, "b", "a1");
    AssertTokIncludes(synMap, "b", "a2");
}
public void TestSearchPhraseSlop()
{
    // "a b c"~0
    Query query = pqF("a", "b", "c");

    // phraseHighlight = true, fieldMatch = true
    FieldQuery fq = new FieldQuery(query, true, true);

    // "a b c" w/ position-gap = 2
    IList<TermInfo> phraseCandidate = new JCG.List<TermInfo>();
    phraseCandidate.Add(new TermInfo("a", 0, 1, 0, 1));
    phraseCandidate.Add(new TermInfo("b", 2, 3, 2, 1));
    phraseCandidate.Add(new TermInfo("c", 4, 5, 4, 1));
    assertNull(fq.SearchPhrase(F, phraseCandidate));

    // "a b c"~1
    query = pqF(1F, 1, "a", "b", "c");

    // phraseHighlight = true, fieldMatch = true
    fq = new FieldQuery(query, true, true);

    // "a b c" w/ position-gap = 2
    assertNotNull(fq.SearchPhrase(F, phraseCandidate));

    // "a b c" w/ position-gap = 3
    phraseCandidate.Clear();
    phraseCandidate.Add(new TermInfo("a", 0, 1, 0, 1));
    phraseCandidate.Add(new TermInfo("b", 2, 3, 3, 1));
    phraseCandidate.Add(new TermInfo("c", 4, 5, 6, 1));
    assertNull(fq.SearchPhrase(F, phraseCandidate));
}
/// <summary>Resets the stem and tag accumulators and hands over to the superclass.</summary>
public override void Reset()
{
    lemmaListIndex = 0;
    lemmaList = new List<WordData>();
    tagsList.Clear();
    base.Reset();
}
public virtual void Clear()
{
    lock (this)
    {
        updates.Clear();
        nextGen = 1;
        numTerms.Value = 0;
        bytesUsed.Value = 0;
    }
}
/// <summary>
/// Move to the next match, returning true iff any such exists.
/// </summary>
public override bool MoveNext()
{
    if (firstTime)
    {
        firstTime = false;
        for (int i = 0; i < subSpans.Length; i++)
        {
            if (!subSpans[i].MoveNext())
            {
                more = false;
                return false;
            }
        }
        more = true;
    }
    if (collectPayloads)
    {
        matchPayload.Clear();
    }
    return AdvanceAfterOrdered();
}
public override bool IncrementToken()
{
    if (buffer != null && buffer.Count > 0)
    {
        CharsRef nextStem = buffer[0];
        buffer.RemoveAt(0);
        RestoreState(savedState);
        posIncAtt.PositionIncrement = 0;
        termAtt.SetEmpty().Append(nextStem);
        return true;
    }

    if (!m_input.IncrementToken())
    {
        return false;
    }

    if (keywordAtt.IsKeyword)
    {
        return true;
    }

    buffer = new JCG.List<CharsRef>(dedup
        ? stemmer.UniqueStems(termAtt.Buffer, termAtt.Length)
        : stemmer.Stem(termAtt.Buffer, termAtt.Length));

    if (buffer.Count == 0) // we do not know this word, return it unchanged
    {
        return true;
    }

    if (longestOnly && buffer.Count > 1)
    {
        buffer.Sort(lengthComparer);
    }

    CharsRef stem = buffer[0];
    buffer.RemoveAt(0);
    termAtt.SetEmpty().Append(stem);

    if (longestOnly)
    {
        buffer.Clear();
    }
    else
    {
        if (buffer.Count > 0)
        {
            savedState = CaptureState();
        }
    }

    return true;
}
public virtual void Clear()
{
    UninterruptableMonitor.Enter(this);
    try
    {
        updates.Clear();
        nextGen = 1;
        numTerms.Value = 0;
        bytesUsed.Value = 0;
    }
    finally
    {
        UninterruptableMonitor.Exit(this);
    }
}
public virtual void TestAppendIterator()
{
    Random random = Random;
    BytesRefArray list = new BytesRefArray(Util.Counter.NewCounter());
    IList<string> stringList = new JCG.List<string>();
    for (int j = 0; j < 2; j++)
    {
        if (j > 0 && random.NextBoolean())
        {
            list.Clear();
            stringList.Clear();
        }
        int entries = AtLeast(500);
        BytesRef spare = new BytesRef();
        int initSize = list.Length;
        for (int i = 0; i < entries; i++)
        {
            string randomRealisticUnicodeString = TestUtil.RandomRealisticUnicodeString(random);
            spare.CopyChars(randomRealisticUnicodeString);
            Assert.AreEqual(i + initSize, list.Append(spare));
            stringList.Add(randomRealisticUnicodeString);
        }
        for (int i = 0; i < entries; i++)
        {
            Assert.IsNotNull(list.Get(spare, i));
            Assert.AreEqual(stringList[i], spare.Utf8ToString(), "entry " + i + " doesn't match");
        }

        // check random
        for (int i = 0; i < entries; i++)
        {
            int e = random.Next(entries);
            Assert.IsNotNull(list.Get(spare, e));
            Assert.AreEqual(stringList[e], spare.Utf8ToString(), "entry " + i + " doesn't match");
        }

        for (int i = 0; i < 2; i++)
        {
            IBytesRefEnumerator iterator = list.GetEnumerator();
            foreach (string @string in stringList)
            {
                Assert.IsTrue(iterator.MoveNext());
                Assert.AreEqual(@string, iterator.Current.Utf8ToString());
            }
        }
    }
}
public void TestSearchPhrase()
{
    Query query = pqF("a", "b", "c");

    // phraseHighlight = true, fieldMatch = true
    FieldQuery fq = new FieldQuery(query, true, true);

    // "a"
    IList<TermInfo> phraseCandidate = new JCG.List<TermInfo>();
    phraseCandidate.Add(new TermInfo("a", 0, 1, 0, 1));
    assertNull(fq.SearchPhrase(F, phraseCandidate));
    // "a b"
    phraseCandidate.Add(new TermInfo("b", 2, 3, 1, 1));
    assertNull(fq.SearchPhrase(F, phraseCandidate));
    // "a b c"
    phraseCandidate.Add(new TermInfo("c", 4, 5, 2, 1));
    assertNotNull(fq.SearchPhrase(F, phraseCandidate));
    assertNull(fq.SearchPhrase("x", phraseCandidate));

    // phraseHighlight = true, fieldMatch = false
    fq = new FieldQuery(query, true, false);
    // "a b c"
    assertNotNull(fq.SearchPhrase(F, phraseCandidate));
    assertNotNull(fq.SearchPhrase("x", phraseCandidate));

    // phraseHighlight = false, fieldMatch = true
    fq = new FieldQuery(query, false, true);
    // "a"
    phraseCandidate.Clear();
    phraseCandidate.Add(new TermInfo("a", 0, 1, 0, 1));
    assertNotNull(fq.SearchPhrase(F, phraseCandidate));
    // "a b"
    phraseCandidate.Add(new TermInfo("b", 2, 3, 1, 1));
    assertNull(fq.SearchPhrase(F, phraseCandidate));
    // "a b c"
    phraseCandidate.Add(new TermInfo("c", 4, 5, 2, 1));
    assertNotNull(fq.SearchPhrase(F, phraseCandidate));
    assertNull(fq.SearchPhrase("x", phraseCandidate));
}
public virtual void TestSortIterator()
{
    Random random = Random;
    BytesRefArray list = new BytesRefArray(Util.Counter.NewCounter());
    IList<string> stringList = new JCG.List<string>();
    for (int j = 0; j < 2; j++)
    {
        if (j > 0 && random.NextBoolean())
        {
            list.Clear();
            stringList.Clear();
        }
        int entries = AtLeast(500);
        BytesRef spare = new BytesRef();
        int initSize = list.Length;
        for (int i = 0; i < entries; i++)
        {
            string randomRealisticUnicodeString = TestUtil.RandomRealisticUnicodeString(random);
            spare.CopyChars(randomRealisticUnicodeString);
            Assert.AreEqual(initSize + i, list.Append(spare));
            stringList.Add(randomRealisticUnicodeString);
        }

        // LUCENENET NOTE: Must sort using ArrayUtil.GetNaturalComparator<T>()
        // to ensure culture isn't taken into consideration during the sort,
        // which will match the sort order of BytesRef.UTF8SortedAsUTF16Comparer.
        CollectionUtil.TimSort(stringList);

#pragma warning disable 612, 618
        IBytesRefIterator iter = list.GetIterator(BytesRef.UTF8SortedAsUTF16Comparer);
#pragma warning restore 612, 618
        int a = 0;
        while ((spare = iter.Next()) != null)
        {
            Assert.AreEqual(stringList[a], spare.Utf8ToString(), "entry " + a + " doesn't match");
            a++;
        }
        Assert.IsNull(iter.Next());
        Assert.AreEqual(a, stringList.Count);
    }
}
/// <summary>
/// Perform the actual DM Soundex algorithm on the input string.
/// </summary>
/// <param name="source">A string to encode.</param>
/// <param name="branching">If branching shall be performed.</param>
/// <returns>A string array containing all DM Soundex codes corresponding to the string supplied, depending on the selected branching mode.</returns>
/// <exception cref="ArgumentException">If a character is not mapped.</exception>
private string[] GetSoundex(string source, bool branching)
{
    if (source == null)
    {
        return null;
    }

    string input = Cleanup(source);

    // LinkedHashSet preserves input order. In .NET we can use List for that purpose.
    IList<Branch> currentBranches = new JCG.List<Branch> { new Branch() };

    char lastChar = '\0';
    for (int index = 0; index < input.Length; index++)
    {
        char ch = input[index];

        // ignore whitespace inside a name
        if (char.IsWhiteSpace(ch))
        {
            continue;
        }

        string inputContext = input.Substring(index);
        if (!RULES.TryGetValue(ch, out IList<Rule> rules) || rules == null)
        {
            continue;
        }

        // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
        IList<Branch> nextBranches = branching ? new JCG.List<Branch>() : Collections.EmptyList<Branch>() as IList<Branch>;

        foreach (Rule rule in rules)
        {
            if (rule.Matches(inputContext))
            {
                if (branching)
                {
                    nextBranches.Clear();
                }
                string[] replacements = rule.GetReplacements(inputContext, lastChar == '\0');
                bool branchingRequired = replacements.Length > 1 && branching;

                foreach (Branch branch in currentBranches)
                {
                    foreach (string nextReplacement in replacements)
                    {
                        // if we have multiple replacements, always create a new branch
                        Branch nextBranch = branchingRequired ? branch.CreateBranch() : branch;

                        // special rule: occurrences of mn or nm are treated differently
                        bool force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');
                        nextBranch.ProcessNextReplacement(nextReplacement, force);

                        if (branching)
                        {
                            if (!nextBranches.Contains(nextBranch))
                            {
                                nextBranches.Add(nextBranch);
                            }
                        }
                        else
                        {
                            break;
                        }
                    }
                }

                if (branching)
                {
                    currentBranches.Clear();
                    currentBranches.AddRange(nextBranches);
                }
                index += rule.PatternLength - 1;
                break;
            }
        }

        lastChar = ch;
    }

    string[] result = new string[currentBranches.Count];
    int idx = 0;
    foreach (Branch branch in currentBranches)
    {
        branch.Finish();
        result[idx++] = branch.ToString();
    }

    return result;
}
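// For orientation, a minimal usage sketch of the encoder that wraps this routine. It assumes the
// surrounding class is DaitchMokotoffSoundex (Lucene.Net.Analysis.Phonetic.Language) and that its
// public Encode(string) overload takes the non-branching path of GetSoundex; the input is illustrative.
using Lucene.Net.Analysis.Phonetic.Language;

var soundex = new DaitchMokotoffSoundex();
// Non-branching encoding produces a single six-digit DM Soundex code for the input.
string code = soundex.Encode("Peters");
System.Console.WriteLine(code); // prints a six-digit DM Soundex code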
/// <summary>
/// Returns <c>true</c> if the given string is accepted by the automaton.
/// <para/>
/// Complexity: linear in the length of the string.
/// <para/>
/// <b>Note:</b> for full performance, use the <see cref="RunAutomaton"/> class.
/// </summary>
public static bool Run(Automaton a, string s)
{
    if (a.IsSingleton)
    {
        return s.Equals(a.singleton, StringComparison.Ordinal);
    }
    if (a.deterministic)
    {
        State p = a.initial;
        int cp; // LUCENENET: Removed unnecessary assignment
        for (int i = 0; i < s.Length; i += Character.CharCount(cp))
        {
            State q = p.Step(cp = Character.CodePointAt(s, i));
            if (q == null)
            {
                return false;
            }
            p = q;
        }
        return p.accept;
    }
    else
    {
        State[] states = a.GetNumberedStates();
        LinkedList<State> pp = new LinkedList<State>();
        LinkedList<State> pp_other = new LinkedList<State>();
        OpenBitSet bb = new OpenBitSet(states.Length);
        OpenBitSet bb_other = new OpenBitSet(states.Length);
        pp.AddLast(a.initial);
        JCG.List<State> dest = new JCG.List<State>();
        bool accept = a.initial.accept;
        int c; // LUCENENET: Removed unnecessary assignment
        for (int i = 0; i < s.Length; i += Character.CharCount(c))
        {
            c = Character.CodePointAt(s, i);
            accept = false;
            pp_other.Clear();
            bb_other.Clear(0, bb_other.Length - 1);
            foreach (State p in pp)
            {
                dest.Clear();
                p.Step(c, dest);
                foreach (State q in dest)
                {
                    if (q.accept)
                    {
                        accept = true;
                    }
                    if (!bb_other.Get(q.number))
                    {
                        bb_other.Set(q.number);
                        pp_other.AddLast(q);
                    }
                }
            }
            LinkedList<State> tp = pp;
            pp = pp_other;
            pp_other = tp;
            OpenBitSet tb = bb;
            bb = bb_other;
            bb_other = tb;
        }
        return accept;
    }
}
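// A short usage sketch, assuming this method is Lucene.Net.Util.Automaton.BasicOperations.Run (as in
// the upstream port) and that RegExp.ToAutomaton() is available to build a small automaton to test.
using Lucene.Net.Util.Automaton;

Automaton a = new RegExp("ab+c").ToAutomaton();
bool hit = BasicOperations.Run(a, "abbc");  // true: "abbc" is accepted by ab+c
bool miss = BasicOperations.Run(a, "ac");   // false: at least one 'b' is required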
protected override IQueryNode PostProcessNode(IQueryNode node) { if (node is ITextableQueryNode && !(node is WildcardQueryNode) && !(node is FuzzyQueryNode) && !(node is RegexpQueryNode) && !(node.Parent is IRangeQueryNode)) { FieldQueryNode fieldNode = ((FieldQueryNode)node); string text = fieldNode.GetTextAsString(); string field = fieldNode.GetFieldAsString(); CachingTokenFilter buffer = null; IPositionIncrementAttribute posIncrAtt = null; int numTokens = 0; int positionCount = 0; bool severalTokensAtSamePosition = false; TokenStream source = null; try { source = this.analyzer.GetTokenStream(field, text); source.Reset(); buffer = new CachingTokenFilter(source); if (buffer.HasAttribute <IPositionIncrementAttribute>()) { posIncrAtt = buffer.GetAttribute <IPositionIncrementAttribute>(); } try { while (buffer.IncrementToken()) { numTokens++; int positionIncrement = (posIncrAtt != null) ? posIncrAtt .PositionIncrement : 1; if (positionIncrement != 0) { positionCount += positionIncrement; } else { severalTokensAtSamePosition = true; } } } catch (Exception e) when(e.IsIOException()) { // ignore } } catch (Exception e) when(e.IsIOException()) { throw RuntimeException.Create(e); } finally { IOUtils.DisposeWhileHandlingException(source); } // rewind the buffer stream buffer.Reset(); if (!buffer.HasAttribute <ICharTermAttribute>()) { return(new NoTokenFoundQueryNode()); } ICharTermAttribute termAtt = buffer.GetAttribute <ICharTermAttribute>(); if (numTokens == 0) { return(new NoTokenFoundQueryNode()); } else if (numTokens == 1) { string term = null; try { bool hasNext; hasNext = buffer.IncrementToken(); if (Debugging.AssertsEnabled) { Debugging.Assert(hasNext == true); } term = termAtt.ToString(); } catch (Exception e) when(e.IsIOException()) { // safe to ignore, because we know the number of tokens } fieldNode.Text = term.AsCharSequence(); return(fieldNode); } else if (severalTokensAtSamePosition || !(node is QuotedFieldQueryNode)) { if (positionCount == 1 || !(node is QuotedFieldQueryNode)) { // no phrase query: if (positionCount == 1) { // simple case: only one position, with synonyms IList <IQueryNode> children = new JCG.List <IQueryNode>(); for (int i = 0; i < numTokens; i++) { string term = null; try { bool hasNext = buffer.IncrementToken(); if (Debugging.AssertsEnabled) { Debugging.Assert(hasNext == true); } term = termAtt.ToString(); } catch (Exception e) when(e.IsIOException()) { // safe to ignore, because we know the number of tokens } children.Add(new FieldQueryNode(field, term, -1, -1)); } return(new GroupQueryNode( new StandardBooleanQueryNode(children, positionCount == 1))); } else { // multiple positions IQueryNode q = new StandardBooleanQueryNode(Collections.EmptyList <IQueryNode>(), false); IQueryNode currentQuery = null; for (int i = 0; i < numTokens; i++) { string term = null; try { bool hasNext = buffer.IncrementToken(); if (Debugging.AssertsEnabled) { Debugging.Assert(hasNext == true); } term = termAtt.ToString(); } catch (Exception e) when(e.IsIOException()) { // safe to ignore, because we know the number of tokens } if (posIncrAtt != null && posIncrAtt.PositionIncrement == 0) { if (!(currentQuery is BooleanQueryNode)) { IQueryNode t = currentQuery; currentQuery = new StandardBooleanQueryNode(Collections.EmptyList <IQueryNode>(), true); ((BooleanQueryNode)currentQuery).Add(t); } ((BooleanQueryNode)currentQuery).Add(new FieldQueryNode(field, term, -1, -1)); } else { if (currentQuery != null) { if (this.defaultOperator == Operator.OR) { q.Add(currentQuery); } else { q.Add(new 
ModifierQueryNode(currentQuery, Modifier.MOD_REQ)); } } currentQuery = new FieldQueryNode(field, term, -1, -1); } } if (this.defaultOperator == Operator.OR) { q.Add(currentQuery); } else { q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ)); } if (q is BooleanQueryNode) { q = new GroupQueryNode(q); } return(q); } } else { // phrase query: MultiPhraseQueryNode mpq = new MultiPhraseQueryNode(); IList <FieldQueryNode> multiTerms = new JCG.List <FieldQueryNode>(); int position = -1; int i = 0; int termGroupCount = 0; for (; i < numTokens; i++) { string term = null; int positionIncrement = 1; try { bool hasNext = buffer.IncrementToken(); if (Debugging.AssertsEnabled) { Debugging.Assert(hasNext == true); } term = termAtt.ToString(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.PositionIncrement; } } catch (Exception e) when(e.IsIOException()) { // safe to ignore, because we know the number of tokens } if (positionIncrement > 0 && multiTerms.Count > 0) { foreach (FieldQueryNode termNode in multiTerms) { if (this.positionIncrementsEnabled) { termNode.PositionIncrement = position; } else { termNode.PositionIncrement = termGroupCount; } mpq.Add(termNode); } // Only increment once for each "group" of // terms that were in the same position: termGroupCount++; multiTerms.Clear(); } position += positionIncrement; multiTerms.Add(new FieldQueryNode(field, term, -1, -1)); } foreach (FieldQueryNode termNode in multiTerms) { if (this.positionIncrementsEnabled) { termNode.PositionIncrement = position; } else { termNode.PositionIncrement = termGroupCount; } mpq.Add(termNode); } return(mpq); } } else { TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode(); int position = -1; for (int i = 0; i < numTokens; i++) { string term = null; int positionIncrement = 1; try { bool hasNext = buffer.IncrementToken(); if (Debugging.AssertsEnabled) { Debugging.Assert(hasNext == true); } term = termAtt.ToString(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.PositionIncrement; } } catch (Exception e) when(e.IsIOException()) { // safe to ignore, because we know the number of tokens } FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1); if (this.positionIncrementsEnabled) { position += positionIncrement; newFieldNode.PositionIncrement = position; } else { newFieldNode.PositionIncrement = i; } pq.Add(newFieldNode); } return(pq); } } return(node); }
/// <summary>
/// Sort input to output, explicit hint for the buffer size. The amount of allocated
/// memory may deviate from the hint (may be smaller or larger).
/// </summary>
public SortInfo Sort(FileInfo input, FileInfo output)
{
    sortInfo = new SortInfo(this) { TotalTime = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond }; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results

    output.Delete();

    var merges = new JCG.List<FileInfo>();
    bool success2 = false;
    try
    {
        var inputStream = new ByteSequencesReader(input);
        bool success = false;
        try
        {
            int lines = 0;
            while ((lines = ReadPartition(inputStream)) > 0)
            {
                merges.Add(SortPartition(/*lines*/)); // LUCENENET specific - removed unused parameter
                sortInfo.TempMergeFiles++;
                sortInfo.Lines += lines;

                // Handle intermediate merges.
                if (merges.Count == maxTempFiles)
                {
                    var intermediate = new FileInfo(Path.GetTempFileName());
                    try
                    {
                        MergePartitions(merges, intermediate);
                    }
                    finally
                    {
                        foreach (var file in merges)
                        {
                            file.Delete();
                        }
                        merges.Clear();
                        merges.Add(intermediate);
                    }
                    sortInfo.TempMergeFiles++;
                }
            }
            success = true;
        }
        finally
        {
            if (success)
            {
                IOUtils.Dispose(inputStream);
            }
            else
            {
                IOUtils.DisposeWhileHandlingException(inputStream);
            }
        }

        // One partition, try to rename or copy if unsuccessful.
        if (merges.Count == 1)
        {
            FileInfo single = merges[0];
            Copy(single, output);
            try
            {
                File.Delete(single.FullName);
            }
#pragma warning disable CA1031 // Do not catch general exception types
            catch
            {
                // ignored
            }
#pragma warning restore CA1031 // Do not catch general exception types
        }
        else
        {
            // otherwise merge the partitions with a priority queue.
            MergePartitions(merges, output);
        }
        success2 = true;
    }
    finally
    {
        foreach (FileInfo file in merges)
        {
            file.Delete();
        }
        if (!success2)
        {
            output.Delete();
        }
    }

    sortInfo.TotalTime = (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) - sortInfo.TotalTime; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
    return sortInfo;
}
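// A minimal calling sketch, assuming the surrounding class is Lucene.Net.Util.OfflineSorter and that
// the input file was previously written with its ByteSequencesWriter; the file paths are placeholders.
using System.IO;
using Lucene.Net.Util;

var input = new FileInfo(Path.GetTempFileName());   // filled elsewhere via OfflineSorter.ByteSequencesWriter
var output = new FileInfo(Path.GetTempFileName());
OfflineSorter.SortInfo info = new OfflineSorter().Sort(input, output);
System.Console.WriteLine($"sorted {info.Lines} entries using {info.TempMergeFiles} temp merge files");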
/// <summary>
/// a constructor.
/// </summary>
/// <param name="fieldTermStack"><see cref="FieldTermStack"/> object</param>
/// <param name="fieldQuery"><see cref="FieldQuery"/> object</param>
/// <param name="phraseLimit">maximum size of phraseList</param>
public FieldPhraseList(FieldTermStack fieldTermStack, FieldQuery fieldQuery, int phraseLimit)
{
    string field = fieldTermStack.FieldName;

    IList<TermInfo> phraseCandidate = new JCG.List<TermInfo>();
    QueryPhraseMap currMap; // LUCENENET: IDE0059: Remove unnecessary value assignment
    QueryPhraseMap nextMap; // LUCENENET: IDE0059: Remove unnecessary value assignment
    while (!fieldTermStack.IsEmpty && (phraseList.Count < phraseLimit))
    {
        phraseCandidate.Clear();

        TermInfo ti; // LUCENENET: IDE0059: Remove unnecessary value assignment
        TermInfo first; // LUCENENET: IDE0059: Remove unnecessary value assignment

        first = ti = fieldTermStack.Pop();
        currMap = fieldQuery.GetFieldTermMap(field, ti.Text);
        while (currMap is null && ti.Next != first)
        {
            ti = ti.Next;
            currMap = fieldQuery.GetFieldTermMap(field, ti.Text);
        }

        // if not found, discard top TermInfo from stack, then try next element
        if (currMap is null)
        {
            continue;
        }

        // if found, search the longest phrase
        phraseCandidate.Add(ti);
        while (true)
        {
            first = ti = fieldTermStack.Pop();
            nextMap = null;
            if (ti != null)
            {
                nextMap = currMap.GetTermMap(ti.Text);
                while (nextMap is null && ti.Next != first)
                {
                    ti = ti.Next;
                    nextMap = currMap.GetTermMap(ti.Text);
                }
            }
            if (ti is null || nextMap is null)
            {
                if (ti != null)
                {
                    fieldTermStack.Push(ti);
                }
                if (currMap.IsValidTermOrPhrase(phraseCandidate))
                {
                    AddIfNoOverlap(new WeightedPhraseInfo(phraseCandidate, currMap.Boost, currMap.TermOrPhraseNumber));
                }
                else
                {
                    while (phraseCandidate.Count > 1)
                    {
                        //fieldTermStack.Push(phraseCandidate.Last.Value);
                        //phraseCandidate.RemoveLast();
                        TermInfo last = phraseCandidate[phraseCandidate.Count - 1];
                        phraseCandidate.Remove(last);
                        fieldTermStack.Push(last);
                        currMap = fieldQuery.SearchPhrase(field, phraseCandidate);
                        if (currMap != null)
                        {
                            AddIfNoOverlap(new WeightedPhraseInfo(phraseCandidate, currMap.Boost, currMap.TermOrPhraseNumber));
                            break;
                        }
                    }
                }
                break;
            }
            else
            {
                // extend the candidate phrase with the matched term and descend into the phrase map
                phraseCandidate.Add(ti);
                currMap = nextMap;
            }
        }
    }
}
protected virtual FieldFragList CreateFieldFragList(FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize)
{
    // LUCENENET specific - added guard clauses to check for nulls
    if (fieldPhraseList is null)
    {
        throw new ArgumentNullException(nameof(fieldPhraseList));
    }
    if (fieldFragList is null)
    {
        throw new ArgumentNullException(nameof(fieldFragList));
    }

    if (fragCharSize < minFragCharSize)
    {
        throw new ArgumentOutOfRangeException(nameof(fragCharSize),
            "fragCharSize(" + fragCharSize + ") is too small. It must be " + minFragCharSize + " or higher."); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
    }

    JCG.List<WeightedPhraseInfo> wpil = new JCG.List<WeightedPhraseInfo>();
    using (IteratorQueue<WeightedPhraseInfo> queue = new IteratorQueue<WeightedPhraseInfo>(fieldPhraseList.PhraseList.GetEnumerator()))
    {
        WeightedPhraseInfo phraseInfo = null;
        int startOffset = 0;
        while ((phraseInfo = queue.Top()) != null)
        {
            // if the phrase violates the border of previous fragment, discard it and try next phrase
            if (phraseInfo.StartOffset < startOffset)
            {
                queue.RemoveTop();
                continue;
            }

            wpil.Clear();
            int currentPhraseStartOffset = phraseInfo.StartOffset;
            int currentPhraseEndOffset = phraseInfo.EndOffset;
            int spanStart = Math.Max(currentPhraseStartOffset - margin, startOffset);
            int spanEnd = Math.Max(currentPhraseEndOffset, spanStart + fragCharSize);
            if (AcceptPhrase(queue.RemoveTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize))
            {
                wpil.Add(phraseInfo);
            }
            while ((phraseInfo = queue.Top()) != null)
            {
                // pull until we crossed the current spanEnd
                if (phraseInfo.EndOffset <= spanEnd)
                {
                    currentPhraseEndOffset = phraseInfo.EndOffset;
                    if (AcceptPhrase(queue.RemoveTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize))
                    {
                        wpil.Add(phraseInfo);
                    }
                }
                else
                {
                    break;
                }
            }
            if (wpil.Count == 0)
            {
                continue;
            }

            int matchLen = currentPhraseEndOffset - currentPhraseStartOffset;
            // now recalculate the start and end position to "center" the result
            int newMargin = Math.Max(0, (fragCharSize - matchLen) / 2); // matchLen can be > fragCharSize prevent IAOOB here
            spanStart = currentPhraseStartOffset - newMargin;
            if (spanStart < startOffset)
            {
                spanStart = startOffset;
            }
            // whatever is bigger here we grow this out
            spanEnd = spanStart + Math.Max(matchLen, fragCharSize);
            startOffset = spanEnd;
            fieldFragList.Add(spanStart, spanEnd, wpil);
        }
    }
    return fieldFragList;
}
/// <summary>
/// Merging constructor.
/// </summary>
/// <param name="toMerge"><see cref="FieldPhraseList"/>s to merge to build this one</param>
public FieldPhraseList(FieldPhraseList[] toMerge)
{
    // Merge all overlapping WeightedPhraseInfos
    // Step 1. Sort by startOffset, endOffset, and boost, in that order.
    IEnumerator<WeightedPhraseInfo>[] allInfos = new IEnumerator<WeightedPhraseInfo>[toMerge.Length];
    try
    {
        int index = 0;
        foreach (FieldPhraseList fplToMerge in toMerge)
        {
            allInfos[index++] = fplToMerge.phraseList.GetEnumerator();
        }
        using MergedEnumerator<WeightedPhraseInfo> itr = new MergedEnumerator<WeightedPhraseInfo>(false, allInfos);
        // Step 2. Walk the sorted list merging infos that overlap
        phraseList = new JCG.List<WeightedPhraseInfo>();
        if (!itr.MoveNext())
        {
            return;
        }
        IList<WeightedPhraseInfo> work = new JCG.List<WeightedPhraseInfo>();
        WeightedPhraseInfo first = itr.Current;
        work.Add(first);
        int workEndOffset = first.EndOffset;
        while (itr.MoveNext())
        {
            WeightedPhraseInfo current = itr.Current;
            if (current.StartOffset <= workEndOffset)
            {
                workEndOffset = Math.Max(workEndOffset, current.EndOffset);
                work.Add(current);
            }
            else
            {
                if (work.Count == 1)
                {
                    phraseList.Add(work[0]);
                    work[0] = current;
                }
                else
                {
                    phraseList.Add(new WeightedPhraseInfo(work));
                    work.Clear();
                    work.Add(current);
                }
                workEndOffset = current.EndOffset;
            }
        }
        if (work.Count == 1)
        {
            phraseList.Add(work[0]);
        }
        else
        {
            phraseList.Add(new WeightedPhraseInfo(work));
            work.Clear();
        }
    }
    finally
    {
        IOUtils.Dispose(allInfos);
    }
}
public virtual void TestReadMappingRules()
{
    SlowSynonymMap synMap;

    // (a)->[b]
    IList<string> rules = new JCG.List<string>();
    rules.Add("a=>b");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);
    assertEquals(1, synMap.Submap.size());
    AssertTokIncludes(synMap, "a", "b");

    // (a)->[c]
    // (b)->[c]
    rules.Clear();
    rules.Add("a,b=>c");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);
    assertEquals(2, synMap.Submap.size());
    AssertTokIncludes(synMap, "a", "c");
    AssertTokIncludes(synMap, "b", "c");

    // (a)->[b][c]
    rules.Clear();
    rules.Add("a=>b,c");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);
    assertEquals(1, synMap.Submap.size());
    AssertTokIncludes(synMap, "a", "b");
    AssertTokIncludes(synMap, "a", "c");

    // (a)->(b)->[a2]
    //      [a1]
    rules.Clear();
    rules.Add("a=>a1");
    rules.Add("a b=>a2");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);
    assertEquals(1, synMap.Submap.size());
    AssertTokIncludes(synMap, "a", "a1");
    assertEquals(1, GetSubSynonymMap(synMap, "a").Submap.size());
    AssertTokIncludes(GetSubSynonymMap(synMap, "a"), "b", "a2");

    // (a)->(b)->[a2]
    //      (c)->[a3]
    //      [a1]
    rules.Clear();
    rules.Add("a=>a1");
    rules.Add("a b=>a2");
    rules.Add("a c=>a3");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);
    assertEquals(1, synMap.Submap.size());
    AssertTokIncludes(synMap, "a", "a1");
    assertEquals(2, GetSubSynonymMap(synMap, "a").Submap.size());
    AssertTokIncludes(GetSubSynonymMap(synMap, "a"), "b", "a2");
    AssertTokIncludes(GetSubSynonymMap(synMap, "a"), "c", "a3");

    // (a)->(b)->[a2]
    //      [a1]
    // (b)->(c)->[b2]
    //      [b1]
    rules.Clear();
    rules.Add("a=>a1");
    rules.Add("a b=>a2");
    rules.Add("b=>b1");
    rules.Add("b c=>b2");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);
    assertEquals(2, synMap.Submap.size());
    AssertTokIncludes(synMap, "a", "a1");
    assertEquals(1, GetSubSynonymMap(synMap, "a").Submap.size());
    AssertTokIncludes(GetSubSynonymMap(synMap, "a"), "b", "a2");
    AssertTokIncludes(synMap, "b", "b1");
    assertEquals(1, GetSubSynonymMap(synMap, "b").Submap.size());
    AssertTokIncludes(GetSubSynonymMap(synMap, "b"), "c", "b2");
}
public virtual void TestBasic() { string groupField = "author"; FieldType customType = new FieldType(); customType.IsStored = (true); Directory dir = NewDirectory(); RandomIndexWriter w = new RandomIndexWriter( Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetMergePolicy(NewLogMergePolicy())); bool canUseIDV = !"Lucene3x".Equals(w.IndexWriter.Config.Codec.Name, StringComparison.Ordinal); JCG.List <Document> documents = new JCG.List <Document>(); // 0 Document doc = new Document(); AddGroupField(doc, groupField, "author1", canUseIDV); doc.Add(new TextField("content", "random text", Field.Store.YES)); doc.Add(new Field("id", "1", customType)); documents.Add(doc); // 1 doc = new Document(); AddGroupField(doc, groupField, "author1", canUseIDV); doc.Add(new TextField("content", "some more random text", Field.Store.YES)); doc.Add(new Field("id", "2", customType)); documents.Add(doc); // 2 doc = new Document(); AddGroupField(doc, groupField, "author1", canUseIDV); doc.Add(new TextField("content", "some more random textual data", Field.Store.YES)); doc.Add(new Field("id", "3", customType)); doc.Add(new StringField("groupend", "x", Field.Store.NO)); documents.Add(doc); w.AddDocuments(documents); documents.Clear(); // 3 doc = new Document(); AddGroupField(doc, groupField, "author2", canUseIDV); doc.Add(new TextField("content", "some random text", Field.Store.YES)); doc.Add(new Field("id", "4", customType)); doc.Add(new StringField("groupend", "x", Field.Store.NO)); w.AddDocument(doc); // 4 doc = new Document(); AddGroupField(doc, groupField, "author3", canUseIDV); doc.Add(new TextField("content", "some more random text", Field.Store.YES)); doc.Add(new Field("id", "5", customType)); documents.Add(doc); // 5 doc = new Document(); AddGroupField(doc, groupField, "author3", canUseIDV); doc.Add(new TextField("content", "random", Field.Store.YES)); doc.Add(new Field("id", "6", customType)); doc.Add(new StringField("groupend", "x", Field.Store.NO)); documents.Add(doc); w.AddDocuments(documents); documents.Clear(); // 6 -- no author field doc = new Document(); doc.Add(new TextField("content", "random word stuck in alot of other text", Field.Store.YES)); doc.Add(new Field("id", "6", customType)); doc.Add(new StringField("groupend", "x", Field.Store.NO)); w.AddDocument(doc); IndexSearcher indexSearcher = NewSearcher(w.GetReader()); w.Dispose(); Sort groupSort = Sort.RELEVANCE; GroupingSearch groupingSearch = CreateRandomGroupingSearch(groupField, groupSort, 5, canUseIDV); ITopGroups <object> groups = groupingSearch.Search(indexSearcher, (Filter)null, new TermQuery(new Index.Term("content", "random")), 0, 10); assertEquals(7, groups.TotalHitCount); assertEquals(7, groups.TotalGroupedHitCount); assertEquals(4, groups.Groups.Length); // relevance order: 5, 0, 3, 4, 1, 2, 6 // the later a document is added the higher this docId // value IGroupDocs <object> group = groups.Groups[0]; CompareGroupValue("author3", group); assertEquals(2, group.ScoreDocs.Length); assertEquals(5, group.ScoreDocs[0].Doc); assertEquals(4, group.ScoreDocs[1].Doc); assertTrue(group.ScoreDocs[0].Score > group.ScoreDocs[1].Score); group = groups.Groups[1]; CompareGroupValue("author1", group); assertEquals(3, group.ScoreDocs.Length); assertEquals(0, group.ScoreDocs[0].Doc); assertEquals(1, group.ScoreDocs[1].Doc); assertEquals(2, group.ScoreDocs[2].Doc); assertTrue(group.ScoreDocs[0].Score > group.ScoreDocs[1].Score); assertTrue(group.ScoreDocs[1].Score > group.ScoreDocs[2].Score); group = groups.Groups[2]; 
CompareGroupValue("author2", group); assertEquals(1, group.ScoreDocs.Length); assertEquals(3, group.ScoreDocs[0].Doc); group = groups.Groups[3]; CompareGroupValue(null, group); assertEquals(1, group.ScoreDocs.Length); assertEquals(6, group.ScoreDocs[0].Doc); Filter lastDocInBlock = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Index.Term("groupend", "x")))); groupingSearch = new GroupingSearch(lastDocInBlock); groups = groupingSearch.Search(indexSearcher, null, new TermQuery(new Index.Term("content", "random")), 0, 10); assertEquals(7, groups.TotalHitCount); assertEquals(7, groups.TotalGroupedHitCount); assertEquals(4, groups.TotalGroupCount.GetValueOrDefault()); assertEquals(4, groups.Groups.Length); indexSearcher.IndexReader.Dispose(); dir.Dispose(); }
/// <summary> /// Dumps an <see cref="FST{T}"/> to a GraphViz's <c>dot</c> language description /// for visualization. Example of use: /// /// <code> /// using (TextWriter sw = new StreamWriter("out.dot")) /// { /// Util.ToDot(fst, sw, true, true); /// } /// </code> /// /// and then, from command line: /// /// <code> /// dot -Tpng -o out.png out.dot /// </code> /// /// <para/> /// Note: larger FSTs (a few thousand nodes) won't even /// render, don't bother. If the FST is > 2.1 GB in size /// then this method will throw strange exceptions. /// <para/> /// See also <a href="http://www.graphviz.org/">http://www.graphviz.org/</a>. /// </summary> /// <param name="sameRank"> /// If <c>true</c>, the resulting <c>dot</c> file will try /// to order states in layers of breadth-first traversal. This may /// mess up arcs, but makes the output FST's structure a bit clearer. /// </param> /// <param name="labelStates"> /// If <c>true</c> states will have labels equal to their offsets in their /// binary format. Expands the graph considerably. /// </param> public static void ToDot <T>(FST <T> fst, TextWriter @out, bool sameRank, bool labelStates) { const string expandedNodeColor = "blue"; // this is the start arc in the automaton (from the epsilon state to the first state // with outgoing transitions. FST.Arc <T> startArc = fst.GetFirstArc(new FST.Arc <T>()); // A queue of transitions to consider for the next level. IList <FST.Arc <T> > thisLevelQueue = new JCG.List <FST.Arc <T> >(); // A queue of transitions to consider when processing the next level. IList <FST.Arc <T> > nextLevelQueue = new JCG.List <FST.Arc <T> >(); nextLevelQueue.Add(startArc); //System.out.println("toDot: startArc: " + startArc); // A list of states on the same level (for ranking). IList <int?> sameLevelStates = new JCG.List <int?>(); // A bitset of already seen states (target offset). BitArray seen = new BitArray(32); seen.SafeSet((int)startArc.Target, true); // Shape for states. const string stateShape = "circle"; const string finalStateShape = "doublecircle"; // Emit DOT prologue. @out.Write("digraph FST {\n"); @out.Write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n"); if (!labelStates) { @out.Write(" node [shape=circle, width=.2, height=.2, style=filled]\n"); } EmitDotState(@out, "initial", "point", "white", ""); T NO_OUTPUT = fst.Outputs.NoOutput; var r = fst.GetBytesReader(); // final FST.Arc<T> scratchArc = new FST.Arc<>(); { string stateColor; if (fst.IsExpandedTarget(startArc, r)) { stateColor = expandedNodeColor; } else { stateColor = null; } bool isFinal; T finalOutput; if (startArc.IsFinal) { isFinal = true; finalOutput = startArc.NextFinalOutput.Equals(NO_OUTPUT) ? default(T) : startArc.NextFinalOutput; } else { isFinal = false; finalOutput = default(T); } EmitDotState(@out, Convert.ToString(startArc.Target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.Outputs.OutputToString(finalOutput)); } @out.Write(" initial -> " + startArc.Target + "\n"); int level = 0; while (nextLevelQueue.Count > 0) { // we could double buffer here, but it doesn't matter probably. 
//System.out.println("next level=" + level); thisLevelQueue.AddRange(nextLevelQueue); nextLevelQueue.Clear(); level++; @out.Write("\n // Transitions and states at level: " + level + "\n"); while (thisLevelQueue.Count > 0) { FST.Arc <T> arc = thisLevelQueue[thisLevelQueue.Count - 1]; thisLevelQueue.RemoveAt(thisLevelQueue.Count - 1); //System.out.println(" pop: " + arc); if (FST <T> .TargetHasArcs(arc)) { // scan all target arcs //System.out.println(" readFirstTarget..."); long node = arc.Target; fst.ReadFirstRealTargetArc(arc.Target, arc, r); //System.out.println(" firstTarget: " + arc); while (true) { //System.out.println(" cycle arc=" + arc); // Emit the unseen state and add it to the queue for the next level. if (arc.Target >= 0 && !seen.SafeGet((int)arc.Target)) { /* * boolean isFinal = false; * T finalOutput = null; * fst.readFirstTargetArc(arc, scratchArc); * if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) { * // target is final * isFinal = true; * finalOutput = scratchArc.output == NO_OUTPUT ? null : scratchArc.output; * System.out.println("dot hit final label=" + (char) scratchArc.label); * } */ string stateColor; if (fst.IsExpandedTarget(arc, r)) { stateColor = expandedNodeColor; } else { stateColor = null; } string finalOutput; if (arc.NextFinalOutput != null && !arc.NextFinalOutput.Equals(NO_OUTPUT)) { finalOutput = fst.Outputs.OutputToString(arc.NextFinalOutput); } else { finalOutput = ""; } EmitDotState(@out, Convert.ToString(arc.Target), stateShape, stateColor, finalOutput); // To see the node address, use this instead: //emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target)); seen.SafeSet((int)arc.Target, true); nextLevelQueue.Add((new FST.Arc <T>()).CopyFrom(arc)); sameLevelStates.Add((int)arc.Target); } string outs; if (!arc.Output.Equals(NO_OUTPUT)) { outs = "/" + fst.Outputs.OutputToString(arc.Output); } else { outs = ""; } if (!FST <T> .TargetHasArcs(arc) && arc.IsFinal && !arc.NextFinalOutput.Equals(NO_OUTPUT)) { // Tricky special case: sometimes, due to // pruning, the builder can [sillily] produce // an FST with an arc into the final end state // (-1) but also with a next final output; in // this case we pull that output up onto this // arc outs = outs + "/[" + fst.Outputs.OutputToString(arc.NextFinalOutput) + "]"; } string arcColor; if (arc.Flag(FST.BIT_TARGET_NEXT)) { arcColor = "red"; } else { arcColor = "black"; } Debug.Assert(arc.Label != FST.END_LABEL); @out.Write(" " + node + " -> " + arc.Target + " [label=\"" + PrintableLabel(arc.Label) + outs + "\"" + (arc.IsFinal ? " style=\"bold\"" : "") + " color=\"" + arcColor + "\"]\n"); // Break the loop if we're on the last arc of this state. if (arc.IsLast) { //System.out.println(" break"); break; } fst.ReadNextRealArc(arc, r); } } } // Emit state ranking information. if (sameRank && sameLevelStates.Count > 1) { @out.Write(" {rank=same; "); foreach (int state in sameLevelStates) { @out.Write(state + "; "); } @out.Write(" }\n"); } sameLevelStates.Clear(); } // Emit terminating state (always there anyway). @out.Write(" -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n"); @out.Write(" {rank=sink; -1 }\n"); @out.Write("}\n"); @out.Flush(); }
public override void Run() { // TODO: would be better if this were cross thread, so that we make sure one thread deleting anothers added docs works: IList <string> toDeleteIDs = new JCG.List <string>(); IList <SubDocs> toDeleteSubDocs = new JCG.List <SubDocs>(); while (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond < stopTime && !outerInstance.m_failed) // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results { try { // Occasional longish pause if running // nightly if (LuceneTestCase.TestNightly && Random.Next(6) == 3) { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": now long sleep"); } //Thread.Sleep(TestUtil.NextInt32(Random, 50, 500)); // LUCENENET specific - Reduced amount of pause to keep the total // Nightly test time under 1 hour Thread.Sleep(TestUtil.NextInt32(Random, 50, 250)); } // Rate limit ingest rate: if (Random.Next(7) == 5) { Thread.Sleep(TestUtil.NextInt32(Random, 1, 10)); if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": done sleep"); } } Document doc = docs.NextDoc(); if (doc is null) { break; } // Maybe add randomly named field string addedField; if (Random.NextBoolean()) { addedField = "extra" + Random.Next(40); doc.Add(NewTextField(addedField, "a random field", Field.Store.YES)); } else { addedField = null; } if (Random.NextBoolean()) { if (Random.NextBoolean()) { // Add/update doc block: string packID; SubDocs delSubDocs; if (toDeleteSubDocs.Count > 0 && Random.NextBoolean()) { delSubDocs = toDeleteSubDocs[Random.Next(toDeleteSubDocs.Count)]; if (Debugging.AssertsEnabled) { Debugging.Assert(!delSubDocs.Deleted); } toDeleteSubDocs.Remove(delSubDocs); // Update doc block, replacing prior packID packID = delSubDocs.PackID; } else { delSubDocs = null; // Add doc block, using new packID packID = outerInstance.m_packCount.GetAndIncrement().ToString(CultureInfo.InvariantCulture); } Field packIDField = NewStringField("packID", packID, Field.Store.YES); IList <string> docIDs = new JCG.List <string>(); SubDocs subDocs = new SubDocs(packID, docIDs); IList <Document> docsList = new JCG.List <Document>(); allSubDocs.Enqueue(subDocs); doc.Add(packIDField); docsList.Add(TestUtil.CloneDocument(doc)); docIDs.Add(doc.Get("docid")); int maxDocCount = TestUtil.NextInt32(Random, 1, 10); while (docsList.Count < maxDocCount) { doc = docs.NextDoc(); if (doc is null) { break; } docsList.Add(TestUtil.CloneDocument(doc)); docIDs.Add(doc.Get("docid")); } outerInstance.m_addCount.AddAndGet(docsList.Count); Term packIDTerm = new Term("packID", packID); if (delSubDocs != null) { delSubDocs.Deleted = true; delIDs.UnionWith(delSubDocs.SubIDs); outerInstance.m_delCount.AddAndGet(delSubDocs.SubIDs.Count); if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": update pack packID=" + delSubDocs.PackID + " count=" + docsList.Count + " docs=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", docIDs)); } outerInstance.UpdateDocuments(packIDTerm, docsList); } else { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": add pack packID=" + packID + " count=" + docsList.Count + " docs=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", docIDs)); } outerInstance.AddDocuments(packIDTerm, docsList); } doc.RemoveField("packID"); if (Random.Next(5) == 2) { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": buffer del id:" + packID); } toDeleteSubDocs.Add(subDocs); } } else { // Add single doc string docid = doc.Get("docid"); if (Verbose) { 
Console.WriteLine(Thread.CurrentThread.Name + ": add doc docid:" + docid); } outerInstance.AddDocument(new Term("docid", docid), doc); outerInstance.m_addCount.GetAndIncrement(); if (Random.Next(5) == 3) { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": buffer del id:" + doc.Get("docid")); } toDeleteIDs.Add(docid); } } } else { // Update single doc, but we never re-use // and ID so the delete will never // actually happen: if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": update doc id:" + doc.Get("docid")); } string docid = doc.Get("docid"); outerInstance.UpdateDocument(new Term("docid", docid), doc); outerInstance.m_addCount.GetAndIncrement(); if (Random.Next(5) == 3) { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": buffer del id:" + doc.Get("docid")); } toDeleteIDs.Add(docid); } } if (Random.Next(30) == 17) { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": apply " + toDeleteIDs.Count + " deletes"); } foreach (string id in toDeleteIDs) { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": del term=id:" + id); } outerInstance.DeleteDocuments(new Term("docid", id)); } int count = outerInstance.m_delCount.AddAndGet(toDeleteIDs.Count); if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": tot " + count + " deletes"); } delIDs.UnionWith(toDeleteIDs); toDeleteIDs.Clear(); foreach (SubDocs subDocs in toDeleteSubDocs) { if (Debugging.AssertsEnabled) { Debugging.Assert(!subDocs.Deleted); } delPackIDs.Add(subDocs.PackID); outerInstance.DeleteDocuments(new Term("packID", subDocs.PackID)); subDocs.Deleted = true; if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": del subs: " + subDocs.SubIDs + " packID=" + subDocs.PackID); } delIDs.UnionWith(subDocs.SubIDs); outerInstance.m_delCount.AddAndGet(subDocs.SubIDs.Count); } toDeleteSubDocs.Clear(); } if (addedField != null) { doc.RemoveField(addedField); } } catch (Exception t) when(t.IsThrowable()) { Console.WriteLine(Thread.CurrentThread.Name + ": hit exc"); Console.WriteLine(t.ToString()); Console.Write(t.StackTrace); outerInstance.m_failed.Value = (true); throw RuntimeException.Create(t); } } if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": indexing done"); } outerInstance.DoAfterIndexingThreadDone(); }
/// <summary> /// Minimizes the given automaton using Hopcroft's algorithm. /// </summary> public static void MinimizeHopcroft(Automaton a) { a.Determinize(); if (a.initial.numTransitions == 1) { Transition t = a.initial.TransitionsArray[0]; if (t.to == a.initial && t.min == Character.MinCodePoint && t.max == Character.MaxCodePoint) { return; } } a.Totalize(); // initialize data structures int[] sigma = a.GetStartPoints(); State[] states = a.GetNumberedStates(); int sigmaLen = sigma.Length, statesLen = states.Length; JCG.List <State>[,] reverse = new JCG.List <State> [statesLen, sigmaLen]; ISet <State>[] partition = new JCG.HashSet <State> [statesLen]; JCG.List <State>[] splitblock = new JCG.List <State> [statesLen]; int[] block = new int[statesLen]; StateList[,] active = new StateList[statesLen, sigmaLen]; StateListNode[,] active2 = new StateListNode[statesLen, sigmaLen]; Queue <Int32Pair> pending = new Queue <Int32Pair>(); // LUCENENET specific - Queue is much more performant than LinkedList OpenBitSet pending2 = new OpenBitSet(sigmaLen * statesLen); OpenBitSet split = new OpenBitSet(statesLen), refine = new OpenBitSet(statesLen), refine2 = new OpenBitSet(statesLen); for (int q = 0; q < statesLen; q++) { splitblock[q] = new JCG.List <State>(); partition[q] = new JCG.HashSet <State>(); for (int x = 0; x < sigmaLen; x++) { active[q, x] = new StateList(); } } // find initial partition and reverse edges for (int q = 0; q < statesLen; q++) { State qq = states[q]; int j = qq.accept ? 0 : 1; partition[j].Add(qq); block[q] = j; for (int x = 0; x < sigmaLen; x++) { //JCG.List<State>[] r = reverse[qq.Step(sigma[x]).number]; var r = qq.Step(sigma[x]).number; if (reverse[r, x] == null) { reverse[r, x] = new JCG.List <State>(); } reverse[r, x].Add(qq); } } // initialize active sets for (int j = 0; j <= 1; j++) { for (int x = 0; x < sigmaLen; x++) { foreach (State qq in partition[j]) { if (reverse[qq.number, x] != null) { active2[qq.number, x] = active[j, x].Add(qq); } } } } // initialize pending for (int x = 0; x < sigmaLen; x++) { int j = (active[0, x].Count <= active[1, x].Count) ? 
0 : 1; pending.Enqueue(new Int32Pair(j, x)); pending2.Set(x * statesLen + j); } // process pending until fixed point int k = 2; while (pending.Count > 0) { Int32Pair ip = pending.Dequeue(); int p = ip.n1; int x = ip.n2; pending2.Clear(x * statesLen + p); // find states that need to be split off their blocks for (StateListNode m = active[p, x].First; m != null; m = m.Next) { JCG.List <State> r = reverse[m.Q.number, x]; if (r != null) { foreach (State s in r) { int i = s.number; if (!split.Get(i)) { split.Set(i); int j = block[i]; splitblock[j].Add(s); if (!refine2.Get(j)) { refine2.Set(j); refine.Set(j); } } } } } // refine blocks for (int j = refine.NextSetBit(0); j >= 0; j = refine.NextSetBit(j + 1)) { JCG.List <State> sb = splitblock[j]; if (sb.Count < partition[j].Count) { ISet <State> b1 = partition[j]; ISet <State> b2 = partition[k]; foreach (State s in sb) { b1.Remove(s); b2.Add(s); block[s.number] = k; for (int c = 0; c < sigmaLen; c++) { StateListNode sn = active2[s.number, c]; if (sn != null && sn.Sl == active[j, c]) { sn.Remove(); active2[s.number, c] = active[k, c].Add(s); } } } // update pending for (int c = 0; c < sigmaLen; c++) { int aj = active[j, c].Count, ak = active[k, c].Count, ofs = c * statesLen; if (!pending2.Get(ofs + j) && 0 < aj && aj <= ak) { pending2.Set(ofs + j); pending.Enqueue(new Int32Pair(j, c)); } else { pending2.Set(ofs + k); pending.Enqueue(new Int32Pair(k, c)); } } k++; } refine2.Clear(j); foreach (State s in sb) { split.Clear(s.number); } sb.Clear(); } refine.Clear(0, refine.Length); } // make a new state for each equivalence class, set initial state State[] newstates = new State[k]; for (int n = 0; n < newstates.Length; n++) { State s = new State(); newstates[n] = s; foreach (State q in partition[n]) { if (q == a.initial) { a.initial = s; } s.accept = q.accept; s.number = q.number; // select representative q.number = n; } } // build transitions and set acceptance for (int n = 0; n < newstates.Length; n++) { State s = newstates[n]; s.accept = states[s.number].accept; foreach (Transition t in states[s.number].GetTransitions()) { s.AddTransition(new Transition(t.min, t.max, newstates[t.to.number])); } } a.ClearNumberedStates(); a.RemoveDeadTransitions(); }
public void TestNestedSorting() { Directory dir = NewDirectory(); RandomIndexWriter w = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetMergePolicy(NoMergePolicy.COMPOUND_FILES)); IList <Document> docs = new JCG.List <Document>(); Document document = new Document(); document.Add(new StringField("field2", "a", Field.Store.NO)); document.Add(new StringField("filter_1", "T", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("field2", "b", Field.Store.NO)); document.Add(new StringField("filter_1", "T", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("field2", "c", Field.Store.NO)); document.Add(new StringField("filter_1", "T", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("__type", "parent", Field.Store.NO)); document.Add(new StringField("field1", "a", Field.Store.NO)); docs.Add(document); w.AddDocuments(docs); w.Commit(); docs.Clear(); document = new Document(); document.Add(new StringField("field2", "c", Field.Store.NO)); document.Add(new StringField("filter_1", "T", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("field2", "d", Field.Store.NO)); document.Add(new StringField("filter_1", "T", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("field2", "e", Field.Store.NO)); document.Add(new StringField("filter_1", "T", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("__type", "parent", Field.Store.NO)); document.Add(new StringField("field1", "b", Field.Store.NO)); docs.Add(document); w.AddDocuments(docs); docs.Clear(); document = new Document(); document.Add(new StringField("field2", "e", Field.Store.NO)); document.Add(new StringField("filter_1", "T", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("field2", "f", Field.Store.NO)); document.Add(new StringField("filter_1", "T", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("field2", "g", Field.Store.NO)); document.Add(new StringField("filter_1", "T", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("__type", "parent", Field.Store.NO)); document.Add(new StringField("field1", "c", Field.Store.NO)); docs.Add(document); w.AddDocuments(docs); docs.Clear(); document = new Document(); document.Add(new StringField("field2", "g", Field.Store.NO)); document.Add(new StringField("filter_1", "T", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("field2", "h", Field.Store.NO)); document.Add(new StringField("filter_1", "F", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("field2", "i", Field.Store.NO)); document.Add(new StringField("filter_1", "F", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("__type", "parent", Field.Store.NO)); document.Add(new StringField("field1", "d", Field.Store.NO)); docs.Add(document); w.AddDocuments(docs); w.Commit(); docs.Clear(); document = new Document(); document.Add(new StringField("field2", "i", Field.Store.NO)); document.Add(new StringField("filter_1", "F", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("field2", "j", Field.Store.NO)); document.Add(new StringField("filter_1", "F", 
Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("field2", "k", Field.Store.NO)); document.Add(new StringField("filter_1", "F", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("__type", "parent", Field.Store.NO)); document.Add(new StringField("field1", "f", Field.Store.NO)); docs.Add(document); w.AddDocuments(docs); docs.Clear(); document = new Document(); document.Add(new StringField("field2", "k", Field.Store.NO)); document.Add(new StringField("filter_1", "T", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("field2", "l", Field.Store.NO)); document.Add(new StringField("filter_1", "T", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("field2", "m", Field.Store.NO)); document.Add(new StringField("filter_1", "T", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("__type", "parent", Field.Store.NO)); document.Add(new StringField("field1", "g", Field.Store.NO)); docs.Add(document); w.AddDocuments(docs); // This doc will not be included, because it doesn't have nested docs document = new Document(); document.Add(new StringField("__type", "parent", Field.Store.NO)); document.Add(new StringField("field1", "h", Field.Store.NO)); w.AddDocument(document); docs.Clear(); document = new Document(); document.Add(new StringField("field2", "m", Field.Store.NO)); document.Add(new StringField("filter_1", "T", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("field2", "n", Field.Store.NO)); document.Add(new StringField("filter_1", "F", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("field2", "o", Field.Store.NO)); document.Add(new StringField("filter_1", "F", Field.Store.NO)); docs.Add(document); document = new Document(); document.Add(new StringField("__type", "parent", Field.Store.NO)); document.Add(new StringField("field1", "i", Field.Store.NO)); docs.Add(document); w.AddDocuments(docs); w.Commit(); // Some garbage docs, just to check if the NestedFieldComparer can deal with this. 
document = new Document(); document.Add(new StringField("fieldXXX", "x", Field.Store.NO)); w.AddDocument(document); document = new Document(); document.Add(new StringField("fieldXXX", "x", Field.Store.NO)); w.AddDocument(document); document = new Document(); document.Add(new StringField("fieldXXX", "x", Field.Store.NO)); w.AddDocument(document); IndexSearcher searcher = new IndexSearcher(DirectoryReader.Open(w.IndexWriter, false)); w.Dispose(); Filter parentFilter = new QueryWrapperFilter(new TermQuery(new Term("__type", "parent"))); Filter childFilter = new QueryWrapperFilter(new PrefixQuery(new Term("field2"))); ToParentBlockJoinQuery query = new ToParentBlockJoinQuery(new FilteredQuery(new MatchAllDocsQuery(), childFilter), new FixedBitSetCachingWrapperFilter(parentFilter), ScoreMode.None); // Sort by field ascending, order first ToParentBlockJoinSortField sortField = new ToParentBlockJoinSortField("field2", SortFieldType.STRING, false, Wrap(parentFilter), Wrap(childFilter)); Sort sort = new Sort(sortField); TopFieldDocs topDocs = searcher.Search(query, 5, sort); assertEquals(7, topDocs.TotalHits); assertEquals(5, topDocs.ScoreDocs.Length); assertEquals(3, topDocs.ScoreDocs[0].Doc); assertEquals("a", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[0]).Fields[0]).Utf8ToString()); assertEquals(7, topDocs.ScoreDocs[1].Doc); assertEquals("c", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[1]).Fields[0]).Utf8ToString()); assertEquals(11, topDocs.ScoreDocs[2].Doc); assertEquals("e", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[2]).Fields[0]).Utf8ToString()); assertEquals(15, topDocs.ScoreDocs[3].Doc); assertEquals("g", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[3]).Fields[0]).Utf8ToString()); assertEquals(19, topDocs.ScoreDocs[4].Doc); assertEquals("i", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[4]).Fields[0]).Utf8ToString()); // Sort by field ascending, order last sortField = new ToParentBlockJoinSortField("field2", SortFieldType.STRING, false, true, Wrap(parentFilter), Wrap(childFilter)); sort = new Sort(sortField); topDocs = searcher.Search(query, 5, sort); assertEquals(7, topDocs.TotalHits); assertEquals(5, topDocs.ScoreDocs.Length); assertEquals(3, topDocs.ScoreDocs[0].Doc); assertEquals("c", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[0]).Fields[0]).Utf8ToString()); assertEquals(7, topDocs.ScoreDocs[1].Doc); assertEquals("e", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[1]).Fields[0]).Utf8ToString()); assertEquals(11, topDocs.ScoreDocs[2].Doc); assertEquals("g", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[2]).Fields[0]).Utf8ToString()); assertEquals(15, topDocs.ScoreDocs[3].Doc); assertEquals("i", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[3]).Fields[0]).Utf8ToString()); assertEquals(19, topDocs.ScoreDocs[4].Doc); assertEquals("k", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[4]).Fields[0]).Utf8ToString()); // Sort by field descending, order last sortField = new ToParentBlockJoinSortField("field2", SortFieldType.STRING, true, Wrap(parentFilter), Wrap(childFilter)); sort = new Sort(sortField); topDocs = searcher.Search(query, 5, sort); assertEquals(topDocs.TotalHits, 7); assertEquals(5, topDocs.ScoreDocs.Length); assertEquals(28, topDocs.ScoreDocs[0].Doc); assertEquals("o", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[0]).Fields[0]).Utf8ToString()); assertEquals(23, topDocs.ScoreDocs[1].Doc); assertEquals("m", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[1]).Fields[0]).Utf8ToString()); assertEquals(19, topDocs.ScoreDocs[2].Doc); assertEquals("k", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[2]).Fields[0]).Utf8ToString()); assertEquals(15, 
topDocs.ScoreDocs[3].Doc); assertEquals("i", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[3]).Fields[0]).Utf8ToString()); assertEquals(11, topDocs.ScoreDocs[4].Doc); assertEquals("g", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[4]).Fields[0]).Utf8ToString()); // Sort by field descending, order last, sort filter (filter_1:T) childFilter = new QueryWrapperFilter(new TermQuery((new Term("filter_1", "T")))); query = new ToParentBlockJoinQuery( new FilteredQuery(new MatchAllDocsQuery(), childFilter), new FixedBitSetCachingWrapperFilter(parentFilter), ScoreMode.None); sortField = new ToParentBlockJoinSortField("field2", SortFieldType.STRING, true, Wrap(parentFilter), Wrap(childFilter)); sort = new Sort(sortField); topDocs = searcher.Search(query, 5, sort); assertEquals(6, topDocs.TotalHits); assertEquals(5, topDocs.ScoreDocs.Length); assertEquals(23, topDocs.ScoreDocs[0].Doc); assertEquals("m", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[0]).Fields[0]).Utf8ToString()); assertEquals(28, topDocs.ScoreDocs[1].Doc); assertEquals("m", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[1]).Fields[0]).Utf8ToString()); assertEquals(11, topDocs.ScoreDocs[2].Doc); assertEquals("g", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[2]).Fields[0]).Utf8ToString()); assertEquals(15, topDocs.ScoreDocs[3].Doc); assertEquals("g", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[3]).Fields[0]).Utf8ToString()); assertEquals(7, topDocs.ScoreDocs[4].Doc); assertEquals("e", ((BytesRef)((FieldDoc)topDocs.ScoreDocs[4]).Fields[0]).Utf8ToString()); searcher.IndexReader.Dispose(); dir.Dispose(); }