protected override Query[] PrepareQueries()
{
    // analyzer (default is standard analyzer)
    Analyzer anlzr = NewAnalyzerTask.CreateAnalyzer(m_config.Get("analyzer",
        typeof(Lucene.Net.Analysis.Standard.StandardAnalyzer).AssemblyQualifiedName));

    JCG.List<object> queryList = new JCG.List<object>(20);
    queryList.AddRange(STANDARD_QUERIES);
    queryList.AddRange(GetPrebuiltQueries(DocMaker.BODY_FIELD));
    return CreateQueries(queryList, anlzr);
}
protected override Query[] PrepareQueries()
{
    // analyzer (default is standard analyzer)
    Analyzer anlzr = NewAnalyzerTask.CreateAnalyzer(m_config.Get("analyzer",
        typeof(StandardAnalyzer).AssemblyQualifiedName));

    JCG.List<object> queryList = new JCG.List<object>(20);
    queryList.AddRange(STANDARD_QUERIES);
    if (!m_config.Get("enwikiQueryMaker.disableSpanQueries", false))
    {
        queryList.AddRange(GetPrebuiltQueries(DocMaker.BODY_FIELD));
    }
    return CreateQueries(queryList, anlzr);
}
public virtual void TestNextAllTerms()
{
    IList<string> termsList = new JCG.List<string>(commonTerms.Length + mediumTerms.Length + rareTerms.Length);
    termsList.AddRange(commonTerms);
    termsList.AddRange(mediumTerms);
    termsList.AddRange(rareTerms);
    string[] terms = termsList.ToArray();
    for (int minNrShouldMatch = 1; minNrShouldMatch <= terms.Length; minNrShouldMatch++)
    {
        Scorer expected = Scorer(terms, minNrShouldMatch, true);
        Scorer actual = Scorer(terms, minNrShouldMatch, false);
        AssertNext(expected, actual);
    }
}
private static IList<FacetField> RandomCategories(Random random)
{
    // add random categories from the two dimensions, ensuring that the same
    // category is not added twice.
    int numFacetsA = random.Next(3) + 1; // 1-3
    int numFacetsB = random.Next(2) + 1; // 1-2
    JCG.List<FacetField> categories_a = new JCG.List<FacetField>();
    categories_a.AddRange(CATEGORIES_A);
    JCG.List<FacetField> categories_b = new JCG.List<FacetField>();
    categories_b.AddRange(CATEGORIES_B);
    categories_a.Shuffle(Random);
    categories_b.Shuffle(Random);

    List<FacetField> categories = new List<FacetField>();
    categories.AddRange(categories_a.GetView(0, numFacetsA)); // LUCENENET: Checked length for correctness
    categories.AddRange(categories_b.GetView(0, numFacetsB)); // LUCENENET: Checked length for correctness

    // add the NO_PARENT categories
    categories.Add(CATEGORIES_C[Util.LuceneTestCase.Random.Next(NUM_CHILDREN_CP_C)]);
    categories.Add(CATEGORIES_D[Util.LuceneTestCase.Random.Next(NUM_CHILDREN_CP_D)]);
    return categories;
}
// TODO: this should use inputstreams from the loader, not File!
public virtual void Inform(IResourceLoader loader)
{
    if (mapping != null)
    {
        IList<string> wlist;
        if (File.Exists(mapping))
        {
            wlist = new JCG.List<string>(GetLines(loader, mapping));
        }
        else
        {
            var files = SplitFileNames(mapping);
            wlist = new JCG.List<string>();
            foreach (string file in files)
            {
                var lines = GetLines(loader, file.Trim());
                wlist.AddRange(lines);
            }
        }
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        ParseRules(wlist, builder);
        m_normMap = builder.Build();
        if (m_normMap.map == null)
        {
            // if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
            // so just set the whole map to null
            m_normMap = null;
        }
    }
}
/// <summary>
/// LUCENENET specific
/// Is non-static because NewIndexWriterConfig is no longer static.
/// </summary>
public void IndexSerial(Random random, IDictionary<string, Document> docs, Directory dir)
{
    IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(random, TEST_VERSION_CURRENT,
        new MockAnalyzer(random)).SetMergePolicy(NewLogMergePolicy()));

    // index all docs in a single thread
    IEnumerator<Document> iter = docs.Values.GetEnumerator();
    while (iter.MoveNext())
    {
        Document d = iter.Current;
        IList<IIndexableField> fields = new JCG.List<IIndexableField>();
        fields.AddRange(d.Fields);
        // put fields in same order each time
        fields.Sort(fieldNameComparer);

        Document d1 = new Document();
        for (int i = 0; i < fields.Count; i++)
        {
            d1.Add(fields[i]);
        }
        w.AddDocument(d1);
        // System.out.println("indexing "+d1);
    }
    w.Dispose();
}
/// <summary>
/// Merging constructor. Note that this just grabs seqnum from the first info.
/// </summary>
public WeightedPhraseInfo(ICollection<WeightedPhraseInfo> toMerge)
{
    IEnumerator<Toffs>[] allToffs = new IEnumerator<Toffs>[toMerge.Count];
    try
    {
        // Pretty much the same idea as merging FieldPhraseLists:
        // Step 1. Sort by startOffset, endOffset
        //         While we are here merge the boosts and termInfos
        using IEnumerator<WeightedPhraseInfo> toMergeItr = toMerge.GetEnumerator();
        if (!toMergeItr.MoveNext())
        {
            throw new ArgumentException("toMerge must contain at least one WeightedPhraseInfo.");
        }
        WeightedPhraseInfo first = toMergeItr.Current;

        termsInfos = new JCG.List<TermInfo>();
        seqnum = first.seqnum;
        boost = first.boost;
        allToffs[0] = first.termsOffsets.GetEnumerator();
        int index = 1;
        while (toMergeItr.MoveNext())
        {
            WeightedPhraseInfo info = toMergeItr.Current;
            boost += info.boost;
            termsInfos.AddRange(info.termsInfos);
            allToffs[index++] = info.termsOffsets.GetEnumerator();
        }

        // Step 2. Walk the sorted list merging overlaps
        using MergedEnumerator<Toffs> itr = new MergedEnumerator<Toffs>(false, allToffs);
        termsOffsets = new JCG.List<Toffs>();
        if (!itr.MoveNext())
        {
            return;
        }
        Toffs work = itr.Current;
        while (itr.MoveNext())
        {
            Toffs current = itr.Current;
            if (current.StartOffset <= work.EndOffset)
            {
                work.EndOffset = Math.Max(work.EndOffset, current.EndOffset);
            }
            else
            {
                termsOffsets.Add(work);
                work = current;
            }
        }
        termsOffsets.Add(work);
    }
    finally
    {
        IOUtils.Dispose(allToffs);
    }
}
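Step 2 above is the usual coalescing pass over intervals that already arrive sorted by start offset: any interval that begins at or before the end of the current working interval is folded into it, otherwise the working interval is emitted and a new one started. A minimal stand-alone sketch of that idea, independent of the highlighter types (the Interval type and MergeSorted helper are illustrative, not part of Lucene.NET):

using System;
using System.Collections.Generic;

// Illustrative only: a tiny interval type standing in for Toffs.
public readonly record struct Interval(int Start, int End);

public static class IntervalMerge
{
    // Assumes the input is already sorted by Start (as MergedEnumerator guarantees above).
    public static List<Interval> MergeSorted(IEnumerable<Interval> sorted)
    {
        var result = new List<Interval>();
        Interval? work = null;
        foreach (Interval current in sorted)
        {
            if (work is null)
            {
                work = current;
            }
            else if (current.Start <= work.Value.End)
            {
                // Overlap (or touch): extend the working interval.
                work = new Interval(work.Value.Start, Math.Max(work.Value.End, current.End));
            }
            else
            {
                result.Add(work.Value);
                work = current;
            }
        }
        if (work is not null)
        {
            result.Add(work.Value);
        }
        return result;
    }
}

// MergeSorted(new[] { new Interval(1, 4), new Interval(3, 6), new Interval(8, 9) })
// yields [1, 6] and [8, 9].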
public override void BeforeClass()
{
    base.BeforeClass();

    assertFalse("test infra is broken!", OldFormatImpersonationIsActive);
    JCG.List<string> names = new JCG.List<string>(oldNames.Length + oldSingleSegmentNames.Length);
    names.AddRange(oldNames);
    names.AddRange(oldSingleSegmentNames);
    oldIndexDirs = new Dictionary<string, Directory>();
    foreach (string name in names)
    {
        DirectoryInfo dir = CreateTempDir(name);
        using (Stream zipFileStream = this.GetType().FindAndGetManifestResourceStream("index." + name + ".zip"))
        {
            TestUtil.Unzip(zipFileStream, dir);
        }
        oldIndexDirs[name] = NewFSDirectory(dir);
    }
}
public virtual void TestNextVaryingNumberOfTerms()
{
    IList<string> termsList = new JCG.List<string>(commonTerms.Length + mediumTerms.Length + rareTerms.Length);
    termsList.AddRange(commonTerms);
    termsList.AddRange(mediumTerms);
    termsList.AddRange(rareTerms);
    termsList.Shuffle(Random);
    for (int numTerms = 2; numTerms <= termsList.Count; numTerms++)
    {
        string[] terms = termsList.GetView(0, numTerms).ToArray(/*new string[0]*/); // LUCENENET: Checked length of GetView() for correctness
        for (int minNrShouldMatch = 1; minNrShouldMatch <= terms.Length; minNrShouldMatch++)
        {
            Scorer expected = Scorer(terms, minNrShouldMatch, true);
            Scorer actual = Scorer(terms, minNrShouldMatch, false);
            AssertNext(expected, actual);
        }
    }
}
public virtual void TestUpgradeOldIndex()
{
    JCG.List<string> names = new JCG.List<string>(oldNames.Length + oldSingleSegmentNames.Length);
    names.AddRange(oldNames);
    names.AddRange(oldSingleSegmentNames);
    foreach (string name in names)
    {
        if (Verbose)
        {
            Console.WriteLine("testUpgradeOldIndex: index=" + name);
        }
        Directory dir = NewDirectory(oldIndexDirs[name]);
        (new IndexUpgrader(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, null), false)).Upgrade();
        CheckAllSegmentsUpgraded(dir);
        dir.Dispose();
    }
}
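The test drives IndexUpgrader against each legacy index through the test harness. Outside the harness, the same one-shot upgrade call looks roughly like the sketch below; the path and analyzer choice are made up (the test passes a null analyzer), so treat this as an assumption-laden sketch rather than canonical usage:

using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Util;

// Hypothetical path; upgrades all segments in place to the current index format.
using Directory dir = FSDirectory.Open("/path/to/index");
var config = new IndexWriterConfig(LuceneVersion.LUCENE_48, new StandardAnalyzer(LuceneVersion.LUCENE_48));
new IndexUpgrader(dir, config, false).Upgrade(); // false: keep prior commits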
/// <summary>
/// Gets rules for a combination of name type, rule type and languages.
/// </summary>
/// <param name="nameType">The <see cref="NameType"/> to consider.</param>
/// <param name="rt">The <see cref="RuleType"/> to consider.</param>
/// <param name="langs">The set of languages to consider.</param>
/// <returns>A list of <see cref="Rule"/>s that apply.</returns>
public static IList<Rule> GetInstance(NameType nameType, RuleType rt, LanguageSet langs)
{
    IDictionary<string, IList<Rule>> ruleMap = GetInstanceMap(nameType, rt, langs);
    IList<Rule> allRules = new JCG.List<Rule>();
    foreach (IList<Rule> rules in ruleMap.Values)
    {
        allRules.AddRange(rules);
    }
    return allRules;
}
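GetInstance simply flattens the per-pattern rule map into one list; the same shape can be written with LINQ. A hedged equivalent on plain collections (RulesByPattern is a hypothetical stand-in for what GetInstanceMap returns, not the real Beider-Morse data):

using System.Collections.Generic;
using System.Linq;

public static class RuleFlattening
{
    // Hypothetical stand-in for GetInstanceMap(nameType, rt, langs).
    public static IDictionary<string, IList<string>> RulesByPattern { get; } =
        new Dictionary<string, IList<string>>
        {
            ["a"] = new List<string> { "rule-a1", "rule-a2" },
            ["b"] = new List<string> { "rule-b1" },
        };

    // Flatten the map's values into a single list, like GetInstance above.
    public static IList<string> AllRules() =>
        RulesByPattern.Values.SelectMany(rules => rules).ToList();
}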
private IList<Document> CreateDocsForSegment(int segmentNumber)
{
    IList<IList<Document>> blocks = new JCG.List<IList<Document>>(AMOUNT_OF_PARENT_DOCS);
    for (int i = 0; i < AMOUNT_OF_PARENT_DOCS; i++)
    {
        blocks.Add(CreateParentDocWithChildren(segmentNumber, i));
    }
    IList<Document> result = new JCG.List<Document>(AMOUNT_OF_DOCS_IN_SEGMENT);
    foreach (IList<Document> block in blocks)
    {
        result.AddRange(block);
    }
    return result;
}
private IList<CharsRef> DoStem(char[] word, int length, bool caseVariant)
{
    JCG.List<CharsRef> stems = new JCG.List<CharsRef>();
    Int32sRef forms = dictionary.LookupWord(word, 0, length);
    if (forms != null)
    {
        for (int i = 0; i < forms.Length; i += formStep)
        {
            bool checkKeepCase = caseVariant && dictionary.keepcase != -1;
            bool checkNeedAffix = dictionary.needaffix != -1;
            bool checkOnlyInCompound = dictionary.onlyincompound != -1;
            if (checkKeepCase || checkNeedAffix || checkOnlyInCompound)
            {
                dictionary.flagLookup.Get(forms.Int32s[forms.Offset + i], scratch);
                char[] wordFlags = Dictionary.DecodeFlags(scratch);
                // we are looking for a case variant, but this word does not allow it
                if (checkKeepCase && Dictionary.HasFlag(wordFlags, (char)dictionary.keepcase))
                {
                    continue;
                }
                // we can't add this form, it's a pseudostem requiring an affix
                if (checkNeedAffix && Dictionary.HasFlag(wordFlags, (char)dictionary.needaffix))
                {
                    continue;
                }
                // we can't add this form, it only belongs inside a compound word
                if (checkOnlyInCompound && Dictionary.HasFlag(wordFlags, (char)dictionary.onlyincompound))
                {
                    continue;
                }
            }
            stems.Add(NewStem(word, length, forms, i));
        }
    }
    try
    {
        stems.AddRange(Stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
    }
    catch (Exception bogus) when (bogus.IsIOException())
    {
        throw RuntimeException.Create(bogus);
    }
    return stems;
}
public virtual void Inform(IResourceLoader loader)
{
    if (wordFiles != null)
    {
        protectedWords = GetWordSet(loader, wordFiles, false);
    }
    if (types != null)
    {
        IList<string> files = SplitFileNames(types);
        IList<string> wlist = new JCG.List<string>();
        foreach (string file in files)
        {
            IList<string> lines = GetLines(loader, file.Trim());
            wlist.AddRange(lines);
        }
        typeTable = ParseTypes(wlist);
    }
}
/// <returns> a list of all rules </returns>
private IEnumerable<string> LoadRules(string synonyms, IResourceLoader loader)
{
    JCG.List<string> wlist = null;
    if (File.Exists(synonyms))
    {
        wlist = new JCG.List<string>(GetLines(loader, synonyms));
    }
    else
    {
        IList<string> files = SplitFileNames(synonyms);
        wlist = new JCG.List<string>();
        foreach (string file in files)
        {
            IList<string> lines = GetLines(loader, file.Trim());
            wlist.AddRange(lines);
        }
    }
    return wlist;
}
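The Inform/LoadRules variants above all share one fallback: if the configured value names an existing file it is read directly, otherwise it is treated as a comma-separated list of resource names whose lines are concatenated. A minimal sketch of that pattern in isolation (ReadLines is a hypothetical callback standing in for GetLines(loader, name), not the Lucene.NET API):

using System;
using System.Collections.Generic;
using System.IO;

public static class RuleFileLoading
{
    public static List<string> LoadAllLines(string value, Func<string, IEnumerable<string>> readLines)
    {
        var lines = new List<string>();
        if (File.Exists(value))
        {
            // Single existing file: read it as-is.
            lines.AddRange(readLines(value));
        }
        else
        {
            // Comma-separated list of resource names, e.g. "stop1.txt, stop2.txt".
            foreach (string name in value.Split(','))
            {
                lines.AddRange(readLines(name.Trim()));
            }
        }
        return lines;
    }
}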
private Scorer MakeCountingSumScorerSomeReq(/* bool disableCoord // LUCENENET: Not Referenced */) // At least one required scorer.
{
    if (optionalScorers.Count == minNrShouldMatch) // all optional scorers also required.
    {
        JCG.List<Scorer> allReq = new JCG.List<Scorer>(requiredScorers);
        allReq.AddRange(optionalScorers);
        return AddProhibitedScorers(CountingConjunctionSumScorer(/* disableCoord, // LUCENENET: Not Referenced */ allReq));
    }
    // optionalScorers.size() > minNrShouldMatch, and at least one required scorer
    else
    {
        Scorer requiredCountingSumScorer = requiredScorers.Count == 1
            ? new SingleMatchScorer(this, requiredScorers[0])
            : CountingConjunctionSumScorer(/* disableCoord, // LUCENENET: Not Referenced */ requiredScorers);
        if (minNrShouldMatch > 0) // use a required disjunction scorer over the optional scorers
        {
            return AddProhibitedScorers(DualConjunctionSumScorer(/* disableCoord, // LUCENENET: Not Referenced */
                requiredCountingSumScorer,
                CountingDisjunctionSumScorer(optionalScorers, minNrShouldMatch))); // non counting
        }
        // minNrShouldMatch == 0
        else
        {
            return new ReqOptSumScorer(AddProhibitedScorers(requiredCountingSumScorer),
                optionalScorers.Count == 1
                    ? new SingleMatchScorer(this, optionalScorers[0]) // require 1 in combined, optional scorer.
                    : CountingDisjunctionSumScorer(optionalScorers, 1));
        }
    }
}
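The branches above are the scorer-side counterpart of a caller combining required, optional, and prohibited clauses and setting a minimum number of optional matches on a BooleanQuery. A hedged usage sketch against the public Lucene.NET 4.8 API (field and term names are made up; only the query shape matters):

using Lucene.Net.Index;
using Lucene.Net.Search;

var query = new BooleanQuery();
query.Add(new TermQuery(new Term("body", "lucene")), Occur.MUST);         // required
query.Add(new TermQuery(new Term("body", "search")), Occur.SHOULD);       // optional
query.Add(new TermQuery(new Term("body", "index")), Occur.SHOULD);        // optional
query.Add(new TermQuery(new Term("body", "deprecated")), Occur.MUST_NOT); // prohibited

// Require at least one of the SHOULD clauses to match in addition to the MUST clause.
query.MinimumNumberShouldMatch = 1;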
/// <summary> /// Perform the actual DM Soundex algorithm on the input string. /// </summary> /// <param name="source">A string to encode.</param> /// <param name="branching">If branching shall be performed.</param> /// <returns>A string array containing all DM Soundex codes corresponding to the string supplied depending on the selected branching mode.</returns> /// <exception cref="ArgumentException">If a character is not mapped.</exception> private string[] GetSoundex(string source, bool branching) { if (source == null) { return(null); } string input = Cleanup(source); // LinkedHashSet preserves input order. In .NET we can use List for that purpose. IList <Branch> currentBranches = new JCG.List <Branch> { new Branch() }; char lastChar = '\0'; for (int index = 0; index < input.Length; index++) { char ch = input[index]; // ignore whitespace inside a name if (char.IsWhiteSpace(ch)) { continue; } string inputContext = input.Substring(index); if (!RULES.TryGetValue(ch, out IList <Rule> rules) || rules == null) { continue; } // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access IList <Branch> nextBranches = branching ? new JCG.List <Branch>() : Collections.EmptyList <Branch>() as IList <Branch>; foreach (Rule rule in rules) { if (rule.Matches(inputContext)) { if (branching) { nextBranches.Clear(); } string[] replacements = rule.GetReplacements(inputContext, lastChar == '\0'); bool branchingRequired = replacements.Length > 1 && branching; foreach (Branch branch in currentBranches) { foreach (string nextReplacement in replacements) { // if we have multiple replacements, always create a new branch Branch nextBranch = branchingRequired ? branch.CreateBranch() : branch; // special rule: occurrences of mn or nm are treated differently bool force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm'); nextBranch.ProcessNextReplacement(nextReplacement, force); if (branching) { if (!nextBranches.Contains(nextBranch)) { nextBranches.Add(nextBranch); } } else { break; } } } if (branching) { currentBranches.Clear(); currentBranches.AddRange(nextBranches); } index += rule.PatternLength - 1; break; } } lastChar = ch; } string[] result = new string[currentBranches.Count]; int idx = 0; foreach (Branch branch in currentBranches) { branch.Finish(); result[idx++] = branch.ToString(); } return(result); }
/// <summary> /// Applies the affix rule to the given word, producing a list of stems if any are found /// </summary> /// <param name="strippedWord"> Word the affix has been removed and the strip added </param> /// <param name="length"> valid length of stripped word </param> /// <param name="affix"> HunspellAffix representing the affix rule itself </param> /// <param name="prefixFlag"> when we already stripped a prefix, we cant simply recurse and check the suffix, unless both are compatible /// so we must check dictionary form against both to add it as a stem! </param> /// <param name="recursionDepth"> current recursion depth </param> /// <param name="prefix"> true if we are removing a prefix (false if its a suffix) </param> /// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix /// this means inner most suffix must also contain circumfix flag. </param> /// <param name="caseVariant"> true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed. </param> /// <returns> <see cref="IList{CharsRef}"/> of stems for the word, or an empty list if none are found </returns> internal IList <CharsRef> ApplyAffix(char[] strippedWord, int length, int affix, int prefixFlag, int recursionDepth, bool prefix, bool circumfix, bool caseVariant) { // TODO: just pass this in from before, no need to decode it twice affixReader.Position = 8 * affix; char flag = (char)(affixReader.ReadInt16() & 0xffff); affixReader.SkipBytes(2); // strip int condition = (char)(affixReader.ReadInt16() & 0xffff); bool crossProduct = (condition & 1) == 1; condition = condition.TripleShift(1); char append = (char)(affixReader.ReadInt16() & 0xffff); JCG.List <CharsRef> stems = new JCG.List <CharsRef>(); Int32sRef forms = dictionary.LookupWord(strippedWord, 0, length); if (forms != null) { for (int i = 0; i < forms.Length; i += formStep) { dictionary.flagLookup.Get(forms.Int32s[forms.Offset + i], scratch); char[] wordFlags = Dictionary.DecodeFlags(scratch); if (Dictionary.HasFlag(wordFlags, flag)) { // confusing: in this one exception, we already chained the first prefix against the second, // so it doesnt need to be checked against the word bool chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix; if (chainedPrefix == false && prefixFlag >= 0 && !Dictionary.HasFlag(wordFlags, (char)prefixFlag)) { // see if we can chain prefix thru the suffix continuation class (only if it has any!) 
dictionary.flagLookup.Get(append, scratch); char[] appendFlags = Dictionary.DecodeFlags(scratch); if (!HasCrossCheckedFlag((char)prefixFlag, appendFlags, false)) { continue; } } // if circumfix was previously set by a prefix, we must check this suffix, // to ensure it has it, and vice versa if (dictionary.circumfix != -1) { dictionary.flagLookup.Get(append, scratch); char[] appendFlags = Dictionary.DecodeFlags(scratch); bool suffixCircumfix = Dictionary.HasFlag(appendFlags, (char)dictionary.circumfix); if (circumfix != suffixCircumfix) { continue; } } // we are looking for a case variant, but this word does not allow it if (caseVariant && dictionary.keepcase != -1 && Dictionary.HasFlag(wordFlags, (char)dictionary.keepcase)) { continue; } // we aren't decompounding (yet) if (dictionary.onlyincompound != -1 && Dictionary.HasFlag(wordFlags, (char)dictionary.onlyincompound)) { continue; } stems.Add(NewStem(strippedWord, length, forms, i)); } } } // if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag if (dictionary.circumfix != -1 && !circumfix && prefix) { dictionary.flagLookup.Get(append, scratch); char[] appendFlags = Dictionary.DecodeFlags(scratch); circumfix = Dictionary.HasFlag(appendFlags, (char)dictionary.circumfix); } if (crossProduct) { if (recursionDepth == 0) { if (prefix) { // we took away the first prefix. // COMPLEXPREFIXES = true: combine with a second prefix and another suffix // COMPLEXPREFIXES = false: combine with a suffix stems.AddRange(Stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix, caseVariant)); } else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) { // we took away a suffix. // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed // COMPLEXPREFIXES = false: combine with another suffix stems.AddRange(Stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant)); } } else if (recursionDepth == 1) { if (prefix && dictionary.complexPrefixes) { // we took away the second prefix: go look for another suffix stems.AddRange(Stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix, caseVariant)); } else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix) { // we took away a prefix, then a suffix: go look for another suffix stems.AddRange(Stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant)); } } } return(stems); }
protected virtual IList <WeightedFragInfo> DiscreteMultiValueHighlighting(IList <WeightedFragInfo> fragInfos, Field[] fields) { IDictionary <string, IList <WeightedFragInfo> > fieldNameToFragInfos = new Dictionary <string, IList <WeightedFragInfo> >(); foreach (Field field in fields) { fieldNameToFragInfos[field.Name] = new JCG.List <WeightedFragInfo>(); } foreach (WeightedFragInfo fragInfo in fragInfos) { int fieldStart; int fieldEnd = 0; foreach (Field field in fields) { if (field.GetStringValue().Length == 0) { fieldEnd++; continue; } fieldStart = fieldEnd; fieldEnd += field.GetStringValue().Length + 1; // + 1 for going to next field with same name. if (fragInfo.StartOffset >= fieldStart && fragInfo.EndOffset >= fieldStart && fragInfo.StartOffset <= fieldEnd && fragInfo.EndOffset <= fieldEnd) { fieldNameToFragInfos[field.Name].Add(fragInfo); goto fragInfos_continue; } if (fragInfo.SubInfos.Count == 0) { goto fragInfos_continue; } Toffs firstToffs = fragInfo.SubInfos[0].TermsOffsets[0]; if (fragInfo.StartOffset >= fieldEnd || firstToffs.StartOffset >= fieldEnd) { continue; } int fragStart = fieldStart; if (fragInfo.StartOffset > fieldStart && fragInfo.StartOffset < fieldEnd) { fragStart = fragInfo.StartOffset; } int fragEnd = fieldEnd; if (fragInfo.EndOffset > fieldStart && fragInfo.EndOffset < fieldEnd) { fragEnd = fragInfo.EndOffset; } // LUCENENET NOTE: Instead of removing during iteration (which isn't allowed in .NET when using an IEnumerator), // We use the IList<T>.RemoveAll() extension method of J2N. This removal happens in a forward way, but since it // accepts a predicate, we can put in the rest of Lucene's logic without doing something expensive like keeping // track of the items to remove in a separate collection. In a nutshell, any time Lucene calls iterator.remove(), // we return true and any time it is skipped, we return false. IList <SubInfo> subInfos = new JCG.List <SubInfo>(); float boost = 0.0f; // The boost of the new info will be the sum of the boosts of its SubInfos fragInfo.SubInfos.RemoveAll((subInfo) => { IList <Toffs> toffsList = new JCG.List <Toffs>(); subInfo.TermsOffsets.RemoveAll((toffs) => { if (toffs.StartOffset >= fieldStart && toffs.EndOffset <= fieldEnd) { toffsList.Add(toffs); return(true); // Remove } return(false); }); if (toffsList.Count > 0) { subInfos.Add(new SubInfo(subInfo.Text, toffsList, subInfo.Seqnum, subInfo.Boost)); boost += subInfo.Boost; } if (subInfo.TermsOffsets.Count == 0) { return(true); // Remove } return(false); }); WeightedFragInfo weightedFragInfo = new WeightedFragInfo(fragStart, fragEnd, subInfos, boost); fieldNameToFragInfos[field.Name].Add(weightedFragInfo); } fragInfos_continue : { } } JCG.List <WeightedFragInfo> result = new JCG.List <WeightedFragInfo>(); foreach (IList <WeightedFragInfo> weightedFragInfos in fieldNameToFragInfos.Values) { result.AddRange(weightedFragInfos); } CollectionUtil.TimSort(result, Comparer <WeightedFragInfo> .Create((info1, info2) => info1.StartOffset - info2.StartOffset)); return(result); }
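The RemoveAll-with-predicate replacement for Java's iterator.remove(), described in the LUCENENET note above, is easier to see in isolation. A minimal stand-alone sketch on a plain List&lt;int&gt; (this is the BCL overload; per the note, J2N provides the same shape as an extension on IList&lt;T&gt;):

using System.Collections.Generic;

var numbers = new List<int> { 1, 2, 3, 4, 5, 6 };
var moved = new List<int>();

// Equivalent of: while iterating, move even values elsewhere and iterator.remove() them.
numbers.RemoveAll(n =>
{
    if (n % 2 == 0)
    {
        moved.Add(n);
        return true;  // remove from the source list
    }
    return false;     // keep
});

// numbers is now [1, 3, 5]; moved is [2, 4, 6].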
/// <summary> /// Generates a list of stems for the provided word /// </summary> /// <param name="word"> Word to generate the stems for </param> /// <param name="length"> length </param> /// <param name="previous"> previous affix that was removed (so we dont remove same one twice) </param> /// <param name="prevFlag"> Flag from a previous stemming step that need to be cross-checked with any affixes in this recursive step </param> /// <param name="prefixFlag"> flag of the most inner removed prefix, so that when removing a suffix, its also checked against the word </param> /// <param name="recursionDepth"> current recursiondepth </param> /// <param name="doPrefix"> true if we should remove prefixes </param> /// <param name="doSuffix"> true if we should remove suffixes </param> /// <param name="previousWasPrefix"> true if the previous removal was a prefix: /// if we are removing a suffix, and it has no continuation requirements, its ok. /// but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. </param> /// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix /// this means inner most suffix must also contain circumfix flag. </param> /// <param name="caseVariant"> true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed. </param> /// <returns> <see cref="IList{CharsRef}"/> of stems, or empty list if no stems are found </returns> private IList <CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix, bool caseVariant) { // TODO: allow this stuff to be reused by tokenfilter JCG.List <CharsRef> stems = new JCG.List <CharsRef>(); if (doPrefix && dictionary.prefixes != null) { FST <Int32sRef> fst = dictionary.prefixes; Outputs <Int32sRef> outputs = fst.Outputs; FST.BytesReader bytesReader = prefixReaders[recursionDepth]; FST.Arc <Int32sRef> arc = prefixArcs[recursionDepth]; fst.GetFirstArc(arc); Int32sRef NO_OUTPUT = outputs.NoOutput; Int32sRef output = NO_OUTPUT; int limit = dictionary.fullStrip ? length : length - 1; for (int i = 0; i < limit; i++) { if (i > 0) { int ch = word[i - 1]; if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null) { break; } else if (arc.Output != NO_OUTPUT) { output = fst.Outputs.Add(output, arc.Output); } } Int32sRef prefixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment if (!arc.IsFinal) { continue; } else { prefixes = fst.Outputs.Add(output, arc.NextFinalOutput); } for (int j = 0; j < prefixes.Length; j++) { int prefix = prefixes.Int32s[prefixes.Offset + j]; if (prefix == previous) { continue; } affixReader.Position = 8 * prefix; char flag = (char)(affixReader.ReadInt16() & 0xffff); char stripOrd = (char)(affixReader.ReadInt16() & 0xffff); int condition = (char)(affixReader.ReadInt16() & 0xffff); bool crossProduct = (condition & 1) == 1; condition = condition.TripleShift(1); char append = (char)(affixReader.ReadInt16() & 0xffff); bool compatible; if (recursionDepth == 0) { if (dictionary.onlyincompound == -1) { compatible = true; } else { // check if affix is allowed in a non-compound word dictionary.flagLookup.Get(append, scratch); char[] appendFlags = Dictionary.DecodeFlags(scratch); compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound); } } else if (crossProduct) { // cross check incoming continuation class (flag of previous affix) against list. 
dictionary.flagLookup.Get(append, scratch); char[] appendFlags = Dictionary.DecodeFlags(scratch); if (Debugging.AssertsEnabled) { Debugging.Assert(prevFlag >= 0); } bool allowed = dictionary.onlyincompound == -1 || !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound); compatible = allowed && HasCrossCheckedFlag((char)prevFlag, appendFlags, false); } else { compatible = false; } if (compatible) { int deAffixedStart = i; int deAffixedLength = length - deAffixedStart; int stripStart = dictionary.stripOffsets[stripOrd]; int stripEnd = dictionary.stripOffsets[stripOrd + 1]; int stripLength = stripEnd - stripStart; if (!CheckCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) { continue; } char[] strippedWord = new char[stripLength + deAffixedLength]; Array.Copy(dictionary.stripData, stripStart, strippedWord, 0, stripLength); Array.Copy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength); IList <CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, prefix, -1, recursionDepth, true, circumfix, caseVariant); stems.AddRange(stemList); } } } } if (doSuffix && dictionary.suffixes != null) { FST <Int32sRef> fst = dictionary.suffixes; Outputs <Int32sRef> outputs = fst.Outputs; FST.BytesReader bytesReader = suffixReaders[recursionDepth]; FST.Arc <Int32sRef> arc = suffixArcs[recursionDepth]; fst.GetFirstArc(arc); Int32sRef NO_OUTPUT = outputs.NoOutput; Int32sRef output = NO_OUTPUT; int limit = dictionary.fullStrip ? 0 : 1; for (int i = length; i >= limit; i--) { if (i < length) { int ch = word[i]; if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null) { break; } else if (arc.Output != NO_OUTPUT) { output = fst.Outputs.Add(output, arc.Output); } } Int32sRef suffixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment if (!arc.IsFinal) { continue; } else { suffixes = fst.Outputs.Add(output, arc.NextFinalOutput); } for (int j = 0; j < suffixes.Length; j++) { int suffix = suffixes.Int32s[suffixes.Offset + j]; if (suffix == previous) { continue; } affixReader.Position = 8 * suffix; char flag = (char)(affixReader.ReadInt16() & 0xffff); char stripOrd = (char)(affixReader.ReadInt16() & 0xffff); int condition = (char)(affixReader.ReadInt16() & 0xffff); bool crossProduct = (condition & 1) == 1; condition = condition.TripleShift(1); char append = (char)(affixReader.ReadInt16() & 0xffff); bool compatible; if (recursionDepth == 0) { if (dictionary.onlyincompound == -1) { compatible = true; } else { // check if affix is allowed in a non-compound word dictionary.flagLookup.Get(append, scratch); char[] appendFlags = Dictionary.DecodeFlags(scratch); compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound); } } else if (crossProduct) { // cross check incoming continuation class (flag of previous affix) against list. 
dictionary.flagLookup.Get(append, scratch); char[] appendFlags = Dictionary.DecodeFlags(scratch); if (Debugging.AssertsEnabled) { Debugging.Assert(prevFlag >= 0); } bool allowed = dictionary.onlyincompound == -1 || !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound); compatible = HasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix); } else { compatible = false; } if (compatible) { int appendLength = length - i; int deAffixedLength = length - appendLength; int stripStart = dictionary.stripOffsets[stripOrd]; int stripEnd = dictionary.stripOffsets[stripOrd + 1]; int stripLength = stripEnd - stripStart; if (!CheckCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) { continue; } char[] strippedWord = new char[stripLength + deAffixedLength]; Array.Copy(word, 0, strippedWord, 0, deAffixedLength); Array.Copy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength); IList <CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant); stems.AddRange(stemList); } } } } return(stems); }
/// <summary> /// Tests a CacheEntry[] for indication of "insane" cache usage. /// <para> /// <b>NOTE:</b>FieldCache CreationPlaceholder objects are ignored. /// (:TODO: is this a bad idea? are we masking a real problem?) /// </para> /// </summary> public Insanity[] Check(params FieldCache.CacheEntry[] cacheEntries) { if (null == cacheEntries || 0 == cacheEntries.Length) { return(Arrays.Empty <Insanity>()); } if (estimateRam) { for (int i = 0; i < cacheEntries.Length; i++) { cacheEntries[i].EstimateSize(); } } // the indirect mapping lets MapOfSet dedup identical valIds for us // maps the (valId) identityhashCode of cache values to // sets of CacheEntry instances MapOfSets <int, FieldCache.CacheEntry> valIdToItems = new MapOfSets <int, FieldCache.CacheEntry>(new Dictionary <int, ISet <FieldCache.CacheEntry> >(17)); // maps ReaderField keys to Sets of ValueIds MapOfSets <ReaderField, int> readerFieldToValIds = new MapOfSets <ReaderField, int>(new Dictionary <ReaderField, ISet <int> >(17)); // any keys that we know result in more then one valId ISet <ReaderField> valMismatchKeys = new JCG.HashSet <ReaderField>(); // iterate over all the cacheEntries to get the mappings we'll need for (int i = 0; i < cacheEntries.Length; i++) { FieldCache.CacheEntry item = cacheEntries[i]; object val = item.Value; // It's OK to have dup entries, where one is eg // float[] and the other is the Bits (from // getDocWithField()) if (val is IBits) { continue; } if (val is FieldCache.ICreationPlaceholder) { continue; } ReaderField rf = new ReaderField(item.ReaderKey, item.FieldName); int valId = RuntimeHelpers.GetHashCode(val); // indirect mapping, so the MapOfSet will dedup identical valIds for us valIdToItems.Put(valId, item); if (1 < readerFieldToValIds.Put(rf, valId)) { valMismatchKeys.Add(rf); } } JCG.List <Insanity> insanity = new JCG.List <Insanity>(valMismatchKeys.Count * 3); insanity.AddRange(CheckValueMismatch(valIdToItems, readerFieldToValIds, valMismatchKeys)); insanity.AddRange(CheckSubreaders(valIdToItems, readerFieldToValIds)); return(insanity.ToArray()); }
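Check() is normally fed the entries currently held by the field cache. A hedged sketch of that call; the entry-point names follow the Java original (FieldCache.DEFAULT.GetCacheEntries()) and should be verified against the port before relying on them:

using Lucene.Net.Search;
using Lucene.Net.Util;

// Assumed entry point: inspect everything currently held by the default field cache.
FieldCache.CacheEntry[] entries = FieldCache.DEFAULT.GetCacheEntries();

var checker = new FieldCacheSanityChecker();
var problems = checker.Check(entries);

foreach (var insanity in problems)
{
    System.Console.WriteLine(insanity); // each entry describes one suspicious reader/field pairing
}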
/// <summary> /// Extracts all <see cref="MultiTermQuery"/>s for <paramref name="field"/>, and returns equivalent /// automata that will match terms. /// </summary> internal static CharacterRunAutomaton[] ExtractAutomata(Query query, string field) { JCG.List <CharacterRunAutomaton> list = new JCG.List <CharacterRunAutomaton>(); if (query is BooleanQuery booleanQuery) { foreach (BooleanClause clause in booleanQuery.GetClauses()) { if (!clause.IsProhibited) { list.AddRange(ExtractAutomata(clause.Query, field)); } } } else if (query is DisjunctionMaxQuery disjunctionMaxQuery) { foreach (Query sub in disjunctionMaxQuery.Disjuncts) { list.AddRange(ExtractAutomata(sub, field)); } } else if (query is SpanOrQuery spanOrQuery) { foreach (Query sub in spanOrQuery.GetClauses()) { list.AddRange(ExtractAutomata(sub, field)); } } else if (query is SpanNearQuery spanNearQuery) { foreach (Query sub in spanNearQuery.GetClauses()) { list.AddRange(ExtractAutomata(sub, field)); } } else if (query is SpanNotQuery spanNotQuery) { list.AddRange(ExtractAutomata(spanNotQuery.Include, field)); } else if (query is SpanPositionCheckQuery spanPositionCheckQuery) { list.AddRange(ExtractAutomata(spanPositionCheckQuery.Match, field)); } else if (query is ISpanMultiTermQueryWrapper spanMultiTermQueryWrapper) { list.AddRange(ExtractAutomata(spanMultiTermQueryWrapper.WrappedQuery, field)); } else if (query is AutomatonQuery aq) { if (aq.Field.Equals(field, StringComparison.Ordinal)) { list.Add(new CharacterRunAutomatonToStringAnonymousClass(aq.Automaton, () => aq.ToString())); } } else if (query is PrefixQuery pq) { Term prefix = pq.Prefix; if (prefix.Field.Equals(field, StringComparison.Ordinal)) { list.Add(new CharacterRunAutomatonToStringAnonymousClass( BasicOperations.Concatenate(BasicAutomata.MakeString(prefix.Text), BasicAutomata.MakeAnyString()), () => pq.ToString())); } } else if (query is FuzzyQuery fq) { if (fq.Field.Equals(field, StringComparison.Ordinal)) { string utf16 = fq.Term.Text; int[] termText = new int[utf16.CodePointCount(0, utf16.Length)]; for (int cp, i = 0, j = 0; i < utf16.Length; i += Character.CharCount(cp)) { termText[j++] = cp = utf16.CodePointAt(i); } int termLength = termText.Length; int prefixLength = Math.Min(fq.PrefixLength, termLength); string suffix = UnicodeUtil.NewString(termText, prefixLength, termText.Length - prefixLength); LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.Transpositions); Automaton automaton = builder.ToAutomaton(fq.MaxEdits); if (prefixLength > 0) { Automaton prefix = BasicAutomata.MakeString(UnicodeUtil.NewString(termText, 0, prefixLength)); automaton = BasicOperations.Concatenate(prefix, automaton); } list.Add(new CharacterRunAutomatonToStringAnonymousClass(automaton, () => fq.ToString())); } } else if (query is TermRangeQuery tq) { if (tq.Field.Equals(field, StringComparison.Ordinal)) { // this is *not* an automaton, but its very simple list.Add(new SimpleCharacterRunAutomatonAnonymousClass(BasicAutomata.MakeEmpty(), tq)); } } return(list.ToArray(/*new CharacterRunAutomaton[list.size()]*/)); }
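The PrefixQuery branch above builds its automaton as MakeString(prefix) concatenated with MakeAnyString(); the resulting CharacterRunAutomaton can then be run directly against candidate terms. A hedged sketch of that step on its own, using the same Lucene.Net.Util.Automaton calls as the method above (the prefix value is made up):

using Lucene.Net.Util.Automaton;

// "luc*" as an automaton: the literal prefix followed by any string.
Automaton prefixAutomaton = BasicOperations.Concatenate(
    BasicAutomata.MakeString("luc"),
    BasicAutomata.MakeAnyString());

var runAutomaton = new CharacterRunAutomaton(prefixAutomaton);

bool matches = runAutomaton.Run("lucene"); // true: starts with "luc"
bool misses  = runAutomaton.Run("apache"); // false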
public virtual ApplyDeletesResult ApplyDeletesAndUpdates(IndexWriter.ReaderPool readerPool, IList <SegmentCommitInfo> infos) { UninterruptableMonitor.Enter(this); try { long t0 = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results if (infos.Count == 0) { return(new ApplyDeletesResult(false, nextGen++, null)); } if (Debugging.AssertsEnabled) { Debugging.Assert(CheckDeleteStats()); } if (!Any()) { if (infoStream.IsEnabled("BD")) { infoStream.Message("BD", "applyDeletes: no deletes; skipping"); } return(new ApplyDeletesResult(false, nextGen++, null)); } if (infoStream.IsEnabled("BD")) { infoStream.Message("BD", "applyDeletes: infos=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", infos) + " packetCount=" + updates.Count); } long gen = nextGen++; JCG.List <SegmentCommitInfo> infos2 = new JCG.List <SegmentCommitInfo>(); infos2.AddRange(infos); infos2.Sort(sortSegInfoByDelGen); CoalescedUpdates coalescedUpdates = null; bool anyNewDeletes = false; int infosIDX = infos2.Count - 1; int delIDX = updates.Count - 1; IList <SegmentCommitInfo> allDeleted = null; while (infosIDX >= 0) { //System.out.println("BD: cycle delIDX=" + delIDX + " infoIDX=" + infosIDX); FrozenBufferedUpdates packet = delIDX >= 0 ? updates[delIDX] : null; SegmentCommitInfo info = infos2[infosIDX]; long segGen = info.BufferedDeletesGen; if (packet != null && segGen < packet.DelGen) { // System.out.println(" coalesce"); if (coalescedUpdates is null) { coalescedUpdates = new CoalescedUpdates(); } if (!packet.isSegmentPrivate) { /* * Only coalesce if we are NOT on a segment private del packet: the segment private del packet * must only applied to segments with the same delGen. Yet, if a segment is already deleted * from the SI since it had no more documents remaining after some del packets younger than * its segPrivate packet (higher delGen) have been applied, the segPrivate packet has not been * removed. 
*/ coalescedUpdates.Update(packet); } delIDX--; } else if (packet != null && segGen == packet.DelGen) { if (Debugging.AssertsEnabled) { Debugging.Assert(packet.isSegmentPrivate, "Packet and Segments deletegen can only match on a segment private del packet gen={0}", segGen); } //System.out.println(" eq"); // Lock order: IW -> BD -> RP if (Debugging.AssertsEnabled) { Debugging.Assert(readerPool.InfoIsLive(info)); } ReadersAndUpdates rld = readerPool.Get(info, true); SegmentReader reader = rld.GetReader(IOContext.READ); int delCount = 0; bool segAllDeletes; try { DocValuesFieldUpdates.Container dvUpdates = new DocValuesFieldUpdates.Container(); if (coalescedUpdates != null) { //System.out.println(" del coalesced"); delCount += (int)ApplyTermDeletes(coalescedUpdates.TermsIterable(), rld, reader); delCount += (int)ApplyQueryDeletes(coalescedUpdates.QueriesIterable(), rld, reader); ApplyDocValuesUpdates(coalescedUpdates.numericDVUpdates, rld, reader, dvUpdates); ApplyDocValuesUpdates(coalescedUpdates.binaryDVUpdates, rld, reader, dvUpdates); } //System.out.println(" del exact"); // Don't delete by Term here; DocumentsWriterPerThread // already did that on flush: delCount += (int)ApplyQueryDeletes(packet.GetQueriesEnumerable(), rld, reader); ApplyDocValuesUpdates(packet.numericDVUpdates, rld, reader, dvUpdates); ApplyDocValuesUpdates(packet.binaryDVUpdates, rld, reader, dvUpdates); if (dvUpdates.Any()) { rld.WriteFieldUpdates(info.Info.Dir, dvUpdates); } int fullDelCount = rld.Info.DelCount + rld.PendingDeleteCount; if (Debugging.AssertsEnabled) { Debugging.Assert(fullDelCount <= rld.Info.Info.DocCount); } segAllDeletes = fullDelCount == rld.Info.Info.DocCount; } finally { rld.Release(reader); readerPool.Release(rld); } anyNewDeletes |= delCount > 0; if (segAllDeletes) { if (allDeleted is null) { allDeleted = new JCG.List <SegmentCommitInfo>(); } allDeleted.Add(info); } if (infoStream.IsEnabled("BD")) { infoStream.Message("BD", "seg=" + info + " segGen=" + segGen + " segDeletes=[" + packet + "]; coalesced deletes=[" + (coalescedUpdates is null ? "null" : coalescedUpdates.ToString()) + "] newDelCount=" + delCount + (segAllDeletes ? " 100% deleted" : "")); } if (coalescedUpdates is null) { coalescedUpdates = new CoalescedUpdates(); } /* * Since we are on a segment private del packet we must not * update the coalescedDeletes here! We can simply advance to the * next packet and seginfo. 
*/ delIDX--; infosIDX--; info.SetBufferedDeletesGen(gen); } else { //System.out.println(" gt"); if (coalescedUpdates != null) { // Lock order: IW -> BD -> RP if (Debugging.AssertsEnabled) { Debugging.Assert(readerPool.InfoIsLive(info)); } ReadersAndUpdates rld = readerPool.Get(info, true); SegmentReader reader = rld.GetReader(IOContext.READ); int delCount = 0; bool segAllDeletes; try { delCount += (int)ApplyTermDeletes(coalescedUpdates.TermsIterable(), rld, reader); delCount += (int)ApplyQueryDeletes(coalescedUpdates.QueriesIterable(), rld, reader); DocValuesFieldUpdates.Container dvUpdates = new DocValuesFieldUpdates.Container(); ApplyDocValuesUpdates(coalescedUpdates.numericDVUpdates, rld, reader, dvUpdates); ApplyDocValuesUpdates(coalescedUpdates.binaryDVUpdates, rld, reader, dvUpdates); if (dvUpdates.Any()) { rld.WriteFieldUpdates(info.Info.Dir, dvUpdates); } int fullDelCount = rld.Info.DelCount + rld.PendingDeleteCount; if (Debugging.AssertsEnabled) { Debugging.Assert(fullDelCount <= rld.Info.Info.DocCount); } segAllDeletes = fullDelCount == rld.Info.Info.DocCount; } finally { rld.Release(reader); readerPool.Release(rld); } anyNewDeletes |= delCount > 0; if (segAllDeletes) { if (allDeleted is null) { allDeleted = new JCG.List <SegmentCommitInfo>(); } allDeleted.Add(info); } if (infoStream.IsEnabled("BD")) { infoStream.Message("BD", "seg=" + info + " segGen=" + segGen + " coalesced deletes=[" + coalescedUpdates + "] newDelCount=" + delCount + (segAllDeletes ? " 100% deleted" : "")); } } info.SetBufferedDeletesGen(gen); infosIDX--; } } if (Debugging.AssertsEnabled) { Debugging.Assert(CheckDeleteStats()); } if (infoStream.IsEnabled("BD")) { infoStream.Message("BD", "applyDeletes took " + ((J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) - t0) + " msec"); // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results } // assert infos != segmentInfos || !any() : "infos=" + infos + " segmentInfos=" + segmentInfos + " any=" + any; return(new ApplyDeletesResult(anyNewDeletes, gen, allDeleted)); } finally { UninterruptableMonitor.Exit(this); } }
/// <summary> /// The <see cref="SubSpans"/> are ordered in the same doc, so there is a possible match. /// Compute the slop while making the match as short as possible by advancing /// all <see cref="SubSpans"/> except the last one in reverse order. /// </summary> private bool ShrinkToAfterShortestMatch() { matchStart = subSpans[subSpans.Length - 1].Start; matchEnd = subSpans[subSpans.Length - 1].End; var possibleMatchPayloads = new JCG.HashSet <byte[]>(); if (subSpans[subSpans.Length - 1].IsPayloadAvailable) { possibleMatchPayloads.UnionWith(subSpans[subSpans.Length - 1].GetPayload()); } IList <byte[]> possiblePayload = null; int matchSlop = 0; int lastStart = matchStart; int lastEnd = matchEnd; for (int i = subSpans.Length - 2; i >= 0; i--) { Spans prevSpans = subSpans[i]; if (collectPayloads && prevSpans.IsPayloadAvailable) { possiblePayload = new JCG.List <byte[]>(prevSpans.GetPayload()); // LUCENENET specific - using copy constructor instead of AddRange() } int prevStart = prevSpans.Start; int prevEnd = prevSpans.End; while (true) // Advance prevSpans until after (lastStart, lastEnd) { if (!prevSpans.MoveNext()) { inSameDoc = false; more = false; break; // Check remaining subSpans for final match. } else if (matchDoc != prevSpans.Doc) { inSameDoc = false; // The last subSpans is not advanced here. break; // Check remaining subSpans for last match in this document. } else { int ppStart = prevSpans.Start; int ppEnd = prevSpans.End; // Cannot avoid invoking .end() if (!DocSpansOrdered(ppStart, ppEnd, lastStart, lastEnd)) { break; // Check remaining subSpans. } // prevSpans still before (lastStart, lastEnd) else { prevStart = ppStart; prevEnd = ppEnd; if (collectPayloads && prevSpans.IsPayloadAvailable) { possiblePayload = new JCG.List <byte[]>(prevSpans.GetPayload()); // LUCENENET specific - using copy constructor instead of AddRange() } } } } if (collectPayloads && possiblePayload != null) { possibleMatchPayloads.UnionWith(possiblePayload); } if (Debugging.AssertsEnabled) { Debugging.Assert(prevStart <= matchStart); } if (matchStart > prevEnd) // Only non overlapping spans add to slop. { matchSlop += (matchStart - prevEnd); } /* Do not break on (matchSlop > allowedSlop) here to make sure * that subSpans[0] is advanced after the match, if any. */ matchStart = prevStart; lastStart = prevStart; lastEnd = prevEnd; } bool match = matchSlop <= allowedSlop; if (collectPayloads && match && possibleMatchPayloads.Count > 0) { matchPayload.AddRange(possibleMatchPayloads); } return(match); // ordered and allowed slop }
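The slop computed above counts only the gap between non-overlapping sub-spans: if the first sub-span matches positions [2, 3) and the last matches [5, 6), the gap is 5 - 3 = 2, so the match survives whenever the allowed slop is at least 2. A hedged sketch of the query side that drives this scorer, with made-up field and terms (clauses, slop, in-order constructor of SpanNearQuery):

using Lucene.Net.Index;
using Lucene.Net.Search.Spans;

// "quick" followed by "fox" with at most two positions in between, in order.
var near = new SpanNearQuery(
    new SpanQuery[]
    {
        new SpanTermQuery(new Term("body", "quick")),
        new SpanTermQuery(new Term("body", "fox"))
    },
    2,      // slop: at most two intervening positions
    true);  // inOrder: "quick" must precede "fox"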