Ejemplo n.º 1
        /// <summary>
        /// Prepares the queries: obtains an analyzer via <c>NewAnalyzerTask.CreateAnalyzer</c>
        /// using the "analyzer" config setting and returns the result of
        /// <c>CreateQueries</c> for the query list and that analyzer.
        /// </summary>
        // NOTE(review): this excerpt is truncated - braces are missing and the
        // CreateAnalyzer(...) call below is cut off mid-argument list.
        protected override Query[] PrepareQueries()
            // analyzer (default is standard analyzer)
            Analyzer anlzr = NewAnalyzerTask.CreateAnalyzer(m_config.Get("analyzer",

            // The query list is created with an initial capacity of 20.
            JCG.List <object> queryList = new JCG.List <object>(20);
            return(CreateQueries(queryList, anlzr));
Ejemplo n.º 2
        /// <summary>
        /// Prepares the queries: creates the analyzer named by the "analyzer" config
        /// setting (falling back to the assembly-qualified name of
        /// <c>StandardAnalyzer</c>) and returns the result of <c>CreateQueries</c>
        /// for the query list and that analyzer.
        /// </summary>
        // NOTE(review): braces and the statements guarded by the
        // "enwikiQueryMaker.disableSpanQueries" check are not visible in this excerpt.
        protected override Query[] PrepareQueries()
            // analyzer (default is standard analyzer)
            Analyzer anlzr = NewAnalyzerTask.CreateAnalyzer(m_config.Get("analyzer", typeof(StandardAnalyzer).AssemblyQualifiedName));

            // The query list is created with an initial capacity of 20.
            JCG.List <object> queryList = new JCG.List <object>(20);
            // The guarded work runs only when the setting is false (its default here).
            if (!m_config.Get("enwikiQueryMaker.disableSpanQueries", false))
            return(CreateQueries(queryList, anlzr));
Ejemplo n.º 3
        /// <summary>
        /// For every <c>minNrShouldMatch</c> from 1 up to the number of terms, builds two
        /// scorers over the full term array that differ only in the final boolean
        /// argument and passes them to <c>AssertNext</c>.
        /// </summary>
        // NOTE(review): braces and the statements that fill termsList are not visible
        // in this excerpt.
        public virtual void TestNextAllTerms()
            // Capacity is the combined size of the common, medium and rare term arrays.
            IList <string> termsList = new JCG.List <string>(commonTerms.Length + mediumTerms.Length + rareTerms.Length);

            string[] terms = termsList.ToArray();

            for (int minNrShouldMatch = 1; minNrShouldMatch <= terms.Length; minNrShouldMatch++)
                // Same terms and threshold; only the last flag differs between the two scorers.
                Scorer expected = Scorer(terms, minNrShouldMatch, true);
                Scorer actual   = Scorer(terms, minNrShouldMatch, false);
                AssertNext(expected, actual);
        /// <summary>
        /// Builds a list of <see cref="FacetField"/>s from views over two candidate lists
        /// (<c>categories_a</c> and <c>categories_b</c>), sized by randomly chosen counts.
        /// </summary>
        /// <param name="random">Source of randomness for the two counts.</param>
        // NOTE(review): braces, the statements that populate categories_a / categories_b,
        // and the return statement are not visible in this excerpt.
        private static IList <FacetField> RandomCategories(Random random)
            // add random categories from the two dimensions, ensuring that the same
            // category is not added twice.
            int numFacetsA = random.Next(3) + 1; // 1-3
            int numFacetsB = random.Next(2) + 1; // 1-2

            JCG.List <FacetField> categories_a = new JCG.List <FacetField>();
            JCG.List <FacetField> categories_b = new JCG.List <FacetField>();

            List <FacetField> categories = new List <FacetField>();

            // GetView(0, n) presumably exposes the first n items of each list - confirm against J2N.
            categories.AddRange(categories_a.GetView(0, numFacetsA)); // LUCENENET: Checked length for correctness
            categories.AddRange(categories_b.GetView(0, numFacetsB)); // LUCENENET: Checked length for correctness

            // add the NO_PARENT categories

Ejemplo n.º 5
 // TODO: this should use inputstreams from the loader, not File!
 /// <summary>
 /// When <c>mapping</c> is set, collects rule lines through <c>GetLines</c>, parses
 /// them into a <c>NormalizeCharMap.Builder</c>, and stores the built map in
 /// <c>m_normMap</c> (reset to null when the built map's inner <c>map</c> is null).
 /// </summary>
 /// <param name="loader">Resource loader passed to <c>GetLines</c>.</param>
 // NOTE(review): braces and the else keyword are elided in this excerpt; the
 // SplitFileNames path below is presumably the branch taken when File.Exists is false.
 public virtual void Inform(IResourceLoader loader)
     if (mapping != null)
         IList <string> wlist;
         // mapping names a single existing file: read its lines directly.
         if (File.Exists(mapping))
             wlist = new JCG.List <string>(GetLines(loader, mapping));
             // Otherwise mapping is split into several file names, each read in turn.
             var files = SplitFileNames(mapping);
             wlist = new JCG.List <string>();
             foreach (string file in files)
                 var lines = GetLines(loader, file.Trim());
         NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
         ParseRules(wlist, builder);
         m_normMap = builder.Build();
         if (m_normMap.map == null)
             // if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
             // so just set the whole map to null
             m_normMap = null;
Ejemplo n.º 6
        /// <summary>
        /// LUCENENET specific
        /// Is non-static because NewIndexWriterConfig is no longer static.
        /// <para/>
        /// Opens an <c>IndexWriter</c> on <paramref name="dir"/> and walks the values of
        /// <paramref name="docs"/> on the calling thread.
        /// </summary>
        /// <param name="random">Randomness passed to the writer config and the mock analyzer.</param>
        /// <param name="docs">Documents to index; only the values are enumerated here.</param>
        /// <param name="dir">Directory the writer is opened on.</param>
        // NOTE(review): braces, the field-copy loop bodies and the writer add/close
        // calls are not visible in this excerpt.
        public void IndexSerial(Random random, IDictionary <string, Document> docs, Directory dir)
            // Writer uses a MockAnalyzer and a merge policy from NewLogMergePolicy().
            IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(random, TEST_VERSION_CURRENT, new MockAnalyzer(random)).SetMergePolicy(NewLogMergePolicy()));

            // index all docs in a single thread
            IEnumerator <Document> iter = docs.Values.GetEnumerator();

            while (iter.MoveNext())
                Document d = iter.Current;
                IList <IIndexableField> fields = new JCG.List <IIndexableField>();
                // put fields in same order each time

                // A fresh document is created per source document; the loop below iterates the collected fields.
                Document d1 = new Document();
                for (int i = 0; i < fields.Count; i++)
                // System.out.println("indexing "+d1);

Ejemplo n.º 7
            /// <summary>
            /// Merging constructor.  Note that this just grabs seqnum from the first info.
            /// The boost is the sum of the boosts of all merged infos.
            /// </summary>
            /// <param name="toMerge">Infos to merge; must contain at least one element.</param>
            /// <exception cref="ArgumentException">If <paramref name="toMerge"/> is empty.</exception>
            // NOTE(review): braces, a try/finally (if any) and some branch bodies are
            // not visible in this excerpt.
            public WeightedPhraseInfo(ICollection <WeightedPhraseInfo> toMerge)
                // One offsets enumerator per info being merged.
                IEnumerator <Toffs>[] allToffs = new IEnumerator <Toffs> [toMerge.Count];
                    // Pretty much the same idea as merging FieldPhraseLists:
                    // Step 1.  Sort by startOffset, endOffset
                    //          While we are here merge the boosts and termInfos
                    using IEnumerator <WeightedPhraseInfo> toMergeItr = toMerge.GetEnumerator();
                    if (!toMergeItr.MoveNext())
                        throw new ArgumentException("toMerge must contain at least one WeightedPhraseInfo.");
                    WeightedPhraseInfo first = toMergeItr.Current;

                    // Seed the merged state from the first info.
                    termsInfos  = new JCG.List <TermInfo>();
                    seqnum      = first.seqnum;
                    boost       = first.boost;
                    allToffs[0] = first.termsOffsets.GetEnumerator();
                    int index = 1;
                    // Fold the remaining infos in: accumulate boost, collect their offset enumerators.
                    while (toMergeItr.MoveNext())
                        WeightedPhraseInfo info = toMergeItr.Current;
                        boost += info.boost;
                        allToffs[index++] = info.termsOffsets.GetEnumerator();

                    // Step 2.  Walk the sorted list merging overlaps
                    using MergedEnumerator <Toffs> itr = new MergedEnumerator <Toffs>(false, allToffs);
                    termsOffsets = new JCG.List <Toffs>();
                    if (!itr.MoveNext())
                    Toffs work = itr.Current;
                    while (itr.MoveNext())
                        Toffs current = itr.Current;
                        // current starts at or before the end of work: extend work to cover both.
                        if (current.StartOffset <= work.EndOffset)
                            work.EndOffset = Math.Max(work.EndOffset, current.EndOffset);
                            // presumably the non-overlapping (else) path - the else keyword is elided here.
                            work = current;
        /// <summary>
        /// Class-level setup: for each index name, unzips the embedded resource
        /// "index.&lt;name&gt;.zip" into a temp directory and registers an FS directory
        /// for it in <c>oldIndexDirs</c>.
        /// </summary>
        // NOTE(review): braces and the statements that fill the names list are not
        // visible in this excerpt.
        public override void BeforeClass()

            assertFalse("test infra is broken!", OldFormatImpersonationIsActive);
            // Capacity covers both the multi-segment and single-segment name arrays.
            JCG.List <string> names = new JCG.List <string>(oldNames.Length + oldSingleSegmentNames.Length);
            oldIndexDirs = new Dictionary <string, Directory>();
            foreach (string name in names)
                DirectoryInfo dir = CreateTempDir(name);
                // The zip is looked up as a manifest resource relative to this test type.
                using (Stream zipFileStream = this.GetType().FindAndGetManifestResourceStream("index." + name + ".zip"))
                    TestUtil.Unzip(zipFileStream, dir);
                oldIndexDirs[name] = NewFSDirectory(dir);
Ejemplo n.º 9
        /// <summary>
        /// For each prefix length of the term list from 2 up to its full size, and for each
        /// <c>minNrShouldMatch</c> from 1 up to that length, builds two scorers that differ
        /// only in the final boolean argument and passes them to <c>AssertNext</c>.
        /// </summary>
        // NOTE(review): braces and the statements that fill termsList are not visible
        // in this excerpt.
        public virtual void TestNextVaryingNumberOfTerms()
            // Capacity is the combined size of the common, medium and rare term arrays.
            IList <string> termsList = new JCG.List <string>(commonTerms.Length + mediumTerms.Length + rareTerms.Length);


            for (int numTerms = 2; numTerms <= termsList.Count; numTerms++)
                // View starting at index 0 with numTerms as the second argument, copied to an array.
                string[] terms = termsList.GetView(0, numTerms).ToArray(/*new string[0]*/); // LUCENENET: Checked length of GetView() for correctness
                for (int minNrShouldMatch = 1; minNrShouldMatch <= terms.Length; minNrShouldMatch++)
                    Scorer expected = Scorer(terms, minNrShouldMatch, true);
                    Scorer actual   = Scorer(terms, minNrShouldMatch, false);
                    AssertNext(expected, actual);
Ejemplo n.º 10
        /// <summary>
        /// For each old index name, opens a new directory based on the matching entry in
        /// <c>oldIndexDirs</c> and runs <c>IndexUpgrader.Upgrade()</c> on it.
        /// </summary>
        // NOTE(review): braces, the statements that fill the names list and any
        // post-upgrade checks are not visible in this excerpt.
        public virtual void TestUpgradeOldIndex()
            JCG.List <string> names = new JCG.List <string>(oldNames.Length + oldSingleSegmentNames.Length);
            foreach (string name in names)
                if (Verbose)
                    Console.WriteLine("testUpgradeOldIndex: index=" + name);
                Directory dir = NewDirectory(oldIndexDirs[name]);

                // The upgrader is given a writer config created with a null analyzer argument.
                (new IndexUpgrader(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, null), false)).Upgrade();


Ejemplo n.º 11
        /// <summary>
        /// Gets rules for a combination of name type, rule type and languages.
        /// The rules come from the values of the map returned by <c>GetInstanceMap</c>.
        /// </summary>
        /// <param name="nameType">The <see cref="NameType"/> to consider.</param>
        /// <param name="rt">The <see cref="RuleType"/> to consider.</param>
        /// <param name="langs">The set of languages to consider.</param>
        /// <returns>A list of <see cref="Rule"/>s that apply.</returns>
        // NOTE(review): braces, the loop body and the return statement are not
        // visible in this excerpt.
        public static IList <Rule> GetInstance(NameType nameType, RuleType rt,
                                               LanguageSet langs)
            IDictionary <string, IList <Rule> > ruleMap = GetInstanceMap(nameType, rt, langs);
            // Accumulator for the per-key rule lists iterated below.
            IList <Rule> allRules = new JCG.List <Rule>();

            foreach (IList <Rule> rules in ruleMap.Values)
Ejemplo n.º 12
        /// <summary>
        /// Creates <c>AMOUNT_OF_PARENT_DOCS</c> document blocks for the given segment via
        /// <c>CreateParentDocWithChildren</c> and iterates them to build a single
        /// document list.
        /// </summary>
        /// <param name="segmentNumber">Segment number passed to <c>CreateParentDocWithChildren</c>.</param>
        // NOTE(review): braces, the body of the final loop and the return statement
        // are not visible in this excerpt.
        private IList <Document> CreateDocsForSegment(int segmentNumber)
            IList <IList <Document> > blocks = new JCG.List <IList <Document> >(AMOUNT_OF_PARENT_DOCS);

            // One block per parent index i.
            for (int i = 0; i < AMOUNT_OF_PARENT_DOCS; i++)
                blocks.Add(CreateParentDocWithChildren(segmentNumber, i));
            // Result list is pre-sized to AMOUNT_OF_DOCS_IN_SEGMENT.
            IList <Document> result = new JCG.List <Document>(AMOUNT_OF_DOCS_IN_SEGMENT);

            foreach (IList <Document> block in blocks)
Ejemplo n.º 13
        /// <summary>
        /// Collects stems for <paramref name="word"/>: looks the word up in the dictionary,
        /// adds a stem per found form (after optional KEEPCASE / NEEDAFFIX /
        /// ONLYINCOMPOUND flag checks), then appends the results of <c>Stem</c>.
        /// An exception matching <c>IsIOException()</c> is rethrown wrapped by
        /// <c>RuntimeException.Create</c>.
        /// </summary>
        /// <param name="word">Character buffer holding the word.</param>
        /// <param name="length">Length of the word within the buffer.</param>
        /// <param name="caseVariant">Whether a case variant is being looked up; enables the keepcase check.</param>
        // NOTE(review): braces, the try keyword, the skip statements after each flag
        // check and the return statement are not visible in this excerpt.
        private IList <CharsRef> DoStem(char[] word, int length, bool caseVariant)
            JCG.List <CharsRef> stems = new JCG.List <CharsRef>();
            Int32sRef           forms = dictionary.LookupWord(word, 0, length);

            if (forms != null)
                // Forms are visited in steps of formStep entries.
                for (int i = 0; i < forms.Length; i += formStep)
                    // A dictionary flag value of -1 means the corresponding check is disabled.
                    bool checkKeepCase       = caseVariant && dictionary.keepcase != -1;
                    bool checkNeedAffix      = dictionary.needaffix != -1;
                    bool checkOnlyInCompound = dictionary.onlyincompound != -1;
                    // Flags are only decoded when at least one check is active.
                    if (checkKeepCase || checkNeedAffix || checkOnlyInCompound)
                        dictionary.flagLookup.Get(forms.Int32s[forms.Offset + i], scratch);
                        char[] wordFlags = Dictionary.DecodeFlags(scratch);
                        // we are looking for a case variant, but this word does not allow it
                        if (checkKeepCase && Dictionary.HasFlag(wordFlags, (char)dictionary.keepcase))
                        // we can't add this form, its a pseudostem requiring an affix
                        if (checkNeedAffix && Dictionary.HasFlag(wordFlags, (char)dictionary.needaffix))
                        // we can't add this form, it only belongs inside a compound word
                        if (checkOnlyInCompound && Dictionary.HasFlag(wordFlags, (char)dictionary.onlyincompound))
                    stems.Add(NewStem(word, length, forms, i));
                // Affix stripping starts with no previous affix or flags (-1) at recursion depth 0.
                stems.AddRange(Stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
            catch (Exception bogus) when(bogus.IsIOException())
                throw RuntimeException.Create(bogus);
Ejemplo n.º 14
 /// <summary>
 /// Loads optional resources: when <c>wordFiles</c> is set, fills <c>protectedWords</c>
 /// via <c>GetWordSet</c>; when <c>types</c> is set, reads lines from each listed file
 /// and assigns <c>typeTable</c> from <c>ParseTypes</c>.
 /// </summary>
 /// <param name="loader">Resource loader passed to <c>GetWordSet</c> and <c>GetLines</c>.</param>
 // NOTE(review): braces and the statement that adds the read lines to wlist are
 // not visible in this excerpt.
 public virtual void Inform(IResourceLoader loader)
     if (wordFiles != null)
         protectedWords = GetWordSet(loader, wordFiles, false);
     if (types != null)
         // types may name several files; each name is trimmed before loading.
         IList <string> files = SplitFileNames(types);
         IList <string> wlist = new JCG.List <string>();
         foreach (string file in files)
             IList <string> lines = GetLines(loader, file.Trim());
         typeTable = ParseTypes(wlist);
Ejemplo n.º 15
 /// <summary>
 /// Reads rule lines for <paramref name="synonyms"/>: directly from that file when
 /// <c>File.Exists</c> reports it, or from each name produced by
 /// <c>SplitFileNames</c>.
 /// </summary>
 /// <param name="synonyms">A file name, or a value that <c>SplitFileNames</c> splits into file names.</param>
 /// <param name="loader">Resource loader passed to <c>GetLines</c>.</param>
 /// <returns> a list of all rules </returns>
 // NOTE(review): braces, the else keyword, the statement that adds lines to wlist
 // and the return statement are not visible in this excerpt.
 private IEnumerable <string> LoadRules(string synonyms, IResourceLoader loader)
     JCG.List <string> wlist = null;
     if (File.Exists(synonyms))
         wlist = new JCG.List <string>(GetLines(loader, synonyms));
         // presumably the else branch: treat synonyms as a list of file names.
         IList <string> files = SplitFileNames(synonyms);
         wlist = new JCG.List <string>();
         foreach (string file in files)
             IList <string> lines = GetLines(loader, file.Trim());
Ejemplo n.º 16
 /// <summary>
 /// Builds the combined scorer for the case where at least one required scorer exists.
 /// Three shapes are visible: a conjunction when every optional scorer is effectively
 /// required, a dual conjunction of required scorers with a disjunction over optional
 /// scorers when <c>minNrShouldMatch</c> is positive, and a <c>ReqOptSumScorer</c>
 /// otherwise. Prohibited scorers are applied through <c>AddProhibitedScorers</c> in each case.
 /// </summary>
 // NOTE(review): opening braces, else keywords and (presumably) the statement that
 // adds the optional scorers to allReq are not visible in this excerpt.
 private Scorer MakeCountingSumScorerSomeReq(/* bool disableCoord // LUCENENET: Not Referenced */) // At least one required scorer.
     if (optionalScorers.Count == minNrShouldMatch)                                                // all optional scorers also required.
         // allReq starts as a copy of the required scorers.
         JCG.List <Scorer> allReq = new JCG.List <Scorer>(requiredScorers);
         return(AddProhibitedScorers(CountingConjunctionSumScorer(/* disableCoord, // LUCENENET: Not Referenced */ allReq)));
     } // optionalScorers.size() > minNrShouldMatch, and at least one required scorer
         // A single required scorer is wrapped directly instead of building a conjunction.
         Scorer requiredCountingSumScorer = requiredScorers.Count == 1 ? new SingleMatchScorer(this, requiredScorers[0]) : CountingConjunctionSumScorer(/* disableCoord, // LUCENENET: Not Referenced */ requiredScorers);
         if (minNrShouldMatch > 0)                                                                                                                                                                                // use a required disjunction scorer over the optional scorers
             return(AddProhibitedScorers(DualConjunctionSumScorer(/* disableCoord, // LUCENENET: Not Referenced */ requiredCountingSumScorer, CountingDisjunctionSumScorer(optionalScorers, minNrShouldMatch)))); // non counting
         } // minNrShouldMatch == 0
             // Optional part: a single optional scorer is wrapped directly, otherwise a disjunction with minimum 1.
             return(new ReqOptSumScorer(AddProhibitedScorers(requiredCountingSumScorer), optionalScorers.Count == 1 ? new SingleMatchScorer(this, optionalScorers[0])
                                        // require 1 in combined, optional scorer.
                             : CountingDisjunctionSumScorer(optionalScorers, 1)));
Ejemplo n.º 17
        /// <summary>
        /// Perform the actual DM Soundex algorithm on the input string.
        /// The cleaned input is scanned character by character; for each character the
        /// rules registered in <c>RULES</c> are tried against the remaining input and the
        /// replacements of a matching rule are fed into the current branches.
        /// </summary>
        /// <param name="source">A string to encode.</param>
        /// <param name="branching">If branching shall be performed.</param>
        /// <returns>A string array containing all DM Soundex codes corresponding to the string supplied depending on the selected branching mode.</returns>
        /// <exception cref="ArgumentException">If a character is not mapped.</exception>
        // NOTE(review): braces and several statements (early returns, continue/break,
        // list updates after the branching checks, the final return) are not visible
        // in this excerpt.
        private string[] GetSoundex(string source, bool branching)
            if (source == null)

            string input = Cleanup(source);

            // LinkedHashSet preserves input order. In .NET we can use List for that purpose.
            // The branch list starts with a single new Branch.
            IList <Branch> currentBranches = new JCG.List <Branch>
                new Branch()

            // '\0' marks "no previous character yet".
            char lastChar = '\0';

            for (int index = 0; index < input.Length; index++)
                char ch = input[index];

                // ignore whitespace inside a name
                if (char.IsWhiteSpace(ch))

                // Rules are matched against the input from the current position onward.
                string inputContext = input.Substring(index);
                if (!RULES.TryGetValue(ch, out IList <Rule> rules) || rules == null)

                // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
                IList <Branch> nextBranches = branching ? new JCG.List <Branch>() : Collections.EmptyList <Branch>() as IList <Branch>;

                foreach (Rule rule in rules)
                    if (rule.Matches(inputContext))
                        if (branching)
                        // The second argument is true only while no previous character has been recorded.
                        string[] replacements      = rule.GetReplacements(inputContext, lastChar == '\0');
                        bool     branchingRequired = replacements.Length > 1 && branching;

                        foreach (Branch branch in currentBranches)
                            foreach (string nextReplacement in replacements)
                                // if we have multiple replacements, always create a new branch
                                Branch nextBranch = branchingRequired ? branch.CreateBranch() : branch;

                                // special rule: occurrences of mn or nm are treated differently
                                bool force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');

                                nextBranch.ProcessNextReplacement(nextReplacement, force);

                                if (branching)
                                    if (!nextBranches.Contains(nextBranch))

                        if (branching)
                        // Skip the rest of the matched pattern; the loop's own index++ accounts for one character.
                        index += rule.PatternLength - 1;

                lastChar = ch;

            // One output code per remaining branch, in list order.
            string[] result = new string[currentBranches.Count];
            int      idx    = 0;

            foreach (Branch branch in currentBranches)
                result[idx++] = branch.ToString();

Ejemplo n.º 18
        /// <summary>
        /// Applies the affix rule to the given word, producing a list of stems if any are found
        /// </summary>
        /// <param name="strippedWord"> Word the affix has been removed and the strip added </param>
        /// <param name="length"> valid length of stripped word </param>
        /// <param name="affix"> HunspellAffix representing the affix rule itself </param>
        /// <param name="prefixFlag"> when we already stripped a prefix, we cant simply recurse and check the suffix, unless both are compatible
        ///                   so we must check dictionary form against both to add it as a stem! </param>
        /// <param name="recursionDepth"> current recursion depth </param>
        /// <param name="prefix"> true if we are removing a prefix (false if its a suffix) </param>
        /// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix
        ///        this means inner most suffix must also contain circumfix flag. </param>
        /// <param name="caseVariant"> true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed. </param>
        /// <returns> <see cref="IList{CharsRef}"/> of stems for the word, or an empty list if none are found </returns>
        // NOTE(review): braces, the skip statements after the rejection checks and
        // the return statement are not visible in this excerpt.
        internal IList <CharsRef> ApplyAffix(char[] strippedWord, int length, int affix, int prefixFlag, int recursionDepth, bool prefix, bool circumfix, bool caseVariant)
            // TODO: just pass this in from before, no need to decode it twice
            // Affix records are addressed at 8 * affix; fields are read in order:
            // flag, (skipped) strip, condition + cross-product bit, append.
            affixReader.Position = 8 * affix;
            char flag = (char)(affixReader.ReadInt16() & 0xffff);

            affixReader.SkipBytes(2); // strip
            int  condition    = (char)(affixReader.ReadInt16() & 0xffff);
            // The lowest bit of the raw value is the cross-product flag; the rest is shifted down below.
            bool crossProduct = (condition & 1) == 1;

            condition = condition.TripleShift(1);
            char append = (char)(affixReader.ReadInt16() & 0xffff);

            JCG.List <CharsRef> stems = new JCG.List <CharsRef>();

            Int32sRef forms = dictionary.LookupWord(strippedWord, 0, length);

            if (forms != null)
                for (int i = 0; i < forms.Length; i += formStep)
                    dictionary.flagLookup.Get(forms.Int32s[forms.Offset + i], scratch);
                    char[] wordFlags = Dictionary.DecodeFlags(scratch);
                    // Only dictionary forms carrying this affix's flag are considered.
                    if (Dictionary.HasFlag(wordFlags, flag))
                        // confusing: in this one exception, we already chained the first prefix against the second,
                        // so it doesnt need to be checked against the word
                        bool chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
                        if (chainedPrefix == false && prefixFlag >= 0 && !Dictionary.HasFlag(wordFlags, (char)prefixFlag))
                            // see if we can chain prefix thru the suffix continuation class (only if it has any!)
                            dictionary.flagLookup.Get(append, scratch);
                            char[] appendFlags = Dictionary.DecodeFlags(scratch);
                            if (!HasCrossCheckedFlag((char)prefixFlag, appendFlags, false))

                        // if circumfix was previously set by a prefix, we must check this suffix,
                        // to ensure it has it, and vice versa
                        if (dictionary.circumfix != -1)
                            dictionary.flagLookup.Get(append, scratch);
                            char[] appendFlags     = Dictionary.DecodeFlags(scratch);
                            bool   suffixCircumfix = Dictionary.HasFlag(appendFlags, (char)dictionary.circumfix);
                            if (circumfix != suffixCircumfix)

                        // we are looking for a case variant, but this word does not allow it
                        if (caseVariant && dictionary.keepcase != -1 && Dictionary.HasFlag(wordFlags, (char)dictionary.keepcase))
                        // we aren't decompounding (yet)
                        if (dictionary.onlyincompound != -1 && Dictionary.HasFlag(wordFlags, (char)dictionary.onlyincompound))
                        stems.Add(NewStem(strippedWord, length, forms, i));

            // if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag
            if (dictionary.circumfix != -1 && !circumfix && prefix)
                dictionary.flagLookup.Get(append, scratch);
                char[] appendFlags = Dictionary.DecodeFlags(scratch);
                circumfix = Dictionary.HasFlag(appendFlags, (char)dictionary.circumfix);

            // Recursion into Stem only happens for cross-product affixes, at depth 0 or 1.
            if (crossProduct)
                if (recursionDepth == 0)
                    if (prefix)
                        // we took away the first prefix.
                        // COMPLEXPREFIXES = true:  combine with a second prefix and another suffix
                        // COMPLEXPREFIXES = false: combine with a suffix
                        stems.AddRange(Stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix, caseVariant));
                    else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix)
                        // we took away a suffix.
                        // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
                        // COMPLEXPREFIXES = false: combine with another suffix
                        stems.AddRange(Stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
                else if (recursionDepth == 1)
                    if (prefix && dictionary.complexPrefixes)
                        // we took away the second prefix: go look for another suffix
                        stems.AddRange(Stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix, caseVariant));
                    else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix)
                        // we took away a prefix, then a suffix: go look for another suffix
                        stems.AddRange(Stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));

Ejemplo n.º 19
        /// <summary>
        /// Redistributes fragment infos across the individual values of a multi-valued
        /// field: each fragment is compared with the offset range of every field value,
        /// and where it spans a value boundary a new <c>WeightedFragInfo</c> is built from
        /// the term offsets that fall inside that value. The collected infos are sorted
        /// by start offset.
        /// </summary>
        /// <param name="fragInfos">Fragment infos to distribute; their sub-infos are modified via <c>RemoveAll</c>.</param>
        /// <param name="fields">Field values, in order; offsets advance by each value's length plus one.</param>
        // NOTE(review): braces, continue/break statements, the statements that add
        // infos to the per-field lists, the lambda return-false paths and the final
        // return are not visible in this excerpt.
        protected virtual IList <WeightedFragInfo> DiscreteMultiValueHighlighting(IList <WeightedFragInfo> fragInfos, Field[] fields)
            IDictionary <string, IList <WeightedFragInfo> > fieldNameToFragInfos = new Dictionary <string, IList <WeightedFragInfo> >();

            // One (initially empty) bucket per field name.
            foreach (Field field in fields)
                fieldNameToFragInfos[field.Name] = new JCG.List <WeightedFragInfo>();

            foreach (WeightedFragInfo fragInfo in fragInfos)
                int fieldStart;
                int fieldEnd = 0;
                foreach (Field field in fields)
                    if (field.GetStringValue().Length == 0)
                    // The current value's range starts where the previous one ended.
                    fieldStart = fieldEnd;
                    fieldEnd  += field.GetStringValue().Length + 1; // + 1 for going to next field with same name.

                    // Fragment lies completely inside this field value.
                    if (fragInfo.StartOffset >= fieldStart && fragInfo.EndOffset >= fieldStart &&
                        fragInfo.StartOffset <= fieldEnd && fragInfo.EndOffset <= fieldEnd)

                        goto fragInfos_continue;

                    // Nothing left to distribute for this fragment.
                    if (fragInfo.SubInfos.Count == 0)
                        goto fragInfos_continue;

                    Toffs firstToffs = fragInfo.SubInfos[0].TermsOffsets[0];
                    if (fragInfo.StartOffset >= fieldEnd || firstToffs.StartOffset >= fieldEnd)

                    // Clamp the new fragment's start and end to this field value's range.
                    int fragStart = fieldStart;
                    if (fragInfo.StartOffset > fieldStart && fragInfo.StartOffset < fieldEnd)
                        fragStart = fragInfo.StartOffset;

                    int fragEnd = fieldEnd;
                    if (fragInfo.EndOffset > fieldStart && fragInfo.EndOffset < fieldEnd)
                        fragEnd = fragInfo.EndOffset;

                    // LUCENENET NOTE: Instead of removing during iteration (which isn't allowed in .NET when using an IEnumerator),
                    // We use the IList<T>.RemoveAll() extension method of J2N. This removal happens in a forward way, but since it
                    // accepts a predicate, we can put in the rest of Lucene's logic without doing something expensive like keeping
                    // track of the items to remove in a separate collection. In a nutshell, any time Lucene calls iterator.remove(),
                    // we return true and any time it is skipped, we return false.

                    IList <SubInfo> subInfos = new JCG.List <SubInfo>();
                    float           boost    = 0.0f; //  The boost of the new info will be the sum of the boosts of its SubInfos
                    fragInfo.SubInfos.RemoveAll((subInfo) =>
                        IList <Toffs> toffsList = new JCG.List <Toffs>();
                        subInfo.TermsOffsets.RemoveAll((toffs) =>
                            // Offsets fully inside the current field value are taken out of the original sub-info.
                            if (toffs.StartOffset >= fieldStart && toffs.EndOffset <= fieldEnd)
                                return(true); // Remove
                        // Sub-infos that contributed offsets are re-created for the new fragment.
                        if (toffsList.Count > 0)
                            subInfos.Add(new SubInfo(subInfo.Text, toffsList, subInfo.Seqnum, subInfo.Boost));
                            boost += subInfo.Boost;

                        // A sub-info with no offsets left is dropped from the original fragment.
                        if (subInfo.TermsOffsets.Count == 0)
                            return(true); // Remove

                    WeightedFragInfo weightedFragInfo = new WeightedFragInfo(fragStart, fragEnd, subInfos, boost);
                // Empty labeled block used as the target for continuing the outer fragInfos loop.
                fragInfos_continue : { }

            JCG.List <WeightedFragInfo> result = new JCG.List <WeightedFragInfo>();
            foreach (IList <WeightedFragInfo> weightedFragInfos in fieldNameToFragInfos.Values)
            // Order the combined result by ascending start offset.
            CollectionUtil.TimSort(result, Comparer <WeightedFragInfo> .Create((info1, info2) => info1.StartOffset - info2.StartOffset));

Ejemplo n.º 20
        /// <summary>
        /// Generates a list of stems for the provided word
        /// </summary>
        /// <param name="word"> Word to generate the stems for </param>
        /// <param name="length"> length </param>
        /// <param name="previous"> previous affix that was removed (so we dont remove same one twice) </param>
        /// <param name="prevFlag"> Flag from a previous stemming step that need to be cross-checked with any affixes in this recursive step </param>
        /// <param name="prefixFlag"> flag of the most inner removed prefix, so that when removing a suffix, its also checked against the word </param>
        /// <param name="recursionDepth"> current recursiondepth </param>
        /// <param name="doPrefix"> true if we should remove prefixes </param>
        /// <param name="doSuffix"> true if we should remove suffixes </param>
        /// <param name="previousWasPrefix"> true if the previous removal was a prefix:
        ///        if we are removing a suffix, and it has no continuation requirements, its ok.
        ///        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. </param>
        /// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix
        ///        this means inner most suffix must also contain circumfix flag. </param>
        /// <param name="caseVariant"> true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed. </param>
        /// <returns> <see cref="IList{CharsRef}"/> of stems, or empty list if no stems are found </returns>
        private IList <CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix, bool caseVariant)
            // TODO: allow this stuff to be reused by tokenfilter
            JCG.List <CharsRef> stems = new JCG.List <CharsRef>();

            if (doPrefix && dictionary.prefixes != null)
                FST <Int32sRef>     fst         = dictionary.prefixes;
                Outputs <Int32sRef> outputs     = fst.Outputs;
                FST.BytesReader     bytesReader = prefixReaders[recursionDepth];
                FST.Arc <Int32sRef> arc         = prefixArcs[recursionDepth];
                Int32sRef NO_OUTPUT = outputs.NoOutput;
                Int32sRef output    = NO_OUTPUT;
                int       limit     = dictionary.fullStrip ? length : length - 1;
                for (int i = 0; i < limit; i++)
                    if (i > 0)
                        int ch = word[i - 1];
                        if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null)
                        else if (arc.Output != NO_OUTPUT)
                            output = fst.Outputs.Add(output, arc.Output);
                    Int32sRef prefixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment
                    if (!arc.IsFinal)
                        prefixes = fst.Outputs.Add(output, arc.NextFinalOutput);

                    for (int j = 0; j < prefixes.Length; j++)
                        int prefix = prefixes.Int32s[prefixes.Offset + j];
                        if (prefix == previous)
                        affixReader.Position = 8 * prefix;
                        char flag         = (char)(affixReader.ReadInt16() & 0xffff);
                        char stripOrd     = (char)(affixReader.ReadInt16() & 0xffff);
                        int  condition    = (char)(affixReader.ReadInt16() & 0xffff);
                        bool crossProduct = (condition & 1) == 1;
                        condition = condition.TripleShift(1);
                        char append = (char)(affixReader.ReadInt16() & 0xffff);

                        bool compatible;
                        if (recursionDepth == 0)
                            if (dictionary.onlyincompound == -1)
                                compatible = true;
                                // check if affix is allowed in a non-compound word
                                dictionary.flagLookup.Get(append, scratch);
                                char[] appendFlags = Dictionary.DecodeFlags(scratch);
                                compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                        else if (crossProduct)
                            // cross check incoming continuation class (flag of previous affix) against list.
                            dictionary.flagLookup.Get(append, scratch);
                            char[] appendFlags = Dictionary.DecodeFlags(scratch);
                            if (Debugging.AssertsEnabled)
                                Debugging.Assert(prevFlag >= 0);
                            bool allowed = dictionary.onlyincompound == -1 ||
                                           !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                            compatible = allowed && HasCrossCheckedFlag((char)prevFlag, appendFlags, false);
                            compatible = false;

                        if (compatible)
                            int deAffixedStart  = i;
                            int deAffixedLength = length - deAffixedStart;

                            int stripStart  = dictionary.stripOffsets[stripOrd];
                            int stripEnd    = dictionary.stripOffsets[stripOrd + 1];
                            int stripLength = stripEnd - stripStart;

                            if (!CheckCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength))

                            char[] strippedWord = new char[stripLength + deAffixedLength];
                            Array.Copy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                            Array.Copy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

                            IList <CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, prefix, -1, recursionDepth, true, circumfix, caseVariant);


            if (doSuffix && dictionary.suffixes != null)
                FST <Int32sRef>     fst         = dictionary.suffixes;
                Outputs <Int32sRef> outputs     = fst.Outputs;
                FST.BytesReader     bytesReader = suffixReaders[recursionDepth];
                FST.Arc <Int32sRef> arc         = suffixArcs[recursionDepth];
                Int32sRef NO_OUTPUT = outputs.NoOutput;
                Int32sRef output    = NO_OUTPUT;
                int       limit     = dictionary.fullStrip ? 0 : 1;
                for (int i = length; i >= limit; i--)
                    if (i < length)
                        int ch = word[i];
                        if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null)
                        else if (arc.Output != NO_OUTPUT)
                            output = fst.Outputs.Add(output, arc.Output);
                    Int32sRef suffixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment
                    if (!arc.IsFinal)
                        suffixes = fst.Outputs.Add(output, arc.NextFinalOutput);

                    for (int j = 0; j < suffixes.Length; j++)
                        int suffix = suffixes.Int32s[suffixes.Offset + j];
                        if (suffix == previous)
                        affixReader.Position = 8 * suffix;
                        char flag         = (char)(affixReader.ReadInt16() & 0xffff);
                        char stripOrd     = (char)(affixReader.ReadInt16() & 0xffff);
                        int  condition    = (char)(affixReader.ReadInt16() & 0xffff);
                        bool crossProduct = (condition & 1) == 1;
                        condition = condition.TripleShift(1);
                        char append = (char)(affixReader.ReadInt16() & 0xffff);

                        bool compatible;
                        if (recursionDepth == 0)
                            if (dictionary.onlyincompound == -1)
                                compatible = true;
                                // check if affix is allowed in a non-compound word
                                dictionary.flagLookup.Get(append, scratch);
                                char[] appendFlags = Dictionary.DecodeFlags(scratch);
                                compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                        else if (crossProduct)
                            // cross check incoming continuation class (flag of previous affix) against list.
                            dictionary.flagLookup.Get(append, scratch);
                            char[] appendFlags = Dictionary.DecodeFlags(scratch);
                            if (Debugging.AssertsEnabled)
                                Debugging.Assert(prevFlag >= 0);
                            bool allowed = dictionary.onlyincompound == -1 ||
                                           !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                            compatible = HasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
                            compatible = false;

                        if (compatible)
                            int appendLength    = length - i;
                            int deAffixedLength = length - appendLength;

                            int stripStart  = dictionary.stripOffsets[stripOrd];
                            int stripEnd    = dictionary.stripOffsets[stripOrd + 1];
                            int stripLength = stripEnd - stripStart;

                            if (!CheckCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength))

                            char[] strippedWord = new char[stripLength + deAffixedLength];
                            Array.Copy(word, 0, strippedWord, 0, deAffixedLength);
                            Array.Copy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);

                            IList <CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);


Ejemplo n.º 21
        /// <summary>
        /// Tests a CacheEntry[] for indication of "insane" cache usage.
        /// <para>
        /// <b>NOTE:</b>FieldCache CreationPlaceholder objects are ignored.
        /// (:TODO: is this a bad idea? are we masking a real problem?)
        /// </para>
        /// </summary>
        /// <param name="cacheEntries">The cache entries to inspect; may be <c>null</c> or empty.</param>
        /// <returns>
        /// An empty array when <paramref name="cacheEntries"/> is <c>null</c> or empty.
        /// NOTE(review): the final return statement is not visible in this listing; presumably
        /// the collected <c>insanity</c> list is returned as an array - confirm.
        /// </returns>
        public Insanity[] Check(params FieldCache.CacheEntry[] cacheEntries)
            // Nothing to inspect, so there is nothing to report.
            if (null == cacheEntries || 0 == cacheEntries.Length)
                return(Arrays.Empty <Insanity>());

            // NOTE(review): the body of this loop is missing from the listing; presumably each
            // entry is asked to estimate its RAM usage when estimateRam is set - confirm.
            if (estimateRam)
                for (int i = 0; i < cacheEntries.Length; i++)

            // the indirect mapping lets MapOfSet dedup identical valIds for us
            // maps the (valId) identityhashCode of cache values to
            // sets of CacheEntry instances
            MapOfSets <int, FieldCache.CacheEntry> valIdToItems = new MapOfSets <int, FieldCache.CacheEntry>(new Dictionary <int, ISet <FieldCache.CacheEntry> >(17));
            // maps ReaderField keys to Sets of ValueIds
            MapOfSets <ReaderField, int> readerFieldToValIds = new MapOfSets <ReaderField, int>(new Dictionary <ReaderField, ISet <int> >(17));

            // any keys that we know result in more than one valId
            ISet <ReaderField> valMismatchKeys = new JCG.HashSet <ReaderField>();

            // iterate over all the cacheEntries to get the mappings we'll need
            for (int i = 0; i < cacheEntries.Length; i++)
                FieldCache.CacheEntry item = cacheEntries[i];
                object val = item.Value;

                // It's OK to have dup entries, where one is eg
                // float[] and the other is the Bits (from
                // getDocWithField())
                // NOTE(review): the statement guarded by this check is not visible here;
                // presumably such entries are skipped - confirm.
                if (val is IBits)

                // NOTE(review): guarded statement not visible; the summary above says
                // creation placeholders are ignored, so presumably this skips the entry.
                if (val is FieldCache.ICreationPlaceholder)

                // Key identifying the (reader, field) pair this entry was cached for.
                ReaderField rf = new ReaderField(item.ReaderKey, item.FieldName);

                // Identity-based hash of the cached value object, used as its value id.
                int valId = RuntimeHelpers.GetHashCode(val);

                // indirect mapping, so the MapOfSet will dedup identical valIds for us
                valIdToItems.Put(valId, item);
                // A result greater than 1 is treated as "this reader/field now has more than one valId".
                // NOTE(review): the guarded statement is missing; presumably rf is recorded in
                // valMismatchKeys - confirm.
                if (1 < readerFieldToValIds.Put(rf, valId))

            // Initial capacity: three slots per mismatching key.
            JCG.List <Insanity> insanity = new JCG.List <Insanity>(valMismatchKeys.Count * 3);

            // Collect the findings of both checks into one list.
            insanity.AddRange(CheckValueMismatch(valIdToItems, readerFieldToValIds, valMismatchKeys));
            insanity.AddRange(CheckSubreaders(valIdToItems, readerFieldToValIds));

Ejemplo n.º 22
 /// <summary>
 /// Extracts all <see cref="MultiTermQuery"/>s for <paramref name="field"/>, and returns equivalent
 /// automata that will match terms.
 /// </summary>
 /// <param name="query">The query to inspect; composite query types are traversed recursively.</param>
 /// <param name="field">Field name; leaf queries are only considered when their field equals it (ordinal comparison).</param>
 /// <returns>The automata collected from <paramref name="query"/>, as an array (empty when nothing matched).</returns>
 internal static CharacterRunAutomaton[] ExtractAutomata(Query query, string field)
     JCG.List <CharacterRunAutomaton> list = new JCG.List <CharacterRunAutomaton>();
     // Composite queries: recurse into the parts and merge their results.
     if (query is BooleanQuery booleanQuery)
         foreach (BooleanClause clause in booleanQuery.GetClauses())
             // Prohibited clauses are left out.
             if (!clause.IsProhibited)
                 list.AddRange(ExtractAutomata(clause.Query, field));
     else if (query is DisjunctionMaxQuery disjunctionMaxQuery)
         foreach (Query sub in disjunctionMaxQuery.Disjuncts)
             list.AddRange(ExtractAutomata(sub, field));
     else if (query is SpanOrQuery spanOrQuery)
         foreach (Query sub in spanOrQuery.GetClauses())
             list.AddRange(ExtractAutomata(sub, field));
     else if (query is SpanNearQuery spanNearQuery)
         foreach (Query sub in spanNearQuery.GetClauses())
             list.AddRange(ExtractAutomata(sub, field));
     // Only the Include side of a span-not query is examined.
     else if (query is SpanNotQuery spanNotQuery)
         list.AddRange(ExtractAutomata(spanNotQuery.Include, field));
     else if (query is SpanPositionCheckQuery spanPositionCheckQuery)
         list.AddRange(ExtractAutomata(spanPositionCheckQuery.Match, field));
     // Unwrap and inspect the wrapped multi-term query.
     else if (query is ISpanMultiTermQueryWrapper spanMultiTermQueryWrapper)
         list.AddRange(ExtractAutomata(spanMultiTermQueryWrapper.WrappedQuery, field));
     // Leaf queries: build one automaton each, and only for the requested field.
     else if (query is AutomatonQuery aq)
         if (aq.Field.Equals(field, StringComparison.Ordinal))
             // The delegate supplies the query's own string form to the wrapper.
             list.Add(new CharacterRunAutomatonToStringAnonymousClass(aq.Automaton, () => aq.ToString()));
     else if (query is PrefixQuery pq)
         Term prefix = pq.Prefix;
         if (prefix.Field.Equals(field, StringComparison.Ordinal))
             // Automaton = the literal prefix text followed by any string.
             list.Add(new CharacterRunAutomatonToStringAnonymousClass(
                          BasicOperations.Concatenate(BasicAutomata.MakeString(prefix.Text), BasicAutomata.MakeAnyString()),
                          () => pq.ToString()));
     else if (query is FuzzyQuery fq)
         if (fq.Field.Equals(field, StringComparison.Ordinal))
             // Decode the UTF-16 term text into an array of code points.
             string utf16    = fq.Term.Text;
             int[]  termText = new int[utf16.CodePointCount(0, utf16.Length)];
             for (int cp, i = 0, j = 0; i < utf16.Length; i += Character.CharCount(cp))
                 termText[j++] = cp = utf16.CodePointAt(i);
             // The prefix length is capped at the term length (in code points); the Levenshtein
             // automaton is built only from the remainder after that prefix.
             int    termLength             = termText.Length;
             int    prefixLength           = Math.Min(fq.PrefixLength, termLength);
             string suffix                 = UnicodeUtil.NewString(termText, prefixLength, termText.Length - prefixLength);
             LevenshteinAutomata builder   = new LevenshteinAutomata(suffix, fq.Transpositions);
             Automaton           automaton = builder.ToAutomaton(fq.MaxEdits);
             // Put the literal prefix in front of the fuzzy part when there is one.
             if (prefixLength > 0)
                 Automaton prefix = BasicAutomata.MakeString(UnicodeUtil.NewString(termText, 0, prefixLength));
                 automaton = BasicOperations.Concatenate(prefix, automaton);
             list.Add(new CharacterRunAutomatonToStringAnonymousClass(automaton, () => fq.ToString()));
     else if (query is TermRangeQuery tq)
         if (tq.Field.Equals(field, StringComparison.Ordinal))
             // this is *not* an automaton, but it's very simple
             list.Add(new SimpleCharacterRunAutomatonAnonymousClass(BasicAutomata.MakeEmpty(), tq));
     // Any other query type contributes nothing, so the list may be empty here.
     return(list.ToArray(/*new CharacterRunAutomaton[list.size()]*/));
Ejemplo n.º 23
        /// <summary>
        /// Walks the buffered update packets and a list of segments, starting from the last
        /// index of each, and applies term/query deletes and doc-values updates to segments
        /// obtained through <paramref name="readerPool"/>.
        /// <para>
        /// NOTE(review): this listing is missing lines (braces, else keywords, comment openers
        /// and several statements), so the comments below describe only what is visible.
        /// </para>
        /// </summary>
        /// <param name="readerPool">Pool used to obtain a <c>ReadersAndUpdates</c> for each segment.</param>
        /// <param name="infos">Segments to process; when empty the method returns immediately.</param>
        /// <returns>
        /// An <c>ApplyDeletesResult</c> built from whether any new deletes were counted, the
        /// generation taken from <c>nextGen</c>, and <c>allDeleted</c> (null unless it was created).
        /// </returns>
        public virtual ApplyDeletesResult ApplyDeletesAndUpdates(IndexWriter.ReaderPool readerPool, IList <SegmentCommitInfo> infos)
                // Start time in milliseconds; used only for the timing message at the end.
                long t0 = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results

                // No segments: report "no new deletes", still consuming one generation number.
                if (infos.Count == 0)
                    return(new ApplyDeletesResult(false, nextGen++, null));

                // NOTE(review): the assertion guarded by this check is not visible in the listing.
                if (Debugging.AssertsEnabled)

                // Nothing buffered: same early result as above.
                if (!Any())
                    if (infoStream.IsEnabled("BD"))
                        infoStream.Message("BD", "applyDeletes: no deletes; skipping");
                    return(new ApplyDeletesResult(false, nextGen++, null));

                if (infoStream.IsEnabled("BD"))
                    infoStream.Message("BD", "applyDeletes: infos=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", infos) + " packetCount=" + updates.Count);

                // Generation number reported back in the result.
                long gen = nextGen++;

                // NOTE(review): infos2 is created empty here and the statements that populate it
                // are not visible; presumably it is a copy of infos ordered by delete generation - confirm.
                JCG.List <SegmentCommitInfo> infos2 = new JCG.List <SegmentCommitInfo>();

                CoalescedUpdates coalescedUpdates = null;
                bool             anyNewDeletes    = false;

                // Both cursors start at the last element of their list.
                // NOTE(review): the statements that move the cursors are not visible in this listing.
                int infosIDX = infos2.Count - 1;
                int delIDX   = updates.Count - 1;

                // Created lazily, only once some segment turns out to be fully deleted.
                IList <SegmentCommitInfo> allDeleted = null;

                while (infosIDX >= 0)
                    //System.out.println("BD: cycle delIDX=" + delIDX + " infoIDX=" + infosIDX);

                    // packet is null once all update packets have been consumed.
                    FrozenBufferedUpdates packet = delIDX >= 0 ? updates[delIDX] : null;
                    SegmentCommitInfo     info   = infos2[infosIDX];
                    long segGen = info.BufferedDeletesGen;

                    // Case 1: the packet's generation is greater than the segment's.
                    if (packet != null && segGen < packet.DelGen)
                        //        System.out.println("  coalesce");
                        if (coalescedUpdates is null)
                            coalescedUpdates = new CoalescedUpdates();
                        // NOTE(review): the statement guarded by this check is not visible; the
                        // text below (whose comment opener is missing) explains the intent.
                        if (!packet.isSegmentPrivate)
                             * Only coalesce if we are NOT on a segment private del packet: the segment private del packet
                             * must only applied to segments with the same delGen.  Yet, if a segment is already deleted
                             * from the SI since it had no more documents remaining after some del packets younger than
                             * its segPrivate packet (higher delGen) have been applied, the segPrivate packet has not been
                             * removed.

                    // Case 2: packet and segment share the same generation.
                    else if (packet != null && segGen == packet.DelGen)
                        if (Debugging.AssertsEnabled)
                            Debugging.Assert(packet.isSegmentPrivate, "Packet and Segments deletegen can only match on a segment private del packet gen={0}", segGen);
                        //System.out.println("  eq");

                        // Lock order: IW -> BD -> RP
                        // NOTE(review): the assertion guarded by this check is not visible.
                        if (Debugging.AssertsEnabled)
                        ReadersAndUpdates rld    = readerPool.Get(info, true);
                        SegmentReader     reader = rld.GetReader(IOContext.READ);
                        int  delCount            = 0;
                        bool segAllDeletes;
                            // Gathers the doc-values updates from both sources before they are written.
                            DocValuesFieldUpdates.Container dvUpdates = new DocValuesFieldUpdates.Container();
                            // First apply whatever has been coalesced so far ...
                            if (coalescedUpdates != null)
                                //System.out.println("    del coalesced");
                                delCount += (int)ApplyTermDeletes(coalescedUpdates.TermsIterable(), rld, reader);
                                delCount += (int)ApplyQueryDeletes(coalescedUpdates.QueriesIterable(), rld, reader);
                                ApplyDocValuesUpdates(coalescedUpdates.numericDVUpdates, rld, reader, dvUpdates);
                                ApplyDocValuesUpdates(coalescedUpdates.binaryDVUpdates, rld, reader, dvUpdates);
                            //System.out.println("    del exact");
                            // Don't delete by Term here; DocumentsWriterPerThread
                            // already did that on flush:
                            delCount += (int)ApplyQueryDeletes(packet.GetQueriesEnumerable(), rld, reader);
                            ApplyDocValuesUpdates(packet.numericDVUpdates, rld, reader, dvUpdates);
                            ApplyDocValuesUpdates(packet.binaryDVUpdates, rld, reader, dvUpdates);
                            // Write field updates only when something was collected.
                            if (dvUpdates.Any())
                                rld.WriteFieldUpdates(info.Info.Dir, dvUpdates);
                            // Committed plus pending deletes; the segment counts as fully deleted
                            // when that total equals its document count.
                            int fullDelCount = rld.Info.DelCount + rld.PendingDeleteCount;
                            if (Debugging.AssertsEnabled)
                                Debugging.Assert(fullDelCount <= rld.Info.Info.DocCount);
                            segAllDeletes = fullDelCount == rld.Info.Info.DocCount;
                        anyNewDeletes |= delCount > 0;

                        // NOTE(review): the statement that records the segment in allDeleted is not visible.
                        if (segAllDeletes)
                            if (allDeleted is null)
                                allDeleted = new JCG.List <SegmentCommitInfo>();

                        if (infoStream.IsEnabled("BD"))
                            infoStream.Message("BD", "seg=" + info + " segGen=" + segGen + " segDeletes=[" + packet + "]; coalesced deletes=[" + (coalescedUpdates is null ? "null" : coalescedUpdates.ToString()) + "] newDelCount=" + delCount + (segAllDeletes ? " 100% deleted" : ""));

                        if (coalescedUpdates is null)
                            coalescedUpdates = new CoalescedUpdates();

                         * Since we are on a segment private del packet we must not
                         * update the coalescedDeletes here! We can simply advance to the
                         * next packet and seginfo.
                        // NOTE(review): the else keyword that starts the remaining case is not visible in this listing.
                        //System.out.println("  gt");

                        // Remaining case: only previously coalesced updates, if any, are applied.
                        if (coalescedUpdates != null)
                            // Lock order: IW -> BD -> RP
                            // NOTE(review): the assertion guarded by this check is not visible.
                            if (Debugging.AssertsEnabled)
                            ReadersAndUpdates rld    = readerPool.Get(info, true);
                            SegmentReader     reader = rld.GetReader(IOContext.READ);
                            int  delCount            = 0;
                            bool segAllDeletes;
                                delCount += (int)ApplyTermDeletes(coalescedUpdates.TermsIterable(), rld, reader);
                                delCount += (int)ApplyQueryDeletes(coalescedUpdates.QueriesIterable(), rld, reader);
                                DocValuesFieldUpdates.Container dvUpdates = new DocValuesFieldUpdates.Container();
                                ApplyDocValuesUpdates(coalescedUpdates.numericDVUpdates, rld, reader, dvUpdates);
                                ApplyDocValuesUpdates(coalescedUpdates.binaryDVUpdates, rld, reader, dvUpdates);
                                if (dvUpdates.Any())
                                    rld.WriteFieldUpdates(info.Info.Dir, dvUpdates);
                                // Same "fully deleted" test as in the equal-generation case.
                                int fullDelCount = rld.Info.DelCount + rld.PendingDeleteCount;
                                if (Debugging.AssertsEnabled)
                                    Debugging.Assert(fullDelCount <= rld.Info.Info.DocCount);
                                segAllDeletes = fullDelCount == rld.Info.Info.DocCount;
                            anyNewDeletes |= delCount > 0;

                            if (segAllDeletes)
                                if (allDeleted is null)
                                    allDeleted = new JCG.List <SegmentCommitInfo>();

                            if (infoStream.IsEnabled("BD"))
                                infoStream.Message("BD", "seg=" + info + " segGen=" + segGen + " coalesced deletes=[" + coalescedUpdates + "] newDelCount=" + delCount + (segAllDeletes ? " 100% deleted" : ""));


                // NOTE(review): the assertion guarded by this check is not visible.
                if (Debugging.AssertsEnabled)
                // Log the elapsed time in milliseconds, measured from t0.
                if (infoStream.IsEnabled("BD"))
                    infoStream.Message("BD", "applyDeletes took " + ((J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) - t0) + " msec"); // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
                // assert infos != segmentInfos || !any() : "infos=" + infos + " segmentInfos=" + segmentInfos + " any=" + any;

                return(new ApplyDeletesResult(anyNewDeletes, gen, allDeleted));
Ejemplo n.º 24
        /// <summary>
        /// The <see cref="SubSpans"/> are ordered in the same doc, so there is a possible match.
        /// Compute the slop while making the match as short as possible by advancing
        /// all <see cref="SubSpans"/> except the last one in reverse order.
        /// </summary>
        /// <returns>
        /// <c>true</c> when the slop accumulated between the sub-spans does not exceed
        /// <c>allowedSlop</c>; otherwise <c>false</c>.
        /// </returns>
        private bool ShrinkToAfterShortestMatch()
            // Start from the range of the last sub-span; the start is pulled back in the loop below.
            matchStart = subSpans[subSpans.Length - 1].Start;
            matchEnd   = subSpans[subSpans.Length - 1].End;
            var possibleMatchPayloads = new JCG.HashSet <byte[]>();

            // Seed the candidate payload set from the last sub-span when it has a payload.
            if (subSpans[subSpans.Length - 1].IsPayloadAvailable)
                possibleMatchPayloads.UnionWith(subSpans[subSpans.Length - 1].GetPayload());

            IList <byte[]> possiblePayload = null;

            int matchSlop = 0;
            int lastStart = matchStart;
            int lastEnd   = matchEnd;

            // Visit the remaining sub-spans from the second-to-last down to the first.
            for (int i = subSpans.Length - 2; i >= 0; i--)
                Spans prevSpans = subSpans[i];
                if (collectPayloads && prevSpans.IsPayloadAvailable)
                    possiblePayload = new JCG.List <byte[]>(prevSpans.GetPayload()); // LUCENENET specific - using copy constructor instead of AddRange()

                // Last position of prevSpans known to be ordered before (lastStart, lastEnd).
                int prevStart = prevSpans.Start;
                int prevEnd   = prevSpans.End;
                while (true) // Advance prevSpans until after (lastStart, lastEnd)
                    // prevSpans is exhausted: clear both flags and stop advancing.
                    if (!prevSpans.MoveNext())
                        inSameDoc = false;
                        more      = false;
                        break; // Check remaining subSpans for final match.
                    else if (matchDoc != prevSpans.Doc)
                        inSameDoc = false; // The last subSpans is not advanced here.
                        break;             // Check remaining subSpans for last match in this document.
                        // NOTE(review): the else keyword for the same-document case is not visible in this listing.
                        int ppStart = prevSpans.Start;
                        int ppEnd   = prevSpans.End; // Cannot avoid invoking .end()
                        // Stop once prevSpans is no longer ordered before (lastStart, lastEnd).
                        if (!DocSpansOrdered(ppStart, ppEnd, lastStart, lastEnd))
                            break; // Check remaining subSpans.
                        } // prevSpans still before (lastStart, lastEnd)
                            // Remember this later position (and its payload) as the candidate.
                            prevStart = ppStart;
                            prevEnd   = ppEnd;
                            if (collectPayloads && prevSpans.IsPayloadAvailable)
                                possiblePayload = new JCG.List <byte[]>(prevSpans.GetPayload()); // LUCENENET specific - using copy constructor instead of AddRange()

                // NOTE(review): the guarded statement is not visible; presumably possiblePayload
                // is merged into possibleMatchPayloads - confirm.
                if (collectPayloads && possiblePayload != null)

                if (Debugging.AssertsEnabled)
                    Debugging.Assert(prevStart <= matchStart);
                if (matchStart > prevEnd) // Only non overlapping spans add to slop.
                    matchSlop += (matchStart - prevEnd);

                /* Do not break on (matchSlop > allowedSlop) here to make sure
                 * that subSpans[0] is advanced after the match, if any.
                matchStart = prevStart;
                lastStart  = prevStart;
                lastEnd    = prevEnd;

            // The match is accepted only when the accumulated slop is within the allowed slop.
            bool match = matchSlop <= allowedSlop;

            // NOTE(review): the guarded statement is not visible; presumably the collected
            // payloads are published for the match - confirm.
            if (collectPayloads && match && possibleMatchPayloads.Count > 0)

            return(match); // ordered and allowed slop