Пример #1
0
        private IList <CharsRef> DoStem(char[] word, int length, bool caseVariant)
        {
            JCG.List <CharsRef> stems = new JCG.List <CharsRef>();
            Int32sRef           forms = dictionary.LookupWord(word, 0, length);

            if (forms != null)
            {
                for (int i = 0; i < forms.Length; i += formStep)
                {
                    bool checkKeepCase       = caseVariant && dictionary.keepcase != -1;
                    bool checkNeedAffix      = dictionary.needaffix != -1;
                    bool checkOnlyInCompound = dictionary.onlyincompound != -1;
                    if (checkKeepCase || checkNeedAffix || checkOnlyInCompound)
                    {
                        dictionary.flagLookup.Get(forms.Int32s[forms.Offset + i], scratch);
                        char[] wordFlags = Dictionary.DecodeFlags(scratch);
                        // we are looking for a case variant, but this word does not allow it
                        if (checkKeepCase && Dictionary.HasFlag(wordFlags, (char)dictionary.keepcase))
                        {
                            continue;
                        }
                        // we can't add this form, its a pseudostem requiring an affix
                        if (checkNeedAffix && Dictionary.HasFlag(wordFlags, (char)dictionary.needaffix))
                        {
                            continue;
                        }
                        // we can't add this form, it only belongs inside a compound word
                        if (checkOnlyInCompound && Dictionary.HasFlag(wordFlags, (char)dictionary.onlyincompound))
                        {
                            continue;
                        }
                    }
                    stems.Add(NewStem(word, length, forms, i));
                }
            }
            try
            {
                stems.AddRange(Stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
            }
            catch (Exception bogus) when(bogus.IsIOException())
            {
                throw RuntimeException.Create(bogus);
            }
            return(stems);
        }
Пример #2
0
        /// <summary>
        /// Applies the affix rule to the given word, producing a list of stems if any are found
        /// </summary>
        /// <param name="strippedWord"> Word the affix has been removed and the strip added </param>
        /// <param name="length"> valid length of stripped word </param>
        /// <param name="affix"> HunspellAffix representing the affix rule itself </param>
        /// <param name="prefixFlag"> when we already stripped a prefix, we cant simply recurse and check the suffix, unless both are compatible
        ///                   so we must check dictionary form against both to add it as a stem! </param>
        /// <param name="recursionDepth"> current recursion depth </param>
        /// <param name="prefix"> true if we are removing a prefix (false if its a suffix) </param>
        /// <returns> List of stems for the word, or an empty list if none are found </returns>
        internal IList <CharsRef> ApplyAffix(char[] strippedWord, int length, int affix, int prefixFlag, int recursionDepth, bool prefix, bool circumfix)
        {
            // TODO: just pass this in from before, no need to decode it twice
            affixReader.Position = 8 * affix;
            char flag = (char)(affixReader.ReadShort() & 0xffff);

            affixReader.SkipBytes(2); // strip
            int  condition    = (char)(affixReader.ReadShort() & 0xffff);
            bool crossProduct = (condition & 1) == 1;

            condition = (int)((uint)condition >> 1);
            char append = (char)(affixReader.ReadShort() & 0xffff);

            List <CharsRef> stems = new List <CharsRef>();

            IntsRef forms = dictionary.LookupWord(strippedWord, 0, length);

            if (forms != null)
            {
                for (int i = 0; i < forms.Length; i++)
                {
                    dictionary.flagLookup.Get(forms.Ints[forms.Offset + i], scratch);
                    char[] wordFlags = Dictionary.DecodeFlags(scratch);
                    if (Dictionary.HasFlag(wordFlags, flag))
                    {
                        // confusing: in this one exception, we already chained the first prefix against the second,
                        // so it doesnt need to be checked against the word
                        bool chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
                        if (chainedPrefix == false && prefixFlag >= 0 && !Dictionary.HasFlag(wordFlags, (char)prefixFlag))
                        {
                            // see if we can chain prefix thru the suffix continuation class (only if it has any!)
                            dictionary.flagLookup.Get(append, scratch);
                            char[] appendFlags = Dictionary.DecodeFlags(scratch);
                            if (!HasCrossCheckedFlag((char)prefixFlag, appendFlags, false))
                            {
                                continue;
                            }
                        }

                        // if circumfix was previously set by a prefix, we must check this suffix,
                        // to ensure it has it, and vice versa
                        if (dictionary.circumfix != -1)
                        {
                            dictionary.flagLookup.Get(append, scratch);
                            char[] appendFlags     = Dictionary.DecodeFlags(scratch);
                            bool   suffixCircumfix = Dictionary.HasFlag(appendFlags, (char)dictionary.circumfix);
                            if (circumfix != suffixCircumfix)
                            {
                                continue;
                            }
                        }
                        stems.Add(NewStem(strippedWord, length));
                    }
                }
            }

            // if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag
            if (dictionary.circumfix != -1 && !circumfix && prefix)
            {
                dictionary.flagLookup.Get(append, scratch);
                char[] appendFlags = Dictionary.DecodeFlags(scratch);
                circumfix = Dictionary.HasFlag(appendFlags, (char)dictionary.circumfix);
            }

            if (crossProduct)
            {
                if (recursionDepth == 0)
                {
                    if (prefix)
                    {
                        // we took away the first prefix.
                        // COMPLEXPREFIXES = true:  combine with a second prefix and another suffix
                        // COMPLEXPREFIXES = false: combine with a suffix
                        stems.AddRange(Stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix));
                    }
                    else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix)
                    {
                        // we took away a suffix.
                        // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
                        // COMPLEXPREFIXES = false: combine with another suffix
                        stems.AddRange(Stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
                    }
                }
                else if (recursionDepth == 1)
                {
                    if (prefix && dictionary.complexPrefixes)
                    {
                        // we took away the second prefix: go look for another suffix
                        stems.AddRange(Stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix));
                    }
                    else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix)
                    {
                        // we took away a prefix, then a suffix: go look for another suffix
                        stems.AddRange(Stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
                    }
                }
            }

            return(stems);
        }
Пример #3
0
        /// <summary>
        /// Generates a list of stems for the provided word
        /// </summary>
        /// <param name="word"> Word to generate the stems for </param>
        /// <param name="length"> length </param>
        /// <param name="previous"> previous affix that was removed (so we dont remove same one twice) </param>
        /// <param name="prevFlag"> Flag from a previous stemming step that need to be cross-checked with any affixes in this recursive step </param>
        /// <param name="prefixFlag"> flag of the most inner removed prefix, so that when removing a suffix, its also checked against the word </param>
        /// <param name="recursionDepth"> current recursiondepth </param>
        /// <param name="doPrefix"> true if we should remove prefixes </param>
        /// <param name="doSuffix"> true if we should remove suffixes </param>
        /// <param name="previousWasPrefix"> true if the previous removal was a prefix:
        ///        if we are removing a suffix, and it has no continuation requirements, its ok.
        ///        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. </param>
        /// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix
        ///        this means inner most suffix must also contain circumfix flag. </param>
        /// <param name="caseVariant"> true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed. </param>
        /// <returns> <see cref="IList{CharsRef}"/> of stems, or empty list if no stems are found </returns>
        private IList <CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix, bool caseVariant)
        {
            // TODO: allow this stuff to be reused by tokenfilter
            JCG.List <CharsRef> stems = new JCG.List <CharsRef>();

            if (doPrefix && dictionary.prefixes != null)
            {
                FST <Int32sRef>     fst         = dictionary.prefixes;
                Outputs <Int32sRef> outputs     = fst.Outputs;
                FST.BytesReader     bytesReader = prefixReaders[recursionDepth];
                FST.Arc <Int32sRef> arc         = prefixArcs[recursionDepth];
                fst.GetFirstArc(arc);
                Int32sRef NO_OUTPUT = outputs.NoOutput;
                Int32sRef output    = NO_OUTPUT;
                int       limit     = dictionary.fullStrip ? length : length - 1;
                for (int i = 0; i < limit; i++)
                {
                    if (i > 0)
                    {
                        int ch = word[i - 1];
                        if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null)
                        {
                            break;
                        }
                        else if (arc.Output != NO_OUTPUT)
                        {
                            output = fst.Outputs.Add(output, arc.Output);
                        }
                    }
                    Int32sRef prefixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment
                    if (!arc.IsFinal)
                    {
                        continue;
                    }
                    else
                    {
                        prefixes = fst.Outputs.Add(output, arc.NextFinalOutput);
                    }

                    for (int j = 0; j < prefixes.Length; j++)
                    {
                        int prefix = prefixes.Int32s[prefixes.Offset + j];
                        if (prefix == previous)
                        {
                            continue;
                        }
                        affixReader.Position = 8 * prefix;
                        char flag         = (char)(affixReader.ReadInt16() & 0xffff);
                        char stripOrd     = (char)(affixReader.ReadInt16() & 0xffff);
                        int  condition    = (char)(affixReader.ReadInt16() & 0xffff);
                        bool crossProduct = (condition & 1) == 1;
                        condition = condition.TripleShift(1);
                        char append = (char)(affixReader.ReadInt16() & 0xffff);

                        bool compatible;
                        if (recursionDepth == 0)
                        {
                            if (dictionary.onlyincompound == -1)
                            {
                                compatible = true;
                            }
                            else
                            {
                                // check if affix is allowed in a non-compound word
                                dictionary.flagLookup.Get(append, scratch);
                                char[] appendFlags = Dictionary.DecodeFlags(scratch);
                                compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                            }
                        }
                        else if (crossProduct)
                        {
                            // cross check incoming continuation class (flag of previous affix) against list.
                            dictionary.flagLookup.Get(append, scratch);
                            char[] appendFlags = Dictionary.DecodeFlags(scratch);
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(prevFlag >= 0);
                            }
                            bool allowed = dictionary.onlyincompound == -1 ||
                                           !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                            compatible = allowed && HasCrossCheckedFlag((char)prevFlag, appendFlags, false);
                        }
                        else
                        {
                            compatible = false;
                        }

                        if (compatible)
                        {
                            int deAffixedStart  = i;
                            int deAffixedLength = length - deAffixedStart;

                            int stripStart  = dictionary.stripOffsets[stripOrd];
                            int stripEnd    = dictionary.stripOffsets[stripOrd + 1];
                            int stripLength = stripEnd - stripStart;

                            if (!CheckCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength))
                            {
                                continue;
                            }

                            char[] strippedWord = new char[stripLength + deAffixedLength];
                            Array.Copy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                            Array.Copy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

                            IList <CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, prefix, -1, recursionDepth, true, circumfix, caseVariant);

                            stems.AddRange(stemList);
                        }
                    }
                }
            }

            if (doSuffix && dictionary.suffixes != null)
            {
                FST <Int32sRef>     fst         = dictionary.suffixes;
                Outputs <Int32sRef> outputs     = fst.Outputs;
                FST.BytesReader     bytesReader = suffixReaders[recursionDepth];
                FST.Arc <Int32sRef> arc         = suffixArcs[recursionDepth];
                fst.GetFirstArc(arc);
                Int32sRef NO_OUTPUT = outputs.NoOutput;
                Int32sRef output    = NO_OUTPUT;
                int       limit     = dictionary.fullStrip ? 0 : 1;
                for (int i = length; i >= limit; i--)
                {
                    if (i < length)
                    {
                        int ch = word[i];
                        if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null)
                        {
                            break;
                        }
                        else if (arc.Output != NO_OUTPUT)
                        {
                            output = fst.Outputs.Add(output, arc.Output);
                        }
                    }
                    Int32sRef suffixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment
                    if (!arc.IsFinal)
                    {
                        continue;
                    }
                    else
                    {
                        suffixes = fst.Outputs.Add(output, arc.NextFinalOutput);
                    }

                    for (int j = 0; j < suffixes.Length; j++)
                    {
                        int suffix = suffixes.Int32s[suffixes.Offset + j];
                        if (suffix == previous)
                        {
                            continue;
                        }
                        affixReader.Position = 8 * suffix;
                        char flag         = (char)(affixReader.ReadInt16() & 0xffff);
                        char stripOrd     = (char)(affixReader.ReadInt16() & 0xffff);
                        int  condition    = (char)(affixReader.ReadInt16() & 0xffff);
                        bool crossProduct = (condition & 1) == 1;
                        condition = condition.TripleShift(1);
                        char append = (char)(affixReader.ReadInt16() & 0xffff);

                        bool compatible;
                        if (recursionDepth == 0)
                        {
                            if (dictionary.onlyincompound == -1)
                            {
                                compatible = true;
                            }
                            else
                            {
                                // check if affix is allowed in a non-compound word
                                dictionary.flagLookup.Get(append, scratch);
                                char[] appendFlags = Dictionary.DecodeFlags(scratch);
                                compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                            }
                        }
                        else if (crossProduct)
                        {
                            // cross check incoming continuation class (flag of previous affix) against list.
                            dictionary.flagLookup.Get(append, scratch);
                            char[] appendFlags = Dictionary.DecodeFlags(scratch);
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(prevFlag >= 0);
                            }
                            bool allowed = dictionary.onlyincompound == -1 ||
                                           !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                            compatible = HasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
                        }
                        else
                        {
                            compatible = false;
                        }

                        if (compatible)
                        {
                            int appendLength    = length - i;
                            int deAffixedLength = length - appendLength;

                            int stripStart  = dictionary.stripOffsets[stripOrd];
                            int stripEnd    = dictionary.stripOffsets[stripOrd + 1];
                            int stripLength = stripEnd - stripStart;

                            if (!CheckCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength))
                            {
                                continue;
                            }

                            char[] strippedWord = new char[stripLength + deAffixedLength];
                            Array.Copy(word, 0, strippedWord, 0, deAffixedLength);
                            Array.Copy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);

                            IList <CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);

                            stems.AddRange(stemList);
                        }
                    }
                }
            }

            return(stems);
        }