/// <summary>
/// Collects the stems of <paramref name="word"/>: first the dictionary entries that match the
/// word unchanged, then the stems found by stripping affixes.
/// </summary>
/// <param name="word"> buffer holding the word to stem </param>
/// <param name="length"> number of valid chars in <paramref name="word"/> </param>
/// <param name="caseVariant"> true if we are searching for a case variant; entries carrying the
/// dictionary's keepcase flag are then skipped </param>
/// <returns> list of stems, or an empty list if none are found </returns>
private IList<CharsRef> DoStem(char[] word, int length, bool caseVariant)
{
    JCG.List<CharsRef> result = new JCG.List<CharsRef>();

    Int32sRef entries = dictionary.LookupWord(word, 0, length);
    if (entries != null)
    {
        // A flag field of -1 switches the corresponding check off.
        bool rejectKeepCase = caseVariant && dictionary.keepcase != -1;
        bool rejectNeedAffix = dictionary.needaffix != -1;
        bool rejectOnlyInCompound = dictionary.onlyincompound != -1;
        bool mustInspectFlags = rejectKeepCase || rejectNeedAffix || rejectOnlyInCompound;

        for (int entry = 0; entry < entries.Length; entry += formStep)
        {
            if (mustInspectFlags)
            {
                dictionary.flagLookup.Get(entries.Int32s[entries.Offset + entry], scratch);
                char[] entryFlags = Dictionary.DecodeFlags(scratch);

                bool rejected =
                    // we are looking for a case variant, but this word does not allow it
                    (rejectKeepCase && Dictionary.HasFlag(entryFlags, (char)dictionary.keepcase))
                    // this form is a pseudostem that requires an affix
                    || (rejectNeedAffix && Dictionary.HasFlag(entryFlags, (char)dictionary.needaffix))
                    // this form only belongs inside a compound word
                    || (rejectOnlyInCompound && Dictionary.HasFlag(entryFlags, (char)dictionary.onlyincompound));

                if (rejected)
                {
                    continue;
                }
            }

            result.Add(NewStem(word, length, entries, entry));
        }
    }

    try
    {
        // Nothing has been stripped yet: no previous affix, no flags to cross-check, depth 0,
        // and both prefixes and suffixes may be removed.
        IList<CharsRef> affixStems = Stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant);
        result.AddRange(affixStems);
    }
    catch (Exception ioe) when (ioe.IsIOException())
    {
        // I/O failures are surfaced to the caller as a runtime exception.
        throw RuntimeException.Create(ioe);
    }

    return result;
}
/// <summary>
/// Applies the affix rule to the given word, producing a list of stems if any are found.
/// </summary>
/// <param name="strippedWord"> Word from which the affix has been removed and to which the strip has been added </param>
/// <param name="length"> valid length of stripped word </param>
/// <param name="affix"> ordinal of the affix rule; its record is read from <c>affixReader</c> at position <c>8 * affix</c> </param>
/// <param name="prefixFlag"> when we already stripped a prefix, we can't simply recurse and check the suffix unless both
/// are compatible, so the dictionary form must be checked against both before it is added as a stem </param>
/// <param name="recursionDepth"> current recursion depth </param>
/// <param name="prefix"> true if we are removing a prefix (false if it is a suffix) </param>
/// <param name="circumfix"> true if circumfix was previously set by a prefix; a dictionary form is then only accepted
/// when this affix's continuation flags carry the circumfix flag too, and vice versa </param>
/// <returns> List of stems for the word, or an empty list if none are found </returns>
internal IList<CharsRef> ApplyAffix(char[] strippedWord, int length, int affix, int prefixFlag, int recursionDepth, bool prefix, bool circumfix)
{
    // TODO: just pass this in from before, no need to decode it twice
    affixReader.Position = 8 * affix;
    char affixFlag = (char)(affixReader.ReadShort() & 0xffff);
    affixReader.SkipBytes(2); // strip
    // Only the lowest bit (the cross-product marker) of this field is needed here.
    int conditionField = (char)(affixReader.ReadShort() & 0xffff);
    bool isCrossProduct = (conditionField & 1) == 1;
    char continuation = (char)(affixReader.ReadShort() & 0xffff);

    List<CharsRef> result = new List<CharsRef>();

    IntsRef entries = dictionary.LookupWord(strippedWord, 0, length);
    for (int entry = 0; entries != null && entry < entries.Length; entry++)
    {
        dictionary.flagLookup.Get(entries.Ints[entries.Offset + entry], scratch);
        char[] entryFlags = Dictionary.DecodeFlags(scratch);
        if (!Dictionary.HasFlag(entryFlags, affixFlag))
        {
            continue;
        }

        // confusing: in this one exception, we already chained the first prefix against the second,
        // so it doesn't need to be checked against the word
        bool isChainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
        bool entryLacksPrefixFlag =
            !isChainedPrefix && prefixFlag >= 0 && !Dictionary.HasFlag(entryFlags, (char)prefixFlag);
        if (entryLacksPrefixFlag)
        {
            // see if we can chain prefix thru the suffix continuation class (only if it has any!)
            dictionary.flagLookup.Get(continuation, scratch);
            char[] chainFlags = Dictionary.DecodeFlags(scratch);
            if (!HasCrossCheckedFlag((char)prefixFlag, chainFlags, false))
            {
                continue;
            }
        }

        // if circumfix was previously set by a prefix, we must check this suffix,
        // to ensure it has it, and vice versa
        if (dictionary.circumfix != -1)
        {
            dictionary.flagLookup.Get(continuation, scratch);
            char[] circumfixCandidates = Dictionary.DecodeFlags(scratch);
            bool affixIsCircumfix = Dictionary.HasFlag(circumfixCandidates, (char)dictionary.circumfix);
            if (affixIsCircumfix != circumfix)
            {
                continue;
            }
        }

        result.Add(NewStem(strippedWord, length));
    }

    // if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag
    if (prefix && !circumfix && dictionary.circumfix != -1)
    {
        dictionary.flagLookup.Get(continuation, scratch);
        char[] ownFlags = Dictionary.DecodeFlags(scratch);
        circumfix = Dictionary.HasFlag(ownFlags, (char)dictionary.circumfix);
    }

    if (!isCrossProduct)
    {
        return result;
    }

    int nextDepth = recursionDepth + 1;
    switch (recursionDepth)
    {
        case 0:
            if (prefix)
            {
                // we took away the first prefix.
                // COMPLEXPREFIXES = true:  combine with a second prefix and another suffix
                // COMPLEXPREFIXES = false: combine with a suffix
                bool allowSecondPrefix = dictionary.complexPrefixes && dictionary.twoStageAffix;
                result.AddRange(Stem(strippedWord, length, affix, affixFlag, affixFlag, nextDepth, allowSecondPrefix, true, true, circumfix));
            }
            else if (!dictionary.complexPrefixes && dictionary.twoStageAffix)
            {
                // we took away a suffix.
                // COMPLEXPREFIXES = true:  we don't recurse! only one suffix allowed
                // COMPLEXPREFIXES = false: combine with another suffix
                result.AddRange(Stem(strippedWord, length, affix, affixFlag, prefixFlag, nextDepth, false, true, false, circumfix));
            }
            break;

        case 1:
            if (prefix)
            {
                if (dictionary.complexPrefixes)
                {
                    // we took away the second prefix: go look for another suffix
                    result.AddRange(Stem(strippedWord, length, affix, affixFlag, affixFlag, nextDepth, false, true, true, circumfix));
                }
            }
            else if (!dictionary.complexPrefixes && dictionary.twoStageAffix)
            {
                // we took away a prefix, then a suffix: go look for another suffix
                result.AddRange(Stem(strippedWord, length, affix, affixFlag, prefixFlag, nextDepth, false, true, false, circumfix));
            }
            break;
    }

    return result;
}
/// <summary>
/// Generates a list of stems for the provided word by stripping prefixes and/or suffixes.
/// </summary>
/// <param name="word"> Word to generate the stems for </param>
/// <param name="length"> number of valid chars in <paramref name="word"/> </param>
/// <param name="previous"> previous affix that was removed (so we don't remove the same one twice) </param>
/// <param name="prevFlag"> Flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step </param>
/// <param name="prefixFlag"> flag of the innermost removed prefix, so that when removing a suffix, it is also checked against the word </param>
/// <param name="recursionDepth"> current recursion depth; also selects which of the per-depth FST readers and arcs is used </param>
/// <param name="doPrefix"> true if we should remove prefixes </param>
/// <param name="doSuffix"> true if we should remove suffixes </param>
/// <param name="previousWasPrefix"> true if the previous removal was a prefix:
/// if we are removing a suffix, and it has no continuation requirements, it is ok,
/// but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. </param>
/// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix;
/// this means the innermost suffix must also contain the circumfix flag. </param>
/// <param name="caseVariant"> true if we are searching for a case variant. If the word has the KEEPCASE flag it cannot succeed.
/// The value is handed on unchanged to <c>ApplyAffix</c>. </param>
/// <returns> <see cref="IList{T}"/> of stems, or an empty list if no stems are found </returns>
private IList<CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix, bool caseVariant)
{
    // TODO: allow this stuff to be reused by tokenfilter
    JCG.List<CharsRef> stems = new JCG.List<CharsRef>();

    if (doPrefix && dictionary.prefixes != null)
    {
        FST<Int32sRef> fst = dictionary.prefixes;
        Outputs<Int32sRef> outputs = fst.Outputs;
        FST.BytesReader bytesReader = prefixReaders[recursionDepth];
        FST.Arc<Int32sRef> arc = prefixArcs[recursionDepth];
        fst.GetFirstArc(arc);
        Int32sRef noOutput = outputs.NoOutput;
        Int32sRef output = noOutput;

        // Unless FULLSTRIP is set, at least one char of the word must remain after stripping.
        int limit = dictionary.fullStrip ? length : length - 1;
        for (int i = 0; i < limit; i++)
        {
            if (i > 0)
            {
                // Walk the prefix FST forwards, one more leading char per iteration.
                int ch = word[i - 1];
                if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null)
                {
                    break;
                }
                if (arc.Output != noOutput)
                {
                    output = fst.Outputs.Add(output, arc.Output);
                }
            }

            if (!arc.IsFinal)
            {
                continue;
            }
            Int32sRef prefixes = fst.Outputs.Add(output, arc.NextFinalOutput);

            for (int j = 0; j < prefixes.Length; j++)
            {
                int prefix = prefixes.Int32s[prefixes.Offset + j];
                if (prefix == previous)
                {
                    continue;
                }

                // Affix record: flag, strip ordinal, condition (lowest bit = cross product), continuation flags.
                affixReader.Position = 8 * prefix;
                affixReader.ReadInt16(); // flag: not needed here, but the read advances the reader
                char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
                int condition = (char)(affixReader.ReadInt16() & 0xffff);
                bool crossProduct = (condition & 1) == 1;
                condition = condition.TripleShift(1);
                char append = (char)(affixReader.ReadInt16() & 0xffff);

                // Prefix removal always passes false as the last cross-check argument.
                if (!IsAffixCompatible(append, crossProduct, recursionDepth, prevFlag, false))
                {
                    continue;
                }

                int deAffixedStart = i;
                int deAffixedLength = length - deAffixedStart;

                int stripStart = dictionary.stripOffsets[stripOrd];
                int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                int stripLength = stripEnd - stripStart;

                if (!CheckCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength))
                {
                    continue;
                }

                // strip chars first, then the remainder of the word
                char[] strippedWord = new char[stripLength + deAffixedLength];
                Array.Copy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                Array.Copy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

                IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, prefix, -1, recursionDepth, true, circumfix, caseVariant);
                stems.AddRange(stemList);
            }
        }
    }

    if (doSuffix && dictionary.suffixes != null)
    {
        FST<Int32sRef> fst = dictionary.suffixes;
        Outputs<Int32sRef> outputs = fst.Outputs;
        FST.BytesReader bytesReader = suffixReaders[recursionDepth];
        FST.Arc<Int32sRef> arc = suffixArcs[recursionDepth];
        fst.GetFirstArc(arc);
        Int32sRef noOutput = outputs.NoOutput;
        Int32sRef output = noOutput;

        // Unless FULLSTRIP is set, at least one char of the word must remain after stripping.
        int limit = dictionary.fullStrip ? 0 : 1;
        for (int i = length; i >= limit; i--)
        {
            if (i < length)
            {
                // Walk the suffix FST backwards from the end of the word.
                int ch = word[i];
                if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null)
                {
                    break;
                }
                if (arc.Output != noOutput)
                {
                    output = fst.Outputs.Add(output, arc.Output);
                }
            }

            if (!arc.IsFinal)
            {
                continue;
            }
            Int32sRef suffixes = fst.Outputs.Add(output, arc.NextFinalOutput);

            for (int j = 0; j < suffixes.Length; j++)
            {
                int suffix = suffixes.Int32s[suffixes.Offset + j];
                if (suffix == previous)
                {
                    continue;
                }

                // Affix record: flag, strip ordinal, condition (lowest bit = cross product), continuation flags.
                affixReader.Position = 8 * suffix;
                affixReader.ReadInt16(); // flag: not needed here, but the read advances the reader
                char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
                int condition = (char)(affixReader.ReadInt16() & 0xffff);
                bool crossProduct = (condition & 1) == 1;
                condition = condition.TripleShift(1);
                char append = (char)(affixReader.ReadInt16() & 0xffff);

                // Same rule as for prefixes: a continuation class flagged ONLYINCOMPOUND rules the
                // affix out at every recursion depth, not only at depth 0.
                if (!IsAffixCompatible(append, crossProduct, recursionDepth, prevFlag, previousWasPrefix))
                {
                    continue;
                }

                // The suffix occupies word[i..length), so the first i chars are kept.
                int deAffixedLength = i;

                int stripStart = dictionary.stripOffsets[stripOrd];
                int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                int stripLength = stripEnd - stripStart;

                if (!CheckCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength))
                {
                    continue;
                }

                // remainder of the word first, then the strip chars
                char[] strippedWord = new char[stripLength + deAffixedLength];
                Array.Copy(word, 0, strippedWord, 0, deAffixedLength);
                Array.Copy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);

                IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);
                stems.AddRange(stemList);
            }
        }
    }

    return stems;
}

/// <summary>
/// Decides whether an affix may be removed at the current recursion step, based on its continuation flags.
/// </summary>
/// <param name="append"> ordinal of the affix's continuation flags in <c>dictionary.flagLookup</c> </param>
/// <param name="crossProduct"> cross-product bit of the affix record </param>
/// <param name="recursionDepth"> current recursion depth </param>
/// <param name="prevFlag"> flag of the previously removed affix; only cross-checked when <paramref name="recursionDepth"/> is not 0 </param>
/// <param name="previousWasPrefix"> value forwarded as the last argument of <c>HasCrossCheckedFlag</c> </param>
/// <returns> true if the affix may be stripped at this step </returns>
private bool IsAffixCompatible(char append, bool crossProduct, int recursionDepth, int prevFlag, bool previousWasPrefix)
{
    if (recursionDepth == 0)
    {
        if (dictionary.onlyincompound == -1)
        {
            return true;
        }

        // check if affix is allowed in a non-compound word
        dictionary.flagLookup.Get(append, scratch);
        char[] firstLevelFlags = Dictionary.DecodeFlags(scratch);
        return !Dictionary.HasFlag(firstLevelFlags, (char)dictionary.onlyincompound);
    }

    if (!crossProduct)
    {
        return false;
    }

    // cross check incoming continuation class (flag of previous affix) against list.
    dictionary.flagLookup.Get(append, scratch);
    char[] appendFlags = Dictionary.DecodeFlags(scratch);
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(prevFlag >= 0);
    }

    bool allowed = dictionary.onlyincompound == -1
        || !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
    return allowed && HasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
}