/// <summary>
/// Loads the "simple.aff" / "simple.dic" resources and checks the affix and word lookups:
/// three suffix entries for "e", one prefix entry for "s", and a single entry carrying a
/// single flag for each of the words "olr" and "lucen".
/// </summary>
public virtual void TestSimpleDictionary()
{
    // Using declarations: both streams are disposed when the method exits,
    // dictStream first and affixStream second.
    using System.IO.Stream affixStream = this.GetType().getResourceAsStream("simple.aff");
    using System.IO.Stream dictStream = this.GetType().getResourceAsStream("simple.dic");

    Dictionary dictionary = new Dictionary(affixStream, dictStream);

    char[] suffixKey = { 'e' };
    char[] prefixKey = { 's' };
    assertEquals(3, dictionary.LookupSuffix(suffixKey, 0, suffixKey.Length).Length);
    assertEquals(1, dictionary.LookupPrefix(prefixKey, 0, prefixKey.Length).Length);

    // First word: "olr".
    char[] firstWord = { 'o', 'l', 'r' };
    Int32sRef wordOrds = dictionary.LookupWord(firstWord, 0, firstWord.Length);
    assertNotNull(wordOrds);
    assertEquals(1, wordOrds.Length);

    // The same BytesRef instance is handed to both flag lookups below.
    BytesRef flagBytes = new BytesRef();
    dictionary.flagLookup.Get(wordOrds.Int32s[0], flagBytes);
    char[] decodedFlags = Dictionary.DecodeFlags(flagBytes);
    assertEquals(1, decodedFlags.Length);

    // Second word: "lucen".
    char[] secondWord = { 'l', 'u', 'c', 'e', 'n' };
    wordOrds = dictionary.LookupWord(secondWord, 0, secondWord.Length);
    assertNotNull(wordOrds);
    assertEquals(1, wordOrds.Length);

    dictionary.flagLookup.Get(wordOrds.Int32s[0], flagBytes);
    decodedFlags = Dictionary.DecodeFlags(flagBytes);
    assertEquals(1, decodedFlags.Length);
}
/// <summary>
/// Loads "compressed-empty-alias.aff" together with "compressed.dic" and checks that the
/// dictionary reports three suffix entries for "e", one prefix entry for "s", and exactly
/// one decoded flag for the first entry of the word "olr".
/// </summary>
public virtual void TestCompressedEmptyAliasDictionary()
{
    using (Stream affixStream = this.GetType().getResourceAsStream("compressed-empty-alias.aff"))
    using (Stream dictStream = this.GetType().getResourceAsStream("compressed.dic"))
    {
        Dictionary dictionary = new Dictionary(affixStream, dictStream);

        char[] suffixKey = { 'e' };
        char[] prefixKey = { 's' };
        assertEquals(3, dictionary.LookupSuffix(suffixKey, 0, suffixKey.Length).Length);
        assertEquals(1, dictionary.LookupPrefix(prefixKey, 0, prefixKey.Length).Length);

        char[] word = { 'o', 'l', 'r' };
        Int32sRef wordOrds = dictionary.LookupWord(word, 0, word.Length);

        // Fetch the flag data for the first entry and decode it.
        BytesRef flagBytes = new BytesRef();
        dictionary.flagLookup.Get(wordOrds.Int32s[0], flagBytes);
        char[] decodedFlags = Dictionary.DecodeFlags(flagBytes);
        assertEquals(1, decodedFlags.Length);
    }
}
/// <summary>
/// Loads "compressed-before-set.aff" together with "compressed.dic" and checks that the
/// dictionary reports three suffix entries for "e", one prefix entry for "s", and exactly
/// one decoded flag for the first entry of the word "olr".
/// </summary>
public virtual void TestCompressedBeforeSetDictionary()
{
    using (System.IO.Stream affixStream = this.GetType().getResourceAsStream("compressed-before-set.aff"))
    using (System.IO.Stream dictStream = this.GetType().getResourceAsStream("compressed.dic"))
    {
        Dictionary dictionary = new Dictionary(affixStream, dictStream);

        char[] suffixKey = { 'e' };
        char[] prefixKey = { 's' };
        assertEquals(3, dictionary.LookupSuffix(suffixKey, 0, suffixKey.Length).Length);
        assertEquals(1, dictionary.LookupPrefix(prefixKey, 0, prefixKey.Length).Length);

        // NOTE(review): sibling tests in this file declare this result as Int32sRef / .Int32s;
        // confirm which type name the referenced library version actually exposes.
        char[] word = { 'o', 'l', 'r' };
        IntsRef wordOrds = dictionary.LookupWord(word, 0, word.Length);

        // Fetch the flag data for the first entry and decode it.
        BytesRef flagBytes = new BytesRef();
        dictionary.flagLookup.Get(wordOrds.Ints[0], flagBytes);
        char[] decodedFlags = Dictionary.DecodeFlags(flagBytes);
        assertEquals(1, decodedFlags.Length);
    }
}
// ================================================= Helper Methods ================================================

/// <summary>
/// Collects candidate stems for <paramref name="word"/> by stripping each matching prefix
/// and/or suffix known to the dictionary and passing the de-affixed form to <c>ApplyAffix</c>.
/// </summary>
/// <param name="word"> Buffer holding the word to generate the stems for </param>
/// <param name="length"> Number of chars of <paramref name="word"/> to consider, starting at index 0 </param>
/// <param name="previous"> Previous affix that was removed; it is skipped so the same one is not removed twice </param>
/// <param name="prevFlag"> Flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step </param>
/// <param name="prefixFlag"> Flag of the innermost removed prefix, forwarded when a suffix is removed so that it is also checked against the word </param>
/// <param name="recursionDepth"> Current recursion depth; at depth 0 every affix is treated as compatible </param>
/// <param name="doPrefix"> true if prefixes should be removed </param>
/// <param name="doSuffix"> true if suffixes should be removed </param>
/// <param name="previousWasPrefix"> true if the previous removal was a prefix:
/// if we are removing a suffix and it has no continuation requirements, that is fine,
/// but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. </param>
/// <param name="circumfix"> true if the previous prefix removal was flagged as a circumfix;
/// this means the innermost suffix must also contain the circumfix flag. </param>
/// <returns> List of stems, or an empty list if no stems are found </returns>
private IList<CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix)
{
    // TODO: allow this stuff to be reused by tokenfilter
    List<CharsRef> collected = new List<CharsRef>();

    if (doPrefix && dictionary.prefixes != null)
    {
        // Candidate prefixes word[0..prefixLength), from the longest down to the empty one.
        for (int prefixLength = length - 1; prefixLength >= 0; prefixLength--)
        {
            IntsRef candidates = dictionary.LookupPrefix(word, 0, prefixLength);
            if (candidates == null)
            {
                continue;
            }

            for (int k = 0; k < candidates.Length; k++)
            {
                int affixOrd = candidates.Ints[candidates.Offset + k];
                if (affixOrd == previous)
                {
                    continue;
                }

                // An affix record is read as four consecutive 16-bit values starting at byte 8 * ordinal.
                affixReader.Position = 8 * affixOrd;
                _ = affixReader.ReadShort(); // affix flag: value unused here, read kept so the next reads stay aligned
                int stripOrd = affixReader.ReadShort() & 0xffff;
                int packedCondition = affixReader.ReadShort() & 0xffff;
                bool crossProduct = (packedCondition & 1) != 0; // lowest bit carries the cross-product marker
                int condition = packedCondition >> 1;           // remaining bits are the condition value
                char continuation = (char)(affixReader.ReadShort() & 0xffff);

                bool compatible = recursionDepth == 0;
                if (!compatible && crossProduct)
                {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.Get(continuation, scratch);
                    char[] continuationFlags = Dictionary.DecodeFlags(scratch);
                    Debug.Assert(prevFlag >= 0);
                    compatible = HasCrossCheckedFlag((char)prevFlag, continuationFlags, false);
                }

                if (!compatible)
                {
                    continue;
                }

                int remainderLength = length - prefixLength;
                int stripStart = dictionary.stripOffsets[stripOrd];
                int stripLength = dictionary.stripOffsets[stripOrd + 1] - stripStart;

                if (!CheckCondition(condition, dictionary.stripData, stripStart, stripLength, word, prefixLength, remainderLength))
                {
                    continue;
                }

                // Rebuild the word as: strip characters + everything after the removed prefix.
                char[] rebuilt = new char[stripLength + remainderLength];
                Array.Copy(dictionary.stripData, stripStart, rebuilt, 0, stripLength);
                Array.Copy(word, prefixLength, rebuilt, stripLength, remainderLength);

                collected.AddRange(ApplyAffix(rebuilt, rebuilt.Length, affixOrd, -1, recursionDepth, true, circumfix));
            }
        }
    }

    if (doSuffix && dictionary.suffixes != null)
    {
        // Candidate suffixes word[suffixStart..length), from the whole word down to its last char.
        for (int suffixStart = 0; suffixStart < length; suffixStart++)
        {
            IntsRef candidates = dictionary.LookupSuffix(word, suffixStart, length - suffixStart);
            if (candidates == null)
            {
                continue;
            }

            for (int k = 0; k < candidates.Length; k++)
            {
                int affixOrd = candidates.Ints[candidates.Offset + k];
                if (affixOrd == previous)
                {
                    continue;
                }

                // Same record layout as for prefixes: four 16-bit values at byte 8 * ordinal.
                affixReader.Position = 8 * affixOrd;
                _ = affixReader.ReadShort(); // affix flag: value unused here, read kept so the next reads stay aligned
                int stripOrd = affixReader.ReadShort() & 0xffff;
                int packedCondition = affixReader.ReadShort() & 0xffff;
                bool crossProduct = (packedCondition & 1) != 0;
                int condition = packedCondition >> 1;
                char continuation = (char)(affixReader.ReadShort() & 0xffff);

                bool compatible = recursionDepth == 0;
                if (!compatible && crossProduct)
                {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.Get(continuation, scratch);
                    char[] continuationFlags = Dictionary.DecodeFlags(scratch);
                    Debug.Assert(prevFlag >= 0);
                    compatible = HasCrossCheckedFlag((char)prevFlag, continuationFlags, previousWasPrefix);
                }

                if (!compatible)
                {
                    continue;
                }

                // Everything before the suffix is kept, so the kept length equals suffixStart.
                int keptLength = suffixStart;
                int stripStart = dictionary.stripOffsets[stripOrd];
                int stripLength = dictionary.stripOffsets[stripOrd + 1] - stripStart;

                if (!CheckCondition(condition, word, 0, keptLength, dictionary.stripData, stripStart, stripLength))
                {
                    continue;
                }

                // Rebuild the word as: kept leading part + strip characters.
                char[] rebuilt = new char[stripLength + keptLength];
                Array.Copy(word, 0, rebuilt, 0, keptLength);
                Array.Copy(dictionary.stripData, stripStart, rebuilt, keptLength, stripLength);

                collected.AddRange(ApplyAffix(rebuilt, rebuilt.Length, affixOrd, prefixFlag, recursionDepth, false, circumfix));
            }
        }
    }

    return collected;
}