コード例 #1
0
        public virtual void TestSimpleDictionary()
        {
            // Loads the "simple" affix/dictionary resource pair and checks affix lookups,
            // word lookups and the number of flags decoded for two dictionary words.
            using (System.IO.Stream affixStream = this.GetType().getResourceAsStream("simple.aff"))
            using (System.IO.Stream dictStream = this.GetType().getResourceAsStream("simple.dic"))
            {
                Dictionary dictionary = new Dictionary(affixStream, dictStream);
                assertEquals(3, dictionary.LookupSuffix(new char[] { 'e' }, 0, 1).Length);
                assertEquals(1, dictionary.LookupPrefix(new char[] { 's' }, 0, 1).Length);

                BytesRef @ref = new BytesRef();

                IntsRef ordList = dictionary.LookupWord(new char[] { 'o', 'l', 'r' }, 0, 3);
                assertNotNull(ordList);
                assertEquals(1, ordList.Length);

                // IntsRef is a slice of its backing array: the first valid entry is at Offset, not index 0.
                dictionary.flagLookup.Get(ordList.Ints[ordList.Offset], @ref);
                char[] flags = Dictionary.DecodeFlags(@ref);
                assertEquals(1, flags.Length);

                ordList = dictionary.LookupWord(new char[] { 'l', 'u', 'c', 'e', 'n' }, 0, 5);
                assertNotNull(ordList);
                assertEquals(1, ordList.Length);

                dictionary.flagLookup.Get(ordList.Ints[ordList.Offset], @ref);
                flags = Dictionary.DecodeFlags(@ref);
                assertEquals(1, flags.Length);
            }
        }
コード例 #2
0
        // ================================================= Helper Methods ================================================

        /// <summary>
        /// Generates a list of stems for the provided word by trying to remove each matching
        /// prefix and/or suffix and handing the de-affixed candidate to <c>ApplyAffix</c>.
        /// </summary>
        /// <param name="word"> Word to generate the stems for </param>
        /// <param name="length"> number of characters of <paramref name="word"/> to consider </param>
        /// <param name="previous"> previous affix that was removed (so we don't remove the same one twice) </param>
        /// <param name="prevFlag"> Flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step </param>
        /// <param name="prefixFlag"> flag of the innermost removed prefix, so that when removing a suffix, it is also checked against the word </param>
        /// <param name="recursionDepth"> current recursion depth; at depth 0 every matching affix is considered compatible </param>
        /// <param name="doPrefix"> true if we should remove prefixes </param>
        /// <param name="doSuffix"> true if we should remove suffixes </param>
        /// <param name="previousWasPrefix"> true if the previous removal was a prefix:
        ///        if we are removing a suffix, and it has no continuation requirements, it's ok.
        ///        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. </param>
        /// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix;
        ///        this means the innermost suffix must also contain the circumfix flag. </param>
        /// <returns> <see cref="IList{CharsRef}"/> of stems, or empty list if no stems are found </returns>
        private IList <CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix)
        {
            // TODO: allow this stuff to be reused by tokenfilter
            List <CharsRef> stems = new List <CharsRef>();

            if (doPrefix && dictionary.prefixes != null)
            {
                // Candidate prefixes are the first i characters of the word, tried from
                // i = length - 1 down to the empty prefix (i = 0).
                for (int i = length - 1; i >= 0; i--)
                {
                    Int32sRef prefixes = dictionary.LookupPrefix(word, 0, i);
                    if (prefixes == null)
                    {
                        continue;
                    }

                    for (int j = 0; j < prefixes.Length; j++)
                    {
                        int prefix = prefixes.Int32s[prefixes.Offset + j];
                        // Skip the affix that was removed in the previous step.
                        if (prefix == previous)
                        {
                            continue;
                        }
                        // Position the reader at this affix's record; four 16-bit values are read from it in order.
                        affixReader.Position = 8 * prefix;
                        // 'flag' is not used in this method, but the read is still needed to advance the reader.
                        char flag         = (char)(affixReader.ReadInt16() & 0xffff);
                        char stripOrd     = (char)(affixReader.ReadInt16() & 0xffff);
                        // The low bit of the third value is the cross-product flag; the remaining bits
                        // are the condition value handed to CheckCondition.
                        int  condition    = (char)(affixReader.ReadInt16() & 0xffff);
                        bool crossProduct = (condition & 1) == 1;
                        condition = (int)((uint)condition >> 1);
                        char append = (char)(affixReader.ReadInt16() & 0xffff);

                        // At depth 0 every matching affix is a candidate; deeper levels require the
                        // cross-product bit and a successful flag cross-check.
                        bool compatible;
                        if (recursionDepth == 0)
                        {
                            compatible = true;
                        }
                        else if (crossProduct)
                        {
                            // cross check incoming continuation class (flag of previous affix) against list.
                            dictionary.flagLookup.Get(append, scratch);
                            char[] appendFlags = Dictionary.DecodeFlags(scratch);
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(prevFlag >= 0);
                            }
                            // NOTE(review): the prefix branch always passes false as the third argument,
                            // whereas the suffix branch passes previousWasPrefix — confirm against HasCrossCheckedFlag.
                            compatible = HasCrossCheckedFlag((char)prevFlag, appendFlags, false);
                        }
                        else
                        {
                            compatible = false;
                        }

                        if (compatible)
                        {
                            // The remainder of the word once the first i characters are removed.
                            int deAffixedStart  = i;
                            int deAffixedLength = length - deAffixedStart;

                            // Bounds of this affix's strip characters inside dictionary.stripData.
                            int stripStart  = dictionary.stripOffsets[stripOrd];
                            int stripEnd    = dictionary.stripOffsets[stripOrd + 1];
                            int stripLength = stripEnd - stripStart;

                            if (!CheckCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength))
                            {
                                continue;
                            }

                            // Candidate word = strip characters followed by the de-affixed remainder.
                            char[] strippedWord = new char[stripLength + deAffixedLength];
                            Array.Copy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                            Array.Copy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

                            // NOTE(review): -1 is passed where the suffix branch passes prefixFlag, and true where
                            // it passes false — presumably "no prefix flag" / "affix is a prefix"; confirm against ApplyAffix.
                            IList <CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, prefix, -1, recursionDepth, true, circumfix);

                            stems.AddRange(stemList);
                        }
                    }
                }
            }

            if (doSuffix && dictionary.suffixes != null)
            {
                // Candidate suffixes are the last (length - i) characters of the word, tried from
                // the whole word (i = 0) down to the final single character (i = length - 1).
                for (int i = 0; i < length; i++)
                {
                    Int32sRef suffixes = dictionary.LookupSuffix(word, i, length - i);
                    if (suffixes == null)
                    {
                        continue;
                    }

                    for (int j = 0; j < suffixes.Length; j++)
                    {
                        int suffix = suffixes.Int32s[suffixes.Offset + j];
                        // Skip the affix that was removed in the previous step.
                        if (suffix == previous)
                        {
                            continue;
                        }
                        // Position the reader at this affix's record; four 16-bit values are read from it in order.
                        affixReader.Position = 8 * suffix;
                        // 'flag' is not used in this method, but the read is still needed to advance the reader.
                        char flag         = (char)(affixReader.ReadInt16() & 0xffff);
                        char stripOrd     = (char)(affixReader.ReadInt16() & 0xffff);
                        // The low bit of the third value is the cross-product flag; the remaining bits
                        // are the condition value handed to CheckCondition.
                        int  condition    = (char)(affixReader.ReadInt16() & 0xffff);
                        bool crossProduct = (condition & 1) == 1;
                        condition = (int)((uint)condition >> 1);
                        char append = (char)(affixReader.ReadInt16() & 0xffff);

                        // At depth 0 every matching affix is a candidate; deeper levels require the
                        // cross-product bit and a successful flag cross-check.
                        bool compatible;
                        if (recursionDepth == 0)
                        {
                            compatible = true;
                        }
                        else if (crossProduct)
                        {
                            // cross check incoming continuation class (flag of previous affix) against list.
                            dictionary.flagLookup.Get(append, scratch);
                            char[] appendFlags = Dictionary.DecodeFlags(scratch);
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(prevFlag >= 0);
                            }
                            compatible = HasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
                        }
                        else
                        {
                            compatible = false;
                        }

                        if (compatible)
                        {
                            // Length of the matched suffix and of the word head that remains without it.
                            int appendLength    = length - i;
                            int deAffixedLength = length - appendLength;

                            // Bounds of this affix's strip characters inside dictionary.stripData.
                            int stripStart  = dictionary.stripOffsets[stripOrd];
                            int stripEnd    = dictionary.stripOffsets[stripOrd + 1];
                            int stripLength = stripEnd - stripStart;

                            if (!CheckCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength))
                            {
                                continue;
                            }

                            // Candidate word = de-affixed head followed by the strip characters.
                            char[] strippedWord = new char[stripLength + deAffixedLength];
                            Array.Copy(word, 0, strippedWord, 0, deAffixedLength);
                            Array.Copy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);

                            IList <CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, suffix, prefixFlag, recursionDepth, false, circumfix);

                            stems.AddRange(stemList);
                        }
                    }
                }
            }

            return(stems);
        }
コード例 #3
0
        public virtual void TestCompressedBeforeSetDictionary()
        {
            // Loads the "compressed-before-set" affix file with the compressed dictionary and checks
            // affix lookups plus the number of flags decoded for one dictionary word.
            using (System.IO.Stream affixStream = this.GetType().getResourceAsStream("compressed-before-set.aff"))
            using (System.IO.Stream dictStream = this.GetType().getResourceAsStream("compressed.dic"))
            {
                Dictionary dictionary = new Dictionary(affixStream, dictStream);
                assertEquals(3, dictionary.LookupSuffix(new char[] { 'e' }, 0, 1).Length);
                assertEquals(1, dictionary.LookupPrefix(new char[] { 's' }, 0, 1).Length);

                IntsRef ordList = dictionary.LookupWord(new char[] { 'o', 'l', 'r' }, 0, 3);
                // Fail with a clear assertion instead of a NullReferenceException when the word is missing.
                assertNotNull(ordList);

                BytesRef @ref = new BytesRef();
                // IntsRef is a slice of its backing array: the first valid entry is at Offset, not index 0.
                dictionary.flagLookup.Get(ordList.Ints[ordList.Offset], @ref);
                char[] flags = Dictionary.DecodeFlags(@ref);
                assertEquals(1, flags.Length);
            }
        }