A TokenFilter that decomposes compound words found in many Germanic languages.

"Donaudampfschiff" becomes Donau, dampf, schiff so that you can find "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation grammar and a word dictionary to achieve this.

You must specify the required Version compatibility when creating CompoundWordTokenFilterBase:

  • As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0 supplementary characters in strings and char arrays provided as compound word dictionaries.

Inheritance: CompoundWordTokenFilterBase
示例#1
0
        public virtual void Inform(IResourceLoader loader)
        {
            Stream stream = null;

            try
            {
                if (dictFile != null) // the dictionary can be empty.
                {
                    dictionary = GetWordSet(loader, dictFile, false);
                }
                // TODO: Broken, because we cannot resolve real system id
                // ResourceLoader should also supply method like ClassLoader to get resource URL
                stream = loader.OpenResource(hypFile);
                //InputSource @is = new InputSource(stream);
                //@is.Encoding = encoding; // if it's null let xml parser decide
                //@is.SystemId = hypFile;

                var xmlEncoding = string.IsNullOrEmpty(encoding) ? Encoding.UTF8 : Encoding.GetEncoding(encoding);

                hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(stream, xmlEncoding);
            }
            finally
            {
                IOUtils.CloseWhileHandlingException(stream);
            }
        }
            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer   tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenFilter filter    = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, hyphenator);

                return(new TokenStreamComponents(tokenizer, filter));
            }
            protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer   tokenizer = new KeywordTokenizer(reader);
                TokenFilter filter    = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, hyphenator);

                return(new TokenStreamComponents(tokenizer, filter));
            }
        public virtual void TestHyphenationCompoundWordsDA()
        {
            CharArraySet dict = makeDictionary("læse", "hest");

            //InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
            using var @is = this.GetType().getResourceAsStream("da_UTF8.xml");
            HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);

            HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("min veninde som er lidt af en læsehest"), MockTokenizer.WHITESPACE, false), hyphenator, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

            AssertTokenStreamContents(tf, new string[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" }, new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 });
        }
        public virtual void TestHyphenationCompoundWordsDELongestMatch()
        {
            CharArraySet dict = makeDictionary("basketball", "basket", "ball", "kurv");

            //InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
            using var @is = this.GetType().getResourceAsStream("da_UTF8.xml");
            HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);

            // the word basket will not be added due to the longest match option
            HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);

            AssertTokenStreamContents(tf, new string[] { "basketballkurv", "basketball", "ball", "kurv" }, new int[] { 1, 0, 0, 0 });
        }
        public virtual void TestHyphenationCompoundWordsDA()
        {
            CharArraySet dict = makeDictionary("læse", "hest");

            //InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
            using (var @is = this.GetType().getResourceAsStream("da_UTF8.xml"))
            {
                HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);

                HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("min veninde som er lidt af en læsehest"), MockTokenizer.WHITESPACE, false), hyphenator, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
                AssertTokenStreamContents(tf, new string[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" }, new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 });
            }
        }
        public virtual void TestHyphenationCompoundWordsDELongestMatch()
        {
            CharArraySet dict = makeDictionary("basketball", "basket", "ball", "kurv");

            //InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
            using (var @is = this.GetType().getResourceAsStream("da_UTF8.xml"))
            {
                HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);

                // the word basket will not be added due to the longest match option
                HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
                AssertTokenStreamContents(tf, new string[] { "basketballkurv", "basketball", "ball", "kurv" }, new int[] { 1, 0, 0, 0 });
            }
        }
        public virtual void TestEmptyTerm()
        {
            CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
            Analyzer     a    = new AnalyzerAnonymousInnerClassHelper4(this, dict);

            CheckOneTerm(a, "", "");

            //InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
            using (var @is = this.GetType().getResourceAsStream("da_UTF8.xml"))
            {
                HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);
                Analyzer        b          = new AnalyzerAnonymousInnerClassHelper5(this, hyphenator);
                CheckOneTerm(b, "", "");
            }
        }
        public virtual void TestRandomStrings()
        {
            CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
            Analyzer     a    = new AnalyzerAnonymousInnerClassHelper2(this, dict);

            CheckRandomData(Random, a, 1000 * RANDOM_MULTIPLIER);

            //InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
            using (var @is = this.GetType().getResourceAsStream("da_UTF8.xml"))
            {
                HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);
                Analyzer        b          = new AnalyzerAnonymousInnerClassHelper3(this, hyphenator);
                CheckRandomData(Random, b, 1000 * RANDOM_MULTIPLIER);
            }
        }
        public virtual void TestHyphenationOnly()
        {
            //InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
            using var @is = this.GetType().getResourceAsStream("da_UTF8.xml");
            HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);

            HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 2, 4);

            // min=2, max=4
            AssertTokenStreamContents(tf, new string[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" });

            tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 6);

            // min=4, max=6
            AssertTokenStreamContents(tf, new string[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" });

            tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 10);

            // min=4, max=10
            AssertTokenStreamContents(tf, new string[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" });
        }
        public virtual void Inform(ResourceLoader loader)
        {
            InputStream stream = null;

            try
            {
                if (dictFile != null)   // the dictionary can be empty.
                {
                    dictionary = getWordSet(loader, dictFile, false);
                }
                // TODO: Broken, because we cannot resolve real system id
                // ResourceLoader should also supply method like ClassLoader to get resource URL
                stream = loader.openResource(hypFile);
                InputSource @is = new InputSource(stream);
                @is.Encoding = encoding;   // if it's null let xml parser decide
                @is.SystemId = hypFile;
                hyphenator   = HyphenationCompoundWordTokenFilter.getHyphenationTree(@is);
            }
            finally
            {
                IOUtils.closeWhileHandlingException(stream);
            }
        }
        public virtual void TestHyphenationOnly()
        {
            //InputSource @is = new InputSource(this.GetType().getResource("da_UTF8.xml").toExternalForm());
            using (var @is = this.GetType().getResourceAsStream("da_UTF8.xml"))
            {
                HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);

                HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 2, 4);

                // min=2, max=4
                AssertTokenStreamContents(tf, new string[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" });

                tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 6);

                // min=4, max=6
                AssertTokenStreamContents(tf, new string[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" });

                tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 10);

                // min=4, max=10
                AssertTokenStreamContents(tf, new string[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" });
            }
        }
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new KeywordTokenizer(reader);
     TokenFilter filter = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, hyphenator);
     return new TokenStreamComponents(tokenizer, filter);
 }