GetHyphenationTree() public static method

Creates a hyphenation tree.
An exception is thrown if there is a low-level I/O error.
public static GetHyphenationTree ( FileInfo hyphenationFile ) : HyphenationTree
hyphenationFile System.IO.FileInfo the file of the XML grammar to load
return org.apache.lucene.analysis.compound.hyphenation.HyphenationTree
Example #1
0
        /// <summary>
        /// Loads the optional word dictionary and the hyphenation grammar through the
        /// supplied resource loader, storing them in <c>dictionary</c> and <c>hyphenator</c>.
        /// </summary>
        /// <param name="loader">Resource loader used to read <c>dictFile</c> (when one is set) and to open <c>hypFile</c>.</param>
        public virtual void Inform(IResourceLoader loader)
        {
            Stream grammarStream = null;

            try
            {
                // The dictionary is optional: only load it when a file was configured.
                if (dictFile != null)
                {
                    dictionary = GetWordSet(loader, dictFile, false);
                }

                // TODO: Broken, because we cannot resolve real system id
                // ResourceLoader should also supply method like ClassLoader to get resource URL
                grammarStream = loader.OpenResource(hypFile);

                // Fall back to UTF-8 when no encoding name was configured.
                Encoding grammarEncoding;
                if (string.IsNullOrEmpty(encoding))
                {
                    grammarEncoding = Encoding.UTF8;
                }
                else
                {
                    grammarEncoding = Encoding.GetEncoding(encoding);
                }

                hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(grammarStream, grammarEncoding);
            }
            finally
            {
                // Runs whether or not loading succeeded; the stream is still null
                // here if a step before OpenResource threw.
                IOUtils.CloseWhileHandlingException(grammarStream);
            }
        }
        /// <summary>
        /// Runs the hyphenation compound-word filter over a sentence using the
        /// <c>da_UTF8.xml</c> grammar and a two-word dictionary, and checks that
        /// "læsehest" is emitted followed by its parts "læse" and "hest".
        /// </summary>
        public virtual void TestHyphenationCompoundWordsDA()
        {
            CharArraySet wordSet = makeDictionary("læse", "hest");

            // The grammar stream stays open until the assertions have finished.
            using (var grammar = this.GetType().getResourceAsStream("da_UTF8.xml"))
            {
                HyphenationTree tree = HyphenationCompoundWordTokenFilter.GetHyphenationTree(grammar);

                var tokenizer = new MockTokenizer(new StringReader("min veninde som er lidt af en læsehest"), MockTokenizer.WHITESPACE, false);
                var filter = new HyphenationCompoundWordTokenFilter(
                    TEST_VERSION_CURRENT,
                    tokenizer,
                    tree,
                    wordSet,
                    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
                    CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
                    CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
                    false);

                string[] expectedTerms = { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" };
                // One integer per expected term; the two sub-words carry 0.
                int[] expectedIncrements = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 };

                AssertTokenStreamContents(filter, expectedTerms, expectedIncrements);
            }
        }
        /// <summary>
        /// Runs the hyphenation compound-word filter over "basketballkurv" with the
        /// last constructor flag set to <c>true</c> and a maximum sub-word size of 40,
        /// and checks the emitted terms.
        /// </summary>
        public virtual void TestHyphenationCompoundWordsDELongestMatch()
        {
            CharArraySet wordSet = makeDictionary("basketball", "basket", "ball", "kurv");

            // The grammar stream stays open until the assertions have finished.
            using (var grammar = this.GetType().getResourceAsStream("da_UTF8.xml"))
            {
                HyphenationTree tree = HyphenationCompoundWordTokenFilter.GetHyphenationTree(grammar);

                var tokenizer = new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false);

                // the word basket will not be added due to the longest match option
                var filter = new HyphenationCompoundWordTokenFilter(
                    TEST_VERSION_CURRENT,
                    tokenizer,
                    tree,
                    wordSet,
                    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
                    CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
                    40,
                    true);

                string[] expectedTerms = { "basketballkurv", "basketball", "ball", "kurv" };
                // One integer per expected term; every sub-word carries 0.
                int[] expectedIncrements = { 1, 0, 0, 0 };

                AssertTokenStreamContents(filter, expectedTerms, expectedIncrements);
            }
        }
        /// <summary>
        /// Checks that an empty term maps to an empty term, first with the
        /// dictionary-backed analyzer and then with the analyzer built from the
        /// <c>da_UTF8.xml</c> hyphenation tree.
        /// </summary>
        public virtual void TestEmptyTerm()
        {
            CharArraySet vocabulary = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
            Analyzer dictionaryAnalyzer = new AnalyzerAnonymousInnerClassHelper4(this, vocabulary);
            CheckOneTerm(dictionaryAnalyzer, "", "");

            // Disposed when the method exits, i.e. after the second check has run.
            using var grammar = this.GetType().getResourceAsStream("da_UTF8.xml");
            HyphenationTree tree = HyphenationCompoundWordTokenFilter.GetHyphenationTree(grammar);
            Analyzer hyphenationAnalyzer = new AnalyzerAnonymousInnerClassHelper5(this, tree);
            CheckOneTerm(hyphenationAnalyzer, "", "");
        }
        /// <summary>
        /// Feeds random data through the dictionary-backed analyzer and then through
        /// the analyzer built from the <c>da_UTF8.xml</c> hyphenation tree, each with
        /// <c>1000 * RANDOM_MULTIPLIER</c> as the iteration argument.
        /// </summary>
        public virtual void TestRandomStrings()
        {
            CharArraySet vocabulary = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
            Analyzer dictionaryAnalyzer = new AnalyzerAnonymousInnerClassHelper2(this, vocabulary);
            CheckRandomData(Random, dictionaryAnalyzer, 1000 * RANDOM_MULTIPLIER);

            // Disposed when the method exits, i.e. after the second random run.
            using var grammar = this.GetType().getResourceAsStream("da_UTF8.xml");
            HyphenationTree tree = HyphenationCompoundWordTokenFilter.GetHyphenationTree(grammar);
            Analyzer hyphenationAnalyzer = new AnalyzerAnonymousInnerClassHelper3(this, tree);
            CheckRandomData(Random, hyphenationAnalyzer, 1000 * RANDOM_MULTIPLIER);
        }
        /// <summary>
        /// Exercises the filter without a dictionary: "basketballkurv" is split using
        /// only the <c>da_UTF8.xml</c> hyphenation tree, for three different
        /// minimum/maximum sub-word size pairs.
        /// </summary>
        public virtual void TestHyphenationOnly()
        {
            using var grammar = this.GetType().getResourceAsStream("da_UTF8.xml");
            HyphenationTree tree = HyphenationCompoundWordTokenFilter.GetHyphenationTree(grammar);

            // Builds a fresh tokenizer + filter over the same input for the given size limits.
            HyphenationCompoundWordTokenFilter NewFilter(int minSubwordSize, int maxSubwordSize)
            {
                var tokenizer = new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false);
                return new HyphenationCompoundWordTokenFilter(
                    TEST_VERSION_CURRENT,
                    tokenizer,
                    tree,
                    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
                    minSubwordSize,
                    maxSubwordSize);
            }

            // min=2, max=4
            AssertTokenStreamContents(
                NewFilter(2, 4),
                new string[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" });

            // min=4, max=6
            AssertTokenStreamContents(
                NewFilter(4, 6),
                new string[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" });

            // min=4, max=10
            AssertTokenStreamContents(
                NewFilter(4, 10),
                new string[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" });
        }