/// <summary>
        /// Builds a new <see cref="HyphenationTree"/> and loads its patterns from the given source.
        /// </summary>
        /// <param name="hyphenationSource"> the InputSource pointing to the XML grammar </param>
        /// <returns> The freshly created tree, after its patterns have been loaded </returns>
        /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
        public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
        {
            HyphenationTree patterns = new HyphenationTree();
            patterns.loadPatterns(hyphenationSource);

            return patterns;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Builds a new <see cref="HyphenationTree"/> and loads its patterns from the given stream.
        /// </summary>
        /// <param name="hyphenationSource"> the stream pointing to the XML grammar </param>
        /// <param name="encoding">The character encoding to use</param>
        /// <returns> The freshly created tree, after its patterns have been loaded </returns>
        /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
        public static HyphenationTree GetHyphenationTree(Stream hyphenationSource, Encoding encoding)
        {
            HyphenationTree patterns = new HyphenationTree();
            patterns.LoadPatterns(hyphenationSource, encoding);

            return patterns;
        }
Exemplo n.º 3
0
 /// <summary>
 /// Create a <see cref="HyphenationCompoundWordTokenFilter"/> with no dictionary.
 /// <para>
 /// Delegates to <see cref="HyphenationCompoundWordTokenFilter.HyphenationCompoundWordTokenFilter(LuceneVersion, TokenStream, HyphenationTree, CharArraySet, int, int, int, bool)"/>,
 /// passing <c>null</c> for the dictionary and <c>false</c> for the trailing flag.
 /// </para>
 /// </summary>
 /// <param name="matchVersion"> forwarded unchanged to the full constructor </param>
 /// <param name="input"> forwarded unchanged to the full constructor </param>
 /// <param name="hyphenator"> forwarded unchanged to the full constructor </param>
 /// <param name="minWordSize"> forwarded unchanged to the full constructor </param>
 /// <param name="minSubwordSize"> forwarded unchanged to the full constructor </param>
 /// <param name="maxSubwordSize"> forwarded unchanged to the full constructor </param>
 public HyphenationCompoundWordTokenFilter(
     LuceneVersion matchVersion,
     TokenStream input,
     HyphenationTree hyphenator,
     int minWordSize,
     int minSubwordSize,
     int maxSubwordSize)
     : this(matchVersion, input, hyphenator, null, minWordSize, minSubwordSize, maxSubwordSize, false)
 {
     // Nothing to do here: all initialization happens in the constructor this one chains to.
 }
Exemplo n.º 4
0
        /// <summary>
        /// Loads the optional word dictionary and the hyphenation pattern tree through the given loader.
        /// </summary>
        /// <param name="loader"> resource loader used to read <c>dictFile</c> and to open <c>hypFile</c> </param>
        public virtual void Inform(IResourceLoader loader)
        {
            Stream patternStream = null;

            try
            {
                // The dictionary is optional: without a dictFile the field is left untouched.
                if (dictFile != null)
                {
                    dictionary = GetWordSet(loader, dictFile, false);
                }

                // TODO: Broken, because we cannot resolve real system id
                // ResourceLoader should also supply method like ClassLoader to get resource URL
                patternStream = loader.OpenResource(hypFile);

                // Use UTF-8 when no encoding name was configured; otherwise look the name up.
                Encoding patternEncoding;
                if (string.IsNullOrEmpty(encoding))
                {
                    patternEncoding = Encoding.UTF8;
                }
                else
                {
                    patternEncoding = Encoding.GetEncoding(encoding);
                }

                hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(patternStream, patternEncoding);
            }
            finally
            {
                // Runs whether or not loading succeeded; patternStream is still null if opening never happened.
                IOUtils.DisposeWhileHandlingException(patternStream);
            }
        }
Exemplo n.º 5
0
 /// <summary>
 /// Creates a new <see cref="HyphenationCompoundWordTokenFilter"/> instance.
 /// <para>
 /// Every argument except <paramref name="hyphenator"/> is handed to the base constructor;
 /// the hyphenator is stored on this instance.
 /// </para>
 /// </summary>
 /// <param name="matchVersion">
 ///          Lucene version to enable correct Unicode 4.0 behavior in the
 ///          dictionaries if Version > 3.0. See <a
 ///          href="CompoundWordTokenFilterBase.html#version"
 ///          >CompoundWordTokenFilterBase</a> for details. </param>
 /// <param name="input"> the <see cref="TokenStream"/> to process </param>
 /// <param name="hyphenator"> the hyphenation pattern tree to use for hyphenation </param>
 /// <param name="dictionary"> the word dictionary to match against. </param>
 /// <param name="minWordSize"> only words longer than this get processed </param>
 /// <param name="minSubwordSize"> only subwords longer than this get to the output stream </param>
 /// <param name="maxSubwordSize"> only subwords shorter than this get to the output stream </param>
 /// <param name="onlyLongestMatch"> Add only the longest matching subword to the stream </param>
 public HyphenationCompoundWordTokenFilter(
     LuceneVersion matchVersion,
     TokenStream input,
     HyphenationTree hyphenator,
     CharArraySet dictionary,
     int minWordSize,
     int minSubwordSize,
     int maxSubwordSize,
     bool onlyLongestMatch)
     : base(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch)
 {
     // The hyphenator is the only argument not passed to the base class.
     this.hyphenator = hyphenator;
 }
        /// <summary>
        /// Runs a sentence through a filter built from the <c>da_UTF8.xml</c> patterns and a two-word
        /// dictionary, and checks the emitted terms together with the expected integer array.
        /// </summary>
        public virtual void TestHyphenationCompoundWordsDA()
        {
            CharArraySet dictionary = makeDictionary("læse", "hest");

            using var patternStream = this.GetType().getResourceAsStream("da_UTF8.xml");
            HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(patternStream);

            var tokenizer = new MockTokenizer(
                new StringReader("min veninde som er lidt af en læsehest"),
                MockTokenizer.WHITESPACE,
                false);

            var filter = new HyphenationCompoundWordTokenFilter(
                TEST_VERSION_CURRENT,
                tokenizer,
                hyphenator,
                dictionary,
                CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
                CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
                CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
                false);

            // The compound "læsehest" is expected to be followed by its two dictionary parts.
            string[] expectedTerms = { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" };
            // NOTE(review): presumably position increments (0 for the added sub-words) — confirm against AssertTokenStreamContents.
            int[] expectedIncrements = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 };

            AssertTokenStreamContents(filter, expectedTerms, expectedIncrements);
        }
        /// <summary>
        /// Runs "basketballkurv" through a filter created with the longest-match flag set to <c>true</c>
        /// and a maximum subword size of 40, then checks the emitted terms.
        /// </summary>
        public virtual void TestHyphenationCompoundWordsDELongestMatch()
        {
            CharArraySet dictionary = makeDictionary("basketball", "basket", "ball", "kurv");

            using var patternStream = this.GetType().getResourceAsStream("da_UTF8.xml");
            HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(patternStream);

            var tokenizer = new MockTokenizer(
                new StringReader("basketballkurv"),
                MockTokenizer.WHITESPACE,
                false);

            // the word basket will not be added due to the longest match option
            var filter = new HyphenationCompoundWordTokenFilter(
                TEST_VERSION_CURRENT,
                tokenizer,
                hyphenator,
                dictionary,
                CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
                CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
                40,
                true);

            string[] expectedTerms = { "basketballkurv", "basketball", "ball", "kurv" };
            // NOTE(review): presumably position increments — confirm against AssertTokenStreamContents.
            int[] expectedIncrements = { 1, 0, 0, 0 };

            AssertTokenStreamContents(filter, expectedTerms, expectedIncrements);
        }
        /// <summary>
        /// Checks that an empty term maps to an empty term for two analyzers: one constructed
        /// from a dictionary and one constructed from a hyphenation tree.
        /// </summary>
        public virtual void TestEmptyTerm()
        {
            // Analyzer constructed from the dictionary.
            CharArraySet dictionary = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
            Analyzer dictionaryAnalyzer = new AnalyzerAnonymousInnerClassHelper4(this, dictionary);
            CheckOneTerm(dictionaryAnalyzer, "", "");

            // Analyzer constructed from the hyphenation tree read from da_UTF8.xml.
            using (var patternStream = this.GetType().getResourceAsStream("da_UTF8.xml"))
            {
                HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(patternStream);
                Analyzer hyphenationAnalyzer = new AnalyzerAnonymousInnerClassHelper5(this, hyphenator);

                CheckOneTerm(hyphenationAnalyzer, "", "");
            }
        }
        /// <summary>
        /// Feeds random data to two analyzers: one constructed from a dictionary and one
        /// constructed from a hyphenation tree. Each run uses <c>1000 * RANDOM_MULTIPLIER</c> as its size argument.
        /// </summary>
        public virtual void TestRandomStrings()
        {
            // Analyzer constructed from the dictionary.
            CharArraySet dictionary = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
            Analyzer dictionaryAnalyzer = new AnalyzerAnonymousInnerClassHelper2(this, dictionary);
            CheckRandomData(Random, dictionaryAnalyzer, 1000 * RANDOM_MULTIPLIER);

            // Analyzer constructed from the hyphenation tree read from da_UTF8.xml.
            using (var patternStream = this.GetType().getResourceAsStream("da_UTF8.xml"))
            {
                HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(patternStream);
                Analyzer hyphenationAnalyzer = new AnalyzerAnonymousInnerClassHelper3(this, hyphenator);

                CheckRandomData(Random, hyphenationAnalyzer, 1000 * RANDOM_MULTIPLIER);
            }
        }
Exemplo n.º 10
0
 /// <summary>
 /// Builds a <see cref="HyphenationTree"/> from the <c>da_UTF8.xml</c> resource of
 /// <see cref="TestCompoundWordTokenFilter"/>.
 /// </summary>
 /// <param name="random"> not used by this method </param>
 /// <returns> the loaded <see cref="HyphenationTree"/> </returns>
 public object Create(Random random)
 {
     // TODO: make nastier

     // No catch block on purpose: exceptions propagate unchanged, which keeps the
     // original stack trace intact (a "throw ex;" handler would reset it).
     using (Stream @is = typeof(TestCompoundWordTokenFilter).getResourceAsStream("da_UTF8.xml"))
     {
         return HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);
     }
 }
Exemplo n.º 11
0
            /// <summary>
            /// Builds a <see cref="HyphenationTree"/> from the <c>da_UTF8.xml</c> resource of
            /// <see cref="TestCompoundWordTokenFilter"/>.
            /// </summary>
            /// <param name="random"> not used by this method </param>
            /// <returns> the loaded <see cref="HyphenationTree"/> </returns>
            public object Create(Random random)
            {
                // TODO: make nastier

                // No catch block on purpose: a handler that only rethrows adds nothing, and
                // dropping it also removes the unreachable return that needed a warning suppression.
                using Stream @is = typeof(TestCompoundWordTokenFilter).getResourceAsStream("da_UTF8.xml");
                return HyphenationCompoundWordTokenFilter.GetHyphenationTree(@is);
            }
Exemplo n.º 12
0
        /// <summary>
        /// Checks the terms produced for "basketballkurv" when the filter is created without a
        /// dictionary, using three different (min, max) subword size pairs.
        /// </summary>
        public virtual void TestHyphenationOnly()
        {
            using var patternStream = this.GetType().getResourceAsStream("da_UTF8.xml");
            HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(patternStream);

            // Creates a fresh dictionary-less filter over "basketballkurv" with the given subword bounds.
            HyphenationCompoundWordTokenFilter NewFilter(int minSubwordSize, int maxSubwordSize)
            {
                var tokenizer = new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false);
                return new HyphenationCompoundWordTokenFilter(
                    TEST_VERSION_CURRENT,
                    tokenizer,
                    hyphenator,
                    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
                    minSubwordSize,
                    maxSubwordSize);
            }

            // min=2, max=4
            AssertTokenStreamContents(
                NewFilter(2, 4),
                new string[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" });

            // min=4, max=6
            AssertTokenStreamContents(
                NewFilter(4, 6),
                new string[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" });

            // min=4, max=10
            AssertTokenStreamContents(
                NewFilter(4, 10),
                new string[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" });
        }
        /// <summary>
        /// Loads the optional word dictionary and the hyphenation pattern tree through the given loader.
        /// </summary>
        /// <param name="loader"> resource loader used to read <c>dictFile</c> and to open <c>hypFile</c> </param>
        public virtual void Inform(ResourceLoader loader)
        {
            InputStream patternStream = null;

            try
            {
                // The dictionary is optional: without a dictFile the field is left untouched.
                if (dictFile != null)
                {
                    dictionary = getWordSet(loader, dictFile, false);
                }

                // TODO: Broken, because we cannot resolve real system id
                // ResourceLoader should also supply method like ClassLoader to get resource URL
                patternStream = loader.openResource(hypFile);

                InputSource patternSource = new InputSource(patternStream)
                {
                    Encoding = encoding,   // if it's null let xml parser decide
                    SystemId = hypFile
                };

                hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(patternSource);
            }
            finally
            {
                // Runs whether or not loading succeeded; patternStream is still null if opening never happened.
                IOUtils.CloseWhileHandlingException(patternStream);
            }
        }
Exemplo n.º 14
0
 /// <summary>
 /// Creates a new <see cref="HyphenationCompoundWordTokenFilter"/> instance using
 /// <c>DEFAULT_MIN_WORD_SIZE</c>, <c>DEFAULT_MIN_SUBWORD_SIZE</c> and <c>DEFAULT_MAX_SUBWORD_SIZE</c>,
 /// with <c>false</c> passed as the final argument of the full constructor.
 /// </summary>
 /// <param name="matchVersion">
 ///          Lucene version to enable correct Unicode 4.0 behavior in the
 ///          dictionaries if Version > 3.0. See <a
 ///          href="CompoundWordTokenFilterBase.html#version"
 ///          >CompoundWordTokenFilterBase</a> for details. </param>
 /// <param name="input"> the <see cref="TokenStream"/> to process </param>
 /// <param name="hyphenator"> the hyphenation pattern tree to use for hyphenation </param>
 /// <param name="dictionary"> the word dictionary to match against. </param>
 public HyphenationCompoundWordTokenFilter(
     LuceneVersion matchVersion,
     TokenStream input,
     HyphenationTree hyphenator,
     CharArraySet dictionary)
     : this(
         matchVersion,
         input,
         hyphenator,
         dictionary,
         DEFAULT_MIN_WORD_SIZE,
         DEFAULT_MIN_SUBWORD_SIZE,
         DEFAULT_MAX_SUBWORD_SIZE,
         false)
 {
     // Nothing to do here: all initialization happens in the constructor this one chains to.
 }
Exemplo n.º 15
0
 /// <summary>
 /// Create a <see cref="HyphenationCompoundWordTokenFilter"/> with no dictionary, using
 /// <c>DEFAULT_MIN_WORD_SIZE</c>, <c>DEFAULT_MIN_SUBWORD_SIZE</c> and <c>DEFAULT_MAX_SUBWORD_SIZE</c>.
 /// <para>
 /// Calls <see cref="HyphenationCompoundWordTokenFilter.HyphenationCompoundWordTokenFilter(LuceneVersion, TokenStream, HyphenationTree, int, int, int)"/>
 /// </para>
 /// </summary>
 /// <param name="matchVersion"> forwarded unchanged to the chained constructor </param>
 /// <param name="input"> forwarded unchanged to the chained constructor </param>
 /// <param name="hyphenator"> forwarded unchanged to the chained constructor </param>
 public HyphenationCompoundWordTokenFilter(
     LuceneVersion matchVersion,
     TokenStream input,
     HyphenationTree hyphenator)
     : this(
         matchVersion,
         input,
         hyphenator,
         DEFAULT_MIN_WORD_SIZE,
         DEFAULT_MIN_SUBWORD_SIZE,
         DEFAULT_MAX_SUBWORD_SIZE)
 {
     // Nothing to do here: all initialization happens in the constructor this one chains to.
 }
 /// <summary>
 /// Stores the enclosing test instance and the hyphenation tree on this helper.
 /// </summary>
 /// <param name="outerInstance"> the test instance to keep a reference to </param>
 /// <param name="hyphenator"> the hyphenation tree to keep a reference to </param>
 public AnalyzerAnonymousInnerClassHelper5(
     TestCompoundWordTokenFilter outerInstance,
     HyphenationTree hyphenator)
 {
     // The two assignments are independent of each other.
     this.hyphenator = hyphenator;
     this.outerInstance = outerInstance;
 }