/// <summary>
/// Loads the optional dictionary (<c>dictFile</c>) and the hyphenation tree
/// (<c>hypFile</c>) through the supplied resource loader.
/// </summary>
/// <param name="loader">Loader used to resolve both configured resource names.</param>
public virtual void Inform(IResourceLoader loader)
{
    Stream stream = null;
    try
    {
        // The dictionary can be empty, so it is only loaded when a file was configured.
        if (dictFile != null)
        {
            dictionary = GetWordSet(loader, dictFile, false);
        }

        // TODO: Broken, because we cannot resolve real system id
        // ResourceLoader should also supply method like ClassLoader to get resource URL
        stream = loader.OpenResource(hypFile);

        // The Java original wrapped the stream in an InputSource and let the XML parser
        // pick the encoding when none was configured; here UTF-8 is used in that case.
        Encoding grammarEncoding;
        if (string.IsNullOrEmpty(encoding))
        {
            grammarEncoding = Encoding.UTF8;
        }
        else
        {
            grammarEncoding = Encoding.GetEncoding(encoding);
        }

        hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree(stream, grammarEncoding);
    }
    finally
    {
        // stream is still null here if OpenResource was never reached or threw.
        IOUtils.CloseWhileHandlingException(stream);
    }
}
/// <summary>
/// Builds the analysis chain: a <c>MockTokenizer</c> in <c>WHITESPACE</c> mode whose output
/// is fed through a <c>HyphenationCompoundWordTokenFilter</c> using the captured hyphenator.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed (not used by this chain).</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>The tokenizer/filter pair wrapped in <c>TokenStreamComponents</c>.</returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    TokenFilter compoundFilter = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, source, hyphenator);

    return new TokenStreamComponents(source, compoundFilter);
}
/// <summary>
/// Builds the analysis chain: a <c>KeywordTokenizer</c> whose output is fed through a
/// <c>HyphenationCompoundWordTokenFilter</c> using the captured hyphenator.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed (not used by this chain).</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>The tokenizer/filter pair wrapped in <c>TokenStreamComponents</c>.</returns>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenFilter compoundFilter = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, source, hyphenator);

    return new TokenStreamComponents(source, compoundFilter);
}
/// <summary>
/// Decomposes "læsehest" with the <c>da_UTF8.xml</c> hyphenation grammar and a two-word
/// dictionary, and verifies that the original token is followed by both dictionary parts.
/// </summary>
public virtual void TestHyphenationCompoundWordsDA()
{
    CharArraySet subwords = makeDictionary("læse", "hest");

    // The grammar stream only needs to stay open while the tree is built and the filter runs.
    using (var grammar = this.GetType().getResourceAsStream("da_UTF8.xml"))
    {
        HyphenationTree tree = HyphenationCompoundWordTokenFilter.GetHyphenationTree(grammar);

        var tokenizer = new MockTokenizer(
            new StringReader("min veninde som er lidt af en læsehest"),
            MockTokenizer.WHITESPACE,
            false);

        HyphenationCompoundWordTokenFilter filter = new HyphenationCompoundWordTokenFilter(
            TEST_VERSION_CURRENT,
            tokenizer,
            tree,
            subwords,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
            false);

        string[] expectedTerms = new string[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" };
        // Presumably position increments: the two trailing 0 entries belong to the emitted sub-words.
        int[] expectedIncrements = new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 };

        AssertTokenStreamContents(filter, expectedTerms, expectedIncrements);
    }
}
/// <summary>
/// Decomposes "basketballkurv" with the longest-match option enabled and verifies that
/// "basket" is not emitted even though it is in the dictionary.
/// </summary>
public virtual void TestHyphenationCompoundWordsDELongestMatch()
{
    CharArraySet subwords = makeDictionary("basketball", "basket", "ball", "kurv");

    using (var grammar = this.GetType().getResourceAsStream("da_UTF8.xml"))
    {
        HyphenationTree tree = HyphenationCompoundWordTokenFilter.GetHyphenationTree(grammar);

        var tokenizer = new MockTokenizer(
            new StringReader("basketballkurv"),
            MockTokenizer.WHITESPACE,
            false);

        // the word basket will not be added due to the longest match option
        HyphenationCompoundWordTokenFilter filter = new HyphenationCompoundWordTokenFilter(
            TEST_VERSION_CURRENT,
            tokenizer,
            tree,
            subwords,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            40,
            true);

        string[] expectedTerms = new string[] { "basketballkurv", "basketball", "ball", "kurv" };
        // Presumably position increments: only the original token advances the position.
        int[] expectedIncrements = new int[] { 1, 0, 0, 0 };

        AssertTokenStreamContents(filter, expectedTerms, expectedIncrements);
    }
}
/// <summary>
/// Runs the compound-word filter over a sentence ending in "læsehest" and checks that the
/// compound is kept and followed by its dictionary parts "læse" and "hest".
/// </summary>
public virtual void TestHyphenationCompoundWordsDA()
{
    CharArraySet knownParts = makeDictionary("læse", "hest");

    using (var grammarStream = this.GetType().getResourceAsStream("da_UTF8.xml"))
    {
        HyphenationTree hyphenationTree = HyphenationCompoundWordTokenFilter.GetHyphenationTree(grammarStream);

        var input = new MockTokenizer(
            new StringReader("min veninde som er lidt af en læsehest"),
            MockTokenizer.WHITESPACE,
            false);

        HyphenationCompoundWordTokenFilter compoundFilter = new HyphenationCompoundWordTokenFilter(
            TEST_VERSION_CURRENT,
            input,
            hyphenationTree,
            knownParts,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
            false);

        string[] expectedTerms = new string[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" };
        // Presumably position increments: the two trailing 0 entries belong to the emitted sub-words.
        int[] expectedIncrements = new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 };

        AssertTokenStreamContents(compoundFilter, expectedTerms, expectedIncrements);
    }
}
/// <summary>
/// Verifies the longest-match option: for "basketballkurv" the filter emits "basketball",
/// "ball" and "kurv" but not the shorter dictionary entry "basket".
/// </summary>
public virtual void TestHyphenationCompoundWordsDELongestMatch()
{
    CharArraySet knownParts = makeDictionary("basketball", "basket", "ball", "kurv");

    using (var grammarStream = this.GetType().getResourceAsStream("da_UTF8.xml"))
    {
        HyphenationTree hyphenationTree = HyphenationCompoundWordTokenFilter.GetHyphenationTree(grammarStream);

        var input = new MockTokenizer(
            new StringReader("basketballkurv"),
            MockTokenizer.WHITESPACE,
            false);

        // the word basket will not be added due to the longest match option
        HyphenationCompoundWordTokenFilter compoundFilter = new HyphenationCompoundWordTokenFilter(
            TEST_VERSION_CURRENT,
            input,
            hyphenationTree,
            knownParts,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            40,
            true);

        string[] expectedTerms = new string[] { "basketballkurv", "basketball", "ball", "kurv" };
        // Presumably position increments: only the original token advances the position.
        int[] expectedIncrements = new int[] { 1, 0, 0, 0 };

        AssertTokenStreamContents(compoundFilter, expectedTerms, expectedIncrements);
    }
}
/// <summary>
/// Feeds an empty term through two analyzers, one built from a dictionary and one built
/// from the <c>da_UTF8.xml</c> hyphenation tree, expecting an empty term back from each.
/// </summary>
public virtual void TestEmptyTerm()
{
    // First analyzer: constructed from a small dictionary.
    CharArraySet vocabulary = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
    Analyzer dictionaryAnalyzer = new AnalyzerAnonymousInnerClassHelper4(this, vocabulary);
    CheckOneTerm(dictionaryAnalyzer, "", "");

    // Second analyzer: constructed from the hyphenation tree.
    using (var grammarStream = this.GetType().getResourceAsStream("da_UTF8.xml"))
    {
        HyphenationTree tree = HyphenationCompoundWordTokenFilter.GetHyphenationTree(grammarStream);
        Analyzer hyphenationAnalyzer = new AnalyzerAnonymousInnerClassHelper5(this, tree);
        CheckOneTerm(hyphenationAnalyzer, "", "");
    }
}
/// <summary>
/// Runs <c>CheckRandomData</c> against two analyzers, one built from a dictionary and one
/// built from the <c>da_UTF8.xml</c> hyphenation tree, each for
/// <c>1000 * RANDOM_MULTIPLIER</c> iterations.
/// </summary>
public virtual void TestRandomStrings()
{
    // First analyzer: constructed from a small dictionary.
    CharArraySet vocabulary = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
    Analyzer dictionaryAnalyzer = new AnalyzerAnonymousInnerClassHelper2(this, vocabulary);
    CheckRandomData(Random, dictionaryAnalyzer, 1000 * RANDOM_MULTIPLIER);

    // Second analyzer: constructed from the hyphenation tree.
    using (var grammarStream = this.GetType().getResourceAsStream("da_UTF8.xml"))
    {
        HyphenationTree tree = HyphenationCompoundWordTokenFilter.GetHyphenationTree(grammarStream);
        Analyzer hyphenationAnalyzer = new AnalyzerAnonymousInnerClassHelper3(this, tree);
        CheckRandomData(Random, hyphenationAnalyzer, 1000 * RANDOM_MULTIPLIER);
    }
}
/// <summary>
/// Exercises the filter without a dictionary: "basketballkurv" is split using only the
/// <c>da_UTF8.xml</c> hyphenation tree, for three different min/max sub-word size settings.
/// </summary>
public virtual void TestHyphenationOnly()
{
    using (var grammar = this.GetType().getResourceAsStream("da_UTF8.xml"))
    {
        HyphenationTree tree = HyphenationCompoundWordTokenFilter.GetHyphenationTree(grammar);

        // min=2, max=4
        HyphenationCompoundWordTokenFilter shortParts = new HyphenationCompoundWordTokenFilter(
            TEST_VERSION_CURRENT,
            new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
            tree,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            2,
            4);
        AssertTokenStreamContents(
            shortParts,
            new string[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" });

        // min=4, max=6
        HyphenationCompoundWordTokenFilter mediumParts = new HyphenationCompoundWordTokenFilter(
            TEST_VERSION_CURRENT,
            new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
            tree,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            4,
            6);
        AssertTokenStreamContents(
            mediumParts,
            new string[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" });

        // min=4, max=10
        HyphenationCompoundWordTokenFilter longParts = new HyphenationCompoundWordTokenFilter(
            TEST_VERSION_CURRENT,
            new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
            tree,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            4,
            10);
        AssertTokenStreamContents(
            longParts,
            new string[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" });
    }
}
/// <summary>
/// Loads the optional dictionary (<c>dictFile</c>) and the hyphenation tree
/// (<c>hypFile</c>) through the supplied resource loader.
/// </summary>
/// <param name="loader">Loader used to resolve both configured resource names.</param>
public virtual void Inform(ResourceLoader loader)
{
    InputStream stream = null;
    try
    {
        // The dictionary can be empty, so it is only loaded when a file was configured.
        if (dictFile != null)
        {
            dictionary = getWordSet(loader, dictFile, false);
        }

        // TODO: Broken, because we cannot resolve real system id
        // ResourceLoader should also supply method like ClassLoader to get resource URL
        stream = loader.openResource(hypFile);

        InputSource grammarSource = new InputSource(stream)
        {
            Encoding = encoding, // if it's null let xml parser decide
            SystemId = hypFile
        };

        hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(grammarSource);
    }
    finally
    {
        // stream is still null here if openResource was never reached or threw.
        IOUtils.closeWhileHandlingException(stream);
    }
}
/// <summary>
/// Splits "basketballkurv" using only the <c>da_UTF8.xml</c> hyphenation tree (no dictionary)
/// and checks the emitted tokens for three min/max sub-word size combinations.
/// </summary>
public virtual void TestHyphenationOnly()
{
    using (var grammarStream = this.GetType().getResourceAsStream("da_UTF8.xml"))
    {
        HyphenationTree hyphenationTree = HyphenationCompoundWordTokenFilter.GetHyphenationTree(grammarStream);

        // min=2, max=4
        var firstInput = new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false);
        HyphenationCompoundWordTokenFilter filter = new HyphenationCompoundWordTokenFilter(
            TEST_VERSION_CURRENT, firstInput, hyphenationTree,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 2, 4);
        string[] expectedFor2To4 = new string[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" };
        AssertTokenStreamContents(filter, expectedFor2To4);

        // min=4, max=6
        var secondInput = new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false);
        filter = new HyphenationCompoundWordTokenFilter(
            TEST_VERSION_CURRENT, secondInput, hyphenationTree,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 6);
        string[] expectedFor4To6 = new string[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" };
        AssertTokenStreamContents(filter, expectedFor4To6);

        // min=4, max=10
        var thirdInput = new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false);
        filter = new HyphenationCompoundWordTokenFilter(
            TEST_VERSION_CURRENT, thirdInput, hyphenationTree,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 10);
        string[] expectedFor4To10 = new string[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" };
        AssertTokenStreamContents(filter, expectedFor4To10);
    }
}
/// <summary>
/// Builds the analysis chain: a <c>KeywordTokenizer</c> whose output is fed through a
/// <c>HyphenationCompoundWordTokenFilter</c> using the captured hyphenator.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed (not used by this chain).</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>The tokenizer/filter pair wrapped in <c>TokenStreamComponents</c>.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenFilter compoundFilter = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, source, hyphenator);

    return new TokenStreamComponents(source, compoundFilter);
}