/// <summary>
/// Initializes a new <see cref="HMMChineseTokenizer"/> whose attributes are created through the given
/// <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/>.
/// </summary>
/// <param name="factory">Attribute factory handed to the base constructor.</param>
/// <param name="reader">Text source handed to the base constructor.</param>
public HMMChineseTokenizer(AttributeFactory factory, TextReader reader)
    : base(factory, reader, (BreakIterator)sentenceProto.Clone())
{
    // The base class is given a fresh clone of the static sentenceProto, never the prototype itself.
    // Attributes are registered in the same order as before: term, offset, type.
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
}
/// <summary>
/// Initializes a new <see cref="ThaiTokenizer"/> whose attributes are created through the given
/// <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/>.
/// </summary>
/// <param name="factory">Attribute factory handed to the base constructor.</param>
/// <param name="reader">Text source handed to the base constructor.</param>
public ThaiTokenizer(AttributeFactory factory, TextReader reader)
    : base(factory, reader, (BreakIterator)sentenceProto.Clone())
{
    // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator

    // The word breaker works on its own clone of the proto iterator.
    BreakIterator wordIteratorClone = (BreakIterator)proto.Clone();
    wordBreaker = new ThaiWordBreaker(wordIteratorClone);

    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Returns a fresh clone of the break iterator prototype used for the given script code.
/// </summary>
/// <param name="script">Script code, compared against the <c>UScript</c> constants.</param>
/// <returns>
/// A clone of the CJK iterator for <c>UScript.Japanese</c>; for <c>UScript.Myanmar</c>, a clone of the
/// Myanmar syllable iterator unless <c>myanmarAsWords</c> is set; otherwise a clone of the default iterator.
/// </returns>
public override RuleBasedBreakIterator GetBreakIterator(int script)
{
    if (script == UScript.Japanese)
    {
        return (RuleBasedBreakIterator)cjkBreakIterator.Clone();
    }

    // Myanmar only gets syllable-level breaking when word-level breaking was not requested;
    // the flag is consulted solely for Myanmar text, as before.
    if (script == UScript.Myanmar && !myanmarAsWords)
    {
        return (RuleBasedBreakIterator)myanmarSyllableIterator.Clone();
    }

    return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
}
/// <summary>
/// Clones the <c>sentenceProto</c> iterator while holding <c>syncLock</c>.
/// </summary>
/// <returns>A new <see cref="BreakIterator"/> cloned from <c>sentenceProto</c>.</returns>
private static BreakIterator CreateSentenceClone()
{
    BreakIterator sentenceClone;

    UninterruptableMonitor.Enter(syncLock);
    try
    {
        sentenceClone = (BreakIterator)sentenceProto.Clone();
    }
    finally
    {
        // Always release the monitor, even if cloning throws.
        UninterruptableMonitor.Exit(syncLock);
    }

    return sentenceClone;
}
/// <summary>
/// Initializes a new <see cref="ThaiTokenizer"/> whose attributes are created through the given
/// <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/>.
/// </summary>
/// <param name="factory">Attribute factory handed to the base constructor.</param>
/// <param name="reader">Text source handed to the base constructor.</param>
public ThaiTokenizer(AttributeFactory factory, TextReader reader)
    : base(factory, reader, CreateSentenceClone())
{
    // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator

    // Clone the proto iterator and build the word breaker while holding syncLock.
    UninterruptableMonitor.Enter(syncLock);
    try
    {
        BreakIterator wordIteratorClone = (BreakIterator)proto.Clone();
        wordBreaker = new ThaiWordBreaker(wordIteratorClone);
    }
    finally
    {
        // Always release the monitor, even if cloning or construction throws.
        UninterruptableMonitor.Exit(syncLock);
    }

    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Exercises <c>BreakIterator.RegisterInstance</c> / <c>BreakIterator.Unregister</c>.
/// Two iterators (a clone of the en-US sentence iterator, then a clone of the Thai word iterator) are
/// registered as the en-US word iterator. The test asserts that the latest registration is returned,
/// that unregistering it exposes the earlier registration, and that unregistering both restores the
/// original en-US word iterator. It also asserts that iterators handed out carry empty text.
/// </summary>
public void TestRegUnreg()
{
    CultureInfo thailand_locale = new CultureInfo("th-TH");
    // The same two cultures are requested repeatedly below, so build them once.
    CultureInfo japanese_locale = new CultureInfo("ja");
    CultureInfo us_locale = new CultureInfo("en-US");
    // ICU4N: Arbitrary locales are not allowed in .NET
    //CultureInfo foo_locale = new CultureInfo("fu-FU");

    // Baseline iterators captured before anything is registered.
    BreakIterator jwbi = BreakIterator.GetWordInstance(japanese_locale);
    BreakIterator uwbi = BreakIterator.GetWordInstance(us_locale);
    BreakIterator usbi = BreakIterator.GetSentenceInstance(us_locale);
    BreakIterator twbi = BreakIterator.GetWordInstance(thailand_locale);
    // Only referenced by the disabled "root word break" assertion below; the call itself is kept.
    BreakIterator rwbi = BreakIterator.GetWordInstance(CultureInfo.InvariantCulture); // (new Locale("", "", ""));

    BreakIterator sbi = (BreakIterator)usbi.Clone();
    // todo: this will cause the test to fail, no way to set a breakiterator to null text so can't fix yet.
    // String text = "This is some test, by golly. Boy, they don't make tests like they used to, do they? This here test ain't worth $2.50. Nope.";
    // sbi.setText(text);

    assertTrue(!BreakIterator.Unregister(""), "unregister before register"); // coverage

    // ICU4N: Arbitrary locales are not allowed in .NET
    //object key0 = BreakIterator.RegisterInstance((BreakIterator)twbi.Clone(), foo_locale, BreakIterator.KIND_WORD);
    object key1 = BreakIterator.RegisterInstance(sbi, us_locale, BreakIterator.KIND_WORD);
    object key2 = BreakIterator.RegisterInstance((BreakIterator)twbi.Clone(), us_locale, BreakIterator.KIND_WORD);

    // With both registrations active, the most recent one (Thai word clone) answers for en-US words.
    {
        BreakIterator test0 = BreakIterator.GetWordInstance(japanese_locale);
        BreakIterator test1 = BreakIterator.GetWordInstance(us_locale);
        BreakIterator test2 = BreakIterator.GetSentenceInstance(us_locale);
        BreakIterator test3 = BreakIterator.GetWordInstance(thailand_locale);
        // ICU4N: Arbitrary locales are not allowed in .NET
        //BreakIterator test4 = BreakIterator.GetWordInstance(foo_locale);

        assertEqual(test0, jwbi, "japan word == japan word");
        assertEqual(test1, twbi, "us word == thai word");
        assertEqual(test2, usbi, "us sentence == us sentence");
        assertEqual(test3, twbi, "thai word == thai word");
        // ICU4N: Arbitrary locales are not allowed in .NET
        //assertEqual(test4, twbi, "foo word == thai word");
    }

    //Locale[] locales = BreakIterator.getAvailableLocales();

    assertTrue(BreakIterator.Unregister(key2), "unregister us word (thai word)");
    assertTrue(!BreakIterator.Unregister(key2), "unregister second time");

    // Unregistering a null key is expected to be rejected with an ArgumentException.
    bool error = false;
    try
    {
        BreakIterator.Unregister(null);
    }
    catch (ArgumentException)
    {
        error = true;
    }

    assertTrue(error, "unregister null");

    // The iterator handed out for en-US words must not carry any text.
    {
        CharacterIterator sci = BreakIterator.GetWordInstance(us_locale).Text;
        int len = sci.EndIndex - sci.BeginIndex;
        assertEqual(len, 0, "us word text: " + getString(sci));
    }

    // ICU4N: Arbitrary locales are not allowed in .NET
    //assertTrue((BreakIterator.GetAvailableLocales().ToList()).Contains(foo_locale), "foo_locale");
    //assertTrue(BreakIterator.Unregister(key0), "unregister foo word (thai word)");
    //assertTrue(!(BreakIterator.GetAvailableLocales().ToList()).Contains(foo_locale), "no foo_locale");

    // Only key1 (the sentence clone) is still registered for en-US words at this point.
    assertEqual(BreakIterator.GetWordInstance(us_locale), usbi, "us word == us sentence");

    assertTrue(BreakIterator.Unregister(key1), "unregister us word (us sentence)");

    // Nothing registered any more: every lookup must match its baseline again.
    {
        BreakIterator test0 = BreakIterator.GetWordInstance(japanese_locale);
        BreakIterator test1 = BreakIterator.GetWordInstance(us_locale);
        BreakIterator test2 = BreakIterator.GetSentenceInstance(us_locale);
        BreakIterator test3 = BreakIterator.GetWordInstance(thailand_locale);
        // ICU4N: Arbitrary locales are not allowed in .NET
        //BreakIterator test4 = BreakIterator.GetWordInstance(foo_locale);

        assertEqual(test0, jwbi, "japanese word break");
        assertEqual(test1, uwbi, "us sentence-word break");
        assertEqual(test2, usbi, "us sentence break");
        assertEqual(test3, twbi, "thai word break");
        // ICU4N: Arbitrary locales are not allowed in .NET
        //assertEqual(test4, rwbi, "root word break");

        CharacterIterator sci = test1.Text;
        int len = sci.EndIndex - sci.BeginIndex;
        assertEqual(len, 0, "us sentence-word break text: " + getString(sci));
    }
}
/// <summary>
/// Initializes a new <see cref="CodeTokenizer"/> over the given reader, using
/// <c>AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY</c> to create attributes.
/// </summary>
/// <param name="reader">Text source handed to the base constructor.</param>
public CodeTokenizer(TextReader reader)
    : base(
        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
        reader,
        (BreakIterator)sentenceProto.Clone())
{
    // The base class is given a fresh clone of the static sentenceProto, never the prototype itself.
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Clones the <c>sentenceProto</c> iterator while holding the lock on <c>syncLock</c>.
/// </summary>
/// <returns>A new <see cref="BreakIterator"/> cloned from <c>sentenceProto</c>.</returns>
private static BreakIterator CreateSentenceClone()
{
    lock (syncLock)
    {
        object copy = sentenceProto.Clone();
        return (BreakIterator)copy;
    }
}
/// <summary>
/// Initializes a new <see cref="BreakEnumerator"/> that stores a clone of <paramref name="iterator"/>
/// rather than the instance passed in.
/// </summary>
/// <param name="iterator">The break iterator to clone; must not be <c>null</c>.</param>
/// <exception cref="System.ArgumentNullException"><paramref name="iterator"/> is <c>null</c>.</exception>
internal BreakEnumerator(BreakIterator iterator)
{
    // Fail with a descriptive exception instead of a NullReferenceException from the Clone() call.
    if (iterator is null)
    {
        throw new System.ArgumentNullException(nameof(iterator));
    }

    _breakIterator = iterator.Clone();
}