LUCENENET specific class to patch the behavior of the ICU BreakIterator. Corrects the breaking of words by finding transitions between Thai and non-Thai characters. This logic assumes that the Java BreakIterator also breaks up Thai numerals from Arabic numerals (1, 2, 3, etc.). That is, it assumes the first test below passes and the second test fails in Lucene (not attempted). ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET); AssertAnalyzesTo(analyzer, "๑๒๓456", new string[] { "๑๒๓", "456" }); AssertAnalyzesTo(analyzer, "๑๒๓456", new string[] { "๑๒๓456" });
Exemple #1
0
        /// <summary>
        /// Creates a new <see cref="ThaiTokenizer"/>, supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> </summary>
        public ThaiTokenizer(AttributeFactory factory, TextReader reader)
            : base(factory, reader, (BreakIterator)sentenceProto.Clone())
        {
            // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator

            wordBreaker = new ThaiWordBreaker((BreakIterator)proto.Clone());
            termAtt     = AddAttribute <ICharTermAttribute>();
            offsetAtt   = AddAttribute <IOffsetAttribute>();
        }
Exemple #2
0
 /// <summary>
 /// Creates a new <see cref="ThaiTokenizer"/>, supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> </summary>
 public ThaiTokenizer(AttributeFactory factory, TextReader reader)
     : base(factory, reader, new IcuBreakIterator(global::Icu.BreakIterator.UBreakIteratorType.SENTENCE, new CultureInfo("th")))
 {
     if (!DBBI_AVAILABLE)
     {
         throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
     }
     wordBreaker = new ThaiWordBreaker(new IcuBreakIterator(global::Icu.BreakIterator.UBreakIteratorType.WORD, CultureInfo.InvariantCulture));
     termAtt     = AddAttribute <ICharTermAttribute>();
     offsetAtt   = AddAttribute <IOffsetAttribute>();
 }
Exemple #3
0
 /// <summary>
 /// Creates a new ThaiTokenizer, supplying the AttributeFactory </summary>
 public ThaiTokenizer(AttributeFactory factory, TextReader reader)
     : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
 {
     if (!DBBI_AVAILABLE)
     {
         throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
     }
     wordBreaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(Locale.GetUS()));
     termAtt     = AddAttribute <ICharTermAttribute>();
     offsetAtt   = AddAttribute <IOffsetAttribute>();
 }
 /// <summary>
 /// Creates a new ThaiTokenizer, supplying the AttributeFactory </summary>
 public ThaiTokenizer(AttributeFactory factory, TextReader reader)
       : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
 {
     if (!DBBI_AVAILABLE)
     {
         throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
     }
     wordBreaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(Locale.GetUS()));
     termAtt = AddAttribute<ICharTermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
 }
Exemple #5
0
        /// <summary>
        /// Creates a new <see cref="ThaiTokenizer"/>, supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> </summary>
        public ThaiTokenizer(AttributeFactory factory, TextReader reader)
            : base(factory, reader, CreateSentenceClone())
        {
            // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator

            UninterruptableMonitor.Enter(syncLock);
            try
            {
                wordBreaker = new ThaiWordBreaker((BreakIterator)proto.Clone());
            }
            finally
            {
                UninterruptableMonitor.Exit(syncLock);
            }
            termAtt   = AddAttribute <ICharTermAttribute>();
            offsetAtt = AddAttribute <IOffsetAttribute>();
        }