Tokenize Chinese text as individual chinese chars.

ChineseTokenizer and CJKTokenizer differ in how they segment the input into tokens.

For example, if the Chinese text "C1C2C3C4" is to be indexed:

  • The tokens returned from ChineseTokenizer are C1, C2, C3, C4
  • The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.

Therefore the index created by CJKTokenizer is much larger.

The problem is that when searching for C1, C1C2, C1C3, C4C2, C1C2C3 ... the ChineseTokenizer works, but the CJKTokenizer will not work.

Inheritance: Lucene.Net.Analysis.Tokenizer
Example #1
0
        /// <summary>
        /// Verifies that <c>ChineseTokenizer</c> emits one token per character for the
        /// mixed Latin/Chinese input "a天b", with both start and end offsets advancing
        /// by exactly one for each successive token.
        /// </summary>
        public void TestOtherLetterOffset()
        {
            String s = "a天b";
            ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));

            int correctStartOffset = 0;
            int correctEndOffset = 1;
            IOffsetAttribute offsetAtt = tokenizer.GetAttribute<IOffsetAttribute>();

            // The TokenStream contract requires Reset() before the first
            // IncrementToken() call; the original snippet omitted it.
            tokenizer.Reset();
            try
            {
                while (tokenizer.IncrementToken())
                {
                    Assert.AreEqual(correctStartOffset, offsetAtt.StartOffset);
                    Assert.AreEqual(correctEndOffset, offsetAtt.EndOffset);
                    correctStartOffset++;
                    correctEndOffset++;
                }
                // End() finalizes offsets per the TokenStream workflow.
                tokenizer.End();
            }
            finally
            {
                // Release the underlying reader even if an assertion fails.
                tokenizer.Dispose();
            }
        }
        /// <summary>
        /// Verifies that <c>ChineseTokenizer</c> emits one token per character for the
        /// mixed Latin/Chinese input "a天b", with start/end offsets advancing by one
        /// per token. Follows the TokenStream workflow: Reset, IncrementToken loop,
        /// End, Dispose.
        /// </summary>
        public virtual void TestOtherLetterOffset()
        {
            string s = "a天b";
            ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));

            int correctStartOffset = 0;
            int correctEndOffset = 1;
            IOffsetAttribute offsetAtt = tokenizer.GetAttribute<IOffsetAttribute>();
            tokenizer.Reset();
            try
            {
                while (tokenizer.IncrementToken())
                {
                    assertEquals(correctStartOffset, offsetAtt.StartOffset());
                    assertEquals(correctEndOffset, offsetAtt.EndOffset());
                    correctStartOffset++;
                    correctEndOffset++;
                }
                tokenizer.End();
            }
            finally
            {
                // Dispose in finally so the tokenizer is released even when an
                // assertion throws (the original leaked it on test failure).
                tokenizer.Dispose();
            }
        }
Example #3
0
        /// <summary>
        /// Checks per-character tokenization of the mixed input "a天b": every token's
        /// start offset and end offset must each advance by one, starting at (0, 1).
        /// </summary>
        public virtual void TestOtherLetterOffset()
        {
            string text = "a天b";
            ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(text));

            int expectedStart = 0;
            int expectedEnd = 1;
            IOffsetAttribute offsetAtt = tokenizer.GetAttribute<IOffsetAttribute>();

            // Standard TokenStream workflow: Reset, consume, End, Dispose.
            tokenizer.Reset();
            while (tokenizer.IncrementToken())
            {
                assertEquals(expectedStart, offsetAtt.StartOffset);
                assertEquals(expectedEnd, offsetAtt.EndOffset);
                expectedStart++;
                expectedEnd++;
            }
            tokenizer.End();
            tokenizer.Dispose();
        }
Example #4
0
		/// <summary>
		/// Creates a TokenStream which tokenizes all the text in the provided Reader.
		/// </summary>
		/// <returns>A TokenStream build from a ChineseTokenizer filtered with ChineseFilter.</returns>
		/// <summary>
		/// Creates a TokenStream which tokenizes all the text in the provided Reader.
		/// </summary>
		/// <returns>A TokenStream built from a ChineseTokenizer filtered with ChineseFilter.</returns>
		public override sealed TokenStream TokenStream(String fieldName, TextReader reader) 
		{
			// Per-character tokenization, then post-filtering with ChineseFilter.
			return new ChineseFilter(new ChineseTokenizer(reader));
		}
Example #5
0
        /// <summary>
        /// Creates
        /// <see cref="TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <returns> <see cref="TokenStreamComponents"/>
        ///         built from a <see cref="ChineseTokenizer"/> filtered with
        ///         <see cref="ChineseFilter"/> </returns>
        /// <summary>
        /// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the
        /// text in the provided <see cref="TextReader"/>: a
        /// <see cref="ChineseTokenizer"/> source wrapped in a
        /// <see cref="ChineseFilter"/>.
        /// </summary>
        /// <returns> <see cref="TokenStreamComponents"/> for this analyzer </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new ChineseTokenizer(reader);
            TokenStream filtered = new ChineseFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filtered);
        }
 /// <summary>
 /// Wires a per-character <see cref="ChineseTokenizer"/> into a
 /// <see cref="ChineseFilter"/> as this analyzer's component chain.
 /// </summary>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     var tok = new ChineseTokenizer(reader);
     return new TokenStreamComponents(tok, new ChineseFilter(tok));
 }