A token filter that truncates terms to a specific length. Fixed-prefix truncation, as a stemming method, produces good results on the Turkish language. It is reported that F5, which uses the first 5 characters, produced the best results in Information Retrieval on Turkish Texts.
Inheritance: TokenFilter
コード例 #1
0
        /// <summary>
        /// Checks that a <see cref="TruncateTokenFilter"/> built with a length of 5
        /// emits at most the first five characters of every token: longer tokens are
        /// cut down, tokens of five characters or fewer come through unchanged, and
        /// the expected output keeps the original letter case.
        /// </summary>
        public virtual void TestTruncating()
        {
            const int prefixLength = 5;

            // Expected terms, in the same order as the tokens appear in the input text.
            string[] expectedTerms = { "abcde", "12345", "ABCDE", "abcde", "abc", "12345", "123" };

            var input = new StringReader("abcdefg 1234567 ABCDEFG abcde abc 12345 123");
            TokenStream tokenizer = new MockTokenizer(input, MockTokenizer.WHITESPACE, false);
            TokenStream truncated = new TruncateTokenFilter(tokenizer, prefixLength);

            AssertTokenStreamContents(truncated, expectedTerms);
        }
コード例 #2
0
 /// <summary>
 /// Runs a fixed sample text through <see cref="TruncateTokenFilter"/> with a
 /// maximum length of 5 and asserts the resulting terms: tokens longer than five
 /// characters are reduced to their first five, shorter ones are left as they are.
 /// </summary>
 public virtual void TestTruncating()
 {
     // Build the whole analysis chain in one expression: tokenizer wrapped by the filter.
     TokenStream filtered = new TruncateTokenFilter(
         new MockTokenizer(
             new StringReader("abcdefg 1234567 ABCDEFG abcde abc 12345 123"),
             MockTokenizer.WHITESPACE,
             false),
         5);

     string[] expected = new string[]
     {
         "abcde", "12345", "ABCDE", "abcde", "abc", "12345", "123"
     };

     AssertTokenStreamContents(filtered, expected);
 }