예제 #1
0
        /// <summary>
        /// Construct a new <see cref="ICUTokenizer"/> that breaks text into words from the given
        /// <see cref="TextReader"/>, using a tailored <see cref="BreakIterator"/> configuration.
        /// </summary>
        /// <param name="factory"><see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> to use.</param>
        /// <param name="input"><see cref="TextReader"/> containing text to tokenize.</param>
        /// <param name="config">Tailored <see cref="BreakIterator"/> configuration. Must not be <c>null</c>.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="config"/> is <c>null</c>.</exception>
        public ICUTokenizer(AttributeFactory factory, TextReader input, ICUTokenizerConfig config)
            : base(factory, input)
        {
            // Validate up front so a missing configuration is reported at the call site,
            // before it is stored and handed to the composite break iterator.
            if (config is null)
            {
                throw new System.ArgumentNullException(nameof(config));
            }

            // Register the attributes this tokenizer populates. The registration order is
            // kept exactly as before.
            offsetAtt = AddAttribute<IOffsetAttribute>();
            termAtt = AddAttribute<ICharTermAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
            scriptAtt = AddAttribute<IScriptAttribute>();

            this.config = config;
            breaker = new CompositeBreakIterator(config);
        }
예제 #2
0
        /// <summary>
        /// Tokenizes a short English sentence and checks, for every emitted token, that the
        /// script attribute reports Latin through its code, its name, its short name and the
        /// reflected string form of the stream.
        /// </summary>
        public void TestTokenAttributes()
        {
            using (TokenStream stream = a.GetTokenStream("dummy", "This is a test"))
            {
                IScriptAttribute script = stream.AddAttribute<IScriptAttribute>();

                stream.Reset();
                while (stream.IncrementToken())
                {
                    // Numeric script code.
                    assertEquals(UScript.Latin, script.Code);

                    // Long script name.
                    var expectedName = UScript.GetName(UScript.Latin);
                    assertEquals(expectedName, script.GetName());

                    // Abbreviated script name.
                    var expectedShortName = UScript.GetShortName(UScript.Latin);
                    assertEquals(expectedShortName, script.GetShortName());

                    // The reflected dump of the stream must mention the script as well.
                    var reflected = stream.ReflectAsString(false);
                    assertTrue(reflected.Contains("script=Latin"));
                }
                stream.End();
            }
        }