Exemplo n.º 1
0
        /// <summary>
        /// Tokenizes text piped to standard input and prints one table row per token
        /// (term, start/end offsets, position increment and length, base form, part of
        /// speech, inflection form/type, pronunciation/reading). Each row is followed by
        /// the glosses of any dictionary entries found for the term.
        /// </summary>
        /// <remarks>
        /// When standard input is not redirected, prints "tandoku: expecting input" and
        /// returns without reading anything.
        /// </remarks>
        internal static void Dump()
        {
            if (!Console.IsInputRedirected)
            {
                Console.WriteLine("tandoku: expecting input");
                return;
            }

            // NOTE(review): Utf8EncodingOverride is defined elsewhere; presumably it switches
            // the console encoding to UTF-8 for the lifetime of this scope — confirm.
            using (new Utf8EncodingOverride())
            {
                // Ignore input encoding from console, use UTF-8 by default
                using var r = new StreamReader(Console.OpenStandardInput());

                // TODO: try out different modes (normal vs search)

                var factory = new JapaneseTokenizerFactory(new Dictionary<string, string>());

                // The token stream is disposable; 'using' guarantees it is released even if
                // tokenization or console output throws part-way through.
                using var tokenizer = factory.Create(r);

                var termAttr = tokenizer.GetAttribute<ICharTermAttribute>();
                var offsetAttr = tokenizer.GetAttribute<IOffsetAttribute>();
                var posIncrAttr = tokenizer.GetAttribute<IPositionIncrementAttribute>();
                var posLenAttr = tokenizer.GetAttribute<IPositionLengthAttribute>();
                var baseFormAttr = tokenizer.GetAttribute<IBaseFormAttribute>();
                var inflectionAttr = tokenizer.GetAttribute<IInflectionAttribute>();
                var posAttr = tokenizer.GetAttribute<IPartOfSpeechAttribute>();
                var readingAttr = tokenizer.GetAttribute<IReadingAttribute>();

                tokenizer.Reset();

                // Table header; column widths mirror the per-token rows below.
                Console.Write($"{"Num",-3}  {"Term",-10}");
                Console.Write($" {"St",-2}/{"En",-2}");
                Console.Write($" {"Ic",-2} {"Ln",-2}");
                Console.Write($" {"Base",-10}");
                Console.Write($" {"POS",-18}");
                Console.Write($" {"InflForm",-8} / {"InflType",-16}");
                Console.Write($" {"Pronunctn",-10} / {"Reading",-10}");
                Console.WriteLine();

                int i = 0;
                while (tokenizer.IncrementToken())
                {
                    Console.Write($"{i++,3}: {Align(termAttr, 10)}");
                    Console.Write($" {offsetAttr.StartOffset,2}/{offsetAttr.EndOffset,2}");
                    Console.Write($" {posIncrAttr.PositionIncrement,2} {posLenAttr.PositionLength,2}");
                    Console.Write($" {Align(baseFormAttr.GetBaseForm(), 10)}");
                    Console.Write($" {Align(posAttr.GetPartOfSpeech(), 18)}");
                    Console.Write($" {Align(inflectionAttr.GetInflectionForm(), 8)} / {Align(inflectionAttr.GetInflectionType(), 16)}");
                    Console.Write($" {Align(readingAttr.GetPronunciation(), 10)} / {Align(readingAttr.GetReading(), 10)}");
                    Console.WriteLine();

                    // Materialize the term once; it is used for both lookups.
                    var term = termAttr.ToString();

                    // Try the kanji lookup first and fall back to the reading lookup.
                    if (DictionaryLookupDemo.TryLookupKanji(term, out var entries) ||
                        DictionaryLookupDemo.TryLookupReading(term, out entries))
                    {
                        foreach (var entry in entries)
                        {
                            Console.WriteLine($"    {string.Join("; ", entry.Glosses)}");
                        }
                    }
                }

                tokenizer.End();
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Lazily tokenizes <paramref name="text"/> and yields one <see cref="Token"/> per
        /// token produced by the tokenizer created from <c>_tokenizerFactory</c>.
        /// </summary>
        /// <param name="text">The text to tokenize.</param>
        /// <returns>
        /// A deferred sequence of tokens. <c>Ordinal</c> starts at 1 and increases by one
        /// for each token yielded. No work is done until the sequence is enumerated.
        /// </returns>
        /// <remarks>
        /// The underlying token stream is released when enumeration completes or when the
        /// enumerator is disposed early (for example by breaking out of a <c>foreach</c>).
        /// </remarks>
        public IEnumerable<Token> Tokenize(string text)
        {
            // TODO: optimize this to reuse tokenizer
            // The 'using' block sits inside the iterator, so disposing the enumerator
            // (including on early exit or an exception) also disposes the tokenizer.
            using (var tokenizer = _tokenizerFactory.Create(new StringReader(text)))
            {
                var termAttr = tokenizer.GetAttribute<ICharTermAttribute>();
                var offsetAttr = tokenizer.GetAttribute<IOffsetAttribute>();
                var posIncrAttr = tokenizer.GetAttribute<IPositionIncrementAttribute>();
                var posLenAttr = tokenizer.GetAttribute<IPositionLengthAttribute>();
                var baseFormAttr = tokenizer.GetAttribute<IBaseFormAttribute>();
                var inflectionAttr = tokenizer.GetAttribute<IInflectionAttribute>();
                var posAttr = tokenizer.GetAttribute<IPartOfSpeechAttribute>();
                var readingAttr = tokenizer.GetAttribute<IReadingAttribute>();

                tokenizer.Reset();
                long ordinal = 0;

                while (tokenizer.IncrementToken())
                {
                    // Attribute values are copied into the Token before yielding, while the
                    // stream is still positioned on this token.
                    yield return new Token
                    {
                        Ordinal = ++ordinal,
                        Term = termAttr.ToString(),
                        StartOffset = offsetAttr.StartOffset,
                        EndOffset = offsetAttr.EndOffset,
                        PositionIncrement = posIncrAttr.PositionIncrement,
                        PositionLength = posLenAttr.PositionLength,
                        BaseForm = baseFormAttr.GetBaseForm(),
                        PartOfSpeech = posAttr.GetPartOfSpeech(),
                        InflectionForm = inflectionAttr.GetInflectionForm(),
                        InflectionType = inflectionAttr.GetInflectionType(),
                        Pronunciation = readingAttr.GetPronunciation(),
                        Reading = readingAttr.GetReading(),
                    };
                }

                tokenizer.End();
            }
        }