Exemplo n.º 1
0
        /// <summary>
        /// Tokenizes <paramref name="text"/> three times - as a string, through a
        /// <see cref="StringReader"/>, and as a character sequence - and checks that the
        /// token texts of every run equal <paramref name="expected"/>.
        /// </summary>
        /// <param name="text">Input passed to the shared tokenizer.</param>
        /// <param name="expected">Expected token texts, in order.</param>
        private void Test(string text, params string[] expected)
        {
            // Input given directly as a string.
            string[] fromString = tokenizer.Tokenize(text)
                                           .Select(token => token.Text)
                                           .ToArray();
            CollectionAssert.AreEqual(expected, fromString);

            // Input given through a text reader.
            StringReader reader = new StringReader(text);
            string[] fromReader = tokenizer.Tokenize(reader)
                                           .Select(token => token.Text)
                                           .ToArray();
            CollectionAssert.AreEqual(expected, fromReader);

            // Input given as a plain sequence of characters.
            IEnumerable<char> characters = text;
            string[] fromCharacters = tokenizer.Tokenize(characters)
                                               .Select(token => token.Text)
                                               .ToArray();
            CollectionAssert.AreEqual(expected, fromCharacters);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Rough throughput measurement: tokenizes the text returned by
        /// <c>DarbaLikums(size)</c> several times from memory and writes the timings
        /// to the debug output. The only assertions are that the text has the requested
        /// length and that at least one token was produced.
        /// </summary>
        public void TokenizeString()
        {
            int count     = 5;
            int mebibytes = 10;
            int size      = mebibytes * 1024 * 1024;

            string text = DarbaLikums(size);

            Assert.AreEqual(size, text.Length);

            LatvianTokenizer tokenizer = new LatvianTokenizer();

            Stopwatch timer      = new Stopwatch();
            int       tokenCount = 0;

            timer.Start();
            for (int i = 0; i < count; i++)
            {
                foreach (Token token in tokenizer.Tokenize(text))
                {
                    tokenCount++;
                }
            }
            timer.Stop();

            Assert.IsTrue(tokenCount > 0);

            // Elapsed.TotalMilliseconds is a double, so the per-run average keeps its
            // fraction; ElapsedMilliseconds / count was integer division and made the
            // "0.000" format always print ".000".
            Debug.WriteLine("Tokenize string ({1} MiB): {0:0.000} ms", timer.Elapsed.TotalMilliseconds / count, mebibytes);
            // The value is mebibytes per second; the label text is kept as it was.
            Debug.WriteLine("Tokenize string: {0:0.000} MB/s", (mebibytes * count) / timer.Elapsed.TotalSeconds);
            Debug.WriteLine("Tokenize string: {0:0.000} tokens/s", tokenCount / timer.Elapsed.TotalSeconds);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Registers TimeSpanToken and then ClockToken on an emptied tokenizer, relocates
        /// ClockToken to index 0 in three different ways, and checks each time that the
        /// first token produced for "00:00:00" is a ClockToken.
        /// </summary>
        public void Reorder()
        {
            const string input = "00:00:00";

            // Variant 1: generic Remove followed by generic Insert at index 0.
            LatvianTokenizer viaGenerics = new LatvianTokenizer(compile: false);
            viaGenerics.Clear();
            viaGenerics.Add<TimeSpanToken>(); // matches 00:00:00
            viaGenerics.Add<ClockToken>();    // matches 00:00:00
            viaGenerics.Remove<ClockToken>();
            viaGenerics.Insert<ClockToken>(0);
            Token first = viaGenerics.Tokenize(input).First();
            Assert.IsTrue(first is ClockToken);

            // Variant 2: the same steps through the System.Type based overloads.
            LatvianTokenizer viaTypes = new LatvianTokenizer(compile: false);
            viaTypes.Clear();
            viaTypes.Add<TimeSpanToken>(); // matches 00:00:00
            viaTypes.Add<ClockToken>();    // matches 00:00:00
            viaTypes.Remove(typeof(ClockToken));
            viaTypes.Insert(0, typeof(ClockToken));
            first = viaTypes.Tokenize(input).First();
            Assert.IsTrue(first is ClockToken);

            // Variant 3: a single Move call instead of Remove + Insert.
            LatvianTokenizer viaMove = new LatvianTokenizer(compile: false);
            viaMove.Clear();
            viaMove.Add<TimeSpanToken>(); // matches 00:00:00
            viaMove.Add<ClockToken>();    // matches 00:00:00
            viaMove.Move<ClockToken>(0);
            first = viaMove.Tokenize(input).First();
            Assert.IsTrue(first is ClockToken);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Adds EmotionToken to a non-compiled tokenizer and checks that the first two
        /// EmotionToken instances found in the sample sentence are ":)" and ":(".
        /// </summary>
        public void Emotions()
        {
            LatvianTokenizer emotionAware = new LatvianTokenizer(compile: false);
            emotionAware.Add<EmotionToken>();

            // Keep only the emotion tokens; everything else in the sentence is ignored.
            EmotionToken[] emotions = emotionAware
                .Tokenize("Šodien esmu :) bet vakar biju :(")
                .OfType<EmotionToken>()
                .ToArray();

            Assert.AreEqual(":)", emotions[0].Text);
            Assert.AreEqual(":(", emotions[1].Text);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Removes InitialsToken from a non-compiled tokenizer and checks that "A.Bērziņš"
        /// then starts with the three tokens "A", "." and "Bērziņš".
        /// </summary>
        public void InitialsRemovedNotCompiled()
        {
            LatvianTokenizer withoutInitials = new LatvianTokenizer(compile: false);
            withoutInitials.Remove<InitialsToken>();

            Token[] parts = withoutInitials.Tokenize("A.Bērziņš").ToArray();

            Assert.AreEqual("A", parts[0].Text);
            Assert.AreEqual(".", parts[1].Text);
            Assert.AreEqual("Bērziņš", parts[2].Text);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Registers TimeSpanToken before ClockToken on an emptied, non-compiled tokenizer
        /// and checks that the first token produced for "00:00:00" is a TimeSpanToken.
        /// </summary>
        public void EqualMatch()
        {
            LatvianTokenizer ordered = new LatvianTokenizer(compile: false);
            ordered.Clear();
            ordered.Add<TimeSpanToken>(); // matches 00:00:00
            ordered.Add<ClockToken>();    // matches 00:00:00

            Token first = ordered.Tokenize("00:00:00").First();

            Assert.IsTrue(first is TimeSpanToken);
        }
Exemplo n.º 7
0
        /// <summary>
        /// Sets IncludeWhitespace to true and checks that "123 456" yields "123", " " and
        /// "456" as its first three tokens.
        /// </summary>
        public void WhitespaceIncluded()
        {
            LatvianTokenizer keepsWhitespace = new LatvianTokenizer();
            keepsWhitespace.IncludeWhitespace = true;

            Token[] parts = keepsWhitespace.Tokenize("123 456").ToArray();

            Assert.AreEqual("123", parts[0].Text);
            Assert.AreEqual(" ", parts[1].Text);
            Assert.AreEqual("456", parts[2].Text);
        }
Exemplo n.º 8
0
        /// <summary>
        /// Saves a default tokenizer to a temporary file, constructs a new tokenizer from
        /// that file name, and checks that it splits "123 456" into "123" and "456".
        /// </summary>
        public void LoadSave()
        {
            // GetTempFileName creates a real file on disk, so it has to be removed again.
            string filename = Path.GetTempFileName();

            try
            {
                LatvianTokenizer tokenizer = new LatvianTokenizer();

                tokenizer.Save(filename);

                tokenizer = new LatvianTokenizer(filename);

                Token[] tokens = tokenizer.Tokenize("123 456").ToArray();
                Assert.AreEqual("123", tokens[0].Text);
                Assert.AreEqual("456", tokens[1].Text);
            }
            finally
            {
                // Runs on failure too, so a failing assertion does not leave the file behind.
                File.Delete(filename);
            }
        }
Exemplo n.º 9
0
        /// <summary>
        /// Tokenizes the quick-start sample text, groups the tokens with BreakSentences,
        /// and checks that the result is three sentences of four items each.
        /// </summary>
        public void QuickStart_BreakSentences()
        {
            const string sample = "Sveika, pasaule! Man iet labi. Šodienas datums ir 2014-01-01";

            LatvianTokenizer tokenizer = new LatvianTokenizer();

            Token[] allTokens = tokenizer.Tokenize(sample).ToArray();
            Sentence[] sentences = tokenizer.BreakSentences(allTokens).ToArray();

            Assert.AreEqual(3, sentences.Length);

            // Exactly three sentences are guaranteed by the assertion above.
            foreach (Sentence sentence in sentences)
            {
                Assert.AreEqual(4, sentence.Count());
            }
        }
Exemplo n.º 10
0
        /// <summary>
        /// Rough throughput measurement: writes the text returned by <c>DarbaLikums(size)</c>
        /// to a temporary file, tokenizes that file several times through a
        /// <see cref="StreamReader"/>, and writes the timings to the debug output.
        /// The temporary file is removed even when an assertion fails.
        /// </summary>
        public void TokenizeFile()
        {
            int count     = 5;
            int mebibytes = 10;
            int size      = mebibytes * 1024 * 1024;

            string text = DarbaLikums(size);

            Assert.AreEqual(size, text.Length);

            // GetTempFileName creates a real file on disk, so it has to be removed again.
            string filename = Path.GetTempFileName();

            try
            {
                File.WriteAllText(filename, text);
                Assert.AreEqual(size, File.ReadAllText(filename).Length);

                LatvianTokenizer tokenizer = new LatvianTokenizer();

                Stopwatch timer      = new Stopwatch();
                int       tokenCount = 0;

                timer.Start();
                for (int i = 0; i < count; i++)
                {
                    using (StreamReader reader = new StreamReader(filename))
                    {
                        foreach (Token token in tokenizer.Tokenize(reader))
                        {
                            tokenCount++;
                        }
                    }
                }
                timer.Stop();

                Assert.IsTrue(tokenCount > 0);

                // Elapsed.TotalMilliseconds is a double, so the per-run average keeps its
                // fraction; ElapsedMilliseconds / count was integer division and made the
                // "0.000" format always print ".000".
                Debug.WriteLine("Tokenize file ({1} MiB): {0:0.000} ms", timer.Elapsed.TotalMilliseconds / count, mebibytes);
                // The value is mebibytes per second; the label text is kept as it was.
                Debug.WriteLine("Tokenize file: {0:0.000} MB/s", (mebibytes * count) / timer.Elapsed.TotalSeconds);
                Debug.WriteLine("Tokenize file: {0:0.000} tokens/s", tokenCount / timer.Elapsed.TotalSeconds);
            }
            finally
            {
                // Runs on failure too, so a failing assertion does not leave the file behind.
                File.Delete(filename);
            }
        }
Exemplo n.º 11
0
        /// <summary>
        /// Tokenizes the quick-start sample sentence, logs every token to the debug output,
        /// and verifies the token count, the text/type/position data of tokens 0-7 and 11,
        /// and that the last DateToken seen is token 11 with the date 1 January 2014.
        /// </summary>
        public void QuickStart()
        {
            List<Token> tokens    = new List<Token>();
            DateToken   dateToken = null;

            Debug.WriteLine("Latvian.Tests.Tokenization.ReadmeTests.QuickStart starts");

            string text = "Sveika, pasaule! Man iet labi. Šodienas datums ir 2014-01-01";

            LatvianTokenizer tokenizer = new LatvianTokenizer();

            foreach (Token token in tokenizer.Tokenize(text))
            {
                Debug.WriteLine("Line {0}: Pos {1}: Type: {2} Token: {3}",
                                token.Line, token.Position, token.GetType(), token.Text);

                tokens.Add(token);

                // A single "as" cast replaces the previous "is" check followed by "as".
                DateToken candidate = token as DateToken;
                if (candidate != null)
                {
                    dateToken = candidate;
                    Debug.WriteLine(dateToken.DateTime.ToString("dd/MM/yyyy"));
                }
            }

            Debug.WriteLine("Latvian.Tests.Tokenization.ReadmeTests.QuickStart end");

            Assert.AreEqual(12, tokens.Count);

            AssertQuickStartToken(tokens[0], "Sveika", typeof(WordToken), 0, 6);
            AssertQuickStartToken(tokens[1], ",", typeof(PunctuationToken), 6, 7);
            AssertQuickStartToken(tokens[2], "pasaule", typeof(WordToken), 8, 15);
            AssertQuickStartToken(tokens[3], "!", typeof(PunctuationToken), 15, 16);
            AssertQuickStartToken(tokens[4], "Man", typeof(WordToken), 17, 20);
            AssertQuickStartToken(tokens[5], "iet", typeof(WordToken), 21, 24);
            AssertQuickStartToken(tokens[6], "labi", typeof(WordToken), 25, 29);
            AssertQuickStartToken(tokens[7], ".", typeof(PunctuationToken), 29, 30);

            // Tokens 8-10 are deliberately not inspected, matching the original test.
            AssertQuickStartToken(tokens[11], "2014-01-01", typeof(DateToken), 50, 60);

            Assert.AreEqual(tokens[11], dateToken);

            // "/" in a custom date format is replaced by the current culture's date separator,
            // so "dd/MM/yyyy" only produced "01.01.2014" on machines whose culture uses ".".
            // A literal "." with the invariant culture makes the check machine-independent.
            Assert.AreEqual("01.01.2014",
                            dateToken.DateTime.ToString("dd.MM.yyyy", System.Globalization.CultureInfo.InvariantCulture));
        }

        /// <summary>
        /// Asserts the text, exact runtime type and position data of one quick-start token.
        /// The sample is a single line, so Line and LineEnd must be 0 and the in-line
        /// positions must equal the absolute positions.
        /// </summary>
        private static void AssertQuickStartToken(Token token, string text, System.Type type, int start, int end)
        {
            Assert.AreEqual(text, token.Text);
            Assert.AreEqual(type, token.GetType());
            Assert.AreEqual(start, token.Position);
            Assert.AreEqual(end, token.PositionEnd);
            Assert.AreEqual(0, token.Line);
            Assert.AreEqual(0, token.LineEnd);
            Assert.AreEqual(start, token.LinePosition);
            Assert.AreEqual(end, token.LinePositionEnd);
        }