Exemple #1
0
        /// <summary>
        /// Rough throughput benchmark: tokenizes a 10 MiB in-memory string five times and
        /// writes the average time per pass, MiB/s and tokens/s to the debug output.
        /// </summary>
        public void TokenizeString()
        {
            int count     = 5;
            int mebibytes = 10;
            int size      = mebibytes * 1024 * 1024;

            // DarbaLikums is defined elsewhere; the assertion below checks that it
            // returned exactly `size` characters.
            string text = DarbaLikums(size);

            Assert.AreEqual(size, text.Length);

            LatvianTokenizer tokenizer = new LatvianTokenizer();

            Stopwatch timer      = new Stopwatch();
            int       tokenCount = 0;

            timer.Start();
            for (int i = 0; i < count; i++)
            {
                foreach (Token token in tokenizer.Tokenize(text))
                {
                    tokenCount++;
                }
            }
            timer.Stop();

            Assert.IsTrue(tokenCount > 0);

            // Use the floating-point elapsed values: ElapsedMilliseconds / count is an
            // integer division and would always print ".000" for the fractional part.
            double averageMilliseconds = timer.Elapsed.TotalMilliseconds / count;
            double totalSeconds        = timer.Elapsed.TotalSeconds;

            Debug.WriteLine("Tokenize string ({1} MiB): {0:0.000} ms", averageMilliseconds, mebibytes);
            // The rate is computed from mebibytes, so it is labelled MiB/s.
            Debug.WriteLine("Tokenize string: {0:0.000} MiB/s", (mebibytes * count) / totalSeconds);
            Debug.WriteLine("Tokenize string: {0:0.000} tokens/s", tokenCount / totalSeconds);
        }
Exemple #2
0
        /// <summary>
        /// Quick-start sample: runs TokenizeSentences over a three-sentence text, copies the
        /// tokens of each sentence into a list, and expects 3 sentences of 4 tokens each.
        /// </summary>
        public void QuickStart_TokenizeSentences()
        {
            string input = "Sveika, pasaule! Man iet labi. Šodienas datums ir 2014-01-01";

            var splitter  = new LatvianTokenizer();
            var collected = new List<Sentence>();

            foreach (Sentence current in splitter.TokenizeSentences(input))
            {
                // Copy the sentence token by token, as a consumer of the API would.
                var copy = new List<Token>();

                foreach (Token item in current)
                {
                    copy.Add(item);
                }

                collected.Add(copy);
            }

            Assert.AreEqual(3, collected.Count);
            Assert.AreEqual(4, collected[0].Count());
            Assert.AreEqual(4, collected[1].Count());
            Assert.AreEqual(4, collected[2].Count());
        }
Exemple #3
0
        /// <summary>
        /// Checks that the first token of "vārds" exposes that same text through both
        /// its Text property and ToString().
        /// </summary>
        public void Text()
        {
            var tokenizer = new LatvianTokenizer();
            Token first   = tokenizer.Tokenize("vārds").First();

            Assert.AreEqual("vārds", first.Text);
            Assert.AreEqual("vārds", first.ToString());
        }
Exemple #4
0
        /// <summary>
        /// Checks that "123" tokenizes to an OrdinalNumberToken whose Value can be used
        /// in arithmetic (doubling it gives 246).
        /// </summary>
        public void OrdinalNumberToken()
        {
            var tokenizer = new LatvianTokenizer();
            Token first   = tokenizer.Tokenize("123").First();

            Assert.IsTrue(first is OrdinalNumberToken);

            // Cast only after the type assertion so a wrong type fails the assert, not the cast.
            OrdinalNumberToken number = (OrdinalNumberToken)first;
            Assert.AreEqual(123 * 2, number.Value * 2);
        }
Exemple #5
0
        /// <summary>
        /// Registers TimeSpanToken before ClockToken (both match "00:00:00"), moves ClockToken
        /// to index 0 in three different ways, and expects ClockToken to win each time.
        /// </summary>
        public void Reorder()
        {
            // Variant 1: generic Remove<T>() followed by generic Insert<T>(index).
            LatvianTokenizer viaGenerics = new LatvianTokenizer(compile: false);
            viaGenerics.Clear();
            viaGenerics.Add<TimeSpanToken>(); // matches 00:00:00
            viaGenerics.Add<ClockToken>();    // matches 00:00:00
            viaGenerics.Remove<ClockToken>();
            viaGenerics.Insert<ClockToken>(0);
            Token first = viaGenerics.Tokenize("00:00:00").First();
            Assert.IsTrue(first is ClockToken);

            // Variant 2: the same steps through the Type-based overloads.
            LatvianTokenizer viaTypes = new LatvianTokenizer(compile: false);
            viaTypes.Clear();
            viaTypes.Add<TimeSpanToken>(); // matches 00:00:00
            viaTypes.Add<ClockToken>();    // matches 00:00:00
            viaTypes.Remove(typeof(ClockToken));
            viaTypes.Insert(0, typeof(ClockToken));
            first = viaTypes.Tokenize("00:00:00").First();
            Assert.IsTrue(first is ClockToken);

            // Variant 3: a single Move<T>(index) call.
            LatvianTokenizer viaMove = new LatvianTokenizer(compile: false);
            viaMove.Clear();
            viaMove.Add<TimeSpanToken>(); // matches 00:00:00
            viaMove.Add<ClockToken>();    // matches 00:00:00
            viaMove.Move<ClockToken>(0);
            first = viaMove.Tokenize("00:00:00").First();
            Assert.IsTrue(first is ClockToken);
        }
Exemple #6
0
        /// <summary>
        /// Checks that "2014-01-01" tokenizes to a DateToken whose DateTime has year 2014.
        /// </summary>
        public void DateToken()
        {
            var tokenizer = new LatvianTokenizer();
            Token first   = tokenizer.Tokenize("2014-01-01").First();

            Assert.IsTrue(first is DateToken);

            // Cast only after the type assertion so a wrong type fails the assert, not the cast.
            DateToken date = (DateToken)first;
            Assert.AreEqual(2014, date.DateTime.Year);
        }
Exemple #7
0
 /// <summary>
 /// Checks that Tokenize accepts a reader: "123 456" read through a StringReader
 /// yields "123" and "456" as the first two tokens.
 /// </summary>
 public void TextReader()
 {
     var tokenizer = new LatvianTokenizer();

     using (StringReader source = new StringReader("123 456"))
     {
         // Materialize inside the using block, while the reader is still open.
         Token[] result = tokenizer.Tokenize(source).ToArray();

         Assert.AreEqual("123", result[0].Text);
         Assert.AreEqual("456", result[1].Text);
     }
 }
Exemple #8
0
        /// <summary>
        /// Adds EmotionToken to a non-compiled tokenizer and checks that ":)" and ":("
        /// are the first two EmotionToken instances found in the sample sentence.
        /// </summary>
        public void Emotions()
        {
            var tokenizer = new LatvianTokenizer(compile: false);
            tokenizer.Add<EmotionToken>();

            var allTokens = tokenizer.Tokenize("Šodien esmu :) bet vakar biju :(");

            // Keep only the emoticon tokens; every other token type is ignored.
            EmotionToken[] emoticons = allTokens.OfType<EmotionToken>().ToArray();

            Assert.AreEqual(":)", emoticons[0].Text);
            Assert.AreEqual(":(", emoticons[1].Text);
        }
Exemple #9
0
        /// <summary>
        /// With InitialsToken removed from a non-compiled tokenizer, "A.Bērziņš" is expected
        /// to split into "A", "." and "Bērziņš".
        /// </summary>
        public void InitialsRemovedNotCompiled()
        {
            var tokenizer = new LatvianTokenizer(compile: false);
            tokenizer.Remove<InitialsToken>();

            Token[] parts = tokenizer.Tokenize("A.Bērziņš").ToArray();

            Assert.AreEqual("A", parts[0].Text);
            Assert.AreEqual(".", parts[1].Text);
            Assert.AreEqual("Bērziņš", parts[2].Text);
        }
Exemple #10
0
        /// <summary>
        /// When TimeSpanToken is added before ClockToken and both match "00:00:00",
        /// the first token is expected to be a TimeSpanToken.
        /// </summary>
        public void EqualMatch()
        {
            var tokenizer = new LatvianTokenizer(compile: false);
            tokenizer.Clear();

            tokenizer.Add<TimeSpanToken>(); // matches 00:00:00
            tokenizer.Add<ClockToken>();    // matches 00:00:00

            Token first = tokenizer.Tokenize("00:00:00").First();
            Assert.IsTrue(first is TimeSpanToken);
        }
Exemple #11
0
        /// <summary>
        /// With IncludeWhitespace set to true, "123 456" is expected to produce
        /// "123", " " and "456" as its first three tokens.
        /// </summary>
        public void WhitespaceIncluded()
        {
            var tokenizer = new LatvianTokenizer();
            tokenizer.IncludeWhitespace = true;

            Token[] parts = tokenizer.Tokenize("123 456").ToArray();

            Assert.AreEqual("123", parts[0].Text);
            Assert.AreEqual(" ", parts[1].Text);
            Assert.AreEqual("456", parts[2].Text);
        }
Exemple #12
0
        /// <summary>
        /// Quick-start sample: tokenizes the text first, then passes the tokens to
        /// BreakSentences and expects 3 sentences of 4 tokens each.
        /// </summary>
        public void QuickStart_BreakSentences()
        {
            string input = "Sveika, pasaule! Man iet labi. Šodienas datums ir 2014-01-01";

            var tokenizer = new LatvianTokenizer();

            // Step 1: a flat token array. Step 2: group those tokens into sentences.
            Token[]    flat    = tokenizer.Tokenize(input).ToArray();
            Sentence[] grouped = tokenizer.BreakSentences(flat).ToArray();

            Assert.AreEqual(3, grouped.Length);
            Assert.AreEqual(4, grouped[0].Count());
            Assert.AreEqual(4, grouped[1].Count());
            Assert.AreEqual(4, grouped[2].Count());
        }
Exemple #13
0
        /// <summary>
        /// Checks that Tokenize accepts a Stream: "123 456" written as UTF-8 into a
        /// MemoryStream yields "123" and "456" as the first two tokens.
        /// </summary>
        public void Stream()
        {
            using (Stream buffer = new MemoryStream())
            {
                // leaveOpen keeps the MemoryStream usable after the writer is disposed;
                // disposing the writer flushes the text into the stream.
                using (StreamWriter output = new StreamWriter(buffer, Encoding.UTF8, bufferSize: 1024, leaveOpen: true))
                {
                    output.Write("123 456");
                }

                // Rewind so the tokenizer reads from the beginning.
                buffer.Seek(0, SeekOrigin.Begin);

                var tokenizer  = new LatvianTokenizer();
                Token[] result = tokenizer.Tokenize(buffer).ToArray();

                Assert.AreEqual("123", result[0].Text);
                Assert.AreEqual("456", result[1].Text);
            }
        }
Exemple #14
0
        /// <summary>
        /// Saves a default tokenizer to a temporary file, constructs a new tokenizer from
        /// that file, and checks that it still tokenizes "123 456" into "123" and "456".
        /// </summary>
        public void LoadSave()
        {
            string filename = Path.GetTempFileName();

            try
            {
                LatvianTokenizer tokenizer = new LatvianTokenizer();

                tokenizer.Save(filename);

                tokenizer = new LatvianTokenizer(filename);

                Token[] tokens = tokenizer.Tokenize("123 456").ToArray();
                Assert.AreEqual("123", tokens[0].Text);
                Assert.AreEqual("456", tokens[1].Text);
            }
            finally
            {
                // Path.GetTempFileName creates the file on disk; remove it even when an
                // assertion fails so test runs do not accumulate temp files.
                File.Delete(filename);
            }
        }
Exemple #15
0
        /// <summary>
        /// Rough benchmark: constructs a default LatvianTokenizer 100 times and writes the
        /// average and total construction time to the debug output.
        /// </summary>
        public void Compile()
        {
            int count = 100;

            Stopwatch timer = new Stopwatch();

            timer.Start();
            for (int i = 0; i < count; i++)
            {
                // Only the construction cost is measured; the instance is not used.
                new LatvianTokenizer();
            }
            timer.Stop();

            // Use the floating-point elapsed time: ElapsedMilliseconds / count is an
            // integer division and would truncate the per-iteration value.
            double totalMilliseconds = timer.Elapsed.TotalMilliseconds;

            Debug.WriteLine("Compile: {0:0.000} ms", totalMilliseconds / count);
            Debug.WriteLine("Compile x{1}: {0:0.000} ms", totalMilliseconds, count);
        }
Exemple #16
0
        /// <summary>
        /// Checks the offset, line and in-line offset properties of the two tokens
        /// produced for "Vārds.": the word at [0, 5) and the period at [5, 6), both on line 0.
        /// </summary>
        public void Position()
        {
            var tokenizer = new LatvianTokenizer();
            Token[] parts = tokenizer.Tokenize("Vārds.").ToArray();

            // First token: the word.
            Token word = parts[0];
            Assert.AreEqual(0, word.Position);
            Assert.AreEqual(5, word.PositionEnd);
            Assert.AreEqual(0, word.Line);
            Assert.AreEqual(0, word.LineEnd);
            Assert.AreEqual(0, word.LinePosition);
            Assert.AreEqual(5, word.LinePositionEnd);

            // Second token: the trailing period.
            Token period = parts[1];
            Assert.AreEqual(".", period.Text);
            Assert.AreEqual(5, period.Position);
            Assert.AreEqual(6, period.PositionEnd);
            Assert.AreEqual(0, period.Line);
            Assert.AreEqual(0, period.LineEnd);
            Assert.AreEqual(5, period.LinePosition);
            Assert.AreEqual(6, period.LinePositionEnd);
        }
Exemple #17
0
        /// <summary>
        /// Rough throughput benchmark: writes a 10 MiB text to a temporary file, tokenizes
        /// it five times through a StreamReader, and writes the average time per pass,
        /// MiB/s and tokens/s to the debug output. The temporary file is always removed.
        /// </summary>
        public void TokenizeFile()
        {
            int count     = 5;
            int mebibytes = 10;
            int size      = mebibytes * 1024 * 1024;

            // DarbaLikums is defined elsewhere; the assertion below checks that it
            // returned exactly `size` characters.
            string text = DarbaLikums(size);

            Assert.AreEqual(size, text.Length);

            string filename = Path.GetTempFileName();

            try
            {
                File.WriteAllText(filename, text);
                Assert.AreEqual(size, File.ReadAllText(filename).Length);

                LatvianTokenizer tokenizer = new LatvianTokenizer();

                Stopwatch timer      = new Stopwatch();
                int       tokenCount = 0;

                timer.Start();
                for (int i = 0; i < count; i++)
                {
                    using (StreamReader reader = new StreamReader(filename))
                    {
                        foreach (Token token in tokenizer.Tokenize(reader))
                        {
                            tokenCount++;
                        }
                    }
                }
                timer.Stop();

                Assert.IsTrue(tokenCount > 0);

                // Use the floating-point elapsed values: ElapsedMilliseconds / count is an
                // integer division and would always print ".000" for the fractional part.
                double averageMilliseconds = timer.Elapsed.TotalMilliseconds / count;
                double totalSeconds        = timer.Elapsed.TotalSeconds;

                Debug.WriteLine("Tokenize file ({1} MiB): {0:0.000} ms", averageMilliseconds, mebibytes);
                // The rate is computed from mebibytes, so it is labelled MiB/s.
                Debug.WriteLine("Tokenize file: {0:0.000} MiB/s", (mebibytes * count) / totalSeconds);
                Debug.WriteLine("Tokenize file: {0:0.000} tokens/s", tokenCount / totalSeconds);
            }
            finally
            {
                // Remove the 10 MiB temp file even when an assertion above fails.
                File.Delete(filename);
            }
        }
Exemple #18
0
        /// <summary>
        /// Rough benchmark: saves a default tokenizer to a temporary file, constructs a
        /// tokenizer from that file 100 times, and writes the average and total load time
        /// to the debug output. The temporary file is always removed.
        /// </summary>
        public void Load()
        {
            string filename = Path.GetTempFileName();

            try
            {
                LatvianTokenizer tokenizer = new LatvianTokenizer();

                tokenizer.Save(filename);

                int count = 100;

                Stopwatch timer = new Stopwatch();

                timer.Start();
                for (int i = 0; i < count; i++)
                {
                    // Only the load cost is measured; the instance is not used.
                    new LatvianTokenizer(filename);
                }
                timer.Stop();

                // Use the floating-point elapsed time: ElapsedMilliseconds / count is an
                // integer division and would truncate the per-iteration value.
                double totalMilliseconds = timer.Elapsed.TotalMilliseconds;

                Debug.WriteLine("Load: {0:0.000} ms", totalMilliseconds / count);
                Debug.WriteLine("Load x{1}: {0:0.000} ms", totalMilliseconds, count);
            }
            finally
            {
                // Remove the temp file even if saving or loading throws.
                File.Delete(filename);
            }
        }
Exemple #19
0
 /// <summary>
 /// With the default tokenizer, "A.Bērziņš" is expected to start with the
 /// tokens "A." and "Bērziņš".
 /// </summary>
 public void Initials()
 {
     var tokenizer = new LatvianTokenizer();
     Token[] parts = tokenizer.Tokenize("A.Bērziņš").ToArray();

     Assert.AreEqual("A.", parts[0].Text);
     Assert.AreEqual("Bērziņš", parts[1].Text);
 }
Exemple #20
0
 /// <summary>
 /// Projects the tokens of "viens divi" to their Text values and expects
 /// "viens" followed by "divi".
 /// </summary>
 public void Values()
 {
     var tokenizer = new LatvianTokenizer();

     string[] texts = (from t in tokenizer.Tokenize("viens divi")
                       select t.Text).ToArray();

     Assert.AreEqual("viens", texts[0]);
     Assert.AreEqual("divi", texts[1]);
 }
Exemple #21
0
        /// <summary>
        /// Expects the two tokens of "viens viens" to collapse into a single
        /// element when passed through LINQ Distinct().
        /// </summary>
        public void Distinct()
        {
            var tokenizer = new LatvianTokenizer();

            int uniqueCount = tokenizer.Tokenize("viens viens").Distinct().Count();

            Assert.AreEqual(1, uniqueCount);
        }
Exemple #22
0
 /// <summary>
 /// With the default tokenizer settings, "123 456" is expected to start with the
 /// tokens "123" and "456", with no whitespace token in between.
 /// </summary>
 public void WhitespaceExcluded()
 {
     var tokenizer = new LatvianTokenizer();
     Token[] parts = tokenizer.Tokenize("123 456").ToArray();

     Assert.AreEqual("123", parts[0].Text);
     Assert.AreEqual("456", parts[1].Text);
 }
Exemple #23
0
        /// <summary>
        /// Quick-start sample: tokenizes a short three-sentence text, logs every token to the
        /// debug output, and verifies text, exact type and offsets of tokens 0-7 and of the
        /// final date token (index 11).
        /// </summary>
        public void QuickStart()
        {
            List <Token> tokens    = new List <Token>();
            DateToken    dateToken = null;

            Debug.WriteLine("Latvian.Tests.Tokenization.ReadmeTests.QuickStart starts");

            string text = "Sveika, pasaule! Man iet labi. Šodienas datums ir 2014-01-01";

            LatvianTokenizer tokenizer = new LatvianTokenizer();

            foreach (Token token in tokenizer.Tokenize(text))
            {
                Debug.WriteLine("Line {0}: Pos {1}: Type: {2} Token: {3}",
                                token.Line, token.Position, token.GetType(), token.Text);

                tokens.Add(token);

                // A single `as` test replaces the original `is` check followed by `as`.
                DateToken candidate = token as DateToken;
                if (candidate != null)
                {
                    dateToken = candidate;
                    Debug.WriteLine(dateToken.DateTime.ToString("dd/MM/yyyy"));
                }
            }

            Debug.WriteLine("Latvian.Tests.Tokenization.ReadmeTests.QuickStart end");

            Assert.AreEqual(12, tokens.Count);

            Assert.AreEqual("Sveika", tokens[0].Text);
            Assert.AreEqual(typeof(WordToken), tokens[0].GetType());
            Assert.AreEqual(0, tokens[0].Position);
            Assert.AreEqual(6, tokens[0].PositionEnd);
            Assert.AreEqual(0, tokens[0].Line);
            Assert.AreEqual(0, tokens[0].LineEnd);
            Assert.AreEqual(0, tokens[0].LinePosition);
            Assert.AreEqual(6, tokens[0].LinePositionEnd);

            Assert.AreEqual(",", tokens[1].Text);
            Assert.AreEqual(typeof(PunctuationToken), tokens[1].GetType());
            Assert.AreEqual(6, tokens[1].Position);
            Assert.AreEqual(7, tokens[1].PositionEnd);
            Assert.AreEqual(0, tokens[1].Line);
            Assert.AreEqual(0, tokens[1].LineEnd);
            Assert.AreEqual(6, tokens[1].LinePosition);
            Assert.AreEqual(7, tokens[1].LinePositionEnd);

            Assert.AreEqual("pasaule", tokens[2].Text);
            Assert.AreEqual(typeof(WordToken), tokens[2].GetType());
            Assert.AreEqual(8, tokens[2].Position);
            Assert.AreEqual(15, tokens[2].PositionEnd);
            Assert.AreEqual(0, tokens[2].Line);
            Assert.AreEqual(0, tokens[2].LineEnd);
            Assert.AreEqual(8, tokens[2].LinePosition);
            Assert.AreEqual(15, tokens[2].LinePositionEnd);

            Assert.AreEqual("!", tokens[3].Text);
            Assert.AreEqual(typeof(PunctuationToken), tokens[3].GetType());
            Assert.AreEqual(15, tokens[3].Position);
            Assert.AreEqual(16, tokens[3].PositionEnd);
            Assert.AreEqual(0, tokens[3].Line);
            Assert.AreEqual(0, tokens[3].LineEnd);
            Assert.AreEqual(15, tokens[3].LinePosition);
            Assert.AreEqual(16, tokens[3].LinePositionEnd);

            Assert.AreEqual("Man", tokens[4].Text);
            Assert.AreEqual(typeof(WordToken), tokens[4].GetType());
            Assert.AreEqual(17, tokens[4].Position);
            Assert.AreEqual(20, tokens[4].PositionEnd);
            Assert.AreEqual(0, tokens[4].Line);
            Assert.AreEqual(0, tokens[4].LineEnd);
            Assert.AreEqual(17, tokens[4].LinePosition);
            Assert.AreEqual(20, tokens[4].LinePositionEnd);

            Assert.AreEqual("iet", tokens[5].Text);
            Assert.AreEqual(typeof(WordToken), tokens[5].GetType());
            Assert.AreEqual(21, tokens[5].Position);
            Assert.AreEqual(24, tokens[5].PositionEnd);
            Assert.AreEqual(0, tokens[5].Line);
            Assert.AreEqual(0, tokens[5].LineEnd);
            Assert.AreEqual(21, tokens[5].LinePosition);
            Assert.AreEqual(24, tokens[5].LinePositionEnd);

            Assert.AreEqual("labi", tokens[6].Text);
            Assert.AreEqual(typeof(WordToken), tokens[6].GetType());
            Assert.AreEqual(25, tokens[6].Position);
            Assert.AreEqual(29, tokens[6].PositionEnd);
            Assert.AreEqual(0, tokens[6].Line);
            Assert.AreEqual(0, tokens[6].LineEnd);
            Assert.AreEqual(25, tokens[6].LinePosition);
            Assert.AreEqual(29, tokens[6].LinePositionEnd);

            Assert.AreEqual(".", tokens[7].Text);
            Assert.AreEqual(typeof(PunctuationToken), tokens[7].GetType());
            Assert.AreEqual(29, tokens[7].Position);
            Assert.AreEqual(30, tokens[7].PositionEnd);
            Assert.AreEqual(0, tokens[7].Line);
            Assert.AreEqual(0, tokens[7].LineEnd);
            Assert.AreEqual(29, tokens[7].LinePosition);
            Assert.AreEqual(30, tokens[7].LinePositionEnd);

            Assert.AreEqual("2014-01-01", tokens[11].Text);
            Assert.AreEqual(typeof(DateToken), tokens[11].GetType());
            Assert.AreEqual(50, tokens[11].Position);
            Assert.AreEqual(60, tokens[11].PositionEnd);
            Assert.AreEqual(0, tokens[11].Line);
            Assert.AreEqual(0, tokens[11].LineEnd);
            Assert.AreEqual(50, tokens[11].LinePosition);
            Assert.AreEqual(60, tokens[11].LinePositionEnd);

            Assert.AreEqual(tokens[11], dateToken);

            // "/" in a custom date format is replaced by the current culture's date separator,
            // so the original "dd/MM/yyyy" only produced "01.01.2014" on cultures that use ".".
            // A literal "." with the invariant culture makes the check machine-independent.
            // NOTE(review): assumes DateToken.DateTime is a System.DateTime - confirm.
            Assert.AreEqual("01.01.2014", dateToken.DateTime.ToString("dd.MM.yyyy", System.Globalization.CultureInfo.InvariantCulture));
        }
Exemple #24
0
 /// <summary>
 /// Expects the date-time "2014-01-01T12:00:00" to come out as one token rather than
 /// being cut after its date part, while a bare "2014-01-01" stays a token of its own.
 /// </summary>
 public void LongestMatch()
 {
     var tokenizer = new LatvianTokenizer();
     Token[] parts = tokenizer.Tokenize("2014-01-01 2014-01-01T12:00:00").ToArray();

     Assert.AreEqual("2014-01-01", parts[0].Text);
     Assert.AreEqual("2014-01-01T12:00:00", parts[1].Text);
 }