Exemplo n.º 1
0
        /// <summary>
        /// Checks that emoticons embedded in running text come back as
        /// <see cref="EmotionToken"/> instances once that token type is registered.
        /// </summary>
        public void Emotions()
        {
            var tokenizer = new LatvianTokenizer(compile: false);
            tokenizer.Add<EmotionToken>();

            const string input = "Šodien esmu :) bet vakar biju :(";
            EmotionToken[] emotions = tokenizer
                .Tokenize(input)
                .OfType<EmotionToken>()
                .ToArray();

            // The emoticons are expected in the order they occur in the input.
            Assert.AreEqual(":)", emotions[0].Text);
            Assert.AreEqual(":(", emotions[1].Text);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Registers two token types that both match the same input and expects
        /// the resulting token to be of the type that was added first.
        /// </summary>
        public void EqualMatch()
        {
            var tokenizer = new LatvianTokenizer(compile: false);
            tokenizer.Clear();

            // Both types match "00:00:00"; TimeSpanToken is registered before ClockToken.
            tokenizer.Add<TimeSpanToken>();
            tokenizer.Add<ClockToken>();

            Token first = tokenizer.Tokenize("00:00:00").First();

            Assert.IsTrue(first is TimeSpanToken);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Expects the two-word sequence "kā arī" to be returned as a single token
        /// when <see cref="LuMiiCollocationToken"/> is registered.
        /// </summary>
        public void kā_arī()
        {
            var tokenizer = new LatvianTokenizer(compile: false);
            tokenizer.Add<LuMiiCollocationToken>();
            tokenizer.Compile();

            Token[] result = tokenizer.Tokenize("Es skrienu kā arī lecu.").ToArray();

            // Five tokens in total, with the collocation as the third one.
            Assert.AreEqual(5, result.Length);
            Assert.AreEqual("kā arī", result[2].Text);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Expects the abbreviation "Ļ.cien." to be returned as a single token
        /// when <see cref="LuMiiAbbreviationToken"/> is registered.
        /// </summary>
        public void ļcien()
        {
            var tokenizer = new LatvianTokenizer(compile: false);
            tokenizer.Add<LuMiiAbbreviationToken>();
            tokenizer.Compile();

            Token[] result = tokenizer.Tokenize("Ļ.cien. kungs!").ToArray();

            // Three tokens in total, with the abbreviation first.
            Assert.AreEqual(3, result.Length);
            Assert.AreEqual("Ļ.cien.", result[0].Text);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Measures how long it takes to construct a <see cref="LatvianTokenizer"/>
        /// with its default constructor and writes the timings to the debug output.
        /// </summary>
        public void Compile()
        {
            const int count = 100;

            Stopwatch timer = Stopwatch.StartNew();
            for (int i = 0; i < count; i++)
            {
                // Only the construction cost is of interest; the instance is discarded.
                new LatvianTokenizer();
            }
            timer.Stop();

            // Use the floating-point TotalMilliseconds: dividing the integral
            // ElapsedMilliseconds by count truncates the average to whole milliseconds.
            double totalMilliseconds = timer.Elapsed.TotalMilliseconds;

            Debug.WriteLine("Compile: {0:0.000} ms", totalMilliseconds / count);
            Debug.WriteLine("Compile x{1}: {0:0.000} ms", totalMilliseconds, count);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Tokenizes a phrase containing a collocation and two abbreviations with both
        /// LuMii token types registered, and checks the resulting token texts.
        /// </summary>
        public void kaut_jel_milj_kgs()
        {
            var tokenizer = new LatvianTokenizer(compile: false);
            tokenizer.Add<LuMiiAbbreviationToken>();
            tokenizer.Add<LuMiiCollocationToken>();
            tokenizer.Compile();

            Token[] result = tokenizer.Tokenize("kaut JEL 2 milj. mans k-gs").ToArray();

            Assert.AreEqual(5, result.Length);

            // "kaut JEL" is expected as one token; "milj." and "k-gs" stay intact.
            Assert.AreEqual("kaut JEL", result[0].Text);
            Assert.AreEqual("milj.", result[2].Text);
            Assert.AreEqual("k-gs", result[4].Text);
        }
Exemplo n.º 7
0
        /// <summary>
        /// Measures how long it takes to construct a <see cref="LatvianTokenizer"/>
        /// from a previously saved file and writes the timings to the debug output.
        /// </summary>
        public void Load()
        {
            const int count = 100;

            string filename = Path.GetTempFileName();
            try
            {
                LatvianTokenizer tokenizer = new LatvianTokenizer();
                tokenizer.Save(filename);

                Stopwatch timer = Stopwatch.StartNew();
                for (int i = 0; i < count; i++)
                {
                    // Only the load cost is of interest; the instance is discarded.
                    new LatvianTokenizer(filename);
                }
                timer.Stop();

                // Use the floating-point TotalMilliseconds: dividing the integral
                // ElapsedMilliseconds by count truncates the average to whole milliseconds.
                double totalMilliseconds = timer.Elapsed.TotalMilliseconds;

                Debug.WriteLine("Load: {0:0.000} ms", totalMilliseconds / count);
                Debug.WriteLine("Load x{1}: {0:0.000} ms", totalMilliseconds, count);
            }
            finally
            {
                // Remove the temp file even when saving or loading throws.
                File.Delete(filename);
            }
        }
Exemplo n.º 8
0
        /// <summary>
        /// With <see cref="InitialsToken"/> removed, "A.Bērziņš" is expected to split
        /// into the letter, the period and the surname as three separate tokens.
        /// </summary>
        public void InitialsRemoved()
        {
            var tokenizer = new LatvianTokenizer(compile: false);
            tokenizer.Remove<InitialsToken>();
            tokenizer.Compile(); // marked as optional by the original author

            Token[] parts = tokenizer.Tokenize("A.Bērziņš").ToArray();

            Assert.AreEqual("A", parts[0].Text);
            Assert.AreEqual(".", parts[1].Text);
            Assert.AreEqual("Bērziņš", parts[2].Text);
        }
Exemplo n.º 9
0
 /// <summary>
 /// With the default tokenizer, "A.Bērziņš" is expected to yield the initial
 /// "A." followed by the surname.
 /// </summary>
 public void Initials()
 {
     var tokenizer = new LatvianTokenizer();

     Token[] parts = tokenizer.Tokenize("A.Bērziņš").ToArray();

     Assert.AreEqual("A.", parts[0].Text);
     Assert.AreEqual("Bērziņš", parts[1].Text);
 }
Exemplo n.º 10
0
        /// <summary>
        /// Checks the absolute and line-relative position properties of the two
        /// tokens produced for the input "Vārds.".
        /// </summary>
        public void Position()
        {
            var tokenizer = new LatvianTokenizer();
            Token[] result = tokenizer.Tokenize("Vārds.").ToArray();

            // First token: the word, spanning characters 0..5 on line 0.
            Token word = result[0];
            Assert.AreEqual(0, word.Position);
            Assert.AreEqual(5, word.PositionEnd);
            Assert.AreEqual(0, word.Line);
            Assert.AreEqual(0, word.LineEnd);
            Assert.AreEqual(0, word.LinePosition);
            Assert.AreEqual(5, word.LinePositionEnd);

            // Second token: the trailing period, spanning characters 5..6 on line 0.
            Token period = result[1];
            Assert.AreEqual(".", period.Text);
            Assert.AreEqual(5, period.Position);
            Assert.AreEqual(6, period.PositionEnd);
            Assert.AreEqual(0, period.Line);
            Assert.AreEqual(0, period.LineEnd);
            Assert.AreEqual(5, period.LinePosition);
            Assert.AreEqual(6, period.LinePositionEnd);
        }
Exemplo n.º 11
0
 /// <summary>
 /// With <c>IncludeWhitespace</c> set to true, the space between two numbers
 /// is expected to be returned as a token of its own.
 /// </summary>
 public void WhitespaceIncluded()
 {
     var tokenizer = new LatvianTokenizer();
     tokenizer.IncludeWhitespace = true;

     Token[] parts = tokenizer.Tokenize("123 456").ToArray();

     Assert.AreEqual("123", parts[0].Text);
     Assert.AreEqual(" ", parts[1].Text);
     Assert.AreEqual("456", parts[2].Text);
 }
Exemplo n.º 12
0
        /// <summary>
        /// Tokenizes the quick-start sample text, logs every token to the debug output,
        /// and verifies the text, type and position of the first eight tokens and of
        /// the final date token.
        /// </summary>
        public void QuickStart()
        {
            List<Token> tokens = new List<Token>();
            DateToken dateToken = null;

            Debug.WriteLine("Latvian.Tests.Tokenization.ReadmeTests.QuickStart starts");

            string text = "Sveika, pasaule! Man iet labi. Šodienas datums ir 2014-01-01";

            LatvianTokenizer tokenizer = new LatvianTokenizer();

            foreach (Token token in tokenizer.Tokenize(text))
            {
                Debug.WriteLine("Line {0}: Pos {1}: Type: {2} Token: {3}",
                    token.Line, token.Position, token.GetType(), token.Text);

                tokens.Add(token);

                // A single 'as' cast replaces the original 'is' check followed by 'as'.
                DateToken candidate = token as DateToken;
                if (candidate != null)
                {
                    dateToken = candidate;
                    Debug.WriteLine(dateToken.DateTime.ToString("dd/MM/yyyy"));
                }
            }

            Debug.WriteLine("Latvian.Tests.Tokenization.ReadmeTests.QuickStart end");

            Assert.AreEqual(12, tokens.Count);

            AssertTokenOnFirstLine(tokens[0], "Sveika", typeof(WordToken), 0, 6);
            AssertTokenOnFirstLine(tokens[1], ",", typeof(PunctuationToken), 6, 7);
            AssertTokenOnFirstLine(tokens[2], "pasaule", typeof(WordToken), 8, 15);
            AssertTokenOnFirstLine(tokens[3], "!", typeof(PunctuationToken), 15, 16);
            AssertTokenOnFirstLine(tokens[4], "Man", typeof(WordToken), 17, 20);
            AssertTokenOnFirstLine(tokens[5], "iet", typeof(WordToken), 21, 24);
            AssertTokenOnFirstLine(tokens[6], "labi", typeof(WordToken), 25, 29);
            AssertTokenOnFirstLine(tokens[7], ".", typeof(PunctuationToken), 29, 30);
            AssertTokenOnFirstLine(tokens[11], "2014-01-01", typeof(DateToken), 50, 60);

            Assert.AreEqual(tokens[11], dateToken);

            // "/" in a custom date format is replaced by the current culture's date
            // separator, so the original "dd/MM/yyyy" only produced "01.01.2014" under
            // cultures that use ".". A literal "." with the invariant culture makes the
            // assertion independent of the machine's regional settings.
            Assert.AreEqual(
                "01.01.2014",
                dateToken.DateTime.ToString("dd.MM.yyyy", System.Globalization.CultureInfo.InvariantCulture));
        }

        /// <summary>
        /// Asserts the text, runtime type and position of a token located entirely on
        /// line 0, where the line-relative positions equal the absolute positions.
        /// </summary>
        /// <param name="token">Token under test.</param>
        /// <param name="text">Expected token text.</param>
        /// <param name="type">Expected exact runtime type of the token.</param>
        /// <param name="position">Expected start position.</param>
        /// <param name="positionEnd">Expected end position.</param>
        private static void AssertTokenOnFirstLine(Token token, string text, System.Type type, int position, int positionEnd)
        {
            Assert.AreEqual(text, token.Text);
            Assert.AreEqual(type, token.GetType());
            Assert.AreEqual(position, token.Position);
            Assert.AreEqual(positionEnd, token.PositionEnd);
            Assert.AreEqual(0, token.Line);
            Assert.AreEqual(0, token.LineEnd);
            Assert.AreEqual(position, token.LinePosition);
            Assert.AreEqual(positionEnd, token.LinePositionEnd);
        }
Exemplo n.º 13
0
        /// <summary>
        /// Splits the quick-start sample text into sentences via
        /// <c>TokenizeSentences</c> and checks the sentence count and the token
        /// count of each sentence.
        /// </summary>
        public void QuickStart_TokenizeSentences()
        {
            const string input = "Sveika, pasaule! Man iet labi. Šodienas datums ir 2014-01-01";

            var tokenizer = new LatvianTokenizer();
            var sentences = new List<Sentence>();

            foreach (Sentence current in tokenizer.TokenizeSentences(input))
            {
                // Copy the sentence's tokens into a fresh list before storing it.
                List<Token> copied = new List<Token>();
                foreach (Token item in current)
                {
                    copied.Add(item);
                }

                sentences.Add(copied);
            }

            // Three sentences with four tokens each.
            Assert.AreEqual(3, sentences.Count);
            Assert.AreEqual(4, sentences[0].Count());
            Assert.AreEqual(4, sentences[1].Count());
            Assert.AreEqual(4, sentences[2].Count());
        }
Exemplo n.º 14
0
 /// <summary>
 /// Expects a bare date and a date followed by a time part to each be returned
 /// as one token covering the full text.
 /// </summary>
 public void LongestMatch()
 {
     var tokenizer = new LatvianTokenizer();

     Token[] parts = tokenizer.Tokenize("2014-01-01 2014-01-01T12:00:00").ToArray();

     Assert.AreEqual("2014-01-01", parts[0].Text);
     Assert.AreEqual("2014-01-01T12:00:00", parts[1].Text);
 }
Exemplo n.º 15
0
        /// <summary>
        /// Writes UTF-8 text into an in-memory stream and checks that tokenizing
        /// the stream yields the two expected number tokens.
        /// </summary>
        public void Stream()
        {
            using (Stream buffer = new MemoryStream())
            {
                // leaveOpen: true keeps the stream usable after the writer is disposed.
                using (StreamWriter writer = new StreamWriter(buffer, Encoding.UTF8, bufferSize: 1024, leaveOpen: true))
                {
                    writer.Write("123 456");
                }

                // Rewind so the tokenizer reads from the start.
                buffer.Position = 0;

                var tokenizer = new LatvianTokenizer();
                Token[] parts = tokenizer.Tokenize(buffer).ToArray();

                Assert.AreEqual("123", parts[0].Text);
                Assert.AreEqual("456", parts[1].Text);
            }
        }
Exemplo n.º 16
0
        /// <summary>
        /// Tokenizes a 10 MiB in-memory string several times and writes the timing
        /// and throughput to the debug output.
        /// </summary>
        public void TokenizeString()
        {
            int count = 5;
            int mebibytes = 10;
            int size = mebibytes * 1024 * 1024;

            string text = DarbaLikums(size);
            Assert.AreEqual(size, text.Length);

            LatvianTokenizer tokenizer = new LatvianTokenizer();

            int tokenCount = 0;
            Stopwatch timer = Stopwatch.StartNew();
            for (int i = 0; i < count; i++)
            {
                foreach (Token token in tokenizer.Tokenize(text))
                {
                    tokenCount++;
                }
            }
            timer.Stop();

            Assert.IsTrue(tokenCount > 0);

            // Use the floating-point TotalMilliseconds: dividing the integral
            // ElapsedMilliseconds by count truncates the average to whole milliseconds.
            Debug.WriteLine("Tokenize string ({1} MiB): {0:0.000} ms", timer.Elapsed.TotalMilliseconds / count, mebibytes);
            // The original passed an extra argument that the format string never used.
            Debug.WriteLine("Tokenize string: {0:0.000} MB/s", (mebibytes * count) / timer.Elapsed.TotalSeconds);
            Debug.WriteLine("Tokenize string: {0:0.000} tokens/s", tokenCount / timer.Elapsed.TotalSeconds);
        }
Exemplo n.º 17
0
 /// <summary>
 /// Expects LINQ <c>Distinct</c> to collapse the two tokens produced for
 /// "viens viens" into a single element.
 /// </summary>
 public void Distinct()
 {
     var tokenizer = new LatvianTokenizer();

     int distinctCount = tokenizer.Tokenize("viens viens").Distinct().Count();

     Assert.AreEqual(1, distinctCount);
 }
Exemplo n.º 18
0
        /// <summary>
        /// Exercises three ways of moving <see cref="ClockToken"/> ahead of
        /// <see cref="TimeSpanToken"/> (generic Remove/Insert, Type-based Remove/Insert,
        /// and Move) and expects "00:00:00" to be tokenized as a ClockToken each time.
        /// </summary>
        public void Reorder()
        {
            const string input = "00:00:00"; // matched by both TimeSpanToken and ClockToken

            // Variant 1: generic Remove<T> followed by Insert<T> at index 0.
            var viaGeneric = new LatvianTokenizer(compile: false);
            viaGeneric.Clear();
            viaGeneric.Add<TimeSpanToken>();
            viaGeneric.Add<ClockToken>();
            viaGeneric.Remove<ClockToken>();
            viaGeneric.Insert<ClockToken>(0);
            Token first = viaGeneric.Tokenize(input).First();
            Assert.IsTrue(first is ClockToken);

            // Variant 2: the same steps through the System.Type overloads.
            var viaType = new LatvianTokenizer(compile: false);
            viaType.Clear();
            viaType.Add<TimeSpanToken>();
            viaType.Add<ClockToken>();
            viaType.Remove(typeof(ClockToken));
            viaType.Insert(0, typeof(ClockToken));
            first = viaType.Tokenize(input).First();
            Assert.IsTrue(first is ClockToken);

            // Variant 3: a single Move<T> to index 0.
            var viaMove = new LatvianTokenizer(compile: false);
            viaMove.Clear();
            viaMove.Add<TimeSpanToken>();
            viaMove.Add<ClockToken>();
            viaMove.Move<ClockToken>(0);
            first = viaMove.Tokenize(input).First();
            Assert.IsTrue(first is ClockToken);
        }
Exemplo n.º 19
0
 /// <summary>
 /// Expects "2014-01-01" to be tokenized as a date token whose
 /// <c>DateTime</c> value has the year 2014.
 /// </summary>
 public void DateToken()
 {
     var tokenizer = new LatvianTokenizer();

     Token first = tokenizer.Tokenize("2014-01-01").First();

     Assert.IsTrue(first is DateToken);

     var date = (DateToken)first;
     Assert.AreEqual(2014, date.DateTime.Year);
 }
Exemplo n.º 20
0
 /// <summary>
 /// Expects "123" to be tokenized as an ordinal number token whose
 /// <c>Value</c> can be used in arithmetic.
 /// </summary>
 public void OrdinalNumberToken()
 {
     var tokenizer = new LatvianTokenizer();

     Token first = tokenizer.Tokenize("123").First();

     Assert.IsTrue(first is OrdinalNumberToken);

     // Multiplying both sides shows that Value is numeric rather than text.
     var number = (OrdinalNumberToken)first;
     Assert.AreEqual(123 * 2, number.Value * 2);
 }
Exemplo n.º 21
0
 /// <summary>
 /// With the default tokenizer, the space between two numbers is expected
 /// not to appear as a token.
 /// </summary>
 public void WhitespaceExcluded()
 {
     var tokenizer = new LatvianTokenizer();

     Token[] parts = tokenizer.Tokenize("123 456").ToArray();

     Assert.AreEqual("123", parts[0].Text);
     Assert.AreEqual("456", parts[1].Text);
 }
Exemplo n.º 22
0
 /// <summary>
 /// Expects both <c>Text</c> and <c>ToString()</c> of a token to return the
 /// matched input text.
 /// </summary>
 public void Text()
 {
     const string word = "vārds";
     var tokenizer = new LatvianTokenizer();

     Token first = tokenizer.Tokenize(word).First();

     Assert.AreEqual("vārds", first.Text);
     Assert.AreEqual("vārds", first.ToString());
 }
Exemplo n.º 23
0
        /// <summary>
        /// Tokenizes the quick-start sample text, groups the tokens into sentences via
        /// <c>BreakSentences</c>, and checks the sentence count and the token count
        /// of each sentence.
        /// </summary>
        public void QuickStart_BreakSentences()
        {
            const string input = "Sveika, pasaule! Man iet labi. Šodienas datums ir 2014-01-01";

            var tokenizer = new LatvianTokenizer();

            Token[] allTokens = tokenizer.Tokenize(input).ToArray();
            Sentence[] sentences = tokenizer.BreakSentences(allTokens).ToArray();

            // Three sentences with four tokens each.
            Assert.AreEqual(3, sentences.Length);
            Assert.AreEqual(4, sentences[0].Count());
            Assert.AreEqual(4, sentences[1].Count());
            Assert.AreEqual(4, sentences[2].Count());
        }
Exemplo n.º 24
0
 /// <summary>
 /// Checks that tokenizing from a <see cref="StringReader"/> yields the two
 /// expected number tokens.
 /// </summary>
 public void TextReader()
 {
     var tokenizer = new LatvianTokenizer();

     using (StringReader input = new StringReader("123 456"))
     {
         Token[] parts = tokenizer.Tokenize(input).ToArray();

         Assert.AreEqual("123", parts[0].Text);
         Assert.AreEqual("456", parts[1].Text);
     }
 }
Exemplo n.º 25
0
 /// <summary>
 /// Projects the tokens of "viens divi" to their text and checks the two
 /// resulting strings.
 /// </summary>
 public void Values()
 {
     var tokenizer = new LatvianTokenizer();

     string[] texts = tokenizer
         .Tokenize("viens divi")
         .Select(item => item.Text)
         .ToArray();

     Assert.AreEqual("viens", texts[0]);
     Assert.AreEqual("divi", texts[1]);
 }
Exemplo n.º 26
0
        /// <summary>
        /// Writes a 10 MiB text to a temporary file, tokenizes the file several times
        /// through a <see cref="StreamReader"/>, and writes the timing and throughput
        /// to the debug output.
        /// </summary>
        public void TokenizeFile()
        {
            int count = 5;
            int mebibytes = 10;
            int size = mebibytes * 1024 * 1024;

            string text = DarbaLikums(size);
            Assert.AreEqual(size, text.Length);

            string filename = Path.GetTempFileName();
            try
            {
                File.WriteAllText(filename, text);
                Assert.AreEqual(size, File.ReadAllText(filename).Length);

                LatvianTokenizer tokenizer = new LatvianTokenizer();

                int tokenCount = 0;
                Stopwatch timer = Stopwatch.StartNew();
                for (int i = 0; i < count; i++)
                {
                    using (StreamReader reader = new StreamReader(filename))
                    {
                        foreach (Token token in tokenizer.Tokenize(reader))
                        {
                            tokenCount++;
                        }
                    }
                }
                timer.Stop();

                Assert.IsTrue(tokenCount > 0);

                // Use the floating-point TotalMilliseconds: dividing the integral
                // ElapsedMilliseconds by count truncates the average to whole milliseconds.
                Debug.WriteLine("Tokenize file ({1} MiB): {0:0.000} ms", timer.Elapsed.TotalMilliseconds / count, mebibytes);
                // The original passed an extra argument that the format string never used.
                Debug.WriteLine("Tokenize file: {0:0.000} MB/s", (mebibytes * count) / timer.Elapsed.TotalSeconds);
                Debug.WriteLine("Tokenize file: {0:0.000} tokens/s", tokenCount / timer.Elapsed.TotalSeconds);
            }
            finally
            {
                // Remove the temp file even when an assertion or the tokenizer throws.
                File.Delete(filename);
            }
        }
Exemplo n.º 27
0
        /// <summary>
        /// Saves a default tokenizer to a temporary file, constructs a new tokenizer
        /// from that file, and checks that it tokenizes "123 456" as expected.
        /// </summary>
        public void LoadSave()
        {
            string filename = Path.GetTempFileName();
            try
            {
                LatvianTokenizer tokenizer = new LatvianTokenizer();
                tokenizer.Save(filename);

                tokenizer = new LatvianTokenizer(filename);

                Token[] tokens = tokenizer.Tokenize("123 456").ToArray();
                Assert.AreEqual("123", tokens[0].Text);
                Assert.AreEqual("456", tokens[1].Text);
            }
            finally
            {
                // GetTempFileName creates the file on disk; the original never removed it.
                File.Delete(filename);
            }
        }