/// <summary>
/// Tokenizes <paramref name="text"/> three times - passed as a string, wrapped in a
/// <see cref="StringReader"/>, and typed as a character sequence - and asserts each
/// time that the resulting token texts equal <paramref name="expected"/>, in order.
/// </summary>
/// <param name="text">Input text handed to the tokenizer.</param>
/// <param name="expected">Token texts expected from every input form.</param>
private void Test(string text, params string[] expected)
{
    // Input supplied as a plain string.
    string[] fromString = tokenizer.Tokenize(text)
        .Select(token => token.Text)
        .ToArray();
    CollectionAssert.AreEqual(expected, fromString);

    // Same input supplied through a StringReader.
    string[] fromReader = tokenizer.Tokenize(new StringReader(text))
        .Select(token => token.Text)
        .ToArray();
    CollectionAssert.AreEqual(expected, fromReader);

    // Same input statically typed as IEnumerable<char>.
    IEnumerable<char> characters = text;
    string[] fromCharacters = tokenizer.Tokenize(characters)
        .Select(token => token.Text)
        .ToArray();
    CollectionAssert.AreEqual(expected, fromCharacters);
}
/// <summary>
/// Tokenizes an in-memory text of 10 MiB characters five times, asserts that tokens
/// were produced, and writes average time and throughput figures to the debug output.
/// </summary>
public void TokenizeString()
{
    int count = 5;
    int mebibytes = 10;
    int size = mebibytes * 1024 * 1024;

    string text = DarbaLikums(size);
    Assert.AreEqual(size, text.Length);

    LatvianTokenizer tokenizer = new LatvianTokenizer();
    Stopwatch timer = new Stopwatch();
    int tokenCount = 0;

    timer.Start();
    for (int i = 0; i < count; i++)
    {
        // Enumerating is what drives the tokenizer; the tokens are only counted.
        foreach (Token token in tokenizer.Tokenize(text))
        {
            tokenCount++;
        }
    }
    timer.Stop();

    Assert.IsTrue(tokenCount > 0);

    // TotalMilliseconds is a double, so the per-run average keeps its fractional part
    // (ElapsedMilliseconds / count was an integer division and always printed ".000").
    Debug.WriteLine("Tokenize string ({1} MiB): {0:0.000} ms", timer.Elapsed.TotalMilliseconds / count, mebibytes);
    Debug.WriteLine("Tokenize string: {0:0.000} MB/s", (mebibytes * count) / timer.Elapsed.TotalSeconds);
    Debug.WriteLine("Tokenize string: {0:0.000} tokens/s", tokenCount / timer.Elapsed.TotalSeconds);
}
/// <summary>
/// Registers TimeSpanToken and ClockToken (both match "00:00:00"), moves ClockToken
/// to index 0 in three different ways, and asserts each time that the first token
/// produced for "00:00:00" is a ClockToken.
/// </summary>
public void Reorder()
{
    // 1) Generic Remove<T>() followed by generic Insert<T>(index).
    LatvianTokenizer viaGeneric = new LatvianTokenizer(compile: false);
    viaGeneric.Clear();
    viaGeneric.Add<TimeSpanToken>(); // matches 00:00:00
    viaGeneric.Add<ClockToken>(); // matches 00:00:00
    viaGeneric.Remove<ClockToken>();
    viaGeneric.Insert<ClockToken>(0);
    Token firstViaGeneric = viaGeneric.Tokenize("00:00:00").First();
    Assert.IsTrue(firstViaGeneric is ClockToken);

    // 2) Type-based Remove(Type) followed by Insert(index, Type).
    LatvianTokenizer viaType = new LatvianTokenizer(compile: false);
    viaType.Clear();
    viaType.Add<TimeSpanToken>(); // matches 00:00:00
    viaType.Add<ClockToken>(); // matches 00:00:00
    viaType.Remove(typeof(ClockToken));
    viaType.Insert(0, typeof(ClockToken));
    Token firstViaType = viaType.Tokenize("00:00:00").First();
    Assert.IsTrue(firstViaType is ClockToken);

    // 3) Single Move<T>(index) call.
    LatvianTokenizer viaMove = new LatvianTokenizer(compile: false);
    viaMove.Clear();
    viaMove.Add<TimeSpanToken>(); // matches 00:00:00
    viaMove.Add<ClockToken>(); // matches 00:00:00
    viaMove.Move<ClockToken>(0);
    Token firstViaMove = viaMove.Tokenize("00:00:00").First();
    Assert.IsTrue(firstViaMove is ClockToken);
}
/// <summary>
/// Adds EmotionToken to a tokenizer constructed with <c>compile: false</c> and asserts
/// that the first two EmotionToken instances found in the sample sentence are
/// ":)" and ":(", in that order.
/// </summary>
public void Emotions()
{
    LatvianTokenizer tokenizer = new LatvianTokenizer(compile: false);
    tokenizer.Add<EmotionToken>();

    // Keep only the emotion tokens; every other token type is filtered out.
    EmotionToken[] emotions = tokenizer
        .Tokenize("Šodien esmu :) bet vakar biju :(")
        .OfType<EmotionToken>()
        .ToArray();

    Assert.AreEqual(":)", emotions[0].Text);
    Assert.AreEqual(":(", emotions[1].Text);
}
/// <summary>
/// Removes InitialsToken from a tokenizer constructed with <c>compile: false</c> and
/// asserts that "A.Bērziņš" then starts with the three tokens "A", "." and "Bērziņš".
/// </summary>
public void InitialsRemovedNotCompiled()
{
    LatvianTokenizer tokenizer = new LatvianTokenizer(compile: false);
    tokenizer.Remove<InitialsToken>();

    Token[] parts = tokenizer.Tokenize("A.Bērziņš").ToArray();

    Assert.AreEqual("A", parts[0].Text);
    Assert.AreEqual(".", parts[1].Text);
    Assert.AreEqual("Bērziņš", parts[2].Text);
}
/// <summary>
/// Registers TimeSpanToken and then ClockToken (both match "00:00:00") and asserts
/// that the first token produced for "00:00:00" is the TimeSpanToken, i.e. the type
/// that was added first.
/// </summary>
public void EqualMatch()
{
    LatvianTokenizer tokenizer = new LatvianTokenizer(compile: false);
    tokenizer.Clear();
    tokenizer.Add<TimeSpanToken>(); // matches 00:00:00
    tokenizer.Add<ClockToken>(); // matches 00:00:00

    Token first = tokenizer.Tokenize("00:00:00").First();

    Assert.IsTrue(first is TimeSpanToken);
}
/// <summary>
/// Sets IncludeWhitespace to true and asserts that "123 456" yields "123", " " and
/// "456" as its first three tokens.
/// </summary>
public void WhitespaceIncluded()
{
    LatvianTokenizer tokenizer = new LatvianTokenizer();
    tokenizer.IncludeWhitespace = true;

    Token[] parts = tokenizer.Tokenize("123 456").ToArray();

    Assert.AreEqual("123", parts[0].Text);
    Assert.AreEqual(" ", parts[1].Text);
    Assert.AreEqual("456", parts[2].Text);
}
/// <summary>
/// Saves a default tokenizer to a temporary file, constructs a new tokenizer from
/// that file, and asserts that "123 456" yields "123" and "456" as its first two tokens.
/// </summary>
public void LoadSave()
{
    // GetTempFileName creates the file on disk, so it has to be removed afterwards.
    string filename = Path.GetTempFileName();
    try
    {
        LatvianTokenizer tokenizer = new LatvianTokenizer();
        tokenizer.Save(filename);

        tokenizer = new LatvianTokenizer(filename);
        Token[] tokens = tokenizer.Tokenize("123 456").ToArray();

        Assert.AreEqual("123", tokens[0].Text);
        Assert.AreEqual("456", tokens[1].Text);
    }
    finally
    {
        // Runs even when an assertion above throws, so no temp file is left behind.
        File.Delete(filename);
    }
}
/// <summary>
/// Tokenizes the quick-start sample text, breaks it into sentences and asserts that
/// there are three sentences with four elements each.
/// </summary>
public void QuickStart_BreakSentences()
{
    const string sample = "Sveika, pasaule! Man iet labi. Šodienas datums ir 2014-01-01";
    LatvianTokenizer tokenizer = new LatvianTokenizer();

    Token[] tokens = tokenizer.Tokenize(sample).ToArray();
    Sentence[] sentences = tokenizer.BreakSentences(tokens).ToArray();

    Assert.AreEqual(3, sentences.Length);

    // Checked in order: sentences[0], sentences[1], sentences[2].
    for (int index = 0; index < 3; index++)
    {
        Assert.AreEqual(4, sentences[index].Count());
    }
}
/// <summary>
/// Writes a text of 10 MiB characters to a temporary file, tokenizes it five times
/// through a <see cref="StreamReader"/>, asserts that tokens were produced, and writes
/// average time and throughput figures to the debug output.
/// </summary>
public void TokenizeFile()
{
    int count = 5;
    int mebibytes = 10;
    int size = mebibytes * 1024 * 1024;

    string text = DarbaLikums(size);
    Assert.AreEqual(size, text.Length);

    // GetTempFileName creates the file on disk; the finally block below removes it
    // even when an assertion fails part-way through.
    string filename = Path.GetTempFileName();
    try
    {
        File.WriteAllText(filename, text);
        Assert.AreEqual(size, File.ReadAllText(filename).Length);

        LatvianTokenizer tokenizer = new LatvianTokenizer();
        Stopwatch timer = new Stopwatch();
        int tokenCount = 0;

        timer.Start();
        for (int i = 0; i < count; i++)
        {
            // A fresh reader per run, so every run reads the file from the start.
            using (StreamReader reader = new StreamReader(filename))
            {
                foreach (Token token in tokenizer.Tokenize(reader))
                {
                    tokenCount++;
                }
            }
        }
        timer.Stop();

        Assert.IsTrue(tokenCount > 0);

        // TotalMilliseconds is a double, so the per-run average keeps its fractional part
        // (ElapsedMilliseconds / count was an integer division and always printed ".000").
        Debug.WriteLine("Tokenize file ({1} MiB): {0:0.000} ms", timer.Elapsed.TotalMilliseconds / count, mebibytes);
        Debug.WriteLine("Tokenize file: {0:0.000} MB/s", (mebibytes * count) / timer.Elapsed.TotalSeconds);
        Debug.WriteLine("Tokenize file: {0:0.000} tokens/s", tokenCount / timer.Elapsed.TotalSeconds);
    }
    finally
    {
        File.Delete(filename);
    }
}
/// <summary>
/// Runs the quick-start sample: tokenizes a one-line text, logs every token to the
/// debug output, and asserts the text, type and positions of tokens 0-7 and 11,
/// plus the date value carried by the last DateToken encountered.
/// </summary>
public void QuickStart()
{
    List<Token> tokens = new List<Token>();
    DateToken dateToken = null;

    Debug.WriteLine("Latvian.Tests.Tokenization.ReadmeTests.QuickStart starts");

    string text = "Sveika, pasaule! Man iet labi. Šodienas datums ir 2014-01-01";
    LatvianTokenizer tokenizer = new LatvianTokenizer();

    foreach (Token token in tokenizer.Tokenize(text))
    {
        Debug.WriteLine("Line {0}: Pos {1}: Type: {2} Token: {3}", token.Line, token.Position, token.GetType(), token.Text);
        tokens.Add(token);

        // Single type check; dateToken is only overwritten when the token really is a
        // DateToken, so it ends up holding the last date token seen.
        DateToken current = token as DateToken;
        if (current != null)
        {
            dateToken = current;
            Debug.WriteLine(dateToken.DateTime.ToString("dd/MM/yyyy"));
        }
    }

    Debug.WriteLine("Latvian.Tests.Tokenization.ReadmeTests.QuickStart end");

    Assert.AreEqual(12, tokens.Count);

    AssertFirstLineToken(tokens[0], "Sveika", typeof(WordToken), 0, 6);
    AssertFirstLineToken(tokens[1], ",", typeof(PunctuationToken), 6, 7);
    AssertFirstLineToken(tokens[2], "pasaule", typeof(WordToken), 8, 15);
    AssertFirstLineToken(tokens[3], "!", typeof(PunctuationToken), 15, 16);
    AssertFirstLineToken(tokens[4], "Man", typeof(WordToken), 17, 20);
    AssertFirstLineToken(tokens[5], "iet", typeof(WordToken), 21, 24);
    AssertFirstLineToken(tokens[6], "labi", typeof(WordToken), 25, 29);
    AssertFirstLineToken(tokens[7], ".", typeof(PunctuationToken), 29, 30);
    AssertFirstLineToken(tokens[11], "2014-01-01", typeof(DateToken), 50, 60);

    Assert.AreEqual(tokens[11], dateToken);

    // "/" in a custom date format is replaced by the culture's date separator, so the
    // previous "dd/MM/yyyy" only produced "01.01.2014" under cultures that use ".".
    // A literal "." with the invariant culture gives the same text on every machine.
    Assert.AreEqual(
        "01.01.2014",
        dateToken.DateTime.ToString("dd.MM.yyyy", System.Globalization.CultureInfo.InvariantCulture));
}

/// <summary>
/// Asserts text, exact runtime type and positions of a token that lies entirely on
/// line 0, where the line-relative positions equal the absolute positions.
/// </summary>
/// <param name="token">Token under inspection.</param>
/// <param name="text">Expected token text.</param>
/// <param name="type">Expected exact runtime type of the token.</param>
/// <param name="start">Expected Position and LinePosition.</param>
/// <param name="end">Expected PositionEnd and LinePositionEnd.</param>
private static void AssertFirstLineToken(Token token, string text, System.Type type, int start, int end)
{
    Assert.AreEqual(text, token.Text);
    Assert.AreEqual(type, token.GetType());
    Assert.AreEqual(start, token.Position);
    Assert.AreEqual(end, token.PositionEnd);
    Assert.AreEqual(0, token.Line);
    Assert.AreEqual(0, token.LineEnd);
    Assert.AreEqual(start, token.LinePosition);
    Assert.AreEqual(end, token.LinePositionEnd);
}