// A known abbreviation ("Mr") keeps its trailing period attached to the token.
public void Tokenize_Abbreviation_ReturnsTokens()
{
    var sut = new LatinWordTokenizer(new[] { "mr", "dr", "ms" });

    var actual = sut.TokenizeToStrings("Mr. Smith went to Washington.");

    var expected = new[] { "Mr.", "Smith", "went", "to", "Washington", "." };
    Assert.That(actual, Is.EqualTo(expected));
}
// Straight double quotes are split off as their own tokens on both sides.
public void Tokenize_Quotes_ReturnsTokens()
{
    var sut = new LatinWordTokenizer();

    var actual = sut.TokenizeToStrings("\"This is a test.\"");

    var expected = new[] { "\"", "This", "is", "a", "test", ".", "\"" };
    Assert.That(actual, Is.EqualTo(expected));
}
// Punctuation immediately preceding a word is separated from the word token.
public void Tokenize_PunctuationAtStartOfWord_ReturnsTokens()
{
    var sut = new LatinWordTokenizer();

    var actual = sut.TokenizeToStrings("Is this a \"test\"?");

    var expected = new[] { "Is", "this", "a", "\"", "test", "\"", "?" };
    Assert.That(actual, Is.EqualTo(expected));
}
// An apostrophe inside a word (a contraction) does not split the word.
public void Tokenize_PunctuationInsideWord_ReturnsTokens()
{
    var sut = new LatinWordTokenizer();

    var actual = sut.TokenizeToStrings("This isn't a test.");

    var expected = new[] { "This", "isn't", "a", "test", "." };
    Assert.That(actual, Is.EqualTo(expected));
}
/// <summary>
/// Exercises a workaround for a Bridge.NET bug (issue #2981): a non-ASCII
/// dash must be tokenized without throwing, splitting the surrounding words.
/// </summary>
private static void Tokenize_NonAsciiCharacter_DoesNotThrow(Assert assert)
{
    var sut = new LatinWordTokenizer();

    string[] actual = sut.TokenizeToStrings("This is—a test.").ToArray();

    string[] expected = { "This", "is", "—", "a", "test", "." };
    assert.DeepEqual(actual, expected);
}
// Bridge.NET variant: a known abbreviation keeps its trailing period attached.
private static void Tokenize_Abbreviation_ReturnsTokens(Assert assert)
{
    var sut = new LatinWordTokenizer(new[] { "mr", "dr", "ms" });

    string[] actual = sut.TokenizeToStrings("Mr. Smith went to Washington.").ToArray();

    string[] expected = { "Mr.", "Smith", "went", "to", "Washington", "." };
    assert.DeepEqual(actual, expected);
}
// Bridge.NET variant: an apostrophe inside a contraction does not split the word.
private static void Tokenize_PunctuationInsideWord_ReturnsTokens(Assert assert)
{
    var sut = new LatinWordTokenizer();

    string[] actual = sut.TokenizeToStrings("This isn't a test.").ToArray();

    string[] expected = { "This", "isn't", "a", "test", "." };
    assert.DeepEqual(actual, expected);
}
// Bridge.NET variant: leading punctuation is separated from the word token.
private static void Tokenize_PunctuationAtStartOfWord_ReturnsTokens(Assert assert)
{
    var sut = new LatinWordTokenizer();

    string[] actual = sut.TokenizeToStrings("Is this a \"test\"?").ToArray();

    string[] expected = { "Is", "this", "a", "\"", "test", "\"", "?" };
    assert.DeepEqual(actual, expected);
}
// With TreatApostropheAsSingleQuote enabled, leading/trailing apostrophes are
// split off as quote tokens while a possessive apostrophe stays in the word.
public void Tokenize_ApostropheAsSingleQuote_ReturnsTokens()
{
    var sut = new LatinWordTokenizer { TreatApostropheAsSingleQuote = true };

    var actual = sut.TokenizeToStrings("'Moses's cat said 'Meow' to the dog.'");

    var expected = new[]
    {
        "'", "Moses's", "cat", "said", "'", "Meow", "'", "to", "the", "dog", ".", "'"
    };
    Assert.That(actual, Is.EqualTo(expected));
}
// With the default setting, apostrophes stay attached to words (including a
// trailing possessive), while typographic quote characters are split off.
public void Tokenize_ApostropheNotAsSingleQuote_ReturnsTokens()
{
    var sut = new LatinWordTokenizer();

    var first = sut.TokenizeToStrings("“Moses' cat said ‘Meow’ to the dog.”");
    var firstExpected = new[]
    {
        "“", "Moses'", "cat", "said", "‘", "Meow", "’", "to", "the", "dog", ".", "”"
    };
    Assert.That(first, Is.EqualTo(firstExpected));

    var second = sut.TokenizeToStrings("i ha''on ot ano'.");
    var secondExpected = new[] { "i", "ha''on", "ot", "ano'", "." };
    Assert.That(second, Is.EqualTo(secondExpected));
}
// An empty input string yields no tokens at all.
public void Tokenize_Empty_ReturnsEmpty()
{
    var sut = new LatinWordTokenizer();

    var actual = sut.TokenizeToStrings("");

    Assert.That(actual, Is.Empty);
}
// Bridge.NET variant: whitespace-only input yields no tokens.
private static void Tokenize_Whitespace_ReturnsEmpty(Assert assert)
{
    var sut = new LatinWordTokenizer();

    string[] actual = sut.TokenizeToStrings(" ").ToArray();

    assert.DeepEqual(actual, new string[0]);
}