        [Test]
        public void Tokenize_ApostropheNotAsSingleQuote_ReturnsTokens()
        {
            var tokenizer = new LatinWordTokenizer();

            Assert.That(tokenizer.TokenizeToStrings("“Moses' cat said ‘Meow’ to the dog.”"),
                        Is.EqualTo(new[] { "“", "Moses'", "cat", "said", "‘", "Meow", "’", "to", "the", "dog", ".", "”" }));

            Assert.That(tokenizer.TokenizeToStrings("i ha''on ot ano'."),
                        Is.EqualTo(new[] { "i", "ha''on", "ot", "ano'", "." }));
        }
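
        // Note: this test uses the default settings (no TreatApostropheAsSingleQuote, in contrast
        // to Tokenize_ApostropheAsSingleQuote_ReturnsTokens below): a straight apostrophe stays
        // attached to its word ("Moses'", "ha''on", "ano'"), while the Unicode single quotes
        // ‘ and ’ are split off as separate tokens.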

        [Test]
        public void Tokenize_Quotes_ReturnsTokens()
        {
            var tokenizer = new LatinWordTokenizer();

            Assert.That(tokenizer.TokenizeToStrings("\"This is a test.\""),
                        Is.EqualTo(new[] { "\"", "This", "is", "a", "test", ".", "\"" }));
        }

        [Test]
        public void Tokenize_Abbreviation_ReturnsTokens()
        {
            var tokenizer = new LatinWordTokenizer(new[] { "mr", "dr", "ms" });

            Assert.That(tokenizer.TokenizeToStrings("Mr. Smith went to Washington."),
                        Is.EqualTo(new[] { "Mr.", "Smith", "went", "to", "Washington", "." }));
        }
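
        // Note: the abbreviation list is given in lowercase ("mr", "dr", "ms") yet still matches
        // "Mr.", so matching appears to be case-insensitive; a recognized abbreviation keeps its
        // trailing period as part of the token ("Mr.") rather than having it split off.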

        [Test]
        public void Tokenize_PunctuationInsideWord_ReturnsTokens()
        {
            var tokenizer = new LatinWordTokenizer();

            Assert.That(tokenizer.TokenizeToStrings("This isn't a test."),
                        Is.EqualTo(new[] { "This", "isn't", "a", "test", "." }));
        }

        [Test]
        public void Tokenize_PunctuationAtStartOfWord_ReturnsTokens()
        {
            var tokenizer = new LatinWordTokenizer();

            Assert.That(tokenizer.TokenizeToStrings("Is this a \"test\"?"),
                        Is.EqualTo(new[] { "Is", "this", "a", "\"", "test", "\"", "?" }));
        }
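
        // The static overloads below take an Assert parameter and mirror the NUnit tests above;
        // judging by the summary on the first one, they exist so the same cases can also run
        // under the Bridge.NET test runner, whose QUnit-style Assert exposes DeepEqual.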
        /// <summary>
        /// This tests a workaround for a bug in Bridge.NET, see issue #2981.
        /// </summary>
        private static void Tokenize_NonAsciiCharacter_DoesNotThrow(Assert assert)
        {
            var tokenizer = new LatinWordTokenizer();

            assert.DeepEqual(tokenizer.TokenizeToStrings("This is—a test.").ToArray(),
                             new[] { "This", "is", "—", "a", "test", "." });
        }

        private static void Tokenize_Abbreviation_ReturnsTokens(Assert assert)
        {
            var tokenizer = new LatinWordTokenizer(new[] { "mr", "dr", "ms" });

            assert.DeepEqual(tokenizer.TokenizeToStrings("Mr. Smith went to Washington.").ToArray(),
                             new[] { "Mr.", "Smith", "went", "to", "Washington", "." });
        }

        private static void Tokenize_PunctuationInsideWord_ReturnsTokens(Assert assert)
        {
            var tokenizer = new LatinWordTokenizer();

            assert.DeepEqual(tokenizer.TokenizeToStrings("This isn't a test.").ToArray(),
                             new[] { "This", "isn't", "a", "test", "." });
        }

        private static void Tokenize_PunctuationAtStartOfWord_ReturnsTokens(Assert assert)
        {
            var tokenizer = new LatinWordTokenizer();

            assert.DeepEqual(tokenizer.TokenizeToStrings("Is this a \"test\"?").ToArray(),
                             new[] { "Is", "this", "a", "\"", "test", "\"", "?" });
        }

        [Test]
        public void Tokenize_ApostropheAsSingleQuote_ReturnsTokens()
        {
            var tokenizer = new LatinWordTokenizer {
                TreatApostropheAsSingleQuote = true
            };

            Assert.That(tokenizer.TokenizeToStrings("'Moses's cat said 'Meow' to the dog.'"),
                        Is.EqualTo(new[] { "'", "Moses's", "cat", "said", "'", "Meow", "'", "to", "the", "dog", ".", "'" }));
        }
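
        // Contrast with Tokenize_ApostropheNotAsSingleQuote_ReturnsTokens above: with
        // TreatApostropheAsSingleQuote = true, leading and trailing straight apostrophes are
        // split off as quote tokens, while word-internal ones ("Moses's") stay attached.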

        [Test]
        public void Tokenize_Empty_ReturnsEmpty()
        {
            var tokenizer = new LatinWordTokenizer();

            Assert.That(tokenizer.TokenizeToStrings(""), Is.Empty);
        }

        private static void Tokenize_Whitespace_ReturnsEmpty(Assert assert)
        {
            var tokenizer = new LatinWordTokenizer();

            assert.DeepEqual(tokenizer.TokenizeToStrings(" ").ToArray(), new string[0]);
        }
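
        // A minimal usage sketch outside the test harness (a hypothetical helper, not part of
        // the original suite), assuming TokenizeToStrings returns an IEnumerable<string>, as
        // the .ToArray() calls above suggest:
        private static string[] TokenizeExample()
        {
            var tokenizer = new LatinWordTokenizer();

            // Expected tokens, following the patterns above: { "Is", "this", "a", "test", "?" }
            return tokenizer.TokenizeToStrings("Is this a test?").ToArray();
        }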