Exemple #1
0
        /// <summary>
        /// Once the single token has been consumed, NextToken must yield null.
        /// </summary>
        public void NextTokenShouldReturnNullWhenThereAreNoMoreTokens()
        {
            var tokenizer = new WordTokenizer("This");

            // Consume the only token, then ask again.
            tokenizer.NextToken();
            var exhausted = tokenizer.NextToken();

            Assert.IsNull(exhausted);
        }
Exemple #2
0
        /// <summary>
        /// Input consisting solely of punctuation should produce no tokens.
        /// </summary>
        public void PassAllPunctuation_ReturnWord()
        {
            const string text = "??!!?!";

            var tokens = new List <WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

            Assert.AreEqual(0, tokens.Count);
        }
Exemple #3
0
        /// <summary>
        /// Splitting on spaces should yield each whitespace-delimited word, in order.
        /// </summary>
        public void TokenizerShouldTokenizeWordsBySpaces()
        {
            var expected  = new[] { "This", "is", "a", "test" };
            var tokenizer = new WordTokenizer("This is a test");

            var actual = tokenizer.GetTokens().ToArray();

            Assert.AreEqual(expected, actual);
        }
Exemple #4
0
        /// <summary>
        /// An empty string should tokenize to an empty token list.
        /// </summary>
        public void PassEmptyWord_ReturnWord()
        {
            const string text = "";

            var tokens = new List <WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

            Assert.AreEqual(0, tokens.Count);
        }
Exemple #5
0
        /// <summary>
        /// A sentence mixing apostrophes, digits, word-internal punctuation, a
        /// combining accent (U+0301) and a zero-width space (U+200B) should split
        /// into exactly six tokens with the expected value, length and offset each.
        /// </summary>
        public void PassMultipleDifferentWordsAndSpaces_ReturnWord()
        {
            string text = "Jesus' abc123s rejoiced,when black-bird's fiance\u0301\u200b flew.";
            List <WordTokenizer.Token> list = new List <WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

            // Expected (value, length, offset) for each token, in appearance order.
            var expected = new[]
            {
                new { Value = "Jesus",         Length = 5,  Offset = 0 },
                new { Value = "abc123s",       Length = 7,  Offset = 7 },
                new { Value = "rejoiced,when", Length = 13, Offset = 15 },
                new { Value = "black-bird's",  Length = 12, Offset = 29 },
                new { Value = "fiance\u0301",  Length = 7,  Offset = 42 },
                new { Value = "flew",          Length = 4,  Offset = 51 },
            };

            Assert.AreEqual(expected.Length, list.Count);

            for (int i = 0; i < expected.Length; i++)
            {
                Assert.AreEqual(expected[i].Value, list[i].Value);
                Assert.AreEqual(expected[i].Length, list[i].Length);
                Assert.AreEqual(expected[i].Offset, list[i].Offset);
            }
        }
Exemple #6
0
 /// <summary>
 /// Builds an HTML page listing each token of the (URL-escaped) input on its
 /// own line, preceded by a link back to the search home page.
 /// </summary>
 /// <param name="input">URL-escaped text to tokenize.</param>
 /// <returns>The page produced by formatting the <c>html</c> template with the generated body.</returns>
 public string Tokenize(string input)
 {
     input = Uri.UnescapeDataString(input);
     string[] words = WordTokenizer.Tokenize(input).ToArray();

     // string.Concat builds the list in one pass; the previous
     // Aggregate("", (a, b) => a + b) re-copied the growing string per word (O(n^2)).
     // SECURITY NOTE(review): tokens are inserted into HTML without encoding —
     // if `input` can be untrusted, HTML-encode each word before concatenation.
     string body = "<p><a href=\"/NestleSearch\">Home Page</a></p><p>"
                   + string.Concat(words.Select(w => w + "<br/>"))
                   + "</p>";
     return string.Format(html, body);
 }
Exemple #7
0
        /// <summary>
        /// Initializes a new instance of the <see cref="EditorView"/> class.
        /// Wires up event masks, margins, theme, caret, controller and clipboard
        /// in the order the later components require.
        /// </summary>
        public EditorView()
        {
            // Set up the basic characteristics of the widget: subscribe to all the
            // pointer, button, focus, visibility, scroll and key events the editor
            // reacts to.
            Events = EventMask.PointerMotionMask | EventMask.ButtonPressMask
                     | EventMask.PointerMotionHintMask | EventMask.ButtonReleaseMask
                     | EventMask.EnterNotifyMask | EventMask.LeaveNotifyMask
                     | EventMask.VisibilityNotifyMask | EventMask.FocusChangeMask
                     | EventMask.ScrollMask | EventMask.KeyPressMask | EventMask.KeyReleaseMask;
            DoubleBuffered = true;
            CanFocus       = true;   // needed so keyboard events reach the widget
            WidgetFlags   |= WidgetFlags.NoWindow;

            // Set up the rest of the screen elements: a line-number margin whose
            // width changes are observed so the view can re-layout.
            margins = new MarginRendererCollection();
            margins.Add(new LineNumberMarginRenderer());
            margins.WidthChanged += OnMarginsWidthChanged;

            theme = new Theme();
            editorViewSettings = new EditorViewSettings();

            // Set up the caret, this must be done after the buffer is set.
            caret = new Caret(this);

            // Set up the text editor controller and hook it to the system clipboard.
            controller    = new EditorViewController(this);
            wordTokenizer = new WordTokenizer();
            Clipboard     = Clipboard.Get(Atom.Intern("CLIPBOARD", true));

            // Mirror the controller's action lifecycle into this view's handlers.
            controller.BeginAction += OnBeginAction;
            controller.EndAction   += OnEndAction;
        }
        /// <summary>
        /// End-to-end check: tokenize the "text" column of the detox sample data
        /// with an explicit separator set, save the first rows to disk, and compare
        /// the output against the checked-in baseline file.
        /// </summary>
        public void TokenizeWithSeparators()
        {
            string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data     = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadText(1)), hasHeader: true)
                              .Read(new MultiFileSource(dataPath)).AsDynamic;

            // Split "text" into a "words" column on spaces and common punctuation.
            var est       = new WordTokenizer(Env, "text", "words", separators: new[] { ' ', '?', '!', '.', ',' });
            // Keep only the first 4 rows so the saved baseline stays small.
            var outdata   = TakeFilter.Create(Env, est.Fit(data).Transform(data), 4);
            var savedData = new ChooseColumnsTransform(Env, outdata, "words");

            var saver = new TextSaver(Env, new TextSaver.Arguments {
                Silent = true
            });
            var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");

            // Write the tokenized view to the output path (keepHidden preserves
            // hidden columns in the saved file).
            using (var ch = Env.Start("save"))
            {
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }
            // Diff the saved output against the stored baseline, then mark the test done.
            CheckEquality("Text", "tokenizedWithSeparators.tsv");
            Done();
        }
 /// <summary>
 /// NextToken should consume exactly one word from the underlying reader,
 /// leaving the reader positioned at the start of the next word.
 /// </summary>
 public void NextTokenShouldReadThroughATextReaderOneWordAtATime()
 {
     var reader    = new StringReader("This is a test");
     var tokenizer = new WordTokenizer(reader);

     var first = tokenizer.NextToken();

     Assert.AreEqual("This", first);
     // Only "This " should have been consumed; "is" is up next.
     Assert.True(reader.Peek() == 'i');
 }
        /// <summary>
        /// GetWords should extract at least one word from a plain English paragraph.
        /// </summary>
        public void VerifySingleSentence()
        {
            string input =
                "Basically, this issue is not restricted to NVidia GPUs or specific operating systems - This can be reproduced on Windows, Linux and OSX. Basically the concept of memory safety does not exist in the gpu space - which is the reason why the webgl standard is so strict about always zeroing buffers.";
            var wordList = WordTokenizer.GetWords(input);

            // Any() short-circuits on the first element instead of materializing
            // the whole sequence with ToList() just to count it.
            Assert.IsTrue(wordList.Any());
        }
        /// <summary>
        /// GetWords should pick out alphanumeric tokens such as "z13" from a
        /// sentence full of brackets, slashes and quotes.
        /// </summary>
        public void VerifyComplexSentence()
        {
            const string input =
                "They have a lot more software in the works, such as Cassanda, MongoDB and Spark [1]. It's a push to have good software support for the IBM LinuxONE machine [2], essentially an IBM z Systems (z13) mainframe that will only run Linux on KVM (i.e. without expensive z/OS or z/VM 'legacy')";

            var words = WordTokenizer.GetWords(input);

            Assert.IsTrue(words.Contains("z13"));
        }
Exemple #12
0
        /// <summary>
        /// Tokenizes <paramref name="sentence"/> into word-pattern entries,
        /// keeping punctuation marks as their own tokens.
        /// </summary>
        static WordPatternInfo[] SeparateWords(string sentence)
        {
            var tokenizer = new WordTokenizer(WordTokenizerOptions.ReturnPunctuations);

            return(tokenizer.ExtractWords(sentence).ToArray());
        }
Exemple #13
0
        /// <summary>
        /// NextToken should consume exactly one word from the wrapped reader,
        /// leaving the reader at the start of the next word.
        /// </summary>
        public void NextTokenShouldReadThroughATextReaderOneWordAtATime()
        {
            var reader    = new StringReader("This is a test");
            var tokenizer = new WordTokenizer(reader);

            var first = tokenizer.NextToken();

            Assert.AreEqual("This", first);
            // Only "This " was consumed, so the reader now points at "is".
            Assert.True(reader.Peek() == 'i');
        }
Exemple #14
0
        /// <summary>
        /// IsWordContinuation must reject a null text argument with
        /// <see cref="ArgumentNullException"/>.
        /// </summary>
        public void IsWordContinuationWithNulls()
        {
            // Arrange
            var tokenizer = new WordTokenizer();

            // Act / Assert
            Assert.Throws <ArgumentNullException>(
                () => tokenizer.IsWordContinuation(null, 'a'));
        }
Exemple #15
0
 /// <summary>
 /// Initializes the form and the NLP pipeline components used by it
 /// (normalizer, sentence/word tokenizers and POS tagger).
 /// </summary>
 public Form1()
 {
     InitializeComponent();
     // Stop-word list is read once at startup; the file must sit in the working directory.
     lines         = System.IO.File.ReadAllLines("stopwords.txt");
     normalizer    = new Normalizer(true, false, false);   // NOTE(review): meaning of the three flags not visible here — confirm against Normalizer's ctor
     senTokenizer  = new SentenceTokenizer();
     wordTokenizer = new WordTokenizer(true);              // presumably `true` enables verb-part joining — verify
     tagger        = new POSTagger();
 }
Exemple #16
0
        /// <summary>
        /// Walking backwards through `message` with arguments (10, 2) should yield
        /// exactly two tokens: "morn" and a run of spaces. The precise semantics of
        /// Backwards' parameters are not visible in this file — inferred from usage.
        /// </summary>
        public void BackwardsTest10MaxLength2()
        {
            WordTokenizer       tokenizer = new WordTokenizer(message);
            WordTokenCollection tokens    = tokenizer.Backwards(10, 2);

            Assert.AreEqual("morn", tokens [0].Message, "morn");
            // NOTE(review): the expected literal contains FOUR spaces but the failure
            // message says "3 spaces." — one of the two looks wrong; confirm against `message`.
            Assert.AreEqual("    ", tokens [1].Message, "3 spaces.");
            Assert.AreEqual(2, tokens.Count, "Count");
        }
Exemple #17
0
        /// <summary>
        /// A run of letters and digits with no separators should come back as a
        /// single token spanning the whole input.
        /// </summary>
        public void PassGibberishWithoutWordWithoutSpace_ReturnWord()
        {
            const string text = "1a2b3c";

            var tokens = new List <WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

            Assert.AreEqual(1, tokens.Count);

            var token = tokens[0];
            Assert.AreEqual(text, token.Value);
            Assert.AreEqual(0, token.Offset);
            Assert.AreEqual(text.Length, token.Length);
        }
Exemple #18
0
        /// <summary>
        /// A word ending in a combining mark (U+0301, combining acute accent)
        /// should remain one token that includes the mark.
        /// </summary>
        public void PassWordWithUnicodeSuperscript_ReturnWord()
        {
            const string text = "fiance\u0301";

            var tokens = new List <WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

            Assert.AreEqual(1, tokens.Count);

            var token = tokens[0];
            Assert.AreEqual(text, token.Value);
            Assert.AreEqual(0, token.Offset);
            Assert.AreEqual(text.Length, token.Length);
        }
Exemple #19
0
        /// <summary>
        /// A hyphen inside a word does not split it: "black-bird" is one token.
        /// </summary>
        public void PassWordWithWordInternalHyphen_ReturnWord()
        {
            const string text = "black-bird";

            var tokens = new List <WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

            Assert.AreEqual(1, tokens.Count);

            var token = tokens[0];
            Assert.AreEqual(text, token.Value);
            Assert.AreEqual(0, token.Offset);
            Assert.AreEqual(text.Length, token.Length);
        }
Exemple #20
0
        /// <summary>
        /// A single plain word tokenizes to exactly one token covering the input.
        /// </summary>
        public void PassSimpleWord_ReturnWord()
        {
            const string text = "Hello";

            var tokens = new List <WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

            Assert.AreEqual(1, tokens.Count);

            var token = tokens[0];
            Assert.AreEqual(text, token.Value);
            Assert.AreEqual(0, token.Offset);
            Assert.AreEqual(text.Length, token.Length);
        }
Exemple #21
0
        /// <summary>
        /// A trailing apostrophe is stripped: "Jesus'" yields the token "Jesus"
        /// whose length excludes the apostrophe.
        /// </summary>
        public void PassWordWithWordFinalApostrophe_ReturnWord()
        {
            const string text = "Jesus'";

            var tokens = new List <WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

            Assert.AreEqual(1, tokens.Count);
            Assert.AreEqual("Jesus", tokens[0].Value);
            Assert.AreEqual(0, tokens[0].Offset);
            // Length excludes the dropped word-final apostrophe.
            Assert.AreEqual(text.Length - 1, tokens[0].Length);
        }
Exemple #22
0
        /// <summary>
        /// Leading and trailing punctuation is stripped, while punctuation inside
        /// the word is kept as part of the single token.
        /// </summary>
        public void PassWordWithWordPreviousInternalFinalPunctuation_ReturnWord()
        {
            const string text = "?black,bird.flew!Home?";

            var tokens = new List <WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

            Assert.AreEqual(1, tokens.Count);
            Assert.AreEqual("black,bird.flew!Home", tokens[0].Value);
            Assert.AreEqual(1, tokens[0].Offset);
            // Length drops both the word-previous and word-final punctuation marks.
            Assert.AreEqual(text.Length - 2, tokens[0].Length);
        }
Exemple #23
0
        /// <summary>
        /// A letter following existing word text continues the word.
        /// </summary>
        public void IsWordContinuationWithTextAndLetter()
        {
            // Arrange
            var tokenizer = new WordTokenizer();

            // Act
            bool isContinuation = tokenizer.IsWordContinuation("a", 'a');

            // Assert
            Assert.IsTrue(isContinuation);
        }
Exemple #24
0
        /// <summary>
        /// A letter after a period does not continue the previous word.
        /// </summary>
        public void IsWordContinuationWithPeriodAndLetter()
        {
            // Arrange
            var tokenizer = new WordTokenizer();

            // Act
            bool isContinuation = tokenizer.IsWordContinuation(".", 'a');

            // Assert
            Assert.IsFalse(isContinuation);
        }
Exemple #25
0
        /// <summary>
        /// A period after a space does not continue any word.
        /// </summary>
        public void IsWordContinuationWithSpaceAndPeriod()
        {
            // Arrange
            var tokenizer = new WordTokenizer();

            // Act
            bool isContinuation = tokenizer.IsWordContinuation(" ", '.');

            // Assert
            Assert.IsFalse(isContinuation);
        }
Exemple #26
0
 /// <summary>
 /// TokenizeText must throw <see cref="ArgumentNullException"/> for null input.
 /// </summary>
 public void PassNULL_Throws()
 {
     Assert.Throws <ArgumentNullException>(() =>
     {
         // Enumerate the result: if TokenizeText is a lazy iterator, the
         // argument check may only fire once iteration starts.
         foreach (WordTokenizer.Token token in WordTokenizer.TokenizeText(null))
         {
         }
     });
 }
        /// <summary>
        /// With verb-part joining enabled (ctor argument true), multi-word Persian
        /// compound verb phrases must come back as a single token, while an
        /// ordinary adjective + verb pair stays as two tokens.
        /// </summary>
        public void JoinVerbPartsTest()
        {
            WordTokenizer wordTokenizer = new WordTokenizer(true);

            // Tokenizes `input` and asserts the result equals `expected`, element
            // by element — replaces five copy-pasted assert stanzas.
            Action<string, string[]> assertTokens = (input, expected) =>
            {
                string[] actual = wordTokenizer.Tokenize(input).ToArray();

                Assert.AreEqual(expected.Length, actual.Length, "Failed to tokenize words of '" + input + "' sentence");
                for (int i = 0; i < expected.Length; i++)
                {
                    Assert.AreEqual(expected[i], actual[i], "Failed to tokenize words of '" + input + "' sentence");
                }
            };

            // Compound verbs that must be joined into one token.
            assertTokens("خواهد رفت", new string[] { "خواهد رفت" });
            assertTokens("رفته است", new string[] { "رفته است" });
            assertTokens("گفته شده است", new string[] { "گفته شده است" });
            assertTokens("گفته خواهد شد", new string[] { "گفته خواهد شد" });

            // Not a compound verb: must remain two separate tokens.
            assertTokens("خسته شدید", new string[] { "خسته", "شدید" });
        }
        /// <summary>
        /// With verb-part joining enabled (ctor argument true), multi-word Persian
        /// compound verb phrases must come back as a single token, while an
        /// ordinary adjective + verb pair stays as two tokens.
        /// </summary>
        public void JoinVerbPartsTest()
        {
            WordTokenizer wordTokenizer = new WordTokenizer(true);

            // Shared checker: tokenizes `input` and compares against `expected`
            // element by element — removes the five duplicated assert stanzas.
            Action<string, string[]> check = (input, expected) =>
            {
                string[] actual = wordTokenizer.Tokenize(input).ToArray();

                Assert.AreEqual(expected.Length, actual.Length, "Failed to tokenize words of '" + input + "' sentence");
                for (int i = 0; i < expected.Length; i++)
                {
                    Assert.AreEqual(expected[i], actual[i], "Failed to tokenize words of '" + input + "' sentence");
                }
            };

            // Compound verbs: joined into a single token.
            check("خواهد رفت", new string[] { "خواهد رفت" });
            check("رفته است", new string[] { "رفته است" });
            check("گفته شده است", new string[] { "گفته شده است" });
            check("گفته خواهد شد", new string[] { "گفته خواهد شد" });

            // Adjective + verb: stays as two tokens.
            check("خسته شدید", new string[] { "خسته", "شدید" });
        }
Exemple #29
0
        /// <summary>
        /// From index 1 inside a single word, the previous word boundary is the
        /// start of the text (offset 0).
        /// </summary>
        public void PreviousOneWord()
        {
            // Setup
            var splitter = new WordTokenizer();

            // Test
            int boundary = splitter.GetPreviousWordBoundary("word", 1);

            // Assertion
            Assert.AreEqual(0, boundary);
        }
        /// <summary>
        /// Exercises GetWords on a tokenizer constructed with a filter list
        /// containing "" and "one"; exactly one word, "one", is expected back.
        /// (The exact filtering semantics live in the WordTokenizer ctor.)
        /// </summary>
        public void GetWordsWithFilterOut()
        {
            var tokenizer = new WordTokenizer(
                "Test",
                new SimpleWordItemFactory(Global.PosTagger, Global.Raw),
                NullSimpleWordPipeline.Instance,
                NullWordItemPipeline.Instance,
                new[] { "", "one" });

            var words = tokenizer.GetWords().ToArray();

            Assert.AreEqual(1, words.Length);
            Assert.AreEqual("one", words[0]);
        }
Exemple #31
0
        /// <summary>
        /// Replaces spelled-out Farsi numbers in <paramref name="input_s"/> with
        /// their parsed form, rebuilding the sentence token by token.
        /// </summary>
        /// <param name="input_s">Sentence possibly containing human-readable numbers.</param>
        /// <returns>The rebuilt sentence; tokens are space-separated (with a leading space).</returns>
        public string ReplaceHumanNum(string input_s)
        {
            WordTokenizer wt = new WordTokenizer();
            var           l  = wt.Tokenize(input_s);

            HumanNumberFind hmf = new HumanNumberFind();

            hmf.findNumberPositions(l);

            // Materialize the token sequence ONCE; the original called
            // l.ToArray() on every loop iteration, re-copying the sequence each time.
            var tokens = l.ToArray();

            string sentenceBuffer = string.Empty;
            string numberBuffer   = string.Empty;

            int j = 0;

            //hmf.num.Add(0); // to finish & flush loop

            // NOTE(review): the semantics of hmf.num / hmf.pos are defined in
            // HumanNumberFind, not visible here — the branch structure below is
            // preserved exactly as originally written.
            foreach (int i in hmf.num)
            {
                if (i == 0 && hmf.pos[j] == 0)
                {
                    numberBuffer += " " + tokens[i];
                    j             = i + 1;
                }
                else if (i == 0) //not number
                {
                    if (numberBuffer.Trim().Length > 0)
                    { //convert number and flush
                        sentenceBuffer += " " + HumanNumber.ParseFarsi(numberBuffer.Trim());
                        numberBuffer    = string.Empty;
                    }

                    sentenceBuffer += " " + tokens[j];
                    j = j + 1;
                }
                else
                {
                    numberBuffer += " " + tokens[i];
                    j             = i + 1;
                }
            }

            // Flush any number still pending after the loop.
            if (numberBuffer.Trim().Length > 0)
            {
                sentenceBuffer += " " + HumanNumber.ParseFarsi(numberBuffer.Trim());
            }

            return(sentenceBuffer);
        }
Exemple #32
0
        /// <summary>
        /// A zero-width space (U+200B) acts as a word separator: two tokens come
        /// back, and the second token's offset skips over the separator character.
        /// </summary>
        public void PassMultipleSimpleWordsSeperatedByZeroWisthSpace_ReturnMultipleWords()
        {
            const string text = "Jesus\u200bwept";

            var tokens = new List <WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

            Assert.AreEqual(2, tokens.Count);

            Assert.AreEqual("Jesus", tokens[0].Value);
            Assert.AreEqual(0, tokens[0].Offset);
            Assert.AreEqual(5, tokens[0].Length);

            Assert.AreEqual("wept", tokens[1].Value);
            // Offset 6: position 5 holds the zero-width space.
            Assert.AreEqual(6, tokens[1].Offset);
            Assert.AreEqual(4, tokens[1].Length);
        }
Exemple #33
0
 /// <summary>
 /// Builds the analysis chain for a field: sentence tokenization, word
 /// segmentation, Porter stemming, and (when a stop-word list is configured)
 /// stop-word removal.
 /// </summary>
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     TokenStream result = new SentenceTokenizer(reader);
     result = new WordTokenizer(result, wordSegment);
     // result = new LowerCaseFilter(result);
     // LowerCaseFilter is no longer needed: SegTokenFilter already lowercases all English characters.
     // Stemming is quite aggressive here — this is not a bug, it's a feature :)
     result = new PorterStemFilter(result);
     if (stopWords != null)
     {
         result = new StopFilter(true, result, StopFilter.MakeStopSet(stopWords), false);
     }
     return result;
 }
        /// <summary>
        /// Without verb-part joining (ctor argument false), words, brackets and
        /// punctuation runs are each emitted as their own token.
        /// </summary>
        public void TokenizeTest()
        {
            var tokenizer = new WordTokenizer(false);

            string input      = "این جمله (خیلی) پیچیده نیست!!!";
            string[] expected = { "این", "جمله", "(", "خیلی", ")", "پیچیده", "نیست", "!!!" };

            var actual = tokenizer.Tokenize(input).ToArray();

            Assert.AreEqual(expected.Length, actual.Length, "Failed to tokenize words of '" + input + "' sentence");
            for (int i = 0; i < expected.Length; i++)
            {
                Assert.AreEqual(expected[i], actual[i], "Failed to tokenize words of '" + input + "' sentence");
            }
        }
 /// <summary>
 /// Splitting on spaces should yield each whitespace-delimited word, in order.
 /// </summary>
 public void TokenizerShouldTokenizeWordsBySpaces()
 {
     var expected  = new[] { "This", "is", "a", "test" };
     var tokenizer = new WordTokenizer("This is a test");

     var actual = tokenizer.GetTokens().ToArray();

     Assert.AreEqual(expected, actual);
 }
Exemple #36
0
        /// <summary>
        /// Tokenizes <paramref name="sentence"/> into word-pattern entries,
        /// keeping punctuation marks as their own tokens.
        /// </summary>
        static WordPatternInfo[] SeparateWords(string sentence)
        {
            var tokenizer = new WordTokenizer(WordTokenizerOptions.ReturnPunctuations);

            return tokenizer.ExtractWords(sentence).ToArray();
        }
 /// <summary>
 /// Once the single token has been consumed, NextToken must yield null.
 /// </summary>
 public void NextTokenShouldReturnNullWhenThereAreNoMoreTokens()
 {
     var tokenizer = new WordTokenizer("This");

     // Drain the only token, then ask again.
     tokenizer.NextToken();
     var exhausted = tokenizer.NextToken();

     Assert.IsNull(exhausted);
 }