/// <summary>Exhausting the input must make NextToken report null.</summary>
public void NextTokenShouldReturnNullWhenThereAreNoMoreTokens()
{
    // Arrange: a single-word input.
    var sut = new WordTokenizer("This");

    // Act: consume the only token.
    sut.NextToken();

    // Assert: a further call yields null, signalling end of input.
    Assert.IsNull(sut.NextToken());
}
/// <summary>Input consisting solely of punctuation must yield no tokens.</summary>
public void PassAllPunctuation_ReturnWord()
{
    const string text = "??!!?!";

    var tokens = new List<WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

    Assert.AreEqual(0, tokens.Count);
}
/// <summary>A space-separated sentence splits into its words, in order.</summary>
public void TokenizerShouldTokenizeWordsBySpaces()
{
    var sut = new WordTokenizer("This is a test");

    var actual = sut.GetTokens().ToArray();

    Assert.AreEqual(new[] { "This", "is", "a", "test" }, actual);
}
/// <summary>The empty string must tokenize to an empty sequence, not throw.</summary>
public void PassEmptyWord_ReturnWord()
{
    const string text = "";

    var tokens = new List<WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

    Assert.AreEqual(0, tokens.Count);
}
/// <summary>
/// Mixed sentence: word-final apostrophe stripped, alphanumerics kept,
/// word-internal punctuation/hyphens kept, combining mark (U+0301) kept,
/// zero-width space (U+200B) treated as a separator, final period dropped.
/// </summary>
public void PassMultipleDifferentWordsAndSpaces_ReturnWord()
{
    string text = "Jesus' abc123s rejoiced,when black-bird's fiance\u0301\u200b flew.";

    List<WordTokenizer.Token> list = new List<WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

    // Local helper replaces 18 copy-pasted assertions; it preserves the
    // original assertion order (Value, Length, Offset) for each token.
    void AssertToken(int index, string expectedValue, int expectedOffset, int expectedLength)
    {
        Assert.AreEqual(expectedValue, list[index].Value);
        Assert.AreEqual(expectedLength, list[index].Length);
        Assert.AreEqual(expectedOffset, list[index].Offset);
    }

    Assert.AreEqual(6, list.Count);
    AssertToken(0, "Jesus", 0, 5);
    AssertToken(1, "abc123s", 7, 7);
    AssertToken(2, "rejoiced,when", 15, 13);
    AssertToken(3, "black-bird's", 29, 12);
    AssertToken(4, "fiance\u0301", 42, 7);
    AssertToken(5, "flew", 51, 4);
}
/// <summary>
/// Tokenizes a (URL-escaped) search string and renders the words as an
/// HTML fragment, one word per line, inside the page template.
/// </summary>
/// <param name="input">Percent-encoded text to tokenize.</param>
/// <returns>The full HTML page produced by formatting the template.</returns>
public string Tokenize(string input)
{
    // Decode any percent-encoding before tokenizing.
    input = Uri.UnescapeDataString(input);
    string[] words = WordTokenizer.Tokenize(input).ToArray();

    // string.Concat builds the fragment in O(n); the original
    // Aggregate("", (a, b) => a + b) re-copied the string each step (O(n^2)).
    // SECURITY NOTE(review): words originate from user input and are embedded
    // into HTML unescaped — consider HTML-encoding each word to prevent XSS.
    string body =
        "<p><a href=\"/NestleSearch\">Home Page</a></p><p>" +
        string.Concat(words.Select(w => w + "<br/>")) +
        "</p>";
    return string.Format(html, body);
}
/// <summary>
/// Initializes a new instance of the <see cref="EditorView"/> class.
/// </summary>
public EditorView()
{
    // Set up the basic characteristics of the widget.
    // Subscribe to all pointer, button, focus, visibility, scroll and key
    // events the editor reacts to.
    Events = EventMask.PointerMotionMask | EventMask.ButtonPressMask |
             EventMask.PointerMotionHintMask | EventMask.ButtonReleaseMask |
             EventMask.EnterNotifyMask | EventMask.LeaveNotifyMask |
             EventMask.VisibilityNotifyMask | EventMask.FocusChangeMask |
             EventMask.ScrollMask | EventMask.KeyPressMask |
             EventMask.KeyReleaseMask;
    DoubleBuffered = true;
    CanFocus = true;
    WidgetFlags |= WidgetFlags.NoWindow;

    // Set up the rest of the screen elements.
    // The margin collection owns the line-number gutter; width changes
    // propagate via the WidthChanged event.
    margins = new MarginRendererCollection();
    margins.Add(new LineNumberMarginRenderer());
    margins.WidthChanged += OnMarginsWidthChanged;
    theme = new Theme();
    editorViewSettings = new EditorViewSettings();

    // Set up the caret, this must be done after the buffer is set.
    caret = new Caret(this);

    // Set up the text editor controller.
    // NOTE(review): controller wiring happens last; BeginAction/EndAction
    // handlers presumably bracket user edits — confirm against controller docs.
    controller = new EditorViewController(this);
    wordTokenizer = new WordTokenizer();
    Clipboard = Clipboard.Get(Atom.Intern("CLIPBOARD", true));
    controller.BeginAction += OnBeginAction;
    controller.EndAction += OnEndAction;
}
// Verifies WordTokenizer with an explicit separator set: loads the detox
// sample data, tokenizes the text column on space and punctuation, saves the
// first rows as TSV, and compares against the checked-in baseline file.
public void TokenizeWithSeparators()
{
    string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    // Column 0 is the label, column 1 the free text; the file has a header row.
    var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(dataPath)).AsDynamic;
    // Tokenize "text" into "words", splitting on space and sentence punctuation.
    var est = new WordTokenizer(Env, "text", "words", separators: new[] { ' ', '?', '!', '.', ',' });
    // Keep only the first 4 rows to keep the baseline small.
    var outdata = TakeFilter.Create(Env, est.Fit(data).Transform(data), 4);
    var savedData = new ChooseColumnsTransform(Env, outdata, "words");
    var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
    var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");
    using (var ch = Env.Start("save"))
    {
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }
    // Diff the saved output against the baseline of the same name.
    CheckEquality("Text", "tokenizedWithSeparators.tsv");
    Done();
}
/// <summary>
/// NextToken must consume exactly one word (plus its separator) from the
/// underlying reader, leaving the rest of the stream unread.
/// </summary>
public void NextTokenShouldReadThroughATextReaderOneWordAtATime()
{
    var reader = new StringReader("This is a test");
    var tokenizer = new WordTokenizer(reader);

    var word = tokenizer.NextToken();

    Assert.AreEqual("This", word);
    // AreEqual reports expected/actual on failure; the original
    // Assert.True(reader.Peek() == 'i') only reported true/false.
    // Per the original expectation, "This " (word + separator) has been
    // consumed, so the next unread character is the 'i' of "is".
    Assert.AreEqual('i', (char)reader.Peek());
}
/// <summary>A multi-sentence English paragraph must produce at least one word.</summary>
public void VerifySingleSentence()
{
    const string input = "Basically, this issue is not restricted to NVidia GPUs or specific operating systems - This can be reproduced on Windows, Linux and OSX. Basically the concept of memory safety does not exist in the gpu space - which is the reason why the webgl standard is so strict about always zeroing buffers.";

    var words = WordTokenizer.GetWords(input);

    // Any() checks non-emptiness without materializing the whole list.
    Assert.IsTrue(words.Any());
}
/// <summary>Tokens inside parentheses, like the model number "z13", must survive tokenization.</summary>
public void VerifyComplexSentence()
{
    const string input = "They have a lot more software in the works, such as Cassanda, MongoDB and Spark [1]. It's a push to have good software support for the IBM LinuxONE machine [2], essentially an IBM z Systems (z13) mainframe that will only run Linux on KVM (i.e. without expensive z/OS or z/VM 'legacy')";

    var words = WordTokenizer.GetWords(input);

    Assert.IsTrue(words.Contains("z13"));
}
// Splits a sentence into word-pattern tokens, keeping punctuation tokens
// so callers see the full sentence structure.
static WordPatternInfo[] SeparateWords(string sentence)
{
    var tokenizer = new WordTokenizer(WordTokenizerOptions.ReturnPunctuations);
    return tokenizer.ExtractWords(sentence).ToArray();
}
/// <summary>A null text argument must be rejected with ArgumentNullException.</summary>
public void IsWordContinuationWithNulls()
{
    // Arrange
    var sut = new WordTokenizer();

    // Act / Assert
    Assert.Throws<ArgumentNullException>(() => sut.IsWordContinuation(null, 'a'));
}
/// <summary>
/// Builds the form UI and wires up the NLP pipeline components.
/// </summary>
public Form1()
{
    InitializeComponent();

    // Stop-word list is loaded once at startup from the working directory.
    lines = System.IO.File.ReadAllLines("stopwords.txt");

    // Pipeline pieces — each is independent of the others.
    normalizer = new Normalizer(true, false, false);
    senTokenizer = new SentenceTokenizer();
    wordTokenizer = new WordTokenizer(true);
    tagger = new POSTagger();
}
/// <summary>Walking backwards from offset 10 with a cap of 2 must yield exactly two tokens.</summary>
public void BackwardsTest10MaxLength2()
{
    var sut = new WordTokenizer(message);

    WordTokenCollection result = sut.Backwards(10, 2);

    // NOTE(review): the expected literal shows a single space but the failure
    // message says "3 spaces." — verify whitespace wasn't collapsed in transit.
    Assert.AreEqual("morn", result[0].Message, "morn");
    Assert.AreEqual(" ", result[1].Message, "3 spaces.");
    Assert.AreEqual(2, result.Count, "Count");
}
/// <summary>Alphanumeric gibberish with no separators is still one whole token.</summary>
public void PassGibberishWithoutWordWithoutSpace_ReturnWord()
{
    const string text = "1a2b3c";

    var tokens = new List<WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

    Assert.AreEqual(1, tokens.Count);
    Assert.AreEqual(text, tokens[0].Value);
    Assert.AreEqual(0, tokens[0].Offset);
    Assert.AreEqual(text.Length, tokens[0].Length);
}
/// <summary>A combining mark (U+0301) must stay part of its word's token.</summary>
public void PassWordWithUnicodeSuperscript_ReturnWord()
{
    const string text = "fiance\u0301";

    var tokens = new List<WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

    Assert.AreEqual(1, tokens.Count);
    Assert.AreEqual(text, tokens[0].Value);
    Assert.AreEqual(0, tokens[0].Offset);
    Assert.AreEqual(text.Length, tokens[0].Length);
}
/// <summary>A word-internal hyphen must not split the token.</summary>
public void PassWordWithWordInternalHyphen_ReturnWord()
{
    const string text = "black-bird";

    var tokens = new List<WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

    Assert.AreEqual(1, tokens.Count);
    Assert.AreEqual(text, tokens[0].Value);
    Assert.AreEqual(0, tokens[0].Offset);
    Assert.AreEqual(text.Length, tokens[0].Length);
}
/// <summary>The happy path: a single plain word is one token covering the whole input.</summary>
public void PassSimpleWord_ReturnWord()
{
    const string text = "Hello";

    var tokens = new List<WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

    Assert.AreEqual(1, tokens.Count);
    Assert.AreEqual(text, tokens[0].Value);
    Assert.AreEqual(0, tokens[0].Offset);
    Assert.AreEqual(text.Length, tokens[0].Length);
}
/// <summary>A trailing apostrophe is stripped from the token's value and length.</summary>
public void PassWordWithWordFinalApostrophe_ReturnWord()
{
    const string text = "Jesus'";

    var tokens = new List<WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

    Assert.AreEqual(1, tokens.Count);
    Assert.AreEqual("Jesus", tokens[0].Value);
    Assert.AreEqual(0, tokens[0].Offset);
    Assert.AreEqual(text.Length - 1, tokens[0].Length);
}
/// <summary>
/// Leading and trailing punctuation is dropped, while word-internal
/// punctuation is kept as part of the single token.
/// </summary>
public void PassWordWithWordPreviousInternalFinalPunctuation_ReturnWord()
{
    const string text = "?black,bird.flew!Home?";

    var tokens = new List<WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

    Assert.AreEqual(1, tokens.Count);
    Assert.AreEqual("black,bird.flew!Home", tokens[0].Value);
    // Offset 1 / length - 2: the word-previous and word-final '?' are dropped.
    Assert.AreEqual(1, tokens[0].Offset);
    Assert.AreEqual(text.Length - 2, tokens[0].Length);
}
/// <summary>A letter following a letter continues the current word.</summary>
public void IsWordContinuationWithTextAndLetter()
{
    // Arrange
    var sut = new WordTokenizer();

    // Act
    bool continues = sut.IsWordContinuation("a", 'a');

    // Assert
    Assert.IsTrue(continues);
}
/// <summary>A letter after a period starts a new word, not a continuation.</summary>
public void IsWordContinuationWithPeriodAndLetter()
{
    // Arrange
    var sut = new WordTokenizer();

    // Act
    bool continues = sut.IsWordContinuation(".", 'a');

    // Assert
    Assert.IsFalse(continues);
}
/// <summary>A period after a space is not a word continuation.</summary>
public void IsWordContinuationWithSpaceAndPeriod()
{
    // Arrange
    var sut = new WordTokenizer();

    // Act
    bool continues = sut.IsWordContinuation(" ", '.');

    // Assert
    Assert.IsFalse(continues);
}
/// <summary>
/// Null input must raise ArgumentNullException; enumeration is forced
/// because TokenizeText is lazy and won't validate until iterated.
/// </summary>
public void PassNULL_Throws()
{
    TestDelegate enumerate = () =>
    {
        foreach (WordTokenizer.Token _ in WordTokenizer.TokenizeText(null))
        {
        }
    };

    Assert.Throws<ArgumentNullException>(enumerate);
}
/// <summary>
/// Persian compound (multi-word) verbs must be joined into a single token,
/// while an ordinary adjective + verb pair must stay split.
/// </summary>
public void JoinVerbPartsTest()
{
    WordTokenizer wordTokenizer = new WordTokenizer(true);

    // Local helper replaces five copy-pasted tokenize-and-compare loops;
    // the assertion messages are identical to the originals.
    void AssertTokens(string input, string[] expected)
    {
        string[] actual = wordTokenizer.Tokenize(input).ToArray();
        Assert.AreEqual(expected.Length, actual.Length, "Failed to tokenize words of '" + input + "' sentence");
        for (int i = 0; i < expected.Length; i++)
        {
            Assert.AreEqual(expected[i], actual[i], "Failed to tokenize words of '" + input + "' sentence");
        }
    }

    // Compound verbs stay joined as one token each.
    AssertTokens("خواهد رفت", new string[] { "خواهد رفت" });
    AssertTokens("رفته است", new string[] { "رفته است" });
    AssertTokens("گفته شده است", new string[] { "گفته شده است" });
    AssertTokens("گفته خواهد شد", new string[] { "گفته خواهد شد" });
    // Adjective + verb is not a compound verb and must split.
    AssertTokens("خسته شدید", new string[] { "خسته", "شدید" });
}
/// <summary>From inside a single word, the previous boundary is the word's start.</summary>
public void PreviousOneWord()
{
    // Setup: index 1 sits inside the word "word".
    var sut = new WordTokenizer();
    const string text = "word";
    const int index = 1;

    // Test
    int boundary = sut.GetPreviousWordBoundary(text, index);

    // Assertion: the boundary snaps back to offset 0.
    Assert.AreEqual(0, boundary);
}
/// <summary>With the word list { "", "one" } supplied, GetWords yields exactly "one".</summary>
public void GetWordsWithFilterOut()
{
    var sut = new WordTokenizer(
        "Test",
        new SimpleWordItemFactory(Global.PosTagger, Global.Raw),
        NullSimpleWordPipeline.Instance,
        NullWordItemPipeline.Instance,
        new[] { "", "one" });

    var words = sut.GetWords().ToArray();

    // NOTE(review): the method name says "FilterOut" yet "one" (which is in
    // the list) is expected in the output — verify whether the last argument
    // is a keep-list rather than an exclusion list.
    Assert.AreEqual(1, words.Length);
    Assert.AreEqual("one", words[0]);
}
/// <summary>
/// Replaces spelled-out Farsi numbers in a sentence with their parsed form.
/// Number tokens (positions reported by HumanNumberFind) are accumulated and
/// converted via HumanNumber.ParseFarsi; other tokens pass through unchanged.
/// </summary>
/// <param name="input_s">The sentence to process.</param>
/// <returns>The rebuilt sentence (tokens joined with leading spaces).</returns>
public string ReplaceHumanNum(string input_s)
{
    WordTokenizer wt = new WordTokenizer();
    var l = wt.Tokenize(input_s);

    HumanNumberFind hmf = new HumanNumberFind();
    hmf.findNumberPositions(l);

    // Materialize the token sequence once. The original called l.ToArray()
    // inside every loop iteration, re-copying the whole list each time (O(n^2)).
    var tokens = l.ToArray();

    string santance_buffer = string.Empty;
    string number_buffer = string.Empty;
    int j = 0;

    foreach (int i in hmf.num)
    {
        if (i == 0 && hmf.pos[j] == 0)
        {
            number_buffer += " " + tokens[i];
            j = i + 1;
        }
        else if (i == 0) // not a number token
        {
            if (number_buffer.Trim().Length > 0)
            {
                // Convert the accumulated number words and flush the buffer.
                santance_buffer += " " + HumanNumber.ParseFarsi(number_buffer.Trim());
                number_buffer = string.Empty;
            }
            santance_buffer += " " + tokens[j];
            j = j + 1;
        }
        else
        {
            number_buffer += " " + tokens[i];
            j = i + 1;
        }
    }

    // Flush any number words still buffered at the end of the sentence.
    if (number_buffer.Trim().Length > 0)
    {
        santance_buffer += " " + HumanNumber.ParseFarsi(number_buffer.Trim());
        number_buffer = string.Empty;
    }

    return santance_buffer;
}
/// <summary>U+200B (zero-width space) must act as a word separator.</summary>
public void PassMultipleSimpleWordsSeperatedByZeroWisthSpace_ReturnMultipleWords()
{
    const string text = "Jesus\u200bwept";

    var tokens = new List<WordTokenizer.Token>(WordTokenizer.TokenizeText(text));

    Assert.AreEqual(2, tokens.Count);
    Assert.AreEqual("Jesus", tokens[0].Value);
    Assert.AreEqual(0, tokens[0].Offset);
    Assert.AreEqual(5, tokens[0].Length);
    // The second token starts after the zero-width space (offset 6).
    Assert.AreEqual("wept", tokens[1].Value);
    Assert.AreEqual(6, tokens[1].Offset);
    Assert.AreEqual(4, tokens[1].Length);
}
/// <summary>
/// Builds the analysis chain for a field: sentence splitting, word
/// segmentation, Porter stemming, and (optionally) stop-word removal.
/// </summary>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream result = new SentenceTokenizer(reader);
    result = new WordTokenizer(result, wordSegment);
    // result = new LowerCaseFilter(result);
    // LowerCaseFilter is no longer needed: SegTokenFilter already converts
    // all English characters to lower case.
    // Porter stemming is quite strict — this is not a bug, it's a feature :)
    result = new PorterStemFilter(result);
    if (stopWords != null)
    {
        result = new StopFilter(true, result, StopFilter.MakeStopSet(stopWords), false);
    }
    return result;
}
/// <summary>
/// Without verb joining, parentheses and the trailing "!!!" must come out
/// as their own tokens alongside the words.
/// </summary>
public void TokenizeTest()
{
    WordTokenizer sut = new WordTokenizer(false);

    string input = "این جمله (خیلی) پیچیده نیست!!!";
    string[] expected = { "این", "جمله", "(", "خیلی", ")", "پیچیده", "نیست", "!!!" };

    string[] actual = sut.Tokenize(input).ToArray();

    string failure = "Failed to tokenize words of '" + input + "' sentence";
    Assert.AreEqual(expected.Length, actual.Length, failure);
    for (int i = 0; i < expected.Length; i++)
    {
        Assert.AreEqual(expected[i], actual[i], failure);
    }
}
/// <summary>Four space-separated words must come back as four tokens, in order.</summary>
public void TokenizerShouldTokenizeWordsBySpaces()
{
    var tokenizer = new WordTokenizer("This is a test");

    var tokens = tokenizer.GetTokens().ToArray();

    Assert.AreEqual(new[] { "This", "is", "a", "test" }, tokens);
}
// Splits a sentence into word-pattern tokens; punctuation is returned as
// tokens too, so the caller sees the complete sentence pattern.
static WordPatternInfo[] SeparateWords(string sentence)
{
    WordTokenizer tokenizer = new WordTokenizer(WordTokenizerOptions.ReturnPunctuations);
    WordPatternInfo[] patterns = tokenizer.ExtractWords(sentence).ToArray();
    return patterns;
}