/// <summary>
/// Builds a word-frequency table from <c>_intermediateTextContainer</c>: strips the
/// listed stop words and punctuation, counts the remaining whitespace-separated tokens,
/// keeps those occurring more than once, then binds the result to <c>dataGridView1</c>,
/// stores it in <c>_intermediateWordContainer</c> and passes it to
/// <c>MongoDBHandler.SaveWordCollection</c>.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private async void button2_Click(object sender, EventArgs e)
{
    var text = _intermediateTextContainer;
    if (text == null)
    {
        // No text to analyse; Regex.Replace would throw ArgumentNullException on null input.
        return;
    }

    // Define and remove 'operator' (stop) words.
    // NOTE(review): "თმცა" looks like a typo of "თუმცა" (which is also listed) - confirm before removing.
    var operatorWords = new[]
    {
        "და", "რომ", "არ", "წლის", "შემდეგ", "კი", "თმცა", "ამ", "ეს", "მისი", "რომელიც",
        "უნდა", "რაც", "მაგრამ", "მას", "ის", "როგორც", "ვერ", "თუ", "მხოლოდ", "თუმცა",
        "იყო", "ძალიან", "აქვს", "რა", "ამის", "უფრო", "იმ", "ასე", "ასეთი", "ან"
    };

    // One whole-word alternation instead of one regex pass per stop word. Each match is
    // delimited by \b on both sides, so removing one word can never create a new match
    // for another; the outcome equals the sequential per-word replacement.
    var stopWordPattern = @"\b(?:" + string.Join("|", operatorWords.Select(Regex.Escape)) + @")\b";
    text = Regex.Replace(text, stopWordPattern, string.Empty);

    // Punctuation is deleted outright (not replaced by a space), as before.
    var punctuation = new[] { ".", ",", ":", "!", "?", "%", "(", ")", "=", "-" };
    foreach (var mark in punctuation)
    {
        text = text.Replace(mark, string.Empty);
    }

    // A null separator array makes Split break on every white-space character, so tabs
    // and line breaks delimit words too. Splitting on " " alone left tokens such as
    // "\r\n" or "foo\nbar" in the table.
    var tokens = text.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

    // Words that occur only once are dropped, since they cannot influence the ML
    // algorithm in a positive way.
    // TODO: for the rest of the words, make sure they are distributed in at least 2 articles.
    var result = tokens
        .GroupBy(token => token)
        .Select(grp => new Word { WordName = grp.Key, Occurence = grp.Count() })
        .Where(word => word.Occurence > 1)
        .OrderByDescending(word => word.Occurence)
        // NOTE(review): the top-ranked entry is discarded, exactly as the original did;
        // the reason is not visible in this method - confirm it is still wanted.
        .Skip(1)
        .ToList();

    dataGridView1.DataSource = result;
    _intermediateWordContainer = result;

    await MongoDBHandler.SaveWordCollection(result);
}