示例#1
0
        /// <summary>
        /// Builds a word-frequency table from <c>_intermediateTextContainer</c>: removes the
        /// stop words and punctuation listed below, counts the remaining whitespace-separated
        /// tokens, keeps those that occur more than once, binds the table to
        /// <c>dataGridView1</c>, stores it in <c>_intermediateWordContainer</c> and passes it
        /// to <c>MongoDBHandler.SaveWordCollection</c>.
        /// </summary>
        /// <param name="sender">Source of the click event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private async void button2_Click(object sender, EventArgs e)
        {
            var text = _intermediateTextContainer;

            // Regex.Replace throws ArgumentNullException on a null input; with no text loaded
            // there is nothing to analyse, so leave the grid and the stored collection alone.
            if (text == null)
            {
                return;
            }

            // 'Operator' (stop) words that carry no signal and are removed before counting.
            var stopWords = new[]
            {
                "და",
                "რომ",
                "არ",
                "წლის",
                "შემდეგ",
                "კი",
                "თმცა",
                "ამ",
                "ეს",
                "მისი",
                "რომელიც",
                "უნდა",
                "რაც",
                "მაგრამ",
                "მას",
                "ის",
                "როგორც",
                "ვერ",
                "თუ",
                "მხოლოდ",
                "თუმცა",
                "იყო",
                "ძალიან",
                "აქვს",
                "რა",
                "ამის",
                "უფრო",
                "იმ",
                "ასე",
                "ასეთი",
                "ან"
            };

            // One alternation anchored with \b on both sides removes whole words only, in a
            // single pass over the text. Each word is escaped so that it is always matched
            // literally, even if an entry containing regex metacharacters is added later.
            var stopWordPattern = @"\b(?:"
                                  + string.Join("|", stopWords.Select(w => Regex.Escape(w)))
                                  + @")\b";
            text = Regex.Replace(text, stopWordPattern, string.Empty);

            // Punctuation is deleted (not replaced by a space), so "a-b" becomes "ab".
            var punctuation = new[] { '.', ',', ':', '!', '?', '%', '(', ')', '=', '-' };
            text = new string(text.Where(c => Array.IndexOf(punctuation, c) < 0).ToArray());

            // A null separator makes Split break on every white-space character, so words
            // separated by newlines or tabs are counted individually instead of being glued
            // together; no token can then be a bare "\n" or " ".
            var tokens = text.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

            // Words with a single occurrence are dropped, since they cannot influence the
            // ML algorithm in a positive way.
            // TODO: for the rest of the words, make sure they are distributed in at least 2 articles.
            // NOTE(review): Skip(1) discards the most frequent entry; the reason is not visible
            // here (possibly a leftover junk token) - confirm it is still wanted.
            var result = tokens
                .GroupBy(token => token)
                .Select(grp => new Word
                {
                    WordName  = grp.Key,
                    Occurence = grp.Count()
                })
                .Where(word => word.Occurence > 1)
                .OrderByDescending(word => word.Occurence)
                .Skip(1)
                .ToList();

            dataGridView1.DataSource = result;

            _intermediateWordContainer = result;

            await MongoDBHandler.SaveWordCollection(result);
        }