/// <summary>
/// Click handler that lets the user pick one or more word-list files and passes the
/// words of each file to <c>_tc.LoadLexicalizedWords</c>.
/// </summary>
/// <remarks>
/// For every selected file the file name without its extension is used as the theme,
/// and the current text of <c>comboBoxThemeGroup</c> is passed as the theme group.
/// Each line is normalized with <c>_tc.NormilizeText</c>, trimmed and stemmed; lines
/// that still contain a space after trimming are skipped.
/// </remarks>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event arguments (not used).</param>
private void buttonLoadLexicalizedWords_Click(object sender, EventArgs e)
{
    var stemmer = new Stemming();

    // The dialog is disposable; release it deterministically.
    using (var ofd = new OpenFileDialog { Multiselect = true })
    {
        if (ofd.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        foreach (var filename in ofd.FileNames)
        {
            var theme = Path.GetFileNameWithoutExtension(filename);
            var words = new List<string>();

            // `using` guarantees the file is closed even when normalization or
            // stemming throws part-way through the file.
            using (var sr = new StreamReader(filename, Encoding.Default))
            {
                while (!sr.EndOfStream)
                {
                    var word = _tc.NormilizeText(sr.ReadLine()).Trim();

                    // Only single words are accepted; multi-word lines are ignored.
                    if (word.IndexOf(' ') >= 0)
                    {
                        continue;
                    }

                    words.Add(stemmer.Stem(word));
                }
            }

            _tc.LoadLexicalizedWords(comboBoxThemeGroup.Text, theme, words);
        }
    }
}
/// <summary>
/// Training function (for subsequent classification of texts into groups).
/// </summary>
/// <remarks>
/// Replaces all state of the previous training run, computes per-group word weights
/// from the supplied texts, then adjusts them with the word/theme exclusions
/// (<c>WordNotInTheme</c>) and additions (<c>WordInTheme</c>) stored in the database
/// for the given theme group. The intermediate dictionaries are replaced by empty
/// ones before the method returns.
/// </remarks>
/// <param name="themeGroupName">
/// Name of the theme group. It is looked up with <c>Single</c>, so exactly one
/// <c>ThemeGroup</c> row must carry this name; its id is stored in <c>_themeGroupId</c>.
/// </param>
/// <param name="groupsTexts">Training data as pairs: group name -> text belonging to that group.</param>
/// <param name="callback">Optional action invoked once after training has finished.</param>
public void Learn(string themeGroupName, Dictionary<string, string> groupsTexts, Action callback = null)
{
    // Reset the results of the previous training run.
    _groupsTexts = new Dictionary<string, string>();
    foreach (var g in groupsTexts)
    {
        _groupsTexts.Add(g.Key, NormilizeText(g.Value));
    }

    _wordsCountInGroups = new Dictionary<string, Dictionary<string, Int64>>();
    _summaryWordsCountInGroup = new Dictionary<string, Int64>();
    _summaryWordsCountInGroupDistinct = new Dictionary<string, Int64>();
    _wordsWeigthInGroups = new Dictionary<string, Dictionary<string, double>>();
    _wordsWeigthsInGroupsIncludingOtherGroups = new Dictionary<string, Dictionary<string, double>>();
    _wordsWeigthsInGroupsIncludingOtherGroupsRemoved = new Dictionary<string, Dictionary<string, double>>();
    _stemOfWord = new Dictionary<string, List<string>>();
    _themesOfWord = new Dictionary<string, List<string>>();
    // End of reset.

    var stemming = new Stemming(); // produces the stem of a word

    using (var db = new TextClassificatorEntities())
    {
        _themeGroupId = db.ThemeGroup.Single(tg => tg.ThemeGroup_name == themeGroupName).ThemeGroup_id;

        // Count stems per group.
        foreach (var text in _groupsTexts)
        {
            var words = text.Value.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            var wordsStem = new List<string>();
            foreach (var w in words)
            {
                var stem = stemming.Stem(w);

                // Remember every surface form that produced this stem.
                List<string> forms;
                if (!_stemOfWord.TryGetValue(stem, out forms))
                {
                    forms = new List<string>();
                    _stemOfWord.Add(stem, forms);
                }

                wordsStem.Add(stem);
                if (!forms.Contains(w))
                {
                    forms.Add(w);
                }
            }

            wordsStem = wordsStem.Where(w => !string.IsNullOrWhiteSpace(w) && w.Length >= MinimumWordLength).ToList();
            wordsStem.RemoveAll(w => StopWords.Any(sw => sw == w));

            // Materialized once: the grouping is consumed twice below.
            var groupWords = wordsStem.GroupBy(w => w).ToList();
            foreach (var w in groupWords)
            {
                List<string> themes;
                if (!_themesOfWord.TryGetValue(w.Key, out themes))
                {
                    themes = new List<string>();
                    _themesOfWord.Add(w.Key, themes);
                }

                themes.Add(text.Key);
            }

            // Only stems that occur at least MinimumWordCount times take part in weighting.
            var dictionaryWords = groupWords
                .Where(gw => gw.Count() >= MinimumWordCount)
                .ToDictionary(gw => gw.Key, gw => gw.LongCount());

            _summaryWordsCountInGroup.Add(text.Key, dictionaryWords.Sum(dw => dw.Value));
            _summaryWordsCountInGroupDistinct.Add(text.Key, dictionaryWords.LongCount());
            _wordsCountInGroups.Add(text.Key, dictionaryWords);
        }

        // Weight of every stem inside its own group.
        foreach (var g in _wordsCountInGroups)
        {
            var totalCount = _summaryWordsCountInGroup[g.Key];
            var distinctCount = _summaryWordsCountInGroupDistinct[g.Key];

            var temp = new Dictionary<string, double>();
            foreach (var w in g.Value)
            {
                temp.Add(w.Key, Weight(w.Value, totalCount, distinctCount));
            }

            _wordsWeigthInGroups.Add(g.Key, temp);
        }

        // For simplicity a regrouping is needed: from here on it is more convenient
        // to key by word at the top level instead of by group.
        // Layout: word -> (group name -> weight of the word in that group).
        var wordsWeigthInGroupsTranspose = new Dictionary<string, Dictionary<string, double>>();
        foreach (var wg in _wordsWeigthInGroups)
        {
            foreach (var w in wg.Value)
            {
                Dictionary<string, double> weightsByGroup;
                if (!wordsWeigthInGroupsTranspose.TryGetValue(w.Key, out weightsByGroup))
                {
                    weightsByGroup = new Dictionary<string, double>();
                    wordsWeigthInGroupsTranspose.Add(w.Key, weightsByGroup);
                }

                weightsByGroup.Add(wg.Key, w.Value);
            }
        }

        var groupsCount = _groupsTexts.LongCount(); // loop-invariant
        foreach (var w in wordsWeigthInGroupsTranspose)
        {
            _wordsWeigthsInGroupsIncludingOtherGroups.Add(w.Key, WeightIncludedOtherGroup(w.Value, groupsCount));
        }

        // Drop words that ended up without any group.
        _wordsWeigthsInGroupsIncludingOtherGroups = _wordsWeigthsInGroupsIncludingOtherGroups
            .Where(w => w.Value.Count > 0)
            .ToDictionary(w => w.Key, w => w.Value);

        // Remove from each word the groups it is explicitly excluded from in the database.
        // NOTE(review): this issues one query per word; it is left as is because moving
        // the name comparison out of the database could change matching (collation) - confirm
        // before batching.
        foreach (var w in _wordsWeigthsInGroupsIncludingOtherGroups)
        {
            var wordName = w.Key;
            var weights = w.Value;

            var delGroups = (
                from word in db.Word
                join wit in db.WordNotInTheme on word.Word_id equals wit.Word_id
                join g in db.Theme on wit.Theme_id equals g.Theme_id
                join gg in db.ThemeGroup on g.ThemeGroup_id equals gg.ThemeGroup_id
                where gg.ThemeGroup_id == _themeGroupId
                where word.Word_name == wordName
                select g.Theme_name
            ).ToList();

            foreach (var g in delGroups)
            {
                double removedWeight;
                if (!weights.TryGetValue(g, out removedWeight))
                {
                    continue;
                }

                // Keep the removed weight before deleting it from the active weights.
                Dictionary<string, double> removed;
                if (!_wordsWeigthsInGroupsIncludingOtherGroupsRemoved.TryGetValue(wordName, out removed))
                {
                    removed = new Dictionary<string, double>();
                    _wordsWeigthsInGroupsIncludingOtherGroupsRemoved.Add(wordName, removed);
                }

                removed.Add(g, removedWeight);
                weights.Remove(g);
            }
        }

        // Words explicitly assigned to themes in the database.
        // Materialized up front: the loop below runs another query on the same context
        // for every row, which must not overlap with a still-open result set.
        var addWordGroup = (
            from w in db.Word
            join wig in db.WordInTheme on w.Word_id equals wig.Word_id
            join g in db.Theme on wig.Theme_id equals g.Theme_id
            join gg in db.ThemeGroup on g.ThemeGroup_id equals gg.ThemeGroup_id
            where gg.ThemeGroup_id == _themeGroupId
            select new
            {
                id = w.Word_id,
                name = w.Word_name,
                theme = g.Theme_name,
                isLexicalizedWord = wig.WordInTheme_isLexicalizedWord
            }
        ).ToList();

        foreach (var wg in addWordGroup)
        {
            List<string> themes;
            if (!_themesOfWord.TryGetValue(wg.name, out themes))
            {
                themes = new List<string>();
                _themesOfWord.Add(wg.name, themes);
            }

            if (!themes.Contains(wg.theme))
            {
                themes.Add(wg.theme);
            }

            // A theme known only from the database still has to be a known group.
            if (!_groupsTexts.ContainsKey(wg.theme))
            {
                _groupsTexts.Add(wg.theme, string.Empty);
            }

            Dictionary<string, double> weights;
            if (!_wordsWeigthsInGroupsIncludingOtherGroups.TryGetValue(wg.name, out weights))
            {
                weights = new Dictionary<string, double>();
                _wordsWeigthsInGroupsIncludingOtherGroups.Add(wg.name, weights);
            }

            var addWeight = wg.isLexicalizedWord ? AdditionalDictionaryWeight : AdditionalWeight;

            // A missing entry starts at 0 (TryGetValue leaves `currentWeight` at its default).
            double currentWeight;
            weights.TryGetValue(wg.theme, out currentWeight);
            weights[wg.theme] = currentWeight + addWeight;

            List<string> forms;
            if (!_stemOfWord.TryGetValue(wg.name, out forms))
            {
                forms = new List<string>();
                _stemOfWord.Add(wg.name, forms);
            }

            var wordId = wg.id;
            var allolog = db.Allolog.FirstOrDefault(a => a.Word_id == wordId);

            // NOTE(review): `With` is a project extension; presumably it yields wg.name
            // when no allolog row exists - confirm against its definition.
            var allolog_name = allolog.With(a => a.Allolog_name, wg.name);
            if (!forms.Contains(allolog_name))
            {
                forms.Add(allolog_name);
            }
        }
    }

    // The intermediate data is not kept after training; replace it with empty dictionaries.
    _wordsCountInGroups = new Dictionary<string, Dictionary<string, Int64>>();
    _summaryWordsCountInGroup = new Dictionary<string, Int64>();
    _summaryWordsCountInGroupDistinct = new Dictionary<string, Int64>();
    _wordsWeigthInGroups = new Dictionary<string, Dictionary<string, double>>();
    _wordsWeigthsInGroupsIncludingOtherGroupsRemoved = new Dictionary<string, Dictionary<string, double>>();

    // Explicit collection retained from the original implementation.
    GC.Collect();

    if (callback != null)
    {
        callback();
    }
}
/// <summary>
/// Click handler that lets the user pick one or more word-list files and passes the
/// words of each file to <c>_tc.LoadLexicalizedWords</c>.
/// </summary>
/// <remarks>
/// For every selected file the file name without its extension is used as the theme,
/// and the current text of <c>comboBoxThemeGroup</c> is passed as the theme group.
/// Each line is normalized with <c>_tc.NormilizeText</c>, trimmed and stemmed; lines
/// that still contain a space after trimming are skipped.
/// </remarks>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event arguments (not used).</param>
private void buttonLoadLexicalizedWords_Click(object sender, EventArgs e)
{
    var stemmer = new Stemming();

    // The dialog is disposable; release it deterministically.
    using (var ofd = new OpenFileDialog { Multiselect = true })
    {
        if (ofd.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        foreach (var filename in ofd.FileNames)
        {
            var theme = Path.GetFileNameWithoutExtension(filename);
            var words = new List<string>();

            // `using` guarantees the file is closed even when normalization or
            // stemming throws part-way through the file.
            using (var sr = new StreamReader(filename, Encoding.Default))
            {
                while (!sr.EndOfStream)
                {
                    var word = _tc.NormilizeText(sr.ReadLine()).Trim();

                    // Only single words are accepted; multi-word lines are ignored.
                    if (word.IndexOf(' ') >= 0)
                    {
                        continue;
                    }

                    words.Add(stemmer.Stem(word));
                }
            }

            _tc.LoadLexicalizedWords(comboBoxThemeGroup.Text, theme, words);
        }
    }
}
/// <summary>
/// Classifies a text.
/// </summary>
/// <remarks>
/// The text is normalized, split on spaces and stemmed; stems shorter than
/// <c>MinimumWordLength</c> are ignored. Every occurrence of a known stem adds that
/// stem's weight to the score of each group the stem is associated with. Per-group
/// statistics of the matched stems (weight and occurrence count) are stored in
/// <c>_lastStatisticsWeight</c> and <c>_lastStatisticsCount</c>.
/// </remarks>
/// <param name="text">The text to classify.</param>
/// <returns>
/// Pairs of group name -> degree to which the text belongs to that group, inserted in
/// descending order of score (equal scores keep the order of the known groups);
/// <c>null</c> when no word weights are available.
/// </returns>
public Dictionary<string, double> Check(string text)
{
    if (_wordsWeigthsInGroupsIncludingOtherGroups == null)
    {
        return null;
    }

    var lastStatisticsWeightGlobal = new Dictionary<string, Dictionary<string, double>>();
    var lastStatisticsCountGlobal = new Dictionary<string, Dictionary<string, Int64>>();
    var answer = new Dictionary<string, double>();

    text = text.Replace(Environment.NewLine, " ");
    text = NormilizeText(text);

    var stemming = new Stemming(); // produces the stem of a word
    var words = text
        .Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
        .Select(w => stemming.Stem(w))
        .Where(w => !string.IsNullOrWhiteSpace(w) && w.Length >= MinimumWordLength);

    // Every known group is reported, even when nothing in the text matches it.
    foreach (var knownGroup in _groupsTexts)
    {
        answer.Add(knownGroup.Key, 0);
    }

    foreach (var word in words)
    {
        Dictionary<string, double> groupWeights;
        if (!_wordsWeigthsInGroupsIncludingOtherGroups.TryGetValue(word, out groupWeights))
        {
            continue;
        }

        foreach (var groupWeight in groupWeights)
        {
            // The weight and count statistics are always created together per group.
            Dictionary<string, double> weightStats;
            Dictionary<string, Int64> countStats;
            if (lastStatisticsWeightGlobal.TryGetValue(groupWeight.Key, out weightStats))
            {
                countStats = lastStatisticsCountGlobal[groupWeight.Key];
            }
            else
            {
                weightStats = new Dictionary<string, double>();
                countStats = new Dictionary<string, Int64>();
                lastStatisticsWeightGlobal.Add(groupWeight.Key, weightStats);
                lastStatisticsCountGlobal.Add(groupWeight.Key, countStats);
            }

            if (!weightStats.ContainsKey(word))
            {
                weightStats.Add(word, groupWeight.Value);
                countStats.Add(word, 0);
            }

            answer[groupWeight.Key] += groupWeight.Value;
            countStats[word]++;
        }
    }

    // Highest score first. OrderByDescending is a stable sort, so groups with equal
    // scores keep their original relative order.
    var orderAnswer = new Dictionary<string, double>();
    foreach (var scored in answer.OrderByDescending(a => a.Value))
    {
        orderAnswer.Add(scored.Key, scored.Value);
    }

    // Expose the statistics in the same order as the answer, for matched groups only.
    var lastStatisticsWeight = new Dictionary<string, Dictionary<string, double>>();
    var lastStatisticsCount = new Dictionary<string, Dictionary<string, Int64>>();
    foreach (var ans in orderAnswer)
    {
        Dictionary<string, double> weightStats;
        if (lastStatisticsWeightGlobal.TryGetValue(ans.Key, out weightStats))
        {
            lastStatisticsWeight.Add(ans.Key, weightStats);
            lastStatisticsCount.Add(ans.Key, lastStatisticsCountGlobal[ans.Key]);
        }
    }

    _lastStatisticsWeight = lastStatisticsWeight;
    _lastStatisticsCount = lastStatisticsCount;

    return orderAnswer;
}