/// <summary>
/// Registers the given words as lexicalized words of a theme inside a theme group.
/// </summary>
/// <remarks>
/// The theme is created in the theme group when it is not present there yet, and words missing
/// from the Word table are inserted. For every word that is not listed in WordNotInTheme for the
/// theme, an existing candidate link and an existing WordInTheme link are deleted and a new
/// WordInTheme link with <c>WordInTheme_isLexicalizedWord = true</c> is stored.
/// </remarks>
/// <param name="themeGroup">Name of the theme group; exactly one ThemeGroup row must match.</param>
/// <param name="theme">Name of the theme inside that group.</param>
/// <param name="words">Words to register; blank entries are ignored.</param>
/// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
public void LoadLexicalizedWords(string themeGroup, string theme, List<string> words)
{
    if (words == null)
        throw new ArgumentNullException("words");

    // Blank entries are dropped before normalization (NormilizeText never sees them, as before)
    // and once more afterwards: normalization can produce blank or duplicate values, which
    // must not be stored as words or processed twice.
    words = words
        .Where(w => !string.IsNullOrWhiteSpace(w))
        .Select(w => NormilizeText(w))
        .Where(w => !string.IsNullOrWhiteSpace(w))
        .Distinct()
        .ToList();

    using (var db = new TextClassificatorEntities())
    {
        _themeGroupId = db.ThemeGroup.Single(tg => tg.ThemeGroup_name == themeGroup).ThemeGroup_id;

        // The existence check must be scoped to the theme group, exactly like the lookup below;
        // checking the name alone skipped creation when another group already used the name,
        // and the subsequent Single() then failed.
        if (!db.Theme.Any(th => th.Theme_name == theme && th.ThemeGroup_id == _themeGroupId))
        {
            db.Theme.AddObject(new Theme
            {
                ThemeGroup_id = _themeGroupId,
                Theme_name = theme,
                Theme_caption = theme,
                System_InsDT = DateTime.Now
            });
            db.SaveChanges();
        }

        var themeObj = db.Theme.Single(th => th.Theme_name == theme && th.ThemeGroup_id == _themeGroupId);
        var themeId = themeObj.Theme_id;

        // Insert the words that are not stored yet.
        var newWords = words
            .Except(db.Word.Select(w => w.Word_name))
            .Select(w => new Word { Word_name = w, System_InsDT = DateTime.Now })
            .ToList();
        newWords.ForEach(w => db.Word.AddObject(w));
        db.SaveChanges();

        foreach (var word in words)
        {
            var wordId = db.Word.Single(w => w.Word_name == word).Word_id;

            // Words explicitly excluded from the theme are left untouched.
            if (db.WordNotInTheme.Any(w => w.Theme_id == themeId && w.Word_id == wordId))
                continue;

            var candidate = db.WordCandidatesInTheme.SingleOrDefault(w => w.Theme_id == themeId && w.Word_id == wordId);
            if (candidate != null)
            {
                db.WordCandidatesInTheme.DeleteObject(candidate);
                db.SaveChanges();
            }

            // An existing link is deleted and saved before the replacement is added,
            // so the delete and the insert are never part of the same SaveChanges call.
            var existingLink = db.WordInTheme.SingleOrDefault(w => w.Theme_id == themeId && w.Word_id == wordId);
            if (existingLink != null)
            {
                db.WordInTheme.DeleteObject(existingLink);
                db.SaveChanges();
            }

            db.WordInTheme.AddObject(new WordInTheme
            {
                Word_id = wordId,
                Theme_id = themeId,
                WordInTheme_isLexicalizedWord = true,
                System_InsDT = DateTime.Now
            });
            db.SaveChanges();
        }
    }
}
/// <summary>
/// Trains the classifier on the supplied texts so that texts can later be classified by group.
/// </summary>
/// <remarks>
/// Every text is normalized with NormilizeText, split on spaces and stemmed. Stems shorter than
/// MinimumWordLength or contained in StopWords are dropped, and stems occurring fewer than
/// MinimumWordCount times in a text are not counted. Weights are computed with Weight and
/// WeightIncludedOtherGroup, themes listed in WordNotInTheme are removed from a word, and the
/// words linked through WordInTheme receive AdditionalDictionaryWeight or AdditionalWeight.
/// The intermediate count and weight dictionaries are emptied again before the method returns.
/// </remarks>
/// <param name="themeGroupName">Name of the theme group; exactly one ThemeGroup row must match.</param>
/// <param name="groupsTexts">Training data: group name mapped to the text that belongs to the group.</param>
/// <param name="callback">Optional action invoked once after training has finished.</param>
/// <exception cref="ArgumentNullException"><paramref name="groupsTexts"/> is null.</exception>
public void Learn(string themeGroupName, Dictionary<string, string> groupsTexts, Action callback = null)
{
    if (groupsTexts == null)
        throw new ArgumentNullException("groupsTexts");

    // Reset the state of the previous training run.
    _groupsTexts = new Dictionary<string, string>();
    foreach (var source in groupsTexts)
        _groupsTexts.Add(source.Key, NormilizeText(source.Value));
    _wordsCountInGroups = new Dictionary<string, Dictionary<string, Int64>>();
    _summaryWordsCountInGroup = new Dictionary<string, Int64>();
    _summaryWordsCountInGroupDistinct = new Dictionary<string, Int64>();
    _wordsWeigthInGroups = new Dictionary<string, Dictionary<string, double>>();
    _wordsWeigthsInGroupsIncludingOtherGroups = new Dictionary<string, Dictionary<string, double>>();
    _wordsWeigthsInGroupsIncludingOtherGroupsRemoved = new Dictionary<string, Dictionary<string, double>>();
    _stemOfWord = new Dictionary<string, List<string>>();
    _themesOfWord = new Dictionary<string, List<string>>();
    // End of the reset.

    var stemming = new Stemming(); // produces the stem of a word

    using (var db = new TextClassificatorEntities())
    {
        _themeGroupId = db.ThemeGroup.Single(tg => tg.ThemeGroup_name == themeGroupName).ThemeGroup_id;

        // Count the stems of every group text.
        foreach (var text in _groupsTexts)
        {
            var words = text.Value.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            var wordsStem = new List<string>();
            foreach (var rawWord in words)
            {
                var stem = stemming.Stem(rawWord);

                // Remember every source word that produced the stem.
                List<string> sourceWords;
                if (!_stemOfWord.TryGetValue(stem, out sourceWords))
                {
                    sourceWords = new List<string>();
                    _stemOfWord.Add(stem, sourceWords);
                }
                wordsStem.Add(stem);
                if (!sourceWords.Contains(rawWord))
                    sourceWords.Add(rawWord);
            }

            wordsStem = wordsStem.Where(w => !string.IsNullOrWhiteSpace(w) && w.Length >= MinimumWordLength).ToList();
            wordsStem.RemoveAll(w => StopWords.Any(sw => sw == w));

            // Materialized once: the grouping is consumed twice below.
            var groupWords = wordsStem.GroupBy(w => w).ToList();
            foreach (var stemGroup in groupWords)
            {
                List<string> themes;
                if (!_themesOfWord.TryGetValue(stemGroup.Key, out themes))
                {
                    themes = new List<string>();
                    _themesOfWord.Add(stemGroup.Key, themes);
                }
                themes.Add(text.Key);
            }

            var dictionaryWords = (from gw in groupWords
                                   where gw.Count() >= MinimumWordCount
                                   select gw).ToDictionary(gw => gw.Key, gw => gw.LongCount());
            _summaryWordsCountInGroup.Add(text.Key, dictionaryWords.Sum(dw => dw.Value));
            _summaryWordsCountInGroupDistinct.Add(text.Key, dictionaryWords.LongCount());
            _wordsCountInGroups.Add(text.Key, dictionaryWords);
        }

        // Weight of every counted word inside its own group.
        foreach (var groupCounts in _wordsCountInGroups)
        {
            var groupWeights = new Dictionary<string, double>();
            foreach (var wordCount in groupCounts.Value)
            {
                groupWeights.Add(wordCount.Key, Weight(wordCount.Value, _summaryWordsCountInGroup[groupCounts.Key], _summaryWordsCountInGroupDistinct[groupCounts.Key]));
            }
            _wordsWeigthInGroups.Add(groupCounts.Key, groupWeights);
        }

        // Regroup for convenience: from here on the top level is keyed by word, not by group.
        var wordsWeigthInGroupsTranspose = new Dictionary<string, Dictionary<string, double>>(); // word -> (group name -> weight of the word in the group)
        foreach (var groupWeights in _wordsWeigthInGroups)
        {
            foreach (var wordWeight in groupWeights.Value)
            {
                Dictionary<string, double> weightsByGroup;
                if (!wordsWeigthInGroupsTranspose.TryGetValue(wordWeight.Key, out weightsByGroup))
                {
                    weightsByGroup = new Dictionary<string, double>();
                    wordsWeigthInGroupsTranspose.Add(wordWeight.Key, weightsByGroup);
                }
                weightsByGroup.Add(groupWeights.Key, wordWeight.Value);
            }
        }

        foreach (var wordEntry in wordsWeigthInGroupsTranspose)
            _wordsWeigthsInGroupsIncludingOtherGroups.Add(wordEntry.Key, WeightIncludedOtherGroup(wordEntry.Value, _groupsTexts.LongCount()));
        _wordsWeigthsInGroupsIncludingOtherGroups = _wordsWeigthsInGroupsIncludingOtherGroups
            .Where(w => w.Value.Count > 0)
            .ToDictionary(w => w.Key, w => w.Value);

        // Remove from every word the themes it is explicitly excluded from (WordNotInTheme).
        foreach (var w in _wordsWeigthsInGroupsIncludingOtherGroups)
        {
            var delGroups = (
                from word in db.Word
                join wit in db.WordNotInTheme on word.Word_id equals wit.Word_id
                join g in db.Theme on wit.Theme_id equals g.Theme_id
                join gg in db.ThemeGroup on g.ThemeGroup_id equals gg.ThemeGroup_id
                where gg.ThemeGroup_id == _themeGroupId
                where word.Word_name == w.Key
                select g.Theme_name
            ).ToList();

            foreach (var excludedTheme in delGroups)
            {
                double removedWeight;
                if (!w.Value.TryGetValue(excludedTheme, out removedWeight))
                    continue;

                Dictionary<string, double> removedForWord;
                if (!_wordsWeigthsInGroupsIncludingOtherGroupsRemoved.TryGetValue(w.Key, out removedForWord))
                {
                    removedForWord = new Dictionary<string, double>();
                    _wordsWeigthsInGroupsIncludingOtherGroupsRemoved.Add(w.Key, removedForWord);
                }
                removedForWord.Add(excludedTheme, removedWeight);

                // Only the inner dictionary is modified; the outer one being enumerated is not.
                w.Value.Remove(excludedTheme);
            }
        }

        // Words linked to themes of this theme group in the database.
        // Materialized before the loop: the loop issues further queries on the same context,
        // which must not run while this result set is still being read.
        var addWordGroup = (
            from w in db.Word
            join wig in db.WordInTheme on w.Word_id equals wig.Word_id
            join g in db.Theme on wig.Theme_id equals g.Theme_id
            join gg in db.ThemeGroup on g.ThemeGroup_id equals gg.ThemeGroup_id
            where gg.ThemeGroup_id == _themeGroupId
            select new { id = w.Word_id, name = w.Word_name, theme = g.Theme_name, isLexicalizedWord = wig.WordInTheme_isLexicalizedWord }
        ).ToList();

        foreach (var wg in addWordGroup)
        {
            List<string> themesOfWord;
            if (!_themesOfWord.TryGetValue(wg.name, out themesOfWord))
            {
                themesOfWord = new List<string>();
                _themesOfWord.Add(wg.name, themesOfWord);
            }
            if (!themesOfWord.Contains(wg.theme))
                themesOfWord.Add(wg.theme);

            // A theme known only from the database is registered with an empty text.
            if (!_groupsTexts.ContainsKey(wg.theme))
                _groupsTexts.Add(wg.theme, string.Empty);

            Dictionary<string, double> themeWeights;
            if (!_wordsWeigthsInGroupsIncludingOtherGroups.TryGetValue(wg.name, out themeWeights))
            {
                themeWeights = new Dictionary<string, double>();
                _wordsWeigthsInGroupsIncludingOtherGroups.Add(wg.name, themeWeights);
            }

            var addWeight = wg.isLexicalizedWord ? AdditionalDictionaryWeight : AdditionalWeight;
            if (!themeWeights.ContainsKey(wg.theme))
                themeWeights.Add(wg.theme, 0);
            themeWeights[wg.theme] += addWeight;

            List<string> sourceWords;
            if (!_stemOfWord.TryGetValue(wg.name, out sourceWords))
            {
                sourceWords = new List<string>();
                _stemOfWord.Add(wg.name, sourceWords);
            }

            // Second argument of With is the word name itself.
            // NOTE(review): presumably the fallback used when no Allolog row exists — confirm against the With helper.
            var allolog = db.Allolog.FirstOrDefault(a => a.Word_id == wg.id);
            var allolog_name = allolog.With(a => a.Allolog_name, wg.name);
            if (!sourceWords.Contains(allolog_name))
                sourceWords.Add(allolog_name);
        }
    }

    // The intermediate dictionaries are emptied again once training is done.
    _wordsCountInGroups = new Dictionary<string, Dictionary<string, Int64>>();
    _summaryWordsCountInGroup = new Dictionary<string, Int64>();
    _summaryWordsCountInGroupDistinct = new Dictionary<string, Int64>();
    _wordsWeigthInGroups = new Dictionary<string, Dictionary<string, double>>();
    _wordsWeigthsInGroupsIncludingOtherGroupsRemoved = new Dictionary<string, Dictionary<string, double>>();

    // Kept from the original implementation: a collection is forced right after the reset above.
    GC.Collect();

    if (callback != null)
        callback();
}
/// <summary>
/// Exports the results of the current training run to the database (weights are not stored)
/// so that the words can later be grouped manually.
/// </summary>
/// <param name="themeGroupName">Name of the theme group; exactly one ThemeGroup row must match.</param>
public void ExportToDataBase(string themeGroupName)
{
    using (var db = new TextClassificatorEntities())
    {
        _themeGroupId = db.ThemeGroup.Single(tg => tg.ThemeGroup_name == themeGroupName).ThemeGroup_id;

        // 1. Themes of the current training run that the theme group does not contain yet.
        var storedThemeNames = db.Theme.Where(th => th.ThemeGroup_id == _themeGroupId).Select(th => th.Theme_name);
        var themesToAdd = Groups
            .Except(storedThemeNames)
            .Select(th => new Theme { ThemeGroup_id = _themeGroupId, Theme_name = th, Theme_caption = th, System_InsDT = DateTime.Now })
            .ToList();
        foreach (var themeToAdd in themesToAdd)
        {
            db.Theme.AddObject(themeToAdd);
        }
        db.SaveChanges();

        // 2. Words of the current training run that are not stored yet.
        var wordsToAdd = _wordsWeigthsInGroupsIncludingOtherGroups.Keys
            .Except(db.Word.Select(w => w.Word_name))
            .Select(w => new Word { Word_name = w, System_InsDT = DateTime.Now })
            .ToList();
        foreach (var wordToAdd in wordsToAdd)
        {
            db.Word.AddObject(wordToAdd);
        }
        db.SaveChanges();

        // 3. Allologs: the source forms recorded for every word, saved word by word.
        foreach (var wordName in _wordsWeigthsInGroupsIncludingOtherGroups.Keys)
        {
            var sourceForms = _stemOfWord[wordName];
            var storedWord = db.Word.Single(w => w.Word_name == wordName);
            var storedAllologs = db.Allolog.Where(a => a.Word_id == storedWord.Word_id);
            var allologsToAdd = sourceForms
                .Except(storedAllologs.Select(a => a.Allolog_name))
                .Select(a => new Allolog { Allolog_name = a, Word_id = storedWord.Word_id, System_InsDT = DateTime.Now })
                .ToList();
            foreach (var allologToAdd in allologsToAdd)
            {
                db.Allolog.AddObject(allologToAdd);
            }
            db.SaveChanges();
        }

        // 4. Candidate links for words that have no relation to the theme yet.
        foreach (var theme in Groups)
        {
            // Words already related to the theme as candidate, member or excluded word.
            var relatedWords = new List<string>();

            relatedWords.AddRange(
                from th in db.Theme
                join link in db.WordCandidatesInTheme on th.Theme_id equals link.Theme_id
                join wordRow in db.Word on link.Word_id equals wordRow.Word_id
                where th.Theme_name == theme
                where th.ThemeGroup_id == _themeGroupId
                select wordRow.Word_name);

            relatedWords.AddRange(
                from th in db.Theme
                join link in db.WordInTheme on th.Theme_id equals link.Theme_id
                join wordRow in db.Word on link.Word_id equals wordRow.Word_id
                where th.Theme_name == theme
                where th.ThemeGroup_id == _themeGroupId
                select wordRow.Word_name);

            relatedWords.AddRange(
                from th in db.Theme
                join link in db.WordNotInTheme on th.Theme_id equals link.Theme_id
                join wordRow in db.Word on link.Word_id equals wordRow.Word_id
                where th.Theme_name == theme
                where th.ThemeGroup_id == _themeGroupId
                select wordRow.Word_name);

            relatedWords = relatedWords.Distinct().ToList();

            var trainedWords = _wordsWeigthsInGroupsIncludingOtherGroups
                .Where(entry => entry.Value.Any(weight => weight.Key == theme))
                .Select(entry => entry.Key);
            var candidateNames = trainedWords.Except(relatedWords);

            var themeRow = db.Theme.Single(th => th.Theme_name == theme && th.ThemeGroup_id == _themeGroupId);
            foreach (var candidateName in candidateNames)
            {
                var candidateWord = db.Word.Single(w => w.Word_name == candidateName);
                db.WordCandidatesInTheme.AddObject(new WordCandidatesInTheme
                {
                    Word_id = candidateWord.Word_id,
                    Theme_id = themeRow.Theme_id,
                    System_InsDT = DateTime.Now,
                    WordCandidatesInTheme_Visible = true
                });
            }
            db.SaveChanges();
        }
    }
}
/// <summary>
/// Looks up the id of the theme group with the given name.
/// </summary>
/// <param name="themeGroupName">Name of the theme group; exactly one ThemeGroup row must match.</param>
/// <returns>The ThemeGroup_id of the matching row.</returns>
public Int64 GetGroupThemeIdByGroupThemeName(string themeGroupName)
{
    using (var db = new TextClassificatorEntities())
    {
        var themeGroup = db.ThemeGroup.Single(tg => tg.ThemeGroup_name == themeGroupName);
        return themeGroup.ThemeGroup_id;
    }
}
/// <summary>
/// Creates a trained classifier instance (all values are taken by default).
/// </summary>
/// <remarks>
/// Separators and stop words are read from the embedded resources named by SeparatorsFileName
/// and StopWordsFileName, one entry per line. Every "ru-RU" resource whose key starts with '_'
/// is used as the training text of the theme named by the key without its leading underscores.
/// Training runs for the theme group whose ThemeGroup_id is 1.
/// </remarks>
/// <returns>The trained classifier.</returns>
public static TextClassificator CreateLearnedInstance()
{
    var lineBreak = new[] { '\n' };

    var separatorsText = (string)Resource.ResourceManager.GetObject(TextClassificator.SeparatorsFileName);
    var separators = separatorsText.Split(lineBreak, StringSplitOptions.RemoveEmptyEntries).ToList();

    var stopWordsText = (string)Resource.ResourceManager.GetObject(TextClassificator.StopWordsFileName);
    var stopWords = stopWordsText.Split(lineBreak, StringSplitOptions.RemoveEmptyEntries).ToList();

    var instance = new TextClassificator(separators, stopWords);

    var themes = new Dictionary<string, string>();
    var resources = Resource.ResourceManager.GetResourceSet(new System.Globalization.CultureInfo("ru-RU"), true, true);
    foreach (System.Collections.DictionaryEntry file in resources)
    {
        var resourceName = file.Key.ToString();
        if (resourceName[0] != '_')
        {
            continue;
        }

        // Clean-up steps are applied in this fixed order.
        var lowered = file.Value.ToString().ToLower();
        var withoutSeparators = DeleteWords(lowered, separators, false);
        var withoutStopWords = DeleteWords(withoutSeparators, stopWords);
        var cleaned = DeleteDoubleSpaceAndBadSymbols(withoutStopWords);

        themes.Add(resourceName.TrimStart('_'), cleaned);
    }

    using (var db = new TextClassificatorEntities())
    {
        var themeGroupName = db.ThemeGroup.Single(tg => tg.ThemeGroup_id == 1).ThemeGroup_name;
        instance.Learn(themeGroupName, themes);
    }

    return instance;
}