示例#1
0
        /// <summary>
        /// Registers a list of lexicalized (dictionary) words for a theme of a theme group.
        /// </summary>
        /// <remarks>
        /// The theme is created inside the group when it does not exist there yet, and words missing
        /// from the word table are added. Every word is then stored in WordInTheme with the
        /// lexicalized flag set, replacing an existing candidate or WordInTheme row for the same
        /// word/theme pair. Words listed in WordNotInTheme for the theme are skipped.
        /// </remarks>
        /// <param name="themeGroup">Name of an existing theme group.</param>
        /// <param name="theme">Name of the theme; created in the group when missing.</param>
        /// <param name="words">Raw words; blank entries and duplicates are ignored.</param>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
        public void LoadLexicalizedWords(string themeGroup, string theme, List<string> words)
        {
            if (words == null)
                throw new ArgumentNullException("words");

            // Filter and de-duplicate AFTER normalization: NormilizeText changes the text, so two
            // different inputs may end up identical (or blank) only once they are normalized.
            var normalizedWords = words
                .Where(raw => !string.IsNullOrWhiteSpace(raw))
                .Select(raw => NormilizeText(raw))
                .Where(normalized => !string.IsNullOrWhiteSpace(normalized))
                .Distinct()
                .ToList();

            using (var db = new TextClassificatorEntities())
            {
                _themeGroupId = db.ThemeGroup.Single(tg => tg.ThemeGroup_name == themeGroup).ThemeGroup_id;
                var themeGroupId = _themeGroupId;

                // The existence check must be scoped to the group, exactly like the lookup below;
                // otherwise a same-named theme of another group suppresses the insert and the
                // lookup then fails.
                if (!db.Theme.Any(th => th.Theme_name == theme && th.ThemeGroup_id == themeGroupId))
                {
                    db.Theme.AddObject(new Theme { ThemeGroup_id = themeGroupId, Theme_name = theme, Theme_caption = theme, System_InsDT = DateTime.Now });
                    db.SaveChanges();
                }

                var themeId = db.Theme.Single(th => th.Theme_name == theme && th.ThemeGroup_id == themeGroupId).Theme_id;

                // Add the words the word table does not know yet.
                var unknownWords = normalizedWords.Except(db.Word.Select(w => w.Word_name)).ToList();
                foreach (var unknownWord in unknownWords)
                    db.Word.AddObject(new Word { Word_name = unknownWord, System_InsDT = DateTime.Now });
                db.SaveChanges();

                foreach (var word in normalizedWords)
                {
                    var wordId = db.Word.Single(w => w.Word_name == word).Word_id;

                    // A word explicitly excluded from the theme is never re-added.
                    if (db.WordNotInTheme.Any(w => w.Theme_id == themeId && w.Word_id == wordId))
                        continue;

                    // Drop the existing candidate / in-theme rows so the pair is stored exactly once.
                    var candidate = db.WordCandidatesInTheme.SingleOrDefault(w => w.Theme_id == themeId && w.Word_id == wordId);
                    if (candidate != null)
                        db.WordCandidatesInTheme.DeleteObject(candidate);

                    var existing = db.WordInTheme.SingleOrDefault(w => w.Theme_id == themeId && w.Word_id == wordId);
                    if (existing != null)
                        db.WordInTheme.DeleteObject(existing);

                    // Persist the deletions before inserting the replacement row.
                    if (candidate != null || existing != null)
                        db.SaveChanges();

                    db.WordInTheme.AddObject(new WordInTheme { Word_id = wordId, Theme_id = themeId, WordInTheme_isLexicalizedWord = true, System_InsDT = DateTime.Now });
                    db.SaveChanges();
                }
            }
        }
示例#2
0
        /// <summary>
        /// Trains the classifier so that texts can later be classified by group (theme).
        /// </summary>
        /// <remarks>
        /// State left by a previous training run is discarded. Word weights are computed from the
        /// supplied texts and then adjusted with the markup stored in the database for the theme
        /// group: themes listed in WordNotInTheme are removed from a word, and words listed in
        /// WordInTheme receive an additional weight. The intermediate count/weight tables are
        /// cleared again before the method returns.
        /// </remarks>
        /// <param name="themeGroupName">Name of an existing theme group whose database markup is applied.</param>
        /// <param name="groupsTexts">Training data: group name mapped to the text belonging to that group.</param>
        /// <param name="callback">Optional action invoked once training has finished.</param>
        /// <exception cref="ArgumentNullException"><paramref name="groupsTexts"/> is null.</exception>
        public void Learn(string themeGroupName, Dictionary<string, string> groupsTexts, Action callback = null)
        {
            if (groupsTexts == null)
                throw new ArgumentNullException("groupsTexts");

            // Reset everything learned by a previous run.
            _groupsTexts = new Dictionary<string, string>();
            foreach (var groupText in groupsTexts)
                _groupsTexts.Add(groupText.Key, NormilizeText(groupText.Value));
            _wordsCountInGroups = new Dictionary<string, Dictionary<string, Int64>>();
            _summaryWordsCountInGroup = new Dictionary<string, Int64>();
            _summaryWordsCountInGroupDistinct = new Dictionary<string, Int64>();
            _wordsWeigthInGroups = new Dictionary<string, Dictionary<string, double>>();
            _wordsWeigthsInGroupsIncludingOtherGroups = new Dictionary<string, Dictionary<string, double>>();
            _wordsWeigthsInGroupsIncludingOtherGroupsRemoved = new Dictionary<string, Dictionary<string, double>>();
            _stemOfWord = new Dictionary<string, List<string>>();
            _themesOfWord = new Dictionary<string, List<string>>();

            var stemming = new Stemming(); // produces the stem of a word
            using (var db = new TextClassificatorEntities())
            {
                _themeGroupId = db.ThemeGroup.Single(tg => tg.ThemeGroup_name == themeGroupName).ThemeGroup_id;
                var themeGroupId = _themeGroupId;

                // Step 1: count stems per group and remember which word forms produced each stem.
                foreach (var text in _groupsTexts)
                {
                    var rawWords = text.Value.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                    var stems = new List<string>();
                    foreach (var rawWord in rawWords)
                    {
                        var stem = stemming.Stem(rawWord);
                        List<string> formsOfStem;
                        if (!_stemOfWord.TryGetValue(stem, out formsOfStem))
                        {
                            formsOfStem = new List<string>();
                            _stemOfWord.Add(stem, formsOfStem);
                        }
                        stems.Add(stem);
                        if (!formsOfStem.Contains(rawWord))
                            formsOfStem.Add(rawWord);
                    }

                    // Too-short stems and stop words do not take part in the statistics.
                    var keptStems = stems
                        .Where(s => !string.IsNullOrWhiteSpace(s) && s.Length >= MinimumWordLength)
                        .Where(s => !StopWords.Any(sw => sw == s))
                        .ToList();

                    // Materialized because the grouping is walked twice below.
                    var stemGroups = keptStems.GroupBy(s => s).ToList();
                    foreach (var stemGroup in stemGroups)
                    {
                        List<string> themesOfStem;
                        if (!_themesOfWord.TryGetValue(stemGroup.Key, out themesOfStem))
                        {
                            themesOfStem = new List<string>();
                            _themesOfWord.Add(stemGroup.Key, themesOfStem);
                        }
                        themesOfStem.Add(text.Key);
                    }

                    var countsByStem = stemGroups
                        .Where(sg => sg.Count() >= MinimumWordCount)
                        .ToDictionary(sg => sg.Key, sg => sg.LongCount());
                    _summaryWordsCountInGroup.Add(text.Key, countsByStem.Sum(c => c.Value));
                    _summaryWordsCountInGroupDistinct.Add(text.Key, countsByStem.LongCount());
                    _wordsCountInGroups.Add(text.Key, countsByStem);
                }

                // Step 2: weight of every word inside its own group.
                foreach (var groupCounts in _wordsCountInGroups)
                {
                    var totalWords = _summaryWordsCountInGroup[groupCounts.Key];
                    var distinctWords = _summaryWordsCountInGroupDistinct[groupCounts.Key];
                    var weightsInGroup = new Dictionary<string, double>();
                    foreach (var wordCount in groupCounts.Value)
                        weightsInGroup.Add(wordCount.Key, Weight(wordCount.Value, totalWords, distinctWords));
                    _wordsWeigthInGroups.Add(groupCounts.Key, weightsInGroup);
                }

                // Step 3: regroup by word, since from here on it is more convenient to key the data
                // by word than by group: word -> (group name -> weight of the word in that group).
                var weightsByWord = new Dictionary<string, Dictionary<string, double>>();
                foreach (var groupWeights in _wordsWeigthInGroups)
                {
                    foreach (var wordWeight in groupWeights.Value)
                    {
                        Dictionary<string, double> weightsOfWord;
                        if (!weightsByWord.TryGetValue(wordWeight.Key, out weightsOfWord))
                        {
                            weightsOfWord = new Dictionary<string, double>();
                            weightsByWord.Add(wordWeight.Key, weightsOfWord);
                        }
                        weightsOfWord.Add(groupWeights.Key, wordWeight.Value);
                    }
                }

                var groupsCount = _groupsTexts.LongCount();
                foreach (var wordWeights in weightsByWord)
                    _wordsWeigthsInGroupsIncludingOtherGroups.Add(wordWeights.Key, WeightIncludedOtherGroup(wordWeights.Value, groupsCount));
                _wordsWeigthsInGroupsIncludingOtherGroups = _wordsWeigthsInGroupsIncludingOtherGroups
                    .Where(entry => entry.Value.Count > 0)
                    .ToDictionary(entry => entry.Key, entry => entry.Value);

                // Step 4: strip the groups a word has been excluded from (WordNotInTheme),
                // remembering the removed weights separately.
                foreach (var wordEntry in _wordsWeigthsInGroupsIncludingOtherGroups)
                {
                    var wordName = wordEntry.Key;
                    var excludedThemes =
                        (
                            from wordRow in db.Word
                            join exclusion in db.WordNotInTheme on wordRow.Word_id equals exclusion.Word_id
                            join themeRow in db.Theme on exclusion.Theme_id equals themeRow.Theme_id
                            join groupRow in db.ThemeGroup on themeRow.ThemeGroup_id equals groupRow.ThemeGroup_id
                            where groupRow.ThemeGroup_id == themeGroupId
                            where wordRow.Word_name == wordName
                            select themeRow.Theme_name
                        ).ToList();
                    foreach (var excludedTheme in excludedThemes)
                    {
                        double excludedWeight;
                        if (!wordEntry.Value.TryGetValue(excludedTheme, out excludedWeight))
                            continue;

                        Dictionary<string, double> removedForWord;
                        if (!_wordsWeigthsInGroupsIncludingOtherGroupsRemoved.TryGetValue(wordName, out removedForWord))
                        {
                            removedForWord = new Dictionary<string, double>();
                            _wordsWeigthsInGroupsIncludingOtherGroupsRemoved.Add(wordName, removedForWord);
                        }
                        removedForWord.Add(excludedTheme, excludedWeight);
                        wordEntry.Value.Remove(excludedTheme);
                    }
                }

                // Step 5: apply the words attached to themes in the database (WordInTheme).
                // The result is materialized up front: the loop below runs one more query (Allolog)
                // per row, which would otherwise execute while this result set is still being read
                // and fail on connections without Multiple Active Result Sets.
                var attachedWords =
                    (
                        from wordRow in db.Word
                        join inclusion in db.WordInTheme on wordRow.Word_id equals inclusion.Word_id
                        join themeRow in db.Theme on inclusion.Theme_id equals themeRow.Theme_id
                        join groupRow in db.ThemeGroup on themeRow.ThemeGroup_id equals groupRow.ThemeGroup_id
                        where groupRow.ThemeGroup_id == themeGroupId
                        select new { id = wordRow.Word_id, name = wordRow.Word_name, theme = themeRow.Theme_name, isLexicalizedWord = inclusion.WordInTheme_isLexicalizedWord }
                    ).ToList();

                foreach (var attached in attachedWords)
                {
                    List<string> themesOfAttached;
                    if (!_themesOfWord.TryGetValue(attached.name, out themesOfAttached))
                    {
                        themesOfAttached = new List<string>();
                        _themesOfWord.Add(attached.name, themesOfAttached);
                    }
                    if (!themesOfAttached.Contains(attached.theme))
                        themesOfAttached.Add(attached.theme);

                    // A theme known only from the database still has to exist as a group.
                    if (!_groupsTexts.ContainsKey(attached.theme))
                        _groupsTexts.Add(attached.theme, string.Empty);

                    Dictionary<string, double> weightsOfAttached;
                    if (!_wordsWeigthsInGroupsIncludingOtherGroups.TryGetValue(attached.name, out weightsOfAttached))
                    {
                        weightsOfAttached = new Dictionary<string, double>();
                        _wordsWeigthsInGroupsIncludingOtherGroups.Add(attached.name, weightsOfAttached);
                    }
                    var addWeight = attached.isLexicalizedWord ? AdditionalDictionaryWeight : AdditionalWeight;
                    double currentWeight;
                    weightsOfAttached.TryGetValue(attached.theme, out currentWeight); // stays 0 when the theme is new for the word
                    weightsOfAttached[attached.theme] = currentWeight + addWeight;

                    List<string> formsOfAttached;
                    if (!_stemOfWord.TryGetValue(attached.name, out formsOfAttached))
                    {
                        formsOfAttached = new List<string>();
                        _stemOfWord.Add(attached.name, formsOfAttached);
                    }
                    var attachedWordId = attached.id;
                    var allolog = db.Allolog.FirstOrDefault(a => a.Word_id == attachedWordId);
                    var allologName = allolog.With(a => a.Allolog_name, attached.name);
                    if (!formsOfAttached.Contains(allologName))
                        formsOfAttached.Add(allologName);
                }
            }

            // The intermediate tables are reset here, so they hold no data after training.
            _wordsCountInGroups = new Dictionary<string, Dictionary<string, Int64>>();
            _summaryWordsCountInGroup = new Dictionary<string, Int64>();
            _summaryWordsCountInGroupDistinct = new Dictionary<string, Int64>();
            _wordsWeigthInGroups = new Dictionary<string, Dictionary<string, double>>();
            _wordsWeigthsInGroupsIncludingOtherGroupsRemoved = new Dictionary<string, Dictionary<string, double>>();

            // NOTE(review): forcing a collection is normally discouraged; kept so the memory
            // behaviour after training stays exactly as before.
            GC.Collect();
            if (callback != null)
                callback();
        }
示例#3
0
        /// <summary>
        /// Exports the results of the current training to the database (weights are not stored) so
        /// that the words can later be grouped manually.
        /// </summary>
        /// <remarks>
        /// Missing themes, words and word forms (allologs) are inserted. For every theme, each
        /// trained word that is not yet attached to the theme in any table (WordCandidatesInTheme,
        /// WordInTheme, WordNotInTheme) is added as a visible candidate.
        /// </remarks>
        /// <param name="themeGroupName">Name of an existing theme group that receives the export.</param>
        public void ExportToDataBase(string themeGroupName)
        {
            using (var db = new TextClassificatorEntities())
            {
                _themeGroupId = db.ThemeGroup.Single(tg => tg.ThemeGroup_name == themeGroupName).ThemeGroup_id;
                var themeGroupId = _themeGroupId;

                // Themes of the trained model the group does not contain yet.
                var storedThemeNames = db.Theme.Where(th => th.ThemeGroup_id == themeGroupId).Select(th => th.Theme_name);
                foreach (var missingTheme in Groups.Except(storedThemeNames).ToList())
                    db.Theme.AddObject(new Theme { ThemeGroup_id = themeGroupId, Theme_name = missingTheme, Theme_caption = missingTheme, System_InsDT = DateTime.Now });
                db.SaveChanges();

                // Trained words the word table does not contain yet.
                var storedWordNames = db.Word.Select(w => w.Word_name);
                foreach (var missingWord in _wordsWeigthsInGroupsIncludingOtherGroups.Keys.Except(storedWordNames).ToList())
                    db.Word.AddObject(new Word { Word_name = missingWord, System_InsDT = DateTime.Now });
                db.SaveChanges();

                // Word forms (allologs) seen during training that are not stored for the word yet.
                foreach (var trainedWord in _wordsWeigthsInGroupsIncludingOtherGroups.Keys)
                {
                    var forms = _stemOfWord[trainedWord];
                    var trainedWordId = db.Word.Single(w => w.Word_name == trainedWord).Word_id;
                    var storedForms = db.Allolog.Where(a => a.Word_id == trainedWordId).Select(a => a.Allolog_name);
                    var missingForms = forms.Except(storedForms).ToList();
                    foreach (var missingForm in missingForms)
                        db.Allolog.AddObject(new Allolog { Allolog_name = missingForm, Word_id = trainedWordId, System_InsDT = DateTime.Now });
                    if (missingForms.Count > 0)
                        db.SaveChanges();
                }

                foreach (var theme in Groups)
                {
                    var themeId = db.Theme.Single(th => th.Theme_name == theme && th.ThemeGroup_id == themeGroupId).Theme_id;

                    // Words already attached to the theme in any role: candidate, included or excluded.
                    var attachedWords = new HashSet<string>();
                    attachedWords.UnionWith
                        (
                            from link in db.WordCandidatesInTheme
                            join w in db.Word on link.Word_id equals w.Word_id
                            where link.Theme_id == themeId
                            select w.Word_name
                        );
                    attachedWords.UnionWith
                        (
                            from link in db.WordInTheme
                            join w in db.Word on link.Word_id equals w.Word_id
                            where link.Theme_id == themeId
                            select w.Word_name
                        );
                    attachedWords.UnionWith
                        (
                            from link in db.WordNotInTheme
                            join w in db.Word on link.Word_id equals w.Word_id
                            where link.Theme_id == themeId
                            select w.Word_name
                        );

                    // Trained words carrying a weight for this theme that are not attached to it yet.
                    var newCandidates = _wordsWeigthsInGroupsIncludingOtherGroups
                        .Where(entry => entry.Value.ContainsKey(theme))
                        .Select(entry => entry.Key)
                        .Where(name => !attachedWords.Contains(name))
                        .ToList();
                    foreach (var candidateName in newCandidates)
                    {
                        var candidateWordId = db.Word.Single(w => w.Word_name == candidateName).Word_id;
                        db.WordCandidatesInTheme.AddObject(new WordCandidatesInTheme { Word_id = candidateWordId, Theme_id = themeId, System_InsDT = DateTime.Now, WordCandidatesInTheme_Visible = true });
                    }
                    db.SaveChanges();
                }
            }
        }
示例#4
0
 /// <summary>
 /// Looks up the identifier of the theme group with the given name.
 /// </summary>
 /// <param name="themeGroupName">Name of the theme group to find.</param>
 /// <returns>The <c>ThemeGroup_id</c> of the single matching theme group.</returns>
 /// <exception cref="InvalidOperationException">No theme group, or more than one, has that name.</exception>
 public Int64 GetGroupThemeIdByGroupThemeName(string themeGroupName)
 {
     using (var context = new TextClassificatorEntities())
     {
         var matchingGroup = context.ThemeGroup
             .Where(themeGroup => themeGroup.ThemeGroup_name == themeGroupName)
             .Single();
         return matchingGroup.ThemeGroup_id;
     }
 }
示例#5
0
        /// <summary>
        /// Creates a trained classifier instance; all values are taken from the defaults.
        /// </summary>
        /// <remarks>
        /// Separators and stop words are read from the resources named by
        /// <c>SeparatorsFileName</c> and <c>StopWordsFileName</c>. Every ru-RU resource whose key
        /// starts with '_' is used as the training text of the theme named by the key without its
        /// leading underscores. Training is run for the theme group whose id is 1.
        /// </remarks>
        /// <returns>The trained classifier.</returns>
        /// <exception cref="InvalidOperationException">A required string resource is missing.</exception>
        public static TextClassificator CreateLearnedInstance()
        {
            var separators = ReadResourceLines(TextClassificator.SeparatorsFileName);
            var stopWords = ReadResourceLines(TextClassificator.StopWordsFileName);

            var instance = new TextClassificator(separators, stopWords);
            var themes = new Dictionary<string, string>();
            var resources = Resource.ResourceManager.GetResourceSet(new System.Globalization.CultureInfo("ru-RU"), true, true);
            foreach (System.Collections.DictionaryEntry file in resources)
            {
                // Only resources whose key starts with an underscore are training texts.
                var key = file.Key.ToString();
                if (!key.StartsWith("_", StringComparison.Ordinal))
                    continue;

                var text = file.Value.ToString();
                text = text.ToLower();
                text = DeleteWords(text, separators, false);
                text = DeleteWords(text, stopWords);
                text = DeleteDoubleSpaceAndBadSymbols(text);
                themes.Add(key.TrimStart('_'), text);
            }
            using (var db = new TextClassificatorEntities())
            {
                instance.Learn(db.ThemeGroup.Single(tg => tg.ThemeGroup_id == 1).ThemeGroup_name, themes);
            }

            return instance;
        }

        /// <summary>Reads a string resource and splits it into its non-empty lines.</summary>
        /// <param name="resourceName">Name of the string resource to read.</param>
        /// <returns>The lines of the resource, without line-break characters.</returns>
        private static List<string> ReadResourceLines(string resourceName)
        {
            var content = (string)Resource.ResourceManager.GetObject(resourceName);
            if (content == null)
                throw new InvalidOperationException("String resource '" + resourceName + "' was not found.");

            // Split on both CR and LF: splitting on '\n' alone leaves a trailing '\r' on every
            // entry of a file saved with Windows line endings.
            return content.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries).ToList();
        }