예제 #1
0
        /// <summary>
        /// Lets the user pick one or more word-list files and loads the words of each file
        /// as lexicalized words of a theme named after that file.
        /// </summary>
        /// <remarks>
        /// Every file is read line by line using <see cref="Encoding.Default"/>. Each line is
        /// normalized via <c>_tc.NormilizeText</c> and trimmed; lines that still contain a space
        /// are skipped, the rest are stemmed and collected. The collected stems are passed to
        /// <c>_tc.LoadLexicalizedWords</c> together with the current text of
        /// <c>comboBoxThemeGroup</c> and the file name without its extension.
        /// </remarks>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event arguments (not used).</param>
        private void buttonLoadLexicalizedWords_Click(object sender, EventArgs e)
        {
            var stemmer = new Stemming();

            // The dialog is IDisposable; dispose it deterministically.
            using (var ofd = new OpenFileDialog { Multiselect = true })
            {
                if (ofd.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                foreach (var filename in ofd.FileNames)
                {
                    var theme = Path.GetFileNameWithoutExtension(filename);
                    var words = new List<string>();

                    // 'using' guarantees the reader is closed even if normalization or
                    // stemming throws part-way through the file.
                    using (var sr = new StreamReader(filename, Encoding.Default))
                    {
                        while (!sr.EndOfStream)
                        {
                            var word = _tc.NormilizeText(sr.ReadLine()).Trim();

                            // Only single-token entries are accepted.
                            if (word.IndexOf(' ') >= 0)
                            {
                                continue;
                            }

                            words.Add(stemmer.Stem(word));
                        }
                    }

                    // The file handle is already released at this point.
                    _tc.LoadLexicalizedWords(comboBoxThemeGroup.Text, theme, words);
                }
            }
        }
예제 #2
0
        /// <summary>Training function (prepares the data used later to classify texts into groups).
        /// </summary>
        /// <remarks>
        /// Resets all state of the previous training, counts stemmed words per group, converts the
        /// counts to weights via <c>Weight</c> and <c>WeightIncludedOtherGroup</c>, and then applies
        /// the per-word exclusions (<c>WordNotInTheme</c>) and additions (<c>WordInTheme</c>) stored
        /// in the database for the given theme group.
        /// </remarks>
        /// <param name="themeGroupName">Name of the theme group; exactly one <c>ThemeGroup</c> row
        /// with this name must exist, because it is looked up with <c>Single</c>.</param>
        /// <param name="groupsTexts">Training data as pairs "group name → text belonging to the group".</param>
        /// <param name="callback">Optional action invoked once training has finished.</param>
        public void Learn(string themeGroupName, Dictionary<string, string> groupsTexts, Action callback = null)
        {
            // reset of the previous training
            _groupsTexts = new Dictionary<string, string>();
            foreach (var g in groupsTexts)
                _groupsTexts.Add(g.Key, NormilizeText(g.Value));
            _wordsCountInGroups = new Dictionary<string, Dictionary<string, Int64>>();
            _summaryWordsCountInGroup = new Dictionary<string, Int64>();
            _summaryWordsCountInGroupDistinct = new Dictionary<string, Int64>();
            _wordsWeigthInGroups = new Dictionary<string, Dictionary<string, double>>();
            _wordsWeigthsInGroupsIncludingOtherGroups = new Dictionary<string, Dictionary<string, double>>();
            _wordsWeigthsInGroupsIncludingOtherGroupsRemoved = new Dictionary<string, Dictionary<string, double>>();
            _stemOfWord = new Dictionary<string, List<string>>();
            _themesOfWord = new Dictionary<string, List<string>>();
            // end of the reset of the previous training
            var stemming = new Stemming();					// helper that produces the stem of a word
            using (var db = new TextClassificatorEntities())
            {
                _themeGroupId = db.ThemeGroup.Single(tg => tg.ThemeGroup_name == themeGroupName).ThemeGroup_id;

                // Pass 1: per group, stem every word and count the occurrences of each stem.
                foreach (var text in _groupsTexts)
                {
                    var words = text.Value.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                    var wordsStem = new List<string>();
                    foreach (var w in words)
                    {
                        var stem = stemming.Stem(w);
                        // Remember every original word form seen for a stem.
                        if (!_stemOfWord.ContainsKey(stem))
                            _stemOfWord.Add(stem, new List<string>());
                        wordsStem.Add(stem);
                        if (!_stemOfWord[stem].Contains(w))
                            _stemOfWord[stem].Add(w);
                    }
                    // Drop blank/too-short stems and stop words.
                    wordsStem = wordsStem.Where(w => !string.IsNullOrWhiteSpace(w) && w.Length >= MinimumWordLength).ToList();
                    wordsStem.RemoveAll(w => StopWords.Any(sw => sw == w));
                    var groupWords = wordsStem.GroupBy(w => w);
                    // Themes are recorded for every remaining stem, before the MinimumWordCount filter below.
                    foreach (var w in groupWords)
                    {
                        if (!_themesOfWord.ContainsKey(w.Key))
                            _themesOfWord.Add(w.Key, new List<string>());
                        _themesOfWord[w.Key].Add(text.Key);
                    }
                    // Only stems occurring at least MinimumWordCount times take part in the statistics.
                    var dictionaryWords = (from gw in groupWords
                                           where gw.Count() >= MinimumWordCount
                                           select gw).ToDictionary(gw => gw.Key, gw => gw.LongCount());
                    _summaryWordsCountInGroup.Add(text.Key, dictionaryWords.Sum(dw => dw.Value));
                    _summaryWordsCountInGroupDistinct.Add(text.Key, dictionaryWords.LongCount());
                    _wordsCountInGroups.Add(text.Key, dictionaryWords);
                }

                // Pass 2: turn the counts into per-group word weights.
                foreach (var g in _wordsCountInGroups)
                {
                    var temp = new Dictionary<string, double>();
                    foreach (var w in g.Value)
                    {
                        temp.Add(w.Key, Weight(w.Value, _summaryWordsCountInGroup[g.Key], _summaryWordsCountInGroupDistinct[g.Key]));
                    }
                    _wordsWeigthInGroups.Add(g.Key, temp);
                }

                // For simplicity the data is regrouped: from here on it is more convenient to key by word than by group.
                var wordsWeigthInGroupsTranspose = new Dictionary<string, Dictionary<string, double>>(); // word -> (group name -> weight of the word in the group)
                foreach (var wg in _wordsWeigthInGroups)
                {
                    foreach (var w in wg.Value)
                    {
                        if (!wordsWeigthInGroupsTranspose.ContainsKey(w.Key))
                            wordsWeigthInGroupsTranspose.Add(w.Key, new Dictionary<string, double>());
                        wordsWeigthInGroupsTranspose[w.Key].Add(wg.Key, w.Value);
                    }
                }

                foreach (var w in wordsWeigthInGroupsTranspose)
                    _wordsWeigthsInGroupsIncludingOtherGroups.Add(w.Key, WeightIncludedOtherGroup(w.Value, _groupsTexts.LongCount()));
                // Words left without any group are discarded.
                _wordsWeigthsInGroupsIncludingOtherGroups = _wordsWeigthsInGroupsIncludingOtherGroups.Where(w => w.Value.Count > 0).ToDictionary(w => w.Key, w => w.Value);

                // removal of the groups a word must not belong to (WordNotInTheme)
                foreach (var w in _wordsWeigthsInGroupsIncludingOtherGroups)
                {
                    var delGroups =
                        (
                            from word in db.Word
                            join wit in db.WordNotInTheme on word.Word_id equals wit.Word_id
                            join g in db.Theme on wit.Theme_id equals g.Theme_id
                            join gg in db.ThemeGroup on g.ThemeGroup_id equals gg.ThemeGroup_id
                            where gg.ThemeGroup_id == _themeGroupId
                            where word.Word_name == w.Key
                            select g.Theme_name
                        ).ToList();
                    foreach (var g in delGroups)
                        if (w.Value.ContainsKey(g))
                        {
                            // Keep the removed weight so it stays available in the "Removed" map.
                            if (!_wordsWeigthsInGroupsIncludingOtherGroupsRemoved.ContainsKey(w.Key))
                                _wordsWeigthsInGroupsIncludingOtherGroupsRemoved.Add(w.Key, new Dictionary<string, double>());
                            _wordsWeigthsInGroupsIncludingOtherGroupsRemoved[w.Key].Add(g, w.Value[g]);
                            // Only the inner dictionary is modified; the outer one being enumerated is untouched.
                            w.Value.Remove(g);
                        }
                }

                // Words explicitly attached to themes in the database (WordInTheme).
                var addWordGroup =
                    (
                        from w in db.Word
                        join wig in db.WordInTheme on w.Word_id equals wig.Word_id
                        join g in db.Theme on wig.Theme_id equals g.Theme_id
                        join gg in db.ThemeGroup on g.ThemeGroup_id equals gg.ThemeGroup_id
                        where gg.ThemeGroup_id == _themeGroupId
                        select new { id = w.Word_id, name = w.Word_name, theme = g.Theme_name, isLexicalizedWord = wig.WordInTheme_isLexicalizedWord }
                    );

                foreach (var wg in addWordGroup)
                {
                    if (!_themesOfWord.ContainsKey(wg.name))
                        _themesOfWord.Add(wg.name, new List<string>());
                    if (!_themesOfWord[wg.name].Contains(wg.theme))
                        _themesOfWord[wg.name].Add(wg.theme);
                    // A theme known only from the database gets an empty training text.
                    if (!_groupsTexts.ContainsKey(wg.theme))
                        _groupsTexts.Add(wg.theme, string.Empty);
                    if (!_wordsWeigthsInGroupsIncludingOtherGroups.ContainsKey(wg.name))
                        _wordsWeigthsInGroupsIncludingOtherGroups.Add(wg.name, new Dictionary<string, double>());
                    // Lexicalized words receive AdditionalDictionaryWeight, other attached words AdditionalWeight.
                    var addWeight = wg.isLexicalizedWord ? AdditionalDictionaryWeight : AdditionalWeight;
                    if (!_wordsWeigthsInGroupsIncludingOtherGroups[wg.name].ContainsKey(wg.theme))
                        _wordsWeigthsInGroupsIncludingOtherGroups[wg.name].Add(wg.theme, 0);
                    _wordsWeigthsInGroupsIncludingOtherGroups[wg.name][wg.theme] += addWeight;

                    if (!_stemOfWord.ContainsKey(wg.name))
                        _stemOfWord.Add(wg.name, new List<string>());
                    // Use the allolog name when one exists for the word; otherwise fall back to the word name.
                    var allolog = db.Allolog.FirstOrDefault(a => a.Word_id == wg.id);
                    var allolog_name = allolog.With(a => a.Allolog_name, wg.name);
                    if (!_stemOfWord[wg.name].Contains(allolog_name))
                        _stemOfWord[wg.name].Add(allolog_name);
                }
            }

            // The intermediate statistics are dropped again here by replacing them with empty dictionaries.
            _wordsCountInGroups = new Dictionary<string, Dictionary<string, Int64>>();
            _summaryWordsCountInGroup = new Dictionary<string, Int64>();
            _summaryWordsCountInGroupDistinct = new Dictionary<string, Int64>();
            _wordsWeigthInGroups = new Dictionary<string, Dictionary<string, double>>();
            _wordsWeigthsInGroupsIncludingOtherGroupsRemoved = new Dictionary<string, Dictionary<string, double>>();

            // NOTE(review): forcing a collection is generally discouraged; kept to preserve existing behaviour.
            GC.Collect();
            if (callback != null)
                callback();
        }
예제 #3
0
 /// <summary>
 /// Opens a multi-select file dialog and, for every chosen file, loads its words as
 /// lexicalized words of the theme named after the file (file name without extension).
 /// </summary>
 /// <remarks>
 /// Lines are read with <see cref="Encoding.Default"/>, normalized through
 /// <c>_tc.NormilizeText</c> and trimmed. Lines that still contain a space are ignored;
 /// all others are stemmed and passed to <c>_tc.LoadLexicalizedWords</c> together with
 /// the current text of <c>comboBoxThemeGroup</c>.
 /// </remarks>
 /// <param name="sender">Event source (not used).</param>
 /// <param name="e">Event arguments (not used).</param>
 private void buttonLoadLexicalizedWords_Click(object sender, EventArgs e)
 {
     var stemmer = new Stemming();

     // Dispose the dialog deterministically once the selection has been processed.
     using (var ofd = new OpenFileDialog { Multiselect = true })
     {
         if (ofd.ShowDialog() != DialogResult.OK)
             return;

         foreach (var filename in ofd.FileNames)
         {
             var theme = Path.GetFileNameWithoutExtension(filename);
             var words = new List<string>();

             // The reader is released even when an exception interrupts the read loop.
             using (var sr = new StreamReader(filename, Encoding.Default))
             {
                 while (!sr.EndOfStream)
                 {
                     var word = _tc.NormilizeText(sr.ReadLine()).Trim();

                     // Multi-word entries are not accepted.
                     if (word.IndexOf(' ') >= 0)
                         continue;

                     words.Add(stemmer.Stem(word));
                 }
             }

             // Called after the file has been closed.
             _tc.LoadLexicalizedWords(comboBoxThemeGroup.Text, theme, words);
         }
     }
 }
예제 #4
0
        /// <summary>Text classification function.
        /// </summary>
        /// <remarks>
        /// The text is normalized, split on spaces and stemmed. For every stem found in
        /// <c>_wordsWeigthsInGroupsIncludingOtherGroups</c> the stored weight is added to the score of
        /// each group listed for that stem. As a side effect the per-group word weights and word
        /// occurrence counts of this call are stored in <c>_lastStatisticsWeight</c> and
        /// <c>_lastStatisticsCount</c>.
        /// </remarks>
        /// <param name="text">Text that has to be classified.</param>
        /// <returns>Pairs "group name → degree to which the text belongs to the group", added in
        /// descending order of score (groups with equal scores keep their relative order);
        /// <c>null</c> when <c>_wordsWeigthsInGroupsIncludingOtherGroups</c> is not initialised.</returns>
        public Dictionary<string, double> Check(string text)
        {
            if (_wordsWeigthsInGroupsIncludingOtherGroups == null)
                return null;
            var lastStatisticsWeightGlobal = new Dictionary<string, Dictionary<string, double>>();
            var lastStatisticsCountGlobal = new Dictionary<string, Dictionary<string, Int64>>();
            var answer = new Dictionary<string, double>();
            text = text.Replace(Environment.NewLine, " ");
            text = NormilizeText(text);

            var stemming = new Stemming();					// helper that produces the stem of a word

            var words = text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).Select(w => stemming.Stem(w));
            words = words.Where(w => !string.IsNullOrWhiteSpace(w) && w.Length >= MinimumWordLength);

            // Every known group starts with a score of zero.
            foreach (var group in _groupsTexts)
                answer.Add(group.Key, 0);

            foreach (var word in words)
            {
                Dictionary<string, double> groupWeights;
                if (!_wordsWeigthsInGroupsIncludingOtherGroups.TryGetValue(word, out groupWeights))
                    continue;

                foreach (var group in groupWeights)
                {
                    Dictionary<string, double> weightsOfGroup;
                    Dictionary<string, Int64> countsOfGroup;
                    if (!lastStatisticsWeightGlobal.TryGetValue(group.Key, out weightsOfGroup))
                    {
                        // Both statistics maps are always created together for a group.
                        weightsOfGroup = new Dictionary<string, double>();
                        countsOfGroup = new Dictionary<string, Int64>();
                        lastStatisticsWeightGlobal.Add(group.Key, weightsOfGroup);
                        lastStatisticsCountGlobal.Add(group.Key, countsOfGroup);
                    }
                    else
                    {
                        countsOfGroup = lastStatisticsCountGlobal[group.Key];
                    }

                    if (!weightsOfGroup.ContainsKey(word))
                    {
                        weightsOfGroup.Add(word, group.Value);
                        countsOfGroup.Add(word, 0);
                    }
                    // The weight is added once per occurrence of the word in the text.
                    answer[group.Key] += group.Value;
                    countsOfGroup[word]++;
                }
            }

            // Descending order by score. OrderByDescending is a stable sort, so groups with equal
            // scores keep the order in which they were added to 'answer'. Unlike a rescan that
            // matches scores with '==', this also keeps a group whose score is NaN (it sorts last).
            // NOTE(review): callers rely on Dictionary enumerating in insertion order, which holds
            // as long as nothing is removed but is not a documented guarantee.
            var orderAnswer = new Dictionary<string, double>();
            foreach (var answerElement in answer.OrderByDescending(a => a.Value))
                orderAnswer.Add(answerElement.Key, answerElement.Value);

            var lastStatisticsWeight = new Dictionary<string, Dictionary<string, double>>();
            var lastStatisticsCount = new Dictionary<string, Dictionary<string, Int64>>();

            // Publish the statistics in the same order as the answer; groups without any hit are omitted.
            foreach (var ans in orderAnswer)
            {
                Dictionary<string, double> weightsOfGroup;
                if (lastStatisticsWeightGlobal.TryGetValue(ans.Key, out weightsOfGroup))
                {
                    lastStatisticsWeight.Add(ans.Key, weightsOfGroup);
                    lastStatisticsCount.Add(ans.Key, lastStatisticsCountGlobal[ans.Key]);
                }
            }

            _lastStatisticsWeight = lastStatisticsWeight;
            _lastStatisticsCount = lastStatisticsCount;

            return orderAnswer;
        }