예제 #1
0
 /// <summary>
 /// Creates the viewer and fills its text blocks from the supplied document.
 /// </summary>
 /// <param name="documnet">
 /// Document whose title, original text, post-processed text and keyword counts are shown.
 /// </param>
 public DocumentViewer(Document documnet)
 {
     InitializeComponent();
     v_TextBlock_Title.Text = documnet.Title;
     v_TextBlock_OriginalText.Text = documnet.OriginalText;
     v_TextBlock_ProcessedText.Text = documnet.PostProcessedText;

     // Show only keywords that occur at least once, most frequent first.
     var presentKeywords = documnet.BagOfWords.Where(s => s.Value > 0)
                                              .OrderByDescending(s => s.Value)
                                              .Select(s => s.Key);

     // string.Join places the separator between items only, so the list no longer
     // ends with a dangling ", " as it did when each key was concatenated with one.
     v_TextBlock_Keywords.Text = string.Join(", ", presentKeywords);
 }
예제 #2
0
        /// <summary>
        /// Derives all search models from <c>_rawDocuments</c> and <c>_rawKeywords</c>:
        /// per-document bag of words and TF, per-keyword IDF, per-document TF/IDF vector
        /// length, the word correlation matrix, the next-word counters and the reduced
        /// LSI matrices (<c>K_s</c>, <c>S_s</c>, <c>D_s</c>, <c>D_sT</c>, <c>LsiMatrix</c>).
        /// </summary>
        private void RecalculateDocuments()
        {
            // NOTE(review): this method only appends to _documents, allWords and
            // _nextWordCounter; it never clears them. Presumably the caller resets them
            // before a recalculation - confirm.

            // Calculating TF/IDF
            #region TF IDF
            foreach (var doc in _rawDocuments)
            {
                // First line is the title, the remaining lines form the body.
                var lines = Regex.Split(doc, "\r\n");

                var docRep = new Document();
                docRep.Title = lines.FirstOrDefault();
                docRep.OriginalText = string.Concat(lines.Skip(1));

                // Lower-case, strip everything that is neither a word character nor
                // whitespace, then stem every token and re-join them with single spaces.
                docRep.PostProcessedText = string.Concat(Regex.Replace(doc.ToLower(), @"[^\w\s]", "").Split(null).Select(s => _ps.stemTerm(s) + " "));

                // Tokenize once per document instead of once per keyword.
                var processedWords = docRep.PostProcessedText.Split(null);

                docRep.BagOfWords = new Dictionary<string, int>();
                foreach (var keyword in _rawKeywords)
                {
                    // Stem the keyword exactly once: the document tokens were stemmed once,
                    // so comparing against a twice-stemmed keyword (as before) was both
                    // redundant work per token and not a like-for-like comparison.
                    var stemmedKeyword = _ps.stemTerm(keyword);
                    docRep.BagOfWords[keyword] = processedWords.Count(s => s.Equals(stemmedKeyword));
                }

                // Term frequency normalized by the most frequent keyword of this document.
                docRep.TF = new Dictionary<string, double>();
                var tfMax = docRep.BagOfWords.Max(w => w.Value);
                foreach (var word in docRep.BagOfWords)
                {
                    // No keyword occurs in the document: avoid dividing by zero.
                    docRep.TF[word.Key] = tfMax <= 0 ? 0 : (double)word.Value / tfMax;
                }

                _documents.Add(docRep);
            }

            // Calculating IDFs: log10(total documents / documents containing the keyword),
            // or 0 when no document contains it.
            var documentCount = (double)_documents.Count;
            foreach (var keyword in _rawKeywords)
            {
                var contains = _documents.Count(d =>
                {
                    int occurrences;
                    return d.BagOfWords.TryGetValue(keyword, out occurrences) && occurrences > 0;
                });
                IDF[keyword] = contains > 0 ? Math.Log10(documentCount / contains) : 0;
            }

            // Calculating vector length of every document's TF*IDF vector.
            foreach (var docRep in _documents)
            {
                docRep.TFIDFVectorValue = DistanceN(docRep.TF.Select(tf => IDF[tf.Key] * tf.Value));
            }
            #endregion

            // Words correlation
            #region Correlation
            // Collect the vocabulary (distinct words in first-seen order).
            foreach (var doc in _rawDocuments)
            {
                foreach (var word in GetWords(doc))
                {
                    if (!allWords.Contains(word))
                    {
                        allWords.Add(word);
                    }
                }
            }

            // docsWordsArray[i, j] = number of occurrences of word j in document i.
            double[,] docsWordsArray = new double[_rawDocuments.Count, allWords.Count];
            for (int i = 0; i < _rawDocuments.Count; i++)
            {
                var wordsInDoc = GetWords(_rawDocuments[i]);
                for (int j = 0; j < allWords.Count; j++)
                {
                    var vocabularyWord = allWords[j];
                    docsWordsArray[i, j] = wordsInDoc.Count(w => w.Equals(vocabularyWord));
                }
            }

            // (words x docs) * (docs x words) gives a words x words co-occurrence matrix,
            // whose rows are then normalized with the 1-norm.
            var docsWordsMatrix = Matrix<double>.Build.DenseOfArray(docsWordsArray);
            wordsCorrelationMatrix = docsWordsMatrix.Transpose().Multiply(docsWordsMatrix).NormalizeRows(1.0);
            #endregion

            // Next word counting
            #region Next Word Counting

            for (int i = 0; i < _rawDocuments.Count; i++)
            {
                var words = GetWords(_rawDocuments[i]);
                for (int j = 0; j < words.Length; j++)
                {
                    var current = words[j];

                    // First time this word is seen: register it, even if it has no successor.
                    if (!_nextWordCounter.ContainsKey(current))
                    {
                        _nextWordCounter[current] = new Dictionary<string, int>();
                    }

                    // The last word of a document has no following word to count.
                    if (j + 1 >= words.Length)
                    {
                        continue;
                    }

                    var followers = _nextWordCounter[current];
                    var next = words[j + 1];
                    if (followers.ContainsKey(next))
                    {
                        // Successor already known - increment its counter.
                        followers[next]++;
                    }
                    else
                    {
                        // Successor not in the dictionary yet - start counting at 1.
                        followers[next] = 1;
                    }
                }
            }

            #endregion

            // LSI calculating
            #region LSI
            // Terms x documents matrix filled with raw keyword counts (bag of words).
            // TF[keyword] * IDF[keyword] would be the alternative weighting; it is not used.
            double[,] termsDocsArray = new double[_rawKeywords.Count, _documents.Count];
            for (int i = 0; i < _rawKeywords.Count; i++)
            {
                for (int j = 0; j < _documents.Count; j++)
                {
                    termsDocsArray[i, j] = _documents[j].BagOfWords[_rawKeywords[i]];
                }
            }

            Matrix<double> termsDocsMatrix = Matrix<double>.Build.DenseOfArray(termsDocsArray);

            var svd = termsDocsMatrix.Svd();
            var K = svd.U;
            var S = Matrix<double>.Build.DenseOfDiagonalArray(svd.S.ToArray());
            // Keep only as many rows of VT as there are singular values.
            var DT = svd.VT.SubMatrix(0, S.RowCount, 0, svd.VT.ColumnCount);
            var D = DT.Transpose();

            // Drop the last reductionCount singular values and the matching columns of K
            // and rows of DT so that K_s * S_s * D_sT stays conformable.
            // NOTE(review): reductionCount is not range-checked here; SubMatrix is expected
            // to throw if it is not smaller than S.RowCount - confirm the caller guards this.
            S_s = S.SubMatrix(0, S.RowCount - reductionCount, 0, S.ColumnCount - reductionCount);
            K_s = K.SubMatrix(0, K.RowCount, 0, K.ColumnCount - Math.Abs(K.ColumnCount - S_s.ColumnCount));
            D_sT = DT.SubMatrix(0, DT.RowCount - reductionCount, 0, DT.ColumnCount);
            D_s = D_sT.Transpose();

            LsiMatrix = K_s * S_s * D_sT;
            #endregion
        }