/// <summary>
/// Initializes the viewer and fills its text blocks from the given document:
/// title, original text, post-processed text and the list of matched keywords.
/// </summary>
/// <param name="documnet">
/// Document to display. Its <c>BagOfWords</c> maps each keyword to its occurrence count.
/// </param>
public DocumentViewer(Document documnet)
{
    InitializeComponent();

    v_TextBlock_Title.Text = documnet.Title;
    v_TextBlock_OriginalText.Text = documnet.OriginalText;
    v_TextBlock_ProcessedText.Text = documnet.PostProcessedText;

    // Only keywords that occur at least once are shown, most frequent first.
    var keywords = documnet.BagOfWords
        .Where(entry => entry.Value > 0)
        .OrderByDescending(entry => entry.Value)
        .Select(entry => entry.Key);

    // string.Join places the separator between items only, so the displayed
    // list does not end with a dangling ", ".
    v_TextBlock_Keywords.Text = string.Join(", ", keywords);
}
/// <summary>
/// Derives all retrieval structures from <c>_rawDocuments</c> and <c>_rawKeywords</c>:
/// per-document keyword counts and TF values, the IDF table and TF/IDF vector lengths,
/// the word correlation matrix, the next-word counters and the reduced LSI matrices.
/// </summary>
/// <remarks>
/// NOTE(review): this method only appends to <c>_documents</c>, <c>allWords</c> and
/// <c>_nextWordCounter</c>; it never clears them. Confirm that callers reset those
/// collections before calling it a second time.
/// </remarks>
private void RecalculateDocuments()
{
    #region TF IDF
    // Build one Document per raw text.
    foreach (var doc in _rawDocuments)
    {
        var lines = Regex.Split(doc, "\r\n");

        var docRep = new Document();
        // The first line is the title; the remaining lines are concatenated without separators.
        docRep.Title = lines.FirstOrDefault();
        docRep.OriginalText = string.Concat(lines.Skip(1));
        // Lower-case, drop everything that is neither a word character nor whitespace,
        // then stem every token and re-join the tokens with single spaces.
        docRep.PostProcessedText = string.Concat(
            Regex.Replace(doc.ToLower(), @"[^\w\s]", "")
                .Split(null)
                .Select(s => _ps.stemTerm(s) + " "));

        // Tokenise the processed text once instead of once per keyword.
        var stemmedTokens = docRep.PostProcessedText.Split(null);

        docRep.BagOfWords = new Dictionary<string, int>();
        foreach (var keyword in _rawKeywords)
        {
            // Document tokens were stemmed exactly once, so the keyword is stemmed
            // exactly once as well before the two are compared.
            var stemmedKeyword = _ps.stemTerm(keyword);
            docRep.BagOfWords[keyword] = stemmedTokens.Count(token => token.Equals(stemmedKeyword));
        }

        // Term frequency normalised by the most frequent keyword in this document.
        docRep.TF = new Dictionary<string, double>();
        var tfMax = docRep.BagOfWords.Max(w => w.Value);
        foreach (var word in docRep.BagOfWords)
        {
            docRep.TF[word.Key] = tfMax <= 0 ? 0 : (double)word.Value / tfMax;
        }

        _documents.Add(docRep);
    }

    // Inverse document frequency: log10(documents / documents containing the keyword),
    // or 0 when no document contains the keyword.
    var documentCount = (double)_documents.Count;
    foreach (var keyword in _rawKeywords)
    {
        var contains = _documents.Count(d => d.BagOfWords.Any(w => w.Key.Equals(keyword) && w.Value > 0));
        IDF[keyword] = contains > 0 ? Math.Log10(documentCount / contains) : 0;
    }

    // Length of each document's TF/IDF vector, as computed by DistanceN.
    foreach (var docRep in _documents)
    {
        docRep.TFIDFVectorValue = DistanceN(docRep.TF.Select(tf => IDF[tf.Key] * tf.Value));
    }
    #endregion

    #region Correlation
    // Collect the vocabulary: every distinct word, in order of first appearance.
    foreach (var doc in _rawDocuments)
    {
        foreach (var word in GetWords(doc))
        {
            if (!allWords.Contains(word))
            {
                allWords.Add(word);
            }
        }
    }

    // docsWordsArray[i, j] = number of occurrences of vocabulary word j in document i.
    double[,] docsWordsArray = new double[_rawDocuments.Count, allWords.Count];
    for (int i = 0; i < _rawDocuments.Count; i++)
    {
        // Count each word of the document once, then look the counts up per vocabulary
        // entry, rather than rescanning the whole document for every vocabulary entry.
        var frequencies = new Dictionary<string, int>();
        foreach (var word in GetWords(_rawDocuments[i]))
        {
            int seen;
            frequencies.TryGetValue(word, out seen); // seen stays 0 for a new word
            frequencies[word] = seen + 1;
        }

        for (int j = 0; j < allWords.Count; j++)
        {
            int occurrences;
            frequencies.TryGetValue(allWords[j], out occurrences); // 0 when the word is absent
            docsWordsArray[i, j] = occurrences;
        }
    }

    var docsWordsMatrix = Matrix<double>.Build.DenseOfArray(docsWordsArray);
    // (words x docs) * (docs x words) gives a words x words co-occurrence matrix,
    // whose rows are then normalised with the 1-norm.
    wordsCorrelationMatrix = docsWordsMatrix.Transpose().Multiply(docsWordsMatrix).NormalizeRows(1.0);
    #endregion

    #region Next Word Counting
    // _nextWordCounter[word][next] = how many times "next" directly follows "word".
    for (int i = 0; i < _rawDocuments.Count; i++)
    {
        var words = GetWords(_rawDocuments[i]);
        for (int j = 0; j < words.Length; j++)
        {
            var current = words[j];

            // Every word gets an entry, even the last word of a document, which has no follower.
            if (!_nextWordCounter.ContainsKey(current))
            {
                _nextWordCounter[current] = new Dictionary<string, int>();
            }

            if (j + 1 >= words.Length)
            {
                continue; // no next word to count
            }

            var followers = _nextWordCounter[current];
            int seen;
            followers.TryGetValue(words[j + 1], out seen); // 0 when this follower is new
            followers[words[j + 1]] = seen + 1;
        }
    }
    #endregion

    #region LSI
    // Terms x documents matrix filled with raw keyword counts (bag of words).
    // Alternative weighting that was tried: _documents[j].TF[keyword] * IDF[keyword].
    double[,] termsDocsArray = new double[_rawKeywords.Count, _documents.Count];
    for (int i = 0; i < _rawKeywords.Count; i++)
    {
        for (int j = 0; j < _documents.Count; j++)
        {
            termsDocsArray[i, j] = _documents[j].BagOfWords[_rawKeywords[i]];
        }
    }

    Matrix<double> termsDocsMatrix = Matrix<double>.Build.DenseOfArray(termsDocsArray);
    var svd = termsDocsMatrix.Svd();

    var K = svd.U;
    var S = Matrix<double>.Build.DenseOfDiagonalArray(svd.S.ToArray());
    // Keep only as many rows of VT as there are singular values.
    var DT = svd.VT.SubMatrix(0, S.RowCount, 0, svd.VT.ColumnCount);

    // Rank reduction: drop the last reductionCount singular values and the matching
    // columns of K and rows of DT.
    S_s = S.SubMatrix(0, S.RowCount - reductionCount, 0, S.ColumnCount - reductionCount);
    // The column count below equals S_s.ColumnCount whenever K has at least as many columns as S_s.
    K_s = K.SubMatrix(0, K.RowCount, 0, K.ColumnCount - Math.Abs(K.ColumnCount - S_s.ColumnCount));
    D_sT = DT.SubMatrix(0, DT.RowCount - reductionCount, 0, DT.ColumnCount);
    D_s = D_sT.Transpose();

    LsiMatrix = K_s * S_s * D_sT;
    #endregion
}