/// <summary>
/// Produces an extractive summary of a document using latent semantic analysis (LSA).
/// </summary>
/// <remarks>
/// Pipeline:
/// 1. Split the raw text into sentences (after '.', '!' or '?' followed by whitespace).
/// 2. Reduce every sentence to a list of terms: punctuation stripped, lower-cased,
///    stop words (read from Resources/stopwords.txt) removed, each word stemmed.
/// 3. Take the most frequent terms as "concepts" and build a concept-by-sentence
///    matrix of term frequency weighted by inverse document frequency.
/// 4. Run an SVD and, for each leading row of VT, pick the sentence with the largest
///    above-average absolute weight.
/// The same sentence split is used for scoring and for output, so a selected column
/// index always refers to the sentence that was actually scored.
/// </remarks>
/// <param name="textFile">
/// Document to summarize. <c>RawText</c> and <c>DesiredSummaryLength</c> are read;
/// <c>DocumentConcepts</c> and <c>DocumentLength</c> are written.
/// </param>
/// <returns>
/// The selected sentences, each followed by a single space, with line breaks, tabs and
/// the Unicode line/paragraph separators removed. Empty when no concept can be extracted.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="textFile"/> is null.</exception>
public static string SummarizeByLSA(TextFile textFile)
{
    if (textFile == null)
    {
        throw new ArgumentNullException(nameof(textFile));
    }

    string input = textFile.RawText ?? string.Empty;

    // One split for both scoring and output keeps matrix columns aligned with the
    // sentences that end up in the summary.
    string[] sourceSentences = Regex.Split(input, @"(?<=[\.!\?])\s+");
    textFile.DocumentLength = sourceSentences.Length;

    // Stop words--e.g., the, and, a, etc. A set gives O(1) membership checks.
    var stopwords = new HashSet<string>(File.ReadAllLines(@"Resources/stopwords.txt"), StringComparer.Ordinal);
    PorterStemmer stemmer = new PorterStemmer();

    var sentenceTerms = new List<string>[sourceSentences.Length];
    for (int s = 0; s < sourceSentences.Length; ++s)
    {
        sentenceTerms[s] = ExtractTerms(sourceSentences[s], stopwords, stemmer);
    }

    var wordFrequencies = new Dictionary<string, int>();
    foreach (List<string> terms in sentenceTerms)
    {
        foreach (string term in terms)
        {
            int seen;
            wordFrequencies.TryGetValue(term, out seen);
            wordFrequencies[term] = seen + 1;
        }
    }

    // Top N words with highest frequencies will serve as document concepts.
    // Take() is applied directly to the ordered sequence; a Dictionary round-trip
    // would not be guaranteed to preserve the ordering.
    int desiredLength = textFile.DesiredSummaryLength;
    string[] concepts = wordFrequencies
        .OrderByDescending(pair => pair.Value)
        .Take(desiredLength)
        .Select(pair => pair.Key)
        .ToArray();

    // Add concepts to TextFile instance properties.
    textFile.DocumentConcepts = concepts;

    if (concepts.Length == 0)
    {
        // Nothing to score (empty text, only stop words, or a non-positive length).
        return string.Empty;
    }

    // Rows are sized by the concepts actually found, which may be fewer than requested.
    int sentenceCount = sourceSentences.Length;
    var topicMatrix = DenseMatrix.Create(concepts.Length, sentenceCount, (row, col) => 0.0);
    for (int c = 0; c < topicMatrix.RowCount; ++c)
    {
        string concept = concepts[c];
        int sentencesWithConcept = 0;

        for (int s = 0; s < topicMatrix.ColumnCount; ++s)
        {
            List<string> terms = sentenceTerms[s];
            if (terms.Count == 0)
            {
                continue; // no terms left after normalization; weight stays 0
            }

            int occurrences = terms.Count(term => term == concept);
            if (occurrences > 0)
            {
                sentencesWithConcept += 1;
            }

            // Cast first: int / int would truncate nearly every frequency to 0.
            topicMatrix[c, s] = (double)occurrences / terms.Count;
        }

        // The small constant keeps the denominator non-zero.
        double inverseDocumentFreq = Math.Log(sentenceCount / (sentencesWithConcept + 0.0001), 2.0);
        for (int s = 0; s < topicMatrix.ColumnCount; ++s)
        {
            topicMatrix[c, s] = topicMatrix[c, s] * inverseDocumentFreq;
        }
    }

    // Compute SVD of the topic representation matrix.
    var svd = topicMatrix.Svd();

    // Cross method to select summary sentences. VT cannot supply more rows than it
    // has, so the number of topics is clamped.
    int topicCount = Math.Min(concepts.Length, svd.VT.RowCount);
    Matrix<double> Vh = svd.VT.SubMatrix(0, topicCount, 0, svd.VT.ColumnCount).PointwiseAbs();

    var summaryIndices = new List<int>(topicCount);
    var alreadyChosen = new HashSet<int>();
    for (int t = 0; t < Vh.RowCount; ++t)
    {
        double averageSentenceScore = Vh.Row(t).Average();

        // Only entries strictly above the row average compete; the first largest wins.
        int best = -1;
        double bestScore = 0;
        for (int s = 0; s < Vh.ColumnCount; ++s)
        {
            double score = Vh[t, s];
            if (score > averageSentenceScore && score > bestScore)
            {
                bestScore = score;
                best = s;
            }
        }

        // A row with no above-average entry contributes nothing (instead of silently
        // defaulting to sentence 0), and a sentence is emitted at most once.
        if (best >= 0 && alreadyChosen.Add(best))
        {
            summaryIndices.Add(best);
        }
    }

    var summary = new StringBuilder();
    foreach (int index in summaryIndices)
    {
        summary.Append(sourceSentences[index]).Append(" ");
    }

    /* From https://bit.ly/3ogjy2l */
    return summary.ToString()
        .Replace("\r\n", string.Empty)
        .Replace("\n", string.Empty)
        .Replace("\r", string.Empty)
        .Replace("\t", string.Empty)
        .Replace(((char)0x2028).ToString(), string.Empty)
        .Replace(((char)0x2029).ToString(), string.Empty);
}

/// <summary>
/// Reduces one sentence to its stemmed, lower-cased, stop-word-free terms.
/// </summary>
private static List<string> ExtractTerms(string sentence, HashSet<string> stopwords, PorterStemmer stemmer)
{
    var cleaned = new StringBuilder(sentence.Length);
    foreach (char c in sentence)
    {
        if (char.IsPunctuation(c))
        {
            continue; // punctuation is dropped, not replaced
        }

        // Any whitespace (including line breaks) becomes a plain word separator.
        cleaned.Append(char.IsWhiteSpace(c) ? ' ' : char.ToLowerInvariant(c));
    }

    var terms = new List<string>();
    string[] words = cleaned.ToString().Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    foreach (string word in words)
    {
        if (stopwords.Contains(word))
        {
            continue;
        }

        // Stem word by word; stemming a whole sentence only touches its tail.
        string stem = stemmer.StemWord(word);
        if (!string.IsNullOrEmpty(stem))
        {
            terms.Add(stem);
        }
    }

    return terms;
}
/// <summary>
/// Summarizes the selected file(s) and updates the window.
/// </summary>
/// <remarks>
/// A single file is summarized directly. Several files are summarized one by one,
/// the partial summaries are concatenated, and the concatenation is summarized again.
/// Unsupported selections produce a message box that closes itself after seven seconds.
/// </remarks>
/// <param name="fileNames">Paths of the selected files; null or empty does nothing.</param>
private void ProcessSelectedFiles(string[] fileNames)
{
    if (fileNames == null || fileNames.Length == 0)
    {
        return;
    }

    if (fileNames.Length == 1)
    {
        string fileExt = System.IO.Path.GetExtension(fileNames[0]);

        /*
         * Handle improper file types
         */
        if (!IsSupportedExtension(fileExt))
        {
            string message = "Unsupported file type selected: " + fileExt
                             + "\nType must be .docx, .pdf, .odt, or .txt";
            string caption = "Unsupported file type...";
            ShowAutoClosingMessage(message, caption);
            return;
        }

        /* Txt file properties cannot be updated */
        AddProperties_Button.Visibility = fileExt == ".txt" ? Visibility.Hidden : Visibility.Visible;

        _activeTextFile = new TextFile(fileNames[0]);
        _activeTextFile.DesiredSummaryLength = _desiredSummaryLength;
        GenerateAndPrintSummary(_activeTextFile);

        Regen_Button.Visibility = Visibility.Visible;
        DragAndDrop.Visibility = Visibility.Hidden;
        Copy_Button.Visibility = Visibility.Visible;
        Clear_Button.Visibility = Visibility.Visible;
        return;
    }

    var overallSummary = new System.Text.StringBuilder();
    string overallPath = System.IO.Path.GetDirectoryName(fileNames[0]);

    foreach (string filePath in fileNames)
    {
        string fileExt;
        try
        {
            fileExt = System.IO.Path.GetExtension(filePath);
        }
        catch (ArgumentException ex)
        {
            // A path that cannot be parsed cannot be loaded either; skip it.
            Console.WriteLine(ex.Message);
            continue;
        }

        // Handle invalid file types. GetExtension returns the leading dot, so the
        // comparison must be against ".docx" etc., not a substring of "docx|odt|...".
        if (!IsSupportedExtension(fileExt))
        {
            continue;
        }

        // Same summary length as the single-file path.
        _activeTextFile = new TextFile(filePath);
        _activeTextFile.DesiredSummaryLength = _desiredSummaryLength;
        overallSummary.Append(Summarizer.SummarizeByLSA(_activeTextFile));
    }

    if (overallSummary.Length == 0)
    {
        string message = "No supported file types were selected. "
                         + "\nType must be .docx, .pdf, .odt, or .txt";
        string caption = "Unsupported file types...";
        ShowAutoClosingMessage(message, caption);
        return;
    }

    _activeTextFile = new TextFile
    {
        RawText = overallSummary.ToString(),
        Name = overallPath,
        FullPath = overallPath,
        Extension = null,
        DesiredSummaryLength = _desiredSummaryLength
    };
    GenerateAndPrintSummary(_activeTextFile);
    // NOTE(review): unlike the single-file path, no button/drop-zone visibility is
    // changed here - confirm whether that is intentional.
}

/// <summary>
/// Returns true for the extensions (leading dot included) this window can summarize.
/// </summary>
/// <remarks>
/// NOTE(review): the match is case-sensitive, as in the original single-file check;
/// confirm how TextFile treats upper-case extensions before relaxing it.
/// </remarks>
private static bool IsSupportedExtension(string fileExt)
{
    return fileExt == ".docx" || fileExt == ".odt" || fileExt == ".pdf" || fileExt == ".txt";
}

/// <summary>
/// Shows a message box owned by a zero-sized form that is closed after seven seconds.
/// </summary>
/// <remarks>
/// The owner form and its timer are created only when a message is actually shown,
/// so the countdown starts when the message appears.
/// </remarks>
private static void ShowAutoClosingMessage(string message, string caption)
{
    // Presumably closing the owner is what dismisses the message box it owns.
    Form owner = new Form() { Size = new System.Drawing.Size(0, 0) };
    Task.Delay(TimeSpan.FromSeconds(7))
        .ContinueWith((t) => owner.Close(), TaskScheduler.FromCurrentSynchronizationContext());
    System.Windows.Forms.MessageBox.Show(owner, message, caption);
}