コード例 #1
0
        /// <summary>
        /// Builds an extractive summary of a document using latent semantic analysis.
        /// The text is split into sentences, each sentence is normalised (punctuation
        /// stripped, lower-cased, stop words removed, words stemmed), the most frequent
        /// terms are taken as "concepts", a concept-by-sentence TF-IDF matrix is
        /// decomposed with SVD, and for each leading right-singular vector the sentence
        /// with the largest above-average weight is selected.
        /// </summary>
        /// <param name="textFile">
        /// Document to summarise. Its <c>RawText</c> and <c>DesiredSummaryLength</c> are
        /// read; its <c>DocumentConcepts</c> and <c>DocumentLength</c> are overwritten.
        /// </param>
        /// <returns>
        /// The selected source sentences, each followed by a single space, with line
        /// breaks, tabs and Unicode line/paragraph separators removed. Empty when the
        /// text is blank or no concept could be extracted.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="textFile"/> is null.</exception>
        public static string SummarizeByLSA(TextFile textFile)
        {
            if (textFile == null)
            {
                throw new ArgumentNullException("textFile");
            }

            string input = textFile.RawText;
            if (string.IsNullOrWhiteSpace(input))
            {
                // Nothing to analyse; avoid building an empty matrix.
                textFile.DocumentConcepts = new string[0];
                textFile.DocumentLength   = 0;
                return string.Empty;
            }

            // Segment the text exactly once. The analysed sentences and the sentences
            // emitted in the summary must share the same indices, otherwise a column
            // index of the matrix would point at an unrelated (or missing) sentence.
            string[] sourceSentences = Regex.Split(input, @"(?<=[\.!\?])\s+");
            textFile.DocumentLength = sourceSentences.Length;

            // Stop words--e.g., the, and, a, etc. A set gives constant-time lookups.
            var           stopwords      = new HashSet <string>(File.ReadAllLines(@"Resources/stopwords.txt"));
            PorterStemmer stemmer        = new PorterStemmer();
            char[]        wordSeparators = { ' ', '\t', '\r', '\n' };

            // sentenceTerms[j] holds the normalised terms of sourceSentences[j].
            string[][] sentenceTerms = new string[sourceSentences.Length][];
            for (int s = 0; s < sourceSentences.Length; ++s)
            {
                var cleaned = new StringBuilder();
                foreach (char c in sourceSentences[s])
                {
                    if (c == ':' || c == '…')
                    {
                        // These act as word boundaries rather than being glued away.
                        cleaned.Append(' ');
                    }
                    else if (!char.IsPunctuation(c))
                    {
                        cleaned.Append(c);
                    }
                }

                // Stem word by word; StemWord is given one token at a time.
                // NOTE(review): assumes PorterStemmer.StemWord expects a single word -- confirm.
                sentenceTerms[s] = cleaned.ToString().ToLowerInvariant()
                                   .Split(wordSeparators, StringSplitOptions.RemoveEmptyEntries)
                                   .Where(word => !stopwords.Contains(word))
                                   .Select(word => stemmer.StemWord(word))
                                   .Where(term => !string.IsNullOrEmpty(term))
                                   .ToArray();
            }

            // Count how often every term occurs in the whole document.
            Dictionary <string, int> wordFrequencies = new Dictionary <string, int>();
            foreach (string[] terms in sentenceTerms)
            {
                foreach (string term in terms)
                {
                    int seen;
                    wordFrequencies.TryGetValue(term, out seen);
                    wordFrequencies[term] = seen + 1;
                }
            }

            // Top N words with highest frequencies will serve as document concepts.
            // Take directly from the ordered sequence so the ranking is preserved;
            // fewer than N concepts are returned when the document has fewer terms.
            int N = textFile.DesiredSummaryLength;

            string[] concepts = wordFrequencies.OrderByDescending(kvp => kvp.Value)
                                .Take(N)
                                .Select(kvp => kvp.Key)
                                .ToArray();

            // Add concepts to TextFile instance properties.
            textFile.DocumentConcepts = concepts;

            if (concepts.Length == 0)
            {
                // No usable terms (or N <= 0): there is nothing to decompose.
                return string.Empty;
            }

            // Topic representation matrix: one row per concept, one column per sentence,
            // each cell holding term frequency * inverse document frequency.
            int documentLength = sourceSentences.Length;
            var X = DenseMatrix.Create(concepts.Length, documentLength, (i, j) => 0.0);

            for (int i = 0; i < X.RowCount; ++i)
            {
                int    sentencesWithConcept = 0;
                string concept = concepts[i];
                for (int j = 0; j < X.ColumnCount; ++j)
                {
                    string[] terms     = sentenceTerms[j];
                    int      wordCount = terms.Count(term => term == concept);
                    if (wordCount > 0)
                    {
                        sentencesWithConcept += 1;
                    }

                    // Floating-point division; integer division would truncate to 0.
                    X[i, j] = terms.Length == 0 ? 0.0 : (double)wordCount / terms.Length;
                }

                // The small constant keeps the quotient finite if the count were 0.
                double inverseDocumentFreq = Math.Log(documentLength / (sentencesWithConcept + 0.0001), 2.0);
                for (int k = 0; k < X.ColumnCount; ++k)
                {
                    X[i, k] = X[i, k] * inverseDocumentFreq;
                }
            }

            // Compute SVD of the topic representation matrix, X.
            var svd = X.Svd();

            // Cross method to select summary sentences.
            // VT cannot supply more rows than it has, so clamp the requested row count.
            int             columnCount = svd.VT.ColumnCount;
            int             rowCount    = Math.Min(concepts.Length, svd.VT.RowCount);
            Matrix <double> Vh          = svd.VT.SubMatrix(0, rowCount, 0, columnCount).PointwiseAbs();

            // Zero out every entry that does not exceed its row's average.
            for (int i = 0; i < Vh.RowCount; ++i)
            {
                double averageSentenceScore = Vh.Row(i).Average();
                for (int j = 0; j < Vh.ColumnCount; ++j)
                {
                    if (Vh[i, j] <= averageSentenceScore)
                    {
                        Vh[i, j] = 0;
                    }
                }
            }

            // For each row pick the column (sentence) with the largest remaining value;
            // a row with no positive entry falls back to sentence 0.
            int[] summaryIndices = new int[Vh.RowCount];
            for (int i = 0; i < Vh.RowCount; ++i)
            {
                double max = 0;
                for (int j = 0; j < Vh.ColumnCount; ++j)
                {
                    if (Vh[i, j] > max)
                    {
                        max = Vh[i, j];
                        summaryIndices[i] = j;
                    }
                }
            }

            // Emit each selected sentence once, in the order it was first selected.
            var summary = new StringBuilder();
            foreach (int index in summaryIndices.Distinct())
            {
                summary.Append(sourceSentences[index]).Append(' ');
            }

            /* From https://bit.ly/3ogjy2l */
            return(summary.ToString()
                   .Replace("\r\n", string.Empty)
                   .Replace("\n", string.Empty)
                   .Replace("\r", string.Empty)
                   .Replace("\t", string.Empty)
                   .Replace(((char)0x2028).ToString(), string.Empty)
                   .Replace(((char)0x2029).ToString(), string.Empty));
        }
コード例 #2
0
        /// <summary>
        /// Summarises the files chosen by the user. A single file is loaded, summarised
        /// and the related buttons are shown or hidden; several files are summarised one
        /// by one and the concatenated summaries are summarised again as one document.
        /// Unsupported file types produce an auto-closing message box.
        /// </summary>
        /// <param name="fileNames">Paths of the selected files; null or empty does nothing.</param>
        private void ProcessSelectedFiles(string[] fileNames)
        {
            if (fileNames == null || fileNames.Length == 0)
            {
                return;
            }

            if (fileNames.Length == 1)
            {
                // GetExtension returns the extension including the leading period.
                string fileExt = System.IO.Path.GetExtension(fileNames[0]);

                if (fileExt == ".docx" || fileExt == ".odt" || fileExt == ".pdf")
                {
                    AddProperties_Button.Visibility = Visibility.Visible;
                }
                else if (fileExt == ".txt" /* Txt file properties cannot be updated */)
                {
                    AddProperties_Button.Visibility = Visibility.Hidden;
                }
                else
                {
                    string message = "Unsupported file type selected: " + fileExt +
                                     "\nType must be .docx, .pdf, .odt, or .txt";
                    string caption = "Unsupported file type...";
                    ShowAutoClosingMessage(message, caption);
                    return;
                }

                _activeTextFile = new TextFile(fileNames[0]);
                _activeTextFile.DesiredSummaryLength = _desiredSummaryLength;
                GenerateAndPrintSummary(_activeTextFile);

                Regen_Button.Visibility = Visibility.Visible;
                DragAndDrop.Visibility  = Visibility.Hidden;
                Copy_Button.Visibility  = Visibility.Visible;
                Clear_Button.Visibility = Visibility.Visible;
            }
            else
            {
                string overallSummary = "";
                string overallPath    = System.IO.Path.GetDirectoryName(fileNames[0]);
                foreach (string filePath in fileNames)
                {
                    string fileExt;
                    try
                    {
                        fileExt = System.IO.Path.GetExtension(filePath);
                    }
                    catch (ArgumentException ex)
                    {
                        // Path contains invalid characters: log it and skip the file.
                        Console.WriteLine(ex.Message);
                        continue;
                    }

                    // Handle invalid file types
                    if (!IsSupportedExtension(fileExt))
                    {
                        continue;
                    }

                    // Use a local so the active file is only replaced once the combined
                    // summary is ready, and apply the same summary length as the
                    // single-file path.
                    TextFile currentFile = new TextFile(filePath);
                    currentFile.DesiredSummaryLength = _desiredSummaryLength;
                    overallSummary += Summarizer.SummarizeByLSA(currentFile);
                }

                if (overallSummary.Length == 0)
                {
                    string message = "No supported file types were selected. " +
                                     "\nType must be .docx, .pdf, .odt, or .txt";
                    string caption = "Unsupported file types...";
                    ShowAutoClosingMessage(message, caption);
                    return;
                }

                _activeTextFile = new TextFile
                {
                    RawText              = overallSummary,
                    Name                 = overallPath,
                    FullPath             = overallPath,
                    Extension            = null,
                    DesiredSummaryLength = _desiredSummaryLength
                };

                GenerateAndPrintSummary(_activeTextFile);
            }
        }

        /// <summary>
        /// Tells whether <paramref name="fileExt"/> (with its leading period, as returned
        /// by <c>Path.GetExtension</c>) is one of the supported document types.
        /// </summary>
        /// <remarks>
        /// The comparison is case-sensitive, matching the single-file branch.
        /// NOTE(review): consider a case-insensitive match once TextFile is confirmed to
        /// handle upper-case extensions.
        /// </remarks>
        private static bool IsSupportedExtension(string fileExt)
        {
            return fileExt == ".docx" || fileExt == ".odt" || fileExt == ".pdf" || fileExt == ".txt";
        }

        /// <summary>
        /// Shows a message box owned by a zero-sized form that is closed after seven
        /// seconds; closing the owner is meant to dismiss the box automatically.
        /// The timer starts here, right before the box is displayed, so earlier
        /// processing time does not eat into the display time.
        /// </summary>
        private static void ShowAutoClosingMessage(string message, string caption)
        {
            Form owner = new Form()
            {
                Size = new System.Drawing.Size(0, 0)
            };

            Task.Delay(TimeSpan.FromSeconds(7))
            .ContinueWith((t) => owner.Close(), TaskScheduler.FromCurrentSynchronizationContext());

            System.Windows.Forms.MessageBox.Show(owner, message, caption);
        }