Exemplo n.º 1
0
        private void button2_Click(object sender, EventArgs e)
        {
            string text = ReadPdfFile(textBox1.Text);

            string[] sentences = SplitSentences(text);

            Dictionary <string, int> dictionary = new Dictionary <string, int>();
            summary       summ          = new summary();
            List <string> tokenSentence = new List <string>();
            List <string> nonStopWords  = new List <string>();
            List <string> posTag        = new List <string>();
            List <string> names         = new List <string>();

            string[] models = new string[] { "date", "location", "money", "organization", "percentage", "person", "time" };
            List <TFIDFFrequency> tFrequency = new List <TFIDFFrequency>();

            foreach (string sentence in sentences)
            {
                tokenSentence = mTokenizer.Tokenize(sentence).ToList();
                names.Add(mNameFinder.GetNames(models, sentence));
            }

            foreach (string word in StopwordTool._stops.Keys)
            {
                while (tokenSentence.Contains(word))
                {
                    tokenSentence.Remove(word);
                }
            }

            posTag = mPosTagger.Tag(tokenSentence.ToArray()).ToList();

            double[][] inputs = TFIDF.Transform(sentences, 0);
            inputs = TFIDF.Normalize(inputs);

            // Display the output.
            for (int index = 0; index < inputs.Length; index++)
            {
                foreach (double value in inputs[index])
                {
                    tFrequency.Add(new TFIDFFrequency {
                        Sentence = sentences[index], TFValue = value
                    });
                }
            }

            foreach (string word in tokenSentence)
            {
                if (word.Length >= 3)
                {
                    if (dictionary.ContainsKey(word))
                    {
                        dictionary[word]++;
                    }
                    else
                    {
                        dictionary[word] = 1;
                    }
                }
            }

            foreach (string name in names)
            {
                if (name.Length >= 3)
                {
                    if (dictionary.ContainsKey(name))
                    {
                        dictionary[name]++;
                    }
                    else
                    {
                        dictionary[name] = 1;
                    }
                }
            }

            var sortedDict = (from entry in dictionary orderby entry.Value descending select entry).ToDictionary(pair => pair.Key, pair => pair.Value);

            int count2  = 1;
            int result2 = 1;

            int.TryParse(textBox3.Text, out result2);

            foreach (KeyValuePair <string, int> pair in sortedDict)
            {
                summ.Sentences.Add(pair.Key);
                count2++;
                if (count2.Equals(result2))
                {
                    break;
                }
            }

            int count1  = 1;
            int result1 = 1;

            int.TryParse(textBox3.Text, out result1);

            var sortedtFIDF = from data in tFrequency orderby data.TFValue descending select data;

            foreach (var vr in sortedtFIDF)
            {
                summ.Sentences.Add(vr.Sentence);
                count1++;
                if (count1.Equals(result1))
                {
                    break;
                }
            }

            string summary = string.Join("\r\n", summ.Sentences.ToArray());

            richTextBox1.Text = summary;
        }
Exemplo n.º 2
0
        private void button3_Click(object sender, EventArgs e)
        {
            string    text;
            string    word = textBox2.Text;
            WebClient web  = new WebClient();

            HtmlAgilityPack.HtmlDocument Htmldoc = new HtmlAgilityPack.HtmlDocument();
            Process.Start("https://en.wikipedia.org/wiki/" + word);
            byte[] byteArray = web.DownloadData(new Uri("https://en.wikipedia.org/wiki/" + word));
            Stream stream    = new MemoryStream(byteArray);

            Htmldoc.Load(stream);
            FileStream fs     = new FileStream("D:\\htmltext2.pdf", FileMode.Create, FileAccess.Write);
            Document   pdfDoc = new Document();
            PdfWriter  writer = PdfWriter.GetInstance(pdfDoc, fs);

            pdfDoc.Open();
            foreach (HtmlNode node in Htmldoc.DocumentNode.SelectNodes("//p"))
            {
                text = node.InnerText.Trim();
                pdfDoc.Add(new Paragraph(text));
            }
            pdfDoc.Close();

            string htmltext = ReadPdfFile("D:\\htmltext2.pdf");

            string[] sentences = SplitSentences(htmltext);

            Dictionary <string, int> dictionary = new Dictionary <string, int>();
            summary       summ          = new summary();
            List <string> tokenSentence = new List <string>();
            List <string> nonStopWords  = new List <string>();
            List <string> posTag        = new List <string>();
            List <string> names         = new List <string>();

            string[] models = new string[] { "date", "location", "money", "organization", "percentage", "person", "time" };
            List <TFIDFFrequency> tFrequency = new List <TFIDFFrequency>();

            foreach (string sentence in sentences)
            {
                tokenSentence = mTokenizer.Tokenize(sentence).ToList();
                names.Add(mNameFinder.GetNames(models, sentence));
            }

            foreach (string stopWord in StopwordTool._stops.Keys)
            {
                while (tokenSentence.Contains(stopWord))
                {
                    tokenSentence.Remove(stopWord);
                }
            }

            posTag = mPosTagger.Tag(tokenSentence.ToArray()).ToList();

            double[][] inputs = TFIDF.Transform(sentences, 0);
            inputs = TFIDF.Normalize(inputs);

            // Display the output.
            for (int index = 0; index < inputs.Length; index++)
            {
                foreach (double value in inputs[index])
                {
                    tFrequency.Add(new TFIDFFrequency {
                        Sentence = sentences[index], TFValue = value
                    });
                }
            }

            foreach (string tokenWord in tokenSentence)
            {
                if (tokenWord.Length >= 3)
                {
                    if (dictionary.ContainsKey(tokenWord))
                    {
                        dictionary[tokenWord]++;
                    }
                    else
                    {
                        dictionary[tokenWord] = 1;
                    }
                }
            }

            foreach (string name in names)
            {
                if (name.Length >= 3)
                {
                    if (dictionary.ContainsKey(name))
                    {
                        dictionary[name]++;
                    }
                    else
                    {
                        dictionary[name] = 1;
                    }
                }
            }

            var sortedDict = (from entry in dictionary orderby entry.Value descending select entry).ToDictionary(pair => pair.Key, pair => pair.Value);

            int count2  = 1;
            int result2 = 1;

            int.TryParse(textBox4.Text, out result2);

            foreach (KeyValuePair <string, int> pair in sortedDict)
            {
                summ.Sentences.Add(pair.Key);
                count2++;
                if (count2.Equals(result2))
                {
                    break;
                }
            }

            int count1  = 1;
            int result1 = 1;

            int.TryParse(textBox4.Text, out result1);

            var sortedtFIDF = from data in tFrequency orderby data.TFValue descending select data;

            foreach (var vr in sortedtFIDF)
            {
                summ.Sentences.Add(vr.Sentence);
                count1++;
                if (count1.Equals(result1))
                {
                    break;
                }
            }

            string summary = string.Join("\r\n", summ.Sentences.ToArray());

            richTextBox1.Text = summary;
        }