コード例 #1
0
        //Read stop word list from text file called stopwords.txt

        //public string[] StopWords()
        //{
        //    string[] stopText = File.ReadAllLines("stopwords.txt");
        //    return stopText;

        //}


        /// <summary>
        /// Extracts the text of a PDF page by page and updates the word and sentence
        /// statistics held in this instance's fields.
        /// </summary>
        /// <remarks>
        /// For every page the method records the page's sentences in <c>sentenceList</c>,
        /// keeps the lower-cased noun tokens (POS tags NN, NNS, NNP, NNPS) that are not in
        /// <c>_stopWords</c>, and adds one <c>Words</c> entry per distinct noun to
        /// <c>wordList</c>. After the last page it fills in the corpus and per-sentence
        /// frequencies of every <c>wordList</c> entry, appends one <c>UniqueWords</c> entry
        /// per element of <c>UniqueWordsinCorpus</c> to <c>UniqueWordList</c>, and sets each
        /// entry's <c>DocFrequency</c> to the number of pages recorded for it.
        /// Any exception is caught and its message is shown in a message box; nothing is
        /// rethrown.
        /// NOTE(review): the result lists are only ever appended to here, so calling this
        /// method a second time on the same instance looks likely to fail with a
        /// duplicate page-number key - confirm how callers reset the state.
        /// </remarks>
        /// <param name="pdfpath">Path of the PDF file handed to <c>PdfReader</c>.</param>
        public void ReadPdf(string pdfpath)
        {
            try
            {
                PdfReader pdfr = new PdfReader(pdfpath);
                try
                {
                    // NOTE(review): the page count is taken from DocumentInfo.TotalPages, not
                    // from the reader opened above; presumably it describes the same file - verify.
                    int tp = DocumentInfo.TotalPages;

                    for (int page = 1; page <= tp; page++)
                    {
                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                        string currentText = PdfTextExtractor.GetTextFromPage(pdfr, page, strategy);

                        // NOTE(review): this round trip through Encoding.Default drops every
                        // character the system ANSI code page cannot represent. Kept unchanged
                        // because later stages may depend on the narrowed text - confirm.
                        currentText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));

                        POSTagger.mModelPath = "Models\\";
                        string[] tSplittedWords = GetWords(currentText);
                        string[] sentences = POSTagger.SplitSentences(currentText);

                        // Sentence numbers are 1-based and continue across pages.
                        for (int i = 0; i < sentences.Length; i++)
                        {
                            Sentences sentence = new Sentences();
                            sentence.SentenceNumber = i + 1 + currentcountofsentences;
                            sentence.SentenceString = sentences[i].ToLower();
                            sentenceList.Add(sentence);
                        }

                        currentcountofsentences += sentences.Length;

                        string[] POS = POSTagger.PosTagTokens(tSplittedWords);

                        splittedWords.Clear();
                        nrSplittedWords.Clear();
                        countList.Clear();

                        // Keep only nouns that are not stop words. POS[i] is used as the tag
                        // of tSplittedWords[i] (assumes one tag per input token).
                        List<string> pageNouns = new List<string>();
                        for (int i = 0; i < POS.Length; i++)
                        {
                            if (POS[i] == "NN" || POS[i] == "NNS" || POS[i] == "NNP" || POS[i] == "NNPS")
                            {
                                string noun = tSplittedWords[i].ToLower();
                                if (!Array.Exists(_stopWords, element => element == noun))
                                {
                                    pageNouns.Add(noun);
                                }
                            }
                        }

                        splittedWords = pageNouns;
                        nrSplittedWords = splittedWords.Distinct().ToList();

                        int nrswcount = nrSplittedWords.Count;
                        _eachPageWordCount.Add(nrswcount);
                        _tUniqueWordsinPage = nrswcount;

                        // Term frequency: count every noun of the page in one pass.
                        Dictionary<string, int> pageCounts = new Dictionary<string, int>();
                        foreach (string noun in pageNouns)
                        {
                            int seen;
                            pageCounts.TryGetValue(noun, out seen);
                            pageCounts[noun] = seen + 1;
                        }

                        foreach (string term in nrSplittedWords)
                        {
                            int count = pageCounts[term];
                            countList.Add(count);

                            Words entry = new Words();
                            entry.Word = term;
                            entry.TermFrequency = count;
                            entry.Pageno = page;
                            wordList.Add(entry);
                        }

                        UniqueWordsinCorpus.AddRange(nrSplittedWords);
                    }
                }
                finally
                {
                    // Release the reader even when a page fails to process.
                    pdfr.Close();
                }

                // UniqueWordsinCorpus becomes the sorted list of distinct terms.
                UniqueWordsinCorpus = UniqueWordsinCorpus.Distinct().ToList();
                UniqueWordsinCorpus.Sort();

                // Corpus frequency: sum of the term frequencies of all entries sharing a word.
                Dictionary<string, int> corpusTotals = new Dictionary<string, int>();
                foreach (Words w in wordList)
                {
                    int total;
                    corpusTotals.TryGetValue(w.Word, out total);
                    corpusTotals[w.Word] = total + w.TermFrequency;
                }

                foreach (Words w in wordList)
                {
                    w.CorpusFrequency = corpusTotals[w.Word];
                }

                // Tokenise each sentence once instead of once per (word, sentence) pair.
                List<Dictionary<string, int>> tokenCountsPerSentence = new List<Dictionary<string, int>>();
                foreach (Sentences s in sentenceList)
                {
                    Dictionary<string, int> tokenCounts = new Dictionary<string, int>();
                    foreach (string token in GetWords(s.SentenceString))
                    {
                        if (token == null)
                        {
                            continue;
                        }

                        int seen;
                        tokenCounts.TryGetValue(token, out seen);
                        tokenCounts[token] = seen + 1;
                    }

                    tokenCountsPerSentence.Add(tokenCounts);
                }

                // Every entry gets its own map with one record per sentence (zero included).
                foreach (Words w in wordList)
                {
                    w.SentencenoWithFrequency = new Dictionary<int, int>();

                    int sentenceIndex = 0;
                    foreach (Sentences s in sentenceList)
                    {
                        int sentfreq;
                        tokenCountsPerSentence[sentenceIndex].TryGetValue(w.Word, out sentfreq);
                        w.SentencenoWithFrequency.Add(s.SentenceNumber, sentfreq);
                        sentenceIndex++;
                    }
                }

                wordList.Sort((w1, w2) => w1.Word.CompareTo(w2.Word));

                // Merge the per-page entries of each term into one UniqueWords record.
                // The lookup keeps the sorted wordList order inside every group and yields
                // an empty sequence for a term that has no entry.
                var entriesByWord = wordList.ToLookup(w => w.Word);
                foreach (string term in UniqueWordsinCorpus)
                {
                    UniqueWords unique = new UniqueWords();
                    unique.SentencenoWithFrequency = new Dictionary<int, int>();
                    unique.PagenoWithFrequency = new Dictionary<int, int>();

                    foreach (Words w in entriesByWord[term])
                    {
                        if (unique.Term == null)
                        {
                            unique.Term = w.Word;
                        }

                        unique.CorpusFrequency = w.CorpusFrequency;
                        unique.SentencenoWithFrequency = w.SentencenoWithFrequency;
                        unique.PagenoWithFrequency.Add(w.Pageno, w.TermFrequency);
                    }

                    UniqueWordList.Add(unique);
                }

                // Document frequency: number of pages recorded for the term.
                foreach (UniqueWords uw in UniqueWordList)
                {
                    uw.DocFrequency = uw.PagenoWithFrequency.Count;
                }
            }
            catch (Exception se)
            {
                MessageBox.Show(se.Message);
            }
        }
コード例 #2
0
        //Read stop word list from text file called stopwords.txt
        //public string[] StopWords()
        //{
        //    string[] stopText = File.ReadAllLines("stopwords.txt");
        //    return stopText;
        //}
        /// <summary>
        /// Extracts the text of a PDF page by page and updates the word and sentence
        /// statistics held in this instance's fields.
        /// </summary>
        /// <remarks>
        /// For every page the method records the page's sentences in <c>sentenceList</c>,
        /// keeps the lower-cased noun tokens (POS tags NN, NNS, NNP, NNPS) that are not in
        /// <c>_stopWords</c>, and adds one <c>Words</c> entry per distinct noun to
        /// <c>wordList</c>. After the last page it fills in the corpus and per-sentence
        /// frequencies of every <c>wordList</c> entry, appends one <c>UniqueWords</c> entry
        /// per element of <c>UniqueWordsinCorpus</c> to <c>UniqueWordList</c>, and sets each
        /// entry's <c>DocFrequency</c> to the number of pages recorded for it.
        /// Any exception is caught and its message is shown in a message box; nothing is
        /// rethrown.
        /// NOTE(review): the result lists are only ever appended to here, so calling this
        /// method a second time on the same instance looks likely to fail with a
        /// duplicate page-number key - confirm how callers reset the state.
        /// </remarks>
        /// <param name="pdfpath">Path of the PDF file handed to <c>PdfReader</c>.</param>
        public void ReadPdf(string pdfpath)
        {
            try
            {
                PdfReader pdfr = new PdfReader(pdfpath);
                try
                {
                    // NOTE(review): the page count is taken from DocumentInfo.TotalPages, not
                    // from the reader opened above; presumably it describes the same file - verify.
                    int tp = DocumentInfo.TotalPages;

                    for (int page = 1; page <= tp; page++)
                    {
                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                        string currentText = PdfTextExtractor.GetTextFromPage(pdfr, page, strategy);

                        // NOTE(review): this round trip through Encoding.Default drops every
                        // character the system ANSI code page cannot represent. Kept unchanged
                        // because later stages may depend on the narrowed text - confirm.
                        currentText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));

                        POSTagger.mModelPath = "Models\\";
                        string[] tSplittedWords = GetWords(currentText);
                        string[] sentences = POSTagger.SplitSentences(currentText);

                        // Sentence numbers are 1-based and continue across pages.
                        for (int i = 0; i < sentences.Length; i++)
                        {
                            Sentences sentence = new Sentences();
                            sentence.SentenceNumber = i + 1 + currentcountofsentences;
                            sentence.SentenceString = sentences[i].ToLower();
                            sentenceList.Add(sentence);
                        }

                        currentcountofsentences += sentences.Length;

                        string[] POS = POSTagger.PosTagTokens(tSplittedWords);

                        splittedWords.Clear();
                        nrSplittedWords.Clear();
                        countList.Clear();

                        // Keep only nouns that are not stop words. POS[i] is used as the tag
                        // of tSplittedWords[i] (assumes one tag per input token).
                        List<string> pageNouns = new List<string>();
                        for (int i = 0; i < POS.Length; i++)
                        {
                            if (POS[i] == "NN" || POS[i] == "NNS" || POS[i] == "NNP" || POS[i] == "NNPS")
                            {
                                string noun = tSplittedWords[i].ToLower();
                                if (!Array.Exists(_stopWords, element => element == noun))
                                {
                                    pageNouns.Add(noun);
                                }
                            }
                        }

                        splittedWords = pageNouns;
                        nrSplittedWords = splittedWords.Distinct().ToList();

                        int nrswcount = nrSplittedWords.Count;
                        _eachPageWordCount.Add(nrswcount);
                        _tUniqueWordsinPage = nrswcount;

                        // Term frequency: count every noun of the page in one pass.
                        Dictionary<string, int> pageCounts = new Dictionary<string, int>();
                        foreach (string noun in pageNouns)
                        {
                            int seen;
                            pageCounts.TryGetValue(noun, out seen);
                            pageCounts[noun] = seen + 1;
                        }

                        foreach (string term in nrSplittedWords)
                        {
                            int count = pageCounts[term];
                            countList.Add(count);

                            Words entry = new Words();
                            entry.Word = term;
                            entry.TermFrequency = count;
                            entry.Pageno = page;
                            wordList.Add(entry);
                        }

                        UniqueWordsinCorpus.AddRange(nrSplittedWords);
                    }
                }
                finally
                {
                    // Release the reader even when a page fails to process.
                    pdfr.Close();
                }

                // UniqueWordsinCorpus becomes the sorted list of distinct terms.
                UniqueWordsinCorpus = UniqueWordsinCorpus.Distinct().ToList();
                UniqueWordsinCorpus.Sort();

                // Corpus frequency: sum of the term frequencies of all entries sharing a word.
                Dictionary<string, int> corpusTotals = new Dictionary<string, int>();
                foreach (Words w in wordList)
                {
                    int total;
                    corpusTotals.TryGetValue(w.Word, out total);
                    corpusTotals[w.Word] = total + w.TermFrequency;
                }

                foreach (Words w in wordList)
                {
                    w.CorpusFrequency = corpusTotals[w.Word];
                }

                // Tokenise each sentence once instead of once per (word, sentence) pair.
                List<Dictionary<string, int>> tokenCountsPerSentence = new List<Dictionary<string, int>>();
                foreach (Sentences s in sentenceList)
                {
                    Dictionary<string, int> tokenCounts = new Dictionary<string, int>();
                    foreach (string token in GetWords(s.SentenceString))
                    {
                        if (token == null)
                        {
                            continue;
                        }

                        int seen;
                        tokenCounts.TryGetValue(token, out seen);
                        tokenCounts[token] = seen + 1;
                    }

                    tokenCountsPerSentence.Add(tokenCounts);
                }

                // Every entry gets its own map with one record per sentence (zero included).
                foreach (Words w in wordList)
                {
                    w.SentencenoWithFrequency = new Dictionary<int, int>();

                    int sentenceIndex = 0;
                    foreach (Sentences s in sentenceList)
                    {
                        int sentfreq;
                        tokenCountsPerSentence[sentenceIndex].TryGetValue(w.Word, out sentfreq);
                        w.SentencenoWithFrequency.Add(s.SentenceNumber, sentfreq);
                        sentenceIndex++;
                    }
                }

                wordList.Sort((w1, w2) => w1.Word.CompareTo(w2.Word));

                // Merge the per-page entries of each term into one UniqueWords record.
                // The lookup keeps the sorted wordList order inside every group and yields
                // an empty sequence for a term that has no entry.
                var entriesByWord = wordList.ToLookup(w => w.Word);
                foreach (string term in UniqueWordsinCorpus)
                {
                    UniqueWords unique = new UniqueWords();
                    unique.SentencenoWithFrequency = new Dictionary<int, int>();
                    unique.PagenoWithFrequency = new Dictionary<int, int>();

                    foreach (Words w in entriesByWord[term])
                    {
                        if (unique.Term == null)
                        {
                            unique.Term = w.Word;
                        }

                        unique.CorpusFrequency = w.CorpusFrequency;
                        unique.SentencenoWithFrequency = w.SentencenoWithFrequency;
                        unique.PagenoWithFrequency.Add(w.Pageno, w.TermFrequency);
                    }

                    UniqueWordList.Add(unique);
                }

                // Document frequency: number of pages recorded for the term.
                foreach (UniqueWords uw in UniqueWordList)
                {
                    uw.DocFrequency = uw.PagenoWithFrequency.Count;
                }
            }
            catch (Exception se)
            {
                MessageBox.Show(se.Message);
            }
        }