//Read stop word list from text file called stopwords.txt
//public string[] StopWords()
//{
//    string[] stopText = File.ReadAllLines("stopwords.txt");
//    return stopText;
//}

/// <summary>
/// Reads a PDF page by page, keeps the noun tokens that are not stop words, and
/// fills the word statistics held in this object's fields: per-page term
/// frequency, corpus frequency, per-sentence frequency and document (page) frequency.
/// </summary>
/// <param name="pdfpath">Path of the PDF file handed to <c>PdfReader</c>.</param>
/// <remarks>
/// Any exception is caught and its message is shown in a message box; nothing is rethrown.
/// NOTE(review): the result fields (wordList, sentenceList, UniqueWordsinCorpus,
/// UniqueWordList, ...) are appended to and never reset here, so this method looks
/// like it is meant to run once per instance - confirm against the callers.
/// </remarks>
public void ReadPdf(string pdfpath)
{
    try
    {
        PdfReader pdfr = new PdfReader(pdfpath);
        try
        {
            // Set-based lookup replaces a linear scan of the stop-word array for every
            // word; the default string comparer is ordinal, the same as the == operator.
            HashSet<string> stopWordSet = new HashSet<string>(_stopWords);

            // NOTE(review): the page count comes from DocumentInfo rather than from the
            // reader itself; assumes it was populated for this same file - confirm.
            int tp = DocumentInfo.TotalPages;

            //loop to read pdf page by page
            for (int page = 1; page <= tp; page++)
            {
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                string currentText = PdfTextExtractor.GetTextFromPage(pdfr, page, strategy);

                // Round trip through the system default encoding.
                // NOTE(review): characters Encoding.Default cannot represent will not
                // survive this conversion - confirm that this is intended.
                currentText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));

                POSTagger.mModelPath = "Models\\";
                string[] tSplittedWords = GetWords(currentText);
                string[] sentences = POSTagger.SplitSentences(currentText);

                // Sentence numbers continue across pages via currentcountofsentences.
                int sc = sentences.Length;
                for (int i = 0; i < sc; i++)
                {
                    Sentences sentence = new Sentences();
                    sentence.SentenceNumber = i + 1 + currentcountofsentences;
                    sentence.SentenceString = sentences[i].ToLower();
                    sentenceList.Add(sentence);
                }
                currentcountofsentences += sc;

                string[] POS = POSTagger.PosTagTokens(tSplittedWords);
                splittedWords.Clear();
                nrSplittedWords.Clear();
                countList.Clear();

                // Keep only nouns, lower-cased, that are not stop words.
                //NN   Noun, singular or mass
                //NNS  Noun, plural
                //NNP  Proper noun, singular
                //NNPS Proper noun, plural
                List<string> pageWords = new List<string>();
                int poslength = POS.Length;
                for (int i = 0; i < poslength; i++)
                {
                    if (POS[i] == "NN" || POS[i] == "NNS" || POS[i] == "NNP" || POS[i] == "NNPS")
                    {
                        string lowered = tSplittedWords[i].ToLower();
                        tSplittedWords[i] = lowered;
                        if (!stopWordSet.Contains(lowered))
                        {
                            pageWords.Add(lowered);
                        }
                    }
                }
                splittedWords = pageWords;

                // Term frequency in a single pass; distinctWords keeps first-appearance
                // order, which is the order Distinct() would produce.
                Dictionary<string, int> pageCounts = new Dictionary<string, int>();
                List<string> distinctWords = new List<string>();
                foreach (string word in pageWords)
                {
                    int seen;
                    if (pageCounts.TryGetValue(word, out seen))
                    {
                        pageCounts[word] = seen + 1;
                    }
                    else
                    {
                        pageCounts.Add(word, 1);
                        distinctWords.Add(word);
                    }
                }
                nrSplittedWords = distinctWords;

                int nrswcount = distinctWords.Count;
                _eachPageWordCount.Add(nrswcount);
                _tUniqueWordsinPage = nrswcount;

                // One Words entry per distinct word on this page.
                foreach (string word in distinctWords)
                {
                    int termFrequency = pageCounts[word];
                    countList.Add(termFrequency);

                    Words entry = new Words();
                    entry.Word = word;
                    entry.TermFrequency = termFrequency;
                    entry.Pageno = page;
                    wordList.Add(entry);
                }

                UniqueWordsinCorpus.AddRange(distinctWords);
            } //end of page loop
        }
        finally
        {
            // Close the reader even when a page fails, so the file is not left open.
            pdfr.Close();
        }

        //UniqueWordsinCorpus is a list of string of unique words
        UniqueWordsinCorpus = UniqueWordsinCorpus.Distinct().ToList();
        UniqueWordsinCorpus.Sort();

        // Corpus frequency = sum of the term frequencies of a word over all pages.
        Dictionary<string, int> corpusTotals = new Dictionary<string, int>();
        foreach (Words w in wordList)
        {
            int runningTotal;
            corpusTotals.TryGetValue(w.Word, out runningTotal);
            corpusTotals[w.Word] = runningTotal + w.TermFrequency;
        }
        foreach (Words w in wordList)
        {
            w.CorpusFrequency = corpusTotals[w.Word];
        }

        // Tokenise every sentence once (instead of once per word entry) and keep the
        // token counts next to the sentence number.
        List<KeyValuePair<int, Dictionary<string, int>>> sentenceTokenCounts =
            new List<KeyValuePair<int, Dictionary<string, int>>>(sentenceList.Count);
        foreach (Sentences s in sentenceList)
        {
            Dictionary<string, int> tokenCounts = new Dictionary<string, int>();
            foreach (string token in GetWords(s.SentenceString))
            {
                if (token == null)
                {
                    continue; // a null token can never match a word
                }
                int occurrences;
                tokenCounts.TryGetValue(token, out occurrences);
                tokenCounts[token] = occurrences + 1;
            }
            sentenceTokenCounts.Add(new KeyValuePair<int, Dictionary<string, int>>(s.SentenceNumber, tokenCounts));
        }

        // Every word entry gets its own sentence-number -> frequency map; sentences
        // that do not contain the word are recorded with frequency 0.
        foreach (Words w in wordList)
        {
            Dictionary<int, int> perSentence = new Dictionary<int, int>();
            w.SentencenoWithFrequency = perSentence;
            foreach (KeyValuePair<int, Dictionary<string, int>> sentenceEntry in sentenceTokenCounts)
            {
                int sentfreq;
                sentenceEntry.Value.TryGetValue(w.Word, out sentfreq);
                perSentence.Add(sentenceEntry.Key, sentfreq);
            }
        }

        //wordList.Sort(delegate(Words w1, Words w2) { return w1.Word.CompareTo(w2.Word); });
        wordList.Sort((w1, w2) => w1.Word.CompareTo(w2.Word));

        // Bucket the sorted entries by word so each unique term is resolved with one
        // lookup; the order inside a bucket is the order in the sorted wordList.
        Dictionary<string, List<Words>> entriesByWord = new Dictionary<string, List<Words>>();
        foreach (Words w in wordList)
        {
            List<Words> bucket;
            if (!entriesByWord.TryGetValue(w.Word, out bucket))
            {
                bucket = new List<Words>();
                entriesByWord.Add(w.Word, bucket);
            }
            bucket.Add(w);
        }

        //copying words from wordList of Words to uniquewordlist of uniquewords while removing the redundant entry
        foreach (string term in UniqueWordsinCorpus)
        {
            UniqueWords unique = new UniqueWords();
            unique.SentencenoWithFrequency = new Dictionary<int, int>();
            unique.PagenoWithFrequency = new Dictionary<int, int>();

            List<Words> matches;
            if (term != null && entriesByWord.TryGetValue(term, out matches))
            {
                foreach (Words w in matches)
                {
                    if (unique.Term == null)
                    {
                        unique.Term = w.Word;
                    }
                    unique.CorpusFrequency = w.CorpusFrequency;
                    unique.SentencenoWithFrequency = w.SentencenoWithFrequency;
                    unique.PagenoWithFrequency.Add(w.Pageno, w.TermFrequency);
                }
            }
            UniqueWordList.Add(unique);
        }

        //computing document frequency of unique words
        foreach (UniqueWords uw in UniqueWordList)
        {
            uw.DocFrequency = uw.PagenoWithFrequency.Count;
        }

        // Follow-up steps that are currently disabled:
        // matrixpro.GenerateMatrix(UniqueWordList);
        //UserWordsEditor uwe=new UserWordsEditor(UniqueWordList);
        //uwe.Show();
    }
    catch (Exception se)
    {
        MessageBox.Show(se.Message);
    }
}
//Read stop word list from text file called stopwords.txt
//public string[] StopWords()
//{
//    string[] stopText = File.ReadAllLines("stopwords.txt");
//    return stopText;
//}

/// <summary>
/// Reads a PDF page by page, keeps the noun tokens that are not stop words, and
/// fills the word statistics held in this object's fields: per-page term
/// frequency, corpus frequency, per-sentence frequency and document (page) frequency.
/// </summary>
/// <param name="pdfpath">Path of the PDF file handed to <c>PdfReader</c>.</param>
/// <remarks>
/// Any exception is caught and its message is shown in a message box; nothing is rethrown.
/// NOTE(review): the result fields (wordList, sentenceList, UniqueWordsinCorpus,
/// UniqueWordList, ...) are appended to and never reset here, so this method looks
/// like it is meant to run once per instance - confirm against the callers.
/// </remarks>
public void ReadPdf(string pdfpath)
{
    try
    {
        PdfReader pdfr = new PdfReader(pdfpath);
        try
        {
            // Set-based lookup replaces a linear scan of the stop-word array for every
            // word; the default string comparer is ordinal, the same as the == operator.
            HashSet<string> stopWordSet = new HashSet<string>(_stopWords);

            // NOTE(review): the page count comes from DocumentInfo rather than from the
            // reader itself; assumes it was populated for this same file - confirm.
            int tp = DocumentInfo.TotalPages;

            //loop to read pdf page by page
            for (int page = 1; page <= tp; page++)
            {
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                string currentText = PdfTextExtractor.GetTextFromPage(pdfr, page, strategy);

                // Round trip through the system default encoding.
                // NOTE(review): characters Encoding.Default cannot represent will not
                // survive this conversion - confirm that this is intended.
                currentText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));

                POSTagger.mModelPath = "Models\\";
                string[] tSplittedWords = GetWords(currentText);
                string[] sentences = POSTagger.SplitSentences(currentText);

                // Sentence numbers continue across pages via currentcountofsentences.
                int sc = sentences.Length;
                for (int i = 0; i < sc; i++)
                {
                    Sentences sentence = new Sentences();
                    sentence.SentenceNumber = i + 1 + currentcountofsentences;
                    sentence.SentenceString = sentences[i].ToLower();
                    sentenceList.Add(sentence);
                }
                currentcountofsentences += sc;

                string[] POS = POSTagger.PosTagTokens(tSplittedWords);
                splittedWords.Clear();
                nrSplittedWords.Clear();
                countList.Clear();

                // Keep only nouns, lower-cased, that are not stop words.
                //NN   Noun, singular or mass
                //NNS  Noun, plural
                //NNP  Proper noun, singular
                //NNPS Proper noun, plural
                List<string> pageWords = new List<string>();
                int poslength = POS.Length;
                for (int i = 0; i < poslength; i++)
                {
                    if (POS[i] == "NN" || POS[i] == "NNS" || POS[i] == "NNP" || POS[i] == "NNPS")
                    {
                        string lowered = tSplittedWords[i].ToLower();
                        tSplittedWords[i] = lowered;
                        if (!stopWordSet.Contains(lowered))
                        {
                            pageWords.Add(lowered);
                        }
                    }
                }
                splittedWords = pageWords;

                // Term frequency in a single pass; distinctWords keeps first-appearance
                // order, which is the order Distinct() would produce.
                Dictionary<string, int> pageCounts = new Dictionary<string, int>();
                List<string> distinctWords = new List<string>();
                foreach (string word in pageWords)
                {
                    int seen;
                    if (pageCounts.TryGetValue(word, out seen))
                    {
                        pageCounts[word] = seen + 1;
                    }
                    else
                    {
                        pageCounts.Add(word, 1);
                        distinctWords.Add(word);
                    }
                }
                nrSplittedWords = distinctWords;

                int nrswcount = distinctWords.Count;
                _eachPageWordCount.Add(nrswcount);
                _tUniqueWordsinPage = nrswcount;

                // One Words entry per distinct word on this page.
                foreach (string word in distinctWords)
                {
                    int termFrequency = pageCounts[word];
                    countList.Add(termFrequency);

                    Words entry = new Words();
                    entry.Word = word;
                    entry.TermFrequency = termFrequency;
                    entry.Pageno = page;
                    wordList.Add(entry);
                }

                UniqueWordsinCorpus.AddRange(distinctWords);
            } //end of page loop
        }
        finally
        {
            // Close the reader even when a page fails, so the file is not left open.
            pdfr.Close();
        }

        //UniqueWordsinCorpus is a list of string of unique words
        UniqueWordsinCorpus = UniqueWordsinCorpus.Distinct().ToList();
        UniqueWordsinCorpus.Sort();

        // Corpus frequency = sum of the term frequencies of a word over all pages.
        Dictionary<string, int> corpusTotals = new Dictionary<string, int>();
        foreach (Words w in wordList)
        {
            int runningTotal;
            corpusTotals.TryGetValue(w.Word, out runningTotal);
            corpusTotals[w.Word] = runningTotal + w.TermFrequency;
        }
        foreach (Words w in wordList)
        {
            w.CorpusFrequency = corpusTotals[w.Word];
        }

        // Tokenise every sentence once (instead of once per word entry) and keep the
        // token counts next to the sentence number.
        List<KeyValuePair<int, Dictionary<string, int>>> sentenceTokenCounts =
            new List<KeyValuePair<int, Dictionary<string, int>>>(sentenceList.Count);
        foreach (Sentences s in sentenceList)
        {
            Dictionary<string, int> tokenCounts = new Dictionary<string, int>();
            foreach (string token in GetWords(s.SentenceString))
            {
                if (token == null)
                {
                    continue; // a null token can never match a word
                }
                int occurrences;
                tokenCounts.TryGetValue(token, out occurrences);
                tokenCounts[token] = occurrences + 1;
            }
            sentenceTokenCounts.Add(new KeyValuePair<int, Dictionary<string, int>>(s.SentenceNumber, tokenCounts));
        }

        // Every word entry gets its own sentence-number -> frequency map; sentences
        // that do not contain the word are recorded with frequency 0.
        foreach (Words w in wordList)
        {
            Dictionary<int, int> perSentence = new Dictionary<int, int>();
            w.SentencenoWithFrequency = perSentence;
            foreach (KeyValuePair<int, Dictionary<string, int>> sentenceEntry in sentenceTokenCounts)
            {
                int sentfreq;
                sentenceEntry.Value.TryGetValue(w.Word, out sentfreq);
                perSentence.Add(sentenceEntry.Key, sentfreq);
            }
        }

        //wordList.Sort(delegate(Words w1, Words w2) { return w1.Word.CompareTo(w2.Word); });
        wordList.Sort((w1, w2) => w1.Word.CompareTo(w2.Word));

        // Bucket the sorted entries by word so each unique term is resolved with one
        // lookup; the order inside a bucket is the order in the sorted wordList.
        Dictionary<string, List<Words>> entriesByWord = new Dictionary<string, List<Words>>();
        foreach (Words w in wordList)
        {
            List<Words> bucket;
            if (!entriesByWord.TryGetValue(w.Word, out bucket))
            {
                bucket = new List<Words>();
                entriesByWord.Add(w.Word, bucket);
            }
            bucket.Add(w);
        }

        //copying words from wordList of Words to uniquewordlist of uniquewords while removing the redundant entry
        foreach (string term in UniqueWordsinCorpus)
        {
            UniqueWords unique = new UniqueWords();
            unique.SentencenoWithFrequency = new Dictionary<int, int>();
            unique.PagenoWithFrequency = new Dictionary<int, int>();

            List<Words> matches;
            if (term != null && entriesByWord.TryGetValue(term, out matches))
            {
                foreach (Words w in matches)
                {
                    if (unique.Term == null)
                    {
                        unique.Term = w.Word;
                    }
                    unique.CorpusFrequency = w.CorpusFrequency;
                    unique.SentencenoWithFrequency = w.SentencenoWithFrequency;
                    unique.PagenoWithFrequency.Add(w.Pageno, w.TermFrequency);
                }
            }
            UniqueWordList.Add(unique);
        }

        //computing document frequency of unique words
        foreach (UniqueWords uw in UniqueWordList)
        {
            uw.DocFrequency = uw.PagenoWithFrequency.Count;
        }

        // Follow-up steps that are currently disabled:
        // matrixpro.GenerateMatrix(UniqueWordList);
        //UserWordsEditor uwe=new UserWordsEditor(UniqueWordList);
        //uwe.Show();
    }
    catch (Exception se)
    {
        MessageBox.Show(se.Message);
    }
}