/// <summary>
/// Opens the transcript document read-only, walks every word in it and, for each distinct
/// qualifying word, records a <c>TranscriptWord</c> (name, frequency and page/line occurrences)
/// in <c>CustomWordDirectory</c>.
/// </summary>
/// <remarks>
/// A word qualifies when its trimmed, lower-cased text is longer than two characters and consists
/// only of letters, digits, '$', '#' or '*'. A word that starts with a digit, parses as an integer
/// and is also the leading text of its sentence is treated as a question number and ignored.
/// Words whose first qualifying appearance is on page 1 (the cover page) are not processed at that
/// position. The document is always closed, even when processing throws.
/// </remarks>
private void processDocument()
{
    Document document = app.Documents.Open(@"C:\User_Pradeep\Transcript3C.doc", ReadOnly: true); // test version
    try
    {
        document.Activate();
        applicationTime.Start(); // Start stopwatch

        HashSet<string> processedWordList = new HashSet<string>();

        // Word collections are 1-based, so the last valid index is Count itself.
        // Count is read once: the document is opened read-only and is not modified here.
        int totalWords = document.Words.Count;

        for (int i = 1; i <= totalWords; i++)
        {
            // Fetch the word range once instead of re-indexing the collection for every property.
            Range wordRange = document.Words[i];
            string textOftheWord = wordRange.Text.ToString().Trim().ToLower();

            // Length must be > 2 and the word may contain letters, digits or $, #, * only.
            if (textOftheWord.Length <= 2 || !Regex.IsMatch(textOftheWord, @"^[a-zA-Z0-9\$#*]+$"))
            {
                continue;
            }

            // Already searched for this word from an earlier position.
            if (processedWordList.Contains(textOftheWord))
            {
                continue;
            }

            bool startsWithDigit = Regex.IsMatch(textOftheWord, @"^[0-9]");

            // If the word is a number and equals the text at the beginning of its sentence,
            // it is a question number: ignore it.
            if (startsWithDigit)
            {
                Range sentenceRange = document.Range(Start: wordRange.Start, End: document.Content.End);
                string textOfTheCurrentSentence = sentenceRange.Sentences.First.Text;
                int valueInTheWord;
                if (textOfTheCurrentSentence != null
                    && textOfTheCurrentSentence.StartsWith(textOftheWord, StringComparison.Ordinal)
                    && Int32.TryParse(textOftheWord, out valueInTheWord))
                {
                    continue;
                }
            }

            int pageNumberOfTheCurrentRange = wordRange.Information[WdInformation.wdActiveEndPageNumber];
            if (pageNumberOfTheCurrentRange <= 1)
            {
                continue;
            }

            Console.WriteLine("Scanning transcript- processing word # " + i + " - In page # " + pageNumberOfTheCurrentRange);

            // Not seen before: remember it and start collecting its occurrences.
            processedWordList.Add(textOftheWord);
            var transcriptWord = new TranscriptWord();
            transcriptWord.Name = textOftheWord;
            currentWord = textOftheWord;

            // Search from the current word to the end of the document.
            Range searchRange = document.Range(Start: wordRange.Start, End: document.Content.End);
            searchRange.Find.Forward = true;
            searchRange.Find.Text = textOftheWord;
            searchRange.Find.Execute(MatchWholeWord: true);

            int wordFoundFrequency = 0;
            int pageNumberOfTheWord = 0;
            int lineNumberOfTheWord = 0;

            while (searchRange.Find.Found)
            {
                Console.WriteLine("Looking for word : " + currentWord);

                // A hit that starts with a digit and is the leading text of its sentence is a
                // question number, not a regular word. Only digit-leading words need the
                // sentence lookup at all.
                bool isQuestionNumber = false;
                if (startsWithDigit)
                {
                    string textOfTheSearchedRangeSentence = searchRange.Sentences.First.Text;
                    isQuestionNumber = textOfTheSearchedRangeSentence != null
                        && textOfTheSearchedRangeSentence.StartsWith(textOftheWord, StringComparison.Ordinal);
                }

                if (!isQuestionNumber)
                {
                    wordFoundFrequency++;
                    int currentPageNumber = searchRange.Information[WdInformation.wdActiveEndPageNumber];
                    int currentLineNumber = searchRange.Information[WdInformation.wdFirstCharacterLineNumber];

                    if (currentPageNumber != 1) // Page # 1 is cover page
                    {
                        // Record an occurrence for the first hit, and afterwards only when the
                        // page/line differs from the previously recorded one.
                        bool isNewLocation = wordFoundFrequency == 1
                            || pageNumberOfTheWord != currentPageNumber
                            || lineNumberOfTheWord != currentLineNumber;

                        if (isNewLocation)
                        {
                            pageNumberOfTheWord = currentPageNumber;
                            lineNumberOfTheWord = currentLineNumber;
                            transcriptWord.PageAndLine.Add(new Occurrence
                            {
                                CustomPageNumber = pageNumberOfTheWord,
                                CustomLineNumber = lineNumberOfTheWord
                            });
                        }
                    }
                }

                searchRange.Find.Execute(MatchWholeWord: true);
            }

            transcriptWord.Frequency = wordFoundFrequency;
            CustomWordDirectory.Add(transcriptWord);
        }
    }
    finally
    {
        // Release the document even if scanning fails part-way through.
        document.Close();
    }
}
/// <summary>
/// Opens the transcript document read-only and, for each distinct cleaned-up entry of
/// <paramref name="finalDeDupedWordList"/>, searches the whole document (case-sensitive, whole word)
/// and records a <c>TranscriptWord</c> (name, frequency and page/line occurrences) in
/// <c>CustomWordDirectory</c>.
/// </summary>
/// <param name="finalDeDupedWordList">
/// Words to look up. The array is only read; it is never reassigned or modified. Null entries are skipped.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="finalDeDupedWordList"/> is null.</exception>
/// <remarks>
/// Each entry is trimmed, then one leading and one trailing character are stripped when they are not
/// a letter, digit, '$' or '#' (for example enclosing quotes), and the result is trimmed again.
/// Entries of two characters or fewer after cleanup are ignored. A hit that starts with a digit and
/// is the leading text of its sentence is treated as a question number and not counted. Hits on every
/// page are counted; no page is excluded. The document is always closed, even when processing throws.
/// </remarks>
private void processDocument2(ref string[] finalDeDupedWordList)
{
    if (finalDeDupedWordList == null)
    {
        throw new ArgumentNullException("finalDeDupedWordList");
    }

    Document document = app.Documents.Open(@"C:\User_Pradeep\Transcript3C.doc", ReadOnly: true); // test version
    try
    {
        document.Activate();

        HashSet<string> processedWordList = new HashSet<string>();

        for (int i = 0; i < finalDeDupedWordList.Length; i++)
        {
            string rawWord = finalDeDupedWordList[i];
            if (rawWord == null)
            {
                continue;
            }

            string finalSearchWord = rawWord.Trim();

            // Words and sentences within quotes should be identified with the quotes removed, to
            // preserve the correct print order (#'s, $'s, digits and actual words).
            // Strip the first character when it is not one of the allowed characters.
            if (finalSearchWord != "" && !Regex.IsMatch(finalSearchWord, @"^[a-zA-Z0-9\$#]"))
            {
                finalSearchWord = finalSearchWord.Remove(0, 1);
            }

            // Strip the last character when it is not one of the allowed characters.
            if (finalSearchWord != ""
                && !Regex.IsMatch(finalSearchWord.Substring(finalSearchWord.Length - 1, 1), @"[a-zA-Z0-9\$#]"))
            {
                finalSearchWord = finalSearchWord.Remove(finalSearchWord.Length - 1, 1);
            }

            // Clean up any spaces exposed by the stripping above. Strings are immutable, so the
            // trimmed value must be assigned back.
            finalSearchWord = finalSearchWord.Trim();

            if (finalSearchWord.Length <= 2)
            {
                continue;
            }

            // Add returns false when the word was already processed.
            if (!processedWordList.Add(finalSearchWord))
            {
                continue;
            }

            Console.WriteLine("Scanning transcript- processing word # " + i);

            var transcriptWord = new TranscriptWord();
            transcriptWord.Name = finalSearchWord;
            currentWord = finalSearchWord;

            // Look for the word from the start of the transcript to the end.
            Range searchRange = document.Range(Start: document.Content.Start, End: document.Content.End);
            searchRange.Find.Forward = true;
            searchRange.Find.MatchCase = true;
            searchRange.Find.Text = finalSearchWord;
            searchRange.Find.Execute(MatchWholeWord: true);

            // Only digit-leading words can be question numbers; decide that once per word.
            bool startsWithDigit = Regex.IsMatch(finalSearchWord, @"^[0-9]");

            int wordFoundFrequency = 0;
            int pageNumberOfTheWord = 0;
            int lineNumberOfTheWord = 0;

            while (searchRange.Find.Found)
            {
                Console.WriteLine("Looking for word : " + currentWord);

                // If the sentence starts with the searched text and that text starts with a digit,
                // it is a question number rather than a regular word: ignore it.
                bool isQuestionNumber = false;
                if (startsWithDigit)
                {
                    string textOfTheSearchedRangeSentence = searchRange.Sentences.First.Text;
                    isQuestionNumber = textOfTheSearchedRangeSentence != null
                        && textOfTheSearchedRangeSentence.StartsWith(finalSearchWord, StringComparison.Ordinal);
                }

                if (!isQuestionNumber)
                {
                    wordFoundFrequency++;
                    int currentPageNumber = searchRange.Information[WdInformation.wdActiveEndPageNumber];
                    int currentLineNumber = searchRange.Information[WdInformation.wdFirstCharacterLineNumber];

                    // Record an occurrence for the first hit, and afterwards only when the
                    // page/line differs from the previously recorded one.
                    bool isNewLocation = wordFoundFrequency == 1
                        || pageNumberOfTheWord != currentPageNumber
                        || lineNumberOfTheWord != currentLineNumber;

                    if (isNewLocation)
                    {
                        pageNumberOfTheWord = currentPageNumber;
                        lineNumberOfTheWord = currentLineNumber;
                        transcriptWord.PageAndLine.Add(new Occurrence
                        {
                            CustomPageNumber = pageNumberOfTheWord,
                            CustomLineNumber = lineNumberOfTheWord
                        });
                    }
                }

                searchRange.Find.Execute(MatchWholeWord: true);
            }

            transcriptWord.Frequency = wordFoundFrequency;
            CustomWordDirectory.Add(transcriptWord);
        }
    }
    finally
    {
        // Release the document even if scanning fails part-way through.
        document.Close();
    }
}