/// <summary>
/// Walks the transcript word by word and, for every distinct qualifying word that is found
/// after page 1, searches the rest of the document for it and stores its frequency and the
/// page/line of each occurrence in <c>CustomWordDirectory</c>.
/// </summary>
/// <remarks>
/// A word qualifies when it is longer than two characters and consists only of letters,
/// digits, <c>$</c>, <c>#</c> or <c>*</c>. A number that opens its sentence is treated as a
/// question number and is not indexed. The document is closed even when processing throws.
/// </remarks>
private void processDocument()
{
    Document document = app.Documents.Open(@"C:\User_Pradeep\Transcript3C.doc", ReadOnly: true); // test version
    try
    {
        document.Activate();
        applicationTime.Start(); // Start stopwatch

        HashSet<string> processedWordList = new HashSet<string>();

        // Word collections are 1-based, so the upper bound is inclusive. The document is opened
        // read-only and never edited here, so the count is read once instead of on every pass.
        int totalWords = document.Words.Count;
        for (int i = 1; i <= totalWords; i++)
        {
            Range wordRange = document.Words[i];
            string textOftheWord = wordRange.Text.ToString().Trim().ToLower();

            // Only words longer than two characters made of letters, digits, $, # or * are indexed.
            if (textOftheWord.Length <= 2 || !Regex.IsMatch(textOftheWord, @"^[a-zA-Z0-9\$#*]+$"))
            {
                continue;
            }

            // Constant-time set lookup; the word has already been indexed.
            if (processedWordList.Contains(textOftheWord))
            {
                continue;
            }

            // A number standing at the very beginning of its sentence is a question number: ignore it.
            bool startsWithDigit = Regex.IsMatch(textOftheWord, @"^[0-9]");
            if (startsWithDigit)
            {
                Range sentenceRange = document.Range(Start: wordRange.Start, End: document.Content.End);
                string textOfTheCurrentSentence = sentenceRange.Sentences.First.Text;
                int valueInTheWord;
                bool isQuestionNumber = textOfTheCurrentSentence != null
                    && textOfTheCurrentSentence.StartsWith(textOftheWord, StringComparison.Ordinal)
                    && Int32.TryParse(textOftheWord, out valueInTheWord);
                if (isQuestionNumber)
                {
                    continue;
                }
            }

            // Page # 1 is the cover page; words that sit on it do not start an index entry.
            int pageNumberOfTheCurrentRange = wordRange.Information[WdInformation.wdActiveEndPageNumber];
            if (pageNumberOfTheCurrentRange <= 1)
            {
                continue;
            }

            Console.WriteLine("Scanning transcript- processing word # " + i + " - In page # " + pageNumberOfTheCurrentRange);

            // New word: remember it so later repetitions are skipped, then count its occurrences.
            processedWordList.Add(textOftheWord);
            var customWord = new TranscriptWord();
            customWord.Name = textOftheWord;
            currentWord = textOftheWord;

            // Search from the current word to the end of the document.
            Range searchRange = document.Range(Start: wordRange.Start, End: document.Content.End);
            searchRange.Find.Forward = true;
            searchRange.Find.Text = textOftheWord;
            searchRange.Find.Execute(MatchWholeWord: true);

            int wordFoundFrequency = 0;
            int pageNumberOfTheWord = 0;
            int lineNumberOfTheWord = 0;

            while (searchRange.Find.Found)
            {
                Console.WriteLine("Looking for word : " + currentWord);

                // A hit that opens its sentence and starts with a digit is a question number,
                // not a regular word. Only digit-leading words need the sentence lookup at all.
                bool hitIsQuestionNumber = false;
                if (startsWithDigit)
                {
                    string textOfTheSearchedRangeSentence = searchRange.Sentences.First.Text;
                    hitIsQuestionNumber = textOfTheSearchedRangeSentence != null
                        && textOfTheSearchedRangeSentence.StartsWith(textOftheWord, StringComparison.Ordinal);
                }

                if (!hitIsQuestionNumber)
                {
                    wordFoundFrequency++;
                    int currentPageNumber = searchRange.Information[WdInformation.wdActiveEndPageNumber];
                    int currentLineNumber = searchRange.Information[WdInformation.wdFirstCharacterLineNumber];

                    // Record one occurrence per distinct page/line position: the first hit always
                    // counts, later hits only when they moved to another page or line.
                    bool isNewPosition = wordFoundFrequency == 1
                        || pageNumberOfTheWord != currentPageNumber
                        || lineNumberOfTheWord != currentLineNumber;

                    if (currentPageNumber != 1 && isNewPosition) // Page # 1 is cover page
                    {
                        pageNumberOfTheWord = currentPageNumber;
                        lineNumberOfTheWord = currentLineNumber;
                        customWord.PageAndLine.Add(new Occurrence
                        {
                            CustomPageNumber = pageNumberOfTheWord,
                            CustomLineNumber = lineNumberOfTheWord
                        });
                    }
                }

                searchRange.Find.Execute(MatchWholeWord: true);
            }

            customWord.Frequency = wordFoundFrequency;
            CustomWordDirectory.Add(customWord);
        }
    }
    finally
    {
        // Release the document even if the scan fails part-way through.
        document.Close();
    }
}
/// <summary>
/// Walks the transcript word by word and, for every distinct qualifying word that is found
/// after page 1, searches the rest of the document for it and stores its frequency and the
/// page/line of each occurrence in <c>CustomWordDirectory</c>.
/// </summary>
/// <remarks>
/// A word qualifies when it is longer than three characters, does not contain a run of nine
/// underscores (a blank/dotted line) and is not a plain integer. The document is closed even
/// when processing throws.
/// </remarks>
private void processDocument()
{
    Document document = app.Documents.Open(@"C:\User_Pradeep\Transcript2.doc", ReadOnly: true);
    try
    {
        document.Activate();

        HashSet<string> processedWordList = new HashSet<string>();

        // Word collections are 1-based, so the upper bound is inclusive. The document is opened
        // read-only and never edited here, so the count is read once instead of on every pass.
        int totalWords = document.Words.Count;
        for (int i = 1; i <= totalWords; i++)
        {
            Range wordRange = document.Words[i];
            string textOftheWord = wordRange.Text.ToString().Trim().ToLower();

            // Short words and continuous underscore lines are not indexed. (A lone "-" can never
            // reach this point because of the length requirement, so it needs no separate test.)
            if (textOftheWord.Length <= 3 || textOftheWord.Contains("_________"))
            {
                continue;
            }

            // Constant-time set lookup; the word has already been indexed.
            if (processedWordList.Contains(textOftheWord))
            {
                continue;
            }

            // Ignore words that are entirely a number.
            int valueInTheWord;
            if (Int32.TryParse(textOftheWord, out valueInTheWord))
            {
                continue;
            }

            // Page # 1 is the cover page; words that sit on it do not start an index entry.
            int pageNumberOfTheCurrentRange = wordRange.Information[WdInformation.wdActiveEndPageNumber];
            if (pageNumberOfTheCurrentRange <= 1)
            {
                continue;
            }

            Console.WriteLine("Now processing word # " + i + " In page # " + pageNumberOfTheCurrentRange);

            // New word: remember it so later repetitions are skipped, then count its occurrences.
            processedWordList.Add(textOftheWord);
            var customWord = new TranscriptWord();
            customWord.Name = textOftheWord;
            currentWord = textOftheWord;

            // Search from the current word to the end of the document.
            Range searchRange = document.Range(Start: wordRange.Start, End: document.Content.End);
            searchRange.Find.Forward = true;
            searchRange.Find.Text = textOftheWord;
            searchRange.Find.Execute(MatchWholeWord: true);

            int wordFoundFrequency = 0;
            int pageNumberOfTheWord = 0;
            int lineNumberOfTheWord = 0;

            while (searchRange.Find.Found)
            {
                wordFoundFrequency++;
                Console.WriteLine("Looking for word : " + currentWord);

                int currentPageNumber = searchRange.Information[WdInformation.wdActiveEndPageNumber];
                int currentLineNumber = searchRange.Information[WdInformation.wdFirstCharacterLineNumber];

                // Record one occurrence per distinct page/line position: the first hit always
                // counts, later hits only when they moved to another page or line.
                bool isNewPosition = wordFoundFrequency == 1
                    || pageNumberOfTheWord != currentPageNumber
                    || lineNumberOfTheWord != currentLineNumber;

                if (currentPageNumber != 1 && isNewPosition) // Page # 1 is cover page
                {
                    pageNumberOfTheWord = currentPageNumber;
                    lineNumberOfTheWord = currentLineNumber;
                    customWord.PageAndLine.Add(new Occurrence
                    {
                        CustomPageNumber = pageNumberOfTheWord,
                        CustomLineNumber = lineNumberOfTheWord
                    });
                }

                searchRange.Find.Execute(MatchWholeWord: true);
            }

            customWord.Frequency = wordFoundFrequency;
            CustomWordDirectory.Add(customWord);
        }
    }
    finally
    {
        // Release the document even if the scan fails part-way through.
        document.Close();
    }
}
/// <summary>
/// Looks up every entry of <paramref name="finalDeDupedWordList"/> in the whole transcript
/// (case-sensitive, whole-word) and stores its frequency and the page/line of each occurrence
/// in <c>CustomWordDirectory</c>.
/// </summary>
/// <param name="finalDeDupedWordList">
/// Candidate words. Each entry is trimmed and stripped of one leading and one trailing
/// character that is not a letter, digit, <c>$</c> or <c>#</c> (for example surrounding
/// quotes) before it is searched. The array itself is not modified. Null entries are skipped.
/// </param>
/// <remarks>
/// Only cleaned words longer than two characters are searched, and each distinct cleaned word
/// is searched once. A hit that opens its sentence and starts with a digit is treated as a
/// question number and is not counted. The document is closed even when processing throws.
/// </remarks>
private void processDocument2(ref string[] finalDeDupedWordList)
{
    Document document = app.Documents.Open(@"C:\User_Pradeep\Transcript3C.doc", ReadOnly: true); // test version
    try
    {
        document.Activate();

        HashSet<string> processedWordList = new HashSet<string>();

        for (int i = 0; i < finalDeDupedWordList.Length; i++)
        {
            string rawWord = finalDeDupedWordList[i];
            if (rawWord == null)
            {
                continue;
            }

            string finalSearchWord = rawWord.Trim();

            // Words and sentences inside quotes must lose the quotes to keep the correct print
            // order (#'s, $'s, digits and actual words): drop a leading character that is not one
            // of the allowed characters.
            if (finalSearchWord != "" && !Regex.IsMatch(finalSearchWord, @"^[a-zA-Z0-9\$#]"))
            {
                finalSearchWord = finalSearchWord.Remove(0, 1);
            }

            // Likewise drop a trailing character that is not one of the allowed characters.
            if (finalSearchWord != ""
                && !Regex.IsMatch(finalSearchWord.Substring(finalSearchWord.Length - 1, 1), @"[a-zA-Z0-9\$#]"))
            {
                finalSearchWord = finalSearchWord.Remove(finalSearchWord.Length - 1, 1);
            }

            // Strings are immutable: the trimmed value has to be assigned back, otherwise any
            // whitespace exposed by the stripping above stays in the search text and stored name.
            finalSearchWord = finalSearchWord.Trim();

            if (finalSearchWord.Length <= 2)
            {
                continue;
            }

            // Add returns false when the word is already in the set, i.e. already processed.
            if (!processedWordList.Add(finalSearchWord))
            {
                continue;
            }

            Console.WriteLine("Scanning transcript- processing word # " + i);

            var customWord = new TranscriptWord();
            customWord.Name = finalSearchWord;
            currentWord = finalSearchWord;

            // Look for the word from the start of the transcript to the end.
            Range searchRange = document.Range(Start: document.Content.Start, End: document.Content.End);
            searchRange.Find.Forward = true;
            searchRange.Find.MatchCase = true;
            searchRange.Find.Text = finalSearchWord;
            searchRange.Find.Execute(MatchWholeWord: true);

            // Only a word that starts with a digit can be mistaken for a question number.
            bool startsWithDigit = Regex.IsMatch(finalSearchWord, @"^[0-9]");

            int wordFoundFrequency = 0;
            int pageNumberOfTheWord = 0;
            int lineNumberOfTheWord = 0;

            while (searchRange.Find.Found)
            {
                Console.WriteLine("Looking for word : " + currentWord);

                // If the sentence starts with the searched number it is a question number,
                // not a regular word, so the hit is ignored.
                bool hitIsQuestionNumber = false;
                if (startsWithDigit)
                {
                    string textOfTheSearchedRangeSentence = searchRange.Sentences.First.Text;
                    hitIsQuestionNumber = textOfTheSearchedRangeSentence != null
                        && textOfTheSearchedRangeSentence.StartsWith(finalSearchWord, StringComparison.Ordinal);
                }

                if (!hitIsQuestionNumber)
                {
                    wordFoundFrequency++;
                    int currentPageNumber = searchRange.Information[WdInformation.wdActiveEndPageNumber];
                    int currentLineNumber = searchRange.Information[WdInformation.wdFirstCharacterLineNumber];

                    // Record one occurrence per distinct page/line position: the first hit always
                    // counts, later hits only when they moved to another page or line.
                    bool isNewPosition = wordFoundFrequency == 1
                        || pageNumberOfTheWord != currentPageNumber
                        || lineNumberOfTheWord != currentLineNumber;

                    if (isNewPosition)
                    {
                        pageNumberOfTheWord = currentPageNumber;
                        lineNumberOfTheWord = currentLineNumber;
                        customWord.PageAndLine.Add(new Occurrence
                        {
                            CustomPageNumber = pageNumberOfTheWord,
                            CustomLineNumber = lineNumberOfTheWord
                        });
                    }
                }

                searchRange.Find.Execute(MatchWholeWord: true);
            }

            customWord.Frequency = wordFoundFrequency;
            CustomWordDirectory.Add(customWord);
        }
    }
    finally
    {
        // Release the document even if the scan fails part-way through.
        document.Close();
    }
}