/// <summary>
/// Opens the Word document at <paramref name="fileName"/> read-only and builds index entries
/// for its words. Every distinct word (trimmed, lower-cased, longer than two characters) that
/// is not in <paramref name="excludedWordList"/> is searched for across the whole document;
/// one <c>TranscriptWord</c> per word is added to <c>WordIndexDictionary</c>, carrying the
/// match count and one <c>Occurrence</c> per distinct consecutive (page, line) position.
/// </summary>
/// <param name="fileName">Path of the document passed to <c>app.Documents.Open</c>.</param>
/// <param name="excludedWordList">Words that must not be indexed; null is treated as empty.</param>
public void processTranscript(string fileName, string[] excludedWordList)
{
    // Hash sets give constant-time membership checks instead of a linear Array.IndexOf per word.
    HashSet<string> excludedWords = new HashSet<string>(excludedWordList ?? new string[0]);
    HashSet<string> processedWords = new HashSet<string>();

    Document document = app.Documents.Open(fileName, ReadOnly: true);
    try
    {
        document.Activate();

        // The document is opened read-only, so the count is read once rather than per iteration.
        int totalWordCount = document.Words.Count;

        // Word collections are 1-based: valid indexes are 1..Count inclusive.
        for (int i = 1; i <= totalWordCount; i++)
        {
            string textOfTheWord = (document.Words[i].Text ?? "").Trim().ToLower();
            if (textOfTheWord.Length <= 2)
            {
                continue;
            }

            // Skip excluded words and words already indexed (Add returns false for a repeat).
            if (excludedWords.Contains(textOfTheWord) || !processedWords.Add(textOfTheWord))
            {
                continue;
            }

            var CustomWord = new TranscriptWord();
            CustomWord.Name = textOfTheWord;

            int wordFoundFrequency = 0;
            int pageNumberOfTheWord = 0;
            int lineNumberOfTheWord = 0;

            // Search the whole document, front to back, for whole-word matches.
            Range fullRange = document.Content;
            fullRange.Find.Forward = true;
            fullRange.Find.Text = textOfTheWord;
            fullRange.Find.Execute(MatchWholeWord: true);

            while (fullRange.Find.Found)
            {
                wordFoundFrequency++;

                // Resolve the line number from the text, trying progressively wider context:
                // the first sentence of the match, then the paragraph containing the match,
                // then the paragraph before it. getLineNumberOfTheWord returning 0 is treated
                // as "no line number found".
                int currentLineNumber = getLineNumberOfTheWord(fullRange.Sentences.First.Text);

                if (currentLineNumber == 0)
                {
                    currentLineNumber = getLineNumberOfTheWord(fullRange.Paragraphs.Last.Range.Text);
                }

                if (currentLineNumber == 0)
                {
                    // Number of paragraphs from the document start through the match, i.e. the
                    // 1-based index of the paragraph containing the match. Only computed when
                    // needed because it is a COM round trip.
                    int totalParagraphsRead = document.Range(0, fullRange.End).Paragraphs.Count;

                    // The first paragraph has no predecessor; index 0 is invalid in Word.
                    if (totalParagraphsRead > 1)
                    {
                        // Assumes the line number is found here; otherwise it stays 0.
                        currentLineNumber = getLineNumberOfTheWord(document.Paragraphs[totalParagraphsRead - 1].Range.Text);
                    }
                }

                int currentPageNumber = fullRange.Information[WdInformation.wdActiveEndPageNumber];

                // Only one Occurrence is recorded when the word repeats on the same page and
                // line as the previously recorded match; the first match is always recorded.
                if (wordFoundFrequency == 1
                    || pageNumberOfTheWord != currentPageNumber
                    || lineNumberOfTheWord != currentLineNumber)
                {
                    pageNumberOfTheWord = currentPageNumber;
                    lineNumberOfTheWord = currentLineNumber;
                    var CustomOccurrence = new Occurrence
                    {
                        CustomPageNumber = pageNumberOfTheWord,
                        CustomLineNumber = lineNumberOfTheWord
                    };
                    CustomWord.PageAndLine.Add(CustomOccurrence);
                }

                // Continue the search after the current match.
                fullRange.Find.Execute(MatchWholeWord: true);
            }

            CustomWord.Frequency = wordFoundFrequency;
            WordIndexDictionary.Add(CustomWord);
        }
    }
    finally
    {
        // Always release the document, even when a COM call above throws.
        document.Close();
    }
}
/// <summary>
/// Opens the Word document at <paramref name="fileName"/> read-only and, for each entry of
/// <paramref name="finalDeDupedWordList"/>, strips one leading and one trailing character that
/// is not a letter, digit, '$' or '#' (typically a quote), then searches the whole document for
/// the cleaned word (case-sensitive, whole word). Words longer than two characters that are
/// neither already processed nor in <paramref name="excludedWordList"/> are added to
/// <c>WordIndexDictionary</c> as a <c>TranscriptWord</c> with their match count and one
/// <c>Occurrence</c> per distinct consecutive (page, line) position.
/// Progress is written to the console; the console title and foreground colour are changed.
/// </summary>
/// <param name="finalDeDupedWordList">Candidate words; read but not reassigned here.</param>
/// <param name="fileName">Path of the document passed to <c>app.Documents.Open</c>.</param>
/// <param name="excludedWordList">Words that must not be indexed; null is treated as empty.</param>
private void searchDocumentAndCreateWordDicionary(ref string[] finalDeDupedWordList, string fileName, string[] excludedWordList)
{
    HashSet<string> excludedWords = new HashSet<string>(excludedWordList ?? new string[0]);

    Document document = app.Documents.Open(fileName, ReadOnly: true);
    try
    {
        document.Activate();
        HashSet<string> processedWordList = new HashSet<string>();

        // Set console window properties
        Console.Title = "- Index Generator Status -";
        Console.ForegroundColor = ConsoleColor.Green;

        for (int i = 0; i < finalDeDupedWordList.Length; i++)
        {
            // A null entry is handled like an empty one and skipped by the length check below.
            string finalSearchWord = (finalDeDupedWordList[i] ?? "").Trim();

            // Words and sentences wrapped in quotes must have the quotes removed to preserve
            // the correct print order (i.e. #'s, $'s, digits and actual words).
            // If the word does not start with an allowed character, drop the first character;
            // it could be an opening double or single quote.
            if (finalSearchWord != "" && !Regex.IsMatch(finalSearchWord, @"^[a-zA-Z0-9\$#]"))
            {
                finalSearchWord = finalSearchWord.Remove(0, 1);
            }

            // If the word does not end with an allowed character, drop the last character;
            // it could be a closing double or single quote.
            if (finalSearchWord != ""
                && !Regex.IsMatch(finalSearchWord.Substring(finalSearchWord.Length - 1, 1), @"[a-zA-Z0-9\$#]"))
            {
                finalSearchWord = finalSearchWord.Remove(finalSearchWord.Length - 1, 1);
            }

            // Clean up any spaces exposed by the stripping above. Strings are immutable, so the
            // trimmed value must be assigned back.
            finalSearchWord = finalSearchWord.Trim();

            if (finalSearchWord.Length <= 2)
            {
                continue;
            }

            // Skip words that were already processed or that are in the excluded list.
            if (processedWordList.Contains(finalSearchWord) || excludedWords.Contains(finalSearchWord))
            {
                continue;
            }

            Console.WriteLine("Scanning transcript- processing word # " + i);

            var CustomWord = new TranscriptWord();
            processedWordList.Add(finalSearchWord);
            CustomWord.Name = finalSearchWord;

            int wordFoundFrequency = 0;
            int pageNumberOfTheWord = 0;
            int lineNumberOfTheWord = 0;

            // Look for the word from the start of the transcript to the end.
            Range searchRange = document.Range(Start: document.Content.Start, End: document.Content.End);
            searchRange.Find.Forward = true;
            searchRange.Find.MatchCase = true;
            searchRange.Find.Text = finalSearchWord;
            currentWord = finalSearchWord;
            searchRange.Find.Execute(MatchWholeWord: true);

            // Invariant for this word, so evaluated once instead of on every match.
            bool searchWordStartsWithDigit = Regex.IsMatch(finalSearchWord, @"^[0-9]");

            while (searchRange.Find.Found)
            {
                Console.WriteLine("Looking for word : " + currentWord);

                // If a sentence starts with a number it is usually a question number. When the
                // search word starts with a digit and the sentence containing the match begins
                // with that exact text, the match is treated as a question number and ignored.
                // The flag is recomputed for every match so no stale value is carried over.
                bool isQuestionNumber = false;
                if (searchWordStartsWithDigit)
                {
                    string textOfTheSearchedRangeSentence = searchRange.Sentences.First.Text;
                    isQuestionNumber = textOfTheSearchedRangeSentence != null
                        && textOfTheSearchedRangeSentence.StartsWith(finalSearchWord, StringComparison.Ordinal);
                }

                if (!isQuestionNumber)
                {
                    // Process all pages, including the cover page.
                    wordFoundFrequency++;
                    int currentPageNumber = searchRange.Information[WdInformation.wdActiveEndPageNumber];
                    int currentLineNumber = searchRange.Information[WdInformation.wdFirstCharacterLineNumber];

                    // Only one Occurrence is recorded when the word repeats on the same page
                    // and line as the previously recorded match; the first counted match is
                    // always recorded.
                    if (wordFoundFrequency == 1
                        || pageNumberOfTheWord != currentPageNumber
                        || lineNumberOfTheWord != currentLineNumber)
                    {
                        pageNumberOfTheWord = currentPageNumber;
                        lineNumberOfTheWord = currentLineNumber;
                        var CustomOccurrence = new Occurrence
                        {
                            CustomPageNumber = pageNumberOfTheWord,
                            CustomLineNumber = lineNumberOfTheWord
                        };
                        CustomWord.PageAndLine.Add(CustomOccurrence);
                    }
                }

                // Continue the search after the current match.
                searchRange.Find.Execute(MatchWholeWord: true);
            }

            CustomWord.Frequency = wordFoundFrequency;
            WordIndexDictionary.Add(CustomWord);
        }
    }
    finally
    {
        // Always release the document, even when a COM call above throws.
        document.Close();
    }
}