Пример #1
0
        /// <summary>
        /// Builds index entries for one transcript. Opens the Word document read-only, walks its
        /// words, and for every distinct word longer than two characters that is not excluded,
        /// searches the whole document for whole-word matches. Each match contributes to the word's
        /// frequency; a page/line <c>Occurrence</c> is stored whenever the page or line differs
        /// from the previous match. The finished <c>TranscriptWord</c> is added to
        /// <c>WordIndexDictionary</c>.
        /// </summary>
        /// <param name="fileName">Path of the Word document to open.</param>
        /// <param name="excludedWordList">
        /// Words to leave out of the index. Must not be null. Entries are compared case-sensitively
        /// against the trimmed, lower-cased document word, so they need to be lower case to match.
        /// </param>
        public void processTranscript(string fileName, string[] excludedWordList)
        {
            Document document = app.Documents.Open(fileName, ReadOnly: true);

            try
            {
                document.Activate();

                // Read the count once: every evaluation of document.Words.Count is a COM round trip.
                int totalWordCount = document.Words.Count;

                // Words already indexed. A set gives constant-time "seen before?" checks and
                // cannot overflow the way a fixed-size array could.
                var processedWords = new System.Collections.Generic.HashSet<string>();

                // Word collections are 1-based, so valid indexes run from 1 to Count inclusive.
                // NOTE(review): the last item is normally the final paragraph mark, which trims to
                // an empty string and is dropped by the length filter below - confirm.
                for (int i = 1; i <= totalWordCount; i++)
                {
                    string textOftheWord = document.Words[i].Text.Trim().ToLower();

                    if (textOftheWord.Length <= 2)
                    {
                        continue;
                    }

                    // Excluded words never appear in the index.
                    if (Array.IndexOf(excludedWordList, textOftheWord) >= 0)
                    {
                        continue;
                    }

                    // Add returns false when the word was indexed on an earlier iteration.
                    if (!processedWords.Add(textOftheWord))
                    {
                        continue;
                    }

                    var CustomWord = new TranscriptWord();
                    CustomWord.Name = textOftheWord;

                    Range fullRange = document.Content;
                    fullRange.Find.Forward = true;
                    fullRange.Find.Text    = textOftheWord;
                    fullRange.Find.Execute(MatchWholeWord: true);

                    int wordFoundFrequency  = 0;
                    int pageNumberOfTheWord = 0;
                    int lineNumberOfTheWord = 0;

                    while (fullRange.Find.Found)
                    {
                        wordFoundFrequency++;

                        // The line number is looked up in up to three places; the helper's result
                        // of 0 is treated as "no line number found here".
                        // 1) the sentence that contains the match
                        int currentLineNumber = getLineNumberOfTheWord(fullRange.Sentences.First.Text);

                        // 2) the whole paragraph that contains the match
                        if (currentLineNumber == 0)
                        {
                            currentLineNumber = getLineNumberOfTheWord(fullRange.Paragraphs.Last.Range.Text);
                        }

                        // 3) the paragraph just before it
                        if (currentLineNumber == 0)
                        {
                            // Number of paragraphs from the start of the document up to the match,
                            // i.e. the 1-based index of the paragraph that holds the match.
                            int totalParagraphsRead = document.Range(0, fullRange.End).Paragraphs.Count;

                            // A match in the first paragraph has no predecessor; index 0 would be
                            // invalid in the 1-based collection, so the line number stays 0.
                            if (totalParagraphsRead > 1)
                            {
                                currentLineNumber = getLineNumberOfTheWord(document.Paragraphs[totalParagraphsRead - 1].Range.Text);
                            }
                        }

                        int currentPageNumber = fullRange.Information[WdInformation.wdActiveEndPageNumber];

                        // Only one Occurrence is kept for consecutive matches on the same page and
                        // line; the first match always produces one.
                        bool isNewLocation = wordFoundFrequency == 1
                                             || pageNumberOfTheWord != currentPageNumber
                                             || lineNumberOfTheWord != currentLineNumber;

                        if (isNewLocation)
                        {
                            pageNumberOfTheWord = currentPageNumber;
                            lineNumberOfTheWord = currentLineNumber;

                            var CustomOccurrence = new Occurrence {
                                CustomPageNumber = pageNumberOfTheWord, CustomLineNumber = lineNumberOfTheWord
                            };
                            CustomWord.PageAndLine.Add(CustomOccurrence);
                        }

                        // Advance to the next match.
                        fullRange.Find.Execute(MatchWholeWord: true);
                    }

                    CustomWord.Frequency = wordFoundFrequency;
                    WordIndexDictionary.Add(CustomWord);
                }
            }
            finally
            {
                // Close the document even when indexing fails part-way through.
                document.Close();
            }
        }
Пример #2
0
        /// <summary>
        /// Searches a transcript for each candidate word and records where it occurs. Opens the
        /// Word document read-only, cleans each entry of <paramref name="finalDeDupedWordList"/>
        /// (trims it and strips one leading and one trailing character that is not a letter, digit,
        /// '$' or '#'), and for every resulting word longer than two characters that is neither
        /// excluded nor already handled, runs a case-sensitive whole-word search over the document.
        /// Each counted match adds to the word's frequency; a page/line <c>Occurrence</c> is stored
        /// whenever the page or line differs from the previous counted match. The finished
        /// <c>TranscriptWord</c> is added to <c>WordIndexDictionary</c>. Progress is written to the
        /// console.
        /// </summary>
        /// <param name="finalDeDupedWordList">Candidate words; read only, never reassigned here. Null entries are skipped.</param>
        /// <param name="fileName">Path of the Word document to open.</param>
        /// <param name="excludedWordList">Words to leave out of the index (case-sensitive match). Must not be null.</param>
        private void searchDocumentAndCreateWordDicionary(ref string[] finalDeDupedWordList, string fileName, string[] excludedWordList)
        {
            Document document = app.Documents.Open(fileName, ReadOnly: true);

            try
            {
                document.Activate();

                HashSet <string> processedWordList = new HashSet <string>();

                //Set console window properties
                Console.Title           = "- Index Generator Status -";
                Console.ForegroundColor = ConsoleColor.Green;

                for (int i = 0; i < finalDeDupedWordList.Length; i++)
                {
                    string rawWord = finalDeDupedWordList[i];

                    if (rawWord == null)
                    {
                        continue;
                    }

                    string finalSearchWord = rawWord.Trim();

                    // Words and sentences wrapped in quotes must have the quotes removed so that the
                    // index sorts correctly (#'s, $'s, digits and actual words).
                    // Drop the first character when it is not one of the allowed characters.
                    if (finalSearchWord != "" && !Regex.IsMatch(finalSearchWord, @"^[a-zA-Z0-9\$#]"))
                    {
                        finalSearchWord = finalSearchWord.Remove(0, 1);
                    }

                    // Drop the last character when it is not one of the allowed characters.
                    if (finalSearchWord != ""
                        && !Regex.IsMatch(finalSearchWord.Substring(finalSearchWord.Length - 1, 1), @"[a-zA-Z0-9\$#]"))
                    {
                        finalSearchWord = finalSearchWord.Remove(finalSearchWord.Length - 1, 1);
                    }

                    // Strings are immutable: Trim() returns the cleaned value, so it must be
                    // assigned back for the clean-up to take effect.
                    finalSearchWord = finalSearchWord.Trim();

                    if (finalSearchWord.Length <= 2)
                    {
                        continue;
                    }

                    // Excluded words never appear in the index.
                    if (Array.IndexOf(excludedWordList, finalSearchWord) >= 0)
                    {
                        continue;
                    }

                    // Add returns false when this word was already handled on an earlier iteration.
                    if (!processedWordList.Add(finalSearchWord))
                    {
                        continue;
                    }

                    Console.WriteLine("Scanning transcript- processing word # " + i);

                    var CustomWord = new TranscriptWord();
                    CustomWord.Name = finalSearchWord;

                    //Look for the word from start of the transcript to end
                    Range searchRange = document.Range(Start: document.Content.Start, End: document.Content.End);

                    searchRange.Find.Forward   = true;
                    searchRange.Find.MatchCase = true;
                    searchRange.Find.Text      = finalSearchWord;

                    currentWord = finalSearchWord;

                    searchRange.Find.Execute(MatchWholeWord: true);

                    int wordFoundFrequency  = 0;
                    int pageNumberOfTheWord = 0;
                    int lineNumberOfTheWord = 0;

                    // The search word is fixed for the whole inner loop, so test it once.
                    bool searchWordStartsWithDigit = Regex.IsMatch(finalSearchWord, @"^[0-9]");

                    while (searchRange.Find.Found)
                    {
                        Console.WriteLine("Looking for word : " + currentWord);

                        // A sentence that begins with the searched number is treated as a question
                        // number rather than a regular word, so that match is ignored. The flag is
                        // recomputed for every match so a previous match cannot leak into this one.
                        bool isQuestionNumber = false;

                        if (searchWordStartsWithDigit)
                        {
                            string textOfTheSearchedRangeSentence = searchRange.Sentences.First.Text;

                            isQuestionNumber = textOfTheSearchedRangeSentence != null
                                               && textOfTheSearchedRangeSentence.StartsWith(finalSearchWord, StringComparison.Ordinal);
                        }

                        if (!isQuestionNumber)
                        {
                            //Process all pages, including the cover page
                            wordFoundFrequency++;

                            int currentPageNumber = searchRange.Information[WdInformation.wdActiveEndPageNumber];
                            int currentLineNumber = searchRange.Information[WdInformation.wdFirstCharacterLineNumber];

                            // Only one Occurrence is kept for consecutive matches on the same page
                            // and line; the first counted match always produces one.
                            bool isNewLocation = wordFoundFrequency == 1
                                                 || pageNumberOfTheWord != currentPageNumber
                                                 || lineNumberOfTheWord != currentLineNumber;

                            if (isNewLocation)
                            {
                                pageNumberOfTheWord = currentPageNumber;
                                lineNumberOfTheWord = currentLineNumber;

                                var CustomOccurrence = new Occurrence {
                                    CustomPageNumber = pageNumberOfTheWord, CustomLineNumber = lineNumberOfTheWord
                                };
                                CustomWord.PageAndLine.Add(CustomOccurrence);
                            }
                        }

                        // Advance to the next match.
                        searchRange.Find.Execute(MatchWholeWord: true);
                    }

                    CustomWord.Frequency = wordFoundFrequency;
                    WordIndexDictionary.Add(CustomWord);
                }
            }
            finally
            {
                // Close the document even when the search fails part-way through.
                document.Close();
            }
        }