Ejemplo n.º 1
0
        /// <summary>
        /// Walks the transcript document word by word and, for every distinct qualifying word
        /// found after page 1, adds a <c>TranscriptWord</c> (name, frequency and the page/line
        /// of each occurrence) to <c>CustomWordDirectory</c>.
        /// </summary>
        /// <remarks>
        /// A word qualifies when, after trimming and lower-casing, it is longer than two
        /// characters and consists only of ASCII letters, digits, '$', '#' or '*'.
        /// A number that matches the start of its sentence is treated as a question number
        /// and is neither indexed nor counted.
        /// The document is always closed, even when processing throws.
        /// </remarks>
        private void processDocument()
        {
            Document document = app.Documents.Open(@"C:\User_Pradeep\Transcript3C.doc", ReadOnly: true); //test version

            try
            {
                document.Activate();

                applicationTime.Start(); // Start stopwatch

                HashSet<string> processedWordList = new HashSet<string>();

                // The document is opened read-only and never edited here, so the collection
                // and its size are fetched once instead of on every loop iteration.
                var documentWords = document.Words;
                int totalWords = documentWords.Count;

                for (int i = 1; i < totalWords; i++)
                {
                    Range wordRange = documentWords[i];

                    // Invariant lower-casing keeps the ASCII-only filter below culture independent.
                    string textOftheWord = wordRange.Text.Trim().ToLowerInvariant();

                    // Keep only words longer than 2 characters made of letters, digits or $, #, *.
                    if (textOftheWord.Length <= 2 || !Regex.IsMatch(textOftheWord, @"^[a-zA-Z0-9\$#*]+$"))
                    {
                        continue;
                    }

                    // Already indexed by an earlier iteration.
                    if (processedWordList.Contains(textOftheWord))
                    {
                        continue;
                    }

                    bool wordStartsWithDigit = Regex.IsMatch(textOftheWord, @"^[0-9]");

                    // If the word is all numbers and equals the number at the beginning of the
                    // sentence then it is a question number: ignore it.
                    if (wordStartsWithDigit)
                    {
                        Range sentenceRange = document.Range(Start: wordRange.Start, End: document.Content.End);
                        string textOfTheCurrentSentence = sentenceRange.Sentences.First.Text;

                        int valueInTheWord;
                        if (textOfTheCurrentSentence != null
                            && textOfTheCurrentSentence.StartsWith(textOftheWord, StringComparison.Ordinal)
                            && Int32.TryParse(textOftheWord, out valueInTheWord))
                        {
                            continue;
                        }
                    }

                    int pageNumberOfTheCurrentRange = wordRange.Information[WdInformation.wdActiveEndPageNumber];

                    // Page # 1 is the cover page; words seen there are not indexed (and not marked
                    // as processed, so a later occurrence on another page still starts a search).
                    if (pageNumberOfTheCurrentRange <= 1)
                    {
                        continue;
                    }

                    Console.WriteLine("Scanning transcript- processing word # " + i + " -  In page # " + pageNumberOfTheCurrentRange);

                    processedWordList.Add(textOftheWord);

                    var CustomWord = new TranscriptWord();
                    CustomWord.Name = textOftheWord;

                    int wordFoundFrequency = 0;

                    // Search forward from the current word to the end of the document.
                    Range searchRange = document.Range(Start: wordRange.Start, End: document.Content.End);

                    searchRange.Find.Forward = true;
                    searchRange.Find.Text    = textOftheWord;

                    currentWord = textOftheWord;

                    searchRange.Find.Execute(MatchWholeWord: true);

                    // Page/line of the most recently recorded occurrence.
                    int pageNumberOfTheWord = 0;
                    int lineNumberOfTheWord = 0;

                    while (searchRange.Find.Found)
                    {
                        Console.WriteLine("Looking for word : " + currentWord);

                        // A sentence that begins with the searched number is a question number,
                        // not a regular word, so that hit is ignored. Only numeric-looking words
                        // need the sentence lookup at all.
                        bool isQuestionNumber = false;
                        if (wordStartsWithDigit)
                        {
                            string textOfTheSearchedRangeSentence = searchRange.Sentences.First.Text;
                            isQuestionNumber = textOfTheSearchedRangeSentence != null
                                && textOfTheSearchedRangeSentence.StartsWith(textOftheWord, StringComparison.Ordinal);
                        }

                        if (!isQuestionNumber)
                        {
                            wordFoundFrequency++;

                            int currentPageNumber = searchRange.Information[WdInformation.wdActiveEndPageNumber];
                            int currentLineNumber = searchRange.Information[WdInformation.wdFirstCharacterLineNumber];

                            // Record one Occurrence per distinct page/line; consecutive hits on the
                            // same page and line are counted in the frequency only.
                            bool isNewPosition = wordFoundFrequency == 1
                                || pageNumberOfTheWord != currentPageNumber
                                || lineNumberOfTheWord != currentLineNumber;

                            if (currentPageNumber != 1 && isNewPosition) // Page # 1 is cover page
                            {
                                pageNumberOfTheWord = currentPageNumber;
                                lineNumberOfTheWord = currentLineNumber;

                                var CustomOccurrence = new Occurrence {
                                    CustomPageNumber = pageNumberOfTheWord, CustomLineNumber = lineNumberOfTheWord
                                };
                                CustomWord.PageAndLine.Add(CustomOccurrence);
                            }
                        }

                        searchRange.Find.Execute(MatchWholeWord: true);
                    }

                    CustomWord.Frequency = wordFoundFrequency;
                    CustomWordDirectory.Add(CustomWord);
                }
            }
            finally
            {
                // Release the document even if an interop call above throws.
                document.Close();
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Walks the transcript document word by word and, for every distinct qualifying word
        /// found after page 1, adds a <c>TranscriptWord</c> (name, frequency and the page/line
        /// of each occurrence) to <c>CustomWordDirectory</c>.
        /// </summary>
        /// <remarks>
        /// A word qualifies when, after trimming and lower-casing, it is longer than three
        /// characters, does not contain a run of nine underscores (a drawn line) and does not
        /// parse as an integer.
        /// The document is always closed, even when processing throws.
        /// </remarks>
        private void processDocument()
        {
            Document document = app.Documents.Open(@"C:\User_Pradeep\Transcript2.doc", ReadOnly: true);

            try
            {
                document.Activate();

                HashSet<string> processedWordList = new HashSet<string>();

                // The document is opened read-only and never edited here, so the collection
                // and its size are fetched once instead of on every loop iteration.
                var documentWords = document.Words;
                int totalWords = documentWords.Count;

                for (int i = 1; i < totalWords; i++)
                {
                    Range wordRange = documentWords[i];

                    string textOftheWord = wordRange.Text.Trim().ToLowerInvariant();

                    // Skip short words and continuous underscore lines.
                    if (textOftheWord.Length <= 3 || textOftheWord.Contains("_________"))
                    {
                        continue;
                    }

                    // Already indexed by an earlier iteration.
                    if (processedWordList.Contains(textOftheWord))
                    {
                        continue;
                    }

                    // Ignore words that are all numbers.
                    int valueInTheWord;
                    if (Int32.TryParse(textOftheWord, out valueInTheWord))
                    {
                        continue;
                    }

                    int pageNumberOfTheCurrentRange = wordRange.Information[WdInformation.wdActiveEndPageNumber];

                    // Page # 1 is the cover page; words seen there are not indexed (and not marked
                    // as processed, so a later occurrence on another page still starts a search).
                    if (pageNumberOfTheCurrentRange <= 1)
                    {
                        continue;
                    }

                    Console.WriteLine("Now processing word # " + i + "  In page # " + pageNumberOfTheCurrentRange);

                    processedWordList.Add(textOftheWord);

                    var CustomWord = new TranscriptWord();
                    CustomWord.Name = textOftheWord;

                    int wordFoundFrequency = 0;

                    // Search forward from the current word to the end of the document.
                    Range searchRange = document.Range(Start: wordRange.Start, End: document.Content.End);

                    searchRange.Find.Forward = true;
                    searchRange.Find.Text    = textOftheWord;

                    currentWord = textOftheWord;

                    searchRange.Find.Execute(MatchWholeWord: true);

                    // Page/line of the most recently recorded occurrence.
                    int pageNumberOfTheWord = 0;
                    int lineNumberOfTheWord = 0;

                    while (searchRange.Find.Found)
                    {
                        wordFoundFrequency++;

                        Console.WriteLine("Looking for word : " + currentWord);

                        int currentPageNumber = searchRange.Information[WdInformation.wdActiveEndPageNumber];
                        int currentLineNumber = searchRange.Information[WdInformation.wdFirstCharacterLineNumber];

                        // Record one Occurrence per distinct page/line; consecutive hits on the
                        // same page and line are counted in the frequency only.
                        bool isNewPosition = wordFoundFrequency == 1
                            || pageNumberOfTheWord != currentPageNumber
                            || lineNumberOfTheWord != currentLineNumber;

                        if (currentPageNumber != 1 && isNewPosition) // Page # 1 is cover page
                        {
                            pageNumberOfTheWord = currentPageNumber;
                            lineNumberOfTheWord = currentLineNumber;

                            var CustomOccurrence = new Occurrence {
                                CustomPageNumber = pageNumberOfTheWord, CustomLineNumber = lineNumberOfTheWord
                            };
                            CustomWord.PageAndLine.Add(CustomOccurrence);
                        }

                        searchRange.Find.Execute(MatchWholeWord: true);
                    }

                    CustomWord.Frequency = wordFoundFrequency;
                    CustomWordDirectory.Add(CustomWord);
                }
            }
            finally
            {
                // Release the document even if an interop call above throws.
                document.Close();
            }
        }
        /// <summary>
        /// Searches the whole transcript document for each word in
        /// <paramref name="finalDeDupedWordList"/> and adds a <c>TranscriptWord</c> (name,
        /// frequency and the page/line of each occurrence) to <c>CustomWordDirectory</c>.
        /// </summary>
        /// <param name="finalDeDupedWordList">
        /// Candidate words. Each entry is trimmed, has one leading and one trailing character
        /// stripped when that character is not an ASCII letter, digit, '$' or '#' (typically a
        /// quote), and is trimmed again. Entries that end up with two characters or fewer,
        /// null entries and repeats of an already searched word are skipped. The array itself
        /// is only read.
        /// </param>
        /// <remarks>
        /// The search is case-sensitive and whole-word. A hit whose sentence starts with the
        /// searched number is treated as a question number and not counted.
        /// The document is always closed, even when processing throws.
        /// </remarks>
        private void processDocument2(ref string[] finalDeDupedWordList)
        {
            Document document = app.Documents.Open(@"C:\User_Pradeep\Transcript3C.doc", ReadOnly: true); //test version

            try
            {
                document.Activate();

                HashSet<string> processedWordList = new HashSet<string>();

                for (int i = 0; i < finalDeDupedWordList.Length; i++)
                {
                    string rawWord = finalDeDupedWordList[i];
                    if (rawWord == null)
                    {
                        continue;
                    }

                    string finalSearchWord = rawWord.Trim();

                    // Words and sentences within quotes should be identified with the quotes removed
                    // in order to preserve correct print order (i.e. #'s, $'s, digits and actual words).

                    // Drop the first character when it is not one of the allowed characters;
                    // it could be an opening double or single quote.
                    if (finalSearchWord != "" && !Regex.IsMatch(finalSearchWord, @"^[a-zA-Z0-9\$#]"))
                    {
                        finalSearchWord = finalSearchWord.Remove(0, 1);
                    }

                    // Drop the last character when it is not one of the allowed characters;
                    // it could be a closing double or single quote.
                    if (finalSearchWord != ""
                        && !Regex.IsMatch(finalSearchWord.Substring(finalSearchWord.Length - 1, 1), @"[a-zA-Z0-9\$#]"))
                    {
                        finalSearchWord = finalSearchWord.Remove(finalSearchWord.Length - 1, 1);
                    }

                    // Clean up any spaces exposed by the removals above. Strings are immutable,
                    // so the trimmed value has to be assigned back.
                    finalSearchWord = finalSearchWord.Trim();

                    if (finalSearchWord.Length <= 2)
                    {
                        continue;
                    }

                    // Already searched for by an earlier iteration.
                    if (processedWordList.Contains(finalSearchWord))
                    {
                        continue;
                    }

                    Console.WriteLine("Scanning transcript- processing word # " + i);

                    processedWordList.Add(finalSearchWord);

                    var CustomWord = new TranscriptWord();
                    CustomWord.Name = finalSearchWord;

                    int wordFoundFrequency = 0;

                    // Look for the word from the start of the transcript to the end.
                    Range searchRange = document.Range(Start: document.Content.Start, End: document.Content.End);

                    searchRange.Find.Forward   = true;
                    searchRange.Find.MatchCase = true;
                    searchRange.Find.Text      = finalSearchWord;

                    currentWord = finalSearchWord;

                    searchRange.Find.Execute(MatchWholeWord: true);

                    // Page/line of the most recently recorded occurrence.
                    int pageNumberOfTheWord = 0;
                    int lineNumberOfTheWord = 0;

                    // Only words that start with a digit can be question numbers.
                    bool wordStartsWithDigit = Regex.IsMatch(finalSearchWord, @"^[0-9]");

                    while (searchRange.Find.Found)
                    {
                        Console.WriteLine("Looking for word : " + currentWord);

                        // If the sentence starts with the searched number it is a question number,
                        // not a regular word, so that hit is ignored.
                        bool isQuestionNumber = false;
                        if (wordStartsWithDigit)
                        {
                            string textOfTheSearchedRangeSentence = searchRange.Sentences.First.Text;
                            isQuestionNumber = textOfTheSearchedRangeSentence != null
                                && textOfTheSearchedRangeSentence.StartsWith(finalSearchWord, StringComparison.Ordinal);
                        }

                        if (!isQuestionNumber)
                        {
                            wordFoundFrequency++;

                            int currentPageNumber = searchRange.Information[WdInformation.wdActiveEndPageNumber];
                            int currentLineNumber = searchRange.Information[WdInformation.wdFirstCharacterLineNumber];

                            // Record one Occurrence per distinct page/line; consecutive hits on the
                            // same page and line are counted in the frequency only.
                            bool isNewPosition = wordFoundFrequency == 1
                                || pageNumberOfTheWord != currentPageNumber
                                || lineNumberOfTheWord != currentLineNumber;

                            if (isNewPosition)
                            {
                                pageNumberOfTheWord = currentPageNumber;
                                lineNumberOfTheWord = currentLineNumber;

                                var CustomOccurrence = new Occurrence {
                                    CustomPageNumber = pageNumberOfTheWord, CustomLineNumber = lineNumberOfTheWord
                                };
                                CustomWord.PageAndLine.Add(CustomOccurrence);
                            }
                        }

                        searchRange.Find.Execute(MatchWholeWord: true);
                    }

                    CustomWord.Frequency = wordFoundFrequency;
                    CustomWordDirectory.Add(CustomWord);
                }
            }
            finally
            {
                // Release the document even if an interop call above throws.
                document.Close();
            }
        }