Example #1
0
        /// <summary>
        /// Strips noise words from <paramref name="text"/> and breaks what remains into
        /// paragraphs, sentences and stemmed words.
        /// </summary>
        /// <remarks>
        /// Side effect: the <c>ps</c> member is reassigned from <c>p</c> on every call.
        /// </remarks>
        /// <param name="text">Raw text to process.</param>
        /// <param name="wordCount">
        /// Receives the number of <c>RegWordCount</c> matches in the original text,
        /// counted before any noise removal.
        /// </param>
        /// <returns>
        /// The paragraphs that contain at least one sentence with at least one retained word;
        /// an empty list when nothing qualifies.
        /// </returns>
        internal List<Paragraph> ScrapeToParagraphs(string text, out int wordCount)
        {
            wordCount = 0;
            ps = new string(p);

            // Count words before anything is stripped from the text.
            wordCount = RegWordCount.Matches(text).Count;

            // Swap whatever `para` matches (paragraph breaks, per the name) for the delimiter
            // string built from `p`, so the text can be split into paragraphs further down.
            text = para.Replace(text, ps);
            // Flatten whatever `crlftab` matches (remaining line breaks, per the original comment) into spaces.
            text = crlftab.Replace(text, " ");
            // Normalise whatever `rsquote` matches to a plain apostrophe so forms like "it's" can be found.
            text = rsquote.Replace(text, "'");
            // Drop GUIDs (whatever `isGuid` matches).
            text = isGuid.Replace(text, "");

            // Remove the configured noise words: pronouns, helper verbs, prepositions,
            // articles, conjunctions and the like.
            foreach (Noise noise in serviceKeywords.Keywords())
            {
                if (string.IsNullOrEmpty(noise.Words))
                {
                    continue;
                }

                // The static Regex.Replace goes through the framework's pattern cache. Building a
                // RegexOptions.Compiled instance per entry on every call paid the compilation
                // cost for a pattern that was then used exactly once.
                //
                // NOTE(review): only the start of the match is anchored with \b, so a noise word
                // can also strip the matching prefix of a longer word unless noise.Words carries
                // its own trailing boundary. An earlier attempt appended a non-verbatim "\b"
                // (a backspace character); confirm whether a verbatim @"\b" suffix was intended.
                text = Regex.Replace(text, @"\b(" + noise.Words + ")", "", RegexOptions.IgnoreCase);
            }

            // Collapse whatever `white` matches (runs of whitespace, per the original comment) to a single space.
            text = white.Replace(text, " ");

            string[] rawParagraphs = text.Split(p);

            // Turn each raw paragraph into sentences (as matched by `sentdiv`) and each sentence
            // into words (as matched by `WordReg`), stemming every word longer than two characters.
            List<Paragraph> paragraphs = new List<Paragraph>();
            Stemmer stemmer = new Stemmer();

            foreach (string rawpara in rawParagraphs)
            {
                if (rawpara.Trim(trim).Length <= 2)
                {
                    continue; // ignore empty or near-empty paragraphs
                }

                List<Sentence> sentences = new List<Sentence>();

                foreach (Match sentenceMatch in sentdiv.Matches(rawpara))
                {
                    string sentence = sentenceMatch.Value;
                    if (sentence.Trim(trim).Length <= 2)
                    {
                        continue; // ignore empty or near-empty sentences
                    }

                    // TODO (carried over from the original): look for title-case phrases
                    // and add them to a titles collection?
                    string fxs = ProcessSpecialCase(sentence);

                    List<Word> words = new List<Word>();
                    foreach (Match wordMatch in WordReg.Matches(fxs))
                    {
                        string word = wordMatch.Value.Trim(trim);

                        if (word.Length > 2)
                        {
                            // Only words longer than two characters are stemmed.
                            string stem = stemmer.Porter.stemTerm(word);
                            words.Add(new Word { Text = word, Stem = stem });
                        }
                        else if (WordIsUncommon(word))
                        {
                            // One- and two-letter words are kept only when WordIsUncommon
                            // accepts them; they are stored unstemmed.
                            words.Add(new Word { Text = word, Stem = word });
                        }
                    }

                    if (words.Count > 0) // skip sentences that ended up with no words
                    {
                        sentences.Add(new Sentence { Words = words });
                    }
                }

                if (sentences.Count > 0) // skip paragraphs that ended up with no sentences
                {
                    paragraphs.Add(new Paragraph { Sentences = sentences });
                }
            }

            return paragraphs;
        }