示例#1
0
        /// <summary>
        /// Counts the words in <paramref name="text"/>, strips the words matched by the
        /// non-meaningful-word patterns, and breaks what remains into paragraphs,
        /// sentences and stemmed words.
        /// </summary>
        /// <param name="text">The raw text to process.</param>
        /// <param name="wordCount">
        /// Receives the number of <c>RegWordCount</c> matches found in the original,
        /// unmodified text. It is set to 0 before any processing starts.
        /// </param>
        /// <returns>
        /// The paragraphs that still contain at least one sentence with at least one
        /// retained word. Empty paragraphs and sentences are dropped.
        /// </returns>
        /// <remarks>
        /// Exceptions raised by the underlying helpers propagate to the caller unchanged,
        /// with their original stack trace. The method also reassigns the shared
        /// <c>ps</c> field on every call.
        /// </remarks>
        public static List<Paragraph> ScrapeToParagraphs(string text, out int wordCount)
        {
            wordCount = 0;
            ps        = new string(p);

            // The word count is taken from the untouched input, before any words are stripped.
            wordCount = RegWordCount.Matches(text).Count;

            // Replace paragraph breaks with the marker string so they survive whitespace
            // normalization and can be split on later.
            text = para.Replace(text, ps);
            // Replace the remaining line breaks with a simple space.
            text = crlftab.Replace(text, " ");
            // Normalize right single quotes to a plain apostrophe so that contractions
            // such as "it's" can be found by the patterns below.
            text = rsquote.Replace(text, "'");

            // Strip non-meaningful words: pronouns, helper verbs (forms of "to be"),
            // prepositions, articles (a, an, the) and conjunctions.
            text = non1.Replace(text, "");
            text = non2.Replace(text, "");
            text = non3.Replace(text, "");
            text = non4.Replace(text, "");
            text = non5.Replace(text, "");
            text = non6.Replace(text, "");
            text = non7.Replace(text, "");
            text = non8.Replace(text, "");
            text = non9.Replace(text, "");
            text = non10.Replace(text, "");
            text = non11.Replace(text, "");
            text = non12.Replace(text, "");
            text = non13.Replace(text, "");
            text = non14.Replace(text, "");
            text = non15.Replace(text, "");
            text = non16.Replace(text, "");
            text = non17.Replace(text, "");
            text = non18.Replace(text, "");
            text = non19.Replace(text, "");

            // Collapse large pockets of whitespace into a single space.
            text = white.Replace(text, " ");

            /* Now process the raw paragraphs into dense paragraphs:
             * split each one into sentences, and each sentence into words,
             * stemming every word that is long enough.
             */
            List<Paragraph> paragraphs = new List<Paragraph>();
            Stemmer         stemmer    = new Stemmer();

            foreach (string rawpara in text.Split(p))
            {
                if (rawpara.Trim(trim).Length > 2)                 // ignore empty paragraphs
                {
                    List<Sentence> sentlist = new List<Sentence>();

                    foreach (Match sentenceMatch in sentdiv.Matches(rawpara))
                    {
                        string s = sentenceMatch.Value;
                        if (s.Trim(trim).Length > 2)               // ignore empty sentences
                        {
                            string fxs = ProcessSpecialCase(s);

                            // Add the individual words from this sentence.
                            List<Word> words = new List<Word>();
                            foreach (Match m in WordReg.Matches(fxs))
                            {
                                string word = m.Value.Trim(trim);

                                // One- and two-letter words are kept only when
                                // WordIsUncommon accepts them; longer words are always kept.
                                if (word.Length > 2 || WordIsUncommon(word))
                                {
                                    // Only stem words longer than two characters.
                                    string stem = (word.Length > 2) ? stemmer.Porter.stemTerm(word) : word;
                                    words.Add(new Word {
                                        Text = word, Stem = stem
                                    });
                                }
                            }

                            if (words.Count > 0)                   // only add sentences that have words
                            {
                                sentlist.Add(new Sentence {
                                    Words = words
                                });
                            }
                        }
                    }

                    if (sentlist.Count > 0)                        // only add paragraphs that have sentences
                    {
                        paragraphs.Add(new Paragraph {
                            Sentences = sentlist
                        });
                    }
                }
            }
            return paragraphs;
        }
        /// <summary>
        /// Removes non-meaningful words from <paramref name="text"/> and breaks the
        /// remainder into paragraphs, each made of sentences of stemmed words.
        /// </summary>
        /// <param name="text">The raw text to process.</param>
        /// <param name="wordCount">
        /// Set to 0 on entry, then to the number of <c>RegWordCount</c> matches in the
        /// original text (counted before any words are removed).
        /// </param>
        /// <returns>
        /// One <c>Paragraph</c> per raw paragraph that yields at least one non-empty
        /// sentence. Paragraphs and sentences without retained words are omitted.
        /// </returns>
        /// <remarks>
        /// Failures in the helpers used here are not caught, so they reach the caller
        /// with their original stack trace. Each call also overwrites the shared
        /// <c>ps</c> field.
        /// </remarks>
        public static List<Paragraph> ScrapeToParagraphs(string text, out int wordCount)
        {
            wordCount = 0;
            ps = new string(p);

            // Count words on the untouched input.
            wordCount = RegWordCount.Matches(text).Count;

            text = para.Replace(text, ps); // mark paragraph breaks with the split marker for later splitting
            text = crlftab.Replace(text, " "); // replace remaining line breaks with a simple space
            text = rsquote.Replace(text, "'"); // normalize right single quotes so "it's" and similar can be found

            // Remove pronouns, helper verbs (forms of "to be"), prepositions,
            // articles (a, an, the) and conjunctions.
            text = non1.Replace(text, "");
            text = non2.Replace(text, "");
            text = non3.Replace(text, "");
            text = non4.Replace(text, "");
            text = non5.Replace(text, "");
            text = non6.Replace(text, "");
            text = non7.Replace(text, "");
            text = non8.Replace(text, "");
            text = non9.Replace(text, "");
            text = non10.Replace(text, "");
            text = non11.Replace(text, "");
            text = non12.Replace(text, "");
            text = non13.Replace(text, "");
            text = non14.Replace(text, "");
            text = non15.Replace(text, "");
            text = non16.Replace(text, "");
            text = non17.Replace(text, "");
            text = non18.Replace(text, "");
            text = non19.Replace(text, "");

            // Replace large pockets of whitespace with a single space.
            text = white.Replace(text, " ");

            string[] rawParagraphs = text.Split(p);

            // Turn the raw paragraphs into dense paragraphs: split into sentences,
            // then into words, stemming each word that is long enough.
            List<Paragraph> paragraphs = new List<Paragraph>();
            Stemmer stemmer = new Stemmer();
            foreach (string rawpara in rawParagraphs)
            {
                if (rawpara.Trim(trim).Length <= 2)
                {
                    continue; // ignore empty paragraphs
                }

                List<Sentence> sentlist = new List<Sentence>();
                foreach (Match sentence in sentdiv.Matches(rawpara))
                {
                    string s = sentence.Value;
                    if (s.Trim(trim).Length <= 2)
                    {
                        continue; // ignore empty sentences
                    }

                    string fxs = ProcessSpecialCase(s);

                    // Collect the individual words of this sentence.
                    List<Word> words = new List<Word>();
                    foreach (Match m in WordReg.Matches(fxs))
                    {
                        string word = m.Value.Trim(trim);

                        // Words of one or two characters are dropped unless
                        // WordIsUncommon accepts them.
                        if (word.Length <= 2 && !WordIsUncommon(word))
                        {
                            continue;
                        }

                        // Only stem words longer than two characters.
                        string stem = (word.Length > 2) ? stemmer.Porter.stemTerm(word) : word;
                        words.Add(new Word { Text = word, Stem = stem });
                    }

                    if (words.Count > 0) // only add sentences that have words
                    {
                        sentlist.Add(new Sentence { Words = words });
                    }
                }

                if (sentlist.Count > 0) // only add paragraphs that have sentences
                {
                    paragraphs.Add(new Paragraph { Sentences = sentlist });
                }
            }
            return paragraphs;
        }