/// <summary>
/// Removes non-meaningful words from <paramref name="text"/> and breaks what remains
/// into paragraphs, sentences and stemmed words.
/// </summary>
/// <remarks>
/// Side effect: reassigns the class-level <c>ps</c> member from <c>p</c> on every call.
/// Exceptions raised by any regex, helper or stemming step propagate to the caller
/// with their original stack trace.
/// </remarks>
/// <param name="text">Raw text to process.</param>
/// <param name="wordCount">
/// Receives the number of <c>RegWordCount</c> matches in the original text, counted
/// before any words are removed.
/// </param>
/// <returns>
/// The paragraphs that contain at least one sentence; every sentence contains at
/// least one word.
/// </returns>
public static List<Paragraph> ScrapeToParagraphs(string text, out int wordCount)
{
    wordCount = 0;
    ps = new string(p);

    // Count words first, while the text is still untouched.
    wordCount = RegWordCount.Matches(text).Count;

    // Mark paragraph breaks with the separator held in ps so they survive the
    // whitespace clean-up below and can be split on afterwards.
    text = para.Replace(text, ps);

    // Remaining matches of crlftab (line breaks, per the original comment) become a plain space.
    text = crlftab.Replace(text, " ");

    // Normalise whatever rsquote matches to a plain apostrophe so contractions
    // such as "it's" can be found by the filters below.
    text = rsquote.Replace(text, "'");

    // Strip filler words (pronouns, helper verbs, prepositions, articles,
    // conjunctions). The filters are applied in their original order.
    // NOTE(review): later patterns may rely on earlier removals - confirm before reordering.
    text = non1.Replace(text, "");
    text = non2.Replace(text, "");
    text = non3.Replace(text, "");
    text = non4.Replace(text, "");
    text = non5.Replace(text, "");
    text = non6.Replace(text, "");
    text = non7.Replace(text, "");
    text = non8.Replace(text, "");
    text = non9.Replace(text, "");
    text = non10.Replace(text, "");
    text = non11.Replace(text, "");
    text = non12.Replace(text, "");
    text = non13.Replace(text, "");
    text = non14.Replace(text, "");
    text = non15.Replace(text, "");
    text = non16.Replace(text, "");
    text = non17.Replace(text, "");
    text = non18.Replace(text, "");
    text = non19.Replace(text, "");

    // Collapse the whitespace runs matched by white into a single space.
    text = white.Replace(text, " ");

    string[] rawParagraphs = text.Split(p);

    // Turn each raw paragraph into sentences of stemmed words.
    List<Paragraph> paragraphs = new List<Paragraph>();
    Stemmer stemmer = new Stemmer();

    foreach (string rawpara in rawParagraphs)
    {
        // Ignore paragraphs that are empty or nearly empty once trimmed.
        if (rawpara.Trim(trim).Length <= 2)
        {
            continue;
        }

        List<Sentence> sentlist = new List<Sentence>();

        foreach (Match sentenceMatch in sentdiv.Matches(rawpara))
        {
            string sentence = sentenceMatch.Value;
            if (sentence.Trim(trim).Length <= 2)
            {
                continue;
            }

            // NOTE(review): ProcessSpecialCase is defined elsewhere; its exact rewriting rules are not visible here.
            string fxs = ProcessSpecialCase(sentence);

            List<Word> words = new List<Word>();
            foreach (Match wordMatch in WordReg.Matches(fxs))
            {
                string word = wordMatch.Value.Trim(trim);
                bool longEnough = word.Length > 2;

                // One- and two-letter words are dropped unless flagged as uncommon.
                if (longEnough || WordIsUncommon(word))
                {
                    // Only words longer than two characters are stemmed.
                    string stem = longEnough ? stemmer.Porter.stemTerm(word) : word;
                    words.Add(new Word { Text = word, Stem = stem });
                }
            }

            // Only keep sentences that still contain words.
            if (words.Count > 0)
            {
                sentlist.Add(new Sentence { Words = words });
            }
        }

        // Only keep paragraphs that still contain sentences.
        if (sentlist.Count > 0)
        {
            paragraphs.Add(new Paragraph { Sentences = sentlist });
        }
    }

    return paragraphs;
}
/// <summary>
/// Filters filler words out of the supplied text, then splits the result into
/// paragraphs, each made of sentences, each made of words paired with their stems.
/// </summary>
/// <remarks>
/// The method overwrites the class-level <c>ps</c> member with a string built from
/// <c>p</c>. No exception is caught here: failures in the regex, helper or stemmer
/// calls surface to the caller unchanged, stack trace included.
/// </remarks>
/// <param name="text">The text to analyse.</param>
/// <param name="wordCount">
/// Set to the count of <c>RegWordCount</c> matches in the text as it was passed in,
/// i.e. before filtering.
/// </param>
/// <returns>
/// A list of paragraphs. Paragraphs without sentences and sentences without words
/// are omitted.
/// </returns>
public static List<Paragraph> ScrapeToParagraphs(string text, out int wordCount)
{
    wordCount = 0;
    ps = new string(p);

    MatchCollection countedWords = RegWordCount.Matches(text);
    wordCount = countedWords.Count;

    // Swap paragraph breaks for the ps marker; the text is split on p further down.
    text = para.Replace(text, ps);
    // Whatever else crlftab matches is flattened to a single space.
    text = crlftab.Replace(text, " ");
    // Matches of rsquote become a straight apostrophe (needed to recognise "it's").
    text = rsquote.Replace(text, "'");

    // Remove pronouns, helper verbs, prepositions, articles and conjunctions.
    // Kept as an ordered sequence because the original applies non1..non19 in this order.
    text = non1.Replace(text, "");
    text = non2.Replace(text, "");
    text = non3.Replace(text, "");
    text = non4.Replace(text, "");
    text = non5.Replace(text, "");
    text = non6.Replace(text, "");
    text = non7.Replace(text, "");
    text = non8.Replace(text, "");
    text = non9.Replace(text, "");
    text = non10.Replace(text, "");
    text = non11.Replace(text, "");
    text = non12.Replace(text, "");
    text = non13.Replace(text, "");
    text = non14.Replace(text, "");
    text = non15.Replace(text, "");
    text = non16.Replace(text, "");
    text = non17.Replace(text, "");
    text = non18.Replace(text, "");
    text = non19.Replace(text, "");

    // Squeeze the whitespace matched by white down to one space.
    text = white.Replace(text, " ");

    List<string> rawParagraphs = new List<string>(text.Split(p));

    List<Paragraph> result = new List<Paragraph>();
    Stemmer stemmer = new Stemmer();

    foreach (string rawpara in rawParagraphs)
    {
        if (rawpara.Trim(trim).Length > 2) // skip (near-)empty paragraphs
        {
            List<Sentence> sentences = new List<Sentence>();

            foreach (Match sentenceHit in sentdiv.Matches(rawpara))
            {
                string rawSentence = sentenceHit.Value;

                if (rawSentence.Trim(trim).Length > 2) // skip (near-)empty sentences
                {
                    // presumably rewrites special-case phrases; helper is declared outside this block - TODO confirm
                    string prepared = ProcessSpecialCase(rawSentence);

                    List<Word> terms = new List<Word>();
                    foreach (Match hit in WordReg.Matches(prepared))
                    {
                        string word = hit.Value.Trim(trim);

                        // Words of one or two characters survive only when WordIsUncommon accepts them.
                        if (word.Length > 2 || WordIsUncommon(word))
                        {
                            // Short words are stored as their own stem; longer ones go through the Porter stemmer.
                            string stem = word;
                            if (word.Length > 2)
                            {
                                stem = stemmer.Porter.stemTerm(word);
                            }

                            terms.Add(new Word { Text = word, Stem = stem });
                        }
                    }

                    if (terms.Count > 0) // a sentence needs at least one word
                    {
                        sentences.Add(new Sentence { Words = terms });
                    }
                }
            }

            if (sentences.Count > 0) // a paragraph needs at least one sentence
            {
                result.Add(new Paragraph { Sentences = sentences });
            }
        }
    }

    return result;
}