/// <summary>
/// Prepares word and link statistics for a given HTML fragment.
/// </summary>
/// <param name="html_content">
/// Markup of the fragment. It is scanned for anchor text (<c>href=...&gt;text&lt;/a</c>)
/// and for the literal markers <c>&lt;a </c>, <c>&lt;A </c> and <c>onclick=</c>.
/// </param>
/// <param name="inner_text">
/// Text stored as <c>BagofWords</c> and used for the overall word count
/// (presumably the fragment's visible text; confirm against callers).
/// </param>
/// <returns>
/// A new <c>element</c> with <c>BagofWords</c>, <c>wordCount</c>, <c>LinkCount</c>,
/// <c>wordCountinLink</c>, <c>meanofWordinLinks</c> and <c>meanofWordinLinksAllWords</c> set.
/// </returns>
public element AnalyzeGivenHTML(string html_content, string inner_text)
{
    // NOTE: scripts are not stripped before analysis; a call to
    // RemoveScripts(html_content) was previously commented out at this point.
    element result = new element();
    result.BagofWords = inner_text;
    result.wordCount = HTML.WordsCountGivenText(result.BagofWords);

    // Capture the content between an href attribute's closing '>' and the next "</a".
    // IgnoreCase lets the pattern match HREF / </A; Singleline lets '.' cross line breaks.
    // Multiline and IgnorePatternWhitespace do not affect this pattern (no ^/$ anchors,
    // no whitespace in it) and are kept only to leave the options unchanged.
    string pattern = "href=.*?>(.*?)</a";
    Regex anchorRegex = new Regex(
        pattern,
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);
    MatchCollection anchors = anchorRegex.Matches(html_content);

    // Collect the tag-stripped text of every anchor, each prefixed by a single space.
    // A StringBuilder avoids re-copying the accumulated text on every iteration.
    System.Text.StringBuilder linkText = new System.Text.StringBuilder();
    foreach (Match anchor in anchors)
    {
        linkText.Append(' ').Append(HTML.stripHtml(anchor.Groups[1].Value));
    }

    // Link count is marker based: both casings of "<a " plus "onclick="
    // (JavaScript handlers are treated as links as well).
    int linkCount = webfilter.CountStringOccurrences(html_content, "<a ");
    linkCount += webfilter.CountStringOccurrences(html_content, "<A ");
    linkCount += webfilter.CountStringOccurrences(html_content, "onclick=");

    result.LinkCount = linkCount;
    result.wordCountinLink = HTML.WordsCountGivenText(linkText.ToString());

    // Average number of link words per link; 0 when there are no links.
    if (result.LinkCount != 0)
    {
        result.meanofWordinLinks = (double)result.wordCountinLink / result.LinkCount;
    }
    else
    {
        result.meanofWordinLinks = 0;
    }

    // Share of all words that sit inside links; 0 when there are no words.
    if (result.wordCount != 0)
    {
        result.meanofWordinLinksAllWords = (double)result.wordCountinLink / result.wordCount;
    }
    else
    {
        result.meanofWordinLinksAllWords = 0;
    }

    return result;
}
/// <summary>
/// Prepares word and link statistics for a given HTML fragment, writing them to the
/// <c>_AE</c> members of the returned <c>element</c>.
/// </summary>
/// <param name="html_content">
/// Markup of the fragment. It is scanned for anchor text (<c>href=...&gt;text&lt;/a</c>).
/// </param>
/// <param name="inner_text">
/// Text stored as <c>BagofWords_AE</c> and used for the overall word count
/// (presumably the fragment's visible text; confirm against callers).
/// </param>
/// <returns>
/// A new <c>element</c> with <c>BagofWords_AE</c>, <c>wordCount_AE</c>, <c>LinkCount_AE</c>,
/// <c>wordCountinLink_AE</c>, <c>meanofWordinLinks_AE</c> and
/// <c>meanofWordinLinksAllWords_AE</c> set.
/// </returns>
public element AnalyzeGivenHTML_AE(string html_content, string inner_text)
{
    // NOTE: scripts are not stripped before analysis; a call to
    // RemoveScripts(html_content) was previously commented out at this point.
    element result = new element();
    result.BagofWords_AE = inner_text;
    result.wordCount_AE = HTML.WordsCountGivenText(result.BagofWords_AE);

    // Capture the content between an href attribute's closing '>' and the next "</a".
    // IgnoreCase lets the pattern match HREF / </A; Singleline lets '.' cross line breaks.
    // Multiline and IgnorePatternWhitespace do not affect this pattern (no ^/$ anchors,
    // no whitespace in it) and are kept only to leave the options unchanged.
    string pattern = "href=.*?>(.*?)</a";
    Regex anchorRegex = new Regex(
        pattern,
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);
    MatchCollection anchors = anchorRegex.Matches(html_content);

    // Collect the tag-stripped text of every anchor, each prefixed by a single space.
    // A StringBuilder avoids re-copying the accumulated text on every iteration.
    System.Text.StringBuilder linkText = new System.Text.StringBuilder();
    foreach (Match anchor in anchors)
    {
        linkText.Append(' ').Append(HTML.stripHtml(anchor.Groups[1].Value));
    }

    // Unlike AnalyzeGivenHTML, the link count here is the number of regex matches.
    result.LinkCount_AE = anchors.Count;
    result.wordCountinLink_AE = HTML.WordsCountGivenText(linkText.ToString());

    // Average number of link words per link; 0 when there are no links.
    if (result.LinkCount_AE != 0)
    {
        result.meanofWordinLinks_AE = (double)result.wordCountinLink_AE / result.LinkCount_AE;
    }
    else
    {
        result.meanofWordinLinks_AE = 0;
    }

    // Share of all words that sit inside links; 0 when there are no words.
    if (result.wordCount_AE != 0)
    {
        result.meanofWordinLinksAllWords_AE = (double)result.wordCountinLink_AE / result.wordCount_AE;
    }
    else
    {
        // Fixed: this branch used to reset the non-AE member
        // (meanofWordinLinksAllWords) instead of the _AE one.
        result.meanofWordinLinksAllWords_AE = 0;
    }

    return result;
}