// public static ArrayList extractRules(Hashtable _ht_rules, string awebpage, string _r_className) { ArrayList _al = new ArrayList(); awebpage = uppercaseonlytags(awebpage); awebpage = HTML.trim_commenttags(awebpage); awebpage = HTML.trimScript(awebpage); awebpage = HTML.trim_some_cases(awebpage); foreach (DictionaryEntry a_rule in _ht_rules) { rule_information _ri = (rule_information)a_rule.Value; if (_ri._Classname == _r_className) { string a_pattern_for_parent_tag = prepare_a_pattern(_ri._parent_tag); string[] parentcont = HTMLMarkerClass.webfilter.Contents_of_givenLayout_Tags_TESTER(awebpage, a_pattern_for_parent_tag, false); if ((parentcont == null && !(_ri._parent_tag.Contains("<DIV") || _ri._parent_tag.Contains("<div"))) || (parentcont == null && !(_ri._parent_tag.Contains("<TD") || _ri._parent_tag.Contains("<td"))) || (parentcont != null && (_ri._parent_tag.Contains("<tr") || _ri._parent_tag.Contains("<TR")))) { parentcont = new string[1]; parentcont[0] = awebpage; } if (parentcont != null) { foreach (string _str in parentcont) { string a_pattern_for_tag = prepare_a_pattern(_ri._tag); string[] tagcont = HTMLMarkerClass.webfilter.Contents_of_givenLayout_Tags_TESTER(_str, a_pattern_for_tag, false); if (tagcont != null) { foreach (string item in tagcont) { if (item != null) { if (HTML.stripHtml(item).Trim() != "" && _ri._htmlText != item) { if (!same_content(_al, item)) { _al.Add(item); } } } } } } } } } return(_al); }
/// <summary>
/// Removes every entry of <paramref name="_list"/> whose <c>HTML.stripHtml</c> result
/// equals the <c>HTML.stripHtml</c> result of <paramref name="inner_html"/>.
/// </summary>
/// <param name="_list">List of strings to prune in place (entries are cast to <c>string</c>).</param>
/// <param name="inner_html">HTML whose stripped form identifies the entries to remove.</param>
private static void RemovefromList(ref ArrayList _list, string inner_html)
{
    // Nothing to compare against; also avoids stripping inner_html when it is not needed.
    if (_list.Count == 0)
    {
        return;
    }

    // Loop-invariant: strip the reference HTML once instead of once per entry.
    string strippedTarget = HTML.stripHtml(inner_html);

    // Walk backwards so RemoveAt never shifts an entry that has not been inspected yet.
    for (int index = _list.Count - 1; index >= 0; index--)
    {
        if (HTML.stripHtml((string)_list[index]) == strippedTarget)
        {
            _list.RemoveAt(index);
        }
    }
}
/// <summary>
/// Prepares the word and link statistics for a given piece of HTML.
/// </summary>
/// <param name="html_content">HTML markup that is scanned for links.</param>
/// <param name="inner_text">Text stored as the element's bag of words and used for its word count.</param>
/// <returns>
/// A new <c>element</c> with <c>BagofWords</c>, <c>wordCount</c>, <c>LinkCount</c>,
/// <c>wordCountinLink</c>, <c>meanofWordinLinks</c> and <c>meanofWordinLinksAllWords</c> set.
/// </returns>
public element AnalyzeGivenHTML(string html_content, string inner_text)
{
    // Note: scripts are not removed from html_content here (a RemoveScripts pass was disabled).
    element _element = new element();
    _element.BagofWords = inner_text;
    _element.wordCount = HTML.WordsCountGivenText(_element.BagofWords);

    // Group 1 is whatever lies between the first '>' after an "href=" and the next "</a".
    MatchCollection anchorMatches = Regex.Matches(
        html_content,
        "href=.*?>(.*?)</a",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

    // Collect the stripped text of every link, each piece preceded by a space.
    System.Text.StringBuilder linkText = new System.Text.StringBuilder();
    foreach (Match anchor in anchorMatches)
    {
        if (anchor.Value.Length > 0)
        {
            linkText.Append(' ').Append(HTML.stripHtml(anchor.Groups[1].Value));
        }
    }

    // Links are counted from the raw markup: both anchor spellings plus "onclick="
    // (javascript acting as a link).
    int linkCount = webfilter.CountStringOccurrences(html_content, "<a ");
    linkCount += webfilter.CountStringOccurrences(html_content, "<A ");
    linkCount += webfilter.CountStringOccurrences(html_content, "onclick=");

    _element.LinkCount = linkCount;
    _element.wordCountinLink = HTML.WordsCountGivenText(linkText.ToString());

    // Both ratios fall back to 0 when their denominator is 0.
    _element.meanofWordinLinks = _element.LinkCount != 0
        ? (double)_element.wordCountinLink / _element.LinkCount
        : 0;
    _element.meanofWordinLinksAllWords = _element.wordCount != 0
        ? (double)_element.wordCountinLink / _element.wordCount
        : 0;

    return _element;
}
/// <summary>
/// Prepares the "_AE" word and link statistics for a given piece of HTML.
/// </summary>
/// <param name="html_content">HTML markup that is scanned for links.</param>
/// <param name="inner_text">Text stored as the element's "_AE" bag of words and used for its word count.</param>
/// <returns>
/// A new <c>element</c> with <c>BagofWords_AE</c>, <c>wordCount_AE</c>, <c>LinkCount_AE</c>,
/// <c>wordCountinLink_AE</c>, <c>meanofWordinLinks_AE</c> and <c>meanofWordinLinksAllWords_AE</c> set.
/// </returns>
public element AnalyzeGivenHTML_AE(string html_content, string inner_text)
{
    // Note: scripts are not removed from html_content here (a RemoveScripts pass was disabled).
    element _element = new element();
    _element.BagofWords_AE = inner_text;
    _element.wordCount_AE = HTML.WordsCountGivenText(_element.BagofWords_AE);

    // Group 1 is whatever lies between the first '>' after an "href=" and the next "</a".
    MatchCollection anchorMatches = Regex.Matches(
        html_content,
        "href=.*?>(.*?)</a",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

    // Collect the stripped text of every link, each piece preceded by a space.
    System.Text.StringBuilder linkText = new System.Text.StringBuilder();
    foreach (Match anchor in anchorMatches)
    {
        if (anchor.Value.Length > 0)
        {
            linkText.Append(' ').Append(HTML.stripHtml(anchor.Groups[1].Value));
        }
    }

    // Here the link count is the number of regex matches, not a raw substring count.
    _element.LinkCount_AE = anchorMatches.Count;
    _element.wordCountinLink_AE = HTML.WordsCountGivenText(linkText.ToString());

    // Both ratios fall back to 0 when their denominator is 0.
    _element.meanofWordinLinks_AE = _element.LinkCount_AE != 0
        ? (double)_element.wordCountinLink_AE / _element.LinkCount_AE
        : 0;

    // Fix: the zero fallback previously wrote to meanofWordinLinksAllWords (the non-"_AE"
    // field), leaving the "_AE" ratio unassigned by this method when wordCount_AE was 0.
    _element.meanofWordinLinksAllWords_AE = _element.wordCount_AE != 0
        ? (double)_element.wordCountinLink_AE / _element.wordCount_AE
        : 0;

    return _element;
}
/// <summary>
/// Adds a rule to <c>_ht_rules</c> under <paramref name="key"/>, or updates/removes the rule
/// already stored under that key.
/// </summary>
/// <remarks>
/// Nothing happens when the rule's HTML text strips down to an empty string.
/// A key that is not yet present is added as-is. For a key that is already present:
/// if the class names agree, the stored rule's count is incremented and it is either flagged
/// repetitive (stripped text is non-blank) or given the new HTML text (stripped text is blank);
/// if the class names differ, the stored rule is removed when its count is below 3.
/// </remarks>
/// <param name="key">Key identifying the rule in <c>_ht_rules</c>.</param>
/// <param name="_ri">Rule information to add or to merge into the stored rule.</param>
private void add_rule(string key, rule_information _ri)
{
    // Strip once; the result is needed for both the empty check and the blank check.
    string strippedText = HTML.stripHtml(_ri._htmlText);
    if (strippedText == "")
    {
        return;
    }

    if (!_ht_rules.ContainsKey(key))
    {
        // New rule.
        _ht_rules.Add(key, _ri);
        return;
    }

    // A rule already exists for this key.
    rule_information storedRule = (rule_information)_ht_rules[key];

    if (storedRule._Classname != _ri._Classname)
    {
        // Prediction error: a rule seen fewer than 3 times may be a mistake, so delete it.
        if (storedRule._count < 3)
        {
            _ht_rules.Remove(key);
        }
        return;
    }

    // Prediction ok.
    storedRule._count++;
    if (strippedText.Trim() != "")
    {
        // NOTE(review): the original tested whether the stored and incoming _htmlText were
        // equal, but both outcomes set _repetive = true; confirm whether differing text
        // was meant to be handled differently.
        storedRule._repetive = true;
    }
    else
    {
        storedRule._htmlText = _ri._htmlText;
    }

    // Write the modified copy back under the same key.
    _ht_rules[key] = storedRule;
}