Ejemplo n.º 1
0
        //
        /// <summary>
        /// Applies every rule of one class to a web page and collects the distinct,
        /// non-empty fragments those rules match.
        /// </summary>
        /// <param name="_ht_rules">Rule table; each entry value is cast to <c>rule_information</c>.</param>
        /// <param name="awebpage">
        /// Page markup. Before matching it is passed through <c>uppercaseonlytags</c>,
        /// <c>HTML.trim_commenttags</c>, <c>HTML.trimScript</c> and <c>HTML.trim_some_cases</c>.
        /// </param>
        /// <param name="_r_className">Only rules whose <c>_Classname</c> equals this value are applied.</param>
        /// <returns>
        /// The matched fragments. A fragment is skipped when its tag-stripped text is blank,
        /// when it equals the rule's own <c>_htmlText</c>, or when <c>same_content</c> reports
        /// it for the fragments collected so far.
        /// </returns>
        public static ArrayList extractRules(Hashtable _ht_rules, string awebpage, string _r_className)
        {
            ArrayList matches = new ArrayList();

            awebpage = uppercaseonlytags(awebpage);
            awebpage = HTML.trim_commenttags(awebpage);
            awebpage = HTML.trimScript(awebpage);
            awebpage = HTML.trim_some_cases(awebpage);

            foreach (DictionaryEntry entry in _ht_rules)
            {
                rule_information rule = (rule_information)entry.Value;
                if (rule._Classname != _r_className)
                {
                    continue;
                }

                string   parentPattern  = prepare_a_pattern(rule._parent_tag);
                string[] parentSections = HTMLMarkerClass.webfilter.Contents_of_givenLayout_Tags_TESTER(awebpage, parentPattern, false);

                // Decide whether the whole page should stand in for the parent sections.
                bool searchWholePage;
                if (parentSections == null)
                {
                    bool parentIsDiv = rule._parent_tag.Contains("<DIV") || rule._parent_tag.Contains("<div");
                    bool parentIsTd  = rule._parent_tag.Contains("<TD") || rule._parent_tag.Contains("<td");

                    // NOTE(review): this is true unless the parent tag mentions BOTH a div and
                    // a td, so in practice a missing parent almost always falls back to the
                    // whole page. Possibly "&&" was intended; behaviour kept as found.
                    searchWholePage = !parentIsDiv || !parentIsTd;
                }
                else
                {
                    // Table-row parents are always searched page-wide, even when they matched.
                    searchWholePage = rule._parent_tag.Contains("<tr") || rule._parent_tag.Contains("<TR");
                }

                if (searchWholePage)
                {
                    parentSections = new string[] { awebpage };
                }

                if (parentSections == null)
                {
                    continue;
                }

                foreach (string section in parentSections)
                {
                    string   tagPattern = prepare_a_pattern(rule._tag);
                    string[] candidates = HTMLMarkerClass.webfilter.Contents_of_givenLayout_Tags_TESTER(section, tagPattern, false);

                    if (candidates == null)
                    {
                        continue;
                    }

                    foreach (string candidate in candidates)
                    {
                        if (candidate == null)
                        {
                            continue;
                        }

                        bool hasVisibleText = HTML.stripHtml(candidate).Trim() != "";
                        if (hasVisibleText && rule._htmlText != candidate && !same_content(matches, candidate))
                        {
                            matches.Add(candidate);
                        }
                    }
                }
            }

            return matches;
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Removes every entry of the list whose tag-stripped text equals the
 /// tag-stripped text of <paramref name="inner_html"/>.
 /// </summary>
 /// <param name="_list">List whose entries are cast to <c>string</c>; it is modified in place.</param>
 /// <param name="inner_html">HTML fragment whose stripped text identifies the entries to remove.</param>
 private static void RemovefromList(ref ArrayList _list, string inner_html)
 {
     // With no entries there is nothing to compare, so stripHtml is not called at all.
     if (_list.Count == 0)
     {
         return;
     }

     // The comparison target is the same for every entry, so strip it once.
     string targetText = HTML.stripHtml(inner_html);

     // Walk backwards so RemoveAt never shifts an index that is still to be visited.
     for (int i = _list.Count - 1; i >= 0; i--)
     {
         if (HTML.stripHtml((string)_list[i]) == targetText)
         {
             _list.RemoveAt(i);
         }
     }
 }
Ejemplo n.º 3
0
        // Prepare information for a given HTML fragment.
        /// <summary>
        /// Builds an <c>element</c> carrying word and link statistics for an HTML fragment.
        /// </summary>
        /// <param name="html_content">Markup scanned for anchor text and link markers.</param>
        /// <param name="inner_text">Text stored as the element's <c>BagofWords</c> and counted for <c>wordCount</c>.</param>
        /// <returns>
        /// A new element with <c>BagofWords</c>, <c>wordCount</c>, <c>LinkCount</c>,
        /// <c>wordCountinLink</c>, <c>meanofWordinLinks</c> and
        /// <c>meanofWordinLinksAllWords</c> set; both ratios are 0 when their divisor is 0.
        /// </returns>
        public element AnalyzeGivenHTML(string html_content, string inner_text)
        {
            element _element = new element();

            _element.BagofWords = inner_text;
            _element.wordCount  = HTML.WordsCountGivenText(_element.BagofWords);

            // Group 1 captures whatever sits between the end of an anchor's opening tag and "</a".
            string pattern = "href=.*?>(.*?)</a";
            Regex  exp     = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

            // Gather the stripped text of every anchor, each preceded by a space.
            // Every successful match contains at least the literal "href=", so no
            // empty-match check is needed.
            System.Text.StringBuilder linkText = new System.Text.StringBuilder();
            foreach (Match match in exp.Matches(html_content))
            {
                linkText.Append(" ").Append(HTML.stripHtml(match.Groups[1].Value));
            }

            // Links are counted by marker occurrences rather than regex matches:
            // both spellings of the anchor tag, plus onclick handlers (JavaScript links).
            int count_link = webfilter.CountStringOccurrences(html_content, "<a ");
            count_link = count_link + webfilter.CountStringOccurrences(html_content, "<A ");
            count_link = count_link + webfilter.CountStringOccurrences(html_content, "onclick=");

            _element.LinkCount       = count_link;
            _element.wordCountinLink = HTML.WordsCountGivenText(linkText.ToString());

            if (_element.LinkCount != 0)
            {
                _element.meanofWordinLinks = (double)_element.wordCountinLink / _element.LinkCount;
            }
            else
            {
                _element.meanofWordinLinks = 0;
            }

            if (_element.wordCount != 0)
            {
                _element.meanofWordinLinksAllWords = (double)_element.wordCountinLink / _element.wordCount;
            }
            else
            {
                _element.meanofWordinLinksAllWords = 0;
            }

            return _element;
        }
Ejemplo n.º 4
0
        // Prepare information for a given HTML fragment (the "_AE" set of fields).
        /// <summary>
        /// Builds an <c>element</c> carrying word and link statistics for an HTML fragment,
        /// stored in the element's <c>_AE</c> fields.
        /// </summary>
        /// <param name="html_content">Markup scanned for anchors.</param>
        /// <param name="inner_text">Text stored as <c>BagofWords_AE</c> and counted for <c>wordCount_AE</c>.</param>
        /// <returns>
        /// A new element with <c>BagofWords_AE</c>, <c>wordCount_AE</c>, <c>LinkCount_AE</c>,
        /// <c>wordCountinLink_AE</c>, <c>meanofWordinLinks_AE</c> and
        /// <c>meanofWordinLinksAllWords_AE</c> set; both ratios are 0 when their divisor is 0.
        /// </returns>
        public element AnalyzeGivenHTML_AE(string html_content, string inner_text)
        {
            element _element = new element();

            _element.BagofWords_AE = inner_text;
            _element.wordCount_AE  = HTML.WordsCountGivenText(_element.BagofWords_AE);

            // Group 1 captures whatever sits between the end of an anchor's opening tag and "</a".
            string pattern = "href=.*?>(.*?)</a";
            Regex  exp     = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

            MatchCollection matchList = exp.Matches(html_content);

            // Gather the stripped text of every anchor, each preceded by a space.
            // Every successful match contains at least the literal "href=", so no
            // empty-match check is needed.
            System.Text.StringBuilder linkText = new System.Text.StringBuilder();
            foreach (Match match in matchList)
            {
                linkText.Append(" ").Append(HTML.stripHtml(match.Groups[1].Value));
            }

            // Here the link count is the number of regex matches.
            _element.LinkCount_AE       = matchList.Count;
            _element.wordCountinLink_AE = HTML.WordsCountGivenText(linkText.ToString());

            if (_element.LinkCount_AE != 0)
            {
                _element.meanofWordinLinks_AE = (double)_element.wordCountinLink_AE / _element.LinkCount_AE;
            }
            else
            {
                _element.meanofWordinLinks_AE = 0;
            }

            if (_element.wordCount_AE != 0)
            {
                _element.meanofWordinLinksAllWords_AE = (double)_element.wordCountinLink_AE / _element.wordCount_AE;
            }
            else
            {
                // Reset the _AE ratio; the non-_AE field was assigned here by mistake.
                _element.meanofWordinLinksAllWords_AE = 0;
            }

            return _element;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Records a rule in <c>_ht_rules</c>: adds it when the key is new, reinforces the
        /// stored rule when the class matches, or drops a weak stored rule when it does not.
        /// </summary>
        /// <param name="key">Key under which the rule is stored in <c>_ht_rules</c>.</param>
        /// <param name="_ri">Incoming rule; ignored when its <c>_htmlText</c> strips to an empty string.</param>
        private void add_rule(string key, rule_information _ri)
        {
            // Strip once; the result is needed for both the emptiness and the blank check.
            string strippedText = HTML.stripHtml(_ri._htmlText);
            if (strippedText == "")
            {
                return;
            }

            if (!_ht_rules.ContainsKey(key))
            {
                _ht_rules.Add(key, _ri); // new rule
                return;
            }

            rule_information stored = (rule_information)_ht_rules[key];

            if (stored._Classname != _ri._Classname)
            {
                // Prediction error: a rule seen fewer than 3 times may be a mistake, so delete it.
                if (stored._count < 3)
                {
                    _ht_rules.Remove(key);
                }

                return;
            }

            // Prediction ok: the same key was seen again with the same class.
            stored._count++;

            if (strippedText.Trim() != "")
            {
                // NOTE(review): the original compared stored._htmlText with _ri._htmlText
                // here, but both outcomes did exactly the same thing, so the comparison
                // had no effect and was removed. Confirm whether a distinction was intended.
                stored._repetive = true;
            }
            else
            {
                // Stripped text is whitespace only: take over the incoming markup.
                stored._htmlText = _ri._htmlText;
            }

            // Write back in every case so the update is kept even if rule_information is a value type.
            _ht_rules[key] = stored;
        }