Esempio n. 1
0
        /// <summary>
        /// Reads a rule file line by line and builds a table of rules keyed by
        /// "tagName, parent_tagName".
        /// </summary>
        /// <param name="filename">Path of the rule file to read.</param>
        /// <returns>
        /// A table mapping each key to its <c>rule_information</c>; the table is
        /// empty when <paramref name="filename"/> does not exist.
        /// </returns>
        public static Hashtable Load_Rules(string filename)
        {
            Hashtable _ht = new Hashtable();

            if (!File.Exists(filename))
            {
                return _ht;
            }

            // The using block closes the reader even when a line fails to parse
            // or Hashtable.Add throws on a duplicate key.
            using (StreamReader file = new StreamReader(filename))
            {
                string line;
                while ((line = file.ReadLine()) != null)
                {
                    // Each attribute value is captured without its angle brackets and re-wrapped here.
                    string tagName        = "<" + xmlfile.findElementName(line, "tagname=\"<(.*?)>\"") + ">";
                    string parent_tagName = "<" + xmlfile.findElementName(line, "parent_tagname=\"<(.*?)>\"") + ">";
                    string html_text      = "<" + xmlfile.findElementName(line, "html_text=\"<(.*?)>\"") + ">";
                    // NOTE(review): the _count attribute is looked up but never used; every
                    // loaded rule is created with a count of 1 below. Confirm whether the
                    // stored count should be restored instead.
                    string count          = "<" + xmlfile.findElementName(line, "_count=\"<(.*?)>\"") + ">";
                    string className      = xmlfile.find_ClassName(line);
                    string key            = tagName + ", " + parent_tagName;

                    // The class called "others" in the file is called "additional" in the rule table.
                    if (className == "others")
                    {
                        className = "additional";
                    }

                    rule_information _ri = new rule_information(className, tagName, parent_tagName, html_text, 1, false);
                    _ht.Add(key, _ri);
                }
            }

            return _ht;
        }
Esempio n. 2
0
        /// <summary>
        /// Reads an XML file through <c>xmlfile</c> and fills <c>_ht_rules</c> with one
        /// rule per entry of <c>xmlfile._xml_list</c>, keyed by "tagName, parent_tagName".
        /// </summary>
        /// <param name="_dir">Value assigned to <c>xmlfile._destdir</c> and prefixed to <paramref name="filename"/>.</param>
        /// <param name="filename">
        /// File name appended directly to <paramref name="_dir"/>; an empty string
        /// selects the parameterless <c>Read_XMLFile()</c> overload instead.
        /// </param>
        public file_Rules(string _dir, string filename)
        {
            xmlfile._destdir = _dir;

            if (filename != "")
            {
                xmlfile.Read_XMLFile(_dir + filename);
            }
            else
            {
                // Per the original note, this overload reads template.xml from _dir.
                xmlfile.Read_XMLFile();
            }

            // The parsed lines are exposed through xmlfile._xml_list; turn each one into a rule.
            _ht_rules = new Hashtable();
            foreach (string xmlLine in xmlfile._xml_list)
            {
                string tag       = "<" + xmlfile.findElementName(xmlLine, "tagname=\"<(.*?)>\"") + ">";
                string parentTag = "<" + xmlfile.findElementName(xmlLine, "parent_tagname=\"<(.*?)>\"") + ">";
                string rawClass  = xmlfile.find_ClassName(xmlLine);

                // The class called "others" in the file is stored as "additional".
                string ruleClass = rawClass == "others" ? "additional" : rawClass;

                rule_information rule = new rule_information(ruleClass, tag, parentTag, "", 100, false);
                _ht_rules.Add(tag + ", " + parentTag, rule);
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Collects the HTML fragments of a page that match the rules of one class.
        /// </summary>
        /// <param name="_ht_rules">Table whose values are <c>rule_information</c> entries.</param>
        /// <param name="awebpage">Page markup; it is normalised by several clean-up helpers before matching.</param>
        /// <param name="_r_className">Only rules whose class name equals this value are applied.</param>
        /// <returns>
        /// The matched fragments, excluding those with no text after HTML stripping,
        /// those equal to the rule's stored HTML text, and those for which
        /// <c>same_content</c> reports a match against the fragments already collected.
        /// </returns>
        public static ArrayList extractRules(Hashtable _ht_rules, string awebpage, string _r_className)
        {
            ArrayList collected = new ArrayList();

            string page = uppercaseonlytags(awebpage);
            page = HTML.trim_commenttags(page);
            page = HTML.trimScript(page);
            page = HTML.trim_some_cases(page);

            foreach (DictionaryEntry entry in _ht_rules)
            {
                rule_information rule = (rule_information)entry.Value;
                if (rule._Classname != _r_className)
                {
                    continue;
                }

                string   parentPattern = prepare_a_pattern(rule._parent_tag);
                string[] parentBlocks  = HTMLMarkerClass.webfilter.Contents_of_givenLayout_Tags_TESTER(page, parentPattern, false);

                // Decide whether to search the whole page instead of the parent blocks.
                // NOTE(review): when no parent block is found, this falls back unless the
                // parent tag mentions BOTH a DIV and a TD; that is what the original
                // condition evaluates to. Confirm that "neither DIV nor TD" was not intended.
                bool searchWholePage;
                if (parentBlocks == null)
                {
                    bool mentionsDiv = rule._parent_tag.Contains("<DIV") || rule._parent_tag.Contains("<div");
                    bool mentionsTd  = rule._parent_tag.Contains("<TD") || rule._parent_tag.Contains("<td");
                    searchWholePage = !mentionsDiv || !mentionsTd;
                }
                else
                {
                    searchWholePage = rule._parent_tag.Contains("<tr") || rule._parent_tag.Contains("<TR");
                }

                if (searchWholePage)
                {
                    parentBlocks = new string[] { page };
                }

                if (parentBlocks == null)
                {
                    continue;
                }

                foreach (string block in parentBlocks)
                {
                    string   tagPattern = prepare_a_pattern(rule._tag);
                    string[] fragments  = HTMLMarkerClass.webfilter.Contents_of_givenLayout_Tags_TESTER(block, tagPattern, false);
                    if (fragments == null)
                    {
                        continue;
                    }

                    foreach (string fragment in fragments)
                    {
                        if (fragment == null)
                        {
                            continue;
                        }

                        bool hasText = HTML.stripHtml(fragment).Trim() != "";
                        if (hasText && rule._htmlText != fragment && !same_content(collected, fragment))
                        {
                            collected.Add(fragment);
                        }
                    }
                }
            }

            return collected;
        }
Esempio n. 4
0
        /// <summary>
        /// Writes every rule of a table to a text file, one rule per line.
        /// </summary>
        /// <param name="_ht">Table whose values are <c>rule_information</c> entries.</param>
        /// <param name="file">Path of the file to create or overwrite.</param>
        public static void Save_Rules(Hashtable _ht, string file)
        {
            // The using block flushes and closes the writer even when a cast or a write
            // throws part-way through, so the file handle is never left open.
            using (StreamWriter sw = File.CreateText(file))
            {
                foreach (DictionaryEntry d in _ht)
                {
                    rule_information _ri = (rule_information)d.Value;

                    // Line layout: the class name is the element name; the tag and parent tag
                    // are written as stored, while the HTML text and the count are wrapped
                    // in an extra pair of angle brackets.
                    sw.WriteLine("<" + _ri._Classname + " tagname=\"" + _ri._tag + "\" parent_tagname=\"" + _ri._parent_tag + "\" html_text=\"<" + _ri._htmlText + ">\" _count=\"<" + _ri._count + ">\"/>");
                }
            }
        }
Esempio n. 5
0
        /// <summary>
        /// Derives rules from a list of page elements and registers them through <c>add_rule</c>.
        /// </summary>
        /// <param name="_list">
        /// List of <c>HTMLMarkerClass.element</c> items; nothing happens when it is null.
        /// </param>
        private void prepareRules(ArrayList _list)
        {
            if (_list == null)
            {
                return;
            }

            // Hand the element list to the decision class before asking it anything.
            HTMLMarkerClass.desicionClass._list = _list;

            for (int index = 0; index < _list.Count; index++)
            {
                HTMLMarkerClass.element current = (HTMLMarkerClass.element)_list[index];
                if (!(current.wordCount_AE > 2))
                {
                    continue;
                }

                foreach (string item_cn in _CN)
                {
                    // The decision class is asked about "others", while the rule itself
                    // keeps the original class name (e.g. "additional").
                    string decisionName = item_cn == "additional" ? "others" : item_cn;
                    if (!HTMLMarkerClass.desicionClass.write_or_not(decisionName, current))
                    {
                        continue;
                    }

                    // An elementlinked_id of -1 means there is no linked element to take a name from.
                    string parentTag = "";
                    if (current.elementlinked_id != -1)
                    {
                        HTMLMarkerClass.element linked = (HTMLMarkerClass.element)_list[current.elementlinked_id];
                        parentTag = linked.elementName;
                    }

                    string           ruleKey = current.elementName + ", " + parentTag;
                    rule_information rule    = new rule_information(item_cn, current.elementName, parentTag, current.innerHTML_AE, 1, false);
                    add_rule(ruleKey, rule);
                }
            }
        }
Esempio n. 6
0
        /// <summary>
        /// Adds a rule to <c>_ht_rules</c>, or updates or removes the rule already stored
        /// under the same key.
        /// </summary>
        /// <param name="key">Lookup key of the rule.</param>
        /// <param name="_ri">Candidate rule; ignored when its HTML text strips to an empty string.</param>
        private void add_rule(string key, rule_information _ri)
        {
            if (HTML.stripHtml(_ri._htmlText) == "")
            {
                return;
            }

            if (!_ht_rules.ContainsKey(key))
            {
                _ht_rules.Add(key, _ri); // first time this key is seen
                return;
            }

            rule_information known = (rule_information)_ht_rules[key];

            if (known._Classname != _ri._Classname)
            {
                // Conflicting class: drop the stored rule while it has been seen fewer than 3 times.
                if (known._count < 3)
                {
                    _ht_rules.Remove(key);
                }
                return;
            }

            // Same class again: count the confirmation.
            known._count++;
            if (HTML.stripHtml(_ri._htmlText).Trim() != "")
            {
                // The rule is flagged as repetitive whether or not the stored text
                // equals the new text.
                known._repetive = true;
            }
            else
            {
                known._htmlText = _ri._htmlText;
            }

            // Written back explicitly so the update is stored even if rule_information is a value type.
            _ht_rules[key] = known;
        }
Esempio n. 7
0
 /// <summary>
 /// For every rule in the table, passes the rule's HTML text and the list that
 /// matches the rule's class name to <c>RemovefromList</c>.
 /// </summary>
 /// <param name="_main_list">List used for rules of class "main".</param>
 /// <param name="_headline_list">List used for rules of class "headline".</param>
 /// <param name="_summary_list">List used for rules of class "summary".</param>
 /// <param name="_additional_list">List used for rules of class "additional".</param>
 /// <param name="_ht">Table whose values are <c>rule_information</c> entries.</param>
 public static void RemoveRepetativeParts(ref ArrayList _main_list, ref ArrayList _headline_list, ref ArrayList _summary_list, ref ArrayList _additional_list, Hashtable _ht)
 {
     foreach (DictionaryEntry entry in _ht)
     {
         rule_information rule = (rule_information)entry.Value;

         // Rules of any other class are skipped.
         switch (rule._Classname)
         {
             case "main":
                 RemovefromList(ref _main_list, rule._htmlText);
                 break;

             case "headline":
                 RemovefromList(ref _headline_list, rule._htmlText);
                 break;

             case "summary":
                 RemovefromList(ref _summary_list, rule._htmlText);
                 break;

             case "additional":
                 RemovefromList(ref _additional_list, rule._htmlText);
                 break;
         }
     }
 }
Esempio n. 8
0
        /// <summary>
        /// Resets the per-class counters <c>_CN_count_correct</c> and <c>_CN_count_false</c>,
        /// then tallies each rule of <paramref name="_ht02"/> against <paramref name="_ht01"/>.
        /// </summary>
        /// <param name="_ht01">Table of rules that the rules of <paramref name="_ht02"/> are compared with.</param>
        /// <param name="_ht02">Table of rules to evaluate.</param>
        /// <param name="_list">
        /// List of <c>HTMLMarkerClass.element</c> items; a rule is tallied only when some
        /// element with more than 2 words has the rule's tag and parent tag.
        /// </param>
        public static void Compare_Two_Ht(Hashtable _ht01, Hashtable _ht02, ArrayList _list)
        {
            for (int slot = 0; slot < _CN_count_correct.Length; slot++)
            {
                _CN_count_correct[slot] = 0;
                _CN_count_false[slot]   = 0;
            }

            foreach (DictionaryEntry evaluatedEntry in _ht02)
            {
                rule_information evaluated      = (rule_information)evaluatedEntry.Value;
                string           evaluatedClass = evaluated._Classname;
                int              classIndex     = id_CN(evaluatedClass);

                // Step 1: stop at the first element in the list that the rule applies to.
                bool appliesToList = false;
                foreach (object rawElement in _list)
                {
                    HTMLMarkerClass.element candidate = (HTMLMarkerClass.element)rawElement;
                    if (candidate.elementName == evaluated._tag &&
                        candidate.parent_elementName == evaluated._parent_tag &&
                        candidate.wordCount_AE > 2)
                    {
                        appliesToList = true;
                        break;
                    }
                }

                if (!appliesToList)
                {
                    continue;
                }

                // Step 2: stop at the first rule in _ht01 with the same tag and parent tag.
                bool counterpartFound = false;
                bool sameClass        = false;
                foreach (DictionaryEntry referenceEntry in _ht01)
                {
                    rule_information reference = (rule_information)referenceEntry.Value;
                    if (reference._tag == evaluated._tag && reference._parent_tag == evaluated._parent_tag)
                    {
                        counterpartFound = true;
                        sameClass        = reference._Classname == evaluatedClass;
                        break;
                    }
                }

                // Step 3: tally; an index of -1 from id_CN is never counted.
                if (classIndex == -1)
                {
                    continue;
                }

                if (counterpartFound && sameClass)
                {
                    _CN_count_correct[classIndex]++;
                }
                else
                {
                    // Either the counterpart has a different class (error 1) or there is
                    // no counterpart at all (error 2).
                    _CN_count_false[classIndex]++;
                }
            }
        }