Exemplo n.º 1
0
        /// <summary>
        /// Adds a rule to <c>_ht_rules</c> under <paramref name="key"/>, or updates / removes the rule
        /// that is already stored there.
        /// </summary>
        /// <param name="key">Lookup key of the rule in <c>_ht_rules</c>.</param>
        /// <param name="_ri">Incoming rule; ignored when its HTML text strips down to an empty string.</param>
        private void add_rule(string key, rule_information _ri)
        {
            // Nothing to learn from a rule whose HTML contains no text at all.
            if (HTML.stripHtml(_ri._htmlText) == "")
            {
                return;
            }

            // Unknown key: store the incoming rule as a new entry.
            if (!_ht_rules.ContainsKey(key))
            {
                _ht_rules.Add(key, _ri);
                return;
            }

            rule_information stored = (rule_information)_ht_rules[key];

            if (stored._Classname == _ri._Classname)
            {
                // Prediction confirmed: one more hit for this rule.
                stored._count++;

                if (HTML.stripHtml(_ri._htmlText).Trim() != "")
                {
                    // The rule is flagged as repetitive whether or not the stored HTML text
                    // equals the incoming one (both cases behave the same).
                    stored._repetive = true;
                }
                else
                {
                    // Only whitespace is left after stripping: take over the incoming HTML text.
                    stored._htmlText = _ri._htmlText;
                }

                // Write the updated copy back into the table.
                _ht_rules[key] = stored;
                return;
            }

            // Prediction mismatch: a rule with fewer than 3 hits is treated as a mistake and dropped;
            // a rule with more hits is left untouched.
            if (stored._count < 3)
            {
                _ht_rules.Remove(key);
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Fills the "after extraction" (<c>_AE</c>) text and link statistics of
        /// <paramref name="_element"/> for a given piece of HTML.
        /// </summary>
        /// <param name="html_content">HTML markup that is scanned for anchors (<c>href=... &gt; text &lt;/a</c>).</param>
        /// <param name="inner_text">Plain text stored as the bag of words and used for the word count.</param>
        /// <param name="_element">Element whose <c>_AE</c> fields are overwritten.</param>
        /// <returns>The updated <paramref name="_element"/>.</returns>
        public element AnalyzeGivenHTML_AE(string html_content, string inner_text, ref element _element)
        {
            _element.BagofWords_AE = inner_text;
            _element.wordCount_AE  = HTML.WordsCountGivenText(_element.BagofWords_AE);

            // Group 1 captures everything between the end of the opening anchor tag and "</a".
            string pattern = "href=.*?>(.*?)</a";
            Regex  exp     = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

            MatchCollection matchList = exp.Matches(html_content);

            // Collect the tag-stripped text of every link, separated by spaces, so that the
            // words inside links can be counted in one go.
            string URL_INNER = "";

            foreach (Match match in matchList)
            {
                if (match.Value.Length > 0)
                {
                    URL_INNER = URL_INNER + " " + HTML.stripHtml(match.Groups[1].Value);
                }
            }

            _element.LinkCount_AE       = matchList.Count;
            _element.wordCountinLink_AE = HTML.WordsCountGivenText(URL_INNER);

            // Average number of link words per link (0 when there are no links).
            if (_element.LinkCount_AE != 0)
            {
                _element.meanofWordinLinks_AE = (double)_element.wordCountinLink_AE / _element.LinkCount_AE;
            }
            else
            {
                _element.meanofWordinLinks_AE = 0;
            }

            // Share of link words among all words (0 when there are no words).
            if (_element.wordCount_AE != 0)
            {
                _element.meanofWordinLinksAllWords_AE = (double)_element.wordCountinLink_AE / _element.wordCount_AE;
            }
            else
            {
                // Reset the _AE field here. The previous code zeroed the non-_AE field
                // meanofWordinLinksAllWords, which both destroyed the pre-extraction value and
                // left a stale _AE value on the element passed in by reference.
                _element.meanofWordinLinksAllWords_AE = 0;
            }

            return(_element);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Test helper for the tag-finding operation: grabs the contents of the layout tags that
        /// match <paramref name="pattern"/> and re-wraps each content in its start and end tag.
        /// </summary>
        /// <param name="html_content">HTML that is searched.</param>
        /// <param name="pattern">Tag pattern; every "." is replaced by a space to form the start tag.</param>
        /// <param name="cut_sub_blocks">
        /// When true, each content is passed through <c>HTML.trimDIV</c> and <c>HTML.trimTD</c>
        /// before being re-wrapped.
        /// </param>
        /// <returns>
        /// The re-wrapped contents of the last tag entry enumerated from the tag table (the result
        /// array is overwritten for every entry), or <c>null</c> when the table is empty.
        /// </returns>
        public static string[] Contents_of_givenLayout_Tags_TESTER(string html_content, string pattern, bool cut_sub_blocks)
        {
            Hashtable _tags_in_HTML = filtergivenHTMLtag_TESTER(html_content, pattern);

            string[] _content = null;
            string   s_tag    = pattern.Replace(".", " ");
            string   e_tag    = find_EndTag(s_tag);

            foreach (DictionaryEntry d in _tags_in_HTML)
            {
                string _tag = (string)d.Key;
                int    _cnt = (int)d.Value;

                _content = GrabbingofHTMLTags(html_content, _tag, _cnt);
                for (int i = 0; i < _content.Length; i++)
                {
                    string t_content = _content[i];
                    if (cut_sub_blocks)
                    {
                        t_content = HTML.trimDIV(t_content);
                        t_content = HTML.trimTD(t_content);
                    }

                    // The start and end tags are written back around the content.
                    // (The former running concatenation of all contents and its total length
                    // were never read, so that quadratic string building has been removed.)
                    _content[i] = s_tag + t_content + e_tag;
                }
            }

            return(_content);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Parses the given HTML with MSHTML and builds one <c>element</c> record per DOM element.
        /// </summary>
        /// <remarks>
        /// The method works in three passes:
        /// 1. Every DOM element is turned into an <c>element</c> with its text and link statistics
        ///    and appended to <c>_list</c>; <c>_ht</c> maps the DOM source index to the list index.
        /// 2. For every DOM element the "after extraction" (<c>_AE</c>) statistics are computed from
        ///    the result of <c>ExtractionofSubLayouts</c>, and DIV/TD layouts that are not judged
        ///    irrelevant are marked relevant together with their non-DIV/TD linked children.
        /// 3. Every relevant element gets a predicted class name and is collected in
        ///    <c>_xmllist</c>, merging entries that have identical content.
        /// The fields <c>_ht</c>, <c>_list</c>, <c>_xmllist</c>, <c>savehtmlContent</c> and
        /// <c>resulthmtlContent</c> are overwritten.
        /// </remarks>
        /// <param name="htmlContent">Raw HTML; options and scripts are trimmed from it before parsing.</param>
        /// <returns>The <c>_list</c> field holding one <c>element</c> per DOM element.</returns>
        public ArrayList prepareDOM(string htmlContent)
        {
            htmlContent = HTML.trimOptions(htmlContent);
            htmlContent = HTML.trimScript(htmlContent);

            IHTMLDocument2 htmlDocument = new mshtml.HTMLDocumentClass();

            htmlDocument.write(htmlContent);

            IHTMLElementCollection allElements = htmlDocument.all;

            _ht      = new Hashtable();
            _list    = new ArrayList();
            _xmllist = new ArrayList();

            string _tempinner_text = "";

            if (htmlDocument.body != null)
            {
                if (htmlDocument.body.innerText != null)
                {
                    _tempinner_text = htmlDocument.body.innerText.Replace("\r\n", "");
                }
            }

            // Statistics of the whole document; its word count is the denominator of all densities.
            element _firstelement = AnalyzeGivenHTML(htmlContent, _tempinner_text);
            int     i             = 0;

            // ---- Pass 1: one element record per DOM element --------------------------------
            foreach (IHTMLElement htmlelement in allElements)
            {
                if (htmlelement.outerHTML != null)
                {
                    element _element = new element();
                    _element.id        = i;
                    _element.outerHTML = htmlelement.outerHTML;
                    _element.outerHTML = _element.outerHTML.Replace("\r\n", "");
                    if (htmlelement.innerHTML != null)
                    {
                        _element.innerHTML = htmlelement.innerHTML;
                        _element.innerHTML = _element.innerHTML.Replace("\r\n", "");
                    }
                    else
                    {
                        _element.innerHTML = "";
                    }

                    if (_element.id == 0)
                    {
                        _element.elementlinked_id = -1;//root
                        savehtmlContent           = _element.outerHTML;
                        resulthmtlContent         = _element.outerHTML;
                    }
                    else
                    {
                        _element.elementlinked_id = 0;
                    }

                    if (htmlelement.tagName == "HTML")
                    {//the HTML element sometimes arrives late (not as the first element)...
                        savehtmlContent   = _element.outerHTML;
                        resulthmtlContent = _element.outerHTML;
                    }

                    // The element name is the complete opening tag, e.g. "<DIV id=x>".
                    string _str   = _element.outerHTML;
                    int    _start = _str.IndexOf('<');
                    int    _end   = _str.IndexOf('>');
                    _element.elementName = _str.Substring(_start, _end - _start + 1);

                    //<!--className::(.*?)--> the marker must be the one directly after the opening
                    //tag; a marker belonging to a later (nested) element must not be picked up.
                    _start = _str.IndexOf("<!--className::");
                    _end   = _str.IndexOf("-->");
                    if (_start >= 0)
                    {
                        if (_start == _element.elementName.Length)
                        {
                            // 15 == "<!--className::".Length
                            _start = 15 + _element.elementName.Length;
                            if (_end - _start > 0)
                            {
                                _element.className = _str.Substring(_start, _end - _start);
                            }
                        }
                    }

                    _element.tagName = htmlelement.tagName;
                    if (htmlelement.id != null)
                    {
                        _element.tag_id = 1;
                    }
                    if (htmlelement.className != null)
                    {
                        _element.tag_class = 1;
                    }

                    // 1 when the element has an id or a class attribute, otherwise 0.
                    // The previous code added tag_idORclass to itself instead of tag_class, so the
                    // class attribute never contributed and the clamp below could never trigger.
                    _element.tag_idORclass = _element.tag_id + _element.tag_class;
                    if (_element.tag_idORclass == 2)
                    {
                        _element.tag_idORclass = 1;
                    }

                    string tempinner_text = htmlelement.innerText;
                    if (tempinner_text != null)
                    {
                        tempinner_text = tempinner_text.Replace("\r\n", " ");
                        tempinner_text = tempinner_text.Trim();
                    }
                    else
                    {
                        tempinner_text = "";
                    }

                    element _tempelement = AnalyzeGivenHTML(htmlelement.outerHTML, tempinner_text);
                    _element.BagofWords = _tempelement.BagofWords;
                    _element.wordCount  = _tempelement.wordCount;
                    // A document without words would make this division yield NaN/Infinity;
                    // use 0 instead, like the other ratios computed for an element.
                    _element.DensityinHTML = _firstelement.wordCount != 0
                                             ? (double)_element.wordCount / _firstelement.wordCount
                                             : 0;
                    _element.LinkCount                 = _tempelement.LinkCount;
                    _element.wordCountinLink           = _tempelement.wordCountinLink;
                    _element.meanofWordinLinks         = _tempelement.meanofWordinLinks;
                    _element.meanofWordinLinksAllWords = _tempelement.meanofWordinLinksAllWords;

                    _element.similarity_with_other_web_page = 1;

                    _element.relevant           = false;
                    _element.parent_elementName = "";

                    _list.Add(_element);

                    int key = (int)htmlelement.sourceIndex;//for fast searching
                    _ht.Add(key, i);
                    i++;
                }
            }

            // ---- Pass 2: statistics after sub-layout extraction and relevance marking --------
            foreach (IHTMLElement htmlelement in allElements)
            {
                if (htmlelement.outerHTML != null)
                {
                    // As used here: [0] index into _list, [1] text, [2] outer HTML, [3] inner HTML.
                    string[] _sonuclar      = ExtractionofSubLayouts(htmlelement);
                    string   tempinner_text = _sonuclar[1];
                    string   tempOuterHTML  = _sonuclar[2];
                    string   tempinnerHTML  = _sonuclar[3];
                    string   str_i          = _sonuclar[0];

                    i = Convert.ToInt32(str_i);

                    //After Extraction
                    element _element     = (element)_list[i];
                    element _tempelement = AnalyzeGivenHTML_AE(tempOuterHTML, tempinner_text);

                    _element.outerHTML_AE  = tempOuterHTML;
                    _element.innerHTML_AE  = tempinnerHTML;
                    _element.BagofWords_AE = _tempelement.BagofWords_AE;
                    _element.wordCount_AE  = _tempelement.wordCount_AE;
                    // Guarded against a document without words (see DensityinHTML above).
                    _element.DensityinHTML_AE = _firstelement.wordCount != 0
                                                ? (double)_element.wordCount_AE / _firstelement.wordCount
                                                : 0;
                    _element.LinkCount_AE                 = _tempelement.LinkCount_AE;
                    _element.wordCountinLink_AE           = _tempelement.wordCountinLink_AE;
                    _element.meanofWordinLinks_AE         = _tempelement.meanofWordinLinks_AE;
                    _element.meanofWordinLinksAllWords_AE = _tempelement.meanofWordinLinksAllWords_AE;

                    //dot endofcontent: does the element's text end with a full stop?
                    // NOTE(review): the last character of the untrimmed text is tested, so trailing
                    // whitespace after the dot yields 0 — confirm this is intended.
                    string rawInnerText = htmlelement.innerText;
                    if (rawInnerText != null)
                    {
                        if (rawInnerText.Trim() != "")
                        {
                            if (rawInnerText[rawInnerText.Length - 1] == '.')
                            {
                                _element.dot_endofstence = 1;
                            }
                            else
                            {
                                _element.dot_endofstence = 0;
                            }
                        }
                    }

                    if (_element.wordCount_AE > _element.wordCount)
                    {
                        _element.wordCount_AE = _element.wordCount; //exceptional and rare case caused by problematic scripts...
                    }
                    _list[i] = _element;

                    if (htmlelement.tagName == "DIV" || htmlelement.tagName == "TD")
                    {
                        element _e = (element)_list[i];
                        // NOTE(review): this flag is only written back to _list inside the
                        // "_decision == false" branch below — confirm that is intended.
                        if (_e.elementName.Contains("vAlign=bot"))
                        {
                            _e.relevant = true;
                        }

                        bool _decision = HTMLMarkerClass.desicionClass.determineIrrevelantLayout(_element);

                        if (_decision == false)
                        {
                            _e.relevant = true;
                            _list[i]    = _e;

                            //Update child elements
                            for (int m = 0; m < _list.Count; m++)
                            {
                                element _et = (element)_list[m];
                                if (_et.elementlinked_id == _e.id)
                                {
                                    if (_et.tagName != "DIV")//no decision is made here for DIVs (nor TDs)
                                    {
                                        if (_et.tagName != "TD")
                                        {
                                            _et.relevant           = true;
                                            _et.parent_elementName = _e.elementName;
                                            _list[m] = _et;
                                        }
                                    }
                                }
                            } //for m
                        }     //decision = false
                    }         //if div or td
                }             // if not null
            }                 //for each

            // ---- Pass 3: class prediction and XML list ----------------------------------------
            for (int m = 0; m < _list.Count; m++)
            {
                element _element = (element)_list[m];
                if (_element.relevant == true)
                {
                    if (_element.tagName == "DIV" || _element.tagName == "TD")
                    {
                        bool _mainlayout = HTMLMarkerClass.desicionClass.determineLayout(_element);
                        if (_mainlayout)
                        {
                            _element.predicted_className = "MAIN";
                        }
                        else
                        {
                            _element.predicted_className = HTMLMarkerClass.desicionClass.determineHEADLINE_INFORMATION(_element);
                        }
                    }
                    else
                    {
                        // Only DIV/TD layouts may be "MAIN"; any other tag predicted as such is
                        // downgraded to "IRRELEVANT".
                        _element.predicted_className = HTMLMarkerClass.desicionClass.determineHEADLINE_INFORMATION(_element);
                        if (_element.predicted_className == "MAIN")
                        {
                            _element.predicted_className = "IRRELEVANT";
                        }
                    }

                    // The prediction is only stored for elements that still have text after
                    // illegal XML characters have been removed.
                    if (clear_illegal_characters_for_XML(_element.BagofWords_AE.Trim()).Trim() != "")
                    {
                        _list[m] = _element;

                        xml_elemet _xml = new xml_elemet();
                        _xml.elementName         = _element.elementName;
                        _xml.content             = _element.BagofWords_AE;
                        _xml.predicted_className = _element.predicted_className;
                        _xml.parent_elementName  = _element.parent_elementName;
                        //equal content in _xml_list: append this element's name to the existing entry
                        bool find = false;
                        for (int v = 0; v < _xmllist.Count; v++)
                        {
                            xml_elemet item = (xml_elemet)_xmllist[v];
                            if (item.content == _xml.content)
                            {
                                item.elementName = item.elementName + ", " + _element.elementName;
                                _xmllist[v]      = item;
                                find             = true;
                            }
                        }

                        if (!find)
                        {
                            _xmllist.Add(_xml);
                        }
                    }
                }
            }

            return(_list);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Parses the given HTML with MSHTML and builds one <c>element</c> record per DOM element,
        /// including tag-occurrence counters before and after sub-layout extraction.
        /// </summary>
        /// <remarks>
        /// Pass 1 creates an <c>element</c> for every DOM element, appends it to <c>_list</c> and
        /// maps the DOM source index to the list index in <c>_ht</c>. Pass 2 fills the "after
        /// extraction" (<c>_AE</c>) fields for a fixed set of tags from the result of
        /// <c>ExtractionofSubLayouts</c>. The fields <c>_ht</c>, <c>_list</c>, <c>_xmllist</c>,
        /// <c>all_words</c>, <c>savehtmlContent</c> and <c>resulthmtlContent</c> are overwritten;
        /// <c>domhtmlContent</c> is overwritten when the document body has text.
        /// </remarks>
        /// <param name="htmlContent2">Raw HTML; a trimmed copy of it is parsed.</param>
        /// <returns>The <c>_list</c> field holding one <c>element</c> per DOM element.</returns>
        public ArrayList prepareDOM(string htmlContent2)
        {
            string htmlContent = htmlContent2;

            htmlContent = HTML.trim_commenttags(htmlContent);
            htmlContent = HTML.trimOptions(htmlContent);
            htmlContent = HTML.trimScript(htmlContent);
            htmlContent = HTML.trim_HREF_SCR(htmlContent);
            htmlContent = HTML.trim_some_cases(htmlContent);
            //for fast processing otherwise image, link, javascript loading...

            IHTMLDocument2 htmlDocument = new mshtml.HTMLDocumentClass();

            htmlDocument.write(htmlContent);

            IHTMLElementCollection allElements = htmlDocument.all;

            _ht      = new Hashtable();
            _list    = new ArrayList();
            _xmllist = new ArrayList();

            string _tempinner_text = "";
            string _bodyinner_html = "";

            if (htmlDocument.body != null)
            {
                if (htmlDocument.body.innerText != null)
                {
                    _tempinner_text = htmlDocument.body.innerText.Replace("\r\n", "");
                    domhtmlContent  = htmlDocument.body.outerHTML.Replace("\r\n", "");
                }

                if (htmlDocument.body.innerHTML != null)
                {
                    _bodyinner_html = htmlDocument.body.innerHTML;
                }
            }

            // Statistics of the whole body; its word count is the denominator of all densities.
            // The body's inner HTML is read under the null checks above: the previous code
            // dereferenced htmlDocument.body unconditionally and failed for a document without body.
            element _firstelement = AnalyzeGivenHTML(_bodyinner_html, _tempinner_text);

            all_words = _firstelement.BagofWords;
            int i = 0;

            // Upper-casing uses the en-US culture because HTML tag names are English words;
            // the culture object is the same for every element, so it is created once.
            CultureInfo _enUS = new CultureInfo("en-US", false);

            // ---- Pass 1: one element record per DOM element --------------------------------
            foreach (IHTMLElement htmlelement in allElements)
            {
                if (htmlelement.outerHTML != null)
                {
                    element _element = new element();
                    _element.id        = i;
                    _element.outerHTML = htmlelement.outerHTML;
                    _element.outerHTML = _element.outerHTML.Replace("\r\n", "");
                    if (htmlelement.innerHTML != null)
                    {
                        _element.innerHTML = htmlelement.innerHTML;
                        _element.innerHTML = _element.innerHTML.Replace("\r\n", "");
                    }
                    else
                    {
                        _element.innerHTML = "";
                    }

                    if (_element.id == 0)
                    {
                        _element.elementlinked_id = -1;//root
                        savehtmlContent           = _element.outerHTML;
                        resulthmtlContent         = _element.outerHTML;
                    }
                    else
                    {
                        _element.elementlinked_id = 0;
                    }

                    if (htmlelement.tagName == "HTML")
                    {//the HTML element sometimes arrives late (not as the first element)...
                        savehtmlContent   = _element.outerHTML;
                        resulthmtlContent = _element.outerHTML;
                    }

                    // The element name is the complete opening tag, e.g. "<DIV id=x>".
                    string _str   = _element.outerHTML;
                    int    _start = _str.IndexOf('<');
                    int    _end   = _str.IndexOf('>');
                    _element.elementName = _str.Substring(_start, _end - _start + 1);

                    _element.tagName        = htmlelement.tagName;
                    _element.tag_id_Name    = "";
                    _element.tag_class_Name = "";
                    if (htmlelement.id != null)
                    {
                        _element.tag_id      = 1;
                        _element.tag_id_Name = htmlelement.id;
                    }

                    if (htmlelement.className != null)
                    {
                        _element.tag_class      = 1;
                        _element.tag_class_Name = htmlelement.className;
                    }

                    // 1 when the element has an id or a class attribute.
                    // The previous condition (tag_id != 1 || tag_class != 1) was inverted: it set
                    // the flag for every element except those that have both attributes.
                    if (_element.tag_id == 1 || _element.tag_class == 1)
                    {
                        _element.tag_idORclass = 1;
                    }

                    string tempinner_text = htmlelement.innerText;
                    if (tempinner_text != null)
                    {
                        tempinner_text = tempinner_text.Replace("\r\n", " ");
                        tempinner_text = tempinner_text.Trim();
                    }
                    else
                    {
                        tempinner_text = "";
                    }

                    element _tempelement = AnalyzeGivenHTML(htmlelement.outerHTML, tempinner_text);
                    _element.BagofWords = _tempelement.BagofWords;
                    _element.wordCount  = _tempelement.wordCount;
                    // A body without words would make this division yield NaN/Infinity;
                    // use 0 instead, like the other ratios computed for an element.
                    _element.DensityinHTML = _firstelement.wordCount != 0
                                             ? (double)_element.wordCount / _firstelement.wordCount
                                             : 0;
                    _element.LinkCount                 = _tempelement.LinkCount;
                    _element.wordCountinLink           = _tempelement.wordCountinLink;
                    _element.meanofWordinLinks         = _tempelement.meanofWordinLinks;
                    _element.meanofWordinLinksAllWords = _tempelement.meanofWordinLinksAllWords;

                    // Occurrence counters on the upper-cased inner HTML. These are plain substring
                    // counts of the given prefixes (e.g. "<P" is counted wherever it occurs).
                    string temp_innerhtml_ = _element.innerHTML.ToUpper(_enUS);
                    _element.dot_count    = webfilter.CountStringOccurrences(temp_innerhtml_, ".");
                    _element.h1_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H1");
                    _element.h2_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H2");
                    _element.h3_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H3");
                    _element.h4_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H4");
                    _element.h5_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H5");
                    _element.h6_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H6");
                    _element.img_count    = webfilter.CountStringOccurrences(temp_innerhtml_, "<IMG");
                    _element.p_count      = webfilter.CountStringOccurrences(temp_innerhtml_, "<P");
                    _element.br_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<BR");
                    _element.span_count   = webfilter.CountStringOccurrences(temp_innerhtml_, "<SPAN");
                    _element.object_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<OBJECT");
                    _element.ul_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<UL");
                    _element.li_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<LI");
                    _element.input_count  = webfilter.CountStringOccurrences(temp_innerhtml_, "<INPUT")
                                            + webfilter.CountStringOccurrences(temp_innerhtml_, "<BUTTON")
                                            + webfilter.CountStringOccurrences(temp_innerhtml_, "<LABEL");
                    _element.div_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<DIV");
                    _element.td_count  = webfilter.CountStringOccurrences(temp_innerhtml_, "<TD");

                    _element.parent_elementName = "";

                    //sim control
                    //-1 : not available for sim control
                    //0  : similar
                    //0..1: similarity degree
                    //1  : not similar
                    _element.sim_bagofword    = -1;
                    _element.sim_bagofword_AE = -1;
                    _element.sim_innerHTML    = -1;
                    _element.sim_innerHTML_AE = -1;


                    _list.Add(_element);

                    int key = (int)htmlelement.sourceIndex;//for fast searching
                    _ht.Add(key, i);
                    i++;
                }
            }

            // ---- Pass 2: statistics after sub-layout extraction --------------------------------
            foreach (IHTMLElement htmlelement in allElements)
            {
                if (htmlelement.outerHTML != null)
                {
                    // As used here: [0] index into _list, [1] text, [2] outer HTML, [3] inner HTML.
                    string[] _sonuclar      = ExtractionofSubLayouts(htmlelement);
                    string   tempinner_text = _sonuclar[1];
                    string   tempOuterHTML  = _sonuclar[2];
                    string   tempinnerHTML  = _sonuclar[3];
                    string   str_i          = _sonuclar[0];

                    i = Convert.ToInt32(str_i);

                    //After Extraction
                    element _element     = (element)_list[i];
                    element _tempelement = AnalyzeGivenHTML_AE(tempOuterHTML, tempinner_text);

                    // Elements linked to an entry other than the first one take over that
                    // entry's opening tag as their parent name.
                    if (_element.elementlinked_id > 0)
                    {
                        element _p_element = (element)_list[_element.elementlinked_id];
                        _element.parent_elementName = _p_element.elementName;
                    }

                    if (_element.tagName == "DIV" || _element.tagName == "TD" || _element.tagName == "UL" ||
                        _element.tagName == "H1" || _element.tagName == "H2" || _element.tagName == "H3" ||
                        _element.tagName == "H4" || _element.tagName == "H5" || _element.tagName == "H6" ||
                        _element.tagName == "SPAN" || _element.tagName == "B" || _element.tagName == "STRONG" ||
                        _element.tagName == "P")
                    {
                        _element.outerHTML_AE  = tempOuterHTML;
                        _element.innerHTML_AE  = tempinnerHTML;
                        _element.BagofWords_AE = _tempelement.BagofWords_AE;
                        _element.wordCount_AE  = _tempelement.wordCount_AE;
                        // Guarded against a body without words (see DensityinHTML above).
                        _element.DensityinHTML_AE = _firstelement.wordCount != 0
                                                    ? (double)_element.wordCount_AE / _firstelement.wordCount
                                                    : 0;
                        _element.LinkCount_AE                 = _tempelement.LinkCount_AE;
                        _element.wordCountinLink_AE           = _tempelement.wordCountinLink_AE;
                        _element.meanofWordinLinks_AE         = _tempelement.meanofWordinLinks_AE;
                        _element.meanofWordinLinksAllWords_AE = _tempelement.meanofWordinLinksAllWords_AE;

                        // Same occurrence counters as in pass 1, on the extracted inner HTML.
                        string temp_innerhtml_AE = _element.innerHTML_AE.ToUpper(_enUS);
                        _element.dot_count_AE    = webfilter.CountStringOccurrences(temp_innerhtml_AE, ".");
                        _element.h1_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H1");
                        _element.h2_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H2");
                        _element.h3_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H3");
                        _element.h4_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H4");
                        _element.h5_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H5");
                        _element.h6_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H6");
                        _element.img_count_AE    = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<IMG");
                        _element.p_count_AE      = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<P");
                        _element.br_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<BR");
                        _element.span_count_AE   = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<SPAN");
                        _element.object_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<OBJECT");
                        _element.ul_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<UL");
                        _element.li_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<LI");
                        _element.input_count_AE  = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<INPUT")
                                                   + webfilter.CountStringOccurrences(temp_innerhtml_AE, "<BUTTON")
                                                   + webfilter.CountStringOccurrences(temp_innerhtml_AE, "<LABEL");
                        _element.div_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<DIV");
                        _element.td_count_AE  = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<TD");

                        if (_element.wordCount_AE > _element.wordCount)
                        {
                            _element.wordCount_AE = _element.wordCount; //exceptional and rare case caused by problematic scripts...
                        }

                        // Repeat count of the tag: number of list entries (this one included)
                        // whose opening tag is identical to this element's opening tag.
                        int benzertagsayisi = 0;
                        for (int k = 0; k < _list.Count; k++)
                        {
                            element _e1 = (element)_list[k];
                            if (_element.elementName == _e1.elementName)
                            {
                                benzertagsayisi++;
                            }
                        }
                        _element.repeat_tag_count = benzertagsayisi;
                    }
                    _list[i] = _element;
                } // if not null
            }     //for each

            return(_list);
        }