Exemplo n.º 1
0
        /// <summary>
        /// Parses the given HTML with HtmlAgilityPack, resets the element list (<c>_list</c>) and the
        /// per-tag counter table (<c>_ht_tag_count</c>), runs the analysis, and finally copies each
        /// element's tag count into its <c>repeat_tag_count</c>.
        /// </summary>
        /// <param name="htmlContent2">Raw HTML of the page to analyze.</param>
        public void prepareDOM(string htmlContent2)
        {
            string htmlContent = htmlContent2;

            // Pre-clean the markup through the HTML.trim* helpers before parsing.
            // Original note: done for fast processing, otherwise image, link, javascript loading...
            htmlContent = HTML.trim_commenttags(htmlContent);
            htmlContent = HTML.trimOptions(htmlContent);
            htmlContent = HTML.trimScript(htmlContent);
            htmlContent = HTML.trim_HREF_SCR(htmlContent);
            htmlContent = HTML.trim_some_cases(htmlContent);

            HtmlAgilityPack.HtmlDocument htmlDocument = new HtmlAgilityPack.HtmlDocument();
            htmlDocument.LoadHtml(htmlContent);
            _list         = new List <element>();
            _ht_tag_count = new Dictionary <string, int>();

            // SelectSingleNode returns null when the markup has no <body> (e.g. an HTML fragment);
            // fall back to the document root so such input does not end in a NullReferenceException.
            HtmlAgilityPack.HtmlNode body = htmlDocument.DocumentNode.SelectSingleNode("//body")
                                            ?? htmlDocument.DocumentNode;

            // Whole-page reference element built from the body's markup and text with CR/LF pairs removed.
            element _firstelement = AnalyzeGivenHTML(body.InnerHtml.Replace("\r\n", "").Trim(), body.InnerText.Replace("\r\n", "").Trim());

            // Presumably fills _list and _ht_tag_count - confirm against AnalyzeProcess.
            AnalyzeProcess(htmlDocument, _firstelement);
            //AnalyzeProcess(htmlDocument, "//li", _firstelement);
            //AnalyzeProcess(htmlDocument, "//td", _firstelement);

            // Transfer the per-tag counts onto the elements.
            for (int i = 0; i < _list.Count; i++)
            {
                element _e = (element)_list[i];
                _e.repeat_tag_count = (int)_ht_tag_count[_e.tagName_Orginal];
                // Written back explicitly; required if element is a value type (not visible here).
                _list[i] = _e;
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Applies the stored rules of one class to a web page and collects the matching HTML fragments.
        /// </summary>
        /// <param name="_ht_rules">Rule table; every entry value is cast to <c>rule_information</c>.</param>
        /// <param name="awebpage">HTML of the page to search.</param>
        /// <param name="_r_className">Only rules whose <c>_Classname</c> equals this value are applied.</param>
        /// <returns>
        /// The fragments whose stripped text is non-blank, that differ from the rule's own
        /// <c>_htmlText</c>, and for which <c>same_content</c> reports no existing equivalent.
        /// </returns>
        public static ArrayList extractRules(Hashtable _ht_rules, string awebpage, string _r_className)
        {
            ArrayList matches = new ArrayList();

            // Normalise the page through the same helper chain before any pattern is applied.
            string page = uppercaseonlytags(awebpage);
            page = HTML.trim_commenttags(page);
            page = HTML.trimScript(page);
            page = HTML.trim_some_cases(page);

            foreach (DictionaryEntry ruleEntry in _ht_rules)
            {
                rule_information rule = (rule_information)ruleEntry.Value;
                if (!(rule._Classname == _r_className))
                {
                    continue;
                }

                string   parentPattern = prepare_a_pattern(rule._parent_tag);
                string[] parentBlocks  = HTMLMarkerClass.webfilter.Contents_of_givenLayout_Tags_TESTER(page, parentPattern, false);

                // Decide whether the whole page replaces the parent-tag search result.
                // NOTE(review): with no parent match the fallback fires unless the parent tag contains
                // BOTH "<DIV" and "<TD" (any case listed below), which is almost always - confirm that
                // "neither DIV nor TD" was not the intended test.
                bool searchWholePage;
                if (parentBlocks == null)
                {
                    bool parentIsDiv = rule._parent_tag.Contains("<DIV") || rule._parent_tag.Contains("<div");
                    bool parentIsTd  = rule._parent_tag.Contains("<TD") || rule._parent_tag.Contains("<td");
                    searchWholePage = !parentIsDiv || !parentIsTd;
                }
                else
                {
                    // A matched table-row parent is discarded in favour of the whole page.
                    searchWholePage = rule._parent_tag.Contains("<tr") || rule._parent_tag.Contains("<TR");
                }

                if (searchWholePage)
                {
                    parentBlocks = new string[] { page };
                }
                else if (parentBlocks == null)
                {
                    continue;
                }

                foreach (string parentBlock in parentBlocks)
                {
                    string   tagPattern = prepare_a_pattern(rule._tag);
                    string[] candidates = HTMLMarkerClass.webfilter.Contents_of_givenLayout_Tags_TESTER(parentBlock, tagPattern, false);
                    if (candidates == null)
                    {
                        continue;
                    }

                    foreach (string candidate in candidates)
                    {
                        if (candidate == null)
                        {
                            continue;
                        }

                        // Keep fragments with visible text that are not the rule's own sample
                        // and are not already represented in the result.
                        if (HTML.stripHtml(candidate).Trim() != "" &&
                            rule._htmlText != candidate &&
                            !same_content(matches, candidate))
                        {
                            matches.Add(candidate);
                        }
                    }
                }
            }

            return matches;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Loads the given HTML into an mshtml document and builds one <c>element</c> record per DOM
        /// element (stored in <c>_list</c>), in two passes: the first pass records raw markup, tag
        /// identity and word/tag statistics; the second pass adds the "after extraction" (<c>_AE</c>)
        /// statistics produced via <c>ExtractionofSubLayouts</c> for a fixed set of tag names.
        /// </summary>
        /// <param name="htmlContent2">Raw HTML of the page to analyze.</param>
        /// <returns>The populated <c>_list</c> of <c>element</c> records.</returns>
        public ArrayList prepareDOM(string htmlContent2)
        {
            string htmlContent = htmlContent2;

            // Pre-clean the markup through the HTML.trim* helpers before parsing.
            // Original note: done for fast processing, otherwise image, link, javascript loading...
            htmlContent = HTML.trim_commenttags(htmlContent);
            htmlContent = HTML.trimOptions(htmlContent);
            htmlContent = HTML.trimScript(htmlContent);
            htmlContent = HTML.trim_HREF_SCR(htmlContent);
            htmlContent = HTML.trim_some_cases(htmlContent);

            IHTMLDocument2 htmlDocument = new mshtml.HTMLDocumentClass();

            htmlDocument.write(htmlContent);

            IHTMLElementCollection allElements = htmlDocument.all;

            _ht      = new Hashtable();
            _list    = new ArrayList();
            _xmllist = new ArrayList();

            string _tempinner_text = "";

            if (htmlDocument.body != null)
            {
                if (htmlDocument.body.innerText != null)
                {
                    _tempinner_text = htmlDocument.body.innerText.Replace("\r\n", "");
                    domhtmlContent  = htmlDocument.body.outerHTML.Replace("\r\n", "");
                }
            }

            // The body is guarded against null above, so guard it here as well instead of
            // dereferencing it unconditionally; an empty string stands in for a missing body.
            string _bodyInnerHtml = htmlDocument.body != null ? htmlDocument.body.innerHTML : "";

            // Whole-page reference element; its wordCount is the denominator of the density values below.
            element _firstelement = AnalyzeGivenHTML(_bodyInnerHtml, _tempinner_text);

            all_words = _firstelement.BagofWords;
            int i = 0;

            // Culture used to upper-case markup before counting tags (for english words thus html tags).
            // Created once because it does not change between iterations.
            CultureInfo _enUS = new CultureInfo("en-US", false);

            // Pass 1: one element record per DOM element that has outer HTML.
            foreach (IHTMLElement htmlelement in allElements)
            {
                if (htmlelement.outerHTML != null)
                {
                    element _element = new element();
                    _element.id        = i;
                    _element.outerHTML = htmlelement.outerHTML;
                    _element.outerHTML = _element.outerHTML.Replace("\r\n", "");
                    if (htmlelement.innerHTML != null)
                    {
                        _element.innerHTML = htmlelement.innerHTML;
                        _element.innerHTML = _element.innerHTML.Replace("\r\n", "");
                    }
                    else
                    {
                        _element.innerHTML = "";
                    }

                    if (_element.id == 0)
                    {
                        _element.elementlinked_id = -1;//root
                        savehtmlContent           = _element.outerHTML;
                        resulthmtlContent         = _element.outerHTML;
                    }
                    else
                    {
                        _element.elementlinked_id = 0;
                    }

                    if (htmlelement.tagName == "HTML")
                    {// the HTML element sometimes arrives late (not as the first element)...
                        savehtmlContent   = _element.outerHTML;
                        resulthmtlContent = _element.outerHTML;
                    }

                    // elementName is the opening tag text: from the first '<' through the first '>'.
                    string _str   = _element.outerHTML;
                    int    _start = _str.IndexOf('<');
                    int    _end   = _str.IndexOf('>');
                    _element.elementName = _str.Substring(_start, _end - _start + 1);

                    _element.tagName        = htmlelement.tagName;
                    _element.tag_id_Name    = "";
                    _element.tag_class_Name = "";
                    if (htmlelement.id != null)
                    {
                        _element.tag_id      = 1;
                        _element.tag_id_Name = htmlelement.id;
                    }

                    if (htmlelement.className != null)
                    {
                        _element.tag_class      = 1;
                        _element.tag_class_Name = htmlelement.className;
                    }

                    // NOTE(review): this sets the flag unless BOTH id and class are present, while the
                    // field name suggests "id or class present" - confirm the intended condition.
                    if (_element.tag_id != 1 || _element.tag_class != 1)
                    {
                        _element.tag_idORclass = 1;
                    }

                    string tempinner_text = htmlelement.innerText;
                    if (tempinner_text != null)
                    {
                        tempinner_text = tempinner_text.Replace("\r\n", " ");
                        tempinner_text = tempinner_text.Trim();
                    }
                    else
                    {
                        tempinner_text = "";
                    }

                    // Word/link statistics are computed by AnalyzeGivenHTML and copied over.
                    element _tempelement = AnalyzeGivenHTML(htmlelement.outerHTML, tempinner_text);
                    _element.BagofWords                = _tempelement.BagofWords;
                    _element.wordCount                 = _tempelement.wordCount;
                    _element.DensityinHTML             = (double)_element.wordCount / _firstelement.wordCount;
                    _element.LinkCount                 = _tempelement.LinkCount;
                    _element.wordCountinLink           = _tempelement.wordCountinLink;
                    _element.meanofWordinLinks         = _tempelement.meanofWordinLinks;
                    _element.meanofWordinLinksAllWords = _tempelement.meanofWordinLinksAllWords;

                    // Tag counts are substring counts over the upper-cased inner HTML.
                    string temp_innerhtml_ = _element.innerHTML.ToUpper(_enUS);
                    _element.dot_count    = webfilter.CountStringOccurrences(temp_innerhtml_, ".");
                    _element.h1_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H1");
                    _element.h2_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H2");
                    _element.h3_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H3");
                    _element.h4_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H4");
                    _element.h5_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H5");
                    _element.h6_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H6");
                    _element.img_count    = webfilter.CountStringOccurrences(temp_innerhtml_, "<IMG");
                    _element.p_count      = webfilter.CountStringOccurrences(temp_innerhtml_, "<P");
                    _element.br_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<BR");
                    _element.span_count   = webfilter.CountStringOccurrences(temp_innerhtml_, "<SPAN");
                    _element.object_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<OBJECT");
                    _element.ul_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<UL");
                    _element.li_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<LI");
                    _element.input_count  = webfilter.CountStringOccurrences(temp_innerhtml_, "<INPUT")
                                            + webfilter.CountStringOccurrences(temp_innerhtml_, "<BUTTON")
                                            + webfilter.CountStringOccurrences(temp_innerhtml_, "<LABEL");
                    _element.div_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<DIV");
                    _element.td_count  = webfilter.CountStringOccurrences(temp_innerhtml_, "<TD");

                    _element.parent_elementName = "";

                    //sim control
                    //-1 : not available for sim control
                    //0  : similar
                    //0..1: similarity degree
                    //1  : not similar
                    _element.sim_bagofword    = -1;
                    _element.sim_bagofword_AE = -1;
                    _element.sim_innerHTML    = -1;
                    _element.sim_innerHTML_AE = -1;


                    _list.Add(_element);

                    // Map the DOM sourceIndex to the position in _list (for fast searching).
                    int key = (int)htmlelement.sourceIndex;
                    _ht.Add(key, i);
                    i++;
                }
            }

            // Pass 2: "after extraction" (_AE) statistics.
            foreach (IHTMLElement htmlelement in allElements)
            {
                if (htmlelement.outerHTML != null)
                {
                    // Result slots as used below: [0] index into _list, [1] inner text,
                    // [2] outer HTML, [3] inner HTML.
                    string[] _sonuclar      = ExtractionofSubLayouts(htmlelement);
                    string   tempinner_text = _sonuclar[1];
                    string   tempOuterHTML  = _sonuclar[2];
                    string   tempinnerHTML  = _sonuclar[3];
                    string   str_i          = _sonuclar[0];

                    i = Convert.ToInt32(str_i);

                    //After Extraction
                    element _element     = (element)_list[i];
                    element _tempelement = AnalyzeGivenHTML_AE(tempOuterHTML, tempinner_text);

                    // NOTE(review): only links with an index above 0 are resolved, so elements linked
                    // to index 0 (or to -1, the root marker) keep an empty parent_elementName - confirm.
                    if (_element.elementlinked_id > 0)
                    {
                        element _p_element = (element)_list[_element.elementlinked_id];
                        _element.parent_elementName = _p_element.elementName;
                    }

                    if (_element.tagName == "DIV" || _element.tagName == "TD" || _element.tagName == "UL" ||
                        _element.tagName == "H1" || _element.tagName == "H2" || _element.tagName == "H3" ||
                        _element.tagName == "H4" || _element.tagName == "H5" || _element.tagName == "H6" ||
                        _element.tagName == "SPAN" || _element.tagName == "B" || _element.tagName == "STRONG" ||
                        _element.tagName == "P")
                    {
                        _element.outerHTML_AE                 = tempOuterHTML;
                        _element.innerHTML_AE                 = tempinnerHTML;
                        _element.BagofWords_AE                = _tempelement.BagofWords_AE;
                        _element.wordCount_AE                 = _tempelement.wordCount_AE;
                        _element.DensityinHTML_AE             = (double)_element.wordCount_AE / _firstelement.wordCount;
                        _element.LinkCount_AE                 = _tempelement.LinkCount_AE;
                        _element.wordCountinLink_AE           = _tempelement.wordCountinLink_AE;
                        _element.meanofWordinLinks_AE         = _tempelement.meanofWordinLinks_AE;
                        _element.meanofWordinLinksAllWords_AE = _tempelement.meanofWordinLinksAllWords_AE;

                        // Same substring-based tag counting as in pass 1, over the extracted inner HTML.
                        string temp_innerhtml_AE = _element.innerHTML_AE.ToUpper(_enUS);
                        _element.dot_count_AE    = webfilter.CountStringOccurrences(temp_innerhtml_AE, ".");
                        _element.h1_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H1");
                        _element.h2_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H2");
                        _element.h3_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H3");
                        _element.h4_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H4");
                        _element.h5_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H5");
                        _element.h6_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H6");
                        _element.img_count_AE    = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<IMG");
                        _element.p_count_AE      = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<P");
                        _element.br_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<BR");
                        _element.span_count_AE   = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<SPAN");
                        _element.object_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<OBJECT");
                        _element.ul_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<UL");
                        _element.li_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<LI");
                        _element.input_count_AE  = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<INPUT")
                                                   + webfilter.CountStringOccurrences(temp_innerhtml_AE, "<BUTTON")
                                                   + webfilter.CountStringOccurrences(temp_innerhtml_AE, "<LABEL");
                        _element.div_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<DIV");
                        _element.td_count_AE  = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<TD");

                        if (_element.wordCount_AE > _element.wordCount)
                        {
                            // Exceptional, rare case caused by scripts: cap the extracted word count.
                            _element.wordCount_AE = _element.wordCount;
                        }
                        // Repeat count of the tag: number of list entries sharing this opening tag text.
                        //_element.repeat_tag_count = webfilter.CountStringOccurrences(htmlDocument.body.innerHTML, _element.elementName);
                        int benzertagsayisi = 0;
                        for (int k = 0; k < _list.Count; k++)
                        {
                            element _e1 = (element)_list[k];
                            if (_element.elementName == _e1.elementName)
                            {
                                benzertagsayisi++;
                            }
                        }
                        _element.repeat_tag_count = benzertagsayisi;
                    }
                    // Store the updated record back into the list.
                    _list[i] = _element;
                } // if not null
            }     //for each

            return(_list);
        }