Beispiel #1
0
        /// <summary>
        /// Parses the given HTML, resets the element list and the tag-count table,
        /// runs the element analysis and finally copies every tag's repeat count
        /// onto the matching element in <c>_list</c>.
        /// </summary>
        /// <param name="htmlContent2">Raw HTML source of the page to analyse.</param>
        public void prepareDOM(string htmlContent2)
        {
            string htmlContent = htmlContent2;

            // Pre-clean the markup with the HTML.trim* helpers before parsing.
            // Original note: for fast processing, otherwise image, link, javascript loading...
            htmlContent = HTML.trim_commenttags(htmlContent);
            htmlContent = HTML.trimOptions(htmlContent);
            htmlContent = HTML.trimScript(htmlContent);
            htmlContent = HTML.trim_HREF_SCR(htmlContent);
            htmlContent = HTML.trim_some_cases(htmlContent);

            HtmlAgilityPack.HtmlDocument htmlDocument = new HtmlAgilityPack.HtmlDocument();
            htmlDocument.LoadHtml(htmlContent);
            _list         = new List <element>();
            _ht_tag_count = new Dictionary <string, int>();

            // SelectSingleNode returns null when nothing matches (e.g. an HTML
            // fragment without a <body>); fall back to the document root so the
            // page-level statistics can still be computed instead of crashing.
            HtmlAgilityPack.HtmlNode body = htmlDocument.DocumentNode.SelectSingleNode("//body")
                                            ?? htmlDocument.DocumentNode;
            element _firstelement = AnalyzeGivenHTML(body.InnerHtml.Replace("\r\n", "").Trim(), body.InnerText.Replace("\r\n", "").Trim());

            AnalyzeProcess(htmlDocument, _firstelement);
            //AnalyzeProcess(htmlDocument, "//li", _firstelement);
            //AnalyzeProcess(htmlDocument, "//td", _firstelement);

            // Transfer the per-tag counts onto the collected elements.
            // The element is written back to the list after the update, which
            // keeps this correct even if `element` is a value type.
            for (int i = 0; i < _list.Count; i++)
            {
                element _e = (element)_list[i];
                _e.repeat_tag_count = (int)_ht_tag_count[_e.tagName_Orginal];
                _list[i]            = _e;
            }
        }
Beispiel #2
0
        /// <summary>
        /// Prepares word and link statistics for a given HTML fragment.
        /// </summary>
        /// <param name="html_content">Markup that is scanned for anchors and link markers.</param>
        /// <param name="inner_text">Text that is stored as the bag of words and word-counted.</param>
        /// <returns>
        /// A new element with BagofWords, wordCount, LinkCount, wordCountinLink,
        /// meanofWordinLinks and meanofWordinLinksAllWords filled in.
        /// </returns>
        public element AnalyzeGivenHTML(string html_content, string inner_text)
        {
            //html_content = RemoveScripts(html_content);
            element _element = new element();

            _element.BagofWords = inner_text;
            _element.wordCount  = HTML.WordsCountGivenText(_element.BagofWords);

            // Captures the markup between the end of an opening tag that carries
            // an href attribute and the next "</a".
            string pattern = "href=.*?>(.*?)</a";
            Regex  exp     = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

            MatchCollection matchList = exp.Matches(html_content);

            // Accumulate the stripped anchor texts, each preceded by one space.
            // A StringBuilder avoids re-copying the whole text for every match.
            System.Text.StringBuilder linkText = new System.Text.StringBuilder();

            for (int i = 0; i < matchList.Count; i++)
            {
                Match match = matchList[i];
                if (match.Value.Length > 0)
                {
                    linkText.Append(" ").Append(HTML.stripHtml(match.Groups[1].Value));
                }
            }

            string URL_INNER = linkText.ToString();

            // Link count = occurrences of "<a " and "<A " plus every "onclick="
            // (original note: interesting javascript with link).
            int count_link = webfilter.CountStringOccurrences(html_content, "<a ");

            count_link = count_link + webfilter.CountStringOccurrences(html_content, "<A ");
            count_link = count_link + webfilter.CountStringOccurrences(html_content, "onclick=");

            _element.LinkCount       = count_link;
            _element.wordCountinLink = HTML.WordsCountGivenText(URL_INNER);

            // Both ratios default to 0 when their denominator is 0.
            if (_element.LinkCount != 0)
            {
                _element.meanofWordinLinks = (double)_element.wordCountinLink / _element.LinkCount;
            }
            else
            {
                _element.meanofWordinLinks = 0;
            }

            if (_element.wordCount != 0)
            {
                _element.meanofWordinLinksAllWords = (double)_element.wordCountinLink / _element.wordCount;
            }
            else
            {
                _element.meanofWordinLinksAllWords = 0;
            }

            return(_element);
        }
Beispiel #3
0
        /// <summary>
        /// Updates the two DOMs passed by reference: for matching DIV elements the
        /// four cosine-similarity values are computed and stored on both elements.
        /// </summary>
        /// <param name="_dom1">First DOM; its element list is updated in place.</param>
        /// <param name="_dom2">Second DOM; its element list is updated in place.</param>
        public static void prepareDOMSim(ref DOM _dom1, ref DOM _dom2)
        {
            for (int i = 0; i < _dom1._list.Count; i++)
            {
                // NOTE(review): the scan of the second list starts at index i, not 0 -
                // presumably both lists are expected to be in similar order; confirm.
                for (int j = i; j < _dom2._list.Count; j++)
                {
                    element left  = (element)_dom1._list[i];
                    element right = (element)_dom2._list[j];

                    // Only DIV/DIV pairs are compared.
                    if (left.tagName != "DIV" || right.tagName != "DIV")
                    {
                        continue;
                    }

                    // Elements referenced through elementlinked_id; they stay blank
                    // unless both sides carry a non-negative link.
                    element leftLinked  = new element();
                    element rightLinked = new element();
                    if (left.elementlinked_id >= 0 && right.elementlinked_id >= 0)
                    {
                        leftLinked  = (element)_dom1._list[left.elementlinked_id];
                        rightLinked = (element)_dom2._list[right.elementlinked_id];
                    }

                    // An element without class name and without id name is never matched.
                    if (left.tag_class_Name == "" && left.tag_id_Name == "")
                    {
                        continue;
                    }

                    bool sameElement = left.tagName == right.tagName &&
                                       left.tag_id_Name == right.tag_id_Name &&
                                       left.tag_class_Name == right.tag_class_Name;
                    if (!sameElement)
                    {
                        continue;
                    }

                    bool sameLinked = leftLinked.tagName == rightLinked.tagName &&
                                      leftLinked.tag_id_Name == rightLinked.tag_id_Name &&
                                      leftLinked.tag_class_Name == rightLinked.tag_class_Name;
                    if (!sameLinked)
                    {
                        continue;
                    }

                    left.sim_bagofword    = Math.Round(similarity.Cossine_Similarity(left.BagofWords, right.BagofWords), 5);
                    left.sim_bagofword_AE = Math.Round(similarity.Cossine_Similarity(left.BagofWords_AE, right.BagofWords_AE), 5);
                    left.sim_innerHTML    = Math.Round(similarity.Cossine_Similarity(left.innerHTML, right.innerHTML), 5);
                    left.sim_innerHTML_AE = Math.Round(similarity.Cossine_Similarity(left.innerHTML_AE, right.innerHTML_AE), 5);

                    right.sim_bagofword    = left.sim_bagofword;
                    right.sim_bagofword_AE = left.sim_bagofword_AE;
                    right.sim_innerHTML    = left.sim_innerHTML;
                    right.sim_innerHTML_AE = left.sim_innerHTML_AE;

                    // Write both elements back, then stop at the first match for this i.
                    _dom1._list[i] = left;
                    _dom2._list[j] = right;
                    break;
                }
            }
        }
Beispiel #4
0
        /// <summary>
        /// Prepares the "_AE" word and link statistics for a given HTML fragment
        /// (the caller's comments label these values "After Extraction").
        /// </summary>
        /// <param name="html_content">Markup that is scanned for anchors.</param>
        /// <param name="inner_text">Text that is stored as the bag of words and word-counted.</param>
        /// <returns>
        /// A new element with BagofWords_AE, wordCount_AE, LinkCount_AE,
        /// wordCountinLink_AE, meanofWordinLinks_AE and
        /// meanofWordinLinksAllWords_AE filled in.
        /// </returns>
        public element AnalyzeGivenHTML_AE(string html_content, string inner_text)
        {
            //html_content = RemoveScripts(html_content);
            element _element = new element();

            _element.BagofWords_AE = inner_text;
            _element.wordCount_AE  = HTML.WordsCountGivenText(_element.BagofWords_AE);

            // Captures the markup between the end of an opening tag that carries
            // an href attribute and the next "</a".
            string pattern = "href=.*?>(.*?)</a";
            Regex  exp     = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

            MatchCollection matchList = exp.Matches(html_content);

            // Accumulate the stripped anchor texts, each preceded by one space.
            // A StringBuilder avoids re-copying the whole text for every match.
            System.Text.StringBuilder linkText = new System.Text.StringBuilder();

            for (int i = 0; i < matchList.Count; i++)
            {
                Match match = matchList[i];
                if (match.Value.Length > 0)
                {
                    linkText.Append(" ").Append(HTML.stripHtml(match.Groups[1].Value));
                }
            }

            string URL_INNER = linkText.ToString();

            // Here the link count is simply the number of regex matches.
            _element.LinkCount_AE       = matchList.Count;
            _element.wordCountinLink_AE = HTML.WordsCountGivenText(URL_INNER);

            // Both ratios default to 0 when their denominator is 0.
            if (_element.LinkCount_AE != 0)
            {
                _element.meanofWordinLinks_AE = (double)_element.wordCountinLink_AE / _element.LinkCount_AE;
            }
            else
            {
                _element.meanofWordinLinks_AE = 0;
            }

            if (_element.wordCount_AE != 0)
            {
                _element.meanofWordinLinksAllWords_AE = (double)_element.wordCountinLink_AE / _element.wordCount_AE;
            }
            else
            {
                // Fixed: this branch used to reset the non-AE field
                // meanofWordinLinksAllWords instead of the _AE one.
                _element.meanofWordinLinksAllWords_AE = 0;
            }

            return(_element);
        }
Beispiel #5
0
        /// <summary>
        /// Concatenates the outerHTML_AE of every element that the decision class
        /// accepts for the "links" category.
        /// </summary>
        /// <param name="_list">Elements to inspect; may be null.</param>
        /// <returns>
        /// The accepted fragments, each preceded by a single space, or an empty
        /// string when the list is null or nothing is accepted.
        /// </returns>
        public string extract_links_by_ML(ArrayList _list)
        {
            if (_list == null)
            {
                return("");
            }

            // Hand the list over to the decision class before querying it.
            HTMLMarkerClass.desicionClass._list = _list;

            string collected = "";

            for (int index = 0; index < _list.Count; index++)
            {
                HTMLMarkerClass.element candidate = (HTMLMarkerClass.element)_list[index];

                if (!HTMLMarkerClass.desicionClass.write_or_not("links", candidate))
                {
                    continue;
                }

                collected = string.Concat(collected, " ", candidate.outerHTML_AE);
            }

            return(collected);
        }
Beispiel #6
0
        /// <summary>
        /// Collects the outerHTML_AE of every element that the decision class
        /// accepts for the requested category.
        /// Original note: dom to xml main, headline, summary, additional.
        /// </summary>
        /// <param name="_class">Category name passed on to write_or_not.</param>
        /// <param name="_list">Elements to inspect; may be null.</param>
        /// <returns>A new list of the accepted fragments; empty when the input list is null.</returns>
        private ArrayList PrepareContent(string _class, ArrayList _list)
        {
            ArrayList accepted = new ArrayList();

            if (_list == null)
            {
                return(accepted);
            }

            // Hand the list over to the decision class before querying it.
            HTMLMarkerClass.desicionClass._list = _list;

            for (int index = 0; index < _list.Count; index++)
            {
                HTMLMarkerClass.element candidate = (HTMLMarkerClass.element)_list[index];

                if (!HTMLMarkerClass.desicionClass.write_or_not(_class, candidate))
                {
                    continue;
                }

                accepted.Add(candidate.outerHTML_AE);
            }

            return(accepted);
        }
Beispiel #7
0
        /// <summary>
        /// Loads the HTML into an mshtml document and builds the element list in
        /// three passes: (1) one element with whole-subtree statistics per DOM
        /// node, (2) the "_AE" statistics obtained from ExtractionofSubLayouts plus
        /// relevance marking of DIV/TD layouts and their linked children, and
        /// (3) class prediction for relevant elements, collected into _xmllist.
        /// </summary>
        /// <param name="htmlContent">Raw HTML source of the page.</param>
        /// <returns>The populated element list (the same instance as the _list field).</returns>
        public ArrayList prepareDOM(string htmlContent)
        {
            htmlContent = HTML.trimOptions(htmlContent);
            htmlContent = HTML.trimScript(htmlContent);

            IHTMLDocument2 htmlDocument = new mshtml.HTMLDocumentClass();

            htmlDocument.write(htmlContent);

            IHTMLElementCollection allElements = htmlDocument.all;

            _ht      = new Hashtable();
            _list    = new ArrayList();
            _xmllist = new ArrayList();

            string _tempinner_text = "";

            if (htmlDocument.body != null)
            {
                if (htmlDocument.body.innerText != null)
                {
                    _tempinner_text = htmlDocument.body.innerText.Replace("\r\n", "");
                }
            }

            // Page-level statistics; its wordCount is the denominator of the densities below.
            element _firstelement = AnalyzeGivenHTML(htmlContent, _tempinner_text);
            int     i             = 0;

            // Pass 1: one element per DOM node that has outer HTML.
            foreach (IHTMLElement htmlelement in allElements)
            {
                if (htmlelement.outerHTML != null)
                {
                    element _element = new element();
                    _element.id        = i;
                    _element.outerHTML = htmlelement.outerHTML;
                    _element.outerHTML = _element.outerHTML.Replace("\r\n", "");
                    if (htmlelement.innerHTML != null)
                    {
                        _element.innerHTML = htmlelement.innerHTML;
                        _element.innerHTML = _element.innerHTML.Replace("\r\n", "");
                    }
                    else
                    {
                        _element.innerHTML = "";
                    }

                    if (_element.id == 0)
                    {
                        _element.elementlinked_id = -1;//root
                        savehtmlContent           = _element.outerHTML;
                        resulthmtlContent         = _element.outerHTML;
                    }
                    else
                    {
                        _element.elementlinked_id = 0;
                    }

                    if (htmlelement.tagName == "HTML")
                    {//the HTML element sometimes arrives late...
                        savehtmlContent   = _element.outerHTML;
                        resulthmtlContent = _element.outerHTML;
                    }

                    // elementName is the text from the first '<' through the first '>'.
                    string _str   = _element.outerHTML;
                    int    _start = _str.IndexOf('<');
                    int    _end   = _str.IndexOf('>');
                    _element.elementName = _str.Substring(_start, _end - _start + 1);

                    //<!--className::(.*?)--> ??? must be the one right at the start, must not slip to later ones ????
                    // The marker is only honoured when it directly follows the opening tag;
                    // 15 is the length of "<!--className::".
                    _start = _str.IndexOf("<!--className::");
                    _end   = _str.IndexOf("-->");
                    if (_start >= 0)
                    {
                        if (_start == _element.elementName.Length)
                        {
                            _start = 15 + _element.elementName.Length;
                            if (_end - _start > 0)
                            {
                                _element.className = _str.Substring(_start, _end - _start);
                            }
                        }
                    }

                    _element.tagName = htmlelement.tagName;
                    if (htmlelement.id != null)
                    {
                        _element.tag_id = 1;
                    }
                    if (htmlelement.className != null)
                    {
                        _element.tag_class = 1;
                    }

                    // 1 when the node has an id or a class, otherwise 0.
                    // Fixed: the sum used to add tag_idORclass to itself instead of
                    // tag_class, so the class flag was ignored and the clamp below
                    // could never trigger.
                    _element.tag_idORclass = _element.tag_id + _element.tag_class;
                    if (_element.tag_idORclass == 2)
                    {
                        _element.tag_idORclass = 1;
                    }

                    string tempinner_text = htmlelement.innerText;
                    if (tempinner_text != null)
                    {
                        tempinner_text = tempinner_text.Replace("\r\n", " ");
                        tempinner_text = tempinner_text.Trim();
                    }
                    else
                    {
                        tempinner_text = "";
                    }

                    element _tempelement = AnalyzeGivenHTML(htmlelement.outerHTML, tempinner_text);
                    _element.BagofWords                = _tempelement.BagofWords;
                    _element.wordCount                 = _tempelement.wordCount;
                    // NOTE(review): no zero check on _firstelement.wordCount here - confirm
                    // what downstream code expects for a page without any words.
                    _element.DensityinHTML             = (double)_element.wordCount / _firstelement.wordCount;
                    _element.LinkCount                 = _tempelement.LinkCount;
                    _element.wordCountinLink           = _tempelement.wordCountinLink;
                    _element.meanofWordinLinks         = _tempelement.meanofWordinLinks;
                    _element.meanofWordinLinksAllWords = _tempelement.meanofWordinLinksAllWords;

                    _element.similarity_with_other_web_page = 1;

                    _element.relevant           = false;
                    _element.parent_elementName = "";

                    _list.Add(_element);

                    int key = (int)htmlelement.sourceIndex;//for fast searching
                    _ht.Add(key, i);
                    i++;
                }
            }

            // Pass 2: "after extraction" statistics and relevance marking.
            foreach (IHTMLElement htmlelement in allElements)
            {
                if (htmlelement.outerHTML != null)
                {
                    // Result layout: [0] list index, [1] inner text, [2] outer HTML, [3] inner HTML.
                    string[] _sonuclar      = ExtractionofSubLayouts(htmlelement);
                    string   tempinner_text = _sonuclar[1];
                    string   tempOuterHTML  = _sonuclar[2];
                    string   tempinnerHTML  = _sonuclar[3];
                    string   str_i          = _sonuclar[0];

                    i = Convert.ToInt32(str_i);

                    //After Extraction
                    element _element     = (element)_list[i];
                    element _tempelement = AnalyzeGivenHTML_AE(tempOuterHTML, tempinner_text);

                    _element.outerHTML_AE                 = tempOuterHTML;
                    _element.innerHTML_AE                 = tempinnerHTML;
                    _element.BagofWords_AE                = _tempelement.BagofWords_AE;
                    _element.wordCount_AE                 = _tempelement.wordCount_AE;
                    _element.DensityinHTML_AE             = (double)_element.wordCount_AE / _firstelement.wordCount;
                    _element.LinkCount_AE                 = _tempelement.LinkCount_AE;
                    _element.wordCountinLink_AE           = _tempelement.wordCountinLink_AE;
                    _element.meanofWordinLinks_AE         = _tempelement.meanofWordinLinks_AE;
                    _element.meanofWordinLinksAllWords_AE = _tempelement.meanofWordinLinksAllWords_AE;

                    //dot endofcontent: flag whether the (untrimmed) inner text ends with '.'
                    if (htmlelement.innerText != null)
                    {
                        if (htmlelement.innerText.Trim() != "")
                        {
                            if (htmlelement.innerText[htmlelement.innerText.Length - 1] == '.')
                            {
                                _element.dot_endofstence = 1;
                            }
                            else
                            {
                                _element.dot_endofstence = 0;
                            }
                        }
                    }

                    if (_element.wordCount_AE > _element.wordCount)
                    {
                        _element.wordCount_AE = _element.wordCount; //exceptional and rare case, caused by problematic scripts...
                    }
                    _list[i] = _element;

                    if (htmlelement.tagName == "DIV" || htmlelement.tagName == "TD")
                    {
                        element _e = (element)_list[i];
                        // NOTE(review): this flag is only written back to _list inside the
                        // branch below; whether it persists otherwise depends on `element`
                        // being a reference type - confirm.
                        if (_e.elementName.Contains("vAlign=bot"))
                        {
                            _e.relevant = true;
                        }

                        bool _decision = HTMLMarkerClass.desicionClass.determineIrrevelantLayout(_element);

                        if (_decision == false)
                        {
                            _e.relevant = true;
                            _list[i]    = _e;

                            //Update child elements
                            for (int m = 0; m < _list.Count; m++)
                            {
                                element _et = (element)_list[m];
                                if (_et.elementlinked_id == _e.id)
                                {
                                    if (_et.tagName != "DIV")//do not decide for DIVs here
                                    {
                                        if (_et.tagName != "TD")
                                        {
                                            _et.relevant           = true;
                                            _et.parent_elementName = _e.elementName;
                                            _list[m] = _et;
                                        }
                                    }
                                }
                            } //for m
                        }     //decision == false
                    }         //if div or td
                }             // if not null
            }                 //for each

            // Pass 3: predict a class name for every relevant element and collect the XML items.
            for (int m = 0; m < _list.Count; m++)
            {
                element _element = (element)_list[m];
                if (_element.relevant == true)
                {
                    if (_element.tagName == "DIV" || _element.tagName == "TD")
                    {
                        bool _mainlayout = HTMLMarkerClass.desicionClass.determineLayout(_element);
                        if (_mainlayout)
                        {
                            _element.predicted_className = "MAIN";
                        }
                        else
                        {
                            _element.predicted_className = HTMLMarkerClass.desicionClass.determineHEADLINE_INFORMATION(_element);
                        }
                    }
                    else
                    {
                        // Elements other than DIV/TD are never labelled MAIN.
                        _element.predicted_className = HTMLMarkerClass.desicionClass.determineHEADLINE_INFORMATION(_element);
                        if (_element.predicted_className == "MAIN")
                        {
                            _element.predicted_className = "IRRELEVANT";
                        }
                    }

                    /*if (_element.predicted_className == "IRRELEVANT")
                     *  _element.predicted_className = "INFORMATIONABOUTARTICLE";*/

                    // The prediction is only stored when the cleaned _AE text is non-empty.
                    if (clear_illegal_characters_for_XML(_element.BagofWords_AE.Trim()).Trim() != "")
                    {
                        _list[m] = _element;

                        xml_elemet _xml = new xml_elemet();
                        _xml.elementName         = _element.elementName;
                        _xml.content             = _element.BagofWords_AE;
                        _xml.predicted_className = _element.predicted_className;
                        _xml.parent_elementName  = _element.parent_elementName;
                        //equal content in _xml_list: merge element names instead of adding a duplicate
                        bool find = false;
                        for (int v = 0; v < _xmllist.Count; v++)
                        {
                            xml_elemet item = (xml_elemet)_xmllist[v];
                            if (item.content == _xml.content)
                            {
                                item.elementName = item.elementName + ", " + _element.elementName;
                                _xmllist[v]      = item;
                                find             = true;
                            }
                        }

                        if (!find)
                        {
                            _xmllist.Add(_xml);
                        }
                    }
                }
            }

            return(_list);
        }
Beispiel #8
0
        /// <summary>
        /// Links every direct child of the given node to it (via elementlinked_id)
        /// and returns the node's text and markup with the first occurrence of
        /// each layout-type child (DIV, TABLE, TBODY, TR, TD, FORM, CENTER) removed
        /// through StripOnlyFirstData.
        /// </summary>
        /// <param name="htmlelement">DOM node that is already registered in _ht and _list.</param>
        /// <returns>
        /// Four strings: [0] the node's index in _list, [1] remaining inner text,
        /// [2] remaining outer HTML, [3] remaining inner HTML.
        /// </returns>
        public string[] ExtractionofSubLayouts(IHTMLElement htmlelement)
        {
            int sourceKey = (int)htmlelement.sourceIndex;
            int selfIndex = (int)_ht[sourceKey];

            string outerRemainder = htmlelement.outerHTML;
            string textRemainder  = htmlelement.innerText;
            string innerRemainder = htmlelement.innerHTML;

            // Normalise line breaks; missing inner HTML / inner text become empty strings.
            outerRemainder = outerRemainder.Replace("\r\n", "");
            innerRemainder = innerRemainder != null ? innerRemainder.Replace("\r\n", "") : "";
            textRemainder  = textRemainder != null ? textRemainder.Replace("\r\n", " ").Trim() : "";

            foreach (IHTMLElement htmlchild in (IHTMLElementCollection)htmlelement.children)
            {
                if (htmlchild.outerHTML == null)
                {
                    continue;
                }

                // Record this node as the element the child is linked to.
                int     childKey   = (int)htmlchild.sourceIndex;
                int     childIndex = (int)_ht[childKey];
                element child      = (element)_list[childIndex];
                child.elementlinked_id = selfIndex;
                _list[childIndex]      = child;

                bool isLayoutTag = child.tagName == "DIV" ||
                                   child.tagName == "TABLE" || child.tagName == "TBODY" || child.tagName == "TR" || child.tagName == "TD" ||
                                   child.tagName == "FORM" || child.tagName == "CENTER";

                if (!isLayoutTag || outerRemainder == "")
                {
                    continue;
                }

                // Original note: a plain Replace would clear every occurrence, hence
                // the dedicated helper for bag of words, outer HTML and inner HTML.
                textRemainder  = StripOnlyFirstData(textRemainder, child.BagofWords);
                outerRemainder = StripOnlyFirstData(outerRemainder, child.outerHTML);
                innerRemainder = StripOnlyFirstData(innerRemainder, child.innerHTML);
            }

            return(new string[] { selfIndex.ToString(), textRemainder, outerRemainder, innerRemainder });
        }
Beispiel #9
0
        public ArrayList prepareDOM(string htmlContent2)
        {
            string htmlContent = htmlContent2;

            htmlContent = HTML.trim_commenttags(htmlContent);
            htmlContent = HTML.trimOptions(htmlContent);
            htmlContent = HTML.trimScript(htmlContent);
            htmlContent = HTML.trim_HREF_SCR(htmlContent);
            htmlContent = HTML.trim_some_cases(htmlContent);
            //for fast processing otherwise image, link, javascript loading...

            IHTMLDocument2 htmlDocument = new mshtml.HTMLDocumentClass();

            htmlDocument.write(htmlContent);

            IHTMLElementCollection allElements = htmlDocument.all;

            _ht      = new Hashtable();
            _list    = new ArrayList();
            _xmllist = new ArrayList();

            string _tempinner_text = "";

            if (htmlDocument.body != null)
            {
                if (htmlDocument.body.innerText != null)
                {
                    _tempinner_text = htmlDocument.body.innerText.Replace("\r\n", "");
                    domhtmlContent  = htmlDocument.body.outerHTML.Replace("\r\n", "");
                }
            }

            element _firstelement = AnalyzeGivenHTML(htmlDocument.body.innerHTML, _tempinner_text);

            all_words = _firstelement.BagofWords;
            int i = 0;

            foreach (IHTMLElement htmlelement in allElements)
            {
                if (htmlelement.outerHTML != null)
                {
                    element _element = new element();
                    _element.id        = i;
                    _element.outerHTML = htmlelement.outerHTML;
                    _element.outerHTML = _element.outerHTML.Replace("\r\n", "");
                    if (htmlelement.innerHTML != null)
                    {
                        _element.innerHTML = htmlelement.innerHTML;
                        _element.innerHTML = _element.innerHTML.Replace("\r\n", "");
                    }
                    else
                    {
                        _element.innerHTML = "";
                    }

                    if (_element.id == 0)
                    {
                        _element.elementlinked_id = -1;//root
                        savehtmlContent           = _element.outerHTML;
                        resulthmtlContent         = _element.outerHTML;
                    }
                    else
                    {
                        _element.elementlinked_id = 0;
                    }

                    if (htmlelement.tagName == "HTML")
                    {//html bazen geç geliyor...
                        savehtmlContent   = _element.outerHTML;
                        resulthmtlContent = _element.outerHTML;
                    }

                    string _str   = _element.outerHTML;
                    int    _start = _str.IndexOf('<');
                    int    _end   = _str.IndexOf('>');
                    _element.elementName = _str.Substring(_start, _end - _start + 1);

                    _element.tagName        = htmlelement.tagName;
                    _element.tag_id_Name    = "";
                    _element.tag_class_Name = "";
                    if (htmlelement.id != null)
                    {
                        _element.tag_id      = 1;
                        _element.tag_id_Name = htmlelement.id;
                    }

                    if (htmlelement.className != null)
                    {
                        _element.tag_class      = 1;
                        _element.tag_class_Name = htmlelement.className;
                    }

                    if (_element.tag_id != 1 || _element.tag_class != 1)
                    {
                        _element.tag_idORclass = 1;
                    }

                    string tempinner_text = htmlelement.innerText;
                    if (tempinner_text != null)
                    {
                        tempinner_text = tempinner_text.Replace("\r\n", " ");
                        tempinner_text = tempinner_text.Trim();
                    }
                    else
                    {
                        tempinner_text = "";
                    }

                    element _tempelement = AnalyzeGivenHTML(htmlelement.outerHTML, tempinner_text);
                    _element.BagofWords                = _tempelement.BagofWords;
                    _element.wordCount                 = _tempelement.wordCount;
                    _element.DensityinHTML             = (double)_element.wordCount / _firstelement.wordCount;
                    _element.LinkCount                 = _tempelement.LinkCount;
                    _element.wordCountinLink           = _tempelement.wordCountinLink;
                    _element.meanofWordinLinks         = _tempelement.meanofWordinLinks;
                    _element.meanofWordinLinksAllWords = _tempelement.meanofWordinLinksAllWords;
                    string temp_innerhtml_ = _element.innerHTML.ToUpper(new CultureInfo("en-US", false));    //for english words thus html tags
                    _element.dot_count    = webfilter.CountStringOccurrences(temp_innerhtml_, ".");
                    _element.h1_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H1");
                    _element.h2_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H2");
                    _element.h3_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H3");
                    _element.h4_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H4");
                    _element.h5_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H5");
                    _element.h6_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H6");
                    _element.img_count    = webfilter.CountStringOccurrences(temp_innerhtml_, "<IMG");
                    _element.p_count      = webfilter.CountStringOccurrences(temp_innerhtml_, "<P");
                    _element.br_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<BR");
                    _element.span_count   = webfilter.CountStringOccurrences(temp_innerhtml_, "<SPAN");
                    _element.object_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<OBJECT");
                    _element.ul_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<UL");
                    _element.li_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<LI");
                    _element.input_count  = webfilter.CountStringOccurrences(temp_innerhtml_, "<INPUT")
                                            + webfilter.CountStringOccurrences(temp_innerhtml_, "<BUTTON")
                                            + webfilter.CountStringOccurrences(temp_innerhtml_, "<LABEL");
                    _element.div_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<DIV");
                    _element.td_count  = webfilter.CountStringOccurrences(temp_innerhtml_, "<TD");

                    _element.parent_elementName = "";

                    //sim control
                    //-1 : not available for sim control
                    //0  : similar
                    //0..1: similarity degree
                    //1  : not similar
                    _element.sim_bagofword    = -1;
                    _element.sim_bagofword_AE = -1;
                    _element.sim_innerHTML    = -1;
                    _element.sim_innerHTML_AE = -1;


                    _list.Add(_element);

                    int key = (int)htmlelement.sourceIndex;//for fast searching
                    _ht.Add(key, i);
                    i++;
                }
            }

            foreach (IHTMLElement htmlelement in allElements)
            {
                if (htmlelement.outerHTML != null)
                {
                    string[] _sonuclar      = ExtractionofSubLayouts(htmlelement);
                    string   tempinner_text = _sonuclar[1];
                    string   tempOuterHTML  = _sonuclar[2];
                    string   tempinnerHTML  = _sonuclar[3];
                    string   str_i          = _sonuclar[0];

                    i = Convert.ToInt32(str_i);

                    //After Extraction
                    element _element     = (element)_list[i];
                    element _tempelement = AnalyzeGivenHTML_AE(tempOuterHTML, tempinner_text);

                    if (_element.elementlinked_id > 0)
                    {
                        element _p_element = (element)_list[_element.elementlinked_id];
                        _element.parent_elementName = _p_element.elementName;
                    }

                    if (_element.tagName == "DIV" || _element.tagName == "TD" || _element.tagName == "UL" ||
                        _element.tagName == "H1" || _element.tagName == "H2" || _element.tagName == "H3" ||
                        _element.tagName == "H4" || _element.tagName == "H5" || _element.tagName == "H6" ||
                        _element.tagName == "SPAN" || _element.tagName == "B" || _element.tagName == "STRONG" ||
                        _element.tagName == "P")
                    {
                        _element.outerHTML_AE                 = tempOuterHTML;
                        _element.innerHTML_AE                 = tempinnerHTML;
                        _element.BagofWords_AE                = _tempelement.BagofWords_AE;
                        _element.wordCount_AE                 = _tempelement.wordCount_AE;
                        _element.DensityinHTML_AE             = (double)_element.wordCount_AE / _firstelement.wordCount;
                        _element.LinkCount_AE                 = _tempelement.LinkCount_AE;
                        _element.wordCountinLink_AE           = _tempelement.wordCountinLink_AE;
                        _element.meanofWordinLinks_AE         = _tempelement.meanofWordinLinks_AE;
                        _element.meanofWordinLinksAllWords_AE = _tempelement.meanofWordinLinksAllWords_AE;
                        string temp_innerhtml_AE = _element.innerHTML_AE.ToUpper(new CultureInfo("en-US", false));//for english words thus html tags
                        _element.dot_count_AE    = webfilter.CountStringOccurrences(temp_innerhtml_AE, ".");
                        _element.h1_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H1");
                        _element.h2_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H2");
                        _element.h3_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H3");
                        _element.h4_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H4");
                        _element.h5_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H5");
                        _element.h6_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H6");
                        _element.img_count_AE    = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<IMG");
                        _element.p_count_AE      = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<P");
                        _element.br_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<BR");
                        _element.span_count_AE   = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<SPAN");
                        _element.object_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<OBJECT");
                        _element.ul_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<UL");
                        _element.li_count_AE     = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<LI");
                        _element.input_count_AE  = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<INPUT")
                                                   + webfilter.CountStringOccurrences(temp_innerhtml_AE, "<BUTTON")
                                                   + webfilter.CountStringOccurrences(temp_innerhtml_AE, "<LABEL");
                        _element.div_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<DIV");
                        _element.td_count_AE  = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<TD");

                        if (_element.wordCount_AE > _element.wordCount)
                        {
                            _element.wordCount_AE = _element.wordCount; //istisnayi durum scriptler sorun olduğu için nadir bir durum...
                        }
                        //etiketin tekrar sayısı
                        //_element.repeat_tag_count = webfilter.CountStringOccurrences(htmlDocument.body.innerHTML, _element.elementName);
                        int benzertagsayisi = 0;
                        for (int k = 0; k < _list.Count; k++)
                        {
                            element _e1 = (element)_list[k];
                            if (_element.elementName == _e1.elementName)
                            {
                                benzertagsayisi++;
                            }
                        }
                        _element.repeat_tag_count = benzertagsayisi;
                    }
                    _list[i] = _element;
                } // if not null
            }     //for each

            return(_list);
        }
Beispiel #10
0
        /// <summary>
        /// Walks every descendant node of <paramref name="htmlDocument"/> and, for each
        /// <c>div</c>, <c>td</c> or <c>li</c> node whose text is not empty, builds an
        /// <c>element</c> record (raw HTML, word/link statistics, per-tag occurrence counts and
        /// the "after extraction" (_AE) fields) and appends it to <c>_list</c>.
        /// Also stores the whole-document bag of words in <c>all_words</c> and updates
        /// <c>_ht_tag_count</c> with how many times each cleaned opening tag was seen.
        /// </summary>
        /// <param name="htmlDocument">Parsed document whose descendants are scanned.</param>
        /// <param name="_firstelement">
        /// Statistics of the whole document; its <c>BagofWords</c> is copied to <c>all_words</c>
        /// and its <c>wordCount</c> is the denominator of each node's <c>DensityinHTML</c>.
        /// </param>
        /// <remarks>
        /// The scan is best-effort: any exception stops the loop, keeps the elements collected
        /// so far, and is reported through <c>System.Diagnostics.Trace</c> instead of being
        /// rethrown.
        /// </remarks>
        public void AnalyzeProcess(HtmlAgilityPack.HtmlDocument htmlDocument, element _firstelement)
        {
            all_words = _firstelement.BagofWords;
            try
            {
                // Tag names are matched in upper case; a fixed English culture is used so the
                // casing of the markup does not depend on the machine's current culture.
                // Created once instead of once per node.
                CultureInfo tagCulture = new CultureInfo("en-US", false);

                int i = 0;
                foreach (HtmlAgilityPack.HtmlNode node in htmlDocument.DocumentNode.Descendants())
                {
                    if (node.Name != "div" && node.Name != "td" && node.Name != "li")
                    {
                        continue;
                    }

                    string innerText = node.InnerText.Replace("\r\n", " ").Trim();
                    if (innerText == "")
                    {
                        continue; // nodes without text are not recorded
                    }

                    element _element = new element();
                    _element.outerHTML = node.OuterHtml.Replace("\r\n", " ").Trim();
                    _element.innerHTML = node.InnerHtml.Replace("\r\n", " ").Trim();
                    _element.id        = i;
                    i++;

                    // Cut out the opening tag ("<...>") and clean it with String_Decimal_Clear;
                    // the result is the key used for the repeat count.
                    int    _start  = _element.outerHTML.IndexOf('<');
                    int    _end    = _element.outerHTML.IndexOf('>');
                    string temp_to = _element.outerHTML.Substring(_start, _end - _start + 1).Trim();
                    _element.tagName_Orginal = webfilter.String_Decimal_Clear(temp_to);

                    // The repeat count is being calculated: one lookup instead of
                    // ContainsKey + indexer. TryGetValue leaves _cnt at 0 for an unseen key.
                    int _cnt;
                    _ht_tag_count.TryGetValue(_element.tagName_Orginal, out _cnt);
                    _ht_tag_count[_element.tagName_Orginal] = _cnt + 1;

                    _element.tagName = node.OriginalName;
                    _element.xPath   = node.XPath;

                    element _tempelement = AnalyzeGivenHTML(_element.outerHTML, innerText);
                    _element.BagofWords = _tempelement.BagofWords;
                    _element.wordCount  = _tempelement.wordCount;
                    // Guard the ratio: with a zero document word count the plain division
                    // would yield NaN or Infinity.
                    _element.DensityinHTML = _firstelement.wordCount == 0
                        ? 0.0
                        : (double)_element.wordCount / _firstelement.wordCount;
                    _element.LinkCount                 = _tempelement.LinkCount;
                    _element.wordCountinLink           = _tempelement.wordCountinLink;
                    _element.meanofWordinLinks         = _tempelement.meanofWordinLinks;
                    _element.meanofWordinLinksAllWords = _tempelement.meanofWordinLinksAllWords;

                    // Occurrence counts are taken on the upper-cased inner HTML.
                    // NOTE(review): the needles are tag prefixes (e.g. "<P", "<LI"); if
                    // CountStringOccurrences is a plain substring count they would also match
                    // longer tag names such as <PRE> or <LINK> - confirm.
                    string temp_innerhtml_ = _element.innerHTML.ToUpper(tagCulture);
                    _element.dot_count    = webfilter.CountStringOccurrences(temp_innerhtml_, ".");
                    _element.h1_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H1");
                    _element.h2_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H2");
                    _element.h3_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H3");
                    _element.h4_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H4");
                    _element.h5_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H5");
                    _element.h6_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<H6");
                    _element.img_count    = webfilter.CountStringOccurrences(temp_innerhtml_, "<IMG");
                    _element.p_count      = webfilter.CountStringOccurrences(temp_innerhtml_, "<P");
                    _element.br_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<BR");
                    _element.span_count   = webfilter.CountStringOccurrences(temp_innerhtml_, "<SPAN");
                    _element.object_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<OBJECT");
                    _element.ul_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<UL");
                    _element.li_count     = webfilter.CountStringOccurrences(temp_innerhtml_, "<LI");
                    // Form controls are reported as a single combined figure.
                    _element.input_count  = webfilter.CountStringOccurrences(temp_innerhtml_, "<INPUT")
                                            + webfilter.CountStringOccurrences(temp_innerhtml_, "<BUTTON")
                                            + webfilter.CountStringOccurrences(temp_innerhtml_, "<LABEL");
                    _element.div_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<DIV");
                    _element.td_count  = webfilter.CountStringOccurrences(temp_innerhtml_, "<TD");

                    string[] _sonuclar      = ExtractionofSubLayouts(node);
                    string   tempinner_text = _sonuclar[0];
                    string   tempOuterHTML  = _sonuclar[1];
                    string   tempinnerHTML  = _sonuclar[2];

                    // NOTE(review): going by the local names, outer and inner look swapped in
                    // the next two assignments. ExtractionofSubLayouts is not visible here, so
                    // the original assignment is kept unchanged - confirm which index holds what.
                    _element.outerHTML_AE  = tempinnerHTML.Trim();
                    _element.innerHTML_AE  = tempOuterHTML.Trim();
                    _element.BagofWords_AE = tempinner_text.Trim();

                    // After extraction: only analysed when some text is left.
                    if (_element.BagofWords_AE != "")
                    {
                        AnalyzeGivenHTML_AE(_element.outerHTML_AE, _element.innerHTML_AE, ref _element);
                    }

                    _list.Add(_element);
                }
            }
            catch (System.Exception ex)
            {
                // Still best-effort (no rethrow), but no longer silent: _list may be partial.
                System.Diagnostics.Trace.TraceError("AnalyzeProcess stopped early: " + ex);
            }
        }