/// <summary>
/// Cleans the given HTML, parses it with HtmlAgilityPack, fills <c>_list</c> through
/// <c>AnalyzeProcess</c>, and finally copies the per-tag occurrence counters from
/// <c>_ht_tag_count</c> onto every collected element.
/// </summary>
/// <param name="htmlContent2">Raw HTML text of the page.</param>
public void prepareDOM(string htmlContent2)
{
    // Pre-clean the markup with the HTML.trim_* helpers before parsing
    // (original note: for fast processing, otherwise image, link, javascript loading...).
    string htmlContent = htmlContent2;
    htmlContent = HTML.trim_commenttags(htmlContent);
    htmlContent = HTML.trimOptions(htmlContent);
    htmlContent = HTML.trimScript(htmlContent);
    htmlContent = HTML.trim_HREF_SCR(htmlContent);
    htmlContent = HTML.trim_some_cases(htmlContent);

    HtmlAgilityPack.HtmlDocument htmlDocument = new HtmlAgilityPack.HtmlDocument();
    htmlDocument.LoadHtml(htmlContent);

    _list = new List<element>();
    _ht_tag_count = new Dictionary<string, int>();

    // SelectSingleNode yields null when the markup has no <body>; fall back to the
    // document root instead of failing with a NullReferenceException.
    HtmlAgilityPack.HtmlNode body = htmlDocument.DocumentNode.SelectSingleNode("//body");
    if (body == null)
    {
        body = htmlDocument.DocumentNode;
    }

    // Page-level statistics; AnalyzeProcess relates every element to them.
    element _firstelement = AnalyzeGivenHTML(
        body.InnerHtml.Replace("\r\n", "").Trim(),
        body.InnerText.Replace("\r\n", "").Trim());

    AnalyzeProcess(htmlDocument, _firstelement);

    // Transfer the counters: each element gets the number of times its tag was seen.
    // The element is read, changed and stored back, as the rest of the file does.
    for (int i = 0; i < _list.Count; i++)
    {
        element _e = (element)_list[i];
        _e.repeat_tag_count = (int)_ht_tag_count[_e.tagName_Orginal];
        _list[i] = _e;
    }
}
//prepare information for a given html
/// <summary>
/// Builds an <c>element</c> holding the word and link statistics of an HTML fragment.
/// </summary>
/// <param name="html_content">Markup that is scanned for anchors and click handlers.</param>
/// <param name="inner_text">Plain text of the same fragment; stored as the bag of words.</param>
/// <returns>
/// A new element with BagofWords, wordCount, LinkCount, wordCountinLink,
/// meanofWordinLinks and meanofWordinLinksAllWords filled in.
/// </returns>
public element AnalyzeGivenHTML(string html_content, string inner_text)
{
    element _element = new element();
    _element.BagofWords = inner_text;
    _element.wordCount = HTML.WordsCountGivenText(_element.BagofWords);

    // Capture the text between an href attribute and the closing </a.
    string pattern = "href=.*?>(.*?)</a";
    Regex exp = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);
    MatchCollection matchList = exp.Matches(html_content);

    // Collect the anchor texts, each one prefixed with a single blank.
    System.Text.StringBuilder linkText = new System.Text.StringBuilder();
    for (int i = 0; i < matchList.Count; i++)
    {
        Match match = matchList[i];
        if (match.Value.Length > 0)
        {
            linkText.Append(" ").Append(HTML.stripHtml(match.Groups[1].Value));
        }
    }

    // Links are counted by opening anchor tags (both letter cases) plus onclick handlers
    // (original note: interesting javascript with link).
    int count_link = webfilter.CountStringOccurrences(html_content, "<a ");
    count_link = count_link + webfilter.CountStringOccurrences(html_content, "<A ");
    count_link = count_link + webfilter.CountStringOccurrences(html_content, "onclick=");

    _element.LinkCount = count_link;
    _element.wordCountinLink = HTML.WordsCountGivenText(linkText.ToString());

    // Both ratios are defined as 0 when their denominator is 0.
    if (_element.LinkCount != 0)
    {
        _element.meanofWordinLinks = (double)_element.wordCountinLink / _element.LinkCount;
    }
    else
    {
        _element.meanofWordinLinks = 0;
    }

    if (_element.wordCount != 0)
    {
        _element.meanofWordinLinksAllWords = (double)_element.wordCountinLink / _element.wordCount;
    }
    else
    {
        _element.meanofWordinLinksAllWords = 0;
    }

    return (_element);
}
//the values passed by ref are updated inside the two DOMs
/// <summary>
/// Pairs DIV elements of two DOMs and stores their cosine similarities on both sides.
/// For the element at index i of the first DOM the second DOM is scanned from index i on;
/// the first DIV with the same tag, id and class (and the same tag, id and class on the
/// linked element) is taken and the scan for that i stops.
/// </summary>
/// <param name="_dom1">First DOM; matched entries of its <c>_list</c> are overwritten.</param>
/// <param name="_dom2">Second DOM; matched entries of its <c>_list</c> are overwritten.</param>
public static void prepareDOMSim(ref DOM _dom1, ref DOM _dom2)
{
    for (int left = 0; left < _dom1._list.Count; left++)
    {
        for (int right = left; right < _dom2._list.Count; right++)
        {
            element first = (element)_dom1._list[left];
            element second = (element)_dom2._list[right];

            // Only DIV against DIV is compared.
            if (first.tagName != "DIV" || second.tagName != "DIV")
            {
                continue;
            }

            // Linked elements stay at their defaults unless both sides carry a link index.
            element firstLinked = new element();
            element secondLinked = new element();
            if (first.elementlinked_id >= 0 && second.elementlinked_id >= 0)
            {
                firstLinked = (element)_dom1._list[first.elementlinked_id];
                secondLinked = (element)_dom2._list[second.elementlinked_id];
            }

            // An element without class and without id is never matched.
            if (first.tag_class_Name == "" && first.tag_id_Name == "")
            {
                continue;
            }

            bool sameElement = first.tagName == second.tagName
                && first.tag_id_Name == second.tag_id_Name
                && first.tag_class_Name == second.tag_class_Name;
            if (!sameElement)
            {
                continue;
            }

            bool sameLinkedElement = firstLinked.tagName == secondLinked.tagName
                && firstLinked.tag_id_Name == secondLinked.tag_id_Name
                && firstLinked.tag_class_Name == secondLinked.tag_class_Name;
            if (!sameLinkedElement)
            {
                continue;
            }

            // Similarities are rounded to five decimals and mirrored onto the second element.
            first.sim_bagofword = Math.Round(similarity.Cossine_Similarity(first.BagofWords, second.BagofWords), 5);
            first.sim_bagofword_AE = Math.Round(similarity.Cossine_Similarity(first.BagofWords_AE, second.BagofWords_AE), 5);
            first.sim_innerHTML = Math.Round(similarity.Cossine_Similarity(first.innerHTML, second.innerHTML), 5);
            first.sim_innerHTML_AE = Math.Round(similarity.Cossine_Similarity(first.innerHTML_AE, second.innerHTML_AE), 5);

            second.sim_bagofword = first.sim_bagofword;
            second.sim_bagofword_AE = first.sim_bagofword_AE;
            second.sim_innerHTML = first.sim_innerHTML;
            second.sim_innerHTML_AE = first.sim_innerHTML_AE;

            _dom1._list[left] = first;
            _dom2._list[right] = second;
            break; // first match wins for this element of _dom1
        }
    }
}
//prepare information for a given html (the "_AE" variant of the statistics)
/// <summary>
/// Builds an <c>element</c> whose <c>_AE</c> fields hold the word and link statistics
/// of an HTML fragment.
/// </summary>
/// <param name="html_content">Markup that is scanned for anchors.</param>
/// <param name="inner_text">Plain text of the same fragment; stored as BagofWords_AE.</param>
/// <returns>
/// A new element with BagofWords_AE, wordCount_AE, LinkCount_AE, wordCountinLink_AE,
/// meanofWordinLinks_AE and meanofWordinLinksAllWords_AE filled in.
/// </returns>
public element AnalyzeGivenHTML_AE(string html_content, string inner_text)
{
    element _element = new element();
    _element.BagofWords_AE = inner_text;
    _element.wordCount_AE = HTML.WordsCountGivenText(_element.BagofWords_AE);

    // Capture the text between an href attribute and the closing </a.
    string pattern = "href=.*?>(.*?)</a";
    Regex exp = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);
    MatchCollection matchList = exp.Matches(html_content);

    // Collect the anchor texts, each one prefixed with a single blank.
    System.Text.StringBuilder linkText = new System.Text.StringBuilder();
    for (int i = 0; i < matchList.Count; i++)
    {
        Match match = matchList[i];
        if (match.Value.Length > 0)
        {
            linkText.Append(" ").Append(HTML.stripHtml(match.Groups[1].Value));
        }
    }

    // Here the link count is the number of regex matches.
    _element.LinkCount_AE = matchList.Count;
    _element.wordCountinLink_AE = HTML.WordsCountGivenText(linkText.ToString());

    // Both ratios are defined as 0 when their denominator is 0.
    if (_element.LinkCount_AE != 0)
    {
        _element.meanofWordinLinks_AE = (double)_element.wordCountinLink_AE / _element.LinkCount_AE;
    }
    else
    {
        _element.meanofWordinLinks_AE = 0;
    }

    if (_element.wordCount_AE != 0)
    {
        _element.meanofWordinLinksAllWords_AE = (double)_element.wordCountinLink_AE / _element.wordCount_AE;
    }
    else
    {
        // Reset the _AE field; the non-_AE field was being written here by mistake.
        _element.meanofWordinLinksAllWords_AE = 0;
    }

    return (_element);
}
/// <summary>
/// Concatenates the <c>outerHTML_AE</c> of every element that the decision class
/// accepts for the "links" class.
/// </summary>
/// <param name="_list">Elements to test; may be null.</param>
/// <returns>
/// The accepted fragments, each preceded by one blank, or an empty string when
/// <paramref name="_list"/> is null or nothing is accepted.
/// </returns>
public string extract_links_by_ML(ArrayList _list)
{
    if (_list == null)
    {
        return ("");
    }

    // Hand the list to the decision class before asking it about single elements.
    HTMLMarkerClass.desicionClass._list = _list;

    string collected = "";
    for (int index = 0; index < _list.Count; index++)
    {
        HTMLMarkerClass.element candidate = (HTMLMarkerClass.element)_list[index];
        if (!HTMLMarkerClass.desicionClass.write_or_not("links", candidate))
        {
            continue;
        }

        collected = collected + " " + candidate.outerHTML_AE;
    }

    return (collected);
}
//dom to xml: main, headline, summary, additional
/// <summary>
/// Collects the <c>outerHTML_AE</c> of every element that the decision class accepts
/// for the requested class name.
/// </summary>
/// <param name="_class">Class name passed on to <c>write_or_not</c>.</param>
/// <param name="_list">Elements to test; may be null.</param>
/// <returns>A new list with the accepted fragments; empty when <paramref name="_list"/> is null.</returns>
private ArrayList PrepareContent(string _class, ArrayList _list)
{
    ArrayList accepted = new ArrayList();
    if (_list == null)
    {
        return (accepted);
    }

    // Hand the list to the decision class before asking it about single elements.
    HTMLMarkerClass.desicionClass._list = _list;

    for (int index = 0; index < _list.Count; index++)
    {
        HTMLMarkerClass.element candidate = (HTMLMarkerClass.element)_list[index];
        bool keep = HTMLMarkerClass.desicionClass.write_or_not(_class, candidate);
        if (keep)
        {
            accepted.Add(candidate.outerHTML_AE);
        }
    }

    return (accepted);
}
/// <summary>
/// Loads the HTML into an mshtml document and builds <c>_list</c> with one
/// <c>element</c> per DOM node in three passes:
/// 1) raw statistics per node, 2) "_AE" statistics after the nested layout children
/// are stripped plus the relevant flag, 3) predicted class names and the
/// de-duplicated <c>_xmllist</c>.
/// </summary>
/// <param name="htmlContent">Raw HTML text of the page.</param>
/// <returns>The filled <c>_list</c>.</returns>
public ArrayList prepareDOM(string htmlContent)
{
    htmlContent = HTML.trimOptions(htmlContent);
    htmlContent = HTML.trimScript(htmlContent);

    IHTMLDocument2 htmlDocument = new mshtml.HTMLDocumentClass();
    htmlDocument.write(htmlContent);
    IHTMLElementCollection allElements = htmlDocument.all;

    _ht = new Hashtable();
    _list = new ArrayList();
    _xmllist = new ArrayList();

    string _tempinner_text = "";
    if (htmlDocument.body != null)
    {
        if (htmlDocument.body.innerText != null)
        {
            _tempinner_text = htmlDocument.body.innerText.Replace("\r\n", "");
        }
    }

    // Page-level statistics; every element density below is relative to its word count.
    element _firstelement = AnalyzeGivenHTML(htmlContent, _tempinner_text);

    // ---- Pass 1: one element per DOM node, indexed by sourceIndex in _ht ----
    int i = 0;
    foreach (IHTMLElement htmlelement in allElements)
    {
        if (htmlelement.outerHTML != null)
        {
            element _element = new element();
            _element.id = i;
            _element.outerHTML = htmlelement.outerHTML;
            _element.outerHTML = _element.outerHTML.Replace("\r\n", "");

            if (htmlelement.innerHTML != null)
            {
                _element.innerHTML = htmlelement.innerHTML;
                _element.innerHTML = _element.innerHTML.Replace("\r\n", "");
            }
            else
            {
                _element.innerHTML = "";
            }

            if (_element.id == 0)
            {
                _element.elementlinked_id = -1; //root
                savehtmlContent = _element.outerHTML;
                resulthmtlContent = _element.outerHTML;
            }
            else
            {
                // Provisional parent; ExtractionofSubLayouts sets the real one in pass 2.
                _element.elementlinked_id = 0;
            }

            if (htmlelement.tagName == "HTML")
            {
                //the HTML element sometimes arrives late
                savehtmlContent = _element.outerHTML;
                resulthmtlContent = _element.outerHTML;
            }

            // elementName is the opening tag, from the first '<' to the first '>'.
            string _str = _element.outerHTML;
            int _start = _str.IndexOf('<');
            int _end = _str.IndexOf('>');
            _element.elementName = _str.Substring(_start, _end - _start + 1);

            //<!--className::(.*?)--> must be the marker right after the opening tag,
            //it must not be picked up from a later position
            _start = _str.IndexOf("<!--className::");
            _end = _str.IndexOf("-->");
            if (_start >= 0)
            {
                if (_start == _element.elementName.Length)
                {
                    // 15 is the length of "<!--className::".
                    _start = 15 + _element.elementName.Length;
                    if (_end - _start > 0)
                    {
                        _element.className = _str.Substring(_start, _end - _start);
                    }
                }
            }

            _element.tagName = htmlelement.tagName;
            if (htmlelement.id != null)
            {
                _element.tag_id = 1;
            }
            if (htmlelement.className != null)
            {
                _element.tag_class = 1;
            }

            // 1 when the node has an id or a class. The sum must use tag_class; the field
            // was previously added to itself, so the class flag was never counted.
            _element.tag_idORclass = _element.tag_id + _element.tag_class;
            if (_element.tag_idORclass == 2)
            {
                _element.tag_idORclass = 1;
            }

            string tempinner_text = htmlelement.innerText;
            if (tempinner_text != null)
            {
                tempinner_text = tempinner_text.Replace("\r\n", " ");
                tempinner_text = tempinner_text.Trim();
            }
            else
            {
                tempinner_text = "";
            }

            element _tempelement = AnalyzeGivenHTML(htmlelement.outerHTML, tempinner_text);
            _element.BagofWords = _tempelement.BagofWords;
            _element.wordCount = _tempelement.wordCount;
            _element.DensityinHTML = (double)_element.wordCount / _firstelement.wordCount;
            _element.LinkCount = _tempelement.LinkCount;
            _element.wordCountinLink = _tempelement.wordCountinLink;
            _element.meanofWordinLinks = _tempelement.meanofWordinLinks;
            _element.meanofWordinLinksAllWords = _tempelement.meanofWordinLinksAllWords;

            _element.similarity_with_other_web_page = 1;
            _element.relevant = false;
            _element.parent_elementName = "";
            _list.Add(_element);

            int key = (int)htmlelement.sourceIndex; //for fast searching
            _ht.Add(key, i);
            i++;
        }
    }

    // ---- Pass 2: statistics after extraction of nested layouts, relevant flag ----
    foreach (IHTMLElement htmlelement in allElements)
    {
        if (htmlelement.outerHTML != null)
        {
            // Result layout: [0] list index, [1] inner text, [2] outer HTML, [3] inner HTML.
            string[] _sonuclar = ExtractionofSubLayouts(htmlelement);
            string tempinner_text = _sonuclar[1];
            string tempOuterHTML = _sonuclar[2];
            string tempinnerHTML = _sonuclar[3];
            string str_i = _sonuclar[0];
            i = Convert.ToInt32(str_i);

            //After Extraction
            element _element = (element)_list[i];
            element _tempelement = AnalyzeGivenHTML_AE(tempOuterHTML, tempinner_text);
            _element.outerHTML_AE = tempOuterHTML;
            _element.innerHTML_AE = tempinnerHTML;
            _element.BagofWords_AE = _tempelement.BagofWords_AE;
            _element.wordCount_AE = _tempelement.wordCount_AE;
            _element.DensityinHTML_AE = (double)_element.wordCount_AE / _firstelement.wordCount;
            _element.LinkCount_AE = _tempelement.LinkCount_AE;
            _element.wordCountinLink_AE = _tempelement.wordCountinLink_AE;
            _element.meanofWordinLinks_AE = _tempelement.meanofWordinLinks_AE;
            _element.meanofWordinLinksAllWords_AE = _tempelement.meanofWordinLinksAllWords_AE;

            //dot at the end of the content
            if (htmlelement.innerText != null)
            {
                if (htmlelement.innerText.Trim() != "")
                {
                    if (htmlelement.innerText[htmlelement.innerText.Length - 1] == '.')
                    {
                        _element.dot_endofstence = 1;
                    }
                    else
                    {
                        _element.dot_endofstence = 0;
                    }
                }
            }

            if (_element.wordCount_AE > _element.wordCount)
            {
                //exceptional and rare case, caused by scripts
                _element.wordCount_AE = _element.wordCount;
            }

            _list[i] = _element;

            if (htmlelement.tagName == "DIV" || htmlelement.tagName == "TD")
            {
                element _e = (element)_list[i];
                // NOTE(review): this flag is only stored back when the layout check below
                // succeeds - confirm that is intended.
                if (_e.elementName.Contains("vAlign=bot"))
                {
                    _e.relevant = true;
                }

                bool _decision = HTMLMarkerClass.desicionClass.determineIrrevelantLayout(_element);
                if (_decision == false)
                {
                    _e.relevant = true;
                    _list[i] = _e;

                    //Update child elements
                    for (int m = 0; m < _list.Count; m++)
                    {
                        element _et = (element)_list[m];
                        if (_et.elementlinked_id == _e.id)
                        {
                            //no decision is taken here for DIV and TD children
                            if (_et.tagName != "DIV")
                            {
                                if (_et.tagName != "TD")
                                {
                                    _et.relevant = true;
                                    _et.parent_elementName = _e.elementName;
                                    _list[m] = _et;
                                }
                            }
                        }
                    } //for m
                } //decision == false
            } //if div or td
        } //if not null
    } //for each

    // ---- Pass 3: predicted class names and the xml list ----
    for (int m = 0; m < _list.Count; m++)
    {
        element _element = (element)_list[m];
        if (_element.relevant == true)
        {
            if (_element.tagName == "DIV" || _element.tagName == "TD")
            {
                bool _mainlayout = HTMLMarkerClass.desicionClass.determineLayout(_element);
                if (_mainlayout)
                {
                    _element.predicted_className = "MAIN";
                }
                else
                {
                    _element.predicted_className = HTMLMarkerClass.desicionClass.determineHEADLINE_INFORMATION(_element);
                }
            }
            else
            {
                // Only DIV and TD may be MAIN; other tags are downgraded.
                _element.predicted_className = HTMLMarkerClass.desicionClass.determineHEADLINE_INFORMATION(_element);
                if (_element.predicted_className == "MAIN")
                {
                    _element.predicted_className = "IRRELEVANT";
                }
            }

            // The prediction is stored only for elements whose cleaned text is not empty.
            if (clear_illegal_characters_for_XML(_element.BagofWords_AE.Trim()).Trim() != "")
            {
                _list[m] = _element;

                xml_elemet _xml = new xml_elemet();
                _xml.elementName = _element.elementName;
                _xml.content = _element.BagofWords_AE;
                _xml.predicted_className = _element.predicted_className;
                _xml.parent_elementName = _element.parent_elementName;

                //equal content in _xmllist: append the element name instead of adding a duplicate
                bool find = false;
                for (int v = 0; v < _xmllist.Count; v++)
                {
                    xml_elemet item = (xml_elemet)_xmllist[v];
                    if (item.content == _xml.content)
                    {
                        item.elementName = item.elementName + ", " + _element.elementName;
                        _xmllist[v] = item;
                        find = true;
                    }
                }

                if (!find)
                {
                    _xmllist.Add(_xml);
                }
            }
        }
    }

    return (_list);
}
/// <summary>
/// Links every direct child of the given node to it in <c>_list</c> and returns the
/// node's text and markup with the first occurrence of each layout child
/// (DIV, TABLE, TBODY, TR, TD, FORM, CENTER) stripped out.
/// </summary>
/// <param name="htmlelement">Node whose <c>sourceIndex</c> is already registered in <c>_ht</c>.</param>
/// <returns>
/// Four strings: [0] index of the node in <c>_list</c>, [1] remaining inner text,
/// [2] remaining outer HTML, [3] remaining inner HTML.
/// </returns>
public string[] ExtractionofSubLayouts(IHTMLElement htmlelement)
{
    int ownKey = (int)htmlelement.sourceIndex;
    int ownIndex = (int)_ht[ownKey];

    string remainingOuter = htmlelement.outerHTML;
    string remainingText = htmlelement.innerText;
    string remainingInner = htmlelement.innerHTML;

    // Normalise line breaks; missing inner values become empty strings.
    remainingOuter = remainingOuter.Replace("\r\n", "");
    remainingInner = (remainingInner != null) ? remainingInner.Replace("\r\n", "") : "";
    remainingText = (remainingText != null) ? remainingText.Replace("\r\n", " ").Trim() : "";

    foreach (IHTMLElement htmlchild in (IHTMLElementCollection)htmlelement.children)
    {
        if (htmlchild.outerHTML == null)
        {
            continue;
        }

        // Record this node as the parent of the child.
        int childKey = (int)htmlchild.sourceIndex;
        int childIndex = (int)_ht[childKey];
        element child = (element)_list[childIndex];
        child.elementlinked_id = ownIndex;
        _list[childIndex] = child;

        bool isLayoutTag;
        switch (child.tagName)
        {
            case "DIV":
            case "TABLE":
            case "TBODY":
            case "TR":
            case "TD":
            case "FORM":
            case "CENTER":
                isLayoutTag = true;
                break;
            default:
                isLayoutTag = false;
                break;
        }

        if (isLayoutTag && remainingOuter != "")
        {
            // A plain Replace would remove every occurrence, so only the first one is stripped.
            remainingText = StripOnlyFirstData(remainingText, child.BagofWords);
            remainingOuter = StripOnlyFirstData(remainingOuter, child.outerHTML);
            remainingInner = StripOnlyFirstData(remainingInner, child.innerHTML);
        }
    }

    return (new string[] { ownIndex.ToString(), remainingText, remainingOuter, remainingInner });
}
/// <summary>
/// Cleans the HTML, loads it into an mshtml document and builds <c>_list</c> with one
/// <c>element</c> per DOM node in two passes: 1) raw statistics and tag counters per
/// node, 2) "_AE" statistics after the nested layout children are stripped, the
/// parent name and the repeat count of the opening tag.
/// </summary>
/// <param name="htmlContent2">Raw HTML text of the page.</param>
/// <returns>The filled <c>_list</c>.</returns>
public ArrayList prepareDOM(string htmlContent2)
{
    // Pre-clean the markup with the HTML.trim_* helpers before parsing
    // (original note: for fast processing, otherwise image, link, javascript loading...).
    string htmlContent = htmlContent2;
    htmlContent = HTML.trim_commenttags(htmlContent);
    htmlContent = HTML.trimOptions(htmlContent);
    htmlContent = HTML.trimScript(htmlContent);
    htmlContent = HTML.trim_HREF_SCR(htmlContent);
    htmlContent = HTML.trim_some_cases(htmlContent);

    IHTMLDocument2 htmlDocument = new mshtml.HTMLDocumentClass();
    htmlDocument.write(htmlContent);
    IHTMLElementCollection allElements = htmlDocument.all;

    _ht = new Hashtable();
    _list = new ArrayList();
    _xmllist = new ArrayList();

    string _tempinner_text = "";
    string bodyInnerHtml = "";
    if (htmlDocument.body != null)
    {
        if (htmlDocument.body.innerText != null)
        {
            _tempinner_text = htmlDocument.body.innerText.Replace("\r\n", "");
            domhtmlContent = htmlDocument.body.outerHTML.Replace("\r\n", "");
        }

        if (htmlDocument.body.innerHTML != null)
        {
            bodyInnerHtml = htmlDocument.body.innerHTML;
        }
    }

    // The body was dereferenced here without a null check; a document without a body
    // (or with an empty one) is now analysed as empty markup instead of throwing.
    element _firstelement = AnalyzeGivenHTML(bodyInnerHtml, _tempinner_text);
    all_words = _firstelement.BagofWords;

    // ---- Pass 1: one element per DOM node, indexed by sourceIndex in _ht ----
    int i = 0;
    foreach (IHTMLElement htmlelement in allElements)
    {
        if (htmlelement.outerHTML != null)
        {
            element _element = new element();
            _element.id = i;
            _element.outerHTML = htmlelement.outerHTML;
            _element.outerHTML = _element.outerHTML.Replace("\r\n", "");

            if (htmlelement.innerHTML != null)
            {
                _element.innerHTML = htmlelement.innerHTML;
                _element.innerHTML = _element.innerHTML.Replace("\r\n", "");
            }
            else
            {
                _element.innerHTML = "";
            }

            if (_element.id == 0)
            {
                _element.elementlinked_id = -1; //root
                savehtmlContent = _element.outerHTML;
                resulthmtlContent = _element.outerHTML;
            }
            else
            {
                // Provisional parent; ExtractionofSubLayouts sets the real one in pass 2.
                _element.elementlinked_id = 0;
            }

            if (htmlelement.tagName == "HTML")
            {
                //the HTML element sometimes arrives late
                savehtmlContent = _element.outerHTML;
                resulthmtlContent = _element.outerHTML;
            }

            // elementName is the opening tag, from the first '<' to the first '>'.
            string _str = _element.outerHTML;
            int _start = _str.IndexOf('<');
            int _end = _str.IndexOf('>');
            _element.elementName = _str.Substring(_start, _end - _start + 1);

            _element.tagName = htmlelement.tagName;
            _element.tag_id_Name = "";
            _element.tag_class_Name = "";
            if (htmlelement.id != null)
            {
                _element.tag_id = 1;
                _element.tag_id_Name = htmlelement.id;
            }
            if (htmlelement.className != null)
            {
                _element.tag_class = 1;
                _element.tag_class_Name = htmlelement.className;
            }

            // 1 when the node has an id or a class. The previous test used "!= 1" on both
            // sides, which set the flag for nodes lacking one of them instead.
            if (_element.tag_id == 1 || _element.tag_class == 1)
            {
                _element.tag_idORclass = 1;
            }

            string tempinner_text = htmlelement.innerText;
            if (tempinner_text != null)
            {
                tempinner_text = tempinner_text.Replace("\r\n", " ");
                tempinner_text = tempinner_text.Trim();
            }
            else
            {
                tempinner_text = "";
            }

            element _tempelement = AnalyzeGivenHTML(htmlelement.outerHTML, tempinner_text);
            _element.BagofWords = _tempelement.BagofWords;
            _element.wordCount = _tempelement.wordCount;
            _element.DensityinHTML = (double)_element.wordCount / _firstelement.wordCount;
            _element.LinkCount = _tempelement.LinkCount;
            _element.wordCountinLink = _tempelement.wordCountinLink;
            _element.meanofWordinLinks = _tempelement.meanofWordinLinks;
            _element.meanofWordinLinksAllWords = _tempelement.meanofWordinLinksAllWords;

            // Upper-case with the en-US culture so that tag names compare as English text.
            // The counters are plain substring counts, so "<P" also counts tags that merely
            // start with those characters.
            string temp_innerhtml_ = _element.innerHTML.ToUpper(new CultureInfo("en-US", false));
            _element.dot_count = webfilter.CountStringOccurrences(temp_innerhtml_, ".");
            _element.h1_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H1");
            _element.h2_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H2");
            _element.h3_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H3");
            _element.h4_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H4");
            _element.h5_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H5");
            _element.h6_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H6");
            _element.img_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<IMG");
            _element.p_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<P");
            _element.br_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<BR");
            _element.span_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<SPAN");
            _element.object_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<OBJECT");
            _element.ul_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<UL");
            _element.li_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<LI");
            _element.input_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<INPUT")
                + webfilter.CountStringOccurrences(temp_innerhtml_, "<BUTTON")
                + webfilter.CountStringOccurrences(temp_innerhtml_, "<LABEL");
            _element.div_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<DIV");
            _element.td_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<TD");

            _element.parent_elementName = "";

            //sim control
            //-1 : not available for sim control
            //0 : similar
            //0..1: similarity degree
            //1 : not similar
            _element.sim_bagofword = -1;
            _element.sim_bagofword_AE = -1;
            _element.sim_innerHTML = -1;
            _element.sim_innerHTML_AE = -1;

            _list.Add(_element);

            int key = (int)htmlelement.sourceIndex; //for fast searching
            _ht.Add(key, i);
            i++;
        }
    }

    // ---- Pass 2: statistics after extraction of nested layouts ----
    foreach (IHTMLElement htmlelement in allElements)
    {
        if (htmlelement.outerHTML != null)
        {
            // Result layout: [0] list index, [1] inner text, [2] outer HTML, [3] inner HTML.
            string[] _sonuclar = ExtractionofSubLayouts(htmlelement);
            string tempinner_text = _sonuclar[1];
            string tempOuterHTML = _sonuclar[2];
            string tempinnerHTML = _sonuclar[3];
            string str_i = _sonuclar[0];
            i = Convert.ToInt32(str_i);

            //After Extraction
            element _element = (element)_list[i];
            element _tempelement = AnalyzeGivenHTML_AE(tempOuterHTML, tempinner_text);

            // NOTE(review): "> 0" skips elements whose parent is the element at index 0 -
            // confirm that is intended.
            if (_element.elementlinked_id > 0)
            {
                element _p_element = (element)_list[_element.elementlinked_id];
                _element.parent_elementName = _p_element.elementName;
            }

            // The _AE statistics are only computed for this fixed set of tags.
            if (_element.tagName == "DIV" || _element.tagName == "TD" || _element.tagName == "UL" ||
                _element.tagName == "H1" || _element.tagName == "H2" || _element.tagName == "H3" ||
                _element.tagName == "H4" || _element.tagName == "H5" || _element.tagName == "H6" ||
                _element.tagName == "SPAN" || _element.tagName == "B" || _element.tagName == "STRONG" ||
                _element.tagName == "P")
            {
                _element.outerHTML_AE = tempOuterHTML;
                _element.innerHTML_AE = tempinnerHTML;
                _element.BagofWords_AE = _tempelement.BagofWords_AE;
                _element.wordCount_AE = _tempelement.wordCount_AE;
                _element.DensityinHTML_AE = (double)_element.wordCount_AE / _firstelement.wordCount;
                _element.LinkCount_AE = _tempelement.LinkCount_AE;
                _element.wordCountinLink_AE = _tempelement.wordCountinLink_AE;
                _element.meanofWordinLinks_AE = _tempelement.meanofWordinLinks_AE;
                _element.meanofWordinLinksAllWords_AE = _tempelement.meanofWordinLinksAllWords_AE;

                // Same counters as in pass 1, taken on the stripped inner HTML.
                string temp_innerhtml_AE = _element.innerHTML_AE.ToUpper(new CultureInfo("en-US", false));
                _element.dot_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, ".");
                _element.h1_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H1");
                _element.h2_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H2");
                _element.h3_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H3");
                _element.h4_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H4");
                _element.h5_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H5");
                _element.h6_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H6");
                _element.img_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<IMG");
                _element.p_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<P");
                _element.br_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<BR");
                _element.span_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<SPAN");
                _element.object_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<OBJECT");
                _element.ul_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<UL");
                _element.li_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<LI");
                _element.input_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<INPUT")
                    + webfilter.CountStringOccurrences(temp_innerhtml_AE, "<BUTTON")
                    + webfilter.CountStringOccurrences(temp_innerhtml_AE, "<LABEL");
                _element.div_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<DIV");
                _element.td_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<TD");

                if (_element.wordCount_AE > _element.wordCount)
                {
                    //exceptional and rare case, caused by scripts
                    _element.wordCount_AE = _element.wordCount;
                }

                //repeat count of the tag: number of elements with the same opening tag
                int sameTagCount = 0;
                for (int k = 0; k < _list.Count; k++)
                {
                    element _e1 = (element)_list[k];
                    if (_element.elementName == _e1.elementName)
                    {
                        sameTagCount++;
                    }
                }
                _element.repeat_tag_count = sameTagCount;
            }

            _list[i] = _element;
        } //if not null
    } //for each

    return (_list);
}
/// <summary>
/// Walks all descendants of the document and appends one <c>element</c> to <c>_list</c>
/// for every div, td and li node that has non-blank text. While doing so it counts in
/// <c>_ht_tag_count</c> how often each normalized opening tag occurs.
/// </summary>
/// <param name="htmlDocument">Parsed HtmlAgilityPack document.</param>
/// <param name="_firstelement">
/// Page-level statistics; its word count is the denominator of every element density
/// and its bag of words is stored in <c>all_words</c>.
/// </param>
/// <remarks>
/// The walk is best-effort: an exception ends it, is written to the trace output and is
/// not rethrown. Elements collected up to that point stay in <c>_list</c>.
/// </remarks>
public void AnalyzeProcess(HtmlAgilityPack.HtmlDocument htmlDocument, element _firstelement)
{
    all_words = _firstelement.BagofWords;
    try
    {
        int i = 0;
        foreach (HtmlAgilityPack.HtmlNode node in htmlDocument.DocumentNode.Descendants())
        {
            if (node.Name != "div" && node.Name != "td" && node.Name != "li")
            {
                continue;
            }

            string innerText = node.InnerText.Replace("\r\n", " ").Trim();
            if (innerText == "")
            {
                continue;
            }

            element _element = new element();
            _element.outerHTML = node.OuterHtml.Replace("\r\n", " ").Trim();
            _element.innerHTML = node.InnerHtml.Replace("\r\n", " ").Trim();
            _element.id = i;
            i++;

            // Opening tag (first '<' to first '>'), passed through String_Decimal_Clear.
            int _start = _element.outerHTML.IndexOf('<');
            int _end = _element.outerHTML.IndexOf('>');
            string temp_to = _element.outerHTML.Substring(_start, _end - _start + 1).Trim();
            _element.tagName_Orginal = webfilter.String_Decimal_Clear(temp_to);

            //the repeat count is computed: one lookup instead of ContainsKey + indexer
            int seenSoFar;
            if (_ht_tag_count.TryGetValue(_element.tagName_Orginal, out seenSoFar))
            {
                _ht_tag_count[_element.tagName_Orginal] = seenSoFar + 1;
            }
            else
            {
                _ht_tag_count.Add(_element.tagName_Orginal, 1);
            }

            _element.tagName = node.OriginalName;
            _element.xPath = node.XPath;

            element _tempelement = AnalyzeGivenHTML(_element.outerHTML, innerText);
            _element.BagofWords = _tempelement.BagofWords;
            _element.wordCount = _tempelement.wordCount;
            _element.DensityinHTML = (double)_element.wordCount / _firstelement.wordCount;
            _element.LinkCount = _tempelement.LinkCount;
            _element.wordCountinLink = _tempelement.wordCountinLink;
            _element.meanofWordinLinks = _tempelement.meanofWordinLinks;
            _element.meanofWordinLinksAllWords = _tempelement.meanofWordinLinksAllWords;

            // Upper-case with the en-US culture so that tag names compare as English text.
            // The counters are plain substring counts on the inner HTML.
            string temp_innerhtml_ = _element.innerHTML.ToUpper(new CultureInfo("en-US", false));
            _element.dot_count = webfilter.CountStringOccurrences(temp_innerhtml_, ".");
            _element.h1_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H1");
            _element.h2_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H2");
            _element.h3_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H3");
            _element.h4_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H4");
            _element.h5_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H5");
            _element.h6_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H6");
            _element.img_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<IMG");
            _element.p_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<P");
            _element.br_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<BR");
            _element.span_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<SPAN");
            _element.object_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<OBJECT");
            _element.ul_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<UL");
            _element.li_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<LI");
            _element.input_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<INPUT")
                + webfilter.CountStringOccurrences(temp_innerhtml_, "<BUTTON")
                + webfilter.CountStringOccurrences(temp_innerhtml_, "<LABEL");
            _element.div_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<DIV");
            _element.td_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<TD");

            string[] _sonuclar = ExtractionofSubLayouts(node);
            string tempinner_text = _sonuclar[0];
            string tempOuterHTML = _sonuclar[1];
            string tempinnerHTML = _sonuclar[2];

            // NOTE(review): outer/inner are assigned crosswise relative to the local names.
            // Kept as found because the HtmlNode overload of ExtractionofSubLayouts is not
            // visible here - confirm the order of its result before changing this.
            _element.outerHTML_AE = tempinnerHTML.Trim();
            _element.innerHTML_AE = tempOuterHTML.Trim();
            _element.BagofWords_AE = tempinner_text.Trim();

            //After Extraction
            if (tempinner_text.Trim() != "")
            {
                AnalyzeGivenHTML_AE(_element.outerHTML_AE, _element.innerHTML_AE, ref _element);
            }

            _list.Add(_element);
        }
    }
    catch (System.Exception ex)
    {
        // Stay best-effort as before, but leave a trace instead of hiding the failure.
        System.Diagnostics.Trace.TraceWarning("AnalyzeProcess stopped early: " + ex);
    }
}