/// <summary>
/// Cleans the given markup, parses it with HtmlAgilityPack and fills
/// <c>_list</c> with the analysed elements, then copies each element's
/// per-tag occurrence count from <c>_ht_tag_count</c> into
/// <c>repeat_tag_count</c>.
/// </summary>
/// <param name="htmlContent2">Raw HTML markup to analyse.</param>
public void prepareDOM(string htmlContent2)
{
    string htmlContent = htmlContent2;
    htmlContent = HTML.trim_commenttags(htmlContent);
    htmlContent = HTML.trimOptions(htmlContent);
    htmlContent = HTML.trimScript(htmlContent);
    htmlContent = HTML.trim_HREF_SCR(htmlContent);
    htmlContent = HTML.trim_some_cases(htmlContent); // for fast processing otherwise image, link, javascript loading...

    HtmlAgilityPack.HtmlDocument htmlDocument = new HtmlAgilityPack.HtmlDocument();
    htmlDocument.LoadHtml(htmlContent);

    _list = new List<element>();
    _ht_tag_count = new Dictionary<string, int>();

    // SelectSingleNode returns null when nothing matches. HtmlAgilityPack does
    // not synthesise a <body> for fragments, so fall back to the whole document
    // instead of dereferencing a null node.
    HtmlAgilityPack.HtmlNode body = htmlDocument.DocumentNode.SelectSingleNode("//body");
    if (body == null)
    {
        body = htmlDocument.DocumentNode;
    }

    element _firstelement = AnalyzeGivenHTML(
        body.InnerHtml.Replace("\r\n", "").Trim(),
        body.InnerText.Replace("\r\n", "").Trim());

    AnalyzeProcess(htmlDocument, _firstelement);
    //AnalyzeProcess(htmlDocument, "//li", _firstelement);
    //AnalyzeProcess(htmlDocument, "//td", _firstelement);

    // Transfer the per-tag counts onto the collected elements.
    for (int i = 0; i < _list.Count; i++)
    {
        element _e = (element)_list[i];
        _e.repeat_tag_count = (int)_ht_tag_count[_e.tagName_Orginal];
        // Written back explicitly, as the original did (required if element is a value type).
        _list[i] = _e;
    }
}
/// <summary>
/// Cleans the given markup, loads it into an MSHTML document and builds one
/// <c>element</c> record per DOM element in <c>_list</c>.
/// </summary>
/// <remarks>
/// Two passes are made over <c>htmlDocument.all</c>:
/// the first fills the raw per-element fields (markup, tag/id/class, word and
/// tag counts) and registers <c>sourceIndex -> list index</c> in <c>_ht</c>;
/// the second calls <c>ExtractionofSubLayouts</c> and, for a fixed set of
/// container/text tags, fills the "after extraction" (<c>_AE</c>) fields and the
/// repeat-tag count.
/// Also assigns the fields <c>_xmllist</c>, <c>domhtmlContent</c>,
/// <c>all_words</c>, <c>savehtmlContent</c> and <c>resulthmtlContent</c>.
/// </remarks>
/// <param name="htmlContent2">Raw HTML markup to analyse.</param>
/// <returns>The populated <c>_list</c>.</returns>
public ArrayList prepareDOM(string htmlContent2)
{
    string htmlContent = htmlContent2;
    htmlContent = HTML.trim_commenttags(htmlContent);
    htmlContent = HTML.trimOptions(htmlContent);
    htmlContent = HTML.trimScript(htmlContent);
    htmlContent = HTML.trim_HREF_SCR(htmlContent);
    htmlContent = HTML.trim_some_cases(htmlContent); // for fast processing otherwise image, link, javascript loading...

    IHTMLDocument2 htmlDocument = new mshtml.HTMLDocumentClass();
    htmlDocument.write(htmlContent);
    IHTMLElementCollection allElements = htmlDocument.all;

    _ht = new Hashtable();
    _list = new ArrayList();
    _xmllist = new ArrayList();

    string _tempinner_text = "";
    // The body (or its innerHTML) may be missing; use an empty string rather
    // than dereferencing null, mirroring how null innerHTML is treated below.
    string bodyInnerHtml = "";
    if (htmlDocument.body != null)
    {
        if (htmlDocument.body.innerText != null)
        {
            _tempinner_text = htmlDocument.body.innerText.Replace("\r\n", "");
            domhtmlContent = htmlDocument.body.outerHTML.Replace("\r\n", "");
        }
        if (htmlDocument.body.innerHTML != null)
        {
            bodyInnerHtml = htmlDocument.body.innerHTML;
        }
    }

    element _firstelement = AnalyzeGivenHTML(bodyInnerHtml, _tempinner_text);
    all_words = _firstelement.BagofWords;

    // Upper-casing is done with a fixed English culture so that HTML tag names
    // compare correctly; hoisted because it does not change per element.
    CultureInfo tagCulture = new CultureInfo("en-US", false);

    // ---- Pass 1: raw per-element features -------------------------------
    int i = 0;
    foreach (IHTMLElement htmlelement in allElements)
    {
        if (htmlelement.outerHTML != null)
        {
            element _element = new element();
            _element.id = i;
            _element.outerHTML = htmlelement.outerHTML;
            _element.outerHTML = _element.outerHTML.Replace("\r\n", "");

            if (htmlelement.innerHTML != null)
            {
                _element.innerHTML = htmlelement.innerHTML;
                _element.innerHTML = _element.innerHTML.Replace("\r\n", "");
            }
            else
            {
                _element.innerHTML = "";
            }

            if (_element.id == 0)
            {
                _element.elementlinked_id = -1; // root
                savehtmlContent = _element.outerHTML;
                resulthmtlContent = _element.outerHTML;
            }
            else
            {
                _element.elementlinked_id = 0;
            }

            if (htmlelement.tagName == "HTML")
            {
                // The HTML element sometimes arrives late (not as the first item).
                savehtmlContent = _element.outerHTML;
                resulthmtlContent = _element.outerHTML;
            }

            // elementName is the opening tag text: from the first '<' to the first '>'.
            string _str = _element.outerHTML;
            int _start = _str.IndexOf('<');
            int _end = _str.IndexOf('>');
            if (_start >= 0 && _end >= _start)
            {
                _element.elementName = _str.Substring(_start, _end - _start + 1);
            }
            else
            {
                // No well-formed opening tag found; keep the raw markup instead of
                // letting Substring throw on a negative index/length.
                _element.elementName = _str;
            }

            _element.tagName = htmlelement.tagName;
            _element.tag_id_Name = "";
            _element.tag_class_Name = "";

            if (htmlelement.id != null)
            {
                _element.tag_id = 1;
                _element.tag_id_Name = htmlelement.id;
            }
            if (htmlelement.className != null)
            {
                _element.tag_class = 1;
                _element.tag_class_Name = htmlelement.className;
            }
            // Flag is raised when the element has an id OR a class. The previous
            // condition (tag_id != 1 || tag_class != 1) was inverted: it raised the
            // flag for elements with neither and cleared it when both were present.
            if (_element.tag_id == 1 || _element.tag_class == 1)
            {
                _element.tag_idORclass = 1;
            }

            string tempinner_text = htmlelement.innerText;
            if (tempinner_text != null)
            {
                tempinner_text = tempinner_text.Replace("\r\n", " ");
                tempinner_text = tempinner_text.Trim();
            }
            else
            {
                tempinner_text = "";
            }

            element _tempelement = AnalyzeGivenHTML(htmlelement.outerHTML, tempinner_text);
            _element.BagofWords = _tempelement.BagofWords;
            _element.wordCount = _tempelement.wordCount;
            // Share of the page's words held by this element; 0 when the page has
            // no words at all (avoids NaN/Infinity from dividing by zero).
            _element.DensityinHTML = _firstelement.wordCount != 0
                ? (double)_element.wordCount / _firstelement.wordCount
                : 0.0;
            _element.LinkCount = _tempelement.LinkCount;
            _element.wordCountinLink = _tempelement.wordCountinLink;
            _element.meanofWordinLinks = _tempelement.meanofWordinLinks;
            _element.meanofWordinLinksAllWords = _tempelement.meanofWordinLinksAllWords;

            string temp_innerhtml_ = _element.innerHTML.ToUpper(tagCulture); // for english words thus html tags
            _element.dot_count = webfilter.CountStringOccurrences(temp_innerhtml_, ".");
            _element.h1_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H1");
            _element.h2_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H2");
            _element.h3_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H3");
            _element.h4_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H4");
            _element.h5_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H5");
            _element.h6_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H6");
            _element.img_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<IMG");
            _element.p_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<P");
            _element.br_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<BR");
            _element.span_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<SPAN");
            _element.object_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<OBJECT");
            _element.ul_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<UL");
            _element.li_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<LI");
            _element.input_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<INPUT")
                + webfilter.CountStringOccurrences(temp_innerhtml_, "<BUTTON")
                + webfilter.CountStringOccurrences(temp_innerhtml_, "<LABEL");
            _element.div_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<DIV");
            _element.td_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<TD");

            _element.parent_elementName = "";

            // sim control
            //  -1   : not available for sim control
            //   0   : similar
            //   0..1: similarity degree
            //   1   : not similar
            _element.sim_bagofword = -1;
            _element.sim_bagofword_AE = -1;
            _element.sim_innerHTML = -1;
            _element.sim_innerHTML_AE = -1;

            _list.Add(_element);

            int key = (int)htmlelement.sourceIndex; // for fast searching
            _ht.Add(key, i);
            i++;
        }
    }

    // ---- Pass 2: "after extraction" (_AE) features ----------------------
    foreach (IHTMLElement htmlelement in allElements)
    {
        if (htmlelement.outerHTML != null)
        {
            // Result layout as consumed here: [0] list index, [1] inner text,
            // [2] outer HTML, [3] inner HTML.
            string[] _sonuclar = ExtractionofSubLayouts(htmlelement);
            string tempinner_text = _sonuclar[1];
            string tempOuterHTML = _sonuclar[2];
            string tempinnerHTML = _sonuclar[3];
            string str_i = _sonuclar[0];
            i = Convert.ToInt32(str_i);

            // After Extraction
            element _element = (element)_list[i];
            element _tempelement = AnalyzeGivenHTML_AE(tempOuterHTML, tempinner_text);

            // NOTE(review): a linked id of 0 is skipped here; 0 is also the default
            // assigned to every non-root element in pass 1 - confirm this is intended.
            if (_element.elementlinked_id > 0)
            {
                element _p_element = (element)_list[_element.elementlinked_id];
                _element.parent_elementName = _p_element.elementName;
            }

            if (_element.tagName == "DIV" || _element.tagName == "TD" || _element.tagName == "UL" ||
                _element.tagName == "H1" || _element.tagName == "H2" || _element.tagName == "H3" ||
                _element.tagName == "H4" || _element.tagName == "H5" || _element.tagName == "H6" ||
                _element.tagName == "SPAN" || _element.tagName == "B" || _element.tagName == "STRONG" ||
                _element.tagName == "P")
            {
                _element.outerHTML_AE = tempOuterHTML;
                _element.innerHTML_AE = tempinnerHTML;
                _element.BagofWords_AE = _tempelement.BagofWords_AE;
                _element.wordCount_AE = _tempelement.wordCount_AE;
                // Same zero guard as in pass 1.
                _element.DensityinHTML_AE = _firstelement.wordCount != 0
                    ? (double)_element.wordCount_AE / _firstelement.wordCount
                    : 0.0;
                _element.LinkCount_AE = _tempelement.LinkCount_AE;
                _element.wordCountinLink_AE = _tempelement.wordCountinLink_AE;
                _element.meanofWordinLinks_AE = _tempelement.meanofWordinLinks_AE;
                _element.meanofWordinLinksAllWords_AE = _tempelement.meanofWordinLinksAllWords_AE;

                string temp_innerhtml_AE = _element.innerHTML_AE.ToUpper(tagCulture); // for english words thus html tags
                _element.dot_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, ".");
                _element.h1_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H1");
                _element.h2_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H2");
                _element.h3_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H3");
                _element.h4_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H4");
                _element.h5_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H5");
                _element.h6_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H6");
                _element.img_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<IMG");
                _element.p_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<P");
                _element.br_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<BR");
                _element.span_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<SPAN");
                _element.object_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<OBJECT");
                _element.ul_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<UL");
                _element.li_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<LI");
                _element.input_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<INPUT")
                    + webfilter.CountStringOccurrences(temp_innerhtml_AE, "<BUTTON")
                    + webfilter.CountStringOccurrences(temp_innerhtml_AE, "<LABEL");
                _element.div_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<DIV");
                _element.td_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<TD");

                if (_element.wordCount_AE > _element.wordCount)
                {
                    // Exceptional, rare case (scripts cause trouble): clamp to the raw count.
                    _element.wordCount_AE = _element.wordCount;
                }

                // Repeat count of the tag: number of list entries with the same opening tag text.
                //_element.repeat_tag_count = webfilter.CountStringOccurrences(htmlDocument.body.innerHTML, _element.elementName);
                int similarTagCount = 0;
                for (int k = 0; k < _list.Count; k++)
                {
                    element _e1 = (element)_list[k];
                    if (_element.elementName == _e1.elementName)
                    {
                        similarTagCount++;
                    }
                }
                _element.repeat_tag_count = similarTagCount;
            }

            // Written back explicitly, as the original did (required if element is a value type).
            _list[i] = _element;
        } // if not null
    } // for each

    return (_list);
}