/// <summary>
/// Parses the given HTML with HtmlAgilityPack, rebuilds <c>_list</c> and
/// <c>_ht_tag_count</c>, and stamps every collected element with the repeat
/// count stored for its original tag name.
/// </summary>
/// <param name="htmlContent2">Raw HTML of the page; it is passed through the
/// <c>HTML.trim_*</c> helpers (defined elsewhere) before being parsed.</param>
public void prepareDOM(string htmlContent2)
{
    string htmlContent = htmlContent2;
    htmlContent = HTML.trim_commenttags(htmlContent);
    htmlContent = HTML.trimOptions(htmlContent);
    htmlContent = HTML.trimScript(htmlContent);
    htmlContent = HTML.trim_HREF_SCR(htmlContent);
    htmlContent = HTML.trim_some_cases(htmlContent);
    //for fast processing otherwise image, link, javascript loading...

    HtmlAgilityPack.HtmlDocument htmlDocument = new HtmlAgilityPack.HtmlDocument();
    htmlDocument.LoadHtml(htmlContent);

    _list = new List<element>();
    _ht_tag_count = new Dictionary<string, int>();

    // SelectSingleNode returns null when nothing matches (e.g. an HTML fragment
    // without a <body>); fall back to the whole document instead of crashing
    // on the dereference below.
    HtmlAgilityPack.HtmlNode body = htmlDocument.DocumentNode.SelectSingleNode("//body");
    if (body == null)
    {
        body = htmlDocument.DocumentNode;
    }

    // Page-level element computed from the body markup and text; AnalyzeProcess
    // receives it together with the document.
    // NOTE(review): presumably AnalyzeProcess is what fills _list and
    // _ht_tag_count - confirm against its definition.
    element _firstelement = AnalyzeGivenHTML(
        body.InnerHtml.Replace("\r\n", "").Trim(),
        body.InnerText.Replace("\r\n", "").Trim());
    AnalyzeProcess(htmlDocument, _firstelement);
    //AnalyzeProcess(htmlDocument, "//li", _firstelement);
    //AnalyzeProcess(htmlDocument, "//td", _firstelement);

    // Transfer the per-tag counts onto the elements. Each element is read,
    // modified and stored back into the list.
    for (int i = 0; i < _list.Count; i++)
    {
        element _e = (element)_list[i];
        _e.repeat_tag_count = (int)_ht_tag_count[_e.tagName_Orginal];
        _list[i] = _e;
    }
}
/// <summary>
/// Normalises whitespace in a page and rewrites a fixed set of opening and
/// closing tag prefixes to their upper-case spelling.
/// </summary>
/// <param name="awebpage">HTML text; it is first passed through
/// <c>HTML.trimOptions</c> and <c>HTML.trimScript</c>.</param>
/// <returns>The rewritten page text.</returns>
private static string uppercaseonlytags(string awebpage)
{
    string page = HTML.trimScript(HTML.trimOptions(awebpage));

    // Line breaks and tabs become spaces; empty style attributes are dropped.
    page = page.Replace("\r", " ")
               .Replace("\n", " ")
               .Replace("\t", " ")
               .Replace("style=\"\"", "");

    // whitespace problem in regex so...
    // NOTE(review): as written each pass replaces one space with one space,
    // i.e. it changes nothing; presumably the original literal was a double
    // space or a non-breaking space - confirm against version control.
    for (int pass = 0; pass < 3; pass++)
    {
        page = page.Replace(" ", " ");
    }

    // Explicit literal pairs instead of ToUpper(); the original comment
    // mentions an upper-casing problem with Turkish encoding.
    // The "<b" / "</b" rows map to themselves and therefore change nothing.
    string[,] rewrites =
    {
        //starting tags
        { "<div", "<DIV" }, { "<td", "<TD" },
        { "<h1", "<H1" }, { "<h2", "<H2" }, { "<h3", "<H3" },
        { "<h4", "<H4" }, { "<h5", "<H5" }, { "<h6", "<H6" },
        { "<span", "<SPAN" }, { "<font", "<FONT" },
        { "<ul", "<UL" }, { "<li", "<LI" },
        { "<b", "<b" },
        { "<object", "<OBJECT" }, { "<button", "<BUTTON" },
        { "<input", "<INPUT" }, { "<img", "<IMG" }, { "<br", "<BR" },
        //ending tags
        { "</div", "</DIV" }, { "</td", "</TD" },
        { "</h1", "</H1" }, { "</h2", "</H2" }, { "</h3", "</H3" },
        { "</h4", "</H4" }, { "</h5", "</H5" }, { "</h6", "</H6" },
        { "</span", "</SPAN" }, { "</font", "</FONT" },
        { "</ul", "</UL" }, { "</li", "</LI" },
        { "</b", "</b" },
        { "</object", "</OBJECT" }, { "</button", "</BUTTON" },
        { "</input", "</INPUT" }, { "</img", "</IMG" }, { "</br", "</BR" }
    };

    // Apply the pairs in table order, which is the original statement order.
    int rowCount = rewrites.GetLength(0);
    for (int row = 0; row < rowCount; row++)
    {
        page = page.Replace(rewrites[row, 0], rewrites[row, 1]);
    }

    return page;
}
/// <summary>
/// Loads the page into an MSHTML document and builds the element list in
/// three passes: (1) one <c>element</c> per DOM node that has outerHTML, with
/// its whole-node text statistics; (2) the "after extraction" (_AE) statistics
/// from <c>ExtractionofSubLayouts</c> plus relevance marking of DIV/TD nodes
/// and their linked children; (3) a predicted class name for every relevant
/// element, with distinct non-empty contents collected in <c>_xmllist</c>.
/// </summary>
/// <param name="htmlContent">Raw HTML; it is passed through
/// <c>HTML.trimOptions</c> and <c>HTML.trimScript</c> before being written
/// into the document.</param>
/// <returns>The <c>_list</c> field, holding one <c>element</c> per DOM node.</returns>
public ArrayList prepareDOM(string htmlContent)
{
    htmlContent = HTML.trimOptions(htmlContent);
    htmlContent = HTML.trimScript(htmlContent);

    IHTMLDocument2 htmlDocument = new mshtml.HTMLDocumentClass();
    htmlDocument.write(htmlContent);
    IHTMLElementCollection allElements = htmlDocument.all;

    _ht = new Hashtable();
    _list = new ArrayList();
    _xmllist = new ArrayList();

    string _tempinner_text = "";
    if (htmlDocument.body != null)
    {
        if (htmlDocument.body.innerText != null)
        {
            _tempinner_text = htmlDocument.body.innerText.Replace("\r\n", "");
        }
    }

    // Page-level statistics; its wordCount is the denominator of every density below.
    element _firstelement = AnalyzeGivenHTML(htmlContent, _tempinner_text);

    // ---- Pass 1: create one element per DOM node ----
    int i = 0;
    foreach (IHTMLElement htmlelement in allElements)
    {
        if (htmlelement.outerHTML != null)
        {
            element _element = new element();
            _element.id = i;
            _element.outerHTML = htmlelement.outerHTML;
            _element.outerHTML = _element.outerHTML.Replace("\r\n", "");
            if (htmlelement.innerHTML != null)
            {
                _element.innerHTML = htmlelement.innerHTML;
                _element.innerHTML = _element.innerHTML.Replace("\r\n", "");
            }
            else
            {
                _element.innerHTML = "";
            }

            if (_element.id == 0)
            {
                _element.elementlinked_id = -1;//root
                savehtmlContent = _element.outerHTML;
                resulthmtlContent = _element.outerHTML;
            }
            else
            {
                _element.elementlinked_id = 0;
            }
            if (htmlelement.tagName == "HTML")
            {
                // the HTML element sometimes arrives late (not as the first node)...
                savehtmlContent = _element.outerHTML;
                resulthmtlContent = _element.outerHTML;
            }

            // elementName is the first tag of the outer HTML, including the angle brackets.
            string _str = _element.outerHTML;
            int _start = _str.IndexOf('<');
            int _end = _str.IndexOf('>');
            _element.elementName = _str.Substring(_start, _end - _start + 1);

            //<!--className::(.*?)--> ??? must be guaranteed to be the one at the start, must not slip to the others ????
            // The marker only counts when it directly follows the opening tag;
            // 15 is the length of "<!--className::".
            _start = _str.IndexOf("<!--className::");
            _end = _str.IndexOf("-->");
            if (_start >= 0)
            {
                if (_start == _element.elementName.Length)
                {
                    _start = 15 + _element.elementName.Length;
                    if (_end - _start > 0)
                    {
                        _element.className = _str.Substring(_start, _end - _start);
                    }
                }
            }

            _element.tagName = htmlelement.tagName;
            if (htmlelement.id != null)
            {
                _element.tag_id = 1;
            }
            if (htmlelement.className != null)
            {
                _element.tag_class = 1;
            }
            // FIX: the sum previously added tag_idORclass to itself instead of
            // tag_class, so the clamp below was unreachable and an element with
            // only a class attribute never got the flag.
            // NOTE(review): if determineIrrevelantLayout / determineLayout were
            // tuned on the old values, re-validate them.
            _element.tag_idORclass = _element.tag_id + _element.tag_class;
            if (_element.tag_idORclass == 2)
            {
                _element.tag_idORclass = 1;
            }

            string tempinner_text = htmlelement.innerText;
            if (tempinner_text != null)
            {
                tempinner_text = tempinner_text.Replace("\r\n", " ");
                tempinner_text = tempinner_text.Trim();
            }
            else
            {
                tempinner_text = "";
            }

            element _tempelement = AnalyzeGivenHTML(htmlelement.outerHTML, tempinner_text);
            _element.BagofWords = _tempelement.BagofWords;
            _element.wordCount = _tempelement.wordCount;
            _element.DensityinHTML = (double)_element.wordCount / _firstelement.wordCount;
            _element.LinkCount = _tempelement.LinkCount;
            _element.wordCountinLink = _tempelement.wordCountinLink;
            _element.meanofWordinLinks = _tempelement.meanofWordinLinks;
            _element.meanofWordinLinksAllWords = _tempelement.meanofWordinLinksAllWords;
            _element.similarity_with_other_web_page = 1;
            _element.relevant = false;
            _element.parent_elementName = "";
            _list.Add(_element);

            int key = (int)htmlelement.sourceIndex;//for fast searching
            _ht.Add(key, i);
            i++;
        }
    }

    // ---- Pass 2: after-extraction statistics and relevance marking ----
    foreach (IHTMLElement htmlelement in allElements)
    {
        if (htmlelement.outerHTML != null)
        {
            // Result layout as used here: [0] index into _list, [1] inner text,
            // [2] outer HTML, [3] inner HTML.
            string[] _sonuclar = ExtractionofSubLayouts(htmlelement);
            string tempinner_text = _sonuclar[1];
            string tempOuterHTML = _sonuclar[2];
            string tempinnerHTML = _sonuclar[3];
            string str_i = _sonuclar[0];
            i = Convert.ToInt32(str_i);

            //After Extraction
            element _element = (element)_list[i];
            element _tempelement = AnalyzeGivenHTML_AE(tempOuterHTML, tempinner_text);
            _element.outerHTML_AE = tempOuterHTML;
            _element.innerHTML_AE = tempinnerHTML;
            _element.BagofWords_AE = _tempelement.BagofWords_AE;
            _element.wordCount_AE = _tempelement.wordCount_AE;
            _element.DensityinHTML_AE = (double)_element.wordCount_AE / _firstelement.wordCount;
            _element.LinkCount_AE = _tempelement.LinkCount_AE;
            _element.wordCountinLink_AE = _tempelement.wordCountinLink_AE;
            _element.meanofWordinLinks_AE = _tempelement.meanofWordinLinks_AE;
            _element.meanofWordinLinksAllWords_AE = _tempelement.meanofWordinLinksAllWords_AE;

            //dot endofcontent
            if (htmlelement.innerText != null)
            {
                if (htmlelement.innerText.Trim() != "")
                {
                    if (htmlelement.innerText[htmlelement.innerText.Length - 1] == '.')
                    {
                        _element.dot_endofstence = 1;
                    }
                    else
                    {
                        _element.dot_endofstence = 0;
                    }
                }
            }

            if (_element.wordCount_AE > _element.wordCount)
            {
                _element.wordCount_AE = _element.wordCount; // exceptional, rare case caused by scripts being a problem...
            }
            _list[i] = _element;

            if (htmlelement.tagName == "DIV" || htmlelement.tagName == "TD")
            {
                element _e = (element)_list[i];
                // NOTE(review): if element is a value type, this flag is only
                // persisted when the decision below is false, because that is
                // the only path that stores _e back - confirm this is intended.
                if (_e.elementName.Contains("vAlign=bot"))
                {
                    _e.relevant = true;
                }
                bool _decision = HTMLMarkerClass.desicionClass.determineIrrevelantLayout(_element);
                if (_decision == false)
                {
                    _e.relevant = true;
                    _list[i] = _e;
                    //Update child elements
                    for (int m = 0; m < _list.Count; m++)
                    {
                        element _et = (element)_list[m];
                        if (_et.elementlinked_id == _e.id)
                        {
                            if (_et.tagName != "DIV")// do not decide for DIVs here
                            {
                                if (_et.tagName != "TD")
                                {
                                    _et.relevant = true;
                                    _et.parent_elementName = _e.elementName;
                                    _list[m] = _et;
                                }
                            }
                        }
                    } //for m
                } //decision = true
            } //if div or td
        } // if not null
    } //for each

    // ---- Pass 3: predicted class names and the de-duplicated XML list ----
    for (int m = 0; m < _list.Count; m++)
    {
        element _element = (element)_list[m];
        if (_element.relevant == true)
        {
            if (_element.tagName == "DIV" || _element.tagName == "TD")
            {
                bool _mainlayout = HTMLMarkerClass.desicionClass.determineLayout(_element);
                if (_mainlayout)
                {
                    _element.predicted_className = "MAIN";
                }
                else
                {
                    _element.predicted_className = HTMLMarkerClass.desicionClass.determineHEADLINE_INFORMATION(_element);
                }
            }
            else
            {
                // Only DIV/TD may be classified as MAIN; anything else that
                // comes back as MAIN is downgraded.
                _element.predicted_className = HTMLMarkerClass.desicionClass.determineHEADLINE_INFORMATION(_element);
                if (_element.predicted_className == "MAIN")
                {
                    _element.predicted_className = "IRRELEVANT";
                }
            }

            /*if (_element.predicted_className == "IRRELEVANT")
             *  _element.predicted_className = "INFORMATIONABOUTARTICLE";*/

            // The prediction is stored back only when the extracted text is non-empty.
            if (clear_illegal_characters_for_XML(_element.BagofWords_AE.Trim()).Trim() != "")
            {
                _list[m] = _element;

                xml_elemet _xml = new xml_elemet();
                _xml.elementName = _element.elementName;
                _xml.content = _element.BagofWords_AE;
                _xml.predicted_className = _element.predicted_className;
                _xml.parent_elementName = _element.parent_elementName;

                //equal content in _xml_list
                // Entries with identical content are merged: the element name
                // is appended to the existing entry instead of adding a new one.
                bool find = false;
                for (int v = 0; v < _xmllist.Count; v++)
                {
                    xml_elemet item = (xml_elemet)_xmllist[v];
                    if (item.content == _xml.content)
                    {
                        item.elementName = item.elementName + ", " + _element.elementName;
                        _xmllist[v] = item;
                        find = true;
                    }
                }
                if (!find)
                {
                    _xmllist.Add(_xml);
                }
            }
        }
    }

    return(_list);
}
/// <summary>
/// Loads the page into an MSHTML document and builds the element list in two
/// passes: (1) one <c>element</c> per DOM node that has outerHTML, with its
/// text statistics and per-tag occurrence counts; (2) the "after extraction"
/// (_AE) statistics, parent name and repeat count, using
/// <c>ExtractionofSubLayouts</c>.
/// </summary>
/// <param name="htmlContent2">Raw HTML; it is passed through the
/// <c>HTML.trim_*</c> helpers (defined elsewhere) before being written into
/// the document.</param>
/// <returns>The <c>_list</c> field, holding one <c>element</c> per DOM node.</returns>
public ArrayList prepareDOM(string htmlContent2)
{
    string htmlContent = htmlContent2;
    htmlContent = HTML.trim_commenttags(htmlContent);
    htmlContent = HTML.trimOptions(htmlContent);
    htmlContent = HTML.trimScript(htmlContent);
    htmlContent = HTML.trim_HREF_SCR(htmlContent);
    htmlContent = HTML.trim_some_cases(htmlContent);
    //for fast processing otherwise image, link, javascript loading...

    IHTMLDocument2 htmlDocument = new mshtml.HTMLDocumentClass();
    htmlDocument.write(htmlContent);
    IHTMLElementCollection allElements = htmlDocument.all;

    _ht = new Hashtable();
    _list = new ArrayList();
    _xmllist = new ArrayList();

    string _tempinner_text = "";
    // FIX: body.innerHTML used to be read outside the null check below, so a
    // document without a body threw a NullReferenceException even though the
    // code right above it already anticipated a null body.
    string _bodyinner_html = "";
    if (htmlDocument.body != null)
    {
        _bodyinner_html = htmlDocument.body.innerHTML;
        if (htmlDocument.body.innerText != null)
        {
            _tempinner_text = htmlDocument.body.innerText.Replace("\r\n", "");
            domhtmlContent = htmlDocument.body.outerHTML.Replace("\r\n", "");
        }
    }

    // Page-level statistics; its wordCount is the denominator of every density below.
    element _firstelement = AnalyzeGivenHTML(_bodyinner_html, _tempinner_text);
    all_words = _firstelement.BagofWords;

    // Tag matching upper-cases markup with a fixed English culture (for
    // english words thus html tags); created once instead of once per element.
    CultureInfo _tagculture = new CultureInfo("en-US", false);

    // ---- Pass 1: create one element per DOM node ----
    int i = 0;
    foreach (IHTMLElement htmlelement in allElements)
    {
        if (htmlelement.outerHTML != null)
        {
            element _element = new element();
            _element.id = i;
            _element.outerHTML = htmlelement.outerHTML;
            _element.outerHTML = _element.outerHTML.Replace("\r\n", "");
            if (htmlelement.innerHTML != null)
            {
                _element.innerHTML = htmlelement.innerHTML;
                _element.innerHTML = _element.innerHTML.Replace("\r\n", "");
            }
            else
            {
                _element.innerHTML = "";
            }

            if (_element.id == 0)
            {
                _element.elementlinked_id = -1;//root
                savehtmlContent = _element.outerHTML;
                resulthmtlContent = _element.outerHTML;
            }
            else
            {
                _element.elementlinked_id = 0;
            }
            if (htmlelement.tagName == "HTML")
            {
                // the HTML element sometimes arrives late (not as the first node)...
                savehtmlContent = _element.outerHTML;
                resulthmtlContent = _element.outerHTML;
            }

            // elementName is the first tag of the outer HTML, including the angle brackets.
            string _str = _element.outerHTML;
            int _start = _str.IndexOf('<');
            int _end = _str.IndexOf('>');
            _element.elementName = _str.Substring(_start, _end - _start + 1);

            _element.tagName = htmlelement.tagName;
            _element.tag_id_Name = "";
            _element.tag_class_Name = "";
            if (htmlelement.id != null)
            {
                _element.tag_id = 1;
                _element.tag_id_Name = htmlelement.id;
            }
            if (htmlelement.className != null)
            {
                _element.tag_class = 1;
                _element.tag_class_Name = htmlelement.className;
            }
            // FIX: the test was inverted ("!= 1 || != 1"), which set the flag for
            // elements with neither attribute and cleared it for elements with both.
            // NOTE(review): any classifier tuned on the old values should be re-validated.
            if (_element.tag_id == 1 || _element.tag_class == 1)
            {
                _element.tag_idORclass = 1;
            }

            string tempinner_text = htmlelement.innerText;
            if (tempinner_text != null)
            {
                tempinner_text = tempinner_text.Replace("\r\n", " ");
                tempinner_text = tempinner_text.Trim();
            }
            else
            {
                tempinner_text = "";
            }

            element _tempelement = AnalyzeGivenHTML(htmlelement.outerHTML, tempinner_text);
            _element.BagofWords = _tempelement.BagofWords;
            _element.wordCount = _tempelement.wordCount;
            _element.DensityinHTML = (double)_element.wordCount / _firstelement.wordCount;
            _element.LinkCount = _tempelement.LinkCount;
            _element.wordCountinLink = _tempelement.wordCountinLink;
            _element.meanofWordinLinks = _tempelement.meanofWordinLinks;
            _element.meanofWordinLinksAllWords = _tempelement.meanofWordinLinksAllWords;

            // Occurrence counts of "." and of tag prefixes in the upper-cased inner HTML.
            string temp_innerhtml_ = _element.innerHTML.ToUpper(_tagculture);
            _element.dot_count = webfilter.CountStringOccurrences(temp_innerhtml_, ".");
            _element.h1_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H1");
            _element.h2_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H2");
            _element.h3_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H3");
            _element.h4_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H4");
            _element.h5_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H5");
            _element.h6_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<H6");
            _element.img_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<IMG");
            _element.p_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<P");
            _element.br_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<BR");
            _element.span_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<SPAN");
            _element.object_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<OBJECT");
            _element.ul_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<UL");
            _element.li_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<LI");
            _element.input_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<INPUT")
                                   + webfilter.CountStringOccurrences(temp_innerhtml_, "<BUTTON")
                                   + webfilter.CountStringOccurrences(temp_innerhtml_, "<LABEL");
            _element.div_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<DIV");
            _element.td_count = webfilter.CountStringOccurrences(temp_innerhtml_, "<TD");

            _element.parent_elementName = "";

            //sim control
            //-1 : not available for sim control
            //0 : similar
            //0..1: similarity degree
            //1 : not similar
            _element.sim_bagofword = -1;
            _element.sim_bagofword_AE = -1;
            _element.sim_innerHTML = -1;
            _element.sim_innerHTML_AE = -1;

            _list.Add(_element);

            int key = (int)htmlelement.sourceIndex;//for fast searching
            _ht.Add(key, i);
            i++;
        }
    }

    // ---- Pass 2: after-extraction statistics ----
    foreach (IHTMLElement htmlelement in allElements)
    {
        if (htmlelement.outerHTML != null)
        {
            // Result layout as used here: [0] index into _list, [1] inner text,
            // [2] outer HTML, [3] inner HTML.
            string[] _sonuclar = ExtractionofSubLayouts(htmlelement);
            string tempinner_text = _sonuclar[1];
            string tempOuterHTML = _sonuclar[2];
            string tempinnerHTML = _sonuclar[3];
            string str_i = _sonuclar[0];
            i = Convert.ToInt32(str_i);

            //After Extraction
            element _element = (element)_list[i];
            element _tempelement = AnalyzeGivenHTML_AE(tempOuterHTML, tempinner_text);

            // NOTE(review): pass 1 only ever assigns -1 or 0 to elementlinked_id,
            // so a positive value must come from code outside this method
            // (presumably ExtractionofSubLayouts) - confirm.
            if (_element.elementlinked_id > 0)
            {
                element _p_element = (element)_list[_element.elementlinked_id];
                _element.parent_elementName = _p_element.elementName;
            }

            // The _AE fields are only filled for this fixed set of tags.
            if (_element.tagName == "DIV" || _element.tagName == "TD" || _element.tagName == "UL" ||
                _element.tagName == "H1" || _element.tagName == "H2" || _element.tagName == "H3" ||
                _element.tagName == "H4" || _element.tagName == "H5" || _element.tagName == "H6" ||
                _element.tagName == "SPAN" || _element.tagName == "B" || _element.tagName == "STRONG" ||
                _element.tagName == "P")
            {
                _element.outerHTML_AE = tempOuterHTML;
                _element.innerHTML_AE = tempinnerHTML;
                _element.BagofWords_AE = _tempelement.BagofWords_AE;
                _element.wordCount_AE = _tempelement.wordCount_AE;
                _element.DensityinHTML_AE = (double)_element.wordCount_AE / _firstelement.wordCount;
                _element.LinkCount_AE = _tempelement.LinkCount_AE;
                _element.wordCountinLink_AE = _tempelement.wordCountinLink_AE;
                _element.meanofWordinLinks_AE = _tempelement.meanofWordinLinks_AE;
                _element.meanofWordinLinksAllWords_AE = _tempelement.meanofWordinLinksAllWords_AE;

                string temp_innerhtml_AE = _element.innerHTML_AE.ToUpper(_tagculture);
                _element.dot_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, ".");
                _element.h1_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H1");
                _element.h2_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H2");
                _element.h3_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H3");
                _element.h4_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H4");
                _element.h5_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H5");
                _element.h6_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<H6");
                _element.img_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<IMG");
                _element.p_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<P");
                _element.br_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<BR");
                _element.span_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<SPAN");
                _element.object_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<OBJECT");
                _element.ul_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<UL");
                _element.li_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<LI");
                _element.input_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<INPUT")
                                          + webfilter.CountStringOccurrences(temp_innerhtml_AE, "<BUTTON")
                                          + webfilter.CountStringOccurrences(temp_innerhtml_AE, "<LABEL");
                _element.div_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<DIV");
                _element.td_count_AE = webfilter.CountStringOccurrences(temp_innerhtml_AE, "<TD");

                if (_element.wordCount_AE > _element.wordCount)
                {
                    _element.wordCount_AE = _element.wordCount; // exceptional, rare case caused by scripts being a problem...
                }

                // repeat count of the tag: how many collected elements share this
                // exact opening tag (the element itself is included in the count)
                //_element.repeat_tag_count = webfilter.CountStringOccurrences(htmlDocument.body.innerHTML, _element.elementName);
                int benzertagsayisi = 0;
                for (int k = 0; k < _list.Count; k++)
                {
                    element _e1 = (element)_list[k];
                    if (_element.elementName == _e1.elementName)
                    {
                        benzertagsayisi++;
                    }
                }
                _element.repeat_tag_count = benzertagsayisi;
            }

            _list[i] = _element;
        } // if not null
    } //for each

    return(_list);
}