/// <summary>
/// Parses <paramref name="source"/> with AngleSharp and collects the inner HTML of every
/// element whose local name and attributes match the descriptor in <paramref name="tagName"/>.
/// </summary>
/// <remarks>
/// Side effects on static state: <c>numberofElement</c> is reset to 0 and then set to the
/// number of elements in the parsed document, <c>preProcessTime</c> receives the parse
/// duration in milliseconds and <c>searchTime</c> receives the search duration in milliseconds.
/// </remarks>
/// <param name="tagName">Tag descriptor handed to <c>TagProcessing.NodenameAndAttributes</c>.</param>
/// <param name="source">HTML text to parse.</param>
/// <returns>
/// The inner HTML of each matching element (an empty list when nothing matches),
/// or <c>null</c> if the parser yields no document.
/// </returns>
public static List<string> Extract_Tag_with_AngleSharp(string tagName, string source)
{
    numberofElement = 0;

    // Entry 0 is used as the node name (its first item); every later entry is used as an
    // attribute name/value pair.
    List<string[]> res = TagProcessing.NodenameAndAttributes(tagName);
    List<string> list_sonuc = new List<string>();

    // Phase 1: parsing, timed into preProcessTime.
    Stopwatch stopwatch = Stopwatch.StartNew();
    var parser = new HtmlParser();
    var document = parser.Parse(source);
    stopwatch.Stop();
    preProcessTime = stopwatch.Elapsed.TotalMilliseconds;

    // Phase 2: searching, timed into searchTime.
    stopwatch.Restart();

    if (document == null)
    {
        stopwatch.Stop();
        searchTime = stopwatch.Elapsed.TotalMilliseconds;
        return null;
    }

    numberofElement = document.All.Length;

    string[] nodename = res[0];
    IEnumerable<AngleSharp.Dom.IElement> matches = document.All.Where(m => m.LocalName == nodename[0]);

    // Narrow the candidates once per requested attribute. The filters are chained lazily and
    // evaluated in the foreach below, so no intermediate lists are built.
    for (int i = 1; i < res.Count; i++)
    {
        string[] att = res[i]; // fresh variable per iteration, so each lambda captures its own pair
        matches = matches.Where(m =>
        {
            // Look the attribute up once instead of once for the null check and once for the value.
            var attribute = m.Attributes[att[0]];
            return attribute != null && attribute.Value == att[1];
        });
    }

    foreach (AngleSharp.Dom.IElement node in matches)
    {
        list_sonuc.Add(node.InnerHtml);
    }

    stopwatch.Stop();
    searchTime = stopwatch.Elapsed.TotalMilliseconds;
    return list_sonuc;
}
/// <summary>
/// Loads <paramref name="source"/> into an HtmlAgilityPack document and collects the inner HTML
/// of every node selected by the XPath expression built from <paramref name="tagname"/>.
/// </summary>
/// <remarks>
/// Side effects on static state: <c>preProcessTime</c> receives the load duration in
/// milliseconds and <c>searchTime</c> receives the XPath search duration in milliseconds.
/// </remarks>
/// <param name="tagname">Tag descriptor converted to XPath by <c>TagProcessing.ToXPath</c>.</param>
/// <param name="source">HTML text to load.</param>
/// <returns>
/// The inner HTML of each selected node, or <c>null</c> when <c>SelectNodes</c> returns null.
/// </returns>
public static List<string> Extract_Tag_with_HAP(string tagname, string source)
{
    // The XPath query and the result list are prepared before any timing starts.
    string xpathQuery = TagProcessing.ToXPath(tagname);
    List<string> collected = new List<string>();

    // Phase 1: load the markup, timed into preProcessTime.
    Stopwatch timer = new Stopwatch();
    timer.Start();
    HtmlDocument parsedDoc = new HtmlDocument
    {
        OptionFixNestedTags = true
    };
    // The HTML is supplied as an in-memory string.
    parsedDoc.LoadHtml(source);
    timer.Stop();
    preProcessTime = timer.Elapsed.TotalMilliseconds;

    // Phase 2: run the query, timed into searchTime.
    timer.Restart();
    HtmlNodeCollection selectedNodes = parsedDoc.DocumentNode.SelectNodes(xpathQuery);

    // Stays null when the query produced no node collection.
    List<string> outcome = null;
    if (selectedNodes != null)
    {
        foreach (HtmlNode selected in selectedNodes)
        {
            collected.Add(selected.InnerHtml);
        }
        outcome = collected;
    }

    timer.Stop();
    searchTime = timer.Elapsed.TotalMilliseconds;
    return outcome;
}
/// <summary>
/// Writes <paramref name="source"/> into an MSHTML document and collects the inner HTML of the
/// elements matching the descriptor in <paramref name="tagName"/>.
/// </summary>
/// <remarks>
/// If the descriptor contains an <c>id</c> attribute, only <c>getElementById</c> is consulted and
/// the node name and remaining attributes are not checked. Otherwise every element with the
/// requested tag name is tested: <c>class</c> is compared against <c>className</c>, and any other
/// attribute is checked by looking for its name and value as substrings of the opening tag.
/// Side effects on static state: <c>preProcessTime</c> receives the document build duration in
/// milliseconds and <c>searchTime</c> receives the search duration in milliseconds.
/// </remarks>
/// <param name="tagName">Tag descriptor handed to <c>TagProcessing.NodenameAndAttributes</c>.</param>
/// <param name="source">HTML text to write into the document.</param>
/// <returns>
/// The inner HTML of each match; an empty list when nothing matches, including when an
/// <c>id</c> lookup finds no element.
/// </returns>
public static List<string> Extract_Tag_with_IHTMLDocument(string tagName, string source)
{
    // Entry 0 is used as the node name (its first item); every later entry is used as an
    // attribute name/value pair.
    List<string[]> res = TagProcessing.NodenameAndAttributes(tagName);
    List<string> list_sonuc = new List<string>();

    // Phase 1: build the document, timed into preProcessTime.
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();
    HTMLDocument doc = new HTMLDocument();
    IHTMLDocument2 doc2 = (IHTMLDocument2)doc;
    doc2.clear();
    doc2.designMode = "On";
    doc2.write(source);
    stopwatch.Stop();
    preProcessTime = stopwatch.Elapsed.TotalMilliseconds;

    // Phase 2: searching, timed into searchTime.
    stopwatch.Restart();

    // Upper-case with an explicit culture so the result does not depend on the current thread
    // culture. Kept in a local so the array returned by TagProcessing is not modified.
    string upperNodeName = res[0][0].ToUpper(new CultureInfo("en-US", false));

    // An id identifies at most one element, so take the shortcut when one is requested.
    for (int i = 1; i < res.Count; i++)
    {
        string[] att = res[i];
        if (att[0] == "id")
        {
            IHTMLElement byId = doc.getElementById(att[1]);
            // No element with that id: return the empty list instead of dereferencing null.
            if (byId != null)
            {
                list_sonuc.Add(byId.innerHTML);
            }

            stopwatch.Stop();
            searchTime = stopwatch.Elapsed.TotalMilliseconds;
            return list_sonuc;
        }
    }

    foreach (IHTMLElement element in doc.getElementsByTagName(upperNodeName))
    {
        bool sonuc = true;

        for (int i = 1; i < res.Count; i++)
        {
            string[] att = res[i];

            if (att[0] == "class")
            {
                if (element.className != att[1])
                {
                    sonuc = false;
                    break;
                }

                continue;
            }

            // Elements without inner HTML are never matched on non-class attributes.
            if (element.innerHTML == null)
            {
                sonuc = false;
                break;
            }

            string outer = element.outerHTML;
            if (outer == null)
            {
                sonuc = false;
                break;
            }

            // Take the text before the first '>' as the opening tag; fall back to the whole
            // string if there is no '>' rather than passing -1 to Substring.
            int tagEnd = outer.IndexOf('>');
            string tag_temp = tagEnd >= 0 ? outer.Substring(0, tagEnd) : outer;

            // NOTE(review): this is a substring heuristic, not real attribute parsing. It can
            // match a name/value that belong to different attributes, and it cuts the tag short
            // when an attribute value itself contains '>'.
            if (!(tag_temp.Contains(att[0]) && tag_temp.Contains(att[1])))
            {
                sonuc = false;
                break;
            }
        }

        if (sonuc)
        {
            list_sonuc.Add(element.innerHTML);
        }
    }

    stopwatch.Stop();
    searchTime = stopwatch.Elapsed.TotalMilliseconds;
    return list_sonuc;
}