/// <summary>
/// Writes every link of <paramref name="parsedHtml"/> to a numbered text log file.
/// </summary>
/// <param name="parsedHtml">Parsed page whose <c>Links</c> collection is written out.</param>
/// <remarks>
/// The file name is built from the <c>num</c> counter, which is incremented once per
/// call after the log file has been opened, so consecutive calls use different files.
/// </remarks>
private static void Dump(ParsedHtml parsedHtml)
{
    // The writer is disposed by the using block so buffered output is flushed
    // to disk and the file handle is released, even if writing throws.
    using (StreamWriter sw = new StreamWriter(@"D:\HtmlParser\LOG\bing" + num.ToString() + ".txt"))
    {
        num++;

        int link = 0;
        foreach (LinkItem item in parsedHtml.Links)
        {
            link++;
            sw.WriteLine("Line:{0}\n{1}\n", link, item.ToString());
        }
    }
}
/// <summary>
/// Runs <c>ProcessNode</c> over every node of <paramref name="parsedHtml"/>, collecting
/// the resulting links in <c>parsedHtml.Links</c>, and then labels image anchors.
/// </summary>
/// <param name="parsedHtml">
/// Parsed page to process. Its <c>Html</c> and <c>Nodes</c> members are expected to be
/// non-null (checked with <c>Debug.Assert</c>); its <c>Links</c> list is replaced.
/// </param>
/// <param name="extractSurroundingText">
/// Stored on this instance before the nodes are processed.
/// </param>
public void Process(ParsedHtml parsedHtml, bool extractSurroundingText)
{
    Debug.Assert(parsedHtml != null);
    Debug.Assert(parsedHtml.Html != null);
    Debug.Assert(parsedHtml.Nodes != null);

    this.extractSurroundingText = extractSurroundingText;
    this.parsedHtml = parsedHtml;
    parsedHtml.Links = new List<LinkItem>();

    // linkPosition[k] is the index of the node that produced parsedHtml.Links[k].
    int linkCount = 0;
    List<int> linkPosition = new List<int>();
    for (int i = 0; i < parsedHtml.Nodes.Count; i++)
    {
        ProcessNode(i);

        // Record one position per newly added link. A loop (rather than a single
        // check) keeps linkPosition index-aligned with Links even if one node
        // contributes more than one link.
        while (parsedHtml.Links.Count > linkCount)
        {
            linkPosition.Add(i);
            linkCount++;
        }
    }

    //TODO: move the following out of HtmlParser since this is not generic
    // Extract Image Anchor like <a href=...><img src=... alt=...></a>
    Debug.Assert(parsedHtml.Links.Count == linkCount);
    for (int i = 0; i < linkCount - 1; i++)
    {
        // Only links produced by directly adjacent nodes are candidates.
        if (linkPosition[i] + 1 != linkPosition[i + 1])
        {
            continue;
        }

        LinkItem anchor = parsedHtml.Links[i];
        if (anchor.LinkSource != LinkSourceType.Anchor || !string.IsNullOrEmpty(anchor.AnchorText))
        {
            continue;
        }

        LinkItem image = parsedHtml.Links[i + 1];
        if (image.LinkSource != LinkSourceType.Image)
        {
            continue;
        }

        // An anchor without text followed immediately by an image link: mark it
        // and append the image link's anchor text when it has one.
        anchor.AnchorText = "<Img_Anchor>";
        if (!string.IsNullOrEmpty(image.AnchorText))
        {
            anchor.AnchorText += image.AnchorText;
        }
    }
}
/// <summary>
/// Parses an HTML document into markup nodes and processes the result.
/// </summary>
/// <param name="html">Content of the HTML file.</param>
/// <returns>
/// The processed <see cref="ParsedHtml"/>, or <c>null</c> when the markup parser
/// returns no node list.
/// </returns>
public ParsedHtml Parse(string html)
{
    // Markup parsing comes first; everything else depends on the node list.
    List<HtmlNode> markupNodes = markupParser.Parse(html);
    if (markupNodes != null)
    {
        ParsedHtml result = new ParsedHtml();
        result.Html = html;
        result.Nodes = markupNodes;

        // Let the parsed document run its own processing step.
        result.Process();
        return result;
    }

    return null;
}
/// <summary>
/// Scans a node list for form start tags and parses each form found.
/// </summary>
/// <param name="nodes">Markup nodes to scan.</param>
/// <returns>
/// The forms for which <c>HtmlForm.Parse</c> returned a non-null result, in document order.
/// </returns>
public static List<HtmlForm> ParseForms(List<HtmlNode> nodes)
{
    List<HtmlForm> result = new List<HtmlForm>();

    int index = 0;
    while (index < nodes.Count)
    {
        if (nodes[index].IsStartTag(HtmlTagId.Form))
        {
            int closeIndex = ParsedHtml.FindEndTag(nodes, index + 1, HtmlTagId.Form);

            // A non-positive result is treated as "no matching end tag";
            // scanning then simply continues with the next node.
            if (closeIndex > 0)
            {
                HtmlForm parsed = HtmlForm.Parse(nodes, index, closeIndex);
                if (parsed != null)
                {
                    result.Add(parsed);
                }

                // Resume after the form's end tag.
                index = closeIndex;
            }
        }

        index++;
    }

    return result;
}
/// <summary>
/// Parses every file matching <c>*.htm</c> in a directory, extracts its links and
/// dumps both the parsed nodes and the extracted links.
/// </summary>
/// <param name="directory">Path of the directory whose files are processed.</param>
public static void Process(string directory)
{
    FileInfo[] htmFiles = new DirectoryInfo(directory).GetFiles("*.htm");

    for (int index = 0; index < htmFiles.Length; index++)
    {
        string fullPath = htmFiles[index].FullName;

        using (StreamReader reader = File.OpenText(fullPath))
        {
            string content = reader.ReadToEnd();

            // Markup parsing, followed by a dump of the raw node list.
            List<HtmlNode> parsedNodes = markupParser.Parse(content);
            Dump(fullPath, content, parsedNodes);

            // Link extraction (with surrounding text), followed by a dump of the links.
            LinkExtractor extractor = new LinkExtractor();
            ParsedHtml page = new ParsedHtml();
            page.Html = content;
            page.Nodes = parsedNodes;
            extractor.Process(page, true);
            Dump(page);
        }
    }
}
/// <summary>
/// Forwards to <c>linkExtractor.Process</c> with the same arguments.
/// </summary>
/// <param name="parsedHtml">Parsed page handed to the link extractor.</param>
/// <param name="extractSurroundingText">Passed through unchanged to the link extractor.</param>
public void ExtractLinks(ParsedHtml parsedHtml, bool extractSurroundingText) { linkExtractor.Process(parsedHtml, extractSurroundingText); }