/// <summary>
/// Walks the supplied HTML text with a <c>ParseHTML</c> parser and hands the
/// value of every HREF and SRC attribute it finds to <c>ProcessLink</c>.
/// </summary>
/// <param name="page">the text assigned to the parser's <c>Source</c> and scanned for links</param>
private void ProcessPage(string page)
{
    ParseHTML parser = new ParseHTML();
    parser.Source = page;

    while (!parser.Eof())
    {
        // Only a 0 result from Parse() is of interest here; any other
        // character is skipped.
        if (parser.Parse() != 0)
        {
            continue;
        }

        // Hyperlink target, if the current tag carries one.
        Attribute href = parser.GetTag()["HREF"];
        if (href != null)
        {
            ProcessLink(href.Value);
        }

        // Embedded resource reference, if the current tag carries one.
        Attribute src = parser.GetTag()["SRC"];
        if (src != null)
        {
            ProcessLink(src.Value);
        }
    }
}
/// <summary>
/// Scans an HTML document, passes every HREF and SRC attribute value to
/// <c>ProcessLink</c>, and, when <c>spider.Flag</c> is 1, additionally
/// registers and fetches image URLs.
/// </summary>
/// <param name="page">the HTML document to parse</param>
private void ProcessPage(string page)
{
    ParseHTML parse = new ParseHTML();
    parse.Source = page;

    // Loop over all the text and tags contained in the HTML document.
    while (!parse.eof())
    {
        // Parse returns the document's non-tag characters one at a time;
        // when it reaches an HTML tag it returns 0 instead.
        char ch = parse.Parse();
        if (ch != 0)
        {
            continue;
        }

        // A tag was encountered: inspect its attributes.
        Attribute a = parse.get()["HREF"];
        if (a != null)
        {
            // Follow the value of the HREF attribute.
            ProcessLink(a.Value);
        }

        a = parse.get()["SRC"];
        if (a != null)
        {
            // Follow the value of the SRC attribute.
            ProcessLink(a.Value);
        }

        if (spider.Flag == 1)
        {
            // NOTE(review): this looks up an attribute called "IMG" and then
            // requires its Name to be "src" or "href"; confirm how the indexer
            // matches, since this condition may never hold as written.
            a = parse.get()["IMG"];
            if (a != null && IsLinkAttributeName(a.Name))
            {
                Uri url = new Uri(uri, a.Value);
                // Only fetch the image when the spider accepts the URL.
                if (spider.addIMG(url))
                {
                    getpage(url);
                }
            }
        }
    }
}

/// <summary>
/// Tells whether an attribute name is "src" or "href", ignoring case.
/// Uses an ordinal comparison so the result does not depend on the
/// current culture, and tolerates a null name.
/// </summary>
/// <param name="name">the attribute name to test; may be null</param>
/// <returns>true when the name is "src" or "href" in any letter case</returns>
private static bool IsLinkAttributeName(string name)
{
    return string.Equals(name, "src", StringComparison.OrdinalIgnoreCase)
        || string.Equals(name, "href", StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Parses the given HTML text and forwards each HREF and SRC attribute
/// value found on a tag to <c>ProcessLink</c>.
/// </summary>
/// <param name="page">the text to load into the parser as its <c>Source</c></param>
private void ProcessPage(string page)
{
    ParseHTML html = new ParseHTML();
    html.Source = page;

    while (!html.Eof())
    {
        char current = html.Parse();

        // Anything other than 0 is ignored; 0 triggers the attribute lookup.
        if (current != 0)
        {
            continue;
        }

        // Check the link-bearing attributes in the same order as before:
        // HREF first, then SRC, fetching the tag anew for each lookup.
        foreach (string attributeName in new string[] { "HREF", "SRC" })
        {
            Attribute link = html.GetTag()[attributeName];
            if (link != null)
            {
                ProcessLink(link.Value);
            }
        }
    }
}