/// <summary>
/// Loads the job page at <paramref name="url"/> and collects selected fields into a dictionary.
/// </summary>
/// <param name="url">Address of the page to load and parse.</param>
/// <returns>
/// A dictionary with the keys "desc", "zz" and "number", plus either "price_1".."price_N"
/// (one per number found in the budget text) or empty "price_min"/"price_max" entries
/// when the budget text contains no digits.
/// </returns>
private Dictionary<string, string> getjobdesc(string url)
{
    try
    {
        IHtmlDocument page = new JumonyParser().LoadDocument(url);

        var descriptionHtml = page.Find(".main-detail .desc").FirstOrDefault().InnerHtml();
        var ownerName = page.Find(".name a").FirstOrDefault().InnerText();
        var jobNumber = page.Find(".main-top .number").FirstOrDefault().InnerText();
        var budgetText = page.Find(".main-top .detail-row .budgets .budget span").FirstOrDefault().InnerText();

        // Every run of digits in the budget text becomes one price entry.
        MatchCollection amounts = new Regex("(?<jg>\\d+)").Matches(budgetText);

        var fields = new Dictionary<string, string>
        {
            { "desc", descriptionHtml },
            { "zz", ownerName },
            { "number", jobNumber }
        };

        if (amounts.Count == 0)
        {
            fields.Add("price_min", "");
            fields.Add("price_max", "");
            return fields;
        }

        int ordinal = 0;
        foreach (Match amount in amounts)
        {
            ordinal++;
            fields.Add("price_" + ordinal, amount.Groups["jg"].Value);
        }

        return fields;
    }
    catch (Exception e)
    {
        // Log the message, then let the caller see the original exception.
        log.Error(e.Message);
        throw;
    }
}
/// <summary>
/// Downloads the listing page at <paramref name="cityUri"/>, passes the HTML of every
/// third-column cell of the result table to <c>VertifyAsync</c>, and reports whether the
/// page was full.
/// </summary>
/// <param name="cityUri">Address of the listing page to download.</param>
/// <returns>
/// <c>true</c> when the page yielded at least <c>PageSize</c> cells; <c>false</c> when the
/// page reports no data, yields fewer cells, or keeps returning unrecognised content.
/// </returns>
private async Task<bool> GetAgentsAsync(Uri cityUri)
{
    // Limit for responses that are neither a result page, a redirect script nor the
    // "no data" page. Without a limit (and a re-download) the loop below would spin
    // forever on the same string.
    const int maxUnexpectedResponses = 5;

    LogHelper.Info(cityUri.ToString());
    var pageSource = await HttpClient.GetStringAsync(cityUri);

    var unexpectedResponses = 0;
    while (!pageSource.Contains("查企业"))
    {
        if (pageSource.StartsWith("<script>window.location", StringComparison.Ordinal))
        {
            // The redirect target is the first single-quoted value in the script.
            VertifyCode(new Uri(pageSource.Split("'")[1]));
        }
        else if (pageSource.Contains("小查还没找到数据"))
        {
            return false;
        }
        else if (++unexpectedResponses >= maxUnexpectedResponses)
        {
            LogHelper.Info($"Unrecognised response for {cityUri}; giving up after {unexpectedResponses} attempts.");
            return false;
        }

        pageSource = await HttpClient.GetStringAsync(cityUri);
    }

    // Materialise once so the selector is not evaluated again for the count below.
    var cells = JumonyParser.Parse(pageSource).Find(".m_srchList tbody tr td:nth-child(3)").ToList();
    foreach (var cell in cells)
    {
        await VertifyAsync(cell.InnerHtml());
    }

    return cells.Count >= PageSize;
}
/// <summary>
/// Downloads the result list, appends one <c>Item</c> (url + id) per table row to
/// <c>Items</c>, and prints a spinner plus a percentage to the console.
/// </summary>
static void get_items()
{
    var msg_prefix = "-分析列表:";
    // Byte length of the prefix in the default encoding is used as the console column
    // where the percentage is printed.
    // NOTE(review): presumably relies on a double-byte default code page — confirm.
    var y = Encoding.Default.GetByteCount(msg_prefix);
    Console.Write(msg_prefix);
    var charArr = @"-\|/".ToArray();

    var resp = browser.Get("https://materials.ulprospector.com/zh/results?pageSize=100");
    JumonyParser parser = new JumonyParser();
    var document = parser.Parse(resp);

    // The last space-separated token of the pagination text is taken as the total.
    var total = document.FindFirst(".pagination.pull-right").InnerText().Split(' ').Last().ToInt();

    // NOTE(review): the loop is hard-wired to the first page only (i < 2) — confirm whether
    // it should run up to a page count derived from `total`.
    for (int i = 1; i < 2; i++)
    {
        // Fixed: the query string lacked the '&' between pageNum and pageSize.
        resp = browser.Get("https://materials.ulprospector.com/zh/results?pageNum=" + i + "&pageSize=100");
        document = parser.Parse(resp);

        // NOTE(review): the trailing ". " in this selector looks like a truncated class
        // name; left unchanged because the intended class is not visible here.
        var eles = document.Find("table.results tbody tr. ");
        foreach (var v in eles)
        {
            var item = new Item();
            item.Url = v.FindFirst(".entry a").Attribute("href").Value();
            item.Id = v.FindFirst("input").Attribute("value").Value().ToInt();
            Items.Add(item);
        }

        // Overwrite the first prefix character with the spinner frame.
        Console.CursorLeft = 0;
        Console.Write(charArr[i % charArr.Length]);
        Console.CursorLeft = y;

        // Guard against a zero total, which previously raised DivideByZeroException.
        var percent = total > 0 ? i * 100 / total : 100;
        Console.Write(percent + "%");
    }

    Console.WriteLine();
}
/// <summary>
/// Parses the response HTML and builds one <c>ExamItem</c> per table row whose markup
/// contains "【".
/// </summary>
/// <param name="strResponse">Raw HTML of the page; null or empty yields an empty list.</param>
/// <param name="moduleId">Identifier passed through to <c>BuildEntity</c>.</param>
/// <returns>The items that <c>BuildEntity</c> could build (rows that returned null are skipped).</returns>
public override IList<ExamItem> Process(string strResponse, int moduleId)
{
    var result = new List<ExamItem>();
    if (string.IsNullOrEmpty(strResponse))
    {
        return result;
    }

    var document = new JumonyParser().Parse(strResponse);

    // All questions: rows of the second table matched by the selector.
    var dataTable = document.Descendants("body>div>table").ElementAt(1);

    // Materialise once; the filter re-renders each row's HTML on every enumeration.
    var htmlExamItems = dataTable.Elements("tr")
                                 .Where(x => x.InnerHtml().Contains("【"))
                                 .ToList();

    foreach (var item in htmlExamItems)
    {
        var model = BuildEntity(moduleId, item);
        if (model == null)
        {
            continue;
        }

        result.Add(model);
    }

    if (htmlExamItems.Count > result.Count)
    {
        // Fixed: arguments were swapped, so the log reported the parsed count as the
        // HTML count and vice versa.
        string msg = string.Format("Html:[{0}]个,解析:[{1}]个。", htmlExamItems.Count, result.Count);
        WriteLog(strResponse, msg);
    }

    return result;
}
/// <summary>
/// Downloads one listing page and returns the entries extracted from it.
/// </summary>
/// <param name="page">Page number; page 1 uses the base URL, other pages append "/pn{page}/".</param>
/// <returns>An array sized to the count reported by <c>GetCount</c>, copied from the working buffer.</returns>
public Class1[] GetHref_(int page)
{
    // Pause one second before each request.
    System.Threading.Thread.Sleep(1 * 1000);

    string URL = "http://liaoyang.58.com/ershoufang/0";
    Class1[] RTc = new Class1[160];
    URL = page == 1 ? URL : URL + "/pn" + page.ToString() + "/";

    // Fetch the page for the URL; the client is disposed as soon as the HTML is in hand
    // (it was previously leaked).
    string html;
    using (WebClient client = new WebClient())
    {
        client.Encoding = Encoding.UTF8;
        html = client.DownloadString(URL);
    }

    IHtmlDocument document = new JumonyParser().Parse(html);
    GetUrlText_1(RTc, document);

    int b = GetCount(RTc, 0);

    // Copy the first b entries into a right-sized array.
    Class1[] rtcc = new Class1[b];
    for (int a = 0; a < rtcc.Length; a++)
    {
        rtcc[a] = RTc[a];
    }

    return rtcc;
}
/// <summary>
/// Downloads the cnblogs home page, projects every hyperlink with an href into
/// url / is-external / target data, and binds the result to <c>DataList</c>.
/// </summary>
protected void Page_Load( object sender, EventArgs e )
{
    // Dispose the client once the HTML has been downloaded (it was previously leaked).
    string html;
    using ( var client = new WebClient() )
    {
        html = client.DownloadString( "http://www.cnblogs.com/" );
    }

    var parser = new JumonyParser();
    var document = parser.Parse( html );

    var links = document.Find( "a[href]" );
    var baseUrl = new Uri( "http://www.cnblogs.com" );

    // Resolve each href against the base address and sort by absolute URI.
    var data = from hyperLink in links
               let url = new Uri( baseUrl, hyperLink.Attribute( "href" ).Value() )
               orderby url.AbsoluteUri
               select new
               {
                   Url = url.AbsoluteUri,
                   IsLinkingOut = !url.Host.EndsWith( "cnblogs.com" ),
                   Target = hyperLink.Attribute( "target" ).Value() ?? "_self"
               };

    DataList.DataSource = data;
    DataBind();
}
/// <summary>
/// Renders the "~/ActionUrlTest/Test1.html" view and checks the href of its first link.
/// </summary>
public void Test1()
{
    var context = new ControllerContext(HttpContext.Request.RequestContext, new TestController());
    var result = ViewEngines.Engines.FindView(context, "~/ActionUrlTest/Test1.html", null);
    Assert.NotNull(result.View, "找不到视图");

    // Render the view into a string and parse the output.
    IHtmlDocument document;
    using (var writer = new StringWriter())
    {
        var viewContext = new ViewContext(context, result.View, new ViewDataDictionary(), new TempDataDictionary(), writer);
        result.View.Render(viewContext, writer);
        document = new JumonyParser().Parse(writer.ToString());
    }

    var link = document.FindFirst("a");
    Assert.NotNull(link);

    // Fixed argument order (expected first) so a failure reports the values correctly.
    Assert.AreEqual("/TestController/TestAction?arg=args", link.Attribute("href").Value());
}
/// <summary>
/// Downloads the cnblogs home page, projects every hyperlink with an href into
/// url / is-external / target data, and binds the result to <c>DataList</c>.
/// </summary>
protected void Page_Load(object sender, EventArgs e)
{
    // Dispose the client once the HTML has been downloaded (it was previously leaked).
    string html;
    using (var client = new WebClient())
    {
        html = client.DownloadString("http://www.cnblogs.com/");
    }

    var parser = new JumonyParser();
    var document = parser.Parse(html);

    var links = document.Find("a[href]");
    var baseUrl = new Uri("http://www.cnblogs.com");

    // Resolve each href against the base address and sort by absolute URI.
    var data = from hyperLink in links
               let url = new Uri(baseUrl, hyperLink.Attribute("href").Value())
               orderby url.AbsoluteUri
               select new
               {
                   Url = url.AbsoluteUri,
                   IsLinkingOut = !url.Host.EndsWith("cnblogs.com"),
                   Target = hyperLink.Attribute("target").Value() ?? "_self"
               };

    DataList.DataSource = data;
    DataBind();
}
/// <summary>
/// Loads a blog page, turns it into a mail message via <c>CreateMail</c>, "sends" it to a
/// pickup directory and moves the single resulting file to "{guid}.mht" under
/// <c>tempDirectory</c>.
/// </summary>
static void Main(string[] args)
{
    var id = Guid.NewGuid();
    var path = Path.Combine(tempDirectory, id.ToString());
    Directory.CreateDirectory(path);

    var parser = new JumonyParser();
    var document = parser.LoadDocument("http://blog.sina.com.cn/s/blog_4701280b010183ny.html");

    // SmtpClient and MailMessage are both IDisposable; they were previously never disposed.
    using (SmtpClient smtp = new SmtpClient())
    using (MailMessage message = CreateMail(document))
    {
        // Write the message into the pickup directory instead of sending it over the network.
        smtp.EnableSsl = false;
        smtp.DeliveryMethod = SmtpDeliveryMethod.SpecifiedPickupDirectory;
        smtp.PickupDirectoryLocation = path;
        smtp.Send(message);
    }

    // Exactly one file is expected in the pickup directory; Single() throws otherwise.
    var directory = new DirectoryInfo(path);
    var file = directory.GetFiles().Single();
    file.MoveTo(Path.Combine(tempDirectory, id.ToString() + ".mht"));
    directory.Delete(true);
}
/// <summary>
/// Loads the page at <paramref name="url"/> (UTF-8) and extracts title, author, price,
/// date text, tag and a tag-stripped description into a <c>sys_job</c>.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <returns>
/// The populated job, or an empty <c>sys_job</c> when the page markup does not contain
/// "product-info-summary" or any step throws.
/// </returns>
public sys_job GetJobInfo(string url)
{
    try
    {
        IHtmlDocument page = new JumonyParser().LoadDocument(url, Encoding.UTF8);

        // Pages without the summary block are not job pages.
        if (page.InnerHtml().IndexOf("product-info-summary") < 0)
        {
            return new sys_job();
        }

        string titleText = page.Find(".product-info-summary .row h4").FirstOrDefault().InnerText();

        string authorText = page.Find(".product-info-summary .row small").FirstOrDefault().InnerText();
        authorText = authorText.Replace("发布者:", "");

        string priceText = page.Find(".product-info-summary .row .p-desc").FirstOrDefault().InnerText();
        priceText = priceText.Replace(" 预算: ", "");

        string dateText = page.Find("#p-other ul li:first-child").FirstOrDefault().InnerText();
        string tagText = page.Find("#p-other ul li:nth-child(3)").SingleOrDefault().InnerText();
        string body = page.Find("#wrap").SingleOrDefault().InnerHtml();

        // Drop script blocks first, then every remaining tag, then tab/CR/LF characters.
        string scriptPattern = @"<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>";
        body = Regex.Replace(body, scriptPattern, "");
        body = Regex.Replace(body, "<.*?>", "");
        body = body.Replace("\t", "").Replace("\r", "").Replace("\n", "");

        var job = new sys_job
        {
            title = titleText,
            author = authorText,
            desc = body,
            rq = dateText,
            tag = tagText,
            price_min = priceText
        };
        return job;
    }
    catch (Exception e)
    {
        // Failures are logged with the url and converted into an empty result.
        log.Error(url + "----" + e.Message);
        return new sys_job();
    }
}
/// <summary>
/// Core load: parses the result table in the HTML and adds one resource per row.
/// </summary>
/// <param name="context">Request context, passed through to the base implementation.</param>
/// <param name="url">Request address, passed through to the base implementation.</param>
/// <param name="htmlContent">HTML content to parse.</param>
/// <param name="result">Target result that receives the parsed items and paging flags.</param>
/// <returns>The value returned by the base implementation.</returns>
protected override bool LoadCore(HttpContext<string> context, string url, string htmlContent, ResourceSearchInfo result)
{
    var parser = new JumonyParser();
    var doc = parser.Parse(htmlContent, new Uri(ReferUrlPage));

    // The first row is skipped.
    var rows = doc.Find("#archiveResult tr").Skip(1);
    foreach (var row in rows)
    {
        // Fixed: a row without an action link (or without an href) used to throw and
        // abort the whole load; such rows are now skipped, matching the null-safe
        // lookups used for the other cells.
        var href = row.FindFirstOrDefault("td.action a:nth-child(1)")?.Attribute("href")?.AttributeValue;
        if (href == null)
        {
            continue;
        }

        var title = row.FindFirstOrDefault("td.name")?.InnerText();
        //var size = row.FindFirstOrDefault("td.size")?.InnerText();
        var date = row.FindFirstOrDefault("td.date")?.InnerText()?.ToDateTimeNullable();

        // The 40-character alphanumeric path segment of the link is used as the resource key.
        var has = Regex.Match(href, @"/([a-z\d]{40})", RegexOptions.IgnoreCase).GetGroupValue(1);

        var item = CreateResourceInfo(has, title);
        //item.DownloadSize = size;
        item.UpdateTime = date;
        result.Add(item);
    }

    // A pager whose first/last child is an anchor means a previous/next page exists.
    var pager = doc.FindFirstOrDefault("div.pagination");
    result.HasPrevious = pager?.FindFirstOrDefault("*:first-child")?.Name == "a";
    result.HasMore = pager?.FindFirstOrDefault("*:last-child")?.Name == "a";

    return base.LoadCore(context, url, htmlContent, result);
}
/// <summary>
/// Reads the paging fields from the root page and builds one URL per page, from the
/// current page through the last page.
/// </summary>
/// <param name="url">
/// Overwritten with <c>rooturl</c> before use, so the passed value is ignored.
/// NOTE(review): confirm this is intentional.
/// </param>
/// <returns>The page URLs, or an empty list when anything throws (the message is logged).</returns>
public List<string> GetPageUrl(string url)
{
    try
    {
        url = rooturl;
        List<string> list = new List<string>();
        IHtmlDocument html = new JumonyParser().LoadDocument(url, Encoding.UTF8);

        // Paging state is read from the "value" attribute of these elements.
        string entityCount = html.Find("#entityCount").SingleOrDefault().Attribute("value").Value();
        string maxEntityPerPage = html.Find("#maxEntityPerPage").SingleOrDefault().Attribute("value").Value();
        string maxPagePerRow = html.Find("#maxPagePerRow").SingleOrDefault().Attribute("value").Value();
        string pageCount = html.Find("#pageCount").SingleOrDefault().Attribute("value").Value();
        string currentPage = html.Find("#currentPage").SingleOrDefault().Attribute("value").Value();
        string currentPageRow = html.Find("#currentPageRow").SingleOrDefault().Attribute("value").Value();
        string pageRowCount = html.Find("#pageRowCount").SingleOrDefault().Attribute("value").Value();

        Int32 count = Convert.ToInt32(pageCount);
        Int32 current = Convert.ToInt32(currentPage);

        for (Int32 i = current; i <= count; i++)
        {
            // Fixed: "&currentPage" and "&currentPageRow" had been mangled to "¤tPage" /
            // "¤tPageRow" ("&curren" decoded as an HTML entity), which corrupted the
            // query string.
            string query = "entityCount=" + entityCount
                + "&maxEntityPerPage=" + maxEntityPerPage
                + "&maxPagePerRow=" + maxPagePerRow
                + "&pageCount=" + pageCount
                + "&currentPage=" + i
                + "&currentPageRow=" + currentPageRow
                + "&pageRowCount=" + pageRowCount
                + "&cBudget=0-1000000000&budgetTo=&statusBy=&categoryBy=&typeBy=&typeName=&orderByClause=a.c_postDate+desc";

            list.Add(url + "?" + query);
        }

        return list;
    }
    catch (Exception e)
    {
        log.Error(e.Message);
        return new List<string>();
    }
}
/// <summary>
/// Loads the page at <paramref name="url"/>, reads its pager links and adds each link's
/// full URL to the "filmpageurl" sorted set, scored by a running counter.
/// </summary>
/// <param name="url">Address of the page whose pager links are read.</param>
/// <remarks>
/// Reads and writes members declared outside this block (isendpage, isfirstpage,
/// looplast_index, index, page_baseurl, db, log). Exceptions are logged and rethrown.
/// </remarks>
public void GetPageUrlToRedis(string url)
{
    try
    {
        IHtmlDocument html = new JumonyParser().LoadDocument(url);
        var pagelist = html.Find(".co_content8 .x a");
        // Positions are counted from the end of the pager: third-from-last and last link.
        var last_index = pagelist.Count() - 3;
        var end_index = pagelist.Count() - 1;
        var last_url = pagelist.ToList()[last_index].Attribute("href").Value();
        var end_txt = pagelist.ToList()[end_index].InnerText();
        // isendpage becomes true when the last link's text does NOT contain "末页".
        this.isendpage = end_txt.IndexOf("末页") >= 0 ? false : true;
        // On the end page every link is processed; otherwise the final link is left out.
        looplast_index = isendpage ? pagelist.Count() : pagelist.Count() - 1;
        var last_full_url = page_baseurl + last_url;
        int i = 0;
        // Unless this is the first page, the first two links are skipped.
        i = isfirstpage ? 0 : 2;
        // NOTE(review): the loop bound is a member, not a local; if the GetPageUrl call
        // below changes it, the remaining iterations are affected — confirm.
        for (; i < looplast_index; i++)
        {
            IHtmlElement item = pagelist.ToList()[i];
            string pageurl = item.Attribute("href").Value();
            string pagefullurl = page_baseurl + pageurl;
            db.SortedSetAdd("filmpageurl", pagefullurl, (double)index++);
            // When the third-from-last link is reached and this is not the end page,
            // continue from that link's page.
            // NOTE(review): GetPageUrl is defined elsewhere; presumably it carries on the
            // crawl from the given page — verify.
            if (i == last_index && !isendpage)
            {
                isfirstpage = false;
                GetPageUrl(pagefullurl);
            }
        }
    }
    catch (Exception e)
    {
        log.Error(e.Message);
        throw;
    }
}
/// <summary>
/// Parses the response HTML and builds one <c>ExamItem</c> per element matching "div.st".
/// </summary>
/// <param name="strResponse">Raw HTML of the page; null or empty yields an empty list.</param>
/// <param name="moduleId">Identifier passed through to <c>BuildEntity</c>.</param>
/// <returns>The items that <c>BuildEntity</c> could build (elements that returned null are skipped).</returns>
public override IList<ExamItem> Process(string strResponse, int moduleId)
{
    var result = new List<ExamItem>();
    if (string.IsNullOrEmpty(strResponse))
    {
        return result;
    }

    var document = new JumonyParser().Parse(strResponse);

    // All questions. Materialised once so the selector is not re-evaluated for the counts below.
    var htmlExamItems = document.Descendants(@"div.st").ToList();

    foreach (var item in htmlExamItems)
    {
        var model = BuildEntity(moduleId, item);
        if (model == null)
        {
            continue;
        }

        result.Add(model);
    }

    if (htmlExamItems.Count > result.Count)
    {
        // Fixed: arguments were swapped, so the log reported the parsed count as the
        // HTML count and vice versa.
        string msg = string.Format("Html:[{0}]个,解析:[{1}]个。", htmlExamItems.Count, result.Count);
        WriteLog(strResponse, msg);
    }

    return result;
}
/// <summary>
/// Parses the response HTML and creates one <c>SubjectModule</c> per "tbody&gt;tr" row,
/// then prints the row count and the module count to the console.
/// </summary>
/// <param name="strResponse">Raw HTML of the page.</param>
/// <param name="moduleId">Identifier combined with the row title by <c>GetId</c>.</param>
/// <returns>The modules built from the rows, in document order.</returns>
public override List<SubjectModule> Process(string strResponse, int moduleId)
{
    var modules = new List<SubjectModule>();
    var rows = new JumonyParser().Parse(strResponse).Descendants("tbody>tr");

    foreach (IHtmlElement row in rows)
    {
        // Title comes from the first cell, the link from the last anchor inside a cell.
        string caption = row.FindFirst("td").InnerText();
        string link = row.FindLast("td a").Attribute("href").Value();

        var module = new SubjectModule();
        module.Id = GetId(moduleId, caption);
        module.Handler = "SweetFly.Job.Handler.NormalHandler,SweetFly.Job";

        var dataSource = new HtmlDataSource();
        dataSource.Encoding = "GB2312";
        dataSource.Uri = @"http://learning.cmr.com.cn/student/acourse/HomeworkCenter/" + link;
        module.HtmlDataSource = dataSource;

        modules.Add(module);
    }

    Console.WriteLine("{0} - {1}", rows.Count(), modules.Count);
    return modules;
}
/// <summary>
/// Exercises setting, replacing, removing and clearing inline style values on an element,
/// and reading individual sides back from "padding" and "margin" shorthands.
/// </summary>
public void SetStyleTest()
{
    var element = new JumonyParser().Parse("<div></div>").Elements().First();

    // All assertions pass the expected value first so failure output labels the values correctly.
    element.Style("display", "none");
    Assert.AreEqual("display:none", element.Attribute("style").Value(), ".Style( name, value ) 测试不通过");

    element.Style().SetValue("color", "red");
    Assert.AreEqual("display:none;color:red", element.Attribute("style").Value(), ".Style().SetValue( name, value ) 测试不通过");

    element.Style().SetValue("display", "block");
    Assert.AreEqual("display:block;color:red", element.Attribute("style").Value(), ".Style().SetValue( name, value ) 测试不通过");

    // A null value removes the property.
    element.Style().SetValue("display", null);
    Assert.AreEqual("color:red", element.Attribute("style").Value(), ".Style().SetValue( name, null ) 测试不通过");

    element.Style().Clear();
    Assert.AreEqual("", element.Attribute("style").Value(), ".Style().Clear() 测试不通过");

    // Shorthand expansion: one side can be overridden while the others keep the shorthand value.
    element.Style().SetValue("padding", "10px");
    Assert.AreEqual("10px", element.Style().GetValue("padding-left"), "shorthand 展开测试不通过");

    element.Style().SetValue("padding-left", "0px");
    Assert.AreEqual("0px", element.Style().GetValue("padding-left"), "shorthand 展开测试不通过");
    Assert.AreEqual("10px", element.Style().GetValue("padding-top"), "shorthand 展开测试不通过");

    element.Style().SetValue("margin", "5px");
    Assert.AreEqual("5px", element.Style().GetValue("margin-left"), "margin shorthand 展开测试不通过");
}
/// <summary>
/// Walks list pages 1..50, prints each company's name, contact, main business, phone and
/// address, and inserts one <c>COMPANY</c> row per entry.
/// </summary>
/// <remarks>
/// Failures stay best-effort (the run continues with the next entry or page) but are now
/// written to the console instead of being silently swallowed.
/// </remarks>
static void GetDataByJumony()
{
    Console.WriteLine("开始跑数据");
    var db = DB.GetInstance();
    var utf8 = System.Text.Encoding.GetEncoding("utf-8");

    for (var i = 1; i < 51; i++)
    {
        try
        {
            var path = "http://cn.coovee.com/company/s1.html?p=" + i.ToString();
            IHtmlDocument listPage = new JumonyParser().LoadDocument(path, utf8);

            // Materialise once; the list is both counted and iterated.
            var companies = listPage.Find(".company-l-item").ToList();
            Console.WriteLine(companies.Count);

            foreach (var item in companies)
            {
                try
                {
                    var a = item.Find(".dt h4 a").FirstOrDefault();
                    Console.WriteLine("公司名=" + a.InnerText());

                    var span = item.Find(".dt p span").FirstOrDefault();
                    Console.WriteLine("联系人=" + span.InnerText());

                    var zhuying = item.Find(".dd p").FirstOrDefault();
                    Console.WriteLine("主营=" + zhuying.InnerText().Replace("主营:", ""));

                    var url = item.Find(".dd ul li").Last().Find("a").FirstOrDefault().Attribute("href").Value();
                    Console.WriteLine(url);

                    System.Threading.Thread.Sleep(1000);

                    // The detail page gets its own variable; the list-page variable was
                    // previously overwritten while its results were still being iterated.
                    IHtmlDocument detailPage = new JumonyParser().LoadDocument(url, utf8);

                    // Query the contact paragraphs once; index 1 is read as the phone line
                    // and index 3 as the address line.
                    var contactLines = detailPage.Find(".company-contact-info p").ToList<IHtmlElement>();

                    var tel = contactLines[1].InnerText().Replace("联系电话:", "");
                    Console.WriteLine("电话=" + tel);

                    var address = contactLines[3].InnerText().Replace("公司地址:", "").Replace(" ", "");
                    Console.WriteLine("地址=" + address);

                    COMPANY comp = new COMPANY()
                    {
                        ID = System.Guid.NewGuid().ToString(),
                        COMPANY_NAME = a.InnerText(),
                        ADDRESS = address,
                        LINK_MAN = span.InnerText(),
                        SALE_PRODUCT = zhuying.InnerText().Replace("主营:", ""),
                        TEL = tel
                    };
                    db.Insertable<COMPANY>(comp).ExecuteCommand();

                    System.Threading.Thread.Sleep(2000);
                }
                catch (Exception ex)
                {
                    // Skip this entry but leave a trace of what went wrong.
                    Console.WriteLine("公司条目处理失败: " + ex.Message);
                }
            }

            System.Threading.Thread.Sleep(2000);
        }
        catch (Exception ex)
        {
            // Skip this page but leave a trace of what went wrong.
            Console.WriteLine("列表页处理失败(p=" + i + "): " + ex.Message);
        }
    }
}
/// <summary>
/// Downloads the listing page, collects the link and text of every ".media-body-title"
/// element, and downloads each linked child page.
/// </summary>
/// <remarks>
/// NOTE(review): the collected list and the child-page query results are not used after
/// the loop; this method appears unfinished.
/// </remarks>
private void BaiXingNewHTMLhreper()
{
    string url = "http://liaoyang.baixing.com/qiufang/";
    string ThisHtml = BXGetHTMLstr(url);

    using (var ctx = new oaEntities())
    {
        IHtmlDocument document = new JumonyParser().Parse(ThisHtml);
        IEnumerable<IHtmlElement> result = document.Find(".media-body-title");
        List<Class1> Ncss = new List<Class1>();

        foreach (var item in result)
        {
            // Removed a stray, discarded item.FindFirst("a") call that ran before the
            // Exists check below and so bypassed that guard.
            Class1 entry = new Class1();
            entry.href = item.Exists("a") ? item.FindFirst("a").Attribute("href").Value() : string.Empty;
            entry.TextName = MainWindow.GetN_value(item, "a");

            // Read the child link, but only when there is one to read.
            if (!string.IsNullOrEmpty(entry.href))
            {
                string ThisZ = BXGetHTMLstr(entry.href);
                IHtmlDocument document_1 = new JumonyParser().Parse(ThisZ);
                IEnumerable<IHtmlElement> restime = document_1.Find("div>.viewad-topMeta");
                foreach (var tm in restime)
                {
                    // TODO: nothing is extracted from the matched elements yet.
                }
            }

            Ncss.Add(entry);
        }
    }
    // selector used above: media-body-title
}
/// <summary>
/// Load-completed handler: grabs the browser's current document, navigates the browser
/// to "http://#", then parses the captured body HTML.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Navigation event data (unused).</param>
void webbrowser2_LoadCompleted(object sender, NavigationEventArgs e)
{
    // Capture the document reference before navigating.
    var loadedDocument = (mshtml.HTMLDocument)Webbrowser2.Document;
    Webbrowser2.Navigate("http://#");

    string bodyMarkup = loadedDocument.body.innerHTML;

    // The parsed document is not used any further in this handler.
    IHtmlDocument parsed = new JumonyParser().Parse(bodyMarkup);
}
/// <summary>
/// Gets the ids contained in one response.
/// </summary>
/// <param name="address">Address passed to <c>GetUrlString</c>.</param>
/// <returns>
/// A table with "more" (taken from the deserialized response) and "ids" (one string per
/// "li&gt;a" link, made of the digits found in its href).
/// </returns>
public Hashtable GetList(string address = "")
{
    var listAddress = new List<string>();

    // Fetch and deserialize the response.
    var result = JsonConvert.DeserializeObject<DuoWan.DwResult>(GetUrlString(address));
    var document = new JumonyParser().Parse(result.html);
    var cells = document.Find("li>a");

    // Walk the links.
    foreach (var li in cells)
    {
        var detailUrl = li.Attribute("href").Value();

        // Build the id from the digit characters of the href.
        // Fixed: the filter used "> 0", which also discarded every '0' digit and so
        // corrupted ids such as 1023 (became 123). Non-digits still map to -1 and are dropped.
        listAddress.Add((from each in detailUrl
                         where each.ToString().ToInt(-1) >= 0
                         select each).Join(""));
    }

    var o = new Hashtable()
    {
        { "more", result.more },
        { "ids", listAddress }
    };
    return o;
}
/// <summary>
/// Requests the detail page for <paramref name="id"/> and stores the inner HTML of its
/// ".zwnr" element in the content column of t_spider_zwgk. Runs fire-and-forget; failures
/// are written to the debug output.
/// </summary>
/// <param name="id">Record id; used in the request URL and, unquoted, in the SQL WHERE clause.</param>
public static void getContent(string id)
{
    HttpClient httpClient = new HttpClient();
    try
    {
        httpClient
            .PostAsync("http://hd.huachuan.gov.cn/aspx/gkml_view.aspx?id=" + id, null)
            .ContinueWith((postTask) =>
            {
                HttpResponseMessage response = null;
                try
                {
                    // Throws here when the request faulted; previously that exception was
                    // raised inside the continuation and never observed or logged.
                    response = postTask.Result;
                    response.Content.ReadAsStringAsync().ContinueWith((readTask) =>
                    {
                        try
                        {
                            IHtmlDocument source = new JumonyParser().Parse(readTask.Result);
                            string content = source.FindSingle(".zwnr").InnerHtml();

                            // SECURITY: the scraped HTML goes into a string-built statement.
                            // Backslashes and single quotes are escaped so ordinary markup no
                            // longer breaks (or injects into) the SQL.
                            // NOTE(review): assumes MySQL's default backslash-escape mode;
                            // prefer a parameterized command if the DB helper offers one.
                            string escaped = content.Replace("\\", "\\\\").Replace("'", "''");
                            string sql = string.Format("update t_spider_zwgk t set t.content='{0}' where t.id={1}", escaped, id);
                            DbHelperMySQL.ExecuteSql(sql);
                        }
                        catch (Exception e)
                        {
                            Debug.WriteLine("----->【" + id + "】内容存储异常<-----:" + e);
                        }
                        finally
                        {
                            // Release the response and the client once the work is done.
                            response.Dispose();
                            httpClient.Dispose();
                        }
                    });
                }
                catch (Exception e)
                {
                    Debug.WriteLine("----->【" + id + "】内容存储异常<-----:" + e);
                    if (response != null)
                    {
                        response.Dispose();
                    }
                    httpClient.Dispose();
                }
            });
    }
    catch (Exception e)
    {
        // Only synchronous failures of PostAsync itself land here.
        Debug.WriteLine("----->【" + id + "】内容存储异常<-----:" + e);
        httpClient.Dispose();
    }
}
/// <summary>
/// Downloads one list page, inserts every entry not yet present in t_spider_bslc, and
/// calls <c>getContent</c> for each newly inserted entry.
/// </summary>
/// <param name="page">
/// 0 selects the index page; any other value selects "page_NN.html", zero-padded to two digits.
/// </param>
public static void getByPage(int page)
{
    string url;
    if (page == 0)
    {
        url = "http://www.huachuan.gov.cn/zwgk/xxgksyzl/fgfgg/index.html";
    }
    else
    {
        string p = page.ToString();
        if (page < 10)
        {
            p = "0" + p;
        }
        url = "http://www.huachuan.gov.cn/system/more/zwgk/xxgksyzl/fgfgg/index/page_" + p + ".html";
    }

    string pageStr = Util.getHtmlStr(url, Encoding.Default);
    IHtmlDocument source = new JumonyParser().Parse(pageStr);
    var items = source.Find(".listmain ul li");

    // One generator for the whole page: creating a new Random per entry can hand out the
    // same "random" id to consecutive entries.
    Random idGenerator = new Random();

    // SECURITY: scraped text goes into string-built SQL; escape backslashes and quotes.
    // NOTE(review): assumes MySQL's default backslash-escape mode; prefer parameterized
    // commands if the DB helper offers them.
    Func<string, string> escape = s => s.Replace("\\", "\\\\").Replace("'", "''");

    foreach (var item in items)
    {
        var firstDiv = item.FindFirst("div");
        var link = firstDiv.FindSingle("a");
        string path = link.Attribute("href").Value();
        string title = link.InnerText();
        string time = firstDiv.NextElement().InnerText();

        string id;
        string existsSql;
        if (path.StartsWith("http"))
        {
            // Absolute links carry no id: generate one and de-duplicate by title.
            id = idGenerator.Next(1000000, 9999999).ToString();
            existsSql = string.Format("select count(*) from t_spider_bslc t where t.title='{0}'", escape(title));
        }
        else
        {
            // Relative links: the id is the file name (without extension) of the fourth
            // '/'-separated path segment.
            id = path.Split('/')[3].Split('.')[0];
            existsSql = string.Format("select count(*) from t_spider_bslc t where t.id={0}", id);
        }

        if (Convert.ToInt32(DbHelperMySQL.GetSingle(existsSql)) != 0)
        {
            continue;
        }

        string insertSql = string.Format(
            "insert into t_spider_bslc(id,title,time,path) values({0},'{1}','{2}','{3}')",
            id, escape(title), escape(time), escape(path));

        if (DbHelperMySQL.ExecuteSql(insertSql) == 1)
        {
            getContent(path);
        }
    }
}
/// <summary>
/// Loads StyleTest1.html, data-binds it with a null data source and expects no element
/// matching ".invisible" to remain.
/// </summary>
public void VisibleTest()
{
    var document = new JumonyParser().LoadDocument(Path.Combine(Environment.CurrentDirectory, "StyleTest1.html"));
    document.DataBind(null);

    // Expected value first so a failure reports the values correctly.
    Assert.AreEqual(0, document.Find(".invisible").Count());
}
/// <summary>
/// Load-completed handler: parses the browser's body HTML and sets up three element
/// queries over it. None of the queries is consumed inside this handler.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Navigation event data (unused).</param>
void Webbrowser2_LoadCompleted(object sender, NavigationEventArgs e)
{
    var browserDocument = (mshtml.HTMLDocument)Webbrowser2.Document;
    string bodyMarkup = browserDocument.body.innerHTML;
    IHtmlDocument parsed = new JumonyParser().Parse(bodyMarkup);

    // <ul> elements whose identity equals "houselist-mod-new".
    IEnumerable<IHtmlElement> houseLists =
        from list in parsed.Find("ul")
        where list.Identity() == "houselist-mod-new"
        select list;

    // <li> elements found under those lists.
    IEnumerable<IHtmlElement> houseListItems = houseLists.Find("li");

    // ".list-item" elements that are direct children of an <li>.
    IEnumerable<IHtmlElement> listItemChildren = parsed.Find("li>.list-item");
}
/// <summary>
/// Loads Test1.html, runs HTML binding with a null data context and checks the resulting
/// content of the title element.
/// </summary>
public void Test1()
{
    var document = new JumonyParser().LoadDocument(Path.Combine(Environment.CurrentDirectory, "Test1.html"));
    HtmlBinding.Create(document, null).DataBind();

    // Expected value first so a failure reports the values correctly.
    Assert.AreEqual("Test Title abc text", document.FindFirst("title").InnerHtml(), "对 title 元素内容的文本替换测试失败");
}
/// <summary>
/// Posts the pager form for <paramref name="page"/>, inserts every list entry not yet in
/// t_spider_zwgk and calls <c>getContent</c> for each newly inserted entry. Runs
/// fire-and-forget; failures are written to the debug output.
/// </summary>
/// <param name="cookie">Not used by this method.</param>
/// <param name="viewstate">Value posted as __VIEWSTATE.</param>
/// <param name="page">Page number posted as __EVENTARGUMENT.</param>
public static void getByPage(string cookie, string viewstate, int page)
{
    HttpClient httpClient = new HttpClient();
    HttpContent postContent = new FormUrlEncodedContent(new Dictionary<string, string>()
    {
        { "__VIEWSTATE", viewstate },
        { "__VIEWSTATEGENERATOR", "7BE8FDE8" },
        { "__EVENTTARGET", "AspNetPager1" },
        { "__EVENTARGUMENT", page.ToString() },
        { "_keywords", "" },
        { "AspNetPager1_input", "1" },
    });

    httpClient
        .PostAsync("http://hd.huachuan.gov.cn/aspx/gkml_list.aspx", postContent)
        .ContinueWith((postTask) =>
        {
            HttpResponseMessage response = null;
            try
            {
                // Throws here when the request faulted; previously that exception was
                // raised inside the continuation and never observed or logged.
                response = postTask.Result;
                response.Content.ReadAsStringAsync().ContinueWith((readTask) =>
                {
                    try
                    {
                        IHtmlDocument source = new JumonyParser().Parse(readTask.Result);
                        var itemCount = source.Find(".listbox").Count();

                        for (int i = 1; i <= itemCount; i++)
                        {
                            try
                            {
                                string id = source.FindSingle("#four" + i).Attribute("href").Value().Split('=')[1];

                                var details = source.Find("#con_four_" + i);
                                string author = details.Find(".li1").Last().InnerText().Replace("发布机构:", "");
                                string time = details.Find(".li2").Last().InnerText().Replace("发文日期:", "");
                                string title = details.Find(".infoname").First().InnerText().Replace("名称:", "");

                                // If an entry already exists, the list has not changed since
                                // the last run, so stop processing this page (return rather
                                // than continue).
                                string sql = string.Format("select count(*) from t_spider_zwgk t where t.id={0}", id);
                                int count = Convert.ToInt32(DbHelperMySQL.GetSingle(sql));
                                if (count > 0)
                                {
                                    return;
                                }

                                // Not present yet: insert it. Single quotes are replaced in
                                // every text column (previously only in the title) so they
                                // cannot terminate the SQL literal.
                                // NOTE(review): id is inserted unquoted; prefer parameterized
                                // commands if the DB helper offers them.
                                sql = string.Format(
                                    "insert into t_spider_zwgk(id,title,time,author) values({0},'{1}','{2}','{3}')",
                                    id, title.Replace('\'', '"'), time.Replace('\'', '"'), author.Replace('\'', '"'));
                                count = DbHelperMySQL.ExecuteSql(sql);
                                if (count == 1)
                                {
                                    getContent(id);
                                }
                            }
                            catch (Exception e)
                            {
                                Debug.WriteLine("----->【" + page + "." + i + "】新闻创建异常<-----:" + e);
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        // Reading or parsing the response body failed.
                        Debug.WriteLine("----->【" + page + "】列表解析异常<-----:" + e);
                    }
                    finally
                    {
                        response.Dispose();
                        httpClient.Dispose();
                    }
                });
            }
            catch (Exception e)
            {
                // The request itself failed.
                Debug.WriteLine("----->【" + page + "】列表请求异常<-----:" + e);
                if (response != null)
                {
                    response.Dispose();
                }
                httpClient.Dispose();
            }
        });
}
/// <summary>
/// Scrapes a novel chapter by chapter, following the "next" link of each page, and
/// appends every chapter's title and text to D:\c.txt.
/// </summary>
static void Main(string[] args)
{
    // Fixed: the existence check used "D\\c.txt" (missing colon), so it never matched and
    // the file was always re-created and truncated. FileMode.Append opens an existing
    // file or creates a new one, so no check is needed.
    const string outputPath = "D:\\c.txt";

    var baseUrl = "http://www.42xs.com/read/0/404/";
    var nextUrl = "171271.html";

    using (var fs = new FileStream(outputPath, FileMode.Append))
    using (var sw = new StreamWriter(fs, Encoding.UTF8))
    {
        while (nextUrl != "")
        {
            var url = baseUrl + nextUrl;

            // Reset per chapter so a failed download cannot re-write the previous chapter.
            string title = null;
            string txt = null;

            try
            {
                var doc = new JumonyParser().LoadDocument(url);
                title = doc.FindFirst("#center > div.title > h1").InnerText();
                txt = doc.FindFirst("#content").InnerText();

                var domNext = doc.FindFirst("#container > div:nth-child(3) > div > div.jump > a:nth-child(6)");
                nextUrl = domNext.Attribute("href").Value();
            }
            catch
            {
                Console.WriteLine("{0}没有成功", url);
                nextUrl = "";
            }

            // A chapter whose title and text were both read is still written when only the
            // next-link lookup failed.
            if (title != null && txt != null)
            {
                Console.WriteLine(title);
                sw.WriteLine("");
                sw.WriteLine(title);
                sw.WriteLine("");
                sw.WriteLine(txt);
            }
        }
    }

    Console.Write("The End. Press any key to exit...");
    Console.ReadKey();
}
/// <summary>
/// A class selector containing a hyphen must match the element carrying that class.
/// </summary>
public void css_class_has_hyphen()
{
    var document = new JumonyParser().Parse("<div class=\"css-class\"></div>");

    var matchCount = document.Find(".css-class").Count();

    Assert.AreEqual(1, matchCount);
}
/// <summary>
/// Loads SpecificationTest7.html and checks the first link's href and title attributes,
/// where the title value contains tag-like text.
/// </summary>
public void SpecificationTest7()
{
    var document = new JumonyParser().LoadDocument(Path.Combine(Environment.CurrentDirectory, "SpecificationTest7.html"));

    var link = document.FindFirstOrDefault("a");
    Assert.IsNotNull(link, "属性或内容包含特殊字符的标签解析失败");

    // Expected value first so a failure reports the values correctly.
    Assert.AreEqual("#", link.Attribute("href").Value(), "属性内容包含 > 时解析失败。");
    Assert.AreEqual("this is a <a> tag", link.Attribute("title").Value(), "属性内容包含 > 时解析失败。");

    // Disabled check kept from the original:
    //Assert.AreEqual( link.Elements().Count(), 0, "错误的解析了以特殊字符为标签名的标签" );
}
/// <summary>
/// Loads SpecificationTest5.html and expects exactly four <c>IHtmlSpecial</c> nodes among
/// the document's descendant nodes.
/// </summary>
public void SpecificationTest5()
{
    var document = new JumonyParser().LoadDocument(Path.Combine(Environment.CurrentDirectory, "SpecificationTest5.html"));

    // Disabled check kept from the original:
    //Assert.AreEqual( document.DocumentDeclaration, "<!DOCTYPE html>", "HTML 声明解析失败" );

    var specials = document.DescendantNodes().OfType<IHtmlSpecial>().ToArray();

    // Expected value first; Length instead of Count() because this is already an array.
    Assert.AreEqual(4, specials.Length, "特殊标签解析数量不对");
}
/// <summary>
/// Compiles a loaded document into a factory method, rebuilds a document from it and
/// checks that both documents have equal descendant node sequences.
/// </summary>
public void CompileTest()
{
    var htmlParser = new JumonyParser();
    var sourcePath = Path.Combine(Environment.CurrentDirectory, "Test1.html");
    var original = htmlParser.LoadDocument(sourcePath);

    // Compile, then invoke the result with the parser's DOM provider to rebuild the document.
    var rebuild = original.Compile();
    var restored = rebuild(htmlParser.DomProvider);

    var originalNodes = original.DescendantNodes();
    var restoredNodes = restored.DescendantNodes();
    bool identical = originalNodes.SequenceEqual(restoredNodes, new DomNodeComparer());

    Assert.IsTrue(identical, "编译还原测试失败");
}