/// <summary>
/// Downloads the article page at <paramref name="url"/> and builds the detail
/// record that belongs to <paramref name="summery"/>.
/// </summary>
/// <param name="url">Address of the article page. It is fetched as UTF-8 and stored as <c>OrigUrl</c>.</param>
/// <param name="summery">Summary entry the detail is linked to; supplies <c>InfoId</c> and <c>Title</c>.</param>
/// <returns>
/// A new <c>EInfoDetail</c> whose <c>ArticleContent</c> is the outer HTML of the
/// <c>td</c> element with id <c>article_content</c>, minus its first child node.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The downloaded page contains no <c>td</c> element with id <c>article_content</c>.
/// </exception>
public EInfoDetail AnalyDetail(string url, EInfoSummery summery)
{
    string htmlstr = GetHtmlStr(url, Encoding.UTF8);
    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(htmlstr);

    // SelectSingleNode yields null when nothing matches, so fail with a
    // message that names the page instead of a bare NullReferenceException.
    HtmlNode node = doc.DocumentNode.SelectSingleNode("//td[@id='article_content']");
    if (node == null)
    {
        throw new InvalidOperationException("Article content node (td#article_content) not found -- " + url);
    }

    // The first child is stripped before the markup is stored.
    // NOTE(review): presumably a leading header/filler node -- confirm against the site markup.
    // An empty content cell has nothing to strip, so skip the removal rather than index out of range.
    if (node.ChildNodes.Count > 0)
    {
        node.ChildNodes[0].Remove();
    }

    return new EInfoDetail()
    {
        OrigUrl = url,
        ArticleContent = node.OuterHtml,
        EInfoSummeryId = summery.InfoId,
        Title = summery.Title,
    };
}
/// <summary>
/// Crawls the article list reachable from <paramref name="url"/> and stores every
/// entry that is neither filtered out nor already present in the database,
/// together with its downloaded detail page.
/// </summary>
/// <param name="url">
/// Start address. When <c>_IsFromIQB</c> is false this is the site root and the list
/// page is located by following the "最新技术" link; otherwise it is the list page itself.
/// </param>
/// <param name="code">Encoding used to read the start page; also stored in <c>_EnCode</c>.</param>
/// <exception cref="InvalidOperationException">
/// The navigation link or the list entries cannot be found in the downloaded markup.
/// </exception>
/// <remarks>
/// Exceptions raised while downloading, parsing or saving propagate to the caller
/// with their original stack trace; entries already committed stay committed.
/// </remarks>
public void run(string url, Encoding code)
{
    _EnCode = code;

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(GetHtmlStr(url, code));
    HtmlNode rootnode = doc.DocumentNode;

    if (!_IsFromIQB)
    {
        // Starting from the site root: follow the navigation link to the list page.
        HtmlNodeCollection listAddr = rootnode.SelectNodes("//a[text() ='最新技术']");
        if (listAddr == null)
        {
            // SelectNodes yields null (not an empty collection) when nothing matches.
            throw new InvalidOperationException("Navigation link to the article list not found -- " + url);
        }

        url += listAddr[0].Attributes["href"].Value;

        // NOTE(review): the list page is always read as UTF-8, regardless of 'code' -- confirm this is intended.
        doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(GetHtmlStr(url, Encoding.UTF8));
        rootnode = doc.DocumentNode;
    }

    // Each query returns a collection; entries are matched up across the collections by index.
    HtmlNodeCollection titles = rootnode.SelectNodes("//span[@class='list-title-size']");
    HtmlNodeCollection ess = rootnode.SelectNodes("//div[@class='cell-flat-content list-content-size']");
    HtmlNodeCollection img = rootnode.SelectNodes("//div[@class='image-container undefined ']");
    HtmlNodeCollection date = rootnode.SelectNodes("//div[@class='cell-flat-time']");
    HtmlNodeCollection rc = rootnode.SelectNodes("//div[@class='cell-flat-eye']");
    HtmlNodeCollection detail = rootnode.SelectNodes("//a[@class='cell-flat']");

    // titles drives the loop and detail is read on every iteration, so both are required up front.
    if (titles == null || detail == null)
    {
        throw new InvalidOperationException("No article list entries found -- " + url);
    }

    // Image containers that precede the first list entry on the page.
    // NOTE(review): 7 and 9 are layout-dependent magic offsets -- verify against the current markup.
    int coverOffset = _IsFromIQB ? 9 : 7;

    using (Html5Content db = new Html5Content())
    {
        for (int i = 0; i < titles.Count; i++)
        {
            string href = detail[i].Attributes["href"].Value;
            string detailUrl = "http://hjc025.xiaoyun.com" + href;

            if (this.IsFilterUrls(href))
            {
                WriteOut("Url Filted -- " + detailUrl);
                continue;
            }

            if (db.IsExistSummery(href))
            {
                WriteOut("Url Existed -- " + detailUrl);
                continue;
            }

            EInfoSummery es = new EInfoSummery();
            es.ReadCount = Convert.ToInt32(rc[i].InnerText.Replace("阅读", ""));
            es.Title = titles[i].InnerText;
            es.Summery = ess[i].InnerText;
            es.PublishDate = this.FormatPublishDate(date[i].InnerText);
            es.CoverImg = img[i + coverOffset].Attributes["src"].Value;
            es.OrigInfoId = href;
            es.CreateDateTime = DateTime.Now;
            db.InfoSummery.Add(es);

            // Summary and detail are committed together, one transaction per entry.
            using (TransactionScope sc = new TransactionScope())
            {
                // Save the summary first: AnalyDetail copies es.InfoId into the detail record.
                // NOTE(review): presumably InfoId is database-generated on this save -- confirm.
                db.SaveChanges();

                EInfoDetail ed = this.AnalyDetail(detailUrl, es);
                db.InfoDetail.Add(ed);
                db.SaveChanges();

                sc.Complete();
            }

            WriteOut("Analy Url -- " + detailUrl + " Done");
        }
    }
}