/// <summary>
/// Reads an HTML listing page from disk and builds one <see cref="OriginRow"/> for every
/// node matched by <c>//article/div[@class='list']/div[@class='item']</c>.
/// </summary>
/// <param name="path">Path of the HTML file to parse.</param>
/// <returns>The extracted rows; empty when the page contains no matching item.</returns>
/// <remarks>
/// An item whose summary or details markup is missing is still returned, with the
/// corresponding properties left unset, instead of aborting the whole parse.
/// </remarks>
public async Task<IEnumerable<OriginRow>> CrawlerMaster(string path)
{
    List<OriginRow> rows = new List<OriginRow>();

    string html = await File.ReadAllTextAsync(path);
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    HtmlNodeCollection items = doc.DocumentNode.SelectNodes("//article/div[@class='list']/div[@class='item']");
    if (items == null)
    {
        return rows;
    }

    foreach (HtmlNode item in items)
    {
        OriginRow row = new OriginRow();

        HtmlNode summary = item.SelectSingleNode("./div[@class='item-summary']");
        if (summary != null)
        {
            row.Summary = WebUtility.HtmlDecode(summary.InnerText).Trim();
        }

        // SelectNodes yields null (not an empty collection) when nothing matches,
        // so the details list must be checked before it is enumerated.
        HtmlNodeCollection details = item.SelectNodes("./div[@class='item-details']/p");
        if (details != null)
        {
            foreach (HtmlNode detail in details)
            {
                // Each detail paragraph is read as a label span followed by a value span.
                HtmlNode keyNode = detail.SelectSingleNode("./span[1]");
                HtmlNode valueNode = detail.SelectSingleNode("./span[2]");
                if (keyNode == null || valueNode == null)
                {
                    continue;
                }

                string key = keyNode.InnerText.Trim();
                string value = valueNode.InnerText.Trim();

                switch (key)
                {
                    // Recognised labels whose values are not stored on the row.
                    case "Notice Type:":
                    case "Approval Number:":
                        break;
                    case "Executing Agency:":
                        row.ExecutingAgency = value;
                        break;
                    case "Contractor Name:":
                        row.ContractorName = value;
                        break;
                    case "Address:":
                        row.ContractorAddress = value;
                        break;
                    case "Total Contract Amount (US$):":
                        row.TotalContractAmount = value;
                        break;
                    case "Contract Amount Financed by ADB (US$):":
                        row.FinancedByAdb = value;
                        break;
                }
            }
        }

        rows.Add(row);
    }

    return rows;
}
/// <summary>
/// Downloads the listing page at <paramref name="url"/>, converts every
/// <c>//tbody[@id='posts']/tr</c> table row into an <see cref="OriginRow"/> and appends it
/// to <paramref name="rows"/>. For each row that carries a link, the linked page is
/// crawled via <c>CrawlerDetail</c> and stored in <c>ProjectDetail</c>.
/// </summary>
/// <param name="rows">Destination list; parsed rows are appended to it.</param>
/// <param name="url">Address of the listing page to download.</param>
/// <returns>
/// <c>true</c> when the page contained at least one table row; <c>false</c> when the
/// download returned <c>null</c> or no row matched.
/// </returns>
/// <remarks>
/// Cells that are missing from a row leave the matching property <c>null</c>. A row
/// without a link anchor (or without an <c>href</c>) is still added, but no detail page
/// is requested for it.
/// </remarks>
public async Task<bool> CrawlerMaster(List<OriginRow> rows, string url)
{
    string html = await GetHtml(url);
    if (html == null)
    {
        return false;
    }

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    HtmlNodeCollection items = doc.DocumentNode.SelectNodes("//tbody[@id='posts']/tr");
    if (items == null || !items.Any())
    {
        return false;
    }

    foreach (HtmlNode item in items)
    {
        OriginRow row = new OriginRow();
        row.IssueDate = item.SelectSingleNode("./td[1]")?.InnerText.Trim();
        row.ClosingDate = item.SelectSingleNode("./td[2]")?.InnerText.Trim();
        row.Location = item.SelectSingleNode("./td[3]")?.InnerText.Trim();

        // The fourth cell holds the anchor that supplies both the name and the link;
        // look it up once and reuse it.
        HtmlNode anchor = item.SelectSingleNode("./td[4]/a");
        row.ProjectName = anchor?.InnerText.Trim();

        string rawHref = anchor?.Attributes["href"]?.Value;
        if (rawHref != null)
        {
            row.ProjectLink = ResolveProjectLink(WebUtility.HtmlDecode(rawHref));
            row.ProjectDetail = await CrawlerDetail(row.ProjectLink);
        }

        row.Sector = item.SelectSingleNode("./td[5]")?.InnerText.Trim();
        row.Contract = item.SelectSingleNode("./td[6]")?.InnerText.Trim();
        row.Type = item.SelectSingleNode("./td[7]")?.InnerText.Trim();

        rows.Add(row);
    }

    return true;
}

/// <summary>
/// Turns an <c>href</c> taken from the listing into an absolute URL, resolving
/// protocol-relative, root-relative and page-relative links against
/// <c>https://www.ebrd.com</c>.
/// </summary>
private static string ResolveProjectLink(string href)
{
    // URL prefixes are not linguistic text, so compare ordinally rather than with the
    // current culture.
    if (href.StartsWith("//", System.StringComparison.Ordinal))
    {
        return $"https:{href}";
    }

    if (href.StartsWith("/", System.StringComparison.Ordinal))
    {
        return $"https://www.ebrd.com{href}";
    }

    // URI schemes are case-insensitive, so "HTTP://..." is already absolute as well.
    if (href.StartsWith("http:", System.StringComparison.OrdinalIgnoreCase)
        || href.StartsWith("https:", System.StringComparison.OrdinalIgnoreCase))
    {
        return href;
    }

    return $"https://www.ebrd.com/{href}";
}
/// <summary>
/// Downloads the listing page at <paramref name="url"/>, stores a copy of the raw HTML
/// under <c>D:\temp\output\html</c>, and appends one <see cref="OriginRow"/> to
/// <paramref name="rows"/> for every node matched by
/// <c>//article/div[@class='list']/div[@class='item']</c>.
/// </summary>
/// <param name="rows">Destination list; parsed rows are appended to it.</param>
/// <param name="url">
/// Address of the listing page. The text after the first <c>?</c> names the saved copy.
/// </param>
/// <returns>
/// <c>true</c> when the page contained at least one item; <c>false</c> when the download
/// returned <c>null</c> or no item matched.
/// </returns>
/// <remarks>
/// Saving the copy is best-effort: an I/O or permission failure is traced as a warning
/// and does not stop the crawl. An item whose summary or details markup is missing is
/// still added, with the corresponding properties left unset.
/// </remarks>
public async Task<bool> CrawlerMaster(List<OriginRow> rows, string url)
{
    string html = await GetHtml(url);
    if (html == null)
    {
        return false;
    }

    await SaveHtmlSnapshotAsync(url, html);

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    HtmlNodeCollection items = doc.DocumentNode.SelectNodes("//article/div[@class='list']/div[@class='item']");
    if (items == null || !items.Any())
    {
        return false;
    }

    foreach (HtmlNode item in items)
    {
        OriginRow row = new OriginRow();

        HtmlNode summary = item.SelectSingleNode("./div[@class='item-summary']");
        if (summary != null)
        {
            row.Summary = WebUtility.HtmlDecode(summary.InnerText).Trim();
        }

        // SelectNodes yields null (not an empty collection) when nothing matches,
        // so the details list must be checked before it is enumerated.
        HtmlNodeCollection details = item.SelectNodes("./div[@class='item-details']/p");
        if (details != null)
        {
            foreach (HtmlNode detail in details)
            {
                // Each detail paragraph is read as a label span followed by a value span.
                HtmlNode keyNode = detail.SelectSingleNode("./span[1]");
                HtmlNode valueNode = detail.SelectSingleNode("./span[2]");
                if (keyNode == null || valueNode == null)
                {
                    continue;
                }

                string key = keyNode.InnerText.Trim();
                string value = valueNode.InnerText.Trim();

                switch (key)
                {
                    // Recognised labels whose values are not stored on the row.
                    case "Notice Type:":
                    case "Approval Number:":
                        break;
                    case "Executing Agency:":
                        row.ExecutingAgency = value;
                        break;
                    case "Contractor Name:":
                        row.ContractorName = value;
                        break;
                    case "Address:":
                        row.ContractorAddress = value;
                        break;
                    case "Total Contract Amount (US$):":
                        row.TotalContractAmount = value;
                        break;
                    case "Contract Amount Financed by ADB (US$):":
                        row.FinancedByAdb = value;
                        break;
                }
            }
        }

        rows.Add(row);
    }

    return true;
}

/// <summary>
/// Writes <paramref name="html"/> to <c>D:\temp\output\html\{query}.html</c>, where
/// <c>{query}</c> is the part of <paramref name="url"/> after the first <c>?</c> with
/// characters that are not valid in a file name replaced by <c>_</c>.
/// </summary>
private static async Task SaveHtmlSnapshotAsync(string url, string html)
{
    // IndexOf returns -1 when there is no '?', which makes the whole URL the stem;
    // the sanitising loop below keeps that case from producing an unusable path.
    string fileStem = url.Substring(url.IndexOf('?') + 1);
    foreach (char invalid in System.IO.Path.GetInvalidFileNameChars())
    {
        fileStem = fileStem.Replace(invalid, '_');
    }

    string savePath = System.IO.Path.Combine(@"D:\temp\output\html", $"{fileStem}.html");

    try
    {
        // Await the write so that a failure is observed here instead of being lost
        // with an abandoned task.
        await System.IO.File.WriteAllTextAsync(savePath, html);
    }
    catch (System.Exception ex) when (ex is System.IO.IOException || ex is System.UnauthorizedAccessException)
    {
        System.Diagnostics.Trace.TraceWarning($"Could not save HTML snapshot to '{savePath}': {ex.Message}");
    }
}