/// <summary>
/// Collects the absolute http/https targets of every anchor element that carries an
/// href attribute in an already-loaded document, together with the document's title.
/// </summary>
/// <param name="srUrl">Absolute URL of the document; used as the base when resolving relative hrefs.</param>
/// <param name="doc">Parsed HTML document to scan.</param>
/// <returns>
/// A <c>perDocument</c> whose <c>lstExtractedUrls</c> is the de-duplicated list of link
/// targets with any <c>#fragment</c> removed, and whose <c>srDocTitle</c> is the trimmed,
/// HTML-decoded title text (null when the document has no title element).
/// </returns>
/// <exception cref="UriFormatException">Thrown when <paramref name="srUrl"/> itself is not a valid absolute URI.</exception>
private static perDocument extractLinks(string srUrl, HtmlDocument doc)
{
    var baseUri = new Uri(srUrl);
    perDocument myDoc = new perDocument();

    // SelectNodes yields null rather than an empty collection when nothing matches.
    var vrNodes = doc.DocumentNode.SelectNodes("//a[@href]");
    if (vrNodes != null)
    {
        foreach (HtmlNode link in vrNodes)
        {
            HtmlAttribute att = link.Attributes["href"];

            // Resolve relative hrefs against the page URL. TryCreate is used so that a
            // single malformed href is skipped instead of throwing and discarding every
            // link on the page.
            Uri absoluteUri;
            if (!Uri.TryCreate(baseUri, att.Value, out absoluteUri))
            {
                continue;
            }

            // Only web links are of interest; mailto:, javascript:, ftp: etc. are dropped.
            bool isWebLink = absoluteUri.Scheme == Uri.UriSchemeHttp
                          || absoluteUri.Scheme == Uri.UriSchemeHttps;
            if (!isWebLink)
            {
                continue;
            }

            // Strip the fragment so that page#a and page#b collapse to the same URL.
            myDoc.lstExtractedUrls.Add(absoluteUri.ToString().Split('#')[0]);
        }
    }

    myDoc.lstExtractedUrls = myDoc.lstExtractedUrls.Distinct().ToList();

    // InnerText may contain entities such as &amp;; decode them for display/storage.
    var vrDocTitle = doc.DocumentNode.SelectSingleNode("//title")?.InnerText.Trim();
    myDoc.srDocTitle = System.Net.WebUtility.HtmlDecode(vrDocTitle);

    return myDoc;
}
/// <summary>
/// Fetches one page, records how long the fetch took, extracts its links and title
/// when the fetch did not fail, and passes the result to <c>saveCrawlInDatabase</c>.
/// </summary>
/// <param name="srUrlToCrawl">URL of the page to download.</param>
/// <param name="irUrlDepthLevel">Depth level of the URL; not read by this method.</param>
/// <param name="_srParentUrl">Value stored in the result's <c>srParentUrlHash</c>.</param>
/// <param name="_dtDiscoverDate">Value stored in the result's <c>dtDiscoverDate</c>.</param>
public static void crawlPage(string srUrlToCrawl, int irUrlDepthLevel, string _srParentUrl, DateTime _dtDiscoverDate)
{
    crawlingResult pageResult = new crawlingResult
    {
        srCrawledUrl = srUrlToCrawl,
        srParentUrlHash = _srParentUrl,
        dtDiscoverDate = _dtDiscoverDate
    };

    Stopwatch fetchTimer = Stopwatch.StartNew();

    // you should use httpwebrequest for more control and better performance
    HtmlWeb fetcher = new HtmlWeb { AutoDetectEncoding = true };
    HtmlAgilityPack.HtmlDocument loadedDoc = new HtmlAgilityPack.HtmlDocument();

    try
    {
        loadedDoc = fetcher.Load(pageResult.srCrawledUrl);
        pageResult.srCrawledSourceCode = loadedDoc.Text;
    }
    catch (Exception ex)
    {
        // A failed download is logged and recorded on the result; the result is still saved below.
        pageResult.blcrawlSuccess = false;
        logError(ex, "crawlPage");
    }

    fetchTimer.Stop();
    pageResult.irCrawlingTimeMS = Convert.ToInt32(fetchTimer.ElapsedMilliseconds);

    // blcrawlSuccess is only ever cleared here; its initial value comes from crawlingResult.
    if (pageResult.blcrawlSuccess)
    {
        perDocument extracted = extractLinks(pageResult.srCrawledUrl, loadedDoc);
        pageResult.lstDiscoveredLinks.AddRange(extracted.lstExtractedUrls);
        pageResult.srTitleofPage = extracted.srDocTitle;
    }

    saveCrawlInDatabase(pageResult);
}
/// <summary>
/// Writes a timestamped line for the URL to <c>swCrawling</c>, downloads the page, and
/// collects the absolute http/https targets of every anchor element that carries an
/// href attribute, together with the page title.
/// </summary>
/// <param name="srUrl">Absolute URL of the page to download; also the base for resolving relative hrefs.</param>
/// <returns>
/// A <c>perDocument</c> whose <c>lstExtractedUrls</c> is the de-duplicated list of link
/// targets (fragments are kept as written) and whose <c>srDocTitle</c> is the trimmed,
/// HTML-decoded title text (null when the page has no title element).
/// </returns>
/// <exception cref="UriFormatException">Thrown when <paramref name="srUrl"/> is not a valid absolute URI.</exception>
private static perDocument extractLinks(string srUrl)
{
    // The writer is shared, so the write is serialized on the writer instance.
    lock (swCrawling)
    {
        swCrawling.WriteLine(DateTime.Now + "\t" + srUrl);
    }

    var baseUri = new Uri(srUrl);

    HtmlWeb web = new HtmlWeb();
    web.AutoDetectEncoding = true;
    HtmlAgilityPack.HtmlDocument doc = web.Load(srUrl);

    perDocument myDoc = new perDocument();

    // SelectNodes yields null rather than an empty collection when the page has no
    // matching anchors; iterating it unguarded would throw NullReferenceException.
    var vrNodes = doc.DocumentNode.SelectNodes("//a[@href]");
    if (vrNodes != null)
    {
        foreach (HtmlNode link in vrNodes)
        {
            HtmlAttribute att = link.Attributes["href"];

            // Resolve relative hrefs against the page URL. TryCreate is used so that a
            // single malformed href is skipped instead of throwing and discarding every
            // link on the page.
            Uri absoluteUri;
            if (!Uri.TryCreate(baseUri, att.Value, out absoluteUri))
            {
                continue;
            }

            // Only web links are of interest; mailto:, javascript:, ftp: etc. are dropped.
            bool isWebLink = absoluteUri.Scheme == Uri.UriSchemeHttp
                          || absoluteUri.Scheme == Uri.UriSchemeHttps;
            if (!isWebLink)
            {
                continue;
            }

            myDoc.lstExtractedUrls.Add(absoluteUri.ToString());
        }
    }

    myDoc.lstExtractedUrls = myDoc.lstExtractedUrls.Distinct().ToList();

    // InnerText may contain entities such as &amp;; decode them for display/storage.
    var vrDocTitle = doc.DocumentNode.SelectSingleNode("//title")?.InnerText.Trim();
    myDoc.srDocTitle = System.Net.WebUtility.HtmlDecode(vrDocTitle);

    return myDoc;
}