/// <summary>
        /// Collects the distinct absolute http/https links and the page title from an already-loaded document.
        /// </summary>
        /// <param name="srUrl">Absolute URL the document was loaded from; relative hrefs are resolved against it.</param>
        /// <param name="doc">Parsed HTML document to scan for anchors and the title element.</param>
        /// <returns>
        /// A perDocument whose lstExtractedUrls holds the de-duplicated links (with any "#fragment" removed)
        /// and whose srDocTitle holds the trimmed, HTML-decoded title, or null when the page has no title element.
        /// </returns>
        private static perDocument extractLinks(string srUrl, HtmlDocument doc)
        {
            var baseUri = new Uri(srUrl);

            perDocument myDoc = new perDocument();

            // SelectNodes yields null rather than an empty collection when nothing matches.
            var vrNodes = doc.DocumentNode.SelectNodes("//a[@href]");

            if (vrNodes != null)
            {
                foreach (HtmlNode link in vrNodes)
                {
                    HtmlAttribute att = link.Attributes["href"];

                    // Resolve relative hrefs against the page URL. TryCreate is used so that a single
                    // malformed href is skipped instead of aborting extraction for the whole page.
                    Uri absoluteUri;
                    if (!Uri.TryCreate(baseUri, att.Value, out absoluteUri))
                    {
                        continue;
                    }

                    // Keep only web links (drops mailto:, javascript:, ftp:, ...).
                    if (absoluteUri.Scheme != Uri.UriSchemeHttp && absoluteUri.Scheme != Uri.UriSchemeHttps)
                    {
                        continue;
                    }

                    // Everything after '#' addresses a position inside the same page, so cut it off
                    // to avoid recording the same page several times.
                    myDoc.lstExtractedUrls.Add(absoluteUri.ToString().Split('#')[0]);
                }
            }

            myDoc.lstExtractedUrls = myDoc.lstExtractedUrls.Distinct().ToList();

            // InnerText may contain entities such as &amp;; decode them for storage.
            var vrDocTitle = doc.DocumentNode.SelectSingleNode("//title")?.InnerText.Trim();

            myDoc.srDocTitle = System.Net.WebUtility.HtmlDecode(vrDocTitle);
            return myDoc;
        }
        /// <summary>
        /// Downloads one page, records how long the download took and whether it succeeded,
        /// extracts links and title from the page on success, and passes the result to saveCrawlInDatabase.
        /// </summary>
        /// <param name="srUrlToCrawl">URL of the page to download.</param>
        /// <param name="irUrlDepthLevel">Depth level of the URL; not read anywhere in this method.</param>
        /// <param name="_srParentUrl">Value stored in the result's srParentUrlHash field.</param>
        /// <param name="_dtDiscoverDate">Value stored in the result's dtDiscoverDate field.</param>
        public static void crawlPage(string srUrlToCrawl, int irUrlDepthLevel, string _srParentUrl, DateTime _dtDiscoverDate)
        {
            crawlingResult result = new crawlingResult
            {
                srCrawledUrl    = srUrlToCrawl,
                srParentUrlHash = _srParentUrl,
                dtDiscoverDate  = _dtDiscoverDate
            };

            // The measured interval covers client setup and the download itself.
            Stopwatch fetchTimer = Stopwatch.StartNew();

            HtmlWeb fetcher = new HtmlWeb { AutoDetectEncoding = true };

            // Starts as an empty document; replaced by the downloaded one when Load succeeds.
            HtmlAgilityPack.HtmlDocument page = new HtmlAgilityPack.HtmlDocument();

            try
            {
                page = fetcher.Load(result.srCrawledUrl);
                result.srCrawledSourceCode = page.Text;
            }
            catch (Exception fetchError)
            {
                // A failed download is logged and flagged; the result is still saved below.
                result.blcrawlSuccess = false;
                logError(fetchError, "crawlPage");
            }

            fetchTimer.Stop();
            result.irCrawlingTimeMS = Convert.ToInt32(fetchTimer.ElapsedMilliseconds);

            if (result.blcrawlSuccess)
            {
                perDocument extracted = extractLinks(result.srCrawledUrl, page);

                result.srTitleofPage = extracted.srDocTitle;
                result.lstDiscoveredLinks.AddRange(extracted.lstExtractedUrls);
            }

            saveCrawlInDatabase(result);
        }
        /// <summary>
        /// Logs the URL to swCrawling, downloads the page, and collects its distinct absolute
        /// http/https links together with the page title.
        /// </summary>
        /// <param name="srUrl">Absolute URL of the page to download; relative hrefs are resolved against it.</param>
        /// <returns>
        /// A perDocument whose lstExtractedUrls holds the de-duplicated links (fragments are kept as written)
        /// and whose srDocTitle holds the trimmed, HTML-decoded title, or null when the page has no title element.
        /// </returns>
        private static perDocument extractLinks(string srUrl)
        {
            // The writer is locked on because it is shared; each line is "timestamp<TAB>url".
            lock (swCrawling)
            {
                swCrawling.WriteLine(DateTime.Now + "\t" + srUrl);
            }

            var     baseUri = new Uri(srUrl);
            HtmlWeb web     = new HtmlWeb();

            web.AutoDetectEncoding = true;
            HtmlAgilityPack.HtmlDocument doc = web.Load(srUrl);

            perDocument myDoc = new perDocument();

            // SelectNodes yields null rather than an empty collection when the page has no
            // matching anchors, so it must be checked before iterating.
            var vrNodes = doc.DocumentNode.SelectNodes("//a[@href]");

            if (vrNodes != null)
            {
                foreach (HtmlNode link in vrNodes)
                {
                    HtmlAttribute att = link.Attributes["href"];

                    // Resolve relative hrefs against the page URL. TryCreate is used so that a single
                    // malformed href is skipped instead of aborting extraction for the whole page.
                    Uri absoluteUri;
                    if (!Uri.TryCreate(baseUri, att.Value, out absoluteUri))
                    {
                        continue;
                    }

                    // Keep only web links (drops mailto:, javascript:, ftp:, ...).
                    if (absoluteUri.Scheme != Uri.UriSchemeHttp && absoluteUri.Scheme != Uri.UriSchemeHttps)
                    {
                        continue;
                    }

                    myDoc.lstExtractedUrls.Add(absoluteUri.ToString());
                }
            }

            myDoc.lstExtractedUrls = myDoc.lstExtractedUrls.Distinct().ToList();

            // InnerText may contain entities such as &amp;; decode them for storage.
            var vrDocTitle = doc.DocumentNode.SelectSingleNode("//title")?.InnerText.Trim();

            myDoc.srDocTitle = System.Net.WebUtility.HtmlDecode(vrDocTitle);
            return myDoc;
        }