// Persists the outcome of one crawl attempt into tblMainUrls.
// Fills in UrlHash, HostUrl and ParentUrlHash on crawledResult, compresses the page source
// on success, then either updates the existing row with the same UrlHash or inserts a new one.
// The whole operation runs under _lockDatabaseAdd.
private static void saveCrawlInDatabase(crawlingResult crawledResult)
{
    lock (_lockDatabaseAdd)
    {
        using (var context = new DBCrawling())
        {
            crawledResult.UrlHash = crawledResult.Url.ComputeHashOfOurSystem();
            crawledResult.HostUrl = crawledResult.Url.returnRootUrl();

            // Look the row up through a plain local string so the query predicate
            // does not depend on the crawledResult instance.
            string urlHash = crawledResult.UrlHash;
            var existingRecord = context.tblMainUrls.SingleOrDefault(b => b.UrlHash == urlHash);

            // At this point ParentUrlHash holds whatever the caller stored in it; it is hashed here.
            // NOTE(review): the value is hashed unconditionally, so ComputeHashOfOurSystem must
            // tolerate a null input when no parent was set - confirm.
            crawledResult.ParentUrlHash = crawledResult.ParentUrlHash.ComputeHashOfOurSystem();

            if (crawledResult.blcrawlSuccess)
            {
                crawledResult.IsCrawled = true;

                if (!string.IsNullOrEmpty(crawledResult.SourceCode))
                {
                    double originalLength = crawledResult.SourceCode.Length;
                    crawledResult.SourceCode = crawledResult.SourceCode.CompressString();

                    // Compressed size as a percentage of the original size. For very short pages
                    // the compressed text can be several times larger than the original, which
                    // would exceed the byte range and make Convert.ToByte throw, so clamp it.
                    double percent = Math.Floor((crawledResult.SourceCode.Length / originalLength) * 100);
                    crawledResult.CompressionPercent = Convert.ToByte(Math.Min(percent, byte.MaxValue));
                }

                crawledResult.CrawlTryCounter = 0;
            }

            tblMainUrl finalObject = crawledResult.converToBaseMainUrlClass();

            if (existingRecord != null)
            {
                // Deleting and re-inserting the row would cost an extra round trip, so the new
                // values are copied onto the already tracked entity instead.
                finalObject.DiscoverDate = existingRecord.DiscoverDate;
                finalObject.LinkDepthLevel = existingRecord.LinkDepthLevel;

                if (crawledResult.blcrawlSuccess)
                {
                    // A successful crawl resets the retry counter; the stored value must not
                    // overwrite that reset.
                    finalObject.CrawlTryCounter = 0;
                }
                else
                {
                    // A failed crawl continues counting from the stored number of attempts.
                    finalObject.CrawlTryCounter = existingRecord.CrawlTryCounter;
                    finalObject.CrawlTryCounter++;
                }

                finalObject.CopyProperties(existingRecord);
            }
            else
            {
                context.tblMainUrls.Add(finalObject);
            }

            context.SaveChanges();
        }
    }
}
// Collects the outgoing http/https links and the title of a downloaded page.
// Every anchor href is resolved against the page URL, its fragment (#...) is dropped and the
// result is appended to myCrawlingResult.lstDiscoveredLinks; the list is then de-duplicated
// and limited to links shorter than 201 characters. The decoded, trimmed title text is stored
// in myCrawlingResult.PageTile.
private static void extractLinks(crawlingResult myCrawlingResult, HtmlDocument doc)
{
    var baseUri = new Uri(myCrawlingResult.Url);

    // XPath: every anchor element that carries an href attribute.
    var anchorNodes = doc.DocumentNode.SelectNodes("//a[@href]");
    if (anchorNodes != null)
    {
        foreach (HtmlNode link in anchorNodes)
        {
            HtmlAttribute att = link.Attributes["href"];

            // Resolve relative paths to absolute ones. A single malformed href must not abort
            // the extraction of the whole page, so unparsable values are skipped.
            Uri absoluteUri;
            if (!Uri.TryCreate(baseUri, att.Value.ToString().decodeUrl(), out absoluteUri))
            {
                continue;
            }

            string absoluteUrl = absoluteUri.ToString();
            bool isWebLink = absoluteUrl.StartsWith("http://", StringComparison.Ordinal)
                || absoluteUrl.StartsWith("https://", StringComparison.Ordinal);
            if (!isWebLink)
            {
                // mailto:, javascript:, ftp: and similar targets are not crawled.
                continue;
            }

            // Only the part before the fragment identifies the document.
            myCrawlingResult.lstDiscoveredLinks.Add(absoluteUrl.Split('#').FirstOrDefault());
        }
    }

    myCrawlingResult.lstDiscoveredLinks = myCrawlingResult.lstDiscoveredLinks
        .Distinct()
        .Where(pr => pr.Length < 201)
        .ToList();

    // The title stays null when the document has no title element.
    var pageTitle = doc.DocumentNode.SelectSingleNode("//title")?.InnerText.ToString().Trim();
    pageTitle = System.Net.WebUtility.HtmlDecode(pageTitle);
    myCrawlingResult.PageTile = pageTitle;
}
// Downloads one page, stores the crawl outcome and, when the download succeeded,
// extracts the page's links and queues them for future crawling.
//   srUrlToCrawl     - URL to download.
//   irUrlDepthLevel  - link depth of this URL; discovered links are stored one level deeper.
//   _srParentUrl     - URL of the page on which this URL was found; may be null or empty.
//   _dtDiscoverDate  - discover date to keep; DateTime.MinValue means "not provided".
// Download errors are logged and recorded as a failed crawl; they are not rethrown.
public static void crawlPage(string srUrlToCrawl, int irUrlDepthLevel, string _srParentUrl, DateTime _dtDiscoverDate)
{
    crawlingResult crawlResult = new crawlingResult();
    crawlResult.Url = srUrlToCrawl;

    // Without this the depth parameter is never used, and discovered links are stored
    // relative to an unset depth instead of this page's depth.
    crawlResult.LinkDepthLevel = (short)irUrlDepthLevel;

    if (!string.IsNullOrEmpty(_srParentUrl))
    {
        // The raw parent URL is stored here; saveCrawlInDatabase turns it into the hash.
        crawlResult.ParentUrlHash = _srParentUrl;
    }

    if (_dtDiscoverDate != DateTime.MinValue)
    {
        crawlResult.DiscoverDate = _dtDiscoverDate;
    }

    Stopwatch swTimerCrawling = Stopwatch.StartNew();

    // you should use httpwebrequest for more control and better performance
    HtmlWeb wbClient = new HtmlWeb();
    wbClient.AutoDetectEncoding = true;
    wbClient.BrowserTimeout = new TimeSpan(0, 2, 0);

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    try
    {
        doc = wbClient.Load(crawlResult.Url);
        crawlResult.SourceCode = doc.Text;
    }
    catch (Exception E)
    {
        crawlResult.blcrawlSuccess = false;
        logError(E, "crawlPage");
    }

    swTimerCrawling.Stop();
    Interlocked.Increment(ref irCrawledUrlCount);

    crawlResult.FetchTimeMS = Convert.ToInt32(swTimerCrawling.ElapsedMilliseconds);
    crawlResult.LastCrawlingDate = DateTime.Now;

    // Must run before the discovered links are saved: it sets crawlResult.UrlHash,
    // which the discovered links use as their parent hash.
    saveCrawlInDatabase(crawlResult);

    if (crawlResult.blcrawlSuccess)
    {
        extractLinks(crawlResult, doc);
        saveDiscoveredLinksInDatabaseForFutureCrawling(crawlResult);
    }
}
// Queues the links found on a crawled page for future crawling.
// For every discovered link whose hash is not yet present in tblMainUrls a new row is added
// with the crawled page's UrlHash as parent and a depth one level below the crawled page.
// All inserts are written with a single SaveChanges call under _lockDatabaseAdd.
private static void saveDiscoveredLinksInDatabaseForFutureCrawling(crawlingResult crawlResult)
{
    // Nothing to store: do not take the global database lock or open a context.
    if (crawlResult.lstDiscoveredLinks == null || crawlResult.lstDiscoveredLinks.Count == 0)
    {
        return;
    }

    lock (_lockDatabaseAdd)
    {
        using (var context = new DBCrawling())
        {
            // Hashes already handled in this batch. Rows added to the context are not visible
            // to the database query below until SaveChanges runs, so duplicates inside the
            // batch have to be tracked here.
            HashSet<string> hsProcessedUrls = new HashSet<string>();

            foreach (var vrPerLink in crawlResult.lstDiscoveredLinks)
            {
                var vrHashedLink = vrPerLink.ComputeHashOfOurSystem();

                // Add returns false when the hash was seen before, whether it turned out to be
                // new or already stored, so each hash is looked up at most once.
                if (!hsProcessedUrls.Add(vrHashedLink))
                {
                    continue;
                }

                bool alreadyStored = context.tblMainUrls.Any(databaseRecord => databaseRecord.UrlHash == vrHashedLink);
                if (alreadyStored)
                {
                    continue;
                }

                crawlingResult newLinkCrawlingResult = new crawlingResult();
                newLinkCrawlingResult.Url = vrPerLink.normalizeUrl();
                newLinkCrawlingResult.HostUrl = newLinkCrawlingResult.Url.returnRootUrl();
                // Reuse the hash computed above instead of hashing the same link again.
                newLinkCrawlingResult.UrlHash = vrHashedLink;
                newLinkCrawlingResult.ParentUrlHash = crawlResult.UrlHash;
                newLinkCrawlingResult.LinkDepthLevel = (short)(crawlResult.LinkDepthLevel + 1);

                context.tblMainUrls.Add(newLinkCrawlingResult.converToBaseMainUrlClass());
                Interlocked.Increment(ref irDiscoveredUrlCount);
            }

            context.SaveChanges();
        }
    }
}