private static void saveCrawlInDatabase(crawlingResult crawledResult)
{
    lock (_lockDatabaseAdd)
    {
        using (var context = new DBCrawling())
        {
            crawledResult.UrlHash = crawledResult.Url.ComputeHashOfOurSystem();
            crawledResult.HostUrl = crawledResult.Url.returnRootUrl();

            var vrResult = context.tblMainUrls.SingleOrDefault(b => b.UrlHash == crawledResult.UrlHash);

            crawledResult.ParentUrlHash = crawledResult.ParentUrlHash.ComputeHashOfOurSystem();

            if (crawledResult.blcrawlSuccess)
            {
                crawledResult.IsCrawled = true;

                if (!string.IsNullOrEmpty(crawledResult.SourceCode))
                {
                    // compress the source code and record what percentage of the original length remains
                    double dblOriginalSourceCodeLength = crawledResult.SourceCode.Length;
                    crawledResult.SourceCode = crawledResult.SourceCode.CompressString();
                    crawledResult.CompressionPercent = Convert.ToByte(
                        Math.Floor((crawledResult.SourceCode.Length.ToDouble() / dblOriginalSourceCodeLength) * 100));
                }

                crawledResult.CrawlTryCounter = 0;
            }

            tblMainUrl finalObject = crawledResult.converToBaseMainUrlClass();

            // deleting the existing row and re-inserting it puts extra load on the server,
            // so instead copy the new values onto the already tracked entity without changing its reference
            //if (vrResult != null)
            //{
            //    context.tblMainUrls.Remove(vrResult);
            //    context.SaveChanges();
            //}

            if (vrResult != null)
            {
                // preserve the values that must survive a re-crawl
                finalObject.DiscoverDate = vrResult.DiscoverDate;
                finalObject.LinkDepthLevel = vrResult.LinkDepthLevel;
                finalObject.CrawlTryCounter = vrResult.CrawlTryCounter;

                if (!crawledResult.blcrawlSuccess)
                {
                    finalObject.CrawlTryCounter++;
                }

                // copy onto the entity the context is already tracking
                finalObject.CopyProperties(vrResult);
            }
            else
            {
                context.tblMainUrls.Add(finalObject);
            }

            var gg = context.SaveChanges();
        }
    }
}
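The CompressString extension used above (and in the later version of saveCrawlInDatabase) is not shown. Since CompressionPercent compares string lengths before and after compression, a minimal sketch of what such a helper might look like follows, assuming GZip compression with a Base64-encoded result so the compressed data can still be stored in a string column; the project's actual implementation may differ.

using System;
using System.IO;
using System.IO.Compression;
using System.Text;

public static class StringCompressionExtensions
{
    // Hypothetical sketch: GZip-compress the text and return it Base64-encoded
    // so it fits in a string/nvarchar column.
    public static string CompressString(this string srText)
    {
        byte[] inputBytes = Encoding.UTF8.GetBytes(srText);

        using (var outputStream = new MemoryStream())
        {
            using (var gzip = new GZipStream(outputStream, CompressionMode.Compress))
            {
                gzip.Write(inputBytes, 0, inputBytes.Length);
            }

            return Convert.ToBase64String(outputStream.ToArray());
        }
    }

    // Hypothetical counterpart for reading the stored source code back.
    public static string DecompressString(this string srCompressedText)
    {
        byte[] compressedBytes = Convert.FromBase64String(srCompressedText);

        using (var inputStream = new MemoryStream(compressedBytes))
        using (var gzip = new GZipStream(inputStream, CompressionMode.Decompress))
        using (var reader = new StreamReader(gzip, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }
}

Note that Base64 encoding adds roughly a third on top of the compressed byte length, so for short pages the "compressed" string can end up longer than the original; the real helper may store bytes instead.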
public static void clearDatabase()
{
    using (var context = new DBCrawling())
    {
        var ctx = ((System.Data.Entity.Infrastructure.IObjectContextAdapter)context).ObjectContext;
        ctx.ExecuteStoreCommand("truncate table tblMainUrls");
    }
}
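If the project is on EF 6, the same raw SQL can be issued without dropping down to the ObjectContext, since DbContext.Database exposes ExecuteSqlCommand. A minimal sketch (the method name clearDatabaseAlternative is only illustrative):

public static void clearDatabaseAlternative()
{
    using (var context = new DBCrawling())
    {
        // DbContext.Database runs raw SQL directly in EF 6, so the
        // IObjectContextAdapter cast is not needed for this.
        // TRUNCATE TABLE requires ALTER permission on the table,
        // unlike a plain DELETE / RemoveRange.
        context.Database.ExecuteSqlCommand("truncate table tblMainUrls");
    }
}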
private void btnTest_Click(object sender, RoutedEventArgs e)
{
    using (DBCrawling db = new DBCrawling())
    {
        // wipe the table and insert a single seed record for testing
        db.tblMainUrls.RemoveRange(db.tblMainUrls);
        db.SaveChanges();

        db.tblMainUrls.Add(new tblMainUrl
        {
            Url = "www.toros.edu.tr",
            ParentUrlHash = "www.toros.edu.tr",
            SourceCode = "gg",
            UrlHash = "ww"
        });
        db.SaveChanges();
    }
}
private static void saveCrawlInDatabase(crawlingResult crawledResult)
{
    // only successful crawls are persisted here
    if (!crawledResult.blcrawlSuccess)
    {
        return;
    }

    using (var context = new DBCrawling())
    {
        tblMainUrl crawledUrl = new tblMainUrl();
        crawledUrl.UrlHash = crawledResult.srCrawledUrl.normalizeUrl().ComputeSha256Hash();

        var vrResult = context.tblMainUrls.SingleOrDefault(b => b.UrlHash == crawledUrl.UrlHash);

        if (vrResult == null)
        {
            // first time this url is seen: insert a new row
            context.tblMainUrls.Add(crawledUrl);
        }
        else
        {
            // row already exists: reuse the tracked entity and mark it modified so an update is issued
            crawledUrl = vrResult;
            context.tblMainUrls.Attach(crawledUrl);
            context.Entry(crawledUrl).State = EntityState.Modified;
        }

        crawledUrl.DiscoverDate = crawledResult.dtDiscoverDate;
        crawledUrl.FetchTimeMS = crawledResult.irCrawlingTimeMS;
        crawledUrl.LastCrawlingDate = crawledResult.dtLastCrawlingDate;
        crawledUrl.LinkDepthLevel = crawledResult.irLinkDepthLevel;
        crawledUrl.PageTile = crawledResult.srTitleofPage;
        crawledUrl.ParentUrlHash = crawledResult.srParentUrlHash.normalizeUrl().ComputeSha256Hash();
        crawledUrl.SourceCode = crawledResult.srCrawledSourceCode.CompressString();
        crawledUrl.CompressionPercent = Convert.ToByte(
            Math.Floor((crawledUrl.SourceCode.Length.ToDouble() / crawledResult.srCrawledSourceCode.Length.ToDouble()) * 100));
        crawledUrl.Url = crawledResult.srCrawledUrl;

        var gg = context.SaveChanges();
    }
}
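The normalizeUrl and ComputeSha256Hash extensions are also not shown. A minimal sketch of what they might look like, assuming lower-cased, scheme-stripped urls hashed with SHA-256 and hex-encoded (the real helpers may normalize differently):

using System;
using System.Security.Cryptography;
using System.Text;

public static class UrlHashExtensions
{
    // Hypothetical sketch: lower-case the url and strip the scheme and a trailing slash
    // so that "http://Example.com/" and "example.com" map to the same hash.
    public static string normalizeUrl(this string srUrl)
    {
        string srNormalized = srUrl.Trim().ToLowerInvariant();

        if (srNormalized.StartsWith("http://"))
            srNormalized = srNormalized.Substring("http://".Length);
        else if (srNormalized.StartsWith("https://"))
            srNormalized = srNormalized.Substring("https://".Length);

        return srNormalized.TrimEnd('/');
    }

    // Hypothetical sketch: SHA-256 of the UTF-8 bytes, hex-encoded.
    public static string ComputeSha256Hash(this string srText)
    {
        using (var sha256 = SHA256.Create())
        {
            byte[] hashBytes = sha256.ComputeHash(Encoding.UTF8.GetBytes(srText));
            var sb = new StringBuilder(hashBytes.Length * 2);

            foreach (byte b in hashBytes)
            {
                sb.Append(b.ToString("x2"));
            }

            return sb.ToString();
        }
    }
}

A hex-encoded SHA-256 is 64 characters, so the UrlHash and ParentUrlHash columns would need at least that length.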
private static void saveDiscoveredLinksInDatabaseForFutureCrawling(crawlingResult crawlResult)
{
    lock (_lockDatabaseAdd)
    {
        using (var context = new DBCrawling())
        {
            // track hashes handled in this batch so duplicate links on the same page are skipped
            HashSet<string> hsProcessedUrls = new HashSet<string>();

            foreach (var vrPerLink in crawlResult.lstDiscoveredLinks)
            {
                var vrHashedLink = vrPerLink.ComputeHashOfOurSystem();

                if (hsProcessedUrls.Contains(vrHashedLink))
                {
                    continue;
                }

                // only queue links that are not already in the database
                var vrAlreadyKnown = context.tblMainUrls.Any(databaseRecord => databaseRecord.UrlHash == vrHashedLink);

                if (!vrAlreadyKnown)
                {
                    crawlingResult newLinkCrawlingResult = new crawlingResult();
                    newLinkCrawlingResult.Url = vrPerLink.normalizeUrl();
                    newLinkCrawlingResult.HostUrl = newLinkCrawlingResult.Url.returnRootUrl();
                    newLinkCrawlingResult.UrlHash = vrHashedLink;
                    newLinkCrawlingResult.ParentUrlHash = crawlResult.UrlHash;
                    newLinkCrawlingResult.LinkDepthLevel = (short)(crawlResult.LinkDepthLevel + 1);

                    context.tblMainUrls.Add(newLinkCrawlingResult.converToBaseMainUrlClass());
                    hsProcessedUrls.Add(vrHashedLink);
                    Interlocked.Increment(ref irDiscoveredUrlCount);
                }
            }

            context.SaveChanges();
        }
    }
}
private void startPollingAwaitingURLs(object sender, EventArgs e)
{
    lock (UserLogs)
    {
        string srPerMinCrawlingSpeed = (irCrawledUrlCount.ToDouble() / (DateTime.Now - dtStartDate).TotalMinutes).ToString("N2");
        string srPerMinDiscoveredLinkSpeed = (irDiscoveredUrlCount.ToDouble() / (DateTime.Now - dtStartDate).TotalMinutes).ToString("N2");
        string srPassedTime = (DateTime.Now - dtStartDate).TotalMinutes.ToString("N2");

        UserLogs.Insert(0, $"{DateTime.Now} polling awaiting urls \t processing: {blBeingProcessed} \t number of crawling tasks: {lstCrawlingTasks.Count}");
        UserLogs.Insert(0, $"Total Time: {srPassedTime} Minutes \t Total Crawled Links Count: {irCrawledUrlCount.ToString("N0")} \t Crawling Speed Per Minute: {srPerMinCrawlingSpeed} \t Total Discovered Links: {irDiscoveredUrlCount.ToString("N0")} \t Discovered Url Speed: {srPerMinDiscoveredLinkSpeed}");
    }

    logMesssage($"polling awaiting urls \t processing: {blBeingProcessed} \t number of crawling tasks: {lstCrawlingTasks.Count}");

    if (blBeingProcessed)
    {
        return;
    }

    lock (_lock_CrawlingSync)
    {
        blBeingProcessed = true;

        try
        {
            // drop finished and faulted tasks, then top the pool back up to the configured limit
            lstCrawlingTasks = lstCrawlingTasks.Where(pr => pr.Status != TaskStatus.RanToCompletion && pr.Status != TaskStatus.Faulted).ToList();

            int irTasksCountToStart = _irNumberOfTotalConcurrentCrawling - lstCrawlingTasks.Count;

            if (irTasksCountToStart > 0)
            {
                using (DBCrawling db = new DBCrawling())
                {
                    // oldest uncrawled urls that have not exceeded the retry limit
                    var vrReturnedList = db.tblMainUrls
                        .Where(x => x.IsCrawled == false && x.CrawlTryCounter < _irMaximumTryCount)
                        .OrderBy(pr => pr.DiscoverDate)
                        .Select(x => new { x.Url, x.LinkDepthLevel })
                        .Take(irTasksCountToStart * 2)
                        .ToList();

                    logMesssage(string.Join(" , ", vrReturnedList.Select(pr => pr.Url)));

                    foreach (var vrPerReturned in vrReturnedList)
                    {
                        var vrUrlToCrawl = vrPerReturned.Url;
                        int irDepth = vrPerReturned.LinkDepthLevel;

                        lock (lstCurrentlyCrawlingUrls)
                        {
                            if (lstCurrentlyCrawlingUrls.Contains(vrUrlToCrawl))
                            {
                                logMesssage($"bypass url since already crawling: \t {vrUrlToCrawl}");
                                continue;
                            }

                            lstCurrentlyCrawlingUrls.Add(vrUrlToCrawl);
                        }

                        logMesssage($"starting crawling url: \t {vrUrlToCrawl}");

                        lock (UserLogs)
                        {
                            UserLogs.Insert(0, $"{DateTime.Now} starting crawling url: \t {vrUrlToCrawl}");
                        }

                        var vrStartedTask = Task.Factory.StartNew(() =>
                        {
                            crawlPage(vrUrlToCrawl, irDepth, null, DateTime.MinValue);
                        }).ContinueWith(pr =>
                        {
                            lock (lstCurrentlyCrawlingUrls)
                            {
                                lstCurrentlyCrawlingUrls.Remove(vrUrlToCrawl);
                                logMesssage($"removing url from list since task completed: \t {vrUrlToCrawl}");
                            }
                        });

                        lstCrawlingTasks.Add(vrStartedTask);

                        if (lstCrawlingTasks.Count > _irNumberOfTotalConcurrentCrawling)
                        {
                            break;
                        }
                    }
                }
            }
        }
        finally
        {
            // reset the flag even if the database query or task start-up throws,
            // otherwise polling would stop permanently
            blBeingProcessed = false;
        }
    }
}
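The polling method above caps concurrency by pruning lstCrawlingTasks on every timer tick and only starting enough new tasks to reach _irNumberOfTotalConcurrentCrawling. A different way to express the same throttling idea is a SemaphoreSlim; the sketch below is only an alternative illustration (crawlWithThrottleAsync and the hard-coded limit of 10 are hypothetical), not the code above:

// Alternative sketch only: cap concurrent crawls with a SemaphoreSlim instead of
// pruning a List<Task> on every poll.
private readonly SemaphoreSlim _crawlSlots = new SemaphoreSlim(10); // assumed limit, mirrors _irNumberOfTotalConcurrentCrawling

private async Task crawlWithThrottleAsync(string srUrl, int irDepth)
{
    await _crawlSlots.WaitAsync(); // waits asynchronously once all slots are taken

    try
    {
        await Task.Run(() => crawlPage(srUrl, irDepth, null, DateTime.MinValue));
    }
    finally
    {
        _crawlSlots.Release(); // free the slot even if crawlPage throws
    }
}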