Example #1
        private static void saveCrawlInDatabase(crawlingResult crawledResult)
        {
            lock (_lockDatabaseAdd)
            {
                using (var context = new DBCrawling())
                {
                    crawledResult.UrlHash = crawledResult.Url.ComputeHashOfOurSystem();
                    crawledResult.HostUrl = crawledResult.Url.returnRootUrl();
                    var vrResult = context.tblMainUrls.SingleOrDefault(b => b.UrlHash == crawledResult.UrlHash);
                    crawledResult.ParentUrlHash = crawledResult.ParentUrlHash.ComputeHashOfOurSystem();

                    if (crawledResult.blcrawlSuccess == true)
                    {
                        crawledResult.IsCrawled = true;
                        if (!string.IsNullOrEmpty(crawledResult.SourceCode))
                        {
                            double dblOriginalSourceCodeLength = crawledResult.SourceCode.Length;
                            crawledResult.SourceCode         = crawledResult.SourceCode.CompressString();
                            crawledResult.CompressionPercent = Convert.ToByte(
                                Math.Floor(
                                    ((crawledResult.SourceCode.Length.ToDouble() / dblOriginalSourceCodeLength) * 100))
                                );
                        }
                        crawledResult.CrawlTryCounter = 0;
                    }


                    tblMainUrl finalObject = crawledResult.converToBaseMainUrlClass();

                    //deleting the existing row first and re-inserting it adds extra overhead on the server,
                    //so instead copy the properties onto the existing tracked object without changing its reference
                    //if (vrResult != null)
                    //{
                    //    context.tblMainUrls.Remove(vrResult);
                    //    context.SaveChanges();
                    //}



                    if (vrResult != null)
                    {
                        finalObject.DiscoverDate    = vrResult.DiscoverDate;
                        finalObject.LinkDepthLevel  = vrResult.LinkDepthLevel;
                        finalObject.CrawlTryCounter = vrResult.CrawlTryCounter;
                        if (crawledResult.blcrawlSuccess == false)
                        {
                            finalObject.CrawlTryCounter++;
                        }
                        finalObject.CopyProperties(vrResult);
                    }
                    else
                    {
                        context.tblMainUrls.Add(finalObject);
                    }


                    context.SaveChanges();
                }
            }
        }
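The update path above relies on a CopyProperties helper that is not shown in the example. Purely as an assumption, a minimal sketch of such an extension could copy the public properties of the freshly built object onto the tracked entity by reflection, so the entity keeps its reference while receiving the new values:

using System.Reflection;

public static class ObjectCopyExtensions
{
    // Hypothetical sketch of the CopyProperties helper used above (not the original implementation):
    // copy every readable public property from source onto the matching writable property of target.
    public static void CopyProperties(this object source, object target)
    {
        if (source == null || target == null)
        {
            return;
        }

        foreach (PropertyInfo sourceProperty in source.GetType().GetProperties(BindingFlags.Public | BindingFlags.Instance))
        {
            PropertyInfo targetProperty = target.GetType().GetProperty(sourceProperty.Name);
            if (targetProperty != null && targetProperty.CanWrite && sourceProperty.CanRead)
            {
                targetProperty.SetValue(target, sourceProperty.GetValue(source));
            }
        }
    }
}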
        public static void clearDatabase()
        {
            using (var context = new DBCrawling())
            {
                var ctx = ((System.Data.Entity.Infrastructure.IObjectContextAdapter)context).ObjectContext;
                ctx.ExecuteStoreCommand("truncate table tblMainUrls");
            }
        }
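On EF6 the same TRUNCATE can be issued without casting down to the ObjectContext. As a sketch of an equivalent version, assuming the same DBCrawling context:

        public static void clearDatabase()
        {
            using (var context = new DBCrawling())
            {
                // Database.ExecuteSqlCommand sends the raw SQL directly,
                // so no cast to IObjectContextAdapter is needed.
                context.Database.ExecuteSqlCommand("TRUNCATE TABLE tblMainUrls");
            }
        }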
        private void btnTest_Click(object sender, RoutedEventArgs e)
        {
            using (DBCrawling db = new DBCrawling())
            {
                db.tblMainUrls.RemoveRange(db.tblMainUrls);
                db.SaveChanges();

                db.tblMainUrls.Add(new tblMainUrl {
                    Url = "www.toros.edu.tr", ParentUrlHash = "www.toros.edu.tr", SourceCode = "gg", UrlHash = "ww"
                });
                db.SaveChanges();
            }
        }
        private static void saveCrawlInDatabase(crawlingResult crawledResult)
        {
            if (crawledResult.blcrawlSuccess == false)
            {
                return;
            }

            using (var context = new DBCrawling())
            {
                tblMainUrl crawledUrl = new tblMainUrl();

                crawledUrl.UrlHash = crawledResult.srCrawledUrl.normalizeUrl().ComputeSha256Hash();

                var vrResult = context.tblMainUrls.SingleOrDefault(b => b.UrlHash == crawledUrl.UrlHash);

                if (vrResult == null)
                {
                    context.tblMainUrls.Add(crawledUrl);
                }
                else
                {
                    crawledUrl = vrResult;
                    context.tblMainUrls.Attach(crawledUrl);
                    context.Entry(crawledUrl).State = EntityState.Modified;
                }

                crawledUrl.DiscoverDate       = crawledResult.dtDiscoverDate;
                crawledUrl.FetchTimeMS        = crawledResult.irCrawlingTimeMS;
                crawledUrl.LastCrawlingDate   = crawledResult.dtLastCrawlingDate;
                crawledUrl.LinkDepthLevel     = crawledResult.irLinkDepthLevel;
                crawledUrl.PageTile           = crawledResult.srTitleofPage;
                crawledUrl.ParentUrlHash      = crawledResult.srParentUrlHash.normalizeUrl().ComputeSha256Hash();
                crawledUrl.SourceCode         = crawledResult.srCrawledSourceCode.CompressString();
                crawledUrl.CompressionPercent = Convert.ToByte(
                    Math.Floor(
                        ((crawledUrl.SourceCode.Length.ToDouble() / crawledResult.srCrawledSourceCode.Length.ToDouble()) * 100))
                    );
                crawledUrl.Url = crawledResult.srCrawledUrl;



                context.SaveChanges();
            }
        }
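Both save methods lean on string helpers (ComputeSha256Hash, CompressString, ToDouble) whose implementations are not shown. A minimal sketch of what they might look like, assuming GZip plus Base64 for the compression and a hex-encoded SHA-256 for the hash (names match the calls above, but the bodies are an assumption):

using System;
using System.IO;
using System.IO.Compression;
using System.Security.Cryptography;
using System.Text;

public static class CrawlerStringExtensions
{
    // Hypothetical sketch of the hashing helper: hex-encoded SHA-256 of the UTF-8 bytes.
    public static string ComputeSha256Hash(this string input)
    {
        using (var sha = SHA256.Create())
        {
            byte[] bytes = sha.ComputeHash(Encoding.UTF8.GetBytes(input ?? string.Empty));
            return BitConverter.ToString(bytes).Replace("-", string.Empty);
        }
    }

    // Hypothetical sketch of the compression helper: GZip the UTF-8 bytes and Base64-encode the result.
    public static string CompressString(this string input)
    {
        byte[] raw = Encoding.UTF8.GetBytes(input ?? string.Empty);
        using (var output = new MemoryStream())
        {
            using (var gzip = new GZipStream(output, CompressionMode.Compress))
            {
                gzip.Write(raw, 0, raw.Length);
            }
            return Convert.ToBase64String(output.ToArray());
        }
    }

    // Hypothetical sketch of the numeric helper used in the compression-percent calculation.
    public static double ToDouble(this int value)
    {
        return (double)value;
    }
}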
Example #5
        private static void saveDiscoveredLinksInDatabaseForFutureCrawling(crawlingResult crawlResult)
        {
            lock (_lockDatabaseAdd)
            {
                using (var context = new DBCrawling())
                {
                    HashSet<string> hsProcessedUrls = new HashSet<string>();

                    foreach (var vrPerLink in crawlResult.lstDiscoveredLinks)
                    {
                        var vrHashedLink = vrPerLink.ComputeHashOfOurSystem();

                        if (hsProcessedUrls.Contains(vrHashedLink))
                        {
                            continue;
                        }

                        var vrResult = context.tblMainUrls.Any(databaseRecord => databaseRecord.UrlHash == vrHashedLink);

                        if (vrResult == false)
                        {
                            crawlingResult newLinkCrawlingResult = new crawlingResult();
                            newLinkCrawlingResult.Url            = vrPerLink.normalizeUrl();
                            newLinkCrawlingResult.HostUrl        = newLinkCrawlingResult.Url.returnRootUrl();
                            newLinkCrawlingResult.UrlHash        = vrPerLink.ComputeHashOfOurSystem();
                            newLinkCrawlingResult.ParentUrlHash  = crawlResult.UrlHash;
                            newLinkCrawlingResult.LinkDepthLevel = (short)(crawlResult.LinkDepthLevel + 1);
                            context.tblMainUrls.Add(newLinkCrawlingResult.converToBaseMainUrlClass());
                            hsProcessedUrls.Add(vrHashedLink);
                            Interlocked.Increment(ref irDiscoveredUrlCount);
                        }
                    }

                    context.SaveChanges();
                }
            }
        }
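The loop above issues one Any() round trip per discovered link. As a sketch of a possible optimization (not part of the original code), the already-known hashes could be fetched in a single Contains query, assuming the same DBCrawling context and ComputeHashOfOurSystem extension; requires System.Collections.Generic and System.Linq:

        private static HashSet<string> findAlreadyStoredHashes(DBCrawling context, IEnumerable<string> discoveredLinks)
        {
            // Hash every discovered link once, then ask the database for the hashes it already
            // has in a single query instead of one Any() per link.
            List<string> hashedLinks = discoveredLinks
                .Select(link => link.ComputeHashOfOurSystem())
                .Distinct()
                .ToList();

            return new HashSet<string>(
                context.tblMainUrls
                       .Where(record => hashedLinks.Contains(record.UrlHash))
                       .Select(record => record.UrlHash));
        }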
        private void startPollingAwaitingURLs(object sender, EventArgs e)
        {
            lock (UserLogs)
            {
                string srPerMinCrawlingspeed = (irCrawledUrlCount.ToDouble() / (DateTime.Now - dtStartDate).TotalMinutes).ToString("N2");

                string srPerMinDiscoveredLinkSpeed = (irDiscoveredUrlCount.ToDouble() / (DateTime.Now - dtStartDate).TotalMinutes).ToString("N2");

                string srPassedTime = (DateTime.Now - dtStartDate).TotalMinutes.ToString("N2");

                UserLogs.Insert(0, $"{DateTime.Now} polling awaiting urls \t processing: {blBeingProcessed} \t number of crawling tasks: {lstCrawlingTasks.Count}");

                UserLogs.Insert(0, $"Total Time: {srPassedTime} Minutes \t Total Crawled Links Count: {irCrawledUrlCount.ToString("N0")} \t Crawling Speed Per Minute: {srPerMinCrawlingspeed} \t Total Discovered Links : {irDiscoveredUrlCount.ToString("N0")} \t Discovered Url Speed: {srPerMinDiscoveredLinkSpeed} ");
            }

            logMesssage($"polling awaiting urls \t processing: {blBeingProcessed} \t number of crawling tasks: {lstCrawlingTasks.Count}");

            if (blBeingProcessed)
            {
                return;
            }

            lock (_lock_CrawlingSync)
            {
                blBeingProcessed = true;

                lstCrawlingTasks = lstCrawlingTasks.Where(pr => pr.Status != TaskStatus.RanToCompletion && pr.Status != TaskStatus.Faulted).ToList();

                int irTasksCountToStart = _irNumberOfTotalConcurrentCrawling - lstCrawlingTasks.Count;

                if (irTasksCountToStart > 0)
                {
                    using (DBCrawling db = new DBCrawling())
                    {
                        var vrReturnedList = db.tblMainUrls.Where(x => x.IsCrawled == false && x.CrawlTryCounter < _irMaximumTryCount)
                                             .OrderBy(pr => pr.DiscoverDate)
                                             .Select(x => new
                        {
                            x.Url,
                            x.LinkDepthLevel
                        }).Take(irTasksCountToStart * 2).ToList();

                        logMesssage(string.Join(" , ", vrReturnedList.Select(pr => pr.Url)));

                        foreach (var vrPerReturned in vrReturnedList)
                        {
                            var vrUrlToCrawl = vrPerReturned.Url;
                            int irDepth      = vrPerReturned.LinkDepthLevel;
                            lock (lstCurrentlyCrawlingUrls)
                            {
                                if (lstCurrentlyCrawlingUrls.Contains(vrUrlToCrawl))
                                {
                                    logMesssage($"bypass url since already crawling: \t {vrUrlToCrawl}");
                                    continue;
                                }
                                lstCurrentlyCrawlingUrls.Add(vrUrlToCrawl);
                            }

                            logMesssage($"starting crawling url: \t {vrUrlToCrawl}");

                            lock (UserLogs)
                            {
                                UserLogs.Insert(0, $"{DateTime.Now} starting crawling url: \t {vrUrlToCrawl}");
                            }

                            var vrStartedTask = Task.Factory.StartNew(() => { crawlPage(vrUrlToCrawl, irDepth, null, DateTime.MinValue); }).ContinueWith((pr) =>
                            {
                                lock (lstCurrentlyCrawlingUrls)
                                {
                                    lstCurrentlyCrawlingUrls.Remove(vrUrlToCrawl);
                                    logMesssage($"removing url from list since task completed: \t {vrUrlToCrawl}");
                                }
                            });
                            lstCrawlingTasks.Add(vrStartedTask);

                            if (lstCrawlingTasks.Count > _irNumberOfTotalConcurrentCrawling)
                            {
                                break;
                            }
                        }
                    }
                }

                blBeingProcessed = false;
            }
        }
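The polling handler above caps concurrency by pruning and counting lstCrawlingTasks. Purely as an alternative sketch (not the original design), the same cap could be enforced with a SemaphoreSlim, assuming _irNumberOfTotalConcurrentCrawling is a static int and crawlPage keeps its existing signature; requires System.Threading and System.Threading.Tasks:

        // Hypothetical alternative: one semaphore slot per allowed concurrent crawl.
        private static readonly SemaphoreSlim _crawlSlots = new SemaphoreSlim(_irNumberOfTotalConcurrentCrawling);

        private static async Task crawlWithLimitAsync(string urlToCrawl, int depth)
        {
            // Wait for a free slot, run the existing crawlPage, then release the slot.
            await _crawlSlots.WaitAsync();
            try
            {
                await Task.Run(() => crawlPage(urlToCrawl, depth, null, DateTime.MinValue));
            }
            finally
            {
                _crawlSlots.Release();
            }
        }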