// Thread-safe map from URL hash to its CrawlUrl entry.
// NOTE(review): presumably holds the URLs currently being processed (inferred from the name only) — confirm against usages.
private readonly ConcurrentDictionary<string, CrawlUrl> working = new ConcurrentDictionary<string, CrawlUrl>(); // {hash, url}

        #endregion Fields

        #region Methods

        /// <summary>
        /// Records <paramref name="key"/> in the <c>done</c> set and, if it was not recorded before,
        /// removes the same key from <c>scheduled</c>.
        /// </summary>
        /// <param name="key">Key of the URL that has been completed.</param>
        /// <param name="crawlUrl">Only used as the target of the removal's out argument; the incoming value is not read.</param>
        public void Done(string key, CrawlUrl crawlUrl)
        {
            var firstCompletion = done.TryAdd(key, "");
            if (!firstCompletion)
            {
                // Key was already recorded as done; leave the scheduled set untouched.
                return;
            }

            // The removed entry is written into the local parameter and then discarded.
            scheduled.TryRemove(key, out crawlUrl);
        }
 /// <summary>
 /// Records <paramref name="key"/> in the <c>done</c> set and, if it was not recorded before,
 /// removes the same key from <c>scheduled</c>.
 /// </summary>
 /// <param name="key">Key of the URL that has been completed.</param>
 /// <param name="crawlUrl">Only used as the target of the removal's out argument; the incoming value is not read.</param>
 public void Done(string key, CrawlUrl crawlUrl)
 {
     var firstCompletion = done.TryAdd(key, "");
     if (!firstCompletion)
     {
         // Key was already recorded as done; leave the scheduled set untouched.
         return;
     }

     // The removed entry is written into the local parameter and then discarded.
     scheduled.TryRemove(key, out crawlUrl);
 }
        /// <summary>
        /// Moves a URL from the working hash to the done hash: the entry is removed from the hash
        /// identified by <c>WorkingHashId</c> and, only when that removal succeeds, stored in the hash
        /// identified by <c>DoneHashId</c> unless an entry with the same hash already exists there.
        /// </summary>
        /// <param name="key">Not used by this implementation; <c>crawlUrl.Hash</c> is used as the hash field.</param>
        /// <param name="crawlUrl">The URL to mark as done.</param>
        public void Done(string key, CrawlUrl crawlUrl)
        {
            var crawlUrlClient = Client.As<CrawlUrl>();

            var workingHash = crawlUrlClient.GetHash<string>(WorkingHashId);
            if (!crawlUrlClient.RemoveEntryFromHash(workingHash, crawlUrl.Hash))
            {
                // Nothing was removed from the working hash, so there is nothing to move.
                return;
            }

            var doneHash = crawlUrlClient.GetHash<string>(DoneHashId);
            crawlUrlClient.SetEntryInHashIfNotExists(doneHash, crawlUrl.Hash, crawlUrl);
        }
Beispiel #4
0
        /// <summary>
        /// Invokes <c>PageProcessing</c> with <paramref name="crawlUrl"/> when it is not null.
        /// </summary>
        /// <param name="crawlUrl">The URL passed to the handler.</param>
        private void RaisePageProcessing(CrawlUrl crawlUrl)
        {
            // Local copy so the null check and the call use the same reference.
            var subscribers = PageProcessing;
            if (subscribers == null)
            {
                return;
            }

            subscribers.Invoke(crawlUrl);
        }
Beispiel #5
0
        /// <summary>
        /// Invokes <c>PageScheduled</c> with <paramref name="crawlUrl"/> when it is not null.
        /// </summary>
        /// <param name="crawlUrl">The URL passed to the handler.</param>
        private void RaisePageScheduled(CrawlUrl crawlUrl)
        {
            // Local copy so the null check and the call use the same reference.
            var subscribers = PageScheduled;
            if (subscribers == null)
            {
                return;
            }

            subscribers.Invoke(crawlUrl);
        }
        /// <summary>
        /// Returns the item at the head of <c>scheduledQueue</c> without removing it. When the first
        /// peek fails, <c>LoadMore()</c> is called once and the peek is retried.
        /// </summary>
        /// <returns>
        /// The peeked item, or whatever the second <c>TryPeek</c> leaves in its out argument when it
        /// also fails (presumably null — confirm against the queue type).
        /// </returns>
        public CrawlUrl PeekNext()
        {
            CrawlUrl head;

            if (!scheduledQueue.TryPeek(out head))
            {
                // Queue looked empty: refill once, then peek again regardless of the outcome.
                LoadMore();
                scheduledQueue.TryPeek(out head);
            }

            return head;
        }
Beispiel #7
0
        /// <summary>
        /// Schedules the given URLs for crawling. Each URL is cut at its first '#', stripped of
        /// trailing '/' characters, dropped if blank, de-duplicated and keyed by its hash. URLs whose
        /// hash the repository already knows, or that match no website definition, are skipped.
        /// </summary>
        /// <param name="crawlUrls">Raw URL strings to schedule.</param>
        /// <returns>The number of URLs that were newly added to the repository.</returns>
        public int Schedule(IEnumerable <string> crawlUrls)
        {
            // NOTE(review): ToDictionary throws if two distinct URLs yield the same hash, and that
            // happens outside the per-URL try/catch below — confirm this is acceptable.
            var urlsByHash = crawlUrls
                .Select(raw => raw.Split('#')[0].TrimEnd('/'))
                .Where(trimmed => !string.IsNullOrWhiteSpace(trimmed))
                .Distinct()
                .ToDictionary(urlHasher.CalculateHashAsString);
            var newlyScheduled = 0;

            foreach (var entry in urlsByHash)
            {
                if (crawlUrlRepository.IsKnown(entry.Key))
                {
                    continue;
                }

                try
                {
                    var crawlUrl = new CrawlUrl
                    {
                        Hash = entry.Key,
                        Url  = entry.Value,
                    };

                    // First website definition whose site considers this URL one of its own.
                    var owner = websiteDefinitions.FirstOrDefault(definition => definition.Website.IsRelativeUrl(crawlUrl));
                    if (owner == null)
                    {
                        continue;
                    }

                    crawlUrl.WebsiteDefinition = owner;
                    if (!crawlUrlRepository.TryAdd(entry.Key, crawlUrl))
                    {
                        continue;
                    }

                    Interlocked.Increment(ref owner.UrlsToProcessCount);
                    Interlocked.Increment(ref newlyScheduled);

                    // Posting directly to the website's processing block is disabled here;
                    // only the scheduled notification is raised.
                    RaisePageScheduled(crawlUrl);
                }
                catch (Exception ex)
                {
                    // A failure on one URL is logged and does not stop the remaining URLs.
                    log.Error(ex);
                }
            }

            return newlyScheduled;
        }
 /// <summary>
 /// Attempts to add <paramref name="crawlUrl"/> to <c>scheduled</c> under <paramref name="key"/>.
 /// </summary>
 /// <param name="key">Key to store the URL under.</param>
 /// <param name="crawlUrl">The URL to store.</param>
 /// <returns>The result of <c>scheduled.TryAdd</c>.</returns>
 public bool TryAdd(string key, CrawlUrl crawlUrl)
 {
     var wasAdded = scheduled.TryAdd(key, crawlUrl);
     return wasAdded;
 }
 /// <summary>
 /// Posts <paramref name="crawlUrl"/> to <c>ProcessingBlock</c> and reports the block's input count afterwards.
 /// </summary>
 /// <param name="crawlUrl">The URL to post.</param>
 /// <returns>The value of <c>ProcessingBlock.InputCount</c> read after posting.</returns>
 public int Post(CrawlUrl crawlUrl)
 {
     // The result of Post itself is not inspected.
     ProcessingBlock.Post(crawlUrl);

     var pendingCount = ProcessingBlock.InputCount;
     return pendingCount;
 }
 /// <summary>
 /// Attempts to add <paramref name="crawlUrl"/> to <c>scheduled</c> under <paramref name="key"/>.
 /// </summary>
 /// <param name="key">Key to store the URL under.</param>
 /// <param name="crawlUrl">The URL to store.</param>
 /// <returns>The result of <c>scheduled.TryAdd</c>.</returns>
 public bool TryAdd(string key, CrawlUrl crawlUrl)
 {
     var wasAdded = scheduled.TryAdd(key, crawlUrl);
     return wasAdded;
 }
Beispiel #11
0
 /// <summary>
 /// Invokes <c>PageScheduled</c> with <paramref name="crawlUrl"/> when it is not null.
 /// </summary>
 /// <param name="crawlUrl">The URL passed to the handler.</param>
 private void RaisePageScheduled(CrawlUrl crawlUrl)
 {
     // Local copy so the null check and the call use the same reference.
     var subscribers = PageScheduled;
     if (subscribers == null)
     {
         return;
     }

     subscribers.Invoke(crawlUrl);
 }
Beispiel #12
0
        /// <summary>
        /// Schedules the given URLs for crawling. Each URL is cut at its first '#', stripped of
        /// trailing '/' characters, dropped if blank, de-duplicated and keyed by its hash. URLs whose
        /// hash the repository already knows, or that match no website definition, are skipped.
        /// </summary>
        /// <param name="crawlUrls">Raw URL strings to schedule.</param>
        /// <returns>The number of URLs that were newly added to the repository.</returns>
        public int Schedule(IEnumerable<string> crawlUrls)
        {
            // NOTE(review): ToDictionary throws if two distinct URLs yield the same hash, and that
            // happens outside the per-URL try/catch below — confirm this is acceptable.
            var normalizedUrls = crawlUrls
                .Select(raw => raw.Split('#')[0].TrimEnd('/'))
                .Where(candidate => !string.IsNullOrWhiteSpace(candidate))
                .Distinct();
            var urlsByHash = normalizedUrls.ToDictionary(urlHasher.CalculateHashAsString);
            var addedCount = 0;

            foreach (var pair in urlsByHash)
            {
                if (crawlUrlRepository.IsKnown(pair.Key))
                {
                    continue;
                }

                try
                {
                    var crawlUrl = new CrawlUrl
                    {
                        Hash = pair.Key,
                        Url = pair.Value,
                    };

                    // First website definition whose site considers this URL one of its own.
                    var matchedDefinition = websiteDefinitions.FirstOrDefault(candidate => candidate.Website.IsRelativeUrl(crawlUrl));
                    if (matchedDefinition == null)
                    {
                        continue;
                    }

                    crawlUrl.WebsiteDefinition = matchedDefinition;
                    if (!crawlUrlRepository.TryAdd(pair.Key, crawlUrl))
                    {
                        continue;
                    }

                    Interlocked.Increment(ref matchedDefinition.UrlsToProcessCount);
                    Interlocked.Increment(ref addedCount);

                    // Posting directly to the website's processing block is disabled here;
                    // only the scheduled notification is raised.
                    RaisePageScheduled(crawlUrl);
                }
                catch (Exception ex)
                {
                    // A failure on one URL is logged and does not stop the remaining URLs.
                    log.Error(ex);
                }
            }

            return addedCount;
        }
        /// <summary>
        /// Attempts to register a URL as scheduled. The URL is stored in the hash identified by
        /// <c>ScheduledHashId</c> only if no entry exists for <paramref name="key"/>; when it was
        /// stored, <paramref name="key"/> is also appended to the list identified by <c>ScheduledListId</c>.
        /// </summary>
        /// <param name="key">Hash field (and list entry) identifying the URL.</param>
        /// <param name="crawlUrl">The URL to store.</param>
        /// <returns>
        /// <c>true</c> when the entry was newly stored; <c>false</c> when an entry for
        /// <paramref name="key"/> already existed, in which case nothing is changed.
        /// </returns>
        public bool TryAdd(string key, CrawlUrl crawlUrl)
        {
            var typedClient = Client.As<CrawlUrl>();

            // Store the hash entry first and honour its result: previously the key was appended to the
            // list unconditionally and the method always reported success, so an already-scheduled
            // key produced a duplicate list entry and was counted as newly scheduled by callers.
            if (!typedClient.SetEntryInHashIfNotExists(typedClient.GetHash<string>(ScheduledHashId), key, crawlUrl))
            {
                return false;
            }

            Client.As<string>().Lists[ScheduledListId].Add(key);

            return true;
        }
Beispiel #14
0
        /// <summary>
        /// Tells whether <paramref name="crawlUrl"/> lies under this website's root, i.e. whether its
        /// absolute URI starts with the absolute URI of <c>rootUri</c>.
        /// </summary>
        /// <param name="crawlUrl">The URL to test.</param>
        /// <returns><c>true</c> when the URL's absolute URI begins with the root's absolute URI.</returns>
        public bool IsRelativeUrl(CrawlUrl crawlUrl)
        {
            // Ordinal comparison: URLs are machine identifiers, so the prefix check must not depend on
            // the current culture (the single-argument StartsWith overload is culture-sensitive).
            return crawlUrl.Uri.AbsoluteUri.StartsWith(rootUri.AbsoluteUri, System.StringComparison.Ordinal);

            // Alternative left disabled in the original source:
            //			return rootUri.IsBaseOf(crawlUrl.Uri);
        }
Beispiel #15
0
 /// <summary>
 /// Invokes <c>PageProcessing</c> with <paramref name="crawlUrl"/> when it is not null.
 /// </summary>
 /// <param name="crawlUrl">The URL passed to the handler.</param>
 private void RaisePageProcessing(CrawlUrl crawlUrl)
 {
     // Local copy so the null check and the call use the same reference.
     var subscribers = PageProcessing;
     if (subscribers == null)
     {
         return;
     }

     subscribers.Invoke(crawlUrl);
 }
 /// <summary>
 /// Posts <paramref name="crawlUrl"/> to <c>ProcessingBlock</c> and reports the block's input count afterwards.
 /// </summary>
 /// <param name="crawlUrl">The URL to post.</param>
 /// <returns>The value of <c>ProcessingBlock.InputCount</c> read after posting.</returns>
 public int Post(CrawlUrl crawlUrl)
 {
     // The result of Post itself is not inspected.
     ProcessingBlock.Post(crawlUrl);

     var pendingCount = ProcessingBlock.InputCount;
     return pendingCount;
 }
Beispiel #17
0
        /// <summary>
        /// Tells whether <paramref name="crawlUrl"/> lies under this website's root, i.e. whether its
        /// absolute URI starts with the absolute URI of <c>rootUri</c>.
        /// </summary>
        /// <param name="crawlUrl">The URL to test.</param>
        /// <returns><c>true</c> when the URL's absolute URI begins with the root's absolute URI.</returns>
        public bool IsRelativeUrl(CrawlUrl crawlUrl)
        {
            // Ordinal comparison: URLs are machine identifiers, so the prefix check must not depend on
            // the current culture (the single-argument StartsWith overload is culture-sensitive).
            return crawlUrl.Uri.AbsoluteUri.StartsWith(rootUri.AbsoluteUri, System.StringComparison.Ordinal);

            // Alternative left disabled in the original source:
//			return rootUri.IsBaseOf(crawlUrl.Uri);
        }