// Map of URL-hash -> CrawlUrl for pages currently being worked on.
private readonly ConcurrentDictionary<string, CrawlUrl> working = new ConcurrentDictionary<string, CrawlUrl>(); // {hash, url}

#endregion Fields

#region Methods

// Marks a URL as completed; only the first completion for a key also removes it
// from the scheduled set (done.TryAdd acts as the once-only gate).
// NOTE(review): this removes from `scheduled`, while the Redis variant removes
// from the Working hash — confirm which collection is intended here.
public void Done(string key, CrawlUrl crawlUrl)
{
    if (done.TryAdd(key, ""))
    {
        // The crawlUrl parameter is reused as the out target; its incoming value is discarded.
        scheduled.TryRemove(key, out crawlUrl);
    }
}
/// <summary>
/// Marks a URL as completed. Only the first caller for a given key wins the
/// done-marker and clears the corresponding scheduled entry.
/// </summary>
public void Done(string key, CrawlUrl crawlUrl)
{
    if (!done.TryAdd(key, ""))
    {
        return; // already marked done by another caller
    }

    CrawlUrl removed;
    scheduled.TryRemove(key, out removed);
}
/// <summary>
/// Moves a finished crawl URL from the Working hash to the Done hash in Redis.
/// Entries are keyed by <c>crawlUrl.Hash</c>; the <paramref name="key"/> parameter
/// is accepted for interface symmetry but not read here.
/// </summary>
public void Done(string key, CrawlUrl crawlUrl)
{
    var typedClient = Client.As<CrawlUrl>();

    var workingHash = typedClient.GetHash<string>(WorkingHashId);
    if (!typedClient.RemoveEntryFromHash(workingHash, crawlUrl.Hash))
    {
        // Nothing was removed, so some other worker already completed this URL.
        return;
    }

    var doneHash = typedClient.GetHash<string>(DoneHashId);
    typedClient.SetEntryInHashIfNotExists(doneHash, crawlUrl.Hash, crawlUrl);
}
/// <summary>
/// Fires the PageProcessing event for the given crawl URL, if anyone is subscribed.
/// </summary>
private void RaisePageProcessing(CrawlUrl crawlUrl)
{
    // Snapshot the delegate so a concurrent unsubscribe between the null
    // check and the call cannot cause a NullReferenceException.
    var subscribers = PageProcessing;
    if (subscribers != null)
    {
        subscribers(crawlUrl);
    }
}
/// <summary>
/// Fires the PageScheduled event for the given crawl URL, if anyone is subscribed.
/// </summary>
private void RaisePageScheduled(CrawlUrl crawlUrl)
{
    // Snapshot the delegate so a concurrent unsubscribe cannot null it mid-call.
    var subscribers = PageScheduled;
    if (subscribers != null)
    {
        subscribers(crawlUrl);
    }
}
/// <summary>
/// Returns the next scheduled crawl URL without dequeuing it. When the local
/// queue is empty, tries to refill it once via LoadMore before peeking again.
/// May return null if nothing is available even after the refill.
/// </summary>
public CrawlUrl PeekNext()
{
    CrawlUrl next;
    if (!scheduledQueue.TryPeek(out next))
    {
        LoadMore();
        // Second peek after the refill; `next` stays null if the queue is still empty.
        scheduledQueue.TryPeek(out next);
    }

    return next;
}
/// <summary>
/// Normalizes candidate URLs (strips the fragment and any trailing slash),
/// de-duplicates them, and schedules every URL that is not already known and
/// belongs to one of the configured websites.
/// </summary>
/// <param name="crawlUrls">Raw candidate URLs; blanks are ignored.</param>
/// <returns>The number of links newly scheduled by this call.</returns>
public int Schedule(IEnumerable<string> crawlUrls)
{
    // Normalize first so that fragment/slash variants of one page collapse together.
    var normalizedUrls = crawlUrls
        .Select(x => x.Split('#')[0].TrimEnd('/'))
        .Where(x => !string.IsNullOrWhiteSpace(x))
        .Distinct();

    // Build the hash -> url map by hand: ToDictionary would throw ArgumentException
    // if two distinct normalized URLs ever produced the same hash. First URL wins,
    // matching the "skip already-known" semantics below.
    var hashes = new Dictionary<string, string>();
    foreach (var url in normalizedUrls)
    {
        var hashKey = urlHasher.CalculateHashAsString(url);
        if (!hashes.ContainsKey(hashKey))
        {
            hashes.Add(hashKey, url);
        }
    }

    var scheduledLinksCount = 0;
    foreach (var hash in hashes)
    {
        if (crawlUrlRepository.IsKnown(hash.Key))
        {
            continue;
        }

        try
        {
            var crawlUrl = new CrawlUrl
            {
                Hash = hash.Key,
                Url = hash.Value,
            };

            // Only URLs under one of the configured website roots are scheduled.
            var websiteDefinition = websiteDefinitions.FirstOrDefault(x => x.Website.IsRelativeUrl(crawlUrl));
            if (websiteDefinition != null)
            {
                crawlUrl.WebsiteDefinition = websiteDefinition;
                if (crawlUrlRepository.TryAdd(hash.Key, crawlUrl))
                {
                    Interlocked.Increment(ref websiteDefinition.UrlsToProcessCount);
                    Interlocked.Increment(ref scheduledLinksCount);
                    /*
                     * var inputCount = websiteProcessingDefinitions[websiteDefinition].Post(crawlUrl);
                     * log.DebugFormat("Process block has {0} pending messages", inputCount);
                     * */
                    RaisePageScheduled(crawlUrl);
                }
            }
        }
        catch (Exception ex)
        {
            // Best-effort: a failure on one URL must not abort scheduling of the rest.
            log.Error(ex);
        }
    }

    return scheduledLinksCount;
}
/// <summary>
/// Attempts to register a crawl URL under its hash key.
/// Returns false when the key is already scheduled.
/// </summary>
public bool TryAdd(string key, CrawlUrl crawlUrl)
{
    var added = scheduled.TryAdd(key, crawlUrl);
    return added;
}
/// <summary>
/// Enqueues the crawl URL on the processing block and reports the block's
/// current backlog size.
/// </summary>
public int Post(CrawlUrl crawlUrl)
{
    ProcessingBlock.Post(crawlUrl);

    // NOTE(review): InputCount is sampled after posting, so with concurrent
    // producers/consumers it is only an approximation of the backlog.
    var pendingCount = ProcessingBlock.InputCount;
    return pendingCount;
}
/// <summary>
/// Attempts to schedule a crawl URL keyed by its hash; false if that key
/// is already present.
/// </summary>
public bool TryAdd(string key, CrawlUrl crawlUrl)
{
    // Delegate straight to the concurrent dictionary's atomic add.
    bool inserted = scheduled.TryAdd(key, crawlUrl);
    return inserted;
}
/// <summary>
/// Raises the PageScheduled event when there are subscribers.
/// </summary>
private void RaisePageScheduled(CrawlUrl crawlUrl)
{
    // Local copy guards against a race with unsubscription.
    var scheduledEvent = PageScheduled;
    if (scheduledEvent == null)
    {
        return;
    }

    scheduledEvent.Invoke(crawlUrl);
}
/// <summary>
/// Normalizes candidate URLs (strips the '#' fragment and any trailing slash),
/// de-duplicates them, and schedules every URL that is not already known and
/// that belongs to one of the configured websites. Returns the number of links
/// newly scheduled by this call.
/// </summary>
public int Schedule(IEnumerable<string> crawlUrls)
{
    // NOTE(review): ToDictionary throws ArgumentException if two distinct normalized
    // URLs produce the same hash key — consider a first-wins build instead.
    var hashes = crawlUrls.Select(x => x.Split('#')[0].TrimEnd('/')).Where(x => !string.IsNullOrWhiteSpace(x)).Distinct().ToDictionary(urlHasher.CalculateHashAsString);
    var scheduledLinksCount = 0;
    foreach (var hash in hashes)
    {
        // Skip URLs the repository has already seen.
        if (crawlUrlRepository.IsKnown(hash.Key)) continue;
        try
        {
            var crawlUrl = new CrawlUrl { Hash = hash.Key, Url = hash.Value, };
            // Only URLs under one of the configured website roots get scheduled.
            var websiteDefinition = websiteDefinitions.FirstOrDefault(x => x.Website.IsRelativeUrl(crawlUrl));
            if (websiteDefinition != null)
            {
                crawlUrl.WebsiteDefinition = websiteDefinition;
                if (crawlUrlRepository.TryAdd(hash.Key, crawlUrl))
                {
                    // Counters are shared across worker threads, hence Interlocked.
                    Interlocked.Increment(ref websiteDefinition.UrlsToProcessCount);
                    Interlocked.Increment(ref scheduledLinksCount);
                    /* var inputCount = websiteProcessingDefinitions[websiteDefinition].Post(crawlUrl); log.DebugFormat("Process block has {0} pending messages", inputCount); */
                    RaisePageScheduled(crawlUrl);
                }
            }
        }
        catch (Exception ex)
        {
            // Best-effort: log and keep scheduling the remaining URLs.
            log.Error(ex);
        }
    }
    return scheduledLinksCount;
}
/// <summary>
/// Attempts to schedule a crawl URL in Redis. The entry is written to the
/// Scheduled hash only if the key is not already present (HSETNX), and the key
/// is appended to the scheduled list only on an actual insert.
/// </summary>
/// <returns>True when the key was newly added; false when it already existed.</returns>
public bool TryAdd(string key, CrawlUrl crawlUrl)
{
    var typedClient = Client.As<CrawlUrl>();

    // SetEntryInHashIfNotExists returns false when the field already exists.
    // The original appended to the list unconditionally and always returned true,
    // which duplicated list entries and disagreed with the in-memory TryAdd semantics.
    var added = typedClient.SetEntryInHashIfNotExists(typedClient.GetHash<string>(ScheduledHashId), key, crawlUrl);
    if (added)
    {
        Client.As<string>().Lists[ScheduledListId].Add(key);
    }

    return added;
}
/// <summary>
/// Returns true when the crawl URL lives under this website's root URI,
/// determined by an ordinal prefix match on the absolute URI strings.
/// </summary>
public bool IsRelativeUrl(CrawlUrl crawlUrl)
{
    // Ordinal comparison: URI prefix matching is byte-wise, not linguistic.
    // The parameterless StartsWith is culture-sensitive (CA1310) and can
    // misbehave under some cultures (e.g. Turkish 'i').
    return crawlUrl.Uri.AbsoluteUri.StartsWith(rootUri.AbsoluteUri, StringComparison.Ordinal);
    // return rootUri.IsBaseOf(crawlUrl.Uri);
}
/// <summary>
/// Raises the PageProcessing event when there are subscribers.
/// </summary>
private void RaisePageProcessing(CrawlUrl crawlUrl)
{
    // Local copy guards against a race with unsubscription.
    var processingEvent = PageProcessing;
    if (processingEvent == null)
    {
        return;
    }

    processingEvent.Invoke(crawlUrl);
}
/// <summary>
/// Posts the crawl URL to the processing dataflow block, then returns how many
/// messages are currently waiting in the block's input queue.
/// </summary>
public int Post(CrawlUrl crawlUrl)
{
    ProcessingBlock.Post(crawlUrl);
    // NOTE(review): the count is read after the post, so it is a point-in-time
    // snapshot and may not include (or may exceed) the item just posted.
    return ProcessingBlock.InputCount;
}
/// <summary>
/// Returns true when the crawl URL is under this website's root URI
/// (ordinal prefix match on the absolute URI strings).
/// </summary>
public bool IsRelativeUrl(CrawlUrl crawlUrl)
{
    // Use ordinal comparison: the default StartsWith overload is culture-sensitive
    // (CA1310), which is wrong for machine-readable URI comparison.
    return (crawlUrl.Uri.AbsoluteUri.StartsWith(rootUri.AbsoluteUri, StringComparison.Ordinal));
    // return rootUri.IsBaseOf(crawlUrl.Uri);
}