/// <summary>
/// Verifies CollectorPool ordering: the pool serves its two seed links
/// ("google.com", "facebook.com") first, then links stored afterwards.
/// </summary>
public void CollectorStacker()
{
    CollectorPool stack = new CollectorPool();

    // Assert.AreEqual reports expected vs. actual on failure, unlike
    // Assert.IsTrue(x == y) which only reports "condition was false".
    Assert.AreEqual("google.com", stack.Next());
    Assert.AreEqual("facebook.com", stack.Next());

    stack.Store("jubacs.somee.com");
    Assert.AreEqual("jubacs.somee.com", stack.Next());
}
/// <summary>
/// Runs one crawl pass: takes <paramref name="parallelCount"/> unvisited links from the
/// pool, downloads each page, extracts its links, and feeds allowed links back into the
/// pool and repository. Per-collector progress is painted at a fixed console row
/// (WORK_AREA_TOP + SeqNo) via WriteXY.
/// </summary>
/// <param name="WORK_AREA_TOP">Console row where the first collector's status line is drawn.</param>
/// <param name="parallelCount">Number of collectors (links) to process in this pass.</param>
/// <param name="repository">Persistent store for discovered link relations.</param>
/// <param name="robots">robots.txt service used to filter disallowed links.</param>
/// <param name="pool">Source and sink of crawlable links.</param>
/// <param name="history">Session-wide set of already-visited URLs.</param>
/// <param name="tldParser">TLD parser handed to the downloader via the properties bag.</param>
private static void TestRun1(int WORK_AREA_TOP, int parallelCount, Repository repository, RobotService robots, CollectorPool pool, VisitedUrls history, TldParser tldParser)
{
    CreateCollectorPool(pool, history, parallelCount).ForEach(c =>
    {
        // Leftover scaffolding for a blocking (one-download-at-a-time) mode;
        // kept commented out together with the WaitOne/Set lines below.
        //ManualResetEvent done = new ManualResetEvent(false);

        // Properties bag travels with the request and comes back on the callback
        // as ea.Properties (used below to recover the TldParser).
        Dictionary<string, object> properties = new Dictionary<string, object>();
        properties.Add("TldParser", tldParser);
        Uri uri = c.Link.ToUri();
        WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (fetching)", uri.ToUriShort());

        // Callback fires when the download completes (or fails).
        WebDownloader web = new WebDownloader(uri, properties, ea =>
        {
            // NOTE(review): the "responded" branch reads ea.Stream.Length while only
            // checking ea.Exception for null — presumably Stream is non-null whenever
            // Exception is null; confirm against WebDownloader's contract.
            WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} [{1}] ({2})", uri.ToUriShort(), ea.Status, (ea.Exception.IsNull() ? "responded: " + ea.Stream.Length + " bytes" : "exception: " + ea.Exception.Message));
            if (ea.Stream.IsNull())
            {
                // No response body: back off briefly (5 s) before this slot goes idle.
                Thread.Sleep(5000);
            }
            else
            {
                // Parse the fetched HTML and walk every extracted link.
                HtmlProcessor processor = new HtmlProcessor(uri.ToString(), ea.Stream, ((TldParser)ea.Properties["TldParser"]));
                WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (found={1})", uri.ToUriShort(), processor.Links.Count);
                int pushedLinks = 0;   // links accepted by robots.txt and queued
                int linkCounter = 1;   // 1-based progress counter for the status line
                processor.Links.ForEach(l =>
                {
                    // Domain is truncated to 10 chars purely for display width.
                    WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (found={1}, prc={2} {3} ({4}))", uri.ToUriShort(), processor.Links.Count, (l.Domain.Length > 10 ? l.Domain.Substring(0, 10) : l.Domain), l.Tld, linkCounter);
                    if (robots.IsAllowed(string.Empty, l.Href.ToUri()))
                    {
                        ++pushedLinks;
                        pool.Store(l.Href);
                        // Link relation is saved in both directions (source→target and target→source).
                        repository.SaveLink(uri.ToString(), l.Href, string.Empty, l);
                        repository.SaveLink(l.Href, string.Empty, uri.ToString(), l);
                        // NOTE(review): this re-adds the *source* page URL on every accepted
                        // link — looks like it may have been meant to add l.Href instead; verify.
                        history.Add(uri.ToString());
                    }
                    ++linkCounter;
                });
                WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (found={1}, added={2} links) [DONE]", uri.ToUriShort(), processor.Links.Count, pushedLinks);
                ea.Stream.Close();
            }
            //((ManualResetEvent)ea.Properties["State"]).Set();
        });
        web.Download();
        //done.WaitOne();
    });
}
/// <summary>
/// Pops links off the pool until one is found that has not been visited this
/// session (by URL or by domain) and is not on the pool's exclusion list.
/// The accepted link is recorded in both histories before being returned.
/// </summary>
/// <param name="pool">Link pool to draw candidates from.</param>
/// <param name="urlHistory">URLs already visited this session.</param>
/// <param name="domainHistory">Domains already visited this session.</param>
/// <returns>The accepted absolute URL, or <see cref="string.Empty"/> when the pool is exhausted.</returns>
static string PoolManager(
    CollectorPool pool,
    VisitedUrls urlHistory,
    VisitedDomains domainHistory)
{
    Uri uri;
    string url;

    // Keep drawing until we hit an unseen, non-excluded link or run dry.
    // (A do/while removes the original's duplicated fetch-then-check code;
    // the dead recursive variant that used to sit here is deleted.)
    do
    {
        uri = pool.Next().ToUri();
        if (uri.IsNull())
        {
            return string.Empty; // No more links in the pool
        }

        url = uri.ToString();
    }
    while (urlHistory.ContainsUrl(url)
        || domainHistory.ContainsDomain(url)
        || IsExcludedDomain(pool, uri));

    // Link is ok — record it so it is not handed out again this session.
    urlHistory.Add(url);
    domainHistory.Add(url);
    return url;
}
/// <summary>
/// Determines whether the domain of <paramref name="uri"/> contains any entry
/// from the pool's excluded-domain list (case-sensitive substring match).
/// </summary>
/// <param name="pool">Pool whose <c>ExcludedDomains</c> list is consulted.</param>
/// <param name="uri">URI whose domain is tested.</param>
/// <returns><c>true</c> when the domain matches an exclusion entry; otherwise <c>false</c>.</returns>
static bool IsExcludedDomain(CollectorPool pool, Uri uri)
{
    // LinkInfo is used only to derive the registrable domain from the href.
    var li = new HtmlProcessor.LinkInfo(TldParser.Instance);
    li.Href = uri.ToString();

    foreach (var excludedDomain in pool.ExcludedDomains)
    {
        // Ordinal comparison: domain matching is byte-wise, not linguistic, and the
        // parameterless IndexOf(string) overload is culture-sensitive (CA1307) —
        // e.g. it misbehaves under the Turkish 'i' and ignorable characters.
        if (li.Domain.IndexOf(excludedDomain, StringComparison.Ordinal) >= 0)
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Draws up to <paramref name="parallelCount"/> previously unvisited links from the
/// pool and wraps each in a <see cref="Collector"/> with a sequential slot number.
/// Each accepted link is immediately added to <paramref name="history"/>.
/// </summary>
/// <param name="pool">Link pool to draw from.</param>
/// <param name="history">Session-wide set of already-visited URLs.</param>
/// <param name="parallelCount">Maximum number of collectors to create.</param>
/// <returns>
/// The collectors created — fewer than <paramref name="parallelCount"/> if the
/// pool runs out of links first.
/// </returns>
static List<Collector> CreateCollectorPool(
    CollectorPool pool,
    VisitedUrls history,
    int parallelCount)
{
    List<Collector> collectors = new List<Collector>(parallelCount);
    int i = 0;
    while (i < parallelCount)
    {
        string link = pool.Next();
        Uri uri = link.ToUri();

        // Guard: the original looped forever (or threw on a null Uri) once the
        // pool was exhausted of unvisited links; stop and return what we have.
        if (uri.IsNull())
        {
            break;
        }

        string url = uri.ToString();
        if (!history.ContainsUrl(url))
        {
            collectors.Add(new Collector { SeqNo = i, Link = link });
            history.Add(url);
            ++i;
        }
    }
    return collectors;
}