public void HtmlProcessor_LinkInfo_Check()
{
    // Verifies that LinkInfo normalizes bare host names (adds a default
    // scheme and "www." prefix) and extracts the effective TLD via TldParser.
    TldParser parser = new TldParser();
    HtmlProcessor.LinkInfo li = new HtmlProcessor.LinkInfo(parser);

    // A scheme-less href gets the default "http" scheme and "www." prefix.
    li.Href = "google.com";
    Assert.AreEqual("www.google.com", li.Domain);
    Assert.AreEqual("http", li.DomainScheme);
    Assert.AreEqual("com", li.Tld);

    // A multi-part public suffix ("net.ph") must be recognized as one TLD.
    li.Href = "jubacs.somee.net.ph";
    Assert.AreEqual("www.jubacs.somee.net.ph", li.Domain);
    Assert.AreEqual("http", li.DomainScheme);
    Assert.AreEqual("net.ph", li.Tld);

    // An href that already carries a scheme and "www." is kept as-is.
    li.Href = "http://www.jubacs.somee.com";
    Assert.AreEqual("www.jubacs.somee.com", li.Domain);
}
// Builds the collector's seed queue: loads the persisted Top-1M counter,
// reads the excluded-domain list from configuration, and enqueues only
// those seed links whose domain is not excluded.
public CollectorPool(IEnumerable<string> pool)
{
    currentTop1MCounter = Convert.ToInt32(Properties.Settings.Default.Top1MCounter);

    // "ExcludedDomains" is a comma-separated app setting; trim each entry
    // and drop empties so stray whitespace or commas are harmless.
    ExcludedDomains = ConfigurationManager.AppSettings["ExcludedDomains"]
        .Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries)
        .Select(excludedDomain => excludedDomain.Trim())
        .ToArray();

    if (pool.IsNull())
    {
        this.pool = new ConcurrentQueue<string>();
        return;
    }

    // Keep only links whose resolved domain contains none of the excluded
    // domains. Ordinal comparison: domain matching is not linguistic text,
    // so the culture-sensitive IndexOf(string) default is wrong here (CA1307).
    IEnumerable<string> filtered = pool.Where(s =>
    {
        var li = new HtmlProcessor.LinkInfo(TldParser.Instance) { Href = s };
        return !ExcludedDomains.Any(
            excludedDomain => li.Domain.IndexOf(excludedDomain, StringComparison.Ordinal) >= 0);
    });
    this.pool = new ConcurrentQueue<string>(filtered);
}
// Worker loop for one collector task: repeatedly pulls a link from the
// shared pool, pre-checks page-load time, downloads the page, extracts its
// links, persists parent/child/back-link records, and pushes new candidates
// into the pool — until the pool runs dry or cancellation is requested.
static void CollectorProcessor(object s)
{
    TaskState state = (TaskState)s;
    CollectorProcessorEventArgs progress = new CollectorProcessorEventArgs();
    progress.CollectorID = state.CollectorID;

    // First candidate URL; the pool manager also consults visited URLs/domains.
    string link = state.PoolManagerHandler(state.Pool, state.VisitedUrls, state.VisitedDomains);
    while (!link.IsNullOrEmpty() && !state.CancelToken.IsCancellationRequested)
    {
        // Signaled from the download callback so this loop processes one
        // page at a time even though WebDownloader reports asynchronously.
        ManualResetEvent done = new ManualResetEvent(false);
        Dictionary<string, object> properties = new Dictionary<string, object>();
        properties.Add("TldParser", state.TldParser);
        properties.Add("State", done);
        Uri uri = link.ToUri();
        bool isPageLoadAllowed = false;
        bool isAllowedCrawl = true; //to be changed

        //targetTime set to 4 seconds
        TimeSpan targetTime = new TimeSpan(0, 0, 0, 4);
        TimeSpan checkTime = HtmlPageLoadCheck(uri.ToString());
        //if checkTime is less than 4 seconds set isPageLoadAllowed to True
        if (TimeSpan.Compare(checkTime, targetTime) == -1)
        {
            isPageLoadAllowed = true;
        }

        //TODO
        //isAllowedCrawl = state.MongoPersistence.isToCrawl(uri.ToString());

        if (uri != null && isPageLoadAllowed == true && isAllowedCrawl == true)
        {
            HtmlProcessor.LinkInfo currentUrlLinkInfo =
                new HtmlProcessor.LinkInfo((state.TldParser)) { Href = uri.ToString() };
            progress.Message = string.Format("{0} (fetching)", uri);
            state.ProgressHandler(progress);

            // Callback runs when the download completes (or fails).
            WebDownloader web = new WebDownloader(uri, properties, ea =>
            {
                progress.Message = string.Format(
                    "{0} [{1}] ({2})",
                    uri.ToUriShort(),
                    ea.Status,
                    (ea.Exception.IsNull() ?
                        "responded: " + ea.Stream.Length + " bytes" :
                        "exception: " + ea.Exception.Message));
                state.ProgressHandler(progress);
                currentUrlLinkInfo.Status = (int)ea.Status;
                //Thread.Sleep(2000);
                if (ea.Stream.IsNull())
                {
                    // No response body: still record the attempt (with its status).
                    state.Repository.SaveLink(uri.ToString(), string.Empty, string.Empty, currentUrlLinkInfo);
                    //Thread.Sleep(5000);
                }
                else
                {
                    HtmlProcessor processor = new HtmlProcessor(
                        uri.ToString(), ea.Stream, ((TldParser)ea.Properties["TldParser"]));
                    progress.Message = string.Format(
                        "{0} (found={1})", uri, processor.Links.Count);
                    state.ProgressHandler(progress);
                    //Thread.Sleep(2000);

                    // Check if there is an external link; scan stops after 400
                    // same-domain links so huge pages don't stall the collector.
                    bool hasExternalLink = false;
                    int countPage = 0;
                    foreach (var l in processor.Links)
                    {
                        if (l.Domain != currentUrlLinkInfo.Domain)
                        {
                            hasExternalLink = true;
                            break;
                        }
                        countPage++;
                        if (countPage > 400 && hasExternalLink == false)
                        {
                            break;
                        }
                    }

                    // There is at least one external link
                    if (hasExternalLink == true)
                    {
                        // Save the current link
                        state.Repository.SaveLink(uri.ToString(), string.Empty, string.Empty, currentUrlLinkInfo);
                        state.VisitedUrls.Add(uri.ToString());
                        state.VisitedDomains.Add(uri.ToString());

                        // Save the kids
                        int pushedLinks = 0;
                        int linkCounter = 1;
                        processor.Links.ForEach(l =>
                        {
                            progress.Message = string.Format(
                                "{0} (processing={1} of {2})",
                                uri, linkCounter, processor.Links.Count);
                            state.ProgressHandler(progress);
                            //if (state.Robots.IsAllowed(string.Empty, l.Href.ToUri()))
                            //{
                            ++pushedLinks;
                            state.VisitedUrls.Add(l.Href);
                            state.VisitedDomains.Add(l.Href);
                            state.Pool.Store(l.Href);
                            // NOTE(review): if LinkInfo is a reference type, these
                            // assignments alias currentUrlLinkInfo / l instead of
                            // copying them, so the Anchor* mutations below would
                            // also mutate the originals — confirm LinkInfo is a struct.
                            HtmlProcessor.LinkInfo childLinkInfo = currentUrlLinkInfo;
                            childLinkInfo.AnchorText = l.AnchorText;
                            childLinkInfo.AnchorRel = l.AnchorRel;
                            childLinkInfo.AnchorKind = l.AnchorKind;
                            HtmlProcessor.LinkInfo backLinkInfo = l;
                            backLinkInfo.AnchorText = string.Empty;
                            backLinkInfo.AnchorRel = string.Empty;
                            backLinkInfo.AnchorKind = string.Empty;
                            // Forward record (parent -> child) and back-link record
                            // (child with its referring parent).
                            state.Repository.SaveLink(uri.ToString(), l.Href, string.Empty, childLinkInfo);
                            state.Repository.SaveLink(l.Href, string.Empty, uri.ToString(), backLinkInfo);
                            state.VisitedDomains.Remove(l.Href);
                            //}
                            ++linkCounter;
                        });
                    }
                    progress.Message = string.Format("{0} [DONE]", uri);
                    state.ProgressHandler(progress);
                    ea.Stream.Close();
                }
                // Unblock the outer loop waiting on this download.
                ((ManualResetEvent)ea.Properties["State"]).Set();
            });
            web.Download();
            done.WaitOne();
            // Remove from the buffer so that collectors can crawl again the domain
            state.VisitedDomains.Remove(link);
        }
        // Fetch next link
        link = state.PoolManagerHandler(state.Pool, state.VisitedUrls, state.VisitedDomains);
    }
    //state.Countdown.Signal();
    progress.Message = "NO URL IN THE POOL";
    state.ProgressHandler(progress);
}
// Returns true when the URI's resolved domain contains any of the pool's
// excluded domains (substring match), false otherwise.
static bool IsExcludedDomain(CollectorPool pool, Uri uri)
{
    var li = new HtmlProcessor.LinkInfo(TldParser.Instance) { Href = uri.ToString() };
    // Ordinal search: domain matching is not linguistic text, so the
    // culture-sensitive IndexOf(string) default is inappropriate (CA1307).
    return pool.ExcludedDomains.Any(
        excludedDomain => li.Domain.IndexOf(excludedDomain, StringComparison.Ordinal) >= 0);
}
// Resolves the canonical domain (e.g. "www.example.com") for the given URL
// using the shared TLD parser instance.
static string GetDomain(string url)
{
    var info = new HtmlProcessor.LinkInfo(TldParser.Instance) { Href = url };
    return info.Domain;
}