/// <summary>
/// Console test run: pops links from the collector pool and fetches each one on its own
/// console row (WORK_AREA_TOP + collector sequence number), pushing any robots-allowed
/// outbound links back into the pool and saving forward/back link records.
/// </summary>
/// <param name="WORK_AREA_TOP">First console row of the per-collector status area.</param>
/// <param name="parallelCount">Number of collectors requested from CreateCollectorPool.</param>
/// <param name="repository">Persistence target for discovered link pairs.</param>
/// <param name="robots">robots.txt gatekeeper; only allowed links are stored.</param>
/// <param name="pool">Shared URL pool that both feeds and receives links.</param>
/// <param name="history">Visited-URL set; see review note below on what gets added.</param>
/// <param name="tldParser">TLD parser handed to each download via the properties bag.</param>
private static void TestRun1(int WORK_AREA_TOP, int parallelCount, Repository repository, RobotService robots, CollectorPool pool, VisitedUrls history, TldParser tldParser)
{
    CreateCollectorPool(pool, history, parallelCount).ForEach(
        c =>
        {
            // Synchronization was deliberately disabled here (cf. CollectorProcessor,
            // which uses the same ManualResetEvent-in-properties pattern and waits).
            //ManualResetEvent done = new ManualResetEvent(false);
            Dictionary<string, object> properties = new Dictionary<string, object>();
            properties.Add("TldParser", tldParser);
            Uri uri = c.Link.ToUri();
            WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (fetching)", uri.ToUriShort());
            WebDownloader web = new WebDownloader(uri, properties, ea =>
            {
                // Status line: when no exception, report the byte count of the response stream.
                WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} [{1}] ({2})", uri.ToUriShort(), ea.Status, (ea.Exception.IsNull() ? "responded: " + ea.Stream.Length + " bytes" : "exception: " + ea.Exception.Message));
                if (ea.Stream.IsNull())
                {
                    // No response body: pause this collector for 5 s before giving up on the URL.
                    Thread.Sleep(5000);
                }
                else
                {
                    HtmlProcessor processor = new HtmlProcessor(
                        uri.ToString(), ea.Stream, ((TldParser)ea.Properties["TldParser"]));
                    WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (found={1})", uri.ToUriShort(), processor.Links.Count);
                    int pushedLinks = 0;
                    int linkCounter = 1;
                    processor.Links.ForEach(l =>
                    {
                        // Progress line shows the current link's domain truncated to 10 chars.
                        WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (found={1}, prc={2} {3} ({4}))", uri.ToUriShort(), processor.Links.Count, (l.Domain.Length > 10 ? l.Domain.Substring(0, 10) : l.Domain), l.Tld, linkCounter);
                        if (robots.IsAllowed(string.Empty, l.Href.ToUri()))
                        {
                            ++pushedLinks;
                            pool.Store(l.Href);
                            // Save both directions: page -> link and link -> page.
                            repository.SaveLink(uri.ToString(), l.Href, string.Empty, l);
                            repository.SaveLink(l.Href, string.Empty, uri.ToString(), l);
                            // NOTE(review): adds the SOURCE page URL once per allowed link,
                            // not l.Href — confirm whether history.Add(l.Href) was intended.
                            history.Add(uri.ToString());
                        }
                        ++linkCounter;
                    });
                    WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (found={1}, added={2} links) [DONE]", uri.ToUriShort(), processor.Links.Count, pushedLinks);
                    // NOTE(review): not in a try/finally — an exception above leaks the stream.
                    ea.Stream.Close();
                }
                //((ManualResetEvent)ea.Properties["State"]).Set();
            });
            web.Download();
            //done.WaitOne();
        });
}
/// <summary>
/// Smoke test: downloads http://lixam.com and asserts that a response stream is returned.
/// </summary>
/// <remarks>
/// WebDownloader delivers its result through a callback; the original test never waited
/// for it, so if Download completes asynchronously the assertion could be skipped and the
/// test would pass vacuously. This version blocks on a ManualResetEvent — the same
/// pattern CollectorProcessor and DownloadRobots already use — so the assertion is
/// guaranteed to have run before the test method returns.
/// </remarks>
public void WebDownloaderer()
{
    Uri uri = new Uri("http://lixam.com");
    using (ManualResetEvent done = new ManualResetEvent(false))
    {
        WebDownloader web = new WebDownloader(uri, null, ea =>
        {
            try
            {
                Assert.IsNotNull(ea.Stream);
                ea.Stream.Close();
            }
            finally
            {
                // Always signal, even when the assertion throws, so the test
                // cannot hang on WaitOne below.
                done.Set();
            }
        });
        web.Download();
        done.WaitOne();
    }
}
/// <summary>
/// Collector worker loop (thread-pool entry point; <paramref name="s"/> is a TaskState).
/// Repeatedly asks the pool manager for a link, skips pages that load too slowly
/// (HtmlPageLoadCheck &gt;= 4 s), downloads the page, and — only when the page has at
/// least one external link — saves the page plus forward/back link records for every
/// link found, feeding new hrefs back into the pool. Stops when the pool returns an
/// empty link or cancellation is requested.
/// </summary>
/// <param name="s">Boxed TaskState carrying pool, repository, handlers and cancel token.</param>
static void CollectorProcessor(object s)
{
    TaskState state = (TaskState)s;
    CollectorProcessorEventArgs progress = new CollectorProcessorEventArgs();
    progress.CollectorID = state.CollectorID;
    string link = state.PoolManagerHandler(state.Pool, state.VisitedUrls, state.VisitedDomains);
    while (!link.IsNullOrEmpty() && !state.CancelToken.IsCancellationRequested)
    {
        // The event travels to the download callback via the properties bag ("State")
        // so this loop can block until the callback has finished.
        // NOTE(review): the event is never disposed — consider a using block.
        ManualResetEvent done = new ManualResetEvent(false);
        Dictionary<string, object> properties = new Dictionary<string, object>();
        properties.Add("TldParser", state.TldParser);
        properties.Add("State", done);
        Uri uri = link.ToUri();
        bool isPageLoadAllowed = false;
        bool isAllowedCrawl = true; //to be changed
        //targetTime set to 4 seconds
        TimeSpan targetTime = new TimeSpan(0, 0, 0, 4);
        TimeSpan checkTime = HtmlPageLoadCheck(uri.ToString());
        //if checkTime is less than 4 seconds set isPageLoadAllowed to True
        if (TimeSpan.Compare(checkTime, targetTime) == -1)
        {
            isPageLoadAllowed = true;
        }
        //TODO
        //isAllowedCrawl = state.MongoPersistence.isToCrawl(uri.ToString());
        if (uri != null && isPageLoadAllowed == true && isAllowedCrawl == true)
        {
            // Link record for the page itself; its Status is filled in by the callback.
            HtmlProcessor.LinkInfo currentUrlLinkInfo = new HtmlProcessor.LinkInfo((state.TldParser)) { Href = uri.ToString() };
            progress.Message = string.Format("{0} (fetching)", uri);
            state.ProgressHandler(progress);
            WebDownloader web = new WebDownloader(uri, properties, ea =>
            {
                progress.Message = string.Format(
                    "{0} [{1}] ({2})", uri.ToUriShort(), ea.Status, (ea.Exception.IsNull() ?
                    "responded: " + ea.Stream.Length + " bytes" : "exception: " + ea.Exception.Message));
                state.ProgressHandler(progress);
                currentUrlLinkInfo.Status = (int)ea.Status;
                //Thread.Sleep(2000);
                if (ea.Stream.IsNull())
                {
                    // No body: still persist the page record (with its failure status).
                    state.Repository.SaveLink(uri.ToString(), string.Empty, string.Empty, currentUrlLinkInfo);
                    //Thread.Sleep(5000);
                }
                else
                {
                    HtmlProcessor processor = new HtmlProcessor(
                        uri.ToString(), ea.Stream, ((TldParser)ea.Properties["TldParser"]));
                    progress.Message = string.Format(
                        "{0} (found={1})", uri, processor.Links.Count);
                    state.ProgressHandler(progress);
                    //Thread.Sleep(2000);
                    // Check if there is an external link.
                    // Scans at most ~400 same-domain links before giving up.
                    bool hasExternalLink = false;
                    int countPage = 0;
                    foreach (var l in processor.Links)
                    {
                        if (l.Domain != currentUrlLinkInfo.Domain)
                        {
                            hasExternalLink = true;
                            break;
                        }
                        countPage++;
                        if (countPage > 400 && hasExternalLink == false)
                        {
                            break;
                        }
                    }
                    // There is at least one external link
                    if (hasExternalLink == true)
                    {
                        // Save the current link
                        state.Repository.SaveLink(uri.ToString(), string.Empty, string.Empty, currentUrlLinkInfo);
                        state.VisitedUrls.Add(uri.ToString());
                        // NOTE(review): a full URL is added to VisitedDomains — confirm
                        // whether the domain alone was intended.
                        state.VisitedDomains.Add(uri.ToString());
                        // Save the kids
                        int pushedLinks = 0;
                        int linkCounter = 1;
                        processor.Links.ForEach(l =>
                        {
                            progress.Message = string.Format(
                                "{0} (processing={1} of {2})", uri, linkCounter, processor.Links.Count);
                            state.ProgressHandler(progress);
                            //if (state.Robots.IsAllowed(string.Empty, l.Href.ToUri()))
                            //{
                            ++pushedLinks;
                            state.VisitedUrls.Add(l.Href);
                            state.VisitedDomains.Add(l.Href);
                            state.Pool.Store(l.Href);
                            // NOTE(review): if LinkInfo is a reference type, these two
                            // assignments ALIAS (not copy) currentUrlLinkInfo and l, so the
                            // mutations below clobber the shared page record on every
                            // iteration and rewrite the processor's own link object —
                            // confirm whether a copy/clone was intended here.
                            HtmlProcessor.LinkInfo childLinkInfo = currentUrlLinkInfo;
                            childLinkInfo.AnchorText = l.AnchorText;
                            childLinkInfo.AnchorRel = l.AnchorRel;
                            childLinkInfo.AnchorKind = l.AnchorKind;
                            HtmlProcessor.LinkInfo backLinkInfo = l;
                            backLinkInfo.AnchorText = string.Empty;
                            backLinkInfo.AnchorRel = string.Empty;
                            backLinkInfo.AnchorKind = string.Empty;
                            // Forward record (page -> link) and back record (link -> page).
                            state.Repository.SaveLink(uri.ToString(), l.Href, string.Empty, childLinkInfo);
                            state.Repository.SaveLink(l.Href, string.Empty,
                                uri.ToString(), backLinkInfo);
                            state.VisitedDomains.Remove(l.Href);
                            //}
                            ++linkCounter;
                        });
                    }
                    progress.Message = string.Format("{0} [DONE]", uri);
                    state.ProgressHandler(progress);
                    ea.Stream.Close();
                }
                // Unblock the waiting loop iteration below.
                ((ManualResetEvent)ea.Properties["State"]).Set();
            });
            web.Download();
            done.WaitOne();
            // Remove from the buffer so that collectors can crawl again the domain
            state.VisitedDomains.Remove(link);
        }
        // Fetch next link
        link = state.PoolManagerHandler(state.Pool, state.VisitedUrls, state.VisitedDomains);
    }
    //state.Countdown.Signal();
    progress.Message = "NO URL IN THE POOL";
    state.ProgressHandler(progress);
}
/// <summary>
/// Downloads and parses robots.txt for <paramref name="host"/> and returns the distinct
/// disallowed absolute URLs (upper-invariant) whose User-agent section applies to this
/// crawler. The result is also pushed into the robots cache.
/// </summary>
/// <param name="host">Host name without scheme, e.g. "example.com".</param>
/// <returns>Denied absolute URLs in upper-invariant form; empty when robots.txt is
/// missing, unreadable, or contains no applicable Disallow rules.</returns>
private List<string> DownloadRobots(string host)
{
    List<string> deniedUrls = new List<string>();
    // O(1) dedup membership; the original deniedUrls.Contains inside the parse loop
    // made the loop accidentally O(n^2). Semantics are identical (first-seen wins).
    HashSet<string> seenDenied = new HashSet<string>();
    Uri resolvedUri = new Uri(string.Format("http://{0}/", host));
    using (ManualResetEvent done = new ManualResetEvent(false))
    {
        try
        {
            WebDownloader web = new WebDownloader(
                string.Format("http://{0}/robots.txt", resolvedUri.Host).ToUri(),
                null,
                ea =>
                {
                    if (!ea.Stream.IsNull())
                    {
                        using (StreamReader sr = new StreamReader(ea.Stream))
                        {
                            // True while the current User-agent section applies to us.
                            bool rulesApply = false;
                            while (sr.Peek() >= 0)
                            {
                                string instructionLine = sr.ReadLine().ToUpperInvariant();
                                if (!instructionLine.IsNullOrEmpty())
                                {
                                    // NOTE(review): ri is built from the line BEFORE the
                                    // "#" comment is stripped below — confirm that
                                    // RobotInstruction strips comments itself.
                                    RobotInstruction ri = new RobotInstruction(instructionLine);
                                    int commentPosition = instructionLine.IndexOf("#");
                                    if (commentPosition > -1)
                                    {
                                        instructionLine = instructionLine.Substring(0, commentPosition);
                                    }
                                    if (instructionLine.Length > 0)
                                    {
                                        if (instructionLine.StartsWith("U"))
                                        {
                                            // "User-agent: <name>" (line is upper-cased, so
                                            // "U" matches USER-AGENT). "*" or our own agent
                                            // string turns the following rules on.
                                            int colonPosition = instructionLine.IndexOf(":");
                                            instructionLine = instructionLine.Substring(colonPosition + 1).Trim();
                                            rulesApply = instructionLine.StartsWith("*")
                                                || ri.UrlOrAgent.IndexOf(user_agent) >= 0;
                                        }
                                        else if (instructionLine.StartsWith("D"))
                                        {
                                            // "Disallow: /path" — only recorded while the
                                            // current section applies to us.
                                            if (rulesApply)
                                            {
                                                int colonPosition = instructionLine.IndexOf(":");
                                                instructionLine = instructionLine.Substring(colonPosition + 1).Trim();
                                                Uri possibleDenyUri;
                                                if (Uri.TryCreate(resolvedUri, instructionLine, out possibleDenyUri))
                                                {
                                                    string denied = possibleDenyUri.AbsoluteUri.ToUpperInvariant();
                                                    if (seenDenied.Add(denied))
                                                    {
                                                        deniedUrls.Add(denied);
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                            // (explicit sr.Close() removed: the using block disposes the reader)
                        }
                    }
                    done.Set();
                });
            web.Download();
            done.WaitOne();
        }
        catch
        {
            // Best-effort by design: any download/parse failure is treated as "no rules".
        }
        // NOTE(review): the update factory returns the EXISTING cached list, so a
        // re-download never refreshes the cache entry — confirm that is intended.
        cache.AddOrUpdate(host, deniedUrls, (s, l) => { return l; });
        return deniedUrls;
    }
}