Exemple #1
0
        /// <summary>
        /// Test run: for every collector returned by <c>CreateCollectorPool</c>, downloads the
        /// collector's link, extracts the links of the page and, for each link the robots service
        /// allows, stores it in the pool and saves the forward/back link pair in the repository.
        /// Progress is written to the console row <c>WORK_AREA_TOP + SeqNo</c> of the collector.
        /// </summary>
        /// <param name="WORK_AREA_TOP">Row offset added to each collector's <c>SeqNo</c> for console output.</param>
        /// <param name="parallelCount">Passed through to <c>CreateCollectorPool</c>.</param>
        /// <param name="repository">Receives the saved links.</param>
        /// <param name="robots">Asked whether each discovered link may be crawled.</param>
        /// <param name="pool">Receives every allowed link.</param>
        /// <param name="history">Receives the page URL each time one of its links is stored.</param>
        /// <param name="tldParser">Handed to the downloader properties and used to build the <c>HtmlProcessor</c>.</param>
        private static void TestRun1(int WORK_AREA_TOP, int parallelCount, Repository repository, RobotService robots, CollectorPool pool, VisitedUrls history, TldParser tldParser)
        {
            CreateCollectorPool(pool, history, parallelCount).ForEach(
                c =>
                {
                    Dictionary<string, object> properties = new Dictionary<string, object>
                    {
                        { "TldParser", tldParser }
                    };

                    Uri uri = c.Link.ToUri();
                    WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (fetching)", uri.ToUriShort());

                    WebDownloader web = new WebDownloader(uri, properties,
                        ea =>
                        {
                            // The stream can be null even when no exception was reported (the branch
                            // below handles that case), so never read Length without checking it.
                            string outcome;
                            if (!ea.Exception.IsNull())
                            {
                                outcome = "exception: " + ea.Exception.Message;
                            }
                            else if (!ea.Stream.IsNull())
                            {
                                outcome = "responded: " + ea.Stream.Length + " bytes";
                            }
                            else
                            {
                                outcome = "no response";
                            }

                            WriteXY(0, WORK_AREA_TOP + c.SeqNo,
                                "{0} [{1}] ({2})",
                                uri.ToUriShort(), ea.Status, outcome);

                            if (ea.Stream.IsNull())
                            {
                                // Nothing to parse; pause before this callback returns.
                                Thread.Sleep(5000);
                                return;
                            }

                            try
                            {
                                HtmlProcessor processor = new HtmlProcessor(
                                    uri.ToString(), ea.Stream,
                                    ((TldParser)ea.Properties["TldParser"]));

                                WriteXY(0, WORK_AREA_TOP + c.SeqNo,
                                    "{0} (found={1})", uri.ToUriShort(), processor.Links.Count);

                                int pushedLinks = 0;
                                int linkCounter = 1;
                                processor.Links.ForEach(l =>
                                {
                                    WriteXY(0, WORK_AREA_TOP + c.SeqNo,
                                        "{0} (found={1}, prc={2} {3} ({4}))",
                                        uri.ToUriShort(),
                                        processor.Links.Count,
                                        (l.Domain.Length > 10 ? l.Domain.Substring(0, 10) : l.Domain),
                                        l.Tld,
                                        linkCounter);

                                    if (robots.IsAllowed(string.Empty, l.Href.ToUri()))
                                    {
                                        ++pushedLinks;
                                        pool.Store(l.Href);

                                        // Forward link (page -> child) followed by back link (child <- page).
                                        repository.SaveLink(uri.ToString(), l.Href, string.Empty, l);
                                        repository.SaveLink(l.Href, string.Empty, uri.ToString(), l);
                                        history.Add(uri.ToString());
                                    }

                                    ++linkCounter;
                                });

                                WriteXY(0, WORK_AREA_TOP + c.SeqNo,
                                    "{0} (found={1}, added={2} links) [DONE]",
                                    uri.ToUriShort(), processor.Links.Count, pushedLinks);
                            }
                            finally
                            {
                                // Release the response stream even when parsing or saving throws.
                                ea.Stream.Close();
                            }
                        });

                    web.Download();
                });
        }
Exemple #2
0
        /// <summary>
        /// Downloads a fixed URL through <c>WebDownloader</c> and verifies that the completion
        /// callback is invoked with a non-null response stream.
        /// </summary>
        public void WebDownloaderer()
        {
            Uri uri = new Uri("http://lixam.com");

            // The result is recorded in the callback and asserted on the test thread, so the check
            // cannot be skipped if the callback runs after Download() returns.
            // NOTE(review): whether Download() is asynchronous is not visible here — confirm.
            // Deliberately not disposed: a callback arriving after the timeout must still be able
            // to call Set() without hitting a disposed handle.
            System.Threading.ManualResetEvent done = new System.Threading.ManualResetEvent(false);
            bool streamReceived = false;

            WebDownloader web = new WebDownloader(uri, null, ea =>
            {
                try
                {
                    streamReceived = ea.Stream != null;
                    if (streamReceived)
                    {
                        ea.Stream.Close();
                    }
                }
                finally
                {
                    done.Set();
                }
            });

            web.Download();

            Assert.IsTrue(done.WaitOne(TimeSpan.FromSeconds(30)), "The download callback was not invoked in time.");
            Assert.IsTrue(streamReceived, "The download callback received a null stream.");
        }
Exemple #3
0
        /// <summary>
        /// Collector worker loop. Repeatedly asks the pool manager for the next link and, while a
        /// link is returned and cancellation has not been requested, downloads it, extracts its
        /// links and persists them through the repository. Progress messages are published via
        /// <c>state.ProgressHandler</c>; a final "NO URL IN THE POOL" message is sent when the loop ends.
        /// </summary>
        /// <param name="s">The worker state; it is cast to <c>TaskState</c>.</param>
        static void CollectorProcessor(object s)
        {
            TaskState state = (TaskState)s;

            CollectorProcessorEventArgs progress = new CollectorProcessorEventArgs();
            progress.CollectorID = state.CollectorID;

            // A page is only crawled when its load check completes in less than 4 seconds.
            TimeSpan targetTime = new TimeSpan(0, 0, 0, 4);

            string link = state.PoolManagerHandler(state.Pool, state.VisitedUrls, state.VisitedDomains);

            while (!link.IsNullOrEmpty() && !state.CancelToken.IsCancellationRequested)
            {
                Uri uri = link.ToUri();
                bool isAllowedCrawl = true; //to be changed
                //TODO
                //isAllowedCrawl = state.MongoPersistence.isToCrawl(uri.ToString());

                // Test for null before dereferencing: the load check needs uri.ToString().
                bool isPageLoadAllowed = uri != null && HtmlPageLoadCheck(uri.ToString()) < targetTime;

                if (isPageLoadAllowed && isAllowedCrawl)
                {
                    HtmlProcessor.LinkInfo currentUrlLinkInfo = new HtmlProcessor.LinkInfo((state.TldParser))
                    {
                        Href = uri.ToString()
                    };

                    progress.Message = string.Format("{0} (fetching)", uri);
                    state.ProgressHandler(progress);

                    using (ManualResetEvent done = new ManualResetEvent(false))
                    {
                        Dictionary<string, object> properties = new Dictionary<string, object>();
                        properties.Add("TldParser", state.TldParser);
                        properties.Add("State", done);

                        WebDownloader web = new WebDownloader(uri, properties,
                            ea =>
                            {
                                try
                                {
                                    // The stream can be null even without an exception (handled
                                    // below), so never read Length without checking it.
                                    string outcome;
                                    if (!ea.Exception.IsNull())
                                    {
                                        outcome = "exception: " + ea.Exception.Message;
                                    }
                                    else if (!ea.Stream.IsNull())
                                    {
                                        outcome = "responded: " + ea.Stream.Length + " bytes";
                                    }
                                    else
                                    {
                                        outcome = "no response";
                                    }

                                    progress.Message = string.Format(
                                        "{0} [{1}] ({2})",
                                        uri.ToUriShort(), ea.Status, outcome);
                                    state.ProgressHandler(progress);

                                    currentUrlLinkInfo.Status = (int)ea.Status;

                                    if (ea.Stream.IsNull())
                                    {
                                        // No content: record the URL with its status only.
                                        state.Repository.SaveLink(uri.ToString(), string.Empty, string.Empty, currentUrlLinkInfo);
                                    }
                                    else
                                    {
                                        HtmlProcessor processor = new HtmlProcessor(
                                            uri.ToString(), ea.Stream,
                                            ((TldParser)ea.Properties["TldParser"]));

                                        progress.Message = string.Format(
                                            "{0} (found={1})", uri, processor.Links.Count);
                                        state.ProgressHandler(progress);

                                        // Check if there is an external link; give up after
                                        // inspecting more than 400 same-domain links.
                                        bool hasExternalLink = false;
                                        int countPage = 0;

                                        foreach (var l in processor.Links)
                                        {
                                            if (l.Domain != currentUrlLinkInfo.Domain)
                                            {
                                                hasExternalLink = true;
                                                break;
                                            }

                                            countPage++;
                                            if (countPage > 400)
                                            {
                                                break;
                                            }
                                        }

                                        // There is at least one external link
                                        if (hasExternalLink)
                                        {
                                            // Save the current link
                                            state.Repository.SaveLink(uri.ToString(), string.Empty, string.Empty, currentUrlLinkInfo);
                                            state.VisitedUrls.Add(uri.ToString());
                                            state.VisitedDomains.Add(uri.ToString());

                                            // Save the kids
                                            int pushedLinks = 0;
                                            int linkCounter = 1;
                                            processor.Links.ForEach(l =>
                                            {
                                                progress.Message = string.Format(
                                                    "{0} (processing={1} of {2})",
                                                    uri,
                                                    linkCounter,
                                                    processor.Links.Count);
                                                state.ProgressHandler(progress);

                                                //if (state.Robots.IsAllowed(string.Empty, l.Href.ToUri()))
                                                //{
                                                ++pushedLinks;
                                                state.VisitedUrls.Add(l.Href);
                                                state.VisitedDomains.Add(l.Href);
                                                state.Pool.Store(l.Href);

                                                // NOTE(review): if LinkInfo is a reference type these
                                                // assignments alias (and mutate) currentUrlLinkInfo and l
                                                // rather than copying them — confirm this is intended.
                                                HtmlProcessor.LinkInfo childLinkInfo = currentUrlLinkInfo;
                                                childLinkInfo.AnchorText = l.AnchorText;
                                                childLinkInfo.AnchorRel = l.AnchorRel;
                                                childLinkInfo.AnchorKind = l.AnchorKind;

                                                HtmlProcessor.LinkInfo backLinkInfo = l;
                                                backLinkInfo.AnchorText = string.Empty;
                                                backLinkInfo.AnchorRel = string.Empty;
                                                backLinkInfo.AnchorKind = string.Empty;

                                                // Forward link (page -> child) followed by back link (child <- page).
                                                state.Repository.SaveLink(uri.ToString(), l.Href, string.Empty, childLinkInfo);
                                                state.Repository.SaveLink(l.Href, string.Empty, uri.ToString(), backLinkInfo);
                                                state.VisitedDomains.Remove(l.Href);
                                                //}

                                                ++linkCounter;
                                            });
                                        }

                                        progress.Message = string.Format("{0} [DONE]", uri);
                                        state.ProgressHandler(progress);
                                    }
                                }
                                finally
                                {
                                    // Always release the stream and signal completion; otherwise an
                                    // exception above would leave the worker blocked in WaitOne().
                                    try
                                    {
                                        if (!ea.Stream.IsNull())
                                        {
                                            ea.Stream.Close();
                                        }
                                    }
                                    finally
                                    {
                                        ((ManualResetEvent)ea.Properties["State"]).Set();
                                    }
                                }
                            });

                        web.Download();
                        done.WaitOne();
                    }

                    // Remove from the buffer so that collectors can crawl again the domain
                    state.VisitedDomains.Remove(link);
                }

                // Fetch next link
                link = state.PoolManagerHandler(state.Pool, state.VisitedUrls, state.VisitedDomains);
            }

            //state.Countdown.Signal();
            progress.Message = "NO URL IN THE POOL";
            state.ProgressHandler(progress);
        }
Exemple #4
0
        /// <summary>
        /// Downloads <c>http://{host}/robots.txt</c>, collects the upper-cased absolute URLs of the
        /// <c>Disallow</c> rules that apply to this crawler (groups for <c>*</c> or containing
        /// <c>user_agent</c>), stores the result in <c>cache</c> under <paramref name="host"/> and returns it.
        /// </summary>
        /// <param name="host">Host name whose robots.txt is fetched.</param>
        /// <returns>
        /// The denied URLs; empty when the file is missing, contains no applicable rule, or the
        /// download failed (failures are swallowed on purpose: robots handling is best effort).
        /// </returns>
        private List<string> DownloadRobots(string host)
        {
            List<string> deniedUrls = new List<string>();
            Uri resolvedUri = new Uri(string.Format("http://{0}/", host));

            using (ManualResetEvent done = new ManualResetEvent(false))
            {
                try
                {
                    WebDownloader web = new WebDownloader(
                        string.Format("http://{0}/robots.txt", resolvedUri.Host).ToUri(),
                        null,
                        ea =>
                        {
                            try
                            {
                                if (!ea.Stream.IsNull())
                                {
                                    using (StreamReader sr = new StreamReader(ea.Stream))
                                    {
                                        bool rulesApply = false;

                                        // ReadLine() returning null is the reliable end-of-stream test;
                                        // Peek() may report -1 early on non-seekable streams.
                                        string rawLine;
                                        while ((rawLine = sr.ReadLine()) != null)
                                        {
                                            string instructionLine = rawLine.ToUpperInvariant();
                                            if (instructionLine.IsNullOrEmpty())
                                            {
                                                continue;
                                            }

                                            RobotInstruction ri = new RobotInstruction(instructionLine);

                                            // Strip trailing comments.
                                            int commentPosition = instructionLine.IndexOf('#');
                                            if (commentPosition > -1)
                                            {
                                                instructionLine = instructionLine.Substring(0, commentPosition);
                                            }

                                            if (instructionLine.Length == 0)
                                            {
                                                continue;
                                            }

                                            if (instructionLine.StartsWith("U", StringComparison.Ordinal))
                                            {
                                                // User-agent: *
                                                int colonPosition = instructionLine.IndexOf(':');
                                                instructionLine = instructionLine.Substring(colonPosition + 1).Trim();
                                                rulesApply = instructionLine.StartsWith("*", StringComparison.Ordinal)
                                                    || ri.UrlOrAgent.IndexOf(user_agent) >= 0;
                                            }
                                            else if (instructionLine.StartsWith("D", StringComparison.Ordinal) && rulesApply)
                                            {
                                                // Disallow: /
                                                // Disallow: /cgi-bin
                                                int colonPosition = instructionLine.IndexOf(':');
                                                string deniedPath = instructionLine.Substring(colonPosition + 1).Trim();

                                                // An empty Disallow value means "nothing is disallowed".
                                                // Resolving "" against the base would yield the site root
                                                // and wrongly deny the whole host.
                                                if (deniedPath.Length == 0)
                                                {
                                                    continue;
                                                }

                                                Uri possibleDenyUri;
                                                if (Uri.TryCreate(resolvedUri, deniedPath, out possibleDenyUri))
                                                {
                                                    string deniedUrl = possibleDenyUri.AbsoluteUri.ToUpperInvariant();
                                                    if (!deniedUrls.Contains(deniedUrl))
                                                    {
                                                        deniedUrls.Add(deniedUrl);
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                            finally
                            {
                                // Signal even when parsing throws; otherwise WaitOne() below never returns.
                                done.Set();
                            }
                        });

                    web.Download();
                    done.WaitOne();
                }
                catch
                {
                    // Best effort: a failed robots.txt download leaves the rules gathered so far.
                }

                cache.AddOrUpdate(host, deniedUrls, (s, l) => { return l;  });
                return deniedUrls;
            }
        }