Ejemplo n.º 1
0
 /// <summary>
 /// Exercises <c>CollectorPool</c>: reads two links from a freshly created pool,
 /// then stores one link and reads it back.
 /// </summary>
 public void CollectorStacker()
 {
     var pool = new CollectorPool();

     // A new pool is expected to hand out these two links first, in this order.
     string first = pool.Next();
     Assert.IsTrue(first == "google.com");

     string second = pool.Next();
     Assert.IsTrue(second == "facebook.com");

     // A link that was just stored is expected to be the next one returned.
     pool.Store("jubacs.somee.com");
     string stored = pool.Next();
     Assert.IsTrue(stored == "jubacs.somee.com");
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Requests <paramref name="parallelCount"/> collectors from <c>CreateCollectorPool</c> and, for each
        /// one, starts a <c>WebDownloader</c> on its link. The download callback parses the response with
        /// <c>HtmlProcessor</c>, stores every link that <paramref name="robots"/> allows in
        /// <paramref name="pool"/> and <paramref name="repository"/>, and reports progress on the console
        /// row <c>WORK_AREA_TOP + SeqNo</c>.
        /// </summary>
        /// <param name="WORK_AREA_TOP">Row offset added to each collector's <c>SeqNo</c> for status output.</param>
        /// <param name="parallelCount">Number of collectors to request.</param>
        /// <param name="repository">Receives two <c>SaveLink</c> calls for every accepted link.</param>
        /// <param name="robots">Asked via <c>IsAllowed</c> whether a found link may be followed.</param>
        /// <param name="pool">Source of the start links and destination of accepted links.</param>
        /// <param name="history">Visited-URL history; the page URL is added whenever a link is accepted.</param>
        /// <param name="tldParser">Passed through the downloader's property bag to <c>HtmlProcessor</c>.</param>
        private static void TestRun1(int WORK_AREA_TOP, int parallelCount, Repository repository, RobotService robots, CollectorPool pool, VisitedUrls history, TldParser tldParser)
        {
            CreateCollectorPool(pool, history, parallelCount).ForEach(
                c =>
                {
                    Dictionary<string, object> properties = new Dictionary<string, object>
                    {
                        { "TldParser", tldParser }
                    };

                    Uri uri = c.Link.ToUri();
                    WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (fetching)", uri.ToUriShort());

                    WebDownloader web = new WebDownloader(uri, properties,
                        ea =>
                        {
                            // Build the outcome text without touching a missing stream: the code below
                            // treats ea.Stream as possibly null, so the status line must as well.
                            string outcome;
                            if (!ea.Exception.IsNull())
                            {
                                outcome = "exception: " + ea.Exception.Message;
                            }
                            else if (ea.Stream.IsNull())
                            {
                                outcome = "responded: no content";
                            }
                            else
                            {
                                outcome = "responded: " + ea.Stream.Length + " bytes";
                            }

                            WriteXY(0, WORK_AREA_TOP + c.SeqNo,
                                "{0} [{1}] ({2})",
                                uri.ToUriShort(), ea.Status, outcome);

                            if (ea.Stream.IsNull())
                            {
                                // Nothing to parse; pause before this callback returns.
                                Thread.Sleep(5000);
                            }
                            else
                            {
                                try
                                {
                                    HtmlProcessor processor = new HtmlProcessor(
                                        uri.ToString(), ea.Stream,
                                        ((TldParser)ea.Properties["TldParser"]));

                                    WriteXY(0, WORK_AREA_TOP + c.SeqNo,
                                        "{0} (found={1})", uri.ToUriShort(), processor.Links.Count);

                                    int pushedLinks = 0;
                                    int linkCounter = 1;
                                    processor.Links.ForEach(l =>
                                    {
                                        // The domain is shortened to at most 10 characters for display.
                                        WriteXY(0, WORK_AREA_TOP + c.SeqNo,
                                            "{0} (found={1}, prc={2} {3} ({4}))",
                                            uri.ToUriShort(),
                                            processor.Links.Count,
                                            (l.Domain.Length > 10 ? l.Domain.Substring(0, 10) : l.Domain),
                                            l.Tld,
                                            linkCounter);

                                        if (robots.IsAllowed(string.Empty, l.Href.ToUri()))
                                        {
                                            ++pushedLinks;
                                            pool.Store(l.Href);

                                            repository.SaveLink(uri.ToString(), l.Href, string.Empty, l);
                                            repository.SaveLink(l.Href, string.Empty, uri.ToString(), l);

                                            // NOTE(review): this records the page URL (not l.Href) once per
                                            // accepted link - confirm that is the intent.
                                            history.Add(uri.ToString());
                                        }

                                        ++linkCounter;
                                    });

                                    WriteXY(0, WORK_AREA_TOP + c.SeqNo,
                                        "{0} (found={1}, added={2} links) [DONE]",
                                        uri.ToUriShort(), processor.Links.Count, pushedLinks);
                                }
                                finally
                                {
                                    // Release the response stream even when parsing or storing links throws.
                                    ea.Stream.Close();
                                }
                            }
                        });

                    // NOTE(review): whether Download() blocks until the callback has finished is not
                    // visible here; no explicit wait is performed.
                    web.Download();
                });
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Pulls links from <paramref name="pool"/> until one is found that is not in
        /// <paramref name="urlHistory"/>, not in <paramref name="domainHistory"/>, and not on an
        /// excluded domain. The accepted link is added to both histories before it is returned.
        /// </summary>
        /// <param name="pool">Pool the candidate links are taken from.</param>
        /// <param name="urlHistory">URLs already visited within this session.</param>
        /// <param name="domainHistory">Domain history, queried and updated with the full URL string.</param>
        /// <returns>
        /// The accepted URL, or <c>string.Empty</c> as soon as a pool entry does not convert to a URI
        /// (treated as "no more links in the pool").
        /// </returns>
        static string PoolManager(
            CollectorPool pool,
            VisitedUrls urlHistory,
            VisitedDomains domainHistory)
        {
            while (true)
            {
                Uri candidate = pool.Next().ToUri();

                if (candidate.IsNull())
                {
                    return string.Empty; // No more links in the pool
                }

                string url = candidate.ToString();

                // Skip anything visited within this session or belonging to an excluded domain.
                bool alreadySeen = urlHistory.ContainsUrl(url) || domainHistory.ContainsDomain(url);
                if (alreadySeen || IsExcludedDomain(pool, candidate))
                {
                    continue;
                }

                // Link is ok, add to history
                urlHistory.Add(url);
                domainHistory.Add(url);

                return url;
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Tells whether the domain of <paramref name="uri"/> contains any entry of
        /// <c>pool.ExcludedDomains</c> as a substring.
        /// </summary>
        /// <param name="pool">Pool whose <c>ExcludedDomains</c> list is consulted.</param>
        /// <param name="uri">The URI to check.</param>
        /// <returns>
        /// <c>true</c> when a non-empty exclusion entry occurs in the domain (ordinal, case-insensitive);
        /// otherwise <c>false</c>, including when no domain is available.
        /// </returns>
        static bool IsExcludedDomain(CollectorPool pool, Uri uri)
        {
            // A LinkInfo is used to obtain the domain for the URL; presumably Domain is derived
            // from Href - the parsing itself is not visible here.
            var li = new HtmlProcessor.LinkInfo(TldParser.Instance);
            li.Href = uri.ToString();

            string domain = li.Domain;
            if (string.IsNullOrEmpty(domain))
            {
                return false;
            }

            foreach (var excludedDomain in pool.ExcludedDomains)
            {
                // An empty entry would match every domain and a null entry would throw; ignore both.
                if (string.IsNullOrEmpty(excludedDomain))
                {
                    continue;
                }

                // Host names are not linguistic text: compare ordinally and ignore case.
                if (domain.IndexOf(excludedDomain, StringComparison.OrdinalIgnoreCase) > -1)
                {
                    return true;
                }
            }

            return false;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Builds up to <paramref name="parallelCount"/> collectors from links taken out of
        /// <paramref name="pool"/>, skipping links whose URL is already in <paramref name="history"/>.
        /// Every accepted URL is added to <paramref name="history"/>.
        /// </summary>
        /// <param name="pool">Pool the links are taken from.</param>
        /// <param name="history">Visited-URL history used for de-duplication; updated for each accepted link.</param>
        /// <param name="parallelCount">Maximum number of collectors to create.</param>
        /// <returns>
        /// The collectors, numbered consecutively from 0 through <c>SeqNo</c>. The list is shorter than
        /// <paramref name="parallelCount"/> when the pool hands out a link that cannot be converted to a URI,
        /// which is treated as "pool exhausted".
        /// </returns>
        static List<Collector> CreateCollectorPool(
            CollectorPool pool,
            VisitedUrls history,
            int parallelCount)
        {
            List<Collector> collectors = new List<Collector>(parallelCount);

            while (collectors.Count < parallelCount)
            {
                string link = pool.Next();
                var uri = link.ToUri();

                // Stop instead of dereferencing a missing URI: without this check an exhausted
                // pool ends in a null reference on uri.ToString().
                if (uri == null)
                {
                    break;
                }

                string url = uri.ToString();
                if (history.ContainsUrl(url))
                {
                    continue;
                }

                collectors.Add(new Collector
                {
                    // The sequence number is the collector's position in the list.
                    SeqNo = collectors.Count,
                    Link = link
                });

                history.Add(url);
            }

            return collectors;
        }