Exemplo n.º 1
0
        // Checks the Domain, DomainScheme and Tld values that HtmlProcessor.LinkInfo reports
        // after Href is assigned, for a bare host, a multi-part TLD host and a full URL.
        public void HtmlProcessor_LinkInfo_Check()
        {
            TldParser parser = new TldParser();
            HtmlProcessor.LinkInfo li = new HtmlProcessor.LinkInfo(parser);

            // Assert.AreEqual(expected, actual) is used instead of Assert.IsTrue(a == b) so that a
            // failing run reports the value that was actually produced.

            // Bare host without scheme or "www." prefix.
            li.Href = "google.com";
            Assert.AreEqual("www.google.com", li.Domain);
            //Assert.AreEqual("www.google.com", li.DomainOrSubdomain);
            Assert.AreEqual("http", li.DomainScheme);
            Assert.AreEqual("com", li.Tld);

            // Host with a sub-domain and a two-label TLD.
            li.Href = "jubacs.somee.net.ph";
            Assert.AreEqual("www.jubacs.somee.net.ph", li.Domain);
            //Assert.AreEqual("www.jubacs.somee.net.ph", li.DomainOrSubdomain);
            Assert.AreEqual("http", li.DomainScheme);
            Assert.AreEqual("net.ph", li.Tld);

            // Fully qualified URL that already carries scheme and "www.".
            li.Href = "http://www.jubacs.somee.com";
            Assert.AreEqual("www.jubacs.somee.com", li.Domain);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Initializes the pool: reads the Top1M counter from the settings, loads the excluded
        /// domains from the "ExcludedDomains" app setting (comma separated) and enqueues every
        /// URL from <paramref name="pool"/> whose domain does not contain an excluded domain.
        /// </summary>
        /// <param name="pool">Initial URLs; when null the queue starts empty.</param>
        public CollectorPool(IEnumerable<string> pool)
        {
            currentTop1MCounter = Convert.ToInt32(Properties.Settings.Default.Top1MCounter);

            // AppSettings returns null for a missing key; treat that as "nothing excluded"
            // instead of failing on Split.
            string excludedSetting = ConfigurationManager.AppSettings["ExcludedDomains"] ?? string.Empty;

            string[] excluded = excludedSetting
                .Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries)
                .Select(entry => entry.Trim())
                // A whitespace-only entry (e.g. after a trailing ", ") trims to "", and
                // IndexOf("") is 0 for every string, which would exclude every URL.
                .Where(entry => entry.Length > 0)
                .ToArray();

            ExcludedDomains = excluded;

            if (pool.IsNull())
            {
                this.pool = new ConcurrentQueue<string>();
                return;
            }

            IEnumerable<string> allowedUrls = pool.Where(
                url =>
                {
                    HtmlProcessor.LinkInfo li = new HtmlProcessor.LinkInfo(TldParser.Instance);
                    li.Href = url;

                    // Ordinal matching: host names are not linguistic text, and the
                    // culture-sensitive overload can ignore or mis-match characters.
                    return !excluded.Any(
                        excludedDomain => li.Domain.IndexOf(excludedDomain, StringComparison.Ordinal) > -1);
                });

            // The queue constructor enumerates the filtered sequence once, right here.
            this.pool = new ConcurrentQueue<string>(allowedUrls);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Collector worker loop. Repeatedly asks the pool manager for a link, downloads it,
        /// saves the page and the links found on it through the repository, and reports progress,
        /// until the pool manager returns no link or cancellation is requested.
        /// </summary>
        /// <param name="s">The <c>TaskState</c> of this collector, passed as <see cref="object"/>.</param>
        static void CollectorProcessor(object s)
        {
            // Pages whose load check takes this long or longer are skipped (4 seconds).
            TimeSpan targetTime = new TimeSpan(0, 0, 0, 4);
            // The search for an external link gives up once more than this many
            // same-domain links have been inspected.
            const int sameDomainLinkLimit = 400;

            TaskState state = (TaskState)s;

            CollectorProcessorEventArgs progress = new CollectorProcessorEventArgs();
            progress.CollectorID = state.CollectorID;

            string link = state.PoolManagerHandler(state.Pool, state.VisitedUrls, state.VisitedDomains);

            while (!link.IsNullOrEmpty() && !state.CancelToken.IsCancellationRequested)
            {
                Uri uri = link.ToUri();
                bool isAllowedCrawl = true; //to be changed
                //TODO
                //isAllowedCrawl = state.MongoPersistence.isToCrawl(uri.ToString());

                // The null test must come before the first use of uri; a link that cannot be
                // turned into a Uri is skipped instead of failing on uri.ToString().
                bool isPageLoadAllowed = uri != null && HtmlPageLoadCheck(uri.ToString()) < targetTime;

                if (isPageLoadAllowed && isAllowedCrawl)
                {
                    // NOTE(review): the event is not disposed here because it is unknown whether
                    // WebDownloader may invoke the callback again after the wait below returns.
                    ManualResetEvent done = new ManualResetEvent(false);
                    Dictionary<string, object> properties = new Dictionary<string, object>();
                    properties.Add("TldParser", state.TldParser);
                    properties.Add("State", done);

                    HtmlProcessor.LinkInfo currentUrlLinkInfo = new HtmlProcessor.LinkInfo((state.TldParser))
                    {
                        Href = uri.ToString()
                    };

                    progress.Message = string.Format("{0} (fetching)", uri);
                    state.ProgressHandler(progress);

                    WebDownloader web = new WebDownloader(uri, properties,
                        ea =>
                        {
                            try
                            {
                                progress.Message = string.Format(
                                    "{0} [{1}] ({2})",
                                    uri.ToUriShort(), ea.Status,
                                    (ea.Exception.IsNull() ?
                                        "responded: " + ea.Stream.Length + " bytes" :
                                        "exception: " + ea.Exception.Message));
                                state.ProgressHandler(progress);

                                currentUrlLinkInfo.Status = (int)ea.Status;

                                if (ea.Stream.IsNull())
                                {
                                    // No content: record the link with its status only.
                                    state.Repository.SaveLink(uri.ToString(), string.Empty, string.Empty, currentUrlLinkInfo);
                                }
                                else
                                {
                                    try
                                    {
                                        HtmlProcessor processor = new HtmlProcessor(
                                            uri.ToString(), ea.Stream,
                                            ((TldParser)ea.Properties["TldParser"]));

                                        progress.Message = string.Format(
                                            "{0} (found={1})", uri, processor.Links.Count);
                                        state.ProgressHandler(progress);

                                        // Check if there is an external link
                                        bool hasExternalLink = false;
                                        int countPage = 0;

                                        foreach (var l in processor.Links)
                                        {
                                            if (l.Domain != currentUrlLinkInfo.Domain)
                                            {
                                                hasExternalLink = true;
                                                break;
                                            }

                                            countPage++;
                                            if (countPage > sameDomainLinkLimit)
                                            {
                                                break;
                                            }
                                        }

                                        // There is at least one external link
                                        if (hasExternalLink)
                                        {
                                            // Save the current link
                                            state.Repository.SaveLink(uri.ToString(), string.Empty, string.Empty, currentUrlLinkInfo);
                                            state.VisitedUrls.Add(uri.ToString());
                                            state.VisitedDomains.Add(uri.ToString());

                                            // Save the kids
                                            int pushedLinks = 0;
                                            int linkCounter = 1;
                                            processor.Links.ForEach(l =>
                                            {
                                                progress.Message = string.Format(
                                                    "{0} (processing={1} of {2})",
                                                    uri,
                                                    linkCounter,
                                                    processor.Links.Count);
                                                state.ProgressHandler(progress);

                                                // Robots check is currently disabled:
                                                //if (state.Robots.IsAllowed(string.Empty, l.Href.ToUri()))
                                                //{
                                                ++pushedLinks;
                                                state.VisitedUrls.Add(l.Href);
                                                state.VisitedDomains.Add(l.Href);
                                                state.Pool.Store(l.Href);

                                                // NOTE(review): if LinkInfo is a reference type these two
                                                // locals alias currentUrlLinkInfo and l, so the assignments
                                                // below mutate the originals - confirm this is intended.
                                                HtmlProcessor.LinkInfo childLinkInfo = currentUrlLinkInfo;
                                                childLinkInfo.AnchorText = l.AnchorText;
                                                childLinkInfo.AnchorRel = l.AnchorRel;
                                                childLinkInfo.AnchorKind = l.AnchorKind;

                                                HtmlProcessor.LinkInfo backLinkInfo = l;
                                                backLinkInfo.AnchorText = string.Empty;
                                                backLinkInfo.AnchorRel = string.Empty;
                                                backLinkInfo.AnchorKind = string.Empty;

                                                // One record for page -> child, one for child <- page.
                                                state.Repository.SaveLink(uri.ToString(), l.Href, string.Empty, childLinkInfo);
                                                state.Repository.SaveLink(l.Href, string.Empty, uri.ToString(), backLinkInfo);
                                                state.VisitedDomains.Remove(l.Href);
                                                //}

                                                ++linkCounter;
                                            });
                                        }

                                        progress.Message = string.Format("{0} [DONE]", uri);
                                        state.ProgressHandler(progress);
                                    }
                                    finally
                                    {
                                        // Close the response stream even when processing fails.
                                        ea.Stream.Close();
                                    }
                                }
                            }
                            finally
                            {
                                // Always release the waiting collector; otherwise an exception in
                                // this callback would leave done.WaitOne() blocked.
                                ((ManualResetEvent)ea.Properties["State"]).Set();
                            }
                        });

                    web.Download();
                    done.WaitOne();

                    // Remove from the buffer so that collectors can crawl again the domain
                    state.VisitedDomains.Remove(link);
                }

                // Fetch next link
                link = state.PoolManagerHandler(state.Pool, state.VisitedUrls, state.VisitedDomains);
            }

            //state.Countdown.Signal();
            progress.Message = "NO URL IN THE POOL";
            state.ProgressHandler(progress);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Tells whether the domain of <paramref name="uri"/> contains any of the pool's excluded domains.
        /// </summary>
        /// <param name="pool">Pool whose <c>ExcludedDomains</c> list is consulted.</param>
        /// <param name="uri">Address to test.</param>
        /// <returns><c>true</c> when an excluded domain occurs inside the URI's domain; otherwise <c>false</c>.</returns>
        static bool IsExcludedDomain(CollectorPool pool, Uri uri)
        {
            var li = new HtmlProcessor.LinkInfo(TldParser.Instance);
            li.Href = uri.ToString();

            foreach (var excludedDomain in pool.ExcludedDomains)
            {
                // An empty pattern is found at index 0 of every string and would mark
                // every domain as excluded, so blank entries are ignored.
                if (string.IsNullOrEmpty(excludedDomain))
                {
                    continue;
                }

                // Ordinal matching: host names are not linguistic text, and the
                // culture-sensitive overload can ignore or mis-match characters.
                if (li.Domain.IndexOf(excludedDomain, StringComparison.Ordinal) > -1)
                {
                    return true;
                }
            }

            return false;
        }
Exemplo n.º 5
0
 /// <summary>
 /// Returns the Domain that a LinkInfo (backed by the shared TldParser instance) reports for the given URL.
 /// </summary>
 /// <param name="url">Address assigned to the link's Href.</param>
 /// <returns>The value of the link's Domain property.</returns>
 static string GetDomain(string url)
 {
     var linkInfo = new HtmlProcessor.LinkInfo(TldParser.Instance)
     {
         Href = url
     };

     return linkInfo.Domain;
 }