Example #1
0
        /// <summary>
        /// Verifies that constructing a <see cref="LinkInfo"/> loads the Mozilla
        /// public-suffix (TLD) list and that the list contains no comment lines
        /// ("//"-prefixed) or blank entries.
        /// </summary>
        public void TestInitializeTld()
        {
            // The constructor is invoked purely for its side effect of populating
            // LinkInfo.TldMozillaList; the instance itself is not needed.
            new LinkInfo(CM);

            Assert.IsTrue(LinkInfo.TldMozillaList.Count > 0, "TldMozillaList not loaded");

            foreach (string tld in LinkInfo.TldMozillaList)
            {
                // Ordinal comparison: "//" is a file-format marker, not linguistic text.
                if (tld.StartsWith("//", StringComparison.Ordinal))
                {
                    Assert.Fail("TldMozillaList contains a 'comment' value");
                }
                else if (string.IsNullOrWhiteSpace(tld))
                {
                    Assert.Fail("TldMozillaList contains an empty string");
                }
            }
        }
Example #2
0
        /// <summary>
        /// Verifies that <see cref="LinkInfo"/> correctly parses links, domains,
        /// subdomains, TLDs (including multi-part TLDs such as "com.ph"), schemes,
        /// and resolves country, IP addresses, and robots.txt exclusions.
        /// </summary>
        /// <remarks>
        /// NOTE(review): this test performs live DNS and HTTP (robots.txt) lookups,
        /// so it requires network access and may fail offline or if the probed
        /// sites change — confirm this is acceptable for the test suite.
        /// </remarks>
        public void TestNewLinkInfo()
        {
            // Plain domain with "www." subdomain and a simple single-part TLD.
            LinkInfo u = new LinkInfo(CM, "http://www.yahoo.com", "http://www.google.com", "Yahoo!", "nofollow", "anchor");
            Assert.AreEqual("http://www.yahoo.com/", u.Link, "Link is wrong");
            Assert.AreEqual("yahoo.com", u.Domain, "Domain is wrong");
            Assert.AreEqual("www.yahoo.com", u.DomainOrSubdomain, "DomainOrSubdomain is wrong");
            Assert.AreEqual("com", u.Tld, "Tld is wrong");
            Assert.AreEqual("http", u.DomainScheme, "Scheme is wrong");
            Assert.AreEqual("http://www.yahoo.com/", u.DomainSchemeAndServer, "SchemeAndServer is wrong");
            Assert.IsTrue(!string.IsNullOrEmpty(u.DomainCountry), "Country is not resolved for {0}", u.Domain);
            Assert.IsTrue(u.IPAddresses.Count > 0, "No ip address is resolved for {0}", u.Domain);
            foreach (string s in u.RobotsExclusion)
            {
                // Every robots.txt exclusion entry should be a path rooted at "/".
                Assert.IsTrue(s.StartsWith("/"), "Invalid robots.txt for {0}", u.DomainSchemeAndServer);
            }

            // Non-"www" subdomain over https: Domain strips the subdomain,
            // DomainOrSubdomain keeps it.
            u = new LinkInfo(CM, "https://dev.mysql.com/", "http://www.google.com", "MySQL", "nofollow", "anchor");
            Assert.AreEqual("https://dev.mysql.com/", u.Link, "Link is wrong");
            Assert.AreEqual("mysql.com", u.Domain, "Domain is wrong");
            Assert.AreEqual("dev.mysql.com", u.DomainOrSubdomain, "DomainOrSubdomain is wrong");
            Assert.AreEqual("com", u.Tld, "Tld is wrong");
            Assert.AreEqual("https", u.DomainScheme, "Scheme is wrong");
            Assert.AreEqual("https://dev.mysql.com/", u.DomainSchemeAndServer, "SchemeAndServer is wrong");
            Assert.IsTrue(!string.IsNullOrEmpty(u.DomainCountry), "Country is not resolved for {0}", u.Domain);
            Assert.IsTrue(u.IPAddresses.Count > 0, "No ip address is resolved for {0}", u.Domain);
            Assert.IsTrue(u.RobotsExclusion.Count > 0, "No robots.txt for {0}", u.DomainSchemeAndServer);

            // Multi-part (public-suffix) TLD "com.ph" plus a query string:
            // the query string is preserved in Link but not in DomainSchemeAndServer.
            u = new LinkInfo(CM, "http://www.google.com.ph/webhp?sourceid=chrome-instant", "http://www.google.com", "MySQL", "nofollow", "anchor");
            Assert.AreEqual("http://www.google.com.ph/webhp?sourceid=chrome-instant", u.Link, "Link is wrong");
            Assert.AreEqual("google.com.ph", u.Domain, "Domain is wrong");
            Assert.AreEqual("www.google.com.ph", u.DomainOrSubdomain, "DomainOrSubdomain is wrong");
            Assert.AreEqual("com.ph", u.Tld, "Tld is wrong");
            Assert.AreEqual("http", u.DomainScheme, "Scheme is wrong");
            Assert.AreEqual("http://www.google.com.ph/", u.DomainSchemeAndServer, "SchemeAndServer is wrong");
            Assert.IsTrue(!string.IsNullOrEmpty(u.DomainCountry), "Country is not resolved for {0}", u.Domain);
            Assert.IsTrue(u.IPAddresses.Count > 0, "No ip address is resolved for {0}", u.Domain);
            Assert.IsTrue(u.RobotsExclusion.Count > 0, "No robots.txt for {0}", u.DomainSchemeAndServer);
        }
Example #3
0
        // Pops the next link from the Redis "urn:pool" list and, if eligible,
        // starts a collector task to crawl it. Intended as a thread-pool /
        // timer callback: `o` must be the owning CollectorManager.
        //
        // NOTE(review): the Count check below is check-then-act on shared state;
        // presumably callers serialize invocations or accept occasional
        // over-admission — confirm the intended concurrency model.
        static void CrawlNextLinkFromPool(object o)
        {
            CollectorManager m = (CollectorManager)o;

            // Limit workers
            if (m.LinksCurrentlyProcessing.Count < m.COLLECTOR_COUNT)
            {
                //string link = m.Redis.DequeueItemFromList("urn:pool");
                string link = string.Empty;

                try
                {
                    // Pop from the head for oldest-first crawling, from the tail
                    // for newest-first.
                    if (m.COLLECTOR_DIRECTION == COLLECTOR_DIRECTION_OLDEST)
                        link = m.Redis.List["urn:pool"].LeftPop();
                    else
                        link = m.Redis.List["urn:pool"].RightPop();
                }
                // Deliberate best-effort: a failed pop (e.g. empty pool or Redis
                // hiccup) leaves `link` empty and this invocation becomes a no-op.
                catch { }

                if (!string.IsNullOrEmpty(link))
                {
                    // Normalize/validate the raw value; Validate may return empty
                    // for unusable links, which the next check filters out.
                    link = LinkParser.Validate(link, string.Empty);
                }

                if (!string.IsNullOrEmpty(link))
                {
                    m.LinksCurrentlyProcessing.Add(link);

                    LinkInfo info = new LinkInfo(m, link);

                    if (m.IsDomainOrSubdomainCurrentlyCrawled(info.DomainOrSubdomain))
                    {
                        // Another worker is already on this (sub)domain: back off
                        // briefly, put the link back on the pool at the opposite
                        // end it was popped from, and release the slot.
                        Thread.Sleep(500);
                        //m.Redis.AddItemToList("urn:pool", link);

                        if (m.COLLECTOR_DIRECTION == COLLECTOR_DIRECTION_OLDEST)
                            m.Redis.List["urn:pool"].Append(link);
                        else
                            m.Redis.List["urn:pool"].Prepend(link);

                        m.LinksCurrentlyProcessing.Remove(link);
                    }
                    else
                    {
                        // Look up when (if ever) this link was last crawled.
                        DateTime? lastDateCrawl = null;
                        //if (m.Redis.HashContainsEntry("urn:link:data-last-date-crawl", info.Link))
                        if (m.Redis.Hash["urn:link:data-last-date-crawl"].ContainsKey(info.Link))
                        {
                            //lastDateCrawl = Convert.ToDateTime(
                            //    m.Redis.GetValueFromHash("urn:link:data-last-date-crawl", info.Link));
                            lastDateCrawl = Convert.ToDateTime(
                                m.Redis.Hash["urn:link:data-last-date-crawl"][info.Link]);
                        }

                        if (!lastDateCrawl.HasValue ||
                            (DateTime.Now - lastDateCrawl.Value).Days > 30 ||
                            m.COLLECTOR_DIRECTION == COLLECTOR_DIRECTION_NEWEST)
                        {
                            // Only crawl if the link has not been crawled since
                            // And if the last crawl was 30 days ago
                            // (newest-first mode recrawls unconditionally).
                            m.Collectors.Add(link,
                                new LinkCollectorTaskPair
                                {
                                    Task = Task.Factory.StartNew(CrawlLinkInfo,
                                        new object[] { m, info }),
                                    LinkInfo = info
                                });
                            m.CollectorCount[0] = m.Collectors.Count;
                        }
                        else
                        {
                            // Crawled recently enough: drop it and free the slot.
                            m.LinksCurrentlyProcessing.Remove(link);
                        }
                    }
                }
            }
        }
Example #4
0
        /// <summary>
        /// Merges the IP addresses and backlinks of another <see cref="LinkInfo"/>
        /// into this instance, skipping values already present. Insertion order of
        /// newly added items is preserved.
        /// </summary>
        /// <param name="info">The instance whose data is merged in; a null
        /// <c>IPAddresses</c> or <c>Backlinks</c> collection is skipped.</param>
        public void Merge(LinkInfo info)
        {
            MergeDistinct(this.IPAddresses, info.IPAddresses);
            MergeDistinct(this.Backlinks, info.Backlinks);
        }

        // Appends each item of `source` to `target` unless it is already present.
        // A HashSet of the existing items makes this O(n) overall instead of the
        // O(n^2) repeated List.Contains scan; the default (ordinal) string
        // comparer matches List.Contains semantics.
        private static void MergeDistinct(ICollection<string> target, IEnumerable<string> source)
        {
            if (source == null)
                return;

            HashSet<string> seen = new HashSet<string>(target);
            foreach (string item in source)
            {
                // HashSet.Add returns false for duplicates, covering both items
                // already in `target` and duplicates within `source` itself.
                if (seen.Add(item))
                    target.Add(item);
            }
        }