public void TestInitializeTld()
{
    LinkInfo u = new LinkInfo(CM);
    Assert.IsTrue(LinkInfo.TldMozillaList.Count > 0, "TldMozillaList not loaded");

    foreach (string tld in LinkInfo.TldMozillaList)
    {
        if (tld.StartsWith("//"))
            Assert.Fail("TldMozillaList contains a 'comment' value");
        else if (string.IsNullOrWhiteSpace(tld))
            Assert.Fail("TldMozillaList contains an empty string");
    }
}
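For reference, TldMozillaList is expected to hold the entries of the Mozilla-maintained public suffix list with comment lines ("//...") and blank lines stripped out, which is exactly what the test above asserts. The loader below is a minimal sketch of that filtering step; TldListLoader and its Load method are illustrative names and are not the actual LinkInfo implementation.

using System.Collections.Generic;
using System.IO;
using System.Linq;

public static class TldListLoader
{
    // Reads a local copy of the public suffix list and keeps only real entries:
    // comment lines starting with "//" and blank lines are dropped before storing.
    public static List<string> Load(string path)
    {
        return File.ReadAllLines(path)
            .Select(line => line.Trim())
            .Where(line => !line.StartsWith("//"))           // drop comment lines
            .Where(line => !string.IsNullOrWhiteSpace(line)) // drop blank lines
            .ToList();
    }
}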
public void TestNewLinkInfo()
{
    LinkInfo u = new LinkInfo(CM, "http://www.yahoo.com", "http://www.google.com", "Yahoo!", "nofollow", "anchor");
    Assert.IsTrue(u.Link == "http://www.yahoo.com/", "Link is wrong");
    Assert.IsTrue(u.Domain == "yahoo.com", "Domain is wrong `{0}`", u.Domain);
    Assert.IsTrue(u.DomainOrSubdomain == "www.yahoo.com", "DomainOrSubdomain is wrong `{0}`", u.DomainOrSubdomain);
    Assert.IsTrue(u.Tld == "com", "Tld is wrong `{0}`", u.Tld);
    Assert.IsTrue(u.DomainScheme == "http", "Scheme is wrong {0}", u.DomainScheme);
    Assert.IsTrue(u.DomainSchemeAndServer == "http://www.yahoo.com/", "SchemeAndServer is wrong {0}", u.DomainSchemeAndServer);
    Assert.IsTrue(!string.IsNullOrEmpty(u.DomainCountry), "Country is not resolved for {0}", u.Domain);
    Assert.IsTrue(u.IPAddresses.Count > 0, "No ip address is resolved for {0}", u.Domain);
    foreach (string s in u.RobotsExclusion)
    {
        Assert.IsTrue(s.StartsWith("/"), "Invalid robots.txt for {0}", u.DomainSchemeAndServer);
    }

    u = new LinkInfo(CM, "https://dev.mysql.com/", "http://www.google.com", "MySQL", "nofollow", "anchor");
    Assert.IsTrue(u.Link == "https://dev.mysql.com/", "Link is wrong `{0}`", u.Link);
    Assert.IsTrue(u.Domain == "mysql.com", "Domain is wrong `{0}`", u.Domain);
    Assert.IsTrue(u.DomainOrSubdomain == "dev.mysql.com", "DomainOrSubdomain is wrong `{0}`", u.DomainOrSubdomain);
    Assert.IsTrue(u.Tld == "com", "Tld is wrong `{0}`", u.Tld);
    Assert.IsTrue(u.DomainScheme == "https", "Scheme is wrong {0}", u.DomainScheme);
    Assert.IsTrue(u.DomainSchemeAndServer == "https://dev.mysql.com/", "SchemeAndServer is wrong {0}", u.DomainSchemeAndServer);
    Assert.IsTrue(!string.IsNullOrEmpty(u.DomainCountry), "Country is not resolved for {0}", u.Domain);
    Assert.IsTrue(u.IPAddresses.Count > 0, "No ip address is resolved for {0}", u.Domain);
    Assert.IsTrue(u.RobotsExclusion.Count > 0, "No robots.txt for {0}", u.DomainSchemeAndServer);

    u = new LinkInfo(CM, "http://www.google.com.ph/webhp?sourceid=chrome-instant", "http://www.google.com", "MySQL", "nofollow", "anchor");
    Assert.IsTrue(u.Link == "http://www.google.com.ph/webhp?sourceid=chrome-instant", "Link is wrong `{0}`", u.Link);
    Assert.IsTrue(u.Domain == "google.com.ph", "Domain is wrong `{0}`", u.Domain);
    Assert.IsTrue(u.DomainOrSubdomain == "www.google.com.ph", "DomainOrSubdomain is wrong `{0}`", u.DomainOrSubdomain);
    Assert.IsTrue(u.Tld == "com.ph", "Tld is wrong `{0}`", u.Tld);
    Assert.IsTrue(u.DomainScheme == "http", "Scheme is wrong {0}", u.DomainScheme);
    Assert.IsTrue(u.DomainSchemeAndServer == "http://www.google.com.ph/", "SchemeAndServer is wrong {0}", u.DomainSchemeAndServer);
    Assert.IsTrue(!string.IsNullOrEmpty(u.DomainCountry), "Country is not resolved for {0}", u.Domain);
    Assert.IsTrue(u.IPAddresses.Count > 0, "No ip address is resolved for {0}", u.Domain);
    Assert.IsTrue(u.RobotsExclusion.Count > 0, "No robots.txt for {0}", u.DomainSchemeAndServer);
}
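The google.com.ph case shows that Tld is the effective TLD ("com.ph"), not just the last label. One plausible way to get that result, sketched below, is to match the longest host suffix that appears in the public suffix list and treat one additional label as the registrable domain. This is an assumption about the approach, not LinkInfo's actual code: DomainSplitter is an illustrative name, and the sketch ignores the wildcard and exception rules of the real list.

using System;
using System.Collections.Generic;
using System.Linq;

public static class DomainSplitter
{
    // Splits a URI's host into (effective TLD, registrable domain, full host),
    // e.g. "www.google.com.ph" -> ("com.ph", "google.com.ph", "www.google.com.ph").
    public static (string Tld, string Domain, string DomainOrSubdomain) Split(
        Uri uri, ISet<string> suffixList)
    {
        string host = uri.Host;
        string[] labels = host.Split('.');

        // Try the longest candidate suffix first: "google.com.ph", then "com.ph", then "ph".
        for (int i = 1; i < labels.Length; i++)
        {
            string candidate = string.Join(".", labels.Skip(i));
            if (suffixList.Contains(candidate))
            {
                // The registrable domain is the matched suffix plus one more label.
                string domain = string.Join(".", labels.Skip(i - 1));
                return (candidate, domain, host);
            }
        }

        // Fallback when no suffix matches: treat the last label as the TLD.
        return (labels.Last(), host, host);
    }
}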
static void CrawlNextLinkFromPool(object o)
{
    CollectorManager m = (CollectorManager)o;

    // Limit the number of concurrent workers
    if (m.LinksCurrentlyProcessing.Count < m.COLLECTOR_COUNT)
    {
        //string link = m.Redis.DequeueItemFromList("urn:pool");
        string link = string.Empty;
        try
        {
            if (m.COLLECTOR_DIRECTION == COLLECTOR_DIRECTION_OLDEST)
                link = m.Redis.List["urn:pool"].LeftPop();
            else
                link = m.Redis.List["urn:pool"].RightPop();
        }
        catch
        {
            // The pool may be empty or Redis momentarily unreachable; treat it as no work.
        }

        if (!string.IsNullOrEmpty(link))
        {
            link = LinkParser.Validate(link, string.Empty);
        }

        if (!string.IsNullOrEmpty(link))
        {
            m.LinksCurrentlyProcessing.Add(link);
            LinkInfo info = new LinkInfo(m, link);

            if (m.IsDomainOrSubdomainCurrentlyCrawled(info.DomainOrSubdomain))
            {
                // Another worker is already on this domain; put the link back and try again later.
                Thread.Sleep(500);
                //m.Redis.AddItemToList("urn:pool", link);
                if (m.COLLECTOR_DIRECTION == COLLECTOR_DIRECTION_OLDEST)
                    m.Redis.List["urn:pool"].Append(link);
                else
                    m.Redis.List["urn:pool"].Prepend(link);
                m.LinksCurrentlyProcessing.Remove(link);
            }
            else
            {
                DateTime? lastDateCrawl = null;
                //if (m.Redis.HashContainsEntry("urn:link:data-last-date-crawl", info.Link))
                if (m.Redis.Hash["urn:link:data-last-date-crawl"].ContainsKey(info.Link))
                {
                    //lastDateCrawl = Convert.ToDateTime(
                    //    m.Redis.GetValueFromHash("urn:link:data-last-date-crawl", info.Link));
                    lastDateCrawl = Convert.ToDateTime(
                        m.Redis.Hash["urn:link:data-last-date-crawl"][info.Link]);
                }

                // Only crawl if the link has never been crawled, if the last crawl was more
                // than 30 days ago, or if the collector runs in "newest first" mode.
                if (!lastDateCrawl.HasValue ||
                    (DateTime.Now - lastDateCrawl.Value).Days > 30 ||
                    m.COLLECTOR_DIRECTION == COLLECTOR_DIRECTION_NEWEST)
                {
                    m.Collectors.Add(link, new LinkCollectorTaskPair
                    {
                        Task = Task.Factory.StartNew(CrawlLinkInfo, new object[] { m, info }),
                        LinkInfo = info
                    });
                    m.CollectorCount[0] = m.Collectors.Count;
                }
                else
                {
                    m.LinksCurrentlyProcessing.Remove(link);
                }
            }
        }
    }
}
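The excerpt shows the worker body but not how it is scheduled. Since CrawlNextLinkFromPool already matches the TimerCallback signature (a single object parameter) and enforces COLLECTOR_COUNT itself, one simple driver is a System.Threading.Timer that fires it on a short interval. The sketch below is an assumption for illustration only: StartCollecting, StopCollecting, and the 500 ms poll interval are not part of the actual CollectorManager.

// Requires: using System.Threading;
// Illustrative members inside CollectorManager.
private Timer poolTimer;

public void StartCollecting()
{
    // Poll the Redis pool twice a second; when all workers are busy,
    // CrawlNextLinkFromPool returns immediately, so over-firing is harmless.
    poolTimer = new Timer(CrawlNextLinkFromPool, this, 0, 500);
}

public void StopCollecting()
{
    if (poolTimer != null)
        poolTimer.Dispose();
}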
// Merge the IP addresses and backlinks from another LinkInfo into this one,
// skipping values that are already present.
public void Merge(LinkInfo info)
{
    if (info.IPAddresses != null)
    {
        foreach (string ip in info.IPAddresses)
        {
            if (!this.IPAddresses.Contains(ip))
                this.IPAddresses.Add(ip);
        }
    }

    if (info.Backlinks != null)
    {
        foreach (string backlink in info.Backlinks)
        {
            if (!this.Backlinks.Contains(backlink))
                this.Backlinks.Add(backlink);
        }
    }
}
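A short usage example in the style of the tests above: because Merge skips values that are already present, merging the same LinkInfo twice must not grow the lists. The test name TestMergeLinkInfo and the second URL are illustrative and not part of the original suite.

public void TestMergeLinkInfo()
{
    LinkInfo primary = new LinkInfo(CM, "http://www.yahoo.com", "http://www.google.com", "Yahoo!", "nofollow", "anchor");
    LinkInfo other = new LinkInfo(CM, "http://www.yahoo.com", "http://www.bing.com", "Yahoo!", "nofollow", "anchor");

    primary.Merge(other);                              // copies IPs/backlinks not already present
    int ipCountAfterFirstMerge = primary.IPAddresses.Count;

    primary.Merge(other);                              // merging the same data again is a no-op
    Assert.IsTrue(primary.IPAddresses.Count == ipCountAfterFirstMerge, "Merge added duplicate IP addresses");
}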