/// <summary>
/// Validates <paramref name="link"/> (relative to <paramref name="backlink"/>) through
/// <c>LinkParser.Validate</c> and, if a usable link comes back, constructs a
/// <see cref="Collector"/> for it.
/// </summary>
/// <remarks>
/// NOTE(review): the crawling/saving implied by the method name presumably happens inside the
/// <see cref="Collector"/> constructor; that is not visible from here - confirm.
/// </remarks>
/// <param name="cm">Manager handed through to the new collector.</param>
/// <param name="link">Raw link to validate; the validated form is what the collector receives.</param>
/// <param name="backlink">Link of the referring page; also used when validating <paramref name="link"/>.</param>
/// <param name="text">Passed through unchanged to the collector.</param>
/// <param name="rel">Passed through unchanged to the collector.</param>
/// <param name="kind">Passed through unchanged to the collector.</param>
/// <param name="crawlChildLinks">Passed through unchanged to the collector.</param>
/// <param name="poolChildLinksFound">Passed through unchanged to the collector.</param>
/// <returns>
/// The newly constructed collector, or <c>null</c> when validation yields a null or empty link.
/// </returns>
public static Collector CrawlAndSave(
    CollectorManager cm,
    string link,
    string backlink,
    string text,
    string rel,
    string kind,
    bool crawlChildLinks,
    bool poolChildLinksFound)
{
    string validatedLink = LinkParser.Validate(link, backlink);

    // Unparsable url: nothing to do.
    if (string.IsNullOrEmpty(validatedLink))
    {
        return null;
    }

    return new Collector(
        cm,
        validatedLink,
        backlink,
        text,
        rel,
        kind,
        crawlChildLinks,
        poolChildLinksFound);
}
// ~Collector() { if (this.Redis != null) this.Redis.Dispose(); }

/// <summary>
/// Writes the collector's link data to Redis and updates the per-link indexes.
/// </summary>
/// <remarks>
/// Hashes touched, in order:
/// <list type="bullet">
/// <item><c>urn:link:data</c> - field = link, value = serialized <c>LinkInfo</c>
/// (merged with the previously stored value when one exists).</item>
/// <item><c>urn:link:data-last-date-crawl</c> - field = link, value = <c>DateTime.Now.ToString()</c>.</item>
/// <item><c>urn:link:date-last-crawl</c> - field = <c>DateTime.Today.ToString()</c>,
/// value = serialized list of links (written through a separate connection).</item>
/// <item><c>urn:link:anchor</c> and <c>urn:link:backlink-count</c> - only when
/// <c>c.CurrentBacklink</c> is non-empty.</item>
/// <item><c>urn:link:domain-or-subdomain</c> and <c>urn:link:domain</c> - field = domain value,
/// value = serialized list of links.</item>
/// </list>
/// Each list index is a read-modify-write of a JSON list and is not atomic.
/// </remarks>
/// <param name="c">
/// Collector supplying <c>Link</c>, <c>LinkInfo</c>, <c>CurrentBacklink</c> and the <c>Redis</c>
/// connection. Must not be null.
/// </param>
static void SaveLink(Collector c)
{
    var urnLinkData = c.Redis.Hash["urn:link:data"];

    // Fold the previously stored record into the in-memory one before overwriting it.
    if (urnLinkData.ContainsKey(c.Link))
    {
        string serializedLinkData = urnLinkData[c.Link];
        c.LinkInfo.Merge(serializedLinkData.JsonDeserialize<LinkInfo>());
    }

    c.Redis.Hash["urn:link:data"][c.Link] = c.LinkInfo.JsonSerialize();
    c.Redis.Hash["urn:link:data-last-date-crawl"][c.Link] = DateTime.Now.ToString();

    // Index date last crawl.
    // The field name is computed once so the existence check, the read and the write all
    // address the same field even if the call straddles midnight.
    string todayKey = DateTime.Today.ToString();

    // Separate redis connection seems to be fixing race conditions
    using (RedisDataAccessProvider myRedis = new RedisDataAccessProvider())
    {
        myRedis.Configuration = c.Redis.Configuration;
        var urnLinkDateLastCrawl = myRedis.Hash["urn:link:date-last-crawl"];

        List<string> dateLastCrawlLinks;
        if (urnLinkDateLastCrawl.ContainsKey(todayKey))
        {
            dateLastCrawlLinks = urnLinkDateLastCrawl[todayKey]
                .JsonDeserialize<List<string>>();
        }
        else
        {
            dateLastCrawlLinks = new List<string>();
        }

        if (!dateLastCrawlLinks.Contains(c.Link))
        {
            dateLastCrawlLinks.Add(c.Link);
        }

        urnLinkDateLastCrawl.Set(todayKey, dateLastCrawlLinks.JsonSerialize());
        myRedis.Close();
    }

    if (!string.IsNullOrEmpty(c.CurrentBacklink))
    {
        // Index anchor
        var urnLinkAnchor = c.Redis.Hash["urn:link:anchor"];

        List<string> anchorData;
        if (urnLinkAnchor.ContainsKey(c.LinkInfo.LinkPairID))
        {
            // Read from the anchor hash that was just checked. The previous code read the
            // value from "urn:link:data" instead, which is keyed by link, not by link pair.
            anchorData = urnLinkAnchor[c.LinkInfo.LinkPairID]
                .JsonDeserialize<List<string>>();
        }
        else
        {
            anchorData = new List<string>();
        }

        // Only write the anchor list back when this anchor is new to it.
        var serializedAnchor = c.LinkInfo.AnchorInfo.JsonSerialize();
        if (!anchorData.Contains(serializedAnchor))
        {
            anchorData.Add(serializedAnchor);
            urnLinkAnchor[c.LinkInfo.LinkPairID] = anchorData.JsonSerialize();
        }

        // TODO: Index external backlinks

        // Index backlink count by link
        c.Redis.Hash["urn:link:backlink-count"][c.Link] =
            c.LinkInfo.Backlinks.Count.ToString();
    }

    // Index domain or subdomain
    var urnLinkDomainOrSubdomain = c.Redis.Hash["urn:link:domain-or-subdomain"];

    List<string> domainOrSubdomainLinks;
    if (urnLinkDomainOrSubdomain.ContainsKey(c.LinkInfo.DomainOrSubdomain))
    {
        domainOrSubdomainLinks = urnLinkDomainOrSubdomain[c.LinkInfo.DomainOrSubdomain]
            .JsonDeserialize<List<string>>();
    }
    else
    {
        domainOrSubdomainLinks = new List<string>();
    }

    if (!domainOrSubdomainLinks.Contains(c.Link))
    {
        domainOrSubdomainLinks.Add(c.Link);
    }

    urnLinkDomainOrSubdomain[c.LinkInfo.DomainOrSubdomain] =
        domainOrSubdomainLinks.JsonSerialize();

    // Index domain
    var urnLinkDomain = c.Redis.Hash["urn:link:domain"];

    List<string> domainLinks;
    if (urnLinkDomain.ContainsKey(c.LinkInfo.Domain))
    {
        domainLinks = urnLinkDomain[c.LinkInfo.Domain]
            .JsonDeserialize<List<string>>();
    }
    else
    {
        domainLinks = new List<string>();
    }

    if (!domainLinks.Contains(c.Link))
    {
        domainLinks.Add(c.Link);
    }

    urnLinkDomain[c.LinkInfo.Domain] = domainLinks.JsonSerialize();
}