Пример #1
0
 /// <summary>
 /// Validates <paramref name="link"/> (relative to <paramref name="backlink"/>)
 /// through <c>LinkParser.Validate</c> and, when the result is usable,
 /// constructs a <c>Collector</c> for it.
 /// </summary>
 /// <returns>
 /// The newly constructed <c>Collector</c>, or <c>null</c> when validation
 /// yields a null or empty link.
 /// </returns>
 public static Collector CrawlAndSave(
     CollectorManager cm,
     string link, string backlink, string text, string rel, string kind,
     bool crawlChildLinks, bool poolChildLinksFound)
 {
     string validatedLink = LinkParser.Validate(link, backlink);

     // Unparsable URL: there is nothing to build a collector for.
     if (string.IsNullOrEmpty(validatedLink))
     {
         return null;
     }

     // NOTE(review): any crawling/saving presumably happens inside the
     // Collector constructor; that is not visible from this method.
     return new Collector(
         cm, validatedLink, backlink, text, rel, kind, crawlChildLinks, poolChildLinksFound);
 }
Пример #2
0
        // ~Collector() { if (this.Redis != null) this.Redis.Dispose(); }
        /// <summary>
        /// Writes the collector's link record to Redis and updates the hashes that
        /// index it: last crawl timestamp, links crawled per day, anchors and
        /// backlink count (only when a current backlink is set), and the links
        /// grouped by domain-or-subdomain and by domain.
        /// </summary>
        /// <param name="c">
        /// Collector supplying the <c>Link</c>, <c>LinkInfo</c>, <c>CurrentBacklink</c>
        /// and the <c>Redis</c> connection used for the writes.
        /// </param>
        /// <remarks>
        /// Every index is updated with a read-modify-write of a JSON-serialized
        /// list; nothing in this method makes those updates atomic.
        /// </remarks>
        static void SaveLink(Collector c)
        {
            // Merge any record already stored for this link into the in-memory
            // LinkInfo before overwriting the stored copy.
            var urnLinkData = c.Redis.Hash["urn:link:data"];
            if (urnLinkData.ContainsKey(c.Link))
            {
                string serializedLinkData = urnLinkData[c.Link];
                c.LinkInfo.Merge(serializedLinkData.JsonDeserialize<LinkInfo>());
            }

            c.Redis.Hash["urn:link:data"][c.Link] = c.LinkInfo.JsonSerialize();
            c.Redis.Hash["urn:link:data-last-date-crawl"][c.Link] = DateTime.Now.ToString();

            // Index links by the date they were last crawled.
            // Separate redis connection seems to be fixing race conditions
            using (RedisDataAccessProvider myRedis = new RedisDataAccessProvider())
            {
                myRedis.Configuration = c.Redis.Configuration;

                // Compute the key once so the existence check, the read and the
                // write all target the same field even if midnight passes mid-call.
                string todayKey = DateTime.Today.ToString();

                var urnLinkDateLastCrawl = myRedis.Hash["urn:link:date-last-crawl"];
                List<string> dateLastCrawlLinks;
                if (urnLinkDateLastCrawl.ContainsKey(todayKey))
                {
                    dateLastCrawlLinks = urnLinkDateLastCrawl[todayKey]
                        .JsonDeserialize<List<string>>();
                }
                else
                {
                    dateLastCrawlLinks = new List<string>();
                }

                if (!dateLastCrawlLinks.Contains(c.Link))
                {
                    dateLastCrawlLinks.Add(c.Link);
                }

                urnLinkDateLastCrawl.Set(todayKey, dateLastCrawlLinks.JsonSerialize());

                myRedis.Close();
            }

            if (!string.IsNullOrEmpty(c.CurrentBacklink))
            {
                // Index anchor
                var urnLinkAnchor = c.Redis.Hash["urn:link:anchor"];
                List<string> anchorData;
                if (urnLinkAnchor.ContainsKey(c.LinkInfo.LinkPairID))
                {
                    // Read from the anchor hash that was just checked. The previous
                    // code read the same key from the "urn:link:data" hash, so the
                    // stored anchors were never loaded.
                    anchorData = urnLinkAnchor[c.LinkInfo.LinkPairID]
                        .JsonDeserialize<List<string>>();
                }
                else
                {
                    anchorData = new List<string>();
                }

                // Only rewrite the stored list when this anchor is new.
                string serializedAnchor = c.LinkInfo.AnchorInfo.JsonSerialize();
                if (!anchorData.Contains(serializedAnchor))
                {
                    anchorData.Add(serializedAnchor);
                    urnLinkAnchor[c.LinkInfo.LinkPairID] = anchorData.JsonSerialize();
                }

                // TODO: Index external backlinks

                // Index backlink count by link
                c.Redis.Hash["urn:link:backlink-count"][c.Link] =
                    c.LinkInfo.Backlinks.Count.ToString();
            }

            // Index domain or subdomain
            var urnLinkDomainOrSubdomain = c.Redis.Hash["urn:link:domain-or-subdomain"];
            List<string> domainOrSubdomainLinks;
            if (urnLinkDomainOrSubdomain.ContainsKey(c.LinkInfo.DomainOrSubdomain))
            {
                domainOrSubdomainLinks = urnLinkDomainOrSubdomain[c.LinkInfo.DomainOrSubdomain]
                    .JsonDeserialize<List<string>>();
            }
            else
            {
                domainOrSubdomainLinks = new List<string>();
            }

            if (!domainOrSubdomainLinks.Contains(c.Link))
            {
                domainOrSubdomainLinks.Add(c.Link);
            }

            urnLinkDomainOrSubdomain[c.LinkInfo.DomainOrSubdomain] =
                domainOrSubdomainLinks.JsonSerialize();

            // Index domain
            var urnLinkDomain = c.Redis.Hash["urn:link:domain"];
            List<string> domainLinks;
            if (urnLinkDomain.ContainsKey(c.LinkInfo.Domain))
            {
                domainLinks = urnLinkDomain[c.LinkInfo.Domain]
                    .JsonDeserialize<List<string>>();
            }
            else
            {
                domainLinks = new List<string>();
            }

            if (!domainLinks.Contains(c.Link))
            {
                domainLinks.Add(c.Link);
            }

            urnLinkDomain[c.LinkInfo.Domain] = domainLinks.JsonSerialize();
        }