Esempio n. 1
0
        /// <summary>
        /// Starts the crawl: stamps and saves the crawl setting, then, for every entry in
        /// <c>_hosts</c>, creates <c>_threadsNumber</c> agents sharing one
        /// <see cref="CrawlingSharedResource"/> and calls <c>CrawlAsync</c> on each of them.
        /// </summary>
        /// <remarks>
        /// All hosts are registered in <c>_aliveAgentsDic</c> and <c>_agentsDic</c> before the
        /// first agent is started. The subscribed handlers decide "crawl started" by checking
        /// that every registered host has <c>_threadsNumber</c> live agents, and "crawl
        /// completed" by checking that <c>_agentsDic</c> is empty; registering hosts one at a
        /// time while earlier agents were already running let both conditions become true
        /// before the remaining hosts were known. Registering up front also means this method
        /// no longer mutates the dictionaries once handlers may be using them.
        /// </remarks>
        public void Crawl()
        {
            _crawlSetting.StartTime = DateTime.Now;

            DataLayer.Save(_crawlSetting);

            // Phase 1: register every host before any agent can raise an event.
            // The agent arrays are also kept locally so that phase 2 never has to read the
            // dictionaries, which the handlers may be modifying by then.
            CrawlerAgent[][] agentsByHost = new CrawlerAgent[_hosts.Length][];

            for (int i = 0; i < _hosts.Length; i++)
            {
                agentsByHost[i] = new CrawlerAgent[_threadsNumber];

                _aliveAgentsDic.Add(_hosts[i], 0);
                _agentsDic.Add(_hosts[i], agentsByHost[i]);
            }

            // Phase 2: stamp and save each host, then create and start its agents.
            for (int i = 0; i < _hosts.Length; i++)
            {
                Queue<Webpage>  sharedQueue    = new Queue<Webpage>();
                object          sharedLock     = new object();
                HashSet<string> sharedPageHash = new HashSet<string>();
                HashSet<string> sharedFormHash = new HashSet<string>();

                // One shared resource per host: all agents of the host receive the same
                // queue, lock object and hash sets.
                CrawlingSharedResource sharedResource = new CrawlingSharedResource(_crawlSetting, _hosts[i], sharedQueue, sharedLock, sharedPageHash, sharedFormHash);
                CrawlerAgent[]         agents         = agentsByHost[i];

                _hosts[i].StartTime = DateTime.Now;
                _hosts[i].CrawlId   = _crawlSetting.Id;

                DataLayer.Save(_hosts[i]);

                for (int j = 0; j < _threadsNumber; j++)
                {
                    agents[j] = new CrawlerAgent(sharedResource);
                    agents[j].CrawlAgentCompleted += agent_CrawlAgentCompleted;
                    agents[j].CrawlAnnounced      += agent_CrawlAnnounced;
                    agents[j].CrawlAgentStarted   += Crawler_CrawlAgentStarted;
                    agents[j].CrawlAsync();
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Handles an agent's <c>CrawlAgentStarted</c> event: increments the live-agent count
        /// of the agent's host and, when every host currently in <c>_aliveAgentsDic</c> has
        /// exactly <c>_threadsNumber</c> live agents, calls <c>OnCrawlStarted</c>.
        /// </summary>
        /// <param name="sender">The <see cref="CrawlerAgent"/> that started.</param>
        /// <param name="e">Not used.</param>
        /// <exception cref="InvalidCastException">
        /// <paramref name="sender"/> is not a <see cref="CrawlerAgent"/>.
        /// </exception>
        void Crawler_CrawlAgentStarted(object sender, EventArgs e)
        {
            // Direct cast instead of `as`: a sender of the wrong type now fails right here
            // with InvalidCastException instead of a NullReferenceException one line later.
            CrawlerAgent agent = (CrawlerAgent)sender;
            Host         host  = agent.CrawlingSharedResource.Host;

            // The counter update and the all-hosts check are done under the same lock so
            // they are evaluated as one step.
            lock (_lock)
            {
                _aliveAgentsDic[host]++;
                if (_aliveAgentsDic.All(kv => kv.Value == _threadsNumber))
                {
                    OnCrawlStarted(this);
                }
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Handles an agent's <c>CrawlAgentCompleted</c> event: decrements the live-agent
        /// count of the agent's host. When that count reaches zero the host is removed from
        /// <c>_agentsDic</c>, stamped with a finish time and saved; when <c>_agentsDic</c>
        /// becomes empty the crawl setting is stamped and saved as well and
        /// <c>OnCrawlCompleted</c> is called.
        /// </summary>
        /// <param name="sender">The <see cref="CrawlerAgent"/> that completed.</param>
        /// <param name="e">Not used.</param>
        /// <exception cref="InvalidCastException">
        /// <paramref name="sender"/> is not a <see cref="CrawlerAgent"/>.
        /// </exception>
        void agent_CrawlAgentCompleted(object sender, EventArgs e)
        {
            // Direct cast instead of `as`: a sender of the wrong type now fails right here
            // with InvalidCastException instead of a NullReferenceException one line later.
            CrawlerAgent agent = (CrawlerAgent)sender;
            Host         host  = agent.CrawlingSharedResource.Host;

            lock (_lock)
            {
                _aliveAgentsDic[host]--;
                if (_aliveAgentsDic[host] == 0)
                {
                    // Last live agent of this host: the host is finished.
                    _agentsDic.Remove(host);
                    host.FinishTime = DateTime.Now;

                    DataLayer.Save(host);

                    // No host left in _agentsDic: the whole crawl is finished.
                    if (_agentsDic.Count == 0)
                    {
                        _crawlSetting.FinishTime = DateTime.Now;
                        DataLayer.Save(_crawlSetting);
                        OnCrawlCompleted(this);
                    }
                }
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Starts the crawl: stamps and saves the crawl setting, then, for every entry in
        /// <c>_hosts</c>, creates <c>_threadsNumber</c> agents sharing one
        /// <see cref="CrawlingSharedResource"/> and calls <c>CrawlAsync</c> on each of them.
        /// </summary>
        /// <remarks>
        /// All hosts are registered in <c>_aliveAgentsDic</c> and <c>_agentsDic</c> before the
        /// first agent is started. The subscribed handlers decide "crawl started" by checking
        /// that every registered host has <c>_threadsNumber</c> live agents, and "crawl
        /// completed" by checking that <c>_agentsDic</c> is empty; registering hosts one at a
        /// time while earlier agents were already running let both conditions become true
        /// before the remaining hosts were known. Registering up front also means this method
        /// no longer mutates the dictionaries once handlers may be using them.
        /// </remarks>
        public void Crawl()
        {
            _crawlSetting.StartTime = DateTime.Now;

            DataLayer.Save(_crawlSetting);

            // Phase 1: register every host before any agent can raise an event.
            // The agent arrays are also kept locally so that phase 2 never has to read the
            // dictionaries, which the handlers may be modifying by then.
            CrawlerAgent[][] agentsByHost = new CrawlerAgent[_hosts.Length][];

            for (int i = 0; i < _hosts.Length; i++)
            {
                agentsByHost[i] = new CrawlerAgent[_threadsNumber];

                _aliveAgentsDic.Add(_hosts[i], 0);
                _agentsDic.Add(_hosts[i], agentsByHost[i]);
            }

            // Phase 2: stamp and save each host, then create and start its agents.
            for (int i = 0; i < _hosts.Length; i++)
            {
                Queue<Webpage> sharedQueue = new Queue<Webpage>();
                object sharedLock = new object();
                HashSet<string> sharedPageHash = new HashSet<string>();
                HashSet<string> sharedFormHash = new HashSet<string>();

                // One shared resource per host: all agents of the host receive the same
                // queue, lock object and hash sets.
                CrawlingSharedResource sharedResource = new CrawlingSharedResource(_crawlSetting, _hosts[i], sharedQueue, sharedLock, sharedPageHash, sharedFormHash);
                CrawlerAgent[] agents = agentsByHost[i];

                _hosts[i].StartTime = DateTime.Now;
                _hosts[i].CrawlId = _crawlSetting.Id;

                DataLayer.Save(_hosts[i]);

                for (int j = 0; j < _threadsNumber; j++)
                {
                    agents[j] = new CrawlerAgent(sharedResource);
                    agents[j].CrawlAgentCompleted += agent_CrawlAgentCompleted;
                    agents[j].CrawlAnnounced += agent_CrawlAnnounced;
                    agents[j].CrawlAgentStarted += Crawler_CrawlAgentStarted;
                    agents[j].CrawlAsync();
                }
            }
        }