// Example no. 1 (score: 0)
        /// <summary>
        /// Worker-role main loop: polls the admin queue for state changes
        /// ("started"/"clearing"/...), and while "started" dequeues URLs,
        /// crawls them, and persists the resulting entities to table storage.
        /// Runs forever; the hosting role tears it down externally.
        /// </summary>
        public override void Run()
        {
            // Initializes state, counters and the visited-URL set (per the
            // original comment, visited urls live in a hash set).
            startingCode();

            while (true)
            {
                Trace.TraceInformation("Working");

                // Throttle the polling loop (50 ms between iterations).
                Thread.Sleep(50);

                // Check and handle admin messages.
                CloudQueueMessage startStopMessage = startStopQueue.GetMessage();
                if (startStopMessage != null)
                {
                    state = startStopMessage.AsString;
                    Update("info", "state", state);
                    startStopQueue.DeleteMessage(startStopMessage);
                    if (string.Equals(state, "started", StringComparison.Ordinal))
                    {
                        crawl = new Crawler();
                        crawl.StartLoader();
                    }
                }

                // string.Equals(state, ...) is null-safe, unlike state.Equals(...),
                // so an uninitialized state can no longer throw NullReferenceException.
                // Ordinal is the default comparison for string.Equals, so matching
                // behavior is unchanged.
                if (string.Equals(state, "clearing", StringComparison.Ordinal))
                {
                    // Drop all pending work and reset counters/visited set.
                    queue.Clear();
                    startingCode();
                }
                else if (string.Equals(state, "started", StringComparison.Ordinal)) //keepGoing
                {
                    // Get the next URL to crawl, if any.
                    CloudQueueMessage message = queue.GetMessage();
                    if (message != null)
                    {
                        urlsCrawled++;
                        Update("info", "total", urlsCrawled.ToString());
                        queue.FetchAttributes();
                        queueSize = (int)queue.ApproximateMessageCount;
                        Update("info", "queue", queueSize.ToString());
                        string url = message.AsString;
                        if (!acceptedURLs.Contains(url))
                        {
                            try
                            {
                                List<WebCrawlerEntity> entities = crawl.startCrawler(url); //Store dates
                                if (entities != null)
                                {
                                    numTitles++;
                                    Update("info", "numTitles", numTitles.ToString());
                                    string temp = "";
                                    foreach (WebCrawlerEntity w in entities)
                                    {
                                        temp += " " + w.PartitionKey;
                                        urlsAccepted++;
                                        w.num = urlsAccepted;
                                        Update("info", "accepted", urlsAccepted.ToString());
                                        TableOperation insertOperation = TableOperation.InsertOrReplace(w);
                                        // BUG FIX: was table.ExecuteAsync(...) with the returned
                                        // Task discarded (fire-and-forget: failures were silently
                                        // lost and the message could be deleted before the write
                                        // landed). Execute synchronously, matching errorTable below.
                                        table.Execute(insertOperation);
                                    }
                                    // Mark the URL visited once (was re-added per entity inside
                                    // the loop). Only mark when at least one entity was stored,
                                    // preserving the original "empty result stays unvisited"
                                    // behavior.
                                    if (entities.Count > 0)
                                    {
                                        acceptedURLs.Add(url);
                                    }
                                    TextInfo myTI = new CultureInfo("en-US", false).TextInfo;
                                    Update("info", "lastTitle", myTI.ToTitleCase(temp));
                                }
                            }
                            catch (Exception e)
                            { //put errors in error table
                                numErrors++;
                                infoEntity     newError             = new infoEntity(numErrors.ToString(), "url: " + url + " Error: " + e.Message);
                                TableOperation insertErrorOperation = TableOperation.InsertOrReplace(newError);
                                errorTable.Execute(insertErrorOperation);
                            }
                        }
                        // Delete the message regardless of outcome so a poison URL
                        // cannot wedge the queue.
                        queue.DeleteMessage(message);
                    }
                }
            }
        }
// Example no. 2 (score: 0)
        /// <summary>
        /// Worker-role main loop: polls the admin queue for state changes
        /// ("started"/"clearing"/...), and while "started" dequeues URLs,
        /// crawls them, and persists the resulting entities to table storage.
        /// Runs forever; the hosting role tears it down externally.
        /// </summary>
        public override void Run()
        {
            // Initializes state, counters and the visited-URL set (per the
            // original comment, visited urls live in a hash set).
            startingCode();

            while (true)
            {
                Trace.TraceInformation("Working");

                // Throttle the polling loop (50 ms between iterations).
                Thread.Sleep(50);

                // Check and handle admin messages.
                CloudQueueMessage startStopMessage = startStopQueue.GetMessage();
                if (startStopMessage != null)
                {
                    state = startStopMessage.AsString;
                    Update("info", "state", state);
                    startStopQueue.DeleteMessage(startStopMessage);
                    if (string.Equals(state, "started", StringComparison.Ordinal))
                    {
                        crawl = new Crawler();
                        crawl.StartLoader();
                    }
                }

                // string.Equals(state, ...) is null-safe, unlike state.Equals(...),
                // so an uninitialized state can no longer throw NullReferenceException.
                // Ordinal is the default comparison for string.Equals, so matching
                // behavior is unchanged.
                if (string.Equals(state, "clearing", StringComparison.Ordinal))
                {
                    // Drop all pending work and reset counters/visited set.
                    queue.Clear();
                    startingCode();
                }
                else if (string.Equals(state, "started", StringComparison.Ordinal)) //keepGoing
                {
                    // Get the next URL to crawl, if any.
                    CloudQueueMessage message = queue.GetMessage();
                    if (message != null)
                    {
                        urlsCrawled++;
                        Update("info", "total", urlsCrawled.ToString());
                        queue.FetchAttributes();
                        queueSize = (int)queue.ApproximateMessageCount;
                        Update("info", "queue", queueSize.ToString());
                        string url = message.AsString;
                        if (!acceptedURLs.Contains(url))
                        {
                            try
                            {
                                List<WebCrawlerEntity> entities = crawl.startCrawler(url); //Store dates
                                if (entities != null)
                                {
                                    numTitles++;
                                    Update("info", "numTitles", numTitles.ToString());
                                    string temp = "";
                                    foreach (WebCrawlerEntity w in entities)
                                    {
                                        temp += " " + w.PartitionKey;
                                        urlsAccepted++;
                                        w.num = urlsAccepted;
                                        Update("info", "accepted", urlsAccepted.ToString());
                                        TableOperation insertOperation = TableOperation.InsertOrReplace(w);
                                        // BUG FIX: was table.ExecuteAsync(...) with the returned
                                        // Task discarded (fire-and-forget: failures were silently
                                        // lost and the message could be deleted before the write
                                        // landed). Execute synchronously, matching errorTable below.
                                        table.Execute(insertOperation);
                                    }
                                    // Mark the URL visited once (was re-added per entity inside
                                    // the loop). Only mark when at least one entity was stored,
                                    // preserving the original "empty result stays unvisited"
                                    // behavior.
                                    if (entities.Count > 0)
                                    {
                                        acceptedURLs.Add(url);
                                    }
                                    TextInfo myTI = new CultureInfo("en-US", false).TextInfo;
                                    Update("info", "lastTitle", myTI.ToTitleCase(temp));
                                }
                            }
                            catch (Exception e)
                            { //put errors in error table
                                numErrors++;
                                infoEntity newError = new infoEntity(numErrors.ToString(), "url: " + url + " Error: " + e.Message);
                                TableOperation insertErrorOperation = TableOperation.InsertOrReplace(newError);
                                errorTable.Execute(insertErrorOperation);
                            }
                        }
                        // Delete the message regardless of outcome so a poison URL
                        // cannot wedge the queue.
                        queue.DeleteMessage(message);
                    }
                }
            }
        }