Example #1
0
 /// <summary>
 /// Worker entry point. Polls the command queue: a "start" message triggers
 /// <c>crawlRobots()</c>, a "stop" message ends the polling loop, after which
 /// the method blocks on <c>RunAsync</c>. The dashboard row is rewritten with
 /// status "idle" on every poll.
 /// </summary>
 /// <remarks>
 /// Messages other than "start"/"stop" are left on the queue untouched.
 /// </remarks>
 public override void Run()
 {
     Trace.TraceInformation("WorkerRole1 is running");
     try
     {
         getReference g     = new getReference();
         CloudQueue   cmd   = g.commandQueue();
         CloudTable   table = g.getTable();
         while (true)
         {
             CloudQueueMessage retrievedMessage = cmd.GetMessage();

             // Publish the "idle" status before acting on any command.
             crawledTable dashboard = new crawledTable("dash", null, null, null, null, null, "rowkey", 0, null, null, "idle");
             table.Execute(TableOperation.InsertOrReplace(dashboard));

             if (retrievedMessage == null)
             {
                 // Nothing to do: back off instead of spinning on the queue
                 // and the table with back-to-back requests.
                 Thread.Sleep(50);
                 continue;
             }

             string command = retrievedMessage.AsString;
             if (command.Equals("start"))
             {
                 // Remove the command first so it is not picked up again.
                 cmd.DeleteMessage(retrievedMessage);
                 crawlRobots();
             }
             else if (command.Equals("stop"))
             {
                 cmd.DeleteMessage(retrievedMessage);
                 break;
             }
         }
         Thread.Sleep(50);
         this.RunAsync(this.cancellationTokenSource.Token).Wait();
     }
     finally
     {
         // Always signal completion, even when the loop exits via an exception.
         this.runCompleteEvent.Set();
     }
 }
Example #2
0
        /// <summary>
        /// Downloads every sitemap listed in <paramref name="o"/>, follows nested
        /// sitemap links, and enqueues each non-sitemap URL found in a
        /// <c>&lt;loc&gt;</c> element.
        /// </summary>
        /// <param name="o">
        /// Sitemap URLs to start from. The list is extended in place with nested
        /// sitemap URLs (those containing "xml" and either "2015" or no "-20").
        /// </param>
        /// <returns>The distinct non-sitemap URLs that were found.</returns>
        private HashSet <String> getAllHtml(List <String> o)
        {
            List <String>    sitemaps = o;
            HashSet <String> htmlList = new HashSet <String>();
            getReference     g        = new getReference();
            CloudQueue       queue    = g.getQueue();
            CloudTable       table    = g.getTable();

            // Index-based on purpose: the list grows while it is being walked.
            for (int count = 0; count < sitemaps.Count; count++)
            {
                String html;
                // WebClient is IDisposable; release it as soon as the download ends.
                using (WebClient web = new WebClient())
                {
                    html = web.DownloadString(sitemaps[count]);
                }

                MatchCollection m1 = Regex.Matches(html, @"<loc>\s*(.+?)\s*</loc>", RegexOptions.Singleline);
                foreach (Match m in m1)
                {
                    String url = m.Groups[1].Value;
                    if (url.Contains("xml"))
                    {
                        // Skip sitemaps that are already scheduled so a sitemap that is
                        // listed twice (or references itself) is not downloaded again.
                        if ((url.Contains("2015") || !url.Contains("-20")) && !sitemaps.Contains(url))
                        {
                            sitemaps.Add(url);
                        }
                    }
                    else
                    {
                        crawledTable   dashboard = new crawledTable("dash", null, null, null, null, null, "rowkey", 0, null, null, "loading");
                        TableOperation insertOrReplaceOperation1 = TableOperation.InsertOrReplace(dashboard);
                        table.Execute(insertOrReplaceOperation1);

                        // Enqueue synchronously: a fire-and-forget AddMessageAsync hides
                        // failures and may still be in flight when this method returns.
                        CloudQueueMessage message = new CloudQueueMessage(url);
                        queue.AddMessage(message);
                        htmlList.Add(url);
                    }
                }
            }
            return(htmlList);
        }
Example #3
0
        /// <summary>
        /// Processes the URL queue until its approximate message count reaches
        /// zero. For each queued page it loads the HTML, stores an "index" row
        /// (title and creation date), rewrites the "dash" dashboard row, and
        /// enqueues every link that is new, allowed, and on the same site.
        /// </summary>
        /// <param name="duplicateList">URLs already queued; each newly queued link is added to it.</param>
        /// <param name="noRobots">Disallowed URL fragments; a link containing any of them is skipped.</param>
        /// <remarks>
        /// Failures while processing a page are recorded in the dashboard's error
        /// column and the offending message is removed from the queue so that it
        /// is not retried forever.
        /// </remarks>
        public void crawlerUrls(HashSet <String> duplicateList, List <String> noRobots)
        {
            List <String> lastten   = new List <String>();
            List <String> errorList = new List <String>();
            getReference  g         = new getReference();
            CloudQueue    queue     = g.getQueue();
            CloudTable    table     = g.getTable();
            // NOTE(review): not used below; the call is kept in case commandQueue()
            // has side effects - TODO confirm and remove.
            CloudQueue    cmd       = g.commandQueue();

            queue.FetchAttributes();
            var limitCount = queue.ApproximateMessageCount.Value;
            int tableSize  = 0;

            while (0 < limitCount)
            {
                CloudQueueMessage retrievedMessage = queue.GetMessage();
                String            pageUrl          = null;
                bool              messageDeleted   = false;
                try
                {
                    if (retrievedMessage != null)
                    {
                        pageUrl = retrievedMessage.AsString;

                        HtmlWeb      web      = new HtmlWeb();
                        HtmlDocument document = web.Load(pageUrl);
                        String       title    = "";
                        HtmlNode     node     = document.DocumentNode.SelectSingleNode("//title");
                        if (node != null)
                        {
                            title = node.InnerHtml;
                        }

                        HtmlNode dateNode = document.DocumentNode.SelectSingleNode("//meta[(@itemprop='dateCreated')]");
                        String   date     = "";
                        if (dateNode != null)
                        {
                            date = dateNode.GetAttributeValue("content", "");
                        }

                        // Keep a sliding window of the ten most recent URLs and always
                        // publish it, including while fewer than ten have been crawled.
                        lastten.Add(pageUrl);
                        if (lastten.Count > 10)
                        {
                            lastten.RemoveAt(0);
                        }
                        String tenUrls = String.Join(",", lastten);

                        queue.DeleteMessage(retrievedMessage);
                        messageDeleted = true;

                        String encodeUrl = EncodeUrlInKey(pageUrl);

                        float memory         = this.theMemCounter.NextValue();
                        float cpuUtilization = this.cpuload.NextValue();

                        tableSize++;

                        String errors = String.Join(",", errorList);

                        crawledTable   ct        = new crawledTable("index", pageUrl, title, date, null, null, encodeUrl, 0, null, null, null);
                        crawledTable   dashboard = new crawledTable("dash", pageUrl, title, date, errors, tenUrls, "rowkey", tableSize, memory.ToString(), cpuUtilization + "%", "crawling");
                        TableOperation insertOrReplaceOperation  = TableOperation.InsertOrReplace(ct);
                        TableOperation insertOrReplaceOperation1 = TableOperation.InsertOrReplace(dashboard);
                        table.Execute(insertOrReplaceOperation);
                        table.Execute(insertOrReplaceOperation1);

                        // Site root used to resolve relative links and to keep the
                        // crawl on the same site.
                        String root = "";

                        if (pageUrl.Contains("bleacher"))
                        {
                            root = "bleacherreport.com";
                        }
                        else if (pageUrl.Contains("cnn"))
                        {
                            root = "cnn.com";
                        }

                        var rows = document.DocumentNode.SelectNodes("//a[@href]");
                        if (rows != null && rows.Count > 0)
                        {
                            foreach (var link in rows)
                            {
                                String url = link.Attributes["href"].Value;
                                if (url.StartsWith("//"))
                                {
                                    // Protocol-relative link.
                                    url = "http:" + url;
                                }
                                else if (url.StartsWith("/"))
                                {
                                    // Site-relative link.
                                    url = "http://" + root + url;
                                }

                                Boolean isAllowed = true;
                                foreach (String disallowedUrl in noRobots)
                                {
                                    if (url.Contains(disallowedUrl))
                                    {
                                        isAllowed = false;
                                        break;
                                    }
                                }

                                if (!duplicateList.Contains(url) && isAllowed && (url.Contains(root + "/")))
                                {
                                    duplicateList.Add(url);
                                    // Enqueue synchronously so the message is counted by the
                                    // queue-size refresh below and failures are not lost.
                                    CloudQueueMessage message = new CloudQueueMessage(url);
                                    queue.AddMessage(message);
                                }
                            }
                        }
                    }
                }
                catch (Exception e)
                {
                    // Record the same text that is checked for duplicates.
                    string errorMessage = "Url: " + pageUrl + " problem is: " + e.Message;
                    if (!errorList.Contains(errorMessage))
                    {
                        errorList.Add(errorMessage);
                    }

                    // Drop the failing message; otherwise it becomes visible again
                    // and keeps this loop alive indefinitely.
                    if (retrievedMessage != null && !messageDeleted)
                    {
                        try
                        {
                            queue.DeleteMessage(retrievedMessage);
                        }
                        catch (Exception deleteError)
                        {
                            string deleteMessage = "Url: " + pageUrl + " problem is: " + deleteError.Message;
                            if (!errorList.Contains(deleteMessage))
                            {
                                errorList.Add(deleteMessage);
                            }
                        }
                    }
                }
                queue.FetchAttributes();
                limitCount = queue.ApproximateMessageCount.Value;
            }
        }
Example #4
0
        /// <summary>
        /// Processes the URL queue until its approximate message count reaches
        /// zero. For each queued page it loads the HTML, stores an "index" row,
        /// rewrites the "dash" dashboard row, and enqueues every link that is new,
        /// not listed in <paramref name="noRobots"/>, and on the same site.
        /// </summary>
        /// <param name="duplicateList">URLs already queued; each newly queued link is added to it.</param>
        /// <param name="noRobots">URLs that must not be queued (exact match).</param>
        /// <remarks>
        /// A page that fails with a <see cref="WebException"/> is removed from the
        /// queue and logged to the "error" row; other exceptions propagate.
        /// </remarks>
        public void crawlerUrls(HashSet <String> duplicateList, List <String> noRobots)
        {
            List <String> lastten = new List <String>();
            getReference  g       = new getReference();
            CloudQueue    queue   = g.getQueue();
            CloudTable    table   = g.getTable();
            // NOTE(review): not used below; the call is kept in case commandQueue()
            // has side effects - TODO confirm and remove.
            CloudQueue    cmd     = g.commandQueue();

            queue.FetchAttributes();
            var limitCount = queue.ApproximateMessageCount.Value;

            while (0 < limitCount)
            {
                CloudQueueMessage retrievedMessage = queue.GetMessage();
                try
                {
                    if (retrievedMessage != null)
                    {
                        String pageUrl = retrievedMessage.AsString;

                        HtmlWeb      web      = new HtmlWeb();
                        HtmlDocument document = web.Load(pageUrl);
                        String       title    = "";
                        HtmlNode     node     = document.DocumentNode.SelectSingleNode("//title");
                        if (node != null)
                        {
                            title = node.InnerHtml;
                        }

                        HtmlNode dateNode = document.DocumentNode.SelectSingleNode("//meta[(@itemprop='dateCreated')]");
                        String   date     = "";
                        if (dateNode != null)
                        {
                            date = dateNode.GetAttributeValue("content", "");
                        }

                        // Keep a sliding window of the ten most recent URLs and always
                        // publish it, including while fewer than ten have been crawled.
                        lastten.Add(pageUrl);
                        if (lastten.Count > 10)
                        {
                            lastten.RemoveAt(0);
                        }
                        String tenUrls = String.Join(",", lastten);

                        queue.DeleteMessage(retrievedMessage);

                        String encodeUrl = EncodeUrlInKey(pageUrl);

                        crawledTable   ct        = new crawledTable("index", pageUrl, null, date, null, null, encodeUrl, 0, null, null, null);
                        crawledTable   dashboard = new crawledTable("dash", pageUrl, title, date, "DASHBOARD", tenUrls, "rowkey", 0, null, null, null);
                        TableOperation insertOrReplaceOperation  = TableOperation.InsertOrReplace(ct);
                        TableOperation insertOrReplaceOperation1 = TableOperation.InsertOrReplace(dashboard);
                        table.Execute(insertOrReplaceOperation);
                        table.Execute(insertOrReplaceOperation1);

                        // Site root used to resolve relative links and to keep the
                        // crawl on the same site.
                        String root = "";

                        if (pageUrl.Contains("bleacher"))
                        {
                            root = "bleacherreport.com";
                        }
                        else if (pageUrl.Contains("cnn"))
                        {
                            root = "cnn.com";
                        }

                        var rows = document.DocumentNode.SelectNodes("//a[@href]");
                        if (rows != null && rows.Count > 0)
                        {
                            foreach (var link in rows)
                            {
                                String url = link.Attributes["href"].Value;
                                if (url.StartsWith("//"))
                                {
                                    // Protocol-relative link.
                                    url = "http:" + url;
                                }
                                else if (url.StartsWith("/"))
                                {
                                    // Site-relative link.
                                    url = "http://" + root + url;
                                }
                                if (!duplicateList.Contains(url) && !noRobots.Contains(url) && (url.Contains(root + "/")))
                                {
                                    duplicateList.Add(url);
                                    // Enqueue synchronously so the message is counted by the
                                    // queue-size refresh below and failures are not lost.
                                    CloudQueueMessage message = new CloudQueueMessage(url);
                                    queue.AddMessage(message);
                                }
                            }
                        }
                    }
                }
                catch (WebException e)
                {
                    // The page could not be fetched: drop the message and log the failure.
                    queue.DeleteMessage(retrievedMessage);
                    crawledTable   ct = new crawledTable("error", retrievedMessage.AsString, "No Title", "No date", e.Status.ToString(), null, "error url", 0, null, null, null);
                    TableOperation insertOrReplaceOperation = TableOperation.InsertOrReplace(ct);
                    table.Execute(insertOrReplaceOperation);
                }

                // Refresh on every iteration, including after a failure or an empty
                // GetMessage, so the loop can terminate once the queue is drained.
                queue.FetchAttributes();
                limitCount = queue.ApproximateMessageCount.Value;
            }
        }