コード例 #1
0
ファイル: crawler.cs プロジェクト: kyang126/info344homework3
        //get xml method

        /// <summary>
        /// Starts a crawl. Downloads robots.txt for the two hard-coded sites
        /// (bleacherreport.com and www.cnn.com), collects the sitemap URLs to follow and the
        /// disallowed URL prefixes, expands the sitemaps through <c>getAllHtml</c> and passes
        /// the resulting page set plus the disallow list to <c>crawlerUrls</c>.
        /// </summary>
        public void crawlRobots()
        {
            List<String> noRobots  = new List<String>();
            List<String> yesRobots = new List<String>();

            string[] robotSites =
            {
                "http://bleacherreport.com/robots.txt",
                "http://www.cnn.com/robots.txt"
            };

            // One client for both downloads; disposed as soon as robots.txt parsing is done.
            using (var wc = new WebClient())
            {
                foreach (String robot in robotSites)
                {
                    using (var sourceStream = wc.OpenRead(robot))
                    using (var reader = new StreamReader(sourceStream))
                    {
                        string line;
                        while ((line = reader.ReadLine()) != null)
                        {
                            if (line.Contains("User-Agent"))
                            {
                                continue;
                            }

                            if (line.Contains("Sitemap:"))
                            {
                                String sitemapUrl = line.Replace("Sitemap:", "").Trim();

                                // Bleacher Report: only sitemaps mentioning "nba" are followed.
                                // CNN: every sitemap is followed.
                                if (robot.Contains("bleacher") && sitemapUrl.Contains("nba"))
                                {
                                    yesRobots.Add(sitemapUrl);
                                }
                                else if (robot.Contains("cnn"))
                                {
                                    yesRobots.Add(sitemapUrl);
                                }
                            }

                            if (line.Contains("Disallow:"))
                            {
                                String path = line.Replace("Disallow:", "").Trim();

                                // An empty Disallow value means "nothing is disallowed". Adding the
                                // bare site root here would make the substring match in crawlerUrls
                                // reject every link on the site, so such lines are skipped.
                                if (path.Length == 0)
                                {
                                    continue;
                                }

                                // NOTE(review): CNN's robots.txt is fetched from www.cnn.com but its
                                // rules are prefixed with "http://cnn.com"; links that include "www."
                                // will not match this prefix in crawlerUrls - confirm this is intended.
                                String prefix = robot.Contains("bleacher")
                                    ? "http://bleacherreport.com"
                                    : "http://cnn.com";
                                noRobots.Add(prefix + path);
                            }
                        }
                    }
                }
            }

            HashSet<String> allHtml = getAllHtml(yesRobots);
            crawlerUrls(allHtml, noRobots);
        }
コード例 #2
0
ファイル: crawler.cs プロジェクト: kyang126/info344homework3
        /// <summary>
        /// Downloads each sitemap in <paramref name="o"/>, follows nested sitemap entries, and
        /// adds every non-XML <c>&lt;loc&gt;</c> URL to the crawl queue.
        /// </summary>
        /// <param name="o">Sitemap URLs to start from. The list itself is not modified.</param>
        /// <returns>The distinct page URLs that were added to the queue.</returns>
        private HashSet<String> getAllHtml(List<String> o)
        {
            HashSet<String> htmlList     = new HashSet<String>();
            HashSet<String> seenSitemaps = new HashSet<String>();
            Queue<String>   pending      = new Queue<String>();

            // Work on a private queue so the caller's list is left untouched, and remember
            // which sitemaps were already scheduled so mutually-referencing indexes terminate.
            foreach (String sitemap in o)
            {
                if (seenSitemaps.Add(sitemap))
                {
                    pending.Enqueue(sitemap);
                }
            }

            getReference g     = new getReference();
            CloudQueue   queue = g.getQueue();
            CloudTable   table = g.getTable();

            Boolean markedLoading = false;

            using (WebClient web = new WebClient())
            {
                while (pending.Count > 0)
                {
                    String          sitemapUrl = pending.Dequeue();
                    String          xml        = web.DownloadString(sitemapUrl);
                    MatchCollection locs       = Regex.Matches(xml, @"<loc>\s*(.+?)\s*</loc>", RegexOptions.Singleline);

                    foreach (Match loc in locs)
                    {
                        String url = loc.Groups[1].Value;

                        if (url.Contains("xml"))
                        {
                            // Nested sitemap: followed only if it mentions "2015" or carries no
                            // "-20" marker at all (hard-coded filter taken over unchanged).
                            Boolean wanted = url.Contains("2015") || !url.Contains("-20");
                            if (wanted && seenSitemaps.Add(url))
                            {
                                pending.Enqueue(url);
                            }
                            continue;
                        }

                        // The dashboard row is identical for every URL, so write it once
                        // instead of once per page.
                        if (!markedLoading)
                        {
                            crawledTable dashboard = new crawledTable("dash", null, null, null, null, null, "rowkey", 0, null, null, "loading");
                            table.Execute(TableOperation.InsertOrReplace(dashboard));
                            markedLoading = true;
                        }

                        // Queue each page only once, and synchronously: a fire-and-forget
                        // AddMessageAsync can still be in flight (or have failed unnoticed)
                        // when the caller starts reading the queue's message count.
                        if (htmlList.Add(url))
                        {
                            queue.AddMessage(new CloudQueueMessage(url));
                        }
                    }
                }
            }

            return htmlList;
        }
コード例 #3
0
ファイル: crawler.cs プロジェクト: kyang126/info344homework3
        /// <summary>
        /// Processes the crawl queue for as long as its approximate message count is above zero.
        /// For each dequeued URL the page is loaded, its title and <c>dateCreated</c> meta value
        /// are read, an "index" row and a "dash" (dashboard) row are written to the table, and
        /// every same-site link that is not yet known and not disallowed is queued.
        /// </summary>
        /// <param name="duplicateList">URLs already queued; newly queued links are added to it.</param>
        /// <param name="noRobots">Disallowed URL fragments; a link containing any of them is skipped.</param>
        public void crawlerUrls(HashSet<String> duplicateList, List<String> noRobots)
        {
            List<String> lastten   = new List<String>();
            List<String> errorList = new List<String>();
            getReference g         = new getReference();
            CloudQueue   queue     = g.getQueue();
            CloudTable   table     = g.getTable();
            // NOTE(review): cmd is never read in this method; the call is kept because
            // commandQueue() may have side effects that are not visible here - confirm and remove.
            CloudQueue   cmd       = g.commandQueue();

            queue.FetchAttributes();
            var limitCount = queue.ApproximateMessageCount.Value;
            int tableSize  = 0;

            while (0 < limitCount)
            {
                CloudQueueMessage retrievedMessage = queue.GetMessage();
                try
                {
                    if (retrievedMessage != null)
                    {
                        String pageUrl = retrievedMessage.AsString;

                        HtmlWeb      web      = new HtmlWeb();
                        HtmlDocument document = web.Load(pageUrl);

                        HtmlNode titleNode = document.DocumentNode.SelectSingleNode("//title");
                        String   title     = titleNode != null ? titleNode.InnerHtml : "";

                        HtmlNode dateNode = document.DocumentNode.SelectSingleNode("//meta[(@itemprop='dateCreated')]");
                        String   date     = dateNode != null ? dateNode.GetAttributeValue("content", "") : "";

                        // Keep a window of the ten most recent URLs and always report it, so the
                        // dashboard is populated from the first page instead of the eleventh.
                        lastten.Add(pageUrl);
                        if (lastten.Count > 10)
                        {
                            lastten.RemoveAt(0);
                        }
                        String tenUrls = String.Join(",", lastten);

                        // NOTE(review): the message is deleted before the table writes and link
                        // extraction, so a failure below drops the page, while a failure in
                        // web.Load above leaves the message in the queue - confirm this is intended.
                        queue.DeleteMessage(retrievedMessage);

                        String encodeUrl = EncodeUrlInKey(pageUrl);

                        float memory         = this.theMemCounter.NextValue();
                        float cpuUtilization = this.cpuload.NextValue();

                        tableSize++;

                        String errors = String.Join(",", errorList);

                        crawledTable ct        = new crawledTable("index", pageUrl, title, date, null, null, encodeUrl, 0, null, null, null);
                        crawledTable dashboard = new crawledTable("dash", pageUrl, title, date, errors, tenUrls, "rowkey", tableSize, memory.ToString(), cpuUtilization + "%", "crawling");
                        table.Execute(TableOperation.InsertOrReplace(ct));
                        table.Execute(TableOperation.InsertOrReplace(dashboard));

                        String root = "";
                        if (pageUrl.Contains("bleacher"))
                        {
                            root = "bleacherreport.com";
                        }
                        else if (pageUrl.Contains("cnn"))
                        {
                            root = "cnn.com";
                        }

                        // Without a known root, relative links would resolve to "http:///..." and
                        // the same-site test below would accept any URL containing "/", so links
                        // are only followed for pages of the two known sites.
                        var rows = root.Length > 0 ? document.DocumentNode.SelectNodes("//a[@href]") : null;
                        if (rows != null)
                        {
                            foreach (var link in rows)
                            {
                                String url = link.Attributes["href"].Value;
                                if (url.StartsWith("//"))
                                {
                                    url = "http:" + url;
                                }
                                else if (url.StartsWith("/"))
                                {
                                    url = "http://" + root + url;
                                }

                                // Cheap checks first; the robots scan only runs for new same-site links.
                                if (!url.Contains(root + "/") || duplicateList.Contains(url))
                                {
                                    continue;
                                }
                                if (isBlockedByRobots(url, noRobots))
                                {
                                    continue;
                                }

                                duplicateList.Add(url);
                                // Synchronous add: a fire-and-forget AddMessageAsync may not have
                                // completed when the message count is re-read below, which can end
                                // the crawl early, and its failures would go unobserved.
                                queue.AddMessage(new CloudQueueMessage(url));
                            }
                        }
                    }
                }
                catch (Exception e)
                {
                    // Best effort: record the failure for the dashboard and move on.
                    errorList.Add(e.Message);
                }
                queue.FetchAttributes();
                limitCount = queue.ApproximateMessageCount.Value;
            }
        }

        // Returns true when the URL contains any of the disallowed fragments.
        private static Boolean isBlockedByRobots(String url, List<String> noRobots)
        {
            foreach (String disallowedUrl in noRobots)
            {
                if (url.Contains(disallowedUrl))
                {
                    return true;
                }
            }
            return false;
        }