Beispiel #1
0
        /// <summary>
        /// Loads the sitemap at <paramref name="link"/> and enqueues every listed page URL whose
        /// publication date is later than the hard-coded cutoff of 1 December 2017.
        /// </summary>
        /// <remarks>
        /// For each <c>url</c> entry the date is read from the first of these children that is
        /// present: the Google News element, the Google Video element (both via their
        /// <c>publication_date</c> child), or the sitemap <c>lastmod</c> element. Entries whose date
        /// is absent or cannot be parsed use a default date: 1 January 1000 (so they fall before the
        /// cutoff and are skipped), or today's date when the link contains "bleacherreport.com".
        /// Entries without a usable <c>loc</c> value are skipped. Exceptions raised while walking
        /// the entries are traced and not rethrown; exceptions from loading the document are not
        /// caught here.
        /// </remarks>
        /// <param name="link">Address of the sitemap XML document, passed to <c>XElement.Load</c>.</param>
        private void CrawlSiteMap(string link)
        {
            // Loading stays outside the try block so a sitemap that cannot be loaded still
            // surfaces to the caller as an exception.
            XElement whole = XElement.Load(link);

            string   selectLink           = "http://www.sitemaps.org/schemas/sitemap/0.9";
            XName    lastmod              = XName.Get("lastmod", "http://www.sitemaps.org/schemas/sitemap/0.9");
            XName    news                 = XName.Get("news", "http://www.google.com/schemas/sitemaps-news/0.9");
            XName    newsPublicationDate  = XName.Get("publication_date", "http://www.google.com/schemas/sitemaps-news/0.9");
            XName    video                = XName.Get("video", "http://www.google.com/schemas/sitemap-video/1.1");
            XName    videoPublicationDate = XName.Get("publication_date", "http://www.google.com/schemas/sitemap-video/1.1");
            DateTime givendate            = new DateTime(2017, 12, 1);
            DateTime defaultDate          = new DateTime(1000, 01, 01);

            if (link.Contains("bleacherreport.com"))
            {
                // These sitemaps are read with a different namespace for url/loc, and entries
                // that carry no date are treated as published today.
                selectLink  = "http://www.google.com/schemas/sitemap/0.9";
                defaultDate = DateTime.Today;
            }

            XName url = XName.Get("url", selectLink);
            XName loc = XName.Get("loc", selectLink);

            try
            {
                foreach (XElement urlElement in whole.Elements(url))
                {
                    XElement locNode = urlElement.Element(loc);
                    if (locNode == null || string.IsNullOrWhiteSpace(locNode.Value))
                    {
                        // A single malformed entry must not abort the rest of the sitemap.
                        continue;
                    }

                    string locElement = locNode.Value;

                    // Pick the element that carries the date, keeping the original priority:
                    // news, then video, then lastmod.
                    XElement newsNode = urlElement.Element(news);
                    XElement dateNode;
                    if (newsNode != null)
                    {
                        dateNode = newsNode.Element(newsPublicationDate);
                    }
                    else
                    {
                        XElement videoNode = urlElement.Element(video);
                        dateNode = videoNode != null
                            ? videoNode.Element(videoPublicationDate)
                            : urlElement.Element(lastmod);
                    }

                    // Start from the default on every iteration so an undated entry never
                    // inherits the date of the entry processed before it.
                    DateTime publishDate = defaultDate;
                    DateTime parsedDate;
                    if (dateNode != null &&
                        DateTime.TryParse(
                            dateNode.Value,
                            System.Globalization.CultureInfo.InvariantCulture,
                            System.Globalization.DateTimeStyles.None,
                            out parsedDate))
                    {
                        publishDate = parsedDate;
                    }

                    if (publishDate > givendate && !HtmlSet.Contains(locElement))
                    {
                        HtmlSet.Add(locElement);
                        StorageManager.HTMLQueue().AddMessage(new CloudQueueMessage(locElement));
                    }
                }
            }
            catch (Exception ex)
            {
                Trace.TraceInformation(ex.Message);
            }
        }
        /// <summary>
        /// Builds the dashboard status entity from the supplied state and counters, starting with
        /// an empty crawl history and the HTML queue's approximate message count.
        /// </summary>
        /// <param name="state">Value stored in <c>State</c>.</param>
        /// <param name="cpu">Value stored in <c>CPUCounter</c>.</param>
        /// <param name="mem">Value stored in <c>MemCounter</c>.</param>
        public Dashboard(string state, int cpu, int mem)
        {
            // The keys are constants, so every instance is created with the same
            // partition/row pair.
            PartitionKey = "Dashboard Status";
            RowKey = "Row";

            this.State = state;
            this.CPUCounter = cpu;
            this.MemCounter = mem;

            // Crawl statistics start out empty.
            this.NumCrawled = 0;
            this.Last10CrawledList = new List<string>();
            this.Last10Crawled = "";

            // The queue attributes are fetched before the approximate count is read.
            CloudQueue htmlQueue = StorageManager.HTMLQueue();
            htmlQueue.FetchAttributes();
            this.QueueSize = htmlQueue.ApproximateMessageCount.ToString();
        }
Beispiel #3
0
        /// <summary>
        /// Takes one URL from the HTML queue, downloads the page, stores one table row per word
        /// returned by <c>CleanUpTitle</c> for the page title, and enqueues every link found on
        /// the page that is not already in <c>HtmlSet</c>.
        /// </summary>
        /// <remarks>
        /// Returns without doing anything further when no message could be dequeued. URLs rejected
        /// by <c>IsDisallow</c> are not downloaded, but their queue message is still deleted.
        /// Exceptions raised while downloading or processing the page are caught and written to the
        /// error table instead of being rethrown.
        /// </remarks>
        public void GetHTMLData()
        {
            HtmlDocument webpage = new HtmlDocument();

            new Task(GetPerfCounters).Start();
            Thread.Sleep(100);
            CloudQueueMessage htmllink = StorageManager.HTMLQueue().GetMessage();

            if (htmllink == null)
            {
                // Nothing was dequeued, so there is nothing to crawl and nothing to delete.
                // The null test must come before AsString is read.
                return;
            }

            string url = htmllink.AsString;

            if (!IsDisallow(url))
            {
                using (var client = new WebClient())
                {
                    try
                    {
                        webpage.LoadHtml(client.DownloadString(url));
                        Uri currentUri = new Uri(url);

                        // Publication date: taken from <meta name="lastmod" content="...">,
                        // falling back to today when the tag, its content attribute, or a
                        // parsable date is missing.
                        DateTime      pubdlication = DateTime.Today;
                        HtmlNode      pubdateHTML  = webpage.DocumentNode.SelectSingleNode("//head/meta[@name='lastmod']");
                        HtmlAttribute pubdateAttr  = pubdateHTML != null ? pubdateHTML.Attributes["content"] : null;
                        DateTime      parsedDate;
                        if (pubdateAttr != null &&
                            DateTime.TryParse(
                                pubdateAttr.Value,
                                System.Globalization.CultureInfo.InvariantCulture,
                                System.Globalization.DateTimeStyles.None,
                                out parsedDate))
                        {
                            pubdlication = parsedDate;
                        }

                        UpdateRecentUrl(url);

                        // A page without a <title> is indexed with an empty title rather than
                        // failing, so its links are still followed.
                        HtmlNode titleNode = webpage.DocumentNode.SelectSingleNode("//head/title");
                        string   title     = (titleNode != null ? titleNode.InnerText : null) ?? "";
                        foreach (string word in CleanUpTitle(title))
                        {
                            TableOperation insertOp = TableOperation.InsertOrReplace(new PageEntity(word, title, url, pubdlication));
                            StorageManager.GetTable().Execute(insertOp);
                        }

                        HtmlNodeCollection href = webpage.DocumentNode.SelectNodes("//a[@href]");
                        if (href != null)
                        {
                            foreach (HtmlNode linkNode in href)
                            {
                                string templink = linkNode.Attributes["href"].Value;
                                if (templink.StartsWith("//", StringComparison.Ordinal))
                                {
                                    // Protocol-relative link: prefix a scheme.
                                    templink = "http:" + templink;
                                }
                                else if (templink.StartsWith("/", StringComparison.Ordinal))
                                {
                                    // Root-relative link: resolve against the current host.
                                    templink = "http://" + currentUri.Host + templink;
                                }

                                if (!HtmlSet.Contains(templink))
                                {
                                    HtmlSet.Add(templink);
                                    // NOTE(review): the returned task is discarded, so a failed
                                    // enqueue is not observed here — confirm this is intended.
                                    StorageManager.HTMLQueue().AddMessageAsync(new CloudQueueMessage(templink));
                                }
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        // NOTE(review): links are added to HtmlSet before they are enqueued above,
                        // so this guard may suppress the error record for most crawled URLs —
                        // confirm the intended de-duplication behaviour.
                        if (!HtmlSet.Contains(url))
                        {
                            HtmlSet.Add(url);
                            TableOperation insertOp = TableOperation.InsertOrReplace(new ErrorMessage(url, e.Message));
                            StorageManager.ErrorTable().Execute(insertOp);
                        }
                    }
                }
            }

            StorageManager.HTMLQueue().DeleteMessage(htmllink);
        }