Example #1
        // Refresh storage and helpers.
        private void InitializeCrawlrComponents()
        {
            this._storageManager
                = new CrawlrStorageManager(ConfigurationManager.AppSettings["StorageConnectionString"]);
            this._crawlrData    = new CrawlrDataHelper();
            this._statusManager = new CrawlrStatusManager();
        }
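
CrawlrStorageManager's constructor is not shown in these examples. Below is a minimal sketch of what it plausibly does, assuming the classic Microsoft.WindowsAzure.Storage SDK; the table and queue names are hypothetical, only the property names are taken from the surrounding code:

    using Microsoft.WindowsAzure.Storage;
    using Microsoft.WindowsAzure.Storage.Queue;
    using Microsoft.WindowsAzure.Storage.Table;

    public class CrawlrStorageManager
    {
        public CloudTable UrlTable    { get; private set; }
        public CloudTable StatusTable { get; private set; }
        public CloudTable ErrorTable  { get; private set; }
        public CloudQueue UrlQueue    { get; private set; }
        public CloudQueue XmlQueue    { get; private set; }

        public CrawlrStorageManager(string connectionString)
        {
            // Parse the connection string pulled from App.config ("StorageConnectionString").
            CloudStorageAccount account     = CloudStorageAccount.Parse(connectionString);
            CloudTableClient    tableClient = account.CreateCloudTableClient();
            CloudQueueClient    queueClient = account.CreateCloudQueueClient();

            // Table/queue names are guesses; the real ones may differ.
            UrlTable    = tableClient.GetTableReference("urltable");
            StatusTable = tableClient.GetTableReference("statustable");
            ErrorTable  = tableClient.GetTableReference("errortable");
            UrlQueue    = queueClient.GetQueueReference("urlqueue");
            XmlQueue    = queueClient.GetQueueReference("xmlqueue");

            UrlTable.CreateIfNotExists();
            StatusTable.CreateIfNotExists();
            ErrorTable.CreateIfNotExists();
            UrlQueue.CreateIfNotExists();
            XmlQueue.CreateIfNotExists();
        }
    }
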
        public void UpdateCrawlrStatus(string currentStatus, CrawlrDataHelper data, CrawlrStorageManager storage)
        {
            WorkerRoleStatus currStatusEntity;

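            // "CLEAR" resets the dashboard entry: zeroed count and an empty recent-URL queue.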
            if (currentStatus == "CLEAR")
            {
                currStatusEntity = new WorkerRoleStatus(
                    currentStatus,
                    (int)_cpuTime.NextValue(),
                    (int)_memoryFree.NextValue(),
                    0,
                    new System.Collections.Generic.Queue<string>()
                    );
            }
            else
            {
                currStatusEntity = new WorkerRoleStatus(
                    currentStatus,
                    (int)_cpuTime.NextValue(),
                    (int)_memoryFree.NextValue(),
                    data.NumUrlsCrawled,
                    data.LastTenUrls
                    );
            }

            TableOperation insertStatus = TableOperation.InsertOrReplace(currStatusEntity);

            storage.StatusTable.Execute(insertStatus);
        }
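
WorkerRoleStatus is not shown either. Because it is handed to TableOperation.InsertOrReplace, it must be a table entity; here is a rough sketch under that assumption. Property names and the key scheme are guesses, and the queue of recent URLs has to be flattened to a string, since table storage has no collection column type:

    using System.Collections.Generic;
    using Microsoft.WindowsAzure.Storage.Table;

    public class WorkerRoleStatus : TableEntity
    {
        public WorkerRoleStatus() { }   // Required by the table storage serializer.

        public WorkerRoleStatus(string status, int cpu, int memFree,
                                int numUrlsCrawled, Queue<string> lastTenUrls)
        {
            PartitionKey   = "status";
            RowKey         = "workerrole";   // Single fixed row: InsertOrReplace overwrites it each update.
            Status         = status;
            Cpu            = cpu;
            MemoryFree     = memFree;
            NumUrlsCrawled = numUrlsCrawled;
            LastTenUrls    = string.Join(",", lastTenUrls);
        }

        public string Status         { get; set; }
        public int    Cpu            { get; set; }
        public int    MemoryFree     { get; set; }
        public int    NumUrlsCrawled { get; set; }
        public string LastTenUrls    { get; set; }
    }
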
Example #3
        /// <summary>
        ///     Checks to see if a given URL has already been queued/parsed; if not, adds to queue.
        /// </summary>
        /// <param name="currHref">
        ///     URL to check.
        /// </param>
        /// <param name="currUri">
        ///     URI of the page where the link was found; used to resolve relative links.
        /// </param>
        /// <param name="urlLastMod">
        ///     URL's lastmod/published date, if known. Nullable.
        /// </param>
        /// <param name="data">
        ///     Crawler data helper. Ref.
        /// </param>
        /// <param name="storage">
        ///     Crawler azure storage helper. Ref.
        /// </param>
        public static void ChkAndAddUrl(string currHref, string currUri,
                                        DateTime? urlLastMod, ref CrawlrDataHelper data, ref CrawlrStorageManager storage)
        {
            bool   validDateIfExists = true;
            string domain            = new Uri(currUri).Host;

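            // Normalize protocol-relative ("//...") and root-relative ("/...") links to absolute http URLs.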
            if (currHref.StartsWith(@"//"))
            {
                currHref = @"http:" + currHref;
            }
            else if (currHref.StartsWith(@"/"))
            {
                currHref = @"http://" + domain + currHref;
            }
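            // A known lastmod/publish date older than ~2 months (62 days) marks the URL as stale.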
            if (urlLastMod != null)
            {
                validDateIfExists = (urlLastMod >= DateTime.Now - TimeSpan.FromDays(62));
            }
            if (IsInProperDomain(currHref) &&
                !data.QueuedUrls.Contains(currHref) &&
                !data.AddedUrls.Contains(currHref) &&
                validDateIfExists)
            {
                CloudQueueMessage urlMsg = new CloudQueueMessage(currHref);
                storage.UrlQueue.AddMessage(urlMsg);
                data.QueuedUrls.Add(currHref);
                data.NumUrlsQueued++;
            }
        }
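
IsInProperDomain is referenced above but never defined in these examples. Judging from the cnn.com/bleacherreport.com focus elsewhere in the code, it presumably whitelists hosts; a hypothetical sketch:

    private static bool IsInProperDomain(string url)
    {
        Uri uri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out uri))
        {
            return false;   // Unparseable links are rejected outright.
        }
        string host = uri.Host.ToLower();
        // Hypothetical whitelist; the real check may differ.
        return host.EndsWith("cnn.com") || host.EndsWith("bleacherreport.com");
    }
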
        /// <summary>
        ///     Crawls a given robots.txt, adding all sitemaps to queue.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper. Ref.
        /// </param>
        /// <param name="storage">
        ///     Crawler azure storage helper. Ref.
        /// </param>
        public static void CrawlRobotsTxt(ref CrawlrDataHelper data, ref CrawlrStorageManager storage)
        {
            string url = storage.GetCurrentRobotsTxt();

            CrawlSpecificRobotsTxt(url, ref data, ref storage);

            // Include bleacherreport.com (formerly cnn.com/sports) if crawling cnn.com
            if (storage.GetCurrentRobotsTxt().Contains("cnn"))
            {
                CrawlSpecificRobotsTxt("http://www.bleacherreport.com/robots.txt", ref data, ref storage);
            }
        }
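
For context, a worker's main loop would presumably seed itself from robots.txt and then drain the sitemap queue roughly like this (a sketch only; message visibility timeouts and back-off between polls are omitted):

    var data    = new CrawlrDataHelper();
    var storage = new CrawlrStorageManager(connectionString);   // connection string as in Example #1

    CrawlRobotsTxt(ref data, ref storage);

    CloudQueueMessage xmlMsg;
    while ((xmlMsg = storage.XmlQueue.GetMessage()) != null)
    {
        CrawlXml(ref data, ref storage, xmlMsg.AsString);
        storage.XmlQueue.DeleteMessage(xmlMsg);
    }
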
Example #5
        private static void CrawlSpecificRobotsTxt(string url, ref CrawlrDataHelper data, ref CrawlrStorageManager storage)
        {
            string tempPath = Path.GetTempFileName();

            // Download robots.txt to a temp file, disposing the client when done.
            using (WebClient wc = new WebClient())
            {
                wc.DownloadFile(url, tempPath);
            }

            using (StreamReader input = new StreamReader(tempPath))
            {
                string currLine      = "";
                string currUserAgent = "";

                while ((currLine = input.ReadLine()) != null)
                {
                    var splitLine = currLine.Split(' ');
                    if (splitLine.Length < 2)
                    {
                        // Skip blank lines and bare directives (e.g. "Disallow:" with no value).
                        continue;
                    }
                    if (splitLine[0].ToLower() == "sitemap:")
                    {
                        bool pass = false;
                        if (url.Contains("bleacherreport"))
                        {
                            // Only the NBA sitemaps are wanted from bleacherreport.com.
                            if (splitLine[1].Contains("nba"))
                            {
                                pass = true;
                            }
                        }
                        else
                        {
                            pass = true;
                        }
                        if (pass)
                        {
                            data.QueuedXmls.Add(splitLine[1]);
                            CloudQueueMessage msg = new CloudQueueMessage(splitLine[1]);
                            storage.XmlQueue.AddMessage(msg);
                            data.NumXmlsQueued++;
                        }
                    }
                    else if (splitLine[0].ToLower() == "user-agent:")
                    {
                        currUserAgent = splitLine[1];
                    }
                    else if (splitLine[0].ToLower() == "disallow:" && currUserAgent == "*")
                    {
                        data.DisallowedStrings.Add(splitLine[1]);
                    }
                }
            }
        }
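
The parser above leans on the conventional "Directive: value" layout of robots.txt. A self-contained illustration of that split logic against an inlined sample file (sample content invented for the demo):

    using System;

    class RobotsParseDemo
    {
        static void Main()
        {
            string robotsTxt =
                "User-agent: *\n" +
                "Disallow: /private/\n" +
                "Sitemap: http://www.example.com/sitemap.xml";

            foreach (string line in robotsTxt.Split('\n'))
            {
                var parts = line.Split(' ');
                if (parts.Length < 2) continue;
                // Directive name (with trailing colon) lands in parts[0], its value in parts[1].
                Console.WriteLine(parts[0].ToLower() + " -> " + parts[1]);
            }
        }
    }
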
Example #6
        /// <summary>
        ///     Crawls a given XML document (sitemap) by URI, checking each found URI's date if possible
        ///     and filtering out links that are disallowed or older than the cutoff date (1 Jan 2018 here).
        ///     Places found and valid XMLs/URLs into a relevant queue for crawling.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper. Ref.
        /// </param>
        /// <param name="storage">
        ///     Crawler azure storage helper. Ref.
        /// </param>
        /// <param name="xml">
        ///     URI to XML document (sitemap) to crawl.
        /// </param>
        public static void CrawlXml(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string xml)
        {
            if (data.ChkIfUriAllowed(xml))
            {
                if (xml.EndsWith(".xml"))
                {
                    XElement sitemap       = XElement.Load(xml);
                    string   sitemapType   = sitemap.Name.LocalName;
                    string   nameSpace     = sitemap.GetDefaultNamespace().ToString();
                    string   dateNameSpace = null;
                    XName    dateParent    = null;
                    XName    date          = null;
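                    // News and video sitemaps carry publication dates in their own namespaces
                    // (<news:news>/<news:publication_date> etc.), so resolve those XNames up front.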
                    if (sitemap.ToString().Contains(@"xmlns:news"))
                    {
                        dateNameSpace = sitemap.GetNamespaceOfPrefix("news").ToString();
                        dateParent    = XName.Get("news", dateNameSpace);
                        date          = XName.Get("publication_date", dateNameSpace);
                    }
                    else if (sitemap.ToString().Contains(@"xmlns:video"))
                    {
                        dateNameSpace = sitemap.GetNamespaceOfPrefix("video").ToString();
                        dateParent    = XName.Get("video", dateNameSpace);
                        date          = XName.Get("publication_date", dateNameSpace);
                    }
                    XName loc     = XName.Get("loc", nameSpace);
                    XName lastMod = XName.Get("lastmod", nameSpace);
                    XName elementSelector;
                    if (sitemapType == "sitemapindex")
                    {
                        elementSelector = XName.Get("sitemap", nameSpace);
                    }
                    else
                    {
                        elementSelector = XName.Get("url", nameSpace);
                    }

                    List<string> xmlsToQueue = new List<string>();
                    List<string> urlsToQueue = new List<string>();

                    foreach (var element in sitemap.Elements(elementSelector))
                    {
                        bool   validDateIfExists  = true;
                        var    currLocElement     = element.Element(loc);
                        string currLocValue       = currLocElement.Value;
                        var    currLastModElement = element.Element(lastMod);
                        if (currLastModElement == null)
                        {
                            currLastModElement = element.Element(dateParent);
                            currLastModElement = (currLastModElement == null ? null : currLastModElement.Element(date));
                        }
                        if (currLastModElement != null)
                        {
                            validDateIfExists = DateTime.Parse(currLastModElement.Value) >= new DateTime(2018, 1, 1);
                        }
                        if (currLocValue.Contains(".xml"))
                        {
                            if (!data.QueuedXmls.Contains(currLocValue) &&
                                validDateIfExists)
                            {
                                xmlsToQueue.Add(currLocValue);
                            }
                        }
                        else
                        {
                            if (!data.QueuedUrls.Contains(currLocValue) &&
                                validDateIfExists)
                            {
                                urlsToQueue.Add(currLocValue);
                            }
                        }
                    }

                    foreach (string newXml in xmlsToQueue)
                    {
                        data.QueuedXmls.Add(newXml);
                        data.NumXmlsQueued++;
                        data.XmlQueue.Enqueue(newXml);
                    }

                    foreach (string newUrl in urlsToQueue)
                    {
                        UrlCrawlr.ChkAndAddUrl(newUrl, xml, null, ref data, ref storage);
                    }
                }
            }
        }
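
To make the namespace handling concrete, here is a self-contained run over a tiny inline Google News sitemap. The namespace URIs are the real sitemap/news ones; the document content is invented for the demo:

    using System;
    using System.Xml.Linq;

    class SitemapDemo
    {
        static void Main()
        {
            string xmlText =
                @"<urlset xmlns=""http://www.sitemaps.org/schemas/sitemap/0.9""
                          xmlns:news=""http://www.google.com/schemas/sitemap-news/0.9"">
                    <url>
                      <loc>http://www.example.com/story.html</loc>
                      <news:news>
                        <news:publication_date>2018-05-01</news:publication_date>
                      </news:news>
                    </url>
                  </urlset>";

            XElement sitemap  = XElement.Parse(xmlText);
            string   ns       = sitemap.GetDefaultNamespace().ToString();
            string   newsNs   = sitemap.GetNamespaceOfPrefix("news").ToString();
            XName    loc      = XName.Get("loc", ns);
            XName    newsElem = XName.Get("news", newsNs);
            XName    pubDate  = XName.Get("publication_date", newsNs);

            // Mirrors CrawlXml: <loc> sits directly under <url>; the date is nested
            // one level down, under the namespaced <news:news> element.
            foreach (var url in sitemap.Elements(XName.Get("url", ns)))
            {
                Console.WriteLine(url.Element(loc).Value);                       // .../story.html
                Console.WriteLine(url.Element(newsElem).Element(pubDate).Value); // 2018-05-01
            }
        }
    }
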
Example #7
        /// <summary>
        ///     Crawls a given XML document (sitemap) by URI, checking each found URI's date if possible
        ///     and filtering out links disallowed or older than 2 months. Places found and valid XMLs/URLs
        ///     into the relevant Azure queue for processing.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper. Ref.
        /// </param>
        /// <param name="storage">
        ///     Crawler azure storage helper. Ref.
        /// </param>
        /// <param name="xml">
        ///     URI to XML document (sitemap) to crawl.
        /// </param>
        public static void CrawlXml(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string xml)
        {
            if (data.ChkIfUriAllowed(xml))
            {
                try
                {
                    if (xml.EndsWith(".xml"))
                    {
                        XElement sitemap       = XElement.Load(xml);
                        string   sitemapType   = sitemap.Name.LocalName;
                        string   nameSpace     = sitemap.GetDefaultNamespace().ToString();
                        string   dateNameSpace = null;
                        XName    dateParent    = null;
                        XName    date          = null;
                        if (sitemap.ToString().Contains(@"xmlns:news"))
                        {
                            dateNameSpace = sitemap.GetNamespaceOfPrefix("news").ToString();
                            dateParent    = XName.Get("news", dateNameSpace);
                            date          = XName.Get("publication_date", dateNameSpace);
                        }
                        else if (sitemap.ToString().Contains(@"xmlns:video"))
                        {
                            dateNameSpace = sitemap.GetNamespaceOfPrefix("video").ToString();
                            dateParent    = XName.Get("video", dateNameSpace);
                            date          = XName.Get("publication_date", dateNameSpace);
                        }
                        XName loc     = XName.Get("loc", nameSpace);
                        XName lastMod = XName.Get("lastmod", nameSpace);
                        XName elementSelector;
                        if (sitemapType == "sitemapindex")
                        {
                            elementSelector = XName.Get("sitemap", nameSpace);
                        }
                        else
                        {
                            elementSelector = XName.Get("url", nameSpace);
                        }

                        List<string> xmlsToQueue = new List<string>();
                        List<string> urlsToQueue = new List<string>();

                        foreach (var element in sitemap.Elements(elementSelector))
                        {
                            bool   validDateIfExists  = true;
                            var    currLocElement     = element.Element(loc);
                            string currLocValue       = currLocElement.Value;
                            var    currLastModElement = element.Element(lastMod);
                            if (currLastModElement == null)
                            {
                                currLastModElement = element.Element(dateParent);
                                currLastModElement = (currLastModElement == null ? null : currLastModElement.Element(date));
                            }
                            if (currLastModElement != null)
                            {
                                validDateIfExists = DateTime.Parse(currLastModElement.Value) >= DateTime.Now - TimeSpan.FromDays(62);
                            }
                            if (currLocValue.Contains(".xml"))
                            {
                                if (!data.QueuedXmls.Contains(currLocValue) &&
                                    validDateIfExists)
                                {
                                    xmlsToQueue.Add(currLocValue);
                                }
                            }
                            else
                            {
                                if (!data.QueuedUrls.Contains(currLocValue) &&
                                    validDateIfExists)
                                {
                                    urlsToQueue.Add(currLocValue);
                                }
                            }
                        }

                        foreach (string newXml in xmlsToQueue)
                        {
                            CloudQueueMessage msg = new CloudQueueMessage(newXml);
                            storage.XmlQueue.AddMessage(msg);
                            data.QueuedXmls.Add(newXml);
                            data.NumXmlsQueued++;
                        }

                        foreach (string newUrl in urlsToQueue)
                        {
                            UrlCrawlr.ChkAndAddUrl(newUrl, xml, null, ref data, ref storage);
                        }
                    }
                }
                catch (Exception ex)
                {
                    ErrorUrl       errorUrl       = new ErrorUrl(xml, ex.ToString());
                    TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
                    storage.ErrorTable.Execute(insertErrorUrl);
                }
            }
        }
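
ErrorUrl is another entity type not shown in these examples. A plausible sketch, again assuming a TableEntity; the key scheme is a guess (Azure table keys cannot contain '/', so the raw URL must be escaped or hashed before use as a RowKey):

    using System;
    using Microsoft.WindowsAzure.Storage.Table;

    public class ErrorUrl : TableEntity
    {
        public ErrorUrl() { }   // Required by the serializer.

        public ErrorUrl(string url, string exception)
        {
            PartitionKey = "error";
            // '/' and '\' are illegal in table keys, so key on an escaped form of the URL.
            RowKey       = Uri.EscapeDataString(url);
            Url          = url;
            Exception    = exception;
        }

        public string Url       { get; set; }
        public string Exception { get; set; }
    }
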
Example #8
        /// <summary>
        ///     Crawls a given URL, queueing all found URLs and storing information about
        ///     the given URL for later querying.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper. Ref.
        /// </param>
        /// <param name="storage">
        ///     Crawler azure storage helper. Ref.
        /// </param>
        /// <param name="url">
        ///     The given URL to crawl.
        /// </param>
        public static void CrawlUrl(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string url)
        {
            if (data.ChkIfUriAllowed(url))
            {
                // Unsure if necessary.
                //if (!url.Contains(".htm"))
                //{
                //    if (!url.Contains(".jpg") && !url.Contains(".png"))
                //    {
                //        if (url.EndsWith(@"/"))
                //        {
                //            url += "index.html";
                //        }
                //        else
                //        {
                //            url += @"/index.html";
                //        }
                //    }
                //}
                try
                {
                    var web      = new HtmlWeb();
                    var currDoc  = web.Load(url);
                    var urlNodes = currDoc.DocumentNode.Descendants("a")
                                   .ToList();
                    var urlPageTitle = currDoc.DocumentNode.Descendants("title")
                                       .First()
                                       .InnerText;
                    var urlLastModNode = currDoc.DocumentNode.Descendants("meta")
                                         .Where(y => y.Attributes.Contains("name"))
                                         .Where(y => y.Attributes["name"].Value == "pubdate")
                                         .ToList();

                    DateTime? urlLastMod = null;
                    if (urlLastModNode.Count > 0)
                    {
                        urlLastMod = DateTime.Parse(
                            urlLastModNode.First().Attributes["content"].Value);
                    }

                    List<string> urlsToQueue = new List<string>();

                    foreach (var urlNode in urlNodes)
                    {
                        if (urlNode.Attributes.Contains("href"))
                        {
                            urlsToQueue.Add(urlNode.Attributes["href"].Value);
                        }
                    }

                    foreach (string newUrl in urlsToQueue)
                    {
                        ChkAndAddUrl(newUrl, url, urlLastMod, ref data, ref storage);
                    }

                    if (!data.AddedUrls.Contains(url))
                    {
                        data.AddedUrls.Add(url);
                        data.NumUrlsIndexed++;
                    }
                    data.NumUrlsCrawled++;
                    FoundUrl       finishedUrl = new FoundUrl(urlPageTitle, (urlLastMod != null ? urlLastMod.ToString() : "NULL"), url);
                    UrlTableCount  newCount    = new UrlTableCount(data.NumUrlsCrawled, data.NumUrlsIndexed);
                    TableOperation insertUrl   = TableOperation.InsertOrReplace(finishedUrl);
                    TableOperation insertCount = TableOperation.InsertOrReplace(newCount);
                    storage.UrlTable.Execute(insertUrl);
                    storage.UrlTable.Execute(insertCount);
                    if (data.LastTenUrls.Count == 10)
                    {
                        data.LastTenUrls.Dequeue();
                    }
                    data.LastTenUrls.Enqueue(url);
                }
                catch (Exception ex)
                {
                    ErrorUrl       errorUrl       = new ErrorUrl(url, ex.ToString());
                    TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
                    storage.ErrorTable.Execute(insertErrorUrl);
                }
            }
        }
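
The parsing here relies on HtmlAgilityPack (HtmlWeb, Descendants, Attributes). Below is a self-contained illustration of the same selectors against an inline document, so they can be seen working without touching the network:

    using System;
    using System.Linq;
    using HtmlAgilityPack;

    class ExtractDemo
    {
        static void Main()
        {
            string html =
                "<html><head><title>Demo Page</title>" +
                "<meta name=\"pubdate\" content=\"2018-05-01\"></head>" +
                "<body><a href=\"/relative\">x</a><a>no href</a></body></html>";

            var doc = new HtmlDocument();
            doc.LoadHtml(html);

            // Same shape as CrawlUrl: title text, pubdate meta, and hrefs of anchor tags.
            string title = doc.DocumentNode.Descendants("title").First().InnerText;
            var pubdates = doc.DocumentNode.Descendants("meta")
                              .Where(m => m.Attributes.Contains("name"))
                              .Where(m => m.Attributes["name"].Value == "pubdate")
                              .ToList();
            var hrefs = doc.DocumentNode.Descendants("a")
                           .Where(a => a.Attributes.Contains("href"))
                           .Select(a => a.Attributes["href"].Value);

            Console.WriteLine(title);                                        // Demo Page
            Console.WriteLine(pubdates.First().Attributes["content"].Value); // 2018-05-01
            Console.WriteLine(string.Join(", ", hrefs));                     // /relative
        }
    }
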
Example #9
        /// <summary>
        ///     Crawls a given URL, queueing all found URLs and storing information about
        ///     the given URL for later querying.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper. Ref.
        /// </param>
        /// <param name="storage">
        ///     Crawler azure storage helper. Ref.
        /// </param>
        /// <param name="url">
        ///     The given URL to crawl.
        /// </param>
        public static void CrawlUrl(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string url)
        {
            if (data.ChkIfUriAllowed(url))
            {
                // Unsure if necessary.
                //if (!url.Contains(".htm"))
                //{
                //    if (!url.Contains(".jpg") && !url.Contains(".png"))
                //    {
                //        if (url.EndsWith(@"/"))
                //        {
                //            url += "index.html";
                //        }
                //        else
                //        {
                //            url += @"/index.html";
                //        }
                //    }
                //}

                var web      = new HtmlWeb();
                var currDoc  = web.Load(url);
                var urlNodes = currDoc.DocumentNode.Descendants("a")
                               .ToList();
                var urlPageTitle = currDoc.DocumentNode.Descendants("title")
                                   .First()
                                   .InnerText;
                var urlLastModNode = currDoc.DocumentNode.Descendants("meta")
                                     .Where(y => y.Attributes.Contains("name"))
                                     .Where(y => y.Attributes["name"].Value == "pubdate")
                                     .ToList();

                DateTime? urlLastMod = null;
                if (urlLastModNode.Count > 0)
                {
                    urlLastMod = DateTime.Parse(
                        urlLastModNode.First().Attributes["content"].Value);
                }

                List<string> urlsToQueue = new List<string>();

                foreach (var urlNode in urlNodes)
                {
                    if (urlNode.Attributes.Contains("href"))
                    {
                        urlsToQueue.Add(urlNode.Attributes["href"].Value);
                    }
                }

                foreach (string newUrl in urlsToQueue)
                {
                    ChkAndAddUrl(newUrl, url, null, ref data, ref storage);
                }

                if (!data.AddedUrls.Contains(url))
                {
                    data.AddedUrls.Add(url);
                }
                data.NumUrlsCrawled++;
                string[] splitPageTitle = urlPageTitle.Split(' ');
                foreach (string s in splitPageTitle)
                {
                    string plainText = s.ToLower();
                    plainText = Regex.Replace(plainText, "[^a-zA-Z0-9]", "");
                    if (plainText != "")
                    {
                        IndexedUrl     wordToUrl       = new IndexedUrl(plainText, urlPageTitle, (urlLastMod != null ? urlLastMod.ToString() : "NULL"), url);
                        TableOperation insertWordToUrl = TableOperation.InsertOrReplace(wordToUrl);
                        storage.UrlTable.Execute(insertWordToUrl);
                        data.NumUrlsIndexed++;
                    }
                }
                UrlTableCount  newCount    = new UrlTableCount(data.NumUrlsCrawled, data.NumUrlsIndexed);
                TableOperation insertCount = TableOperation.InsertOrReplace(newCount);
                storage.UrlTable.Execute(insertCount);
                if (data.LastTenUrls.Count == 10)
                {
                    data.LastTenUrls.Dequeue();
                }
                data.LastTenUrls.Enqueue(url);
            }
        }
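
The indexing step above splits the page title into lowercase alphanumeric words and writes one IndexedUrl row per word. The tokenization in isolation (sample title invented):

    using System;
    using System.Text.RegularExpressions;

    class TokenizeDemo
    {
        static void Main()
        {
            string urlPageTitle = "LeBron's 44 points lift Cavs - CNN";
            foreach (string s in urlPageTitle.Split(' '))
            {
                // Lowercase, then strip everything but letters and digits;
                // pure-punctuation tokens (like the "-") reduce to "" and are skipped.
                string plainText = Regex.Replace(s.ToLower(), "[^a-zA-Z0-9]", "");
                if (plainText != "")
                {
                    Console.WriteLine(plainText);   // lebrons, 44, points, lift, cavs, cnn
                }
            }
        }
    }
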