Example #1
        // Refresh storage and helpers.
        private void InitializeCrawlrComponents()
        {
            _storageManager
                = new CrawlrStorageManager(ConfigurationManager.AppSettings["StorageConnectionString"]);
            _crawlrData    = new CrawlrDataHelper();
            _statusManager = new CrawlrStatusManager();
        }
        /// <summary>
        ///     Crawls a given robots.txt, adding all sitemaps to the queue.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper (passed by ref).
        /// </param>
        /// <param name="storage">
        ///     Crawler Azure storage helper (passed by ref).
        /// </param>
        public static void CrawlRobotsTxt(ref CrawlrDataHelper data, ref CrawlrStorageManager storage)
        {
            string url = storage.GetCurrentRobotsTxt();

            CrawlSpecificRobotsTxt(url, ref data, ref storage);

            // Include bleacherreport.com (formerly cnn.com/sports) if crawling cnn.com
            if (storage.GetCurrentRobotsTxt().Contains("cnn"))
            {
                CrawlSpecificRobotsTxt("http://www.bleacherreport.com/robots.txt", ref data, ref storage);
            }
        }
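
The Startup() method invoked from Example #3 is not included in this listing. A plausible sketch, assuming it only seeds the crawl by parsing the configured robots.txt and publishing an initial status (the containing class name RobotsTxtCrawlr is an assumption introduced here, not taken from the source):

        // Hypothetical sketch of Startup(); the actual implementation is not shown.
        // CrawlRobotsTxt is the static method from Example #1.
        private void Startup()
        {
            RobotsTxtCrawlr.CrawlRobotsTxt(ref _crawlrData, ref _storageManager);
            _statusManager.UpdateCrawlrStatus("Loading", _crawlrData, _storageManager);
        }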
Example #3
        private async Task RunAsync(CancellationToken cancellationToken)
        {
            _storageManager = new CrawlrStorageManager(ConfigurationManager.AppSettings["StorageConnectionString"]);

            // Get the current cmd from the cmd table;
            // re-query periodically until the "START" command appears
            while (_storageManager.GetCurrentCmd() != "START")
            {
                await Task.Delay(5000);
            }

            // If start cmd given, initialize download of robots.txt
            // and populate the xmlQueue and _disallowed list
            if (_storageManager.GetCurrentCmd() == "START" && _crawlrData == null)
            {
                // Set up queues, tables, data helper, status helper
                InitializeCrawlrComponents();
                Startup();
            }

            // Recurring work
            while (!cancellationToken.IsCancellationRequested)
            {
                Trace.TraceInformation("Working");

                // Do work if current cmd is still "start"
                if (_storageManager.GetCurrentCmd() == "START")
                {
                    // Process all XMLs (sitemaps) found
                    string nextXml = "";
                    try
                    {
                        while (_crawlrData.NumXmlsQueued > 0 && _storageManager.GetCurrentCmd() == "START")
                        {
                            //CloudQueueMessage nextXmlMsg = _storageManager.XmlQueue.GetMessage();
                            nextXml = _crawlrData.XmlQueue.Dequeue();
                            _crawlrData.NumXmlsQueued--;

                            XmlCrawlr.CrawlXml(ref _crawlrData, ref _storageManager, nextXml);

                            //_storageManager.XmlQueue.DeleteMessage(nextXmlMsg);

                            // Update worker role status
                            _statusManager.UpdateCrawlrStatus(
                                "Loading",
                                _crawlrData,
                                _storageManager
                                );
                            _statusManager.UpdateQueueSize(_storageManager, _crawlrData.NumXmlsQueued, _crawlrData.NumUrlsQueued);

                            await Task.Delay(50);
                        }
                    }
                    catch (Exception ex)
                    {
                        try
                        {
                            ErrorEntity    errorUrl       = new ErrorEntity(nextXml, ex.ToString());
                            TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
                            _storageManager.ErrorTable.Execute(insertErrorUrl);
                        }
                        catch (Exception) { /* Ignore failures while logging the error itself. */ }
                    }

                    // Process all URLs in queue
                    string nextUrl = "";
                    try
                    {
                        while (_storageManager.GetCurrentCmd() == "START")
                        {
                            CloudQueueMessage nextUrlMsg = _storageManager.UrlQueue.GetMessage();
                            if (nextUrlMsg == null)
                            {
                                // Queue is empty (or no message currently visible); end this pass.
                                break;
                            }
                            nextUrl = nextUrlMsg.AsString;

                            UrlCrawlr.CrawlUrl(ref _crawlrData, ref _storageManager, nextUrl);

                            _storageManager.UrlQueue.DeleteMessage(nextUrlMsg);
                            _crawlrData.NumUrlsQueued--;

                            // Update worker role status
                            _statusManager.UpdateCrawlrStatus(
                                "Crawling",
                                _crawlrData,
                                _storageManager
                                );
                            _statusManager.UpdateQueueSize(_storageManager, _crawlrData.NumXmlsQueued, _crawlrData.NumUrlsQueued);

                            await Task.Delay(50);
                        }
                    }
                    catch (Exception ex)
                    {
                        try
                        {
                            ErrorEntity    errorUrl       = new ErrorEntity(nextUrl, ex.ToString());
                            TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
                            _storageManager.ErrorTable.Execute(insertErrorUrl);
                        }
                        catch (Exception) { /* Ignore failures while logging the error itself. */ }
                    }
                }
                else if (_storageManager.GetCurrentCmd() == "CLEAR")
                {
                    // If the "CLEAR" command is found, update status.
                    _statusManager.UpdateCrawlrStatus(
                        "CLEAR",
                        _crawlrData,
                        _storageManager
                        );
                    _statusManager.UpdateQueueSize(_storageManager, 0, 0);
                    _storageManager.UrlQueue.Clear();
                    _storageManager.XmlQueue.Clear();
                    // Give Azure time to delete tables.
                    await Task.Delay(20000);

                    try
                    {
                        // Idle while waiting for next command.
                        while (_storageManager.GetCurrentCmd() == "CLEAR")
                        {
                            await Task.Delay(10000);
                        }
                    }
                    finally
                    {
                        // Reinitialize worker role.
                        InitializeCrawlrComponents();
                        Startup();
                    }
                }
                else
                {
                    // Idle worker role (for unimplemented 'pause' functionality).
                    _statusManager.UpdateCrawlrStatus(
                        "Idle",
                        _crawlrData,
                        _storageManager
                        );

                    await Task.Delay(5000);
                }
            }

            await Task.Delay(1000);
        }
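
RunAsync above follows the standard Azure Cloud Service worker-role template. For context, a minimal sketch of the wiring that would drive it (the field names are the Visual Studio template's defaults, not taken from these examples):

using System.Threading;
using Microsoft.WindowsAzure.ServiceRuntime;

public class WorkerRole : RoleEntryPoint
{
    private readonly CancellationTokenSource cancellationTokenSource = new CancellationTokenSource();
    private readonly ManualResetEvent        runCompleteEvent        = new ManualResetEvent(false);

    public override void Run()
    {
        try
        {
            // Drive the crawl loop until OnStop() cancels the token.
            // RunAsync is the method from Example #3, assumed to live on this class.
            this.RunAsync(this.cancellationTokenSource.Token).Wait();
        }
        finally
        {
            this.runCompleteEvent.Set();
        }
    }

    public override void OnStop()
    {
        this.cancellationTokenSource.Cancel();
        this.runCompleteEvent.WaitOne();
        base.OnStop();
    }
}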
Example #4
        /// <summary>
        ///     Crawls a given XML document (sitemap) by URI, checking each found URI's date if possible
        ///     and filtering out links that are disallowed or predate the cutoff date (2018-01-01 here).
        ///     Places found and valid XMLs/URLs into the relevant in-memory queue for crawling.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper (passed by ref).
        /// </param>
        /// <param name="storage">
        ///     Crawler Azure storage helper (passed by ref).
        /// </param>
        /// <param name="xml">
        ///     URI to XML document (sitemap) to crawl.
        /// </param>
        public static void CrawlXml(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string xml)
        {
            if (data.ChkIfUriAllowed(xml))
            {
                if (xml.EndsWith(".xml"))
                {
                    XElement sitemap       = XElement.Load(xml);
                    string   sitemapType   = sitemap.Name.LocalName;
                    string   nameSpace     = sitemap.GetDefaultNamespace().ToString();
                    string   dateNameSpace = null;
                    XName    dateParent    = null;
                    XName    date          = null;
                    if (sitemap.ToString().Contains(@"xmlns:news"))
                    {
                        dateNameSpace = sitemap.GetNamespaceOfPrefix("news").ToString();
                        dateParent    = XName.Get("news", dateNameSpace);
                        date          = XName.Get("publication_date", dateNameSpace);
                    }
                    else if (sitemap.ToString().Contains(@"xmlns:video"))
                    {
                        dateNameSpace = sitemap.GetNamespaceOfPrefix("video").ToString();
                        dateParent    = XName.Get("video", dateNameSpace);
                        date          = XName.Get("publication_date", dateNameSpace);
                    }
                    XName loc     = XName.Get("loc", nameSpace);
                    XName lastMod = XName.Get("lastmod", nameSpace);
                    XName elementSelector;
                    if (sitemapType == "sitemapindex")
                    {
                        elementSelector = XName.Get("sitemap", nameSpace);
                    }
                    else
                    {
                        elementSelector = XName.Get("url", nameSpace);
                    }

                    List<string> xmlsToQueue = new List<string>();
                    List<string> urlsToQueue = new List<string>();

                    foreach (var element in sitemap.Elements(elementSelector))
                    {
                        bool   validDateIfExists  = true;
                        var    currLocElement     = element.Element(loc);
                        string currLocValue       = currLocElement.Value;
                        var    currLastModElement = element.Element(lastMod);
                        if (currLastModElement == null)
                        {
                            currLastModElement = element.Element(dateParent);
                            currLastModElement = (currLastModElement == null ? null : currLastModElement.Element(date));
                        }
                        if (currLastModElement != null)
                        {
                            validDateIfExists = DateTime.Parse(currLastModElement.Value) >= new DateTime(2018, 1, 1);
                        }
                        if (currLocValue.Contains(".xml"))
                        {
                            if (!data.QueuedXmls.Contains(currLocValue) &&
                                validDateIfExists)
                            {
                                xmlsToQueue.Add(currLocValue);
                            }
                        }
                        else
                        {
                            if (!data.QueuedUrls.Contains(currLocValue) &&
                                validDateIfExists)
                            {
                                urlsToQueue.Add(currLocValue);
                            }
                        }
                    }

                    foreach (string newXml in xmlsToQueue)
                    {
                        data.QueuedXmls.Add(newXml);
                        data.NumXmlsQueued++;
                        data.XmlQueue.Enqueue(newXml);
                    }

                    foreach (string newUrl in urlsToQueue)
                    {
                        UrlCrawlr.ChkAndAddUrl(newUrl, xml, null, ref data, ref storage);
                    }
                }
            }
        }
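
For reference, a self-contained sketch of the namespace-aware lookups CrawlXml performs, run against an inline sitemapindex sample (the sample XML is illustrative; the namespace is the standard sitemaps.org one):

using System;
using System.Xml.Linq;

class SitemapParseSketch
{
    static void Main()
    {
        string sample = @"<sitemapindex xmlns=""http://www.sitemaps.org/schemas/sitemap/0.9"">
  <sitemap>
    <loc>http://www.example.com/sitemap-news.xml</loc>
    <lastmod>2018-03-01</lastmod>
  </sitemap>
</sitemapindex>";

        XElement sitemap   = XElement.Parse(sample);
        string   nameSpace = sitemap.GetDefaultNamespace().ToString();
        XName    loc       = XName.Get("loc", nameSpace);
        XName    lastMod   = XName.Get("lastmod", nameSpace);

        // A "sitemapindex" root holds <sitemap> children; a "urlset" root holds <url>.
        XName selector = sitemap.Name.LocalName == "sitemapindex"
            ? XName.Get("sitemap", nameSpace)
            : XName.Get("url", nameSpace);

        foreach (var element in sitemap.Elements(selector))
        {
            var lastModElement = element.Element(lastMod);
            Console.WriteLine("{0} (lastmod: {1})",
                element.Element(loc).Value,
                lastModElement != null ? lastModElement.Value : "n/a");
        }
    }
}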
Example #5
        /// <summary>
        ///     Crawls a given XML document (sitemap) by URI, checking each found URI's date if possible
        ///     and filtering out links disallowed or older than 2 months. Places found and valid XMLs/URLs
        ///     into the relevant Azure queue for processing.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper (passed by ref).
        /// </param>
        /// <param name="storage">
        ///     Crawler Azure storage helper (passed by ref).
        /// </param>
        /// <param name="xml">
        ///     URI to XML document (sitemap) to crawl.
        /// </param>
        public static void CrawlXml(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string xml)
        {
            if (data.ChkIfUriAllowed(xml))
            {
                try
                {
                    if (xml.EndsWith(".xml"))
                    {
                        XElement sitemap       = XElement.Load(xml);
                        string   sitemapType   = sitemap.Name.LocalName;
                        string   nameSpace     = sitemap.GetDefaultNamespace().ToString();
                        string   dateNameSpace = null;
                        XName    dateParent    = null;
                        XName    date          = null;
                        if (sitemap.ToString().Contains(@"xmlns:news"))
                        {
                            dateNameSpace = sitemap.GetNamespaceOfPrefix("news").ToString();
                            dateParent    = XName.Get("news", dateNameSpace);
                            date          = XName.Get("publication_date", dateNameSpace);
                        }
                        else if (sitemap.ToString().Contains(@"xmlns:video"))
                        {
                            dateNameSpace = sitemap.GetNamespaceOfPrefix("video").ToString();
                            dateParent    = XName.Get("video", dateNameSpace);
                            date          = XName.Get("publication_date", dateNameSpace);
                        }
                        XName loc     = XName.Get("loc", nameSpace);
                        XName lastMod = XName.Get("lastmod", nameSpace);
                        XName elementSelector;
                        if (sitemapType == "sitemapindex")
                        {
                            elementSelector = XName.Get("sitemap", nameSpace);
                        }
                        else
                        {
                            elementSelector = XName.Get("url", nameSpace);
                        }

                        List<string> xmlsToQueue = new List<string>();
                        List<string> urlsToQueue = new List<string>();

                        foreach (var element in sitemap.Elements(elementSelector))
                        {
                            bool   validDateIfExists  = true;
                            var    currLocElement     = element.Element(loc);
                            string currLocValue       = currLocElement.Value;
                            var    currLastModElement = element.Element(lastMod);
                            if (currLastModElement == null)
                            {
                                currLastModElement = element.Element(dateParent);
                                currLastModElement = (currLastModElement == null ? null : currLastModElement.Element(date));
                            }
                            if (currLastModElement != null)
                            {
                                validDateIfExists = DateTime.Parse(currLastModElement.Value) >= DateTime.Now - TimeSpan.FromDays(62);
                            }
                            if (currLocValue.Contains(".xml"))
                            {
                                if (!data.QueuedXmls.Contains(currLocValue) &&
                                    validDateIfExists)
                                {
                                    xmlsToQueue.Add(currLocValue);
                                }
                            }
                            else
                            {
                                if (!data.QueuedUrls.Contains(currLocValue) &&
                                    validDateIfExists)
                                {
                                    urlsToQueue.Add(currLocValue);
                                }
                            }
                        }

                        foreach (string newXml in xmlsToQueue)
                        {
                            CloudQueueMessage msg = new CloudQueueMessage(newXml);
                            storage.XmlQueue.AddMessage(msg);
                            data.QueuedXmls.Add(newXml);
                            data.NumXmlsQueued++;
                        }

                        foreach (string newUrl in urlsToQueue)
                        {
                            UrlCrawlr.ChkAndAddUrl(newUrl, xml, null, ref data, ref storage);
                        }
                    }
                }
                catch (Exception ex)
                {
                    ErrorUrl       errorUrl       = new ErrorUrl(xml, ex.ToString());
                    TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
                    storage.ErrorTable.Execute(insertErrorUrl);
                }
            }
        }
Example #6
        /// <summary>
        ///     Crawls a given URL, queueing all found URLs and storing information about
        ///     the given URL for later querying.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper (passed by ref).
        /// </param>
        /// <param name="storage">
        ///     Crawler Azure storage helper (passed by ref).
        /// </param>
        /// <param name="url">
        ///     The given URL to crawl.
        /// </param>
        public static void CrawlUrl(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string url)
        {
            if (data.ChkIfUriAllowed(url))
            {
                // Unsure if necessary.
                //if (!url.Contains(".htm"))
                //{
                //    if (!url.Contains(".jpg") && !url.Contains(".png"))
                //    {
                //        if (url.EndsWith(@"/"))
                //        {
                //            url += "index.html";
                //        }
                //        else
                //        {
                //            url += @"/index.html";
                //        }
                //    }
                //}
                try
                {
                    var web      = new HtmlWeb();
                    var currDoc  = web.Load(url);
                    var urlNodes = currDoc.DocumentNode.Descendants("a")
                                   .ToList();
                    var urlPageTitle = currDoc.DocumentNode.Descendants("title")
                                       .First()
                                       .InnerText;
                    var urlLastModNode = currDoc.DocumentNode.Descendants("meta")
                                         .Where(y => y.Attributes.Contains("name"))
                                         .Where(y => y.Attributes["name"].Value == "pubdate")
                                         .ToList();

                    DateTime? urlLastMod = null;
                    if (urlLastModNode.Count > 0)
                    {
                        urlLastMod = DateTime.Parse(
                            urlLastModNode.First().Attributes["content"].Value);
                    }

                    List<string> urlsToQueue = new List<string>();

                    foreach (var urlNode in urlNodes)
                    {
                        if (urlNode.Attributes.Contains("href"))
                        {
                            urlsToQueue.Add(urlNode.Attributes["href"].Value);
                        }
                    }

                    foreach (string newUrl in urlsToQueue)
                    {
                        ChkAndAddUrl(newUrl, url, urlLastMod, ref data, ref storage);
                    }

                    if (!data.AddedUrls.Contains(url))
                    {
                        data.AddedUrls.Add(url);
                        data.NumUrlsIndexed++;
                    }
                    data.NumUrlsCrawled++;
                    FoundUrl       finishedUrl = new FoundUrl(urlPageTitle, (urlLastMod != null ? urlLastMod.ToString() : "NULL"), url);
                    UrlTableCount  newCount    = new UrlTableCount(data.NumUrlsCrawled, data.NumUrlsIndexed);
                    TableOperation insertUrl   = TableOperation.InsertOrReplace(finishedUrl);
                    TableOperation insertCount = TableOperation.InsertOrReplace(newCount);
                    storage.UrlTable.Execute(insertUrl);
                    storage.UrlTable.Execute(insertCount);
                    if (data.LastTenUrls.Count == 10)
                    {
                        data.LastTenUrls.Dequeue();
                    }
                    data.LastTenUrls.Enqueue(url);
                }
                catch (Exception ex)
                {
                    ErrorUrl       errorUrl       = new ErrorUrl(url, ex.ToString());
                    TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
                    storage.ErrorTable.Execute(insertErrorUrl);
                }
            }
        }
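
The "pubdate" meta lookup above is easiest to see against fixed markup. A minimal sketch using HtmlAgilityPack's HtmlDocument.LoadHtml instead of a live fetch (the HTML sample is illustrative):

using System;
using System.Linq;
using HtmlAgilityPack;

class PubdateSketch
{
    static void Main()
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(@"<html><head>
            <title>Sample story</title>
            <meta name=""pubdate"" content=""2018-03-01"">
        </head><body><a href=""/next-story"">next</a></body></html>");

        // Same query shape as CrawlUrl: meta tags whose name attribute is "pubdate".
        var urlLastModNode = doc.DocumentNode.Descendants("meta")
                                .Where(y => y.Attributes.Contains("name"))
                                .Where(y => y.Attributes["name"].Value == "pubdate")
                                .ToList();

        DateTime? urlLastMod = urlLastModNode.Count > 0
            ? DateTime.Parse(urlLastModNode.First().Attributes["content"].Value)
            : (DateTime?)null;

        Console.WriteLine(urlLastMod); // 2018-03-01 00:00:00
    }
}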
Example #7
        /// <summary>
        ///     Checks to see if a given URL has already been queued/parsed; if not, adds to queue.
        /// </summary>
        /// <param name="currHref">
        ///     URL to check.
        /// </param>
        /// <param name="currUri">
        ///     Domain space of URL.
        /// </param>
        /// <param name="urlLastMod">
        ///     The URL's lastmod/published date, if known. Nullable.
        /// </param>
        /// <param name="data">
        ///     Crawler data helper (passed by ref).
        /// </param>
        /// <param name="storage">
        ///     Crawler Azure storage helper (passed by ref).
        /// </param>
        public static void ChkAndAddUrl(string currHref, string currUri,
                                        DateTime? urlLastMod, ref CrawlrDataHelper data, ref CrawlrStorageManager storage)
        {
            bool   validDateIfExists = true;
            string domain            = new Uri(currUri).Host;

            if (currHref.StartsWith(@"//"))
            {
                currHref = @"http:" + currHref;
            }
            else if (currHref.StartsWith(@"/"))
            {
                currHref = @"http://" + domain + currHref;
            }
            if (urlLastMod != null)
            {
                validDateIfExists = (urlLastMod >= DateTime.Now - TimeSpan.FromDays(62));
            }
            if (IsInProperDomain(currHref) &&
                !data.QueuedUrls.Contains(currHref) &&
                !data.AddedUrls.Contains(currHref) &&
                validDateIfExists)
            {
                CloudQueueMessage urlMsg = new CloudQueueMessage(currHref);
                storage.UrlQueue.AddMessage(urlMsg);
                data.QueuedUrls.Add(currHref);
                data.NumUrlsQueued++;
            }
        }
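
The href normalization at the top of ChkAndAddUrl handles protocol-relative and root-relative links. A minimal sketch with that logic extracted into a pure helper (NormalizeHref is a name introduced here for illustration):

using System;

static class HrefNormalizerSketch
{
    static string NormalizeHref(string currHref, string currUri)
    {
        string domain = new Uri(currUri).Host;
        if (currHref.StartsWith("//"))
        {
            return "http:" + currHref;             // protocol-relative link
        }
        if (currHref.StartsWith("/"))
        {
            return "http://" + domain + currHref;  // root-relative link
        }
        return currHref;                           // absolute (or relative) link, unchanged
    }

    static void Main()
    {
        // -> http://cdn.example.com/lib.js
        Console.WriteLine(NormalizeHref("//cdn.example.com/lib.js", "http://www.example.com/page"));
        // -> http://www.example.com/news/story.html
        Console.WriteLine(NormalizeHref("/news/story.html", "http://www.example.com/page"));
    }
}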
Example #8
        /// <summary>
        ///     Crawls a given URL, queueing all found URLs and storing information about
        ///     the given URL for later querying.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper (passed by ref).
        /// </param>
        /// <param name="storage">
        ///     Crawler Azure storage helper (passed by ref).
        /// </param>
        /// <param name="url">
        ///     The given URL to crawl.
        /// </param>
        public static void CrawlUrl(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string url)
        {
            if (data.ChkIfUriAllowed(url))
            {
                // Unsure if necessary.
                //if (!url.Contains(".htm"))
                //{
                //    if (!url.Contains(".jpg") && !url.Contains(".png"))
                //    {
                //        if (url.EndsWith(@"/"))
                //        {
                //            url += "index.html";
                //        }
                //        else
                //        {
                //            url += @"/index.html";
                //        }
                //    }
                //}

                var web      = new HtmlWeb();
                var currDoc  = web.Load(url);
                var urlNodes = currDoc.DocumentNode.Descendants("a")
                               .ToList();
                var urlPageTitle = currDoc.DocumentNode.Descendants("title")
                                   .First()
                                   .InnerText;
                var urlLastModNode = currDoc.DocumentNode.Descendants("meta")
                                     .Where(y => y.Attributes.Contains("name"))
                                     .Where(y => y.Attributes["name"].Value == "pubdate")
                                     .ToList();

                DateTime? urlLastMod = null;
                if (urlLastModNode.Count > 0)
                {
                    urlLastMod = DateTime.Parse(
                        urlLastModNode.First().Attributes["content"].Value);
                }

                List<string> urlsToQueue = new List<string>();

                foreach (var urlNode in urlNodes)
                {
                    if (urlNode.Attributes.Contains("href"))
                    {
                        urlsToQueue.Add(urlNode.Attributes["href"].Value);
                    }
                }

                foreach (string newUrl in urlsToQueue)
                {
                    ChkAndAddUrl(newUrl, url, null, ref data, ref storage);
                }

                if (!data.AddedUrls.Contains(url))
                {
                    data.AddedUrls.Add(url);
                }
                data.NumUrlsCrawled++;
                string[] splitPageTitle = urlPageTitle.Split(' ');
                foreach (string s in splitPageTitle)
                {
                    string plainText = s.ToLower();
                    plainText = Regex.Replace(plainText, "[^a-zA-Z0-9]", "");
                    if (plainText != "")
                    {
                        IndexedUrl     wordToUrl       = new IndexedUrl(plainText, urlPageTitle, (urlLastMod != null ? urlLastMod.ToString() : "NULL"), url);
                        TableOperation insertWordToUrl = TableOperation.InsertOrReplace(wordToUrl);
                        storage.UrlTable.Execute(insertWordToUrl);
                        data.NumUrlsIndexed++;
                    }
                }
                UrlTableCount  newCount    = new UrlTableCount(data.NumUrlsCrawled, data.NumUrlsIndexed);
                TableOperation insertCount = TableOperation.InsertOrReplace(newCount);
                storage.UrlTable.Execute(insertCount);
                if (data.LastTenUrls.Count == 10)
                {
                    data.LastTenUrls.Dequeue();
                }
                data.LastTenUrls.Enqueue(url);
            }
        }
Example #9
        private void InitializeCrawlrComponents()
        {
            _storageManager
                = new CrawlrStorageManager(ConfigurationManager.AppSettings["StorageConnectionString"]);
        }
Example #10
        private static void CrawlSpecificRobotsTxt(string url, ref CrawlrDataHelper data, ref CrawlrStorageManager storage)
        {
            string tempPath = Path.GetTempFileName();

            // Download robots.txt to a temp file, disposing the client when done.
            using (WebClient wc = new WebClient())
            {
                wc.DownloadFile(url, tempPath);
            }

            using (StreamReader input = new StreamReader(tempPath))
            {
                string       currLine      = "";
                string       currUserAgent = "";
                List<string> sitemaps      = new List<string>();

                while ((currLine = input.ReadLine()) != null)
                {
                    var splitLine = currLine.Split(' ');

                    // Skip blank lines and directives with no value (e.g. a bare "Disallow:").
                    if (splitLine.Length < 2 || splitLine[1] == "")
                    {
                        continue;
                    }

                    if (splitLine[0].ToLower() == "sitemap:")
                    {
                        bool pass = false;
                        if (url.Contains("bleacherreport"))
                        {
                            // Only take the NBA and article sitemaps from bleacherreport.com.
                            if (splitLine[1].Contains("/nba") || splitLine[1].Contains("/articles"))
                            {
                                pass = true;
                            }
                        }
                        else
                        {
                            pass = true;
                        }
                        if (pass)
                        {
                            sitemaps.Add(splitLine[1]);
                            data.QueuedXmls.Add(splitLine[1]);
                            data.XmlQueue.Enqueue(splitLine[1]);
                            data.NumXmlsQueued++;
                        }
                    }
                    else if (splitLine[0].ToLower() == "user-agent:")
                    {
                        currUserAgent = splitLine[1];
                    }
                    else if (splitLine[0].ToLower() == "disallow:" && currUserAgent == "*")
                    {
                        // Only honor Disallow rules in the wildcard user-agent group.
                        data.DisallowedStrings.Add(splitLine[1]);
                    }
                }
            }
        }
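
The directive parsing above keys off the first token of each robots.txt line. A self-contained sketch of the same splitting logic over an inline sample (the sample content is illustrative):

using System;
using System.Collections.Generic;
using System.IO;

class RobotsParseSketch
{
    static void Main()
    {
        string sample = "User-agent: *\nDisallow: /private/\nSitemap: http://www.example.com/sitemap.xml";
        string currUserAgent = "";
        var disallowed = new List<string>();
        var sitemaps   = new List<string>();

        using (var input = new StringReader(sample))
        {
            string currLine;
            while ((currLine = input.ReadLine()) != null)
            {
                var splitLine = currLine.Split(' ');
                // Skip directives with no value, e.g. a bare "Disallow:".
                if (splitLine.Length < 2 || splitLine[1] == "") continue;

                switch (splitLine[0].ToLower())
                {
                    case "sitemap:":    sitemaps.Add(splitLine[1]);                             break;
                    case "user-agent:": currUserAgent = splitLine[1];                           break;
                    case "disallow:":   if (currUserAgent == "*") disallowed.Add(splitLine[1]); break;
                }
            }
        }

        Console.WriteLine("sitemaps={0}, disallowed={1}", sitemaps.Count, disallowed.Count);
        // -> sitemaps=1, disallowed=1
    }
}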