Ejemplo n.º 1
0
        async Task handle(string url)
        {
            if (VisitedUrl.Add(url))
            {
                var context = BrowsingContext.New(Configuration.Default.WithDefaultLoader());
                try
                {
                    var document = await context.OpenAsync(url);

                    foreach (var item in document.QuerySelectorAll("[href]"))
                    {
                        var u = fixUrl(item.GetAttribute("href"));
                        try
                        {
                            if (new Uri(u).Host.EndsWith(StartUrl.Host) && !PendingUrl.Contains(u) && !VisitedUrl.Contains(u))
                            {
                                PendingUrl.Enqueue(u);
                            }
                        }
                        catch (System.UriFormatException)
                        {
                            ErrorUrl.Add(u);
                        }
                    }

                    foreach (var item in document.QuerySelectorAll("[src]"))
                    {
                        var u = fixUrl(item.GetAttribute("src"));
                        if (FileTypes.Contains(u.Split('.')[^ 1]) && VisitedUrl.Add(u))
Ejemplo n.º 2
0
        /// <summary>
        ///     Crawls a given XML document (sitemap) by URI, checking each found URI's date if possible
        ///     and filtering out links disallowed or older than 2 months. Places found and valid XMLs/URLs
        ///     into relevant azure queue for processing.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper. Ref.
        /// </param>
        /// <param name="storage">
        ///     Crawler azure storage helper. Ref.
        /// </param>
        /// <param name="xml">
        ///     URI to XML document (sitemap) to crawl.
        /// </param>
        public static void CrawlXml(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string xml)
        {
            if (!data.ChkIfUriAllowed(xml))
            {
                return;
            }

            try
            {
                if (!xml.EndsWith(".xml", StringComparison.Ordinal))
                {
                    return;
                }

                XElement sitemap     = XElement.Load(xml);
                string   sitemapType = sitemap.Name.LocalName;
                string   nameSpace   = sitemap.GetDefaultNamespace().ToString();

                // Google News/Video sitemap extensions nest the publication date under a
                // prefixed child element, e.g. <news:news><news:publication_date>.
                // Ask the root for the prefix declaration directly instead of scanning the
                // serialized document text (the old Contains("xmlns:news") check could
                // false-positive on content and then NRE on GetNamespaceOfPrefix).
                XName      dateParent = null;
                XName      date       = null;
                XNamespace extNs      = sitemap.GetNamespaceOfPrefix("news");
                if (extNs != null)
                {
                    dateParent = extNs + "news";
                    date       = extNs + "publication_date";
                }
                else if ((extNs = sitemap.GetNamespaceOfPrefix("video")) != null)
                {
                    dateParent = extNs + "video";
                    date       = extNs + "publication_date";
                }

                XName loc     = XName.Get("loc", nameSpace);
                XName lastMod = XName.Get("lastmod", nameSpace);

                // A <sitemapindex> lists child sitemaps under <sitemap> elements;
                // a <urlset> lists pages under <url> elements.
                XName elementSelector = sitemapType == "sitemapindex"
                    ? XName.Get("sitemap", nameSpace)
                    : XName.Get("url", nameSpace);

                List<string> xmlsToQueue = new List<string>();
                List<string> urlsToQueue = new List<string>();

                // Entries dated within the last ~2 months (62 days) are kept; entries with
                // no date at all are always kept. Hoisted out of the loop and computed in
                // UTC so the cutoff is stable regardless of machine time zone.
                DateTime cutoff = DateTime.UtcNow - TimeSpan.FromDays(62);

                foreach (var element in sitemap.Elements(elementSelector))
                {
                    string currLocValue = element.Element(loc).Value;

                    var currLastModElement = element.Element(lastMod);
                    if (currLastModElement == null && dateParent != null)
                    {
                        // Fall back to the news/video extension date when <lastmod> is absent.
                        var extParent = element.Element(dateParent);
                        currLastModElement = extParent == null ? null : extParent.Element(date);
                    }

                    bool validDateIfExists = true;
                    if (currLastModElement != null)
                    {
                        // Sitemap dates are W3C/ISO-8601 (machine format): parse culture-
                        // invariantly and normalize to UTC for the cutoff comparison.
                        // A malformed date throws and lands in the error table below.
                        validDateIfExists = DateTime.Parse(
                            currLastModElement.Value,
                            System.Globalization.CultureInfo.InvariantCulture,
                            System.Globalization.DateTimeStyles.AdjustToUniversal) >= cutoff;
                    }

                    if (!validDateIfExists)
                    {
                        continue;
                    }

                    if (currLocValue.Contains(".xml"))
                    {
                        if (!data.QueuedXmls.Contains(currLocValue))
                        {
                            xmlsToQueue.Add(currLocValue);
                        }
                    }
                    else if (!data.QueuedUrls.Contains(currLocValue))
                    {
                        urlsToQueue.Add(currLocValue);
                    }
                }

                foreach (string newXml in xmlsToQueue)
                {
                    storage.XmlQueue.AddMessage(new CloudQueueMessage(newXml));
                    data.QueuedXmls.Add(newXml);
                    data.NumXmlsQueued++;
                }

                foreach (string newUrl in urlsToQueue)
                {
                    UrlCrawlr.ChkAndAddUrl(newUrl, xml, null, ref data, ref storage);
                }
            }
            catch (Exception ex)
            {
                // Any load/parse failure is recorded per-URI so the crawl can continue.
                ErrorUrl errorUrl = new ErrorUrl(xml, ex.ToString());
                storage.ErrorTable.Execute(TableOperation.InsertOrReplace(errorUrl));
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        ///     Crawls a given URL, queueing all found URLs and storing information about
        ///     the given URL for later querying.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper. Ref.
        /// </param>
        /// <param name="storage">
        ///     Crawler azure storage helper. Ref.
        /// </param>
        /// <param name="url">
        ///     The given URL to crawl.
        /// </param>
        public static void CrawlUrl(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string url)
        {
            if (!data.ChkIfUriAllowed(url))
            {
                return;
            }

            try
            {
                var web     = new HtmlWeb();
                var currDoc = web.Load(url);

                // Anchor elements supply the outbound links to queue.
                var urlNodes = currDoc.DocumentNode.Descendants("a").ToList();

                // Throws if the page has no <title>; the outer catch then records
                // the page as an error URL instead of indexing it untitled.
                var urlPageTitle = currDoc.DocumentNode.Descendants("title")
                                   .First()
                                   .InnerText;

                // Optional <meta name="pubdate" content="..."> carries the page's
                // publication date (single Where; the old no-op Select is gone).
                var urlLastModNode = currDoc.DocumentNode.Descendants("meta")
                                     .Where(y => y.Attributes.Contains("name") &&
                                                 y.Attributes["name"].Value == "pubdate")
                                     .ToList();

                DateTime? urlLastMod = null;
                if (urlLastModNode.Count > 0)
                {
                    // NOTE(review): parses under the current culture; the pubdate format
                    // is site-defined — confirm a fixed format/culture with real pages.
                    // A malformed date throws into the outer catch.
                    urlLastMod = DateTime.Parse(
                        urlLastModNode.First().Attributes["content"].Value);
                }

                List<string> urlsToQueue = new List<string>();
                foreach (var urlNode in urlNodes)
                {
                    if (urlNode.Attributes.Contains("href"))
                    {
                        urlsToQueue.Add(urlNode.Attributes["href"].Value);
                    }
                }

                foreach (string newUrl in urlsToQueue)
                {
                    ChkAndAddUrl(newUrl, url, urlLastMod, ref data, ref storage);
                }

                // Count this page as indexed only the first time it is seen.
                if (!data.AddedUrls.Contains(url))
                {
                    data.AddedUrls.Add(url);
                    data.NumUrlsIndexed++;
                }
                data.NumUrlsCrawled++;

                // Persist the crawled page and the updated counters.
                FoundUrl      finishedUrl = new FoundUrl(urlPageTitle, urlLastMod != null ? urlLastMod.ToString() : "NULL", url);
                UrlTableCount newCount    = new UrlTableCount(data.NumUrlsCrawled, data.NumUrlsIndexed);
                storage.UrlTable.Execute(TableOperation.InsertOrReplace(finishedUrl));
                storage.UrlTable.Execute(TableOperation.InsertOrReplace(newCount));

                // Maintain a rolling window of the ten most recently crawled URLs.
                if (data.LastTenUrls.Count == 10)
                {
                    data.LastTenUrls.Dequeue();
                }
                data.LastTenUrls.Enqueue(url);
            }
            catch (Exception ex)
            {
                // Any fetch/parse failure is recorded per-URL so the crawl can continue.
                ErrorUrl errorUrl = new ErrorUrl(url, ex.ToString());
                storage.ErrorTable.Execute(TableOperation.InsertOrReplace(errorUrl));
            }
        }