public ActionResult Index()
        {
            // Try to restore the dashboard's view model from the cache; a fresh one
            // is created the first time the page is opened.
            var model = cache["InBoundLinkCheckerViewModel"] as InBoundLinkCheckerViewModel;

            if (model == null)
            {
                model = new InBoundLinkCheckerViewModel
                {
                    CachedDataAvailable = false,
                    DataBeingGenerated  = false
                };
                StoreModelInCache(model);
            }
            else if (model.DataBeingGenerated)
            {
                // A crawl is running: ask the browser to reload every 5 seconds so
                // the view's progress bar stays up to date.
                this.HttpContext.Response.AddHeader("refresh", "5; url=" + Url.Action("Index"));
            }

            // A finished crawl leaves its results in the cache; fold them into the
            // view model before rendering.
            if (model.CachedDataAvailable)
            {
                model = PrepareViewModel(model);
            }

            return View("~/App_Plugins/EditorTools/Views/InBoundLinkChecker/Index.cshtml", model);
        }
        public async Task <ActionResult> StartCrawl(InBoundLinkCheckerViewModel model)
        {
            // Query Examine's external index for every published content node.
            var searcher = Examine.ExamineManager.Instance.SearchProviderCollection["ExternalSearcher"];
            var criteria = searcher.CreateSearchCriteria(IndexTypes.Content);

            // The raw query is a constant; string.Format added nothing here.
            var examineQuery = criteria.RawQuery("__IndexType:content");
            var results      = searcher.Search(examineQuery);

            // Reset the result collections in the cache so a new crawl starts clean.
            var resultsDictionary = new Dictionary <string, ContentModel>();
            var linksFound        = new List <string>();
            var brokenLinks       = new List <BrokenPageModel>();
            var domains           = new List <string>();
            StoreResultsInCache(resultsDictionary, linksFound, brokenLinks, domains);

            // Record crawl progress state so the view can render a progress bar.
            model.TotalToCrawl       = results.Count();
            model.CrawledLinks       = 0;
            model.DataBeingGenerated = true;
            StoreModelInCache(model);

            // Deliberate fire-and-forget: the crawl runs in the background so the
            // user is not stuck on a long-loading page. Progress and errors are
            // reported through the cached model, not through this task.
            Task.Run(() => ManageInternalCrawl(model, results.ToList()));

            // Refresh the page periodically so the progress bar updates while crawling.
            this.HttpContext.Response.AddHeader("refresh", "5; url=" + Url.Action("Index"));
            return View("~/App_Plugins/EditorTools/Views/InBoundLinkChecker/Index.cshtml", model);
        }
 private void StoreModelInCache(InBoundLinkCheckerViewModel model)
 {
     const string cacheKey = "InBoundLinkCheckerViewModel";

     // Replace any stale entry so the cache always holds the latest model,
     // expiring one hour from now.
     if (cache.Contains(cacheKey))
     {
         cache.Remove(cacheKey);
     }
     cache.Add(cacheKey, model, DateTime.Now.AddHours(1));
 }
 // ---- Beispiel #4: alternative StoreModelInCache implementation (from another example) ----
 private void StoreModelInCache(InBoundLinkCheckerViewModel model)
 {
     const string cacheKey = "InBoundLinkCheckerViewModel";

     // Insert on first use with no absolute expiration; afterwards just
     // overwrite the existing entry in place.
     if (!cache.Contains(cacheKey))
     {
         cache.Add(cacheKey, model, System.Web.Caching.Cache.NoAbsoluteExpiration, null);
     }
     else
     {
         cache[cacheKey] = model;
     }
 }
        public InBoundLinkCheckerViewModel PrepareViewModel(InBoundLinkCheckerViewModel ViewModel)
        {
            // Gather the crawl results from the cache.
            // BUG FIX: the cache entries may have been evicted since the crawl
            // finished; fall back to empty collections instead of letting the
            // foreach loops below throw a NullReferenceException.
            var resultsDictionary = cache["ResultsDictionary"] as Dictionary <string, ContentModel> ?? new Dictionary <string, ContentModel>();
            var linksFound        = cache["LinksFound"] as List <string> ?? new List <string>();
            var brokenLinks       = cache["BrokenLinks"] as List <BrokenPageModel> ?? new List <BrokenPageModel>();
            var domains           = cache["Domains"] as List <string> ?? new List <string>();

            // Instantiate the view model's DataTables.
            ViewModel.IndexedLinks.Table = new DataTable();
            ViewModel.IndexedLinks.Table.Columns.Add("Name", typeof(string));
            ViewModel.IndexedLinks.Table.Columns.Add("Published Url", typeof(string));

            ViewModel.BrokenLinks.Table = new DataTable();
            ViewModel.BrokenLinks.Table.Columns.Add("URL", typeof(string));
            ViewModel.BrokenLinks.Table.Columns.Add("Found On", typeof(string));
            ViewModel.BrokenLinks.Table.Columns.Add("Exception", typeof(string));

            ViewModel.LinksFoundTable.Table = new DataTable();
            ViewModel.LinksFoundTable.Table.Columns.Add("Link", typeof(string));

            ViewModel.Domains.Table = new DataTable();
            ViewModel.Domains.Table.Columns.Add("Domain", typeof(string));

            // Populate the DataTables from the cached crawl data.
            foreach (var item in resultsDictionary)
            {
                ViewModel.IndexedLinks.Table.Rows.Add(item.Value.NodeName, item.Value.URL);
            }
            foreach (var item in linksFound)
            {
                ViewModel.LinksFoundTable.Table.Rows.Add(item);
            }
            foreach (var item in brokenLinks)
            {
                ViewModel.BrokenLinks.Table.Rows.Add(item.URL, item.FoundOn, item.Exception);
            }
            foreach (var item in domains)
            {
                ViewModel.Domains.Table.Rows.Add(item);
            }

            // Summary counters shown at the top of the dashboard.
            ViewModel.TotalBrokenLinks  = ViewModel.BrokenLinks.Table.Rows.Count;
            ViewModel.TotalDomainsFound = ViewModel.Domains.Table.Rows.Count;
            ViewModel.TotalUniqueLinks  = ViewModel.LinksFoundTable.Table.Rows.Count;
            ViewModel.TotalVerified     = ViewModel.IndexedLinks.Table.Rows.Count;

            StoreModelInCache(ViewModel);
            return ViewModel;
        }
        public ActionResult SearchForInBoundLinks(InBoundLinkCheckerViewModel PostModel)
        {
            // Without a query there is nothing to search for; show the index page.
            // BUG FIX: the original called Index() but ignored its result and fell
            // through, then dereferenced the null/empty query below (NRE).
            if (string.IsNullOrEmpty(PostModel.Query))
            {
                return Index();
            }

            // Get the current view model from the cache.
            // BUG FIX: if it has been evicted, rebuild it via Index() rather than
            // throwing a NullReferenceException on model.Query below.
            var model = cache["InBoundLinkCheckerViewModel"] as InBoundLinkCheckerViewModel;
            if (model == null)
            {
                return Index();
            }

            // Normalise the query by removing any trailing / characters.
            model.Query           = PostModel.Query.TrimEnd('/');
            model.HasInBoundLinks = false;

            // The validated pages produced by the crawl (empty if evicted).
            var results = cache["ResultsDictionary"] as Dictionary <string, ContentModel> ?? new Dictionary <string, ContentModel>();

            // Instantiate the DataTable that will hold the query results.
            model.InBoundLinks.Table = new DataTable();
            model.InBoundLinks.Table.Columns.Add("Name", typeof(string));
            model.InBoundLinks.Table.Columns.Add("Url", typeof(string));
            model.InBoundLinks.Table.Columns.Add("Edit", typeof(HtmlString));

            // Pages that contain at least one link matching the query.
            var pagesWithLink = new List <ContentModel>();

            foreach (var node in results)
            {
                foreach (var link in node.Value.LinksOnNode)
                {
                    var normalisedLink = link.TrimEnd('/');
                    // Direct match, or — purely for ESCC — the "new" subdomain
                    // variant of the queried "www" url.
                    if (model.Query == normalisedLink ||
                        model.Query.Replace("www", "new") == normalisedLink)
                    {
                        if (!pagesWithLink.Contains(node.Value))
                        {
                            pagesWithLink.Add(node.Value);
                        }
                    }
                }
            }

            if (pagesWithLink.Count > 0)
            {
                // Let the view know there are results.
                model.HasInBoundLinks = true;
                foreach (var page in pagesWithLink)
                {
                    // Build the back-office edit link and add the page to the table.
                    var editURL = new HtmlString(string.Format("<a target=\"_top\" href=\"/umbraco#/content/content/edit/{0}\">edit</a>", page.NodeID));
                    model.InBoundLinks.Table.Rows.Add(page.NodeName, page.URL, editURL);
                }
            }
            return View("~/App_Plugins/EditorTools/Views/InBoundLinkChecker/Index.cshtml", model);
        }
        public CrawlerModel ProcessPage(InBoundLinkCheckerViewModel model, List <Examine.SearchResult> Results)
        {
            var crawlModel = new CrawlerModel();

            // Default to https when the configured site uri carries no scheme.
            if (!model.SiteUri.Contains("http"))
            {
                model.SiteUri = string.Format("{0}{1}", "https://", model.SiteUri);
            }

            // Ensure we have the current umbraco context (needed for async methods).
            var context = GetUmbracoContext();

            // Potential to be passed several results.
            foreach (var node in Results)
            {
                crawlModel.CrawledLinks++;

                var doc = new HtmlAgilityPack.HtmlDocument();

                // The urlName field is almost always different from the published
                // url, so resolve IPublishedContent to get the real one (slower
                // but much more accurate).
                var typedContent = new UmbracoHelper(context).TypedContent(int.Parse(node.Fields["__NodeId"]));

                // Compute the page url once instead of re-formatting it in every
                // branch (the original rebuilt the same string up to four times).
                var pageUrl = typedContent != null
                    ? string.Format("{0}{1}", model.SiteUri, typedContent.Url())
                    : string.Format("{0}/{1}", model.SiteUri, node.Fields["urlName"]);

                try
                {
                    // BUG FIX: WebClient is IDisposable and was never disposed,
                    // leaking connections over a long crawl.
                    using (var client = new WebClient())
                    {
                        doc.LoadHtml(client.DownloadString(pageUrl));
                    }
                    if (!crawlModel.ResultsDictionary.ContainsKey(pageUrl))
                    {
                        crawlModel.ResultsDictionary.Add(pageUrl, new ContentModel(node.Fields["nodeName"], pageUrl, int.Parse(node.Fields["__NodeId"])));
                    }
                }
                catch (Exception e)
                {
                    // The node is invalid or the link is broken; record it and skip.
                    crawlModel.BrokenLinks.Add(new BrokenPageModel(pageUrl, "Internal Crawl", e.Message));
                    continue;
                }

                // Harvest every link found on the downloaded page.
                crawlModel = GetLinksOnPage(doc.DocumentNode.InnerHtml, pageUrl, crawlModel);
            }
            return crawlModel;
        }
        public async Task ManageInternalCrawl(InBoundLinkCheckerViewModel model, List <Examine.SearchResult> PublishedPages)
        {
            // Crawler bookkeeping: up to 8 concurrent page-crawl tasks at a time.
            var linksAvailableToCrawl = true;
            var taskCount  = 0;
            var taskId     = 0;
            var taskStatus = new Dictionary <int, string>();
            var taskList   = new Dictionary <int, Task <CrawlerModel> >();

            // Keep going while there are still umbraco pages to crawl.
            while (linksAvailableToCrawl)
            {
                try
                {
                    // Top the pool back up to 8 running tasks while pages remain.
                    while (taskCount < 8 && PublishedPages.Count > 0)
                    {
                        taskCount++;
                        taskId++;
                        // BUG FIX: materialise the page BEFORE starting the task.
                        // The original lambda captured PublishedPages and evaluated
                        // Take(1) inside the task, racing with RemoveRange on this
                        // thread — pages could be crawled twice or skipped.
                        var pagesForTask = PublishedPages.Take(1).ToList();
                        PublishedPages.RemoveRange(0, 1);
                        taskList.Add(taskId, Task.Run(() => ProcessPage(model, pagesForTask)));
                        taskStatus.Add(taskId, "Started");
                    }

                    // No pages left and nothing running: finish after this pass.
                    if (PublishedPages.Count <= 0 && taskCount == 0)
                    {
                        linksAvailableToCrawl = false;
                    }

                    // Gather the results of any tasks that have completed.
                    var resultsModelList = new List <CrawlerModel>();
                    foreach (var entry in taskList)
                    {
                        if (entry.Value.IsCompleted)
                        {
                            resultsModelList.Add(entry.Value.Result);
                            taskStatus[entry.Key] = "Completed";
                        }
                    }

                    // Drop completed tasks from both bookkeeping dictionaries.
                    var keysToRemove = new List <int>();
                    foreach (var entry in taskStatus)
                    {
                        if (entry.Value == "Completed")
                        {
                            taskList.Remove(entry.Key);
                            taskCount--;
                            keysToRemove.Add(entry.Key);
                        }
                    }
                    foreach (var key in keysToRemove)
                    {
                        taskStatus.Remove(key);
                    }

                    // Pull the running totals from the cache (empty on first pass
                    // or after eviction).
                    var resultsDictionary = cache["ResultsDictionary"] as Dictionary <string, ContentModel> ?? new Dictionary <string, ContentModel>();
                    var brokenLinks       = cache["BrokenLinks"] as List <BrokenPageModel> ?? new List <BrokenPageModel>();
                    var linksFound        = cache["LinksFound"] as List <string> ?? new List <string>();
                    var domains           = cache["Domains"] as List <string> ?? new List <string>();

                    // Merge each finished task's results, de-duplicating as we go.
                    foreach (var resultModel in resultsModelList)
                    {
                        if (resultModel == null)
                        {
                            continue;
                        }
                        model.CrawledLinks += resultModel.CrawledLinks;
                        foreach (var item in resultModel.BrokenLinks)
                        {
                            if (!brokenLinks.Contains(item))
                            {
                                brokenLinks.Add(item);
                            }
                        }
                        foreach (var item in resultModel.ResultsDictionary)
                        {
                            if (!resultsDictionary.ContainsKey(item.Key))
                            {
                                resultsDictionary.Add(item.Key, item.Value);
                            }
                        }
                        foreach (var item in resultModel.LinksFound)
                        {
                            if (!linksFound.Contains(item))
                            {
                                linksFound.Add(item);
                            }
                        }
                        foreach (var item in resultModel.Domains)
                        {
                            if (!domains.Contains(item))
                            {
                                domains.Add(item);
                            }
                        }
                    }

                    // Persist progress so the dashboard's progress bar can update.
                    model.IndexedPagesTotal = resultsDictionary.Count;
                    StoreModelInCache(model);
                    StoreResultsInCache(resultsDictionary, linksFound, brokenLinks, domains);

                    // BUG FIX: the original loop busy-spun on one core between
                    // polls (and this async method never awaited anything, CS1998).
                    // A short pause keeps the polling cheap.
                    await Task.Delay(250);
                }
                catch (Exception ex)
                {
                    // An error occurred: stop the crawl but keep the results so far.
                    model.DataBeingGenerated  = false;
                    model.CachedDataAvailable = true;
                    // BUG FIX: InnerException can be null, which threw a
                    // NullReferenceException inside the error handler itself.
                    model.ErrorOccured        = ex.InnerException != null ? ex.InnerException.Message : ex.Message;
                    StoreModelInCache(model);
                    break;
                }
            }

            // Crawl finished: tell the view that data is no longer being generated
            // and that cached results are available.
            model.DataBeingGenerated  = false;
            model.CachedDataAvailable = true;
            StoreModelInCache(model);
        }