/// <summary>
/// Crawls every page in <paramref name="PublishedPages"/> with at most eight concurrent
/// <c>ProcessPage</c> tasks. Each finished task's results are merged into the cached result
/// collections, and the cached view model is refreshed after every batch of finished tasks.
/// </summary>
/// <param name="model">
/// View model that receives crawl progress (crawled-link count, indexed-page total, status flags and
/// error message). It is passed to every <c>ProcessPage</c> call and stored in the cache.
/// </param>
/// <param name="PublishedPages">
/// Pages still to be crawled. The list is consumed: each page is removed as it is handed to a task.
/// </param>
/// <returns>
/// A task for the crawl. The method contains no <c>await</c>, so all work has finished on the
/// calling thread by the time the task is returned.
/// </returns>
public async Task ManageInternalCrawl(InBoundLinkCheckerViewModel model, List <Examine.SearchResult> PublishedPages)
        {
            // Maximum number of page-crawl tasks allowed to run at the same time.
            const int MaxConcurrentTasks = 8;

            var TaskID   = 0;
            var TaskList = new Dictionary <int, Task <CrawlerModel> >();

            // Run at least once so the cache is populated even when there is nothing to crawl,
            // then keep going while pages are waiting or tasks are still running.
            do
            {
                try
                {
                    // Top the pool up to the concurrency limit.
                    while (TaskList.Count < MaxConcurrentTasks && PublishedPages.Count > 0)
                    {
                        // Take the page off the queue on this thread BEFORE starting the task.
                        // Reading PublishedPages inside the lambda would race with the removal
                        // below, so pages could be skipped, crawled twice, or the list could be
                        // modified while another thread is enumerating it.
                        var PageBatch = PublishedPages.Take(1).ToList();
                        PublishedPages.RemoveRange(0, 1);

                        TaskID++;
                        TaskList.Add(TaskID, Task.Run(() => ProcessPage(model, PageBatch)));
                    }

                    // Sleep until at least one task has finished instead of spinning on IsCompleted.
                    // A blocking wait (rather than await) keeps the method completing synchronously,
                    // which callers that do not await the returned task may depend on.
                    if (TaskList.Count > 0)
                    {
                        Task.WaitAny(TaskList.Values.ToArray <Task>());
                    }

                    // Collect the results of every finished task and drop those tasks from the pool.
                    // Reading Result on a faulted task throws AggregateException, which ends the
                    // crawl through the catch block below.
                    var ResultsModelList = new List <CrawlerModel>();
                    var FinishedTasks    = TaskList.Where(entry => entry.Value.IsCompleted).ToList();
                    foreach (var Finished in FinishedTasks)
                    {
                        TaskList.Remove(Finished.Key);
                        ResultsModelList.Add(Finished.Value.Result);
                    }

                    // Load the running totals from the cache, starting empty on the first pass.
                    var ResultsDictionary = cache["ResultsDictionary"] as Dictionary <string, ContentModel>
                                            ?? new Dictionary <string, ContentModel>();
                    var BrokenLinks = cache["BrokenLinks"] as List <BrokenPageModel> ?? new List <BrokenPageModel>();
                    var LinksFound  = cache["LinksFound"] as List <string> ?? new List <string>();
                    var Domains     = cache["Domains"] as List <string> ?? new List <string>();

                    // Merge each non-null result into the running totals, skipping duplicates.
                    foreach (var ResultModel in ResultsModelList)
                    {
                        if (ResultModel == null)
                        {
                            continue;
                        }

                        model.CrawledLinks += ResultModel.CrawledLinks;

                        foreach (var item in ResultModel.BrokenLinks)
                        {
                            if (!BrokenLinks.Contains(item))
                            {
                                BrokenLinks.Add(item);
                            }
                        }
                        foreach (var item in ResultModel.ResultsDictionary)
                        {
                            if (!ResultsDictionary.ContainsKey(item.Key))
                            {
                                ResultsDictionary.Add(item.Key, item.Value);
                            }
                        }
                        foreach (var item in ResultModel.LinksFound)
                        {
                            if (!LinksFound.Contains(item))
                            {
                                LinksFound.Add(item);
                            }
                        }
                        foreach (var item in ResultModel.Domains)
                        {
                            if (!Domains.Contains(item))
                            {
                                Domains.Add(item);
                            }
                        }
                    }

                    // Record how many pages have been crawled and verified, then persist progress.
                    model.IndexedPagesTotal = ResultsDictionary.Count;
                    StoreModelInCache(model);
                    StoreResultsInCache(ResultsDictionary, LinksFound, BrokenLinks, Domains);
                }
                catch (Exception ex)
                {
                    // Stop the crawl and keep whatever has been stored so far. Task failures arrive
                    // wrapped in AggregateException, but other exceptions have no inner exception,
                    // so fall back to the exception itself rather than dereferencing null.
                    model.DataBeingGenerated  = false;
                    model.CachedDataAvailable = true;
                    model.ErrorOccured        = (ex.InnerException ?? ex).Message;
                    StoreModelInCache(model);
                    break;
                }
            }
            while (PublishedPages.Count > 0 || TaskList.Count > 0);

            // The crawl has ended: tell the view that data is no longer being generated and that
            // cached data is available.
            model.DataBeingGenerated  = false;
            model.CachedDataAvailable = true;
            StoreModelInCache(model);
        }
// Example #2
// 0
        /// <summary>
        /// Loads every page in <c>AllLinks</c> and scrapes film, actor and director details from it.
        /// Links whose document has no child nodes are added to <c>BrokenLinks</c> and skipped.
        /// A film is added to <c>m_Films</c> only when a title was found. Up to five actors per page
        /// go to <c>m_Actors</c> and every director goes to <c>m_Directors</c>.
        /// </summary>
        public void LoadFilmsInfo()
        {
            // Number of leading characters stripped from a link to build the href that the
            // release-date anchor is expected to carry.
            // NOTE(review): the original comment cited "http://www.filmweb.pl", which is 21
            // characters, not 17 — confirm which prefix the entries of AllLinks really use.
            const int LinkPrefixLength = 17;

            // Only this many actors are recorded per film.
            const int MaxActorsPerFilm = 5;

            System.Console.WriteLine("***********************************");
            System.Console.WriteLine("Wczytuje filmy.");
            System.Console.WriteLine("***********************************");
            foreach (var link in AllLinks)
            {
                Film film = new Film();

                Document = HtmlWeb.Load(link);

                // A document without child nodes is treated as a broken link.
                if (!Document.DocumentNode.HasChildNodes)
                {
                    BrokenLinks.Add(link);
                    continue;
                }

                film.FilmwebUrl = link;

                // For each field below the last matching element wins (the loops do not stop early).
                Titles = Document.DocumentNode.Descendants("a");
                foreach (var title in Titles)
                {
                    if (title.GetAttributeValue("typeof", "test") == "v:Review-aggregate")
                    {
                        film.Name = WebUtility.HtmlDecode(title.InnerText);
                    }
                }

                OriginalTitles = Document.DocumentNode.Descendants("h2");
                foreach (var title in OriginalTitles)
                {
                    if (title.GetAttributeValue("class", "test") == "cap s-16 top-5")
                    {
                        film.OriginalName = WebUtility.HtmlDecode(title.InnerText);
                    }
                }

                Genres = Document.DocumentNode.Descendants("ul");
                foreach (var genre in Genres)
                {
                    if (genre.GetAttributeValue("class", "test") == "inline sep-comma genresList")
                    {
                        film.Genre = WebUtility.HtmlDecode(genre.InnerText);
                        film.Genre = AddSpacesToSentence(film.Genre);
                    }
                }

                // The expected href is the same for every anchor, so build it once per link.
                // A link shorter than the prefix cannot match anything; leaving the href null
                // avoids the ArgumentOutOfRangeException that Substring would otherwise throw.
                string datesHref = link.Length >= LinkPrefixLength
                    ? link.Substring(LinkPrefixLength) + "/dates"
                    : null;

                ReleaseDates = Document.DocumentNode.Descendants("a");
                if (datesHref != null)
                {
                    foreach (var date in ReleaseDates)
                    {
                        if (date.GetAttributeValue("href", "test") == datesHref)
                        {
                            film.ReleaseDate = WebUtility.HtmlDecode(date.InnerText);
                        }
                    }
                }

                Descriptions = Document.DocumentNode.Descendants("div");
                foreach (var description in Descriptions)
                {
                    if (description.GetAttributeValue("class", "test") == "filmPlot bottom-15")
                    {
                        film.Description = WebUtility.HtmlDecode(description.InnerText);
                    }
                }

                // The cover is the image whose alt text contains the film title, ignoring spaces.
                // Skip the search when no usable title was found: a null title used to throw
                // NullReferenceException here, and an empty one is contained in every alt text,
                // which would pick an arbitrary image.
                Covers = Document.DocumentNode.Descendants("img");
                string titleWithoutSpaces = film.Name == null
                    ? string.Empty
                    : film.Name.Replace(" ", string.Empty);
                if (titleWithoutSpaces.Length > 0)
                {
                    foreach (var cover in Covers)
                    {
                        string imgContent = cover.GetAttributeValue("alt", "test");
                        string imgContentWithoutSpaces = imgContent.Replace(" ", string.Empty);
                        if (imgContentWithoutSpaces.Contains(titleWithoutSpaces))
                        {
                            film.Cover = cover.GetAttributeValue("src", "test");
                        }
                    }
                }

                if (film.Name != null)
                {
                    m_Films.Add(film);
                    System.Console.WriteLine("Pobrano zawartosc filmu: " + film.Name);
                }

                // NOTE(review): actors and directors are recorded even when the film itself was
                // not added above (film.Name is null) — confirm that this is intended.
                int actorsCount = 0;
                Actors = Document.DocumentNode.Descendants("a");
                foreach (var actorName in Actors)
                {
                    if (actorName.GetAttributeValue("rel", "test") == "v:starring")
                    {
                        Actor actor = new Actor();
                        actor.Name     = WebUtility.HtmlDecode(actorName.InnerText);
                        actor.FilmName = film.Name;

                        if (actor.Name != null)
                        {
                            m_Actors.Add(actor);
                            actorsCount++;
                        }

                        // Stop once the per-film actor limit has been reached.
                        if (actorsCount >= MaxActorsPerFilm)
                        {
                            break;
                        }
                    }
                }

                Directors = Document.DocumentNode.Descendants("a");
                foreach (var directorName in Directors)
                {
                    if (directorName.GetAttributeValue("rel", "test") == "v:directedBy")
                    {
                        Director director = new Director();
                        director.Name     = WebUtility.HtmlDecode(directorName.InnerText);
                        director.FilmName = film.Name;

                        if (director.Name != null)
                        {
                            m_Directors.Add(director);
                        }
                    }
                }
            }
        }