/// <summary>
/// Extracts the links found in a page's HTML and files each new one into the
/// external, other, or valid URL collection.
/// </summary>
/// <param name="pageText">HTML content of the page</param>
/// <param name="sourceUrl">
/// URL of the page. Used as the base for resolving relative hrefs; when it is
/// null only absolute hrefs can be accepted.
/// </param>
/// <remarks>
/// Hrefs that are empty, lack a closing quote, equal "/", or cannot be turned into
/// a <see cref="Uri"/> are skipped so that one bad anchor does not abort the parse.
/// URLs already present in ValidUrls are not classified again.
/// </remarks>
public void ParseLinks(string pageText, Uri sourceUrl)
{
    if (string.IsNullOrEmpty(pageText))
    {
        return;
    }

    // NOTE(review): _LINK_REGEX is defined elsewhere; the extraction below presumes
    // each match contains an href="..." attribute - confirm against the pattern.
    foreach (Match anchorMatch in Regex.Matches(pageText, _LINK_REGEX))
    {
        // Drop the href=" prefix, then keep everything up to the closing quote.
        string foundHref = anchorMatch.Value.Replace("href=\"", "");
        int closingQuote = foundHref.IndexOf('"');
        if (closingQuote <= 0)
        {
            // Empty href or no closing quote.
            // TODO: collect these to report the list of broken URLs.
            continue;
        }

        foundHref = foundHref.Substring(0, closingQuote);
        if (foundHref == "/")
        {
            continue;
        }

        // TryCreate never throws on bad input, and the base-URI overload lets
        // relative hrefs resolve against the page they were found on.
        Uri uriFoundHref;
        bool isUsableUri = sourceUrl != null
            ? Uri.TryCreate(sourceUrl, foundHref, out uriFoundHref)
            : Uri.TryCreate(foundHref, UriKind.Absolute, out uriFoundHref);
        if (!isUsableUri)
        {
            // TODO: collect these to report the list of broken URLs.
            continue;
        }

        if (ValidUrls.Contains(uriFoundHref))
        {
            continue;
        }

        // NOTE(review): the helpers receive the raw href text, not the resolved URI;
        // verify they classify relative hrefs the way the crawler expects.
        if (IsExternalUrl(foundHref))
        {
            _externalUrls.Add(uriFoundHref);
        }
        else if (!IsAWebPage(foundHref))
        {
            _otherUrls.Add(uriFoundHref);
        }
        else
        {
            // The URI is stored as found; no trailing-slash normalization is applied.
            ValidUrls.Add(uriFoundHref);
        }
    }
}
/// <summary>
/// Announces the Ready state on the console and registers the message handlers
/// used while in it: new URLs are filtered, recorded in ValidUrls and forwarded
/// to the browser actor; processed URLs are recorded in ProcessedUrls.
/// </summary>
private void Ready()
{
    ColorConsole.WriteLine($"{UrlTrackerActorName} has become Ready", ConsoleColor.Red);

    Receive<UnprocessedUrlsMessage>(message =>
    {
        // TODO Can ToLower() be implemented? or will it result in false positives
        // when visiting the Urls later? And what about case sensitive urls?
        List<Uri> uniqueUrls = message.Urls.Distinct().ToList();

        foreach (Uri candidate in uniqueUrls)
        {
            // Validity check: the URL text must contain the project domain and
            // must not have been recorded before.
            // TODO Add check for disallowed words?
            bool belongsToProjectDomain =
                candidate.OriginalString.Contains(ProjectDefinition.Domain.OriginalString);
            if (!belongsToProjectDomain || ValidUrls.Contains(candidate))
            {
                continue;
            }

            ValidUrls.Add(candidate);

            // Message choice depends on the project definition: with a fixed URL
            // list, only the start URLs are parsed for further URLs as well as objects.
            // Move to 'Shepherd' Actor?!
            bool parseObjectsOnly =
                ProjectDefinition.IsFixedListOfUrls && !ProjectDefinition.StartUrls.Contains(candidate);

            // BrowserActor is load balancer
            var browser = Context.ActorSelection(ActorPaths.Browser);
            if (parseObjectsOnly)
            {
                browser.Tell(new UrlForObjectParsingMessage(candidate));
            }
            else
            {
                browser.Tell(new UrlForUrlAndObjectParsingMessage(candidate));
            }
        }
    });

    Receive<ProcessedUrlMessage>(processed =>
    {
        ProcessedUrls.Add(processed.Url);
    });

    // Accepted but deliberately ignored in this state.
    Receive<UrlForUrlAndObjectParsingMessage>(ignored =>
    {
    });
}