Beispiel #1
0
        /// <summary>
        /// Extracts every link matched by <c>_LINK_REGEX</c> from the page text and files it
        /// under the external, other (non-page) or valid URL collection.
        /// </summary>
        /// <remarks>
        /// Relative links are resolved against <paramref name="sourceUrl"/>. A match whose href
        /// cannot be extracted, or cannot be turned into an absolute URI, is skipped so that a
        /// single malformed link does not abort parsing of the remaining links.
        /// </remarks>
        /// <param name="pageText">HTML content of the page</param>
        /// <param name="sourceUrl">URL of the page; used as the base when resolving relative links</param>
        public void ParseLinks(string pageText, Uri sourceUrl)
        {
            foreach (Match anchorMatch in Regex.Matches(pageText, _LINK_REGEX))
            {
                string foundHref;
                if (!TryExtractHref(anchorMatch.Value, out foundHref))
                {
                    // Empty match or no closing quote: nothing usable to record.
                    continue;
                }

                // A bare "/" points at the site root and is deliberately not collected.
                if (foundHref == "/")
                {
                    continue;
                }

                Uri uriFoundHref;
                if (!TryResolveLink(sourceUrl, foundHref, out uriFoundHref))
                {
                    // Neither a valid absolute URI nor resolvable against the page URL.
                    continue;
                }

                if (ValidUrls.Contains(uriFoundHref))
                {
                    continue;
                }

                // The classification helpers receive the raw href text, exactly as before.
                if (IsExternalUrl(foundHref))
                {
                    _externalUrls.Add(uriFoundHref);
                }
                else if (!IsAWebPage(foundHref))
                {
                    _otherUrls.Add(uriFoundHref);
                }
                else
                {
                    // NOTE(review): an earlier revision appended "/" to the href string at this
                    // point, but only after the Uri had been built, so the stored value was never
                    // affected. The no-op was removed; decide separately whether trailing-slash
                    // normalisation is actually wanted.
                    ValidUrls.Add(uriFoundHref);
                }
            }
        }

        /// <summary>
        /// Strips the <c>href="</c> prefix from the matched text and returns everything up to
        /// the next double quote.
        /// </summary>
        /// <param name="matchedText">Text of a single regex match</param>
        /// <param name="href">The extracted href value, or <c>null</c> when extraction fails</param>
        /// <returns><c>true</c> when a non-empty href value was extracted</returns>
        private static bool TryExtractHref(string matchedText, out string href)
        {
            href = null;

            if (string.IsNullOrEmpty(matchedText))
            {
                return false;
            }

            string candidate = matchedText.Replace("href=\"", "");
            int closingQuote = candidate.IndexOf('"');

            // -1: no terminating quote; 0: empty href value. Neither yields a usable link.
            if (closingQuote <= 0)
            {
                return false;
            }

            href = candidate.Substring(0, closingQuote);
            return true;
        }

        /// <summary>
        /// Turns an href into an absolute <see cref="Uri"/>, resolving relative references
        /// against the URL of the page they were found on.
        /// </summary>
        /// <param name="sourceUrl">URL of the page containing the link; may be <c>null</c></param>
        /// <param name="href">Raw href value</param>
        /// <param name="resolved">The absolute URI, or <c>null</c> when resolution fails</param>
        /// <returns><c>true</c> when an absolute URI could be produced</returns>
        private static bool TryResolveLink(Uri sourceUrl, string href, out Uri resolved)
        {
            // Relative is tried first so that rooted paths such as "/about" are resolved
            // against the page URL instead of being interpreted as local file paths.
            Uri relative;
            if (Uri.TryCreate(href, UriKind.Relative, out relative))
            {
                // Returns false (rather than throwing) when sourceUrl is null or not absolute.
                return Uri.TryCreate(sourceUrl, relative, out resolved);
            }

            return Uri.TryCreate(href, UriKind.Absolute, out resolved);
        }
Beispiel #2
0
        /// <summary>
        /// Logs that the actor has become Ready and registers the message handlers for
        /// <c>UnprocessedUrlsMessage</c>, <c>ProcessedUrlMessage</c> and
        /// <c>UrlForUrlAndObjectParsingMessage</c>.
        /// </summary>
        private void Ready()
        {
            ColorConsole.WriteLine($"{UrlTrackerActorName} has become Ready", ConsoleColor.Red);

            Receive<UnprocessedUrlsMessage>(message =>
            {
                // TODO: could the URLs be lower-cased before de-duplication, or would that
                // produce false positives for case-sensitive URLs when they are visited later?
                List<Uri> candidates = message.Urls.Distinct().ToList();

                foreach (Uri url in candidates)
                {
                    // Validation: only URLs whose text contains the project domain, and which
                    // have not been recorded yet, are accepted.
                    // TODO: add a check for disallowed words?
                    bool belongsToProjectDomain =
                        url.OriginalString.Contains(ProjectDefinition.Domain.OriginalString);

                    if (!belongsToProjectDomain || ValidUrls.Contains(url))
                    {
                        continue;
                    }

                    ValidUrls.Add(url);

                    // Dispatch: with a fixed URL list, only the start URLs are parsed for both
                    // URLs and objects; every other URL is parsed for objects only.
                    // (Candidate for moving to a 'Shepherd' actor.)
                    bool parseObjectsOnly =
                        ProjectDefinition.IsFixedListOfUrls &&
                        !ProjectDefinition.StartUrls.Contains(url);

                    // The Browser actor acts as the load balancer.
                    if (parseObjectsOnly)
                    {
                        Context.ActorSelection(ActorPaths.Browser).Tell(new UrlForObjectParsingMessage(url));
                    }
                    else
                    {
                        Context.ActorSelection(ActorPaths.Browser).Tell(new UrlForUrlAndObjectParsingMessage(url));
                    }
                }
            });

            Receive<ProcessedUrlMessage>(message => ProcessedUrls.Add(message.Url));

            // The handler body is empty: the message is accepted and no action is taken.
            Receive<UrlForUrlAndObjectParsingMessage>(message =>
            {
            });
        }