Beispiel #1
0
        /// <summary>
        /// Downloads the page at <paramref name="baseUrl"/>, hands its HTML to
        /// <paramref name="fileSaver"/>, and returns the URIs extracted from it that are
        /// not already contained in <paramref name="hashSet"/>.
        /// </summary>
        /// <param name="baseUrl">Address of the page to download; also passed to the extractor together with the HTML.</param>
        /// <param name="fileSaver">Receives the downloaded HTML through <c>SaveHtmlFile</c>.</param>
        /// <param name="uriExtractor">Produces the URI list from the downloaded HTML through <c>GetUriList</c>.</param>
        /// <param name="hashSet">URIs to exclude from the result. This method only reads the set; it never modifies it.</param>
        /// <returns>
        /// A task yielding the filtered URIs, or an empty array when the download
        /// faulted or was canceled.
        /// </returns>
        private static Task <Uri[]> GetUriListFromPage(Uri baseUrl,
                                                       FileSaver fileSaver,
                                                       UriExtractor uriExtractor,
                                                       HashSet <Uri> hashSet)
        {
            // WebClient is IDisposable. Keep a reference so the continuation can release
            // it once the download has finished, whatever the outcome.
            var webClient = new WebClient();

            return webClient
                   .DownloadStringTaskAsync(baseUrl)
                   .ContinueWith(task =>
            {
                // The continuation has no options attached, so it runs for every
                // completion state and the client is always disposed.
                using (webClient)
                {
                    // Reading task.Result on a faulted or canceled task would throw;
                    // an unreachable page simply contributes no links.
                    if (task.IsFaulted || task.IsCanceled)
                    {
                        return new Uri[0];
                    }

                    Console.WriteLine(
                        $"Thread: {Thread.CurrentThread.ManagedThreadId} downloaded page: {baseUrl.AbsoluteUri}");

                    var html = task.Result;

                    fileSaver.SaveHtmlFile(baseUrl, html);

                    return uriExtractor
                           .GetUriList(html, baseUrl)
                           .Where(x => !hashSet.Contains(x))
                           .ToArray();
                }
            });
        }
Beispiel #2
0
        /// <summary>
        /// Crawls pages in batches, starting from <c>opts.Uri</c>.
        /// Each iteration takes a batch of URIs from the queue, starts
        /// <c>GetUriListFromPage</c> for every one of them, blocks until all of them
        /// finish, and then queues every returned URI that has not been seen before.
        /// The loop ends when the queue is empty, the depth limit is reached, or the
        /// page limit is reached; statistics are printed afterwards.
        /// </summary>
        /// <param name="opts">
        /// Supplies the start URI, the optional output folder (a fallback folder is
        /// requested from <c>GetPathToFolderWithResults</c> when it is null), the
        /// maximum page count and the maximum depth.
        /// </param>
        private static void Run(Options opts)
        {
            var rootUri      = new Uri(opts.Uri);
            var outputFolder = opts.Path ?? GetPathToFolderWithResults();
            var pageLimit    = opts.MaxPagesCount;
            var depthLimit   = opts.MaxDepth;

            var saver     = new FileSaver(outputFolder);
            var extractor = new UriExtractor();
            var pending   = new Queue <Uri>();

            pending.Enqueue(rootUri);

            // Every URI ever queued, so that no page is scheduled twice.
            var seen = new HashSet <Uri>();
            seen.Add(rootUri);

            var depth         = 0;
            var pagesStarted  = 0;

            while (pending.Count > 0 &&
                   depth < depthLimit &&
                   pagesStarted < pageLimit)
            {
                // Ask for no more pages than the remaining page budget allows.
                var batch     = GetAllUriListFromQueue(pending, pageLimit - pagesStarted);
                var downloads = batch
                                .Select(url => GetUriListFromPage(url, saver, extractor, seen))
                                .ToArray();

                // One batch counts as one depth level; pages are counted when started.
                depth++;
                pagesStarted += downloads.Length;

                Task.WaitAll(downloads);

                foreach (var download in downloads)
                {
                    foreach (var uri in download.Result)
                    {
                        // Add returns false for a URI that is already in the set.
                        if (seen.Add(uri))
                        {
                            pending.Enqueue(uri);
                        }
                    }
                }
            }

            PrintStats(seen, pagesStarted, pending, depth);
        }