/// <summary>
/// Async-compatible gate that lets only one crawl run at a time.
/// </summary>
/// <remarks>
/// A <see cref="SemaphoreSlim"/> is used instead of <see cref="Monitor"/> on <c>BlockingObject</c>
/// because a monitor must be released by the thread that acquired it, while the code after an
/// <c>await</c> may resume on a different thread.
/// </remarks>
private static readonly SemaphoreSlim CrawlGate = new SemaphoreSlim(1, 1);

/// <summary>
/// Configures the crawler, seeds the queue with the root URL at depth 0 and runs the crawl.
/// </summary>
/// <param name="options">Callback that fills in a fresh <c>WebCrawlerSettings</c> instance.</param>
/// <returns>A task that completes when <c>Instance.Run()</c> has finished.</returns>
/// <exception cref="ArgumentNullException"><paramref name="options"/> is <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">
/// Another crawl is still running and the gate could not be acquired within 50 ms.
/// </exception>
public static async Task RunCrowler(Action<WebCrawlerSettings> options)
{
    if (options == null)
    {
        throw new ArgumentNullException(nameof(options));
    }

    // Wait at most 50 ms for a previous run to finish, then report the worker as busy.
    if (!await CrawlGate.WaitAsync(50))
    {
        throw new InvalidOperationException("Worker Is Busy!");
    }

    try
    {
        var settings = new WebCrawlerSettings();
        options.Invoke(settings);
        Instance.settings = settings;

        QueueManager.Init(settings.MaxDepth);
        QueueManager.AddTask(new TaskData()
        {
            DepthLevel = 0,
            Link = new Models.Link() { Url = settings.RootUrl }
        });

        await Instance.Run();
    }
    finally
    {
        // Safe from any thread, unlike Monitor.Exit, which requires the owning thread.
        CrawlGate.Release();
    }
}
/// <summary>
/// Downloads the resource behind <paramref name="taskData"/>, records its content type and
/// length on the link, and queues one child task per link found in the response.
/// </summary>
/// <param name="taskData">
/// Task describing the link to fetch. It is updated in place: <c>Link.ContentType</c>,
/// <c>Link.ResponseLength</c>, <c>ChildTasks</c> and <c>IsDone</c> are written here.
/// </param>
/// <returns>
/// A list that is always empty. Discovered links are published through
/// <c>QueueManager.AddTask</c> and <c>taskData.ChildTasks</c>, not through the return value.
/// </returns>
/// <remarks>
/// Any failure (bad URL, network error, parsing error) is swallowed and reported by setting
/// <c>Link.ContentType</c> to <c>"failed"</c>. <c>OnLinkProcessed</c> is raised in both cases.
/// </remarks>
private List<Link> ProcessUrl(TaskData taskData)
{
    var result = new List<Link>();

    try
    {
        // Parsed inside the try block so that a malformed URL marks the task as failed
        // instead of escaping before IsDone is set and OnLinkProcessed is raised.
        string domain = new Uri(taskData.Link.Url).Host;

        WebRequest request = WebRequest.Create(taskData.Link.Url);
        request.Credentials = CredentialCache.DefaultCredentials;

        // User-Agent is a restricted header on HttpWebRequest in .NET Framework and must be
        // set through the property there; other request types take it as a plain header.
        const string userAgent = "PostmanRuntime/7.24.0";
        var httpRequest = request as HttpWebRequest;
        if (httpRequest != null)
        {
            httpRequest.UserAgent = userAgent;
        }
        else
        {
            request.Headers.Add("User-Agent", userAgent);
        }

        string responseString;

        // Dispose the response so the underlying connection is released.
        using (WebResponse response = request.GetResponse())
        {
            taskData.Link.ContentType = response.ContentType;

            // Read the whole body as text.
            using (var reader = new StreamReader(response.GetResponseStream()))
            {
                responseString = reader.ReadToEnd();
            }
        }

        // Length in characters of the decoded body, not bytes on the wire.
        taskData.Link.ResponseLength = responseString.Length;

        // If the depth has not exceeded the target, look for links one level deeper.
        int maxDepth = WebCrawler.Instance.settings.MaxDepth;
        if (taskData.DepthLevel <= maxDepth)
        {
            // Parse the page and resolve the found links against the page's host.
            var raw = HtmlAgilityPack(responseString);
            var childLinks = MakeAbsolutUrls(raw, domain);

            foreach (var linkUrl in childLinks)
            {
                var newTask = new TaskData()
                {
                    DepthLevel = taskData.DepthLevel + 1,
                    IsDone = false,
                    Link = new Link() { Url = linkUrl }
                };

                QueueManager.AddTask(newTask);
                taskData.ChildTasks.Add(newTask);
            }
        }
    }
    catch (Exception)
    {
        // Best-effort crawl: a failing link is flagged, not propagated.
        taskData.Link.ContentType = "failed";
    }
    finally
    {
        taskData.IsDone = true;
    }

    OnLinkProcessed?.Invoke(taskData);
    return result;
}