/// <summary>
/// Worker loop: while <c>DoProcessing</c> is true, pulls tasks from
/// <c>QueueManager</c>, processes each URL, and raises
/// <c>OnChildLinkProcessed</c> when child links were found.
/// Sleeps briefly when the queue is empty to avoid busy-spinning.
/// </summary>
private void ProcessingFunction()
{
    while (DoProcessing)
    {
        try
        {
            TaskData taskData = QueueManager.GetTask();
            if (taskData == null)
            {
                // Queue is empty — back off briefly before polling again.
                Thread.Sleep(250);
                continue;
            }

            var candidates = ProcessUrl(taskData);
            if (candidates != null && candidates.Count > 0)
            {
                OnChildLinkProcessed?.Invoke(candidates);
            }
        }
        catch (Exception ex)
        {
            // Fix: the original empty catch swallowed every error silently,
            // hiding failures entirely. Log it and keep the worker alive.
            Console.Error.WriteLine($"ProcessingFunction: unhandled error: {ex}");
        }
    }
}
/// <summary>
/// Forwards a processed link to all subscribers of <c>OnLinkProcessed</c>.
/// No-op when the event has no subscribers.
/// </summary>
/// <param name="link">The task data for the link that finished processing.</param>
private void OnLinkProcessedHandler(TaskData link) => OnLinkProcessed?.Invoke(link);
/// <summary>
/// Downloads the page at <paramref name="taskData"/>'s URL, records the content
/// type and response length on its link, and — while the task's depth has not
/// exceeded the configured maximum — extracts child links, enqueues each as a
/// new <c>TaskData</c>, and attaches them to <c>taskData.ChildTasks</c>.
/// On any failure the link's content type is set to "failed". The task is
/// always marked done, and <c>OnLinkProcessed</c> is always raised.
/// </summary>
/// <param name="taskData">Task whose link will be fetched; mutated in place.</param>
/// <returns>Always an empty list in the current implementation — kept for
/// interface compatibility, since callers check <c>Count &gt; 0</c>.</returns>
private List<Link> ProcessUrl(TaskData taskData)
{
    var result = new List<Link>();
    var domain = new Uri(taskData.Link.Url).Host;

    try
    {
        WebRequest request = WebRequest.Create(taskData.Link.Url);
        request.Credentials = CredentialCache.DefaultCredentials;
        // Some sites reject requests without a recognizable User-Agent.
        request.Headers.Add("User-Agent", "PostmanRuntime/7.24.0");

        int maxDepth = WebCrawler.Instance.settings.MaxDepth;
        string responseString;

        // Fix: the original never disposed the WebResponse (or its stream),
        // leaking connections under sustained crawling.
        using (WebResponse response = request.GetResponse())
        {
            taskData.Link.ContentType = response.ContentType;

            // Read the full HTML body.
            using (var reader = new StreamReader(response.GetResponseStream()))
            {
                responseString = reader.ReadToEnd();
            }
        }

        taskData.Link.ResponseLength = responseString.Length;

        // If we have not exceeded the target depth, look for links one level deeper.
        if (taskData.DepthLevel <= maxDepth)
        {
            // Parse the HTML and resolve relative URLs against the page's domain.
            var raw = HtmlAgilityPack(responseString);
            var childLinks = MakeAbsolutUrls(raw, domain);

            foreach (var linkUrl in childLinks)
            {
                TaskData newTask = new TaskData()
                {
                    DepthLevel = taskData.DepthLevel + 1,
                    IsDone = false,
                    Link = new Link() { Url = linkUrl }
                };
                QueueManager.AddTask(newTask);
                taskData.ChildTasks.Add(newTask);
            }
        }
    }
    catch (Exception)
    {
        // Fix: the original had two byte-identical catch blocks (WebException
        // and Exception) — merged into one. Any failure (network or parsing)
        // marks the link as failed; the crawler itself keeps going.
        taskData.Link.ContentType = "failed";
    }
    finally
    {
        // Always mark the task done, even on failure, so the queue/task tree
        // can make progress.
        taskData.IsDone = true;
    }

    OnLinkProcessed?.Invoke(taskData);
    return result;
}