/// <summary>
/// Prepares the crawl state for the configured site and launches the crawl loop.
/// </summary>
/// <remarks>
/// The start address is composed as <c>httpValue + "://www." + websitelToCrawler</c> and
/// written back to <c>websitelToCrawler</c>. The site root (scheme and authority plus a
/// trailing slash) is queued as the first entry of <c>UrlToDo</c>.
/// <c>StartCrawlerAsync</c> is invoked without being awaited.
/// </remarks>
private void StartWebCrawler()
{
    // Compose the absolute start address and publish it as the current URL.
    websitelToCrawler = httpValue + "://www." + websitelToCrawler;
    CurrentUrl = websitelToCrawler;

    // Parse once; the parsed form is kept alongside its textual root.
    var siteUri = new Uri(websitelToCrawler);
    fullUriWebsite = siteUri;

    // Root of the site: "<scheme>://<authority>/".
    fullUrlWebsite = siteUri.GetLeftPart(UriPartial.Authority) + "/";

    // Seed the work queue with the root page, then start crawling.
    UrlToDo.Add(fullUrlWebsite);
    StartCrawlerAsync();
}
/// <summary>
/// Crawl loop: processes the first entry of <c>UrlToDo</c> repeatedly until the queue is
/// empty or the status becomes <c>onPause</c> / <c>onStop</c>.
/// </summary>
/// <remarks>
/// For each URL the page is fetched with <c>GetHtmlDocument</c>. A non-null response is
/// passed to <c>AnalizePage</c> and its <c>AbsoluteUri</c> is recorded in <c>UrlDone</c>;
/// a null response or an exception records the URL in <c>UrlError</c>. In every case the
/// URL is removed from the queue so the loop always makes progress.
/// When the loop ends with an empty queue the status becomes <c>finish</c> and
/// <c>CurrentUrl</c> is cleared; otherwise the status is set to <c>onStartup</c>.
/// The method is <c>async void</c>, so callers cannot await it or observe exceptions that
/// escape the per-URL try/catch.
/// </remarks>
private async void StartCrawlerAsync()
{
    while ((UrlToDo.Count != 0)
        && (urlStatus.Status != Models.EnumStatus.onPause)
        && (urlStatus.Status != Models.EnumStatus.onStop))
    {
        // Tracks whether the head of the queue was already removed in this iteration,
        // so the catch block never removes a second (unprocessed) entry.
        bool dequeued = false;

        try
        {
            CurrentUrl = UrlToDo[0];
            // Notify with the property NAME; the original passed the URL value itself.
            NotifyPropertyChanged("CurrentUrl");

            HtmlResponseData replay = await GetHtmlDocument(CurrentUrl);
            if (replay != null)
            {
                AnalizePage(replay);
                UrlDone.Add(replay.AbsoluteUri);
                NotifyPropertyChanged("UrlDone");
            }
            else
            {
                UrlError.Add(CurrentUrl);
                NotifyPropertyChanged("UrlError");
            }

            // Links found by AnalizePage are appended, so index 0 is still the current URL.
            UrlToDo.RemoveAt(0);
            dequeued = true;
            NotifyPropertyChanged("UrlToDo");
        }
        catch (Exception)
        {
            // Drop the failing URL first: leaving it at the head of the queue would make
            // the loop retry it forever.
            if (!dequeued && UrlToDo.Count != 0)
            {
                UrlToDo.RemoveAt(0);
                NotifyPropertyChanged("UrlToDo");
            }

            UrlError.Add(CurrentUrl);
            NotifyPropertyChanged("UrlError");
        }
    }

    if (UrlToDo.Count == 0)
    {
        // Nothing left to crawl.
        urlStatus.Status = Models.EnumStatus.finish;
        CurrentUrl = "";
        NotifyPropertyChanged("CurrentUrl");
    }
    else
    {
        // Loop was left because of pause/stop while work remains.
        // NOTE(review): UrlStatus (property) is used here while urlStatus (field) is used
        // above — presumably the same object; confirm.
        UrlStatus.Status = Models.EnumStatus.onStartup;
    }

    NotifyPropertyChanged("UrlStatus");
}
/// <summary>
/// Extracts the <c>href</c> of every anchor in the fetched page and queues the ones
/// accepted by <c>IsValidLink</c> in <c>UrlToDo</c>.
/// </summary>
/// <param name="replay">Fetched page; its <c>document</c> is queried for <c>//a[@href]</c>.</param>
/// <returns>
/// <c>true</c> when the page contained at least one anchor and processing completed;
/// <c>false</c> when no anchors were found or an exception occurred.
/// </returns>
private bool AnalizePage(HtmlResponseData replay)
{
    try
    {
        HtmlNodeCollection anchors = replay.document.DocumentNode.SelectNodes("//a[@href]");

        // SelectNodes yields null (not an empty collection) when nothing matches, so the
        // null case must be handled explicitly instead of relying on the catch below.
        if (anchors == null || anchors.Count == 0)
        {
            return false;
        }

        foreach (HtmlNode link in anchors)
        {
            // The attribute indexer returns null for a missing attribute; no try/catch needed.
            HtmlAttribute att = link.Attributes["href"];
            if (att == null)
            {
                continue;
            }

            // IsValidLink returns null for links that must not be queued.
            string linkToAdd = IsValidLink(att.Value);
            if (linkToAdd != null)
            {
                // Invariant lower-casing keeps URL normalization independent of the
                // current UI culture.
                UrlToDo.Add(linkToAdd.ToLowerInvariant().Trim());
                // The queue is what changed here (the original notified "UrlDone").
                NotifyPropertyChanged("UrlToDo");
            }
        }

        // NOTE(review): argument-less notification kept from the original; which property
        // it signals depends on NotifyPropertyChanged's default argument — confirm.
        NotifyPropertyChanged();
    }
    catch (Exception err)
    {
        // Best-effort: a page that cannot be analysed is reported as a failure, but the
        // reason is no longer discarded silently.
        System.Diagnostics.Debug.WriteLine(err);
        return false;
    }

    return true;
}