/// <summary>
/// Collects the text content of every div element found in <c>c.HTMLText</c> and stores it,
/// one div per line, in <c>c.ExtractedDivs</c>.
/// </summary>
/// <param name="c">Crawler whose <c>HTMLText</c> is scanned and whose <c>ExtractedDivs</c> is overwritten.</param>
private void ExtractDivs(Crawler c)
{
    // A missing page body is treated as an empty document instead of letting Regex.Matches throw.
    string html = c.HTMLText ?? string.Empty;

    // Singleline makes '.' match line breaks, so divs that span several lines are found.
    // (Multiline only alters '^' and '$', which this pattern never uses.)
    // IgnoreCase additionally accepts upper-case tags such as <DIV>.
    MatchCollection divs = Regex.Matches(
        html,
        @"<div\s?(.*?)>(.*?)</div",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    StringBuilder text = new StringBuilder();
    foreach (Match div in divs)
    {
        // Group 2 is the div body; every nested tag (even one wrapped over lines) becomes one space.
        string inner = Regex.Replace(div.Groups[2].Value, "(<.*?>)", " ", RegexOptions.Singleline);
        if (!String.IsNullOrWhiteSpace(inner))
        {
            text.Append(inner);
            text.Append("\n");
        }
    }

    c.ExtractedDivs = text.ToString();
}
/// <summary>
/// Click handler that starts a new thread running <c>Crawler.CrawlNext</c> with the depth read from
/// <c>DepthTextBox</c>, records the thread in <c>Tasks</c>, and adds a fixed label to the
/// <c>TaskNames</c> list bound to <c>ThreadslistBox</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void StartCrawlbutton_Copy_Click(object sender, RoutedEventArgs e)
{
    try
    {
        // Copy the depth into a local so the worker lambda does not touch the control.
        int depth = (int)DepthTextBox.Value;

        Thread worker = new Thread(() =>
        {
            Crawler c = new Crawler();
            c.CrawlNext(depth);
        });

        Tasks.Add(worker);
        worker.Start();

        ThreadslistBox.BeginInit();
        TaskNames.Add("Crawling from last visited links");
        ThreadslistBox.DataContext = TaskNames;
        ThreadslistBox.EndInit();
    }
    catch (System.Exception ex)
    {
        // Still best-effort (the handler never rethrows), but the failure is no longer invisible.
        // Only setup errors land here; exceptions thrown inside the worker thread do not.
        System.Diagnostics.Trace.TraceError("Failed to start crawl from last visited links: " + ex);
    }
}
/// <summary>
/// Click handler that starts a new thread running <c>Crawler.Crawl</c> with the link read from
/// <c>StartLinkTextBox</c> and the depth read from <c>DepthTextBox</c>. The thread is named after
/// the link, recorded in <c>Tasks</c>, and its name is added to the <c>TaskNames</c> list bound to
/// <c>ThreadslistBox</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void StartCrawlbutton_Click(object sender, RoutedEventArgs e)
{
    try
    {
        // Copy the control values into locals so the worker lambda does not touch the controls.
        string startLink = StartLinkTextBox.Text;
        int depth = (int)DepthTextBox.Value;

        Thread worker = new Thread(() =>
        {
            Crawler c = new Crawler();
            c.Crawl(startLink, depth);
        });

        Tasks.Add(worker);
        worker.Name = startLink;
        worker.Start();

        ThreadslistBox.BeginInit();
        TaskNames.Add(worker.Name);
        ThreadslistBox.DataContext = TaskNames;
        ThreadslistBox.EndInit();
    }
    catch (System.Exception ex)
    {
        // Still best-effort (the handler never rethrows), but the failure is no longer invisible.
        // Only setup errors land here; exceptions thrown inside the worker thread do not.
        System.Diagnostics.Trace.TraceError("Failed to start crawl: " + ex);
    }
}
/// <summary>
/// Forwards <paramref name="url"/> to <c>_crawler.ParseLinkText</c>, creating the
/// <c>_crawler</c> instance first if none exists yet.
/// </summary>
/// <param name="url">Value handed unchanged to the crawler.</param>
public void ParseLinkText(String url)
{
    bool crawlerMissing = _crawler == null;
    if (crawlerMissing)
    {
        _crawler = new Crawler();
    }

    _crawler.ParseLinkText(url);
}
/// <summary>
/// Forwards the call to <c>_crawler.Crawl</c>, creating the <c>_crawler</c> instance first
/// if none exists yet.
/// </summary>
/// <param name="start_url">Value handed unchanged to the crawler as its first argument.</param>
/// <param name="depth">Value handed unchanged to the crawler as its second argument.</param>
public void Crawl(String start_url, int depth)
{
    bool crawlerMissing = _crawler == null;
    if (crawlerMissing)
    {
        _crawler = new Crawler();
    }

    _crawler.Crawl(start_url, depth);
}
/// <summary>
/// Runs <c>ExtractHeaders</c> on the crawler, then passes it to the base implementation.
/// </summary>
/// <param name="c">Crawler instance being processed.</param>
public override void SendNext(Crawler c)
{
    ExtractHeaders(c);

    // The base call is unconditional: the crawler is always forwarded.
    base.SendNext(c);
}
/// <summary>
/// Passes the crawler to <c>Next.SendNext</c> when <c>Next</c> is set; otherwise does nothing.
/// </summary>
/// <param name="c">Crawler instance being processed.</param>
public virtual void SendNext(Crawler c)
{
    if (this.Next == null)
    {
        return;
    }

    this.Next.SendNext(c);
}
/// <summary>
/// Reads the text of the first title element in <c>c.HTMLText</c> and stores it in
/// <c>c.ExtractedTitle</c>. When there is no title, or it is empty or whitespace only,
/// <c>c.ExtractedTitle</c> is set to <c>null</c> so a value from an earlier call never survives.
/// </summary>
/// <param name="c">Crawler whose <c>HTMLText</c> is scanned and whose <c>ExtractedTitle</c> is overwritten.</param>
private void ExtractTitle(Crawler c)
{
    // A missing page body simply yields "no title" (the original reached the same result via catch).
    string html = c.HTMLText ?? string.Empty;

    // Singleline makes '.' match line breaks, so a title wrapped over several lines is found.
    // (Multiline only alters '^' and '$', which this pattern never uses.)
    // IgnoreCase additionally accepts upper-case tags such as <TITLE>.
    Match title = Regex.Match(
        html,
        @"<title\s?(.*?)>(.*?)</",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // Group 2 is the text between the opening tag and the next "</"; it is stored verbatim.
    string text = title.Success ? title.Groups[2].Value : null;

    // Always assign: a blank title must clear the property instead of leaving it untouched.
    c.ExtractedTitle = String.IsNullOrWhiteSpace(text) ? null : text;
}
/// <summary>
/// Runs <c>ExtractTitle</c> on the crawler and passes it to the base implementation only when
/// <c>c.ExtractedTitle</c> is neither null, empty, nor whitespace.
/// </summary>
/// <param name="c">Crawler instance being processed.</param>
public override void SendNext(Crawler c)
{
    ExtractTitle(c);

    bool titleMissing = String.IsNullOrWhiteSpace(c.ExtractedTitle);
    if (titleMissing)
    {
        // Without a usable title the crawler is not forwarded any further.
        return;
    }

    base.SendNext(c);
}
/// <summary>
/// Runs <c>ExtractParagraphs</c> on the crawler, then passes it to the base implementation.
/// </summary>
/// <param name="c">Crawler instance being processed.</param>
public override void SendNext(Crawler c)
{
    ExtractParagraphs(c);

    // The base call is unconditional: the crawler is always forwarded.
    base.SendNext(c);
}
/// <summary>
/// Runs <c>ExtractLinksText</c> on the crawler, then passes it to the base implementation.
/// </summary>
/// <param name="c">Crawler instance being processed.</param>
public override void SendNext(Crawler c)
{
    ExtractLinksText(c);

    // The base call is unconditional: the crawler is always forwarded.
    base.SendNext(c);
}
/// <summary>
/// Finds every <c>href=</c> occurrence in <c>c.HTMLText</c>, converts each match with
/// <c>ExtractSingleLink</c> (given the match text and <c>c.URL</c>), and adds every
/// non-blank result to <c>c.LINKS</c>.
/// </summary>
/// <param name="c">Crawler whose <c>HTMLText</c> is scanned and whose <c>LINKS</c> collection is appended to.</param>
private void ExtractLinks(Crawler c)
{
    // The pattern captures from "href=" to the end of that line ('.' stops at a line break).
    foreach (Match hrefMatch in Regex.Matches(c.HTMLText, @"href=(.*)"))
    {
        string resolved = this.ExtractSingleLink(hrefMatch.Value, c.URL);
        if (String.IsNullOrWhiteSpace(resolved))
        {
            continue;
        }

        c.LINKS.Add(resolved);
    }
}