/// <summary>
/// Builds one <see cref="Event"/> per event-name cell found under <c>#heat_lists</c>
/// and fills in its time, session and heat from the other queried cells.
/// </summary>
/// <param name="result">Crawl result whose DOM is queried.</param>
/// <returns>
/// The sequence returned by the four-way <c>Zip</c> over names, times, sessions and heats
/// (a helper defined outside this method; presumably it pairs items by position).
/// </returns>
public IEnumerable<Event> Extract(CrawlResult result)
{
    var page = result.DOM;

    // One CSS query per column of interest, issued in the same order as before.
    var nameCells = page["#heat_lists > table > tbody > tr:nth-child(n + 1) > td:nth-child(4) > p"];
    var timeCells = page["#heat_lists > table > tbody > tr > td.heatlist_datetime > p"];
    var sessionCells = page["#heat_lists > table > tbody > tr > td[valign='top']:nth-child(1) > p"];
    var heatCells = page["#heat_lists > table > tbody > tr > td[valign='top']:nth-child(2) > p"];

    // The trimmed name text goes through the `regex` member, each match replaced by a single space.
    var namedEvents = nameCells.Select(cell => new Event { Name = regex.Replace(cell.InnerText.Trim(), " ") });
    var timeTexts = timeCells.Select(cell => cell.InnerText.Trim());
    var sessionTexts = sessionCells.Select(cell => cell.InnerText.Trim());
    var heatTexts = heatCells.Select(cell => cell.InnerText.Trim());

    return namedEvents.Zip(timeTexts, sessionTexts, heatTexts, (current, timeText, sessionText, heatText) =>
    {
        current.Session = sessionText;
        current.Heat = heatText;
        current.Time = timeText;
        return current;
    });
}
/// <summary>
/// Crawls every URI that <c>Extractor</c> yields for <paramref name="result"/>,
/// but only when <c>Extractor.CanExtract</c> accepts that result.
/// </summary>
/// <param name="result">The crawl result to extract follow-up URIs from.</param>
/// <returns>
/// An observable that, on subscription, evaluates <c>Extractor.CanExtract(result)</c>.
/// When it is true, each extracted URI is logged to the console and passed to the base
/// crawl with a <c>null</c> DOM; when it is false, the two-argument <c>Observable.If</c>
/// yields an empty sequence.
/// </returns>
public override IObservable<CrawlResult> Crawl(CrawlResult result)
{
    return Observable.If(
        () => Extractor.CanExtract(result),
        // Arguments to Observable.If are evaluated before the condition is consulted, so
        // without Defer the extraction would run even for results the extractor rejects.
        // Defer postpones it to subscription time; a failure inside Extract then surfaces
        // through OnError rather than as a synchronous exception from Crawl.
        Observable.Defer(() =>
            Extractor.Extract(result)
                .ToObservable()
                .Do(x => Console.WriteLine("Crawling: " + x))
                .SelectMany(uri => base.Crawl(new CrawlResult(uri, null)))));
}
/// <summary>
/// Collects the link targets found in the second column of the <c>#main_tbl</c> table.
/// </summary>
/// <param name="result">Crawl result providing the DOM and the base URI.</param>
/// <returns>
/// A lazily evaluated sequence with one <see cref="Uri"/> per matched anchor, built from
/// the anchor's <c>href</c> value resolved against <c>result.Uri</c>.
/// </returns>
public IEnumerable<Uri> Extract(CrawlResult result)
{
    // Logged immediately on call, before the returned sequence is enumerated.
    Console.WriteLine("Extracting: " + result.Uri);

    var anchors = result.DOM["#main_tbl > tbody > tr > td:nth-child(2) > a"];
    return anchors.Select(anchor => new Uri(result.Uri, anchor["href"]));
}
/// <summary>
/// Fetches <c>result.Uri</c> through <c>CreateCrawlerForUri</c> and hands every fetched
/// document to each sub-crawler that reports it can crawl it.
/// </summary>
/// <param name="result">Crawl result whose URI is fetched.</param>
/// <returns>The merged output of all sub-crawler observables.</returns>
public virtual IObservable<CrawlResult> Crawl(CrawlResult result)
{
    var documents = this.CreateCrawlerForUri(result.Uri);

    // Pair each fetched document with the URI it was requested for.
    var fetched = documents.Select(document => new CrawlResult(result.Uri, document));

    // For each fetched page, start a crawl on every sub-crawler whose CanCrawl accepts it.
    var nestedCrawls = fetched.SelectMany(
        page => this.SubCrawlers.Where(crawler => crawler.CanCrawl(page)),
        (page, crawler) => crawler.Crawl(page));

    return nestedCrawls.Merge();
}
/// <summary>
/// Follows every link matched by the <c>#placement</c> table selector and fetches it
/// through <c>CreateCrawlerForUri</c>.
/// </summary>
/// <param name="result">Crawl result providing the DOM and the base URI for the links.</param>
/// <returns>
/// An observable with one <see cref="CrawlResult"/> per document produced for each linked URI.
/// </returns>
public override IObservable<CrawlResult> Crawl(CrawlResult result)
{
    var links = result.DOM["#placement > form > table:nth-child(2) > tbody > tr > td.h5b > a"];

    // Resolve each href against the current page, then wrap it as a result with no DOM yet.
    var pendingResults = links
        .Select(link => new Uri(result.Uri, link["href"]))
        .Select(target => new CrawlResult(target, null));

    return pendingResults
        .ToObservable()
        .SelectMany(pending => this.CreateCrawlerForUri(pending.Uri)
            .Select(document => new CrawlResult(pending.Uri, document)));
}
/// <summary>
/// Fetches the page referenced by each competitor link under <c>#results_competitor_list</c>.
/// </summary>
/// <param name="result">Crawl result whose DOM contains the competitor links.</param>
/// <returns>
/// An observable with one <see cref="CrawlResult"/> per document produced for each extracted URL.
/// </returns>
public IObservable<CrawlResult> Crawl(CrawlResult result)
{
    var competitorLinks = result.DOM["#results_competitor_list > a[onclick]"];

    // The URL is read from the onclick attribute: the text between its first two single quotes.
    var urls = competitorLinks.Select(link =>
    {
        var handler = link.GetAttribute("onclick");
        var openingQuote = handler.IndexOf('\'');
        var closingQuote = handler.IndexOf('\'', openingQuote + 1);
        var start = openingQuote + 1;
        return handler.Substring(start, closingQuote - start);
    });

    return urls.ToObservable().SelectMany(
        url => this.CreateCrawlerForUri(new Uri(url), scheduler: this.scheduler),
        (url, document) => new CrawlResult(new Uri(url), document));
}
/// <summary>
/// Builds a <see cref="Dancer"/> for each name found in <c>#header &gt; span</c>, attaching
/// the page URI, the events produced by <c>eventExtractor</c>, and the distinct partner names.
/// </summary>
/// <param name="crawlResult">Crawl result providing the DOM and the page URI.</param>
/// <returns>A lazily evaluated sequence with one dancer per matched header span.</returns>
public IEnumerable<Dancer> Extract(CrawlResult crawlResult)
{
    var dom = crawlResult.DOM;
    var uri = crawlResult.Uri;

    var dancerName = dom["#header > span"].Select(x => x.InnerText.Trim());

    var partnerNames = dom["#heat_lists > table > tbody > tr > td[colspan='4'] > p"]
        .Select(pn =>
        {
            const string marker = "with";
            var trimmedText = pn.InnerText.Trim(':', ' ');

            // Ordinal search: the marker is a fixed ASCII token, not linguistic text.
            var withIndex = trimmedText.IndexOf(marker, System.StringComparison.Ordinal);
            if (withIndex < 0)
            {
                // No marker: keep the whole text. Slicing from -1 + 4 would silently drop
                // the first three characters, or throw on text shorter than that.
                return trimmedText;
            }

            // The partner name is whatever follows the marker.
            return trimmedText.Substring(withIndex + marker.Length).Trim();
        });

    return from name in dancerName
           select new Dancer
           {
               Name = name,
               Uri = uri.AbsoluteUri,
               Events = eventExtractor.Extract(crawlResult).ToList(),
               Partners = partnerNames.Distinct().ToList()
           };
}
/// <summary>Reports that this crawler accepts every crawl result.</summary>
/// <param name="result">The crawl result being offered; it is not inspected.</param>
/// <returns>Always <c>true</c>.</returns>
public bool CanCrawl(CrawlResult result)
{
    return true;
}
/// <summary>Reports that this extractor accepts every crawl result.</summary>
/// <param name="result">The crawl result being offered; it is not inspected.</param>
/// <returns>Always <c>true</c>.</returns>
public bool CanExtract(CrawlResult result)
{
    return true;
}
/// <summary>
/// Accepts a crawl result when the host of its URI contains <c>results.o2cm.com</c>.
/// </summary>
/// <param name="result">The crawl result whose URI host is checked.</param>
/// <returns><c>true</c> when the host contains that text; otherwise <c>false</c>.</returns>
public override bool CanCrawl(CrawlResult result)
{
    var host = result.Uri.Host;
    return host.Contains("results.o2cm.com");
}
/// <summary>
/// Accepts a crawl result when its URI host contains <c>results.o2cm.com</c> and its DOM
/// has at least one element matching <c>#main_tbl</c>.
/// </summary>
/// <param name="result">The crawl result to check.</param>
/// <returns><c>true</c> when both conditions hold; otherwise <c>false</c>.</returns>
public bool CanExtract(CrawlResult result)
{
    // Host check first: the DOM is only queried for results on the expected host.
    if (!result.Uri.Host.Contains("results.o2cm.com"))
    {
        return false;
    }

    return result.DOM["#main_tbl"].Any();
}
/// <summary>
/// Default crawl filter: accepts every crawl result. Declared virtual, so derived
/// crawlers can override it.
/// </summary>
/// <param name="result">The crawl result being offered; it is not inspected here.</param>
/// <returns>Always <c>true</c> in this implementation.</returns>
public virtual bool CanCrawl(CrawlResult result)
{
    return true;
}