/// <summary>
/// Builds a <see cref="NewsItem"/> from a parsed article page: reads the title, author,
/// published date and main image, strips unwanted anchors and bookmaker-tagged nodes from
/// the document, and then reads the remaining content.
/// </summary>
/// <param name="doc">The parsed HTML document. Its node tree is modified by this call.</param>
/// <param name="host">Not used by this method; kept so existing callers continue to compile.</param>
/// <returns>
/// A new <see cref="NewsItem"/> with Header, Author, DatePublished, MainPic, Content and
/// CleanContent populated. DatePublished is 0 when the published date is not a valid integer.
/// </returns>
public NewsItem ParseItem(HtmlDocument doc, string host)
{
    var root = doc.DocumentNode;

    var header = ReadTitle(root);
    var result = new NewsItem();
    result.Header = header;
    result.Author = ReadAuthor(root);

    long date;
    result.DatePublished = long.TryParse(ReadPublishedDate(root), out date) ? date : 0;
    result.MainPic = ReadMainImage(root);

    // Anchors whose href contains any of these fragments are stripped, one pass per
    // fragment and in this order, exactly as the original sequence of calls did.
    var blockedHrefFragments = new[]
    {
        "skybet",
        "betfair",
        "bet365",
        "williamhill",
        "paddypower",
        "coral.co.uk",
        "betvictor",
        "doubleclick",
        "victor.com"
    };

    foreach (var fragment in blockedHrefFragments)
    {
        RemoveAnchorsWhere(root, href => href.Contains(fragment));
    }

    // NOTE(review): this also matches "https://" links, not only relative ones - confirm
    // that stripping those is intended.
    RemoveAnchorsWhere(root, href => !href.Contains("http://"));

    // Materialise before handing over so the lazy descendant walk is finished before
    // RemoveTags gets a chance to change the tree.
    RemoveTags(root.Descendants().Where(d => d.Attributes.Contains("data-bookmaker-url")).ToList());

    // Content is read last so it reflects the document after the removals above.
    result.Content = ReadContent(root);
    result.CleanContent = BaseHelper.ScrubHtml(result.Content);
    return result;
}

/// <summary>
/// Passes every anchor under <paramref name="root"/> that has an href accepted by
/// <paramref name="hrefMatches"/> to RemoveTags. Does nothing when there are no anchors.
/// </summary>
/// <param name="root">Node whose document is searched for anchors.</param>
/// <param name="hrefMatches">Predicate applied to the value of each anchor's href attribute.</param>
private void RemoveAnchorsWhere(HtmlNode root, System.Func<string, bool> hrefMatches)
{
    // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when the
    // XPath matches nothing, so a page without anchors must be handled explicitly.
    var anchors = root.SelectNodes("//a");
    if (anchors == null)
    {
        return;
    }

    var matches = anchors
        .Where(a => a.Attributes.Contains("href") && hrefMatches(a.Attributes["href"].Value))
        .ToList();

    RemoveTags(matches);
}
/// <summary>
/// Brings an already stored news item up to date for the given source: attaches the
/// source's category when the item does not have it yet, and re-parses the article when
/// the stored item is missing or has no content.
/// </summary>
/// <param name="url">Address of the article, passed to ParseArticle.</param>
/// <param name="source">Source supplying the category id and the source website name.</param>
/// <param name="oldItem">The existing item to update.</param>
private void UpdateOldItem(string url, Source source, NewsItem oldItem)
{
    if (!oldItem.Categories.Any(c => c.Id == source.CategoryId))
    {
        var category = this.Data.Categories.All().FirstOrDefault(s => s.Id == source.CategoryId);

        // FirstOrDefault yields null when the category does not exist; never add that
        // null to the item's category collection.
        if (category != null)
        {
            oldItem.Categories.Add(category);
            this.Data.SaveChanges();
        }
    }

    // Parse the article
    var forUpdateArticle = this.Data.NewsItems.All().FirstOrDefault(s => s.Id == oldItem.Id);

    // "||" rather than "&&": with "&&" the right-hand side dereferenced a null item, and
    // for a non-null item the condition could never be true, so the re-parse never ran.
    if (forUpdateArticle == null || string.IsNullOrEmpty(forUpdateArticle.Content))
    {
        // NOTE(review): the freshly parsed item replaces the loaded entity here - confirm
        // that Update resolves it to the existing record.
        forUpdateArticle = ParseArticle(url, source.SourceWebsite.Name);
        this.Data.NewsItems.Update(forUpdateArticle);
        this.Data.SaveChanges();
    }
}