/// <summary>
/// Builds an <see cref="Offer"/> for the "Gumtree" source: takes the title and link from
/// the node's first anchor, then downloads the linked page to read the offer text,
/// posting date, province and city.
/// </summary>
/// <param name="node">Listing node whose first <c>a</c> child points at the offer page.</param>
/// <returns>The populated offer, with <c>Sha1</c> generated from the offer text.</returns>
/// <remarks>
/// Performs a blocking web request. Elements that are looked up by XPath and not found
/// still surface as exceptions. A date that cannot be parsed leaves <c>Date</c> at the
/// default <see cref="DateTime"/> value, and a locality without a '/' separator yields
/// an empty city.
/// </remarks>
public Offer ParseOffer(HtmlNode node)
{
    // The first anchor supplies both the link and the title; look it up once.
    var anchor = node.SelectSingleNode("a[1]");
    string link = anchor.Attributes["href"].Value;
    string title = anchor.InnerText;

    var offer = new Offer
    {
        Title = title,
        Link = link
    };

    var request = WebRequest.Create(offer.Link);
    using (var response = request.GetResponse())
    using (var stream = response.GetResponseStream())
    {
        var document = new HtmlDocument();
        document.Load(stream, true);
        var body = document.DocumentNode.SelectSingleNode("//body");

        // Only the first ten characters of the cell are used as the date; do not
        // throw when the cell is shorter than that - the parse below simply fails.
        var dateCell = body.SelectSingleNode("//td[@class='first_row']").InnerText.Trim();
        var date = (dateCell.Length > 10 ? dateCell.Substring(0, 10) : dateCell).Replace("/", "-");

        var text = body.SelectSingleNode("//span[@id='preview-local-desc']").InnerText;

        // The og:locality content is split on '/' into province and city.
        var address = document.DocumentNode.SelectSingleNode("//meta[@property='og:locality']").Attributes["content"].Value;
        var provinceAndCity = address.Trim().Split('/');
        var province = provinceAndCity[0];
        var city = provinceAndCity.Length > 1 ? provinceAndCity[1] : "";

        // Scraped data in a fixed dd-MM-yyyy shape: parse it with the invariant culture
        // so the result does not depend on the UI culture or calendar of the machine.
        DateTime dateParsed;
        DateTime.TryParseExact(date, "dd-MM-yyyy", CultureInfo.InvariantCulture, DateTimeStyles.AllowWhiteSpaces, out dateParsed);

        offer.Text = text;
        offer.City = city;
        offer.Province = province;
        offer.Date = dateParsed;
        offer.Source = "Gumtree";
    }

    offer.Sha1 = OfferHelper.GenerateSha1(offer.Text);
    return offer;
}
/// <summary>
/// Builds an <see cref="Offer"/> for the "GoldenLine" source from a listing row and the
/// offer page it links to.
/// </summary>
/// <param name="node">
/// Listing row containing a <c>td</c> of class <c>position</c> (with the offer link) and
/// a <c>td</c> of class <c>date</c> whose inner HTML is split on <c>&lt;br&gt;</c> into
/// date and province.
/// </param>
/// <returns>The populated offer, with <c>Sha1</c> generated from the offer text.</returns>
/// <remarks>
/// Performs a blocking web request. The offer text is the raw inner HTML of the page
/// body, and the city is always left empty.
/// </remarks>
public Offer ParseOffer(HtmlNode node)
{
    var positionCell = node.SelectSingleNode("td[@class='position']");
    string href = positionCell.SelectSingleNode("a[1]").Attributes["href"].Value;

    // The date cell holds "<date><br><province>".
    string[] separators = new string[] { "<br>" };
    string[] dateCellParts = node.SelectSingleNode("td[@class='date']").InnerHtml.Split(separators, StringSplitOptions.None);
    string rawDate = dateCellParts[0];
    string region = dateCellParts[1];

    // Best effort: the result of TryParse is ignored, so a failed parse
    // leaves the default DateTime value.
    DateTime postedOn;
    DateTime.TryParse(rawDate, out postedOn);

    var offer = new Offer();
    offer.Link = href;
    offer.Date = postedOn;
    offer.Province = region;

    var request = WebRequest.Create(offer.Link);
    using (var pageStream = request.GetResponse().GetResponseStream())
    {
        var page = new HtmlDocument();
        page.Load(pageStream, true);

        var bodyNode = page.DocumentNode.SelectSingleNode("//body");
        string heading = page.DocumentNode.SelectSingleNode("//strong[1]").InnerText;
        string bodyHtml = bodyNode.InnerHtml;

        offer.City = "";
        offer.Title = heading;
        offer.Text = bodyHtml;
        offer.Source = "GoldenLine";
    }

    offer.Sha1 = OfferHelper.GenerateSha1(offer.Text);
    return offer;
}
/// <summary>
/// Builds an <see cref="Offer"/> for the "CodeGuru" source: reads link, title and date
/// from the listing node, then downloads the linked page to read the offer text.
/// </summary>
/// <param name="node">Listing node whose first <c>a</c> child points at the offer page.</param>
/// <returns>The populated offer, with <c>Sha1</c> generated from the offer text.</returns>
/// <remarks>
/// Performs a blocking web request. The date is best effort: when no date element is
/// found or its text cannot be parsed, <c>Date</c> is the default <see cref="DateTime"/>
/// value. City and province are always left empty. A missing link, title or post
/// content element still surfaces as an exception.
/// </remarks>
public Offer ParseOffer(HtmlNode node)
{
    string link = node.SelectSingleNode("a[1]").Attributes["href"].Value;
    string title = node.SelectSingleNode("div[@class='info']/div[1]/h3[1]/a[1]").InnerText;

    var offer = new Offer
    {
        Link = link,
        Title = title,
    };

    // The date is looked up in two alternative places; SelectSingleNode yields null
    // when a path does not match, so both lookups can come back empty.
    var dateNode = node.SelectSingleNode("div[@class='description']/dl[1]/dd[2]/div/div")
                   ?? node.SelectSingleNode("div[@class='description']/dl[1]/dd[1]/span[1]");

    // Treat a missing date element like an unparsable date instead of failing with
    // a NullReferenceException and losing the whole offer.
    DateTime dateParsed = default(DateTime);
    if (dateNode != null)
    {
        DateTime.TryParse(dateNode.InnerText, out dateParsed);
    }
    offer.Date = dateParsed;

    var request = WebRequest.Create(offer.Link);
    using (var response = request.GetResponse())
    using (var stream = response.GetResponseStream())
    {
        var document = new HtmlDocument();
        document.Load(stream, true);
        var body = document.DocumentNode.SelectSingleNode("//body");

        // The offer text is kept as raw HTML of the first div inside the post content.
        var textRawHtml = body.SelectSingleNode("//div[@class='divPostContent']/div[1]").InnerHtml;

        offer.City = "";
        offer.Province = "";
        offer.Text = textRawHtml;
        offer.Source = "CodeGuru";
    }

    offer.Sha1 = OfferHelper.GenerateSha1(offer.Text);
    return offer;
}
/// <summary>
/// Downloads the page at <c>_requestUriString</c>, selects nodes with <c>_nodesXPath</c>,
/// parses each node into an offer and passes it to the repository. When
/// <c>InsertOffer</c> returns true, the offer text is scanned for the known keywords
/// and the matches are stored for that offer.
/// </summary>
/// <remarks>
/// Performs a blocking web request. Returns without doing anything when there is no
/// response stream or when the XPath matches no nodes. An exception raised while
/// handling a single node is logged and does not stop the remaining nodes.
/// </remarks>
public virtual void Crawl()
{
    var request = WebRequest.Create(_requestUriString);
    using (var response = request.GetResponse())
    using (var stream = response.GetResponseStream())
    {
        if (stream == null)
        {
            return;
        }

        var document = new HtmlDocument();
        document.Load(stream, true);

        // SelectNodes returns null rather than an empty collection when nothing
        // matches; iterating it would throw outside the per-node try/catch below.
        var nodes = document.DocumentNode.SelectNodes(_nodesXPath);
        if (nodes == null)
        {
            return;
        }

        var allKeywords = _repository.GetAllKeywords();
        foreach (var node in nodes)
        {
            try
            {
                var offer = _parser.ParseOffer(node);
                if (_repository.InsertOffer(offer))
                {
                    var offerKeywords = OfferHelper.ScanTextForKeywords(offer.Text, allKeywords);
                    _repository.InsertKeywordsForOffer(offer.Sha1, offerKeywords);
                }
            }
            catch (Exception ex)
            {
                // One broken offer must not abort the crawl: log it and continue.
                var log = NLog.LogManager.GetCurrentClassLogger();
                log.LogException(NLog.LogLevel.Fatal, "Exception occured", ex);
            }
        }
    }
}
/// <summary>
/// Builds an <see cref="Offer"/> for the "PracujPL" source: reads the relative link from
/// the listing node, then downloads the offer page to read title, text, date, city
/// and province.
/// </summary>
/// <param name="node">Listing node containing an <c>a</c> element of class <c>offerLink</c>.</param>
/// <returns>
/// <c>Offer.Empty</c> when the node has a <c>ul</c> child; otherwise the populated
/// offer, with <c>Sha1</c> generated from the offer text.
/// </returns>
/// <remarks>
/// Performs a blocking web request. The offer text is the raw inner HTML of the
/// element with id <c>offCont</c>.
/// </remarks>
public Offer ParseOffer(HtmlNode node)
{
    var nestedList = node.SelectSingleNode("ul");
    if (nestedList != null)
    {
        return Offer.Empty;
    }

    string relativeLink = node.SelectSingleNode("div//a[@class='offerLink']").Attributes["href"].Value;

    var offer = new Offer();
    offer.Link = string.Format("http://pracuj.pl{0}", relativeLink);

    var request = WebRequest.Create(offer.Link);
    using (var pageStream = request.GetResponse().GetResponseStream())
    {
        var page = new HtmlDocument();
        page.Load(pageStream, true);
        var bodyNode = page.DocumentNode.SelectSingleNode("//body");

        string contentHtml = bodyNode.SelectSingleNode("//div[@id='offCont']").InnerHtml;

        // The date is taken from the first <dd> whose text contains a yyyy-MM-dd pattern.
        var datePattern = new Regex("\\d{4}-\\d{2}-\\d{2}");
        var definitionValues = bodyNode.SelectNodes("//dd");
        string dateText = definitionValues.First(dd => datePattern.IsMatch(dd.InnerText)).InnerText;

        // The location value is two siblings after the "Lokalizacja:" label.
        var definitionLabels = bodyNode.SelectNodes("//dt");
        var locationLabel = definitionLabels.First(dt => dt.InnerText == "Lokalizacja:");
        var locationNode = locationLabel.NextSibling.NextSibling;

        string heading = bodyNode.SelectSingleNode("//h1[@class='offerTitle']").InnerText;

        // With more than three child nodes the location carries both city and
        // province; otherwise only the province is present.
        string cityName = "";
        string provinceName;
        var locationParts = locationNode.ChildNodes;
        if (locationParts.Count <= 3)
        {
            provinceName = locationParts.ElementAt(0).InnerText;
        }
        else
        {
            cityName = locationParts.ElementAt(1).InnerText;
            provinceName = locationParts.ElementAt(3).InnerText;
        }

        // Best effort: the result of TryParse is ignored, so a failed parse
        // leaves the default DateTime value.
        DateTime postedOn;
        DateTime.TryParse(dateText, out postedOn);

        offer.Title = heading;
        offer.Text = contentHtml;
        offer.City = cityName;
        offer.Province = provinceName;
        offer.Date = postedOn;
        offer.Source = "PracujPL";
    }

    offer.Sha1 = OfferHelper.GenerateSha1(offer.Text);
    return offer;
}