/// <summary>
/// Reads the listing page at <c>record.Url</c>, follows every teaser found on it to its
/// article page, and passes one <c>ContentInfo</c> per article to <c>AppEnv.Insert</c>.
/// </summary>
/// <remarks>
/// A teaser is a <c>div</c> whose class is exactly "clearfix" and that has an <c>h4</c> or
/// <c>h3</c> child. Each teaser is handled in its own try/catch so that one article that
/// cannot be loaded or parsed is logged and skipped instead of ending the whole run.
/// Failures are written to <c>_logger.Debug</c>; they are not rethrown.
/// </remarks>
/// <param name="record">Source of the listing URL, the link prefix and the category id.</param>
public static void Process(Record record)
{
    try
    {
        XNamespace ns = "http://www.w3.org/1999/xhtml";

        XDocument listing = new CrawlerClass(record.Url).GetXDocument();
        if (listing == null)
        {
            return;
        }

        // The (string) conversion of a missing attribute yields null, so no separate null test is needed.
        var teasers = listing.Descendants(ns + "div").Where(
            el => (string)el.Attribute("class") == "clearfix"
                  && (el.Element(ns + "h4") != null || el.Element(ns + "h3") != null));

        foreach (XElement teaser in teasers)
        {
            string link = null;
            try
            {
                // The headline lives in an h4 when present, otherwise in an h3.
                XElement heading = teaser.Element(ns + "h4") ?? teaser.Element(ns + "h3");
                XElement headingAnchor = heading.Element(ns + "a");
                link = record.HttpPrefix + headingAnchor.Attribute("href").Value;

                // The thumbnail is optional: a missing anchor, img or src all give an empty image.
                XElement thumbAnchor = teaser.Element(ns + "a");
                XElement thumb = thumbAnchor != null ? thumbAnchor.Element(ns + "img") : null;
                string image = thumb != null ? ((string)thumb.Attribute("src") ?? "") : "";

                // Hour and Date are not taken from the listing; they are read from the article page below.
                var info = new ContentInfo
                {
                    Title = headingAnchor.Value,
                    Teaser = teaser.Elements(ns + "p").ElementAt(1).Value,
                    Image = image,
                    Link = link,
                    CategoryID = record.CategoryID,
                    CrawlerUrl = record.Url
                };

                XDocument article = new CrawlerClass(info.Link).GetXDocument();
                if (article == null)
                {
                    _logger.Debug("Skipped (article page not loaded): " + info.Link);
                    continue;
                }

                // Publication stamp: text of the first <p class="time">, after the first comma if any,
                // split on single spaces into [date, hour, ...].
                // NOTE(review): the sample "19/03/2011 - 12:56 AM" noted next to this code originally
                // would make parts[1] "-"; confirm the real format of the "time" paragraph.
                string stamp = article.Descendants(ns + "p")
                    .First(el => (string)el.Attribute("class") == "time")
                    .Value.Trim();
                stamp = stamp.Substring(stamp.IndexOf(',') + 1).Trim();
                string[] parts = stamp.Split(' ');
                info.Hour = parts[1].Trim();
                info.Date = parts[0].Trim();

                // The body is the SECOND <div class="content"> on the article page.
                string body = article.Descendants(ns + "div")
                    .Where(el => (string)el.Attribute("class") == "content")
                    .ElementAt(1)
                    .Value;

                // Everything from the first "//" onwards is dropped (whatever that "//" belongs to);
                // a "//" at position 0 leaves the text untouched.
                int cut = body.IndexOf("//", StringComparison.Ordinal);
                if (cut > 0)
                {
                    body = body.Substring(0, cut);
                }
                info.Body = body;

                AppEnv.Insert(info);

                _logger.Debug("---------------------------------");
                _logger.Debug("Title: " + info.Title);
                _logger.Debug("Desc : " + info.Teaser);
                _logger.Debug("Image: " + info.Image);
                _logger.Debug("Link : " + info.Link);
                _logger.Debug("Url : " + info.CrawlerUrl);
            }
            catch (Exception itemError)
            {
                // Only this article is lost; the loop carries on with the next teaser.
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + itemError.Message);
                _logger.Debug("StackTrace : " + itemError.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
                _logger.Debug("Article : " + link);
            }
        }
    }
    catch (Exception ex)
    {
        _logger.Debug("----------------Error-----------------");
        _logger.Debug("Message: " + ex.Message);
        _logger.Debug("StackTrace : " + ex.StackTrace);
        _logger.Debug("Category: " + record.CategoryID);
        _logger.Debug("Link : " + record.Url);
    }
}
/// <summary>
/// Reads the listing page at <c>record.Url</c>, follows each <c>ul/li</c> entry that has at
/// least two <c>div</c> children to its article page, and passes one <c>ContentInfo</c> per
/// entry to <c>AppEnv.Insert</c>.
/// </summary>
/// <remarks>
/// The listing carries no timestamp, so Hour and Date are taken from the local clock at the
/// moment the entry is processed. Both are formatted with the invariant culture so the output
/// is always zero-padded "HH:mm" and "dd/MM/yyyy" regardless of the machine's regional settings.
/// An entry whose article page cannot be loaded is still inserted, with an empty body.
/// Each entry is handled in its own try/catch, so one malformed entry is logged and skipped
/// without ending the run. Failures are written to <c>_logger.Debug</c>; they are not rethrown.
/// </remarks>
/// <param name="record">Source of the listing URL, the link prefix and the category id.</param>
public static void Process(Record record)
{
    try
    {
        XNamespace ns = "http://www.w3.org/1999/xhtml";
        System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

        XDocument listing = new CrawlerClass(record.Url).GetXDocument();
        if (listing == null)
        {
            return;
        }

        foreach (XElement entry in listing.Descendants(ns + "ul").Elements(ns + "li"))
        {
            // Listing filter: entries with fewer than two div children are ignored.
            XElement[] blocks = entry.Elements(ns + "div").Take(2).ToArray();
            if (blocks.Length < 2)
            {
                continue;
            }

            string link = null;
            try
            {
                // First div: linked thumbnail. Second div: title paragraph, then teaser paragraph.
                XElement anchor = blocks[0].Element(ns + "a");
                link = record.HttpPrefix + anchor.Attribute("href").Value;
                XElement[] paragraphs = blocks[1].Elements(ns + "p").Take(2).ToArray();

                // Read the clock once so Hour and Date always describe the same instant.
                DateTime now = DateTime.Now;

                var info = new ContentInfo
                {
                    Title = paragraphs[0].Element(ns + "a").Value,
                    Teaser = paragraphs[1].Value,
                    Image = record.HttpPrefix + anchor.Element(ns + "img").Attribute("src").Value,
                    Link = link,
                    CategoryID = record.CategoryID,
                    CrawlerUrl = record.Url,
                    Hour = now.ToString("HH:mm", invariant),
                    Date = now.ToString("dd/MM/yyyy", invariant)
                };

                // The body stays empty when the article page cannot be loaded.
                string body = "";
                XDocument article = new CrawlerClass(info.Link).GetXDocument();
                if (article != null)
                {
                    body = article.Descendants(ns + "div")
                        .First(el => (string)el.Attribute("cpms_content") == "true")
                        .Value;
                }

                // Everything from the first "//" onwards is dropped (whatever that "//" belongs to);
                // a "//" at position 0 leaves the text untouched.
                int cut = body.IndexOf("//", StringComparison.Ordinal);
                if (cut > 0)
                {
                    body = body.Substring(0, cut);
                }
                info.Body = body;

                AppEnv.Insert(info);

                _logger.Debug("---------------------------------");
                _logger.Debug("Title: " + info.Title);
                _logger.Debug("Desc : " + info.Teaser);
                _logger.Debug("Image: " + info.Image);
                _logger.Debug("Link : " + info.Link);
                _logger.Debug("Url : " + info.CrawlerUrl);
            }
            catch (Exception itemError)
            {
                // Only this entry is lost; the loop carries on with the next one.
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + itemError.Message);
                _logger.Debug("StackTrace : " + itemError.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
                _logger.Debug("Article : " + link);
            }
        }
    }
    catch (Exception ex)
    {
        _logger.Debug("----------------Error-----------------");
        _logger.Debug("Message: " + ex.Message);
        _logger.Debug("StackTrace : " + ex.StackTrace);
        _logger.Debug("Category: " + record.CategoryID);
        _logger.Debug("Link : " + record.Url);
    }
}
/// <summary>
/// Reads the listing page at <c>record.Url</c> and, for every listed item, stores the item's
/// article and each of its related-news links as separate <c>ContentInfo</c> rows through
/// <c>AppEnv.Insert</c>.
/// </summary>
/// <remarks>
/// A listed item is a <c>div</c> whose class is exactly "item clearfix" and whose first child
/// <c>div</c> has class "meta". Related news are the <c>li</c> descendants of the item's
/// <c>ul</c> children that contain an <c>a</c>.
/// The teaser read from a related article's page is stored on that related article, and the
/// log lines written after inserting it describe that related article.
/// The main article and every related article are each handled in their own try/catch, so a
/// page that cannot be loaded or parsed is logged and skipped without ending the run. When the
/// main article fails, its related links are skipped with it.
/// Failures are written to <c>_logger.Debug</c>; they are not rethrown.
/// </remarks>
/// <param name="record">Source of the listing URL, the link prefix and the category id.</param>
public static void Process(Record record)
{
    try
    {
        XNamespace ns = "http://www.w3.org/1999/xhtml";

        XDocument listing = new CrawlerClass(record.Url).GetXDocument();
        if (listing == null)
        {
            return;
        }

        var items = listing.Descendants(ns + "div")
            .Where(el => (string)el.Attribute("class") == "item clearfix");

        foreach (XElement item in items)
        {
            // Child divs by position: [0] meta (date and hour), [1] linked thumbnail, [2] text.
            XElement[] sections = item.Elements(ns + "div").Take(3).ToArray();
            if (sections.Length == 0 || (string)sections[0].Attribute("class") != "meta")
            {
                continue;
            }

            string link = null;
            try
            {
                XElement anchor = sections[1].Element(ns + "a");
                link = record.HttpPrefix + anchor.Attribute("href").Value;

                XElement summary = sections[2].Elements(ns + "div").First();

                // The meta text is cut by position: the first 10 characters are the date,
                // everything from position 11 on is the hour.
                string meta = sections[0].Value;

                var info = new ContentInfo
                {
                    Title = summary.Element(ns + "a").Value,
                    Teaser = summary.Value,
                    Image = anchor.Element(ns + "img").Attribute("src").Value,
                    Link = link,
                    CategoryID = record.CategoryID,
                    CrawlerUrl = record.Url,
                    Hour = meta.Substring(11).Trim(),
                    Date = meta.Substring(0, 10).Trim()
                };

                XDocument article = new CrawlerClass(info.Link).GetXDocument();
                if (article == null)
                {
                    _logger.Debug("Skipped (article page not loaded): " + info.Link);
                    continue;
                }

                string body = article.Descendants(ns + "div")
                    .First(el => (string)el.Attribute("class") == "content")
                    .Value;

                // Everything from the first "//" onwards is dropped (whatever that "//" belongs to);
                // a "//" at position 0 leaves the text untouched.
                int cut = body.IndexOf("//", StringComparison.Ordinal);
                if (cut > 0)
                {
                    body = body.Substring(0, cut);
                }
                info.Body = body;

                AppEnv.Insert(info);

                _logger.Debug("---------------------------------");
                _logger.Debug("Title: " + info.Title);
                _logger.Debug("Desc : " + info.Teaser);
                _logger.Debug("Image: " + info.Image);
                _logger.Debug("Link : " + info.Link);
                _logger.Debug("Url : " + info.CrawlerUrl);
            }
            catch (Exception itemError)
            {
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + itemError.Message);
                _logger.Debug("StackTrace : " + itemError.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
                _logger.Debug("Article : " + link);
                continue;
            }

            // Related news: every linked <li> under the item's <ul> children is its own article.
            foreach (XElement relatedItem in item.Elements(ns + "ul").Descendants(ns + "li"))
            {
                XElement relatedAnchor = relatedItem.Element(ns + "a");
                if (relatedAnchor == null)
                {
                    continue;
                }

                string relatedLink = null;
                try
                {
                    relatedLink = record.HttpPrefix + relatedAnchor.Attribute("href").Value;

                    // The listing gives only title and link; the rest comes from the article page.
                    var related = new ContentInfo
                    {
                        Title = relatedAnchor.Value,
                        Image = "",
                        Link = relatedLink,
                        CategoryID = record.CategoryID,
                        CrawlerUrl = record.Url
                    };

                    XDocument relatedDoc = new CrawlerClass(related.Link).GetXDocument();
                    if (relatedDoc == null)
                    {
                        _logger.Debug("Skipped (article page not loaded): " + related.Link);
                        continue;
                    }

                    // First <div class="meta">: first span is the hour, second span is the date.
                    XElement[] stampSpans = relatedDoc.Descendants(ns + "div")
                        .First(el => (string)el.Attribute("class") == "meta")
                        .Elements(ns + "span").Take(2).ToArray();
                    related.Hour = stampSpans[0].Value;
                    related.Date = stampSpans[1].Value;

                    // Full teaser: first <p class="sapo"> of the related article's own page.
                    related.Teaser = relatedDoc.Descendants(ns + "p")
                        .First(el => (string)el.Attribute("class") == "sapo")
                        .Value;

                    string relatedBody = relatedDoc.Descendants(ns + "div")
                        .First(el => (string)el.Attribute("class") == "content")
                        .Value;
                    int relatedCut = relatedBody.IndexOf("//", StringComparison.Ordinal);
                    if (relatedCut > 0)
                    {
                        relatedBody = relatedBody.Substring(0, relatedCut);
                    }
                    related.Body = relatedBody;

                    AppEnv.Insert(related);

                    _logger.Debug("---------------------------------");
                    _logger.Debug("Title: " + related.Title);
                    _logger.Debug("Desc : " + related.Teaser);
                    _logger.Debug("Image: " + related.Image);
                    _logger.Debug("Link : " + related.Link);
                    _logger.Debug("Url : " + related.CrawlerUrl);
                }
                catch (Exception relatedError)
                {
                    // Only this related article is lost; the remaining ones are still processed.
                    _logger.Debug("----------------Error-----------------");
                    _logger.Debug("Message: " + relatedError.Message);
                    _logger.Debug("StackTrace : " + relatedError.StackTrace);
                    _logger.Debug("Category: " + record.CategoryID);
                    _logger.Debug("Link : " + record.Url);
                    _logger.Debug("Article : " + relatedLink);
                }
            }
        }
    }
    catch (Exception ex)
    {
        _logger.Debug("----------------Error-----------------");
        _logger.Debug("Message: " + ex.Message);
        _logger.Debug("StackTrace : " + ex.StackTrace);
        _logger.Debug("Category: " + record.CategoryID);
        _logger.Debug("Link : " + record.Url);
    }
}
/// <summary>
/// Reads the listing page at <c>record.Url</c>, follows every
/// <c>div class="DivImgSumSALEHNNP"</c> teaser to its article page, and passes one
/// <c>ContentInfo</c> per article to <c>AppEnv.Insert</c>.
/// </summary>
/// <remarks>
/// Elements are looked up without an XML namespace. Link and image are built as
/// <c>record.HttpPrefix + "/" + value</c>. Hour and Date come from the article page's
/// <c>span class="News_adt_DatePl"</c>.
/// Each teaser is handled in its own try/catch, so an article that cannot be loaded or parsed
/// is logged and skipped without ending the run. Failures are written to
/// <c>_logger.Debug</c>; they are not rethrown.
/// </remarks>
/// <param name="record">Source of the listing URL, the link prefix and the category id.</param>
public static void Process(Record record)
{
    try
    {
        XNamespace ns = XNamespace.None;

        XDocument listing = new CrawlerClass(record.Url).GetXDocument();
        if (listing == null)
        {
            return;
        }

        var teasers = listing.Descendants(ns + "div")
            .Where(el => (string)el.Attribute("class") == "DivImgSumSALEHNNP");

        foreach (XElement teaser in teasers)
        {
            string link = null;
            try
            {
                // First anchor wraps the thumbnail and carries the link; second anchor holds the title.
                XElement[] anchors = teaser.Elements(ns + "a").Take(2).ToArray();
                link = record.HttpPrefix + "/" + anchors[0].Attribute("href").Value;

                // Hour and Date are not available on the listing; they are read from the article page below.
                var info = new ContentInfo
                {
                    Title = anchors[1].Value,
                    Teaser = teaser.Value,
                    Image = record.HttpPrefix + "/" + anchors[0].Element(ns + "img").Attribute("src").Value,
                    Link = link,
                    CategoryID = record.CategoryID,
                    CrawlerUrl = record.Url
                };

                XDocument article = new CrawlerClass(info.Link).GetXDocument();
                if (article == null)
                {
                    _logger.Debug("Skipped (article page not loaded): " + info.Link);
                    continue;
                }

                // Sample date line: "Thứ bẩy, ngày 19 tháng 03 năm 2011 cập nhật lúc 14:08"
                // (weekday, then "day 19 month 03 year 2011 updated at 14:08").
                // After dropping the weekday, the words at positions 1, 3, 5 and 9 are
                // day, month, year and hour.
                string stamp = article.Descendants(ns + "span")
                    .First(el => (string)el.Attribute("class") == "News_adt_DatePl")
                    .Value.Trim();
                stamp = stamp.Substring(stamp.IndexOf(',') + 1);

                // A null separator splits on any white space; with empty entries removed, doubled
                // spaces or line breaks between words cannot shift the positions used below.
                string[] words = stamp.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
                if (words.Length < 10)
                {
                    throw new FormatException("Unexpected date line: " + stamp);
                }
                info.Hour = words[9];
                info.Date = words[1] + "/" + words[3] + "/" + words[5];

                string body = article.Descendants(ns + "span")
                    .First(el => (string)el.Attribute("class") == "News_ArticleDetailContent")
                    .Value;

                // Everything from the first "//" onwards is dropped (whatever that "//" belongs to);
                // a "//" at position 0 leaves the text untouched.
                int cut = body.IndexOf("//", StringComparison.Ordinal);
                if (cut > 0)
                {
                    body = body.Substring(0, cut);
                }
                info.Body = body;

                AppEnv.Insert(info);

                _logger.Debug("---------------------------------");
                _logger.Debug("Title: " + info.Title);
                _logger.Debug("Desc : " + info.Teaser);
                _logger.Debug("Image: " + info.Image);
                _logger.Debug("Link : " + info.Link);
                _logger.Debug("Url : " + info.CrawlerUrl);
            }
            catch (Exception itemError)
            {
                // Only this article is lost; the loop carries on with the next teaser.
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + itemError.Message);
                _logger.Debug("StackTrace : " + itemError.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
                _logger.Debug("Article : " + link);
            }
        }
    }
    catch (Exception ex)
    {
        _logger.Debug("----------------Error-----------------");
        _logger.Debug("Message: " + ex.Message);
        _logger.Debug("StackTrace : " + ex.StackTrace);
        _logger.Debug("Category: " + record.CategoryID);
        _logger.Debug("Link : " + record.Url);
    }
}
/// <summary>
/// Reads the listing page at <c>record.Url</c>, follows every
/// <c>td class="text" colspan="2"</c> teaser cell to its article page, and passes one
/// <c>ContentInfo</c> per article to <c>AppEnv.Insert</c>.
/// </summary>
/// <remarks>
/// Elements are looked up without an XML namespace. Tabs are stripped from title and teaser.
/// The teaser keeps only the text after the first literal "&lt;br&gt;" when one is present;
/// the position of that marker is measured on the tab-stripped text it is applied to, and a
/// teaser without the marker is kept whole.
/// Each cell is handled in its own try/catch, so an article that cannot be loaded or parsed
/// is logged and skipped without ending the run. Failures are written to
/// <c>_logger.Debug</c>; they are not rethrown.
/// </remarks>
/// <param name="record">Source of the listing URL, the link prefix and the category id.</param>
public static void Process(Record record)
{
    try
    {
        XNamespace ns = XNamespace.None;

        XDocument listing = new CrawlerClass(record.Url).GetXDocument();
        if (listing == null)
        {
            return;
        }

        var cells = listing.Descendants(ns + "td").Where(
            el => (string)el.Attribute("class") == "text"
                  && (string)el.Attribute("colspan") == "2");

        foreach (XElement cell in cells)
        {
            string link = null;
            try
            {
                // First paragraph -> span holds the headline anchor followed by the teaser text.
                XElement headline = cell.Elements(ns + "p").First().Element(ns + "span");
                XElement headlineAnchor = headline.Element(ns + "a");
                link = record.HttpPrefix + headlineAnchor.Attribute("href").Value;

                // Thumbnail: first table -> tr -> first td -> a -> img.
                string imageSrc = cell.Elements(ns + "table").First()
                    .Element(ns + "tr")
                    .Elements(ns + "td").First()
                    .Element(ns + "a")
                    .Element(ns + "img")
                    .Attribute("src").Value;

                string teaserText = headline.Value.Replace("\t", "");
                int markerAt = teaserText.IndexOf("<br>", StringComparison.Ordinal);
                if (markerAt >= 0)
                {
                    teaserText = teaserText.Substring(markerAt + 4);
                }
                teaserText = teaserText.Replace("<br>", "");

                // Hour and Date are not available on the listing; they are read from the article page below.
                var info = new ContentInfo
                {
                    Title = headlineAnchor.Value.Replace("\t", ""),
                    Teaser = teaserText,
                    Image = record.HttpPrefix + imageSrc,
                    Link = link,
                    CategoryID = record.CategoryID,
                    CrawlerUrl = record.Url
                };

                XDocument article = new CrawlerClass(info.Link).GetXDocument();
                if (article == null)
                {
                    _logger.Debug("Skipped (article page not loaded): " + info.Link);
                    continue;
                }

                // Sample date line: "Chủ nhật, 20/3/2011, 22:41 GMT+7" (weekday, date, time and zone).
                // Split on commas: [1] is the date, the first five characters of [2] are the hour.
                string stamp = article.Descendants(ns + "td")
                    .First(el => (string)el.Attribute("class") == "posted_date")
                    .Value.Trim();
                string[] fields = stamp.Split(',');
                info.Hour = fields[2].Trim().Substring(0, 5);
                info.Date = fields[1].Trim();

                string body = article.Descendants(ns + "span")
                    .First(el => (string)el.Attribute("class") == "textbai")
                    .Value;

                // Everything from the first "//" onwards is dropped (whatever that "//" belongs to);
                // a "//" at position 0 leaves the text untouched.
                int cut = body.IndexOf("//", StringComparison.Ordinal);
                if (cut > 0)
                {
                    body = body.Substring(0, cut);
                }
                info.Body = body;

                AppEnv.Insert(info);

                _logger.Debug("---------------------------------");
                _logger.Debug("Title: " + info.Title);
                _logger.Debug("Desc : " + info.Teaser);
                _logger.Debug("Image: " + info.Image);
                _logger.Debug("Link : " + info.Link);
                _logger.Debug("Url : " + info.CrawlerUrl);
            }
            catch (Exception itemError)
            {
                // Only this article is lost; the loop carries on with the next cell.
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + itemError.Message);
                _logger.Debug("StackTrace : " + itemError.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
                _logger.Debug("Article : " + link);
            }
        }
    }
    catch (Exception ex)
    {
        _logger.Debug("----------------Error-----------------");
        _logger.Debug("Message: " + ex.Message);
        _logger.Debug("StackTrace : " + ex.StackTrace);
        _logger.Debug("Category: " + record.CategoryID);
        _logger.Debug("Link : " + record.Url);
    }
}