Exemplo n.º 1
0
        /// <summary>
        /// Crawls the listing page at <c>record.Url</c>, builds a <c>ContentInfo</c> for every
        /// <c>div</c> whose class is exactly "clearfix" and that has an <c>h4</c> or <c>h3</c> child,
        /// enriches it from the linked article page and passes it to <c>AppEnv.Insert</c>.
        /// </summary>
        /// <param name="record">
        /// Supplies the listing URL, the prefix prepended to every article link and the category id
        /// copied onto each stored item.
        /// </param>
        /// <remarks>
        /// Any exception is caught and written to the debug log together with the record's category
        /// and URL; it ends processing of the remaining listing items.
        /// </remarks>
        public static void Process(Record record)
        {
            try
            {
                #region Process

                string    xmlns = "{http://www.w3.org/1999/xhtml}";
                var       cl    = new CrawlerClass(record.Url);
                XDocument xdoc  = cl.GetXDocument();

                if (xdoc != null)
                {
                    var res = from item in xdoc.Descendants(xmlns + "div")
                              let cssClass = item.Attribute("class")
                              where cssClass != null && cssClass.Value == "clearfix"
                              // The headline link is read from the <h4>, or from the <h3> when there is no <h4>.
                              let heading = item.Element(xmlns + "h4") ?? item.Element(xmlns + "h3")
                              where heading != null
                              let headingLink = heading.Element(xmlns + "a")
                              // The thumbnail is optional: a missing <a> or <img> yields an empty image.
                              let thumbLink = item.Element(xmlns + "a")
                              let thumb = thumbLink != null ? thumbLink.Element(xmlns + "img") : null
                              let paragraphs = item.Elements(xmlns + "p")
                              select new
                              {
                                  Link  = headingLink.Attribute("href").Value,
                                  Image = thumb != null ? thumb.Attribute("src").Value : "",
                                  Title = headingLink.Value,
                                  Stamp = paragraphs.ElementAt(0).Value,
                                  Desc  = paragraphs.ElementAt(1).Value
                              };

                    foreach (var node in res)
                    {
                        // The first paragraph is split on spaces: token 0 is stored as the hour and
                        // token 1 as the date. They are replaced below when the article page loads.
                        string[] stamp = node.Stamp.Split(' ');

                        var info = new ContentInfo
                        {
                            Title      = node.Title,
                            Teaser     = node.Desc,
                            Image      = node.Image,
                            Link       = record.HttpPrefix + node.Link,
                            CategoryID = record.CategoryID,
                            CrawlerUrl = record.Url,
                            Hour       = stamp[0],
                            Date       = stamp[1]
                        };

                        var       detailCrawler = new CrawlerClass(info.Link);
                        XDocument detailDoc     = detailCrawler.GetXDocument();

                        string body = "";

                        // GetXDocument can return null (the listing fetch above is checked for it).
                        // In that case the listing hour/date are kept and the body stays empty,
                        // instead of a NullReferenceException aborting the whole listing.
                        if (detailDoc != null)
                        {
                            #region Get Hour and Date

                            // Sample value noted by the original author: 19/03/2011 - 12:56 AM
                            // NOTE(review): with that sample token 1 would be "-"; confirm the live format.
                            string newDate = (from item in detailDoc.Descendants(xmlns + "p")
                                              let cssClass = item.Attribute("class")
                                              where cssClass != null && cssClass.Value == "time"
                                              select item.Value).ElementAt(0).Trim();

                            // Keep only the text after the first comma (the whole text when there is none).
                            newDate = newDate.Substring(newDate.IndexOf(',') + 1).Trim();

                            string[] arr = newDate.Split(' ');

                            info.Hour = arr[1].Trim();
                            info.Date = arr[0].Trim();

                            #endregion

                            #region Get Body

                            // The second div with class "content" is the one used as the article body.
                            body = (from item in detailDoc.Descendants(xmlns + "div")
                                    let cssClass = item.Attribute("class")
                                    where cssClass != null && cssClass.Value == "content"
                                    select item.Value).ElementAt(1);

                            #endregion
                        }

                        // Cut the body at the first "//" unless it is the very first characters.
                        // Presumably this drops trailing script text - TODO confirm.
                        int cut = body.IndexOf("//", StringComparison.Ordinal);
                        if (cut > 0)
                        {
                            body = body.Substring(0, cut);
                        }
                        info.Body = body;

                        AppEnv.Insert(info);

                        _logger.Debug("---------------------------------");
                        _logger.Debug("Title: " + info.Title);
                        _logger.Debug("Desc : " + info.Teaser);
                        _logger.Debug("Image: " + info.Image);
                        _logger.Debug("Link : " + info.Link);
                        _logger.Debug("Url  : " + info.CrawlerUrl);
                    }
                }

                #endregion
            }
            catch (Exception ex)
            {
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + ex.Message);
                _logger.Debug("StackTrace : " + ex.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Crawls the listing page at <c>record.Url</c>, builds a <c>ContentInfo</c> for every
        /// <c>ul/li</c> that has at least two child <c>div</c> elements, reads the article body from
        /// the linked page and passes the item to <c>AppEnv.Insert</c>.
        /// </summary>
        /// <param name="record">
        /// Supplies the listing URL, the prefix prepended to image and article links and the
        /// category id copied onto each stored item.
        /// </param>
        /// <remarks>
        /// Hour and Date are taken from the local clock when the item is processed; nothing is
        /// parsed from the page for them. Any exception is caught and written to the debug log and
        /// ends processing of the remaining listing items.
        /// </remarks>
        public static void Process(Record record)
        {
            try
            {
                #region Process

                string    xmlns = "{http://www.w3.org/1999/xhtml}";
                var       cl    = new CrawlerClass(record.Url);
                XDocument xdoc  = cl.GetXDocument();

                if (xdoc != null)
                {
                    var res = from item in xdoc.Descendants(xmlns + "ul").Elements(xmlns + "li")
                              let cells = item.Elements(xmlns + "div").ToList()
                              where cells.Count > 1
                              // First div holds the linked thumbnail, second div the title and teaser paragraphs.
                              let imageLink = cells[0].Element(xmlns + "a")
                              let paragraphs = cells[1].Elements(xmlns + "p")
                              select new
                              {
                                  Link  = imageLink.Attribute("href").Value,
                                  Image = imageLink.Element(xmlns + "img").Attribute("src").Value,
                                  Title = paragraphs.ElementAt(0).Element(xmlns + "a").Value,
                                  Desc  = paragraphs.ElementAt(1).Value
                              };

                    foreach (var node in res)
                    {
                        // Read the clock once per item so hour and date always belong to the same
                        // instant, and format with the invariant culture so the result is always
                        // zero-padded "HH:mm" and "dd/MM/yyyy" with literal ':' and '/' separators.
                        DateTime now = DateTime.Now;

                        var info = new ContentInfo
                        {
                            Title      = node.Title,
                            Teaser     = node.Desc,
                            Image      = record.HttpPrefix + node.Image,
                            Link       = record.HttpPrefix + node.Link,
                            CategoryID = record.CategoryID,
                            CrawlerUrl = record.Url,
                            Hour       = now.ToString("HH:mm", System.Globalization.CultureInfo.InvariantCulture),
                            Date       = now.ToString("dd/MM/yyyy", System.Globalization.CultureInfo.InvariantCulture)
                        };

                        var       detailCrawler = new CrawlerClass(info.Link);
                        XDocument detailDoc     = detailCrawler.GetXDocument();

                        #region Get Body

                        // The body stays empty when the article page could not be loaded.
                        string body = "";
                        if (detailDoc != null)
                        {
                            // The first div carrying cpms_content="true" is used as the article body.
                            body = (from item in detailDoc.Descendants(xmlns + "div")
                                    let marker = item.Attribute("cpms_content")
                                    where marker != null && marker.Value == "true"
                                    select item.Value).ElementAt(0);
                        }

                        // Cut the body at the first "//" unless it is the very first characters.
                        // Presumably this drops trailing script text - TODO confirm.
                        int cut = body.IndexOf("//", StringComparison.Ordinal);
                        if (cut > 0)
                        {
                            body = body.Substring(0, cut);
                        }
                        info.Body = body;

                        #endregion

                        AppEnv.Insert(info);

                        _logger.Debug("---------------------------------");
                        _logger.Debug("Title: " + info.Title);
                        _logger.Debug("Desc : " + info.Teaser);
                        _logger.Debug("Image: " + info.Image);
                        _logger.Debug("Link : " + info.Link);
                        _logger.Debug("Url  : " + info.CrawlerUrl);
                    }
                }

                #endregion
            }
            catch (Exception ex)
            {
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + ex.Message);
                _logger.Debug("StackTrace : " + ex.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Crawls the listing page at <c>record.Url</c>. For every <c>div</c> with class
        /// "item clearfix" whose first child <c>div</c> has class "meta" it stores the main article
        /// and then every related article linked from the item's <c>ul</c> lists, each through
        /// <c>AppEnv.Insert</c>.
        /// </summary>
        /// <param name="record">
        /// Supplies the listing URL, the prefix prepended to article links and the category id
        /// copied onto each stored item.
        /// </param>
        /// <remarks>
        /// Any exception is caught and written to the debug log together with the record's category
        /// and URL; it ends processing of the remaining listing items.
        /// </remarks>
        public static void Process(Record record)
        {
            try
            {
                #region Process

                string    xmlns = "{http://www.w3.org/1999/xhtml}";
                var       cl    = new CrawlerClass(record.Url);
                XDocument xdoc  = cl.GetXDocument();

                if (xdoc != null)
                {
                    var res = from item in xdoc.Descendants(xmlns + "div")
                              let cssClass = item.Attribute("class")
                              where cssClass != null && cssClass.Value == "item clearfix"
                              let sections = item.Elements(xmlns + "div").ToList()
                              // Items without a child div, or whose first div has no class, are
                              // filtered out here rather than throwing inside the filter.
                              where sections.Count > 0
                              let metaClass = sections[0].Attribute("class")
                              where metaClass != null && metaClass.Value == "meta"
                              // Child divs by position: 0 = meta (date and hour), 1 = linked thumbnail, 2 = text.
                              let stamp = sections[0].Value
                              let imageLink = sections[1].Element(xmlns + "a")
                              let headline = sections[2].Elements(xmlns + "div").ElementAt(0)
                              select new
                              {
                                  Link         = imageLink.Attribute("href").Value,
                                  Image        = imageLink.Element(xmlns + "img").Attribute("src").Value,
                                  Title        = headline.Element(xmlns + "a").Value,
                                  // The first 10 characters of the meta text are stored as the date,
                                  // everything from index 11 onwards as the hour.
                                  Hour         = stamp.Substring(11).Trim(),
                                  Date         = stamp.Substring(0, 10).Trim(),
                                  Desc         = headline.Value,
                                  RelatedLists = item.Elements(xmlns + "ul")
                              };

                    foreach (var node in res)
                    {
                        var info = new ContentInfo
                        {
                            Title      = node.Title,
                            Teaser     = node.Desc,
                            Image      = node.Image,
                            Link       = record.HttpPrefix + node.Link,
                            CategoryID = record.CategoryID,
                            CrawlerUrl = record.Url,
                            Hour       = node.Hour,
                            Date       = node.Date
                        };

                        var       detailCrawler = new CrawlerClass(info.Link);
                        XDocument detailDoc     = detailCrawler.GetXDocument();

                        #region Get Body

                        // GetXDocument can return null (the listing fetch above is checked for it);
                        // the body then stays empty instead of a NullReferenceException aborting the run.
                        string body = "";
                        if (detailDoc != null)
                        {
                            body = (from item in detailDoc.Descendants(xmlns + "div")
                                    let cssClass = item.Attribute("class")
                                    where cssClass != null && cssClass.Value == "content"
                                    select item.Value).ElementAt(0);
                        }

                        // Cut the body at the first "//" unless it is the very first characters.
                        // Presumably this drops trailing script text - TODO confirm.
                        int cut = body.IndexOf("//", StringComparison.Ordinal);
                        if (cut > 0)
                        {
                            body = body.Substring(0, cut);
                        }
                        info.Body = body;

                        #endregion

                        AppEnv.Insert(info);

                        _logger.Debug("---------------------------------");
                        _logger.Debug("Title: " + info.Title);
                        _logger.Debug("Desc : " + info.Teaser);
                        _logger.Debug("Image: " + info.Image);
                        _logger.Debug("Link : " + info.Link);
                        _logger.Debug("Url  : " + info.CrawlerUrl);

                        #region Get relate News

                        // Every <li> below the item's <ul> lists that contains a link is a related article.
                        var relatedLinks = from item in node.RelatedLists.Descendants(xmlns + "li")
                                           let anchor = item.Element(xmlns + "a")
                                           where anchor != null
                                           select anchor;

                        foreach (var relatedLink in relatedLinks)
                        {
                            // The listing only offers a title and a link for related articles;
                            // the remaining fields start as placeholders and are filled from the article page.
                            var infoLI = new ContentInfo
                            {
                                Title      = relatedLink.Value,
                                Teaser     = "",
                                Image      = "",
                                Link       = record.HttpPrefix + relatedLink.Attribute("href").Value,
                                CategoryID = record.CategoryID,
                                CrawlerUrl = record.Url,
                                Hour       = "00:00",
                                Date       = ""
                            };

                            var       relatedCrawler = new CrawlerClass(infoLI.Link);
                            XDocument relatedDoc     = relatedCrawler.GetXDocument();

                            string bodyLI = "";

                            // Placeholders are kept when the related article page could not be loaded.
                            if (relatedDoc != null)
                            {
                                #region Get Hour and Date

                                // First span of the first div.meta is stored as the hour, second span as the date.
                                var metaSpans = (from item in relatedDoc.Descendants(xmlns + "div")
                                                 let cssClass = item.Attribute("class")
                                                 where cssClass != null && cssClass.Value == "meta"
                                                 select item).ElementAt(0).Elements(xmlns + "span");

                                infoLI.Hour = metaSpans.ElementAt(0).Value;
                                infoLI.Date = metaSpans.ElementAt(1).Value;

                                #endregion

                                #region Get full teaser

                                // The teaser belongs to the related article being built (infoLI),
                                // not to the parent item that was already inserted.
                                infoLI.Teaser = (from item in relatedDoc.Descendants(xmlns + "p")
                                                 let cssClass = item.Attribute("class")
                                                 where cssClass != null && cssClass.Value == "sapo"
                                                 select item.Value).ElementAt(0);

                                #endregion

                                #region Get Body

                                bodyLI = (from item in relatedDoc.Descendants(xmlns + "div")
                                          let cssClass = item.Attribute("class")
                                          where cssClass != null && cssClass.Value == "content"
                                          select item.Value).ElementAt(0);

                                #endregion
                            }

                            // Same "//" cut as for the main article body.
                            int cutLI = bodyLI.IndexOf("//", StringComparison.Ordinal);
                            if (cutLI > 0)
                            {
                                bodyLI = bodyLI.Substring(0, cutLI);
                            }
                            infoLI.Body = bodyLI;

                            AppEnv.Insert(infoLI);

                            // Log the related article that was just inserted.
                            _logger.Debug("---------------------------------");
                            _logger.Debug("Title: " + infoLI.Title);
                            _logger.Debug("Desc : " + infoLI.Teaser);
                            _logger.Debug("Image: " + infoLI.Image);
                            _logger.Debug("Link : " + infoLI.Link);
                            _logger.Debug("Url  : " + infoLI.CrawlerUrl);
                        }

                        #endregion
                    }
                }

                #endregion
            }
            catch (Exception ex)
            {
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + ex.Message);
                _logger.Debug("StackTrace : " + ex.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Crawls the listing page at <c>record.Url</c>, builds a <c>ContentInfo</c> for every
        /// <c>div</c> with class "DivImgSumSALEHNNP", reads the publication time and body from the
        /// linked article page and passes the item to <c>AppEnv.Insert</c>.
        /// </summary>
        /// <param name="record">
        /// Supplies the listing URL, the prefix that is joined with "/" to image and article links,
        /// and the category id copied onto each stored item.
        /// </param>
        /// <remarks>
        /// Elements are looked up without an XML namespace. Any exception is caught and written to
        /// the debug log and ends processing of the remaining listing items.
        /// </remarks>
        public static void Process(Record record)
        {
            try
            {
                #region Process

                string    xmlns = "";
                var       cl    = new CrawlerClass(record.Url);
                XDocument xdoc  = cl.GetXDocument();

                if (xdoc != null)
                {
                    var res = from item in xdoc.Descendants(xmlns + "div")
                              let cssClass = item.Attribute("class")
                              where cssClass != null && cssClass.Value == "DivImgSumSALEHNNP"
                              // First link wraps the thumbnail, second link carries the title text.
                              let anchors = item.Elements(xmlns + "a")
                              let imageLink = anchors.ElementAt(0)
                              select new
                              {
                                  Link  = imageLink.Attribute("href").Value,
                                  Image = imageLink.Element(xmlns + "img").Attribute("src").Value,
                                  Title = anchors.ElementAt(1).Value,
                                  Desc  = item.Value
                              };

                    foreach (var node in res)
                    {
                        // Hour and Date start as placeholders and are replaced from the article page.
                        var info = new ContentInfo
                        {
                            Title      = node.Title,
                            Teaser     = node.Desc,
                            Image      = record.HttpPrefix + "/" + node.Image,
                            Link       = record.HttpPrefix + "/" + node.Link,
                            CategoryID = record.CategoryID,
                            CrawlerUrl = record.Url,
                            Hour       = "00:00",
                            Date       = ""
                        };

                        var       detailCrawler = new CrawlerClass(info.Link);
                        XDocument detailDoc     = detailCrawler.GetXDocument();

                        string body = "";

                        // GetXDocument can return null (the listing fetch above is checked for it).
                        // The placeholders and an empty body are then kept instead of a
                        // NullReferenceException aborting the whole listing.
                        if (detailDoc != null)
                        {
                            #region Get Hour and Date

                            // Sample: "Thứ bẩy, ngày 19 tháng 03 năm 2011 cập nhật lúc 14:08"
                            // (weekday, then "day 19 month 03 year 2011 updated at 14:08").
                            string newDate = (from item in detailDoc.Descendants(xmlns + "span")
                                              let cssClass = item.Attribute("class")
                                              where cssClass != null && cssClass.Value == "News_adt_DatePl"
                                              select item.Value).ElementAt(0).Trim();

                            // Drop the weekday: keep only the text after the first comma.
                            newDate = newDate.Substring(newDate.IndexOf(',') + 1).Trim();

                            // Space-separated tokens of the sample: 1 = day, 3 = month, 5 = year, 9 = time.
                            string[] arr = newDate.Split(' ');

                            info.Hour = arr[9].Trim();
                            info.Date = arr[1].Trim() + "/" + arr[3].Trim() + "/" + arr[5].Trim();

                            #endregion

                            #region Get Body

                            body = (from item in detailDoc.Descendants(xmlns + "span")
                                    let cssClass = item.Attribute("class")
                                    where cssClass != null && cssClass.Value == "News_ArticleDetailContent"
                                    select item.Value).ElementAt(0);

                            #endregion
                        }

                        // Cut the body at the first "//" unless it is the very first characters.
                        // Presumably this drops trailing script text - TODO confirm.
                        int cut = body.IndexOf("//", StringComparison.Ordinal);
                        if (cut > 0)
                        {
                            body = body.Substring(0, cut);
                        }
                        info.Body = body;

                        AppEnv.Insert(info);

                        _logger.Debug("---------------------------------");
                        _logger.Debug("Title: " + info.Title);
                        _logger.Debug("Desc : " + info.Teaser);
                        _logger.Debug("Image: " + info.Image);
                        _logger.Debug("Link : " + info.Link);
                        _logger.Debug("Url  : " + info.CrawlerUrl);
                    }
                }

                #endregion
            }
            catch (Exception ex)
            {
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + ex.Message);
                _logger.Debug("StackTrace : " + ex.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Crawls the listing page at <c>record.Url</c>, builds a <c>ContentInfo</c> for every
        /// <c>td</c> with class "text" and colspan "2", reads the publication time and body from the
        /// linked article page and passes the item to <c>AppEnv.Insert</c>.
        /// </summary>
        /// <param name="record">
        /// Supplies the listing URL, the prefix prepended to image and article links and the
        /// category id copied onto each stored item.
        /// </param>
        /// <remarks>
        /// Elements are looked up without an XML namespace. Any exception is caught and written to
        /// the debug log and ends processing of the remaining listing items.
        /// </remarks>
        public static void Process(Record record)
        {
            try
            {
                #region Process

                string    xmlns = "";
                var       cl    = new CrawlerClass(record.Url);
                XDocument xdoc  = cl.GetXDocument();

                if (xdoc != null)
                {
                    var res = from item in xdoc.Descendants(xmlns + "td")
                              let cssClass = item.Attribute("class")
                              let colspan = item.Attribute("colspan")
                              where cssClass != null && cssClass.Value == "text" &&
                                    colspan != null && colspan.Value == "2"
                              // The span in the first paragraph holds the headline link and the teaser text.
                              let headline = item.Elements(xmlns + "p").ElementAt(0).Element(xmlns + "span")
                              let headlineLink = headline.Element(xmlns + "a")
                              // The thumbnail is the linked image in the first cell of the first nested table row.
                              let thumbCell = item.Elements(xmlns + "table").ElementAt(0)
                                                  .Element(xmlns + "tr")
                                                  .Elements(xmlns + "td").ElementAt(0)
                              select new
                              {
                                  Link  = headlineLink.Attribute("href").Value,
                                  Image = thumbCell.Element(xmlns + "a").Element(xmlns + "img").Attribute("src").Value,
                                  Title = headlineLink.Value,
                                  Desc  = headline.Value
                              };

                    foreach (var node in res)
                    {
                        // Strip tabs first, then search the stripped text, so the cut position refers
                        // to the string that is actually sliced. When there is no "<br>" marker the
                        // whole text is kept instead of losing its first three characters.
                        string teaser  = node.Desc.Replace("\t", "");
                        int    breakAt = teaser.IndexOf("<br>", StringComparison.Ordinal);
                        if (breakAt >= 0)
                        {
                            teaser = teaser.Substring(breakAt + 4);
                        }
                        teaser = teaser.Replace("<br>", "");

                        // Hour and Date start as placeholders and are replaced from the article page.
                        var info = new ContentInfo
                        {
                            Title      = node.Title.Replace("\t", ""),
                            Teaser     = teaser,
                            Image      = record.HttpPrefix + node.Image,
                            Link       = record.HttpPrefix + node.Link,
                            CategoryID = record.CategoryID,
                            CrawlerUrl = record.Url,
                            Hour       = "00:00",
                            Date       = ""
                        };

                        var       detailCrawler = new CrawlerClass(info.Link);
                        XDocument detailDoc     = detailCrawler.GetXDocument();

                        string body = "";

                        // GetXDocument can return null (the listing fetch above is checked for it).
                        // The placeholders and an empty body are then kept instead of a
                        // NullReferenceException aborting the whole listing.
                        if (detailDoc != null)
                        {
                            #region Get Hour and Date

                            // Sample: "Chủ nhật, 20/3/2011, 22:41 GMT+7" (weekday, date, time and zone).
                            string postedDate = (from item in detailDoc.Descendants(xmlns + "td")
                                                 let cssClass = item.Attribute("class")
                                                 where cssClass != null && cssClass.Value == "posted_date"
                                                 select item.Value).ElementAt(0).Trim();

                            // Comma-separated parts: 1 = date, 2 = time; only the first five
                            // characters of the trimmed time part are kept.
                            string[] arr = postedDate.Split(',');

                            info.Hour = arr[2].Trim().Substring(0, 5);
                            info.Date = arr[1].Trim();

                            #endregion

                            #region Get Body

                            body = (from item in detailDoc.Descendants(xmlns + "span")
                                    let cssClass = item.Attribute("class")
                                    where cssClass != null && cssClass.Value == "textbai"
                                    select item.Value).ElementAt(0);

                            #endregion
                        }

                        // Cut the body at the first "//" unless it is the very first characters.
                        // Presumably this drops trailing script text - TODO confirm.
                        int cut = body.IndexOf("//", StringComparison.Ordinal);
                        if (cut > 0)
                        {
                            body = body.Substring(0, cut);
                        }
                        info.Body = body;

                        AppEnv.Insert(info);

                        _logger.Debug("---------------------------------");
                        _logger.Debug("Title: " + info.Title);
                        _logger.Debug("Desc : " + info.Teaser);
                        _logger.Debug("Image: " + info.Image);
                        _logger.Debug("Link : " + info.Link);
                        _logger.Debug("Url  : " + info.CrawlerUrl);
                    }
                }

                #endregion
            }
            catch (Exception ex)
            {
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + ex.Message);
                _logger.Debug("StackTrace : " + ex.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
            }
        }