Example #1
0
        /// <summary>
        /// Crawls the listing page at <c>record.Url</c>, follows the article link of every
        /// <c>div class="clearfix"</c> teaser that has an <c>h4</c> or <c>h3</c> heading, and stores
        /// one <c>ContentInfo</c> per article through <c>AppEnv.Insert</c>.
        /// </summary>
        /// <param name="record">Crawl job; its Url, HttpPrefix and CategoryID are read here.</param>
        /// <remarks>
        /// Exceptions are logged instead of being rethrown. A problem with a single article
        /// (page not loaded, unexpected markup) skips that article only; the remaining
        /// articles of the listing are still processed.
        /// </remarks>
        public static void Process(Record record)
        {
            try
            {
                string    xmlns   = "{http://www.w3.org/1999/xhtml}";
                XDocument listing = new CrawlerClass(record.Url).GetXDocument();

                if (listing == null)
                {
                    return;
                }

                // Only filter here; the per-teaser field extraction happens inside the loop so
                // that malformed markup in one teaser cannot abort the whole enumeration.
                var teasers = from item in listing.Descendants(xmlns + "div")
                              where item.Attribute("class") != null &&
                                    item.Attribute("class").Value == "clearfix" &&
                                    (item.Element(xmlns + "h4") != null || item.Element(xmlns + "h3") != null)
                              select item;

                foreach (XElement teaser in teasers)
                {
                    string articleLink = null;

                    try
                    {
                        // The heading (h4 preferred, h3 otherwise) carries the link and the title.
                        XElement heading     = teaser.Element(xmlns + "h4") ?? teaser.Element(xmlns + "h3");
                        XElement titleAnchor = heading.Element(xmlns + "a");

                        // The thumbnail is optional: a direct <a> child wrapping an <img>.
                        XElement thumbAnchor = teaser.Element(xmlns + "a");
                        XElement thumb       = thumbAnchor != null ? thumbAnchor.Element(xmlns + "img") : null;

                        string link = record.HttpPrefix + titleAnchor.Attribute("href").Value;
                        articleLink = link;

                        // Hour and Date are not taken from the listing: they are always read
                        // from the article page below.
                        var info = new ContentInfo
                        {
                            Title      = titleAnchor.Value,
                            Teaser     = teaser.Elements(xmlns + "p").ElementAt(1).Value,
                            Image      = thumb != null ? thumb.Attribute("src").Value : "",
                            Link       = link,
                            CategoryID = record.CategoryID,
                            CrawlerUrl = record.Url
                        };

                        XDocument article = new CrawlerClass(link).GetXDocument();
                        if (article == null)
                        {
                            _logger.Debug("Skipped (article page not loaded): " + link);
                            continue;
                        }

                        #region Get Hour and Date

                        XElement timeNode = article.Descendants(xmlns + "p").FirstOrDefault(
                            p => p.Attribute("class") != null && p.Attribute("class").Value == "time");
                        if (timeNode == null)
                        {
                            _logger.Debug("Skipped (no time block): " + link);
                            continue;
                        }

                        // Everything after the first comma is split on spaces: first token is
                        // the date, second token is the hour.
                        // NOTE(review): the sample kept from the original code,
                        // "19/03/2011 - 12:56 AM", would yield "-" as the hour; confirm the
                        // real page format.
                        string stamp = timeNode.Value.Trim();
                        stamp = stamp.Substring(stamp.IndexOf(',') + 1).Trim();

                        string[] stampParts = stamp.Split(' ');
                        if (stampParts.Length < 2)
                        {
                            _logger.Debug("Skipped (unexpected time format '" + stamp + "'): " + link);
                            continue;
                        }

                        info.Hour = stampParts[1].Trim();
                        info.Date = stampParts[0].Trim();

                        #endregion

                        #region Get Body

                        // The article text is the SECOND div with class "content" on the page.
                        XElement bodyNode = article.Descendants(xmlns + "div")
                                                   .Where(d => d.Attribute("class") != null && d.Attribute("class").Value == "content")
                                                   .ElementAtOrDefault(1);
                        if (bodyNode == null)
                        {
                            _logger.Debug("Skipped (no content block): " + link);
                            continue;
                        }

                        // Cut the text at the first "//" unless it is at the very start.
                        string body = bodyNode.Value;
                        int    cut  = body.IndexOf("//");
                        if (cut > 0)
                        {
                            body = body.Substring(0, cut);
                        }
                        info.Body = body;

                        #endregion

                        AppEnv.Insert(info);

                        _logger.Debug("---------------------------------");
                        _logger.Debug("Title: " + info.Title);
                        _logger.Debug("Desc : " + info.Teaser);
                        _logger.Debug("Image: " + info.Image);
                        _logger.Debug("Link : " + info.Link);
                        _logger.Debug("Url  : " + info.CrawlerUrl);
                    }
                    catch (Exception ex)
                    {
                        // One broken article must not stop the rest of the listing.
                        _logger.Debug("----------------Error-----------------");
                        _logger.Debug("Message: " + ex.Message);
                        _logger.Debug("StackTrace : " + ex.StackTrace);
                        _logger.Debug("Category: " + record.CategoryID);
                        _logger.Debug("Link : " + (articleLink ?? record.Url));
                    }
                }
            }
            catch (Exception ex)
            {
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + ex.Message);
                _logger.Debug("StackTrace : " + ex.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
            }
        }
Example #2
0
        /// <summary>
        /// Crawls the listing page at <c>record.Url</c>, follows the article link of every
        /// <c>ul/li</c> entry that has at least two <c>div</c> children, and stores one
        /// <c>ContentInfo</c> per article through <c>AppEnv.Insert</c>.
        /// </summary>
        /// <param name="record">Crawl job; its Url, HttpPrefix and CategoryID are read here.</param>
        /// <remarks>
        /// Hour and Date are set to the crawl time, not read from the page.
        /// Exceptions are logged instead of being rethrown. A problem with a single article
        /// skips that article only; the remaining entries are still processed.
        /// </remarks>
        public static void Process(Record record)
        {
            try
            {
                string    xmlns   = "{http://www.w3.org/1999/xhtml}";
                XDocument listing = new CrawlerClass(record.Url).GetXDocument();

                if (listing == null)
                {
                    return;
                }

                // Only filter here; field extraction happens inside the loop so that malformed
                // markup in one entry cannot abort the whole enumeration.
                var entries = from item in listing.Descendants(xmlns + "ul").Elements(xmlns + "li")
                              where item.Elements(xmlns + "div").Count() > 1
                              select item;

                foreach (XElement entry in entries)
                {
                    string articleLink = null;

                    try
                    {
                        // First div: linked thumbnail. Second div: title and teaser paragraphs.
                        XElement thumbCell   = entry.Elements(xmlns + "div").ElementAt(0);
                        XElement textCell    = entry.Elements(xmlns + "div").ElementAt(1);
                        XElement thumbAnchor = thumbCell.Element(xmlns + "a");

                        string link = record.HttpPrefix + thumbAnchor.Attribute("href").Value;
                        articleLink = link;

                        // One clock reading per entry, so hour, minute and date always belong
                        // to the same instant.
                        DateTime now = DateTime.Now;

                        var info = new ContentInfo
                        {
                            Title      = textCell.Elements(xmlns + "p").ElementAt(0).Element(xmlns + "a").Value,
                            Teaser     = textCell.Elements(xmlns + "p").ElementAt(1).Value,
                            Image      = record.HttpPrefix + thumbAnchor.Element(xmlns + "img").Attribute("src").Value,
                            Link       = link,
                            CategoryID = record.CategoryID,
                            CrawlerUrl = record.Url,
                            // NOTE(review): not zero-padded (e.g. "9:5"); kept as in the
                            // original because consumers of this value are not visible here.
                            Hour       = now.Hour + ":" + now.Minute,
                            // Invariant culture keeps the literal '/' separators; with the
                            // current culture '/' is replaced by that culture's date separator.
                            Date       = now.ToString("dd/MM/yyyy", System.Globalization.CultureInfo.InvariantCulture)
                        };

                        #region Get Body

                        // An article page that could not be loaded is stored with an empty body.
                        string    body    = "";
                        XDocument article = new CrawlerClass(link).GetXDocument();
                        if (article != null)
                        {
                            XElement content = article.Descendants(xmlns + "div").FirstOrDefault(
                                d => d.Attribute("cpms_content") != null && d.Attribute("cpms_content").Value == "true");
                            if (content == null)
                            {
                                _logger.Debug("Skipped (no cpms_content block): " + link);
                                continue;
                            }

                            body = content.Value;
                        }

                        // Cut the text at the first "//" unless it is at the very start.
                        int cut = body.IndexOf("//");
                        if (cut > 0)
                        {
                            body = body.Substring(0, cut);
                        }
                        info.Body = body;

                        #endregion

                        AppEnv.Insert(info);

                        _logger.Debug("---------------------------------");
                        _logger.Debug("Title: " + info.Title);
                        _logger.Debug("Desc : " + info.Teaser);
                        _logger.Debug("Image: " + info.Image);
                        _logger.Debug("Link : " + info.Link);
                        _logger.Debug("Url  : " + info.CrawlerUrl);
                    }
                    catch (Exception ex)
                    {
                        // One broken entry must not stop the rest of the listing.
                        _logger.Debug("----------------Error-----------------");
                        _logger.Debug("Message: " + ex.Message);
                        _logger.Debug("StackTrace : " + ex.StackTrace);
                        _logger.Debug("Category: " + record.CategoryID);
                        _logger.Debug("Link : " + (articleLink ?? record.Url));
                    }
                }
            }
            catch (Exception ex)
            {
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + ex.Message);
                _logger.Debug("StackTrace : " + ex.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
            }
        }
Example #3
0
        /// <summary>
        /// Crawls the listing page at <c>record.Url</c>, follows the article link of every
        /// <c>div class="DivImgSumSALEHNNP"</c> entry, and stores one <c>ContentInfo</c> per
        /// article through <c>AppEnv.Insert</c>.
        /// </summary>
        /// <param name="record">Crawl job; its Url, HttpPrefix and CategoryID are read here.</param>
        /// <remarks>
        /// Element names are looked up without an XML namespace.
        /// Exceptions are logged instead of being rethrown. A problem with a single article
        /// (page not loaded, unexpected markup) skips that article only; the remaining
        /// entries are still processed.
        /// </remarks>
        public static void Process(Record record)
        {
            try
            {
                string    xmlns   = "";
                XDocument listing = new CrawlerClass(record.Url).GetXDocument();

                if (listing == null)
                {
                    return;
                }

                // Only filter here; field extraction happens inside the loop so that malformed
                // markup in one entry cannot abort the whole enumeration.
                var entries = from item in listing.Descendants(xmlns + "div")
                              where item.Attribute("class") != null &&
                                    item.Attribute("class").Value == "DivImgSumSALEHNNP"
                              select item;

                foreach (XElement entry in entries)
                {
                    string articleLink = null;

                    try
                    {
                        // First <a>: link plus thumbnail image. Second <a>: title text.
                        XElement firstAnchor = entry.Elements(xmlns + "a").ElementAt(0);

                        string link = record.HttpPrefix + "/" + firstAnchor.Attribute("href").Value;
                        articleLink = link;

                        // Hour and Date are always read from the article page below.
                        var info = new ContentInfo
                        {
                            Title      = entry.Elements(xmlns + "a").ElementAt(1).Value,
                            Teaser     = entry.Value,
                            Image      = record.HttpPrefix + "/" + firstAnchor.Element(xmlns + "img").Attribute("src").Value,
                            Link       = link,
                            CategoryID = record.CategoryID,
                            CrawlerUrl = record.Url
                        };

                        XDocument article = new CrawlerClass(link).GetXDocument();
                        if (article == null)
                        {
                            _logger.Debug("Skipped (article page not loaded): " + link);
                            continue;
                        }

                        #region Get Hour and Date

                        XElement dateNode = article.Descendants(xmlns + "span").FirstOrDefault(
                            s => s.Attribute("class") != null && s.Attribute("class").Value == "News_adt_DatePl");
                        if (dateNode == null)
                        {
                            _logger.Debug("Skipped (no date block): " + link);
                            continue;
                        }

                        // Sample text: "Thứ bẩy, ngày 19 tháng 03 năm 2011 cập nhật lúc 14:08"
                        // (weekday, "day 19 month 03 year 2011 updated at 14:08").
                        // After dropping the weekday up to the comma, the tokens are:
                        // [1] = day, [3] = month, [5] = year, [9] = time.
                        string stamp = dateNode.Value.Trim();
                        stamp = stamp.Substring(stamp.IndexOf(',') + 1).Trim();

                        // Split on any whitespace and drop empty tokens so that doubled spaces
                        // or line breaks do not shift the token positions.
                        string[] tokens = stamp.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
                        if (tokens.Length < 10)
                        {
                            _logger.Debug("Skipped (unexpected date format '" + stamp + "'): " + link);
                            continue;
                        }

                        info.Hour = tokens[9];
                        info.Date = tokens[1] + "/" + tokens[3] + "/" + tokens[5];

                        #endregion

                        #region Get Body

                        XElement bodyNode = article.Descendants(xmlns + "span").FirstOrDefault(
                            s => s.Attribute("class") != null && s.Attribute("class").Value == "News_ArticleDetailContent");
                        if (bodyNode == null)
                        {
                            _logger.Debug("Skipped (no content block): " + link);
                            continue;
                        }

                        // Cut the text at the first "//" unless it is at the very start.
                        string body = bodyNode.Value;
                        int    cut  = body.IndexOf("//");
                        if (cut > 0)
                        {
                            body = body.Substring(0, cut);
                        }
                        info.Body = body;

                        #endregion

                        AppEnv.Insert(info);

                        _logger.Debug("---------------------------------");
                        _logger.Debug("Title: " + info.Title);
                        _logger.Debug("Desc : " + info.Teaser);
                        _logger.Debug("Image: " + info.Image);
                        _logger.Debug("Link : " + info.Link);
                        _logger.Debug("Url  : " + info.CrawlerUrl);
                    }
                    catch (Exception ex)
                    {
                        // One broken entry must not stop the rest of the listing.
                        _logger.Debug("----------------Error-----------------");
                        _logger.Debug("Message: " + ex.Message);
                        _logger.Debug("StackTrace : " + ex.StackTrace);
                        _logger.Debug("Category: " + record.CategoryID);
                        _logger.Debug("Link : " + (articleLink ?? record.Url));
                    }
                }
            }
            catch (Exception ex)
            {
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + ex.Message);
                _logger.Debug("StackTrace : " + ex.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
            }
        }
Example #4
0
        //private static void GetTopNews(string html, FilterFullInfo record)
        //{
        //    try
        //    {
        //        long totalSize = 0;

        //        //string p = "<div class="folder-top">.*?<a.*?href="(?<Link>.*?)".*?><img.*?src="(?<ImagePath>.*?)".*?/></a>.*?<p><a.*?href=".*?".*?class="link-topnews">(?<Title>.*?)</a>.*?<label class="item-time">(?<Time>.*?)</label>.*?<label class="item-date">(?<Date>.*?)</label></p>.*?<p>(?<Teaser>.*?)</p>.*?</div>";
        //        MatchCollection mcList = Regex.Matches(html, record.ParternGetTopNews, RegexOptions.IgnoreCase | RegexOptions.Singleline);


        //    }
        //    catch (Exception ex)
        //    {
        //        _logger.Info(string.Format("Error = {0}", ex.Message) + Environment.NewLine);

        //    }

        //}

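        // (Translated from Vietnamese) Returns an array containing the links and thumbnail images.
        // NOTE(review): the method below returns a string, so this description looks outdated.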
        /// <summary>
        /// Mines one listing page: loads <c>record.Source</c>, locates the article entries with
        /// the XPath expressions carried by <paramref name="record"/>, fetches each article's
        /// detail page, stores the article through <c>AppEnv.Insert</c> and, when the
        /// <c>SaveImage</c> setting is 1, saves the article's avatar image to disk.
        /// </summary>
        /// <param name="record">Source description: listing URL plus the patterns and formats used to read it.</param>
        /// <returns>
        /// The inner HTML of the list container (after removing CR/LF pairs and the single
        /// space between adjacent tags), or an empty string when the container or the list
        /// entries cannot be found.
        /// </returns>
        /// <remarks>
        /// Articles whose publication date is before today are not inserted. An article whose
        /// date cannot be parsed, or whose insert fails, is logged and not inserted.
        /// </remarks>
        private static string StartMining(Record record)
        {
            long   listSize;
            string html = RunBrowser(record.Source, out listSize);

            HtmlDocument docList = new HtmlDocument();
            docList.LoadHtml(html);

            HtmlNode listRoot = docList.DocumentNode.SelectSingleNode(record.ListStartAfter);
            if (listRoot == null)
            {
                return("");
            }

            html = listRoot.InnerHtml;
            if (string.IsNullOrEmpty(html))
            {
                return("");
            }

            html = html.Replace("\r\n", "").Replace("> <", "><");

            HtmlDocument doc1 = new HtmlDocument();
            doc1.LoadHtml(html);

            HtmlNodeCollection nodesList = doc1.DocumentNode.SelectNodes(record.PartternNodeList);
            if (nodesList == null)
            {
                return("");
            }

            // Counts the entries that got past the skip checks. It lives outside the loop so
            // that only the first six such entries receive rank 2; declared inside the loop it
            // was reset for every entry and every article ended up with rank 2.
            int processedCount = 0;

            foreach (HtmlNode itemList in nodesList)
            {
                // Each entry is re-parsed on its own so the XPath patterns see only that entry.
                HtmlDocument docitemList = new HtmlDocument();
                docitemList.LoadHtml(itemList.InnerHtml);
                HtmlNode itemListNode = docitemList.DocumentNode;

                HtmlNode nodesImage = itemListNode.SelectSingleNode(record.PartternAvatar);
                string   imagePath  = nodesImage != null ? nodesImage.InnerHtml : "";

                HtmlNode nodesTitle = itemListNode.SelectSingleNode(record.PartternTitle);
                string   title      = nodesTitle != null ? nodesTitle.InnerText : "";

                // A link node without an href attribute yields an empty link.
                HtmlNode      nodesLink = itemListNode.SelectSingleNode(record.PartternLink);
                HtmlAttribute linkAtt   = nodesLink != null ? nodesLink.Attributes["href"] : null;
                string        link      = linkAtt != null ? linkAtt.Value : "";

                #region Get main content

                var contentInfo = new ContentInfo();

                contentInfo.Content_Headline = AppEnv.NCRToUnicode(Regex.Replace(title, "<.*?>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline));
                contentInfo.Content_Url      = OptimiseUrl(link, record);

                // Skip absolute links that point to another host (or cannot be parsed).
                // NOTE(review): only "http://" URLs are checked; "https://" links bypass this
                // test - confirm whether that is intended.
                if (contentInfo.Content_Url.StartsWith("http://"))
                {
                    try
                    {
                        Uri uri  = new Uri(contentInfo.Content_Url);
                        Uri uri1 = new Uri(record.Source);
                        if (uri.Host != uri1.Host)
                        {
                            continue;
                        }
                    }
                    catch (Exception)
                    {
                        continue;
                    }
                }

                long   detailSize;
                string contentDetail = GetContentDetail(contentInfo.Content_Url, record, out detailSize);
                if (contentDetail == null || contentDetail.Trim() == "")
                {
                    continue;
                }

                contentInfo.Content_Teaser = GetContentTeaser(contentDetail, record);

                // The result is not used here; the call is kept in case GetTag has side
                // effects - TODO confirm and remove if it is a pure function.
                GetTag(contentDetail, record);

                contentInfo.Content_Body = GetContentBody(contentDetail, record);

                string date = GetPubDateByParttern(contentDetail, record);
                contentInfo.Content_CreateDate = OpitmisePubDate(date, record.FormatDate);

                contentInfo.Content_Status     = 1;
                contentInfo.CategoryID         = record.CategoryID;
                contentInfo.Content_UserID     = 212;
                contentInfo.Content_HeadlineKD = UnicodeUtility.UnicodeToKoDau(contentInfo.Content_Headline);
                contentInfo.Content_TeaserKD   = UnicodeUtility.UnicodeToKoDau(contentInfo.Content_Teaser);
                contentInfo.Content_Source     = record.Page;

                // Reduce the avatar markup to the value of its src attribute (double- or
                // single-quoted).
                if (imagePath.IndexOf("src=\"") > -1)
                {
                    imagePath = imagePath.Substring(imagePath.IndexOf("src=\"") + 5);
                    imagePath = imagePath.Substring(0, imagePath.IndexOf("\""));
                }
                if (imagePath.IndexOf("src='") > -1)
                {
                    imagePath = imagePath.Substring(imagePath.IndexOf("src='") + 5);
                    imagePath = imagePath.Substring(0, imagePath.IndexOf("'"));
                }

                // One clock reading per entry: the image name, the virtual path, the physical
                // path and the date filter below all use the same instant, so the stored
                // avatar path and the folder on disk cannot disagree across midnight.
                DateTime now = DateTime.Now;

                var listImage = new List <ObjectImage>();

                if (Convert.ToInt32(AppEnv.GetSetting("SaveImage")) == 1)
                {
                    // Use the first image of the article body whose URL contains none of the
                    // excluded fragments; fall back to the listing avatar.
                    string imageDownloadUrl = "";

                    HtmlDocument doc = new HtmlDocument();
                    doc.LoadHtml(contentInfo.Content_Body);

                    HtmlNodeCollection bodyImages = doc.DocumentNode.SelectNodes("//img");
                    if (bodyImages != null)
                    {
                        string[] excluded = AppEnv.GetSetting("exceptionImg").Split(',');

                        foreach (HtmlNode img in bodyImages)
                        {
                            HtmlAttribute att = img.Attributes["src"];
                            if (att == null)
                            {
                                // <img> without a src attribute: nothing to download.
                                continue;
                            }

                            bool isExcluded = false;
                            foreach (string fragment in excluded)
                            {
                                if (fragment != "" && att.Value.Contains(fragment))
                                {
                                    isExcluded = true;
                                    break;
                                }
                            }

                            if (!isExcluded)
                            {
                                imageDownloadUrl = att.Value;
                                break;
                            }
                        }
                    }

                    if (imageDownloadUrl == "")
                    {
                        imageDownloadUrl = imagePath;
                    }

                    imageDownloadUrl = OptimiseUrl(imageDownloadUrl, record);
                    string imageName = now.ToString("yyyyMMddHHmmssffff") + ".jpg";

                    listImage.Add(new ObjectImage
                    {
                        ImageName = imageName,
                        Link      = imageDownloadUrl
                    });

                    contentInfo.Content_Avatar = AppEnv.GetSetting("virtualPath") + now.Year + "/" + now.Month + "/" + now.Day + "/" + imageName;
                }
                else
                {
                    contentInfo.Content_Avatar = record.HttpPrefix + imagePath;
                }
                contentInfo.Content_BigAvatar = "";

                contentInfo.Content_Rank = processedCount < 6 ? 2 : 1;
                contentInfo.IsPublished  = true;

                try
                {
                    DateTime dt = DateTime.Parse(contentInfo.Content_CreateDate, new CultureInfo("fr-FR", false));

                    // Skip anything published before today. Comparing whole dates (instead of
                    // year / month / day separately) keeps an article dated 1 January from
                    // being skipped when it is crawled on 31 December.
                    if (dt.Date < now.Date)
                    {
                        continue;
                    }

                    AppEnv.Insert(contentInfo);
                }
                catch (Exception ex)
                {
                    // Unparsable date or failed insert: the article is not stored. Processing
                    // continues below (including the image download), as it did before.
                    // NOTE(review): confirm whether the image should still be downloaded here.
                    _logger.Info(string.Format("Article not inserted, url = {0}, error = {1}", contentInfo.Content_Url, ex.Message) + Environment.NewLine);
                }

                #endregion

                processedCount++;

                #region Download Image

                string physicalPath = AppEnv.GetSetting("phyPath") + now.Year + "\\" + now.Month + "\\" + now.Day + "\\";

                foreach (ObjectImage item in listImage)
                {
                    // Dispose the downloaded image once it is saved; a null image (nothing
                    // downloaded) is accepted by the using statement.
                    using (Image image = DownloadImage(item.Link))
                    {
                        if (image != null)
                        {
                            if (!Directory.Exists(physicalPath))
                            {
                                Directory.CreateDirectory(physicalPath);
                            }

                            image.Save(physicalPath + item.ImageName, ImageFormat.Jpeg);
                        }
                    }
                }

                #endregion

                _logger.Info(string.Format("Finished mining with source = {0}", record.Source) + Environment.NewLine);
            }

            return(html);
        }
Example #5
0
        /// <summary>
        /// Crawls the listing page at <c>record.Url</c> and, for every <c>div</c> with class
        /// <c>"item clearfix"</c> whose first child <c>div</c> has class <c>"meta"</c>, builds a
        /// <c>ContentInfo</c>, loads the linked article page to extract its body, and hands the result
        /// to <c>AppEnv.Insert</c>. Links found in the item's <c>ul</c>/<c>li</c> children are treated
        /// as related articles and are crawled and inserted the same way.
        /// </summary>
        /// <param name="record">
        /// Crawl job description; this method reads <c>Url</c>, <c>HttpPrefix</c> and <c>CategoryID</c>.
        /// </param>
        /// <remarks>
        /// Exceptions are not propagated: anything thrown while crawling (for example a listing item or
        /// article page that does not have the expected structure) is logged through <c>_logger</c>
        /// and ends the crawl of this record.
        /// An article page for which <c>GetXDocument()</c> returns <c>null</c> is logged and skipped.
        /// </remarks>
        public static void Process(Record record)
        {
            try
            {
                #region Process

                string    xmlns = "{http://www.w3.org/1999/xhtml}";
                var       cl    = new CrawlerClass(record.Url);
                XDocument xdoc  = cl.GetXDocument();

                if (xdoc != null)
                {
                    // Expected item layout: div[0] = meta (date + hour), div[1] = thumbnail link,
                    // div[2] = title/description, ul = related links.
                    var res = from item in xdoc.Descendants(xmlns + "div")
                              where item.Attribute("class") != null &&
                              item.Attribute("class").Value == "item clearfix" &&
                              item.Elements(xmlns + "div").ElementAt(0).Attribute("class").Value == "meta"
                              select new
                    {
                        Link  = item.Elements(xmlns + "div").ElementAt(1).Element(xmlns + "a").Attribute("href").Value,
                        Image = item.Elements(xmlns + "div").ElementAt(1).Element(xmlns + "a").Element(xmlns + "img").Attribute("src").Value,
                        Title = item.Elements(xmlns + "div").ElementAt(2).Elements(xmlns + "div").ElementAt(0).Element(xmlns + "a").Value,
                        // The meta text is sliced positionally: first 10 characters = date, from index 11 = hour.
                        Hour  = item.Elements(xmlns + "div").ElementAt(0).Value.Substring(11).Trim(),
                        Date  = item.Elements(xmlns + "div").ElementAt(0).Value.Substring(0, 10).Trim(),
                        Desc  = item.Elements(xmlns + "div").ElementAt(2).Elements(xmlns + "div").ElementAt(0).Value,
                        _LI   = item.Elements(xmlns + "ul")
                    };
                    foreach (var node in res)
                    {
                        var info = new ContentInfo
                        {
                            Title      = node.Title,
                            Teaser     = node.Desc,
                            Image      = node.Image,
                            Link       = record.HttpPrefix + node.Link,
                            CategoryID = record.CategoryID,
                            CrawlerUrl = record.Url,
                            Hour       = node.Hour,
                            Date       = node.Date
                        };

                        // The article page gets its own local instead of overwriting the listing document.
                        XDocument articleDoc = new CrawlerClass(info.Link).GetXDocument();

                        if (articleDoc == null)
                        {
                            // Same null guard as for the listing page: skip this item, keep crawling the rest.
                            _logger.Debug("Skipped, article page could not be loaded: " + info.Link);
                            continue;
                        }

                        #region Get Body

                        var resBody = from item in articleDoc.Descendants(xmlns + "div")
                                      where item.Attribute("class") != null && item.Attribute("class").Value == "content"
                                      select new
                        {
                            Description = item.Value
                        };

                        string body = resBody.ElementAt(0).Description;

                        // Keep only the text before the first "//" (a match at index 0 is left alone).
                        int bodyCut = body.IndexOf("//");
                        if (bodyCut > 0)
                        {
                            body = body.Substring(0, bodyCut);
                        }
                        info.Body = body;

                        #endregion

                        AppEnv.Insert(info);

                        _logger.Debug("---------------------------------");
                        _logger.Debug("Title: " + info.Title);
                        _logger.Debug("Desc : " + info.Teaser);
                        _logger.Debug("Image: " + info.Image);
                        _logger.Debug("Link : " + info.Link);
                        _logger.Debug("Url  : " + info.CrawlerUrl);

                        #region Get relate News

                        var resLI = from item in node._LI.Descendants(xmlns + "li")
                                    where item.Element(xmlns + "a") != null
                                    select new
                        {
                            Link  = item.Element(xmlns + "a").Attribute("href").Value,
                            Image = "",
                            Title = item.Element(xmlns + "a").Value,
                            Hour  = "00:00",
                            Date  = "",
                            Desc  = ""
                        };
                        foreach (var nodeLI in resLI)
                        {
                            var infoLI = new ContentInfo
                            {
                                Title      = nodeLI.Title,
                                Teaser     = nodeLI.Desc,
                                Image      = nodeLI.Image,
                                Link       = record.HttpPrefix + nodeLI.Link,
                                CategoryID = record.CategoryID,
                                CrawlerUrl = record.Url,
                                Hour       = nodeLI.Hour,
                                Date       = nodeLI.Date
                            };

                            XDocument relatedDoc = new CrawlerClass(infoLI.Link).GetXDocument();

                            if (relatedDoc == null)
                            {
                                _logger.Debug("Skipped, related page could not be loaded: " + infoLI.Link);
                                continue;
                            }

                            #region Get Hour and Date

                            var resDate = from item in relatedDoc.Descendants(xmlns + "div")
                                          where
                                          item.Attribute("class") != null &&
                                          item.Attribute("class").Value == "meta"
                                          select new
                            {
                                Hour = item.Elements(xmlns + "span").ElementAt(0).Value,
                                Date = item.Elements(xmlns + "span").ElementAt(1).Value
                            };

                            // Evaluate the query once and reuse the result for both fields.
                            var meta = resDate.ElementAt(0);
                            infoLI.Hour = meta.Hour;
                            infoLI.Date = meta.Date;

                            #endregion

                            #region Get full teaser

                            var resTeaser = from item in relatedDoc.Descendants(xmlns + "p")
                                            where
                                            item.Attribute("class") != null &&
                                            item.Attribute("class").Value == "sapo"
                                            select new
                            {
                                Teaser = item.Value,
                            };

                            // The teaser belongs to the related article being built (infoLI),
                            // not to the parent item, which has already been inserted.
                            infoLI.Teaser = resTeaser.ElementAt(0).Teaser;

                            #endregion

                            #region Get Body

                            var resBodyLI = from item in relatedDoc.Descendants(xmlns + "div")
                                            where
                                            item.Attribute("class") != null &&
                                            item.Attribute("class").Value == "content"
                                            select new
                            {
                                Description = item.Value
                            };

                            string bodyLI = resBodyLI.ElementAt(0).Description;

                            int bodyLICut = bodyLI.IndexOf("//");
                            if (bodyLICut > 0)
                            {
                                bodyLI = bodyLI.Substring(0, bodyLICut);
                            }
                            infoLI.Body = bodyLI;

                            #endregion

                            AppEnv.Insert(infoLI);

                            // Log the related article that was just inserted.
                            _logger.Debug("---------------------------------");
                            _logger.Debug("Title: " + infoLI.Title);
                            _logger.Debug("Desc : " + infoLI.Teaser);
                            _logger.Debug("Image: " + infoLI.Image);
                            _logger.Debug("Link : " + infoLI.Link);
                            _logger.Debug("Url  : " + infoLI.CrawlerUrl);
                        }

                        #endregion
                    }
                }

                #endregion
            }
            catch (Exception ex)
            {
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + ex.Message);
                _logger.Debug("StackTrace : " + ex.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
            }
        }
Example #6
0
        /// <summary>
        /// Crawls the listing page at <c>record.Url</c> and, for every <c>td</c> with class
        /// <c>"text"</c> and <c>colspan="2"</c>, builds a <c>ContentInfo</c>, loads the linked article
        /// page to extract its posting date/hour and body, and hands the result to
        /// <c>AppEnv.Insert</c>.
        /// </summary>
        /// <param name="record">
        /// Crawl job description; this method reads <c>Url</c>, <c>HttpPrefix</c> and <c>CategoryID</c>.
        /// </param>
        /// <remarks>
        /// Exceptions are not propagated: anything thrown while crawling (for example a listing item or
        /// article page that does not have the expected structure) is logged through <c>_logger</c>
        /// and ends the crawl of this record.
        /// An article page for which <c>GetXDocument()</c> returns <c>null</c> is logged and skipped.
        /// </remarks>
        public static void Process(Record record)
        {
            try
            {
                #region Process

                // Elements on this site are looked up without an XML namespace.
                string    xmlns = "";
                var       cl    = new CrawlerClass(record.Url);
                XDocument xdoc  = cl.GetXDocument();

                if (xdoc != null)
                {
                    var res = from item in xdoc.Descendants(xmlns + "td")
                              where
                              item.Attribute("class") != null && item.Attribute("class").Value == "text" &&
                              item.Attribute("colspan") != null && item.Attribute("colspan").Value == "2"
                              select new
                    {
                        Link =
                            item.Elements(xmlns + "p").ElementAt(0).Element(xmlns + "span").Element(xmlns + "a").
                            Attribute("href").Value,
                        Image =
                            item.Elements(xmlns + "table").ElementAt(0).Element(xmlns + "tr").Elements(xmlns +
                                                                                                       "td").
                            ElementAt(0).Element(xmlns + "a").Element(xmlns + "img").Attribute("src").Value,
                        Title =
                            item.Elements(xmlns + "p").ElementAt(0).Element(xmlns + "span").Element(xmlns + "a").
                            Value,
                        Hour = "00:00",
                        Date = "",
                        Desc =
                            item.Elements(xmlns + "p").ElementAt(0).Element(xmlns + "span").Value
                    };
                    foreach (var node in res)
                    {
                        // Build the teaser: strip tabs, drop everything up to and including the first
                        // "<br>" marker, then remove any remaining markers. The marker is searched in the
                        // tab-stripped text so the index matches the string being cut, and the text is
                        // left whole when no marker is present.
                        string teaser  = node.Desc.Replace("\t", "");
                        int    brIndex = teaser.IndexOf("<br>");
                        if (brIndex >= 0)
                        {
                            teaser = teaser.Substring(brIndex + 4);
                        }
                        teaser = teaser.Replace("<br>", "");

                        var info = new ContentInfo
                        {
                            Title      = node.Title.Replace("\t", ""),
                            Teaser     = teaser,
                            Image      = record.HttpPrefix + node.Image,
                            Link       = record.HttpPrefix + node.Link,
                            CategoryID = record.CategoryID,
                            CrawlerUrl = record.Url,
                            Hour       = node.Hour,
                            Date       = node.Date
                        };

                        // The article page gets its own local instead of overwriting the listing document.
                        XDocument articleDoc = new CrawlerClass(info.Link).GetXDocument();

                        if (articleDoc == null)
                        {
                            // Same null guard as for the listing page: skip this item, keep crawling the rest.
                            _logger.Debug("Skipped, article page could not be loaded: " + info.Link);
                            continue;
                        }

                        #region Get Hour and Date

                        var resDate = from item in articleDoc.Descendants(xmlns + "td")
                                      where
                                      item.Attribute("class") != null &&
                                      item.Attribute("class").Value == "posted_date"
                                      select new
                        {
                            Date = item.Value,
                        };
                        // Sample value: "Chủ nhật, 20/3/2011, 22:41 GMT+7" ("Chủ nhật" = Sunday),
                        // i.e. "<weekday>, <date>, <HH:mm> <zone>".
                        string   newDate = resDate.ElementAt(0).Date.Trim();
                        string[] arr     = newDate.Split(',');

                        // Third field starts with the time; its first five characters are kept as the hour.
                        info.Hour = arr[2].Trim().Substring(0, 5);
                        info.Date = arr[1].Trim();

                        #endregion

                        #region Get Body

                        var resBody = from item in articleDoc.Descendants(xmlns + "span")
                                      where
                                      item.Attribute("class") != null && item.Attribute("class").Value == "textbai"
                                      select new
                        {
                            Description = item.Value
                        };

                        string body = resBody.ElementAt(0).Description;

                        // Keep only the text before the first "//" (a match at index 0 is left alone).
                        int bodyCut = body.IndexOf("//");
                        if (bodyCut > 0)
                        {
                            body = body.Substring(0, bodyCut);
                        }
                        info.Body = body;

                        #endregion

                        AppEnv.Insert(info);

                        _logger.Debug("---------------------------------");
                        _logger.Debug("Title: " + info.Title);
                        _logger.Debug("Desc : " + info.Teaser);
                        _logger.Debug("Image: " + info.Image);
                        _logger.Debug("Link : " + info.Link);
                        _logger.Debug("Url  : " + info.CrawlerUrl);
                    }
                }

                #endregion
            }
            catch (Exception ex)
            {
                _logger.Debug("----------------Error-----------------");
                _logger.Debug("Message: " + ex.Message);
                _logger.Debug("StackTrace : " + ex.StackTrace);
                _logger.Debug("Category: " + record.CategoryID);
                _logger.Debug("Link : " + record.Url);
            }
        }