コード例 #1
0
        /// <summary>
        /// Loads the page at <c>contentrevision.Content.Contents_URL</c> and fills in the revision's
        /// plain text, HTML and image source list from the last <c>div#copy_layer_1</c> node.
        /// </summary>
        /// <remarks>
        /// The revision is flagged as deprecated when the page shows the "deleted post" notice or
        /// when an <see cref="ArgumentNullException"/> is raised (presumably because the content
        /// node selection produced no result). Any other unexpected failure is retried a bounded
        /// number of times; after the last attempt the failure is logged to the console and the
        /// method returns without throwing.
        /// </remarks>
        /// <param name="contentrevision">Revision to populate.</param>
        protected override void ParseContent(ContentRevisionDTO contentrevision)
        {
            // Retrying through self-recursion had no upper bound, so a persistent failure ended in
            // a stack overflow. A fixed number of attempts keeps the retry but guarantees termination.
            const int maxAttempts = 3;

            for (var attempt = 1; attempt <= maxAttempts; attempt++)
            {
                try
                {
                    var loadedContent = webGetutf.Load(contentrevision.Content.Contents_URL);

                    // The site renders this notice in li.hx_cate for removed posts.
                    var checkifdeprecated = loadedContent.DocumentNode.SelectNodes("//li[@class = 'hx_cate']");
                    if (checkifdeprecated != null && checkifdeprecated.FirstOrDefault().InnerText.Trim() == "삭제된 글입니다.")
                    {
                        contentrevision.isDepricate = true;
                        return;
                    }

                    var articlecontent = loadedContent.DocumentNode.SelectNodes("//div[@id = 'copy_layer_1']").LastOrDefault();

                    contentrevision.Details      = articlecontent.InnerText.Trim();
                    // NOTE(review): the HTML is captured before the "guid" attributes are written to
                    // the <img> nodes below, so Details_Html does not contain them - confirm intended.
                    contentrevision.Details_Html = articlecontent.InnerHtml.Trim();
                    contentrevision.isDepricate  = false;

                    var imgnodes = articlecontent.SelectNodes(".//img");

                    contentrevision.SrcDatas = new List <SrcdataDTO>();

                    if (imgnodes == null)
                    {
                        return;
                    }

                    foreach (var img in imgnodes)
                    {
                        // "default" is not a valid absolute URI, so an <img> without src ends up in
                        // the UriFormatException handler below.
                        var srcurl  = new Uri(img.GetAttributeValue("src", "default"));
                        var srcdata = new SrcdataDTO
                        {
                            SourceUrl    = srcurl.AbsoluteUri,
                            IsDepricated = false,
                            FileName     = System.IO.Path.GetFileName(srcurl.LocalPath),
                            SrcGuId      = Guid.NewGuid(),
                        };
                        img.SetAttributeValue("guid", srcdata.SrcGuId.ToString());

                        contentrevision.SrcDatas.Add(srcdata);
                    }

                    return;
                }
                catch (ArgumentNullException)
                {
                    contentrevision.isDepricate = true;
                    return;
                }
                catch (UriFormatException)
                {
                    // A malformed image URL stops image collection; entries gathered so far are kept.
                    return;
                }
                catch (Exception e)
                {
                    if (attempt == maxAttempts)
                    {
                        Console.WriteLine("ParseContent gave up after " + maxAttempts + " attempts: " + e.Message);
                        return;
                    }

                    Console.WriteLine("ReStart");
                }
            }
        }
コード例 #2
0
        /// <summary>
        /// Loads the page at <c>contentrevision.Content.Contents_URL</c> and fills in the revision's
        /// plain text, HTML and image source list from the single <c>div.view_content</c> node.
        /// </summary>
        /// <remarks>
        /// The revision is flagged as deprecated when an <see cref="ArgumentNullException"/> is
        /// raised (presumably because the content node selection produced no result). Any other
        /// unexpected failure - including more than one matching content node - is retried a
        /// bounded number of times; after the last attempt the failure is logged to the console
        /// and the method returns without throwing.
        /// NOTE(review): no "post does not exist" notice is checked for this site; add detection
        /// here if deleted posts must be recognised explicitly.
        /// </remarks>
        /// <param name="contentrevision">Revision to populate.</param>
        protected override void ParseContent(ContentRevisionDTO contentrevision)
        {
            // Retrying through self-recursion had no upper bound, so a persistent failure ended in
            // a stack overflow. A fixed number of attempts keeps the retry but guarantees termination.
            const int maxAttempts = 3;

            for (var attempt = 1; attempt <= maxAttempts; attempt++)
            {
                try
                {
                    var loadedContent = webGetutf.Load(contentrevision.Content.Contents_URL);

                    var articlecontent = loadedContent.DocumentNode.SelectNodes("//div[@class = 'view_content']").SingleOrDefault();
                    contentrevision.Details      = articlecontent.InnerText.Trim();
                    // NOTE(review): the HTML is captured before the "guid" attributes are written to
                    // the <img> nodes below, so Details_Html does not contain them - confirm intended.
                    contentrevision.Details_Html = articlecontent.InnerHtml.Trim();
                    contentrevision.isDepricate  = false;

                    var imgnodes = articlecontent.SelectNodes(".//img");

                    contentrevision.SrcDatas = new List <SrcdataDTO>();
                    if (imgnodes == null)
                    {
                        return;
                    }

                    foreach (var img in imgnodes)
                    {
                        // "default" is not a valid absolute URI, so an <img> without src ends up in
                        // the UriFormatException handler below.
                        var srcurl  = new Uri(img.GetAttributeValue("src", "default"));
                        var srcdata = new SrcdataDTO
                        {
                            SourceUrl    = srcurl.AbsoluteUri,
                            IsDepricated = false,
                            FileName     = System.IO.Path.GetFileName(srcurl.LocalPath),
                            SrcGuId      = Guid.NewGuid(),
                        };
                        img.SetAttributeValue("guid", srcdata.SrcGuId.ToString());

                        contentrevision.SrcDatas.Add(srcdata);
                    }

                    return;
                }
                catch (ArgumentNullException)
                {
                    contentrevision.isDepricate = true;
                    return;
                }
                catch (UriFormatException)
                {
                    // A malformed image URL stops image collection; entries gathered so far are kept.
                    return;
                }
                catch (Exception e)
                {
                    if (attempt == maxAttempts)
                    {
                        Console.WriteLine("ParseContent gave up after " + maxAttempts + " attempts: " + e.Message);
                        return;
                    }

                    Console.WriteLine("ReStart");
                }
            }
        }
コード例 #3
0
        /// <summary>
        /// Loads the page at <c>contentrevision.Content.Contents_URL</c> and fills in the revision's
        /// plain text, HTML and image source list from the first <c>div#body_frame</c> node, after
        /// blanking the contents of every <c>style</c> element inside it.
        /// </summary>
        /// <remarks>
        /// The revision is flagged as deprecated when an <see cref="ArgumentNullException"/> is
        /// raised (presumably because the content node selection produced no result). Any other
        /// unexpected failure is retried a bounded number of times; after the last attempt the
        /// failure is logged to the console and the method returns without throwing.
        /// </remarks>
        /// <param name="contentrevision">Revision to populate.</param>
        protected override void ParseContent(ContentRevisionDTO contentrevision)
        {
            // Retrying through self-recursion had no upper bound, so a persistent failure ended in
            // a stack overflow. A fixed number of attempts keeps the retry but guarantees termination.
            const int maxAttempts = 3;

            for (var attempt = 1; attempt <= maxAttempts; attempt++)
            {
                try
                {
                    var loadedContent  = webGetutf.Load(contentrevision.Content.Contents_URL);
                    var articlecontent = loadedContent.DocumentNode.SelectNodes("//div[@id = 'body_frame']").FirstOrDefault();

                    // Snapshot the descendants first so that rewriting a node's markup cannot
                    // disturb the enumeration that is walking the same tree.
                    foreach (var ct in articlecontent.Descendants().ToList())
                    {
                        if (ct.Name == "style")
                        {
                            // Keeps CSS text out of Details / Details_Html.
                            ct.InnerHtml = "";
                        }
                    }

                    contentrevision.Details      = articlecontent.InnerText.Trim();
                    // NOTE(review): the HTML is captured before the "guid" attributes are written to
                    // the <img> nodes below, so Details_Html does not contain them - confirm intended.
                    contentrevision.Details_Html = articlecontent.InnerHtml.Trim();
                    contentrevision.isDepricate  = false;

                    // NOTE(review): "./img" selects only direct children of the content node, while
                    // nested images would need ".//img" - confirm which is wanted.
                    var imgnodes = articlecontent.SelectNodes("./img");

                    contentrevision.SrcDatas = new List <SrcdataDTO>();
                    if (imgnodes == null)
                    {
                        return;
                    }

                    foreach (var img in imgnodes)
                    {
                        // "default" is not a valid absolute URI, so an <img> without src ends up in
                        // the UriFormatException handler below.
                        var srcurl  = new Uri(img.GetAttributeValue("src", "default"));
                        var srcdata = new SrcdataDTO
                        {
                            SourceUrl    = srcurl.AbsoluteUri,
                            IsDepricated = false,
                            FileName     = System.IO.Path.GetFileName(srcurl.LocalPath),
                            SrcGuId      = Guid.NewGuid(),
                        };
                        img.SetAttributeValue("guid", srcdata.SrcGuId.ToString());

                        contentrevision.SrcDatas.Add(srcdata);
                    }

                    return;
                }
                catch (ArgumentNullException)
                {
                    contentrevision.isDepricate = true;
                    return;
                }
                catch (UriFormatException)
                {
                    // A malformed image URL stops image collection; entries gathered so far are kept.
                    return;
                }
                catch (Exception e)
                {
                    if (attempt == maxAttempts)
                    {
                        Console.WriteLine("ParseContent gave up after " + maxAttempts + " attempts: " + e.Message);
                        return;
                    }

                    Console.WriteLine("ReStart");
                }
            }
        }
コード例 #4
0
        /// <summary>
        /// Loads the page at <c>contentrevision.Content.Contents_URL</c> and fills in the revision's
        /// plain text, HTML and image source list from the first <c>div#pann-content</c> node.
        /// </summary>
        /// <remarks>
        /// The revision is flagged as deprecated when an <see cref="ArgumentNullException"/> is
        /// raised (presumably because the content node selection produced no result). A
        /// <see cref="WebException"/> is swallowed without retry. Any other unexpected failure is
        /// retried a bounded number of times; after the last attempt the failure is logged to the
        /// console and the method returns without throwing.
        /// </remarks>
        /// <param name="contentrevision">Revision to populate.</param>
        protected override void ParseContent(ContentRevisionDTO contentrevision)
        {
            // Retrying through self-recursion had no upper bound, so a persistent failure ended in
            // a stack overflow. A fixed number of attempts keeps the retry but guarantees termination.
            const int maxAttempts = 3;

            for (var attempt = 1; attempt <= maxAttempts; attempt++)
            {
                try
                {
                    var loadedContent = webGetutf.Load(contentrevision.Content.Contents_URL);

                    var articlecontent = loadedContent.DocumentNode.SelectNodes("//div[@id = 'pann-content']").FirstOrDefault();
                    contentrevision.Details      = articlecontent.InnerText.Trim();
                    // NOTE(review): the HTML is captured before the "guid" attributes are written to
                    // the <img> nodes below, so Details_Html does not contain them - confirm intended.
                    contentrevision.Details_Html = articlecontent.InnerHtml.Trim();
                    contentrevision.isDepricate  = false;

                    var imgnodes = articlecontent.SelectNodes(".//img");

                    contentrevision.SrcDatas = new List <SrcdataDTO>();
                    if (imgnodes == null)
                    {
                        return;
                    }

                    foreach (var img in imgnodes)
                    {
                        // "default" is not a valid absolute URI, so an <img> without src ends up in
                        // the UriFormatException handler below.
                        var srcurl  = new Uri(img.GetAttributeValue("src", "default"));
                        var srcdata = new SrcdataDTO
                        {
                            SourceUrl    = srcurl.AbsoluteUri,
                            IsDepricated = false,
                            FileName     = System.IO.Path.GetFileName(srcurl.LocalPath),
                            SrcGuId      = Guid.NewGuid(),
                        };
                        img.SetAttributeValue("guid", srcdata.SrcGuId.ToString());

                        contentrevision.SrcDatas.Add(srcdata);
                    }

                    return;
                }
                catch (ArgumentNullException)
                {
                    contentrevision.isDepricate = true;
                    return;
                }
                catch (UriFormatException)
                {
                    // A malformed image URL stops image collection; entries gathered so far are kept.
                    return;
                }
                catch (WebException wex)
                {
                    // Response is null when no HTTP response was received at all (DNS failure,
                    // timeout, ...); a direct cast followed by .StatusCode would then throw from
                    // inside this handler, so the response is probed with a null-safe cast.
                    var response = wex.Response as HttpWebResponse;
                    if (response != null && response.StatusCode == HttpStatusCode.NotFound)
                    {
                        // error 404, do what you need to do
                    }

                    // Network/HTTP failures are not retried.
                    return;
                }
                catch (Exception e)
                {
                    if (attempt == maxAttempts)
                    {
                        Console.WriteLine("ParseContent gave up after " + maxAttempts + " attempts: " + e.Message);
                        return;
                    }

                    Console.WriteLine("ReStart");
                }
            }
        }
コード例 #5
0
        /// <summary>
        /// Loads the page at <c>contentrevision.Content.Contents_URL</c> and fills in the revision's
        /// plain text, HTML and image source list from the single <c>div#DocContent</c> node.
        /// </summary>
        /// <remarks>
        /// The revision is flagged as deprecated when the page shows the "already deleted" notice
        /// or when an <see cref="ArgumentNullException"/> is raised (presumably because the content
        /// node selection produced no result). Any other unexpected failure - including more than
        /// one matching content node - is retried a bounded number of times; after the last attempt
        /// the failure is logged to the console and the method returns without throwing.
        /// </remarks>
        /// <param name="contentrevision">Revision to populate.</param>
        protected override void ParseContent(ContentRevisionDTO contentrevision)
        {
            // Retrying through self-recursion had no upper bound, so a persistent failure ended in
            // a stack overflow. A fixed number of attempts keeps the retry but guarantees termination.
            const int maxAttempts = 3;

            for (var attempt = 1; attempt <= maxAttempts; attempt++)
            {
                try
                {
                    var ruiwebContents = webGetkr.Load(contentrevision.Content.Contents_URL);

                    // The site renders this notice in td.te2 for removed posts.
                    var checkifdeprecated = ruiwebContents.DocumentNode.SelectNodes("//td[@class = 'te2']");
                    if (checkifdeprecated != null && checkifdeprecated.FirstOrDefault().InnerText.Trim() == "이미 삭제 된 게시글 입니다.")
                    {
                        contentrevision.isDepricate = true;
                        return;
                    }

                    var content = ruiwebContents.DocumentNode.SelectNodes("//div[@id = 'DocContent']").SingleOrDefault();
                    contentrevision.Details = content.InnerText.Trim();
                    // NOTE(review): the HTML is captured before the "guid" attributes are written to
                    // the <img> nodes below, so Details_Html does not contain them - confirm intended.
                    contentrevision.Details_Html = content.InnerHtml.Trim();
                    contentrevision.isDepricate = false;

                    var imgnodes = content.SelectNodes(".//img");

                    contentrevision.SrcDatas = new List<SrcdataDTO>();
                    if (imgnodes == null)
                    {
                        return;
                    }

                    foreach (var img in imgnodes)
                    {
                        // "default" is not a valid absolute URI, so an <img> without src ends up in
                        // the UriFormatException handler below.
                        var srcurl = new Uri(img.GetAttributeValue("src", "default"));
                        var srcdata = new SrcdataDTO
                        {
                            SourceUrl = srcurl.AbsoluteUri,
                            IsDepricated = false,
                            FileName = System.IO.Path.GetFileName(srcurl.LocalPath),
                            SrcGuId = Guid.NewGuid(),
                        };
                        img.SetAttributeValue("guid", srcdata.SrcGuId.ToString());

                        contentrevision.SrcDatas.Add(srcdata);
                    }

                    return;
                }
                catch (ArgumentNullException)
                {
                    contentrevision.isDepricate = true;
                    return;
                }
                catch (UriFormatException)
                {
                    // A malformed image URL stops image collection; entries gathered so far are kept.
                    return;
                }
                catch (Exception e)
                {
                    if (attempt == maxAttempts)
                    {
                        Console.WriteLine("ParseContent gave up after " + maxAttempts + " attempts: " + e.Message);
                        return;
                    }

                    Console.WriteLine("ReStart");
                }
            }
        }
コード例 #6
0
ファイル: commonCrawler.cs プロジェクト: radtek/Crawler.All
        /// <summary>
        /// Downloads the bytes of every entry in <c>ContentRevision.SrcDatas</c> in parallel and
        /// stores them, together with their length, on the entry itself.
        /// </summary>
        /// <remarks>
        /// Does nothing when <c>SrcDatas</c> is null. An entry whose URL cannot be built or whose
        /// download fails for any reason is flagged with <c>IsDepricated = true</c>; one failing
        /// entry does not affect the others and no exception is propagated for it.
        /// </remarks>
        /// <param name="ContentRevision">
        /// Revision whose source entries are fetched; its <c>Content.Contents_URL</c> is sent as
        /// the Referer header of each request.
        /// </param>
        protected void CacheImage(ContentRevisionDTO ContentRevision)
        {
            if (ContentRevision.SrcDatas == null)
            {
                return;
            }

            Parallel.ForEach(ContentRevision.SrcDatas, srcdata =>
            {
                try
                {
                    // Built inside the try so that a malformed or missing URL only invalidates this
                    // entry instead of escaping the parallel loop as an AggregateException.
                    var url = new Uri(HttpUtility.HtmlDecode(srcdata.SourceUrl));

                    // WebClient is disposable; release it as soon as the download finishes.
                    using (var client = new WebClient())
                    {
                        client.Headers.Add("Accept", @"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
                        client.Headers.Add("Referer", ContentRevision.Content.Contents_URL);
                        // NOTE(review): gzip/deflate is advertised here but nothing in this method
                        // decompresses the payload - confirm servers do not return compressed bytes.
                        client.Headers.Add("Accept-Encoding", @"gzip, deflate, sdch");
                        client.Headers.Add("Accept-Language", @"ko,en-US;q=0.8,en;q=0.6");
                        client.Headers.Add("User-Agent", webGetkr.UserAgent);
                        client.UseDefaultCredentials = true;

                        var data = client.DownloadData(url);

                        srcdata.OriginalPayload      = data;
                        srcdata.OriginalPayload_Size = data.LongLength;
                    }
                }
                catch (Exception)
                {
                    // Every failure (including ArgumentNullException, which used to trigger nested
                    // re-downloads whose own failures were unguarded) ends in the same state.
                    // NOTE(review): no error log is emitted here; hook one in if download failures
                    // need to be reported.
                    srcdata.IsDepricated = true;
                }
            });
        }
コード例 #7
0
ファイル: commonCrawler.cs プロジェクト: radtek/Crawler.All
 // Retrieves the ContentRevision's information such as Details and Details_Html.
 // Implemented per site by the concrete crawlers.
 protected abstract void ParseContent(ContentRevisionDTO contentrevision);