/// <summary>
/// Worker loop for detail pages. Until <c>IS_STOP</c> is set, each iteration dequeues one
/// link from <c>QueueDetailURL</c>, downloads the page, posts every configured video URL
/// found in it to <c>VIDEO_POST_URL</c>, extracts the configured fields, posts them as JSON
/// to <c>PostURL</c> (skipped when <c>Mode</c> is "TEST") and, when the API answers with an
/// insert/update success, requests a PDF capture of the page.
/// </summary>
/// <remarks>
/// Every <c>continue</c> that targets the outer <c>while</c> loop also skips the UPDATE-mode
/// throttle and the trailing <c>Thread.Sleep(THREAD_SLEEP)</c> of that iteration.
/// An exception that escapes the loop is reported on the screen console and ends the method.
/// </remarks>
protected override void ProcessParseLink()
{
    try
    {
        while (!IS_STOP)
        {
            int number = CounterManager.GetNumber();
            _removelinks_detail = this.RemoveLinks.ToString();
            string url = string.Empty;

            if (QueueDetailURL.Count > 0)
            {
                ConcreteLink objUrl;
                lock (QueueDetailURL)
                {
                    try
                    {
                        objUrl = QueueDetailURL.Dequeue();
                    }
                    catch
                    {
                        // Count was read outside the lock, so the queue may be empty by now.
                        continue;
                    }
                }

                string html = string.Empty;
                try
                {
                    if (string.IsNullOrEmpty(objUrl.href))
                    {
                        continue;
                    }
                }
                catch
                {
                    // Unusable link object: skip it.
                    continue;
                }

                // Links without a scheme are resolved against the spider's base URL.
                url = objUrl.href;
                if (!objUrl.href.Contains("https://") && !objUrl.href.Contains("http://"))
                {
                    url = this.BaseURL + objUrl.href;
                }

                try
                {
                    html = DownloadExpress.getResponseString(url, "");
                }
                catch (Exception _error)
                {
                    string _errorString = string.Format("{0}-ERROR:{1}-URLDownload:{2}", this.ThreadName, _error.ToString(), url);
                    SpiderArgs args = new SpiderArgs() { Message = _errorString };
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(args);
                }

                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                // Detail-link discovery is not performed here; only the page itself is parsed.

                var dict_post = new Dictionary<string, object>();
                dict_post.Add("source_id", this.SourceID);
                dict_post.Add("href", url);

                string content_filter = string.Empty;
                string content_khongdau = string.Empty;

                try
                {
                    // ------------------------------ video ------------------------------
                    try
                    {
                        foreach (var itemvideo in base.ConfigVideos.configvideos)
                        {
                            string videoPatternType = itemvideo.config.pattern_type;
                            if (videoPatternType != PatternType.XPATH.ToString())
                            {
                                // Only XPATH video configurations are handled.
                                continue;
                            }

                            foreach (var videoStep in itemvideo.config.step)
                            {
                                string video_url = XPATHHelpers.ParsingHTML(html, videoStep.Value.field_value);
                                int is_url_video_cache = 0;
                                int.TryParse(videoStep.Value.is_url_video_cache, out is_url_video_cache);

                                if (string.IsNullOrEmpty(video_url))
                                {
                                    continue;
                                }

                                // The two payload variants differ only in the cache URL and the status.
                                bool isCacheUrl = is_url_video_cache == 1;

                                var dict_video_post = new Dictionary<string, object>();
                                dict_video_post.Add("content_id", "");
                                dict_video_post.Add("source_id", this.SourceID);
                                dict_video_post.Add("video_url", video_url);
                                dict_video_post.Add("video_url_cache", isCacheUrl ? video_url : "empty");
                                dict_video_post.Add("status", isCacheUrl ? "NEED_CACHE" : "NEED_PARSING");
                                dict_video_post.Add("video_url_path", "empty");
                                dict_video_post.Add("status_download", "PREPARING");

                                string json_video_Content = JsonConvert.SerializeObject(dict_video_post, Formatting.Indented);
                                DownloadExpress.DownloadPost(this.VIDEO_POST_URL, json_video_Content);
                            }
                        }
                    }
                    catch
                    {
                        // Deliberately best-effort: a failure while handling videos must not
                        // prevent the page's fields from being parsed and posted.
                    }

                    // ------------------------------ fields -----------------------------
                    foreach (var itemfield in base.ConfigFields.configfields)
                    {
                        string field_name = itemfield.name;
                        string htmlParser = html;   // result of the most recent extraction step
                        string content_text = "";   // last non-blank result, possibly tag-stripped
                        string pattern_type = itemfield.config.pattern_type;
                        bool isXPath = pattern_type == PatternType.XPATH.ToString();

                        foreach (var config in itemfield.config.step)
                        {
                            int flagValue;

                            // A break_parsing flag that is blank, non-numeric or 0 means "off".
                            bool break_parsing = !string.IsNullOrWhiteSpace(config.Value.break_parsing)
                                && int.TryParse(config.Value.break_parsing, out flagValue)
                                && flagValue != 0;

                            if (isXPath)
                            {
                                // With break_parsing the step restarts from the full page;
                                // otherwise it continues from the previous step's output.
                                string xpathInput = break_parsing ? html : htmlParser;
                                htmlParser = XPATHHelpers.ParsingHTML(xpathInput, config.Value.xpath);
                            }
                            else
                            {
                                // String-between: with break_parsing the step searches the text
                                // collected so far, or the full page when nothing was collected.
                                string betweenInput;
                                if (!break_parsing)
                                {
                                    betweenInput = htmlParser;
                                }
                                else if (string.IsNullOrWhiteSpace(content_text))
                                {
                                    betweenInput = html;
                                }
                                else
                                {
                                    betweenInput = content_text;
                                }

                                htmlParser = StringHelpers.GetStringBetween(betweenInput, config.Value.start_pattern, config.Value.end_pattern);
                            }

                            if (!string.IsNullOrWhiteSpace(htmlParser))
                            {
                                content_text = htmlParser;
                            }

                            // remove_html == 1 replaces the collected text with a tag-free version.
                            bool remove_html = !string.IsNullOrWhiteSpace(config.Value.remove_html)
                                && int.TryParse(config.Value.remove_html, out flagValue)
                                && flagValue == 1;

                            if (remove_html && !string.IsNullOrWhiteSpace(htmlParser))
                            {
                                try
                                {
                                    content_text = HtmlUtility.RemoveAllTag(htmlParser);
                                }
                                catch
                                {
                                    // Best-effort: keep the unstripped text when tag removal fails.
                                }
                            }
                        }

                        if (string.IsNullOrWhiteSpace(content_text))
                        {
                            // Nothing extracted for this field. A blank tag_name also ends up
                            // here, so no default tag label is ever posted.
                            continue;
                        }

                        // Invariant lower-casing keeps the match independent of the thread culture.
                        string fieldKey = field_name.ToLowerInvariant();

                        if (fieldKey == "title")
                        {
                            dict_post.Add("title", content_text);
                            content_khongdau += StringHelpers.ConvertToKD(content_text) + "\n\n";
                        }
                        else if (fieldKey == "tag_name")
                        {
                            dict_post.Add("tag_name", content_text.Trim().Replace(" ", ""));
                        }
                        else if (fieldKey == "published_at")
                        {
                            // Dots are turned into slashes before the date converter runs.
                            content_text = content_text.Replace(".", "/");

                            bool checkDateTime = false;
                            DateTime _published_date = Utility.ConvertDateTimeStringForArchived(content_text, out checkDateTime).Date;
                            if (checkDateTime)
                            {
                                dict_post.Add("published_at", string.Format("{0}-{1}-{2}", _published_date.Year, _published_date.Month, _published_date.Day));
                                content_text = content_text.Trim().Replace(" ", "");
                                dict_post.Add("published_time", content_text);
                            }
                            else
                            {
                                // Only this field is dropped; the page is still posted without a date.
                                string message = string.Format("[{0}] ERROR: {1} , {2}, {3}", ThreadName, "NOT PARSING DATATIME", content_text, url);
                                SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                            }
                        }
                        else if (fieldKey == "content")
                        {
                            content_filter = HtmlUtility.RemoveAllTag(content_text);
                            content_khongdau += StringHelpers.ConvertToKD(content_text) + "\n\n";
                            dict_post.Add("content", content_text);
                        }
                    }

                    dict_post.Add("content_filter", content_filter);
                    dict_post.Add("content_khongdau", content_khongdau);
                    dict_post.Add("full_html", WebUtility.HtmlEncode(html));

                    string jsonContent = JsonConvert.SerializeObject(dict_post, Formatting.Indented);
                    if (!string.IsNullOrWhiteSpace(jsonContent))
                    {
                        string results = string.Empty;

                        if (this.Mode != "TEST") // outside TEST mode the page is sent to the API
                        {
                            results = DownloadExpress.DownloadPost(this.PostURL, jsonContent);
                            if (!string.IsNullOrEmpty(results))
                            {
                                try
                                {
                                    var _info = JsonConvert.DeserializeObject<Dictionary<string, string>>(results);
                                    string type = _info["type"];
                                    switch (type)
                                    {
                                        case "UPDATE_VERSION_SUCCESS":
                                        case "INSERT_SUCCESS":
                                            string content_id = _info["content_id"];
                                            if (type == "INSERT_SUCCESS")
                                            {
                                                this._BaseManagement.TotalInsertLink = this._BaseManagement.TotalInsertLink + 1;
                                            }
                                            else
                                            {
                                                this._BaseManagement.TotalUpdateLink = this._BaseManagement.TotalUpdateLink + 1;
                                            }
                                            CounterManager.ResetTryingCount();

                                            // Ask the capture service to render the stored page.
                                            // NOTE(review): a new client is created per page and never
                                            // closed. If this is a WCF proxy (confirm), it probably
                                            // needs Close()/Abort() handling.
                                            CaptureArchivied.PDFConverterServiceClient _captureService = new CaptureArchivied.PDFConverterServiceClient();
                                            CaptureArchivied.ConvertPDFRequest _pdfRequest = new CaptureArchivied.ConvertPDFRequest();
                                            _pdfRequest.contentId = content_id;
                                            _pdfRequest.pageURL = url;
                                            _pdfRequest.filterBy = this.chk_unique_css == 1 ? 0 : 1;
                                            _pdfRequest.filterText = this.filter_pdf;
                                            _pdfRequest.deleteItems = this.remove_filter_pdf;
                                            _captureService.ConvertToPDF(_pdfRequest);
                                            break;

                                        default:
                                            CounterManager.InCreaseTryingCount();
                                            break;
                                    }
                                }
                                catch (Exception error)
                                {
                                    string message = string.Format("[{0}] ERROR: {1} , {2}, {3}", ThreadName, error.Message, DateTime.Now, url);
                                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                                    CounterManager.InCreaseTryingCount();
                                    continue;
                                }
                            }
                            else
                            {
                                // Empty answer from the API counts as a failed attempt.
                                CounterManager.InCreaseTryingCount();
                            }
                        }

                        try
                        {
                            // Pages without a parsed title are still reported, with an empty
                            // title, so the progress event is no longer lost for them.
                            object titleValue;
                            string title = dict_post.TryGetValue("title", out titleValue) ? (string)titleValue : string.Empty;

                            SpiderArgs args = new SpiderArgs()
                            {
                                SourceName = this.ThreadName,
                                Index = number,
                                Href = url,
                                Title = title,
                                Message = results
                            };
                            SpiderSingletonEvent.Instance.OnSpiderProcessing(args);
                        }
                        catch
                        {
                            // Deliberately best-effort: a failing progress listener must not stop the crawl.
                        }
                    }
                }
                catch (Exception _errorParsing)
                {
                    string message = string.Format("[{0}][PROCESS-PARSING] ERROR: {1} , {2}, {3}", ThreadName, _errorParsing.Message, DateTime.Now, url);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                    CounterManager.InCreaseTryingCount();
                    continue;
                }
            }
            else
            {
                // Nothing queued: count the idle pass as a failed attempt.
                CounterManager.InCreaseTryingCount();
            }

            if (this.Mode == "UPDATE")
            {
                if (number >= 2000)
                {
                    // Large value written so the threshold check below can trigger the pause.
                    CounterManager.CurrentTryingCount = 100000;
                }

                if (CounterManager.CurrentTryingCount >= this.MaxTryingCount)
                {
                    string message = string.Format("[{0}] SLEEP FOR UPDATE AFTER {1} seconds, {2}", ThreadName, (this.UpdateSleep / 1000), DateTime.Now);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                    Thread.Sleep(this.UpdateSleep);

                    try
                    {
                        CounterManager.Reset(0, 100);
                    }
                    catch
                    {
                        // Retry the reset once; a second failure reaches the outer handler.
                        CounterManager.Reset(0, 100);
                    }
                }
            }

            Thread.Sleep(this.THREAD_SLEEP);
        }
    }
    catch (Exception ex)
    {
        string message = string.Format("[{0}] ERROR OUT OF THREAD: {1} , {2}", ThreadName, ex.Message, DateTime.Now);
        SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
    }
}
/// <summary>
/// Worker loop for detail pages. Until <c>IS_STOP</c> is set, each iteration dequeues one
/// link from <c>QueueDetailURL</c>, downloads the page, extracts the configured fields and
/// posts them as JSON to <c>PostURL</c> (skipped when <c>Mode</c> is "TEST"). When the API
/// answers with an insert/update success and the source is the hard-coded vnexpress source
/// id, a PDF capture of the page is requested.
/// </summary>
/// <remarks>
/// Every <c>continue</c> that targets the outer <c>while</c> loop also skips the UPDATE-mode
/// throttle and the trailing <c>Thread.Sleep(THREAD_SLEEP)</c> of that iteration.
/// An exception that escapes the loop is reported on the screen console and ends the method.
/// </remarks>
protected override void ProcessParseLink()
{
    try
    {
        while (!IS_STOP)
        {
            int number = CounterManager.GetNumber();
            _removelinks_detail = this.RemoveLinks.ToString();
            string url = string.Empty;

            if (QueueDetailURL.Count > 0)
            {
                ConcreteLink objUrl;
                lock (QueueDetailURL)
                {
                    try
                    {
                        objUrl = QueueDetailURL.Dequeue();
                    }
                    catch
                    {
                        // Count was read outside the lock, so the queue may be empty by now.
                        continue;
                    }
                }

                string html = string.Empty;
                try
                {
                    if (string.IsNullOrEmpty(objUrl.href))
                    {
                        continue;
                    }
                }
                catch
                {
                    // Unusable link object: skip it.
                    continue;
                }

                // Links without a scheme are resolved against the spider's base URL.
                url = objUrl.href;
                if (!objUrl.href.Contains("https://") && !objUrl.href.Contains("http://"))
                {
                    url = this.BaseURL + objUrl.href;
                }

                try
                {
                    html = DownloadExpress.getResponseString(url, "");
                }
                catch (Exception _error)
                {
                    string _errorString = string.Format("{0}-ERROR:{1}-URLDownload:{2}", this.ThreadName, _error.ToString(), url);
                    SpiderArgs args = new SpiderArgs() { Message = _errorString };
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(args);
                }

                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                // Detail-link discovery is not performed here; only the page itself is parsed.

                var dict_post = new Dictionary<string, object>();
                dict_post.Add("source_id", this.SourceID);
                dict_post.Add("href", url);

                string content_filter = string.Empty;
                string content_khongdau = string.Empty;

                try
                {
                    foreach (var itemfield in base.ConfigFields.configfields)
                    {
                        string field_name = itemfield.name;
                        string htmlParser = html;   // result of the most recent extraction step
                        string content_text = "";   // last non-blank result, possibly tag-stripped
                        string pattern_type = itemfield.config.pattern_type;
                        bool isXPath = pattern_type == PatternType.XPATH.ToString();

                        foreach (var config in itemfield.config.step)
                        {
                            int flagValue;

                            // A break_parsing flag that is blank, non-numeric or 0 means "off".
                            bool break_parsing = !string.IsNullOrWhiteSpace(config.Value.break_parsing)
                                && int.TryParse(config.Value.break_parsing, out flagValue)
                                && flagValue != 0;

                            if (isXPath)
                            {
                                // With break_parsing the step restarts from the full page;
                                // otherwise it continues from the previous step's output.
                                string xpathInput = break_parsing ? html : htmlParser;
                                htmlParser = XPATHHelpers.ParsingHTML(xpathInput, config.Value.xpath);
                            }
                            else
                            {
                                // String-between: with break_parsing the step searches the text
                                // collected so far, or the full page when nothing was collected.
                                string betweenInput;
                                if (!break_parsing)
                                {
                                    betweenInput = htmlParser;
                                }
                                else if (string.IsNullOrWhiteSpace(content_text))
                                {
                                    betweenInput = html;
                                }
                                else
                                {
                                    betweenInput = content_text;
                                }

                                htmlParser = StringHelpers.GetStringBetween(betweenInput, config.Value.start_pattern, config.Value.end_pattern);
                            }

                            if (!string.IsNullOrWhiteSpace(htmlParser))
                            {
                                content_text = htmlParser;
                            }

                            // remove_html == 1 replaces the collected text with a tag-free version.
                            bool remove_html = !string.IsNullOrWhiteSpace(config.Value.remove_html)
                                && int.TryParse(config.Value.remove_html, out flagValue)
                                && flagValue == 1;

                            if (remove_html && !string.IsNullOrWhiteSpace(htmlParser))
                            {
                                try
                                {
                                    content_text = HtmlUtility.RemoveAllTag(htmlParser);
                                }
                                catch
                                {
                                    // Best-effort: keep the unstripped text when tag removal fails.
                                }
                            }
                        }

                        if (string.IsNullOrWhiteSpace(content_text))
                        {
                            // Nothing extracted for this field. A blank tag_name also ends up
                            // here, so no default tag label is ever posted.
                            continue;
                        }

                        // Invariant lower-casing keeps the match independent of the thread culture.
                        string fieldKey = field_name.ToLowerInvariant();

                        if (fieldKey == "title")
                        {
                            dict_post.Add("title", content_text);
                            content_khongdau += StringHelpers.ConvertToKD(content_text) + "\n\n";
                        }
                        else if (fieldKey == "tag_name")
                        {
                            dict_post.Add("tag_name", content_text.Trim().Replace(" ", ""));
                        }
                        else if (fieldKey == "published_at")
                        {
                            bool checkDateTime = false;
                            DateTime _published_date = Utility.ConvertDateTimeStringForArchived(content_text, out checkDateTime).Date;
                            if (checkDateTime)
                            {
                                dict_post.Add("published_at", string.Format("{0}-{1}-{2}", _published_date.Year, _published_date.Month, _published_date.Day));
                                content_text = content_text.Trim().Replace(" ", "");
                                dict_post.Add("published_time", content_text);
                            }
                            else
                            {
                                // Only this field is dropped; the page is still posted without a date.
                                string message = string.Format("[{0}] ERROR: {1} , {2}, {3}", ThreadName, "NOT PARSING DATATIME", content_text, url);
                                SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                            }
                        }
                        else if (fieldKey == "content")
                        {
                            content_filter = HtmlUtility.RemoveAllTag(content_text);
                            content_khongdau += StringHelpers.ConvertToKD(content_text) + "\n\n";
                            dict_post.Add("content", content_text);
                        }
                    }

                    dict_post.Add("content_filter", content_filter);
                    dict_post.Add("content_khongdau", content_khongdau);
                    dict_post.Add("full_html", html);

                    string jsonContent = JsonConvert.SerializeObject(dict_post, Formatting.Indented);
                    if (!string.IsNullOrWhiteSpace(jsonContent))
                    {
                        string results = string.Empty;

                        if (this.Mode != "TEST") // outside TEST mode the page is sent to the API
                        {
                            results = DownloadExpress.DownloadPost(this.PostURL, jsonContent);
                            if (!string.IsNullOrEmpty(results))
                            {
                                try
                                {
                                    var _info = JsonConvert.DeserializeObject<Dictionary<string, string>>(results);
                                    string type = _info["type"];
                                    switch (type)
                                    {
                                        case "UPDATE_VERSION_SUCCESS":
                                        case "INSERT_SUCCESS":
                                            // The id is read before any counter changes, so an answer
                                            // without content_id is never counted as a success.
                                            string content_id = _info["content_id"];
                                            if (type == "INSERT_SUCCESS")
                                            {
                                                this._BaseManagement.TotalInsertLink = this._BaseManagement.TotalInsertLink + 1;
                                            }
                                            else
                                            {
                                                this._BaseManagement.TotalUpdateLink = this._BaseManagement.TotalUpdateLink + 1;
                                            }
                                            CounterManager.ResetTryingCount();

                                            if (this.SourceID == "58d0ee601d41c849a893a49d") // vnexpress
                                            {
                                                // NOTE(review): a new client is created per page and never
                                                // closed. If this is a WCF proxy (confirm), it probably
                                                // needs Close()/Abort() handling.
                                                CaptureArchivied.PDFConverterServiceClient _captureService = new CaptureArchivied.PDFConverterServiceClient();

                                                // The sohoa sub-site uses a different filter text
                                                // from the rest of the site.
                                                bool isSohoa = url.Contains("sohoa.vnexpress.net");

                                                CaptureArchivied.ConvertPDFRequest _pdfRequest = new CaptureArchivied.ConvertPDFRequest();
                                                _pdfRequest.contentId = content_id;
                                                _pdfRequest.pageURL = url;
                                                _pdfRequest.filterBy = 1;
                                                _pdfRequest.filterText = isSohoa ? "main_content_detail" : "block_col_480";
                                                _pdfRequest.deleteItems = "#box_tinlienquan,#box_comment,.banner_468,#social_like,.xemthem_new_ver,.banner_468x90,.block_goithutoasoan";
                                                _captureService.ConvertToPDF(_pdfRequest);
                                            }
                                            break;

                                        default:
                                            CounterManager.InCreaseTryingCount();
                                            break;
                                    }
                                }
                                catch (Exception error)
                                {
                                    string message = string.Format("[{0}] ERROR: {1} , {2}, {3}", ThreadName, error.Message, DateTime.Now, url);
                                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                                    CounterManager.InCreaseTryingCount();
                                    continue;
                                }
                            }
                            else
                            {
                                // Empty answer from the API counts as a failed attempt.
                                CounterManager.InCreaseTryingCount();
                            }
                        }

                        try
                        {
                            // Pages without a parsed title are still reported, with an empty
                            // title, so the progress event is no longer lost for them.
                            object titleValue;
                            string title = dict_post.TryGetValue("title", out titleValue) ? (string)titleValue : string.Empty;

                            SpiderArgs args = new SpiderArgs()
                            {
                                SourceName = this.ThreadName,
                                Index = number,
                                Href = url,
                                Title = title,
                                Message = results
                            };
                            SpiderSingletonEvent.Instance.OnSpiderProcessing(args);
                        }
                        catch
                        {
                            // Deliberately best-effort: a failing progress listener must not stop the crawl.
                        }
                    }
                }
                catch (Exception _errorParsing)
                {
                    string message = string.Format("[{0}][PROCESS-PARSING] ERROR: {1} , {2}, {3}", ThreadName, _errorParsing.Message, DateTime.Now, url);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                    CounterManager.InCreaseTryingCount();
                    continue;
                }
            }
            else
            {
                // Nothing queued: count the idle pass as a failed attempt.
                CounterManager.InCreaseTryingCount();
            }

            if (this.Mode == "UPDATE")
            {
                if (number >= 2000)
                {
                    // Large value written so the threshold check below can trigger the pause.
                    CounterManager.CurrentTryingCount = 100000;
                }

                if (CounterManager.CurrentTryingCount >= this.MaxTryingCount)
                {
                    string message = string.Format("[{0}] SLEEP FOR UPDATE AFTER {1} seconds, {2}", ThreadName, (this.UpdateSleep / 1000), DateTime.Now);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                    Thread.Sleep(this.UpdateSleep);

                    try
                    {
                        CounterManager.Reset(0, 100);
                    }
                    catch
                    {
                        // Retry the reset once; a second failure reaches the outer handler.
                        CounterManager.Reset(0, 100);
                    }
                }
            }

            Thread.Sleep(this.THREAD_SLEEP);
        }
    }
    catch (Exception ex)
    {
        string message = string.Format("[{0}] ERROR OUT OF THREAD: {1} , {2}", ThreadName, ex.Message, DateTime.Now);
        SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
    }
}