/// <summary>
/// Worker loop for detail pages. Until <c>IS_STOP</c> is set it takes one link from
/// <c>QueueDetailURL</c>, downloads the page, asks the local ContentHub service whether the
/// page (keyed by the MD5 of its URL) is already stored, extracts the listing fields with
/// fixed start/end markers, and posts the result as JSON to <c>PostURL</c>.
/// </summary>
/// <remarks>
/// Every path that abandons the current link uses <c>continue</c>, which also skips the
/// UPDATE-mode check and the <c>THREAD_SLEEP</c> pause at the bottom of the loop.
/// An exception that escapes the loop is reported to the console event and ends the method.
/// </remarks>
protected override void ProcessParseLink()
{
    // Endpoint that answers "True"/"False" for a given URL hash.
    const string checkExistUrlFormat = "http://localhost:9999/ContentHub/CheckExist?url_md5={0}";

    try
    {
        while (!IS_STOP)
        {
            int number = CounterManager.GetNumber();
            _removelinks_detail = this.RemoveLinks.ToString();
            string url = string.Empty;

            if (QueueDetailURL.Count > 0)
            {
                ConcreteLink objUrl;
                lock (QueueDetailURL)
                {
                    try
                    {
                        objUrl = QueueDetailURL.Dequeue();
                    }
                    catch
                    {
                        // Dequeue failed (e.g. the queue was drained after the Count check); try again.
                        continue;
                    }
                }

                string html = string.Empty;
                try
                {
                    if (string.IsNullOrEmpty(objUrl.href))
                    {
                        continue;
                    }
                }
                catch
                {
                    // A missing link object is treated the same as an empty href.
                    continue;
                }

                // Links without a scheme are taken to be relative to the site root.
                url = objUrl.href;
                if (!objUrl.href.Contains("https://") && !objUrl.href.Contains("http://"))
                {
                    url = this.BaseURL + objUrl.href;
                }

                try
                {
                    html = DownloadExpress.getResponseString(url, "");
                }
                catch (Exception _error)
                {
                    string _errorString = string.Format("{0}-ERROR:{1}-URLDownload:{2}", this.ThreadName, _error.ToString(), url);
                    SpiderArgs args = new SpiderArgs() { Message = _errorString };
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(args);
                }

                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                string url_md5 = StringHelpers.MD5Hash(url);

                // Skip pages the content hub already knows about.
                try
                {
                    string isExist = DownloadExpress.getResponseString(string.Format(checkExistUrlFormat, url_md5), "");
                    bool _isExist;
                    // TryParse: a non-boolean response simply means "not known to exist".
                    if (bool.TryParse(isExist, out _isExist) && _isExist)
                    {
                        CounterManager.InCreaseTryingCount();
                        continue;
                    }
                }
                catch (Exception _error)
                {
                    // Best effort: when the check itself fails the page is still processed,
                    // but the failure is reported instead of being dropped silently.
                    string checkMessage = string.Format("[{0}] ERROR: {1} , {2}, {3}", ThreadName, _error.Message, DateTime.Now, url);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = checkMessage });
                }

                var dict_post = new Dictionary<string, object>();
                dict_post.Add("UrlMd5", url_md5);
                dict_post.Add("Url", url);

                try
                {
                    // A page without a title is not a listing; drop it.
                    string title = StringHelpers.GetStringBetween(html, "<h1><p class=\"title font_28\">", "</p></h1>");
                    if (string.IsNullOrEmpty(title))
                    {
                        continue;
                    }
                    dict_post.Add("Title", title);

                    string Price = StringHelpers.GetStringBetween(html, "<li class=\"li_100 clear icon_bds font_14 roboto_regular\">Giá : <span class=\"font_14 roboto_bold\">", "</span></li>");
                    dict_post.Add("Price", Price);

                    string Acreage = StringHelpers.GetStringBetween(html, "<li class=\"li_100 clear icon_bds font_14 roboto_regular\">Diện tích : <span class=\"font_14 roboto_bold\">", "<sup>2</sup></span></li>");
                    dict_post.Add("Acreage", Acreage);

                    string Address = StringHelpers.GetStringBetween(html, "<span class=\"roboto_bold font_14 cl_333 dc_new icon_bds\">Địa chỉ: </span>", "<span");
                    dict_post.Add("Address", Address);

                    string Room = StringHelpers.GetStringBetween(html, "<li class=\"li_100 clear icon_bds font_14 roboto_regular\">Có <span class=\"font_14 roboto_bold\">", "</span></li>");
                    dict_post.Add("Room", Room);

                    string Phone = StringHelpers.GetStringBetween(html, "<p id=\"mobile_show\" class=\"mobile_hide show_mobile\" style=\"*display: none; \"><span style=\"*padding-left: 10px\">", "</span></p>");
                    dict_post.Add("Phone", Phone);

                    string Content = StringHelpers.GetStringBetween(html, "<div class=\"info_text\">", "<div class=\"info_box\">");
                    Content = HtmlUtility.RemoveAllTag(Content);
                    dict_post.Add("Content", Content);

                    dict_post.Add("Location", "Ha Noi");
                    dict_post.Add("Source", "Rongbay");

                    string DatePost = StringHelpers.GetStringBetween(html, "Thời gian đăng : <span class=\"color cl_888 font_13\">", " -");
                    dict_post.Add("DatePost", DatePost);

                    string CategoryName = StringHelpers.GetStringBetween(html, "<span class=\"nameScate\">", "</span>");
                    CategoryName = HtmlUtility.RemoveAllTag(CategoryName);
                    dict_post.Add("CategoryName", CategoryName);

                    // The raw page is posted alongside the extracted fields.
                    dict_post.Add("Metadata", html);

                    string jsonContent = JsonConvert.SerializeObject(dict_post, Formatting.Indented);
                    if (!string.IsNullOrWhiteSpace(jsonContent))
                    {
                        string results = string.Empty;

                        // In TEST mode nothing is sent to the API; the processing event is still raised.
                        if (this.Mode != "TEST")
                        {
                            results = DownloadExpress.DownloadPost(this.PostURL, jsonContent);
                            if (!string.IsNullOrEmpty(results))
                            {
                                try
                                {
                                    var _info = JsonConvert.DeserializeObject<ContentHub>(results);
                                    // A positive Id is taken as a successful insert.
                                    if (_info.Id > 0)
                                    {
                                        this._BaseManagement.TotalInsertLink = this._BaseManagement.TotalInsertLink + 1;
                                        CounterManager.ResetTryingCount();
                                    }
                                }
                                catch (Exception error)
                                {
                                    string message = string.Format("[{0}] ERROR: {1} , {2}, {3}", ThreadName, error.Message, DateTime.Now, url);
                                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                                    CounterManager.InCreaseTryingCount();
                                    continue;
                                }
                            }
                            else
                            {
                                // Empty response: counted as a failed attempt.
                                CounterManager.InCreaseTryingCount();
                            }
                        }

                        try
                        {
                            SpiderArgs args = new SpiderArgs()
                            {
                                SourceName = this.ThreadName,
                                Index = number,
                                Href = url,
                                Title = (string)dict_post["Title"],
                                Message = results
                            };
                            SpiderSingletonEvent.Instance.OnSpiderProcessing(args);
                        }
                        catch
                        {
                            // A failing progress subscriber must not stop the crawl.
                        }
                    }
                }
                catch (Exception _errorParsing)
                {
                    string message = string.Format("[{0}][PROCESS-PARSING] ERROR: {1} , {2}, {3}", ThreadName, _errorParsing.Message, DateTime.Now, url);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                    CounterManager.InCreaseTryingCount();
                    continue;
                }
            }
            else
            {
                // Nothing queued: counts towards the UPDATE-mode sleep threshold.
                CounterManager.InCreaseTryingCount();
            }

            if (this.Mode == "UPDATE")
            {
                // After 2000 processed items the trying counter is forced past the threshold.
                if (number >= 2000)
                {
                    CounterManager.CurrentTryingCount = 100000;
                }

                if (CounterManager.CurrentTryingCount >= this.MaxTryingCount)
                {
                    string message = string.Format("[{0}] SLEEP FOR UPDATE AFTER {1} seconds, {2}", ThreadName, (this.UpdateSleep / 1000), DateTime.Now);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                    Thread.Sleep(this.UpdateSleep);

                    try
                    {
                        CounterManager.Reset(0, 100);
                    }
                    catch
                    {
                        // Retry the reset once; a second failure ends the loop via the outer catch.
                        CounterManager.Reset(0, 100);
                    }
                }
            }

            Thread.Sleep(this.THREAD_SLEEP);
        }
    }
    catch (Exception ex)
    {
        string message = string.Format("[{0}] ERROR OUT OF THREAD: {1} , {2}", ThreadName, ex.Message.ToString(), DateTime.Now);
        SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
    }
}
/// <summary>
/// Worker loop for listing pages. Until <c>IS_STOP</c> is set it takes one link from
/// <c>FrontierURL</c>, downloads the page (retrying once on failure), and runs every configured
/// link pattern against it: SUB_LINK matches go back into <c>FrontierURL</c>, DETAIL_LINK matches
/// go into <c>QueueDetailURL</c>. Only URLs accepted by <c>IsNewUrl</c> are queued.
/// </summary>
/// <remarks>
/// In UPDATE mode, once the trying counter reaches <c>MaxTryingCount</c>, the method raises the
/// reload event, sleeps for <c>UpdateSleep</c>, then clears the totals, the URL storage and the counter.
/// An exception that escapes the loop is reported to the console event and ends the method.
/// </remarks>
protected override void ProcessFindLink()
{
    try
    {
        while (!IS_STOP)
        {
            _removelinks = this.RemoveLinks.ToString();
            string url = string.Empty;

            if (FrontierURL.Count > 0)
            {
                ConcreteLink objUrl;
                lock (FrontierURL)
                {
                    try
                    {
                        objUrl = FrontierURL.Dequeue();
                    }
                    catch
                    {
                        // Dequeue failed (e.g. the queue was drained after the Count check); try again.
                        continue;
                    }
                }

                string html = string.Empty;
                if (string.IsNullOrEmpty(objUrl.href))
                {
                    continue;
                }

                // Links without a scheme are taken to be relative to the site root.
                url = objUrl.href;
                if (!objUrl.href.Contains("https://") && !objUrl.href.Contains("http://"))
                {
                    url = this.BaseURL + objUrl.href;
                }

                // Filled by the download call; used below to absolutize the links found on the page.
                string baseURL = string.Empty;
                try
                {
                    html = DownloadExpress.getResponseString(url, "", out baseURL);
                }
                catch (Exception _error)
                {
                    string _errorString = string.Format("{0}-ERROR:{1}-URLDownload:{2}", this.ThreadName, _error.ToString(), url);
                    SpiderArgs args = new SpiderArgs() { Message = _errorString };
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(args);

                    // One retry before giving up on this link.
                    try
                    {
                        html = DownloadExpress.getResponseString(url, "", out baseURL);
                    }
                    catch (Exception ex)
                    {
                        _errorString = string.Format("{0}-ERROR:{1}-URLDownloadBySocket:{2}", this.ThreadName, ex.ToString(), url);
                        args = new SpiderArgs() { Message = _errorString };
                        SpiderSingletonEvent.Instance.OnSpiderScreenConsole(args);
                        continue;
                    }
                }

                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                foreach (var item in base.ConfigLinks.configlinks)
                {
                    // _removelinks is passed through unchanged; an empty value reaches the finder as "".
                    FindLinkBase findLink = null;
                    if (item.pattern_type == "REGEX")
                    {
                        findLink = new FindLinkByRegex(this.BaseURL, item.url_pattern, _removelinks);
                    }
                    else if (item.pattern_type == "XPATH")
                    {
                        findLink = new FindLinkByXPath(this.BaseURL, item.url_pattern, _removelinks);
                    }

                    if (findLink == null)
                    {
                        continue;
                    }

                    if (item.link_type == "SUB_LINK")
                    {
                        List<string> listLinkSub = findLink.FindLink(html);
                        if (listLinkSub == null || listLinkSub.Count == 0)
                        {
                            continue;
                        }

                        foreach (string linkSub in listLinkSub)
                        {
                            string _link = linkSub;
                            if (!_link.Contains("http://") && !_link.Contains("https://"))
                            {
                                _link = baseURL + linkSub;
                            }

                            // IsNewUrl takes the URL by ref; the value it leaves behind is what gets queued.
                            string _url = _link;
                            if (IsNewUrl(ref _url))
                            {
                                ConcreteLink _concreteLink = new ConcreteLink();
                                _concreteLink.link_type = "SUB_LINK";
                                _concreteLink.href = _url;

                                // Same lock object as the dequeue above, so enqueue and dequeue never interleave.
                                lock (FrontierURL)
                                {
                                    FrontierURL.Enqueue(_concreteLink);
                                }

                                SpiderArgs args = new SpiderArgs() { Message = string.Format("{0}-SUB-{1}", _url, this.ThreadName) };
                                SpiderSingletonEvent.Instance.OnSpiderInformation(args);
                            }
                        }
                    }
                    else if (item.link_type == "DETAIL_LINK")
                    {
                        List<string> linkDetail = findLink.FindLink(html);
                        if (linkDetail == null || linkDetail.Count == 0)
                        {
                            continue;
                        }

                        foreach (string href in linkDetail)
                        {
                            string _link = href;
                            if (!_link.Contains("http://") && !_link.Contains("https://"))
                            {
                                _link = baseURL + href;
                            }

                            string _url = _link;
                            if (IsNewUrl(ref _url))
                            {
                                ConcreteLink _concreteLink = new ConcreteLink();
                                _concreteLink.link_type = "DETAIL_LINK";
                                _concreteLink.href = _url;

                                // ProcessParseLink dequeues under lock (QueueDetailURL); take the same lock here.
                                lock (QueueDetailURL)
                                {
                                    QueueDetailURL.Enqueue(_concreteLink);
                                }
                            }
                        }
                    }
                }
            }

            if (this.Mode == "UPDATE")
            {
                if (CounterManager.CurrentTryingCount >= this.MaxTryingCount)
                {
                    string message = string.Format("[{0}] SLEEP FOR UPDATE AFTER {1} seconds, {2}", ThreadName, (this.UpdateSleep / 1000), DateTime.Now);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                    SpiderSingletonEvent.Instance.OnSpiderReloadForUpdate(new BlankSpider.Spider.Events.SpiderArgs() { Message = string.Format("{0} RELOAD FOR UPDATE \r\n", ThreadName) });
                    Thread.Sleep(this.UpdateSleep);

                    try
                    {
                        this._BaseManagement.TotalUpdateLink = 0;
                        this._BaseManagement.TotalInsertLink = 0;
                        UrlStoragClear();
                        CounterManager.Reset(0, 100);
                    }
                    catch
                    {
                        // Retry the whole reset once; a second failure ends the loop via the outer catch.
                        this._BaseManagement.TotalUpdateLink = 0;
                        this._BaseManagement.TotalInsertLink = 0;
                        UrlStoragClear();
                        CounterManager.Reset(0, 100);
                    }
                }
            }

            Thread.Sleep(this.THREAD_SLEEP);
        }
    }
    catch (Exception ex)
    {
        // Report the failure the same way ProcessParseLink does instead of discarding it.
        string message = string.Format("[{0}] ERROR OUT OF THREAD: {1} , {2}", ThreadName, ex.Message.ToString(), DateTime.Now);
        SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
    }
}
/// <summary>
/// Config-driven worker loop for detail pages. Until <c>IS_STOP</c> is set it takes one link from
/// <c>QueueDetailURL</c>, downloads the page, posts any configured video URLs to
/// <c>VIDEO_POST_URL</c>, extracts the configured fields (XPath or start/end-marker steps),
/// posts the result as JSON to <c>PostURL</c> and, on an insert/update success response,
/// requests a PDF capture of the page.
/// </summary>
/// <remarks>
/// Every path that abandons the current link uses <c>continue</c>, which also skips the
/// UPDATE-mode check and the <c>THREAD_SLEEP</c> pause at the bottom of the loop.
/// An exception that escapes the loop is reported to the console event and ends the method.
/// </remarks>
protected override void ProcessParseLink()
{
    try
    {
        while (!IS_STOP)
        {
            int number = CounterManager.GetNumber();
            _removelinks_detail = this.RemoveLinks.ToString();
            string url = string.Empty;

            if (QueueDetailURL.Count > 0)
            {
                ConcreteLink objUrl;
                lock (QueueDetailURL)
                {
                    try
                    {
                        objUrl = QueueDetailURL.Dequeue();
                    }
                    catch
                    {
                        // Dequeue failed (e.g. the queue was drained after the Count check); try again.
                        continue;
                    }
                }

                string html = string.Empty;
                try
                {
                    if (string.IsNullOrEmpty(objUrl.href))
                    {
                        continue;
                    }
                }
                catch
                {
                    // A missing link object is treated the same as an empty href.
                    continue;
                }

                // Links without a scheme are taken to be relative to the site root.
                url = objUrl.href;
                if (!objUrl.href.Contains("https://") && !objUrl.href.Contains("http://"))
                {
                    url = this.BaseURL + objUrl.href;
                }

                try
                {
                    html = DownloadExpress.getResponseString(url, "");
                }
                catch (Exception _error)
                {
                    string _errorString = string.Format("{0}-ERROR:{1}-URLDownload:{2}", this.ThreadName, _error.ToString(), url);
                    SpiderArgs args = new SpiderArgs() { Message = _errorString };
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(args);
                }

                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                var dict_post = new Dictionary<string, object>();
                dict_post.Add("source_id", this.SourceID);
                dict_post.Add("href", url);

                string content_filter = string.Empty;
                string content_khongdau = string.Empty;

                try
                {
                    // ---- video ----
                    // Best effort: any failure here (including a missing video configuration)
                    // is ignored so that field parsing still runs.
                    try
                    {
                        foreach (var itemvideo in base.ConfigVideos.configvideos)
                        {
                            // Only XPath video patterns are handled.
                            if (itemvideo.config.pattern_type != PatternType.XPATH.ToString())
                            {
                                continue;
                            }

                            foreach (var config in itemvideo.config.step)
                            {
                                string video_url = XPATHHelpers.ParsingHTML(html, config.Value.field_value);
                                int is_url_video_cache = 0;
                                int.TryParse(config.Value.is_url_video_cache, out is_url_video_cache);

                                if (string.IsNullOrEmpty(video_url))
                                {
                                    continue;
                                }

                                // Flag 1: the extracted URL is itself the cache URL; otherwise it still needs parsing.
                                bool isCacheUrl = is_url_video_cache == 1;

                                var dict_video_post = new Dictionary<string, object>();
                                dict_video_post.Add("content_id", "");
                                dict_video_post.Add("source_id", this.SourceID);
                                dict_video_post.Add("video_url", video_url);
                                dict_video_post.Add("video_url_cache", isCacheUrl ? video_url : "empty");
                                dict_video_post.Add("status", isCacheUrl ? "NEED_CACHE" : "NEED_PARSING");
                                dict_video_post.Add("video_url_path", "empty");
                                dict_video_post.Add("status_download", "PREPARING");

                                string json_video_Content = JsonConvert.SerializeObject(dict_video_post, Formatting.Indented);
                                DownloadExpress.DownloadPost(this.VIDEO_POST_URL, json_video_Content);
                            }
                        }
                    }
                    catch
                    {
                    }

                    // ---- fields ----
                    string xpathPatternName = PatternType.XPATH.ToString();
                    foreach (var itemfield in base.ConfigFields.configfields)
                    {
                        string field_name = itemfield.name;
                        // htmlParser carries the output of the previous step; content_text the last non-blank result.
                        string htmlParser = html;
                        string content_text = "";
                        bool isXPath = itemfield.config.pattern_type == xpathPatternName;

                        foreach (var config in itemfield.config.step)
                        {
                            // break_parsing: any integer other than 0 sets it; blank or non-numeric leaves it off.
                            bool break_parsing = false;
                            int breakFlag;
                            if (!string.IsNullOrWhiteSpace(config.Value.break_parsing)
                                && int.TryParse(config.Value.break_parsing, out breakFlag))
                            {
                                break_parsing = breakFlag != 0;
                            }

                            if (isXPath)
                            {
                                // A break step restarts from the full page; otherwise chain on the previous step's output.
                                string xpathInput = break_parsing ? html : htmlParser;
                                htmlParser = XPATHHelpers.ParsingHTML(xpathInput, config.Value.xpath);
                            }
                            else
                            {
                                // String-between: a break step works on what has been extracted so far
                                // (or the full page when nothing has been); otherwise chain on the previous output.
                                string betweenInput;
                                if (break_parsing)
                                {
                                    betweenInput = string.IsNullOrWhiteSpace(content_text) ? html : content_text;
                                }
                                else
                                {
                                    betweenInput = htmlParser;
                                }
                                htmlParser = StringHelpers.GetStringBetween(betweenInput, config.Value.start_pattern, config.Value.end_pattern);
                            }

                            if (!string.IsNullOrWhiteSpace(htmlParser))
                            {
                                content_text = htmlParser;
                            }

                            // remove_html == 1: keep the tag-stripped version of this step's output.
                            int removeFlag;
                            if (!string.IsNullOrWhiteSpace(config.Value.remove_html)
                                && int.TryParse(config.Value.remove_html, out removeFlag)
                                && removeFlag == 1
                                && !string.IsNullOrWhiteSpace(htmlParser))
                            {
                                try
                                {
                                    content_text = HtmlUtility.RemoveAllTag(htmlParser);
                                }
                                catch
                                {
                                    // Best effort: keep the unstripped text when tag removal fails.
                                }
                            }
                        }

                        if (string.IsNullOrWhiteSpace(content_text))
                        {
                            continue;
                        }

                        // Invariant lower-casing so the match does not depend on the thread culture.
                        string fieldKey = (field_name ?? string.Empty).ToLowerInvariant();
                        switch (fieldKey)
                        {
                            case "title":
                                dict_post.Add("title", content_text);
                                content_khongdau += StringHelpers.ConvertToKD(content_text) + "\n\n";
                                break;

                            case "tag_name":
                                // content_text is never blank here, so no default tag name is needed.
                                dict_post.Add("tag_name", content_text.Trim().Replace(" ", ""));
                                break;

                            case "published_at":
                            {
                                bool checkDateTime = false;
                                if (content_text.Contains("."))
                                {
                                    content_text = content_text.Replace(".", "/");
                                }

                                DateTime _published_date = Utility.ConvertDateTimeStringForArchived(content_text, out checkDateTime).Date;
                                if (!checkDateTime)
                                {
                                    string message = string.Format("[{0}] ERROR: {1} , {2}, {3}", ThreadName, "NOT PARSING DATATIME", content_text, url);
                                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                                    // Move on to the next field; no published_at / published_time is posted.
                                    continue;
                                }

                                dict_post.Add("published_at", string.Format("{0}-{1}-{2}", _published_date.Year, _published_date.Month, _published_date.Day));
                                content_text = content_text.Trim().Replace(" ", "");
                                dict_post.Add("published_time", content_text);
                                break;
                            }

                            case "content":
                                content_filter = HtmlUtility.RemoveAllTag(content_text);
                                content_khongdau += StringHelpers.ConvertToKD(content_text) + "\n\n";
                                dict_post.Add("content", content_text);
                                break;
                        }
                    }

                    dict_post.Add("content_filter", content_filter);
                    dict_post.Add("content_khongdau", content_khongdau);
                    dict_post.Add("full_html", WebUtility.HtmlEncode(html));

                    string jsonContent = JsonConvert.SerializeObject(dict_post, Formatting.Indented);
                    if (!string.IsNullOrWhiteSpace(jsonContent))
                    {
                        string results = string.Empty;

                        // In TEST mode nothing is sent to the API; the processing event is still raised.
                        if (this.Mode != "TEST")
                        {
                            results = DownloadExpress.DownloadPost(this.PostURL, jsonContent);
                            if (!string.IsNullOrEmpty(results))
                            {
                                try
                                {
                                    var _info = JsonConvert.DeserializeObject<Dictionary<string, string>>(results);
                                    string type = _info["type"];
                                    switch (type)
                                    {
                                        case "UPDATE_VERSION_SUCCESS":
                                        case "INSERT_SUCCESS":
                                        {
                                            string content_id = _info["content_id"];
                                            if (type == "UPDATE_VERSION_SUCCESS")
                                            {
                                                this._BaseManagement.TotalUpdateLink = this._BaseManagement.TotalUpdateLink + 1;
                                            }
                                            else
                                            {
                                                this._BaseManagement.TotalInsertLink = this._BaseManagement.TotalInsertLink + 1;
                                            }
                                            CounterManager.ResetTryingCount();

                                            // Ask the capture service to render the stored page.
                                            // NOTE(review): the service client is never closed/disposed here - confirm whether it should be.
                                            CaptureArchivied.PDFConverterServiceClient _captureService = new CaptureArchivied.PDFConverterServiceClient();
                                            CaptureArchivied.ConvertPDFRequest _pdfRequest = new CaptureArchivied.ConvertPDFRequest();
                                            _pdfRequest.contentId = content_id;
                                            _pdfRequest.pageURL = url;
                                            _pdfRequest.filterBy = this.chk_unique_css == 1 ? 0 : 1;
                                            _pdfRequest.filterText = this.filter_pdf;
                                            _pdfRequest.deleteItems = this.remove_filter_pdf;
                                            _captureService.ConvertToPDF(_pdfRequest);
                                            break;
                                        }

                                        default:
                                            CounterManager.InCreaseTryingCount();
                                            break;
                                    }
                                }
                                catch (Exception error)
                                {
                                    string message = string.Format("[{0}] ERROR: {1} , {2}, {3}", ThreadName, error.Message, DateTime.Now, url);
                                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                                    CounterManager.InCreaseTryingCount();
                                    continue;
                                }
                            }
                            else
                            {
                                // Empty response: counted as a failed attempt.
                                CounterManager.InCreaseTryingCount();
                            }
                        }

                        // The processing event is only raised for pages on which a title was parsed.
                        object parsedTitle;
                        if (dict_post.TryGetValue("title", out parsedTitle))
                        {
                            try
                            {
                                SpiderArgs args = new SpiderArgs()
                                {
                                    SourceName = this.ThreadName,
                                    Index = number,
                                    Href = url,
                                    Title = (string)parsedTitle,
                                    Message = results
                                };
                                SpiderSingletonEvent.Instance.OnSpiderProcessing(args);
                            }
                            catch
                            {
                                // A failing progress subscriber must not stop the crawl.
                            }
                        }
                    }
                }
                catch (Exception _errorParsing)
                {
                    string message = string.Format("[{0}][PROCESS-PARSING] ERROR: {1} , {2}, {3}", ThreadName, _errorParsing.Message, DateTime.Now, url);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                    CounterManager.InCreaseTryingCount();
                    continue;
                }
            }
            else
            {
                // Nothing queued: counts towards the UPDATE-mode sleep threshold.
                CounterManager.InCreaseTryingCount();
            }

            if (this.Mode == "UPDATE")
            {
                // After 2000 processed items the trying counter is forced past the threshold.
                if (number >= 2000)
                {
                    CounterManager.CurrentTryingCount = 100000;
                }

                if (CounterManager.CurrentTryingCount >= this.MaxTryingCount)
                {
                    string message = string.Format("[{0}] SLEEP FOR UPDATE AFTER {1} seconds, {2}", ThreadName, (this.UpdateSleep / 1000), DateTime.Now);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                    Thread.Sleep(this.UpdateSleep);

                    try
                    {
                        CounterManager.Reset(0, 100);
                    }
                    catch
                    {
                        // Retry the reset once; a second failure ends the loop via the outer catch.
                        CounterManager.Reset(0, 100);
                    }
                }
            }

            Thread.Sleep(this.THREAD_SLEEP);
        }
    }
    catch (Exception ex)
    {
        string message = string.Format("[{0}] ERROR OUT OF THREAD: {1} , {2}", ThreadName, ex.Message.ToString(), DateTime.Now);
        SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
    }
}
/// <summary>
/// Worker loop for detail pages. While <c>IS_STOP</c> is false it takes one link from
/// <c>QueueDetailURL</c>, fetches the page, runs every configured field through its XPath or
/// string-between steps, posts the collected fields as JSON to <c>PostURL</c> (skipped when
/// <c>Mode</c> is "TEST") and raises <c>OnSpiderProcessing</c> for pages that produced a title.
/// In "UPDATE" mode the loop pauses for <c>UpdateSleep</c> once the trying counter reaches
/// <c>MaxTryingCount</c> and then resets the counter.
/// </summary>
/// <remarks>
/// A failure on a single page is reported through <c>OnSpiderScreenConsole</c> and the loop moves
/// on to the next link. An exception that escapes the loop is reported once and ends the worker.
/// Note that every <c>continue</c> below skips both the UPDATE-mode check and the
/// <c>THREAD_SLEEP</c> pause at the bottom of the loop.
/// </remarks>
protected override void ProcessParseLink()
{
    try
    {
        while (!IS_STOP)
        {
            int number = CounterManager.GetNumber();
            _removelinks_detail = this.RemoveLinks.ToString();
            string url = string.Empty;

            if (QueueDetailURL.Count > 0)
            {
                ConcreteLink objUrl;
                lock (QueueDetailURL)
                {
                    // Re-check under the lock: the queue may have been drained since the
                    // unsynchronised Count test above.
                    if (QueueDetailURL.Count == 0)
                    {
                        continue;
                    }

                    objUrl = QueueDetailURL.Dequeue();
                }

                if (objUrl == null || string.IsNullOrEmpty(objUrl.href))
                {
                    continue;
                }

                // Relative links are resolved against the configured base URL.
                url = objUrl.href;
                if (!objUrl.href.Contains("https://") && !objUrl.href.Contains("http://"))
                {
                    url = this.BaseURL + objUrl.href;
                }

                string html = string.Empty;
                try
                {
                    html = DownloadExpress.getResponseString(url, "");
                }
                catch (Exception _error)
                {
                    string _errorString = string.Format("{0}-ERROR:{1}-URLDownload:{2}", this.ThreadName, _error.ToString(), url);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new SpiderArgs() { Message = _errorString });
                }

                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                var dict_post = new Dictionary<string, object>();
                dict_post.Add("source_id", this.SourceID);
                dict_post.Add("href", url);

                string content_filter = string.Empty;
                string content_khongdau = string.Empty;

                try
                {
                    foreach (var itemfield in base.ConfigFields.configfields)
                    {
                        string field_name = itemfield.name;
                        string htmlParser = html;
                        string content_text = "";
                        bool useXPath = itemfield.config.pattern_type == PatternType.XPATH.ToString();

                        foreach (var config in itemfield.config.step)
                        {
                            bool break_parsing = IsBreakParsingStep(config.Value.break_parsing);

                            if (useXPath)
                            {
                                // A "break" step restarts from the full page instead of the
                                // previous step's output.
                                string xpathInput = break_parsing ? html : htmlParser;
                                htmlParser = XPATHHelpers.ParsingHTML(xpathInput, config.Value.xpath);
                            }
                            else
                            {
                                // A "break" step works on the text extracted so far, or on the
                                // full page when nothing has been extracted yet.
                                string betweenInput = htmlParser;
                                if (break_parsing)
                                {
                                    betweenInput = string.IsNullOrWhiteSpace(content_text) ? html : content_text;
                                }

                                htmlParser = StringHelpers.GetStringBetween(betweenInput, config.Value.start_pattern, config.Value.end_pattern);
                            }

                            if (!string.IsNullOrWhiteSpace(htmlParser))
                            {
                                content_text = htmlParser;
                            }

                            if (IsRemoveHtmlStep(config.Value.remove_html) && !string.IsNullOrWhiteSpace(htmlParser))
                            {
                                try
                                {
                                    content_text = HtmlUtility.RemoveAllTag(htmlParser);
                                }
                                catch
                                {
                                    // Best effort: keep the un-stripped text if tag removal fails.
                                }
                            }
                        }

                        if (string.IsNullOrWhiteSpace(content_text))
                        {
                            continue;
                        }

                        if (string.Equals(field_name, "title", StringComparison.OrdinalIgnoreCase))
                        {
                            dict_post.Add("title", content_text);
                            content_khongdau += StringHelpers.ConvertToKD(content_text) + "\n\n";
                        }
                        else if (string.Equals(field_name, "tag_name", StringComparison.OrdinalIgnoreCase))
                        {
                            // NOTE(review): the original had a fallback to "Không xác định" here, but it
                            // sat inside the not-blank guard and could never run. Confirm whether blank
                            // tags should be posted with a default value.
                            dict_post.Add("tag_name", content_text.Trim().Replace(" ", ""));
                        }
                        else if (string.Equals(field_name, "published_at", StringComparison.OrdinalIgnoreCase))
                        {
                            // Example input: "Thứ hai, 15/5/2017 | 09:48 GMT+7"
                            bool checkDateTime;
                            DateTime _published_date = Utility.ConvertDateTimeStringForArchived(content_text, out checkDateTime).Date;
                            if (!checkDateTime)
                            {
                                string message = string.Format("[{0}] ERROR: {1} , {2}, {3}", ThreadName, "NOT PARSING DATATIME", content_text, url);
                                SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                                continue;
                            }

                            dict_post.Add("published_at", string.Format("{0}-{1}-{2}", _published_date.Year, _published_date.Month, _published_date.Day));
                            content_text = content_text.Trim().Replace(" ", "");
                            dict_post.Add("published_time", content_text);
                        }
                        else if (string.Equals(field_name, "content", StringComparison.OrdinalIgnoreCase))
                        {
                            content_filter = HtmlUtility.RemoveAllTag(content_text);
                            content_khongdau += StringHelpers.ConvertToKD(content_text) + "\n\n";
                            dict_post.Add("content", content_text);
                        }
                    }

                    dict_post.Add("content_filter", content_filter);
                    dict_post.Add("content_khongdau", content_khongdau);
                    dict_post.Add("full_html", html);

                    string jsonContent = JsonConvert.SerializeObject(dict_post, Formatting.Indented);
                    if (!string.IsNullOrWhiteSpace(jsonContent))
                    {
                        string results = string.Empty;
                        if (this.Mode != "TEST")
                        {
                            results = DownloadExpress.DownloadPost(this.PostURL, jsonContent);
                            if (!string.IsNullOrEmpty(results))
                            {
                                try
                                {
                                    var _info = JsonConvert.DeserializeObject<Dictionary<string, string>>(results);
                                    switch (_info["type"])
                                    {
                                        case "UPDATE_VERSION_SUCCESS":
                                            this._BaseManagement.TotalUpdateLink = this._BaseManagement.TotalUpdateLink + 1;
                                            CounterManager.ResetTryingCount();
                                            CaptureVnExpressPdf(_info["content_id"], url);
                                            break;

                                        case "INSERT_SUCCESS":
                                            string content_insert_id = _info["content_id"];
                                            this._BaseManagement.TotalInsertLink = this._BaseManagement.TotalInsertLink + 1;
                                            CounterManager.ResetTryingCount();
                                            CaptureVnExpressPdf(content_insert_id, url);
                                            break;

                                        default:
                                            CounterManager.InCreaseTryingCount();
                                            break;
                                    }
                                }
                                catch (Exception error)
                                {
                                    string message = string.Format("[{0}] ERROR: {1} , {2}, {3}", ThreadName, error.Message, DateTime.Now, url);
                                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                                    CounterManager.InCreaseTryingCount();
                                    continue;
                                }
                            }
                            else
                            {
                                // Empty response from the post endpoint.
                                CounterManager.InCreaseTryingCount();
                            }
                        }

                        // The progress event is only raised for pages that produced a title.
                        object titleValue;
                        if (dict_post.TryGetValue("title", out titleValue))
                        {
                            try
                            {
                                SpiderArgs args = new SpiderArgs()
                                {
                                    SourceName = this.ThreadName,
                                    Index = number,
                                    Href = url,
                                    Title = (string)titleValue,
                                    Message = results
                                };
                                SpiderSingletonEvent.Instance.OnSpiderProcessing(args);
                            }
                            catch
                            {
                                // Best effort: a failing event subscriber must not stop the crawl.
                            }
                        }
                    }
                }
                catch (Exception _errorParsing)
                {
                    string message = string.Format("[{0}][PROCESS-PARSING] ERROR: {1} , {2}, {3}", ThreadName, _errorParsing.Message, DateTime.Now, url);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                    CounterManager.InCreaseTryingCount();
                    continue;
                }
            }
            else
            {
                // Nothing queued counts as a failed attempt.
                CounterManager.InCreaseTryingCount();
            }

            if (this.Mode == "UPDATE")
            {
                // After 2000 processed items the counter is forced high so the pause below triggers.
                if (number >= 2000)
                {
                    CounterManager.CurrentTryingCount = 100000;
                }

                if (CounterManager.CurrentTryingCount >= this.MaxTryingCount)
                {
                    string message = string.Format("[{0}] SLEEP FOR UPDATE AFTER {1} seconds, {2}", ThreadName, (this.UpdateSleep / 1000), DateTime.Now);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                    Thread.Sleep(this.UpdateSleep);

                    try
                    {
                        CounterManager.Reset(0, 100);
                    }
                    catch
                    {
                        // Single retry, as in the original; a second failure ends the worker.
                        CounterManager.Reset(0, 100);
                    }
                }
            }

            Thread.Sleep(this.THREAD_SLEEP);
        }
    }
    catch (Exception ex)
    {
        string message = string.Format("[{0}] ERROR OUT OF THREAD: {1} , {2}", ThreadName, ex.Message, DateTime.Now);
        SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
    }
}

/// <summary>Source id for which a PDF capture is requested after a successful post (VnExpress).</summary>
private const string VnExpressSourceId = "58d0ee601d41c849a893a49d";

/// <summary>
/// Reads a step's <c>break_parsing</c> setting: true when it parses as a non-zero integer,
/// false when it is blank, zero or not a number.
/// </summary>
private static bool IsBreakParsingStep(string rawFlag)
{
    int value;
    return int.TryParse(rawFlag, out value) && value != 0;
}

/// <summary>
/// Reads a step's <c>remove_html</c> setting: true only when it parses as the integer 1.
/// </summary>
private static bool IsRemoveHtmlStep(string rawFlag)
{
    int value;
    return int.TryParse(rawFlag, out value) && value == 1;
}

/// <summary>
/// Requests a PDF capture of <paramref name="url"/> for the stored content
/// <paramref name="contentId"/>. Does nothing unless this spider's source is VnExpress.
/// </summary>
/// <param name="contentId">The <c>content_id</c> returned by the post endpoint.</param>
/// <param name="url">Absolute URL of the page that was posted.</param>
private void CaptureVnExpressPdf(string contentId, string url)
{
    if (this.SourceID == VnExpressSourceId)
    {
        // NOTE(review): the client is never closed, as in the original; if this is a WCF
        // proxy it probably should be — confirm.
        CaptureArchivied.PDFConverterServiceClient _captureService = new CaptureArchivied.PDFConverterServiceClient();

        CaptureArchivied.ConvertPDFRequest _pdfRequest = new CaptureArchivied.ConvertPDFRequest();
        _pdfRequest.contentId = contentId;
        _pdfRequest.pageURL = url;
        _pdfRequest.filterBy = 1;
        // The "sohoa" sub-site uses a different container for the article body.
        _pdfRequest.filterText = url.Contains("sohoa.vnexpress.net") ? "main_content_detail" : "block_col_480";
        _pdfRequest.deleteItems = "#box_tinlienquan,#box_comment,.banner_468,#social_like,.xemthem_new_ver,.banner_468x90,.block_goithutoasoan";

        _captureService.ConvertToPDF(_pdfRequest);
    }
}
/// <summary>
/// Worker loop for detail pages (string-between extraction only). While <c>IS_STOP</c> is false
/// it takes one link from <c>QueueDetailURL</c>, downloads the page (falling back to the socket
/// downloader when the first attempt throws), extracts every configured field by applying its
/// start/end pattern steps in sequence, and posts the result as JSON to <c>PostURL</c>.
/// </summary>
/// <remarks>
/// The posted document holds <c>source_id</c>, <c>href</c>, an optional <c>tag_name</c> and a
/// <c>data</c> entry containing the JSON-serialised field map plus the raw page as
/// <c>html_data</c>. A failure while parsing or posting one page is reported through
/// <c>OnSpiderScreenConsole</c> and the loop carries on; an exception escaping the loop is
/// reported once and ends the worker.
/// </remarks>
protected override void ProcessParseLink()
{
    try
    {
        while (!IS_STOP)
        {
            int number = CounterManager.GetNumber();
            string url = string.Empty;

            if (QueueDetailURL.Count > 0)
            {
                ConcreteLink objUrl;
                lock (QueueDetailURL)
                {
                    // Re-check under the lock: the queue may have been drained since the
                    // unsynchronised Count test above.
                    if (QueueDetailURL.Count == 0)
                    {
                        continue;
                    }

                    objUrl = QueueDetailURL.Dequeue();
                }

                if (objUrl == null || string.IsNullOrEmpty(objUrl.href))
                {
                    continue;
                }

                // Relative links are resolved against the configured base URL.
                url = objUrl.href;
                if (!objUrl.href.Contains("https://") && !objUrl.href.Contains("http://"))
                {
                    url = this.BaseURL + objUrl.href;
                }

                string html = string.Empty;
                try
                {
                    html = DownloadExpress.Download(url, Utility.GetEncoding(EncodingType.UTF8));
                }
                catch
                {
                    try
                    {
                        html = DownloadExpress.DownloadBySocket(url, Utility.GetEncoding(EncodingType.UTF8));
                    }
                    catch
                    {
                        continue;
                    }
                }

                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                try
                {
                    var dict_post = new Dictionary<string, object>();
                    dict_post.Add("source_id", this.SourceID);
                    dict_post.Add("href", url);

                    Dictionary<string, string> _dataString = new Dictionary<string, string>();

                    foreach (var itemfield in base.ConfigFields.configfields)
                    {
                        string field_name = itemfield.name;
                        string content_text = html;
                        bool hasStep = false;

                        // Each step narrows the text produced by the previous one.
                        foreach (var config in itemfield.config.step)
                        {
                            content_text = StringHelpers.GetStringBetween(content_text, config.Value.start_pattern, config.Value.end_pattern);
                            hasStep = true;
                        }

                        // A field without steps yields no text (not the whole page).
                        if (!hasStep || string.IsNullOrWhiteSpace(content_text))
                        {
                            continue;
                        }

                        if (string.Equals(field_name, "title", StringComparison.OrdinalIgnoreCase))
                        {
                            SpiderArgs args = new SpiderArgs()
                            {
                                SourceName = this.ThreadName,
                                Index = number,
                                Href = url,
                                Title = content_text,
                                Message = string.Empty
                            };
                            SpiderSingletonEvent.Instance.OnSpiderProcessing(args);
                        }

                        if (string.Equals(field_name, "tag_name", StringComparison.OrdinalIgnoreCase))
                        {
                            dict_post.Add("tag_name", content_text);
                        }

                        _dataString.Add(field_name, content_text);
                    }

                    _dataString.Add("html_data", html);

                    // The field map is embedded as a JSON string inside the posted document.
                    string jsonContent = JsonConvert.SerializeObject(_dataString, Formatting.Indented);
                    dict_post.Add("data", jsonContent);

                    jsonContent = JsonConvert.SerializeObject(dict_post, Formatting.Indented);
                    if (!string.IsNullOrWhiteSpace(jsonContent))
                    {
                        DownloadExpress.DownloadPost(this.PostURL, jsonContent);
                    }
                }
                catch (Exception parseError)
                {
                    // One bad page (duplicate field name, failed post, ...) must not end the worker.
                    string parseMessage = string.Format("[{0}][PROCESS-PARSING] ERROR: {1} , {2}, {3}", ThreadName, parseError.Message, DateTime.Now, url);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new SpiderArgs() { Message = parseMessage });
                }
            }

            Thread.Sleep(this.THREAD_SLEEP);
        }
    }
    catch (Exception ex)
    {
        // Previously swallowed silently; report it so a dead worker is visible.
        string message = string.Format("[{0}] ERROR OUT OF THREAD: {1} , {2}", ThreadName, ex.Message, DateTime.Now);
        SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new SpiderArgs() { Message = message });
    }
}
/// <summary>
/// Worker loop that discovers links. While <c>IS_STOP</c> is false it takes one link from
/// <c>FrontierURL</c>, downloads the page (falling back to the socket downloader when the first
/// attempt throws) and applies every "REGEX" entry of <c>ConfigLinks.configlinks</c> to it.
/// New "SUB_LINK" matches are queued back onto <c>FrontierURL</c>; new "DETAIL_LINK" matches are
/// queued onto <c>QueueDetailURL</c>. Each queued link is announced through
/// <c>OnSpiderInformation</c>.
/// </summary>
/// <remarks>
/// Links whose checked URL contains ".html#" or ".htm#" are ignored. An exception escaping the
/// loop is reported through <c>OnSpiderScreenConsole</c> and ends the worker.
/// </remarks>
protected override void ProcessFindLink()
{
    try
    {
        while (!IS_STOP)
        {
            // The returned number is not used here; the call is kept because it presumably
            // advances the shared counter — TODO confirm.
            CounterManager.GetNumber();

            if (FrontierURL.Count > 0)
            {
                ConcreteLink objUrl = FrontierURL.Dequeue();
                if (string.IsNullOrEmpty(objUrl.href))
                {
                    continue;
                }

                // Relative links are resolved against the configured base URL.
                string url = objUrl.href;
                if (!objUrl.href.Contains("https://") && !objUrl.href.Contains("http://"))
                {
                    url = this.BaseURL + objUrl.href;
                }

                string html = string.Empty;
                try
                {
                    html = DownloadExpress.Download(url, Utility.GetEncoding(EncodingType.UTF8));
                }
                catch
                {
                    try
                    {
                        html = DownloadExpress.DownloadBySocket(url, Utility.GetEncoding(EncodingType.UTF8));
                    }
                    catch
                    {
                        continue;
                    }
                }

                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                foreach (var item in base.ConfigLinks.configlinks)
                {
                    // Only regex patterns are supported.
                    if (item.pattern_type != "REGEX")
                    {
                        continue;
                    }

                    FindLinkBase findLink = new FindLinkByRegex(this.BaseURL, item.url_pattern, _removelinks);

                    bool isSubLink = item.link_type == "SUB_LINK";
                    bool isDetailLink = !isSubLink && item.link_type == "DETAIL_LINK";
                    if (!isSubLink && !isDetailLink)
                    {
                        continue;
                    }

                    List<string> foundLinks = findLink.FindLink(html);
                    if (foundLinks == null || foundLinks.Count == 0)
                    {
                        continue;
                    }

                    foreach (string foundLink in foundLinks)
                    {
                        // IsNewUrl may rewrite the URL it is given; the queued href keeps the
                        // link exactly as it was found.
                        string checkedUrl = foundLink;
                        if (!IsNewUrl(ref checkedUrl))
                        {
                            continue;
                        }

                        if (checkedUrl.Contains(".html#") || checkedUrl.Contains(".htm#"))
                        {
                            continue;
                        }

                        ConcreteLink _concreteLink = new ConcreteLink();
                        _concreteLink.href = foundLink;

                        string announcement;
                        if (isSubLink)
                        {
                            _concreteLink.link_type = "SUB_LINK";
                            FrontierURL.Enqueue(_concreteLink);
                            announcement = foundLink + "\r\n";
                        }
                        else
                        {
                            _concreteLink.link_type = "DETAIL_LINK";
                            // The parsing worker dequeues under this same lock.
                            lock (QueueDetailURL)
                            {
                                QueueDetailURL.Enqueue(_concreteLink);
                            }

                            announcement = checkedUrl + " - DETAIL" + "\r\n";
                        }

                        SpiderSingletonEvent.Instance.OnSpiderInformation(new SpiderArgs() { Message = announcement });
                    }
                }
            }

            Thread.Sleep(this.THREAD_SLEEP);
        }
    }
    catch (Exception ex)
    {
        // Previously swallowed silently; report it so a dead worker is visible.
        string message = string.Format("[{0}] ERROR OUT OF THREAD: {1} , {2}", ThreadName, ex.Message, DateTime.Now);
        SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new SpiderArgs() { Message = message });
    }
}