/// <summary>
/// Collects image URLs from <paramref name="document"/> using every XPath in
/// <c>configXPath.ImageUrlsXPaths</c>.
/// </summary>
/// <param name="uri">Page address; passed to <c>Common.GetAbsoluteUrl</c> for each extracted value and used in the error log.</param>
/// <param name="document">Parsed HTML document to query.</param>
/// <param name="configXPath">Configuration that supplies the image XPaths and the ID used for error logging.</param>
/// <returns>
/// The list of values produced by <c>Common.GetAbsoluteUrl</c> (empty when no XPath is configured
/// or nothing is found), or <c>null</c> when an exception was caught and logged.
/// </returns>
public List<string> GetImage(Uri uri, HtmlDocument document, ConfigXPaths configXPath)
{
    try
    {
        var imageUrls = new List<string>();

        List<string> xPaths = configXPath.ImageUrlsXPaths;
        if (xPaths == null || xPaths.Count == 0)
        {
            return imageUrls;
        }

        foreach (string xPath in xPaths)
        {
            List<string> rawUrls = Common.GetTextInNode(document, xPath, configXPath);
            if (rawUrls == null)
            {
                // Nothing extracted for this XPath; try the next one.
                continue;
            }

            foreach (string rawUrl in rawUrls)
            {
                imageUrls.Add(Common.GetAbsoluteUrl(rawUrl, uri));
            }
        }

        return imageUrls;
    }
    catch (Exception ex)
    {
        logErrorDb.SaveError(LogErrorToDb.errorParseImage, uri.AbsoluteUri, configXPath.ID, ex.Message);
        return null;
    }
}
/// <summary>
/// Maps a website category string to a category number using the rules in
/// <c>config.RegexStringToCategory</c>.
/// Each rule has the form <c>Regex:Int</c>, for example <c>.*ô tô.*:9</c>.
/// Rules are tried in list order; the first rule whose regex matches
/// <paramref name="WebCategory"/> and whose number parses as an integer wins.
/// </summary>
/// <param name="url">Page address; only used when logging an error.</param>
/// <param name="config">Configuration that supplies the rule list and the ID used for error logging.</param>
/// <param name="WebCategory">Category text that the rule regexes are matched against.</param>
/// <param name="defaultReturn">Value returned when no rule applies or an error occurs.</param>
/// <returns>The number of the first matching rule, otherwise <paramref name="defaultReturn"/>.</returns>
public int RecognizeCategory(string url, ConfigXPaths config, string WebCategory, int defaultReturn = 0)
{
    try
    {
        List<string> rules = config.RegexStringToCategory;
        if (rules == null || rules.Count == 0)
        {
            return defaultReturn;
        }

        foreach (string rule in rules)
        {
            // Split on the LAST ':' so that a regex which itself contains ':'
            // (for example a non-capturing group "(?:...)") is kept intact.
            int separatorIndex = rule.LastIndexOf(':');
            if (separatorIndex < 0)
            {
                continue;
            }

            string pattern = rule.Substring(0, separatorIndex);
            string categoryText = rule.Substring(separatorIndex + 1);

            int categoryId;
            if (Regex.IsMatch(WebCategory, pattern) && Int32.TryParse(categoryText, out categoryId))
            {
                return categoryId;
            }
        }

        return defaultReturn;
    }
    catch (Exception ex)
    {
        this.logErrorDb.SaveError(LogErrorToDb.errorRegonizeCategory, url, config.ID, ex.Message);

        // Honour the caller's fallback on failure as well, instead of a hard-coded 0.
        return defaultReturn;
    }
}
/// <summary>
/// Handles the "test product" button: downloads the page at the URL in <c>urlTestTextBox</c>,
/// analyses it with the configuration currently entered in the form, and shows the extracted
/// product in a <c>FrmDataShow</c> dialog. After a confirmation prompt, the dialog's save button
/// updates or inserts the product and stores the downloaded HTML through <c>mongoDbAdapter</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnTestProduct_Click(object sender, EventArgs e)
{
    string urlTest = urlTestTextBox.Text;
    if (string.IsNullOrWhiteSpace(urlTest))
    {
        MessageBox.Show("Not url test");
        return;
    }

    // Download the page once. The previous code fetched it twice and threw the first copy away.
    WebExceptionStatus status = WebExceptionStatus.Success;
    string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(urlTest, 45, 2, out status);

    // The parsed document is not read below; the load is kept so that unusable HTML
    // still surfaces at this point exactly as before.
    GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(html);

    ConfigXPaths config = this.raovatSqlAdapter.GetConfigByID((int)this.configXPathIDSpinEdit.Value);
    if (config == null)
    {
        // No stored configuration: test with a throw-away one.
        config = new ConfigXPaths() { ID = -1 };
    }

    if (!this.LoadFormToConfig(ref config))
    {
        return;
    }

    var product = new ProductSaleNew();
    this.hanlerContentOfHtml.AnalyticsProductSaleNew(
        config.domain,
        urlTest,
        config,
        product,
        this.raovatSqlAdapter.GetDicMapClassificationAndCategories(config.website_id),
        this.raovatSqlAdapter.GetDicCityAndRegex());

    FrmDataShow frmDataShow = new FrmDataShow(product.ToString());
    frmDataShow.btnSave.Click += new EventHandler(delegate(object obj, EventArgs eventArg)
    {
        // NOTE(review): the prompt mentions Cassandra while the save goes through mongoDbAdapter — confirm the wording.
        if (MessageBox.Show("Save to Cassandra?", "Warning", MessageBoxButtons.YesNo, MessageBoxIcon.Warning) != System.Windows.Forms.DialogResult.Yes)
        {
            return;
        }

        bool bExits = this.mongoDbAdapter.CheckExistsProductSalenew(product.id);
        if (bExits)
        {
            mongoDbAdapter.UpdateProduct(product);
        }
        else
        {
            mongoDbAdapter.InsertProduct(product);
        }

        mongoDbAdapter.SaveHtml(product.id, html, bExits);
    });
    frmDataShow.ShowDialog();
}
/// <summary>
/// Creates a crawler for one configuration: opens the SQL database, builds the added-link set
/// and the waiting queue, loads the configuration, and loads the lookup dictionaries.
/// </summary>
/// <param name="ConfigID">ID of the configuration to load through <c>RaoVatSQLAdapter.GetConfigByID</c>.</param>
/// <param name="TypeCrawler">Crawl mode; the original note lists 0 = Full and 1 = RealTime.</param>
public SimpleCrawlerRaoVat(int ConfigID, ETypeCrawlRaoVat TypeCrawler)
{
    sqlDb = new SqlDb(QT.Entities.Server.ConnectionString);
    iTypeCrawler = TypeCrawler;

    // Both queue structures take the crawl mode as an integer.
    int crawlModeCode = Convert.ToInt32(iTypeCrawler);
    setAddedQueue = new SetCrawlerRaoVat(sqlDb, ConfigID, crawlModeCode);
    queueWaitRun = new QueueCrawlerRaoVat(sqlDb, ConfigID, crawlModeCode);

    sqlRaoVatAdapter = new RaoVatSQLAdapter(sqlDb);
    configXPath = sqlRaoVatAdapter.GetConfigByID(ConfigID);
    Domain = configXPath.domain;

    mongoDbAdapter = new MongoDbRaoVat();

    dicMapClassificationAndCategories = sqlRaoVatAdapter.GetDicMapClassificationAndCategories(configXPath.website_id);
    dicCityAndRegex = sqlRaoVatAdapter.GetDicCityAndRegex();
}
/// <summary>
/// Loads one crawl configuration from the <c>Config_Crawl</c> table.
/// </summary>
/// <param name="ConfigID">Value matched against the <c>ID</c> column.</param>
/// <returns>
/// A <c>ConfigXPaths</c> filled from the first row returned, or <c>null</c> when the query
/// returns no rows.
/// </returns>
public ConfigXPaths GetConfigByID(int ConfigID)
{
    // ConfigID is an int, so formatting it into the statement cannot inject CQL.
    RowSet rows = this._session.Execute(string.Format("SELECT * FROM Config_Crawl WHERE ID = {0}", ConfigID), 1);

    // The result used to be accepted only when exactly one host had been tried, which
    // discarded a valid answer whenever the driver needed more than one host.
    // NOTE(review): relies on TriedHosts listing every host queried before a valid response — confirm for the driver version in use.
    foreach (Row row in rows)
    {
        ConfigXPaths config = new ConfigXPaths();
        config.ID = Convert.ToInt32(row.GetValue(typeof(int), CassandraColumn.id));

        // XPath lists.
        config.AddressXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.address_xpaths));
        config.AvaiableXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.avaiable_xpaths));
        config.CategoryXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.category_xpaths));
        config.ContentXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.content_xpaths));
        config.ImageUrlsXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.image_url_xpaths));
        config.ProvinceXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.province_xpaths));
        config.QualityXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.quality_xpaths));
        config.ThumbUrlXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.thumb_url_xpaths));
        config.TitleXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.title_xpaths));
        config.WebCategoryXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.web_category_xpaths));
        config.PostDateXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.post_date_xpaths));
        config.LastChangeXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.last_change_xpaths));
        config.PhoneSalerXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.phone_saler_xpaths));
        config.PriceXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.price_xpaths));
        config.UserNameXPaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.saler_name_xpaths));

        // Scalar settings.
        config.UrlTest = row.GetValue<string>(CassandraColumn.url_test);
        config.TimeDelay = Common.Obj2Int(row.GetValue(typeof(string), CassandraColumn.time_delay));
        config.ItemReCrawler = Common.Obj2Int(row.GetValue(typeof(string), CassandraColumn.item_recrawl));
        config.CategoryID = Common.Obj2Int(row.GetValue(typeof(string), CassandraColumn.category_id));

        // URL filter regexes.
        config.NoVisitUrlRegex = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.novisit_url_regex));
        config.NoProductUrlRegex = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.noproduct_url_regex));
        config.ProductUrlsRegex = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.product_url_regex));
        config.VisitUrlsRegex = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.visit_url_regex));

        config.tags_xpaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.tags_xpaths));

        // extend_xpaths is split on newlines only, into at most 200 non-empty entries.
        string extendXPathsText = row.GetValue<string>(CassandraColumn.extend_xpaths);
        if (!string.IsNullOrEmpty(extendXPathsText))
        {
            config.extend_xpaths = extendXPathsText.Split(new char[] { '\n' }, 200, StringSplitOptions.RemoveEmptyEntries).ToList();
        }

        config.spe_car_carmaker_xpaths = Common.GetListXPathFromString(row.GetValue<string>(CassandraColumn.spe_car_carmaker_xpaths));

        // Only the first row is used.
        return config;
    }

    return null;
}
/// <summary>
/// Extracts a location from the document using the XPaths in <c>configXPath.XPaths08</c>.
/// The first XPath that yields non-empty text decides the result: the part after the last ':'
/// is returned, or an empty string when that text contains no ':'.
/// </summary>
/// <param name="document">Parsed HTML document to query.</param>
/// <param name="configXPath">Configuration that supplies the XPaths.</param>
/// <returns>The text after the last ':' of the first non-empty match, otherwise an empty string.</returns>
private string GetLocation(HtmlDocument document, ConfigXPaths configXPath)
{
    List<string> xPaths = configXPath.XPaths08;
    if (xPaths == null || xPaths.Count == 0)
    {
        return "";
    }

    foreach (string xPath in xPaths)
    {
        var node = document.DocumentNode.SelectSingleNode(xPath);
        if (node == null)
        {
            // No match for this XPath; previously this dereferenced null. Try the next one.
            continue;
        }

        string location = System.Text.RegularExpressions.Regex.Replace(node.InnerText.Trim(), "\r|\n|\t", "");
        if (string.IsNullOrEmpty(location))
        {
            continue;
        }

        int lastColon = location.LastIndexOf(':');
        return lastColon >= 0 ? location.Substring(lastColon + 1) : "";
    }

    return "";
}
/// <summary>
/// Saves the configuration entered in the form. When a configuration with the selected ID
/// already exists, the user is asked to confirm the overwrite first. After a successful
/// update the configuration is read back and reloaded into the form.
/// Any exception is shown to the user in a message box.
/// </summary>
private void SaveData()
{
    try
    {
        int configId = (int)configXPathIDSpinEdit.Value;

        ConfigXPaths config = raovatSqlAdapter.GetConfigByID(configId);
        bool alreadyStored = config != null;
        if (!alreadyStored)
        {
            config = new ConfigXPaths() { ID = configId };
        }

        if (!LoadFormToConfig(ref config))
        {
            return;
        }

        if (alreadyStored)
        {
            // Overwriting needs explicit confirmation; "No" is the default button.
            var answer = MessageBox.Show("Đã tồn tại, muốn ghi đè không", "Warning", MessageBoxButtons.YesNo, MessageBoxIcon.Warning, MessageBoxDefaultButton.Button2);
            if (answer != System.Windows.Forms.DialogResult.Yes)
            {
                return;
            }
        }

        if (!raovatSqlAdapter.UpdateConfigXPath(config))
        {
            return;
        }

        // Read the stored configuration back and show it in the form.
        // (A disabled step that pushed the last-update time to Redis used to follow here.)
        config = raovatSqlAdapter.GetConfigByID(config.ID);
        LoadConfigToForm(config);
        MessageBox.Show("Updated!");
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message);
    }
}
/// <summary>
/// Builds a normalised category path from the nodes matched by the first XPath in
/// <c>config.WebCategoryXPaths</c>. Node texts are cleaned and joined with '&gt;'; the result
/// has '/' replaced by '&gt;', is lower-cased, passed through <c>Common.ChuanHoaUnicode</c>,
/// and has single whitespace characters next to '&gt;' removed.
/// </summary>
/// <param name="url">Page address; only used when logging an error.</param>
/// <param name="document">Parsed HTML document to query.</param>
/// <param name="config">Configuration that supplies the XPaths and the ID used for error logging.</param>
/// <returns>The category path, or an empty string when nothing is configured, nothing matches, or an error was logged.</returns>
public string GetWebCategory(string url, HtmlDocument document, ConfigXPaths config)
{
    try
    {
        var xpaths = config.WebCategoryXPaths;
        if (xpaths == null || xpaths.Count == 0)
        {
            return "";
        }

        // Only the first configured XPath is consulted.
        var nodes = document.DocumentNode.SelectNodes(xpaths[0]);
        if (nodes == null)
        {
            return "";
        }

        string path = "";
        foreach (var node in nodes)
        {
            string segment = Common.ChangeSpaceCharacter(node.InnerText);
            segment = Common.RemoveDumplicateSpace(segment);
            segment = segment.Trim();

            // A separator is added only once the path already has content.
            if (!string.IsNullOrEmpty(path))
            {
                path = path + ">";
            }

            path = path + segment;
        }

        path = Common.RemoveDumplicateSpace(path);
        path = path.Replace("/", ">");
        path = path.Trim();
        path = path.ToLower();
        path = Common.ChuanHoaUnicode(path);
        path = Regex.Replace(path, @"\s>", ">");
        return Regex.Replace(path, @">\s", ">");
    }
    catch (Exception ex)
    {
        this.logErrorDb.SaveError(LogErrorToDb.errorParseWebCategory, url, config.ID, ex.Message);
        return "";
    }
}
/// <summary>
/// Click handler that stores the configuration entered in the form under the ID selected in
/// <c>configXPathIDSpinEdit</c>, without asking before overwriting an existing one.
/// Any exception is shown to the user in a message box, as <c>SaveData</c> does.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button6_Click_1(object sender, EventArgs e)
{
    try
    {
        int configId = (int)this.configXPathIDSpinEdit.Value;

        ConfigXPaths config = this.raovatSqlAdapter.GetConfigByID(configId);
        if (config == null)
        {
            // Nothing stored yet for this ID: start from a fresh configuration.
            config = new ConfigXPaths() { ID = configId };
        }

        if (this.LoadFormToConfig(ref config) && this.raovatSqlAdapter.UpdateConfigXPath(config))
        {
            MessageBox.Show("Updated!");
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message);
    }
}
/// <summary>
/// Reads the category trail from the node at <c>//nav/fieldset//span</c>, strips line breaks
/// and tabs, and rewrites every '&gt;' as "-&gt;". A leading "-&gt;" is removed; otherwise a
/// trailing "-&gt;" is removed.
/// </summary>
/// <param name="document">Parsed HTML document to query.</param>
/// <param name="configXPath">Configuration; extraction only runs when <c>XPaths52</c> is non-empty.</param>
/// <returns>The rewritten trail, or an empty string when <c>XPaths52</c> is empty or the node is missing.</returns>
private string GetCategory(HtmlDocument document, ConfigXPaths configXPath)
{
    // NOTE(review): XPaths52 is only checked for presence; the query below is hard-coded.
    if (configXPath.XPaths52 == null || configXPath.XPaths52.Count == 0)
    {
        return "";
    }

    var trailNode = document.DocumentNode.SelectSingleNode("//nav/fieldset//span");
    if (trailNode == null)
    {
        return "";
    }

    string singleLine = Regex.Replace(trailNode.InnerText, "\n|\r|\t", "");
    string trail = Regex.Replace(singleLine, ">", "->");

    // Only one end is trimmed: the leading arrow takes precedence over the trailing one.
    if (trail.StartsWith("->"))
    {
        return trail.Remove(0, 2);
    }

    if (trail.EndsWith("->"))
    {
        return trail.Remove(trail.Length - 2, 2);
    }

    return trail;
}
/// <summary>
/// Returns the title from the first XPath in <c>configXPath.TitleXPaths</c> whose first matched
/// node has text. Line breaks and tabs become spaces, then the text is passed through
/// <c>Common.RemoveDumplicateSpace</c> and trimmed.
/// </summary>
/// <param name="url">Page address; only used when logging an error.</param>
/// <param name="document">Parsed HTML document to query.</param>
/// <param name="configXPath">Configuration that supplies the XPaths and the ID used for error logging.</param>
/// <returns>The cleaned title, or an empty string when nothing matches or an error was logged.</returns>
private string GetTitleSaleNew(string url, HtmlDocument document, ConfigXPaths configXPath)
{
    try
    {
        foreach (var xPath in configXPath.TitleXPaths)
        {
            HtmlNodeCollection matches = document.DocumentNode.SelectNodes(xPath);
            if (matches == null || matches.Count == 0)
            {
                continue;
            }

            string rawTitle = matches[0].InnerText;
            if (rawTitle == "")
            {
                continue;
            }

            string singleLine = Regex.Replace(rawTitle, "\n|\t|\r", " ");
            return Common.RemoveDumplicateSpace(singleLine).Trim();
        }

        return "";
    }
    catch (Exception ex)
    {
        this.logErrorDb.SaveError(LogErrorToDb.errorParseTitleProduct, url, configXPath.ID, ex.Message);
        return "";
    }
}
/// <summary>
/// Concatenates the text extracted for every XPath in <c>configXPath.ContentXPaths</c> and
/// passes the result through <c>Common.ChuanHoaUnicode</c>.
/// </summary>
/// <param name="url">Page address; only used when logging an error.</param>
/// <param name="document">Parsed HTML document to query.</param>
/// <param name="configXPath">Configuration that supplies the XPaths and the ID used for error logging.</param>
/// <returns>The processed content, or an empty string when an error was logged.</returns>
public string GetContent(string url, HtmlDocument document, ConfigXPaths configXPath)
{
    try
    {
        string content = "";

        List<string> xPaths = configXPath.ContentXPaths;
        if (xPaths != null)
        {
            foreach (string xPath in xPaths)
            {
                // The fragments of one XPath are joined with a space; the results of
                // different XPaths are appended with no separator.
                List<string> fragments = Common.GetTextInNode(document, xPath, configXPath);
                content = content + Common.ConvertToString(fragments, " ");
            }
        }

        return Common.ChuanHoaUnicode(content);
    }
    catch (Exception ex)
    {
        logErrorDb.SaveError(LogErrorToDb.errorParseContent, url, configXPath.ID, ex.Message);
        return "";
    }
}
/// <summary>
/// Re-analyses one stored product. On success the product is updated and its document is
/// flagged <c>processed</c>; otherwise the failure counter is increased and the status,
/// counter, <c>is_solr_updated</c> flag and <c>updated_at</c> date are written back.
/// </summary>
/// <param name="objSender">Caller-supplied sender (unused).</param>
/// <param name="productNeedProcess">Stored document; the fields "source_id", "url", "_id", "status" and, if present, "fail_count" are read.</param>
private void ProcessData_ReloadAllField(object objSender, BsonDocument productNeedProcess)
{
    ProductSaleNew reloaded = new ProductSaleNew();
    ConfigXPaths configXPath = this.dicConfigXPath[productNeedProcess["source_id"].AsInt32];

    // The analysis fills 'reloaded'; its integer result is not used.
    handerContent.AnalyticsProductSaleNew(configXPath.domain, productNeedProcess["url"].AsString, configXPath, reloaded, null, null);

    if (reloaded.IsDetailSucess)
    {
        // Update the product.
        mon.UpdateProduct(reloaded);

        // The returned task is not awaited.
        var successFilter = Builders<BsonDocument>.Filter.Eq("_id", productNeedProcess["_id"].AsObjectId);
        var markProcessed = Builders<BsonDocument>.Update.Set("processed", true);
        mon.colProduct.UpdateOneAsync(successFilter, markProcessed);
        return;
    }

    int status = productNeedProcess["status"].AsInt32;

    int failCount = 0;
    if (productNeedProcess.Contains("fail_count"))
    {
        failCount = productNeedProcess["fail_count"].AsInt32;
    }

    failCount++;

    // A product in status 1 moves to status 2 once a failure has been counted.
    if (failCount >= 1 && status == 1)
    {
        status = 2;
    }

    // The returned task is not awaited.
    var failureFilter = Builders<BsonDocument>.Filter.Eq("_id", productNeedProcess["_id"].AsObjectId);
    var recordFailure = Builders<BsonDocument>.Update
        .Set("status", status)
        .Set("fail_count", failCount)
        .Set("is_solr_updated", false)
        .CurrentDate("updated_at");
    mon.colProduct.UpdateOneAsync(failureFilter, recordFailure);
}
/// <summary>
/// Returns the first positive price found through the XPaths in <c>configXPath.XPaths04</c>.
/// </summary>
/// <param name="document">Parsed HTML document to query.</param>
/// <param name="configXPath">Configuration that supplies the XPaths.</param>
/// <returns>The first value greater than zero returned by <c>Common.ParsePrice</c>, otherwise 0.</returns>
private decimal GetPriceSaleNew(HtmlDocument document, ConfigXPaths configXPath)
{
    if (configXPath.XPaths04 == null || configXPath.XPaths04.Count == 0)
    {
        return 0;
    }

    foreach (string xPath in configXPath.XPaths04)
    {
        var priceNode = document.DocumentNode.SelectSingleNode(xPath);
        if (priceNode == null)
        {
            continue;
        }

        string priceText = priceNode.InnerText.Trim();
        if (string.IsNullOrWhiteSpace(priceText))
        {
            continue;
        }

        decimal parsed = Common.ParsePrice(priceText, true);
        if (parsed > 0)
        {
            return parsed;
        }
    }

    return 0;
}
/// <summary>
/// Builds a comma-separated tag string from the text extracted for every XPath in
/// <c>configXPath.tags_xpaths</c>. Each value is passed through
/// <c>Common.RemoveDumplicateSpace</c>, lower-cased and trimmed.
/// </summary>
/// <param name="document">Parsed HTML document to query.</param>
/// <param name="configXPath">Configuration that supplies the tag XPaths.</param>
/// <returns>The tags joined with ',', or an empty string when nothing is configured or found.</returns>
public string GetTagsString(HtmlDocument document, ConfigXPaths configXPath)
{
    string tags = "";

    List<string> xPaths = configXPath.tags_xpaths;
    if (xPaths == null || xPaths.Count == 0)
    {
        return tags;
    }

    foreach (var xPath in xPaths)
    {
        List<string> values = Common.GetTextInNode(document, xPath, configXPath);
        if (values == null)
        {
            // GetImage also guards this helper's result against null. The old code read
            // values.Count before its (inverted) null test and would have thrown here.
            continue;
        }

        foreach (string value in values)
        {
            string tag = Common.RemoveDumplicateSpace(value);
            tag = tag.ToLower();
            tag = tag.Trim();

            // A separator is added only once the string already has content.
            tags = tags + (string.IsNullOrEmpty(tags) ? "" : ",") + tag;
        }
    }

    return tags;
}
/// <summary>
/// Returns the first value extracted by the first XPath in <c>configXPath.PhoneSalerXPaths</c>
/// that yields any text.
/// </summary>
/// <param name="document">Parsed HTML document to query.</param>
/// <param name="configXPath">Configuration that supplies the XPaths.</param>
/// <returns>The extracted value, or an empty string when nothing is found or an exception was logged.</returns>
private string GetPhoneSaler(HtmlDocument document, ConfigXPaths configXPath)
{
    try
    {
        List<string> xPaths = configXPath.PhoneSalerXPaths;
        if (xPaths == null || xPaths.Count == 0)
        {
            return "";
        }

        foreach (string xPath in xPaths)
        {
            List<string> phones = Common.GetTextInNode(document, xPath, configXPath);
            if (phones == null || phones.Count == 0)
            {
                continue;
            }

            return phones[0];
        }

        return "";
    }
    catch (Exception ex)
    {
        log.ErrorFormat("Exception:{0}", ex.Message);
        return "";
    }
}
/// <summary>
/// Returns the province text from the first XPath in <c>configXPath.ProvinceXPaths</c> that
/// yields a non-empty value after <c>Common.ChuanHoaUnicode</c>.
/// </summary>
/// <param name="document">Parsed HTML document to query.</param>
/// <param name="configXPath">Configuration that supplies the XPaths.</param>
/// <returns>The processed text, or an empty string when nothing is found or an exception was logged.</returns>
private string GetProvince(HtmlDocument document, ConfigXPaths configXPath)
{
    try
    {
        List<string> xPaths = configXPath.ProvinceXPaths;
        if (xPaths == null || xPaths.Count == 0)
        {
            return "";
        }

        foreach (string xPath in xPaths)
        {
            // Evaluate the current XPath. The old loop always re-evaluated the first entry,
            // so the fallback XPaths were never tried.
            string province = Common.ChuanHoaUnicode(Common.GetTextOfXPath(document, xPath));
            if (!string.IsNullOrEmpty(province))
            {
                return province;
            }
        }

        return "";
    }
    catch (Exception ex)
    {
        log.ErrorFormat("Exception:{0}", ex.Message);
        return "";
    }
}
/// <summary>
/// Extracts the quality/condition text using <c>configXPath.QualityXPaths</c>.
/// For configuration ID 1 the text of the node at the first XPath is split into cells and the
/// cell following the one containing "Tình trạng:" is returned; for every other configuration
/// the result of <c>GetDataFromNodes</c> is returned with line breaks and tabs removed.
/// </summary>
/// <param name="document">Parsed HTML document to query.</param>
/// <param name="configXPath">Configuration that supplies the XPaths and the ID.</param>
/// <returns>The extracted text, or an empty string when nothing is found or an exception was logged.</returns>
private string GetQuality(HtmlDocument document, ConfigXPaths configXPath)
{
    try
    {
        List<string> xPaths = configXPath.QualityXPaths;
        if (xPaths == null || xPaths.Count == 0)
        {
            return "";
        }

        if (configXPath.ID != 1)
        {
            string raw = this.GetDataFromNodes(document, xPaths, false);
            return Regex.Replace(raw, "\n|\t|\r", "").Trim();
        }

        // Special case for configuration 1: look the value up by its label.
        var tableNode = document.DocumentNode.SelectSingleNode(xPaths[0]);
        if (tableNode == null)
        {
            return "";
        }

        string[] cells = tableNode.InnerText
            .Replace(">", "")
            .Split(new char[] { '\n', '\r', '\t' }, StringSplitOptions.RemoveEmptyEntries);

        // The last cell cannot be a label with a value after it, so stop one short.
        for (int i = 0; i < cells.Length - 1; i++)
        {
            if (cells[i].Contains("Tình trạng:"))
            {
                return cells[i + 1];
            }
        }

        return "";
    }
    catch (Exception ex)
    {
        log.ErrorFormat("Exception:{0}", ex.Message);
        return "";
    }
}
/// <summary>
/// Returns the result of <c>ParseDate</c> for the XPaths in <c>configXPath.last_edit_xpaths</c>.
/// </summary>
/// <param name="url">Page address, forwarded to <c>ParseDate</c>.</param>
/// <param name="document">Parsed HTML document, forwarded to <c>ParseDate</c>.</param>
/// <param name="configXPath">Configuration that supplies the ID and the last-edit XPaths.</param>
/// <returns>Whatever <c>ParseDate</c> returns for these arguments.</returns>
private DateTime GetLastEdit(string url, HtmlDocument document, ConfigXPaths configXPath)
{
    var configId = configXPath.ID;
    var lastEditXPaths = configXPath.last_edit_xpaths;

    return ParseDate(url, document, configId, lastEditXPaths);
}
/// <summary>
/// Copies the values currently entered in the form's controls into <paramref name="config"/>.
/// </summary>
/// <param name="config">Configuration to fill; its ID is overwritten with the value of <c>configXPathIDSpinEdit</c>.</param>
/// <returns>
/// <c>true</c> when every value was copied; <c>false</c> when an exception occurred, in which
/// case the message is shown to the user and <paramref name="config"/> may be partly updated.
/// </returns>
private bool LoadFormToConfig(ref ConfigXPaths config)
{
    try
    {
        config.website_id = Convert.ToInt32(CboWebSiteID.SelectedValue);

        // Field XPaths.
        config.TitleXPaths = wssCommon.GetListXPathFromString(this.TitleXPathsTextBox.Text);
        config.PriceXPaths = wssCommon.GetListXPathFromString(this.PriceXPathsTextBox.Text);
        config.PostDateXPaths = wssCommon.GetListXPathFromString(this.PostDateXPathsTextBox.Text);
        config.LastChangeXPaths = wssCommon.GetListXPathFromString(this.LastChangeXPathsTextBox.Text);
        config.ProvinceXPaths = wssCommon.GetListXPathFromString(this.ProvinceXPathsTextBox.Text);
        config.PhoneSalerXPaths = wssCommon.GetListXPathFromString(this.PhoneSalerXPathsTextBox.Text);
        config.AddressXPaths = wssCommon.GetListXPathFromString(this.AddressXPathsTextBox.Text);
        config.ContentXPaths = wssCommon.GetListXPathFromString(this.ContentXPathTextbox.Text);
        config.AvaiableXPaths = wssCommon.GetListXPathFromString(this.AvaiableXPathsTextBox.Text);
        config.QualityXPaths = wssCommon.GetListXPathFromString(this.QualityXPathsTextBox.Text);
        config.UserNameXPaths = wssCommon.GetListXPathFromString(this.UserNameXPathsTextBox.Text);
        config.ThumbUrlXPaths = wssCommon.GetListXPathFromString(this.txtThumbXPaths.Text);

        // This flag used to be assigned a second time further down; once is enough.
        config.AllowExtractProductLink = ckExtractProduct.Checked;

        // Regexes for the reload crawl.
        config.ReloadVisitUrlsRegex = wssCommon.GetListXPathFromString(this.ExtrationRegex_richTextBox4.Text);
        config.ReloadNoVisitUrlsRegex = wssCommon.GetListXPathFromString(this.NoExtrationRegex_richTextBox4.Text);
        config.ReloadProductUrlsRegex = wssCommon.GetListXPathFromString(this.ProductRegex_RichTextBox.Text);
        config.ReloadNoProductUrlRegex = wssCommon.GetListXPathFromString(this.NoProductRegex_RichTextBox.Text);
        config.RegexReloadLikeFull = this.RegexReloadLikeFull_CheckEdit.Checked;

        // Image regexes and test URL.
        config.image_regex = wssCommon.GetListXPathFromString(this.txtImage.Text);
        config.noimage_regex = wssCommon.GetListXPathFromString(this.txtNoRegexImage.Text);
        config.UrlTest = this.urlTestTextBox.Text;

        // Regexes for the full crawl.
        config.VisitUrlsRegex = wssCommon.GetListXPathFromString(this.visitUrlsRegexTextBox.Text);
        config.NoVisitUrlRegex = wssCommon.GetListXPathFromString(this.noVisitUrlsRegexTextBox.Text);
        config.ProductUrlsRegex = wssCommon.GetListXPathFromString(this.ProductUrlsRegexTextBox.Text);
        config.NoProductUrlRegex = wssCommon.GetListXPathFromString(this.NoProductUrlRegexTextBox.Text);

        // Crawl settings and identity.
        config.TimeDelay = (int)timeDelaySpinEdit.Value;
        config.ItemReCrawler = (int)itemReCrawlerSpinEdit.Value;
        config.RootLink = Common.GetListXPathFromString(rootLinkTextBox.Text);
        config.ID = Convert.ToInt32(configXPathIDSpinEdit.Value);

        config.ImageUrlsXPaths = wssCommon.GetListXPathFromString(this.ImageUrlsXPathsTextBox.Text);
        config.WebCategoryXPaths = wssCommon.GetListXPathFromString(this.webCategoryXPathsTextBox.Text);
        config.extend_xpaths = Common.GetListXPathFromString(this.textPropertyTextBox.Text);
        config.tags_xpaths = Common.GetListXPathFromString(txtTagXPaths.Text);

        // Push and crawl-depth settings.
        config.wss_interval_push = Convert.ToInt32(minMinuteWaitSpin.Value);
        config.wss_allow_auto_push = Convert.ToBoolean(allowAutoPushCheckBox.Checked);
        config.wss_deep_full_crawler = Convert.ToInt32(wss_deep_full_crawler_TextBox.Value);
        config.wss_deep_reload_crawler = Convert.ToInt32(wss_deep_reload_crawler_TextBox.Value);

        config.RegexStringToCategory = Common.GetListXPathFromString(txtRegexCatToID.Text);
        config.Name = txtNameConfig.Text;
        config.last_edit_xpaths = wssCommon.GetListXPathFromString(this.txtLastEditXPaths.Text);
        config.last_comment_xpaths = wssCommon.GetListXPathFromString(this.richTextBox2.Text);
        config.domain = this.txtDomain.Text;
        config.view_count_xpaths = wssCommon.GetListXPathFromString(this.txtViewCount.Text);
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message);
        return false;
    }

    return true;
}
/// <summary>
/// Shows the values of <paramref name="config"/> in the form's controls.
/// Does nothing when <paramref name="config"/> is <c>null</c>.
/// </summary>
/// <param name="config">Configuration to display.</param>
private void LoadConfigToForm(ConfigXPaths config)
{
    if (config == null)
    {
        return;
    }

    // Field XPaths.
    TitleXPathsTextBox.Text = wssCommon.ConvertToString(config.TitleXPaths);
    PriceXPathsTextBox.Text = wssCommon.ConvertToString(config.PriceXPaths);
    PostDateXPathsTextBox.Text = wssCommon.ConvertToString(config.PostDateXPaths);
    LastChangeXPathsTextBox.Text = wssCommon.ConvertToString(config.LastChangeXPaths);
    ProvinceXPathsTextBox.Text = wssCommon.ConvertToString(config.ProvinceXPaths);
    PhoneSalerXPathsTextBox.Text = wssCommon.ConvertToString(config.PhoneSalerXPaths);
    ContentXPathTextbox.Text = wssCommon.ConvertToString(config.ContentXPaths);
    AddressXPathsTextBox.Text = wssCommon.ConvertToString(config.AddressXPaths);
    AvaiableXPathsTextBox.Text = wssCommon.ConvertToString(config.AvaiableXPaths);
    QualityXPathsTextBox.Text = wssCommon.ConvertToString(config.QualityXPaths);
    ImageUrlsXPathsTextBox.Text = wssCommon.ConvertToString(config.ImageUrlsXPaths);
    UserNameXPathsTextBox.Text = wssCommon.ConvertToString(config.UserNameXPaths);
    urlTestTextBox.Text = config.UrlTest;

    // Regexes for the reload crawl.
    ExtrationRegex_richTextBox4.Text = wssCommon.ConvertToString(config.ReloadVisitUrlsRegex);
    NoExtrationRegex_richTextBox4.Text = wssCommon.ConvertToString(config.ReloadNoVisitUrlsRegex);
    ProductRegex_RichTextBox.Text = wssCommon.ConvertToString(config.ReloadProductUrlsRegex);
    NoProductRegex_RichTextBox.Text = wssCommon.ConvertToString(config.ReloadNoProductUrlRegex);
    RegexReloadLikeFull_CheckEdit.Checked = config.RegexReloadLikeFull;

    // Image regexes.
    txtImage.Text = wssCommon.ConvertToString(config.image_regex);
    txtNoRegexImage.Text = wssCommon.ConvertToString(config.noimage_regex);

    // Regexes for the full crawl.
    visitUrlsRegexTextBox.Text = wssCommon.ConvertToString(config.VisitUrlsRegex);
    noVisitUrlsRegexTextBox.Text = wssCommon.ConvertToString(config.NoVisitUrlRegex);
    ProductUrlsRegexTextBox.Text = wssCommon.ConvertToString(config.ProductUrlsRegex);
    NoProductUrlRegexTextBox.Text = wssCommon.ConvertToString(config.NoProductUrlRegex);

    txtThumbXPaths.Text = wssCommon.ConvertToString(config.ThumbUrlXPaths);

    // Crawl settings.
    timeDelaySpinEdit.Value = config.TimeDelay;
    itemReCrawlerSpinEdit.Value = config.ItemReCrawler;
    rootLinkTextBox.Text = Common.ConvertToString(config.RootLink);
    webCategoryXPathsTextBox.Text = wssCommon.ConvertToString(config.WebCategoryXPaths);
    textPropertyTextBox.Text = Common.ConvertToString(config.extend_xpaths);
    txtTagXPaths.Text = Common.ConvertToString(config.tags_xpaths);
    ckExtractProduct.Checked = config.AllowExtractProductLink;

    // Push and crawl-depth settings.
    allowAutoPushCheckBox.Checked = config.wss_allow_auto_push;
    minMinuteWaitSpin.Value = config.wss_interval_push;
    dateTimePicker1.Value = config.wss_last_push;
    wss_deep_full_crawler_TextBox.Value = config.wss_deep_full_crawler;
    wss_deep_reload_crawler_TextBox.Value = config.wss_deep_reload_crawler;

    // Remaining fields. The configuration ID is not written to any control here.
    txtRegexCatToID.Text = Common.ConvertToString(config.RegexStringToCategory);
    txtNameConfig.Text = Common.Obj2String(config.Name);
    txtLastEditXPaths.Text = wssCommon.ConvertToString(config.last_edit_xpaths);
    richTextBox2.Text = wssCommon.ConvertToString(config.last_comment_xpaths);
    txtViewCount.Text = wssCommon.ConvertToString(config.view_count_xpaths);
    txtDomain.Text = config.domain;
    CboWebSiteID.SelectedValue = config.website_id;
}
/// <summary>
/// Classifies a page by the category trail returned by <c>GetCategory</c>.
/// </summary>
/// <param name="c_url">Page address (unused).</param>
/// <param name="document">Parsed HTML document, forwarded to <c>GetCategory</c>.</param>
/// <param name="config">Configuration, forwarded to <c>GetCategory</c>.</param>
/// <returns>
/// <c>TypeCrawlerData.PhoneComputer</c> when the trimmed trail is non-empty and matches the
/// phone-category pattern, otherwise <c>TypeCrawlerData.None</c>.
/// </returns>
public Model.SaleNews.TypeCrawlerData GetTypeOfLink(string c_url, GABIZ.Base.HtmlAgilityPack.HtmlDocument document, ConfigXPaths config)
{
    string categoryTrail = GetCategory(document, config).Trim();

    bool isPhoneCategory =
        !string.IsNullOrEmpty(categoryTrail)
        && Regex.IsMatch(categoryTrail, "Nhật tảo->Mua bán->Điện thoại.*");

    return isPhoneCategory
        ? Model.SaleNews.TypeCrawlerData.PhoneComputer
        : Model.SaleNews.TypeCrawlerData.None;
}
/// <summary>
/// Returns <c>SqlDb.MinDateDb</c>; nothing is extracted from the document.
/// </summary>
/// <param name="document">Parsed HTML document (unused).</param>
/// <param name="configXPath">Configuration (unused).</param>
/// <returns>Always <c>SqlDb.MinDateDb</c>.</returns>
private DateTime GetLastChange(HtmlDocument document, ConfigXPaths configXPath)
{
    // Both arguments are ignored.
    DateTime fixedValue = SqlDb.MinDateDb;
    return fixedValue;
}
/// <summary>
/// Runs the crawler for one configuration in a loop until <c>bPause</c> is set or the thread
/// is aborted. Each pass creates a new <c>SimpleCrawlerRaoVat</c>, forwards its events to
/// <c>WriteLog</c> and to the two counters on the form, runs it, and then sleeps.
/// Exceptions other than a thread abort are logged and the loop continues after the same pause.
/// </summary>
/// <param name="configID">ID of the configuration to crawl.</param>
/// <param name="typeCrawler">Crawl mode passed to every <c>SimpleCrawlerRaoVat</c> instance.</param>
public void doCrawl(int configID, ETypeCrawlRaoVat typeCrawler)
{
    const int pauseBetweenRunsMs = 20000;

    // Look the configuration up with the ID this method was called with; the old code
    // ignored the parameter and used this.ConfigID.
    // NOTE(review): the loaded configuration is never read afterwards. The call is kept
    // because it runs outside the try block and a failure here still ends the method.
    SqlDb sqlDb = new SqlDb(QT.Entities.Server.ConnectionStringCrawler);
    RaoVatSQLAdapter adapter = new Entities.RaoVat.RaoVatSQLAdapter(sqlDb);
    adapter.GetConfigByID(configID);

    while (!this.bPause)
    {
        try
        {
            SimpleCrawlerRaoVat crawler = new SimpleCrawlerRaoVat(configID, typeCrawler);

            crawler.eventWhenStart += new Crawler.AbstractionCrawler.EventReportRun(delegate(object sender, string mss)
            {
                WriteLog("eventWhenStart", mss);
            });

            crawler.eventWhenGetJob += new Crawler.AbstractionCrawler.EventReportRun(delegate(object sender, string mss)
            {
                WriteLog("eventWhenGetJob", mss);

                // The counter is a UI control, so update it through Invoke.
                this.Invoke(new Action(() =>
                {
                    this.spinnumberVisitedLink.Value = Convert.ToInt32(spinnumberVisitedLink.Value) + 1;
                }));
            });

            crawler.eventWhenPushJob += new Crawler.AbstractionCrawler.EventReportRun(delegate(object sender, string mss)
            {
                WriteLog("eventWhenPushJob", mss);
            });

            crawler.eventWhenEnd += new Crawler.AbstractionCrawler.EventReportRun(delegate(object sender, string mss)
            {
                WriteLog("eventWhenEnd", mss);
            });

            crawler.eventWhenSuccessProduct += new Crawler.AbstractionCrawler.EventReportRun(delegate(object sender, string mss)
            {
                WriteLog("eventWhenSuccessProduct", mss);
                this.Invoke(new Action(() =>
                {
                    this.spinNumberProduct.Value = Convert.ToInt32(spinNumberProduct.Value) + 1;
                }));
            });

            crawler.StartCrawler();
            Thread.Sleep(pauseBetweenRunsMs);
        }
        catch (ThreadAbortException)
        {
            break;
        }
        catch (Exception ex)
        {
            // Pass the message as an argument, never as the format string: braces in the
            // exception text would otherwise be read as format placeholders.
            log.ErrorFormat("{0}", ex.Message);
            WriteLog("Exception", string.Format("Exception:{0}", ex.Message));
            Thread.Sleep(pauseBetweenRunsMs);
        }
    }
}