/// <summary>
/// Copies the fields of the selected site parameter into the editor controls and
/// rebinds the custom-processor grid.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void SiteComboBox_SelectedIndexChanged(object sender, EventArgs e)
{
    // No selected value: leave the editor controls untouched.
    if (siteComboBox.SelectedValue == null)
    {
        return;
    }

    var selected = siteComboBox.SelectedItem as SiteParameter;

    // The assignment order is kept exactly as it was; assigning control values may
    // raise their change handlers.
    siteNameTextBox.Text = selected.SiteName;
    urlTextBox.Text = selected.UrlPattern;
    startUrlTextBox.Text = selected.StartUrl;
    itemTextBox.Text = selected.ItemPattern;
    startNumber.Value = selected.StartNumber;

    // A missing page step is displayed as 0.
    pageStepNumber.Value = selected.PageStepNumber ?? 0;

    captionPosition.Value = selected.CaptionPosition;
    urlPosition.Value = selected.UrlPosition;
    datePosition.Value = selected.DatePosition;

    categoryTextBox.Text = selected.CategoryPattern;
    indexCodeTextBox.Text = selected.IndexCodePattern;
    issueCodeTextBox.Text = selected.IssueCodePattern;
    publishAgencyTextBox.Text = selected.PublishAgencyPattern;
    keywordTextBox.Text = selected.KeywordPattern;
    attachmentTextBox.Text = selected.AttachmentPattern;
    publishDateTextBox.Text = selected.PublishDatePattern;
    contentTextBox.Text = selected.ContentPattern;

    BindSource(selected);
}
/// <summary>
/// Adds the key/value pair typed into the parse text boxes to the selected site
/// parameter's custom processors (overwriting the value when the key already exists),
/// rebinds the grid and clears both text boxes.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void AddDicButton_Click(object sender, EventArgs e)
{
    if (this.siteComboBox.SelectedValue == null)
    {
        return;
    }

    if (string.IsNullOrWhiteSpace(this.parseKeyTextBox.Text) || string.IsNullOrWhiteSpace(this.parseValueTextBox.Text))
    {
        this.testLogTextBox.Text = "Please input key and value";
        return;
    }

    SiteParameter siteParameter = siteComboBox.SelectedItem as SiteParameter;
    if (siteParameter == null)
    {
        // The selected item is not a SiteParameter; there is nothing to add the entry to.
        return;
    }

    if (siteParameter.CustomProcessors == null)
    {
        siteParameter.CustomProcessors = new Dictionary<string, string>();
    }

    // The indexer setter adds a missing key and replaces the value of an existing one,
    // so no separate "contains" check is needed.
    siteParameter.CustomProcessors[this.parseKeyTextBox.Text] = this.parseValueTextBox.Text;

    BindSource(siteParameter);
    this.parseKeyTextBox.Text = string.Empty;
    this.parseValueTextBox.Text = string.Empty;
}
/// <summary>
/// Saves the data: applies the form values to the language entry and the site
/// configuration, marks the chosen data type as selected, and persists the configuration.
/// </summary>
/// <param name="sender"></param>
/// <param name="e"></param>
protected void btnSave_Click(Object sender, EventArgs e)
{
    #region Save changes
    var config = SiteParameter.Config;

    // Reuse the entry for the selected language, or register a new one.
    var lang = config.Languages.Find(entry => entry.language == Language.SelectedValue);
    if (lang == null)
    {
        lang = new Language();
        config.Languages.Add(lang);
    }

    lang = this.GetFormValue<Language>(lang);
    config = this.GetFormValue<SiteParameter>(config);

    // Clear every selection flag before marking the chosen data type.
    foreach (var option in config.DataTypeOptions)
    {
        option.Selected = false;
    }

    var datatype = config.DataTypeOptions.Find(option => option.Name == @DataType.SelectedValue);
    if (datatype != null)
    {
        datatype.Selected = true;
        datatype.ConnectionString = ConnectionString.Text;
    }

    SiteParameter.SaveConfig();
    #endregion

    Alert("保存成功!", "success");
}
/// <summary>
/// Creates a page reader over the given site parameter, HTML reader and item reader.
/// The page number starts at the site parameter's <c>StartNumber</c>.
/// </summary>
/// <param name="siteParameter">Site settings; must not be null.</param>
/// <param name="htmlReader">HTML reader; must not be null.</param>
/// <param name="itemReader">Item reader; must not be null.</param>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public ParamPageReader(SiteParameter siteParameter, IHtmlReader htmlReader, IItemReader itemReader)
{
    if (siteParameter == null)
    {
        throw new ArgumentNullException(nameof(siteParameter));
    }
    this.siteParameter = siteParameter;

    if (htmlReader == null)
    {
        throw new ArgumentNullException(nameof(htmlReader));
    }
    this.htmlReader = htmlReader;

    if (itemReader == null)
    {
        throw new ArgumentNullException(nameof(itemReader));
    }
    this.itemReader = itemReader;

    this.pageNumber = siteParameter.StartNumber;
}
/// <summary>
/// Builds a site crawler for the given site parameter. The named registrations to
/// resolve from the container are looked up in the site's custom processors.
/// </summary>
/// <param name="siteParameter">Site settings; must not be null.</param>
/// <returns>A <c>GeneralSiteCrawler</c> wired with the resolved page reader and page parser.</returns>
/// <exception cref="ArgumentNullException"><paramref name="siteParameter"/> is null.</exception>
public static ISiteCrawler Create(SiteParameter siteParameter)
{
    if (siteParameter == null)
    {
        throw new ArgumentNullException(nameof(siteParameter));
    }

    var processors = siteParameter.CustomProcessors;

    // Registration names, in the same lookup order as before.
    // NOTE(review): the data-service name is looked up but not used anywhere in this method.
    string dataServiceKey = GetValueOrDefault(processors, "IDataService");
    string htmlReaderKey = GetValueOrDefault(processors, "IHtmlReader");
    string pageParserKey = GetValueOrDefault(processors, "IPageParser");
    string itemReaderKey = GetValueOrDefault(processors, "IItemReader");
    string pageReaderKey = GetValueOrDefault(processors, "IPageReader");

    IHtmlReader htmlReader = Container.Resolve<IHtmlReader>(htmlReaderKey);
    var htmlReaderOverride = new ParameterOverride("htmlReader", htmlReader);
    var siteParameterOverride = new ParameterOverride("siteParameter", siteParameter);

    IItemReader itemReader = Container.Resolve<IItemReader>(itemReaderKey, siteParameterOverride);
    var itemReaderOverride = new ParameterOverride("itemReader", itemReader);

    IPageReader pageReader = Container.Resolve<IPageReader>(
        pageReaderKey,
        siteParameterOverride,
        htmlReaderOverride,
        itemReaderOverride);
    IPageParser pageParser = Container.Resolve<IPageParser>(
        pageParserKey,
        siteParameterOverride,
        htmlReaderOverride);

    return new GeneralSiteCrawler(pageReader, pageParser);
}
/// <summary>
/// Creates a crawler that uses a <c>RegexItemReader</c>, an <c>HttpClientReader</c>,
/// a <c>SequentialPageReader</c> and a <c>RegexPageParser</c> for the given site.
/// </summary>
/// <param name="siteParameter">Site settings passed to the readers and the parser.</param>
public GeneralSiteCrawler(SiteParameter siteParameter)
{
    // The item reader is constructed before the HTML reader, as before.
    IItemReader items = new RegexItemReader(siteParameter);
    IHtmlReader html = new HttpClientReader();

    pageReader = new SequentialPageReader(siteParameter, html, items);
    pageParser = new RegexPageParser(siteParameter, html);
}
/// <summary>
/// Adds a site parameter named "New" to the list and selects it in the combo box.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void NewButton_Click(object sender, EventArgs e)
{
    var created = new SiteParameter();
    created.SiteName = "New";

    siteParameters.Add(created);
    siteComboBox.SelectedItem = created;
}
/// <summary>
/// Writes the content text box value to the selected site parameter's content pattern.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Content_ValueChanged(object sender, EventArgs e)
{
    // Only act when the combo box has a selected value.
    if (siteComboBox.SelectedValue != null)
    {
        var selected = siteComboBox.SelectedItem as SiteParameter;
        selected.ContentPattern = contentTextBox.Text;
    }
}
/// <summary>
/// Writes the publish-date text box value to the selected site parameter; blank or
/// whitespace-only text is stored as null.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void PublishDate_ValueChanged(object sender, EventArgs e)
{
    // Only act when the combo box has a selected value.
    if (siteComboBox.SelectedValue != null)
    {
        var selected = siteComboBox.SelectedItem as SiteParameter;
        if (string.IsNullOrWhiteSpace(publishDateTextBox.Text))
        {
            selected.PublishDatePattern = null;
        }
        else
        {
            selected.PublishDatePattern = publishDateTextBox.Text;
        }
    }
}
/// <summary>
/// Writes the date-position control value (cast to int) to the selected site parameter.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void DatePosition_ValueChanged(object sender, EventArgs e)
{
    // Only act when the combo box has a selected value.
    if (siteComboBox.SelectedValue != null)
    {
        var selected = siteComboBox.SelectedItem as SiteParameter;
        selected.DatePosition = (int)datePosition.Value;
    }
}
/// <summary>
/// Writes the start-number control value (cast to int) to the selected site parameter.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void StartNumber_ValueChanged(object sender, EventArgs e)
{
    // Only act when the combo box has a selected value.
    if (siteComboBox.SelectedValue != null)
    {
        var selected = siteComboBox.SelectedItem as SiteParameter;
        selected.StartNumber = (int)startNumber.Value;
    }
}
/// <summary>
/// Writes the item text box value to the selected site parameter's item pattern.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void ItemTextBox_TextChanged(object sender, EventArgs e)
{
    // Only act when the combo box has a selected value.
    if (siteComboBox.SelectedValue != null)
    {
        var selected = siteComboBox.SelectedItem as SiteParameter;
        selected.ItemPattern = itemTextBox.Text;
    }
}
/// <summary>
/// Writes the site-name text box value to the selected site parameter and resets that
/// item in the <c>siteParameters</c> list.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void SiteNameTextBox_TextChanged(object sender, EventArgs e)
{
    // Only act when the combo box has a selected value.
    if (siteComboBox.SelectedValue != null)
    {
        var selected = siteComboBox.SelectedItem as SiteParameter;
        selected.SiteName = siteNameTextBox.Text;

        int position = siteParameters.IndexOf(selected);
        siteParameters.ResetItem(position);
    }
}
/// <summary>
/// Binds the custom processors of the given site parameter to the parse grid as
/// Key/Value rows, or clears the grid when there are none, and removes any selection.
/// </summary>
/// <param name="siteParameter">Site parameter whose custom processors are shown.</param>
private void BindSource(SiteParameter siteParameter)
{
    var processors = siteParameter.CustomProcessors;
    bool hasEntries = processors != null && processors.Keys.Any();
    if (!hasEntries)
    {
        parseDicDataGridView.DataSource = null;
        return;
    }

    var rows = processors
        .Select(entry => new { Key = entry.Key, Value = entry.Value })
        .ToArray();
    parseDicDataGridView.DataSource = rows;

    // Drop the selection the grid has after binding.
    parseDicDataGridView.Rows[0].Selected = false;
    parseDicDataGridView.CurrentCell = null;
    parseDicDataGridView.ClearSelection();
}
/// <summary>
/// Creates a crawler with a database-backed data service, a regex item reader, an HTTP
/// client reader, a sequential page reader and a regex page parser. Parser errors are
/// written to the data service as crawler log entries.
/// </summary>
/// <param name="siteParameter">Site settings passed to the readers and the parser.</param>
public GeneralSiteCrawler(SiteParameter siteParameter)
{
    dataService = new DbDataService(CrawlerDbHelper.GetContext());

    // The item reader is constructed before the HTML reader, as before.
    IItemReader items = new RegexItemReader(siteParameter);
    IHtmlReader html = new HttpClientReader();

    pageReader = new SequentialPageReader(siteParameter, html, items);
    pageParser = new RegexPageParser(siteParameter, html);

    // Record the failing URL, the current local time and the exception message.
    pageParser.SetErrorHandler(
        (url, exception) => dataService.AddLog(
            new CrawlerLog
            {
                Url = url,
                LogTime = DateTime.Now,
                Message = exception.Message
            }));
}
/// <summary>
/// Writes the page-step control value to the selected site parameter; a value of 0 is
/// stored as null.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void PageStepNumber_ValueChanged(object sender, EventArgs e)
{
    // Only act when the combo box has a selected value.
    if (siteComboBox.SelectedValue != null)
    {
        var selected = siteComboBox.SelectedItem as SiteParameter;
        if (pageStepNumber.Value != 0)
        {
            selected.PageStepNumber = Convert.ToInt32(pageStepNumber.Value);
        }
        else
        {
            selected.PageStepNumber = null;
        }
    }
}
/// <summary>
/// Removes the custom processor whose key is in the first cell of the first selected
/// grid row from the selected site parameter, then rebinds the grid. When the last entry
/// is removed the dictionary is set to null.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void removeDicButton_Click(object sender, EventArgs e)
{
    var rows = this.parseDicDataGridView.SelectedRows;
    if (rows.Count < 1)
    {
        this.testLogTextBox.Text = "Please select an item delete";
        return;
    }

    SiteParameter siteParameter = siteComboBox.SelectedItem as SiteParameter;
    object keyCell = rows[0].Cells[0].Value;

    // Guard against a missing selection, a missing dictionary or an empty key cell
    // instead of failing with a NullReferenceException.
    if (siteParameter == null || siteParameter.CustomProcessors == null || keyCell == null)
    {
        return;
    }

    siteParameter.CustomProcessors.Remove(keyCell.ToString());
    if (!siteParameter.CustomProcessors.Any())
    {
        siteParameter.CustomProcessors = null;
    }

    BindSource(siteParameter);
}
/// <summary>
/// Loads the first stored site-parameter record, builds a copy whose send*/BankZhuanz/
/// OnlinePayment flags come from the arguments and whose other fields are carried over,
/// and passes the copy to <c>dbm.UpdateSite</c>.
/// </summary>
/// <param name="a01">Value for <c>sendFreezeEmail</c>.</param>
/// <param name="a0">Value for <c>sendRecoverEmail</c>.</param>
/// <param name="a1">Value for <c>sendZhuanzEmail</c>.</param>
/// <param name="a2">Value for <c>sendWithdrawEmail</c>.</param>
/// <param name="a3">Value for <c>sendreChargeEmail</c>.</param>
/// <param name="a4">Value for <c>sendFreezeMsg</c>.</param>
/// <param name="a5">Value for <c>sendThawMsg</c>.</param>
/// <param name="a6">Value for <c>sendZhuanzMsg</c>.</param>
/// <param name="a7">Value for <c>sendWithdrawMsg</c>.</param>
/// <param name="a8">Value for <c>sendreChargeMsg</c>.</param>
/// <param name="a9">Value for <c>BankZhuanz</c>.</param>
/// <param name="a10">Value for <c>OnlinePayment</c>.</param>
/// <returns>The result of <c>dbm.UpdateSite</c>, or false when no record exists.</returns>
bool DefoultUpadte3(bool a01, bool a0, bool a1, bool a2, bool a3, bool a4, bool a5, bool a6, bool a7, bool a8, bool a9, bool a10)
{
    // Load the first stored record; there is nothing to update when none exists.
    var item = dbm.SiteParameters.FirstOrDefault();
    if (item == null)
    {
        return false;
    }

    SiteParameter sp = new SiteParameter
    {
        Id = item.Id,
        Copyright = item.Copyright,
        Describe = item.Describe,
        Keyword = item.Keyword,
        SiteName = item.SiteName,
        SiteUrl = item.SiteUrl,
        OpenSite = item.OpenSite,
        siteState = item.siteState,
        userRegiste = item.userRegiste,
        trader = item.trader,
        tradePwd = item.tradePwd,
        outTime = item.outTime,
        emailUserPwd = item.emailUserPwd,
        emailUserName = item.emailUserName,
        // Carried over like the other mail-account fields; it was previously omitted.
        // NOTE(review): assumes UpdateSite persists every property, so an omitted
        // field would be reset to its default - confirm.
        emaiSendName = item.emaiSendName,
        SendEmail = item.SendEmail,
        sendFreezeEmail = a01,
        sendRecoverEmail = a0,
        sendZhuanzEmail = a1,
        sendWithdrawEmail = a2,
        sendreChargeEmail = a3,
        sendFreezeMsg = a4,
        sendThawMsg = a5,
        sendZhuanzMsg = a6,
        sendWithdrawMsg = a7,
        sendreChargeMsg = a8,
        BankZhuanz = a9,
        OnlinePayment = a10
    };

    bool r = dbm.UpdateSite(sp);
    return r;
}
/// <summary>
/// Builds a copy of <paramref name="item"/> whose trade and mail-account fields come from
/// the arguments and whose other fields are carried over, and passes the copy to
/// <c>dbm.UpdateSite</c>.
/// </summary>
/// <param name="item">Record providing the id and the fields that are not being changed.</param>
/// <param name="a1">Value for <c>trader</c>.</param>
/// <param name="a2">Value for <c>tradePwd</c>.</param>
/// <param name="a3">Value for <c>outTime</c>.</param>
/// <param name="a4">Value for <c>emailUserPwd</c>.</param>
/// <param name="a5">Value for <c>emailUserName</c>.</param>
/// <param name="a6">Value for <c>emaiSendName</c>.</param>
/// <param name="a7">Value for <c>SendEmail</c>.</param>
/// <returns>The result of <c>dbm.UpdateSite</c>.</returns>
bool DefoultUpadte2(SiteParameter item, bool a1, bool a2, string a3, string a4, string a5, string a6, string a7)
{
    SiteParameter sp = new SiteParameter
    {
        Id = item.Id,
        Copyright = item.Copyright,
        Describe = item.Describe,
        Keyword = item.Keyword,
        SiteName = item.SiteName,
        SiteUrl = item.SiteUrl,
        OpenSite = item.OpenSite,
        siteState = item.siteState,
        userRegiste = item.userRegiste,
        trader = a1,
        tradePwd = a2,
        outTime = a3,
        emailUserPwd = a4,
        emailUserName = a5,
        emaiSendName = a6,
        SendEmail = a7,
        sendFreezeEmail = item.sendFreezeEmail,
        // Carried over like the other send* flags; it was previously omitted.
        // NOTE(review): assumes UpdateSite persists every property, so an omitted
        // field would be reset to its default - confirm.
        sendRecoverEmail = item.sendRecoverEmail,
        sendZhuanzEmail = item.sendZhuanzEmail,
        sendWithdrawEmail = item.sendWithdrawEmail,
        sendreChargeEmail = item.sendreChargeEmail,
        sendFreezeMsg = item.sendFreezeMsg,
        sendThawMsg = item.sendThawMsg,
        sendZhuanzMsg = item.sendZhuanzMsg,
        sendWithdrawMsg = item.sendWithdrawMsg,
        sendreChargeMsg = item.sendreChargeMsg,
        BankZhuanz = item.BankZhuanz,
        OnlinePayment = item.OnlinePayment
    };

    bool r = dbm.UpdateSite(sp);
    return r;
}
/// <summary>
/// Reads the article list, loads each article's details, stores articles and their
/// attachments through the data service, stores the monitor record and logs the totals
/// and the elapsed time.
/// </summary>
/// <param name="siteParameter">Site settings used to fill the monitor record.</param>
public void Crawl(SiteParameter siteParameter)
{
    var timer = Stopwatch.StartNew();

    var monitor = new ArticleMonitor();
    monitor.StartTime = DateTime.Now;
    monitor.SiteName = siteParameter.SiteName;

    // Prefer the explicit start URL; otherwise build one from the URL pattern.
    monitor.SiteUrl = string.IsNullOrWhiteSpace(siteParameter.StartUrl)
        ? string.Format(siteParameter.UrlPattern, siteParameter.StartNumber, siteParameter.PageStepNumber)
        : siteParameter.StartUrl;

    IEnumerable<Article> listed = pageReader.GetArticals().ToArray();
    IEnumerable<Article> articles = listed
        .Select(article => pageParser.GetArticleDetails(article))
        .ToArray();
    dataService.AddOrUpdateArticles(articles, monitor);

    int attachmentCount = 0;
    foreach (var article in articles)
    {
        var found = pageParser.GetAttachments(article);
        if (found != null)
        {
            attachmentCount += found.Count();
        }

        // The attachments are handed to the data service even when null.
        dataService.AddOrUpdateArticleAttachments(found);
    }

    dataService.AddOrUpdateArticleMontior(monitor);

    string summary = string.Format("{0} articles crawled, {1} attachments crawled.", articles.Count(), attachmentCount);
    Logging.WriteEntry(this, LogType.Information, summary);
    Logging.WriteEntry(this, LogType.Information, $"{timer.Elapsed} elapsed.");
}
/// <summary>
/// Builds a copy of <paramref name="item"/> whose general site fields come from the
/// arguments and whose other fields are carried over, and passes the copy to
/// <c>dbm.UpdateSite</c>.
/// </summary>
/// <param name="item">Record providing the id and the fields that are not being changed.</param>
/// <param name="a1">Value for <c>Copyright</c>.</param>
/// <param name="a2">Value for <c>Describe</c>.</param>
/// <param name="a3">Value for <c>Keyword</c>.</param>
/// <param name="a4">Value for <c>SiteName</c>.</param>
/// <param name="a5">Value for <c>SiteUrl</c>.</param>
/// <param name="a6">Value for <c>OpenSite</c>.</param>
/// <param name="a7">Value for <c>siteState</c>.</param>
/// <param name="a8">Value for <c>userRegiste</c>.</param>
/// <returns>The result of <c>dbm.UpdateSite</c>.</returns>
bool DefoultUpadte(SiteParameter item, string a1, string a2, string a3, string a4, string a5, bool a6, bool a7, bool a8)
{
    SiteParameter sp = new SiteParameter
    {
        Id = item.Id,
        Copyright = a1,
        Describe = a2,
        Keyword = a3,
        SiteName = a4,
        SiteUrl = a5,
        OpenSite = a6,
        siteState = a7,
        userRegiste = a8,
        // Carried over like tradePwd; it was previously omitted.
        // NOTE(review): assumes UpdateSite persists every property, so an omitted
        // field would be reset to its default - confirm.
        trader = item.trader,
        tradePwd = item.tradePwd,
        outTime = item.outTime,
        emailUserPwd = item.emailUserPwd,
        emailUserName = item.emailUserName,
        emaiSendName = item.emaiSendName,
        SendEmail = item.SendEmail,
        sendFreezeEmail = item.sendFreezeEmail,
        sendRecoverEmail = item.sendRecoverEmail,
        sendZhuanzEmail = item.sendZhuanzEmail,
        sendWithdrawEmail = item.sendWithdrawEmail,
        sendreChargeEmail = item.sendreChargeEmail,
        sendFreezeMsg = item.sendFreezeMsg,
        sendThawMsg = item.sendThawMsg,
        sendZhuanzMsg = item.sendZhuanzMsg,
        sendWithdrawMsg = item.sendWithdrawMsg,
        sendreChargeMsg = item.sendreChargeMsg,
        BankZhuanz = item.BankZhuanz,
        OnlinePayment = item.OnlinePayment
    };

    bool r = dbm.UpdateSite(sp);
    return r;
}
/// <summary>
/// Sets the context on <c>sp</c> to <c>db</c> and updates the given site parameter through it.
/// </summary>
/// <param name="p">Site parameter passed to <c>sp.Update</c>.</param>
/// <returns>The result of <c>sp.Update</c>.</returns>
public bool UpdateSite(SiteParameter p)
{
    sp.SetCtx(db);
    bool updated = sp.Update(p);
    return updated;
}
/// <summary>
/// Runs a trial crawl for the selected site: requests the list page, extracts items with
/// the item pattern, requests each detail page, fills one table row per item and shows
/// the table in the grid. Progress is written to the test log text box.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void TestButton_Click(object sender, EventArgs e)
{
    if (siteComboBox.SelectedValue == null)
    {
        return;
    }

    var site = siteComboBox.SelectedItem as SiteParameter;

    // The list URL is built from the URL pattern and stored back on the selected
    // parameter; a missing page step counts as 1.
    site.StartUrl = string.Format(site.UrlPattern, site.StartNumber * (site.PageStepNumber ?? 1));

    var log = new StringBuilder();

    // Appends a line to the log and refreshes the log text box.
    Action<string> report = text =>
    {
        log.Append(text);
        testLogTextBox.Text = log.ToString();
    };

    report("Start crawl...\r\n");
    report(string.Format("Current list url:{0}\r\n", site.StartUrl));

    string listError = string.Empty;
    string listHtml = CreateHttpWebRequest(site.StartUrl, out listError);
    if (!string.IsNullOrWhiteSpace(listError))
    {
        report(string.Format("Crawl list failed,because {0}\r\n", listError));
        report("End crawler...\r\n");
        return;
    }

    var table = new DataTable();
    string[] columnNames =
    {
        "URL", "Caption", "Category", "IndexCode", "IssueCode",
        "PublishAgency", "Keyword", "Publish Date", "Attachment"
    };
    foreach (string columnName in columnNames)
    {
        table.Columns.Add(columnName);
    }

    report("Analysis list html...\r\n");

    var itemRegex = new Regex(site.ItemPattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
    MatchCollection matches = itemRegex.Matches(listHtml);

    // This line is appended without refreshing the text box; the next report shows it.
    log.Append(string.Format("Analysis list total:{0}\r\n", matches.Count));
    if (matches.Count < 1)
    {
        report("End crawler...\r\n");
        return;
    }

    foreach (Match match in matches)
    {
        DataRow row = table.NewRow();

        // Resolve the item link against the list URL.
        string detailUrl = new Uri(new Uri(site.StartUrl), match.Groups[site.UrlPosition].Value).AbsoluteUri;
        report(string.Format("Current detail url:{0}\r\n", detailUrl));

        row[0] = detailUrl;
        row[1] = match.Groups[site.CaptionPosition].Value;
        if (datePosition.Value > 0)
        {
            row[7] = match.Groups[site.DatePosition].Value;
        }

        string detailError = string.Empty;
        string detailHtml = CreateHttpWebRequest(detailUrl, out detailError);
        if (!string.IsNullOrWhiteSpace(detailError))
        {
            // Rows whose detail page failed are not added to the table.
            report(string.Format("Crawl detail failed,because {0}\r\n", detailError));
            continue;
        }

        report("Analysis detail html...\r\n");

        // Fill the remaining columns from the detail page.
        AnalysisDetailHtml(detailHtml, site, row);
        table.Rows.Add(row);
        dataGridView.DataSource = table;
    }

    report("End crawler...\r\n");
}
/// <summary>
/// Creates an item reader for the given site and compiles its item pattern as a
/// case-insensitive, multiline regular expression.
/// </summary>
/// <param name="siteParameter">Site settings; must not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="siteParameter"/> is null.</exception>
public RegexItemReader(SiteParameter siteParameter)
{
    if (siteParameter == null)
    {
        throw new ArgumentNullException(nameof(siteParameter));
    }
    this.siteParameter = siteParameter;

    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled;
    pattern = new Regex(siteParameter.ItemPattern, options);
}
/// <summary>
/// Fills columns 2-8 of <paramref name="dr"/> from a detail page using the patterns of
/// <paramref name="siteParameter"/>. Each configured pattern is matched once,
/// case-insensitively; a column is only written when its pattern matches.
/// </summary>
/// <param name="detailResult">HTML of the detail page.</param>
/// <param name="siteParameter">Site settings providing the patterns.</param>
/// <param name="dr">Row to fill: 2 category, 3 index code, 4 issue code, 5 publish agency,
/// 6 keyword, 7 publish date, 8 attachments.</param>
public void AnalysisDetailHtml(string detailResult, SiteParameter siteParameter, DataRow dr)
{
    FillFromFirstGroup(dr, 2, detailResult, siteParameter.CategoryPattern);
    FillFromFirstGroup(dr, 3, detailResult, siteParameter.IndexCodePattern);
    FillFromFirstGroup(dr, 4, detailResult, siteParameter.IssueCodePattern);
    FillFromFirstGroup(dr, 5, detailResult, siteParameter.PublishAgencyPattern);
    FillFromFirstGroup(dr, 6, detailResult, siteParameter.KeywordPattern);

    // The publish date is only read from the detail page when the date-position
    // control is below 1.
    if (datePosition.Value < 1)
    {
        FillFromFirstGroup(dr, 7, detailResult, siteParameter.PublishDatePattern);
    }

    Match attachmentMatch = MatchIgnoringCase(detailResult, siteParameter.AttachmentPattern);
    if (attachmentMatch != null)
    {
        StringBuilder attachment = new StringBuilder();

        // Group 0 is the whole match; every non-blank capture group is one attachment URL.
        foreach (Group group in attachmentMatch.Groups.Cast<Group>().Skip(1))
        {
            if (string.IsNullOrWhiteSpace(group.Value))
            {
                continue;
            }

            attachment.Append(string.Format("Url:{0},", HttpUtility.HtmlDecode(group.Value)));
        }

        dr[8] = attachment.ToString().TrimEnd(',');
    }
}

// Returns the first case-insensitive match of pattern in input, or null when the pattern
// is blank or does not match. The same options are used for testing and capturing.
private static Match MatchIgnoringCase(string input, string pattern)
{
    if (string.IsNullOrWhiteSpace(pattern))
    {
        return null;
    }

    Match match = Regex.Match(input, pattern, RegexOptions.IgnoreCase);
    return match.Success ? match : null;
}

// Writes capture group 1 of the pattern's first match into the given column, if it matches.
private static void FillFromFirstGroup(DataRow row, int column, string input, string pattern)
{
    Match match = MatchIgnoringCase(input, pattern);
    if (match != null)
    {
        row[column] = match.Groups[1].Value;
    }
}
/// <summary>
/// Creates the parser by forwarding both arguments to the base-class constructor.
/// </summary>
/// <param name="siteParameter">Site settings passed to the base class.</param>
/// <param name="htmlReader">HTML reader passed to the base class.</param>
public HenanHrPageParser(SiteParameter siteParameter, IHtmlReader htmlReader)
    : base(siteParameter, htmlReader)
{
}
/// <summary>
/// Creates the parser and stores the site parameter and the HTML reader.
/// </summary>
/// <param name="siteParameter">Site settings; must not be null.</param>
/// <param name="htmlReader">HTML reader; must not be null.</param>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
public JsonPageParser(SiteParameter siteParameter, IHtmlReader htmlReader)
{
    if (siteParameter == null)
    {
        throw new ArgumentNullException(nameof(siteParameter));
    }
    SiteParameter = siteParameter;

    if (htmlReader == null)
    {
        throw new ArgumentNullException(nameof(htmlReader));
    }
    HtmlReader = htmlReader;
}