/// <summary>
/// Called when one detail page has finished processing; stores the page's
/// result items as a new row in the first table of <c>Data</c>.
/// </summary>
/// <param name="detailPage">The detail page whose result items are stored.</param>
void SnifferThread_DetailPageParseDone(DetailPage detailPage)
{
    if (Data.Tables.Count == 0)
    {
        // No table exists yet: create the table structure from this page's configuration.
        Data.Tables.Add(detailPage.DetailPageConfiguration.CreateDataTable());
    }

    DataTable table = Data.Tables[0];
    DataRow row = table.NewRow();

    // An earlier "hasData" scan used to run here, but it inspected the row before
    // any value had been written to it and its result was then unconditionally
    // overwritten with true. It never influenced the outcome, so it was removed;
    // every finished page still produces exactly one row.
    AddValueToRow(row, detailPage);
    table.Rows.Add(row);
}
/// <summary>
/// Invokes <c>DetailPageParseDone</c> with the given page, provided a handler is attached.
/// </summary>
/// <param name="detailPage">The detail page passed on to the handler.</param>
private void OnDetailPageParseDone(DetailPage detailPage)
{
    // Nothing to notify when no handler has been attached.
    if (DetailPageParseDone == null)
    {
        return;
    }

    DetailPageParseDone.Invoke(detailPage);
}
/// <summary>
/// Sniffs (collects) this detail page and, when a page query is configured,
/// continues with the following page of the same content.
/// </summary>
/// <returns>
/// The result of <c>InlineSniffer</c> for this page. The outcome of any
/// following page does not change the returned value.
/// </returns>
public override bool Sniffer()
{
    // Same numeric value (25) the code previously passed as a raw cast:
    // IgnoreCase (1) | Compiled (8) | Singleline (16).
    const RegexOptions endPageOptions =
        RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

    bool succeeded = InlineSniffer();
    if (!succeeded)
    {
        return false;
    }

    // Paging of the content page starts here; without a page query there is nothing to follow.
    if (string.IsNullOrEmpty(this.DetailPageConfiguration.PageQuery))
    {
        return succeeded;
    }

    // Walk up the parent chain until a detail page whose parent is a list page is found.
    DetailPage rootDetailPage = this;
    ListPage listPage = rootDetailPage.Parent as ListPage;
    while (listPage == null)
    {
        rootDetailPage = rootDetailPage.Parent as DetailPage;
        if (rootDetailPage == null)
        {
            // The chain ended without reaching a list page, so there is no
            // configuration to build the next page from. Stop paging instead
            // of dereferencing null.
            return succeeded;
        }

        listPage = rootDetailPage.Parent as ListPage;
    }

    DetailPage detailPage = new DetailPage(
        rootDetailPage,
        (DetailPageConfiguration)listPage.ListPageConfiguration.SubPageConfiguration);
    detailPage.PageName = this.PageName;
    detailPage.PageUrl = this.PageUrl;

    int pageIndex = this.PageIndex + this.DetailPageConfiguration.PageIndexStep;
    ReplacePageIndex(detailPage, pageIndex);

    // Identical content means the last page has already been reached.
    if (detailPage.PageBody == this.PageBody)
    {
        return succeeded;
    }

    // Otherwise compare what the end-page regex matches on both pages;
    // an identical match is also treated as "no further page".
    if (!string.IsNullOrEmpty(this.Configuration.EndPageDetermineRegex))
    {
        Match match1 = Regex.Match(this.PageBody, this.Configuration.EndPageDetermineRegex, endPageOptions);
        Match match2 = Regex.Match(detailPage.PageBody, detailPage.Configuration.EndPageDetermineRegex, endPageOptions);
        if (match1.Value == match2.Value)
        {
            return succeeded;
        }
    }

    detailPage.Sniffer();
    return succeeded;
}
/// <summary>
/// Copies every entry of <c>page.ResultItems</c> into <paramref name="row"/>,
/// using the entry key as the column name.
/// </summary>
/// <param name="row">The row that receives the values.</param>
/// <param name="page">The detail page whose result items are copied.</param>
void AddValueToRow(DataRow row, DetailPage page)
{
    // Only the page's own result items are copied; its sub pages are not visited here.
    foreach (string columnName in page.ResultItems.Keys)
    {
        object cellValue = page.ResultItems[columnName];
        row[columnName] = cellValue;
    }
}
/// <summary>
/// Collects detail pages: sniffs the given list page, sniffs every detail page it
/// links to, then moves on to the next list page until a stop condition is met.
/// </summary>
/// <param name="listPage">The list page to start from.</param>
public void ParseDetailPage(ListPage listPage)
{
    int pageIndex = this.SnifferContext.GetStartPageIndex(listPage);
    int snifferContextCount = SnifferContext.GetSnifferPageCount(listPage);

    ListPage currentPage = listPage;
    ListPage previousPage = null;
    int finishedPages = 0;

    for (;;)
    {
        currentPage.Sniffer();

        // Stop when the page failed, when the requested number of pages has been
        // collected, or when the page body repeats the previous one (no more data).
        if (!currentPage.Succeed)
        {
            break;
        }
        if (finishedPages == snifferContextCount)
        {
            break;
        }
        if (previousPage != null && previousPage.PageBody == currentPage.PageBody)
        {
            break;
        }

        OnPageIndexChange(currentPage.PageUrl);

        foreach (UrlItem link in currentPage.SubPageUrlResults)
        {
            DetailPageConfiguration subConfiguration =
                (DetailPageConfiguration)currentPage.ListPageConfiguration.SubPageConfiguration;

            DetailPage child = new DetailPage(currentPage, subConfiguration);
            child.PageIndex = subConfiguration.PageStartIndex;
            child.PageName = link.Title;
            child.PageUrl = link.Url;
            child.Sniffer();

            OnDetailPageParseDone(child);
        }

        // NOTE(review): the index advances by PageIndexStep - 1 rather than by
        // PageIndexStep; confirm this matches the value GetStartPageIndex returns.
        pageIndex += currentPage.ListPageConfiguration.PageIndexStep - 1;
        finishedPages++;

        // Build the next list page from the same parent and configuration.
        ListPage nextPage = new ListPage(currentPage.Parent, currentPage.ListPageConfiguration);
        nextPage.PageName = currentPage.PageName;
        nextPage.PageUrl = currentPage.PageUrl;

        if (currentPage.ListPageConfiguration.PageMethod == PageMethod.Get)
        {
            ReplacePageIndex(nextPage, pageIndex);
        }
        else
        {
            nextPage.PageQuery = string.Format(nextPage.PageQuery, pageIndex);
        }

        previousPage = currentPage;
        currentPage = nextPage;
    }
}
/// <summary>
/// Reads a detail page (overload): loads a list page through <c>GetListPage</c>,
/// picks one of its sub page URLs and sniffs the detail page behind it.
/// </summary>
/// <param name="fileName">Forwarded to <c>GetListPage</c>.</param>
/// <param name="rootPageName">Forwarded to <c>GetListPage</c>.</param>
/// <param name="firstIndex">Forwarded to <c>GetListPage</c>.</param>
/// <param name="pageIndex">Forwarded to <c>GetListPage</c>.</param>
/// <param name="urlIndex">Index into the list page's <c>SubPageUrlResults</c>.</param>
/// <returns>The detail page after <c>Sniffer</c> has been called on it.</returns>
public static DetailPage GetDetailPage(string fileName, string rootPageName, int firstIndex, int pageIndex, int urlIndex)
{
    ListPage owner = GetListPage(fileName, rootPageName, firstIndex, pageIndex);
    UrlItem link = owner.SubPageUrlResults[urlIndex];

    DetailPageConfiguration configuration =
        (DetailPageConfiguration)owner.ListPageConfiguration.SubPageConfiguration;

    DetailPage result = new DetailPage(owner, configuration);
    result.PageIndex = configuration.PageStartIndex;
    result.PageName = link.Title;
    result.PageUrl = link.Url;
    result.Sniffer();

    return result;
}
/// <summary>
/// Points <paramref name="detailPage"/> at the given page index: stores the index
/// on the page and rewrites the page-query part of its URL accordingly.
/// </summary>
/// <param name="detailPage">The page whose <c>PageIndex</c> and <c>PageUrl</c> are updated.</param>
/// <param name="pageIndex">The page index to write into the URL.</param>
private void ReplacePageIndex(DetailPage detailPage, int pageIndex)
{
    detailPage.PageIndex = pageIndex;

    // The index becomes part of a URL, so it is formatted with the invariant
    // culture rather than whatever culture the current thread runs under.
    // A null or empty PageIndexFormat selects the general ("G") format, which is
    // what the parameterless ToString() produces.
    string stringPageIndex = pageIndex.ToString(
        detailPage.DetailPageConfiguration.PageIndexFormat,
        System.Globalization.CultureInfo.InvariantCulture);

    string pageQuery = string.Format(detailPage.DetailPageConfiguration.ReplacePageQuery, stringPageIndex);

    // PageQuery is used as a regex pattern; every match in the URL is replaced.
    detailPage.PageUrl = Regex.Replace(
        detailPage.PageUrl,
        detailPage.DetailPageConfiguration.PageQuery,
        pageQuery);
}
/// <summary>
/// Sniffs this page only: extracts the configured items, merges them into the
/// parent detail page (when the parent is a detail page) and sniffs the
/// configured sub pages.
/// </summary>
/// <returns>
/// <c>false</c> when <c>Succeed</c> is false for this page; otherwise <c>true</c>.
/// </returns>
private bool InlineSniffer()
{
    if (!this.Succeed)
    {
        _done = true;
        return false;
    }

    CollectResultItems();
    MergeResultItemsIntoParent();
    SniffSubPages();

    _done = true;
    return true;
}

/// <summary>
/// Extracts every configured sniffer item from the page body into <c>ResultItems</c>.
/// </summary>
private void CollectResultItems()
{
    // Same numeric value (25) the code previously passed as a raw cast:
    // IgnoreCase (1) | Compiled (8) | Singleline (16).
    const RegexOptions itemOptions =
        RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

    foreach (SnifferItem item in this.DetailPageConfiguration.SnifferItems)
    {
        // Presumably: an item that is not multi-page content is only collected on
        // the start page, so it is skipped on every later page.
        if (this.PageIndex > this.DetailPageConfiguration.PageStartIndex && !item.MutiPage)
        {
            continue;
        }

        if (item.RegexString == null)
        {
            this.ResultItems.Add(item.ItemName, item.DefaultValue);
            continue;
        }

        MatchCollection matches = Regex.Matches(this.PageBody, item.RegexString.Expression, itemOptions);
        if (matches.Count == 0)
        {
            this.ResultItems.Add(item.ItemName, item.DefaultValue);
            continue;
        }

        StringBuilder sb = new StringBuilder();
        foreach (Match match in matches)
        {
            // Separate the values of multiple matches, but never start with a separator.
            if (matches.Count > 1 && sb.Length > 0)
            {
                sb.Append(item.Separator);
            }

            string value = match.Groups[item.RegexString.ValueGroupIndex].Value;
            if (string.IsNullOrEmpty(value))
            {
                value = item.DefaultValue;
            }
            else
            {
                // Clean out rubbish.
                ClearRubbish(item, ref value);

                // Convert URLs inside the content to absolute paths.
                if (item.UrlToAbs)
                {
                    UrlToAbs(item, ref value);
                }

                // The collected value itself is a URL: make it absolute.
                if (item.IsUrl)
                {
                    value = FileUtil.GetAbsUrl(value, this.SubPageBaseUrl);
                }

                // Save images.
                if (item.SaveImage)
                {
                    SaveImages(item, ref value);
                }

                // Remove A elements.
                if (item.ClearAElement)
                {
                    ClearAElement(item, ref value);
                }

                // Remove HTML code.
                if (item.IsClearHTML)
                {
                    value = ClearHTML(value);
                }
            }

            sb.Append(value);
        }

        this.ResultItems.Add(item.ItemName, sb.ToString().Trim());
    }
}

/// <summary>
/// Merges this page's result items into the parent page when the parent is a detail page.
/// </summary>
private void MergeResultItemsIntoParent()
{
    DetailPage parentPage = this.Parent as DetailPage;
    if (parentPage == null)
    {
        return;
    }

    foreach (string key in this.ResultItems.Keys)
    {
        string value = (string)this.ResultItems[key];

        // The membership test must look at the parent. Testing this page's own
        // collection is always true while iterating its keys, which made the
        // "add to parent" branch unreachable.
        if (!parentPage.ResultItems.Contains(key))
        {
            parentPage.ResultItems.Add(key, value);
            continue;
        }

        string parentPageValue = (string)parentPage.ResultItems[key];
        if (parentPageValue == value)
        {
            continue;
        }

        // Look up the item definition for its separator: parent's configuration
        // first, then this page's own configuration.
        SnifferItem item = FindSnifferItemByName(parentPage.DetailPageConfiguration, key);
        if (item == null)
        {
            item = FindSnifferItemByName(this.DetailPageConfiguration, key);
        }

        if (item != null)
        {
            parentPage.ResultItems[key] = parentPageValue + item.MutiPageSeparator + value;
        }
        else
        {
            // No definition found anywhere: append without a separator rather
            // than dereferencing null.
            parentPage.ResultItems[key] = parentPageValue + value;
        }
    }
}

/// <summary>
/// Returns the first sniffer item of <paramref name="configuration"/> whose
/// <c>ItemName</c> equals <paramref name="itemName"/>, or <c>null</c> when there is none.
/// </summary>
private static SnifferItem FindSnifferItemByName(DetailPageConfiguration configuration, string itemName)
{
    foreach (SnifferItem candidate in configuration.SnifferItems)
    {
        if (candidate.ItemName == itemName)
        {
            return candidate;
        }
    }

    return null;
}

/// <summary>
/// Finds the URLs of every configured sub page in the page body, sniffs each
/// of them and adds the resulting pages to <c>SubPages</c>.
/// </summary>
private void SniffSubPages()
{
    // Same numeric value (25) the code previously passed as a raw cast:
    // IgnoreCase (1) | Compiled (8) | Singleline (16).
    const RegexOptions subPageOptions =
        RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

    foreach (DetailPageConfiguration conf in this.DetailPageConfiguration.SubPageConfigurations)
    {
        // Regex.Matches never returns null, and an empty collection simply yields no iterations.
        MatchCollection matches = Regex.Matches(this.PageBody, conf.SnifferSubPageUrlItem.Expression, subPageOptions);
        foreach (Match match in matches)
        {
            if (string.IsNullOrEmpty(match.Value))
            {
                continue;
            }

            DetailPage detailPage = new DetailPage(this, conf);
            detailPage.PageName = match.Groups[conf.SnifferSubPageUrlItem.TitleGroupIndex].Value;

            string url = match.Groups[conf.SnifferSubPageUrlItem.UrlGroupIndex].Value;
            if (!string.IsNullOrEmpty(conf.SnifferSubPageUrlItem.UrlFormat))
            {
                url = string.Format(conf.SnifferSubPageUrlItem.UrlFormat, url);
            }

            detailPage.PageUrl = FileUtil.GetAbsUrl(url, this.SubPageBaseUrl);
            detailPage.Sniffer();
            this.SubPages.Add(detailPage);
        }
    }
}