/// <summary>
/// Crawls the detail pages linked from a list page, then keeps moving on to the next
/// list page until one of the stop conditions is reached.
/// </summary>
/// <param name="listPage">The list page to start crawling from.</param>
public void ParseDetailPage(ListPage listPage)
{
    ListPage previousPage = null;
    int pageIndex = this.SnifferContext.GetStartPageIndex(listPage);
    int pageLimit = SnifferContext.GetSnifferPageCount(listPage);
    int pagesDone = 0;

    while (true)
    {
        listPage.Sniffer();

        // Stop conditions, checked in this order after every list-page fetch:
        // 1. the fetch did not succeed;
        if (!listPage.Succeed)
        {
            break;
        }

        // 2. the configured number of pages has already been processed;
        if (pagesDone == pageLimit)
        {
            break;
        }

        // 3. the page body equals the previous page's body (no new data).
        if (previousPage != null && previousPage.PageBody == listPage.PageBody)
        {
            break;
        }

        OnPageIndexChange(listPage.PageUrl);

        foreach (UrlItem link in listPage.SubPageUrlResults)
        {
            DetailPageConfiguration detailConf = (DetailPageConfiguration)listPage.ListPageConfiguration.SubPageConfiguration;
            DetailPage detail = new DetailPage(listPage, detailConf);
            detail.PageIndex = detailConf.PageStartIndex;
            detail.PageName = link.Title;
            detail.PageUrl = link.Url;
            detail.Sniffer();
            OnDetailPageParseDone(detail);
        }

        // NOTE(review): the index advances by PageIndexStep - 1, not PageIndexStep;
        // presumably the remaining increment is applied elsewhere - confirm.
        pageIndex += listPage.ListPageConfiguration.PageIndexStep - 1;
        pagesDone++;

        // Build the next list page from the same parent and configuration.
        ListPage nextPage = new ListPage(listPage.Parent, listPage.ListPageConfiguration);
        nextPage.PageName = listPage.PageName;
        nextPage.PageUrl = listPage.PageUrl;
        if (listPage.ListPageConfiguration.PageMethod != PageMethod.Get)
        {
            // Non-GET paging: the page index is formatted into the query template.
            nextPage.PageQuery = string.Format(nextPage.PageQuery, pageIndex);
        }
        else
        {
            ReplacePageIndex(nextPage, pageIndex);
        }

        previousPage = listPage;
        listPage = nextPage;
    }
}
/// <summary>
/// Loads a list page via <c>GetListPage</c> and sniffs the detail page behind one of
/// its sub-page links.
/// </summary>
/// <param name="fileName">Passed through to <c>GetListPage</c>.</param>
/// <param name="rootPageName">Passed through to <c>GetListPage</c>.</param>
/// <param name="firstIndex">Passed through to <c>GetListPage</c>.</param>
/// <param name="pageIndex">Passed through to <c>GetListPage</c>.</param>
/// <param name="urlIndex">Index into the list page's <c>SubPageUrlResults</c> of the link to follow.</param>
/// <returns>The detail page, after <c>Sniffer()</c> has been called on it.</returns>
public static DetailPage GetDetailPage(string fileName, string rootPageName, int firstIndex, int pageIndex, int urlIndex)
{
    ListPage sourcePage = GetListPage(fileName, rootPageName, firstIndex, pageIndex);
    UrlItem link = sourcePage.SubPageUrlResults[urlIndex];
    DetailPageConfiguration detailConf = (DetailPageConfiguration)sourcePage.ListPageConfiguration.SubPageConfiguration;

    DetailPage result = new DetailPage(sourcePage, detailConf);
    result.PageIndex = detailConf.PageStartIndex;
    result.PageName = link.Title;
    result.PageUrl = link.Url;
    result.Sniffer();

    return result;
}
/// <summary>
/// Builds a <see cref="DetailPageConfiguration"/> from a page XML node. Every optional
/// setting is read from <paramref name="pageConfNode"/> first and falls back to the
/// <c>SnifferPage</c> child of the <c>CommonConfig</c> element named by the parent's
/// <c>Config</c> attribute, when there is one.
/// </summary>
/// <param name="parent">Configuration passed to the new configuration's constructor as its parent.</param>
/// <param name="parentConfNode">
/// XML node of the parent page. Its optional <c>Config</c> attribute names a
/// <c>CommonConfig</c> element located under the same parent node.
/// </param>
/// <param name="pageConfNode">XML node describing the detail page; must carry a <c>PageName</c> attribute.</param>
/// <returns>The populated configuration, including its nested sub-page configurations.</returns>
/// <exception cref="InvalidOperationException">
/// The <c>Config</c> attribute names a <c>CommonConfig</c> element that does not exist.
/// </exception>
private static DetailPageConfiguration CreateDetailPageConfiguration(PageConfiguration parent, XmlNode parentConfNode, XmlNode pageConfNode)
{
    // Locate the shared defaults (if any) referenced by the parent node.
    XmlNode commonConfigNode = null;
    XmlAttribute commonConfigName = parentConfNode.Attributes["Config"];
    if (commonConfigName != null)
    {
        XmlNode commonListConfigNode = parentConfNode.ParentNode.SelectSingleNode(string.Format("CommonConfig[@Name='{0}']", commonConfigName.Value));
        if (commonListConfigNode == null)
        {
            // Fail with a readable message instead of a bare NullReferenceException.
            throw new InvalidOperationException(string.Format("CommonConfig '{0}' referenced by the Config attribute was not found.", commonConfigName.Value));
        }

        commonConfigNode = commonListConfigNode.SelectSingleNode("SnifferPage");
    }

    DetailPageConfiguration detailPageConf = new DetailPageConfiguration(parent);
    detailPageConf.PageName = pageConfNode.Attributes["PageName"].Value;

    string pageUrl = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageUrl");
    if (pageUrl != null)
    {
        detailPageConf.PageUrl = pageUrl;
    }

    string pageType = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageType");
    if (pageType != null)
    {
        detailPageConf.PageType = (PageType)Enum.Parse(typeof(PageType), pageType, false);
    }

    string encodingName = ReadInheritedAttribute(pageConfNode, commonConfigNode, "Encoding");
    if (encodingName != null)
    {
        detailPageConf.Encoding = Encoding.GetEncoding(encodingName);
    }

    // Paging settings.
    string pageQuery = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageQuery");
    if (pageQuery != null)
    {
        detailPageConf.PageQuery = pageQuery;
    }

    string replacePageQuery = ReadInheritedAttribute(pageConfNode, commonConfigNode, "ReplacePageQuery");
    if (replacePageQuery != null)
    {
        detailPageConf.ReplacePageQuery = replacePageQuery;
    }

    string pageIndexFormat = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageIndexFormat");
    if (pageIndexFormat != null)
    {
        detailPageConf.PageIndexFormat = pageIndexFormat;
    }

    string pageStartIndex = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageStartIndex");
    if (pageStartIndex != null)
    {
        detailPageConf.PageStartIndex = int.Parse(pageStartIndex);
    }

    string pageIndexSeed = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageIndexSeed");
    if (pageIndexSeed != null)
    {
        detailPageConf.PageIndexSeed = int.Parse(pageIndexSeed);
    }

    string pageIndexStep = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageIndexStep");
    if (pageIndexStep != null)
    {
        detailPageConf.PageIndexStep = int.Parse(pageIndexStep);
    }

    string endPageDetermineRegex = ReadInheritedAttribute(pageConfNode, commonConfigNode, "EndPageDetermineRegex");
    if (endPageDetermineRegex != null)
    {
        detailPageConf.EndPageDetermineRegex = endPageDetermineRegex;
    }

    string pageMethod = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageMethod");
    if (pageMethod != null)
    {
        detailPageConf.PageMethod = (PageMethod)Enum.Parse(typeof(PageMethod), pageMethod);
    }

    // SnifferSubPageUrlItem is a child element, not an attribute, so the fallback
    // looks the element up directly on the common node. (The previous guard tested
    // for an attribute of that name, which kept the fallback from ever applying.)
    XmlNode snifferUrlItemNode = pageConfNode.SelectSingleNode("SnifferSubPageUrlItem");
    if (snifferUrlItemNode == null && commonConfigNode != null)
    {
        snifferUrlItemNode = commonConfigNode.SelectSingleNode("SnifferSubPageUrlItem");
    }
    if (snifferUrlItemNode != null)
    {
        detailPageConf.SnifferSubPageUrlItem = CreateSnifferUrlItem(snifferUrlItemNode);
    }

    // Page-level items go first so they win over common items with the same name.
    AddMissingSnifferItems(detailPageConf, pageConfNode);
    if (commonConfigNode != null)
    {
        AddMissingSnifferItems(detailPageConf, commonConfigNode);
    }

    // Nested pages: those declared on the page itself, then those from the common config.
    foreach (XmlNode subPageNode in pageConfNode.SelectNodes("SnifferPage"))
    {
        detailPageConf.SubPageConfigurations.Add(CreateDetailPageConfiguration(detailPageConf, pageConfNode, subPageNode));
    }
    if (commonConfigNode != null)
    {
        foreach (XmlNode subPageNode in commonConfigNode.SelectNodes("SnifferPage"))
        {
            detailPageConf.SubPageConfigurations.Add(CreateDetailPageConfiguration(detailPageConf, pageConfNode, subPageNode));
        }
    }

    return detailPageConf;
}

/// <summary>
/// Returns the value of the named attribute from <paramref name="pageConfNode"/>, else from
/// <paramref name="commonConfigNode"/> (which may be null), else null when neither has it.
/// </summary>
private static string ReadInheritedAttribute(XmlNode pageConfNode, XmlNode commonConfigNode, string attributeName)
{
    XmlAttribute attribute = pageConfNode.Attributes[attributeName];
    if (attribute == null && commonConfigNode != null)
    {
        attribute = commonConfigNode.Attributes[attributeName];
    }

    return attribute != null ? attribute.Value : null;
}

/// <summary>
/// Adds every <c>SnifferItem</c> child of <paramref name="sourceNode"/> whose <c>ItemName</c>
/// is not already present in the configuration's item list.
/// </summary>
private static void AddMissingSnifferItems(DetailPageConfiguration detailPageConf, XmlNode sourceNode)
{
    XmlNodeList itemNodes = sourceNode.SelectNodes("SnifferItem");
    if (itemNodes == null)
    {
        return;
    }

    foreach (XmlNode itemNode in itemNodes)
    {
        string itemName = itemNode.Attributes["ItemName"].Value;

        bool alreadyDefined = false;
        foreach (SnifferItem existing in detailPageConf.SnifferItems)
        {
            if (existing.ItemName == itemName)
            {
                alreadyDefined = true;
                break;
            }
        }

        if (!alreadyDefined)
        {
            detailPageConf.SnifferItems.Add(BuildDetailSnifferItem(itemNode, itemName));
        }
    }
}

/// <summary>
/// Creates a <c>SnifferItem</c> from a <c>SnifferItem</c> XML node, setting only the
/// properties whose attribute or child element is present.
/// </summary>
private static SnifferItem BuildDetailSnifferItem(XmlNode itemNode, string itemName)
{
    SnifferItem item = new SnifferItem();
    item.ItemName = itemName;

    string saveImage = ReadInheritedAttribute(itemNode, null, "SaveImage");
    if (saveImage != null)
    {
        item.SaveImage = bool.Parse(saveImage);
    }

    string saveImagesPath = ReadInheritedAttribute(itemNode, null, "SaveImagesPath");
    if (saveImagesPath != null)
    {
        item.SaveImagesPath = saveImagesPath;
    }

    string imageUrlPath = ReadInheritedAttribute(itemNode, null, "ImageUrlPath");
    if (imageUrlPath != null)
    {
        item.ImageUrlPath = imageUrlPath;
    }

    string isClearHtml = ReadInheritedAttribute(itemNode, null, "IsClearHTML");
    if (isClearHtml != null)
    {
        item.IsClearHTML = bool.Parse(isClearHtml);
    }

    string isUrl = ReadInheritedAttribute(itemNode, null, "IsUrl");
    if (isUrl != null)
    {
        item.IsUrl = bool.Parse(isUrl);
    }

    string urlToAbs = ReadInheritedAttribute(itemNode, null, "UrlToAbs");
    if (urlToAbs != null)
    {
        item.UrlToAbs = bool.Parse(urlToAbs);
    }

    string separator = ReadInheritedAttribute(itemNode, null, "Separator");
    if (separator != null)
    {
        item.Separator = separator;
    }

    string clearAElement = ReadInheritedAttribute(itemNode, null, "ClearAElement");
    if (clearAElement != null)
    {
        item.ClearAElement = bool.Parse(clearAElement);
    }

    string mutiPage = ReadInheritedAttribute(itemNode, null, "MutiPage");
    if (mutiPage != null)
    {
        item.MutiPage = bool.Parse(mutiPage);
    }

    string mutiPageSeparator = ReadInheritedAttribute(itemNode, null, "MutiPageSeparator");
    if (mutiPageSeparator != null)
    {
        item.MutiPageSeparator = mutiPageSeparator;
    }

    XmlNode clearRegexNode = itemNode.SelectSingleNode("ClearRegexString");
    if (clearRegexNode != null)
    {
        item.ClearRegexString = clearRegexNode.InnerText;
    }

    XmlNode defaultValueNode = itemNode.SelectSingleNode("DefaultValue");
    if (defaultValueNode != null)
    {
        item.DefaultValue = defaultValueNode.InnerText;
    }

    XmlNode regexNode = itemNode.SelectSingleNode("RegexString");
    if (regexNode != null)
    {
        item.RegexString = CreateRegexString(regexNode);
    }

    return item;
}
/// <summary>
/// Creates a detail page with a parent page; both arguments are forwarded unchanged
/// to the base constructor.
/// </summary>
/// <param name="parent">The page this detail page belongs to.</param>
/// <param name="configuration">The configuration for this detail page.</param>
public DetailPage(PageBase parent, DetailPageConfiguration configuration)
    : base(parent, configuration)
{
}
/// <summary>
/// Creates a detail page without a parent; the configuration is forwarded unchanged
/// to the base constructor.
/// </summary>
/// <param name="configuration">The configuration for this detail page.</param>
public DetailPage(DetailPageConfiguration configuration)
    : base(configuration)
{
}
/// <summary>
/// Appends a configuration and, recursively, all of its sub-page configurations to a
/// flat list. Each configuration is added before its own sub-pages.
/// </summary>
/// <param name="confis">The list that receives the configurations.</param>
/// <param name="conf">The configuration whose subtree is appended.</param>
private void AddDetailPageConfiguration(List<DetailPageConfiguration> confis, DetailPageConfiguration conf)
{
    confis.Add(conf);

    foreach (DetailPageConfiguration subPageConf in conf.SubPageConfigurations)
    {
        AddDetailPageConfiguration(confis, subPageConf);
    }
}