/// <summary>
/// Crawls the detail pages linked from a list page, then moves on to the
/// following list pages and repeats, until a crawl fails, the configured
/// number of list pages has been processed, or a list page comes back with
/// the same body as the one before it.
/// </summary>
/// <param name="listPage">
/// The list page to start from. The sub-page configuration of its list page
/// configuration is cast to <c>DetailPageConfiguration</c>.
/// </param>
public void ParseDetailPage(ListPage listPage)
{
    ListPage previousPage = null;
    int pageIndex = this.SnifferContext.GetStartPageIndex(listPage);
    int pageBudget = SnifferContext.GetSnifferPageCount(listPage);
    int donePageCount = 0;

    // The page budget is tested BEFORE the page is fetched, so no request is
    // spent on a list page whose result would be thrown away. The test is an
    // inequality on purpose: a budget the counter never reaches leaves the
    // loop bounded only by the two stop conditions below.
    while (donePageCount != pageBudget)
    {
        listPage.Sniffer();

        // Stop when the crawl did not succeed, or when this page's body is
        // identical to the previous page's body (presumably the site keeps
        // serving the last page once the index runs past the end - confirm).
        if (!listPage.Succeed
            || (previousPage != null && previousPage.PageBody == listPage.PageBody))
        {
            break;
        }

        OnPageIndexChange(listPage.PageUrl);

        foreach (UrlItem urlItem in listPage.SubPageUrlResults)
        {
            DetailPageConfiguration detailPageConf =
                (DetailPageConfiguration)listPage.ListPageConfiguration.SubPageConfiguration;

            DetailPage detailPage = new DetailPage(listPage, detailPageConf);
            detailPage.PageIndex = detailPageConf.PageStartIndex;
            detailPage.PageName = urlItem.Title;
            detailPage.PageUrl = urlItem.Url;
            detailPage.Sniffer();

            OnDetailPageParseDone(detailPage);
        }

        // NOTE(review): with a PageIndexStep of 1 this leaves pageIndex
        // unchanged; presumably ReplacePageIndex / the query template accounts
        // for the remaining offset. Kept exactly as the original - confirm.
        pageIndex = pageIndex + (listPage.ListPageConfiguration.PageIndexStep - 1);
        donePageCount++;

        // Build the next list page from the same parent and configuration,
        // then point it at the new page index via the URL (GET) or the query.
        ListPage nextPage = new ListPage(listPage.Parent, listPage.ListPageConfiguration);
        nextPage.PageName = listPage.PageName;
        nextPage.PageUrl = listPage.PageUrl;

        if (listPage.ListPageConfiguration.PageMethod == PageMethod.Get)
        {
            ReplacePageIndex(nextPage, pageIndex);
        }
        else
        {
            nextPage.PageQuery = string.Format(nextPage.PageQuery, pageIndex);
        }

        previousPage = listPage;
        listPage = nextPage;
    }
}
/// <summary>
/// Processing loop: repeatedly removes the first page from
/// <c>SnifferContext.FirstPages</c> (under a lock on that collection) and
/// crawls it, returning once the collection is empty.
/// </summary>
/// <remarks>
/// Implemented as a loop rather than by calling itself after each page, so
/// the stack depth no longer grows with the number of pages processed.
/// Any exception is written to the log and ends the loop; the remaining
/// pages are left in the collection.
/// </remarks>
private void _start()
{
    try
    {
        while (true)
        {
            ListPage firstPage;

            lock (SnifferContext.FirstPages)
            {
                if (SnifferContext.FirstPages.Count == 0)
                {
                    return;
                }

                firstPage = SnifferContext.FirstPages[0];
                SnifferContext.FirstPages.Remove(firstPage);
            }

            // When this page lists detail pages and the crawl is configured
            // to begin after page 1, point the page at the start index first.
            if (firstPage.ListPageConfiguration.SubPageConfiguration.PageType == PageType.DetailPage)
            {
                int startIndex = this.SnifferContext.GetStartPageIndex(firstPage);
                if (startIndex > 1)
                {
                    if (firstPage.ListPageConfiguration.PageMethod == PageMethod.Get)
                    {
                        ReplacePageIndex(firstPage, startIndex);
                    }
                    else
                    {
                        firstPage.PageQuery = string.Format(firstPage.PageQuery, startIndex);
                    }
                }
            }

            firstPage.Sniffer();

            if (firstPage.SubPageUrlResults.Count > 0
                && firstPage.ListPageConfiguration.SubPageConfiguration.PageType == PageType.DetailPage)
            {
                ParseDetailPage(firstPage);
                OnListPageParseDone(firstPage);
            }
            else
            {
                ParseListPage(firstPage);
            }

            OnFirstPageParseDone(firstPage);
        }
    }
    catch (System.Exception e)
    {
        InfoSniffer.LogManager.WriteLog(string.Format("<error><thread>{0}</thread><message>{1}</message></error>", this.ThreadIndex, e.Message));
    }
}
/// <summary>
/// Opens the configuration file for <paramref name="fileName"/> and builds
/// the first-level list pages that sit directly under the named root page.
/// </summary>
/// <param name="fileName">
/// Configuration file name without extension; the file opened is
/// <c>AppDataPath + fileName + ".xml"</c>.
/// </param>
/// <param name="rootPageName">Name of the root page configuration to look up.</param>
/// <returns>
/// <c>null</c> when no root page configuration is found for
/// <paramref name="rootPageName"/>. Otherwise a list with one page per link
/// found by crawling the root page (when the root configuration's
/// <c>IsSniffer</c> is set), or one page per configured sub-page
/// configuration (when it is not).
/// </returns>
public static List<ListPage> GetAllFirstPages(string fileName, string rootPageName)
{
    // Plain concatenation: AppDataPath must not be part of a string.Format
    // format string, where any '{' or '}' in the path would be parsed as a
    // placeholder.
    SnifferConfig.OpenSnfFile(AppDataPath + fileName + ".xml");

    RootPageConfiguration rootPageConf = SnifferConfig.GetRootPageConfiguration(rootPageName);
    if (rootPageConf == null)
    {
        return null;
    }

    ListPage rootPage = new ListPage((ListPageConfiguration)rootPageConf);
    List<ListPage> allFirstPages = new List<ListPage>();

    if (rootPageConf.IsSniffer)
    {
        // First-level pages are discovered by crawling the root page. When
        // the crawl yields no links, the loop below simply adds nothing.
        rootPage.Sniffer();

        foreach (UrlItem urlItem in rootPage.SubPageUrlResults)
        {
            ListPage page = new ListPage(rootPage, (ListPageConfiguration)rootPage.ListPageConfiguration.SubPageConfiguration);
            page.PageName = urlItem.Title;
            page.PageUrl = urlItem.Url;
            allFirstPages.Add(page);
        }
    }
    else
    {
        // First-level pages are listed explicitly in the configuration.
        foreach (ListPageConfiguration firstPageConf in rootPageConf.SubPageConfigurations)
        {
            allFirstPages.Add(new ListPage(rootPage, firstPageConf));
        }
    }

    return allFirstPages;
}
/// <summary>
/// Loads the first-level pages for a root page, picks the one at
/// <paramref name="firstIndex"/>, points it at <paramref name="pageIndex"/>
/// when required, crawls it and returns it.
/// </summary>
/// <param name="fileName">Configuration file name, passed through to <c>GetAllFirstPages</c>.</param>
/// <param name="rootPageName">Root page configuration name, passed through to <c>GetAllFirstPages</c>.</param>
/// <param name="firstIndex">Zero-based position of the wanted page in the list returned by <c>GetAllFirstPages</c>.</param>
/// <param name="pageIndex">Page number to load; only applied when greater than 1 and the page's sub-pages are detail pages.</param>
/// <returns>
/// The crawled list page, or <c>null</c> when <c>GetAllFirstPages</c> returns
/// <c>null</c> (no root page configuration with that name).
/// </returns>
public static ListPage GetListPage(string fileName, string rootPageName, int firstIndex, int pageIndex)
{
    List<ListPage> allFirstPages = GetAllFirstPages(fileName, rootPageName);
    if (allFirstPages == null)
    {
        // Unknown root page: pass the "not found" result on instead of
        // failing with a NullReferenceException on the indexer below.
        return null;
    }

    ListPage firstPage = allFirstPages[firstIndex];

    // A list page number greater than 1 has to be substituted into the
    // request: into the URL for GET pages, into the query otherwise.
    if (firstPage.ListPageConfiguration.SubPageConfiguration.PageType == PageType.DetailPage && pageIndex > 1)
    {
        if (firstPage.ListPageConfiguration.PageMethod == PageMethod.Get)
        {
            ReplacePageIndex(firstPage, pageIndex);
        }
        else
        {
            firstPage.PageQuery = string.Format(firstPage.PageQuery, pageIndex);
        }
    }

    firstPage.Sniffer();
    return firstPage;
}
/// <summary>
/// Crawls a list page and recursively follows each link it yields: links
/// whose page leads to detail pages are handed to <c>ParseDetailPage</c>,
/// links whose page leads to further list pages are handed back to this
/// method.
/// </summary>
/// <param name="listPage">The list page to crawl and descend from.</param>
public void ParseListPage(ListPage listPage)
{
    listPage.Sniffer();

    foreach (UrlItem urlItem in listPage.SubPageUrlResults)
    {
        // Treat the link as a page of the configured sub-page kind.
        ListPage subListPage = new ListPage(listPage, (ListPageConfiguration)listPage.ListPageConfiguration.SubPageConfiguration);
        subListPage.PageName = urlItem.Title;
        subListPage.PageUrl = urlItem.Url;
        subListPage.Sniffer();

        if (subListPage.SubPageUrlResults.Count > 0)
        {
            if (subListPage.ListPageConfiguration.SubPageConfiguration.PageType == PageType.DetailPage)
            {
                ParseDetailPage(subListPage);
                OnListPageParseDone(subListPage);
            }
            else
            {
                ParseListPage(subListPage);
            }
        }
        else if (listPage.ListPageConfiguration.For && !string.IsNullOrEmpty(listPage.PageBody))
        {
            // The link produced nothing as a sub-page: retry it using the
            // current page's own configuration instead.
            // NOTE(review): presumably "For" marks a configuration that is
            // re-applied to its own links - confirm.
            ListPage sameLevelPage = new ListPage(listPage, (ListPageConfiguration)listPage.ListPageConfiguration);
            sameLevelPage.PageName = urlItem.Title;
            sameLevelPage.PageUrl = urlItem.Url;
            ParseListPage(sameLevelPage);
        }
    }
}