/// <summary>
/// Invokes FirstPageParseDone with the given page when a handler is attached.
/// </summary>
/// <param name="firstPage">Page passed on to the handler.</param>
private void OnFirstPageParseDone(ListPage firstPage)
{
    if (FirstPageParseDone == null)
    {
        return;
    }

    FirstPageParseDone(firstPage);
}
/// <summary>
/// Invokes CategoryParseDone with the given page when a handler is attached.
/// </summary>
/// <param name="listPage">Page passed on to the handler.</param>
private void OnCategoryParseDone(ListPage listPage)
{
    if (CategoryParseDone == null)
    {
        return;
    }

    CategoryParseDone(listPage);
}
/// <summary>
/// Invokes ListPageParseDone with the given page when a handler is attached.
/// </summary>
/// <param name="listPage">Page passed on to the handler.</param>
private void OnListPageParseDone(ListPage listPage)
{
    if (ListPageParseDone == null)
    {
        return;
    }

    ListPageParseDone(listPage);
}
/// <summary>
/// Sniffs this detail page and, when the detail-page configuration defines a page query,
/// goes on to sniff the next page of the same detail content.
/// </summary>
/// <returns>
/// The result of <c>InlineSniffer()</c> for this page. The outcome of sniffing any
/// follow-up page does not change the returned value.
/// </returns>
public override bool Sniffer()
{
    // Same numeric value (25) as the cast used previously, spelled out as named flags.
    const RegexOptions endPageOptions = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

    bool sniffed = InlineSniffer();
    if (!sniffed)
    {
        return false;
    }

    // Paging through the detail content starts here; without a page query there is nothing to follow.
    if (string.IsNullOrEmpty(this.DetailPageConfiguration.PageQuery))
    {
        return sniffed;
    }

    // Walk up through DetailPage ancestors until one is found whose parent is a ListPage.
    DetailPage rootDetailPage = this;
    ListPage listPage = rootDetailPage.Parent as ListPage;
    while (listPage == null)
    {
        rootDetailPage = rootDetailPage.Parent as DetailPage;
        if (rootDetailPage == null)
        {
            // The parent chain holds no ListPage, so there is no configuration to build
            // the next page from. Stop paging instead of dereferencing null.
            return sniffed;
        }

        listPage = rootDetailPage.Parent as ListPage;
    }

    DetailPage detailPage = new DetailPage(rootDetailPage, (DetailPageConfiguration)listPage.ListPageConfiguration.SubPageConfiguration);
    detailPage.PageName = this.PageName;
    detailPage.PageUrl = this.PageUrl;

    int pageIndex = this.PageIndex + this.DetailPageConfiguration.PageIndexStep;
    ReplacePageIndex(detailPage, pageIndex);

    // An identical body means the previous page was already the last one.
    if (detailPage.PageBody == this.PageBody)
    {
        return sniffed;
    }

    // Otherwise compare what the end-page regex captures on both pages; equal captures also mean "finished".
    if (!string.IsNullOrEmpty(this.Configuration.EndPageDetermineRegex))
    {
        Match match1 = Regex.Match(this.PageBody, this.Configuration.EndPageDetermineRegex, endPageOptions);
        Match match2 = Regex.Match(detailPage.PageBody, detailPage.Configuration.EndPageDetermineRegex, endPageOptions);
        if (match1.Value == match2.Value)
        {
            return sniffed;
        }
    }

    detailPage.Sniffer();
    return sniffed;
}
/// <summary>
/// Sniffs the detail pages linked from a list page, then moves on to the following
/// list pages until a stop condition is met.
/// </summary>
/// <param name="listPage">The list page to start from.</param>
public void ParseDetailPage(ListPage listPage)
{
    ListPage previousPage = null;
    int pageIndex = this.SnifferContext.GetStartPageIndex(listPage);
    int pageLimit = SnifferContext.GetSnifferPageCount(listPage);
    int pagesDone = 0;

    while (true)
    {
        listPage.Sniffer();

        // Stop when the sniff failed, the requested number of pages has been processed,
        // or the page body is the same as the previous page's body.
        if (!listPage.Succeed
            || pagesDone == pageLimit
            || (previousPage != null && previousPage.PageBody == listPage.PageBody))
        {
            break;
        }

        OnPageIndexChange(listPage.PageUrl);

        foreach (UrlItem link in listPage.SubPageUrlResults)
        {
            DetailPageConfiguration detailConf = (DetailPageConfiguration)listPage.ListPageConfiguration.SubPageConfiguration;
            DetailPage detail = new DetailPage(listPage, detailConf);
            detail.PageIndex = detailConf.PageStartIndex;
            detail.PageName = link.Title;
            detail.PageUrl = link.Url;
            detail.Sniffer();
            OnDetailPageParseDone(detail);
        }

        // NOTE(review): the index advances by PageIndexStep - 1, not PageIndexStep.
        // Confirm this matches how PageIndexStep is configured.
        pageIndex = pageIndex + (listPage.ListPageConfiguration.PageIndexStep - 1);
        pagesDone++;

        // Build the next list page from the current one.
        ListPage nextPage = new ListPage(listPage.Parent, listPage.ListPageConfiguration);
        nextPage.PageName = listPage.PageName;
        nextPage.PageUrl = listPage.PageUrl;

        if (listPage.ListPageConfiguration.PageMethod == PageMethod.Get)
        {
            ReplacePageIndex(nextPage, pageIndex);
        }
        else
        {
            nextPage.PageQuery = string.Format(nextPage.PageQuery, pageIndex);
        }

        previousPage = listPage;
        listPage = nextPage;
    }
}
/// <summary>
/// Takes first pages off <c>SnifferContext.FirstPages</c> one at a time and sniffs each,
/// until the list is empty.
/// </summary>
/// <remarks>
/// Written as a loop rather than self-recursion: a recursive call inside a <c>try</c> block
/// adds a stack frame per first page, so a long list could overflow the stack.
/// As before, the first exception is logged and ends processing.
/// </remarks>
private void _start()
{
    try
    {
        while (true)
        {
            ListPage firstPage;
            lock (SnifferContext.FirstPages)
            {
                if (SnifferContext.FirstPages.Count == 0)
                {
                    return;
                }

                firstPage = SnifferContext.FirstPages[0];
                SnifferContext.FirstPages.Remove(firstPage);
            }

            // When the sub pages are detail pages and the start index is above 1,
            // point the first page at the start index before sniffing it.
            if (firstPage.ListPageConfiguration.SubPageConfiguration.PageType == PageType.DetailPage)
            {
                int startPageIndex = this.SnifferContext.GetStartPageIndex(firstPage);
                if (startPageIndex > 1)
                {
                    if (firstPage.ListPageConfiguration.PageMethod == PageMethod.Get)
                    {
                        ReplacePageIndex(firstPage, startPageIndex);
                    }
                    else
                    {
                        firstPage.PageQuery = string.Format(firstPage.PageQuery, startPageIndex);
                    }
                }
            }

            firstPage.Sniffer();

            if (firstPage.SubPageUrlResults.Count > 0
                && firstPage.ListPageConfiguration.SubPageConfiguration.PageType == PageType.DetailPage)
            {
                ParseDetailPage(firstPage);
                OnListPageParseDone(firstPage);
            }
            else
            {
                ParseListPage(firstPage);
            }

            OnFirstPageParseDone(firstPage);
        }
    }
    catch (System.Exception e)
    {
        InfoSniffer.LogManager.WriteLog(string.Format("<error><thread>{0}</thread><message>{1}</message></error>", this.ThreadIndex, e.Message));
    }
}
/// <summary>
/// Loads the list page identified by the arguments, then builds and sniffs the detail page
/// for one of its sub-page URLs.
/// </summary>
/// <param name="fileName">Passed through to <c>GetListPage</c>.</param>
/// <param name="rootPageName">Passed through to <c>GetListPage</c>.</param>
/// <param name="firstIndex">Passed through to <c>GetListPage</c>.</param>
/// <param name="pageIndex">Passed through to <c>GetListPage</c>.</param>
/// <param name="urlIndex">Index into the list page's <c>SubPageUrlResults</c>.</param>
/// <returns>The detail page, after <c>Sniffer()</c> has been called on it.</returns>
public static DetailPage GetDetailPage(string fileName, string rootPageName, int firstIndex, int pageIndex, int urlIndex)
{
    ListPage owner = GetListPage(fileName, rootPageName, firstIndex, pageIndex);
    UrlItem link = owner.SubPageUrlResults[urlIndex];
    DetailPageConfiguration conf = (DetailPageConfiguration)owner.ListPageConfiguration.SubPageConfiguration;

    DetailPage result = new DetailPage(owner, conf);
    result.PageIndex = conf.PageStartIndex;
    result.PageName = link.Title;
    result.PageUrl = link.Url;
    result.Sniffer();

    return result;
}
/// <summary>
/// Rewrites the page-index part of a list page's URL.
/// </summary>
/// <param name="listPage">Page whose <c>PageUrl</c> is updated in place.</param>
/// <param name="pageIndex">Index to put into the URL.</param>
private void ReplacePageIndex(ListPage listPage, int pageIndex)
{
    // Format the index with the configured numeric format, or the default when none is set.
    string indexFormat = listPage.ListPageConfiguration.PageIndexFormat;
    string indexText = string.IsNullOrEmpty(indexFormat)
        ? pageIndex.ToString()
        : pageIndex.ToString(indexFormat);

    // ReplacePageQuery is a composite format string; PageQuery is the regex locating the text to replace.
    string replacement = string.Format(listPage.ListPageConfiguration.ReplacePageQuery, indexText);
    listPage.PageUrl = Regex.Replace(listPage.PageUrl, listPage.ListPageConfiguration.PageQuery, replacement);
}
/// <summary>
/// Called when one list page has finished being sniffed: writes the collected data to
/// an XML file, hands it to the root configuration's plugin (if any), clears the data
/// and logs completion.
/// </summary>
/// <param name="listPage">The list page that finished.</param>
void SnifferThread_ListPageParseDone(ListPage listPage)
{
    string dir = listPage.SavePath;
    if (!Directory.Exists(dir))
    {
        Directory.CreateDirectory(dir);
    }

    // Export to an XML file.
    string dirAndFileName = listPage.SavePathAndFileName;
    Data.WriteXml(dirAndFileName);

    // Find the root page configuration by walking up the parents; call its plugin when one is set.
    RootPageConfiguration rootPageConf = null;
    PageConfiguration parentPageConf = listPage.Configuration.Parent;
    while (parentPageConf != null)
    {
        if (parentPageConf is RootPageConfiguration)
        {
            rootPageConf = (RootPageConfiguration)parentPageConf;
            break;
        }

        parentPageConf = parentPageConf.Parent;
    }

    if (rootPageConf != null && rootPageConf.Plugin != null)
    {
        rootPageConf.Plugin.Receive(Data, dirAndFileName);
    }

    // Clear the collected data.
    if (Data.Tables.Count > 0)
    {
        Data.Tables.Clear();
    }

    // The log entry is XML-shaped, so ampersands in the URL are escaped as "&amp;".
    InfoSniffer.LogManager.WriteLog(string.Format(
        "<donepage><thread>{0}</thread><page>{1}</page><donetime>{2}</donetime></donepage>",
        this.ThreadIndex,
        listPage.PageUrl.Replace("&", "&amp;"),
        DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss.fff")));
}
/// <summary>
/// Opens the configuration file for <paramref name="fileName"/> and builds the list of
/// first-level pages under the named root page.
/// </summary>
/// <param name="fileName">Configuration file name, without the ".xml" extension.</param>
/// <param name="rootPageName">Name of the root page configuration to look up.</param>
/// <returns>
/// The first-level pages, or <c>null</c> when no root page configuration with that name exists.
/// The list is empty when the root page yields no sub-page URLs.
/// </returns>
public static List<ListPage> GetAllFirstPages(string fileName, string rootPageName)
{
    // Plain concatenation: the directory is no longer part of a format string,
    // so a '{' or '}' in AppDataPath cannot cause a FormatException.
    SnifferConfig.OpenSnfFile(AppDataPath + fileName + ".xml");

    RootPageConfiguration rootPageConf = SnifferConfig.GetRootPageConfiguration(rootPageName);
    if (rootPageConf == null)
    {
        return null;
    }

    ListPage rootPage = new ListPage((ListPageConfiguration)rootPageConf);
    List<ListPage> allFirstPages = new List<ListPage>();

    if (rootPageConf.IsSniffer)
    {
        // First pages come from the URLs found by sniffing the root page.
        rootPage.Sniffer();
        foreach (UrlItem urlItem in rootPage.SubPageUrlResults)
        {
            ListPage page = new ListPage(rootPage, (ListPageConfiguration)rootPage.ListPageConfiguration.SubPageConfiguration);
            page.PageName = urlItem.Title;
            page.PageUrl = urlItem.Url;
            allFirstPages.Add(page);
        }
    }
    else
    {
        // First pages are declared directly in the configuration.
        foreach (ListPageConfiguration firstPageConfi in rootPageConf.SubPageConfigurations)
        {
            allFirstPages.Add(new ListPage(rootPage, firstPageConfi));
        }
    }

    return allFirstPages;
}
/// <summary>
/// Picks one first-level page by index, points it at the requested page index when needed,
/// and sniffs it.
/// </summary>
/// <param name="fileName">Passed through to <c>GetAllFirstPages</c>.</param>
/// <param name="rootPageName">Passed through to <c>GetAllFirstPages</c>.</param>
/// <param name="firstIndex">Index into the list returned by <c>GetAllFirstPages</c>.</param>
/// <param name="pageIndex">Page index to load; values above 1 rewrite the page's URL or query.</param>
/// <returns>The selected page, after <c>Sniffer()</c> has been called on it.</returns>
public static ListPage GetListPage(string fileName, string rootPageName, int firstIndex, int pageIndex)
{
    List<ListPage> candidates = GetAllFirstPages(fileName, rootPageName);
    ListPage selected = candidates[firstIndex];

    // The page index is only substituted for list pages whose sub pages are detail pages, and only past page 1.
    bool needsPaging = selected.ListPageConfiguration.SubPageConfiguration.PageType == PageType.DetailPage
        && pageIndex > 1;
    if (needsPaging)
    {
        if (selected.ListPageConfiguration.PageMethod == PageMethod.Get)
        {
            ReplacePageIndex(selected, pageIndex);
        }
        else
        {
            selected.PageQuery = string.Format(selected.PageQuery, pageIndex);
        }
    }

    selected.Sniffer();
    return selected;
}
/// <summary>
/// Called when one root page has finished being sniffed. Currently does nothing.
/// </summary>
/// <param name="firstPage">The first page that finished.</param>
void SnifferThread_FirstPageParseDone(ListPage firstPage)
{
    // Intentionally empty.
    // An earlier, disabled implementation built a dated output directory from PagePath,
    // exported Data to an XML file there (with an optional Excel conversion), and then
    // cleared PagePath, Data.Tables and the per-thread state in SnifferContext.
}
/// <summary>
/// Sniffs a list page and walks each of its sub-page URLs, recursing through nested list
/// pages until list pages whose sub pages are detail pages are reached.
/// </summary>
/// <param name="listPage">The list page to process.</param>
public void ParseListPage(ListPage listPage)
{
    listPage.Sniffer();

    foreach (UrlItem link in listPage.SubPageUrlResults)
    {
        ListPage child = new ListPage(listPage, (ListPageConfiguration)listPage.ListPageConfiguration.SubPageConfiguration);
        child.PageName = link.Title;
        child.PageUrl = link.Url;
        child.Sniffer();

        if (child.SubPageUrlResults.Count > 0)
        {
            if (child.ListPageConfiguration.SubPageConfiguration.PageType == PageType.DetailPage)
            {
                ParseDetailPage(child);
                OnListPageParseDone(child);
            }
            else
            {
                ParseListPage(child);
            }

            continue;
        }

        // The child produced no URLs. When the parent configuration has For set and the parent
        // has a body, process the same URL again using the parent's own configuration.
        if (listPage.ListPageConfiguration.For && !string.IsNullOrEmpty(listPage.PageBody))
        {
            ListPage sameLevel = new ListPage(listPage, (ListPageConfiguration)listPage.ListPageConfiguration);
            sameLevel.PageName = link.Title;
            sameLevel.PageUrl = link.Url;
            ParseListPage(sameLevel);
        }
    }
}
/// <summary>
/// Downloads every image referenced by <paramref name="value"/> and rewrites those
/// references to the location the image was saved (or published) under.
/// </summary>
/// <param name="item">Supplies <c>IsUrl</c>, <c>SaveImagesPath</c> and <c>ImageUrlPath</c>.</param>
/// <param name="value">
/// Text to scan. When <c>item.IsUrl</c> is set the whole value is treated as one image URL;
/// otherwise <c>src</c> values of <c>img</c> tags are extracted. Updated in place.
/// </param>
/// <remarks>
/// A failure on one image is logged and does not stop the remaining images.
/// </remarks>
public void SaveImages(SnifferItem item, ref string value)
{
    // Same numeric value (25) as the cast used previously, spelled out as named flags.
    const RegexOptions imageOptions = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

    value = value.Replace("wallpaper:", "http:");

    MatchCollection imgMatchs;
    if (item.IsUrl)
    {
        imgMatchs = Regex.Matches(value, "^.*$", imageOptions);
    }
    else
    {
        imgMatchs = Regex.Matches(value, "(?<=<img[^>]*?src=[\"']?)(\\.|/|http)[^\"' >]*", imageOptions);
    }

    ListPage parentPage = this.Parent as ListPage;
    if (parentPage == null)
    {
        parentPage = this.Parent.Parent as ListPage;
    }

    // Resolve the target directory: the configured path (with a year\month segment), or a
    // folder named after the parent page's save file without its extension.
    string dtPath = DateTime.Now.ToString("yyyy\\\\MM");
    string saveImagesPath;
    if (!string.IsNullOrEmpty(item.SaveImagesPath))
    {
        saveImagesPath = string.Format(item.SaveImagesPath, dtPath);
    }
    else
    {
        saveImagesPath = parentPage.SavePathAndFileName.Substring(0, parentPage.SavePathAndFileName.LastIndexOf(".")) + "\\";
    }

    if (!saveImagesPath.EndsWith("\\"))
    {
        saveImagesPath += "\\";
    }

    if (!Directory.Exists(saveImagesPath))
    {
        Directory.CreateDirectory(saveImagesPath);
    }

    foreach (Match match in imgMatchs)
    {
        try
        {
            // LastIndexOf returns -1 when there is no '/', so +1 gives 0 and the whole value is kept.
            string fileName = match.Value.Substring(match.Value.LastIndexOf("/") + 1);

            int queryStart = fileName.IndexOf('?');
            if (queryStart >= 0)
            {
                fileName = fileName.Substring(0, queryStart);
            }

            // URL-decoding can reintroduce directory separators (e.g. "%2F", "%5C"). Keep only the
            // final path segment so the image cannot be written outside saveImagesPath.
            fileName = Path.GetFileName(HttpUtility.UrlDecode(fileName));

            string absUrl = FileUtil.GetAbsUrl(match.Value, this.SubPageBaseUrl);
            Stream stream = FileUtil.GetPageStream(absUrl, this.SubPageBaseUrl);
            if (stream != null)
            {
                if (this.Upload)
                {
                    // The stream is handed to the upload thread, so it is not disposed here.
                    Thread thread = new Thread(new ParameterizedThreadStart(UploadFile));
                    thread.Start(new object[] { stream, saveImagesPath, fileName, "image/jpeg" });
                }
                else
                {
                    // NOTE(review): assumes StreamSaveToFile finishes writing before it returns — confirm.
                    using (stream)
                    {
                        FileUtil.StreamSaveToFile(stream, saveImagesPath + fileName);
                    }
                }
            }

            // Point the reference at the public image URL when one is configured, else at the local path.
            string newReference;
            if (!string.IsNullOrEmpty(item.ImageUrlPath))
            {
                string imageUrlPath = string.Format(item.ImageUrlPath, dtPath.Replace("\\", "/"));
                if (!imageUrlPath.EndsWith("/"))
                {
                    imageUrlPath += "/";
                }

                newReference = imageUrlPath + fileName;
            }
            else
            {
                newReference = saveImagesPath + fileName;
            }

            value = value.Replace(match.Value, newReference);
        }
        catch (System.Exception e)
        {
            LogManager.WriteLog(string.Format("<saveimgerr><img>{0}</img><error>{1}</error></saveimgerr>", match.Value, e.Message));
        }
    }
}
/// <summary>
/// Returns the value of <c>SnifferPageCount</c>.
/// </summary>
/// <param name="listPage">Not used by this implementation.</param>
/// <returns>The current <c>SnifferPageCount</c>.</returns>
public int GetSnifferPageCount(ListPage listPage)
{
    int count = this.SnifferPageCount;
    return count;
}
/// <summary>
/// Returns the value of <c>StartPageIndex</c>.
/// </summary>
/// <param name="listPage">Not used by this implementation.</param>
/// <returns>The current <c>StartPageIndex</c>.</returns>
public int GetStartPageIndex(ListPage listPage)
{
    int startIndex = this.StartPageIndex;
    return startIndex;
}
/// <summary>
/// Called when a category list has finished being searched; saving is meant to happen here.
/// Currently does nothing.
/// </summary>
/// <param name="listPage">The category list page that finished.</param>
void SnifferThread_CategoryParseDone(ListPage listPage)
{
    // Intentionally empty.
}