/// <summary>
/// Runs <c>GetPropertiesMatrix</c> for the given list sheet and reports success.
/// </summary>
/// <param name="listSheet">List sheet forwarded unchanged to <c>GetPropertiesMatrix</c>.</param>
/// <returns>Always <c>true</c>; a failure surfaces as an exception from the callee.</returns>
public override bool AfterAllGrab(IListSheet listSheet)
{
    GetPropertiesMatrix(listSheet);

    return true;
}
/// <summary>
/// Entry point that delegates to <c>GenerateDetailPageInfo</c>.
/// </summary>
/// <param name="parameters">Not used by this method.</param>
/// <param name="listSheet">List sheet forwarded to <c>GenerateDetailPageInfo</c>.</param>
/// <returns>Whatever <c>GenerateDetailPageInfo</c> returns.</returns>
public bool Run(string parameters, IListSheet listSheet)
{
    bool generated = GenerateDetailPageInfo(listSheet);
    return generated;
}
/// <summary>
/// Runs the three-stage person crawl (list pages, person page URL infos, person page URLs),
/// then extracts the person details from the locally stored pages and writes them to an .xlsx file.
/// Each stage stores its result in a "middle file"; outside production mode a stored result is
/// reused unless an earlier stage had to be recomputed in this run.
/// </summary>
/// <param name="listSheet">List sheet the seed information is read from.</param>
/// <returns>Always <c>true</c>; failures surface as exceptions.</returns>
public override bool AfterAllGrab(IListSheet listSheet)
{
    this.GetSeedInfoFromListSheet(listSheet);

    // The following steps must always be executed.
    // Becomes true as soon as any stage was recomputed, which invalidates the later caches.
    bool isNewDo = false;
    string filePrefix = this.LoginName + "_" + this.KeyWords;

    // ---- Stage 1: list page URLs ----
    string localLogFileName = filePrefix + "_listPageUrl";
    List<string> allListPageUrls = null;
    if (SysConfig.SysExecuteType != SysExecuteType.Produce)
    {
        // Not production: try the list page URL file written by an earlier run.
        allListPageUrls = this.RunPage.TryGetInfoFromMiddleFile(localLogFileName, "listPageUrl");
    }
    if (allListPageUrls == null)
    {
        // Production, or nothing cached: crawl the list pages now.
        allListPageUrls = this.GetAllListPages(this.Parameters, this.SeedPageUrl);
        this.RunPage.SaveInfoToMiddleFile(localLogFileName, "listPageUrl", allListPageUrls);
        isNewDo = true;
    }

    // ---- Stage 2: person page URL infos parsed from the list pages ----
    localLogFileName = filePrefix + "_personPageUrlInfo";
    string[] personInfoColumns = new string[] { "personUrl", "personName" };
    List<Dictionary<string, string>> allPersonPageUrlInfos = null;
    if (SysConfig.SysExecuteType != SysExecuteType.Produce && !isNewDo)
    {
        // Read the person page addresses parsed by an earlier run.
        allPersonPageUrlInfos = this.RunPage.TryGetInfoFromMiddleFile(localLogFileName, personInfoColumns);
    }
    if (allPersonPageUrlInfos == null)
    {
        allPersonPageUrlInfos = this.GetPersonPageUrlsFromListPages(this.RunPage.GetDetailSourceFileDir(), allListPageUrls);
        this.RunPage.SaveInfoToMiddleFile(localLogFileName, personInfoColumns, allPersonPageUrlInfos);
        isNewDo = true;
    }

    // ---- Stage 3: person page URLs ----
    localLogFileName = filePrefix + "_personPageUrl";
    List<string> allPersonPageUrls = null;
    if (SysConfig.SysExecuteType != SysExecuteType.Produce && !isNewDo)
    {
        // Read back from the same file name this method saves to. The previous code only
        // looked at "<login>.<keywords>.personPageUrl", which this method never writes, so
        // the cache could not hit. That older name is still tried as a fallback.
        allPersonPageUrls = this.RunPage.TryGetInfoFromMiddleFile(localLogFileName, "personUrl");
        if (allPersonPageUrls == null)
        {
            allPersonPageUrls = this.RunPage.TryGetInfoFromMiddleFile(this.LoginName + "." + this.KeyWords + ".personPageUrl", "personUrl");
        }
    }
    if (allPersonPageUrls == null)
    {
        allPersonPageUrls = ProcessPersonPage.GetAllPersonPageUrls(this.RunPage, allPersonPageUrlInfos, this.LoginName, this.LoginPassword);
        this.RunPage.SaveInfoToMiddleFile(localLogFileName, "personUrl", allPersonPageUrls);
    }

    // ---- Export ----
    List<Dictionary<string, string>> personInfoList = ProcessPersonPage.GetPersonInfoFromLocalPages(this.RunPage, allPersonPageUrls, false, null);
    string personInfosFilePath = this.RunPage.GetFilePath(
        "SearchResult_Linkedin2Linkedin_" + this.LoginName + "_" + this.KeyWords + ".xlsx",
        this.RunPage.GetExportDir());
    ProcessPersonPage.SavePersonInfoToFile(this.RunPage, personInfoList, personInfosFilePath);
    return true;
}
/// <summary>
/// Parses every locally stored project detail page that is not flagged as given up and writes
/// the project base information plus five per-tab sub-tables to their CSV writers.
/// A page that cannot be parsed is moved into a sibling "deleted" directory and logged;
/// none of its rows are written.
/// </summary>
/// <param name="listSheet">List sheet whose rows identify the detail pages.</param>
/// <returns>Always <c>true</c>.</returns>
private bool GetAllPages(IListSheet listSheet)
{
    CsvWriter mainCW = this.GetMainCsvWriter();
    CsvWriter ztbCW = this.GetZtbCsvWriter();
    CsvWriter sgtscCW = this.GetSgtscCsvWriter();
    CsvWriter htbaCW = this.GetHtbaCsvWriter();
    CsvWriter sgxkCW = this.GetSgxkCsvWriter();
    CsvWriter jgysbaCW = this.GetJgysbaCsvWriter();

    // Sub-table definitions: tab div id, target writer, and the column names taken from
    // table cells 1..n of each row (cell 0 is skipped).
    string[] tabIds = new string[] { "tab_ztb", "tab_sgtsc", "tab_htba", "tab_sgxk", "tab_jgysba" };
    CsvWriter[] tabWriters = new CsvWriter[] { ztbCW, sgtscCW, htbaCW, sgxkCW, jgysbaCW };
    string[][] tabColumns = new string[][]
    {
        new string[] { "招标类型", "招标方式", "中标单位名称", "中标日期", "中标金额(万元)", "中标通知书编号", "省级中标通知书编号" },
        new string[] { "施工图审查合格书编号", "省级施工图审查合格书编号", "勘察单位名称", "设计单位名称", "施工图审查机构名称", "审查完成日期" },
        new string[] { "合同类别", "合同备案编号", "省级合同备案编号", "合同金额(万元)", "合同签订日期" },
        new string[] { "施工许可证编号", "省级施工许可证编号", "合同金额(万元)", "面积(平方米)", "发证日期" },
        new string[] { "竣工备案编号", "省级竣工备案编号", "实际造价(万元)", "实际面积(平方米)", "实际开工日期", "实际竣工验收日期" }
    };

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        string detailPageUrl = row[SysConfig.DetailPageUrlFieldName];
        try
        {
            if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
            {
                continue;
            }

            HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

            // Rows are buffered and only handed to the writers once the whole page parsed,
            // so a page rejected half-way leaves no partial rows behind.
            List<CsvWriter> pendingWriters = new List<CsvWriter>();
            List<Dictionary<string, string>> pendingRows = new List<Dictionary<string, string>>();

            #region Base information
            HtmlNode xmmcNode = pageHtmlDoc.DocumentNode.SelectSingleNode("//div[@class=\"user_info spmtop\"]");
            if (xmmcNode == null)
            {
                throw new Exception("没有找到项目名称节点");
            }
            string xmmc = CommonUtil.HtmlDecode(xmmcNode.InnerText.Trim()).Trim();

            HtmlNodeCollection projectFieldNodeList = pageHtmlDoc.DocumentNode.SelectNodes("//div[@class=\"query_info_box \"]/div/div[@class=\"activeTinyTabContent\"]/dl/dd");
            if (projectFieldNodeList == null)
            {
                throw new Exception("无法获取项目基本信息属性值");
            }

            // Output columns in their fixed order; values default to "" when the page lacks a field.
            Dictionary<string, string> f2vs = new Dictionary<string, string>();
            f2vs.Add("项目编号", "");
            f2vs.Add("省级项目编号", "");
            f2vs.Add("项目名称", xmmc);
            f2vs.Add("所在区划", "");
            f2vs.Add("建设单位", "");
            f2vs.Add("建设单位组织机构代码(统一社会信用代码)", "");
            f2vs.Add("项目分类", "");
            f2vs.Add("建设性质", "");
            f2vs.Add("工程用途", "");
            f2vs.Add("总投资", "");
            f2vs.Add("总面积", "");
            f2vs.Add("立项级别", "");
            f2vs.Add("立项文号", "");

            foreach (HtmlNode projectFieldNode in projectFieldNodeList)
            {
                // Each dd holds "name:value". A dd without ':' makes Substring throw, which
                // sends the page to the catch below, i.e. it is treated as incomplete.
                string fieldText = projectFieldNode.InnerText.Trim();
                int sIndex = fieldText.IndexOf(':');
                string fieldName = CommonUtil.HtmlDecode(fieldText.Substring(0, sIndex)).Trim();
                string fieldValue = CommonUtil.HtmlDecode(fieldText.Substring(sIndex + 1)).Trim();

                // The project name comes from its own node above and is never overwritten here.
                if (fieldName != "项目名称" && f2vs.ContainsKey(fieldName))
                {
                    f2vs[fieldName] = fieldValue;
                }
            }
            string xmbh = f2vs["项目编号"];
            pendingWriters.Add(mainCW);
            pendingRows.Add(f2vs);
            #endregion

            #region Sub-tables (bidding, drawing review, contract filing, construction permit, completion filing)
            for (int t = 0; t < tabIds.Length; t++)
            {
                HtmlNodeCollection tabRowNodes = pageHtmlDoc.DocumentNode.SelectNodes("//div[@id=\"" + tabIds[t] + "\"]/table/tbody/tr[@class=\"row\"]");
                if (tabRowNodes == null)
                {
                    continue;
                }
                foreach (HtmlNode tabRowNode in tabRowNodes)
                {
                    HtmlNodeCollection cellNodes = tabRowNode.SelectNodes("./td");
                    Dictionary<string, string> tabF2vs = new Dictionary<string, string>();
                    tabF2vs.Add("项目编码", xmbh);
                    string[] columns = tabColumns[t];
                    for (int c = 0; c < columns.Length; c++)
                    {
                        tabF2vs.Add(columns[c], CommonUtil.HtmlDecode(cellNodes[c + 1].InnerText.Trim()));
                    }
                    pendingWriters.Add(tabWriters[t]);
                    pendingRows.Add(tabF2vs);
                }
            }
            #endregion

            for (int p = 0; p < pendingRows.Count; p++)
            {
                pendingWriters[p].AddRow(pendingRows[p]);
            }
        }
        catch (Exception ex)
        {
            // Quarantine the page file so that it is no longer treated as a valid download.
            string dir = this.RunPage.GetDetailSourceFileDir();
            string toDir = Path.Combine(Path.GetDirectoryName(dir), "deleted");
            string fileUrl = this.RunPage.GetFilePath(detailPageUrl, dir);
            string toFileUrl = this.RunPage.GetFilePath(detailPageUrl, toDir);

            // File.Move throws when the source is missing, the target directory does not
            // exist, or the target already exists. None of those may abort the whole run.
            if (File.Exists(fileUrl))
            {
                string toFileDir = Path.GetDirectoryName(toFileUrl);
                if (!string.IsNullOrEmpty(toFileDir))
                {
                    Directory.CreateDirectory(toFileDir);
                }
                if (File.Exists(toFileUrl))
                {
                    File.Delete(toFileUrl);
                }
                File.Move(fileUrl, toFileUrl);
            }
            this.RunPage.InvokeAppendLogText("文件不完整,删除" + ". detailPageUrl = " + detailPageUrl + ". " + ex.Message, LogLevelType.Error, true);
        }
    }

    mainCW.SaveToDisk();
    ztbCW.SaveToDisk();
    sgtscCW.SaveToDisk();
    htbaCW.SaveToDisk();
    sgxkCW.SaveToDisk();
    jgysbaCW.SaveToDisk();
    return true;
}
/// <summary>
/// Runs <c>ProcessLinkageResult</c> for the given list sheet and reports success.
/// </summary>
/// <param name="listSheet">List sheet forwarded unchanged to <c>ProcessLinkageResult</c>.</param>
/// <returns>Always <c>true</c>; a failure surfaces as an exception from the callee.</returns>
public override bool AfterAllGrab(IListSheet listSheet)
{
    ProcessLinkageResult(listSheet);

    return true;
}
/// <summary>
/// Entry point that delegates to <c>GenerateCityReport</c>.
/// </summary>
/// <param name="parameters">Not used by this method.</param>
/// <param name="listSheet">List sheet forwarded to <c>GenerateCityReport</c>.</param>
/// <returns>Whatever <c>GenerateCityReport</c> returns.</returns>
public bool Run(string parameters, IListSheet listSheet)
{
    bool reportGenerated = GenerateCityReport(listSheet);
    return reportGenerated;
}
/// <summary>
/// Entry point that delegates to <c>GetAllDetailPageUrl</c>.
/// </summary>
/// <param name="parameters">Not used by this method.</param>
/// <param name="listSheet">List sheet forwarded to <c>GetAllDetailPageUrl</c>.</param>
/// <returns>Whatever <c>GetAllDetailPageUrl</c> returns.</returns>
public bool Run(string parameters, IListSheet listSheet)
{
    return this.GetAllDetailPageUrl(listSheet);
}
/// <summary>
/// Builds the URL of the first table-of-contents page of every periodical issue.
/// Reads the locally stored JSON of each list row that is not given up, walks
/// <c>pageRow[*].op.perioIssue[*]</c>, and writes one row per distinct URL. A new Excel
/// writer is started once the current one holds more than 500000 rows.
/// </summary>
/// <param name="listSheet">List sheet whose rows point at the stored JSON pages.</param>
private void GetAllPerioFirstIndexPageUrls(IListSheet listSheet)
{
    // Result unused; the call is kept only in case it has side effects - TODO confirm and drop.
    this.RunPage.GetExportDir();
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();
    int allListFileIndex = 1;
    ExcelWriter ew = null;

    // URLs already written, so that each one is emitted once.
    HashSet<string> seenUrls = new HashSet<string>();

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        if (ew == null || ew.RowCount > 500000)
        {
            if (ew != null)
            {
                ew.SaveToDisk();
            }
            ew = this.GetAllPerioFirstIndexPageExcelWriter(allListFileIndex);
            allListFileIndex++;
        }

        Dictionary<string, string> row = listSheet.GetRow(i);
        string detailUrl = row["detailPageUrl"];
        if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
        {
            continue;
        }

        // Exceptions propagate unchanged. The former catch/"throw ex" wrappers only reset
        // the stack trace.
        string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
        string pageFileText = FileHelper.GetTextFromFile(localFilePath);
        JArray itemJsonArray = JObject.Parse(pageFileText).GetValue("pageRow") as JArray;
        if (itemJsonArray == null)
        {
            continue;
        }

        for (int j = 0; j < itemJsonArray.Count; j++)
        {
            JObject itemJson = itemJsonArray[j] as JObject;
            JObject opJson = itemJson.GetValue("op") as JObject;
            if (opJson == null)
            {
                continue;
            }

            // One entry per issue.
            JArray opItemsArray = opJson.GetValue("perioIssue") as JArray;
            if (opItemsArray == null)
            {
                continue;
            }

            for (int k = 0; k < opItemsArray.Count; k++)
            {
                JObject opItemJson = opItemsArray[k] as JObject;
                string issue_num = this.GetAttributeValue(opItemJson, "issue_num");
                string publish_year = this.GetAttributeValue(opItemJson, "publish_year");
                string perio_id = this.GetAttributeValue(opItemJson, "perio_id");
                string perio_title = this.GetAttributeValue(opItemJson, "perio_title");
                if (issue_num == null || publish_year == null || perio_id == null || perio_title == null)
                {
                    continue;
                }

                string firstIndexPageUrl = "http://www.wanfangdata.com.cn/perio/articleList.do?page=1&pageSize=10&issue_num=" + issue_num
                    + "&publish_year=" + publish_year
                    + "&article_start=&title_article=&perio_id=" + perio_id;

                // HashSet.Add returns false for a URL that was already emitted.
                if (!seenUrls.Add(firstIndexPageUrl))
                {
                    continue;
                }

                Dictionary<string, string> f2vs = new Dictionary<string, string>();
                f2vs.Add("detailPageUrl", firstIndexPageUrl);
                f2vs.Add("detailPageName", firstIndexPageUrl);
                f2vs.Add("perio_id", perio_id);
                f2vs.Add("issue_num", issue_num);
                f2vs.Add("publish_year", publish_year);
                f2vs.Add("perio_title", perio_title);
                f2vs.Add("pageIndex", "1");
                ew.AddRow(f2vs);
            }
        }
    }

    // ew is still null when the list sheet has no rows.
    if (ew != null)
    {
        ew.SaveToDisk();
    }
}
/// <summary>
/// Exports the periodical details found in the locally stored JSON pages to
/// "万方期刊_期刊信息详情.xlsx" in the export directory. Every element of a page's
/// <c>pageRow</c> array becomes one row, tagged with the category columns of the list row.
/// </summary>
/// <param name="listSheet">List sheet whose rows point at the stored JSON pages and carry cate1/cateId1/cate2/cateId2.</param>
private void GetPeriodicalInfo(IListSheet listSheet)
{
    String exportDir = this.RunPage.GetExportDir();
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

    // JSON attributes copied into every result row, in output column order.
    string[] jsonFieldNames = new string[]
    {
        "id", "core_perio", "avg_perio_down", "start_year02", "start_year", "issue_postcode",
        "perio_format", "fax", "perio_id", "language", "tag_num", "major_editor",
        "abstract_reading_num", "thirdparty_links_num", "import_num", "email", "share_num",
        "classcode_level", "publish_cycle", "address", "pinyin_title", "avg_article_down",
        "hostunit_name", "hostunit_area", "director", "main_column", "telephone", "country_code",
        "affectoi", "issn", "cn", "source_db", "dep_name", "postcode", "collection_num",
        "win_prize", "cite_num", "perio_title02", "download_num", "first_publish", "data_state",
        "article_num", "ef_name", "release_cycle", "fulltext_reading_num", "note_num", "end_year",
        "class_code", "end_issue", "trans_title", "perio_desc", "perio_title", "keywords", "summary"
    };

    // Result columns: the JSON attributes followed by the four category columns.
    List<string> resultColumns = new List<string>(jsonFieldNames);
    resultColumns.Add("cate1");
    resultColumns.Add("cateId1");
    resultColumns.Add("cate2");
    resultColumns.Add("cateId2");
    Dictionary<string, int> resultColumnDic = CommonUtil.InitStringIndexDic(resultColumns.ToArray());

    string resultFilePath = Path.Combine(exportDir, "万方期刊_期刊信息详情.xlsx");
    ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, null);

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        string detailUrl = row["detailPageUrl"];
        try
        {
            string cate1 = row["cate1"];
            string cateId1 = row["cateId1"];
            string cate2 = row["cate2"];
            string cateId2 = row["cateId2"];
            if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
            {
                continue;
            }

            string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
            string pageFileText = FileHelper.GetTextFromFile(localFilePath);
            JArray itemJsonArray = JObject.Parse(pageFileText).GetValue("pageRow") as JArray;
            if (itemJsonArray == null)
            {
                // No "pageRow" array in this page: nothing to export for it.
                continue;
            }

            for (int j = 0; j < itemJsonArray.Count; j++)
            {
                JObject itemJson = itemJsonArray[j] as JObject;
                Dictionary<string, string> f2vs = new Dictionary<string, string>();
                f2vs.Add("cate1", cate1);
                f2vs.Add("cateId1", cateId1);
                f2vs.Add("cate2", cate2);
                f2vs.Add("cateId2", cateId2);
                foreach (string jsonFieldName in jsonFieldNames)
                {
                    this.GetAttributeValue(itemJson, jsonFieldName, f2vs);
                }
                resultEW.AddRow(f2vs);
            }
        }
        catch (Exception ex)
        {
            this.RunPage.InvokeAppendLogText(ex.Message + ". detailUrl" + detailUrl, LogLevelType.Error, true);
            // Rethrow with "throw;" so the original stack trace is preserved.
            throw;
        }
    }

    resultEW.SaveToDisk();
}
/// <summary>
/// Collects the item pages linked from every locally stored item page. For each list row that
/// is not given up, the page itself and every distinct "/item/..." link accepted by
/// <c>IsInMainContent</c> are written to the "more item" Excel writer, then
/// <c>GenerateRelatedItemFile</c> is invoked for the page.
/// </summary>
/// <param name="listSheet">List sheet whose rows identify the stored item pages.</param>
private void GetRelatedItemPageUrls(IListSheet listSheet)
{
    ExcelWriter moreItemEW = this.CreateMoreItemWriter();

    // Item URLs already written, so that each one is emitted once.
    HashSet<string> knownItemUrls = new HashSet<string>();

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> listRow = listSheet.GetRow(i);
        bool giveUp = "Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]);
        string fromItemUrl = listRow[SysConfig.DetailPageUrlFieldName];
        if (giveUp)
        {
            continue;
        }

        // Exceptions propagate unchanged. The former catch/"throw ex" only reset the stack trace.
        HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        HtmlNode titleNode = htmlDoc.DocumentNode.SelectSingleNode("//dd[@class=\"lemmaWgt-lemmaTitle-title\"]/h1");
        string fromItemName = CommonUtil.HtmlDecode(titleNode.InnerText).Trim();
        HtmlNode itemBaseInfoNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@class=\"lemmaWgt-promotion-rightPreciseAd\"]");
        string fromItemId = itemBaseInfoNode.GetAttributeValue("data-lemmaid", "");

        if (knownItemUrls.Add(fromItemUrl))
        {
            Dictionary<string, string> fromItemRow = new Dictionary<string, string>();
            fromItemRow.Add("detailPageUrl", fromItemUrl);
            fromItemRow.Add("detailPageName", fromItemUrl);
            fromItemRow.Add("itemId", fromItemId);
            fromItemRow.Add("itemName", fromItemName);
            moreItemEW.AddRow(fromItemRow);
        }

        // SelectNodes yields null (not an empty collection) when nothing matches.
        HtmlNodeCollection aNodes = htmlDoc.DocumentNode.SelectNodes("//a");
        if (aNodes != null)
        {
            foreach (HtmlNode aNode in aNodes)
            {
                string toItemUrl = aNode.GetAttributeValue("href", "");
                if (!toItemUrl.StartsWith("/item/", StringComparison.Ordinal))
                {
                    continue;
                }

                string toItemFullUrl = "https://baike.baidu.com" + toItemUrl;
                // Cheap duplicate check first; IsInMainContent only runs for new URLs.
                if (knownItemUrls.Contains(toItemFullUrl) || !this.IsInMainContent(aNode))
                {
                    continue;
                }
                knownItemUrls.Add(toItemFullUrl);

                Dictionary<string, string> toItemRow = new Dictionary<string, string>();
                toItemRow.Add("detailPageUrl", toItemFullUrl);
                toItemRow.Add("detailPageName", toItemFullUrl);
                toItemRow.Add("itemId", aNode.GetAttributeValue("data-lemmaid", ""));
                toItemRow.Add("itemName", CommonUtil.HtmlDecode(aNode.InnerText).Trim());
                moreItemEW.AddRow(toItemRow);
            }
        }

        this.GenerateRelatedItemFile(fromItemUrl, htmlDoc);
    }

    moreItemEW.SaveToDisk();
}
/// <summary>
/// Exports the basic information of every periodical issue. Reads the locally stored JSON of
/// each list row that is not given up, walks <c>pageRow[*].op.perioIssue[*]</c>, and writes one
/// row per issue. A new Excel writer is started once the current one holds more than 500000 rows.
/// </summary>
/// <param name="listSheet">List sheet whose rows point at the stored JSON pages.</param>
private void GetPeriodicalPerioIssueInfo(IListSheet listSheet)
{
    // Result unused; the call is kept only in case it has side effects - TODO confirm and drop.
    this.RunPage.GetExportDir();
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();
    int allListFileIndex = 1;
    ExcelWriter ew = null;

    // Issue attributes copied into every result row, in the original order.
    string[] issueFieldNames = new string[]
    {
        "publish_year", "trans_title", "issue_id", "show_issue_num", "page_cnt", "issue_num",
        "perio_id", "orig_catalog", "volume", "catalog_url", "total_issue", "special_title",
        "issue_cover", "id", "perio_title"
    };

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        if (ew == null || ew.RowCount > 500000)
        {
            if (ew != null)
            {
                ew.SaveToDisk();
            }
            ew = this.GetAllPerioIssueInfoExcelWriter(allListFileIndex);
            allListFileIndex++;
        }

        Dictionary<string, string> row = listSheet.GetRow(i);
        string detailUrl = row["detailPageUrl"];
        if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
        {
            continue;
        }

        // Exceptions propagate unchanged. The former catch/"throw ex" wrappers only reset
        // the stack trace.
        string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
        string pageFileText = FileHelper.GetTextFromFile(localFilePath);
        JArray itemJsonArray = JObject.Parse(pageFileText).GetValue("pageRow") as JArray;
        if (itemJsonArray == null)
        {
            continue;
        }

        for (int j = 0; j < itemJsonArray.Count; j++)
        {
            JObject itemJson = itemJsonArray[j] as JObject;
            string perioId = itemJson.GetValue("id").ToString().Trim();
            JObject opJson = itemJson.GetValue("op") as JObject;
            if (opJson == null)
            {
                continue;
            }

            // One entry per issue.
            JArray opItemsArray = opJson.GetValue("perioIssue") as JArray;
            if (opItemsArray == null)
            {
                continue;
            }

            for (int k = 0; k < opItemsArray.Count; k++)
            {
                JObject opItemJson = opItemsArray[k] as JObject;
                Dictionary<string, string> f2vs = new Dictionary<string, string>();
                f2vs.Add("perioId", perioId);
                foreach (string issueFieldName in issueFieldNames)
                {
                    this.GetAttributeValue(opItemJson, issueFieldName, f2vs);
                }
                ew.AddRow(f2vs);
            }
        }
    }

    // ew is still null when the list sheet has no rows.
    if (ew != null)
    {
        ew.SaveToDisk();
    }
}
/// <summary>
/// Extracts the "basic-info" dt/dd property pairs of every locally stored person page and
/// writes them twice: first as one row per property (name, fullName, pKey, pValue, url), then
/// as one row per page with one column per property key seen in the first pass.
/// A key repeated within one page gets the suffix "_2", "_3", ...
/// </summary>
/// <param name="listSheet">List sheet whose rows identify the stored pages and carry "name" and "fullName".</param>
private void GetRenWuProperties(IListSheet listSheet)
{
    // Exceptions propagate unchanged. The former catch/"throw ex" wrappers only reset the stack trace.
    const string dtXPath = "//div[@class=\"basic-info cmn-clearfix\"]/dl/dt";

    // Property keys in first-seen order (this becomes the column order of the second file);
    // the set gives constant-time membership checks.
    List<string> propertyColumnNames = new List<string>();
    HashSet<string> knownColumnNames = new HashSet<string>();

    // ---- Pass 1: one output row per property ----
    ExcelWriter RenWuInfoExcelWriter = this.CreateRenWuPropertyListWriter();
    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> listRow = listSheet.GetRow(i);
        bool giveUp = "Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]);
        string pageUrl = listRow[SysConfig.DetailPageUrlFieldName];
        string name = listRow["name"];
        string fullName = listRow["fullName"];
        if (giveUp)
        {
            continue;
        }

        HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        HtmlNodeCollection dtNodes = htmlDoc.DocumentNode.SelectNodes(dtXPath);
        if (dtNodes == null)
        {
            continue;
        }

        HashSet<string> pageKeys = new HashSet<string>();
        foreach (HtmlNode dtNode in dtNodes)
        {
            // NOTE(review): the three Replace calls look identical here; presumably they
            // targeted different whitespace characters - verify the file encoding.
            string pKey = CommonUtil.HtmlDecode(dtNode.InnerText).Trim().Replace(" ", "").Replace(" ", "").Replace(" ", "");
            string pValue = this.GetNextDDNodeText(dtNode);

            // Make the key unique within this page.
            int sameNamePKeyCount = 1;
            string newPKey = pKey;
            while (pageKeys.Contains(newPKey))
            {
                sameNamePKeyCount++;
                newPKey = pKey + "_" + sameNamePKeyCount.ToString();
            }
            pageKeys.Add(newPKey);

            if (knownColumnNames.Add(newPKey))
            {
                propertyColumnNames.Add(newPKey);
            }

            Dictionary<string, string> row = new Dictionary<string, string>();
            row.Add("name", name);
            row.Add("fullName", fullName);
            row.Add("pKey", newPKey);
            row.Add("pValue", pValue);
            row.Add("url", pageUrl);
            RenWuInfoExcelWriter.AddRow(row);
        }
    }
    RenWuInfoExcelWriter.SaveToDisk();

    // ---- Pass 2: one output row per page, one column per property ----
    ExcelWriter RenWuColumnPropertyExcelWriter = this.CreateRenWuColumnPropertyListWriter(propertyColumnNames);
    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> listRow = listSheet.GetRow(i);
        bool giveUp = "Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]);
        string pageUrl = listRow[SysConfig.DetailPageUrlFieldName];
        string name = listRow["name"];
        string fullName = listRow["fullName"];
        if (giveUp)
        {
            continue;
        }

        HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        HtmlNodeCollection dtNodes = htmlDoc.DocumentNode.SelectNodes(dtXPath);

        Dictionary<string, string> row = new Dictionary<string, string>();
        row.Add("name", name);
        row.Add("fullName", fullName);
        row.Add("url", pageUrl);
        if (dtNodes != null)
        {
            HashSet<string> pageKeys = new HashSet<string>();
            foreach (HtmlNode dtNode in dtNodes)
            {
                string pKey = CommonUtil.HtmlDecode(dtNode.InnerText).Trim().Replace(" ", "").Replace(" ", "").Replace(" ", "");
                string pValue = this.GetNextDDNodeText(dtNode);

                int sameNamePKeyCount = 1;
                string newPKey = pKey;
                while (pageKeys.Contains(newPKey))
                {
                    sameNamePKeyCount++;
                    newPKey = pKey + "_" + sameNamePKeyCount.ToString();
                }
                pageKeys.Add(newPKey);
                row.Add(newPKey, pValue);
            }
        }
        // A row is written even for a page without any property node.
        RenWuColumnPropertyExcelWriter.AddRow(row);
    }
    RenWuColumnPropertyExcelWriter.SaveToDisk();
}
/// <summary>
/// Keeps only part of the person properties. The first entry of <c>Parameters</c>
/// (comma-separated) is the path of an Excel file whose sheet "人物属性" maps each retained
/// column ("column") to its aliases ("aliasColumns", comma-separated). Page properties whose
/// key is a retained column or one of its aliases are written twice: as one row per property,
/// then as one row per page with the values of the same column joined by ";".
/// </summary>
/// <param name="listSheet">List sheet whose rows identify the stored pages and carry "name".</param>
/// <exception cref="ArgumentException">Thrown when <c>Parameters</c> contains no column map file path.</exception>
private void GetRenWuRemainProperties(IListSheet listSheet)
{
    // Exceptions propagate unchanged. The former catch/"throw ex" wrappers only reset the stack trace.
    const string dtXPath = "//div[@class=\"basic-info cmn-clearfix\"]/dl/dt";

    string[] parameters = this.Parameters.Split(new string[] { "," }, StringSplitOptions.RemoveEmptyEntries);
    if (parameters.Length == 0)
    {
        throw new ArgumentException("Parameters must start with the path of the column map Excel file.");
    }
    string columnMapFilePath = parameters[0];

    // alias (or the column itself) -> retained column name
    ExcelReader columnMapER = new ExcelReader(columnMapFilePath, "人物属性");
    int rowCount = columnMapER.GetRowCount();
    Dictionary<string, string> columnAliasToColumns = new Dictionary<string, string>();
    HashSet<string> retainedColumns = new HashSet<string>();
    for (int i = 0; i < rowCount; i++)
    {
        Dictionary<string, string> columnRow = columnMapER.GetFieldValues(i);
        string columnName = columnRow["column"].Trim();
        // Add (not the indexer) on purpose: a duplicated column or alias in the map still fails loudly.
        columnAliasToColumns.Add(columnName, columnName);
        retainedColumns.Add(columnName);

        string[] aliasColumns = columnRow["aliasColumns"].Split(new string[] { "," }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string alias in aliasColumns)
        {
            columnAliasToColumns.Add(alias.Trim(), columnName);
        }
    }

    // Retained columns in first-seen order; the set gives constant-time membership checks.
    List<string> propertyColumnNames = new List<string>();
    HashSet<string> knownColumnNames = new HashSet<string>();

    // ---- Pass 1: one output row per retained property ----
    ExcelWriter RenWuInfoExcelWriter = this.CreateRenWuRemainPropertyListWriter();
    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> listRow = listSheet.GetRow(i);
        bool giveUp = "Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]);
        string pageUrl = listRow[SysConfig.DetailPageUrlFieldName];
        string name = listRow["name"];
        if (giveUp)
        {
            continue;
        }

        HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        HtmlNodeCollection dtNodes = htmlDoc.DocumentNode.SelectNodes(dtXPath);
        if (dtNodes == null)
        {
            continue;
        }

        HashSet<string> pageKeys = new HashSet<string>();
        foreach (HtmlNode dtNode in dtNodes)
        {
            // NOTE(review): the three Replace calls look identical here; presumably they
            // targeted different whitespace characters - verify the file encoding.
            string pKey = CommonUtil.HtmlDecode(dtNode.InnerText).Trim().Replace(" ", "").Replace(" ", "").Replace(" ", "");
            string pValue = this.GetNextDDNodeText(dtNode);

            // Make the key unique within this page.
            int sameNamePKeyCount = 1;
            string newPKey = pKey;
            while (pageKeys.Contains(newPKey))
            {
                sameNamePKeyCount++;
                newPKey = pKey + "_" + sameNamePKeyCount.ToString();
            }
            pageKeys.Add(newPKey);

            // NOTE(review): a column is only registered when a page uses the retained column
            // name itself; a column reached solely through aliases is never registered here
            // although pass 2 fills it - confirm whether that is intended.
            if (retainedColumns.Contains(newPKey) && knownColumnNames.Add(newPKey))
            {
                propertyColumnNames.Add(newPKey);
            }

            string columnName;
            if (columnAliasToColumns.TryGetValue(newPKey, out columnName))
            {
                Dictionary<string, string> row = new Dictionary<string, string>();
                row.Add("name", name);
                row.Add("pKey", columnName);
                row.Add("pValue", pValue);
                row.Add("url", pageUrl);
                RenWuInfoExcelWriter.AddRow(row);
            }
        }
    }
    RenWuInfoExcelWriter.SaveToDisk();

    // ---- Pass 2: one output row per page, one column per retained property ----
    ExcelWriter RenWuColumnPropertyExcelWriter = this.CreateRenWuRemainColumnPropertyListWriter(propertyColumnNames);
    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> listRow = listSheet.GetRow(i);
        bool giveUp = "Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]);
        string pageUrl = listRow[SysConfig.DetailPageUrlFieldName];
        string name = listRow["name"];
        if (giveUp)
        {
            continue;
        }

        HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        HtmlNodeCollection dtNodes = htmlDoc.DocumentNode.SelectNodes(dtXPath);

        Dictionary<string, string> row = new Dictionary<string, string>();
        row.Add("name", name);
        row.Add("url", pageUrl);
        if (dtNodes != null)
        {
            HashSet<string> pageKeys = new HashSet<string>();
            foreach (HtmlNode dtNode in dtNodes)
            {
                string pKey = CommonUtil.HtmlDecode(dtNode.InnerText).Trim().Replace(" ", "").Replace(" ", "").Replace(" ", "");
                string pValue = this.GetNextDDNodeText(dtNode);

                int sameNamePKeyCount = 1;
                string newPKey = pKey;
                while (pageKeys.Contains(newPKey))
                {
                    sameNamePKeyCount++;
                    newPKey = pKey + "_" + sameNamePKeyCount.ToString();
                }
                pageKeys.Add(newPKey);

                string columnName;
                if (!columnAliasToColumns.TryGetValue(newPKey, out columnName))
                {
                    continue;
                }

                // Several page keys may map to the same column: join their values with ";".
                string existingValue;
                if (row.TryGetValue(columnName, out existingValue))
                {
                    row[columnName] = existingValue + ";" + pValue;
                }
                else
                {
                    row.Add(columnName, pValue);
                }
            }
        }
        // A row is written even for a page without any property node.
        RenWuColumnPropertyExcelWriter.AddRow(row);
    }
    RenWuColumnPropertyExcelWriter.SaveToDisk();
}
/// <summary>
/// Reads the comma-separated "properties" column of a source workbook, counts how often
/// each property occurs and how often pairs of properties occur in the same row, and
/// writes: a CSV matrix derived from the pair counts, a workbook of all property names
/// with counts, a workbook of the retained property names, and a text file holding the
/// matrix as an "arr = [[...]]" literal.
/// </summary>
/// <param name="listSheet">Not used by this method.</param>
/// <remarks>
/// this.Parameters is split on ","; element 0 is the source workbook path and element 1
/// is the destination path (also used as the prefix of the additional output files).
/// </remarks>
private void GetPropertiesMatrix(IListSheet listSheet)
{
    string[] separators = new string[] { "," };
    string[] args = this.Parameters.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    string sourceFilePath = args[0];
    string destFilePath = args[1];

    ExcelReader reader = new ExcelReader(sourceFilePath);
    int rowTotal = reader.GetRowCount();

    // Pass 1: count occurrences of every property, remembering first-seen order.
    Dictionary<string, int> allPropertyCountDic = new Dictionary<string, int>();
    List<string> allPropertyList = new List<string>();
    for (int rowIndex = 0; rowIndex < rowTotal; rowIndex++)
    {
        Dictionary<string, string> sourceRow = reader.GetFieldValues(rowIndex);
        string[] rowProperties = sourceRow["properties"].Split(separators, StringSplitOptions.RemoveEmptyEntries);
        foreach (string property in rowProperties)
        {
            int seen;
            if (allPropertyCountDic.TryGetValue(property, out seen))
            {
                allPropertyCountDic[property] = seen + 1;
            }
            else
            {
                allPropertyList.Add(property);
                allPropertyCountDic.Add(property, 1);
            }
        }
    }

    // Rare properties are ignored. The original note spoke of "2 or fewer" occurrences,
    // but the threshold in code is 6: a property is kept only when its count exceeds 6.
    int ignoreNum = 6;
    List<string> propertyList = new List<string>();
    Dictionary<string, bool> propertyListDic = new Dictionary<string, bool>();
    foreach (string property in allPropertyList)
    {
        if (allPropertyCountDic[property] <= ignoreNum)
        {
            continue;
        }
        propertyList.Add(property);
        propertyListDic.Add(property, true);
    }

    // Pass 2: for every retained property, count how often each other retained property
    // appears in the same row. maxTime tracks the largest pair count seen.
    int maxTime = 1;
    Dictionary<string, Dictionary<string, int>> pToPDic = new Dictionary<string, Dictionary<string, int>>();
    for (int rowIndex = 0; rowIndex < rowTotal; rowIndex++)
    {
        Dictionary<string, string> sourceRow = reader.GetFieldValues(rowIndex);
        string[] rowProperties = sourceRow["properties"].Split(separators, StringSplitOptions.RemoveEmptyEntries);
        foreach (string fromItemProperty in rowProperties)
        {
            if (!propertyListDic.ContainsKey(fromItemProperty))
            {
                continue;
            }

            Dictionary<string, int> pairCounts;
            if (!pToPDic.TryGetValue(fromItemProperty, out pairCounts))
            {
                pairCounts = new Dictionary<string, int>();
                pToPDic.Add(fromItemProperty, pairCounts);
            }

            // The property's own entry counts its occurrences; it never feeds maxTime.
            int selfCount;
            if (pairCounts.TryGetValue(fromItemProperty, out selfCount))
            {
                pairCounts[fromItemProperty] = selfCount + 1;
            }
            else
            {
                pairCounts.Add(fromItemProperty, 1);
            }

            foreach (string toItemProperty in rowProperties)
            {
                if (!propertyListDic.ContainsKey(toItemProperty) || fromItemProperty == toItemProperty)
                {
                    continue;
                }

                int together;
                if (!pairCounts.TryGetValue(toItemProperty, out together))
                {
                    pairCounts.Add(toItemProperty, 1);
                    continue;
                }

                together++;
                pairCounts[toItemProperty] = together;
                if (together > maxTime)
                {
                    maxTime = together;
                }
            }
        }
    }

    // CSV matrix: first column "pToP" holds the row property, then one column per property.
    Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
    resultColumnDic.Add("pToP", 0);
    for (int columnIndex = 0; columnIndex < propertyList.Count; columnIndex++)
    {
        resultColumnDic.Add(propertyList[columnIndex], columnIndex + 1);
    }

    CsvWriter propertyMatrixCW = new CsvWriter(destFilePath, resultColumnDic);
    foreach (string fromProperty in propertyList)
    {
        Dictionary<string, string> matrixRow = new Dictionary<string, string>();
        matrixRow.Add("pToP", fromProperty);

        // TryGetValue leaves pairCounts null when the property has no entry.
        Dictionary<string, int> pairCounts;
        pToPDic.TryGetValue(fromProperty, out pairCounts);

        foreach (string toProperty in propertyList)
        {
            // 0 on the diagonal, 2 * maxTime for pairs never seen together,
            // otherwise maxTime divided by the pair count.
            double value;
            int together;
            if (fromProperty == toProperty)
            {
                value = 0;
            }
            else if (pairCounts == null || !pairCounts.TryGetValue(toProperty, out together) || together == 0)
            {
                value = 2 * (double)maxTime;
            }
            else
            {
                value = (double)maxTime / (double)together;
            }
            matrixRow.Add(toProperty, value.ToString());
        }
        propertyMatrixCW.AddRow(matrixRow);
    }
    propertyMatrixCW.SaveToDisk();

    // Workbook listing every property with its occurrence count.
    string allPropertyNameFilePath = destFilePath + "_AllPropertyName.xlsx";
    Dictionary<string, int> allPropertyNameColumnDic = new Dictionary<string, int>();
    allPropertyNameColumnDic.Add("name", 0);
    allPropertyNameColumnDic.Add("count", 1);
    Dictionary<string, string> allPropertyNameColumnFormats = new Dictionary<string, string>();
    allPropertyNameColumnFormats.Add("count", "#0");
    ExcelWriter allPropertyNameEW = new ExcelWriter(allPropertyNameFilePath, "List", allPropertyNameColumnDic, allPropertyNameColumnFormats);
    foreach (string property in allPropertyList)
    {
        Dictionary<string, object> countRow = new Dictionary<string, object>();
        countRow.Add("name", property);
        countRow.Add("count", allPropertyCountDic[property]);
        allPropertyNameEW.AddRow(countRow);
    }
    allPropertyNameEW.SaveToDisk();

    // Workbook listing only the retained property names.
    string propertyNameFilePath = destFilePath + "_PropertyName.xlsx";
    Dictionary<string, int> propertyNameColumnDic = new Dictionary<string, int>();
    propertyNameColumnDic.Add("name", 0);
    ExcelWriter propertyNameEW = new ExcelWriter(propertyNameFilePath, "List", propertyNameColumnDic);
    foreach (string property in propertyList)
    {
        Dictionary<string, string> nameRow = new Dictionary<string, string>();
        nameRow.Add("name", property);
        propertyNameEW.AddRow(nameRow);
    }
    propertyNameEW.SaveToDisk();

    // Text file with the same matrix written as a nested array literal.
    string propertyArrayFilePath = destFilePath + "_Array.txt";
    StringBuilder propertyArrayStringBuilder = new StringBuilder();
    propertyArrayStringBuilder.Append("arr = [");
    for (int fromIndex = 0; fromIndex < propertyList.Count; fromIndex++)
    {
        string fromProperty = propertyList[fromIndex];
        string rowPrefix = fromIndex == 0 ? "" : ", \r\n";
        propertyArrayStringBuilder.Append(rowPrefix + "[");

        Dictionary<string, string> matrixRow = new Dictionary<string, string>();
        matrixRow.Add("pToP", fromProperty);

        Dictionary<string, int> pairCounts;
        pToPDic.TryGetValue(fromProperty, out pairCounts);

        for (int toIndex = 0; toIndex < propertyListDic.Count; toIndex++)
        {
            string toProperty = propertyList[toIndex];
            double value;
            int together;
            if (fromProperty == toProperty)
            {
                value = 0;
            }
            else if (pairCounts == null || !pairCounts.TryGetValue(toProperty, out together) || together == 0)
            {
                value = 2 * (double)maxTime;
            }
            else
            {
                value = (double)maxTime / (double)together;
            }
            matrixRow.Add(toProperty, value.ToString());

            string cellPrefix = toIndex == 0 ? "" : ", ";
            propertyArrayStringBuilder.Append(cellPrefix + value.ToString());
        }

        // NOTE(review): these rows are added after propertyMatrixCW.SaveToDisk() was
        // already called above; whether they are ever persisted depends on CsvWriter - confirm.
        propertyMatrixCW.AddRow(matrixRow);
        propertyArrayStringBuilder.Append("]");
    }
    propertyArrayStringBuilder.Append("]");
    FileHelper.SaveTextToFile(propertyArrayStringBuilder.ToString(), propertyArrayFilePath);
}
/// <summary>
/// Post-grab hook: hands the list sheet to GetWordPageUrls.
/// </summary>
/// <param name="listSheet">The list sheet passed on to GetWordPageUrls.</param>
/// <returns>Always true.</returns>
public override bool AfterAllGrab(IListSheet listSheet)
{
    this.GetWordPageUrls(listSheet);

    return true;
}
/// <summary>
/// Post-grab hook: hands the list sheet to GetImage.
/// </summary>
/// <param name="listSheet">The list sheet passed on to GetImage.</param>
/// <returns>Always true.</returns>
public override bool AfterAllGrab(IListSheet listSheet)
{
    this.GetImage(listSheet);

    return true;
}
/// <summary>
/// Extracts shop details (level, address, coordinates, offered service items) from each
/// row's locally stored detail page and appends one result row per list-sheet row.
/// </summary>
/// <param name="listSheet">Rows providing the page URL and the province/city/shop codes and names.</param>
/// <param name="pageSourceDir">Directory passed to RunPage.GetFilePath for the page URL.</param>
/// <param name="resultEW">Writer that receives one row per processed list-sheet row.</param>
/// <remarks>
/// Coordinates are read from the text following "Position: '" inside the
/// "scriptSection" div, in the form lng,lat terminated by "',". When the marker or
/// its delimiters are missing, lng and lat are left null instead of parsing text from
/// an arbitrary offset. A malformed number still throws from decimal.Parse.
/// </remarks>
private void GetShopDetail(IListSheet listSheet, string pageSourceDir, ExcelWriter resultEW)
{
    const string positionMarker = "Position: '";

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        // Original author's note: the list sheet holds only one record.
        Dictionary<string, string> row = listSheet.GetRow(i);
        string pageUrl = row[SysConfig.DetailPageUrlFieldName];
        string provinceCode = row["provinceCode"];
        string provinceName = row["provinceName"];
        string cityCode = row["cityCode"];
        string cityName = row["cityName"];
        string shopCode = row["shopCode"];
        string shopName = row["shopName"];
        string level = "";
        string address = "";
        Nullable<decimal> lng = null;
        Nullable<decimal> lat = null;
        string serviceItems = "";

        // Kept from the original; the resulting path is not used below.
        string localFilePath = this.RunPage.GetFilePath(pageUrl, pageSourceDir);
        HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

        HtmlNode levelNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@class=\"shop-level\"]/span[1]");
        if (levelNode != null)
        {
            level = levelNode.InnerText;
        }

        HtmlNode addressNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@class=\"address clearfix\"]/div[@id=\"submitbtns\"]/span");
        if (addressNode != null)
        {
            address = addressNode.InnerText;
        }

        HtmlNode scriptNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@id=\"scriptSection\"]");
        if (scriptNode != null)
        {
            string script = scriptNode.InnerText;
            int markerIndex = script.IndexOf(positionMarker);
            if (markerIndex >= 0)
            {
                int lngBeginIndex = markerIndex + positionMarker.Length;
                int lngEndIndex = script.IndexOf(",", lngBeginIndex);
                int latBeginIndex = lngEndIndex + 1;
                int latEndIndex = lngEndIndex < 0 ? -1 : script.IndexOf("',", latBeginIndex);

                // Only parse when both delimiters were actually found.
                if (lngEndIndex >= 0 && latEndIndex >= 0)
                {
                    lng = decimal.Parse(script.Substring(lngBeginIndex, lngEndIndex - lngBeginIndex));
                    lat = decimal.Parse(script.Substring(latBeginIndex, latEndIndex - latBeginIndex));
                }
            }
        }

        // Service items: every <li> except those whose class is exactly "not-have",
        // each followed by ";".
        HtmlNodeCollection allServiceItemNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"sever-xm\"]/ul/li");
        if (allServiceItemNodes != null)
        {
            StringBuilder serviceItemSB = new StringBuilder();
            foreach (HtmlNode serviceNode in allServiceItemNodes)
            {
                if (!serviceNode.Attributes.Contains("class") || serviceNode.Attributes["class"].Value != "not-have")
                {
                    serviceItemSB.Append(serviceNode.InnerText.Trim() + ";");
                }
            }
            serviceItems = serviceItemSB.ToString();
        }

        Dictionary<string, object> f2vs = new Dictionary<string, object>();
        f2vs.Add("provinceCode", provinceCode);
        f2vs.Add("provinceName", provinceName);
        f2vs.Add("cityCode", cityCode);
        f2vs.Add("cityName", cityName);
        f2vs.Add("shopCode", shopCode);
        f2vs.Add("shopName", shopName);
        f2vs.Add("level", level);
        f2vs.Add("address", address);
        f2vs.Add("lng", lng);
        f2vs.Add("lat", lat);
        f2vs.Add("serviceItems", serviceItems);
        resultEW.AddRow(f2vs);
    }
}
/// <summary>
/// Generates the list of vehicle-information URLs to grab: scans each locally stored
/// list page, extracts the registration id from the fourth cell's link of every data
/// row, and writes one de-duplicated detail URL per id to "&lt;project&gt;_车辆信息.xlsx".
/// </summary>
/// <param name="listSheet">Rows providing "detailPageUrl" and "cookie".</param>
/// <returns>Always true; per-page failures are logged, not propagated.</returns>
private bool GenerateCLXX(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();
    Dictionary<string, int> clxxColumnDic = CommonUtil.InitStringIndexDic(new string[] { "detailPageUrl", "detailPageName", "cookie", "grabStatus", "giveUpGrab" });
    string clxxPath = Path.Combine(exportDir, this.RunPage.Project.Name + "_车辆信息.xlsx");
    ExcelWriter clxxEW = new ExcelWriter(clxxPath, "List", clxxColumnDic, null);

    // Kept from the original; the index itself is not used below.
    int detailUrlColumnIndex = this.RunPage.ColumnNameToIndex["detailPageUrl"];

    // Registration ids already emitted (used as a set).
    Dictionary<string, string> rIdToNull = new Dictionary<string, string>();

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        string detailUrl = row["detailPageUrl"];
        string cookie = row["cookie"];
        string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
        try
        {
            // The reader is disposed as soon as the page text is read. Previously it was
            // only disposed when an exception occurred, leaking a handle per page.
            string webPageHtml;
            using (TextReader tr = new StreamReader(localFilePath, Encoding.GetEncoding(((Proj_Detail_SingleLine)this.RunPage.Project.DetailGrabInfoObject).Encoding)))
            {
                webPageHtml = tr.ReadToEnd();
            }

            HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
            htmlDoc.LoadHtml(webPageHtml);

            // A page without the grid makes SelectNodes return null; the resulting
            // exception is logged by the catch below, as before.
            HtmlNodeCollection listTrNodeList = htmlDoc.DocumentNode.SelectNodes("//table[@class=\"GridView\"]/tr");

            // Row 0 is skipped (presumably the header row - confirm against the page).
            for (int j = 1; j < listTrNodeList.Count; j++)
            {
                HtmlNodeCollection vNodeList = listTrNodeList[j].SelectNodes("./td");
                string clickUrl = vNodeList[3].SelectSingleNode("./span/a").GetAttributeValue("onclick", "");

                // The id is the text between the first "=" and the last "'".
                int equalsIndex = clickUrl.IndexOf("=");
                string rId = clickUrl.Substring(equalsIndex + 1, clickUrl.LastIndexOf("'") - equalsIndex - 1);
                if (rIdToNull.ContainsKey(rId))
                {
                    continue;
                }
                rIdToNull.Add(rId, "");

                string pageUrl = "http://218.56.62.250/hnts/VehicleGas/VehicleGasView.aspx?RegId=" + rId;
                Dictionary<string, object> reportInfo = new Dictionary<string, object>();
                reportInfo.Add("detailPageUrl", pageUrl);
                reportInfo.Add("detailPageName", rId);
                reportInfo.Add("cookie", cookie);
                clxxEW.AddRow(reportInfo);
            }
        }
        catch (Exception ex)
        {
            // Best effort: a page that cannot be read or parsed is logged and skipped.
            this.RunPage.InvokeAppendLogText("读取出错. " + ex.Message + " LocalPath = " + localFilePath, LogLevelType.Error, true);
        }
    }

    clxxEW.SaveToDisk();
    return true;
}
/// <summary>
/// Builds the per-city air-quality report: parses the data table of every locally stored
/// page, maps cells to columns by the header row's text, and writes one row per
/// (cityCode, date) pair to "&lt;project&gt;_List.xlsx".
/// </summary>
/// <param name="listSheet">Rows providing "detailPageUrl", "cityCode" and "cityName".</param>
/// <returns>Always true; per-page failures are logged, not propagated.</returns>
private bool GenerateCityReport(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();
    Dictionary<string, int> cityReportColumnDic = CommonUtil.InitStringIndexDic(new string[] { "cityCode", "city", "日期", "AQI指数", "质量等级", "当天AQI排名", "PM2.5", "PM10", "Co", "No2", "So2", "O3" });
    string cityReportPath = Path.Combine(exportDir, this.RunPage.Project.Name + "_List.xlsx");

    Dictionary<string, string> columnFormats = new Dictionary<string, string>();
    columnFormats.Add("日期", "yyyy-m-d");
    columnFormats.Add("AQI指数", "#0");
    columnFormats.Add("当天AQI排名", "#0");
    columnFormats.Add("PM2.5", "#0");
    columnFormats.Add("PM10", "#0");
    columnFormats.Add("Co", "#0.00");
    columnFormats.Add("No2", "#0");
    columnFormats.Add("So2", "#0");
    columnFormats.Add("O3", "#0");
    ExcelWriter cityReportEW = new ExcelWriter(cityReportPath, "List", cityReportColumnDic, columnFormats);

    // Kept from the original; the index itself is not used below.
    int detailUrlColumnIndex = this.RunPage.ColumnNameToIndex["detailPageUrl"];

    // "cityCode_yyyy-MM-dd" keys already written (used as a set).
    Dictionary<string, string> codeDateToNull = new Dictionary<string, string>();
    string sourceDateFormat = "yyyy-MM-dd";

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        string detailUrl = row["detailPageUrl"];
        string cityCode = row["cityCode"];
        string city = row["cityName"];
        string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
        try
        {
            // The reader is disposed as soon as the page text is read. Previously it was
            // only disposed when an exception occurred, leaking a handle per page.
            string webPageHtml;
            using (TextReader tr = new StreamReader(localFilePath, Encoding.GetEncoding(((Proj_Detail_SingleLine)this.RunPage.Project.DetailGrabInfoObject).Encoding)))
            {
                webPageHtml = tr.ReadToEnd();
            }

            HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
            htmlDoc.LoadHtml(webPageHtml);

            // A page without the table makes SelectNodes return null; the resulting
            // exception is logged by the catch below, as before.
            HtmlNodeCollection listDivNodeList = htmlDoc.DocumentNode.SelectNodes("//*[@id=\"content\"]/div[3]/table[1]/tr");
            if (listDivNodeList.Count > 1)
            {
                // The first row supplies the column names, by cell position.
                Dictionary<int, string> cityReportColumnIndexDic = new Dictionary<int, string>();
                HtmlNodeCollection nameNodes = listDivNodeList[0].SelectNodes("td");
                for (int j = 0; j < nameNodes.Count; j++)
                {
                    cityReportColumnIndexDic.Add(j, nameNodes[j].InnerText.Trim());
                }

                for (int j = 1; j < listDivNodeList.Count; j++)
                {
                    HtmlNodeCollection vNodeList = listDivNodeList[j].SelectNodes("./td");
                    Dictionary<string, object> reportInfo = new Dictionary<string, object>();
                    reportInfo.Add("cityCode", cityCode);
                    reportInfo.Add("city", city);

                    for (int k = 0; k < nameNodes.Count; k++)
                    {
                        string value = vNodeList[k].InnerText.Trim();
                        string columName = cityReportColumnIndexDic[k];
                        switch (columName)
                        {
                            case "日期":
                                DateTime dt = DateTime.ParseExact(value, sourceDateFormat, System.Globalization.CultureInfo.CurrentCulture);
                                reportInfo.Add(columName, dt);
                                break;
                            case "AQI指数":
                            case "当天AQI排名":
                            case "PM2.5":
                            case "PM10":
                            case "Co":
                            case "No2":
                            case "So2":
                                reportInfo.Add(columName, decimal.Parse(value));
                                break;
                            default:
                                // Everything else (including "O3") is stored as text.
                                reportInfo.Add(columName, value);
                                break;
                        }
                    }

                    // Only the first row seen for a given city and date is written.
                    string codeDate = cityCode + "_" + ((DateTime)reportInfo["日期"]).ToString("yyyy-MM-dd");
                    if (!codeDateToNull.ContainsKey(codeDate))
                    {
                        cityReportEW.AddRow(reportInfo);
                        codeDateToNull.Add(codeDate, null);
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: a page that cannot be read or parsed is logged and skipped.
            this.RunPage.InvokeAppendLogText("读取出错. " + ex.Message + " LocalPath = " + localFilePath, LogLevelType.Error, true);
        }
    }

    cityReportEW.SaveToDisk();
    return true;
}
/// <summary>
/// Post-grab hook: runs GenerateDAGLJL and, only if that returns true, GenerateCLXX.
/// </summary>
/// <param name="listSheet">The list sheet passed to both generators.</param>
/// <returns>True only when both generators return true.</returns>
public override bool AfterAllGrab(IListSheet listSheet)
{
    if (!GenerateDAGLJL(listSheet))
    {
        return false;
    }

    return GenerateCLXX(listSheet);
}
/// <summary>
/// Collects product detail-page URLs from the locally stored category pages and writes
/// them, de-duplicated by "city_productCode", to "&lt;project&gt;_AllDetailPageUrl.xlsx";
/// then starts the follow-up task with that file.
/// </summary>
/// <param name="listSheet">Rows providing the page URL, category code/name, cookie and city.</param>
/// <returns>Always true.</returns>
/// <remarks>
/// A page that fails to parse is logged and the exception is rethrown with its original
/// stack trace.
/// </remarks>
private bool GetAllDetailPageUrl(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();
    string[] resultColumns = new string[] { "detailPageUrl", "detailPageName", "cookie", "grabStatus", "giveUpGrab", "productCode", "productName", "productCurrentPrice", "productOldPrice", "categoryCode", "categoryName", "standard", "city" };
    Dictionary<string, int> resultColumnDic = CommonUtil.InitStringIndexDic(resultColumns);
    string resultFilePath = Path.Combine(exportDir, this.RunPage.Project.Name + "_AllDetailPageUrl.xlsx");
    ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic);
    string detailPageUrlPrefix = "http://www.fruitday.com";

    // "city_productCode" names already written (used as a set).
    Dictionary<string, string> allProductCodes = new Dictionary<string, string>();
    string detailPageUrlColumnName = SysConfig.DetailPageUrlFieldName;

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        bool giveUp = "Y".Equals(row[SysConfig.GiveUpGrabFieldName]);
        if (giveUp)
        {
            continue;
        }

        string url = row[detailPageUrlColumnName];
        string categoryCode = row["categoryCode"];
        string categoryName = row["categoryName"];
        string cookie = row["cookie"];
        string city = row["city"];
        string localFilePath = this.RunPage.GetFilePath(url, pageSourceDir);
        try
        {
            HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
            HtmlNodeCollection allItemNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"leftpart pull-left\"]/ul/li");
            if (allItemNodes == null)
            {
                continue;
            }

            foreach (HtmlNode itemNode in allItemNodes)
            {
                string productName = "";
                string productCurrentPrice = "";
                // Never populated by this parser; written as an empty string.
                string productOldPrice = "";
                string standard = "";

                HtmlNode urlNode = itemNode.SelectSingleNode("./div/div[@class=\"s-img\"]/a");
                string detailPageUrl = detailPageUrlPrefix + urlNode.Attributes["href"].Value;

                // The product code is the last path segment of the link.
                string productCode = detailPageUrl.Substring(detailPageUrl.LastIndexOf("/") + 1);

                HtmlNodeCollection propertyNodes = itemNode.SelectSingleNode("./div/div[@class=\"s-info clearfix\"]").ChildNodes;
                foreach (HtmlNode propertyNode in propertyNodes)
                {
                    if (propertyNode.NodeType == HtmlNodeType.Text)
                    {
                        // The last text child wins, as in the original loop.
                        productName = propertyNode.InnerText.Trim();
                    }
                    else if (propertyNode.Attributes.Contains("class") && propertyNode.Attributes["class"].Value == "s-unit pull-right font-color")
                    {
                        // Drops the first character of the price text
                        // (presumably a currency symbol - confirm against the page).
                        string priceStr = propertyNode.InnerText.Trim();
                        productCurrentPrice = priceStr.Substring(1);
                    }
                }

                HtmlNode standardNode = itemNode.SelectSingleNode("./div/div[@class=\"p-operate clearfix\"]");
                if (standardNode != null)
                {
                    standard = standardNode.InnerText.Trim();
                }

                string detailPageName = city + "_" + productCode;
                if (allProductCodes.ContainsKey(detailPageName))
                {
                    continue;
                }
                allProductCodes.Add(detailPageName, null);

                Dictionary<string, string> p2vs = new Dictionary<string, string>();
                p2vs.Add("detailPageUrl", detailPageUrl + "?city=" + city);
                p2vs.Add("detailPageName", detailPageName);
                p2vs.Add("city", city);
                p2vs.Add("cookie", cookie);
                p2vs.Add("productCode", productCode);
                p2vs.Add("productName", productName);
                p2vs.Add("productCurrentPrice", productCurrentPrice);
                p2vs.Add("productOldPrice", productOldPrice);
                p2vs.Add("categoryCode", categoryCode);
                p2vs.Add("categoryName", categoryName);
                p2vs.Add("standard", standard);
                resultEW.AddRow(p2vs);
            }
        }
        catch (Exception ex)
        {
            this.RunPage.InvokeAppendLogText("读取出错. " + ex.Message + " LocalPath = " + localFilePath, LogLevelType.Error, true);
            // "throw;" keeps the original stack trace; "throw ex;" would reset it.
            throw;
        }
    }

    resultEW.SaveToDisk();

    // Run the follow-up task on the file just written.
    TaskManager.StartTask("易果", "天天果园获取所有详情页", resultFilePath, null, null, false);
    return true;
}
/// <summary>
/// Generates the archive-management records workbook: copies cells 1-9 of every data
/// row in each locally stored list page's grid into "&lt;project&gt;_档案管理记录.xlsx".
/// </summary>
/// <param name="listSheet">Rows providing "detailPageUrl".</param>
/// <returns>Always true; per-page failures are logged, not propagated.</returns>
private bool GenerateDAGLJL(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

    // Output columns, in the same order as table cells 1..9.
    string[] columnNames = new string[] { "区划", "录入单位", "登记证编号", "车牌号码", "安装数量", "使用单位", "安装日期", "登记日期", "状态" };
    Dictionary<string, int> dagljlColumnDic = CommonUtil.InitStringIndexDic(columnNames);
    string dagljlPath = Path.Combine(exportDir, this.RunPage.Project.Name + "_档案管理记录.xlsx");
    Dictionary<string, string> columnFormats = new Dictionary<string, string>();
    ExcelWriter cityReportEW = new ExcelWriter(dagljlPath, "List", dagljlColumnDic, columnFormats);

    // Kept from the original; the index itself is not used below.
    int detailUrlColumnIndex = this.RunPage.ColumnNameToIndex["detailPageUrl"];

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        string detailUrl = row["detailPageUrl"];
        string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
        try
        {
            // The reader is disposed as soon as the page text is read. Previously it was
            // only disposed when an exception occurred, leaking a handle per page.
            string webPageHtml;
            using (TextReader tr = new StreamReader(localFilePath, Encoding.GetEncoding(((Proj_Detail_SingleLine)this.RunPage.Project.DetailGrabInfoObject).Encoding)))
            {
                webPageHtml = tr.ReadToEnd();
            }

            HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
            htmlDoc.LoadHtml(webPageHtml);

            // A page without the grid makes SelectNodes return null; the resulting
            // exception is logged by the catch below, as before.
            HtmlNodeCollection listTrNodeList = htmlDoc.DocumentNode.SelectNodes("//table[@class=\"GridView\"]/tr");

            // Row 0 is skipped (presumably the header row - confirm against the page).
            for (int j = 1; j < listTrNodeList.Count; j++)
            {
                HtmlNodeCollection vNodeList = listTrNodeList[j].SelectNodes("./td");
                Dictionary<string, object> reportInfo = new Dictionary<string, object>();
                for (int c = 0; c < columnNames.Length; c++)
                {
                    // Cell 0 is not exported; column c maps to cell c + 1.
                    reportInfo.Add(columnNames[c], vNodeList[c + 1].InnerText.Trim());
                }
                cityReportEW.AddRow(reportInfo);
            }
        }
        catch (Exception ex)
        {
            // Best effort: a page that cannot be read or parsed is logged and skipped.
            this.RunPage.InvokeAppendLogText("读取出错. " + ex.Message + " LocalPath = " + localFilePath, LogLevelType.Error, true);
        }
    }

    cityReportEW.SaveToDisk();
    return true;
}
/// <summary>
/// Collects the links found under div.baseinfo on each locally stored page and writes
/// one row per distinct link, together with the community/city/area fields of the list
/// row it was found on. Output rolls over to a new writer (obtained from GetExcelWriter
/// with an incrementing file index) once the current one holds more than 500000 rows.
/// </summary>
/// <param name="listSheet">Rows providing the give-up flag and the community/city/area fields.</param>
/// <remarks>
/// The first comma-separated element of this.Parameters is used as the city name passed
/// to GetExcelWriter. Exceptions propagate unchanged. An empty list sheet produces no
/// output instead of failing with a NullReferenceException.
/// </remarks>
private void GetXiaoquInfos(IListSheet listSheet)
{
    string[] paramterParts = this.Parameters.Split(new string[] { "," }, StringSplitOptions.RemoveEmptyEntries);
    string cityName = paramterParts[0];

    // Kept from the original; the directory is not used below.
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

    ExcelWriter resultEW = null;
    int fileIndex = 1;

    // Links already written (used as a set).
    Dictionary<string, string> fangLinkUrlDic = new Dictionary<string, string>();

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        // Start the first writer, or roll over when the current one is full.
        if (resultEW == null || resultEW.RowCount > 500000)
        {
            if (resultEW != null)
            {
                resultEW.SaveToDisk();
            }
            resultEW = this.GetExcelWriter(fileIndex, cityName);
            fileIndex++;
        }

        Dictionary<string, string> row = listSheet.GetRow(i);
        bool giveUp = "Y".Equals(row[SysConfig.GiveUpGrabFieldName]);
        if (giveUp)
        {
            continue;
        }

        HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        HtmlNodeCollection fangNodeList = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"baseinfo\"]/a");
        if (fangNodeList == null)
        {
            continue;
        }

        foreach (HtmlNode fangNode in fangNodeList)
        {
            string fangLinkUrl = fangNode.GetAttributeValue("href", "");
            if (fangLinkUrlDic.ContainsKey(fangLinkUrl))
            {
                continue;
            }
            fangLinkUrlDic.Add(fangLinkUrl, null);

            Dictionary<string, string> f2vs = new Dictionary<string, string>();
            f2vs.Add("detailPageUrl", fangLinkUrl);
            f2vs.Add("detailPageName", fangLinkUrl);
            f2vs.Add("xiaoquname", row["xiaoquName"]);
            f2vs.Add("xiaoquurl", row["xiaoquUrl"]);
            f2vs.Add("cityName", row["cityName"]);
            f2vs.Add("cityCode", row["cityCode"]);
            f2vs.Add("level1AreaName", row["level1AreaName"]);
            f2vs.Add("level1AreaCode", row["level1AreaCode"]);
            f2vs.Add("level2AreaCode", row["level2AreaCode"]);
            f2vs.Add("level2AreaName", row["level2AreaName"]);
            resultEW.AddRow(f2vs);
        }
    }

    // No writer exists when the list sheet had no rows.
    if (resultEW != null)
    {
        resultEW.SaveToDisk();
    }
}
/// <summary>
/// Expands each second-level category found in the locally stored JSON files into the
/// paged periodical-list URLs needed to cover it (20 items per page) and writes them to
/// "万方期刊_期刊列表.xlsx".
/// </summary>
/// <param name="listSheet">Rows providing "detailPageUrl", "cate1", "cateId1" and the give-up flag.</param>
/// <remarks>
/// Each stored file is parsed as a JSON array whose items expose "id", "showName" and
/// "count". The page count is the ceiling of count / 20, so a count that is an exact
/// multiple of 20 no longer yields a trailing extra page. Exceptions propagate with
/// their original stack trace.
/// </remarks>
private void GetPeriodicalListPageUrls(IListSheet listSheet)
{
    // Must match the pageSize value embedded in the URL below.
    const int pageSize = 20;

    String exportDir = this.RunPage.GetExportDir();
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

    Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
    resultColumnDic.Add("detailPageUrl", 0);
    resultColumnDic.Add("detailPageName", 1);
    resultColumnDic.Add("cookie", 2);
    resultColumnDic.Add("grabStatus", 3);
    resultColumnDic.Add("giveUpGrab", 4);
    resultColumnDic.Add("cate1", 5);
    resultColumnDic.Add("cateId1", 6);
    resultColumnDic.Add("cate2", 7);
    resultColumnDic.Add("cateId2", 8);
    resultColumnDic.Add("pageIndex", 9);
    string resultFilePath = Path.Combine(exportDir, "万方期刊_期刊列表.xlsx");
    ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, null);

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        string detailUrl = row["detailPageUrl"];
        string cate1 = row["cate1"];
        string cateId1 = row["cateId1"];
        bool giveUp = "Y".Equals(row[SysConfig.GiveUpGrabFieldName]);
        if (giveUp)
        {
            continue;
        }

        string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
        string pageFileText = FileHelper.GetTextFromFile(localFilePath);
        JArray itemJsonArray = JArray.Parse(pageFileText);
        for (int j = 0; j < itemJsonArray.Count; j++)
        {
            JObject itemJson = itemJsonArray[j] as JObject;
            string cateId2 = itemJson.GetValue("id").ToString();
            string cate2 = itemJson.GetValue("showName").ToString().Trim();
            int periodicalCount = int.Parse(itemJson.GetValue("count").ToString().Trim());

            // Ceiling division: 0 -> 0 pages, 1..20 -> 1 page, 21..40 -> 2 pages, ...
            int pageCount = periodicalCount <= 0 ? 0 : (periodicalCount + pageSize - 1) / pageSize;

            for (int k = 0; k < pageCount; k++)
            {
                string pageNumber = (k + 1).ToString();
                string newUrl = "http://www.wanfangdata.com.cn/perio/page.do?page=" + pageNumber + "&pageSize=20&selectOrder=affectoi&fmList=" + cateId2 + "&a_title=&core=&fromData=WF&included=&publishyear=&isfirst=";
                Dictionary<string, string> f2vs = new Dictionary<string, string>();
                f2vs.Add("detailPageUrl", newUrl);
                f2vs.Add("detailPageName", newUrl);
                f2vs.Add("cate1", cate1);
                f2vs.Add("cateId1", cateId1);
                f2vs.Add("cate2", cate2);
                f2vs.Add("cateId2", cateId2);
                f2vs.Add("pageIndex", pageNumber);
                resultEW.AddRow(f2vs);
            }
        }
    }

    resultEW.SaveToDisk();
}
/// <summary>
/// Entry point: delegates to GetShopList with the given parameters and list sheet.
/// </summary>
/// <param name="parameters">Parameter string forwarded to GetShopList.</param>
/// <param name="listSheet">List sheet forwarded to GetShopList.</param>
/// <returns>Whatever GetShopList returns.</returns>
public bool Run(string parameters, IListSheet listSheet)
{
    bool shopListResult = GetShopList(parameters, listSheet);
    return shopListResult;
}
/// <summary>
/// Entry point: delegates to GetAllDetailPageUrl for the given list sheet.
/// </summary>
/// <param name="parameters">Not used by this method.</param>
/// <param name="listSheet">List sheet forwarded to GetAllDetailPageUrl.</param>
/// <returns>Whatever GetAllDetailPageUrl returns.</returns>
public bool Run(string parameters, IListSheet listSheet)
{
    bool detailUrlResult = GetAllDetailPageUrl(listSheet);
    return detailUrlResult;
}
/// <summary>
/// Reads brand and current price from each row's locally stored product page and
/// appends one result row per non-abandoned list-sheet row.
/// </summary>
/// <param name="listSheet">Rows providing the page URL, give-up flag, product code/name and three category levels.</param>
/// <param name="pageSourceDir">Directory passed to RunPage.GetFilePath for the page URL.</param>
/// <param name="resultEW">Writer that receives the result rows.</param>
/// <remarks>
/// Two page layouts are probed for the brand and for the price. When no price node is
/// found the problem is logged, the local file is deleted, and the row is still written
/// with a price of 0. Any exception is logged and rethrown with its original stack trace.
/// </remarks>
private void GetList(IListSheet listSheet, string pageSourceDir, ExcelWriter resultEW)
{
    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        string pageUrl = row["detailPageUrl"];
        bool giveUpGrab = row["giveUpGrab"] == "Y";
        string localFilePath = this.RunPage.GetFilePath(pageUrl, pageSourceDir);
        string productCode = row["productCode"];
        string productName = row["productName"];
        string category1Code = row["category1Code"];
        string category2Code = row["category2Code"];
        string category3Code = row["category3Code"];
        string category1Name = row["category1Name"];
        string category2Name = row["category2Name"];
        string category3Name = row["category3Name"];
        string pinpai = "";
        decimal productCurrentPrice = 0;

        if (giveUpGrab)
        {
            continue;
        }

        try
        {
            // Read the whole page and release the handle right away, so the file can be
            // deleted below without closing the reader by hand.
            string webPageHtml;
            using (TextReader tr = new StreamReader(localFilePath))
            {
                webPageHtml = tr.ReadToEnd();
            }

            HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
            htmlDoc.LoadHtml(webPageHtml);

            // Layout 1: brand in a <dd> whose text starts with the brand label.
            HtmlNode dlParentNode = htmlDoc.DocumentNode.SelectSingleNode("//dl[@class=\"dl-proInfo\"]");
            if (dlParentNode != null)
            {
                // SelectNodes returns null when there is no match.
                HtmlNodeCollection ddNodes = dlParentNode.SelectNodes("./dd");
                if (ddNodes != null)
                {
                    foreach (HtmlNode pNode in ddNodes)
                    {
                        string pText = pNode.InnerText.Trim();
                        if (pText.StartsWith("品牌:"))
                        {
                            pinpai = pText.Substring(3);
                            break;
                        }
                    }
                }
            }

            // Layout 2: label in the first <span>, value in the second. When present,
            // this overrides the value found by layout 1.
            HtmlNode ulParentNode = htmlDoc.DocumentNode.SelectSingleNode("//ul[@class=\"depict-list fn-clear\"]");
            if (ulParentNode != null)
            {
                HtmlNodeCollection liNodes = ulParentNode.SelectNodes("./li");
                if (liNodes != null)
                {
                    foreach (HtmlNode pNode in liNodes)
                    {
                        HtmlNode pnNode = pNode.SelectSingleNode("./span[1]");
                        if (pnNode != null && pnNode.InnerText.Trim().StartsWith("品牌:"))
                        {
                            HtmlNode pvNode = pNode.SelectSingleNode("./span[2]");
                            pinpai = pvNode == null ? "" : pvNode.InnerText.Trim();
                            break;
                        }
                    }
                }
            }

            HtmlNode priceNode = htmlDoc.DocumentNode.SelectSingleNode("//font[@class=\"info-price\"]/b[@class=\"JS-control-price\"]");
            if (priceNode == null)
            {
                priceNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@id=\"J_product_value\"]/div/strong[@class=\"fn-rmb-num\"]");
            }

            if (priceNode != null)
            {
                productCurrentPrice = decimal.Parse(priceNode.InnerText.Trim());
            }
            else
            {
                // No price on the page: log it and delete the local copy.
                this.RunPage.InvokeAppendLogText("None price! url = " + pageUrl, LogLevelType.Error, true);
                File.Delete(localFilePath);
            }
        }
        catch (Exception ex)
        {
            this.RunPage.InvokeAppendLogText("读取出错. url = " + pageUrl + ". " + ex.Message, LogLevelType.Error, true);
            // "throw;" keeps the original stack trace; "throw ex;" would reset it.
            throw;
        }

        Dictionary<string, object> f2vs = new Dictionary<string, object>();
        f2vs.Add("商品编码", productCode);
        f2vs.Add("商品名称", productName);
        f2vs.Add("价格", productCurrentPrice);
        f2vs.Add("品牌", pinpai);
        f2vs.Add("一级分类", category1Name);
        f2vs.Add("二级分类", category2Name);
        f2vs.Add("三级分类", category3Name);
        f2vs.Add("url", pageUrl);
        f2vs.Add("一级分类编码", category1Code);
        f2vs.Add("二级分类编码", category2Code);
        f2vs.Add("三级分类编码", category3Code);
        resultEW.AddRow(f2vs);
    }
}
/// <summary>
/// Collects product detail-page URLs from the locally stored product-grid pages and
/// writes them, with the category fields of the originating list row, to
/// "&lt;project&gt;_AllDetailPageUrl.xlsx".
/// </summary>
/// <param name="listSheet">Rows providing the page URL, give-up flag, cookie and three category levels.</param>
/// <returns>Always true.</returns>
/// <remarks>
/// The product code is the text between the last "/" and the last "?" of the link.
/// Links where that span is empty or absent are skipped. A page that fails to parse is
/// logged and the exception is rethrown with its original stack trace.
/// </remarks>
private bool GetAllDetailPageUrl(IListSheet listSheet)
{
    const string detailPageUrlPrefix = "http://www.cityshop.com.cn";

    string exportDir = this.RunPage.GetExportDir();
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();
    Dictionary<string, int> resultColumnDic = CommonUtil.InitStringIndexDic(new string[] { "detailPageUrl", "detailPageName", "cookie", "grabStatus", "giveUpGrab", "productCode", "productName", "category1Code", "category2Code", "category3Code", "category1Name", "category2Name", "category3Name" });
    string resultFilePath = Path.Combine(exportDir, this.RunPage.Project.Name + "_AllDetailPageUrl.xlsx");
    ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic);
    string detailPageUrlColumnName = SysConfig.DetailPageUrlFieldName;

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        bool giveUp = "Y".Equals(row[SysConfig.GiveUpGrabFieldName]);
        if (giveUp)
        {
            continue;
        }

        string url = row[detailPageUrlColumnName];
        string category1Code = row["category1Code"];
        string category2Code = row["category2Code"];
        string category3Code = row["category3Code"];
        string category1Name = row["category1Name"];
        string category2Name = row["category2Name"];
        string category3Name = row["category3Name"];
        string cookie = row["cookie"];
        string localFilePath = this.RunPage.GetFilePath(url, pageSourceDir);
        try
        {
            HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
            HtmlNodeCollection itemNodes = htmlDoc.DocumentNode.SelectNodes("//ul[@class=\"row product-grid\"]/li");
            if (itemNodes == null)
            {
                continue;
            }

            foreach (HtmlNode itemNode in itemNodes)
            {
                HtmlNode nameNode = itemNode.SelectSingleNode("./div[2]/p[1]/a");
                string detailPageUrl = detailPageUrlPrefix + nameNode.Attributes["href"].Value;
                int startIndex = detailPageUrl.LastIndexOf("/") + 1;
                int endIndex = detailPageUrl.LastIndexOf("?");
                int length = endIndex - startIndex;

                // Original author's note: for gift-card products length == 0, and no
                // detail page needs to be fetched.
                if (length <= 0)
                {
                    continue;
                }

                string productCode = detailPageUrl.Substring(startIndex, length);
                string productName = nameNode.InnerText.Trim();

                Dictionary<string, string> p2vs = new Dictionary<string, string>();
                p2vs.Add("detailPageUrl", detailPageUrl);
                p2vs.Add("detailPageName", productCode);
                p2vs.Add("productCode", productCode);
                p2vs.Add("productName", productName);
                p2vs.Add("category1Code", category1Code);
                p2vs.Add("category2Code", category2Code);
                p2vs.Add("category3Code", category3Code);
                p2vs.Add("category1Name", category1Name);
                p2vs.Add("category2Name", category2Name);
                p2vs.Add("category3Name", category3Name);
                p2vs.Add("cookie", cookie);
                resultEW.AddRow(p2vs);
            }
        }
        catch (Exception ex)
        {
            this.RunPage.InvokeAppendLogText("读取出错. " + ex.Message + " LocalPath = " + localFilePath, LogLevelType.Error, true);
            // "throw;" keeps the original stack trace; "throw ex;" would reset it.
            throw;
        }
    }

    resultEW.SaveToDisk();
    return true;
}
/// <summary>
/// Exports one Excel row per non-abandoned list-sheet row to "大众点评店铺信息.xlsx" in the export
/// directory, copying the shop columns from the row and adding the latitude/longitude parsed
/// from the "shopGlat:" / "shopGlng:" entries in the locally saved page.
/// </summary>
/// <param name="listSheet">List sheet whose rows carry the shop columns and identify the saved pages.</param>
/// <returns>Always <c>true</c>; failures surface as exceptions with their original stack trace.</returns>
public override bool AfterAllGrab(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();

    Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
    resultColumnDic.Add("city", 0);
    resultColumnDic.Add("gName", 1);
    resultColumnDic.Add("rName", 2);
    resultColumnDic.Add("shopName", 3);
    resultColumnDic.Add("reviewNum", 4);
    resultColumnDic.Add("serviceRating", 5);
    resultColumnDic.Add("environmentRating", 6);
    resultColumnDic.Add("tasteRating", 7);
    resultColumnDic.Add("address", 8);
    resultColumnDic.Add("lat", 9);
    resultColumnDic.Add("lng", 10);
    string resultFilePath = Path.Combine(exportDir, "大众点评店铺信息.xlsx");

    // NOTE(review): these number formats are built but never handed to the writer, so they
    // currently have no effect. Presumably they were meant to be passed to ExcelWriter;
    // confirm whether a suitable overload exists before wiring them in.
    Dictionary<string, string> resultColumnFormat = new Dictionary<string, string>();
    resultColumnFormat.Add("reviewNum", "#,##0");
    resultColumnFormat.Add("lat", "#,##0.000000");
    resultColumnFormat.Add("lng", "#,##0.000000");
    resultColumnFormat.Add("serviceRating", "#,##0.00");
    resultColumnFormat.Add("environmentRating", "#,##0.0");
    resultColumnFormat.Add("tasteRating", "#,##0.0");

    ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic);

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);

        // Rows flagged "Y" in the give-up column are skipped.
        bool giveUp = "Y".Equals(row[SysConfig.GiveUpGrabFieldName]);
        if (giveUp)
        {
            continue;
        }

        HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        string pageText = pageHtmlDoc.DocumentNode.InnerHtml;

        // Coordinates stay null when the page has no parsable value for them.
        Nullable<decimal> lat = ParseQuotedDecimalAfterKey(pageText, "shopGlat:");
        Nullable<decimal> lng = ParseQuotedDecimalAfterKey(pageText, "shopGlng:");

        Dictionary<string, object> f2vs = new Dictionary<string, object>();
        f2vs.Add("city", row["city"]);
        f2vs.Add("gName", row["gName"]);
        f2vs.Add("rName", row["rName"]);
        f2vs.Add("shopName", row["shopName"]);
        f2vs.Add("reviewNum", row["reviewNum"]);
        f2vs.Add("serviceRating", row["serviceRating"]);
        f2vs.Add("environmentRating", row["environmentRating"]);
        f2vs.Add("tasteRating", row["tasteRating"]);
        f2vs.Add("address", row["address"]);
        f2vs.Add("lat", lat);
        f2vs.Add("lng", lng);
        resultEW.AddRow(f2vs);
    }

    resultEW.SaveToDisk();
    return true;
}

/// <summary>
/// Finds <paramref name="key"/> in <paramref name="pageText"/> and parses the text between the
/// next pair of double quotes as a decimal.
/// </summary>
/// <param name="pageText">Text to search.</param>
/// <param name="key">Marker that precedes the quoted value, e.g. "shopGlat:".</param>
/// <returns>
/// The parsed value, or <c>null</c> when the key or either quote is missing, the quoted text is
/// empty, or it does not parse as a decimal.
/// </returns>
private static Nullable<decimal> ParseQuotedDecimalAfterKey(string pageText, string key)
{
    int keyIndex = pageText.IndexOf(key, StringComparison.Ordinal);
    if (keyIndex < 0)
    {
        return null;
    }

    int openQuoteIndex = pageText.IndexOf('"', keyIndex);
    if (openQuoteIndex < 0)
    {
        return null;
    }

    // A difference of at most 1 means either no closing quote (-1) or an empty value ("").
    int closeQuoteIndex = pageText.IndexOf('"', openQuoteIndex + 1);
    if (closeQuoteIndex - openQuoteIndex <= 1)
    {
        return null;
    }

    string rawValue = pageText.Substring(openQuoteIndex + 1, closeQuoteIndex - openQuoteIndex - 1);
    decimal parsedValue;
    if (decimal.TryParse(rawValue, out parsedValue))
    {
        return parsedValue;
    }

    return null;
}
/// <summary>
/// For every non-abandoned row of the list sheet, parses the table rows of the locally saved
/// page and writes one CSV record per data row (a row whose first cell has
/// data-header="序号"), tagged with the list-sheet row's company id.
/// </summary>
/// <param name="listSheet">List sheet whose rows carry "companyId" and identify the saved pages.</param>
private void GetAllInfos(IListSheet listSheet)
{
    CsvWriter cw = this.GetCsvExcelWriter();

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        string companyId = row["companyId"];

        // Rows flagged "Y" in the give-up column are skipped.
        bool giveUp = "Y".Equals(row[SysConfig.GiveUpGrabFieldName]);
        if (giveUp)
        {
            continue;
        }

        HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        HtmlNodeCollection trNodeList = pageHtmlDoc.DocumentNode.SelectNodes("//table/tbody/tr");
        if (trNodeList == null)
        {
            continue;
        }

        foreach (HtmlNode trNode in trNodeList)
        {
            HtmlNodeCollection tdNodeList = trNode.SelectNodes("./td");
            if (tdNodeList == null || tdNodeList.Count == 0)
            {
                continue;
            }

            // Only rows whose first cell is the "序号" (index) column are data rows.
            if (tdNodeList[0].GetAttributeValue("data-header", "") != "序号")
            {
                continue;
            }

            // Exceptions are deliberately left to propagate unwrapped so the stack trace is kept.
            Dictionary<string, string> f2vs = new Dictionary<string, string>();
            f2vs.Add("CompanyId", companyId);
            f2vs.Add("项目编码", GetTrimmedCellText(tdNodeList, 1));
            f2vs.Add("项目名称", GetTrimmedCellText(tdNodeList, 2));
            f2vs.Add("项目属地", GetTrimmedCellText(tdNodeList, 3));
            f2vs.Add("项目类别", GetTrimmedCellText(tdNodeList, 4));
            f2vs.Add("建设单位", GetTrimmedCellText(tdNodeList, 5));
            cw.AddRow(f2vs);
        }
    }

    cw.SaveToDisk();
}

/// <summary>
/// Returns the trimmed inner text of the cell at <paramref name="index"/>, or an empty string
/// when the row has fewer cells than that.
/// </summary>
/// <param name="cells">Cells of one table row.</param>
/// <param name="index">Zero-based cell position.</param>
/// <returns>Trimmed cell text, or "" when the cell does not exist.</returns>
private static string GetTrimmedCellText(HtmlNodeCollection cells, int index)
{
    if (cells.Count <= index)
    {
        return "";
    }

    return cells[index].InnerText.Trim();
}