/// <summary>
/// Walks every downloaded category page referenced by <paramref name="listSheet"/> and
/// writes one row per sub-category link (link URL, category name, sub-category name)
/// to the result workbook in the export directory.
/// </summary>
/// <param name="listSheet">Rows describing the downloaded pages; rows whose give-up column equals "Y" are skipped.</param>
private void GetListPageUrls(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();

    Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
    resultColumnDic.Add("detailPageUrl", 0);
    resultColumnDic.Add("detailPageName", 1);
    resultColumnDic.Add("cookie", 2);
    resultColumnDic.Add("grabStatus", 3);
    resultColumnDic.Add("giveUpGrab", 4);
    resultColumnDic.Add("category", 5);
    resultColumnDic.Add("subCategory", 6);

    string resultFilePath = Path.Combine(exportDir, "美食天下_获取各小类菜谱列表页.xlsx");
    ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, null);

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        bool giveUp = "Y".Equals(row[SysConfig.GiveUpGrabFieldName]);
        if (giveUp)
        {
            continue;
        }

        // No catch-and-rethrow wrapper: exceptions propagate with their original stack trace.
        HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        HtmlNodeCollection categoryDivList = pageHtmlDoc.DocumentNode.SelectNodes("//div[@class=\"category_sub clear\"]");
        if (categoryDivList == null)
        {
            // SelectNodes yields null (not an empty collection) when nothing matches.
            continue;
        }

        foreach (HtmlNode categoryDiv in categoryDivList)
        {
            HtmlNode categoryNameNode = categoryDiv.SelectSingleNode("./h3");
            string categoryName = CommonUtil.HtmlDecode(categoryNameNode.InnerText).Trim();

            HtmlNodeCollection subCategoryNodeList = categoryDiv.SelectNodes("./ul/li/a");
            if (subCategoryNodeList == null)
            {
                // A category block without any links contributes no rows.
                continue;
            }

            foreach (HtmlNode subCategoryNode in subCategoryNodeList)
            {
                string subCategoryName = subCategoryNode.GetAttributeValue("title", "");
                string subCategoryPageUrl = subCategoryNode.GetAttributeValue("href", "");

                Dictionary<string, string> f2vs = new Dictionary<string, string>();
                f2vs.Add("detailPageUrl", subCategoryPageUrl);
                // The link URL doubles as the page name.
                f2vs.Add("detailPageName", subCategoryPageUrl);
                f2vs.Add("category", categoryName);
                f2vs.Add("subCategory", subCategoryName);
                resultEW.AddRow(f2vs);
            }
        }
    }

    resultEW.SaveToDisk();
}
/// <summary>
/// For every row of <paramref name="listSheet"/>, opens the locally stored page and emits
/// one result row per link found under the "info_cate clearfix" block. Each result row
/// carries the absolute link URL, the link text and the source row's "shiShu" and
/// "leiXing" values. The result writer is saved once all rows are processed.
/// </summary>
/// <param name="listSheet">Sheet whose rows identify the downloaded pages.</param>
private void GetShiShuDetailPageUrls(IListSheet listSheet)
{
    // The directory value is not consumed below; the call is kept as in the original flow.
    string detailSourceDir = this.RunPage.GetDetailSourceFileDir();
    ExcelWriter writer = this.CreateResultWriter();

    for (int rowIdx = 0; rowIdx < listSheet.RowCount; rowIdx++)
    {
        Dictionary<string, string> sourceRow = listSheet.GetRow(rowIdx);
        HtmlAgilityPack.HtmlDocument page = this.RunPage.GetLocalHtmlDocument(listSheet, rowIdx);
        HtmlNodeCollection anchors = page.DocumentNode.SelectNodes("//div[@class=\"info_cate clearfix\"]/dl/dd/a");

        for (int a = 0; a < anchors.Count; a++)
        {
            HtmlNode anchor = anchors[a];
            string juanTitle = CommonUtil.HtmlDecode(anchor.InnerText).Trim();
            // The href is prefixed with the site root to form an absolute URL.
            string absoluteUrl = "http://www.guoxuedashi.com" + anchor.GetAttributeValue("href", "");

            writer.AddRow(new Dictionary<string, string>
            {
                { "detailPageUrl", absoluteUrl },
                { "detailPageName", absoluteUrl },
                { "shiShu", sourceRow["shiShu"] },
                { "leiXing", sourceRow["leiXing"] },
                { "juan", juanTitle }
            });
        }
    }

    writer.SaveToDisk();
}
/// <summary>
/// Extracts the two-level category list from each downloaded page: every h5 heading under
/// the "catDiv" block is a level-1 category, and the ul at the same position holds its
/// level-2 entries. One row (cat1Name, cat2Name, cat2Code) is added per level-2 entry.
/// </summary>
/// <param name="listSheet">Sheet whose rows identify the downloaded pages.</param>
/// <param name="pageSourceDir">Directory that holds the downloaded page files.</param>
/// <param name="resultEW">Writer that receives the extracted rows; it is not saved here.</param>
private void GetCats(IListSheet listSheet, string pageSourceDir, ExcelWriter resultEW)
{
    for (int rowIdx = 0; rowIdx < listSheet.RowCount; rowIdx++)
    {
        // Original author's note: listSheet contains only one record.
        string url = listSheet.PageUrlList[rowIdx];
        Dictionary<string, string> sheetRow = listSheet.GetRow(rowIdx);
        string localPath = this.RunPage.GetFilePath(url, pageSourceDir);

        HtmlAgilityPack.HtmlDocument page = this.RunPage.GetLocalHtmlDocument(listSheet, rowIdx);
        HtmlNodeCollection headings = page.DocumentNode.SelectNodes("//div[@id=\"catDiv\"]/div/h5");
        HtmlNodeCollection groups = page.DocumentNode.SelectNodes("//div[@id=\"catDiv\"]/div/ul");

        // Headings and groups are paired purely by position.
        for (int g = 0; g < headings.Count; g++)
        {
            HtmlNode heading = headings[g];
            HtmlNode group = groups[g];
            string level1Name = heading.InnerText.Trim();

            foreach (HtmlNode item in group.SelectNodes("./li"))
            {
                string level2Code = item.Attributes["catid"].Value;
                string level2Name = item.InnerText.Trim();

                resultEW.AddRow(new Dictionary<string, string>
                {
                    { "cat1Name", level1Name },
                    { "cat2Name", level2Name },
                    { "cat2Code", level2Code }
                });
            }
        }
    }
}
/// <summary>
/// Appends one record to the detail sheet. The new row first receives the non-blank values
/// of the matching list-sheet row for every column listed in
/// <paramref name="columnNameToIndex"/>, then the values in <paramref name="fieldValues"/>.
/// </summary>
/// <param name="listSheet">List sheet that supplies the source row.</param>
/// <param name="columnNameToIndex">Maps a column name to its cell index in the detail sheet.</param>
/// <param name="fieldValues">Extracted values keyed by column name; every key must exist in <paramref name="columnNameToIndex"/>.</param>
/// <param name="rowIndex">Index of the source row in <paramref name="listSheet"/>.</param>
/// <param name="pageUrl">URL the caller believes belongs to <paramref name="rowIndex"/>.</param>
/// <exception cref="Exception">The URL stored in the list row differs from <paramref name="pageUrl"/>.</exception>
public void SaveDetailFieldValue(IListSheet listSheet, Dictionary<string, int> columnNameToIndex, Dictionary<string, string> fieldValues, int rowIndex, string pageUrl)
{
    Dictionary<string, string> listRow = listSheet.GetRow(rowIndex);
    string urlCellValue = listRow[SysConfig.DetailPageUrlFieldName];

    // Refuse to write when the row and the URL disagree.
    if (urlCellValue != pageUrl)
    {
        throw new Exception("第" + rowIndex.ToString() + "行地址不匹配. Url_1 = " + pageUrl + ", Url_2 = " + urlCellValue);
    }

    IRow detailRow = this._DetailSheet.CreateRow(this._DetailSheet.LastRowNum + 1);

    foreach (KeyValuePair<string, int> column in columnNameToIndex)
    {
        string listValue;
        // The original condition was inverted: it copied a list value only when it was
        // null/blank, so real values never reached the detail row.
        // NOTE(review): relies on CommonUtil.IsNullOrBlank meaning "null or whitespace" - confirm.
        if (listRow.TryGetValue(column.Key, out listValue) && !CommonUtil.IsNullOrBlank(listValue))
        {
            detailRow.CreateCell(column.Value).SetCellValue(listValue);
        }
    }

    // Extracted values are written last.
    foreach (KeyValuePair<string, string> field in fieldValues)
    {
        int index = columnNameToIndex[field.Key];
        ICell cell = detailRow.CreateCell(index);
        cell.SetCellValue(field.Value);
    }
}
/// <summary>
/// Rebuilds <c>RunPage.NeedGrabIndexs</c> with the index of every detail page for which
/// <c>CheckNeedGrab</c> is true and <c>CheckGiveUpGrabPage</c> is false. A progress line
/// is logged after every 1000 pages examined.
/// </summary>
/// <param name="listSheet">Sheet whose rows correspond by index to the detail page URLs.</param>
/// <param name="sourceDir">Directory used to resolve each page's local file path.</param>
/// <returns>The number of indexes collected.</returns>
private int InitGrabDetailPageIndexList(IListSheet listSheet, string sourceDir)
{
    this.RunPage.NeedGrabIndexs = new List<int>();
    this.RunPage.InvokeAppendLogText("开始统计需要下载的页面.", LogLevelType.System, true);

    for (int pageIdx = 0; pageIdx < this.RunPage.DetailPageUrlList.Count; pageIdx++)
    {
        string url = this.RunPage.DetailPageUrlList[pageIdx];
        string localPath = this.RunPage.GetFilePath(url, sourceDir);

        // The give-up check runs only when the page needs grabbing (same short-circuit order).
        if (this.CheckNeedGrab(listSheet.GetRow(pageIdx), localPath))
        {
            bool givenUp = this.RunPage.CheckGiveUpGrabPage(listSheet, url, pageIdx);
            if (!givenUp)
            {
                this.RunPage.NeedGrabIndexs.Add(pageIdx);
            }
        }

        int examined = pageIdx + 1;
        if (examined % 1000 == 0)
        {
            double ratio = (double)examined / (double)this.RunPage.DetailPageUrlList.Count;
            this.RunPage.InvokeAppendLogText("正在统计需要下载的页面..." + ratio.ToString("#0.00%"), LogLevelType.System, true);
        }
    }

    this.RunPage.InvokeAppendLogText("完成统计需要下载的页面.", LogLevelType.System, true);
    return this.RunPage.NeedGrabIndexs.Count;
}
/// <summary>
/// Merges the first record of every downloaded CSV result file into a single workbook
/// ("fromName", "toCode" columns) in the export directory.
/// </summary>
/// <param name="listSheet">Rows identifying the downloaded files; rows whose give-up column equals "Y" are skipped.</param>
private void GetList(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

    Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
    resultColumnDic.Add("fromName", 0);
    resultColumnDic.Add("toCode", 1);

    string resultFilePath = Path.Combine(exportDir, "翻译结果.xlsx");
    ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, null);

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        bool giveUp = "Y".Equals(row[SysConfig.GiveUpGrabFieldName]);
        if (giveUp)
        {
            continue;
        }

        // The former "catch (Exception ex) { throw ex; }" wrapper only reset the stack
        // trace; exceptions now propagate untouched.
        string detailUrl = row["detailPageUrl"];
        string resultTextFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
        CsvReader csvReader = new CsvReader(resultTextFilePath);

        // Only record 0 of each file is taken.
        Dictionary<string, string> f2vs = csvReader.GetFieldValues(0);
        resultEW.AddRow(f2vs);
    }

    resultEW.SaveToDisk();
}
/// <summary>
/// Reads each downloaded level-1 area response, cuts out the JSON object between the first
/// "{" and the last "}", and adds one row per entry of its "data" array. Each row carries
/// the follow-up query URL for the level-2 area plus the level-1 and level-2 codes and names.
/// </summary>
/// <param name="listSheet">Sheet whose rows supply the page URL and the level-1 area code and name.</param>
/// <param name="pageSourceDir">Directory that holds the downloaded response files.</param>
/// <param name="resultEW">Writer that receives the extracted rows; it is not saved here.</param>
private void GetCities(IListSheet listSheet, string pageSourceDir, ExcelWriter resultEW)
{
    for (int rowIdx = 0; rowIdx < listSheet.RowCount; rowIdx++)
    {
        string url = listSheet.PageUrlList[rowIdx];
        Dictionary<string, string> sheetRow = listSheet.GetRow(rowIdx);
        string parentCode = sheetRow["areaLevel1Code"];
        string parentName = sheetRow["areaLevel1Name"];

        string localPath = this.RunPage.GetFilePath(url, pageSourceDir);
        string rawText = FileHelper.GetTextFromFile(localPath);

        // Strip whatever surrounds the outermost braces before parsing.
        int openBrace = rawText.IndexOf("{");
        int closeBrace = rawText.LastIndexOf("}");
        string json = rawText.Substring(openBrace, closeBrace - openBrace + 1);

        JObject root = JObject.Parse(json);
        JArray areas = root.SelectToken("data") as JArray;

        foreach (JToken areaToken in areas)
        {
            JObject area = areaToken as JObject;
            string childCode = (area.SelectToken("id") as JValue).Value.ToString();
            string childName = (area.SelectToken("name") as JValue).Value.ToString();

            resultEW.AddRow(new Dictionary<string, string>
            {
                { "detailPageUrl", "http://autobeta.jd.com/queryAreaList?area_lev=3&area_id=" + childCode + "&callback=jQuery7711772&_=1469734421125" },
                { "detailPageName", childCode + childName },
                { "areaLevel1Code", parentCode },
                { "areaLevel1Name", parentName },
                { "areaLevel2Code", childCode },
                { "areaLevel2Name", childName }
            });
        }
    }
}
/// <summary>
/// Extracts the province list from each downloaded page: every li in the first ul of the
/// "listTab" block yields one row with the province code (its data-value attribute), the
/// province name (its text) and the province's shop-list URL.
/// </summary>
/// <param name="listSheet">Sheet whose rows identify the downloaded pages.</param>
/// <param name="pageSourceDir">Directory that holds the downloaded page files.</param>
/// <param name="resultEW">Writer that receives the extracted rows; it is not saved here.</param>
private void GetProvinces(IListSheet listSheet, string pageSourceDir, ExcelWriter resultEW)
{
    for (int rowIdx = 0; rowIdx < listSheet.RowCount; rowIdx++)
    {
        // Original author's note: listSheet contains only one record.
        string url = listSheet.PageUrlList[rowIdx];
        Dictionary<string, string> sheetRow = listSheet.GetRow(rowIdx);
        string localPath = this.RunPage.GetFilePath(url, pageSourceDir);

        HtmlAgilityPack.HtmlDocument page = this.RunPage.GetLocalHtmlDocument(listSheet, rowIdx);
        HtmlNodeCollection provinceItems = page.DocumentNode.SelectNodes("//div[@id=\"listTab\"]/ul[1]/li");

        foreach (HtmlNode item in provinceItems)
        {
            string code = item.Attributes["data-value"].Value;
            string name = item.InnerText;

            resultEW.AddRow(new Dictionary<string, string>
            {
                { "detailPageUrl", "http://www.tuhu.cn/Shops/" + code + ".aspx" },
                { "detailPageName", code + name },
                { "provinceCode", code },
                { "provinceName", name }
            });
        }
    }
}
/// <summary>
/// Extracts the city links from each downloaded province page: every anchor in the second
/// ul of the "listTab" block yields one row. The city code is the last path segment of the
/// link, cut at its first "."; the city name is the anchor text.
/// </summary>
/// <param name="listSheet">Sheet whose rows supply the page URL and the province code and name.</param>
/// <param name="pageSourceDir">Directory that holds the downloaded page files.</param>
/// <param name="resultEW">Writer that receives the extracted rows; it is not saved here.</param>
private void ReadCityPages(IListSheet listSheet, string pageSourceDir, ExcelWriter resultEW)
{
    string[] slash = new string[] { "/" };
    string[] dot = new string[] { "." };

    for (int rowIdx = 0; rowIdx < listSheet.RowCount; rowIdx++)
    {
        // Original author's note: listSheet contains only one record.
        Dictionary<string, string> sheetRow = listSheet.GetRow(rowIdx);
        string url = sheetRow[SysConfig.DetailPageUrlFieldName];
        string provinceCode = sheetRow["provinceCode"];
        string provinceName = sheetRow["provinceName"];
        string localPath = this.RunPage.GetFilePath(url, pageSourceDir);

        HtmlAgilityPack.HtmlDocument page = this.RunPage.GetLocalHtmlDocument(listSheet, rowIdx);
        HtmlNodeCollection cityAnchors = page.DocumentNode.SelectNodes("//div[@id=\"listTab\"]/ul[2]/li/a");

        foreach (HtmlNode anchor in cityAnchors)
        {
            string cityUrl = anchor.Attributes["href"].Value;

            // Last path segment, e.g. "<code>.<ext>" -> "<code>".
            string[] segments = cityUrl.Split(slash, StringSplitOptions.RemoveEmptyEntries);
            string lastSegment = segments[segments.Length - 1];
            string cityCode = lastSegment.Split(dot, StringSplitOptions.RemoveEmptyEntries)[0];
            string cityName = anchor.InnerText;

            resultEW.AddRow(new Dictionary<string, string>
            {
                { "detailPageUrl", cityUrl },
                { "detailPageName", cityCode + cityName },
                { "provinceCode", provinceCode },
                { "provinceName", provinceName },
                { "cityCode", cityCode },
                { "cityName", cityName }
            });
        }
    }
}
/// <summary>
/// Processes one detail page on the current thread: downloads it via
/// <c>GrabDetailPage</c> unless its local file already exists, updates the grab counter,
/// logs the elapsed time and records the grab status.
/// </summary>
/// <param name="listSheet">Sheet whose row at <paramref name="detailPageIndex"/> describes the page.</param>
/// <param name="detailPageIndex">Zero-based index of the page in the detail URL and cookie lists.</param>
/// <param name="detailPageInfo">Project detail settings passed through to <c>GrabDetailPage</c>.</param>
/// <param name="sourceDir">Directory used to resolve the page's local file path.</param>
private void ThreadGrabDetailPage(IListSheet listSheet, int detailPageIndex, Proj_Detail_SingleLine detailPageInfo, string sourceDir)
{
    DateTime startedAt = DateTime.Now;

    string url = this.RunPage.DetailPageUrlList[detailPageIndex];
    string pageCookie = this.RunPage.DetailPageCookieList[detailPageIndex];
    string localPath = this.RunPage.GetFilePath(url, sourceDir);
    Dictionary<string, string> sheetRow = listSheet.GetRow(detailPageIndex);

    // An already-present local file counts as success without downloading again.
    bool succeed = File.Exists(localPath)
        || this.GrabDetailPage(listSheet, url, sheetRow, localPath, detailPageIndex, detailPageInfo, pageCookie);

    this.RunPage.RefreshGrabCount(succeed);

    DateTime finishedAt = DateTime.Now;
    TimeSpan elapsed = finishedAt - startedAt;

    string threadId = Thread.CurrentThread.ManagedThreadId.ToString();
    string pageNumber = (detailPageIndex + 1).ToString();
    string seconds = elapsed.TotalSeconds.ToString("0.00");
    this.RunPage.InvokeAppendLogText("线程" + threadId + ": 抓取了第" + pageNumber + "个页面, 用时" + seconds + "秒", LogLevelType.Normal, false);

    this.RunPage.RecordGrabDetailStatus(succeed, startedAt, finishedAt);
}
/// <summary>
/// Iterates the list sheet, loading the local HTML document of every row that is not
/// flagged as given up, then saves the Excel writer. No rows are added to the writer in
/// this method; the loaded documents are not inspected.
/// </summary>
/// <param name="listSheet">Sheet whose rows identify the downloaded pages.</param>
/// <returns>Always <c>true</c>.</returns>
private bool GetAllListPageUrls(IListSheet listSheet)
{
    ExcelWriter writer = this.GetExcelWriter();

    for (int rowIdx = 0; rowIdx < listSheet.RowCount; rowIdx++)
    {
        Dictionary<string, string> sheetRow = listSheet.GetRow(rowIdx);

        // These values are not used further; the lookups are kept so a sheet that lacks
        // one of the columns still fails here exactly as before.
        string url = sheetRow[SysConfig.DetailPageUrlFieldName];
        string pageName = sheetRow[SysConfig.DetailPageNameFieldName];
        string pageCookie = sheetRow[SysConfig.DetailPageCookieFieldName];

        if ("Y".Equals(sheetRow[SysConfig.GiveUpGrabFieldName]))
        {
            continue;
        }

        HtmlAgilityPack.HtmlDocument page = this.RunPage.GetLocalHtmlDocument(listSheet, rowIdx);
    }

    writer.SaveToDisk();
    return true;
}
/// <summary>
/// Gathers the paging links from every downloaded page and writes each distinct absolute
/// URL once (as both page URL and page name) to the result writer, which is then saved.
/// </summary>
/// <param name="listSheet">Sheet whose rows identify the downloaded pages.</param>
private void GetListPageUrls(IListSheet listSheet)
{
    // The directory value is not consumed below; the call is kept as in the original flow.
    string detailSourceDir = this.RunPage.GetDetailSourceFileDir();
    ExcelWriter writer = this.CreateResultWriter();

    // Tracks URLs already emitted so duplicates across pages are written only once.
    Dictionary<string, bool> seenUrls = new Dictionary<string, bool>();

    for (int rowIdx = 0; rowIdx < listSheet.RowCount; rowIdx++)
    {
        Dictionary<string, string> sheetRow = listSheet.GetRow(rowIdx);
        HtmlAgilityPack.HtmlDocument page = this.RunPage.GetLocalHtmlDocument(listSheet, rowIdx);
        HtmlNodeCollection pagerAnchors = page.DocumentNode.SelectNodes("//div[@class=\"gclear pp bt center f14\"]/a");

        for (int a = 0; a < pagerAnchors.Count; a++)
        {
            string absoluteUrl = "https://chengyu.911cha.com/" + pagerAnchors[a].GetAttributeValue("href", "");
            if (seenUrls.ContainsKey(absoluteUrl))
            {
                continue;
            }

            seenUrls.Add(absoluteUrl, true);
            writer.AddRow(new Dictionary<string, string>
            {
                { "detailPageUrl", absoluteUrl },
                { "detailPageName", absoluteUrl }
            });
        }
    }

    writer.SaveToDisk();
}
/// <summary>
/// Post-processing hook: for every row not flagged as given up, reads the downloaded page
/// as UTF-8, selects the rows of the table marked role="list" and hands them to
/// <c>GetInfos</c>.
/// </summary>
/// <param name="listSheet">Sheet whose rows identify the downloaded pages.</param>
/// <returns>Always <c>true</c>.</returns>
public override bool AfterAllGrab(IListSheet listSheet)
{
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        if (row["giveUpGrab"] == "Y")
        {
            continue;
        }

        string detailUrl = row["detailPageUrl"];
        string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);

        // Dispose the reader deterministically; previously one file handle leaked per row
        // until the garbage collector finalized it.
        string webPageHtml;
        using (StreamReader reader = new StreamReader(localFilePath, Encoding.UTF8))
        {
            webPageHtml = reader.ReadToEnd();
        }

        HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
        htmlDoc.LoadHtml(webPageHtml);

        // May be null when the page has no matching table rows; passed through unchanged.
        HtmlNodeCollection itemNodes = htmlDoc.DocumentNode.SelectNodes("//table[@role=\"list\"]/tbody/tr");
        this.GetInfos(itemNodes);
    }

    return true;
}
/// <summary>
/// Extracts the shop links from each downloaded city page: every anchor matched under the
/// "cityMapLeft" block yields one row. The shop code is the last path segment of the link,
/// cut at its first "."; the shop name is the trimmed anchor text.
/// </summary>
/// <param name="listSheet">Sheet whose rows supply the page URL plus province name, city name and city code.</param>
/// <param name="pageSourceDir">Directory that holds the downloaded page files.</param>
/// <param name="resultEW">Writer that receives the extracted rows; it is not saved here.</param>
private void GetShopList(IListSheet listSheet, string pageSourceDir, ExcelWriter resultEW)
{
    string[] slash = new string[] { "/" };

    for (int rowIdx = 0; rowIdx < listSheet.RowCount; rowIdx++)
    {
        string url = listSheet.PageUrlList[rowIdx];
        Dictionary<string, string> sheetRow = listSheet.GetRow(rowIdx);
        string provinceName = sheetRow["provinceName"];
        string cityName = sheetRow["cityName"];
        string cityCode = sheetRow["cityCode"];
        string localPath = this.RunPage.GetFilePath(url, pageSourceDir);

        HtmlAgilityPack.HtmlDocument page = this.RunPage.GetLocalHtmlDocument(listSheet, rowIdx);
        HtmlNodeCollection shopAnchors = page.DocumentNode.SelectNodes("//div[@id=\"cityMapLeft\"]/div/b/a");

        foreach (HtmlNode anchor in shopAnchors)
        {
            string shopUrl = anchor.Attributes["href"].Value;

            // Last path segment up to (not including) its first ".".
            string[] segments = shopUrl.Split(slash, StringSplitOptions.RemoveEmptyEntries);
            string fileName = segments[segments.Length - 1];
            string shopCode = fileName.Substring(0, fileName.IndexOf("."));
            string shopName = anchor.InnerText.Trim();

            resultEW.AddRow(new Dictionary<string, string>
            {
                { "detailPageUrl", shopUrl },
                { "detailPageName", shopCode + shopName },
                { "provinceName", provinceName },
                { "cityCode", cityCode },
                { "cityName", cityName },
                { "shopCode", shopCode },
                { "shopName", shopName }
            });
        }
    }
}
/// <summary>
/// Concatenates the per-sub-category workbooks into one category-to-recipe-list mapping
/// workbook: for every list row, opens the workbook stored at that row's local file path
/// and copies the "category", "subCategory", "name" and "url" values of each of its records.
/// </summary>
/// <param name="listSheet">Sheet whose rows identify the per-sub-category workbooks.</param>
private void GetCategoryToPageUrls(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();
    string detailSourceDir = this.RunPage.GetDetailSourceFileDir();
    string mapFilePath = Path.Combine(exportDir, "美食天下_分类与菜谱列表对照.xlsx");
    ExcelWriter mapWriter = this.CreateSubCategoryMapWriter(mapFilePath);

    for (int rowIdx = 0; rowIdx < listSheet.RowCount; rowIdx++)
    {
        Dictionary<string, string> sheetRow = listSheet.GetRow(rowIdx);
        string pageUrl = sheetRow[SysConfig.DetailPageUrlFieldName];

        // Not used below (the copied values come from the sub-workbook); the lookups are
        // kept so a sheet lacking these columns still fails here as before.
        string listCategory = sheetRow["category"];
        string listSubCategory = sheetRow["subCategory"];

        string rowSourceDir = this.RunPage.GetDetailSourceFileDir();
        ExcelReader subReader = new ExcelReader(this.RunPage.GetFilePath(pageUrl, rowSourceDir));

        int recordCount = subReader.GetRowCount();
        for (int rec = 0; rec < recordCount; rec++)
        {
            Dictionary<string, string> record = subReader.GetFieldValues(rec);
            mapWriter.AddRow(new Dictionary<string, string>
            {
                { "category", record["category"] },
                { "subCategory", record["subCategory"] },
                { "name", record["name"] },
                { "url", record["url"] }
            });
        }
    }

    mapWriter.SaveToDisk();
}
/// <summary>
/// Extracts the title (h2) and optional description (p) from the "info_txt2 clearfix"
/// block of every downloaded page and writes them, with the source row's "renWu" and
/// "shiDai" values and the page URL, to the result writer.
/// </summary>
/// <param name="listSheet">Sheet whose rows identify the downloaded pages.</param>
private void GetRenWuInfos(IListSheet listSheet)
{
    ExcelWriter resultEW = this.CreateRenWuResultWriter();

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> listRow = listSheet.GetRow(i);
        HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

        // The former "catch (Exception ex) { throw ex; }" wrapper only reset the stack
        // trace; exceptions now propagate untouched.
        HtmlNode mainInfoNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@class=\"info_txt2 clearfix\"]");
        HtmlNode titleNode = mainInfoNode.SelectSingleNode("./h2");
        string renWuTitle = CommonUtil.HtmlDecode(titleNode.InnerText).Trim();

        // The description paragraph is optional; an absent one becomes an empty string.
        HtmlNode descriptionNode = mainInfoNode.SelectSingleNode("./p");
        string description = descriptionNode == null ? "" : CommonUtil.HtmlDecode(descriptionNode.InnerText).Trim();

        Dictionary<string, string> resultRow = new Dictionary<string, string>();
        resultRow.Add("人物", listRow["renWu"]);
        resultRow.Add("时代", listRow["shiDai"]);
        resultRow.Add("人物页面标题", renWuTitle);
        resultRow.Add("简介", description);
        resultRow.Add("url", listRow[SysConfig.DetailPageUrlFieldName]);
        resultEW.AddRow(resultRow);
    }

    resultEW.SaveToDisk();
}
/// <summary>
/// Collects the links found under the "cont" block of every downloaded page and writes one
/// row per link (absolute URL as page URL and page name, plus the decoded link text).
/// </summary>
/// <param name="listSheet">Rows identifying the downloaded pages; rows whose give-up column equals "Y" are skipped.</param>
private void GetListPageUrls(IListSheet listSheet)
{
    ExcelWriter ew = this.CreateWriter();

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> listRow = listSheet.GetRow(i);
        bool giveUp = "Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]);
        if (giveUp)
        {
            continue;
        }

        // No catch-and-rethrow wrapper: exceptions propagate with their original stack trace.
        HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        HtmlNodeCollection linkNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"cont\"]/a");
        if (linkNodes == null)
        {
            // SelectNodes yields null (not an empty collection) when nothing matches.
            continue;
        }

        foreach (HtmlNode linkNode in linkNodes)
        {
            string url = "http://www.lszj.com" + linkNode.GetAttributeValue("href", "");
            string name = CommonUtil.HtmlDecode(linkNode.InnerText).Trim();

            Dictionary<string, string> row = new Dictionary<string, string>();
            row.Add("detailPageUrl", url);
            // The URL doubles as the page name.
            row.Add("detailPageName", url);
            row.Add("name", name);
            ew.AddRow(row);
        }
    }

    ew.SaveToDisk();
}
/// <summary>
/// Extracts qualification records from the tables of every downloaded page. A table row is
/// exported only when its first cell carries data-header="序号"; cells 1 to 6 of such a
/// row are written (trimmed) under fixed column names, with missing cells written as
/// empty strings. The CSV writer is saved once all rows are processed.
/// </summary>
/// <param name="listSheet">Rows supplying "companyId"; rows whose give-up column equals "Y" are skipped.</param>
private void GetAllInfos(IListSheet listSheet)
{
    // Output column for table cell k+1 (cell 0 is the marker cell checked below).
    string[] cellFieldNames = { "资质类别", "资质证书号", "资质名称", "发证日期", "证件有效期", "发证机关" };

    CsvWriter cw = this.GetCsvExcelWriter();

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        string companyId = row["companyId"];
        bool giveUp = "Y".Equals(row[SysConfig.GiveUpGrabFieldName]);
        if (giveUp)
        {
            continue;
        }

        HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        HtmlNodeCollection trNodeList = pageHtmlDoc.DocumentNode.SelectNodes("//table/tbody/tr");
        if (trNodeList == null)
        {
            continue;
        }

        // The two nested "catch (Exception ex) { throw ex; }" wrappers were removed: they
        // only reset the stack trace of whatever failed inside.
        foreach (HtmlNode trNode in trNodeList)
        {
            HtmlNodeCollection tdNodeList = trNode.SelectNodes("./td");
            if (tdNodeList == null || tdNodeList.Count == 0)
            {
                // A row without td cells cannot be a data row; previously this dereferenced null.
                continue;
            }

            if (tdNodeList[0].GetAttributeValue("data-header", "") != "序号")
            {
                continue;
            }

            Dictionary<string, string> f2vs = new Dictionary<string, string>();
            f2vs.Add("CompanyId", companyId);
            for (int k = 0; k < cellFieldNames.Length; k++)
            {
                int cellIndex = k + 1;
                string cellText = cellIndex < tdNodeList.Count ? tdNodeList[cellIndex].InnerText.Trim() : "";
                f2vs.Add(cellFieldNames[k], cellText);
            }
            cw.AddRow(f2vs);
        }
    }

    cw.SaveToDisk();
}
/// <summary>
/// Writes a copy of every downloaded page (rows not flagged as given up) into the
/// "LocalHtml" folder of the export directory, after setting border="1" on the first
/// table under body. The copy is named after the row's page name plus ".html".
/// </summary>
/// <param name="listSheet">Sheet whose rows identify the downloaded pages.</param>
/// <returns>Always <c>true</c>.</returns>
private bool GenerateNewPage(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();
    string localHtmlDir = Path.Combine(exportDir, "LocalHtml");

    // CreateDirectory does nothing when the directory already exists.
    Directory.CreateDirectory(localHtmlDir);

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        bool giveUp = "Y".Equals(row[SysConfig.GiveUpGrabFieldName]);
        if (giveUp)
        {
            continue;
        }

        string name = row[SysConfig.DetailPageNameFieldName];
        HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        HtmlNode tableNode = htmlDoc.DocumentNode.SelectSingleNode("//body/table[1]");

        // SetAttributeValue adds the attribute when it is absent. The previous
        // Attributes["border"].Value assignment threw NullReferenceException for a table
        // without a border attribute.
        tableNode.SetAttributeValue("border", "1");

        string destFilePath = Path.Combine(localHtmlDir, name + ".html");
        htmlDoc.Save(destFilePath);
    }

    return true;
}
/// <summary>
/// Post-processing hook that builds the next-stage list workbook: one row per distinct
/// company name (after trimming and removing the "造价企业" / "测试企业" markers),
/// carrying a search URL for that name, the company's registration fields copied from the
/// source row, and the value returned by <c>GetAddresParts</c>.
/// </summary>
/// <param name="listSheet">Source rows; rows whose give-up column equals "Y" are skipped.</param>
/// <returns>Always <c>true</c>.</returns>
public override bool AfterAllGrab(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();

    // Column order defines the column index in the output sheet.
    string[] columnNames =
    {
        "detailPageUrl", "detailPageName", "cookie", "grabStatus", "giveUpGrab",
        "CompanyId", "企业名称", "统一社会信用代码", "企业法定代表人",
        "企业登记注册类型", "企业注册属地", "企业经营地址", "addressParts"
    };
    Dictionary<string, int> columnIndexes = new Dictionary<string, int>();
    for (int c = 0; c < columnNames.Length; c++)
    {
        columnIndexes.Add(columnNames[c], c);
    }

    string outputPath = Path.Combine(exportDir, "企业数据_企业工商信息列表页.xlsx");
    ExcelWriter writer = new ExcelWriter(outputPath, "List", columnIndexes, null);

    // Company names already written; only the keys matter.
    Dictionary<string, string> seenCompanies = new Dictionary<string, string>();

    for (int rowIdx = 0; rowIdx < listSheet.RowCount; rowIdx++)
    {
        Dictionary<string, string> source = listSheet.GetRow(rowIdx);
        if ("Y".Equals(source[SysConfig.GiveUpGrabFieldName]))
        {
            continue;
        }

        string companyName = source["企业名称"].Trim().Replace("造价企业", "").Replace("测试企业", "");
        if (seenCompanies.ContainsKey(companyName))
        {
            continue;
        }
        seenCompanies.Add(companyName, null);

        Dictionary<string, string> output = new Dictionary<string, string>();
        output.Add("detailPageUrl", "https://www.tianyancha.com/search?key=" + companyName);
        output.Add("detailPageName", source["CompanyId"]);
        output.Add("CompanyId", source["CompanyId"]);
        output.Add("企业名称", companyName);
        output.Add("统一社会信用代码", source["统一社会信用代码"]);
        output.Add("企业法定代表人", source["企业法定代表人"]);
        output.Add("企业登记注册类型", source["企业登记注册类型"]);
        output.Add("企业注册属地", source["企业注册属地"]);
        output.Add("企业经营地址", source["企业经营地址"]);
        output.Add("addressParts", this.GetAddresParts(source));
        writer.AddRow(output);
    }

    writer.SaveToDisk();
    return true;
}
/// <summary>
/// Builds the city list workbook: for every city link found on the downloaded pages, the
/// text between "com/" and "/commu" in the link is taken as the city code and the decoded
/// link text as the city name. Each distinct link is written once; links that do not
/// contain both markers in that order are ignored.
/// </summary>
/// <param name="listSheet">Rows identifying the downloaded pages; rows whose give-up column equals "Y" are skipped.</param>
private void GetCityList(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();

    Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
    resultColumnDic.Add("code", 0);
    resultColumnDic.Add("name", 1);
    resultColumnDic.Add("url", 2);

    string resultFilePath = Path.Combine(exportDir, "安居客城市列表.xlsx");
    ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, null);

    // Links already written; only the keys matter.
    Dictionary<string, string> urlDic = new Dictionary<string, string>();

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        bool giveUp = "Y".Equals(row[SysConfig.GiveUpGrabFieldName]);
        if (giveUp)
        {
            continue;
        }

        // No catch-and-rethrow wrapper: exceptions propagate with their original stack trace.
        HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        HtmlNodeCollection allCityNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"cl-c-list\"]/ul[@class=\"cl-c-l-ul\"]/li[@class=\"cl-c-l-li\"]/a");
        if (allCityNodes == null)
        {
            // SelectNodes yields null (not an empty collection) when nothing matches.
            continue;
        }

        foreach (HtmlNode cityNode in allCityNodes)
        {
            string url = cityNode.GetAttributeValue("href", "");

            int hostMarkerIndex = url.IndexOf("com/", StringComparison.Ordinal);
            int cityCodeEndIndex = url.IndexOf("/commu", StringComparison.Ordinal);
            if (hostMarkerIndex < 0 || cityCodeEndIndex <= 0)
            {
                // Previously a missing "com/" silently produced a start index of 3.
                continue;
            }

            int cityCodeFromIndex = hostMarkerIndex + 4;
            if (cityCodeEndIndex < cityCodeFromIndex)
            {
                // e.g. ".../xxx.com/commu...": the end marker overlaps the host marker,
                // which previously made Substring throw on a negative length.
                continue;
            }

            if (urlDic.ContainsKey(url))
            {
                continue;
            }
            urlDic.Add(url, null);

            string code = url.Substring(cityCodeFromIndex, cityCodeEndIndex - cityCodeFromIndex);
            string name = CommonUtil.HtmlDecode(cityNode.InnerText.Trim()).Trim();

            Dictionary<string, string> f2vs = new Dictionary<string, string>();
            f2vs.Add("code", code);
            f2vs.Add("name", name);
            f2vs.Add("url", url);
            resultEW.AddRow(f2vs);
        }
    }

    resultEW.SaveToDisk();
}
/// <summary>
/// Extracts the summary text and entry id/title from every downloaded entry page and
/// writes them, with the source row's "yearValue" and "yearName", to the result writer.
/// Pages containing the "page does not exist" marker text, and pages without a
/// "lemma-summary" block, are logged as errors and skipped.
/// </summary>
/// <param name="listSheet">Rows identifying the downloaded pages; rows whose give-up column equals "Y" are skipped.</param>
private void GetYearInfos(IListSheet listSheet)
{
    string sourceDir = this.RunPage.GetDetailSourceFileDir();
    ExcelWriter resultEW = this.CreateResultWriter();

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> listRow = listSheet.GetRow(i);
        bool giveUp = "Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]);
        string detailPageUrl = listRow[SysConfig.DetailPageUrlFieldName];
        if (giveUp)
        {
            continue;
        }

        try
        {
            string localFilePath = this.RunPage.GetFilePath(detailPageUrl, sourceDir);
            string html = FileHelper.GetTextFromFile(localFilePath, Encoding.UTF8);

            if (html.Contains("您所访问的页面不存在"))
            {
                this.RunPage.InvokeAppendLogText("放弃解析此页, 所访问的页面不存在, pageUrl = " + detailPageUrl, LogLevelType.Error, true);
                continue;
            }

            HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
            htmlDoc.LoadHtml(html);

            HtmlNode mainInfoNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@class=\"lemma-summary\"]");
            if (mainInfoNode == null)
            {
                this.RunPage.InvokeAppendLogText("此词条不存在摘要信息, pageUrl = " + detailPageUrl, LogLevelType.Error, true);
                continue;
            }

            // Entry id and title are read from attributes of this node.
            HtmlNode itemBaseInfoNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@class=\"lemmaWgt-promotion-rightPreciseAd\"]");
            string itemId = itemBaseInfoNode.GetAttributeValue("data-lemmaid", "");
            string itemName = itemBaseInfoNode.GetAttributeValue("data-lemmatitle", "");
            string mainInfo = CommonUtil.HtmlDecode(mainInfoNode.InnerText).Trim();

            Dictionary<string, string> newRow = new Dictionary<string, string>();
            newRow.Add("url", detailPageUrl);
            newRow.Add("yearValue", listRow["yearValue"]);
            newRow.Add("yearName", listRow["yearName"]);
            newRow.Add("itemId", itemId);
            newRow.Add("itemName", itemName);
            newRow.Add("mainInfo", mainInfo);
            resultEW.AddRow(newRow);
        }
        catch (Exception ex)
        {
            this.RunPage.InvokeAppendLogText(ex.Message + ". 解析出错, pageUrl = " + detailPageUrl, LogLevelType.Error, true);
            // "throw;" keeps the original stack trace; "throw ex;" would reset it to this line.
            throw;
        }
    }

    resultEW.SaveToDisk();
}
/// <summary>
/// Loads the input parameters needed for crawling from the first row of the list sheet:
/// the keywords, the login name and password, and the seed page URL.
/// </summary>
/// <param name="listSheet">List sheet whose row 0 holds the seed parameters.</param>
private void GetSeedInfoFromListSheet(IListSheet listSheet)
{
    // Only row 0 is consulted.
    var seedRow = listSheet.GetRow(0);

    this.KeyWords = seedRow["keyWords"];
    this.LoginName = seedRow["loginName"];
    this.LoginPassword = seedRow["loginPassword"];
    this.SeedPageUrl = seedRow[SysConfig.DetailPageUrlFieldName];
}
/// <summary>
/// Builds the next-stage list workbook of building-list pages: for every project page that
/// exposes a page count in its "allpage" input, one row is written per page number
/// (1..count) with the paged URL, the project id and name, and the page number.
/// </summary>
/// <param name="listSheet">Rows supplying "projectId" and "projectName"; rows whose give-up column equals "Y" are skipped.</param>
/// <returns>Always <c>true</c>.</returns>
private bool GetBuildingListPageUrls(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();

    // Column order defines the column index in the output sheet.
    string[] columnNames =
    {
        "detailPageUrl", "detailPageName", "cookie", "grabStatus",
        "giveUpGrab", "projectId", "projectName", "pageIndex"
    };
    Dictionary<string, int> columnIndexes = new Dictionary<string, int>();
    for (int c = 0; c < columnNames.Length; c++)
    {
        columnIndexes.Add(columnNames[c], c);
    }

    string outputPath = Path.Combine(exportDir, "济南楼盘_楼列表页.xlsx");
    Dictionary<string, string> columnFormats = new Dictionary<string, string>();
    ExcelWriter writer = new ExcelWriter(outputPath, "List", columnIndexes, columnFormats);

    string urlColumn = SysConfig.DetailPageUrlFieldName;

    for (int rowIdx = 0; rowIdx < listSheet.RowCount; rowIdx++)
    {
        Dictionary<string, string> source = listSheet.GetRow(rowIdx);
        if ("Y".Equals(source[SysConfig.GiveUpGrabFieldName]))
        {
            continue;
        }

        // The URL value is not used below; the lookup is kept as in the original flow.
        string sourceUrl = source[urlColumn];
        string projectId = source["projectId"];
        string projectName = source["projectName"];

        HtmlAgilityPack.HtmlDocument page = this.RunPage.GetLocalHtmlDocument(listSheet, rowIdx);
        HtmlNode pageCountInput = page.DocumentNode.SelectSingleNode("//input[@id=\"allpage\"]");
        if (pageCountInput == null)
        {
            // Pages without the page-count input contribute no rows.
            continue;
        }

        int pageCount = int.Parse(pageCountInput.GetAttributeValue("value", ""));
        for (int pageNo = 1; pageNo <= pageCount; pageNo++)
        {
            string pageNoText = pageNo.ToString();

            Dictionary<string, object> output = new Dictionary<string, object>();
            output.Add("detailPageUrl", "http://www.jnfdc.gov.cn/onsaling/show_" + pageNoText + ".shtml?prjno=" + projectId);
            output.Add("detailPageName", projectId + "_" + pageNoText);
            output.Add("projectId", projectId);
            output.Add("projectName", projectName);
            output.Add("pageIndex", pageNoText);
            writer.AddRow(output);
        }
    }

    writer.SaveToDisk();
    return true;
}
/// <summary>
/// Writes "济南楼盘_楼盘详情.xlsx" into the export directory. For every list row that is not
/// flagged as given up, the rows of the <c>message_table</c> table in the locally saved page
/// are read and cells 1 and 3 of table rows 1..3 are exported together with the list row's
/// <c>projectId</c> and <c>sellable</c> values.
/// </summary>
/// <param name="listSheet">Sheet whose rows provide <c>projectId</c> and <c>sellable</c>.</param>
/// <returns>Always <c>true</c>.</returns>
/// <exception cref="InvalidOperationException">
/// The saved page has fewer than four <c>message_table</c> rows.
/// </exception>
private bool GetLoupanDetailInfos(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();
    string resultFilePath = Path.Combine(exportDir, "济南楼盘_楼盘详情.xlsx");

    Dictionary<string, int> resultColumnDic = new Dictionary<string, int>
    {
        { "项目ID", 0 },
        { "项目名称", 1 },
        { "项目地址", 2 },
        { "企业名称", 3 },
        { "所在区县", 4 },
        { "项目规模", 5 },
        { "总栋数", 6 },
        { "可售套数", 7 }
    };
    Dictionary<string, string> resultColumnFormat = new Dictionary<string, string>();
    ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, resultColumnFormat);

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
        {
            continue;
        }

        string projectId = row["projectId"];
        string sellable = row["sellable"];

        HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        HtmlNodeCollection trNodeList = pageHtmlDoc.DocumentNode.SelectNodes("//table[@class=\"message_table\"]/tr");

        // SelectNodes yields null when nothing matches; fail with the offending project id
        // rather than an anonymous NullReferenceException / ArgumentOutOfRangeException.
        if (trNodeList == null || trNodeList.Count < 4)
        {
            throw new InvalidOperationException("message_table rows not found, projectId = " + projectId);
        }

        // Evaluate each row's cells once; both values of a row come from the same cell list.
        HtmlNodeCollection nameRowCells = trNodeList[1].SelectNodes("./td");
        HtmlNodeCollection companyRowCells = trNodeList[2].SelectNodes("./td");
        HtmlNodeCollection sizeRowCells = trNodeList[3].SelectNodes("./td");

        string projectName = nameRowCells[1].InnerText.Trim();
        string address = nameRowCells[3].InnerText.Trim();
        string companyName = companyRowCells[1].InnerText.Trim();
        string scope = companyRowCells[3].InnerText.Trim();
        string projectSize = sizeRowCells[1].InnerText.Trim();
        string buildingCount = sizeRowCells[3].InnerText.Trim();

        Dictionary<string, object> f2vs = new Dictionary<string, object>();
        f2vs.Add("项目ID", projectId);
        f2vs.Add("项目名称", projectName);
        f2vs.Add("项目地址", address);
        f2vs.Add("企业名称", companyName);
        f2vs.Add("所在区县", scope);
        f2vs.Add("项目规模", projectSize);
        f2vs.Add("总栋数", buildingCount);
        f2vs.Add("可售套数", sellable);
        resultEW.AddRow(f2vs);
    }

    resultEW.SaveToDisk();
    return true;
}
/// <summary>
/// Writes "教育_本科_专业_jhcee_com.xlsx" into the export directory. For every list row that is
/// not flagged as given up, the locally saved JSON file for the row's URL is parsed and one
/// output row is added per element of its <c>data</c> array, combining the element's
/// <c>name</c>/<c>id</c> with the list row's <c>name</c>/<c>id</c>.
/// </summary>
/// <param name="listSheet">Sheet whose rows provide <c>detailPageUrl</c>, <c>name</c> and <c>id</c>.</param>
/// <exception cref="InvalidOperationException">
/// The saved JSON has no <c>data</c> array.
/// </exception>
private void GetList(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();
    string resultFilePath = Path.Combine(exportDir, "教育_本科_专业_jhcee_com.xlsx");

    Dictionary<string, int> resultColumnDic = new Dictionary<string, int>
    {
        { "detailPageUrl", 0 },
        { "detailPageName", 1 },
        { "cookie", 2 },
        { "grabStatus", 3 },
        { "giveUpGrab", 4 },
        { "学科", 5 },
        { "学科id", 6 },
        { "门类", 7 },
        { "门类id", 8 }
    };
    ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, null);

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        string detailUrl = row["detailPageUrl"];
        if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
        {
            continue;
        }

        // No try/catch here: the former "catch { throw ex; }" only reset the stack trace.
        // Exceptions now propagate to the caller with their original trace.
        string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
        string pageFileText = FileHelper.GetTextFromFile(localFilePath);
        JObject rootJo = JObject.Parse(pageFileText);

        JArray itemJsons = rootJo["data"] as JArray;
        if (itemJsons == null)
        {
            // "as" yields null when "data" is absent or not an array; name the file instead of
            // failing later with a NullReferenceException.
            throw new InvalidOperationException("JSON has no \"data\" array, file = " + localFilePath);
        }

        foreach (JObject itemJson in itemJsons)
        {
            string name = itemJson["name"].ToString();
            string id = itemJson["id"].ToString();

            Dictionary<string, string> f2vs = new Dictionary<string, string>();
            f2vs.Add("detailPageUrl", "http://www.jhcee.com/specialized/loadByParentId.json?parentId=" + id);
            f2vs.Add("detailPageName", id);
            f2vs.Add("门类", name);
            f2vs.Add("门类id", id);
            f2vs.Add("学科", row["name"]);
            f2vs.Add("学科id", row["id"]);
            resultEW.AddRow(f2vs);
        }
    }

    resultEW.SaveToDisk();
}
/// <summary>
/// Writes "各省企业个数.xlsx" into the export directory. For every list row that is not flagged
/// as given up, the text of the second sibling after the <c>pagingform</c> form in the locally
/// saved page is searched for the <c>"$total":</c> value, which is exported as
/// <c>companyCount</c> alongside the row's region and apt fields.
/// </summary>
/// <param name="listSheet">
/// Sheet whose rows provide <c>regionId</c>, <c>regionName</c>, <c>regionFullName</c>,
/// <c>aptCode</c> and <c>aptScope</c>.
/// </param>
/// <returns>Always <c>true</c>.</returns>
/// <exception cref="FormatException">The page text carries no numeric <c>"$total":</c> value.</exception>
private bool GetProvinceCompCountList(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();
    string resultFilePath = Path.Combine(exportDir, "各省企业个数.xlsx");

    Dictionary<string, int> resultColumnDic = new Dictionary<string, int>
    {
        { "regionId", 0 },
        { "regionName", 1 },
        { "regionFullName", 2 },
        { "aptCode", 3 },
        { "aptScope", 4 },
        { "companyCount", 5 }
    };
    Dictionary<string, string> resultColumnFormat = new Dictionary<string, string>();
    resultColumnFormat.Add("companyCount", "#,##0");
    ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, resultColumnFormat);

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
        {
            continue;
        }

        string provinceId = row["regionId"];
        string provinceName = row["regionName"];
        string provinceFullName = row["regionFullName"];
        string aptCode = row["aptCode"];
        string aptScope = row["aptScope"];

        HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
        string pageText = pageHtmlDoc.DocumentNode.SelectSingleNode("//form[@class=\"pagingform\"]").NextSibling.NextSibling.InnerText;
        int companyCount = ParseTotalCount(pageText);

        Dictionary<string, object> f2vs = new Dictionary<string, object>();
        f2vs.Add("regionId", provinceId);
        f2vs.Add("regionName", provinceName);
        f2vs.Add("regionFullName", provinceFullName);
        f2vs.Add("aptCode", aptCode);
        f2vs.Add("aptScope", aptScope);
        f2vs.Add("companyCount", companyCount);
        resultEW.AddRow(f2vs);
    }

    resultEW.SaveToDisk();
    return true;
}

/// <summary>
/// Extracts the integer that follows the first <c>"$total":</c> marker in <paramref name="pageText"/>.
/// </summary>
/// <param name="pageText">Text expected to contain <c>"$total":&lt;digits&gt;</c>.</param>
/// <returns>The parsed total.</returns>
/// <exception cref="FormatException">The marker is missing or is not followed by digits.</exception>
private static int ParseTotalCount(string pageText)
{
    const string TotalMarker = "\"$total\":";

    // Check the search result explicitly: adding the marker length to -1 would otherwise
    // silently start parsing at an arbitrary offset of unrelated text.
    int markerIndex = pageText.IndexOf(TotalMarker, StringComparison.Ordinal);
    if (markerIndex < 0)
    {
        throw new FormatException("\"$total\" marker not found in page text");
    }

    int startIndex = markerIndex + TotalMarker.Length;
    while (startIndex < pageText.Length && char.IsWhiteSpace(pageText[startIndex]))
    {
        startIndex++;
    }

    // Read the run of ASCII digits so the value may be terminated by ',' or by any other
    // character (for example '}' when it is the last property).
    int endIndex = startIndex;
    while (endIndex < pageText.Length && pageText[endIndex] >= '0' && pageText[endIndex] <= '9')
    {
        endIndex++;
    }

    if (endIndex == startIndex)
    {
        throw new FormatException("\"$total\" marker is not followed by a number");
    }

    return int.Parse(pageText.Substring(startIndex, endIndex - startIndex));
}
/// <summary>
/// Writes "论文_EBSCO_论文关键字.xlsx" into the export directory. For every list row, each file
/// in the detail source directory named <c>&lt;publicationName&gt;_&lt;year&gt;</c> is opened as an
/// Excel sheet; for each of its rows the item's <c>_baseInfo</c> file is located and passed to
/// the two-argument <c>GetKeywordsInfos</c> overload, and every returned row is exported with
/// <c>publication</c> and <c>year</c> added.
/// </summary>
/// <remarks>
/// Per-item failures (including a missing <c>_baseInfo</c> file) are logged at error level and
/// processing continues with the next item.
/// </remarks>
/// <param name="listSheet">Sheet whose rows provide the publication name and <c>moreKeywords</c>.</param>
private void GetKeywordsInfos(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();
    string exportFilePath = Path.Combine(exportDir, "论文_EBSCO_论文关键字.xlsx");
    ExcelWriter resultWriter = this.GetKeywordsResultExcelWriter(exportFilePath);
    string sourceDir = this.RunPage.GetDetailSourceFileDir();
    List<string> fileNameList = GetDirFileNames(sourceDir);

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> listRow = listSheet.GetRow(i);
        string publicationName = listRow[SysConfig.DetailPageNameFieldName];
        string moreKeywords = listRow["moreKeywords"];
        this.RunPage.InvokeAppendLogText("处理期刊关键字, publicationName = " + publicationName + ", " + i.ToString() + "/" + listSheet.RowCount.ToString(), LogLevelType.System, true);

        string fileNamePrefix = publicationName + "_";
        foreach (string fileName in fileNameList)
        {
            // File names are identifiers, not linguistic text: compare ordinally.
            if (!fileName.StartsWith(fileNamePrefix, StringComparison.Ordinal))
            {
                continue;
            }

            // Strip only the leading prefix and require the remainder to be the year. A file of
            // another publication whose name merely begins with this one (e.g. "Foo_Bar_2001"
            // while processing "Foo") is skipped instead of crashing the run in int.Parse.
            int year;
            if (!int.TryParse(fileName.Substring(fileNamePrefix.Length), out year))
            {
                continue;
            }

            string pubYearExcelFilePath = Path.Combine(sourceDir, fileName);
            string pubDir = Path.Combine(sourceDir, publicationName);

            // NOTE(review): the reader is never closed here - confirm whether ExcelReader holds
            // a file handle that needs explicit release.
            ExcelReader er = new ExcelReader(pubYearExcelFilePath);
            int rowCount = er.GetRowCount();
            for (int j = 0; j < rowCount; j++)
            {
                Dictionary<string, string> pubYearRow = er.GetFieldValues(j);
                string itemName = pubYearRow["itemName"];
                string itemBaseInfoFilePath = this.RunPage.GetFilePath(itemName, pubDir) + "_baseInfo";
                try
                {
                    if (!File.Exists(itemBaseInfoFilePath))
                    {
                        // Fall back to the name produced by CommonUtil.ProcessFileName.
                        itemBaseInfoFilePath = Path.Combine(pubDir, CommonUtil.ProcessFileName(itemName, "_") + "_baseInfo");
                        if (!File.Exists(itemBaseInfoFilePath))
                        {
                            throw new FileNotFoundException("不存在文件, itemName = " + itemName);
                        }
                    }

                    List<Dictionary<string, object>> fileRows = this.GetKeywordsInfos(itemBaseInfoFilePath, moreKeywords.Length == 0 ? publicationName : moreKeywords);
                    foreach (Dictionary<string, object> fileRow in fileRows)
                    {
                        fileRow.Add("publication", publicationName);
                        fileRow.Add("year", year);
                        resultWriter.AddRow(fileRow);
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: record the failing item and carry on with the next one.
                    this.RunPage.InvokeAppendLogText("错误, " + ex.Message + " itemName = " + itemName, LogLevelType.Error, true);
                }
            }
        }
    }

    resultWriter.SaveToDisk();
}
/// <summary>
/// Collects the point rows from the per-range CSV result files and hands them, together with
/// a writer for "百度地图爬取结果.csv"-style output obtained from <c>GetResultPath</c>, to
/// <c>SavePointsToFile</c>. Only list rows whose index is a multiple of
/// <c>OnePageRowCount</c> are used to locate a range result file.
/// </summary>
/// <returns>Always <c>true</c>.</returns>
private bool SaveAllPointsToFile()
{
    // Fields copied from every CSV row, in the order they are inserted into each point.
    string[] pointFields =
    {
        "city", "district", "province", "street", "streetNumber", "township",
        "formattedAddress", "building", "buildingType", "lat", "lng"
    };

    List<Dictionary<string, string>> allPoints = new List<Dictionary<string, string>>();
    for (int i = 0; i < _ListSheet.RowCount; i++)
    {
        if (i % OnePageRowCount != 0)
        {
            continue;
        }

        Dictionary<string, string> listValues = _ListSheet.GetRow(i);
        string name = listValues["detailPageName"];
        string rangeResultFilePath = this.GetResultPath(name, "csv");
        CsvReader er = new CsvReader(rangeResultFilePath);

        // Read the row count once per file rather than on every loop iteration.
        int rowCount = er.GetRowCount();
        for (int j = 0; j < rowCount; j++)
        {
            Dictionary<string, string> row = er.GetFieldValues(j);
            Dictionary<string, string> p2vs = new Dictionary<string, string>();
            foreach (string field in pointFields)
            {
                p2vs.Add(field, row[field]);
            }
            allPoints.Add(p2vs);
        }
    }

    Dictionary<string, int> resultColumnDic = CommonUtil.InitStringIndexDic(new string[] { "province", "city", "district", "street", "streetNumber", "township", "formattedAddress", "building", "buildingType", "lat", "lng" });
    string resultFilePath = this.GetResultPath("百度地图爬取结果", "csv");
    CsvWriter resultEW = new CsvWriter(resultFilePath, resultColumnDic);
    SavePointsToFile(allPoints, resultEW);
    return true;
}
/// <summary>
/// Writes "www.nhm.ac.uk恐龙详情页.xlsx" into the export directory. For every list row that is
/// not flagged as given up, the locally saved page is loaded as UTF-8 HTML and each link under
/// the <c>dino-list dino-wrap</c> list yields one output row; a URL that was already emitted
/// is not written again.
/// </summary>
/// <param name="listSheet">Sheet whose rows provide <c>detailPageUrl</c>.</param>
private void GetDetailPageUrls(IListSheet listSheet)
{
    string exportDir = this.RunPage.GetExportDir();
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();
    string resultFilePath = Path.Combine(exportDir, "www.nhm.ac.uk恐龙详情页.xlsx");

    Dictionary<string, int> resultColumnDic = new Dictionary<string, int>
    {
        { "detailPageUrl", 0 },
        { "detailPageName", 1 },
        { "cookie", 2 },
        { "grabStatus", 3 },
        { "giveUpGrab", 4 },
        { "name", 5 }
    };
    ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, null);

    // URLs already written, so the same detail page is exported only once across all list pages.
    HashSet<string> seenUrls = new HashSet<string>();

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        string detailUrl = row["detailPageUrl"];
        if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
        {
            continue;
        }

        // No try/catch here: the former "catch { throw ex; }" only reset the stack trace.
        // Exceptions now propagate to the caller with their original trace.
        string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
        HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
        htmlDoc.Load(localFilePath, Encoding.GetEncoding("utf-8"));

        HtmlNodeCollection dinosaurNodes = htmlDoc.DocumentNode.SelectNodes("//ul[@class='dino-list dino-wrap']/li/a");
        if (dinosaurNodes == null)
        {
            // SelectNodes yields null when nothing matches: a page without links adds no rows.
            continue;
        }

        foreach (HtmlNode dinosaurNode in dinosaurNodes)
        {
            string name = dinosaurNode.InnerText.Trim();
            string url = "http://www.nhm.ac.uk" + dinosaurNode.GetAttributeValue("href", "");

            // HashSet.Add returns false for a URL that was already recorded.
            if (!seenUrls.Add(url))
            {
                continue;
            }

            Dictionary<string, string> f2vs = new Dictionary<string, string>();
            f2vs.Add("detailPageUrl", url);
            f2vs.Add("detailPageName", url);
            f2vs.Add("name", name);
            resultEW.AddRow(f2vs);
        }
    }

    resultEW.SaveToDisk();
}