Exemplo n.º 1
0
        /// <summary>
        /// Walks every downloaded category page referenced by <paramref name="listSheet"/> and
        /// writes one row per sub-category link into "美食天下_获取各小类菜谱列表页.xlsx" in the
        /// export directory.
        /// </summary>
        /// <param name="listSheet">
        /// List sheet whose rows point at the locally saved pages. Rows whose
        /// <c>SysConfig.GiveUpGrabFieldName</c> column equals "Y" are skipped.
        /// </param>
        private void GetListPageUrls(IListSheet listSheet)
        {
            string exportDir = this.RunPage.GetExportDir();

            // Column layout of the result workbook. Only detailPageUrl, detailPageName,
            // category and subCategory are filled in by this method.
            Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
            resultColumnDic.Add("detailPageUrl", 0);
            resultColumnDic.Add("detailPageName", 1);
            resultColumnDic.Add("cookie", 2);
            resultColumnDic.Add("grabStatus", 3);
            resultColumnDic.Add("giveUpGrab", 4);
            resultColumnDic.Add("category", 5);
            resultColumnDic.Add("subCategory", 6);

            string resultFilePath = Path.Combine(exportDir, "美食天下_获取各小类菜谱列表页.xlsx");
            ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, null);

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                // SelectNodes yields null (not an empty collection) when nothing matches.
                HtmlNodeCollection categoryDivList = pageHtmlDoc.DocumentNode.SelectNodes("//div[@class=\"category_sub clear\"]");
                if (categoryDivList == null)
                {
                    continue;
                }

                foreach (HtmlNode categoryDiv in categoryDivList)
                {
                    HtmlNode categoryNameNode = categoryDiv.SelectSingleNode("./h3");
                    string categoryName = categoryNameNode == null
                        ? ""
                        : CommonUtil.HtmlDecode(categoryNameNode.InnerText).Trim();

                    HtmlNodeCollection subCategoryNodeList = categoryDiv.SelectNodes("./ul/li/a");
                    if (subCategoryNodeList == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode subCategoryNode in subCategoryNodeList)
                    {
                        string subCategoryName = subCategoryNode.GetAttributeValue("title", "");
                        string subCategoryPageUrl = subCategoryNode.GetAttributeValue("href", "");

                        // The URL doubles as the page name.
                        Dictionary<string, string> f2vs = new Dictionary<string, string>();
                        f2vs.Add("detailPageUrl", subCategoryPageUrl);
                        f2vs.Add("detailPageName", subCategoryPageUrl);
                        f2vs.Add("category", categoryName);
                        f2vs.Add("subCategory", subCategoryName);

                        resultEW.AddRow(f2vs);
                    }
                }
            }

            resultEW.SaveToDisk();
        }
Exemplo n.º 2
0
        /// <summary>
        /// For every downloaded page in <paramref name="listSheet"/>, collects the links found under
        /// <c>div.info_cate.clearfix/dl/dd/a</c> and writes one result row per link, carrying over
        /// the "shiShu" and "leiXing" values of the originating list row.
        /// </summary>
        /// <param name="listSheet">List sheet whose rows point at the locally saved pages.</param>
        private void GetShiShuDetailPageUrls(IListSheet listSheet)
        {
            ExcelWriter resultEW = this.CreateResultWriter();

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> listRow = listSheet.GetRow(i);
                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                // SelectNodes yields null (not an empty collection) when nothing matches.
                HtmlNodeCollection linkNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"info_cate clearfix\"]/dl/dd/a");
                if (linkNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode linkNode in linkNodes)
                {
                    string juanName = CommonUtil.HtmlDecode(linkNode.InnerText).Trim();

                    // The href is prefixed with the site root to form the full URL.
                    string juanPageUrl = linkNode.GetAttributeValue("href", "");
                    string fullJuanPageUrl = "http://www.guoxuedashi.com" + juanPageUrl;

                    Dictionary<string, string> resultRow = new Dictionary<string, string>();
                    resultRow.Add("detailPageUrl", fullJuanPageUrl);
                    resultRow.Add("detailPageName", fullJuanPageUrl);
                    resultRow.Add("shiShu", listRow["shiShu"]);
                    resultRow.Add("leiXing", listRow["leiXing"]);
                    resultRow.Add("juan", juanName);
                    resultEW.AddRow(resultRow);
                }
            }

            resultEW.SaveToDisk();
        }
Exemplo n.º 3
0
        /// <summary>
        /// Extracts the two-level category tree from each downloaded page: every <c>h5</c> under
        /// <c>div#catDiv/div</c> is a level-1 category, and the <c>ul</c> at the same position
        /// holds its level-2 categories as <c>li</c> elements carrying a <c>catid</c> attribute.
        /// One row (cat1Name, cat2Name, cat2Code) is appended per level-2 category.
        /// </summary>
        /// <param name="listSheet">List sheet whose rows point at the locally saved pages.</param>
        /// <param name="pageSourceDir">Not used by this method; kept so existing callers still compile.</param>
        /// <param name="resultEW">Writer that receives the rows. It is not saved here.</param>
        private void GetCats(IListSheet listSheet, string pageSourceDir, ExcelWriter resultEW)
        {
            // Original author's note: the list sheet contains only one record.
            for (int i = 0; i < listSheet.RowCount; i++)
            {
                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                HtmlNodeCollection allCat1Nodes = htmlDoc.DocumentNode.SelectNodes("//div[@id=\"catDiv\"]/div/h5");
                HtmlNodeCollection allCat2GroupNodes = htmlDoc.DocumentNode.SelectNodes("//div[@id=\"catDiv\"]/div/ul");

                // SelectNodes yields null (not an empty collection) when nothing matches.
                if (allCat1Nodes == null || allCat2GroupNodes == null)
                {
                    continue;
                }

                // Headings and lists are paired by position, so stop at the shorter collection
                // instead of indexing past the end of the other one.
                for (int j = 0; j < allCat1Nodes.Count && j < allCat2GroupNodes.Count; j++)
                {
                    string cat1Name = allCat1Nodes[j].InnerText.Trim();

                    HtmlNodeCollection allCat2Nodes = allCat2GroupNodes[j].SelectNodes("./li");
                    if (allCat2Nodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode cat2Node in allCat2Nodes)
                    {
                        // A missing catid yields an empty code rather than a NullReferenceException.
                        string cat2Code = cat2Node.GetAttributeValue("catid", "");
                        string cat2Name = cat2Node.InnerText.Trim();

                        Dictionary<string, string> f2vs = new Dictionary<string, string>();
                        f2vs.Add("cat1Name", cat1Name);
                        f2vs.Add("cat2Name", cat2Name);
                        f2vs.Add("cat2Code", cat2Code);
                        resultEW.AddRow(f2vs);
                    }
                }
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Saves one record: appends a new row to the detail sheet, first copying the non-blank
        /// list-sheet values for every known column, then writing the extracted field values
        /// (which replace a copied value that targets the same column).
        /// </summary>
        /// <param name="listSheet">List sheet that holds the source row.</param>
        /// <param name="columnNameToIndex">Maps each detail-sheet column name to its cell index.</param>
        /// <param name="fieldValues">Extracted values keyed by column name; every key must exist in <paramref name="columnNameToIndex"/>.</param>
        /// <param name="rowIndex">Index of the list-sheet row this record belongs to.</param>
        /// <param name="pageUrl">URL the values were extracted from; must equal the URL stored in the list row.</param>
        /// <exception cref="Exception">The URL stored at <paramref name="rowIndex"/> differs from <paramref name="pageUrl"/>.</exception>
        public void SaveDetailFieldValue(IListSheet listSheet, Dictionary<string, int> columnNameToIndex, Dictionary<string, string> fieldValues, int rowIndex, string pageUrl)
        {
            Dictionary<string, string> listRow = listSheet.GetRow(rowIndex);
            string urlCellValue = listRow[SysConfig.DetailPageUrlFieldName];

            // Guard against writing values next to the wrong list row.
            if (urlCellValue != pageUrl)
            {
                throw new Exception("第" + rowIndex.ToString() + "行地址不匹配. Url_1 = " + pageUrl + ", Url_2 = " + urlCellValue);
            }

            IRow detailRow = this._DetailSheet.CreateRow(this._DetailSheet.LastRowNum + 1);

            foreach (KeyValuePair<string, int> column in columnNameToIndex)
            {
                string listValue;

                // Copy list-sheet values that actually carry content. The previous condition was
                // inverted and copied only null/blank values, so real data was never carried over.
                if (listRow.TryGetValue(column.Key, out listValue) && !CommonUtil.IsNullOrBlank(listValue))
                {
                    detailRow.CreateCell(column.Value).SetCellValue(listValue);
                }
            }

            foreach (KeyValuePair<string, string> field in fieldValues)
            {
                int index = columnNameToIndex[field.Key];
                ICell cell = detailRow.CreateCell(index);
                cell.SetCellValue(field.Value);
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Rebuilds <c>RunPage.NeedGrabIndexs</c> with the index of every detail page for which
        /// <c>CheckNeedGrab</c> is true and <c>RunPage.CheckGiveUpGrabPage</c> is false, logging
        /// progress after every 1000 pages examined.
        /// </summary>
        /// <param name="listSheet">List sheet supplying the row for each detail page index.</param>
        /// <param name="sourceDir">Directory passed to <c>RunPage.GetFilePath</c> to resolve each page's local path.</param>
        /// <returns>The number of indexes collected.</returns>
        private int InitGrabDetailPageIndexList(IListSheet listSheet, string sourceDir)
        {
            this.RunPage.NeedGrabIndexs = new List<int>();
            this.RunPage.InvokeAppendLogText("开始统计需要下载的页面.", LogLevelType.System, true);

            for (int index = 0; index < this.RunPage.DetailPageUrlList.Count; index++)
            {
                string url = this.RunPage.DetailPageUrlList[index];
                string localPath = this.RunPage.GetFilePath(url, sourceDir);

                // The give-up check is only consulted for pages that need grabbing at all.
                bool needGrab = this.CheckNeedGrab(listSheet.GetRow(index), localPath);
                if (needGrab && !this.RunPage.CheckGiveUpGrabPage(listSheet, url, index))
                {
                    this.RunPage.NeedGrabIndexs.Add(index);
                }

                int examined = index + 1;
                if (examined % 1000 == 0)
                {
                    double fraction = (double)examined / (double)this.RunPage.DetailPageUrlList.Count;
                    this.RunPage.InvokeAppendLogText("正在统计需要下载的页面..." + fraction.ToString("#0.00%"), LogLevelType.System, true);
                }
            }

            this.RunPage.InvokeAppendLogText("完成统计需要下载的页面.", LogLevelType.System, true);
            return this.RunPage.NeedGrabIndexs.Count;
        }
Exemplo n.º 6
0
        /// <summary>
        /// For every list row that is not marked as given up, opens the locally saved file of
        /// its detail page as CSV, takes the field values at index 0, and appends them to
        /// "翻译结果.xlsx" in the export directory.
        /// </summary>
        /// <param name="listSheet">
        /// List sheet providing "detailPageUrl" and the <c>SysConfig.GiveUpGrabFieldName</c>
        /// flag for each row.
        /// </param>
        private void GetList(IListSheet listSheet)
        {
            string exportDir = this.RunPage.GetExportDir();
            string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

            Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
            resultColumnDic.Add("fromName", 0);
            resultColumnDic.Add("toCode", 1);

            string resultFilePath = Path.Combine(exportDir, "翻译结果.xlsx");
            ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, null);

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                string detailUrl = row["detailPageUrl"];
                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                // No try/catch here: the old catch block only did "throw ex;", which discards
                // the original stack trace. Exceptions now propagate with full detail.
                string resultTextFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);

                CsvReader csvReader = new CsvReader(resultTextFilePath);
                Dictionary<string, string> f2vs = csvReader.GetFieldValues(0);
                resultEW.AddRow(f2vs);
            }

            resultEW.SaveToDisk();
        }
Exemplo n.º 7
0
        /// <summary>
        /// Reads the saved response of every level-1 area page, parses the JSON object it
        /// contains and appends one row per entry of its "data" array (the level-2 areas),
        /// including the URL that queries the level-3 areas of that entry.
        /// </summary>
        /// <param name="listSheet">List sheet providing page URLs plus "areaLevel1Code" and "areaLevel1Name" per row.</param>
        /// <param name="pageSourceDir">Directory used to resolve the local file of each page.</param>
        /// <param name="resultEW">Writer that receives the rows. It is not saved here.</param>
        private void GetCities(IListSheet listSheet, string pageSourceDir, ExcelWriter resultEW)
        {
            for (int rowIndex = 0; rowIndex < listSheet.RowCount; rowIndex++)
            {
                string url = listSheet.PageUrlList[rowIndex];
                Dictionary<string, string> parentRow = listSheet.GetRow(rowIndex);
                string level1Code = parentRow["areaLevel1Code"];
                string level1Name = parentRow["areaLevel1Name"];

                string responseText = FileHelper.GetTextFromFile(this.RunPage.GetFilePath(url, pageSourceDir));

                // Keep only the text from the first '{' through the last '}', dropping whatever
                // surrounds the JSON object in the saved file.
                int firstBrace = responseText.IndexOf("{");
                int lastBrace = responseText.LastIndexOf("}");
                string json = responseText.Substring(firstBrace, lastBrace - firstBrace + 1);

                JArray areas = JObject.Parse(json).SelectToken("data") as JArray;
                for (int areaIndex = 0; areaIndex < areas.Count; areaIndex++)
                {
                    JObject area = areas[areaIndex] as JObject;
                    string level2Code = (area.SelectToken("id") as JValue).Value.ToString();
                    string level2Name = (area.SelectToken("name") as JValue).Value.ToString();

                    resultEW.AddRow(new Dictionary<string, string>
                    {
                        { "detailPageUrl", "http://autobeta.jd.com/queryAreaList?area_lev=3&area_id=" + level2Code + "&callback=jQuery7711772&_=1469734421125" },
                        { "detailPageName", level2Code + level2Name },
                        { "areaLevel1Code", level1Code },
                        { "areaLevel1Name", level1Name },
                        { "areaLevel2Code", level2Code },
                        { "areaLevel2Name", level2Name }
                    });
                }
            }
        }
Exemplo n.º 8
0
        /// <summary>
        /// Extracts the province list from each downloaded page: every <c>li</c> in the first
        /// <c>ul</c> of <c>div#listTab</c> is a province whose code is its <c>data-value</c>
        /// attribute. One row is appended per province, including the URL of its shop page.
        /// </summary>
        /// <param name="listSheet">List sheet whose rows point at the locally saved pages.</param>
        /// <param name="pageSourceDir">Not used by this method; kept so existing callers still compile.</param>
        /// <param name="resultEW">Writer that receives the rows. It is not saved here.</param>
        private void GetProvinces(IListSheet listSheet, string pageSourceDir, ExcelWriter resultEW)
        {
            // Original author's note: the list sheet contains only one record.
            for (int i = 0; i < listSheet.RowCount; i++)
            {
                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                // SelectNodes yields null (not an empty collection) when nothing matches.
                HtmlNodeCollection allProvinceNodes = htmlDoc.DocumentNode.SelectNodes("//div[@id=\"listTab\"]/ul[1]/li");
                if (allProvinceNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode provinceNode in allProvinceNodes)
                {
                    // Without a code no shop URL can be built, so such items are skipped
                    // instead of failing on a missing attribute.
                    string provinceCode = provinceNode.GetAttributeValue("data-value", "");
                    if (provinceCode.Length == 0)
                    {
                        continue;
                    }

                    string provinceName = provinceNode.InnerText;

                    Dictionary<string, string> f2vs = new Dictionary<string, string>();
                    f2vs.Add("detailPageUrl", "http://www.tuhu.cn/Shops/" + provinceCode + ".aspx");
                    f2vs.Add("detailPageName", provinceCode + provinceName);
                    f2vs.Add("provinceCode", provinceCode);
                    f2vs.Add("provinceName", provinceName);
                    resultEW.AddRow(f2vs);
                }
            }
        }
Exemplo n.º 9
0
        /// <summary>
        /// Extracts the city links from each downloaded province page: every <c>a</c> under the
        /// second <c>ul</c> of <c>div#listTab</c> is a city. The city code is the last path
        /// segment of the link up to its first dot. One row is appended per city, carrying
        /// over the province code and name of the originating list row.
        /// </summary>
        /// <param name="listSheet">List sheet providing "provinceCode" and "provinceName" per row.</param>
        /// <param name="pageSourceDir">Not used by this method; kept so existing callers still compile.</param>
        /// <param name="resultEW">Writer that receives the rows. It is not saved here.</param>
        private void ReadCityPages(IListSheet listSheet, string pageSourceDir, ExcelWriter resultEW)
        {
            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                string provinceCode = row["provinceCode"];
                string provinceName = row["provinceName"];
                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                // SelectNodes yields null (not an empty collection) when nothing matches.
                HtmlNodeCollection allCityNodes = htmlDoc.DocumentNode.SelectNodes("//div[@id=\"listTab\"]/ul[2]/li/a");
                if (allCityNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode cityNode in allCityNodes)
                {
                    string cityUrl = cityNode.GetAttributeValue("href", "");

                    // A missing href or a bare "/" leaves no path segment to derive a code from.
                    string[] cityUrlPieces = cityUrl.Split(new string[] { "/" }, StringSplitOptions.RemoveEmptyEntries);
                    if (cityUrlPieces.Length == 0)
                    {
                        continue;
                    }

                    string[] cityPageNamePieces = cityUrlPieces[cityUrlPieces.Length - 1].Split(new string[] { "." }, StringSplitOptions.RemoveEmptyEntries);
                    if (cityPageNamePieces.Length == 0)
                    {
                        continue;
                    }

                    string cityCode = cityPageNamePieces[0];
                    string cityName = cityNode.InnerText;

                    Dictionary<string, string> f2vs = new Dictionary<string, string>();
                    f2vs.Add("detailPageUrl", cityUrl);
                    f2vs.Add("detailPageName", cityCode + cityName);
                    f2vs.Add("provinceCode", provinceCode);
                    f2vs.Add("provinceName", provinceName);
                    f2vs.Add("cityCode", cityCode);
                    f2vs.Add("cityName", cityName);
                    resultEW.AddRow(f2vs);
                }
            }
        }
Exemplo n.º 10
0
        /// <summary>
        /// Processes one detail page: downloads it through <c>GrabDetailPage</c> unless its local
        /// file already exists, then reports the outcome and the elapsed time to the run page.
        /// </summary>
        /// <param name="listSheet">List sheet that holds the row for this page.</param>
        /// <param name="detailPageIndex">Zero-based index of the page in the run page's URL and cookie lists.</param>
        /// <param name="detailPageInfo">Detail-page settings passed through to <c>GrabDetailPage</c>.</param>
        /// <param name="sourceDir">Directory used to resolve the page's local file path.</param>
        private void ThreadGrabDetailPage(IListSheet listSheet, int detailPageIndex, Proj_Detail_SingleLine detailPageInfo, string sourceDir)
        {
            DateTime startTime = DateTime.Now;

            string url = this.RunPage.DetailPageUrlList[detailPageIndex];
            string pageCookie = this.RunPage.DetailPageCookieList[detailPageIndex];
            string localPath = this.RunPage.GetFilePath(url, sourceDir);
            Dictionary<string, string> rowValues = listSheet.GetRow(detailPageIndex);

            // A page whose file is already on disk counts as a success without another download.
            bool ok;
            if (File.Exists(localPath))
            {
                ok = true;
            }
            else
            {
                ok = this.GrabDetailPage(listSheet, url, rowValues, localPath, detailPageIndex, detailPageInfo, pageCookie);
            }

            this.RunPage.RefreshGrabCount(ok);

            DateTime endTime = DateTime.Now;
            TimeSpan elapsed = endTime - startTime;

            string threadId = Thread.CurrentThread.ManagedThreadId.ToString();
            string pageNumber = (detailPageIndex + 1).ToString();
            string logText = "线程" + threadId + ": 抓取了第" + pageNumber + "个页面, 用时" + elapsed.TotalSeconds.ToString("0.00") + "秒";
            this.RunPage.InvokeAppendLogText(logText, LogLevelType.Normal, false);

            this.RunPage.RecordGrabDetailStatus(ok, startTime, endTime);
        }
Exemplo n.º 11
0
        /// <summary>
        /// Loads the locally saved document of every list row that is not marked as given up and
        /// then saves the Excel writer. No rows are added to the writer: nothing is extracted
        /// from the loaded documents yet.
        /// </summary>
        /// <param name="listSheet">List sheet providing the <c>SysConfig.GiveUpGrabFieldName</c> flag per row.</param>
        /// <returns>Always <c>true</c>.</returns>
        private bool GetAllListPageUrls(IListSheet listSheet)
        {
            ExcelWriter ew = this.GetExcelWriter();

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                if (!"Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    // The document is loaded but not used. The call is kept because its side
                    // effects, if any, are not visible from here. NOTE(review): confirm whether
                    // it can be dropped.
                    this.RunPage.GetLocalHtmlDocument(listSheet, i);
                }
            }

            ew.SaveToDisk();
            return true;
        }
        /// <summary>
        /// Collects the pager links (<c>div.gclear.pp.bt.center.f14/a</c>) from every downloaded
        /// page and writes each distinct full URL once to the result writer.
        /// </summary>
        /// <param name="listSheet">List sheet whose rows point at the locally saved pages.</param>
        private void GetListPageUrls(IListSheet listSheet)
        {
            ExcelWriter resultEW = this.CreateResultWriter();

            // URLs already written; the same pager link appears on many pages.
            HashSet<string> seenPageUrls = new HashSet<string>();

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                // SelectNodes yields null (not an empty collection) when nothing matches.
                HtmlNodeCollection pageUrlNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"gclear pp bt center f14\"]/a");
                if (pageUrlNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode pageUrlNode in pageUrlNodes)
                {
                    string pageUrl = pageUrlNode.GetAttributeValue("href", "");
                    string fullPageUrl = "https://chengyu.911cha.com/" + pageUrl;

                    // HashSet.Add returns false when the URL was seen before.
                    if (seenPageUrls.Add(fullPageUrl))
                    {
                        Dictionary<string, string> resultRow = new Dictionary<string, string>();
                        resultRow.Add("detailPageUrl", fullPageUrl);
                        resultRow.Add("detailPageName", fullPageUrl);
                        resultEW.AddRow(resultRow);
                    }
                }
            }

            resultEW.SaveToDisk();
        }
Exemplo n.º 13
0
        /// <summary>
        /// Post-processing step: for every list row whose "giveUpGrab" value is not "Y", reads the
        /// locally saved page as UTF-8, selects the rows of <c>table[role=list]/tbody</c> and
        /// hands them to <c>GetInfos</c>.
        /// </summary>
        /// <param name="listSheet">List sheet providing "detailPageUrl" and "giveUpGrab" per row.</param>
        /// <returns>Always <c>true</c>.</returns>
        public override bool AfterAllGrab(IListSheet listSheet)
        {
            string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                string detailUrl = row["detailPageUrl"];

                if (row["giveUpGrab"] == "Y")
                {
                    continue;
                }

                string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);

                // Dispose the reader so the file handle is released right away. Previously
                // every processed page leaked an open StreamReader.
                string webPageHtml;
                using (StreamReader reader = new StreamReader(localFilePath, Encoding.UTF8))
                {
                    webPageHtml = reader.ReadToEnd();
                }

                HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
                htmlDoc.LoadHtml(webPageHtml);

                // itemNodes is null when the page has no matching rows; it is passed on as-is.
                HtmlNodeCollection itemNodes = htmlDoc.DocumentNode.SelectNodes("//table[@role=\"list\"]/tbody/tr");
                this.GetInfos(itemNodes);
            }

            return true;
        }
Exemplo n.º 14
0
        /// <summary>
        /// Extracts the shop links from each downloaded city page: every <c>a</c> under
        /// <c>div#cityMapLeft/div/b</c> is a shop. The shop code is the last path segment of the
        /// link up to its first dot. One row is appended per shop, carrying over the province
        /// name, city code and city name of the originating list row.
        /// </summary>
        /// <param name="listSheet">List sheet providing "provinceName", "cityName" and "cityCode" per row.</param>
        /// <param name="pageSourceDir">Not used by this method; kept so existing callers still compile.</param>
        /// <param name="resultEW">Writer that receives the rows. It is not saved here.</param>
        private void GetShopList(IListSheet listSheet, string pageSourceDir, ExcelWriter resultEW)
        {
            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                string provinceName = row["provinceName"];
                string cityName = row["cityName"];
                string cityCode = row["cityCode"];
                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                // SelectNodes yields null (not an empty collection) when nothing matches.
                HtmlNodeCollection allShopNodes = htmlDoc.DocumentNode.SelectNodes("//div[@id=\"cityMapLeft\"]/div/b/a");
                if (allShopNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode shopNode in allShopNodes)
                {
                    string shopUrl = shopNode.GetAttributeValue("href", "");

                    // A missing href or a bare "/" leaves no path segment to derive a code from.
                    string[] shopPieces = shopUrl.Split(new string[] { "/" }, StringSplitOptions.RemoveEmptyEntries);
                    if (shopPieces.Length == 0)
                    {
                        continue;
                    }

                    // Take the text before the first dot. When the segment has no dot, use it
                    // whole rather than calling Substring with a length of -1.
                    string shopCodeStr = shopPieces[shopPieces.Length - 1];
                    int dotIndex = shopCodeStr.IndexOf('.');
                    string shopCode = dotIndex < 0 ? shopCodeStr : shopCodeStr.Substring(0, dotIndex);
                    string shopName = shopNode.InnerText.Trim();

                    Dictionary<string, string> f2vs = new Dictionary<string, string>();
                    f2vs.Add("detailPageUrl", shopUrl);
                    f2vs.Add("detailPageName", shopCode + shopName);
                    f2vs.Add("provinceName", provinceName);
                    f2vs.Add("cityCode", cityCode);
                    f2vs.Add("cityName", cityName);
                    f2vs.Add("shopCode", shopCode);
                    f2vs.Add("shopName", shopName);
                    resultEW.AddRow(f2vs);
                }
            }
        }
Exemplo n.º 15
0
        /// <summary>
        /// Merges the per-sub-category workbooks into a single mapping workbook
        /// ("美食天下_分类与菜谱列表对照.xlsx" in the export directory). For every list row, the
        /// workbook stored at that row's local file path is opened and its category,
        /// subCategory, name and url values are copied row by row.
        /// </summary>
        /// <param name="listSheet">List sheet providing the detail page URL of each sub-category workbook.</param>
        private void GetCategoryToPageUrls(IListSheet listSheet)
        {
            string exportDir = this.RunPage.GetExportDir();
            string resultFilePath = Path.Combine(exportDir, "美食天下_分类与菜谱列表对照.xlsx");

            // Resolved once: the source directory was previously looked up again for every row.
            string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

            ExcelWriter resultEW = this.CreateSubCategoryMapWriter(resultFilePath);

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                string detailPageUrl = row[SysConfig.DetailPageUrlFieldName];
                string subCategoryFilePath = this.RunPage.GetFilePath(detailPageUrl, pageSourceDir);

                ExcelReader er = new ExcelReader(subCategoryFilePath);
                int rowCount = er.GetRowCount();
                for (int j = 0; j < rowCount; j++)
                {
                    Dictionary<string, string> subRow = er.GetFieldValues(j);

                    // category and subCategory come from the sub-category workbook itself,
                    // not from the list row.
                    Dictionary<string, string> mapRow = new Dictionary<string, string>();
                    mapRow.Add("category", subRow["category"]);
                    mapRow.Add("subCategory", subRow["subCategory"]);
                    mapRow.Add("name", subRow["name"]);
                    mapRow.Add("url", subRow["url"]);
                    resultEW.AddRow(mapRow);
                }
            }

            resultEW.SaveToDisk();
        }
Exemplo n.º 16
0
        /// <summary>
        /// Extracts the title (<c>h2</c>) and description (<c>p</c>) from the
        /// <c>div.info_txt2.clearfix</c> block of every downloaded page and writes them, together
        /// with the "renWu" and "shiDai" values and the URL of the list row, to the result writer.
        /// </summary>
        /// <param name="listSheet">List sheet providing "renWu", "shiDai" and the detail page URL per row.</param>
        /// <exception cref="InvalidOperationException">A page does not contain the main info block.</exception>
        private void GetRenWuInfos(IListSheet listSheet)
        {
            ExcelWriter resultEW = this.CreateRenWuResultWriter();

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> listRow = listSheet.GetRow(i);
                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                // No try/catch here: the old catch block only did "throw ex;", which discards
                // the original stack trace.
                HtmlNode mainInfoNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@class=\"info_txt2 clearfix\"]");
                if (mainInfoNode == null)
                {
                    // Fail with a message that identifies the page instead of a bare
                    // NullReferenceException.
                    throw new InvalidOperationException("Main info block not found for row " + i.ToString() + ". Url = " + listRow[SysConfig.DetailPageUrlFieldName]);
                }

                // Title and description fall back to an empty string when absent.
                HtmlNode titleNode = mainInfoNode.SelectSingleNode("./h2");
                string renWuTitle = titleNode == null ? "" : CommonUtil.HtmlDecode(titleNode.InnerText).Trim();
                HtmlNode descriptionNode = mainInfoNode.SelectSingleNode("./p");
                string description = descriptionNode == null ? "" : CommonUtil.HtmlDecode(descriptionNode.InnerText).Trim();

                Dictionary<string, string> resultRow = new Dictionary<string, string>();
                resultRow.Add("人物", listRow["renWu"]);
                resultRow.Add("时代", listRow["shiDai"]);
                resultRow.Add("人物页面标题", renWuTitle);
                resultRow.Add("简介", description);
                resultRow.Add("url", listRow[SysConfig.DetailPageUrlFieldName]);
                resultEW.AddRow(resultRow);
            }

            resultEW.SaveToDisk();
        }
Exemplo n.º 17
0
        /// <summary>
        /// Collects the links under <c>div.cont</c> from every downloaded page that is not marked
        /// as given up and writes one row (URL, page name, link text) per link.
        /// </summary>
        /// <param name="listSheet">List sheet providing the <c>SysConfig.GiveUpGrabFieldName</c> flag per row.</param>
        private void GetListPageUrls(IListSheet listSheet)
        {
            ExcelWriter ew = this.CreateWriter();

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> listRow = listSheet.GetRow(i);
                if ("Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                // No try/catch here: the old catch block only did "throw ex;", which discards
                // the original stack trace.
                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                // SelectNodes yields null (not an empty collection) when nothing matches.
                HtmlNodeCollection linkNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"cont\"]/a");
                if (linkNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode linkNode in linkNodes)
                {
                    // The href is prefixed with the site root to form the full URL.
                    string url = "http://www.lszj.com" + linkNode.GetAttributeValue("href", "");
                    string name = CommonUtil.HtmlDecode(linkNode.InnerText).Trim();

                    Dictionary<string, string> row = new Dictionary<string, string>();
                    row.Add("detailPageUrl", url);
                    row.Add("detailPageName", url);
                    row.Add("name", name);
                    ew.AddRow(row);
                }
            }

            ew.SaveToDisk();
        }
Exemplo n.º 18
0
        /// <summary>
        /// Extracts the table rows from every downloaded page that is not marked as given up and
        /// writes them to the CSV writer. A table row is exported only when its first cell has
        /// <c>data-header="序号"</c>; cells 1 to 6 map to the six output columns, and cells that
        /// are absent are written as empty strings.
        /// </summary>
        /// <param name="listSheet">List sheet providing "companyId" and the <c>SysConfig.GiveUpGrabFieldName</c> flag per row.</param>
        private void GetAllInfos(IListSheet listSheet)
        {
            CsvWriter cw = this.GetCsvExcelWriter();

            // Output columns in cell order: cell index 1 maps to the first name, and so on.
            string[] cellColumnNames = { "资质类别", "资质证书号", "资质名称", "发证日期", "证件有效期", "发证机关" };

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                string companyId = row["companyId"];

                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                // SelectNodes yields null (not an empty collection) when nothing matches.
                HtmlNodeCollection trNodeList = pageHtmlDoc.DocumentNode.SelectNodes("//table/tbody/tr");
                if (trNodeList == null)
                {
                    continue;
                }

                foreach (HtmlNode trNode in trNodeList)
                {
                    // Rows with no td cells (for example header rows) have nothing to export.
                    HtmlNodeCollection tdNodeList = trNode.SelectNodes("./td");
                    if (tdNodeList == null || tdNodeList.Count == 0)
                    {
                        continue;
                    }

                    if (tdNodeList[0].GetAttributeValue("data-header", "") != "序号")
                    {
                        continue;
                    }

                    Dictionary<string, string> f2vs = new Dictionary<string, string>();
                    f2vs.Add("CompanyId", companyId);
                    for (int c = 0; c < cellColumnNames.Length; c++)
                    {
                        int tdIndex = c + 1;
                        string cellText = tdIndex < tdNodeList.Count ? tdNodeList[tdIndex].InnerText.Trim() : "";
                        f2vs.Add(cellColumnNames[c], cellText);
                    }

                    cw.AddRow(f2vs);
                }
            }

            cw.SaveToDisk();
        }
Exemplo n.º 19
0
        /// <summary>
        /// Re-saves every grabbed detail page into the "LocalHtml" folder below the export
        /// directory after setting <c>border="1"</c> on the first table under the page body.
        /// </summary>
        /// <param name="listSheet">List sheet; rows whose give-up flag equals "Y" are skipped.</param>
        /// <returns>Always <c>true</c>; failures surface as exceptions.</returns>
        /// <exception cref="InvalidOperationException">
        /// A page contains no node matching <c>//body/table[1]</c>.
        /// </exception>
        private bool GenerateNewPage(IListSheet listSheet)
        {
            string exportDir    = this.RunPage.GetExportDir();
            string localHtmlDir = Path.Combine(exportDir, "LocalHtml");

            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(localHtmlDir);

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                string url  = row[SysConfig.DetailPageUrlFieldName];
                string name = row[SysConfig.DetailPageNameFieldName];

                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
                HtmlNode tableNode = htmlDoc.DocumentNode.SelectSingleNode("//body/table[1]");
                if (tableNode == null)
                {
                    throw new InvalidOperationException("No table found under the page body, pageUrl = " + url);
                }

                // SetAttributeValue adds the attribute when the table has none; the
                // Attributes["border"] indexer would return null in that case.
                tableNode.SetAttributeValue("border", "1");

                string destFilePath = Path.Combine(localHtmlDir, name + ".html");
                htmlDoc.Save(destFilePath);
            }

            return true;
        }
        /// <summary>
        /// Builds the "企业数据_企业工商信息列表页.xlsx" list file: one row per distinct company
        /// name found in <paramref name="listSheet"/>, pointing at a tianyancha search URL.
        /// </summary>
        /// <remarks>
        /// Rows whose give-up flag equals "Y" are ignored. The company name is trimmed and the
        /// substrings "造价企业" and "测试企业" are removed before de-duplication; only the
        /// first row seen for each resulting name is written.
        /// </remarks>
        /// <param name="listSheet">List sheet holding the grabbed company rows.</param>
        /// <returns>Always <c>true</c>.</returns>
        public override bool AfterAllGrab(IListSheet listSheet)
        {
            // Result columns in sheet order; the array index is the column index.
            string[] columnNames =
            {
                "detailPageUrl",
                "detailPageName",
                "cookie",
                "grabStatus",
                "giveUpGrab",
                "CompanyId",
                "企业名称",
                "统一社会信用代码",
                "企业法定代表人",
                "企业登记注册类型",
                "企业注册属地",
                "企业经营地址",
                "addressParts"
            };

            // Columns copied unchanged from the source row into the result row.
            string[] copiedColumns =
            {
                "统一社会信用代码",
                "企业法定代表人",
                "企业登记注册类型",
                "企业注册属地",
                "企业经营地址"
            };

            string outputDir = this.RunPage.GetExportDir();

            Dictionary<string, int> columnIndexByName = new Dictionary<string, int>();
            for (int c = 0; c < columnNames.Length; c++)
            {
                columnIndexByName.Add(columnNames[c], c);
            }

            string      outputPath = Path.Combine(outputDir, "企业数据_企业工商信息列表页.xlsx");
            ExcelWriter writer     = new ExcelWriter(outputPath, "List", columnIndexByName, null);

            HashSet<string> seenCompanyNames = new HashSet<string>();

            for (int rowIndex = 0; rowIndex < listSheet.RowCount; rowIndex++)
            {
                Dictionary<string, string> sourceRow = listSheet.GetRow(rowIndex);
                if ("Y".Equals(sourceRow[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                string companyName = sourceRow["企业名称"].Trim().Replace("造价企业", "").Replace("测试企业", "");

                // Add returns false when the name was already recorded.
                if (!seenCompanyNames.Add(companyName))
                {
                    continue;
                }

                Dictionary<string, string> resultRow = new Dictionary<string, string>();
                resultRow.Add("detailPageUrl", "https://www.tianyancha.com/search?key=" + companyName);
                resultRow.Add("detailPageName", sourceRow["CompanyId"]);
                resultRow.Add("CompanyId", sourceRow["CompanyId"]);
                resultRow.Add("企业名称", companyName);
                foreach (string columnName in copiedColumns)
                {
                    resultRow.Add(columnName, sourceRow[columnName]);
                }
                resultRow.Add("addressParts", this.GetAddresParts(sourceRow));

                writer.AddRow(resultRow);
            }

            writer.SaveToDisk();

            return true;
        }
Exemplo n.º 21
0
        /// <summary>
        /// Collects the distinct city links from every grabbed page and writes them, with the
        /// city code cut out of the URL, to "安居客城市列表.xlsx".
        /// </summary>
        /// <remarks>
        /// Rows whose give-up flag equals "Y" are skipped. The city code is the URL text
        /// between "com/" and "/commu"; links that lack either marker, or where the markers
        /// are out of order, are ignored. A page without any matching link contributes nothing.
        /// </remarks>
        /// <param name="listSheet">List sheet describing the grabbed pages.</param>
        private void GetCityList(IListSheet listSheet)
        {
            string exportDir = this.RunPage.GetExportDir();

            Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
            resultColumnDic.Add("code", 0);
            resultColumnDic.Add("name", 1);
            resultColumnDic.Add("url", 2);
            string      resultFilePath = Path.Combine(exportDir, "安居客城市列表.xlsx");
            ExcelWriter resultEW       = new ExcelWriter(resultFilePath, "List", resultColumnDic, null);

            HashSet<string> seenUrls = new HashSet<string>();

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                // SelectNodes returns null (not an empty collection) when nothing matches.
                HtmlNodeCollection allCityNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"cl-c-list\"]/ul[@class=\"cl-c-l-ul\"]/li[@class=\"cl-c-l-li\"]/a");
                if (allCityNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode cityNode in allCityNodes)
                {
                    string url = cityNode.GetAttributeValue("href", "");

                    int hostEndIndex     = url.IndexOf("com/", StringComparison.Ordinal);
                    int cityCodeEndIndex = url.IndexOf("/commu", StringComparison.Ordinal);
                    if (hostEndIndex < 0)
                    {
                        continue;
                    }

                    int cityCodeFromIndex = hostEndIndex + 4;
                    if (cityCodeEndIndex < cityCodeFromIndex)
                    {
                        // Also covers a missing "/commu" (-1); avoids a negative Substring length.
                        continue;
                    }

                    // Add returns false when the URL was already exported.
                    if (!seenUrls.Add(url))
                    {
                        continue;
                    }

                    string code = url.Substring(cityCodeFromIndex, cityCodeEndIndex - cityCodeFromIndex);
                    string name = CommonUtil.HtmlDecode(cityNode.InnerText.Trim()).Trim();

                    Dictionary<string, string> f2vs = new Dictionary<string, string>();
                    f2vs.Add("code", code);
                    f2vs.Add("name", name);
                    f2vs.Add("url", url);
                    resultEW.AddRow(f2vs);
                }
            }

            resultEW.SaveToDisk();
        }
Exemplo n.º 22
0
        /// <summary>
        /// Parses the summary of every grabbed entry page and writes one result row per page
        /// that has a summary, then saves the result writer to disk.
        /// </summary>
        /// <remarks>
        /// Rows whose give-up flag equals "Y" are skipped. Pages containing the
        /// "您所访问的页面不存在" text, and pages without a <c>lemma-summary</c> div, are logged
        /// as errors and produce no row. Any exception raised while parsing a page is logged
        /// together with the page URL and then rethrown.
        /// </remarks>
        /// <param name="listSheet">List sheet; rows must provide "yearValue" and "yearName".</param>
        private void GetYearInfos(IListSheet listSheet)
        {
            string sourceDir = this.RunPage.GetDetailSourceFileDir();

            ExcelWriter resultEW = this.CreateResultWriter();

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> listRow = listSheet.GetRow(i);
                bool   giveUp        = "Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]);
                string detailPageUrl = listRow[SysConfig.DetailPageUrlFieldName];
                if (giveUp)
                {
                    continue;
                }

                try
                {
                    string localFilePath = this.RunPage.GetFilePath(detailPageUrl, sourceDir);
                    string html          = FileHelper.GetTextFromFile(localFilePath, Encoding.UTF8);
                    if (html.Contains("您所访问的页面不存在"))
                    {
                        this.RunPage.InvokeAppendLogText("放弃解析此页, 所访问的页面不存在, pageUrl = " + detailPageUrl, LogLevelType.Error, true);
                        continue;
                    }

                    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
                    htmlDoc.LoadHtml(html);

                    HtmlNode mainInfoNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@class=\"lemma-summary\"]");
                    if (mainInfoNode == null)
                    {
                        this.RunPage.InvokeAppendLogText("此词条不存在摘要信息, pageUrl = " + detailPageUrl, LogLevelType.Error, true);
                        continue;
                    }

                    // The entry id and title are read from attributes of the promotion div.
                    HtmlNode itemBaseInfoNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@class=\"lemmaWgt-promotion-rightPreciseAd\"]");
                    string   itemId           = itemBaseInfoNode.GetAttributeValue("data-lemmaid", "");
                    string   itemName         = itemBaseInfoNode.GetAttributeValue("data-lemmatitle", "");

                    string mainInfo = CommonUtil.HtmlDecode(mainInfoNode.InnerText).Trim();

                    Dictionary<string, string> newRow = new Dictionary<string, string>();
                    newRow.Add("url", detailPageUrl);
                    newRow.Add("yearValue", listRow["yearValue"]);
                    newRow.Add("yearName", listRow["yearName"]);
                    newRow.Add("itemId", itemId);
                    newRow.Add("itemName", itemName);
                    newRow.Add("mainInfo", mainInfo);
                    resultEW.AddRow(newRow);
                }
                catch (Exception ex)
                {
                    this.RunPage.InvokeAppendLogText(ex.Message + ". 解析出错, pageUrl = " + detailPageUrl, LogLevelType.Error, true);

                    // Bare rethrow keeps the original stack trace ("throw ex;" would reset it).
                    throw;
                }
            }

            resultEW.SaveToDisk();
        }
Exemplo n.º 23
0
 /// <summary>
 /// Loads the input parameters needed for crawling (keywords, login name, login password
 /// and seed page URL) from the first row of the list sheet.
 /// </summary>
 /// <param name="listSheet">List sheet whose row 0 carries the seed values.</param>
 private void GetSeedInfoFromListSheet(IListSheet listSheet)
 {
     const int seedRowIndex = 0;
     Dictionary<string, string> seedRow = listSheet.GetRow(seedRowIndex);

     this.KeyWords      = seedRow["keyWords"];
     this.LoginName     = seedRow["loginName"];
     this.LoginPassword = seedRow["loginPassword"];
     this.SeedPageUrl   = seedRow[SysConfig.DetailPageUrlFieldName];
 }
Exemplo n.º 24
0
        /// <summary>
        /// Expands every grabbed project page into one list row per building-list page and
        /// writes the rows to "济南楼盘_楼列表页.xlsx".
        /// </summary>
        /// <remarks>
        /// Rows whose give-up flag equals "Y" are skipped. The number of pages is read from
        /// the <c>value</c> attribute of the <c>input#allpage</c> element; pages without that
        /// element produce no rows. Page indexes in the generated URLs start at 1.
        /// </remarks>
        /// <param name="listSheet">List sheet; rows must provide "projectId" and "projectName".</param>
        /// <returns>Always <c>true</c>; failures surface as exceptions.</returns>
        /// <exception cref="FormatException">The page count attribute is not an integer.</exception>
        private bool GetBuildingListPageUrls(IListSheet listSheet)
        {
            string exportDir = this.RunPage.GetExportDir();

            Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
            resultColumnDic.Add("detailPageUrl", 0);
            resultColumnDic.Add("detailPageName", 1);
            resultColumnDic.Add("cookie", 2);
            resultColumnDic.Add("grabStatus", 3);
            resultColumnDic.Add("giveUpGrab", 4);
            resultColumnDic.Add("projectId", 5);
            resultColumnDic.Add("projectName", 6);
            resultColumnDic.Add("pageIndex", 7);

            string resultFilePath = Path.Combine(exportDir, "济南楼盘_楼列表页.xlsx");
            Dictionary<string, string> resultColumnFormat = new Dictionary<string, string>();
            ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, resultColumnFormat);

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                string url         = row[SysConfig.DetailPageUrlFieldName];
                string projectId   = row["projectId"];
                string projectName = row["projectName"];

                HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
                HtmlNode pageCountNode = pageHtmlDoc.DocumentNode.SelectSingleNode("//input[@id=\"allpage\"]");
                if (pageCountNode == null)
                {
                    continue;
                }

                string pageCountText = pageCountNode.GetAttributeValue("value", "");
                int    pageCount;
                if (!int.TryParse(pageCountText, out pageCount))
                {
                    // Same exception type int.Parse raised, but naming the offending page.
                    throw new FormatException("Cannot parse page count '" + pageCountText + "', pageUrl = " + url);
                }

                for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
                {
                    string detailPageUrl = "http://www.jnfdc.gov.cn/onsaling/show_" + pageIndex.ToString() + ".shtml?prjno=" + projectId;

                    Dictionary<string, object> f2vs = new Dictionary<string, object>();
                    f2vs.Add("detailPageUrl", detailPageUrl);
                    f2vs.Add("detailPageName", projectId + "_" + pageIndex.ToString());
                    f2vs.Add("projectId", projectId);
                    f2vs.Add("projectName", projectName);
                    f2vs.Add("pageIndex", pageIndex.ToString());
                    resultEW.AddRow(f2vs);
                }
            }

            resultEW.SaveToDisk();

            return true;
        }
Exemplo n.º 25
0
        /// <summary>
        /// Extracts the project details from the <c>message_table</c> of every grabbed page
        /// and writes them to "济南楼盘_楼盘详情.xlsx".
        /// </summary>
        /// <remarks>
        /// Rows whose give-up flag equals "Y" are skipped. Values are read from the second and
        /// fourth <c>td</c> of table rows 1, 2 and 3 (zero-based); row 0 is not read.
        /// </remarks>
        /// <param name="listSheet">List sheet; rows must provide "projectId" and "sellable".</param>
        /// <returns>Always <c>true</c>; failures surface as exceptions.</returns>
        /// <exception cref="InvalidOperationException">
        /// The page has no <c>message_table</c>, or the table has fewer rows or cells than expected.
        /// </exception>
        private bool GetLoupanDetailInfos(IListSheet listSheet)
        {
            string exportDir = this.RunPage.GetExportDir();

            Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
            resultColumnDic.Add("项目ID", 0);
            resultColumnDic.Add("项目名称", 1);
            resultColumnDic.Add("项目地址", 2);
            resultColumnDic.Add("企业名称", 3);
            resultColumnDic.Add("所在区县", 4);
            resultColumnDic.Add("项目规模", 5);
            resultColumnDic.Add("总栋数", 6);
            resultColumnDic.Add("可售套数", 7);

            string resultFilePath = Path.Combine(exportDir, "济南楼盘_楼盘详情.xlsx");
            Dictionary<string, string> resultColumnFormat = new Dictionary<string, string>();
            ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, resultColumnFormat);

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                string url       = row[SysConfig.DetailPageUrlFieldName];
                string projectId = row["projectId"];
                string sellable  = row["sellable"];

                HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                // SelectNodes returns null (not an empty collection) when nothing matches.
                HtmlNodeCollection trNodeList = pageHtmlDoc.DocumentNode.SelectNodes("//table[@class=\"message_table\"]/tr");
                if (trNodeList == null || trNodeList.Count < 4)
                {
                    throw new InvalidOperationException("message_table is missing or has fewer than 4 rows, pageUrl = " + url);
                }

                // secondCellTexts[k] / fourthCellTexts[k] hold td[1] / td[3] of table row k + 1.
                string[] secondCellTexts = new string[3];
                string[] fourthCellTexts = new string[3];
                for (int k = 0; k < 3; k++)
                {
                    HtmlNodeCollection tdNodeList = trNodeList[k + 1].SelectNodes("./td");
                    if (tdNodeList == null || tdNodeList.Count < 4)
                    {
                        throw new InvalidOperationException("message_table row " + (k + 1).ToString() + " has fewer than 4 cells, pageUrl = " + url);
                    }
                    secondCellTexts[k] = tdNodeList[1].InnerText.Trim();
                    fourthCellTexts[k] = tdNodeList[3].InnerText.Trim();
                }

                string projectName   = secondCellTexts[0];
                string address       = fourthCellTexts[0];
                string companyName   = secondCellTexts[1];
                string scope         = fourthCellTexts[1];
                string projectSize   = secondCellTexts[2];
                string buildingCount = fourthCellTexts[2];

                Dictionary<string, object> f2vs = new Dictionary<string, object>();
                f2vs.Add("项目ID", projectId);
                f2vs.Add("项目名称", projectName);
                f2vs.Add("项目地址", address);
                f2vs.Add("企业名称", companyName);
                f2vs.Add("所在区县", scope);
                f2vs.Add("项目规模", projectSize);
                f2vs.Add("总栋数", buildingCount);
                f2vs.Add("可售套数", sellable);
                resultEW.AddRow(f2vs);
            }

            resultEW.SaveToDisk();

            return true;
        }
Exemplo n.º 26
0
        /// <summary>
        /// Reads the JSON file grabbed for every list row and writes one result row per entry
        /// of its "data" array to "教育_本科_专业_jhcee_com.xlsx".
        /// </summary>
        /// <remarks>
        /// Rows whose give-up flag equals "Y" are skipped. Each output row points at the
        /// <c>loadByParentId.json</c> URL of the entry's id and carries the entry's name and id
        /// together with the "name" and "id" of the source list row. A file whose "data"
        /// member is absent or not an array contributes no rows.
        /// </remarks>
        /// <param name="listSheet">List sheet; rows must provide "detailPageUrl", "name" and "id".</param>
        private void GetList(IListSheet listSheet)
        {
            string exportDir     = this.RunPage.GetExportDir();
            string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

            Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
            resultColumnDic.Add("detailPageUrl", 0);
            resultColumnDic.Add("detailPageName", 1);
            resultColumnDic.Add("cookie", 2);
            resultColumnDic.Add("grabStatus", 3);
            resultColumnDic.Add("giveUpGrab", 4);
            resultColumnDic.Add("学科", 5);
            resultColumnDic.Add("学科id", 6);
            resultColumnDic.Add("门类", 7);
            resultColumnDic.Add("门类id", 8);
            string      resultFilePath = Path.Combine(exportDir, "教育_本科_专业_jhcee_com.xlsx");
            ExcelWriter resultEW       = new ExcelWriter(resultFilePath, "List", resultColumnDic, null);

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                string detailUrl = row["detailPageUrl"];
                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                string  localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
                string  pageFileText  = FileHelper.GetTextFromFile(localFilePath);
                JObject rootJo        = JObject.Parse(pageFileText);

                // "as" yields null when "data" is absent or not an array; nothing to export then.
                JArray itemJsons = rootJo["data"] as JArray;
                if (itemJsons == null)
                {
                    continue;
                }

                foreach (JObject itemJson in itemJsons)
                {
                    string name = itemJson["name"].ToString();
                    string id   = itemJson["id"].ToString();

                    Dictionary<string, string> f2vs = new Dictionary<string, string>();
                    f2vs.Add("detailPageUrl", "http://www.jhcee.com/specialized/loadByParentId.json?parentId=" + id);
                    f2vs.Add("detailPageName", id);
                    f2vs.Add("门类", name);
                    f2vs.Add("门类id", id);
                    f2vs.Add("学科", row["name"]);
                    f2vs.Add("学科id", row["id"]);
                    resultEW.AddRow(f2vs);
                }
            }

            resultEW.SaveToDisk();
        }
        /// <summary>
        /// Reads the total company count embedded in every grabbed page and writes it, with
        /// the region and qualification columns of the list row, to "各省企业个数.xlsx".
        /// </summary>
        /// <remarks>
        /// Rows whose give-up flag equals "Y" are skipped. The count is the integer following
        /// the <c>"$total":</c> marker in the text of the second sibling after the
        /// <c>form.pagingform</c> element; it ends at the next ',' or '}' (or at the end of
        /// the text).
        /// </remarks>
        /// <param name="listSheet">
        /// List sheet; rows must provide "regionId", "regionName", "regionFullName", "aptCode"
        /// and "aptScope".
        /// </param>
        /// <returns>Always <c>true</c>; failures surface as exceptions.</returns>
        /// <exception cref="InvalidOperationException">The paging form or its data sibling is missing.</exception>
        /// <exception cref="FormatException">The total marker is missing or its value is not an integer.</exception>
        private bool GetProvinceCompCountList(IListSheet listSheet)
        {
            const string totalMarker = "\"$total\":";

            string exportDir = this.RunPage.GetExportDir();

            Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
            resultColumnDic.Add("regionId", 0);
            resultColumnDic.Add("regionName", 1);
            resultColumnDic.Add("regionFullName", 2);
            resultColumnDic.Add("aptCode", 3);
            resultColumnDic.Add("aptScope", 4);
            resultColumnDic.Add("companyCount", 5);

            string resultFilePath = Path.Combine(exportDir, "各省企业个数.xlsx");

            Dictionary<string, string> resultColumnFormat = new Dictionary<string, string>();
            resultColumnFormat.Add("companyCount", "#,##0");
            ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, resultColumnFormat);

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                string url              = row[SysConfig.DetailPageUrlFieldName];
                string provinceId       = row["regionId"];
                string provinceName     = row["regionName"];
                string provinceFullName = row["regionFullName"];
                string aptCode          = row["aptCode"];
                string aptScope         = row["aptScope"];

                HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                HtmlNode pagingForm = pageHtmlDoc.DocumentNode.SelectSingleNode("//form[@class=\"pagingform\"]");
                HtmlNode dataNode   = null;
                if (pagingForm != null && pagingForm.NextSibling != null)
                {
                    dataNode = pagingForm.NextSibling.NextSibling;
                }
                if (dataNode == null)
                {
                    throw new InvalidOperationException("Paging form or its data sibling not found, pageUrl = " + url);
                }

                string pageText    = dataNode.InnerText;
                int    markerIndex = pageText.IndexOf(totalMarker, StringComparison.Ordinal);
                if (markerIndex < 0)
                {
                    // Without this check the start index would silently become 8 (-1 + 9).
                    throw new FormatException("Marker " + totalMarker + " not found, pageUrl = " + url);
                }

                int totalStartIndex = markerIndex + totalMarker.Length;
                int totalEndIndex   = pageText.IndexOfAny(new char[] { ',', '}' }, totalStartIndex);
                if (totalEndIndex < 0)
                {
                    totalEndIndex = pageText.Length;
                }

                string totalCountStr = pageText.Substring(totalStartIndex, totalEndIndex - totalStartIndex);
                int    companyCount;
                if (!int.TryParse(totalCountStr, out companyCount))
                {
                    throw new FormatException("Cannot parse company count '" + totalCountStr + "', pageUrl = " + url);
                }

                Dictionary<string, object> f2vs = new Dictionary<string, object>();
                f2vs.Add("regionId", provinceId);
                f2vs.Add("regionName", provinceName);
                f2vs.Add("regionFullName", provinceFullName);
                f2vs.Add("aptCode", aptCode);
                f2vs.Add("aptScope", aptScope);
                f2vs.Add("companyCount", companyCount);
                resultEW.AddRow(f2vs);
            }

            resultEW.SaveToDisk();

            return true;
        }
Exemplo n.º 28
0
        /// <summary>
        /// Collects keyword rows for every publication in <paramref name="listSheet"/> and
        /// writes them to "论文_EBSCO_论文关键字.xlsx".
        /// </summary>
        /// <remarks>
        /// For each publication, every file in the detail source directory whose name starts
        /// with "&lt;publicationName&gt;_" is opened as an Excel sheet; the rest of the file
        /// name is parsed as the year. For each item in that sheet the item's "_baseInfo" file
        /// is located (two candidate paths are tried) and passed to the file-level
        /// <c>GetKeywordsInfos</c> overload. Failures for a single item are logged and do not
        /// stop processing.
        /// </remarks>
        /// <param name="listSheet">List sheet; rows must provide a "moreKeywords" column.</param>
        private void GetKeywordsInfos(IListSheet listSheet)
        {
            string        outputDir     = this.RunPage.GetExportDir();
            string        outputPath    = Path.Combine(outputDir, "论文_EBSCO_论文关键字.xlsx");
            ExcelWriter   keywordWriter = this.GetKeywordsResultExcelWriter(outputPath);
            String        sourceDir     = this.RunPage.GetDetailSourceFileDir();
            List<string>  sourceFiles   = GetDirFileNames(sourceDir);

            for (int rowIndex = 0; rowIndex < listSheet.RowCount; rowIndex++)
            {
                Dictionary<string, string> listRow = listSheet.GetRow(rowIndex);
                string publicationName = listRow[SysConfig.DetailPageNameFieldName];
                string moreKeywords    = listRow["moreKeywords"];
                this.RunPage.InvokeAppendLogText("处理期刊关键字, publicationName = " + publicationName + ", " + rowIndex.ToString() + "/" + listSheet.RowCount.ToString(), LogLevelType.System, true);

                string filePrefix = publicationName + "_";

                foreach (string fileName in sourceFiles)
                {
                    if (!fileName.StartsWith(filePrefix))
                    {
                        continue;
                    }

                    // What remains after removing the prefix is the publication year.
                    int year = int.Parse(fileName.Replace(filePrefix, ""));

                    ExcelReader yearReader   = new ExcelReader(Path.Combine(sourceDir, fileName));
                    int         yearRowCount = yearReader.GetRowCount();

                    for (int itemIndex = 0; itemIndex < yearRowCount; itemIndex++)
                    {
                        Dictionary<string, string> pubYearRow = yearReader.GetFieldValues(itemIndex);
                        string itemName     = pubYearRow["itemName"];
                        string pubDir       = Path.Combine(sourceDir, publicationName);
                        string baseInfoPath = this.RunPage.GetFilePath(itemName, pubDir) + "_baseInfo";

                        try
                        {
                            if (!File.Exists(baseInfoPath))
                            {
                                // Fall back to the path built from the processed file name.
                                baseInfoPath = Path.Combine(pubDir, CommonUtil.ProcessFileName(itemName, "_") + "_baseInfo");
                                if (!File.Exists(baseInfoPath))
                                {
                                    throw new Exception("不存在文件, itemName =  " + itemName);
                                }
                            }

                            string keywordSource = moreKeywords.Length == 0 ? publicationName : moreKeywords;
                            List<Dictionary<string, object>> keywordRows = this.GetKeywordsInfos(baseInfoPath, keywordSource);
                            foreach (Dictionary<string, object> keywordRow in keywordRows)
                            {
                                keywordRow.Add("publication", publicationName);
                                keywordRow.Add("year", year);
                                keywordWriter.AddRow(keywordRow);
                            }
                        }
                        catch (Exception ex)
                        {
                            // Best effort: log the failing item and carry on with the next one.
                            this.RunPage.InvokeAppendLogText("错误, " + ex.Message + " itemName = " + itemName, LogLevelType.Error, true);
                        }
                    }
                }
            }

            keywordWriter.SaveToDisk();
        }
Exemplo n.º 29
0
        /// <summary>
        /// Merges the per-range CSV result files into a single list of points and hands it to
        /// <c>SavePointsToFile</c> with a writer for "百度地图爬取结果".
        /// </summary>
        /// <remarks>
        /// Only list rows whose index is a multiple of <c>OnePageRowCount</c> are visited; the
        /// CSV file named by that row's "detailPageName" is read in full.
        /// </remarks>
        /// <returns>Always <c>true</c>; failures surface as exceptions.</returns>
        private bool SaveAllPointsToFile()
        {
            // Fields copied from every CSV row, in the order they are added to the point.
            string[] copiedFields =
            {
                "city", "district", "province", "street", "streetNumber", "township",
                "formattedAddress", "building", "buildingType", "lat", "lng"
            };

            List<Dictionary<string, string>> allPoints = new List<Dictionary<string, string>>();

            for (int rowIndex = 0; rowIndex < _ListSheet.RowCount; rowIndex++)
            {
                if (rowIndex % OnePageRowCount != 0)
                {
                    continue;
                }

                Dictionary<string, string> listValues = _ListSheet.GetRow(rowIndex);
                string rangeName     = listValues["detailPageName"];
                string rangeFilePath = this.GetResultPath(rangeName, "csv");

                CsvReader rangeReader = new CsvReader(rangeFilePath);
                for (int pointIndex = 0; pointIndex < rangeReader.GetRowCount(); pointIndex++)
                {
                    Dictionary<string, string> csvRow = rangeReader.GetFieldValues(pointIndex);
                    Dictionary<string, string> point  = new Dictionary<string, string>();
                    foreach (string fieldName in copiedFields)
                    {
                        point.Add(fieldName, csvRow[fieldName]);
                    }
                    allPoints.Add(point);
                }
            }

            // Column order of the merged result file.
            string[] resultColumns = new string[] {
                "province",
                "city",
                "district",
                "street",
                "streetNumber",
                "township",
                "formattedAddress",
                "building",
                "buildingType",
                "lat",
                "lng"
            };
            Dictionary<string, int> resultColumnDic = CommonUtil.InitStringIndexDic(resultColumns);
            string    mergedFilePath = this.GetResultPath("百度地图爬取结果", "csv");
            CsvWriter mergedWriter   = new CsvWriter(mergedFilePath, resultColumnDic);

            SavePointsToFile(allPoints, mergedWriter);

            return true;
        }
        /// <summary>
        /// Collects the distinct dinosaur detail links from every grabbed list page and writes
        /// them to "www.nhm.ac.uk恐龙详情页.xlsx".
        /// </summary>
        /// <remarks>
        /// Rows whose give-up flag equals "Y" are skipped. Each link's <c>href</c> is prefixed
        /// with "http://www.nhm.ac.uk"; the resulting URL is used both as detail page URL and
        /// as detail page name. A page without any matching link contributes nothing.
        /// </remarks>
        /// <param name="listSheet">List sheet; rows must provide "detailPageUrl".</param>
        private void GetDetailPageUrls(IListSheet listSheet)
        {
            string exportDir = this.RunPage.GetExportDir();
            string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

            Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
            resultColumnDic.Add("detailPageUrl", 0);
            resultColumnDic.Add("detailPageName", 1);
            resultColumnDic.Add("cookie", 2);
            resultColumnDic.Add("grabStatus", 3);
            resultColumnDic.Add("giveUpGrab", 4);
            resultColumnDic.Add("name", 5);
            string resultFilePath = Path.Combine(exportDir, "www.nhm.ac.uk恐龙详情页.xlsx");
            ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic, null);

            HashSet<string> seenUrls = new HashSet<string>();

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                string detailUrl = row["detailPageUrl"];
                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);

                HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
                htmlDoc.Load(localFilePath, Encoding.GetEncoding("utf-8"));

                // SelectNodes returns null (not an empty collection) when nothing matches.
                HtmlNodeCollection dinosaurNodes = htmlDoc.DocumentNode.SelectNodes("//ul[@class='dino-list dino-wrap']/li/a");
                if (dinosaurNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode dinosaurNode in dinosaurNodes)
                {
                    string name = dinosaurNode.InnerText.Trim();
                    string url = "http://www.nhm.ac.uk" + dinosaurNode.GetAttributeValue("href", "");

                    // Add returns false when the URL was already exported.
                    if (!seenUrls.Add(url))
                    {
                        continue;
                    }

                    Dictionary<string, string> f2vs = new Dictionary<string, string>();
                    f2vs.Add("detailPageUrl", url);
                    f2vs.Add("detailPageName", url);
                    f2vs.Add("name", name);
                    resultEW.AddRow(f2vs);
                }
            }

            resultEW.SaveToDisk();
        }