Ejemplo n.º 1
0
 /// <summary>
 /// Override of the AfterAllGrab hook: delegates to <c>GetPropertiesMatrix</c>
 /// and reports success unconditionally.
 /// </summary>
 /// <param name="listSheet">Sheet handed through to <c>GetPropertiesMatrix</c>.</param>
 /// <returns>Always <c>true</c>.</returns>
 public override bool AfterAllGrab(IListSheet listSheet)
 {
     const bool completed = true;

     // The helper's outcome is not inspected; the hook always signals success.
     this.GetPropertiesMatrix(listSheet);

     return completed;
 }
Ejemplo n.º 2
0
 /// <summary>
 /// Entry point: generates the detail page info for the given sheet.
 /// </summary>
 /// <param name="parameters">Not used by this implementation.</param>
 /// <param name="listSheet">Sheet handed through to <c>GenerateDetailPageInfo</c>.</param>
 /// <returns>Whatever <c>GenerateDetailPageInfo</c> returns.</returns>
 public bool Run(string parameters, IListSheet listSheet)
 {
     bool generated = this.GenerateDetailPageInfo(listSheet);

     return generated;
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Runs the post-grab pipeline in four stages: resolve the list page URLs, derive the
        /// person page URL infos from them, resolve the final person page URLs, then extract the
        /// person information from the local pages and export it to an .xlsx file.
        /// Each intermediate result is saved to a middle file. Outside of Produce mode a
        /// previously saved result is reused when one can be read; once a stage had to be
        /// recomputed, the stages after it are recomputed as well.
        /// </summary>
        /// <param name="listSheet">Sheet the seed information is read from.</param>
        /// <returns>Always <c>true</c>.</returns>
        public override bool AfterAllGrab(IListSheet listSheet)
        {
            this.GetSeedInfoFromListSheet(listSheet);

            bool isProduce = SysConfig.SysExecuteType == SysExecuteType.Produce;

            // Set as soon as a stage is recomputed: the following stages must then run again.
            bool   isNewDo          = false;
            string localLogFileName = null;

            // Stage 1: list page URLs.
            List <string> allListPageUrls = null;

            localLogFileName = this.LoginName + "_" + this.KeyWords + "_listPageUrl";
            if (!isProduce)
            {
                // Outside production, try the list page URLs saved by an earlier run first.
                allListPageUrls = this.RunPage.TryGetInfoFromMiddleFile(localLogFileName, "listPageUrl");
            }
            if (allListPageUrls == null)
            {
                // Production always crawls the list pages; otherwise this is the cache-miss path.
                allListPageUrls = this.GetAllListPages(this.Parameters, this.SeedPageUrl);
                this.RunPage.SaveInfoToMiddleFile(localLogFileName, "listPageUrl", allListPageUrls);
                isNewDo = true;
            }

            // Stage 2: person page URL infos parsed out of the list pages.
            string[] personInfoColumns = new string[] { "personUrl", "personName" };
            List <Dictionary <string, string> > allPersonPageUrlInfos = null;

            localLogFileName = this.LoginName + "_" + this.KeyWords + "_personPageUrlInfo";
            if (isProduce || isNewDo)
            {
                // Production, or fresh list pages: parse the list pages directly.
                allPersonPageUrlInfos = this.GetPersonPageUrlsFromListPages(this.RunPage.GetDetailSourceFileDir(), allListPageUrls);
                this.RunPage.SaveInfoToMiddleFile(localLogFileName, personInfoColumns, allPersonPageUrlInfos);
            }
            else
            {
                // Reuse the person page URL infos parsed by an earlier run when available.
                allPersonPageUrlInfos = this.RunPage.TryGetInfoFromMiddleFile(localLogFileName, personInfoColumns);
                if (allPersonPageUrlInfos == null)
                {
                    allPersonPageUrlInfos = this.GetPersonPageUrlsFromListPages(this.RunPage.GetDetailSourceFileDir(), allListPageUrls);
                    this.RunPage.SaveInfoToMiddleFile(localLogFileName, personInfoColumns, allPersonPageUrlInfos);
                    isNewDo = true;
                }
            }

            // Stage 3: final person page URLs.
            List <string> allPersonPageUrls = null;

            localLogFileName = this.LoginName + "_" + this.KeyWords + "_personPageUrl";
            if (!isProduce && !isNewDo)
            {
                // Read back under the same name the result is saved under below; reading under
                // a differently spelled name would never find the saved file.
                allPersonPageUrls = this.RunPage.TryGetInfoFromMiddleFile(localLogFileName, "personUrl");
            }
            if (allPersonPageUrls == null)
            {
                // Production, fresh upstream data, or cache miss: crawl the person detail pages.
                allPersonPageUrls = ProcessPersonPage.GetAllPersonPageUrls(this.RunPage, allPersonPageUrlInfos, this.LoginName, this.LoginPassword);
                this.RunPage.SaveInfoToMiddleFile(localLogFileName, "personUrl", allPersonPageUrls);
                isNewDo = true;
            }

            // Stage 4: extract the person information from the local pages and export it.
            List <Dictionary <string, string> > personInfoList = ProcessPersonPage.GetPersonInfoFromLocalPages(this.RunPage, allPersonPageUrls, false, null);

            string personInfosFilePath = this.RunPage.GetFilePath("SearchResult_Linkedin2Linkedin_" + this.LoginName + "_" + this.KeyWords + ".xlsx", this.RunPage.GetExportDir());

            ProcessPersonPage.SavePersonInfoToFile(this.RunPage, personInfoList, personInfosFilePath);

            return true;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Parses every non-abandoned detail page listed in <paramref name="listSheet"/> and
        /// writes the project's basic information plus the rows of its five tabbed tables
        /// (tab_ztb, tab_sgtsc, tab_htba, tab_sgxk, tab_jgysba) to their CSV writers.
        /// A page that fails to parse is treated as incomplete: its local file is moved into a
        /// "deleted" directory built from the parent path of the detail source directory, an
        /// error is logged, and processing continues with the next row.
        /// </summary>
        /// <param name="listSheet">Sheet listing the detail pages to process.</param>
        /// <returns>Always <c>true</c>.</returns>
        private bool GetAllPages(IListSheet listSheet)
        {
            CsvWriter mainCW   = this.GetMainCsvWriter();
            CsvWriter ztbCW    = this.GetZtbCsvWriter();
            CsvWriter sgtscCW  = this.GetSgtscCsvWriter();
            CsvWriter htbaCW   = this.GetHtbaCsvWriter();
            CsvWriter sgxkCW   = this.GetSgxkCsvWriter();
            CsvWriter jgysbaCW = this.GetJgysbaCsvWriter();

            // Output column names per tab, in table-cell order starting at cell index 1.
            string[] ztbColumns    = new string[] { "招标类型", "招标方式", "中标单位名称", "中标日期", "中标金额(万元)", "中标通知书编号", "省级中标通知书编号" };
            string[] sgtscColumns  = new string[] { "施工图审查合格书编号", "省级施工图审查合格书编号", "勘察单位名称", "设计单位名称", "施工图审查机构名称", "审查完成日期" };
            string[] htbaColumns   = new string[] { "合同类别", "合同备案编号", "省级合同备案编号", "合同金额(万元)", "合同签订日期" };
            string[] sgxkColumns   = new string[] { "施工许可证编号", "省级施工许可证编号", "合同金额(万元)", "面积(平方米)", "发证日期" };
            string[] jgysbaColumns = new string[] { "竣工备案编号", "省级竣工备案编号", "实际造价(万元)", "实际面积(平方米)", "实际开工日期", "实际竣工验收日期" };

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary <string, string> row = listSheet.GetRow(i);
                string detailPageUrl            = row[SysConfig.DetailPageUrlFieldName];
                try
                {
                    if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                    {
                        // Rows flagged as given up are not parsed.
                        continue;
                    }

                    HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                    // ---- Basic information ----
                    string   xmmc       = "";
                    string   xmbh       = "";
                    string   sjxmbh     = "";
                    string   szqh       = "";
                    string   jsdw       = "";
                    string   jsdwzzjgdm = "";
                    string   xmfl       = "";
                    string   jsxz       = "";
                    string   gcyt       = "";
                    string   ztz        = "";
                    string   zmj        = "";
                    string   lxjb       = "";
                    string   lxwh       = "";
                    HtmlNode xmmcNode   = pageHtmlDoc.DocumentNode.SelectSingleNode("//div[@class=\"user_info spmtop\"]");
                    if (xmmcNode == null)
                    {
                        throw new Exception("没有找到项目名称节点");
                    }
                    xmmc = CommonUtil.HtmlDecode(xmmcNode.InnerText.Trim()).Trim();

                    HtmlNodeCollection projectFieldNodeList = pageHtmlDoc.DocumentNode.SelectNodes("//div[@class=\"query_info_box \"]/div/div[@class=\"activeTinyTabContent\"]/dl/dd");
                    if (projectFieldNodeList == null)
                    {
                        throw new Exception("无法获取项目基本信息属性值");
                    }

                    for (int j = 0; j < projectFieldNodeList.Count; j++)
                    {
                        // Each dd holds "name:value". A dd without a colon makes Substring throw,
                        // which sends the page down the incomplete-file path below.
                        string fieldText  = projectFieldNodeList[j].InnerText.Trim();
                        int    sIndex     = fieldText.IndexOf(":");
                        string fieldName  = CommonUtil.HtmlDecode(fieldText.Substring(0, sIndex)).Trim();
                        string fieldValue = CommonUtil.HtmlDecode(fieldText.Substring(sIndex + 1)).Trim();
                        switch (fieldName)
                        {
                        case "项目编号":
                            xmbh = fieldValue;
                            break;

                        case "省级项目编号":
                            sjxmbh = fieldValue;
                            break;

                        case "所在区划":
                            szqh = fieldValue;
                            break;

                        case "建设单位":
                            jsdw = fieldValue;
                            break;

                        case "建设单位组织机构代码(统一社会信用代码)":
                            jsdwzzjgdm = fieldValue;
                            break;

                        case "项目分类":
                            xmfl = fieldValue;
                            break;

                        case "建设性质":
                            jsxz = fieldValue;
                            break;

                        case "工程用途":
                            gcyt = fieldValue;
                            break;

                        case "总投资":
                            ztz = fieldValue;
                            break;

                        case "总面积":
                            zmj = fieldValue;
                            break;

                        case "立项级别":
                            lxjb = fieldValue;
                            break;

                        case "立项文号":
                            lxwh = fieldValue;
                            break;
                        }
                    }

                    Dictionary <string, string> f2vs = new Dictionary <string, string>();
                    f2vs.Add("项目编号", xmbh);
                    f2vs.Add("省级项目编号", sjxmbh);
                    f2vs.Add("项目名称", xmmc);
                    f2vs.Add("所在区划", szqh);
                    f2vs.Add("建设单位", jsdw);
                    f2vs.Add("建设单位组织机构代码(统一社会信用代码)", jsdwzzjgdm);
                    f2vs.Add("项目分类", xmfl);
                    f2vs.Add("建设性质", jsxz);
                    f2vs.Add("工程用途", gcyt);
                    f2vs.Add("总投资", ztz);
                    f2vs.Add("总面积", zmj);
                    f2vs.Add("立项级别", lxjb);
                    f2vs.Add("立项文号", lxwh);
                    mainCW.AddRow(f2vs);

                    // ---- Tabbed tables, each row keyed by the project number ----
                    this.AppendTabRows(pageHtmlDoc, "tab_ztb", xmbh, ztbColumns, ztbCW);
                    this.AppendTabRows(pageHtmlDoc, "tab_sgtsc", xmbh, sgtscColumns, sgtscCW);
                    this.AppendTabRows(pageHtmlDoc, "tab_htba", xmbh, htbaColumns, htbaCW);
                    this.AppendTabRows(pageHtmlDoc, "tab_sgxk", xmbh, sgxkColumns, sgxkCW);
                    this.AppendTabRows(pageHtmlDoc, "tab_jgysba", xmbh, jgysbaColumns, jgysbaCW);
                }
                catch (Exception ex)
                {
                    // Any parse failure marks the local file as incomplete: move it aside.
                    string dir       = this.RunPage.GetDetailSourceFileDir();
                    string toDir     = Path.Combine(Path.GetDirectoryName(dir), "deleted");
                    string fileUrl   = this.RunPage.GetFilePath(detailPageUrl, dir);
                    string toFileUrl = this.RunPage.GetFilePath(detailPageUrl, toDir);

                    // File.Move does not create the destination directory; make sure it exists
                    // so a first-time move does not abort the whole run from inside this handler.
                    string toFileDir = Path.GetDirectoryName(toFileUrl);
                    if (!string.IsNullOrEmpty(toFileDir))
                    {
                        Directory.CreateDirectory(toFileDir);
                    }

                    File.Move(fileUrl, toFileUrl);

                    // Include the page and the cause so the log explains why the file was moved.
                    this.RunPage.InvokeAppendLogText("文件不完整,删除" + ". detailPageUrl: " + detailPageUrl + ". " + ex.Message, LogLevelType.Error, true);
                }
            }

            mainCW.SaveToDisk();
            ztbCW.SaveToDisk();
            sgtscCW.SaveToDisk();
            htbaCW.SaveToDisk();
            sgxkCW.SaveToDisk();
            jgysbaCW.SaveToDisk();
            return true;
        }

        /// <summary>
        /// Adds one CSV row per <c>tr class="row"</c> found in the table under the div with id
        /// <paramref name="tabId"/>. Does nothing when the page has no such rows.
        /// </summary>
        /// <param name="pageHtmlDoc">Parsed detail page.</param>
        /// <param name="tabId">Id of the div that wraps the table.</param>
        /// <param name="xmbh">Project number written to the "项目编码" column of every row.</param>
        /// <param name="columnNames">Output column names; entry k is filled from cell k + 1.</param>
        /// <param name="writer">Writer that receives the rows.</param>
        private void AppendTabRows(HtmlAgilityPack.HtmlDocument pageHtmlDoc, string tabId, string xmbh, string[] columnNames, CsvWriter writer)
        {
            HtmlNodeCollection rowNodes = pageHtmlDoc.DocumentNode.SelectNodes("//div[@id=\"" + tabId + "\"]/table/tbody/tr[@class=\"row\"]");
            if (rowNodes == null)
            {
                return;
            }

            foreach (HtmlNode rowNode in rowNodes)
            {
                HtmlNodeCollection          cells   = rowNode.SelectNodes("./td");
                Dictionary <string, string> rowF2vs = new Dictionary <string, string>();
                rowF2vs.Add("项目编码", xmbh);
                for (int c = 0; c < columnNames.Length; c++)
                {
                    // Cell 0 is never exported; data columns start at cell 1.
                    rowF2vs.Add(columnNames[c], CommonUtil.HtmlDecode(cells[c + 1].InnerText.Trim()));
                }
                writer.AddRow(rowF2vs);
            }
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Override of the AfterAllGrab hook: delegates to <c>ProcessLinkageResult</c>
 /// and reports success unconditionally.
 /// </summary>
 /// <param name="listSheet">Sheet handed through to <c>ProcessLinkageResult</c>.</param>
 /// <returns>Always <c>true</c>.</returns>
 public override bool AfterAllGrab(IListSheet listSheet)
 {
     const bool completed = true;

     // The helper's outcome is not inspected; the hook always signals success.
     this.ProcessLinkageResult(listSheet);

     return completed;
 }
Ejemplo n.º 6
0
 /// <summary>
 /// Entry point: generates the city report for the given sheet.
 /// </summary>
 /// <param name="parameters">Not used by this implementation.</param>
 /// <param name="listSheet">Sheet handed through to <c>GenerateCityReport</c>.</param>
 /// <returns>Whatever <c>GenerateCityReport</c> returns.</returns>
 public bool Run(string parameters, IListSheet listSheet)
 {
     bool reportGenerated = this.GenerateCityReport(listSheet);

     return reportGenerated;
 }
Ejemplo n.º 7
0
        /// <summary>
        /// Entry point: collects all detail page URLs for the given sheet.
        /// </summary>
        /// <param name="parameters">Not used by this implementation.</param>
        /// <param name="listSheet">Sheet handed through to <c>GetAllDetailPageUrl</c>.</param>
        /// <returns>Whatever <c>GetAllDetailPageUrl</c> returns.</returns>
        public bool Run(string parameters, IListSheet listSheet)
        {
            return GetAllDetailPageUrl(listSheet);
        }
        /// <summary>
        /// Builds the first table-of-contents page URL for every periodical issue found in the
        /// downloaded JSON pages and writes one row per distinct URL to an Excel writer.
        /// The writer is replaced by a new one (next file index) once its row count exceeds
        /// 500000; that check is made once per sheet row.
        /// </summary>
        /// <param name="listSheet">Sheet whose rows point at the downloaded JSON pages.</param>
        private void GetAllPerioFirstIndexPageUrls(IListSheet listSheet)
        {
            string      pageSourceDir    = this.RunPage.GetDetailSourceFileDir();
            int         allListFileIndex = 1;
            ExcelWriter ew = null;

            // URLs already written, so each issue page is exported only once.
            HashSet <string> seenUrls = new HashSet <string>();

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                if (ew == null || ew.RowCount > 500000)
                {
                    // Flush the full writer before switching to the next output file.
                    if (ew != null)
                    {
                        ew.SaveToDisk();
                    }
                    ew = this.GetAllPerioFirstIndexPageExcelWriter(allListFileIndex);
                    allListFileIndex++;
                }

                Dictionary <string, string> row = listSheet.GetRow(i);
                string detailUrl = row["detailPageUrl"];

                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    // Rows flagged as given up are skipped.
                    continue;
                }

                // Failures while reading or parsing propagate unchanged to the caller, keeping
                // the original stack trace.
                string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
                string pageFileText  = FileHelper.GetTextFromFile(localFilePath);
                JArray itemJsonArray = JObject.Parse(pageFileText).GetValue("pageRow") as JArray;
                if (itemJsonArray == null)
                {
                    continue;
                }

                for (int j = 0; j < itemJsonArray.Count; j++)
                {
                    JObject itemJson = itemJsonArray[j] as JObject;
                    JObject opJson   = itemJson.GetValue("op") as JObject;
                    if (opJson == null)
                    {
                        continue;
                    }

                    // One entry per issue of the periodical.
                    JArray opItemsArray = opJson.GetValue("perioIssue") as JArray;
                    if (opItemsArray == null)
                    {
                        continue;
                    }

                    for (int k = 0; k < opItemsArray.Count; k++)
                    {
                        JObject opItemJson   = opItemsArray[k] as JObject;
                        string  issue_num    = this.GetAttributeValue(opItemJson, "issue_num");
                        string  publish_year = this.GetAttributeValue(opItemJson, "publish_year");
                        string  perio_id     = this.GetAttributeValue(opItemJson, "perio_id");
                        string  perio_title  = this.GetAttributeValue(opItemJson, "perio_title");
                        if (issue_num == null || publish_year == null || perio_id == null || perio_title == null)
                        {
                            // An issue missing any of the four attributes is not exported.
                            continue;
                        }

                        string firstIndexPageUrl = "http://www.wanfangdata.com.cn/perio/articleList.do?page=1&pageSize=10&issue_num=" + issue_num + "&publish_year=" + publish_year + "&article_start=&title_article=&perio_id=" + perio_id;
                        if (!seenUrls.Add(firstIndexPageUrl))
                        {
                            // Already exported.
                            continue;
                        }

                        Dictionary <string, string> f2vs = new Dictionary <string, string>();
                        f2vs.Add("detailPageUrl", firstIndexPageUrl);
                        f2vs.Add("detailPageName", firstIndexPageUrl);
                        f2vs.Add("perio_id", perio_id);
                        f2vs.Add("issue_num", issue_num);
                        f2vs.Add("publish_year", publish_year);
                        f2vs.Add("perio_title", perio_title);
                        f2vs.Add("pageIndex", "1");
                        ew.AddRow(f2vs);
                    }
                }
            }

            // No writer is ever created for an empty sheet, so there is nothing to save then.
            if (ew != null)
            {
                ew.SaveToDisk();
            }
        }
        /// <summary>
        /// Exports the periodical attributes found in each downloaded JSON page to
        /// "万方期刊_期刊信息详情.xlsx" in the export directory: one row per "pageRow" item,
        /// tagged with the category columns (cate1, cateId1, cate2, cateId2) of the sheet row
        /// the page belongs to. Any failure is logged together with the page URL and then
        /// rethrown, so the export stops at the first bad row.
        /// </summary>
        /// <param name="listSheet">Sheet whose rows point at the downloaded JSON pages.</param>
        private void GetPeriodicalInfo(IListSheet listSheet)
        {
            String exportDir     = this.RunPage.GetExportDir();
            string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

            // Attributes copied from each JSON item, in output column order.
            string[] jsonFieldNames = new string[] { "id", "core_perio", "avg_perio_down", "start_year02", "start_year", "issue_postcode", "perio_format", "fax", "perio_id", "language", "tag_num", "major_editor", "abstract_reading_num", "thirdparty_links_num", "import_num", "email", "share_num", "classcode_level", "publish_cycle", "address", "pinyin_title", "avg_article_down", "hostunit_name", "hostunit_area", "director", "main_column", "telephone", "country_code", "affectoi", "issn", "cn", "source_db", "dep_name", "postcode", "collection_num", "win_prize", "cite_num", "perio_title02", "download_num", "first_publish", "data_state", "article_num", "ef_name", "release_cycle", "fulltext_reading_num", "note_num", "end_year", "class_code", "end_issue", "trans_title", "perio_desc", "perio_title", "keywords", "summary" };

            // The sheet's category columns come after the JSON attributes in the output.
            List <string> columnNames = new List <string>(jsonFieldNames);
            columnNames.AddRange(new string[] { "cate1", "cateId1", "cate2", "cateId2" });

            Dictionary <string, int> resultColumnDic = CommonUtil.InitStringIndexDic(columnNames.ToArray());
            string      resultFilePath = Path.Combine(exportDir, "万方期刊_期刊信息详情.xlsx");
            ExcelWriter resultEW       = new ExcelWriter(resultFilePath, "List", resultColumnDic, null);

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary <string, string> row = listSheet.GetRow(i);
                string detailUrl = row["detailPageUrl"];

                try
                {
                    string cate1   = row["cate1"];
                    string cateId1 = row["cateId1"];
                    string cate2   = row["cate2"];
                    string cateId2 = row["cateId2"];

                    if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                    {
                        // Rows flagged as given up are skipped.
                        continue;
                    }

                    string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
                    string pageFileText  = FileHelper.GetTextFromFile(localFilePath);
                    JArray itemJsonArray = JObject.Parse(pageFileText).GetValue("pageRow") as JArray;

                    for (int j = 0; j < itemJsonArray.Count; j++)
                    {
                        Dictionary <string, string> f2vs = new Dictionary <string, string>();
                        JObject itemJson = itemJsonArray[j] as JObject;
                        f2vs.Add("cate1", cate1);
                        f2vs.Add("cateId1", cateId1);
                        f2vs.Add("cate2", cate2);
                        f2vs.Add("cateId2", cateId2);

                        foreach (string fieldName in jsonFieldNames)
                        {
                            this.GetAttributeValue(itemJson, fieldName, f2vs);
                        }

                        resultEW.AddRow(f2vs);
                    }
                }
                catch (Exception ex)
                {
                    this.RunPage.InvokeAppendLogText(ex.Message + ". detailUrl" + detailUrl, LogLevelType.Error, true);

                    // Plain rethrow keeps the original stack trace.
                    throw;
                }
            }
            resultEW.SaveToDisk();
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Builds the "more item" workbook: one row for each crawled page and one row for every
        /// "/item/" link found in the main content of those pages, de-duplicated by URL. Also
        /// generates the related-item file for each crawled page.
        /// </summary>
        /// <param name="listSheet">Sheet of detail pages; rows whose give-up flag is "Y" are skipped.</param>
        /// <remarks>
        /// Failures are not caught here; they propagate to the caller with their original stack
        /// trace. The workbook is only saved when every page was processed successfully.
        /// </remarks>
        private void GetRelatedItemPageUrls(IListSheet listSheet)
        {
            ExcelWriter moreItemEW = this.CreateMoreItemWriter();

            // URLs that already have a row in the workbook.
            HashSet<string> writtenUrls = new HashSet<string>();

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> listRow = listSheet.GetRow(i);
                bool giveUp = "Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]);
                string fromItemUrl = listRow[SysConfig.DetailPageUrlFieldName];
                if (giveUp)
                {
                    continue;
                }

                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                // Both nodes are required; a page without them fails loudly (NullReferenceException).
                HtmlNode titleNode = htmlDoc.DocumentNode.SelectSingleNode("//dd[@class=\"lemmaWgt-lemmaTitle-title\"]/h1");
                string fromItemName = CommonUtil.HtmlDecode(titleNode.InnerText).Trim();

                HtmlNode itemBaseInfoNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@class=\"lemmaWgt-promotion-rightPreciseAd\"]");
                string fromItemId = itemBaseInfoNode.GetAttributeValue("data-lemmaid", "");

                if (writtenUrls.Add(fromItemUrl))
                {
                    // NOTE(review): detailPageName is filled with the URL, not the page title - confirm this is intended.
                    Dictionary<string, string> moreItemRow = new Dictionary<string, string>();
                    moreItemRow.Add("detailPageUrl", fromItemUrl);
                    moreItemRow.Add("detailPageName", fromItemUrl);
                    moreItemRow.Add("itemId", fromItemId);
                    moreItemRow.Add("itemName", fromItemName);

                    moreItemEW.AddRow(moreItemRow);
                }

                // SelectNodes returns null (not an empty collection) when the page has no anchors.
                HtmlNodeCollection aNodes = htmlDoc.DocumentNode.SelectNodes("//a");
                if (aNodes != null)
                {
                    foreach (HtmlNode aNode in aNodes)
                    {
                        string toItemUrl = aNode.GetAttributeValue("href", "");
                        if (!toItemUrl.StartsWith("/item/", StringComparison.Ordinal))
                        {
                            continue;
                        }

                        string toItemFullUrl = "https://baike.baidu.com" + toItemUrl;
                        if (writtenUrls.Contains(toItemFullUrl) || !this.IsInMainContent(aNode))
                        {
                            continue;
                        }
                        writtenUrls.Add(toItemFullUrl);

                        Dictionary<string, string> moreItemRow = new Dictionary<string, string>();
                        moreItemRow.Add("detailPageUrl", toItemFullUrl);
                        moreItemRow.Add("detailPageName", toItemFullUrl);
                        moreItemRow.Add("itemId", aNode.GetAttributeValue("data-lemmaid", ""));
                        moreItemRow.Add("itemName", CommonUtil.HtmlDecode(aNode.InnerText).Trim());

                        moreItemEW.AddRow(moreItemRow);
                    }
                }

                this.GenerateRelatedItemFile(fromItemUrl, htmlDoc);
            }

            moreItemEW.SaveToDisk();
        }
        /// <summary>
        /// Gets the basic information of every issue of each periodical and writes it to one or
        /// more workbooks.
        /// </summary>
        /// <param name="listSheet">Sheet of detail pages; rows whose give-up flag is "Y" are skipped.</param>
        /// <remarks>
        /// Each downloaded page is parsed as JSON; for every entry of its "pageRow" array the
        /// "op.perioIssue" array is expanded into one output row per issue. A new workbook is
        /// started once the current one holds more than 500000 rows. Failures are not caught here;
        /// they propagate to the caller with their original stack trace.
        /// </remarks>
        private void GetPeriodicalPerioIssueInfo(IListSheet listSheet)
        {
            // NOTE(review): the result is unused; the call is kept in case GetExportDir has side
            // effects (e.g. creating the directory) - confirm and remove if it does not.
            this.RunPage.GetExportDir();

            string pageSourceDir = this.RunPage.GetDetailSourceFileDir();
            int allListFileIndex = 1;
            ExcelWriter ew = null;

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                // Roll over to the next output file when the current one is full.
                if (ew == null || ew.RowCount > 500000)
                {
                    if (ew != null)
                    {
                        ew.SaveToDisk();
                    }
                    ew = this.GetAllPerioIssueInfoExcelWriter(allListFileIndex);
                    allListFileIndex++;
                }

                Dictionary<string, string> row = listSheet.GetRow(i);
                string detailUrl = row["detailPageUrl"];

                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
                string pageFileText = FileHelper.GetTextFromFile(localFilePath);
                JArray itemJsonArray = JObject.Parse(pageFileText).GetValue("pageRow") as JArray;
                if (itemJsonArray == null)
                {
                    continue;
                }

                for (int j = 0; j < itemJsonArray.Count; j++)
                {
                    JObject itemJson = itemJsonArray[j] as JObject;
                    string perioId = itemJson.GetValue("id").ToString().Trim();

                    JObject opJson = itemJson.GetValue("op") as JObject;
                    if (opJson == null)
                    {
                        continue;
                    }

                    JArray opItemsArray = opJson.GetValue("perioIssue") as JArray;
                    if (opItemsArray == null)
                    {
                        continue;
                    }

                    // One output row per issue.
                    for (int k = 0; k < opItemsArray.Count; k++)
                    {
                        JObject opItemJson = opItemsArray[k] as JObject;

                        Dictionary<string, string> f2vs = new Dictionary<string, string>();
                        f2vs.Add("perioId", perioId);
                        this.GetAttributeValue(opItemJson, "publish_year", f2vs);
                        this.GetAttributeValue(opItemJson, "trans_title", f2vs);
                        this.GetAttributeValue(opItemJson, "issue_id", f2vs);
                        this.GetAttributeValue(opItemJson, "show_issue_num", f2vs);
                        this.GetAttributeValue(opItemJson, "page_cnt", f2vs);
                        this.GetAttributeValue(opItemJson, "issue_num", f2vs);
                        this.GetAttributeValue(opItemJson, "perio_id", f2vs);
                        this.GetAttributeValue(opItemJson, "orig_catalog", f2vs);
                        this.GetAttributeValue(opItemJson, "volume", f2vs);
                        this.GetAttributeValue(opItemJson, "catalog_url", f2vs);
                        this.GetAttributeValue(opItemJson, "total_issue", f2vs);
                        this.GetAttributeValue(opItemJson, "special_title", f2vs);
                        this.GetAttributeValue(opItemJson, "issue_cover", f2vs);
                        this.GetAttributeValue(opItemJson, "id", f2vs);
                        this.GetAttributeValue(opItemJson, "perio_title", f2vs);

                        ew.AddRow(f2vs);
                    }
                }
            }

            // No writer was ever created when the sheet has no rows.
            if (ew != null)
            {
                ew.SaveToDisk();
            }
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Exports the "basic-info" properties of every crawled page in two shapes: a long list
        /// (one row per property: name, fullName, pKey, pValue, url) and a wide table (one row per
        /// page, one column per distinct property key).
        /// </summary>
        /// <param name="listSheet">Sheet of detail pages; rows whose give-up flag is "Y" are skipped.</param>
        /// <remarks>
        /// The pages are parsed twice because the full set of column names must be known before the
        /// wide-table writer can be created. A key that repeats within one page gets a "_2", "_3", ...
        /// suffix. Failures are not caught here; they propagate with their original stack trace.
        /// </remarks>
        private void GetRenWuProperties(IListSheet listSheet)
        {
            // Distinct property keys in first-seen order; the set gives O(1) membership checks.
            List<string> propertyColumnNames = new List<string>();
            HashSet<string> knownColumnNames = new HashSet<string>();

            // Pass 1: long list, and discovery of all property keys.
            ExcelWriter RenWuInfoExcelWriter = this.CreateRenWuPropertyListWriter();
            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> listRow = listSheet.GetRow(i);
                bool giveUp = "Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]);
                string pageUrl = listRow[SysConfig.DetailPageUrlFieldName];
                string name = listRow["name"];
                string fullName = listRow["fullName"];
                if (giveUp)
                {
                    continue;
                }

                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
                HtmlNodeCollection dtNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"basic-info cmn-clearfix\"]/dl/dt");
                if (dtNodes == null)
                {
                    continue;
                }

                HashSet<string> keysOnThisPage = new HashSet<string>();
                foreach (HtmlNode dtNode in dtNodes)
                {
                    // NOTE(review): the three Replace calls look identical here; presumably they target
                    // different whitespace characters - verify the literals survived encoding.
                    string pKey = CommonUtil.HtmlDecode(dtNode.InnerText).Trim().Replace(" ", "").Replace(" ", "").Replace(" ", "");
                    string pValue = this.GetNextDDNodeText(dtNode);

                    // Make the key unique within the page.
                    int sameNamePKeyCount = 1;
                    string newPKey = pKey;
                    while (!keysOnThisPage.Add(newPKey))
                    {
                        sameNamePKeyCount++;
                        newPKey = pKey + "_" + sameNamePKeyCount.ToString();
                    }

                    if (knownColumnNames.Add(newPKey))
                    {
                        propertyColumnNames.Add(newPKey);
                    }

                    Dictionary<string, string> row = new Dictionary<string, string>();
                    row.Add("name", name);
                    row.Add("fullName", fullName);
                    row.Add("pKey", newPKey);
                    row.Add("pValue", pValue);
                    row.Add("url", pageUrl);

                    RenWuInfoExcelWriter.AddRow(row);
                }
            }
            RenWuInfoExcelWriter.SaveToDisk();

            // Pass 2: wide table, one row per page (written even when the page has no properties).
            ExcelWriter RenWuColumnPropertyExcelWriter = this.CreateRenWuColumnPropertyListWriter(propertyColumnNames);
            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> listRow = listSheet.GetRow(i);
                bool giveUp = "Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]);
                string pageUrl = listRow[SysConfig.DetailPageUrlFieldName];
                string name = listRow["name"];
                string fullName = listRow["fullName"];
                if (giveUp)
                {
                    continue;
                }

                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
                HtmlNodeCollection dtNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"basic-info cmn-clearfix\"]/dl/dt");

                Dictionary<string, string> row = new Dictionary<string, string>();
                row.Add("name", name);
                row.Add("fullName", fullName);
                row.Add("url", pageUrl);

                if (dtNodes != null)
                {
                    HashSet<string> keysOnThisPage = new HashSet<string>();
                    foreach (HtmlNode dtNode in dtNodes)
                    {
                        string pKey = CommonUtil.HtmlDecode(dtNode.InnerText).Trim().Replace(" ", "").Replace(" ", "").Replace(" ", "");
                        string pValue = this.GetNextDDNodeText(dtNode);

                        int sameNamePKeyCount = 1;
                        string newPKey = pKey;
                        while (!keysOnThisPage.Add(newPKey))
                        {
                            sameNamePKeyCount++;
                            newPKey = pKey + "_" + sameNamePKeyCount.ToString();
                        }

                        row.Add(newPKey, pValue);
                    }
                }

                RenWuColumnPropertyExcelWriter.AddRow(row);
            }
            RenWuColumnPropertyExcelWriter.SaveToDisk();
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Exports only a configured subset of the "basic-info" properties of every crawled page,
        /// in two shapes: a long list (name, pKey, pValue, url) and a wide table (one row per page,
        /// one column per retained property).
        /// </summary>
        /// <param name="listSheet">Sheet of detail pages; rows whose give-up flag is "Y" are skipped.</param>
        /// <remarks>
        /// The first comma-separated entry of <c>Parameters</c> is the path of a workbook whose
        /// sheet lists, per row, a "column" name and its comma-separated "aliasColumns". A page
        /// property is retained when its key equals a column name or one of its aliases, and is
        /// reported under the column name. In the wide table, several values mapping to the same
        /// column are joined with ";". Failures are not caught here; they propagate with their
        /// original stack trace.
        /// </remarks>
        private void GetRenWuRemainProperties(IListSheet listSheet)
        {
            string[] commaSeparator = new string[] { "," };
            string[] parameters = this.Parameters.Split(commaSeparator, StringSplitOptions.RemoveEmptyEntries);
            string columnMapFilePath = parameters[0];

            // Maps every column name and every alias to its column name.
            ExcelReader columnMapER = new ExcelReader(columnMapFilePath, "人物属性");
            int rowCount = columnMapER.GetRowCount();
            Dictionary<string, string> columnAliasToColumns = new Dictionary<string, string>();
            for (int i = 0; i < rowCount; i++)
            {
                Dictionary<string, string> columnRow = columnMapER.GetFieldValues(i);
                string mappedColumn = columnRow["column"].Trim();
                columnAliasToColumns.Add(mappedColumn, mappedColumn);

                string[] aliasColumns = columnRow["aliasColumns"].Split(commaSeparator, StringSplitOptions.RemoveEmptyEntries);
                foreach (string alias in aliasColumns)
                {
                    columnAliasToColumns.Add(alias.Trim(), mappedColumn);
                }
            }

            // Retained columns actually present in the data, in first-seen order.
            List<string> propertyColumnNames = new List<string>();
            HashSet<string> knownColumnNames = new HashSet<string>();

            // Pass 1: long list, and discovery of the columns needed by the wide table.
            ExcelWriter RenWuInfoExcelWriter = this.CreateRenWuRemainPropertyListWriter();
            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> listRow = listSheet.GetRow(i);
                bool giveUp = "Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]);
                string pageUrl = listRow[SysConfig.DetailPageUrlFieldName];
                string name = listRow["name"];
                if (giveUp)
                {
                    continue;
                }

                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
                HtmlNodeCollection dtNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"basic-info cmn-clearfix\"]/dl/dt");
                if (dtNodes == null)
                {
                    continue;
                }

                HashSet<string> keysOnThisPage = new HashSet<string>();
                foreach (HtmlNode dtNode in dtNodes)
                {
                    // NOTE(review): the three Replace calls look identical here; presumably they target
                    // different whitespace characters - verify the literals survived encoding.
                    string pKey = CommonUtil.HtmlDecode(dtNode.InnerText).Trim().Replace(" ", "").Replace(" ", "").Replace(" ", "");
                    string pValue = this.GetNextDDNodeText(dtNode);

                    // Make the key unique within the page.
                    int sameNamePKeyCount = 1;
                    string newPKey = pKey;
                    while (!keysOnThisPage.Add(newPKey))
                    {
                        sameNamePKeyCount++;
                        newPKey = pKey + "_" + sameNamePKeyCount.ToString();
                    }

                    string columnName;
                    if (!columnAliasToColumns.TryGetValue(newPKey, out columnName))
                    {
                        continue;
                    }

                    // Register the mapped column, not the raw key: pass 2 writes values under the
                    // mapped column even when only an alias occurs on the pages, so the column must
                    // exist in the wide table in that case too.
                    if (knownColumnNames.Add(columnName))
                    {
                        propertyColumnNames.Add(columnName);
                    }

                    Dictionary<string, string> row = new Dictionary<string, string>();
                    row.Add("name", name);
                    row.Add("pKey", columnName);
                    row.Add("pValue", pValue);
                    row.Add("url", pageUrl);

                    RenWuInfoExcelWriter.AddRow(row);
                }
            }
            RenWuInfoExcelWriter.SaveToDisk();

            // Pass 2: wide table, one row per page (written even when nothing was retained).
            ExcelWriter RenWuColumnPropertyExcelWriter = this.CreateRenWuRemainColumnPropertyListWriter(propertyColumnNames);
            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> listRow = listSheet.GetRow(i);
                bool giveUp = "Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]);
                string pageUrl = listRow[SysConfig.DetailPageUrlFieldName];
                string name = listRow["name"];
                if (giveUp)
                {
                    continue;
                }

                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
                HtmlNodeCollection dtNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"basic-info cmn-clearfix\"]/dl/dt");

                Dictionary<string, string> row = new Dictionary<string, string>();
                row.Add("name", name);
                row.Add("url", pageUrl);

                if (dtNodes != null)
                {
                    HashSet<string> keysOnThisPage = new HashSet<string>();
                    foreach (HtmlNode dtNode in dtNodes)
                    {
                        string pKey = CommonUtil.HtmlDecode(dtNode.InnerText).Trim().Replace(" ", "").Replace(" ", "").Replace(" ", "");
                        string pValue = this.GetNextDDNodeText(dtNode);

                        int sameNamePKeyCount = 1;
                        string newPKey = pKey;
                        while (!keysOnThisPage.Add(newPKey))
                        {
                            sameNamePKeyCount++;
                            newPKey = pKey + "_" + sameNamePKeyCount.ToString();
                        }

                        string columnName;
                        if (!columnAliasToColumns.TryGetValue(newPKey, out columnName))
                        {
                            continue;
                        }

                        string existingValue;
                        if (row.TryGetValue(columnName, out existingValue))
                        {
                            row[columnName] = existingValue + ";" + pValue;
                        }
                        else
                        {
                            row.Add(columnName, pValue);
                        }
                    }
                }

                RenWuColumnPropertyExcelWriter.AddRow(row);
            }
            RenWuColumnPropertyExcelWriter.SaveToDisk();
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Builds a property-to-property distance matrix from the comma-separated "properties"
        /// column of a source workbook and writes it, together with the property name lists, to disk.
        /// </summary>
        /// <param name="listSheet">Not used by this method.</param>
        /// <remarks>
        /// <c>Parameters</c> must hold "sourceFilePath,destFilePath". Outputs, in this order:
        /// the matrix as CSV at destFilePath; "_AllPropertyName.xlsx" (every property with its
        /// occurrence count); "_PropertyName.xlsx" (only the properties kept in the matrix);
        /// "_Array.txt" (the matrix as an "arr = [[...], ...]" literal).
        /// Distance between two different kept properties is maxTime / coOccurrenceCount, or
        /// 2 * maxTime when they never co-occur, where maxTime is the largest co-occurrence count
        /// (at least 1); the diagonal is 0. Numbers are written with the invariant culture so the
        /// decimal separator cannot collide with the "," delimiters of the outputs.
        /// </remarks>
        private void GetPropertiesMatrix(IListSheet listSheet)
        {
            string[] commaSeparator = new string[] { "," };
            string[] parameters = this.Parameters.Split(commaSeparator, StringSplitOptions.RemoveEmptyEntries);
            string sourceFilePath = parameters[0];
            string destFilePath = parameters[1];

            ExcelReader er = new ExcelReader(sourceFilePath);
            int sourceRowCount = er.GetRowCount();

            // Pass 1: count how often each property occurs, remembering first-seen order.
            // The split rows are cached so the workbook is only read once.
            Dictionary<string, int> allPropertyCountDic = new Dictionary<string, int>();
            List<string> allPropertyList = new List<string>();
            List<string[]> propertiesPerRow = new List<string[]>();

            for (int i = 0; i < sourceRowCount; i++)
            {
                Dictionary<string, string> sourceRow = er.GetFieldValues(i);
                string[] itemProperties = sourceRow["properties"].Split(commaSeparator, StringSplitOptions.RemoveEmptyEntries);
                propertiesPerRow.Add(itemProperties);

                foreach (string itemProperty in itemProperties)
                {
                    int occurrences;
                    if (allPropertyCountDic.TryGetValue(itemProperty, out occurrences))
                    {
                        allPropertyCountDic[itemProperty] = occurrences + 1;
                    }
                    else
                    {
                        allPropertyList.Add(itemProperty);
                        allPropertyCountDic.Add(itemProperty, 1);
                    }
                }
            }

            // Properties that occur ignoreNum times or fewer are left out of the matrix.
            int ignoreNum = 6;
            List<string> propertyList = new List<string>();
            HashSet<string> keptProperties = new HashSet<string>();

            foreach (string itemProperty in allPropertyList)
            {
                if (allPropertyCountDic[itemProperty] > ignoreNum)
                {
                    propertyList.Add(itemProperty);
                    keptProperties.Add(itemProperty);
                }
            }

            // Pass 2: co-occurrence counts between kept properties appearing in the same row.
            // A property is not counted against itself; the diagonal of the matrix is fixed to 0.
            int maxTime = 1;
            Dictionary<string, Dictionary<string, int>> pToPDic = new Dictionary<string, Dictionary<string, int>>();

            foreach (string[] itemProperties in propertiesPerRow)
            {
                foreach (string fromItemProperty in itemProperties)
                {
                    if (!keptProperties.Contains(fromItemProperty))
                    {
                        continue;
                    }

                    Dictionary<string, int> coOccurrences;
                    if (!pToPDic.TryGetValue(fromItemProperty, out coOccurrences))
                    {
                        coOccurrences = new Dictionary<string, int>();
                        pToPDic.Add(fromItemProperty, coOccurrences);
                    }

                    foreach (string toItemProperty in itemProperties)
                    {
                        if (fromItemProperty == toItemProperty || !keptProperties.Contains(toItemProperty))
                        {
                            continue;
                        }

                        int seenTogether;
                        if (coOccurrences.TryGetValue(toItemProperty, out seenTogether))
                        {
                            seenTogether++;
                            coOccurrences[toItemProperty] = seenTogether;
                            if (seenTogether > maxTime)
                            {
                                maxTime = seenTogether;
                            }
                        }
                        else
                        {
                            coOccurrences.Add(toItemProperty, 1);
                        }
                    }
                }
            }

            // Matrix: built once, emitted both as CSV rows and as the array literal text.
            Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();
            resultColumnDic.Add("pToP", 0);
            for (int i = 0; i < propertyList.Count; i++)
            {
                resultColumnDic.Add(propertyList[i], i + 1);
            }

            CsvWriter propertyMatrixCW = new CsvWriter(destFilePath, resultColumnDic);
            StringBuilder propertyArrayStringBuilder = new StringBuilder();
            propertyArrayStringBuilder.Append("arr = [");

            for (int i = 0; i < propertyList.Count; i++)
            {
                string fromProperty = propertyList[i];

                // Stays null when the property never co-occurred with another kept property.
                Dictionary<string, int> coOccurrences;
                pToPDic.TryGetValue(fromProperty, out coOccurrences);

                Dictionary<string, string> resultRow = new Dictionary<string, string>();
                resultRow.Add("pToP", fromProperty);
                propertyArrayStringBuilder.Append((i == 0 ? "" : ", \r\n") + "[");

                for (int j = 0; j < propertyList.Count; j++)
                {
                    string toProperty = propertyList[j];

                    double value;
                    int seenTogether;
                    if (fromProperty == toProperty)
                    {
                        value = 0;
                    }
                    else if (coOccurrences != null && coOccurrences.TryGetValue(toProperty, out seenTogether) && seenTogether != 0)
                    {
                        value = (double)maxTime / (double)seenTogether;
                    }
                    else
                    {
                        value = 2 * (double)maxTime;
                    }

                    string valueText = value.ToString(System.Globalization.CultureInfo.InvariantCulture);
                    resultRow.Add(toProperty, valueText);
                    propertyArrayStringBuilder.Append((j == 0 ? "" : ", ") + valueText);
                }

                propertyMatrixCW.AddRow(resultRow);
                propertyArrayStringBuilder.Append("]");
            }
            propertyArrayStringBuilder.Append("]");

            propertyMatrixCW.SaveToDisk();

            // Every property with its occurrence count.
            string allPropertyNameFilePath = destFilePath + "_AllPropertyName.xlsx";
            Dictionary<string, int> allPropertyNameColumnDic = new Dictionary<string, int>();
            allPropertyNameColumnDic.Add("name", 0);
            allPropertyNameColumnDic.Add("count", 1);
            Dictionary<string, string> allPropertyNameColumnFormats = new Dictionary<string, string>();
            allPropertyNameColumnFormats.Add("count", "#0");
            ExcelWriter allPropertyNameEW = new ExcelWriter(allPropertyNameFilePath, "List", allPropertyNameColumnDic, allPropertyNameColumnFormats);

            foreach (string propertyName in allPropertyList)
            {
                Dictionary<string, object> resultRow = new Dictionary<string, object>();
                resultRow.Add("name", propertyName);
                resultRow.Add("count", allPropertyCountDic[propertyName]);
                allPropertyNameEW.AddRow(resultRow);
            }
            allPropertyNameEW.SaveToDisk();

            // Only the properties kept in the matrix, in matrix order.
            string propertyNameFilePath = destFilePath + "_PropertyName.xlsx";
            Dictionary<string, int> propertyNameColumnDic = new Dictionary<string, int>();
            propertyNameColumnDic.Add("name", 0);
            ExcelWriter propertyNameEW = new ExcelWriter(propertyNameFilePath, "List", propertyNameColumnDic);

            foreach (string propertyName in propertyList)
            {
                Dictionary<string, string> resultRow = new Dictionary<string, string>();
                resultRow.Add("name", propertyName);
                propertyNameEW.AddRow(resultRow);
            }
            propertyNameEW.SaveToDisk();

            // The matrix as an array literal.
            string propertyArrayFilePath = destFilePath + "_Array.txt";
            FileHelper.SaveTextToFile(propertyArrayStringBuilder.ToString(), propertyArrayFilePath);
        }
Ejemplo n.º 15
0
 /// <summary>
 /// Override of <c>AfterAllGrab</c> that hands the list sheet to <c>GetWordPageUrls</c>.
 /// </summary>
 /// <param name="listSheet">List sheet forwarded unchanged to <c>GetWordPageUrls</c>.</param>
 /// <returns>Always <c>true</c>.</returns>
 public override bool AfterAllGrab(IListSheet listSheet)
 {
     this.GetWordPageUrls(listSheet);

     return true;
 }
Ejemplo n.º 16
0
 /// <summary>
 /// Override of <c>AfterAllGrab</c> that hands the list sheet to <c>GetImage</c>.
 /// </summary>
 /// <param name="listSheet">List sheet forwarded unchanged to <c>GetImage</c>.</param>
 /// <returns>Always <c>true</c>.</returns>
 public override bool AfterAllGrab(IListSheet listSheet)
 {
     this.GetImage(listSheet);

     return true;
 }
Ejemplo n.º 17
0
        /// <summary>
        /// Reads the locally saved detail page of every list-sheet row, extracts the shop's
        /// level, address, coordinates and service items, and appends one result row per
        /// list-sheet row to <paramref name="resultEW"/>.
        /// </summary>
        /// <remarks>
        /// Values that cannot be located in the page keep their defaults: empty strings for
        /// level, address and serviceItems, and null for lng and lat. A coordinate that is
        /// located but is not a valid number still raises <see cref="FormatException"/>.
        /// </remarks>
        /// <param name="listSheet">Sheet supplying the province, city and shop columns and identifying the page to load.</param>
        /// <param name="pageSourceDir">Not used by this method; kept so the signature stays compatible with existing callers.</param>
        /// <param name="resultEW">Writer that receives the extracted rows.</param>
        private void GetShopDetail(IListSheet listSheet, string pageSourceDir, ExcelWriter resultEW)
        {
            const string positionMarker = "Position: '";

            // Scraped coordinates use '.' as the decimal separator regardless of the machine's locale.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                // Original author's note: listSheet holds only a single record.
                Dictionary<string, string> row = listSheet.GetRow(i);
                string            provinceCode = row["provinceCode"];
                string            provinceName = row["provinceName"];
                string            cityCode     = row["cityCode"];
                string            cityName     = row["cityName"];
                string            shopCode     = row["shopCode"];
                string            shopName     = row["shopName"];
                string            level        = "";
                string            address      = "";
                Nullable<decimal> lng          = null;
                Nullable<decimal> lat          = null;

                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                HtmlNode levelNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@class=\"shop-level\"]/span[1]");
                if (levelNode != null)
                {
                    level = levelNode.InnerText;
                }

                HtmlNode addressNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@class=\"address clearfix\"]/div[@id=\"submitbtns\"]/span");
                if (addressNode != null)
                {
                    address = addressNode.InnerText;
                }

                HtmlNode scriptNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@id=\"scriptSection\"]");
                if (scriptNode != null)
                {
                    // The coordinates are read from a fragment shaped like: Position: '<lng>,<lat>',
                    string script      = scriptNode.InnerText;
                    int    markerIndex = script.IndexOf(positionMarker, StringComparison.Ordinal);

                    // Only slice the script when every delimiter was actually found; previously a
                    // missing marker produced offsets derived from -1 and parsed unrelated text.
                    if (markerIndex >= 0)
                    {
                        int lngBeginIndex = markerIndex + positionMarker.Length;
                        int lngEndIndex   = script.IndexOf(',', lngBeginIndex);
                        int latBeginIndex = lngEndIndex + 1;
                        int latEndIndex   = lngEndIndex < 0 ? -1 : script.IndexOf("',", latBeginIndex, StringComparison.Ordinal);

                        if (latEndIndex >= 0)
                        {
                            lng = decimal.Parse(script.Substring(lngBeginIndex, lngEndIndex - lngBeginIndex), invariant);
                            lat = decimal.Parse(script.Substring(latBeginIndex, latEndIndex - latBeginIndex), invariant);
                        }
                    }
                }

                // Join the text of every service <li> that is not marked with class "not-have".
                StringBuilder      serviceItemSB       = new StringBuilder();
                HtmlNodeCollection allServiceItemNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"sever-xm\"]/ul/li");
                if (allServiceItemNodes != null)
                {
                    for (int j = 0; j < allServiceItemNodes.Count; j++)
                    {
                        HtmlNode serviceNode = allServiceItemNodes[j];
                        if (!serviceNode.Attributes.Contains("class") || serviceNode.Attributes["class"].Value != "not-have")
                        {
                            serviceItemSB.Append(serviceNode.InnerText.Trim() + ";");
                        }
                    }
                }
                string serviceItems = serviceItemSB.ToString();

                Dictionary<string, object> f2vs = new Dictionary<string, object>();
                f2vs.Add("provinceCode", provinceCode);
                f2vs.Add("provinceName", provinceName);
                f2vs.Add("cityCode", cityCode);
                f2vs.Add("cityName", cityName);
                f2vs.Add("shopCode", shopCode);
                f2vs.Add("shopName", shopName);
                f2vs.Add("level", level);
                f2vs.Add("address", address);
                f2vs.Add("lng", lng);
                f2vs.Add("lat", lat);
                f2vs.Add("serviceItems", serviceItems);
                resultEW.AddRow(f2vs);
            }
        }
Ejemplo n.º 18
0
        /// <summary>
        /// Builds the vehicle-information crawl list: scans each saved list page for rows of
        /// the "GridView" table, extracts the registration id from the link in the fourth
        /// cell, and writes one detail-page URL per distinct id to an Excel file in the
        /// export directory.
        /// </summary>
        /// <param name="listSheet">Sheet whose rows supply the "detailPageUrl" and "cookie" columns.</param>
        /// <returns>
        /// Always <c>true</c>. A failure while reading or parsing a page is logged and the
        /// remainder of that page is skipped.
        /// </returns>
        private bool GenerateCLXX(IListSheet listSheet)
        {
            string exportDir     = this.RunPage.GetExportDir();
            string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

            Dictionary<string, int> clxxColumnDic = CommonUtil.InitStringIndexDic(new string[] {
                "detailPageUrl",
                "detailPageName",
                "cookie",
                "grabStatus",
                "giveUpGrab"
            });
            string      clxxPath = Path.Combine(exportDir, this.RunPage.Project.Name + "_车辆信息.xlsx");
            ExcelWriter clxxEW   = new ExcelWriter(clxxPath, "List", clxxColumnDic, null);

            // Used as a set: remembers the ids already written so each is emitted once.
            Dictionary<string, string> rIdToNull = new Dictionary<string, string>();

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                string detailUrl     = row["detailPageUrl"];
                string cookie        = row["cookie"];
                string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);

                try
                {
                    string webPageHtml;

                    // The using block releases the file handle on success as well as on failure;
                    // previously the reader was only disposed when an exception occurred.
                    using (TextReader reader = new StreamReader(localFilePath, Encoding.GetEncoding(((Proj_Detail_SingleLine)this.RunPage.Project.DetailGrabInfoObject).Encoding)))
                    {
                        webPageHtml = reader.ReadToEnd();
                    }

                    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
                    htmlDoc.LoadHtml(webPageHtml);

                    // SelectNodes yields null when nothing matches; treat that as "no rows".
                    HtmlNodeCollection listTrNodeList = htmlDoc.DocumentNode.SelectNodes("//table[@class=\"GridView\"]/tr");
                    if (listTrNodeList == null)
                    {
                        continue;
                    }

                    // Index 0 is skipped — presumably the header row; confirm against a sample page.
                    for (int j = 1; j < listTrNodeList.Count; j++)
                    {
                        HtmlNodeCollection vNodeList = listTrNodeList[j].SelectNodes("./td");
                        string             clickUrl  = vNodeList[3].SelectSingleNode("./span/a").GetAttributeValue("onclick", "");

                        // The id is the text between the first '=' and the last single quote.
                        int    equalsIndex = clickUrl.IndexOf("=");
                        string rId         = clickUrl.Substring(equalsIndex + 1, clickUrl.LastIndexOf("'") - equalsIndex - 1);

                        if (!rIdToNull.ContainsKey(rId))
                        {
                            rIdToNull.Add(rId, "");

                            Dictionary<string, object> reportInfo = new Dictionary<string, object>();
                            reportInfo.Add("detailPageUrl", "http://218.56.62.250/hnts/VehicleGas/VehicleGasView.aspx?RegId=" + rId);
                            reportInfo.Add("detailPageName", rId);
                            reportInfo.Add("cookie", cookie);
                            clxxEW.AddRow(reportInfo);
                        }
                    }
                }
                catch (Exception ex)
                {
                    this.RunPage.InvokeAppendLogText("读取出错. " + ex.Message + " LocalPath = " + localFilePath, LogLevelType.Error, true);
                }
            }

            clxxEW.SaveToDisk();
            return true;
        }
Ejemplo n.º 19
0
        /// <summary>
        /// Parses the air-quality table of every saved city page and writes one row per
        /// distinct (cityCode, date) pair to "&lt;project name&gt;_List.xlsx" in the export directory.
        /// </summary>
        /// <remarks>
        /// The first table row supplies the column names; each later row is mapped cell by
        /// cell onto those names. Dates and numbers are parsed with the invariant culture
        /// because the scraped text does not depend on the machine's locale.
        /// </remarks>
        /// <param name="listSheet">Sheet whose rows supply "detailPageUrl", "cityCode" and "cityName".</param>
        /// <returns>
        /// Always <c>true</c>. A failure while reading or parsing a page is logged and the
        /// remainder of that page is skipped.
        /// </returns>
        private bool GenerateCityReport(IListSheet listSheet)
        {
            string exportDir     = this.RunPage.GetExportDir();
            string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

            Dictionary<string, int> cityReportColumnDic = CommonUtil.InitStringIndexDic(new string[] {
                "cityCode",
                "city",
                "日期",
                "AQI指数",
                "质量等级",
                "当天AQI排名",
                "PM2.5",
                "PM10",
                "Co",
                "No2",
                "So2",
                "O3"
            });
            string cityReportPath = Path.Combine(exportDir, this.RunPage.Project.Name + "_List.xlsx");

            Dictionary<string, string> columnFormats = new Dictionary<string, string>();
            columnFormats.Add("日期", "yyyy-m-d");
            columnFormats.Add("AQI指数", "#0");
            columnFormats.Add("当天AQI排名", "#0");
            columnFormats.Add("PM2.5", "#0");
            columnFormats.Add("PM10", "#0");
            columnFormats.Add("Co", "#0.00");
            columnFormats.Add("No2", "#0");
            columnFormats.Add("So2", "#0");
            columnFormats.Add("O3", "#0");
            ExcelWriter cityReportEW = new ExcelWriter(cityReportPath, "List", cityReportColumnDic, columnFormats);

            // Used as a set: remembers the "cityCode_date" keys already written.
            Dictionary<string, string> codeDateToNull = new Dictionary<string, string>();
            string sourceDateFormat = "yyyy-MM-dd";
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                string detailUrl     = row["detailPageUrl"];
                string cityCode      = row["cityCode"];
                string city          = row["cityName"];
                string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);

                try
                {
                    string webPageHtml;

                    // The using block releases the file handle on success as well as on failure;
                    // previously the reader was only disposed when an exception occurred.
                    using (TextReader reader = new StreamReader(localFilePath, Encoding.GetEncoding(((Proj_Detail_SingleLine)this.RunPage.Project.DetailGrabInfoObject).Encoding)))
                    {
                        webPageHtml = reader.ReadToEnd();
                    }

                    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
                    htmlDoc.LoadHtml(webPageHtml);

                    // SelectNodes yields null when nothing matches; treat that as "no rows".
                    HtmlNodeCollection listDivNodeList = htmlDoc.DocumentNode.SelectNodes("//*[@id=\"content\"]/div[3]/table[1]/tr");
                    if (listDivNodeList != null && listDivNodeList.Count > 1)
                    {
                        // Map each cell position of the first row to its column name.
                        Dictionary<int, string> cityReportColumnIndexDic = new Dictionary<int, string>();
                        HtmlNodeCollection      nameNodes = listDivNodeList[0].SelectNodes("td");
                        for (int j = 0; j < nameNodes.Count; j++)
                        {
                            cityReportColumnIndexDic.Add(j, nameNodes[j].InnerText.Trim());
                        }

                        for (int j = 1; j < listDivNodeList.Count; j++)
                        {
                            HtmlNodeCollection         vNodeList  = listDivNodeList[j].SelectNodes("./td");
                            Dictionary<string, object> reportInfo = new Dictionary<string, object>();
                            reportInfo.Add("cityCode", cityCode);
                            reportInfo.Add("city", city);

                            for (int k = 0; k < nameNodes.Count; k++)
                            {
                                string value     = vNodeList[k].InnerText.Trim();
                                string columName = cityReportColumnIndexDic[k];
                                switch (columName)
                                {
                                    case "日期":
                                        DateTime dt = DateTime.ParseExact(value, sourceDateFormat, invariant);
                                        reportInfo.Add(columName, dt);
                                        break;

                                    // NOTE(review): "O3" has a numeric column format above but is not
                                    // listed here, so it is stored as text — confirm whether intended.
                                    case "AQI指数":
                                    case "当天AQI排名":
                                    case "PM2.5":
                                    case "PM10":
                                    case "Co":
                                    case "No2":
                                    case "So2":
                                        reportInfo.Add(columName, decimal.Parse(value, invariant));
                                        break;

                                    default:
                                        reportInfo.Add(columName, value);
                                        break;
                                }
                            }

                            // Emit each (city, date) combination only once across all pages.
                            string codeDate = cityCode + "_" + ((DateTime)reportInfo["日期"]).ToString("yyyy-MM-dd");
                            if (!codeDateToNull.ContainsKey(codeDate))
                            {
                                cityReportEW.AddRow(reportInfo);
                                codeDateToNull.Add(codeDate, null);
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    this.RunPage.InvokeAppendLogText("读取出错. " + ex.Message + " LocalPath = " + localFilePath, LogLevelType.Error, true);
                }
            }

            cityReportEW.SaveToDisk();
            return true;
        }
Ejemplo n.º 20
0
 /// <summary>
 /// Override of <c>AfterAllGrab</c> that runs <c>GenerateDAGLJL</c> and then, only if
 /// that reported success, <c>GenerateCLXX</c>.
 /// </summary>
 /// <param name="listSheet">List sheet forwarded to both generators.</param>
 /// <returns><c>true</c> when both generators return <c>true</c>; otherwise <c>false</c>.</returns>
 public override bool AfterAllGrab(IListSheet listSheet)
 {
     if (!GenerateDAGLJL(listSheet))
     {
         return false;
     }

     return GenerateCLXX(listSheet);
 }
Ejemplo n.º 21
0
        /// <summary>
        /// Extracts the product entries from every saved category page, writes one row per
        /// distinct "city_productCode" to "&lt;project name&gt;_AllDetailPageUrl.xlsx" in the
        /// export directory, and then starts the follow-up task on that file.
        /// </summary>
        /// <param name="listSheet">
        /// Sheet whose rows supply the detail page URL, "categoryCode", "categoryName",
        /// "cookie" and "city"; rows flagged "Y" in the give-up column are skipped.
        /// </param>
        /// <returns>Always <c>true</c> when no exception is raised.</returns>
        /// <exception cref="Exception">
        /// Any error raised while parsing a page is logged and rethrown; the result file is
        /// not saved in that case.
        /// </exception>
        private bool GetAllDetailPageUrl(IListSheet listSheet)
        {
            string exportDir     = this.RunPage.GetExportDir();
            string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

            string[] resultColumns = new string[] { "detailPageUrl",
                                                    "detailPageName",
                                                    "cookie",
                                                    "grabStatus",
                                                    "giveUpGrab",
                                                    "productCode",
                                                    "productName",
                                                    "productCurrentPrice",
                                                    "productOldPrice",
                                                    "categoryCode",
                                                    "categoryName",
                                                    "standard",
                                                    "city" };
            Dictionary<string, int> resultColumnDic = CommonUtil.InitStringIndexDic(resultColumns);
            string      resultFilePath = Path.Combine(exportDir, this.RunPage.Project.Name + "_AllDetailPageUrl.xlsx");
            ExcelWriter resultEW       = new ExcelWriter(resultFilePath, "List", resultColumnDic);

            string detailPageUrlPrefix = "http://www.fruitday.com";

            // Used as a set: remembers the "city_productCode" names already written.
            Dictionary<string, string> allProductCodes = new Dictionary<string, string>();

            string detailPageUrlColumnName = SysConfig.DetailPageUrlFieldName;

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                bool giveUp = "Y".Equals(row[SysConfig.GiveUpGrabFieldName]);
                if (giveUp)
                {
                    continue;
                }

                string url           = row[detailPageUrlColumnName];
                string categoryCode  = row["categoryCode"];
                string categoryName  = row["categoryName"];
                string cookie        = row["cookie"];
                string city          = row["city"];
                string localFilePath = this.RunPage.GetFilePath(url, pageSourceDir);

                try
                {
                    HtmlAgilityPack.HtmlDocument htmlDoc      = this.RunPage.GetLocalHtmlDocument(listSheet, i);
                    HtmlNodeCollection           allItemNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"leftpart pull-left\"]/ul/li");
                    if (allItemNodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode itemNode in allItemNodes)
                    {
                        string productName         = "";
                        string productCurrentPrice = "";
                        // Never populated from the page; the column is always written empty.
                        string productOldPrice     = "";
                        string standard            = "";

                        // The product code is the last path segment of the item's link.
                        HtmlNode urlNode        = itemNode.SelectSingleNode("./div/div[@class=\"s-img\"]/a");
                        string   detailPageUrl  = detailPageUrlPrefix + urlNode.Attributes["href"].Value;
                        string   productCode    = detailPageUrl.Substring(detailPageUrl.LastIndexOf("/") + 1);
                        string   detailPageName = city + "_" + productCode;

                        HtmlNodeCollection propertyNodes = itemNode.SelectSingleNode("./div/div[@class=\"s-info clearfix\"]").ChildNodes;
                        foreach (HtmlNode propertyNode in propertyNodes)
                        {
                            if (propertyNode.NodeType == HtmlNodeType.Text)
                            {
                                // The last text node encountered wins.
                                productName = propertyNode.InnerText.Trim();
                            }
                            else if (propertyNode.Attributes.Contains("class") &&
                                     propertyNode.Attributes["class"].Value == "s-unit pull-right font-color")
                            {
                                // Drop the first character of the price text — presumably a currency symbol.
                                string priceStr = propertyNode.InnerText.Trim();
                                productCurrentPrice = priceStr.Substring(1);
                            }
                        }

                        HtmlNode standardNode = itemNode.SelectSingleNode("./div/div[@class=\"p-operate clearfix\"]");
                        if (standardNode != null)
                        {
                            standard = standardNode.InnerText.Trim();
                        }

                        if (!allProductCodes.ContainsKey(detailPageName))
                        {
                            allProductCodes.Add(detailPageName, null);
                            Dictionary<string, string> p2vs = new Dictionary<string, string>();
                            p2vs.Add("detailPageUrl", detailPageUrl + "?city=" + city);
                            p2vs.Add("detailPageName", detailPageName);
                            p2vs.Add("city", city);
                            p2vs.Add("cookie", cookie);
                            p2vs.Add("productCode", productCode);
                            p2vs.Add("productName", productName);
                            p2vs.Add("productCurrentPrice", productCurrentPrice);
                            p2vs.Add("productOldPrice", productOldPrice);
                            p2vs.Add("categoryCode", categoryCode);
                            p2vs.Add("categoryName", categoryName);
                            p2vs.Add("standard", standard);
                            resultEW.AddRow(p2vs);
                        }
                    }
                }
                catch (Exception ex)
                {
                    this.RunPage.InvokeAppendLogText("读取出错.  " + ex.Message + " LocalPath = " + localFilePath, LogLevelType.Error, true);

                    // Rethrow with "throw;" so the original stack trace is preserved.
                    throw;
                }
            }
            resultEW.SaveToDisk();

            // Kick off the follow-up task.
            TaskManager.StartTask("易果", "天天果园获取所有详情页", resultFilePath, null, null, false);
            return true;
        }
Ejemplo n.º 22
0
        /// <summary>
        /// Generates the archive-management records: copies cells 1 to 9 of every "GridView"
        /// table row (after the first) found in the saved list pages into an Excel file in
        /// the export directory.
        /// </summary>
        /// <param name="listSheet">Sheet whose rows supply the "detailPageUrl" column.</param>
        /// <returns>
        /// Always <c>true</c>. A failure while reading or parsing a page is logged and the
        /// remainder of that page is skipped.
        /// </returns>
        private bool GenerateDAGLJL(IListSheet listSheet)
        {
            string exportDir     = this.RunPage.GetExportDir();
            string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

            // Output columns, in the order their values appear in table cells 1..9.
            string[] dagljlColumns = new string[] {
                "区划",
                "录入单位",
                "登记证编号",
                "车牌号码",
                "安装数量",
                "使用单位",
                "安装日期",
                "登记日期",
                "状态"
            };
            Dictionary<string, int> dagljlColumnDic = CommonUtil.InitStringIndexDic(dagljlColumns);
            string dagljlPath = Path.Combine(exportDir, this.RunPage.Project.Name + "_档案管理记录.xlsx");
            Dictionary<string, string> columnFormats = new Dictionary<string, string>();
            ExcelWriter dagljlEW = new ExcelWriter(dagljlPath, "List", dagljlColumnDic, columnFormats);

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                string detailUrl     = row["detailPageUrl"];
                string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);

                try
                {
                    string webPageHtml;

                    // The using block releases the file handle on success as well as on failure;
                    // previously the reader was only disposed when an exception occurred.
                    using (TextReader reader = new StreamReader(localFilePath, Encoding.GetEncoding(((Proj_Detail_SingleLine)this.RunPage.Project.DetailGrabInfoObject).Encoding)))
                    {
                        webPageHtml = reader.ReadToEnd();
                    }

                    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
                    htmlDoc.LoadHtml(webPageHtml);

                    // SelectNodes yields null when nothing matches; treat that as "no rows".
                    HtmlNodeCollection listTrNodeList = htmlDoc.DocumentNode.SelectNodes("//table[@class=\"GridView\"]/tr");
                    if (listTrNodeList == null)
                    {
                        continue;
                    }

                    // Index 0 is skipped — presumably the header row; confirm against a sample page.
                    for (int j = 1; j < listTrNodeList.Count; j++)
                    {
                        HtmlNodeCollection         vNodeList  = listTrNodeList[j].SelectNodes("./td");
                        Dictionary<string, object> reportInfo = new Dictionary<string, object>();

                        // Column c takes its value from cell c + 1 (cell 0 is not exported).
                        for (int c = 0; c < dagljlColumns.Length; c++)
                        {
                            reportInfo.Add(dagljlColumns[c], vNodeList[c + 1].InnerText.Trim());
                        }
                        dagljlEW.AddRow(reportInfo);
                    }
                }
                catch (Exception ex)
                {
                    this.RunPage.InvokeAppendLogText("读取出错. " + ex.Message + " LocalPath = " + localFilePath, LogLevelType.Error, true);
                }
            }

            dagljlEW.SaveToDisk();
            return true;
        }
        /// <summary>
        /// Collects the distinct links found under <c>div.baseinfo</c> in every saved page and
        /// writes each one, together with the originating row's community, city and area
        /// columns, to the current result workbook.
        /// </summary>
        /// <remarks>
        /// The city name is the first comma-separated entry of <c>this.Parameters</c>. A new
        /// writer is obtained from <c>GetExcelWriter</c> before the first row and again whenever
        /// the current writer holds more than 500000 rows; the previous writer is saved first.
        /// Nothing is saved when the list sheet has no rows.
        /// </remarks>
        /// <param name="listSheet">Sheet whose rows identify the pages to parse; rows flagged "Y" in the give-up column are skipped.</param>
        private void GetXiaoquInfos(IListSheet listSheet)
        {
            string[] paramterParts = this.Parameters.Split(new string[] { "," }, StringSplitOptions.RemoveEmptyEntries);
            string   cityName      = paramterParts[0];

            ExcelWriter resultEW  = null;
            int         fileIndex = 1;

            // Used as a set: remembers the links already written.
            Dictionary<string, string> fangLinkUrlDic = new Dictionary<string, string>();

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                // Roll over to a fresh output file once the current one is full.
                if (resultEW == null || resultEW.RowCount > 500000)
                {
                    if (resultEW != null)
                    {
                        resultEW.SaveToDisk();
                    }
                    resultEW = this.GetExcelWriter(fileIndex, cityName);
                    fileIndex++;
                }

                Dictionary<string, string> row = listSheet.GetRow(i);
                bool giveUp = "Y".Equals(row[SysConfig.GiveUpGrabFieldName]);
                if (giveUp)
                {
                    continue;
                }

                // Exceptions propagate to the caller unchanged; the former catch block only
                // rethrew with "throw ex;", which discarded the original stack trace.
                HtmlAgilityPack.HtmlDocument htmlDoc      = this.RunPage.GetLocalHtmlDocument(listSheet, i);
                HtmlNodeCollection           fangNodeList = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"baseinfo\"]/a");
                if (fangNodeList == null)
                {
                    continue;
                }

                foreach (HtmlNode fangNode in fangNodeList)
                {
                    string fangLinkUrl = fangNode.GetAttributeValue("href", "");
                    if (fangLinkUrlDic.ContainsKey(fangLinkUrl))
                    {
                        continue;
                    }

                    fangLinkUrlDic.Add(fangLinkUrl, null);
                    Dictionary<string, string> f2vs = new Dictionary<string, string>();
                    f2vs.Add("detailPageUrl", fangLinkUrl);
                    f2vs.Add("detailPageName", fangLinkUrl);
                    f2vs.Add("xiaoquname", row["xiaoquName"]);
                    f2vs.Add("xiaoquurl", row["xiaoquUrl"]);
                    f2vs.Add("cityName", row["cityName"]);
                    f2vs.Add("cityCode", row["cityCode"]);
                    f2vs.Add("level1AreaName", row["level1AreaName"]);
                    f2vs.Add("level1AreaCode", row["level1AreaCode"]);
                    f2vs.Add("level2AreaCode", row["level2AreaCode"]);
                    f2vs.Add("level2AreaName", row["level2AreaName"]);

                    resultEW.AddRow(f2vs);
                }
            }

            // resultEW is still null when the list sheet was empty; saving unconditionally
            // used to fail with a NullReferenceException in that case.
            if (resultEW != null)
            {
                resultEW.SaveToDisk();
            }
        }
Ejemplo n.º 24
0
        /// <summary>
        /// Reads the JSON category list saved for each list-sheet row and writes one paged
        /// periodical-list URL per page of each sub-category to "万方期刊_期刊列表.xlsx" in the
        /// export directory.
        /// </summary>
        /// <remarks>
        /// Each JSON item supplies "id", "showName" and "count". The number of pages is the
        /// item count divided by the page size of 20, rounded up, so a category whose count
        /// is an exact multiple of 20 no longer produces a trailing empty page.
        /// </remarks>
        /// <param name="listSheet">Sheet whose rows supply "detailPageUrl", "cate1" and "cateId1"; rows flagged "Y" in the give-up column are skipped.</param>
        private void GetPeriodicalListPageUrls(IListSheet listSheet)
        {
            const int pageSize = 20;

            string exportDir     = this.RunPage.GetExportDir();
            string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

            Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();

            resultColumnDic.Add("detailPageUrl", 0);
            resultColumnDic.Add("detailPageName", 1);
            resultColumnDic.Add("cookie", 2);
            resultColumnDic.Add("grabStatus", 3);
            resultColumnDic.Add("giveUpGrab", 4);
            resultColumnDic.Add("cate1", 5);
            resultColumnDic.Add("cateId1", 6);
            resultColumnDic.Add("cate2", 7);
            resultColumnDic.Add("cateId2", 8);
            resultColumnDic.Add("pageIndex", 9);
            string      resultFilePath = Path.Combine(exportDir, "万方期刊_期刊列表.xlsx");
            ExcelWriter resultEW       = new ExcelWriter(resultFilePath, "List", resultColumnDic, null);

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                string detailUrl = row["detailPageUrl"];
                string cate1     = row["cate1"];
                string cateId1   = row["cateId1"];
                bool   giveUp    = "Y".Equals(row[SysConfig.GiveUpGrabFieldName]);
                if (giveUp)
                {
                    continue;
                }

                // Exceptions propagate to the caller unchanged; the former catch block only
                // rethrew with "throw ex;", which discarded the original stack trace.
                string localFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
                string pageFileText  = FileHelper.GetTextFromFile(localFilePath);
                JArray itemJsonArray = JArray.Parse(pageFileText);

                for (int j = 0; j < itemJsonArray.Count; j++)
                {
                    // A direct cast fails loudly on a non-object entry instead of yielding null.
                    JObject itemJson        = (JObject)itemJsonArray[j];
                    string  cateId2         = itemJson.GetValue("id").ToString();
                    string  cate2           = itemJson.GetValue("showName").ToString().Trim();
                    int     periodicalCount = int.Parse(itemJson.GetValue("count").ToString().Trim(), System.Globalization.CultureInfo.InvariantCulture);

                    // Ceiling division: 0 items -> 0 pages, 1..20 -> 1 page, 21..40 -> 2 pages.
                    int pageCount = (periodicalCount + pageSize - 1) / pageSize;

                    for (int k = 0; k < pageCount; k++)
                    {
                        string pageNumber = (k + 1).ToString();
                        string newUrl     = "http://www.wanfangdata.com.cn/perio/page.do?page=" + pageNumber + "&pageSize=" + pageSize + "&selectOrder=affectoi&fmList=" + cateId2 + "&a_title=&core=&fromData=WF&included=&publishyear=&isfirst=";

                        Dictionary<string, string> f2vs = new Dictionary<string, string>();
                        f2vs.Add("detailPageUrl", newUrl);
                        f2vs.Add("detailPageName", newUrl);
                        f2vs.Add("cate1", cate1);
                        f2vs.Add("cateId1", cateId1);
                        f2vs.Add("cate2", cate2);
                        f2vs.Add("cateId2", cateId2);
                        f2vs.Add("pageIndex", pageNumber);
                        resultEW.AddRow(f2vs);
                    }
                }
            }
            resultEW.SaveToDisk();
        }
Ejemplo n.º 25
0
 /// <summary>
 /// Runs this step by delegating to <c>GetShopList</c> and handing its result back to the caller.
 /// </summary>
 /// <param name="parameters">Parameter string forwarded unchanged to <c>GetShopList</c>.</param>
 /// <param name="listSheet">List sheet forwarded unchanged to <c>GetShopList</c>.</param>
 /// <returns>Whatever <c>GetShopList</c> returns.</returns>
 public bool Run(string parameters, IListSheet listSheet)
 {
     bool succeeded = GetShopList(parameters, listSheet);
     return succeeded;
 }
Ejemplo n.º 26
0
 /// <summary>
 /// Runs this step by delegating to <c>GetAllDetailPageUrl</c>; the <paramref name="parameters"/>
 /// argument is not used here.
 /// </summary>
 /// <param name="parameters">Ignored by this implementation.</param>
 /// <param name="listSheet">List sheet forwarded unchanged to <c>GetAllDetailPageUrl</c>.</param>
 /// <returns>Whatever <c>GetAllDetailPageUrl</c> returns.</returns>
 public bool Run(string parameters, IListSheet listSheet)
 {
     bool succeeded = GetAllDetailPageUrl(listSheet);
     return succeeded;
 }
Ejemplo n.º 27
0
        /// <summary>
        /// Walks every row of <paramref name="listSheet"/> that is not flagged as given up, loads the
        /// locally saved detail page for the row, extracts the brand and the current price from its
        /// HTML, and appends one result row to <paramref name="resultEW"/>.
        /// </summary>
        /// <param name="listSheet">Sheet whose rows provide the page URL plus product and category columns.</param>
        /// <param name="pageSourceDir">Directory handed to <c>RunPage.GetFilePath</c> to locate the saved page.</param>
        /// <param name="resultEW">Writer that receives the rows; this method does not save it.</param>
        /// <remarks>
        /// When no price node is found the problem is logged, the local page file is deleted and the row
        /// is still written with a price of 0. Any exception raised while reading or parsing a page is
        /// logged and rethrown with its original stack trace.
        /// </remarks>
        private void GetList(IListSheet listSheet, string pageSourceDir, ExcelWriter resultEW)
        {
            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                if (row["giveUpGrab"] == "Y")
                {
                    continue;
                }

                string pageUrl       = row["detailPageUrl"];
                string localFilePath = this.RunPage.GetFilePath(pageUrl, pageSourceDir);
                string productCode   = row["productCode"];
                string productName   = row["productName"];
                string category1Code = row["category1Code"];
                string category2Code = row["category2Code"];
                string category3Code = row["category3Code"];
                string category1Name = row["category1Name"];
                string category2Name = row["category2Name"];
                string category3Name = row["category3Name"];

                string  pinpai              = "";
                decimal productCurrentPrice = 0;

                try
                {
                    // Read the whole page first so the file handle is released before any
                    // parsing happens and before the file is possibly deleted below.
                    string webPageHtml;
                    using (StreamReader reader = new StreamReader(localFilePath))
                    {
                        webPageHtml = reader.ReadToEnd();
                    }

                    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
                    htmlDoc.LoadHtml(webPageHtml);

                    pinpai = ExtractBrand(htmlDoc);

                    // Two price locations are probed; the second is only used when the first is absent.
                    HtmlNode priceNode =
                        htmlDoc.DocumentNode.SelectSingleNode("//font[@class=\"info-price\"]/b[@class=\"JS-control-price\"]")
                        ?? htmlDoc.DocumentNode.SelectSingleNode("//div[@id=\"J_product_value\"]/div/strong[@class=\"fn-rmb-num\"]");

                    if (priceNode != null)
                    {
                        // Scraped text is machine data: parse it independently of the OS culture.
                        productCurrentPrice = decimal.Parse(
                            priceNode.InnerText.Trim(),
                            System.Globalization.CultureInfo.InvariantCulture);
                    }
                    else
                    {
                        this.RunPage.InvokeAppendLogText("None price! url = " + pageUrl, LogLevelType.Error, true);

                        // NOTE(review): the saved page is removed when it has no price, presumably so
                        // it gets downloaded again on a later run - confirm.
                        File.Delete(localFilePath);
                    }
                }
                catch (Exception ex)
                {
                    this.RunPage.InvokeAppendLogText("读取出错. url = " + pageUrl + ". " + ex.Message, LogLevelType.Error, true);
                    throw; // rethrow without resetting the stack trace
                }

                Dictionary<string, object> f2vs = new Dictionary<string, object>();
                f2vs.Add("商品编码", productCode);
                f2vs.Add("商品名称", productName);
                f2vs.Add("价格", productCurrentPrice);
                f2vs.Add("品牌", pinpai);
                f2vs.Add("一级分类", category1Name);
                f2vs.Add("二级分类", category2Name);
                f2vs.Add("三级分类", category3Name);
                f2vs.Add("url", pageUrl);
                f2vs.Add("一级分类编码", category1Code);
                f2vs.Add("二级分类编码", category2Code);
                f2vs.Add("三级分类编码", category3Code);

                resultEW.AddRow(f2vs);
            }
        }

        /// <summary>
        /// Looks for the brand in the two page layouts this scraper knows about and returns it, or an
        /// empty string when neither layout yields one. A hit in the second layout overrides the first.
        /// </summary>
        private static string ExtractBrand(HtmlAgilityPack.HtmlDocument htmlDoc)
        {
            string brand = "";

            // Layout 1: <dl class="dl-proInfo"> whose <dd> text starts with the brand label.
            HtmlNode infoListNode = htmlDoc.DocumentNode.SelectSingleNode("//dl[@class=\"dl-proInfo\"]");
            // SelectNodes yields null (not an empty collection) when nothing matches.
            HtmlNodeCollection ddNodes = infoListNode == null ? null : infoListNode.SelectNodes("./dd");
            if (ddNodes != null)
            {
                foreach (HtmlNode ddNode in ddNodes)
                {
                    string ddText = ddNode.InnerText.Trim();
                    if (ddText.StartsWith("品牌:", StringComparison.Ordinal))
                    {
                        // Drop the three-character label prefix.
                        brand = ddText.Substring(3);
                        break;
                    }
                }
            }

            // Layout 2: <ul class="depict-list fn-clear"> with label in span[1] and value in span[2].
            HtmlNode depictListNode = htmlDoc.DocumentNode.SelectSingleNode("//ul[@class=\"depict-list fn-clear\"]");
            HtmlNodeCollection liNodes = depictListNode == null ? null : depictListNode.SelectNodes("./li");
            if (liNodes != null)
            {
                foreach (HtmlNode liNode in liNodes)
                {
                    HtmlNode labelNode = liNode.SelectSingleNode("./span[1]");
                    if (labelNode == null)
                    {
                        continue;
                    }

                    if (labelNode.InnerText.Trim().StartsWith("品牌:", StringComparison.Ordinal))
                    {
                        HtmlNode valueNode = liNode.SelectSingleNode("./span[2]");
                        brand = valueNode == null ? "" : valueNode.InnerText.Trim();
                        break;
                    }
                }
            }

            return brand;
        }
Ejemplo n.º 28
0
        /// <summary>
        /// Parses every locally saved listing page referenced by <paramref name="listSheet"/> (rows
        /// flagged as given up are skipped), collects the product links found in its product grid and
        /// writes them, together with the category and cookie columns of the source row, to
        /// "&lt;project name&gt;_AllDetailPageUrl.xlsx" in the export directory.
        /// </summary>
        /// <param name="listSheet">Sheet whose rows describe the listing pages to parse.</param>
        /// <returns>Always <c>true</c>; failures surface as exceptions.</returns>
        /// <remarks>
        /// Any exception raised while parsing a page is logged together with the local file path and
        /// rethrown with its original stack trace. That includes the NullReferenceException raised when
        /// a grid item lacks the expected name link.
        /// </remarks>
        private bool GetAllDetailPageUrl(IListSheet listSheet)
        {
            const string detailPageUrlPrefix = "http://www.cityshop.com.cn";

            string exportDir     = this.RunPage.GetExportDir();
            string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

            Dictionary<string, int> resultColumnDic = CommonUtil.InitStringIndexDic(new string[] {
                "detailPageUrl",
                "detailPageName",
                "cookie",
                "grabStatus",
                "giveUpGrab",
                "productCode",
                "productName",
                "category1Code",
                "category2Code",
                "category3Code",
                "category1Name",
                "category2Name",
                "category3Name"
            });
            string resultFilePath = Path.Combine(exportDir, this.RunPage.Project.Name + "_AllDetailPageUrl.xlsx");

            ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic);

            string detailPageUrlColumnName = SysConfig.DetailPageUrlFieldName;

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                string url           = row[detailPageUrlColumnName];
                string category1Code = row["category1Code"];
                string category2Code = row["category2Code"];
                string category3Code = row["category3Code"];
                string category1Name = row["category1Name"];
                string category2Name = row["category2Name"];
                string category3Name = row["category3Name"];
                string cookie        = row["cookie"];
                // Only needed for the error message below.
                string localFilePath = this.RunPage.GetFilePath(url, pageSourceDir);

                try
                {
                    HtmlAgilityPack.HtmlDocument htmlDoc   = this.RunPage.GetLocalHtmlDocument(listSheet, i);
                    HtmlNodeCollection           itemNodes = htmlDoc.DocumentNode.SelectNodes("//ul[@class=\"row product-grid\"]/li");
                    if (itemNodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode itemNode in itemNodes)
                    {
                        HtmlNode nameNode      = itemNode.SelectSingleNode("./div[2]/p[1]/a");
                        string   detailPageUrl = detailPageUrlPrefix + nameNode.Attributes["href"].Value;

                        // The product code is the path segment between the last '/' and the last '?'.
                        int startIndex = detailPageUrl.LastIndexOf('/') + 1;
                        int endIndex   = detailPageUrl.LastIndexOf('?');
                        int length     = endIndex - startIndex;

                        // For gift-card products length == 0, and no detail page needs to be fetched.
                        if (length <= 0)
                        {
                            continue;
                        }

                        string detailPageName = detailPageUrl.Substring(startIndex, length);
                        string productCode    = detailPageName;
                        string productName    = nameNode.InnerText.Trim();

                        Dictionary<string, string> p2vs = new Dictionary<string, string>();
                        p2vs.Add("detailPageUrl", detailPageUrl);
                        p2vs.Add("detailPageName", detailPageName);
                        p2vs.Add("productCode", productCode);
                        p2vs.Add("productName", productName);
                        p2vs.Add("category1Code", category1Code);
                        p2vs.Add("category2Code", category2Code);
                        p2vs.Add("category3Code", category3Code);
                        p2vs.Add("category1Name", category1Name);
                        p2vs.Add("category2Name", category2Name);
                        p2vs.Add("category3Name", category3Name);
                        p2vs.Add("cookie", cookie);
                        resultEW.AddRow(p2vs);
                    }
                }
                catch (Exception ex)
                {
                    this.RunPage.InvokeAppendLogText("读取出错.  " + ex.Message + " LocalPath = " + localFilePath, LogLevelType.Error, true);
                    throw; // rethrow without resetting the stack trace
                }
            }

            resultEW.SaveToDisk();
            return(true);
        }
Ejemplo n.º 29
0
        /// <summary>
        /// After all pages are grabbed, builds "大众点评店铺信息.xlsx" in the export directory: one row per
        /// list-sheet row that is not flagged as given up, copying the shop columns from the sheet and
        /// adding the latitude/longitude parsed out of the locally saved shop page.
        /// </summary>
        /// <param name="listSheet">Sheet whose rows describe the grabbed shop pages.</param>
        /// <returns>Always <c>true</c>; failures surface as exceptions.</returns>
        public override bool AfterAllGrab(IListSheet listSheet)
        {
            String exportDir = this.RunPage.GetExportDir();

            Dictionary<string, int> resultColumnDic = new Dictionary<string, int>();

            resultColumnDic.Add("city", 0);
            resultColumnDic.Add("gName", 1);
            resultColumnDic.Add("rName", 2);
            resultColumnDic.Add("shopName", 3);
            resultColumnDic.Add("reviewNum", 4);
            resultColumnDic.Add("serviceRating", 5);
            resultColumnDic.Add("environmentRating", 6);
            resultColumnDic.Add("tasteRating", 7);
            resultColumnDic.Add("address", 8);
            resultColumnDic.Add("lat", 9);
            resultColumnDic.Add("lng", 10);
            string resultFilePath = Path.Combine(exportDir, "大众点评店铺信息.xlsx");

            // NOTE(review): these column formats are built but never handed to the writer below, so
            // they currently have no effect. Presumably they were meant to be an extra ExcelWriter
            // constructor argument - confirm against the ExcelWriter API before wiring them in.
            Dictionary<string, string> resultColumnFormat = new Dictionary<string, string>();

            resultColumnFormat.Add("reviewNum", "#,##0");
            resultColumnFormat.Add("lat", "#,##0.000000");
            resultColumnFormat.Add("lng", "#,##0.000000");
            resultColumnFormat.Add("serviceRating", "#,##0.00");
            resultColumnFormat.Add("environmentRating", "#,##0.0");
            resultColumnFormat.Add("tasteRating", "#,##0.0");

            ExcelWriter resultEW = new ExcelWriter(resultFilePath, "List", resultColumnDic);

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                // No try/catch here: the previous handler only did "throw ex", which added nothing
                // except discarding the original stack trace.
                HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);
                string pageText = pageHtmlDoc.DocumentNode.InnerHtml;

                Nullable<decimal> lat = ExtractQuotedDecimal(pageText, "shopGlat:");
                Nullable<decimal> lng = ExtractQuotedDecimal(pageText, "shopGlng:");

                Dictionary<string, object> f2vs = new Dictionary<string, object>();
                f2vs.Add("city", row["city"]);
                f2vs.Add("gName", row["gName"]);
                f2vs.Add("rName", row["rName"]);
                f2vs.Add("shopName", row["shopName"]);
                f2vs.Add("reviewNum", row["reviewNum"]);
                f2vs.Add("serviceRating", row["serviceRating"]);
                f2vs.Add("environmentRating", row["environmentRating"]);
                f2vs.Add("tasteRating", row["tasteRating"]);
                f2vs.Add("address", row["address"]);
                f2vs.Add("lat", lat);
                f2vs.Add("lng", lng);
                resultEW.AddRow(f2vs);
            }

            resultEW.SaveToDisk();

            return(true);
        }

        /// <summary>
        /// Finds <paramref name="marker"/> in <paramref name="pageText"/> and parses the first
        /// double-quoted value that follows it as a decimal. Returns <c>null</c> when the marker or
        /// the quotes are missing, the quoted text is empty, or it is not a valid number.
        /// </summary>
        private static Nullable<decimal> ExtractQuotedDecimal(string pageText, string marker)
        {
            int markerIndex = pageText.IndexOf(marker, StringComparison.Ordinal);
            if (markerIndex < 0)
            {
                return null;
            }

            int openQuoteIndex = pageText.IndexOf('"', markerIndex);
            if (openQuoteIndex < 0)
            {
                return null;
            }

            int closeQuoteIndex = pageText.IndexOf('"', openQuoteIndex + 1);
            if (closeQuoteIndex - openQuoteIndex <= 1)
            {
                // Covers both a missing closing quote (-1) and an empty value ("").
                return null;
            }

            string rawValue = pageText.Substring(openQuoteIndex + 1, closeQuoteIndex - openQuoteIndex - 1);
            decimal parsed;
            // Same number style as the default TryParse overload, but independent of the OS culture.
            if (decimal.TryParse(
                    rawValue,
                    System.Globalization.NumberStyles.Number,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out parsed))
            {
                return parsed;
            }

            return null;
        }
Ejemplo n.º 30
0
        /// <summary>
        /// For every list-sheet row that is not flagged as given up, loads the locally saved page,
        /// scans its "//table/tbody/tr" rows and writes one CSV record (company id plus up to five
        /// project columns) for each table row whose first cell has <c>data-header="序号"</c>.
        /// The CSV writer is saved to disk once all rows are processed.
        /// </summary>
        /// <param name="listSheet">Sheet whose rows describe the grabbed pages and carry "companyId".</param>
        private void GetAllInfos(IListSheet listSheet)
        {
            CsvWriter cw = this.GetCsvExcelWriter();

            for (int i = 0; i < listSheet.RowCount; i++)
            {
                Dictionary<string, string> row = listSheet.GetRow(i);
                if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
                {
                    continue;
                }

                string companyId = row["companyId"];

                HtmlAgilityPack.HtmlDocument pageHtmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                HtmlNodeCollection trNodeList = pageHtmlDoc.DocumentNode.SelectNodes("//table/tbody/tr");
                if (trNodeList == null)
                {
                    continue;
                }

                // No try/catch here: the previous handlers only did "throw ex", which added nothing
                // except discarding the original stack trace.
                foreach (HtmlNode trNode in trNodeList)
                {
                    HtmlNodeCollection tdNodeList = trNode.SelectNodes("./td");
                    if (tdNodeList == null || tdNodeList.Count == 0)
                    {
                        continue;
                    }

                    // Only rows whose first cell is tagged as the sequence-number column are exported.
                    if (tdNodeList[0].GetAttributeValue("data-header", "") != "序号")
                    {
                        continue;
                    }

                    Dictionary<string, string> f2vs = new Dictionary<string, string>();
                    f2vs.Add("CompanyId", companyId);
                    f2vs.Add("项目编码", GetCellTextOrEmpty(tdNodeList, 1));
                    f2vs.Add("项目名称", GetCellTextOrEmpty(tdNodeList, 2));
                    f2vs.Add("项目属地", GetCellTextOrEmpty(tdNodeList, 3));
                    f2vs.Add("项目类别", GetCellTextOrEmpty(tdNodeList, 4));
                    f2vs.Add("建设单位", GetCellTextOrEmpty(tdNodeList, 5));
                    cw.AddRow(f2vs);
                }
            }

            cw.SaveToDisk();
        }

        /// <summary>
        /// Returns the trimmed inner text of the cell at <paramref name="index"/>, or an empty string
        /// when the row has fewer cells than that.
        /// </summary>
        private static string GetCellTextOrEmpty(HtmlNodeCollection cells, int index)
        {
            return index < cells.Count ? cells[index].InnerText.Trim() : "";
        }