Exemplo n.º 1
0
        /// <summary>
        /// Logs in to Linkedin before any page is grabbed and then starts the grab on the run page.
        /// </summary>
        /// <remarks>
        /// <c>Parameters</c> is split on commas (empty entries are dropped). The first entry is used as
        /// the login page URL and the second as the URL that confirms a successful login.
        /// </remarks>
        /// <returns>Always <c>false</c>.</returns>
        /// <exception cref="Exception">
        /// Any failure inside the method (including a missing parameter entry) is wrapped in a new
        /// exception that carries the original one as its inner exception.
        /// </exception>
        public override bool BeforeAllGrab()
        {
            try
            {
                string[] parts = this.Parameters.Split(new string[] { "," }, StringSplitOptions.RemoveEmptyEntries);
                string loginPageUrl = parts[0];
                string loginSucceedCheckUrl = parts[1];

                this.RunPage.MustReGrab = true;

                bool isProduce = SysConfig.SysExecuteType == SysExecuteType.Produce;

                // Production always logs in. Any other mode logs in only when no "login" middle file
                // has been recorded yet; the short-circuit keeps production from reading that file.
                bool loginNeeded = isProduce || this.RunPage.TryGetInfoFromMiddleFile("login", "login") == null;
                if (loginNeeded)
                {
                    LoginLinkedin.Login(this.RunPage, loginPageUrl, loginSucceedCheckUrl);

                    if (!isProduce)
                    {
                        // Record an (empty) marker so the next non-production run skips the login.
                        this.RunPage.SaveInfoToMiddleFile("login", "login", new List<string>());
                    }
                }

                this.RunPage.BeginGrab();

                return false;
            }
            catch (Exception cause)
            {
                throw new Exception("登录Linkedin失败! ", cause);
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Runs the whole pipeline for one seed row: collect the list pages, extract the person page
        /// URLs from them, download the person pages, parse them and export the result to an .xlsx file.
        /// </summary>
        /// <remarks>
        /// Every stage stores its result in a middle file named from the login name and key words.
        /// In production (<c>SysExecuteType.Produce</c>) every stage is recomputed. In any other mode a
        /// stage reuses its stored result when one is found; as soon as one stage had to be recomputed,
        /// every later stage is recomputed as well.
        /// </remarks>
        /// <param name="seedRow">Seed row from which key words, login name, password and seed URL are read.</param>
        private void GetOneKeyWordsRelatedInfos(Dictionary<string, string> seedRow)
        {
            string keyWords = this.GetKeyWords(seedRow);
            string loginName = this.GetLoginName(seedRow);
            string middleFilePrefix = "_" + loginName + "_" + keyWords;

            // Becomes true once a stage has been recomputed; later stages then ignore their stored results.
            bool isNewDo = false;

            // Stage 1: list page URLs.
            string localLogFileName = middleFilePrefix + "_listPageUrl";
            List<string> allListPageUrls = null;
            if (SysConfig.SysExecuteType != SysExecuteType.Produce)
            {
                // Outside production, try the list page URLs stored by an earlier run first.
                allListPageUrls = this.RunPage.TryGetInfoFromMiddleFile(localLogFileName, "listPageUrl");
            }
            if (allListPageUrls == null)
            {
                allListPageUrls = this.GetAllListPages(this.GetSeedPageUrl(seedRow), keyWords);
                this.RunPage.SaveInfoToMiddleFile(localLogFileName, "listPageUrl", allListPageUrls);
                isNewDo = true;
            }

            // Stage 2: person page URL/name pairs parsed from the list pages.
            localLogFileName = middleFilePrefix + "_personPageUrlInfo";
            List<Dictionary<string, string>> allPersonPageUrlInfos = null;
            if (SysConfig.SysExecuteType != SysExecuteType.Produce && !isNewDo)
            {
                allPersonPageUrlInfos = this.RunPage.TryGetInfoFromMiddleFile(localLogFileName, new string[] { "personUrl", "personName" });
            }
            if (allPersonPageUrlInfos == null)
            {
                allPersonPageUrlInfos = this.GetPersonPageUrlsFromListPages(this.RunPage.GetDetailSourceFileDir(), allListPageUrls);
                this.RunPage.SaveInfoToMiddleFile(localLogFileName, new string[] { "personUrl", "personName" }, allPersonPageUrlInfos);
                isNewDo = true;
            }

            LoginLinkedin.LoginByRandomUser(this.RunPage, this.LinkedinLoginPageUrl, this.LinkedinLoginSucceedCheckUrl);

            // Stage 3: downloaded person pages, each row flagged with whether it was just downloaded.
            // Fix: the stored result is now read from the same file name it is saved under. The lookup
            // previously used "<login>.<keyWords>.personPageUrl", which this method never writes.
            // Fix: the column list previously repeated "personUrl"; the second column is now "personName".
            // NOTE(review): assumes GetAllPersonPages rows carry a "personName" key - confirm. Middle
            // files written with the old column list should be deleted before re-running.
            localLogFileName = middleFilePrefix + "_personPageUrl";
            string[] personPageColumns = new string[] { "personUrl", "personName", "isJustDownload" };
            List<Dictionary<string, string>> allPersonPageInfosWithJustDownloadMark = null;
            if (SysConfig.SysExecuteType != SysExecuteType.Produce && !isNewDo)
            {
                allPersonPageInfosWithJustDownloadMark = this.RunPage.TryGetInfoFromMiddleFile(localLogFileName, personPageColumns);
            }
            if (allPersonPageInfosWithJustDownloadMark == null)
            {
                allPersonPageInfosWithJustDownloadMark = ProcessPersonPage.GetAllPersonPages(this.RunPage, allPersonPageUrlInfos, loginName, this.GetLoginPassword(seedRow));
                this.RunPage.SaveInfoToMiddleFile(localLogFileName, personPageColumns, allPersonPageInfosWithJustDownloadMark);
            }

            // Stage 4: parse the local person pages and export the result.
            List<Dictionary<string, string>> personInfoList = ProcessPersonPage.GetPersonInfoFromLocalPages(this.RunPage, allPersonPageInfosWithJustDownloadMark, true, keyWords);

            string personInfosResultFilePath = this.RunPage.GetFilePath("SearchResult_Google2Linkedin_" + loginName + "_" + keyWords + ".xlsx", this.RunPage.GetExportDir());

            ProcessPersonPage.SavePersonInfoToFile(this.RunPage, personInfoList, personInfosResultFilePath);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Runs after one seed page has been grabbed: fetches the Baidu page for the key words, collects
        /// the list pages, extracts and resolves the person page URLs, parses the person pages and
        /// exports the result to an .xlsx file.
        /// </summary>
        /// <remarks>
        /// Every stage stores its result in a middle file named from the key words. In production
        /// (<c>SysExecuteType.Produce</c>) every stage is recomputed. In any other mode a stage reuses its
        /// stored result when one is found; as soon as one stage had to be recomputed, every later stage
        /// is recomputed as well.
        /// Exceptions propagate unchanged. The previous <c>catch (Exception ex) { throw ex; }</c> wrapper
        /// was removed because it only reset the stack trace.
        /// </remarks>
        /// <param name="pageUrl">URL of the grabbed page; passed on to <c>GetPageFromBaidu</c>.</param>
        /// <param name="seedRow">Seed row from which key words, login name, password and seed URL are read.</param>
        /// <param name="needReGrab">Not used by this implementation.</param>
        /// <param name="existLocalFile">Not used by this implementation.</param>
        public override void AfterGrabOne(string pageUrl, Dictionary<string, string> seedRow, bool needReGrab, bool existLocalFile)
        {
            string keyWords = this.GetKeyWords(seedRow);

            this.GetPageFromBaidu(pageUrl, keyWords);

            // Becomes true once a stage has been recomputed; later stages then ignore their stored results.
            bool isNewDo = false;

            // Stage 1: list page URLs.
            string localLogFileName = "_" + keyWords + "_listPageUrl";
            List<string> allListPageUrls = null;
            if (SysConfig.SysExecuteType != SysExecuteType.Produce)
            {
                // Outside production, try the list page URLs stored by an earlier run first.
                allListPageUrls = this.RunPage.TryGetInfoFromMiddleFile(localLogFileName, "listPageUrl");
            }
            if (allListPageUrls == null)
            {
                allListPageUrls = this.GetAllListPages(this.GetSeedPageUrl(seedRow), keyWords);
                this.RunPage.SaveInfoToMiddleFile(localLogFileName, "listPageUrl", allListPageUrls);
                isNewDo = true;
            }

            // Stage 2: person page URL/name pairs parsed from the list pages.
            localLogFileName = "_" + keyWords + "_personPageUrlInfo";
            List<Dictionary<string, string>> allPersonPageUrlInfos = null;
            if (SysConfig.SysExecuteType != SysExecuteType.Produce && !isNewDo)
            {
                allPersonPageUrlInfos = this.RunPage.TryGetInfoFromMiddleFile(localLogFileName, new string[] { "personUrl", "personName" });
            }
            if (allPersonPageUrlInfos == null)
            {
                allPersonPageUrlInfos = this.GetPersonPageUrlsFromListPages(this.RunPage.GetDetailSourceFileDir(), allListPageUrls, keyWords);
                this.RunPage.SaveInfoToMiddleFile(localLogFileName, new string[] { "personUrl", "personName" }, allPersonPageUrlInfos);
                isNewDo = true;
            }

            LoginLinkedin.LoginByRandomUser(this.RunPage, this.LinkedinLoginPageUrl, this.LinkedinLoginSucceedCheckUrl);

            // Stage 3: person page URLs obtained while logged in.
            localLogFileName = "_" + keyWords + "_personPageUrl";
            List<string> allPersonPageUrls = null;
            if (SysConfig.SysExecuteType != SysExecuteType.Produce && !isNewDo)
            {
                allPersonPageUrls = this.RunPage.TryGetInfoFromMiddleFile(localLogFileName, "personUrl");
            }
            if (allPersonPageUrls == null)
            {
                allPersonPageUrls = ProcessPersonPage.GetAllPersonPageUrls(this.RunPage, allPersonPageUrlInfos, this.GetLoginName(seedRow), this.GetLoginPassword(seedRow));
                this.RunPage.SaveInfoToMiddleFile(localLogFileName, "personUrl", allPersonPageUrls);
            }

            // Stage 4: parse the local person pages and export the result.
            List<Dictionary<string, string>> personInfoList = ProcessPersonPage.GetPersonInfoFromLocalPages(this.RunPage, allPersonPageUrls, true, keyWords);

            string personInfosResultFilePath = this.RunPage.GetFilePath("_SearchResult_Baidu2Linkedin_" + keyWords + ".xlsx", this.RunPage.GetExportDir());

            ProcessPersonPage.SavePersonInfoToFile(this.RunPage, personInfoList, personInfosResultFilePath);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Finds people for one list row through the phone app, then fetches their person pages through
        /// the Linkedin web site and exports the parsed result to an .xlsx file.
        /// </summary>
        /// <remarks>
        /// The app search is skipped when its result is already stored in a middle file, and the web
        /// part is skipped when the exported .xlsx file already exists.
        /// </remarks>
        /// <param name="listRow">Row that must contain the keys "keyWords", "loginName" and "loginPassword".</param>
        /// <exception cref="Exception">
        /// Failures of the individual app steps (logout check, logout, login, search) and of the web
        /// part are wrapped in a new exception whose inner exception is the original failure.
        /// </exception>
        public override void GetDataByOtherAccessType(Dictionary<string, string> listRow)
        {
            string keyWords      = listRow["keyWords"];
            string loginName     = listRow["loginName"];
            string loginPassword = listRow["loginPassword"];

            string fileName = "_" + keyWords + "_" + loginName;
            List<Dictionary<string, string>> personInfos = this.RunPage.TryGetInfoFromMiddleFile(fileName, new string[] { "personName", "personUrl" });

            if (personInfos == null)
            {
                this.RunPage.InvokeAppendLogText("开始使用APP搜索相关人员, 关键词为'" + keyWords + "'", LogLevelType.System, true);

                AndroidAppAccess appAccess = null;

                // try/finally only: the former "catch (Exception ex) { throw ex; }" reset the stack trace
                // and added nothing, so exceptions now propagate untouched while the app connection is
                // still closed.
                try
                {
                    this.RunPage.InvokeAppendLogText("开始连接手机APP", LogLevelType.System, true);
                    appAccess = this.InitAppAccess();
                    this.RunPage.InvokeAppendLogText("连接手机APP成功", LogLevelType.System, true);

                    bool needLogout = false;

                    try
                    {
                        this.RunPage.InvokeAppendLogText("检测是否需要登出", LogLevelType.System, true);
                        needLogout = this.CheckNeedLogout(appAccess);
                    }
                    catch (Exception ex)
                    {
                        throw new Exception("检查是否需要登出出错.", ex);
                    }

                    if (needLogout)
                    {
                        try
                        {
                            this.RunPage.InvokeAppendLogText("开始登出", LogLevelType.System, true);
                            this.DoLogout(appAccess);
                            this.RunPage.InvokeAppendLogText("已经登出", LogLevelType.System, true);
                        }
                        catch (Exception ex)
                        {
                            throw new Exception("登出失败.", ex);
                        }
                    }

                    try
                    {
                        this.RunPage.InvokeAppendLogText("开始登录", LogLevelType.System, true);
                        this.DoLogin(appAccess, loginName, loginPassword);
                        this.RunPage.InvokeAppendLogText("已经登录", LogLevelType.System, true);
                    }
                    catch (Exception ex)
                    {
                        throw new Exception("登录失败.", ex);
                    }

                    try
                    {
                        personInfos = this.SearchPersonByKeyWord(appAccess, keyWords);
                    }
                    catch (Exception ex)
                    {
                        // Fix: keep the original failure as the inner exception (it used to be dropped).
                        throw new Exception("在app中根据关键字抓取出错, keyWords = " + keyWords, ex);
                    }

                    this.RunPage.SaveInfoToMiddleFile(fileName, new string[] { "personName", "personUrl" }, personInfos);

                    // Fix: the closing quote after the key words was missing from this log message.
                    this.RunPage.InvokeAppendLogText("完成使用手机APP搜索关键词'" + keyWords + "'", LogLevelType.System, true);
                }
                finally
                {
                    if (appAccess != null)
                    {
                        this.CloseAppAccess(appAccess);
                    }
                }
            }

            try
            {
                string checkedRelatedPersonInfosResultFilePath = this.RunPage.GetFilePath("_SearchResult_LinkedinApp_" + loginName + "_" + keyWords + ".xlsx", this.RunPage.GetExportDir());
                if (!File.Exists(checkedRelatedPersonInfosResultFilePath))
                {
                    this.RunPage.InvokeAppendLogText("登录Linkedin系统", LogLevelType.System, true);
                    LoginLinkedin.LoginByRandomUser(this.RunPage, this.LinkedinLoginPageUrl, this.LinkedinLoginSucceedCheckUrl);
                    this.RunPage.InvokeAppendLogText("已登录Linkedin系统", LogLevelType.System, true);

                    this.RunPage.InvokeAppendLogText("根据个人页面地址(人员列表是从手机APP搜索到的,关键词为'" + keyWords + "'),从Linkedin网页版获取个人信息", LogLevelType.System, true);
                    string personPageInfosWithJustDownloadMarkFileName = "_" + loginName + "_" + keyWords + "_personPageInfosWithJustDownloadMark";

                    // Reuse the person page list stored by an earlier run; download the pages otherwise.
                    List<Dictionary<string, string>> allPersonPageInfosWithJustDownloadMark = this.RunPage.TryGetInfoFromMiddleFile(personPageInfosWithJustDownloadMarkFileName, new string[] { "personUrl", "personName", "isJustDownload" });
                    if (allPersonPageInfosWithJustDownloadMark == null)
                    {
                        allPersonPageInfosWithJustDownloadMark = ProcessPersonPage.GetAllPersonPages(this.RunPage, personInfos, loginName, loginPassword);
                        this.RunPage.SaveInfoToMiddleFile(personPageInfosWithJustDownloadMarkFileName, new string[] { "personUrl", "personName", "isJustDownload" }, allPersonPageInfosWithJustDownloadMark);
                    }

                    // NOTE(review): this path is built from the same arguments as the one checked above,
                    // so this second existence check looks redundant unless the file can appear while the
                    // pages are being downloaded - confirm before simplifying.
                    string personInfosFilePath = this.RunPage.GetFilePath("_SearchResult_LinkedinApp_" + loginName + "_" + keyWords + ".xlsx", this.RunPage.GetExportDir());
                    if (!File.Exists(personInfosFilePath))
                    {
                        List<Dictionary<string, string>> personInfoList = ProcessPersonPage.GetPersonInfoFromLocalPages(this.RunPage, allPersonPageInfosWithJustDownloadMark, false, null);
                        ProcessPersonPage.SavePersonInfoToFile(this.RunPage, personInfoList, personInfosFilePath);
                    }
                    this.RunPage.InvokeAppendLogText("完成获取并处理个人页面(人员列表是从手机APP搜索到的,关键词为'" + keyWords + "')", LogLevelType.System, true);

                    // The recursive crawl of the "members who viewed this page also viewed" section used
                    // to live here as commented-out code and is currently disabled. The completion
                    // message below is still logged unchanged.
                    this.RunPage.InvokeAppendLogText("完成递归爬取到关键词'" + keyWords + "'相关的所有的'看过本页的会员还看了'", LogLevelType.System, true);
                }
            }
            catch (Exception ex)
            {
                throw new Exception("通过网页获取/处理个人信息出错", ex);
            }
        }