protected StatusQueue queueStatus; // Reference to the status (micro-blog post) queue

/// <summary>
/// Creates the crawler and fetches the API handle for the given robot type, then
/// raises the minimum request interval to the floor configured for that type.
/// </summary>
/// <param name="robotType">
/// Kind of data this robot crawls. Any value other than USER_INFO, USER_TAG,
/// STATUS or COMMENT is given the user-relation floor.
/// </param>
public RobotBase(SysArgFor robotType)
{
    crawler = new SinaMBCrawler(robotType);
    api = GlobalPool.GetAPI(robotType);

    // Select the configured lower bound of the request interval for this robot type.
    int iConfiguredFloor;
    switch (robotType)
    {
        case SysArgFor.USER_INFO:
            iConfiguredFloor = GlobalPool.MinSleepMsForUserInfo;
            break;
        case SysArgFor.USER_TAG:
            iConfiguredFloor = GlobalPool.MinSleepMsForUserTag;
            break;
        case SysArgFor.STATUS:
            iConfiguredFloor = GlobalPool.MinSleepMsForStatus;
            break;
        case SysArgFor.COMMENT:
            iConfiguredFloor = GlobalPool.MinSleepMsForComment;
            break;
        default:
            iConfiguredFloor = GlobalPool.MinSleepMsForUserRelation;
            break;
    }

    // Only ever raise the current minimum; a larger existing value is kept.
    if (iMinSleep < iConfiguredFloor)
    {
        iMinSleep = iConfiguredFloor;
    }

    AdjustRealFreq();
}
/// <summary>
/// Runs the user-information crawl loop. The user ID recorded by the previous run
/// (if any) is enqueued first; the method then waits until the user-info queue is
/// non-empty and crawls the user at the head of the queue over and over until
/// cancellation is requested.
/// </summary>
/// <remarks>
/// The head of the queue is only read (FirstValue) before crawling. RollQueue() is
/// called after the user has been stored successfully, so a forbidden or timed-out
/// request is retried for the same user on the next iteration.
/// </remarks>
public void Start()
{
    // Resume from the user ID recorded when the previous run stopped.
    long lLastUID = SysArg.GetCurrentID(SysArgFor.USER_INFO);
    if (lLastUID > 0)
    {
        queueUserForUserInfoRobot.Enqueue(lLastUID);
    }

    // Nothing to do until some robot feeds the queue.
    while (queueUserForUserInfoRobot.Count == 0)
    {
        if (blnAsyncCancelled)
        {
            return;
        }
        Thread.Sleep(GlobalPool.SleepMsForThread);
    }

    // NOTE(review): the sibling robots call AdjustRealFreq() at this point;
    // confirm that AdjustFreq() is intended here.
    AdjustFreq();
    SetCrawlerFreq();
    Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. "
        + api.ResetTimeInSeconds.ToString() + "s, "
        + api.RemainingIPHits.ToString() + " IP hits and "
        + api.RemainingUserHits.ToString() + " user hits left this hour.");

    User user;

    // Crawl the queue until cancelled.
    while (true)
    {
        if (blnAsyncCancelled)
        {
            return;
        }
        while (blnSuspending)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            Thread.Sleep(GlobalPool.SleepMsForThread);
        }

        // Peek at the head of the queue and persist it as the resume point.
        lCurrentID = queueUserForUserInfoRobot.FirstValue;
        Log("Recording current UserID: " + lCurrentID.ToString() + "...");
        SysArg.SetCurrentID(lCurrentID, SysArgFor.USER_INFO);

        #region User basic information
        if (blnAsyncCancelled)
        {
            return;
        }
        while (blnSuspending)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            Thread.Sleep(GlobalPool.SleepMsForThread);
        }

        Log("Crawling information of User " + lCurrentID.ToString() + "...");
        user = crawler.GetUserInfo(lCurrentID);

        AdjustFreq();
        SetCrawlerFreq();
        Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. "
            + api.ResetTimeInSeconds.ToString() + "s, "
            + api.RemainingIPHits.ToString() + " IP hits and "
            + api.RemainingUserHits.ToString() + " user hits left this hour.");

        if (user != null && user.user_id > 0)
        {
            // Insert the user when unknown to the database, otherwise refresh the stored row.
            if (!User.ExistInDB(lCurrentID))
            {
                Log("Saving User " + lCurrentID.ToString() + " into database...");
                user.Add();
            }
            else
            {
                Log("Updating the information of User " + lCurrentID.ToString() + "...");
                user.Update();
            }

            // A user that could be crawled must no longer be listed as invalid.
            if (InvalidUser.ExistInDB(lCurrentID))
            {
                Log("Removing User " + lCurrentID.ToString() + " from invalid users...");
                InvalidUser.RemoveFromDB(lCurrentID);
            }

            Log("The information of User " + lCurrentID.ToString() + " crawled.");
            queueUserForUserInfoRobot.RollQueue();
        }
        else if (user == null) // the user does not exist
        {
            Log("Recording invalid User " + lCurrentID.ToString() + "...");
            InvalidUser iu = new InvalidUser();
            iu.user_id = lCurrentID;
            iu.Add();

            // Drop the invalid user ID from every robot's queue.
            Log("Removing invalid User " + lCurrentID.ToString() + " from all queues...");
            queueUserForUserRelationRobot.Remove(lCurrentID);
            queueUserForUserInfoRobot.Remove(lCurrentID);
            if (GlobalPool.TagRobotEnabled)
            {
                queueUserForUserTagRobot.Remove(lCurrentID);
            }
            if (GlobalPool.StatusRobotEnabled)
            {
                queueUserForStatusRobot.Remove(lCurrentID);
            }
        }
        else if (user.user_id == -1) // forbidden
        {
            // Wait in one-second slices so that cancellation stays responsive.
            int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_INFO).ResetTimeInSeconds;
            Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
            for (int i = 0; i < iSleepSeconds; i++)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(1000);
            }
        }
        else if (user.user_id == -2) // timeout
        {
            // No wait here: the same user is still at the head of the queue and
            // is requested again on the next iteration.
            Log("Time out. I will crawl user " + lCurrentID.ToString() + " again...");
        }
        #endregion
    }
}
/// <summary>
/// Creates a crawler that uses the API instance GlobalPool provides for the given crawler type.
/// </summary>
/// <param name="crawlerType">Selects which API instance is fetched from GlobalPool.</param>
public SinaMBCrawler(SysArgFor crawlerType) { api = GlobalPool.GetAPI(crawlerType); }
/// <summary>
/// Runs the user-relation crawl loop, starting from the given user. For the user at
/// the head of the relation queue it crawls the followings and then the followers,
/// records the relationships, and feeds every discovered user ID into the queues of
/// the enabled robots. The loop runs until cancellation is requested.
/// </summary>
/// <param name="lStartUserID">
/// ID of the user to start from. The method returns immediately when it is 0.
/// </param>
/// <remarks>
/// A first list element of -1, or a relationship check result of -1, is treated as
/// "service forbidden": the robot waits for the reset time of the user-relation API
/// (the API this robot's own requests are counted against) and then retries.
/// </remarks>
public void Start(long lStartUserID)
{
    if (lStartUserID == 0)
    {
        return;
    }

    AdjustRealFreq();
    SetCrawlerFreq();
    Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. "
        + api.ResetTimeInSeconds.ToString() + "s, "
        + api.RemainingIPHits.ToString() + " IP hits and "
        + api.RemainingUserHits.ToString() + " user hits left this hour.");

    // Seed every enabled robot's queue with the start user.
    queueUserForUserRelationRobot.Enqueue(lStartUserID);
    if (GlobalPool.UserInfoRobotEnabled)
    {
        queueUserForUserInfoRobot.Enqueue(lStartUserID);
    }
    if (GlobalPool.TagRobotEnabled)
    {
        queueUserForUserTagRobot.Enqueue(lStartUserID);
    }
    if (GlobalPool.StatusRobotEnabled)
    {
        queueUserForStatusRobot.Enqueue(lStartUserID);
    }
    lCurrentID = lStartUserID;

    // Crawl the queue until paused or cancelled.
    while (true)
    {
        if (blnAsyncCancelled)
        {
            return;
        }
        while (blnSuspending)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            Thread.Sleep(GlobalPool.SleepMsForThread);
        }

        // Peek at the head of the queue and persist it as the resume point.
        // RollQueue() is only called once both lists have been processed.
        lCurrentID = queueUserForUserRelationRobot.FirstValue;
        Log("Recording current UserID:" + lCurrentID.ToString() + "...");
        SysArg.SetCurrentID(lCurrentID, SysArgFor.USER_RELATION);

        #region Followings of the current user
        if (blnAsyncCancelled)
        {
            return;
        }
        while (blnSuspending)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            Thread.Sleep(GlobalPool.SleepMsForThread);
        }

        Log("Crawling the followings of User " + lCurrentID.ToString() + "...");
        LinkedList<long> lstBuffer = crawler.GetFriendsOf(lCurrentID, -1);
        if (lstBuffer.Count > 0 && lstBuffer.First.Value == -1)
        {
            // Forbidden: wait for this robot's own API to reset, then retry the same user.
            int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_RELATION).ResetTimeInSeconds;
            Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
            for (int i = 0; i < iSleepSeconds; i++)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(1000);
            }
            continue;
        }

        Log(lstBuffer.Count.ToString() + " followings crawled.");

        AdjustFreq();
        SetCrawlerFreq();
        Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. "
            + api.ResetTimeInSeconds.ToString() + "s, "
            + api.RemainingIPHits.ToString() + " IP hits and "
            + api.RemainingUserHits.ToString() + " user hits left this hour.");

        while (lstBuffer.Count > 0)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            while (blnSuspending)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }

            lQueueBufferFirst = lstBuffer.First.Value;

            // 1 = record the relationship; it stays 1 when confirmation is switched off.
            int nRecordRelation = 1;
            if (blnConfirmRelationship)
            {
                Log("Confirming the relationship between User " + lCurrentID.ToString() + " and User " + lQueueBufferFirst.ToString());
                nRecordRelation = crawler.RelationExistBetween(lCurrentID, lQueueBufferFirst);

                AdjustFreq();
                SetCrawlerFreq();
                Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. "
                    + api.ResetTimeInSeconds.ToString() + "s, "
                    + api.RemainingIPHits.ToString() + " IP hits and "
                    + api.RemainingUserHits.ToString() + " user hits left this hour.");

                if (nRecordRelation == -1)
                {
                    // Forbidden: the element stays at the head of the buffer, so the
                    // 'continue' below retries the same pair after the wait.
                    int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_RELATION).ResetTimeInSeconds;
                    Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
                    for (int i = 0; i < iSleepSeconds; i++)
                    {
                        if (blnAsyncCancelled)
                        {
                            return;
                        }
                        Thread.Sleep(1000);
                    }
                    continue;
                }

                if (nRecordRelation == 1)
                {
                    Log("Relationship confirmed. Recording User " + lCurrentID.ToString() + " follows User " + lQueueBufferFirst.ToString() + "...");
                }
                else
                {
                    Log("Relationship not exists. Recording invalid relationship...");
                    InvalidRelation ir = new InvalidRelation();
                    ir.source_user_id = lCurrentID;
                    ir.target_user_id = lQueueBufferFirst;
                    ir.Add();

                    Log("Recording invalid User " + lQueueBufferFirst.ToString() + "...");
                    InvalidUser iu = new InvalidUser();
                    iu.user_id = lQueueBufferFirst;
                    iu.Add();

                    // Drop the invalid user ID from every robot's queue.
                    Log("Removing invalid User " + lQueueBufferFirst.ToString() + " from all queues...");
                    queueUserForUserRelationRobot.Remove(lQueueBufferFirst);
                    if (GlobalPool.UserInfoRobotEnabled)
                    {
                        queueUserForUserInfoRobot.Remove(lQueueBufferFirst);
                    }
                    if (GlobalPool.TagRobotEnabled)
                    {
                        queueUserForUserTagRobot.Remove(lQueueBufferFirst);
                    }
                    if (GlobalPool.StatusRobotEnabled)
                    {
                        queueUserForStatusRobot.Remove(lQueueBufferFirst);
                    }
                }
            }
            else
            {
                Log("Recording User " + lCurrentID.ToString() + " follows User " + lQueueBufferFirst.ToString() + "...");
            }

            if (nRecordRelation == 1)
            {
                if (UserRelation.RelationshipExist(lCurrentID, lQueueBufferFirst))
                {
                    Log("Relationship exists.");
                }
                else
                {
                    UserRelation ur = new UserRelation();
                    ur.source_user_id = lCurrentID;
                    ur.target_user_id = lQueueBufferFirst;
                    ur.Add();
                }

                // Feed the discovered user to every enabled robot; only log when Enqueue reports true.
                if (queueUserForUserRelationRobot.Enqueue(lQueueBufferFirst))
                {
                    Log("Adding User " + lQueueBufferFirst.ToString() + " to the user queue of User Relation Robot...");
                }
                if (GlobalPool.UserInfoRobotEnabled && queueUserForUserInfoRobot.Enqueue(lQueueBufferFirst))
                {
                    Log("Adding User " + lQueueBufferFirst.ToString() + " to the user queue of User Information Robot...");
                }
                if (GlobalPool.TagRobotEnabled && queueUserForUserTagRobot.Enqueue(lQueueBufferFirst))
                {
                    Log("Adding User " + lQueueBufferFirst.ToString() + " to the user queue of User Tag Robot...");
                }
                if (GlobalPool.StatusRobotEnabled && queueUserForStatusRobot.Enqueue(lQueueBufferFirst))
                {
                    Log("Adding User " + lQueueBufferFirst.ToString() + " to the user queue of Status Robot...");
                }
            }

            lstBuffer.RemoveFirst();
        } // while (lstBuffer.Count > 0)
        #endregion

        #region Followers of the current user
        if (blnAsyncCancelled)
        {
            return;
        }
        while (blnSuspending)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            Thread.Sleep(GlobalPool.SleepMsForThread);
        }

        Log("Crawling the followers of User " + lCurrentID.ToString() + "...");
        lstBuffer = crawler.GetFollowersOf(lCurrentID, -1);
        if (lstBuffer.Count > 0 && lstBuffer.First.Value == -1)
        {
            // Forbidden: wait, then restart the outer loop for the same user
            // (the queue has not been rolled yet, so the followings are crawled again too).
            int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_RELATION).ResetTimeInSeconds;
            Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
            for (int i = 0; i < iSleepSeconds; i++)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(1000);
            }
            continue;
        }

        Log(lstBuffer.Count.ToString() + " followers crawled.");

        AdjustFreq();
        SetCrawlerFreq();
        Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. "
            + api.ResetTimeInSeconds.ToString() + "s, "
            + api.RemainingIPHits.ToString() + " IP hits and "
            + api.RemainingUserHits.ToString() + " user hits left this hour.");

        while (lstBuffer.Count > 0)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            while (blnSuspending)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }

            lQueueBufferFirst = lstBuffer.First.Value;

            // Same protocol as above, with the direction reversed:
            // the buffered user is the source, the current user is the target.
            int nRecordRelation = 1;
            if (blnConfirmRelationship)
            {
                Log("Confirming the relationship between User " + lQueueBufferFirst.ToString() + " and User " + lCurrentID.ToString());
                nRecordRelation = crawler.RelationExistBetween(lQueueBufferFirst, lCurrentID);

                AdjustFreq();
                SetCrawlerFreq();
                Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. "
                    + api.ResetTimeInSeconds.ToString() + "s, "
                    + api.RemainingIPHits.ToString() + " IP hits and "
                    + api.RemainingUserHits.ToString() + " user hits left this hour.");

                if (nRecordRelation == -1)
                {
                    int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_RELATION).ResetTimeInSeconds;
                    Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
                    for (int i = 0; i < iSleepSeconds; i++)
                    {
                        if (blnAsyncCancelled)
                        {
                            return;
                        }
                        Thread.Sleep(1000);
                    }
                    continue;
                }

                if (nRecordRelation == 1)
                {
                    Log("Relationship confirmed. Recording User " + lQueueBufferFirst.ToString() + " follows User " + lCurrentID.ToString() + "...");
                }
                else
                {
                    Log("Relationship not exists. Recording invalid relationship...");
                    InvalidRelation ir = new InvalidRelation();
                    ir.source_user_id = lQueueBufferFirst;
                    ir.target_user_id = lCurrentID;
                    ir.Add();

                    Log("Recording invalid User " + lQueueBufferFirst.ToString() + "...");
                    InvalidUser iu = new InvalidUser();
                    iu.user_id = lQueueBufferFirst;
                    iu.Add();

                    // Drop the invalid user ID from every robot's queue.
                    Log("Removing invalid User " + lQueueBufferFirst.ToString() + " from all queues...");
                    queueUserForUserRelationRobot.Remove(lQueueBufferFirst);
                    if (GlobalPool.UserInfoRobotEnabled)
                    {
                        queueUserForUserInfoRobot.Remove(lQueueBufferFirst);
                    }
                    if (GlobalPool.TagRobotEnabled)
                    {
                        queueUserForUserTagRobot.Remove(lQueueBufferFirst);
                    }
                    if (GlobalPool.StatusRobotEnabled)
                    {
                        queueUserForStatusRobot.Remove(lQueueBufferFirst);
                    }
                }
            }
            else
            {
                Log("Recording User " + lQueueBufferFirst.ToString() + " follows User " + lCurrentID.ToString() + "...");
            }

            if (nRecordRelation == 1)
            {
                if (UserRelation.RelationshipExist(lQueueBufferFirst, lCurrentID))
                {
                    Log("Relationship exists.");
                }
                else
                {
                    UserRelation ur = new UserRelation();
                    ur.source_user_id = lQueueBufferFirst;
                    ur.target_user_id = lCurrentID;
                    ur.Add();
                }

                if (queueUserForUserRelationRobot.Enqueue(lQueueBufferFirst))
                {
                    Log("Adding User " + lQueueBufferFirst.ToString() + " to the user queue of User Relation Robot...");
                }
                if (GlobalPool.UserInfoRobotEnabled && queueUserForUserInfoRobot.Enqueue(lQueueBufferFirst))
                {
                    Log("Adding User " + lQueueBufferFirst.ToString() + " to the user queue of User Information Robot...");
                }
                if (GlobalPool.TagRobotEnabled && queueUserForUserTagRobot.Enqueue(lQueueBufferFirst))
                {
                    Log("Adding User " + lQueueBufferFirst.ToString() + " to the user queue of User Tag Robot...");
                }
                if (GlobalPool.StatusRobotEnabled && queueUserForStatusRobot.Enqueue(lQueueBufferFirst))
                {
                    Log("Adding User " + lQueueBufferFirst.ToString() + " to the user queue of Status Robot...");
                }
            }

            lstBuffer.RemoveFirst();
        } // while (lstBuffer.Count > 0)
        #endregion

        // Both lists are done: advance the queue past the current user.
        queueUserForUserRelationRobot.RollQueue();
        Log("Social grapgh of User " + lCurrentID.ToString() + " crawled.");
    }
}
/// <summary>
/// Runs the user-tag crawl loop. The user ID recorded by the previous run (if any)
/// is enqueued first; the method then waits until the tag queue is non-empty and
/// crawls the tags of the user at the head of the queue until cancellation is requested.
/// </summary>
/// <remarks>
/// The first element of the crawled list decides how the result is handled:
/// a tag_id greater than 0 means real tags, -1 means the service is forbidden, and
/// -2 means an error whose message is carried in the element's <c>tag</c> field.
/// In the last two cases the robot waits for the reset time of the user-tag API
/// (the API this robot's own requests are counted against) and retries the same user.
/// </remarks>
public void Start()
{
    // Resume from the user ID recorded when the previous run stopped.
    long lLastUID = SysArg.GetCurrentID(SysArgFor.USER_TAG);
    if (lLastUID > 0)
    {
        queueUserForUserTagRobot.Enqueue(lLastUID);
    }

    // Nothing to do until some robot feeds the queue.
    while (queueUserForUserTagRobot.Count == 0)
    {
        if (blnAsyncCancelled)
        {
            return;
        }
        Thread.Sleep(GlobalPool.SleepMsForThread);
    }

    AdjustRealFreq();
    SetCrawlerFreq();
    Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. "
        + api.ResetTimeInSeconds.ToString() + "s, "
        + api.RemainingIPHits.ToString() + " IP hits and "
        + api.RemainingUserHits.ToString() + " user hits left this hour.");

    // Crawl the queue until paused or cancelled.
    while (true)
    {
        if (blnAsyncCancelled)
        {
            return;
        }
        while (blnSuspending)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            Thread.Sleep(GlobalPool.SleepMsForThread);
        }

        // Peek at the head of the queue and persist it as the resume point.
        lCurrentID = queueUserForUserTagRobot.FirstValue;
        Log("Recording current UserID: " + lCurrentID.ToString() + "...");
        SysArg.SetCurrentID(lCurrentID, SysArgFor.USER_TAG);

        #region User tags
        if (blnAsyncCancelled)
        {
            return;
        }
        while (blnSuspending)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            Thread.Sleep(GlobalPool.SleepMsForThread);
        }

        Log("Crawling tags of User " + lCurrentID.ToString() + "...");
        LinkedList<Tag> lstTag = crawler.GetTagsOf(lCurrentID);

        if (lstTag.Count > 0 && lstTag.First.Value.tag_id > 0)
        {
            Log(lstTag.Count.ToString() + " tags crawled.");

            AdjustFreq();
            SetCrawlerFreq();
            Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. "
                + api.ResetTimeInSeconds.ToString() + "s, "
                + api.RemainingIPHits.ToString() + " IP hits and "
                + api.RemainingUserHits.ToString() + " user hits left this hour.");

            while (lstTag.Count > 0)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                while (blnSuspending)
                {
                    if (blnAsyncCancelled)
                    {
                        return;
                    }
                    Thread.Sleep(GlobalPool.SleepMsForThread);
                }

                Tag tag = lstTag.First.Value;

                // Insert the tag when unknown to the database, otherwise refresh it.
                if (!Tag.Exists(tag.tag_id))
                {
                    Log("Saving Tag " + tag.tag_id.ToString() + " into database...");
                    tag.Add();
                }
                else
                {
                    Log("Updating Tag " + tag.tag_id.ToString() + " into database...");
                    tag.Update();
                }

                // Link the tag to the current user unless the link is already stored.
                if (!UserTag.Exists(lCurrentID, tag.tag_id))
                {
                    Log("Recording User " + lCurrentID.ToString() + " has Tag " + tag.tag_id.ToString() + "...");
                    UserTag user_tag = new UserTag();
                    user_tag.user_id = lCurrentID;
                    user_tag.tag_id = tag.tag_id;
                    user_tag.Add();
                }
                else
                {
                    Log("Tag " + tag.tag_id.ToString() + " of User " + lCurrentID.ToString() + " exists.");
                }

                lstTag.RemoveFirst();
            }

            queueUserForUserTagRobot.RollQueue();
            Log("Tags of User " + lCurrentID.ToString() + " crawled.");
        }
        else if (lstTag.Count > 0 && lstTag.First.Value.tag_id == -1)
        {
            // Forbidden: wait for this robot's own API to reset, then retry the same user.
            lstTag.Clear();
            int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_TAG).ResetTimeInSeconds;
            Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
            for (int i = 0; i < iSleepSeconds; i++)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(1000);
            }
            continue;
        }
        else if (lstTag.Count > 0 && lstTag.First.Value.tag_id == -2)
        {
            // Error: the message must be read before the list is cleared.
            int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_TAG).ResetTimeInSeconds;
            Log("Error! The error message is \"" + lstTag.First.Value.tag + "\". I will wait for " + iSleepSeconds.ToString() + "s to continue...");
            lstTag.Clear();
            for (int i = 0; i < iSleepSeconds; i++)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(1000);
            }
            continue;
        }
        else
        {
            // Empty list (or an unrecognised marker): nothing to store, move on.
            queueUserForUserTagRobot.RollQueue();
            Log("Tags of User " + lCurrentID.ToString() + " crawled.");
        }
        #endregion
    }
}
/// <summary>
/// Runs the comment crawl loop. The status ID recorded by the previous run (if any)
/// is enqueued first; the method then waits until the status queue is non-empty and,
/// for the status at the head of the queue, fetches its comments page by page, stores
/// them, and feeds every commenter into the queues of the enabled robots. The loop
/// runs until cancellation is requested.
/// </summary>
/// <remarks>
/// A comment whose comment_id is not greater than 0 is treated as the "service
/// forbidden" marker. The robot then waits for the reset time of the comment API
/// (the API this robot's own requests are counted against), stops paging at once,
/// and restarts the same status from the first page.
/// </remarks>
public void Start()
{
    // Resume from the status ID recorded when the previous run stopped.
    long lLastStatusID = SysArg.GetCurrentID(SysArgFor.COMMENT);
    if (lLastStatusID > 0)
    {
        queueStatus.Enqueue(lLastStatusID);
    }

    // Nothing to do until some robot feeds the queue.
    while (queueStatus.Count == 0)
    {
        if (blnAsyncCancelled)
        {
            return;
        }
        Thread.Sleep(GlobalPool.SleepMsForThread);
    }

    AdjustRealFreq();
    SetCrawlerFreq();
    Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. "
        + api.ResetTimeInSeconds.ToString() + "s, "
        + api.RemainingIPHits.ToString() + " IP hits and "
        + api.RemainingUserHits.ToString() + " user hits left this hour.");

    // Crawl the queue until paused or cancelled.
    while (true)
    {
        bool blnForbidden = false;

        if (blnAsyncCancelled)
        {
            return;
        }
        while (blnSuspending)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            Thread.Sleep(GlobalPool.SleepMsForThread);
        }

        // Peek at the head of the queue and persist it as the resume point.
        lCurrentID = queueStatus.FirstValue;
        Log("Recording current StatusID: " + lCurrentID.ToString() + "...");
        SysArg.SetCurrentID(lCurrentID, SysArgFor.COMMENT);

        #region Comments of the current status
        if (blnAsyncCancelled)
        {
            return;
        }
        while (blnSuspending)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            Thread.Sleep(GlobalPool.SleepMsForThread);
        }

        Log("Crawling the comments of Status " + lCurrentID.ToString() + "...");

        // Fetch page by page; lstComment accumulates the comments of all pages.
        int iPage = 1;
        LinkedList<Comment> lstComment = new LinkedList<Comment>();
        LinkedList<Comment> lstTemp = crawler.GetCommentsOf(lCurrentID, iPage);

        AdjustFreq();
        SetCrawlerFreq();
        Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. "
            + api.ResetTimeInSeconds.ToString() + "s, "
            + api.RemainingIPHits.ToString() + " IP hits and "
            + api.RemainingUserHits.ToString() + " user hits left this hour.");

        while (lstTemp.Count > 0)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            while (blnSuspending)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }

            // Move the page's comments into the accumulated list.
            while (lstTemp.Count > 0)
            {
                if (lstTemp.First.Value.comment_id > 0)
                {
                    lstComment.AddLast(lstTemp.First.Value);
                    lstTemp.RemoveFirst();
                }
                else
                {
                    // Forbidden marker: discard the page and wait for the API to reset.
                    // Clearing the list also ends this inner loop.
                    blnForbidden = true;
                    lstTemp.Clear();
                    int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.COMMENT).ResetTimeInSeconds;
                    Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
                    for (int i = 0; i < iSleepSeconds; i++)
                    {
                        if (blnAsyncCancelled)
                        {
                            return;
                        }
                        Thread.Sleep(1000);
                    }
                }
            }

            // After a forbidden response the status is restarted from page 1 below,
            // so do not spend further requests on the remaining pages.
            if (blnForbidden)
            {
                break;
            }

            iPage++;
            lstTemp = crawler.GetCommentsOf(lCurrentID, iPage);

            AdjustFreq();
            SetCrawlerFreq();
            Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. "
                + api.ResetTimeInSeconds.ToString() + "s, "
                + api.RemainingIPHits.ToString() + " IP hits and "
                + api.RemainingUserHits.ToString() + " user hits left this hour.");
        }

        // The queue has not been rolled, so the same status is crawled again.
        if (blnForbidden)
        {
            continue;
        }

        Log(lstComment.Count.ToString() + " comments of Status " + lCurrentID.ToString() + " crawled.");

        Comment comment;
        while (lstComment.Count > 0)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            while (blnSuspending)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }

            comment = lstComment.First.Value;
            if (!Comment.Exists(comment.comment_id))
            {
                Log("Saving Comment " + comment.comment_id.ToString() + " into database...");
                comment.Add();
            }

            // Feed the commenter to every enabled robot; only log when Enqueue reports true.
            if (queueUserForUserRelationRobot.Enqueue(comment.user.user_id))
            {
                Log("Adding Commenter " + comment.user.user_id.ToString() + " to the user queue of User Relation Robot...");
            }
            if (GlobalPool.UserInfoRobotEnabled && queueUserForUserInfoRobot.Enqueue(comment.user.user_id))
            {
                Log("Adding Commenter " + comment.user.user_id.ToString() + " to the user queue of User Information Robot...");
            }
            if (GlobalPool.TagRobotEnabled && queueUserForUserTagRobot.Enqueue(comment.user.user_id))
            {
                Log("Adding Commenter " + comment.user.user_id.ToString() + " to the user queue of User Tag Robot...");
            }
            if (GlobalPool.StatusRobotEnabled && queueUserForStatusRobot.Enqueue(comment.user.user_id))
            {
                Log("Adding Commenter " + comment.user.user_id.ToString() + " to the user queue of Status Robot...");
            }

            if (!User.ExistInDB(comment.user.user_id))
            {
                Log("Saving Commenter " + comment.user.user_id.ToString() + " into database...");
                comment.user.Add();
            }

            lstComment.RemoveFirst();

            // A replied-to comment is appended so that it is processed by this same loop.
            if (comment.reply_comment != null)
            {
                lstComment.AddLast(comment.reply_comment);
            }
        } // while (lstComment.Count > 0)

        queueStatus.RollQueue();
        Log("Comments of Status " + lCurrentID.ToString() + " crawled.");
        #endregion
    }
}
/// <summary>
/// Runs the status crawl loop. The user ID recorded by the previous run (if any) is
/// enqueued first; the method then waits until the status-robot queue is non-empty
/// and, for the user at the head of the queue, crawls and saves the statuses that are
/// newer than the latest one stored in the database. The loop runs until cancellation
/// is requested.
/// </summary>
public void Start()
{
    // Resume from the user ID recorded when the previous run stopped.
    long lResumeUID = SysArg.GetCurrentID(SysArgFor.STATUS);
    if (lResumeUID > 0)
    {
        queueUserForStatusRobot.Enqueue(lResumeUID);
    }

    // Nothing to do until some robot feeds the queue.
    while (queueUserForStatusRobot.Count == 0)
    {
        if (blnAsyncCancelled)
        {
            return;
        }
        Thread.Sleep(GlobalPool.SleepMsForThread);
    }

    AdjustRealFreq();
    SetCrawlerFreq();
    Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. "
        + api.ResetTimeInSeconds.ToString() + "s, "
        + api.RemainingIPHits.ToString() + " IP hits and "
        + api.RemainingUserHits.ToString() + " user hits left this hour.");

    // Crawl the queue until paused or cancelled.
    while (true)
    {
        if (blnAsyncCancelled)
        {
            return;
        }
        while (blnSuspending)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            Thread.Sleep(GlobalPool.SleepMsForThread);
        }

        // Peek at the head of the queue and persist it as the resume point.
        lCurrentID = queueUserForStatusRobot.FirstValue;
        Log("Recording current UserID: " + lCurrentID.ToString() + "...");
        SysArg.SetCurrentID(lCurrentID, SysArgFor.STATUS);

        #region Statuses of the current user
        if (blnAsyncCancelled)
        {
            return;
        }
        while (blnSuspending)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            Thread.Sleep(GlobalPool.SleepMsForThread);
        }

        // ID of the newest status of this user already stored in the database.
        Log("Getting the latest Status ID of User " + lCurrentID.ToString() + "...");
        long lNewestStoredSID = Status.GetLastStatusIDOf(lCurrentID);

        if (blnAsyncCancelled)
        {
            return;
        }
        while (blnSuspending)
        {
            if (blnAsyncCancelled)
            {
                return;
            }
            Thread.Sleep(GlobalPool.SleepMsForThread);
        }

        #region Statuses after the stored one
        Log("Crawling statuses after Status " + lNewestStoredSID.ToString() + " of User " + lCurrentID.ToString() + "...");
        LinkedList<Status> lstFetched = crawler.GetStatusesOfSince(lCurrentID, lNewestStoredSID);

        // A first element with status_id -1 means the service is forbidden:
        // wait for the API to reset and retry the same user.
        if (lstFetched.Count > 0 && lstFetched.First.Value.status_id == -1)
        {
            lstFetched.Clear();
            int nWaitSeconds = GlobalPool.GetAPI(SysArgFor.STATUS).ResetTimeInSeconds;
            Log("Service is forbidden now. I will wait for " + nWaitSeconds.ToString() + "s to continue...");
            for (int nElapsed = 0; nElapsed < nWaitSeconds; nElapsed++)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(1000);
            }
            continue;
        }

        // Real statuses: save them one by one, draining the list.
        if (lstFetched.Count > 0 && lstFetched.First.Value.status_id > 0)
        {
            Log(lstFetched.Count.ToString() + " statuses crawled.");

            AdjustFreq();
            SetCrawlerFreq();
            Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. "
                + api.ResetTimeInSeconds.ToString() + "s, "
                + api.RemainingIPHits.ToString() + " IP hits and "
                + api.RemainingUserHits.ToString() + " user hits left this hour.");

            while (lstFetched.Count > 0)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                while (blnSuspending)
                {
                    if (blnAsyncCancelled)
                    {
                        return;
                    }
                    Thread.Sleep(GlobalPool.SleepMsForThread);
                }

                SaveStatus(lstFetched.First.Value);
                lstFetched.RemoveFirst();
            }
        }

        // Reached both after saving and when there was nothing to save.
        queueUserForStatusRobot.RollQueue();
        Log("Statuses of User " + lCurrentID.ToString() + " crawled.");
        #endregion
        #endregion
    }
}