Ejemplo n.º 1
0
        /// <summary>
        /// Runs the user-information crawl loop: resumes from the user ID recorded
        /// by the previous run (if any), waits until the user queue has an entry,
        /// then keeps crawling the user at the head of the queue.
        /// </summary>
        /// <remarks>
        /// The method only returns once <c>blnAsyncCancelled</c> is observed set. While
        /// <c>blnSuspending</c> is set it idles in <c>GlobalPool.SleepMsForThread</c> steps.
        /// </remarks>
        public void Start()
        {
            // Re-enqueue the user ID at which the previous run stopped, if one was recorded.
            long lLastUID = SysArg.GetCurrentID(SysArgFor.USER_INFO);
            if (lLastUID > 0) queueUserForUserInfoRobot.Enqueue(lLastUID);

            // Nothing to crawl yet: wait until the queue receives an entry.
            while (queueUserForUserInfoRobot.Count == 0)
            {
                if (blnAsyncCancelled) return;
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }

            AdjustFreq();
            SetCrawlerFreq();
            Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

            User user;
            // Crawl the queue until cancelled.
            while (true)
            {
                if (blnAsyncCancelled) return;
                while (blnSuspending)
                {
                    if (blnAsyncCancelled) return;
                    Thread.Sleep(GlobalPool.SleepMsForThread);
                }

                // Removing an invalid user (see below) can leave the queue empty;
                // wait for a new entry instead of reading the head of an empty queue.
                while (queueUserForUserInfoRobot.Count == 0)
                {
                    if (blnAsyncCancelled) return;
                    Thread.Sleep(GlobalPool.SleepMsForThread);
                }

                // Read the head without rolling the queue: RollQueue() is only called
                // after a successful crawl, so a forbidden/timed-out request retries
                // the same user on the next iteration.
                lCurrentID = queueUserForUserInfoRobot.FirstValue;

                // Persist the current position so a later run can resume from it.
                Log("Recording current UserID: " + lCurrentID.ToString()+"...");
                SysArg.SetCurrentID(lCurrentID, SysArgFor.USER_INFO);

                #region Basic user information
                if (blnAsyncCancelled) return;
                while (blnSuspending)
                {
                    if (blnAsyncCancelled) return;
                    Thread.Sleep(GlobalPool.SleepMsForThread);
                }

                Log("Crawling information of User " + lCurrentID.ToString() + "...");
                user = crawler.GetUserInfo(lCurrentID);

                // Re-tune the request interval after every API call.
                AdjustFreq();
                SetCrawlerFreq();
                Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

                // NOTE(review): a non-null user whose user_id is neither positive, -1
                // nor -2 matches no branch below, so the same ID is retried on the
                // next iteration - confirm that this is intended.
                if (user != null && user.user_id > 0)
                {
                    // Insert the user if the database does not know it yet, otherwise update it.
                    if (!User.ExistInDB(lCurrentID))
                    {
                        Log("Saving User " + lCurrentID.ToString() + " into database...");
                        user.Add();
                    }
                    else
                    {
                        Log("Updating the information of User " + lCurrentID.ToString() + "...");
                        user.Update();
                    }

                    // The user was crawled successfully, so it no longer counts as invalid.
                    if (InvalidUser.ExistInDB(lCurrentID))
                    {
                        Log("Removing User " + lCurrentID.ToString() + " from invalid users...");
                        InvalidUser.RemoveFromDB(lCurrentID);
                    }

                    Log( "The information of User " + lCurrentID.ToString() + " crawled." );
                    queueUserForUserInfoRobot.RollQueue();
                }
                else if (user == null)   // user does not exist
                {
                    Log("Recording invalid User " + lCurrentID.ToString() + "...");
                    InvalidUser iu = new InvalidUser();
                    iu.user_id = lCurrentID;
                    iu.Add();

                    // Drop the invalid user ID from every robot queue.
                    Log("Removing invalid User " + lCurrentID.ToString() + " from all queues...");
                    queueUserForUserRelationRobot.Remove(lCurrentID);
                    queueUserForUserInfoRobot.Remove(lCurrentID);
                    if (GlobalPool.TagRobotEnabled)
                        queueUserForUserTagRobot.Remove(lCurrentID);
                    if (GlobalPool.StatusRobotEnabled)
                        queueUserForStatusRobot.Remove(lCurrentID);
                }
                else if (user.user_id == -1)   // forbidden
                {
                    // Sleep in one-second slices so cancellation stays responsive.
                    int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_INFO).ResetTimeInSeconds;
                    Log("Service is forbidden now. I will wait for " + iSleepSeconds .ToString()+ "s to continue...");
                    for (int i = 0; i < iSleepSeconds; i++)
                    {
                        if (blnAsyncCancelled) return;
                        Thread.Sleep(1000);
                    }
                }
                else if (user.user_id == -2)   // timeout
                {
                    // No wait here: the queue was not rolled, so the next iteration
                    // simply requests the same user again.
                    Log("Time out. I will crawl user "+lCurrentID.ToString()+" again...");
                }
                #endregion
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Runs the user-relation crawl loop, seeded with the given user ID. For the
        /// user at the head of the relation queue it crawls the followings and then
        /// the followers, records each relation and enqueues the users found.
        /// </summary>
        /// <param name="lStartUserID">
        /// User ID to seed the enabled robot queues with; the method returns
        /// immediately when it is 0.
        /// </param>
        /// <remarks>
        /// Apart from the 0 check, the method only returns once <c>blnAsyncCancelled</c>
        /// is observed set. While <c>blnSuspending</c> is set it idles.
        /// </remarks>
        public void Start(long lStartUserID)
        {
            if (lStartUserID == 0) return;
            AdjustRealFreq();
            SetCrawlerFreq();
            Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and "+api.RemainingUserHits.ToString()+" user hits left this hour.");

            // Seed the queue of every enabled robot with the starting user ID.
            queueUserForUserRelationRobot.Enqueue(lStartUserID);
            if (GlobalPool.UserInfoRobotEnabled)
                queueUserForUserInfoRobot.Enqueue(lStartUserID);
            if (GlobalPool.TagRobotEnabled)
                queueUserForUserTagRobot.Enqueue(lStartUserID);
            if (GlobalPool.StatusRobotEnabled)
                queueUserForStatusRobot.Enqueue(lStartUserID);
            lCurrentID = lStartUserID;

            // Crawl the queue indefinitely until the operation is suspended or cancelled.
            while (true)
            {
                if (!PauseWhileSuspended()) return;

                // Read the head without rolling the queue; RollQueue() is only called
                // once both relation lists of this user have been processed.
                lCurrentID = queueUserForUserRelationRobot.FirstValue;

                // Persist the current position.
                Log("Recording current UserID:" + lCurrentID.ToString() + "...");
                SysArg.SetCurrentID(lCurrentID, SysArgFor.USER_RELATION);

                #region Followings of the current user
                if (!PauseWhileSuspended()) return;
                Log("Crawling the followings of User " + lCurrentID.ToString() + "...");
                LinkedList<long> lstBuffer = crawler.GetFriendsOf(lCurrentID, -1);
                // A leading -1 is treated as "service forbidden": wait, then retry this user.
                if (lstBuffer.Count > 0 && lstBuffer.First.Value == -1)
                {
                    if (!SleepUntilServiceReset()) return;
                    continue;
                }
                Log(lstBuffer.Count.ToString() + " followings crawled.");
                RefreshRequestInterval();

                // Current user follows each buffered user.
                if (!RecordRelationBuffer(lstBuffer, true)) return;
                #endregion

                #region Followers of the current user
                if (!PauseWhileSuspended()) return;
                Log("Crawling the followers of User " + lCurrentID.ToString() + "...");
                lstBuffer = crawler.GetFollowersOf(lCurrentID, -1);
                // NOTE(review): this 'continue' restarts the outer loop, so the
                // followings of the same user are crawled again before the followers
                // are retried - confirm that this is intended.
                if (lstBuffer.Count>0 && lstBuffer.First.Value == -1)
                {
                    if (!SleepUntilServiceReset()) return;
                    continue;
                }
                Log(lstBuffer.Count.ToString() + " followers crawled.");
                RefreshRequestInterval();

                // Each buffered user follows the current user.
                if (!RecordRelationBuffer(lstBuffer, false)) return;
                #endregion

                queueUserForUserRelationRobot.RollQueue();
                Log("Social grapgh of User " + lCurrentID.ToString() + " crawled.");
            }
        }

        /// <summary>
        /// Cancellation/suspension checkpoint: idles while <c>blnSuspending</c> is set.
        /// </summary>
        /// <returns><c>false</c> if cancellation was observed, otherwise <c>true</c>.</returns>
        private bool PauseWhileSuspended()
        {
            if (blnAsyncCancelled) return false;
            while (blnSuspending)
            {
                if (blnAsyncCancelled) return false;
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }
            return true;
        }

        /// <summary>
        /// Waits for the API reset time after a "forbidden" response, sleeping in
        /// one-second slices so cancellation stays responsive.
        /// </summary>
        /// <returns><c>false</c> if cancellation was observed, otherwise <c>true</c>.</returns>
        private bool SleepUntilServiceReset()
        {
            // NOTE(review): the reset time is read from the USER_INFO API although this
            // loop records its position under USER_RELATION - confirm this is intended.
            int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_INFO).ResetTimeInSeconds;
            Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
            for (int i = 0; i < iSleepSeconds; i++)
            {
                if (blnAsyncCancelled) return false;
                Thread.Sleep(1000);
            }
            return true;
        }

        /// <summary>
        /// Re-tunes the request interval and logs the remaining API quota.
        /// </summary>
        private void RefreshRequestInterval()
        {
            AdjustFreq();
            SetCrawlerFreq();
            Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");
        }

        /// <summary>
        /// Drains <paramref name="lstBuffer"/>, recording the relation between the
        /// current user and each buffered user and enqueuing the buffered user for
        /// the enabled robots.
        /// </summary>
        /// <param name="lstBuffer">User IDs crawled for the current user; emptied as they are processed.</param>
        /// <param name="blnCurrentIsSource">
        /// <c>true</c> when the current user follows the buffered users (followings);
        /// <c>false</c> when the buffered users follow the current user (followers).
        /// </param>
        /// <returns><c>false</c> if cancellation was observed, otherwise <c>true</c>.</returns>
        private bool RecordRelationBuffer(LinkedList<long> lstBuffer, bool blnCurrentIsSource)
        {
            while (lstBuffer.Count > 0)
            {
                if (!PauseWhileSuspended()) return false;

                lQueueBufferFirst = lstBuffer.First.Value;
                long lSourceID = blnCurrentIsSource ? lCurrentID : lQueueBufferFirst;
                long lTargetID = blnCurrentIsSource ? lQueueBufferFirst : lCurrentID;

                // Without confirmation every crawled relation is recorded as-is.
                int nRecordRelation = 1;
                if (blnConfirmRelationship)
                {
                    Log("Confirming the relationship between User " + lSourceID.ToString() + " and User " + lTargetID.ToString());
                    nRecordRelation = crawler.RelationExistBetween(lSourceID, lTargetID);
                    RefreshRequestInterval();

                    if (nRecordRelation == -1)
                    {
                        // Forbidden: wait, then retry the same buffered user
                        // (it has not been removed from the buffer yet).
                        if (!SleepUntilServiceReset()) return false;
                        continue;
                    }

                    if (nRecordRelation == 1)
                    {
                        Log("Relationship confirmed. Recording User " + lSourceID.ToString() + " follows User " + lTargetID.ToString() + "...");
                    }
                    else
                    {
                        Log("Relationship not exists. Recording invalid relationship...");
                        InvalidRelation ir = new InvalidRelation();
                        ir.source_user_id = lSourceID;
                        ir.target_user_id = lTargetID;
                        ir.Add();

                        // The buffered user (not the current one) is the one marked invalid.
                        Log("Recording invalid User " + lQueueBufferFirst.ToString() + "...");
                        InvalidUser iu = new InvalidUser();
                        iu.user_id = lQueueBufferFirst;
                        iu.Add();

                        // Drop the invalid user ID from every robot queue.
                        Log("Removing invalid User " + lQueueBufferFirst.ToString() + " from all queues...");
                        queueUserForUserRelationRobot.Remove(lQueueBufferFirst);
                        if (GlobalPool.UserInfoRobotEnabled)
                            queueUserForUserInfoRobot.Remove(lQueueBufferFirst);
                        if (GlobalPool.TagRobotEnabled)
                            queueUserForUserTagRobot.Remove(lQueueBufferFirst);
                        if (GlobalPool.StatusRobotEnabled)
                            queueUserForStatusRobot.Remove(lQueueBufferFirst);
                    }
                }
                else
                {
                    Log("Recording User " + lSourceID.ToString() + " follows User " + lTargetID.ToString() + "...");
                }

                if (nRecordRelation == 1)
                {
                    if (UserRelation.RelationshipExist(lSourceID, lTargetID))
                    {
                        Log("Relationship exists.");
                    }
                    else
                    {
                        UserRelation ur = new UserRelation();
                        ur.source_user_id = lSourceID;
                        ur.target_user_id = lTargetID;
                        ur.Add();
                    }

                    // Enqueue the buffered user for each enabled robot; a message is
                    // logged only when Enqueue returns true.
                    if (queueUserForUserRelationRobot.Enqueue(lQueueBufferFirst))
                        Log("Adding User " + lQueueBufferFirst.ToString() + " to the user queue of User Relation Robot...");
                    if (GlobalPool.UserInfoRobotEnabled && queueUserForUserInfoRobot.Enqueue(lQueueBufferFirst))
                        Log("Adding User " + lQueueBufferFirst.ToString() + " to the user queue of User Information Robot...");
                    if (GlobalPool.TagRobotEnabled && queueUserForUserTagRobot.Enqueue(lQueueBufferFirst))
                        Log("Adding User " + lQueueBufferFirst.ToString() + " to the user queue of User Tag Robot...");
                    if (GlobalPool.StatusRobotEnabled && queueUserForStatusRobot.Enqueue(lQueueBufferFirst))
                        Log("Adding User " + lQueueBufferFirst.ToString() + " to the user queue of Status Robot...");
                }

                lstBuffer.RemoveFirst();
            }
            return true;
        }