Пример #1
0
        /// <summary>
        /// Runs the User Information Robot: repeatedly takes the user ID at the head of
        /// <c>queueUserForUserInfoRobot</c>, crawls that user's basic information and saves it.
        /// The user ID recorded by a previous run (if any) is enqueued first.
        /// </summary>
        /// <remarks>
        /// Returns only when <c>blnAsyncCancelled</c> is set. Idles while <c>blnSuspending</c>
        /// is set or while the queue is empty.
        /// </remarks>
        public void Start()
        {
            // Resume from the user ID recorded when the previous run stopped, if there is one.
            long lLastUID = SysArg.GetCurrentID(SysArgFor.USER_INFO);

            if (lLastUID > 0)
            {
                queueUserForUserInfoRobot.Enqueue(lLastUID);
            }
            while (queueUserForUserInfoRobot.Count == 0)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);   // nothing queued yet: wait
            }
            AdjustFreq();
            SetCrawlerFreq();
            Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");
            User user;

            // Crawl the queue until cancelled.
            while (true)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                while (blnSuspending)
                {
                    if (blnAsyncCancelled)
                    {
                        return;
                    }
                    Thread.Sleep(GlobalPool.SleepMsForThread);
                }

                // Removing an invalid user (below) can leave the queue empty. Wait for new
                // work, exactly as at start-up, instead of reading the head of an empty queue.
                while (queueUserForUserInfoRobot.Count == 0)
                {
                    if (blnAsyncCancelled)
                    {
                        return;
                    }
                    Thread.Sleep(GlobalPool.SleepMsForThread);
                }

                // Read the head without rolling; RollQueue() is only called after a successful crawl.
                lCurrentID = queueUserForUserInfoRobot.FirstValue;

                // Record the ID being processed so that a later run can resume from it.
                Log("Recording current UserID: " + lCurrentID.ToString() + "...");
                SysArg.SetCurrentID(lCurrentID, SysArgFor.USER_INFO);

                #region Basic user information
                if (blnAsyncCancelled)
                {
                    return;
                }
                while (blnSuspending)
                {
                    if (blnAsyncCancelled)
                    {
                        return;
                    }
                    Thread.Sleep(GlobalPool.SleepMsForThread);
                }

                Log("Crawling information of User " + lCurrentID.ToString() + "...");
                user = crawler.GetUserInfo(lCurrentID);

                AdjustFreq();
                SetCrawlerFreq();
                Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");
                if (user != null && user.user_id > 0)
                {
                    // Insert the user if not stored yet, otherwise refresh the stored record.
                    if (!User.ExistInDB(lCurrentID))
                    {
                        Log("Saving User " + lCurrentID.ToString() + " into database...");
                        user.Add();
                    }
                    else
                    {
                        Log("Updating the information of User " + lCurrentID.ToString() + "...");
                        user.Update();
                    }
                    // The user turned out to be valid, so drop any earlier "invalid" record.
                    if (InvalidUser.ExistInDB(lCurrentID))
                    {
                        Log("Removing User " + lCurrentID.ToString() + " from invalid users...");
                        InvalidUser.RemoveFromDB(lCurrentID);
                    }
                    Log("The information of User " + lCurrentID.ToString() + " crawled.");
                    queueUserForUserInfoRobot.RollQueue();
                }
                else if (user == null) // the user does not exist
                {
                    Log("Recording invalid User " + lCurrentID.ToString() + "...");
                    InvalidUser iu = new InvalidUser();
                    iu.user_id = lCurrentID;
                    iu.Add();

                    // Remove the user ID from every robot's queue.
                    Log("Removing invalid User " + lCurrentID.ToString() + " from all queues...");
                    queueUserForUserRelationRobot.Remove(lCurrentID);
                    queueUserForUserInfoRobot.Remove(lCurrentID);
                    if (GlobalPool.TagRobotEnabled)
                    {
                        queueUserForUserTagRobot.Remove(lCurrentID);
                    }
                    if (GlobalPool.StatusRobotEnabled)
                    {
                        queueUserForStatusRobot.Remove(lCurrentID);
                    }
                }
                else if (user.user_id == -1)   // forbidden
                {
                    // Wait in one-second steps so that cancellation is noticed promptly.
                    int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_INFO).ResetTimeInSeconds;
                    Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
                    for (int i = 0; i < iSleepSeconds; i++)
                    {
                        if (blnAsyncCancelled)
                        {
                            return;
                        }
                        Thread.Sleep(1000);
                    }
                }
                else if (user.user_id == -2)   // timeout
                {
                    // The head of the queue is left in place, so the next iteration retries it.
                    Log("Time out. I will crawl user " + lCurrentID.ToString() + " again...");
                }
                #endregion
            }
        }
Пример #2
0
        /// <summary>
        /// Runs the User Relation Robot starting from the given user ID: for the user at the head of
        /// <c>queueUserForUserRelationRobot</c> it crawls the followings and the followers, records the
        /// relationships and enqueues every discovered user for all enabled robots.
        /// </summary>
        /// <remarks>
        /// Returns only when <c>blnAsyncCancelled</c> is set; idles while <c>blnSuspending</c> is set.
        /// </remarks>
        /// <param name="lStartUserID">ID of the user to start crawling from. When 0, the method returns immediately.</param>
        public void Start(long lStartUserID)
        {
            if (lStartUserID == 0)
            {
                return;
            }
            AdjustRealFreq();
            SetCrawlerFreq();
            Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

            // Seed this robot's queue, and the queue of every other enabled robot, with the starting user.
            queueUserForUserRelationRobot.Enqueue(lStartUserID);
            if (GlobalPool.UserInfoRobotEnabled)
            {
                queueUserForUserInfoRobot.Enqueue(lStartUserID);
            }
            if (GlobalPool.TagRobotEnabled)
            {
                queueUserForUserTagRobot.Enqueue(lStartUserID);
            }
            if (GlobalPool.StatusRobotEnabled)
            {
                queueUserForStatusRobot.Enqueue(lStartUserID);
            }
            lCurrentID = lStartUserID;

            // Crawl the queue until cancelled.
            while (true)
            {
                if (!WaitWhileSuspended())
                {
                    return;
                }

                // Read the head without rolling; RollQueue() is only called once both lists are crawled.
                lCurrentID = queueUserForUserRelationRobot.FirstValue;

                // Record the ID being processed so that a later run can resume from it.
                Log("Recording current UserID:" + lCurrentID.ToString() + "...");
                SysArg.SetCurrentID(lCurrentID, SysArgFor.USER_RELATION);

                RelationCrawlResult result = CrawlRelationsOfCurrentUser(true);
                if (result == RelationCrawlResult.Cancelled)
                {
                    return;
                }
                if (result == RelationCrawlResult.Forbidden)
                {
                    // Nothing was processed for this user yet: start the iteration over.
                    continue;
                }

                // When the followers request is forbidden, retry only the followers. Restarting the
                // whole iteration would crawl (and re-confirm) all followings a second time.
                do
                {
                    result = CrawlRelationsOfCurrentUser(false);
                    if (result == RelationCrawlResult.Cancelled)
                    {
                        return;
                    }
                }
                while (result == RelationCrawlResult.Forbidden);

                queueUserForUserRelationRobot.RollQueue();
                Log("Social grapgh of User " + lCurrentID.ToString() + " crawled.");
            }
        }

        /// <summary>Outcome of crawling one relation list (followings or followers) of the current user.</summary>
        private enum RelationCrawlResult
        {
            /// <summary>The whole list was fetched and processed.</summary>
            Completed,
            /// <summary>The list request was forbidden; the reset wait has already elapsed.</summary>
            Forbidden,
            /// <summary><c>blnAsyncCancelled</c> was observed; the caller must return.</summary>
            Cancelled
        }

        /// <summary>
        /// Blocks while <c>blnSuspending</c> is set.
        /// </summary>
        /// <returns><c>false</c> if <c>blnAsyncCancelled</c> was observed, otherwise <c>true</c>.</returns>
        private bool WaitWhileSuspended()
        {
            if (blnAsyncCancelled)
            {
                return false;
            }
            while (blnSuspending)
            {
                if (blnAsyncCancelled)
                {
                    return false;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }
            return true;
        }

        /// <summary>
        /// Sleeps for the API's reset time after a "forbidden" response, in one-second steps so that
        /// cancellation is noticed promptly.
        /// </summary>
        /// <returns><c>false</c> if <c>blnAsyncCancelled</c> was observed, otherwise <c>true</c>.</returns>
        private bool WaitForRelationApiReset()
        {
            // NOTE(review): the original read the reset time of the USER_INFO API here, which looks like
            // a copy-paste from the User Information Robot; this robot records its state under
            // USER_RELATION, so that API's reset time is used instead - confirm against GlobalPool.GetAPI.
            int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_RELATION).ResetTimeInSeconds;
            Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
            for (int i = 0; i < iSleepSeconds; i++)
            {
                if (blnAsyncCancelled)
                {
                    return false;
                }
                Thread.Sleep(1000);
            }
            return true;
        }

        /// <summary>Re-adjusts the request frequency after an API call and logs the remaining quota.</summary>
        private void RefreshRelationRequestInterval()
        {
            AdjustFreq();
            SetCrawlerFreq();
            Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");
        }

        /// <summary>Records the user as invalid and removes it from the queues of all enabled robots.</summary>
        /// <param name="lUserID">ID of the invalid user.</param>
        private void RecordInvalidRelatedUser(long lUserID)
        {
            Log("Recording invalid User " + lUserID.ToString() + "...");
            InvalidUser iu = new InvalidUser();
            iu.user_id = lUserID;
            iu.Add();

            Log("Removing invalid User " + lUserID.ToString() + " from all queues...");
            queueUserForUserRelationRobot.Remove(lUserID);
            if (GlobalPool.UserInfoRobotEnabled)
            {
                queueUserForUserInfoRobot.Remove(lUserID);
            }
            if (GlobalPool.TagRobotEnabled)
            {
                queueUserForUserTagRobot.Remove(lUserID);
            }
            if (GlobalPool.StatusRobotEnabled)
            {
                queueUserForStatusRobot.Remove(lUserID);
            }
        }

        /// <summary>Enqueues the user for every enabled robot, logging each queue that accepted it.</summary>
        /// <param name="lUserID">ID of the discovered user.</param>
        private void EnqueueRelatedUser(long lUserID)
        {
            if (queueUserForUserRelationRobot.Enqueue(lUserID))
            {
                Log("Adding User " + lUserID.ToString() + " to the user queue of User Relation Robot...");
            }
            if (GlobalPool.UserInfoRobotEnabled && queueUserForUserInfoRobot.Enqueue(lUserID))
            {
                Log("Adding User " + lUserID.ToString() + " to the user queue of User Information Robot...");
            }
            if (GlobalPool.TagRobotEnabled && queueUserForUserTagRobot.Enqueue(lUserID))
            {
                Log("Adding User " + lUserID.ToString() + " to the user queue of User Tag Robot...");
            }
            if (GlobalPool.StatusRobotEnabled && queueUserForStatusRobot.Enqueue(lUserID))
            {
                Log("Adding User " + lUserID.ToString() + " to the user queue of Status Robot...");
            }
        }

        /// <summary>
        /// Crawls one relation list of <c>lCurrentID</c>, records each relationship and enqueues the
        /// related users.
        /// </summary>
        /// <param name="blnFollowings">
        /// <c>true</c> to crawl the users that <c>lCurrentID</c> follows (current user is the source of
        /// each relationship); <c>false</c> to crawl its followers (current user is the target).
        /// </param>
        /// <returns>Whether the list was completed, the list request was forbidden, or the run was cancelled.</returns>
        private RelationCrawlResult CrawlRelationsOfCurrentUser(bool blnFollowings)
        {
            if (!WaitWhileSuspended())
            {
                return RelationCrawlResult.Cancelled;
            }

            LinkedList<long> lstBuffer;
            if (blnFollowings)
            {
                Log("Crawling the followings of User " + lCurrentID.ToString() + "...");
                lstBuffer = crawler.GetFriendsOf(lCurrentID, -1);
            }
            else
            {
                Log("Crawling the followers of User " + lCurrentID.ToString() + "...");
                lstBuffer = crawler.GetFollowersOf(lCurrentID, -1);
            }

            // A leading -1 is treated as "service forbidden".
            if (lstBuffer.Count > 0 && lstBuffer.First.Value == -1)
            {
                return WaitForRelationApiReset() ? RelationCrawlResult.Forbidden : RelationCrawlResult.Cancelled;
            }

            Log(lstBuffer.Count.ToString() + (blnFollowings ? " followings crawled." : " followers crawled."));
            RefreshRelationRequestInterval();

            while (lstBuffer.Count > 0)
            {
                if (!WaitWhileSuspended())
                {
                    return RelationCrawlResult.Cancelled;
                }

                lQueueBufferFirst = lstBuffer.First.Value;
                long lRelatedID = lQueueBufferFirst;
                long lSourceID = blnFollowings ? lCurrentID : lRelatedID;
                long lTargetID = blnFollowings ? lRelatedID : lCurrentID;

                int nRecordRelation = 1;
                if (blnConfirmRelationship)
                {
                    Log("Confirming the relationship between User " + lSourceID.ToString() + " and User " + lTargetID.ToString());
                    nRecordRelation = crawler.RelationExistBetween(lSourceID, lTargetID);
                    RefreshRelationRequestInterval();
                    if (nRecordRelation == -1)
                    {
                        if (!WaitForRelationApiReset())
                        {
                            return RelationCrawlResult.Cancelled;
                        }
                        // The entry was not removed from the buffer, so it is confirmed again.
                        continue;
                    }
                    if (nRecordRelation == 1)
                    {
                        Log("Relationship confirmed. Recording User " + lSourceID.ToString() + " follows User " + lTargetID.ToString() + "...");
                    }
                    else
                    {
                        Log("Relationship not exists. Recording invalid relationship...");
                        InvalidRelation ir = new InvalidRelation();
                        ir.source_user_id = lSourceID;
                        ir.target_user_id = lTargetID;
                        ir.Add();

                        RecordInvalidRelatedUser(lRelatedID);
                    }
                }
                else
                {
                    Log("Recording User " + lSourceID.ToString() + " follows User " + lTargetID.ToString() + "...");
                }

                if (nRecordRelation == 1)
                {
                    if (UserRelation.RelationshipExist(lSourceID, lTargetID))
                    {
                        Log("Relationship exists.");
                    }
                    else
                    {
                        UserRelation ur = new UserRelation();
                        ur.source_user_id = lSourceID;
                        ur.target_user_id = lTargetID;
                        ur.Add();
                    }

                    EnqueueRelatedUser(lRelatedID);
                }
                lstBuffer.RemoveFirst();
            }
            return RelationCrawlResult.Completed;
        }
Пример #3
0
        /// <summary>
        /// Runs the Comment Robot: repeatedly takes the status ID at the head of <c>queueStatus</c>,
        /// crawls all comment pages of that status, saves new comments and their authors, and enqueues
        /// the commenters for the enabled user robots. The status ID recorded by a previous run
        /// (if any) is enqueued first.
        /// </summary>
        /// <remarks>
        /// Returns only when <c>blnAsyncCancelled</c> is set; idles while <c>blnSuspending</c> is set.
        /// </remarks>
        public void Start()
        {
            // Resume from the status ID recorded when the previous run stopped, if there is one.
            long lLastStatusID = SysArg.GetCurrentID(SysArgFor.COMMENT);

            if (lLastStatusID > 0)
            {
                queueStatus.Enqueue(lLastStatusID);
            }
            while (queueStatus.Count == 0)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);   // nothing queued yet: wait
            }

            AdjustRealFreq();
            SetCrawlerFreq();
            Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

            // Crawl the queue until cancelled.
            while (true)
            {
                bool blnForbidden = false;
                if (blnAsyncCancelled)
                {
                    return;
                }
                while (blnSuspending)
                {
                    if (blnAsyncCancelled)
                    {
                        return;
                    }
                    Thread.Sleep(GlobalPool.SleepMsForThread);
                }

                // Read the head without rolling; RollQueue() is only called after a complete crawl.
                lCurrentID = queueStatus.FirstValue;

                // Record the ID being processed so that a later run can resume from it.
                Log("Recording current StatusID: " + lCurrentID.ToString() + "...");
                SysArg.SetCurrentID(lCurrentID, SysArgFor.COMMENT);

                #region Comments of the current status
                if (blnAsyncCancelled)
                {
                    return;
                }
                while (blnSuspending)
                {
                    if (blnAsyncCancelled)
                    {
                        return;
                    }
                    Thread.Sleep(GlobalPool.SleepMsForThread);
                }

                Log("Crawling the comments of Status " + lCurrentID.ToString() + "...");
                int iPage = 1;
                // lstComment accumulates the comments of all pages; lstTemp holds the page just fetched.
                LinkedList<Comment> lstComment = new LinkedList<Comment>();
                LinkedList<Comment> lstTemp = crawler.GetCommentsOf(lCurrentID, iPage);

                AdjustFreq();
                SetCrawlerFreq();
                Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");
                // Fetch page after page until an empty page is returned.
                while (lstTemp.Count > 0)
                {
                    if (blnAsyncCancelled)
                    {
                        return;
                    }
                    while (blnSuspending)
                    {
                        if (blnAsyncCancelled)
                        {
                            return;
                        }
                        Thread.Sleep(GlobalPool.SleepMsForThread);
                    }
                    while (lstTemp.Count > 0)
                    {
                        if (lstTemp.First.Value.comment_id > 0)
                        {
                            lstComment.AddLast(lstTemp.First.Value);
                            lstTemp.RemoveFirst();
                        }
                        else
                        {
                            // A non-positive comment ID is treated as "service forbidden".
                            blnForbidden = true;
                            // NOTE(review): the original read the reset time of the USER_INFO API here,
                            // which looks like a copy-paste; this robot records its state under COMMENT,
                            // so that API's reset time is used - confirm against GlobalPool.GetAPI.
                            int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.COMMENT).ResetTimeInSeconds;
                            Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
                            // Wait in one-second steps so that cancellation is noticed promptly.
                            for (int i = 0; i < iSleepSeconds; i++)
                            {
                                if (blnAsyncCancelled)
                                {
                                    return;
                                }
                                Thread.Sleep(1000);
                            }
                            break;
                        }
                    }
                    if (blnForbidden)
                    {
                        // The whole status is retried from page 1 below, so requesting further pages
                        // now would only spend API quota on results that are thrown away.
                        break;
                    }
                    iPage++;
                    lstTemp = crawler.GetCommentsOf(lCurrentID, iPage);

                    AdjustFreq();
                    SetCrawlerFreq();
                    Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");
                }

                if (blnForbidden)
                {
                    // The status stays at the head of the queue and is crawled again.
                    continue;
                }

                Log(lstComment.Count.ToString() + " comments of Status " + lCurrentID.ToString() + " crawled.");
                Comment comment;
                while (lstComment.Count > 0)
                {
                    if (blnAsyncCancelled)
                    {
                        return;
                    }
                    while (blnSuspending)
                    {
                        if (blnAsyncCancelled)
                        {
                            return;
                        }
                        Thread.Sleep(GlobalPool.SleepMsForThread);
                    }
                    comment = lstComment.First.Value;

                    if (!Comment.Exists(comment.comment_id))
                    {
                        Log("Saving Comment " + comment.comment_id.ToString() + " into database...");
                        comment.Add();
                    }

                    // Hand the commenter over to every enabled user robot.
                    if (queueUserForUserRelationRobot.Enqueue(comment.user.user_id))
                    {
                        Log("Adding Commenter " + comment.user.user_id.ToString() + " to the user queue of User Relation Robot...");
                    }
                    if (GlobalPool.UserInfoRobotEnabled && queueUserForUserInfoRobot.Enqueue(comment.user.user_id))
                    {
                        Log("Adding Commenter " + comment.user.user_id.ToString() + " to the user queue of User Information Robot...");
                    }
                    if (GlobalPool.TagRobotEnabled && queueUserForUserTagRobot.Enqueue(comment.user.user_id))
                    {
                        Log("Adding Commenter " + comment.user.user_id.ToString() + " to the user queue of User Tag Robot...");
                    }
                    if (GlobalPool.StatusRobotEnabled && queueUserForStatusRobot.Enqueue(comment.user.user_id))
                    {
                        Log("Adding Commenter " + comment.user.user_id.ToString() + " to the user queue of Status Robot...");
                    }
                    if (!User.ExistInDB(comment.user.user_id))
                    {
                        Log("Saving Commenter " + comment.user.user_id.ToString() + " into database...");
                        comment.user.Add();
                    }

                    lstComment.RemoveFirst();

                    // A replied-to comment is appended so that it is processed like any other comment.
                    if (comment.reply_comment != null)
                    {
                        lstComment.AddLast(comment.reply_comment);
                    }
                }
                queueStatus.RollQueue();
                Log("Comments of Status " + lCurrentID.ToString() + " crawled.");
                #endregion
            }
        }
Пример #4
0
        /// <summary>
        /// Runs the user-tag crawl loop. The user ID recorded under
        /// <c>SysArgFor.USER_TAG</c> (if positive) is re-enqueued first, then the
        /// tags of the user at the head of the queue are crawled and stored,
        /// over and over, until the robot is cancelled.
        /// </summary>
        public void Start()
        {
            // Re-enqueue the user ID recorded when the robot last stopped, if any.
            long lLastUID = SysArg.GetCurrentID(SysArgFor.USER_TAG);
            if (lLastUID > 0)
            {
                queueUserForUserTagRobot.Enqueue(lLastUID);
            }

            // Nothing to do until some user shows up in the queue.
            while (queueUserForUserTagRobot.Count == 0)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }

            AdjustRealFreq();
            SetCrawlerFreq();
            Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

            // Crawl forever; the only exits are the cancellation checks.
            while (true)
            {
                if (!WaitIfSuspended())
                {
                    return;
                }

                // Peek at the head of the queue; it is only rolled once the user is done,
                // so a rate-limit/error retry below sees the same user again.
                lCurrentID = queueUserForUserTagRobot.FirstValue;

                Log("Recording current UserID: " + lCurrentID.ToString() + "...");
                SysArg.SetCurrentID(lCurrentID, SysArgFor.USER_TAG);

                if (!WaitIfSuspended())
                {
                    return;
                }

                Log("Crawling tags of User " + lCurrentID.ToString() + "...");
                LinkedList<Tag> lstTag = crawler.GetTagsOf(lCurrentID);
                bool blnHasTags = lstTag.Count > 0;

                // A first element with tag_id == -1 is reported as "service forbidden".
                if (blnHasTags && lstTag.First.Value.tag_id == -1)
                {
                    lstTag.Clear();
                    // Use this robot's own API entry (USER_TAG), mirroring how the status
                    // robot uses SysArgFor.STATUS; the previous USER_INFO lookup waited
                    // on another robot's reset window.
                    int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_TAG).ResetTimeInSeconds;
                    Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
                    if (!SleepSecondsUnlessCancelled(iSleepSeconds))
                    {
                        return;
                    }
                    continue;
                }

                // A first element with tag_id == -2 carries an error message in its tag field.
                if (blnHasTags && lstTag.First.Value.tag_id == -2)
                {
                    int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_TAG).ResetTimeInSeconds;
                    Log("Error! The error message is \"" + lstTag.First.Value.tag + "\". I will wait for " + iSleepSeconds.ToString() + "s to continue...");
                    lstTag.Clear();
                    if (!SleepSecondsUnlessCancelled(iSleepSeconds))
                    {
                        return;
                    }
                    continue;
                }

                if (blnHasTags && lstTag.First.Value.tag_id > 0)
                {
                    Log(lstTag.Count.ToString() + " tags crawled.");
                    AdjustFreq();
                    SetCrawlerFreq();
                    Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

                    // Drain the list front to back, persisting each tag and its user link.
                    while (lstTag.Count > 0)
                    {
                        if (!WaitIfSuspended())
                        {
                            return;
                        }

                        Tag tag = lstTag.First.Value;
                        if (!Tag.Exists(tag.tag_id))
                        {
                            Log("Saving Tag " + tag.tag_id.ToString() + " into database...");
                            tag.Add();
                        }
                        else
                        {
                            Log("Updating Tag " + tag.tag_id.ToString() + " into database...");
                            tag.Update();
                        }

                        if (!UserTag.Exists(lCurrentID, tag.tag_id))
                        {
                            Log("Recording User " + lCurrentID.ToString() + " has Tag " + tag.tag_id.ToString() + "...");
                            UserTag user_tag = new UserTag();
                            user_tag.user_id = lCurrentID;
                            user_tag.tag_id = tag.tag_id;
                            user_tag.Add();
                        }
                        else
                        {
                            Log("Tag " + tag.tag_id.ToString() + " of User " + lCurrentID.ToString() + " exists.");
                        }

                        lstTag.RemoveFirst();
                    }
                }

                // Reached both after a successful crawl and when nothing usable came back:
                // this user is finished, so roll the queue.
                queueUserForUserTagRobot.RollQueue();
                Log("Tags of User " + lCurrentID.ToString() + " crawled.");
            }
        }

        /// <summary>
        /// Blocks while the robot is suspended, polling every
        /// <c>GlobalPool.SleepMsForThread</c> milliseconds.
        /// </summary>
        /// <returns><c>false</c> if cancellation was requested (the caller should stop); otherwise <c>true</c>.</returns>
        private bool WaitIfSuspended()
        {
            if (blnAsyncCancelled)
            {
                return false;
            }
            while (blnSuspending)
            {
                if (blnAsyncCancelled)
                {
                    return false;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }
            return true;
        }

        /// <summary>
        /// Sleeps for the given number of seconds in one-second slices so that a
        /// cancellation request is noticed within about a second.
        /// </summary>
        /// <param name="iSeconds">Number of one-second slices to sleep; zero or negative returns immediately.</param>
        /// <returns><c>false</c> if cancellation was requested during the wait; otherwise <c>true</c>.</returns>
        private bool SleepSecondsUnlessCancelled(int iSeconds)
        {
            for (int i = 0; i < iSeconds; i++)
            {
                if (blnAsyncCancelled)
                {
                    return false;
                }
                Thread.Sleep(1000);
            }
            return true;
        }
Пример #5
0
        /// <summary>
        /// Starts crawling microblog statuses. The user ID recorded under
        /// <c>SysArgFor.STATUS</c> (if positive) is re-enqueued first; afterwards the
        /// statuses newer than the latest stored one are fetched and saved for the
        /// user at the head of the queue, repeatedly, until the robot is cancelled.
        /// </summary>
        public void Start()
        {
            // Put the user ID from the previous run back into the queue.
            long lResumeUID = SysArg.GetCurrentID(SysArgFor.STATUS);
            if (lResumeUID > 0)
            {
                queueUserForStatusRobot.Enqueue(lResumeUID);
            }

            // Idle until the queue is non-empty (or we are cancelled).
            while (queueUserForStatusRobot.Count == 0)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }

            AdjustRealFreq();
            SetCrawlerFreq();
            Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

            // Endless crawl loop; it ends only through a cancellation check.
            for (; ; )
            {
                if (!HoldWhileSuspended())
                {
                    return;
                }

                // Read (do not remove) the queue head and persist it as the current user.
                lCurrentID = queueUserForStatusRobot.FirstValue;
                Log("Recording current UserID: " + lCurrentID.ToString() + "...");
                SysArg.SetCurrentID(lCurrentID, SysArgFor.STATUS);

                if (!HoldWhileSuspended())
                {
                    return;
                }

                // Look up the newest status already stored for this user.
                Log("Getting the latest Status ID of User " + lCurrentID.ToString() + "...");
                long lLatestSID = Status.GetLastStatusIDOf(lCurrentID);

                if (!HoldWhileSuspended())
                {
                    return;
                }

                // Fetch everything posted after that status.
                Log("Crawling statuses after Status " + lLatestSID.ToString() + " of User " + lCurrentID.ToString() + "...");
                LinkedList<Status> lstStatus = crawler.GetStatusesOfSince(lCurrentID, lLatestSID);
                bool blnGotItems = lstStatus.Count > 0;

                // A first element with status_id == -1 is reported as "service forbidden":
                // wait out the reset window and retry the same user.
                if (blnGotItems && lstStatus.First.Value.status_id == -1)
                {
                    lstStatus.Clear();
                    int iWaitSeconds = GlobalPool.GetAPI(SysArgFor.STATUS).ResetTimeInSeconds;
                    Log("Service is forbidden now. I will wait for " + iWaitSeconds.ToString() + "s to continue...");
                    if (!IdleForSeconds(iWaitSeconds))
                    {
                        return;
                    }
                    continue;
                }

                if (blnGotItems && lstStatus.First.Value.status_id > 0)
                {
                    Log(lstStatus.Count.ToString() + " statuses crawled.");
                    AdjustFreq();
                    SetCrawlerFreq();
                    Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

                    // Save statuses front to back, dropping each one once stored.
                    for (; lstStatus.Count > 0; lstStatus.RemoveFirst())
                    {
                        if (!HoldWhileSuspended())
                        {
                            return;
                        }
                        SaveStatus(lstStatus.First.Value);
                    }
                }

                // Shared tail for "statuses saved" and "nothing usable returned".
                queueUserForStatusRobot.RollQueue();
                Log("Statuses of User " + lCurrentID.ToString() + " crawled.");
            }
        }

        /// <summary>
        /// Waits for as long as the robot is suspended, polling every
        /// <c>GlobalPool.SleepMsForThread</c> milliseconds.
        /// </summary>
        /// <returns><c>true</c> to carry on; <c>false</c> when cancellation was requested.</returns>
        private bool HoldWhileSuspended()
        {
            if (blnAsyncCancelled)
            {
                return false;
            }
            while (blnSuspending)
            {
                if (blnAsyncCancelled)
                {
                    return false;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }
            return true;
        }

        /// <summary>
        /// Sleeps in one-second steps for the requested number of seconds,
        /// checking for cancellation before each step.
        /// </summary>
        /// <param name="iSeconds">How many one-second steps to sleep.</param>
        /// <returns><c>true</c> if the full wait elapsed; <c>false</c> when cancellation was requested.</returns>
        private bool IdleForSeconds(int iSeconds)
        {
            for (int iElapsed = 0; iElapsed < iSeconds; iElapsed++)
            {
                if (blnAsyncCancelled)
                {
                    return false;
                }
                Thread.Sleep(1000);
            }
            return true;
        }