示例#1
0
        /// <summary>
        /// Runs the user-information crawl loop until cancellation is requested.
        /// </summary>
        /// <remarks>
        /// The user ID stored under <c>SysArgFor.USER_INFO</c> by a previous run is re-queued
        /// first (when it is positive). Each iteration reads the head of
        /// <c>queueUserForUserInfoRobot</c>, records it as the current ID, requests the user's
        /// information from the crawler and then acts on the result:
        /// <list type="bullet">
        /// <item><description>positive <c>user_id</c>: the user is added to or updated in the database, removed from the invalid-user table if present, and <c>RollQueue()</c> is called;</description></item>
        /// <item><description><c>null</c>: the ID is recorded as an invalid user and removed from the robot queues;</description></item>
        /// <item><description><c>user_id == -1</c>: treated as "service forbidden"; waits for the API reset time and retries the same ID;</description></item>
        /// <item><description><c>user_id == -2</c>: treated as a timeout; retries the same ID immediately.</description></item>
        /// </list>
        /// This method blocks the calling thread and returns only after
        /// <c>blnAsyncCancelled</c> is observed.
        /// </remarks>
        public void Start()
        {
            // Re-queue the user ID at which the previous run stopped, if one was recorded.
            long lLastUID = SysArg.GetCurrentID(SysArgFor.USER_INFO);
            if (lLastUID > 0)
            {
                queueUserForUserInfoRobot.Enqueue(lLastUID);
            }

            if (!WaitForQueuedUser())
            {
                return;
            }

            AdjustFreq();
            SetCrawlerFreq();
            Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

            while (true)
            {
                if (!WaitWhileSuspended())
                {
                    return;
                }

                // The invalid-user branch below removes the current ID from this queue, which
                // can leave it empty; wait for a new entry rather than read the head of an
                // empty queue.
                if (!WaitForQueuedUser())
                {
                    return;
                }

                // Read the head of the queue. RollQueue() is only called after a successful
                // save, so forbidden/timeout results retry the same ID on the next iteration.
                lCurrentID = queueUserForUserInfoRobot.FirstValue;

                // Persist the current ID so that a later run can resume from it.
                Log("Recording current UserID: " + lCurrentID.ToString() + "...");
                SysArg.SetCurrentID(lCurrentID, SysArgFor.USER_INFO);

                if (!WaitWhileSuspended())
                {
                    return;
                }

                Log("Crawling information of User " + lCurrentID.ToString() + "...");
                User user = crawler.GetUserInfo(lCurrentID);

                // Re-tune the request interval after every API call.
                AdjustFreq();
                SetCrawlerFreq();
                Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

                if (user == null)
                {
                    // The user does not exist: remember that and stop every robot from visiting it.
                    Log("Recording invalid User " + lCurrentID.ToString() + "...");
                    InvalidUser iu = new InvalidUser();
                    iu.user_id = lCurrentID;
                    iu.Add();

                    Log("Removing invalid User " + lCurrentID.ToString() + " from all queues...");
                    queueUserForUserRelationRobot.Remove(lCurrentID);
                    queueUserForUserInfoRobot.Remove(lCurrentID);
                    if (GlobalPool.TagRobotEnabled)
                    {
                        queueUserForUserTagRobot.Remove(lCurrentID);
                    }
                    if (GlobalPool.StatusRobotEnabled)
                    {
                        queueUserForStatusRobot.Remove(lCurrentID);
                    }
                }
                else if (user.user_id > 0)
                {
                    // Insert the user when it is not stored yet, otherwise refresh the stored row.
                    if (!User.ExistInDB(lCurrentID))
                    {
                        Log("Saving User " + lCurrentID.ToString() + " into database...");
                        user.Add();
                    }
                    else
                    {
                        Log("Updating the information of User " + lCurrentID.ToString() + "...");
                        user.Update();
                    }

                    // A user that was previously flagged invalid has evidently become valid again.
                    if (InvalidUser.ExistInDB(lCurrentID))
                    {
                        Log("Removing User " + lCurrentID.ToString() + " from invalid users...");
                        InvalidUser.RemoveFromDB(lCurrentID);
                    }

                    Log("The information of User " + lCurrentID.ToString() + " crawled.");
                    queueUserForUserInfoRobot.RollQueue();
                }
                else if (user.user_id == -1)
                {
                    // Forbidden: wait until the API reports its limits reset, then retry this ID.
                    int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_INFO).ResetTimeInSeconds;
                    Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
                    if (!SleepUnlessCancelled(iSleepSeconds))
                    {
                        return;
                    }
                }
                else if (user.user_id == -2)
                {
                    // Timeout: the queue is left untouched, so the same ID is requested again.
                    Log("Time out. I will crawl user " + lCurrentID.ToString() + " again...");
                }
            }
        }

        /// <summary>
        /// Blocks until <c>queueUserForUserInfoRobot</c> holds at least one user ID.
        /// </summary>
        /// <returns><c>false</c> if cancellation was requested while waiting; otherwise <c>true</c>.</returns>
        private bool WaitForQueuedUser()
        {
            while (queueUserForUserInfoRobot.Count == 0)
            {
                if (blnAsyncCancelled)
                {
                    return false;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }
            return true;
        }

        /// <summary>
        /// Blocks while <c>blnSuspending</c> is set, polling every <c>GlobalPool.SleepMsForThread</c> ms.
        /// </summary>
        /// <returns><c>false</c> if cancellation was requested; otherwise <c>true</c>.</returns>
        private bool WaitWhileSuspended()
        {
            if (blnAsyncCancelled)
            {
                return false;
            }
            while (blnSuspending)
            {
                if (blnAsyncCancelled)
                {
                    return false;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }
            return true;
        }

        /// <summary>
        /// Sleeps for the given number of seconds in one-second steps so that cancellation
        /// is noticed within about a second.
        /// </summary>
        /// <param name="iSeconds">Number of seconds to wait; values of zero or less return immediately.</param>
        /// <returns><c>false</c> if cancellation was requested during the wait; otherwise <c>true</c>.</returns>
        private bool SleepUnlessCancelled(int iSeconds)
        {
            for (int i = 0; i < iSeconds; i++)
            {
                if (blnAsyncCancelled)
                {
                    return false;
                }
                Thread.Sleep(1000);
            }
            return true;
        }
示例#2
0
        /// <summary>
        /// Runs the comment crawl loop until cancellation is requested.
        /// </summary>
        /// <remarks>
        /// The status ID stored under <c>SysArgFor.COMMENT</c> by a previous run is re-queued
        /// first (when it is positive). Each iteration reads the head of <c>queueStatus</c>,
        /// records it as the current ID, downloads the status's comments page by page until an
        /// empty page is returned, saves every comment that is not stored yet, and offers each
        /// commenter to the user queues of the other robots. A comment whose
        /// <c>comment_id</c> is not positive is treated as a "service forbidden" marker: paging
        /// stops immediately, the robot waits for the API reset time and then retries the same
        /// status from page 1. This method blocks the calling thread and returns only after
        /// <c>blnAsyncCancelled</c> is observed.
        /// </remarks>
        public void Start()
        {
            // Re-queue the status ID at which the previous run stopped, if one was recorded.
            long lLastStatusID = SysArg.GetCurrentID(SysArgFor.COMMENT);
            if (lLastStatusID > 0)
            {
                queueStatus.Enqueue(lLastStatusID);
            }

            // Nothing to do until some status has been queued.
            while (queueStatus.Count == 0)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }

            AdjustRealFreq();
            SetCrawlerFreq();
            Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

            // Crawl the queue until the robot is cancelled.
            while (true)
            {
                if (!WaitWhileSuspended())
                {
                    return;
                }

                // Read the head of the queue; RollQueue() is only called once the status is done,
                // so a forbidden response retries the same status.
                lCurrentID = queueStatus.FirstValue;

                // Persist the current ID so that a later run can resume from it.
                Log("Recording current StatusID: " + lCurrentID.ToString() + "...");
                SysArg.SetCurrentID(lCurrentID, SysArgFor.COMMENT);

                if (!WaitWhileSuspended())
                {
                    return;
                }

                Log("Crawling the comments of Status " + lCurrentID.ToString() + "...");

                // Download every page of comments; an empty page ends the paging.
                LinkedList<Comment> lstComment = new LinkedList<Comment>();
                bool blnForbidden = false;
                int iPage = 1;
                LinkedList<Comment> lstPage = FetchCommentPage(iPage);
                while (lstPage.Count > 0)
                {
                    if (!WaitWhileSuspended())
                    {
                        return;
                    }

                    foreach (Comment fetched in lstPage)
                    {
                        if (fetched.comment_id <= 0)
                        {
                            blnForbidden = true;
                            break;
                        }
                        lstComment.AddLast(fetched);
                    }

                    // Stop paging as soon as the service refuses us: everything collected for
                    // this status is discarded anyway, so further requests would be wasted.
                    if (blnForbidden)
                    {
                        break;
                    }

                    iPage++;
                    lstPage = FetchCommentPage(iPage);
                }

                if (blnForbidden)
                {
                    // Wait for this robot's own API limits to reset, then retry the same status.
                    int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.COMMENT).ResetTimeInSeconds;
                    Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
                    if (!SleepUnlessCancelled(iSleepSeconds))
                    {
                        return;
                    }
                    continue;
                }

                Log(lstComment.Count.ToString() + " comments of Status " + lCurrentID.ToString() + " crawled.");

                // Work through the collected comments front to back. Replied-to comments are
                // appended to the same list, so they are processed in a later pass of this loop.
                while (lstComment.Count > 0)
                {
                    if (!WaitWhileSuspended())
                    {
                        return;
                    }

                    Comment comment = lstComment.First.Value;

                    if (!Comment.Exists(comment.comment_id))
                    {
                        Log("Saving Comment " + comment.comment_id.ToString() + " into database...");
                        comment.Add();
                    }

                    // Offer the commenter to each robot's queue; Enqueue's return value decides
                    // whether the addition is logged.
                    if (queueUserForUserRelationRobot.Enqueue(comment.user.user_id))
                    {
                        Log("Adding Commenter " + comment.user.user_id.ToString() + " to the user queue of User Relation Robot...");
                    }
                    if (GlobalPool.UserInfoRobotEnabled && queueUserForUserInfoRobot.Enqueue(comment.user.user_id))
                    {
                        Log("Adding Commenter " + comment.user.user_id.ToString() + " to the user queue of User Information Robot...");
                    }
                    if (GlobalPool.TagRobotEnabled && queueUserForUserTagRobot.Enqueue(comment.user.user_id))
                    {
                        Log("Adding Commenter " + comment.user.user_id.ToString() + " to the user queue of User Tag Robot...");
                    }
                    if (GlobalPool.StatusRobotEnabled && queueUserForStatusRobot.Enqueue(comment.user.user_id))
                    {
                        Log("Adding Commenter " + comment.user.user_id.ToString() + " to the user queue of Status Robot...");
                    }
                    if (!User.ExistInDB(comment.user.user_id))
                    {
                        Log("Saving Commenter " + comment.user.user_id.ToString() + " into database...");
                        comment.user.Add();
                    }

                    lstComment.RemoveFirst();

                    if (comment.reply_comment != null)
                    {
                        lstComment.AddLast(comment.reply_comment);
                    }
                }

                queueStatus.RollQueue();
                Log("Comments of Status " + lCurrentID.ToString() + " crawled.");
            }
        }

        /// <summary>
        /// Requests one page of comments for the current status, then re-tunes and logs the
        /// request interval.
        /// </summary>
        /// <param name="iPage">Page number passed to <c>crawler.GetCommentsOf</c>; the caller starts at 1.</param>
        /// <returns>The list returned by the crawler for that page.</returns>
        private LinkedList<Comment> FetchCommentPage(int iPage)
        {
            LinkedList<Comment> lstPage = crawler.GetCommentsOf(lCurrentID, iPage);
            AdjustFreq();
            SetCrawlerFreq();
            Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");
            return lstPage;
        }

        /// <summary>
        /// Blocks while <c>blnSuspending</c> is set, polling every <c>GlobalPool.SleepMsForThread</c> ms.
        /// </summary>
        /// <returns><c>false</c> if cancellation was requested; otherwise <c>true</c>.</returns>
        private bool WaitWhileSuspended()
        {
            if (blnAsyncCancelled)
            {
                return false;
            }
            while (blnSuspending)
            {
                if (blnAsyncCancelled)
                {
                    return false;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }
            return true;
        }

        /// <summary>
        /// Sleeps for the given number of seconds in one-second steps so that cancellation
        /// is noticed within about a second.
        /// </summary>
        /// <param name="iSeconds">Number of seconds to wait; values of zero or less return immediately.</param>
        /// <returns><c>false</c> if cancellation was requested during the wait; otherwise <c>true</c>.</returns>
        private bool SleepUnlessCancelled(int iSeconds)
        {
            for (int i = 0; i < iSeconds; i++)
            {
                if (blnAsyncCancelled)
                {
                    return false;
                }
                Thread.Sleep(1000);
            }
            return true;
        }
示例#3
0
        /// <summary>
        /// Runs the user-tag crawl loop until cancellation is requested.
        /// </summary>
        /// <remarks>
        /// The user ID stored under <c>SysArgFor.USER_TAG</c> by a previous run is re-queued
        /// first (when it is positive). Each iteration reads the head of
        /// <c>queueUserForUserTagRobot</c>, records it as the current ID, requests the user's
        /// tags and inspects the first returned entry:
        /// <list type="bullet">
        /// <item><description>positive <c>tag_id</c>: every tag is added or updated, the user/tag link is recorded when missing, and <c>RollQueue()</c> is called;</description></item>
        /// <item><description><c>tag_id == -1</c>: treated as "service forbidden"; waits for the API reset time and retries the same user;</description></item>
        /// <item><description><c>tag_id == -2</c>: treated as an error whose message is carried in <c>tag</c>; waits for the API reset time and retries the same user;</description></item>
        /// <item><description>anything else, including an empty list: the user is considered done and <c>RollQueue()</c> is called.</description></item>
        /// </list>
        /// This method blocks the calling thread and returns only after
        /// <c>blnAsyncCancelled</c> is observed.
        /// </remarks>
        public void Start()
        {
            // Re-queue the user ID at which the previous run stopped, if one was recorded.
            long lLastUID = SysArg.GetCurrentID(SysArgFor.USER_TAG);
            if (lLastUID > 0)
            {
                queueUserForUserTagRobot.Enqueue(lLastUID);
            }

            // Nothing to do until some user has been queued.
            while (queueUserForUserTagRobot.Count == 0)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }

            AdjustRealFreq();
            SetCrawlerFreq();
            Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

            // Crawl the queue until the robot is cancelled.
            while (true)
            {
                if (!WaitWhileSuspended())
                {
                    return;
                }

                // Read the head of the queue; RollQueue() is only called once the user is done,
                // so forbidden/error responses retry the same user.
                lCurrentID = queueUserForUserTagRobot.FirstValue;

                // Persist the current ID so that a later run can resume from it.
                Log("Recording current UserID: " + lCurrentID.ToString() + "...");
                SysArg.SetCurrentID(lCurrentID, SysArgFor.USER_TAG);

                if (!WaitWhileSuspended())
                {
                    return;
                }

                Log("Crawling tags of User " + lCurrentID.ToString() + "...");
                LinkedList<Tag> lstTag = crawler.GetTagsOf(lCurrentID);

                if (lstTag.Count == 0)
                {
                    // The user has no tags; move on.
                    queueUserForUserTagRobot.RollQueue();
                    Log("Tags of User " + lCurrentID.ToString() + " crawled.");
                    continue;
                }

                // The first entry doubles as a status marker: a non-positive tag_id signals a
                // failed request rather than a real tag.
                Tag firstTag = lstTag.First.Value;
                if (firstTag.tag_id > 0)
                {
                    Log(lstTag.Count.ToString() + " tags crawled.");

                    AdjustFreq();
                    SetCrawlerFreq();
                    Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

                    foreach (Tag tag in lstTag)
                    {
                        if (!WaitWhileSuspended())
                        {
                            return;
                        }

                        // Insert the tag when it is not stored yet, otherwise refresh it.
                        if (!Tag.Exists(tag.tag_id))
                        {
                            Log("Saving Tag " + tag.tag_id.ToString() + " into database...");
                            tag.Add();
                        }
                        else
                        {
                            Log("Updating Tag " + tag.tag_id.ToString() + " into database...");
                            tag.Update();
                        }

                        // Record the user/tag link only once.
                        if (!UserTag.Exists(lCurrentID, tag.tag_id))
                        {
                            Log("Recording User " + lCurrentID.ToString() + " has Tag " + tag.tag_id.ToString() + "...");
                            UserTag user_tag = new UserTag();
                            user_tag.user_id = lCurrentID;
                            user_tag.tag_id = tag.tag_id;
                            user_tag.Add();
                        }
                        else
                        {
                            Log("Tag " + tag.tag_id.ToString() + " of User " + lCurrentID.ToString() + " exists.");
                        }
                    }

                    queueUserForUserTagRobot.RollQueue();
                    Log("Tags of User " + lCurrentID.ToString() + " crawled.");
                }
                else if (firstTag.tag_id == -1)
                {
                    // Forbidden: wait for this robot's own API limits to reset, then retry.
                    int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_TAG).ResetTimeInSeconds;
                    Log("Service is forbidden now. I will wait for " + iSleepSeconds.ToString() + "s to continue...");
                    if (!SleepUnlessCancelled(iSleepSeconds))
                    {
                        return;
                    }
                }
                else if (firstTag.tag_id == -2)
                {
                    // Error reported by the crawler; the text is carried in the marker's tag field.
                    int iSleepSeconds = GlobalPool.GetAPI(SysArgFor.USER_TAG).ResetTimeInSeconds;
                    Log("Error! The error message is \"" + firstTag.tag + "\". I will wait for " + iSleepSeconds.ToString() + "s to continue...");
                    if (!SleepUnlessCancelled(iSleepSeconds))
                    {
                        return;
                    }
                }
                else
                {
                    // Any other marker value is handled like "no tags": move on to the next user.
                    queueUserForUserTagRobot.RollQueue();
                    Log("Tags of User " + lCurrentID.ToString() + " crawled.");
                }
            }
        }

        /// <summary>
        /// Blocks while <c>blnSuspending</c> is set, polling every <c>GlobalPool.SleepMsForThread</c> ms.
        /// </summary>
        /// <returns><c>false</c> if cancellation was requested; otherwise <c>true</c>.</returns>
        private bool WaitWhileSuspended()
        {
            if (blnAsyncCancelled)
            {
                return false;
            }
            while (blnSuspending)
            {
                if (blnAsyncCancelled)
                {
                    return false;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }
            return true;
        }

        /// <summary>
        /// Sleeps for the given number of seconds in one-second steps so that cancellation
        /// is noticed within about a second.
        /// </summary>
        /// <param name="iSeconds">Number of seconds to wait; values of zero or less return immediately.</param>
        /// <returns><c>false</c> if cancellation was requested during the wait; otherwise <c>true</c>.</returns>
        private bool SleepUnlessCancelled(int iSeconds)
        {
            for (int i = 0; i < iSeconds; i++)
            {
                if (blnAsyncCancelled)
                {
                    return false;
                }
                Thread.Sleep(1000);
            }
            return true;
        }
示例#4
0
        /// <summary>
        /// Runs the status (microblog post) crawl loop until cancellation is requested.
        /// </summary>
        /// <remarks>
        /// The user ID stored under <c>SysArgFor.STATUS</c> by a previous run is re-queued first
        /// (when it is positive). Each iteration reads the head of
        /// <c>queueUserForStatusRobot</c>, records it as the current ID, looks up the newest
        /// status ID already stored for that user and asks the crawler for the statuses after
        /// it. When the first returned entry has <c>status_id == -1</c> the service is treated
        /// as forbidden: the robot waits for the API reset time and retries the same user.
        /// Otherwise any returned statuses with a positive leading <c>status_id</c> are saved
        /// and <c>RollQueue()</c> is called. This method blocks the calling thread and returns
        /// only after <c>blnAsyncCancelled</c> is observed.
        /// </remarks>
        public void Start()
        {
            // Resume from the user ID recorded by the previous run, if there is one.
            long lResumeUID = SysArg.GetCurrentID(SysArgFor.STATUS);
            if (lResumeUID > 0)
            {
                queueUserForStatusRobot.Enqueue(lResumeUID);
            }

            // Idle until some user has been queued.
            while (queueUserForStatusRobot.Count == 0)
            {
                if (blnAsyncCancelled)
                {
                    return;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }

            AdjustRealFreq();
            SetCrawlerFreq();
            Log("The initial requesting interval is " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

            // Keep crawling the queue until the robot is cancelled.
            while (true)
            {
                if (!ContinueAfterSuspension())
                {
                    return;
                }

                // Read the head of the queue; it is only rolled once the user has been handled.
                lCurrentID = queueUserForStatusRobot.FirstValue;

                // Persist the current ID so that a later run can resume from it.
                Log("Recording current UserID: " + lCurrentID.ToString() + "...");
                SysArg.SetCurrentID(lCurrentID, SysArgFor.STATUS);

                if (!ContinueAfterSuspension())
                {
                    return;
                }

                // Newest status of this user that is already in the database.
                Log("Getting the latest Status ID of User " + lCurrentID.ToString() + "...");
                long lNewestSID = Status.GetLastStatusIDOf(lCurrentID);

                if (!ContinueAfterSuspension())
                {
                    return;
                }

                // Fetch everything the user posted after that status.
                Log("Crawling statuses after Status " + lNewestSID.ToString() + " of User " + lCurrentID.ToString() + "...");
                LinkedList<Status> lstFetched = crawler.GetStatusesOfSince(lCurrentID, lNewestSID);
                bool blnHasEntries = lstFetched.Count > 0;

                if (blnHasEntries && lstFetched.First.Value.status_id == -1)
                {
                    // Forbidden: wait for the API limits to reset and retry the same user.
                    lstFetched.Clear();
                    int iWaitSeconds = GlobalPool.GetAPI(SysArgFor.STATUS).ResetTimeInSeconds;
                    Log("Service is forbidden now. I will wait for " + iWaitSeconds.ToString() + "s to continue...");
                    for (int iElapsed = 0; iElapsed < iWaitSeconds; iElapsed++)
                    {
                        if (blnAsyncCancelled)
                        {
                            return;
                        }
                        Thread.Sleep(1000);
                    }
                    continue;
                }

                if (blnHasEntries && lstFetched.First.Value.status_id > 0)
                {
                    Log(lstFetched.Count.ToString() + " statuses crawled.");

                    AdjustFreq();
                    SetCrawlerFreq();
                    Log("Requesting interval is adjusted as " + crawler.SleepTime.ToString() + "ms. " + api.ResetTimeInSeconds.ToString() + "s, " + api.RemainingIPHits.ToString() + " IP hits and " + api.RemainingUserHits.ToString() + " user hits left this hour.");

                    // Store the fetched statuses in the order they were returned.
                    foreach (Status fetched in lstFetched)
                    {
                        if (!ContinueAfterSuspension())
                        {
                            return;
                        }
                        SaveStatus(fetched);
                    }
                }

                // Reached both after a successful save and when nothing usable was returned.
                queueUserForStatusRobot.RollQueue();
                Log("Statuses of User " + lCurrentID.ToString() + " crawled.");
            }
        }

        /// <summary>
        /// Blocks while <c>blnSuspending</c> is set, polling every <c>GlobalPool.SleepMsForThread</c> ms.
        /// </summary>
        /// <returns><c>false</c> if cancellation was requested; otherwise <c>true</c>.</returns>
        private bool ContinueAfterSuspension()
        {
            if (blnAsyncCancelled)
            {
                return false;
            }
            while (blnSuspending)
            {
                if (blnAsyncCancelled)
                {
                    return false;
                }
                Thread.Sleep(GlobalPool.SleepMsForThread);
            }
            return true;
        }