Beispiel #1
0
        /// <summary>
        /// Converts a status entity returned by the Sina Weibo SDK into the local <c>Item</c> type.
        /// </summary>
        /// <param name="status">Status returned by Sina.</param>
        /// <param name="source">Source category handed to the tracking-rule setup and to the conversion of a retweeted status.</param>
        /// <param name="CrawlID">Crawl identifier stored on the item and appended to the "ServerLocation" app setting to form <c>Crawler</c>.</param>
        /// <returns>
        /// The converted item. Exceptions raised while copying data from <paramref name="status"/> are swallowed,
        /// so the returned item may be only partially filled.
        /// </returns>
        public static Item ConvertToItem(NetDimension.Weibo.Entities.status.Entity status, Enums.AuthorSource source, string CrawlID)
        {
            var item = new Item
            {
                CrawlID = CrawlID,
                Crawler = ConfigurationManager.AppSettings["ServerLocation"] + CrawlID
            };

            // Media information.
            WeiboUtilities.SetItemMediaInfo(item);

            // Crawl-task bookkeeping.
            item.FetchTime = DateTime.Now;
            item.UpdateTime = null;
            item.ContentDetailLevel = Enums.ContentDetailLevel.Weibo;

            try
            {
                // Basic data copied from the status and its user.
                item.Url = Utilities.GetItemUrl(status.User.ID, status.MID);
                item.ItemID = Palas.Common.Utility.MD5Helper.getMd5Hash(item.Url);
                item.ClientItemID = status.ID;
                item.CleanTitle = status.Text;
                item.PubDate = Utilities.ParseToDateTime(status.CreatedAt);
                item.Location = null;
                item.PoID = null;
                item.AuthorName = status.User.Name;
                item.AuthorID = status.User.ID;
                item.AuthorImg = status.User.ProfileImageUrl;
                item.AuthorCertificated = Utilities.GetCertificationType(status.User.VerifiedType, status.User.Verified);
                item.Source = status.Source;
                item.AttachImg = status.OriginalPictureUrl;

                // Item tracking: the current counters are also the first history entry.
                item.CurrentCount = new ItemCountData(DateTime.Now);
                item.CurrentCount.ForwardCount = status.RepostsCount;
                item.CurrentCount.ReplyCount = status.CommentsCount;
                item.CountHistory = new ItemCountData[] { item.CurrentCount };

                WeiboUtilities.SetItemTrackingRule(item, source);

                // A retweet: convert and store the original status, then link to it.
                if (status.RetweetedStatus != null)
                {
                    Item parent = ConvertToItem(status.RetweetedStatus, source, CrawlID);
                    if (parent.ItemID != null)
                    {
                        InsertOrUpdateItem(parent);
                    }
                    item.ParentItemID = parent.ItemID;
                }
            }
            catch (Exception)
            {
                // Best effort: keep whatever was filled in before the failure.
            }

            return item;
        }
Beispiel #2
0
        /// <summary>
        /// Resets the location-refresh bookkeeping of every stored author whose
        /// <c>InternalSubscribeID</c> is not null and for which <c>WeiboUtilities.IsRedSkin</c>
        /// returns true, then prints how many authors were reset.
        /// </summary>
        public static void InitLocHistJob()
        {
            var subscribedQuery = Query.NE("InternalSubscribeID", MongoDB.Bson.BsonNull.Value);
            var authorCollection = GetCollections<Author>();
            var subscribedAuthors = authorCollection.FindAs<Author>(subscribedQuery);
            int resetCount = 0;

            foreach (var author in subscribedAuthors)
            {
                Console.WriteLine("init {0}", author.AuthorName);
                if (!WeiboUtilities.IsRedSkin(author.AuthorSource))
                {
                    continue;
                }

                resetCount++;
                author.Location_RefreshStatus = Enums.CrawlStatus.Normal;
                author.Location_LastSinceID = null;
                author.Location_NextRefreshTime = Utilities.Epoch;
                author.Location_UpdateCount = 0;
                author.Location_UpdateTime = Utilities.Epoch;

                // Names of the fields handed to UpdateDB for this author.
                string[] changedFields =
                {
                    "Location_LastSinceID",
                    "Location_NextRefreshTime",
                    "Location_UpdateCount",
                    "Location_UpdateTime",
                    "Location_RefreshStatus"
                };
                UpdateDB<Author>(author, "AuthorID", changedFields, SafeMode.True);
            }

            Console.WriteLine(resetCount);
        }
Beispiel #3
0
        /// <summary>
        /// Inserts a new author, or updates the stored information when an author with the
        /// same <c>AuthorID</c> already exists. Does nothing when
        /// <c>WeiboUtilities.ShouldFetchAuthor</c> rejects the author.
        /// </summary>
        /// <param name="author">The author to insert or update.</param>
        public static void InsertOrUpdateAuthorInfo(Author author)
        {
            if (WeiboUtilities.ShouldFetchAuthor(author))
            {
                bool alreadyStored = Exists<Author>(Query.EQ("AuthorID", author.AuthorID));
                if (!alreadyStored)
                {
                    InsertOrReplace<Author>(author, "AuthorID", SafeMode.True);
                }
                else
                {
                    UpdateUserInfo(author);
                }
            }
        }
        /// <summary>
        /// Creates and stores a new location that is known only by its coordinate.
        /// </summary>
        /// <param name="coordinate">Coordinate pair; <c>Item1</c> is stored as longitude and <c>Item2</c> as latitude.</param>
        /// <returns>The newly generated <c>PoID</c> of the stored location.</returns>
        public static string AddNewLocation(Tuple<float, float> coordinate)
        {
            var location = new Location
            {
                PoIDSource = WeiboUtilities.GetPoIDSource(null),
                CheckInCount = 0,
                Url = null,
                IntervalMins = 15,
                LocationSampleMethode = Enums.SampleMethod.All,
                // A fresh GUID (32 hex digits, no dashes) serves as the local identifier.
                PoID = Guid.NewGuid().ToString("N"),
                Radius = 600,
                RefreshStatus = Enums.CrawlStatus.Stop,
                Lon = coordinate.Item1,
                Lat = coordinate.Item2,
                CategoryID = "unknown"
            };

            LocationDBManager.AddNewLocation(location);
            return location.PoID;
        }
Beispiel #5
0
        /// <summary>
        /// Starts the <c>MainLoop</c> worker thread and then initialises the Sina city table
        /// and the POI-source white list.
        /// </summary>
        private static void InitWork()
        {
            //#region Disable the close button so the process can only be exited via a command
            //IntPtr hMenu = Process.GetCurrentProcess().MainWindowHandle;
            //IntPtr hSystemMenu = GetSystemMenu(hMenu, false);

            //EnableMenuItem(hSystemMenu, SC_CLOSE, MF_GRAYED);
            //RemoveMenu(hSystemMenu, SC_CLOSE, MF_BYCOMMAND);
            //#endregion

            // NOTE(review): the worker thread is started before the lookup tables below are
            // initialised; confirm MainLoop does not depend on them right away.
            var worker = new Thread(MainLoop);
            worker.Start();

            WeiboUtilities.InitSinaCityTable();
            WeiboUtilities.InitPOISourceWhiteList();
        }
Beispiel #6
0
        /// <summary>
        /// Initialises the crawl-related fields of an author: source, creation time, the
        /// refresh bookkeeping for posts, fans/followers and locations, and the internal
        /// subscription marker.
        /// </summary>
        /// <param name="author">The author to initialise.</param>
        /// <param name="source">The author's source category.</param>
        private static void SetAuthorCrawlInfo(Author author, Enums.AuthorSource source)
        {
            author.AuthorSource = source;
            author.CreateTime = DateTime.Now;

            // Basic info and post refresh.
            WeiboUtilities.SetAuthorPostSampleRule(author, source);
            author.UpdateTime = Utilities.Epoch;
            author.UpdateCount = 0;
            author.NextRefreshTime = Utilities.Epoch;

            // Fans and followers refresh.
            author.Fans_UpdateTime = Utilities.Epoch;
            author.Fans_UpdateCount = 0;
            author.Fans_NextRefreshTime = Utilities.Epoch;
            WeiboUtilities.SetAuthorFansAndFollowersSampleRule(author, source);

            // Location list refresh.
            author.Location_UpdateTime = Utilities.Epoch;
            author.Location_UpdateCount = 0;
            author.Location_NextRefreshTime = Utilities.Epoch;
            WeiboUtilities.SetAuthorLocationSampleRule(author, source);

            // Only these three source categories are marked with the "to be followed" id.
            var currentSource = author.AuthorSource;
            bool markForFollowing = currentSource == Enums.AuthorSource.ListedTop
                                    || currentSource == Enums.AuthorSource.Partner
                                    || currentSource == Enums.AuthorSource.PublicLeader;
            if (markForFollowing)
            {
                author.InternalSubscribeID = DefaultSettings.ToBeFollowed;
            }
            else
            {
                author.InternalSubscribeID = null;
            }
        }
        /// <summary>
        /// Creates and stores a new location from a dynamic place object and its check-in URL.
        /// </summary>
        /// <param name="place">Dynamic place object; its <c>lat</c>, <c>lon</c>, <c>title</c> and <c>poiid</c> members are read.</param>
        /// <param name="url">Check-in URL stored on the location and used to derive its <c>PoIDSource</c>.</param>
        /// <returns>A tuple of the new <c>PoID</c>, the longitude and the latitude.</returns>
        public static Tuple<string, float, float> AddNewLocation(dynamic place, string url)
        {
            var location = new Location
            {
                PoIDSource = WeiboUtilities.GetPoIDSource(url),
                Url = url,
                IntervalMins = 15,
                LocationSampleMethode = Enums.SampleMethod.All,
                PoID = Guid.NewGuid().ToString("N"),
                Radius = 600,
                RefreshStatus = Enums.CrawlStatus.Stop
            };

            // First attempt parses lat/lon as text; if that fails, fall back to a direct cast.
            // If both attempts fail the coordinate stays as it was.
            try
            {
                location.Lat = float.Parse(place.lat);
                location.Lon = float.Parse(place.lon);
            }
            catch (Exception)
            {
                try
                {
                    location.Lat = (float)place.lat;
                    location.Lon = (float)place.lon;
                }
                catch (Exception)
                {
                }
            }

            location.Title = place.title;
            location.ClientID = place.poiid;

            // Enrich with POI details when a client id is known; failures are ignored.
            try
            {
                if (location.ClientID != null)
                {
                    WeiboAPI.SetPOIInfo(location, location.ClientID);
                }
            }
            catch (Exception)
            {
            }

            LocationDBManager.AddNewLocation(location);
            return new Tuple<string, float, float>(location.PoID, location.Lon, location.Lat);
        }
        /// <summary>
        /// Worker loop that refreshes the follower and friend relationships of queued authors
        /// until <c>StopFlag</c> is set. After a successful job the next one is taken
        /// immediately; otherwise the loop sleeps for <c>IntervalMS</c>.
        /// </summary>
        /// <param name="Pipeline">Not used by this implementation.</param>
        /// <returns>"Nothing to do" when no job succeeded or failed; otherwise a summary of the success and error counts.</returns>
        public string DoOneJob(IPipeline Pipeline)
        {
            int succeeded = 0, failed = 0;
            DateTime resumeAt = Utilities.Epoch;

            while (!StopFlag)
            {
                // Only ask for a job once the wait time set by the last failure has passed.
                Author author = DateTime.Now > resumeAt ? GetNextJob() : null;
                if (author == null)
                {
                    Thread.Sleep(IntervalMS);
                    continue;
                }

                try
                {
                    // Authors that are not "red skin" are refreshed once and then stopped.
                    bool keepRefreshing = WeiboUtilities.IsRedSkin(author.AuthorSource);
                    author.Fans_RefreshStatus = keepRefreshing ? Enums.CrawlStatus.Normal : Enums.CrawlStatus.Stop;

                    // ---- Followers (fans) of the author ----
                    // NOTE(review): GetFollowers receives FollowerSampleMethode while GetFriendsIDs below
                    // receives FansSampleMethode; confirm these are not swapped.
                    var followers = new List<NetDimension.Weibo.Entities.user.Entity>();
                    try
                    {
                        SendMsg(string.Format("正在刷新{0}的粉丝", author.AuthorName));
                        WeiboAPI.GetFollowers(author.AuthorID, author.FollowerSampleMethode, followers);
                    }
                    catch (IOException)
                    {
                        failed++;
                        resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
                    }
                    catch (Exception ex)
                    {
                        SendMsg("获取粉丝列表时发生错误,见日志");
                        failed++;
                        resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
                        author.Fans_RefreshStatus = Enums.CrawlStatus.Normal;
                        Logger.Error(ex.ToString());
                    }

                    SendMsg(string.Format("{0}的粉丝抓取到{1}个,开始插入数据库", author.AuthorName, followers.Count));

                    // Running mean of the follower counts of the fetched fans.
                    double fanAverage = 0;
                    foreach (var follower in followers)
                    {
                        var fan = AuthorDBManager.ConvertToAuthor(follower, Enums.AuthorSource.FansDiscover);
                        AuthorDBManager.InsertOrUpdateAuthorInfo(fan);
                        CntData.Tick();
                        AuthorRelationDBManager.InsertOrUpdateRelation(fan.AuthorID, author.AuthorID);
                        fanAverage += (double)follower.FollowersCount / (double)followers.Count;
                    }

                    // ---- Accounts the author follows ----
                    try
                    {
                        SendMsg(string.Format("{0}的粉丝插入完成,开始获取他的关注列表", author.AuthorName));
                        IEnumerable<string> friendIds = WeiboAPI.GetFriendsIDs(author.AuthorID, author.FansSampleMethode);
                        if (friendIds != null)
                        {
                            foreach (var friendId in friendIds)
                            {
                                AuthorRelationDBManager.InsertOrUpdateRelation(friendId, author.AuthorID);
                            }
                        }
                    }
                    catch (IOException)
                    {
                        failed++;
                        resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
                    }
                    catch (Exception ex)
                    {
                        SendMsg("获取关注列表时发生错误,见日志");
                        failed++;
                        resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
                        author.Fans_RefreshStatus = Enums.CrawlStatus.Normal;
                        Logger.Error(ex.ToString());
                    }

                    SendMsg(string.Format("{0}的关系刷新任务完成", author.AuthorName));

                    author.AvgFansCountOfFans = (int)fanAverage;
                    succeeded++;
                    // Skip the sleep below; the finally block still runs first.
                    continue;
                }
                catch (Exception ex)
                {
                    failed++;
                    resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
                    author.Fans_RefreshStatus = Enums.CrawlStatus.Normal;
                    SendMsg(ex.ToString());
                    Logger.Error(ex.ToString());
                }
                finally
                {
                    // Always reschedule the author and hand the job back, whatever the outcome.
                    author.Fans_UpdateCount++;
                    author.Fans_NextRefreshTime = DateTime.Now.AddDays(author.Fans_IntervalDays);
                    AuthorDBManager.PushbackRelationshipJob(author);
                }

                Thread.Sleep(IntervalMS);
            }

            StopFlag = false;
            if (succeeded == 0 && failed == 0)
            {
                return "Nothing to do";
            }
            return string.Format("OneJob Done. Succ {0} Err {1}", succeeded, failed);
        }
Beispiel #9
0
        /// <summary>
        /// Converts a dynamically-typed status returned by Sina into the local <c>Item</c> type.
        /// </summary>
        /// <param name="source">Source category handed to the tracking-rule setup and to the conversion of a retweeted status.</param>
        /// <param name="CrawlID">Crawl identifier stored on the item and appended to the "ServerLocation" app setting to form <c>Crawler</c>.</param>
        /// <param name="status">Dynamic status object; members such as <c>mid</c>, <c>id</c>, <c>text</c>, <c>created_at</c> and <c>source</c> are read from it.</param>
        /// <param name="user">Optional dynamic user object. When given, the author fields and the item URL are taken from it.</param>
        /// <param name="author">Optional local author used when <paramref name="user"/> is null. When both are null, <c>status.user</c> is used.</param>
        /// <returns>
        /// The converted item. Exceptions raised during conversion are swallowed, so the returned
        /// item may be only partially filled.
        /// </returns>
        public static Item ConvertToItem(Enums.AuthorSource source, string CrawlID, dynamic status, dynamic user = null, Author author = null)
        {
            Item item = new Item();

            item.CrawlID = CrawlID;
            item.Crawler = ConfigurationManager.AppSettings["ServerLocation"] + CrawlID;

            // Media information.
            WeiboUtilities.SetItemMediaInfo(item);

            try
            {
                #region Crawl-task data

                item.FetchTime          = DateTime.Now;
                item.UpdateTime         = null;
                item.ContentDetailLevel = Enums.ContentDetailLevel.Weibo;

                #endregion Crawl-task data

                #region Basic data
                // The author id for the URL comes from, in order of preference:
                // the explicit user object, the local author, the user embedded in the status.
                if (user == null)
                {
                    if (author != null)
                    {
                        item.Url = Utilities.GetItemUrl(author.AuthorID, status.mid);
                    }
                    else
                    {
                        item.Url = Utilities.GetItemUrl(status.user.id, status.mid);
                    }
                }
                else
                {
                    item.Url = Utilities.GetItemUrl(user.id, status.mid);
                }
                if (item.Url == null)
                {
                    item.ItemID = null;
                }
                else
                {
                    item.ItemID = Palas.Common.Utility.MD5Helper.getMd5Hash(item.Url);
                }
                item.ClientItemID = status.id;
                item.CleanTitle   = status.text;
                item.PubDate      = Utilities.ParseToDateTime(status.created_at);
                item.Location     = null;
                string checkinUrl = null;
                try
                {
                    checkinUrl = Utilities.GetCheckInUrl(status.text);
                }
                catch (Exception) { }

                LocationDBManager.SetPoIDAndCoordinate(item, status, checkinUrl);

                if (user == null)
                {
                    if (author != null)
                    {
                        item.AuthorName         = author.AuthorName;
                        item.AuthorID           = author.AuthorID;
                        item.AuthorCertificated = author.Certification;
                        item.Source             = status.source;
                        item.AuthorImg          = author.AuthorImg;
                    }
                    else
                    {
                        item.AuthorName         = status.user.name;
                        item.AuthorID           = status.user.id;
                        item.AuthorCertificated = Utilities.GetCertificationType(status.user.verified_type, status.user.verified);
                        item.Source             = status.source;
                        item.AuthorImg          = status.user.profile_image_url;
                    }
                }
                else
                {
                    item.AuthorName         = user.name;
                    item.AuthorID           = user.id;
                    item.AuthorCertificated = Utilities.GetCertificationType(user.verified_type, user.verified);
                    item.Source             = status.source;
                    item.AuthorImg          = user.profile_image_url;
                }
                try
                {
                    item.AttachImg = status.original_pic;
                }
                catch (Exception) { }
                #endregion

                #region Item tracking
                item.CurrentCount = new ItemCountData(DateTime.Now);
                try
                {
                    // Convert.ToInt32(object) accepts both textual and numeric values, whereas
                    // int.Parse on a dynamic value only binds when the value is a string.
                    item.CurrentCount.ForwardCount = Convert.ToInt32((object)status.reposts_count);
                    item.CurrentCount.ReplyCount   = Convert.ToInt32((object)status.comments_count);
                }
                catch (Exception) { }
                item.CountHistory    = new ItemCountData[1];
                item.CountHistory[0] = item.CurrentCount;
                WeiboUtilities.SetItemTrackingRule(item, source);
                #endregion Item tracking
                try
                {
                    if (status.retweeted_status != null)
                    {
                        // Arguments must follow THIS overload's order (source, CrawlID, status).
                        // The previous (status, source, CrawlID) order matched the strongly-typed
                        // overload, which cannot bind a dynamic status at runtime; the resulting
                        // binder exception was swallowed and the parent item was never linked.
                        // The cast to object selects the overload at compile time.
                        Item tmp = ConvertToItem(source, CrawlID, (object)status.retweeted_status);
                        if (tmp.ItemID != null)
                        {
                            InsertOrUpdateItem(tmp);
                        }
                        item.ParentItemID = tmp.ItemID;
                    }
                }
                catch (Exception) { }
            }
            catch (Exception) {  }
            return(item);
        }
        /// <summary>
        /// Resolves the point-of-interest id and coordinate of an item from the status
        /// annotations, its geo information and/or its check-in URL, creating a new stored
        /// location when none is known yet.
        /// </summary>
        /// <param name="item">Item whose <c>PoIDSource</c>, <c>PoID</c>, <c>Lon</c> and <c>Lat</c> are set.</param>
        /// <param name="status">Dynamic status object; its <c>annotations</c> and <c>geo</c> members are read.</param>
        /// <param name="checkinUrl">Check-in URL extracted from the status text, or null.</param>
        public static void SetPoIDAndCoordinate(Item item, dynamic status, string checkinUrl)
        {
            item.PoIDSource = WeiboUtilities.GetPoIDSource(checkinUrl);
            if (!WeiboUtilities.IsPOISourceInWhiteList(item.PoIDSource))
            {
                item.PoID = item.PoIDSource = null;
                return;
            }

            // Case 1: an annotation carries a POI id.
            try
            {
                foreach (var anno in status.annotations)
                {
                    Tuple <string, float, float> tupe = LocationDBManager.GetPoIDAndCoordinateViaClientID(anno.place.poiid);
                    if (tupe == null)
                    {
                        tupe = AddNewLocation(anno.place, checkinUrl);
                    }
                    if (tupe != null)
                    {
                        item.PoID = tupe.Item1;
                        item.Lon  = tupe.Item2;
                        item.Lat  = tupe.Item3;
                        return;
                    }
                }
            }
            catch (Exception) { }

            #region Try to obtain a coordinate
            Tuple <float, float> coordinate = null;
            try
            {
                foreach (var anno in status.annotations)
                {
                    // Coordinate from the annotation's wpinfo.
                    coordinate = Utilities.GetCoordinateViaWPInfo(anno.wpinfo);
                }
            }
            catch (Exception) { }
            if (coordinate == null)
            {
                try
                {
                    // Coordinate from the status geo information.
                    coordinate = Utilities.GetCoordinateViaGEO(status.geo);
                }
                catch (Exception) { }
            }
            if (coordinate == null)
            {
                try
                {
                    // Coordinate parsed from the check-in link.
                    coordinate = Utilities.GetCoordinateViaUrl(checkinUrl);
                }
                catch (Exception) { }
            }
            #endregion

            if (coordinate != null)
            {
                item.Lon = coordinate.Item1;
                item.Lat = coordinate.Item2;
            }

            // Case 2: a check-in URL is available.
            if (checkinUrl != null)
            {
                var tupe = GetPoIDAndCoordinateViaUrl(checkinUrl);
                if (tupe != null)
                {
                    item.PoID = tupe.Item1;
                    item.Lon  = tupe.Item2;
                    item.Lat  = tupe.Item3;
                }
                else
                {
                    // Previously this called AddNewLocation(coordinate, checkinUrl), which binds to the
                    // (dynamic place, string url) overload. A coordinate tuple (or null) has no
                    // title/poiid members, so that call threw a runtime binder exception out of this
                    // method. Build the location from the coordinate and URL instead; the item's
                    // Lon/Lat were already set from the coordinate above when one was found.
                    item.PoID = AddNewLocationForCheckinUrl(coordinate, checkinUrl);
                }
                return;
            }

            // Case 3: only a coordinate is available.
            if (coordinate != null)
            {
                item.PoID = GetPoIDViaCoordinate(coordinate.Item1, coordinate.Item2);
                if (item.PoID == null)
                {
                    item.PoID = AddNewLocation(coordinate);
                }
                return;
            }

            // Nothing usable was found.
            item.PoID = item.PoIDSource = null;
        }

        /// <summary>
        /// Creates and stores a new location for a check-in URL, using the same defaults as
        /// the place-based <c>AddNewLocation</c> overload and the coordinate when one is known.
        /// </summary>
        /// <param name="coordinate">Longitude/latitude pair (<c>Item1</c>/<c>Item2</c>), or null when unknown.</param>
        /// <param name="checkinUrl">Check-in URL stored on the location.</param>
        /// <returns>The newly generated <c>PoID</c>.</returns>
        private static string AddNewLocationForCheckinUrl(Tuple <float, float> coordinate, string checkinUrl)
        {
            Location loc = new Location();

            loc.PoIDSource            = WeiboUtilities.GetPoIDSource(checkinUrl);
            loc.Url                   = checkinUrl;
            loc.IntervalMins          = 15;
            loc.LocationSampleMethode = Enums.SampleMethod.All;
            loc.PoID                  = Guid.NewGuid().ToString("N");
            loc.Radius                = 600;
            loc.RefreshStatus         = Enums.CrawlStatus.Stop;
            if (coordinate != null)
            {
                loc.Lon = coordinate.Item1;
                loc.Lat = coordinate.Item2;
            }
            LocationDBManager.AddNewLocation(loc);
            return(loc.PoID);
        }
        /// <summary>
        /// Worker loop that tracks reposts of queued items until <c>StopFlag</c> is set: it stores
        /// the newest reposts, appends the current repost/comment counters to the item's history
        /// and reschedules the item. After a successful job the next one is taken immediately;
        /// otherwise the loop sleeps for <c>IntervalMS</c>.
        /// </summary>
        /// <param name="Pipeline">Not used by this implementation.</param>
        /// <returns>"Nothing to do" when no job succeeded or failed; otherwise a summary of the success and error counts.</returns>
        public string DoOneJob(IPipeline Pipeline)
        {
            int succeeded = 0, failed = 0;
            DateTime resumeAt = Utilities.Epoch;

            while (!StopFlag)
            {
                // Only ask for a job once the wait time set by the last failure has passed.
                Item item = DateTime.Now > resumeAt ? GetNextJob() : null;
                if (item == null)
                {
                    Thread.Sleep(IntervalMS);
                    continue;
                }

                try
                {
                    // ---- Fetch and store the newest reposts ----
                    var reposts = new List<NetDimension.Weibo.Entities.status.Entity>();
                    try
                    {
                        WeiboAPI.GetRepostOfStatus(item, reposts);
                    }
                    catch (WeiboException ex)
                    {
                        failed++;
                        resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
                        SendMsg(ex.ToString());
                    }
                    foreach (var repost in reposts)
                    {
                        var converted = ItemDBManager.ConvertToItem(repost, Enums.AuthorSource.TopicTrack, CrawlID);
                        ItemDBManager.InsertOrUpdateItem(converted);
                    }

                    // ---- Refresh the counters and append them to the history ----
                    try
                    {
                        var latest = WeiboAPI.GetRepostAndReplyCount(item.ClientItemID);
                        item.CurrentCount.FetchTime = DateTime.Now;
                        item.CurrentCount.ForwardCount = latest.Item1;
                        item.CurrentCount.ReplyCount = latest.Item2;
                    }
                    catch (WeiboException ex)
                    {
                        failed++;
                        resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
                        SendMsg(ex.ToString());
                    }

                    List<ItemCountData> history = item.CountHistory == null
                        ? new List<ItemCountData>()
                        : new List<ItemCountData>(item.CountHistory);
                    history.Add(item.CurrentCount);
                    item.CountHistory = history.ToArray();

                    // ---- Decide whether the item keeps being tracked ----
                    item.Tracking_Forward.FollowCount++;
                    bool keepFollowing = WeiboUtilities.ShouldKeepFollow(item);
                    item.Tracking_Forward.FollowStatus = keepFollowing ? Enums.CrawlStatus.Normal : Enums.CrawlStatus.Stop;

                    succeeded++;
                    // Skip the sleep below; the finally block still runs first.
                    continue;
                }
                catch (Exception ex)
                {
                    item.Tracking_Forward.FollowStatus = Enums.CrawlStatus.Normal;
                    failed++;
                    resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
                    SendMsg(ex.ToString());
                }
                finally
                {
                    // Always reschedule the item and hand the job back, whatever the outcome.
                    item.Tracking_Forward.FollowNextTime = DateTime.Now.AddMinutes(DefaultSettings.RepostTrackingInterval.TotalMinutes);
                    ItemDBManager.PushbackRepostTrackingJob(item);
                }

                Thread.Sleep(IntervalMS);
            }

            StopFlag = false;
            if (succeeded == 0 && failed == 0)
            {
                return "Nothing to do";
            }
            return string.Format("OneJob Done. Succ {0} Err {1}", succeeded, failed);
        }
Beispiel #12
0
        /// <summary>
        /// Worker loop for tracking replies (comments) of queued items. The method announces
        /// that reply tracking is temporarily not executed and sets <c>StopFlag</c> before the
        /// loop, so as written the loop body is skipped unless the flag is cleared elsewhere
        /// before the first check.
        /// </summary>
        /// <param name="Pipeline">Not used by this implementation.</param>
        /// <returns>"Nothing to do" when no job succeeded or failed; otherwise a summary of the success and error counts.</returns>
        public string DoOneJob(IPipeline Pipeline)
        {
            SendMsg("暂时不执行回复跟踪任务");
            StopFlag = true;

            int succeeded = 0, failed = 0;
            DateTime resumeAt = Utilities.Epoch;

            while (!StopFlag)
            {
                // Only ask for a job once the wait time set by the last failure has passed.
                Item item = DateTime.Now > resumeAt ? GetNextJob() : null;
                if (item == null)
                {
                    Thread.Sleep(IntervalMS);
                    continue;
                }

                try
                {
                    // Fetch and store the newest comments.
                    var comments = new List<NetDimension.Weibo.Entities.comment.Entity>();
                    try
                    {
                        WeiboAPI.GetCommentsOfStatus(item, comments);
                    }
                    catch (WeiboException ex)
                    {
                        failed++;
                        resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
                        SendMsg(ex.ToString());
                    }
                    foreach (var comment in comments)
                    {
                        var reply = ItemReplyDBManager.ConvertToItemReply(comment);
                        ItemReplyDBManager.InsertItemReply(reply);
                    }

                    // Update the tracking counters and decide whether to keep tracking.
                    item.Tracking.ReplyCount += comments.Count;
                    item.Tracking.FollowCount++;
                    bool keepFollowing = WeiboUtilities.ShouldKeepFollow(item);
                    item.Tracking.FollowStatus = keepFollowing ? Enums.CrawlStatus.Normal : Enums.CrawlStatus.Stop;

                    succeeded++;
                    // Skip the sleep below; the finally block still runs first.
                    continue;
                }
                catch (WeiboException ex)
                {
                    item.Tracking.FollowStatus = Enums.CrawlStatus.Normal;
                    failed++;
                    resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
                    SendMsg(ex.ToString());
                }
                finally
                {
                    // Always reschedule the item and hand the job back, whatever the outcome.
                    item.Tracking.FollowNextTime = DateTime.Now.AddMinutes(DefaultSettings.ReplyTrackingInterval.TotalMinutes);
                    ItemDBManager.PushbackReplyTrackingJob(item);
                }

                Thread.Sleep(IntervalMS);
            }

            StopFlag = false;
            if (succeeded == 0 && failed == 0)
            {
                return "Nothing to do";
            }
            return string.Format("OneJob Done. Succ {0} Err {1}", succeeded, failed);
        }