/// <summary>
/// Sets the post sampling rule of an author. The default rule samples the
/// first 100 posts on a 7-day interval.
/// </summary>
/// <param name="author">Author whose refresh status, post sample method and interval are set.</param>
/// <param name="source">Author source that selects the rule.</param>
public static void SetAuthorPostSampleRule(Author author, Enums.AuthorSource source)
{
    // ListedTop and PublicLeader authors get all of their posts sampled.
    bool sampleAllPosts = source == Enums.AuthorSource.ListedTop
        || source == Enums.AuthorSource.PublicLeader;

    // Partner authors are refreshed as well, but keep the First100 sample method.
    bool refreshEnabled = sampleAllPosts || source == Enums.AuthorSource.Partner;

    author.RefreshStatus = refreshEnabled ? Enums.CrawlStatus.Normal : Enums.CrawlStatus.Stop;
    author.PostSampleMethode = sampleAllPosts ? Enums.SampleMethod.All : Enums.SampleMethod.First100;
    author.IntervalDays = 7;
}
/// <summary>
/// Manually adds an author to the store, identified by user ID.
/// </summary>
/// <param name="AuthorID">User ID.</param>
/// <param name="source">Author source.</param>
public static void AddNewAuthorWithID(string AuthorID, Enums.AuthorSource source)
{
    // Fetch the user from the Weibo API, convert it to an Author and hand it
    // to AuthorDBManager.InsertOrUpdateAuthorInfo.
    AuthorDBManager.InsertOrUpdateAuthorInfo(
        AuthorDBManager.ConvertToAuthor(WeiboAPI.GetAuthorInfo(AuthorID), source));
}
/// <summary>
/// Manually adds an author to the store, identified by the URL of the user's Weibo home page.
/// </summary>
/// <remarks>
/// The page is downloaded and the user ID is extracted from it with <c>userIDReg</c>
/// (capture group 1). Nothing is added when the page could not be obtained or when
/// the ID cannot be found in it.
/// </remarks>
/// <param name="url">URL of the user's Weibo home page.</param>
/// <param name="source">Author source.</param>
public static void AddNewAuthorWithUrl(string url, Enums.AuthorSource source)
{
    string html = HTMLJobProcessor.GetHTMLViaRequest(url);
    if (html == null)
    {
        // Regex.Match throws ArgumentNullException on null input; treat a missing
        // page the same way as a page without a user ID.
        return;
    }

    Match match = userIDReg.Match(html);
    if (match.Groups[1].Success)
    {
        AddNewAuthorWithID(match.Groups[1].Value, source);
    }
}
/// <summary>
/// Determines whether the given author source counts as a popular ("red") account,
/// i.e. whether it is ListedTop, Partner or PublicLeader.
/// </summary>
/// <param name="source">Author source to test.</param>
/// <returns><c>true</c> for ListedTop, Partner and PublicLeader; otherwise <c>false</c>.</returns>
public static bool IsRedSkin(Enums.AuthorSource source)
{
    return source == Enums.AuthorSource.ListedTop
        || source == Enums.AuthorSource.Partner
        || source == Enums.AuthorSource.PublicLeader;
}
/// <summary>
/// Converts the dynamic user object returned by Sina into an <see cref="Author"/>.
/// </summary>
/// <remarks>
/// The conversion is best-effort: registration fields and counters are filled in two
/// separate try blocks, and an exception in either block is swallowed, leaving the
/// fields that block had not yet assigned untouched.
/// </remarks>
/// <param name="user">Dynamic user object.</param>
/// <param name="source">Author source.</param>
/// <returns>The converted author.</returns>
public static Author ConvertToAuthor(dynamic user, Enums.AuthorSource source)
{
    Author author = new Author();

    #region Registration info
    try
    {
        author.AuthorID = user.id;
        author.AuthorName = user.screen_name; // Is this the nickname, or something else?
        author.RealName = user.name;
        author.Certification = Utilities.GetCertificationType(user.verified_type, user.verified);
        author.CertificationInfo = user.verified_reason;
        author.Gender = Utilities.GetGender(user.gender);
        author.RegisterTime = Utilities.ParseToDateTime(user.created_at);
        author.RegionID = RegionDBManager.GetRegionID(user.location);
        author.Description = user.description;
        author.AuthorImg = user.profile_image_url;
        author.Homepage = user.profile_url;

        // A homepage without a scheme is expanded to a full URL. "https:" is checked
        // explicitly because it does not contain the substring "http:"; without that
        // check an https URL would get a second host prepended.
        if (!string.IsNullOrEmpty(author.Homepage)
            && !author.Homepage.Contains("http:")
            && !author.Homepage.Contains("https:"))
        {
            if (author.Homepage.Contains('/'))
            {
                author.Homepage = "http://weibo.com/" + author.Homepage;
            }
            else
            {
                author.Homepage = "http://blog.sina.com.cn/" + author.Homepage;
            }
        }
    }
    catch (Exception)
    {
        // Best-effort: keep whatever registration fields were read before the failure.
    }
    #endregion

    #region Behavior data
    try
    {
        author.FansCount = int.Parse(user.followers_count);
        author.FollowCount = int.Parse(user.friends_count);
        author.CloseFriendsCount = int.Parse(user.bi_followers_count);
        author.PostCount = int.Parse(user.statuses_count);
    }
    catch (Exception)
    {
        // Best-effort: keep whatever counters were parsed before the failure.
    }

    // Averages start at zero, matching the strongly typed ConvertToAuthor overload.
    // The original chained assignment lacked the trailing "= 0".
    author.AvgForward = author.AvgReply = author.AvgFansCountOfFans = 0;
    #endregion

    SetAuthorCrawlInfo(author, source);
    return author;
}
/// <summary>
/// Sets the forward/reply tracking rule of a single post.
/// </summary>
/// <remarks>
/// The original note states that forwards and comments are refreshed every 15 minutes
/// by default; that interval is not set in this method. The <paramref name="source"/>
/// argument is not read here.
/// </remarks>
/// <param name="item">Post to configure.</param>
/// <param name="source">Post source.</param>
public static void SetItemTrackingRule(Item item, Enums.AuthorSource source)
{
    item.Tracking = null;
    item.Tracking_Forward = new ItemTracking();
    item.Tracking_Forward.FollowNextTime = Utilities.Epoch;

    // Follow the post only when its current counts pass the default thresholds.
    bool worthFollowing = ItemCountData.ShouldFollow(
        item.CurrentCount, -1, DefaultSettings.MinReply, DefaultSettings.MinForward);
    item.Tracking_Forward.FollowStatus = worthFollowing
        ? Enums.CrawlStatus.Normal
        : Enums.CrawlStatus.Stop;

    item.Tracking_Forward.FollowPriority = 0;
}
/// <summary>
/// Sets the fans/follow tracking rule of an author. The default rule samples the
/// first 5000 fans and followed users on a 7-day interval.
/// </summary>
/// <param name="author">Author whose fans refresh status, sample methods and interval are set.</param>
/// <param name="source">Author source that selects the refresh status.</param>
public static void SetAuthorFansAndFollowersSampleRule(Author author, Enums.AuthorSource source)
{
    // Only ListedTop, Partner and PublicLeader authors have their fans refreshed.
    bool refreshFans = source == Enums.AuthorSource.ListedTop
        || source == Enums.AuthorSource.Partner
        || source == Enums.AuthorSource.PublicLeader;

    author.Fans_RefreshStatus = refreshFans ? Enums.CrawlStatus.Normal : Enums.CrawlStatus.Stop;
    author.FollowerSampleMethode = Enums.SampleMethod.First5000;
    author.FansSampleMethode = Enums.SampleMethod.First5000;
    author.Fans_IntervalDays = 7;
}
/// <summary>
/// Initializes the crawl bookkeeping of an author: source, creation time, the three
/// refresh schedules (profile/posts, fans/follows, locations) and the internal
/// subscription ID.
/// </summary>
/// <param name="author">Author to initialize.</param>
/// <param name="source">Author source.</param>
private static void SetAuthorCrawlInfo(Author author, Enums.AuthorSource source)
{
    author.AuthorSource = source;
    author.CreateTime = DateTime.Now;

    // Profile and post refresh.
    WeiboUtilities.SetAuthorPostSampleRule(author, source);
    author.UpdateTime = Utilities.Epoch;
    author.UpdateCount = 0;
    author.NextRefreshTime = Utilities.Epoch;

    // Fans and follows refresh.
    author.Fans_UpdateTime = Utilities.Epoch;
    author.Fans_UpdateCount = 0;
    author.Fans_NextRefreshTime = Utilities.Epoch;
    WeiboUtilities.SetAuthorFansAndFollowersSampleRule(author, source);

    // Location list refresh.
    author.Location_UpdateTime = Utilities.Epoch;
    author.Location_UpdateCount = 0;
    author.Location_NextRefreshTime = Utilities.Epoch;
    WeiboUtilities.SetAuthorLocationSampleRule(author, source);

    // The source is read back from the author after the rule setters have run.
    var storedSource = author.AuthorSource;
    bool subscribe = storedSource == Enums.AuthorSource.ListedTop
        || storedSource == Enums.AuthorSource.Partner
        || storedSource == Enums.AuthorSource.PublicLeader;

    if (subscribe)
    {
        author.InternalSubscribeID = DefaultSettings.ToBeFollowed;
    }
    else
    {
        author.InternalSubscribeID = null;
    }
}
/// <summary>
/// Converts the dynamic post object returned by Sina into a local <see cref="Item"/>.
/// </summary>
/// <remarks>
/// The poster is taken from <paramref name="user"/> when it is supplied, otherwise from
/// <paramref name="author"/>, otherwise from <c>status.user</c>. The conversion is
/// best-effort: an exception while reading the post is swallowed and the partially
/// filled item is returned. A retweeted post is converted recursively, stored through
/// <c>InsertOrUpdateItem</c> when it has an ItemID, and linked via <c>ParentItemID</c>.
/// </remarks>
/// <param name="source">Post source.</param>
/// <param name="CrawlID">Crawl ID; stored on the item and appended to the configured "ServerLocation" to form the crawler name.</param>
/// <param name="status">Dynamic post object.</param>
/// <param name="user">Optional dynamic user object describing the poster.</param>
/// <param name="author">Optional local author describing the poster; used only when <paramref name="user"/> is null.</param>
/// <returns>The converted item.</returns>
public static Item ConvertToItem(Enums.AuthorSource source, string CrawlID, dynamic status, dynamic user = null, Author author = null)
{
    Item item = new Item();
    item.CrawlID = CrawlID;
    item.Crawler = ConfigurationManager.AppSettings["ServerLocation"] + CrawlID;

    // Set media info.
    WeiboUtilities.SetItemMediaInfo(item);

    try
    {
        #region Crawl task data
        item.FetchTime = DateTime.Now;
        item.UpdateTime = null;
        item.ContentDetailLevel = Enums.ContentDetailLevel.Weibo;
        #endregion

        #region Basic data
        if (user == null)
        {
            if (author != null)
            {
                item.Url = Utilities.GetItemUrl(author.AuthorID, status.mid);
            }
            else
            {
                item.Url = Utilities.GetItemUrl(status.user.id, status.mid);
            }
        }
        else
        {
            item.Url = Utilities.GetItemUrl(user.id, status.mid);
        }

        // The item ID is the MD5 hash of the URL; without a URL there is no ID.
        if (item.Url == null)
        {
            item.ItemID = null;
        }
        else
        {
            item.ItemID = Palas.Common.Utility.MD5Helper.getMd5Hash(item.Url);
        }

        item.ClientItemID = status.id;
        item.CleanTitle = status.text;
        item.PubDate = Utilities.ParseToDateTime(status.created_at);
        item.Location = null;

        string checkinUrl = null;
        try
        {
            checkinUrl = Utilities.GetCheckInUrl(status.text);
        }
        catch (Exception)
        {
            // Best-effort: continue with a null check-in URL.
        }
        LocationDBManager.SetPoIDAndCoordinate(item, status, checkinUrl);

        if (user == null)
        {
            if (author != null)
            {
                item.AuthorName = author.AuthorName;
                item.AuthorID = author.AuthorID;
                item.AuthorCertificated = author.Certification;
                item.Source = status.source;
                item.AuthorImg = author.AuthorImg;
            }
            else
            {
                item.AuthorName = status.user.name;
                item.AuthorID = status.user.id;
                item.AuthorCertificated = Utilities.GetCertificationType(status.user.verified_type, status.user.verified);
                item.Source = status.source;
                item.AuthorImg = status.user.profile_image_url;
            }
        }
        else
        {
            item.AuthorName = user.name;
            item.AuthorID = user.id;
            item.AuthorCertificated = Utilities.GetCertificationType(user.verified_type, user.verified);
            item.Source = status.source;
            item.AuthorImg = user.profile_image_url;
        }

        try
        {
            item.AttachImg = status.original_pic;
        }
        catch (Exception)
        {
            // Best-effort: the attached picture is optional.
        }
        #endregion

        #region Item tracking
        item.CurrentCount = new ItemCountData(DateTime.Now);
        try
        {
            item.CurrentCount.ForwardCount = int.Parse(status.reposts_count);
            item.CurrentCount.ReplyCount = int.Parse(status.comments_count);
        }
        catch (Exception)
        {
            // Best-effort: keep the counts that were parsed before the failure.
        }
        item.CountHistory = new ItemCountData[1];
        item.CountHistory[0] = item.CurrentCount;
        WeiboUtilities.SetItemTrackingRule(item, source);
        #endregion

        try
        {
            if (status.retweeted_status != null)
            {
                // The retweeted post is a dynamic object too, so it must go through this
                // overload with its (source, CrawlID, status) argument order. The previous
                // call used the strongly typed overload's order (status, source, CrawlID),
                // which cannot bind for a dynamic payload; the resulting exception was
                // swallowed below and ParentItemID was never set.
                Item tmp = ConvertToItem(source, CrawlID, status.retweeted_status);
                if (tmp.ItemID != null)
                {
                    InsertOrUpdateItem(tmp);
                }
                item.ParentItemID = tmp.ItemID;
            }
        }
        catch (Exception)
        {
            // Best-effort: a failure on the retweeted post must not lose this post.
        }
    }
    catch (Exception)
    {
        // Best-effort: return the item with whatever was filled in before the failure.
    }

    return item;
}
/// <summary>
/// Converts a strongly typed post returned by Sina into a local <see cref="Item"/>.
/// </summary>
/// <remarks>
/// An exception while copying the post data is swallowed and the partially filled item
/// is returned. A retweeted post is converted recursively, stored through
/// <c>InsertOrUpdateItem</c> when it has an ItemID, and linked via <c>ParentItemID</c>.
/// </remarks>
/// <param name="status">Post returned by Sina.</param>
/// <param name="source">Post source.</param>
/// <param name="CrawlID">Crawl ID; stored on the item and appended to the configured "ServerLocation" to form the crawler name.</param>
/// <returns>The converted item.</returns>
public static Item ConvertToItem(NetDimension.Weibo.Entities.status.Entity status, Enums.AuthorSource source, string CrawlID)
{
    Item result = new Item();
    result.CrawlID = CrawlID;
    result.Crawler = ConfigurationManager.AppSettings["ServerLocation"] + CrawlID;

    // Set media info.
    WeiboUtilities.SetItemMediaInfo(result);

    // Crawl task data.
    result.FetchTime = DateTime.Now;
    result.UpdateTime = null;
    result.ContentDetailLevel = Enums.ContentDetailLevel.Weibo;

    try
    {
        // Basic data.
        string itemUrl = Utilities.GetItemUrl(status.User.ID, status.MID);
        result.Url = itemUrl;
        result.ItemID = Palas.Common.Utility.MD5Helper.getMd5Hash(result.Url);
        result.ClientItemID = status.ID;
        result.CleanTitle = status.Text;
        result.PubDate = Utilities.ParseToDateTime(status.CreatedAt);
        result.Location = null;
        result.PoID = null;

        var poster = status.User;
        result.AuthorName = poster.Name;
        result.AuthorID = poster.ID;
        result.AuthorImg = poster.ProfileImageUrl;
        result.AuthorCertificated = Utilities.GetCertificationType(poster.VerifiedType, poster.Verified);
        result.Source = status.Source;
        result.AttachImg = status.OriginalPictureUrl;

        // Item tracking: the current counts are also the first history entry.
        result.CurrentCount = new ItemCountData(DateTime.Now);
        result.CurrentCount.ForwardCount = status.RepostsCount;
        result.CurrentCount.ReplyCount = status.CommentsCount;
        result.CountHistory = new ItemCountData[] { result.CurrentCount };
        WeiboUtilities.SetItemTrackingRule(result, source);

        // Retweeted post, if any.
        var retweeted = status.RetweetedStatus;
        if (retweeted != null)
        {
            Item parent = ConvertToItem(retweeted, source, CrawlID);
            if (parent.ItemID != null)
            {
                InsertOrUpdateItem(parent);
            }
            result.ParentItemID = parent.ItemID;
        }
    }
    catch (Exception)
    {
    }

    return result;
}
/// <summary>
/// Converts a strongly typed user returned by Sina into an <see cref="Author"/>.
/// </summary>
/// <param name="user">User returned by Sina.</param>
/// <param name="source">Author source.</param>
/// <returns>The converted author.</returns>
public static Author ConvertToAuthor(NetDimension.Weibo.Entities.user.Entity user, Enums.AuthorSource source)
{
    Author author = new Author();

    #region Registration info
    author.AuthorID = user.ID;
    author.AuthorName = user.ScreenName;
    author.RealName = user.Name;
    author.Certification = Utilities.GetCertificationType(user.VerifiedType, user.Verified);
    author.CertificationInfo = user.VerifiedReason;
    author.Gender = Utilities.GetGender(user.Gender);
    author.RegionID = RegionDBManager.GetRegionID(user.Location);
    author.Description = user.Description;
    author.AuthorImg = user.ProfileImageUrl;
    author.Homepage = user.ProfileUrl;

    // A homepage without a scheme is expanded to a full URL. "https:" is checked
    // explicitly because it does not contain the substring "http:"; without that
    // check an https URL would get a second host prepended.
    if (!string.IsNullOrEmpty(author.Homepage)
        && !author.Homepage.Contains("http:")
        && !author.Homepage.Contains("https:"))
    {
        if (author.Homepage.Contains('/'))
        {
            author.Homepage = "http://weibo.com/" + author.Homepage;
        }
        else
        {
            author.Homepage = "http://blog.sina.com.cn/" + author.Homepage;
        }
    }

    author.RegisterTime = Utilities.ParseToDateTime(user.CreatedAt);
    #endregion

    #region Behavior data
    author.FansCount = user.FollowersCount;
    author.FollowCount = user.FriendsCount;
    author.CloseFriendsCount = user.BIFollowersCount;
    author.PostCount = user.StatusesCount;
    // Averages start at zero.
    author.AvgForward = author.AvgReply = author.AvgFansCountOfFans = 0;
    #endregion

    SetAuthorCrawlInfo(author, source);
    return author;
}