/// <summary>
/// Converts a status entity returned by Sina Weibo into the local <see cref="Item"/> type.
/// </summary>
/// <param name="status">Status entity returned by the Weibo SDK.</param>
/// <param name="source">Author source; forwarded to <c>WeiboUtilities.SetItemTrackingRule</c>.</param>
/// <param name="CrawlID">Crawl identifier; stored on the item and appended to the configured "ServerLocation" value.</param>
/// <returns>
/// The populated item. Any exception raised while copying fields is swallowed, in which case
/// the item is returned with only the fields that were assigned before the failure.
/// </returns>
public static Item ConvertToItem(NetDimension.Weibo.Entities.status.Entity status, Enums.AuthorSource source, string CrawlID)
{
    var item = new Item
    {
        CrawlID = CrawlID,
        Crawler = ConfigurationManager.AppSettings["ServerLocation"] + CrawlID,
    };

    // Media information.
    WeiboUtilities.SetItemMediaInfo(item);

    // Crawl-task bookkeeping.
    item.FetchTime = DateTime.Now;
    item.UpdateTime = null;
    item.ContentDetailLevel = Enums.ContentDetailLevel.Weibo;

    try
    {
        var poster = status.User;

        // Basic data.
        item.Url = Utilities.GetItemUrl(poster.ID, status.MID);
        item.ItemID = Palas.Common.Utility.MD5Helper.getMd5Hash(item.Url);
        item.ClientItemID = status.ID;
        item.CleanTitle = status.Text;
        item.PubDate = Utilities.ParseToDateTime(status.CreatedAt);
        item.Location = null;
        item.PoID = null;
        item.AuthorName = poster.Name;
        item.AuthorID = poster.ID;
        item.AuthorImg = poster.ProfileImageUrl;
        item.AuthorCertificated = Utilities.GetCertificationType(poster.VerifiedType, poster.Verified);
        item.Source = status.Source;
        item.AttachImg = status.OriginalPictureUrl;

        // Tracking data: the current counters double as the first history entry.
        item.CurrentCount = new ItemCountData(DateTime.Now)
        {
            ForwardCount = status.RepostsCount,
            ReplyCount = status.CommentsCount,
        };
        item.CountHistory = new ItemCountData[] { item.CurrentCount };
        WeiboUtilities.SetItemTrackingRule(item, source);

        // A retweet: convert the original status too, store it when it has an ID,
        // and link this item to it.
        var retweeted = status.RetweetedStatus;
        if (retweeted != null)
        {
            Item parent = ConvertToItem(retweeted, source, CrawlID);
            if (parent.ItemID != null)
            {
                InsertOrUpdateItem(parent);
            }
            item.ParentItemID = parent.ItemID;
        }
    }
    catch (Exception)
    {
        // Best effort: return whatever was filled in before the failure.
    }

    return item;
}
/// <summary>
/// Resets the location-refresh bookkeeping of every stored author whose
/// "InternalSubscribeID" is not null and for which <c>WeiboUtilities.IsRedSkin</c>
/// returns true, then prints how many authors were reset.
/// </summary>
public static void InitLocHistJob()
{
    var subscribedOnly = Query.NE("InternalSubscribeID", MongoDB.Bson.BsonNull.Value);
    var candidates = GetCollections<Author>().FindAs<Author>(subscribedOnly);

    int resetCount = 0;
    foreach (var author in candidates)
    {
        Console.WriteLine("init {0}", author.AuthorName);
        if (!WeiboUtilities.IsRedSkin(author.AuthorSource))
        {
            continue;
        }

        resetCount++;
        author.Location_RefreshStatus = Enums.CrawlStatus.Normal;
        author.Location_LastSinceID = null;
        author.Location_NextRefreshTime = Utilities.Epoch;
        author.Location_UpdateCount = 0;
        author.Location_UpdateTime = Utilities.Epoch;

        // Names of the fields handed to UpdateDB for this author.
        var fieldsToUpdate = new string[]
        {
            "Location_LastSinceID",
            "Location_NextRefreshTime",
            "Location_UpdateCount",
            "Location_UpdateTime",
            "Location_RefreshStatus",
        };
        UpdateDB<Author>(author, "AuthorID", fieldsToUpdate, SafeMode.True);
    }

    Console.WriteLine(resetCount);
}
/// <summary>
/// Inserts a new author, or updates the stored information when an author with the
/// same "AuthorID" already exists. Does nothing when
/// <c>WeiboUtilities.ShouldFetchAuthor</c> rejects the author.
/// </summary>
/// <param name="author">The author to insert or update.</param>
public static void InsertOrUpdateAuthorInfo(Author author)
{
    if (WeiboUtilities.ShouldFetchAuthor(author))
    {
        bool alreadyStored = Exists<Author>(Query.EQ("AuthorID", author.AuthorID));
        if (!alreadyStored)
        {
            InsertOrReplace<Author>(author, "AuthorID", SafeMode.True);
            return;
        }

        UpdateUserInfo(author);
    }
}
/// <summary>
/// Creates a location record for a bare coordinate, stores it through
/// <c>LocationDBManager.AddNewLocation</c> and returns its newly generated PoID.
/// </summary>
/// <param name="coordinate">Coordinate pair; Item1 is stored as Lon and Item2 as Lat.</param>
/// <returns>The generated PoID (a GUID formatted with "N").</returns>
public static string AddNewLocation(Tuple<float, float> coordinate)
{
    var created = new Location
    {
        PoIDSource = WeiboUtilities.GetPoIDSource(null),
        CheckInCount = 0,
        Url = null,
        IntervalMins = 15,
        LocationSampleMethode = Enums.SampleMethod.All,
        PoID = Guid.NewGuid().ToString("N"),
        Radius = 600,
        RefreshStatus = Enums.CrawlStatus.Stop,
        Lon = coordinate.Item1,
        Lat = coordinate.Item2,
        CategoryID = "unknown",
    };

    LocationDBManager.AddNewLocation(created);
    return created.PoID;
}
/// <summary>
/// Starts the <c>MainLoop</c> worker thread and then initialises the Sina city table
/// and the POI source white list.
/// </summary>
private static void InitWork()
{
    // Disabled: grey out the console close button so the process can only exit via a command.
    //IntPtr hMenu = Process.GetCurrentProcess().MainWindowHandle;
    //IntPtr hSystemMenu = GetSystemMenu(hMenu, false);
    //EnableMenuItem(hSystemMenu, SC_CLOSE, MF_GRAYED);
    //RemoveMenu(hSystemMenu, SC_CLOSE, MF_BYCOMMAND);

    // NOTE(review): the worker is started before the two tables below are initialised;
    // confirm MainLoop does not read them right away.
    new Thread(MainLoop).Start();

    WeiboUtilities.InitSinaCityTable();
    WeiboUtilities.InitPOISourceWhiteList();
}
/// <summary>
/// Initialises the crawl bookkeeping of an author: source, creation time, the three
/// refresh schedules (posts, fans/followers, locations) and the internal subscription id.
/// </summary>
/// <param name="author">The author to initialise.</param>
/// <param name="source">Where the author came from; stored on the author and passed to the sampling-rule helpers.</param>
private static void SetAuthorCrawlInfo(Author author, Enums.AuthorSource source)
{
    author.AuthorSource = source;
    author.CreateTime = DateTime.Now;

    // Profile and post refresh.
    WeiboUtilities.SetAuthorPostSampleRule(author, source);
    author.UpdateTime = Utilities.Epoch;
    author.UpdateCount = 0;
    author.NextRefreshTime = Utilities.Epoch;

    // Fans and followers refresh.
    author.Fans_UpdateTime = Utilities.Epoch;
    author.Fans_UpdateCount = 0;
    author.Fans_NextRefreshTime = Utilities.Epoch;
    WeiboUtilities.SetAuthorFansAndFollowersSampleRule(author, source);

    // Location list refresh.
    author.Location_UpdateTime = Utilities.Epoch;
    author.Location_UpdateCount = 0;
    author.Location_NextRefreshTime = Utilities.Epoch;
    WeiboUtilities.SetAuthorLocationSampleRule(author, source);

    // Only these three sources are marked as "to be followed"; everything else gets no subscription.
    var storedSource = author.AuthorSource;
    bool markToBeFollowed =
        storedSource == Enums.AuthorSource.ListedTop ||
        storedSource == Enums.AuthorSource.Partner ||
        storedSource == Enums.AuthorSource.PublicLeader;

    if (markToBeFollowed)
    {
        author.InternalSubscribeID = DefaultSettings.ToBeFollowed;
    }
    else
    {
        author.InternalSubscribeID = null;
    }
}
/// <summary>
/// Creates a location record from a dynamic "place" object, stores it through
/// <c>LocationDBManager.AddNewLocation</c> and returns its PoID and coordinate.
/// </summary>
/// <param name="place">
/// Dynamic place object; the members <c>lat</c>, <c>lon</c>, <c>title</c> and <c>poiid</c> are read.
/// <c>lat</c>/<c>lon</c> may be strings or numeric values.
/// </param>
/// <param name="url">Check-in URL; stored on the location and used to derive the PoID source.</param>
/// <returns>A tuple of (PoID, Lon, Lat) for the stored location.</returns>
public static Tuple<string, float, float> AddNewLocation(dynamic place, string url)
{
    Location loc = new Location();
    loc.PoIDSource = WeiboUtilities.GetPoIDSource(url);
    loc.Url = url;
    loc.IntervalMins = 15;
    loc.LocationSampleMethode = Enums.SampleMethod.All;
    loc.PoID = Guid.NewGuid().ToString("N");
    loc.Radius = 600;
    loc.RefreshStatus = Enums.CrawlStatus.Stop;

    try
    {
        // Coordinates arrive as machine-readable text, so parse them with the invariant
        // culture; the thread's current culture may use ',' as the decimal separator.
        var invariant = System.Globalization.CultureInfo.InvariantCulture;
        loc.Lat = float.Parse(place.lat, invariant);
        loc.Lon = float.Parse(place.lon, invariant);
    }
    catch (Exception)
    {
        // Not strings (or not parseable): fall back to treating them as numeric values.
        try
        {
            loc.Lat = (float)place.lat;
            loc.Lon = (float)place.lon;
        }
        catch (Exception)
        {
            // Best effort: the coordinate keeps whatever was assigned so far.
        }
    }

    loc.Title = place.title;
    loc.ClientID = place.poiid;

    try
    {
        if (loc.ClientID != null)
        {
            WeiboAPI.SetPOIInfo(loc, loc.ClientID);
        }
    }
    catch (Exception)
    {
        // Best effort: a failed POI lookup must not prevent the location from being stored.
    }

    LocationDBManager.AddNewLocation(loc);
    return new Tuple<string, float, float>(loc.PoID, loc.Lon, loc.Lat);
}
/// <summary>
/// Relationship-refresh worker loop. Until StopFlag is set, it repeatedly takes the next
/// author from GetNextJob, fetches the author's followers and friend IDs through WeiboAPI,
/// stores the discovered authors and relations, and pushes the author back through
/// AuthorDBManager.PushbackRelationshipJob.
/// </summary>
/// <param name="Pipeline">Not referenced anywhere in this method body.</param>
/// <returns>
/// "Nothing to do" when no job succeeded or failed; otherwise a summary string with the
/// success and error counts.
/// </returns>
public string DoOneJob(IPipeline Pipeline)
{
    int SuccCount = 0, ErrCount = 0;
    // Earliest time at which the next job may be taken; pushed forward to the rate-limit reset time on errors.
    DateTime nextWorkTime = Utilities.Epoch;
    while (!StopFlag)
    {
        if (DateTime.Now > nextWorkTime)
        {
            Author author = GetNextJob();
            if (author != null)
            {
                try
                {
                    // If the author is not a "red skin" author, refresh once and then stop.
                    if (WeiboUtilities.IsRedSkin(author.AuthorSource))
                    {
                        author.Fans_RefreshStatus = Enums.CrawlStatus.Normal;
                    }
                    else
                    {
                        author.Fans_RefreshStatus = Enums.CrawlStatus.Stop;
                    }
                    #region 用户粉丝刷新
                    List <NetDimension.Weibo.Entities.user.Entity> users = new List <NetDimension.Weibo.Entities.user.Entity>();
                    try
                    {
                        SendMsg(string.Format("正在刷新{0}的粉丝", author.AuthorName));
                        WeiboAPI.GetFollowers(author.AuthorID, author.FollowerSampleMethode, users);
                    }
                    catch (IOException)
                    {
                        // Counted as an error and delays the next job, but processing of this author continues below.
                        ErrCount++;
                        nextWorkTime = WeiboAPI.rateLimitStatus.ResetTime;
                    }
                    catch (Exception ex)
                    {
                        SendMsg("获取粉丝列表时发生错误,见日志");
                        ErrCount++;
                        nextWorkTime = WeiboAPI.rateLimitStatus.ResetTime;
                        // Keep the author in Normal status so the refresh is retried later.
                        author.Fans_RefreshStatus = Enums.CrawlStatus.Normal;
                        Logger.Error(ex.ToString());
                    }
                    SendMsg(string.Format("{0}的粉丝抓取到{1}个,开始插入数据库", author.AuthorName, users.Count));
                    double avg = 0; // Average follower count over the followers fetched for this author.
                    for (int i = 0; i < users.Count; ++i)
                    {
                        var user = AuthorDBManager.ConvertToAuthor(users[i], Enums.AuthorSource.FansDiscover);
                        AuthorDBManager.InsertOrUpdateAuthorInfo(user);
                        CntData.Tick();
                        AuthorRelationDBManager.InsertOrUpdateRelation(user.AuthorID, author.AuthorID);
                        // Accumulate the mean incrementally: each follower contributes count / total.
                        avg += (double)users[i].FollowersCount / (double)users.Count;
                    }
                    #endregion
                    #region 用户关注列表
                    try
                    {
                        IEnumerable <string> friends = null;
                        SendMsg(string.Format("{0}的粉丝插入完成,开始获取他的关注列表", author.AuthorName));
                        friends = WeiboAPI.GetFriendsIDs(author.AuthorID, author.FansSampleMethode);
                        if (friends != null)
                        {
                            foreach (var user in friends)
                            {
                                // NOTE(review): same argument order as for followers above (other id first,
                                // this author second) although the direction of the relation is reversed
                                // for friends — confirm against InsertOrUpdateRelation's parameter meaning.
                                AuthorRelationDBManager.InsertOrUpdateRelation(user, author.AuthorID);
                            }
                        }
                    }
                    catch (IOException)
                    {
                        ErrCount++;
                        nextWorkTime = WeiboAPI.rateLimitStatus.ResetTime;
                    }
                    catch (Exception ex)
                    {
                        SendMsg("获取关注列表时发生错误,见日志");
                        ErrCount++;
                        nextWorkTime = WeiboAPI.rateLimitStatus.ResetTime;
                        author.Fans_RefreshStatus = Enums.CrawlStatus.Normal;
                        Logger.Error(ex.ToString());
                    }
                    SendMsg(string.Format("{0}的关系刷新任务完成", author.AuthorName));
                    #endregion
                    author.AvgFansCountOfFans = (int)avg;
                    // The job counts as a success even if one of the inner fetches above failed and was counted in ErrCount.
                    SuccCount++;
                    // Skips the Thread.Sleep below; the finally block still runs first.
                    continue;
                }
                catch (Exception ex)
                {
                    ErrCount++;
                    nextWorkTime = WeiboAPI.rateLimitStatus.ResetTime;
                    author.Fans_RefreshStatus = Enums.CrawlStatus.Normal;
                    SendMsg(ex.ToString());
                    Logger.Error(ex.ToString());
                }
                finally
                {
                    // Always record the attempt, schedule the next refresh and hand the author back.
                    author.Fans_UpdateCount++;
                    author.Fans_NextRefreshTime = DateTime.Now.AddDays(author.Fans_IntervalDays);
                    AuthorDBManager.PushbackRelationshipJob(author);
                }
            }
        }
        // Reached when it is too early to work, no job was available, or the job failed in the outer catch.
        Thread.Sleep(IntervalMS);
    }
    // Clear the flag once the loop has exited.
    StopFlag = false;
    return(SuccCount == 0 && ErrCount == 0 ? "Nothing to do" : string.Format("OneJob Done. Succ {0} Err {1}", SuccCount, ErrCount));
}
/// <summary>
/// Converts a dynamically-typed status returned by Sina Weibo into the local <see cref="Item"/> type.
/// </summary>
/// <param name="source">Author source; forwarded to <c>WeiboUtilities.SetItemTrackingRule</c>.</param>
/// <param name="CrawlID">Crawl identifier; stored on the item and appended to the configured "ServerLocation" value.</param>
/// <param name="status">
/// Dynamic status object. The members read are <c>mid</c>, <c>id</c>, <c>text</c>, <c>created_at</c>,
/// <c>source</c>, <c>original_pic</c>, <c>reposts_count</c>, <c>comments_count</c>,
/// <c>retweeted_status</c>, and <c>user</c> when neither <paramref name="user"/> nor
/// <paramref name="author"/> is supplied.
/// </param>
/// <param name="user">Optional dynamic user object; when non-null it supplies the author fields.</param>
/// <param name="author">Optional local author; supplies the author fields when <paramref name="user"/> is null.</param>
/// <returns>
/// The populated item. Exceptions raised while copying fields are swallowed, in which case the
/// item is returned with only the fields assigned before the failure.
/// </returns>
public static Item ConvertToItem(Enums.AuthorSource source, string CrawlID, dynamic status, dynamic user = null, Author author = null)
{
    Item item = new Item();
    item.CrawlID = CrawlID;
    item.Crawler = ConfigurationManager.AppSettings["ServerLocation"] + CrawlID;

    // Media information.
    WeiboUtilities.SetItemMediaInfo(item);

    try
    {
        // Crawl-task bookkeeping.
        item.FetchTime = DateTime.Now;
        item.UpdateTime = null;
        item.ContentDetailLevel = Enums.ContentDetailLevel.Weibo;

        // Basic data. The author id comes from `user`, then `author`, then the status' own user.
        if (user == null)
        {
            if (author != null)
            {
                item.Url = Utilities.GetItemUrl(author.AuthorID, status.mid);
            }
            else
            {
                item.Url = Utilities.GetItemUrl(status.user.id, status.mid);
            }
        }
        else
        {
            item.Url = Utilities.GetItemUrl(user.id, status.mid);
        }

        if (item.Url == null)
        {
            item.ItemID = null;
        }
        else
        {
            item.ItemID = Palas.Common.Utility.MD5Helper.getMd5Hash(item.Url);
        }

        item.ClientItemID = status.id;
        item.CleanTitle = status.text;
        item.PubDate = Utilities.ParseToDateTime(status.created_at);
        item.Location = null;

        string checkinUrl = null;
        try
        {
            checkinUrl = Utilities.GetCheckInUrl(status.text);
        }
        catch (Exception)
        {
            // No usable check-in URL; continue without one.
        }
        LocationDBManager.SetPoIDAndCoordinate(item, status, checkinUrl);

        if (user == null)
        {
            if (author != null)
            {
                item.AuthorName = author.AuthorName;
                item.AuthorID = author.AuthorID;
                item.AuthorCertificated = author.Certification;
                item.Source = status.source;
                item.AuthorImg = author.AuthorImg;
            }
            else
            {
                item.AuthorName = status.user.name;
                item.AuthorID = status.user.id;
                item.AuthorCertificated = Utilities.GetCertificationType(status.user.verified_type, status.user.verified);
                item.Source = status.source;
                item.AuthorImg = status.user.profile_image_url;
            }
        }
        else
        {
            item.AuthorName = user.name;
            item.AuthorID = user.id;
            item.AuthorCertificated = Utilities.GetCertificationType(user.verified_type, user.verified);
            item.Source = status.source;
            item.AuthorImg = user.profile_image_url;
        }

        try
        {
            item.AttachImg = status.original_pic;
        }
        catch (Exception)
        {
            // The status has no attached picture.
        }

        // Tracking data: the current counters double as the first history entry.
        item.CurrentCount = new ItemCountData(DateTime.Now);
        try
        {
            // The counts may arrive as strings or as numeric values; Convert.ToInt32(object)
            // accepts both, whereas int.Parse only binds when the value is a string.
            item.CurrentCount.ForwardCount = Convert.ToInt32((object)status.reposts_count);
            item.CurrentCount.ReplyCount = Convert.ToInt32((object)status.comments_count);
        }
        catch (Exception)
        {
            // Counts unavailable or not convertible; leave the defaults.
        }
        item.CountHistory = new ItemCountData[1];
        item.CountHistory[0] = item.CurrentCount;
        WeiboUtilities.SetItemTrackingRule(item, source);

        try
        {
            if (status.retweeted_status != null)
            {
                // Argument order must match this overload (source, CrawlID, status). The previous
                // (status, source, CrawlID) order matched no overload for a dynamic status, so the
                // binder exception was swallowed below and ParentItemID was never set.
                Item tmp = ConvertToItem(source, CrawlID, status.retweeted_status);
                if (tmp.ItemID != null)
                {
                    InsertOrUpdateItem(tmp);
                }
                item.ParentItemID = tmp.ItemID;
            }
        }
        catch (Exception)
        {
            // Best effort: a failure on the retweeted status must not discard this item.
        }
    }
    catch (Exception)
    {
        // Best effort: return whatever was filled in before the failure.
    }

    return item;
}
/// <summary>
/// Fills in PoIDSource, PoID, Lon and Lat on <paramref name="item"/> from the status'
/// annotations, its geo data and/or the check-in URL, creating a new location record
/// when no stored one matches.
/// </summary>
/// <param name="item">Item to update.</param>
/// <param name="status">Dynamic status object; <c>annotations</c> (with <c>place</c> / <c>wpinfo</c>) and <c>geo</c> are read.</param>
/// <param name="checkinUrl">Check-in URL extracted from the status text, or null.</param>
public static void SetPoIDAndCoordinate(Item item, dynamic status, string checkinUrl)
{
    item.PoIDSource = WeiboUtilities.GetPoIDSource(checkinUrl);
    // Sources outside the white list get neither a PoID nor a PoIDSource.
    if (!WeiboUtilities.IsPOISourceInWhiteList(item.PoIDSource))
    {
        item.PoID = item.PoIDSource = null;
        return;
    }
    // Case 1: an annotation carries a POI id.
    // Any exception (e.g. a missing annotations/place member) aborts the whole loop and
    // falls through to the coordinate-based lookups below.
    try
    {
        foreach (var anno in status.annotations)
        {
            Tuple <string, float, float> tupe = LocationDBManager.GetPoIDAndCoordinateViaClientID(anno.place.poiid);
            if (tupe == null)
            {
                tupe = AddNewLocation(anno.place, checkinUrl);
            }
            if (tupe != null)
            {
                item.PoID = tupe.Item1;
                item.Lon = tupe.Item2;
                item.Lat = tupe.Item3;
                return;
            }
        }
    }
    catch (Exception)
    {
    }
    #region 尝试获取坐标
    Tuple <float, float> coordinate = null;
    try
    {
        foreach (var anno in status.annotations)
        {
            // Get the coordinate from wpinfo.
            // NOTE(review): each iteration overwrites the previous result, so only the last
            // annotation's value is kept — confirm this is intended.
            coordinate = Utilities.GetCoordinateViaWPInfo(anno.wpinfo);
        }
    }
    catch (Exception)
    {
    }
    if (coordinate == null)
    {
        try
        {
            // Get the coordinate from the geo field.
            coordinate = Utilities.GetCoordinateViaGEO(status.geo);
        }
        catch (Exception)
        {
        }
    }
    if (coordinate == null)
    {
        try
        {
            // Try to derive the coordinate from the check-in URL.
            coordinate = Utilities.GetCoordinateViaUrl(checkinUrl);
        }
        catch (Exception)
        {
        }
    }
    #endregion
    // Item1 is written to Lon and Item2 to Lat.
    if (coordinate != null)
    {
        item.Lon = coordinate.Item1;
        item.Lat = coordinate.Item2;
    }
    // Case 2: a check-in URL is available.
    if (checkinUrl != null)
    {
        var tupe = GetPoIDAndCoordinateViaUrl(checkinUrl);
        if (tupe == null)
        {
            // NOTE(review): coordinate is a Tuple<float, float> and may be null here. If the only
            // two-argument overload is AddNewLocation(dynamic place, string url), this call binds to
            // it and the place.* member accesses would fail at runtime — verify which overload is intended.
            tupe = AddNewLocation(coordinate, checkinUrl);
        }
        if (tupe != null)
        {
            // The stored location's coordinate replaces any coordinate set above.
            item.PoID = tupe.Item1;
            item.Lon = tupe.Item2;
            item.Lat = tupe.Item3;
        }
        return;
    }
    // Case 3: only a coordinate is available.
    if (coordinate != null)
    {
        item.PoID = GetPoIDViaCoordinate(coordinate.Item1, coordinate.Item2);
        if (item.PoID == null)
        {
            item.PoID = AddNewLocation(coordinate);
        }
        return;
    }
    // Nothing available: clear both the PoID and its source.
    item.PoID = item.PoIDSource = null;
}
/// <summary>
/// Repost-tracking worker loop. Until StopFlag is set, it repeatedly takes the next item
/// from GetNextJob, stores its latest reposts, appends the current repost/reply counters
/// to the item's count history, decides whether tracking continues, and pushes the item
/// back through ItemDBManager.PushbackRepostTrackingJob.
/// </summary>
/// <param name="Pipeline">Not referenced anywhere in this method body.</param>
/// <returns>
/// "Nothing to do" when no job succeeded or failed; otherwise a summary string with the
/// success and error counts.
/// </returns>
public string DoOneJob(IPipeline Pipeline)
{
    int doneCount = 0;
    int failCount = 0;
    // Earliest time at which the next job may be taken; moved to the rate-limit reset time on errors.
    DateTime resumeAt = Utilities.Epoch;

    while (!StopFlag)
    {
        Item item = DateTime.Now > resumeAt ? GetNextJob() : null;
        if (item == null)
        {
            // Too early to work, or no job available.
            Thread.Sleep(IntervalMS);
            continue;
        }

        bool completed = false;
        try
        {
            // Fetch and store the latest reposts.
            var reposts = new List<NetDimension.Weibo.Entities.status.Entity>();
            try
            {
                WeiboAPI.GetRepostOfStatus(item, reposts);
            }
            catch (WeiboException ex)
            {
                failCount++;
                resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
                SendMsg(ex.ToString());
            }

            foreach (var repost in reposts)
            {
                var converted = ItemDBManager.ConvertToItem(repost, Enums.AuthorSource.TopicTrack, CrawlID);
                ItemDBManager.InsertOrUpdateItem(converted);
            }

            // Refresh the repost/reply counters and append them to the history.
            try
            {
                var latest = WeiboAPI.GetRepostAndReplyCount(item.ClientItemID);
                item.CurrentCount.FetchTime = DateTime.Now;
                item.CurrentCount.ForwardCount = latest.Item1;
                item.CurrentCount.ReplyCount = latest.Item2;
            }
            catch (WeiboException ex)
            {
                failCount++;
                resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
                SendMsg(ex.ToString());
            }

            var history = item.CountHistory == null
                ? new List<ItemCountData>()
                : new List<ItemCountData>(item.CountHistory);
            history.Add(item.CurrentCount);
            item.CountHistory = history.ToArray();

            item.Tracking_Forward.FollowCount++;
            bool keepFollowing = WeiboUtilities.ShouldKeepFollow(item);
            item.Tracking_Forward.FollowStatus = keepFollowing
                ? Enums.CrawlStatus.Normal
                : Enums.CrawlStatus.Stop;

            doneCount++;
            completed = true;
        }
        catch (Exception ex)
        {
            item.Tracking_Forward.FollowStatus = Enums.CrawlStatus.Normal;
            failCount++;
            resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
            SendMsg(ex.ToString());
        }
        finally
        {
            // Always schedule the next follow-up and hand the item back.
            item.Tracking_Forward.FollowNextTime = DateTime.Now.AddMinutes(DefaultSettings.RepostTrackingInterval.TotalMinutes);
            ItemDBManager.PushbackRepostTrackingJob(item);
        }

        // A completed job moves straight on to the next one; a failed one waits first.
        if (!completed)
        {
            Thread.Sleep(IntervalMS);
        }
    }

    StopFlag = false;

    if (doneCount == 0 && failCount == 0)
    {
        return "Nothing to do";
    }
    return string.Format("OneJob Done. Succ {0} Err {1}", doneCount, failCount);
}
/// <summary>
/// Reply-tracking worker loop. The method first announces that reply tracking is not being
/// executed for now and sets StopFlag to true, so the loop body is skipped unless StopFlag is
/// cleared elsewhere before the loop condition is evaluated. When the loop does run, it takes
/// the next item from GetNextJob, stores its latest comments, updates the tracking counters
/// and pushes the item back through ItemDBManager.PushbackReplyTrackingJob.
/// </summary>
/// <param name="Pipeline">Not referenced anywhere in this method body.</param>
/// <returns>
/// "Nothing to do" when no job succeeded or failed; otherwise a summary string with the
/// success and error counts.
/// </returns>
public string DoOneJob(IPipeline Pipeline)
{
    SendMsg("暂时不执行回复跟踪任务");
    StopFlag = true;

    int doneCount = 0;
    int failCount = 0;
    // Earliest time at which the next job may be taken; moved to the rate-limit reset time on errors.
    DateTime resumeAt = Utilities.Epoch;

    while (!StopFlag)
    {
        Item item = DateTime.Now > resumeAt ? GetNextJob() : null;
        if (item == null)
        {
            // Too early to work, or no job available.
            Thread.Sleep(IntervalMS);
            continue;
        }

        bool completed = false;
        try
        {
            // Fetch and store the latest comments.
            var comments = new List<NetDimension.Weibo.Entities.comment.Entity>();
            try
            {
                WeiboAPI.GetCommentsOfStatus(item, comments);
            }
            catch (WeiboException ex)
            {
                failCount++;
                resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
                SendMsg(ex.ToString());
            }

            foreach (var comment in comments)
            {
                var reply = ItemReplyDBManager.ConvertToItemReply(comment);
                ItemReplyDBManager.InsertItemReply(reply);
            }

            item.Tracking.ReplyCount += comments.Count;
            item.Tracking.FollowCount++;
            bool keepFollowing = WeiboUtilities.ShouldKeepFollow(item);
            item.Tracking.FollowStatus = keepFollowing
                ? Enums.CrawlStatus.Normal
                : Enums.CrawlStatus.Stop;

            doneCount++;
            completed = true;
        }
        catch (WeiboException ex)
        {
            // Only Weibo errors are handled here; anything else propagates after the finally block.
            item.Tracking.FollowStatus = Enums.CrawlStatus.Normal;
            failCount++;
            resumeAt = WeiboAPI.rateLimitStatus.ResetTime;
            SendMsg(ex.ToString());
        }
        finally
        {
            // Always schedule the next follow-up and hand the item back.
            item.Tracking.FollowNextTime = DateTime.Now.AddMinutes(DefaultSettings.ReplyTrackingInterval.TotalMinutes);
            ItemDBManager.PushbackReplyTrackingJob(item);
        }

        // A completed job moves straight on to the next one; a failed one waits first.
        if (!completed)
        {
            Thread.Sleep(IntervalMS);
        }
    }

    StopFlag = false;

    if (doneCount == 0 && failCount == 0)
    {
        return "Nothing to do";
    }
    return string.Format("OneJob Done. Succ {0} Err {1}", doneCount, failCount);
}