/// <summary>
/// Extracts every news entry from the page and queues one follow-up request
/// per entry that carries a usable absolute URL.
/// </summary>
/// <param name="context">Data-flow context whose Selectable holds the page being parsed.</param>
/// <returns>An already-completed task; all work is done synchronously.</returns>
protected override Task ParseAsync(DataFlowContext context)
{
    var newsList = context.Selectable.SelectList(Selectors.XPath(".//div[@class='news_block']"));
    if (newsList == null)
    {
        // Nothing matched on this page; there is nothing to follow.
        return Task.CompletedTask;
    }

    foreach (var news in newsList)
    {
        var title = news.Select(Selectors.XPath(".//h2[@class='news_entry']"))?.Value;
        var url = news.Select(Selectors.XPath(".//h2[@class='news_entry']/a/@href"))?.Value;
        var summary = news.Select(Selectors.XPath(".//div[@class='entry_summary']"))?.Value;

        // Value itself may be null even when the node exists, so guard it before Replace.
        var views = news.Select(Selectors.XPath(".//span[@class='view']"))?.Value?.Replace(" 人浏览", "");

        if (string.IsNullOrWhiteSpace(url))
        {
            continue;
        }

        // A malformed href must not abort parsing of the remaining entries,
        // so skip it instead of letting the Uri constructor throw.
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
        {
            continue;
        }

        var request = context.CreateNewRequest(uri);
        request.Properties.Add("title", title);
        request.Properties.Add("url", url);
        request.Properties.Add("summary", summary);
        request.Properties.Add("views", views);
        context.AddFollowRequests(request);
    }

    return Task.CompletedTask;
}
/// <summary>
/// Handles one page of a Bilibili video listing: inserts videos that are not
/// yet in the database, queues the per-video follow-up requests (uploader, tags,
/// description, cover image) and, while new content keeps appearing, queues the
/// next listing page.
/// </summary>
/// <param name="context">
/// Data-flow context of the listing request. Its request properties "pageNo" and
/// "rid" are read to build the next-page request.
/// </param>
/// <param name="parseObj">Deserialized listing response; ignored when null or when it carries no archives.</param>
public override void OnHanlder(DataFlowContext context, BilibiliListRet parseObj)
{
    // A missing payload at any level is treated the same as an empty page.
    var archives = parseObj?.data?.archives;
    if (archives == null || archives.Length == 0)
    {
        return;
    }

    using var db = DBSet.GetCon(DBSet.SqliteDBName.Bilibili);
    int newCount = 0;

    foreach (var item in archives)
    {
        var av = new AV()
        {
            Id = item.aid,
            bvId = item.bvid,
            copyright = item.copyright,
            ctime = item.ctime,
            ctime2 = item.ctime.UnixToDateTime().ToLongDateString(),
            pic = item.pic,
            title = item.title,
            videos = item.videos,
            view = item.stat.view,
            rid = item.tid,
            UpId = item.owner.mid,
            cid = item.cid,
        };
        av.stat = item.stat;

        var existsDB = db.SingleById<AV>(av.Id);
        if (existsDB != null)
        {
            // This video was stored before, so skip it.
            Console.WriteLine($"exits {existsDB.title}");
            continue;
        }

        db.Insert(av);
        newCount++;

        // Basic uploader info is available from
        // https://api.bilibili.com/x/web-interface/card?mid=3630684&photo=1
        // No extra check here: whenever a new video appears, refresh the uploader's info.
        var request = UpProcess.CreateRquerst(av.UpId);
        context.AddFollowRequests(request);

        // Crawl tag info. It may need continual refreshing, though probably only
        // within about a month of the video's release.
        // https://api.bilibili.com/x/web-interface/view/detail/tag?aid=286927170
        request = TagProcess.CreateRquerst(av.Id);
        context.AddFollowRequests(request);

        // TODO: if the video has multiple parts, also fetch the per-part data
        // https://api.bilibili.com/x/player/pagelist?bvid=BV1wf4y1X7ka

        // Fetch the video description
        // https://api.bilibili.com/x/web-interface/archive/desc?aid=286927170
        request = DescProcess.CreateRquerst(av.Id);
        context.AddFollowRequests(request);

        // TODO: periodically refresh the video stats
        // https://api.bilibili.com/x/web-interface/archive/stat?aid=286927170
        // TODO: read comments
        // https://api.bilibili.com/x/v2/reply?pn=2&type=1&oid=286927170&sort=0
        // Paged replies to a comment
        // https://api.bilibili.com/x/v2/reply/reply?&pn=2&type=1&oid=244913305&ps=10&root=3602777175
        // TODO: fetch danmaku (bullet comments); this endpoint uses cid
        // http://comment.bilibili.com/245666614.xml

        // Crawl the cover image.
        request = ImageProcess.CreateRequest(av);
        context.AddFollowRequests(request);
    }

    // Track consecutive pages that produced no new video; any new video resets
    // the streak. (The previous condition `newCount > -1` was always true, so
    // the counter grew on every page and the reset branch was unreachable.)
    if (newCount == 0)
    {
        notfindCount++;
    }
    else
    {
        notfindCount = 0;
    }

    // Keep paging until 20 pages in a row yield nothing new.
    if (notfindCount < 20)
    {
        Console.WriteLine("getNextPage");
        var page = (int)context.Request.Properties["pageNo"];
        var tid = (int)context.Request.Properties["rid"];

        var nextPageRequest = CreateListRequest(tid, page + 1);
        context.AddFollowRequests(nextPageRequest);
    }
}
/// <summary>
/// Parses the response: makes sure the context has a selectable, runs
/// ParseAsync, then collects follow-up requests from the registered follow
/// request queriers and adds the valid ones to the context.
/// </summary>
/// <param name="context">Processing context.</param>
/// <returns>A task that completes once parsing and follow-request collection are finished.</returns>
public override async Task HandleAsync(DataFlowContext context)
{
    context.NotNull(nameof(context));
    context.Response.NotNull(nameof(context.Response));

    if (!IsValidRequest(context.Request))
    {
        Logger.LogInformation(
            $"{GetType().Name} ignore parse request {context.Request.RequestUri}, {context.Request.Hash}");
        return;
    }

    if (context.Selectable == null)
    {
        if (SelectableBuilder != null)
        {
            context.Selectable = SelectableBuilder(context);
        }
        else
        {
            var text = context.Response.ReadAsString().TrimStart();

            // HTML markup is case-insensitive (e.g. "<!doctype html>"), and the
            // check is about raw characters rather than linguistic text, so use
            // an ordinal, case-insensitive comparison.
            var looksLikeHtml =
                text.StartsWith("<!DOCTYPE html>", StringComparison.OrdinalIgnoreCase)
                || text.StartsWith("<html>", StringComparison.OrdinalIgnoreCase);

            if (looksLikeHtml)
            {
                context.Selectable = CreateHtmlSelectable(context, text);
            }
            else
            {
                try
                {
                    var token = (JObject)JsonConvert.DeserializeObject(text);
                    context.Selectable = new JsonSelectable(token);
                }
                catch
                {
                    // Deliberate best-effort fallback: anything that cannot be
                    // loaded as a JSON object is exposed as plain text.
                    context.Selectable = new TextSelectable(text);
                }
            }
        }
    }

    await ParseAsync(context);

    var requests = new List<Request>();

    if (_followRequestQueriers != null)
    {
        foreach (var followRequestQuerier in _followRequestQueriers)
        {
            var followRequests = followRequestQuerier(context);
            if (followRequests != null)
            {
                requests.AddRange(followRequests);
            }
        }
    }

    foreach (var request in requests)
    {
        if (!IsValidRequest(request))
        {
            continue;
        }

        // Force-set Owner (and Agent) here so that a user forgetting to set
        // them does not cause errors later.
        request.Owner = context.Request.Owner;
        request.Agent = context.Response.Agent;
        context.AddFollowRequests(request);
    }
}