/// <summary>
/// Processes one list page: downloads the page addressed by
/// <c>Options.Value.ListUrlTpl</c> formatted with <c>data.Page</c>, collects the link of
/// every post under <c>#post-list-posts li</c>, and distributes one
/// <c>DownloadTaskData</c> per collected link, preceded by a task for the next page when
/// paging should continue.
/// </summary>
/// <remarks>
/// Paging stops as soon as a post id contained in <c>data.LastImageIds</c> is met, or when
/// the page contains no posts at all (otherwise an empty page past the end of the listing
/// would queue the following page indefinitely). Posts without a link are skipped and are
/// not recorded in <c>data.CrawledImageIds</c>.
/// </remarks>
/// <param name="data">List-page task; <c>Page</c> and <c>LastImageIds</c> are read and <c>CrawledImageIds</c> is appended to.</param>
/// <param name="ct">Cancellation token; not used by this method.</param>
/// <returns>Always <c>TaskDataExecutionResult.Complete</c>.</returns>
protected override async Task<TaskDataExecutionResult> HandleInternal(YandeListTaskData data, CancellationToken ct)
{
    var client = await GetHttpClient();
    var listUrl = string.Format(Options.Value.ListUrlTpl, data.Page);
    var html = await client.GetStringAsync(listUrl);
    var posts = new CQ(html)["#post-list-posts li"];

    var imageUrls = new List<string>();
    var sawAnyPost = false;
    var reachedKnownImage = false;

    foreach (var post in posts)
    {
        sawAnyPost = true;

        // Drop the first character of the element's id attribute and parse the rest as the numeric id.
        var id = int.Parse(post.Attributes["id"].Substring(1));
        if (data.LastImageIds?.Contains(id) == true)
        {
            reachedKnownImage = true;
            break;
        }

        var url = post.Cq().Children("a").Attr("href");
        if (string.IsNullOrEmpty(url))
        {
            // No link to download; skip this post instead of failing the whole page.
            continue;
        }

        // Protocol-relative links get an explicit scheme.
        if (url.StartsWith("//", System.StringComparison.Ordinal))
        {
            url = "https:" + url;
        }

        imageUrls.Add(url);
        data.CrawledImageIds.Add(id);
    }

    var newTaskData = new List<TaskData>();

    // Continue to the next page only if this page had posts and none of them was already known.
    if (!reachedKnownImage && sawAnyPost)
    {
        newTaskData.Add(new YandeListTaskData
        {
            Page = data.Page + 1,
            LastImageIds = data.LastImageIds
        });
    }

    if (imageUrls.Any())
    {
        // Built once for the whole page: matches every character that is invalid in a file name or path.
        var regexSearch = new string(Path.GetInvalidFileNameChars()) + new string(Path.GetInvalidPathChars());
        var invalidChars = new Regex($"[{Regex.Escape(regexSearch)}]");

        newTaskData.AddRange(imageUrls.Select(t =>
        {
            // File name is the URL-decoded last path segment with invalid characters removed.
            var filename = WebUtility.UrlDecode(t.Substring(t.LastIndexOf('/') + 1));
            return new DownloadTaskData
            {
                Url = t,
                RelativeFilename = invalidChars.Replace(filename, "")
            };
        }));
    }

    await TaskDistributor.Distribute(newTaskData);
    return TaskDataExecutionResult.Complete;
}
/// <summary>
/// Fetches the item page for <c>taskData.ItemId</c>, locates the item's description
/// document, and distributes one <c>DownloadTaskData</c> per image URL extracted from that
/// description. When a <c>DbContextProvider</c> is available, the item's
/// <c>LastCheckDt</c> is also recorded (creating the record if it does not exist).
/// </summary>
/// <remarks>
/// Each download is stored as <c>{title}/{index}_{name}</c>, where <c>title</c> is the
/// <c>data-title</c> attribute of <c>#J_Title h3</c> and <c>name</c> is the last path
/// segment of the image URL without its query string. Both parts have characters that are
/// invalid in file names removed, because they come from the downloaded page and would
/// otherwise be used verbatim as path components.
/// </remarks>
/// <param name="taskData">Task payload; only <c>ItemId</c> is read.</param>
/// <param name="ct">Cancellation token; passed to the database calls.</param>
/// <exception cref="InvalidOperationException">
/// The item page has no usable title, or no description URL could be found in it.
/// </exception>
protected override async Task HandleInternalUnstatable(TaobaoGetItemTaskData taskData, CancellationToken ct)
{
    var client = await GetHttpClient();
    var itemUrl = string.Format(Options.Value.UrlTemplate, taskData.ItemId);
    var html = await client.GetStringAsync(itemUrl);
    var cq = new CQ(html);

    var invalidNameChars = System.IO.Path.GetInvalidFileNameChars();

    // The title becomes a directory name, so strip characters that cannot appear in one.
    var rawTitle = cq["#J_Title h3"].Attr("data-title");
    var title = rawTitle == null
        ? string.Empty
        : string.Concat(rawTitle.Split(invalidNameChars)).Trim();

    // Reject a missing/empty title and dot-only titles such as "." or "..".
    if (title.Trim('.', ' ').Length == 0)
    {
        throw new InvalidOperationException($"Item {taskData.ItemId}: the item page has no usable title.");
    }

    //china url
    var match = Regex.Match(html, @"descUrl\s*\:\slocation.*(?<url>\/\/.*?)',").Groups["url"];
    //world url
    if (!match.Success)
    {
        match = Regex.Match(html, "descUrlSSL\\s*\\:\\s*\"(?<url>\\/\\/.*?)\".*").Groups["url"];
    }

    if (!match.Success)
    {
        throw new InvalidOperationException($"Item {taskData.ItemId}: no description URL was found in the item page.");
    }

    var descUrl = match.Value;
    if (descUrl.StartsWith("//", StringComparison.Ordinal))
    {
        descUrl = $"https:{descUrl}";
    }

    // The description is served as script text of the form "var desc = '...';"; pull out the quoted HTML.
    var descJsonp = await client.GetStringAsync(descUrl);
    var descHtml = Regex.Match(descJsonp, @"var\s*desc\s*=\s*'\s*(?<html>[\s\S]*)\s*'\s*;")
                   .Groups["html"].Value;
    var descCq = new CQ(descHtml);
    var urlList = await ImageUrlListExtractor.ExtractImageUrlList(descCq);

    var newTaskData = urlList.Select((t, i) =>
    {
        // Cut the query string first so a '/' inside it cannot be mistaken for the last path separator.
        var queryStart = t.IndexOf('?');
        var path = queryStart >= 0 ? t.Substring(0, queryStart) : t;
        var name = string.Concat(path.Substring(path.LastIndexOf('/') + 1).Split(invalidNameChars));

        return new DownloadTaskData
        {
            // The position prefix is part of the stored file name.
            RelativeFilename = $"{title}/{i}_{name}",
            Url = t
        };
    }).ToList();

    if (DbContextProvider != null)
    {
        var db = await DbContextProvider.Get();
        var record = await db.TaobaoItems.FirstOrDefaultAsync(t => t.ItemId == taskData.ItemId, ct);
        if (record == null)
        {
            record = new TaobaoItem { ItemId = taskData.ItemId };
            db.Add(record);
        }

        // NOTE(review): stored as local time; confirm whether consumers expect UTC.
        record.LastCheckDt = DateTime.Now;
        await db.SaveChangesAsync(ct);
    }

    await TaskDistributor.Distribute(newTaskData);
}