/// <summary>
/// Handles the ranking page: scrolls to the bottom a few times, then queues one
/// profile crawl (<c>AnalysisProfile</c>) for every ranked post found on the page.
/// </summary>
/// <param name="element">Root element supplied by the task executor; not read by this method.</param>
/// <param name="state">State of the current task; its entries are carried over to every queued profile task.</param>
/// <param name="webDriverWrapper">Wrapper exposing the WebDriver used for scrolling and element lookup.</param>
public static async Task AnalysisMain(IWebElement element, Dictionary<string, string> state, WebDriverWrapper webDriverWrapper)
{
    var regex = new Regex(@"(?<=avatar\/)(\d+)");
    var exutor = TaskExutor.Instance;

    // Scroll to the bottom repeatedly, pausing between scrolls (presumably to let lazy content load).
    for (int i = 0; i < 4; i++)
    {
        webDriverWrapper.WebDriver.ExecuteScript("window.scrollTo(0,document.body.offsetHeight)");
        // Non-blocking pause: this method is async, so do not tie up the thread with Thread.Sleep.
        await Task.Delay(2 * 1000);
    }

    var _curState = ExtensionState(state);

    // Parse the post list.
    var _elementPostList = webDriverWrapper.WebDriver.FindElements(By.CssSelector(".rank-index-box .rank-cos-item"));

    // NOTE(review): iteration starts at index 2, as in the original code; the first two
    // entries are skipped — confirm this is intentional.
    for (int i = 2; i < _elementPostList.Count; i++)
    {
        var item = _elementPostList[i];
        var imgSrc = item.FindElement(By.CssSelector(".rank-cos-bottom img")).GetAttribute("src");

        // The user id is the run of digits following "avatar/" in the avatar image URL.
        var userId = regex.Match(imgSrc ?? string.Empty);
        if (!userId.Success)
        {
            // Without an id the profile URL would be meaningless; skip this entry.
            continue;
        }

        var profileUrl = "https://bcy.net/u/" + userId.Value;

        // Each task gets its own copy of the state. The previous code called Add("profileUrl", ...)
        // on one shared dictionary, which throws ArgumentException on the second item.
        var itemState = new Dictionary<string, string>(_curState);
        itemState["profileUrl"] = profileUrl;

        var profileSpiderTask = new TaskSpider(profileUrl, itemState);
        await Task.Run(() => exutor.SetTask(profileSpiderTask, AnalysisProfile));
    }
}
/// <summary>
/// Handles a user profile page: reads the avatar and nickname, registers the user via
/// <c>GetOrSetUser</c>, and queues an <c>AnalysisPost</c> crawl for every post link on the page.
/// </summary>
/// <param name="element">Root element of the loaded profile page.</param>
/// <param name="state">State of the current task; extended with "nickName" and "avatar" for the queued post tasks.</param>
/// <param name="webDriverWrapper">Wrapper exposing the WebDriver used for scrolling.</param>
public static async Task AnalysisProfile(IWebElement element, Dictionary<string, string> state, WebDriverWrapper webDriverWrapper)
{
    // Scroll to the bottom repeatedly, pausing between scrolls (presumably to let lazy content load).
    for (int i = 0; i < 4; i++)
    {
        webDriverWrapper.WebDriver.ExecuteScript("window.scrollTo(0,document.body.offsetHeight)");
        // Non-blocking pause instead of Thread.Sleep, since this method is async.
        await Task.Delay(2 * 1000);
    }

    var profileElement = element.FindElement(By.CssSelector(".user-info"));
    var avatar = profileElement.FindElement(By.CssSelector(".user-info-top img.avatar-img")).GetAttribute("src");
    var nickName = profileElement.FindElement(By.CssSelector(".user-info-bottom .user-info-name")).GetAttribute("innerText");

    // The returned model is not needed here; the call is kept because it was made unconditionally before.
    GetOrSetUser(nickName, avatar);

    var _curState = ExtensionState(state);
    // Indexer assignment so an already-present key is overwritten instead of throwing.
    _curState["nickName"] = nickName;
    _curState["avatar"] = avatar;

    var exutor = TaskExutor.Instance;
    var contentWrap = element.FindElement(By.CssSelector(".one-fall-li-wrap"));
    var postUrlList = contentWrap.FindElements(By.CssSelector(".desc-content"));

    foreach (var postUrlElement in postUrlList)
    {
        var postUrl = postUrlElement.GetAttribute("href");
        if (string.IsNullOrEmpty(postUrl))
        {
            // Nothing to crawl for an element without a link.
            continue;
        }

        var postSpiderTask = new TaskSpider(postUrl, _curState);
        // Awaited rather than blocked on with .Wait(): same sequential ordering, no blocked thread,
        // and failures surface as the original exception instead of an AggregateException.
        await Task.Run(() => exutor.SetTask(postSpiderTask, AnalysisPost));
    }
}
/// <summary>
/// Entry point: starts a periodic proxy-list refresh, loads the categories that still lack
/// media into <c>zeroList</c>, then queues list pages 1..29 one after another.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Refresh proxyList.txt shortly after start-up and then every 50 minutes.
    // NOTE(review): the proxy API order id is hard-coded in the URL; consider moving it to configuration.
    Timer timer = new Timer(paramState =>
    {
        try
        {
            using (var wc = new WebClient())
            {
                var proxyList = wc.DownloadString("http://dps.kdlapi.com/api/getdps/?orderid=993831743973837&num=2&pt=1&ut=3&dedup=1&sep=1");
                File.WriteAllText(AppDomain.CurrentDomain.BaseDirectory + "proxyList.txt", proxyList);
            }
        }
        catch (Exception exc)
        {
            // An unhandled exception in a timer callback would terminate the process;
            // a failed refresh only means the previous proxy list stays in use.
            Console.WriteLine(exc.Message);
        }
    }, null, TimeSpan.FromMilliseconds(20), TimeSpan.FromMinutes(50));

    var dbCtx = GetDbContext();

    // Categories tagged "恋爱" that contain at least one content entry without media.
    zeroList = dbCtx.Categories
        .Where(x => x.ContentList.Any(c => c.MediaResource == null || c.MediaResource.Count == 0)
                    && x.Tags.Any(c => c.Name == "恋爱"))
        .ToList();

    // Pause until the operator presses Enter before crawling starts.
    Console.WriteLine();
    Console.ReadLine();

    TaskExutor exuctor = TaskExutor.Instance;
    for (int i = 1; i < 30; i++)
    {
        var state = new Dictionary<string, string>();
        state.Add("mainIdex", i.ToString());
        var taskSpider = new TaskSpider("http://www.weehui.com/cartoon/list/" + i, state);
        // Blocks until SetTask returns, so pages are queued strictly in order.
        Task.Run(() => exuctor.SetTask(taskSpider, ProcessListPage)).Wait();
    }

    Console.WriteLine("Press any key to exit...");
    Console.ReadLine();

    // Keep the timer reachable until here; an otherwise unreferenced Timer may be
    // garbage-collected and silently stop firing.
    GC.KeepAlive(timer);
}
/// <summary>
/// Entry point: starts a periodic proxy-list refresh and queues list pages 40..125
/// without waiting for each one, then waits for Enter before exiting.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Refresh proxyList.txt every 15 minutes (first run after 15 minutes).
    // NOTE(review): the proxy API order id is hard-coded in the URL; consider moving it to configuration.
    Timer timer = new Timer(paramState =>
    {
        try
        {
            using (var wc = new WebClient())
            {
                var proxyList = wc.DownloadString("http://dps.kdlapi.com/api/getdps/?orderid=953622430174277&num=3&pt=1&ut=3&dedup=1&sep=1");
                File.WriteAllText(AppDomain.CurrentDomain.BaseDirectory + "proxyList.txt", proxyList);
            }
        }
        catch (Exception exc)
        {
            // A failed refresh is not fatal; the previous proxy list stays in use.
            Console.WriteLine(exc.Message);
        }
    }, null, TimeSpan.FromMinutes(15), TimeSpan.FromMinutes(15));

    TaskExutor exuctor = TaskExutor.Instance;
    for (int i = 40; i < 126; i++)
    {
        var state = new Dictionary<string, string>();
        state.Add("index", i.ToString());
        var taskSpider = new TaskSpider("http://m.yookes.com/weimanhua/kbmh/page/" + i, state);
        // Fire-and-forget, as before: pages are queued without waiting for each other.
        // NOTE(review): exceptions thrown by these tasks are never observed.
        Task.Run(() => exuctor.SetTask(taskSpider, ProcessListPage));
    }

    // Announce the wait before blocking; previously the message appeared only after Enter was pressed.
    Console.WriteLine("Waiting...");
    Console.ReadLine();

    // Keep the timer reachable until here; an otherwise unreferenced Timer may be
    // garbage-collected and silently stop firing.
    GC.KeepAlive(timer);
}
/// <summary>
/// Queues the crawl of a category's chapter-list page. For every chapter found, makes sure a
/// <c>ContentEntry</c> row exists and then hands the chapter URL to <c>processContent</c>.
/// </summary>
/// <param name="categoryTitle">Name of the category the chapters belong to.</param>
/// <param name="contentTaskSpider">Task describing the chapter-list page to load.</param>
private static void processChapter(string categoryTitle, TaskSpider contentTaskSpider)
{
    TaskExutor exuctor = TaskExutor.Instance;
    var taskSpider = contentTaskSpider;

    // Fire-and-forget, as before. NOTE(review): exceptions thrown inside the callback are never observed.
    Task.Run(() => exuctor.SetTask(taskSpider, (contentWebElement, state) =>
    {
        var chapterList = contentWebElement.FindElement(By.TagName("body")).FindElements(By.CssSelector(".chapterList .item"));

        // One context for the whole page instead of a new, never-disposed one per chapter.
        var chapterDbContext = GetDbContext();
        var contentIndex = 0;

        // Chapter list.
        foreach (var chapterItem in chapterList)
        {
            var chapterTitle = chapterItem.GetAttribute("innerText");
            var chatpterItemUrl = chapterItem.FindElement(By.TagName("a")).GetAttribute("href");

            // Per-chapter copy of the incoming state plus the chapter title.
            var curState = new Dictionary<string, string>(state);
            curState.Add("chapterTitle", chapterTitle);

            // Look the content entry up once; create and save it when it does not exist yet.
            // Synchronous query: the callback is synchronous, so blocking on *.Async(...).Result gained nothing.
            var contentEntry = chapterDbContext.ContentEntry
                .FirstOrDefault(x => x.Category.Name == categoryTitle && x.Title == chapterTitle);
            if (contentEntry == null)
            {
                var currentCategory = chapterDbContext.Categories.FirstOrDefault(x => x.Name == categoryTitle);
                contentEntry = new ContentEntry
                {
                    Id = Guid.NewGuid(),
                    Category = currentCategory,
                    Title = chapterTitle,
                    CreateTime = DateTime.Now,
                    Order = contentIndex
                };
                chapterDbContext.ContentEntry.Add(contentEntry);
                chapterDbContext.SaveChanges();
            }

            var chapterTaskSpider = new TaskSpider(chatpterItemUrl, curState);
            chapterTaskSpider.SetCookie(_cookie);

            processContent(categoryTitle, chapterTitle, contentIndex, chatpterItemUrl, contentEntry.Id, chapterTaskSpider);
            contentIndex++;
        }
    }));
}
/// <summary>
/// Handles a list page: for every listed item, uploads its cover image, makes sure a category and
/// a content entry with the item's title exist, and queues the item's detail page for <c>ProcessImgPage</c>.
/// </summary>
/// <param name="obj">Root element of the loaded list page.</param>
/// <param name="State">State of the current task (not read by this method).</param>
private static void ProcessListPage(IWebElement obj, Dictionary<string, string> State)
{
    var exuctor = TaskExutor.Instance;
    Console.WriteLine(obj.FindElement(By.CssSelector("body")).GetAttribute("innerHTML"));

    var categories = obj.FindElements(By.CssSelector(".ajax-load-con.content"));
    var categoryDbContext = GetDbContext();

    foreach (var item in categories)
    {
        var nextUrl = item.FindElement(By.CssSelector("a")).GetAttribute("href");
        Console.WriteLine(item.GetAttribute("innerHTML"));

        var coverUrl = item.FindElement(By.CssSelector("li .image-item img")).GetAttribute("src");
        var categoryTitle = item.FindElement(By.CssSelector(".posts-default-title a")).GetAttribute("title").Trim();
        var creatTime = DateTime.Parse(item.FindElement(By.CssSelector(".ico-time")).GetAttribute("innerText"));

        // The cover is uploaded on every run, whether or not the category already exists.
        var imgStream = DownloadImg(coverUrl);
        var coverImgPath = "cartoon/" + categoryTitle + "/coverimg.jpg";
        QiniuTool.UploadImage(imgStream, coverImgPath).Wait();

        var dbCategory = categoryDbContext.Categories.FirstOrDefault(x => x.Name == categoryTitle);
        if (dbCategory == null)
        {
            var categoryModel = CreateCategory(categoryTitle, creatTime, coverImgPath);
            categoryDbContext.Add(categoryModel);

            // Persist the new category and load it back before it is linked to content.
            // Previously dbCategory stayed null here, and the later fallback lookup ran before
            // SaveChanges, so first-time content was created without a category.
            // NOTE(review): assumes CreateCategory stores categoryTitle as the category Name — confirm.
            categoryDbContext.SaveChanges();
            dbCategory = categoryDbContext.Categories.FirstOrDefault(x => x.Name == categoryTitle);
        }

        var dbContent = categoryDbContext.ContentEntry.FirstOrDefault(x => x.Title == categoryTitle);
        if (dbContent == null)
        {
            dbContent = CreateContent(dbCategory, categoryTitle, creatTime);
            categoryDbContext.Add(dbContent);
        }

        if (dbContent.Category == null)
        {
            // Same category the former re-query by name would have returned.
            dbContent.Category = dbCategory;
        }

        categoryDbContext.SaveChanges();

        if (!string.IsNullOrEmpty(nextUrl))
        {
            var state = new Dictionary<string, string>();
            var categoriesTaskSpider = new TaskSpider(nextUrl, state);
            exuctor.SetTask(categoriesTaskSpider, ProcessImgPage);
        }
    }
}
/// <summary>
/// Entry point: queues the bcy.net top-100 ranking page for <c>AnalysisMain</c>, blocks until the
/// queueing call returns, then waits for Enter.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // The periodic proxy-list refresh and the UpdateDAT() shortcut are disabled in this variant.
    TaskExutor taskExecutor = TaskExutor.Instance;

    var rankingState = new Dictionary<string, string>();
    var rankingPageTask = new TaskSpider("https://bcy.net/coser/toppost100", rankingState);

    var queueing = Task.Run(() => taskExecutor.SetTask(rankingPageTask, AnalysisMain));
    queueing.Wait();

    Console.WriteLine("Hello World!");
    Console.ReadLine();
}
/// <summary>
/// Handles a chapter-list page: for every chapter, creates or updates its <c>ContentEntry</c>
/// (storing the chapter URL as a <c>&lt;url:...&gt;</c> marker in <c>Content</c>) and, when the entry has no
/// media yet, queues the chapter detail page for <c>ProcessChapterDetailPage</c>.
/// </summary>
/// <param name="webElement">Root element of the loaded chapter-list page.</param>
/// <param name="state">State of the current task; must contain "cateTitle".</param>
private static void ProcessChapterPage(IWebElement webElement, Dictionary<string, string> state)
{
    var dbCtx = GetDbContext();
    var cateTitle = state["cateTitle"];

    // Looked up once; the same context and predicate were previously re-queried for every new chapter.
    var categoryData = dbCtx.Categories.FirstOrDefault(x => x.Name == cateTitle);

    var chapterList = webElement.FindElement(By.TagName("body")).FindElements(By.CssSelector(".chapterList .item"));
    var contentIndex = 0;

    foreach (var chapterItem in chapterList)
    {
        var chapterTitle = chapterItem.GetAttribute("innerText").Trim();
        var chatpterItemUrl = chapterItem.FindElement(By.TagName("a")).GetAttribute("href");

        var curState = ExtensionState(state);
        curState.Add("chapterTitle", chapterTitle);

        var urlMarker = "<url:" + chatpterItemUrl + ">";

        var currentContent = dbCtx.ContentEntry
            .FirstOrDefault(x => x.Category.Name == cateTitle && x.Title == chapterTitle);
        if (currentContent == null)
        {
            // Assign the new entry to currentContent: it is dereferenced below, and leaving it
            // null caused a NullReferenceException for every chapter seen for the first time.
            currentContent = new ContentEntry
            {
                Id = Guid.NewGuid(),
                Category = categoryData,
                Title = chapterTitle,
                CreateTime = DateTime.Now,
                Order = contentIndex,
                Content = urlMarker
            };
            dbCtx.ContentEntry.Add(currentContent);
        }
        else
        {
            currentContent.Content = urlMarker;
        }

        dbCtx.SaveChanges();

        var chapterDetailSpiderTask = new TaskSpider(chatpterItemUrl, curState);

        // Only crawl the detail page when no media has been recorded for this chapter.
        if (currentContent.MediaResource == null || !currentContent.MediaResource.Any())
        {
            Task.Run(() => TaskExutor.Instance.SetTask(chapterDetailSpiderTask, ProcessChapterDetailPage)).Wait();
        }

        contentIndex++;
    }
}
/// <summary>
/// Starts a periodic proxy-list refresh, queues list pages 1..30 one after another, then blocks
/// on console input. All of this runs inside the constructor.
/// </summary>
public SpiderWeehui()
{
    // Refresh proxyList.txt immediately and then every 25 minutes.
    // NOTE(review): the proxy API order id is hard-coded in the URL; consider moving it to configuration.
    Timer timer = new Timer(paramState =>
    {
        try
        {
            using (var wc = new WebClient())
            {
                var proxyList = wc.DownloadString("http://dps.kdlapi.com/api/getdps/?orderid=993831743973837&num=2&pt=1&ut=3&dedup=1&sep=1");
                File.WriteAllText(AppDomain.CurrentDomain.BaseDirectory + "proxyList.txt", proxyList);
            }
        }
        catch (Exception exc)
        {
            // An unhandled exception in a timer callback would terminate the process;
            // a failed refresh only means the previous proxy list stays in use.
            Console.WriteLine(exc.Message);
        }
    }, null, TimeSpan.FromMilliseconds(0), TimeSpan.FromMinutes(25));

    // NOTE(review): dbCtx is not used below (the re-crawl of entries without media that read it
    // is disabled). The call is kept in case GetDbContext has side effects — confirm and remove.
    var dbCtx = GetDbContext();
    TaskExutor exuctor = TaskExutor.Instance;

    for (int i = 1; i < 31; i++)
    {
        var state = new Dictionary<string, string>();
        state.Add("mainIdex", i.ToString());
        var taskSpider = new TaskSpider("http://www.weehui.com/cartoon/list/" + i, state);
        // Blocks until SetTask returns, so pages are queued strictly in order.
        Task.Run(() => exuctor.SetTask(taskSpider, ProcessListPage)).Wait();
    }

    Console.WriteLine("Start...");
    Console.ReadLine();

    // Keep the timer reachable until here; an otherwise unreferenced Timer may be
    // garbage-collected and silently stop firing.
    GC.KeepAlive(timer);
}
/// <summary>
/// Handles a comic list page: for every listed comic whose title is in <c>zeroList</c>, uploads the
/// cover image, creates the category row when it is missing, and queues the chapter list via
/// <c>processChapter</c>.
/// </summary>
/// <param name="webElement">Root element of the loaded list page.</param>
/// <param name="state">State of the current task; copied and extended with "cateTitle" for each comic.</param>
private static void ProcessListPage(IWebElement webElement, Dictionary<string, string> state)
{
    // The former try { ... } catch (Exception exc) { throw; } wrapper was a no-op and has been
    // removed; exceptions propagate to the caller exactly as before.
    var body = webElement.FindElement(By.TagName("body")).GetAttribute("innerHTML");
    Console.WriteLine(body);

    var conentList = webElement.FindElements(By.CssSelector(".storyCell"));

    // Comic list.
    foreach (var contentItem in conentList)
    {
        var categoryTitle = contentItem.FindElement(By.CssSelector(".rightInfo .title")).GetAttribute("innerText");

        // Only comics present in zeroList are processed.
        if (!zeroList.Any(x => x.Name == categoryTitle))
        {
            continue;
        }

        var coverUrl = contentItem.FindElement(By.CssSelector(".leftCover img")).GetAttribute("src");
        var url = contentItem.FindElement(By.CssSelector(".leftCover a")).GetAttribute("href");
        var desc = contentItem.FindElement(By.CssSelector(".rightInfo .intr")).GetAttribute("innerText");

        // Per-comic copy of the incoming state plus the category title.
        var curState = new Dictionary<string, string>(state);
        curState.Add("cateTitle", categoryTitle);

        var contentTaskSpider = new TaskSpider(url, curState);
        contentTaskSpider.SetCookie(_cookie);

        // Upload the cover image to Qiniu.
        var imgStream = DownloadImg(coverUrl);
        var coverImgPath = "cartoon/" + categoryTitle + "/coverimg.jpg";
        QiniuTool.UploadImage(imgStream, coverImgPath).Wait();

        var categoryDbContext = GetDbContext();

        // Synchronous query: this method is synchronous, so blocking on AnyAsync(...).Result gained nothing.
        var hasCategory = categoryDbContext.Categories.Any(x => x.Name == categoryTitle);
        if (!hasCategory)
        {
            categoryDbContext.Categories.Add(new Categories
            {
                Id = Guid.NewGuid(),
                CreateTime = DateTime.Now,
                Name = categoryTitle,
                Description = desc.Trim(),
                MediaResource = new List<FileEntry>
                {
                    new FileEntry
                    {
                        Id = Guid.NewGuid(),
                        ActualPath = "https://mioto.milbit.com/" + coverImgPath,
                        CreateTime = DateTime.Now,
                        Name = coverImgPath
                    }
                },
                Tags = new List<Tags>
                {
                    new Tags { Id = Guid.NewGuid(), Name = "恋爱" },
                    new Tags { Id = Guid.NewGuid(), Name = "都市" },
                    new Tags { Id = Guid.NewGuid(), Name = "Sex" },
                    new Tags { Id = Guid.NewGuid(), Name = "SpiderUplaod2" }
                }
            });
            categoryDbContext.SaveChanges();
        }

        processChapter(categoryTitle, contentTaskSpider);
    }
}
/// <summary>
/// Queues the crawl of a chapter page and, unless the content entry already has media, records the
/// URL of every image found on the page as a <c>FileEntry</c> attached to that entry. Images are
/// not downloaded here; only their URLs are saved.
/// </summary>
/// <param name="categoryTitle">Category name (currently unused by the body).</param>
/// <param name="chapterTitle">Chapter title; used to build each file entry's <c>Tag</c>.</param>
/// <param name="chapterIndex">Index of the chapter; used to build each file entry's <c>Tag</c>.</param>
/// <param name="chatpterItemUrl">Chapter page URL (currently unused by the body).</param>
/// <param name="contentEntryId">Id of the content entry the images belong to.</param>
/// <param name="chapterTaskSpider">Task describing the chapter page to load.</param>
private static void processContent(string categoryTitle, string chapterTitle, int chapterIndex, string chatpterItemUrl, Guid contentEntryId, TaskSpider chapterTaskSpider)
{
    TaskExutor chapterExcutor = TaskExutor.Instance;

    // Fire-and-forget, as before. NOTE(review): exceptions thrown inside the callback are never observed.
    Task.Run(() => chapterExcutor.SetTask(chapterTaskSpider, (detailWebElement, state) =>
    {
        var chapterItemDbContext = GetDbContext();
        var currentContentEntry = chapterItemDbContext.ContentEntry
            .Include("MediaResource")
            .FirstOrDefault(x => x.Id == contentEntryId);

        if (currentContentEntry == null)
        {
            // The entry no longer exists; there is nothing to attach images to.
            return;
        }

        if (currentContentEntry.MediaResource != null && currentContentEntry.MediaResource.Count > 0)
        {
            // Media already recorded for this chapter.
            return;
        }

        var imgIndex = 0;
        var imgFileList = new List<FileEntry>();

        // Image list.
        var contentList = detailWebElement.FindElements(By.CssSelector(".contentNovel img"));
        foreach (var item in contentList)
        {
            var src = item.GetAttribute("src");
            var original = item.GetAttribute("data-original");

            // A missing src or an inline "data:" placeholder means the real URL is in data-original.
            var imgUrl = (string.IsNullOrEmpty(src) || src.Contains("data:")) ? original : src;
            if (string.IsNullOrEmpty(imgUrl))
            {
                continue;
            }

            // No duplicate check against currentContentEntry.MediaResource: at this point it is
            // known to be null or empty, so the former check either threw or never matched.
            imgFileList.Add(new FileEntry
            {
                Id = Guid.NewGuid(),
                ActualPath = imgUrl,
                CreateTime = DateTime.Now,
                Name = "tmpimg",
                Order = imgIndex,
                Tag = chapterTitle.Trim() + "/" + chapterIndex
            });
            imgIndex++;
        }

        chapterItemDbContext.FileEntry.AddRange(imgFileList);
        currentContentEntry.MediaResource = imgFileList;
        chapterItemDbContext.SaveChanges();
    }));
}
/// <summary>
/// Handles one page of a post: records every image URL not yet attached to the post's
/// <c>ContentEntry</c>, then queues the "下一页" (next page) link, if any, for the same handler.
/// </summary>
/// <param name="obj">Root element of the loaded post page.</param>
/// <param name="State">State of the current task; an optional "imgIndex" entry holds the order offset
/// for images on this page.</param>
private static void ProcessImgPage(IWebElement obj, Dictionary<string, string> State)
{
    Console.WriteLine(obj.GetAttribute("innerHTML"));

    var imgList = obj.FindElements(By.CssSelector(".post-content .post-images-item img"));
    var title = obj.FindElement(By.CssSelector(".post-title strong")).GetAttribute("innerText").Trim();

    var dbContext = GetDbContext();
    var contentEntry = dbContext.ContentEntry
        .Include(x => x.MediaResource)
        .FirstOrDefault(x => x.Title == title);

    if (contentEntry == null)
    {
        // No entry with this title: nothing to attach images to. Previously this dereferenced null.
        Console.WriteLine("No content entry found for title: " + title);
        return;
    }

    if (contentEntry.MediaResource == null || contentEntry.MediaResource.Count == 0)
    {
        contentEntry.MediaResource = new List<FileEntry> { };
    }

    // Order offset carried over from previous pages; stays 0 when absent or not a number.
    int startOrder = 0;
    string lastImgIndex;
    if (State.TryGetValue("imgIndex", out lastImgIndex))
    {
        int.TryParse(lastImgIndex, out startOrder);
    }

    var indexImg = 0;
    foreach (var item in imgList)
    {
        Console.WriteLine("==========================");
        Console.WriteLine(item.GetAttribute("outerHTML"));
        Console.WriteLine("==========================");

        // Prefer the lazy-load attribute; fall back to src.
        var originUrl = item.GetAttribute("data-original");
        var imgUrl = string.IsNullOrEmpty(originUrl) ? item.GetAttribute("src") : originUrl;

        if (contentEntry.MediaResource.Any(x => x.ActualPath == imgUrl))
        {
            // Already recorded.
            continue;
        }

        contentEntry.MediaResource.Add(new FileEntry
        {
            Id = Guid.NewGuid(),
            ActualPath = imgUrl,
            CreateTime = DateTime.Now,
            Name = imgUrl,
            Order = indexImg + startOrder
        });
        indexImg++;
    }

    dbContext.SaveChanges();

    var nextPageList = obj.FindElements(By.CssSelector(".page-links a"));
    foreach (var item in nextPageList)
    {
        var currEleTxt = item.GetAttribute("innerText");
        if (!currEleTxt.Contains("下一页"))
        {
            continue;
        }

        var nxtPageUrl = item.GetAttribute("href");
        if (!string.IsNullOrEmpty(nxtPageUrl))
        {
            var nxtState = new Dictionary<string, string>();
            // Pass the cumulative offset. Passing only this page's count made Order values
            // collide from the third page onward.
            nxtState.Add("imgIndex", (startOrder + indexImg).ToString());
            var tSpider = new TaskSpider(nxtPageUrl, nxtState);
            var exuctor = TaskExutor.Instance;
            exuctor.SetTask(tSpider, ProcessImgPage);
        }

        // Only the first "next page" link is followed.
        break;
    }
}