/// <summary>
/// Crawls one Dy2018 index page and stores every linked movie that is not yet in the database.
/// </summary>
/// <param name="indexURL">Absolute URL of the index (listing) page to download.</param>
/// <param name="movieType">
/// Category identifier of the index page. It is only written to the log messages;
/// NOTE(review): every stored movie is tagged <c>MovieType.Latest</c> regardless of this value — confirm that is intended.
/// </param>
private static void CrawlerMovieInfoFromOnline(string indexURL, int movieType)
{
    var newMovieCount = 0;
    var htmlDoc = HTTPHelper.GetHTMLByURL(indexURL);
    if (string.IsNullOrEmpty(htmlDoc))
    {
        // Nothing was downloaded, so there is nothing to parse or persist.
        LogHelper.Info($"Dy2018 Crawl {movieType.ToString()}MovieInfo got empty HTML,IndexURL:{indexURL}");
        return;
    }

    var dom = htmlParser.Parse(htmlDoc);
    var container = dom.QuerySelector("div.co_content8");
    if (container != null)
    {
        // Anchors without an href attribute are skipped instead of causing a NullReferenceException.
        var detailLinks = container.QuerySelectorAll("a")
            .Select(a => a.GetAttribute("href"))
            .Where(href => href != null && href.StartsWith("/i/", StringComparison.Ordinal));

        foreach (var href in detailLinks)
        {
            var onlineURL = "http://www.dy2018.com" + href;
            if (MovieDataContent.Movies.Any(mo => mo.OnlineUrl == onlineURL))
            {
                // Already stored; do not download the detail page again.
                continue;
            }

            var movieInfo = GetMovieInfoFromURL(onlineURL);
            if (movieInfo != null)
            {
                movieInfo.MovieType = MovieType.Latest;
                MovieDataContent.Movies.Add(movieInfo);
                newMovieCount++;
            }
        }
    }

    MovieDataContent.SaveChanges();
    LogHelper.Info($"Finish Dy2018 Crawl {movieType.ToString()}MovieInfo,New Data Count:{newMovieCount},IndexURL:{indexURL}");
}
/// <summary>
/// Downloads a page through an <see cref="HttpWebRequest"/> that has been passed to
/// <c>AddCookies</c> before being sent.
/// </summary>
/// <param name="indexURL">URL the request is created for.</param>
/// <returns>Whatever <c>HTTPHelper.GetHTML</c> returns for the prepared request.</returns>
private static string GetHTMLByHTTPWebRequest(string indexURL)
{
    var request = WebRequest.CreateHttp(indexURL);
    AddCookies(request);
    return HTTPHelper.GetHTML(request);
}
/// <summary>
/// Follows a JavaScript <c>window.location</c> redirect page, if the supplied HTML is one.
/// </summary>
/// <param name="htmlDoc">HTML returned by the first request; may be null or empty.</param>
/// <returns>
/// The HTML downloaded from the redirect target when a redirect script is found;
/// otherwise <paramref name="htmlDoc"/> unchanged.
/// </returns>
private static string GetHTMLOnJumpWebPage(string htmlDoc)
{
    if (string.IsNullOrEmpty(htmlDoc) || !htmlDoc.Contains("window.location"))
    {
        return htmlDoc;
    }

    var scriptDom = htmlParser.Parse(htmlDoc).QuerySelector("script");
    if (scriptDom == null)
    {
        // The marker text occurred outside a script element, so there is no redirect to follow.
        return htmlDoc;
    }

    // Reduce the script body (window.location= "..." + "...";) to the bare relative path.
    var relativePath = scriptDom.InnerHtml
        .Replace("window.location=", "")
        .Replace("+", "")
        .Replace("\"", "")
        .Replace(" ", "")
        .Replace(";", "");
    var tempURL = "http://www.dy2018.com" + relativePath;

    htmlDoc = HTTPHelper.GetHTMLByURL(tempURL);

    // An empty result means the follow-up request failed (the original message had the two outcomes swapped).
    LogHelper.Info($"GetHTML From JumpURL {(string.IsNullOrEmpty(htmlDoc) ? "Fail" : "Success")}!,the URL:{tempURL}");
    return htmlDoc;
}
/// <summary>
/// Starts a background task that crawls the Btdytt520 movie index page and stores
/// every linked movie that is not yet in the database.
/// </summary>
/// <remarks>
/// The task is not awaited. Any exception raised while crawling is caught and logged inside the task.
/// </remarks>
public static void CrawlHostMovieInfo()
{
    Task.Factory.StartNew(() =>
    {
        try
        {
            int newMovieCount = 0;
            var indexURL = "http://www.btdytt520.com/movie/";
            var html = HTTPHelper.GetHTMLByURL(indexURL, true);
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            var htmlDom = htmlParser.Parse(html);
            var sidebar = htmlDom.QuerySelector("div.index_Sidebar_cc");
            if (sidebar == null)
            {
                // The expected container is missing; report it explicitly instead of failing with a NullReferenceException.
                LogHelper.Info($"Btdytt520 CrawlHostMovieInfo found no div.index_Sidebar_cc,IndexURL:{indexURL}");
                return;
            }

            foreach (var anchor in sidebar.QuerySelectorAll("a"))
            {
                var href = anchor.GetAttribute("href");
                if (string.IsNullOrEmpty(href))
                {
                    // Without an href the URL would collapse to the site root, which is not a movie page.
                    continue;
                }

                var onlineURL = "http://www.btdytt520.com" + href;
                if (MovieDataContent.Movies.Any(mo => mo.OnlineUrl == onlineURL))
                {
                    continue;
                }

                var movieInfo = GetMovieInfoURL(onlineURL);
                if (movieInfo != null)
                {
                    movieInfo.MovieType = MovieType.Latest;
                    MovieDataContent.Movies.Add(movieInfo);
                    newMovieCount++;
                }
            }

            MovieDataContent.SaveChanges();
            LogHelper.Info($"Finish Btdytt520 CrawlHostMovieInfo,New Data Count:{newMovieCount}");
        }
        catch (Exception ex)
        {
            LogHelper.Error("Btdytt520 CrawlHostMovieInfo Exception", ex);
        }
    });
}
/// <summary>
/// Downloads a Dy2018 movie detail page and extracts the movie data from it.
/// </summary>
/// <param name="onlineURL">Absolute URL of the movie detail page.</param>
/// <returns>
/// A populated <c>MovieInfo</c>, or <c>null</c> when the page could not be downloaded
/// or an exception occurred while extracting the data (the exception is logged).
/// </returns>
private static MovieInfo GetMovieInfoFromURL(string onlineURL)
{
    try
    {
        var movieHTML = HTTPHelper.GetHTMLByURL(onlineURL);
        if (string.IsNullOrEmpty(movieHTML))
        {
            return null;
        }

        var movieDoc = htmlParser.Parse(movieHTML);
        var zoom = movieDoc.GetElementById("Zoom");
        var lstDownLoadURL = movieDoc.QuerySelectorAll("[bgcolor='#fdfddf']");
        var updatetime = movieDoc.QuerySelector("span.updatetime");

        // Default to the current time. Parse into a separate local because a failed
        // DateTime.TryParse sets its out argument to DateTime.MinValue.
        var pubDate = DateTime.Now;
        var updateText = updatetime?.TextContent;
        if (!string.IsNullOrEmpty(updateText))
        {
            DateTime parsedDate;
            if (DateTime.TryParse(updateText.Replace("发布时间:", ""), out parsedDate))
            {
                pubDate = parsedDate;
            }
        }

        // One entry per matched element; elements without an anchor contribute an empty string.
        var lstURL = lstDownLoadURL.Select(a => a.QuerySelector("a")?.TextContent ?? "");
        var movieName = movieDoc.QuerySelector("div.title_all")?.QuerySelector("h1");

        var movieInfo = new MovieInfo()
        {
            // movieName is null when the title element is missing; fall back to the placeholder in that case.
            MovieName = movieName?.TextContent ?? "找不到影片信息...",
            OnlineUrl = onlineURL,
            MovieIntro = zoom?.TextContent ?? "暂无介绍...",
            DownLoadURLList = string.Join(";", lstURL),
            PubDate = pubDate.Date,
            DataCreateTime = DateTime.Now,
            SoureceDomain = SoureceDomainConsts.Dy2018Domain,
        };
        return movieInfo;
    }
    catch (Exception ex)
    {
        LogHelper.Error("Dy2018 GetMovieInfoFromURL Exception", ex, new { OnloneURL = onlineURL });
        return null;
    }
}
/// <summary>
/// Starts a background task that crawls the Dy2018 home page and stores every movie
/// linked from the first three <c>div.co_content222</c> sections that is not yet in the database.
/// </summary>
/// <remarks>
/// The task is not awaited. Any exception raised while crawling is caught and logged inside the task.
/// </remarks>
public static void CrawlHotMovie()
{
    Task.Factory.StartNew(() =>
    {
        try
        {
            var newMovieCount = 0;
            LogHelper.Info("Dy2018 CrawlHotMovie Start...");

            var htmlDoc = HTTPHelper.GetHTMLByURL("http://www.dy2018.com/");
            if (!string.IsNullOrEmpty(htmlDoc))
            {
                // The home page may be a JavaScript redirect stub; fetch the real page in that case.
                htmlDoc = GetHTMLOnJumpWebPage(htmlDoc);
            }

            if (string.IsNullOrEmpty(htmlDoc))
            {
                LogHelper.Info("Dy2018 CrawlHotMovie got empty HTML");
                return;
            }

            var dom = htmlParser.Parse(htmlDoc);

            // SelectMany flattens the sections in document order and, unlike a seedless
            // Aggregate, does not throw when no section matches.
            var detailLinks = dom.QuerySelectorAll("div.co_content222")
                .Take(3)
                .SelectMany(divInfo => divInfo.QuerySelectorAll("a"))
                .Select(a => a.GetAttribute("href"))
                .Where(href => href != null && href.StartsWith("/i/", StringComparison.Ordinal));

            foreach (var href in detailLinks)
            {
                var onlineURL = "http://www.dy2018.com" + href;
                if (MovieDataContent.Movies.Any(mo => mo.OnlineUrl == onlineURL))
                {
                    continue;
                }

                var movieInfo = GetMovieInfoFromURL(onlineURL);
                if (movieInfo != null)
                {
                    movieInfo.MovieType = MovieType.Latest;
                    MovieDataContent.Movies.Add(movieInfo);
                    newMovieCount++;
                }
            }

            MovieDataContent.SaveChanges();
            LogHelper.Info($"Finish Dy2018 CrawlHotMovie,New Data Count:{newMovieCount}");
        }
        catch (Exception ex)
        {
            LogHelper.Error("Dy2018 CrawlHotMovie Exception", ex);
        }
    });
}