/// <summary>
/// Builds the download record for one movie from a flat list of div nodes.
/// The nodes are downloaded from <paramref name="downloadPageUrl"/> when one is
/// given; otherwise <paramref name="downloadNodes"/> is used as-is.
/// </summary>
/// <remarks>
/// Nodes are consumed in pairs: the node at an even index supplies the resource's
/// FormatName (its InnerHtml) and the node after it supplies the links (its child
/// <c>a</c> elements, each paired by position with a child text node). A trailing
/// unpaired node still produces a resource, with an empty link list.
/// </remarks>
/// <param name="downloadPageUrl">Download page URL; null or empty means "use <paramref name="downloadNodes"/>".</param>
/// <param name="downloadNodes">Nodes to use when no download page URL is supplied.</param>
/// <param name="movieName">Copied to the result's MovieName.</param>
/// <param name="MovieId">Copied to the result's MovieId and, as text, to its id.</param>
/// <returns>
/// The populated download DTO, or null when the page could not be downloaded
/// or fewer than two nodes are available.
/// </returns>
public AinunuDownloadDTO GetMovieDownloadDetail(string downloadPageUrl, HtmlNodeCollection downloadNodes, string movieName, int MovieId)
{
    HtmlNodeCollection nodes;
    if (!string.IsNullOrEmpty(downloadPageUrl))
    {
        var helper = new CrawlerHelper();
        var html = helper.DownloadHtml(downloadPageUrl, Encoding.GetEncoding("UTF-8"));
        if (string.IsNullOrEmpty(html))
        {
            return null;
        }

        var document = new HtmlDocument();
        document.LoadHtml(html);
        nodes = document.DocumentNode.SelectNodes("/html/body/div[1]/div[1]/div[1]/div[1]/article[1]/div[1]/div");
    }
    else
    {
        nodes = downloadNodes;
    }

    if (nodes == null || nodes.Count < 2)
    {
        return null;
    }

    var resources = new List<ResourceDTO>();

    // Step by two: the first div of each pair holds the description text,
    // the second one holds the links.
    for (int i = 0; i < nodes.Count; i += 2)
    {
        HtmlNode titleNode = nodes[i];
        HtmlNode linksNode = i + 1 < nodes.Count ? nodes[i + 1] : null;

        HtmlNodeCollection anchorNodes = linksNode?.SelectNodes("child::a");
        HtmlNodeCollection textNodes = linksNode?.SelectNodes("child::text()");

        var resourceLinks = new List<ResourceLinkDTO>();
        if (anchorNodes != null)
        {
            for (int j = 0; j < anchorNodes.Count; j++)
            {
                string linkName = anchorNodes[j].InnerHtml;
                string linkUrl = anchorNodes[j].GetAttributeValue("href", "");

                // Anchors and text nodes are matched by position only; there may be
                // fewer text nodes than anchors, so guard the index instead of
                // throwing on the unmatched ones.
                string linkOther = textNodes != null && j < textNodes.Count
                    ? textNodes[j].InnerHtml
                    : "";

                if (linkUrl != null)
                {
                    resourceLinks.Add(new ResourceLinkDTO()
                    {
                        Name = linkName,
                        Type = "",
                        Url = linkUrl,
                        Others = linkOther
                    });
                }
            }
        }

        resources.Add(new ResourceDTO()
        {
            // The id is the index of the title node, i.e. 0, 2, 4, ...
            Id = i.ToString(),
            FormatName = titleNode.InnerHtml,
            ResourceLinks = resourceLinks
        });
    }

    return new AinunuDownloadDTO()
    {
        id = MovieId.ToString(),
        MovieId = MovieId,
        MovieName = movieName,
        Resources = resources,
        UpdateDate = "",
        CreateDate = DateTimeOffset.UtcNow.ToString("yyyy-MM-dd HH:mm:ss"),
        partitionKey = "download"
    };
}
/// <summary>
/// Crawls the movie list pages under <c>baseUrl + "/c/movie/"</c>, stores every
/// movie and its download record in Cosmos DB, and writes the combined results to
/// <c>Data.json</c> and <c>Download.json</c>.
/// </summary>
/// <param name="switcher">
/// When 2, runs incrementally: the existing JSON files are loaded, crawling stops
/// at the first list entry whose UrlName is already known, and the previously
/// stored entries are appended to the newly crawled ones. Any other value crawls
/// without loading existing data.
/// </param>
/// <returns>The text "grab done Total:" followed by the number of movies collected in this run.</returns>
public async Task<string> GetAinunuContent(int switcher)
{
    this.cosmosClient = new CosmosClient(EndpointUri, PrimaryKey, new CosmosClientOptions() { ApplicationName = "CosmosDBDotnetQuickstart" });

    // Create the database and both containers if they do not exist yet.
    this.database = await this.cosmosClient.CreateDatabaseIfNotExistsAsync(databaseId);
    this.detailContainer = await this.database.CreateContainerIfNotExistsAsync(detailContainerId, "/partitionKey");
    this.downloadContainer = await this.database.CreateContainerIfNotExistsAsync(downloadContainerId, "/partitionKey");

    // NOTE: this local deliberately keeps the same name as its type so that the
    // ReadJsonFile/WriteJsonFile calls below bind exactly as they did before.
    var JsonFileHelper = new JsonFileHelper();
    var tempMoives = new List<AinunuMovieDTO>();
    var tempDownloads = new List<AinunuDownloadDTO>();

    // Incremental update: load what earlier runs stored.
    if (switcher == 2)
    {
        string exsistMoives = JsonFileHelper.ReadJsonFile("Data.json");
        string exsistDownloads = JsonFileHelper.ReadJsonFile("Download.json");

        // Missing/empty content, or content that deserializes to null, must not
        // replace the empty lists: they are searched and concatenated further down.
        if (!string.IsNullOrEmpty(exsistMoives))
        {
            tempMoives = JsonConvert.DeserializeObject<List<AinunuMovieDTO>>(exsistMoives) ?? tempMoives;
        }

        if (!string.IsNullOrEmpty(exsistDownloads))
        {
            tempDownloads = JsonConvert.DeserializeObject<List<AinunuDownloadDTO>>(exsistDownloads) ?? tempDownloads;
        }
    }

    // Register the code-pages provider before asking for GB2312.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    Encoding gb2312 = Encoding.GetEncoding("GB2312");

    var movieList = new List<AinunuMovieDTO>();
    var downloadList = new List<AinunuDownloadDTO>();
    var helper = new CrawlerHelper();

    var totalPageUrl = baseUrl + "/c/movie/";
    var html = helper.DownloadHtml(totalPageUrl, gb2312);
    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(html);

    // The pager holds two numbers: the page count and the total number of movies.
    // NOTE(review): neither lookup is guarded; if the page layout changes, the
    // failure surfaces as an exception from the statements below.
    HtmlNode totalPage = document.DocumentNode.SelectSingleNode("/html/body/div[4]/div[1]/div[3]/div/ul/li[15]/span/strong[1]");
    var totalPageCount = int.Parse(totalPage.InnerText);
    HtmlNode totalMovies = document.DocumentNode.SelectSingleNode("/html/body/div[4]/div[1]/div[3]/div/ul/li[15]/span/strong[2]");

    // Starting Id: the total movie count (12286 as of 2021/1/2), or 15000 when it is not positive.
    var Id = int.Parse(totalMovies.InnerText);
    Id = Id > 0 ? Id : 15000;

    // 410 pages in total as of 2022/1/2; the count is read from the pager above.
    for (int i = 1; i <= totalPageCount; i++)
    {
        var breakFlag = false;
        var url = baseUrl + "/c/movie/list_" + i.ToString() + ".html";
        html = helper.DownloadHtml(url, gb2312);
        if (string.IsNullOrEmpty(html))
        {
            break;
        }

        HtmlNodeCollection nodes = GetMovieList(html);
        if (nodes == null)
        {
            // Treated like an empty page: stop crawling, but still persist
            // everything collected so far.
            break;
        }

        for (int j = 0; j < nodes.Count; j++)
        {
            string urlName = null;
            try
            {
                // The node lookups live inside the try so that one malformed list
                // entry is logged and skipped instead of aborting the whole run.
                HtmlNode node = nodes[j].SelectSingleNode("child::a[2]");
                HtmlNode categoryNode = nodes[j].SelectSingleNode("child::a[1]");
                HtmlNode updateTimeNode = nodes[j].SelectSingleNode("child::span");
                if (node == null || categoryNode == null || updateTimeNode == null)
                {
                    throw new InvalidOperationException("Unexpected list entry markup on " + url);
                }

                string movieCategory = categoryNode.InnerHtml;
                string movieUpdateTime = updateTimeNode.InnerHtml;
                string detaiRelativelUrl = node.GetAttributeValue("href", "");
                string movieDetailPageUrl = baseUrl + detaiRelativelUrl;
                urlName = node.InnerHtml;
                Console.WriteLine(urlName);

                // Incremental mode: the first already-known entry ends the crawl.
                if (switcher == 2 && tempMoives.Exists(x => x.UrlName == urlName))
                {
                    breakFlag = true;
                    break;
                }

                var detail = GetMovieDetail(movieDetailPageUrl, Id);
                if (detail.movie != null)
                {
                    detail.movie.UpdateDate = movieUpdateTime;
                    detail.movie.Category = movieCategory;
                    detail.movie.partitionKey = movieCategory;
                    detail.movie.UrlName = urlName;
                    await InsertMovieIntoCosmos(detail.movie);
                    movieList.Add(detail.movie);

                    var download = GetMovieDownloadDetail(detail.movie.DownloadUrl, detail.downloadNodes, detail.movie.Name, Id);
                    if (download != null)
                    {
                        downloadList.Add(download);
                        await InsertDownloadIntoCosmos(download);
                    }
                }
            }
            catch (Exception ex)
            {
                // Plain concatenation on purpose: running this text through
                // string.Format would throw if it contained '{' or '}'.
                LogHelper.Error(ex.Message + " " + urlName, ex);
            }

            // One Id per list entry, whether or not it could be processed.
            Id -= 1;
        }

        if (breakFlag)
        {
            break;
        }
    }

    // Newly crawled entries first, followed by whatever was loaded from disk.
    var newMovieList = movieList.Concat(tempMoives).ToList();
    var newDownloadList = downloadList.Concat(tempDownloads).ToList();
    JsonFileHelper.WriteJsonFile("Data.json", JsonConvert.SerializeObject(newMovieList));
    JsonFileHelper.WriteJsonFile("Download.json", JsonConvert.SerializeObject(newDownloadList));
    // Writing copies to ../FrontEnd/src/data/movie.json and download.json used to happen here and is currently disabled.

    return "grab done Total:" + movieList.Count.ToString();
}
/// <summary>
/// Downloads the page at <paramref name="recipeURL"/>, extracts name, directions,
/// rating, servings, ingredients and images from it, and saves the resulting
/// recipe through <c>DbContext</c>.
/// </summary>
/// <param name="recipeURL">Page to download; also stored on the recipe.</param>
/// <param name="recipeName">
/// Name to store. When null or empty, the name is read from the page using
/// <c>recipeNameXPath</c>.
/// </param>
/// <param name="recipeSourceID">Stored on the recipe as RecipeSourceID.</param>
/// <returns>
/// The saved recipe, or null when the page yields no HTML or
/// <c>getIngredients</c> returns null (nothing is saved in either case).
/// </returns>
/// <exception cref="NotImplementedException">
/// No name was supplied and <c>recipeNameXPath</c> matches no node on the page.
/// </exception>
public virtual Recipe CreateRecipe(string recipeURL, string recipeName, int recipeSourceID)
{
    string html = getHTML(recipeURL);
    if (string.IsNullOrEmpty(html))
    {
        return null;
    }

    var doc = new HtmlDocument();
    doc.LoadHtml(html);
    HtmlNode root = doc.DocumentNode;

    if (string.IsNullOrEmpty(recipeName))
    {
        var recipeNameNode = root.SelectSingleNode(recipeNameXPath);
        if (recipeNameNode == null)
        {
            // Exception type kept as-is so existing callers see the same failure.
            throw new NotImplementedException();
        }

        recipeName = CrawlerHelper.ChildSafeName(HttpUtility.HtmlDecode(Common.StripHTML(recipeNameNode.InnerText))).Trim();
    }

    Recipe rec = new Recipe();
    rec.RecipeSourceID = recipeSourceID;
    rec.RecipeName = recipeName;
    rec.RecipeURL = recipeURL;

    // Try the primary directions XPath first, then the alternative one.
    var directions = root.SelectNodes(directionsXPath);
    if (directions == null || !directions.Any())
    {
        directions = root.SelectNodes(directions2XPath);
    }

    // One stripped direction per line. Joining once replaces the repeated
    // "+=" on the property; after Trim() the text is the same.
    rec.Directions = directions == null
        ? string.Empty
        : string.Join("\r\n", directions.Select(d => Common.StripHTML(d.InnerText))).Trim();

    rec.Rating = getRating(root);

    var servingsNode = root.SelectSingleNode(servingsXPath) ?? root.SelectSingleNode(servings2XPath);
    if (servingsNode != null)
    {
        var servingsText = Common.StripHTML(servingsNode.InnerText);
        int servings;
        if (!int.TryParse(servingsText, out servings))
        {
            // Not a bare number: fall back to the first run of digits in the text.
            // If there is none, TryParse leaves 0, which is what gets stored.
            int.TryParse(Regex.Match(servingsText, "(\\d+)").Groups[1].Value, out servings);
        }

        rec.NumberOfServings = servings;
    }

    var ingredients = getIngredients(root);
    if (ingredients == null)
    {
        // No ingredient list: give up without saving anything.
        return null;
    }

    foreach (var ing in ingredients)
    {
        rec.RecipeIngredientMeasurements.Add(ing);
    }

    foreach (var img in getRecipeImages(root))
    {
        rec.RecipeImages.Add(img);
    }

    DbContext.Recipes.Add(rec);
    DbContext.SaveChanges();
    return rec;
}