Exemple #1
0
        /// <summary>
        /// Builds the download record of a movie from a list of HTML nodes that alternate between
        /// a "format title" node and a node whose child anchors are the download links.
        /// </summary>
        /// <param name="downloadPageUrl">
        /// URL of a dedicated download page. When non-empty the page is downloaded (UTF-8) and its
        /// nodes are used; otherwise <paramref name="downloadNodes"/> is used.
        /// </param>
        /// <param name="downloadNodes">Nodes to use when no download page URL is given.</param>
        /// <param name="movieName">Movie name copied into the result.</param>
        /// <param name="MovieId">Movie id; also used (as a string) for the record id.</param>
        /// <returns>
        /// The download record, or <c>null</c> when the page could not be downloaded or fewer than
        /// two nodes are available.
        /// </returns>
        public AinunuDownloadDTO GetMovieDownloadDetail(string downloadPageUrl, HtmlNodeCollection downloadNodes, string movieName, int MovieId)
        {
            HtmlNodeCollection nodes;

            if (!string.IsNullOrEmpty(downloadPageUrl))
            {
                var helper = new CrawlerHelper();
                var html   = helper.DownloadHtml(downloadPageUrl, Encoding.GetEncoding("UTF-8"));
                if (string.IsNullOrEmpty(html))
                {
                    return null;
                }

                var document = new HtmlDocument();
                document.LoadHtml(html);
                nodes = document.DocumentNode.SelectNodes("/html/body/div[1]/div[1]/div[1]/div[1]/article[1]/div[1]/div");
            }
            else
            {
                nodes = downloadNodes;
            }

            if (nodes == null || nodes.Count < 2)
            {
                return null;
            }

            var resources = new List<ResourceDTO>();

            // Nodes come in pairs: the first div of each pair holds the description (format name),
            // the second one holds the links. A trailing unpaired title yields an empty link list.
            for (int i = 0; i < nodes.Count; i += 2)
            {
                HtmlNode titleNode = nodes[i];
                HtmlNode linksNode = i + 1 < nodes.Count ? nodes[i + 1] : null;

                var                resourceLinks = new List<ResourceLinkDTO>();
                HtmlNodeCollection anchorNodes   = linksNode == null ? null : linksNode.SelectNodes("child::a");
                HtmlNodeCollection textNodes     = linksNode == null ? null : linksNode.SelectNodes("child::text()");

                if (anchorNodes != null)
                {
                    for (int j = 0; j < anchorNodes.Count; j++)
                    {
                        string linkUrl = anchorNodes[j].GetAttributeValue("href", "");

                        // Text nodes are matched to anchors by position only; there may be fewer
                        // text nodes than anchors, so never index past the end of the collection.
                        string linkOther = textNodes != null && j < textNodes.Count
                            ? textNodes[j].InnerHtml
                            : "";

                        if (linkUrl != null)
                        {
                            resourceLinks.Add(new ResourceLinkDTO()
                            {
                                Name   = anchorNodes[j].InnerHtml,
                                Type   = "",
                                Url    = linkUrl,
                                Others = linkOther
                            });
                        }
                    }
                }

                resources.Add(new ResourceDTO()
                {
                    Id            = i.ToString(),
                    FormatName    = titleNode.InnerHtml,
                    ResourceLinks = resourceLinks
                });
            }

            return new AinunuDownloadDTO()
            {
                id           = MovieId.ToString(),
                MovieId      = MovieId,
                MovieName    = movieName,
                Resources    = resources,
                UpdateDate   = "",
                CreateDate   = DateTimeOffset.UtcNow.ToString("yyyy-MM-dd HH:mm:ss"),
                partitionKey = "download"
            };
        }
Exemple #2
0
        /// <summary>
        /// Crawls the movie list pages under <c>baseUrl + "/c/movie/"</c>, inserts every movie and its
        /// download record into Cosmos DB, and rewrites <c>Data.json</c> and <c>Download.json</c>
        /// with the newly crawled entries followed by the previously stored ones.
        /// </summary>
        /// <param name="switcher">
        /// When 2, runs incrementally: the existing JSON files are loaded first and crawling stops
        /// at the first movie whose <c>UrlName</c> is already stored. Any other value crawls every page.
        /// </param>
        /// <returns>A status string of the form <c>"grab done Total:N"</c>, N being the number of movies crawled in this run.</returns>
        /// <exception cref="InvalidOperationException">
        /// The index page could not be downloaded or its total-page counter could not be located.
        /// </exception>
        public async Task <string> GetAinunuContent(int switcher)
        {
            this.cosmosClient = new CosmosClient(EndpointUri, PrimaryKey, new CosmosClientOptions()
            {
                ApplicationName = "CosmosDBDotnetQuickstart"
            });
            // Create the database and both containers if they do not exist yet.
            this.database          = await this.cosmosClient.CreateDatabaseIfNotExistsAsync(databaseId);
            this.detailContainer   = await this.database.CreateContainerIfNotExistsAsync(detailContainerId, "/partitionKey");
            this.downloadContainer = await this.database.CreateContainerIfNotExistsAsync(downloadContainerId, "/partitionKey");

            var jsonFiles     = new JsonFileHelper();
            var tempMoives    = new List<AinunuMovieDTO>();
            var tempDownloads = new List<AinunuDownloadDTO>();
            var incremental   = switcher == 2;

            // Incremental update: start from what was stored by the previous run.
            if (incremental)
            {
                string exsistMoives    = jsonFiles.ReadJsonFile("Data.json");
                string exsistDownloads = jsonFiles.ReadJsonFile("Download.json");

                // A missing/empty file (or a literal "null") must not leave the lists null,
                // they are enumerated below.
                if (!string.IsNullOrWhiteSpace(exsistMoives))
                {
                    tempMoives = JsonConvert.DeserializeObject<List<AinunuMovieDTO>>(exsistMoives) ?? tempMoives;
                }
                if (!string.IsNullOrWhiteSpace(exsistDownloads))
                {
                    tempDownloads = JsonConvert.DeserializeObject<List<AinunuDownloadDTO>>(exsistDownloads) ?? tempDownloads;
                }
            }

            // One O(1) lookup per crawled row instead of scanning the whole stored list.
            var knownUrlNames = new HashSet<string>(tempMoives.Select(x => x.UrlName));

            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
            Encoding gb2312 = Encoding.GetEncoding("GB2312");

            var movieList    = new List<AinunuMovieDTO>();
            var downloadList = new List<AinunuDownloadDTO>();

            var totalPageUrl = baseUrl + "/c/movie/";
            var helper       = new CrawlerHelper();
            var html         = helper.DownloadHtml(totalPageUrl, gb2312);
            if (string.IsNullOrEmpty(html))
            {
                throw new InvalidOperationException("Could not download the movie index page: " + totalPageUrl);
            }

            var document = new HtmlDocument();
            document.LoadHtml(html);

            HtmlNode totalPage = document.DocumentNode.SelectSingleNode("/html/body/div[4]/div[1]/div[3]/div/ul/li[15]/span/strong[1]");
            if (totalPage == null)
            {
                throw new InvalidOperationException("Could not locate the total page count on: " + totalPageUrl);
            }
            var totalPageCount = int.Parse(totalPage.InnerText);

            // Starting id: the total number of movies shown on the index page, counted down per row.
            // Falls back to 15000 when the counter is missing, unparseable or not positive.
            HtmlNode totalMovies = document.DocumentNode.SelectSingleNode("/html/body/div[4]/div[1]/div[3]/div/ul/li[15]/span/strong[2]");
            int      nextId;
            if (totalMovies == null || !int.TryParse(totalMovies.InnerText, out nextId) || nextId <= 0)
            {
                nextId = 15000;
            }

            for (int i = 1; i <= totalPageCount; i++)
            {
                var reachedKnownMovie = false;
                var url               = baseUrl + "/c/movie/list_" + i.ToString() + ".html";
                html = helper.DownloadHtml(url, gb2312);
                if (string.IsNullOrEmpty(html))
                {
                    break;
                }

                HtmlNodeCollection nodes = GetMovieList(html);
                if (nodes == null)
                {
                    // Nothing recognisable on this page; skip it rather than abort the whole crawl.
                    continue;
                }

                for (int j = 0; j < nodes.Count; j++)
                {
                    HtmlNode node = null;
                    try
                    {
                        // Row parsing lives inside the try so that one malformed row is logged
                        // and skipped instead of discarding everything crawled so far.
                        node = nodes[j].SelectSingleNode("child::a[2]");
                        string movieCategory      = nodes[j].SelectSingleNode("child::a[1]").InnerHtml;
                        string movieUpdateTime    = nodes[j].SelectSingleNode("child::span").InnerHtml;
                        string movieDetailPageUrl = baseUrl + node.GetAttributeValue("href", "");
                        string urlName            = node.InnerHtml;
                        Console.WriteLine(urlName);

                        if (incremental && knownUrlNames.Contains(urlName))
                        {
                            // Everything from here on was stored by a previous run.
                            reachedKnownMovie = true;
                            break;
                        }

                        var detail = GetMovieDetail(movieDetailPageUrl, nextId);
                        if (detail.movie != null)
                        {
                            detail.movie.UpdateDate   = movieUpdateTime;
                            detail.movie.Category     = movieCategory;
                            detail.movie.partitionKey = movieCategory;
                            detail.movie.UrlName      = urlName;
                            await InsertMovieIntoCosmos(detail.movie);

                            movieList.Add(detail.movie);
                            var download = GetMovieDownloadDetail(detail.movie.DownloadUrl, detail.downloadNodes, detail.movie.Name, nextId);
                            if (download != null)
                            {
                                downloadList.Add(download);
                                await InsertDownloadIntoCosmos(download);
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        // Plain concatenation: the text may contain '{' or '}', which string.Format
                        // would try to interpret as format items and throw from inside this handler.
                        LogHelper.Error(ex.Message + " " + (node?.InnerHtml ?? ""), ex);
                    }
                    nextId -= 1;
                }

                if (reachedKnownMovie)
                {
                    break;
                }
            }

            // Newly crawled entries first, previously stored ones after.
            var newMovieList    = movieList.Concat(tempMoives).ToList();
            var newDownloadList = downloadList.Concat(tempDownloads).ToList();

            jsonFiles.WriteJsonFile("Data.json", JsonConvert.SerializeObject(newMovieList));
            jsonFiles.WriteJsonFile("Download.json", JsonConvert.SerializeObject(newDownloadList));
            return "grab done Total:" + movieList.Count.ToString();
        }
Exemple #3
0
        /// <summary>
        /// Downloads a recipe page, extracts name, directions, rating, servings, ingredients and
        /// images from it, and saves the resulting <see cref="Recipe"/> through <c>DbContext</c>.
        /// </summary>
        /// <param name="recipeURL">Address of the recipe page.</param>
        /// <param name="recipeName">Recipe name; when null or empty it is read from the page via <c>recipeNameXPath</c>.</param>
        /// <param name="recipeSourceID">Source id stored on the recipe.</param>
        /// <returns>
        /// The saved recipe, or <c>null</c> when the page yields no HTML or <c>getIngredients</c> returns null.
        /// </returns>
        /// <exception cref="NotImplementedException">
        /// No name was supplied and the page has no node matching <c>recipeNameXPath</c>.
        /// </exception>
        public virtual Recipe CreateRecipe(string recipeURL, string recipeName, int recipeSourceID)
        {
            string pageHtml = getHTML(recipeURL);
            if (string.IsNullOrEmpty(pageHtml))
            {
                return null;
            }

            var page = new HtmlDocument();
            page.LoadHtml(pageHtml);
            HtmlNode root = page.DocumentNode;

            // Fall back to the name found on the page when the caller did not provide one.
            if (string.IsNullOrEmpty(recipeName))
            {
                var nameNode = root.SelectSingleNode(recipeNameXPath);
                if (nameNode == null)
                {
                    throw new NotImplementedException();
                }

                string plainName   = Common.StripHTML(nameNode.InnerText);
                string decodedName = HttpUtility.HtmlDecode(plainName);
                recipeName = CrawlerHelper.ChildSafeName(decodedName).Trim();
            }

            var recipe = new Recipe
            {
                RecipeSourceID = recipeSourceID,
                RecipeName     = recipeName,
                RecipeURL      = recipeURL
            };

            // Directions: primary XPath first, secondary XPath when the first finds nothing.
            var directionNodes = root.SelectNodes(directionsXPath);
            if (directionNodes == null || directionNodes.Count == 0)
            {
                directionNodes = root.SelectNodes(directions2XPath);
            }

            // One stripped line per node, each terminated by CRLF, then trimmed as a whole.
            recipe.Directions = directionNodes == null
                ? string.Empty
                : string.Concat(directionNodes.Select(step => Common.StripHTML(step.InnerText) + "\r\n")).Trim();

            recipe.Rating = getRating(root);

            // Servings: primary XPath first, secondary XPath as fallback.
            var servingsNode = root.SelectSingleNode(servingsXPath) ?? root.SelectSingleNode(servings2XPath);
            if (servingsNode != null)
            {
                string servingsText = Common.StripHTML(servingsNode.InnerText);
                int    servings;
                if (!int.TryParse(servingsText, out servings))
                {
                    // Not a bare number: take the first run of digits. TryParse leaves 0 in
                    // 'servings' when that fails as well.
                    int.TryParse(Regex.Match(servingsText, "(\\d+)").Groups[1].Value, out servings);
                }
                recipe.NumberOfServings = servings;
            }

            var ingredientList = getIngredients(root);
            if (ingredientList == null)
            {
                return null;
            }

            foreach (var ingredient in ingredientList)
            {
                recipe.RecipeIngredientMeasurements.Add(ingredient);
            }

            foreach (var image in getRecipeImages(root))
            {
                recipe.RecipeImages.Add(image);
            }

            DbContext.Recipes.Add(recipe);
            DbContext.SaveChanges();
            return recipe;
        }