コード例 #1
0
ファイル: CrawlerBase.cs プロジェクト: pj-martins/PaJaMa
        /// <summary>
        /// Crawls one listing page: downloads it, extracts the recipe links it contains,
        /// creates a recipe for every link not already present in <c>existingRecipes</c>,
        /// and finally lets <c>updateMaxPage</c> refresh <paramref name="maxPage"/> from the
        /// same document.
        /// </summary>
        /// <param name="url">Address of the listing page to download.</param>
        /// <param name="pageNum">Page number shown in the console progress line.</param>
        /// <param name="maxPage">
        /// Page count shown in the progress line; passed by reference to <c>updateMaxPage</c>
        /// once all links on this page have been processed.
        /// </param>
        /// <remarks>
        /// A failure while handling a single link is written to the console and appended to
        /// <c>errors.txt</c>; the remaining links are still processed.
        /// </remarks>
        protected virtual void crawlPage(string url, int pageNum, ref int maxPage)
        {
            var doc = new HtmlDocument();
            doc.LoadHtml(getHTML(url));

            foreach (var kvp in getRecipeURLs(doc))
            {
                try
                {
                    string recipeURL = kvp.Key;
                    if (string.IsNullOrEmpty(recipeURL))
                    {
                        // Caught by the handler below, which logs it and moves on to the next link.
                        throw new NotImplementedException();
                    }

                    // URLs are identifiers, not linguistic text: compare ordinally so the result
                    // does not depend on the current culture.
                    bool hasKnownPrefix = recipeURL.StartsWith(baseURL, StringComparison.Ordinal)
                        || recipeURL.StartsWith("http:", StringComparison.Ordinal)
                        || recipeURL.StartsWith("https:", StringComparison.Ordinal);
                    if (!hasKnownPrefix)
                    {
                        recipeURL = baseURL + recipeURL;
                    }

                    recipeURL = recipeURL.Replace("\t", "");

                    string recipeName = CrawlerHelper.ChildSafeName(Common.StripHTML(HttpUtility.HtmlDecode(kvp.Value))).Trim();

                    // Key used for duplicate detection: the name or the URL, depending on the source's attribute.
                    // NOTE(review): with UniqueRecipeName set and an empty link text the key is "", so after
                    // the first such link every later empty-named link is skipped — confirm this is intended.
                    string dedupeKey = myAttribute.UniqueRecipeName ? recipeName : recipeURL;
                    if (existingRecipes.Contains(dedupeKey))
                    {
                        continue;
                    }

                    string display = myAttribute.RecipeSourceName + " - Page " + pageNum + " of " + maxPage + " - " + (string.IsNullOrEmpty(recipeName) ? recipeURL : recipeName);
                    Console.WriteLine("* " + display);

                    lock (CrawlerHelper.LockObject)
                    {
                        CreateRecipe(recipeURL, recipeName, recipeSource.RecipeSourceID);
                    }

                    // Recorded even when CreateRecipe returned null, so the same link is not attempted again.
                    existingRecipes.Add(dedupeKey);
                }
                catch (Exception ex)
                {
                    // Build the (potentially large) exception text once and reuse it for both sinks.
                    string errorText = ex.GetFullExceptionTextWithStackTrace();
                    Console.WriteLine(errorText);
                    System.IO.File.AppendAllText("errors.txt", errorText);
                }
            }

            updateMaxPage(doc, ref maxPage);
        }
コード例 #2
0
ファイル: CrawlerBase.cs プロジェクト: pj-martins/PaJaMa
        /// <summary>
        /// Downloads a recipe page, extracts its name, directions, rating, servings,
        /// ingredients and images, and saves the resulting <c>Recipe</c> through <c>DbContext</c>.
        /// </summary>
        /// <param name="recipeURL">Address of the recipe page; also stored on the recipe.</param>
        /// <param name="recipeName">
        /// Name to store. When null or empty, the name is read from the node matched by
        /// <c>recipeNameXPath</c> instead.
        /// </param>
        /// <param name="recipeSourceID">Identifier of the source, stored on the recipe.</param>
        /// <returns>
        /// The saved recipe, or <c>null</c> when the page has no HTML or
        /// <c>getIngredients</c> returns <c>null</c> (nothing is saved in either case).
        /// </returns>
        /// <exception cref="NotImplementedException">
        /// <paramref name="recipeName"/> is empty and no node matches <c>recipeNameXPath</c>.
        /// </exception>
        public virtual Recipe CreateRecipe(string recipeURL, string recipeName, int recipeSourceID)
        {
            string pageHtml = getHTML(recipeURL);
            if (string.IsNullOrEmpty(pageHtml))
            {
                return null;
            }

            var page = new HtmlDocument();
            page.LoadHtml(pageHtml);
            HtmlNode root = page.DocumentNode;

            // No name supplied by the caller: fall back to the name node on the page itself.
            if (string.IsNullOrEmpty(recipeName))
            {
                var nameNode = root.SelectSingleNode(recipeNameXPath);
                if (nameNode == null)
                {
                    throw new NotImplementedException();
                }

                string strippedName = Common.StripHTML(nameNode.InnerText);
                recipeName = CrawlerHelper.ChildSafeName(HttpUtility.HtmlDecode(strippedName)).Trim();
            }

            var recipe = new Recipe
            {
                RecipeSourceID = recipeSourceID,
                RecipeName = recipeName,
                RecipeURL = recipeURL
            };

            // Directions: primary XPath first, secondary XPath when the primary yields nothing.
            var directionNodes = root.SelectNodes(directionsXPath);
            if (directionNodes == null || !directionNodes.Any())
            {
                directionNodes = root.SelectNodes(directions2XPath);
            }

            // One CRLF-terminated line per node; the final Trim drops the trailing line break.
            recipe.Directions = directionNodes == null
                ? string.Empty
                : string.Concat(directionNodes.Select(node => Common.StripHTML(node.InnerText) + "\r\n")).Trim();

            recipe.Rating = getRating(root);

            // Servings: primary XPath, then the secondary one.
            var servingsNode = root.SelectSingleNode(servingsXPath) ?? root.SelectSingleNode(servings2XPath);
            if (servingsNode != null)
            {
                string servingsText = Common.StripHTML(servingsNode.InnerText);

                int servings;
                if (!int.TryParse(servingsText, out servings))
                {
                    // Not a bare number: take the first run of digits. int.TryParse writes 0 to its
                    // out argument on failure, so text without a usable number is stored as 0.
                    Match firstNumber = Regex.Match(servingsText, "(\\d+)");
                    int.TryParse(firstNumber.Groups[1].Value, out servings);
                }

                recipe.NumberOfServings = servings;
            }

            var ingredientList = getIngredients(root);
            if (ingredientList == null)
            {
                return null;
            }

            foreach (var ingredient in ingredientList)
            {
                recipe.RecipeIngredientMeasurements.Add(ingredient);
            }

            foreach (var image in getRecipeImages(root))
            {
                recipe.RecipeImages.Add(image);
            }

            DbContext.Recipes.Add(recipe);
            DbContext.SaveChanges();
            return recipe;
        }