/// <summary>
/// Downloads one listing page, creates a recipe for every link on it that is not
/// already in <c>existingRecipes</c>, and finally calls <c>updateMaxPage</c> with
/// the parsed page.
/// </summary>
/// <param name="url">Address of the listing page to download and parse.</param>
/// <param name="pageNum">Page number; used only in the console progress line.</param>
/// <param name="maxPage">
/// Page count shown in the progress line; passed by reference to
/// <c>updateMaxPage</c> after all links have been processed.
/// </param>
/// <remarks>
/// A failure while handling a single link is written to the console and appended
/// to <c>errors.txt</c>; processing then continues with the next link.
/// </remarks>
protected virtual void crawlPage(string url, int pageNum, ref int maxPage)
{
    var doc = new HtmlDocument();
    doc.LoadHtml(getHTML(url));

    var urls = getRecipeURLs(doc);
    foreach (var kvp in urls)
    {
        try
        {
            string recipeURL = kvp.Key;
            if (string.IsNullOrEmpty(recipeURL))
            {
                throw new NotImplementedException("Recipe link without a URL found on listing page " + url);
            }

            // URLs are machine identifiers, so the prefix checks are ordinal
            // rather than dependent on the current culture.
            bool isAbsolute = recipeURL.StartsWith(baseURL, StringComparison.Ordinal)
                || recipeURL.StartsWith("http:", StringComparison.Ordinal)
                || recipeURL.StartsWith("https:", StringComparison.Ordinal);
            if (!isAbsolute)
            {
                recipeURL = baseURL + recipeURL;
            }
            recipeURL = recipeURL.Replace("\t", "");

            string recipeName = CrawlerHelper.ChildSafeName(Common.StripHTML(HttpUtility.HtmlDecode(kvp.Value))).Trim();

            // The same key is used for the lookup and for the later registration.
            // NOTE(review): when UniqueRecipeName is set and the link text is empty,
            // the key is "" for every such link, so only the first one is processed
            // - confirm this is intended.
            string dedupeKey = myAttribute.UniqueRecipeName ? recipeName : recipeURL;
            if (existingRecipes.Contains(dedupeKey))
            {
                continue;
            }

            string display = myAttribute.RecipeSourceName + " - Page " + pageNum + " of " + maxPage + " - "
                + (string.IsNullOrEmpty(recipeName) ? recipeURL : recipeName);
            Console.WriteLine("* " + display);

            // Only the recipe creation itself runs under the shared lock.
            lock (CrawlerHelper.LockObject)
            {
                CreateRecipe(recipeURL, recipeName, recipeSource.RecipeSourceID);
            }

            // Registered even when CreateRecipe returned null, so the same link
            // is not attempted again during this run.
            existingRecipes.Add(dedupeKey);
        }
        catch (Exception ex)
        {
            // Build the (potentially large) exception text once and reuse it.
            string details = ex.GetFullExceptionTextWithStackTrace();
            Console.WriteLine(details);
            System.IO.File.AppendAllText("errors.txt", details);
        }
    }

    updateMaxPage(doc, ref maxPage);
}
/// <summary>
/// Downloads a recipe page, extracts name, directions, rating, servings,
/// ingredients and images from it, and saves the resulting recipe through
/// <c>DbContext</c>.
/// </summary>
/// <param name="recipeURL">Address of the recipe page; also stored on the recipe.</param>
/// <param name="recipeName">
/// Name to store. When null or empty, the name is read from the node selected by
/// <c>recipeNameXPath</c> instead.
/// </param>
/// <param name="recipeSourceID">Identifier of the recipe source, stored on the recipe.</param>
/// <returns>
/// The saved recipe, or <c>null</c> when the page HTML is empty or
/// <c>getIngredients</c> returns <c>null</c> (nothing is saved in either case).
/// </returns>
/// <exception cref="NotImplementedException">
/// No name was supplied and <c>recipeNameXPath</c> matched no node.
/// </exception>
public virtual Recipe CreateRecipe(string recipeURL, string recipeName, int recipeSourceID)
{
    string html = getHTML(recipeURL);
    if (string.IsNullOrEmpty(html))
    {
        return null;
    }

    var doc = new HtmlDocument();
    doc.LoadHtml(html);

    if (string.IsNullOrEmpty(recipeName))
    {
        var recipeNameNode = doc.DocumentNode.SelectSingleNode(recipeNameXPath);
        if (recipeNameNode == null)
        {
            throw new NotImplementedException("No recipe name node found at " + recipeURL);
        }
        recipeName = CrawlerHelper.ChildSafeName(HttpUtility.HtmlDecode(Common.StripHTML(recipeNameNode.InnerText))).Trim();
    }

    Recipe rec = new Recipe();
    rec.RecipeSourceID = recipeSourceID;
    rec.RecipeName = recipeName;
    rec.RecipeURL = recipeURL;

    // Try the primary directions selector first, then the alternate one.
    var directions = doc.DocumentNode.SelectNodes(directionsXPath);
    if (directions == null || !directions.Any())
    {
        directions = doc.DocumentNode.SelectNodes(directions2XPath);
    }

    // One stripped direction per line (CRLF), with surrounding whitespace removed.
    // Joining avoids re-allocating the growing string for every node.
    rec.Directions = directions == null
        ? string.Empty
        : string.Join("\r\n", directions.Select(d => Common.StripHTML(d.InnerText))).Trim();

    rec.Rating = getRating(doc.DocumentNode);

    var servingsNode = doc.DocumentNode.SelectSingleNode(servingsXPath);
    if (servingsNode == null)
    {
        servingsNode = doc.DocumentNode.SelectSingleNode(servings2XPath);
    }
    if (servingsNode != null)
    {
        string servingsText = Common.StripHTML(servingsNode.InnerText);
        int servings;
        if (!int.TryParse(servingsText, out servings))
        {
            // Not a bare number: fall back to the first run of digits in the text.
            // TryParse leaves 0 in 'servings' when that also fails, so 0 is the
            // value stored for unparseable text.
            Match digits = Regex.Match(servingsText, "(\\d+)");
            int.TryParse(digits.Groups[1].Value, out servings);
        }
        rec.NumberOfServings = servings;
    }

    var ingredients = getIngredients(doc.DocumentNode);
    if (ingredients == null)
    {
        // Without ingredients the recipe is not saved.
        return null;
    }
    foreach (var ing in ingredients)
    {
        rec.RecipeIngredientMeasurements.Add(ing);
    }

    // NOTE(review): unlike getIngredients, the result of getRecipeImages is not
    // null-checked - confirm it never returns null.
    foreach (var img in getRecipeImages(doc.DocumentNode))
    {
        rec.RecipeImages.Add(img);
    }

    DbContext.Recipes.Add(rec);
    DbContext.SaveChanges();
    return rec;
}