/// <summary>
/// Parses every already-downloaded page of each known recipe source, classifies the
/// ingredients of each parsed recipe, and writes one tab-separated log line per ingredient.
/// </summary>
/// <param name="ingredientsLogFileName">Path of the tab-separated ingredient log to create.</param>
/// <param name="downloaderPath">Path handed to each recipe downloader's constructor.</param>
/// <param name="classifier">Classifier applied to every successfully parsed recipe.</param>
/// <param name="types">Provides the ingredient-class to ingredient-type lookup (<c>ClassToType</c>).</param>
/// <param name="recipes">Output list; receives every recipe whose classification succeeded.</param>
/// <remarks>
/// Parse failures are reported on standard error as they happen; totals and a per-reason
/// failure summary (most frequent reason first) are written to standard output at the end.
/// </remarks>
public static void ParseRecipesImpl(
    string ingredientsLogFileName,
    string downloaderPath,
    IIngredientClassifier classifier,
    IIngredientTypes types,
    List<ClassifiedRecipe> recipes)
{
    // Error message -> number of pages that failed with that message.
    var whyNotParsed = new Dictionary<string, int>();

    var sources = new List<KeyValuePair<IRecipeDownloader, IRecipeParser>>
    {
        // skinnytaste
        new KeyValuePair<IRecipeDownloader, IRecipeParser>(
            new SkinnyTasteRecipeDownloader("/", downloaderPath),
            new SkinnyTasteRecipeParser()),
        // paleoleap
        new KeyValuePair<IRecipeDownloader, IRecipeParser>(
            new PaleoLeapRecipeDownloader("/", downloaderPath),
            new PaleoLeapRecipeParser()),
    };

    int recipesTotal = 0, recipesParsed = 0;
    var ingredientTypesUsed = new HashSet<string>();
    var ingredientGroupsUsed = new HashSet<string>();

    using (StreamWriter interpretedIngredients = new StreamWriter(ingredientsLogFileName))
    {
        interpretedIngredients.WriteLine("url\tdeclaration\tdetail\tquantity\tname\tclass\ttype");

        foreach (var source in sources)
        {
            IRecipeDownloader d = source.Key;
            IRecipeParser p = source.Value;

            foreach (RawWebPage rawWebPage in d.DownloadedAlready)
            {
                ++recipesTotal;

                // The page name is sanitized in place before it is handed to the parser.
                rawWebPage.Name = RecipeDownloader.RemoveSpecialCharacters(rawWebPage.Name);

                RecipeParsingResult parsed = p.TryParseRawWebPage(rawWebPage);
                if (!parsed.Succeeded)
                {
                    Console.Error.WriteLine(
                        "WARNING: failed to parse {0}:{1}:{2} ({3})",
                        rawWebPage.Source,
                        rawWebPage.Url,
                        rawWebPage.FileName,
                        parsed.ErrorMessage);

                    // Single lookup: a missing key leaves the count at 0.
                    int failuresSoFar;
                    whyNotParsed.TryGetValue(parsed.ErrorMessage, out failuresSoFar);
                    whyNotParsed[parsed.ErrorMessage] = failuresSoFar + 1;
                    continue;
                }

                ++recipesParsed;
                Recipe recipe = parsed.Result;

                ClassifiedRecipe makingSenseOfIt = classifier.ClassifyIngredients(recipe);
                if (makingSenseOfIt.Succeeded)
                {
                    recipes.Add(makingSenseOfIt);
                }

                // Ingredients are logged even when classification did not succeed.
                // NOTE(review): this reads makingSenseOfIt.Classification regardless of
                // Succeeded - confirm it is never null for a failed classification.
                foreach (var ingredient in recipe.Ingredients)
                {
                    // Understand what kind of ingredient this is.
                    string ingredientClass = null;
                    if (null == ingredient.Name
                        || !makingSenseOfIt.Classification.Classified.TryGetValue(ingredient.Name, out ingredientClass))
                    {
                        ingredientClass = null;
                    }

                    IngredientType ingredientType = null;
                    if (null == ingredientClass
                        || !types.ClassToType.TryGetValue(ingredientClass, out ingredientType))
                    {
                        ingredientType = null;
                    }

                    if (null == ingredientClass)
                    {
                        ingredientClass = "[unknown]";
                    }
                    if (null == ingredientType)
                    {
                        ingredientType = new IngredientType();
                    }

                    // The lookup above tolerates a missing Name, so the log line must too:
                    // dereferencing ingredient.Name.Name unconditionally would throw here.
                    string ingredientName = null != ingredient.Name ? ingredient.Name.Name : "";

                    interpretedIngredients.WriteLine(
                        "{0}\t{1}\t{2}\t=\"{3}\"\t{4}\t{5}\t{6}",
                        d.UrlRoot + recipe.OriginalWebPage.Url,
                        ingredient.Declaration,
                        ingredient.Detail,
                        ingredient.Quantity,
                        ingredientName,
                        ingredientClass,
                        ingredientType);

                    ingredientTypesUsed.Add(ingredientType.Type);
                    ingredientGroupsUsed.Add(ingredientType.Group);
                }
            }
        }
    }

    Console.WriteLine(
        "Recipes: {0} total, {1} parsed, {2} distinct ingredient types, {3} distinct ingredient groups used",
        recipesTotal,
        recipesParsed,
        ingredientTypesUsed.Count,
        ingredientGroupsUsed.Count);

    // Report failure reasons, most frequent first.
    var whyNot = new List<KeyValuePair<string, int>>(whyNotParsed);
    whyNot.Sort((x, y) => y.Value.CompareTo(x.Value));
    foreach (var whyNotElement in whyNot)
    {
        Console.WriteLine(
            "Failed to parse {0} recipe pages because {1}.",
            whyNotElement.Value,
            whyNotElement.Key);
    }
}
/// <summary>
/// Matches an optional leading quantity at the start of an ingredient line: an optional
/// "N to" range prefix, an amount, a unit, a parenthesised note, a container word and a
/// trailing "of". Every part is optional, so the pattern can match the empty string.
/// </summary>
/// <remarks>
/// NOTE(review): the "â„" sequence looks like a mis-encoded fraction slash (U+2044);
/// confirm against the encoding of the downloaded pages before changing the pattern.
/// </remarks>
private static readonly Regex quantityFilter_ = new Regex(
    "^"
    + "([0-9/â„]+\\sto\\s)?"
    + "(½|[0-9/\\- â„]+|0\\.5|0\\.25|0\\.75|1\\.5|half|one third|one quarter|one|two|three|four|five|six|seven|eight|nine|ten|a few)?"
    + "(\\s?(lbs?|pounds?|grams?|quarts?|oz|ounces?|cups?|tsps?|tbsp?|tbsps?|tbsps?|half|halves|thirds?|quarters?|pinch(es)?|cloves?|links?|sprigs?|tea\\s?spoons?|table\\s?spoons?|inch(es)?)\\.?)?"
    + "(\\s?\\([^)]+\\))?"
    + "(\\s?(bag|pouch|box|bottle|pack)[^a-zA-Z]+)?"
    + "(\\s?of)?",
    RegexOptions.IgnoreCase);

/// <summary>
/// Tries to extract a recipe (its list of ingredients) from a downloaded web page.
/// </summary>
/// <param name="rawWebPage">The downloaded page; its Url, Name and RawText are read.</param>
/// <returns>
/// A successful result wrapping the parsed <c>Recipe</c>, or an error result whose message
/// names the first reason the page could not be parsed.
/// </returns>
public RecipeParsingResult TryParseRawWebPage(RawWebPage rawWebPage)
{
    if (!isRecipeUrl_.Match(rawWebPage.Url).Success)
    {
        return RecipeParsingResult.Error("not a recipe web page");
    }

    // Keep only what follows the (single) body tag.
    string[] headAndBody = bodyTag_.Split(rawWebPage.RawText);
    if (headAndBody.Length < 2)
    {
        return RecipeParsingResult.Error("body tag not found");
    }
    if (headAndBody.Length > 2)
    {
        return RecipeParsingResult.Error("more than one body tag found");
    }
    string text = headAndBody[1];

    // Locate the ingredients section by its heading.
    string[] splitByIngredients = text.Split(new string[] { ">Ingredients<" }, StringSplitOptions.RemoveEmptyEntries);
    if (splitByIngredients.Length < 2)
    {
        return RecipeParsingResult.Error(DescribeMissingIngredients(rawWebPage));
    }
    if (splitByIngredients.Length > 2)
    {
        // hail mary #2: the heading marker was ambiguous, try the "Ingredients:" form instead
        string[] splitByIngredientsLt = text.Split(new string[] { "ngredients:<" }, StringSplitOptions.RemoveEmptyEntries);
        if (splitByIngredientsLt.Length == 2)
        {
            splitByIngredients = splitByIngredientsLt;
        }
    }
    if (splitByIngredients.Length > 2)
    {
        return RecipeParsingResult.Error("more than one ingredient section found");
    }
    text = splitByIngredients[1];

    // The ingredient list ends at the directions heading when there is one,
    // otherwise at the first closing </ul>.
    string endMarker = text.Contains("irections:") ? "irections:" : "</ul>";
    string[] splitByEndOfListOrDirections = text.Split(new string[] { endMarker }, StringSplitOptions.RemoveEmptyEntries);
    if (splitByEndOfListOrDirections.Length < 2)
    {
        return RecipeParsingResult.Error("end-of-list not found");
    }
    text = splitByEndOfListOrDirections[0];
    if (RecipeParsingUtils.LooksLikeMoreIngredients(splitByEndOfListOrDirections[1], isListElement_, isListElementEnd_))
    {
        // The next piece still looks like ingredients: glue it back on.
        // "</ul>" is used as the glue whichever marker the text was split on.
        text = splitByEndOfListOrDirections[0] + "</ul>" + splitByEndOfListOrDirections[1];
    }

    Match element = isListElement_.Match(text);
    if (!element.Success)
    {
        return RecipeParsingResult.Error("not a single ingredient found");
    }

    List<string> ingredients = new List<string>();
    for (; element.Success; element = element.NextMatch())
    {
        string extracted = RecipeParsingUtils.ExtractIngredient(text, element, isListElementEnd_);
        if (null == extracted)
        {
            break;
        }
        ingredients.Add(extracted);
    }

    // Reject near-empty lists before doing any per-ingredient work.
    // The message now states the threshold this check actually enforces.
    if (ingredients.Count < 2)
    {
        return RecipeParsingResult.Error("less than two ingredients");
    }

    char[] detailDelimiters = new char[] { ',', '(' };
    Recipe result = new Recipe { OriginalWebPage = rawWebPage };
    foreach (string ingredientText in ingredients)
    {
        string quantity = "", detail = "";
        string ingredient = ingredientText;

        Match quantityFound = quantityFilter_.Match(ingredientText);
        if (quantityFound.Success)
        {
            quantity = quantityFound.Value.Trim();
            // Cut after the matched span itself, not after the trimmed quantity:
            // the two differ whenever the match begins with whitespace.
            ingredient = ingredientText.Substring(quantityFound.Index + quantityFound.Length).Trim();
        }

        // Everything from the first comma or opening parenthesis onwards is detail.
        int detailStart = ingredient.IndexOfAny(detailDelimiters);
        if (detailStart >= 0)
        {
            detail = ingredient.Substring(detailStart).Trim(',', '\t', ' ').Trim();
            ingredient = ingredient.Substring(0, detailStart);
        }

        result.Ingredients.Add(new Ingredient()
        {
            Declaration = ingredientText,
            Quantity = quantity,
            Name = new IngredientName() { Name = RecipeDownloader.RemoveSpecialCharacters(ingredient) },
            Detail = detail,
        });
    }

    return RecipeParsingResult.Success(result); // no errors
}

/// <summary>
/// Picks the error message for a page without an ingredients heading, based on the page
/// title and URL prefix, so failures can be grouped by the kind of page.
/// </summary>
private string DescribeMissingIngredients(RawWebPage rawWebPage)
{
    if (startsWithNumbers_.Match(rawWebPage.Name).Success)
    {
        return "didn't find ingredients, but the title starts with numbers";
    }

    // URL prefixes are identifiers, so compare them ordinally.
    string url = rawWebPage.Url;
    if (url.StartsWith("/all-about-", StringComparison.Ordinal))
    {
        return "didn't find ingredients, but this is an 'all about' web page";
    }
    if (url.StartsWith("/about-", StringComparison.Ordinal))
    {
        return "didn't find ingredients, but this is an 'about' web page";
    }
    if (url.StartsWith("/are-", StringComparison.Ordinal))
    {
        return "didn't find ingredients, but this is an 'are' web page";
    }
    if (url.StartsWith("/is-", StringComparison.Ordinal))
    {
        return "didn't find ingredients, but this is an 'is' web page";
    }
    return "didn't find ingredients";
}