/// <summary>
/// Creates a downloader for one source and rebuilds its crawl state from the pages that
/// were already saved in the <paramref name="source"/> sub-directory of <paramref name="path"/>.
/// </summary>
/// <param name="nextUrl">URL to download first. If it is already recorded as downloaded, PickNextUrl() is called.</param>
/// <param name="path">Directory that holds the per-source sub-directory; the sub-directory is created when missing.</param>
/// <param name="source">Source name; stored in source_ and used as the sub-directory name.</param>
/// <param name="urlRoot">Stored in urlRoot_.</param>
public RecipeDownloader(string nextUrl, string path, string source, string urlRoot)
{
    // things not specific to Kraft
    nextUrl_ = nextUrl;
    source_ = source;
    urlRoot_ = urlRoot;

    // Path.Combine uses the platform's directory separator instead of a hard-coded "\\".
    // CreateDirectory does nothing when the directory already exists and returns its DirectoryInfo.
    DirectoryInfo downloaded = Directory.CreateDirectory(Path.Combine(path, source_));

    foreach (FileInfo file in downloaded.GetFiles())
    {
        using (StreamReader input = file.OpenText())
        {
            try
            {
                RawWebPage rwp = (RawWebPage)serializer_.Deserialize(input);
                if (null != rwp)
                {
                    // A page saved without any referenced URLs must still be registered as
                    // downloaded, so a null array is treated like an empty one.
                    if (null != rwp.ReferencedUrls)
                    {
                        for (int index = 0; index != rwp.ReferencedUrls.Length; ++index)
                        {
                            rwp.ReferencedUrls[index] = TrimUrlArgs(rwp.ReferencedUrls[index]);
                        }
                    }

                    rwp.Url = TrimUrlArgs(rwp.Url);
                    discoveredUrls_.Add(rwp.Url);
                    downloadedRecipes_[rwp.Url] = file.FullName;

                    if (null != rwp.ReferencedUrls)
                    {
                        foreach (var referencedUrl in rwp.ReferencedUrls)
                        {
                            discoveredUrls_.Add(referencedUrl);
                        }
                    }

                    // NOTE(review): rwp is not kept after this iteration, so this assignment
                    // has no visible effect here; kept in case the setter matters.
                    rwp.UrlRoot = urlRoot_;
                }
            }
            catch (Exception e)
            {
                // Best effort: an unreadable file is reported and skipped.
                Console.Error.WriteLine("Failed to deserialize the contents of {0}: {1}", file.FullName, e.ToString());
            }
        }
    }

    // Everything that was discovered but not yet downloaded still has to be fetched.
    foreach (var discovered in discoveredUrls_)
    {
        urlsToDownload_.Add(discovered);
    }

    foreach (var already in downloadedRecipes_)
    {
        urlsToDownload_.Remove(already.Key);
    }

    // if a given url was already downloaded, skip it
    if (downloadedRecipes_.ContainsKey(nextUrl_))
    {
        PickNextUrl();
    }
}
/// <summary>
/// Downloads the page at nextUrl_, records it as downloaded, queues the URLs it references
/// and serializes it to a ".raw" file named after the page title under source_.
/// </summary>
/// <returns>The page that was just downloaded.</returns>
/// <remarks>
/// When DownloadRawPage throws, the URL is added to downloadLater_ and the exception is rethrown.
/// </remarks>
public RawWebPage DownloadNext()
{
    Console.WriteLine("{0}: Downloaded {1} out of {2} discovered recipes ({3})", DateTime.Now, downloadedRecipes_.Count, discoveredUrls_.Count, source_);
    Console.WriteLine("{0}: Downloading {1} ({2})...", DateTime.Now, nextUrl_, source_);

    // Take the current URL off the queue and advance before doing anything that can fail.
    string currentUrl = nextUrl_;
    urlsToDownload_.Remove(currentUrl);
    PickNextUrl();

    string pageTitle;
    string pageText;
    HashSet<string> links;
    try
    {
        DownloadRawPage(currentUrl, out pageTitle, out pageText, out links);
    }
    catch (Exception failure)
    {
        Console.WriteLine("Problem when downloading from {0} (will try later): {1}", currentUrl, failure.ToString());
        downloadLater_.Add(currentUrl);
        throw;
    }

    RawWebPage page = new RawWebPage
    {
        Url = currentUrl,
        UrlRoot = urlRoot_,
        Source = source_,
        RawText = pageText,
        UtcTimeStamp = DateTime.UtcNow,
        Name = pageTitle,
        ReferencedUrls = links.ToArray()
    };

    // File name: the title with spaces turned into underscores, plus ".raw", with every
    // character that is invalid in a file name or path dropped.
    char[] forbidden = Path.GetInvalidFileNameChars().Concat(Path.GetInvalidPathChars()).ToArray();
    string rawName = pageTitle.Replace(' ', '_') + ".raw";
    string cleanName = new string(rawName.Where(ch => Array.IndexOf(forbidden, ch) < 0).ToArray());
    string filename = source_ + "/" + cleanName;

    downloadedRecipes_[currentUrl] = filename;

    // very important to do this after storing the new page
    foreach (string link in links)
    {
        discoveredUrls_.Add(link);
        if (downloadedRecipes_.ContainsKey(link))
        {
            continue;
        }
        urlsToDownload_.Add(link);
    }

    // in case we now have more urls to choose from, choose again
    PickNextUrl();

    // save to disk now (because even if this results in an exception, we already advanced the iterator)
    using (TextWriter writer = new StreamWriter(filename))
    {
        serializer_.Serialize(writer, page);
    }

    return page;
}
// Matches the optional quantity prefix of an ingredient line: an optional "N to" range,
// a number / fraction / number word, a unit, a parenthesised note, a package word and "of".
// Every part is optional, so the pattern also matches the empty prefix.
// The decimal literals are listed before the digit character class: alternatives are tried
// left to right and nothing after the group is mandatory, so with the class first "0.5"
// would always be matched as just "0".
// Built once because the pattern never changes.
private static readonly Regex quantityFilter_ = new Regex(
    "^" +
    "([0-9/â„]+\\sto\\s)?" +
    "(½|0\\.5|0\\.25|0\\.75|1\\.5|[0-9/\\- â„]+|half|one third|one quarter|one|two|three|four|five|six|seven|eight|nine|ten|a few)?" +
    "(\\s?(lbs?|pounds?|grams?|quarts?|oz|ounces?|cups?|tsps?|tbsp?|tbsps?|tbsps?|half|halves|thirds?|quarters?|pinch(es)?|cloves?|links?|sprigs?|tea\\s?spoons?|table\\s?spoons?|inch(es)?)\\.?)?" +
    "(\\s?\\([^)]+\\))?" +
    "(\\s?(bag|pouch|box|bottle|pack)[^a-zA-Z]+)?" +
    "(\\s?of)?",
    RegexOptions.IgnoreCase);

/// <summary>
/// Tries to extract a recipe (its list of ingredients) from the raw text of a downloaded page.
/// </summary>
/// <param name="rawWebPage">The downloaded page; its Url, Name and RawText are read.</param>
/// <returns>
/// RecipeParsingResult.Success with the parsed recipe, or RecipeParsingResult.Error with a
/// short reason when the page is not a recipe URL, does not have exactly one body tag,
/// has no usable ingredient section, or yields fewer than two ingredients.
/// </returns>
public RecipeParsingResult TryParseRawWebPage(RawWebPage rawWebPage)
{
    if (!isRecipeUrl_.Match(rawWebPage.Url).Success)
    {
        return RecipeParsingResult.Error("not a recipe web page");
    }

    string text = rawWebPage.RawText;

    // Keep only what follows the (single) body tag.
    string[] headAndBody = bodyTag_.Split(text);
    if (headAndBody.Length < 2)
    {
        return RecipeParsingResult.Error("body tag not found");
    }
    if (headAndBody.Length > 2)
    {
        return RecipeParsingResult.Error("more than one body tag found");
    }
    text = headAndBody[1];

    string[] splitByIngredients = text.Split(new string[] { ">Ingredients<" }, StringSplitOptions.RemoveEmptyEntries);
    if (splitByIngredients.Length < 2)
    {
        // if the hail mary thing didn't work, then complain
        if (startsWithNumbers_.Match(rawWebPage.Name).Success)
        {
            return RecipeParsingResult.Error("didn't find ingredients, but the title starts with numbers");
        }
        // URL prefixes are compared ordinally: they are identifiers, not linguistic text.
        if (rawWebPage.Url.StartsWith("/all-about-", StringComparison.Ordinal))
        {
            return RecipeParsingResult.Error("didn't find ingredients, but this is an 'all about' web page");
        }
        if (rawWebPage.Url.StartsWith("/about-", StringComparison.Ordinal))
        {
            return RecipeParsingResult.Error("didn't find ingredients, but this is an 'about' web page");
        }
        if (rawWebPage.Url.StartsWith("/are-", StringComparison.Ordinal))
        {
            return RecipeParsingResult.Error("didn't find ingredients, but this is an 'are' web page");
        }
        if (rawWebPage.Url.StartsWith("/is-", StringComparison.Ordinal))
        {
            return RecipeParsingResult.Error("didn't find ingredients, but this is an 'is' web page");
        }
        return RecipeParsingResult.Error("didn't find ingredients");
    }

    if (splitByIngredients.Length > 2)
    {
        // hail mary #2
        string[] splitByIngredientsLt = text.Split(new string[] { "ngredients:<" }, StringSplitOptions.RemoveEmptyEntries);
        if (splitByIngredientsLt.Length == 2)
        {
            splitByIngredients = splitByIngredientsLt;
        }
    }
    if (splitByIngredients.Length > 2)
    {
        return RecipeParsingResult.Error("more than one ingredient section found");
    }
    text = splitByIngredients[1];

    // The ingredient list ends at "irections:" when that marker is present, otherwise at "</ul>".
    string[] splitByEndOfListOrDirections = text.Contains("irections:")
        ? text.Split(new string[] { "irections:" }, StringSplitOptions.RemoveEmptyEntries)
        : text.Split(new string[] { "</ul>" }, StringSplitOptions.RemoveEmptyEntries);
    if (splitByEndOfListOrDirections.Length < 2)
    {
        return RecipeParsingResult.Error("end-of-list not found");
    }

    text = splitByEndOfListOrDirections[0];
    if (RecipeParsingUtils.LooksLikeMoreIngredients(splitByEndOfListOrDirections[1], isListElement_, isListElementEnd_))
    {
        // The following chunk is appended to the first one, re-joined with "</ul>".
        text = splitByEndOfListOrDirections[0] + "</ul>" + splitByEndOfListOrDirections[1];
    }

    Match element = isListElement_.Match(text);
    if (!element.Success)
    {
        return RecipeParsingResult.Error("not a single ingredient found");
    }

    List<string> ingredients = new List<string>();
    while (element.Success)
    {
        string extracted = RecipeParsingUtils.ExtractIngredient(text, element, isListElementEnd_);
        if (null == extracted)
        {
            break;
        }
        ingredients.Add(extracted);
        element = element.NextMatch();
    }

    // The message now states the limit that is actually enforced (it used to say "three").
    if (ingredients.Count < 2)
    {
        return RecipeParsingResult.Error("less than two ingredients");
    }

    char[] detailDelimiters = new[] { ',', '(' };
    Recipe result = new Recipe { OriginalWebPage = rawWebPage };
    foreach (string ingredientText in ingredients)
    {
        string quantity = "", detail = "";
        string ingredient = ingredientText;

        var quantityFound = quantityFilter_.Match(ingredient);
        if (quantityFound.Success)
        {
            quantity = quantityFound.Value.Trim();
            // The pattern is anchored at the start, so the match covers the first
            // quantityFound.Length characters. The untrimmed match length must be used here:
            // the trimmed length is too short whenever the match begins with whitespace.
            ingredient = ingredientText.Substring(quantityFound.Length).Trim();
        }

        // Everything from the first comma or opening brace onwards is the detail.
        int detailStart = ingredient.IndexOfAny(detailDelimiters);
        if (detailStart >= 0)
        {
            detail = ingredient.Substring(detailStart).Trim(',', '\t', ' ').Trim();
            ingredient = ingredient.Substring(0, detailStart);
        }

        result.Ingredients.Add(new Ingredient()
        {
            Declaration = ingredientText,
            Quantity = quantity,
            Name = new IngredientName() { Name = RecipeDownloader.RemoveSpecialCharacters(ingredient) },
            Detail = detail,
        });
    }

    return RecipeParsingResult.Success(result); // no errors
}