// Sets up a downloader for one recipe source and indexes the pages already saved on disk.
//
//   nextUrl - stored in nextUrl_; DownloadNext reads it as the URL to fetch.
//   path    - parent directory; the sub-directory path + "\\" + source is created if
//             missing and then scanned for previously saved pages.
//   source  - stored in source_; also used as the sub-directory name.
//   urlRoot - stored in urlRoot_; assigned to every page that is loaded back from disk.
//
// Every file in the source directory is deserialized into a RawWebPage; its URLs are
// normalized with TrimUrlArgs and the page URL is recorded in downloadedRecipes_.
// Files that fail to deserialize are reported on stderr.
//
// NOTE(review): several statements below (a try keyword, loop bodies, the body of the
// final if) are not visible in this listing - confirm against the complete file.
public RecipeDownloader(string nextUrl, string path, string source, string urlRoot)
            // things not specific to Kraft
            nextUrl_ = nextUrl;
            source_  = source;
            urlRoot_ = urlRoot;
            // Make sure the per-source directory exists before enumerating it.
            // NOTE(review): the separator is hard-coded as "\\"; Path.Combine would be portable.
            if (!Directory.Exists(path + "\\" + source_))
                Directory.CreateDirectory(path + "\\" + source_);
            DirectoryInfo downloaded = new DirectoryInfo(path + "\\" + source_);

            // Load every previously saved page from the source directory.
            foreach (FileInfo file in downloaded.GetFiles())
                using (StreamReader input = file.OpenText())
                        RawWebPage rwp = (RawWebPage)serializer_.Deserialize(input);
                        if (null != rwp)
                            // Normalize every referenced URL in place, then the page's own URL.
                            for (int index = 0; index != rwp.ReferencedUrls.Length; ++index)
                                rwp.ReferencedUrls[index] = TrimUrlArgs(rwp.ReferencedUrls[index]);
                            rwp.Url = TrimUrlArgs(rwp.Url);
                            // Remember which file holds this URL (absolute path of the file).
                            downloadedRecipes_[rwp.Url] = file.FullName;
                            // NOTE(review): no body is visible for this loop; presumably it records
                            // each referenced URL as discovered - verify.
                            foreach (var referencedUrl in rwp.ReferencedUrls)
                            rwp.UrlRoot = urlRoot_;
                    // Any failure while reading one file is logged with the file name and full
                    // exception text.
                    // NOTE(review): what happens after logging is not visible here - verify.
                    catch (Exception e)
                        Console.Error.WriteLine("Failed to deserialize the contents of {0}: {1}", file.FullName, e.ToString());
            // NOTE(review): the bodies of the next two loops are not visible here - confirm
            // what is done with discoveredUrls_ and downloadedRecipes_.
            foreach (var discovered in discoveredUrls_)
            foreach (var already in downloadedRecipes_)

            // if a given url was already downloaded, skip it
            // NOTE(review): the statement guarded by this check is not visible here.
            if (downloadedRecipes_.ContainsKey(nextUrl_))
        // Downloads the page at nextUrl_, wraps it in a RawWebPage, records it in
        // downloadedRecipes_ and serializes it to a ".raw" file named after the page title.
        //
        // Progress is written to the console before the download starts. A failure inside
        // DownloadRawPage is logged with the message "will try later".
        //
        // NOTE(review): the try keyword, whatever follows the logging in the catch, the body
        // of the "not yet downloaded" check, the "choose again" step and the return statement
        // are not visible in this listing - confirm against the complete file.
        public RawWebPage DownloadNext()
            Console.WriteLine("{0}: Downloaded {1} out of {2} discovered recipes ({3})", DateTime.Now, downloadedRecipes_.Count, discoveredUrls_.Count, source_);
            Console.WriteLine("{0}: Downloading {1} ({2})...", DateTime.Now, nextUrl_, source_);

            // Work on a local copy of the URL that is about to be fetched.
            string url = nextUrl_;


            // Filled in by DownloadRawPage through its out parameters.
            string           title, text;
            HashSet <string> referencedUrls;

                DownloadRawPage(url, out title, out text, out referencedUrls);
            catch (Exception e)
                Console.WriteLine("Problem when downloading from {0} (will try later): {1}", url, e.ToString());

            // Package the downloaded content together with its provenance.
            RawWebPage result = new RawWebPage();

            result.Url            = url;
            result.UrlRoot        = urlRoot_;
            result.Source         = source_;
            result.RawText        = text;
            result.UtcTimeStamp   = DateTime.UtcNow;
            result.Name           = title;
            result.ReferencedUrls = referencedUrls.ToArray();

            // Derive the file name from the title: spaces become underscores, ".raw" is
            // appended, then every character that is invalid in a file name or path is removed.
            string filename     = title.Replace(' ', '_') + ".raw";
            string invalidChars = new string(Path.GetInvalidFileNameChars()) + new string(Path.GetInvalidPathChars());

            foreach (char c in invalidChars)
                filename = filename.Replace(c.ToString(), "");
            // NOTE(review): this path is built as source_ + "/" + name, while the constructor
            // scans path + "\\" + source_. Unless source_ is rooted, the file is resolved
            // against the current directory - confirm both refer to the same location.
            filename = source_ + "/" + filename;

            // NOTE(review): the constructor stores file.FullName for pages loaded from disk,
            // whereas this stores the path built above - the map may mix the two forms.
            downloadedRecipes_[url] = filename;

            // very important to do this after storing the new page
            // NOTE(review): the statement guarded by the check below is not visible here;
            // presumably it records the URL as discovered - verify.
            foreach (var discovered in referencedUrls)
                if (!downloadedRecipes_.ContainsKey(discovered))

            // in case we now have more urls to choose from, choose again

            // save to disk now (because even if this results in an exception, we already advanced the iterator)
            // NOTE(review): StreamWriter(string) overwrites an existing file, so two pages
            // whose titles sanitize to the same name would replace each other - verify.
            using (TextWriter w = new StreamWriter(filename))
                serializer_.Serialize(w, result);
        // Tries to extract a Recipe (a list of ingredients with quantity, name and detail)
        // from the raw HTML text of a downloaded page.
        //
        //   rawWebPage - the page to parse; its Url, Name and RawText are read.
        //
        // Returns RecipeParsingResult.Success wrapping the Recipe, or
        // RecipeParsingResult.Error with a message naming the first check that failed.
        //
        // Steps: verify the URL looks like a recipe, isolate the text after the body tag,
        // isolate the text after the ingredients heading, cut it at the directions heading
        // or the end of the list, pull out each list element, then split every ingredient
        // into quantity / name / detail.
        //
        // NOTE(review): some statements (a trailing else, the body of the null-ingredient
        // check, the statement that adds to the ingredients list, the first fragment of the
        // quantity pattern, closing initializer lines) are not visible in this listing -
        // confirm against the complete file.
        public RecipeParsingResult TryParseRawWebPage(RawWebPage rawWebPage)
            // Only pages whose URL matches the recipe pattern are parsed.
            if (!isRecipeUrl_.Match(rawWebPage.Url).Success)
                return(RecipeParsingResult.Error("not a recipe web page"));
            string text = rawWebPage.RawText;

            // Splitting on the body tag must give exactly two pieces; the second is kept.
            string[] headAndBody = bodyTag_.Split(text);
            if (headAndBody.Length < 2)
                return(RecipeParsingResult.Error("body tag not found"));
            if (headAndBody.Length > 2)
                return(RecipeParsingResult.Error("more than one body tag found"));
            text = headAndBody[1];
            // Locate the ingredients heading by its literal markup.
            string[] splitByIngredients = text.Split(new string[] { ">Ingredients<" }, StringSplitOptions.RemoveEmptyEntries);
            if (splitByIngredients.Length < 2)
                // if the hail mary thing didn't work, then complain
                // The error message is specialized by title / URL prefix so that failures
                // on non-recipe pages can be told apart.
                if (startsWithNumbers_.Match(rawWebPage.Name).Success)
                    return(RecipeParsingResult.Error("didn't find ingredients, but the title starts with numbers"));
                else if (rawWebPage.Url.StartsWith("/all-about-"))
                    return(RecipeParsingResult.Error("didn't find ingredients, but this is an 'all about' web page"));
                else if (rawWebPage.Url.StartsWith("/about-"))
                    return(RecipeParsingResult.Error("didn't find ingredients, but this is an 'about' web page"));
                else if (rawWebPage.Url.StartsWith("/are-"))
                    return(RecipeParsingResult.Error("didn't find ingredients, but this is an 'are' web page"));
                else if (rawWebPage.Url.StartsWith("/is-"))
                    return(RecipeParsingResult.Error("didn't find ingredients, but this is an 'is' web page"));
                    // NOTE(review): presumably the fallback else branch - its else keyword is
                    // not visible here.
                    return(RecipeParsingResult.Error("didn't find ingredients"));
            if (splitByIngredients.Length > 2)
                // hail mary #2
                // Retry with the "ngredients:<" form of the heading and adopt that split only
                // if it yields exactly two pieces.
                string[] splitByIngredientsLt = text.Split(new string[] { "ngredients:<" }, StringSplitOptions.RemoveEmptyEntries);
                if (splitByIngredientsLt.Length == 2)
                    splitByIngredients = splitByIngredientsLt;
            if (splitByIngredients.Length > 2)
                return(RecipeParsingResult.Error("more than one ingredient section found"));
            // Keep only what follows the ingredients heading.
            text = splitByIngredients[1];
            // The ingredient list ends at the directions heading when the text contains one,
            // otherwise at the first closing list tag.
            string[] splitByEndOfListOrDirections = text.Contains("irections:") ?
                                                    text.Split(new string[] { "irections:" }, StringSplitOptions.RemoveEmptyEntries) :
                                                    text.Split(new string[] { "</ul>" }, StringSplitOptions.RemoveEmptyEntries);
            if (splitByEndOfListOrDirections.Length < 2)
                return(RecipeParsingResult.Error("end-of-list not found"));
            text = splitByEndOfListOrDirections[0];
            // If the piece after the delimiter still looks like ingredients, stitch it back on.
            // NOTE(review): the Length > 1 test is always true after the guard above, and the
            // pieces are rejoined with "</ul>" even when the split was on "irections:" - verify.
            if (splitByEndOfListOrDirections.Length > 1 && RecipeParsingUtils.LooksLikeMoreIngredients(splitByEndOfListOrDirections[1], isListElement_, isListElementEnd_))
                text = splitByEndOfListOrDirections[0] + "</ul>" + splitByEndOfListOrDirections[1];
            Match element = isListElement_.Match(text);

            if (!element.Success)
                return(RecipeParsingResult.Error("not a single ingredient found"));

            List <string> ingredients = new List <string>();

            // Walk every list-element match and extract its ingredient text.
            // NOTE(review): the statement guarded by the null check and the statement that
            // adds to the ingredients list are not visible here.
            while (element.Success)
                string ingredient = RecipeParsingUtils.ExtractIngredient(text, element, isListElementEnd_);
                if (null == ingredient)
                element = element.NextMatch();

            // Pattern for the leading quantity of an ingredient line: optional range prefix,
            // amount (digits, fractions or number words), unit, parenthesized note, container
            // word, and "of". Matching is case-insensitive.
            // NOTE(review): the first fragment of the pattern is not visible here (the first
            // visible fragment starts with '+'). The Regex is also constructed on every call.
            var quantityFilter = new Regex(
                + "([0-9/⁄]+\\sto\\s)?"
                + "(½|[0-9/\\- ⁄]+|0\\.5|0\\.25|0\\.75|1\\.5|half|one third|one quarter|one|two|three|four|five|six|seven|eight|nine|ten|a few)?"
                + "(\\s?(lbs?|pounds?|grams?|quarts?|oz|ounces?|cups?|tsps?|tbsp?|tbsps?|tbsps?|half|halves|thirds?|quarters?|pinch(es)?|cloves?|links?|sprigs?|tea\\s?spoons?|table\\s?spoons?|inch(es)?)\\.?)?"
                + "(\\s?\\([^)]+\\))?"
                + "(\\s?(bag|pouch|box|bottle|pack)[^a-zA-Z]+)?"
                + "(\\s?of)?"
                , RegexOptions.IgnoreCase);

            Recipe result = new Recipe {
                OriginalWebPage = rawWebPage

            // Split each ingredient line into quantity, name and trailing detail.
            foreach (string ingredientText in ingredients)
                string quantity = "", detail = "";
                string ingredient    = ingredientText;
                var    quantityFound = quantityFilter.Match(ingredient);
                if (quantityFound.Success)
                    quantity   = quantityFound.Value.Trim();
                    // The remainder is taken from offset quantity.Length.
                    // NOTE(review): that offset is the TRIMMED match length and assumes the
                    // match starts at index 0 - verify against the full pattern.
                    ingredient = ingredientText.Substring(quantity.Length, ingredientText.Length - quantity.Length).Trim();
                // IndexOf returns -1 when nothing is found; cast to uint that becomes the
                // largest value, so Math.Min picks the earliest separator actually present
                // and casts back to -1 only when neither ',' nor '(' occurs.
                // NOTE(review): relies on unchecked arithmetic for the casts - confirm.
                uint commaFound  = (uint)ingredient.IndexOf(',');
                uint braceFound  = (uint)ingredient.IndexOf('(');
                uint detailFound = Math.Min(commaFound, braceFound);
                if (-1 != (int)detailFound)
                    // Everything from the separator on is the detail; the name is what precedes it.
                    detail     = ingredient.Substring((int)detailFound, ingredient.Length - (int)detailFound);
                    ingredient = ingredient.Substring(0, (int)detailFound);
                    detail     = detail.Trim(',', '\t', ' ').Trim();
                result.Ingredients.Add(new Ingredient()
                    Declaration = ingredientText,
                    Quantity    = quantity,
                    Name        = new IngredientName()
                        Name = RecipeDownloader.RemoveSpecialCharacters(ingredient)
                    Detail = detail,
            // NOTE(review): the guard rejects fewer than two ingredients, but the message says
            // "less than three" - one of the two is wrong. The check also runs only after
            // the recipe has been fully populated.
            if (ingredients.Count < 2)
                return(RecipeParsingResult.Error("less than three ingredients"));
            return(RecipeParsingResult.Success(result)); // no errors