コード例 #1
0
        /// <summary>
        /// Sets up a downloader for one recipe source and rebuilds the crawl state
        /// (discovered, downloaded and still-pending URLs) from the pages that are
        /// already saved in the source's folder under <paramref name="path"/>.
        /// </summary>
        /// <param name="nextUrl">URL to download first. If a saved page already covers it, <c>PickNextUrl</c> is called to choose another one.</param>
        /// <param name="path">Parent directory of the per-source folder. The folder is created when it does not exist yet.</param>
        /// <param name="source">Name of the recipe source; also the name of the folder below <paramref name="path"/> that is scanned.</param>
        /// <param name="urlRoot">Value stored in <c>urlRoot_</c>.</param>
        public RecipeDownloader(string nextUrl, string path, string source, string urlRoot)
        {
            // things not specific to Kraft
            nextUrl_ = nextUrl;
            source_  = source;
            urlRoot_ = urlRoot;

            // Path.Combine uses the platform's directory separator instead of a hard-coded
            // backslash. CreateDirectory does nothing when the folder already exists and
            // returns its DirectoryInfo, so a separate Exists check is not needed.
            DirectoryInfo downloaded = Directory.CreateDirectory(Path.Combine(path, source_));

            foreach (FileInfo file in downloaded.GetFiles())
            {
                using (StreamReader input = file.OpenText())
                {
                    try
                    {
                        RawWebPage rwp = (RawWebPage)serializer_.Deserialize(input);
                        if (null != rwp)
                        {
                            // Normalize every URL first; the collections are only touched once
                            // all of them were trimmed successfully.
                            for (int index = 0; index != rwp.ReferencedUrls.Length; ++index)
                            {
                                rwp.ReferencedUrls[index] = TrimUrlArgs(rwp.ReferencedUrls[index]);
                            }
                            rwp.Url = TrimUrlArgs(rwp.Url);

                            discoveredUrls_.Add(rwp.Url);
                            downloadedRecipes_[rwp.Url] = file.FullName;
                            foreach (var referencedUrl in rwp.ReferencedUrls)
                            {
                                discoveredUrls_.Add(referencedUrl);
                            }

                            // NOTE(review): rwp is not stored or written back anywhere in this
                            // constructor, so this assignment appears to have no lasting effect.
                            rwp.UrlRoot = urlRoot_;
                        }
                    }
                    catch (Exception e)
                    {
                        // Best effort: a file that cannot be read as a page is reported and skipped.
                        Console.Error.WriteLine("Failed to deserialize the contents of {0}: {1}", file.FullName, e.ToString());
                    }
                }
            }

            // Pending URLs = everything discovered minus everything already downloaded.
            foreach (var discovered in discoveredUrls_)
            {
                urlsToDownload_.Add(discovered);
            }
            foreach (var already in downloadedRecipes_)
            {
                urlsToDownload_.Remove(already.Key);
            }

            // if a given url was already downloaded, skip it
            if (downloadedRecipes_.ContainsKey(nextUrl_))
            {
                PickNextUrl();
            }
        }
コード例 #2
0
        /// <summary>
        /// Downloads the page at <c>nextUrl_</c>, records it as downloaded, queues the URLs it
        /// references, picks the next URL to visit and finally serializes the page to disk.
        /// </summary>
        /// <returns>The freshly downloaded page.</returns>
        /// <remarks>
        /// When <c>DownloadRawPage</c> throws, the URL is added to <c>downloadLater_</c> and the
        /// exception is rethrown. The next URL has already been picked at that point.
        /// </remarks>
        public RawWebPage DownloadNext()
        {
            Console.WriteLine("{0}: Downloaded {1} out of {2} discovered recipes ({3})", DateTime.Now, downloadedRecipes_.Count, discoveredUrls_.Count, source_);
            Console.WriteLine("{0}: Downloading {1} ({2})...", DateTime.Now, nextUrl_, source_);

            // Remember the URL being fetched, then advance before doing any I/O.
            string currentUrl = nextUrl_;
            urlsToDownload_.Remove(nextUrl_);
            PickNextUrl();

            string           pageTitle;
            string           pageText;
            HashSet <string> linkedUrls;

            try
            {
                DownloadRawPage(currentUrl, out pageTitle, out pageText, out linkedUrls);
            }
            catch (Exception e)
            {
                Console.WriteLine("Problem when downloading from {0} (will try later): {1}", currentUrl, e.ToString());
                downloadLater_.Add(currentUrl);
                throw;
            }

            RawWebPage page = new RawWebPage
            {
                Url            = currentUrl,
                UrlRoot        = urlRoot_,
                Source         = source_,
                RawText        = pageText,
                UtcTimeStamp   = DateTime.UtcNow,
                Name           = pageTitle,
                ReferencedUrls = linkedUrls.ToArray()
            };

            // Build the file name from the title: spaces become underscores, then every character
            // that is invalid in a file name or path is dropped (split on them and glue the
            // remaining pieces back together).
            char[] forbidden = (new string(Path.GetInvalidFileNameChars()) + new string(Path.GetInvalidPathChars())).ToCharArray();
            string baseName  = pageTitle.Replace(' ', '_') + ".raw";
            string safeName  = string.Concat(baseName.Split(forbidden));

            // NOTE(review): this path is relative (source_/...), whereas the constructor scans
            // path\source_ — confirm both resolve to the same folder.
            string filename = source_ + "/" + safeName;

            downloadedRecipes_[currentUrl] = filename;

            // very important to do this after storing the new page
            foreach (var linked in linkedUrls)
            {
                discoveredUrls_.Add(linked);
                if (downloadedRecipes_.ContainsKey(linked))
                {
                    continue;
                }
                urlsToDownload_.Add(linked);
            }

            // in case we now have more urls to choose from, choose again
            PickNextUrl();

            // save to disk now (because even if this results in an exception, we already advanced the iterator)
            using (TextWriter writer = new StreamWriter(filename))
            {
                serializer_.Serialize(writer, page);
            }
            return page;
        }
コード例 #3
0
        /// <summary>
        /// Attempts to extract a recipe's ingredient list from a downloaded page.
        /// </summary>
        /// <remarks>
        /// The text is narrowed down step by step: the part after the single <c>bodyTag_</c> match,
        /// then the part after the ingredients heading, then the part before "irections:" (or before
        /// the first "&lt;/ul&gt;"). Every <c>isListElement_</c> match in what remains becomes one
        /// ingredient, which is split into quantity, name and detail.
        /// </remarks>
        /// <param name="rawWebPage">The page to parse; its <c>Url</c>, <c>Name</c> and <c>RawText</c> are read.</param>
        /// <returns>
        /// <c>RecipeParsingResult.Success</c> with the parsed recipe, or <c>RecipeParsingResult.Error</c>
        /// with a short reason when one of the parsing steps fails.
        /// </returns>
        public RecipeParsingResult TryParseRawWebPage(RawWebPage rawWebPage)
        {
            if (!isRecipeUrl_.Match(rawWebPage.Url).Success)
            {
                return RecipeParsingResult.Error("not a recipe web page");
            }
            string text = rawWebPage.RawText;

            // Exactly one body tag is expected: one piece before it, one piece after it.
            string[] headAndBody = bodyTag_.Split(text);
            if (headAndBody.Length < 2)
            {
                return RecipeParsingResult.Error("body tag not found");
            }
            if (headAndBody.Length > 2)
            {
                return RecipeParsingResult.Error("more than one body tag found");
            }
            text = headAndBody[1];

            string[] splitByIngredients = text.Split(new string[] { ">Ingredients<" }, StringSplitOptions.RemoveEmptyEntries);
            if (splitByIngredients.Length < 2)
            {
                // No ingredients heading: report the most specific reason we can recognize.
                // URL prefixes are compared ordinally; they are identifiers, not linguistic text.
                if (startsWithNumbers_.Match(rawWebPage.Name).Success)
                {
                    return RecipeParsingResult.Error("didn't find ingredients, but the title starts with numbers");
                }
                if (rawWebPage.Url.StartsWith("/all-about-", StringComparison.Ordinal))
                {
                    return RecipeParsingResult.Error("didn't find ingredients, but this is an 'all about' web page");
                }
                if (rawWebPage.Url.StartsWith("/about-", StringComparison.Ordinal))
                {
                    return RecipeParsingResult.Error("didn't find ingredients, but this is an 'about' web page");
                }
                if (rawWebPage.Url.StartsWith("/are-", StringComparison.Ordinal))
                {
                    return RecipeParsingResult.Error("didn't find ingredients, but this is an 'are' web page");
                }
                if (rawWebPage.Url.StartsWith("/is-", StringComparison.Ordinal))
                {
                    return RecipeParsingResult.Error("didn't find ingredients, but this is an 'is' web page");
                }
                return RecipeParsingResult.Error("didn't find ingredients");
            }
            if (splitByIngredients.Length > 2)
            {
                // hail mary #2: the heading text occurs more than once, try the "ngredients:<" form instead
                string[] splitByIngredientsLt = text.Split(new string[] { "ngredients:<" }, StringSplitOptions.RemoveEmptyEntries);
                if (splitByIngredientsLt.Length == 2)
                {
                    splitByIngredients = splitByIngredientsLt;
                }
            }
            if (splitByIngredients.Length > 2)
            {
                return RecipeParsingResult.Error("more than one ingredient section found");
            }
            text = splitByIngredients[1];

            // The ingredient section ends at "irections:" when present, otherwise at the first "</ul>".
            string   sectionEnd = text.Contains("irections:") ? "irections:" : "</ul>";
            string[] splitByEndOfListOrDirections = text.Split(new string[] { sectionEnd }, StringSplitOptions.RemoveEmptyEntries);
            if (splitByEndOfListOrDirections.Length < 2)
            {
                return RecipeParsingResult.Error("end-of-list not found");
            }
            text = splitByEndOfListOrDirections[0];
            if (RecipeParsingUtils.LooksLikeMoreIngredients(splitByEndOfListOrDirections[1], isListElement_, isListElementEnd_))
            {
                // The following piece still looks like ingredients, so parse both pieces together.
                text = splitByEndOfListOrDirections[0] + "</ul>" + splitByEndOfListOrDirections[1];
            }

            Match element = isListElement_.Match(text);
            if (!element.Success)
            {
                return RecipeParsingResult.Error("not a single ingredient found");
            }

            List <string> ingredients = new List <string>();
            while (element.Success)
            {
                string ingredient = RecipeParsingUtils.ExtractIngredient(text, element, isListElementEnd_);
                if (null == ingredient)
                {
                    break;
                }
                ingredients.Add(ingredient);
                element = element.NextMatch();
            }

            // Reject early, before any per-ingredient work is done.
            // NOTE(review): the message used to say "three" while the condition checks for fewer
            // than two; the message now matches the condition — confirm the intended minimum.
            if (ingredients.Count < 2)
            {
                return RecipeParsingResult.Error("less than two ingredients");
            }

            // Matches an optional leading quantity: range prefix, amount, unit, parenthesized note,
            // packaging word and a trailing "of". Every part is optional, so the match can be empty.
            var quantityFilter = new Regex(
                "^"
                + "([0-9/⁄]+\\sto\\s)?"
                + "(½|[0-9/\\- ⁄]+|0\\.5|0\\.25|0\\.75|1\\.5|half|one third|one quarter|one|two|three|four|five|six|seven|eight|nine|ten|a few)?"
                + "(\\s?(lbs?|pounds?|grams?|quarts?|oz|ounces?|cups?|tsps?|tbsp?|tbsps?|tbsps?|half|halves|thirds?|quarters?|pinch(es)?|cloves?|links?|sprigs?|tea\\s?spoons?|table\\s?spoons?|inch(es)?)\\.?)?"
                + "(\\s?\\([^)]+\\))?"
                + "(\\s?(bag|pouch|box|bottle|pack)[^a-zA-Z]+)?"
                + "(\\s?of)?"
                , RegexOptions.IgnoreCase);

            // The detail part of an ingredient starts at the first comma or opening parenthesis.
            char[] detailSeparators = { ',', '(' };

            Recipe result = new Recipe {
                OriginalWebPage = rawWebPage
            };

            foreach (string ingredientText in ingredients)
            {
                string quantity   = "", detail = "";
                string ingredient = ingredientText;

                Match quantityFound = quantityFilter.Match(ingredientText);
                if (quantityFound.Success)
                {
                    quantity = quantityFound.Value.Trim();
                    // Skip the whole matched prefix. Using the trimmed quantity's length here would
                    // leave part of the quantity in the name whenever the match starts with whitespace.
                    ingredient = ingredientText.Substring(quantityFound.Index + quantityFound.Length).Trim();
                }

                int detailStart = ingredient.IndexOfAny(detailSeparators);
                if (detailStart >= 0)
                {
                    detail     = ingredient.Substring(detailStart).Trim(',', '\t', ' ').Trim();
                    ingredient = ingredient.Substring(0, detailStart);
                }

                result.Ingredients.Add(new Ingredient()
                {
                    Declaration = ingredientText,
                    Quantity    = quantity,
                    Name        = new IngredientName()
                    {
                        Name = RecipeDownloader.RemoveSpecialCharacters(ingredient)
                    },
                    Detail = detail,
                });
            }
            return RecipeParsingResult.Success(result); // no errors
        }