예제 #1
0
        /// <summary>
        /// Crawls the given nodes: every attribute value that <c>RegexTool.isUrl</c> accepts and that
        /// <c>isVisited</c> does not report as visited is downloaded with <c>web.Load</c>. A page accepted
        /// by <c>hasRecipe</c> is handed to <c>GetMicroData</c>; for any other page the nodes matched by
        /// <c>.//article/a/@href</c> are crawled through a recursive call.
        /// </summary>
        /// <param name="nodes">Nodes whose attributes are inspected for URLs.</param>
        /// <param name="dataformat">Format selector forwarded unchanged to <c>hasRecipe</c> and <c>GetMicroData</c>.</param>
        public void scrapData(HtmlNode[] nodes, int dataformat)
        {
            int totalNodes = nodes.Count();

            // Publish the size of this batch to the counters and to the progress bar.
            // NOTE(review): a recursive call runs these assignments again while itemCount keeps
            // accumulating, so the percentage shown after a nested crawl may not be meaningful - confirm.
            cantidad                       = totalNodes;
            lbTotalNodosIdentificados.Text = totalNodes.ToString();
            cantNodeIndetified             = totalNodes;
            progressBarBusqueda.Maximum    = Convert.ToInt32(cantidad);
            progressBarBusqueda.Step       = 1;

            foreach (HtmlNode node in nodes)
            {
                // One progress step per node, repainted immediately.
                itemCount++;
                lbTotalNodosEvaluados.Text = itemCount.ToString();
                percent        = Math.Round((itemCount / cantidad * 100), 2, MidpointRounding.AwayFromZero);
                lbPorcent.Text = percent.ToString() + "%";
                progressBarBusqueda.PerformStep();
                progressBarBusqueda.Refresh();
                this.Refresh();

                foreach (HtmlAttribute attribute in node.Attributes)
                {
                    string currentUrl = attribute.Value.ToString();

                    lbEnlaceActual.Text = currentUrl;
                    lbEnlaceActual.Refresh();

                    // Only well-formed URLs that have not been seen yet are downloaded.
                    if (!RegexTool.isUrl(currentUrl) || isVisited(currentUrl))
                    {
                        continue;
                    }

                    HtmlAgilityPack.HtmlDocument page = web.Load(currentUrl);

                    if (hasRecipe(page, dataformat))
                    {
                        addValueToDic(currentUrl, "");
                        lbTotalRegistroEncontrados.Refresh();
                        GetMicroData(page, dataformat);
                        continue;
                    }

                    // Not a recipe page: register it, then follow the article links it contains.
                    var nestedLinks = page.DocumentNode.SelectNodes(".//article/a/@href");
                    addValueToDic(currentUrl, "");
                    if (nestedLinks != null)
                    {
                        scrapData(nestedLinks.ToArray(), dataformat);
                    }
                }
            }
        }
        /// <summary>
        /// Opens <paramref name="Url"/> in a WebBrowser control, clicks the element with id
        /// "btnMoreResults", scans the attributes of the nodes matched by <c>.//a/@href</c> and passes
        /// every linked page accepted by <c>hasRecipe</c> to <c>getMicroData</c>. Ends by printing
        /// "Listo." and waiting for a line on the console.
        /// </summary>
        /// <param name="Url">Address loaded into the browser control.</param>
        /// <param name="cantidad">Recipe count used as the loop bound and as the progress denominator.</param>
        public void scrapData(string Url, int cantidad)
        {
            // browser is declared outside this method; a fresh control is created on every call.
            browser = new System.Windows.Forms.WebBrowser();
            browser.DocumentCompleted += new System.Windows.Forms.WebBrowserDocumentCompletedEventHandler(DocumentCompleted);

            HtmlDocument document = null;
            int          contador = 0;
            HtmlWeb      web      = new HtmlWeb();

            // The bound is only tested here, between passes; the inner loops never re-check it,
            // so contador can grow past cantidad within a single pass.
            while (contador <= cantidad)
            {
                browser.Navigate(Url);
                browser.Visible = true;
                browser.Refresh();
                // completed is not assigned anywhere in this method.
                // NOTE(review): presumably the DocumentCompleted handler toggles it - confirm, and
                // confirm that looping WHILE it is true (rather than until it becomes true) is intended.
                while (completed)
                {
                    // Pump pending window messages, then pause 100 ms.
                    System.Windows.Forms.Application.DoEvents();
                    Thread.Sleep(100);
                    // NOTE(review): botonClik is used without a null check - confirm the button always exists.
                    botonClik = browser.Document.GetElementById("btnMoreResults");
                    botonClik.InvokeMember("click");
                    // NOTE(review): document has the same HtmlDocument type that web.Load returns below;
                    // casting the browser's DomDocument to it looks suspicious - confirm this cast can succeed.
                    document = (HtmlDocument)browser.Document.DomDocument;
                    // NOTE(review): no null check on the SelectNodes result before ToArray() - confirm
                    // what the library returns when nothing matches.
                    HtmlNode[] nodes = document.DocumentNode.SelectNodes(".//a/@href").ToArray();
                    Console.Write("Buscando Recetas.....");
                    using (var progress = new ProgressBar())
                    {
                        foreach (HtmlNode enlace in nodes)
                        {
                            foreach (HtmlAttribute atributo in enlace.Attributes)
                            {
                                // Every attribute value that looks like a URL is downloaded and inspected.
                                if (RegexTool.isUrl(atributo.Value))
                                {
                                    HtmlDocument recipe = web.Load(atributo.Value);
                                    if (hasRecipe(recipe))
                                    {
                                        contador++;
                                        // Reported value is the fraction contador / cantidad.
                                        progress.Report((double)contador / cantidad);
                                        Thread.Sleep(20);
                                        getMicroData(recipe);
                                    }
                                }
                            }
                        }
                    }
                }
            }
            Console.WriteLine("Listo.");
            // Blocks until a line is entered on the console.
            Console.ReadLine();
        }
        /// <summary>
        /// Reads the schema.org Recipe microdata of a page (ingredients and nutrition facts), echoes it
        /// to the console, stores it through <c>ClDataAcces.insertRecipeData</c> and returns it.
        /// </summary>
        /// <param name="Document">Parsed page to read.</param>
        /// <returns>
        /// A DataSet holding two tables: the recipe rows (Name, Ingrediente) and the nutrition rows
        /// (one per Recipe item). Both tables are empty, and nothing is stored, when the page contains
        /// no Recipe item.
        /// </returns>
        public DataSet getMicroData(HtmlDocument Document)
        {
            ClDataAcces objDataAccess = new ClDataAcces();
            DataSet     recipeData    = new DataSet();
            DataTable   recipe        = new DataTable();
            DataTable   nutritionData = new DataTable();

            // Nutrients the page does not publish stay at "0" so the decimal conversion cannot fail on them.
            string calories = "0", fat = "0", saturatefat = "0",
                   fiber = "0", carbohydrate = "0", protein = "0",
                   cholesterol = "0", sugar = "0", sodium = "0";

            // RECIPE DATA
            recipe.Columns.Add("Name");
            recipe.Columns.Add("Ingrediente");

            // NUTRITION DATA
            nutritionData.Columns.Add("Calorias", typeof(decimal));
            nutritionData.Columns.Add("fat", typeof(decimal));
            nutritionData.Columns.Add("saturatefat", typeof(decimal));
            nutritionData.Columns.Add("fiber", typeof(decimal));
            nutritionData.Columns.Add("carbohydrate", typeof(decimal));
            nutritionData.Columns.Add("protein", typeof(decimal));
            nutritionData.Columns.Add("cholesterol", typeof(decimal));
            nutritionData.Columns.Add("sugar", typeof(decimal));
            nutritionData.Columns.Add("sodium", typeof(decimal));

            recipeData.Tables.Add(recipe);
            recipeData.Tables.Add(nutritionData);

            // Every SelectNodes result is checked for null before it is enumerated.
            var recipeNodes = Document.DocumentNode.SelectNodes(".//*[@itemtype='http://schema.org/Recipe']");
            if (recipeNodes == null)
            {
                // No Recipe item on this page: nothing to store.
                return(recipeData);
            }

            // The title is located through a fixed absolute path; fall back to an empty name when the
            // page layout does not match it.
            HtmlNode nameNode = Document.DocumentNode.SelectSingleNode("/html/body/div[2]/div/div[1]/div[3]/section[1]/section[2]/div/h3");
            string   name     = nameNode != null ? nameNode.InnerText : string.Empty;

            // The nutrition query runs against the whole document, so it is evaluated once up front.
            var nutritionNodes = Document.DocumentNode.SelectNodes(".//*[@itemtype='http://schema.org/NutritionInformation']");

            foreach (HtmlNode item in recipeNodes)
            {
                Console.WriteLine("INGREDIENTES\r");
                Console.WriteLine("-----------------------------------------------------------------------");

                var ingredientNodes = item.SelectNodes(".//*[@itemprop='ingredients']");
                if (ingredientNodes != null)
                {
                    foreach (HtmlNode ingredientNode in ingredientNodes)
                    {
                        string ingredient = ingredientNode.InnerText.Trim();

                        // NOTE(review): one row is added per ATTRIBUTE of the ingredient node, so a node
                        // with several attributes yields repeated rows - kept as is; confirm it is intended.
                        for (int ind = 0; ind < ingredientNode.Attributes.Count; ind++)
                        {
                            Console.WriteLine(ingredientNode.Attributes[ind].Name + ": " +
                                              ingredientNode.Attributes[ind].Value.ToString() + " ---> " +
                                              ingredient + "\r");
                            recipe.Rows.Add(name, ingredient);
                        }
                    }
                }

                Console.WriteLine("INFORMACION DE NUTRICION\r");
                Console.WriteLine("-----------------------------------------------------------------------");

                if (nutritionNodes != null)
                {
                    foreach (HtmlNode nutritionNode in nutritionNodes)
                    {
                        var propertyNodes = nutritionNode.SelectNodes(".//*[@itemprop]");
                        if (propertyNodes == null)
                        {
                            continue;
                        }

                        foreach (HtmlNode propertyNode in propertyNodes)
                        {
                            string number = RegexTool.GetNumber(propertyNode.InnerText);

                            // The itemprop name can sit in any attribute, so every attribute value is tested.
                            for (int ind = 0; ind < propertyNode.Attributes.Count; ind++)
                            {
                                string attributeValue = propertyNode.Attributes[ind].Value;

                                Console.WriteLine(attributeValue + ": " + number + "\r");
                                switch (attributeValue)
                                {
                                case "calories":
                                    calories = number;
                                    break;

                                case "fatContent":
                                    fat = number;
                                    break;

                                case "saturatedFatContent":
                                    saturatefat = number;
                                    break;

                                case "fiberContent":
                                    fiber = number;
                                    break;

                                case "carbohydrateContent":
                                    carbohydrate = number;
                                    break;

                                case "proteinContent":
                                    protein = number;
                                    break;

                                case "cholesterolContent":
                                    cholesterol = number;
                                    break;

                                case "sugarContent":
                                    sugar = number;
                                    break;

                                case "sodiumContent":
                                    sodium = number;
                                    break;
                                }
                            }
                        }
                    }
                }

                nutritionData.Rows.Add(ParseNutrientOrZero(calories), ParseNutrientOrZero(fat), ParseNutrientOrZero(saturatefat),
                                       ParseNutrientOrZero(fiber), ParseNutrientOrZero(carbohydrate), ParseNutrientOrZero(protein),
                                       ParseNutrientOrZero(cholesterol), ParseNutrientOrZero(sugar), ParseNutrientOrZero(sodium));
            }

            objDataAccess.insertRecipeData(recipeData, false);
            Console.WriteLine("Datos Registrados con exito");
            return(recipeData);
        }

        // Converts extracted nutrient text to decimal; text that is not a number counts as 0.
        private static decimal ParseNutrientOrZero(string text)
        {
            decimal value;
            return decimal.TryParse(text, out value) ? value : 0m;
        }
예제 #4
0
        /// <summary>
        /// Extracts one recipe (ingredients, nutrition facts and picture) from a page and stores it
        /// through <c>ClDataAcces.insertRecipeData</c>, updating the error / found counters and labels.
        /// </summary>
        /// <param name="Document">Parsed page to read.</param>
        /// <param name="dataFormat">
        /// 0 reads schema.org microdata attributes; 1 reads the page's <c>application/ld+json</c> script.
        /// Any other value reads nothing.
        /// </param>
        /// <returns>
        /// The DataSet handed to <c>insertRecipeData</c> (recipe table, then nutrition table). It contains
        /// no tables when nothing was stored.
        /// </returns>
        public DataSet GetMicroData(HtmlAgilityPack.HtmlDocument Document, int dataFormat)
        {
            ClDataAcces objDataAccess = new ClDataAcces();
            DataSet     recipeData    = new DataSet();
            DataTable   recipeTable   = BuildRecipeTable();
            DataTable   nutritionData = BuildNutritionTable();
            string      appRoute      = Application.StartupPath;

            switch (dataFormat)
            {
            case 0:
                ReadMicrodataRecipe(Document, objDataAccess, recipeData, recipeTable, nutritionData, appRoute);
                break;

            case 1:
                ReadJsonLdRecipe(Document, objDataAccess, recipeData, recipeTable, nutritionData, appRoute);
                break;
            }
            return(recipeData);
        }

        // schema.org NutritionInformation property names, listed in the column order of the nutrition table.
        private static readonly string[] MicroDataNutrientKeys =
        {
            "calories", "fatContent", "saturatedFatContent", "fiberContent", "carbohydrateContent",
            "proteinContent", "cholesterolContent", "sugarContent", "sodiumContent"
        };

        // Format 0: reads the recipe from schema.org microdata attributes and stores it.
        private void ReadMicrodataRecipe(HtmlAgilityPack.HtmlDocument Document, ClDataAcces objDataAccess, DataSet recipeData,
                                         DataTable recipeTable, DataTable nutritionData, string appRoute)
        {
            var      recipeNodes = Document.DocumentNode.SelectNodes(".//*[@itemtype='http://schema.org/Recipe']");
            HtmlNode nameNode    = Document.DocumentNode.SelectSingleNode(".//*[@class='recipeDetailHeader showOnTabletToDesktop']");

            // Both lookups are null when nothing matches; without them there is no recipe to store.
            if (recipeNodes == null || nameNode == null)
            {
                ReportScrapeError("No schema.org Recipe item or recipe title found in the page.", "\r MicroData scraper ");
                return;
            }

            string imagenUrl = Document.DocumentNode.Descendants("img")
                               .Where(node => node.Attributes["class"] != null && node.Attributes["class"].Value == "recipeDetailSummaryImageMain")
                               .Select(node => node.Attributes["src"] != null ? node.Attributes["src"].Value : string.Empty)
                               .DefaultIfEmpty(string.Empty)
                               .FirstOrDefault() ?? string.Empty;

            // A page without a usable picture is still scraped; the Picture column then stays empty.
            Uri    pictureUri;
            string pictureRoute;
            if (TryResolvePicture(imagenUrl, appRoute, out pictureUri, out pictureRoute))
            {
                StartPictureDownload(pictureUri, pictureRoute);
            }

            string name = nameNode.InnerText;

            // The nutrition query runs against the whole document, so it is evaluated once up front.
            var nutritionNodes = Document.DocumentNode.SelectNodes(".//*[@itemtype='http://schema.org/NutritionInformation']");
            Dictionary <string, string> nutrients = NewNutrientValues();

            foreach (HtmlNode item in recipeNodes)
            {
                var ingredientNodes = item.SelectNodes(".//*[@itemprop='ingredients']");
                if (ingredientNodes != null)
                {
                    foreach (HtmlNode ingredientNode in ingredientNodes)
                    {
                        string ingredient = ingredientNode.InnerText.Trim();

                        // NOTE(review): one row is added per ATTRIBUTE of the ingredient node, so a node
                        // with several attributes yields repeated rows - kept as is; confirm it is intended.
                        for (int ind = 0; ind < ingredientNode.Attributes.Count; ind++)
                        {
                            AddRecipeRow(recipeTable, name, ingredient, pictureRoute);
                        }
                    }
                }

                if (nutritionNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode nutritionNode in nutritionNodes)
                {
                    var propertyNodes = nutritionNode.SelectNodes(".//*[@itemprop]");
                    if (propertyNodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode propertyNode in propertyNodes)
                    {
                        string number = RegexTool.GetNumber(propertyNode.InnerText);

                        // The itemprop name can sit in any attribute, so every attribute value is tested.
                        for (int ind = 0; ind < propertyNode.Attributes.Count; ind++)
                        {
                            string attributeValue = propertyNode.Attributes[ind].Value;

                            Console.WriteLine(attributeValue + ": " + number + "\r");
                            if (propertyNode.InnerText != "" && nutrients.ContainsKey(attributeValue))
                            {
                                nutrients[attributeValue] = number;
                            }
                        }
                    }
                }

                try
                {
                    AddNutritionRow(nutritionData, nutrients);
                }
                catch (Exception error)
                {
                    // A value that cannot be converted leaves the recipe without a nutrition row (so it
                    // is not stored); the failure is reported instead of being dropped silently.
                    ReportScrapeError(error.Message, "\r MicroData scraper ");
                }
            }

            if (nutritionData.Rows.Count > 0)
            {
                StoreRecipe(objDataAccess, recipeData, recipeTable, nutritionData, "\r MicroData scraper ");
            }
        }

        // Format 1: reads the recipe from the page's JSON-LD script and stores it when it carries nutrition data.
        private void ReadJsonLdRecipe(HtmlAgilityPack.HtmlDocument Document, ClDataAcces objDataAccess, DataSet recipeData,
                                      DataTable recipeTable, DataTable nutritionData, string appRoute)
        {
            // RETRIEVE THE JSON VALUE EMBEDDED IN THE PAGE
            HtmlNode scriptNode = Document.DocumentNode.SelectSingleNode("//script[@type='application/ld+json']");
            if (scriptNode == null)
            {
                ReportScrapeError("No application/ld+json script found in the page.", "\r Json-Ld Scraper ");
                return;
            }

            Dictionary <string, string> nutrients      = NewNutrientValues();
            IList <string>              listIngredient = null;
            dynamic                     jsonData       = null;

            try
            {
                // DESERIALIZE THE JSON INTO A DYNAMIC OBJECT
                jsonData = JsonConvert.DeserializeObject(scriptNode.InnerText);

                jsonData.nutrition.Remove("@type");
                Dictionary <string, string> values = jsonData.nutrition.ToObject <Dictionary <string, string> >();
                listIngredient = jsonData.recipeIngredient.ToObject <IList <string> >();

                // RETRIEVE THE NUTRITION VALUES
                foreach (KeyValuePair <string, string> valor in values)
                {
                    if (nutrients.ContainsKey(valor.Key))
                    {
                        nutrients[valor.Key] = RegexTool.GetNumber(valor.Value);
                    }
                }
            }
            catch (Exception e)
            {
                ReportScrapeError(e.Message, "\r Json-Ld Scraper ");
            }

            if (listIngredient == null)
            {
                // The JSON could not be read far enough to know the ingredients; the failure has
                // already been reported above and there is nothing to store.
                return;
            }

            // RETRIEVE THE RECIPE PICTURE (resolved before the rows are built so they carry its path)
            string imagenUrl = string.Empty;
            try
            {
                imagenUrl = jsonData.image;
            }
            catch (Exception imageError)
            {
                // "image" is not a plain string in this document; the picture is optional, so the
                // recipe is processed without one.
                LogScrapeMessage(imageError.Message);
                imagenUrl = string.Empty;
            }

            Uri    pictureUri   = null;
            string pictureRoute = string.Empty;
            bool   hasPicture   = !string.IsNullOrEmpty(imagenUrl) && RegexTool.isUrl(imagenUrl) &&
                                  TryResolvePicture(imagenUrl, appRoute, out pictureUri, out pictureRoute);

            try
            {
                nutritionData.Rows.Clear();
                AddNutritionRow(nutritionData, nutrients);

                // RETRIEVE THE INGREDIENTS
                string name = jsonData.name;
                foreach (string ingrediente in listIngredient)
                {
                    AddRecipeRow(recipeTable, name, ingrediente, pictureRoute);
                }
            }
            catch (Exception error)
            {
                ReportScrapeError(error.Message, "\r Json-Ld Scraper ");
            }

            // Only recipes with at least one non-zero nutrition value are stored.
            if (nutritionData.Rows.Count > 0 && HasNonZeroValue(nutritionData.Rows[0]))
            {
                if (hasPicture)
                {
                    StartPictureDownload(pictureUri, pictureRoute);
                }
                StoreRecipe(objDataAccess, recipeData, recipeTable, nutritionData, string.Empty);
            }
        }

        // Creates the empty recipe table (one row per ingredient plus the classification chosen in the form).
        private static DataTable BuildRecipeTable()
        {
            DataTable table = new DataTable();

            table.Columns.Add("Name", typeof(string));
            table.Columns.Add("Ingrediente", typeof(string));
            table.Columns.Add("recipeTipoPlatoData", typeof(string));
            table.Columns.Add("recipeCulturaData", typeof(string));
            table.Columns.Add("recipeNacionalidadData", typeof(string));
            table.Columns.Add("recipeMomentoData", typeof(string));
            table.Columns.Add("recipeTemporadaData", typeof(string));
            table.Columns.Add("Picture", typeof(string));
            table.Columns.Add("Origen", typeof(string));

            table.Columns.Add("esSopa", typeof(Boolean));
            table.Columns.Add("esPasta", typeof(Boolean));
            table.Columns.Add("esMarisco", typeof(Boolean));
            table.Columns.Add("esEnsalada", typeof(Boolean));
            table.Columns.Add("esBebida", typeof(Boolean));
            table.Columns.Add("esBajoColesterol", typeof(Boolean));
            table.Columns.Add("esBajoEnCalorias", typeof(Boolean));
            table.Columns.Add("esLibreGluten", typeof(Boolean));
            return table;
        }

        // Creates the empty nutrition table; its column order matches MicroDataNutrientKeys.
        private static DataTable BuildNutritionTable()
        {
            DataTable table = new DataTable();

            table.Columns.Add("Calorias", typeof(decimal));
            table.Columns.Add("fat", typeof(decimal));
            table.Columns.Add("saturatefat", typeof(decimal));
            table.Columns.Add("fiber", typeof(decimal));
            table.Columns.Add("carbohydrate", typeof(decimal));
            table.Columns.Add("protein", typeof(decimal));
            table.Columns.Add("cholesterol", typeof(decimal));
            table.Columns.Add("sugar", typeof(decimal));
            table.Columns.Add("sodium", typeof(decimal));
            return table;
        }

        // Returns a nutrient map in which every known nutrient starts at "0".
        private static Dictionary <string, string> NewNutrientValues()
        {
            Dictionary <string, string> values = new Dictionary <string, string>();

            foreach (string key in MicroDataNutrientKeys)
            {
                values[key] = "0";
            }
            return values;
        }

        // Appends one nutrition row built from the nutrient map; throws when a value is not a valid decimal.
        private static void AddNutritionRow(DataTable nutritionData, Dictionary <string, string> nutrients)
        {
            object[] row = new object[MicroDataNutrientKeys.Length];

            for (int i = 0; i < row.Length; i++)
            {
                row[i] = Convert.ToDecimal(nutrients[MicroDataNutrientKeys[i]]);
            }
            nutritionData.Rows.Add(row);
        }

        // Tells whether any decimal cell of the row differs from zero (numeric check, so 0.0 counts as zero).
        private static bool HasNonZeroValue(DataRow row)
        {
            return row.ItemArray.Any(value => value is decimal && (decimal)value != 0m);
        }

        // Appends one ingredient row, completed with the classification currently selected in the form.
        private void AddRecipeRow(DataTable recipeTable, string name, string ingredient, string pictureRoute)
        {
            recipeTable.Rows.Add(name, ingredient, CboTipoPlato.Text, cboCultura.Text,
                                 cboNacionalidad.Text, cboMomentoComida.Text, cboTemporada.Text, pictureRoute,
                                 txtOrigenRegistro.Text, chkSopa.Checked,
                                 chkPasta.Checked, chkPescadoMarisco.Checked, chkEnsalada.Checked,
                                 chkBebida.Checked, chkBajoColesterol.Checked, chkBajoCalorias.Checked,
                                 chkLibreGluten.Checked);
        }

        // Parses an absolute picture URL and derives the local file path it will be saved to.
        // Returns false (and an empty path) when the URL is missing or not absolute.
        private static bool TryResolvePicture(string imageUrl, string appRoute, out Uri pictureUri, out string pictureRoute)
        {
            pictureRoute = string.Empty;
            if (!Uri.TryCreate(imageUrl, UriKind.Absolute, out pictureUri))
            {
                pictureUri = null;
                return false;
            }

            string filename = System.IO.Path.GetFileName(pictureUri.LocalPath);

            // NOTE(review): dropping the last 10 characters presumably strips a trailing "\bin\Debug"
            // from the startup path - confirm for other build configurations.
            pictureRoute = appRoute.Remove(appRoute.Length - 10) + "\\Picture\\" + filename;
            return true;
        }

        // Starts the asynchronous picture download and releases the client once the transfer has ended.
        private void StartPictureDownload(Uri pictureUri, string pictureRoute)
        {
            WebClient cliente = new WebClient();

            cliente.DownloadFileCompleted += new AsyncCompletedEventHandler(Cliente_DownloadFileCompleted);
            cliente.DownloadFileCompleted += delegate { cliente.Dispose(); };
            try
            {
                cliente.DownloadFileAsync(pictureUri, pictureRoute);
            }
            catch
            {
                // The transfer never started, so the completion handler will not run.
                cliente.Dispose();
                throw;
            }
        }

        // Sends the collected tables to the data layer and updates the found / error counters.
        private void StoreRecipe(ClDataAcces objDataAccess, DataSet recipeData, DataTable recipeTable,
                                 DataTable nutritionData, string errorLabelSuffix)
        {
            recipeData.Tables.Add(recipeTable);
            recipeData.Tables.Add(nutritionData);

            string dataError = objDataAccess.insertRecipeData(recipeData, chkClasificacion.Checked);
            if (dataError != "")
            {
                ReportScrapeError(dataError, errorLabelSuffix);
            }
            else
            {
                contRegistroEncontrados++;
                lbTotalRegistroEncontrados.Text = contRegistroEncontrados.ToString();
            }
        }

        // Counts one error, shows the new total followed by labelSuffix and writes the message to the log.
        private void ReportScrapeError(string message, string labelSuffix)
        {
            errorCount++;
            lbErrorCount.Text = errorCount.ToString() + labelSuffix;
            LogScrapeMessage(message);
        }

        // Writes a time-stamped message to the log file without touching the error counter.
        private void LogScrapeMessage(string message)
        {
            Ftool.WriteLogFile(DateTime.Now.ToShortDateString() + " " + DateTime.Now.ToShortTimeString() + " " + message);
        }