/// <summary>
/// Walks a set of link nodes, loads every not-yet-visited URL found in their
/// attributes and either extracts the recipe on that page or recurses into the
/// article links that page contains. The progress labels and the progress bar
/// are refreshed for every evaluated node.
/// </summary>
/// <param name="nodes">Link nodes to evaluate; <c>null</c> is treated as "nothing to do".</param>
/// <param name="dataformat">Format selector forwarded to <c>hasRecipe</c> and <c>GetMicroData</c>.</param>
public void scrapData(HtmlNode[] nodes, int dataformat)
{
    // A null array used to throw from the first Count() call.
    if (nodes == null)
    {
        return;
    }

    // Count once instead of enumerating the array three times.
    int total = nodes.Length;
    cantidad = total;
    cantNodeIndetified = total;
    lbTotalNodosIdentificados.Text = total.ToString();
    progressBarBusqueda.Maximum = total;
    progressBarBusqueda.Step = 1;

    foreach (HtmlNode enlace in nodes)
    {
        itemCount++;
        lbTotalNodosEvaluados.Text = itemCount.ToString();

        // NOTE(review): itemCount and cantidad are shared fields and the recursive
        // call below overwrites cantidad and the progress bar maximum, so the
        // percentage shown by an outer level is relative to the innermost list
        // processed so far. Confirm the intended progress semantics.
        percent = Math.Round((itemCount / cantidad * 100), 2, MidpointRounding.AwayFromZero);
        lbPorcent.Text = percent.ToString() + "%";
        progressBarBusqueda.PerformStep();
        progressBarBusqueda.Refresh();
        this.Refresh();

        foreach (HtmlAttribute atributo in enlace.Attributes)
        {
            string urlActual = atributo.Value.ToString();
            lbEnlaceActual.Text = urlActual;
            lbEnlaceActual.Refresh();

            if (!RegexTool.isUrl(urlActual) || isVisited(urlActual))
            {
                continue;
            }

            HtmlAgilityPack.HtmlDocument recipe;
            try
            {
                recipe = web.Load(urlActual);
            }
            catch (Exception error)
            {
                // One unreachable page must not abort the whole crawl: remember the
                // URL so it is not retried, count the failure and keep going.
                addValueToDic(urlActual, "");
                errorCount++;
                lbErrorCount.Text = errorCount.ToString();
                Ftool.WriteLogFile(DateTime.Now.ToShortDateString() + " " + DateTime.Now.ToShortTimeString() + " " + urlActual + " " + error.Message);
                continue;
            }

            bool isRecipePage = hasRecipe(recipe, dataformat);
            addValueToDic(urlActual, "");

            if (isRecipePage)
            {
                lbTotalRegistroEncontrados.Refresh();
                GetMicroData(recipe, dataformat);
                continue;
            }

            // Not a recipe page: follow the article links it contains, if any.
            var recetas = recipe.DocumentNode.SelectNodes(".//article/a/@href");
            if (recetas != null)
            {
                scrapData(recetas.ToArray(), dataformat);
            }
        }
    }
}
/// <summary>
/// Opens <paramref name="Url"/> in an embedded browser, clicks the
/// "btnMoreResults" button when it is present, and loads every link found in
/// the rendered page; pages for which <c>hasRecipe</c> is true are handed to
/// <c>getMicroData</c>. Progress is written to the console.
/// </summary>
/// <param name="Url">Address of the listing page to navigate to.</param>
/// <param name="cantidad">Number of recipes to collect; the loops run while the counter is not greater than this value.</param>
public void scrapData(string Url, int cantidad)
{
    browser = new System.Windows.Forms.WebBrowser();
    browser.DocumentCompleted += new System.Windows.Forms.WebBrowserDocumentCompletedEventHandler(DocumentCompleted);

    int contador = 0;
    HtmlWeb web = new HtmlWeb();

    while (contador <= cantidad)
    {
        browser.Navigate(Url);
        browser.Visible = true;
        browser.Refresh();

        // NOTE(review): `completed` is a field that is not assigned in this method;
        // presumably the DocumentCompleted handler sets it - confirm. The quota is
        // also checked here so the loop ends once enough recipes were collected.
        while (completed && contador <= cantidad)
        {
            System.Windows.Forms.Application.DoEvents();
            Thread.Sleep(100);

            if (browser.Document == null || browser.Document.Body == null)
            {
                // Nothing rendered yet; poll again.
                continue;
            }

            // The button is missing once every result is shown (or on other layouts).
            botonClik = browser.Document.GetElementById("btnMoreResults");
            if (botonClik != null)
            {
                botonClik.InvokeMember("click");
            }

            // The browser's DOM object cannot be cast to an HtmlAgilityPack document;
            // parse the rendered markup instead.
            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(browser.Document.Body.OuterHtml);

            // SelectNodes returns null, not an empty collection, when nothing matches.
            var anchors = document.DocumentNode.SelectNodes(".//a/@href");
            HtmlNode[] nodes = anchors != null ? anchors.ToArray() : new HtmlNode[0];

            Console.Write("Buscando Recetas.....");
            using (var progress = new ProgressBar())
            {
                foreach (HtmlNode enlace in nodes)
                {
                    foreach (HtmlAttribute atributo in enlace.Attributes)
                    {
                        if (!RegexTool.isUrl(atributo.Value))
                        {
                            continue;
                        }

                        HtmlDocument recipe = web.Load(atributo.Value);
                        if (hasRecipe(recipe))
                        {
                            contador++;
                            progress.Report((double)contador / cantidad);
                            Thread.Sleep(20);
                            getMicroData(recipe);
                        }
                    }
                }
            }
        }
    }

    Console.WriteLine("Listo.");
    Console.ReadLine();
}
/// <summary>
/// Extracts the schema.org Recipe microdata of a page (ingredients and
/// nutrition values), echoes it to the console, stores it through
/// <c>ClDataAcces.insertRecipeData</c> and returns it.
/// </summary>
/// <param name="Document">Parsed recipe page.</param>
/// <returns>
/// A data set holding the recipe table (Name, Ingrediente) and the nutrition
/// table (one row per Recipe element). Both tables are empty when the page
/// contains no Recipe element, and nothing is stored in that case.
/// </returns>
public DataSet getMicroData(HtmlDocument Document)
{
    ClDataAcces objDataAccess = new ClDataAcces();
    DataSet recipeData = new DataSet();
    DataTable recipe = new DataTable();
    DataTable nutritionData = new DataTable();

    // Nutrients that the page does not declare stay at zero.
    string calories = "0", fat = "0", saturatefat = "0", fiber = "0", carbohydrate = "0",
           protein = "0", cholesterol = "0", sugar = "0", sodium = "0";

    // Recipe data
    recipe.Columns.Add("Name");
    recipe.Columns.Add("Ingrediente");

    // Nutrition data
    nutritionData.Columns.Add("Calorias", typeof(decimal));
    nutritionData.Columns.Add("fat", typeof(decimal));
    nutritionData.Columns.Add("saturatefat", typeof(decimal));
    nutritionData.Columns.Add("fiber", typeof(decimal));
    nutritionData.Columns.Add("carbohydrate", typeof(decimal));
    nutritionData.Columns.Add("protein", typeof(decimal));
    nutritionData.Columns.Add("cholesterol", typeof(decimal));
    nutritionData.Columns.Add("sugar", typeof(decimal));
    nutritionData.Columns.Add("sodium", typeof(decimal));

    // The title is looked up by an absolute path, so it is missing whenever the
    // page layout differs; fall back to an empty name instead of failing.
    HtmlNode NameNode = Document.DocumentNode.SelectSingleNode("/html/body/div[2]/div/div[1]/div[3]/section[1]/section[2]/div/h3");
    string Name = NameNode != null ? NameNode.InnerText : string.Empty;

    // SelectNodes returns null, not an empty collection, when nothing matches.
    var recipeNodes = Document.DocumentNode.SelectNodes(".//*[@itemtype='http://schema.org/Recipe']");
    if (recipeNodes != null)
    {
        foreach (HtmlNode item in recipeNodes)
        {
            Console.WriteLine("INGREDIENTES\r");
            Console.WriteLine("-----------------------------------------------------------------------");

            var ingredientNodes = item.SelectNodes(".//*[@itemprop='ingredients']");
            if (ingredientNodes != null)
            {
                foreach (HtmlNode chldnode in ingredientNodes)
                {
                    string Ingredient = chldnode.InnerText.Trim();

                    for (int ind = 0; ind < chldnode.Attributes.Count; ind++)
                    {
                        Console.WriteLine(chldnode.Attributes[ind].Name + ": " + chldnode.Attributes[ind].Value.ToString() + " ---> " + Ingredient + "\r");
                    }

                    // One row per ingredient; adding it inside the attribute loop
                    // duplicated the ingredient once per attribute of the element.
                    recipe.Rows.Add(Name, Ingredient);
                }
            }

            Console.WriteLine("INFORMACION DE NUTRICION\r");
            Console.WriteLine("-----------------------------------------------------------------------");

            var nutritionNodes = Document.DocumentNode.SelectNodes(".//*[@itemtype='http://schema.org/NutritionInformation']");
            if (nutritionNodes != null)
            {
                foreach (HtmlNode chldnode in nutritionNodes)
                {
                    var propertyNodes = chldnode.SelectNodes(".//*[@itemprop]");
                    if (propertyNodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode ntNodo in propertyNodes)
                    {
                        for (int ind = 0; ind < ntNodo.Attributes.Count; ind++)
                        {
                            string attributeValue = ntNodo.Attributes[ind].Value;
                            Console.WriteLine(attributeValue + ": " + RegexTool.GetNumber(ntNodo.InnerText) + "\r");

                            switch (attributeValue)
                            {
                                case "calories": calories = RegexTool.GetNumber(ntNodo.InnerText); break;
                                case "fatContent": fat = RegexTool.GetNumber(ntNodo.InnerText); break;
                                case "saturatedFatContent": saturatefat = RegexTool.GetNumber(ntNodo.InnerText); break;
                                case "fiberContent": fiber = RegexTool.GetNumber(ntNodo.InnerText); break;
                                case "carbohydrateContent": carbohydrate = RegexTool.GetNumber(ntNodo.InnerText); break;
                                case "proteinContent": protein = RegexTool.GetNumber(ntNodo.InnerText); break;
                                case "cholesterolContent": cholesterol = RegexTool.GetNumber(ntNodo.InnerText); break;
                                case "sugarContent": sugar = RegexTool.GetNumber(ntNodo.InnerText); break;
                                case "sodiumContent": sodium = RegexTool.GetNumber(ntNodo.InnerText); break;
                            }
                        }
                    }
                }
            }

            nutritionData.Rows.Add(
                ParseNutrientOrZero(calories),
                ParseNutrientOrZero(fat),
                ParseNutrientOrZero(saturatefat),
                ParseNutrientOrZero(fiber),
                ParseNutrientOrZero(carbohydrate),
                ParseNutrientOrZero(protein),
                ParseNutrientOrZero(cholesterol),
                ParseNutrientOrZero(sugar),
                ParseNutrientOrZero(sodium));
        }
    }

    recipeData.Tables.Add(recipe);
    recipeData.Tables.Add(nutritionData);

    // Store only when at least one Recipe element was processed.
    if (nutritionData.Rows.Count > 0)
    {
        objDataAccess.insertRecipeData(recipeData, false);
        Console.WriteLine("Datos Registrados con exito");
    }

    return recipeData;
}

/// <summary>
/// Converts an extracted nutrient value to a decimal using the current culture;
/// missing or unparsable text yields zero instead of a FormatException.
/// </summary>
private static decimal ParseNutrientOrZero(string text)
{
    decimal value;
    return decimal.TryParse(text, out value) ? value : 0m;
}
/// <summary>
/// Extracts one recipe (ingredients, nutrition values, picture) from a page,
/// stores it through <c>ClDataAcces.insertRecipeData</c> and updates the result
/// and error counters shown on the form.
/// </summary>
/// <param name="Document">Parsed recipe page.</param>
/// <param name="dataFormat">
/// 0 reads schema.org microdata attributes, 1 reads the page's
/// <c>application/ld+json</c> script; any other value extracts nothing.
/// </param>
/// <returns>
/// The data set handed to the data layer. It contains the recipe and nutrition
/// tables only when an insert was attempted; otherwise it is empty.
/// </returns>
public DataSet GetMicroData(HtmlAgilityPack.HtmlDocument Document, int dataFormat)
{
    ClDataAcces objDataAccess = new ClDataAcces();
    DataSet recipeData = new DataSet();
    DataTable recipeTable = BuildRecipeTable();
    DataTable nutritionData = BuildNutritionTable();

    switch (dataFormat)
    {
        case 0:
            ScrapeSchemaOrgMarkup(Document, objDataAccess, recipeData, recipeTable, nutritionData);
            break;
        case 1:
            ScrapeJsonLd(Document, objDataAccess, recipeData, recipeTable, nutritionData);
            break;
    }

    return recipeData;
}

/// <summary>schema.org nutrition property names, in the column order of the nutrition table.</summary>
private static readonly string[] SchemaOrgNutritionProperties =
{
    "calories", "fatContent", "saturatedFatContent", "fiberContent", "carbohydrateContent",
    "proteinContent", "cholesterolContent", "sugarContent", "sodiumContent"
};

/// <summary>Creates the empty recipe table (one row per ingredient plus the classification chosen on the form).</summary>
private static DataTable BuildRecipeTable()
{
    DataTable table = new DataTable();
    table.Columns.Add("Name", typeof(string));
    table.Columns.Add("Ingrediente", typeof(string));
    table.Columns.Add("recipeTipoPlatoData", typeof(string));
    table.Columns.Add("recipeCulturaData", typeof(string));
    table.Columns.Add("recipeNacionalidadData", typeof(string));
    table.Columns.Add("recipeMomentoData", typeof(string));
    table.Columns.Add("recipeTemporadaData", typeof(string));
    table.Columns.Add("Picture", typeof(string));
    table.Columns.Add("Origen", typeof(string));
    table.Columns.Add("esSopa", typeof(Boolean));
    table.Columns.Add("esPasta", typeof(Boolean));
    table.Columns.Add("esMarisco", typeof(Boolean));
    table.Columns.Add("esEnsalada", typeof(Boolean));
    table.Columns.Add("esBebida", typeof(Boolean));
    table.Columns.Add("esBajoColesterol", typeof(Boolean));
    table.Columns.Add("esBajoEnCalorias", typeof(Boolean));
    table.Columns.Add("esLibreGluten", typeof(Boolean));
    return table;
}

/// <summary>Creates the empty nutrition table (nine decimal columns).</summary>
private static DataTable BuildNutritionTable()
{
    DataTable table = new DataTable();
    table.Columns.Add("Calorias", typeof(decimal));
    table.Columns.Add("fat", typeof(decimal));
    table.Columns.Add("saturatefat", typeof(decimal));
    table.Columns.Add("fiber", typeof(decimal));
    table.Columns.Add("carbohydrate", typeof(decimal));
    table.Columns.Add("protein", typeof(decimal));
    table.Columns.Add("cholesterol", typeof(decimal));
    table.Columns.Add("sugar", typeof(decimal));
    table.Columns.Add("sodium", typeof(decimal));
    return table;
}

/// <summary>Returns a nutrient map in which every known property starts at "0".</summary>
private static Dictionary<string, string> NewNutrientDefaults()
{
    Dictionary<string, string> nutrients = new Dictionary<string, string>();
    foreach (string property in SchemaOrgNutritionProperties)
    {
        nutrients[property] = "0";
    }
    return nutrients;
}

/// <summary>Stores the number found in <paramref name="rawText"/> when <paramref name="property"/> is a known nutrient; other names are ignored.</summary>
private void RecordNutrient(IDictionary<string, string> nutrients, string property, string rawText)
{
    if (property != null && nutrients.ContainsKey(property))
    {
        nutrients[property] = RegexTool.GetNumber(rawText);
    }
}

/// <summary>Appends one nutrition row; throws when a stored value cannot be converted to a decimal.</summary>
private static void AppendNutritionRow(DataTable nutritionData, IDictionary<string, string> nutrients)
{
    object[] values = new object[SchemaOrgNutritionProperties.Length];
    for (int i = 0; i < values.Length; i++)
    {
        values[i] = Convert.ToDecimal(nutrients[SchemaOrgNutritionProperties[i]]);
    }
    nutritionData.Rows.Add(values);
}

/// <summary>True when at least one value of the nutrition row differs from zero (0.0 counts as zero).</summary>
private static bool HasNonZeroNutrient(DataRow row)
{
    return row.ItemArray.Any(value => value is decimal && (decimal)value != 0m);
}

/// <summary>Adds one ingredient row carrying the classification currently selected on the form.</summary>
private void AddRecipeRow(DataTable recipeTable, string name, string ingredient, string pictureRoute)
{
    recipeTable.Rows.Add(name, ingredient, CboTipoPlato.Text, cboCultura.Text, cboNacionalidad.Text,
        cboMomentoComida.Text, cboTemporada.Text, pictureRoute, txtOrigenRegistro.Text,
        chkSopa.Checked, chkPasta.Checked, chkPescadoMarisco.Checked, chkEnsalada.Checked,
        chkBebida.Checked, chkBajoColesterol.Checked, chkBajoCalorias.Checked, chkLibreGluten.Checked);
}

/// <summary>
/// Computes the local file path for a picture URL. Returns an empty string and
/// a null <paramref name="imageUri"/> when the URL is missing or not absolute.
/// </summary>
private static string ResolvePicturePath(string imageUrl, out Uri imageUri)
{
    imageUri = null;
    if (string.IsNullOrEmpty(imageUrl) || !Uri.TryCreate(imageUrl, UriKind.Absolute, out imageUri))
    {
        imageUri = null;
        return string.Empty;
    }

    string appRoute = Application.StartupPath;
    string filename = System.IO.Path.GetFileName(imageUri.LocalPath);

    // Drops the last 10 characters of the startup path - presumably a trailing
    // "\bin\Debug"; TODO confirm for other deployment layouts.
    return appRoute.Remove(appRoute.Length - 10) + "\\Picture\\" + filename;
}

/// <summary>Starts the asynchronous picture download; completion is reported to Cliente_DownloadFileCompleted.</summary>
private void DownloadPicture(Uri imageUri, string pictureRoute)
{
    // NOTE(review): the client cannot be disposed here because the download is
    // still running; confirm whether the completion handler releases it.
    WebClient cliente = new WebClient();
    cliente.DownloadFileCompleted += new AsyncCompletedEventHandler(Cliente_DownloadFileCompleted);
    cliente.DownloadFileAsync(imageUri, pictureRoute);
}

/// <summary>Counts a scraping failure, shows the new total followed by <paramref name="labelSuffix"/> and writes the message to the log file.</summary>
private void ReportScrapeError(string message, string labelSuffix)
{
    errorCount++;
    lbErrorCount.Text = errorCount.ToString() + labelSuffix;
    Ftool.WriteLogFile(DateTime.Now.ToShortDateString() + " " + DateTime.Now.ToShortTimeString() + " " + message);
}

/// <summary>Hands both tables to the data layer and updates either the error counter or the found-records counter.</summary>
private void SaveScrapedRecipe(ClDataAcces dataAccess, DataSet recipeData, DataTable recipeTable, DataTable nutritionData, string errorLabelSuffix)
{
    recipeData.Tables.Add(recipeTable);
    recipeData.Tables.Add(nutritionData);

    // The data layer reports failures through its return value ("" means success).
    string dataError = dataAccess.insertRecipeData(recipeData, chkClasificacion.Checked);
    if (dataError != "")
    {
        ReportScrapeError(dataError, errorLabelSuffix);
    }
    else
    {
        contRegistroEncontrados++;
        lbTotalRegistroEncontrados.Text = contRegistroEncontrados.ToString();
    }
}

/// <summary>Format 0: reads the recipe from schema.org microdata attributes in the markup.</summary>
private void ScrapeSchemaOrgMarkup(HtmlAgilityPack.HtmlDocument document, ClDataAcces dataAccess, DataSet recipeData, DataTable recipeTable, DataTable nutritionData)
{
    const string errorLabelSuffix = "\r MicroData scraper ";
    HtmlNode root = document.DocumentNode;

    // SelectNodes returns null, not an empty collection, when nothing matches.
    var recipeNodes = root.SelectNodes(".//*[@itemtype='http://schema.org/Recipe']");
    if (recipeNodes == null)
    {
        return;
    }

    HtmlNode nameNode = root.SelectSingleNode(".//*[@class='recipeDetailHeader showOnTabletToDesktop']");
    string name = nameNode != null ? nameNode.InnerText : string.Empty;

    string imageUrl = root.Descendants("img")
        .Where(node => node.Attributes["class"] != null
            && node.Attributes["class"].Value == "recipeDetailSummaryImageMain"
            && node.Attributes["src"] != null)
        .Select(node => node.Attributes["src"].Value)
        .FirstOrDefault();

    // A page without a usable picture is stored with an empty Picture value
    // instead of failing on new Uri("").
    Uri imageUri;
    string pictureRoute = ResolvePicturePath(imageUrl, out imageUri);
    if (imageUri != null)
    {
        DownloadPicture(imageUri, pictureRoute);
    }

    Dictionary<string, string> nutrients = NewNutrientDefaults();

    foreach (HtmlNode item in recipeNodes)
    {
        var ingredientNodes = item.SelectNodes(".//*[@itemprop='ingredients']");
        if (ingredientNodes != null)
        {
            foreach (HtmlNode ingredientNode in ingredientNodes)
            {
                // One row per ingredient, as in the JSON-LD format; looping over the
                // element's attributes duplicated the ingredient once per attribute.
                AddRecipeRow(recipeTable, name, ingredientNode.InnerText.Trim(), pictureRoute);
            }
        }

        var nutritionNodes = root.SelectNodes(".//*[@itemtype='http://schema.org/NutritionInformation']");
        if (nutritionNodes == null)
        {
            continue;
        }

        foreach (HtmlNode nutritionNode in nutritionNodes)
        {
            var propertyNodes = nutritionNode.SelectNodes(".//*[@itemprop]");
            if (propertyNodes == null)
            {
                continue;
            }

            foreach (HtmlNode ntNodo in propertyNodes)
            {
                for (int ind = 0; ind < ntNodo.Attributes.Count; ind++)
                {
                    string attributeValue = ntNodo.Attributes[ind].Value;
                    Console.WriteLine(attributeValue + ": " + RegexTool.GetNumber(ntNodo.InnerText) + "\r");
                    if (ntNodo.InnerText != "")
                    {
                        RecordNutrient(nutrients, attributeValue, ntNodo.InnerText);
                    }
                }
            }
        }

        try
        {
            AppendNutritionRow(nutritionData, nutrients);
        }
        catch (Exception error)
        {
            // Previously swallowed; a value that is not a number is now counted and logged.
            ReportScrapeError(error.Message, errorLabelSuffix);
        }
    }

    if (nutritionData.Rows.Count > 0)
    {
        SaveScrapedRecipe(dataAccess, recipeData, recipeTable, nutritionData, errorLabelSuffix);
    }
}

/// <summary>Format 1: reads the recipe from the page's application/ld+json script.</summary>
private void ScrapeJsonLd(HtmlAgilityPack.HtmlDocument document, ClDataAcces dataAccess, DataSet recipeData, DataTable recipeTable, DataTable nutritionData)
{
    const string errorLabelSuffix = "\r Json-Ld Scraper ";

    // Retrieve the JSON value embedded in the page.
    HtmlNode scriptNode = document.DocumentNode.SelectSingleNode("//script[@type='application/ld+json']");
    if (scriptNode == null)
    {
        return;
    }

    // Deserialize the JSON text into a dynamic object.
    dynamic jsonData = JsonConvert.DeserializeObject(scriptNode.InnerText);

    Dictionary<string, string> nutrients = NewNutrientDefaults();
    IList<string> listIngredient = null;

    try
    {
        jsonData.nutrition.Remove("@type");
        Dictionary<string, string> values = jsonData.nutrition.ToObject<Dictionary<string, string>>();
        listIngredient = jsonData.recipeIngredient.ToObject<IList<string>>();

        // Retrieve the nutrition values (cholesterolContent included).
        foreach (KeyValuePair<string, string> valor in values)
        {
            RecordNutrient(nutrients, valor.Key, valor.Value);
        }
    }
    catch (Exception e)
    {
        // Count the failure where it happens; the label used to be refreshed
        // without incrementing the counter.
        ReportScrapeError(e.Message, errorLabelSuffix);
    }

    try
    {
        nutritionData.Rows.Clear();
        AppendNutritionRow(nutritionData, nutrients);

        // Retrieve the ingredients. The list is null when the block above failed
        // before reading it; that failure has already been reported.
        if (listIngredient != null)
        {
            foreach (string ingrediente in listIngredient)
            {
                string name = jsonData.name;
                AddRecipeRow(recipeTable, name, ingrediente, string.Empty);
            }
        }
    }
    catch (Exception error)
    {
        ReportScrapeError(error.Message, errorLabelSuffix);
    }

    // Pages whose nutrition values are all zero are not stored.
    if (nutritionData.Rows.Count == 0 || !HasNonZeroNutrient(nutritionData.Rows[0]))
    {
        return;
    }

    // Retrieve the recipe picture.
    string imagenUrl = jsonData.image;
    if (!string.IsNullOrEmpty(imagenUrl) && RegexTool.isUrl(imagenUrl))
    {
        Uri imageUri;
        string pictureRoute = ResolvePicturePath(imagenUrl, out imageUri);
        if (imageUri != null)
        {
            // The ingredient rows were built before the picture path was known;
            // fill it in so the Picture column is not left empty.
            foreach (DataRow row in recipeTable.Rows)
            {
                row["Picture"] = pictureRoute;
            }
            DownloadPicture(imageUri, pictureRoute);
        }
    }

    SaveScrapedRecipe(dataAccess, recipeData, recipeTable, nutritionData, string.Empty);
}