/// <summary>
/// Adds to <c>ddItems</c> every node of <paramref name="ddItemNodes"/> whose positionally
/// matching &lt;dt&gt; label (taken from the "movie-dl" definition list) does not contain
/// any of the ignored label texts.
/// </summary>
/// <param name="ddItemNodes">Nodes paired by index with the &lt;dt&gt; labels of the list.</param>
private void removeDdItemNotUse(HtmlNodeCollection ddItemNodes)
{
    var dtItems = victimDetailContent.SelectNodes("//dl[@class='movie-dl'] //dt");

    // SelectNodes yields null (not an empty collection) when nothing matches.
    if (dtItems == null || ddItemNodes == null)
    {
        return;
    }

    string[] ignoredLabels =
    {
        "Điểm", "Ngày", "Số tập", "Công ty SX", "Chất lượng", "Số người đánh giá"
    };

    // Only walk the positions that exist in both collections so an unbalanced
    // list cannot index past the end of ddItemNodes.
    int pairCount = dtItems.Count < ddItemNodes.Count ? dtItems.Count : ddItemNodes.Count;

    for (int index = 0; index < pairCount; index++)
    {
        string label = dtItems[index].InnerText.Trim();
        bool isIgnored = ignoredLabels.Any(ignored => label.IndexOf(ignored) != -1);

        if (!isIgnored)
        {
            ddItems.Add(ddItemNodes[index]);
        }
    }
}
/// <summary>
/// Downloads the Google News top-stories page and prints to the console every text
/// fragment found between the section start marker and the following "ampvideo" marker.
/// </summary>
/// <remarks>
/// The method is <c>async void</c>, so callers cannot await it or observe its exceptions.
/// </remarks>
private static async void getHtml()
{
    const string sectionStartMarker = "bookmark_bordersharemore_vertVer cobertura completakeyboard_arrow_up";
    const string sectionEndMarker = "ampvideo";

    var url1 = "https://news.google.com/topstories?hl=es-419&pli=1&gl=AR&ceid=AR:es-419";

    string html;
    using (var httpclient = new HttpClient())
    {
        html = await httpclient.GetStringAsync(url1);
    }

    var htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(html);
    HtmlNodeCollection nodes = htmlDocument.DocumentNode.ChildNodes;

    // The text is read from the second top-level node; fall back to an empty
    // string when the document has fewer nodes instead of indexing out of range.
    string datos1 = nodes.Count > 1 ? nodes[1].InnerText : "";

    while (true)
    {
        int fin = -1;
        string str1 = "";

        // A missing start marker means there is nothing left to extract. Without this
        // check the -1 from IndexOf would be turned into a bogus offset.
        int markerIndex = datos1.IndexOf(sectionStartMarker, StringComparison.Ordinal);
        if (markerIndex >= 0)
        {
            str1 = datos1.Substring(markerIndex + sectionStartMarker.Length);
            fin = str1.IndexOf(sectionEndMarker, StringComparison.Ordinal);
        }

        if (fin < 1)
        {
            Console.WriteLine("--------------------------------");
            Console.WriteLine("");
            Console.WriteLine("--------------------------------");
            Console.WriteLine("Llegó al final");
            return;
        }

        // The fragment stops one character before the end marker (kept from the original).
        string str3 = str1.Substring(0, fin - 1);
        Console.WriteLine("--------------------------------");
        Console.WriteLine("");
        Console.WriteLine("--------------------------------");
        Console.WriteLine(str3);

        // Continue scanning after the marker that was just consumed.
        datos1 = str1;
    }
}
/// <summary>
/// Downloads the weather.com page and collects up to 20 text fragments found between
/// the section start marker and the following "ampvideo" marker.
/// </summary>
/// <returns>
/// A 20-element string array; slots for which no fragment was found are left null.
/// </returns>
private async Task<Array> getHtmlClima()
{
    // NOTE(review): these markers are identical to the ones used for the news page —
    // confirm they are the intended delimiters for the weather page.
    const string sectionStartMarker = "bookmark_bordersharemore_vertVer cobertura completakeyboard_arrow_up";
    const string sectionEndMarker = "ampvideo";
    const int maxSections = 20;

    var url1 = "https://weather.com/es-AR/tiempo/hoy/l/ARBA0009:1:AR";
    string[] datosArray = new string[maxSections];

    string html;
    using (var httpclient = new HttpClient())
    {
        html = await httpclient.GetStringAsync(url1);
    }

    var htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(html);
    HtmlNodeCollection nodes = htmlDocument.DocumentNode.ChildNodes;

    // The text is read from the second top-level node; an empty string is used when
    // the document has fewer nodes instead of indexing out of range.
    string datos1 = nodes.Count > 1 ? nodes[1].InnerText : "";

    for (int i = 0; i < maxSections; i++)
    {
        // Stop as soon as the start marker is missing; IndexOf returns -1 in that case
        // and must not be used as an offset.
        int markerIndex = datos1.IndexOf(sectionStartMarker, StringComparison.Ordinal);
        if (markerIndex < 0)
        {
            break;
        }

        string str1 = datos1.Substring(markerIndex + sectionStartMarker.Length);
        int fin = str1.IndexOf(sectionEndMarker, StringComparison.Ordinal);
        if (fin < 1)
        {
            break;
        }

        // The fragment stops one character before the end marker (kept from the original).
        datosArray[i] = str1.Substring(0, fin - 1);

        // Continue scanning after the marker that was just consumed.
        datos1 = str1;
    }

    return datosArray;
}
/// <summary>
/// Loads the page at <paramref name="urlLink"/> and returns the outer HTML of every
/// &lt;table&gt; element found in it.
/// </summary>
/// <param name="urlLink">Address of the page to load.</param>
/// <returns>One string per table, or an empty array when the page has no tables.</returns>
private string[] ParseHtmlSplitTables(string urlLink)
{
    var web = new HtmlWeb();
    HtmlDocument page = web.Load(urlLink);
    HtmlNodeCollection tables = page.DocumentNode.SelectNodes("//table");

    // No match is reported as null rather than as an empty collection.
    if (tables == null)
    {
        return new string[] { };
    }

    return tables.Select(table => table.OuterHtml).ToArray();
}
/// <summary>
/// Removes every text node from the <see cref="HtmlNodeCollection"/> passed as
/// <paramref name="value"/> and returns that same collection.
/// </summary>
/// <param name="value">Expected to be an <see cref="HtmlNodeCollection"/>.</param>
/// <param name="targetType">Not used.</param>
/// <param name="parameter">Not used.</param>
/// <param name="culture">Not used.</param>
/// <returns>
/// The filtered collection, or null when <paramref name="value"/> is not an
/// <see cref="HtmlNodeCollection"/>.
/// </returns>
public object Convert(object value, Type targetType, object parameter, CultureInfo culture)
{
    var collection = value as HtmlNodeCollection;
    bool hasNodes = collection != null && collection.Count != 0;

    if (hasNodes)
    {
        // Snapshot the text nodes first so the collection is not modified while enumerated.
        HtmlNode[] textNodes = collection
            .Where(candidate => candidate.NodeType == HtmlNodeType.Text)
            .ToArray();

        foreach (HtmlNode textNode in textNodes)
        {
            collection.Remove(textNode);
        }
    }

    return collection;
}
/// <summary>
/// Obtain pricing information from Digikey website
/// </summary>
/// <remarks>
/// Reads the rows of the "product-dollars" table in <c>WebPageData</c>. Rows with more than
/// two cells are turned into price breaks: the first cell is the minimum quantity and the
/// second the unit price in USD. Exceptions propagate to the caller with their original
/// stack trace.
/// </remarks>
/// <returns>The price breaks found, after <c>FixMaximumQtys</c> has been applied.</returns>
public override PricingInfo[] GetPricingInfo()
{
    List<PricingInfo> priceslist = new List<PricingInfo>(50);
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(WebPageData);

    // NOTE(review): both lookups below yield null when the table or its rows are missing,
    // which surfaces as a NullReferenceException exactly as before — confirm whether
    // callers rely on that to detect an unparseable page.
    HtmlNode node = htmlDoc.GetElementbyId("product-dollars");
    HtmlNodeCollection tablerows = node.SelectNodes("tbody//tr");

    foreach (HtmlNode row in tablerows)
    {
        HtmlNodeCollection tablecolumns = row.SelectNodes("td");
        if (ReferenceEquals(tablecolumns, null) || tablecolumns.Count <= 2)
        {
            continue;
        }

        // The page text uses "," for thousands and "." for decimals, so parse with the
        // invariant culture instead of whatever culture the machine happens to run under.
        string minqtystr = tablecolumns[0].InnerText.Replace(",", "");
        int minqty = Int32.Parse(minqtystr, System.Globalization.CultureInfo.InvariantCulture);
        double srcunitprice = Double.Parse(tablecolumns[1].InnerText, System.Globalization.CultureInfo.InvariantCulture);
        double destprice = Currency.Convert("USD", DefDestCurrency, srcunitprice);

        // 999999 is a placeholder maximum quantity; FixMaximumQtys is called afterwards.
        PricingInfo p = new PricingInfo("USD", DefDestCurrency, srcunitprice, destprice, minqty, 999999);
        priceslist.Add(p);
    }

    FixMaximumQtys(ref priceslist);
    return priceslist.ToArray();
}
/// <summary>
/// Parses the fetched markup and, for every anchor carrying an <c>onclick</c> attribute,
/// extracts the first two single-quoted values of that attribute and the anchor's text.
/// </summary>
/// <param name="fetchedXml">Markup to parse.</param>
/// <returns>
/// A tuple of: the first quoted value per anchor, the second quoted value per anchor,
/// the trimmed anchor text (after removing the "s4" span), and a flag that is false
/// when no anchor with an <c>onclick</c> attribute was found.
/// </returns>
public (List<string>, List<string>, List<string>, Boolean) ReturnBaseXmlData(string fetchedXml)
{
    List<string> _xmlWords = new List<string>();
    List<string> _mp3s = new List<string>();
    List<string> _occ = new List<string>();

    HtmlAgilityPack.HtmlDocument XmlDocument = new HtmlAgilityPack.HtmlDocument();
    XmlDocument.LoadHtml(fetchedXml);

    HtmlNodeCollection XmlTempNodes = XmlDocument.DocumentNode.SelectNodes("//a[@onclick]");
    if (XmlTempNodes == null)
    {
        //no parseable Item found
        return (_xmlWords, _mp3s, _occ, false);
    }

    // Iterate a snapshot because child nodes are removed from the document inside the loop.
    foreach (HtmlNode htmlNode in XmlTempNodes.ToArray())
    {
        // Read the attribute from the node at hand instead of re-running the XPath
        // query and indexing into its result on every iteration.
        string attrib = htmlNode.GetAttributeValue("onclick", "default");

        string tmpxml = attrib.Remove(0, attrib.IndexOf("'") + 1);  // result: PERSOUN1.xml','persoun1.mp3')
        string tmpmp3 = tmpxml.Remove(0, tmpxml.IndexOf("'") + 1);  // result: ,'persoun1.mp3')
        tmpmp3 = tmpmp3.Remove(0, tmpmp3.IndexOf("'") + 1);

        // The first remaining "s4" span in the document is stripped from this anchor so it
        // does not end up in the anchor text. Nothing is stripped when no span is left.
        HtmlNode wordFormSpan = XmlDocument.DocumentNode.SelectSingleNode("//span[@class='s4']");
        if (wordFormSpan != null)
        {
            htmlNode.RemoveChild(wordFormSpan);
        }

        _occ.Add(htmlNode.InnerText.Trim());
        _xmlWords.Add(tmpxml.Substring(0, tmpxml.IndexOf("'")));
        _mp3s.Add(tmpmp3.Substring(0, tmpmp3.IndexOf("'")));
    }

    return (_xmlWords, _mp3s, _occ, true);
}
/// <summary>
/// Loads the HTML document and, when that succeeds, strips the non-element nodes from
/// its top level, adds one list line per black/white list entry and binds the remaining
/// nodes to the tree.
/// </summary>
private async Task LoadHtmlAsync()
{
    var loaded = await LoadHtmlDocumentAsync();
    if (!loaded)
    {
        return;
    }

    HtmlNodeCollection topLevelNodes = null;
    if (htmlDoc != null)
    {
        topLevelNodes = htmlDoc.DocumentNode.ChildNodes;

        // Collect the nodes to drop first so the collection is not modified while enumerated.
        HtmlNode[] nonElements = topLevelNodes
            .Where(candidate => candidate.NodeType != HtmlNodeType.Element)
            .ToArray();

        foreach (HtmlNode nonElement in nonElements)
        {
            topLevelNodes.Remove(nonElement);
        }
    }

    if (WebPage.BlackWhiteList != null)
    {
        foreach (var id in WebPage.BlackWhiteList)
        {
            if (id != null)
            {
                // Each line works on its own copy of the entry.
                var line = new HtmlBlackWhiteListItemLine(id.Clone());
                line.Deleted += Line_Deleted;
                stkIdentifies.Children.Add(line);
            }
        }
    }

    tree.ItemTemplate = tree.Resources["htmlTemplate"] as HierarchicalDataTemplate;
    HtmlNodes = topLevelNodes;
    tree.ItemsSource = HtmlNodes;
}
/// <summary>
/// Downloads the product list for <paramref name="provincia"/> from the
/// preciosmaximos.argentina.gob.ar API and writes it to an Excel workbook named
/// "Productos_&lt;provincia&gt;.xlsx".
/// </summary>
/// <param name="provincia">Value sent as the <c>Provincia</c> query parameter and used in the file name.</param>
/// <remarks>
/// The method is <c>async void</c>, so callers cannot await it or observe its exceptions.
/// </remarks>
private static async void getHtml(string provincia)
{
    // NOTE(review): output directory is machine specific — consider making it configurable.
    const string outputDirectory = "C:\\Users\\Marcelo\\source\\repos\\Core\\Pruebas\\Pruebas\\Excels\\";

    var url1 = "https://preciosmaximos.argentina.gob.ar/api/products?pag=1&Provincia=";

    // The separator must be "&regs"; the previous literal contained "®s", i.e. "&reg"
    // rendered as the registered-trademark sign, which glued the page-size parameter
    // onto the province value.
    var url2 = url1 + provincia + "&regs=15000";

    string html;
    using (var httpclient = new HttpClient())
    {
        html = await httpclient.GetStringAsync(url2);
    }

    var htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(html);
    HtmlNodeCollection nodes = htmlDocument.DocumentNode.ChildNodes;
    if (nodes.Count == 0)
    {
        // Empty response: there is nothing to deserialize or export.
        Console.WriteLine("Listo provincia de " + provincia);
        return;
    }

    // The JSON property name contains a space; rename it so it can be accessed as a member.
    string datos1 = nodes[0].InnerText.Replace(@"Precio sugerido", "Precio_sugerido");

    var items = JsonConvert.DeserializeObject<dynamic>(datos1);
    var item7 = items.result;
    var largo = items.result.Count;

    List<Person> laLista = new List<Person>();
    for (int i = 0; i < largo; i++)
    {
        Person indiv = new Person();
        indiv.Precio_sugerido = item7[i].Precio_sugerido;
        indiv.Producto = item7[i].Producto;
        indiv.Producto_s_tilde = item7[i].Producto_s_tilde;
        indiv.Provincia = item7[i].Provincia;
        indiv.Region = item7[i].Region;
        indiv.categoria = item7[i].categoria;
        indiv.id_producto = item7[i].id_producto;
        indiv.marca = item7[i].marca;
        indiv.subcategoria = item7[i].subcategoria;
        laLista.Add(indiv);
    }

    using (var workbook = new XLWorkbook())
    {
        workbook.AddWorksheet("sheetName");
        var ws = workbook.Worksheet("sheetName");

        // Header row: columns A..I in the same order as the data rows below.
        string[] headers =
        {
            "Precio_sugerido", "Producto", "Producto_s_tilde", "Provincia", "Region",
            "categoria", "id_producto", "marca", "subcategoria"
        };

        int row = 1;
        for (int col = 0; col < headers.Length; col++)
        {
            string columnLetter = ((char)('A' + col)).ToString();
            ws.Cell(columnLetter + row.ToString()).Value = headers[col];
        }
        row++;

        // One worksheet row per product.
        foreach (var c in laLista)
        {
            ws.Cell("A" + row.ToString()).Value = c.Precio_sugerido;
            ws.Cell("B" + row.ToString()).Value = c.Producto;
            ws.Cell("C" + row.ToString()).Value = c.Producto_s_tilde;
            ws.Cell("D" + row.ToString()).Value = c.Provincia;
            ws.Cell("E" + row.ToString()).Value = c.Region;
            ws.Cell("F" + row.ToString()).Value = c.categoria;
            ws.Cell("G" + row.ToString()).Value = c.id_producto;
            ws.Cell("H" + row.ToString()).Value = c.marca;
            ws.Cell("I" + row.ToString()).Value = c.subcategoria;
            row++;
        }

        workbook.SaveAs(outputDirectory + "Productos_" + provincia + ".xlsx");
    }

    Console.WriteLine("Listo provincia de " + provincia);
}
/// <summary>
/// Asks for a year, and when no holiday is stored yet for that year, scrapes the
/// Indonesian holiday table from officeholidays.com, stores one record per table row
/// and echoes each stored date and description to the console.
/// </summary>
/// <param name="args">Not used.</param>
static void Main(string[] args)
{
    HtmlWeb document = new HtmlWeb();

    Console.Write("Input Year : ");
    //set year by input
    string param = (Console.ReadLine() ?? "").Trim();

    using (MyContext _context = new MyContext())
    {
        //check database if the year already exists
        var cek = _context.Holiday.Where(a => a.Tahun == param).ToList();
        if (cek.Count != 0)
        {
            Console.WriteLine("List holiday in year " + param + " already exists");
        }
        else
        {
            // grab table from website
            var document2 = document.Load(@"https://www.officeholidays.com/countries/indonesia/" + param);
            HtmlNodeCollection nodes = document2.DocumentNode.SelectNodes("//table[@class='country-table']/tbody/tr");

            if (nodes == null)
            {
                Console.WriteLine("List holiday in year " + param + " not found!");
            }
            else
            {
                foreach (HtmlNode item in nodes.ToArray())
                {
                    //split data
                    var date = Regex.Split(item.InnerHtml, "\"");
                    var splitWords = Regex.Split(item.InnerText, "\n");
                    var words = splitWords
                                .Where(x => !x.Contains(" ") && !string.IsNullOrEmpty(x.Trim()))
                                .ToList();

                    // The parsing below relies on the sixth quote-delimited piece of the row
                    // markup and on the third text line; skip rows that do not have them
                    // instead of failing with an index error.
                    if (date.Length < 6 || words.Count < 3)
                    {
                        Console.WriteLine("Skipping row with unexpected layout");
                        continue;
                    }

                    var tgl = date[5];
                    var splitDate = Regex.Split(tgl, "-");

                    string detail;
                    if (words[2].Contains("'"))
                    {
                        // Keeps the text up to (not including) a second apostrophe, as before.
                        var splitWords1 = Regex.Split(words[2], "'");
                        detail = splitWords1[0] + "'" + splitWords1[1];
                    }
                    else
                    {
                        detail = words[2];
                    }

                    var tahun = splitDate[0];

                    //queue the record for insertion
                    TBL_M_HOLIDAYS _holidays = new TBL_M_HOLIDAYS();
                    _holidays.Date_Holiday = Convert.ToDateTime(tgl);
                    _holidays.Keterangan = detail;
                    _holidays.Tahun = tahun;
                    _context.Holiday.Add(_holidays);

                    //show date and detail in console
                    var result = $"{tgl} ; {detail}";
                    Console.WriteLine(result);
                }

                // Save once for the whole year: a failure part-way through no longer leaves a
                // partial year behind (which a later run would report as "already exists"),
                // and the status reflects every row rather than only the last one.
                var hasil = _context.SaveChanges();
                if (hasil > 0)
                {
                    Console.WriteLine("\nInsert Success");
                }
                else
                {
                    Console.WriteLine("\nInsert Fail");
                }
            }
        }
    }

    Console.ReadKey();
}
/// <summary>
/// Crawls a single url: when its host is known, allowed and the url has not been visited,
/// the page is loaded, one index row per distinct title word is stored, and the links found
/// on the page that belong to an allowed domain are queued. When the host is not known yet
/// but lies within an allowed domain, the url is sent to the robots queue and re-queued.
/// </summary>
/// <param name="url">Absolute url to crawl.</param>
/// <returns>
/// A tuple of the increased index size (rows inserted into the pages table) and the
/// increased url queue size. The queue counter starts at -1 — presumably to account for
/// the url that was just taken off the queue; confirm against the caller.
/// </returns>
/// <remarks>
/// Exceptions are not propagated: they are written to the errors table and/or the console.
/// </remarks>
public Tuple <int, int> crawlSite(string url)
{
    int updateIndex = 0;
    int updateQueue = -1;
    try
    {
        Uri uri = new Uri(url);
        Host host;
        if (hosts.TryGetValue(uri.Host, out host))
        {
            if (host.isAllowed(uri))
            {
                //check if url has been visited before
                if (!host.hasVisited(uri))
                {
                    HtmlDocument htmlDoc;
                    HtmlWeb web = new HtmlWeb();
                    htmlDoc = web.Load(uri.AbsoluteUri);
                    if (web.StatusCode == HttpStatusCode.OK)
                    {
                        string title = "";
                        // Defaults to the current UTC time in sortable ("s") format; replaced below
                        // when the page carries a lastmod or pubdate meta tag.
                        string date = DateTime.UtcNow.ToString("s", System.Globalization.CultureInfo.InvariantCulture);
                        string body = "";
                        // The url is marked as visited only after a successful (200 OK) load.
                        host.addVisited(uri);

                        //get title: meta "title", then meta "og:title", then the <title> element.
                        //The title also serves as the body until a description is found below.
                        HtmlNode metaTitleNode = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='title']");
                        if (metaTitleNode != null)
                        {
                            title = metaTitleNode.GetAttributeValue("content", "");
                            body = title;
                        }
                        else
                        {
                            HtmlNode metaOgTitleNode = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='og:title']");
                            if (metaOgTitleNode != null)
                            {
                                title = metaOgTitleNode.GetAttributeValue("content", "");
                                body = title;
                            }
                            else
                            {
                                HtmlNode titleNode = htmlDoc.DocumentNode.SelectSingleNode("//title");
                                if (titleNode != null)
                                {
                                    title = HttpUtility.HtmlDecode(titleNode.InnerHtml);
                                    body = title;
                                }
                            }
                        }

                        //get last mod date of page
                        HtmlNode modNode = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='lastmod']");
                        if (modNode != null)
                        {
                            date = modNode.GetAttributeValue("content", "");
                        }
                        else
                        {
                            //if no last mod date, check if there is a pub date
                            HtmlNode pubNode = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='pubdate']");
                            if (pubNode != null)
                            {
                                date = pubNode.GetAttributeValue("content", "");
                            }
                        }

                        //get body of page: meta "description", then meta "og:description"
                        HtmlNode metaDescNode = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
                        if (metaDescNode != null)
                        {
                            body = metaDescNode.GetAttributeValue("content", "");
                        }
                        else
                        {
                            HtmlNode metaOgDescNode = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='og:description']");
                            if (metaOgDescNode != null)
                            {
                                body = metaOgDescNode.GetAttributeValue("content", "");
                            }
                        }

                        //HtmlNode bodyNode = htmlDoc.DocumentNode.SelectSingleNode("//p[contains(@class,'zn-body__paragraph')]");
                        //if (bodyNode != null)
                        //{
                        //    if (body.Length > 200)
                        //    {
                        //        body = Operation.stripHtml(bodyNode.InnerText).Substring(0, 200) + "...";
                        //    }
                        //}

                        //Insert page with each word in the title as a row key
                        string[] keyWord = Operation.stripPunct(title.ToLower()).Split().Distinct().ToArray();
                        foreach (string key in keyWord)
                        {
                            if (key != "")
                            {
                                try
                                {
                                    //get page data and store to table
                                    PageEntity page = new PageEntity(uri, title, date, body, key);
                                    TableOperation insertOperation = TableOperation.Insert(page);
                                    pagesTable.Execute(insertOperation);
                                    updateIndex++;
                                }
                                catch (Exception e)
                                {
                                    //Insert error to table (the async insert is started but not awaited)
                                    ErrorEntity err = new ErrorEntity(url, e.Message, DateTime.Now.ToString());
                                    TableOperation insertOperation = TableOperation.Insert(err);
                                    errorsTable.ExecuteAsync(insertOperation);
                                    Console.Write(e.ToString());
                                }
                            }
                        }

                        // Stays empty when the page has no anchors (SelectNodes yields null then).
                        HtmlNode[] linkNodes = new HtmlNode[0];
                        HtmlNodeCollection tempNodes = htmlDoc.DocumentNode.SelectNodes("//a");
                        if (tempNodes != null)
                        {
                            linkNodes = tempNodes.ToArray();
                        }
                        Uri newUri;
                        foreach (HtmlNode link in linkNodes)
                        {
                            //add url if within allowed domain
                            try
                            {
                                // Relative hrefs are resolved against the current page. An anchor without
                                // an href passes null here, which ends up in the catch block below.
                                newUri = new Uri(uri, link.GetAttributeValue("href", null));
                                if (Operation.domains.Values.Any(newUri.Host.Contains))
                                {
                                    // Hosts of the "BR1"/"BR2" domains are queued only when the path
                                    // starts with Operation._BR_PATH; every other allowed host is
                                    // queued unconditionally.
                                    if (newUri.Host.Contains(Operation.domains["BR1"]) || newUri.Host.Contains(Operation.domains["BR2"]))
                                    {
                                        if (newUri.AbsolutePath.StartsWith(Operation._BR_PATH))
                                        {
                                            CloudQueueMessage urlMessage = new CloudQueueMessage(newUri.AbsoluteUri);
                                            // Started but not awaited; the counter is incremented regardless.
                                            urlQueue.AddMessageAsync(urlMessage);
                                            updateQueue++;
                                        }
                                    }
                                    else
                                    {
                                        CloudQueueMessage urlMessage = new CloudQueueMessage(newUri.AbsoluteUri);
                                        // Started but not awaited; the counter is incremented regardless.
                                        urlQueue.AddMessageAsync(urlMessage);
                                        updateQueue++;
                                    }
                                }
                            }
                            catch (Exception e)
                            {
                                //Invalid url
                                Console.WriteLine("Invalid html url found: " + e.ToString());
                            }
                        }
                    }
                }
            }
        }
        else
        {
            //if robots.txt has not been parsed for the given url site
            //and is within domain
            if (Operation.domains.Values.Any(uri.Host.Contains))
            {
                //add to xmlqueue and add url back into urlqueue
                CloudQueueMessage robotMessage = new CloudQueueMessage(uri.AbsoluteUri);
                robotQueue.AddMessage(robotMessage);
                CloudQueueMessage urlMessage = new CloudQueueMessage(uri.AbsoluteUri);
                urlQueue.AddMessage(urlMessage);
                updateQueue++;
            }
        }
    }
    catch (Exception e)
    {
        //Insert error to table; a failure while starting that insert is only written to the console
        ErrorEntity err = new ErrorEntity(url, e.Message, DateTime.Now.ToString());
        try
        {
            TableOperation insertOperation = TableOperation.Insert(err);
            errorsTable.ExecuteAsync(insertOperation);
        }
        catch (Exception insErr)
        {
            Console.Write(insErr.ToString());
        }
    }
    return(new Tuple <int, int>(updateIndex, updateQueue));
}