Exemplo n.º 1
0
        /// <summary>
        /// Adds to <c>ddItems</c> every entry of <paramref name="ddItemNodes"/> whose
        /// same-position <c>dt</c> label (selected from <c>victimDetailContent</c>) does not
        /// contain any of the excluded captions.
        /// </summary>
        /// <param name="ddItemNodes">
        /// Nodes matched by index against the <c>dt</c> nodes of the <c>movie-dl</c> list.
        /// </param>
        private void removeDdItemNotUse(HtmlNodeCollection ddItemNodes)
        {
            var dtItems = victimDetailContent.SelectNodes("//dl[@class='movie-dl'] //dt");

            // SelectNodes yields null (not an empty collection) when nothing matches.
            if (dtItems == null || ddItemNodes == null)
            {
                return;
            }

            // Captions whose values must be left out of ddItems.
            string[] excludedCaptions =
            {
                "Điểm", "Ngày", "Số tập", "Công ty SX", "Chất lượng", "Số người đánh giá"
            };

            // Materialise once instead of once per iteration.
            var ddSnapshot = ddItemNodes.ToArray();

            // Labels and values are paired by position; stop at the shorter of the two
            // so an unbalanced list cannot index past the end.
            int pairCount = dtItems.Count < ddSnapshot.Length ? dtItems.Count : ddSnapshot.Length;

            for (int index = 0; index < pairCount; index++)
            {
                string label    = dtItems[index].InnerText.Trim();
                bool   excluded = excludedCaptions.Any(caption => label.IndexOf(caption) != -1);

                if (!excluded)
                {
                    ddItems.Add(ddSnapshot[index]);
                }
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Downloads the Google News top-stories page and writes to the console every text
        /// fragment found between the story-footer marker and the next "ampvideo" token.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the method is <c>async void</c>, so callers cannot await it and any
        /// exception thrown after the first await is unobservable by them. The signature is
        /// kept for compatibility; consider returning <c>Task</c>.
        /// </remarks>
        private static async void getHtml()
        {
            const string marker     = "bookmark_bordersharemore_vertVer cobertura completakeyboard_arrow_up";
            const string terminator = "ampvideo";

            var url1 = "https://news.google.com/topstories?hl=es-419&pli=1&gl=AR&ceid=AR:es-419";

            string html;

            // Dispose the client so its connection resources are released.
            using (var httpclient = new HttpClient())
            {
                html = await httpclient.GetStringAsync(url1);
            }

            var htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(html);

            // Assumes the second top-level node carries the page text (e.g. <html> after a
            // doctype) — TODO confirm; a document with fewer nodes makes this index throw.
            string datos1 = htmlDocument.DocumentNode.ChildNodes.ToArray()[1].InnerText;

            while (true)
            {
                int    markerAt = datos1.IndexOf(marker);
                string str1     = null;
                int    fin      = -1;

                // Only slice when the marker exists; IndexOf returns -1 otherwise and the
                // old code then cut the text at an arbitrary offset.
                if (markerAt >= 0)
                {
                    str1 = datos1.Substring(markerAt + marker.Length);
                    fin  = str1.IndexOf(terminator);
                }

                if (fin < 1)
                {
                    Console.WriteLine("--------------------------------");
                    Console.WriteLine("");
                    Console.WriteLine("--------------------------------");
                    Console.WriteLine("Llegó al final");
                    return;
                }

                // The fragment stops one character before the terminator, as before.
                string str3 = str1.Substring(0, fin - 1);

                Console.WriteLine("--------------------------------");
                Console.WriteLine("");
                Console.WriteLine("--------------------------------");
                Console.WriteLine(str3);

                // Continue scanning after the marker just consumed.
                datos1 = str1;
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Downloads the weather.com page and collects up to 20 text fragments, each one
        /// taken between the footer marker and the next "ampvideo" token.
        /// </summary>
        /// <returns>
        /// A 20-slot <c>string[]</c>; slots beyond the number of fragments found stay null.
        /// </returns>
        /// <remarks>
        /// NOTE(review): the marker text looks specific to Google News markup — confirm it
        /// actually occurs on the weather.com page.
        /// </remarks>
        private async Task <Array> getHtmlClima()
        {
            const string marker     = "bookmark_bordersharemore_vertVer cobertura completakeyboard_arrow_up";
            const string terminator = "ampvideo";

            var url1 = "https://weather.com/es-AR/tiempo/hoy/l/ARBA0009:1:AR";

            string[] datosArray = new string[20];

            string html;

            // Dispose the client so its connection resources are released.
            using (var httpclient = new HttpClient())
            {
                html = await httpclient.GetStringAsync(url1);
            }

            var htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(html);

            // Assumes the second top-level node carries the page text — TODO confirm;
            // a document with fewer nodes makes this index throw.
            string datos1 = htmlDocument.DocumentNode.ChildNodes.ToArray()[1].InnerText;

            int i = 0;

            while (i < datosArray.Length)
            {
                int markerAt = datos1.IndexOf(marker);

                // IndexOf returns -1 when the marker is absent; stop instead of slicing
                // the text at an arbitrary offset.
                if (markerAt < 0)
                {
                    break;
                }

                string str1 = datos1.Substring(markerAt + marker.Length);
                int    fin  = str1.IndexOf(terminator);

                if (fin < 1)
                {
                    break;
                }

                // The fragment stops one character before the terminator, as before.
                datosArray[i] = str1.Substring(0, fin - 1);
                i++;

                // Continue scanning after the marker just consumed.
                datos1 = str1;
            }

            return(datosArray);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Loads the page at <paramref name="urlLink"/> and returns the outer HTML of each
        /// <c>table</c> element found in it.
        /// </summary>
        /// <param name="urlLink">Address of the page to load.</param>
        /// <returns>One string per table; an empty array when no table is selected.</returns>
        private string[] ParseHtmlSplitTables(string urlLink)
        {
            HtmlWeb            loader = new HtmlWeb();
            HtmlDocument       page   = loader.Load(urlLink);
            HtmlNodeCollection found  = page.DocumentNode.SelectNodes("//table");

            // A null selection means there is nothing to convert.
            if (found == null)
            {
                return(new string[] { });
            }

            HtmlNode[] tables = found.ToArray();
            string[]   markup = new string[tables.Length];

            for (int position = 0; position < tables.Length; position++)
            {
                markup[position] = tables[position].OuterHtml;
            }

            return(markup);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Removes every text node from the supplied <c>HtmlNodeCollection</c> and returns
        /// that same collection.
        /// </summary>
        /// <param name="value">Expected to be an <c>HtmlNodeCollection</c>.</param>
        /// <param name="targetType">Not used.</param>
        /// <param name="parameter">Not used.</param>
        /// <param name="culture">Not used.</param>
        /// <returns>
        /// The filtered collection, or null when <paramref name="value"/> is not an
        /// <c>HtmlNodeCollection</c>.
        /// </returns>
        public object Convert(object value, Type targetType, object parameter, CultureInfo culture)
        {
            var collection = value as HtmlNodeCollection;

            if (collection != null && collection.Count != 0)
            {
                // Walk a snapshot so that removing entries does not disturb the iteration.
                var snapshot = collection.ToArray();

                for (int position = 0; position < snapshot.Length; position++)
                {
                    var candidate = snapshot[position];

                    if (candidate.NodeType != HtmlNodeType.Text)
                    {
                        continue;
                    }

                    collection.Remove(candidate);
                }
            }

            return(collection);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Obtains pricing information from the Digikey page held in <c>WebPageData</c>.
        /// Each row of the "product-dollars" table with more than two cells becomes one
        /// price break (minimum quantity, USD unit price, converted price).
        /// </summary>
        /// <returns>
        /// The price breaks found, after <c>FixMaximumQtys</c> has adjusted them.
        /// </returns>
        /// <exception cref="NullReferenceException">
        /// When the "product-dollars" element or its table rows are missing (unchanged
        /// from the previous behaviour).
        /// </exception>
        /// <exception cref="FormatException">When a quantity or price cell is not numeric.</exception>
        public override PricingInfo[] GetPricingInfo()
        {
            // No try/catch here: the former "catch { throw ex; }" only reset the stack
            // trace of whatever was thrown, so exceptions now propagate untouched.
            List <PricingInfo> priceslist = new List <PricingInfo>(50);

            HtmlDocument htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(WebPageData);

            HtmlNode           node      = htmlDoc.GetElementbyId("product-dollars");
            HtmlNodeCollection tablerows = node.SelectNodes("tbody//tr");

            // The cells are treated as "," = thousands separator / "." = decimal point, so
            // they are parsed culture-independently rather than with the machine's locale.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            foreach (HtmlNode row in tablerows)
            {
                HtmlNodeCollection tablecolumns = row.SelectNodes("td");

                // Rows without data cells (null selection) or with too few cells are skipped.
                if (tablecolumns == null || tablecolumns.Count <= 2)
                {
                    continue;
                }

                string minqtystr = tablecolumns[0].InnerText.Replace(",", "");
                int    minqty    = Int32.Parse(minqtystr, System.Globalization.NumberStyles.Integer, invariant);

                double srcunitprice = Double.Parse(
                    tablecolumns[1].InnerText,
                    System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                    invariant);
                double destprice = Currency.Convert("USD", DefDestCurrency, srcunitprice);

                // 999999 is a placeholder maximum quantity; FixMaximumQtys runs afterwards.
                PricingInfo p = new PricingInfo("USD", DefDestCurrency, srcunitprice, destprice, minqty, 999999);
                priceslist.Add(p);
            }

            FixMaximumQtys(ref priceslist);

            return(priceslist.ToArray());
        }
        /// <summary>
        /// Parses the fetched markup and, for every anchor carrying an <c>onclick</c>
        /// attribute, extracts the first two single-quoted arguments of that attribute
        /// plus the anchor's text without its <c>span.s4</c> child.
        /// </summary>
        /// <param name="fetchedXml">Markup to parse.</param>
        /// <returns>
        /// A tuple of (first quoted values, second quoted values, anchor texts, found flag).
        /// The flag is false, and the lists empty, when no anchor with <c>onclick</c> exists.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// When an <c>onclick</c> value does not contain the expected quoted arguments.
        /// </exception>
        public (List <string>, List <string>, List <string>, Boolean) ReturnBaseXmlData(string fetchedXml)
        {
            List <string> _xmlWords = new List <string>();
            List <string> _mp3s     = new List <string>();
            List <string> _occ      = new List <string>();

            HtmlAgilityPack.HtmlDocument XmlDocument = new HtmlAgilityPack.HtmlDocument();
            XmlDocument.LoadHtml(fetchedXml);
            HtmlNodeCollection XmlTempNodes = XmlDocument.DocumentNode.SelectNodes("//a[@onclick]");

            if (XmlTempNodes == null)
            {
                //no parseable Item found
                return(_xmlWords, _mp3s, _occ, false);
            }

            // Snapshot the anchors because their children are modified inside the loop.
            foreach (HtmlNode htmlNode in XmlTempNodes.ToArray())
            {
                // Read the attribute from the anchor at hand instead of re-running the
                // document-wide XPath query on every iteration.
                string attrib = htmlNode.GetAttributeValue("onclick", "default");

                string tmpxml = attrib.Remove(0, attrib.IndexOf("'") + 1); // result: PERSOUN1.xml','persoun1.mp3')
                string tmpmp3 = tmpxml.Remove(0, tmpxml.IndexOf("'") + 1); // result: ,'persoun1.mp3')
                tmpmp3 = tmpmp3.Remove(0, tmpmp3.IndexOf("'") + 1);        // result: persoun1.mp3')

                // Drop this anchor's own s4 span so it is not part of the text. The lookup
                // is relative to the anchor: the former document-wide search failed when
                // the first span in the document was not a child of the current anchor.
                HtmlNode wordFormSpan = htmlNode.SelectSingleNode("./span[@class='s4']");
                if (wordFormSpan != null)
                {
                    htmlNode.RemoveChild(wordFormSpan);
                }

                _occ.Add(htmlNode.InnerText.Trim());
                _xmlWords.Add(tmpxml.Substring(0, tmpxml.IndexOf("'")));
                _mp3s.Add(tmpmp3.Substring(0, tmpmp3.IndexOf("'")));
            }

            return(_xmlWords, _mp3s, _occ, true);
        }
Exemplo n.º 8
0
        /// <summary>
        /// Loads the HTML document and, on success, keeps only its top-level element nodes,
        /// adds one list line per non-null black/white list identifier, and binds the
        /// remaining nodes to the tree.
        /// </summary>
        private async Task LoadHtmlAsync()
        {
            bool loaded = await LoadHtmlDocumentAsync();

            if (!loaded)
            {
                return;
            }

            HtmlNodeCollection topLevel = null;

            if (htmlDoc != null)
            {
                topLevel = htmlDoc.DocumentNode.ChildNodes;

                // Walk a snapshot so that removals do not disturb the iteration.
                var snapshot = topLevel.ToArray();

                for (int position = 0; position < snapshot.Length; position++)
                {
                    if (snapshot[position].NodeType == HtmlNodeType.Element)
                    {
                        continue;
                    }

                    topLevel.Remove(snapshot[position]);
                }
            }

            if (WebPage.BlackWhiteList != null)
            {
                foreach (var identifier in WebPage.BlackWhiteList)
                {
                    if (identifier != null)
                    {
                        // Each line works on its own copy of the identifier.
                        var entryLine = new HtmlBlackWhiteListItemLine(identifier.Clone());
                        entryLine.Deleted += Line_Deleted;
                        stkIdentifies.Children.Add(entryLine);
                    }
                }
            }

            tree.ItemTemplate = tree.Resources["htmlTemplate"] as HierarchicalDataTemplate;

            // topLevel stays null when no document is available.
            HtmlNodes        = topLevel;
            tree.ItemsSource = HtmlNodes;
        }
Exemplo n.º 9
0
        /// <summary>
        /// Downloads the maximum-price product list for <paramref name="provincia"/>,
        /// maps each JSON entry to a <c>Person</c> and writes them all to
        /// <c>Productos_{provincia}.xlsx</c>.
        /// </summary>
        /// <param name="provincia">
        /// Province name, inserted verbatim into the request URL and the output file name.
        /// NOTE(review): it is not URL-escaped here — confirm callers pass a URL-safe value.
        /// </param>
        /// <remarks>
        /// NOTE(review): the method is <c>async void</c>, so callers cannot await it or
        /// observe its exceptions; the signature is kept for compatibility. The output
        /// folder is a hard-coded absolute path.
        /// </remarks>
        private static async void getHtml(string provincia)
        {
            var url1 = "https://preciosmaximos.argentina.gob.ar/api/products?pag=1&Provincia=";
            var url2 = url1 + provincia + "&regs=15000";

            string html;

            // Dispose the client so its connection resources are released.
            using (var httpclient = new HttpClient())
            {
                html = await httpclient.GetStringAsync(url2);
            }

            var htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(html);

            // NOTE(review): the response is run through the HTML parser and only the text of
            // the first top-level node is kept — presumably the whole JSON payload; confirm.
            string datos1 = htmlDocument.DocumentNode.ChildNodes.ToArray()[0].InnerText;

            // The JSON key contains a space; rename it so it can be read as a member name.
            datos1 = datos1.Replace(@"Precio sugerido", "Precio_sugerido");
            var items = JsonConvert.DeserializeObject <dynamic>(datos1);

            var item7 = items.result;
            var largo = items.result.Count;

            List <Person> laLista = new List <Person>();

            for (int i = 0; i < largo; i++)
            {
                Person indiv = new Person();

                indiv.Precio_sugerido  = item7[i].Precio_sugerido;
                indiv.Producto         = item7[i].Producto;
                indiv.Producto_s_tilde = item7[i].Producto_s_tilde;
                indiv.Provincia        = item7[i].Provincia;
                indiv.Region           = item7[i].Region;
                indiv.categoria        = item7[i].categoria;
                indiv.id_producto      = item7[i].id_producto;
                indiv.marca            = item7[i].marca;
                indiv.subcategoria     = item7[i].subcategoria;

                laLista.Add(indiv);
            }

            // Dispose the workbook once it has been saved.
            using (var workbook = new XLWorkbook())
            {
                workbook.AddWorksheet("sheetName");
                var ws = workbook.Worksheet("sheetName");

                // Header row (columns A to I).
                int row = 1;

                string[] headers =
                {
                    "Precio_sugerido", "Producto", "Producto_s_tilde", "Provincia", "Region",
                    "categoria", "id_producto", "marca", "subcategoria"
                };

                for (int col = 0; col < headers.Length; col++)
                {
                    ws.Cell(row, col + 1).Value = headers[col];
                }
                row++;

                // One worksheet row per product.
                foreach (var c in laLista)
                {
                    ws.Cell(row, 1).Value = c.Precio_sugerido;
                    ws.Cell(row, 2).Value = c.Producto;
                    ws.Cell(row, 3).Value = c.Producto_s_tilde;
                    ws.Cell(row, 4).Value = c.Provincia;
                    ws.Cell(row, 5).Value = c.Region;
                    ws.Cell(row, 6).Value = c.categoria;
                    ws.Cell(row, 7).Value = c.id_producto;
                    ws.Cell(row, 8).Value = c.marca;
                    ws.Cell(row, 9).Value = c.subcategoria;
                    row++;
                }

                // Save the workbook.
                workbook.SaveAs("C:\\Users\\Marcelo\\source\\repos\\Core\\Pruebas\\Pruebas\\Excels\\Productos_" + provincia + ".xlsx");
            }

            Console.WriteLine("Listo provincia de " + provincia);
        }
Exemplo n.º 10
0
        /// <summary>
        /// Asks for a year, and when no holiday of that year is stored yet, scrapes the
        /// Indonesian holiday table from officeholidays.com, stores one record per table
        /// row and prints each date with its description.
        /// </summary>
        /// <param name="args">Not used.</param>
        static void Main(string[] args)
        {
            Console.Write("Input Year : ");
            // Year to look up, as typed by the user.
            string param = Console.ReadLine();

            // Dispose the context when done so its database resources are released.
            using (MyContext _context = new MyContext())
            {
                // Check the database: skip the download if the year already exists.
                bool alreadyStored = _context.Holiday.Any(a => a.Tahun == param);

                if (alreadyStored)
                {
                    Console.WriteLine("List holiday in year " + param + " already exists");
                }
                else
                {
                    // Grab the table rows from the website.
                    HtmlWeb document  = new HtmlWeb();
                    var     document2 = document.Load(@"https://www.officeholidays.com/countries/indonesia/" + param);
                    HtmlNodeCollection nodes = document2.DocumentNode.SelectNodes("//table[@class='country-table']/tbody/tr");

                    if (nodes == null)
                    {
                        Console.WriteLine("List holiday in year " + param + " not found!");
                    }
                    else
                    {
                        foreach (HtmlNode item in nodes)
                        {
                            // The date is taken from the sixth quote-delimited piece of the
                            // row markup — presumably an attribute value; verify against the page.
                            string[] date = item.InnerHtml.Split('"');

                            var words = item.InnerText.Split('\n')
                                        .Where(x => !x.Contains("&nbsp;") && !string.IsNullOrEmpty(x.Trim()))
                                        .ToList();

                            // Skip rows that do not have the expected shape instead of crashing.
                            if (date.Length <= 5 || words.Count <= 2)
                            {
                                continue;
                            }

                            var tgl   = date[5];
                            var tahun = tgl.Split('-')[0];

                            // Decode every encoded apostrophe; the previous split/join kept
                            // only the text around the first one.
                            string detail = words[2].Replace("&#039;", "'");

                            // Queue the record for insertion.
                            TBL_M_HOLIDAYS _holidays = new TBL_M_HOLIDAYS();
                            _holidays.Date_Holiday = Convert.ToDateTime(tgl);
                            _holidays.Keterangan   = detail;
                            _holidays.Tahun        = tahun;

                            _context.Holiday.Add(_holidays);

                            // Show date and detail in the console.
                            var result = $"{tgl} ; {detail}";
                            Console.WriteLine(result);
                        }

                        // Save once for all rows so the count reflects the whole import
                        // rather than only the last row.
                        var hasil = _context.SaveChanges();

                        if (hasil > 0)
                        {
                            Console.WriteLine("\nInsert Success");
                        }
                        else
                        {
                            Console.WriteLine("\nInsert Fail");
                        }
                    }
                }
            }

            Console.ReadKey();
        }
Exemplo n.º 11
0
        /// <summary>
        /// Crawls one URL. For a host that is known, allowed and not yet visited, the page
        /// is downloaded, one index row is inserted per distinct word of its title, and
        /// every in-domain link is added to the URL queue. For an unknown host inside the
        /// allowed domains, the URL is added to both the robots queue and the URL queue.
        /// </summary>
        /// <param name="url">Absolute URL to crawl.</param>
        /// <returns>
        /// A tuple of (index rows inserted, URL-queue delta). The delta starts at -1 —
        /// presumably for the URL just consumed — and grows by one per URL queued.
        /// </returns>
        /// <remarks>
        /// Failures are written to the errors table instead of being thrown.
        /// NOTE(review): the <c>ExecuteAsync</c>/<c>AddMessageAsync</c> calls are not
        /// awaited, so their failures go unobserved and the counters may overstate what
        /// was really stored; left unchanged because this method is synchronous.
        /// </remarks>
        public Tuple <int, int> crawlSite(string url)
        {
            int updateIndex = 0;
            int updateQueue = -1;

            try
            {
                Uri  uri = new Uri(url);
                Host host;

                if (!hosts.TryGetValue(uri.Host, out host))
                {
                    //robots.txt has not been parsed for this host: if it is within the
                    //domain, queue it for robots parsing and put the url back in the queue
                    if (Operation.domains.Values.Any(uri.Host.Contains))
                    {
                        CloudQueueMessage robotMessage = new CloudQueueMessage(uri.AbsoluteUri);
                        robotQueue.AddMessage(robotMessage);

                        CloudQueueMessage urlMessage = new CloudQueueMessage(uri.AbsoluteUri);
                        urlQueue.AddMessage(urlMessage);
                        updateQueue++;
                    }
                    return(new Tuple <int, int>(updateIndex, updateQueue));
                }

                //nothing to do for disallowed or already visited urls
                if (!host.isAllowed(uri) || host.hasVisited(uri))
                {
                    return(new Tuple <int, int>(updateIndex, updateQueue));
                }

                HtmlWeb      web     = new HtmlWeb();
                HtmlDocument htmlDoc = web.Load(uri.AbsoluteUri);

                if (web.StatusCode != HttpStatusCode.OK)
                {
                    return(new Tuple <int, int>(updateIndex, updateQueue));
                }

                host.addVisited(uri);

                HtmlNode root = htmlDoc.DocumentNode;

                //title: meta title, then Open Graph title, then the <title> element
                string title = firstMetaContent(root,
                                                "//meta[@name='title']",
                                                "//meta[@property='og:title' or @name='og:title']");
                if (title == null)
                {
                    HtmlNode titleNode = root.SelectSingleNode("//title");
                    title = titleNode != null ? HttpUtility.HtmlDecode(titleNode.InnerHtml) : "";
                }

                //date: last modification, then publication date, then the current UTC time
                string date = firstMetaContent(root,
                                               "//meta[@name='lastmod']",
                                               "//meta[@name='pubdate']")
                              ?? DateTime.UtcNow.ToString("s", System.Globalization.CultureInfo.InvariantCulture);

                //body: description, then Open Graph description, then the title
                string body = firstMetaContent(root,
                                               "//meta[@name='description']",
                                               "//meta[@property='og:description' or @name='og:description']")
                              ?? title;

                //Insert page with each word in the title as a row key
                string[] keyWord = Operation.stripPunct(title.ToLower()).Split().Distinct().ToArray();
                foreach (string key in keyWord)
                {
                    if (key == "")
                    {
                        continue;
                    }

                    try
                    {
                        //get page data and store to table
                        PageEntity     page            = new PageEntity(uri, title, date, body, key);
                        TableOperation insertOperation = TableOperation.Insert(page);
                        pagesTable.Execute(insertOperation);
                        updateIndex++;
                    }
                    catch (Exception e)
                    {
                        //Insert error to table
                        ErrorEntity    err             = new ErrorEntity(url, e.Message, DateTime.Now.ToString());
                        TableOperation insertOperation = TableOperation.Insert(err);
                        errorsTable.ExecuteAsync(insertOperation);

                        Console.Write(e.ToString());
                    }
                }

                HtmlNodeCollection linkNodes = root.SelectNodes("//a");
                if (linkNodes != null)
                {
                    foreach (HtmlNode link in linkNodes)
                    {
                        //anchors without href are skipped up front rather than being
                        //reported as invalid urls through an exception
                        string href = link.GetAttributeValue("href", null);
                        if (href == null)
                        {
                            continue;
                        }

                        //add url if within allowed domain
                        try
                        {
                            Uri newUri = new Uri(uri, href);
                            if (!Operation.domains.Values.Any(newUri.Host.Contains))
                            {
                                continue;
                            }

                            //the BR1/BR2 hosts are only crawled below the _BR_PATH prefix
                            bool restrictedHost = newUri.Host.Contains(Operation.domains["BR1"]) ||
                                                  newUri.Host.Contains(Operation.domains["BR2"]);
                            if (restrictedHost && !newUri.AbsolutePath.StartsWith(Operation._BR_PATH))
                            {
                                continue;
                            }

                            CloudQueueMessage urlMessage = new CloudQueueMessage(newUri.AbsoluteUri);
                            urlQueue.AddMessageAsync(urlMessage);
                            updateQueue++;
                        }
                        catch (Exception e)
                        {
                            //Invalid url
                            Console.WriteLine("Invalid html url found: " + e.ToString());
                        }
                    }
                }
            }
            catch (Exception e)
            {
                //Insert error to table
                ErrorEntity err = new ErrorEntity(url, e.Message, DateTime.Now.ToString());
                try
                {
                    TableOperation insertOperation = TableOperation.Insert(err);
                    errorsTable.ExecuteAsync(insertOperation);
                }
                catch (Exception insErr)
                {
                    Console.Write(insErr.ToString());
                }
            }
            return(new Tuple <int, int>(updateIndex, updateQueue));
        }

        //returns the content attribute of the first node matched by the given xpaths,
        //tried in order, or null when none of them matches
        private static string firstMetaContent(HtmlNode root, params string[] xpaths)
        {
            foreach (string xpath in xpaths)
            {
                HtmlNode metaNode = root.SelectSingleNode(xpath);
                if (metaNode != null)
                {
                    return(metaNode.GetAttributeValue("content", ""));
                }
            }
            return(null);
        }