/// <summary>
/// Downloads UOHS decision detail pages whose ids lie in
/// [<paramref name="startFrom"/>, <paramref name="startFrom"/> + <paramref name="count"/>),
/// parses each page with XPath and stores the result through <c>ds.AddOrUpdateItem</c>.
/// Any exception raised while processing a single page is logged together with the
/// page URL and the batch continues with the next id.
/// </summary>
/// <param name="datasetId">Not read by this method body.</param>
/// <param name="startFrom">First page id to download.</param>
/// <param name="count">Number of consecutive page ids to process.</param>
public static void ParsePages(string datasetId, int startFrom = 10000, int count = 600)
{
    // Original author's note: the job is meant to run in 2 threads; be considerate
    // to the remote server and do not raise that number.
    // NOTE(review): no thread count is passed in this call - confirm where it is configured.
    Devmasters.Batch.Manager.DoActionForAll<int>(
        Enumerable.Range(startFrom, count),
        (i) =>
        {
            string url = "";
            try
            {
                // Every decision lives on its own page with a simple URL in which the
                // page number grows over time. As of 2018-09-01 the latest decision
                // had a number of about 15500.
                url = $"http://www.uohs.cz/cs/verejne-zakazky/sbirky-rozhodnuti/detail-{i}.html";

                // Download the HTML; WebClient is IDisposable, so release it right away.
                string html;
                using (var wc = new System.Net.WebClient())
                {
                    wc.Encoding = System.Text.Encoding.UTF8;
                    html = wc.DownloadString(url);
                }

                // Devmasters.XPath wraps the downloaded HTML for XPath queries
                // (per the original comment it converts to XHTML via HtmlAgilityPack).
                Devmasters.XPath page = new Devmasters.XPath(html);

                // The page does not exist, skip it.
                if (page.GetNodeText("//head/title")?.Contains("stránka neexistuje") == true)
                {
                    return new Devmasters.Batch.ActionOutputData();
                }

                logger.Debug($"parsing {url}");

                // The parsed values are collected into this item.
                var item = new UOHSData();
                item.Url = url;
                item.Id = i.ToString();

                // All queried content sits inside this DIV; keep the prefix in one place.
                var root = "//div[@id='content']";

                // XPath of the value cell in the detail table row whose header contains `label`.
                Func<string, string> detailCell = label =>
                    root + "//table[@id='resolution_detail']//tr//th[contains(text(),'" + label + "')]/parent::tr/td";

                item.Cj = page.GetNodeText(root + "//div/h1/strong[1]")?.Replace("Rozhodnutí: ", "");
                item.SpisovaZnacka = page.GetNodeText(root + "//div/h1/strong[2]")?.Replace("Rozhodnutí: ", "");
                item.SoudniRozhodnuti = page.GetNodeText(root + "//div//h1/following-sibling::h2[1]");
                item.Instance = page.GetNodeText(detailCell("Instance"));
                item.Vec = page.GetNodeText(detailCell("Věc"));

                var ucastniciNode = page.GetNodes(detailCell("Účastníci") + "/ol/li");
                List<UOHSData.Ucastnik> ucastnici = new List<UOHSData.Ucastnik>();
                if (ucastniciNode != null)
                {
                    foreach (var node in ucastniciNode)
                    {
                        // Decode HTML entities in the participant name into plain characters.
                        var firmaJmeno = System.Net.WebUtility.HtmlDecode(node.InnerText);

                        // Look up the company ICO by name through the hlidacstatu.cz API.
                        var ico = httpClient.GetAsync("https://www.hlidacstatu.cz/api/v2/firmy/" + System.Net.WebUtility.UrlEncode(firmaJmeno))
                            .Result.Content
                            .ReadAsStringAsync().Result;
                        try
                        {
                            var icoRes = Newtonsoft.Json.Linq.JObject.Parse(ico);
                            if (icoRes["ico"] == null)
                            {
                                ucastnici.Add(new UOHSData.Ucastnik() { Jmeno = firmaJmeno });
                            }
                            else
                            {
                                ucastnici.Add(new UOHSData.Ucastnik() { Jmeno = firmaJmeno, ICO = icoRes["ico"].Value<string>() });
                            }
                        }
                        catch (Exception)
                        {
                            // The response could not be read as the expected JSON;
                            // keep the participant with the name only.
                            ucastnici.Add(new UOHSData.Ucastnik() { Jmeno = firmaJmeno });
                        }
                    }
                }
                item.Ucastnici = ucastnici.ToArray();

                item.Typ_spravniho_rizeni = page.GetNodeText(detailCell("Typ správního řízení"));
                item.Typ_rozhodnuti = page.GetNodeText(detailCell("Typ rozhodnutí"));
                item.Rok = page.GetNodeText(detailCell("Rok"));
                item.PravniMoc = ToDateTimeFromCZ(page.GetNodeText(detailCell("Datum nabytí právní moci")));

                // Links to related decisions are site-relative; make them absolute.
                var souvis_urls = page.GetNodes(detailCell("Související rozhodnutí") + "/a");
                if (souvis_urls != null)
                {
                    item.SouvisejiciUrl = souvis_urls
                        .Select(m => m.Attributes["href"]?.Value)
                        .Where(m => m != null)
                        .Select(u => "http://www.uohs.cz" + u)
                        .ToArray();
                }

                item.Rozhodnuti = new UOHSData.Dokument();
                item.Rozhodnuti.Url = page.GetNode(detailCell("Dokumenty") + "/a")
                    ?.Attributes["href"]?.Value;
                if (!string.IsNullOrEmpty(item.Rozhodnuti.Url))
                {
                    // Prefix the document's own relative href. The previous code appended
                    // item.SouvisejiciUrl (a string[]), which produced
                    // "http://www.uohs.czSystem.String[]" instead of a usable link.
                    item.Rozhodnuti.Url = "http://www.uohs.cz" + item.Rozhodnuti.Url;
                }

                item.Rozhodnuti.PlainText = page.GetNode("//div[@id='content']//div[@class='res_text']")?.InnerText ?? "";

                // Parsing finished, store the record in the dataset.
                logger.Debug($"adding item {item.Id} - {item.Url}");
                ds.AddOrUpdateItem(item, HlidacStatu.Api.V2.Dataset.Typed.ItemInsertMode.rewrite);
            }
            catch (Exception e)
            {
                // One broken page must not stop the whole batch; log it with its URL.
                logger.Error(url, e);
            }

            return new Devmasters.Batch.ActionOutputData();
        },
        outputWriter.OutputWriter,
        progressWriter.ProgressWriter,
        // NOTE(review): presumably toggles parallel execution off while debugging - confirm
        // against the DoActionForAll signature.
        !System.Diagnostics.Debugger.IsAttached
    );
}
/// <summary>
/// Downloads the newest "dostupnost-kapacit" ZIP from share.uzis.cz (trying today and
/// the six previous days), extracts the .xlsx it contains, reads the per-region
/// capacity rows from the workbook and stores them through <c>ds.AddOrUpdateItem</c>.
/// Any failure after the download phase is logged and reported by e-mail.
/// </summary>
private static void GetExcelFromUzisZIP_Old()
{
    string fn = GetExecutingDirectoryName() + $"\\dip-report-kraje-{DateTime.Now:yyyyMMdd-HHmmss}.xlsx";
    string fnTemp = System.IO.Path.GetTempFileName();

    // Find the newest ZIP: walk back day by day and stop at the first successful download.
    for (int i = 0; i < 7; i++)
    {
        DateTime dt = DateTime.Now.Date.AddDays(-1 * i);
        // An earlier form of this URL used "path=%2F" with no per-month folder.
        string zipUrl = $"https://share.uzis.cz/s/fbCgFKagS6fCrzc/download?path=%2F{dt.Year}-{dt.ToString("MM")}%20({dt.ToString("MMMM", System.Globalization.CultureInfo.GetCultureInfo("cs"))}%20{dt.Year})&files={dt:yyyy-MM-dd}-dostupnost-kapacit.zip";

        Devmasters.Logging.Logger.Root.Info($"Getting ZIP url {zipUrl}");
        using (Devmasters.Net.HttpClient.URLContent net = new Devmasters.Net.HttpClient.URLContent(zipUrl))
        {
            try
            {
                System.IO.File.WriteAllBytes(fnTemp, net.GetBinary().Binary);
                break;
            }
            catch (Exception e)
            {
                // A missing file for a given day is expected; record it instead of
                // swallowing it silently and try the previous day.
                Devmasters.Logging.Logger.Root.Info($"ZIP for {dt:yyyy-MM-dd} not downloaded: {e.Message}");
            }
        }
    }

    try
    {
        Devmasters.Logging.Logger.Root.Info("Getting Excel from ZIP");

        // Extract the xlsx from the ZIP. If no download succeeded, fnTemp is still the
        // empty temp file and opening it fails into the catch below.
        using (ZipArchive archive = ZipFile.OpenRead(fnTemp))
        {
            foreach (ZipArchiveEntry entry in archive.Entries)
            {
                if (entry.FullName.EndsWith(".xlsx", StringComparison.OrdinalIgnoreCase))
                {
                    entry.ExtractToFile(fn);
                }
            }
        }

        // Disabled alternative: download the xlsx directly from the web instead of the ZIP.
        if (false)
        {
            // Find the xls URL on the open-data page.
            string openDataPage = "https://onemocneni-aktualne.mzcr.cz/api/v2/covid-19";
            Uri xlsUrl = null;
            Devmasters.Logging.Logger.Root.Info("Getting URL of XLS from " + openDataPage);
            using (Devmasters.Net.HttpClient.URLContent net = new Devmasters.Net.HttpClient.URLContent(openDataPage))
            {
                Devmasters.Logging.Logger.Root.Info("Getting Excel URL");
                var html = net.GetContent().Text;
                Devmasters.XPath xp = new Devmasters.XPath(html);
                var node = xp.GetNode("//a[contains(@href,'dip-report-kraje.xlsx')]");
                if (node != null)
                {
                    xlsUrl = new Uri("https://onemocneni-aktualne.mzcr.cz" + node.Attributes["href"].Value);
                }
            }

            if (xlsUrl == null)
            {
                Devmasters.Logging.Logger.Root.Fatal("No URL to download");
                return;
            }

            using (Devmasters.Net.HttpClient.URLContent net = new Devmasters.Net.HttpClient.URLContent(xlsUrl.AbsoluteUri))
            {
                Devmasters.Logging.Logger.Root.Info("Getting Excel");
                System.IO.File.WriteAllBytes(fn, net.GetBinary().Binary);
            }
        }

        ExcelPackage.LicenseContext = LicenseContext.NonCommercial;
        using (var p = new ExcelPackage(new System.IO.FileInfo(fn)))
        {
            ExcelWorksheet ws = p.Workbook.Worksheets[1];

            // Scan column 1 for the "Stav k datu:" header that carries the report date;
            // the region rows are read relative to that header row.
            for (int row = 1; row < 100000; row++)
            {
                Console.Write(".");
                var txt = ws.Cells[row, 1].GetValue<string>();
                if (txt != null && txt.StartsWith("Stav k datu:"))
                {
                    string head = txt.Replace("Stav k datu: ", "");
                    string sdate = Devmasters.RegexUtil.GetRegexGroupValue(head, @" \s* (?<dt>\d{1,2}\s*\.\s*\d{1,2}\s*\.\s*\d{4} )", "dt");
                    // .Value throws when the date cannot be parsed; that is handled by
                    // the outer catch.
                    DateTime dt = Devmasters.DT.Util.ToDate(sdate).Value;
                    string id = "id_" + dt.ToString("yyyy-MM-dd");

                    // Reuse the stored record for that day when there is one.
                    NemocniceData nd = null;
                    try
                    {
                        nd = ds.GetItem(id);
                    }
                    catch (Exception)
                    {
                        // Best effort: a failed lookup is treated the same as "no record yet".
                    }
                    if (nd == null)
                    {
                        nd = new NemocniceData();
                        nd.regions = new List<NemocniceData.Region>();
                    }

                    nd.lastUpdated = dt;
                    nd.id = id;
                    Console.WriteLine(".");
                    Devmasters.Logging.Logger.Root.Info(nd.lastUpdated.ToString());

                    // The region rows start 4 rows below the date header.
                    row = row + 4;

                    List<NemocniceData.Region> finalRegs = new List<NemocniceData.Region>();
                    for (int regs = 0; regs < 14; regs++)
                    {
                        int line = row + regs;
                        string region = ws.Cells[line, 1].GetValue<string>();

                        // Update the existing region entry in place, or start a new one.
                        NemocniceData.Region r = nd.regions.FirstOrDefault(m => m.region == region);
                        if (r == null)
                        {
                            r = new NemocniceData.Region();
                        }

                        r.lastModified = nd.lastUpdated;
                        r.region = region;
                        // NOTE(review): column 5 feeds both UPV_volna and ECMO_celkem, and
                        // every other group is separated by a skipped column - the UPV/ECMO
                        // column numbers look off by one group. Confirm against the
                        // spreadsheet layout before changing.
                        r.UPV_celkem = ws.Cells[line, 4].GetValue<int>();
                        r.UPV_volna = ws.Cells[line, 5].GetValue<int>();
                        r.ECMO_celkem = ws.Cells[line, 5].GetValue<int>();
                        r.ECMO_volna = ws.Cells[line, 6].GetValue<int>();
                        r.CRRT_celkem = ws.Cells[line, 8].GetValue<int>();
                        r.CRRT_volna = ws.Cells[line, 9].GetValue<int>();
                        r.IHD_celkem = ws.Cells[line, 11].GetValue<int>();
                        r.IHD_volna = ws.Cells[line, 12].GetValue<int>();
                        r.AROJIP_luzka_celkem = ws.Cells[line, 14].GetValue<int>();
                        r.AROJIP_luzka_covid = ws.Cells[line, 15].GetValue<int>();
                        r.AROJIP_luzka_necovid = ws.Cells[line, 16].GetValue<int>();
                        r.Standard_luzka_s_kyslikem_celkem = ws.Cells[line, 18].GetValue<int>();
                        r.Standard_luzka_s_kyslikem_covid = ws.Cells[line, 19].GetValue<int>();
                        r.Standard_luzka_s_kyslikem_necovid = ws.Cells[line, 20].GetValue<int>();
                        r.Lekari_AROJIP_celkem = ws.Cells[line, 22].GetValue<int>();
                        r.Lekari_AROJIP_dostupni = ws.Cells[line, 23].GetValue<int>();
                        r.Sestry_AROJIP_celkem = ws.Cells[line, 25].GetValue<int>();
                        r.Sestry_AROJIP_dostupni = ws.Cells[line, 26].GetValue<int>();
                        r.Ventilatory_prenosne_celkem = ws.Cells[line, 28].GetValue<int>();
                        r.Ventilatory_operacnisal_celkem = ws.Cells[line, 29].GetValue<int>();
                        r.Standard_luzka_celkem = ws.Cells[line, 30].GetValue<int>();
                        r.Standard_luzka_s_monitor_celkem = ws.Cells[line, 31].GetValue<int>();

                        finalRegs.Add(r);
                    }

                    nd.regions = finalRegs;

                    // Skip past the block that was just read before looking for the next header.
                    row = row + 16;

                    Devmasters.Logging.Logger.Root.Info("Saving");
                    ds.AddOrUpdateItem(nd, HlidacStatu.Api.V2.Dataset.Typed.ItemInsertMode.rewrite);
                }
            }
        }
    }
    catch (Exception e)
    {
        Devmasters.Logging.Logger.Root.Error("Processing ZIP XLS error", e);
        SendMail("*****@*****.**", "Selhalo zpracovani dat z UZIS", e.ToString(), "");
    }
    finally
    {
        // Path.GetTempFileName creates a real file on disk; remove it so repeated runs
        // do not pile up temp files.
        try
        {
            System.IO.File.Delete(fnTemp);
        }
        catch (Exception ex)
        {
            Devmasters.Logging.Logger.Root.Info($"Cannot delete temp file {fnTemp}: {ex.Message}");
        }
    }
}