Example #1
0
        /// <summary>
        /// Downloads UOHS decision detail pages for the ids <paramref name="startFrom"/> ..
        /// <paramref name="startFrom"/> + <paramref name="count"/> - 1, extracts the decision
        /// metadata with XPath and stores every parsed decision through <c>ds.AddOrUpdateItem</c>.
        /// </summary>
        /// <remarks>
        /// Any exception raised while processing a single page is logged together with the page URL
        /// and does not stop the batch. Pages whose title reports a non-existing page are skipped.
        /// </remarks>
        /// <param name="datasetId">Not referenced in this method body; kept for caller compatibility.</param>
        /// <param name="startFrom">First page number to download.</param>
        /// <param name="count">Number of consecutive page numbers to process.</param>
        public static void ParsePages(string datasetId, int startFrom = 10000, int count = 600)
        {
            const string siteRoot = "http://www.uohs.cz";

            // No content of interest lives outside this DIV, so every XPath below is anchored to it.
            const string root = "//div[@id='content']";

            // Builds the XPath of the value cell (td) in the row of the detail table whose header contains the label.
            Func<string, string> detailCell = label =>
                root + "//table[@id='resolution_detail']//tr//th[contains(text(),'" + label + "')]/parent::tr/td";

            Devmasters.Batch.Manager.DoActionForAll<int>(Enumerable.Range(startFrom, count),
                // Original author's note: this runs in 2 threads; be considerate to the server and do not use more.
                // NOTE(review): no explicit thread limit is passed here - confirm the batch manager's default.
                (i) =>
                {
                    string url = "";
                    try
                    {
                        // Every decision has its own page with a simple URL; the page number grows over time.
                        // As of 2018-09-01 the latest decision had a number of roughly 15500.
                        url = $"{siteRoot}/cs/verejne-zakazky/sbirky-rozhodnuti/detail-{i}.html";

                        // Download the HTML; the client is disposed as soon as the page is in memory.
                        string html;
                        using (var wc = new System.Net.WebClient())
                        {
                            wc.Encoding = System.Text.Encoding.UTF8;
                            html = wc.DownloadString(url);
                        }

                        // Devmasters.XPath wraps the downloaded HTML for XPath queries
                        // (per the original comment it converts it via HtmlAgilityPack).
                        Devmasters.XPath page = new Devmasters.XPath(html);

                        // The page does not exist, skip it.
                        if (page.GetNodeText("//head/title")?.Contains("stránka neexistuje") == true)
                        {
                            return new Devmasters.Batch.ActionOutputData();
                        }

                        logger.Debug($"parsing {url}");

                        // The parsed values are collected in this item.
                        var item = new UOHSData();
                        item.Url = url;
                        item.Id = i.ToString();

                        item.Cj = page.GetNodeText(root + "//div/h1/strong[1]")?.Replace("Rozhodnutí: ", "");
                        item.SpisovaZnacka = page.GetNodeText(root + "//div/h1/strong[2]")?.Replace("Rozhodnutí: ", "");
                        item.SoudniRozhodnuti = page.GetNodeText(root + "//div//h1/following-sibling::h2[1]");

                        item.Instance = page.GetNodeText(detailCell("Instance"));
                        item.Vec = page.GetNodeText(detailCell("Věc"));

                        // Participants: one list item per participant; try to find the company ID (ICO) for each.
                        var ucastniciNode = page.GetNodes(detailCell("Účastníci") + "/ol/li");
                        List<UOHSData.Ucastnik> ucastnici = new List<UOHSData.Ucastnik>();
                        if (ucastniciNode != null)
                        {
                            foreach (var node in ucastniciNode)
                            {
                                // Convert HTML entities to plain characters, e.g. &eacute; -> é
                                var firmaJmeno = System.Net.WebUtility.HtmlDecode(node.InnerText);

                                // Look the company up by name. A failure of the HTTP call itself is
                                // intentionally not handled here; it ends up in the outer catch.
                                var ico = httpClient.GetAsync("https://www.hlidacstatu.cz/api/v2/firmy/" + System.Net.WebUtility.UrlEncode(firmaJmeno))
                                          .Result.Content
                                          .ReadAsStringAsync().Result;

                                var ucastnik = new UOHSData.Ucastnik()
                                {
                                    Jmeno = firmaJmeno
                                };
                                try
                                {
                                    var icoToken = Newtonsoft.Json.Linq.JObject.Parse(ico)["ico"];
                                    if (icoToken != null)
                                    {
                                        ucastnik.ICO = icoToken.Value<string>();
                                    }
                                }
                                catch (Exception)
                                {
                                    // Best effort: the response was not usable JSON, keep the participant without ICO.
                                }
                                ucastnici.Add(ucastnik);
                            }
                        }
                        item.Ucastnici = ucastnici.ToArray();

                        item.Typ_spravniho_rizeni = page.GetNodeText(detailCell("Typ správního řízení"));
                        item.Typ_rozhodnuti = page.GetNodeText(detailCell("Typ rozhodnutí"));
                        item.Rok = page.GetNodeText(detailCell("Rok"));

                        item.PravniMoc = ToDateTimeFromCZ(page.GetNodeText(detailCell("Datum nabytí právní moci")));

                        // Related decisions: the hrefs are prefixed with the site root.
                        var souvis_urls = page.GetNodes(detailCell("Související rozhodnutí") + "/a");
                        if (souvis_urls != null)
                        {
                            item.SouvisejiciUrl = souvis_urls
                                                  .Select(m => m.Attributes["href"]?.Value)
                                                  .Where(m => m != null)
                                                  .Select(u => siteRoot + u)
                                                  .ToArray();
                        }

                        // The decision document itself: first link in the "Dokumenty" row.
                        item.Rozhodnuti = new UOHSData.Dokument();
                        var documentHref = page.GetNode(detailCell("Dokumenty") + "/a")?.Attributes["href"]?.Value;

                        // Fix: prefix the document's own href. The previous code concatenated the
                        // SouvisejiciUrl array here, which produced an unusable URL.
                        item.Rozhodnuti.Url = string.IsNullOrEmpty(documentHref)
                            ? documentHref
                            : siteRoot + documentHref;

                        item.Rozhodnuti.PlainText = page.GetNode(root + "//div[@class='res_text']")?.InnerText ?? "";

                        // Parsing is done, store the record in the dataset.
                        logger.Debug($"adding item {item.Id} - {item.Url}");

                        ds.AddOrUpdateItem(item, HlidacStatu.Api.V2.Dataset.Typed.ItemInsertMode.rewrite);
                    }
                    catch (Exception e)
                    {
                        logger.Error(url, e);
                    }

                    return new Devmasters.Batch.ActionOutputData();
                },
                outputWriter.OutputWriter, progressWriter.ProgressWriter,
                !System.Diagnostics.Debugger.IsAttached
                );
        }
Example #2
0
        /// <summary>
        /// Downloads the most recent "dostupnost-kapacit" ZIP from the UZIS share (trying today and
        /// the six preceding days), extracts the .xlsx it contains, reads the per-region capacity
        /// rows from the workbook and stores them through <c>ds.AddOrUpdateItem</c>.
        /// </summary>
        /// <remarks>
        /// Any failure while extracting or processing the workbook is logged and reported by e-mail.
        /// The temporary file used for the downloaded ZIP is removed before the method returns.
        /// </remarks>
        private static void GetExcelFromUzisZIP_Old()
        {
            string fn = GetExecutingDirectoryName() + $"\\dip-report-kraje-{DateTime.Now:yyyyMMdd-HHmmss}.xlsx";

            string fnTemp = System.IO.Path.GetTempFileName();

            try
            {
                // Find the newest ZIP: start with today and walk back up to 6 days.
                bool downloaded = false;
                for (int i = 0; i < 7; i++)
                {
                    DateTime dt = DateTime.Now.Date.AddDays(-1 * i);
                    // Previous URL form: $"https://share.uzis.cz/s/fbCgFKagS6fCrzc/download?path=%2F&files={dt:yyyy-MM-dd}-dostupnost-kapacit.zip"
                    string zipUrl = $"https://share.uzis.cz/s/fbCgFKagS6fCrzc/download?path=%2F{dt.Year}-{dt.ToString("MM")}%20({dt.ToString("MMMM", System.Globalization.CultureInfo.GetCultureInfo("cs"))}%20{dt.Year})&files={dt:yyyy-MM-dd}-dostupnost-kapacit.zip";
                    Devmasters.Logging.Logger.Root.Info($"Getting ZIP url {zipUrl}");

                    using (Devmasters.Net.HttpClient.URLContent net = new Devmasters.Net.HttpClient.URLContent(zipUrl))
                    {
                        try
                        {
                            System.IO.File.WriteAllBytes(fnTemp, net.GetBinary().Binary);
                            downloaded = true;
                            break;
                        }
                        catch (Exception e)
                        {
                            // A missing ZIP for a given day is expected; record it and try the previous day.
                            Devmasters.Logging.Logger.Root.Info($"ZIP for {dt:yyyy-MM-dd} not downloaded: {e.Message}");
                        }
                    }
                }

                try
                {
                    if (!downloaded)
                    {
                        // Fail with a clear message instead of an opaque archive error on the empty temp file.
                        throw new InvalidOperationException("No dostupnost-kapacit ZIP could be downloaded for the last 7 days.");
                    }

                    Devmasters.Logging.Logger.Root.Info("Getting Excel from ZIP");
                    // Extract the xlsx from the ZIP.
                    // NOTE(review): a second .xlsx entry in the archive makes ExtractToFile throw,
                    // because fn already exists after the first one is extracted.
                    using (ZipArchive archive = ZipFile.OpenRead(fnTemp))
                    {
                        foreach (ZipArchiveEntry entry in archive.Entries)
                        {
                            if (entry.FullName.EndsWith(".xlsx", StringComparison.OrdinalIgnoreCase))
                            {
                                entry.ExtractToFile(fn);
                            }
                        }
                    }

                    ExcelPackage.LicenseContext = LicenseContext.NonCommercial;
                    using (var p = new ExcelPackage(new System.IO.FileInfo(fn)))
                    {
                        ExcelWorksheet ws = p.Workbook.Worksheets[1];

                        // Scan column 1 for the "Stav k datu:" header; the region table follows it.
                        for (int row = 1; row < 100000; row++)
                        {
                            Console.Write(".");
                            var txt = ws.Cells[row, 1].GetValue<string>();
                            if (txt == null || !txt.StartsWith("Stav k datu:", StringComparison.Ordinal))
                            {
                                continue;
                            }

                            string head = txt.Replace("Stav k datu: ", "");
                            string sdate = Devmasters.RegexUtil.GetRegexGroupValue(head, @" \s* (?<dt>\d{1,2}\s*\.\s*\d{1,2}\s*\.\s*\d{4} )", "dt");
                            // .Value throws when the date cannot be parsed; that is reported by the outer catch.
                            DateTime reportDate = Devmasters.DT.Util.ToDate(sdate).Value;
                            string id = "id_" + reportDate.ToString("yyyy-MM-dd");

                            // Reuse the stored record for this date when there is one.
                            NemocniceData nd = null;
                            try
                            {
                                nd = ds.GetItem(id);
                            }
                            catch (Exception)
                            {
                                // Best effort: a failed lookup is treated the same as "no record yet".
                            }
                            if (nd == null)
                            {
                                nd = new NemocniceData();
                                nd.regions = new List<NemocniceData.Region>();
                            }
                            nd.lastUpdated = reportDate;
                            nd.id = id;

                            Console.WriteLine(".");
                            Devmasters.Logging.Logger.Root.Info(nd.lastUpdated.ToString());

                            // The first region row is 4 rows below the date header.
                            row = row + 4;

                            List<NemocniceData.Region> finalRegs = new List<NemocniceData.Region>();

                            // 14 consecutive rows, one per region.
                            for (int regs = 0; regs < 14; regs++)
                            {
                                int dataRow = row + regs;
                                string region = ws.Cells[dataRow, 1].GetValue<string>();
                                NemocniceData.Region r = nd.regions.FirstOrDefault(m => m.region == region);
                                if (r == null)
                                {
                                    r = new NemocniceData.Region();
                                }
                                r.lastModified = nd.lastUpdated;
                                r.region = region;

                                r.UPV_celkem = ws.Cells[dataRow, 4].GetValue<int>();
                                r.UPV_volna = ws.Cells[dataRow, 5].GetValue<int>();

                                // NOTE(review): ECMO_celkem reads column 5, the same column as UPV_volna
                                // above - verify the column mapping against the sheet layout.
                                r.ECMO_celkem = ws.Cells[dataRow, 5].GetValue<int>();
                                r.ECMO_volna = ws.Cells[dataRow, 6].GetValue<int>();

                                r.CRRT_celkem = ws.Cells[dataRow, 8].GetValue<int>();
                                r.CRRT_volna = ws.Cells[dataRow, 9].GetValue<int>();

                                r.IHD_celkem = ws.Cells[dataRow, 11].GetValue<int>();
                                r.IHD_volna = ws.Cells[dataRow, 12].GetValue<int>();

                                r.AROJIP_luzka_celkem = ws.Cells[dataRow, 14].GetValue<int>();
                                r.AROJIP_luzka_covid = ws.Cells[dataRow, 15].GetValue<int>();
                                r.AROJIP_luzka_necovid = ws.Cells[dataRow, 16].GetValue<int>();

                                r.Standard_luzka_s_kyslikem_celkem = ws.Cells[dataRow, 18].GetValue<int>();
                                r.Standard_luzka_s_kyslikem_covid = ws.Cells[dataRow, 19].GetValue<int>();
                                r.Standard_luzka_s_kyslikem_necovid = ws.Cells[dataRow, 20].GetValue<int>();

                                r.Lekari_AROJIP_celkem = ws.Cells[dataRow, 22].GetValue<int>();
                                r.Lekari_AROJIP_dostupni = ws.Cells[dataRow, 23].GetValue<int>();

                                r.Sestry_AROJIP_celkem = ws.Cells[dataRow, 25].GetValue<int>();
                                r.Sestry_AROJIP_dostupni = ws.Cells[dataRow, 26].GetValue<int>();

                                r.Ventilatory_prenosne_celkem = ws.Cells[dataRow, 28].GetValue<int>();
                                r.Ventilatory_operacnisal_celkem = ws.Cells[dataRow, 29].GetValue<int>();

                                r.Standard_luzka_celkem = ws.Cells[dataRow, 30].GetValue<int>();
                                r.Standard_luzka_s_monitor_celkem = ws.Cells[dataRow, 31].GetValue<int>();

                                finalRegs.Add(r);
                            }
                            nd.regions = finalRegs;
                            // Skip past the region table before the scan for the next header continues.
                            row = row + 16;

                            Devmasters.Logging.Logger.Root.Info("Saving");

                            ds.AddOrUpdateItem(nd, HlidacStatu.Api.V2.Dataset.Typed.ItemInsertMode.rewrite);
                        }
                    }
                }
                catch (Exception e)
                {
                    Devmasters.Logging.Logger.Root.Error("Processing ZIP XLS error", e);
                    SendMail("*****@*****.**", "Selhalo zpracovani dat z UZIS", e.ToString(), "");
                }
            }
            finally
            {
                // GetTempFileName creates the file on disk; remove it so repeated runs do not pile up temp files.
                try
                {
                    System.IO.File.Delete(fnTemp);
                }
                catch (Exception e)
                {
                    Devmasters.Logging.Logger.Root.Info($"Cannot delete temp file {fnTemp}: {e.Message}");
                }
            }
        }