Example #1
0
        private static Jednani.Dokument[] GetZapisy(Jednani j)
        {
            if (hzapis == null)
            {
                using (Devmasters.Net.HttpClient.URLContent nzapis = new Devmasters.Net.HttpClient.URLContent("https://www.ceskatelevize.cz/rada-ct/zapisy-z-jednani/"))
                {
                    hzapis = nzapis.GetContent().Text;
                }
            }
            var dzapis = new Devmasters.XPath(hzapis);

            var zapisy = dzapis.GetNodes("//a[@class='pdf']");

            List <Jednani.Dokument> docs = new List <Jednani.Dokument>();

            foreach (var z in zapisy)
            {
                if (z.InnerText.Contains($"({j.DatumJednani.ToString("d. M. yyyy")})"))
                {
                    Uri?url = null;
                    Uri.TryCreate(new Uri(urlPrefix), z.Attributes["href"].Value, out url);
                    docs.Add(new Jednani.Dokument()
                    {
                        HsProcessType = "document",
                        DocumentUrl   = url.AbsoluteUri,
                        Nazev         = z.InnerText.Trim(),
                        Typ           = "zápis"
                    });
                }
            }

            return(docs.ToArray());
        }
Example #2
0
        private static Jednani.Dokument[] GetMaterialy(Jednani j)
        {
            if (mzapis == null)
            {
                using (Devmasters.Net.HttpClient.URLContent nzapis = new Devmasters.Net.HttpClient.URLContent("https://www.ceskatelevize.cz/rada-ct/materialy-projednane-radou/"))
                {
                    mzapis = nzapis.GetContent().Text;
                }
            }
            var dzapis = new Devmasters.XPath(mzapis);

            var casti = dzapis.GetNodes("//div[contains(@class,'contentArticle')]/h4[@class='odsazeni']");

            List <Jednani.Dokument> docs = new List <Jednani.Dokument>();

            foreach (var z in casti)
            {
                if (z.InnerText.Contains($"{j.DatumJednani.ToString("d. M. yyyy")}"))
                {
                    var pars = Devmasters.XPath.Tools.GetNodes(z, "following::*");
                    //jdi az do dalsiho h4
                    foreach (var par in pars)
                    {
                        if (par.Name == "p")
                        {
                            var link = par.ChildNodes.Where(m => m.Name == "a").FirstOrDefault();
                            if (link != null)
                            {
                                Uri?url = null;
                                Uri.TryCreate(new Uri(urlPrefix), link.Attributes["href"].Value, out url);
                                docs.Add(new Jednani.Dokument()
                                {
                                    HsProcessType = "document",
                                    DocumentUrl   = url.AbsoluteUri,
                                    Typ           = "material",
                                    Nazev         = link.InnerText
                                });
                            }
                        }
                        if (par.Name == "h4" && par.Attributes.FirstOrDefault()?.Value == "odsazeni")
                        {
                            goto end; //dalsi h4, pryc
                        }
                    }
                }
            }
end:
            return(docs.ToArray());
        }
Example #3
0
        private Dictionary <string, DateTime> GetBankStatementLinks()
        {
            using (var url = new Devmasters.Net.HttpClient.URLContent(Ucet.Url))
            {
                var doc = new Devmasters.XPath(url.GetContent().Text);
                return(doc.GetNodes(
                           "//div[@class='npw-transaction-group']/ul[@class='npw-documents']//a[text()[contains(.,'Transakce')]]")
                       ?.Select(n => new
                {
                    url = "https://www.csob.cz" + n.Attributes["href"].Value,
                    month = "01-" + n.InnerText.Replace("Transakce ", "").Replace("/", "-").Trim()
                }
                                )
                       ?.ToDictionary(k => k.url,
                                      v => DateTime.ParseExact(v.month, "dd-MM-yyyy", Consts.czCulture))
                       ?? new Dictionary <string, DateTime>());

                ;
            }
        }
Example #4
0
        static void Main(string[] arguments)
        {
            Console.WriteLine($"Jednání-Rady-ČT - {System.Reflection.Assembly.GetEntryAssembly().GetName().Version}");
            Devmasters.Logging.Logger.Root.Info($"Jednání-Rady-ČT - {System.Reflection.Assembly.GetEntryAssembly().GetName().Version}");
            Devmasters.Logging.Logger.Root.Debug("Jednání Rady ČT starting with " + string.Join(',', arguments));


            var args = new Devmasters.Args(arguments, new string[] { "/mp3path", "/apikey" });

            if (args.MandatoryPresent() == false)
            {
                Help();
            }

            mp3path = args.Get("/mp3path", null);

            if (args.Exists("/utdl"))
            {
                YTDL = args["/utdl"];
            }
            else
            {
                YTDL = System.IO.Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location) + "\\youtube-dl.exe";
            }

            startPath = System.IO.Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location);

            apiKey   = args["/apikey"];
            rewrite  = args.Exists("/rewrite");
            afterDay = DateTime.Now.Date.AddDays(-1 * args.GetNumber("/daysback", 10000).Value);
            if (args.Exists("/ids"))
            {
                ids = args.GetArray("/ids");
            }
            skips2t = args.Exists("/skips2t");



            int threads = args.GetNumber("/t") ?? 5;

            try
            {
                ds = HlidacStatu.Api.V2.Dataset.Typed.Dataset <Jednani> .OpenDataset(apiKey, DataSetId);
            }
            catch (ApiException e)
            {
                ds = HlidacStatu.Api.V2.Dataset.Typed.Dataset <Jednani> .CreateDataset(apiKey, Registration());
            }
            catch (Exception e)
            {
                throw;
            }



            string nextPages = "https://www.ceskatelevize.cz/ivysilani/10000000064-jednani-rady-ceske-televize/dalsi-casti/{0}";

            int            page    = 0;
            bool           stop    = false;
            List <Jednani> jednani = new List <Jednani>();

            do
            {
                page++;
                using (Devmasters.Net.HttpClient.URLContent net = new Devmasters.Net.HttpClient.URLContent(string.Format(nextPages, page)))
                {
                    Console.WriteLine($"Page {page}");
                    net.IgnoreHttpErrors     = true;
                    net.Tries                = 5;
                    net.TimeInMsBetweenTries = 2000;
                    string html = "";
                    try
                    {
                        Devmasters.Logging.Logger.Root.Debug($"downloading {net.Url} ");
                        html = net.GetContent().Text;
                    }
                    catch (Exception e)
                    {
                        Devmasters.Logging.Logger.Root.Error($"{net.Url} failed", e);
                    }

                    Devmasters.XPath xp = new Devmasters.XPath(html);
                    var links           = xp.GetNodes("//li[contains(@class,'itemBlock')]");
                    if (links == null || links.Count == 0)
                    {
                        break;
                    }

                    foreach (var link in links)
                    {
                        Jednani j = new Jednani();
                        j.Odkaz        = urlPrefix + Devmasters.XPath.Tools.GetNodeAttributeValue(link, "div/h3/a[@class='itemSetPaging']", "href");
                        j.Titulek      = Devmasters.XPath.Tools.GetNodeText(link, "div/h3/a[@class='itemSetPaging']").Trim();
                        j.DatumJednani = Devmasters.DT.Util.ToDate(Devmasters.XPath.Tools.GetNodeText(link, "div/p").Trim()) ?? DateTime.MinValue;
                        j.Id           = Devmasters.RegexUtil.GetRegexGroupValue(j.Odkaz, "/ivysilani/10000000064-jednani-rady-ceske-televize/(?<id>\\d{2,})", "id");
                        if (j.DatumJednani > afterDay &&
                            (ids == null || ids.Contains(j.Id))
                            )
                        {
                            jednani.Add(j);
                        }
                    }
                }
            } while (stop == false);

            //
            Devmasters.Logging.Logger.Root.Debug($"Starting {jednani.Count} items ");

            Devmasters.Batch.Manager.DoActionForAll <string>(jednani.Select(m => m.Id).Reverse(),
                                                             id =>
            {
                bool exists = ds.ItemExists(id);
                if (!string.IsNullOrEmpty(id) &&
                    (!exists || rewrite)
                    )
                {
                    Devmasters.Logging.Logger.Root.Debug($"Start parsing {id} ");
                    var fullJ = ParseJednani(jednani.First(m => m.Id == id));

                    Devmasters.Logging.Logger.Root.Debug($"Saving {id} ");
                    ds.AddOrUpdateItem(fullJ, HlidacStatu.Api.V2.Dataset.Typed.ItemInsertMode.rewrite);
                }
                else if (exists)
                {
                    //check voice2text
                    var fullJ = ds.GetItemSafe(id);
                    if (!(fullJ.PrepisAudia?.Count() > 0))
                    {
                        Devmasters.Logging.Logger.Root.Debug($"Checking AUDIO text {id} ");
                        var aud = Audio(fullJ);
                        if (aud?.Count() > 0)
                        {
                            fullJ.PrepisAudia = aud;
                            ds.AddOrUpdateItem(fullJ, HlidacStatu.Api.V2.Dataset.Typed.ItemInsertMode.rewrite);
                        }
                    }
                }
                return(new Devmasters.Batch.ActionOutputData()
                {
                    Log = id
                });
            }, true, maxDegreeOfParallelism: threads);
        }
Example #5
0
        public static void ParsePages(string datasetId, int startFrom = 10000, int count = 600)
        {
            Devmasters.Batch.Manager.DoActionForAll <int>(Enumerable.Range(startFrom, count),
                                                          //jedeme v 2 threadech, bud ohleduplny a nedavej vice
                                                          (i) =>
            {
                string url = "";
                try
                {
                    //stahnutí HTML stránky s rozhodnutím UOHS.
                    //rozhodnutí jsou na samostatnych stránkach, s jednoduchym URL, kde cislo stranky s rozhodnutim postupně roste.
                    // k 1.9.2018 ma posledni rozhodnuti cislo asi 15500
                    string html = "";
                    url         = $"http://www.uohs.cz/cs/verejne-zakazky/sbirky-rozhodnuti/detail-{i}.html";

                    //stahnuti HTML
                    System.Net.WebClient wc = new System.Net.WebClient();
                    wc.Encoding             = System.Text.Encoding.UTF8;
                    html = wc.DownloadString(url);

                    //prevedeni do XHTML pomoci HTMLAgilityPacku.
                    //XPath je trida a sada funkci pro jednodusi XPath parsovani
                    Devmasters.XPath page = new Devmasters.XPath(html);

                    //vsechna ziskavana data jsou ziskana pomoci XPATH


                    //stranka neexistuje, tak ji preskocime
                    if (page.GetNodeText("//head/title")?.Contains("stránka neexistuje") == true)
                    {
                        return(new Devmasters.Batch.ActionOutputData());
                    }

                    logger.Debug($"parsing {url}");

                    //do item davam postupně získané údaje
                    var item = new UOHSData();
                    item.Url = url;
                    item.Id  = i.ToString();

                    //žádný obsah není mimo tento DIV, tak si ho sem dam, abych tento retezec nemusel porad opakovat
                    var root = "//div[@id='content']";

                    //parsování pomocí XPath.
                    item.Cj               = page.GetNodeText(root + "//div/h1/strong[1]")?.Replace("Rozhodnutí: ", "");
                    item.SpisovaZnacka    = page.GetNodeText(root + "//div/h1/strong[2]")?.Replace("Rozhodnutí: ", "");
                    item.SoudniRozhodnuti = page.GetNodeText(root + "//div//h1/following-sibling::h2[1]");


                    item.Instance = page.GetNodeText(root + "//table[@id='resolution_detail']//tr//th[contains(text(),'Instance')]/parent::tr/td");

                    item.Vec = page.GetNodeText(root + "//table[@id='resolution_detail']//tr//th[contains(text(),'Věc')]/parent::tr/td");

                    var ucastniciNode = page.GetNodes(root + "//table[@id='resolution_detail']//tr//th[contains(text(),'Účastníci')]/parent::tr/td/ol/li");
                    List <UOHSData.Ucastnik> ucastnici = new List <UOHSData.Ucastnik>();
                    if (ucastniciNode != null)
                    {
                        foreach (var node in ucastniciNode)
                        {
                            var firmaJmeno = System.Net.WebUtility.HtmlDecode(node.InnerText);     //konverze HTML entity to UTF-8;  &eacute; -> é


                            //dohledat ICO
                            var ico = httpClient.GetAsync("https://www.hlidacstatu.cz/api/v2/firmy/" + System.Net.WebUtility.UrlEncode(firmaJmeno))
                                      .Result.Content
                                      .ReadAsStringAsync().Result;
                            try
                            {
                                var icoRes = Newtonsoft.Json.Linq.JObject.Parse(ico);
                                if (icoRes["ico"] == null)
                                {
                                    ucastnici.Add(new UOHSData.Ucastnik()
                                    {
                                        Jmeno = firmaJmeno
                                    });
                                }
                                else
                                {
                                    ucastnici.Add(new UOHSData.Ucastnik()
                                    {
                                        Jmeno = firmaJmeno,
                                        ICO   = icoRes["ico"].Value <string>()
                                    });
                                }
                            }
                            catch (Exception)
                            {
                                ucastnici.Add(new UOHSData.Ucastnik()
                                {
                                    Jmeno = firmaJmeno
                                });
                            }
                        }
                    }
                    item.Ucastnici = ucastnici.ToArray();

                    item.Typ_spravniho_rizeni = page.GetNodeText(root + "//table[@id='resolution_detail']//tr//th[contains(text(),'Typ správního řízení')]/parent::tr/td");
                    item.Typ_rozhodnuti       = page.GetNodeText(root + "//table[@id='resolution_detail']//tr//th[contains(text(),'Typ rozhodnutí')]/parent::tr/td");
                    item.Rok = page.GetNodeText(root + "//table[@id='resolution_detail']//tr//th[contains(text(),'Rok')]/parent::tr/td");

                    item.PravniMoc = ToDateTimeFromCZ(
                        page.GetNodeText(root + "//table[@id='resolution_detail']//tr//th[contains(text(),'Datum nabytí právní moci')]/parent::tr/td")
                        );

                    var souvis_urls = page.GetNodes(root + "//table[@id='resolution_detail']//tr//th[contains(text(),'Související rozhodnutí')]/parent::tr/td/a");
                    if (souvis_urls != null)
                    {
                        item.SouvisejiciUrl = souvis_urls
                                              .Select(m => m.Attributes["href"]?.Value)
                                              .Where(m => m != null)
                                              .Select(u => "http://www.uohs.cz" + u)
                                              .ToArray();
                    }


                    item.Rozhodnuti = new UOHSData.Dokument();

                    var documents = page.GetNodes(root + "//table[@id='resolution_detail']//tr//th[contains(text(),'Dokumenty')]/parent::tr/td/a");


                    item.Rozhodnuti.Url = page.GetNode(root + "//table[@id='resolution_detail']//tr//th[contains(text(),'Dokumenty')]/parent::tr/td/a")
                                          ?.Attributes["href"]?.Value;
                    if (!string.IsNullOrEmpty(item.Rozhodnuti.Url))
                    {
                        item.Rozhodnuti.Url = "http://www.uohs.cz" + item.SouvisejiciUrl;
                    }

                    item.Rozhodnuti.PlainText = page.GetNode("//div[@id='content']//div[@class='res_text']")?.InnerText ?? "";


                    //parsovani hotovo, jdu ulozit zaznam do Datasetu
                    logger.Debug($"adding item {item.Id} - {item.Url}");

                    ds.AddOrUpdateItem(item, HlidacStatu.Api.V2.Dataset.Typed.ItemInsertMode.rewrite);
                }
                catch (Exception e)
                {
                    logger.Error(url, e);
                }

                return(new Devmasters.Batch.ActionOutputData());
            },
                                                          outputWriter.OutputWriter, progressWriter.ProgressWriter,
                                                          !System.Diagnostics.Debugger.IsAttached
                                                          );
        }
Example #6
0
        private IEnumerable <IBankovniPolozka> ParseStatement(string url)
        {
            var polozky = new HashSet <IBankovniPolozka>();

            using (var net = new Devmasters.Net.HttpClient.URLContent(url))
            {
                net.IgnoreHttpErrors = true;
                var content = net.GetContent(Encoding.UTF8).Text;
                if (content.Contains("Některé pohyby nemusí být zobrazeny. Zmenšete datumový rozsah."))
                {
                    throw new StatementTooLongException();
                }
                var doc = new Devmasters.XPath(content);

                var xoverviewRows = "//div[contains(@class, 'pohybySum')]/table/tbody/tr";
                var overviewRows  = doc.GetNodes(xoverviewRows)?.Count ?? 0;
                if (overviewRows == 0)
                {
                    TULogger.Warning($"FIO: Account statement page was not found for account {Ucet.CisloUctu}. Account has been probably canceled. Url: {url}");
                    return(new List <IBankovniPolozka>());
                }

                var overview = new StatementOverview
                {
                    OpeningBalance = parseAmount(doc.GetNodeText(xoverviewRows + "/td[1]")),
                    FinalBalance   = parseAmount(doc.GetNodeText(xoverviewRows + "/td[2]")),
                    CreditSum      = parseAmount(doc.GetNodeText(xoverviewRows + "/td[3]")),
                    DebitSum       = parseAmount(doc.GetNodeText(xoverviewRows + "/td[4]"))
                };

                var xrows = "//table[@class='table' and starts-with(@id,'id')]/tbody/tr";
                var rows  = doc.GetNodes(xrows)?.Count ?? 0;
                for (var row = 1; row <= rows; row++)
                {
                    var xroot = xrows + "[" + row + "]";

                    var p = new SimpleBankovniPolozka
                    {
                        CisloUctu         = Ucet.CisloUctu,
                        Datum             = Devmasters.DT.Util.ToDateTime(doc.GetNodeText(xroot + "/td[1]"), "dd.MM.yyyy").Value,
                        Castka            = parseAmount(System.Net.WebUtility.HtmlDecode(doc.GetNodeText(xroot + "/td[2]"))),
                        PopisTransakce    = System.Net.WebUtility.HtmlDecode(doc.GetNodeText(xroot + "/td[3]")),
                        NazevProtiuctu    = System.Net.WebUtility.HtmlDecode(doc.GetNodeText(xroot + "/td[4]")),
                        ZpravaProPrijemce = Devmasters.TextUtil.NormalizeToBlockText(
                            System.Net.WebUtility.HtmlDecode(doc.GetNodeHtml(xroot + "/td[5]"))
                            ?.Replace("<br>", " \n")
                            )
                    };

                    var poznamka = Devmasters.TextUtil.NormalizeToBlockText(
                        System.Net.WebUtility.HtmlDecode(doc.GetNodeHtml(xroot + "/td[9]"))
                        ?.Replace("<br>", " \n")
                        );

                    if (poznamka != p.ZpravaProPrijemce)
                    {
                        p.ZpravaProPrijemce += " " + poznamka;
                    }

                    p.KS       = doc.GetNodeText(xroot + "/td[6]");
                    p.VS       = doc.GetNodeText(xroot + "/td[7]");
                    p.SS       = doc.GetNodeText(xroot + "/td[8]");
                    p.ZdrojUrl = net.Url;


                    p.CisloProtiuctu = ""; //neni k dispozici

                    if (!polozky.Contains(p))
                    {
                        polozky.Add(p);
                    }
                }

                ValidateParsedItems(polozky, overview);
            }

            return(polozky);
        }