/// <summary>
/// Collects the meeting-minutes PDF links whose link text contains the date of the given meeting
/// in the form "(d. M. yyyy)".
/// </summary>
/// <param name="j">Meeting whose <c>DatumJednani</c> is searched for in the link texts.</param>
/// <returns>
/// One document (type "zápis") per matching link with a resolvable href; an empty array when
/// nothing matches.
/// </returns>
/// <remarks>
/// The listing HTML is downloaded on first use and kept in the static <c>hzapis</c> field,
/// so later calls reuse the same page.
/// </remarks>
private static Jednani.Dokument[] GetZapisy(Jednani j)
{
    if (hzapis == null)
    {
        using (Devmasters.Net.HttpClient.URLContent nzapis = new Devmasters.Net.HttpClient.URLContent(""))
        {
            hzapis = nzapis.GetContent().Text;
        }
    }

    List<Jednani.Dokument> docs = new List<Jednani.Dokument>();

    var dzapis = new Devmasters.XPath(hzapis);
    var zapisy = dzapis.GetNodes("//a[@class='pdf']");
    if (zapisy == null)
    {
        // Other callers in this file treat a null result as "no match"; do the same here.
        return docs.ToArray();
    }

    // Invariant culture keeps the Gregorian "d. M. yyyy" rendering independent of the machine locale.
    string datum = j.DatumJednani.ToString("d. M. yyyy", System.Globalization.CultureInfo.InvariantCulture);
    string hledanyText = $"({datum})";

    foreach (var z in zapisy)
    {
        if (!z.InnerText.Contains(hledanyText))
        {
            continue;
        }

        // Skip anchors without a usable href instead of failing on a null dereference.
        string href = z.Attributes["href"]?.Value;
        Uri? url = null;
        if (href == null || !Uri.TryCreate(new Uri(urlPrefix), href, out url) || url == null)
        {
            continue;
        }

        docs.Add(new Jednani.Dokument()
        {
            HsProcessType = "document",
            DocumentUrl = url.AbsoluteUri,
            Nazev = z.InnerText.Trim(),
            Typ = "zápis"
        });
    }

    return docs.ToArray();
}
/// <summary>
/// Collects the supporting-material links listed under the heading whose text contains the
/// date of the given meeting ("d. M. yyyy").
/// </summary>
/// <param name="j">Meeting whose <c>DatumJednani</c> is searched for in the section headings.</param>
/// <returns>
/// One document (type "material") per paragraph link found after the matching heading;
/// an empty array when nothing matches.
/// </returns>
/// <remarks>
/// The listing HTML is downloaded on first use and kept in the static <c>mzapis</c> field.
/// </remarks>
private static Jednani.Dokument[] GetMaterialy(Jednani j)
{
    if (mzapis == null)
    {
        using (Devmasters.Net.HttpClient.URLContent nzapis = new Devmasters.Net.HttpClient.URLContent(""))
        {
            mzapis = nzapis.GetContent().Text;
        }
    }

    List<Jednani.Dokument> docs = new List<Jednani.Dokument>();

    var dzapis = new Devmasters.XPath(mzapis);
    var casti = dzapis.GetNodes("//div[contains(@class,'contentArticle')]/h4[@class='odsazeni']");
    if (casti == null)
    {
        // Other callers in this file treat a null result as "no match"; do the same here.
        return docs.ToArray();
    }

    // Invariant culture keeps the Gregorian "d. M. yyyy" rendering independent of the machine locale.
    string datum = j.DatumJednani.ToString("d. M. yyyy", System.Globalization.CultureInfo.InvariantCulture);

    foreach (var z in casti)
    {
        if (!z.InnerText.Contains(datum))
        {
            continue;
        }

        var pars = Devmasters.XPath.Tools.GetNodes(z, "following::*");
        if (pars == null)
        {
            continue;
        }

        // Walk forward through the document until the next section heading.
        foreach (var par in pars)
        {
            if (par.Name == "p")
            {
                var link = par.ChildNodes.FirstOrDefault(m => m.Name == "a");
                string href = link?.Attributes["href"]?.Value;
                Uri? url = null;
                // Only links with a resolvable href become documents.
                if (href != null && Uri.TryCreate(new Uri(urlPrefix), href, out url) && url != null)
                {
                    docs.Add(new Jednani.Dokument()
                    {
                        HsProcessType = "document",
                        DocumentUrl = url.AbsoluteUri,
                        Typ = "material",
                        Nazev = link.InnerText
                    });
                }
            }

            // NOTE(review): the check looks only at the FIRST attribute of the h4 — presumably
            // the class attribute always comes first in the source markup; confirm.
            if (par.Name == "h4" && par.Attributes.FirstOrDefault()?.Value == "odsazeni")
            {
                // Next heading reached: the section is finished, and so is the whole search.
                return docs.ToArray();
            }
        }
    }

    return docs.ToArray();
}
/// <summary>
/// Downloads the account page at <c>Ucet.Url</c> and extracts the monthly statement links
/// (anchors whose text contains "Transakce").
/// </summary>
/// <returns>
/// Map from statement URL to the first day of the month named in the link text;
/// empty when the page contains no such links.
/// </returns>
/// <exception cref="FormatException">
/// Thrown when a link text does not reduce to "MM-yyyy" after removing the "Transakce " prefix.
/// </exception>
private Dictionary<string, DateTime> GetBankStatementLinks()
{
    var links = new Dictionary<string, DateTime>();

    using (var url = new Devmasters.Net.HttpClient.URLContent(Ucet.Url))
    {
        var doc = new Devmasters.XPath(url.GetContent().Text);
        var anchors = doc.GetNodes(
            "//div[@class='npw-transaction-group']/ul[@class='npw-documents']//a[text()[contains(.,'Transakce')]]");
        if (anchors == null)
        {
            return links;
        }

        foreach (var anchor in anchors)
        {
            // An anchor without href cannot be a statement link; skip it rather than crash.
            string href = anchor.Attributes["href"]?.Value;
            if (href == null)
            {
                continue;
            }

            string statementUrl = "" + href;
            if (links.ContainsKey(statementUrl))
            {
                // The same link rendered twice on the page: keep the first occurrence
                // (ToDictionary would have thrown on the duplicate key).
                continue;
            }

            // "Transakce 05/2020" -> "01-05-2020"
            string month = "01-" + anchor.InnerText.Replace("Transakce ", "").Replace("/", "-").Trim();
            links.Add(statementUrl, DateTime.ParseExact(month, "dd-MM-yyyy", Consts.czCulture));
        }
    }

    return links;
}
/// <summary>
/// Entry point: reads the command-line switches, opens (or creates) the dataset, walks the paged
/// meeting listing, and then parses and stores every selected meeting in parallel.
/// </summary>
/// <param name="arguments">
/// Command-line switches. "/mp3path" and "/apikey" are mandatory; "/utdl", "/rewrite",
/// "/daysback", "/ids", "/skips2t" and "/t" are optional.
/// </param>
static void Main(string[] arguments)
{
    string banner = $"Jednání-Rady-ČT - {System.Reflection.Assembly.GetEntryAssembly().GetName().Version}";
    Console.WriteLine(banner);
    Devmasters.Logging.Logger.Root.Info(banner);
    Devmasters.Logging.Logger.Root.Debug("Jednání Rady ČT starting with " + string.Join(',', arguments));

    var args = new Devmasters.Args(arguments, new string[] { "/mp3path", "/apikey" });
    if (args.MandatoryPresent() == false)
    {
        Help();
        // Without the mandatory switches nothing below can work, so stop here.
        return;
    }

    mp3path = args.Get("/mp3path", null);
    startPath = System.IO.Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location);

    // NOTE(review): the switch is spelled "/utdl" although it sets the youtube-dl path —
    // possibly a typo for "/ytdl"; left unchanged so existing invocations keep working.
    if (args.Exists("/utdl"))
    {
        YTDL = args["/utdl"];
    }
    else
    {
        YTDL = startPath + "\\youtube-dl.exe";
    }

    apiKey = args["/apikey"];
    rewrite = args.Exists("/rewrite");
    afterDay = DateTime.Now.Date.AddDays(-1 * args.GetNumber("/daysback", 10000).Value);
    if (args.Exists("/ids"))
    {
        ids = args.GetArray("/ids");
    }
    skips2t = args.Exists("/skips2t");
    int threads = args.GetNumber("/t") ?? 5;

    // Open the dataset; when the API rejects the request, register it instead.
    // Any other exception propagates unchanged.
    try
    {
        ds = HlidacStatu.Api.V2.Dataset.Typed.Dataset<Jednani>.OpenDataset(apiKey, DataSetId);
    }
    catch (ApiException)
    {
        ds = HlidacStatu.Api.V2.Dataset.Typed.Dataset<Jednani>.CreateDataset(apiKey, Registration());
    }

    string nextPages = "{0}";
    List<Jednani> jednani = new List<Jednani>();

    // Walk the listing page by page until a page yields no items.
    for (int page = 1; ; page++)
    {
        using (Devmasters.Net.HttpClient.URLContent net = new Devmasters.Net.HttpClient.URLContent(string.Format(nextPages, page)))
        {
            Console.WriteLine($"Page {page}");
            net.IgnoreHttpErrors = true;
            net.Tries = 5;
            net.TimeInMsBetweenTries = 2000;

            string html = "";
            try
            {
                Devmasters.Logging.Logger.Root.Debug($"downloading {net.Url} ");
                html = net.GetContent().Text;
            }
            catch (Exception e)
            {
                // A failed download leaves html empty; the page is then parsed as empty below.
                Devmasters.Logging.Logger.Root.Error($"{net.Url} failed", e);
            }

            Devmasters.XPath xp = new Devmasters.XPath(html);
            var links = xp.GetNodes("//li[contains(@class,'itemBlock')]");
            if (links == null || links.Count == 0)
            {
                break;
            }

            foreach (var link in links)
            {
                Jednani j = new Jednani();
                j.Odkaz = urlPrefix + Devmasters.XPath.Tools.GetNodeAttributeValue(link, "div/h3/a[@class='itemSetPaging']", "href");
                j.Titulek = Devmasters.XPath.Tools.GetNodeText(link, "div/h3/a[@class='itemSetPaging']").Trim();
                j.DatumJednani = Devmasters.DT.Util.ToDate(Devmasters.XPath.Tools.GetNodeText(link, "div/p").Trim()) ?? DateTime.MinValue;
                j.Id = Devmasters.RegexUtil.GetRegexGroupValue(j.Odkaz, "/ivysilani/10000000064-jednani-rady-ceske-televize/(?<id>\\d{2,})", "id");

                if (j.DatumJednani > afterDay && (ids == null || ids.Contains(j.Id)))
                {
                    jednani.Add(j);
                }
            }
        }
    }

    Devmasters.Logging.Logger.Root.Debug($"Starting {jednani.Count} items ");

    Devmasters.Batch.Manager.DoActionForAll<string>(jednani.Select(m => m.Id).Reverse(),
        id =>
        {
            // An item whose id could not be extracted from its link can be neither looked up
            // nor stored; skip it before touching the dataset API.
            if (string.IsNullOrEmpty(id))
            {
                return new Devmasters.Batch.ActionOutputData() { Log = id };
            }

            bool exists = ds.ItemExists(id);
            if (!exists || rewrite)
            {
                Devmasters.Logging.Logger.Root.Debug($"Start parsing {id} ");
                var fullJ = ParseJednani(jednani.First(m => m.Id == id));
                Devmasters.Logging.Logger.Root.Debug($"Saving {id} ");
                ds.AddOrUpdateItem(fullJ, HlidacStatu.Api.V2.Dataset.Typed.ItemInsertMode.rewrite);
            }
            else
            {
                // Already stored and not rewriting: only fill in a missing audio transcript.
                var fullJ = ds.GetItemSafe(id);
                // Null guard: presumably GetItemSafe returns null when the item cannot be loaded — confirm.
                if (fullJ != null && !(fullJ.PrepisAudia?.Count() > 0))
                {
                    Devmasters.Logging.Logger.Root.Debug($"Checking AUDIO text {id} ");
                    var aud = Audio(fullJ);
                    if (aud?.Count() > 0)
                    {
                        fullJ.PrepisAudia = aud;
                        ds.AddOrUpdateItem(fullJ, HlidacStatu.Api.V2.Dataset.Typed.ItemInsertMode.rewrite);
                    }
                }
            }

            return new Devmasters.Batch.ActionOutputData() { Log = id };
        }, true, maxDegreeOfParallelism: threads);
}
/// <summary>
/// Downloads a range of numbered decision pages, parses each one with XPath into a
/// <c>UOHSData</c> record, and stores it in the dataset.
/// </summary>
/// <param name="datasetId">Not used by this method; the records go to the <c>ds</c> dataset.</param>
/// <param name="startFrom">Number of the first page to download.</param>
/// <param name="count">How many consecutive page numbers to process.</param>
/// <remarks>
/// A failure on one page is logged and does not stop the remaining pages.
/// </remarks>
public static void ParsePages(string datasetId, int startFrom = 10000, int count = 600)
{
    Devmasters.Batch.Manager.DoActionForAll<int>(Enumerable.Range(startFrom, count),
        // Original author's note: runs in 2 threads — be considerate and do not use more.
        // (The thread count itself is not set in this call.)
        (i) =>
        {
            string url = "";
            try
            {
                // Every decision lives on its own page with a simple URL in which the page
                // number keeps growing (original note: about 15500 as of 1 Sep 2018).
                string html = "";
                url = $"{i}.html";

                // Download the HTML; dispose the client as soon as the download finishes.
                using (System.Net.WebClient wc = new System.Net.WebClient())
                {
                    wc.Encoding = System.Text.Encoding.UTF8;
                    html = wc.DownloadString(url);
                }

                // Devmasters.XPath wraps the HTML for XPath queries; all data below is read through it.
                Devmasters.XPath page = new Devmasters.XPath(html);

                // The page does not exist: skip it.
                if (page.GetNodeText("//head/title")?.Contains("stránka neexistuje") == true)
                {
                    return new Devmasters.Batch.ActionOutputData();
                }

                logger.Debug($"parsing {url}");

                // The record is filled in step by step.
                var item = new UOHSData();
                item.Url = url;
                item.Id = i.ToString();

                // All content sits inside this DIV, so the prefix is kept in one place.
                var root = "//div[@id='content']";

                // XPath of the value cell(s) in the detail-table row whose header contains the label.
                Func<string, string> detailCell = label =>
                    root + "//table[@id='resolution_detail']//tr//th[contains(text(),'" + label + "')]/parent::tr/td";

                item.Cj = page.GetNodeText(root + "//div/h1/strong[1]")?.Replace("Rozhodnutí: ", "");
                item.SpisovaZnacka = page.GetNodeText(root + "//div/h1/strong[2]")?.Replace("Rozhodnutí: ", "");
                item.SoudniRozhodnuti = page.GetNodeText(root + "//div//h1/following-sibling::h2[1]");
                item.Instance = page.GetNodeText(detailCell("Instance"));
                item.Vec = page.GetNodeText(detailCell("Věc"));

                var ucastniciNode = page.GetNodes(detailCell("Účastníci") + "/ol/li");
                List<UOHSData.Ucastnik> ucastnici = new List<UOHSData.Ucastnik>();
                if (ucastniciNode != null)
                {
                    foreach (var node in ucastniciNode)
                    {
                        // Convert HTML entities to plain characters.
                        var firmaJmeno = System.Net.WebUtility.HtmlDecode(node.InnerText);

                        // Look up the company ID (ICO) by name.
                        var ico = httpClient.GetAsync("" + System.Net.WebUtility.UrlEncode(firmaJmeno))
                            .Result.Content
                            .ReadAsStringAsync().Result;
                        try
                        {
                            var icoRes = Newtonsoft.Json.Linq.JObject.Parse(ico);
                            if (icoRes["ico"] == null)
                            {
                                ucastnici.Add(new UOHSData.Ucastnik() { Jmeno = firmaJmeno });
                            }
                            else
                            {
                                ucastnici.Add(new UOHSData.Ucastnik() { Jmeno = firmaJmeno, ICO = icoRes["ico"].Value<string>() });
                            }
                        }
                        catch (Exception)
                        {
                            // The lookup answer could not be read: keep the participant without an ICO.
                            ucastnici.Add(new UOHSData.Ucastnik() { Jmeno = firmaJmeno });
                        }
                    }
                }
                item.Ucastnici = ucastnici.ToArray();

                item.Typ_spravniho_rizeni = page.GetNodeText(detailCell("Typ správního řízení"));
                item.Typ_rozhodnuti = page.GetNodeText(detailCell("Typ rozhodnutí"));
                item.Rok = page.GetNodeText(detailCell("Rok"));
                item.PravniMoc = ToDateTimeFromCZ(page.GetNodeText(detailCell("Datum nabytí právní moci")));

                var souvis_urls = page.GetNodes(detailCell("Související rozhodnutí") + "/a");
                if (souvis_urls != null)
                {
                    item.SouvisejiciUrl = souvis_urls
                        .Select(m => m.Attributes["href"]?.Value)
                        .Where(m => m != null)
                        .Select(u => "" + u)
                        .ToArray();
                }

                item.Rozhodnuti = new UOHSData.Dokument();
                item.Rozhodnuti.Url = page.GetNode(detailCell("Dokumenty") + "/a")
                    ?.Attributes["href"]?.Value;
                if (!string.IsNullOrEmpty(item.Rozhodnuti.Url))
                {
                    // Prefix the document's own href. Concatenating the SouvisejiciUrl array here
                    // would store the array's type name instead of a URL.
                    item.Rozhodnuti.Url = "" + item.Rozhodnuti.Url;
                }
                item.Rozhodnuti.PlainText = page.GetNode("//div[@id='content']//div[@class='res_text']")?.InnerText ?? "";

                // Parsing finished; store the record in the dataset.
                logger.Debug($"adding item {item.Id} - {item.Url}");
                ds.AddOrUpdateItem(item, HlidacStatu.Api.V2.Dataset.Typed.ItemInsertMode.rewrite);
            }
            catch (Exception e)
            {
                logger.Error(url, e);
            }

            return new Devmasters.Batch.ActionOutputData();
        },
        outputWriter.OutputWriter,
        progressWriter.ProgressWriter,
        !System.Diagnostics.Debugger.IsAttached
    );
}
/// <summary>
/// Downloads one statement page and converts its transaction table rows into bank items.
/// </summary>
/// <param name="url">Address of the statement page to download.</param>
/// <returns>
/// The distinct items parsed from the page, or an empty list when the page has no
/// balance-summary rows.
/// </returns>
/// <exception cref="StatementTooLongException">
/// Thrown when the page carries the bank's notice that some transactions may not be shown
/// and the date range should be reduced.
/// </exception>
private IEnumerable<IBankovniPolozka> ParseStatement(string url)
{
    var items = new HashSet<IBankovniPolozka>();

    using (var net = new Devmasters.Net.HttpClient.URLContent(url))
    {
        net.IgnoreHttpErrors = true;
        var html = net.GetContent(Encoding.UTF8).Text;

        bool truncated = html.Contains("Některé pohyby nemusí být zobrazeny. Zmenšete datumový rozsah.");
        if (truncated)
        {
            throw new StatementTooLongException();
        }

        var doc = new Devmasters.XPath(html);

        // Balance summary: its absence is reported as a missing statement page.
        var summaryPath = "//div[contains(@class, 'pohybySum')]/table/tbody/tr";
        var summaryCount = doc.GetNodes(summaryPath)?.Count ?? 0;
        if (summaryCount == 0)
        {
            TULogger.Warning($"FIO: Account statement page was not found for account {Ucet.CisloUctu}. Account has been probably canceled. Url: {url}");
            return new List<IBankovniPolozka>();
        }

        var overview = new StatementOverview();
        overview.OpeningBalance = parseAmount(doc.GetNodeText(summaryPath + "/td[1]"));
        overview.FinalBalance = parseAmount(doc.GetNodeText(summaryPath + "/td[2]"));
        overview.CreditSum = parseAmount(doc.GetNodeText(summaryPath + "/td[3]"));
        overview.DebitSum = parseAmount(doc.GetNodeText(summaryPath + "/td[4]"));

        // Reads a cell as HTML, decodes entities, turns <br> into line breaks and normalizes the text.
        System.Func<string, string> cellAsBlockText = cellPath =>
            Devmasters.TextUtil.NormalizeToBlockText(
                System.Net.WebUtility.HtmlDecode(doc.GetNodeHtml(cellPath))
                    ?.Replace("<br>", " \n"));

        var rowsPath = "//table[@class='table' and starts-with(@id,'id')]/tbody/tr";
        var rowTotal = doc.GetNodes(rowsPath)?.Count ?? 0;

        // XPath positions are 1-based.
        for (var index = 1; index <= rowTotal; index++)
        {
            var rowPath = rowsPath + "[" + index + "]";

            var entry = new SimpleBankovniPolozka();
            entry.CisloUctu = Ucet.CisloUctu;
            entry.Datum = Devmasters.DT.Util.ToDateTime(doc.GetNodeText(rowPath + "/td[1]"), "dd.MM.yyyy").Value;
            entry.Castka = parseAmount(System.Net.WebUtility.HtmlDecode(doc.GetNodeText(rowPath + "/td[2]")));
            entry.PopisTransakce = System.Net.WebUtility.HtmlDecode(doc.GetNodeText(rowPath + "/td[3]"));
            entry.NazevProtiuctu = System.Net.WebUtility.HtmlDecode(doc.GetNodeText(rowPath + "/td[4]"));
            entry.ZpravaProPrijemce = cellAsBlockText(rowPath + "/td[5]");

            // The note column is appended to the message unless both hold the same text.
            var note = cellAsBlockText(rowPath + "/td[9]");
            if (note != entry.ZpravaProPrijemce)
            {
                entry.ZpravaProPrijemce = entry.ZpravaProPrijemce + " " + note;
            }

            entry.KS = doc.GetNodeText(rowPath + "/td[6]");
            entry.VS = doc.GetNodeText(rowPath + "/td[7]");
            entry.SS = doc.GetNodeText(rowPath + "/td[8]");
            entry.ZdrojUrl = net.Url;
            entry.CisloProtiuctu = ""; // not available on this page

            // HashSet.Add leaves the set untouched when an equal item is already present.
            items.Add(entry);
        }

        ValidateParsedItems(items, overview);
    }

    return items;
}