/// <summary>
/// Collects the agenda identifiers returned by <c>AgendaList</c> for every year from
/// <paramref name="fromYear"/> up to the current year, parses each agenda with
/// <see cref="ParseAgenda"/> and writes new or changed items into the dataset.
/// </summary>
/// <remarks>
/// An item is written when no stored item with the same Id could be loaded, or when the number of
/// entries in <c>dokumenty</c>, <c>veklep</c>, <c>meni</c>, <c>zmeneno</c>, <c>rusi</c> or
/// <c>zruseno</c> differs between the stored and the freshly parsed item.
/// Progress is reported on the console.
/// </remarks>
/// <param name="dsc">Connector used to read existing items and to store items.</param>
/// <param name="fromYear">
/// First year to process; 2007 is used when <c>null</c>. A year later than the current one
/// results in no work being done.
/// </param>
public static void DownloadAllData(DatasetConnector dsc, int? fromYear = null)
{
    int ifromYear = fromYear ?? 2007;

    // Enumerable.Range throws ArgumentOutOfRangeException for a negative count,
    // so a start year in the future is clamped to an empty range.
    int yearCount = Math.Max(0, DateTime.Now.Year - ifromYear + 1);
    var years = Enumerable.Range(ifromYear, yearCount);

    int totalSave = 0;
    List<string> agendy = new List<string>();

    // Step 1: gather the agenda identifiers, newest year first.
    Devmasters.Batch.Manager.DoActionForAll(years.Reverse(),
        (y) =>
        {
            agendy.AddRange(AgendaList(y).OrderBy(o => o));
            return new Devmasters.Batch.ActionOutputData();
        },
        Devmasters.Batch.Manager.DefaultOutputWriter,
        new Devmasters.Batch.ActionProgressWriter(0.1f).Write,
        false,
        maxDegreeOfParallelism: 5
    );

    // Step 2: parse every agenda and store the items that are new or changed.
    Devmasters.Batch.Manager.DoActionForAll(agendy,
        (ag) =>
        {
            Console.WriteLine("Getting " + ag + " ");
            var jednaniArr = ParseAgenda(ag);
            Console.WriteLine("SAVING " + ag + " ");

            foreach (var j in jednaniArr.OrderBy(m => m.Id))
            {
                jednani exists = null;
                try
                {
                    Console.Write(j.Id + " L");
                    exists = dsc.GetItemFromDataset<jednani>(datasetname, j.Id).Result;
                }
                catch (Exception)
                {
                    // Best effort: a failed lookup is handled like a missing item,
                    // which means the parsed item is written below.
                }

                if (exists == null || HasDifferentCollectionCounts(exists, j))
                {
                    // Reading .Result blocks until the write has finished; the returned id is not needed.
                    string savedId = dsc.AddItemToDataset(datasetname, j, DatasetConnector.AddItemMode.Rewrite).Result;
                    totalSave++;
                    Console.Write("S");
                }

                Console.WriteLine(".");
            }

            return new Devmasters.Batch.ActionOutputData();
        },
        null,
        null,
        false,
        maxDegreeOfParallelism: 5,
        prefix: "AGENDY: "
    );

    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine("TOtal saved: " + totalSave);
}

/// <summary>
/// Returns true when any of the compared collections of the stored and the parsed item
/// has a different number of entries; a missing collection counts as zero entries.
/// </summary>
private static bool HasDifferentCollectionCounts(jednani stored, jednani parsed)
{
    return (stored.dokumenty?.Count() ?? 0) != (parsed.dokumenty?.Count() ?? 0)
        || (stored.veklep?.Count() ?? 0) != (parsed.veklep?.Count() ?? 0)
        || (stored.meni?.Count() ?? 0) != (parsed.meni?.Count() ?? 0)
        || (stored.zmeneno?.Count() ?? 0) != (parsed.zmeneno?.Count() ?? 0)
        || (stored.rusi?.Count() ?? 0) != (parsed.rusi?.Count() ?? 0)
        || (stored.zruseno?.Count() ?? 0) != (parsed.zruseno?.Count() ?? 0);
}
/// <summary>
/// Downloads the agenda page for one date and turns the rows of the agenda table into
/// <c>jednani</c> items. Rows whose second cell mentions "Usnesení č." are resolved through
/// <see cref="ParseUsneseni"/>; rows mentioning "Příloha č." are skipped.
/// </summary>
/// <param name="sdatum">
/// Agenda date in <c>yyyy-MM-dd</c> format. It is inserted into <c>agendaUrl</c> and also
/// parsed into the date assigned to every returned item.
/// </param>
/// <returns>
/// The parsed items. When several items end up with the same Id, the later ones get
/// "-" plus their list index appended so the Ids stay distinct.
/// </returns>
/// <exception cref="FormatException"><paramref name="sdatum"/> is not a <c>yyyy-MM-dd</c> date.</exception>
public static IEnumerable<jednani> ParseAgenda(string sdatum)
{
    string html = "";
    using (var net = new Devmasters.Net.HttpClient.URLContent(string.Format(agendaUrl, sdatum)))
    {
        html = net.GetContent().Text;
    }

    DateTime datum = DateTime.ParseExact(
        sdatum,
        "yyyy-MM-dd",
        System.Globalization.CultureInfo.GetCultureInfo("en-US"),
        System.Globalization.DateTimeStyles.AssumeLocal);

    List<string> usneseni = new List<string>();
    List<jednani> js = new List<jednani>();
    object lockObj = new object();

    var xp = new XPath(html);
    var rows = xp.GetNodes("//table[@class='lfr-table djv-agenda-table']//tr");
    int ruzneCount = 0;

    // GetNodes results are null-checked elsewhere in this file, so a page without
    // the agenda table is treated as an agenda without rows.
    if (rows != null)
    {
        foreach (var r in rows)
        {
            // Rows whose first cell carries a colspan attribute are skipped.
            if (XPath.Tools.GetNodeAttributeValue(r, "td[1]", "colspan") != null)
            {
                continue;
            }

            var bod = XPath.Tools.GetNodeText(r, "td[2]") ?? "";
            var obsah = XPath.Tools.GetNode(r, "td[5]");

            // bod is never null here (see "?? \"\"" above), so the fallback to the first
            // cell has to test for an empty string to be reachable at all.
            if (string.IsNullOrEmpty(bod) && obsah == null)
            {
                obsah = XPath.Tools.GetNode(r, "td[1]");
            }

            if (bod.Contains("Usnesení č."))
            {
                // Resolved in a second pass below.
                usneseni.Add(bod);
                continue;
            }
            if (bod.Contains("Příloha č."))
            {
                continue;
            }

            jednani j = new jednani();
            if (string.IsNullOrEmpty(bod))
            {
                // Rows without a label get a running "Různé č.N" label.
                ruzneCount++;
                bod = "Různé č." + ruzneCount;
            }
            j.bod = bod.Trim();
            j.datum = datum;
            j.vec = obsah?.InnerText;

            var docs = XPath.Tools.GetNodes(r, "td[3]/a");
            if (docs != null)
            {
                j.dokumenty = docs
                    .Select(m => new jednani.dokument()
                    {
                        DocumentUrl = NormalizeUrl(m.GetAttributeValue("href", "")),
                        stahnuto = DateTime.Now,
                        jmeno = "Znění",
                    })
                    .ToArray();
            }

            // Evaluate the links of the content cell once instead of three times.
            var links = obsah != null ? XPath.Tools.GetNodes(obsah, ".//a") : null;
            if (links != null)
            {
                j.veklep = links
                    .Select(m => m.GetAttributeValue("href", null))
                    .Where(n => n?.StartsWith("/veklep") == true)
                    .Select(m => m.Replace("/veklep-detail?pid=", ""))
                    .ToArray();

                var linkedDocs = links
                    .Where(m =>
                    {
                        string href = m.GetAttributeValue("href", null);
                        return href != null && !href.StartsWith("/veklep");
                    })
                    .Select(m => new jednani.dokument()
                    {
                        DocumentUrl = NormalizeUrl(m.GetAttributeValue("href", null)),
                        jmeno = m.InnerText
                    });

                // Append to the documents found in the third cell; assigning here
                // unconditionally used to discard them.
                j.dokumenty = j.dokumenty == null
                    ? linkedDocs.ToArray()
                    : j.dokumenty.Concat(linkedDocs).ToArray();
            }

            j.SetId();
            js.Add(j);
        }
    }

    // Second pass: resolve the collected "Usnesení č." rows.
    Devmasters.Batch.Manager.DoActionForAll(usneseni,
        (u) =>
        {
            // Download and parse outside the lock so that the batch is not serialised;
            // only the insert into the shared list needs protection.
            // NOTE(review): assumes the helpers ParseUsneseni calls are safe to run concurrently - confirm.
            var parsed = ParseUsneseni(datum, System.Text.RegularExpressions.Regex.Replace(u, "\\D", ""));
            lock (lockObj)
            {
                js.Add(parsed);
            }
            return new Devmasters.Batch.ActionOutputData();
        },
        null,
        null,
        true,
        maxDegreeOfParallelism: 5,
        prefix: "DAT " + sdatum + ":"
    );

    // Make duplicate Ids distinct by suffixing the later item with its list index.
    var dids = js.Select(m => m.Id).Distinct().ToArray();
    if (dids.Length != js.Count)
    {
        for (int i = 1; i < js.Count; i++)
        {
            for (int k = 0; k < i; k++)
            {
                if (js[i].Id == js[k].Id)
                {
                    js[i].Id = js[i].Id + "-" + i;
                }
            }
        }
    }

    return js;
}
/// <summary>
/// Downloads the detail page of one resolution and converts it into a <c>jednani</c> item:
/// title, reference number, PID, linked documents and the related resolutions listed in the
/// <c>resolutionDerogation</c> section (repeals / amends / repealed / amended).
/// </summary>
/// <param name="den">Date assigned to the item; its year is also used to build the page URL.</param>
/// <param name="cislo">Resolution number, inserted into <c>usneseniUrl</c> together with the year.</param>
/// <returns>The populated item with its Id set through <c>SetId</c>.</returns>
public static jednani ParseUsneseni(DateTime den, string cislo)
{
    int rok = den.Year;
    string html = "";
    using (var net = new Devmasters.Net.HttpClient.URLContent(string.Format(usneseniUrl, rok, cislo)))
    {
        html = net.GetContent().Text;
    }

    var xp = new XPath(html);
    jednani j = new jednani();
    j.datum = den;
    j.usneseni = $"{cislo}/{den.Year}";
    j.bod = "Usnesení č. " + cislo;
    j.vec = xp.GetNodeText("(//table[@class='lfr-table material-table'])//tr//th[contains(text(),'Název')]/following-sibling::td");
    j.cisloJednaci = xp.GetNodeText("(//table[@class='lfr-table material-table'])//tr//th[contains(text(),'Čj')]/following-sibling::td");
    j.veklep = new string[]
    {
        xp.GetNodeText("(//table[@class='lfr-table material-table'])//tr//th[contains(text(),'PID')]/following-sibling::td")
    };

    // Documents linked from the second cell of each row of the first material table;
    // the text of the first cell becomes the document name.
    List<jednani.dokument> jdocs = new List<jednani.dokument>();
    bool anyLinks = false;
    var xprilohyRows = xp.GetNodes("(//table[@class='lfr-table material-table'])[1]/tr");
    if (xprilohyRows != null)
    {
        foreach (var xpri in xprilohyRows)
        {
            var jdoc = XPath.Tools.GetNodes(xpri, "td[2]/a");
            if (jdoc == null)
            {
                continue;
            }

            anyLinks = true;
            jdocs.AddRange(jdoc.Select(m => new jednani.dokument()
            {
                DocumentUrl = NormalizeUrl(m.GetAttributeValue("href", "")),
                stahnuto = DateTime.Now,
                jmeno = XPath.Tools.GetNodeText(xpri, "td[1]")
            }));
        }
    }
    if (anyLinks)
    {
        // Assign once after the loop instead of rebuilding the array for every row.
        j.dokumenty = jdocs.ToArray();
    }

    if (xp.GetNode("//div[@class='resolutionDerogation']") != null)
    {
        var souvisTypes = new string[] { "repeals", "amends", "repealed", "amended" };
        foreach (var styp in souvisTypes)
        {
            string section = $"//div[@class='resolutionDerogation']/div[@class='{styp}']";
            List<jednani.souvis> jsouvis = new List<jednani.souvis>();

            if (xp.GetNode(section + "/p") != null)
            {
                var txt = xp.GetNodeText(section + "/p")?
                    .Trim()?
                    .Replace(" následující", "");

                // A section with a heading but without entries yields no nodes; treat that as empty.
                var entries = xp.GetNodes(section + "/dl/dd");
                if (entries != null)
                {
                    foreach (var item in entries)
                    {
                        // An entry without a link keeps its text but has no original URL.
                        var anchor = item.SelectSingleNode(".//a");
                        var jss = new jednani.souvis()
                        {
                            zmena = txt,
                            usneseni = item.InnerText?.Trim(),
                            usneseniOrigUrl = anchor == null
                                ? null
                                : NormalizeUrl(anchor.GetAttributeValue("href", null)),
                        };

                        var srok = GetRegexGroupValue(jss.usneseni, regexRokFromTxt, "rok");
                        // Shorten a four-digit 20xx year to its last two digits. Removing every "20"
                        // (the previous Replace call) turned "2020" into an empty string, so
                        // usneseniCislo was never set for that year.
                        // NOTE(review): assumes the "rok" group captures the year as digits only - confirm.
                        if (srok != null && srok.Length == 4 && srok.StartsWith("20", StringComparison.Ordinal))
                        {
                            srok = srok.Substring(2);
                        }
                        var scislo = GetRegexGroupValue(jss.usneseni, regexRokFromTxt, "cislo");
                        if (!string.IsNullOrEmpty(srok) && !string.IsNullOrEmpty(scislo))
                        {
                            jss.usneseniCislo = $"{scislo}/{srok}";
                        }

                        jsouvis.Add(jss);
                    }
                }
            }

            if (jsouvis.Count > 0)
            {
                switch (styp)
                {
                    case "repeals":
                        j.rusi = jsouvis.ToArray();
                        break;
                    case "amends":
                        j.meni = jsouvis.ToArray();
                        break;
                    case "repealed":
                        j.zruseno = jsouvis.ToArray();
                        break;
                    case "amended":
                        j.zmeneno = jsouvis.ToArray();
                        break;
                    default:
                        break;
                }
            }
        }
    }

    j.SetId();
    return j;
}