// URL template for a committee's resolutions listing.
// Original note: {0} = committee id + 5, {1} = page.
static string _vyborUsneseni = "https://www.psp.cz/sqw/hp.sqw?k={0}&kk=5&td=1&n={1}";

/// <summary>
/// Downloads the meetings of one committee, enriches them and stores them in the dataset.
/// </summary>
/// <remarks>
/// Steps, as implemented below:
/// 1. Load all resolutions of the committee via <c>VsechnaUsneseniVyboru</c>.
/// 2. Fetch the page built from <c>_vyborHP</c> and <c>vyborId + 2</c> and take the links found in table
///    rows of the node that follows the first <c>h2</c> inside <c>#main-content</c>.
/// 3. If the first link's <c>href</c> starts with <c>hp.sqw</c>, read the meeting number from its
///    <c>cu</c> query parameter and turn the link into a URL template with a <c>{0}</c> placeholder.
///    Otherwise only the number is parsed from the link text and no meeting is processed.
/// 4. For every meeting number from the latest down to 1: parse the meeting, attach resolutions with the
///    same date, merge with the stored item, OCR documents that have no plain text yet, join the texts
///    of the minutes documents into <c>zapisJednani</c> and save the item when something changed.
/// </remarks>
/// <param name="dsc">Dataset connector used for reading and writing meeting items.</param>
/// <param name="vyborId">Committee identifier.</param>
/// <param name="rewrite">Not read anywhere in this method; kept for caller compatibility.</param>
public static void Vybor(DatasetConnector dsc, int vyborId, bool rewrite = false)
{
    List<_usneseni> vsechnausneseni = VsechnaUsneseniVyboru(vyborId);
    Console.WriteLine($"Vybor {vyborId}");

    int lastJednani = 0;
    var xp = GetPage(string.Format(_vyborHP, vyborId + 2));
    // NOTE(review): this chain throws when the page has no matching <h2> or the heading has no
    // following sibling; GetNodes is not visible here, so no guard was added.
    var xpozvanky = xp.GetNodes("//div[@id='main-content']//h2")[0].NextSibling.SelectNodes(".//tr//a");

    string jednaniUrlTemplate = "";
    bool komplexni = false;
    if (xpozvanky != null && xpozvanky.Count > 0)
    {
        var link = xpozvanky[0].GetAttributeValue("href", "");
        if (link.StartsWith("hp.sqw", StringComparison.Ordinal))
        {
            komplexni = true;
            var tmp = GetRegexGroupValue(link, @"&cu=(?<cislo>\d*)", "cislo");
            int.TryParse(tmp, out lastJednani);
            if (lastJednani > 0)
            {
                // Replace the token exactly as it appears in the link (the captured text), not the
                // re-formatted integer: a zero-padded value such as "&cu=012" would otherwise not be
                // found and the template would end up without the {0} placeholder.
                jednaniUrlTemplate = _vyborJednaniHProot + link.Replace("&cu=" + tmp, "&cu={0}");
            }
        }
        else
        {
            komplexni = false;
            var tmp = GetRegexGroupValue(
                System.Net.WebUtility.HtmlDecode(xpozvanky[0].InnerText),
                @"č\. \s* (?<cislo>\d*)",
                "cislo");
            int.TryParse(tmp, out lastJednani);
        }
    }

    if (komplexni && lastJednani > 0 && !string.IsNullOrEmpty(jednaniUrlTemplate))
    {
        // Parse every meeting page, newest number first (the batch runner decides the real order).
        Devmasters.Batch.Manager.DoActionForAll<int>(
            Enumerable.Range(1, lastJednani).Reverse(),
            (cisloJednani) =>
            {
                // The local is intentionally named like its type: the static-looking calls below
                // (jednani.Merge, jednani.dokument) keep resolving exactly as before.
                var jednani = JednaniKomplexni(vyborId, cisloJednani, string.Format(jednaniUrlTemplate, cisloJednani));
                if (jednani == null)
                {
                    return new Devmasters.Batch.ActionOutputData();
                }

                // Attach resolutions dated the same day as the meeting.
                var docFromUsneseni = vsechnausneseni
                    .Where(m => m.datum == jednani.datum)
                    .Select(m => new jednani.dokument()
                    {
                        typ = "Usnesení",
                        DocumentUrl = m.fileUrl,
                        popis = m.popis,
                        jmeno = $"usneseni_{m.cislo}.docx"
                    })
                    .ToList();
                if (docFromUsneseni.Count > 0)
                {
                    jednani.dokumenty = jednani.dokumenty.Concat(docFromUsneseni).ToArray();
                }

                jednani.SetId();

                // Merge with the stored item, if there is one.
                jednani exists = null;
                try
                {
                    exists = dsc.GetItemFromDataset<jednani>(datasetname, jednani.Id).Result;
                }
                catch (Exception)
                {
                    // Best effort: a failed lookup is handled the same way as "not stored yet".
                }

                bool changed = true;
                if (exists != null)
                {
                    jednani = jednani.Merge(exists, jednani, out changed);
                }

                // OCR every document that has no plain text yet.
                Devmasters.Batch.Manager.DoActionForAll<jednani.dokument>(
                    jednani.dokumenty,
                    d =>
                    {
                        if (string.IsNullOrWhiteSpace(d.DocumentPlainText))
                        {
                            Console.WriteLine("OCR for " + d.DocumentUrl);
                            var ocrRes = HlidacStatu.Lib.OCR.Api.Client.TextFromUrl(
                                Devmasters.Config.GetWebConfigValue("OCRServerApiKey"),
                                new Uri(d.DocumentUrl),
                                "Vybory-PSP-parser",
                                HlidacStatu.Lib.OCR.Api.Client.TaskPriority.High,
                                HlidacStatu.Lib.OCR.Api.Client.MiningIntensity.Maximum,
                                null,
                                TimeSpan.FromMinutes(120));

                            if (ocrRes.IsValid == HlidacStatu.Lib.OCR.Api.Result.ResultStatus.Valid)
                            {
                                d.DocumentPlainText = ocrRes.MergedDocuments().Text;
                                // New text was obtained, so the item has to be saved.
                                changed = true;
                            }
                            else
                            {
                                Console.WriteLine($"Invalid OCR {d.DocumentUrl} - {ocrRes.Error}");
                            }
                        }
                        return new Devmasters.Batch.ActionOutputData();
                    },
                    null, null,
                    !System.Diagnostics.Debugger.IsAttached,
                    prefix: $"Jednani {cisloJednani} OCR:");

                // Collect the plain text of all documents whose type mentions "zápis" (minutes).
                // The match is case-insensitive and skips documents without a type.
                string jedn = "zápis";
                var zapisy = jednani.dokumenty
                    .Where(m => m.typ != null && m.typ.IndexOf(jedn, StringComparison.OrdinalIgnoreCase) >= 0)
                    .Select(m => m.DocumentPlainText)
                    .ToList();
                if (zapisy.Count > 0)
                {
                    // A real line break; the previous verbatim literal @"\n" produced a backslash
                    // followed by the letter n between the texts.
                    jednani.zapisJednani = string.Join("\n", zapisy);
                }

                string id = "";
                if (changed)
                {
                    id = dsc.AddItemToDataset(datasetname, jednani, DatasetConnector.AddItemMode.Rewrite).Result;
                }
                Console.WriteLine($"Saved vybor {jednani.vybor} jednani {jednani.Id} id {id}");

                return new Devmasters.Batch.ActionOutputData();
            },
            null, null,
            !System.Diagnostics.Debugger.IsAttached,
            maxDegreeOfParallelism: 5,
            prefix: "cisloJednani: ");
    }
}
/// <summary>
/// Collects the agenda identifiers for every year from <paramref name="fromYear"/> up to the current
/// year, parses each agenda and stores items that are new or whose collection sizes differ from the
/// stored version.
/// </summary>
/// <param name="dsc">Dataset connector used for reading and writing items.</param>
/// <param name="fromYear">First year to process; 2007 is used when null.</param>
public static void DownloadAllData(DatasetConnector dsc, int? fromYear = null)
{
    int ifromYear = fromYear ?? 2007;
    var years = Enumerable.Range(ifromYear, DateTime.Now.Year - ifromYear + 1);

    int totalSave = 0;
    List<string> agendy = new List<string>();
    // Both batch calls below request maxDegreeOfParallelism: 5, so the shared list and the counter
    // are updated in a thread-safe way.
    // NOTE(review): whether the callbacks really run concurrently depends on
    // Devmasters.Batch.Manager (not visible here); the guards are harmless if they do not.
    object agendyLock = new object();

    Devmasters.Batch.Manager.DoActionForAll(
        years.Reverse(),
        (y) =>
        {
            // Do the fetching and sorting outside the lock; only the append is serialized.
            var sorted = AgendaList(y).OrderBy(o => o).ToList();
            lock (agendyLock)
            {
                agendy.AddRange(sorted);
            }
            return new Devmasters.Batch.ActionOutputData();
        },
        Devmasters.Batch.Manager.DefaultOutputWriter,
        new Devmasters.Batch.ActionProgressWriter(0.1f).Write,
        false,
        maxDegreeOfParallelism: 5
    );

    Devmasters.Batch.Manager.DoActionForAll(
        agendy,
        (ag) =>
        {
            Console.WriteLine("Getting " + ag + " ");
            var jednaniArr = ParseAgenda(ag);

            Console.WriteLine("SAVING " + ag + " ");
            foreach (var j in jednaniArr.OrderBy(m => m.Id))
            {
                jednani exists = null;
                try
                {
                    Console.Write(j.Id + " L");
                    exists = dsc.GetItemFromDataset<jednani>(datasetname, j.Id).Result;
                }
                catch (Exception)
                {
                    // Best effort: a failed lookup is handled the same way as "not stored yet".
                }

                // Save when the item is not stored yet, or when any of the compared collections
                // has a different number of entries than the stored version (null counts as 0).
                bool needsSave = exists == null
                    || (exists.dokumenty?.Count() ?? 0) != (j.dokumenty?.Count() ?? 0)
                    || (exists.veklep?.Count() ?? 0) != (j.veklep?.Count() ?? 0)
                    || (exists.meni?.Count() ?? 0) != (j.meni?.Count() ?? 0)
                    || (exists.zmeneno?.Count() ?? 0) != (j.zmeneno?.Count() ?? 0)
                    || (exists.rusi?.Count() ?? 0) != (j.rusi?.Count() ?? 0)
                    || (exists.zruseno?.Count() ?? 0) != (j.zruseno?.Count() ?? 0);

                if (needsSave)
                {
                    // Reading .Result blocks until the write has finished; the returned id is not used.
                    string id = dsc.AddItemToDataset(datasetname, j, DatasetConnector.AddItemMode.Rewrite).Result;
                    System.Threading.Interlocked.Increment(ref totalSave);
                    Console.Write("S");
                }
                Console.WriteLine(".");
            }

            return new Devmasters.Batch.ActionOutputData();
        },
        null, // output writer intentionally not used
        null, // progress writer intentionally not used
        false,
        maxDegreeOfParallelism: 5,
        prefix: "AGENDY: "
    );

    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine("TOtal saved: " + totalSave);
}