/// <summary>
/// Consumer loop: takes <c>Rizeni</c> items from <c>ForStore</c> and uploads each one to the
/// insolvency-register dataset, retrying a failed upload until it succeeds.
/// </summary>
/// <remarks>
/// The loop keeps polling while any task in <c>TaskWs</c> is running or <c>ForDetailRequest</c>
/// still holds work. Once that condition turns false, whatever is still queued in
/// <c>ForStore</c> is drained so that items enqueued just before the last worker finished are
/// not lost.
/// </remarks>
private void StoreToHlidacStatu()
{
    var datasetConnector = new DatasetConnector(ApiToken);
    var dataset = InsolvencniRejstrikDataset.InsolvencniRejstrik;

    // Uploads a single item; every failure is reported through AddError and then retried.
    Action<Rizeni> store = item =>
    {
        // retrying until get correct answer :)
        while (true)
        {
            try
            {
                datasetConnector.AddItemToDataset<Rizeni>(dataset, item).Wait();
                break;
            }
            catch (Exception e)
            {
                AddError("Store", e);
                // Short pause so a persistent failure does not become a busy loop.
                Thread.Sleep(500);
            }
        }

        Interlocked.Increment(ref StoredItems);
        LastStoredItem = item.ZahajeniRizeni;
    };

    while (TaskWs.Any(t => t.Status == TaskStatus.Running) || !ForDetailRequest.IsEmpty)
    {
        Rizeni rizeni;
        if (ForStore.TryDequeue(out rizeni))
        {
            store(rizeni);
        }
        else
        {
            // Nothing to store right now; wait for more items to arrive.
            Thread.Sleep(100);
        }
    }

    // The loop above stops as soon as no task is running and ForDetailRequest is empty;
    // items may still be waiting in ForStore at that moment, so flush them.
    Rizeni remaining;
    while (ForStore.TryDequeue(out remaining))
    {
        store(remaining);
    }
}
static string _vyborUsneseni = "https://www.psp.cz/sqw/hp.sqw?k={0}&kk=5&td=1&n={1}"; //idVyboru+5 , page

/// <summary>
/// Processes one committee: loads its resolutions, finds the number of the latest meeting on
/// the committee page and, when the page links to per-meeting "hp.sqw" pages, parses every
/// meeting from the latest down to 1, attaches matching resolutions, merges with the stored
/// record, fills missing document text via OCR and saves the record when it changed.
/// </summary>
/// <param name="dsc">Dataset connector used to read existing records and to save results.</param>
/// <param name="vyborId">Committee id; the committee page is requested with <c>vyborId + 2</c>.</param>
/// <param name="rewrite">Not read anywhere in this method.</param>
public static void Vybor(DatasetConnector dsc, int vyborId, bool rewrite = false)
{
    List<_usneseni> vsechnausneseni = VsechnaUsneseniVyboru(vyborId);
    Console.WriteLine($"Vybor {vyborId}");

    int lastJednani = 0;
    var xp = GetPage(string.Format(_vyborHP, vyborId + 2));

    // The invitation links live in the sibling that follows the first <h2> of the main content.
    // Guard every step: a page without that header (or without a sibling) must not crash the run.
    var headers = xp.GetNodes("//div[@id='main-content']//h2");
    var xpozvanky = (headers != null && headers.Count > 0)
        ? headers[0].NextSibling?.SelectNodes(".//tr//a")
        : null;

    string jednaniUrlTemplate = "";
    bool komplexni = false;

    if (xpozvanky != null && xpozvanky.Count > 0)
    {
        var link = xpozvanky[0].GetAttributeValue("href", "");
        if (link.StartsWith("hp.sqw"))
        {
            // Link to a per-meeting page: the meeting number is carried in the "cu" query parameter.
            komplexni = true;
            var tmp = GetRegexGroupValue(link, @"&cu=(?<cislo>\d*)", "cislo");
            int.TryParse(tmp, out lastJednani);
            if (lastJednani > 0)
            {
                jednaniUrlTemplate = _vyborJednaniHProot + link.Replace("&cu=" + lastJednani, "&cu={0}");
            }
        }
        else
        {
            // Other link kinds: only the meeting number is read from the link text;
            // nothing below processes this case.
            komplexni = false;
            var tmp = GetRegexGroupValue(System.Net.WebUtility.HtmlDecode(xpozvanky[0].InnerText), @"č\. \s* (?<cislo>\d*)", "cislo");
            int.TryParse(tmp, out lastJednani);
        }
    }

    if (komplexni && lastJednani > 0 && !string.IsNullOrEmpty(jednaniUrlTemplate))
    {
        //parse HP jednani
        Devmasters.Batch.Manager.DoActionForAll<int>(Enumerable.Range(1, lastJednani).Reverse(),
            (cisloJednani) =>
            {
                var jednani = JednaniKomplexni(vyborId, cisloJednani, string.Format(jednaniUrlTemplate, cisloJednani));
                if (jednani == null)
                {
                    return new Devmasters.Batch.ActionOutputData();
                }

                //add usneseni: resolutions dated the same day as the meeting become extra documents
                var docFromUsneseni = vsechnausneseni
                    .Where(m => m.datum == jednani.datum)
                    .Select(m => new jednani.dokument()
                    {
                        typ = "Usnesení",
                        DocumentUrl = m.fileUrl,
                        popis = m.popis,
                        jmeno = $"usneseni_{m.cislo}.docx"
                    })
                    .ToList();
                if (docFromUsneseni.Count > 0)
                {
                    jednani.dokumenty = jednani.dokumenty.Concat(docFromUsneseni).ToArray();
                }
                jednani.SetId();

                //merge with existing
                jednani exists = null;
                try
                {
                    exists = dsc.GetItemFromDataset<jednani>(datasetname, jednani.Id).Result;
                }
                catch (Exception)
                {
                    // Best effort: a failed lookup is handled the same way as "no stored record".
                }

                bool changed = true;
                if (exists != null)
                {
                    jednani = jednani.Merge(exists, jednani, out changed);
                }

                //add OCR for every document that has no plain text yet
                Devmasters.Batch.Manager.DoActionForAll<jednani.dokument>(jednani.dokumenty,
                    d =>
                    {
                        if (string.IsNullOrWhiteSpace(d.DocumentPlainText))
                        {
                            Console.WriteLine("OCR for " + d.DocumentUrl);
                            var ocrRes = HlidacStatu.Lib.OCR.Api.Client.TextFromUrl(
                                Devmasters.Config.GetWebConfigValue("OCRServerApiKey"),
                                new Uri(d.DocumentUrl),
                                "Vybory-PSP-parser",
                                HlidacStatu.Lib.OCR.Api.Client.TaskPriority.High,
                                HlidacStatu.Lib.OCR.Api.Client.MiningIntensity.Maximum,
                                null,
                                TimeSpan.FromMinutes(120));

                            if (ocrRes.IsValid == HlidacStatu.Lib.OCR.Api.Result.ResultStatus.Valid)
                            {
                                d.DocumentPlainText = ocrRes.MergedDocuments().Text;
                                changed = true;
                            }
                            else
                            {
                                Console.WriteLine($"Invalid OCR {d.DocumentUrl} - {ocrRes.Error}");
                            }
                        }
                        return new Devmasters.Batch.ActionOutputData();
                    },
                    null, null, !System.Diagnostics.Debugger.IsAttached,
                    prefix: $"Jednani {cisloJednani} OCR:");

                // Meeting minutes: concatenate the text of every document whose type mentions "zápis".
                string jedn = "zápis";
                var zapisy = jednani.dokumenty
                    .Where(m => m.typ.ToLower().Contains(jedn))
                    .ToList();
                if (zapisy.Count > 0)
                {
                    // A real line break between documents; the verbatim literal @"\n" used before
                    // inserted a backslash followed by 'n' instead.
                    jednani.zapisJednani = string.Join("\n", zapisy.Select(m => m.DocumentPlainText));
                }

                string id = "";
                if (changed)
                {
                    id = dsc.AddItemToDataset(datasetname, jednani, DatasetConnector.AddItemMode.Rewrite).Result;
                }
                Console.WriteLine($"Saved vybor {jednani.vybor} jednani {jednani.Id} id {id}");

                return new Devmasters.Batch.ActionOutputData();
            },
            null, null, !System.Diagnostics.Debugger.IsAttached,
            maxDegreeOfParallelism: 5, prefix: "cisloJednani: ");
    }
}
/// <summary>
/// Downloads the page at <c>zap.url</c>, splits its paragraphs into per-speaker entries,
/// collects the audio link and the related links, and saves the record to the dataset.
/// </summary>
/// <param name="dsc">Dataset connector used to save the record.</param>
/// <param name="zap">Record to fill in; its <c>url</c> is the page to download.</param>
/// <returns>
/// The id returned by the dataset save, or an empty string when the page contains no
/// paragraphs to parse (in that case nothing is saved).
/// </returns>
static string ParseTiskovku(DatasetConnector dsc, zapis zap)
{
    string html = "";
    using (var net = new Devmasters.Net.Web.URLContent(zap.url))
    {
        html = net.GetContent().Text;
    }

    var xp = new XPath(html);
    var casti = xp.GetNodes("//div[@class='content-main']//div[@class='detail']/p");
    if (casti == null)
    {
        return ""; // no transcript yet, move on to the next one
    }

    List<zapis.vstup> vyjadreni = new List<zapis.vstup>();
    int poradi = 1;
    zapis.vstup vyj = new zapis.vstup();
    vyj.poradi = poradi;
    string prevMluvci = "~~~"; // sentinel: no speaker seen yet

    foreach (var cast in casti)
    {
        string text = System.Net.WebUtility.HtmlDecode(cast.InnerText);
        string mluvci = XPath.Tools.GetNodeText(cast, "./strong");

        if (string.IsNullOrWhiteSpace(mluvci))
        {
            // Paragraph without a speaker label continues the current entry.
            vyj.text = vyj.text + "\n" + text;
        }
        else if (mluvci == prevMluvci)
        {
            // Same speaker labelled again: keep the text (it used to be dropped silently)
            // and skip the repeated person lookup.
            vyj.text = vyj.text + "\n" + text.Replace(mluvci, "").Trim();
        }
        else
        {
            // New speaker: close the previous entry, but only when it was matched to a person.
            if (prevMluvci != "~~~"
                && !string.IsNullOrEmpty(vyj.jmeno)
                && !string.IsNullOrEmpty(vyj.prijmeni))
            {
                vyjadreni.Add(vyj);
                poradi++;
                vyj = new zapis.vstup();
                vyj.poradi = poradi;
            }

            string apiKey = System.Configuration.ConfigurationManager.AppSettings["apikey"];
            string lookupUrl = $"https://www.hlidacstatu.cz/api/v1/PolitikFromText?text={System.Net.WebUtility.UrlEncode(mluvci)}&Authorization={apiKey}";
            using (var net = new Devmasters.Net.Web.URLContent(lookupUrl))
            {
                net.Timeout = 60 * 1000;
                var osobahtml = net.GetContent().Text;
                var osoba = Newtonsoft.Json.Linq.JObject.Parse(osobahtml);
                if (!string.IsNullOrEmpty(osoba.Value<string>("jmeno")))
                {
                    vyj.jmeno = osoba.Value<string>("jmeno");
                    vyj.prijmeni = osoba.Value<string>("prijmeni");
                    vyj.osobaId = osoba.Value<string>("osobaid");

                    // Names come from the API response and are embedded in a regex, so escape
                    // them: metacharacters or spaces inside a name must match literally.
                    string jmenoRx = Regex.Escape(vyj.jmeno);
                    string prijmeniRx = Regex.Escape(vyj.prijmeni ?? "");
                    var info = Regex.Replace(
                            mluvci,
                            $@"({jmenoRx} \s {prijmeniRx} \s* ,?) | ({prijmeniRx} \s {jmenoRx} \s* ,?)",
                            "",
                            options)
                        .Replace(":", "");
                    vyj.osobainfo = info?.Trim();
                }
                else
                {
                    vyj.osobainfo = mluvci;
                }
            }

            text = text.Replace(mluvci, "").Trim();
            vyj.text = text;
            prevMluvci = mluvci;
        }
    }

    // The last entry is still open after the loop; keep it under the same rule as the others.
    if (!string.IsNullOrEmpty(vyj.text)
        && !string.IsNullOrEmpty(vyj.jmeno)
        && !string.IsNullOrEmpty(vyj.prijmeni))
    {
        vyjadreni.Add(vyj);
    }
    ////////// END vyjadreni
    zap.vstupy = vyjadreni.ToArray();

    //mp3
    var mp3urls = xp.GetNodes("//div[@class='record']//a");
    if (mp3urls != null && mp3urls.Count > 0)
    {
        var ahref = mp3urls
            .FirstOrDefault(m => m.Attributes["href"]?.Value?.ToLower()?.EndsWith(".mp3") == true);
        if (ahref != null)
        {
            zap.audio = new zapis.odkaz()
            {
                url = "https://www.vlada.cz" + ahref.Attributes["href"].Value,
                nazev = "Zvukový záznam"
            };
        }
    }
    ////// MP3

    //RELATED: <dt>/<dd> pairs are combined only when both lists have the same length
    var relsdt = xp.GetNodes("//dl[@class='related']/dt");
    var relsdd = xp.GetNodes("//dl[@class='related']/dd");
    if (relsdd?.Count > 0 && relsdd?.Count == relsdt?.Count)
    {
        List<zapis.odkaz> odkazy = new List<zapis.odkaz>();
        for (int i = 0; i < relsdt.Count; i++)
        {
            var odkaz = new zapis.odkaz();
            odkaz.nazev = System.Net.WebUtility.HtmlDecode(relsdt[i].InnerText)
                          + ": "
                          + System.Net.WebUtility.HtmlDecode(relsdd[i].InnerText);
            odkaz.url = "https://www.vlada.cz" + XPath.Tools.GetNodeAttributeValue(relsdd[i], "./a", "href");
            odkazy.Add(odkaz);
        }
        zap.souvisejici = odkazy.ToArray();
    }

    // No usable entry: store a single placeholder so the record still says why it is empty.
    if (zap.vstupy.Count() == 0)
    {
        bool maVideo = html.Contains("<ifram") || html.Contains("jwplayer");
        string placeholder = maVideo
            ? "Přepis tiskové konference není dostupný, pouze videozáznam."
            : "Přepis tiskové konference není dostupný.";
        zap.vstupy = new zapis.vstup[]
        {
            new zapis.vstup() { poradi = 1, text = placeholder }
        };
    }

    Console.WriteLine(zap.Id + " " + zap.nazev);
    zap.PrepareBeforeSave();
    var id = dsc.AddItemToDataset<zapis>(Parse.datasetname, zap, DatasetConnector.AddItemMode.Rewrite).Result;
    return id;
}
/// <summary>
/// Collects the agenda list for every year from <paramref name="fromYear"/> to the current
/// year, parses each agenda and saves every item that is either not stored yet or whose
/// collection sizes differ from the stored copy.
/// </summary>
/// <param name="dsc">Dataset connector used to read existing items and to save new ones.</param>
/// <param name="fromYear">First year to process; 2007 when omitted.</param>
public static void DownloadAllData(DatasetConnector dsc, int? fromYear = null)
{
    int ifromYear = fromYear ?? 2007;
    // A start year in the future yields an empty range instead of an ArgumentOutOfRangeException.
    var years = Enumerable.Range(ifromYear, Math.Max(0, DateTime.Now.Year - ifromYear + 1));

    int totalSave = 0;
    List<string> agendy = new List<string>();
    // Both batches below run their callbacks with maxDegreeOfParallelism: 5, so every write to
    // shared state has to be synchronized; List<T> is not safe for concurrent writers.
    var agendyLock = new object();

    Devmasters.Batch.Manager.DoActionForAll(years.Reverse(),
        (y) =>
        {
            var agendyRoku = AgendaList(y).OrderBy(o => o).ToList();
            lock (agendyLock)
            {
                agendy.AddRange(agendyRoku);
            }
            return new Devmasters.Batch.ActionOutputData();
        }
        , Devmasters.Batch.Manager.DefaultOutputWriter
        , new Devmasters.Batch.ActionProgressWriter(0.1f).Write
        , false
        , maxDegreeOfParallelism: 5
        );

    Devmasters.Batch.Manager.DoActionForAll(agendy,
        (ag) =>
        {
            Console.WriteLine("Getting " + ag + " ");
            var jednaniArr = ParseAgenda(ag);
            Console.WriteLine("SAVING " + ag + " ");

            foreach (var j in jednaniArr.OrderBy(m => m.Id))
            {
                jednani exists = null;
                try
                {
                    Console.Write(j.Id + " L");
                    exists = dsc.GetItemFromDataset<jednani>(datasetname, j.Id).Result;
                }
                catch (Exception)
                {
                    // Best effort: a failed lookup is handled the same way as "not stored yet".
                }

                // Save when the item is new, or when any of its collections differs in size
                // from the stored copy.
                bool save = exists == null
                    || (exists.dokumenty?.Count() ?? 0) != (j.dokumenty?.Count() ?? 0)
                    || (exists.veklep?.Count() ?? 0) != (j.veklep?.Count() ?? 0)
                    || (exists.meni?.Count() ?? 0) != (j.meni?.Count() ?? 0)
                    || (exists.zmeneno?.Count() ?? 0) != (j.zmeneno?.Count() ?? 0)
                    || (exists.rusi?.Count() ?? 0) != (j.rusi?.Count() ?? 0)
                    || (exists.zruseno?.Count() ?? 0) != (j.zruseno?.Count() ?? 0);

                if (save)
                {
                    dsc.AddItemToDataset(datasetname, j, DatasetConnector.AddItemMode.Rewrite).Wait();
                    // Atomic increment: the counter is shared by all parallel callbacks.
                    System.Threading.Interlocked.Increment(ref totalSave);
                    Console.Write("S");
                }
                Console.WriteLine(".");
            }

            return new Devmasters.Batch.ActionOutputData();
        }
        , null //Devmasters.Core.Batch.Manager.DefaultOutputWriter
        , null //new Devmasters.Core.Batch.ActionProgressWriter(0.1f).Write
        , false
        , maxDegreeOfParallelism: 5, prefix: "AGENDY: "
        );

    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine("TOtal saved: " + totalSave);
}