/// <summary>
/// Merges the data of <paramref name="sec"/> into <paramref name="prim"/> and returns
/// <paramref name="prim"/>. The subject (<c>vec</c>) is overwritten when the secondary
/// record carries a non-empty, different value; audio recordings and documents are
/// unioned by their <c>DocumentUrl</c>.
/// </summary>
/// <param name="prim">Record that receives the merged data; it is modified in place.</param>
/// <param name="sec">Record whose data is merged in; it is not modified.</param>
/// <param name="changed">Set to <c>true</c> when anything on <paramref name="prim"/> was changed.</param>
/// <returns>The same instance as <paramref name="prim"/>.</returns>
public static jednani Merge(jednani prim, jednani sec, out bool changed)
{
    Console.WriteLine($"Merging {prim.cisloJednani} with {sec.cisloJednani}");
    changed = false;

    if (!string.IsNullOrEmpty(sec.vec) && prim.vec != sec.vec)
    {
        prim.vec = sec.vec;
        changed = true;
    }

    // Always merge by URL. Comparing only the array lengths (as was done before)
    // misses the case where both lists have the same size but different items.
    prim.audio = MergeByUrl(prim.audio, sec.audio, m => m.DocumentUrl, ref changed);
    prim.dokumenty = MergeByUrl(prim.dokumenty, sec.dokumenty, d => d.DocumentUrl, ref changed);

    return prim;
}

/// <summary>
/// Returns the items of <paramref name="primary"/> followed by those items of
/// <paramref name="secondary"/> whose URL is not present yet.
/// </summary>
/// <param name="primary">Existing items; may be <c>null</c>.</param>
/// <param name="secondary">Candidate items; when <c>null</c>, <paramref name="primary"/> is returned untouched.</param>
/// <param name="urlOf">Extracts the URL used as the identity of an item.</param>
/// <param name="changed">Set to <c>true</c> when at least one item was appended; never reset to <c>false</c>.</param>
private static T[] MergeByUrl<T>(T[] primary, T[] secondary, Func<T, string> urlOf, ref bool changed)
{
    if (secondary == null)
    {
        return primary;
    }

    var merged = new List<T>(primary ?? new T[] { });
    // The set also receives newly appended URLs, so duplicates inside `secondary` are skipped too.
    var knownUrls = new HashSet<string>(merged.Select(urlOf));
    bool added = false;

    foreach (var candidate in secondary)
    {
        if (knownUrls.Add(urlOf(candidate)))
        {
            merged.Add(candidate);
            added = true;
        }
    }

    if (added)
    {
        changed = true;
    }

    // Keep the original array instance when nothing was appended.
    return (added || primary == null) ? merged.ToArray() : primary;
}
// URL template of a committee's resolution list: {0} = committee id + 5, {1} = page.
static string _vyborUsneseni = "https://www.psp.cz/sqw/hp.sqw?k={0}&kk=5&td=1&n={1}";

/// <summary>
/// Downloads all meetings of one committee, enriches them with resolutions and OCR text
/// and stores them in the dataset.
/// </summary>
/// <remarks>
/// The number of the latest meeting is read from the first link of the invitation table
/// on the committee page. Only the "hp.sqw" link form is processed further; for other
/// link forms the meeting number is parsed but nothing is stored.
/// </remarks>
/// <param name="dsc">Dataset connector used to read existing items and store results.</param>
/// <param name="vyborId">Committee id.</param>
/// <param name="rewrite">Not used by this method body.</param>
public static void Vybor(DatasetConnector dsc, int vyborId, bool rewrite = false)
{
    List<_usneseni> vsechnausneseni = VsechnaUsneseniVyboru(vyborId);
    Console.WriteLine($"Vybor {vyborId}");

    int lastJednani = 0;
    var xp = GetPage(string.Format(_vyborHP, vyborId + 2));

    // The invitation table is expected right after the first <h2>; guard against a
    // page without the heading or without a following sibling instead of crashing.
    var xnadpisy = xp.GetNodes("//div[@id='main-content']//h2");
    var xpozvanky = (xnadpisy != null && xnadpisy.Count > 0)
        ? xnadpisy[0].NextSibling?.SelectNodes(".//tr//a")
        : null;
    if (xpozvanky == null || xpozvanky.Count == 0)
    {
        Console.WriteLine($"Vybor {vyborId}: no invitation links found");
        return;
    }

    string jednaniUrlTemplate = "";
    bool komplexni = false;

    var link = xpozvanky[0].GetAttributeValue("href", "");
    if (link.StartsWith("hp.sqw", StringComparison.Ordinal))
    {
        komplexni = true;
        var tmp = GetRegexGroupValue(link, @"&cu=(?<cislo>\d*)", "cislo");
        int.TryParse(tmp, out lastJednani);
        if (lastJednani > 0)
        {
            // Turn the link of the latest meeting into a template for any meeting number.
            jednaniUrlTemplate = _vyborJednaniHProot + link.Replace("&cu=" + lastJednani, "&cu={0}");
        }
    }
    else
    {
        komplexni = false;
        var tmp = GetRegexGroupValue(System.Net.WebUtility.HtmlDecode(xpozvanky[0].InnerText), @"č\. \s* (?<cislo>\d*)", "cislo");
        int.TryParse(tmp, out lastJednani);
    }

    if (!komplexni || lastJednani <= 0 || string.IsNullOrEmpty(jednaniUrlTemplate))
    {
        return;
    }

    // Process meetings from the newest to the oldest.
    Devmasters.Batch.Manager.DoActionForAll<int>(Enumerable.Range(1, lastJednani).Reverse(),
        (cisloJednani) =>
        {
            var zaznam = JednaniKomplexni(vyborId, cisloJednani, string.Format(jednaniUrlTemplate, cisloJednani));
            if (zaznam == null)
            {
                return new Devmasters.Batch.ActionOutputData();
            }

            // Add resolutions adopted on the meeting date.
            var docFromUsneseni = vsechnausneseni
                .Where(m => m.datum == zaznam.datum)
                .Select(m => new jednani.dokument()
                {
                    typ = "Usnesení",
                    DocumentUrl = m.fileUrl,
                    popis = m.popis,
                    jmeno = $"usneseni_{m.cislo}.docx"
                })
                .ToList();
            if (docFromUsneseni.Count > 0)
            {
                // dokumenty is null when the meeting page listed no documents.
                zaznam.dokumenty = (zaznam.dokumenty ?? new jednani.dokument[] { })
                    .Concat(docFromUsneseni)
                    .ToArray();
            }
            zaznam.SetId();

            // Merge with the item already stored in the dataset, if any.
            jednani exists = null;
            try
            {
                exists = dsc.GetItemFromDataset<jednani>(datasetname, zaznam.Id).Result;
            }
            catch (Exception)
            {
                // Best effort: a failed lookup is treated as "item not stored yet".
            }

            bool changed = true;
            if (exists != null)
            {
                zaznam = jednani.Merge(exists, zaznam, out changed);
            }

            // Add OCR text to documents that do not have any yet.
            if (zaznam.dokumenty != null)
            {
                Devmasters.Batch.Manager.DoActionForAll<jednani.dokument>(zaznam.dokumenty,
                    d =>
                    {
                        if (string.IsNullOrWhiteSpace(d.DocumentPlainText))
                        {
                            Console.WriteLine("OCR for " + d.DocumentUrl);
                            var ocrRes = HlidacStatu.Lib.OCR.Api.Client.TextFromUrl(
                                Devmasters.Config.GetWebConfigValue("OCRServerApiKey"),
                                new Uri(d.DocumentUrl),
                                "Vybory-PSP-parser",
                                HlidacStatu.Lib.OCR.Api.Client.TaskPriority.High,
                                HlidacStatu.Lib.OCR.Api.Client.MiningIntensity.Maximum,
                                null,
                                TimeSpan.FromMinutes(120));
                            if (ocrRes.IsValid == HlidacStatu.Lib.OCR.Api.Result.ResultStatus.Valid)
                            {
                                d.DocumentPlainText = ocrRes.MergedDocuments().Text;
                                changed = true;
                            }
                            else
                            {
                                Console.WriteLine($"Invalid OCR {d.DocumentUrl} - {ocrRes.Error}");
                            }
                        }
                        return new Devmasters.Batch.ActionOutputData();
                    },
                    null, null, !System.Diagnostics.Debugger.IsAttached,
                    prefix: $"Jednani {cisloJednani} OCR:");
            }

            // Build the meeting minutes from the text of all documents of the "zápis" type.
            string jedn = "zápis";
            var zapisy = (zaznam.dokumenty ?? new jednani.dokument[] { })
                .Where(m => m.typ != null && m.typ.ToLower().Contains(jedn))
                .ToList();
            if (zapisy.Count > 0)
            {
                // A real line break; the previous verbatim @"\n" produced a literal backslash + 'n'.
                zaznam.zapisJednani = string.Join("\n", zapisy.Select(m => m.DocumentPlainText));
            }

            string id = "";
            if (changed)
            {
                id = dsc.AddItemToDataset(datasetname, zaznam, DatasetConnector.AddItemMode.Rewrite).Result;
            }
            Console.WriteLine($"Saved vybor {zaznam.vybor} jednani {zaznam.Id} id {id}");

            return new Devmasters.Batch.ActionOutputData();
        },
        null, null, !System.Diagnostics.Debugger.IsAttached,
        maxDegreeOfParallelism: 5, prefix: "cisloJednani: ");
}
/// <summary>
/// Parses the page of one committee meeting: its date, the listed documents and the
/// audio recordings (including their transcripts when available).
/// </summary>
/// <param name="vyborId">Committee id.</param>
/// <param name="cisloJednani">Meeting number.</param>
/// <param name="url">URL of the meeting page.</param>
/// <returns>
/// The parsed meeting, or <c>null</c> when no date can be parsed from the page heading.
/// <c>audio</c> and <c>dokumenty</c> are <c>null</c> (not empty) when nothing was found.
/// </returns>
private static jednani JednaniKomplexni(int vyborId, int cisloJednani, string url)
{
    Console.WriteLine($"Vybor {vyborId} jednani {cisloJednani}");
    var xp = GetPage(url);

    jednani j = new jednani();
    j.cisloJednani = cisloJednani;

    // The heading is read once and reused by all date patterns below.
    var nadpis = xp.GetNodeText("//div[@id='main-content']//h1");

    // Simple date, e.g. "(5. března 2020)".
    string sdatum = GetRegexGroupValue(
        nadpis,
        @"\(\s* (?<datum>\d{1,2}\. \s \w{4,15} \s \d{4}) \s* \)",
        "datum");
    if (string.IsNullOrEmpty(sdatum))
    {
        // Combined date (several days in one heading); the first day is used.
        string regexKombiDatum = @"\(\s* ((?<den>\d{1,2})\. \s* ((?<mesic>\w{4,15})\s* a|a|až) )? \s* \d{1,2}\. \s (?<mesic>\w{4,15}) \s (?<rok>\d{4}) \s* \)";
        var sden = GetRegexGroupValue(nadpis, regexKombiDatum, "den");
        var smesic = GetRegexGroupValue(nadpis, regexKombiDatum, "mesic");
        var srok = GetRegexGroupValue(nadpis, regexKombiDatum, "rok");
        sdatum = $"{sden}. {smesic} {srok}";
    }

    DateTime datum;
    if (!DateTime.TryParseExact(
            sdatum,
            "d. MMMM yyyy",
            System.Globalization.CultureInfo.GetCultureInfo("cs-CZ"),
            System.Globalization.DateTimeStyles.None,
            out datum))
    {
        // Without a date the meeting cannot be identified.
        return null;
    }
    j.datum = datum;

    j.vybor = Vybory[vyborId];
    j.vyborId = vyborId;
    j.vyborUrl = string.Format(_vyborHP, vyborId + 2);
    j.SetId();

    // Document types: every <h4> heading is expected to be followed by a table of files.
    var xtypes = xp.GetNodes("//div[@id='main-content']//h4");
    List<jednani.dokument> docs = new List<jednani.dokument>();
    List<jednani.mp3> mp3s = new List<jednani.mp3>();
    var rootUrl = "https://www.psp.cz/sqw/";

    for (int ityp = 0; xtypes != null && ityp < xtypes.Count; ityp++)
    {
        var typDok = xtypes[ityp].InnerText;
        // A heading without a following sibling has no rows to read.
        var xrows = xtypes[ityp].NextSibling?.SelectNodes(".//tr");
        if (xrows == null)
        {
            continue;
        }

        foreach (var xrow in xrows)
        {
            var fUrlNode = XPath.Tools.GetNode(xrow, "td[1]/a");
            if (fUrlNode == null)
            {
                continue;
            }

            var fUrl = fUrlNode.GetAttributeValue("href", "");
            if (fUrl.StartsWith("text/text", StringComparison.Ordinal))
            {
                // Link to another page that describes the file.
                FilePage(rootUrl + fUrl, out string title, out string fileUrl, out string ext);
                if (!string.IsNullOrEmpty(fileUrl))
                {
                    docs.Add(new jednani.dokument()
                    {
                        DocumentUrl = fileUrl,
                        jmeno = MakeValidFileName(title) + ext,
                        popis = title,
                        typ = typDok,
                    });
                }
            }
            else if (fUrl.StartsWith("text/orig", StringComparison.Ordinal))
            {
                if (typDok.ToLower().Contains("mp3"))
                {
                    // Direct link to an audio file.
                    var jmp3 = new jednani.mp3()
                    {
                        DocumentUrl = rootUrl + fUrl,
                        jmeno = MakeValidFileName(fUrlNode.InnerText),
                    };
                    mp3s.Add(jmp3);

                    MP3 mp3 = new MP3(Program.mp3path, Program.apikey);
                    // The first recording uses the plain meeting id, later ones get a "_N" suffix.
                    string smp3id = mp3s.Count == 1 ? "" : "_" + mp3s.Count;
                    var blocks = mp3.CheckDownloadAndStartV2TOrGet(Parse.datasetname, j.Id + smp3id, rootUrl + fUrl);
                    if (blocks != null)
                    {
                        jmp3.DocumentPlainText = Devmasters.SpeechToText.VoiceToTextFormatter.TextWithTimestampsToText(blocks);
                        jmp3.prepisAudia = blocks
                            .Select(t => new jednani.mp3.blok()
                            {
                                sekundOdZacatku = (long)t.Start.TotalSeconds,
                                text = t.Text
                            })
                            .ToArray();
                    }
                }
                else
                {
                    // Direct link to a document file.
                    docs.Add(new jednani.dokument()
                    {
                        DocumentUrl = rootUrl + fUrl,
                        jmeno = MakeValidFileName(fUrlNode.InnerText),
                        popis = "",
                        typ = typDok,
                    });
                }
            }
        }
    }

    j.audio = mp3s.Count == 0 ? null : mp3s.ToArray();
    j.dokumenty = docs.Count == 0 ? null : docs.ToArray();
    return j;
}