Esempio n. 1
0
        /// <summary>
        /// Merges the data of <paramref name="sec"/> into <paramref name="prim"/> and returns <paramref name="prim"/>.
        /// </summary>
        /// <remarks>
        /// A non-empty <c>vec</c> on <paramref name="sec"/> overwrites the one on <paramref name="prim"/>.
        /// Audio files and documents are merged by <c>DocumentUrl</c>: every item of <paramref name="sec"/>
        /// whose URL is not yet present in <paramref name="prim"/> is appended.
        /// Unlike the previous implementation, the merge also runs when both records hold the same
        /// number of items, so items with different URLs are no longer silently dropped.
        /// </remarks>
        /// <param name="prim">Record that receives the merged data; it is modified in place.</param>
        /// <param name="sec">Record whose data is merged in. When it has no audio/documents, its
        /// arrays are back-filled from <paramref name="prim"/> (behavior kept from the original code).</param>
        /// <param name="changed">Set to <c>true</c> when <paramref name="prim"/> received new data.</param>
        /// <returns>The <paramref name="prim"/> instance.</returns>
        public static jednani Merge(jednani prim, jednani sec, out bool changed)
        {
            Console.WriteLine($"Merging {prim.cisloJednani} with {sec.cisloJednani}");
            changed = false;

            if (!string.IsNullOrEmpty(sec.vec) && prim.vec != sec.vec)
            {
                prim.vec = sec.vec;
                changed = true;
            }

            if (sec.audio == null)
            {
                // Nothing to merge in; keep the original side effect of aligning sec with prim.
                sec.audio = prim.audio;
            }
            else
            {
                prim.audio = MergeByUrl(prim.audio, sec.audio, m => m.DocumentUrl, ref changed);
            }

            if (sec.dokumenty == null)
            {
                // Nothing to merge in; keep the original side effect of aligning sec with prim.
                sec.dokumenty = prim.dokumenty;
            }
            else
            {
                prim.dokumenty = MergeByUrl(prim.dokumenty, sec.dokumenty, m => m.DocumentUrl, ref changed);
            }

            return prim;
        }

        /// <summary>
        /// Returns the items of <paramref name="primary"/> followed by those items of
        /// <paramref name="secondary"/> whose URL does not occur earlier in the result.
        /// </summary>
        /// <param name="primary">Existing items; may be <c>null</c>.</param>
        /// <param name="secondary">Items to merge in; must not be <c>null</c>.</param>
        /// <param name="urlOf">Extracts the URL used as the identity of an item.</param>
        /// <param name="changed">Set to <c>true</c> when at least one item was appended; never reset to <c>false</c>.</param>
        private static T[] MergeByUrl<T>(T[] primary, T[] secondary, Func<T, string> urlOf, ref bool changed)
        {
            List<T> merged = new List<T>(primary ?? new T[] { });
            // Hash-based lookup (ordinal string equality, same as ==) instead of rescanning the list per item.
            HashSet<string> knownUrls = new HashSet<string>(merged.Select(urlOf));

            foreach (T candidate in secondary)
            {
                // Add returns false for an already known URL, which also de-duplicates within secondary.
                if (knownUrls.Add(urlOf(candidate)))
                {
                    merged.Add(candidate);
                    changed = true;
                }
            }

            return merged.ToArray();
        }
Esempio n. 2
0
        // URL template on psp.cz, presumably for the paged list of a committee's resolutions ("usneseni").
        // Placeholders per the original author's note: {0} = committee id + 5, {1} = page number.
        // The code that fills the template is not visible in this section.
        static string _vyborUsneseni      = "https://www.psp.cz/sqw/hp.sqw?k={0}&kk=5&td=1&n={1}"; // committee id + 5, page

        /// <summary>
        /// Processes all meetings of one committee: parses each meeting page, attaches the committee
        /// resolutions dated the same day, merges the result with the item already stored in the dataset,
        /// fills in missing document text via OCR and saves the item when anything changed.
        /// </summary>
        /// <remarks>
        /// The number of the latest meeting is read from the first link of the table that follows the
        /// first <c>h2</c> on the committee home page. Only committees whose links start with
        /// <c>hp.sqw</c> ("komplexni" layout) are processed; for the other layout the meeting number is
        /// parsed but no meetings are downloaded.
        /// </remarks>
        /// <param name="dsc">Dataset connector used to read the existing item and to store the result.</param>
        /// <param name="vyborId">Committee id; the home page is requested with <c>vyborId + 2</c>.</param>
        /// <param name="rewrite">Currently not used by this method.</param>
        public static void Vybor(DatasetConnector dsc, int vyborId, bool rewrite = false)
        {
            List<_usneseni> vsechnausneseni = VsechnaUsneseniVyboru(vyborId);

            Console.WriteLine($"Vybor {vyborId}");

            int    lastJednani        = 0;
            var    xp                 = GetPage(string.Format(_vyborHP, vyborId + 2));
            var    xpozvanky          = xp.GetNodes("//div[@id='main-content']//h2")[0].NextSibling.SelectNodes(".//tr//a");
            string jednaniUrlTemplate = "";

            bool komplexni = false;

            if (xpozvanky != null && xpozvanky.Count > 0)
            {
                var link = xpozvanky[0].GetAttributeValue("href", "");
                if (link.StartsWith("hp.sqw"))
                {
                    komplexni = true;
                    var tmp = GetRegexGroupValue(link, @"&cu=(?<cislo>\d*)", "cislo");
                    int.TryParse(tmp, out lastJednani);
                    if (lastJednani > 0)
                    {
                        // Turn the link of the latest meeting into a template for any meeting number.
                        jednaniUrlTemplate = _vyborJednaniHProot + link.Replace("&cu=" + lastJednani, "&cu={0}");
                    }
                }
                else
                {
                    komplexni = false;
                    var tmp = GetRegexGroupValue(System.Net.WebUtility.HtmlDecode(xpozvanky[0].InnerText), @"č\. \s* (?<cislo>\d*)", "cislo");
                    int.TryParse(tmp, out lastJednani);
                }
            }

            if (komplexni && lastJednani > 0 && !string.IsNullOrEmpty(jednaniUrlTemplate))
            {
                // Parse every meeting page, from the latest meeting down to meeting no. 1.
                Devmasters.Batch.Manager.DoActionForAll <int>(Enumerable.Range(1, lastJednani).Reverse(),
                                                              (cisloJednani) =>
                {
                    var jednani = JednaniKomplexni(vyborId, cisloJednani, string.Format(jednaniUrlTemplate, cisloJednani));
                    if (jednani == null)
                    {
                        return(new Devmasters.Batch.ActionOutputData());
                    }

                    // Attach the resolutions adopted on the day of the meeting.
                    var docFromUsneseni = vsechnausneseni
                                          .Where(m => m.datum == jednani.datum)
                                          .Select(m => new jednani.dokument()
                    {
                        typ         = "Usnesení",
                        DocumentUrl = m.fileUrl,
                        popis       = m.popis,
                        jmeno       = $"usneseni_{m.cislo}.docx"
                    }).ToList();
                    if (docFromUsneseni.Count > 0)
                    {
                        // dokumenty is null when the meeting page listed no documents; Concat would throw on null.
                        jednani.dokumenty = (jednani.dokumenty ?? new jednani.dokument[] { })
                                            .Concat(docFromUsneseni)
                                            .ToArray();
                    }
                    jednani.SetId();

                    // Merge with the item already stored in the dataset, if any.
                    jednani exists = null;
                    try
                    {
                        exists = dsc.GetItemFromDataset <jednani>(datasetname, jednani.Id).Result;
                    }
                    catch (Exception)
                    {
                        // Best effort: any failure to load the stored item is treated as "no existing item"
                        // (presumably the connector throws for unknown ids - TODO confirm).
                    }

                    // Without a stored item the record is new and always has to be saved.
                    bool changed = true;
                    if (exists != null)
                    {
                        jednani = jednani.Merge(exists, jednani, out changed);
                    }

                    // Both a freshly parsed and a merged record may carry no documents at all.
                    var dokumenty = jednani.dokumenty ?? new jednani.dokument[] { };

                    // Run OCR for every document that has no plain text yet.
                    Devmasters.Batch.Manager.DoActionForAll <jednani.dokument>(dokumenty,
                                                                               d =>
                    {
                        if (string.IsNullOrWhiteSpace(d.DocumentPlainText))
                        {
                            Console.WriteLine("OCR for " + d.DocumentUrl);
                            var ocrRes = HlidacStatu.Lib.OCR.Api.Client.TextFromUrl(
                                Devmasters.Config.GetWebConfigValue("OCRServerApiKey"),
                                new Uri(d.DocumentUrl),
                                "Vybory-PSP-parser", HlidacStatu.Lib.OCR.Api.Client.TaskPriority.High,
                                HlidacStatu.Lib.OCR.Api.Client.MiningIntensity.Maximum,
                                null, TimeSpan.FromMinutes(120));

                            if (ocrRes.IsValid == HlidacStatu.Lib.OCR.Api.Result.ResultStatus.Valid)
                            {
                                d.DocumentPlainText = ocrRes.MergedDocuments().Text;
                                changed             = true;
                            }
                            else
                            {
                                Console.WriteLine($"Invalid OCR {d.DocumentUrl} - {ocrRes.Error}");
                            }
                        }
                        return(new Devmasters.Batch.ActionOutputData());
                    }, null, null, !System.Diagnostics.Debugger.IsAttached, prefix: $"Jednani {cisloJednani} OCR:");

                    // Build the meeting minutes from the text of all documents whose type mentions "zápis".
                    string jedn = "zápis";
                    var zapisy = dokumenty
                                 .Where(m => m.typ != null && m.typ.ToLower().Contains(jedn))
                                 .Select(m => m.DocumentPlainText)
                                 .ToList();
                    if (zapisy.Count > 0)
                    {
                        // A real line break; the former verbatim literal @"\n" joined with a backslash and 'n'.
                        jednani.zapisJednani = string.Join("\n", zapisy);
                    }

                    string id = "";
                    if (changed)
                    {
                        id = dsc.AddItemToDataset(datasetname, jednani, DatasetConnector.AddItemMode.Rewrite).Result;
                    }
                    Console.WriteLine($"Saved vybor {jednani.vybor} jednani {jednani.Id} id {id}");
                    return(new Devmasters.Batch.ActionOutputData());
                }, null, null, !System.Diagnostics.Debugger.IsAttached, maxDegreeOfParallelism: 5, prefix: "cisloJednani: ");
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Parses one committee meeting page into a <c>jednani</c> record, including its documents
        /// and MP3 recordings.
        /// </summary>
        /// <remarks>
        /// The meeting date is read from the page heading (<c>h1</c>), either as a single date or as a
        /// combined multi-day date. Documents are collected from the table rows that follow each
        /// <c>h4</c> heading; the heading text becomes the document type. Recordings (headings whose
        /// text contains "mp3") are additionally passed to the speech-to-text helper.
        /// </remarks>
        /// <param name="vyborId">Committee id.</param>
        /// <param name="cisloJednani">Meeting number.</param>
        /// <param name="url">URL of the meeting page.</param>
        /// <returns>The parsed meeting, or <c>null</c> when no date could be parsed from the heading.</returns>
        private static jednani JednaniKomplexni(int vyborId, int cisloJednani, string url)
        {
            Console.WriteLine($"Vybor {vyborId} jednani {cisloJednani}");

            var     xp = GetPage(url);
            jednani j  = new jednani();

            j.cisloJednani = cisloJednani;

            // The heading is the only source of the date; read it once and reuse it for every pattern.
            var nadpis = xp.GetNodeText("//div[@id='main-content']//h1");

            // Simple date, e.g. "(1. ledna 2020)".
            string sdatum = GetRegexGroupValue(
                nadpis
                , @"\(\s*  (?<datum>\d{1,2}\. \s \w{4,15} \s \d{4})  \s* \)"
                , "datum");
            if (string.IsNullOrEmpty(sdatum))
            {
                // Combined (multi-day) date; the first day is used as the meeting date.
                // NOTE(review): the group "mesic" is declared twice. For a heading spanning two months
                // the value returned depends on how GetRegexGroupValue picks the capture - verify.
                string regexKombiDatum = @"\(\s* 

((?<den>\d{1,2})\. \s* ((?<mesic>\w{4,15})\s* a|a|až) )? 

\s* \d{1,2}\. \s (?<mesic>\w{4,15}) \s (?<rok>\d{4})

\s* \)";
                var sden   = GetRegexGroupValue(nadpis, regexKombiDatum, "den");
                var smesic = GetRegexGroupValue(nadpis, regexKombiDatum, "mesic");
                var srok   = GetRegexGroupValue(nadpis, regexKombiDatum, "rok");

                sdatum = $"{sden}. {smesic} {srok}";
            }

            // A heading without a parsable date means the page is not a usable meeting page.
            DateTime datum;
            if (!DateTime.TryParseExact(
                    sdatum,
                    "d. MMMM yyyy",
                    System.Globalization.CultureInfo.GetCultureInfo("cs-CZ"),
                    System.Globalization.DateTimeStyles.None,
                    out datum))
            {
                return null;
            }

            j.datum    = datum;
            j.vybor    = Vybory[vyborId];
            j.vyborId  = vyborId;
            j.vyborUrl = string.Format(_vyborHP, vyborId + 2);
            j.SetId();

            const string rootUrl = "https://www.psp.cz/sqw/";

            // Each h4 heading names a document type; its documents are in the rows of the next sibling.
            var xtypes = xp.GetNodes("//div[@id='main-content']//h4");
            List<jednani.dokument> docs = new List<jednani.dokument>();
            List<jednani.mp3>      mp3s = new List<jednani.mp3>();

            for (int ityp = 0; ityp < xtypes.Count; ityp++)
            {
                var typDok = xtypes[ityp].InnerText;
                // A heading without a following sibling simply has no documents.
                var xrows = xtypes[ityp].NextSibling?.SelectNodes(".//tr");
                if (xrows == null)
                {
                    continue;
                }

                foreach (var xrow in xrows)
                {
                    var fUrlNode = XPath.Tools.GetNode(xrow, "td[1]/a");
                    if (fUrlNode == null)
                    {
                        continue;
                    }

                    var fUrl = fUrlNode.GetAttributeValue("href", "");
                    if (fUrl.StartsWith("text/text"))
                    {
                        // Link to an intermediate page that describes the actual file.
                        FilePage(rootUrl + fUrl, out string title, out string fileUrl, out string ext);
                        if (!string.IsNullOrEmpty(fileUrl))
                        {
                            docs.Add(new jednani.dokument()
                            {
                                DocumentUrl = fileUrl,
                                jmeno       = MakeValidFileName(title) + ext,
                                popis       = title,
                                typ         = typDok,
                            });
                        }
                    }
                    else if (fUrl.StartsWith("text/orig"))
                    {
                        // Direct link to the file itself.
                        if (typDok.ToLower().Contains("mp3"))
                        {
                            var jmp3 = new jednani.mp3()
                            {
                                DocumentUrl = rootUrl + fUrl,
                                jmeno       = MakeValidFileName(fUrlNode.InnerText),
                            };
                            mp3s.Add(jmp3);

                            // The first recording uses the meeting id as its key, later ones get a "_<n>" suffix.
                            MP3    mp3    = new MP3(Program.mp3path, Program.apikey);
                            string smp3id = mp3s.Count == 1 ? "" : "_" + mp3s.Count;
                            var    blocks = mp3.CheckDownloadAndStartV2TOrGet(Parse.datasetname, j.Id + smp3id, rootUrl + fUrl);
                            if (blocks != null)
                            {
                                jmp3.DocumentPlainText = Devmasters.SpeechToText.VoiceToTextFormatter.TextWithTimestampsToText(blocks);
                                jmp3.prepisAudia       = blocks
                                                         .Select(t => new jednani.mp3.blok()
                                {
                                    sekundOdZacatku = (long)t.Start.TotalSeconds, text = t.Text
                                })
                                                         .ToArray();
                            }
                        }
                        else
                        {
                            docs.Add(new jednani.dokument()
                            {
                                DocumentUrl = rootUrl + fUrl,
                                jmeno       = MakeValidFileName(fUrlNode.InnerText),
                                popis       = "",
                                typ         = typDok,
                            });
                        }
                    }
                }
            }

            // Empty collections are stored as null, not as empty arrays.
            j.audio     = mp3s.Count == 0 ? null : mp3s.ToArray();
            j.dokumenty = docs.Count == 0 ? null : docs.ToArray();

            return j;
        }