Exemplo n.º 1
0
        static string _vyborUsneseni      = "https://www.psp.cz/sqw/hp.sqw?k={0}&kk=5&td=1&n={1}"; // URL template: {0} = committee id + 5, {1} = page number

        /// <summary>
        /// Processes all meetings of one committee: loads the committee home page, derives the number of the
        /// latest meeting and a per-meeting URL template from the first invitation link, and then for every
        /// meeting number from the latest down to 1 parses the meeting, attaches resolutions with the same date,
        /// merges it with the record already stored in the dataset, fills missing document text via OCR and
        /// saves the record when anything changed.
        /// </summary>
        /// <param name="dsc">Dataset connector used to read the stored record and to save the updated one.</param>
        /// <param name="vyborId">Committee id; <c>vyborId + 2</c> is formatted into <c>_vyborHP</c>.</param>
        /// <param name="rewrite">Not read by this method; kept so existing callers keep compiling.</param>
        /// <remarks>
        /// Only the "complex" layout (first invitation link starting with <c>hp.sqw</c>) is processed. For any
        /// other layout the meeting number is parsed but nothing is downloaded or saved.
        /// </remarks>
        public static void Vybor(DatasetConnector dsc, int vyborId, bool rewrite = false)
        {
            List <_usneseni> vsechnausneseni = VsechnaUsneseniVyboru(vyborId);

            Console.WriteLine($"Vybor {vyborId}");

            int    lastJednani        = 0;
            var    xp                 = GetPage(string.Format(_vyborHP, vyborId + 2));
            // NOTE(review): indexing [0] assumes the page contains at least one <h2> inside #main-content.
            var    xpozvanky          = xp.GetNodes("//div[@id='main-content']//h2")[0].NextSibling.SelectNodes(".//tr//a");
            string jednaniUrlTemplate = "";

            bool komplexni = false;

            if (xpozvanky != null && xpozvanky.Count > 0)
            {
                var link = xpozvanky[0].GetAttributeValue("href", "");
                if (link.StartsWith("hp.sqw"))
                {
                    komplexni = true;
                    var tmp = GetRegexGroupValue(link, @"&cu=(?<cislo>\d*)", "cislo");
                    int.TryParse(tmp, out lastJednani);
                    if (lastJednani > 0)
                    {
                        // Turn the link to the latest meeting into a template usable for every meeting number.
                        jednaniUrlTemplate = _vyborJednaniHProot + link.Replace("&cu=" + lastJednani, "&cu={0}");
                    }
                }
                else
                {
                    // Simple layout: the number is taken from the link text, but it is not used further below.
                    var tmp = GetRegexGroupValue(System.Net.WebUtility.HtmlDecode(xpozvanky[0].InnerText), @"č\. \s* (?<cislo>\d*)", "cislo");
                    int.TryParse(tmp, out lastJednani);
                }
            }

            if (!komplexni || lastJednani <= 0 || string.IsNullOrEmpty(jednaniUrlTemplate))
            {
                return;
            }

            // Parse the meeting pages, newest first.
            Devmasters.Batch.Manager.DoActionForAll <int>(Enumerable.Range(1, lastJednani).Reverse(),
                                                          (cisloJednani) =>
            {
                var jednani = JednaniKomplexni(vyborId, cisloJednani, string.Format(jednaniUrlTemplate, cisloJednani));
                if (jednani == null)
                {
                    return(new Devmasters.Batch.ActionOutputData());
                }

                // Attach resolutions adopted on the same date as this meeting.
                var docFromUsneseni = vsechnausneseni
                                      .Where(m => m.datum == jednani.datum)
                                      .Select(m => new jednani.dokument()
                {
                    typ         = "Usnesení",
                    DocumentUrl = m.fileUrl,
                    popis       = m.popis,
                    jmeno       = $"usneseni_{m.cislo}.docx"
                }).ToList();
                if (docFromUsneseni.Count > 0)
                {
                    // dokumenty may be null (other code in this file reads it with ?.), so fall back to an empty array.
                    jednani.dokumenty = (jednani.dokumenty ?? new jednani.dokument[0])
                                        .Concat(docFromUsneseni)
                                        .ToArray();
                }
                jednani.SetId();

                // Merge with the record already stored in the dataset, if there is one.
                jednani exists = null;
                try
                {
                    exists = dsc.GetItemFromDataset <jednani>(datasetname, jednani.Id).Result;
                }
                catch (Exception)
                {
                    // Deliberate best effort: a failed lookup is handled like "record not stored yet".
                }

                bool changed = true;
                if (exists != null)
                {
                    jednani = jednani.Merge(exists, jednani, out changed);
                }

                // Read the documents once, after the merge, and tolerate a missing collection.
                var dokumenty = jednani.dokumenty ?? new jednani.dokument[0];

                // Run OCR for every document that has no plain text yet.
                Devmasters.Batch.Manager.DoActionForAll <jednani.dokument>(dokumenty,
                                                                           d =>
                {
                    if (string.IsNullOrWhiteSpace(d.DocumentPlainText))
                    {
                        Console.WriteLine("OCR for " + d.DocumentUrl);
                        var ocrRes = HlidacStatu.Lib.OCR.Api.Client.TextFromUrl(
                            Devmasters.Config.GetWebConfigValue("OCRServerApiKey"),
                            new Uri(d.DocumentUrl),
                            "Vybory-PSP-parser", HlidacStatu.Lib.OCR.Api.Client.TaskPriority.High,
                            HlidacStatu.Lib.OCR.Api.Client.MiningIntensity.Maximum,
                            null, TimeSpan.FromMinutes(120));

                        if (ocrRes.IsValid == HlidacStatu.Lib.OCR.Api.Result.ResultStatus.Valid)
                        {
                            d.DocumentPlainText = ocrRes.MergedDocuments().Text;
                            changed             = true; // new text must be persisted
                        }
                        else
                        {
                            Console.WriteLine($"Invalid OCR {d.DocumentUrl} - {ocrRes.Error}");
                        }
                    }
                    return(new Devmasters.Batch.ActionOutputData());
                }, null, null, !System.Diagnostics.Debugger.IsAttached, prefix: $"Jednani {cisloJednani} OCR:");

                // Build the meeting minutes from all documents whose type mentions "zápis".
                string jedn   = "zápis";
                var    zapisy = dokumenty
                                .Where(m => m.typ != null && m.typ.ToLower().Contains(jedn))
                                .Select(m => m.DocumentPlainText)
                                .ToList();
                if (zapisy.Count > 0)
                {
                    // A real newline separator; the verbatim literal @"\n" would insert a backslash followed by 'n'.
                    jednani.zapisJednani = string.Join("\n", zapisy);
                }

                string id = "";
                if (changed)
                {
                    id = dsc.AddItemToDataset(datasetname, jednani, DatasetConnector.AddItemMode.Rewrite).Result;
                }
                Console.WriteLine($"Saved vybor {jednani.vybor} jednani {jednani.Id} id {id}");
                return(new Devmasters.Batch.ActionOutputData());
            }, null, null, !System.Diagnostics.Debugger.IsAttached, maxDegreeOfParallelism: 5, prefix: "cisloJednani: ");
        }
Exemplo n.º 2
0
        /// <summary>
        /// Collects the agenda identifiers for every year from <paramref name="fromYear"/> up to the current
        /// year, parses each agenda and stores every parsed item that is either missing from the dataset or
        /// whose related-collection counts differ from the stored record.
        /// </summary>
        /// <param name="dsc">Dataset connector used to look up and to save items.</param>
        /// <param name="fromYear">First year to process; 2007 is used when null.</param>
        /// <remarks>
        /// A stored record counts as outdated only when the number of entries in <c>dokumenty</c>,
        /// <c>veklep</c>, <c>meni</c>, <c>zmeneno</c>, <c>rusi</c> or <c>zruseno</c> differs; the contents of
        /// these collections are not compared.
        /// </remarks>
        public static void DownloadAllData(DatasetConnector dsc, int? fromYear = null)
        {
            int ifromYear = fromYear ?? 2007;
            // Clamp the count: Enumerable.Range throws on a negative count, which happened for a start year
            // more than one year in the future. With the clamp such a call simply processes nothing.
            int yearCount = Math.Max(0, DateTime.Now.Year - ifromYear + 1);
            var years     = Enumerable.Range(ifromYear, yearCount);
            int totalSave = 0;

            List <string> agendy     = new List <string>();
            object        agendyLock = new object();

            Devmasters.Batch.Manager.DoActionForAll(years.Reverse(),
                                                    (y) =>
            {
                var agendyRoku = AgendaList(y).OrderBy(o => o).ToList();
                // List<T> is not safe for concurrent writers. NOTE(review): guarded because the call passes
                // maxDegreeOfParallelism; confirm whether DoActionForAll really runs callbacks concurrently here.
                lock (agendyLock)
                {
                    agendy.AddRange(agendyRoku);
                }
                return(new Devmasters.Batch.ActionOutputData());
            }
                                                    , Devmasters.Batch.Manager.DefaultOutputWriter
                                                    , new Devmasters.Batch.ActionProgressWriter(0.1f).Write
                                                    , false
                                                    , maxDegreeOfParallelism: 5
                                                    );

            Devmasters.Batch.Manager.DoActionForAll(agendy,
                                                    (ag) =>
            {
                Console.WriteLine("Getting " + ag + " ");
                var jednaniArr = ParseAgenda(ag);

                Console.WriteLine("SAVING " + ag + " ");

                foreach (var j in jednaniArr.OrderBy(m => m.Id))
                {
                    jednani exists = null;
                    try
                    {
                        Console.Write(j.Id + " L");
                        exists = dsc.GetItemFromDataset <jednani>(datasetname, j.Id).Result;
                    }
                    catch (Exception)
                    {
                        // Deliberate best effort: a failed lookup is handled like "item not stored yet".
                    }

                    // Save new items, and stored items whose collection sizes no longer match.
                    bool save = exists == null
                                || (exists.dokumenty?.Count() ?? 0) != (j.dokumenty?.Count() ?? 0)
                                || (exists.veklep?.Count() ?? 0) != (j.veklep?.Count() ?? 0)
                                || (exists.meni?.Count() ?? 0) != (j.meni?.Count() ?? 0)
                                || (exists.zmeneno?.Count() ?? 0) != (j.zmeneno?.Count() ?? 0)
                                || (exists.rusi?.Count() ?? 0) != (j.rusi?.Count() ?? 0)
                                || (exists.zruseno?.Count() ?? 0) != (j.zruseno?.Count() ?? 0);

                    if (save)
                    {
                        // .Result blocks until the save has finished; the returned id is not used afterwards.
                        string id = dsc.AddItemToDataset(datasetname, j, DatasetConnector.AddItemMode.Rewrite).Result;
                        // Atomic increment so the total stays correct even if callbacks overlap.
                        System.Threading.Interlocked.Increment(ref totalSave);
                        Console.Write("S");
                    }
                    Console.WriteLine(".");
                }
                return(new Devmasters.Batch.ActionOutputData());
            }
                                                    , null //Devmasters.Core.Batch.Manager.DefaultOutputWriter
                                                    , null //new Devmasters.Core.Batch.ActionProgressWriter(0.1f).Write
                                                    , false
                                                    , maxDegreeOfParallelism: 5, prefix: "AGENDY: "
                                                    );

            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine("TOtal saved: " + totalSave);
        }