/// <summary>
        /// Consumer loop: dequeues items from <c>ForStore</c> and uploads each one to the
        /// insolvency-register dataset, retrying an item until the upload succeeds.
        /// The loop ends only when <c>ForStore</c> is empty AND the upstream work was already
        /// finished (no <c>TaskWs</c> task in state Running and <c>ForDetailRequest</c> empty),
        /// so items queued just before the upstream work finishes are still stored.
        /// </summary>
        private void StoreToHlidacStatu()
        {
            var datasetConnector = new DatasetConnector(ApiToken);
            var dataset          = InsolvencniRejstrikDataset.InsolvencniRejstrik;

            while (true)
            {
                // Sample the upstream state BEFORE trying to dequeue. If it were checked after a
                // failed dequeue, an item enqueued in between could be left behind unnoticed.
                bool upstreamActive = TaskWs.Any(t => t.Status == TaskStatus.Running) || !ForDetailRequest.IsEmpty;

                Rizeni rizeni;
                if (!ForStore.TryDequeue(out rizeni))
                {
                    if (!upstreamActive)
                    {
                        // Queue is drained and nothing can add to it any more.
                        break;
                    }

                    // Nothing to store right now; wait for more items.
                    Thread.Sleep(100);
                    continue;
                }

                // Retry until the upload succeeds; every failure is reported through AddError.
                while (true)
                {
                    try
                    {
                        datasetConnector.AddItemToDataset<Rizeni>(dataset, rizeni).Wait();
                        break;
                    }
                    catch (Exception e)
                    {
                        AddError("Store", e);
                        // Short pause so a persistent failure does not turn into a busy loop.
                        Thread.Sleep(100);
                    }
                }

                Interlocked.Increment(ref StoredItems);
                LastStoredItem = rizeni.ZahajeniRizeni;
            }
        }
        /// <summary>
        /// Brings the insolvency-register dataset on the server into the requested state:
        /// optionally deletes an existing dataset, creates it when it is (then) missing,
        /// or optionally updates it when it exists and was not deleted.
        /// </summary>
        /// <param name="removeDatasetIfExists">When true, an existing dataset is deleted and created again.</param>
        /// <param name="updateDatasetIfExists">When true, an existing (not deleted) dataset is updated.</param>
        private void PrepareDataset(bool removeDatasetIfExists = false, bool updateDatasetIfExists = false)
        {
            var connector  = new DatasetConnector(ApiToken);
            var definition = InsolvencniRejstrikDataset.InsolvencniRejstrik;

            bool present = connector.DatasetExists(definition).Result;

            // Step 1: drop the old dataset on request; from here on it counts as missing.
            if (present && removeDatasetIfExists)
            {
                Console.WriteLine("Maze se stary dataset");
                connector.DeleteDataset(definition).Wait();
                present = false;
            }

            // Step 2: a missing dataset is always created; nothing else is left to do afterwards.
            if (!present)
            {
                Console.WriteLine("Vytvari se novy dataset");
                var created = connector.CreateDataset(definition).Result;
                Console.WriteLine(" > " + created);
                return;
            }

            // Step 3: the dataset exists and was kept; update it only on request.
            if (updateDatasetIfExists)
            {
                Console.WriteLine("Aktualizuje se dataset");
                var updated = connector.UpdateDataset(definition).Result;
                Console.WriteLine(" > " + updated);
            }
        }
Esempio n. 3
0
        // Format string for a psp.cz "hp.sqw" page URL with two placeholders ({0} and {1}).
        // Presumably the committee resolutions listing, going by the field name - TODO confirm.
        // Not referenced by the code visible in this part of the file.
        static string _vyborUsneseni      = "https://www.psp.cz/sqw/hp.sqw?k={0}&kk=5&td=1&n={1}"; //{0} = committee id + 5, {1} = page

        /// <summary>
        /// Processes one committee. Reads the committee page, takes the first link found after the
        /// first heading of the main content and derives the number of the latest meeting from it.
        /// When that link points to an "hp.sqw" page, every meeting from the latest one down to 1 is
        /// parsed, extended with the committee resolutions of the same date, merged with the item
        /// already stored in the dataset, completed with OCR text and saved if anything changed.
        /// </summary>
        /// <param name="dsc">Connector used to read the stored item and to write the result.</param>
        /// <param name="vyborId">Committee id; used for the resolutions lookup and (plus 2) in the committee page URL.</param>
        /// <param name="rewrite">Not read anywhere in this method.</param>
        public static void Vybor(DatasetConnector dsc, int vyborId, bool rewrite = false)
        {
            List<_usneseni> vsechnausneseni = VsechnaUsneseniVyboru(vyborId);

            Console.WriteLine($"Vybor {vyborId}");

            int    lastJednani        = 0;
            var    xp                 = GetPage(string.Format(_vyborHP, vyborId + 2));
            // The page may contain no heading, or the heading may have no following sibling;
            // navigate null-safely so such a page is skipped instead of throwing.
            var    firstHeading       = xp.GetNodes("//div[@id='main-content']//h2")?.FirstOrDefault();
            var    xpozvanky          = firstHeading?.NextSibling?.SelectNodes(".//tr//a");
            string jednaniUrlTemplate = "";

            bool komplexni = false;

            if (xpozvanky != null && xpozvanky.Count > 0)
            {
                var link = xpozvanky[0].GetAttributeValue("href", "");
                if (link.StartsWith("hp.sqw"))
                {
                    komplexni = true;
                    // The meeting number is the value of the "cu" query parameter.
                    var tmp = GetRegexGroupValue(link, @"&cu=(?<cislo>\d*)", "cislo");
                    int.TryParse(tmp, out lastJednani);
                    if (lastJednani > 0)
                    {
                        // Turn the concrete link into a template with the meeting number as {0}.
                        jednaniUrlTemplate = _vyborJednaniHProot + link.Replace("&cu=" + lastJednani, "&cu={0}");
                    }
                }
                else
                {
                    // The number is parsed from the link text here, but nothing below
                    // processes this kind of page (only the "komplexni" variant is handled).
                    komplexni = false;
                    var tmp = GetRegexGroupValue(System.Net.WebUtility.HtmlDecode(xpozvanky[0].InnerText), @"č\. \s* (?<cislo>\d*)", "cislo");
                    int.TryParse(tmp, out lastJednani);
                }
            }

            if (komplexni && lastJednani > 0 && !string.IsNullOrEmpty(jednaniUrlTemplate))
            {
                // Parse every meeting page, from the latest meeting number down to 1.
                Devmasters.Batch.Manager.DoActionForAll <int>(Enumerable.Range(1, lastJednani).Reverse(),
                                                              (cisloJednani) =>
                {
                    var jednani = JednaniKomplexni(vyborId, cisloJednani, string.Format(jednaniUrlTemplate, cisloJednani));
                    if (jednani == null)
                    {
                        return(new Devmasters.Batch.ActionOutputData());
                    }

                    // Attach the resolutions that carry the same date as this meeting.
                    var docFromUsneseni = vsechnausneseni
                                          .Where(m => m.datum == jednani.datum)
                                          .Select(m => new jednani.dokument()
                    {
                        typ         = "Usnesení",
                        DocumentUrl = m.fileUrl,
                        popis       = m.popis,
                        jmeno       = $"usneseni_{m.cislo}.docx"
                    }).ToList();
                    if (docFromUsneseni.Count > 0)
                    {
                        jednani.dokumenty = jednani.dokumenty.Concat(docFromUsneseni).ToArray();
                    }
                    jednani.SetId();

                    // Merge with the item already stored in the dataset, if there is one.
                    jednani exists = null;
                    try
                    {
                        exists = dsc.GetItemFromDataset <jednani>(datasetname, jednani.Id).Result;
                    }
                    catch (Exception)
                    {
                        // Best effort: any failure to load is treated as "no stored item".
                    }

                    // Without a stored item the meeting is new and must be saved.
                    bool changed = true;
                    if (exists != null)
                    {
                        jednani = jednani.Merge(exists, jednani, out changed);
                    }

                    // Run OCR for every document that has no plain text yet.
                    Devmasters.Batch.Manager.DoActionForAll <jednani.dokument>(jednani.dokumenty,
                                                                               d =>
                    {
                        if (string.IsNullOrWhiteSpace(d.DocumentPlainText))
                        {
                            Console.WriteLine("OCR for " + d.DocumentUrl);
                            var ocrRes = HlidacStatu.Lib.OCR.Api.Client.TextFromUrl(
                                Devmasters.Config.GetWebConfigValue("OCRServerApiKey"),
                                new Uri(d.DocumentUrl),
                                "Vybory-PSP-parser", HlidacStatu.Lib.OCR.Api.Client.TaskPriority.High,
                                HlidacStatu.Lib.OCR.Api.Client.MiningIntensity.Maximum,
                                null, TimeSpan.FromMinutes(120));

                            if (ocrRes.IsValid == HlidacStatu.Lib.OCR.Api.Result.ResultStatus.Valid)
                            {
                                d.DocumentPlainText = ocrRes.MergedDocuments().Text;
                                // New text was obtained, so the item has to be saved.
                                changed             = true;
                            }
                            else
                            {
                                Console.WriteLine($"Invalid OCR {d.DocumentUrl} - {ocrRes.Error}");
                            }
                        }
                        return(new Devmasters.Batch.ActionOutputData());
                    }, null, null, !System.Diagnostics.Debugger.IsAttached, prefix: $"Jednani {cisloJednani} OCR:");


                    // Build the meeting minutes from the text of all documents whose type contains "zápis".
                    string jedn   = "zápis";
                    var    zapisy = jednani.dokumenty.Where(m => m.typ.ToLower().Contains(jedn)).ToList();
                    if (zapisy.Count > 0)
                    {
                        // A real newline separates the documents. The previous verbatim literal @"\n"
                        // inserted the two characters backslash + 'n' into the text instead.
                        jednani.zapisJednani = string.Join("\n", zapisy.Select(m => m.DocumentPlainText));
                    }


                    // id stays empty when nothing changed and therefore nothing was written.
                    string id = "";
                    if (changed)
                    {
                        id = dsc.AddItemToDataset(datasetname, jednani, DatasetConnector.AddItemMode.Rewrite).Result;
                    }
                    Console.WriteLine($"Saved vybor {jednani.vybor} jednani {jednani.Id} id {id}");
                    return(new Devmasters.Batch.ActionOutputData());
                }, null, null, !System.Diagnostics.Debugger.IsAttached, maxDegreeOfParallelism: 5, prefix: "cisloJednani: ");
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Downloads the page of one press conference, splits its transcript into statements of
        /// identified speakers, collects the audio link and the related links, and stores the
        /// record in the dataset.
        /// </summary>
        /// <param name="dsc">Connector used to store the record.</param>
        /// <param name="zap">Record to complete; its <c>url</c> is downloaded and its <c>vstupy</c>, <c>audio</c> and <c>souvisejici</c> are filled in.</param>
        /// <returns>
        /// The value returned by <c>AddItemToDataset</c>, or an empty string when the page
        /// contains no transcript paragraphs (nothing is stored in that case).
        /// </returns>
        static string ParseTiskovku(DatasetConnector dsc, zapis zap)
        {
            string html = "";

            using (var net = new Devmasters.Net.Web.URLContent(zap.url))
            {
                html = net.GetContent().Text;
            }
            var xp    = new XPath(html);
            var casti = xp.GetNodes("//div[@class='content-main']//div[@class='detail']/p");

            if (casti == null)
            {
                return(""); // not transcribed yet, go on to the next one
            }
            List<zapis.vstup> vyjadreni = new List<zapis.vstup>();
            int poradi = 1;

            zapis.vstup vyj = new zapis.vstup();
            vyj.poradi = poradi;
            // Sentinel that cannot equal a real speaker label: marks "no speaker seen yet".
            string prevMluvci = "~~~";

            foreach (var cast in casti)
            {
                string text   = System.Net.WebUtility.HtmlDecode(cast.InnerText);
                // The speaker label is the <strong> element of the paragraph, if present.
                string mluvci = XPath.Tools.GetNodeText(cast, "./strong");
                if (string.IsNullOrWhiteSpace(mluvci))
                {
                    // No speaker label: the paragraph continues the current statement.
                    vyj.text = vyj.text + "\n" + text;
                }
                else if (mluvci != prevMluvci)
                {
                    // Speaker changed: keep the finished statement only if its speaker was identified.
                    if (prevMluvci != "~~~" && !string.IsNullOrEmpty(vyj.jmeno) && !string.IsNullOrEmpty(vyj.prijmeni))
                    {
                        vyjadreni.Add(vyj);

                        poradi++;
                        vyj        = new zapis.vstup();
                        vyj.poradi = poradi;
                    }
                    // Ask the API to identify the person from the speaker label.
                    using (var net = new Devmasters.Net.Web.URLContent($"https://www.hlidacstatu.cz/api/v1/PolitikFromText?text={System.Net.WebUtility.UrlEncode(mluvci)}&Authorization={System.Configuration.ConfigurationManager.AppSettings["apikey"]}"))
                    {
                        net.Timeout = 60 * 1000;
                        var osobahtml = net.GetContent().Text;
                        var osoba     = Newtonsoft.Json.Linq.JObject.Parse(osobahtml);
                        if (!string.IsNullOrEmpty(osoba.Value <string>("jmeno")))
                        {
                            vyj.jmeno    = osoba.Value <string>("jmeno");
                            vyj.prijmeni = osoba.Value <string>("prijmeni");
                            vyj.osobaId  = osoba.Value <string>("osobaid");

                            // The names are data, not pattern text: escape them so that regex
                            // metacharacters and whitespace inside a name are matched literally.
                            var jmenoRx    = Regex.Escape(vyj.jmeno ?? "");
                            var prijmeniRx = Regex.Escape(vyj.prijmeni ?? "");

                            // What remains of the label after removing the name (in either order) and colons.
                            var info = Regex.Replace(mluvci, $@"({jmenoRx} \s {prijmeniRx} \s* ,?) | ({prijmeniRx} \s {jmenoRx} \s* ,?)", "", options)
                                       .Replace(":", "");
                            vyj.osobainfo = info.Trim();
                        }
                        else
                        {
                            vyj.osobainfo = mluvci;
                        }
                    }
                    text       = text.Replace(mluvci, "").Trim();
                    vyj.text   = text;
                    prevMluvci = mluvci;
                }
            }
            // Flush the last statement under the same "identified speaker" rule.
            if (!string.IsNullOrEmpty(vyj.text) && !string.IsNullOrEmpty(vyj.jmeno) && !string.IsNullOrEmpty(vyj.prijmeni))
            {
                vyjadreni.Add(vyj);
            }

            ////////// END of statements

            zap.vstupy = vyjadreni.ToArray();

            // Audio: the first link in a "record" block whose href ends with ".mp3".
            var mp3urls = xp.GetNodes("//div[@class='record']//a");

            if (mp3urls != null && mp3urls.Count > 0)
            {
                var ahref = mp3urls
                            .FirstOrDefault(m => m.Attributes["href"]?.Value?.ToLower()?.EndsWith(".mp3") == true);
                if (ahref != null)
                {
                    zap.audio = new zapis.odkaz()
                    {
                        url = "https://www.vlada.cz" + ahref.Attributes["href"].Value, nazev = "Zvukový záznam"
                    };
                }
            }
            ////// END of audio

            // Related links: <dt>/<dd> pairs; used only when both lists have the same length.
            var relsdt = xp.GetNodes("//dl[@class='related']/dt");
            var relsdd = xp.GetNodes("//dl[@class='related']/dd");

            if (relsdd?.Count > 0 && relsdd?.Count == relsdt?.Count)
            {
                List<zapis.odkaz> odkazy = new List<zapis.odkaz>();
                for (int i = 0; i < relsdt.Count; i++)
                {
                    var odkaz = new zapis.odkaz();
                    odkaz.nazev = System.Net.WebUtility.HtmlDecode(relsdt[i].InnerText) + ": " + System.Net.WebUtility.HtmlDecode(relsdd[i].InnerText);
                    odkaz.url   = "https://www.vlada.cz" + XPath.Tools.GetNodeAttributeValue(relsdd[i], "./a", "href");
                    odkazy.Add(odkaz);
                }
                zap.souvisejici = odkazy.ToArray();
            }

            // No statement was collected: store a single placeholder entry instead.
            if (vyjadreni.Count == 0)
            {
                if (html.Contains("<ifram") ||
                    html.Contains("jwplayer")
                    )
                {
                    zap.vstupy = new zapis.vstup[] { new zapis.vstup()
                                                     {
                                                         poradi = 1, text = "Přepis tiskové konference není dostupný, pouze videozáznam."
                                                     } };
                }
                else
                {
                    zap.vstupy = new zapis.vstup[] { new zapis.vstup()
                                                     {
                                                         poradi = 1, text = "Přepis tiskové konference není dostupný."
                                                     } };
                }
            }
            Console.WriteLine(zap.Id + " " + zap.nazev);
            zap.PrepareBeforeSave();
            var id = dsc.AddItemToDataset <zapis>(Parse.datasetname, zap, DatasetConnector.AddItemMode.Rewrite).Result;

            return(id);
        }
Esempio n. 5
0
        /// <summary>
        /// Walks the paged list of press conferences, collecting one record (date, title, URL, id)
        /// per list entry, and then parses and stores every collected record with
        /// <c>ParseTiskovku</c>.
        /// Paging stops when a page has no entries, when the last collected entry is dated on or
        /// before <paramref name="from"/>, or when the page counter shows that the last page was reached.
        /// </summary>
        /// <param name="dsc">Connector passed on to <c>ParseTiskovku</c>.</param>
        /// <param name="from">Date limit for paging; defaults to 1 January 2006 when null.</param>
        public static void DownloadAllData(DatasetConnector dsc, DateTime? from = null)
        {
            from = from ?? new DateTime(2006, 1, 1);

            List<zapis> tiskovky = new List<zapis>();

            int  page     = 1;
            bool nextPage = false;

            do
            {
                nextPage = false;
                string url = string.Format(Program.startUrl, page);
                Console.WriteLine($"Page {page}");

                string html = "";
                using (var net = new Devmasters.Net.Web.URLContent(url))
                {
                    html = net.GetContent().Text;
                }
                var xp    = new XPath(html);
                var items = xp.GetNodes("//div[@class='record-offset']//div[@class='record']");

                // A page without any record ends the paging. Without this guard the loop below
                // (or the Last() call after it) would throw and nothing collected so far would be parsed.
                if (items == null || items.Count == 0)
                {
                    break;
                }

                foreach (var item in items)
                {
                    zapis zap = new zapis();
                    zap.datum = DateTime.ParseExact(
                        XPath.Tools.GetNodeText(item, "./p[@class='info']"),
                        "d. M. yyyy",
                        System.Globalization.CultureInfo.GetCultureInfo("en-US"), System.Globalization.DateTimeStyles.AssumeLocal);
                    zap.nazev = XPath.Tools.GetNodeText(item, "./h2/a");
                    zap.url   = "https://www.vlada.cz" + XPath.Tools.GetNodeAttributeValue(item, "./h2/a", "href");

                    //https://www.vlada.cz/cz/media-centrum/tiskove-konference/videozaznam-z-tiskove-konference-predsedy-vlady-cr-mirka-topolanka-s-predsedou-vlady-sr-robertem-ficem-19001/
                    // The id is the page id from the URL: the last integer before the trailing slash.
                    zap.Id = Regex.Match(zap.url, @"\w* - (?<id>\d{1,6}) / $", options).Groups["id"].Value;

                    tiskovky.Add(zap);
                }

                // The last collected entry is already at or before the limit: stop paging.
                if (tiskovky.Last().datum <= from)
                {
                    break;
                }


                // Counter text example: "Dokumenty 1301 až 1305 z 1305".
                // Another page exists while the "to" number differs from the total.
                var counter = xp.GetNodeText("//p[@class='counter']");
                var m       = Regex.Match(counter, @"\d* \s* až \s* (?<to>\d{1,4})\s* z \s* (?<z>\d{1,4})", options);
                if (m.Success)
                {
                    var to = m.Groups["to"].Value;
                    var z  = m.Groups["z"].Value;
                    nextPage = to != z;
                }

                page++;
            } while (nextPage);

            // Parse and store every collected record; a failure of one record is only logged.
            Devmasters.Core.Batch.Manager.DoActionForAll <zapis>(tiskovky,
                                                                 (zap) =>
            {
                try
                {
                    ParseTiskovku(dsc, zap);
                }
                catch (Exception e)
                {
                    Console.WriteLine(e.ToString());
                }
                return(new Devmasters.Core.Batch.ActionOutputData());
            }
                                                                 , Devmasters.Core.Batch.Manager.DefaultOutputWriter
                                                                 , new Devmasters.Core.Batch.ActionProgressWriter(0.1f).Write
                                                                 , !System.Diagnostics.Debugger.IsAttached, 20
                                                                 );
        }
Esempio n. 6
0
        /// <summary>
        /// Collects the agenda identifiers for every year from <paramref name="fromYear"/> to the
        /// current year, parses each agenda into meeting items and stores an item when it is not in
        /// the dataset yet or when the size of one of its compared collections differs from the
        /// stored item. Prints the number of stored items at the end.
        /// </summary>
        /// <param name="dsc">Connector used to read stored items and to write new or changed ones.</param>
        /// <param name="fromYear">First year to process; defaults to 2007 when null. A year later than the current one yields no work.</param>
        public static void DownloadAllData(DatasetConnector dsc, int? fromYear = null)
        {
            int ifromYear = fromYear ?? 2007;
            // Enumerable.Range throws for a negative count, which a start year in the future would produce.
            int yearCount = Math.Max(0, DateTime.Now.Year - ifromYear + 1);
            var years     = Enumerable.Range(ifromYear, yearCount);
            int totalSave = 0;

            List<string> agendy     = new List<string>();
            object       agendyLock = new object();

            // Both batch calls request maxDegreeOfParallelism: 5, so the callbacks may be invoked
            // concurrently; the shared list and the counter are therefore updated under synchronization.
            Devmasters.Batch.Manager.DoActionForAll(years.Reverse(),
                                                    (y) =>
            {
                var agendyRoku = AgendaList(y).OrderBy(o => o).ToList();
                lock (agendyLock)
                {
                    agendy.AddRange(agendyRoku);
                }
                return(new Devmasters.Batch.ActionOutputData());
            }
                                                    , Devmasters.Batch.Manager.DefaultOutputWriter
                                                    , new Devmasters.Batch.ActionProgressWriter(0.1f).Write
                                                    , false
                                                    , maxDegreeOfParallelism: 5
                                                    );

            Devmasters.Batch.Manager.DoActionForAll(agendy,
                                                    (ag) =>
            {
                Console.WriteLine("Getting " + ag + " ");
                var jednaniArr = ParseAgenda(ag);

                Console.WriteLine("SAVING " + ag + " ");

                foreach (var j in jednaniArr.OrderBy(m => m.Id))
                {
                    jednani exists = null;
                    try
                    {
                        Console.Write(j.Id + " L");
                        exists = dsc.GetItemFromDataset <jednani>(datasetname, j.Id).Result;
                    }
                    catch (Exception)
                    {
                        // Best effort: any failure to load is treated as "not stored yet".
                    }

                    if (exists == null)
                    {
                        dsc.AddItemToDataset(datasetname, j, DatasetConnector.AddItemMode.Rewrite).Wait();
                        System.Threading.Interlocked.Increment(ref totalSave);
                        Console.Write("S");
                    }
                    else
                    {
                        // The stored item is replaced when the element count of any compared
                        // collection differs; a null collection counts as empty.
                        bool replace =
                            (exists.dokumenty?.Count() ?? 0) != (j.dokumenty?.Count() ?? 0)
                            || (exists.veklep?.Count() ?? 0) != (j.veklep?.Count() ?? 0)
                            || (exists.meni?.Count() ?? 0) != (j.meni?.Count() ?? 0)
                            || (exists.zmeneno?.Count() ?? 0) != (j.zmeneno?.Count() ?? 0)
                            || (exists.rusi?.Count() ?? 0) != (j.rusi?.Count() ?? 0)
                            || (exists.zruseno?.Count() ?? 0) != (j.zruseno?.Count() ?? 0);

                        if (replace)
                        {
                            dsc.AddItemToDataset(datasetname, j, DatasetConnector.AddItemMode.Rewrite).Wait();
                            Console.Write("S");
                            System.Threading.Interlocked.Increment(ref totalSave);
                        }
                    }
                    Console.WriteLine(".");
                }
                return(new Devmasters.Batch.ActionOutputData());
            }
                                                    , null // no output writer
                                                    , null // no progress writer
                                                    , false
                                                    , maxDegreeOfParallelism: 5, prefix: "AGENDY: "
                                                    );

            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine("TOtal saved: " + totalSave);
        }