Пример #1
0
        /// <summary>
        /// Builds connection settings for the Elasticsearch nodes listed in the
        /// "ESConnection" configuration value (URLs separated by ';').
        /// </summary>
        /// <param name="indexName">Index registered as the default index of the returned settings.</param>
        /// <param name="timeOut">Request timeout in milliseconds.</param>
        /// <param name="connectionLimit">Optional connection limit; applied only when a value is supplied.</param>
        /// <returns>The configured settings object.</returns>
        public static ConnectionSettings GetElasticSearchConnectionSettings(string indexName, int timeOut = 60000, int? connectionLimit = null)
        {
            // Blank entries (e.g. produced by a trailing ';') are skipped.
            string[] configuredUrls = Devmasters.Config.GetWebConfigValue("ESConnection").Split(';');
            var nodeUris = configuredUrls
                           .Where(candidate => !string.IsNullOrWhiteSpace(candidate))
                           .Select(candidate => new Uri(candidate));
            var nodePool = new Elasticsearch.Net.StaticConnectionPool(nodeUris);

            var result = new ConnectionSettings(nodePool)
                         .DefaultIndex(indexName)
                         .DisableAutomaticProxyDetection(false)
                         .RequestTimeout(TimeSpan.FromMilliseconds(timeOut))
                         .SniffLifeSpan(null)
                         .OnRequestCompleted(call =>
            {
                // Trace every completed request; the body is appended only when it is available.
                string traceLine = $"{call.HttpMethod}\t{call.Uri}\t";
                if (call.RequestBodyInBytes != null)
                {
                    traceLine += $"{Encoding.UTF8.GetString(call.RequestBodyInBytes)}";
                }

                ESTraceLogger.Debug(traceLine);
            });

            // Direct streaming is switched off when a debugger is attached, when a trace logger
            // exists, or when the "ESDebugDataEnabled" configuration value is "true".
            // NOTE(review): presumably this is what makes RequestBodyInBytes available above - confirm.
            bool keepRequestBodies = System.Diagnostics.Debugger.IsAttached
                                     || ESTraceLoggerExists
                                     || Devmasters.Config.GetWebConfigValue("ESDebugDataEnabled") == "true";
            if (keepRequestBodies)
            {
                result = result.DisableDirectStreaming();
            }

            if (connectionLimit.HasValue)
            {
                result = result.ConnectionLimit(connectionLimit.Value);
            }

            return result;
        }
Пример #2
0
        /// <summary>
        /// Makes sure <c>{name}.xml</c> is present and reasonably fresh, deserializes it and
        /// writes every subject that has at least one beneficial owner into the dataset
        /// when it is new, forced, or different from the stored version.
        /// </summary>
        /// <param name="args">Command-line arguments; "/uselocal" suppresses re-downloading an existing file.</param>
        /// <param name="name">Base name of the XML file (without the ".xml" extension).</param>
        private static void ProcessXML(Devmasters.Args args, string name)
        {
            logger.Debug($"Starting {name}.xml");

            string xmlPath = name + ".xml";
            if (!System.IO.File.Exists(xmlPath))
            {
                // Nothing local yet: always download.
                logger.Debug($"downloading new {name}.xml");
                Console.WriteLine($"Downloading {name}");
                DownloadFile(name);
            }
            else if (!args.Exists("/uselocal")
                     && (force || (DateTime.Now - new System.IO.FileInfo(xmlPath).LastWriteTime).TotalDays > 4))
            {
                // A local copy exists but is forced or older than four days: refresh it.
                logger.Debug($"downloading new {name}.xml");
                Console.WriteLine($"Downloading new {name}");
                DownloadFile(name);
            }

            // The download may not have produced a file; there is nothing to process then.
            if (!System.IO.File.Exists(xmlPath))
            {
                return;
            }

            Console.WriteLine($"Deserializing {name}");
            logger.Debug($"Deserializing {name}.xml");

            rawXML parsed;
            using (var reader = new System.IO.StreamReader(xmlPath))
            {
                parsed = (rawXML)new XmlSerializer(typeof(rawXML)).Deserialize(reader);
            }

            Console.WriteLine($"{parsed.Subjekt?.Count()} subjects");

            Devmasters.Batch.Manager.DoActionForAll <xmlSubjekt>(
                parsed.Subjekt,
                subj =>
            {
                majitele item = majitele.GetMajitele(subj);

                // Subjects without any owner are not stored at all.
                if (item == null || !(item.skutecni_majitele?.Count() > 0))
                {
                    return new Devmasters.Batch.ActionOutputData();
                }

                bool mustWrite;
                if (!ds.ItemExists(item.ico) || force)
                {
                    mustWrite = true;
                }
                else
                {
                    // The item is already stored: rewrite it only when the owners differ.
                    var stored = ds.GetItem(item.ico);
                    if (stored == null)
                    {
                        mustWrite = false;
                    }
                    else
                    {
                        int? storedCount = stored.skutecni_majitele?.Count();
                        int? freshCount  = item.skutecni_majitele?.Count();

                        bool unchanged = storedCount == freshCount;
                        if (unchanged && freshCount > 0)
                        {
                            // Every fresh owner must have a field-by-field identical stored owner
                            // that also carries a non-empty osobaId.
                            unchanged = item.skutecni_majitele.All(sm =>
                                stored.skutecni_majitele.Any(m =>
                                    m.osoba_jmeno == sm.osoba_jmeno &&
                                    m.osoba_prijmeni == sm.osoba_prijmeni &&
                                    m.osoba_datum_narozeni == sm.osoba_datum_narozeni &&
                                    m.osoba_titul_pred == sm.osoba_titul_pred &&
                                    m.osoba_titul_za == sm.osoba_titul_za &&
                                    m.adresa_cast_obce == sm.adresa_cast_obce &&
                                    m.adresa_cislo_ev == sm.adresa_cislo_ev &&
                                    m.adresa_cislo_or == sm.adresa_cislo_or &&
                                    m.adresa_cislo_po == sm.adresa_cislo_po &&
                                    m.adresa_obec == sm.adresa_obec &&
                                    m.adresa_okres == sm.adresa_okres &&
                                    m.adresa_psc == sm.adresa_psc &&
                                    m.adresa_stat_nazev == sm.adresa_stat_nazev &&
                                    m.adresa_text == sm.adresa_text &&
                                    m.adresa_ulice == sm.adresa_ulice &&
                                    m.slovni_vyjadreni == sm.slovni_vyjadreni &&
                                    m.podil == sm.podil &&
                                    m.postaveni == sm.postaveni &&
                                    !string.IsNullOrEmpty(m.osobaId)));
                        }

                        mustWrite = !unchanged;
                    }
                }

                if (mustWrite)
                {
                    item.UpdateOsobaId();
                    ds.AddOrUpdateItem(item, HlidacStatu.Api.V2.Dataset.Typed.ItemInsertMode.rewrite);
                }

                return new Devmasters.Batch.ActionOutputData();
            },
                Devmasters.Batch.Manager.DefaultOutputWriter,
                Devmasters.Batch.Manager.DefaultProgressWriter,
                !System.Diagnostics.Debugger.IsAttached,
                maxDegreeOfParallelism: 4, prefix: $"{name} ITEMS ");
        }
Пример #3
0
        /// <summary>
        /// Processes the videos of one person: resolves the list of video URLs, downloads the
        /// audio of each video as MP3 when it is missing, requests a speech-to-text task when no
        /// transcript file exists, imports a finished ".ctm" transcript, and stores changed records.
        /// </summary>
        /// <param name="o">Person the videos belong to; its NameId is written to new records.</param>
        /// <param name="playlist">Playlist passed to youtube-dl when <paramref name="vids"/> is empty.</param>
        /// <param name="threads">Degree of parallelism passed to the batch manager.</param>
        /// <param name="max">Value of youtube-dl's --playlist-end option.</param>
        /// <param name="vids">Optional explicit video ids; when given, the playlist is not queried.</param>
        /// <param name="mp3path">Base directory of the audio and transcript files.</param>
        public static void Process(osoba o, string playlist, int threads, int max, string[] vids, string mp3path)
        {
            logger.Info($"Starting {o.Jmeno} {o.Prijmeni} {o.NameId} for {playlist} ");

            IEnumerable <string> videoIds;
            if (vids?.Count() > 0)
            {
                videoIds = vids;
            }
            else
            {
                // No explicit ids: let youtube-dl print the ids of the playlist, one per line.
                var listInfo = new System.Diagnostics.ProcessStartInfo(
                    "youtube-dl",
                    $"--flat-playlist --get-id --playlist-end {max} " + playlist);
                var listRunner = new Devmasters.ProcessExecutor(listInfo, 60 * 6 * 24);
                logger.Info($"Starting Youtube-dl playlist video list ");
                listRunner.Start();

                videoIds = listRunner.StandardOutput
                           .Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
            }

            List <string> videos = videoIds
                                   .Select(m => "https://www.youtube.com/watch?v=" + m)
                                   .ToList();

            Console.WriteLine();
            Console.WriteLine($"Processing {videos.Count} videos");
            for (int blank = 0; blank < 4; blank++)
            {
                Console.WriteLine();
            }

            Devmasters.Batch.Manager.DoActionForAll(
                videos,
                vid =>
            {
                string recId = record.UniqueID(vid);

                record rec;
                bool dirty;
                if (!Program.api2.ItemExists(recId))
                {
                    // Unknown video: build a fresh record from the video metadata.
                    rec = YTDL.GetVideoInfo(vid);
                    if (rec == null)
                    {
                        return new Devmasters.Batch.ActionOutputData();
                    }

                    rec.osobaid = o.NameId;
                    dirty       = true;
                }
                else
                {
                    rec   = Program.api2.GetItem(recId);
                    dirty = false;
                }

                string fnFile     = $"{mp3path}\\{DataSetId}\\{recId}";
                string audioFile  = $"{fnFile}.mp3";
                string rawS2tFile = $"{fnFile}.mp3.raw_s2t";
                string ctmFile    = $"{fnFile}.ctm";

                if (!System.IO.File.Exists(audioFile))
                {
                    // Audio is missing: extract it with youtube-dl.
                    var downloadInfo = new System.Diagnostics.ProcessStartInfo(
                        "youtube-dl.exe",
                        $"--no-progress --extract-audio --audio-format mp3 --postprocessor-args \" -ac 1 -ar 16000\" -o \"{fnFile}.%(ext)s\" " + vid);
                    var downloader = new Devmasters.ProcessExecutor(downloadInfo, 60 * 6 * 24);
                    downloader.StandardOutputDataReceived += (ox, e) => { logger.Debug(e.Data); };

                    logger.Info($"Starting Youtube-dl for {vid} ");
                    downloader.Start();
                }

                bool transcriptOnDisk = System.IO.File.Exists(rawS2tFile) || System.IO.File.Exists(ctmFile);
                if (!transcriptOnDisk)
                {
                    if (rec.prepisAudia == null)
                    {
                        // No transcript anywhere yet: queue a voice-to-text task for this record.
                        using (var net = new Devmasters.Net.HttpClient.URLContent(
                                   $"https://www.hlidacstatu.cz/api/v2/internalq/Voice2TextNewTask/{DataSetId}/{recId}?priority=2"))
                        {
                            net.Method = Devmasters.Net.HttpClient.MethodEnum.POST;
                            net.RequestParams.Headers.Add("Authorization", System.Configuration.ConfigurationManager.AppSettings["apikey"]);
                            net.GetContent();
                        }
                    }
                }
                else if (!(rec.prepisAudia?.Count() > 0) && System.IO.File.Exists(ctmFile))
                {
                    // A ".ctm" transcript exists but the record has no transcript blocks: import it.
                    var tt     = new KaldiASR.SpeechToText.VoiceToTerms(System.IO.File.ReadAllText(ctmFile));
                    var blocks = new Devmasters.SpeechToText.VoiceToTextFormatter(tt.Terms)
                                 .TextWithTimestamps(TimeSpan.FromSeconds(10), true)
                                 .Select(t => new record.Blok()
                    {
                        sekundOdZacatku = (long)t.Start.TotalSeconds,
                        text            = t.Text
                    })
                                 .ToArray();

                    // TODO: fix this eventually (the video info is fetched again just for its text).
                    var freshInfo = YTDL.GetVideoInfo(vid);
                    if (freshInfo != null)
                    {
                        rec.text = freshInfo.text + "\n\n" + new Devmasters.SpeechToText.VoiceToTextFormatter(tt.Terms).Text(true);
                    }

                    rec.prepisAudia = blocks;
                    dirty           = true;
                }

                if (dirty)
                {
                    api2.AddOrUpdateItem(rec, HlidacStatu.Api.V2.Dataset.Typed.ItemInsertMode.rewrite);
                }

                return new Devmasters.Batch.ActionOutputData();
            },
                Devmasters.Batch.Manager.DefaultOutputWriter,
                Devmasters.Batch.Manager.DefaultProgressWriter,
                !System.Diagnostics.Debugger.IsAttached,
                maxDegreeOfParallelism: threads);
        }
Пример #4
0
        /// <summary>
        /// Downloads and parses UOHS decision detail pages with consecutive numeric ids and
        /// stores each parsed decision in the dataset.
        /// </summary>
        /// <param name="datasetId">Not used by this method body.</param>
        /// <param name="startFrom">Id of the first detail page to download.</param>
        /// <param name="count">Number of consecutive page ids to process.</param>
        /// <remarks>
        /// A failure while processing one page is logged together with its URL and does not
        /// stop the remaining pages.
        /// </remarks>
        public static void ParsePages(string datasetId, int startFrom = 10000, int count = 600)
        {
            Devmasters.Batch.Manager.DoActionForAll <int>(Enumerable.Range(startFrom, count),
                                                          // original note: runs in 2 threads; be considerate and do not use more
                                                          (i) =>
            {
                string url = "";
                try
                {
                    // Download the HTML page of one UOHS decision.
                    // Each decision has its own page with a simple URL whose number keeps growing;
                    // as of 2018-09-01 the latest decision was numbered about 15500.
                    url = $"http://www.uohs.cz/cs/verejne-zakazky/sbirky-rozhodnuti/detail-{i}.html";

                    string html;
                    // WebClient is IDisposable; release it as soon as the page is downloaded.
                    using (var wc = new System.Net.WebClient())
                    {
                        wc.Encoding = System.Text.Encoding.UTF8;
                        html        = wc.DownloadString(url);
                    }

                    // Devmasters.XPath wraps the parsed page (HtmlAgilityPack) with XPath helpers;
                    // all data below is extracted through XPath queries.
                    Devmasters.XPath page = new Devmasters.XPath(html);

                    // The page does not exist, skip it.
                    if (page.GetNodeText("//head/title")?.Contains("stránka neexistuje") == true)
                    {
                        return(new Devmasters.Batch.ActionOutputData());
                    }

                    logger.Debug($"parsing {url}");

                    // The parsed values are collected in this item.
                    var item = new UOHSData();
                    item.Url = url;
                    item.Id  = i.ToString();

                    // All content lives inside this DIV; keep the prefix in one place.
                    var root = "//div[@id='content']";

                    // XPath of the value cell that belongs to the given header label of the detail table.
                    Func <string, string> detailCell = label =>
                                                       root + "//table[@id='resolution_detail']//tr//th[contains(text(),'" + label + "')]/parent::tr/td";

                    item.Cj               = page.GetNodeText(root + "//div/h1/strong[1]")?.Replace("Rozhodnutí: ", "");
                    item.SpisovaZnacka    = page.GetNodeText(root + "//div/h1/strong[2]")?.Replace("Rozhodnutí: ", "");
                    item.SoudniRozhodnuti = page.GetNodeText(root + "//div//h1/following-sibling::h2[1]");

                    item.Instance = page.GetNodeText(detailCell("Instance"));
                    item.Vec      = page.GetNodeText(detailCell("Věc"));

                    var ucastniciNode = page.GetNodes(detailCell("Účastníci") + "/ol/li");
                    List <UOHSData.Ucastnik> ucastnici = new List <UOHSData.Ucastnik>();
                    if (ucastniciNode != null)
                    {
                        foreach (var node in ucastniciNode)
                        {
                            // Convert HTML entities to text, e.g. &eacute; -> é.
                            var firmaJmeno = System.Net.WebUtility.HtmlDecode(node.InnerText);

                            // Look up the company ICO by name. The batch callback is synchronous,
                            // so the asynchronous HTTP call is awaited by blocking.
                            var ico = httpClient.GetAsync("https://www.hlidacstatu.cz/api/v2/firmy/" + System.Net.WebUtility.UrlEncode(firmaJmeno))
                                      .Result.Content
                                      .ReadAsStringAsync().Result;

                            var ucastnik = new UOHSData.Ucastnik()
                            {
                                Jmeno = firmaJmeno
                            };
                            try
                            {
                                var icoToken = Newtonsoft.Json.Linq.JObject.Parse(ico)["ico"];
                                if (icoToken != null)
                                {
                                    ucastnik.ICO = icoToken.Value <string>();
                                }
                            }
                            catch (Exception)
                            {
                                // Best effort: an unparsable lookup response keeps the participant
                                // with its name only.
                            }
                            ucastnici.Add(ucastnik);
                        }
                    }
                    item.Ucastnici = ucastnici.ToArray();

                    item.Typ_spravniho_rizeni = page.GetNodeText(detailCell("Typ správního řízení"));
                    item.Typ_rozhodnuti       = page.GetNodeText(detailCell("Typ rozhodnutí"));
                    item.Rok = page.GetNodeText(detailCell("Rok"));

                    item.PravniMoc = ToDateTimeFromCZ(page.GetNodeText(detailCell("Datum nabytí právní moci")));

                    var souvis_urls = page.GetNodes(detailCell("Související rozhodnutí") + "/a");
                    if (souvis_urls != null)
                    {
                        item.SouvisejiciUrl = souvis_urls
                                              .Select(m => m.Attributes["href"]?.Value)
                                              .Where(m => m != null)
                                              .Select(u => "http://www.uohs.cz" + u)
                                              .ToArray();
                    }

                    item.Rozhodnuti = new UOHSData.Dokument();

                    item.Rozhodnuti.Url = page.GetNode(detailCell("Dokumenty") + "/a")
                                          ?.Attributes["href"]?.Value;
                    if (!string.IsNullOrEmpty(item.Rozhodnuti.Url))
                    {
                        // The href is site-relative; prefix the document's own link with the host
                        // (the array of related URLs was concatenated here before by mistake).
                        item.Rozhodnuti.Url = "http://www.uohs.cz" + item.Rozhodnuti.Url;
                    }

                    item.Rozhodnuti.PlainText = page.GetNode("//div[@id='content']//div[@class='res_text']")?.InnerText ?? "";

                    // Parsing is done, store the record in the dataset.
                    logger.Debug($"adding item {item.Id} - {item.Url}");

                    ds.AddOrUpdateItem(item, HlidacStatu.Api.V2.Dataset.Typed.ItemInsertMode.rewrite);
                }
                catch (Exception e)
                {
                    logger.Error(url, e);
                }

                return(new Devmasters.Batch.ActionOutputData());
            },
                                                          outputWriter.OutputWriter, progressWriter.ProgressWriter,
                                                          !System.Diagnostics.Debugger.IsAttached
                                                          );
        }
Пример #5
0
        /// <summary>
        /// Registers an OCR task for the document at <paramref name="url"/> by calling the
        /// "addTask.ashx" endpoint and then returns the outcome of <c>WaitingForResult</c> for
        /// the created task.
        /// </summary>
        /// <param name="apikey">API key sent with the request and used when waiting for the result.</param>
        /// <param name="url">Address of the document to process.</param>
        /// <param name="client">Client identifier sent to the API; null is sent as an empty value.</param>
        /// <param name="priority">Task priority sent to the API.</param>
        /// <param name="intensity">Mining intensity, sent as its integer value.</param>
        /// <param name="origFilename">Original file name; derived from the URL path when null or empty.</param>
        /// <param name="maxWaitingTime">Maximum waiting time; <c>defaultWaitingTime</c> is used when null.</param>
        /// <param name="restartTaskAfterTime">Optional restart interval; 0 seconds is sent when null.</param>
        /// <returns>
        /// The task result, or an invalid <see cref="Result"/> carrying the API error text when
        /// the API response contains no "taskid".
        /// </returns>
        /// <exception cref="ApiException">Wraps any exception raised while calling the API.</exception>
        public static async Task <Result> TextFromUrlAsync_old(string apikey, Uri url, string client, int priority,
                                                               MiningIntensity intensity, string origFilename = null, TimeSpan?maxWaitingTime = null,
                                                               TimeSpan?restartTaskAfterTime = null /*, Api.CallbackData callBackData = null*/)
        {
            string fullUrl = null;
            string taskId  = null;

            Api.CallbackData callBackData = null; // callbacks are temporarily disabled
            string           res          = "";

            try
            {
                if (string.IsNullOrEmpty(origFilename))
                {
                    origFilename = Lib.OCR.DocTools.GetFilename(url.LocalPath);
                }

                TimeSpan?waitTime = maxWaitingTime;
                if (waitTime == null && callBackData != null)
                {
                    waitTime = TimeSpan.FromDays(14);
                }
                else if (waitTime == null)
                {
                    waitTime = defaultWaitingTime;
                }

                string callBackDataString = "";
                if (callBackData != null)
                {
                    callBackDataString = Newtonsoft.Json.JsonConvert.SerializeObject(callBackData);
                }

                using (WebOcr wc = new WebOcr())
                {
                    // Every textual value is URL-encoded so that reserved characters cannot
                    // break the query string (the API key was previously sent raw).
                    string param = "url=" + System.Net.WebUtility.UrlEncode(url.AbsoluteUri)
                                   + "&apikey=" + System.Net.WebUtility.UrlEncode(apikey)
                                   + "&fn=" + System.Net.WebUtility.UrlEncode(origFilename ?? "")
                                   + "&client=" + System.Net.WebUtility.UrlEncode(client ?? "")
                                   + "&priority=" + priority
                                   + "&intensity=" + (int)intensity
                                   + "&expirationIn=" + (int)(waitTime.Value.TotalSeconds * 1.05) // add 5%
                                   + "&restartIn=" + (int)(restartTaskAfterTime?.TotalSeconds ?? 0)
                                   + "&callbackData=" + System.Net.WebUtility.UrlEncode(callBackDataString);

                    logger.Debug($"TextFromUrlAsync calling OCR API for {url.AbsoluteUri} ");

                    fullUrl = ApiUrl + "addTask.ashx?" + param;
                    byte[] resbyte = await wc.DownloadDataTaskAsync(fullUrl);

                    res = System.Text.Encoding.UTF8.GetString(resbyte);
                    Newtonsoft.Json.Linq.JToken json = Newtonsoft.Json.Linq.JToken.Parse(res);

                    if (json["taskid"] != null)
                    {
                        taskId = json["taskid"].ToString();
                    }
                    else
                    {
                        logger.Error($"ExtApi.TextFromUrlAsync API Exception\nUrl:{url.AbsoluteUri}\n content: " + res);
                        return new Result()
                        {
                            Id      = taskId,
                            IsValid = Result.ResultStatus.Invalid,
                            // A response without "taskid" may lack "error" as well.
                            Error   = json["error"]?.Value <string>()
                        };
                    }
                    logger.Debug($"TextFromUrlAsync called OCR API taskid:{taskId} for {url.AbsoluteUri} ");
                }

                if (callBackData == null)
                {
                    return WaitingForResult(apikey, taskId, maxWaitingTime ?? defaultWaitingTime);
                }

                return new Result()
                {
                    Id      = taskId,
                    IsValid = Result.ResultStatus.InQueueWithCallback
                };
            }
            catch (System.Net.WebException e)
            {
                logger.Debug($"called ext API TextFromFile {fullUrl}.\nResponse: {res}\n" + ApiUrl, e);
                throw new ApiException("called ext API ", e);
            }
            catch (Exception e)
            {
                logger.Error($"exception API TextFromFile {fullUrl}.\nResponse: {res}\n" + ApiUrl, e);
                throw new ApiException("exception API TextFromFile  ", e);
            }
        }
Пример #6
0
        /// <summary>
        /// Writes the measured elapsed time to the logger at the configured priority level.
        /// Nothing is written unless the elapsed time exceeds <c>slowLoggerThreshold</c>.
        /// </summary>
        /// <remarks>
        /// With a web context that has a request, a structured message (page, parameters,
        /// elapsed time and, when known, the user name) is logged; otherwise the formatted
        /// <c>textTemplate</c> is logged. Without a web context an additional " TOO SLOW"
        /// error is written, unless the entry is already logged at Error or Fatal level.
        /// </remarks>
        private void Log()
        {
            if (this.ExactElapsedMiliseconds <= slowLoggerThreshold)
            {
                return;
            }

            Devmasters.Logging.LogMessage msg = null;
            if (context != null)
            {
                if (context.Request != null)
                {
                    msg = new Devmasters.Logging.LogMessage();
                    //<conversionPattern value="%date|%property{page}|%property{params}|%property{user}|%property{elapsedtime}" />
                    msg.SetCustomKeyValue("web_page", context.Request.Url.AbsolutePath);
                    msg.SetCustomKeyValue("web_params", FormatParams(context));
                    msg.SetCustomKeyValue("web_elapsedtime", this.ExactElapsedMiliseconds);

                    if (context.User != null && context.User.Identity != null && context.User.Identity.Name != null)
                    {
                        msg.SetCustomKeyValue("web_user", context.User.Identity.Name);
                    }
                }
            }
            else
            {
                // A measurement that is still running or never ran has nothing to report.
                if (this.IsRunning || this.ElapsedTicks == 0)
                {
                    return;
                }

                // Both inequalities must hold. The former non-short-circuit '|' made this
                // condition always true, so the extra error was also written for entries
                // that are logged at Error/Fatal level anyway.
                bool belowErrorLevel = level != Devmasters.Logging.PriorityLevel.Fatal
                                       && level != Devmasters.Logging.PriorityLevel.Error;
                if (belowErrorLevel &&
                    this.ExactElapsedMiliseconds > slowLoggerThreshold && slowLoggerThreshold > 0)
                {
                    logger.Error(string.Format(textTemplate + " TOO SLOW", this.ExactElapsedMiliseconds));
                }
            }

            switch (level)
            {
            case Devmasters.Logging.PriorityLevel.Debug:
                if (msg != null)
                {
                    logger.Debug(msg);
                }
                else
                {
                    logger.Debug(string.Format(textTemplate, this.ExactElapsedMiliseconds));
                }
                break;

            case Devmasters.Logging.PriorityLevel.Information:
                if (msg != null)
                {
                    logger.Info(msg);
                }
                else
                {
                    logger.Info(string.Format(textTemplate, this.ExactElapsedMiliseconds));
                }
                break;

            case Devmasters.Logging.PriorityLevel.Warning:
                if (msg != null)
                {
                    logger.Warning(msg);
                }
                else
                {
                    logger.Warning(string.Format(textTemplate, this.ExactElapsedMiliseconds));
                }
                break;

            case Devmasters.Logging.PriorityLevel.Error:
                if (msg != null)
                {
                    logger.Error(msg);
                }
                else
                {
                    logger.Error(string.Format(textTemplate, this.ExactElapsedMiliseconds));
                }
                break;

            case Devmasters.Logging.PriorityLevel.Fatal:
                if (msg != null)
                {
                    logger.Fatal(msg);
                }
                else
                {
                    logger.Fatal(string.Format(textTemplate, this.ExactElapsedMiliseconds));
                }
                break;
            }
        }