/// <summary>
/// Logs a failed Elasticsearch search response to <c>ESLogger</c>, together with details of the
/// web request (URL, referer, browser, IP) and the current stack trace.
/// </summary>
/// <typeparam name="T">Document type of the search response.</typeparam>
/// <param name="esReq">Search response to report; its server error and debug information are appended to the message. May be null.</param>
/// <param name="text">Free-form description placed at the start of the message.</param>
/// <param name="httpContext">Optional HTTP context the request details are read from.</param>
/// <param name="ex">Optional exception attached to the log entry.</param>
public static void LogQueryError<T>(Nest.ISearchResponse<T> esReq, string text = "", System.Web.HttpContextBase httpContext = null, Exception ex = null)
    where T : class
{
    // This is an error-reporting helper: it must not throw on its own when no response is available.
    Elasticsearch.Net.ServerError serverErr = esReq?.ServerError;

    System.Web.HttpRequestBase request = httpContext?.Request;

    // Read the host name from the same request as every other value; fall back to the ambient
    // context only when the caller did not supply one (or it has no host name).
    string hostName = request?.UserHostName
        ?? System.Web.HttpContext.Current?.Request?.UserHostName;

    string message = "ES query error: " + text
        + "\n\nCause:" + serverErr?.Error?.ToString()
        + "\n\nDetail:" + esReq?.DebugInformation
        + "\n\n\n";

    ESLogger.Error(new Devmasters.Logging.LogMessage()
        .SetMessage(message)
        .SetException(ex)
        .SetCustomKeyValue("URL", request?.RawUrl)
        .SetCustomKeyValue("Stack-trace", System.Environment.StackTrace)
        .SetCustomKeyValue("Referer", request?.UrlReferrer?.AbsoluteUri)
        // Note: this records the browser name reported by the capabilities object, not the raw User-Agent header.
        .SetCustomKeyValue("User-agent", request?.Browser?.Browser)
        .SetCustomKeyValue("IP", request?.UserHostAddress + " " + hostName)
        );
}
/// <summary>
/// Downloads decision detail pages from www.uohs.cz, extracts their fields with XPath and stores
/// every parsed decision through <c>ds.AddOrUpdateItem</c> in rewrite mode.
/// </summary>
/// <param name="datasetId">Dataset identifier; not referenced by this method.</param>
/// <param name="startFrom">Number of the first detail page to process.</param>
/// <param name="count">How many consecutive page numbers to process.</param>
/// <remarks>
/// Failures while processing a single page are logged with the page URL and do not stop the batch.
/// </remarks>
public static void ParsePages(string datasetId, int startFrom = 10000, int count = 600)
{
    // NOTE(review): the original comment says the batch runs in 2 threads and asks not to raise that
    // number out of consideration for the target server; the thread count is not set in this call,
    // so confirm it against the Devmasters.Batch.Manager defaults.
    Devmasters.Batch.Manager.DoActionForAll<int>(Enumerable.Range(startFrom, count),
        (i) =>
        {
            string url = "";
            try
            {
                // Every decision lives on its own page with a simple URL whose number keeps growing.
                // As of 1 Sep 2018 the latest decision number was about 15500.
                url = $"http://www.uohs.cz/cs/verejne-zakazky/sbirky-rozhodnuti/detail-{i}.html";

                // Download the HTML; the client is disposed as soon as the page text is available.
                string html;
                using (System.Net.WebClient wc = new System.Net.WebClient())
                {
                    wc.Encoding = System.Text.Encoding.UTF8;
                    html = wc.DownloadString(url);
                }

                // Devmasters.XPath wraps the document and offers helper functions for XPath queries;
                // all data below is obtained through it.
                Devmasters.XPath page = new Devmasters.XPath(html);

                // The page does not exist, skip it.
                if (page.GetNodeText("//head/title")?.Contains("stránka neexistuje") == true)
                {
                    return new Devmasters.Batch.ActionOutputData();
                }

                logger.Debug($"parsing {url}");

                // The item collects the values as they are parsed.
                var item = new UOHSData();
                item.Url = url;
                item.Id = i.ToString();

                // No relevant content exists outside this DIV, so every query is rooted here.
                var root = "//div[@id='content']";

                item.Cj = page.GetNodeText(root + "//div/h1/strong[1]")?.Replace("Rozhodnutí: ", "");
                item.SpisovaZnacka = page.GetNodeText(root + "//div/h1/strong[2]")?.Replace("Rozhodnutí: ", "");
                item.SoudniRozhodnuti = page.GetNodeText(root + "//div//h1/following-sibling::h2[1]");
                item.Instance = page.GetNodeText(DetailCellXPath(root, "Instance"));
                item.Vec = page.GetNodeText(DetailCellXPath(root, "Věc"));

                // Participants: one list entry per company, each enriched with its ICO when found.
                var ucastniciNode = page.GetNodes(DetailCellXPath(root, "Účastníci") + "/ol/li");
                List<UOHSData.Ucastnik> ucastnici = new List<UOHSData.Ucastnik>();
                if (ucastniciNode != null)
                {
                    foreach (var node in ucastniciNode)
                    {
                        // Decode HTML entities into plain Unicode text before using the name.
                        var firmaJmeno = System.Net.WebUtility.HtmlDecode(node.InnerText);
                        ucastnici.Add(LookupUcastnik(firmaJmeno));
                    }
                }
                item.Ucastnici = ucastnici.ToArray();

                item.Typ_spravniho_rizeni = page.GetNodeText(DetailCellXPath(root, "Typ správního řízení"));
                item.Typ_rozhodnuti = page.GetNodeText(DetailCellXPath(root, "Typ rozhodnutí"));
                item.Rok = page.GetNodeText(DetailCellXPath(root, "Rok"));
                item.PravniMoc = ToDateTimeFromCZ(
                    page.GetNodeText(DetailCellXPath(root, "Datum nabytí právní moci"))
                    );

                // Related decisions are relative links; turn them into absolute URLs.
                var souvis_urls = page.GetNodes(DetailCellXPath(root, "Související rozhodnutí") + "/a");
                if (souvis_urls != null)
                {
                    item.SouvisejiciUrl = souvis_urls
                        .Select(m => m.Attributes["href"]?.Value)
                        .Where(m => m != null)
                        .Select(u => "http://www.uohs.cz" + u)
                        .ToArray();
                }

                item.Rozhodnuti = new UOHSData.Dokument();
                item.Rozhodnuti.Url = page.GetNode(DetailCellXPath(root, "Dokumenty") + "/a")
                    ?.Attributes["href"]?.Value;
                if (!string.IsNullOrEmpty(item.Rozhodnuti.Url))
                {
                    // Prefix the document's own relative link. The previous code appended the
                    // SouvisejiciUrl array here, which produced "http://www.uohs.czSystem.String[]".
                    item.Rozhodnuti.Url = "http://www.uohs.cz" + item.Rozhodnuti.Url;
                }

                item.Rozhodnuti.PlainText = page.GetNode(root + "//div[@class='res_text']")?.InnerText ?? "";

                // Parsing finished, store the record in the dataset.
                logger.Debug($"adding item {item.Id} - {item.Url}");
                ds.AddOrUpdateItem(item, HlidacStatu.Api.V2.Dataset.Typed.ItemInsertMode.rewrite);
            }
            catch (Exception e)
            {
                logger.Error(url, e);
            }

            return new Devmasters.Batch.ActionOutputData();
        },
        outputWriter.OutputWriter, progressWriter.ProgressWriter,
        !System.Diagnostics.Debugger.IsAttached
        );
}

/// <summary>
/// Builds the XPath of the value cell in the <c>resolution_detail</c> table whose header cell
/// contains <paramref name="header"/>.
/// </summary>
private static string DetailCellXPath(string root, string header)
{
    return root + "//table[@id='resolution_detail']//tr//th[contains(text(),'" + header + "')]/parent::tr/td";
}

/// <summary>
/// Creates a participant for <paramref name="firmaJmeno"/> and fills in its ICO when the
/// hlidacstatu.cz company lookup returns one.
/// </summary>
private static UOHSData.Ucastnik LookupUcastnik(string firmaJmeno)
{
    // The batch callback is synchronous, so the request is awaited by blocking. A failed request
    // propagates to the caller, which logs it and skips the page.
    string ico;
    using (var response = httpClient.GetAsync("https://www.hlidacstatu.cz/api/v2/firmy/" + System.Net.WebUtility.UrlEncode(firmaJmeno)).Result)
    {
        ico = response.Content.ReadAsStringAsync().Result;
    }

    try
    {
        var icoRes = Newtonsoft.Json.Linq.JObject.Parse(ico);
        if (icoRes["ico"] == null)
        {
            return new UOHSData.Ucastnik() { Jmeno = firmaJmeno };
        }

        return new UOHSData.Ucastnik() { Jmeno = firmaJmeno, ICO = icoRes["ico"].Value<string>() };
    }
    catch (Exception)
    {
        // Deliberate best effort: an unparsable lookup response still yields a participant, just without ICO.
        return new UOHSData.Ucastnik() { Jmeno = firmaJmeno };
    }
}
/// <summary>
/// Registers a document URL with the OCR API (<c>addTask.ashx</c>) and returns the result of the created task.
/// </summary>
/// <param name="apikey">API key sent with the request.</param>
/// <param name="url">Address of the document to process.</param>
/// <param name="client">Client identification sent with the request; null is sent as an empty value.</param>
/// <param name="priority">Task priority sent with the request.</param>
/// <param name="intensity">Mining intensity; sent as its integer value.</param>
/// <param name="origFilename">Original file name; when empty it is derived from the local path of <paramref name="url"/>.</param>
/// <param name="maxWaitingTime">Maximum waiting time; when null, <c>defaultWaitingTime</c> is used.</param>
/// <param name="restartTaskAfterTime">Sent as <c>restartIn</c> in whole seconds; 0 when null.</param>
/// <returns>
/// An invalid <c>Result</c> when the API response carries no task id; otherwise the value returned by
/// <c>WaitingForResult</c> (callback mode is currently disabled, so the queued-with-callback branch is not reached).
/// </returns>
/// <exception cref="ApiException">Thrown for any failure while calling the API or processing its response.</exception>
public static async Task<Result> TextFromUrlAsync_old(string apikey, Uri url, string client, int priority,
    MiningIntensity intensity, string origFilename = null, TimeSpan? maxWaitingTime = null,
    TimeSpan? restartTaskAfterTime = null /*, Api.CallbackData callBackData = null*/)
{
    string fullUrl = null;
    string taskId = null;
    Api.CallbackData callBackData = null; // callback support is temporarily disabled
    string res = "";

    try
    {
        if (string.IsNullOrEmpty(origFilename))
        {
            origFilename = Lib.OCR.DocTools.GetFilename(url.LocalPath);
        }

        // With a callback the task may wait much longer; otherwise fall back to the default.
        TimeSpan? waitTime = maxWaitingTime;
        if (waitTime == null && callBackData != null)
        {
            waitTime = TimeSpan.FromDays(14);
        }
        else if (waitTime == null)
        {
            waitTime = defaultWaitingTime;
        }

        string callBackDataString = "";
        if (callBackData != null)
        {
            callBackDataString = Newtonsoft.Json.JsonConvert.SerializeObject(callBackData);
        }

        using (WebOcr wc = new WebOcr())
        {
            // Every value is URL-encoded, including the API key, so reserved characters cannot
            // corrupt the query string.
            string param = "url=" + System.Net.WebUtility.UrlEncode(url.AbsoluteUri)
                + "&apikey=" + System.Net.WebUtility.UrlEncode(apikey)
                + "&fn=" + System.Net.WebUtility.UrlEncode(origFilename ?? "")
                + "&client=" + System.Net.WebUtility.UrlEncode(client ?? "")
                + "&priority=" + priority
                + "&intensity=" + (int)intensity
                + "&expirationIn=" + (int)(waitTime.Value.TotalSeconds * 1.05) // add 5%
                + "&restartIn=" + (int)(restartTaskAfterTime?.TotalSeconds ?? 0)
                + "&callbackData=" + System.Net.WebUtility.UrlEncode(callBackDataString);

            logger.Debug($"TextFromUrlAsync calling OCR API for {url.AbsoluteUri} ");
            fullUrl = ApiUrl + "addTask.ashx?" + param;

            byte[] resbyte = await wc.DownloadDataTaskAsync(fullUrl);
            res = System.Text.Encoding.UTF8.GetString(resbyte);
            Newtonsoft.Json.Linq.JToken json = Newtonsoft.Json.Linq.JToken.Parse(res);

            if (json["taskid"] != null)
            {
                taskId = json["taskid"].ToString();
            }
            else
            {
                logger.Error($"ExtApi.TextFromUrlAsync API Exception\nUrl:{url.AbsoluteUri}\n content: " + res);
                return new Result()
                {
                    Id = taskId,
                    IsValid = Result.ResultStatus.Invalid,
                    // The response may carry neither "taskid" nor "error"; do not fail on the missing token.
                    Error = json["error"]?.Value<string>()
                };
            }

            logger.Debug($"TextFromUrlAsync called OCR API taskid:{taskId} for {url.AbsoluteUri} ");
        }

        if (callBackData == null)
        {
            return(WaitingForResult(apikey, taskId, maxWaitingTime ?? defaultWaitingTime));
        }
        else
        {
            return new Result() { Id = taskId, IsValid = Result.ResultStatus.InQueueWithCallback };
        }
    }
    catch (System.Net.WebException e)
    {
        logger.Debug($"called ext API TextFromFile {fullUrl}.\nResponse: {res}\n" + ApiUrl, e);
        throw new ApiException("called ext API ", e);
    }
    catch (Exception e)
    {
        logger.Error($"exception API TextFromFile {fullUrl}.\nResponse: {res}\n" + ApiUrl, e);
        throw new ApiException("exception API TextFromFile ", e);
    }
}
/// <summary>
/// Writes the measured elapsed time to <c>logger</c> at the configured <c>level</c>.
/// Nothing is written unless the elapsed time exceeds <c>slowLoggerThreshold</c>.
/// </summary>
/// <remarks>
/// When a web context with a request is available, a structured message (page, parameters,
/// elapsed time and, if known, user name) is logged; otherwise <c>textTemplate</c> is formatted
/// with the elapsed milliseconds.
/// </remarks>
private void Log()
{
    Devmasters.Logging.LogMessage msg = null;

    if (this.ExactElapsedMiliseconds > slowLoggerThreshold)
    {
        if (context != null)
        {
            if (context.Request != null)
            {
                // Structured fields for the web log:
                // <conversionPattern value="%date|%property{page}|%property{params}|%property{user}|%property{elapsedtime}" />
                msg = new Devmasters.Logging.LogMessage();
                msg.SetCustomKeyValue("web_page", context.Request.Url.AbsolutePath);
                msg.SetCustomKeyValue("web_params", FormatParams(context));
                msg.SetCustomKeyValue("web_elapsedtime", this.ExactElapsedMiliseconds);
                if (context.User != null && context.User.Identity != null && context.User.Identity.Name != null)
                {
                    msg.SetCustomKeyValue("web_user", context.User.Identity.Name);
                }
            }
        }
        else
        {
            // Without a web context, a measurement that is still running or never started is not logged.
            if (this.IsRunning || this.ElapsedTicks == 0)
            {
                return;
            }

            // Add an extra error entry for slow operations, but only when the regular entry below is
            // not already written at Error or Fatal level. The previous non-short-circuit "|" made
            // this test always true, because a level cannot equal both values at once.
            if (
                (level != Devmasters.Logging.PriorityLevel.Fatal && level != Devmasters.Logging.PriorityLevel.Error)
                && this.ExactElapsedMiliseconds > slowLoggerThreshold
                && slowLoggerThreshold > 0)
            {
                logger.Error(string.Format(textTemplate + " TOO SLOW", this.ExactElapsedMiliseconds));
            }
        }

        // Regular entry at the configured level: the structured message when one was built,
        // the formatted text template otherwise.
        switch (level)
        {
            case Devmasters.Logging.PriorityLevel.Debug:
                if (msg != null)
                {
                    logger.Debug(msg);
                }
                else
                {
                    logger.Debug(string.Format(textTemplate, this.ExactElapsedMiliseconds));
                }
                break;

            case Devmasters.Logging.PriorityLevel.Information:
                if (msg != null)
                {
                    logger.Info(msg);
                }
                else
                {
                    logger.Info(string.Format(textTemplate, this.ExactElapsedMiliseconds));
                }
                break;

            case Devmasters.Logging.PriorityLevel.Warning:
                if (msg != null)
                {
                    logger.Warning(msg);
                }
                else
                {
                    logger.Warning(string.Format(textTemplate, this.ExactElapsedMiliseconds));
                }
                break;

            case Devmasters.Logging.PriorityLevel.Error:
                if (msg != null)
                {
                    logger.Error(msg);
                }
                else
                {
                    logger.Error(string.Format(textTemplate, this.ExactElapsedMiliseconds));
                }
                break;

            case Devmasters.Logging.PriorityLevel.Fatal:
                if (msg != null)
                {
                    logger.Fatal(msg);
                }
                else
                {
                    logger.Fatal(string.Format(textTemplate, this.ExactElapsedMiliseconds));
                }
                break;
        }
    }
}