/// <summary>
/// Reads the meta data file of an element from disk and parses its content as JSON.
/// </summary>
/// <param name="element">The element whose <c>Class</c> and <c>Id</c> are used to build the file path.</param>
/// <param name="fileType">The kind of meta file; passed through to <c>GetMetaUrl</c>.</param>
/// <returns>The parsed JSON object, or <c>null</c> if the file is missing, blank or cannot be deserialized.</returns>
internal override async Task<JObject> GetMetaDataAsync(Element element, string fileType)
{
    string metaFilePath = GetMetaUrl(element.Class, element.Id, fileType);

    bool fileIsMissing = !File.Exists(metaFilePath);
    if (fileIsMissing)
    {
        s_log.Warn(LocalizationService.FormatResourceString("MetaFileReaderMessage01", metaFilePath));
        return null;
    }

    string rawJson = await File.ReadAllTextAsync(metaFilePath);
    if (string.IsNullOrWhiteSpace(rawJson))
    {
        return null;
    }

    JObject parsed;
    try
    {
        parsed = JsonConvert.DeserializeObject<JObject>(rawJson);
    }
    catch
    {
        // Any failure while parsing is reported as a warning and treated as "no meta data".
        s_log.Warn(LocalizationService.FormatResourceString("MetaFileReaderMessage02"));
        parsed = null;
    }

    return parsed;
}
/// <summary>
/// Checks an element before it is handed to the indexer.
/// <para>Several conditions are checked:
/// <list type="number">
/// <item>Should the element be indexed at all: is its class to be indexed, are all mandatory fields present, is any exclusion value present?</item>
/// <item>Is the element already indexed?</item>
/// <item>Has the content of the element changed?</item>
/// </list>
/// </para>
/// </summary>
/// <param name="element">The element that is to be indexed.</param>
internal async Task SendToIndexerAsync(Element element)
{
    bool alreadyIndexed = IsAlreadyIndexed(element);

    // Forbidden elements are skipped (and removed if they were indexed before).
    if (RemoveWhenForbidden(element, alreadyIndexed))
    {
        return;
    }

    using ElementLogContext logContext = new();
    bool contentChanged = HasContentChanged(element);

    // Nothing to do when the element is already indexed and unchanged.
    if (!RemoveWhenChanged(element, alreadyIndexed, contentChanged, logContext))
    {
        return;
    }

    _log.Info(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage03", element.Id));
    bool indexedSuccessfully = await _indexer.AddElementToIndexAsync(element);
    MarkElementFound(element, logContext, indexedSuccessfully);
}
/// <summary>
/// Walks through all jobs known to the <c>JobManager</c> and starts each one that passes the id check
/// and the job id filter. Start and end of the run are logged.
/// </summary>
/// <param name="jobIdParameters">Job ids used as a filter; forwarded unchanged to <c>InitNewThread</c>.</param>
internal static void ScheduleJobs(List<string> jobIdParameters)
{
    DateTime scheduleStart = DateTime.Now;
    log.Info(LocalizationService.FormatResourceString("JobSchedulerMessage01", scheduleStart));
    ErrorControlService.GetService().StartRuntimeStopwatch();

    JobManager manager = JobManager.GetJobManager();
    if (manager.AllJobs == null)
    {
        // Without a job list there is nothing to schedule; the end-of-run log is skipped as well.
        return;
    }

    int totalJobs = manager.AllJobs.Count;
    List<string> handledJobIds = new(totalJobs);
    for (int index = 0; index < totalJobs; index++)
    {
        JobConfig current = manager.AllJobs[index];
        if (current != null)
        {
            ExitWhenInvalidId(handledJobIds, current.Id);
            InitNewThread(jobIdParameters, handledJobIds, current);
        }
    }

    LogTime(scheduleStart);
}
/// <summary>
/// Converts every attachment named on the element and returns one cloned element per successfully
/// converted attachment.
/// </summary>
/// <param name="element">The element whose <c>AttachementNames</c> are processed.</param>
/// <returns>The list of attachment elements; attachments whose conversion threw are left out.</returns>
internal async Task<List<Element>> CrawlAttachementsAsync(Element element)
{
    ConverterService converterService = new();
    List<Element> convertedAttachements = new();
    StatisticService statistics = StatisticService.GetService(_jobConfig.Id);

    foreach (var name in element.AttachementNames)
    {
        ConverterResult conversion;
        try
        {
            conversion = await converterService.ConvertAttachementAsync(element, name);
        }
        catch
        {
            // A failed conversion is logged and counted as an error, then the next attachment is tried.
            _log.Error(LocalizationService.FormatResourceString("AttachementCrawlerMessage01", name, element.Id));
            ErrorControlService.GetService().IncreaseErrorCount();
            continue;
        }

        // The attachment starts as a copy of its parent element and is then overwritten with the converter output.
        Element attachementElement = element.Clone() as Element;
        OverwriteAttachementValues(attachementElement, conversion, name);
        convertedAttachements.Add(attachementElement);
        statistics.IncreaseFoundDocumentsCount();
    }

    return convertedAttachements;
}
/// <summary>
/// Resolves a configured field value. The value is either taken literally, looked up as a variable
/// in the meta data, or passed through a converter.
/// </summary>
/// <param name="temporaryValue">The configured value, which may be a literal or an expression.</param>
/// <param name="expressionEvaluator">Not referenced in this method; all evaluator calls go through the type name.</param>
/// <param name="metaData">The meta data from which variable values are extracted.</param>
/// <returns>A list with the values, or null if the expression yields no result.</returns>
private string[] EvaluateField(string temporaryValue, ExpressionEvaluator expressionEvaluator, MetaDataCollection metaData)
{
    // Wraps a single value into an array, mapping blank values to null.
    static string[] SingleOrNull(string candidate)
    {
        if (string.IsNullOrWhiteSpace(candidate))
        {
            return null;
        }
        return new string[] { candidate };
    }

    // Case 1: not an expression at all - the value is used literally.
    bool isExpression = ExpressionEvaluator.IsExpression(temporaryValue, out string expression);
    if (!isExpression)
    {
        _log.Debug(LocalizationService.FormatResourceString("ElementCrawlerMessage02", temporaryValue));
        return SingleOrNull(temporaryValue);
    }

    // Case 2: a plain variable reference - look the value up in the meta data.
    if (ExpressionEvaluator.IsVariableExpression(expression, out string variableName))
    {
        string resolved = Util.RemoveMarkup(MetaAnalyzer.ExtractValue(metaData, variableName));
        _log.Debug(LocalizationService.FormatResourceString("ElementCrawlerMessage03", expression, resolved));
        return SingleOrNull(resolved);
    }

    // Case 3: a converter call - resolve each parameter, then delegate to the converter.
    if (ExpressionEvaluator.IsConverterExpression(expression, out string converterClassName, out string[] converterParameterNames))
    {
        string[] parameterValues = new string[converterParameterNames.Length];
        for (int index = 0; index < parameterValues.Length; index++)
        {
            string parameterName = converterParameterNames[index];
            if (ExpressionEvaluator.IsVariableExpression(parameterName, out string referencedVariable))
            {
                parameterValues[index] = Util.RemoveMarkup(MetaAnalyzer.ExtractValue(metaData, referencedVariable));
            }
            else
            {
                // Parameters that are not variables are passed on as written.
                parameterValues[index] = parameterName;
            }
        }

        _log.Debug(LocalizationService.FormatResourceString("ElementCrawlerMessage04", expression, converterClassName, ""));
        return ExpressionEvaluator.EvaluateAsConverter(converterClassName, parameterValues);
    }

    return null;
}
/// <summary>
/// Analyzes the data of an element and builds an object from it.
/// </summary>
/// <param name="id">The id of the element, for example GR_389F860B088563B1.</param>
/// <returns>An object holding the data of the element in Enable Now.</returns>
internal async Task<Element> CrawlElementAsync(string id)
{
    Element crawled = new(id);
    FillInitialFields(crawled);

    MetaDataCollection metaData = await MetaAnalyzer.LoadMetaFilesAsync(crawled);
    FillFields(crawled, metaData);
    AddAssets(crawled, metaData);

    string autostartId = GetAutostartId(metaData);
    StatisticService statistics = StatisticService.GetService(_jobConfig.Id);

    if (autostartId is not null)
    {
        // NOTE(review): this recurses into the autostart element with no visible guard against an
        // autostart chain that leads back to the same id - confirm this cannot happen.
        try
        {
            Element autostart = await CrawlElementAsync(autostartId);
            OverwriteValuesByAutostartElement(crawled, autostart);
            statistics.IncreaseAutostartElementsCount();
        }
        catch
        {
            // A failing autostart element only produces a warning; the element itself is still returned.
            _log.Warn(LocalizationService.FormatResourceString("ElementCrawlerMessage01"));
        }
    }

    // The hash is generated after the autostart values were applied.
    crawled.Hash = crawled.GenerateHashCode();
    SetDateValue(crawled);
    statistics.IncreaseFoundDocumentsCount();

    return crawled;
}
/// <summary>
/// Checks whether the element should be indexed or excluded.
/// <para>All field names on the exclusion list are visited. If the element has a field with the same name,
/// its values are visited and each one is tested against the regular expression from the exclusion list.</para>
/// </summary>
/// <param name="element">The element object to check.</param>
/// <returns>True if a value matches the exclusion list, otherwise false.</returns>
private bool HasBlacklistedValue(Element element)
{
    // Field names that occur both on the exclusion list and on the element.
    var sharedFieldNames = _jobConfig.BlacklistFields.Keys.Join(
        element.Fields.Keys,
        blacklistedName => blacklistedName,
        elementFieldName => elementFieldName,
        (blacklistedName, elementFieldName) => blacklistedName);

    foreach (var sharedName in sharedFieldNames)
    {
        foreach (var fieldValue in element.Fields[sharedName])
        {
            bool isBlacklisted;
            try
            {
                isBlacklisted = Regex.IsMatch(fieldValue, _jobConfig.BlacklistFields[sharedName]);
            }
            catch
            {
                // A value that cannot be matched is logged and treated as not blacklisted.
                _log.Debug(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage05", element.Id));
                isBlacklisted = false;
            }

            if (isBlacklisted)
            {
                return true;
            }
        }
    }

    return false;
}
/// <summary>
/// Terminates the program when the job id is blank or occurs more than once.
/// </summary>
/// <param name="jobIds">The job ids that have already been processed.</param>
/// <param name="jobId">The job id to validate.</param>
private static void ExitWhenInvalidId(List<string> jobIds, string jobId)
{
    bool isValid = !jobIds.Contains(jobId) && !string.IsNullOrWhiteSpace(jobId);
    if (isValid)
    {
        return;
    }

    log.Fatal(LocalizationService.FormatResourceString("JobSchedulerMessage04", jobId));
    Environment.Exit(-1);
}
/// <summary>
/// Logs how long the run took and the time at which it ended.
/// </summary>
/// <param name="startTime">The time at which the run started.</param>
private static void LogTime(DateTime startTime)
{
    DateTime finishedAt = DateTime.Now;
    TimeSpan elapsed = finishedAt.Subtract(startTime);

    string durationMessage = LocalizationService.FormatResourceString("JobSchedulerMessage02", elapsed);
    log.Info(durationMessage);

    string endMessage = LocalizationService.FormatResourceString("JobSchedulerMessage03", finishedAt);
    log.Info(endMessage);
}
/// <summary>
/// Removes an element's log entry for the current job, starts its removal from the index and
/// increases the removed-documents counter.
/// </summary>
/// <param name="id">The id of the element to remove.</param>
internal void RemoveElementCompletly(string id)
{
    string debugMessage = LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage04", id);
    _log.Debug(debugMessage);

    using ElementLogContext logContext = new();
    logContext.RemoveElementLog(id, _jobConfig.Id);

    // NOTE(review): the removal task is started but never awaited, so its result is ignored and the
    // counter below is increased regardless of the outcome. Awaiting it would require changing this
    // method's void signature - confirm whether fire-and-forget is intended.
    _ = _indexer.RemoveElementFromIndexAsync(id);

    StatisticService statistics = StatisticService.GetService(_jobConfig.Id);
    statistics.IncreaseRemovedDocumentsCount();
}
/// <summary>
/// Validates that a path was supplied.
/// </summary>
/// <param name="path">The path to validate.</param>
/// <exception cref="ArgumentException">Thrown when <paramref name="path"/> is null, empty or whitespace.</exception>
private void CheckPath(string path)
{
    if (string.IsNullOrWhiteSpace(path))
    {
        // Use the same localized text for the log and the exception; previously the exception
        // carried only the raw resource key.
        string message = LocalizationService.FormatResourceString("JobWriterMessage01");
        _log.Error(message);
        throw new ArgumentException(message, nameof(path));
    }
}
/// <summary>
/// Configures logging from the <c>log4net.xml</c> file in the application root.
/// </summary>
private static void ConfigLogging()
{
    string configPath = Path.Combine(Util.GetApplicationRoot(), "log4net.xml");
    string logsPath = Path.Combine(Util.GetApplicationRoot(), "logs");

    // NOTE(review): the warning is emitted when the directory IS reported as writable, and before the
    // logger is configured below. If the warning is meant for a non-writable directory, the
    // condition is inverted - confirm against the text of ProgramMessage01.
    bool writable = Util.IsDirectoryWritable(logsPath);
    if (writable)
    {
        log.Warn(LocalizationService.FormatResourceString("ProgramMessage01"));
    }

    FileInfo configFile = new FileInfo(configPath);
    XmlConfigurator.Configure(configFile);
}
/// <summary>
/// Creates a new object that corresponds to an element in Enable Now.
/// </summary>
/// <param name="id">The id of the Enable Now element.</param>
/// <exception cref="ArgumentException">Thrown when the id is invalid.</exception>
internal Element(string id)
{
    bool isValidId = Validator.Validate(id, Validator.EnableNowIdPattern);
    if (!isValidId)
    {
        string errorText = LocalizationService.FormatResourceString("ElementMessage01", id);
        _log.Error(errorText);
        throw new ArgumentException(errorText);
    }

    Id = id;

    // The class is the part of the id in front of the first underscore.
    string[] idParts = Id.Split('_');
    Class = idParts[0];
}
/// <summary>
/// Enumerates the files in <c>JobDirectory</c>.
/// </summary>
/// <returns>The file paths, or <c>null</c> if starting the enumeration threw an exception.</returns>
internal IEnumerable<string> ReadJobPaths()
{
    IEnumerable<string> jobPaths = null;
    try
    {
        jobPaths = Directory.EnumerateFiles(JobDirectory);
    }
    catch (Exception error)
    {
        // The failure is logged together with the exception; callers receive null.
        log.Error(LocalizationService.FormatResourceString("JobReaderMessage01", JobDirectory), error);
    }

    return jobPaths;
}
/// <summary>
/// Reads the text of a file.
/// </summary>
/// <param name="filePath">Path to the file.</param>
/// <returns>The text of the file, or <c>null</c> if reading threw an exception.</returns>
private string ReadFile(string filePath)
{
    string content = null;
    try
    {
        content = File.ReadAllText(filePath);
    }
    catch (Exception error)
    {
        // The failure is logged together with the exception; callers receive null.
        log.Error(LocalizationService.FormatResourceString("JobReaderMessage04", filePath), error);
    }

    return content;
}
/// <summary>
/// Reads the first entry of the named field from the converter result and returns it as a string.
/// </summary>
/// <param name="fields">The JSON token that holds the fields.</param>
/// <param name="fieldName">The name of the field to read.</param>
/// <exception cref="ArgumentNullException">Thrown when no value could be looked up for <paramref name="fieldName"/>.</exception>
/// <returns>The string value of the first entry of the field.</returns>
private string GetConverterFieldValue(JToken fields, string fieldName)
{
    var field = fields[fieldName]?[0];
    if (field == null)
    {
        string message = LocalizationService.FormatResourceString("ConverterServiceMessage03", fieldName);
        _log.Error(message);

        // The single-string constructor of ArgumentNullException interprets its argument as the
        // parameter name, so the (paramName, message) overload is required to carry the message.
        throw new ArgumentNullException(nameof(fields), message);
    }

    return field.Value<string>();
}
/// <summary>
/// Runs a job on a task and waits for it to finish, provided the job passes the id filter.
/// A job is run when the filter list is empty or contains the job's id; otherwise it is only logged.
/// </summary>
/// <param name="jobIdParameters">The job id filter.</param>
/// <param name="jobIds">The list that collects the ids of the jobs that were run.</param>
/// <param name="jobConfig">The configuration of the job to run.</param>
private static void InitNewThread(List<string> jobIdParameters, List<string> jobIds, JobConfig jobConfig)
{
    bool isSelected = jobIdParameters.Count == 0 || jobIdParameters.Contains(jobConfig.Id);
    if (!isSelected)
    {
        log.Info(LocalizationService.FormatResourceString("JobSchedulerMessage05", jobConfig.Id));
        return;
    }

    jobIds.Add(jobConfig.Id);

    // The task is waited on immediately, so jobs execute one after another.
    Task jobTask = Task.Run(() => RunJob(jobConfig));
    jobTask.Wait();
}
/// <summary>
/// Reads the text of the file at <c>s_filePath</c>.
/// </summary>
/// <returns>The text of the file.</returns>
/// <exception cref="Exception">Thrown when reading fails; the original exception is attached as inner exception.</exception>
private static string ReadFile()
{
    string content;
    try
    {
        content = File.ReadAllText(s_filePath);
    }
    catch (Exception cause)
    {
        string errorText = LocalizationService.FormatResourceString("ConfigReaderMessage01", s_filePath);
        s_log.Error(errorText);
        throw new Exception(errorText, cause);
    }

    return content;
}
/// <summary>
/// Collects the non-blank command line arguments and logs each of them.
/// </summary>
/// <param name="args">The command line arguments.</param>
/// <returns>A new list with all arguments that are not null, empty or whitespace.</returns>
private static List<string> GetJobParameter(string[] args)
{
    List<string> jobParameters = new List<string>();
    if (args.Length == 0)
    {
        return jobParameters;
    }

    foreach (string argument in args)
    {
        if (string.IsNullOrWhiteSpace(argument))
        {
            continue;
        }

        jobParameters.Add(argument);
        log.Info(LocalizationService.FormatResourceString("ProgramMessage02", argument));
    }

    return jobParameters;
}
/// <summary>
/// Records the outcome of an indexing attempt. On success the element is marked as found and the
/// indexed-documents counter is increased; on failure an error is logged and counted.
/// </summary>
/// <param name="element">The element that was sent to the indexer.</param>
/// <param name="context">The element log context used to mark the element as found.</param>
/// <param name="isIndexingSuccess">Whether indexing succeeded.</param>
private void MarkElementFound(Element element, ElementLogContext context, bool isIndexingSuccess)
{
    if (!isIndexingSuccess)
    {
        _log.Error(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage08", element.Id));
        ErrorControlService.GetService().IncreaseErrorCount();
        return;
    }

    context.SetElementFound(element, _jobConfig.Id, true);
    _log.Info(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage06", element.Id));
    StatisticService.GetService(_jobConfig.Id).IncreaseIndexedDocumentsCount();
}
/// <summary>
/// Requests the meta data of an element over HTTP and parses the response as JSON.
/// </summary>
/// <param name="element">The element whose <c>Class</c> and <c>Id</c> are used to build the URL.</param>
/// <param name="fileType">The kind of meta file; passed through to <c>GetMetaUrl</c>.</param>
/// <returns>The parsed JSON object, or <c>null</c> if the request or the parsing threw an exception.</returns>
internal override async Task<JObject> GetMetaDataAsync(Element element, string fileType)
{
    string metaUrl = GetMetaUrl(element.Class, element.Id, fileType);

    JObject parsed;
    try
    {
        HttpRequest request = new(_jobConfig);
        string responseBody = await request.SendRequestAsync(metaUrl);
        parsed = JsonConvert.DeserializeObject<JObject>(responseBody);
    }
    catch
    {
        // Request and parsing failures are both reported as a warning and yield null.
        s_log.Warn(LocalizationService.FormatResourceString("MetaWebsiteReaderMessage01"));
        parsed = null;
    }

    return parsed;
}
/// <summary>
/// Skips an element that should not be indexed and removes it if it is already indexed.
/// </summary>
/// <param name="element">The element to check.</param>
/// <param name="isAlreadyIndexed">Whether the element is already indexed.</param>
/// <returns>True if the element must not be indexed, otherwise false.</returns>
private bool RemoveWhenForbidden(Element element, bool isAlreadyIndexed)
{
    if (ShouldBeIndexed(element))
    {
        return false;
    }

    _log.Info(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage01", element.Id));
    StatisticService.GetService(_jobConfig.Id).IncreaseSkippedDocumentsCount();

    // An element that is indexed but no longer allowed is removed.
    if (isAlreadyIndexed)
    {
        RemoveElementCompletly(element);
    }

    return true;
}
/// <summary>
/// Posts the authentication form to the given URL and returns the response body.
/// The form body consists of the configured user and password controls, the configured additional
/// parameters and whatever <c>AddFormParameters</c> adds for <paramref name="content"/>.
/// </summary>
/// <param name="url">The URL the form is posted to.</param>
/// <param name="content">Passed to <c>AddFormParameters</c> to add further form parameters.</param>
/// <returns>The response body as a string.</returns>
private async Task<string> SendFormAsync(string url, string content)
{
    Log.Info(LocalizationService.FormatResourceString("HttpFormAuthenticationMessage02", url));

    List<KeyValuePair<string, string>> body = new()
    {
        new KeyValuePair<string, string>(jobConfig.AuthUserControl, jobConfig.AuthUser),
        new KeyValuePair<string, string>(jobConfig.AuthPasswordControl, jobConfig.AuthPassword)
    };

    foreach (var param in jobConfig.AuthFormActionAdditionalParameters)
    {
        body.Add(new KeyValuePair<string, string>(param.Key, param.Value));
    }

    AddFormParameters(content, body);

    // Both the request content and the response are disposable; release them once the body is read.
    using FormUrlEncodedContent formContent = new(body);
    using HttpResponseMessage response = await client.PostAsync(url, formContent);
    return await response.Content.ReadAsStringAsync();
}
/// <summary>
/// Reads the connector configuration from the config.json.
/// </summary>
/// <returns>The deserialized configuration.</returns>
/// <exception cref="Exception">Thrown when reading or deserializing fails; the original exception is attached as inner exception.</exception>
internal static Config ReadConfig()
{
    try
    {
        string configJson = ReadFile();
        return JsonConvert.DeserializeObject<Config>(configJson);
    }
    catch (Exception cause)
    {
        string errorText = LocalizationService.FormatResourceString("ConfigReaderMessage02", s_filePath);
        s_log.Error(errorText);
        throw new Exception(errorText, cause);
    }
}
/// <summary>
/// Sends an element to the index by requesting the configured index URL with the element's
/// indexing parameters appended.
/// </summary>
/// <param name="element">The element to index.</param>
/// <returns>True if the request completed without an exception, otherwise false.</returns>
internal async override Task<bool> AddElementToIndexAsync(Element element)
{
    Config connectorConfig = ConfigManager.GetConfigManager().ConnectorConfig;
    string indexingParameters = GetIndexingParameterString(element);
    string requestUrl = $"{connectorConfig.IndexUrl}{indexingParameters}";

    bool succeeded;
    try
    {
        HttpRequest request = new(JobConfig);
        await request.SendRequestAsync(requestUrl);
        succeeded = true;
    }
    catch (Exception error)
    {
        _log.Error(LocalizationService.FormatResourceString("JsonIndexerMessage01"), error);
        succeeded = false;
    }

    return succeeded;
}
/// <summary>
/// Requests the conversion of an attachment and extracts the values from the response.
/// </summary>
/// <param name="element">The element the attachment belongs to.</param>
/// <param name="fileName">The file name of the attachment.</param>
/// <exception cref="Exception">Any exception thrown by the request is logged and rethrown unchanged.</exception>
/// <returns>The values extracted from the converter response.</returns>
internal async Task<ConverterResult> ConvertAttachementAsync(Element element, string fileName)
{
    string requestUrl = GetConverterRequestUrl(element, fileName);

    string responseBody;
    try
    {
        HttpRequest request = new(JobManager.GetJobManager().SelectedJobConfig);
        responseBody = await request.SendRequestAsync(requestUrl);
    }
    catch
    {
        _log.Error(LocalizationService.FormatResourceString("ConverterServiceMessage02"));
        throw;
    }

    // Extraction happens outside the try block, so its failures are not logged here.
    return ExtractValues(responseBody);
}
/// <summary>
/// Removes an element from the index by requesting the configured remove URL with the
/// URL-encoded, bracketed index id appended.
/// </summary>
/// <param name="id">The id of the element to remove.</param>
/// <returns>True if the request completed without an exception, otherwise false.</returns>
internal async override Task<bool> RemoveElementFromIndexAsync(string id)
{
    Config connectorConfig = ConfigManager.GetConfigManager().ConnectorConfig;
    string encodedIdList = HttpUtility.UrlEncode($"[{GetElasticsearchId(id)}]");
    string requestUrl = $"{connectorConfig.RemoveUrl}{encodedIdList}";

    bool succeeded;
    try
    {
        HttpRequest request = new(JobConfig);
        await request.SendRequestAsync(requestUrl);
        succeeded = true;
    }
    catch (Exception error)
    {
        _log.Error(LocalizationService.FormatResourceString("JsonIndexerMessage02"), error);
        succeeded = false;
    }

    return succeeded;
}
/// <summary>
/// Copies the name/value pairs of the HTML input elements into the form data, leaving out the
/// user control, the password control, the configured additional parameters and inputs without a name.
/// </summary>
/// <param name="form">The HTML node to read input elements from; nothing is added when it is null.</param>
/// <param name="formData">The form data the parameters are appended to.</param>
private void AddFormParameters(HtmlNode form, List<KeyValuePair<string, string>> formData)
{
    if (form == null)
    {
        return;
    }

    // NOTE(review): an XPath starting with "//" addresses the whole document, not only the
    // descendants of `form`; ".//input" would restrict the search to the form - confirm the intent.
    var inputElements = form.SelectNodes("//input");

    // SelectNodes yields null rather than an empty collection when no node matches,
    // which previously made the loop below throw a NullReferenceException.
    if (inputElements == null)
    {
        return;
    }

    foreach (var inputElement in inputElements)
    {
        string name = inputElement.GetAttributeValue("name", "");

        // Skip everything that is already supplied from the job configuration, and unnamed inputs.
        bool isConfiguredElsewhere = jobConfig.AuthFormActionAdditionalParameters.ContainsKey(name)
            || name.Equals(jobConfig.AuthUserControl)
            || name.Equals(jobConfig.AuthPasswordControl);
        if (isConfiguredElsewhere || name.Length == 0)
        {
            continue;
        }

        string value = inputElement.GetAttributeValue("value", "");
        Log.Debug(LocalizationService.FormatResourceString("HttpFormAuthenticationMessage01", name, value));
        formData.Add(new KeyValuePair<string, string>(name, value));
    }
}
/// <summary>
/// Runs a single job: selects its configuration, crawls the publication, prints the statistics
/// and sends a mail with the found, indexed and removed document counts.
/// </summary>
/// <param name="jobConfig">The configuration of the job to run.</param>
private static void RunJob(JobConfig jobConfig)
{
    JobManager.GetJobManager().SelectedJobConfig = jobConfig;

    PublicationCrawler publicationCrawler = new();
    publicationCrawler.Initialize();
    publicationCrawler.StartCrawling();
    publicationCrawler.CompleteCrawling();

    StatisticService.GetService(jobConfig.Id).PrintStatistic();
    ErrorControlService.GetService().PrintErrorStatistic();

    StatisticService statistics = StatisticService.GetService(jobConfig.Id);
    string mailText = LocalizationService.FormatResourceString(
        "MailClientMessage01",
        jobConfig.Id,
        DateTime.Now,
        statistics.FoundDocumentsCount,
        statistics.IndexedDocumentsCount,
        statistics.RemovedDocumentsCount);

    MailService mailService = new MailService(jobConfig);
    mailService.SendMail(mailText);
}
/// <summary>
/// Decides whether an element has to be (re-)indexed. An indexed element whose content changed is
/// removed first; an indexed, unchanged element is only marked as found.
/// </summary>
/// <param name="element">The element to check.</param>
/// <param name="isAlreadyIndexed">Whether the element is already indexed.</param>
/// <param name="hasContentChanged">Whether the content of the element has changed.</param>
/// <param name="context">The element log context used to mark an unchanged element as found.</param>
/// <returns>True if the element needs to be indexed, false if it is indexed and unchanged.</returns>
private bool RemoveWhenChanged(Element element, bool isAlreadyIndexed, bool hasContentChanged, ElementLogContext context)
{
    // New elements always need indexing.
    if (!isAlreadyIndexed)
    {
        return true;
    }

    // Changed elements are removed so that they can be indexed afresh.
    if (hasContentChanged)
    {
        _log.Info(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage02", element.Id));
        RemoveElementCompletly(element);
        return true;
    }

    // Unchanged elements stay in the index and are only marked as found.
    _log.Info(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage07", element.Id));
    StatisticService.GetService(_jobConfig.Id).IncreaseUnchangedDocumentsCount();
    context.SetElementFound(element, _jobConfig.Id, true);
    return false;
}