Exemple #1
0
        /// <summary>
        /// Reads the meta data file of an element from disk and parses it as JSON.
        /// </summary>
        /// <param name="element">Element whose <c>Class</c> and <c>Id</c> are used to build the file path.</param>
        /// <param name="fileType">File type that is passed on to <c>GetMetaUrl</c>.</param>
        /// <returns>
        /// The parsed JSON object, or <c>null</c> when the file does not exist, is blank,
        /// or cannot be deserialized.
        /// </returns>
        internal override async Task<JObject> GetMetaDataAsync(Element element, string fileType)
        {
            string metaFilePath = GetMetaUrl(element.Class, element.Id, fileType);

            if (File.Exists(metaFilePath) == false)
            {
                s_log.Warn(LocalizationService.FormatResourceString("MetaFileReaderMessage01", metaFilePath));
                return null;
            }

            string content = await File.ReadAllTextAsync(metaFilePath);
            bool hasContent = !string.IsNullOrWhiteSpace(content);

            if (!hasContent)
            {
                return null;
            }

            JObject parsed = null;

            try
            {
                parsed = JsonConvert.DeserializeObject<JObject>(content);
            }
            catch
            {
                // Any deserialization failure is reported as a warning; the caller receives null.
                s_log.Warn(LocalizationService.FormatResourceString("MetaFileReaderMessage02"));
            }

            return parsed;
        }
        /// <summary>
        /// Checks an element before it is handed to the indexer.
        /// <para>Several conditions are evaluated:
        /// <list type="number">
        /// <item>Should the element be indexed at all: is its class meant to be indexed, are all mandatory fields present, does it carry an excluded value?</item>
        /// <item>Is the element already indexed?</item>
        /// <item>Has the content of the element changed?</item>
        /// </list>
        /// </para>
        /// </summary>
        /// <param name="element">The element that is to be indexed.</param>
        internal async Task SendToIndexerAsync(Element element)
        {
            bool alreadyIndexed = IsAlreadyIndexed(element);

            // Stop here when RemoveWhenForbidden reports the element as forbidden.
            if (RemoveWhenForbidden(element, alreadyIndexed))
            {
                return;
            }

            using ElementLogContext logContext = new();
            bool contentChanged = HasContentChanged(element);

            // RemoveWhenChanged returns false when no (re-)indexing is required.
            if (!RemoveWhenChanged(element, alreadyIndexed, contentChanged, logContext))
            {
                return;
            }

            _log.Info(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage03", element.Id));

            bool indexed = await _indexer.AddElementToIndexAsync(element);
            MarkElementFound(element, logContext, indexed);
        }
        /// <summary>
        /// Walks through all jobs known to the <c>JobManager</c>, validates each job id and
        /// passes every non-null job to <c>InitNewThread</c>. Start time, duration and end time are logged.
        /// When the manager has no job list the method returns right after starting the runtime stopwatch.
        /// </summary>
        /// <param name="jobIdParameters">Job ids handed to <c>InitNewThread</c> together with each job.</param>
        internal static void ScheduleJobs(List<string> jobIdParameters)
        {
            DateTime scheduleStart = DateTime.Now;

            log.Info(LocalizationService.FormatResourceString("JobSchedulerMessage01", scheduleStart));
            ErrorControlService.GetService().StartRuntimeStopwatch();

            JobManager jobManager = JobManager.GetJobManager();

            if (jobManager.AllJobs == null)
            {
                return;
            }

            int total = jobManager.AllJobs.Count;
            List<string> processedIds = new(total);

            for (int index = 0; index < total; index++)
            {
                JobConfig current = jobManager.AllJobs[index];

                // Null entries in the job list are skipped.
                if (current != null)
                {
                    ExitWhenInvalidId(processedIds, current.Id);
                    InitNewThread(jobIdParameters, processedIds, current);
                }
            }

            LogTime(scheduleStart);
        }
        /// <summary>
        /// Converts every attachment named in <c>element.AttachementNames</c> and returns one cloned
        /// element per successfully converted attachment.
        /// </summary>
        /// <param name="element">The element whose attachments are converted; it is cloned for each attachment.</param>
        /// <returns>The list of attachment elements. Attachments whose conversion failed are left out.</returns>
        internal async Task<List<Element>> CrawlAttachementsAsync(Element element)
        {
            var converterService = new ConverterService();
            var results = new List<Element>();
            var statistics = StatisticService.GetService(_jobConfig.Id);

            foreach (var name in element.AttachementNames)
            {
                ConverterResult converted;

                try
                {
                    converted = await converterService.ConvertAttachementAsync(element, name);
                }
                catch
                {
                    // A failed conversion is logged and counted as an error; the remaining attachments are still processed.
                    _log.Error(LocalizationService.FormatResourceString("AttachementCrawlerMessage01", name, element.Id));
                    ErrorControlService.GetService().IncreaseErrorCount();
                    continue;
                }

                var copy = element.Clone() as Element;

                OverwriteAttachementValues(copy, converted, name);
                results.Add(copy);
                statistics.IncreaseFoundDocumentsCount();
            }

            return results;
        }
Exemple #5
0
        /// <summary>
        /// Resolves a configured field value. The value is either returned as a literal, resolved as a
        /// variable from the meta data, or evaluated through a converter.
        /// </summary>
        /// <param name="temporaryValue">The configured value; may be a literal or an expression.</param>
        /// <param name="expressionEvaluator">Not read by this method; only static members of <c>ExpressionEvaluator</c> are called.</param>
        /// <param name="metaData">Meta data from which variable values are extracted.</param>
        /// <returns>An array with the values, or <c>null</c> when the expression yields no result.</returns>
        private string[] EvaluateField(string temporaryValue, ExpressionEvaluator expressionEvaluator, MetaDataCollection metaData)
        {
            bool isExpression = ExpressionEvaluator.IsExpression(temporaryValue, out string expression);

            // Case 1: not an expression, so the value itself is used (unless it is blank).
            if (!isExpression)
            {
                _log.Debug(LocalizationService.FormatResourceString("ElementCrawlerMessage02", temporaryValue));

                if (string.IsNullOrWhiteSpace(temporaryValue))
                {
                    return null;
                }

                return new string[] { temporaryValue };
            }

            // Case 2: a variable expression, looked up in the meta data with markup removed.
            if (ExpressionEvaluator.IsVariableExpression(expression, out string variableName))
            {
                string resolved = Util.RemoveMarkup(MetaAnalyzer.ExtractValue(metaData, variableName));
                _log.Debug(LocalizationService.FormatResourceString("ElementCrawlerMessage03", expression, resolved));

                if (string.IsNullOrWhiteSpace(resolved))
                {
                    return null;
                }

                return new string[] { resolved };
            }

            // Case 3: a converter expression; anything else yields no result.
            if (!ExpressionEvaluator.IsConverterExpression(expression, out string converterClassName, out string[] converterParameterNames))
            {
                return null;
            }

            int argumentCount = converterParameterNames.Length;
            string[] arguments = new string[argumentCount];

            for (int index = 0; index < argumentCount; index++)
            {
                string rawParameter = converterParameterNames[index];

                // Parameters that are variable expressions are replaced by their meta data value; others are passed through unchanged.
                if (ExpressionEvaluator.IsVariableExpression(rawParameter, out string referencedVariable))
                {
                    arguments[index] = Util.RemoveMarkup(MetaAnalyzer.ExtractValue(metaData, referencedVariable));
                }
                else
                {
                    arguments[index] = rawParameter;
                }
            }

            // NOTE(review): the third format argument is an empty string - confirm whether the parameter values were meant to be logged here.
            _log.Debug(LocalizationService.FormatResourceString("ElementCrawlerMessage04", expression, converterClassName, ""));
            return ExpressionEvaluator.EvaluateAsConverter(converterClassName, arguments);
        }
Exemple #6
0
        /// <summary>
        /// Analyses the data of an element and builds an object from it.
        /// </summary>
        /// <param name="id">The id of the element, for example GR_389F860B088563B1.</param>
        /// <returns>An object that holds the data of the element in Enable Now.</returns>
        internal async Task<Element> CrawlElementAsync(string id)
        {
            Element crawled = new(id);

            FillInitialFields(crawled);
            MetaDataCollection metaData = await MetaAnalyzer.LoadMetaFilesAsync(crawled);

            FillFields(crawled, metaData);
            AddAssets(crawled, metaData);

            string autostartId = GetAutostartId(metaData);
            StatisticService statistics = StatisticService.GetService(_jobConfig.Id);

            if (autostartId is not null)
            {
                try
                {
                    // The autostart element is crawled with this same method, so it also runs through the statistics below.
                    Element autostart = await CrawlElementAsync(autostartId);

                    OverwriteValuesByAutostartElement(crawled, autostart);
                    statistics.IncreaseAutostartElementsCount();
                }
                catch
                {
                    // A failing autostart element only produces a warning; the element itself is still returned.
                    _log.Warn(LocalizationService.FormatResourceString("ElementCrawlerMessage01"));
                }
            }

            crawled.Hash = crawled.GenerateHashCode();
            SetDateValue(crawled);
            statistics.IncreaseFoundDocumentsCount();

            return crawled;
        }
        /// <summary>
        /// Checks whether the element carries a value that excludes it from indexing.
        /// <para>All field names of the blacklist that also exist on the element are visited. Each value of such a
        /// field is matched against the regular expression configured for that field name in the blacklist.</para>
        /// </summary>
        /// <param name="element">The element to check.</param>
        /// <returns><c>true</c> as soon as one value matches its blacklist pattern, otherwise <c>false</c>.</returns>
        private bool HasBlacklistedValue(Element element)
        {
            // Field names that occur both in the blacklist and on the element.
            var sharedFieldNames = _jobConfig.BlacklistFields.Keys.Join(
                element.Fields.Keys,
                blacklisted => blacklisted,
                present => present,
                (blacklisted, present) => blacklisted);

            foreach (var sharedName in sharedFieldNames)
            {
                foreach (var candidate in element.Fields[sharedName])
                {
                    bool isMatch = false;

                    try
                    {
                        isMatch = Regex.IsMatch(candidate, _jobConfig.BlacklistFields[sharedName]);
                    }
                    catch
                    {
                        // A value that cannot be matched is logged at debug level and treated as "no match".
                        _log.Debug(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage05", element.Id));
                    }

                    if (isMatch)
                    {
                        return true;
                    }
                }
            }

            return false;
        }
 /// <summary>
 /// Terminates the program when the job id is blank or has already been used.
 /// </summary>
 /// <param name="jobIds">The job ids that have already been processed.</param>
 /// <param name="jobId">The job id to validate.</param>
 private static void ExitWhenInvalidId(List<string> jobIds, string jobId)
 {
     bool isUsable = !jobIds.Contains(jobId) && !string.IsNullOrWhiteSpace(jobId);

     if (isUsable)
     {
         return;
     }

     // A duplicate or blank id is fatal: log it and end the process with exit code -1.
     log.Fatal(LocalizationService.FormatResourceString("JobSchedulerMessage04", jobId));
     Environment.Exit(-1);
 }
        /// <summary>
        /// Logs the elapsed time since <paramref name="startTime"/> and the current time as end time.
        /// </summary>
        /// <param name="startTime">The moment the measured run started.</param>
        private static void LogTime(DateTime startTime)
        {
            // Take the timestamp once so that duration and end time refer to the same instant.
            DateTime finishedAt = DateTime.Now;

            log.Info(LocalizationService.FormatResourceString("JobSchedulerMessage02", finishedAt - startTime));
            log.Info(LocalizationService.FormatResourceString("JobSchedulerMessage03", finishedAt));
        }
 /// <summary>
 /// Removes an element from the element log, requests its removal from the index and
 /// increases the removed-documents counter of the current job.
 /// </summary>
 /// <param name="id">The id of the element to remove.</param>
 internal void RemoveElementCompletly(string id)
 {
     _log.Debug(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage04", id));

     using ElementLogContext logContext = new();
     logContext.RemoveElementLog(id, _jobConfig.Id);

     // NOTE(review): the result of the removal request is neither awaited nor inspected, so the
     // counter below is increased regardless of the outcome - confirm that this is intended.
     _ = _indexer.RemoveElementFromIndexAsync(id);

     StatisticService.GetService(_jobConfig.Id).IncreaseRemovedDocumentsCount();
 }
 /// <summary>
 /// Ensures that a path has been supplied.
 /// </summary>
 /// <param name="path">The path to validate.</param>
 /// <exception cref="ArgumentException">Thrown when <paramref name="path"/> is null, empty or consists only of white space.</exception>
 private void CheckPath(string path)
 {
     if (!string.IsNullOrWhiteSpace(path))
     {
         return;
     }

     // Use the localized text for the log entry and for the exception, so the exception
     // no longer carries the bare resource key as its message.
     string message = LocalizationService.FormatResourceString("JobWriterMessage01");
     _log.Error(message);
     throw new ArgumentException(message, nameof(path));
 }
Exemple #12
0
        /// <summary>
        /// Configures logging from the <c>log4net.xml</c> file located in the application root.
        /// </summary>
        private static void ConfigLogging()
        {
            string configPath = Path.Combine(Util.GetApplicationRoot(), "log4net.xml");
            string logsPath = Path.Combine(Util.GetApplicationRoot(), "logs");

            // NOTE(review): the warning is written when the directory IS reported as writable, and it is
            // written before the logging configuration is loaded - confirm the condition is not inverted.
            bool isWritable = Util.IsDirectoryWritable(logsPath);

            if (isWritable)
            {
                log.Warn(LocalizationService.FormatResourceString("ProgramMessage01"));
            }

            FileInfo configFile = new(configPath);
            XmlConfigurator.Configure(configFile);
        }
 /// <summary>
 /// Creates a new object that corresponds to an element in Enable Now.
 /// </summary>
 /// <param name="id">The id of the Enable Now element.</param>
 /// <exception cref="ArgumentException">Thrown when the id does not match <c>Validator.EnableNowIdPattern</c>.</exception>
 internal Element(string id)
 {
     bool isValidId = Validator.Validate(id, Validator.EnableNowIdPattern);

     if (isValidId == false)
     {
         string errorText = LocalizationService.FormatResourceString("ElementMessage01", id);
         _log.Error(errorText);
         throw new ArgumentException(errorText);
     }

     Id = id;

     // The class is the part of the id in front of the first underscore.
     string[] idParts = Id.Split('_');
     Class = idParts[0];
 }
 /// <summary>
 /// Lists the files in <c>JobDirectory</c>.
 /// </summary>
 /// <returns>The file paths, or <c>null</c> when the call to enumerate the directory throws.</returns>
 internal IEnumerable<string> ReadJobPaths()
 {
     IEnumerable<string> jobPaths = null;

     try
     {
         jobPaths = Directory.EnumerateFiles(JobDirectory);
     }
     catch (Exception ex)
     {
         // The failure is logged together with the exception; callers receive null.
         log.Error(LocalizationService.FormatResourceString("JobReaderMessage01", JobDirectory), ex);
     }

     return jobPaths;
 }
 /// <summary>
 /// Reads the text of a file.
 /// </summary>
 /// <param name="filePath">Path to the file.</param>
 /// <returns>The text of the file, or <c>null</c> when reading fails.</returns>
 private string ReadFile(string filePath)
 {
     string text = null;

     try
     {
         text = File.ReadAllText(filePath);
     }
     catch (Exception ex)
     {
         // The failure is logged together with the exception; callers receive null.
         log.Error(LocalizationService.FormatResourceString("JobReaderMessage04", filePath), ex);
     }

     return text;
 }
Exemple #16
0
        /// <summary>
        /// Reads the first entry of the field <paramref name="fieldName"/> from <paramref name="fields"/>
        /// and returns it as a string.
        /// </summary>
        /// <param name="fields">The JSON token that contains the fields.</param>
        /// <param name="fieldName">The name of the field to read.</param>
        /// <exception cref="ArgumentNullException">Thrown when the field or its first entry is missing.</exception>
        /// <returns>The first entry of the field converted to a string.</returns>
        private string GetConverterFieldValue(JToken fields, string fieldName)
        {
            var field = fields[fieldName]?[0];

            if (field == null)
            {
                string message = LocalizationService.FormatResourceString("ConverterServiceMessage03", fieldName);
                _log.Error(message);

                // The single-string constructor of ArgumentNullException takes the parameter NAME, not the
                // message; use the (paramName, message) overload so the localized text ends up in Message.
                throw new ArgumentNullException(nameof(fields), message);
            }

            return field.Value<string>();
        }
 /// <summary>
 /// Runs a job when no job ids were requested or when the job's id is among the requested ids;
 /// otherwise only logs that the job is skipped.
 /// </summary>
 /// <param name="jobIdParameters">The requested job ids; an empty list selects every job.</param>
 /// <param name="jobIds">The ids of the jobs that were started; the id of a started job is appended.</param>
 /// <param name="jobConfig">The configuration of the job.</param>
 private static void InitNewThread(List<string> jobIdParameters, List<string> jobIds, JobConfig jobConfig)
 {
     bool isSelected = jobIdParameters.Count == 0 || jobIdParameters.Contains(jobConfig.Id);

     if (!isSelected)
     {
         log.Info(LocalizationService.FormatResourceString("JobSchedulerMessage05", jobConfig.Id));
         return;
     }

     jobIds.Add(jobConfig.Id);

     // The job runs on a separate task, but the caller blocks until it has finished.
     Task jobTask = Task.Run(() => { RunJob(jobConfig); });
     jobTask.Wait();
 }
 /// <summary>
 /// Reads the text of the file at <c>s_filePath</c>.
 /// </summary>
 /// <returns>The text of the file.</returns>
 /// <exception cref="Exception">Thrown when reading fails; the original exception is attached as inner exception.</exception>
 private static string ReadFile()
 {
     string text;

     try
     {
         text = File.ReadAllText(s_filePath);
     }
     catch (Exception cause)
     {
         string errorText = LocalizationService.FormatResourceString("ConfigReaderMessage01", s_filePath);
         s_log.Error(errorText);
         throw new Exception(errorText, cause);
     }

     return text;
 }
Exemple #19
0
        /// <summary>
        /// Collects the non-blank command line arguments and logs each of them.
        /// </summary>
        /// <param name="args">The command line arguments.</param>
        /// <returns>A list with all arguments that are not null, empty or white space; empty when there are none.</returns>
        private static List<string> GetJobParameter(string[] args)
        {
            if (args.Length == 0)
            {
                return new List<string>();
            }

            var parameters = new List<string>();

            foreach (string argument in args)
            {
                if (string.IsNullOrWhiteSpace(argument))
                {
                    continue;
                }

                parameters.Add(argument);
            }

            parameters.ForEach(parameter => log.Info(LocalizationService.FormatResourceString("ProgramMessage02", parameter)));

            return parameters;
        }
 /// <summary>
 /// Records the outcome of an indexing attempt: a success marks the element as found and counts it
 /// as indexed, a failure is logged and counted as an error.
 /// </summary>
 /// <param name="element">The element that was sent to the indexer.</param>
 /// <param name="context">The element log context in which the element is marked as found.</param>
 /// <param name="isIndexingSuccess">Whether the indexer reported success.</param>
 private void MarkElementFound(Element element, ElementLogContext context, bool isIndexingSuccess)
 {
     if (!isIndexingSuccess)
     {
         _log.Error(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage08", element.Id));
         ErrorControlService.GetService().IncreaseErrorCount();
         return;
     }

     context.SetElementFound(element, _jobConfig.Id, true);
     _log.Info(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage06", element.Id));
     StatisticService.GetService(_jobConfig.Id).IncreaseIndexedDocumentsCount();
 }
        /// <summary>
        /// Requests the meta data of an element over HTTP and parses the response as JSON.
        /// </summary>
        /// <param name="element">Element whose <c>Class</c> and <c>Id</c> are used to build the URL.</param>
        /// <param name="fileType">File type that is passed on to <c>GetMetaUrl</c>.</param>
        /// <returns>The parsed JSON object, or <c>null</c> when the request or the deserialization fails.</returns>
        internal override async Task<JObject> GetMetaDataAsync(Element element, string fileType)
        {
            string metaUrl = GetMetaUrl(element.Class, element.Id, fileType);
            JObject parsed = null;

            try
            {
                HttpRequest request = new(_jobConfig);
                string responseText = await request.SendRequestAsync(metaUrl);

                parsed = JsonConvert.DeserializeObject<JObject>(responseText);
            }
            catch
            {
                // Request and parsing failures are both reported as a warning; the caller receives null.
                s_log.Warn(LocalizationService.FormatResourceString("MetaWebsiteReaderMessage01"));
            }

            return parsed;
        }
 /// <summary>
 /// Handles elements that must not be indexed: the element is counted as skipped and, when it is
 /// already indexed, removed again.
 /// </summary>
 /// <param name="element">The element to check.</param>
 /// <param name="isAlreadyIndexed">Whether the element is already indexed.</param>
 /// <returns><c>true</c> when the element must not be indexed, otherwise <c>false</c>.</returns>
 private bool RemoveWhenForbidden(Element element, bool isAlreadyIndexed)
 {
     if (ShouldBeIndexed(element))
     {
         return false;
     }

     _log.Info(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage01", element.Id));
     StatisticService.GetService(_jobConfig.Id).IncreaseSkippedDocumentsCount();

     if (isAlreadyIndexed)
     {
         RemoveElementCompletly(element);
     }

     return true;
 }
 /// <summary>
 /// Posts the authentication form to <paramref name="url"/>. The body consists of the user and
 /// password controls, the additional parameters from the job configuration and whatever
 /// <c>AddFormParameters</c> adds for <paramref name="content"/>.
 /// </summary>
 /// <param name="url">The URL the form is posted to.</param>
 /// <param name="content">The content that is handed to <c>AddFormParameters</c> to collect further form parameters.</param>
 /// <returns>The response body as a string.</returns>
 private async Task<string> SendFormAsync(string url, string content)
 {
     Log.Info(LocalizationService.FormatResourceString("HttpFormAuthenticationMessage02", url));
     List<KeyValuePair<string, string>> body = new()
     {
         new KeyValuePair<string, string>(jobConfig.AuthUserControl, jobConfig.AuthUser),
         new KeyValuePair<string, string>(jobConfig.AuthPasswordControl, jobConfig.AuthPassword)
     };
     foreach (var param in jobConfig.AuthFormActionAdditionalParameters)
     {
         body.Add(new KeyValuePair<string, string>(param.Key, param.Value));
     }
     AddFormParameters(content, body);

     // Dispose the response once its body has been read, so the underlying resources are
     // released deterministically.
     using HttpResponseMessage response = await client.PostAsync(url, new FormUrlEncodedContent(body));
     return await response.Content.ReadAsStringAsync();
 }
        /// <summary>
        /// Reads the connector configuration from the configuration file and deserializes it.
        /// </summary>
        /// <returns>The deserialized configuration.</returns>
        /// <exception cref="Exception">Thrown when reading or deserializing fails; the original exception is attached as inner exception.</exception>
        internal static Config ReadConfig()
        {
            try
            {
                string fileContent = ReadFile();
                return JsonConvert.DeserializeObject<Config>(fileContent);
            }
            catch (Exception cause)
            {
                string errorText = LocalizationService.FormatResourceString("ConfigReaderMessage02", s_filePath);
                s_log.Error(errorText);
                throw new Exception(errorText, cause);
            }
        }
        /// <summary>
        /// Sends an element to the index by requesting the configured index URL followed by the
        /// element's indexing parameter string.
        /// </summary>
        /// <param name="element">The element to index.</param>
        /// <returns><c>true</c> when the request completed without an exception, otherwise <c>false</c>.</returns>
        internal override async Task<bool> AddElementToIndexAsync(Element element)
        {
            Config connectorConfig = ConfigManager.GetConfigManager().ConnectorConfig;
            string parameters = GetIndexingParameterString(element);
            string requestUrl = $"{connectorConfig.IndexUrl}{parameters}";
            bool succeeded;

            try
            {
                HttpRequest request = new(JobConfig);
                await request.SendRequestAsync(requestUrl);
                succeeded = true;
            }
            catch (Exception ex)
            {
                _log.Error(LocalizationService.FormatResourceString("JsonIndexerMessage01"), ex);
                succeeded = false;
            }

            return succeeded;
        }
Exemple #26
0
        /// <summary>
        /// Requests the conversion of an attachment and extracts the values from the response.
        /// </summary>
        /// <param name="element">The element the attachment belongs to.</param>
        /// <param name="fileName">The file name of the attachment.</param>
        /// <exception cref="Exception">Any exception raised by the request is logged and rethrown unchanged.</exception>
        /// <returns>The values extracted from the converter response.</returns>
        internal async Task<ConverterResult> ConvertAttachementAsync(Element element, string fileName)
        {
            string requestUrl = GetConverterRequestUrl(element, fileName);
            string responseText;

            try
            {
                HttpRequest request = new(JobManager.GetJobManager().SelectedJobConfig);
                responseText = await request.SendRequestAsync(requestUrl);
            }
            catch
            {
                _log.Error(LocalizationService.FormatResourceString("ConverterServiceMessage02"));
                throw;
            }

            // Extraction happens outside the try block, so its failures are not logged as request errors.
            ConverterResult extracted = ExtractValues(responseText);
            return extracted;
        }
        /// <summary>
        /// Removes an element from the index by requesting the configured remove URL followed by the
        /// URL-encoded, bracketed index id of the element.
        /// </summary>
        /// <param name="id">The id of the element to remove.</param>
        /// <returns><c>true</c> when the request completed without an exception, otherwise <c>false</c>.</returns>
        internal override async Task<bool> RemoveElementFromIndexAsync(string id)
        {
            Config connectorConfig = ConfigManager.GetConfigManager().ConnectorConfig;
            string encodedIds = HttpUtility.UrlEncode($"[{GetElasticsearchId(id)}]");
            string requestUrl = $"{connectorConfig.RemoveUrl}{encodedIds}";
            bool succeeded;

            try
            {
                HttpRequest request = new(JobConfig);
                await request.SendRequestAsync(requestUrl);
                succeeded = true;
            }
            catch (Exception ex)
            {
                _log.Error(LocalizationService.FormatResourceString("JsonIndexerMessage02"), ex);
                succeeded = false;
            }

            return succeeded;
        }
        /// <summary>
        /// Adds the name/value pairs of the page's input elements to the form data. Inputs without a
        /// name, the user and password controls and parameters that are already configured as additional
        /// parameters are left out.
        /// </summary>
        /// <param name="form">The node on which the input elements are selected; nothing happens when it is <c>null</c>.</param>
        /// <param name="formData">The list to which the collected parameters are appended.</param>
        private void AddFormParameters(HtmlNode form, List<KeyValuePair<string, string>> formData)
        {
            if (form == null)
            {
                return;
            }

            // NOTE(review): "//input" is an absolute XPath and is not limited to descendants of the
            // given node - confirm whether ".//input" was intended.
            var inputElements = form.SelectNodes("//input");

            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (inputElements == null)
            {
                return;
            }

            foreach (var inputElement in inputElements)
            {
                string name = inputElement.GetAttributeValue("name", "");
                if (!jobConfig.AuthFormActionAdditionalParameters.ContainsKey(name) &&
                        !name.Equals(jobConfig.AuthUserControl) &&
                        !name.Equals(jobConfig.AuthPasswordControl) &&
                        name.Length > 0)
                {
                    string value = inputElement.GetAttributeValue("value", "");
                    Log.Debug(LocalizationService.FormatResourceString("HttpFormAuthenticationMessage01", name, value));
                    formData.Add(new KeyValuePair<string, string>(name, value));
                }
            }
        }
        /// <summary>
        /// Runs a single job: selects it in the <c>JobManager</c>, drives a <c>PublicationCrawler</c> through
        /// <c>Initialize</c>, <c>StartCrawling</c> and <c>CompleteCrawling</c>, prints the statistics and sends
        /// a mail whose text is built from the job's statistics.
        /// </summary>
        /// <param name="jobConfig">The configuration of the job to run.</param>
        private static void RunJob(JobConfig jobConfig)
        {
            JobManager.GetJobManager().SelectedJobConfig = jobConfig;

            var publicationCrawler = new PublicationCrawler();
            publicationCrawler.Initialize();
            publicationCrawler.StartCrawling();
            publicationCrawler.CompleteCrawling();

            StatisticService.GetService(jobConfig.Id).PrintStatistic();
            ErrorControlService.GetService().PrintErrorStatistic();

            StatisticService statistics = StatisticService.GetService(jobConfig.Id);
            string mailText = LocalizationService.FormatResourceString(
                "MailClientMessage01",
                jobConfig.Id,
                DateTime.Now,
                statistics.FoundDocumentsCount,
                statistics.IndexedDocumentsCount,
                statistics.RemovedDocumentsCount);

            var mailService = new MailService(jobConfig);
            mailService.SendMail(mailText);
        }
 /// <summary>
 /// Decides whether an element has to be (re-)indexed. An already indexed element whose content has
 /// changed is removed first; an already indexed, unchanged element is only marked as found.
 /// </summary>
 /// <param name="element">The element to check.</param>
 /// <param name="isAlreadyIndexed">Whether the element is already indexed.</param>
 /// <param name="hasContentChanged">Whether the content of the element has changed.</param>
 /// <param name="context">The element log context in which an unchanged element is marked as found.</param>
 /// <returns><c>true</c> when the element has to be indexed, <c>false</c> when it is already indexed and unchanged.</returns>
 private bool RemoveWhenChanged(Element element, bool isAlreadyIndexed, bool hasContentChanged, ElementLogContext context)
 {
     // Elements that are not indexed yet always need indexing.
     if (!isAlreadyIndexed)
     {
         return true;
     }

     // Changed content: drop the old entry so that the element can be indexed again.
     if (hasContentChanged)
     {
         _log.Info(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage02", element.Id));
         RemoveElementCompletly(element);
         return true;
     }

     // Unchanged content: count it and mark the element as found.
     _log.Info(LocalizationService.FormatResourceString("CrawlerIndexerInterfaceMessage07", element.Id));
     StatisticService.GetService(_jobConfig.Id).IncreaseUnchangedDocumentsCount();
     context.SetElementFound(element, _jobConfig.Id, true);
     return false;
 }