/// <summary>
/// Adds a collected piece of information to the context dictionary.
/// </summary>
/// <remarks>
/// The content is either appended as new entries under <paramref name="querytarget"/>, or, for a
/// follow-up store command of an information item, concatenated onto the last stored entry.
/// </remarks>
/// <param name="currentContent">Partial website content that is currently being read.</param>
/// <param name="command">Command used for reading website sections.</param>
/// <param name="querytarget">Target under which the determined piece of information is stored.</param>
/// <param name="isTargetInformationItem">Whether the target of the piece of information is an information item.</param>
/// <param name="querycontent">Newly read partial content of the website; <c>null</c> is treated as empty.</param>
/// <exception cref="InvalidOperationException">No command in the context command set has <paramref name="querytarget"/> as its target.</exception>
private void AddInformationToken(
    string currentContent,
    WebcrawlerCommand command,
    string querytarget,
    bool isTargetInformationItem,
    IEnumerable<string> querycontent
)
{
    // First(...) throws when no command with this target exists in the command set.
    var isFirstStoreCommand = ContextCommandset.First(storecommand => storecommand.Target == querytarget) == command;

    // Nothing is stored when an information item is to be extended but the content currently
    // being read is empty. (Otherwise statically added information would end up in an
    // information item although no such item exists.)
    if (isTargetInformationItem && string.IsNullOrEmpty(currentContent))
    {
        return;
    }

    var newcontent = querycontent ?? Enumerable.Empty<string>();

    // Read the stored entries once and work on a private copy.
    var existing = ContextDictionary.ContainsKey(querytarget) ? ContextDictionary[querytarget] : null;
    var storecontent = existing != null ? existing.ToList() : new List<string>();

    // First store command (or no information item at all) >> add new entries.
    if (isFirstStoreCommand || !isTargetInformationItem)
    {
        storecontent.AddRange(newcontent);
        ContextDictionary[querytarget] = storecontent;
        return;
    }

    // Follow-up store command >> extend the last stored entry, if there is one.
    if (storecontent.Count == 0)
    {
        return;
    }

    var lastIndex = storecontent.Count - 1;
    var lastStoreitem = storecontent[lastIndex];
    if (lastStoreitem == null)
    {
        return;
    }

    var appendix = string.Join(" ", newcontent);
    storecontent[lastIndex] = $"{lastStoreitem} {appendix}";
    ContextDictionary[querytarget] = storecontent;
}
/// <summary>
/// Resolves the content addressed by a command. Several strategies are tried in order until one
/// yields a result: JSON property, CSS query, regular expression, context dictionary entry and
/// finally a static literal enclosed in double quotes. Afterwards the tokens are adjusted for
/// URI, ID/correlation and image targets.
/// </summary>
/// <param name="context">Text to search in (website fragment or JSON object).</param>
/// <param name="command">Command whose <c>Command</c> text is the query; <c>AttributID</c> and <c>Target</c> refine the lookup.</param>
/// <returns>The tokens found; an empty list when no strategy produced a result.</returns>
private IEnumerable<string> FindContent(string context, WebcrawlerCommand command)
{
    var content = new List<string>();
    var originalQuerytext = command?.Command;
    // Null-safe copy for the plain string operations further down.
    var querytextOrEmpty = originalQuerytext ?? string.Empty;

    // Search content via JSON
    if ((context ?? string.Empty).StartsWith("{"))
    {
        try
        {
            dynamic json = JsonConvert.DeserializeObject(context);
            var jsonvalue = json?.Property(originalQuerytext);
            string jsontext = jsonvalue?.Value.ToString();
            if (!string.IsNullOrEmpty(jsontext))
            {
                content = new List<string> { jsontext };
            }
        }
        catch (Exception)
        {
            // Text that merely starts with '{' but is not valid JSON must not abort the lookup;
            // like the other strategies, this one simply yields nothing and the next one is tried.
        }
    }

    // Search content via CSS query
    if (!content.Any())
    {
        var queryattribute = command?.AttributID;
        var quertarget = command?.Target;

        // NOTE(review): '||' lets a null/empty context through when an attribute is set — confirm this is intended.
        if (!string.IsNullOrEmpty(context) || !string.IsNullOrEmpty(queryattribute))
        {
            // A dotted target ("item.property") addresses an information item.
            var isTargetInformationItem = (quertarget ?? string.Empty).Split('.').Length > 1;

            var dom = new CQ(context);
            var query = default(CQ);
            try
            {
                query = string.IsNullOrEmpty(originalQuerytext)
                    ? dom
                    : dom[originalQuerytext];
            }
            catch (Exception)
            {
                // Query text is not a valid selector >> continue with the next strategy.
            }

            if (query != null && query.Any())
            {
                try
                {
                    content = query.Select(
                        item => string.IsNullOrEmpty(queryattribute)
                            ? isTargetInformationItem
                                ? item.InnerHTML
                                : item.OuterHTML
                            : item.Attributes[queryattribute] ?? string.Empty
                    ).ToList();
                }
                catch (Exception)
                {
                    // Best effort: keep the (empty) content and continue with the next strategy.
                }
            }
        }
    }

    // Search content via regular expression.
    // An empty context or an empty pattern can only produce empty matches, which are discarded anyway.
    if (!content.Any() && !string.IsNullOrEmpty(context) && querytextOrEmpty.Length > 0)
    {
        try
        {
            var matches = Regex.Matches(context, querytextOrEmpty)
                .OfType<Match>()
                .Where(match => match.Success && !string.IsNullOrEmpty(match.Value))
                .Select(match => match.Value)
                .ToList();

            if (matches.Any())
            {
                content = matches;
            }
        }
        catch (Exception)
        {
            // Query text is not a valid pattern >> continue with the next strategy.
        }
    }

    // Redirect the query to the context dictionary (the part before the first ':' is the key)
    if (!content.Any())
    {
        var querytext = querytextOrEmpty.Split(':').FirstOrDefault() ?? string.Empty;
        if (ContextDictionary.ContainsKey(querytext))
        {
            content = ContextDictionary[querytext]?.ToList() ?? content;
        }
    }

    // Content enclosed in "" >> static, comma separated content
    if (!content.Any())
    {
        var quotation = "\"";
        if (querytextOrEmpty.StartsWith(quotation) && querytextOrEmpty.EndsWith(quotation))
        {
            content = querytextOrEmpty
                .Replace(quotation, string.Empty)
                .Split(',')
                .Select(token => token.Trim())
                .ToList();
        }
    }

    // Does the current content need to be adjusted?
    // ToLowerInvariant keeps the suffix comparison independent of the current culture.
    var querytarget = (command?.Target ?? string.Empty).ToLowerInvariant();
    if (
        content.Any() &&
        !string.IsNullOrEmpty(querytarget) &&
        WebcrawlingUtilityConstants.BasicProperties.Any(prop => querytarget.EndsWith($"{prop}"))
    )
    {
        var isUriTarget = querytarget.EndsWith($"{WebcrawlingUtilityConstants.Uri}");
        var isIdTarget = new[] { WebcrawlingUtilityConstants.Corellation, WebcrawlingUtilityConstants.Id }
            .Any(prop => querytarget.EndsWith($".{prop}"));
        var isImageTarget = new[] { WebcrawlingUtilityConstants.Image }
            .Any(prop => querytarget.EndsWith($".{prop}"));

        // The base URI does not change per token, so it is read once.
        var baseuri = string.Empty;
        var sitename = string.Empty;
        if (isUriTarget || isIdTarget)
        {
            baseuri = ContextDictionary.ContainsKey(WebcrawlingUtilityConstants.BaseUri)
                ? ContextDictionary[WebcrawlingUtilityConstants.BaseUri].FirstOrDefault() ?? string.Empty
                : string.Empty;
            // Site name = everything after the scheme separator of the base URI.
            sitename = Regex.Split(baseuri, @"://").Skip(1).FirstOrDefault() ?? string.Empty;
        }

        for (var i = 0; i < content.Count; i++)
        {
            var token = content[i];
            if (token == null)
            {
                continue;
            }

            // For URIs prepend the source URI unless the token already carries a scheme.
            if (isUriTarget && !new[] { "http:", "https:" }.Any(prefix => token.StartsWith(prefix)))
            {
                var separator = !baseuri.EndsWith("/") && !token.StartsWith("/") ? "/" : string.Empty;
                token = $"{baseuri}{separator}{token}";
            }

            // For IDs and correlations prepend the site name unless it is already present.
            if (isIdTarget && !token.StartsWith(sitename))
            {
                token = $"{sitename}.{token}";
            }

            // For images a leading '//' is completed with 'http:'.
            if (isImageTarget && token.StartsWith("//"))
            {
                token = $"http:{token}";
            }

            content[i] = token;
        }
    }

    return content;
}
/// <summary>
/// Adds a collected piece of information to the context dictionary.
/// </summary>
/// <remarks>
/// New content is stored as additional entries under <paramref name="querytarget"/>. Only a
/// follow-up store command of an information item extends the last stored entry instead.
/// </remarks>
/// <param name="currentContent">Partial website content that is currently being read.</param>
/// <param name="command">Command used for reading website sections.</param>
/// <param name="querytarget">Target under which the determined piece of information is stored.</param>
/// <param name="isTargetInformationItem">Whether the target of the piece of information is an information item.</param>
/// <param name="querycontent">Newly read partial content of the website; <c>null</c> is treated as empty.</param>
/// <exception cref="InvalidOperationException">No command in the context command set has <paramref name="querytarget"/> as its target.</exception>
private void AddInformationToken(
    string currentContent,
    WebcrawlerCommand command,
    string querytarget,
    bool isTargetInformationItem,
    IEnumerable<string> querycontent
)
{
    // First(...) throws when the command set holds no command for this target.
    var firstCommandForTarget = ContextCommandset.First(storecommand => storecommand.Target == querytarget);
    var isFirstStoreCommand = firstCommandForTarget == command;

    // The information is only added when no information item is to be extended, or when the
    // content currently being read is not empty. (Otherwise statically added information would
    // be placed in an information item although no such item exists.)
    var mayStore = !isTargetInformationItem || !string.IsNullOrEmpty(currentContent);
    if (!mayStore)
    {
        return;
    }

    var addition = querycontent ?? Enumerable.Empty<string>();

    // One dictionary read and one materialization serve both branches below.
    var stored = ContextDictionary.ContainsKey(querytarget) ? ContextDictionary[querytarget] : null;
    var storecontent = stored != null ? stored.ToList() : new List<string>();

    if (isFirstStoreCommand || !isTargetInformationItem)
    {
        // First store command >> create new information entries.
        storecontent.AddRange(addition);
        ContextDictionary[querytarget] = storecontent;
    }
    else if (storecontent.Count > 0 && storecontent[storecontent.Count - 1] != null)
    {
        // Extend the last stored entry, separated by a single blank.
        var lastStoreitem = storecontent[storecontent.Count - 1];
        storecontent[storecontent.Count - 1] = $"{lastStoreitem} {string.Join(" ", addition)}";
        ContextDictionary[querytarget] = storecontent;
    }
}
/// <summary>
/// Resolves the content addressed by a command. Several strategies are tried in order until one
/// yields a result: JSON property, CSS query, regular expression, context dictionary entry and
/// finally a static literal enclosed in double quotes. Afterwards the tokens are adjusted for
/// URI and ID/correlation targets.
/// </summary>
/// <param name="context">Text to search in (website fragment or JSON object).</param>
/// <param name="command">Command whose <c>Command</c> text is the query; <c>AttributID</c> and <c>Target</c> refine the lookup.</param>
/// <returns>The tokens found; an empty list when no strategy produced a result.</returns>
private IEnumerable<string> FindContent(string context, WebcrawlerCommand command)
{
    var content = new List<string>();
    var originalQuerytext = command?.Command;
    // Null-safe copy for the plain string operations further down.
    var querytextOrEmpty = originalQuerytext ?? string.Empty;

    // Search content via JSON
    if ((context ?? string.Empty).StartsWith("{"))
    {
        try
        {
            dynamic json = JsonConvert.DeserializeObject(context);
            var jsonvalue = json?.Property(originalQuerytext);
            string jsontext = jsonvalue?.Value.ToString();
            if (!string.IsNullOrEmpty(jsontext))
            {
                content = new List<string> { jsontext };
            }
        }
        catch (Exception)
        {
            // Text that merely starts with '{' but is not valid JSON must not abort the lookup;
            // like the other strategies, this one simply yields nothing and the next one is tried.
        }
    }

    // Search content via CSS query
    if (!content.Any())
    {
        var queryattribute = command?.AttributID;
        var quertarget = command?.Target;

        // NOTE(review): '||' lets a null/empty context through when an attribute is set — confirm this is intended.
        if (!string.IsNullOrEmpty(context) || !string.IsNullOrEmpty(queryattribute))
        {
            // A dotted target ("item.property") addresses an information item.
            var isTargetInformationItem = (quertarget ?? string.Empty).Split('.').Length > 1;

            var dom = new CQ(context);
            var query = default(CQ);
            try
            {
                query = string.IsNullOrEmpty(originalQuerytext)
                    ? dom
                    : dom[originalQuerytext];
            }
            catch (Exception)
            {
                // Query text is not a valid selector >> continue with the next strategy.
            }

            if (query != null && query.Any())
            {
                try
                {
                    content = query.Select(
                        item => string.IsNullOrEmpty(queryattribute)
                            ? isTargetInformationItem
                                ? item.InnerHTML
                                : item.OuterHTML
                            : item.Attributes[queryattribute] ?? string.Empty
                    ).ToList();
                }
                catch (Exception)
                {
                    // Best effort: keep the (empty) content and continue with the next strategy.
                }
            }
        }
    }

    // Search content via regular expression.
    // An empty context or an empty pattern can only produce empty matches, which are discarded anyway.
    if (!content.Any() && !string.IsNullOrEmpty(context) && querytextOrEmpty.Length > 0)
    {
        try
        {
            var matches = Regex.Matches(context, querytextOrEmpty)
                .OfType<Match>()
                .Where(match => match.Success && !string.IsNullOrEmpty(match.Value))
                .Select(match => match.Value)
                .ToList();

            if (matches.Any())
            {
                content = matches;
            }
        }
        catch (Exception)
        {
            // Query text is not a valid pattern >> continue with the next strategy.
        }
    }

    // Redirect the query to the context dictionary (the part before the first ':' is the key)
    if (!content.Any())
    {
        var querytext = querytextOrEmpty.Split(':').FirstOrDefault() ?? string.Empty;
        if (ContextDictionary.ContainsKey(querytext))
        {
            content = ContextDictionary[querytext]?.ToList() ?? content;
        }
    }

    // Content enclosed in "" >> static, comma separated content
    if (!content.Any())
    {
        var quotation = "\"";
        if (querytextOrEmpty.StartsWith(quotation) && querytextOrEmpty.EndsWith(quotation))
        {
            content = querytextOrEmpty
                .Replace(quotation, string.Empty)
                .Split(',')
                .Select(token => token.Trim())
                .ToList();
        }
    }

    // Does the current content need to be adjusted?
    // ToLowerInvariant keeps the suffix comparison independent of the current culture.
    var querytarget = (command?.Target ?? string.Empty).ToLowerInvariant();
    if (
        content.Any() &&
        !string.IsNullOrEmpty(querytarget) &&
        WebcrawlingUtilityConstants.BasicProperties.Any(prop => querytarget.EndsWith($"{prop}"))
    )
    {
        var isUriTarget = querytarget.EndsWith($"{WebcrawlingUtilityConstants.Uri}");
        var isIdTarget = new[] { WebcrawlingUtilityConstants.Corellation, WebcrawlingUtilityConstants.Id }
            .Any(prop => querytarget.EndsWith($".{prop}"));

        // The base URI does not change per token, so it is read once.
        var baseuri = string.Empty;
        var sitename = string.Empty;
        if (isUriTarget || isIdTarget)
        {
            baseuri = ContextDictionary.ContainsKey(WebcrawlingUtilityConstants.BaseUri)
                ? ContextDictionary[WebcrawlingUtilityConstants.BaseUri].FirstOrDefault() ?? string.Empty
                : string.Empty;
            // Site name = everything after the scheme separator of the base URI.
            sitename = Regex.Split(baseuri, @"://").Skip(1).FirstOrDefault() ?? string.Empty;
        }

        for (var i = 0; i < content.Count; i++)
        {
            var token = content[i];
            if (token == null)
            {
                continue;
            }

            // For URIs prepend the source URI unless the token already carries a scheme.
            if (isUriTarget && !new[] { "http:", "https:" }.Any(prefix => token.StartsWith(prefix)))
            {
                var separator = !baseuri.EndsWith("/") && !token.StartsWith("/") ? "/" : string.Empty;
                token = $"{baseuri}{separator}{token}";
            }

            // For IDs and correlations prepend the site name unless it is already present.
            if (isIdTarget && !token.StartsWith(sitename))
            {
                token = $"{sitename}.{token}";
            }

            content[i] = token;
        }
    }

    return content;
}
/// <summary>
/// Parses the command lines that sit on the same level as the line at <paramref name="index"/>
/// into commands, recursing into directly following lines that are one level deeper.
/// Every parsed command is also appended to the context command set (sub-commands before their parent).
/// </summary>
/// <remarks>
/// Line syntax as evaluated here: <c>!</c> marks a loop command, <c>&gt;&gt;</c> separates the
/// query from its target, and <c>@</c> separates the query from an attribute.
/// </remarks>
/// <param name="lines">All lines of the command script.</param>
/// <param name="index">Zero-based position of the first line of the set to parse.</param>
/// <returns>The commands of this level, or <c>null</c> when no command line was found.</returns>
public IEnumerable<WebcrawlerCommand> ParseCommandset(IEnumerable<string> lines, int index)
{
    var commands = default(List<WebcrawlerCommand>);

    // Materialize once; all positions below refer to this array.
    var alllines = lines.ToArray();
    var level = alllines[index].GetLevel();

    // Collect the non-empty lines of this level together with their real position.
    // The position comes from the loop counter (not from a text search), so identical
    // lines at different positions are told apart.
    var commandlines = new List<KeyValuePair<int, string>>();
    for (var position = index; position < alllines.Length; position++)
    {
        var candidate = alllines[position];
        if (string.IsNullOrEmpty(candidate))
        {
            continue;
        }

        var candidatelevel = candidate.GetLevel();
        if (candidatelevel < level)
        {
            // A line on a higher level ends this set.
            break;
        }

        if (candidatelevel == level)
        {
            commandlines.Add(new KeyValuePair<int, string>(position, candidate));
        }
    }

    foreach (var commandline in commandlines)
    {
        var commandtext = commandline.Value;

        // A '!' in the command means the command is repeated as long as it returns at least one result.
        var isLoop = commandtext.Contains("!");
        commandtext = commandtext.Replace("!", string.Empty);

        // Determine the command target
        var targetparts = Regex.Split(commandtext, ">>");
        var target = targetparts.Length > 1 ? targetparts[1].Trim() : string.Empty;
        commandtext = targetparts[0].Trim();

        // Determine an attribute
        var attributeparts = commandtext.Split('@');
        var attribute = attributeparts.Length > 1 ? attributeparts[1].Trim() : string.Empty;
        if (!string.IsNullOrEmpty(attribute))
        {
            commandtext = attributeparts[0];
        }

        commands = commands ?? new List<WebcrawlerCommand>();
        var command = new WebcrawlerCommand
        {
            AttributID = attribute,
            IsLoop = isLoop,
            Command = commandtext,
            Target = target
        };

        this.ContextCommandset = this.ContextCommandset ?? new List<WebcrawlerCommand>();

        // A directly following line that is exactly one level deeper starts the sub-commands.
        var subcommands = default(List<WebcrawlerCommand>);
        var followingIndex = commandline.Key + 1;
        var followingLine = followingIndex < alllines.Length
            ? alllines[followingIndex] ?? string.Empty
            : string.Empty;
        if (!string.IsNullOrEmpty(followingLine) && followingLine.GetLevel() == level + 1)
        {
            subcommands = new List<WebcrawlerCommand>(ParseCommandset(alllines, followingIndex));
        }

        command.Subcommands = subcommands;
        commands.Add(command);
        this.ContextCommandset.Add(command);
    }

    return commands;
}
/// <summary>
/// Resolves the content addressed by a command. Several strategies are tried in order until one
/// yields a result: JSON property, CSS query, regular expression, context dictionary entry and
/// finally a static literal enclosed in double quotes.
/// </summary>
/// <param name="context">Text to search in (website fragment or JSON object).</param>
/// <param name="command">Command whose <c>Command</c> text is the query; <c>AttributID</c> and <c>Target</c> refine the CSS lookup.</param>
/// <returns>The tokens found; an empty list when no strategy produced a result.</returns>
private IEnumerable<string> FindContent(string context, WebcrawlerCommand command)
{
    var content = new List<string>();
    var originalQuerytext = command?.Command;
    // Null-safe copy for the plain string operations further down.
    var querytextOrEmpty = originalQuerytext ?? string.Empty;

    // Search content via JSON
    if ((context ?? string.Empty).StartsWith("{"))
    {
        try
        {
            dynamic json = JsonConvert.DeserializeObject(context);
            var jsonvalue = json?.Property(originalQuerytext);
            string jsontext = jsonvalue?.Value.ToString();
            if (!string.IsNullOrEmpty(jsontext))
            {
                content = new List<string> { jsontext };
            }
        }
        catch (Exception)
        {
            // Text that merely starts with '{' but is not valid JSON must not abort the lookup;
            // like the other strategies, this one simply yields nothing and the next one is tried.
        }
    }

    // Search content via CSS query
    if (!content.Any())
    {
        var queryattribute = command?.AttributID;
        var quertarget = command?.Target;

        // NOTE(review): '||' lets a null/empty context through when an attribute is set — confirm this is intended.
        if (!string.IsNullOrEmpty(context) || !string.IsNullOrEmpty(queryattribute))
        {
            // A dotted target ("item.property") addresses an information item.
            var isTargetInformationItem = (quertarget ?? string.Empty).Split('.').Length > 1;

            var dom = new CQ(context);
            var query = default(CQ);
            try
            {
                query = string.IsNullOrEmpty(originalQuerytext)
                    ? dom
                    : dom[originalQuerytext];
            }
            catch (Exception)
            {
                // Query text is not a valid selector >> continue with the next strategy.
            }

            if (query != null && query.Any())
            {
                try
                {
                    content = query.Select(
                        item => string.IsNullOrEmpty(queryattribute)
                            ? isTargetInformationItem
                                ? item.InnerHTML
                                : item.OuterHTML
                            : item.Attributes[queryattribute] ?? string.Empty
                    ).ToList();
                }
                catch (Exception)
                {
                    // Best effort: keep the (empty) content and continue with the next strategy.
                }
            }
        }
    }

    // Search content via regular expression.
    // An empty context or an empty pattern can only produce empty matches, which are discarded anyway.
    if (!content.Any() && !string.IsNullOrEmpty(context) && querytextOrEmpty.Length > 0)
    {
        try
        {
            var matches = Regex.Matches(context, querytextOrEmpty)
                .OfType<Match>()
                .Where(match => match.Success && !string.IsNullOrEmpty(match.Value))
                .Select(match => match.Value)
                .ToList();

            if (matches.Any())
            {
                content = matches;
            }
        }
        catch (Exception)
        {
            // Query text is not a valid pattern >> continue with the next strategy.
        }
    }

    // Redirect the query to the context dictionary (the part before the first ':' is the key)
    if (!content.Any())
    {
        var querytext = querytextOrEmpty.Split(':').FirstOrDefault() ?? string.Empty;
        if (ContextDictionary.ContainsKey(querytext))
        {
            content = ContextDictionary[querytext]?.ToList() ?? content;
        }
    }

    // Content enclosed in "" >> static, comma separated content
    if (!content.Any())
    {
        var quotation = "\"";
        if (querytextOrEmpty.StartsWith(quotation) && querytextOrEmpty.EndsWith(quotation))
        {
            content = querytextOrEmpty
                .Replace(quotation, string.Empty)
                .Split(',')
                .Select(token => token.Trim())
                .ToList();
        }
    }

    return content;
}
/// <summary>
/// Parses the command lines that sit on the same level as the line at <paramref name="index"/>
/// into commands, recursing into directly following lines that are one level deeper.
/// Every parsed command is also appended to the context command set (sub-commands before their parent).
/// </summary>
/// <remarks>
/// Line syntax as evaluated here: <c>!</c> marks a loop command, <c>&gt;&gt;</c> separates the
/// query from its target, and <c>@</c> separates the query from an attribute.
/// </remarks>
/// <param name="lines">All lines of the command script.</param>
/// <param name="index">Zero-based position of the first line of the set to parse; defaults to the first line.</param>
/// <returns>The commands of this level, or <c>null</c> when no command line was found.</returns>
public IEnumerable<WebcrawlerCommand> ParseCommandset(IEnumerable<string> lines, int index = default(int))
{
    var commands = default(List<WebcrawlerCommand>);

    // Materialize once; all positions below refer to this array.
    var alllines = lines.ToArray();
    var level = alllines[index].GetLevel();

    // Collect the non-empty lines of this level together with their real position.
    // The position comes from the loop counter (not from a text search), so identical
    // lines at different positions are told apart.
    var commandlines = new List<KeyValuePair<int, string>>();
    for (var position = index; position < alllines.Length; position++)
    {
        var candidate = alllines[position];
        if (string.IsNullOrEmpty(candidate))
        {
            continue;
        }

        var candidatelevel = candidate.GetLevel();
        if (candidatelevel < level)
        {
            // A line on a higher level ends this set.
            break;
        }

        if (candidatelevel == level)
        {
            commandlines.Add(new KeyValuePair<int, string>(position, candidate));
        }
    }

    foreach (var commandline in commandlines)
    {
        var commandtext = commandline.Value;

        // A '!' in the command means the command is repeated as long as it returns at least one result.
        var isLoop = commandtext.Contains("!");
        commandtext = commandtext.Replace("!", string.Empty);

        // Determine the command target
        var targetparts = Regex.Split(commandtext, ">>");
        var target = targetparts.Length > 1 ? targetparts[1].Trim() : string.Empty;
        commandtext = targetparts[0].Trim();

        // Determine an attribute
        var attributeparts = commandtext.Split('@');
        var attribute = attributeparts.Length > 1 ? attributeparts[1].Trim() : string.Empty;
        if (!string.IsNullOrEmpty(attribute))
        {
            commandtext = attributeparts[0];
        }

        commands = commands ?? new List<WebcrawlerCommand>();
        var command = new WebcrawlerCommand
        {
            AttributID = attribute,
            IsLoop = isLoop,
            Command = commandtext,
            Target = target
        };

        this.ContextCommandset = this.ContextCommandset ?? new List<WebcrawlerCommand>();

        // A directly following line that is exactly one level deeper starts the sub-commands.
        var subcommands = default(List<WebcrawlerCommand>);
        var followingIndex = commandline.Key + 1;
        var followingLine = followingIndex < alllines.Length
            ? alllines[followingIndex] ?? string.Empty
            : string.Empty;
        if (!string.IsNullOrEmpty(followingLine) && followingLine.GetLevel() == level + 1)
        {
            subcommands = new List<WebcrawlerCommand>(ParseCommandset(alllines, followingIndex));
        }

        command.Subcommands = subcommands;
        commands.Add(command);
        this.ContextCommandset.Add(command);
    }

    return commands;
}
/// <summary>
/// Resolves the content addressed by a command. Several strategies are tried in order until one
/// yields a result: JSON property, CSS query, regular expression, context dictionary entry and
/// finally a static literal enclosed in double quotes.
/// </summary>
/// <param name="context">Text to search in (website fragment or JSON object).</param>
/// <param name="command">Command whose <c>Command</c> text is the query; <c>AttributID</c> and <c>Target</c> refine the CSS lookup.</param>
/// <returns>The tokens found; an empty list when no strategy produced a result.</returns>
private IEnumerable<string> FindContent(string context, WebcrawlerCommand command)
{
    var content = new List<string>();
    var originalQuerytext = command?.Command;
    // Null-safe copy for the plain string operations further down.
    var querytextOrEmpty = originalQuerytext ?? string.Empty;

    // Search content via JSON
    if ((context ?? string.Empty).StartsWith("{"))
    {
        try
        {
            dynamic json = JsonConvert.DeserializeObject(context);
            var jsonvalue = json?.Property(originalQuerytext);
            string jsontext = jsonvalue?.Value.ToString();
            if (!string.IsNullOrEmpty(jsontext))
            {
                content = new List<string> { jsontext };
            }
        }
        catch (Exception)
        {
            // Text that merely starts with '{' but is not valid JSON must not abort the lookup;
            // like the other strategies, this one simply yields nothing and the next one is tried.
        }
    }

    // Search content via CSS query
    if (!content.Any())
    {
        var queryattribute = command?.AttributID;
        var quertarget = command?.Target;

        // NOTE(review): '||' lets a null/empty context through when an attribute is set — confirm this is intended.
        if (!string.IsNullOrEmpty(context) || !string.IsNullOrEmpty(queryattribute))
        {
            // A dotted target ("item.property") addresses an information item.
            var isTargetInformationItem = (quertarget ?? string.Empty).Split('.').Length > 1;

            var dom = new CQ(context);
            var query = default(CQ);
            try
            {
                query = string.IsNullOrEmpty(originalQuerytext)
                    ? dom
                    : dom[originalQuerytext];
            }
            catch (Exception)
            {
                // Query text is not a valid selector >> continue with the next strategy.
            }

            if (query != null && query.Any())
            {
                try
                {
                    content = query.Select(
                        item => string.IsNullOrEmpty(queryattribute)
                            ? isTargetInformationItem
                                ? item.InnerHTML
                                : item.OuterHTML
                            : item.Attributes[queryattribute] ?? string.Empty
                    ).ToList();
                }
                catch (Exception)
                {
                    // Best effort: keep the (empty) content and continue with the next strategy.
                }
            }
        }
    }

    // Search content via regular expression.
    // An empty context or an empty pattern can only produce empty matches, which are discarded anyway.
    if (!content.Any() && !string.IsNullOrEmpty(context) && querytextOrEmpty.Length > 0)
    {
        try
        {
            var matches = Regex.Matches(context, querytextOrEmpty)
                .OfType<Match>()
                .Where(match => match.Success && !string.IsNullOrEmpty(match.Value))
                .Select(match => match.Value)
                .ToList();

            if (matches.Any())
            {
                content = matches;
            }
        }
        catch (Exception)
        {
            // Query text is not a valid pattern >> continue with the next strategy.
        }
    }

    // Redirect the query to the context dictionary (the part before the first ':' is the key)
    if (!content.Any())
    {
        var querytext = querytextOrEmpty.Split(':').FirstOrDefault() ?? string.Empty;
        if (ContextDictionary.ContainsKey(querytext))
        {
            content = ContextDictionary[querytext]?.ToList() ?? content;
        }
    }

    // Content enclosed in "" >> static, comma separated content
    if (!content.Any())
    {
        var quotation = "\"";
        if (querytextOrEmpty.StartsWith(quotation) && querytextOrEmpty.EndsWith(quotation))
        {
            content = querytextOrEmpty
                .Replace(quotation, string.Empty)
                .Split(',')
                .Select(token => token.Trim())
                .ToList();
        }
    }

    return content;
}