/// <summary>
/// Validates a profile entity before it is used.
/// </summary>
/// <remarks>
/// Every entity needs a non-blank key. Any entity other than a category must additionally
/// carry at least one of: a non-blank value, a non-blank note, or one or more child entities.
/// Null strings and a null child list are treated the same as empty ones, so missing data
/// is reported as a validation failure instead of a <see cref="NullReferenceException"/>.
/// </remarks>
/// <param name="entity">The entity to validate.</param>
/// <exception cref="ArgumentNullException"><paramref name="entity"/> is null.</exception>
/// <exception cref="ArgumentException">The key is blank, or a non-category entity has no value, note or children.</exception>
public static void CheckEntity(this ProfileEntity entity)
{
    if (entity == null)
    {
        throw new ArgumentNullException(nameof(entity));
    }

    if (string.IsNullOrWhiteSpace(entity.Key))
    {
        // Single-argument constructor on purpose: it keeps Message exactly "Empty Key!".
        throw new ArgumentException("Empty Key!");
    }

    // Categories are exempt from the content check below.
    if (entity.EntityType == ProfileEntityType.Category)
    {
        return;
    }

    bool hasValue = !string.IsNullOrWhiteSpace(entity.Value);
    bool hasNote = !string.IsNullOrWhiteSpace(entity.Note);
    bool hasChildren = entity.Children != null && entity.Children.Count > 0;

    if (!hasValue && !hasNote && !hasChildren)
    {
        throw new ArgumentException("No value not child entities!");
    }
}
/// <summary>
/// Writes a profile entity, and recursively its children, to the console using a
/// colour per entity type (green for categories, yellow for fields, red for sub-fields).
/// </summary>
/// <remarks>
/// The entity is validated with <c>CheckEntity</c> first, so printing and validation apply
/// the same rule (in particular, a category is not required to have a value, note or children).
/// The console foreground colour that was active on entry is restored on exit, including
/// when a child entity fails validation.
/// </remarks>
/// <param name="entity">The entity to print.</param>
public static void PrintEntity(this ProfileEntity entity)
{
    entity.CheckEntity();

    ConsoleColor callerColor = Console.ForegroundColor;
    try
    {
        switch (entity.EntityType)
        {
            case ProfileEntityType.Category:
                Console.ForegroundColor = ConsoleColor.Green;
                Console.WriteLine("\n================================================================");
                Console.WriteLine(entity.Key);
                Console.WriteLine("==================================================================\n");
                foreach (var child in entity.Children)
                {
                    child.PrintEntity();
                }
                break;

            case ProfileEntityType.Field:
                Console.ForegroundColor = ConsoleColor.Yellow;
                Console.WriteLine("\n----------------------------------------------------------------");
                Console.WriteLine($" +{entity.Key}");
                Console.WriteLine("------------------------------------------------------------------\n");
                if (!string.IsNullOrWhiteSpace(entity.Value))
                {
                    Console.WriteLine($" {entity.Value}\n");
                }

                // Each child restores this method's yellow when it returns, so the
                // trailing lines below are printed in the field colour.
                foreach (var child in entity.Children)
                {
                    child.PrintEntity();
                }

                Console.WriteLine();
                if (!string.IsNullOrWhiteSpace(entity.Note))
                {
                    Console.WriteLine($" note: {entity.Note}");
                }
                if (!string.IsNullOrWhiteSpace(entity.Date))
                {
                    Console.WriteLine($" date: {entity.Date}");
                }
                if (entity.ComparisonRank.HasValue)
                {
                    Console.WriteLine($" country comparison to the world:: {entity.ComparisonRank}");
                }
                break;

            case ProfileEntityType.SubField:
                Console.ForegroundColor = ConsoleColor.Red;
                // "*" is the placeholder key for a sub-field that has no name of its own.
                if (entity.Key == "*")
                {
                    Console.WriteLine($" * {entity.Value} {entity.Note} {entity.Date}");
                }
                else
                {
                    Console.WriteLine($" +{entity.Key}: {entity.Value} {entity.Note} {entity.Date}");
                }
                break;
        }
    }
    finally
    {
        // Do not leak the entity colour into whatever the caller prints next.
        Console.ForegroundColor = callerColor;
    }
}
/// <summary>
/// Parses the HTML content of the country profile data and converts it to a list of profile entities.
/// </summary>
/// <remarks>
/// Structure expected in the markup, as read by this method:
/// <list type="bullet">
/// <item>A category is an <c>li</c> whose id ends with <c>-category-section-anchor</c> and which has a
/// child <c>div</c> carrying a <c>sectiontitle</c> attribute (the category key).</item>
/// <item>A field is announced by a <c>div</c> whose id starts with <c>field-anchor-{category}-</c>; its data
/// lives in the <c>div</c> whose id is the same string with <c>-anchor-{category}</c> removed.</item>
/// <item>Inside the field's data div, children with class <c>category_data note</c> are field notes, and
/// children whose class contains <c>subfield</c> (but not <c>note</c>) are values or sub-fields.</item>
/// </list>
/// Identifier comparisons are ordinal / culture-invariant so the result does not depend on the
/// current thread culture.
/// </remarks>
/// <param name="content">HTML content of the country profile data</param>
/// <returns>One category entity per category found, each containing its field entities (which in turn contain their sub-field entities).</returns>
/// <exception cref="InvalidOperationException">A category item has no title div, or a field anchor has no matching data div.</exception>
public static List<ProfileEntity> ParseProfileData(string content)
{
    List<ProfileEntity> entities = new List<ProfileEntity>();
    var parser = new HtmlParser();
    var doc = parser.ParseDocument(content);

    // Every lookup below is "a div with a particular id", so take that list once
    // instead of re-querying the whole document for every field.
    var divsWithId = doc.QuerySelectorAll("div").Where(x => x.HasAttribute("id")).ToList();

    var categoryItems = doc.QuerySelectorAll("li")
        .Where(x => x.HasAttribute("id")
                    && x.GetAttribute("id").EndsWith("-category-section-anchor", StringComparison.Ordinal));

    foreach (var categoryItem in categoryItems)
    {
        var titleDiv = categoryItem.Children.First(
            x => string.Equals(x.LocalName, "div", StringComparison.OrdinalIgnoreCase) && x.HasAttribute("sectiontitle"));

        ProfileEntity category = new ProfileEntity()
        {
            EntityType = ProfileEntityType.Category,
            Key = titleDiv.GetAttribute("sectiontitle"),
            Children = new List<ProfileEntity>()
        };

        // "Some Category" -> "some-category"; invariant lowering so that e.g. a Turkish
        // culture does not turn 'I' into a dotless 'ı' and break the id match.
        string categorySlug = category.Key.Trim().Replace(" ", "-").ToLowerInvariant();
        string anchorPrefix = "field-anchor-" + categorySlug + "-";
        string anchorInfix = "-anchor-" + categorySlug;

        var fieldAnchors = divsWithId.Where(
            x => string.Equals(x.LocalName, "div", StringComparison.OrdinalIgnoreCase)
                 && x.GetAttribute("id").StartsWith(anchorPrefix, StringComparison.Ordinal));

        foreach (var fieldAnchor in fieldAnchors)
        {
            // Id of the div that holds this field's data.
            string contentId = fieldAnchor.GetAttribute("id").Trim().Replace(anchorInfix, "");

            ProfileEntity field = new ProfileEntity()
            {
                EntityType = ProfileEntityType.Field,
                Key = fieldAnchor.TextContent.Replace(":", "").Trim(),
                Children = new List<ProfileEntity>()
            };

            var contentDiv = divsWithId.First(x => x.GetAttribute("id").Trim() == contentId);

            // 1. Notes attached directly to the field.
            var noteDivs = contentDiv.Children.Where(
                x => x.HasAttribute("class") && x.GetAttribute("class").Trim() == "category_data note");
            foreach (var noteDiv in noteDivs)
            {
                field.Note += noteDiv.TextContent.Replace("note:", "").Trim() + "\n";
            }

            // 2. Values and sub-fields.
            var subfieldDivs = contentDiv.Children.Where(
                x => HasClassContaining(x, "subfield") && !x.GetAttribute("class").Contains("note"));
            foreach (var subfield in subfieldDivs)
            {
                // 2.1 No structured parts inside: the whole text is a direct field value.
                if (!subfield.Children.Any(x => HasClassContaining(x, "subfield")))
                {
                    field.Value += subfield.TextContent + "\n";
                    continue;
                }

                // 2.2 Structured sub-field. When it has no name element, its data belongs
                //     to the parent field instead of a child entity (except for historic rows).
                string subfieldClass = subfield.GetAttribute("class");
                var nameNode = FindSubfieldNameNode(subfield);
                ProfileEntity subEntity = new ProfileEntity() { EntityType = ProfileEntityType.SubField };

                if (subfieldClass.Contains("historic"))
                {
                    // Historic rows without a name get the placeholder key "*".
                    subEntity.Key = nameNode != null ? nameNode.TextContent.Replace(":", "").Trim() : "*";
                    subEntity.IsHistoricEntity = true;
                    ReadNumberNoteDate(subfield, subEntity);
                }
                else if (subfieldClass.Contains("numeric"))
                {
                    if (nameNode == null)
                    {
                        // NOTE(review): this assigns (not appends) Value/Note/Date, so any
                        // field-level notes collected in step 1 are replaced — confirm intended.
                        field.IsNumericEntity = true;
                        ReadNumberNoteDate(subfield, field);
                        continue;
                    }

                    subEntity.IsNumericEntity = true;
                    subEntity.Key = nameNode.TextContent.Replace(":", "").Trim();
                    ReadNumberNoteDate(subfield, subEntity);
                }
                else if (subfieldClass.Contains("grouped_subfield"))
                {
                    if (nameNode == null)
                    {
                        // NOTE(review): same overwrite-of-notes caveat as the numeric case above.
                        field.IsGroupedEntity = true;
                        ReadNumberNoteDate(subfield, field);
                        continue;
                    }

                    subEntity.IsGroupedEntity = true;
                    subEntity.Key = nameNode.TextContent.Replace(":", "").Trim();
                    ReadNumberNoteDate(subfield, subEntity);
                }
                else if (subfieldClass.Contains("text"))
                {
                    if (nameNode == null)
                    {
                        string note = ChildTextByClass(subfield, "subfield-note").Trim();
                        string date = ChildTextByClass(subfield, "subfield-date").Trim();
                        string val = subfield.TextContent.Trim();

                        // Strings are immutable: the result of Replace must be assigned back,
                        // otherwise the note/date text stays inside the value.
                        if (note != string.Empty)
                        {
                            val = val.Replace(note, "");
                        }
                        if (date != string.Empty)
                        {
                            val = val.Replace(date, "");
                        }

                        field.Value += val.Trim() + "\n";
                        field.Note += note + "\n";
                        field.Date += date + "\n";
                        continue;
                    }

                    subEntity.Key = nameNode.TextContent.Trim(new char[] { ':', ' ' });
                    // Drop the name element so the remaining text is only the value.
                    subfield.RemoveChild(nameNode);
                    subEntity.Value = subfield.TextContent.Trim();
                }

                subEntity.Value = subEntity.Value.Trim();
                subEntity.Note = subEntity.Note.Trim();
                subEntity.Date = subEntity.Date.Trim();
                field.Children.Add(subEntity);
            }

            // 3. Optional "country comparison to the world" rank.
            var comparisonDiv = contentDiv.Children.FirstOrDefault(
                x => string.Equals(x.LocalName, "div", StringComparison.OrdinalIgnoreCase)
                     && !x.HasAttribute("class")
                     && x.TextContent.IndexOf("country comparison to the world", StringComparison.OrdinalIgnoreCase) >= 0);
            if (comparisonDiv != null)
            {
                int rank;
                string rankText = ChildTextByClass(comparisonDiv, "category_data").Trim();
                if (int.TryParse(rankText, out rank))
                {
                    field.ComparisonRank = rank;
                }
            }

            field.Value = field.Value.Trim();
            field.Note = field.Note.Trim();
            field.Date = field.Date.Trim();
            category.Children.Add(field);
        }

        entities.Add(category);
    }

    return entities;
}

/// <summary>Returns true when the element has a class attribute containing <paramref name="fragment"/>.</summary>
private static bool HasClassContaining(AngleSharp.Dom.IElement element, string fragment)
{
    string cls = element.GetAttribute("class");
    return cls != null && cls.Contains(fragment);
}

/// <summary>Returns the text of the first child whose class is exactly <paramref name="className"/>, or an empty string when there is none.</summary>
private static string ChildTextByClass(AngleSharp.Dom.IElement parent, string className)
{
    var match = parent.Children.FirstOrDefault(x => x.GetAttribute("class") == className);
    return match?.TextContent ?? "";
}

/// <summary>
/// Finds the name element of a sub-field: the first child whose class is exactly "subfield-name",
/// otherwise the first child whose class merely contains it; null when the sub-field has no name.
/// </summary>
private static AngleSharp.Dom.IElement FindSubfieldNameNode(AngleSharp.Dom.IElement subfield)
{
    AngleSharp.Dom.IElement looseMatch = null;
    foreach (var child in subfield.Children)
    {
        string cls = child.GetAttribute("class");
        if (cls == null)
        {
            continue;
        }
        if (cls == "subfield-name")
        {
            return child;
        }
        if (looseMatch == null && cls.Contains("subfield-name"))
        {
            looseMatch = child;
        }
    }
    return looseMatch;
}

/// <summary>Copies the sub-field's number, note and date parts (empty string when a part is absent) onto <paramref name="target"/>.</summary>
private static void ReadNumberNoteDate(AngleSharp.Dom.IElement subfield, ProfileEntity target)
{
    target.Value = ChildTextByClass(subfield, "subfield-number");
    target.Note = ChildTextByClass(subfield, "subfield-note");
    target.Date = ChildTextByClass(subfield, "subfield-date");
}