/// <summary>
/// Builds a lookup from class id to registration event target for every row
/// of <paramref name="classesTable"/> except the first one, which is skipped.
/// </summary>
/// <param name="classesTable">Table whose rows are handed to <c>ParseClassIdAndRegistrationEventTarget</c>.</param>
/// <returns>A dictionary keyed by class id; a duplicate id makes the dictionary insertion throw.</returns>
public static Dictionary<int, string> ParseRegestrationEventTargets(IHtmlTableElement classesTable)
{
    var collected = new ConcurrentBag<(int ClassId, string ClassRegistrationEventTarget)>();

    // The first row is skipped (presumably the header row - confirm against the page).
    var dataRows = classesTable.Rows.Skip(1);

    // Rows are parsed in parallel, so the order in which results land in the bag is not defined.
    Parallel.ForEach(dataRows, row =>
    {
        collected.Add(ParseClassIdAndRegistrationEventTarget(row));
    });

    return collected.ToDictionary(
        entry => entry.ClassId,
        entry => entry.ClassRegistrationEventTarget);
}
/// <summary>
/// Downloads the weapon table, checks its header names, converts every body row
/// into a <c>Weapon</c> and writes the resulting list to <c>weapon.json</c> as indented JSON.
/// </summary>
private static async Task scrapeWeapons()
{
    string[] expectedNames = { "Type", "Id", "Name", "ModelType", "MainModel", "PartModel" };

    IHtmlTableElement table = await getTable(baseURL + weaponURL);
    verifyHeaderNames(expectedNames.ToList(), table);

    // Column order follows expectedNames: 0=Type, 1=Id, 2=Name, 3=ModelType, 4=MainModel, 5=PartModel.
    List<Weapon> allWeapon = getRows(table)
        .Select(row => new Weapon
        {
            weapon_type = (Weapon.WEAPON_TYPE)Enum.Parse(typeof(Weapon.WEAPON_TYPE), row.Cells[0].InnerHtml, true),
            ID = Int32.Parse(row.Cells[1].InnerHtml),
            name = row.Cells[2].InnerHtml,
            model_type = (Weapon.MODEL_TYPE)Enum.Parse(typeof(Weapon.MODEL_TYPE), row.Cells[3].InnerHtml, true),
            main_model = row.Cells[4].InnerHtml,
            part_model = row.Cells[5].InnerHtml
        })
        .ToList();

    // 'false' means the file is overwritten rather than appended to.
    using (StreamWriter output = new StreamWriter("weapon.json", false))
    {
        string json = JsonConvert.SerializeObject(allWeapon, Formatting.Indented);
        output.Write(json);
    }
}
/// <summary>
/// Parses every row of <paramref name="classesTable"/> except the first into a <c>UClass</c>.
/// </summary>
/// <param name="classesTable">Table whose rows are passed to <c>ParseClass</c>.</param>
/// <param name="classYear">Year forwarded unchanged to <c>ParseClass</c>.</param>
/// <param name="classSemester">Semester forwarded unchanged to <c>ParseClass</c>.</param>
/// <returns>The parsed classes; rows are processed in parallel, so the array order is not defined.</returns>
public static UClass[] ParseClassesTable(IHtmlTableElement classesTable, int classYear = 0, USemester classSemester = USemester.Unknown)
{
    var parsed = new ConcurrentBag<UClass>();

    // The first row is skipped (presumably the header row - confirm against the page).
    var dataRows = classesTable.Rows.Skip(1);

    Parallel.ForEach(dataRows, row =>
    {
        parsed.Add(ParseClass(row, classYear, classSemester));
    });

    return parsed.ToArray();
}
/// <summary>
/// Downloads <paramref name="url"/>, parses the response as HTML and returns the single
/// table element found in the document.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The only <c>table</c> element of the parsed document.</returns>
/// <exception cref="Exception">Thrown when the document does not contain exactly one table.</exception>
private static async Task<IHtmlTableElement> getTable(string url)
{
    IHtmlDocument document;

    // The client, the response and the body stream are all disposable; the original never
    // released them. The document is fully parsed into memory before the stream is closed.
    // The original's locally created CancellationTokenSource was never cancelled, so its
    // ThrowIfCancellationRequested calls could never fire and have been removed.
    using (HttpClient httpClient = new HttpClient())
    using (HttpResponseMessage response = await httpClient.GetAsync(url))
    using (Stream body = await response.Content.ReadAsStreamAsync())
    {
        HtmlParser parser = new HtmlParser();
        document = parser.ParseDocument(body);
    }

    // A sanity check to ensure that there are no unexpected changes to the page layout.
    List<AngleSharp.Dom.IElement> tables = document.All
        .Where(elem => "table".Equals(elem.TagName, StringComparison.InvariantCultureIgnoreCase))
        .ToList();

    if (tables.Count != 1)
    {
        throw new Exception("Unexpect number of tables: " + tables.Count + ", expected 1");
    }

    return (IHtmlTableElement)tables[0];
}
/// <summary>
/// Downloads the armor table, checks its header names, converts every body row
/// into an <c>Armor</c> and writes the resulting list to <c>armor.json</c> as indented JSON.
/// </summary>
private static async Task scrapeArmor()
{
    string[] expectedNames = { "Type", "ArmorId", "Name", "LayeredId", "Male", "Female" };

    IHtmlTableElement table = await getTable(baseURL + armorURL);
    verifyHeaderNames(expectedNames.ToList(), table);

    // Column order follows expectedNames: 0=Type, 1=ArmorId, 2=Name, 3=LayeredId, 4=Male, 5=Female.
    List<Armor> allArmor = getRows(table)
        .Select(row => new Armor
        {
            type = (Armor.ARMOR_SLOT)Enum.Parse(typeof(Armor.ARMOR_SLOT), row.Cells[0].InnerHtml, true),
            ID = Int32.Parse(row.Cells[1].InnerHtml),
            name = row.Cells[2].InnerHtml,
            layered_ID = Int32.Parse(row.Cells[3].InnerHtml),
            male_location = row.Cells[4].InnerHtml,
            female_location = row.Cells[5].InnerHtml
        })
        .ToList();

    // 'false' means the file is overwritten rather than appended to.
    using (StreamWriter output = new StreamWriter("armor.json", false))
    {
        string json = JsonConvert.SerializeObject(allArmor, Formatting.Indented);
        output.Write(json);
    }
}
/// <summary>
/// Creates one promo set per distinct release id found in the first cell of each table row.
/// </summary>
/// <param name="table">Table to read release ids from; may be <c>null</c>.</param>
/// <returns>
/// A dictionary from release id to the set produced by <c>CreatePromoSet</c>,
/// or an empty dictionary when <paramref name="table"/> is <c>null</c>.
/// </returns>
private Dictionary<string, R4UReleaseSet> CreatePRSets(IHtmlTableElement table)
{
    if (table == null)
    {
        return new Dictionary<string, R4UReleaseSet>();
    }

    // The release id is capture group 1 of releaseIDMatcher applied to the first cell's text.
    var releaseIds = table.Rows
        .Select(row => releaseIDMatcher.Match(row.Cells[0].TextContent).Groups[1].Value)
        .Distinct();

    return releaseIds.ToDictionary(releaseId => releaseId, releaseId => CreatePromoSet(releaseId));
}
// Sanity check to ensure the page hasn't changed format.
/// <summary>
/// Compares the text of every <c>th</c> cell in <paramref name="given"/> with
/// <paramref name="expected"/>, in order.
/// </summary>
/// <param name="expected">Header names the table is expected to have, in order.</param>
/// <param name="given">Table whose header cells are inspected.</param>
/// <returns><c>true</c> when the header names match exactly.</returns>
/// <exception cref="Exception">Thrown when the header names differ from the expected ones.</exception>
private static bool verifyHeaderNames(List<string> expected, IHtmlTableElement given)
{
    AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> headerCells = given.QuerySelectorAll("th");

    List<string> headerNames = headerCells
        .Cast<IHtmlTableHeaderCellElement>()
        .Select(cell => cell.TextContent)
        .ToList();

    bool matches = expected.ToList().SequenceEqual(headerNames);
    if (matches)
    {
        return true;
    }

    string expectedList = String.Join(",", expected);
    string foundList = String.Join(",", headerNames);
    throw new Exception("Headers do not match. Expected: |" + expectedList + "| but found |" + foundList);
}
public static IEnumerable <FoodItem> ParseTable(IHtmlTableElement foodTable, LocalDate date, ILogger logger) { var allRows = foodTable.Bodies.First().Rows; var seed = (currentSection : "none", rows : Enumerable.Empty <FoodItem>()); logger.LogDebug($"found {allRows.Length} row(s) in food table"); logger.LogTrace($"parsing food table {foodTable.ToHtml()}"); return(allRows.Aggregate(seed, (acc, currentRow) => { var(currentSection, rows) = acc; return IsSectionRow(currentRow) ? (currentRow.TextContent.Trim(), rows) : (currentSection, rows.Concat(ParseMealFoodItem(currentRow, currentSection, date, logger))); },
/// <summary>
/// Reads one test-case description from the cells of <paramref name="table"/>, resolves the
/// file it refers to and, when that file exists on disk, appends the entry to <paramref name="list"/>.
/// </summary>
/// <param name="table">Table whose <c>td</c> cells hold the entry; cells 2, 4, 6 and 7 are read.</param>
/// <param name="list">Collection that receives the entry when its file exists.</param>
/// <remarks>
/// Indexing throws when the table has fewer than eight cells; that behaviour is unchanged.
/// </remarks>
static void Inspect(IHtmlTableElement table, List<Entry> list)
{
    var cells = table.QuerySelectorAll("table td");

    var entry = new Entry();
    entry.Rules = cells[2].TextContent;
    entry.FileName = cells[4].TextContent;
    entry.Collection = cells[6].TextContent;
    entry.Text = cells[7].TextContent.Trim();

    // Collection names are fixed identifiers, so they are matched ordinally instead of
    // with the culture-sensitive default of StartsWith(string).
    string path;
    if (entry.Collection.StartsWith("IBM", StringComparison.Ordinal))
    {
        path = IBMPath(entry.FileName);
    }
    else if (entry.Collection.StartsWith("James Clark", StringComparison.Ordinal))
    {
        path = XmlTestPath(entry.FileName);
    }
    else if (entry.Collection.StartsWith("Sun", StringComparison.Ordinal))
    {
        path = SunPath(entry.FileName);
    }
    else if (entry.Collection.StartsWith("OASIS", StringComparison.Ordinal))
    {
        path = OasisPath(entry.FileName);
    }
    else if (entry.Collection.StartsWith("Fuji", StringComparison.Ordinal))
    {
        path = XmlFujiPath(entry.FileName);
    }
    else
    {
        // Every other collection is resolved through EdUniPath.
        path = EdUniPath(entry.FileName);
    }

    // Entries whose file is missing are silently skipped.
    if (File.Exists(path))
    {
        entry.Content = File.ReadAllText(path);
        list.Add(entry);
    }
}
/// <summary>
/// Builds the table model from <paramref name="e"/>: rows consisting solely of header cells
/// become the column titles, the remaining rows become the data rows.
/// </summary>
/// <param name="e">Table element to wrap.</param>
/// <exception cref="HtmlElementException">
/// Thrown when the heading rows do not all have the same number of cells.
/// </exception>
private HtmlTable(IHtmlTableElement e)
{
    // A heading row is a <tr> whose children are all header cells.
    var headings = e.QuerySelectorAll("tr")
        .Where(tr => tr.Children.All(child => child is IHtmlTableHeaderCellElement))
        .Select(tr => (IHtmlTableRowElement)tr)
        .ToList();

    if (!headings.Any())
    {
        // No heading rows: there are no titles and every <tr> is a data row.
        ColumnTitles = Array.Empty<string>();
        _rows = e.QuerySelectorAll("tr").Select(tr => (IHtmlTableRowElement)tr).ToList();
    }
    else
    {
        var rowChildren = headings
            .Select(heading => (Row: heading, Children: RowChildren(heading).ToList()))
            .ToList();

        // The first heading row defines the expected cell count for all heading rows.
        var num = rowChildren.First().Children.Count;
        foreach (var (row, children) in rowChildren.Skip(1))
        {
            if (children.Count == num)
            {
                continue;
            }

            throw new HtmlElementException(row, $"Expected all of the rows to have the same amount of cells ({num}). But this one has {RowChildren(row).Count()}");
        }

        // A column title is the trimmed text of that column's cell in each heading row,
        // joined with single spaces.
        ColumnTitles = Linq.Range(num)
            .Select(column => rowChildren.Select(heading => heading.Children[column]))
            .Select(cellsInColumn => cellsInColumn.Select(cell => cell.TextContent.Trim()).Join(" ").Trim())
            .ToList();

        // Data rows are all <tr> elements after the first headings.Count ones.
        _rows = e.QuerySelectorAll("tr")
            .Skip(headings.Count)
            .Select(tr => (IHtmlTableRowElement)tr)
            .ToList();
    }

    // NOTE(review): Distinct compares by Index, which presumably is already unique per element;
    // if the intent was to drop duplicate titles, the comparison may need to use Elem - confirm.
    _columnTitleToIndex = ColumnTitles.Enumerate()
        .Distinct((left, right) => left.Index == right.Index)
        .ToDictionary(pair => pair.Elem, pair => pair.Index);
}
/// <summary>
/// Loads the saved film page from disk, locates the <c>table#details</c> element and reads the
/// text that follows the bold "title", "year", "country", "genre" and "duration" labels.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static async Task Main(string[] args)
{
    Console.WriteLine("Hello World!");

    var config = Configuration.Default;

    // Create a new context for evaluating webpages with the given config.
    var context = BrowsingContext.New(config);

    // The file stream is disposed once processing is finished; the original opened it through
    // a StreamReader that was never closed.
    using (Stream page = File.OpenRead("..\\..\\..\\..\\TestParser\\filmPage.html"))
    {
        // Parse the document from the content of a response to a virtual request.
        IDocument document = await context.OpenAsync(req => req.Content(page));

        // Materialised once so the document is not re-scanned for the count and the element access.
        var filmInfoTables = document.All
            .Where(t => t.LocalName == "table" && t.Id == "details")
            .ToList();

        if (filmInfoTables.Count == 1)
        {
            IHtmlTableElement filmInfoTable = (IHtmlTableElement)filmInfoTables[0];
            var htmlTableCellElement = filmInfoTable.Rows[0].Cells[1];

            var boldElements = htmlTableCellElement.ChildNodes
                .Where(t => t is IHtmlElement element && element.NodeName == "B")
                .ToArray();

            INode boldElName = boldElements.FirstOrDefault(t => t.TextContent.Contains("Название"));
            INode boldElYear = boldElements.FirstOrDefault(t => t.TextContent.Contains("Год"));
            INode boldElCountry = boldElements.FirstOrDefault(t => t.TextContent.Contains("Страна"));
            INode boldElGenres = boldElements.FirstOrDefault(t => t.TextContent.Contains("Жанр"));
            INode boldElDuration = boldElements.FirstOrDefault(t => t.TextContent.Contains("Продолжительность"));

            // The value is the node right after each bold label. A label that is the last node
            // has no next sibling, so that access is null-safe as well.
            // The values are not used further here; they are only extracted.
            string filmTitle = boldElName?.NextSibling?.TextContent;
            string filmYear = boldElYear?.NextSibling?.TextContent;
            string filmCountry = boldElCountry?.NextSibling?.TextContent;
            string filmGenres = boldElGenres?.NextSibling?.TextContent;
            string filmDuration = boldElDuration?.NextSibling?.TextContent;

            // Example of the page content being read:
            //   Release year: 1999
            //   Country: USA
            //   Genre: detective, crime, drama, thriller
            //   Duration: 01:28:45
        }
    }
}
/// <summary>
/// Returns a newly created, empty <c>Table</c>.
/// </summary>
/// <param name="element">Source table element; not read by the current implementation.</param>
/// <returns>A new <c>Table</c> instance.</returns>
Table Render(IHtmlTableElement element) => new Table();
/// <summary>
/// Reads one test-case description from the cells of <paramref name="table"/>, resolves the
/// file it refers to and, when that file exists on disk, appends the entry to <paramref name="list"/>.
/// </summary>
/// <param name="table">Table whose <c>td</c> cells hold the entry; cells 2, 4, 6 and 7 are read.</param>
/// <param name="list">Collection that receives the entry when its file exists.</param>
/// <remarks>
/// Indexing throws when the table has fewer than eight cells; that behaviour is unchanged.
/// </remarks>
static void Inspect(IHtmlTableElement table, List<Entry> list)
{
    var cells = table.QuerySelectorAll("table td");

    var entry = new Entry();
    entry.Rules = cells[2].TextContent;
    entry.FileName = cells[4].TextContent;
    entry.Collection = cells[6].TextContent;
    entry.Text = cells[7].TextContent.Trim();

    // Collection names are fixed identifiers, so the prefix test is ordinal rather than
    // the culture-sensitive default of StartsWith(string).
    string collection = entry.Collection;
    string path;

    if (collection.StartsWith("IBM", StringComparison.Ordinal))
    {
        path = IBMPath(entry.FileName);
    }
    else if (collection.StartsWith("James Clark", StringComparison.Ordinal))
    {
        path = XmlTestPath(entry.FileName);
    }
    else if (collection.StartsWith("Sun", StringComparison.Ordinal))
    {
        path = SunPath(entry.FileName);
    }
    else if (collection.StartsWith("OASIS", StringComparison.Ordinal))
    {
        path = OasisPath(entry.FileName);
    }
    else if (collection.StartsWith("Fuji", StringComparison.Ordinal))
    {
        path = XmlFujiPath(entry.FileName);
    }
    else
    {
        // Every other collection is resolved through EdUniPath.
        path = EdUniPath(entry.FileName);
    }

    // Entries whose file is missing are silently skipped.
    if (!File.Exists(path))
    {
        return;
    }

    entry.Content = File.ReadAllText(path);
    list.Add(entry);
}
/// <summary>
/// Returns the rows of the first <c>tbody</c> found inside <paramref name="table"/>.
/// </summary>
/// <param name="table">Table to read the body rows from.</param>
/// <returns>A new list holding the rows of that <c>tbody</c>.</returns>
/// <remarks>
/// When the table has no <c>tbody</c>, the selector yields <c>null</c> and reading its rows
/// fails with a null reference.
/// </remarks>
private static List<IHtmlTableRowElement> getRows(IHtmlTableElement table)
{
    var section = table.QuerySelector("tbody");
    var body = (IHtmlTableSectionElement)section;
    return body.Rows.ToList();
}
/// <summary>
/// Wraps construction of an <c>HtmlTable</c> from <paramref name="e"/> in a <c>Try</c>.
/// </summary>
/// <param name="e">Table element passed to the <c>HtmlTable</c> constructor.</param>
/// <returns>A <c>Try</c> built around the factory delegate that constructs the table.</returns>
public static Try<HtmlTable, HtmlElementException> Create(IHtmlTableElement e)
{
    // The constructor call is handed over as a delegate rather than invoked here.
    return new Try<HtmlTable, HtmlElementException>(() => new HtmlTable(e));
}
/// <summary>
/// Parses an HTML financial statement into a <c>FinancialInfo</c>: the value multiplier, the
/// reference date taken from the header row, and one item per account row.
/// </summary>
/// <param name="content">HTML of the statement page.</param>
/// <param name="linktype">Source layout; <c>Bovespa</c> pages and all other (CVM) pages are located differently.</param>
/// <param name="categoria">Category stored on the result.</param>
/// <param name="tipo">Type stored on the result.</param>
/// <returns>The parsed information, or <c>null</c> when the page states that the data does not exist.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when the expected container, title or table element is missing from the page.
/// Previously these cases failed with a null reference.
/// </exception>
private FinancialInfo ParseFinInfo(string content, DocLinkInfo.LinkTypeEnum linktype, FinInfoCategoria categoria, FinInfoTipo tipo)
{
    var parser = new HtmlParser();
    var doc = parser.Parse(content);

    FinancialInfo finInfo = new FinancialInfo();
    finInfo.Categoria = categoria;
    finInfo.Tipo = tipo;

    bool isBovespa = linktype == DocLinkInfo.LinkTypeEnum.Bovespa;
    IHtmlTableElement table;

    if (isBovespa)
    {
        var div = doc.QuerySelector("div.ScrollMaker");
        if (div == null)
        {
            var scripts = doc.QuerySelectorAll("script");
            if (scripts.Any(s => s.TextContent.Contains("Não Possui Dados para Carregar a Página")))
            {
                // The data does not exist.
                return null;
            }

            // Neither the data container nor the "no data" marker is present: unknown layout.
            throw new InvalidOperationException("Element 'div.ScrollMaker' was not found in the Bovespa page.");
        }

        table = div.FirstElementChild as IHtmlTableElement;

        // The element preceding the container holds the line with the multiplier.
        var multiplierText = div.PreviousElementSibling?.TextContent;
        if (multiplierText != null && multiplierText.Contains("Mil"))
        {
            finInfo.Multiplicador = 1000;
        }
    }
    else // CVM
    {
        var title = doc.QuerySelector("#TituloTabelaSemBorda");
        if (title == null)
        {
            throw new InvalidOperationException("Element '#TituloTabelaSemBorda' was not found in the CVM page.");
        }

        if (title.TextContent.Contains("Reais Mil"))
        {
            finInfo.Multiplicador = 1000;
        }

        table = title.NextElementSibling as IHtmlTableElement;
        if (table != null && table.InnerHtml.Contains("Justificativa para a não prestação da informação"))
        {
            // The data does not exist.
            return null;
        }
    }

    if (table == null)
    {
        throw new InvalidOperationException("The financial data table was not found in the page.");
    }

    foreach (var row in table.Rows)
    {
        // The header row is marked by valign="top" on Bovespa pages and by a first cell
        // reading "Conta" on CVM pages.
        bool isTopLine = isBovespa
            ? row.GetAttribute("valign") == "top"
            : row.Cells[0].TextContent.Trim() == "Conta";

        if (isTopLine)
        {
            // Take the date from the third cell, e.g.
            // "Valor do Trimestre Atual 01/04/2009 a 30/06/2009".
            var text = row.Cells[2].TextContent;
            var iUltimoNum = text.LastIndexOfNum();

            // The date is the 10 characters ending at the index returned by LastIndexOfNum.
            var start = iUltimoNum - 9;
            var datetext = text.Substring(start, 10).Trim();
            finInfo.Data = DateTime.ParseExact(datetext, "dd/MM/yyyy", new CultureInfo("pt-BR"));
        }
        else
        {
            var codconta = row.Cells[0].TextContent;
            var nomeconta = row.Cells[1].TextContent;
            var valortext = row.Cells[2].TextContent;

            FinancialItem item = new FinancialItem();
            item.Conta = codconta.Trim();
            item.Nome = nomeconta.Trim();
            item.Valor = ParseValor(valortext.Trim());
            finInfo.Items.Add(item);
        }
    }

    return finInfo;
}