/// <summary>
/// Resolves <paramref name="path"/> inside <paramref name="doc"/> and returns the
/// HtmlTable for the element found there.
/// If the path is pointing into a table, the embedding table is returned.
/// If the path is not pointing to a table element null is returned.
/// </summary>
/// <param name="doc">The document in which the path is resolved.</param>
/// <param name="path">The path to the table or to an element inside it.</param>
/// <returns>
/// The result of <c>GetByElement</c> for the resolved element, or null when the
/// path does not resolve to any element of the document.
/// </returns>
public static HtmlTable GetByPath(IHtmlDocument doc, HtmlPath path)
{
    var element = doc.GetElementByPath(path);

    // Nothing found at the path: there is no table to look up.
    return element == null ? null : GetByElement(element);
}
/// <summary>
/// Returns a new HtmlPath pointing to the table element this instance is pointing into.
/// If this instance is not pointing into any table at all, null is returned.
/// </summary>
/// <returns>
/// The longest leading part of this path (possibly the full path) for which
/// <c>PointsToTable</c> is true, or null if no such part exists.
/// </returns>
public HtmlPath GetPathToTable()
{
    var candidate = new HtmlPath(Elements);

    while (candidate.Elements.Count > 0)
    {
        if (candidate.PointsToTable)
        {
            return candidate;
        }

        // Drop the last element and retry with the parent path.
        candidate = new HtmlPath(candidate.Elements.Take(candidate.Elements.Count - 1));
    }

    // The loop only ends once the path has been shortened to zero elements,
    // i.e. no part of the path points to a table.
    return null;
}
/// <summary>
/// Extracts the complete html table the given path is pointing to. If the path points
/// to a cell of a table the complete table is extracted still.
/// </summary>
/// <remarks>
/// Currently we cannot handle thead and tfoot. The number of columns is defined by the
/// html table row with the most cells.
/// </remarks>
/// <param name="doc">The document to extract the table from. Must not be null.</param>
/// <param name="path">The path pointing to the table or into it. Must not be null.</param>
/// <returns>
/// A new <see cref="DataTable"/> (invariant culture) with one row per html table row,
/// holding the inner text of each cell. Never null.
/// </returns>
/// <exception cref="Exception">
/// No table could be found by the given path, or the table found has no rows.
/// </exception>
private DataTable ExtractTable(IHtmlDocument doc, HtmlPath path)
{
    Contract.RequiresNotNull(doc, "doc");
    Contract.RequiresNotNull(path, "path");

    var htmlTable = HtmlTable.GetByPath(doc, path);
    if (htmlTable == null)
    {
        throw new Exception("Could not get table by path");
    }

    var table = new DataTable();
    try
    {
        // TODO: should we get the culture from the HTML page somehow?
        table.Locale = CultureInfo.InvariantCulture;

        foreach (var tr in htmlTable.Rows)
        {
            var rowData = tr.Children
                .Where(td => htmlTable.IsCell(td))
                .Select(td => td.InnerText)
                .ToList();

            // Grow the table so it is as wide as the widest row seen so far.
            if (rowData.Count > table.Columns.Count)
            {
                (rowData.Count - table.Columns.Count).Times(x => table.Columns.Add(string.Empty, typeof(object)));
            }

            var row = table.NewRow();
            table.Rows.Add(row);

            // NOTE(review): AcceptChanges runs before the cell values are assigned, so the
            // assignments below are presumably tracked as pending changes. Kept as-is to
            // preserve behavior - confirm whether this ordering is intended.
            table.AcceptChanges();

            for (int i = 0; i < rowData.Count; ++i)
            {
                row[i] = rowData[i];
            }
        }

        if (table.Rows.Count == 0)
        {
            throw new Exception("Table was empty");
        }

        return table;
    }
    catch
    {
        // The table never reaches the caller on any failure path, so release it here.
        table.Dispose();
        throw;
    }
}
/// <summary>
/// Extracts the data described by the descriptor of this instance from the html document
/// and returns it as a <see cref="DataTable"/>.
/// </summary>
/// <remarks>
/// For a <c>PathSeriesDescriptor</c> or <c>PathTableDescriptor</c> the html table is extracted
/// and passed through <c>TableFormatter.ToFormattedTable</c>. For a <c>PathCellDescriptor</c>
/// or <c>PathSingleValueDescriptor</c> a single value is determined and wrapped into a table
/// via <c>CreateTableForScalar</c>.
/// </remarks>
/// <returns>The table produced for the descriptor.</returns>
/// <exception cref="NotSupportedException">
/// The descriptor is of none of the supported types.
/// </exception>
public DataTable ExtractTable()
{
    var pathSeriesDescriptor = myDescriptor as PathSeriesDescriptor;
    if (pathSeriesDescriptor != null)
    {
        var table = ExtractTable(myDocument, HtmlPath.Parse(pathSeriesDescriptor.Path));
        return TableFormatter.ToFormattedTable(pathSeriesDescriptor, table);
    }

    var pathTableDescriptor = myDescriptor as PathTableDescriptor;
    if (pathTableDescriptor != null)
    {
        var table = ExtractTable(myDocument, HtmlPath.Parse(pathTableDescriptor.Path));
        return TableFormatter.ToFormattedTable(pathTableDescriptor, table);
    }

    var pathCellDescriptor = myDescriptor as PathCellDescriptor;
    if (pathCellDescriptor != null)
    {
        // The extracted table is only needed to read a single value from it,
        // so it is disposed once the scalar result table has been created.
        using (var table = ExtractTable(myDocument, HtmlPath.Parse(pathCellDescriptor.Path)))
        {
            var value = TableFormatter.GetValue(pathCellDescriptor, table);

            // XXX: this is really ugly - i have to create a table just to satisfy the interface :(
            return CreateTableForScalar(pathCellDescriptor.ValueFormat.Type, value);
        }
    }

    var pathSingleValueDescriptor = myDescriptor as PathSingleValueDescriptor;
    if (pathSingleValueDescriptor != null)
    {
        var e = myDocument.GetElementByPath(HtmlPath.Parse(pathSingleValueDescriptor.Path));

        // A path that resolves to no element is converted as a null string.
        var str = e == null ? null : e.InnerText;
        var value = pathSingleValueDescriptor.ValueFormat.Convert(str);

        // XXX: this is really ugly - i have to create a table just to satisfy the interface :(
        return CreateTableForScalar(pathSingleValueDescriptor.ValueFormat.Type, value);
    }

    throw new NotSupportedException("Format not supported for Html documents: " + myDescriptor.GetType());
}
/// <summary>
/// Returns the element specified by the given <see cref="HtmlPath"/>.
/// </summary>
/// <remarks>
/// Starts at the root obtained from the document body and, for each element of the path,
/// descends via <c>GetNthChildWithTag</c> using that element's tag name and position.
/// </remarks>
/// <param name="doc">The document to search in. Must not be null.</param>
/// <param name="path">The path to follow. Must not be null.</param>
/// <returns>
/// The element reached after following all path elements, or null as soon as one
/// path element cannot be resolved.
/// </returns>
public static IHtmlElement GetElementByPath(this IHtmlDocument doc, HtmlPath path)
{
    Contract.RequiresNotNull(doc, "doc");
    Contract.RequiresNotNull(path, "path");

    var node = doc.Body.GetRoot();

    foreach (var step in path.Elements)
    {
        node = GetNthChildWithTag(node, step.TagName, step.Position);
        if (node == null)
        {
            // Path cannot be followed any further; null is the result.
            break;
        }
    }

    return node;
}