/// <summary>
/// Resolves the given path to an element and returns the table embedding that element.
/// </summary>
/// <param name="doc">the HTML document to search in</param>
/// <param name="path">the path to a table or to an element inside a table</param>
/// <returns>
/// The result of <c>FindEmbeddingTable</c> for the resolved element, or null if the path
/// does not resolve to any element.
/// </returns>
public static HtmlTable GetTableByPath(this IHtmlDocument doc, HtmlPath path)
{
    var element = doc.GetElementByPath(path);

    // Nothing at that path => nothing to search a table from.
    return element == null ? null : element.FindEmbeddingTable();
}
/// <summary>
/// Gets the inner text of the element specified by the given <see cref="HtmlPath"/>.
/// </summary>
/// <param name="doc">the HTML document to search in</param>
/// <param name="path">the path to the element whose text is requested</param>
/// <returns>The element's <c>InnerText</c>, or null if the path does not resolve to an element.</returns>
public static string GetTextByPath(this IHtmlDocument doc, HtmlPath path)
{
    var element = doc.GetElementByPath(path);
    if (element != null)
    {
        return element.InnerText;
    }

    return null;
}
/// <summary>
/// A two-element path (body/h3) points neither to a table nor to a table cell and is
/// rendered with upper-case tag names.
/// </summary>
public void SimplePath()
{
    var body = new HtmlPathElement( "body", 0 );
    var heading = new HtmlPathElement( "h3", 0 );

    var path = new HtmlPath();
    path.Elements.Add( body );
    path.Elements.Add( heading );

    Assert.AreEqual( 2, path.Elements.Count );

    Assert.IsFalse( path.PointsToTableCell );
    Assert.IsFalse( path.PointsToTable );

    Assert.AreEqual( "H3", path.Last.TagName );
    Assert.AreEqual( "/BODY[0]/H3[0]", path.ToString() );
}
/// <summary>
/// Builds the <see cref="HtmlPath"/> that leads from the root down to the given element.
/// </summary>
/// <param name="element">the element to compute the path for</param>
/// <returns>
/// A path with one entry (tag name and child position) per visited element, ordered from
/// the outermost ancestor to <paramref name="element"/>. The topmost element (the one
/// without a parent) is not included.
/// </returns>
public static HtmlPath GetPath(this IHtmlElement element)
{
    element.Require(x => element != null);

    var result = new HtmlPath();

    // Walk upwards; every visited node is prepended so the final order is root-first.
    for (var node = element; node.Parent != null; node = node.Parent)
    {
        var step = new HtmlPathElement(node.TagName, node.GetChildPos());
        result.Elements.Insert(0, step);
    }

    return result;
}
/// <summary>
/// Resolves the given <see cref="HtmlPath"/> to an element of the document.
/// </summary>
/// <param name="doc">the HTML document to search in</param>
/// <param name="path">the path to follow, starting at the root obtained from the document body</param>
/// <returns>
/// The element the path ends at, or null if the root is missing or any step of the path
/// has no matching child.
/// </returns>
public static IHtmlElement GetElementByPath(this IHtmlDocument doc, HtmlPath path)
{
    doc.Require(d => doc != null);
    path.Require(p => path != null);

    var node = doc.Body.GetRoot();
    if (node == null)
    {
        return null;
    }

    // Descend one path step at a time; give up as soon as a step cannot be resolved.
    foreach (var step in path.Elements)
    {
        var child = node.GetChildAt(step.TagName, step.Position);
        if (child == null)
        {
            return null;
        }

        node = child;
    }

    return node;
}
/// <summary>
/// Computes the <see cref="HtmlPath"/> of the given element, from the root down to the element.
/// </summary>
/// <param name="element">the element to compute the path for</param>
/// <returns>
/// A path holding tag name and child position of the element and of each of its ancestors,
/// except for the topmost element which has no parent.
/// </returns>
public static HtmlPath GetPath( this IHtmlElement element )
{
    element.Require( x => element != null );

    var path = new HtmlPath();

    var node = element;
    var parent = node.Parent;
    while ( parent != null )
    {
        // Inserting at the front keeps the path ordered from root to leaf.
        path.Elements.Insert( 0, new HtmlPathElement( node.TagName, node.GetChildPos() ) );

        node = parent;
        parent = node.Parent;
    }

    return path;
}
/// <summary>
/// A path ending in a TD element is reported as pointing to a table cell (and not to a
/// table) and is rendered with upper-case tag names.
/// </summary>
public void TableCellPath()
{
    var table = new HtmlPathElement( "table", 0 );
    var row = new HtmlPathElement( "tr", 2 );
    var cell = new HtmlPathElement( "td", 4 );

    var path = new HtmlPath();
    path.Elements.Add( table );
    path.Elements.Add( row );
    path.Elements.Add( cell );

    Assert.AreEqual( 3, path.Elements.Count );

    Assert.IsTrue( path.PointsToTableCell );
    Assert.IsFalse( path.PointsToTable );

    Assert.AreEqual( "TD", path.Last.TagName );
    Assert.AreEqual( "/TABLE[0]/TR[2]/TD[4]", path.ToString() );
}
/// <summary>
/// A path ending in a TABLE element is reported as pointing to a table (and not to a
/// table cell) and is rendered with upper-case tag names.
/// </summary>
public void TablePath()
{
    var table = new HtmlPathElement( "table", 1 );

    var path = new HtmlPath();
    path.Elements.Add( table );

    Assert.AreEqual( 1, path.Elements.Count );

    Assert.IsFalse( path.PointsToTableCell );
    Assert.IsTrue( path.PointsToTable );

    Assert.AreEqual( "TABLE", path.Last.TagName );
    Assert.AreEqual( "/TABLE[1]", path.ToString() );
}
/// <summary>
/// Extracts a cell or a series of the HTML table the given path is pointing to.
/// If the path points to the table element itself instead of a cell, the whole table is
/// extracted.
/// </summary>
/// <remarks>
/// The series to be extracted is always arranged in a column (independent of the original
/// layout in the HTML table). The first column contains the values, the second the series
/// header (if any defined). The series name is stored in the ColumnName of the first column.
/// Currently we cannot handle thead and tfoot.
/// </remarks>
/// <param name="doc">the HTML document</param>
/// <param name="path">points to a cell or the body of a table (pointers to TR elements are invalid)</param>
/// <param name="tableSettings">the table specific configuration, handed over to the series extraction</param>
/// <param name="htmlSettings">the HTML settings used to configure the extraction process</param>
/// <returns>
/// A success result holding the whole table (path points to a table) or the extracted series
/// (path points to a cell). A failure result if the table could not be extracted or the
/// series could not be extracted from it.
/// </returns>
/// <exception cref="InvalidExpressionException">
/// The path points neither to a table nor to a table cell, or the cell position within the
/// table could not be calculated from the path.
/// </exception>
public static FallibleActionResult<DataTable> ExtractTable(this IHtmlDocument doc, HtmlPath path, TableExtractionSettings tableSettings, HtmlExtractionSettings htmlSettings)
{
    // Validate up front (same convention as the three-parameter overload): "path" and
    // "htmlSettings" are dereferenced below before any other check would catch a null.
    doc.Require(x => doc != null);
    path.Require(x => path != null);
    htmlSettings.Require(x => htmlSettings != null);

    if (!path.PointsToTable && !path.PointsToTableCell)
    {
        throw new InvalidExpressionException("Path neither points to table nor to cell");
    }

    // When link URLs are requested the raw elements are needed, so text-only mode is disabled.
    FallibleActionResult<DataTable> result = ExtractTable(doc, path, !htmlSettings.ExtractLinkUrl);
    if (!result.Success)
    {
        // pass through failure result
        return result;
    }

    // path points to whole table => return whole table
    if (path.PointsToTable)
    {
        return result;
    }

    // get the x,y position of the cell the path is pointing to
    Point cellCoords = path.GetTableCellPosition();
    if (cellCoords.X < 0 || cellCoords.Y < 0)
    {
        throw new InvalidExpressionException("Path expression corrupt: cell position in table could not be calculated");
    }

    // get the value of the raw cell. extract the link url if configured.
    Func<object, object> getValue = e =>
    {
        if (htmlSettings.ExtractLinkUrl)
        {
            return ((IHtmlElement)e).FirstLinkOrInnerText();
        }

        return e;
    };

    var series = result.Value.ExtractSeries(cellCoords, getValue, tableSettings);
    if (series == null)
    {
        return FallibleActionResult<DataTable>.CreateFailureResult("Could not extract series specified");
    }

    return FallibleActionResult<DataTable>.CreateSuccessResult(series);
}
/// <summary>
/// Extracts the complete HTML table the given path is pointing to. If the path points
/// to a cell of a table the complete table is extracted still.
/// </summary>
/// <remarks>
/// Only TD and TH children of each table row are taken over. The number of columns is
/// defined by the HTML table row with the most cells; all columns are of type object and
/// have an empty name. Currently we cannot handle thead and tfoot.
/// </remarks>
/// <param name="doc">the HTML document</param>
/// <param name="path">the path to the table</param>
/// <param name="textOnly">set this to true to get only the text of the cell, otherwise the
/// cell itself as HtmlElement is returned</param>
/// <returns>
/// A success result holding the extracted table, or a failure result if no table was found
/// by the path or the table has no rows.
/// </returns>
public static FallibleActionResult<DataTable> ExtractTable(this IHtmlDocument doc, HtmlPath path, bool textOnly)
{
    doc.Require(x => doc != null);
    path.Require(x => path != null);

    HtmlTable source = doc.GetTableByPath(path);
    if (source == null)
    {
        return FallibleActionResult<DataTable>.CreateFailureResult("Could not get table by path");
    }

    // TODO: should we get the culture from the HTML page somehow?
    var table = new DataTable { Locale = CultureInfo.InvariantCulture };

    foreach (var tr in source.Rows)
    {
        // collect the cells of this row, ignoring everything that is neither TD nor TH
        var cells = new List<IHtmlElement>();
        foreach (var child in tr.Children)
        {
            if (child.TagName != "TD" && child.TagName != "TH")
            {
                continue;
            }

            cells.Add(child);
        }

        // grow the table if this row is wider than all rows seen so far
        int missingColumns = cells.Count - table.Columns.Count;
        if (missingColumns > 0)
        {
            missingColumns.Times(x => table.Columns.Add(string.Empty, typeof(object)));
        }

        // the row is added and accepted first, its values are filled in afterwards
        DataRow row = table.NewRow();
        table.Rows.Add(row);
        table.AcceptChanges();

        cells.ForeachIndex((cell, idx) => row[idx] = textOnly ? (object)cell.InnerText : cell);
    }

    if (table.Rows.Count == 0)
    {
        table.Dispose();
        return FallibleActionResult<DataTable>.CreateFailureResult("Table was empty");
    }

    return FallibleActionResult<DataTable>.CreateSuccessResult(table);
}