/// <summary>
/// Extracts the "verwässertes Ergebnis pro Aktie" (diluted earnings per share) series from the
/// "ariva.html" fixture via a path that points at a single table cell, then checks the six
/// extracted values (column 0) and their year headers (column 1).
/// </summary>
public void ExtractEpsFromAriva()
{
    var cellPath = HtmlPath.Parse("/BODY[0]/DIV[5]/DIV[0]/DIV[1]/TABLE[7]/TBODY[0]/TR[6]/TD[1]");

    // Initializer entries are listed in the same order the properties were assigned before.
    var extraction = new TableExtractionSettings
    {
        RowHeaderColumn = 0,
        ColumnHeaderRow = 1,
        Dimension = CellDimension.Row,
        SeriesName = "verwässertes Ergebnis pro Aktie",
        SeriesHeaderType = typeof(int),
        SeriesValueType = typeof(float)
    };

    var document = LoadDocument("ariva.html");
    var outcome = document.ExtractTable(cellPath, extraction, new HtmlExtractionSettings());

    Assert.IsTrue(outcome.Success);

    var series = outcome.Value;
    series.Dump();

    float[] expectedValues = { 2.78f, 3.00f, 2.89f, 3.30f, 3.33f, 4.38f };
    int[] expectedYears = { 2001, 2002, 2003, 2004, 2005, 2006 };

    Assert.AreEqual(6, series.Rows.Count);

    // All values are verified first, then all headers.
    for (int row = 0; row < expectedValues.Length; row++)
    {
        Assert.AreEqual(expectedValues[row], series.Rows[row][0]);
    }

    for (int row = 0; row < expectedYears.Length; row++)
    {
        Assert.AreEqual(expectedYears[row], series.Rows[row][1]);
    }
}
/// <summary>
/// Builds the <see cref="TableExtractionSettings"/> that correspond to this instance.
/// The SeriesName is intentionally left unset, so series name validation is not enabled here.
/// </summary>
/// <returns>
/// Settings carrying the expand dimension, the header positions and the skipped rows/columns.
/// </returns>
public TableExtractionSettings ToExtractionSettings()
{
    // When expanding along columns the series name position is used as the header row and the
    // time axis position as the header column; otherwise the two roles are swapped.
    bool expandsByColumn = Expand == CellDimension.Column;

    var extraction = new TableExtractionSettings
    {
        Dimension = Expand,
        ColumnHeaderRow = expandsByColumn ? SeriesNamePosition : TimeAxisPosition,
        RowHeaderColumn = expandsByColumn ? TimeAxisPosition : SeriesNamePosition,
        SkipColumns = SkipColumns,
        SkipRows = SkipRows
    };

    // Do not enable validation here: SeriesName stays at its default.
    return extraction;
}
/// <summary>
/// Reads the "verwässertes Ergebnis pro Aktie" (diluted earnings per share) series out of the
/// "ariva.html" fixture, addressing it through a table-cell path, and asserts the six values
/// in column 0 followed by the six year headers in column 1.
/// </summary>
public void ExtractEpsFromAriva()
{
    var cellPath = HtmlPath.Parse( "/BODY[0]/DIV[5]/DIV[0]/DIV[1]/TABLE[7]/TBODY[0]/TR[6]/TD[1]" );

    // Same assignment order as a statement-by-statement setup would have.
    var extraction = new TableExtractionSettings
    {
        RowHeaderColumn = 0,
        ColumnHeaderRow = 1,
        Dimension = CellDimension.Row,
        SeriesName = "verwässertes Ergebnis pro Aktie",
        SeriesHeaderType = typeof( int ),
        SeriesValueType = typeof( float )
    };

    var document = LoadDocument( "ariva.html" );
    var outcome = document.ExtractTable( cellPath, extraction, new HtmlExtractionSettings() );

    Assert.IsTrue( outcome.Success );

    var series = outcome.Value;
    series.Dump();

    float[] expectedValues = { 2.78f, 3.00f, 2.89f, 3.30f, 3.33f, 4.38f };
    int[] expectedYears = { 2001, 2002, 2003, 2004, 2005, 2006 };

    Assert.AreEqual( 6, series.Rows.Count );

    // Values first, headers afterwards.
    for ( int row = 0; row < expectedValues.Length; row++ )
    {
        Assert.AreEqual( expectedValues[ row ], series.Rows[ row ][ 0 ] );
    }

    for ( int row = 0; row < expectedYears.Length; row++ )
    {
        Assert.AreEqual( expectedYears[ row ], series.Rows[ row ][ 1 ] );
    }
}
/// <summary>
/// Translates this instance into <see cref="TableExtractionSettings"/>.
/// SeriesName is deliberately not assigned, so series name validation is not enabled here.
/// </summary>
/// <returns>
/// Settings carrying the expand dimension, the header positions and the skipped rows/columns.
/// </returns>
public TableExtractionSettings ToExtractionSettings()
{
    // Column expansion: series name position -> header row, time axis position -> header column.
    // Any other expansion swaps the two roles.
    bool expandsByColumn = ( Expand == CellDimension.Column );

    var extraction = new TableExtractionSettings
    {
        Dimension = Expand,
        ColumnHeaderRow = expandsByColumn ? SeriesNamePosition : TimeAxisPosition,
        RowHeaderColumn = expandsByColumn ? TimeAxisPosition : SeriesNamePosition,
        SkipColumns = SkipColumns,
        SkipRows = SkipRows
    };

    // Do not enable validation here: SeriesName stays at its default.
    return extraction;
}
/// <summary>
/// Extracts a cell or a series of the HTML table the given path is pointing to.
/// If the path points to the table element itself instead of a cell, the whole table is
/// returned.
/// The series to be extracted is always arranged in a column (independent of the original layout
/// in the HTML table). The first column contains the values, the second the series header (if
/// any defined). The series name is stored in the ColumnName of the first column.
/// </summary>
/// <remarks>
/// Currently thead and tfoot cannot be handled.
/// NOTE(review): earlier documentation stated that null is returned if the table is not found
/// by the path; this overload only passes the result of the underlying table lookup through
/// unchanged - confirm against that overload.
/// </remarks>
/// <param name="doc">the HTML document</param>
/// <param name="path">points to a cell or the body of a table (pointers to TR elements are invalid)</param>
/// <param name="tableSettings">the table specific configuration; only used when the path points to a cell</param>
/// <param name="htmlSettings">the HTML settings used to configure the extraction process</param>
/// <returns>
/// The result of the underlying table lookup if that lookup failed or the path points to the
/// table itself; otherwise a success result holding the extracted series, or a failure result
/// if the series could not be extracted.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="path"/> or <paramref name="htmlSettings"/> is null.
/// </exception>
/// <exception cref="InvalidExpressionException">
/// The path points neither to a table nor to a table cell, or the cell position inside the
/// table could not be calculated from the path.
/// </exception>
public static FallibleActionResult<DataTable> ExtractTable(this IHtmlDocument doc, HtmlPath path, TableExtractionSettings tableSettings, HtmlExtractionSettings htmlSettings)
{
    // Both arguments are dereferenced unconditionally below; fail with a descriptive exception
    // instead of a bare NullReferenceException. ReferenceEquals is used so the check does not
    // depend on any user-defined equality operator.
    if (object.ReferenceEquals(path, null))
    {
        throw new ArgumentNullException("path");
    }

    if (object.ReferenceEquals(htmlSettings, null))
    {
        throw new ArgumentNullException("htmlSettings");
    }

    if (!path.PointsToTable && !path.PointsToTableCell)
    {
        throw new InvalidExpressionException("Path neither points to table nor to cell");
    }

    // The underlying lookup receives the negated ExtractLinkUrl flag.
    FallibleActionResult<DataTable> result = ExtractTable(doc, path, !htmlSettings.ExtractLinkUrl);
    if (!result.Success)
    {
        // pass the failure result through unchanged
        return result;
    }

    // path points to whole table => return whole table
    if (path.PointsToTable)
    {
        return result;
    }

    // get the x,y position of the cell the path is pointing to
    Point cellCoords = path.GetTableCellPosition();
    if (cellCoords.X < 0 || cellCoords.Y < 0)
    {
        throw new InvalidExpressionException("Path expression corrupt: cell position in table could not be calculated");
    }

    // Get the value of the raw cell. If link extraction is configured, the raw cell is treated
    // as an IHtmlElement and its first link or inner text is used; otherwise it is used as is.
    Func<object, object> getValue = e =>
    {
        if (htmlSettings.ExtractLinkUrl)
        {
            return ((IHtmlElement)e).FirstLinkOrInnerText();
        }
        else
        {
            return e;
        }
    };

    var series = result.Value.ExtractSeries(cellCoords, getValue, tableSettings);
    if (series == null)
    {
        return FallibleActionResult<DataTable>.CreateFailureResult("Could not extract series specified");
    }

    return FallibleActionResult<DataTable>.CreateSuccessResult(series);
}