/// <summary>
/// Extracts data from <c>Content</c> as described by the given format and returns it as a
/// <see cref="DataTable"/>.
/// </summary>
/// <remarks>
/// Three kinds of formats are handled, checked in this order:
/// <c>PathSeriesFormat</c> (series extraction, optionally extracting link URLs),
/// <c>PathTableFormat</c> (table extraction) and <c>PathSingleValueFormat</c> (a single
/// value read by path, converted and wrapped into a table).
/// </remarks>
/// <param name="format">the format describing what to extract; must not be null</param>
/// <returns>the extracted data, post-processed by the format where the format supports it</returns>
/// <exception cref="ArgumentNullException">if <paramref name="format"/> is null</exception>
/// <exception cref="NotSupportedException">if the concrete format type is not one of the handled kinds</exception>
/// <exception cref="Exception">if the underlying table extraction reports a failure</exception>
public DataTable ExtractTable( IFormat format )
{
    // Without this guard a null format would fall through every type check and fail with a
    // NullReferenceException while the "not supported" message is being built.
    if ( format == null )
    {
        throw new ArgumentNullException( "format" );
    }

    PathSeriesFormat pathSeriesFormat = format as PathSeriesFormat;
    if ( pathSeriesFormat != null )
    {
        var htmlSettings = new HtmlExtractionSettings();
        htmlSettings.ExtractLinkUrl = pathSeriesFormat.ExtractLinkUrl;

        var result = Content.ExtractTable( HtmlPath.Parse( pathSeriesFormat.Path ), pathSeriesFormat.ToExtractionSettings(), htmlSettings );
        if ( !result.Success )
        {
            throw new Exception( "Failed to extract table from document: " + result.FailureReason );
        }

        return pathSeriesFormat.ToFormattedTable( result.Value );
    }

    PathTableFormat pathTableFormat = format as PathTableFormat;
    if ( pathTableFormat != null )
    {
        var result = Content.ExtractTable( HtmlPath.Parse( pathTableFormat.Path ), true );
        if ( !result.Success )
        {
            throw new Exception( "Failed to extract table from document: " + result.FailureReason );
        }

        return pathTableFormat.ToFormattedTable( result.Value );
    }

    PathSingleValueFormat singleValueFormat = format as PathSingleValueFormat;
    if ( singleValueFormat != null )
    {
        var str = Content.GetTextByPath( HtmlPath.Parse( singleValueFormat.Path ) );
        var value = singleValueFormat.ValueFormat.Convert( str );

        // XXX: this is really ugly - a table has to be created just to satisfy the interface,
        // even though only a single scalar value was extracted.
        return CreateTableForScalar( singleValueFormat.ValueFormat.Type, value );
    }

    throw new NotSupportedException( "Format not supported for Html documents: " + format.GetType() );
}
/// <summary>
/// Extracts a cell or a series of the HTML table the given path is pointing to.
/// If the path points to the table element itself instead of a cell, the whole table is
/// returned.
/// The series to be extracted is always arranged in a column (independent of the original
/// layout in the HTML table). The first column contains the values, the second the series
/// header (if any is defined). The series name is stored in the ColumnName of the first column.
/// </summary>
/// <remarks>
/// Currently thead and tfoot cannot be handled.
/// NOTE(review): earlier documentation stated that null is returned if the table is not found
/// by path; the code below passes a failure result of the underlying extraction through
/// unchanged instead - confirm how the underlying overload reports a missing table.
/// </remarks>
/// <param name="doc">the HTML document</param>
/// <param name="path">points to a cell or the body of a table (pointers to TR elements are invalid)</param>
/// <param name="tableSettings">the table specific configuration</param>
/// <param name="htmlSettings">the HTML settings used to configure the extraction process</param>
/// <returns>
/// The result of the underlying table extraction if that failed or if the path points to a
/// table; otherwise a success result holding the extracted series, or a failure result if the
/// series could not be extracted.
/// </returns>
/// <exception cref="InvalidExpressionException">
/// if the path points neither to a table nor to a table cell, or if the cell position encoded
/// in the path has a negative coordinate
/// </exception>
public static FallibleActionResult<DataTable> ExtractTable(this IHtmlDocument doc, HtmlPath path, TableExtractionSettings tableSettings, HtmlExtractionSettings htmlSettings)
{
    if (!(path.PointsToTable || path.PointsToTableCell))
    {
        throw new InvalidExpressionException("Path neither points to table nor to cell");
    }

    FallibleActionResult<DataTable> tableResult = ExtractTable(doc, path, !htmlSettings.ExtractLinkUrl);

    // A failure is passed through untouched; a path to the table itself means the
    // complete table is the answer, so no series needs to be cut out.
    if (!tableResult.Success || path.PointsToTable)
    {
        return tableResult;
    }

    // x,y position of the cell the path is pointing to
    Point cellPosition = path.GetTableCellPosition();
    if (cellPosition.X < 0 || cellPosition.Y < 0)
    {
        throw new InvalidExpressionException("Path expression corrupt: cell position in table could not be calculated");
    }

    // Maps a raw cell to its value: the link URL (or inner text) when link extraction
    // is configured, otherwise the raw cell unchanged.
    Func<object, object> resolveCellValue = rawCell =>
        htmlSettings.ExtractLinkUrl
            ? (object)((IHtmlElement)rawCell).FirstLinkOrInnerText()
            : rawCell;

    var series = tableResult.Value.ExtractSeries(cellPosition, resolveCellValue, tableSettings);
    if (series != null)
    {
        return FallibleActionResult<DataTable>.CreateSuccessResult(series);
    }

    return FallibleActionResult<DataTable>.CreateFailureResult("Could not extract series specified");
}