Example #1
0
        /// <summary>
        /// Extracts a table from <c>Content</c> as described by the given format.
        /// </summary>
        /// <remarks>
        /// The format types are checked in this order: <c>PathSeriesFormat</c>,
        /// <c>PathTableFormat</c>, <c>PathSingleValueFormat</c>. For a single value
        /// format the text found at the path is converted and wrapped into a table
        /// via <c>CreateTableForScalar</c>.
        /// </remarks>
        /// <param name="format">the format describing what to extract and how to format it</param>
        /// <returns>the extracted table</returns>
        /// <exception cref="ArgumentNullException">if <paramref name="format"/> is null</exception>
        /// <exception cref="NotSupportedException">if the type of <paramref name="format"/> is none of the handled ones</exception>
        /// <exception cref="Exception">if the table extraction reports a failure</exception>
        public DataTable ExtractTable( IFormat format )
        {
            // Without this guard a null format would fall through all branches and fail with a
            // NullReferenceException while building the "not supported" message.
            if ( format == null )
            {
                throw new ArgumentNullException( "format" );
            }

            var pathSeriesFormat = format as PathSeriesFormat;
            if ( pathSeriesFormat != null )
            {
                var htmlSettings = new HtmlExtractionSettings();
                htmlSettings.ExtractLinkUrl = pathSeriesFormat.ExtractLinkUrl;

                var result = Content.ExtractTable( HtmlPath.Parse( pathSeriesFormat.Path ), pathSeriesFormat.ToExtractionSettings(), htmlSettings );
                if ( !result.Success )
                {
                    throw new Exception( "Failed to extract table from document: " + result.FailureReason );
                }

                return pathSeriesFormat.ToFormattedTable( result.Value );
            }

            var pathTableFormat = format as PathTableFormat;
            if ( pathTableFormat != null )
            {
                var result = Content.ExtractTable( HtmlPath.Parse( pathTableFormat.Path ), true );
                if ( !result.Success )
                {
                    throw new Exception( "Failed to extract table from document: " + result.FailureReason );
                }

                return pathTableFormat.ToFormattedTable( result.Value );
            }

            var singleValueFormat = format as PathSingleValueFormat;
            if ( singleValueFormat != null )
            {
                var str = Content.GetTextByPath( HtmlPath.Parse( singleValueFormat.Path ) );
                var value = singleValueFormat.ValueFormat.Convert( str );

                // XXX: the interface requires a DataTable, so the scalar has to be wrapped into a table
                return CreateTableForScalar( singleValueFormat.ValueFormat.Type, value );
            }

            throw new NotSupportedException( "Format not supported for Html documents: " + format.GetType() );
        }
Example #2
0
        /// <summary>
        /// Extracts a cell or a series of the html table the given path is pointing to.
        /// If the path points to the table element itself instead of a cell, the whole table
        /// is returned.
        /// The series to be extracted is always arranged in a column (independent of the original
        /// layout in the html table). The first column contains the values, the second the series
        /// header (if any defined). The series name is stored in the ColumnName of the first column.
        /// </summary>
        /// <remarks>
        /// Currently thead and tfoot cannot be handled.
        /// </remarks>
        /// <param name="doc">the HTML document</param>
        /// <param name="path">points to a cell or the body of a table (pointers to TR elements are invalid)</param>
        /// <param name="tableSettings">the table specific configuration</param>
        /// <param name="htmlSettings">the HTML settings used to configure the extraction process</param>
        /// <returns>
        /// The result of the underlying table extraction if that failed or if the path points to
        /// the table itself; otherwise a success result holding the extracted series, or a failure
        /// result if the series could not be extracted.
        /// </returns>
        /// <exception cref="InvalidExpressionException">
        /// if the path points neither to a table nor to a cell, or if the cell position cannot be
        /// calculated from the path
        /// </exception>
        public static FallibleActionResult<DataTable> ExtractTable(this IHtmlDocument doc, HtmlPath path, TableExtractionSettings tableSettings, HtmlExtractionSettings htmlSettings)
        {
            bool pointsToSupportedElement = path.PointsToTable || path.PointsToTableCell;
            if (!pointsToSupportedElement)
            {
                throw new InvalidExpressionException("Path neither points to table nor to cell");
            }

            FallibleActionResult<DataTable> tableResult = ExtractTable(doc, path, !htmlSettings.ExtractLinkUrl);

            // A failure is passed through unchanged; a path to the table itself yields the whole table.
            if (!tableResult.Success || path.PointsToTable)
            {
                return tableResult;
            }

            // x,y position of the cell the path is pointing to
            Point position = path.GetTableCellPosition();
            if (position.X < 0 || position.Y < 0)
            {
                throw new InvalidExpressionException("Path expression corrupt: cell position in table could not be calculated");
            }

            // Maps a raw cell to its value; the link url is extracted if configured.
            Func<object, object> cellValueOf = cell =>
            {
                if (!htmlSettings.ExtractLinkUrl)
                {
                    return cell;
                }

                return ((IHtmlElement)cell).FirstLinkOrInnerText();
            };

            var series = tableResult.Value.ExtractSeries(position, cellValueOf, tableSettings);
            if (series == null)
            {
                return FallibleActionResult<DataTable>.CreateFailureResult("Could not extract series specified");
            }

            return FallibleActionResult<DataTable>.CreateSuccessResult(series);
        }