Beispiel #1
0
        // Extracts the "verwässertes Ergebnis pro Aktie" (diluted earnings per share) series
        // from the stored ariva.html page and verifies values and headers of the result.
        public void ExtractEpsFromAriva()
        {
            // Expected content of the extracted series: column 0 holds the values,
            // column 1 holds the matching series headers.
            float[] expectedValues  = { 2.78f, 3.00f, 2.89f, 3.30f, 3.33f, 4.38f };
            int[]   expectedHeaders = { 2001, 2002, 2003, 2004, 2005, 2006 };

            var cellPath = HtmlPath.Parse("/BODY[0]/DIV[5]/DIV[0]/DIV[1]/TABLE[7]/TBODY[0]/TR[6]/TD[1]");

            var seriesSettings = new TableExtractionSettings
            {
                RowHeaderColumn  = 0,
                ColumnHeaderRow  = 1,
                Dimension        = CellDimension.Row,
                SeriesName       = "verwässertes Ergebnis pro Aktie",
                SeriesHeaderType = typeof(int),
                SeriesValueType  = typeof(float)
            };

            var extraction = LoadDocument("ariva.html").ExtractTable(cellPath, seriesSettings, new HtmlExtractionSettings());

            Assert.IsTrue(extraction.Success);

            var series = extraction.Value;

            series.Dump();

            Assert.AreEqual(6, series.Rows.Count);

            // all values first ...
            for (int row = 0; row < expectedValues.Length; row++)
            {
                Assert.AreEqual(expectedValues[row], series.Rows[row][0]);
            }

            // ... then all headers
            for (int row = 0; row < expectedHeaders.Length; row++)
            {
                Assert.AreEqual(expectedHeaders[row], series.Rows[row][1]);
            }
        }
Beispiel #2
0
        /// <summary>
        /// Creates the <see cref="TableExtractionSettings"/> which correspond to this instance.
        /// The header positions are assigned depending on <c>Expand</c>: for
        /// <c>CellDimension.Column</c> the series name position becomes the column header row and
        /// the time axis position the row header column; otherwise the two are swapped.
        /// The SeriesName is left unset, so SeriesName validation is not enabled here.
        /// </summary>
        /// <returns>the newly created settings</returns>
        public TableExtractionSettings ToExtractionSettings()
        {
            var extraction = new TableExtractionSettings { Dimension = Expand };

            if (Expand != CellDimension.Column)
            {
                extraction.ColumnHeaderRow = TimeAxisPosition;
                extraction.RowHeaderColumn = SeriesNamePosition;
            }
            else
            {
                extraction.ColumnHeaderRow = SeriesNamePosition;
                extraction.RowHeaderColumn = TimeAxisPosition;
            }

            extraction.SkipColumns = SkipColumns;
            extraction.SkipRows    = SkipRows;

            // SeriesName is deliberately not assigned: validation must stay disabled here.
            return extraction;
        }
        // Extracts the "verwässertes Ergebnis pro Aktie" (diluted earnings per share) series
        // from the stored ariva.html page and checks both columns of the resulting table.
        public void ExtractEpsFromAriva()
        {
            var cellPath = HtmlPath.Parse( "/BODY[0]/DIV[5]/DIV[0]/DIV[1]/TABLE[7]/TBODY[0]/TR[6]/TD[1]" );

            var seriesSettings = new TableExtractionSettings
            {
                RowHeaderColumn = 0,
                ColumnHeaderRow = 1,
                Dimension = CellDimension.Row,
                SeriesName = "verwässertes Ergebnis pro Aktie",
                SeriesHeaderType = typeof( int ),
                SeriesValueType = typeof( float )
            };

            var document = LoadDocument( "ariva.html" );
            var extraction = document.ExtractTable( cellPath, seriesSettings, new HtmlExtractionSettings() );

            Assert.IsTrue( extraction.Success );

            var series = extraction.Value;
            series.Dump();

            Assert.AreEqual( 6, series.Rows.Count );

            // compares one column of the extracted table against the expected cells, top to bottom
            Action<int, object[]> assertColumn = ( column, expectedCells ) =>
            {
                for ( int row = 0; row < expectedCells.Length; row++ )
                {
                    Assert.AreEqual( expectedCells[ row ], series.Rows[ row ][ column ] );
                }
            };

            // column 0: series values, column 1: series headers
            assertColumn( 0, new object[] { 2.78f, 3.00f, 2.89f, 3.30f, 3.33f, 4.38f } );
            assertColumn( 1, new object[] { 2001, 2002, 2003, 2004, 2005, 2006 } );
        }
Beispiel #4
0
        /// <summary>
        /// Translates this instance into <see cref="TableExtractionSettings"/>.
        /// Which of SeriesNamePosition / TimeAxisPosition ends up as column header row or as
        /// row header column is decided by the value of Expand.
        /// No SeriesName is assigned, so SeriesName validation is not enabled here.
        /// </summary>
        /// <returns>the newly created settings</returns>
        public TableExtractionSettings ToExtractionSettings()
        {
            var result = new TableExtractionSettings();
            result.Dimension = Expand;

            bool expandsByColumn = ( Expand == CellDimension.Column );
            if ( expandsByColumn )
            {
                // series names are found in a row, the time axis in a column
                result.ColumnHeaderRow = SeriesNamePosition;
                result.RowHeaderColumn = TimeAxisPosition;
            }
            else
            {
                // time axis is found in a row, the series names in a column
                result.ColumnHeaderRow = TimeAxisPosition;
                result.RowHeaderColumn = SeriesNamePosition;
            }

            result.SkipColumns = SkipColumns;
            result.SkipRows = SkipRows;

            // intentionally no SeriesName here: validation must not be enabled
            return result;
        }
Beispiel #5
0
        /// <summary>
        /// Extracts a cell or a series of the HTML table the given path is pointing to.
        /// If the path points to the table element itself instead of a cell, the whole table is
        /// returned.
        /// An extracted series is always arranged in a column (independent of the original layout
        /// in the HTML table). The first column contains the values, the second the series header (if
        /// any is defined). The series name is stored in the ColumnName of the first column.
        /// </summary>
        /// <remarks>
        /// Currently thead and tfoot cannot be handled.
        /// </remarks>
        /// <param name="doc">the HTML document</param>
        /// <param name="path">points to a cell or the body of a table (pointers to TR elements are invalid)</param>
        /// <param name="tableSettings">the table specific configuration</param>
        /// <param name="htmlSettings">the HTML settings used to configure the extraction process</param>
        /// <returns>
        /// A success result carrying the whole table (path points to a table) or the extracted
        /// series (path points to a cell). A failure result if the underlying table extraction
        /// failed (passed through unchanged) or if the series could not be extracted.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="path"/> or <paramref name="htmlSettings"/> is null.
        /// </exception>
        /// <exception cref="InvalidExpressionException">
        /// The path points neither to a table nor to a table cell, or the position of the cell
        /// within the table could not be calculated from the path.
        /// </exception>
        public static FallibleActionResult<DataTable> ExtractTable(this IHtmlDocument doc, HtmlPath path, TableExtractionSettings tableSettings, HtmlExtractionSettings htmlSettings)
        {
            // Both arguments are dereferenced below; fail with a descriptive exception instead of
            // a NullReferenceException. Reference checks are used so that the validation does not
            // depend on any equality operator the types might define.
            if (object.ReferenceEquals(path, null))
            {
                throw new ArgumentNullException("path");
            }
            if (object.ReferenceEquals(htmlSettings, null))
            {
                throw new ArgumentNullException("htmlSettings");
            }

            if (!path.PointsToTable && !path.PointsToTableCell)
            {
                throw new InvalidExpressionException("Path neither points to table nor to cell");
            }

            FallibleActionResult<DataTable> result = ExtractTable(doc, path, !htmlSettings.ExtractLinkUrl);

            if (!result.Success)
            {
                // pass through the failure result
                return result;
            }

            // path points to whole table => return whole table
            if (path.PointsToTable)
            {
                return result;
            }

            // get the x,y position of the cell the path is pointing to
            Point cellCoords = path.GetTableCellPosition();

            if (cellCoords.X < 0 || cellCoords.Y < 0)
            {
                throw new InvalidExpressionException("Path expression corrupt: cell position in table could not be calculated");
            }

            // get the value of the raw cell. extract the link url if configured.
            Func<object, object> getValue = e =>
            {
                if (htmlSettings.ExtractLinkUrl)
                {
                    return ((IHtmlElement)e).FirstLinkOrInnerText();
                }

                return e;
            };

            var series = result.Value.ExtractSeries(cellCoords, getValue, tableSettings);

            if (series == null)
            {
                return FallibleActionResult<DataTable>.CreateFailureResult("Could not extract series specified");
            }

            return FallibleActionResult<DataTable>.CreateSuccessResult(series);
        }