Ejemplo n.º 1
0
        /// <summary>
        /// Resolves <paramref name="path"/> inside the document and returns the table which
        /// embeds the resolved element. A path pointing into a table therefore yields the
        /// surrounding table.
        /// </summary>
        /// <param name="doc">the HTML document to search</param>
        /// <param name="path">path to a table or to an element inside a table</param>
        /// <returns>
        /// Null if no element exists at <paramref name="path"/>; otherwise the result of
        /// <c>FindEmbeddingTable</c> for the resolved element.
        /// </returns>
        public static HtmlTable GetTableByPath(this IHtmlDocument doc, HtmlPath path)
        {
            var element = doc.GetElementByPath(path);

            // nothing found at the path => there is no table to look up
            return element == null ? null : element.FindEmbeddingTable();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Returns the <c>InnerText</c> of the element the given <see cref="HtmlPath"/> points to.
        /// </summary>
        /// <param name="doc">the HTML document to search</param>
        /// <param name="path">path to the element whose text is requested</param>
        /// <returns>the inner text of the element, or null if no element exists at the path</returns>
        public static string GetTextByPath(this IHtmlDocument doc, HtmlPath path)
        {
            var element = doc.GetElementByPath(path);

            if (element != null)
            {
                return element.InnerText;
            }

            // path could not be resolved
            return null;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// A two-step path (body, h3) that involves no table elements: it must report neither
        /// a table nor a table cell, and must expose upper-cased tag names.
        /// </summary>
        public void SimplePath()
        {
            // build the path /body/h3 from lower-case tag names
            var path = new HtmlPath();
            path.Elements.Add( new HtmlPathElement( "body", 0 ) );
            path.Elements.Add( new HtmlPathElement( "h3", 0 ) );

            // both steps are stored
            Assert.AreEqual( 2, path.Elements.Count );

            // no table semantics for this path
            Assert.IsFalse( path.PointsToTableCell );
            Assert.IsFalse( path.PointsToTable );

            // tag names are expected in upper case, positions in brackets
            Assert.AreEqual( "H3", path.Last.TagName );
            Assert.AreEqual( "/BODY[0]/H3[0]", path.ToString() );
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Builds the <see cref="HtmlPath"/> of the given element by following its
        /// <c>Parent</c> chain upwards.
        /// </summary>
        /// <remarks>
        /// The walk stops at the first element that has no parent; that topmost element
        /// does not get an entry in the path.
        /// </remarks>
        /// <param name="element">the element to compute the path for (checked via <c>Require</c>)</param>
        /// <returns>the path, ordered from the topmost step down to <paramref name="element"/></returns>
        public static HtmlPath GetPath(this IHtmlElement element)
        {
            element.Require(x => element != null);

            var result = new HtmlPath();
            var node = element;

            while (node.Parent != null)
            {
                var step = new HtmlPathElement(node.TagName, node.GetChildPos());

                // we walk bottom-up, so every new step has to go in front of the ones collected so far
                result.Elements.Insert(0, step);

                node = node.Parent;
            }

            return result;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Walks the document along the given <see cref="HtmlPath"/> and returns the element
        /// found at its end.
        /// </summary>
        /// <remarks>
        /// The walk starts at <c>doc.Body.GetRoot()</c>. Each path element selects a child of
        /// the current element by tag name and position.
        /// </remarks>
        /// <param name="doc">the HTML document to search (checked via <c>Require</c>)</param>
        /// <param name="path">the path to follow (checked via <c>Require</c>)</param>
        /// <returns>
        /// The element at the end of the path; null if the root or any step along the path
        /// cannot be resolved.
        /// </returns>
        public static IHtmlElement GetElementByPath(this IHtmlDocument doc, HtmlPath path)
        {
            doc.Require(d => doc != null);
            path.Require(p => path != null);

            var current = doc.Body.GetRoot();

            if (current != null)
            {
                foreach (var step in path.Elements)
                {
                    current = current.GetChildAt(step.TagName, step.Position);

                    if (current == null)
                    {
                        // dead end: the document has no such child
                        break;
                    }
                }
            }

            return current;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Computes the <see cref="HtmlPath"/> that leads to the given element.
        /// </summary>
        /// <remarks>
        /// Every element on the <c>Parent</c> chain that itself has a parent contributes one
        /// step (tag name plus child position). The element without a parent is not included.
        /// </remarks>
        /// <param name="element">the element to compute the path for (checked via <c>Require</c>)</param>
        /// <returns>the path with the topmost step first and <paramref name="element"/> last</returns>
        public static HtmlPath GetPath( this IHtmlElement element )
        {
            element.Require( x => element != null );

            var steps = new HtmlPath();

            // climb towards the root; prepend each step to keep top-down order
            for ( var node = element; node.Parent != null; node = node.Parent )
            {
                steps.Elements.Insert( 0, new HtmlPathElement( node.TagName, node.GetChildPos() ) );
            }

            return steps;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// A path ending in a TD element (table, tr, td) must be reported as pointing to a
        /// table cell, but not to a table.
        /// </summary>
        public void TableCellPath()
        {
            // build the path /table[0]/tr[2]/td[4] from lower-case tag names
            var path = new HtmlPath();
            path.Elements.Add( new HtmlPathElement( "table", 0 ) );
            path.Elements.Add( new HtmlPathElement( "tr", 2 ) );
            path.Elements.Add( new HtmlPathElement( "td", 4 ) );

            // all three steps are stored
            Assert.AreEqual( 3, path.Elements.Count );

            // the last step decides: cell yes, table no
            Assert.IsTrue( path.PointsToTableCell );
            Assert.IsFalse( path.PointsToTable );

            // tag names are expected in upper case, positions in brackets
            Assert.AreEqual( "TD", path.Last.TagName );
            Assert.AreEqual( "/TABLE[0]/TR[2]/TD[4]", path.ToString() );
        }
Ejemplo n.º 8
0
        /// <summary>
        /// A path consisting of a single TABLE element must be reported as pointing to a
        /// table, but not to a table cell.
        /// </summary>
        public void TablePath()
        {
            // build the path /table[1]
            var path = new HtmlPath();
            path.Elements.Add( new HtmlPathElement( "table", 1 ) );

            // exactly one step is stored
            Assert.AreEqual( 1, path.Elements.Count );

            // table yes, cell no
            Assert.IsFalse( path.PointsToTableCell );
            Assert.IsTrue( path.PointsToTable );

            // tag name is expected in upper case, position in brackets
            Assert.AreEqual( "TABLE", path.Last.TagName );
            Assert.AreEqual( "/TABLE[1]", path.ToString() );
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Extracts a cell or a series of the html table the given path is pointing to.
        /// If the path points to the table element itself instead of a cell the whole table will be
        /// extracted.
        /// </summary>
        /// <remarks>
        /// The series to be extracted is always arranged in a column (independent of the original
        /// layout in the html table). The first column contains the values, the second the series
        /// header (if any defined). The series name is stored in the ColumnName of the first column.
        /// Currently we cannot handle thead and tfoot.
        /// </remarks>
        /// <param name="doc">the HTML document</param>
        /// <param name="path">points to a cell or to a table (pointers to TR elements are invalid)</param>
        /// <param name="tableSettings">the table specific configuration, handed on to the series extraction</param>
        /// <param name="htmlSettings">the HTML settings used to configure the extraction process</param>
        /// <returns>
        /// A success result holding the whole table (path points to a table) or the extracted
        /// series (path points to a cell). A failure result if the raw table could not be
        /// extracted or the series extraction yields null.
        /// </returns>
        /// <exception cref="InvalidExpressionException">
        /// The path points neither to a table nor to a table cell, or the cell position could
        /// not be calculated from the path.
        /// </exception>
        public static FallibleActionResult<DataTable> ExtractTable(this IHtmlDocument doc, HtmlPath path, TableExtractionSettings tableSettings, HtmlExtractionSettings htmlSettings)
        {
            if (!path.PointsToTable && !path.PointsToTableCell)
            {
                throw new InvalidExpressionException("Path neither points to table nor to cell");
            }

            // When link urls are requested we need the raw elements in the cells,
            // otherwise the plain text is sufficient.
            FallibleActionResult<DataTable> result = ExtractTable(doc, path, !htmlSettings.ExtractLinkUrl);

            if (!result.Success)
            {
                // pass through failure result
                return result;
            }

            // path points to whole table => return whole table
            if (path.PointsToTable)
            {
                return result;
            }

            // get the x,y position of the cell the path is pointing to
            Point cellCoords = path.GetTableCellPosition();

            if (cellCoords.X < 0 || cellCoords.Y < 0)
            {
                throw new InvalidExpressionException("Path expression corrupt: cell position in table could not be calculated");
            }

            // Get the value of the raw cell; extract the link url if configured.
            // Not every cell of the raw table holds an element: rows with fewer TD/TH
            // children than the widest row keep DBNull.Value in the remaining columns.
            // Such cells are passed through unchanged instead of failing on a hard cast.
            Func<object, object> getValue = cell =>
            {
                if (!htmlSettings.ExtractLinkUrl)
                {
                    return cell;
                }

                var element = cell as IHtmlElement;
                if (element == null)
                {
                    return cell;
                }

                return element.FirstLinkOrInnerText();
            };

            var series = result.Value.ExtractSeries(cellCoords, getValue, tableSettings);

            if (series == null)
            {
                return FallibleActionResult<DataTable>.CreateFailureResult("Could not extract series specified");
            }

            return FallibleActionResult<DataTable>.CreateSuccessResult(series);
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Extracts the complete html table the given path is pointing to. If the path points
        /// to a cell of a table, the complete table is still extracted.
        /// </summary>
        /// <remarks>
        /// Only TD and TH children of a row are taken as cells. The number of columns is
        /// defined by the html table row with the most cells; columns are created unnamed and
        /// typed as <see cref="object"/>. Currently we cannot handle thead and tfoot.
        /// </remarks>
        /// <param name="doc">the HTML document (checked via <c>Require</c>)</param>
        /// <param name="path">the path to the table (checked via <c>Require</c>)</param>
        /// <param name="textOnly">set this to true to get only the text of the cell, otherwise the
        /// cell itself as HtmlElement is stored</param>
        /// <returns>
        /// A success result holding the table, or a failure result if no table is found by
        /// the path or the table has no rows.
        /// </returns>
        public static FallibleActionResult<DataTable> ExtractTable(this IHtmlDocument doc, HtmlPath path, bool textOnly)
        {
            doc.Require(x => doc != null);
            path.Require(x => path != null);

            var sourceTable = doc.GetTableByPath(path);
            if (sourceTable == null)
            {
                return FallibleActionResult<DataTable>.CreateFailureResult("Could not get table by path");
            }

            // TODO: should we get the culture from the HTML page somehow?
            var target = new DataTable { Locale = CultureInfo.InvariantCulture };

            foreach (var tableRow in sourceTable.Rows)
            {
                // collect the cells of this row, skipping everything which is not TD or TH
                var cells = new List<IHtmlElement>();
                foreach (var child in tableRow.Children)
                {
                    if (child.TagName != "TD" && child.TagName != "TH")
                    {
                        continue;
                    }

                    cells.Add(child);
                }

                // grow the table if this row is wider than all rows seen so far
                int missingColumns = cells.Count - target.Columns.Count;
                if (missingColumns > 0)
                {
                    missingColumns.Times(x => target.Columns.Add(string.Empty, typeof(object)));
                }

                // the row is attached and accepted first, the data is filled in afterwards
                var dataRow = target.NewRow();
                target.Rows.Add(dataRow);
                target.AcceptChanges();

                // store either the text or the element itself, depending on textOnly
                cells.ForeachIndex((cell, idx) => dataRow[idx] = textOnly ? (object)cell.InnerText : cell);
            }

            if (target.Rows.Count == 0)
            {
                target.Dispose();
                return FallibleActionResult<DataTable>.CreateFailureResult("Table was empty");
            }

            return FallibleActionResult<DataTable>.CreateSuccessResult(target);
        }