Example #1
0
        /// <summary>
        /// Extracts the complete HTML table the given path is pointing to. If the path points
        /// to a cell of a table, the complete table is still extracted.
        /// </summary>
        /// <remarks>
        /// Only the direct TD and TH children of each table row become cells. The number of
        /// columns is defined by the HTML table row with the most cells; shorter rows leave
        /// their remaining columns unset. Known limitation noted by the original author:
        /// thead and tfoot are currently not handled.
        /// </remarks>
        /// <param name="doc">the HTML document</param>
        /// <param name="path">the path to the table</param>
        /// <param name="textOnly">set this to true to get only the text of the cell, otherwise the
        /// cell itself as HtmlElement is returned</param>
        /// <returns>
        /// A success result wrapping the extracted <see cref="DataTable"/>, or a failure result
        /// if no table could be found by the path or the table contains no rows.
        /// </returns>
        public static FallibleActionResult<DataTable> ExtractTable(this IHtmlDocument doc, HtmlPath path, bool textOnly)
        {
            doc.Require(x => doc != null);
            path.Require(x => path != null);

            HtmlTable htmlTable = doc.GetTableByPath(path);
            if (htmlTable == null)
            {
                return FallibleActionResult<DataTable>.CreateFailureResult("Could not get table by path");
            }

            DataTable table = new DataTable();

            // TODO: should we get the culture from the HTML page somehow?
            table.Locale = CultureInfo.InvariantCulture;

            foreach (var tr in htmlTable.Rows)
            {
                // collect the cells of this row
                var cells = new List<IHtmlElement>();
                foreach (var td in tr.Children)
                {
                    // HTML tag names are case-insensitive, so do not rely on upper-case names
                    if (string.Equals(td.TagName, "TD", StringComparison.OrdinalIgnoreCase)
                        || string.Equals(td.TagName, "TH", StringComparison.OrdinalIgnoreCase))
                    {
                        cells.Add(td);
                    }
                }

                // grow the table so that the widest row seen so far fits
                while (table.Columns.Count < cells.Count)
                {
                    table.Columns.Add(string.Empty, typeof(object));
                }

                // fill the row before it is attached to the table
                DataRow row = table.NewRow();
                for (int i = 0; i < cells.Count; ++i)
                {
                    row[i] = textOnly ? (object)cells[i].InnerText : cells[i];
                }

                table.Rows.Add(row);
            }

            if (table.Rows.Count == 0)
            {
                table.Dispose();
                return FallibleActionResult<DataTable>.CreateFailureResult("Table was empty");
            }

            // Accept once, after all data is written: every row ends up Unchanged with its
            // values committed, instead of the last row being left in Modified state.
            table.AcceptChanges();

            return FallibleActionResult<DataTable>.CreateSuccessResult(table);
        }
Example #2
0
        /// <summary>
        /// Resolves the given <see cref="HtmlPath"/> against the document and returns the
        /// element it points to.
        /// </summary>
        /// <param name="doc">the HTML document to search in</param>
        /// <param name="path">the path describing the element, step by step</param>
        /// <returns>
        /// The element reached after following every step of the path, or null if the
        /// starting root or any step along the way could not be resolved.
        /// </returns>
        public static IHtmlElement GetElementByPath(this IHtmlDocument doc, HtmlPath path)
        {
            doc.Require(d => doc != null);
            path.Require(p => path != null);

            // start at the root obtained from the document body
            var current = doc.Body.GetRoot();
            if (current == null)
            {
                return null;
            }

            // descend one path step at a time; give up as soon as a step has no match
            var steps = path.Elements;
            foreach (var step in steps)
            {
                var child = current.GetChildAt(step.TagName, step.Position);
                if (child == null)
                {
                    return null;
                }

                current = child;
            }

            return current;
        }