Beispiel #1
0
        /// <summary>
        /// Resolves <paramref name="path"/> inside <paramref name="doc"/> and hands the
        /// resulting element to <c>GetByElement</c> to obtain the corresponding HtmlTable.
        /// According to the original documentation the embedding table is returned when the
        /// path points into a table, and null when it does not point to a table element.
        /// </summary>
        /// <param name="doc">The document in which the path is resolved.</param>
        /// <param name="path">The path of the element to look up.</param>
        /// <returns>
        /// Null if the path does not resolve to any element; otherwise whatever
        /// <c>GetByElement</c> returns for the resolved element.
        /// </returns>
        public static HtmlTable GetByPath(IHtmlDocument doc, HtmlPath path)
        {
            var element = doc.GetElementByPath(path);

            // Without a resolved element there is nothing to look a table up for.
            return element == null ? null : GetByElement(element);
        }
Beispiel #2
0
        /// <summary>
        /// Returns a new HtmlPath pointing to the table element this instance is pointing into.
        /// If this instance is not pointing into any table at all null is returned.
        /// </summary>
        /// <returns>
        /// The longest prefix of this path (possibly the complete path) for which
        /// <c>PointsToTable</c> is true, or null if no such prefix exists.
        /// </returns>
        public HtmlPath GetPathToTable()
        {
            var candidate = new HtmlPath(Elements);

            // Walk from the full path towards the root, dropping the last element on
            // every step, until a prefix is found that points to a table.
            while (candidate.Elements.Count > 0)
            {
                if (candidate.PointsToTable)
                {
                    return candidate;
                }

                candidate = new HtmlPath(candidate.Elements.Take(candidate.Elements.Count - 1));
            }

            // The loop only terminates without returning once every element has been
            // stripped, i.e. no prefix of the path points to a table.
            return null;
        }
Beispiel #3
0
        /// <summary>
        /// Extracts the complete html table the given path is pointing to. If the path points
        /// to a cell of a table the complete table is extracted still.
        /// </summary>
        /// <remarks>
        /// Currently we cannot handle thead and tfoot. The number of columns is defined by the
        /// html table row with the most cells; rows with fewer cells leave their remaining
        /// columns unset. All columns are unnamed and of type <see cref="object"/>, every cell
        /// value is the <c>InnerText</c> of the corresponding html cell.
        /// </remarks>
        /// <param name="doc">The document containing the table. Must not be null.</param>
        /// <param name="path">Path to the table or to an element inside of it. Must not be null.</param>
        /// <returns>
        /// A new <see cref="DataTable"/> with one row per html table row. Never null and never
        /// empty. All changes are accepted before the table is returned.
        /// </returns>
        /// <exception cref="Exception">
        /// Thrown if no table could be found by the given path or if the table has no rows.
        /// </exception>
        private DataTable ExtractTable(IHtmlDocument doc, HtmlPath path)
        {
            Contract.RequiresNotNull(doc, "doc");
            Contract.RequiresNotNull(path, "path");

            var htmlTable = HtmlTable.GetByPath(doc, path);

            if (htmlTable == null)
            {
                throw new Exception("Could not get table by path");
            }

            var table = new DataTable();

            try
            {
                // TODO: should we get the culture from the HTML page somehow?
                table.Locale = CultureInfo.InvariantCulture;

                foreach (var tr in htmlTable.Rows)
                {
                    var rowData = tr.Children
                                  .Where(td => htmlTable.IsCell(td))
                                  .Select(td => td.InnerText)
                                  .ToList();

                    // Grow the table so it is as wide as the widest row seen so far.
                    while (table.Columns.Count < rowData.Count)
                    {
                        table.Columns.Add(string.Empty, typeof(object));
                    }

                    // Fill the row completely before attaching it to the table.
                    var row = table.NewRow();

                    for (int i = 0; i < rowData.Count; ++i)
                    {
                        row[i] = rowData[i];
                    }

                    table.Rows.Add(row);
                }

                if (table.Rows.Count == 0)
                {
                    throw new Exception("Table was empty");
                }

                // Commit once after all rows are filled: every row ends up in the same
                // (unchanged) state and the table is not re-committed for each single row.
                table.AcceptChanges();

                return table;
            }
            catch
            {
                // The table never reaches the caller on failure, so release it here.
                table.Dispose();
                throw;
            }
        }
Beispiel #4
0
        /// <summary>
        /// Extracts the data described by the descriptor of this instance from the html
        /// document and returns it as a <see cref="DataTable"/>. Series and table descriptors
        /// yield a formatted table; cell and single value descriptors yield a table created
        /// for the one extracted scalar.
        /// </summary>
        /// <exception cref="NotSupportedException">
        /// Thrown if the descriptor is of none of the supported types.
        /// </exception>
        public DataTable ExtractTable()
        {
            if (myDescriptor is PathSeriesDescriptor)
            {
                var seriesDescriptor = (PathSeriesDescriptor)myDescriptor;
                var seriesPath = HtmlPath.Parse(seriesDescriptor.Path);
                var rawTable = ExtractTable(myDocument, seriesPath);

                return TableFormatter.ToFormattedTable(seriesDescriptor, rawTable);
            }

            if (myDescriptor is PathTableDescriptor)
            {
                var tableDescriptor = (PathTableDescriptor)myDescriptor;
                var tablePath = HtmlPath.Parse(tableDescriptor.Path);
                var rawTable = ExtractTable(myDocument, tablePath);

                return TableFormatter.ToFormattedTable(tableDescriptor, rawTable);
            }

            if (myDescriptor is PathCellDescriptor)
            {
                var cellDescriptor = (PathCellDescriptor)myDescriptor;
                var cellPath = HtmlPath.Parse(cellDescriptor.Path);
                var rawTable = ExtractTable(myDocument, cellPath);
                var cellValue = TableFormatter.GetValue(cellDescriptor, rawTable);

                // XXX: this is really ugly - a table has to be created just to satisfy the interface :(
                return CreateTableForScalar(cellDescriptor.ValueFormat.Type, cellValue);
            }

            if (myDescriptor is PathSingleValueDescriptor)
            {
                var valueDescriptor = (PathSingleValueDescriptor)myDescriptor;
                var element = myDocument.GetElementByPath(HtmlPath.Parse(valueDescriptor.Path));

                // A path that does not resolve to an element is converted as a null text.
                var text = element == null ? null : element.InnerText;
                var converted = valueDescriptor.ValueFormat.Convert(text);

                // XXX: this is really ugly - a table has to be created just to satisfy the interface :(
                return CreateTableForScalar(valueDescriptor.ValueFormat.Type, converted);
            }

            throw new NotSupportedException("Format not supported for Html documents: " + myDescriptor.GetType());
        }
        /// <summary>
        /// Returns the element specified by the given <see cref="HtmlPath"/>.
        /// The lookup starts at the root obtained from the document body and descends one
        /// path element at a time, each time selecting the child with the path element's tag
        /// name at the path element's position.
        /// </summary>
        /// <param name="doc">The document to search in. Must not be null.</param>
        /// <param name="path">The path to follow. Must not be null.</param>
        /// <returns>
        /// The element the path leads to, the root itself for a path without elements, or
        /// null as soon as one step of the path cannot be resolved.
        /// </returns>
        public static IHtmlElement GetElementByPath(this IHtmlDocument doc, HtmlPath path)
        {
            Contract.RequiresNotNull(doc, "doc");
            Contract.RequiresNotNull(path, "path");

            var node = doc.Body.GetRoot();

            foreach (var step in path.Elements)
            {
                node = GetNthChildWithTag(node, step.TagName, step.Position);

                // A step that cannot be resolved ends the search; null is handed back below.
                if (node == null)
                {
                    break;
                }
            }

            return node;
        }