Beispiel #1
0
        /// <summary>
        /// Parses one HTML table into rows and cells, flattens its numeric cells into
        /// (attribute name, column heading, value) records, and writes the distinct records
        /// to <paramref name="outputPath"/>, one record per line.
        /// </summary>
        /// <remarks>
        /// If no column headings can be extracted from the table, a message is written to the
        /// console and the method returns without creating the output file.
        /// </remarks>
        /// <param name="elements">Collection of elements; the entry at <paramref name="tableIndex"/> is the table to parse.</param>
        /// <param name="tableIndex">Index into <paramref name="elements"/> of the table to parse.</param>
        /// <param name="outputPath">Path of the text file to create (an existing file is overwritten by File.CreateText).</param>
        /// <param name="rowHeadOverrides">Passed unchanged to BuildComplexRowHeads.</param>
        /// <param name="config">Passed unchanged to BuildComplexRowHeads.</param>
        public static void ExtractTableFromHTML(AngleSharp.Dom.IHtmlCollection <AngleSharp.Dom.IElement> elements, int tableIndex, string outputPath, IDictionary <string, string> rowHeadOverrides, IDictionary <string, string> config)
        {
            var statementTable = elements[tableIndex];

            // Parse the table into a 2d matrix: a list of TableRow objects, each holding a list of TableCell objects.
            var tableData = new List <TableRow>();

            foreach (var rowElement in statementTable.QuerySelectorAll("TR"))
            {
                var rowData = new TableRow();

                foreach (var cellElement in rowElement.QuerySelectorAll("TD"))
                {
                    // Extract cell value
                    TableCell tableCell = new TableCell(cellElement);

                    // Duplicate the cell across every column it spans so that column indices line up between rows.
                    int colSpan     = 1;
                    var colSpanAttr = cellElement.Attributes.FirstOrDefault(a => a.Name == "colspan");
                    if (colSpanAttr != null)
                    {
                        // A missing, non-numeric, zero or negative colspan must not make the cell vanish:
                        // fall back to a span of one column.
                        int parsedSpan;
                        if (Int32.TryParse(colSpanAttr.Value, out parsedSpan) && parsedSpan > 0)
                        {
                            colSpan = parsedSpan;
                        }
                    }

                    for (int j = 0; j < colSpan; ++j)
                    {
                        rowData.AddCell(tableCell);
                    }
                }

                tableData.Add(rowData);
            }

            // For diagnostic purposes save a csv of the parsed table
            // writeTableToFile(outputPath + ".tbl", tableData);

            // Extract the column headings from the table into a list
            IList <string> columnHeadings = ExtractColumnHeadings(tableData);

            if (columnHeadings == null)
            {
                // Without headings no record can be labelled; stop here instead of failing later
                // with a NullReferenceException on the first numeric cell.
                Console.WriteLine("FATAL: Cannot find any qualifying heading rows in table");
                return;
            }

            // Post-process the rowheads in the table, calculating each one's indentation level relative to the others
            CalcRowheadIndentationLevels(tableData);

            // Post-process the rowheads, linking each to any parents it has, based on rules and clues.
            BuildComplexRowHeads(tableData, rowHeadOverrides, config);

            // Flatten the matrix to a list of records
            List <FlattenedRow> results = new List <FlattenedRow>();

            for (int iRow = 0; iRow < tableData.Count; ++iRow)
            {
                var row   = tableData[iRow];
                int nCols = row.Cells.Count;

                // Build the attribute name from this row's head, prefixed by the heads of its ancestors ("parent|child").
                // The walk up the parent chain stops at the first ancestor whose row head is empty.
                string attributeName = row.RowHead.Text;
                TableRow ancestor    = row.parentRow;
                while (ancestor != null && ancestor.RowHead.Text.Length > 0)
                {
                    attributeName = ancestor.RowHead.Text + "|" + attributeName;
                    ancestor      = ancestor.parentRow;
                }

                if (attributeName.Length == 0)
                {
                    continue;                               // Assumption: rows without attribute names should be skipped.
                }

                // Standardize attribute name format: only single spaces between words
                attributeName = ConsolidateWhitespace(attributeName);

                // Scan the data columns; column 0 is skipped (presumably it holds the row head - TODO confirm).
                for (int iCol = 1; iCol < nCols; ++iCol)
                {
                    var col           = row.Cells[iCol];
                    string colContent = col.Text;

                    // Skip centered (heading) cells and cells with no digits.
                    // The pattern is unanchored, so any text containing at least one digit qualifies.
                    if (col.HorizontalAlignment == TableCell.HORIZONTAL_ALIGNMENT.CENTER || !Regex.IsMatch(colContent, @"\(?\d+\)?"))
                    {
                        continue;
                    }

                    // Convert (xxx) to -xxx.  Drop comma separators
                    colContent = colContent.Replace('(', '-').Replace(")", "").Replace(",", "");

                    // Get the heading for this col
                    // NOTE(review): assumes columnHeadings has an entry for every column index of every row - confirm.
                    string heading = columnHeadings[iCol];

                    // Create the record with this data and add it to the results list.
                    results.Add(new FlattenedRow(attributeName, heading, colContent));
                }
            }

            // HTML column spans can result in duplicated entries: get rid of them.
            var distinctFlatRows = results.Distinct();

            // Write the flattened matrix out to file
            using (StreamWriter fsw = File.CreateText(outputPath))
            {
                foreach (var record in distinctFlatRows)
                {
                    fsw.Write(record + "\r\n");
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Value equality: two rows are equal when their attribute, time and value members are all equal.
        /// </summary>
        /// <param name="obj">Object to compare with; may be null or of an unrelated type.</param>
        /// <returns>true when <paramref name="obj"/> is a FlattenedRow whose members equal this instance's; otherwise false.</returns>
        public override bool Equals(object obj)
        {
            // Equals must never throw: a null or foreign-typed argument simply compares unequal.
            if (!(obj is FlattenedRow))
            {
                return false;
            }

            FlattenedRow other = (FlattenedRow)obj;

            // object.Equals tolerates null members on either side and otherwise defers to the member's own Equals.
            // NOTE(review): Distinct() and hash-based collections also need a consistent GetHashCode override;
            // it is not visible from here - confirm one exists.
            return(object.Equals(attribute, other.attribute) && object.Equals(time, other.time) && object.Equals(value, other.value));
        }