/// <summary>
/// Parses one HTML table into rows of cells, resolves column headings and
/// hierarchical row heads, flattens the numeric cells into
/// (attribute, heading, value) records and writes the distinct records to a file.
/// </summary>
/// <param name="elements">Collection of elements from which the table is picked.</param>
/// <param name="tableIndex">Index into <paramref name="elements"/> of the table to extract.</param>
/// <param name="outputPath">Path of the text file that is created (or overwritten) with one record per line.</param>
/// <param name="rowHeadOverrides">Passed through unchanged to <c>BuildComplexRowHeads</c>.</param>
/// <param name="config">Passed through unchanged to <c>BuildComplexRowHeads</c>.</param>
/// <remarks>
/// If no column headings can be extracted, a FATAL message is written to the console
/// and the method returns without creating the output file.
/// </remarks>
public static void ExtractTableFromHTML(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> elements, int tableIndex, string outputPath, IDictionary<string, string> rowHeadOverrides, IDictionary<string, string> config)
{
    var statementTable = elements[tableIndex];

    // Parse the statement table into a 2d matrix (a list of TableRow objects,
    // each of which holds a list of TableCell objects).
    var tableData = new List<TableRow>();
    foreach (var rowElement in statementTable.QuerySelectorAll("TR"))
    {
        var rowData = new TableRow();
        foreach (var cellElement in rowElement.QuerySelectorAll("TD"))
        {
            // Extract cell value
            TableCell tableCell = new TableCell(cellElement);

            // Duplicate the cell across every column it spans so that column
            // indices line up between rows.
            int colSpan = ReadColSpan(cellElement);
            for (int j = 0; j < colSpan; ++j)
            {
                rowData.AddCell(tableCell);
            }
        }

        tableData.Add(rowData);
    }

    // For diagnostic purposes save a csv of the parsed table
    // writeTableToFile(outputPath + ".tbl", tableData);

    // Extract the column headings from the table into a list.
    IList<string> columnHeadings = ExtractColumnHeadings(tableData);
    if (columnHeadings == null)
    {
        // Without headings no record can be labelled; stop here instead of
        // failing later with a NullReferenceException on the first numeric cell.
        Console.WriteLine("FATAL: Cannot find any qualifying heading rows in table");
        return;
    }

    // Post-process the rowheads in the table, calculating each one's relative
    // indentation level (compared to the rest of the rowheads).
    CalcRowheadIndentationLevels(tableData);

    // Post-process the rowheads, linking each to any parents it has, based on rules and clues.
    BuildComplexRowHeads(tableData, rowHeadOverrides, config);

    // Flatten the matrix to a list of tuples.
    List<FlattenedRow> results = new List<FlattenedRow>();
    for (int iRow = 0; iRow < tableData.Count; ++iRow)
    {
        var row = tableData[iRow];

        // Create the attribute name for this row from its row head and those of its parents.
        string attributeName = BuildQualifiedRowHead(row);
        if (attributeName.Length == 0)
        {
            continue; // Assumption: rows without attribute names should be skipped.
        }

        // Standardize attribute name format: only single spaces between words.
        attributeName = ConsolidateWhitespace(attributeName);

        // Scan columns; column 0 is skipped (the attribute name above comes from row.RowHead).
        int nCols = row.Cells.Count;
        for (int iCol = 1; iCol < nCols; ++iCol)
        {
            var col = row.Cells[iCol];
            string colContent = col.Text;

            // Process numeric columns only. Exclude centered (heading) cols.
            // NOTE(review): the pattern is unanchored, so any text containing a
            // digit qualifies - confirm this is intended.
            if (col.HorizontalAlignment == TableCell.HORIZONTAL_ALIGNMENT.CENTER || !Regex.IsMatch(colContent, @"\(?\d+\)?"))
            {
                continue;
            }

            // Convert (xxx) to -xxx. Drop comma separators.
            colContent = colContent.Replace('(', '-').Replace(")", "").Replace(",", "");

            // Get the heading for this col.
            string heading = columnHeadings[iCol];

            // Create the tuple with this data and add it to the results list.
            results.Add(new FlattenedRow(attributeName, heading, colContent));
        }
    }

    // HTML column spans can result in duplicated entries: get rid of them.
    var distinctFlatRows = results.Distinct();

    // Write the flattened matrix out to file.
    using (StreamWriter fsw = File.CreateText(outputPath))
    {
        foreach (var record in distinctFlatRows)
        {
            fsw.Write(record + "\r\n");
        }
    }
}

/// <summary>
/// Returns the number of columns a cell spans, read from its "colspan" attribute.
/// A missing, unparsable or non-positive value yields 1; values above 1000 are
/// capped at 1000 (the limits the HTML Standard defines for colspan).
/// </summary>
private static int ReadColSpan(AngleSharp.Dom.IElement cellElement)
{
    const int MaxColSpan = 1000;

    var colSpanAttr = cellElement.Attributes.FirstOrDefault(a => a.Name == "colspan");
    if (colSpanAttr == null)
    {
        return 1;
    }

    int parsed;
    if (!Int32.TryParse(colSpanAttr.Value, out parsed) || parsed < 1)
    {
        // TryParse leaves 0 in 'parsed' on failure; using that would drop the cell
        // and shift every following column of the row.
        return 1;
    }

    return Math.Min(parsed, MaxColSpan);
}

/// <summary>
/// Builds a row's attribute name by prefixing its row head text with the row head
/// text of each ancestor, separated by '|', outermost ancestor first. The walk up
/// the parent chain stops at the first ancestor whose row head text is empty.
/// </summary>
private static string BuildQualifiedRowHead(TableRow row)
{
    string qualifiedName = row.RowHead.Text;
    TableRow ancestor = row.parentRow;
    while (ancestor != null && ancestor.RowHead.Text.Length > 0)
    {
        qualifiedName = ancestor.RowHead.Text + "|" + qualifiedName;
        ancestor = ancestor.parentRow;
    }

    return qualifiedName;
}
/// <summary>
/// Value equality: two FlattenedRow instances are equal when their attribute,
/// time and value fields are all equal.
/// </summary>
/// <param name="obj">Object to compare with; may be null or of another type.</param>
/// <returns>
/// true if <paramref name="obj"/> is a FlattenedRow with equal fields; otherwise false.
/// </returns>
/// <remarks>
/// NOTE(review): hash-based consumers such as Distinct() or HashSet also require a
/// GetHashCode override consistent with this method - confirm one exists in this class.
/// </remarks>
public override bool Equals(object obj)
{
    // Equals must not throw: null and foreign types simply compare unequal.
    if (!(obj is FlattenedRow))
    {
        return false;
    }

    FlattenedRow other = (FlattenedRow)obj;

    // The static object.Equals is null-safe, so a null field no longer throws.
    return object.Equals(attribute, other.attribute)
        && object.Equals(time, other.time)
        && object.Equals(value, other.value);
}