/// <summary>
/// Opens the HTML document stored in <paramref name="fileName"/>, takes the title
/// reported by the document holder and collects the table rows of the document.
/// </summary>
/// <param name="fileName">Path of the document; remembered in <c>DocumentFile</c>.</param>
/// <param name="maxRowsToProcess">
/// Row limit handed over to <c>CollectRows</c>; <c>-1</c> switches the limit off.
/// </param>
public AngleHtmlAdapter(string fileName, int maxRowsToProcess)
{
    // The row storage must exist before CollectRows starts appending to it.
    TableRows = new List<List<HtmlAdapterCell>>();
    DocumentFile = fileName;

    var angleDocument = GetAngleDocument(fileName);
    var docHolder = new HtmlDocHolder(angleDocument);

    Title = docHolder.FindTitleAboveTheTable();
    CollectRows(docHolder, maxRowsToProcess);

    // Computed last because it is derived from the rows collected above.
    UnmergedColumnsCount = GetUnmergedColumnsCountByFirstRow();
}
/// <summary>
/// Visits every <c>table</c> element of the document in document order, feeds each one
/// to <c>ProcessHtmlTableAndUpdateTitle</c> and finally passes the accumulated rows
/// through <c>DropDayOfWeekRows</c>.
/// </summary>
/// <param name="docHolder">Holder of the parsed HTML document.</param>
/// <param name="maxRowsToProcess">Row limit forwarded to the per-table processing.</param>
void CollectRows(HtmlDocHolder docHolder, int maxRowsToProcess)
{
    var tableElements = docHolder.HtmlDocument
        .QuerySelectorAll("*")
        .Where(element => element.LocalName == "table")
        .ToList();

    TablesCount = tableElements.Count;

    for (int index = 0; index < tableElements.Count; index++)
    {
        ProcessHtmlTableAndUpdateTitle(docHolder, tableElements[index], maxRowsToProcess, index);
    }

    TableRows = DropDayOfWeekRows(TableRows);
}
/// <summary>
/// Builds a cell from an HTML table cell element: fills the text properties and reads
/// the <c>colspan</c>, <c>rowspan</c> and <c>width</c> attributes.
/// </summary>
/// <param name="docHolder">Document holder supplying font defaults and the page size in pixels.</param>
/// <param name="inputCell">The HTML element the cell is built from.</param>
/// <param name="row">Row index of the cell; also used as <c>FirstMergedRow</c>.</param>
/// <param name="column">Column index of the cell.</param>
public HtmlAdapterCell(HtmlDocHolder docHolder, IElement inputCell, int row, int column)
{
    InitTextProperties(docHolder, inputCell);
    FirstMergedRow = row;
    MergedRowsCount = 1;
    MergedColsCount = 1;
    Row = row;
    Col = column;
    IsMerged = false;
    IsEmpty = Text.IsNullOrWhiteSpace();

    if (inputCell.HasAttribute("colspan"))
    {
        int colSpan;
        // A zero or negative span is ignored: callers advance their column counter by
        // MergedColsCount, so a non-positive value would stall or reverse it.
        if (Int32.TryParse(inputCell.GetAttribute("colspan"), out colSpan) && colSpan > 0)
        {
            MergedColsCount = colSpan;
            IsMerged = MergedColsCount > 1;
        }
    }

    if (inputCell.HasAttribute("rowspan"))
    {
        int rowSpan;
        if (Int32.TryParse(inputCell.GetAttribute("rowspan"), out rowSpan))
        {
            MergedRowsCount = rowSpan;
        }
    }

    if (inputCell.HasAttribute("width"))
    {
        string s = inputCell.GetAttribute("width");
        double width;
        if (s.EndsWith("%", StringComparison.Ordinal)
            && double.TryParse(s.Substring(0, s.Length - 1), out width))
        {
            // Percentage widths are relative to the page width of the document.
            CellWidth = (int)((double)docHolder.DocumentPageSizeInPixels * (width / 100.0));
        }
        else if (double.TryParse(s, out width))
        {
            // The branches must be exclusive: "50%" is not a plain number, so without
            // the "else" the fallback below would overwrite the percentage result.
            CellWidth = (int)width;
        }
        else
        {
            // Fallback for width values that cannot be parsed.
            CellWidth = 50;
        }
    }
}
/// <summary>
/// Fills <c>Text</c> with the markup of <paramref name="inputCell"/> as serialized by
/// <c>MyMarkupFormatter</c>, updates <c>IsEmpty</c> accordingly and sets the font
/// properties, falling back to the defaults of <paramref name="docHolder"/>.
/// </summary>
/// <param name="docHolder">Source of <c>DefaultFontName</c> and <c>DefaultFontSize</c>.</param>
/// <param name="inputCell">The HTML element whose content becomes the cell text.</param>
private void InitTextProperties(HtmlDocHolder docHolder, IElement inputCell)
{
    // No font information is extracted from the element here, so start from blanks.
    FontName = "";
    FontSize = 0;

    Text = inputCell.ToHtml(new MyMarkupFormatter());
    IsEmpty = Text.IsNullOrWhiteSpace();

    if (string.IsNullOrEmpty(FontName))
    {
        FontName = docHolder.DefaultFontName;
    }

    if (FontSize == 0)
    {
        FontSize = docHolder.DefaultFontSize;
    }
}
/// <summary>
/// Decides whether <paramref name="table"/> should be processed, processes it if so,
/// logs how many rows were added and, when no title is known yet, tries to take the
/// title from the table itself.
/// </summary>
/// <remarks>
/// A table is skipped when it contains nested tables, when its non-empty text has no
/// uppercase character, or when its text is shorter than 30 characters.
/// </remarks>
/// <param name="docHolder">Holder of the parsed HTML document.</param>
/// <param name="table">The <c>table</c> element to inspect.</param>
/// <param name="maxRowsToProcess">Row limit forwarded to <c>ProcessHtmlTable</c>.</param>
/// <param name="tableIndex">Index of the table in the document; used in log messages only.</param>
void ProcessHtmlTableAndUpdateTitle(HtmlDocHolder docHolder, IElement table, int maxRowsToProcess, int tableIndex)
{
    int rowCountBefore = TableRows.Count;

    // Read the text once: every TextContent access re-reads the element's text.
    string tableText = table.TextContent;

    if (table.QuerySelectorAll("*").Any(m => m.LocalName == "table"))
    {
        Logger.Debug(String.Format("ignore table {0} with subtables", tableIndex));
    }
    else if (tableText.Length > 0 && !tableText.Any(x => Char.IsUpper(x)))
    {
        Logger.Debug(String.Format("ignore table {0} that has no uppercase char", tableIndex));
    }
    else if (tableText.Length < 30)
    {
        Logger.Debug(String.Format("ignore table {0}, it is too short", tableIndex));
    }
    else
    {
        ProcessHtmlTable(docHolder, table, maxRowsToProcess);
    }

    if (TableRows.Count > rowCountBefore)
    {
        string textPreview = tableText.Length > 30
            ? tableText.Substring(0, 30).ReplaceEolnWithSpace()
            : tableText.ReplaceEolnWithSpace();
        Logger.Debug(String.Format("add {0} rows (TableRows.Count={1} ) from table {2} Table.innertText[0:30]='{3}'",
            TableRows.Count - rowCountBefore,
            TableRows.Count,
            tableIndex,
            textPreview));
    }

    // The title may live inside a table; a table mentioning "декабря" (Russian for
    // "of December") is taken as the title. A null title is treated like an empty one.
    if (String.IsNullOrEmpty(Title) && tableText.Length > 30 && tableText.ToLower().IndexOf("декабря") != -1)
    {
        var rows = new List<String>();
        foreach (var r in GetHtmlTableRows(table))
        {
            rows.Add(r.TextContent);
        }
        Title = String.Join("\n", rows);
    }
}
/// <summary>
/// Converts the rows of <paramref name="table"/> into lists of <c>HtmlAdapterCell</c>
/// and appends them to <c>TableRows</c>. Rows whose cells are all empty are skipped.
/// All rows added by this call are removed again when the table looks suspicious
/// (at most 4 cells in its widest row) or when <c>CheckNameColumnIsEmpty</c> rejects it.
/// </summary>
/// <param name="docHolder">Holder of the parsed HTML document, passed to every new cell.</param>
/// <param name="table">The <c>table</c> element to convert.</param>
/// <param name="maxRowsToProcess">
/// Stop once <c>TableRows</c> holds at least this many rows; <c>-1</c> means no limit.
/// </param>
void ProcessHtmlTable(HtmlDocHolder docHolder, IElement table, int maxRowsToProcess)
{
    var rows = GetHtmlTableRows(table);
    int rowsCount = rows.Count();
    int saveRowsCount = TableRows.Count;
    int maxCellsCount = 0;
    int maxSumSpan = 0;

    for (int r = 0; r < rowsCount; ++r)
    {
        List<HtmlAdapterCell> newRow = new List<HtmlAdapterCell>();
        int sumspan = 0;
        bool isEmpty = true;

        foreach (var rowCell in GetHtmlTableCells(rows[r]))
        {
            var c = new HtmlAdapterCell(docHolder, rowCell, TableRows.Count, sumspan);
            newRow.Add(c);

            // A cell spanning several columns is followed by placeholder cells, so
            // the list index of a cell matches its column.
            for (int k = 1; k < c.MergedColsCount; ++k)
            {
                newRow.Add(new HtmlAdapterCell(TableRows.Count, sumspan + k));
            }

            sumspan += c.MergedColsCount;
            isEmpty = isEmpty && c.IsEmpty;
        }

        if (isEmpty)
        {
            continue;
        }

        maxCellsCount = Math.Max(newRow.Count, maxCellsCount);
        maxSumSpan = Math.Max(sumspan, maxSumSpan);

        // see 7007_8.html in tests
        // Pad a short row up to the widest row seen so far. k already is the absolute
        // column index, so it must not be offset by sumspan again.
        for (int k = sumspan; k < maxSumSpan; ++k)
        {
            newRow.Add(new HtmlAdapterCell(TableRows.Count, k));
        }

        // The first row of a table may continue the last row collected so far.
        if (r == 0
            && TableRows.Count > 0
            && BigramsHolder.CheckMergeRow(
                TableRows.Last().ConvertAll(x => x.Text),
                newRow.ConvertAll(x => x.Text)))
        {
            MergeRow(TableRows.Last(), newRow);
        }
        else
        {
            TableRows.Add(newRow);
        }

        if ((maxRowsToProcess != -1) && (TableRows.Count >= maxRowsToProcess))
        {
            break;
        }
    }

    if (saveRowsCount < TableRows.Count)
    {
        if (maxCellsCount <= 4)
        {
            // remove this suspicious table
            TableRows.RemoveRange(saveRowsCount, TableRows.Count - saveRowsCount);
        }
        else
        {
            InsertRowSpanCells(saveRowsCount, TableRows.Count);
            if (CheckNameColumnIsEmpty(TableRows, saveRowsCount))
            {
                TableRows.RemoveRange(saveRowsCount, TableRows.Count - saveRowsCount);
            }
        }
    }
}