/// <summary>
/// Checks whether a paragraph with non-empty text sits directly inside a table cell
/// whose row has at least three inner vertical borders and whose table has more than one row.
/// </summary>
/// <param name="p">Paragraph to inspect.</param>
/// <returns>
/// <c>true</c> when the paragraph's parent chain is exactly TableCell -> TableRow -> Table
/// and the row/table satisfy the border and row-count thresholds; otherwise <c>false</c>.
/// </returns>
public bool NotEmptyParagraphIsInADeclarationTable(Paragraph p)
{
    if (p.InnerText == "")
    {
        return false;
    }

    var documentBody = WordDocument.MainDocumentPart.Document.Body;
    var parentCell = p.Parent;

    // A paragraph placed directly in the body, or in anything that is not exactly a TableCell, does not qualify.
    if (parentCell == documentBody || parentCell == null || parentCell.GetType() != typeof(TableCell))
    {
        return false;
    }

    var parentRow = parentCell.Parent;
    bool parentIsTableRow = parentRow != null && parentRow.GetType() == typeof(TableRow);
    if (!parentIsTableRow || WordDocHolder.CountInnerVerticalBorders((TableRow)parentRow) < 3)
    {
        return false;
    }

    var parentTable = parentRow.Parent;
    return parentTable != null
        && parentTable.GetType().Name == "Table"
        && parentTable.Descendants<TableRow>().Skip(1).Any(); // i.e. more than one row
}
/// <summary>
/// Counts the distinct boundaries between adjacent cells of a row that have a visible border.
/// A boundary is counted when either the left cell's right border or the right cell's left
/// border is visible; a cell without explicit border settings is treated as having both.
/// </summary>
/// <param name="row">Row whose descendant cells are inspected.</param>
/// <returns>Number of distinct inner boundaries with a visible border.</returns>
public static int CountInnerVerticalBorders(TableRow row)
{
    var rowCells = row.Descendants<TableCell>().ToList();
    var visibleBoundaries = new HashSet<int>();

    for (int index = 0; index < rowCells.Count; index++)
    {
        var explicitBorders = rowCells[index]?.TableCellProperties?.TableCellBorders;

        // No explicit borders element -> both sides are considered visible.
        bool hasLeft = explicitBorders == null || WordDocHolder.BorderIsVisible(explicitBorders.LeftBorder);
        bool hasRight = explicitBorders == null || WordDocHolder.BorderIsVisible(explicitBorders.RightBorder);

        // Boundary k lies between cell k-1 and cell k; outer edges of the row are not counted.
        bool isFirstCell = index == 0;
        bool isLastCell = index + 1 >= rowCells.Count;

        if (!isFirstCell && hasLeft)
        {
            visibleBoundaries.Add(index);
        }

        if (!isLastCell && hasRight)
        {
            visibleBoundaries.Add(index + 1);
        }
    }

    return visibleBoundaries.Count;
}
/// <summary>
/// Feeds every table of the document into <c>ProcessWordTableAndUpdateTitle</c>:
/// first the tables found in header parts, then the tables of the main document
/// (replaced by the result of <c>ExtractSubtables</c> unless the source was HTML).
/// Afterwards stores the main-document table count in <c>TablesCount</c> and filters
/// <c>TableRows</c> through <c>DropDayOfWeekRows</c>.
/// </summary>
/// <param name="docHolder">Holder of the opened Word document.</param>
/// <param name="maxRowsToProcess">Row limit passed through to table processing.</param>
/// <param name="extension">Original file extension; ".htm"/".html" disables subtable extraction.</param>
void CollectRows(WordDocHolder docHolder, int maxRowsToProcess, string extension)
{
    var mainPart = docHolder.WordDocument.MainDocumentPart;
    var bodyTables = mainPart.Document.Descendants<Table>().ToList();
    int nextTableIndex = 0;

    // Header tables are processed first and share the running table index.
    foreach (OpenXmlPart headerPart in mainPart.HeaderParts)
    {
        foreach (var headerTable in headerPart.RootElement.Descendants<Table>())
        {
            ProcessWordTableAndUpdateTitle(docHolder, headerTable, maxRowsToProcess, nextTableIndex);
            nextTableIndex++;
        }
    }

    // This is just a kludge; it should be built into the architecture somehow.
    bool cameFromHtml = extension == ".htm" || extension == ".html";
    if (!cameFromHtml)
    {
        bodyTables = ExtractSubtables(bodyTables);
    }

    TablesCount = bodyTables.Count;

    foreach (var bodyTable in bodyTables)
    {
        ProcessWordTableAndUpdateTitle(docHolder, bodyTable, maxRowsToProcess, nextTableIndex);
        nextTableIndex++;
    }

    TableRows = DropDayOfWeekRows(TableRows);
}
/// <summary>
/// Processes one table (when <c>CanAppendRowsFromTable</c> allows it), logs how many rows
/// were appended to <c>TableRows</c>, and, if no title is known yet, takes the title from
/// the table text when that text is longer than 30 characters and contains "декабря".
/// </summary>
/// <param name="docHolder">Holder of the opened Word document.</param>
/// <param name="table">Table to process.</param>
/// <param name="maxRowsToProcess">Row limit passed through to <c>ProcessWordTable</c>.</param>
/// <param name="tableIndex">Running index of the table, used for the append check and logging.</param>
void ProcessWordTableAndUpdateTitle(WordDocHolder docHolder, Table table, int maxRowsToProcess, int tableIndex)
{
    // The snapshot must be taken BEFORE the table is processed; taking it afterwards
    // makes the "rows were added" comparison below always false.
    int debugSaveRowCount = TableRows.Count;

    if (CanAppendRowsFromTable(table, tableIndex))
    {
        ProcessWordTable(docHolder, table, maxRowsToProcess);
    }

    if (TableRows.Count > debugSaveRowCount)
    {
        string fullText = table.InnerText;
        string tableText = fullText.Length > 30 ? fullText.Substring(0, 30) : fullText;
        Logger.Debug(String.Format("add {0} rows (TableRows.Count={1} ) from table {2} Table.innertText[0:30]='{3}'",
            TableRows.Count - debugSaveRowCount, TableRows.Count, tableIndex, tableText));
    }

    if (Title.Length == 0)
    {
        string innerText = table.InnerText;

        // "декабря" is Russian for "of December"; a long table text containing it is used as the title.
        if (innerText.Length > 30 && innerText.ToLower().IndexOf("декабря") != -1)
        {
            // One line of title text per table row.
            Title = String.Join("\n", table.Descendants<TableRow>().Select(r => r.InnerText));
        }
    }
}
/// <summary>
/// Builds a <c>TableWidthInfo</c> for a table: overall width, indentation (plus the page left
/// margin) and the grid column widths, all converted through <c>TableWidthInfo.TryReadWidth</c>.
/// </summary>
/// <param name="docHolder">Supplies the page size and left margin in pixels.</param>
/// <param name="table">Table whose properties and grid are read.</param>
/// <returns>The populated width description.</returns>
TableWidthInfo InitializeTableWidthInfo(WordDocHolder docHolder, Table table)
{
    var info = new TableWidthInfo();
    var properties = table.GetFirstChild<TableProperties>();

    if (properties == null)
    {
        // Without table properties the table is assumed to span the whole page.
        info.TableWidthInPixels = docHolder.DocumentPageSizeInPixels;
    }
    else
    {
        var declaredWidth = properties.TableWidth;
        if (declaredWidth != null)
        {
            info.TableWidthInPixels = TableWidthInfo.TryReadWidth(
                declaredWidth.Width,
                declaredWidth.Type,
                docHolder.DocumentPageSizeInPixels);
        }

        var declaredIndent = properties.TableIndentation;
        if (declaredIndent != null)
        {
            info.TableIndentionInPixels = TableWidthInfo.TryReadWidth(
                declaredIndent.Width,
                declaredIndent.Type,
                docHolder.DocumentPageSizeInPixels);
        }

        // The page margin is added only on this branch, i.e. when table properties exist.
        info.TableIndentionInPixels += docHolder.DocumentPageLeftMaginInPixels;
    }

    var grid = table.GetFirstChild<TableGrid>();
    if (grid == null)
    {
        return info;
    }

    info.ColumnWidths = new List<int>();
    foreach (var gridColumn in grid.Elements<GridColumn>())
    {
        // Grid column widths are read as Dxa values relative to the table width computed above.
        int columnWidth = TableWidthInfo.TryReadWidth(
            gridColumn.Width,
            TableWidthUnitValues.Dxa,
            info.TableWidthInPixels);
        info.ColumnWidths.Add(columnWidth);
    }

    return info;
}
/// <summary>
/// Opens the Word file read-only and either hands it to the first scheme from
/// <c>_allSchemes</c> that reports it can process the document, or falls back to the generic
/// path that collects table rows and prepares merge information.
/// </summary>
/// <param name="fileName">Path of the document to open.</param>
/// <param name="extension">Original file extension, forwarded to <c>CollectRows</c>.</param>
/// <param name="maxRowsToProcess">Row limit, forwarded to <c>CollectRows</c>.</param>
private void ProcessDoc(string fileName, string extension, int maxRowsToProcess)
{
    using (var doc = new WordDocHolder(WordprocessingDocument.Open(fileName, false)))
    {
        CurrentScheme = _allSchemes.Find(scheme => scheme.CanProcess(doc.WordDocument));

        // The title is looked up the same way on both paths.
        Title = doc.FindTitleAboveTheTable();

        if (CurrentScheme == default)
        {
            // Generic path: no scheme matched.
            CollectRows(doc, maxRowsToProcess, extension);
            UnmergedColumnsCount = GetUnmergedColumnsCountByFirstRow();
            FindRowsThatCannotBeMerged();
            InitializeVerticallyMerge();
            return;
        }

        // CollectRows from distinct Tables
        CurrentScheme.Document = doc.WordDocument.MainDocumentPart.Document;
        TablesCount = 1;
    }
}
/// <summary>
/// Converts the rows of a Word table into <c>OpenXmlTableRow</c> objects and appends them to
/// <c>TableRows</c>. Cells not separated by a right border are glued to the previous cell,
/// rows whose cells are all empty are skipped, and the first row may be merged into the last
/// already collected row when <c>BigramsHolder.CheckMergeRow</c> says so. If the collected
/// data does not start with a name/position/income header and this table looks suspicious,
/// the rows appended from this table are removed again.
/// </summary>
/// <param name="docHolder">Holder of the opened Word document.</param>
/// <param name="table">Table to convert.</param>
/// <param name="maxRowsToProcess">Stop once <c>TableRows</c> reaches this size; -1 means no limit.</param>
void ProcessWordTable(WordDocHolder docHolder, Table table, int maxRowsToProcess)
{
    var sourceRows = table.Descendants<TableRow>().ToList();
    TableWidthInfo widthInfo = InitializeTableWidthInfo(docHolder, table);
    int rowsBeforeThisTable = TableRows.Count;
    int widestRowCellCount = 0;
    TableBorders tableBorders = GetTableBorders(table);

    for (int rowIndex = 0; rowIndex < sourceRows.Count; ++rowIndex)
    {
        var sourceRow = sourceRows[rowIndex];
        var convertedRow = new OpenXmlTableRow();
        int gridBefore = GetRowGridBefore(sourceRow);
        var sourceCells = sourceRow.Elements<TableCell>().ToArray();

        int columnOffset = 0;
        bool rowIsEmpty = true;

        for (var cellIndex = 0; cellIndex < sourceCells.Length; ++cellIndex)
        {
            var cell = new OpenXmlWordCell(docHolder, sourceCells, cellIndex, widthInfo, TableRows.Count, columnOffset, tableBorders);

            if (convertedRow.RowCells.Count == 0)
            {
                // The first stored cell also absorbs the grid columns skipped before the row.
                cell.MergedColsCount += gridBefore;
                convertedRow.RowCells.Add(cell);
            }
            else
            {
                var previous = convertedRow.RowCells.Last();
                if (previous.HasRightBorder)
                {
                    convertedRow.RowCells.Add(cell);
                }
                else
                {
                    // No border between the two cells: treat them as one logical cell.
                    previous.Text += cell.Text;
                    previous.CellWidth += cell.CellWidth;
                    previous.MergedColsCount += cell.MergedColsCount;
                    previous.HasRightBorder = cell.HasRightBorder;
                }
            }

            columnOffset += cell.MergedColsCount;
            rowIsEmpty = rowIsEmpty && cell.IsEmpty;
        }

        if (rowIsEmpty)
        {
            continue;
        }

        widestRowCellCount = Math.Max(convertedRow.RowCells.Count, widestRowCellCount);

        // Only the very first row of a table is a candidate for continuing the previous row.
        bool continuesPreviousRow =
            rowIndex == 0
            && TableRows.Count > 0
            && BigramsHolder.CheckMergeRow(
                TableRows.Last().RowCells.ConvertAll(x => x.Text),
                convertedRow.RowCells.ConvertAll(x => x.Text));

        if (continuesPreviousRow)
        {
            MergeRow(TableRows.Last().RowCells, convertedRow.RowCells);
        }
        else
        {
            TableRows.Add(convertedRow);
        }

        bool limitReached = maxRowsToProcess != -1 && TableRows.Count >= maxRowsToProcess;
        if (limitReached)
        {
            break;
        }
    }

    if (TableRows.Count > 0
        && !TableHeaderRecognizer.IsNamePositionAndIncomeTable(GetDataCells(0))
        && (widestRowCellCount <= 4 || CheckNameColumnIsEmpty(rowsBeforeThisTable)))
    {
        // remove this suspicious table
        TableRows.RemoveRange(rowsBeforeThisTable, TableRows.Count - rowsBeforeThisTable);
    }
}
/// <summary>
/// Fills <c>Text</c>, <c>IsEmpty</c>, <c>FontName</c> and <c>FontSize</c> from the paragraphs
/// that are direct children of the given cell element. Text runs are concatenated, "cr"/"br"
/// elements become line breaks, numbering properties become a "- " prefix, and every paragraph
/// ends with a newline plus extra newlines derived from its spacing settings.
/// Font name/size fall back to the document defaults when no run supplies them.
/// </summary>
/// <param name="docHolder">Supplies the default font name and size.</param>
/// <param name="inputCell">Cell element whose child paragraphs are read.</param>
private void InitTextProperties(WordDocHolder docHolder, OpenXmlElement inputCell)
{
    Text = "";
    foreach (var p in inputCell.Elements<Paragraph>())
    {
        foreach (var textOrBreak in p.Descendants())
        {
            if (textOrBreak.LocalName == "r" && textOrBreak is Run run)
            {
                RunProperties rProps = run.RunProperties;
                if (rProps != null)
                {
                    if (rProps.FontSize != null)
                    {
                        // The size attribute may be absent or not a plain integer; such values
                        // are skipped instead of letting a parse exception abort the whole document.
                        if (int.TryParse(
                                rProps.FontSize.Val?.Value,
                                System.Globalization.NumberStyles.Integer,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out int runFontSize)
                            && runFontSize <= 28)
                        {
                            // if font is too large, it is an ocr error, ignore it
                            FontSize = runFontSize;
                        }
                    }

                    if (rProps.RunFonts != null)
                    {
                        FontName = rProps.RunFonts.ComplexScript;
                    }
                }
            }
            else if (textOrBreak.LocalName == "t")
            {
                Text += textOrBreak.InnerText;
            }
            else if (textOrBreak.LocalName == "cr")
            {
                Text += "\n";
            }
            else if (textOrBreak.LocalName == "br")
            {
                // Do not use lastRenderedPageBreak here: see MinRes2011 for a wrong
                // lastRenderedPageBreak in the Semenov entry
                // (the dropped condition was: textOrBreak.Name == w + "lastRenderedPageBreak").
                Text += "\n";
            }
            else if (textOrBreak.LocalName == "numPr")
            {
                Text += "- ";
            }
        }

        Text += "\n";

        ParagraphProperties pPr = p.ParagraphProperties;
        if (pPr != null)
        {
            // Computed once per paragraph instead of on every loop iteration.
            var extraLines = AfterLinesCount(pPr.SpacingBetweenLines);
            for (int l = 0; l < extraLines; ++l)
            {
                Text += "\n";
            }
        }
    }

    IsEmpty = Text.IsNullOrWhiteSpace();

    if (string.IsNullOrEmpty(FontName))
    {
        FontName = docHolder.DefaultFontName;
    }

    if (FontSize == 0)
    {
        FontSize = docHolder.DefaultFontSize;
    }
}
/// <summary>
/// Builds a cell model from a Word table cell: text and font, border visibility, vertical
/// merge state, horizontal span, position and width.
/// </summary>
/// <param name="docHolder">Holder of the opened Word document (default font information).</param>
/// <param name="row">All cells of the current Word row.</param>
/// <param name="cellIndexInRow">Index of the cell to convert inside <paramref name="row"/>.</param>
/// <param name="tableWidth">Width information of the owning table.</param>
/// <param name="rowIndexInTable">Value stored in <c>Row</c>.</param>
/// <param name="unmergedColumnIndex">Value stored in <c>Col</c>; also indexes the grid column widths.</param>
/// <param name="tblBorders">Table-level borders, may be <c>null</c>.</param>
public OpenXmlWordCell(WordDocHolder docHolder, TableCell[] row, int cellIndexInRow, TableWidthInfo tableWidth, int rowIndexInTable, int unmergedColumnIndex, TableBorders tblBorders)
{
    TableCell inputCell = row[cellIndexInRow];
    InitTextProperties(docHolder, inputCell);

    // Cell properties are optional; everything below must tolerate their absence.
    var cellProperties = inputCell.TableCellProperties;

    var borders = cellProperties?.TableCellBorders;
    if (borders != null)
    {
        HasBottomBorder = WordDocHolder.BorderIsVisible(borders.BottomBorder);
        HasTopBorder = WordDocHolder.BorderIsVisible(borders.TopBorder);
        HasRightBorder = WordDocHolder.BorderIsVisible(borders.RightBorder);

        // A missing right border can still be drawn as the left border of the next cell.
        if (!HasRightBorder && cellIndexInRow + 1 < row.Length)
        {
            var nextCellBorders = row[cellIndexInRow + 1].TableCellProperties?.TableCellBorders;
            if (nextCellBorders != null)
            {
                HasRightBorder = WordDocHolder.BorderIsVisible(nextCellBorders.LeftBorder);
            }
        }

        // ... or by the table-wide inside vertical border. The size attribute may be absent,
        // in which case the comparison on the nullable value is simply false.
        if (!HasRightBorder && tblBorders?.InsideVerticalBorder?.Size?.Value > 0)
        {
            HasRightBorder = true;
        }
    }

    var vmerge = cellProperties?.GetFirstChild<VerticalMerge>();
    VerticallyMerged = null;
    if (vmerge != null)
    {
        if ((vmerge.Val == null) || (vmerge.Val == MergedCellValues.Continue))
        {
            // null -> MergedCellValues.Continue
            VerticallyMerged = MergedCellValues.Continue;
        }
        else if (vmerge.Val == MergedCellValues.Restart)
        {
            VerticallyMerged = MergedCellValues.Restart;
        }
    }

    if (tblBorders?.InsideHorizontalBorder?.Size?.Value > 0)
    {
        TableHasInsideHorizontalBorders = true;
    }

    // A cell without a grid span (or without a span value) occupies exactly one column.
    var gridSpan = cellProperties?.GetFirstChild<GridSpan>();
    int spannedColumns = gridSpan?.Val?.Value ?? 1;
    IsMerged = spannedColumns > 1;
    FirstMergedRow = -1; // init afterwards
    MergedRowsCount = -1; // init afterwards
    MergedColsCount = spannedColumns;

    Row = rowIndexInTable;
    Col = unmergedColumnIndex;

    var declaredWidth = cellProperties?.TableCellWidth;
    if (declaredWidth != null
        && declaredWidth.Type != null
        && declaredWidth.Type != TableWidthUnitValues.Auto)
    {
        CellWidth = TableWidthInfo.TryReadWidth(
            declaredWidth.Width,
            declaredWidth.Type,
            tableWidth.TableWidthInPixels);
    }
    else if (tableWidth.ColumnWidths != null && Col < tableWidth.ColumnWidths.Count)
    {
        // No usable explicit width: fall back to the grid column width, when the table has a grid.
        CellWidth = tableWidth.ColumnWidths[Col];
    }

    AdditTableIndention = tableWidth.TableIndentionInPixels;
}