/// <summary>
/// Feeds every table of the document to <c>ProcessWordTableAndUpdateTitle</c> (tables found in
/// header parts first, then the tables of the main document body) and finally passes the
/// accumulated <c>TableRows</c> through <c>DropDayOfWeekRows</c>.
/// </summary>
/// <param name="docHolder">Holder of the opened Word document.</param>
/// <param name="maxRowsToProcess">Row limit forwarded unchanged to the per-table processing.</param>
/// <param name="extension">
/// Extension of the source file; for ".htm" and ".html" the body tables are used as they are,
/// for anything else they are first replaced by the result of <c>ExtractSubtables</c>.
/// </param>
void CollectRows(WordDocHolder docHolder, int maxRowsToProcess, string extension)
{
    var mainPart = docHolder.WordDocument.MainDocumentPart;
    var bodyTables = mainPart.Document.Descendants<Table>().ToList();
    int nextTableIndex = 0;

    // Header tables are processed (and numbered) before the body tables.
    foreach (OpenXmlPart headerPart in mainPart.HeaderParts)
    {
        foreach (var headerTable in headerPart.RootElement.Descendants<Table>())
        {
            ProcessWordTableAndUpdateTitle(docHolder, headerTable, maxRowsToProcess, nextTableIndex);
            nextTableIndex++;
        }
    }

    // This is just a kludge; it should somehow be built into the architecture.
    bool isHtmlSource = extension == ".htm" || extension == ".html";
    if (!isHtmlSource)
    {
        bodyTables = ExtractSubtables(bodyTables);
    }

    // Only body tables are counted; header tables are not included in TablesCount.
    TablesCount = bodyTables.Count;

    foreach (var bodyTable in bodyTables)
    {
        ProcessWordTableAndUpdateTitle(docHolder, bodyTable, maxRowsToProcess, nextTableIndex);
        nextTableIndex++;
    }

    TableRows = DropDayOfWeekRows(TableRows);
}
/// <summary>
/// Converts the rows of one Word table into lists of <c>OpenXmlWordCell</c> and appends them to
/// <c>TableRows</c>. Rows whose cells are all empty are skipped. After the loop the rows added by
/// this call are removed again when the table looks suspicious (see the final check).
/// </summary>
/// <param name="docHolder">Holder of the opened Word document.</param>
/// <param name="table">Table whose rows are read.</param>
/// <param name="maxRowsToProcess">
/// Stop once <c>TableRows</c> holds at least this many rows; -1 disables the limit.
/// </param>
void ProcessWordTable(WordDocHolder docHolder, Table table, int maxRowsToProcess)
{
    var sourceRows = table.Descendants<TableRow>().ToList();
    TableWidthInfo widthInfo = InitializeTableWidthInfo(docHolder, table);
    int firstNewRowIndex = TableRows.Count;
    int widestRowCellCount = 0;

    for (int rowIndex = 0; rowIndex < sourceRows.Count; rowIndex++)
    {
        var cells = new List<OpenXmlWordCell>();
        int columnOffset = 0;
        var sourceRow = sourceRows[rowIndex];
        int gridBefore = GetRowGridBefore(sourceRow);
        bool hasContent = false;

        foreach (var sourceCell in sourceRow.Elements<TableCell>())
        {
            var cell = new OpenXmlWordCell(docHolder, widthInfo, sourceCell, TableRows.Count, columnOffset);
            if (cells.Count == 0)
            {
                // The row's grid-before value is added to the span of the first cell.
                cell.MergedColsCount += gridBefore;
            }

            cells.Add(cell);
            columnOffset += cell.MergedColsCount;
            hasContent = hasContent || !cell.IsEmpty;
        }

        if (!hasContent)
        {
            continue;
        }

        widestRowCellCount = Math.Max(cells.Count, widestRowCellCount);

        // Only the very first row of a table is a candidate for merging into the last row
        // collected so far.
        bool continuesPreviousRow =
            rowIndex == 0
            && TableRows.Count > 0
            && BigramsHolder.CheckMergeRow(
                TableRows.Last().ConvertAll(x => x.Text),
                cells.ConvertAll(x => x.Text));

        if (continuesPreviousRow)
        {
            MergeRow(TableRows.Last(), cells);
        }
        else
        {
            TableRows.Add(cells);
        }

        bool limitReached = (maxRowsToProcess != -1) && (TableRows.Count >= maxRowsToProcess);
        if (limitReached)
        {
            break;
        }
    }

    // Remove this suspicious table: no row had more than 4 cells, or CheckNameColumnIsEmpty
    // reports true for the rows added by this call.
    // NOTE(review): only rows from firstNewRowIndex on are removed; a row merged into the
    // previous last row via MergeRow above is presumably not undone here - confirm intent.
    bool isSuspicious = widestRowCellCount <= 4 || CheckNameColumnIsEmpty(TableRows, firstNewRowIndex);
    if (isSuspicious)
    {
        TableRows.RemoveRange(firstNewRowIndex, TableRows.Count - firstNewRowIndex);
    }
}
/// <summary>
/// Creates a cell from a Word table cell: reads its text and font properties, its vertical and
/// horizontal merge information, its position and its width.
/// </summary>
/// <param name="docHolder">Holder of the opened Word document (passed to <c>InitTextProperties</c>).</param>
/// <param name="tableWidth">Width information of the owning table.</param>
/// <param name="inputCell">The Word table cell to read.</param>
/// <param name="row">Value stored in <c>Row</c>.</param>
/// <param name="column">Value stored in <c>Col</c>.</param>
public OpenXmlWordCell(WordDocHolder docHolder, TableWidthInfo tableWidth, TableCell inputCell, int row, int column)
{
    InitTextProperties(docHolder, inputCell);

    // A cell may have no properties element at all; every read below goes through this
    // local so that such a cell is treated as "no merge, no explicit width" instead of
    // throwing a NullReferenceException.
    var cellProperties = inputCell.TableCellProperties;

    // A vertical-merge element without a value, or with the value Continue, marks a cell that
    // continues a merge; any other value (Restart) starts a new one.
    var vmerge = cellProperties?.GetFirstChild<VerticalMerge>();
    IsVerticallyMerged = vmerge != null
        && (vmerge.Val == null || vmerge.Val == MergedCellValues.Continue);

    // A missing grid-span element (or one without a value) means the cell spans one column.
    int columnSpan = cellProperties?.GetFirstChild<GridSpan>()?.Val?.Value ?? 1;
    IsMerged = columnSpan > 1;
    FirstMergedRow = -1; // init afterwards
    MergedRowsCount = -1; // init afterwards
    MergedColsCount = columnSpan;
    Row = row;
    Col = column;

    var declaredWidth = cellProperties?.TableCellWidth;
    if (declaredWidth != null
        && declaredWidth.Type != null
        && declaredWidth.Type != TableWidthUnitValues.Auto)
    {
        CellWidth = TableWidthInfo.TryReadWidth(
            declaredWidth.Width,
            declaredWidth.Type,
            tableWidth.TableWidthInPixels);
    }
    else if (tableWidth.ColumnWidths != null && Col < tableWidth.ColumnWidths.Count)
    {
        // No usable explicit width: fall back to the table grid column at this position.
        CellWidth = tableWidth.ColumnWidths[Col];
    }

    AdditTableIndention = tableWidth.TableIndentionInPixels;
}
/// <summary>
/// Builds the <c>TableWidthInfo</c> of a table from its table properties (width, indentation)
/// and its table grid (column widths).
/// </summary>
/// <param name="docHolder">Supplies the page size and left margin used as reference values.</param>
/// <param name="table">Table to inspect.</param>
/// <returns>The populated width information.</returns>
TableWidthInfo InitializeTableWidthInfo(WordDocHolder docHolder, Table table)
{
    var info = new TableWidthInfo();

    TableProperties properties = table.GetFirstChild<TableProperties>();
    if (properties == null)
    {
        // Without table properties the table is assumed to be as wide as the page.
        info.TableWidthInPixels = docHolder.DocumentPageSizeInPixels;
    }
    else
    {
        var declaredWidth = properties.TableWidth;
        if (declaredWidth != null)
        {
            info.TableWidthInPixels = TableWidthInfo.TryReadWidth(
                declaredWidth.Width,
                declaredWidth.Type,
                docHolder.DocumentPageSizeInPixels);
        }

        var declaredIndent = properties.TableIndentation;
        if (declaredIndent != null)
        {
            info.TableIndentionInPixels = TableWidthInfo.TryReadWidth(
                declaredIndent.Width,
                declaredIndent.Type,
                docHolder.DocumentPageSizeInPixels);
        }

        // The page's left margin is added whenever table properties exist,
        // whether or not an explicit indentation was declared.
        info.TableIndentionInPixels += docHolder.DocumentPageLeftMaginInPixels;
    }

    TableGrid grid = table.GetFirstChild<TableGrid>();
    if (grid != null)
    {
        info.ColumnWidths = new List<int>();
        foreach (var gridColumn in grid.Elements<GridColumn>())
        {
            // Grid column widths are always read as Dxa, relative to the table width found above.
            int columnWidth = TableWidthInfo.TryReadWidth(
                gridColumn.Width,
                TableWidthUnitValues.Dxa,
                info.TableWidthInPixels);
            info.ColumnWidths.Add(columnWidth);
        }
    }

    return info;
}
/// <summary>
/// Opens the Word file read-only and prepares this object from it. When one of the known
/// schemes reports that it can process the document, the scheme receives the document and
/// no rows are collected here; otherwise the rows are collected from the document's tables.
/// </summary>
/// <param name="fileName">Path of the Word file to open.</param>
/// <param name="extension">Extension forwarded to <c>CollectRows</c>.</param>
/// <param name="maxRowsToProcess">Row limit forwarded to <c>CollectRows</c>.</param>
private void ProcessDoc(string fileName, string extension, int maxRowsToProcess)
{
    using (var doc = new WordDocHolder(WordprocessingDocument.Open(fileName, false)))
    {
        CurrentScheme = _allSchemes.Find(x => x.CanProcess(doc.WordDocument));

        // The title is determined the same way on both paths.
        Title = doc.FindTitleAboveTheTable();

        if (CurrentScheme != default)
        {
            // A scheme handles this document: hand it the document body and report one table.
            CurrentScheme.Document = doc.WordDocument.MainDocumentPart.Document;
            TablesCount = 1;
            return;
        }

        // No scheme matched: collect rows from the individual tables.
        CollectRows(doc, maxRowsToProcess, extension);
        UnmergedColumnsCount = GetUnmergedColumnsCountByFirstRow();
        InitializeVerticallyMerge();
    }
}
/// <summary>
/// Decides whether a table is worth processing and, if so, passes it to <c>ProcessWordTable</c>.
/// A table is ignored when it contains nested tables, when its text is non-empty but has no
/// uppercase character, or when its text is shorter than 30 characters. Independently of that
/// decision, when no title is known yet and the table text is longer than 30 characters and
/// contains "декабря", the table's row texts (joined by newlines) become the title.
/// </summary>
/// <param name="docHolder">Holder of the opened Word document.</param>
/// <param name="table">Table to examine.</param>
/// <param name="maxRowsToProcess">Row limit forwarded to <c>ProcessWordTable</c>.</param>
/// <param name="tableIndex">Index of the table, used only in log messages.</param>
void ProcessWordTableAndUpdateTitle(WordDocHolder docHolder, Table table, int maxRowsToProcess, int tableIndex)
{
    int rowCountBefore = TableRows.Count;

    // InnerText is assembled from the element tree on every access, so read it once.
    string innerText = table.InnerText;

    if (table.Descendants<Table>().Any())
    {
        Logger.Debug($"ignore table {tableIndex} with subtables");
    }
    else if (innerText.Length > 0 && !innerText.Any(x => Char.IsUpper(x)))
    {
        Logger.Debug($"ignore table {tableIndex} that has no uppercase char");
    }
    else if (innerText.Length < 30)
    {
        Logger.Debug($"ignore table {tableIndex}, it is too short");
    }
    else
    {
        ProcessWordTable(docHolder, table, maxRowsToProcess);
    }

    if (TableRows.Count > rowCountBefore)
    {
        string tableText = innerText.Length > 30 ? innerText.Substring(0, 30) : innerText;
        Logger.Debug(
            $"add {TableRows.Count - rowCountBefore} rows (TableRows.Count={TableRows.Count} ) from table {tableIndex} Table.innertText[0:30]='{tableText}'");
    }

    // Case-insensitive search without allocating a lower-cased copy of the whole table text.
    // A null Title is treated like an empty one.
    if (string.IsNullOrEmpty(Title)
        && innerText.Length > 30
        && innerText.IndexOf("декабря", StringComparison.CurrentCultureIgnoreCase) != -1)
    {
        Title = String.Join("\n", table.Descendants<TableRow>().Select(r => r.InnerText));
    }
}
/// <summary>
/// Reads the text and font information of a cell. The text of all direct child paragraphs is
/// concatenated into <c>Text</c> (one newline after each paragraph, plus the number of extra
/// newlines reported by <c>AfterLinesCount</c>); <c>FontName</c> and <c>FontSize</c> are taken
/// from the run properties, falling back to the document defaults when none is found.
/// </summary>
/// <param name="docHolder">Supplies the default font name and size.</param>
/// <param name="inputCell">Element whose direct child paragraphs are read.</param>
private void InitTextProperties(WordDocHolder docHolder, OpenXmlElement inputCell)
{
    // StringBuilder avoids re-copying the accumulated text on every appended piece.
    var text = new System.Text.StringBuilder();
    FontName = "";
    FontSize = 0;

    foreach (var paragraph in inputCell.Elements<Paragraph>())
    {
        foreach (var node in paragraph.Descendants())
        {
            switch (node.LocalName)
            {
                case "r":
                    if (node is Run run)
                    {
                        ReadRunFont(run);
                    }
                    break;

                case "t":
                    text.Append(node.InnerText);
                    break;

                // Do not use lastRenderedPageBreak: see MinRes2011 for a wrong
                // lastRenderedPageBreak in the Semenov document.
                case "cr":
                case "br":
                    text.Append('\n');
                    break;

                case "numPr":
                    text.Append("- ");
                    break;
            }
        }

        text.Append('\n');

        ParagraphProperties paragraphProps = paragraph.ParagraphProperties;
        if (paragraphProps != null)
        {
            // Evaluated once per paragraph instead of on every loop iteration.
            int extraLines = AfterLinesCount(paragraphProps.SpacingBetweenLines);
            for (int l = 0; l < extraLines; ++l)
            {
                text.Append('\n');
            }
        }
    }

    string s = text.ToString();
    Text = s;
    IsEmpty = s.IsNullOrWhiteSpace();

    if (string.IsNullOrEmpty(FontName))
    {
        FontName = docHolder.DefaultFontName;
    }

    if (FontSize == 0)
    {
        FontSize = docHolder.DefaultFontSize;
    }
}

// Updates FontSize / FontName from the properties of one run; later runs overwrite earlier ones.
private void ReadRunFont(Run run)
{
    RunProperties runProps = run.RunProperties;
    if (runProps == null)
    {
        return;
    }

    // TryParse with the invariant culture: a missing or malformed size value is skipped
    // instead of throwing, and parsing does not depend on the current thread culture.
    if (int.TryParse(
            runProps.FontSize?.Val?.Value,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out int runFontSize)
        && runFontSize <= 28)
    {
        // If the font is too large it is an OCR error; such values are ignored.
        FontSize = runFontSize;
    }

    if (runProps.RunFonts != null)
    {
        FontName = runProps.RunFonts.ComplexScript;
    }
}