示例#1
0
        /// <summary>
        /// Checks whether a paragraph with text lies directly inside a cell of a multi-row table
        /// whose row has at least three inner vertical borders.
        /// </summary>
        /// <param name="p">Paragraph to inspect.</param>
        /// <returns>
        /// <c>true</c> only when the paragraph text is not empty, its parent is exactly a <c>TableCell</c>,
        /// the cell's parent is exactly a <c>TableRow</c> for which
        /// <c>WordDocHolder.CountInnerVerticalBorders</c> returns 3 or more, and the row's parent is a
        /// type named "Table" containing more than one <c>TableRow</c> descendant.
        /// </returns>
        public bool NotEmptyParagraphIsInADeclarationTable(Paragraph p)
        {
            if (p.InnerText == "")
            {
                return false;
            }

            var documentBody = WordDocument.MainDocumentPart.Document.Body;
            var parentCell   = p.Parent;

            // Paragraphs placed directly in the body, or in anything that is not exactly a TableCell, are rejected.
            if (parentCell == documentBody || parentCell == null || parentCell.GetType() != typeof(TableCell))
            {
                return false;
            }

            var  parentRow   = parentCell.Parent;
            bool isTableRow  = parentRow != null && parentRow.GetType() == typeof(TableRow);

            if (!isTableRow || WordDocHolder.CountInnerVerticalBorders((TableRow)parentRow) < 3)
            {
                return false;
            }

            var parentTable = parentRow.Parent;

            // Descendants also counts rows of nested tables.
            return parentTable != null
                   && parentTable.GetType().Name == "Table"
                   && parentTable.Descendants<TableRow>().Count() > 1;
        }
示例#2
0
        /// <summary>
        /// Counts the distinct inner cell boundaries of a row that carry a visible vertical border.
        /// </summary>
        /// <param name="row">Row whose <c>TableCell</c> descendants are examined (nested-table cells included).</param>
        /// <returns>
        /// Number of boundaries between neighbouring cells for which either the left cell's right border
        /// or the right cell's left border is considered visible. A cell without explicit cell borders
        /// is treated as having both borders visible.
        /// </returns>
        public static int CountInnerVerticalBorders(TableRow row)
        {
            var rowCells             = row.Descendants<TableCell>().ToList();
            var boundariesWithBorder = new HashSet<int>();

            for (int i = 0; i < rowCells.Count; ++i)
            {
                var  borders  = rowCells[i]?.TableCellProperties?.TableCellBorders;
                bool hasLeft  = borders == null || WordDocHolder.BorderIsVisible(borders.LeftBorder);
                bool hasRight = borders == null || WordDocHolder.BorderIsVisible(borders.RightBorder);

                // Boundary k lies between cell k-1 and cell k; the outer edges of the row are not counted.
                if (hasLeft && i > 0)
                {
                    boundariesWithBorder.Add(i);
                }
                if (hasRight && i + 1 < rowCells.Count)
                {
                    boundariesWithBorder.Add(i + 1);
                }
            }
            return boundariesWithBorder.Count;
        }
示例#3
0
        /// <summary>
        /// Feeds every table of the document into <c>ProcessWordTableAndUpdateTitle</c>: first the tables
        /// found in header parts, then the tables of the main document, and finally filters the collected
        /// rows through <c>DropDayOfWeekRows</c>.
        /// </summary>
        /// <param name="docHolder">Holder of the opened Word document.</param>
        /// <param name="maxRowsToProcess">Row limit passed through to table processing.</param>
        /// <param name="extension">Source file extension; ".htm" and ".html" skip sub-table extraction.</param>
        void CollectRows(WordDocHolder docHolder, int maxRowsToProcess, string extension)
        {
            var mainPart       = docHolder.WordDocument.MainDocumentPart;
            var bodyTables     = mainPart.Document.Descendants<Table>().ToList();
            int nextTableIndex = 0;

            foreach (OpenXmlPart headerPart in mainPart.HeaderParts)
            {
                foreach (var headerTable in headerPart.RootElement.Descendants<Table>())
                {
                    ProcessWordTableAndUpdateTitle(docHolder, headerTable, maxRowsToProcess, nextTableIndex);
                    nextTableIndex += 1;
                }
            }

            // This is just a hack; it should be built into the architecture somehow.
            bool isHtmlSource = extension == ".htm" || extension == ".html";
            if (!isHtmlSource)
            {
                bodyTables = ExtractSubtables(bodyTables);
            }

            // Only the main-document tables are counted; header tables still consume table indices.
            TablesCount = bodyTables.Count;
            foreach (var bodyTable in bodyTables)
            {
                ProcessWordTableAndUpdateTitle(docHolder, bodyTable, maxRowsToProcess, nextTableIndex);
                nextTableIndex += 1;
            }

            TableRows = DropDayOfWeekRows(TableRows);
        }
示例#4
0
        /// <summary>
        /// Processes one Word table (when <c>CanAppendRowsFromTable</c> allows it), logs how many rows it
        /// contributed, and uses the table text as the document title when no title is known yet.
        /// </summary>
        /// <param name="docHolder">Holder of the opened Word document.</param>
        /// <param name="table">Table to process.</param>
        /// <param name="maxRowsToProcess">Row limit passed through to <c>ProcessWordTable</c>.</param>
        /// <param name="tableIndex">Running index of the table, used for the append check and for logging.</param>
        void ProcessWordTableAndUpdateTitle(WordDocHolder docHolder, Table table, int maxRowsToProcess, int tableIndex)
        {
            // Take the snapshot BEFORE processing. Taking it afterwards makes the comparison below
            // always false, so the debug message could never be emitted.
            int rowCountBefore = TableRows.Count;

            if (CanAppendRowsFromTable(table, tableIndex))
            {
                ProcessWordTable(docHolder, table, maxRowsToProcess);
            }

            if (TableRows.Count > rowCountBefore)
            {
                string fullText  = table.InnerText;
                string tableText = fullText.Length > 30 ? fullText.Substring(0, 30) : fullText;
                Logger.Debug(String.Format("add {0} rows (TableRows.Count={1} ) from table {2} Table.innertText[0:30]='{3}'",
                                           TableRows.Count - rowCountBefore,
                                           TableRows.Count,
                                           tableIndex,
                                           tableText));
            }

            if (Title.Length == 0)
            {
                // InnerText is read only when a title is still missing.
                string candidateText = table.InnerText;

                // "декабря" is Russian for "of December"; a table mentioning it is taken as the title.
                if (candidateText.Length > 30 && candidateText.ToLower().IndexOf("декабря") != -1)
                {
                    Title = String.Join("\n", table.Descendants<TableRow>().Select(r => r.InnerText));
                }
            }
        }
示例#5
0
        /// <summary>
        /// Builds a <c>TableWidthInfo</c> for a table from its table properties and its column grid.
        /// </summary>
        /// <param name="docHolder">Supplies the page size and the left page margin in pixels.</param>
        /// <param name="table">Table whose first <c>TableProperties</c> and <c>TableGrid</c> children are read.</param>
        /// <returns>
        /// Width info where: without table properties the table width is the page size; with table
        /// properties the width and indentation are read when present and the left page margin is added
        /// to the indentation; column widths are filled only when the table has a grid.
        /// </returns>
        TableWidthInfo InitializeTableWidthInfo(WordDocHolder docHolder, Table table)
        {
            var result     = new TableWidthInfo();
            var properties = table.GetFirstChild<TableProperties>();

            if (properties == null)
            {
                result.TableWidthInPixels = docHolder.DocumentPageSizeInPixels;
            }
            else
            {
                var declaredWidth = properties.TableWidth;
                if (declaredWidth != null)
                {
                    result.TableWidthInPixels = TableWidthInfo.TryReadWidth(
                        declaredWidth.Width,
                        declaredWidth.Type,
                        docHolder.DocumentPageSizeInPixels);
                }

                var declaredIndent = properties.TableIndentation;
                if (declaredIndent != null)
                {
                    result.TableIndentionInPixels = TableWidthInfo.TryReadWidth(
                        declaredIndent.Width,
                        declaredIndent.Type,
                        docHolder.DocumentPageSizeInPixels);
                }

                // The left page margin is added only on this branch (table properties present).
                result.TableIndentionInPixels += docHolder.DocumentPageLeftMaginInPixels;
            }

            var grid = table.GetFirstChild<TableGrid>();
            if (grid == null)
            {
                return result;
            }

            result.ColumnWidths = new List<int>();
            foreach (var gridColumn in grid.Elements<GridColumn>())
            {
                // Grid column widths are always read as Dxa units.
                int columnWidth = TableWidthInfo.TryReadWidth(
                    gridColumn.Width,
                    TableWidthUnitValues.Dxa,
                    result.TableWidthInPixels);
                result.ColumnWidths.Add(columnWidth);
            }
            return result;
        }
示例#6
0
 /// <summary>
 /// Opens a Word document read-only and either hands it to the first scheme that can process it
 /// or collects its table rows generically.
 /// </summary>
 /// <param name="fileName">Path of the document to open.</param>
 /// <param name="extension">File extension, forwarded to <c>CollectRows</c>.</param>
 /// <param name="maxRowsToProcess">Row limit, forwarded to <c>CollectRows</c>.</param>
 private void ProcessDoc(string fileName, string extension, int maxRowsToProcess)
 {
     using (var holder = new WordDocHolder(WordprocessingDocument.Open(fileName, false)))
     {
         CurrentScheme = _allSchemes.Find(scheme => scheme.CanProcess(holder.WordDocument));

         // The title is looked up the same way on both paths.
         Title = holder.FindTitleAboveTheTable();

         if (CurrentScheme != default)
         {
             // A matching scheme works on the document itself; it is counted as a single table.
             CurrentScheme.Document = holder.WordDocument.MainDocumentPart.Document;
             TablesCount            = 1;
         }
         else
         {
             // No scheme matched: collect rows from the distinct tables.
             CollectRows(holder, maxRowsToProcess, extension);
             UnmergedColumnsCount = GetUnmergedColumnsCountByFirstRow();
             FindRowsThatCannotBeMerged();
             InitializeVerticallyMerge();
         }
     }
 }
示例#7
0
        /// <summary>
        /// Converts the rows of a Word table into <c>OpenXmlTableRow</c> items appended to <c>TableRows</c>.
        /// </summary>
        /// <remarks>
        /// A cell that follows a cell without a right border is folded into that previous cell.
        /// Rows whose cells are all empty are skipped. The first row of the table may be merged into the
        /// last already collected row when <c>BigramsHolder.CheckMergeRow</c> accepts it. After the loop,
        /// the rows added by this call are removed again when the first data row is not recognised by
        /// <c>TableHeaderRecognizer.IsNamePositionAndIncomeTable</c> and the table either has at most
        /// four cells per row or <c>CheckNameColumnIsEmpty</c> reports true.
        /// </remarks>
        /// <param name="docHolder">Holder of the opened Word document.</param>
        /// <param name="table">Table to convert.</param>
        /// <param name="maxRowsToProcess">Stop once <c>TableRows</c> reaches this size; -1 disables the limit.</param>
        void ProcessWordTable(WordDocHolder docHolder, Table table, int maxRowsToProcess)
        {
            var wordRows           = table.Descendants<TableRow>().ToList();
            var widths             = InitializeTableWidthInfo(docHolder, table);
            int firstNewRowIndex   = TableRows.Count;
            int widestRowCellCount = 0;
            var tableBorders       = GetTableBorders(table);

            for (int rowIndex = 0; rowIndex < wordRows.Count; rowIndex++)
            {
                var  outputRow     = new OpenXmlTableRow();
                var  wordRow       = wordRows[rowIndex];
                int  gridBefore    = GetRowGridBefore(wordRow);
                var  wordCells     = wordRow.Elements<TableCell>().ToArray();
                int  columnOffset  = 0;
                bool allCellsEmpty = true;

                for (int cellIndex = 0; cellIndex < wordCells.Length; cellIndex++)
                {
                    var cell        = new OpenXmlWordCell(docHolder, wordCells, cellIndex, widths, TableRows.Count, columnOffset, tableBorders);
                    var outputCells = outputRow.RowCells;

                    if (outputCells.Count == 0)
                    {
                        // The first cell of the row also absorbs the grid columns skipped before it.
                        cell.MergedColsCount += gridBefore;
                        outputCells.Add(cell);
                    }
                    else if (!outputCells.Last().HasRightBorder)
                    {
                        // No border between the previous cell and this one: fold this cell into the previous.
                        var previous = outputCells.Last();
                        previous.Text            += cell.Text;
                        previous.CellWidth       += cell.CellWidth;
                        previous.MergedColsCount += cell.MergedColsCount;
                        previous.HasRightBorder   = cell.HasRightBorder;
                    }
                    else
                    {
                        outputCells.Add(cell);
                    }

                    columnOffset += cell.MergedColsCount;
                    allCellsEmpty = allCellsEmpty && cell.IsEmpty;
                }

                if (allCellsEmpty)
                {
                    continue;
                }

                widestRowCellCount = Math.Max(outputRow.RowCells.Count, widestRowCellCount);

                bool continuesPreviousRow =
                    rowIndex == 0
                    && TableRows.Count > 0
                    && BigramsHolder.CheckMergeRow(
                        TableRows.Last().RowCells.ConvertAll(x => x.Text),
                        outputRow.RowCells.ConvertAll(x => x.Text));

                if (continuesPreviousRow)
                {
                    MergeRow(TableRows.Last().RowCells, outputRow.RowCells);
                }
                else
                {
                    TableRows.Add(outputRow);
                }

                bool limitReached = (maxRowsToProcess != -1) && (TableRows.Count >= maxRowsToProcess);
                if (limitReached)
                {
                    break;
                }
            }

            bool suspiciousTable =
                (TableRows.Count > 0)
                && !TableHeaderRecognizer.IsNamePositionAndIncomeTable(GetDataCells(0))
                && (widestRowCellCount <= 4 || CheckNameColumnIsEmpty(firstNewRowIndex));

            if (suspiciousTable)
            {
                // Drop everything this table contributed.
                TableRows.RemoveRange(firstNewRowIndex, TableRows.Count - firstNewRowIndex);
            }
        }
示例#8
0
        /// <summary>
        /// Fills <c>Text</c>, <c>IsEmpty</c>, <c>FontSize</c> and <c>FontName</c> from the paragraphs that
        /// are direct children of the given cell element.
        /// </summary>
        /// <remarks>
        /// Text elements are concatenated; "cr" and "br" elements and every paragraph end add a line
        /// break; "numPr" adds a "- " marker. The font of the last run that declares one wins. When no
        /// font name or size was found, the document defaults from <paramref name="docHolder"/> are used.
        /// </remarks>
        /// <param name="docHolder">Supplies the default font name and size.</param>
        /// <param name="inputCell">Element whose child paragraphs are read.</param>
        private void InitTextProperties(WordDocHolder docHolder, OpenXmlElement inputCell)
        {
            // Accumulate in a builder instead of repeatedly re-allocating the Text string.
            var text = new System.Text.StringBuilder();

            foreach (var p in inputCell.Elements<Paragraph>())
            {
                foreach (var node in p.Descendants())
                {
                    switch (node.LocalName)
                    {
                        case "r":
                            if (node is Run run)
                            {
                                RunProperties rProps = run.RunProperties;
                                if (rProps != null)
                                {
                                    // The size value can be missing or not a plain integer; such values
                                    // are ignored instead of aborting the whole cell with an exception.
                                    string rawSize = rProps.FontSize?.Val?.Value;
                                    if (int.TryParse(rawSize,
                                                     System.Globalization.NumberStyles.Integer,
                                                     System.Globalization.CultureInfo.InvariantCulture,
                                                     out int runFontSize)
                                        && runFontSize <= 28)
                                    {
                                        // If the font is too large, it is an OCR error; ignore it.
                                        FontSize = runFontSize;
                                    }
                                    if (rProps.RunFonts != null)
                                    {
                                        FontName = rProps.RunFonts.ComplexScript;
                                    }
                                }
                            }
                            break;

                        case "t":
                            text.Append(node.InnerText);
                            break;

                        // lastRenderedPageBreak is deliberately NOT treated as a line break:
                        // it was found to be wrong in the MinRes2011 document (Semenov entry).
                        case "cr":
                        case "br":
                            text.Append('\n');
                            break;

                        case "numPr":
                            text.Append("- ");
                            break;
                    }
                }

                text.Append('\n');

                ParagraphProperties pPr = p.ParagraphProperties;
                if (pPr != null)
                {
                    // Evaluated once per paragraph; assumes AfterLinesCount is a pure function of the spacing.
                    int extraLines = AfterLinesCount(pPr.SpacingBetweenLines);
                    for (int l = 0; l < extraLines; ++l)
                    {
                        text.Append('\n');
                    }
                }
            }

            Text    = text.ToString();
            IsEmpty = Text.IsNullOrWhiteSpace();
            if (string.IsNullOrEmpty(FontName))
            {
                FontName = docHolder.DefaultFontName;
            }
            if (FontSize == 0)
            {
                FontSize = docHolder.DefaultFontSize;
            }
        }
示例#9
0
        /// <summary>
        /// Builds a cell model from one <c>TableCell</c> of a Word table row: text and font, visible
        /// borders, vertical/horizontal merge information, position and width.
        /// </summary>
        /// <param name="docHolder">Holder of the opened Word document (font defaults).</param>
        /// <param name="row">All cells of the current Word row.</param>
        /// <param name="cellIndexInRow">Index of the cell to read inside <paramref name="row"/>.</param>
        /// <param name="tableWidth">Width information of the owning table.</param>
        /// <param name="rowIndexInTable">Value stored in <c>Row</c>.</param>
        /// <param name="unmergedColumnIndex">Value stored in <c>Col</c>; also indexes the grid column widths.</param>
        /// <param name="tblBorders">Table-level borders; may be null.</param>
        public OpenXmlWordCell(WordDocHolder docHolder, TableCell[] row, int cellIndexInRow, TableWidthInfo tableWidth,
                               int rowIndexInTable, int unmergedColumnIndex, TableBorders tblBorders)
        {
            TableCell inputCell = row[cellIndexInRow];

            // Cell properties may be absent. Every access below goes through this local with a null
            // guard; previously the merge lookups dereferenced it unconditionally.
            TableCellProperties cellProps = inputCell.TableCellProperties;

            InitTextProperties(docHolder, inputCell);

            var borders = cellProps?.TableCellBorders;
            if (borders != null)
            {
                HasBottomBorder = WordDocHolder.BorderIsVisible(borders.BottomBorder);
                HasTopBorder    = WordDocHolder.BorderIsVisible(borders.TopBorder);
                HasRightBorder  = WordDocHolder.BorderIsVisible(borders.RightBorder);

                // The right edge also counts as bordered when the next cell draws its left border.
                if (!HasRightBorder && cellIndexInRow + 1 < row.Length)
                {
                    var nextBorders = row[cellIndexInRow + 1].TableCellProperties?.TableCellBorders;
                    if (nextBorders != null)
                    {
                        HasRightBorder = WordDocHolder.BorderIsVisible(nextBorders.LeftBorder);
                    }
                }

                // ... or when the table declares inside vertical borders with a positive size.
                // A missing size attribute is treated as "no border" instead of raising on conversion.
                if (!HasRightBorder && (tblBorders?.InsideVerticalBorder?.Size?.Value ?? 0u) > 0u)
                {
                    HasRightBorder = true;
                }
            }

            var vmerge = cellProps?.GetFirstChild<VerticalMerge>();

            VerticallyMerged = null;
            if (vmerge != null)
            {
                if ((vmerge.Val == null) || (vmerge.Val == MergedCellValues.Continue))
                {
                    // A vertical merge element without a value means "continue".
                    VerticallyMerged = MergedCellValues.Continue;
                }
                else if (vmerge.Val == MergedCellValues.Restart)
                {
                    VerticallyMerged = MergedCellValues.Restart;
                }
            }

            if ((tblBorders?.InsideHorizontalBorder?.Size?.Value ?? 0u) > 0u)
            {
                TableHasInsideHorizontalBorders = true;
            }

            var gridSpan       = cellProps?.GetFirstChild<GridSpan>();
            int spannedColumns = gridSpan?.Val?.Value ?? 1; // no grid span (or no value) => one column

            IsMerged        = spannedColumns > 1;
            FirstMergedRow  = -1; // init afterwards
            MergedRowsCount = -1; // init afterwards

            MergedColsCount = spannedColumns;
            Row             = rowIndexInTable;
            Col             = unmergedColumnIndex;

            var declaredWidth = cellProps?.TableCellWidth;
            if (declaredWidth != null &&
                declaredWidth.Type != null &&
                declaredWidth.Type != TableWidthUnitValues.Auto)
            {
                CellWidth = TableWidthInfo.TryReadWidth(
                    declaredWidth.Width,
                    declaredWidth.Type,
                    tableWidth.TableWidthInPixels);
            }
            else if (tableWidth.ColumnWidths != null && Col < tableWidth.ColumnWidths.Count)
            {
                // No explicit (non-auto) cell width: fall back to the grid column width.
                // ColumnWidths may be left unassigned when the table has no grid, hence the null guard.
                CellWidth = tableWidth.ColumnWidths[Col];
            }
            AdditTableIndention = tableWidth.TableIndentionInPixels;
        }