Exemplo n.º 1
0
        /// <summary>
        /// Feeds every table of the document to <c>ProcessWordTableAndUpdateTitle</c>:
        /// first the tables found in the header parts, then the tables of the main document.
        /// Afterwards the collected rows are passed through <c>DropDayOfWeekRows</c>.
        /// </summary>
        /// <param name="docHolder">Holder of the opened Word document.</param>
        /// <param name="maxRowsToProcess">Row limit forwarded unchanged to the per-table processing.</param>
        /// <param name="extension">File extension; for ".htm"/".html" the <c>ExtractSubtables</c> step is skipped.</param>
        void CollectRows(WordDocHolder docHolder, int maxRowsToProcess, string extension)
        {
            var mainPart   = docHolder.WordDocument.MainDocumentPart;
            var bodyTables = mainPart.Document.Descendants <Table>().ToList();
            int nextIndex  = 0;

            // Header tables come first and share one running index with the body tables.
            foreach (OpenXmlPart headerPart in mainPart.HeaderParts)
            {
                foreach (var headerTable in headerPart.RootElement.Descendants <Table>())
                {
                    ProcessWordTableAndUpdateTitle(docHolder, headerTable, maxRowsToProcess, nextIndex++);
                }
            }

            // This is just a workaround; it needs to be built into the architecture somehow.
            bool isHtml = extension == ".htm" || extension == ".html";
            if (!isHtml)
            {
                bodyTables = ExtractSubtables(bodyTables);
            }

            // Only the body tables (after the optional ExtractSubtables step) are counted here.
            TablesCount = bodyTables.Count;
            foreach (var bodyTable in bodyTables)
            {
                ProcessWordTableAndUpdateTitle(docHolder, bodyTable, maxRowsToProcess, nextIndex++);
            }

            TableRows = DropDayOfWeekRows(TableRows);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Converts the rows of one Word table into lists of <c>OpenXmlWordCell</c> and appends
        /// them to <c>TableRows</c>. Rows whose cells are all empty are skipped. If the first row
        /// of the table passes <c>BigramsHolder.CheckMergeRow</c> against the last collected row,
        /// it is merged into that row instead of being appended.
        /// After the loop, everything appended by this call is removed again when no processed
        /// row had more than 4 cells or when <c>CheckNameColumnIsEmpty</c> returns true.
        /// </summary>
        /// <param name="docHolder">Holder of the opened Word document.</param>
        /// <param name="table">Table whose rows are read.</param>
        /// <param name="maxRowsToProcess">Stop once <c>TableRows</c> holds this many rows; -1 disables the limit.</param>
        void ProcessWordTable(WordDocHolder docHolder, Table table, int maxRowsToProcess)
        {
            List <TableRow> sourceRows     = table.Descendants <TableRow>().ToList();
            TableWidthInfo  widths         = InitializeTableWidthInfo(docHolder, table);
            int             rowsBefore     = TableRows.Count;
            int             widestRowCells = 0;

            for (int rowIndex = 0; rowIndex < sourceRows.Count; rowIndex++)
            {
                var      cells      = new List <OpenXmlWordCell>();
                int      columnPos  = 0;
                TableRow sourceRow  = sourceRows[rowIndex];
                int      gridBefore = GetRowGridBefore(sourceRow);
                bool     hasContent = false;

                foreach (var sourceCell in sourceRow.Elements <TableCell>())
                {
                    var cell = new OpenXmlWordCell(docHolder, widths, sourceCell, TableRows.Count, columnPos);
                    if (cells.Count == 0)
                    {
                        // The row's grid-before value is added to the span of the first cell.
                        cell.MergedColsCount += gridBefore;
                    }
                    cells.Add(cell);
                    columnPos += cell.MergedColsCount;
                    hasContent = hasContent || !cell.IsEmpty;
                }

                if (!hasContent)
                {
                    continue;
                }

                widestRowCells = Math.Max(cells.Count, widestRowCells);

                // Only the very first row of a table is a candidate for merging into the previous row.
                bool mergeIntoPrevious =
                    rowIndex == 0 &&
                    TableRows.Count > 0 &&
                    BigramsHolder.CheckMergeRow(
                        TableRows.Last().ConvertAll(x => x.Text),
                        cells.ConvertAll(x => x.Text));

                if (mergeIntoPrevious)
                {
                    MergeRow(TableRows.Last(), cells);
                }
                else
                {
                    TableRows.Add(cells);
                }

                bool limitReached = (maxRowsToProcess != -1) && (TableRows.Count >= maxRowsToProcess);
                if (limitReached)
                {
                    break;
                }
            }

            bool looksSuspicious = widestRowCells <= 4 || CheckNameColumnIsEmpty(TableRows, rowsBefore);
            if (looksSuspicious)
            {
                // Remove this suspicious table: drop every row appended during this call.
                TableRows.RemoveRange(rowsBefore, TableRows.Count - rowsBefore);
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Builds a cell description from an Open XML table cell: text and font (via
        /// <c>InitTextProperties</c>), vertical/horizontal merge information, grid position and width.
        /// </summary>
        /// <param name="docHolder">Holder of the opened Word document.</param>
        /// <param name="tableWidth">Width information of the enclosing table.</param>
        /// <param name="inputCell">Source cell; its cell properties element may be absent.</param>
        /// <param name="row">Row index stored in <c>Row</c>.</param>
        /// <param name="column">Column position stored in <c>Col</c>.</param>
        public OpenXmlWordCell(WordDocHolder docHolder, TableWidthInfo tableWidth, TableCell inputCell, int row, int column)
        {
            InitTextProperties(docHolder, inputCell);

            // The cell properties element is optional, so every lookup below must tolerate null
            // (the previous code dereferenced it unconditionally and could throw).
            var cellProps = inputCell.TableCellProperties;
            var vmerge    = cellProps?.GetFirstChild <VerticalMerge>();

            // A vertical-merge element without a value, or with "continue", marks the cell as
            // merged; any other value (i.e. "restart") does not.
            IsVerticallyMerged = vmerge != null &&
                                 (vmerge.Val == null || vmerge.Val == MergedCellValues.Continue);

            // A missing grid span, or one without a value, counts as a single column.
            var gridSpan       = cellProps?.GetFirstChild <GridSpan>();
            int spannedColumns = gridSpan?.Val?.Value ?? 1;

            IsMerged        = spannedColumns > 1;
            FirstMergedRow  = -1; // init afterwards
            MergedRowsCount = -1; // init afterwards

            MergedColsCount = spannedColumns;
            Row             = row;
            Col             = column;

            var declaredWidth = cellProps?.TableCellWidth;
            if (declaredWidth != null &&
                declaredWidth.Type != null &&
                declaredWidth.Type != TableWidthUnitValues.Auto)
            {
                // The cell declares its own non-auto width.
                CellWidth = TableWidthInfo.TryReadWidth(
                    declaredWidth.Width,
                    declaredWidth.Type,
                    tableWidth.TableWidthInPixels);
            }
            else if (tableWidth.ColumnWidths != null && Col < tableWidth.ColumnWidths.Count)
            {
                // Otherwise fall back to the table's column width at this position, when there is one.
                // NOTE(review): the null check assumes ColumnWidths can be unset - confirm TableWidthInfo's default.
                CellWidth = tableWidth.ColumnWidths[Col];
            }

            AdditTableIndention = tableWidth.TableIndentionInPixels;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Creates a <c>TableWidthInfo</c> for a table from its table properties and table grid.
        /// </summary>
        /// <param name="docHolder">Supplies the page size and the left page margin in pixels.</param>
        /// <param name="table">Table whose properties and grid are read.</param>
        /// <returns>
        /// The populated width info. <c>ColumnWidths</c> is assigned here only when the table
        /// has a table grid element.
        /// </returns>
        TableWidthInfo InitializeTableWidthInfo(WordDocHolder docHolder, Table table)
        {
            var result     = new TableWidthInfo();
            var properties = table.GetFirstChild <TableProperties>();

            if (properties == null)
            {
                // No table properties at all: use the page size as the table width.
                result.TableWidthInPixels = docHolder.DocumentPageSizeInPixels;
            }
            else
            {
                var declaredWidth = properties.TableWidth;
                if (declaredWidth != null)
                {
                    result.TableWidthInPixels = TableWidthInfo.TryReadWidth(
                        declaredWidth.Width,
                        declaredWidth.Type,
                        docHolder.DocumentPageSizeInPixels);
                }

                var indentation = properties.TableIndentation;
                if (indentation != null)
                {
                    result.TableIndentionInPixels = TableWidthInfo.TryReadWidth(
                        indentation.Width,
                        indentation.Type,
                        docHolder.DocumentPageSizeInPixels);
                }

                // The left page margin is added only on this branch, i.e. when table properties exist.
                result.TableIndentionInPixels += docHolder.DocumentPageLeftMaginInPixels;
            }

            var grid = table.GetFirstChild <TableGrid>();
            if (grid == null)
            {
                return(result);
            }

            // Grid column widths are read as Dxa values.
            result.ColumnWidths = new List <int>();
            foreach (var gridColumn in grid.Elements <GridColumn>())
            {
                int columnWidth = TableWidthInfo.TryReadWidth(
                    gridColumn.Width,
                    TableWidthUnitValues.Dxa,
                    result.TableWidthInPixels);
                result.ColumnWidths.Add(columnWidth);
            }
            return(result);
        }
Exemplo n.º 5
0
 /// <summary>
 /// Opens the Word file read-only, selects the first scheme from <c>_allSchemes</c> whose
 /// <c>CanProcess</c> accepts the document, and stores the title found above the table.
 /// When a scheme matches, the document is handed to it; otherwise the tables are collected
 /// generically via <c>CollectRows</c>.
 /// </summary>
 /// <param name="fileName">Path of the Word document to open.</param>
 /// <param name="extension">File extension, forwarded to <c>CollectRows</c>.</param>
 /// <param name="maxRowsToProcess">Row limit, forwarded to <c>CollectRows</c>.</param>
 private void ProcessDoc(string fileName, string extension, int maxRowsToProcess)
 {
     using (var holder = new WordDocHolder(WordprocessingDocument.Open(fileName, false)))
     {
         CurrentScheme = _allSchemes.Find(scheme => scheme.CanProcess(holder.WordDocument));

         // The title is determined the same way on both paths.
         Title = holder.FindTitleAboveTheTable();

         if (CurrentScheme != default)
         {
             // A dedicated scheme handles this document: give it the document body.
             CurrentScheme.Document = holder.WordDocument.MainDocumentPart.Document;
             TablesCount            = 1;
             return;
         }

         // No scheme matched: collect rows from the individual tables.
         CollectRows(holder, maxRowsToProcess, extension);
         UnmergedColumnsCount = GetUnmergedColumnsCountByFirstRow();
         InitializeVerticallyMerge();
     }
 }
Exemplo n.º 6
0
        /// <summary>
        /// Decides whether a table is worth processing and, if so, passes it to
        /// <c>ProcessWordTable</c>. A table is ignored when it contains nested tables, when its
        /// text is non-empty but has no uppercase character, or when its text is shorter than
        /// 30 characters. Independently of that, when no title is known yet and the table text is
        /// longer than 30 characters and contains the word "декабря", the table's row texts
        /// (joined with newlines) become the title.
        /// </summary>
        /// <param name="docHolder">Holder of the opened Word document.</param>
        /// <param name="table">Table to inspect.</param>
        /// <param name="maxRowsToProcess">Row limit forwarded to <c>ProcessWordTable</c>.</param>
        /// <param name="tableIndex">Index of the table, used only in log messages.</param>
        void ProcessWordTableAndUpdateTitle(WordDocHolder docHolder, Table table, int maxRowsToProcess, int tableIndex)
        {
            int rowsBefore = TableRows.Count;

            // Read InnerText once and reuse it, instead of re-reading it for every check below.
            string tableText = table.InnerText;

            if (table.Descendants <Table>().Any())
            {
                Logger.Debug(String.Format("ignore table {0} with subtables", tableIndex));
            }
            else if (tableText.Length > 0 && !tableText.Any(x => Char.IsUpper(x)))
            {
                Logger.Debug(String.Format("ignore table {0} that has no uppercase char", tableIndex));
            }
            else if (tableText.Length < 30)
            {
                Logger.Debug(String.Format("ignore table {0}, it is too short", tableIndex));
            }
            else
            {
                ProcessWordTable(docHolder, table, maxRowsToProcess);
            }

            if (TableRows.Count > rowsBefore)
            {
                string preview = tableText.Length > 30 ? tableText.Substring(0, 30) : tableText;
                Logger.Debug(String.Format("add {0} rows (TableRows.Count={1} ) from table {2} Table.innertText[0:30]='{3}'",
                                           TableRows.Count - rowsBefore,
                                           TableRows.Count,
                                           tableIndex,
                                           preview));
            }

            // IsNullOrEmpty also covers a title that was never set (null).
            // The case-insensitive ordinal search avoids a lower-cased copy of the text and does
            // not depend on the current culture.
            if (String.IsNullOrEmpty(Title) &&
                tableText.Length > 30 &&
                tableText.IndexOf("декабря", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                Title = String.Join("\n", table.Descendants <TableRow>().Select(r => r.InnerText));
            }
        }
Exemplo n.º 7
0
        /// <summary>
        /// Extracts the text of a cell and its font name and size, storing them in <c>Text</c>,
        /// <c>IsEmpty</c>, <c>FontName</c> and <c>FontSize</c>.
        /// Text is gathered paragraph by paragraph: "t" elements contribute their text, "cr" and
        /// "br" contribute a newline, "numPr" contributes "- ". Every paragraph is terminated by
        /// a newline plus as many extra newlines as <c>AfterLinesCount</c> reports for its spacing.
        /// Font name and size come from the run properties (the last run that sets them wins);
        /// when none is found, the document defaults from <paramref name="docHolder"/> are used.
        /// </summary>
        /// <param name="docHolder">Supplies the default font name and size.</param>
        /// <param name="inputCell">Element whose direct child paragraphs are read.</param>
        private void InitTextProperties(WordDocHolder docHolder, OpenXmlElement inputCell)
        {
            // If the font is too large it is an OCR error, so sizes above this are ignored.
            const int maxAcceptedFontSize = 28;

            var text = new System.Text.StringBuilder();

            FontName = "";
            FontSize = 0;
            foreach (var p in inputCell.Elements <Paragraph>())
            {
                foreach (var node in p.Descendants())
                {
                    if (node is Run run)
                    {
                        RunProperties rProps = run.RunProperties;
                        if (rProps == null)
                        {
                            continue;
                        }

                        // The size attribute is free text in the file: a missing or non-integer
                        // value is skipped instead of throwing, and parsing is culture-invariant.
                        if (rProps.FontSize != null &&
                            Int32.TryParse(
                                rProps.FontSize.Val?.Value,
                                System.Globalization.NumberStyles.Integer,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out int runFontSize) &&
                            runFontSize <= maxAcceptedFontSize)
                        {
                            FontSize = runFontSize;
                        }

                        if (rProps.RunFonts != null)
                        {
                            FontName = rProps.RunFonts.ComplexScript;
                        }
                        continue;
                    }

                    switch (node.LocalName)
                    {
                        case "t":
                            text.Append(node.InnerText);
                            break;

                        // Do not use lastRenderedPageBreak here: see MinRes2011 for a wrong
                        // lastRenderedPageBreak in the Semenov document.
                        case "cr":
                        case "br":
                            text.Append('\n');
                            break;

                        case "numPr":
                            text.Append("- ");
                            break;
                    }
                }

                text.Append('\n');
                ParagraphProperties pPr = p.ParagraphProperties;
                if (pPr != null)
                {
                    // Evaluate the line count once rather than on every loop iteration.
                    int extraLines = AfterLinesCount(pPr.SpacingBetweenLines);
                    if (extraLines > 0)
                    {
                        text.Append('\n', extraLines);
                    }
                }
            }

            string s = text.ToString();
            Text    = s;
            IsEmpty = s.IsNullOrWhiteSpace();
            if (string.IsNullOrEmpty(FontName))
            {
                FontName = docHolder.DefaultFontName;
            }
            if (FontSize == 0)
            {
                FontSize = docHolder.DefaultFontSize;
            }
        }