Example #1
0
        /// <summary>
        /// Builds a table area from the text rows that intersect enough of the relevant text edges,
        /// then widens/extends that area with nearby horizontal rulings and a fixed padding.
        /// </summary>
        /// <param name="lines">
        /// Text rows to scan, in list order. Consecutive rows are expected to have non-increasing
        /// <c>Bottom</c> values (checked with a debug assertion once row spacing has been recorded).
        /// </param>
        /// <param name="relevantEdges">Text edges each row is tested against with <c>IntersectsLine</c>.</param>
        /// <param name="relevantEdgeCount">
        /// Number of edge intersections a row needs to count as a table row. One miss is tolerated
        /// when this value is greater than 3.
        /// </param>
        /// <param name="horizontalRulings">
        /// Rulings used to extend the area. NOTE(review): the early exits in both ruling scans
        /// presumably rely on this list being sorted by Y - confirm against the caller.
        /// </param>
        /// <returns>
        /// The padded table area, or <c>null</c> when the accumulated area is still zero after the rows were scanned.
        /// </returns>
        private TableRectangle getTableFromText(List <TableLine> lines, List <TextEdge> relevantEdges, int relevantEdgeCount, List <Ruling> horizontalRulings)
        {
            // NOTE(review): the "bobld:" remarks kept below look like the counterpart expressions
            // for the opposite Y-axis orientation - confirm before relying on them.
            TableRectangle region = new TableRectangle();

            TableLine previous = null;
            TableLine firstRow = null;
            TableLine lastRow = null;

            int gapCount = 0;
            double gapTotal = 0;

            // Bigger edge sets may miss one edge; with three edges or fewer every edge must be hit,
            // otherwise plain paragraphs would be treated as tables too.
            int requiredEdgeHits = relevantEdgeCount <= 3 ? relevantEdgeCount : relevantEdgeCount - 1;

            foreach (TableLine row in lines)
            {
                if (firstRow != null && gapCount > 0)
                {
                    // Stop the table when this row is further from the previous one than
                    // 2.5 times the average spacing recorded so far.
                    double maxGap = (gapTotal / gapCount) * 2.5;
                    double gap = previous.Bottom - row.Bottom; // bobld: textRow.Top - prevRow.Top

                    System.Diagnostics.Debug.Assert(gap >= 0);

                    if (gap > maxGap)
                    {
                        lastRow = previous;
                        break;
                    }
                }

                // Count how many of the relevant edges this row crosses.
                int edgeHits = 0;
                foreach (TextEdge edge in relevantEdges)
                {
                    edgeHits += textRowHit(row.IntersectsLine(edge.Line));
                }

                if (edgeHits < requiredEdgeHits)
                {
                    // Not a table row. The first such row after the table started
                    // fixes the last table row; later misses leave it alone.
                    if (firstRow != null && lastRow == null)
                    {
                        lastRow = previous;
                    }
                }
                else
                {
                    // Record the spacing to the preceding row once the table has started.
                    if (previous != null && firstRow != null)
                    {
                        gapCount++;
                        gapTotal += previous.Bottom - row.Bottom; // bobld: textRow.Top - prevRow.Top
                    }

                    if (region.Area == 0)
                    {
                        // First qualifying row: it seeds the table rectangle.
                        firstRow = row;
                        region.SetRect(row);
                    }
                    else
                    {
                        // Grow the rectangle sideways and towards this row's bottom.
                        region.SetLeft(Math.Min(region.Left, row.Left));
                        region.SetBottom(Math.Min(region.Bottom, row.Bottom)); // bobld: Max
                        region.SetRight(Math.Max(region.Right, row.Right));
                    }
                }

                previous = row;
            }

            // Nothing qualified, and the ruling pass below cannot create a table from nothing.
            if (region.Area == 0)
            {
                return null;
            }

            // Covers the case where the scan ran to the end of the list without closing the table.
            if (lastRow == null)
            {
                lastRow = previous;
            }

            // Average row spacing when at least one gap was measured, otherwise the last row's height.
            double meanRowHeight;
            if (gapCount <= 0)
            {
                meanRowHeight = lastRow.Height;
            }
            else
            {
                System.Diagnostics.Debug.Assert(gapTotal >= 0);
                meanRowHeight = gapTotal / gapCount;
            }

            // Rulings at or beyond the table's bottom edge, scanned from the end of the list backwards.
            double reach = meanRowHeight * 1.5;
            for (int index = horizontalRulings.Count; index-- > 0;)
            {
                Ruling ruling = horizontalRulings[index];
                if (ruling.Y1 > region.Bottom) // bobld: <
                {
                    continue;
                }

                double distance = region.Bottom - ruling.Y2; // bobld: Y1
                System.Diagnostics.Debug.Assert(distance >= 0);
                if (!(distance <= reach))
                {
                    // Too far away: stop scanning.
                    break;
                }

                // Close enough: let this ruling help define the table.
                region.SetBottom(Math.Min(region.Bottom, ruling.Y2)); // bobld: Max Y1
                region.SetLeft(Math.Min(region.Left, ruling.X1));
                region.SetRight(Math.Max(region.Right, ruling.X2));
            }

            // Same idea at the top edge, with a larger reach because table headings tend to be
            // taller (up to three-ish rows of text) - but not so large that we grab too much.
            reach = meanRowHeight * 3.8;
            foreach (Ruling ruling in horizontalRulings)
            {
                if (ruling.Y1 < region.Top) // bobld: >
                {
                    continue;
                }

                double distance = ruling.Y1 - region.Top; // bobld: table.Top - ruling.Y1
                System.Diagnostics.Debug.Assert(distance >= 0);
                if (!(distance <= reach))
                {
                    break;
                }

                region.SetTop(Math.Max(region.Top, ruling.Y2)); // bobld: Min Y1
                region.SetLeft(Math.Min(region.Left, ruling.X1));
                region.SetRight(Math.Max(region.Right, ruling.X2));
            }

            // Pad every side outwards a little, since the halved horizontal lines are fuzzy anyway.
            region.SetTop(Math.Ceiling(region.Top) + TABLE_PADDING_AMOUNT);       // bobld: Floor -
            region.SetBottom(Math.Floor(region.Bottom) - TABLE_PADDING_AMOUNT);   // bobld: Ceiling +
            region.SetLeft(Math.Floor(region.Left) - TABLE_PADDING_AMOUNT);
            region.SetRight(Math.Ceiling(region.Right) + TABLE_PADDING_AMOUNT);

            return region;
        }

        /// <summary>Maps an intersection test result to the amount added to a row's edge-hit counter.</summary>
        private static int textRowHit(bool intersects)
        {
            return intersects ? 1 : 0;
        }