/// <summary>
/// Performs a parsing pass over a tbody element to read information about column width and rowspan
/// attributes. Each tr child is analyzed in document order; the column starts discovered along the way
/// are merged into <paramref name="columnStarts"/>, which is kept ordered from left to right.
/// Row spans are taken into consideration when computing column starts, but row spans crossing the
/// tbody boundary are ignored: the active row spans are cleared before and after the pass.
/// </summary>
/// <param name="htmlTbodyElement">
/// Node representing the Html tbody whose structure is to be analyzed.
/// </param>
/// <param name="columnStarts">
/// List of column start positions shared by the whole table analysis. It is updated in place by the
/// row analysis. This method never sets it to null; failure is reported through the return value only,
/// and the list may already contain entries from rows processed before the failure.
/// </param>
/// <param name="activeRowSpans">
/// List parallel to <paramref name="columnStarts"/> holding the row span state of every known column.
/// It is cleared on entry (unless the tbody has no children, in which case it is left untouched)
/// and again on exit.
/// </param>
/// <param name="tableWidth">
/// Current width of the table as computed by the caller. NOTE: this method does not read the value;
/// the rows are analyzed against the width accumulated inside this tbody instead.
/// </param>
/// <returns>
/// Width of the widest successfully analyzed row of the tbody. Returns 0 when the tbody has no child
/// nodes, or when a td element is found directly inside the tbody (non-analyzable structure).
/// </returns>
private static double AnalyzeTbodyStructure(IXmlNode htmlTbodyElement, List<double> columnStarts, List<int> activeRowSpans, double tableWidth)
{
    // Parameter validation. Tag names are compared culture-invariantly so the result
    // does not depend on the current thread culture.
    Debug.Assert(htmlTbodyElement.LocalName.ToLowerInvariant() == "tbody");
    Debug.Assert(columnStarts != null);

    double tbodyWidth = 0;

    if (!htmlTbodyElement.HasChildNodes())
    {
        return tbodyWidth;
    }

    // Set active row spans to 0 - thus ignoring row spans crossing tbody boundaries
    ClearActiveRowSpans(activeRowSpans);

    bool columnWidthsAvailable = true;
    var htmlChildNode = htmlTbodyElement.FirstChild;

    // Analyze tr elements
    while (htmlChildNode != null && columnWidthsAvailable)
    {
        switch (htmlChildNode.LocalName.ToLowerInvariant())
        {
            case "tr":
                // A row that yields 0 (empty or non-analyzable) simply does not widen the tbody;
                // it does not abort the pass at this level.
                double trWidth = AnalyzeTRStructure(htmlChildNode, columnStarts, activeRowSpans, tbodyWidth);
                if (trWidth > tbodyWidth)
                {
                    tbodyWidth = trWidth;
                }
                break;

            case "td":
                // A cell directly inside tbody cannot be analyzed at this level: interrupt the analysis
                columnWidthsAvailable = false;
                break;

            default:
                // Any other node is ignored
                break;
        }

        htmlChildNode = htmlChildNode.NextSibling;
    }

    // Set active row spans to 0 - thus ignoring row spans crossing tbody boundaries
    ClearActiveRowSpans(activeRowSpans);

    return columnWidthsAvailable ? tbodyWidth : 0;
}
/// <summary>
/// Performs a parsing pass over a tr element to read information about column width and rowspan attributes.
/// </summary>
/// <param name="htmlTRElement">
/// Node representing the Html tr element whose structure is to be analyzed.
/// </param>
/// <param name="columnStarts">
/// List of the starting positions of all columns known so far, ordered from left to right. It is
/// updated in place: a start position is inserted or appended for every td that begins at a position
/// not yet present. The list is never set to null; entries added before a failure are not rolled back.
/// </param>
/// <param name="activeRowSpans">
/// List parallel to <paramref name="columnStarts"/>. An entry greater than 0 means the column is
/// currently covered by a row span started in an earlier row and must not receive data in this row;
/// 0 means the column is free. Entries are decremented as spanned columns are skipped and are
/// overwritten with (rowspan - 1) for the columns covered by each td of this row.
/// </param>
/// <param name="tableWidth">
/// Width accumulated by the caller so far. It is used only as the position of the right edge when
/// skipping row-spanned columns runs past the last known column start.
/// </param>
/// <returns>
/// The position at which the last processed td ends, i.e. the width of the row. Returns 0 when the
/// tr has no child nodes or when the analysis was unsuccessful (a td without a usable width, or a td
/// whose width collides with an active row span).
/// </returns>
private static double AnalyzeTRStructure(IXmlNode htmlTRElement, List<double> columnStarts, List<int> activeRowSpans, double tableWidth)
{
    // Parameter validation. Tag names are compared culture-invariantly.
    Debug.Assert(htmlTRElement.LocalName.ToLowerInvariant() == "tr");
    Debug.Assert(columnStarts != null);
    Debug.Assert(activeRowSpans != null);
    Debug.Assert(columnStarts.Count == activeRowSpans.Count);

    if (!htmlTRElement.HasChildNodes())
    {
        return 0;
    }

    bool columnWidthsAvailable = true;
    double columnStart = 0; // starting position of current column
    int columnIndex = 0;

    // Skip spanned columns to get to real column start
    SkipSpannedColumns(columnStarts, activeRowSpans, tableWidth, ref columnIndex, ref columnStart);

    var htmlChildNode = htmlTRElement.FirstChild;
    while (htmlChildNode != null && columnWidthsAvailable)
    {
        Debug.Assert(columnStarts.Count == activeRowSpans.Count);

        VerifyColumnStartsAscendingOrder(columnStarts);

        // Only td children take part in the analysis; every other node is ignored.
        if (htmlChildNode.LocalName.ToLowerInvariant() == "td")
        {
            Debug.Assert(columnIndex <= columnStarts.Count);
            if (columnIndex < columnStarts.Count)
            {
                Debug.Assert(columnStart <= columnStarts[columnIndex]);
                if (columnStart < columnStarts[columnIndex])
                {
                    columnStarts.Insert(columnIndex, columnStart);
                    // There can be no row spans now - the column data will appear here
                    // Row spans may appear only during the column analysis
                    activeRowSpans.Insert(columnIndex, 0);
                }
            }
            else
            {
                // Column start is greater than all previous starts. Row span must still be 0 because
                // we are either adding after another column of the same row, in which case it should not
                // inherit the previous column's span. Otherwise we are adding after the last column of some
                // previous row, and assuming the table widths line up, we should not be spanned by it.
                // If there is an incorrect table structure where a column starts in the middle of a
                // row span, we do not guarantee correct output
                columnStarts.Add(columnStart);
                activeRowSpans.Add(0);
            }

            double columnWidth = GetColumnWidth(htmlChildNode);
            if (columnWidth == -1)
            {
                // Incorrect column width, stop processing
                columnWidthsAvailable = false;
            }
            else
            {
                int rowSpan = GetRowSpan(htmlChildNode);
                int nextColumnIndex = GetNextColumnIndex(columnIndex, columnWidth, columnStarts, activeRowSpans);

                if (nextColumnIndex == -1)
                {
                    // Full column width cannot be processed without hitting a pre existing row span.
                    // We cannot analyze widths
                    columnWidthsAvailable = false;
                }
                else
                {
                    // Entire column width can be processed without hitting conflicting row span. This means
                    // that column widths line up and we can process them
                    Debug.Assert(nextColumnIndex <= columnStarts.Count);

                    // Apply row span to affected columns
                    for (int spannedColumnIndex = columnIndex; spannedColumnIndex < nextColumnIndex; spannedColumnIndex++)
                    {
                        activeRowSpans[spannedColumnIndex] = rowSpan - 1;
                        Debug.Assert(activeRowSpans[spannedColumnIndex] >= 0);
                    }

                    columnIndex = nextColumnIndex;

                    // Calculate column start for the next cell
                    columnStart = columnStart + columnWidth;

                    // If the next cell starts exactly on a pre existing column it may be in a spanned
                    // area. Otherwise it starts in the middle of another column, which has been checked
                    // already by the GetNextColumnIndex function.
                    SkipSpannedColumns(columnStarts, activeRowSpans, tableWidth, ref columnIndex, ref columnStart);
                }
            }
        }

        htmlChildNode = htmlChildNode.NextSibling;
    }

    // The width of the tr element is the position at which its last td element ends, which is
    // calculated in the columnStart value after each td element is processed
    return columnWidthsAvailable ? columnStart : 0;
}

/// <summary>
/// Advances past every consecutive column that is still covered by a row span from an earlier row,
/// consuming one row of each span on the way. Nothing happens unless <paramref name="columnStart"/>
/// coincides exactly with the start of the column at <paramref name="columnIndex"/>.
/// </summary>
/// <param name="columnStarts">Known column start positions, ordered from left to right.</param>
/// <param name="activeRowSpans">Remaining row span of each column; decremented for skipped columns.</param>
/// <param name="tableWidth">Right edge to use once the skip runs past the last known column start.</param>
/// <param name="columnIndex">Index of the current column; moved to the first column that is not spanned.</param>
/// <param name="columnStart">Start position of the current column; updated together with the index.</param>
private static void SkipSpannedColumns(List<double> columnStarts, List<int> activeRowSpans, double tableWidth, ref int columnIndex, ref double columnStart)
{
    if (columnIndex >= activeRowSpans.Count)
    {
        return;
    }

    Debug.Assert(columnStarts[columnIndex] >= columnStart);

    // Positions are compared exactly (as before): only a cell starting precisely on a known
    // column boundary can be inside a spanned area.
    if (columnStarts[columnIndex] != columnStart)
    {
        return;
    }

    while (columnIndex < activeRowSpans.Count && activeRowSpans[columnIndex] > 0)
    {
        activeRowSpans[columnIndex] = activeRowSpans[columnIndex] - 1;
        Debug.Assert(activeRowSpans[columnIndex] >= 0);
        columnIndex++;

        if (columnIndex < columnStarts.Count)
        {
            columnStart = columnStarts[columnIndex];
        }
        else if (tableWidth > columnStart)
        {
            // The last known column was spanned, so there is no further column start to read
            // (indexing here used to throw ArgumentOutOfRangeException). Continue from the
            // current right edge; never move backwards if the supplied width is smaller.
            columnStart = tableWidth;
        }
    }
}
/// <summary>
/// Performs a parsing pass over a table to read information about column width and rowspan attributes.
/// This information is used to determine the starting point of each column.
/// </summary>
/// <param name="htmlTableElement">
/// Node representing the Html table whose structure is to be analyzed.
/// </param>
/// <returns>
/// List of the starting positions of all columns in the table, ordered from left to right, followed
/// by one final entry holding the width of the whole table. Returns null when the table has no child
/// nodes or when the analysis is impossible: a td placed directly inside the table, or a tbody or tr
/// child that reports a width of 0.
/// </returns>
private static List<double> AnalyzeTableStructure(IXmlNode htmlTableElement)
{
    // Parameter validation. Tag names are compared culture-invariantly so the result
    // does not depend on the current thread culture.
    Debug.Assert(htmlTableElement.LocalName.ToLowerInvariant() == "table");

    if (!htmlTableElement.HasChildNodes())
    {
        return null;
    }

    bool columnWidthsAvailable = true;

    // The two lists are kept parallel: one row span counter per column start
    var columnStarts = new List<double>();
    var activeRowSpans = new List<int>();

    double tableWidth = 0; // Keep track of table width which is the width of its widest row

    // Analyze tbody and tr elements
    var htmlChildNode = htmlTableElement.FirstChild;
    while (htmlChildNode != null && columnWidthsAvailable)
    {
        Debug.Assert(columnStarts.Count == activeRowSpans.Count);

        switch (htmlChildNode.LocalName.ToLowerInvariant())
        {
            case "tbody":
                // Tbody element, we should analyze its children for rows
                double tbodyWidth = AnalyzeTbodyStructure(htmlChildNode, columnStarts, activeRowSpans, tableWidth);
                if (tbodyWidth > tableWidth)
                {
                    // Table width must be increased to support the newly added wide row
                    tableWidth = tbodyWidth;
                }
                else if (tbodyWidth == 0)
                {
                    // Tbody analysis may return 0, probably due to unprocessable format.
                    // We should also fail.
                    columnWidthsAvailable = false; // interrupt the analysis
                }
                break;

            case "tr":
                // Table row. Analyze column structure within row directly
                double trWidth = AnalyzeTRStructure(htmlChildNode, columnStarts, activeRowSpans, tableWidth);
                if (trWidth > tableWidth)
                {
                    tableWidth = trWidth;
                }
                else if (trWidth == 0)
                {
                    columnWidthsAvailable = false; // interrupt the analysis
                }
                break;

            case "td":
                // Incorrect formatting, too deep to analyze at this level. Return null.
                // TODO: implement analysis at this level, possibly by creating a new tr
                columnWidthsAvailable = false; // interrupt the analysis
                break;

            default:
                // Element should not occur directly in table. Ignore it.
                break;
        }

        htmlChildNode = htmlChildNode.NextSibling;
    }

    if (!columnWidthsAvailable)
    {
        return null;
    }

    // Add an item for whole table width
    columnStarts.Add(tableWidth);
    VerifyColumnStartsAscendingOrder(columnStarts);

    return columnStarts;
}