Ejemplo n.º 1
0
        /// <summary>
        /// Collects every element of the document whose local name is "table", records how many
        /// were found in <c>TablesCount</c>, hands each one (with its zero-based index) to
        /// <c>ProcessHtmlTableAndUpdateTitle</c>, and finally passes the accumulated
        /// <c>TableRows</c> through <c>DropDayOfWeekRows</c>.
        /// </summary>
        /// <param name="docHolder">Holder of the parsed HTML document.</param>
        /// <param name="maxRowsToProcess">Row limit forwarded unchanged to the per-table processing.</param>
        void CollectRows(HtmlDocHolder docHolder, int maxRowsToProcess)
        {
            var tableElements = docHolder.HtmlDocument
                                .QuerySelectorAll("*")
                                .Where(element => element.LocalName == "table")
                                .ToList();

            TablesCount = tableElements.Count;

            for (int index = 0; index < tableElements.Count; index++)
            {
                ProcessHtmlTableAndUpdateTitle(docHolder, tableElements[index], maxRowsToProcess, index);
            }

            TableRows = DropDayOfWeekRows(TableRows);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Fills <c>Text</c> with the cell serialized through <c>MyMarkupFormatter</c>, derives
        /// <c>IsEmpty</c> from it, and falls back to the document defaults when no font name
        /// or font size has been set yet.
        /// </summary>
        /// <param name="docHolder">Source of <c>DefaultFontName</c> and <c>DefaultFontSize</c>.</param>
        /// <param name="inputCell">The HTML cell element to serialize.</param>
        private void InitTextProperties(HtmlDocHolder docHolder, IElement inputCell)
        {
            Text    = inputCell.ToHtml(new MyMarkupFormatter());
            IsEmpty = Text.IsNullOrWhiteSpace();

            // Only fill in font settings that are still unset (null/empty name, zero size).
            if (string.IsNullOrEmpty(FontName))
            {
                FontName = docHolder.DefaultFontName;
            }

            if (FontSize == 0)
            {
                FontSize = docHolder.DefaultFontSize;
            }
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Builds a cell from an HTML table cell element: initializes the text properties, stores the
 /// row/column position, and reads the optional <c>colspan</c>, <c>rowspan</c> and <c>width</c>
 /// attributes.
 /// </summary>
 /// <param name="docHolder">Document holder; supplies the page size used for percentage widths.</param>
 /// <param name="inputCell">The HTML cell element being adapted.</param>
 /// <param name="row">Row index assigned to <c>Row</c> and <c>FirstMergedRow</c>.</param>
 /// <param name="column">Column index assigned to <c>Col</c>.</param>
 public HtmlAdapterCell(HtmlDocHolder docHolder, IElement inputCell, int row, int column)
 {
     InitTextProperties(docHolder, inputCell);
     FirstMergedRow  = row;
     MergedRowsCount = 1;
     MergedColsCount = 1;
     Row             = row;
     Col             = column;
     IsMerged        = false;
     IsEmpty         = Text.IsNullOrWhiteSpace();

     // A colspan that does not parse as an integer leaves the default of one column.
     if (inputCell.HasAttribute("colspan"))
     {
         int v;
         if (Int32.TryParse(inputCell.GetAttribute("colspan"), out v))
         {
             MergedColsCount = v;
             IsMerged        = MergedColsCount > 1;
         }
     }

     // A rowspan that does not parse as an integer leaves the default of one row.
     if (inputCell.HasAttribute("rowspan"))
     {
         int v;
         if (Int32.TryParse(inputCell.GetAttribute("rowspan"), out v))
         {
             MergedRowsCount = v;
         }
     }

     if (inputCell.HasAttribute("width"))
     {
         string s = inputCell.GetAttribute("width");
         double width;
         if (s.EndsWith("%") && double.TryParse(s.Substring(0, s.Length - 1), out width))
         {
             // Percentage width: a share of the document page size.
             CellWidth = (int)((double)docHolder.DocumentPageSizeInPixels * (width / 100.0));
         }
         // "else if" is essential here: a value such as "50%" never parses as a plain number,
         // so an independent if/else would overwrite the percentage result with the fallback.
         else if (double.TryParse(s, out width))
         {
             CellWidth = (int)width;
         }
         else
         {
             // Fallback width for values that could not be parsed at all.
             CellWidth = 50;
         }
     }
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Loads the given HTML file, takes the title from <c>FindTitleAboveTheTable</c>, collects
        /// the table rows, computes the unmerged column count from the first row, and reads the
        /// "smartparser_department" / "smartparser_url" meta tags into <c>Department</c> and
        /// <c>DocumentUrl</c>.
        /// </summary>
        /// <param name="fileName">Path of the HTML document; also stored in <c>DocumentFile</c>.</param>
        /// <param name="maxRowsToProcess">Row limit forwarded to <c>CollectRows</c>.</param>
        public AngleHtmlAdapter(string fileName, int maxRowsToProcess)
        {
            TableRows    = new List <List <HtmlAdapterCell> >();
            DocumentFile = fileName;

            var docHolder = new HtmlDocHolder(GetAngleDocument(fileName));
            Title = docHolder.FindTitleAboveTheTable();
            CollectRows(docHolder, maxRowsToProcess);
            UnmergedColumnsCount = GetUnmergedColumnsCountByFirstRow();

            var metaTags = docHolder.HtmlDocument
                           .QuerySelectorAll("*")
                           .Where(element => element.LocalName == "meta")
                           .ToList();

            foreach (var metaTag in metaTags)
            {
                // Dispatch on the meta tag's name; tags with any other (or no) name are ignored.
                switch (metaTag.GetAttribute("name"))
                {
                    case "smartparser_department":
                        Department = metaTag.GetAttribute("content");
                        break;

                    case "smartparser_url":
                        DocumentUrl = metaTag.GetAttribute("content");
                        break;
                }
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Decides whether a table is worth processing and, if so, passes it to
        /// <c>ProcessHtmlTable</c>. A table is skipped (with a debug message) when it contains a
        /// nested table, when its non-empty text has no uppercase character, or when its text is
        /// shorter than 30 characters. Afterwards logs how many rows were added and, if
        /// <c>Title</c> is still empty and the table text mentions "декабря", builds the title
        /// from the text of the table's rows.
        /// </summary>
        /// <param name="docHolder">Document holder forwarded to <c>ProcessHtmlTable</c>.</param>
        /// <param name="table">The table element to examine.</param>
        /// <param name="maxRowsToProcess">Row limit forwarded to <c>ProcessHtmlTable</c>.</param>
        /// <param name="tableIndex">Index of the table, used only in log messages.</param>
        void ProcessHtmlTableAndUpdateTitle(HtmlDocHolder docHolder, IElement table, int maxRowsToProcess, int tableIndex)
        {
            int    rowsBefore  = TableRows.Count;
            string skipMessage = null;

            if (table.QuerySelectorAll("*").Any(m => m.LocalName == "table"))
            {
                skipMessage = String.Format("ignore table {0} with subtables", tableIndex);
            }
            else if (table.TextContent.Length > 0 && table.TextContent.All(ch => !Char.IsUpper(ch)))
            {
                skipMessage = String.Format("ignore table {0} that has no uppercase char", tableIndex);
            }
            else if (table.TextContent.Length < 30)
            {
                skipMessage = String.Format("ignore table {0}, it is too short", tableIndex);
            }

            if (skipMessage != null)
            {
                Logger.Debug(skipMessage);
            }
            else
            {
                ProcessHtmlTable(docHolder, table, maxRowsToProcess);
            }

            int addedRows = TableRows.Count - rowsBefore;
            if (addedRows > 0)
            {
                // Log at most the first 30 characters of the table text, on a single line.
                string preview = table.TextContent;
                if (preview.Length > 30)
                {
                    preview = preview.Substring(0, 30);
                }
                preview = preview.ReplaceEolnWithSpace();

                Logger.Debug(String.Format("add {0} rows (TableRows.Count={1} ) from table {2} Table.innertText[0:30]='{3}'",
                                           addedRows,
                                           TableRows.Count,
                                           tableIndex,
                                           preview));
            }

            // The title check runs for every table, including the ones that were skipped above.
            bool titleIsMissing = Title.Length == 0;
            if (titleIsMissing && table.TextContent.Length > 30 && table.TextContent.ToLower().IndexOf("декабря") != -1)
            {
                Title = String.Join("\n", GetHtmlTableRows(table).Select(r => r.TextContent));
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Converts the rows of one HTML table into lists of <c>HtmlAdapterCell</c> and appends
        /// them to <c>TableRows</c>. Rows whose cells are all empty are skipped. The first row may
        /// be merged into the last existing row when <c>BigramsHolder.CheckMergeRow</c> says so.
        /// After the loop, the rows added by this table are removed again if no row had more than
        /// four cells or if <c>CheckNameColumnIsEmpty</c> returns true for them.
        /// </summary>
        /// <param name="docHolder">Document holder passed to each cell constructor.</param>
        /// <param name="table">The table element whose rows are processed.</param>
        /// <param name="maxRowsToProcess">
        /// Stop once <c>TableRows</c> holds at least this many rows; -1 disables the limit.
        /// </param>
        void ProcessHtmlTable(HtmlDocHolder docHolder, IElement table, int maxRowsToProcess)
        {
            var rows          = GetHtmlTableRows(table);
            int saveRowsCount = TableRows.Count;
            int maxCellsCount = 0;
            int maxSumSpan    = 0;

            for (int r = 0; r < rows.Count(); ++r)
            {
                List <HtmlAdapterCell> newRow = new List <HtmlAdapterCell>();
                int  sumspan = 0;
                bool isEmpty = true;
                foreach (var rowCell in GetHtmlTableCells(rows[r]))
                {
                    var c = new HtmlAdapterCell(docHolder, rowCell, TableRows.Count, sumspan);
                    newRow.Add(c);
                    // One extra cell per additional column covered by the colspan, so that the
                    // list index of a cell matches its column.
                    for (int k = 1; k < c.MergedColsCount; ++k)
                    {
                        newRow.Add(new HtmlAdapterCell(TableRows.Count, sumspan + k));
                    }
                    sumspan += c.MergedColsCount;
                    isEmpty  = isEmpty && c.IsEmpty;
                }
                if (isEmpty)
                {
                    continue;
                }
                maxCellsCount = Math.Max(newRow.Count, maxCellsCount);
                maxSumSpan    = Math.Max(sumspan, maxSumSpan);

                // see 7007_8.html in tests
                // Pad rows narrower than the widest row seen so far in this table. k is already
                // the absolute column index (it starts at sumspan), so it is used as the column
                // directly.
                for (int k = sumspan; k < maxSumSpan; ++k)
                {
                    newRow.Add(new HtmlAdapterCell(TableRows.Count, k));
                }

                if (r == 0 && TableRows.Count > 0 &&
                    BigramsHolder.CheckMergeRow(
                        TableRows.Last().ConvertAll(x => x.Text),
                        newRow.ConvertAll(x => x.Text)))
                {
                    MergeRow(TableRows.Last(), newRow);
                }
                else
                {
                    TableRows.Add(newRow);
                }

                if ((maxRowsToProcess != -1) && (TableRows.Count >= maxRowsToProcess))
                {
                    break;
                }
            }
            if (saveRowsCount < TableRows.Count)
            {
                if (maxCellsCount <= 4)
                {
                    //remove this suspicious table
                    TableRows.RemoveRange(saveRowsCount, TableRows.Count - saveRowsCount);
                }
                else
                {
                    InsertRowSpanCells(saveRowsCount, TableRows.Count);
                    if (CheckNameColumnIsEmpty(saveRowsCount))
                    {
                        TableRows.RemoveRange(saveRowsCount, TableRows.Count - saveRowsCount);
                    }
                }
            }
        }