/// <summary>
/// Prunes a line dictionary in place: drops entries containing a line that is
/// about as long as the page itself, then drops entries (and, for horizontal
/// lines, individual lines) that are too short compared with the rest.
/// If at most one line is left after the page-length filter, the dictionary is cleared.
/// </summary>
/// <param name="page">Page the lines belong to; used to read the page size and, for horizontal lines, the left/right text bounds.</param>
/// <param name="diclines">Line dictionary to prune. Entries are removed from this instance directly.</param>
/// <param name="isHorizontial">True to apply the horizontal rules (page width, text width); false to apply the vertical rules (page height, <paramref name="posRect"/> height).</param>
/// <param name="posRect">Rectangle whose height is the reference length when pruning vertical lines.</param>
void RemoveTooShortAndTooLongLines(Page page, SortedDictionary<double, FormLineList> diclines, bool isHorizontial, Rect posRect)
{
    double[] pageSize = PdfTronHelper.GetPageSize(page);
    double pageLength = isHorizontial ? pageSize[0] : pageSize[1];

    // "Too long": any entry holding a line within 3 units of the page dimension is dropped
    // (presumably page borders/frames rather than table rules - confirm with the callers).
    List<double> pageSpanningKeys = diclines
        .Where(pair => pair.Value.Exists(line => Math.Abs(line.Length - pageLength) < 3))
        .Select(pair => pair.Key)
        .ToList();
    foreach (double key in pageSpanningKeys)
    {
        diclines.Remove(key);
    }

    FormLineList remaining = new FormLineList(diclines.SelectMany(pair => pair.Value).ToList());
    if (remaining.Count <= 1)
    {
        // Zero or one line left: nothing usable remains, so discard everything.
        diclines.Clear();
        return;
    }

    if (isHorizontial)
    {
        // Pass 1: an entry must cover at least half of the text width in total.
        double[] textBounds = pdfTronHelper.GetLeftRightTextBounds(page);
        double textWidth = textBounds[1] - textBounds[0];
        RemoveEntriesBelowTotalLength(diclines, textWidth * 0.5);

        // Pass 2: inside each entry, drop lines shorter than 70% of that entry's longest line.
        foreach (FormLineList lines in diclines.Values)
        {
            if (lines.Count < 2)
            {
                continue;
            }

            double longest = lines.Max(line => line.Length);
            // Materialise first: the list is modified while the matches are removed.
            foreach (FormLine shortLine in lines.Where(line => line.Length < (longest * 0.7)).ToList())
            {
                lines.Remove(shortLine);
            }
        }

        // Pass 3: an entry must total at least 40% of the longest surviving line.
        FormLineList survivors = new FormLineList(diclines.SelectMany(pair => pair.Value).ToList());
        if (survivors.Count > 1)
        {
            double longestSurvivor = survivors.Select(line => line.Length).Max();
            RemoveEntriesBelowTotalLength(diclines, longestSurvivor * 0.4);
        }
    }
    else
    {
        // Reference length is the rectangle height, unless that is below 300,
        // in which case the longest remaining line is used instead.
        double referenceLength = posRect.Height();
        if (referenceLength < 300)
        {
            referenceLength = remaining.Select(line => line.Length).Max();
        }

        // An entry must total at least 40% of the reference length, and never less than 9.
        double minLength = Math.Max(referenceLength * 0.4, 9);
        RemoveEntriesBelowTotalLength(diclines, minLength);
    }
}

/// <summary>
/// Removes every entry whose lines have a combined length below <paramref name="minTotalLength"/>.
/// </summary>
/// <param name="diclines">Line dictionary to prune in place.</param>
/// <param name="minTotalLength">Exclusive lower bound for the summed line length of an entry.</param>
private static void RemoveEntriesBelowTotalLength(SortedDictionary<double, FormLineList> diclines, double minTotalLength)
{
    // Collect the keys first; the dictionary cannot be modified while it is being enumerated.
    List<double> shortKeys = diclines
        .Where(pair => pair.Value.Sum(line => line.Length) < minTotalLength)
        .Select(pair => pair.Key)
        .ToList();
    foreach (double key in shortKeys)
    {
        diclines.Remove(key);
    }
}
/// <summary>
/// Reads the lines of one page, filters them, and hands the result to a
/// <c>FormLineGenerator</c> to produce the final form lines.
/// </summary>
/// <param name="pageNum">Number of the page to read from the document.</param>
/// <param name="startPos">Start position; when null the top bound falls back to <c>pageSize[1]</c>.</param>
/// <param name="endPos">End position; when null the bottom bound falls back to 0.</param>
/// <param name="isSubsequentPage">Forwarded to <c>GetTablePosRect</c>; also enables comparing the vertical line count with <paramref name="lastPageVerticalLines"/>.</param>
/// <param name="lastPageVerticalLines">Vertical lines of the previous page; may be null.</param>
/// <returns>The array returned by <c>FormLineGenerator.GetFormLines</c>.</returns>
public SortedDictionary<double, FormLineList>[] GetFormLines(int pageNum, LinePos startPos, LinePos endPos, bool isSubsequentPage, SortedDictionary<double, FormLineList> lastPageVerticalLines)
{
    // Get the information of lines and extreme points by traversing the page.
    horizontalLines = new SortedDictionary<double, FormLineList>();
    verticalLines = new SortedDictionary<double, FormLineList>();
    _pageNum = pageNum;
    _startPos = startPos;
    _endPos = endPos;

    Page currentPage = _pdfDoc.GetPage(pageNum);
    pageSize = PdfTronHelper.GetPageSize(currentPage);
    topBound.AxisValue = startPos == null ? pageSize[1] : startPos.AxisValue;
    bottomBound.AxisValue = endPos == null ? 0 : endPos.AxisValue;
    ii = 0;

    // ProcessElements presumably fills horizontalLines / verticalLines - its body is not in this block.
    using (ElementReader reader = new ElementReader())
    {
        reader.Begin(currentPage);
        ProcessElements(reader);
    }

    RemoveLittleLines(horizontalLines);
    RemoveLittleLines(verticalLines);

    // Remove short lines.
    Rect tableRect = GetTablePosRect(_pageNum, _startPos, _endPos, isSubsequentPage);
    RemoveTooShortAndTooLongLines(currentPage, horizontalLines, true, tableRect);

    // When the vertical line count already equals the previous page's, the vertical lines are left as they are.
    bool skipVerticalGeneration = VerticalLineCountMatchesLastPage(isSubsequentPage, lastPageVerticalLines);
    if (!skipVerticalGeneration)
    {
        RemoveTooShortAndTooLongLines(currentPage, verticalLines, false, tableRect);
    }

    // Generate drawn lines.
    bool hasRealRect = horizontalLines.Count > 1
        && verticalLines.Count > 1
        && IsRect(tableRect, horizontalLines, verticalLines);

    Rect workingRect;
    if (hasRealRect)
    {
        workingRect = GenerateRectByLines();
    }
    else
    {
        workingRect = tableRect;
    }

    if (hasRealRect || isSubsequentPage)
    {
        RemoveLinesNotInRect(workingRect, horizontalLines, verticalLines);
        // The line counts may have changed, so the comparison is evaluated again.
        skipVerticalGeneration = VerticalLineCountMatchesLastPage(isSubsequentPage, lastPageVerticalLines);
    }
    else
    {
        // NOTE(review): the rectangle obtained here is not read again in this method
        // (workingRect was assigned above) - confirm whether workingRect should be updated too.
        tableRect = GetTablePosRect(_pageNum, _startPos, _endPos, true);
    }

    FormLineGenerator.RemoveSpareNearLines(verticalLines, false);
    // Only re-check after the clean-up if generation was not already being skipped.
    skipVerticalGeneration = skipVerticalGeneration
        || VerticalLineCountMatchesLastPage(isSubsequentPage, lastPageVerticalLines);

    FormLineGenerator generator = new FormLineGenerator(currentPage, workingRect);
    SortedDictionary<double, FormLineList>[] result =
        generator.GetFormLines(hasRealRect, horizontalLines, verticalLines, skipVerticalGeneration);
    return result;
}

/// <summary>
/// True when this is a subsequent page, the previous page's vertical lines are available,
/// and the current vertical line count equals the previous page's count.
/// </summary>
private bool VerticalLineCountMatchesLastPage(bool isSubsequentPage, SortedDictionary<double, FormLineList> lastPageVerticalLines)
{
    return isSubsequentPage
        && lastPageVerticalLines != null
        && verticalLines.Count == lastPageVerticalLines.Count;
}
/// <summary>
/// Creates a FormLineSearcher for the given PDF document.
/// </summary>
/// <param name="pdfDoc">The PDF document to operate on; it is stored and also passed to a new <c>PdfTronHelper</c>.</param>
public FormLineSearcher(PDFDoc pdfDoc)
{
    this._pdfDoc = pdfDoc;
    this.pdfTronHelper = new PdfTronHelper(pdfDoc);
}