/// <summary>
/// Reads the leading pages of a PDF file and collects, for each page, the text
/// fragments found on it together with an estimate of the page's text rotation.
/// </summary>
/// <param name="uniqName">Identifier handed through unchanged to the resulting <see cref="ScanPages"/>.</param>
/// <param name="fileName">Path of the PDF file to open for reading.</param>
/// <param name="maxPagesToExtractFrom">Maximum number of pages to process, counted from page 1.</param>
/// <param name="totalPages">Receives the total page count of the PDF (not only the processed pages).</param>
/// <returns>
/// A <see cref="ScanPages"/> built from one rotation value and one list of text elements
/// per processed page. Entry <c>i</c> always corresponds to PDF page <c>i + 1</c>.
/// </returns>
/// <remarks>
/// A page whose text extraction throws is logged and recorded as an empty page (rotation 0,
/// no text) so that the pages after it keep their correct index and page geometry.
/// Exceptions from opening the file or the reader are not caught here.
/// </remarks>
public ScanPages ExtractDocInfo(string uniqName, string fileName, int maxPagesToExtractFrom, ref int totalPages)
{
    ScanPages scanPages = null;

    using (Stream newpdfStream = new FileStream(fileName, FileMode.Open, FileAccess.Read))
    using (PdfReader pdfReader = new PdfReader(newpdfStream))
    {
        int numPagesToUse = Math.Min(pdfReader.NumberOfPages, maxPagesToExtractFrom);

        List<List<ScanTextElem>> scanPagesText = new List<List<ScanTextElem>>();
        List<int> pageRotations = new List<int>();

        for (int pageNum = 1; pageNum <= numPagesToUse; pageNum++)
        {
            // Extract text and location from this pdf page
            List<LocationTextExtractionStrategyEx.TextInfo> pageInfo = null;
            LocationTextExtractionStrategyEx locationStrategy = new LocationTextExtractionStrategyEx();
            try
            {
                PdfTextExtractor.GetTextFromPage(pdfReader, pageNum, locationStrategy);
                pageInfo = locationStrategy.TextLocationInfo;
            }
            catch (Exception excp)
            {
                logger.Error("Failed to extract from pdf {0}, page {1} excp {2}", fileName, pageNum, excp.Message);
            }

            // A failed (or null) extraction still yields an entry, otherwise every
            // following page would be paired with the wrong page number below.
            if (pageInfo == null)
            {
                pageInfo = new List<LocationTextExtractionStrategyEx.TextInfo>();
            }

            iTextSharp.text.Rectangle pageRect = pdfReader.GetPageSize(pageNum);

            // Check through found text to see if the page seems to be rotated.
            // One counter per 90-degree step; pages with two or fewer fragments are not voted on.
            int[] rotCounts = new int[4];
            if (pageInfo.Count > 2)
            {
                foreach (LocationTextExtractionStrategyEx.TextInfo txtInfo in pageInfo)
                {
                    int thisRotation = GetTextRotation(txtInfo.TopLeft, txtInfo.BottomRight);

                    // C#'s % keeps the sign of the dividend, so normalise into 0..3 to
                    // avoid indexing out of range should a negative rotation be reported.
                    int quadrant = (((thisRotation / 90) % 4) + 4) % 4;
                    rotCounts[quadrant]++;
                }
            }

            // Pick the most frequent rotation; ties resolve to the smallest angle.
            int maxRot = 0;
            int maxRotCount = 0;
            for (int i = 0; i < rotCounts.Length; i++)
            {
                if (maxRotCount < rotCounts[i])
                {
                    maxRotCount = rotCounts[i];
                    maxRot = i * 90;
                }
            }

            // Create new structures for the information
            List<ScanTextElem> scanTextElems = new List<ScanTextElem>(pageInfo.Count);
            foreach (LocationTextExtractionStrategyEx.TextInfo txtInfo in pageInfo)
            {
                DocRectangle boundsRectPercent = ConvertToDocRect(txtInfo.TopLeft, txtInfo.BottomRight, pageRect, maxRot);
                scanTextElems.Add(new ScanTextElem(txtInfo.Text, boundsRectPercent));
            }

            scanPagesText.Add(scanTextElems);
            pageRotations.Add(maxRot);
        }

        // Total pages
        totalPages = pdfReader.NumberOfPages;
        scanPages = new ScanPages(uniqName, pageRotations, scanPagesText);
        pdfReader.Close();

        // Sleep for a little to allow other things to run
        Thread.Sleep(100);
    }

    // Return scanned text from pages
    return scanPages;
}
/// <summary>
/// Turns regular-expression date matches into <see cref="ExtractedDate"/> entries and
/// appends them to <paramref name="datesResult"/>.
/// </summary>
/// <param name="datesResult">List that receives one entry per successfully converted match.</param>
/// <param name="matchResultFactor">Value stored in each entry's <c>matchFactor</c>.</param>
/// <param name="textElem">Text element the matches came from; its <c>text</c> and <c>bounds</c> are copied into each entry.</param>
/// <param name="matches">Matches to convert.</param>
/// <param name="matchType">Value stored in each entry's <c>dateMatchType</c>.</param>
/// <param name="yearGroupIdx">Index of the regex group holding the year.</param>
/// <param name="monthGroupIdx">Index of the regex group holding the month (number or name).</param>
/// <param name="dayGroupIdx">Index of the regex group holding the day; the group may be empty.</param>
/// <remarks>
/// Conversion is best-effort: a match whose year, month or day cannot be interpreted is
/// skipped silently and never raises an exception to the caller.
/// </remarks>
private static void CoerceMatchesToDates(List<ExtractedDate> datesResult, double matchResultFactor, ScanTextElem textElem, MatchCollection matches, ExtractedDate.DateMatchType matchType, int yearGroupIdx, int monthGroupIdx, int dayGroupIdx)
{
    // Without a destination or a source element nothing can be recorded.
    if (datesResult == null || textElem == null)
    {
        return;
    }

    // Two-digit years below this pivot are taken as 20xx, the rest (below 100) as 19xx.
    const int twoDigitYearPivot = 80;

    foreach (Match match in matches)
    {
        ExtractedDate fd = new ExtractedDate();

        // Year: drop embedded spaces and undo the letter-for-digit confusions l -> 1 and o -> 0.
        string yrStr = match.Groups[yearGroupIdx].Value
            .Replace(" ", "")
            .ToLowerInvariant()
            .Replace("l", "1")
            .Replace("o", "0");
        int year;
        if (!int.TryParse(yrStr, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out year))
        {
            continue;
        }
        if (year < twoDigitYearPivot)
        {
            year += 2000;
            fd.yearWas2Digit = true;
        }
        else if (year < 100)
        {
            year += 1900;
            fd.yearWas2Digit = true;
        }

        // Month: numeric when it starts with a digit, otherwise looked up by the
        // lower-cased first three letters of the name.
        string monthStr = match.Groups[monthGroupIdx].Value;
        if (monthStr.Length == 0)
        {
            continue;
        }
        int month = 1;
        if (Char.IsDigit(monthStr, 0))
        {
            // Read the month from the caller-supplied group, not a fixed group number.
            if (!int.TryParse(monthStr, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out month))
            {
                continue;
            }
        }
        else
        {
            if (monthStr.Length < 3 || !monthDict.TryGetValue(monthStr.ToLowerInvariant().Substring(0, 3), out month))
            {
                continue;
            }
        }
        if (month < 1 || month > 12)
        {
            continue;
        }

        // Day: optional; defaults to the 1st and is flagged as missing when the group is blank.
        int day = 1;
        fd.dayWasMissing = true;
        string dayStr = match.Groups[dayGroupIdx].Value;
        if (dayStr.Trim() != "")
        {
            if (!int.TryParse(dayStr, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out day))
            {
                continue;
            }
            fd.dayWasMissing = false;
        }

        // Clamp into the range DateTime can represent so construction cannot throw.
        if (year > DateTime.MaxValue.Year)
        {
            year = DateTime.MaxValue.Year;
        }
        if (year < DateTime.MinValue.Year)
        {
            year = DateTime.MinValue.Year;
        }
        int daysInMonth = DateTime.DaysInMonth(year, month);
        if (day > daysInMonth)
        {
            day = daysInMonth;
        }
        if (day < 1)
        {
            day = 1;
        }

        // Add date to list
        fd.foundInText = textElem.text;
        fd.posnInText = match.Index;
        fd.matchLength = match.Length;
        fd.dateTime = new DateTime(year, month, day);
        fd.dateMatchType = matchType;
        fd.locationOfDateOnPagePercent = textElem.bounds;
        fd.matchFactor = matchResultFactor;
        datesResult.Add(fd);
    }
}