Ejemplo n.º 1
0
        /// <summary>
        /// Extracts text fragments and their locations from the leading pages of a PDF file
        /// and converts them into a <see cref="ScanPages"/> structure.
        /// </summary>
        /// <param name="uniqName">Identifier passed through to the resulting <see cref="ScanPages"/>.</param>
        /// <param name="fileName">Path of the PDF file to open for reading.</param>
        /// <param name="maxPagesToExtractFrom">Maximum number of pages (counted from page 1) to extract from.</param>
        /// <param name="totalPages">Set to the total number of pages in the PDF, independent of the page limit.</param>
        /// <returns>
        /// The extracted pages. There is one entry per processed page; a page whose extraction
        /// failed is represented by an empty entry so that entry index + 1 is always the PDF page number.
        /// </returns>
        public ScanPages ExtractDocInfo(string uniqName, string fileName, int maxPagesToExtractFrom, ref int totalPages)
        {
            ScanPages scanPages = null;

            // Extract text and location from pdf pages
            using (Stream newpdfStream = new FileStream(fileName, FileMode.Open, FileAccess.Read))
            {
                using (PdfReader pdfReader = new PdfReader(newpdfStream))
                {
                    int numPagesToUse = Math.Min(pdfReader.NumberOfPages, maxPagesToExtractFrom);

                    List<List<LocationTextExtractionStrategyEx.TextInfo>> extractedTextAndLoc = new List<List<LocationTextExtractionStrategyEx.TextInfo>>();
                    for (int pageNum = 1; pageNum <= numPagesToUse; pageNum++)
                    {
                        LocationTextExtractionStrategyEx locationStrategy = new LocationTextExtractionStrategyEx();
                        try
                        {
                            // The returned string is not needed; the strategy object collects the located text.
                            PdfTextExtractor.GetTextFromPage(pdfReader, pageNum, locationStrategy);
                            extractedTextAndLoc.Add(locationStrategy.TextLocationInfo);
                        }
                        catch (Exception excp)
                        {
                            logger.Error("Failed to extract from pdf {0}, page {1} excp {2}", fileName, pageNum, excp.Message);

                            // Keep an empty placeholder for the failed page. Without it every later
                            // page would be matched with the wrong page size and shifted by one in the result.
                            extractedTextAndLoc.Add(new List<LocationTextExtractionStrategyEx.TextInfo>());
                        }
                    }

                    // Create new structures for the information
                    List<List<ScanTextElem>> scanPagesText = new List<List<ScanTextElem>>();
                    List<int> pageRotations = new List<int>();
                    for (int pageIdx = 0; pageIdx < extractedTextAndLoc.Count; pageIdx++)
                    {
                        List<LocationTextExtractionStrategyEx.TextInfo> pageInfo = extractedTextAndLoc[pageIdx];
                        int pageNumber = pageIdx + 1; // PDF page numbers are 1-based
                        iTextSharp.text.Rectangle pageRect = pdfReader.GetPageSize(pageNumber);

                        // Check through found text to see if the page seems to be rotated:
                        // count the text fragments per 90-degree orientation and take the most common one.
                        int[] rotCounts = new int[] { 0, 0, 0, 0 };
                        if (pageInfo.Count > 2)
                        {
                            foreach (LocationTextExtractionStrategyEx.TextInfo txtInfo in pageInfo)
                            {
                                int thisRotation = GetTextRotation(txtInfo.TopLeft, txtInfo.BottomRight);

                                // Normalise into 0..3 so a negative angle cannot index outside the array.
                                int rotBucket = (((thisRotation / 90) % 4) + 4) % 4;
                                rotCounts[rotBucket]++;
                            }
                        }
                        int maxRot = 0;
                        int maxRotCount = 0;
                        for (int i = 0; i < rotCounts.Length; i++)
                        {
                            if (maxRotCount < rotCounts[i])
                            {
                                maxRotCount = rotCounts[i];
                                maxRot = i * 90;
                            }
                        }

                        // Convert each text fragment to a ScanTextElem using the detected rotation
                        List<ScanTextElem> scanTextElems = new List<ScanTextElem>();
                        foreach (LocationTextExtractionStrategyEx.TextInfo txtInfo in pageInfo)
                        {
                            DocRectangle boundsRectPercent = ConvertToDocRect(txtInfo.TopLeft, txtInfo.BottomRight, pageRect, maxRot);
                            scanTextElems.Add(new ScanTextElem(txtInfo.Text, boundsRectPercent));
                        }
                        scanPagesText.Add(scanTextElems);
                        pageRotations.Add(maxRot);
                    }

                    // Total pages
                    totalPages = pdfReader.NumberOfPages;
                    scanPages = new ScanPages(uniqName, pageRotations, scanPagesText);
                    pdfReader.Close();

                    // Sleep for a little to allow other things to run
                    Thread.Sleep(100);
                }
            }

            // Return scanned text from pages
            return scanPages;
        }
        /// <summary>
        /// Converts regex matches found in a text element into <see cref="ExtractedDate"/> entries
        /// and appends them to <paramref name="datesResult"/>. Matches that cannot be turned into a
        /// valid date (non-numeric parts, unknown month name, month out of range) are skipped.
        /// </summary>
        /// <param name="datesResult">List that receives the dates that could be built.</param>
        /// <param name="matchResultFactor">Value stored in each result's <c>matchFactor</c>.</param>
        /// <param name="textElem">Text element the matches came from; its text and bounds are copied to each result.</param>
        /// <param name="matches">Regex matches to convert.</param>
        /// <param name="matchType">Value stored in each result's <c>dateMatchType</c>.</param>
        /// <param name="yearGroupIdx">Index of the regex group holding the year.</param>
        /// <param name="monthGroupIdx">Index of the regex group holding the month (number or name).</param>
        /// <param name="dayGroupIdx">Index of the regex group holding the day; may match empty text.</param>
        private static void CoerceMatchesToDates(List<ExtractedDate> datesResult, double matchResultFactor, ScanTextElem textElem, MatchCollection matches, ExtractedDate.DateMatchType matchType, int yearGroupIdx, int monthGroupIdx, int dayGroupIdx)
        {
            foreach (Match match in matches)
            {
                ExtractedDate fd = new ExtractedDate();
                try
                {
                    // Year: drop spaces and map the letters 'l' and 'o' to the digits 1 and 0
                    // (presumably to undo OCR misreads) before parsing.
                    string yrStr = match.Groups[yearGroupIdx].Value.Replace(" ", "").ToLowerInvariant();
                    yrStr = yrStr.Replace("l", "1").Replace("o", "0");
                    int year = Convert.ToInt32(yrStr);

                    // Two-digit years: 00-79 become 2000-2079, 80-99 become 1980-1999
                    if (year < 80)
                    {
                        year += 2000;
                        fd.yearWas2Digit = true;
                    }
                    else if (year < 100)
                    {
                        year += 1900;
                        fd.yearWas2Digit = true;
                    }

                    // Month: numeric, or a name looked up by its first three letters.
                    // Always read the group named by monthGroupIdx, not a fixed group number.
                    string monthStr = match.Groups[monthGroupIdx].Value;
                    if (monthStr.Length == 0)
                        continue;
                    int month;
                    if (Char.IsDigit(monthStr, 0))
                        month = Convert.ToInt32(monthStr);
                    else
                        month = monthDict[monthStr.ToLowerInvariant().Substring(0, 3)]; // assumes lower-case 3-letter keys - TODO confirm
                    if (month < 1 || month > 12)
                        continue;

                    // Day: optional, defaults to the first of the month
                    int day = 1;
                    fd.dayWasMissing = true;
                    string dayStr = match.Groups[dayGroupIdx].Value;
                    if (dayStr.Trim() != "")
                    {
                        day = Convert.ToInt32(dayStr);
                        fd.dayWasMissing = false;
                    }

                    // Clamp year and day into the range DateTime accepts
                    if (year > DateTime.MaxValue.Year)
                        year = DateTime.MaxValue.Year;
                    if (year < DateTime.MinValue.Year)
                        year = DateTime.MinValue.Year;
                    int daysInMonth = DateTime.DaysInMonth(year, month);
                    if (day > daysInMonth)
                        day = daysInMonth;
                    if (day < 1)
                        day = 1;
                    DateTime dt = new DateTime(year, month, day);

                    // Add date to list
                    fd.foundInText = textElem.text;
                    fd.posnInText = match.Index;
                    fd.matchLength = match.Length;
                    fd.dateTime = dt;
                    fd.dateMatchType = matchType;
                    fd.locationOfDateOnPagePercent = textElem.bounds;
                    fd.matchFactor = matchResultFactor;
                    datesResult.Add(fd);
                }
                catch (FormatException)
                {
                    // A year/month/day part was not a number - not a usable date, skip this match
                }
                catch (OverflowException)
                {
                    // A numeric part does not fit in an int - skip this match
                }
                catch (KeyNotFoundException)
                {
                    // Month name not present in monthDict - skip this match
                }
                catch (ArgumentException)
                {
                    // Month name shorter than three characters or date parts out of range - skip this match
                }
            }
        }