/// <summary>
/// Looks for a case-insensitive occurrence of a string in the scanned text, considering only
/// text elements whose bounds intersect the given rectangle.
/// </summary>
/// <param name="str">Text to look for; leading and trailing whitespace is ignored. Null never matches.</param>
/// <param name="docRectPercent">Rectangle a text element's bounds must intersect to be considered.</param>
/// <param name="scanPages">Scanned pages to search; when null the method returns false.</param>
/// <param name="exprIdx">Index of the expression term, recorded on every match location.</param>
/// <param name="matchingTextLocs">Optional list receiving one entry per matching text element.
/// When null the search stops at the first match.</param>
/// <returns>True if at least one text element inside the rectangle contains the string.</returns>
private bool MatchString(string str, DocRectangle docRectPercent, ScanPages scanPages, int exprIdx, List<DocMatchingTextLoc> matchingTextLocs)
 {
     #if TEST_PERF_MATCHSTRING
     Stopwatch stopWatch1 = new Stopwatch();
     stopWatch1.Start();
     #endif
     bool result = false;
     if ((scanPages == null) || (str == null))
         return result;

     // Trim once up front instead of once per text element (and again per match)
     string searchText = str.Trim();
     int elemCount = 0;
     for (int pageIdx = 0; pageIdx < scanPages.scanPagesText.Count; pageIdx++)
     {
         List<ScanTextElem> scanPageText = scanPages.scanPagesText[pageIdx];
         for (int elemIdx = 0; elemIdx < scanPageText.Count; elemIdx++)
         {
             ScanTextElem textElem = scanPageText[elemIdx];
             // Check bounds
             if (docRectPercent.Intersects(textElem.bounds))
             {
                 int mtchPos = textElem.text.IndexOf(searchText, StringComparison.OrdinalIgnoreCase);
                 if (mtchPos >= 0)
                 {
                     // If not compiling all text match locations then return immediately to save time
                     if (matchingTextLocs == null)
                         return true;

                     result = true;
                     DocMatchingTextLoc dtml = new DocMatchingTextLoc();
                     dtml.pageIdx = pageIdx;
                     dtml.elemIdx = elemIdx;
                     dtml.exprIdx = exprIdx;
                     dtml.posInText = mtchPos;
                     dtml.matchLen = searchText.Length;
                     dtml.foundInTxtLen = textElem.text.Length;
                     matchingTextLocs.Add(dtml);
                 }
             }
             elemCount++;
         }
     }
     #if TEST_PERF_MATCHSTRING
     stopWatch1.Stop();
     logger.Info("MatchString : {0:0.00} uS, count {1}", stopWatch1.ElapsedTicks * 1000000.0 / Stopwatch.Frequency, elemCount);
     #endif
     return result;
 }
        /// <summary>
        /// Extracts the text and its location from the pages of a PDF file and converts it into a
        /// <c>ScanPages</c> structure holding one list of text elements and one rotation per page.
        /// </summary>
        /// <param name="uniqName">Name passed through to the resulting <c>ScanPages</c>.</param>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <param name="maxPagesToExtractFrom">Upper limit on the number of pages text is extracted from.</param>
        /// <param name="totalPages">Set to the total number of pages in the PDF (not limited by <paramref name="maxPagesToExtractFrom"/>).</param>
        /// <returns>The extracted text elements and the rotation inferred for each processed page.</returns>
        public ScanPages ExtractDocInfo(string uniqName, string fileName, int maxPagesToExtractFrom, ref int totalPages)
        {
            ScanPages scanPages = null;

            // Extract text and location from pdf pages
            using (Stream newpdfStream = new FileStream(fileName, FileMode.Open, FileAccess.Read))
            {
                List<List<LocationTextExtractionStrategyEx.TextInfo>> extractedTextAndLoc = new List<List<LocationTextExtractionStrategyEx.TextInfo>>();

                using (PdfReader pdfReader = new PdfReader(newpdfStream))
                {
                    int numPagesToUse = pdfReader.NumberOfPages;
                    if (numPagesToUse > maxPagesToExtractFrom)
                        numPagesToUse = maxPagesToExtractFrom;
                    for (int pageNum = 1; pageNum <= numPagesToUse; pageNum++)
                    {
                        LocationTextExtractionStrategyEx locationStrategy = new LocationTextExtractionStrategyEx();
                        try
                        {
                            // The returned text is not needed; the strategy collects the text with its locations
                            PdfTextExtractor.GetTextFromPage(pdfReader, pageNum, locationStrategy);
                            extractedTextAndLoc.Add(locationStrategy.TextLocationInfo);
                        }
                        catch (Exception excp)
                        {
                            logger.Error("Failed to extract from pdf {0}, page {1} excp {2}", fileName, pageNum, excp.Message);
                            // Keep exactly one entry per page: the loop below derives the page number from the
                            // position in this list, so a missing entry would pair every later page with the wrong page
                            extractedTextAndLoc.Add(new List<LocationTextExtractionStrategyEx.TextInfo>());
                        }
                    }

                    // Create new structures for the information
                    int pageNumber = 1;
                    List<List<ScanTextElem>> scanPagesText = new List<List<ScanTextElem>>();
                    List<int> pageRotations = new List<int>();
                    foreach (List<LocationTextExtractionStrategyEx.TextInfo> pageInfo in extractedTextAndLoc)
                    {
                        iTextSharp.text.Rectangle pageRect = pdfReader.GetPageSize(pageNumber);

                        // Check through found text to see if the page seems to be rotated
                        int[] rotCounts = new int[] { 0, 0, 0, 0 };
                        if (pageInfo.Count > 2)
                        {
                            foreach (LocationTextExtractionStrategyEx.TextInfo txtInfo in pageInfo)
                            {
                                int thisRotation = GetTextRotation(txtInfo.TopLeft, txtInfo.BottomRight);
                                // Normalise into 0..3 so that a negative rotation cannot index out of range
                                int rotIdx = (((thisRotation / 90) % 4) + 4) % 4;
                                rotCounts[rotIdx]++;
                            }
                        }

                        // The most frequent text rotation is taken as the rotation of the page
                        int maxRot = 0;
                        int maxRotCount = 0;
                        for (int i = 0; i < rotCounts.Length; i++)
                        {
                            if (maxRotCount < rotCounts[i])
                            {
                                maxRotCount = rotCounts[i];
                                maxRot = i * 90;
                            }
                        }

                        List<ScanTextElem> scanTextElems = new List<ScanTextElem>();
                        foreach (LocationTextExtractionStrategyEx.TextInfo txtInfo in pageInfo)
                        {
                            DocRectangle boundsRectPercent = ConvertToDocRect(txtInfo.TopLeft, txtInfo.BottomRight, pageRect, maxRot);
                            ScanTextElem sti = new ScanTextElem(txtInfo.Text, boundsRectPercent);
                            scanTextElems.Add(sti);
                        }
                        scanPagesText.Add(scanTextElems);
                        pageRotations.Add(maxRot);
                        pageNumber++;
                    }

                    // Total pages
                    totalPages = pdfReader.NumberOfPages;
                    scanPages = new ScanPages(uniqName, pageRotations, scanPagesText);
                    pdfReader.Close();

                    // Sleep for a little to allow other things to run
                    Thread.Sleep(100);
                }
            }

            // Return scanned text from pages
            return scanPages;
        }
        /// <summary>
        /// Recursively evaluates a tokenised match expression against the scanned text.
        /// Recognised tokens are "(" and ")" for grouping, "&amp;" and "|" as operators, "!" to invert
        /// the following term, ":" introducing a match factor and "{ , }" delimiting a location
        /// rectangle; anything else is text to look for with <c>MatchString</c>.
        /// </summary>
        /// <param name="matchExpression">The full expression; only used for performance logging.</param>
        /// <param name="st">Tokeniser positioned at the next token to evaluate.</param>
        /// <param name="scanPages">Scanned pages to match against.</param>
        /// <param name="matchFactorTotal">Accumulates the match factor of every term that evaluated to true.</param>
        /// <param name="curExpressionIdx">Running index of the text term being evaluated; incremented per text term.</param>
        /// <param name="matchingTextLocs">Optional list receiving the locations of matching text.</param>
        /// <returns>The value of the (sub-)expression up to the end of input or the closing ")".</returns>
        private bool EvalMatch(string matchExpression, StringTok st, ScanPages scanPages, ref double matchFactorTotal, ref int curExpressionIdx, List<DocMatchingTextLoc> matchingTextLocs)
        {
            bool result = false;
            string token = "";
            bool curOpIsOr = true;
            bool opIsInverse = false;
            DocRectangle docRectPercent = new DocRectangle(0, 0, 100, 100);
            int docRectValIdx = 0;
            double matchFactorForTerm = 0;

            #if TEST_PERF_EVALMATCH
            Stopwatch stopWatch1 = new Stopwatch();
            stopWatch1.Start();
            #endif

            while((token = st.GetNextToken()) != null)
            {
                if (token.Trim() == "")
                    continue;
                else if (token == ")")
                    return result;
                else if (token == "(")
                {
                    bool tmpRslt = EvalMatch(matchExpression, st, scanPages, ref matchFactorTotal, ref curExpressionIdx, matchingTextLocs);
                    if (opIsInverse)
                        tmpRslt = !tmpRslt;
                    if (curOpIsOr)
                        result |= tmpRslt;
                    else
                        result &= tmpRslt;

                    // The inverse operator applies to this group only; without clearing it here
                    // a "!" in front of a group would also invert the term that follows the group
                    opIsInverse = false;
                }
                else if (token == "&")
                    curOpIsOr = false;
                else if (token == "|")
                    curOpIsOr = true;
                else if (token == "!")
                    opIsInverse = true;
                else
                {
                    // We've reached a terminal token (string to match to text in the document)
                    string stringToMatch = token;

                    // Check for matchFactor - must have some text before it
                    if (token == ":")
                        return result;
                    // See if there is a match factor defined by the next token
                    while ((st.PeekNextToken() != null) && (st.PeekNextToken() == ""))
                        st.GetNextToken();
                    if ((st.PeekNextToken() != null) && (st.PeekNextToken() == ":"))
                    {
                        matchFactorForTerm = 0;
                        st.GetNextToken();
                        while ((st.PeekNextToken() != null) && (st.PeekNextToken() == ""))
                            st.GetNextToken();
                        token = st.GetNextToken();
                        if (token != null)
                            Double.TryParse(token, out matchFactorForTerm);
                    }

                    // Check for location on empty string
                    if (token == "{")
                        return result;
                    // See if there is a location defined by the next token
                    while ((st.PeekNextToken() != null) && (st.PeekNextToken() == ""))
                        st.GetNextToken();
                    if ((st.PeekNextToken() != null) && (st.PeekNextToken() == "{"))
                    {
                        while ((token = st.GetNextToken()) != null)
                        {
                            if (token == "")
                                continue;
                            else if (token == "{")
                                docRectValIdx = 0;
                            else if (token == ",")
                                docRectValIdx++;
                            else if (token == "}")
                                break;
                            else
                            {
                                double rectVal = 0;
                                Double.TryParse(token, out rectVal);
                                docRectPercent.SetVal(docRectValIdx, rectVal);
                            }
                        }
                    }

                    // Process the match string using the location rectangle
                    // The check for curOpIsOr || result is to avoid unnecessary work if the expression is already false and we're doing a AND
                    if ((stringToMatch.Trim().Length > 0) && (curOpIsOr || result))
                    {
                        bool tmpRslt = MatchString(stringToMatch, docRectPercent, scanPages, curExpressionIdx, matchingTextLocs);
                        if (opIsInverse)
                            tmpRslt = !tmpRslt;
                        if (curOpIsOr)
                            result |= tmpRslt;
                        else
                            result &= tmpRslt;

                        // Handle match factor
                        if (tmpRslt)
                            matchFactorTotal += matchFactorForTerm;
                    }

                    // Clear the inverse operator after 1 use. This is done even when the term was skipped by
                    // the short-circuit above, otherwise the "!" would be carried over to the next term
                    opIsInverse = false;

                    // Set the docRect to the entire page (ready for next term)
                    docRectPercent = new DocRectangle(0,0,100,100);
                    matchFactorForTerm = 0;
                    curExpressionIdx++;
                }
            }

            #if TEST_PERF_EVALMATCH
            stopWatch1.Stop();
            logger.Info("EvalMatch : {0:0.00} uS, expr {1}", stopWatch1.ElapsedTicks * 1000000.0 / Stopwatch.Frequency, matchExpression);
            #endif
            return result;
        }
 /// <summary>
 /// Evaluates a match expression against the scanned text by tokenising it with
 /// <c>StringTok</c> and handing the tokens to <c>EvalMatch</c>.
 /// </summary>
 /// <param name="matchExpression">Expression to evaluate.</param>
 /// <param name="scanPages">Scanned pages to match against.</param>
 /// <param name="matchFactorTotal">Passed through to <c>EvalMatch</c>, which accumulates match factors in it.</param>
 /// <param name="matchingTextLocs">Passed through to <c>EvalMatch</c>; may be null.</param>
 /// <returns>The result of <c>EvalMatch</c> for the whole expression.</returns>
 private bool MatchAgainstDocText(string matchExpression, ScanPages scanPages, ref double matchFactorTotal, List<DocMatchingTextLoc> matchingTextLocs)
 {
     StringTok tokenizer = new StringTok(matchExpression);

     // Term numbering starts at zero for every top-level evaluation
     int termIdx = 0;
     bool isMatch = EvalMatch(matchExpression, tokenizer, scanPages, ref matchFactorTotal, ref termIdx, matchingTextLocs);
     return isMatch;
 }
        /// <summary>
        /// Checks whether the scanned pages satisfy the match expression of a document type and,
        /// on request, extracts the dates found in the document.
        /// </summary>
        /// <param name="scanPages">Scanned pages to check.</param>
        /// <param name="docType">Document type whose match and date expressions are used.</param>
        /// <param name="extractDates">When true the dates are extracted using the type's date expression.</param>
        /// <param name="matchingTextLocs">Optional list receiving the locations of matching text.</param>
        /// <returns>
        /// A result with code DISABLED or NO_EXPR when the type cannot be checked, otherwise
        /// FOUND_MATCH (certainty 100) or NOT_FOUND (certainty 0) together with the match factor.
        /// </returns>
        public DocTypeMatchResult CheckIfDocMatches(ScanPages scanPages, DocType docType, bool extractDates, List<DocMatchingTextLoc> matchingTextLocs)
        {
            DocTypeMatchResult outcome = new DocTypeMatchResult();
            outcome.matchCertaintyPercent = 0;

            // Types that cannot be evaluated are reported without a name or match factor
            if (!docType.isEnabled)
            {
                outcome.matchResultCode = DocTypeMatchResult.MatchResultCodes.DISABLED;
                return outcome;
            }
            if (docType.matchExpression == null)
            {
                outcome.matchResultCode = DocTypeMatchResult.MatchResultCodes.NO_EXPR;
                return outcome;
            }

            // Evaluate the expression; the factor is accumulated even if the overall result is false
            double factorSum = 0;
            bool isMatch = MatchAgainstDocText(docType.matchExpression, scanPages, ref factorSum, matchingTextLocs);
            if (isMatch)
            {
                outcome.matchCertaintyPercent = 100;
                outcome.matchResultCode = DocTypeMatchResult.MatchResultCodes.FOUND_MATCH;
            }
            else
            {
                outcome.matchResultCode = DocTypeMatchResult.MatchResultCodes.NOT_FOUND;
            }
            outcome.docTypeName = docType.docTypeName;
            outcome.matchFactor = factorSum;

            if (!extractDates)
                return outcome;

            // Date extraction was requested
            int bestIdx;
            List<ExtractedDate> foundDates = DocTextAndDateExtractor.ExtractDatesFromDoc(scanPages, docType.dateExpression, out bestIdx);
            outcome.datesFoundInDoc = foundDates;
            if (foundDates.Count > 0)
                outcome.docDate = foundDates[bestIdx].dateTime;

            return outcome;
        }
        /// <summary>
        /// Checks the scanned pages against every enabled document type in the doc types collection
        /// and returns the best match. A higher match certainty wins; on equal certainty the higher
        /// match factor wins.
        /// </summary>
        /// <param name="scanPages">Scanned pages to classify.</param>
        /// <param name="listOfPossibleMatches">Optional list that receives every result with a certainty or
        /// match factor above zero. On return the list is sorted in place, best match first.</param>
        /// <returns>The best match found. When there is no exact match the dates are extracted from the
        /// entire document instead of using a type-specific date expression.</returns>
        public DocTypeMatchResult GetMatchingDocType(ScanPages scanPages, List<DocTypeMatchResult> listOfPossibleMatches = null)
        {
            // Get list of types
            DocTypeMatchResult bestMatchResult = new DocTypeMatchResult();
            var collection_doctypes = GetDocTypesCollection();
            MongoCursor<DocType> foundSdf = collection_doctypes.Find(Query.EQ("isEnabled", true));
            #if TEST_PERF_GETMATCHINGDOCTYPE
            Stopwatch stopWatch1 = new Stopwatch();
            Stopwatch stopWatch2 = new Stopwatch();
            #endif
            foreach (DocType doctype in foundSdf)
            {
            #if TEST_PERF_GETMATCHINGDOCTYPE
                stopWatch1.Start();
            #endif
                // Check if document matches
                DocTypeMatchResult matchResult = CheckIfDocMatches(scanPages, doctype, false, null);

            #if TEST_PERF_GETMATCHINGDOCTYPE
                stopWatch1.Stop();
                stopWatch2.Start();
            #endif

                // Find the best match
                bool bThisIsBestMatch = false;
                if (bestMatchResult.matchCertaintyPercent < matchResult.matchCertaintyPercent)
                    bThisIsBestMatch = true;
                else if (bestMatchResult.matchCertaintyPercent == matchResult.matchCertaintyPercent)
                    if (bestMatchResult.matchFactor < matchResult.matchFactor)
                        bThisIsBestMatch = true;

                // Redo match to get date and time info
                if (bThisIsBestMatch)
                {
                    matchResult = CheckIfDocMatches(scanPages, doctype, true, null);
                    bestMatchResult = matchResult;
                }

                // Check if this should be returned in the list of best matches
                if (listOfPossibleMatches != null)
                    if ((matchResult.matchCertaintyPercent > 0) || (matchResult.matchFactor > 0))
                        listOfPossibleMatches.Add(matchResult);

            #if TEST_PERF_GETMATCHINGDOCTYPE
                stopWatch2.Stop();
            #endif
            }
            #if TEST_PERF_GETMATCHINGDOCTYPE
            logger.Info("T1 : {0}ms, T2 : {1}ms", stopWatch1.ElapsedMilliseconds, stopWatch2.ElapsedMilliseconds);
            #endif

            // If no exact match get date info from entire doc
            if (bestMatchResult.matchCertaintyPercent != 100)
            {
                int bestDateIdx = 0;
                List<ExtractedDate> extractedDates = DocTextAndDateExtractor.ExtractDatesFromDoc(scanPages, "", out bestDateIdx);
                bestMatchResult.datesFoundInDoc = extractedDates;
                if (extractedDates.Count > 0)
                    bestMatchResult.docDate = extractedDates[bestDateIdx].dateTime;
            }

            // If list of best matches to be returned then sort that list now
            if (listOfPossibleMatches != null)
            {
                // Order the same way the best match is chosen above: certainty first, then match factor,
                // both highest first
                List<DocTypeMatchResult> sortedMatches = listOfPossibleMatches
                    .OrderByDescending(o => o.matchCertaintyPercent)
                    .ThenByDescending(o => o.matchFactor)
                    .ToList();

                // Write the order back into the caller's list; assigning to the parameter would only
                // change the local reference and leave the caller's list unsorted
                listOfPossibleMatches.Clear();
                listOfPossibleMatches.AddRange(sortedMatches);
            }

            return bestMatchResult;
        }
        /// <summary>
        /// Fills the document type list view with all document types (enabled ones first, then by
        /// name), selects the named type if present and shows the document to be filed when one is supplied.
        /// </summary>
        /// <param name="selDocTypeName">Name of the document type to select in the list.</param>
        /// <param name="unfiledScanDocInfo">Info of the document to be filed; may be null.</param>
        /// <param name="unfiledScanDocPages">Pages of the document to be filed; may be null.</param>
        public void ShowDocTypeList(string selDocTypeName, ScanDocInfo unfiledScanDocInfo, ScanPages unfiledScanDocPages)
        {
            _curUnfiledScanDocInfo = unfiledScanDocInfo;
            _curUnfiledScanDocPages = unfiledScanDocPages;

            // Enabled types sort ahead of disabled ones (false orders before true), then by name
            List<DocType> allTypes = _docTypesMatcher.ListDocTypes();
            var orderedTypes = allTypes
                .OrderBy(t => !t.isEnabled)
                .ThenBy(t => t.docTypeName);

            // Rebuild the bound collection, remembering the entry that should be selected
            DocType typeToSelect = null;
            _docTypeColl.Clear();
            foreach (DocType candidate in orderedTypes)
            {
                _docTypeColl.Add(candidate);
                if (candidate.docTypeName == selDocTypeName)
                    typeToSelect = candidate;
            }
            docTypeListView.ItemsSource = _docTypeColl;
            if (typeToSelect != null)
                docTypeListView.SelectedItem = typeToSelect;

            // The example document can only be shown when both its info and its pages are known
            bool haveDocToFile = (_curUnfiledScanDocInfo != null) && (_curUnfiledScanDocPages != null);
            if (haveDocToFile)
                DisplayExampleDoc(_curUnfiledScanDocInfo.uniqName, 1, _curUnfiledScanDocPages);
            btnShowDocToBeFiled.IsEnabled = haveDocToFile;
        }
        /// <summary>
        /// Searches the text elements of the scanned pages that intersect a rectangle for dates
        /// described by a date search term. The text is handed to <c>SearchWithinString</c> together
        /// with <paramref name="datesResult"/>.
        /// </summary>
        /// <param name="scanPages">Scanned pages to search; when null nothing is searched.</param>
        /// <param name="dateSearchTerm">Search term, interpreted by <c>GetDateSearchInfo</c>.</param>
        /// <param name="dateDocRect">Rectangle a text element's bounds must intersect to be searched.</param>
        /// <param name="matchFactor">Base match factor; a page-dependent bump is added for the first two pages.</param>
        /// <param name="datesResult">List passed to <c>SearchWithinString</c> to collect the dates found.</param>
        /// <param name="latestDateRequested">Set to true when the search term requests the latest date.</param>
        /// <param name="earliestDateRequested">Set to true when the search term requests the earliest date.</param>
        /// <param name="limitToPageNumN">1-based page number to restrict the search to, or -1 for all pages.
        /// A page number outside the available pages results in no search.</param>
        /// <param name="ignoreWhitespace">Passed through to <c>SearchWithinString</c>.</param>
        public static void SearchForDateItem(ScanPages scanPages, string dateSearchTerm, DocRectangle dateDocRect, double matchFactor, List<ExtractedDate> datesResult,
                                    ref bool latestDateRequested, ref bool earliestDateRequested, int limitToPageNumN = -1, bool ignoreWhitespace = false)
        {
            // Get date search info
            DateSrchInfo dateSrchInfo = GetDateSearchInfo(dateSearchTerm);
            if (dateSrchInfo.bEarliestDate)
                earliestDateRequested = true;
            if (dateSrchInfo.bLatestDate)
                latestDateRequested = true;

            // Nothing to search without page text
            if ((scanPages == null) || (scanPages.scanPagesText == null))
                return;

            // Find first and last pages to search
            int firstPageIdx = 0;
            int lastPageIdxPlusOne = scanPages.scanPagesText.Count;
            if (limitToPageNumN != -1)
            {
                // Text may be available for fewer pages than the document has, so a requested page
                // can be outside the available range; in that case there is nothing to search
                if ((limitToPageNumN < 1) || (limitToPageNumN > scanPages.scanPagesText.Count))
                    return;
                firstPageIdx = limitToPageNumN - 1;
                lastPageIdxPlusOne = limitToPageNumN;
            }

            // Iterate pages
            for (int pageIdx = firstPageIdx; pageIdx < lastPageIdxPlusOne; pageIdx++)
            {
                List<ScanTextElem> scanPageText = scanPages.scanPagesText[pageIdx];
                string joinedText = "";     // This maybe used if ~join macrocommand used
                int joinCount = 0;

                double matchFactorForThisPage = matchFactor + (pageIdx == 0 ? MATCH_FACTOR_BUMP_FOR_PAGE1 : (pageIdx == 1 ? MATCH_FACTOR_BUMP_FOR_PAGE2 : 0));

                // Iterate text elements
                foreach (ScanTextElem textElem in scanPageText)
                {
                    // Check that the text contains at least two digits together to avoid wasting time looking for dates where there can be none
                    if (!Regex.IsMatch(textElem.text, @"\d\d"))
                        continue;

                    // Check rectangle bounds
                    if (!dateDocRect.Intersects(textElem.bounds))
                        continue;

                    // Check for join - the text is only accumulated here and searched once per page below
                    if (dateSrchInfo.bJoinTextInRect)
                    {
                        if (joinCount < MAX_TEXT_ELEMS_TO_JOIN)
                            joinedText += textElem.text + " ";
                        joinCount++;
                        continue;
                    }

                    // Search within the found text
                    SearchWithinString(textElem.text, textElem.bounds, dateSearchTerm, dateSrchInfo, matchFactorForThisPage, pageIdx, datesResult, ignoreWhitespace);

                }

                // If joined then search just once
                if (dateSrchInfo.bJoinTextInRect)
                    SearchWithinString(joinedText, dateDocRect, dateSearchTerm, dateSrchInfo, matchFactorForThisPage, pageIdx, datesResult, ignoreWhitespace);
            }

            // TEST TEST TEST
            // NOTE(review): stp and stp2 are not declared inside this method; confirm they exist elsewhere
            // before defining TEST_AGAINST_OLD_DATE_ALGORITHM
            #if TEST_AGAINST_OLD_DATE_ALGORITHM
            {
                List<ExtractedDate> testDatesResult = new List<ExtractedDate>();
                SearchForDateItem2(scanPages, dateSearchTerm, dateDocRect, matchFactor, testDatesResult, limitToPageNumN);
                stp2.Stop();

                Console.WriteLine("File: " + scanPages.uniqName + " OldTime = " + stp2.ElapsedMilliseconds.ToString() + " NewTime = " + stp.ElapsedMilliseconds.ToString());

                foreach (ExtractedDate newD in datesResult)
                {
                    bool bFound = false;
                    foreach (ExtractedDate oldD in testDatesResult)
                    {
                        if (oldD.dateTime == newD.dateTime)
                        {
                            bFound = true;
                            break;
                        }
                    }
                    if (!bFound)
                    {
                        Console.WriteLine("Date Mismatch New=" + newD.dateTime.ToLongDateString());
                    }
                }
                foreach (ExtractedDate oldD in testDatesResult)
                {
                    bool bFound = false;
                    foreach (ExtractedDate newD in datesResult)
                    {
                        if (oldD.dateTime == newD.dateTime)
                        {
                            bFound = true;
                            break;
                        }
                    }
                    if (!bFound)
                    {
                        Console.WriteLine("Date Mismatch Old=" + oldD.dateTime.ToLongDateString());
                    }
                }
            }
            #endif
        }
        /// <summary>
        /// Makes the named document the current one: fetches its cached info, stores it in the
        /// current-document fields, displays the image of page 1 and signals that the current
        /// document has changed.
        /// </summary>
        /// <param name="uniqName">Unique name of the document to show.</param>
        private void ShowDocumentFirstTime(string uniqName)
        {
            ScanDocAllInfo docInfo = _scanDocHandler.GetScanDocAllInfoCached(uniqName);

            bool haveDocInfo = (docInfo != null) && (docInfo.scanDocInfo != null);
            if (haveDocInfo)
            {
                _curDocScanPages = docInfo.scanPages;
                _curDocScanDocInfo = docInfo.scanDocInfo;
                _curFiledDocInfo = docInfo.filedDocInfo;
            }
            else
            {
                // Nothing known about this document - clear the current state, including the selected type
                _curDocScanPages = null;
                _curDocScanDocInfo = null;
                _curFiledDocInfo = null;
                _curSelectedDocType = null;
            }

            // Always start on the first page
            DisplayScannedDocImage(1);

            // Flag the change and set the signal for whoever waits on it
            _newCurDocProcessingCancel = true;
            _newCurDocSignal.Set();
        }
Beispiel #10
0
        /// <summary>
        /// Renders pages of the document held by the rasterizer to JPEG files, optionally rotating
        /// each page by the rotation recorded for it in <paramref name="scanPages"/>.
        /// </summary>
        /// <param name="uniqName">Unique document name used to build the image file names.</param>
        /// <param name="scanPages">Source of the per-page rotations; only read when <paramref name="rotateBasedOnText"/> is true.</param>
        /// <param name="outputPath">Folder used to build the image file names.</param>
        /// <param name="maxPages">Upper limit on the number of pages converted.</param>
        /// <param name="rotateBasedOnText">When true, pages with a non-zero recorded rotation are rotated before saving.</param>
        /// <returns>The file names of the images that were written; pages that failed are logged and left out.</returns>
        public List<string> GeneratePageFiles(string uniqName, ScanPages scanPages, string outputPath, int maxPages, bool rotateBasedOnText)
        {
            List<string> imgFileNames = new List<string>();

            // Create new stopwatch
            Stopwatch stopwatch = new Stopwatch();

            // Begin timing
            stopwatch.Start();

            int numPagesToConvert = _rasterizer.PageCount;
            if (numPagesToConvert > maxPages)
                numPagesToConvert = maxPages;
            for (int pageNumber = 1; pageNumber <= numPagesToConvert; pageNumber++)
            {
                string pageFileName = GetFilenameOfImageOfPage(outputPath, uniqName, pageNumber, true, "jpg");
                System.Drawing.Image img = null;
                try
                {
                    img = _rasterizer.GetPage(_pointsPerInch, _pointsPerInch, pageNumber);
                    // Rotate image as required
                    if (rotateBasedOnText)
                    {
                        if (pageNumber - 1 < scanPages.pageRotations.Count)
                            if (scanPages.pageRotations[pageNumber - 1] != 0)
                            {
                                System.Drawing.Image rotatedImg = RotateImageWithoutCrop(img, scanPages.pageRotations[pageNumber - 1]);
                                // Release the unrotated image unless the helper handed the same instance back
                                if (!object.ReferenceEquals(rotatedImg, img))
                                    img.Dispose();
                                img = rotatedImg;
                            }
                    }
                    // Save to file
                    img.Save(pageFileName, ImageFormat.Jpeg);
                    imgFileNames.Add(pageFileName);
                }
                catch (Exception excp)
                {
                    logger.Error("Failed to create image of page {0} excp {1}", pageFileName, excp.Message);
                }
                finally
                {
                    // Images hold native resources; release them per page instead of waiting for finalization
                    if (img != null)
                        img.Dispose();
                }
            }
            // Stop timing
            stopwatch.Stop();

            logger.Info("Converted {0} ({1} pages) to image files in {2}", _inputPdfPath, numPagesToConvert, stopwatch.Elapsed);

            return imgFileNames;
        }
Beispiel #11
0
 /// <summary>
 /// Creates an instance that stores the three supplied objects in the corresponding members.
 /// </summary>
 /// <param name="sdi">Value stored in <c>scanDocInfo</c>.</param>
 /// <param name="spages">Value stored in <c>scanPages</c>.</param>
 /// <param name="fdi">Value stored in <c>filedDocInfo</c>.</param>
 public ScanDocAllInfo(ScanDocInfo sdi, ScanPages spages, FiledDocInfo fdi)
 {
     this.scanDocInfo = sdi;
     this.scanPages = spages;
     this.filedDocInfo = fdi;
 }
        /// <summary>
        /// Extracts dates from the scanned pages as directed by a date expression and picks the
        /// date with the highest match factor.
        /// </summary>
        /// <param name="scanPages">Scanned pages to search; when null an empty list is returned.</param>
        /// <param name="dateExpr">Date expression, split into terms by <c>DocTypesMatcher.ParseDocMatchExpression</c>.</param>
        /// <param name="bestDateIdx">Index into the returned list of the date with the highest match factor
        /// (0 when the list is empty or no factor is above zero).</param>
        /// <returns>All dates found; never null.</returns>
        public static List<ExtractedDate> ExtractDatesFromDoc(ScanPages scanPages, string dateExpr, out int bestDateIdx)
        {
            bestDateIdx = 0;
            List<ExtractedDate> foundDates = new List<ExtractedDate>();
            if (scanPages == null)
                return foundDates;

            // State carried from term to term: a text term is only searched once it is known
            // whether a match factor and/or a location follow it
            bool searchedAny = false;
            string pendingTerm = "";
            double pendingFactor = 0;
            bool wantLatest = false;
            bool wantEarliest = false;

            List<ExprParseTerm> terms = DocTypesMatcher.ParseDocMatchExpression(dateExpr, 0);
            for (int termIdx = 0; termIdx < terms.Count; termIdx++)
            {
                ExprParseTerm term = terms[termIdx];

                if (term.termType == ExprParseTerm.ExprParseTermType.exprTerm_Text)
                {
                    // A new text term starts: the previous one had no location, so search the whole page for it
                    if (pendingTerm != "")
                    {
                        SearchForDateItem(scanPages, pendingTerm, new DocRectangle(0, 0, 100, 100), pendingFactor, foundDates, ref wantLatest, ref wantEarliest);
                        searchedAny = true;
                    }
                    pendingTerm = dateExpr.Substring(term.stPos, term.termLen);
                    pendingFactor = 0;
                    continue;
                }

                if (term.termType == ExprParseTerm.ExprParseTermType.exprTerm_Location)
                {
                    // A location completes the pending term: search inside that rectangle only
                    string rectText = dateExpr.Substring(term.stPos, term.termLen);
                    DocRectangle searchRect = new DocRectangle(rectText);
                    SearchForDateItem(scanPages, pendingTerm, searchRect, pendingFactor, foundDates, ref wantLatest, ref wantEarliest);
                    pendingTerm = "";
                    pendingFactor = 0;
                    searchedAny = true;
                    continue;
                }

                if ((term.termType == ExprParseTerm.ExprParseTermType.exprTerm_MatchFactor) && (dateExpr.Length > term.stPos + 1))
                {
                    // Skip the first character of the term and parse the rest as the factor
                    string factorText = dateExpr.Substring(term.stPos + 1, term.termLen - 1);
                    Double.TryParse(factorText, out pendingFactor);
                }
            }

            // Search for a term that is still pending; if nothing was searched at all, search anyway
            bool mustSearch = (pendingTerm != "") || !searchedAny;
            if (mustSearch)
                SearchForDateItem(scanPages, pendingTerm, new DocRectangle(0, 0, 100, 100), pendingFactor, foundDates, ref wantLatest, ref wantEarliest);

            // Locate the earliest and the latest date (first occurrence wins on equal dates)
            int idxOfEarliest = -1;
            int idxOfLatest = -1;
            DateTime earliestSoFar = DateTime.MaxValue;
            DateTime latestSoFar = DateTime.MinValue;
            for (int i = 0; i < foundDates.Count; i++)
            {
                DateTime candidate = foundDates[i].dateTime;
                if (candidate < earliestSoFar)
                {
                    earliestSoFar = candidate;
                    idxOfEarliest = i;
                }
                if (candidate > latestSoFar)
                {
                    latestSoFar = candidate;
                    idxOfLatest = i;
                }
            }

            // Bump the factor of the earliest / latest date when the expression asked for it
            if (earliestDateRequestedAndFound(wantEarliest, idxOfEarliest))
                foundDates[idxOfEarliest].matchFactor += MATCH_FACTOR_BUMP_FOR_EARLIEST_DATE;
            if (earliestDateRequestedAndFound(wantLatest, idxOfLatest))
                foundDates[idxOfLatest].matchFactor += MATCH_FACTOR_BUMP_FOR_LATEST_DATE;

            // The best date is the first one with the highest factor above zero
            double topFactor = 0;
            for (int i = 0; i < foundDates.Count; i++)
            {
                double factor = foundDates[i].matchFactor;
                if (factor > topFactor)
                {
                    topFactor = factor;
                    bestDateIdx = i;
                }
            }

            return foundDates;
        }

        // True when a bump was requested and an entry to bump exists (index -1 means none was found)
        private static bool earliestDateRequestedAndFound(bool requested, int foundIdx)
        {
            return requested && (foundIdx != -1);
        }
        /// <summary>
        /// Looks for dates in the text elements of a scanned document that intersect a rectangle and
        /// hands every regex match to CoerceMatchesToDates together with <paramref name="datesResult"/>.
        /// </summary>
        /// <param name="scanPages">Scanned document; its scanPagesText list holds one list of text elements per page.</param>
        /// <param name="dateSearchTerm">
        /// Optional text to look for, followed by optional "~" flags (~long, ~short, ~US, ~No0, ~Spaces, case-insensitive).
        /// If neither ~long nor ~short is present every date format is tried. A null term is treated as empty.
        /// </param>
        /// <param name="dateDocRect">Only text elements whose bounds intersect this rectangle are considered.</param>
        /// <param name="matchFactor">Factor passed on for elements that contain the text part of the search term (0 is passed otherwise).</param>
        /// <param name="datesResult">List handed to CoerceMatchesToDates for each set of matches (presumably appended to there).</param>
        /// <param name="limitToPageNumN">1-based page number to restrict the search to, or -1 to search all pages.
        /// A page number outside the document results in no search rather than an exception.</param>
        public static void SearchForDateItem2(ScanPages scanPages, string dateSearchTerm, DocRectangle dateDocRect, double matchFactor, List<ExtractedDate> datesResult, int limitToPageNumN = -1)
        {
            // Work out which pages to search
            int pageCount = scanPages.scanPagesText.Count;
            int firstPageIdx = 0;
            int lastPageIdxPlusOne = pageCount;
            if (limitToPageNumN != -1)
            {
                firstPageIdx = limitToPageNumN - 1;
                lastPageIdxPlusOne = limitToPageNumN;

                // A page that doesn't exist has nothing to search (avoids indexing outside the page list)
                if ((firstPageIdx < 0) || (firstPageIdx >= pageCount))
                    return;
            }

            // The search term doesn't change per element so parse it once up front.
            // Null is treated as empty so that parsing here cannot throw where the per-element parsing would not have been reached.
            string searchTerm = dateSearchTerm ?? "";

            // See which date formats to try
            bool bTryLong = searchTerm.IndexOf("~long", StringComparison.OrdinalIgnoreCase) >= 0;
            bool bTryShort = searchTerm.IndexOf("~short", StringComparison.OrdinalIgnoreCase) >= 0;
            bool bTryUS = searchTerm.IndexOf("~US", StringComparison.OrdinalIgnoreCase) >= 0;
            bool bTryNoZeroes = searchTerm.IndexOf("~No0", StringComparison.OrdinalIgnoreCase) >= 0;
            bool bTrySpaceSeparated = searchTerm.IndexOf("~Spaces", StringComparison.OrdinalIgnoreCase) >= 0;
            if (!bTryLong && !bTryShort)
            {
                // Neither long nor short requested explicitly - try everything
                bTryLong = true;
                bTryShort = true;
                bTryUS = true;
                bTryNoZeroes = true;
                bTrySpaceSeparated = true;
            }

            // Get match text if any (everything before the first '~')
            string matchText = searchTerm;
            int squigPos = searchTerm.IndexOf('~');
            if (squigPos >= 0)
                matchText = searchTerm.Substring(0, squigPos);

            for (int pageIdx = firstPageIdx; pageIdx < lastPageIdxPlusOne; pageIdx++)
            {
                List<ScanTextElem> scanPageText = scanPages.scanPagesText[pageIdx];
                foreach (ScanTextElem textElem in scanPageText)
                {
                    // Check if there are at least two digits together in the text (any date format requires this at least)
                    if (!Regex.IsMatch(textElem.text, @"\d\d"))
                        continue;

                    // Check bounds
                    if (!dateDocRect.Intersects(textElem.bounds))
                        continue;

                    // Elements containing the match text get the caller's factor, others get zero.
                    // Note that an empty match text is found in every string.
                    double matchResultFactor = 0;
                    if (textElem.text.IndexOf(matchText, StringComparison.OrdinalIgnoreCase) >= 0)
                        matchResultFactor = matchFactor;

                    // Try to find dates
                    // The three trailing numbers are passed through to CoerceMatchesToDates unchanged
                    // (presumably regex group indices for the date parts - confirm against that method)
                    if (bTryLong)
                    {
                        MatchCollection ldMatches = Regex.Matches(textElem.text, longDateRegex, RegexOptions.IgnoreCase);
                        CoerceMatchesToDates(datesResult, matchResultFactor, textElem, ldMatches, ExtractedDate.DateMatchType.LongDate, 13, 11, 1);
                        if (bTryUS)
                        {
                            MatchCollection usldMatches = Regex.Matches(textElem.text, USlongDateRegex, RegexOptions.IgnoreCase);
                            CoerceMatchesToDates(datesResult, matchResultFactor, textElem, usldMatches, ExtractedDate.DateMatchType.USLongDate, 14, 1, 4);
                        }
                    }

                    if (bTryShort)
                    {
                        MatchCollection sdlzMatches = Regex.Matches(textElem.text, shortDateLeadingZeroesRegex, RegexOptions.IgnoreCase);
                        CoerceMatchesToDates(datesResult, matchResultFactor, textElem, sdlzMatches, ExtractedDate.DateMatchType.ShortDateLeadingZeroes, 3, 2, 1);
                        if (bTryNoZeroes)
                        {
                            MatchCollection sdnlzMatches = Regex.Matches(textElem.text, shortDateNoLeadingZeroesRegex, RegexOptions.IgnoreCase);
                            CoerceMatchesToDates(datesResult, matchResultFactor, textElem, sdnlzMatches, ExtractedDate.DateMatchType.ShortDateNoLeadingZeroes, 3, 2, 1);
                        }
                        if (bTrySpaceSeparated)
                        {
                            // NOTE(review): space-separated matches are reported as ShortDateNoLeadingZeroes - confirm this is intended
                            MatchCollection sdspMatches = Regex.Matches(textElem.text, shortDateSpacesRegex, RegexOptions.IgnoreCase);
                            CoerceMatchesToDates(datesResult, matchResultFactor, textElem, sdspMatches, ExtractedDate.DateMatchType.ShortDateNoLeadingZeroes, 3, 2, 1);
                        }
                    }
                }
            }
        }
Beispiel #14
0
 /// <summary>
 /// Inserts a scan-pages record into the collection returned by GetDocPagesCollection.
 /// Any exception is caught and logged as an error; it is not rethrown.
 /// </summary>
 /// <param name="scanPages">Record to insert; its uniqName is used in the log messages.</param>
 public void AddScanPagesRecToMongo(ScanPages scanPages)
 {
     try
     {
         // Append the record to the doc-pages collection
         MongoCollection<ScanPages> docPagesCollection = GetDocPagesCollection();
         docPagesCollection.Insert(scanPages);

         // Record the successful insert
         logger.Info("Added scandocpages record for {0}", scanPages.uniqName);
     }
     catch (Exception insertExcp)
     {
         // Report where the insert was headed and which file it was for
         logger.Error("Cannot insert scandocpages into {0} Coll... {1} for file {2} excp {3}",
                     _scanConfig._dbNameForDocs,
                     _scanConfig._dbCollectionForDocPages,
                     scanPages.uniqName,
                     insertExcp.Message);
     }
 }
 /// <summary>
 /// Shows the image of one page of a document in the exampleFileImage control and remembers
 /// which document/page is on display.
 /// </summary>
 /// <param name="uniqName">Unique name of the document, used to locate the page image file.</param>
 /// <param name="pageNum">Page number whose image is to be shown.</param>
 /// <param name="scanPages">Scan pages stored as the current display's pages (stored even if the image is missing).</param>
 private void DisplayExampleDoc(string uniqName, int pageNum, ScanPages scanPages)
 {
     // The scan pages are remembered regardless of whether the image can be shown
     _curDocDisplay_scanPages = scanPages;

     string pageImagePath = PdfRasterizer.GetFilenameOfImageOfPage(Properties.Settings.Default.DocAdminImgFolderBase, uniqName, pageNum, false);
     if (File.Exists(pageImagePath))
     {
         try
         {
             Uri pageImageUri = new Uri("File:" + pageImagePath);
             exampleFileImage.Source = new BitmapImage(pageImageUri);

             // Only record the doc/page once the image has loaded
             _curDocDisplay_uniqName = uniqName;
             _curDocDisplay_pageNum = pageNum;
         }
         catch (Exception loadExcp)
         {
             logger.Error("Loading bitmap file {0} excp {1}", pageImagePath, loadExcp.Message);

             // Reset to "no document" on failure
             _curDocDisplay_uniqName = "";
             _curDocDisplay_pageNum = 1;
         }
     }
 }
Beispiel #16
0
        // when processing file
        // - first move the file
        // - then update the doc record to say processed

        /// <summary>
        /// Processes a PDF file: makes an archive copy if none exists, optionally extracts text and page images,
        /// optionally stores the resulting records in the Mongo databases, and requests an update of the unfiled list.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to process.</param>
        /// <param name="uniqName">Unique name identifying the document.</param>
        /// <param name="bExtractImages">Generate page image files.</param>
        /// <param name="bDontOverwriteExistingImages">When set, images are only generated if the image for page 1 doesn't already exist.</param>
        /// <param name="bExtractText">Extract text elements from the PDF; otherwise an empty ScanPages for uniqName is used.</param>
        /// <param name="bRecogniseDoc">Not used by this method.</param>
        /// <param name="bAddToDocInfoDb">Store the document info record via AddDocInfoRecToMongo.</param>
        /// <param name="bAddToDocPagesDb">Store the scan pages record via AddScanPagesRecToMongo.</param>
        /// <returns>
        /// False if ScanDocInfoRecordExists reports no record for uniqName or the archive copy fails; true otherwise.
        /// </returns>
        public bool ProcessPdfFile(string fileName, string uniqName, bool bExtractImages, bool bDontOverwriteExistingImages, bool bExtractText, bool bRecogniseDoc,
                                bool bAddToDocInfoDb, bool bAddToDocPagesDb)
        {
            // Give up if there is no doc info record for this document
            // NOTE(review): the original comment read "check if doc details are already in db" - confirm that
            // returning when the record does NOT exist is the intended direction of this test
            if (!ScanDocInfoRecordExists(uniqName))
                return false;

            // Make a copy of the file in the archive location
            string archiveFileName = ScanDocHandler.GetArchiveFileName(uniqName);
            if (!Delimon.Win32.IO.File.Exists(archiveFileName))
            {
                string statusStr = "";
                bool bResult = CopyFile(fileName, archiveFileName, ref statusStr);
                if (!bResult)
                {
                    logger.Error("Can't make archive copy {0} excp {1}", archiveFileName, statusStr);
                    return false;
                }
            }
            else
            {
                logger.Info("Archive file already exists {0}", archiveFileName);
            }

            // Extract text blocks from file
            ScanPages scanPages = new ScanPages(uniqName);
            int totalNumPages = 0;
            if (bExtractText)
            {
                PdfTextAndLocExtractor pdfExtractor = new PdfTextAndLocExtractor();
                scanPages = pdfExtractor.ExtractDocInfo(uniqName, fileName, _scanConfig._maxPagesForText, ref totalNumPages);
            }

            // Extract images from file
            if (bExtractImages)
            {
                // Short-circuit so the disk is only checked when existing images must be preserved
                bool procImages = (!bDontOverwriteExistingImages) || (!Delimon.Win32.IO.File.Exists(PdfRasterizer.GetFilenameOfImageOfPage(_scanConfig._docAdminImgFolderBase, uniqName, 1, false)));
                if (procImages)
                {
                    PdfRasterizer rs = new PdfRasterizer(fileName, THUMBNAIL_POINTS_PER_INCH);
                    try
                    {
                        // The returned list of file names isn't needed here
                        rs.GeneratePageFiles(uniqName, scanPages, _scanConfig._docAdminImgFolderBase, _scanConfig._maxPagesForImages, false);
                    }
                    finally
                    {
                        // Always release the rasterizer, even if page generation throws
                        rs.Close();
                    }
                }
            }

            // Form partial document info
            DateTime fileDateTime = Delimon.Win32.IO.File.GetCreationTime(fileName);
            ScanDocInfo scanDocInfo = new ScanDocInfo(uniqName, totalNumPages, scanPages.scanPagesText.Count, fileDateTime, fileName.Replace('\\', '/'), false);

            // Add records to mongo databases
            if (bAddToDocPagesDb)
                AddScanPagesRecToMongo(scanPages);
            if (bAddToDocInfoDb)
                AddDocInfoRecToMongo(scanDocInfo);

            // Request update to unfiled documents list
            _scanDocInfoCache.RequestUnfiledListUpdate();

            return true;
        }
        /// <summary>
        /// Returns the text of the first text element on a page whose bounds intersect the given rectangle.
        /// </summary>
        /// <param name="scanPages">Scanned document holding one list of text elements per page.</param>
        /// <param name="docRect">Rectangle the element's bounds must intersect.</param>
        /// <param name="pageNum">1-based page number.</param>
        /// <returns>The text of the first intersecting element, or an empty string if the page doesn't exist or nothing intersects.</returns>
        public static string ExtractTextFromPage(ScanPages scanPages, DocRectangle docRect, int pageNum)
        {
            // Page numbers are 1-based, the page list is 0-based
            int pageIdx = pageNum - 1;
            bool pageExists = (pageIdx >= 0) && (pageIdx < scanPages.scanPagesText.Count);
            if (!pageExists)
                return "";

            // Walk the page's text elements in order and stop at the first one inside the rectangle
            foreach (ScanTextElem candidate in scanPages.scanPagesText[pageIdx])
            {
                if (docRect.Intersects(candidate.bounds))
                    return candidate.text;
            }

            // Nothing on the page intersects the rectangle
            return "";
        }