コード例 #1
0
 /// <summary>
 /// Searches every text element whose bounds intersect <paramref name="docRectPercent"/> for a
 /// case-insensitive (ordinal) occurrence of the trimmed search string.
 /// </summary>
 /// <param name="str">Text to look for; leading/trailing whitespace is ignored. A null value never matches.</param>
 /// <param name="docRectPercent">Rectangle that a text element must intersect to be considered.</param>
 /// <param name="scanPages">Scanned pages to search; when null the method returns false.</param>
 /// <param name="exprIdx">Index of the expression being matched, copied into each recorded location.</param>
 /// <param name="matchingTextLocs">
 /// Optional list that receives one entry per matching element. When null the search stops
 /// at the first match instead of visiting every element.
 /// </param>
 /// <returns>True if at least one text element contains the search string.</returns>
 /// <remarks>
 /// An empty or whitespace-only <paramref name="str"/> trims to "" and IndexOf reports it at
 /// position 0, so it matches every element inside the rectangle.
 /// </remarks>
 private bool MatchString(string str, DocRectangle docRectPercent, ScanPages scanPages, int exprIdx, List<DocMatchingTextLoc> matchingTextLocs)
 {
     #if TEST_PERF_MATCHSTRING
     Stopwatch stopWatch1 = new Stopwatch();
     stopWatch1.Start();
     #endif
     bool result = false;
     // Nothing to search in, or nothing to search for
     if (scanPages == null || str == null)
         return result;

     // The trimmed search text is the same for every element, so compute it once
     string searchText = str.Trim();
     int searchTextLen = searchText.Length;

     int elemCount = 0;
     for (int pageIdx = 0; pageIdx < scanPages.scanPagesText.Count; pageIdx++)
     {
         List<ScanTextElem> scanPageText = scanPages.scanPagesText[pageIdx];
         for (int elemIdx = 0; elemIdx < scanPageText.Count; elemIdx++)
         {
             ScanTextElem textElem = scanPageText[elemIdx];
             // Check bounds
             if (docRectPercent.Intersects(textElem.bounds))
             {
                 int mtchPos = textElem.text.IndexOf(searchText, StringComparison.OrdinalIgnoreCase);
                 if (mtchPos >= 0)
                 {
                     // If not compiling all text match locations then return immediately to save time
                     if (matchingTextLocs == null)
                         return true;

                     result = true;
                     DocMatchingTextLoc dtml = new DocMatchingTextLoc();
                     dtml.pageIdx = pageIdx;
                     dtml.elemIdx = elemIdx;
                     dtml.exprIdx = exprIdx;
                     dtml.posInText = mtchPos;
                     dtml.matchLen = searchTextLen;
                     dtml.foundInTxtLen = textElem.text.Length;
                     matchingTextLocs.Add(dtml);
                 }
             }
             // Only reported by the optional performance log below
             elemCount++;
         }
     }
     #if TEST_PERF_MATCHSTRING
     stopWatch1.Stop();
     logger.Info("MatchString : {0:0.00} uS, count {1}", stopWatch1.ElapsedTicks * 1000000.0 / Stopwatch.Frequency, elemCount);
     #endif
     return result;
 }
コード例 #2
0
        /// <summary>
        /// Searches the scanned text inside <paramref name="dateDocRect"/> for dates described by
        /// <paramref name="dateSearchTerm"/> and appends what is found to <paramref name="datesResult"/>.
        /// </summary>
        /// <param name="scanPages">Scanned pages whose text elements are searched.</param>
        /// <param name="dateSearchTerm">Search term, parsed by GetDateSearchInfo and passed on to SearchWithinString.</param>
        /// <param name="dateDocRect">Rectangle a text element must intersect to be considered.</param>
        /// <param name="matchFactor">
        /// Base match factor; MATCH_FACTOR_BUMP_FOR_PAGE1 is added on the first page and
        /// MATCH_FACTOR_BUMP_FOR_PAGE2 on the second.
        /// </param>
        /// <param name="datesResult">List handed to SearchWithinString to receive the results.</param>
        /// <param name="latestDateRequested">Set to true when the parsed search term asks for the latest date; never cleared here.</param>
        /// <param name="earliestDateRequested">Set to true when the parsed search term asks for the earliest date; never cleared here.</param>
        /// <param name="limitToPageNumN">
        /// 1-based page number to restrict the search to, or -1 to search all pages. A page number
        /// outside the document results in no search (the ref flags are still updated).
        /// </param>
        /// <param name="ignoreWhitespace">Passed through unchanged to SearchWithinString.</param>
        public static void SearchForDateItem(ScanPages scanPages, string dateSearchTerm, DocRectangle dateDocRect, double matchFactor, List<ExtractedDate> datesResult,
                                    ref bool latestDateRequested, ref bool earliestDateRequested, int limitToPageNumN = -1, bool ignoreWhitespace = false)
        {
            // Get date search info
            DateSrchInfo dateSrchInfo = GetDateSearchInfo(dateSearchTerm);
            if (dateSrchInfo.bEarliestDate)
                earliestDateRequested = true;
            if (dateSrchInfo.bLatestDate)
                latestDateRequested = true;

            // Find first and last pages to search
            int firstPageIdx = 0;
            int lastPageIdxPlusOne = scanPages.scanPagesText.Count;
            if (limitToPageNumN != -1)
            {
                // A page number outside the document has no text to search; indexing it would throw.
                // This follows the convention ExtractTextFromPage uses for an out-of-range page.
                if ((limitToPageNumN < 1) || (limitToPageNumN > scanPages.scanPagesText.Count))
                    return;

                firstPageIdx = limitToPageNumN - 1;
                lastPageIdxPlusOne = limitToPageNumN;
            }

            // Iterate pages
            for (int pageIdx = firstPageIdx; pageIdx < lastPageIdxPlusOne; pageIdx++)
            {
                List<ScanTextElem> scanPageText = scanPages.scanPagesText[pageIdx];
                string joinedText = "";     // Only used when the search term requests joining (bJoinTextInRect)
                int joinCount = 0;

                // Matches on the first two pages are given a higher factor
                double matchFactorForThisPage = matchFactor + (pageIdx == 0 ? MATCH_FACTOR_BUMP_FOR_PAGE1 : (pageIdx == 1 ? MATCH_FACTOR_BUMP_FOR_PAGE2 : 0));

                // Iterate text elements
                foreach (ScanTextElem textElem in scanPageText)
                {
                    // Check that the text contains at least two digits together to avoid wasting time looking for dates where there can be none
                    if (!Regex.IsMatch(textElem.text, @"\d\d"))
                        continue;

                    // Check rectangle bounds
                    if (!dateDocRect.Intersects(textElem.bounds))
                        continue;

                    // In join mode the qualifying elements are concatenated (space separated, capped at
                    // MAX_TEXT_ELEMS_TO_JOIN elements) and searched once after the loop
                    if (dateSrchInfo.bJoinTextInRect)
                    {
                        if (joinCount < MAX_TEXT_ELEMS_TO_JOIN)
                            joinedText += textElem.text + " ";
                        joinCount++;
                        continue;
                    }

                    // Search within the found text
                    SearchWithinString(textElem.text, textElem.bounds, dateSearchTerm, dateSrchInfo, matchFactorForThisPage, pageIdx, datesResult, ignoreWhitespace);

                }

                // If joined then search just once, using the search rectangle as the bounds of the joined text
                if (dateSrchInfo.bJoinTextInRect)
                    SearchWithinString(joinedText, dateDocRect, dateSearchTerm, dateSrchInfo, matchFactorForThisPage, pageIdx, datesResult, ignoreWhitespace);
            }

            // Optional cross-check of the results against the older SearchForDateItem2 algorithm.
            // NOTE(review): stp and stp2 are not declared in this method; presumably they are stopwatches
            // defined elsewhere (or were removed) - confirm before defining TEST_AGAINST_OLD_DATE_ALGORITHM.
            #if TEST_AGAINST_OLD_DATE_ALGORITHM
            {
                List<ExtractedDate> testDatesResult = new List<ExtractedDate>();
                SearchForDateItem2(scanPages, dateSearchTerm, dateDocRect, matchFactor, testDatesResult, limitToPageNumN);
                stp2.Stop();

                Console.WriteLine("File: " + scanPages.uniqName + " OldTime = " + stp2.ElapsedMilliseconds.ToString() + " NewTime = " + stp.ElapsedMilliseconds.ToString());

                // Report dates found by the new algorithm but not by the old one
                foreach (ExtractedDate newD in datesResult)
                {
                    bool bFound = false;
                    foreach (ExtractedDate oldD in testDatesResult)
                    {
                        if (oldD.dateTime == newD.dateTime)
                        {
                            bFound = true;
                            break;
                        }
                    }
                    if (!bFound)
                    {
                        Console.WriteLine("Date Mismatch New=" + newD.dateTime.ToLongDateString());
                    }
                }
                // Report dates found by the old algorithm but not by the new one
                foreach (ExtractedDate oldD in testDatesResult)
                {
                    bool bFound = false;
                    foreach (ExtractedDate newD in datesResult)
                    {
                        if (oldD.dateTime == newD.dateTime)
                        {
                            bFound = true;
                            break;
                        }
                    }
                    if (!bFound)
                    {
                        Console.WriteLine("Date Mismatch Old=" + oldD.dateTime.ToLongDateString());
                    }
                }
            }
            #endif
        }
コード例 #3
0
        /// <summary>
        /// Older date search: runs the date regular expressions selected by the macro commands in
        /// <paramref name="dateSearchTerm"/> over every text element that contains two adjacent digits
        /// and intersects <paramref name="dateDocRect"/>, handing the matches to CoerceMatchesToDates.
        /// </summary>
        /// <param name="scanPages">Scanned pages whose text elements are searched.</param>
        /// <param name="dateSearchTerm">
        /// Optional match text followed by '~' macro commands (~long, ~short, ~US, ~No0, ~Spaces,
        /// matched case-insensitively). If neither ~long nor ~short is present every format is tried.
        /// Must not be null.
        /// </param>
        /// <param name="dateDocRect">Rectangle a text element must intersect to be considered.</param>
        /// <param name="matchFactor">Factor passed on for elements that contain the match text; other elements get 0.</param>
        /// <param name="datesResult">List handed to CoerceMatchesToDates to receive the results.</param>
        /// <param name="limitToPageNumN">
        /// 1-based page number to restrict the search to, or -1 to search all pages. A page number
        /// outside the document results in no search.
        /// </param>
        public static void SearchForDateItem2(ScanPages scanPages, string dateSearchTerm, DocRectangle dateDocRect, double matchFactor, List<ExtractedDate> datesResult, int limitToPageNumN = -1)
        {
            int firstPageIdx = 0;
            int lastPageIdxPlusOne = scanPages.scanPagesText.Count;
            if (limitToPageNumN != -1)
            {
                // A page number outside the document has no text to search; indexing it would throw.
                // This follows the convention ExtractTextFromPage uses for an out-of-range page.
                if ((limitToPageNumN < 1) || (limitToPageNumN > scanPages.scanPagesText.Count))
                    return;

                firstPageIdx = limitToPageNumN - 1;
                lastPageIdxPlusOne = limitToPageNumN;
            }

            // See which date formats to try. This depends only on the search term, so it is
            // worked out once rather than for every text element.
            bool bTryLong = dateSearchTerm.IndexOf("~long", StringComparison.OrdinalIgnoreCase) >= 0;
            bool bTryShort = dateSearchTerm.IndexOf("~short", StringComparison.OrdinalIgnoreCase) >= 0;
            bool bTryUS = dateSearchTerm.IndexOf("~US", StringComparison.OrdinalIgnoreCase) >= 0;
            bool bTryNoZeroes = dateSearchTerm.IndexOf("~No0", StringComparison.OrdinalIgnoreCase) >= 0;
            bool bTrySpaceSeparated = dateSearchTerm.IndexOf("~Spaces", StringComparison.OrdinalIgnoreCase) >= 0;
            if (!(bTryLong || bTryShort))
            {
                // Neither long nor short requested explicitly: try everything
                bTryLong = true;
                bTryShort = true;
                bTryUS = true;
                bTryNoZeroes = true;
                bTrySpaceSeparated = true;
            }

            // Match text is whatever precedes the first '~' (the whole term if there is none)
            string matchText = dateSearchTerm;
            int squigPos = dateSearchTerm.IndexOf('~');
            if (squigPos >= 0)
                matchText = dateSearchTerm.Substring(0, squigPos);

            for (int pageIdx = firstPageIdx; pageIdx < lastPageIdxPlusOne; pageIdx++)
            {
                List<ScanTextElem> scanPageText = scanPages.scanPagesText[pageIdx];
                foreach (ScanTextElem textElem in scanPageText)
                {
                    // Check if there are at least two digits together in the text (any date format requires this at least)
                    if (!Regex.IsMatch(textElem.text, @"\d\d"))
                        continue;

                    // Check bounds
                    if (!dateDocRect.Intersects(textElem.bounds))
                        continue;

                    // Elements containing the match text carry the caller's factor, others zero.
                    // An empty match text is found in every element (IndexOf returns 0).
                    double matchResultFactor = 0;
                    if (textElem.text.IndexOf(matchText, StringComparison.OrdinalIgnoreCase) >= 0)
                        matchResultFactor = matchFactor;

                    // Try to find dates
                    if (bTryLong)
                    {
                        MatchCollection ldMatches = Regex.Matches(textElem.text, longDateRegex, RegexOptions.IgnoreCase);
                        CoerceMatchesToDates(datesResult, matchResultFactor, textElem, ldMatches, ExtractedDate.DateMatchType.LongDate, 13, 11, 1);
                        if (bTryUS)
                        {
                            MatchCollection usldMatches = Regex.Matches(textElem.text, USlongDateRegex, RegexOptions.IgnoreCase);
                            CoerceMatchesToDates(datesResult, matchResultFactor, textElem, usldMatches, ExtractedDate.DateMatchType.USLongDate, 14, 1, 4);
                        }
                    }

                    if (bTryShort)
                    {
                        MatchCollection sdlzMatches = Regex.Matches(textElem.text, shortDateLeadingZeroesRegex, RegexOptions.IgnoreCase);
                        CoerceMatchesToDates(datesResult, matchResultFactor, textElem, sdlzMatches, ExtractedDate.DateMatchType.ShortDateLeadingZeroes, 3, 2, 1);
                        if (bTryNoZeroes)
                        {
                            MatchCollection sdnlzMatches = Regex.Matches(textElem.text, shortDateNoLeadingZeroesRegex, RegexOptions.IgnoreCase);
                            CoerceMatchesToDates(datesResult, matchResultFactor, textElem, sdnlzMatches, ExtractedDate.DateMatchType.ShortDateNoLeadingZeroes, 3, 2, 1);
                        }
                        if (bTrySpaceSeparated)
                        {
                            // NOTE(review): space-separated matches are tagged ShortDateNoLeadingZeroes;
                            // confirm whether a dedicated match type was intended.
                            MatchCollection sdspMatches = Regex.Matches(textElem.text, shortDateSpacesRegex, RegexOptions.IgnoreCase);
                            CoerceMatchesToDates(datesResult, matchResultFactor, textElem, sdspMatches, ExtractedDate.DateMatchType.ShortDateNoLeadingZeroes, 3, 2, 1);
                        }
                    }
                }
            }
        }
コード例 #4
0
        /// <summary>
        /// Returns the text of the first element on the given page whose bounds intersect
        /// <paramref name="docRect"/>.
        /// </summary>
        /// <param name="scanPages">Scanned pages to read from.</param>
        /// <param name="docRect">Rectangle the text element must intersect.</param>
        /// <param name="pageNum">1-based page number.</param>
        /// <returns>
        /// The text of the first intersecting element, or an empty string when the page number is
        /// outside the document or no element on the page intersects the rectangle.
        /// </returns>
        public static string ExtractTextFromPage(ScanPages scanPages, DocRectangle docRect, int pageNum)
        {
            // Convert the page number to a list index and reject anything off either end
            int zeroBasedPage = pageNum - 1;
            bool pageExists = (zeroBasedPage >= 0) && (zeroBasedPage < scanPages.scanPagesText.Count);
            if (!pageExists)
                return "";

            // Walk the page's elements in order; the first one inside the rectangle wins
            List<ScanTextElem> elemsOnPage = scanPages.scanPagesText[zeroBasedPage];
            for (int i = 0; i < elemsOnPage.Count; i++)
            {
                ScanTextElem candidate = elemsOnPage[i];
                if (docRect.Intersects(candidate.bounds))
                    return candidate.text;
            }

            // Nothing on the page touched the rectangle
            return "";
        }