/// <summary>
/// Looks for a case-insensitive (ordinal) occurrence of the trimmed <paramref name="str"/>
/// in every scanned text element whose bounds intersect <paramref name="docRectPercent"/>.
/// </summary>
/// <param name="str">Text to search for; leading and trailing whitespace is ignored. A null value yields no match.</param>
/// <param name="docRectPercent">Rectangle a text element's bounds must intersect for the element to be examined.</param>
/// <param name="scanPages">Scanned pages to search; a null value yields no match.</param>
/// <param name="exprIdx">Value stored in the <c>exprIdx</c> member of every recorded match location.</param>
/// <param name="matchingTextLocs">
/// Optional list that receives one entry per matching text element. When null, the search
/// stops and returns as soon as the first match is found.
/// </param>
/// <returns>True if at least one text element contained the search text; otherwise false.</returns>
private bool MatchString(string str, DocRectangle docRectPercent, ScanPages scanPages, int exprIdx, List<DocMatchingTextLoc> matchingTextLocs)
{
#if TEST_PERF_MATCHSTRING
    Stopwatch stopWatch1 = new Stopwatch();
    stopWatch1.Start();
#endif
    bool result = false;
    if (scanPages == null || str == null)
        return result;

    // The trimmed search text does not change during the scan, so compute it once
    // instead of once (or twice) per intersecting element.
    string searchText = str.Trim();

    int elemCount = 0;
    for (int pageIdx = 0; pageIdx < scanPages.scanPagesText.Count; pageIdx++)
    {
        List<ScanTextElem> scanPageText = scanPages.scanPagesText[pageIdx];
        for (int elemIdx = 0; elemIdx < scanPageText.Count; elemIdx++)
        {
            ScanTextElem textElem = scanPageText[elemIdx];

            // Only elements overlapping the requested rectangle are candidates
            if (docRectPercent.Intersects(textElem.bounds))
            {
                int mtchPos = textElem.text.IndexOf(searchText, StringComparison.OrdinalIgnoreCase);
                if (mtchPos >= 0)
                {
                    // If not compiling all text match locations then return immediately to save time
                    if (matchingTextLocs == null)
                        return true;

                    result = true;
                    DocMatchingTextLoc dtml = new DocMatchingTextLoc();
                    dtml.pageIdx = pageIdx;
                    dtml.elemIdx = elemIdx;
                    dtml.exprIdx = exprIdx;
                    dtml.posInText = mtchPos;
                    dtml.matchLen = searchText.Length;
                    dtml.foundInTxtLen = textElem.text.Length;
                    matchingTextLocs.Add(dtml);
                }
            }
            elemCount++;
        }
    }
#if TEST_PERF_MATCHSTRING
    // NOTE(review): the label below says "CheckForNewDocs" although this is MatchString - confirm whether that is intended
    stopWatch1.Stop();
    logger.Info("CheckForNewDocs : {0:0.00} uS, count {1}", stopWatch1.ElapsedTicks * 1000000.0 / Stopwatch.Frequency, elemCount);
#endif
    return result;
}
/// <summary>
/// Searches scanned text for dates described by <paramref name="dateSearchTerm"/> and appends
/// what <c>SearchWithinString</c> finds to <paramref name="datesResult"/>.
/// </summary>
/// <param name="scanPages">Scanned pages to search; when null nothing is searched.</param>
/// <param name="dateSearchTerm">Search term passed to <c>GetDateSearchInfo</c> and <c>SearchWithinString</c>.</param>
/// <param name="dateDocRect">Rectangle a text element's bounds must intersect to be searched.</param>
/// <param name="matchFactor">Base match factor; a per-page bump is added for the first and second page.</param>
/// <param name="datesResult">List handed to <c>SearchWithinString</c> to collect results.</param>
/// <param name="latestDateRequested">Set to true when the parsed search info has <c>bLatestDate</c> set; never reset to false here.</param>
/// <param name="earliestDateRequested">Set to true when the parsed search info has <c>bEarliestDate</c> set; never reset to false here.</param>
/// <param name="limitToPageNumN">1-based page number to restrict the search to, or -1 for all pages. A page number that does not exist results in no search.</param>
/// <param name="ignoreWhitespace">Passed through unchanged to <c>SearchWithinString</c>.</param>
public static void SearchForDateItem(ScanPages scanPages, string dateSearchTerm, DocRectangle dateDocRect, double matchFactor, List<ExtractedDate> datesResult, ref bool latestDateRequested, ref bool earliestDateRequested, int limitToPageNumN = -1, bool ignoreWhitespace = false)
{
    // Get date search info
    DateSrchInfo dateSrchInfo = GetDateSearchInfo(dateSearchTerm);
    if (dateSrchInfo.bEarliestDate)
        earliestDateRequested = true;
    if (dateSrchInfo.bLatestDate)
        latestDateRequested = true;

    // Nothing to search without pages
    if (scanPages == null)
        return;

    // Find first and last pages to search
    int pageCount = scanPages.scanPagesText.Count;
    int firstPageIdx = 0;
    int lastPageIdxPlusOne = pageCount;
    if (limitToPageNumN != -1)
    {
        // A page number outside 1..pageCount would index past the page list; treat it as
        // "no such page" and find nothing, as ExtractTextFromPage does for a missing page.
        if (limitToPageNumN < 1 || limitToPageNumN > pageCount)
            return;
        firstPageIdx = limitToPageNumN - 1;
        lastPageIdxPlusOne = limitToPageNumN;
    }

    // Iterate pages
    for (int pageIdx = firstPageIdx; pageIdx < lastPageIdxPlusOne; pageIdx++)
    {
        List<ScanTextElem> scanPageText = scanPages.scanPagesText[pageIdx];
        string joinedText = "";    // Only used when the search info requests joining (bJoinTextInRect)
        int joinCount = 0;
        double matchFactorForThisPage = matchFactor + (pageIdx == 0 ? MATCH_FACTOR_BUMP_FOR_PAGE1 : (pageIdx == 1 ? MATCH_FACTOR_BUMP_FOR_PAGE2 : 0));

        // Iterate text elements
        foreach (ScanTextElem textElem in scanPageText)
        {
            // Check that the text contains at least two digits together to avoid wasting time looking for dates where there can be none
            if (!Regex.IsMatch(textElem.text, @"\d\d"))
                continue;

            // Check rectangle bounds
            if (!dateDocRect.Intersects(textElem.bounds))
                continue;

            // When joining, accumulate the text (up to MAX_TEXT_ELEMS_TO_JOIN elements) and
            // defer the search until the whole page has been collected
            if (dateSrchInfo.bJoinTextInRect)
            {
                if (joinCount < MAX_TEXT_ELEMS_TO_JOIN)
                    joinedText += textElem.text + " ";
                joinCount++;
                continue;
            }

            // Search within the found text
            SearchWithinString(textElem.text, textElem.bounds, dateSearchTerm, dateSrchInfo, matchFactorForThisPage, pageIdx, datesResult, ignoreWhitespace);
        }

        // If joined then search just once, using the search rectangle as the bounds
        if (dateSrchInfo.bJoinTextInRect)
            SearchWithinString(joinedText, dateDocRect, dateSearchTerm, dateSrchInfo, matchFactorForThisPage, pageIdx, datesResult, ignoreWhitespace);
    }

    // TEST TEST TEST
    // NOTE(review): stp and stp2 are not declared in this method; presumably they are defined elsewhere
    // when TEST_AGAINST_OLD_DATE_ALGORITHM is set - confirm before enabling the symbol.
#if TEST_AGAINST_OLD_DATE_ALGORITHM
    {
        List<ExtractedDate> testDatesResult = new List<ExtractedDate>();
        SearchForDateItem2(scanPages, dateSearchTerm, dateDocRect, matchFactor, testDatesResult, limitToPageNumN);
        stp2.Stop();
        Console.WriteLine("File: " + scanPages.uniqName + " OldTime = " + stp2.ElapsedMilliseconds.ToString() + " NewTime = " + stp.ElapsedMilliseconds.ToString());

        // Report dates found by the new algorithm but not the old one
        foreach (ExtractedDate newD in datesResult)
        {
            bool bFound = false;
            foreach (ExtractedDate oldD in testDatesResult)
            {
                if (oldD.dateTime == newD.dateTime)
                {
                    bFound = true;
                    break;
                }
            }
            if (!bFound)
            {
                Console.WriteLine("Date Mismatch New=" + newD.dateTime.ToLongDateString());
            }
        }

        // Report dates found by the old algorithm but not the new one
        foreach (ExtractedDate oldD in testDatesResult)
        {
            bool bFound = false;
            foreach (ExtractedDate newD in datesResult)
            {
                if (oldD.dateTime == newD.dateTime)
                {
                    bFound = true;
                    break;
                }
            }
            if (!bFound)
            {
                Console.WriteLine("Date Mismatch Old=" + oldD.dateTime.ToLongDateString());
            }
        }
    }
#endif
}
/// <summary>
/// Searches scanned text for dates by running a set of date regular expressions over every
/// text element that intersects <paramref name="dateDocRect"/>, handing the regex matches to
/// <c>CoerceMatchesToDates</c> together with <paramref name="datesResult"/>.
/// </summary>
/// <param name="scanPages">Scanned pages to search.</param>
/// <param name="dateSearchTerm">
/// Optional match text followed by optional <c>~</c> options. Recognised options (case-insensitive):
/// <c>~long</c>, <c>~short</c>, <c>~US</c>, <c>~No0</c>, <c>~Spaces</c>. If neither <c>~long</c> nor
/// <c>~short</c> is present, every format is tried. Must not be null.
/// </param>
/// <param name="dateDocRect">Rectangle a text element's bounds must intersect to be searched.</param>
/// <param name="matchFactor">Factor passed on for elements containing the match text; 0 is passed for the others.</param>
/// <param name="datesResult">List handed to <c>CoerceMatchesToDates</c> to collect results.</param>
/// <param name="limitToPageNumN">1-based page number to restrict the search to, or -1 for all pages. A page number that does not exist results in no search.</param>
public static void SearchForDateItem2(ScanPages scanPages, string dateSearchTerm, DocRectangle dateDocRect, double matchFactor, List<ExtractedDate> datesResult, int limitToPageNumN = -1)
{
    // Find first and last pages to search
    int pageCount = scanPages.scanPagesText.Count;
    int firstPageIdx = 0;
    int lastPageIdxPlusOne = pageCount;
    if (limitToPageNumN != -1)
    {
        // A page number outside 1..pageCount would index past the page list; treat it as "no such page"
        if (limitToPageNumN < 1 || limitToPageNumN > pageCount)
            return;
        firstPageIdx = limitToPageNumN - 1;
        lastPageIdxPlusOne = limitToPageNumN;
    }

    // The format selection and match text depend only on the search term, so they are
    // worked out once here instead of once per text element.
    bool bTryLong = dateSearchTerm.IndexOf("~long", StringComparison.OrdinalIgnoreCase) >= 0;
    bool bTryShort = dateSearchTerm.IndexOf("~short", StringComparison.OrdinalIgnoreCase) >= 0;
    bool bTryUS = dateSearchTerm.IndexOf("~US", StringComparison.OrdinalIgnoreCase) >= 0;
    bool bTryNoZeroes = dateSearchTerm.IndexOf("~No0", StringComparison.OrdinalIgnoreCase) >= 0;
    bool bTrySpaceSeparated = dateSearchTerm.IndexOf("~Spaces", StringComparison.OrdinalIgnoreCase) >= 0;
    if (!(bTryLong || bTryShort))
    {
        // Neither long nor short was requested explicitly: try every format
        bTryLong = true;
        bTryShort = true;
        bTryUS = true;
        bTryNoZeroes = true;
        bTrySpaceSeparated = true;
    }

    // Match text is everything before the first '~' (or the whole term when there is none)
    string matchText = dateSearchTerm;
    int squigPos = dateSearchTerm.IndexOf('~');
    if (squigPos >= 0)
        matchText = dateSearchTerm.Substring(0, squigPos);

    for (int pageIdx = firstPageIdx; pageIdx < lastPageIdxPlusOne; pageIdx++)
    {
        List<ScanTextElem> scanPageText = scanPages.scanPagesText[pageIdx];
        foreach (ScanTextElem textElem in scanPageText)
        {
            // Check if there are at least two digits together in the text (any date format requires this at least)
            if (!Regex.IsMatch(textElem.text, @"\d\d"))
                continue;

            // Check bounds
            if (!dateDocRect.Intersects(textElem.bounds))
                continue;

            // Elements containing the match text carry the caller's factor, the rest carry 0
            double matchResultFactor = 0;
            if (textElem.text.IndexOf(matchText, StringComparison.OrdinalIgnoreCase) >= 0)
                matchResultFactor = matchFactor;

            // Try to find dates.
            // NOTE(review): the three trailing integers passed to CoerceMatchesToDates look like
            // regex group indices for the date parts - confirm against CoerceMatchesToDates.
            if (bTryLong)
            {
                MatchCollection ldMatches = Regex.Matches(textElem.text, longDateRegex, RegexOptions.IgnoreCase);
                CoerceMatchesToDates(datesResult, matchResultFactor, textElem, ldMatches, ExtractedDate.DateMatchType.LongDate, 13, 11, 1);

                // US long dates are only tried as part of the long-date search
                if (bTryUS)
                {
                    MatchCollection usldMatches = Regex.Matches(textElem.text, USlongDateRegex, RegexOptions.IgnoreCase);
                    CoerceMatchesToDates(datesResult, matchResultFactor, textElem, usldMatches, ExtractedDate.DateMatchType.USLongDate, 14, 1, 4);
                }
            }

            if (bTryShort)
            {
                MatchCollection sdlzMatches = Regex.Matches(textElem.text, shortDateLeadingZeroesRegex, RegexOptions.IgnoreCase);
                CoerceMatchesToDates(datesResult, matchResultFactor, textElem, sdlzMatches, ExtractedDate.DateMatchType.ShortDateLeadingZeroes, 3, 2, 1);

                // The no-leading-zero and space-separated variants are only tried as part of the short-date search
                if (bTryNoZeroes)
                {
                    MatchCollection sdnlzMatches = Regex.Matches(textElem.text, shortDateNoLeadingZeroesRegex, RegexOptions.IgnoreCase);
                    CoerceMatchesToDates(datesResult, matchResultFactor, textElem, sdnlzMatches, ExtractedDate.DateMatchType.ShortDateNoLeadingZeroes, 3, 2, 1);
                }

                if (bTrySpaceSeparated)
                {
                    // NOTE(review): space-separated matches are reported as ShortDateNoLeadingZeroes - confirm this is intended
                    MatchCollection sdspMatches = Regex.Matches(textElem.text, shortDateSpacesRegex, RegexOptions.IgnoreCase);
                    CoerceMatchesToDates(datesResult, matchResultFactor, textElem, sdspMatches, ExtractedDate.DateMatchType.ShortDateNoLeadingZeroes, 3, 2, 1);
                }
            }
        }
    }
}
/// <summary>
/// Returns the text of the first text element on the given page whose bounds intersect
/// <paramref name="docRect"/>.
/// </summary>
/// <param name="scanPages">Scanned pages to read from.</param>
/// <param name="docRect">Rectangle the element's bounds must intersect.</param>
/// <param name="pageNum">1-based page number.</param>
/// <returns>The first intersecting element's text, or an empty string when the page does not exist or nothing intersects.</returns>
public static string ExtractTextFromPage(ScanPages scanPages, DocRectangle docRect, int pageNum)
{
    // Convert to a 0-based index and make sure the page exists
    int zeroBasedPage = pageNum - 1;
    bool pageExists = (zeroBasedPage >= 0) && (zeroBasedPage < scanPages.scanPagesText.Count);
    if (!pageExists)
        return "";

    // Walk the page's elements in order; the first one overlapping the rectangle wins
    List<ScanTextElem> elemsOnPage = scanPages.scanPagesText[zeroBasedPage];
    for (int i = 0; i < elemsOnPage.Count; i++)
    {
        ScanTextElem candidate = elemsOnPage[i];
        if (docRect.Intersects(candidate.bounds))
            return candidate.text;
    }

    // Nothing on the page overlaps the rectangle
    return "";
}