// Looks for a case-insensitive occurrence of a string in the scanned text elements
// whose bounds intersect the given rectangle.
//   str              - text to find; leading/trailing whitespace is ignored
//   docRectPercent   - rectangle a text element must intersect to be considered
//   scanPages        - scanned pages to search; null gives false
//   exprIdx          - expression term index recorded on each match location
//   matchingTextLocs - optional list receiving every match location; when null the
//                      search stops at the first match
// Returns true if at least one intersecting text element contains the string.
private bool MatchString(string str, DocRectangle docRectPercent, ScanPages scanPages, int exprIdx, List<DocMatchingTextLoc> matchingTextLocs)
{
#if TEST_PERF_MATCHSTRING
    Stopwatch stopWatch1 = new Stopwatch();
    stopWatch1.Start();
#endif
    bool result = false;
    if (scanPages == null)
        return result;

    // The search text is the same for every element so trim it once up front
    string searchText = str.Trim();
    int elemCount = 0;
    for (int pageIdx = 0; pageIdx < scanPages.scanPagesText.Count; pageIdx++)
    {
        List<ScanTextElem> scanPageText = scanPages.scanPagesText[pageIdx];
        for (int elemIdx = 0; elemIdx < scanPageText.Count; elemIdx++)
        {
            ScanTextElem textElem = scanPageText[elemIdx];

            // Check bounds
            if (docRectPercent.Intersects(textElem.bounds))
            {
                int mtchPos = textElem.text.IndexOf(searchText, StringComparison.OrdinalIgnoreCase);
                if (mtchPos >= 0)
                {
                    result = true;
                    if (matchingTextLocs != null)
                    {
                        DocMatchingTextLoc dtml = new DocMatchingTextLoc();
                        dtml.pageIdx = pageIdx;
                        dtml.elemIdx = elemIdx;
                        dtml.exprIdx = exprIdx;
                        dtml.posInText = mtchPos;
                        dtml.matchLen = searchText.Length;
                        dtml.foundInTxtLen = textElem.text.Length;
                        matchingTextLocs.Add(dtml);
                    }
                    else
                    {
                        // If not compiling all text match locations then return immediately to save time
                        return true;
                    }
                }
            }
            elemCount++;
        }
    }
#if TEST_PERF_MATCHSTRING
    stopWatch1.Stop();
    logger.Info("CheckForNewDocs : {0:0.00} uS, count {1}", stopWatch1.ElapsedTicks * 1000000.0 / Stopwatch.Frequency, elemCount);
#endif
    return result;
}
// Extracts text elements and their locations from the pages of a PDF file and
// works out a rotation for each page from the orientation of the text found on it.
//   uniqName              - unique document name passed to the ScanPages constructor
//   fileName              - path of the PDF file to read
//   maxPagesToExtractFrom - upper limit on the number of pages text is extracted from
//   totalPages            - set to the total number of pages in the PDF
// Returns the per-page text elements and per-page rotations.
public ScanPages ExtractDocInfo(string uniqName, string fileName, int maxPagesToExtractFrom, ref int totalPages)
{
    ScanPages scanPages = null;

    // Extract text and location from pdf pages
    using (Stream newpdfStream = new FileStream(fileName, FileMode.Open, FileAccess.Read))
    {
        List<List<LocationTextExtractionStrategyEx.TextInfo>> extractedTextAndLoc = new List<List<LocationTextExtractionStrategyEx.TextInfo>>();

        using (PdfReader pdfReader = new PdfReader(newpdfStream))
        {
            int numPagesToUse = pdfReader.NumberOfPages;
            if (numPagesToUse > maxPagesToExtractFrom)
                numPagesToUse = maxPagesToExtractFrom;

            for (int pageNum = 1; pageNum <= numPagesToUse; pageNum++)
            {
                LocationTextExtractionStrategyEx locationStrategy = new LocationTextExtractionStrategyEx();
                try
                {
                    PdfTextExtractor.GetTextFromPage(pdfReader, pageNum, locationStrategy);
                    extractedTextAndLoc.Add(locationStrategy.TextLocationInfo);
                }
                catch (Exception excp)
                {
                    logger.Error("Failed to extract from pdf {0}, page {1} excp {2}", fileName, pageNum, excp.Message);

                    // Keep exactly one entry per page so that the list position still corresponds
                    // to the page number below (page size lookup and the per-page result lists)
                    extractedTextAndLoc.Add(new List<LocationTextExtractionStrategyEx.TextInfo>());
                }
            }

            // Create new structures for the information
            int pageNumber = 1;
            List<List<ScanTextElem>> scanPagesText = new List<List<ScanTextElem>>();
            List<int> pageRotations = new List<int>();
            foreach (List<LocationTextExtractionStrategyEx.TextInfo> pageInfo in extractedTextAndLoc)
            {
                iTextSharp.text.Rectangle pageRect = pdfReader.GetPageSize(pageNumber);

                // Check through found text to see if the page seems to be rotated
                // (only attempted when there are more than two text elements on the page)
                int[] rotCounts = new int[] { 0, 0, 0, 0 };
                if (pageInfo.Count > 2)
                {
                    foreach (LocationTextExtractionStrategyEx.TextInfo txtInfo in pageInfo)
                    {
                        int thisRotation = GetTextRotation(txtInfo.TopLeft, txtInfo.BottomRight);

                        // Normalise so that a negative rotation cannot produce a negative index
                        int rotIdx = (((thisRotation / 90) % 4) + 4) % 4;
                        rotCounts[rotIdx]++;
                    }
                }

                // The page rotation is the most common text rotation (0 if there were no votes)
                int maxRot = 0;
                int maxRotCount = 0;
                for (int i = 0; i < rotCounts.Length; i++)
                {
                    if (maxRotCount < rotCounts[i])
                    {
                        maxRotCount = rotCounts[i];
                        maxRot = i * 90;
                    }
                }
                //Console.WriteLine("{2} Page{0}rot = {1}", pageNumber, maxRot, uniqName);

                List<ScanTextElem> scanTextElems = new List<ScanTextElem>();
                foreach (LocationTextExtractionStrategyEx.TextInfo txtInfo in pageInfo)
                {
                    DocRectangle boundsRectPercent = ConvertToDocRect(txtInfo.TopLeft, txtInfo.BottomRight, pageRect, maxRot);
                    ScanTextElem sti = new ScanTextElem(txtInfo.Text, boundsRectPercent);
                    scanTextElems.Add(sti);
                }
                scanPagesText.Add(scanTextElems);
                pageRotations.Add(maxRot);
                pageNumber++;
            }

            // Total pages
            totalPages = pdfReader.NumberOfPages;
            scanPages = new ScanPages(uniqName, pageRotations, scanPagesText);
            pdfReader.Close();

            // Sleep for a little to allow other things to run
            Thread.Sleep(100);
        }
    }

    // Return scanned text from pages
    return scanPages;
}
// Recursively evaluates a tokenised match expression against the scanned document text.
// Tokens handled: "(" and ")" for grouping, "&" / "|" to select how the next operand is
// combined, "!" to invert the next operand, ":" followed by a number giving the match
// factor of the preceding text, and "{" v , v ... "}" giving a location rectangle for
// the preceding text. Operands are combined strictly left to right.
//   matchExpression  - the full expression (only used for perf logging here)
//   st               - tokeniser positioned at the next token to read
//   scanPages        - scanned pages to match against
//   matchFactorTotal - accumulates the match factor of every term that evaluated true
//   curExpressionIdx - running index of the terminal terms, incremented per term
//   matchingTextLocs - optional list receiving match locations (passed to MatchString)
// Returns the result of the (sub)expression; returns on ")" or when tokens run out.
private bool EvalMatch(string matchExpression, StringTok st, ScanPages scanPages, ref double matchFactorTotal, ref int curExpressionIdx, List<DocMatchingTextLoc> matchingTextLocs)
{
    bool result = false;
    string token = "";
    bool curOpIsOr = true;
    bool opIsInverse = false;
    DocRectangle docRectPercent = new DocRectangle(0, 0, 100, 100);
    int docRectValIdx = 0;
    double matchFactorForTerm = 0;
#if TEST_PERF_EVALMATCH
    Stopwatch stopWatch1 = new Stopwatch();
    stopWatch1.Start();
#endif
    while ((token = st.GetNextToken()) != null)
    {
        if (token.Trim() == "")
            continue;
        else if (token == ")")
            return result;
        else if (token == "(")
        {
            bool tmpRslt = EvalMatch(matchExpression, st, scanPages, ref matchFactorTotal, ref curExpressionIdx, matchingTextLocs);
            if (opIsInverse)
                tmpRslt = !tmpRslt;
            if (curOpIsOr)
                result |= tmpRslt;
            else
                result &= tmpRslt;

            // The inverse operator applies to this group only - clear it so that it
            // does not leak onto the operands that follow
            opIsInverse = false;
        }
        else if (token == "&")
            curOpIsOr = false;
        else if (token == "|")
            curOpIsOr = true;
        else if (token == "!")
            opIsInverse = true;
        else
        {
            // We've reached a terminal token (string to match to text in the document)
            string stringToMatch = token;

            // Check for matchFactor - must have some text before it
            if (token == ":")
                return result;

            // See if there is a match factor defined by the next token
            while ((st.PeekNextToken() != null) && (st.PeekNextToken() == ""))
                st.GetNextToken();
            if ((st.PeekNextToken() != null) && (st.PeekNextToken() == ":"))
            {
                matchFactorForTerm = 0;
                st.GetNextToken();
                while ((st.PeekNextToken() != null) && (st.PeekNextToken() == ""))
                    st.GetNextToken();
                token = st.GetNextToken();
                if (token != null)
                    Double.TryParse(token, out matchFactorForTerm);
            }

            // Check for location on empty string
            if (token == "{")
                return result;

            // See if there is a location defined by the next token
            while ((st.PeekNextToken() != null) && (st.PeekNextToken() == ""))
                st.GetNextToken();
            if ((st.PeekNextToken() != null) && (st.PeekNextToken() == "{"))
            {
                while ((token = st.GetNextToken()) != null)
                {
                    if (token == "")
                        continue;
                    else if (token == "{")
                        docRectValIdx = 0;
                    else if (token == ",")
                        docRectValIdx++;
                    else if (token == "}")
                        break;
                    else
                    {
                        double rectVal = 0;
                        Double.TryParse(token, out rectVal);
                        docRectPercent.SetVal(docRectValIdx, rectVal);
                    }
                }
            }

            // Process the match string using the location rectangle
            // The check for curOpIsOr || result is to avoid unnecessary work if the expression is already false and we're doing a AND
            // (stringToMatch cannot be blank here - blank tokens are skipped at the top of the loop)
            if (curOpIsOr || result)
            {
                bool tmpRslt = MatchString(stringToMatch, docRectPercent, scanPages, curExpressionIdx, matchingTextLocs);
                if (opIsInverse)
                    tmpRslt = !tmpRslt;
                if (curOpIsOr)
                    result |= tmpRslt;
                else
                    result &= tmpRslt;

                // Handle match factor
                if (tmpRslt)
                    matchFactorTotal += matchFactorForTerm;
            }

            // Clear the inverse operator after 1 use - this must also happen when the term
            // was skipped above, otherwise the "!" would be applied to a later term instead
            opIsInverse = false;

            // Set the docRect to the entire page (ready for next term)
            docRectPercent = new DocRectangle(0, 0, 100, 100);
            matchFactorForTerm = 0;
            curExpressionIdx++;
        }
    }
#if TEST_PERF_EVALMATCH
    stopWatch1.Stop();
    logger.Info("EvalMatch : {0:0.00} uS, expr {1}", stopWatch1.ElapsedTicks * 1000000.0 / Stopwatch.Frequency, matchExpression);
#endif
    return result;
}
// Entry point for expression matching: tokenises the expression and evaluates it
// from the first term (expression index 0) against the scanned document text.
//   matchFactorTotal - accumulates the match factors reported by EvalMatch
//   matchingTextLocs - optional list receiving match locations
// Returns the result of evaluating the whole expression.
private bool MatchAgainstDocText(string matchExpression, ScanPages scanPages, ref double matchFactorTotal, List<DocMatchingTextLoc> matchingTextLocs)
{
    StringTok tokenizer = new StringTok(matchExpression);
    int firstExpressionIdx = 0;
    bool expressionMatched = EvalMatch(matchExpression, tokenizer, scanPages, ref matchFactorTotal, ref firstExpressionIdx, matchingTextLocs);
    return expressionMatched;
}
// Checks whether a scanned document matches a document type.
//   scanPages        - scanned text of the document
//   docType          - document type whose match expression is evaluated
//   extractDates     - when true, dates are also extracted using the type's date expression
//   matchingTextLocs - optional list receiving the locations of matching text
// Returns a result with code DISABLED or NO_EXPR when the type cannot be evaluated,
// otherwise NOT_FOUND or FOUND_MATCH (certainty 100) together with the accumulated match factor.
public DocTypeMatchResult CheckIfDocMatches(ScanPages scanPages, DocType docType, bool extractDates, List<DocMatchingTextLoc> matchingTextLocs)
{
    // Start from a "not found" result
    DocTypeMatchResult outcome = new DocTypeMatchResult();
    outcome.matchCertaintyPercent = 0;
    outcome.matchResultCode = DocTypeMatchResult.MatchResultCodes.NOT_FOUND;

    // Types that are disabled or have no expression are reported without evaluation
    if (!docType.isEnabled)
    {
        outcome.matchResultCode = DocTypeMatchResult.MatchResultCodes.DISABLED;
        return outcome;
    }
    if (docType.matchExpression == null)
    {
        outcome.matchResultCode = DocTypeMatchResult.MatchResultCodes.NO_EXPR;
        return outcome;
    }

    // Evaluate the match expression
    double factorTotal = 0;
    bool expressionMatched = MatchAgainstDocText(docType.matchExpression, scanPages, ref factorTotal, matchingTextLocs);
    if (expressionMatched)
    {
        outcome.matchCertaintyPercent = 100;
        outcome.matchResultCode = DocTypeMatchResult.MatchResultCodes.FOUND_MATCH;
    }
    outcome.docTypeName = docType.docTypeName;
    outcome.matchFactor = factorTotal;

    // Nothing more to do unless dates were asked for
    if (!extractDates)
        return outcome;

    int bestIdx = 0;
    List<ExtractedDate> datesFound = DocTextAndDateExtractor.ExtractDatesFromDoc(scanPages, docType.dateExpression, out bestIdx);
    outcome.datesFoundInDoc = datesFound;
    if (datesFound.Count > 0)
        outcome.docDate = datesFound[bestIdx].dateTime;
    return outcome;
}
// Finds the enabled document type that best matches the scanned document.
// The best match is the one with the highest certainty and, for equal certainty,
// the highest match factor.
//   scanPages             - scanned text of the document
//   listOfPossibleMatches - optional list; every type with a certainty or match factor
//                           above zero is appended and the list is sorted in place, best first
// Returns the best match; if it is not an exact (100%) match the dates are extracted
// from the entire document instead of using a type-specific date expression.
public DocTypeMatchResult GetMatchingDocType(ScanPages scanPages, List<DocTypeMatchResult> listOfPossibleMatches = null)
{
    // Get list of types
    DocTypeMatchResult bestMatchResult = new DocTypeMatchResult();
    var collection_doctypes = GetDocTypesCollection();
    MongoCursor<DocType> foundSdf = collection_doctypes.Find(Query.EQ("isEnabled", true));
#if TEST_PERF_GETMATCHINGDOCTYPE
    Stopwatch stopWatch1 = new Stopwatch();
    Stopwatch stopWatch2 = new Stopwatch();
#endif
    foreach (DocType doctype in foundSdf)
    {
#if TEST_PERF_GETMATCHINGDOCTYPE
        stopWatch1.Start();
#endif
        // Check if document matches
        DocTypeMatchResult matchResult = CheckIfDocMatches(scanPages, doctype, false, null);
#if TEST_PERF_GETMATCHINGDOCTYPE
        stopWatch1.Stop();
        stopWatch2.Start();
#endif
        // Find the best match
        bool bThisIsBestMatch = false;
        if (bestMatchResult.matchCertaintyPercent < matchResult.matchCertaintyPercent)
            bThisIsBestMatch = true;
        else if (bestMatchResult.matchCertaintyPercent == matchResult.matchCertaintyPercent)
            if (bestMatchResult.matchFactor < matchResult.matchFactor)
                bThisIsBestMatch = true;

        // Redo match to get date and time info
        if (bThisIsBestMatch)
        {
            matchResult = CheckIfDocMatches(scanPages, doctype, true, null);
            bestMatchResult = matchResult;
        }

        // Check if this should be returned in the list of best matches
        if (listOfPossibleMatches != null)
            if ((matchResult.matchCertaintyPercent > 0) || (matchResult.matchFactor > 0))
                listOfPossibleMatches.Add(matchResult);
#if TEST_PERF_GETMATCHINGDOCTYPE
        stopWatch2.Stop();
#endif
    }
#if TEST_PERF_GETMATCHINGDOCTYPE
    logger.Info("T1 : {0}ms, T2 : {1}ms", stopWatch1.ElapsedMilliseconds, stopWatch2.ElapsedMilliseconds);
#endif
    // If no exact match get date info from entire doc
    if (bestMatchResult.matchCertaintyPercent != 100)
    {
        int bestDateIdx = 0;
        List<ExtractedDate> extractedDates = DocTextAndDateExtractor.ExtractDatesFromDoc(scanPages, "", out bestDateIdx);
        bestMatchResult.datesFoundInDoc = extractedDates;
        if (extractedDates.Count > 0)
            bestMatchResult.docDate = extractedDates[bestDateIdx].dateTime;
    }

    // If list of best matches to be returned then sort that list now
    if (listOfPossibleMatches != null)
    {
        // Assigning a new list to the parameter would never reach the caller, so the
        // caller's list is refilled in place. Ordering follows the best-match rule used
        // above: highest certainty first, then highest match factor.
        List<DocTypeMatchResult> sortedMatches = listOfPossibleMatches
            .OrderByDescending(o => o.matchCertaintyPercent)
            .ThenByDescending(o => o.matchFactor)
            .ToList();
        listOfPossibleMatches.Clear();
        listOfPossibleMatches.AddRange(sortedMatches);
    }
    return bestMatchResult;
}
// Fills the document type list view (enabled types first, then by name), selects the
// type with the given name if it is present and shows page 1 of the unfiled document
// as the example document when both its info and pages were supplied.
//   selDocTypeName      - name of the document type to select
//   unfiledScanDocInfo  - info of the document to be filed (may be null)
//   unfiledScanDocPages - scanned pages of the document to be filed (may be null)
public void ShowDocTypeList(string selDocTypeName, ScanDocInfo unfiledScanDocInfo, ScanPages unfiledScanDocPages)
{
    _curUnfiledScanDocInfo = unfiledScanDocInfo;
    _curUnfiledScanDocPages = unfiledScanDocPages;

    // Order with enabled types first and alphabetically within each group
    DocType typeToSelect = null;
    List<DocType> allDocTypes = _docTypesMatcher.ListDocTypes();
    var orderedDocTypes = allDocTypes.OrderBy(d => !d.isEnabled).ThenBy(d => d.docTypeName);

    // Rebuild the bound collection, remembering the entry that should be selected
    _docTypeColl.Clear();
    foreach (DocType candidate in orderedDocTypes)
    {
        _docTypeColl.Add(candidate);
        if (candidate.docTypeName == selDocTypeName)
        {
            typeToSelect = candidate;
        }
    }
    docTypeListView.ItemsSource = _docTypeColl;
    if (typeToSelect != null)
    {
        docTypeListView.SelectedItem = typeToSelect;
    }

    // Display example doc
    bool haveDocToBeFiled = (_curUnfiledScanDocInfo != null) && (_curUnfiledScanDocPages != null);
    if (haveDocToBeFiled)
    {
        DisplayExampleDoc(_curUnfiledScanDocInfo.uniqName, 1, _curUnfiledScanDocPages);
    }
    btnShowDocToBeFiled.IsEnabled = haveDocToBeFiled;
}
// Searches the scanned text for dates described by one date search term.
//   scanPages             - scanned pages to search
//   dateSearchTerm        - search term (text plus optional macro commands) for the date
//   dateDocRect           - rectangle a text element must intersect to be searched
//   matchFactor           - base match factor; a bump is added for the first two pages
//   datesResult           - list passed to SearchWithinString to receive the dates found
//   latestDateRequested   - set to true if the term asks for the latest date
//   earliestDateRequested - set to true if the term asks for the earliest date
//   limitToPageNumN       - 1-based page number to restrict the search to, or -1 for all
//                           pages; a page number outside the document searches nothing
//   ignoreWhitespace      - passed through to SearchWithinString
public static void SearchForDateItem(ScanPages scanPages, string dateSearchTerm, DocRectangle dateDocRect, double matchFactor, List<ExtractedDate> datesResult, ref bool latestDateRequested, ref bool earliestDateRequested, int limitToPageNumN = -1, bool ignoreWhitespace = false)
{
    // Get date search info
    DateSrchInfo dateSrchInfo = GetDateSearchInfo(dateSearchTerm);
    if (dateSrchInfo.bEarliestDate)
        earliestDateRequested = true;
    if (dateSrchInfo.bLatestDate)
        latestDateRequested = true;

    // Find first and last pages to search
    int firstPageIdx = 0;
    int lastPageIdxPlusOne = scanPages.scanPagesText.Count;
    if (limitToPageNumN != -1)
    {
        // A page that is not in the document has no text to search - return rather
        // than index outside the page list
        if ((limitToPageNumN < 1) || (limitToPageNumN > scanPages.scanPagesText.Count))
            return;
        firstPageIdx = limitToPageNumN - 1;
        lastPageIdxPlusOne = limitToPageNumN;
    }

    // Iterate pages
    for (int pageIdx = firstPageIdx; pageIdx < lastPageIdxPlusOne; pageIdx++)
    {
        List<ScanTextElem> scanPageText = scanPages.scanPagesText[pageIdx];
        string joinedText = "";    // This maybe used if ~join macrocommand used
        int joinCount = 0;
        double matchFactorForThisPage = matchFactor + (pageIdx == 0 ? MATCH_FACTOR_BUMP_FOR_PAGE1 : (pageIdx == 1 ? MATCH_FACTOR_BUMP_FOR_PAGE2 : 0));

        // Iterate text elements
        foreach (ScanTextElem textElem in scanPageText)
        {
            // Check that the text contains at least two digits together to avoid wasting time looking for dates where there can be none
            if (!Regex.IsMatch(textElem.text, @"\d\d"))
                continue;

            // Check rectangle bounds
            if (!dateDocRect.Intersects(textElem.bounds))
                continue;

            // Check for join
            if (dateSrchInfo.bJoinTextInRect)
            {
                // Only the first MAX_TEXT_ELEMS_TO_JOIN elements are joined
                if (joinCount < MAX_TEXT_ELEMS_TO_JOIN)
                    joinedText += textElem.text + " ";
                joinCount++;
                continue;
            }

            // Search within the found text
            SearchWithinString(textElem.text, textElem.bounds, dateSearchTerm, dateSrchInfo, matchFactorForThisPage, pageIdx, datesResult, ignoreWhitespace);
        }

        // If joined then search just once
        if (dateSrchInfo.bJoinTextInRect)
            SearchWithinString(joinedText, dateDocRect, dateSearchTerm, dateSrchInfo, matchFactorForThisPage, pageIdx, datesResult, ignoreWhitespace);
    }

    // TEST TEST TEST
#if TEST_AGAINST_OLD_DATE_ALGORITHM
    {
        List<ExtractedDate> testDatesResult = new List<ExtractedDate>();
        SearchForDateItem2(scanPages, dateSearchTerm, dateDocRect, matchFactor, testDatesResult, limitToPageNumN);
        stp2.Stop();
        Console.WriteLine("File: " + scanPages.uniqName + " OldTime = " + stp2.ElapsedMilliseconds.ToString() + " NewTime = " + stp.ElapsedMilliseconds.ToString());
        foreach (ExtractedDate newD in datesResult)
        {
            bool bFound = false;
            foreach (ExtractedDate oldD in testDatesResult)
            {
                if (oldD.dateTime == newD.dateTime)
                {
                    bFound = true;
                    break;
                }
            }
            if (!bFound)
            {
                Console.WriteLine("Date Mismatch New=" + newD.dateTime.ToLongDateString());
            }
        }
        foreach (ExtractedDate oldD in testDatesResult)
        {
            bool bFound = false;
            foreach (ExtractedDate newD in datesResult)
            {
                if (oldD.dateTime == newD.dateTime)
                {
                    bFound = true;
                    break;
                }
            }
            if (!bFound)
            {
                Console.WriteLine("Date Mismatch Old=" + oldD.dateTime.ToLongDateString());
            }
        }
    }
#endif
}
// Makes the named document the current document: loads its cached info, shows the
// image of its first page and signals that the current document has changed.
// If no info is available for the document the current-document fields are cleared.
private void ShowDocumentFirstTime(string uniqName)
{
    // Load document info from db
    ScanDocAllInfo loadedInfo = _scanDocHandler.GetScanDocAllInfoCached(uniqName);
    bool infoAvailable = (loadedInfo != null) && (loadedInfo.scanDocInfo != null);
    if (infoAvailable)
    {
        _curDocScanPages = loadedInfo.scanPages;
        _curDocScanDocInfo = loadedInfo.scanDocInfo;
        _curFiledDocInfo = loadedInfo.filedDocInfo;
    }
    else
    {
        _curDocScanPages = null;
        _curDocScanDocInfo = null;
        _curFiledDocInfo = null;
        _curSelectedDocType = null;
    }

    // Display image of first page
    DisplayScannedDocImage(1);

    // Signal that the cur doc has changed
    _newCurDocProcessingCancel = true;
    _newCurDocSignal.Set();
}
// Renders pages of the PDF held by the rasterizer to JPEG files.
//   uniqName          - unique document name used to build the image file names
//   scanPages         - scanned page info; its page rotations are used when rotating
//   outputPath        - base folder used to build the image file names
//   maxPages          - maximum number of pages to convert
//   rotateBasedOnText - when true each page image is rotated by the rotation recorded
//                       for that page (if one is recorded and it is not zero)
// Returns the file names of the images that were written; a page that fails is
// logged and skipped.
public List<string> GeneratePageFiles(string uniqName, ScanPages scanPages, string outputPath, int maxPages, bool rotateBasedOnText)
{
    List<string> imgFileNames = new List<string>();

    // Create new stopwatch
    Stopwatch stopwatch = new Stopwatch();

    // Begin timing
    stopwatch.Start();

    int numPagesToConvert = _rasterizer.PageCount;
    if (numPagesToConvert > maxPages)
        numPagesToConvert = maxPages;

    for (int pageNumber = 1; pageNumber <= numPagesToConvert; pageNumber++)
    {
        string pageFileName = GetFilenameOfImageOfPage(outputPath, uniqName, pageNumber, true, "jpg");
        System.Drawing.Image img = null;
        try
        {
            img = _rasterizer.GetPage(_pointsPerInch, _pointsPerInch, pageNumber);

            // Rotate image as required
            if (rotateBasedOnText && (scanPages != null) && (pageNumber - 1 < scanPages.pageRotations.Count))
            {
                var pageRotation = scanPages.pageRotations[pageNumber - 1];
                if (pageRotation != 0)
                {
                    System.Drawing.Image rotatedImg = RotateImageWithoutCrop(img, pageRotation);

                    // Release the unrotated image unless the same object was handed back
                    if (!object.ReferenceEquals(rotatedImg, img))
                        img.Dispose();
                    img = rotatedImg;
                }
            }

            // Save to file
            img.Save(pageFileName, ImageFormat.Jpeg);
            imgFileNames.Add(pageFileName);
        }
        catch (Exception excp)
        {
            logger.Error("Failed to create image of page {0} excp {1}", pageFileName, excp.Message);
        }
        finally
        {
            // Images hold native resources so release them as soon as the page is done
            if (img != null)
                img.Dispose();
        }
    }

    // Stop timing
    stopwatch.Stop();
    logger.Info("Converted {0} ({1} pages) to image files in {2}", _inputPdfPath, numPagesToConvert, stopwatch.Elapsed);

    return imgFileNames;
}
// Bundles the three records held for a document: its scan info, its scanned pages
// and its filed-document info. The arguments are stored as given.
public ScanDocAllInfo(ScanDocInfo sdi, ScanPages spages, FiledDocInfo fdi)
{
    this.scanDocInfo = sdi;
    this.scanPages = spages;
    this.filedDocInfo = fdi;
}
// Extracts dates from a scanned document using a date expression and picks the best one.
// The expression is parsed into text, location and match-factor terms; each text term is
// searched for within its location (or the whole page when no location follows it).
//   scanPages   - scanned pages to search; null gives an empty list
//   dateExpr    - date expression; null or "" searches the whole document once
//   bestDateIdx - receives the index of the date with the highest match factor (0 if none)
// Returns all the dates found.
public static List<ExtractedDate> ExtractDatesFromDoc(ScanPages scanPages, string dateExpr, out int bestDateIdx)
{
    bestDateIdx = 0;
    List<ExtractedDate> datesResult = new List<ExtractedDate>();
    if (scanPages == null)
        return datesResult;

    // A missing expression is handled like an empty one - the terms below are cut out
    // of dateExpr with Substring so it must not be null
    if (dateExpr == null)
        dateExpr = "";

    // Extract location rectangles from doctype
    List<ExprParseTerm> parseTerms = DocTypesMatcher.ParseDocMatchExpression(dateExpr, 0);
    bool bAtLeastOneExprSearched = false;
    string lastDateSearchTerm = "";
    double lastDateSearchMatchFactor = 0;
    bool latestDateRequested = false;
    bool earliestDateRequested = false;
    foreach (ExprParseTerm parseTerm in parseTerms)
    {
        if (parseTerm.termType == ExprParseTerm.ExprParseTermType.exprTerm_Text)
        {
            // A new text term means the previous one had no location - search it over the whole page
            if (lastDateSearchTerm != "")
            {
                SearchForDateItem(scanPages, lastDateSearchTerm, new DocRectangle(0, 0, 100, 100), lastDateSearchMatchFactor, datesResult, ref latestDateRequested, ref earliestDateRequested);
                bAtLeastOneExprSearched = true;
            }
            lastDateSearchTerm = dateExpr.Substring(parseTerm.stPos, parseTerm.termLen);

            // Reset matchFactor for next search term
            lastDateSearchMatchFactor = 0;
        }
        else if (parseTerm.termType == ExprParseTerm.ExprParseTermType.exprTerm_Location)
        {
            string locStr = dateExpr.Substring(parseTerm.stPos, parseTerm.termLen);
            DocRectangle lastDateSearchRect = new DocRectangle(locStr);
            SearchForDateItem(scanPages, lastDateSearchTerm, lastDateSearchRect, lastDateSearchMatchFactor, datesResult, ref latestDateRequested, ref earliestDateRequested);
            lastDateSearchTerm = "";
            lastDateSearchMatchFactor = 0;
            bAtLeastOneExprSearched = true;
        }
        else if (parseTerm.termType == ExprParseTerm.ExprParseTermType.exprTerm_MatchFactor)
        {
            // The value follows the first character of the term
            if (dateExpr.Length > parseTerm.stPos + 1)
            {
                string valStr = dateExpr.Substring(parseTerm.stPos + 1, parseTerm.termLen - 1);
                Double.TryParse(valStr, out lastDateSearchMatchFactor);
            }
        }
    }

    // There may be one last expression still to find - but be sure that at least one is searched for
    if ((lastDateSearchTerm != "") || (!bAtLeastOneExprSearched))
        SearchForDateItem(scanPages, lastDateSearchTerm, new DocRectangle(0, 0, 100, 100), lastDateSearchMatchFactor, datesResult, ref latestDateRequested, ref earliestDateRequested);

    // If required check for the earliest and/or latest dates and bump their factors
    DateTime earliestDate = DateTime.MaxValue;
    DateTime latestDate = DateTime.MinValue;
    int earliestIdx = -1;
    int latestIdx = -1;
    for (int dateIdx = 0; dateIdx < datesResult.Count; dateIdx++)
    {
        if (earliestDate > datesResult[dateIdx].dateTime)
        {
            earliestDate = datesResult[dateIdx].dateTime;
            earliestIdx = dateIdx;
        }
        if (latestDate < datesResult[dateIdx].dateTime)
        {
            latestDate = datesResult[dateIdx].dateTime;
            latestIdx = dateIdx;
        }
    }
    if (earliestDateRequested && (earliestIdx != -1))
        datesResult[earliestIdx].matchFactor += MATCH_FACTOR_BUMP_FOR_EARLIEST_DATE;
    if (latestDateRequested && (latestIdx != -1))
        datesResult[latestIdx].matchFactor += MATCH_FACTOR_BUMP_FOR_LATEST_DATE;

    // Find the best date index based on highest match factor
    bestDateIdx = 0;
    double highestDateMatchFactor = 0;
    for (int dateIdx = 0; dateIdx < datesResult.Count; dateIdx++)
    {
        if (highestDateMatchFactor < datesResult[dateIdx].matchFactor)
        {
            bestDateIdx = dateIdx;
            highestDateMatchFactor = datesResult[dateIdx].matchFactor;
        }
    }

    return datesResult;
}
// Older date search: tries a set of date regular expressions on every text element
// that contains two adjacent digits and intersects the given rectangle.
//   scanPages       - scanned pages to search
//   dateSearchTerm  - text to match (up to the first '~') followed by optional macro
//                     commands ~long, ~short, ~US, ~No0 and ~Spaces selecting the formats;
//                     if neither ~long nor ~short is present all formats are tried
//   dateDocRect     - rectangle a text element must intersect to be searched
//   matchFactor     - factor given to dates found in an element that contains the match text
//   datesResult     - list passed to CoerceMatchesToDates to receive the dates found
//   limitToPageNumN - 1-based page number to restrict the search to, or -1 for all pages
public static void SearchForDateItem2(ScanPages scanPages, string dateSearchTerm, DocRectangle dateDocRect, double matchFactor, List<ExtractedDate> datesResult, int limitToPageNumN = -1)
{
    int firstPageIdx = 0;
    int lastPageIdxPlusOne = scanPages.scanPagesText.Count;
    if (limitToPageNumN != -1)
    {
        firstPageIdx = limitToPageNumN - 1;
        lastPageIdxPlusOne = limitToPageNumN;
    }

    // The formats to try and the match text depend only on the search term so work
    // them out once rather than for every text element

    // See which date formats to try
    bool bTryLong = dateSearchTerm.IndexOf("~long", StringComparison.OrdinalIgnoreCase) >= 0;
    bool bTryShort = dateSearchTerm.IndexOf("~short", StringComparison.OrdinalIgnoreCase) >= 0;
    bool bTryUS = dateSearchTerm.IndexOf("~US", StringComparison.OrdinalIgnoreCase) >= 0;
    bool bTryNoZeroes = dateSearchTerm.IndexOf("~No0", StringComparison.OrdinalIgnoreCase) >= 0;
    bool bTrySpaceSeparated = dateSearchTerm.IndexOf("~Spaces", StringComparison.OrdinalIgnoreCase) >= 0;
    if (!(bTryLong | bTryShort))
    {
        bTryLong = true;
        bTryShort = true;
        bTryUS = true;
        bTryNoZeroes = true;
        bTrySpaceSeparated = true;
    }

    // Get match text if any
    string matchText = dateSearchTerm;
    int squigPos = dateSearchTerm.IndexOf('~');
    if (squigPos >= 0)
        matchText = dateSearchTerm.Substring(0, squigPos);

    for (int pageIdx = firstPageIdx; pageIdx < lastPageIdxPlusOne; pageIdx++)
    {
        List<ScanTextElem> scanPageText = scanPages.scanPagesText[pageIdx];
        foreach (ScanTextElem textElem in scanPageText)
        {
            // Check if there are at least two digits together in the text (any date format requires this at least)
            if (!Regex.IsMatch(textElem.text, @"\d\d"))
                continue;

            // Check bounds
            if (!dateDocRect.Intersects(textElem.bounds))
                continue;

            // Dates only get the match factor if the element also contains the match text
            double matchResultFactor = 0;
            if (textElem.text.IndexOf(matchText, StringComparison.OrdinalIgnoreCase) >= 0)
                matchResultFactor = matchFactor;

            // Try to find dates
            if (bTryLong)
            {
                MatchCollection ldMatches = Regex.Matches(textElem.text, longDateRegex, RegexOptions.IgnoreCase);
                CoerceMatchesToDates(datesResult, matchResultFactor, textElem, ldMatches, ExtractedDate.DateMatchType.LongDate, 13, 11, 1);
                if (bTryUS)
                {
                    MatchCollection usldMatches = Regex.Matches(textElem.text, USlongDateRegex, RegexOptions.IgnoreCase);
                    CoerceMatchesToDates(datesResult, matchResultFactor, textElem, usldMatches, ExtractedDate.DateMatchType.USLongDate, 14, 1, 4);
                }
            }
            if (bTryShort)
            {
                MatchCollection sdlzMatches = Regex.Matches(textElem.text, shortDateLeadingZeroesRegex, RegexOptions.IgnoreCase);
                CoerceMatchesToDates(datesResult, matchResultFactor, textElem, sdlzMatches, ExtractedDate.DateMatchType.ShortDateLeadingZeroes, 3, 2, 1);
                if (bTryNoZeroes)
                {
                    MatchCollection sdnlzMatches = Regex.Matches(textElem.text, shortDateNoLeadingZeroesRegex, RegexOptions.IgnoreCase);
                    CoerceMatchesToDates(datesResult, matchResultFactor, textElem, sdnlzMatches, ExtractedDate.DateMatchType.ShortDateNoLeadingZeroes, 3, 2, 1);
                }
                if (bTrySpaceSeparated)
                {
                    MatchCollection sdspMatches = Regex.Matches(textElem.text, shortDateSpacesRegex, RegexOptions.IgnoreCase);
                    CoerceMatchesToDates(datesResult, matchResultFactor, textElem, sdspMatches, ExtractedDate.DateMatchType.ShortDateNoLeadingZeroes, 3, 2, 1);
                }
            }
        }
    }
}
// Inserts a scanned-pages record into the doc pages collection.
// A failure is logged and not rethrown.
public void AddScanPagesRecToMongo(ScanPages scanPages)
{
    try
    {
        // Mongo append
        MongoCollection<ScanPages> docPagesCollection = GetDocPagesCollection();
        docPagesCollection.Insert(scanPages);

        // Record the successful insert
        logger.Info("Added scandocpages record for {0}", scanPages.uniqName);
    }
    catch (Exception insertExcp)
    {
        logger.Error(
            "Cannot insert scandocpages into {0} Coll... {1} for file {2} excp {3}",
            _scanConfig._dbNameForDocs,
            _scanConfig._dbCollectionForDocPages,
            scanPages.uniqName,
            insertExcp.Message);
    }
}
// Shows the image of one page of a document in the example image control.
// The scan pages are remembered in every case; the displayed name and page number
// are only updated if the image file exists and loads, and are reset to "" and 1
// if loading it fails.
private void DisplayExampleDoc(string uniqName, int pageNum, ScanPages scanPages)
{
    _curDocDisplay_scanPages = scanPages;

    string pageImagePath = PdfRasterizer.GetFilenameOfImageOfPage(Properties.Settings.Default.DocAdminImgFolderBase, uniqName, pageNum, false);
    if (File.Exists(pageImagePath))
    {
        try
        {
            Uri pageImageUri = new Uri("File:" + pageImagePath);
            exampleFileImage.Source = new BitmapImage(pageImageUri);
            _curDocDisplay_uniqName = uniqName;
            _curDocDisplay_pageNum = pageNum;
        }
        catch (Exception loadExcp)
        {
            logger.Error("Loading bitmap file {0} excp {1}", pageImagePath, loadExcp.Message);
            _curDocDisplay_uniqName = "";
            _curDocDisplay_pageNum = 1;
        }
    }
}
// when processing file
// - first move the file
// - then update the doc record to say processed
//
// Archives a PDF file, optionally extracts its text and page images and adds the
// resulting records to the databases.
//   fileName                     - path of the PDF file
//   uniqName                     - unique name of the document
//   bExtractImages               - generate page image files
//   bDontOverwriteExistingImages - skip image generation if the image of page 1 already exists
//   bExtractText                 - extract text and locations from the pages
//   bRecogniseDoc                - not used by this method
//   bAddToDocInfoDb              - add the document info record to the database
//   bAddToDocPagesDb             - add the scanned pages record to the database
// Returns false if the record check fails or the archive copy cannot be made, otherwise true.
public bool ProcessPdfFile(string fileName, string uniqName, bool bExtractImages, bool bDontOverwriteExistingImages, bool bExtractText, bool bRecogniseDoc, bool bAddToDocInfoDb, bool bAddToDocPagesDb)
{
    // First check if doc details are already in db
    // NOTE(review): this returns when the record does NOT exist, yet a record is added
    // further down - confirm whether the condition is meant to be inverted
    if (!ScanDocInfoRecordExists(uniqName))
        return false;

    // Make a copy of the file in the archive location
    string archiveFileName = ScanDocHandler.GetArchiveFileName(uniqName);
    if (!Delimon.Win32.IO.File.Exists(archiveFileName))
    {
        string statusStr = "";
        bool bResult = CopyFile(fileName, archiveFileName, ref statusStr);
        if (!bResult)
        {
            logger.Error("Can't make archive copy {0} excp {1}", archiveFileName, statusStr);
            return false;
        }
    }
    else
    {
        logger.Info("Archive file already exists {0}", archiveFileName);
    }

    // Extract text blocks from file
    ScanPages scanPages = new ScanPages(uniqName);
    int totalNumPages = 0;
    if (bExtractText)
    {
        PdfTextAndLocExtractor pdfExtractor = new PdfTextAndLocExtractor();
        scanPages = pdfExtractor.ExtractDocInfo(uniqName, fileName, _scanConfig._maxPagesForText, ref totalNumPages);
    }

    // Extract images from file
    if (bExtractImages)
    {
        // Only look on disk for an existing image when overwriting is not allowed
        bool procImages = (!bDontOverwriteExistingImages) || (!Delimon.Win32.IO.File.Exists(PdfRasterizer.GetFilenameOfImageOfPage(_scanConfig._docAdminImgFolderBase, uniqName, 1, false)));
        if (procImages)
        {
            PdfRasterizer rs = new PdfRasterizer(fileName, THUMBNAIL_POINTS_PER_INCH);
            try
            {
                // The returned list of file names is not needed here
                rs.GeneratePageFiles(uniqName, scanPages, _scanConfig._docAdminImgFolderBase, _scanConfig._maxPagesForImages, false);
            }
            finally
            {
                rs.Close();
            }
        }
    }

    // Form partial document info
    DateTime fileDateTime = Delimon.Win32.IO.File.GetCreationTime(fileName);
    ScanDocInfo scanDocInfo = new ScanDocInfo(uniqName, totalNumPages, scanPages.scanPagesText.Count, fileDateTime, fileName.Replace('\\', '/'), false);

    // Add records to mongo databases
    if (bAddToDocPagesDb)
        AddScanPagesRecToMongo(scanPages);
    if (bAddToDocInfoDb)
        AddDocInfoRecToMongo(scanDocInfo);

    // Request update to unfiled documents list
    _scanDocInfoCache.RequestUnfiledListUpdate();

    return true;
}
// Returns the text of the first text element on a page whose bounds intersect a rectangle.
//   scanPages - scanned pages of the document
//   docRect   - rectangle to test each text element's bounds against
//   pageNum   - 1-based page number
// Returns "" if the page number is outside the document or no element intersects.
public static string ExtractTextFromPage(ScanPages scanPages, DocRectangle docRect, int pageNum)
{
    int pageIdx = pageNum - 1;
    bool pageInDocument = (pageIdx >= 0) && (pageIdx < scanPages.scanPagesText.Count);
    if (!pageInDocument)
    {
        return "";
    }

    // Walk the text elements of the page and hand back the first one inside the rectangle
    List<ScanTextElem> elemsOnPage = scanPages.scanPagesText[pageIdx];
    foreach (ScanTextElem candidate in elemsOnPage)
    {
        if (docRect.Intersects(candidate.bounds))
        {
            return candidate.text;
        }
    }
    return "";
}