/// <summary>
/// Parses a date search term into a <see cref="DateSrchInfo"/>: a set of option flags plus
/// the ordered list of date elements (day / month / year) described by the pattern.
/// </summary>
/// <remarks>
/// Flags ("~USDate", "~AllowColons", ...) are detected anywhere in the term, ignoring case.
/// The pattern is the text before the first '~'; when the term contains no '~' the pattern
/// is treated as empty and no date elements are produced. Only the first occurrence of each
/// of 'd', 'm' and 'y' in the pattern creates an element, and elements are added in the
/// order those letters appear.
/// </remarks>
/// <param name="dateSearchTerm">Search term holding the pattern and the "~" flags.</param>
/// <returns>The populated search description.</returns>
private static DateSrchInfo GetDateSearchInfo(string dateSearchTerm)
{
    DateSrchInfo dateSrchInfo = new DateSrchInfo();

    // Extract flags (case-insensitive substring tests on the whole term)
    dateSrchInfo.bIsUsDate = (dateSearchTerm.IndexOf("~USDate", StringComparison.OrdinalIgnoreCase) >= 0);
    dateSrchInfo.bAllowColons = (dateSearchTerm.IndexOf("~AllowColons", StringComparison.OrdinalIgnoreCase) >= 0);
    dateSrchInfo.bAllowDots = (dateSearchTerm.IndexOf("~AllowDots", StringComparison.OrdinalIgnoreCase) >= 0);
    dateSrchInfo.bAllowTwoCommas = (dateSearchTerm.IndexOf("~AllowTwoCommas", StringComparison.OrdinalIgnoreCase) >= 0);
    dateSrchInfo.bPlusOneMonth = (dateSearchTerm.IndexOf("~PlusOneMonth", StringComparison.OrdinalIgnoreCase) >= 0);
    dateSrchInfo.bJoinTextInRect = (dateSearchTerm.IndexOf("~join", StringComparison.OrdinalIgnoreCase) >= 0);
    dateSrchInfo.bNoDateRanges = (dateSearchTerm.IndexOf("~NoDateRanges", StringComparison.OrdinalIgnoreCase) >= 0);
    dateSrchInfo.bLatestDate = (dateSearchTerm.IndexOf("~latest", StringComparison.OrdinalIgnoreCase) >= 0);
    dateSrchInfo.bEarliestDate = (dateSearchTerm.IndexOf("~earliest", StringComparison.OrdinalIgnoreCase) >= 0);
    dateSrchInfo.bFinancialYearEnd = (dateSearchTerm.IndexOf("~finYearEnd", StringComparison.OrdinalIgnoreCase) >= 0);

    // Pattern string = everything before the first '~' (lower-cased).
    // NOTE(review): a term with no '~' at all yields an empty pattern - confirm that is intended.
    string patternStr = "";
    int squigPos = dateSearchTerm.IndexOf('~');
    if (squigPos >= 0)
    {
        patternStr = dateSearchTerm.Substring(0, squigPos).ToLower();
    }

    // Find order of elements
    bool bDayFound = false;
    bool bMonthFound = false;
    bool bYearFound = false;
    for (int chIdx = 0; chIdx < patternStr.Length; chIdx++)
    {
        char ch = patternStr[chIdx];

        // Handle day pattern = d or dd (both forms give 1..2 digits)
        if (!bDayFound && (ch == 'd'))
        {
            DateElemSrch el = new DateElemSrch();
            el.dateElType = DateElemSrch.DateElType.DE_DAY;
            el.isDigits = true;
            el.minChars = 1;
            el.maxChars = 2;
            el.allowLeadingZeroes = false;
            dateSrchInfo.dateEls.Add(el);
            bDayFound = true;
        }

        // Handle month pattern = m or mm or mmm or mmmm
        if (!bMonthFound && (ch == 'm'))
        {
            DateElemSrch el = new DateElemSrch();
            el.dateElType = DateElemSrch.DateElType.DE_MONTH;
            el.isDigits = true;
            el.minChars = 1;
            el.maxChars = 2;
            el.allowLeadingZeroes = false;
            if ((chIdx + 1 < patternStr.Length) && (patternStr[chIdx + 1] == 'm'))
            {
                // mm: numeric month, leading zero allowed
                el.allowLeadingZeroes = true;
                if ((chIdx + 2 < patternStr.Length) && (patternStr[chIdx + 2] == 'm'))
                {
                    // mmm: three-letter month name
                    el.isDigits = false;
                    el.minChars = 3;
                    el.maxChars = 3;
                    if ((chIdx + 3 < patternStr.Length) && (patternStr[chIdx + 3] == 'm'))
                    {
                        // mmmm: month name of 3..9 letters.
                        // Fix: the original assigned minChars twice (3 then 9), leaving
                        // maxChars at 3 and minChars at 9.
                        el.isDigits = false;
                        el.minChars = 3;
                        el.maxChars = 9;
                    }
                }
            }
            dateSrchInfo.dateEls.Add(el);
            bMonthFound = true;
        }

        // Handle year pattern = yy or yyyy
        if (!bYearFound && (ch == 'y'))
        {
            DateElemSrch el = new DateElemSrch();
            el.dateElType = DateElemSrch.DateElType.DE_YEAR;
            el.isDigits = true;
            el.minChars = 2;
            el.maxChars = 2;
            el.allowLeadingZeroes = false;
            // A 'y' two positions further on is taken to mean a four-digit year.
            // Fix: both limits are now set only in that case; previously the missing braces
            // made maxChars = 4 unconditional, so "yy" accepted 2..4 digits.
            if ((chIdx + 2 < patternStr.Length) && (patternStr[chIdx + 2] == 'y'))
            {
                el.minChars = 4;
                el.maxChars = 4;
            }
            dateSrchInfo.dateEls.Add(el);
            bYearFound = true;
        }
    }
    return dateSrchInfo;
}
/// <summary>
/// Scans a string for dates and appends every date found to <paramref name="datesResult"/>
/// via <c>AddCompletedDateToList</c>.
/// </summary>
/// <remarks>
/// Two passes are made:
/// 1. If <c>dateSrchInfo.bFinancialYearEnd</c> is set, a "year end ... NN/NNNN" regex is run
///    on the raw input and a match is recorded with month 4, day 5 and a match factor of 100.
/// 2. A character-by-character scan of the lower-cased input (spaces removed first when
///    <paramref name="ignoreWhitespace"/> is true) collects day, month and year values. When
///    <c>dateSrchInfo.dateEls</c> still has an element for the current position it dictates
///    what is expected next; once the elements are exhausted (or none were given) the
///    element type is guessed from digit count and position.
/// Positions reported from pass 2 are indices into the transformed string, not the raw input.
/// </remarks>
/// <param name="inStr">Text to search.</param>
/// <param name="textBounds">Location of the text; passed through to the result.</param>
/// <param name="dateSearchTerm">Not used by this method.</param>
/// <param name="dateSrchInfo">Parsed search options and expected element order.</param>
/// <param name="matchFactor">Base match factor for dates found by the character scan.</param>
/// <param name="pageIdx">Zero-based page index; results are reported as pageIdx + 1.</param>
/// <param name="datesResult">List that receives the dates found.</param>
/// <param name="ignoreWhitespace">When true, spaces are stripped before scanning.</param>
private static void SearchWithinString(string inStr, DocRectangle textBounds, string dateSearchTerm, DateSrchInfo dateSrchInfo, double matchFactor, int pageIdx, List<ExtractedDate> datesResult, bool ignoreWhitespace)
{
    int numDatesFoundInString = 0;
    int year = -1;

    // Use regex to find financial year end
    if (dateSrchInfo.bFinancialYearEnd)
    {
        const string finYearEndRegex = @"year end.{0,16}?\s?((19|20)?(\d\d))";
        Match fyMatch = Regex.Match(inStr, finYearEndRegex, RegexOptions.IgnoreCase);
        if (fyMatch.Success)
        {
            if (fyMatch.Groups.Count > 1)
            {
                // Add result.
                // Fix: the callee expects the inclusive index of the last matched character
                // (it computes matchLength = lastMatchPos - firstMatchPos + 1). The original
                // passed fyMatch.Length here, giving a wrong length for any match not at index 0.
                year = Convert.ToInt32(fyMatch.Groups[1].Value);
                int fyLastMatchPos = fyMatch.Index + fyMatch.Length - 1;
                AddCompletedDateToList(inStr, textBounds, 100, year, 4, 5, false, false, fyMatch.Index, fyLastMatchPos, dateSrchInfo, pageIdx+1, datesResult);
                numDatesFoundInString++;
            }
        }
    }

    // Start at the beginning of the string
    string s = inStr;
    if (ignoreWhitespace)
    {
        s = s.Replace(" ", "");
    }
    int dateSrchPos = 0;            // index of the date element currently being looked for
    int chIdx = 0;
    string curStr = "";             // digits or letters accumulated for the current element
    int day = -1;
    int month = -1;
    bool bMonthFromChars = false;   // true when the month came from a month name
    year = -1;
    s = s.ToLower();
    bool strIsDigits = false;       // whether curStr currently holds digits
    int firstMatchPos = -1;
    int lastMatchPos = 0;
    int commaCount = 0;
    bool bRangeIndicatorFound = false;
    int numSepChars = 0;            // non-element chars seen since the last element char
    for (chIdx = 0; chIdx < s.Length; chIdx++)
    {
        char ch = s[chIdx];
        bool bResetNeeded = false;

        // Search element (null once past the end of the configured elements)
        DateElemSrch el = null;
        int minChars = 1;
        int maxChars = 9;
        if (dateSrchPos < dateSrchInfo.dateEls.Count)
        {
            el = dateSrchInfo.dateEls[dateSrchPos];
            minChars = el.minChars;
            maxChars = el.maxChars;
        }

        // Check if digits required
        if ((el == null) || (el.isDigits))
        {
            // Treat 'l' as '1' and 'o' as '0' when they sit inside or directly before digits.
            // NOTE(review): the original also had an 'i' -> '1' branch that could never run
            // because this guard only admits 'l' and 'o'; it is left disabled here.
            char testCh = ch;
            if ((testCh == 'l') || (testCh == 'o'))
            {
                if (((strIsDigits) && (curStr.Length > 0)) || ((chIdx+1 < s.Length) && (Char.IsDigit(s[chIdx+1]))))
                {
                    testCh = (testCh == 'l') ? '1' : '0';
                }
            }

            if (Char.IsDigit(testCh))
            {
                numSepChars = 0;

                // (allowLeadingZeroes is not enforced here; the check was disabled in the original.)
                if (!strIsDigits)
                {
                    curStr = "";
                }
                curStr += testCh;
                strIsDigits = true;
                if (curStr.Length < minChars)
                {
                    continue;
                }

                // Check max chars
                if (curStr.Length > maxChars)
                {
                    curStr = "";
                    continue;
                }

                // Check if the next char is also a digit - if not then we've found what we're looking for
                if (((chIdx + 1 >= s.Length) || (!Char.IsDigit(s[chIdx + 1]))) && (curStr != "0"))
                {
                    // Is this a day / month or year??
                    DateElemSrch.DateElType elType = DateElemSrch.DateElType.DE_NONE;
                    if (el != null)
                    {
                        elType = el.dateElType;
                    }
                    else
                    {
                        // Handle one and two digit numbers
                        if (curStr.Length <= 2)
                        {
                            // Already had a char based month?
                            if (bMonthFromChars)
                            {
                                if (!dateSrchInfo.bIsUsDate)
                                    elType = DateElemSrch.DateElType.DE_YEAR;
                                else
                                    elType = DateElemSrch.DateElType.DE_DAY;
                            }
                            else
                            {
                                // Position for standard month?
                                if ((dateSrchPos == 1) && (!dateSrchInfo.bIsUsDate))
                                    elType = DateElemSrch.DateElType.DE_MONTH;
                                // Position for US month?
                                else if ((dateSrchPos == 0) && (dateSrchInfo.bIsUsDate))
                                    elType = DateElemSrch.DateElType.DE_MONTH;
                                else if (dateSrchPos < 2)
                                    elType = DateElemSrch.DateElType.DE_DAY;
                                else if ((dateSrchPos > 0) && (curStr.Length == 2))
                                    elType = DateElemSrch.DateElType.DE_YEAR;
                            }
                        }
                        else if (curStr.Length == 4)
                        {
                            // Num digits == 4
                            if (dateSrchPos > 0)
                                elType = DateElemSrch.DateElType.DE_YEAR;
                        }
                    }

                    // Handle the value (out-of-range values are discarded as -1)
                    if (elType == DateElemSrch.DateElType.DE_DAY)
                    {
                        Int32.TryParse(curStr, out day);
                        if ((day < 1) || (day > 31))
                            day = -1;
                    }
                    else if (elType == DateElemSrch.DateElType.DE_MONTH)
                    {
                        Int32.TryParse(curStr, out month);
                        if ((month < 1) || (month > 12))
                            month = -1;
                        bMonthFromChars = false;
                    }
                    else if (elType == DateElemSrch.DateElType.DE_YEAR)
                    {
                        Int32.TryParse(curStr, out year);
                        if (curStr.Length == 2)
                        {
                            if ((year < 0) || (year > 100))
                                year = -1;
                        }
                        else if (curStr.Length == 4)
                        {
                            if ((year < 1800) || (year > 2200))
                                year = -1;
                        }

                        // If no date formatting string is used then year must be the last item
                        if ((el == null) && (year != -1))
                            bResetNeeded = true;
                    }
                    else
                    {
                        // Number could not be classified - drop it
                        curStr = "";
                        continue;
                    }

                    if (firstMatchPos == -1)
                        firstMatchPos = chIdx - curStr.Length;
                    lastMatchPos = chIdx;
                    dateSrchPos++;
                    curStr = "";
                }
            }
        }

        // Check if letters (a month name) are acceptable at this position
        if ((el == null) || (!el.isDigits))
        {
            if (Char.IsLetter(ch))
            {
                if (strIsDigits)
                    curStr = "";
                strIsDigits = false;

                // Check we're still looking for a month value
                if (month != -1)
                {
                    numSepChars++;
                    continue;
                }

                // Form a sub-string to test for month names
                curStr += ch;

                // Check for range indicator (only once exactly one date has been found).
                // The original tested the same " to" literal twice; one test is equivalent.
                if (numDatesFoundInString == 1)
                {
                    if (chIdx - curStr.Length - 1 > 0)
                    {
                        string testStr = s.Substring(chIdx - curStr.Length - 1);
                        if (testStr.Contains(" to"))
                            bRangeIndicatorFound = true;
                    }
                }

                // No point checking for month strings until 3 chars got
                if (curStr.Length < 3)
                    continue;

                // Check for a month name
                if (shortMonthStrings.Any(curStr.Contains))
                {
                    for (int monIdx = 0; monIdx < shortMonthStrings.Length; monIdx++)
                    {
                        if (curStr.Contains(shortMonthStrings[monIdx]))
                        {
                            month = monIdx + 1;
                            bMonthFromChars = true;
                            break;
                        }
                    }
                    if (firstMatchPos == -1)
                        firstMatchPos = chIdx - curStr.Length;
                    lastMatchPos = chIdx;
                    dateSrchPos++;
                    curStr = "";
                    numSepChars = 0;

                    // Move chIdx on to skip to next non letter
                    while ((chIdx < s.Length-1) && (Char.IsLetter(s[chIdx+1])))
                        chIdx++;

                    // Check for another valid month string in next few chars to detect ranges without a year
                    // e.g. should find ranges like 3 Jan - 4 Mar 2011 or 1st Jan to 31st May 2013
                    // but exclude ranges like 3 Jan 2012 - 4 Mar 2012 which would be seen as two separate dates
                    if (!dateSrchInfo.bNoDateRanges)
                    {
                        string strNextStr = "";
                        bool bStrRangeIndicatorFound = false;
                        int digitGroups = 0;
                        bool isInDigitGroup = false;
                        for (int chNext = chIdx+1; (chNext < s.Length) && (chNext < chIdx + 15); chNext++)
                        {
                            // Count the groups of digits
                            // (if we find two groups then break out as it's probably a range that contains separate years)
                            if (Char.IsDigit(s[chNext]))
                            {
                                if (!isInDigitGroup)
                                {
                                    isInDigitGroup = true;
                                    digitGroups++;
                                    if (digitGroups >= 2)
                                        break;
                                }
                            }
                            // Form a string from letters found
                            else if (Char.IsLetter(s[chNext]))
                            {
                                isInDigitGroup = false;
                                strNextStr += s[chNext];

                                // Check if the string contains "to"
                                if (strNextStr.Length >= 2)
                                    if (strNextStr.Contains("to"))
                                        bStrRangeIndicatorFound = true;

                                // Check if the string contains a short month name
                                if (bStrRangeIndicatorFound && (strNextStr.Length >= 3))
                                    if (shortMonthStrings.Any(strNextStr.Contains))
                                    {
                                        bResetNeeded = true;
                                        break;
                                    }
                            }
                            else
                            {
                                // Check punctuation - this assumes a - is a range separator
                                isInDigitGroup = false;
                                if (s[chNext] == '-')
                                    bStrRangeIndicatorFound = true;
                                strNextStr = "";
                            }
                        }
                    }
                    else
                    {
                        // NOTE(review): with bNoDateRanges set, every text month triggers a
                        // reset (after the completeness check below). Behaviour kept as in
                        // the original - confirm this is the intended meaning of the flag.
                        bResetNeeded = true;
                    }
                }
            }
        }

        // Check for whitespace/punctuation/etc
        if (!Char.IsLetterOrDigit(ch))
        {
            if ((day != -1) || (month != -1) || (year != -1))
            {
                numSepChars++;
                if (numSepChars > MAX_SEP_CHARS_BETWEEN_DATE_ELEMS)
                {
                    bResetNeeded = true;
                    numSepChars = 0;
                }
            }
            curStr = "";
            switch (ch)
            {
                case ':':
                {
                    if (!dateSrchInfo.bAllowColons)
                        bResetNeeded = true;
                    break;
                }
                case ',':
                {
                    commaCount++;
                    if ((!dateSrchInfo.bAllowTwoCommas) && (commaCount > 1))
                        bResetNeeded = true;
                    break;
                }
                case '.':
                {
                    if (!dateSrchInfo.bAllowDots)
                        bResetNeeded = true;
                    break;
                }
                case '-':
                {
                    if (numDatesFoundInString == 1)
                        bRangeIndicatorFound = true;
                    break;
                }
            }
        }

        // Check for complete date (day may be absent when the month was given by name)
        if ((year != -1) && (month != -1) && ((day != -1) || (bMonthFromChars)))
        {
            // Add result
            AddCompletedDateToList(s, textBounds, matchFactor, year, month, day, bMonthFromChars, bRangeIndicatorFound, firstMatchPos, lastMatchPos, dateSrchInfo, pageIdx+1, datesResult);
            numDatesFoundInString++;

            // Start again to see if another date can be found
            curStr = "";
            bResetNeeded = true;
        }

        // Restart the process of finding a date if required
        // (lastMatchPos and bRangeIndicatorFound deliberately carry over, as in the original)
        if (bResetNeeded)
        {
            dateSrchPos = 0;
            day = -1;
            month = -1;
            year = -1;
            bMonthFromChars = false;
            strIsDigits = false;
            firstMatchPos = -1;
            bResetNeeded = false;
            commaCount = 0;
            numSepChars = 0;
        }
    }
}
/// <summary>
/// Builds an <see cref="ExtractedDate"/> from the supplied date parts, scores it, and
/// appends it to <paramref name="datesResult"/>.
/// </summary>
/// <remarks>
/// Years below 80 are mapped to 20xx and years from 80 to 99 to 19xx. A day of -1 is
/// treated as "missing" and replaced by 1; other days are clamped to the valid range for
/// the month. If the year or month cannot form a valid <see cref="DateTime"/> the entry is
/// still added, with <c>dateTime</c> left at <see cref="DateTime.MinValue"/>.
/// </remarks>
/// <param name="srcStr">Text the date was found in.</param>
/// <param name="textBounds">Location of the text; its Y value is compared with 40 for the top-of-page bump.</param>
/// <param name="matchFactor">Base score before bumps are applied.</param>
/// <param name="year">Two-digit or full year.</param>
/// <param name="month">Month number, 1..12.</param>
/// <param name="day">Day of month, or -1 when no day was found.</param>
/// <param name="bMonthFromChars">True when the month came from a month name.</param>
/// <param name="bRangeIndicatorFound">True when a range indicator was seen; adds 10 to the score.</param>
/// <param name="firstMatchPos">Index of the first matched character.</param>
/// <param name="lastMatchPos">Inclusive index of the last matched character.</param>
/// <param name="dateSrchInfo">Search options; only bPlusOneMonth is read here.</param>
/// <param name="pageNum">Page number stored on the result.</param>
/// <param name="datesResult">List that receives the new entry.</param>
private static void AddCompletedDateToList(string srcStr, DocRectangle textBounds, double matchFactor, int year, int month, int day, bool bMonthFromChars, bool bRangeIndicatorFound, int firstMatchPos, int lastMatchPos, DateSrchInfo dateSrchInfo, int pageNum, List<ExtractedDate> datesResult)
{
    double finalMatchFactor = matchFactor;
    ExtractedDate fd = new ExtractedDate();

    if (bRangeIndicatorFound)
    {
        finalMatchFactor += 10;
    }

    // Bump the match factor for dates in the top 40% of page - letterhead dates
    if (textBounds.Y < 40)
    {
        finalMatchFactor += MATCH_FACTOR_BUMP_FOR_TOP_40_PC_OF_PAGE;
    }

    // Year: expand two-digit years, reward anything else as a full year
    if (year < 80)
    {
        year += 2000;
        fd.yearWas2Digit = true;
    }
    else if (year < 100)
    {
        year += 1900;
        fd.yearWas2Digit = true;
    }
    else
    {
        finalMatchFactor += MATCH_FACTOR_BUMP_FOR_4_DIGIT_YEAR;
    }

    // Month
    if (bMonthFromChars)
    {
        finalMatchFactor += MATCH_FACTOR_BUMP_FOR_TEXT_MONTH;
    }

    // Check for bump (roll December over into January of the next year)
    if (dateSrchInfo.bPlusOneMonth)
    {
        month += 1;
        if (month > 12)
        {
            month = 1;
            year++;
        }
    }

    // Day
    if (day == -1)
    {
        day = 1;
        fd.dayWasMissing = true;
        finalMatchFactor += MATCH_FACTOR_BUMP_FOR_DAY_MISSING;
    }

    // Create datetime.
    // Fix: the original called DateTime.DaysInMonth outside its try block, so an
    // out-of-range year/month threw before the (bare, swallow-everything) catch could
    // fall back to DateTime.MinValue. An explicit range check now covers both the clamp
    // and the construction; with year, month and day all in range the constructor cannot throw.
    DateTime dt = DateTime.MinValue;
    bool yearInRange = (year >= DateTime.MinValue.Year) && (year <= DateTime.MaxValue.Year);
    bool monthInRange = (month >= 1) && (month <= 12);
    if (yearInRange && monthInRange)
    {
        int daysInMonth = DateTime.DaysInMonth(year, month);
        if (day > daysInMonth)
            day = daysInMonth;
        if (day < 1)
            day = 1;
        dt = new DateTime(year, month, day);
    }

    // Add date to list
    fd.foundInText = srcStr;
    fd.pageNum = pageNum;
    fd.posnInText = firstMatchPos;
    fd.matchLength = lastMatchPos - firstMatchPos + 1;
    fd.dateTime = dt;
    fd.dateMatchType = ExtractedDate.DateMatchType.LongDate;
    fd.locationOfDateOnPagePercent = textBounds;
    fd.matchFactor = finalMatchFactor;
    datesResult.Add(fd);
}