/// <summary>
/// Picks a single field for a column from a sample of its cell values.
/// Every non-empty value votes for one field; the votes are handed to FindMin.
/// </summary>
/// <param name="words">Cell texts sampled from the column.</param>
/// <returns>The field selected by FindMin over the collected votes.</returns>
public static DeclarationField PredictByStrings(List <string> words)
{
    // Votes are stored as negative counts so that FindMin can be used to pick
    // the winner (presumably the most frequent field; FindMin is not visible here).
    var negativeFreqs = new Dictionary <DeclarationField, double>();

    foreach (string word in words)
    {
        if (DataHelper.IsEmptyValue(word))
        {
            continue;
        }

        // First try to read the value as if it were a header title,
        // and only then fall back to the per-string predictor.
        var votedField = HeaderHelpers.TryGetField("", word);
        if (votedField == DeclarationField.None)
        {
            votedField = PredictByString(word);
        }

        // TryGetValue leaves 0 in "current" for an unseen key, so the first vote becomes -1.
        double current;
        negativeFreqs.TryGetValue(votedField, out current);
        negativeFreqs[votedField] = current - 1;
    }

    return FindMin(negativeFreqs);
}
/// <summary>
/// Cheap heuristic that decides whether a row of cells looks like a table header.
/// Only the first four cells are inspected, and rows with fewer than three cells are rejected.
/// </summary>
/// <param name="adapter">Table adapter, used to look up the sub-cells under a cell.</param>
/// <param name="cells">Cells of the candidate header row.</param>
/// <returns>true when one of the inspected cells matches a header pattern; false otherwise.</returns>
public static bool WeakHeaderCheck(IAdapter adapter, List <Cell> cells)
{
    if (cells.Count < 3)
    {
        return false;
    }

    // Indices 0..3 are the same cells the original counter-based loop visited.
    for (int index = 0; index < cells.Count && index <= 3; index++)
    {
        var cell = cells[index];

        // A numero sign only counts in the very first column.
        if (index == 0 && HeaderHelpers.IsNumeroSign(cell.Text))
        {
            return true;
        }

        if (HeaderHelpers.IsName(cell.Text))
        {
            return true;
        }

        // An "owned"/"state" group title counts only when at least three sub-cells sit under it.
        bool isOwnershipTitle = HeaderHelpers.HasOwnedString(cell.Text) || HeaderHelpers.HasStateString(cell.Text);
        if (isOwnershipTitle && FindSubcellsUnder(adapter, cell).Count >= 3)
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Heuristically decides whether a string looks like a person name.
/// </summary>
/// <remarks>
/// Rules, in order:
/// 1. any dot in the string is accepted outright (presumably initials such as "Ivanov I.I." - confirm);
/// 2. a string without inner whitespace is rejected;
/// 3. a last word that can be a patronymic is accepted;
/// 4. exactly three words are accepted;
/// 5. otherwise the answer comes from ColumnByDataPredictor.PredictByString.
/// </remarks>
/// <param name="s">Candidate text.</param>
/// <returns>true when the text is considered a person name.</returns>
public static bool CheckPersonName(String s)
{
    if (s.Contains('.'))
    {
        return true;
    }

    bool hasSpaces = s.Trim().Any(Char.IsWhiteSpace);
    if (!hasSpaces)
    {
        return false;
    }

    // Regex.Split emits an empty token when the input starts or ends with a delimiter
    // (e.g. "Ivanov Ivan " -> ["Ivanov", "Ivan", ""]). Those phantom tokens used to hide
    // the real last word from the patronymic check and inflate the word count, so a
    // two-word string with a trailing space passed as a "three-word" name. Drop them.
    string[] words = Regex.Split(s, @"[\,\s\n]+").Where(w => w.Length > 0).ToArray();

    // words can be empty for input made only of delimiters, e.g. ", ,".
    if (words.Length > 0 && TextHelpers.CanBePatronymic(words[words.Length - 1]))
    {
        return true;
    }

    if (words.Length != 3)
    {
        var predictedField = ColumnByDataPredictor.PredictByString(s);
        if (!HeaderHelpers.IsNameDeclarationField(predictedField))
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Tells whether any field registered in ColumnOrder is a name field
/// according to HeaderHelpers.IsNameDeclarationField.
/// </summary>
/// <returns>true when at least one such field is present; false otherwise.</returns>
public bool HasNameColumn()
{
    bool found = false;

    foreach (var field in ColumnOrder.Keys)
    {
        if (HeaderHelpers.IsNameDeclarationField(field))
        {
            // Stop at the first match, no need to look at the remaining keys.
            found = true;
            break;
        }
    }

    return found;
}
/// <summary>
/// Detects the special abridged three-column format used by Moscow courts
/// (see sud_2016.doc in the test cases): name, occupation, declared yearly income.
/// </summary>
/// <param name="cells">Cells of the candidate header row.</param>
/// <returns>
/// true only when there are exactly three cells and they match, in order,
/// the name, occupation and yearly-income header checks.
/// </returns>
public static bool IsNamePositionAndIncomeTable(List <Cell> cells)
{
    if (cells.Count != 3)
    {
        return false;
    }

    // The checks run left to right and stop at the first column that does not match.
    if (!HeaderHelpers.IsName(cells[0].Text))
    {
        return false;
    }

    if (!HeaderHelpers.IsOccupation(cells[1].Text))
    {
        return false;
    }

    return HeaderHelpers.IsDeclaredYearlyIncome(cells[2].Text);
}
/// <summary>
/// Maps every header cell to a DeclarationField and registers it in the given table header.
/// Titles that are empty or too short are classified by looking at the column data instead.
/// </summary>
/// <param name="adapter">Table adapter that gives access to rows and cells.</param>
/// <param name="cells">Header cells to classify.</param>
/// <param name="columnOrdering">Table header that receives the recognized columns via AddColumn.</param>
/// <exception cref="SmartParserException">
/// A title of normal length was recognized neither by its text nor (for titles of up to four characters) by its data.
/// </exception>
/// <exception cref="ColumnDetectorException">
/// A too-short title could not be predicted from data and its text is not an empty value.
/// </exception>
static public void MapColumnTitlesToInnerConstants(IAdapter adapter, List <Cell> cells, TableHeader columnOrdering)
{
    foreach (var headerCell in cells)
    {
        string title = headerCell.GetText(true);
        Logger.Debug(string.Format("column title: \"{0}\"[{1}]", title.ReplaceEolnWithSpace().CoalesceWhitespace(), headerCell.CellWidth));

        // Title with every absence marker removed, used only to judge how much real text there is.
        string strippedTitle = title;
        foreach (var marker in AbsenceMarkers)
        {
            strippedTitle = strippedTitle.Replace(marker, "");
        }
        strippedTitle = strippedTitle.Trim();

        // A cell merged over as many rows as the table has leaves no data rows below it.
        if (adapter.GetRowsCount() == headerCell.MergedRowsCount)
        {
            continue;
        }

        DeclarationField field;
        bool titleTooShort = (title == "" || strippedTitle.Length <= 1) && (title != "№");

        if (titleTooShort)
        {
            // too short title, try to predict by values
            field = ColumnByDataPredictor.PredictEmptyColumnTitle(adapter, headerCell);
            Logger.Debug("Predict: " + field.ToString());
        }
        else
        {
            field = HeaderHelpers.TryGetField(headerCell.TextAbove, title);

            // Short unrecognized titles get a second chance through the data predictor.
            if (field == DeclarationField.None && strippedTitle.Length <= 4)
            {
                field = ColumnByDataPredictor.PredictEmptyColumnTitle(adapter, headerCell);
                Logger.Debug("Predict: " + field.ToString());
            }

            if (field == DeclarationField.None)
            {
                throw new SmartParserException(String.Format("Cannot recognize field \"{0}\"", title.Replace('\n', ' ')));
            }
        }

        if (field == DeclarationField.None && !DataHelper.IsEmptyValue(title))
        {
            throw new ColumnDetectorException(String.Format("Fail to detect column type row: {0} title:{1}", headerCell.Row, title));
        }

        if (ColumnByDataPredictor.CalcPrecision)
        {
            ColumnByDataPredictor.PredictForPrecisionCheck(adapter, headerCell, field);
        }

        AddColumn(columnOrdering, field, headerCell);

        // In name-only mode the remaining columns are skipped once a name column is registered.
        if (TableHeader.SearchForFioColumnOnly && HeaderHelpers.IsNameDeclarationField(field))
        {
            break;
        }
    }
}
/// <summary>
/// Predicts the field of a column whose title is missing or too short,
/// using the texts of the cells located under the header cell.
/// </summary>
/// <remarks>
/// At most <c>maxRowToCollect</c> loop iterations are spent; section rows and
/// number rows consume an iteration without contributing a sample.
/// </remarks>
/// <param name="adapter">Table adapter that gives access to rows and cells.</param>
/// <param name="headerCell">Header cell of the column to classify.</param>
/// <returns>The predicted field; the result of PredictByStrings when no special rule applies.</returns>
public static DeclarationField PredictEmptyColumnTitle(IAdapter adapter, Cell headerCell)
{
    List <string> texts = new List <string>();
    int rowIndex = headerCell.Row + headerCell.MergedRowsCount;
    const int maxRowToCollect = 10;
    int numbers = 0;
    int not_numbers = 0;

    // The row bound is checked BEFORE a row is read. Previously it was checked only at the
    // end of each iteration, so a header ending on the last table row caused the first
    // iteration to query a row past the end of the table.
    for (int i = 0; i < maxRowToCollect && rowIndex < adapter.GetRowsCount(); i++)
    {
        var cells = adapter.GetDataCells(rowIndex, IAdapter.MaxColumnsCount);
        string dummy;
        if (adapter.IsSectionRow(rowIndex, cells, adapter.GetColsCount(), false, out dummy))
        {
            rowIndex += 1;
        }
        else if (IAdapter.IsNumbersRow(cells))
        {
            rowIndex += 1;
        }
        else
        {
            var c = adapter.GetCell(rowIndex, headerCell.Col);
            if (c != null)
            {
                var txt = c.GetText(true);
                if (txt.Length > 0)
                {
                    texts.Add(txt);
                    int d;
                    if (int.TryParse(txt, out d))
                    {
                        numbers += 1;
                    }
                    else
                    {
                        not_numbers += 1;
                    }
                }
                // Jump over the whole vertically merged cell.
                rowIndex += c.MergedRowsCount;
            }
            else
            {
                rowIndex += 1;
            }
        }
    }

    var field = DeclarationField.None;
    if (texts.Count == 1 && headerCell.Col == 0 && TextHelpers.CanBePatronymic(texts[0]))
    {
        // not enough data, if texts.Count == 1
        field = DeclarationField.NameOrRelativeType;
    }
    else if (headerCell.Col == 0 && numbers > not_numbers)
    {
        // Mostly integers in the first column.
        field = DeclarationField.DeclarantIndex;
    }
    else
    {
        field = PredictByStrings(texts);
        if (field == DeclarationField.NameOrRelativeType && String.Join(" ", texts).Contains(","))
        {
            field = DeclarationField.NameAndOccupationOrRelativeType;
        }
    }

    if (headerCell.TextAbove != null && ((field & DeclarationField.AllOwnTypes) > 0))
    {
        string h = headerCell.TextAbove;
        // The ownership bits are cleared and then set again from the text above the header.
        field &= ~DeclarationField.AllOwnTypes;
        if (HeaderHelpers.IsMixedColumn(h))
        {
            field |= DeclarationField.Mixed;
        }
        else if (HeaderHelpers.IsStateColumn(h))
        {
            field |= DeclarationField.State;
        }
        else if (HeaderHelpers.IsOwnedColumn(h))
        {
            field |= DeclarationField.Owned;
        }
    }

    if (field == DeclarationField.NameOrRelativeType)
    {
        if (TextHelpers.MayContainsRole(String.Join(" ", texts)))
        {
            field = DeclarationField.NameAndOccupationOrRelativeType;
        }
    }

    Logger.Debug(string.Format("predict by {0} -> {1}", String.Join("\\n", texts), field));
    return field;
}