/// <summary>
/// Tries to resolve the layout field for <paramref name="fm"/> by testing it against each
/// cell-type field of the worksheet layout, stopping at the first candidate that matches.
/// </summary>
/// <param name="tcs">Worksheet cells that values are read from.</param>
/// <param name="fm">Title/value map to match; its <c>cellLoc.dataLayout</c> selects the matching strategy.</param>
/// <param name="sheetLayout">Sheet layout whose <c>wsLayout.fields</c> supply the candidate fields.</param>
/// <param name="stringTable">Shared string table passed through when reading cell values.</param>
/// <param name="formats">Cell formats passed through when reading cell values.</param>
/// <param name="agMaps">Receives the maps created when an aggregate cell is expanded successfully.</param>
/// <returns><c>true</c> when a match was found; otherwise <c>false</c>.</returns>
private bool MatchField(IEnumerable<Cell> tcs, FieldCellMap fm, SheetLayout sheetLayout, SharedStringTablePart stringTable, CellFormats formats, List<FieldCellMap> agMaps)
{
    var candidates = sheetLayout.wsLayout.fields.Where(f => f.fldType == FieldType.cell);
    foreach (var candidate in candidates)
    {
        try
        {
            bool matched = false;
            switch (fm.cellLoc.dataLayout)
            {
                case CellDataLayout.combined:
                    matched = MatchCombinedField(candidate, fm);
                    break;

                case CellDataLayout.separate:
                    matched = MatchSeparatedField(tcs, candidate, fm, stringTable, formats);
                    break;

                case CellDataLayout.aggregate:
                    // The cell holds several fields packed into one value.
                    matched = MatchAggregateCell(tcs, fm, sheetLayout, stringTable, formats, agMaps);
                    break;
            }

            if (matched)
            {
                return true;
            }
        }
        catch (Exception ex)
        {
            // A failure on one candidate is logged and the next candidate is tried.
            Log.New.Msg(ex);
        }
    }

    return false;
}

// Splits the aggregate title cell of fm into parts and builds one FieldCellMap per configured
// sub-map; the maps are appended to agMaps only when every sub-map could be resolved.
private bool MatchAggregateCell(IEnumerable<Cell> tcs, FieldCellMap fm, SheetLayout sheetLayout, SharedStringTablePart stringTable, CellFormats formats, List<FieldCellMap> agMaps)
{
    var aggCell = tcs.FirstOrDefault(c => c.CellReference == fm.cellLoc.TitleRef);
    var raw = Spreadsheet.GetCellValue(aggCell, stringTable.SharedStringTable, formats, null);

    if (raw == null)
    {
        return false;
    }
    if (!(fm.cellLoc.aggregateCellCnt > 0))
    {
        return false;
    }
    if (string.IsNullOrEmpty(fm.cellLoc.aggregateCellSeparator))
    {
        return false;
    }

    // The separator is applied as a regular-expression pattern.
    var parts = Regex.Split(raw.Trim(), fm.cellLoc.aggregateCellSeparator).ToList();
    if (!(parts.Count == fm.cellLoc.aggregateCellCnt))
    {
        return false;
    }

    var resolved = new List<FieldCellMap>();
    foreach (var sub in fm.cellLoc.cellMaps)
    {
        if (sub.dataLayout == CellDataLayout.combined)
        {
            // The part carries both title and value; resolve it through the combined matcher.
            var combinedMap = new FieldCellMap
            {
                Title = parts[sub.aggregateIdx].ToLower(),
                Value = parts[sub.aggregateIdx],
                versMap = fm.versMap,
                cellLoc = new CellLocation
                {
                    dataLayout = CellDataLayout.combined,
                    TitleRef = fm.cellLoc.TitleRef,
                    ValueRef = fm.cellLoc.ValueRef
                }
            };
            MatchField(tcs, combinedMap, sheetLayout, stringTable, formats, agMaps);
            if (combinedMap.field != null)
            {
                resolved.Add(combinedMap);
            }
        }
        else if (sub.dataLayout == CellDataLayout.lookup)
        {
            // The part is a bare value; its field is found by the configured lookup title.
            var lookupField = sheetLayout.wsLayout.fields.FirstOrDefault(f =>
                f.fldType == FieldType.cell
                && f.titles != null
                && f.titles.Select(t => t.ToLower()).Contains(sub.lookupString.ToLower()));

            if (lookupField != null)
            {
                resolved.Add(new FieldCellMap
                {
                    Title = sub.lookupString,
                    Value = parts[sub.aggregateIdx],
                    field = lookupField,
                    versMap = fm.versMap,
                    cellLoc = new CellLocation
                    {
                        dataLayout = CellDataLayout.separate,
                        TitleRef = fm.cellLoc.TitleRef,
                        ValueRef = fm.cellLoc.ValueRef
                    }
                });
            }
        }
    }

    if (resolved.Count != fm.cellLoc.cellMaps.Count())
    {
        return false;
    }

    agMaps.AddRange(resolved);
    return true;
}
/// <summary>
/// Scrapes the title and value cells of every cell layout version, matches the titles to the
/// layout's fields and stores the best acceptable version in <c>md.fldCellMap</c>.
/// </summary>
/// <param name="md">Match data to populate; <c>fldCellVersMaps</c>, <c>fldCellMap</c> and <c>matchCnt</c> are written.</param>
/// <param name="tcs">Worksheet cells that titles and values are read from.</param>
/// <param name="sheetLayout">Sheet layout providing the cell layouts and fields.</param>
/// <param name="stringTable">Shared string table passed through when reading cell values.</param>
/// <param name="formats">Cell formats passed through when reading cell values.</param>
/// <param name="file">Source file; its name and full path are used for fileName/filePath fields.</param>
/// <returns>The same <paramref name="md"/> instance.</returns>
public MatchData MatchCellLayouts(MatchData md, IEnumerable<Cell> tcs, SheetLayout sheetLayout, SharedStringTablePart stringTable, CellFormats formats, FileInfo file)
{
    // Obtain titles for all field cell layouts.
    md.fldCellVersMaps = new List<FieldCellVersionMap>();
    foreach (var fldLayout in sheetLayout.wsLayout.cellLayouts)
    {
        var fldLayoutVals = new List<FieldCellMap>();
        foreach (var cellLoc in fldLayout.cellLocations)
        {
            try
            {
                // Cells without a reference are skipped so they cannot fail the whole lookup.
                var cl = tcs.FirstOrDefault(cll => cll.CellReference != null && cll.CellReference.InnerText == cellLoc.TitleRef);
                var clVal = tcs.FirstOrDefault(clv => clv.CellReference != null && clv.CellReference.InnerText == cellLoc.ValueRef);

                var title = Spreadsheet.GetCellValue(cl, stringTable.SharedStringTable, formats, null);
                if (title != null)
                {
                    // Collapse whitespace runs, trim and lower-case so titles compare reliably.
                    title = System.Text.RegularExpressions.Regex.Replace(title, @"\s+", " ").Trim().ToLower();
                }
                var val = Spreadsheet.GetCellValue(clVal, stringTable.SharedStringTable, formats, null);

                fldLayoutVals.Add(new FieldCellMap
                {
                    cellLoc = cellLoc,
                    Title = string.IsNullOrWhiteSpace(title) ? null : title,
                    Value = string.IsNullOrWhiteSpace(val) ? null : val.Trim()
                });
            }
            catch (Exception ex)
            {
                Log.New.Msg(ex);
            }
        }
        md.fldCellVersMaps.Add(new FieldCellVersionMap { fldmaps = fldLayoutVals, fldLayout = fldLayout });
    }

    // Give every map a back-reference to the version it belongs to.
    foreach (var vm in md.fldCellVersMaps)
    {
        foreach (var vfm in vm.fldmaps)
        {
            vfm.versMap = vm;
        }
    }

    // These lookups do not depend on the layout version, so they are resolved once.
    var reqFlds = sheetLayout.wsLayout.fields.Where(sf => sf.fldType == FieldType.cell && sf.isRequired).ToList();
    var fileName = sheetLayout.wsLayout.fields.FirstOrDefault(fld => fld.fldType == FieldType.fileName);
    var filePath = sheetLayout.wsLayout.fields.FirstOrDefault(fld => fld.fldType == FieldType.filePath);

    // Match titles to layout fields.
    foreach (var flvv in md.fldCellVersMaps)
    {
        var agMaps = new List<FieldCellMap>();
        foreach (var fm in flvv.fldmaps.Where(m => m.Title != null))
        {
            MatchField(tcs, fm, sheetLayout, stringTable, formats, agMaps);
        }
        flvv.fldmaps.AddRange(agMaps);

        // Only keep maps whose layout carries data (combined or separate).
        flvv.fldmaps = flvv.fldmaps
            .Where(fm => fm.cellLoc != null
                && (fm.cellLoc.dataLayout == CellDataLayout.combined || fm.cellLoc.dataLayout == CellDataLayout.separate))
            .ToList();

        // Add a filename map if the field exists.
        if (fileName != null)
        {
            flvv.fldmaps.Add(new FieldCellMap { Title = FieldType.fileName.ToString(), field = fileName, Value = file.Name });
        }

        // Add a filePath map if the field exists.
        if (filePath != null)
        {
            flvv.fldmaps.Add(new FieldCellMap { Title = FieldType.filePath.ToString(), field = filePath, Value = file.FullName });
        }

        // Compute how well the matching went.
        var matchedFields = flvv.fldmaps.Select(fm => fm.field).ToList();
        flvv.noneNullTitleCnt = flvv.fldmaps.Count(fm => !string.IsNullOrWhiteSpace(fm.Title));
        flvv.noMatchCnt = flvv.fldmaps.Count(fm => fm.field == null);
        flvv.missingReqFldCnt = reqFlds.Count(rf => !matchedFields.Contains(rf));
        flvv.noReqValCnt = flvv.fldmaps.Count(fm => fm.field != null && fm.field.isRequired && string.IsNullOrWhiteSpace(fm.Value));
        flvv.noValCnt = flvv.fldmaps.Count(fm => fm.field != null && string.IsNullOrWhiteSpace(fm.Value));
    }

    // Find the best acceptable layout match: fewest empty values first, then the most maps.
    md.fldCellMap = md.fldCellVersMaps
        .Where(fl => fl.noMatchCnt == 0 && fl.noReqValCnt == 0 && fl.missingReqFldCnt == 0)
        .OrderBy(fl => fl.noValCnt)
        .ThenByDescending(fl => fl.fldmaps.Count)
        .FirstOrDefault();

    md.matchCnt += md.fldCellMap != null ? 1 : 0;
    return md;
}
/// <summary>
/// Scrapes the column titles of every column layout version, matches them to the layout's
/// column fields and stores the best acceptable version in <c>md.fldColMap</c>.
/// </summary>
/// <param name="md">Match data to populate; <c>fldColVersMaps</c>, <c>fldColMap</c> and <c>matchCnt</c> are written.</param>
/// <param name="tcs">Worksheet cells that titles and first-row values are read from.</param>
/// <param name="sheetLayout">Sheet layout providing the column layouts and fields.</param>
/// <param name="stringTable">Shared string table passed through when reading cell values.</param>
/// <param name="formats">Cell formats passed through when reading cell values.</param>
/// <param name="file">Source file; not used by this method.</param>
/// <returns>The same <paramref name="md"/> instance.</returns>
public MatchData MatchColLayouts(MatchData md, IEnumerable<Cell> tcs, SheetLayout sheetLayout, SharedStringTablePart stringTable, CellFormats formats, FileInfo file)
{
    // Output orders of the column fields, ascending; a field's index here is its expected position.
    var field_ord = sheetLayout.wsLayout.fields
        .Where(c => c.fldType == FieldType.column)
        .OrderBy(c => c.OutputOrder)
        .Select(c => c.OutputOrder)
        .ToList();

    // Obtain column titles for all layout versions, largest version first.
    md.fldColVersMaps = new List<FieldColumnVersionMap>();
    foreach (var sig in sheetLayout.wsLayout.colLayouts.OrderByDescending(scl => scl.titleLocations.Count()))
    {
        var fldColMaps = new List<FieldColumnMap>();
        var col_ord = sig.titleLocations.OrderBy(so => so.col).Select(so => so.col).ToList();
        foreach (var colLayout in sig.titleLocations)
        {
            // A column may have several title cells; their texts are concatenated into the
            // title that is matched against the configured field titles.
            string title = "";
            foreach (var c in colLayout.cellRefs)
            {
                var cl = tcs.FirstOrDefault(cll => cll.CellReference != null && cll.CellReference.InnerText == c);
                title += Spreadsheet.GetCellValue(cl, stringTable.SharedStringTable, formats, null);
            }
            title = System.Text.RegularExpressions.Regex.Replace(title.Replace('\n', ' '), @"\s+", " ").Trim().ToLower();
            fldColMaps.Add(new FieldColumnMap { column = colLayout.col, title = title, col_order = col_ord.IndexOf(colLayout.col) });
        }
        md.fldColVersMaps.Add(new FieldColumnVersionMap { colLayout = sig, colmaps = fldColMaps });
    }

    // Normalise the configured titles once instead of once per column map and version.
    var titledFields = sheetLayout.wsLayout.fields
        .Where(cc => cc.fldType == FieldType.column && cc.titles != null)
        .Select(cc => new
        {
            field = cc,
            titles = cc.titles
                .Where(t => t != null)
                .Select(t => System.Text.RegularExpressions.Regex.Replace(t, @"\s+", " ").ToLower())
                .ToList()
        })
        .ToList();

    // Match the titles to the data columns.
    foreach (var fcvm in md.fldColVersMaps)
    {
        foreach (var cm in fcvm.colmaps)
        {
            try
            {
                var hit = titledFields.FirstOrDefault(tf => tf.titles.Contains(cm.title));
                cm.field = hit != null ? hit.field : null;

                // For a required field that must be verified, check that the first data row has a value.
                if (sheetLayout.wsLayout.verifyFirstRowData && cm.field != null && cm.field.isRequired)
                {
                    bool rowCol = fcvm.colLayout.colLayoutType == ColLayoutType.Row_Col;
                    int col = rowCol ? cm.column : fcvm.colLayout.FirstRow;
                    int row = rowCol ? fcvm.colLayout.FirstRow : cm.column;
                    var valRef = Spreadsheet.GetCellRef(row, col);
                    var clVal = tcs.FirstOrDefault(clv => clv.CellReference != null && clv.CellReference.InnerText == valRef);
                    var val = Spreadsheet.GetCellValue(clVal, stringTable.SharedStringTable, formats, cm.field);
                    if (!string.IsNullOrWhiteSpace(val))
                    {
                        cm.hasValue = true;
                        cm.firstRowVal = val;
                    }
                }

                cm.field_order = cm.field != null ? field_ord.IndexOf(cm.field.OutputOrder) : -9999;
            }
            catch (Exception ex)
            {
                Log.New.Msg(ex);
            }
        }

        // Match by neighbour: a field located "by related" is assigned to the column that
        // directly follows the column of its related (parent) field. All lookups are completed
        // before any assignment, and pairs without a parent or without a following column are
        // skipped.
        var relatedTargets = sheetLayout.wsLayout.fields
            .Where(f => f.locType == LocateType.byRelated)
            .Select(fr => new
            {
                fr,
                parent = fcvm.colmaps.FirstOrDefault(rcm => rcm.field != null && rcm.field.OutputOrder == fr.RelatedCol)
            })
            .Where(p => p.parent != null)
            .Select(p => new
            {
                p.fr,
                neighbor = fcvm.colmaps.FirstOrDefault(fcm => fcm.col_order == p.parent.col_order + 1)
            })
            .Where(p => p.neighbor != null)
            .ToList();

        foreach (var target in relatedTargets)
        {
            target.neighbor.field = target.fr;
            target.neighbor.field_order = target.fr.OutputOrder;
        }

        // Compute how well the matching went.
        var matched = fcvm.colmaps.Where(dm => dm.field != null).ToList();
        fcvm.notNullTitleCnt = fcvm.colmaps.Count(cm => !string.IsNullOrWhiteSpace(cm.title));
        fcvm.noMatchCnt = fcvm.colmaps.Count(cm => cm.field == null);
        fcvm.ReqNoValCnt = matched.Count(cm => sheetLayout.wsLayout.verifyFirstRowData && cm.field.isRequired && !cm.hasValue);
        // Disorder is the sum of squared distances between expected and actual positions.
        fcvm.disOrder = (int)matched.Select(dm => Math.Pow((dm.field_order - dm.col_order), 2)).Sum();
        fcvm.colDups = matched.GroupBy(cd => cd.field).Count(d => d.Count() > 1);
    }

    // Only accept versions with zero mismatches and no duplicate fields; prefer the most
    // non-empty titles and, among those, the lowest disorder.
    md.fldColMap = md.fldColVersMaps
        .Where(sv => sv.noMatchCnt == 0
            && sv.colDups == 0
            && (!sheetLayout.wsLayout.verifyFirstRowData || sv.ReqNoValCnt == 0))
        .OrderByDescending(sv => sv.notNullTitleCnt)
        .ThenBy(sv => sv.disOrder)
        .FirstOrDefault();

    md.matchCnt += md.fldColMap != null ? 1 : 0;
    return md;
}
/// <summary>
/// Matches a worksheet against the column and/or cell layouts of <paramref name="sheetLayout"/>
/// and, when the match passes, records the worksheet as one of the layout's source worksheets.
/// </summary>
/// <remarks>
/// An Excel spreadsheet contains one or more worksheets, each of which may or may not contain data of interest.
///  - Excel is a terrible way to collect data from a large number of different sources in a consistent and reliable way.
///  - Be that as it may, Excel is favored by organizations that prefer manpower over automation when performing data processing tasks.
///  - Most state agencies are typical of this kind of organization.
///  - To top it off, most of these agencies give little thought to gathering data in a consistent way, so we are likely to
///    receive a dump of spreadsheets with a variety of inconsistencies.
///
/// As varied as these spreadsheets may be, a spreadsheet is expected to contain only a single type of related data set, which is
/// called a DataSourceType in this application.
///
/// A DataSourceType describes how to process the worksheets in a spreadsheet. It indicates:
///  - the name of the file to save extracted data.
///  - a list of DataWorkSheets.
///  - an indicator as to how to process the spreadsheet against the list of DataWorkSheets:
///
///     • MatchAllDataWorkSheets
///         - There must be a one-to-one correspondence between each DataWorkSheet and each spreadsheet worksheet, in order.
///         - The DataWorkSheet name must match the spreadsheet worksheet name.
///
///     • MatchByClosestWorkSheetLayout
///         - Each spreadsheet worksheet will be matched against the closest DataWorkSheet/WorkSheetLayout.
///
/// A DataWorkSheet has
///  - a Name to be used when processing the spreadsheet by MatchAllDataWorkSheets.
///  - a WorkSheetLayout.
///
/// A WorkSheetLayout is
///  - a collection of field layout versions with additional information about how to determine where to look
///    for field cells on the spreadsheet.
///  - a collection of data columns with additional information about how to determine where to look
///    for the data column on the worksheet.
///
///  - Each data column in the collection has associated with it a list of column titles that should
///    map from the worksheet to the data column.
///
///  - The WorkSheetLayout also includes a collection of column layout versions. Each of these is a list of cells
///    that should be scraped for strings that are concatenated into a column title, and the column to associate
///    the title with.
///
///  - For a given worksheet all layouts are processed.
///
/// There is a layout of column title cells that will be scraped for column titles. Those titles
/// are then matched to a list of titles for a given data column. The assumption is that all titles to match
/// are unique across all data columns for a given WorkSheetLayout.
/// </remarks>
/// <param name="ws">Worksheet whose cells are matched.</param>
/// <param name="sheetLayout">Layout to match against; its <c>wsLayout.layoutType</c> selects which matchers run.</param>
/// <param name="stringTable">Shared string table passed through to the matchers.</param>
/// <param name="formats">Cell formats passed through to the matchers.</param>
/// <param name="file">Source file passed through to the matchers.</param>
/// <returns>The match data, with <c>isPass</c> set when every requested layout kind was matched.</returns>
public MatchData MatchLayouts(Worksheet ws, SheetLayout sheetLayout, SharedStringTablePart stringTable, CellFormats formats, FileInfo file)
{
    var md = new MatchData();

    // All cells in the worksheet.
    var tcs = ws.Descendants<Cell>();

    var layoutType = sheetLayout.wsLayout.layoutType;
    if (layoutType == LayoutType.Both)
    {
        // Both kinds must match for the worksheet to pass.
        MatchData colResult = MatchColLayouts(md, tcs, sheetLayout, stringTable, formats, file);
        MatchData cellResult = MatchCellLayouts(md, tcs, sheetLayout, stringTable, formats, file);
        md.fldCellMap = cellResult.fldCellMap;
        md.fldColMap = colResult.fldColMap;
        md.isPass = md.fldColMap != null && md.fldCellMap != null;
    }
    else if (layoutType == LayoutType.CellOnly)
    {
        MatchData cellResult = MatchCellLayouts(md, tcs, sheetLayout, stringTable, formats, file);
        md.fldCellMap = cellResult.fldCellMap;
        md.isPass = md.fldCellMap != null;
    }
    else if (layoutType == LayoutType.ColumnOnly)
    {
        MatchData colResult = MatchColLayouts(md, tcs, sheetLayout, stringTable, formats, file);
        md.fldColMap = colResult.fldColMap;
        md.isPass = md.fldColMap != null;
    }

    if (!md.isPass)
    {
        return md;
    }

    // Remember the worksheet as a source for this layout.
    if (sheetLayout.srcWorksheets == null)
    {
        sheetLayout.srcWorksheets = new List<Worksheet>();
    }
    sheetLayout.srcWorksheets.Add(ws);
    return md;
}
/// <summary>
/// Reads every non-empty cell that lies in a mapped column at or after the layout's first data
/// row and adds it to the sheet layout's data set.
/// </summary>
/// <param name="sLayout">Sheet layout holding the matched column map and the target data set.</param>
/// <param name="wsp">Worksheet part whose cells are read.</param>
private void ProcessCells(SheetLayout sLayout, WorksheetPart wsp)
{
    // Field for each mapped column number.
    var fieldsByColumn = new Dictionary<int, Field>();
    foreach (var colMap in sLayout.matchData.fldColMap.colmaps)
    {
        fieldsByColumn.Add(colMap.column, colMap.field);
    }

    // Col_Row layouts are transposed: the sheet's row number acts as the column and vice versa.
    bool transposed;
    switch (sLayout.matchData.fldColMap.colLayout.colLayoutType)
    {
        case ColLayoutType.Row_Col:
            transposed = false;
            break;
        case ColLayoutType.Col_Row:
            transposed = true;
            break;
        default:
            // Any other layout type yields no cells.
            return;
    }

    foreach (var cell in wsp.Worksheet.Descendants<Cell>())
    {
        if (cell.InnerText.Length == 0)
        {
            continue;
        }

        int sheetRow = GetRowNum(cell.CellReference.InnerText);
        int sheetCol = GetColumn(cell.CellReference.InnerText);
        int row = transposed ? sheetCol : sheetRow;
        int col = transposed ? sheetRow : sheetCol;

        if (row < sLayout.matchData.fldColMap.colLayout.FirstRow || !fieldsByColumn.ContainsKey(col))
        {
            continue;
        }

        try
        {
            // stringTable and cellFormats are not parameters; they come from the enclosing type.
            string sval = GetCellValue(cell, stringTable.SharedStringTable, cellFormats, fieldsByColumn[col]);
            sLayout.dataSet.AddCell(new DataCellValue
            {
                CellReference = cell.CellReference.InnerText,
                rowNumber = row,
                colNumber = col,
                field = fieldsByColumn[col],
                Value = sval
            });
        }
        catch (Exception ex)
        {
            // A cell that cannot be read or added is logged and skipped.
            Log.New.Msg(ex);
        }
    }
}
/// <summary>
/// Creates a data set for <paramref name="layout"/> and caches the output orders of the
/// layout's required column fields in <c>RequiredCols</c>.
/// </summary>
/// <param name="layout">Sheet layout this data set belongs to.</param>
public DataSet(SheetLayout layout)
{
    sLayout = layout;

    // Required fields of type column, identified by their OutputOrder.
    var requiredColumnOrders =
        from fld in sLayout.wsLayout.fields
        where fld.isRequired && fld.fldType == FieldType.column
        select fld.OutputOrder;

    RequiredCols = requiredColumnOrders.ToList();
}