Пример #1
0
		/// <summary>
		///		Scrapes the column titles of every column layout version in <paramref name="sheetLayout"/>, maps each title to a
		///		column field, scores every version and stores the best fully matched version in <c>md.fldColMap</c>.
		/// </summary>
		/// <param name="md">Match data to fill in; the same instance is returned.</param>
		/// <param name="tcs">The cells that are searched for title cells and first-row value cells.</param>
		/// <param name="sheetLayout">Layout whose <c>wsLayout</c> provides the fields and the column layout versions.</param>
		/// <param name="stringTable">Part whose <c>SharedStringTable</c> is handed to <c>Spreadsheet.GetCellValue</c>.</param>
		/// <param name="formats">Cell formats handed to <c>Spreadsheet.GetCellValue</c>.</param>
		/// <param name="file">Not used by this method.</param>
		/// <returns>
		///		<paramref name="md"/> with <c>fldColVersMaps</c> rebuilt, <c>fldColMap</c> set to the selected version (or null when
		///		no version qualifies) and <c>matchCnt</c> incremented when a version was selected.
		/// </returns>
		public MatchData MatchColLayouts(MatchData md, IEnumerable<Cell> tcs, SheetLayout sheetLayout, SharedStringTablePart stringTable, CellFormats formats, FileInfo file)
		{
			// Index the cells by reference once so every lookup below is a dictionary hit instead of a scan over
			// all cells. The first cell seen for a reference wins (same result as FirstOrDefault); cells that carry
			// no reference are skipped instead of raising a NullReferenceException.
			var cellsByRef = new Dictionary<string, Cell>();
			foreach (var cell in tcs)
			{
				string cellRef = cell.CellReference != null ? cell.CellReference.InnerText : null;
				if (cellRef != null && !cellsByRef.ContainsKey(cellRef))
				{
					cellsByRef.Add(cellRef, cell);
				}
			}

			// Returns the cell for a reference, or null when the reference is null or not present.
			Func<string, Cell> findCell = reference =>
			{
				Cell found;
				return reference != null && cellsByRef.TryGetValue(reference, out found) ? found : null;
			};

			var wsLayout = sheetLayout.wsLayout;

			// Output orders of the column fields, ascending. A field's index in this list is compared with the
			// column's index (col_order) when the disorder of a version is computed.
			var field_ord = wsLayout.fields
							.Where(c => c.fldType == FieldType.column)
							.OrderBy(c => c.OutputOrder)
							.Select(c => c.OutputOrder)
							.ToList();

			// Obtain column titles for all signature versions.
			md.fldColVersMaps = new List<FieldColumnVersionMap>();

			// For each column layout version scrape the worksheet for column title values.
			foreach (var sig in wsLayout.colLayouts.OrderByDescending(scl => scl.titleLocations.Count()))
			{
				var fldColMaps = new List<FieldColumnMap>();

				// Columns of this version, ascending; a column's index in this list is its col_order.
				var col_ord = sig.titleLocations.OrderBy(so => so.col).Select(so => so.col).ToList();

				foreach (var colLayout in sig.titleLocations)
				{
					// A column may have a number of title cells that must be scraped and concatenated to produce
					// the title used for matching to data columns.
					string title = "";
					foreach (var c in colLayout.cellRefs)
					{
						var tlt = Spreadsheet.GetCellValue(findCell(c), stringTable.SharedStringTable, formats, null);
						title += tlt;
					}

					// Collapse all whitespace runs (including line breaks) to single spaces and lower-case the title.
					title = System.Text.RegularExpressions.Regex.Replace(title.Replace('\n', ' '), @"\s+", " ").Trim().ToLower();
					fldColMaps.Add(new FieldColumnMap { column = colLayout.col, title = title, col_order = col_ord.IndexOf(colLayout.col) });
				}

				md.fldColVersMaps.Add(new FieldColumnVersionMap { colLayout = sig, colmaps = fldColMaps });
			}

			// Match the titles to the DataColumns
			foreach (var fcvm in md.fldColVersMaps)
			{
				foreach (var cm in fcvm.colmaps)
				{
					try
					{
						// TODO: Performance improvement if config strings are pre-processed for whitespace and case.
						cm.field = wsLayout.fields
										.Where(cc => cc.fldType == FieldType.column && cc.titles != null)
										.FirstOrDefault(cc => cc.titles
																.Select(t => System.Text.RegularExpressions.Regex.Replace(t, @"\s+", " ").ToLower())
																.Contains(cm.title));

						// if required field and should be verified then check that first row has value
						if (wsLayout.verifyFirstRowData && cm.field != null && cm.field.isRequired)
						{
							// For Row_Col layouts cm.column is a column and FirstRow a row; otherwise the two are swapped.
							bool isRowCol = fcvm.colLayout.colLayoutType == ColLayoutType.Row_Col;
							int col = isRowCol ? cm.column : fcvm.colLayout.FirstRow;
							int row = isRowCol ? fcvm.colLayout.FirstRow : cm.column;

							var valRef = Spreadsheet.GetCellRef(row, col);
							var val = Spreadsheet.GetCellValue(findCell(valRef), stringTable.SharedStringTable, formats, cm.field);

							if (!string.IsNullOrWhiteSpace(val))
							{
								cm.hasValue = true;
								cm.firstRowVal = val;
							}
						}

						// -9999 marks a column whose title matched no field.
						cm.field_order = cm.field != null ? field_ord.IndexOf(cm.field.OutputOrder) : -9999;
					}
					catch (Exception ex)
					{
						// Best effort: a failure on one column is logged and the remaining columns are still processed.
						Log.New.Msg(ex);
					}
				}

				// Match by neighbor: a field located "byRelated" is assigned to the column that directly follows
				// the column matched to its related (parent) field.
				// All lookups are materialised before any column map is modified, and parents without a following
				// column are dropped (previously that case dereferenced null and aborted the whole match).
				var neighbours = wsLayout.fields
									.Where(f => f.locType == LocateType.byRelated)
									.Select(fr => new { fr = fr, cm = fcvm.colmaps.FirstOrDefault(rcm => rcm.field != null && rcm.field.OutputOrder == fr.RelatedCol) })
									.Where(rp => rp.cm != null)
									.Select(rp => new { fr = rp.fr, rc = fcvm.colmaps.FirstOrDefault(fcm => fcm.col_order == rp.cm.col_order + 1) })
									.Where(rp => rp.rc != null)
									.ToList();

				foreach (var neighbour in neighbours)
				{
					neighbour.rc.field = neighbour.fr;
					// NOTE(review): for title-matched columns field_order is an index into field_ord, here it is the
					// raw OutputOrder - confirm this is intended, it feeds the disorder score below.
					neighbour.rc.field_order = neighbour.fr.OutputOrder;
				}

				// Score the version.
				fcvm.notNullTitleCnt = fcvm.colmaps.Count(m => !string.IsNullOrWhiteSpace(m.title));
				fcvm.noMatchCnt = fcvm.colmaps.Count(m => m.field == null);
				fcvm.ReqNoValCnt = fcvm.colmaps.Count(m => m.field != null && wsLayout.verifyFirstRowData && m.field.isRequired && !m.hasValue);
				// Sum of squared distances between where a field should be and where its column is.
				fcvm.disOrder = (int)fcvm.colmaps.Where(m => m.field != null).Select(m => Math.Pow((m.field_order - m.col_order), 2)).Sum();
				// Number of fields that were assigned to more than one column.
				fcvm.colDups = fcvm.colmaps.Where(m => m.field != null).GroupBy(m => m.field).Count(g => g.Count() > 1);
			}

			// Only match col layout versions with zero mismatch, favoring the version with the most titles and,
			// among those, the lowest disorder (ThenBy - the previous ThenByDescending picked the highest).
			var colLayout_v = md.fldColVersMaps
								.Where(sv => sv.noMatchCnt == 0 && sv.colDups == 0 && (!wsLayout.verifyFirstRowData || sv.ReqNoValCnt == 0))
								.OrderByDescending(sv => sv.notNullTitleCnt)
								.ThenBy(sv => sv.disOrder)
								.FirstOrDefault();

			md.fldColMap = colLayout_v;

			md.matchCnt += md.fldColMap != null ? 1 : 0;

			return md;
		}
Пример #2
0
		/// <summary>
		///		Reads the title and value cell of every cell location in every cell layout version of
		///		<paramref name="sheetLayout"/>, matches the titles to layout fields via <c>MatchField</c>, scores every version
		///		and stores the best acceptable version in <c>md.fldCellMap</c>.
		/// </summary>
		/// <param name="md">Match data to fill in; the same instance is returned.</param>
		/// <param name="tcs">The cells that are searched for title and value cells; also passed on to <c>MatchField</c>.</param>
		/// <param name="sheetLayout">Layout whose <c>wsLayout</c> provides the fields and the cell layout versions.</param>
		/// <param name="stringTable">Part whose <c>SharedStringTable</c> is handed to <c>Spreadsheet.GetCellValue</c>.</param>
		/// <param name="formats">Cell formats handed to <c>Spreadsheet.GetCellValue</c>.</param>
		/// <param name="file">Source file; its <c>Name</c> / <c>FullName</c> fill the fileName / filePath fields when the layout declares them.</param>
		/// <returns>
		///		<paramref name="md"/> with <c>fldCellVersMaps</c> rebuilt, <c>fldCellMap</c> set to the selected version (or null when
		///		no version qualifies) and <c>matchCnt</c> incremented when a version was selected.
		/// </returns>
		public MatchData MatchCellLayouts(MatchData md, IEnumerable<Cell> tcs, SheetLayout sheetLayout, SharedStringTablePart stringTable, CellFormats formats, FileInfo file)
		{
			// Index the cells by reference once so every lookup below is a dictionary hit instead of a scan over
			// all cells. The first cell seen for a reference wins (same result as FirstOrDefault); cells that carry
			// no reference are skipped instead of raising a NullReferenceException.
			var cellsByRef = new Dictionary<string, Cell>();
			foreach (var cell in tcs)
			{
				string cellRef = cell.CellReference != null ? cell.CellReference.InnerText : null;
				if (cellRef != null && !cellsByRef.ContainsKey(cellRef))
				{
					cellsByRef.Add(cellRef, cell);
				}
			}

			// Returns the cell for a reference, or null when the reference is null or not present.
			Func<string, Cell> findCell = reference =>
			{
				Cell found;
				return reference != null && cellsByRef.TryGetValue(reference, out found) ? found : null;
			};

			// Obtain titles for all field cell layouts
			md.fldCellVersMaps = new List<FieldCellVersionMap>();

			foreach (var fldLayout in sheetLayout.wsLayout.cellLayouts)
			{
				var fldLayoutVals = new List<FieldCellMap>();

				foreach (var cellLoc in fldLayout.cellLocations)
				{
					try
					{
						var title = Spreadsheet.GetCellValue(findCell(cellLoc.TitleRef), stringTable.SharedStringTable, formats, null);
						if (title != null)
						{
							// Collapse whitespace runs to single spaces, trim and lower-case; this is the form used for matching.
							title = System.Text.RegularExpressions.Regex.Replace(title, @"\s+", " ").Trim().ToLower();
						}

						var val = Spreadsheet.GetCellValue(findCell(cellLoc.ValueRef), stringTable.SharedStringTable, formats, null);

						fldLayoutVals.Add(new FieldCellMap
						{
							cellLoc = cellLoc,
							// title is already trimmed and lower-cased above; blank titles and values are stored as null.
							Title = string.IsNullOrWhiteSpace(title) ? null : title,
							Value = string.IsNullOrWhiteSpace(val) ? null : val.Trim()
						});
					}
					catch (Exception ex)
					{
						// Best effort: a failure on one cell location is logged and the remaining locations are still processed.
						Log.New.Msg(ex);
					}
				}

				md.fldCellVersMaps.Add(new FieldCellVersionMap { fldmaps = fldLayoutVals, fldLayout = fldLayout });
			}

			// Give every field map a back reference to the version map that owns it.
			md.fldCellVersMaps.ForEach(vm => vm.fldmaps.ForEach(vfm => vfm.versMap = vm));

			var reqFlds = sheetLayout.wsLayout.fields.Where(sf => sf.fldType == FieldType.cell && sf.isRequired);

			// Only maps whose cell location uses one of these data layouts are kept after matching.
			var fmts = new List<CellDataLayout> { CellDataLayout.combined, CellDataLayout.separate };

			// Match Titles to layout fields
			foreach (var flvv in md.fldCellVersMaps)
			{
				// Receives maps produced by MatchField; they are appended once the iteration over fldmaps is finished.
				var agMaps = new List<FieldCellMap>();

				foreach (var fm in flvv.fldmaps.Where(m => m.Title != null))
				{
					MatchField(tcs, fm, sheetLayout, stringTable, formats, agMaps);
				}

				flvv.fldmaps.AddRange(agMaps);

				// Only keep data containing CellDataLayouts
				flvv.fldmaps = flvv.fldmaps.Where(m => m.cellLoc != null && fmts.Contains(m.cellLoc.dataLayout)).ToList();

				// Add a filename layout if the field exists.
				var fileName = sheetLayout.wsLayout.fields.FirstOrDefault(fld => fld.fldType == FieldType.fileName);
				if (fileName != null)
				{
					flvv.fldmaps.Add(new FieldCellMap { Title = FieldType.fileName.ToString(), field = fileName, Value = file.Name });
				}

				// Add a filePath layout if the field exists.
				var filePath = sheetLayout.wsLayout.fields.FirstOrDefault(fld => fld.fldType == FieldType.filePath);
				if (filePath != null)
				{
					flvv.fldmaps.Add(new FieldCellMap { Title = FieldType.filePath.ToString(), field = filePath, Value = file.FullName });
				}

				// Compute how well the matching went.
				flvv.noneNullTitleCnt = flvv.fldmaps.Count(m => !string.IsNullOrWhiteSpace(m.Title));
				flvv.noMatchCnt = flvv.fldmaps.Count(m => m.field == null);
				flvv.missingReqFldCnt = reqFlds.Count(rf => !flvv.fldmaps.Select(m => m.field).Contains(rf));
				flvv.noReqValCnt = flvv.fldmaps.Count(m => m.field != null && m.field.isRequired && string.IsNullOrWhiteSpace(m.Value));
				flvv.noValCnt = flvv.fldmaps.Count(m => m.field != null && string.IsNullOrWhiteSpace(m.Value));
			}

			// Find the best acceptable layout match: most field maps first, fewest empty values as the tie-breaker.
			// ThenBy is required here - a second OrderBy would discard the first ordering and make noValCnt the
			// primary key, which is what the previous code effectively did.
			var fldLayout_v = md.fldCellVersMaps
									.Where(fl => fl.noMatchCnt == 0 && fl.noReqValCnt == 0 && fl.missingReqFldCnt == 0)
									.OrderByDescending(fl => fl.fldmaps.Count())
									.ThenBy(fl => fl.noValCnt)
									.FirstOrDefault();

			md.fldCellMap = fldLayout_v;

			md.matchCnt += md.fldCellMap != null ? 1 : 0;

			return md;
		}
Пример #3
0
		/// <summary>
		///		Matches one worksheet against <paramref name="sheetLayout"/> using the column matcher, the cell matcher or both,
		///		depending on the layout type, and records the worksheet on the layout when the match passes.
		/// </summary>
		/// <param name="ws">Worksheet whose cells are matched.</param>
		/// <param name="sheetLayout">Layout to match against; on a pass <paramref name="ws"/> is appended to its <c>srcWorksheets</c>.</param>
		/// <param name="stringTable">Shared string table part, passed through to the matchers.</param>
		/// <param name="formats">Cell formats, passed through to the matchers.</param>
		/// <param name="file">Source file, passed through to the matchers.</param>
		/// <returns>
		///		A new <c>MatchData</c>. <c>isPass</c> is set when every matcher required by the layout type selected a layout
		///		version; for a layout type not handled below no matcher runs and <c>isPass</c> keeps its initial value.
		/// </returns>
		/// <remarks>
		///		An Excel SpreadSheet contains one or more Worksheets, each may or may not contain data of interest.
		///			- Excel is a terrible way to collect data from a large number of different sources in a consistent and reliable way.
		///			- Be that as it may, Excel is favored by organizations that prefer manpower over automation when performing data processing tasks.
		///				- Most state agencies are typical of this kind of organization.
		///					- To top it off most of these agencies give little thought to gathering data in a consistent way. So we are likely to
		///						receive a dump of spreadsheets with a variety of inconsistencies.
		///		
		///		As varied as these spreadsheets may be, a spreadsheet is expected to contain only a single type of related data set which is called a
		///			DataSourceType in this application.
		///		
		///		A DataSourceType describes how to process the worksheets in a spreadsheet. It indicates:
		///			- the name of the file to save extracted data.
		///			- a list of DataWorkSheets
		///			- an indicator as to how to process the spreadsheet against the list of DataWorkSheets:
		///			
		///				• MatchAllDataWorkSheets 
		///					- There must be a one to one correspondence between each DataWorkSheet and each SpreadSheet WorkSheet in order.
		///					- The DataWorkSheet name must match the SpreadSheet WorkSheet name.
		///					
		///				• MatchByClosestWorkSheetLayout 
		///					- Each SpreadSheet Worksheet will be matched against the closest DataWorkSheet/WorkSheetLayout
		///		
		///		A DataWorkSheet has
		///			- a Name to be used when processing the spreadsheet by MatchAllDataWorkSheets.
		///			- a WorkSheetLayout
		/// 
		///		A WorkSheetLayout is
		///			- a collection of field layout versions with additional information about how to determine where to look 
		///				for field cells on the spreadsheet.
		///			- a collection of data columns with additional information about how to determine where to look
		///				for the data column on the WorkSheet.
		///		
		///		- Each data column in the collection has associated with it a list of column titles that should 
		///			map from the WorkSheet to the data column.
		///			
		///		- The WorkSheetLayout also includes a collection of col layout versions. Each of these is a list of cells 
		///			that should be scraped for strings that are concatenated into a column title and the column to associate
		///			the title with.
		///			
		///		- For a given WorkSheet all layouts are processed.
		///		
		///		There is a layout of column title cells that will be scraped for column titles. Those titles 
		///		are then matched to a list of titles for a given data column. The assumption being that all titles to match
		///		are unique across all data columns for a given WorkSheetLayout.
		/// </remarks>
		public MatchData MatchLayouts(Worksheet ws, SheetLayout sheetLayout, SharedStringTablePart stringTable, CellFormats formats, FileInfo file)
		{
			MatchData md = new MatchData();
			MatchData colMD = null;
			MatchData cellMD = null;

			// All cells in worksheet, snapshotted once. Descendants<Cell>() is a deferred sequence and the matchers
			// enumerate it for every title/value lookup, so passing it on unmaterialised repeats the element
			// traversal on each lookup.
			var tcs = new List<Cell>(ws.Descendants<Cell>());

			switch (sheetLayout.wsLayout.layoutType)
			{
				case LayoutType.Both:
					// Both matchers fill in and return the md instance they are given.
					colMD = MatchColLayouts(md, tcs, sheetLayout, stringTable, formats, file);
					cellMD = MatchCellLayouts(md, tcs, sheetLayout, stringTable, formats, file);
					md.fldCellMap = cellMD.fldCellMap;
					md.fldColMap = colMD.fldColMap;
					md.isPass = md.fldColMap != null && md.fldCellMap != null;
					break;
				case LayoutType.CellOnly:
					cellMD = MatchCellLayouts(md, tcs, sheetLayout, stringTable, formats, file);
					md.fldCellMap = cellMD.fldCellMap;
					md.isPass = md.fldCellMap != null;
					break;
				case LayoutType.ColumnOnly:
					colMD = MatchColLayouts(md, tcs, sheetLayout, stringTable, formats, file);
					md.fldColMap = colMD.fldColMap;
					md.isPass = md.fldColMap != null;
					break;
			}

			if (md.isPass)
			{
				// Remember which worksheets matched this layout; the list is created on first use.
				if (sheetLayout.srcWorksheets == null)
				{
					sheetLayout.srcWorksheets = new List<Worksheet>();
				}

				sheetLayout.srcWorksheets.Add(ws);
			}

			return md;
		}