/// <summary> /// Gets schema from a data table, but parses the data to determine what datatype is best fit. /// </summary> /// <param name="outputTableName">The output table name e.g. "CREATE TABLE [NAME] (..."</param> /// <param name="data">Data to analyze</param> /// <returns></returns> public static string FromDataTable_Smart(string outputTableName, DataTable data) { var sqlTable = new SqlTableDefinition { TableName = outputTableName }; //loop through each column for (var colIndex = 0; colIndex < data.Columns.Count; colIndex++) { var rowVals = new HashSet <string>(StringComparer.OrdinalIgnoreCase); //collect row values into hashset for (var rowIndex = 0; rowIndex < data.Rows.Count; rowIndex++) { rowVals.Add(data.Rows[rowIndex][colIndex].ToString()); } sqlTable.ColumnDefinitions.Add(CreateTableSqlInternal.GetBestFitSqlColumnType(rowVals, data.Columns[colIndex].ColumnName)); } return(CreateTableSqlInternal.FromSqlTableDefinition(sqlTable)); }
public static string FromDataReader_Smart(string outputTableName, IEnumerable <Func <DataReaderInfo> > dataReaders, int?numberOfRowsToExamine = null) { //maps column names to hashset of unique values present for those columns var uniqueValList = new ConcurrentDictionary <string, ConcurrentHashSet <string> >(); //we want the field names because these are our output table sql's column names //this is not thread safe var masterListOfFieldNames = new ConcurrentHashSet <string>(); //fit the data types from the data readers' data var result = Parallel.ForEach(dataReaders, new ParallelOptions { MaxDegreeOfParallelism = 12 }, dataReaderInfoFac => { //get column names and setup hash dictionaries var numRows = 0; var drInfo = dataReaderInfoFac(); using var dataReader = drInfo.DataReader; var readerFieldNames = dataReader.GetFieldNames(); //if there was a problem we may have to read to read first to initialize in order to get fields var tryAgain = (readerFieldNames == null) || (readerFieldNames.Length == 0); if (tryAgain) { dataReader.Read(); readerFieldNames = dataReader.GetFieldNames(); numRows++; } if ((readerFieldNames == null) || (readerFieldNames.Length == 0)) { throw new Exception($"Could not get field names for file {drInfo.FilePath}"); } //unfortunately this duplicated code is here for a reason.. we want to read to initialize the reader, but we also want to capture the first row if we already read. Please refactor though foreach (var readerFieldName in readerFieldNames) { if (!uniqueValList.ContainsKey(readerFieldName)) { uniqueValList.AddOrUpdate( readerFieldName, //our key for the dictionary new ConcurrentHashSet <string>(), //if it doesn't exist create a new concurrent dictionary (field, uniqueVals) => //otherwise we want to return the new updated value uniqueVals); } ; if (masterListOfFieldNames.Contains(readerFieldName) == false) { masterListOfFieldNames.Add(readerFieldName); } } //we tried again and got the field names so we can go ahead and add the values that are currently on record if (tryAgain) { foreach (var col in readerFieldNames) { var val = dataReader[col]?.ToString(); var uvl = uniqueValList[col]; uvl.Add(val); } } //collect unique row values by column while (dataReader.Read() && ((numRows < numberOfRowsToExamine) || (numberOfRowsToExamine == null))) { foreach (var col in readerFieldNames) { var val = dataReader[col]?.ToString(); var uvl = uniqueValList[col]; uvl.Add(val); } numRows++; } }); var sqlTable = new SqlTableDefinition { TableName = outputTableName }; foreach (var col in masterListOfFieldNames.ToArray()) { sqlTable.ColumnDefinitions.Add(CreateTableSqlInternal.GetBestFitSqlColumnType(uniqueValList[col].Hashset, col)); } return(CreateTableSqlInternal.FromSqlTableDefinition(sqlTable)); }