/// <summary>
/// Folds the outcome of a sub check into this overall result. The sub result is taken
/// over when it is a possible match and this result either is not a possible match yet
/// or has collected more non-matching examples than the sub result.
/// </summary>
/// <param name="subResult">The sub result; <c>null</c> or a result that is not a possible match is ignored.</param>
public void KeepBestPossibleMatch(CheckResult subResult)
{
    if (subResult == null)
    {
        return;
    }

    if (!subResult.PossibleMatch)
    {
        return;
    }

    // Keep what we have when it is already a possible match with no more counter examples
    if (PossibleMatch && subResult.ExampleNonMatch.Count >= ExampleNonMatch.Count)
    {
        return;
    }

    ExampleNonMatch.Clear();
    PossibleMatch = true;
    ValueFormatPossibleMatch = subResult.ValueFormatPossibleMatch;

    // Copy the counter examples, leaving out null and empty entries
    foreach (var example in subResult.ExampleNonMatch)
    {
        if (!string.IsNullOrEmpty(example))
        {
            ExampleNonMatch.Add(example);
        }
    }
}
/// <summary>
/// Merges a sub check into the overall check, keeping whichever possible match has the
/// fewest non-matching examples.
/// </summary>
/// <param name="subResult">The sub result; ignored when <c>null</c> or not a possible match.</param>
public void KeepBestPossibleMatch(CheckResult subResult)
{
    var subIsUsable = subResult != null && subResult.PossibleMatch;
    if (!subIsUsable)
    {
        return;
    }

    // Only replace the current state when the sub result is strictly better
    var subIsBetter = !PossibleMatch || subResult.ExampleNonMatch.Count < ExampleNonMatch.Count;
    if (subIsBetter)
    {
        ExampleNonMatch.Clear();
        PossibleMatch = true;
        ValueFormatPossibleMatch = subResult.ValueFormatPossibleMatch;

        foreach (var example in subResult.ExampleNonMatch)
        {
            // null and empty examples are not carried over
            if (string.IsNullOrEmpty(example))
            {
                continue;
            }

            ExampleNonMatch.Add(example);
        }
    }
}
/// <summary>
/// Guesses the value format of a column from its sample values. The checks run in this
/// order: boolean, GUID, plain-text heuristic, date (format of other columns, yyyyMMdd,
/// serial date), numeric, general date time and finally serial dates without numeric detection.
/// </summary>
/// <param name="samples">The samples.</param>
/// <param name="minRequiredSamples">The minimum required samples.</param>
/// <param name="trueValue">The text to be regarded as <c>true</c></param>
/// <param name="falseValue">The text to be regarded as <c>false</c></param>
/// <param name="guessBoolean">Try to identify a boolean</param>
/// <param name="guessGuid">Try to determine if its a GUID</param>
/// <param name="guessNumeric">Try to determine if its a Number</param>
/// <param name="guessDateTime">Try to determine if it is a date time</param>
/// <param name="guessPercentage">Accept percentage values</param>
/// <param name="serialDateTime">Allow serial Date time</param>
/// <param name="checkNamedDates">if set to <c>true</c> [check named dates].</param>
/// <param name="othersValueFormatDate">
/// A date format already in use by other columns; when not <c>null</c> its date format and
/// separators are tried against the samples.
/// </param>
/// <param name="cancellationToken">A cancellation token</param>
/// <returns>
/// <c>null</c> if there are no samples, there are fewer samples than
/// <paramref name="minRequiredSamples" /> (and no earlier check matched), or cancellation was
/// requested; otherwise a <see cref="CheckResult" />. The returned result either carries the
/// detected format or a default <see cref="ValueFormat" /> together with the best possible
/// match collected from the sub checks.
/// </returns>
public static CheckResult GuessValueFormat(IList<string> samples, int minRequiredSamples, string trueValue, string falseValue, bool guessBoolean, bool guessGuid, bool guessNumeric, bool guessDateTime, bool guessPercentage, bool serialDateTime, bool checkNamedDates, ValueFormat othersValueFormatDate, CancellationToken cancellationToken)
{
    Contract.Requires(samples != null);
    if (samples.IsEmpty())
    {
        return null;
    }

    // IList<T> exposes Count directly, no need for the LINQ extension
    var count = samples.Count;
    var checkResult = new CheckResult { FoundValueFormat = new ValueFormat() };

    // if it only one sample value and its false, assume its a boolean
    // (static string.Equals tolerates a null sample)
    if (guessBoolean && count == 1 && !string.IsNullOrEmpty(falseValue)
        && string.Equals(samples[0], falseValue, StringComparison.OrdinalIgnoreCase))
    {
        checkResult.FoundValueFormat.DataType = DataType.Boolean;
        return checkResult;
    }

    if (cancellationToken.IsCancellationRequested)
    {
        return null;
    }

    // this could be a boolean
    if (guessBoolean && count <= 2)
    {
        var allParsed = true;
        string usedTrueValue = null;
        string usedFalseValue = null;
        foreach (var value in samples)
        {
            var result = StringConversion.StringToBooleanStrict(value, trueValue, falseValue);
            if (result == null)
            {
                allParsed = false;
                break;
            }

            // Item1 is the parsed boolean, Item2 is stored as the text for that boolean
            if (result.Item1)
            {
                usedTrueValue = result.Item2;
            }
            else
            {
                usedFalseValue = result.Item2;
            }
        }

        if (allParsed)
        {
            checkResult.FoundValueFormat.DataType = DataType.Boolean;
            if (!string.IsNullOrEmpty(usedTrueValue))
            {
                checkResult.FoundValueFormat.True = usedTrueValue;
            }

            if (!string.IsNullOrEmpty(usedFalseValue))
            {
                checkResult.FoundValueFormat.False = usedFalseValue;
            }

            return checkResult;
        }
    }

    if (cancellationToken.IsCancellationRequested)
    {
        return null;
    }

    if (guessGuid && StringConversion.CheckGuid(samples))
    {
        checkResult.FoundValueFormat.DataType = DataType.Guid;
        return checkResult;
    }

    if (cancellationToken.IsCancellationRequested)
    {
        return null;
    }

    // in case we have named dates, this is not feasible
    if (!checkNamedDates)
    {
        // Trying some chars, if they are in, assume its a string.
        // Not having AM PM or T as it might be part of a date, not having E in there as it
        // might be part of a number. Letter frequencies: u 1.487% o 6.264% n 2.365% i 6.286%
        // h 7.232% s 6.327%. This adds to a 30% chance for each position in the text to
        // determine if a text is regular text.
        var textIndicators = new[] { 'u', 'U', 'o', 'O', 'i', 'I', 'n', 'N', 's', 'S', 'h', 'H' };
        var valuesWithChars = 0;
        foreach (var value in samples)
        {
            if (value.IndexOfAny(textIndicators) <= -1)
            {
                continue;
            }

            valuesWithChars++;

            // Only decide for text once at least half of the samples, or at least 10 of them, contain such chars
            if (valuesWithChars < count / 2 && valuesWithChars < 10)
            {
                continue;
            }

            checkResult.FoundValueFormat.DataType = DataType.String;
            return checkResult;
        }
    }

    // With too few samples only the date format already used by other columns is tried
    if (count < minRequiredSamples && guessDateTime && othersValueFormatDate != null)
    {
        var res = StringConversion.CheckDate(samples, othersValueFormatDate.DateFormat, othersValueFormatDate.DateSeparator, othersValueFormatDate.TimeSeparator, CultureInfo.CurrentCulture);
        if (res.FoundValueFormat != null)
        {
            return res;
        }
    }

    // if we have less than the required samples values do not try to get a type
    if (count < minRequiredSamples || cancellationToken.IsCancellationRequested)
    {
        return null;
    }

    var firstValue = samples.First();

    // Guess a date format that could be interpreted as number before testing numbers
    if (guessDateTime && firstValue.Length == 8)
    {
        var res = StringConversion.CheckDate(samples, "yyyyMMdd", string.Empty, ":", CultureInfo.InvariantCulture);
        if (res.FoundValueFormat != null)
        {
            return res;
        }

        checkResult.KeepBestPossibleMatch(res);
    }

    if (cancellationToken.IsCancellationRequested)
    {
        return null;
    }

    // We need to have at least 10 sample values here its too dangerous to assume it is a date
    if (guessDateTime && serialDateTime && count > 10 && count > minRequiredSamples)
    {
        var res = StringConversion.CheckSerialDate(samples, true);
        if (res.FoundValueFormat != null)
        {
            return res;
        }

        checkResult.KeepBestPossibleMatch(res);
    }

    if (cancellationToken.IsCancellationRequested)
    {
        return null;
    }

    // assume dates are of the same format across the files we check if the dates
    // we have would possibly match no matter how many samples we have
    if (guessDateTime && othersValueFormatDate != null)
    {
        var res = StringConversion.CheckDate(samples, othersValueFormatDate.DateFormat, othersValueFormatDate.DateSeparator, othersValueFormatDate.TimeSeparator, CultureInfo.CurrentCulture);
        if (res.FoundValueFormat != null)
        {
            return res;
        }
    }

    if (cancellationToken.IsCancellationRequested)
    {
        return null;
    }

    if (guessNumeric)
    {
        // GuessNumeric returns null when it was cancelled; do not dereference in that
        // case, the cancellation check below then returns null
        var res = GuessNumeric(samples, guessPercentage, false, cancellationToken);
        if (res != null)
        {
            if (res.FoundValueFormat != null)
            {
                return res;
            }

            checkResult.KeepBestPossibleMatch(res);
        }
    }

    if (cancellationToken.IsCancellationRequested)
    {
        return null;
    }

    // Minimum length of a date is 4 characters
    if (guessDateTime && firstValue.Length > 3)
    {
        // NOTE(review): GuessDateTime also takes the token; guarded on the assumption that it
        // may return null on cancellation like GuessNumeric does - confirm
        var res = GuessDateTime(samples, checkNamedDates, cancellationToken);
        if (res != null)
        {
            if (res.FoundValueFormat != null)
            {
                return res;
            }

            checkResult.KeepBestPossibleMatch(res);
        }
    }

    if (cancellationToken.IsCancellationRequested)
    {
        return null;
    }

    // if we have dates and allow serial dates, but do not guess numeric (this would be a fit)
    // try if the dates are all serial
    if (guessDateTime && serialDateTime && !guessNumeric)
    {
        var res = StringConversion.CheckSerialDate(samples, false);
        if (res.FoundValueFormat != null)
        {
            return res;
        }

        checkResult.KeepBestPossibleMatch(res);
    }

    return checkResult;
}
/// <summary>
/// Tries every combination of grouping and decimal separator that occurs in the samples
/// and checks whether the samples are numbers with that combination.
/// </summary>
/// <param name="samples">The sample texts to examine.</param>
/// <param name="guessPercentage">Passed on to <c>StringConversion.CheckNumber</c>.</param>
/// <param name="allowStartingZero">Passed on to <c>StringConversion.CheckNumber</c>.</param>
/// <param name="cancellationToken">A cancellation token, checked before each combination is tried.</param>
/// <returns>
/// The first check result that has a found value format; otherwise a result holding the best
/// possible match of all combinations. <c>null</c> if cancellation was requested.
/// </returns>
public static CheckResult GuessNumeric(IList<string> samples, bool guessPercentage, bool allowStartingZero, CancellationToken cancellationToken)
{
    var best = new CheckResult();

    // Grouping characters that appear in at least one sample
    var groupingCandidates = new List<char>();
    foreach (var candidate in StringConversion.DecimalGroupings)
    {
        if (candidate == '\0')
        {
            continue;
        }

        var seen = false;
        foreach (var text in samples)
        {
            if (text.IndexOf(candidate) >= 0)
            {
                seen = true;
                break;
            }
        }

        if (seen)
        {
            groupingCandidates.Add(candidate);
        }
    }

    // '\0' is always tried as the last grouping candidate
    groupingCandidates.Add('\0');

    // Decimal separators that appear in at least one sample
    var decimalCandidates = new List<char>();
    foreach (var candidate in StringConversion.DecimalSeparators)
    {
        if (candidate == '\0')
        {
            continue;
        }

        var seen = false;
        foreach (var text in samples)
        {
            if (text.IndexOf(candidate) >= 0)
            {
                seen = true;
                break;
            }
        }

        if (seen)
        {
            decimalCandidates.Add(candidate);
        }
    }

    // Need to have at least one decimal separator
    if (decimalCandidates.Count == 0)
    {
        decimalCandidates.Add('.');
    }

    // Try Numbers: Int and Decimal
    foreach (var grouping in groupingCandidates)
    {
        foreach (var decimalPoint in decimalCandidates)
        {
            if (cancellationToken.IsCancellationRequested)
            {
                return null;
            }

            // The same character cannot act as both separators
            if (decimalPoint == grouping)
            {
                continue;
            }

            var attempt = StringConversion.CheckNumber(samples, decimalPoint, grouping, guessPercentage, allowStartingZero);
            if (attempt.FoundValueFormat != null)
            {
                return attempt;
            }

            best.KeepBestPossibleMatch(attempt);
        }
    }

    return best;
}
/// <summary>
/// Fills the Column Format for reader fileSettings: opens the file, samples every column,
/// guesses its value format and stores the result in <paramref name="fileSetting" />.
/// Depending on the settings it also re-checks Excel double columns and links date
/// columns with matching time and time zone columns.
/// </summary>
/// <param name="fileSetting">The file setting to check, and fill</param>
/// <param name="addTextColumns">if set to <c>true</c> event string columns are added.</param>
/// <param name="processDisplay">The process display, it also provides the cancellation token.</param>
/// <returns>A list of messages, one for each column format that was added or changed.</returns>
/// <exception cref="ArgumentNullException">if <paramref name="processDisplay" /> is <c>null</c></exception>
public static IList<string> FillGuessColumnFormatReader(this IFileSetting fileSetting, bool addTextColumns, IProcessDisplay processDisplay)
{
    if (processDisplay == null)
    {
        throw new ArgumentNullException(nameof(processDisplay));
    }

    Contract.Requires(fileSetting != null);
    var result = new List<string>();

    // if we should not detect, we can finish
    if (!ApplicationSetting.FillGuessSettings.DetectBoolean && !ApplicationSetting.FillGuessSettings.DetectGUID && !ApplicationSetting.FillGuessSettings.DectectNumbers && !ApplicationSetting.FillGuessSettings.DetectDateTime && !ApplicationSetting.FillGuessSettings.DectectPercentage && !ApplicationSetting.FillGuessSettings.SerialDateTime)
    {
        return result;
    }

    var resetSkipRows = false;
    try
    {
        // Make sure that if we do have a CSV file without header that we will skip the first row that
        // might contain headers, but its simply set as without headers.
        if (fileSetting is CsvFile && !fileSetting.HasFieldHeader && fileSetting.SkipRows == 0)
        {
            fileSetting.SkipRows = 1;
            resetSkipRows = true;
        }

        var othersValueFormatDate = CommonDateFormat(fileSetting.Column.Select(x => x.ValueFormat));

        using (var fileReader = fileSetting.GetFileReader())
        {
            Contract.Assume(fileReader != null);
            fileReader.Open(false, processDisplay.CancellationToken);
            if (fileReader.FieldCount == 0 || fileReader.EndOfFile)
            {
                return result;
            }

            processDisplay.SetProcess("Getting column headers");
            processDisplay.Maximum = fileReader.FieldCount;

            var columnNamesInFile = new List<string>();
            for (var colindex = 0; colindex < fileReader.FieldCount; colindex++)
            {
                var newColumn = fileReader.GetColumn(colindex);
                Contract.Assume(newColumn != null);
                columnNamesInFile.Add(newColumn.Name);
                var oldColumn = fileSetting.GetColumn(newColumn.Name);

                processDisplay.SetProcess(newColumn.Name + " – Getting values", colindex);

                var samples = GetSampleValues(fileReader, ApplicationSetting.FillGuessSettings.CheckedRecords, colindex, ApplicationSetting.FillGuessSettings.SampleValues, fileSetting.TreatTextAsNull, processDisplay.CancellationToken);

                if (samples.IsEmpty())
                {
                    processDisplay.SetProcess(newColumn.Name + " – No values found", colindex);
                    if (!addTextColumns)
                    {
                        continue;
                    }

                    result.Add($"{newColumn.Name} – No values found – Format : {newColumn.GetTypeAndFormatDescription()}");
                    fileSetting.ColumnAdd(newColumn);
                }
                else
                {
                    // Columns that look like an ID can be excluded from all detection
                    var detect = !(ApplicationSetting.FillGuessSettings.IgnoreIdColums && StringUtils.AssumeIDColumn(newColumn.Name) > 0);

                    if (samples.Count < 10)
                    {
                        processDisplay.SetProcess($"{newColumn.Name} – Only {samples.Count} values found in {ApplicationSetting.FillGuessSettings.CheckedRecords} rows", colindex);
                    }
                    else
                    {
                        processDisplay.SetProcess($"{newColumn.Name} – {samples.Count} values found – Examining format", colindex);
                    }

                    var checkResult = GuessValueFormat(
                        samples,
                        ApplicationSetting.FillGuessSettings.MinSamplesForIntDate,
                        ApplicationSetting.FillGuessSettings.TrueValue,
                        ApplicationSetting.FillGuessSettings.FalseValue,
                        ApplicationSetting.FillGuessSettings.DetectBoolean && detect,
                        ApplicationSetting.FillGuessSettings.DetectGUID && detect,
                        ApplicationSetting.FillGuessSettings.DectectNumbers && detect,
                        ApplicationSetting.FillGuessSettings.DetectDateTime && detect,
                        ApplicationSetting.FillGuessSettings.DectectPercentage && detect,
                        ApplicationSetting.FillGuessSettings.SerialDateTime && detect,
                        ApplicationSetting.FillGuessSettings.CheckNamedDates && detect,
                        othersValueFormatDate,
                        processDisplay.CancellationToken);

                    // No result: fall back to a default value format when text columns are wanted, otherwise skip the column
                    if (checkResult == null)
                    {
                        if (!addTextColumns)
                        {
                            continue;
                        }

                        checkResult = new CheckResult { FoundValueFormat = new ValueFormat() };
                    }

                    // if we have a mapping to a template that expects a integer and we only have integers but not enough...
                    if (oldColumn != null)
                    {
                        var oldValueFormat = oldColumn.GetTypeAndFormatDescription();

                        // if we have a date value format already store this
                        if (othersValueFormatDate == null && checkResult.FoundValueFormat.DataType == DataType.DateTime && checkResult.PossibleMatch)
                        {
                            othersValueFormatDate = checkResult.FoundValueFormat;
                        }

                        if (checkResult.FoundValueFormat.Equals(oldColumn.ValueFormat))
                        {
                            processDisplay.SetProcess($"{newColumn.Name} – Format : {oldValueFormat} – not changed", colindex);
                        }
                        else
                        {
                            oldColumn.ValueFormat = checkResult.FoundValueFormat;
                        }

                        var newValueFormat = checkResult.FoundValueFormat.GetTypeAndFormatDescription();
                        if (oldValueFormat.Equals(newValueFormat, StringComparison.Ordinal))
                        {
                            continue;
                        }

                        var msg = $"{newColumn.Name} – Format : {newValueFormat} – updated from {oldValueFormat}";
                        result.Add(msg);
                        processDisplay.SetProcess(msg, colindex);
                    }
                    else
                    {
                        if (!addTextColumns && checkResult.FoundValueFormat.DataType == DataType.String)
                        {
                            continue;
                        }

                        newColumn.ValueFormat = checkResult.FoundValueFormat;
                        var msg = $"{newColumn.Name} – Format : {newColumn.GetTypeAndFormatDescription()}";
                        processDisplay.SetProcess(msg, colindex);
                        result.Add(msg);
                        fileSetting.ColumnAdd(newColumn);
                    }
                }
            }

            // The fileReader does not have the column information yet, let the reader know
            fileReader.OverrideColumnFormatFromSetting(fileReader.FieldCount);

            // in case its Excel, check all doubles if they could be integer
            if (fileSetting is IExcelFile)
            {
                for (var colindex = 0; colindex < fileReader.FieldCount; colindex++)
                {
                    var oldColumn = fileReader.GetColumn(colindex);

                    // Check for null before the column name is used
                    if (oldColumn == null || oldColumn.DataType != DataType.Double)
                    {
                        continue;
                    }

                    var detect = !(ApplicationSetting.FillGuessSettings.IgnoreIdColums && StringUtils.AssumeIDColumn(oldColumn.Name) > 0);

                    Column newColumn = null;
                    if (detect)
                    {
                        var samples = GetSampleValues(fileReader, ApplicationSetting.FillGuessSettings.CheckedRecords, colindex, ApplicationSetting.FillGuessSettings.SampleValues, fileSetting.TreatTextAsNull, processDisplay.CancellationToken);
                        if (!samples.IsEmpty())
                        {
                            // GuessNumeric returns null when cancelled, and a result without a found
                            // value format when no combination matched; neither overrides the column
                            var checkResult = GuessNumeric(samples, false, true, processDisplay.CancellationToken);
                            if (checkResult != null && checkResult.FoundValueFormat != null && checkResult.FoundValueFormat.DataType != DataType.Double)
                            {
                                newColumn = fileSetting.GetColumn(oldColumn.Name);
                                if (newColumn == null)
                                {
                                    newColumn = fileSetting.ColumnAdd(oldColumn);
                                }

                                newColumn.DataType = checkResult.FoundValueFormat.DataType;
                            }
                        }
                    }
                    else
                    {
                        // Columns excluded from detection are set to text
                        newColumn = fileSetting.GetColumn(oldColumn.Name);
                        if (newColumn == null)
                        {
                            newColumn = fileSetting.ColumnAdd(oldColumn);
                        }

                        newColumn.DataType = DataType.String;
                    }

                    if (newColumn != null)
                    {
                        var msg = $"{newColumn.Name} – Overwritten Excel Format : {newColumn.GetTypeAndFormatDescription()}";
                        processDisplay.SetProcess(msg, colindex);
                        result.Add(msg);
                    }
                }
            }

            if (ApplicationSetting.FillGuessSettings.DateParts)
            {
                // Try to find a time for a date if the date does not already have a time
                // Case a) TimeFormat has already been recognized
                for (var colindex = 0; colindex < fileReader.FieldCount; colindex++)
                {
                    var columnDate = fileReader.GetColumn(colindex);

                    // Possibly add Time Zone
                    if (columnDate.DataType == DataType.DateTime && string.IsNullOrEmpty(columnDate.TimeZonePart))
                    {
                        for (var coltimeZone = 0; coltimeZone < fileReader.FieldCount; coltimeZone++)
                        {
                            var columnTimeZone = fileReader.GetColumn(coltimeZone);
                            var colName = columnTimeZone.Name.NoSpecials().ToUpperInvariant();
                            if (columnTimeZone.DataType != DataType.String && columnTimeZone.DataType != DataType.Integer || colName != "TIMEZONE" && colName != "TIMEZONEID" && colName != "TIME ZONE" && colName != "TIME ZONE ID")
                            {
                                continue;
                            }

                            columnDate.TimeZonePart = columnTimeZone.Name;
                            result.Add($"{columnDate.Name} – Added Time Zone : {columnTimeZone.Name}");
                        }
                    }

                    // Skip anything that is not a date, already has a time part, or whose format contains time characters
                    if (columnDate.DataType != DataType.DateTime || !string.IsNullOrEmpty(columnDate.TimePart) || columnDate.ValueFormat.DateFormat.IndexOfAny(new[] { ':', 'h', 'H', 'm', 's', 't' }) != -1)
                    {
                        continue;
                    }

                    // We have a date column without time
                    for (var coltime = 0; coltime < fileReader.FieldCount; coltime++)
                    {
                        var columnTime = fileReader.GetColumn(coltime);

                        // A time column is a date time column whose format has no date characters
                        if (columnTime.DataType != DataType.DateTime || !string.IsNullOrEmpty(columnDate.TimePart) || columnTime.ValueFormat.DateFormat.IndexOfAny(new[] { '/', 'y', 'M', 'd' }) != -1)
                        {
                            continue;
                        }

                        // We now have a time column, checked if the names somehow make sense
                        if (!columnDate.Name.NoSpecials().ToUpperInvariant().Replace("DATE", string.Empty).Equals(columnTime.Name.NoSpecials().ToUpperInvariant().Replace("TIME", string.Empty), StringComparison.Ordinal))
                        {
                            continue;
                        }

                        columnDate.TimePart = columnTime.Name;
                        columnDate.TimePartFormat = columnTime.ValueFormat.DateFormat;
                        result.Add($"{columnDate.Name} – Added Time Part : {columnTime.Name}");
                    }
                }

                // Case b) TimeFormat has not been recognized (e.g. all values are 08:00), only look in adjacent fields
                for (var colindex = 0; colindex < fileReader.FieldCount; colindex++)
                {
                    var columnDate = fileReader.GetColumn(colindex);
                    if (columnDate.DataType != DataType.DateTime || !string.IsNullOrEmpty(columnDate.TimePart) || columnDate.ValueFormat.DateFormat.IndexOfAny(new[] { ':', 'h', 'H', 'm', 's', 't' }) != -1)
                    {
                        continue;
                    }

                    var dateBaseName = columnDate.Name.NoSpecials().ToUpperInvariant().Replace("DATE", string.Empty);

                    // Look at the column right after the date first
                    if (colindex + 1 < fileReader.FieldCount)
                    {
                        var nextColumn = fileReader.GetColumn(colindex + 1);
                        if (nextColumn.DataType == DataType.String && dateBaseName.Equals(nextColumn.Name.NoSpecials().ToUpperInvariant().Replace("TIME", string.Empty), StringComparison.OrdinalIgnoreCase))
                        {
                            columnDate.TimePart = nextColumn.Name;

                            var nextSamples = GetSampleValues(fileReader, 1, colindex + 1, 1, fileSetting.TreatTextAsNull, processDisplay.CancellationToken);
                            var nextFirst = nextSamples.FirstOrDefault();

                            // A text of length 8 is taken as HH:mm:ss, a text of length 5 as HH:mm
                            if (nextFirst != null && (nextFirst.Length == 8 || nextFirst.Length == 5))
                            {
                                nextColumn.DataType = DataType.DateTime;
                                nextColumn.ValueFormat = new ValueFormat(DataType.DateTime) { DateFormat = nextFirst.Length == 8 ? "HH:mm:ss" : "HH:mm" };
                                fileSetting.ColumnAdd(nextColumn);
                                result.Add($"{nextColumn.Name} – Format : {nextColumn.GetTypeAndFormatDescription()}");
                            }

                            result.Add($"{columnDate.Name} – Added Time Part : {nextColumn.Name}");
                            continue;
                        }
                    }

                    // Otherwise look at the column right before the date
                    if (colindex <= 0)
                    {
                        continue;
                    }

                    var previousColumn = fileReader.GetColumn(colindex - 1);
                    if (previousColumn.DataType != DataType.String || !dateBaseName.Equals(previousColumn.Name.NoSpecials().ToUpperInvariant().Replace("TIME", string.Empty), StringComparison.Ordinal))
                    {
                        continue;
                    }

                    columnDate.TimePart = previousColumn.Name;

                    var previousSamples = GetSampleValues(fileReader, 1, colindex - 1, 1, fileSetting.TreatTextAsNull, processDisplay.CancellationToken);
                    var previousFirst = previousSamples.FirstOrDefault();
                    if (previousFirst != null && (previousFirst.Length == 8 || previousFirst.Length == 5))
                    {
                        // Configure the column completely before it is registered, in the same
                        // order as for the following column above
                        previousColumn.DataType = DataType.DateTime;
                        previousColumn.ValueFormat = new ValueFormat(DataType.DateTime) { DateFormat = previousFirst.Length == 8 ? "HH:mm:ss" : "HH:mm" };
                        fileSetting.ColumnAdd(previousColumn);
                        result.Add($"{previousColumn.Name} – Format : {previousColumn.GetTypeAndFormatDescription()}");
                    }

                    result.Add($"{columnDate.Name} – Added Time Part : {previousColumn.Name}");
                }
            }

            // Sort the columns in fileSetting by order in file
            fileSetting.SortColumnByName(columnNamesInFile);
        }
    }
    finally
    {
        // Undo the temporary skipping of the first row
        if (resetSkipRows)
        {
            fileSetting.SkipRows = 0;
        }
    }

    return result;
}