// Hook named for quote detection. It leaves early when the format's class
// builder is a FixedLengthClassBuilder, and no work is implemented for any
// other builder either, so the method currently changes nothing.
private void DetectQuoted(RecordFormatInfo format, string[][] data)
{
    bool isFixedLength = format.ClassBuilder is FixedLengthClassBuilder;
    if (isFixedLength)
    {
        return;
    }

    // Nothing is done for the remaining builder kinds yet.
}
// Appends one delimited-format candidate to `res` for every delimiter under
// consideration.
//   sampleData - sample content handed to the delimiter analysis helpers.
//   res        - list that receives the generated candidates.
//   delimiter  - delimiter to evaluate, or '\0' to let GetDelimiters choose
//                the candidates from the sample.
private void CreateDelimiterOptions(string[][] sampleData, List<RecordFormatInfo> res, char delimiter)
{
    // '\0' means no specific delimiter was requested.
    var candidates = delimiter == '\0'
        ? GetDelimiters(sampleData)
        : new List<DelimiterInfo> { GetDelimiterInfo(sampleData, delimiter) };

    foreach (var candidate in candidates)
    {
        // Base confidence: the lower the deviation, the higher the score.
        var option = new RecordFormatInfo
        {
            mConfidence = (int)((1 - candidate.Deviation) * 100)
        };
        AdjustConfidence(option, candidate);

        var classBuilder = new DelimitedClassBuilder("AutoDetectedClass", candidate.Delimiter.ToString());
        // One more field than the per-line delimiter figure reported by the analysis.
        classBuilder.AddFields(candidate.AvergeByLine + 1);

        option.mClassBuilder = classBuilder;
        res.Add(option);
    }
}
// Rescales format.mConfidence according to which character was detected as
// the delimiter. Characters that commonly appear inside values are scaled
// down; the usual separators (, ; tab ~) are scaled up by 15% and capped at
// 100. Any other delimiter leaves the confidence untouched.
private void AdjustConfidence(RecordFormatInfo format, DelimiterInfo info)
{
    double penalty;

    switch (info.Delimiter)
    {
        case ',':
        case ';':
        case '\t':
        case '~':
            // Help the usual separators to be confident.
            format.mConfidence = (int)Math.Min(100, format.Confidence * 1.15);
            return;

        case '"':
        case '\'':
            // Avoid the quote identifiers.
            penalty = 0.2;
            break;

        case '/':
        case '.':
            // Avoid date/URL delimiters and the decimal separator.
            penalty = 0.4;
            break;

        case '@':
        case '&':
        case '=':
            // Avoid characters typical of e-mail addresses and URLs.
            penalty = 0.6;
            break;

        case '-':
            // Avoid the other date separator.
            penalty = 0.7;
            break;

        default:
            return;
    }

    format.mConfidence = (int)(format.Confidence * penalty);
}
// DELIMITED
// Appends one delimited-format candidate to `res` for every delimiter under
// consideration.
//   sampleData - sample content; sampleData[0][0] is treated as the header
//                line when headers are in use.
//   res        - list that receives the generated candidates.
//   delimiter  - delimiter to evaluate, or '\0' (the default) to let
//                GetDelimiters choose the candidates.
// Fields are named from the header line when one is used, otherwise
// "Field 001", "Field 002", ... Fields past info.Min are marked optional.
private void CreateDelimiterOptions(string[][] sampleData, List<RecordFormatInfo> res, char delimiter = '\0')
{
    var delimiters = new List<DelimiterInfo>();
    if (delimiter == '\0')
    {
        delimiters = GetDelimiters(sampleData);
    }
    else
    {
        delimiters.Add(GetDelimiterInfo(sampleData, delimiter));
    }

    foreach (var info in delimiters)
    {
        var format = new RecordFormatInfo
        {
            mConfidence = (int)((1 - info.Deviation) * 100)
        };
        AdjustConfidence(format, info);

        // An explicit FileHasHeaders setting wins over auto-detection.
        bool fileHasHeaders = FileHasHeaders.HasValue
            ? FileHasHeaders.Value
            : DetectIfContainsHeaders(info, sampleData);

        var builder = new DelimitedClassBuilder("AutoDetectedClass", info.Delimiter.ToString())
        {
            IgnoreFirstLines = fileHasHeaders ? 1 : 0
        };

        // Read the header line only when it is actually needed and present.
        // Indexing sampleData[0][0] unconditionally would throw on an empty
        // sample even when no header names are required.
        string[] headerNames = null;
        if (fileHasHeaders && sampleData.Length > 0 && sampleData[0].Length > 0)
        {
            headerNames = sampleData[0][0].Split(info.Delimiter);
        }

        for (int i = 0; i < info.Max + 1; i++)
        {
            string name = "Field " + (i + 1).ToString().PadLeft(3, '0');
            if (headerNames != null && i < headerNames.Length)
            {
                name = headerNames[i];
            }

            var f = builder.AddField(StringHelper.ToValidIdentifier(name));
            if (i > info.Min)
            {
                f.FieldOptional = true;
            }
        }

        format.mClassBuilder = builder;
        res.Add(format);
    }
}
// FIXED LENGTH
// Appends a single fixed-length candidate to `res`.
// Confidence is 100 * (1 - Deviation / Avg), floored at 0, using the figures
// returned by Indicators.CalculateAsFixedSize(data). The fields themselves are
// produced by CreateFixedLengthFields.
private void CreateFixedLengthOptions(string[][] data, List<RecordFormatInfo> res)
{
    var format = new RecordFormatInfo();
    var stats = Indicators.CalculateAsFixedSize(data);

    // A zero (or NaN) average makes the ratio NaN, and casting NaN to int is
    // unspecified. Report no confidence instead of a garbage value.
    format.mConfidence = stats.Avg > 0
        ? (int)(Math.Max(0, 1 - stats.Deviation / stats.Avg) * 100)
        : 0;

    var builder = new FixedLengthClassBuilder("AutoDetectedClass");
    CreateFixedLengthFields(data, builder);

    format.mClassBuilder = builder;
    res.Add(format);
}
// FIXED LENGTH
// Appends a single fixed-length candidate to `res`.
// Confidence is 100 * (1 - deviation / average) of the line widths, floored
// at 0: identical widths score 100 and the score drops as widths spread out.
// The fields themselves are produced by CreateFixedLengthFields.
private void CreateFixedLengthOptions(string[][] data, List<RecordFormatInfo> res)
{
    var format = new RecordFormatInfo();

    double average = CalculateAverageLineWidth(data);
    double deviation = CalculateDeviationLineWidth(data, average);

    // With a zero or NaN average the ratio is NaN, and casting NaN to int is
    // unspecified. Report no confidence instead of a garbage value.
    format.mConfidence = average > 0
        ? (int)(Math.Max(0, 1 - deviation / average) * 100)
        : 0;

    var builder = new FixedLengthClassBuilder("AutoDetectedClass");
    CreateFixedLengthFields(data, builder);

    format.mClassBuilder = builder;
    res.Add(format);
}
// Test helper that checks the first detected format.
//   formats    - detection result under test.
//   delimiter  - delimiter expected on the first format's builder.
//   fields     - field count expected on the first format's builder.
//   confidence - minimum confidence for the first format; only checked when > 0.
//   numFormats - exact number of formats expected; when not positive, only
//                "at least one" is required.
private void AssertFormat(RecordFormatInfo[] formats, string delimiter, int fields, int confidence, int numFormats)
{
    if (numFormats <= 0)
    {
        Assert.IsTrue(formats.Length > 0);
    }
    else
    {
        Assert.AreEqual(numFormats, formats.Length);
    }

    RecordFormatInfo first = formats[0];
    if (confidence > 0)
    {
        Assert.IsTrue(first.Confidence >= confidence);
    }

    // The first format must be a delimited one.
    var delimited = first.ClassBuilder as DelimitedClassBuilder;
    Assert.IsTrue(delimited != null);
    Assert.AreEqual(delimiter, delimited.Delimiter);
    Assert.AreEqual(fields, first.ClassBuilder.FieldCount);
}
// Currently a no-op: the body is empty, so neither `option` nor `data` is
// read or modified.
private void DetectOptionals(RecordFormatInfo option, string[][] data)
{
    // Intentionally left empty.
}
// FIXED LENGTH
// Appends a single fixed-length candidate to `res`.
// Confidence is 100 * (1 - deviation / average) of the line widths, floored
// at 0. The fields themselves are produced by CreateFixedLengthFields.
private void CreateFixedLengthOptions(string[][] data, List<RecordFormatInfo> res)
{
    RecordFormatInfo format = new RecordFormatInfo();

    double average = CalculateAverageLineWidth(data);
    double deviation = CalculateDeviationLineWidth(data, average);

    // Guard the division: a zero or NaN average yields NaN, and the result of
    // casting NaN to int is unspecified. Such input gets a confidence of 0.
    if (average > 0)
    {
        format.mConfidence = (int)(Math.Max(0, 1 - deviation / average) * 100);
    }
    else
    {
        format.mConfidence = 0;
    }

    FixedLengthClassBuilder builder = new FixedLengthClassBuilder("AutoDetectedClass");
    CreateFixedLengthFields(data, builder);

    format.mClassBuilder = builder;
    res.Add(format);
}
// Appends one delimited-format candidate to `res` for every delimiter under
// consideration.
//   sampleData - sample content; sampleData[0][0] is treated as the header
//                line when FileHasHeaders is set.
//   res        - list that receives the generated candidates.
//   delimiter  - delimiter to evaluate, or '\0' to let GetDelimiters choose
//                the candidates.
// Fields are named from the header line when FileHasHeaders is set, otherwise
// "Field 001", "Field 002", ... Fields past info.Min are marked optional.
private void CreateDelimiterOptions(string[][] sampleData, List<RecordFormatInfo> res, char delimiter)
{
    List<DelimiterInfo> delimiters = new List<DelimiterInfo>();
    if (delimiter == '\0')
    {
        delimiters = GetDelimiters(sampleData);
    }
    else
    {
        delimiters.Add(GetDelimiterInfo(sampleData, delimiter));
    }

    foreach (DelimiterInfo info in delimiters)
    {
        RecordFormatInfo format = new RecordFormatInfo();
        format.mConfidence = (int)((1 - info.Deviation) * 100);
        AdjustConfidence(format, info);

        DelimitedClassBuilder builder = new DelimitedClassBuilder("AutoDetectedClass", info.Delimiter.ToString());
        builder.IgnoreFirstLines = FileHasHeaders ? 1 : 0;

        // Read the header line only when it is actually needed and present.
        // Indexing sampleData[0][0] unconditionally would throw on an empty
        // sample even when no header names are required.
        string[] headerNames = null;
        if (FileHasHeaders && sampleData.Length > 0 && sampleData[0].Length > 0)
        {
            headerNames = sampleData[0][0].Split(info.Delimiter);
        }

        for (int i = 0; i < info.Max + 1; i++)
        {
            string name = "Field " + (i + 1).ToString().PadLeft(3, '0');
            if (headerNames != null && i < headerNames.Length)
            {
                name = headerNames[i];
            }

            var f = builder.AddField(StringHelper.ToValidIdentifier(name));
            if (i > info.Min)
            {
                f.FieldOptional = true;
            }
        }

        format.mClassBuilder = builder;
        res.Add(format);
    }
}
// Rescales format.mConfidence according to which character was detected as
// the delimiter. Characters that commonly appear inside values are scaled
// down; the usual separators (, ; tab) are scaled up by 15% and capped at
// 100. Any other delimiter leaves the confidence untouched.
private void AdjustConfidence(RecordFormatInfo format, DelimiterInfo info)
{
    double penalty;

    switch (info.Delimiter)
    {
        case ',':
        case ';':
        case '\t':
            // Help the usual separators to be confident.
            format.mConfidence = (int)Math.Min(100, format.Confidence * 1.15);
            return;

        case '"':
        case '\'':
            // Avoid the quote identifiers.
            penalty = 0.2;
            break;

        case '/':
        case '.':
            // Avoid date/URL delimiters and the decimal separator.
            penalty = 0.4;
            break;

        case '@':
        case '&':
        case '=':
            // Avoid characters typical of e-mail addresses and URLs.
            penalty = 0.6;
            break;

        case '-':
            // Avoid the other date separator.
            penalty = 0.7;
            break;

        default:
            return;
    }

    format.mConfidence = (int)(format.Confidence * penalty);
}
// No-op at present: the method has an empty body and touches neither of its
// arguments.
private void DetectOptionals(RecordFormatInfo option, string[][] data)
{
    // Intentionally left empty.
}
// Placeholder for optional-field detection. Nothing is implemented yet, so
// `option` and `data` are left untouched.
private void DetectOptionals(RecordFormatInfo option, string[][] data)
{
    // TODO: Try to detect optional fields.
}
// Hook named for quote detection. Fixed-length builders are skipped, and no
// logic exists yet for the other builders, so the method has no effect.
private void DetectQuoted(RecordFormatInfo format, string[][] data)
{
    if (!(format.ClassBuilder is FixedLengthClassBuilder))
    {
        // Nothing is implemented for non-fixed-length builders yet.
    }
}
// Stub for optional-field detection; the body contains no statements, so the
// arguments are neither read nor changed.
private void DetectOptionals(RecordFormatInfo option, string[][] data)
{
    // TODO: Try to detect optional fields.
}
// Placeholder for field-type detection. Nothing is implemented yet, so
// `format` and `data` are left untouched.
private void DetectTypes(RecordFormatInfo format, string[][] data)
{
    // TODO: Try to detect possible formats (mostly numbers or dates).
}
// FIXED LENGTH
// Appends a single fixed-length candidate to `res`.
// Confidence is 100 * (1 - Deviation / Avg), floored at 0, using the figures
// returned by Indicators.CalculateAsFixedSize(data). The fields themselves are
// produced by CreateFixedLengthFields.
private void CreateFixedLengthOptions(string[][] data, List<RecordFormatInfo> res)
{
    var format = new RecordFormatInfo();
    var stats = Indicators.CalculateAsFixedSize(data);

    // Guard the division: with a zero or NaN average the ratio is NaN, and
    // the result of casting NaN to int is unspecified. Such input gets 0.
    if (stats.Avg > 0)
    {
        format.mConfidence = (int)(Math.Max(0, 1 - stats.Deviation / stats.Avg) * 100);
    }
    else
    {
        format.mConfidence = 0;
    }

    var builder = new FixedLengthClassBuilder("AutoDetectedClass");
    CreateFixedLengthFields(data, builder);

    format.mClassBuilder = builder;
    res.Add(format);
}
// Stub for field-type detection; the body contains no statements, so the
// arguments are neither read nor changed.
private void DetectTypes(RecordFormatInfo format, string[][] data)
{
    // TODO: Try to detect possible formats (mostly numbers or dates).
}
// Currently a no-op: the body is empty, so neither `format` nor `data` is
// read or modified.
private void DetectTypes(RecordFormatInfo format, string[][] data)
{
    // Intentionally left empty.
}
// No-op at present: the method has an empty body and touches neither of its
// arguments.
private void DetectTypes(RecordFormatInfo format, string[][] data)
{
    // Intentionally left empty.
}