/// <summary>
/// Analyzes a CSV text and tries to figure out separators, quote chars etc
/// </summary>
/// <remarks>
/// Detection is attempted in this order:
/// 1. <c>DetectW3C</c>; a non-null result is returned as-is.
/// 2. <c>XmlSettings.TryAnalyze</c>, only when <c>Main.Settings.ParseXmlFiles</c> is set.
/// 3. Per-line character frequency analysis, done twice: once over every character and once
///    over only the characters that are outside double quotes. The variances of both passes
///    are handed to <c>GetSeparatorFromVariance</c>, and the pass that reports the lower
///    uncertainty decides the separator.
/// 4. If no separator was found, a fixed-width layout is guessed from the columns that are
///    part of a run of two or more spaces on every sampled line.
/// </remarks>
/// <param name="csvString">The text to analyze.</param>
/// <returns>
/// The detected settings. When nothing could be detected the returned settings have the
/// default separator (<c>default(char)</c>) and no field widths assigned by this method.
/// </returns>
public static CsvSettings Analyze(string csvString)
{
    var result = DetectW3C(csvString);
    if (result != null)
    {
        return result;
    }

    if (Main.Settings.ParseXmlFiles && XmlSettings.TryAnalyze(csvString, out var xmlSettings))
    {
        return xmlSettings;
    }

    // First do a letter frequency analysis on each row
    int lineCount = 0, linesQuoted = 0;
    var frequencies = new List<Dictionary<char, int>>();
    var occurrences = new Dictionary<char, int>();
    var frequenciesQuoted = new List<Dictionary<char, int>>();
    var occurrencesQuoted = new Dictionary<char, int>();
    var bigSpaces = new Dictionary<int, int>();
    var inQuotes = false;
    var letterFrequencyQuoted = new Dictionary<char, int>();

    using (var reader = new StringReader(csvString))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            var letterFrequency = new Dictionary<char, int>();
            int spaces = 0, i = 0;
            foreach (var c in line)
            {
                letterFrequency.Increase(c);
                occurrences.Increase(c);

                if (c == '"')
                {
                    inQuotes = !inQuotes;
                }
                else if (!inQuotes)
                {
                    // Only characters outside quotes count in the quote-aware pass
                    letterFrequencyQuoted.Increase(c);
                    occurrencesQuoted.Increase(c);
                }

                if (c == ' ')
                {
                    // Record every column that belongs to a run of at least two spaces
                    if (++spaces >= 2)
                    {
                        bigSpaces.Increase(i);
                    }
                }
                else
                {
                    spaces = 0;
                }

                i++;
            }

            frequencies.Add(letterFrequency);

            // A quoted value may span several physical lines; the quote-aware sample is
            // only closed once we are outside quotes again.
            if (!inQuotes)
            {
                frequenciesQuoted.Add(letterFrequencyQuoted);
                letterFrequencyQuoted = new Dictionary<char, int>();
                linesQuoted++;
            }

            // Post-increment compare: sampling stops after the 22nd line has been processed
            if (lineCount++ > 20)
            {
                break;
            }
        }
    }

    // Then check the variance on the frequency of each char
    var variances = ComputeVariances(occurrences, frequencies, lineCount);
    var variancesQuoted = ComputeVariances(occurrencesQuoted, frequenciesQuoted, linesQuoted);

    // The char with lowest variance is most likely the separator
    result = new CsvSettings
    {
        Separator = GetSeparatorFromVariance(variances, occurrences, lineCount, out var uncertainty)
    };
    var separatorQuoted = GetSeparatorFromVariance(variancesQuoted, occurrencesQuoted, linesQuoted, out var uncertaintyQuoted);
    if (uncertaintyQuoted < uncertainty)
    {
        result.Separator = separatorQuoted;
    }
    else if (uncertainty < uncertaintyQuoted || (uncertainty == uncertaintyQuoted && lineCount > linesQuoted)) // It was better ignoring quotes!
    {
        result.TextQualifier = '\0';
    }

    if (result.Separator != default(char))
    {
        return result;
    }

    // Failed to detect separator. Could it be a fixed-width file?
    // Keep only the columns that were inside a big space on every sampled line.
    var commonSpace = bigSpaces.Where(x => x.Value == lineCount).Select(x => x.Key).OrderBy(x => x);
    var lastValue = 0;
    var lastStart = 0;
    var foundFieldWidths = new List<int>();
    foreach (var space in commonSpace)
    {
        // A gap in the column sequence marks the start of a new run of spaces
        if (space != lastValue + 1)
        {
            foundFieldWidths.Add(space - lastStart);
            lastStart = space;
        }

        lastValue = space;
    }

    if (foundFieldWidths.Count < 3)
    {
        return result; // unlikely fixed width
    }

    foundFieldWidths.Add(-1); // Last column gets "the rest"
    result.FieldWidths = foundFieldWidths;
    return result;

    // Computes, for every character in totals, the variance of its per-sample count.
    // The same sampleCount is used for the mean and for the final division, so the
    // quote-aware pass is normalised by its own number of samples.
    Dictionary<char, float> ComputeVariances(
        Dictionary<char, int> totals,
        List<Dictionary<char, int>> samples,
        int sampleCount)
    {
        var computed = new Dictionary<char, float>();
        foreach (var total in totals)
        {
            var mean = (float)total.Value / sampleCount;
            float variance = 0;
            foreach (var sample in samples)
            {
                // A character missing from a sample has a count of 0 (the out default)
                sample.TryGetValue(total.Key, out var count);
                variance += (count - mean) * (count - mean);
            }

            // With no completed samples the sum is 0; leave it at 0 instead of producing 0/0 = NaN
            if (sampleCount > 0)
            {
                variance /= sampleCount;
            }

            computed.Add(total.Key, variance);
        }

        return computed;
    }
}
/// <summary>
/// Scans <paramref name="text"/> as XML and looks for a repeated element that could act as
/// a "row", using its distinct direct child element names as columns.
/// </summary>
/// <remarks>
/// Element names are tracked case-insensitively by local name. Reading stops silently at the
/// first <see cref="XmlException"/>; whatever was collected up to that point is still used.
/// The chosen element is the one with the highest occurrence count among those that occur
/// more than once and have more than one distinct child element name.
/// </remarks>
/// <param name="text">The text to scan.</param>
/// <param name="result">
/// Settings built from the chosen element name and its child element names, or
/// <c>null</c> when no suitable element was found.
/// </param>
/// <returns><c>true</c> when a suitable element was found; otherwise <c>false</c>.</returns>
public static bool TryAnalyze(string text, out XmlSettings result)
{
    var factsByName = new Dictionary<string, ElementFacts>(StringComparer.OrdinalIgnoreCase);

    using (var textReader = new StringReader(text))
    using (var xml = XmlReader.Create(textReader))
    {
        // Names of the currently open (non-empty) elements, innermost on top
        var openElements = new Stack<string>();
        try
        {
            while (xml.Read())
            {
                var nodeType = xml.NodeType;

                if (nodeType == XmlNodeType.EndElement)
                {
                    openElements.Pop();
                    continue;
                }

                if (nodeType != XmlNodeType.Element)
                {
                    // Text and every other node type carry no structural information here
                    continue;
                }

                // Register this element as a child of the enclosing element
                if (xml.Depth != 0)
                {
                    var children = factsByName[openElements.Peek()].SubElements;
                    if (!children.Contains(xml.LocalName))
                    {
                        children.Add(xml.LocalName);
                    }
                }

                ElementFacts facts;
                if (!factsByName.TryGetValue(xml.LocalName, out facts))
                {
                    facts = new ElementFacts { Name = xml.LocalName };
                    factsByName.Add(facts.Name, facts);
                }

                facts.Count++;

                // Self-closing elements produce no EndElement node, so they are never pushed
                if (!xml.IsEmptyElement)
                {
                    openElements.Push(xml.LocalName);
                }
            }
        }
        catch (XmlException)
        {
            // Stop reading :/
        }
    }

    var candidates =
        from facts in factsByName.Values
        where facts.Count > 1 && facts.SubElements.Count > 1
        orderby facts.Count descending
        select facts;
    var rowElement = candidates.FirstOrDefault();

    if (rowElement == null)
    {
        result = null;
        return false;
    }

    Debug.WriteLine($"Found line type {rowElement.Name} {rowElement.Count} lines)");
    Debug.WriteLine("Found columns: " + string.Join(", ", rowElement.SubElements));

    result = new XmlSettings(rowElement.Name, rowElement.SubElements.ToArray());
    return true;
}