예제 #1
0
        /// <summary>
        /// Analyzes a CSV text and tries to figure out separators, quote chars etc.
        /// </summary>
        /// <remarks>
        /// Detection is attempted in this order: <c>DetectW3C</c>; then, only when
        /// <c>Main.Settings.ParseXmlFiles</c> is set, <c>XmlSettings.TryAnalyze</c>; then a per-line
        /// character frequency analysis of the first lines of the text (at most 22 lines are sampled);
        /// and finally a fixed-width heuristic based on columns that hold runs of spaces on every sampled line.
        /// </remarks>
        /// <param name="csvString">The text to analyze.</param>
        /// <returns>
        /// The settings produced by the first detection step that succeeds. When no separator is found,
        /// the returned settings carry <c>FieldWidths</c> if at least three field widths were detected;
        /// otherwise they are returned with the separator left at its default value.
        /// </returns>
        public static CsvSettings Analyze(string csvString)
        {
            var result = DetectW3C(csvString);
            if (result != null)
            {
                return result;
            }

            if (Main.Settings.ParseXmlFiles && XmlSettings.TryAnalyze(csvString, out var xmlSettings))
            {
                return xmlSettings;
            }

            // First do a letter frequency analysis on each row.
            // Two sets of statistics are kept: one that counts every character of every physical line,
            // and a quote-aware one that skips characters inside double quotes and treats a line break
            // inside quotes as part of the same logical row.
            int lineCount = 0, linesQuoted = 0;
            var frequencies           = new List<Dictionary<char, int>>();
            var occurrences           = new Dictionary<char, int>();
            var frequenciesQuoted     = new List<Dictionary<char, int>>();
            var occurrencesQuoted     = new Dictionary<char, int>();
            var bigSpaces             = new Dictionary<int, int>();
            var inQuotes              = false;
            var letterFrequencyQuoted = new Dictionary<char, int>();

            using (var reader = new StringReader(csvString))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    var letterFrequency = new Dictionary<char, int>();
                    int spaces = 0, i = 0;
                    foreach (var c in line)
                    {
                        // Increase is an extension defined elsewhere; presumably it increments the
                        // counter stored under the given key - confirm against its definition.
                        letterFrequency.Increase(c);
                        occurrences.Increase(c);

                        if (c == '"')
                        {
                            inQuotes = !inQuotes;
                        }
                        else if (!inQuotes)
                        {
                            letterFrequencyQuoted.Increase(c);
                            occurrencesQuoted.Increase(c);
                        }

                        if (c == ' ')
                        {
                            // Record every column that is the second or later space of a run.
                            if (++spaces >= 2)
                            {
                                bigSpaces.Increase(i);
                            }
                        }
                        else
                        {
                            spaces = 0;
                        }
                        i++;
                    }

                    frequencies.Add(letterFrequency);
                    if (!inQuotes)
                    {
                        // The logical row is complete only when the physical line ends outside quotes.
                        frequenciesQuoted.Add(letterFrequencyQuoted);
                        letterFrequencyQuoted = new Dictionary<char, int>();
                        linesQuoted++;
                    }

                    // The post-increment compare lets 22 lines through before sampling stops.
                    if (lineCount++ > 20)
                    {
                        break;
                    }
                }
            }

            // Then check the variance on the frequency of each char
            var variances       = ComputeCharVariances(occurrences, frequencies);
            var variancesQuoted = ComputeCharVariances(occurrencesQuoted, frequenciesQuoted);

            // The char with lowest variance is most likely the separator
            result = new CsvSettings {
                Separator = GetSeparatorFromVariance(variances, occurrences, lineCount, out var uncertainty)
            };
            var separatorQuoted = GetSeparatorFromVariance(variancesQuoted, occurrencesQuoted, linesQuoted, out var uncertaintyQuoted);

            if (uncertaintyQuoted < uncertainty)
            {
                result.Separator = separatorQuoted;
            }
            else if (uncertainty < uncertaintyQuoted || (uncertainty == uncertaintyQuoted && lineCount > linesQuoted)) // It was better ignoring quotes!
            {
                result.TextQualifier = '\0';
            }

            if (result.Separator != default(char))
            {
                return result;
            }

            // Failed to detect separator. Could it be a fixed-width file?
            // Only columns that were part of a space run on every sampled line are considered.
            var commonSpace      = bigSpaces.Where(x => x.Value == lineCount).Select(x => x.Key).OrderBy(x => x);
            var lastValue        = 0;
            var lastStart        = 0;
            var foundFieldWidths = new List<int>();

            foreach (var space in commonSpace)
            {
                // A column that does not directly follow the previous one starts a new space run;
                // the distance from the previous run start is recorded as a field width.
                if (space != lastValue + 1)
                {
                    foundFieldWidths.Add(space - lastStart);
                    lastStart = space;
                }

                lastValue = space;
            }

            if (foundFieldWidths.Count < 3)
            {
                return result;       // unlikely fixed width
            }

            foundFieldWidths.Add(-1); // Last column gets "the rest"
            result.FieldWidths = foundFieldWidths;
            return result;
        }

        /// <summary>
        /// Computes, for every character in <paramref name="occurrences"/>, the population variance of
        /// its per-row count across <paramref name="frequencies"/>.
        /// </summary>
        /// <param name="occurrences">Total count of each character over all rows.</param>
        /// <param name="frequencies">One character-count table per row; a missing character counts as zero.</param>
        /// <returns>The variance of the per-row count for each character in <paramref name="occurrences"/>.</returns>
        private static Dictionary<char, float> ComputeCharVariances(Dictionary<char, int> occurrences, List<Dictionary<char, int>> frequencies)
        {
            // Use the number of row tables for both the mean and the divisor, so the two can never
            // be computed over different row counts.
            var rowCount  = frequencies.Count;
            var variances = new Dictionary<char, float>();

            foreach (var pair in occurrences)
            {
                float variance = 0;
                if (rowCount > 0)
                {
                    var mean = (float)pair.Value / rowCount;
                    foreach (var frequency in frequencies)
                    {
                        frequency.TryGetValue(pair.Key, out var f); // f stays 0 when the char is absent
                        variance += (f - mean) * (f - mean);
                    }
                    variance /= rowCount;
                }
                variances.Add(pair.Key, variance);
            }

            return variances;
        }
예제 #2
0
        /// <summary>
        /// Reads <paramref name="text"/> with an <see cref="XmlReader"/> and looks for a repeated element
        /// with several distinct child elements that can serve as the "line" type.
        /// </summary>
        /// <remarks>
        /// Reading stops silently at the first <see cref="XmlException"/>; whatever was collected up to
        /// that point is still evaluated.
        /// </remarks>
        /// <param name="text">The text to scan as XML.</param>
        /// <param name="result">
        /// Settings built from the name and child element names of the most frequent qualifying element,
        /// or <c>null</c> when no element qualifies.
        /// </param>
        /// <returns><c>true</c> when a qualifying element was found; otherwise <c>false</c>.</returns>
        public static bool TryAnalyze(string text, out XmlSettings result)
        {
            // Element statistics, keyed case-insensitively by local name.
            var factsByName = new Dictionary<string, ElementFacts>(StringComparer.OrdinalIgnoreCase);

            using (var textReader = new StringReader(text))
            {
                using (var xml = XmlReader.Create(textReader))
                {
                    // Names of the currently open (non-empty) elements, innermost on top.
                    var openElements = new Stack<string>();
                    try
                    {
                        while (xml.Read())
                        {
                            var nodeType = xml.NodeType;

                            if (nodeType == XmlNodeType.EndElement)
                            {
                                openElements.Pop();
                                continue;
                            }

                            if (nodeType != XmlNodeType.Element)
                            {
                                // Text and every other node type carry nothing of interest here.
                                continue;
                            }

                            var name = xml.LocalName;

                            // Register this element as a child of the enclosing element, once per name.
                            if (xml.Depth != 0)
                            {
                                var children = factsByName[openElements.Peek()].SubElements;
                                if (!children.Contains(name))
                                {
                                    children.Add(name);
                                }
                            }

                            ElementFacts facts;
                            if (!factsByName.TryGetValue(name, out facts))
                            {
                                facts = new ElementFacts {
                                    Name = name
                                };
                                factsByName.Add(facts.Name, facts);
                            }

                            facts.Count++;

                            // Self-closing elements produce no EndElement node, so they are never pushed.
                            if (!xml.IsEmptyElement)
                            {
                                openElements.Push(name);
                            }
                        }
                    }
                    catch (XmlException)
                    {
                        // Stop reading :/
                    }
                }
            }

            // Candidates occur more than once and have more than one distinct child element;
            // the one with the highest occurrence count wins.
            var candidates = factsByName.Values.Where(e => e.Count > 1 && e.SubElements.Count > 1);
            var winner     = candidates.OrderByDescending(e => e.Count).FirstOrDefault();

            if (winner == null)
            {
                result = null;
                return false;
            }

            Debug.WriteLine($"Found line type {winner.Name} {winner.Count} lines)");
            Debug.WriteLine("Found columns: " + string.Join(", ", winner.SubElements));

            result = new XmlSettings(winner.Name, winner.SubElements.ToArray());
            return true;
        }