コード例 #1
0
        /// <summary>
        /// Opens <paramref name="csvFile"/> as UTF-8, configures a CsvHelper reader and a <c>RecordMap</c>
        /// from the dataset's column settings, and hands the reader to the <c>CsvReader</c> overload of
        /// <c>ToRecords</c> to produce the records for <paramref name="term"/>.
        /// </summary>
        /// <param name="dataset">Dataset configuration supplying column names, measures and categories.</param>
        /// <param name="csvFile">Path of the CSV file to read.</param>
        /// <param name="term">Term forwarded unchanged to the <c>CsvReader</c> overload.</param>
        /// <returns>Whatever the <c>CsvReader</c> overload returns for this file.</returns>
        public static List <Record> ToRecords(this Dataset dataset, string csvFile, string term)
        {
            using (StreamReader source = new StreamReader(csvFile, Encoding.UTF8))
            {
                using (CsvReader parser = new CsvReader(source))
                {
                    string fileName = Path.GetFileName(csvFile);
                    Console.WriteLine($"Processing '{fileName}'");

                    // Header matching must be case insensitive, so that a new file whose column headers differ
                    // only by case (e.g. "Territorial Authority" vs "Territorial authority") still works with
                    // the existing Dataset configuration.
                    //
                    // CsvHelper keeps property names in string-keyed dictionaries that are case sensitive, and
                    // its documentation recommends normalising headers through PrepareHeaderForMatch:
                    //   - https://joshclose.github.io/CsvHelper/getting-started/
                    //   - https://github.com/JoshClose/CsvHelper/issues/1183
                    //
                    // Lower-casing for comparison goes against Microsoft's string comparison guidelines
                    // (https://docs.microsoft.com/en-us/dotnet/csharp/how-to/compare-strings). The alternative
                    // would be to swap CsvHelper's internal dictionaries for case-insensitive ones:
                    //
                    //  - csv.Context.NamedIndexes = new Dictionary<string, List<int>>(StringComparer.OrdinalIgnoreCase);
                    //  - csv.Context.NamedIndexCache = new Dictionary<string, (string, int)>(StringComparer.OrdinalIgnoreCase);
                    //
                    // That works today but depends on CsvHelper internals, so the documented hook is used instead.
                    parser.Configuration.PrepareHeaderForMatch = (header, index) => header.ToLowerInvariant();

                    // Extra or missing headers/properties are tolerated rather than reported.
                    parser.Configuration.HeaderValidated   = null;
                    parser.Configuration.MissingFieldFound = null;

                    // Returning false skips the row; returning true lets the exception propagate.
                    // Type conversion failures are skipped: most datasets leave Value empty and give a null
                    // reason separately, but some put the null reason text into the Value column itself.
                    parser.Configuration.ReadingExceptionOccurred = failure => !(failure is TypeConverterException);

                    var recordMap = new RecordMap()
                                    .Map(record => record.Selector, dataset.Selector ?? "Territorial Authority")
                                    .Map(record => record.Date, dataset.Date);

                    // One ColumnValue per configured measure, keyed by the measure's column name.
                    recordMap.Map(record => record.Measures)
                    .ConvertUsing(row => dataset
                                  .Measure
                                  .Select((measure, position) => new { measure, position })
                                  .ToDictionary(
                                      pair => pair.measure.Name,
                                      pair => new ColumnValue
                                      {
                                          Index     = pair.position,
                                          Column    = pair.measure.Name,
                                          Separator = pair.measure.Separator ?? " — ",
                                          Value     = row.GetField <string>(pair.measure.Name)
                                      }));

                    // Same shape for the configured categories.
                    recordMap.Map(record => record.Categories)
                    .ConvertUsing(row => dataset
                                  .Category
                                  .Select((category, position) => new { category, position })
                                  .ToDictionary(
                                      pair => pair.category.Name,
                                      pair => new ColumnValue
                                      {
                                          Index     = pair.position,
                                          Column    = pair.category.Name,
                                          Separator = pair.category.Separator ?? " — ",
                                          Value     = row.GetField <string>(pair.category.Name)
                                      }));

                    // Value columns fall back to conventional header names when the dataset does not name them.
                    recordMap
                    .Map(record => record.Value, dataset.Value ?? "Value")
                    .Map(record => record.ValueUnit, dataset.ValueUnit ?? "Value Unit")
                    .Map(record => record.ValueLabel, dataset.ValueLabel ?? "Value Label")
                    .Map(record => record.NullReason, dataset.NullReason ?? "Null Reason");

                    parser.Configuration.RegisterClassMap(recordMap);

                    List <Record> records = dataset.ToRecords(parser, term);

                    return(records);
                }
            }
        }
コード例 #2
0
        /// <summary>
        /// Reads every <see cref="Record"/> from <paramref name="csv"/>, keeps the rows whose selector matches
        /// <paramref name="term"/> (or one of its aliases from the dataset's term-mapping file) and that pass the
        /// dataset's measure/category include and exclude lists, then de-duplicates, optionally converts values
        /// to percentages, drops zero values if configured, and sorts the result.
        /// </summary>
        /// <param name="dataset">Dataset configuration driving filtering, labelling and ordering.</param>
        /// <param name="csv">Reader from which <see cref="Record"/> rows are pulled via <c>GetRecords</c>.</param>
        /// <param name="term">Selector term to match (case-insensitive, ordinal).</param>
        /// <returns>The filtered, ordered records; empty when nothing matched.</returns>
        /// <remarks>Writes a per-file summary of read/excluded/included counts to the console.</remarks>
        public static List <Record> ToRecords(this Dataset dataset, CsvReader csv, string term)
        {
            // The supplied term always matches itself; aliases from the optional mapping file are added below.
            HashSet<string> terms = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
            {
                term
            };

            if (!string.IsNullOrEmpty(dataset.TermMapping))
            {
                using (StreamReader file = File.OpenText(dataset.TermMapping))
                {
                    Dictionary<string, List<string>> mapping = new Dictionary<string, List<string>>(StringComparer.OrdinalIgnoreCase);
                    new JsonSerializer().Populate(file, mapping);

                    if (mapping.TryGetValue(term, out List<string> maps))
                    {
                        foreach (string map in maps)
                        {
                            terms.Add(map);
                        }
                    }
                }
            }

            // Exclusions: column name -> set of values that reject a row.
            // Inclusions: column name -> (value -> position in the include list + the include settings).
            // The "has..." flags let the per-row loop skip the lookups entirely when nothing is configured.
            bool hasMeasureExclusions = dataset.Measure.Any(c => c.Exclude != null && c.Exclude.Any());
            Dictionary<string, HashSet<string>> measureExclusions = dataset.Measure
                .Where(c => c.Exclude != null)
                .ToDictionary(
                    c => c.Name,
                    c => new HashSet<string>(c.Exclude, StringComparer.OrdinalIgnoreCase),
                    StringComparer.OrdinalIgnoreCase);

            bool hasMeasureInclusions = dataset.Measure.Any(c => c.Include != null && c.Include.Any());
            Dictionary<string, Dictionary<string, (int Index, Include Include)>> measureInclusions = dataset.Measure
                .Where(c => c.Include != null)
                .ToDictionary(
                    c => c.Name,
                    c => c.Include
                        .Select((include, index) => (Index: index, Include: include))
                        .ToDictionary(ii => ii.Include.Value, i => i, StringComparer.OrdinalIgnoreCase),
                    StringComparer.OrdinalIgnoreCase);

            bool hasCategoryExclusions = dataset.Category.Any(c => c.Exclude != null && c.Exclude.Any());
            Dictionary<string, HashSet<string>> categoryExclusions = dataset.Category
                .Where(c => c.Exclude != null)
                .ToDictionary(
                    c => c.Name,
                    c => new HashSet<string>(c.Exclude, StringComparer.OrdinalIgnoreCase),
                    StringComparer.OrdinalIgnoreCase);

            bool hasCategoryInclusions = dataset.Category.Any(c => c.Include != null && c.Include.Any());
            Dictionary<string, Dictionary<string, (int Index, Include Include)>> categoryInclusions = dataset.Category
                .Where(c => c.Include != null)
                .ToDictionary(
                    c => c.Name,
                    c => c.Include
                        .Select((include, index) => (Index: index, Include: include))
                        .ToDictionary(ii => ii.Include.Value, i => i, StringComparer.OrdinalIgnoreCase),
                    StringComparer.OrdinalIgnoreCase);

            List<Record> set = new List<Record>();

            int countRecords            = 0;
            int countMissingSelector    = 0;
            int countExcludedBySelector = 0;
            int countExcludedByMeasure  = 0;
            int countExcludedByCategory = 0;
            int countExcludedByValue    = 0;

            foreach (Record r in csv.GetRecords<Record>())
            {
                countRecords++;

                if (r.Selector == null)
                {
                    countMissingSelector++;
                    continue;
                }

                // A row is selected when its selector is the term or an alias, or when the term is the
                // dataset's "match everything" term.
                if (!terms.Contains(r.Selector) && !term.Equals(dataset.AllSelectorsMatchTerm, StringComparison.OrdinalIgnoreCase))
                {
                    countExcludedBySelector++;
                    continue;
                }

                if (hasMeasureExclusions && r.Measures.Any(m => measureExclusions.TryGetValue(m.Key, out var exclusions) && exclusions.Contains(m.Value.Value)))
                {
                    countExcludedByMeasure++;
                    continue;
                }

                // Dictionary lookups throw ArgumentNullException on a null key, so a null cell value
                // (presumably a column absent from this row) is treated as "not in the include list".
                if (hasMeasureInclusions && r.Measures.Any(m =>
                        measureInclusions.TryGetValue(m.Key, out var inclusions)
                        && inclusions.Count > 0
                        && (m.Value.Value == null || !inclusions.ContainsKey(m.Value.Value))))
                {
                    countExcludedByMeasure++;
                    continue;
                }

                if (hasCategoryExclusions && r.Categories.Any(m => categoryExclusions.TryGetValue(m.Key, out var exclusions) && exclusions.Contains(m.Value.Value)))
                {
                    countExcludedByCategory++;
                    continue;
                }

                // Same null-key guard as for measures.
                if (hasCategoryInclusions && r.Categories.Any(c =>
                        categoryInclusions.TryGetValue(c.Key, out var inclusions)
                        && inclusions.Count > 0
                        && (c.Value.Value == null || !inclusions.ContainsKey(c.Value.Value))))
                {
                    countExcludedByCategory++;
                    continue;
                }

                // If we've selected this row without a direct match to the supplied term, it's because we've got some mapping somewhere in the pipeline
                // Prepend the term to the row's selector, so we get something like "Auckland — Waitemata DHB"
                if (!term.Equals(r.Selector, StringComparison.OrdinalIgnoreCase))
                {
                    r.Selector = $"{term} — {r.Selector}";
                }

                if (hasMeasureInclusions)
                {
                    foreach (var m in r.Measures)
                    {
                        // Null values cannot be looked up (null key) and have no include entry to copy from.
                        if (m.Value.Value != null
                            && measureInclusions.TryGetValue(m.Key, out var mi)
                            && mi.TryGetValue(m.Value.Value, out var ii))
                        {
                            m.Value.Label = ii.Include.Label;

                            // TODO: Move this out to Measure?
                            r.ConvertToPercentage = r.ConvertToPercentage || ii.Include.ConvertToPercentage;
                        }
                    }
                }

                if (hasCategoryInclusions)
                {
                    foreach (var c in r.Categories)
                    {
                        if (c.Value.Value != null
                            && categoryInclusions.TryGetValue(c.Key, out var ci)
                            && ci.TryGetValue(c.Value.Value, out var ii))
                        {
                            c.Value.Label = ii.Include.Label;

                            // TODO: Move this out to Category?
                            r.ConvertToPercentage = r.ConvertToPercentage || ii.Include.ConvertToPercentage;
                        }
                    }
                }

                // Normalise values reported in thousands of NZD to plain NZD.
                if (r.Value != null && r.ValueLabel == "NZD thousands")
                {
                    r.ValueLabel = "NZD";
                    r.Value      = r.Value * 1000;
                }

                r.Parent    = dataset.Parent;
                r.Uri       = dataset.Source;
                r.DateLabel = dataset.Date;

                set.Add(r);
            }

            // NOTE: this pipeline is lazy. The percentage conversion (which mutates records) and the
            // countExcludedByValue increments only run when ordered.ToList() is called below, which is
            // why the summary is printed after that call.
            var ordered = set
                          // Keep one record per selector/measure/category combination: the latest date wins.
                          .GroupBy(r => new { r.Selector, Measure = r.MeasureFormatted(), Category = r.CategoryFormatted() })
                          .Select(g => g
                                  .OrderByDescending(r => r.Date)
                                  .ThenByDescending(r => r.Value) // Case where Auckland appears twice for the same year because super city
                                  .First()
                                  )
                          // Percentages are computed against the total of each selector/measure group.
                          .GroupBy(r => new { r.Selector, Measure = r.MeasureFormatted() })
                          .Select(g =>
            {
                decimal? total = g.Sum(r => r.Value);

                // Nothing to divide by; leave the group untouched.
                if (total == null || total == 0)
                {
                    return g;
                }

                foreach (Record r in g.Where(r => r.ConvertToPercentage && r.Value != null))
                {
                    r.ValueUnit  = "percentage";
                    r.ValueLabel = r.ValueLabel.ReplaceCaseInsensitive("Number", "%");
                    r.Value      = (r.Value / total) * 100; // Multiply by 100 to stay consistent with other percentage values that are natively 100-based
                }

                return g;
            })
                          .SelectMany(g => g)
                          .Where(r =>
            {
                // If we're excluding zero values, pull them out here
                if (dataset.ExcludeZeroValues && r.Value == 0)
                {
                    countExcludedByValue++;
                    return false;
                }

                return true;
            })
                          .OrderBy(r => r.Selector);

            // Secondary sort keys, in order: position in each measure's include list, each measure's raw
            // value, then position in each category's include list. Rows without an include position get a
            // null key.
            foreach (KeyValuePair<string, Dictionary<string, (int Index, Include Include)>> measureInclusion in measureInclusions)
            {
                ordered = ordered
                          .ThenBy(r =>
                {
                    if (!r.Measures.TryGetValue(measureInclusion.Key, out var cell))
                    {
                        return (int?)null;
                    }

                    // A null cell value has no include position (and would throw as a dictionary key).
                    if (cell.Value == null || !measureInclusion.Value.TryGetValue(cell.Value, out var inclusion))
                    {
                        return (int?)null;
                    }

                    return inclusion.Index;
                });
            }

            foreach (Column column in dataset.Measure)
            {
                ordered = ordered
                          .ThenBy(r => r.Measures[column.Name].Value, StringComparer.OrdinalIgnoreCase);
            }

            foreach (KeyValuePair<string, Dictionary<string, (int Index, Include Include)>> categoryInclusion in categoryInclusions)
            {
                ordered = ordered
                          .ThenBy(r =>
                {
                    if (!r.Categories.TryGetValue(categoryInclusion.Key, out var cell))
                    {
                        return (int?)null;
                    }

                    if (cell.Value == null || !categoryInclusion.Value.TryGetValue(cell.Value, out var inclusion))
                    {
                        return (int?)null;
                    }

                    return inclusion.Index;
                });
            }

            // Single enumeration of the lazy pipeline; side effects described above happen here.
            set = ordered.ToList();

            Console.WriteLine($" - {countRecords} records read");

            if (countMissingSelector > 0)
            {
                Console.WriteLine($" - {countMissingSelector} records missing \"selector\"");
            }

            if (countExcludedBySelector > 0)
            {
                Console.WriteLine($" - {countExcludedBySelector} records excluded by \"selector\"");
            }

            if (countExcludedByMeasure > 0)
            {
                Console.WriteLine($" - {countExcludedByMeasure} records excluded by \"measure\"");
            }

            if (countExcludedByCategory > 0)
            {
                Console.WriteLine($" - {countExcludedByCategory} records excluded by \"category\"");
            }

            if (countExcludedByValue > 0)
            {
                Console.WriteLine($" - {countExcludedByValue} records excluded by \"value\"");
            }

            // Highlight an empty result in red so it stands out in the console output.
            if (set.Count == 0)
            {
                Console.ForegroundColor = ConsoleColor.Red;
            }

            Console.WriteLine($" - {set.Count} records included");
            Console.ResetColor();
            Console.WriteLine();

            return set;
        }