/// <summary>
        /// Reads the most recent file of the repository folder (the last one once the file
        /// paths are sorted), turns each worksheet row into a <c>RawData</c> record, fills the
        /// days that are missing for each area, converts the daily counts into running totals
        /// and writes the result to the output CSV file.
        /// </summary>
        /// <remarks>
        /// Any exception raised while reading or consolidating the data is logged and swallowed:
        /// whatever was collected up to that point is still written and returned.
        /// </remarks>
        /// <returns>The extracted records, keyed by <c>RawData.ToString()</c>.</returns>
        public Dictionary <string, RawData> Extract()
        {
            // GetFiles already returns an array: sort it directly and keep the last entry (null when the folder is empty)
            var f = Directory.GetFiles(_repositoyFolder).OrderBy(x => x).LastOrDefault();

            var data = new Dictionary <string, RawData>();

            try
            {
                if (f == null)
                {
                    // Fail with an explicit message instead of the opaque ArgumentNullException of File.OpenRead(null)
                    throw new FileNotFoundException($"No file found in \"{_repositoyFolder}\"");
                }

                _logger.Debug($"Reading file: \"{f}\"");
                using (var sr = File.OpenRead(f))
                {
                    using (var xl = ExcelReaderFactory.CreateReader(sr, null))
                    {
                        var conf = new ExcelDataSetConfiguration
                        {
                            ConfigureDataTable = _ => new ExcelDataTableConfiguration
                            {
                                UseHeaderRow = true
                            }
                        };

                        // Only the first worksheet is read, rows being visited by ascending report date
                        var dataSet   = xl.AsDataSet(conf);
                        var dataTable = dataSet.Tables[0];
                        var view      = new DataView(dataTable)
                        {
                            Sort = "DateRep ASC"
                        };

                        // Two layouts are supported; the presence of the "CountryExp" column tells them apart
                        var country = dataTable.Columns.IndexOf("CountryExp");

                        foreach (DataRowView row in view)
                        {
                            var obj = new RawData()
                            {
                                DataProvider = "ECDC"
                            };

                            if (country == -1)
                            {
                                obj.Area      = row.TryGetValue("countriesAndTerritories").Replace("UK", "United Kingdom").Replace("United_Kingdom", "United Kingdom");
                                obj.Date      = new DateTime(Convert.ToInt32(row.TryGetValue("Year")), Convert.ToInt32(row.TryGetValue("Month")), Convert.ToInt32(row.TryGetValue("Day")));
                                obj.Confirmed = Convert.ToInt32(row.TryGetValue("Cases"));
                                obj.Death     = Convert.ToInt32(row.TryGetValue("Deaths"));
                            }
                            else
                            {
                                // NOTE(review): Confirmed and Death are not assigned for this layout - confirm this is intended
                                obj.Area = row.TryGetValue("CountryExp");
                                obj.Date = Convert.ToDateTime(row.TryGetValue("DateRep"));
                            }

                            // First record wins when two rows produce the same key
                            var key = obj.ToString();
                            if (!data.ContainsKey(key))
                            {
                                data.Add(key, obj);
                            }
                        }
                    }
                }

                // The grouping is materialized on the first iteration, so adding to "data" inside the loop is safe
                foreach (var area in data.Values.GroupBy(x => x.Area).OrderBy(x => x.Key))
                {
                    _logger.Debug($"Processing {area.Key}");

                    var missingData = new List <RawData>();
                    var mindate     = area.Min(x => x.Date);
                    var maxdate     = area.Max(x => x.Date);
                    var nbDays      = (maxdate - mindate).Days;

                    // Index the area by date once (first record of a date wins) instead of scanning the group for every day
                    var byDate = area.GroupBy(x => x.Date).ToDictionary(g => g.Key, g => g.First());

                    RawData previous = null;

                    // For some reason some days have no data. Must create missing day.
                    for (int i = 0; i <= nbDays; i++)
                    {
                        var day = mindate.AddDays(i);

                        RawData current;
                        if (!byDate.TryGetValue(day, out current))
                        {
                            current = new RawData {
                                DataProvider = "ECDC", Area = area.Key, Date = day
                            };
                        }

                        // Source figures are daily: add the totals of the day before (none exists for the first day).
                        // The previous record is carried over from the last iteration, so it can never be null here.
                        if (previous != null)
                        {
                            current.Confirmed += previous.Confirmed;
                            current.Death     += previous.Death;
                        }

                        // Records that were not part of the source are added to the result and kept aside for logging
                        var key = current.ToString();
                        if (!data.ContainsKey(key))
                        {
                            missingData.Add(current);
                            data.Add(key, current);
                        }

                        previous = current;
                    }

                    if (missingData.Count > 0 && _logger.IsDebugEnabled)
                    {
                        _logger.Debug($"Missing day created for {area.Key}: \"{string.Join("\", \"", missingData.Select(x => $"{x.Date:yyyy-MM-dd}, c:{x.Confirmed}, d:{x.Death}"))}\"");
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: the error is reported and the records gathered so far are still written below
                _logger.Error(ex.Message);
            }

            _logger.Info(string.Concat("Found ", data.Count, " records"));

            // Output is sorted by area then date and formatted with the French culture
            using (var writer = new StreamWriter(this._outputFile))
            {
                using (var csv = new CsvWriter(writer, CultureInfo.GetCultureInfo("fr-fr")))
                {
                    csv.WriteRecords(data.Values.OrderBy(x => x.Area).ThenBy(x => x.Date));
                }
            }

            return data;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads every "*.csv" file found under the repository folder (sub-folders included),
        /// turns each line into a <c>RawData</c> record dated from the file name, copies each
        /// processed file to the copy folder when it is not already there, and writes all the
        /// records to the output CSV file.
        /// </summary>
        /// <remarks>
        /// Columns that cannot be found and file names that are not valid dates are collected
        /// and reported through the logger once all files are processed. An exception raised
        /// while processing a line is logged with the offending record and then rethrown.
        /// </remarks>
        /// <returns>The extracted records, keyed by <c>RawData.ToString()</c>.</returns>
        public Dictionary <string, RawData> Extract()
        {
            var data          = new Dictionary <string, RawData>();
            var datesErrors   = new HashSet <string>();
            var reportedDates = new HashSet <string>();
            var missingFields = new HashSet <Tuple <string, string> >();

            // Setting columns names aliases
            var areaAlias    = new string[] { "Country_Region", "Country/Region" };
            var subareaAlias = new string[] { "Province_State", "Province/State" };
            var latAlias     = new string[] { "Latitude", "Lat" };
            var lngAlias     = new string[] { "Longitude", "Long_" };

            var dateCulture = CultureInfo.GetCultureInfo("fr-fr");

            var files = Directory.GetFiles(_repositoryFolder, "*.csv", SearchOption.AllDirectories).OrderBy(x => x);

            #region EXTRACT DATA FROM FILES
            foreach (var f in files)
            {
                int counter    = 0;
                int addedLines = 0;

                // The date of the records comes from the file name: it is the same for every line, parse it once
                var  date        = Path.GetFileNameWithoutExtension(f);
                bool dateIsValid = DateTime.TryParseExact(date, _dateFormats, dateCulture, DateTimeStyles.AdjustToUniversal, out DateTime lastUpdate);

                using (var sr = new StreamReader(f))
                {
                    using (var csv = new CustomCsvReader(sr, CultureInfo.InvariantCulture))
                    {
                        // Remember which fields are missing in which file; the set takes care of duplicates
                        csv.Configuration.MissingFieldFound = delegate(string[] tab, int count, ReadingContext ctxt)
                        {
                            missingFields.Add(new Tuple <string, string>(Path.GetFileName(f), string.Join(", ", tab)));
                        };
                        csv.Read();
                        csv.ReadHeader();

                        #region FETCH COLUMNS INDEX
                        var idxArea   = csv.GetFieldIndex(areaAlias);
                        var idxSub    = csv.GetFieldIndex(subareaAlias);
                        var idxAdmin2 = csv.GetFieldIndex("Admin2");
                        var idxConf   = csv.GetFieldIndex("Confirmed");
                        var idxDeath  = csv.GetFieldIndex("Deaths");
                        var idxLat    = csv.GetFieldIndex(latAlias);
                        var idxLng    = csv.GetFieldIndex(lngAlias);
                        #endregion

                        while (csv.Read())
                        {
                            #region PROCESSING A LINE
                            counter++;

                            try
                            {
                                if (dateIsValid)
                                {
                                    // The area may be absent: use a null-safe chain, the line is then skipped by the check below
                                    string area    = csv.GetField(idxArea)?.Replace("Mainland China", "China").Replace("UK", "United Kingdom").Replace("United_Kingdom", "United Kingdom");
                                    string subarea = csv.GetField(idxSub);

                                    if (!string.IsNullOrEmpty(area))
                                    {
                                        var obj = new RawData();
                                        obj.DataProvider = "JohnsHopkins";
                                        obj.Area         = area;
                                        obj.SubArea      = subarea;
                                        obj.Admin2       = csv.GetField(idxAdmin2)?.Replace("Unassigned", string.Empty);
                                        obj.Date         = lastUpdate;
                                        obj.Confirmed    = csv.GetFieldAsInt(idxConf, _us);
                                        obj.Death        = csv.GetFieldAsInt(idxDeath, _us);
                                        obj.Latitude     = csv.GetFieldAsDouble(idxLat, _us);
                                        obj.Longitude    = csv.GetFieldAsDouble(idxLng, _us);

                                        // First record wins when two lines produce the same key
                                        var key = obj.ToString();
                                        if (!data.ContainsKey(key))
                                        {
                                            data.Add(key, obj);
                                            addedLines++;
                                        }
                                    }
                                }
                                else if (reportedDates.Add(date))
                                {
                                    // Report an invalid file name once only, not once per line of the file
                                    datesErrors.Add(string.Concat(csv.Context.Row.ToString().PadLeft(6, '0'), " \"", f, "\": ", date));
                                }
                            }
                            catch (Exception ex)
                            {
                                _logger.Error(string.Concat("\"", f, "\"", ": ", ex.Message));
                                _logger.Debug(string.Join(" ; ", csv.Context.HeaderRecord));
                                _logger.Debug(string.Join(" ; ", csv.Context.Record));

                                // Rethrow without resetting the stack trace
                                throw;
                            }
                            #endregion
                        }
                    }
                }

                var copyFile = Path.Combine(_copyRepositoryFolder, Path.GetFileName(f));
                if (!File.Exists(copyFile))
                {
                    // Make a copy
                    File.Copy(f, copyFile);
                }

                _logger.Debug($"Processing file \"{Path.GetFileName(f)}\", number of lines added/processed: {addedLines}/{counter}");
            }
            #endregion

            #region LOGGING ERRORS
            if (missingFields.Count > 0)
            {
                _logger.Warn($"There are {missingFields.GroupBy(x => x.Item1).Count()} files in which some columns could not be found: \"{string.Join(", ", missingFields.GroupBy(x => x.Item2).Select(x => x.Key))}\"");
                if (_logger.IsDebugEnabled)
                {
                    _logger.Debug(string.Concat("Following fields could not be found:\n", string.Join("\n", missingFields.GroupBy(x => x.Item1).Select(x => $"{x.Key}: \"{string.Join("\", \"", x.Select(y => y.Item2))}\""))));
                }
            }

            if (datesErrors.Count > 0)
            {
                _logger.Warn($"There are {datesErrors.Count} dates not correctly formated");
                if (_logger.IsDebugEnabled)
                {
                    _logger.Debug(string.Concat("Following dates are not in correct format:\n", string.Join("\n", datesErrors)));
                }
            }
            #endregion

            _logger.Info(string.Concat("Found ", data.Count, " records"));

            // Writing output file
            using (var writer = new StreamWriter(_outputFile))
            {
                using (var csv = new CsvWriter(writer, CultureInfo.GetCultureInfo("fr-fr")))
                {
                    csv.WriteRecords(data.Select(x => x.Value));
                }
            }

            return data;
        }