/// <summary>
/// Reads the most recent workbook of the repository folder, accumulates the figures day
/// after day for each area (creating a record for every day absent from the source) and
/// saves the result to a normalized CSV file.
/// </summary>
/// <returns>
/// The extracted records keyed by their <c>ToString()</c> value. When reading or processing
/// fails, the error message is logged and whatever was collected so far is still written to
/// the output file and returned.
/// </returns>
public Dictionary<string, RawData> Extract()
{
    // The last file in ascending name order is the one that gets processed.
    var f = Directory.GetFiles(_repositoyFolder).OrderBy(x => x).LastOrDefault();
    var data = new Dictionary<string, RawData>();

    try
    {
        _logger.Debug($"Reading file: \"{f}\"");

        using (var sr = File.OpenRead(f))
        using (var xl = ExcelReaderFactory.CreateReader(sr, null))
        {
            var conf = new ExcelDataSetConfiguration
            {
                ConfigureDataTable = _ => new ExcelDataTableConfiguration { UseHeaderRow = true }
            };
            var dataSet = xl.AsDataSet(conf);
            var dataTable = dataSet.Tables[0];
            var view = new DataView(dataTable) { Sort = "DateRep ASC" };

            // The presence of a "CountryExp" column selects which column layout is read.
            var country = dataTable.Columns.IndexOf("CountryExp");

            for (int i = 0; i < view.Count; i++)
            {
                var row = view[i];
                var obj = new RawData() { DataProvider = "ECDC" };

                if (country == -1)
                {
                    obj.Area = row.TryGetValue("countriesAndTerritories")
                        .Replace("UK", "United Kingdom")
                        .Replace("United_Kingdom", "United Kingdom");
                    obj.Date = new DateTime(
                        Convert.ToInt32(row.TryGetValue("Year")),
                        Convert.ToInt32(row.TryGetValue("Month")),
                        Convert.ToInt32(row.TryGetValue("Day")));
                    obj.Confirmed = Convert.ToInt32(row.TryGetValue("Cases"));
                    obj.Death = Convert.ToInt32(row.TryGetValue("Deaths"));
                }
                else
                {
                    // NOTE(review): this layout fills neither Confirmed nor Death - confirm this is intended.
                    obj.Area = row.TryGetValue("CountryExp");
                    obj.Date = Convert.ToDateTime(row.TryGetValue("DateRep"));
                }

                // Only the first record seen for a given key is kept.
                var key = obj.ToString();
                if (!data.ContainsKey(key))
                {
                    data.Add(key, obj);
                }
            }
        }

        // ToList() snapshots the groups up front: records are added to "data" inside the loop.
        foreach (var area in data.Values.GroupBy(x => x.Area).OrderBy(x => x.Key).ToList())
        {
            _logger.Debug($"Processing {area.Key}");

            var missingData = new List<RawData>();
            var mindate = area.Min(x => x.Date);
            var maxdate = area.Max(x => x.Date);
            var nbDays = (maxdate - mindate).Days;

            // Index the records of the area by date once instead of scanning the whole group
            // for every day. The first record wins when a date appears more than once.
            var byDate = new Dictionary<DateTime, RawData>();
            foreach (var record in area)
            {
                if (!byDate.ContainsKey(record.Date))
                {
                    byDate.Add(record.Date, record);
                }
            }

            // Record of the previous day (read or created); null only for the first day.
            RawData previous = null;

            // For some reason some days have no data. Must create missing day.
            for (int i = 0; i <= nbDays; i++)
            {
                var day = mindate.AddDays(i);

                RawData current;
                if (!byDate.TryGetValue(day, out current))
                {
                    current = new RawData { DataProvider = "ECDC", Area = area.Key, Date = day };
                }

                // Data is incremental: add the running total of the previous day.
                // The first day has no previous day, so it is left untouched.
                if (previous != null)
                {
                    current.Confirmed += previous.Confirmed;
                    current.Death += previous.Death;
                }

                // A key that is not yet known is registered and reported as a created day.
                var key = current.ToString();
                if (!data.ContainsKey(key))
                {
                    missingData.Add(current);
                    data.Add(key, current);
                }

                // Carrying the record forward avoids looking the previous day up again
                // (and dereferencing null when that lookup would have failed).
                previous = current;
            }

            if (missingData.Count > 0 && _logger.IsDebugEnabled)
            {
                var createdDays = string.Join("\", \"", missingData.Select(x => $"{x.Date:yyyy-MM-dd}, c:{x.Confirmed}, d:{x.Death}"));
                _logger.Debug($"Missing day created for {area.Key}: \"{createdDays}\"");
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: the failure is logged and the records gathered so far are still written below.
        _logger.Error(ex.Message);
    }

    _logger.Info(string.Concat("Found ", data.Count, " records"));

    using (var writer = new StreamWriter(this._outputFile))
    using (var csv = new CsvWriter(writer, CultureInfo.GetCultureInfo("fr-fr")))
    {
        csv.WriteRecords(data.Values.OrderBy(x => x.Area).ThenBy(x => x.Date));
    }

    return data;
}
/// <summary>
/// Reads every "*.csv" file found under the repository folder (sub-folders included),
/// builds one record per line using the file name as the record date, copies each source
/// file to the copy folder when it is not there yet and saves all records to a normalized
/// CSV file.
/// </summary>
/// <returns>
/// The extracted records keyed by their <c>ToString()</c> value; only the first record seen
/// for a given key is kept.
/// </returns>
/// <exception cref="Exception">
/// Any error raised while processing a line is logged with the file name, header and
/// current record, then rethrown unchanged.
/// </exception>
public Dictionary<string, RawData> Extract()
{
    var data = new Dictionary<string, RawData>();
    var datesErrors = new HashSet<string>();
    var missingFields = new HashSet<Tuple<string, string>>();

    // Setting columns names aliases
    var areaAlias = new string[] { "Country_Region", "Country/Region" };
    var subareaAlias = new string[] { "Province_State", "Province/State" };
    var latAlias = new string[] { "Latitude", "Lat" };
    var lngAlias = new string[] { "Longitude", "Long_" };

    var files = Directory.GetFiles(_repositoryFolder, "*.csv", SearchOption.AllDirectories).OrderBy(x => x);

    // Extract data from files
    foreach (var f in files)
    {
        int counter = 0;
        int addedLines = 0;

        // The record date comes from the file name, so it is parsed once per file
        // rather than once per line.
        var date = Path.GetFileNameWithoutExtension(f);
        bool hasValidDate = DateTime.TryParseExact(
            date,
            _dateFormats,
            CultureInfo.GetCultureInfo("fr-fr"),
            DateTimeStyles.AdjustToUniversal,
            out DateTime lastUpdate);

        using (var sr = new StreamReader(f))
        using (var csv = new CustomCsvReader(sr, CultureInfo.InvariantCulture))
        {
            // Missing fields are collected (file name + header names) and reported after all files are read.
            // HashSet.Add already ignores duplicates, no prior Contains check is needed.
            csv.Configuration.MissingFieldFound = delegate(string[] tab, int count, ReadingContext ctxt)
            {
                missingFields.Add(new Tuple<string, string>(Path.GetFileName(f), string.Join(", ", tab)));
            };

            csv.Read();
            csv.ReadHeader();

            // Fetch columns index
            var idxArea = csv.GetFieldIndex(areaAlias);
            var idxSub = csv.GetFieldIndex(subareaAlias);
            var idxAdmin2 = csv.GetFieldIndex("Admin2");
            var idxConf = csv.GetFieldIndex("Confirmed");
            var idxDeath = csv.GetFieldIndex("Deaths");
            var idxLat = csv.GetFieldIndex(latAlias);
            var idxLng = csv.GetFieldIndex(lngAlias);

            while (csv.Read())
            {
                // Processing a line
                counter++;
                try
                {
                    if (!hasValidDate)
                    {
                        // One entry per line: the row number makes each entry unique.
                        datesErrors.Add(string.Concat(csv.Context.Row.ToString().PadLeft(6, '0'), " \"", f, "\": ", date));
                        continue;
                    }

                    // Null-safe like Admin2 below: a null area is skipped by the emptiness check
                    // instead of raising a NullReferenceException.
                    string area = csv.GetField(idxArea)
                        ?.Replace("Mainland China", "China")
                        .Replace("UK", "United Kingdom")
                        .Replace("United_Kingdom", "United Kingdom");
                    string subarea = csv.GetField(idxSub);

                    if (string.IsNullOrEmpty(area))
                    {
                        continue;
                    }

                    var obj = new RawData();
                    obj.DataProvider = "JohnsHopkins";
                    obj.Area = area;
                    obj.SubArea = subarea;
                    obj.Admin2 = csv.GetField(idxAdmin2)?.Replace("Unassigned", string.Empty);
                    obj.Date = lastUpdate;
                    obj.Confirmed = csv.GetFieldAsInt(idxConf, _us);
                    obj.Death = csv.GetFieldAsInt(idxDeath, _us);
                    obj.Latitude = csv.GetFieldAsDouble(idxLat, _us);
                    obj.Longitude = csv.GetFieldAsDouble(idxLng, _us);

                    // Only the first record seen for a given key is kept.
                    var key = obj.ToString();
                    if (!data.ContainsKey(key))
                    {
                        data.Add(key, obj);
                        addedLines++;
                    }
                }
                catch (Exception ex)
                {
                    _logger.Error(string.Concat("\"", f, "\"", ": ", ex.Message));
                    _logger.Debug(string.Join(" ; ", csv.Context.HeaderRecord));
                    _logger.Debug(string.Join(" ; ", csv.Context.Record));

                    // "throw;" keeps the original stack trace ("throw ex;" would reset it).
                    throw;
                }
            }
        }

        var copyFile = Path.Combine(_copyRepositoryFolder, Path.GetFileName(f));
        if (!File.Exists(copyFile))
        {
            // Make a copy
            File.Copy(f, copyFile);
        }

        _logger.Debug($"Processing file \"{Path.GetFileName(f)}\", number of lines added/processed: {addedLines}/{counter}");
    }

    // Logging errors
    if (missingFields.Count > 0)
    {
        var fileCount = missingFields.GroupBy(x => x.Item1).Count();
        var columns = string.Join(", ", missingFields.GroupBy(x => x.Item2).Select(x => x.Key));
        _logger.Warn($"There are {fileCount} files in which some columns could not be found: \"{columns}\"");

        if (_logger.IsDebugEnabled)
        {
            var perFile = missingFields
                .GroupBy(x => x.Item1)
                .Select(x => $"{x.Key}: \"{string.Join("\", \"", x.Select(y => y.Item2))}\"");
            _logger.Debug(string.Concat("Following fields could not be found:\n", string.Join("\n", perFile)));
        }
    }

    if (datesErrors.Count > 0)
    {
        _logger.Warn($"There are {datesErrors.Count} dates not correctly formated");

        if (_logger.IsDebugEnabled)
        {
            _logger.Debug(string.Concat("Following dates are not in correct format:\n", string.Join("\n", datesErrors)));
        }
    }

    _logger.Info(string.Concat("Found ", data.Count, " records"));

    // Writing output file
    using (var writer = new StreamWriter(_outputFile))
    using (var csv = new CsvWriter(writer, CultureInfo.GetCultureInfo("fr-fr")))
    {
        csv.WriteRecords(data.Select(x => x.Value));
    }

    return data;
}