// Try importing a CSV file, checking its headers and values against the template's DataLabels and data types. // Duplicates are handled. // Return a list of errors if needed. // However, error reporting is limited to only gross mismatches. // Note that: // - rows in the CSV file that are not in the .ddb file are ignored (not reported - maybe it should be?) // - rows in the .ddb file that are not in the CSV file are ignored // - if there are more duplicate rows for an image in the .csv file than there are in the .ddb file, those extra duplicates are ignored (not reported - maybe it should be?) // - if there are more duplicate rows for an image in the .ddb file than there are in the .csv file, those extra duplicates are ignored (not reported - maybe it should be?) public static async Task <Tuple <bool, List <string> > > TryImportFromCsv(string filePath, FileDatabase fileDatabase) { // Set up a progress handler that will update the progress bar Progress <ProgressBarArguments> progressHandler = new Progress <ProgressBarArguments>(value => { // Update the progress bar CsvReaderWriter.UpdateProgressBar(GlobalReferences.BusyCancelIndicator, value.PercentDone, value.Message, value.IsCancelEnabled, value.IsIndeterminate); }); IProgress <ProgressBarArguments> progress = progressHandler; List <string> importErrors = new List <string>(); return(await Task.Run(() => { const int bulkFilesToHandle = 2000; int processedFilesCount = 0; int totalFilesProcessed = 0; int dateTimeErrors = 0; progress.Report(new ProgressBarArguments(0, "Reading the CSV file. Please wait", false, true)); // PART 1. Read in the CSV file. Return false if there is a problem in reading the CSV file or if the CSV file is empty if (false == TryReadingCSVFile(filePath, out List <List <string> > parsedFile, importErrors)) { return new Tuple <bool, List <string> >(false, importErrors); } // Now that we have a parsed file, get its headers, which we will use as DataLabels List <string> dataLabelsFromCSV = parsedFile[0].Where(s => !string.IsNullOrWhiteSpace(s)).Distinct().ToList(); // Part 2. Abort if required CSV column are missing or there is a problem matching the CSV file headers against the DB headers. if (false == VerifyCSVHeaders(fileDatabase, dataLabelsFromCSV, importErrors)) { return new Tuple <bool, List <string> >(false, importErrors); } // Part 3: Create a List of all data rows, where each row is a dictionary containing the header and that row's valued for the header List <Dictionary <string, string> > rowDictionaryList = GetAllDataRows(dataLabelsFromCSV, parsedFile); // Part 4. For every row, validate each column's data against its type. Abort if the type does not match if (false == VerifyDataInColumns(fileDatabase, dataLabelsFromCSV, rowDictionaryList, importErrors)) { return new Tuple <bool, List <string> >(false, importErrors); } // // Part 4. Check and manage duplicates // // Get a list of duplicates in the database, i.e. rows with both the Same relativePath and File List <string> databaseDuplicates = fileDatabase.GetDistinctRelativePathFileCombinationsDuplicates(); // Sort the rowDictionaryList so that duplicates in the CSV file (with the same relative path / File name) are in order, one after the other. List <Dictionary <string, string> > sortedRowDictionaryList = rowDictionaryList.OrderBy(dict => dict["RelativePath"]).ThenBy(dict => dict["File"]).ToList(); int sortedRowDictionaryListCount = sortedRowDictionaryList.Count; // Create the data structure for the query List <ColumnTuplesWithWhere> imagesToUpdate = new List <ColumnTuplesWithWhere>(); // Handle duplicates and more int nextRowIndex = 0; string currentPath = String.Empty; // the path of the current row string examinedPath = String.Empty; // the path of a surrounding row currently being examined to see if its a duplicate string duplicatePath = String.Empty; // a duplicate was identified, and this holds the duplicate path List <Dictionary <string, string> > duplicatesDictionaryList = new List <Dictionary <string, string> >(); foreach (Dictionary <string, string> rowDict in sortedRowDictionaryList) { // For every row... nextRowIndex++; currentPath = Path.Combine(rowDict[Constant.DatabaseColumn.RelativePath], rowDict[Constant.DatabaseColumn.File]); #region Handle duplicates // Duplicates are special cases, where we have to update each set of duplicates separately as a chunk. // To begin, check if its a duplicate, which occurs if the path (RelativePath/File) is identical if (currentPath == duplicatePath) { // we are in the middle of a sequence, and this record has the same path as the previously identified duplicate. // Thus the current record has to be a duplicate. // Add it to the list. duplicatesDictionaryList.Add(rowDict); // A check if we are at the end of the CSV file - this catches the condition where the very last entry in the sorted csv file is a duplicate if (nextRowIndex >= sortedRowDictionaryListCount) { string error = UpdateDuplicatesInDatabase(fileDatabase, duplicatesDictionaryList, Path.GetDirectoryName(duplicatePath), Path.GetFileName(duplicatePath)); if (false == String.IsNullOrEmpty(error)) { importErrors.Add(error); } duplicatesDictionaryList.Clear(); } continue; } else { // Check if we are at the end of a duplicate sequence if (duplicatesDictionaryList.Count > 0) { // This entry marks the end of a sequence as the paths aren't equal but we have duplicates. Process the prior sequence string error = UpdateDuplicatesInDatabase(fileDatabase, duplicatesDictionaryList, Path.GetDirectoryName(duplicatePath), Path.GetFileName(duplicatePath)); if (false == String.IsNullOrEmpty(error)) { importErrors.Add(error); } duplicatesDictionaryList.Clear(); } // We are either not in a sequence, or we completed the sequence. So we need to manage the current entry. if (nextRowIndex < sortedRowDictionaryListCount) { // We aren't currently in a sequence. Determine if the current entry is a singleton or the first duplicate in a sequence by checking its path against the next record. // If it is a duplicate, add it to the list. Dictionary <string, string> nextRow = sortedRowDictionaryList[nextRowIndex]; examinedPath = Path.Combine(nextRow[Constant.DatabaseColumn.RelativePath], nextRow[Constant.DatabaseColumn.File]); if (examinedPath == currentPath) { // Yup, its the beginning of a sequence. duplicatePath = currentPath; duplicatesDictionaryList.Clear(); duplicatesDictionaryList.Add(rowDict); continue; } else { // It must be singleton duplicatePath = String.Empty; if (databaseDuplicates.Contains(currentPath)) { // But, if the database contains a duplicate with the same relativePath/File, then we want to update just the first database duplicate, rather than update all those // database duplicates with the same value (if we let it fall thorugh duplicatesDictionaryList.Add(rowDict); string error = UpdateDuplicatesInDatabase(fileDatabase, duplicatesDictionaryList, Path.GetDirectoryName(currentPath), Path.GetFileName(currentPath)); if (false == String.IsNullOrEmpty(error)) { importErrors.Add(error); } duplicatesDictionaryList.Clear(); continue; } } } } #endregion Handle duplicates #region Process each column in a row by its header type // Process each non-duplicate row // Note that we never update: // - Path-related fields (File, RelativePath, Folder) // - Date and Time-related fields (DateTime, Date, Time, UtcOffset ColumnTuplesWithWhere imageToUpdate = new ColumnTuplesWithWhere(); CultureInfo provider = CultureInfo.InvariantCulture; DateTime datePortion = DateTime.MinValue; DateTime timePortion = DateTime.MinValue; DateTime dateTime = DateTime.MinValue; foreach (string header in rowDict.Keys) { // For every column ... ControlRow controlRow = fileDatabase.GetControlFromTemplateTable(header); // process each column but only if its of the specific type if (IsStandardColumn(controlRow.Type)) { imageToUpdate.Columns.Add(new ColumnTuple(header, rowDict[header])); } else { // Its not a standard control, so check if its a date/time control and handle that as these are special cases if (controlRow.Type == Constant.DatabaseColumn.DateTime) { string strDateTime = rowDict[header]; if (DateTime.TryParseExact(strDateTime, Constant.Time.DateTimeCSVWithoutTSeparator, provider, DateTimeStyles.None, out dateTime)) { // Standard DateTime // System.Diagnostics.Debug.Print("Standard: " + dateTime.ToString()); } else if (DateTime.TryParseExact(strDateTime, Constant.Time.DateTimeCSVWithTSeparator, provider, DateTimeStyles.None, out dateTime)) { // Standard DateTime wit T separator // System.Diagnostics.Debug.Print("StandardT: " + dateTime.ToString()); } } else if (controlRow.Type == Constant.DatabaseColumn.Date) { // Date only string strDateTime = rowDict[header]; if (DateTime.TryParseExact(strDateTime, Constant.Time.DateFormat, provider, DateTimeStyles.None, out DateTime tempDateTime)) { datePortion = tempDateTime; } } else if (controlRow.Type == Constant.DatabaseColumn.Time) { // Time only string strDateTime = rowDict[header]; if (DateTime.TryParseExact(strDateTime, Constant.Time.TimeFormat, provider, DateTimeStyles.None, out DateTime tempDateTime)) { //System.Diagnostics.Debug.Print("Time only: " + tempDateTime.ToString()); timePortion = tempDateTime; } } } } #endregion Process each column by its header type // We've now looked at all the columns in a row, so continue processing that row as needed totalFilesProcessed++; if (dateTime != DateTime.MinValue || (datePortion != DateTime.MinValue && timePortion != DateTime.MinValue)) { // If the separate date and time fields were used, update dateTime from them if (datePortion != DateTime.MinValue && timePortion != DateTime.MinValue) { // We have a valid separate date and time. Combine it. dateTime = datePortion.Date + timePortion.TimeOfDay; } // Because we expect a UTC date/time, set its kind dateTime = DateTime.SpecifyKind(dateTime, DateTimeKind.Utc); // We should now have a valid dateTime. Add it to the database. // Note that this resets UtcOffset to 0, as its recorded in local time imageToUpdate.Columns.Add(new ColumnTuple(Constant.DatabaseColumn.DateTime, dateTime)); imageToUpdate.Columns.Add(new ColumnTuple(Constant.DatabaseColumn.UtcOffset, new TimeSpan(0))); imageToUpdate.Columns.Add(new ColumnTuple(Constant.DatabaseColumn.Date, DateTimeHandler.ToStringDisplayDate(dateTime))); imageToUpdate.Columns.Add(new ColumnTuple(Constant.DatabaseColumn.Time, DateTimeHandler.ToStringDisplayTime(dateTime))); // System.Diagnostics.Debug.Print("Wrote DateTime: " + dateTime.ToString()); } else { dateTimeErrors++; // importErrors.Add(String.Format("{0}: Could not extract datetime", currentPath)); // System.Diagnostics.Debug.Print("Could not extract datetime"); } dateTime = DateTime.MinValue; datePortion = DateTime.MinValue; timePortion = DateTime.MinValue; // NOTE: We currently do NOT report an error if there is a row in the csv file whose location does not match // the location in the database. We could do this by performing a check before submitting a query, eg. something like: // Select Count (*) from DataTable where File='IMG_00197.JPG' or File='IMG_01406.JPG' or File='XX.JPG' // where we would then compare the counts against the rows. However, this likely has a performance hit, and it doesn't // return the erroneous rows... So its not done yet. // Add to the query only if there are columns to add! if (imageToUpdate.Columns.Count > 0) { if (rowDict.ContainsKey(Constant.DatabaseColumn.RelativePath) && !String.IsNullOrWhiteSpace(rowDict[Constant.DatabaseColumn.RelativePath])) { imageToUpdate.SetWhere(rowDict[Constant.DatabaseColumn.RelativePath], rowDict[Constant.DatabaseColumn.File]); } else { imageToUpdate.SetWhere(rowDict[Constant.DatabaseColumn.File]); } imagesToUpdate.Add(imageToUpdate); } // Write current batch of updates to database. Note that we Update the database every number of rows as specified in bulkFilesToHandle. // We should probably put in a cancellation token somewhere around here... if (imagesToUpdate.Count >= bulkFilesToHandle) { processedFilesCount += bulkFilesToHandle; progress.Report(new ProgressBarArguments(Convert.ToInt32(((double)processedFilesCount) / sortedRowDictionaryListCount * 100.0), String.Format("Processing {0}/{1} files. Please wait...", processedFilesCount, sortedRowDictionaryListCount), false, false)); fileDatabase.UpdateFiles(imagesToUpdate); imagesToUpdate.Clear(); } } // perform any remaining updates if (dateTimeErrors != 0) { // Need to check IF THIS WORKS FOR files with no date-time fields! importErrors.Add(String.Format("The Date/Time was not be updated for {0} / {1} files. ", dateTimeErrors, totalFilesProcessed)); if (dataLabelsFromCSV.Contains(Constant.DatabaseColumn.DateTime) || (dataLabelsFromCSV.Contains(Constant.DatabaseColumn.Date) && dataLabelsFromCSV.Contains(Constant.DatabaseColumn.Time))) { importErrors.Add("- some date / time values in the DateTime, Date or Time columns are in an unexpected format (see manual)"); } else { importErrors.Add("- the CSV file is missing either a DateTime column or both Date and Time columns (this is ok if it was intended)"); } } fileDatabase.UpdateFiles(imagesToUpdate); return new Tuple <bool, List <string> >(true, importErrors); }).ConfigureAwait(true)); }