// Given a list of duplicates and their common relative path, update the corresponding duplicates in the database // We do this by getting the IDs of duplicates in the database, where we update each database by ID to a duplicate. // If there is a mismatch in the number of duplicates in the database vs. in the CSV file, we just update whatever does match. private static string UpdateDuplicatesInDatabase(FileDatabase fileDatabase, List <Dictionary <string, string> > duplicatesDictionaryList, string relativePath, string file) { List <ColumnTuplesWithWhere> imagesToUpdate = new List <ColumnTuplesWithWhere>(); string errorMessage = String.Empty; // Find THE IDs of ImageRows with those RelativePath / File values List <long> duplicateIDS = fileDatabase.SelectFilesByRelativePathAndFileName(relativePath, file); if (duplicateIDS.Count != duplicatesDictionaryList.Count) { string dbEntry = duplicateIDS.Count == 1 ? "entry" : "entries"; string csvEntry = duplicatesDictionaryList.Count == 1 ? "entry" : "entries"; errorMessage = String.Format("duplicate entry mismatch for {0}: {1} database {2} vs. {3} CSV {4}.", Path.Combine(relativePath, file), duplicateIDS.Count, dbEntry, duplicatesDictionaryList.Count, csvEntry); } int idIndex = 0; foreach (Dictionary <string, string> rowDict in duplicatesDictionaryList) { if (idIndex >= duplicateIDS.Count) { break; } // Process each row ColumnTuplesWithWhere imageToUpdate = new ColumnTuplesWithWhere(); foreach (string header in rowDict.Keys) { ControlRow controlRow = fileDatabase.GetControlFromTemplateTable(header); // process each column but only if its off the specific type if (controlRow.Type == Constant.Control.Flag || controlRow.Type != Constant.DatabaseColumn.DeleteFlag || controlRow.Type == Constant.Control.Counter || controlRow.Type == Constant.Control.FixedChoice || controlRow.Type == Constant.DatabaseColumn.ImageQuality ) { imageToUpdate.Columns.Add(new ColumnTuple(header, rowDict[header])); } } // Add to the query only if there are columns to add! if (imageToUpdate.Columns.Count > 0) { imageToUpdate.SetWhere(duplicateIDS[idIndex]); imagesToUpdate.Add(imageToUpdate); } idIndex++; } if (imagesToUpdate.Count > 0) { fileDatabase.UpdateFiles(imagesToUpdate); } return(errorMessage); }
// Build a ColumnTuplesWithWhere containing the stock column values from the current image row // Where identifies the ID of the current image row - note that this is done in the GetDateTimeColumnTuples() public override ColumnTuplesWithWhere CreateColumnTuplesWithWhereByID() { ColumnTuplesWithWhere columnTuples = this.GetDateTimeColumnTuples(); columnTuples.Columns.Add(new ColumnTuple(Constant.DatabaseColumn.File, this.File)); columnTuples.Columns.Add(new ColumnTuple(Constant.DatabaseColumn.ImageQuality, this.ImageQuality.ToString())); columnTuples.Columns.Add(new ColumnTuple(Constant.DatabaseColumn.Folder, this.Folder)); columnTuples.Columns.Add(new ColumnTuple(Constant.DatabaseColumn.RelativePath, this.RelativePath)); return(columnTuples); }
// Try importing a CSV file, checking its headers and values against the template's DataLabels and data types. // Return a list of errors if needed. public static async Task <Tuple <bool, List <string> > > TryImportFromCsv(string filePath, FileDatabase fileDatabase) { // Set up a progress handler that will update the progress bar Progress <ProgressBarArguments> progressHandler = new Progress <ProgressBarArguments>(value => { // Update the progress bar CsvReaderWriter.UpdateProgressBar(GlobalReferences.BusyCancelIndicator, value.PercentDone, value.Message, value.IsCancelEnabled, value.IsIndeterminate); }); IProgress <ProgressBarArguments> progress = progressHandler; bool abort = false; List <string> importErrors = new List <string>(); return(await Task.Run(() => { progress.Report(new ProgressBarArguments(0, "Reading the CSV file. Please wait", false, true)); List <List <string> > parsedFile = ReadAndParseCSVFile(filePath); if (parsedFile == null) { // Could not open the file importErrors.Add(String.Format("The file '{0}' could not be read. To check: Is opened by another application? Is it a valid CSV file?", Path.GetFileName(filePath))); return new Tuple <bool, List <string> >(false, importErrors); } if (parsedFile.Count < 2) { // The CSV file is empty or only contains a header row importErrors.Add(String.Format("The file '{0}' does not contain any data.", Path.GetFileName(filePath))); return new Tuple <bool, List <string> >(false, importErrors); } List <string> dataLabels = fileDatabase.GetDataLabelsExceptIDInSpreadsheetOrder(); // Get the header (and remove any empty trailing headers from the list) List <string> dataLabelsFromHeader = parsedFile[0].Where(s => !string.IsNullOrWhiteSpace(s)).Distinct().ToList(); // validate .csv file headers against the database List <string> dataLabelsInHeaderButNotFileDatabase = dataLabelsFromHeader.Except(dataLabels).ToList(); // File - Required datalabel and contents as we can't update the file's data row without it. if (dataLabelsFromHeader.Contains(Constant.DatabaseColumn.File) == false) { importErrors.Add(String.Format("A '{0}' column containing matching file names to your images is required to do the update.", Constant.DatabaseColumn.File)); abort = true; } // Required: the column headers must exist in the template as valid DataLabels // Note: could do this as a warning rather than as an abort, but... foreach (string dataLabel in dataLabelsInHeaderButNotFileDatabase) { importErrors.Add(String.Format("The column heading '{0}' in the CSV file does not match any DataLabel in the template.", dataLabel)); abort = true; } if (abort) { // We failed. abort. return new Tuple <bool, List <string> >(false, importErrors); } // Create a List of all data rows, where each row is a dictionary containing the header and that row's valued for the header List <Dictionary <string, string> > rowDictionaryList = new List <Dictionary <string, string> >(); int rowNumber = 0; int numberOfHeaders = dataLabelsFromHeader.Count; foreach (List <string> parsedRow in parsedFile) { // For each data row rowNumber++; if (rowNumber == 1) { // Skip the 1st header row continue; } // for this row, create a dictionary of matching the CSV column Header and that column's value Dictionary <string, string> rowDictionary = new Dictionary <string, string>(); for (int i = 0; i < numberOfHeaders; i++) { string valueToAdd = (i < parsedRow.Count) ? parsedRow[i] : String.Empty; rowDictionary.Add(dataLabelsFromHeader[i], parsedRow[i]); } rowDictionaryList.Add(rowDictionary); } // Validate each value in the dictionary against the Header type and expected foreach (string header in dataLabelsFromHeader) { ControlRow controlRow = fileDatabase.GetControlFromTemplateTable(header); // We don't need to worry about File-related or Date-related controls as they are mot updated if (controlRow.Type == Constant.Control.Flag || controlRow.Type == Constant.DatabaseColumn.DeleteFlag || controlRow.Type == Constant.Control.Counter || controlRow.Type == Constant.Control.FixedChoice || controlRow.Type == Constant.DatabaseColumn.ImageQuality ) { rowNumber = 0; foreach (Dictionary <string, string> rowDict in rowDictionaryList) { rowNumber++; switch (controlRow.Type) { case Constant.Control.Flag: case Constant.DatabaseColumn.DeleteFlag: if (!Boolean.TryParse(rowDict[header], out _)) { // Flag values must be true or false, but its not. So raise an error importErrors.Add(String.Format("Error in row {1}. {0} values must be true or false, but is '{2}'", header, rowNumber, rowDict[header])); abort = true; } break; case Constant.Control.Counter: if (!String.IsNullOrWhiteSpace(rowDict[header]) && !Int32.TryParse(rowDict[header], out _)) { // Counters must be integers / blanks importErrors.Add(String.Format("Error in row {1}. {0} values must be blank or a number, but is '{2}'", header, rowNumber, rowDict[header])); abort = true; } break; case Constant.Control.FixedChoice: case Constant.DatabaseColumn.ImageQuality: if (controlRow.List.Contains(rowDict[header]) == false) { // Fixed Choices must be in the Choice List importErrors.Add(String.Format("Error in row {1}. {0} values must be in the template's choice list, but '{2}' isn't in it.", header, rowNumber, rowDict[header])); abort = true; } break; default: break; } } } } if (abort) { // We failed. abort. return new Tuple <bool, List <string> >(false, importErrors); } // Create the data structure for the query // Update the database 100 rows at a time. List <ColumnTuplesWithWhere> imagesToUpdate = new List <ColumnTuplesWithWhere>(); foreach (Dictionary <string, string> rowDict in rowDictionaryList) { // Process each row ColumnTuplesWithWhere imageToUpdate = new ColumnTuplesWithWhere(); foreach (string header in rowDict.Keys) { ControlRow controlRow = fileDatabase.GetControlFromTemplateTable(header); // process each column but only if its off the specific type if (controlRow.Type == Constant.Control.Flag || controlRow.Type != Constant.DatabaseColumn.DeleteFlag || controlRow.Type == Constant.Control.Counter || controlRow.Type == Constant.Control.FixedChoice || controlRow.Type == Constant.DatabaseColumn.ImageQuality ) { imageToUpdate.Columns.Add(new ColumnTuple(header, rowDict[header])); } } // Add to the query only if there are columns to add! if (imageToUpdate.Columns.Count > 0) { if (rowDict.ContainsKey(Constant.DatabaseColumn.RelativePath) && !String.IsNullOrWhiteSpace(rowDict[Constant.DatabaseColumn.RelativePath])) { imageToUpdate.SetWhere(rowDict[Constant.DatabaseColumn.RelativePath], rowDict[Constant.DatabaseColumn.File]); } else { imageToUpdate.SetWhere(rowDict[Constant.DatabaseColumn.File]); } imagesToUpdate.Add(imageToUpdate); } // write current batch of updates to database if (imagesToUpdate.Count >= 100) { fileDatabase.UpdateFiles(imagesToUpdate); imagesToUpdate.Clear(); } } // perform any remaining updates fileDatabase.UpdateFiles(imagesToUpdate); return new Tuple <bool, List <string> >(true, importErrors); }).ConfigureAwait(true)); }
// Try importing a CSV file, checking its headers and values against the template's DataLabels and data types. // Duplicates are handled. // Return a list of errors if needed. // However, error reporting is limited to only gross mismatches. // Note that: // - rows in the CSV file that are not in the .ddb file are ignored (not reported - maybe it should be?) // - rows in the .ddb file that are not in the CSV file are ignored // - if there are more duplicate rows for an image in the .csv file than there are in the .ddb file, those extra duplicates are ignored (not reported - maybe it should be?) // - if there are more duplicate rows for an image in the .ddb file than there are in the .csv file, those extra duplicates are ignored (not reported - maybe it should be?) public static async Task <Tuple <bool, List <string> > > TryImportFromCsv(string filePath, FileDatabase fileDatabase) { // Set up a progress handler that will update the progress bar Progress <ProgressBarArguments> progressHandler = new Progress <ProgressBarArguments>(value => { // Update the progress bar CsvReaderWriter.UpdateProgressBar(GlobalReferences.BusyCancelIndicator, value.PercentDone, value.Message, value.IsCancelEnabled, value.IsIndeterminate); }); IProgress <ProgressBarArguments> progress = progressHandler; List <string> importErrors = new List <string>(); return(await Task.Run(() => { const int bulkFilesToHandle = 2000; int processedFilesCount = 0; int totalFilesProcessed = 0; int dateTimeErrors = 0; progress.Report(new ProgressBarArguments(0, "Reading the CSV file. Please wait", false, true)); // PART 1. Read in the CSV file. Return false if there is a problem in reading the CSV file or if the CSV file is empty if (false == TryReadingCSVFile(filePath, out List <List <string> > parsedFile, importErrors)) { return new Tuple <bool, List <string> >(false, importErrors); } // Now that we have a parsed file, get its headers, which we will use as DataLabels List <string> dataLabelsFromCSV = parsedFile[0].Where(s => !string.IsNullOrWhiteSpace(s)).Distinct().ToList(); // Part 2. Abort if required CSV column are missing or there is a problem matching the CSV file headers against the DB headers. if (false == VerifyCSVHeaders(fileDatabase, dataLabelsFromCSV, importErrors)) { return new Tuple <bool, List <string> >(false, importErrors); } // Part 3: Create a List of all data rows, where each row is a dictionary containing the header and that row's valued for the header List <Dictionary <string, string> > rowDictionaryList = GetAllDataRows(dataLabelsFromCSV, parsedFile); // Part 4. For every row, validate each column's data against its type. Abort if the type does not match if (false == VerifyDataInColumns(fileDatabase, dataLabelsFromCSV, rowDictionaryList, importErrors)) { return new Tuple <bool, List <string> >(false, importErrors); } // // Part 4. Check and manage duplicates // // Get a list of duplicates in the database, i.e. rows with both the Same relativePath and File List <string> databaseDuplicates = fileDatabase.GetDistinctRelativePathFileCombinationsDuplicates(); // Sort the rowDictionaryList so that duplicates in the CSV file (with the same relative path / File name) are in order, one after the other. List <Dictionary <string, string> > sortedRowDictionaryList = rowDictionaryList.OrderBy(dict => dict["RelativePath"]).ThenBy(dict => dict["File"]).ToList(); int sortedRowDictionaryListCount = sortedRowDictionaryList.Count; // Create the data structure for the query List <ColumnTuplesWithWhere> imagesToUpdate = new List <ColumnTuplesWithWhere>(); // Handle duplicates and more int nextRowIndex = 0; string currentPath = String.Empty; // the path of the current row string examinedPath = String.Empty; // the path of a surrounding row currently being examined to see if its a duplicate string duplicatePath = String.Empty; // a duplicate was identified, and this holds the duplicate path List <Dictionary <string, string> > duplicatesDictionaryList = new List <Dictionary <string, string> >(); foreach (Dictionary <string, string> rowDict in sortedRowDictionaryList) { // For every row... nextRowIndex++; currentPath = Path.Combine(rowDict[Constant.DatabaseColumn.RelativePath], rowDict[Constant.DatabaseColumn.File]); #region Handle duplicates // Duplicates are special cases, where we have to update each set of duplicates separately as a chunk. // To begin, check if its a duplicate, which occurs if the path (RelativePath/File) is identical if (currentPath == duplicatePath) { // we are in the middle of a sequence, and this record has the same path as the previously identified duplicate. // Thus the current record has to be a duplicate. // Add it to the list. duplicatesDictionaryList.Add(rowDict); // A check if we are at the end of the CSV file - this catches the condition where the very last entry in the sorted csv file is a duplicate if (nextRowIndex >= sortedRowDictionaryListCount) { string error = UpdateDuplicatesInDatabase(fileDatabase, duplicatesDictionaryList, Path.GetDirectoryName(duplicatePath), Path.GetFileName(duplicatePath)); if (false == String.IsNullOrEmpty(error)) { importErrors.Add(error); } duplicatesDictionaryList.Clear(); } continue; } else { // Check if we are at the end of a duplicate sequence if (duplicatesDictionaryList.Count > 0) { // This entry marks the end of a sequence as the paths aren't equal but we have duplicates. Process the prior sequence string error = UpdateDuplicatesInDatabase(fileDatabase, duplicatesDictionaryList, Path.GetDirectoryName(duplicatePath), Path.GetFileName(duplicatePath)); if (false == String.IsNullOrEmpty(error)) { importErrors.Add(error); } duplicatesDictionaryList.Clear(); } // We are either not in a sequence, or we completed the sequence. So we need to manage the current entry. if (nextRowIndex < sortedRowDictionaryListCount) { // We aren't currently in a sequence. Determine if the current entry is a singleton or the first duplicate in a sequence by checking its path against the next record. // If it is a duplicate, add it to the list. Dictionary <string, string> nextRow = sortedRowDictionaryList[nextRowIndex]; examinedPath = Path.Combine(nextRow[Constant.DatabaseColumn.RelativePath], nextRow[Constant.DatabaseColumn.File]); if (examinedPath == currentPath) { // Yup, its the beginning of a sequence. duplicatePath = currentPath; duplicatesDictionaryList.Clear(); duplicatesDictionaryList.Add(rowDict); continue; } else { // It must be singleton duplicatePath = String.Empty; if (databaseDuplicates.Contains(currentPath)) { // But, if the database contains a duplicate with the same relativePath/File, then we want to update just the first database duplicate, rather than update all those // database duplicates with the same value (if we let it fall thorugh duplicatesDictionaryList.Add(rowDict); string error = UpdateDuplicatesInDatabase(fileDatabase, duplicatesDictionaryList, Path.GetDirectoryName(currentPath), Path.GetFileName(currentPath)); if (false == String.IsNullOrEmpty(error)) { importErrors.Add(error); } duplicatesDictionaryList.Clear(); continue; } } } } #endregion Handle duplicates #region Process each column in a row by its header type // Process each non-duplicate row // Note that we never update: // - Path-related fields (File, RelativePath, Folder) // - Date and Time-related fields (DateTime, Date, Time, UtcOffset ColumnTuplesWithWhere imageToUpdate = new ColumnTuplesWithWhere(); CultureInfo provider = CultureInfo.InvariantCulture; DateTime datePortion = DateTime.MinValue; DateTime timePortion = DateTime.MinValue; DateTime dateTime = DateTime.MinValue; foreach (string header in rowDict.Keys) { // For every column ... ControlRow controlRow = fileDatabase.GetControlFromTemplateTable(header); // process each column but only if its of the specific type if (IsStandardColumn(controlRow.Type)) { imageToUpdate.Columns.Add(new ColumnTuple(header, rowDict[header])); } else { // Its not a standard control, so check if its a date/time control and handle that as these are special cases if (controlRow.Type == Constant.DatabaseColumn.DateTime) { string strDateTime = rowDict[header]; if (DateTime.TryParseExact(strDateTime, Constant.Time.DateTimeCSVWithoutTSeparator, provider, DateTimeStyles.None, out dateTime)) { // Standard DateTime // System.Diagnostics.Debug.Print("Standard: " + dateTime.ToString()); } else if (DateTime.TryParseExact(strDateTime, Constant.Time.DateTimeCSVWithTSeparator, provider, DateTimeStyles.None, out dateTime)) { // Standard DateTime wit T separator // System.Diagnostics.Debug.Print("StandardT: " + dateTime.ToString()); } } else if (controlRow.Type == Constant.DatabaseColumn.Date) { // Date only string strDateTime = rowDict[header]; if (DateTime.TryParseExact(strDateTime, Constant.Time.DateFormat, provider, DateTimeStyles.None, out DateTime tempDateTime)) { datePortion = tempDateTime; } } else if (controlRow.Type == Constant.DatabaseColumn.Time) { // Time only string strDateTime = rowDict[header]; if (DateTime.TryParseExact(strDateTime, Constant.Time.TimeFormat, provider, DateTimeStyles.None, out DateTime tempDateTime)) { //System.Diagnostics.Debug.Print("Time only: " + tempDateTime.ToString()); timePortion = tempDateTime; } } } } #endregion Process each column by its header type // We've now looked at all the columns in a row, so continue processing that row as needed totalFilesProcessed++; if (dateTime != DateTime.MinValue || (datePortion != DateTime.MinValue && timePortion != DateTime.MinValue)) { // If the separate date and time fields were used, update dateTime from them if (datePortion != DateTime.MinValue && timePortion != DateTime.MinValue) { // We have a valid separate date and time. Combine it. dateTime = datePortion.Date + timePortion.TimeOfDay; } // Because we expect a UTC date/time, set its kind dateTime = DateTime.SpecifyKind(dateTime, DateTimeKind.Utc); // We should now have a valid dateTime. Add it to the database. // Note that this resets UtcOffset to 0, as its recorded in local time imageToUpdate.Columns.Add(new ColumnTuple(Constant.DatabaseColumn.DateTime, dateTime)); imageToUpdate.Columns.Add(new ColumnTuple(Constant.DatabaseColumn.UtcOffset, new TimeSpan(0))); imageToUpdate.Columns.Add(new ColumnTuple(Constant.DatabaseColumn.Date, DateTimeHandler.ToStringDisplayDate(dateTime))); imageToUpdate.Columns.Add(new ColumnTuple(Constant.DatabaseColumn.Time, DateTimeHandler.ToStringDisplayTime(dateTime))); // System.Diagnostics.Debug.Print("Wrote DateTime: " + dateTime.ToString()); } else { dateTimeErrors++; // importErrors.Add(String.Format("{0}: Could not extract datetime", currentPath)); // System.Diagnostics.Debug.Print("Could not extract datetime"); } dateTime = DateTime.MinValue; datePortion = DateTime.MinValue; timePortion = DateTime.MinValue; // NOTE: We currently do NOT report an error if there is a row in the csv file whose location does not match // the location in the database. We could do this by performing a check before submitting a query, eg. something like: // Select Count (*) from DataTable where File='IMG_00197.JPG' or File='IMG_01406.JPG' or File='XX.JPG' // where we would then compare the counts against the rows. However, this likely has a performance hit, and it doesn't // return the erroneous rows... So its not done yet. // Add to the query only if there are columns to add! if (imageToUpdate.Columns.Count > 0) { if (rowDict.ContainsKey(Constant.DatabaseColumn.RelativePath) && !String.IsNullOrWhiteSpace(rowDict[Constant.DatabaseColumn.RelativePath])) { imageToUpdate.SetWhere(rowDict[Constant.DatabaseColumn.RelativePath], rowDict[Constant.DatabaseColumn.File]); } else { imageToUpdate.SetWhere(rowDict[Constant.DatabaseColumn.File]); } imagesToUpdate.Add(imageToUpdate); } // Write current batch of updates to database. Note that we Update the database every number of rows as specified in bulkFilesToHandle. // We should probably put in a cancellation token somewhere around here... if (imagesToUpdate.Count >= bulkFilesToHandle) { processedFilesCount += bulkFilesToHandle; progress.Report(new ProgressBarArguments(Convert.ToInt32(((double)processedFilesCount) / sortedRowDictionaryListCount * 100.0), String.Format("Processing {0}/{1} files. Please wait...", processedFilesCount, sortedRowDictionaryListCount), false, false)); fileDatabase.UpdateFiles(imagesToUpdate); imagesToUpdate.Clear(); } } // perform any remaining updates if (dateTimeErrors != 0) { // Need to check IF THIS WORKS FOR files with no date-time fields! importErrors.Add(String.Format("The Date/Time was not be updated for {0} / {1} files. ", dateTimeErrors, totalFilesProcessed)); if (dataLabelsFromCSV.Contains(Constant.DatabaseColumn.DateTime) || (dataLabelsFromCSV.Contains(Constant.DatabaseColumn.Date) && dataLabelsFromCSV.Contains(Constant.DatabaseColumn.Time))) { importErrors.Add("- some date / time values in the DateTime, Date or Time columns are in an unexpected format (see manual)"); } else { importErrors.Add("- the CSV file is missing either a DateTime column or both Date and Time columns (this is ok if it was intended)"); } } fileDatabase.UpdateFiles(imagesToUpdate); return new Tuple <bool, List <string> >(true, importErrors); }).ConfigureAwait(true)); }