/// <summary>
/// Reads an AutoCluster *.xlsx file on a background task.
/// </summary>
/// <param name="fileName">Path of the file to read.</param>
/// <param name="progressData">Progress reporter handed through to the worksheet reader.</param>
/// <returns>
/// <c>(serialized, null)</c> on success; <c>(null, message)</c> when the file type is not supported
/// or when reading throws.
/// </returns>
public async Task<(Serialized input, string errorMessage)> ReadFileAsync(string fileName, ProgressData progressData)
{
    var isSupported = IsSupportedFileType(fileName);
    if (!isSupported)
    {
        return (null, $"{fileName} is not a *.xlsx file");
    }

    // The worksheet reader appends to these collections, so they must exist up front.
    var result = new Serialized
    {
        MatchIndexes = new Dictionary<string, int>(),
        Matches = new List<Match>(),
        Icw = new Dictionary<string, List<int>>(),
    };

    try
    {
        await Task.Run(() => ReadMatchFile(result, fileName, progressData));
        return (result, null);
    }
    catch (Exception ex)
    {
        FileUtils.LogException(ex, false);
        return (null, $"Unexpected error while reading AutoCluster match file: {ex.Message}");
    }
}
/// <summary>
/// Reads a DNAGedcom export consisting of a <c>_Family_Finder_Matches</c> file and an <c>_ICW</c> file
/// that live in the same directory, plus an optional <c>_Family_Finder_Trees</c> file.
/// </summary>
/// <param name="fileName">Path of the selected CSV file; the sibling file names are derived from it.</param>
/// <param name="progressData">Not used by this reader.</param>
/// <returns>
/// <c>(serialized, null)</c> on success; <c>(null, message)</c> when the file type is unsupported,
/// the name cannot be trimmed, one of the two required files is missing, or reading either of them throws.
/// </returns>
public async Task<(Serialized input, string errorMessage)> ReadFileAsync(string fileName, ProgressData progressData)
{
    if (!IsSupportedFileType(fileName))
    {
        return (null, $"{fileName} is not a *.csv file");
    }

    // DNAGedcom saves two files: a match file ending with _Family_Finder_Matches,
    // and an in-common-with file ending with _ICW
    var baseName = GetTrimmedFileName(fileName);
    if (baseName == null)
    {
        return (null, "File name does not end with _Family_Finder_Matches or _ICW");
    }

    var directory = Path.GetDirectoryName(fileName);
    var matchesPath = Path.Combine(directory, $"{baseName}_Family_Finder_Matches.csv");
    var icwPath = Path.Combine(directory, $"{baseName}_ICW.csv");

    var bothFilesExist = File.Exists(matchesPath) && File.Exists(icwPath);
    if (!bothFilesExist)
    {
        return (null, $"Could not find both {matchesPath} and {icwPath}");
    }

    var result = new Serialized();

    try
    {
        await Task.Run(() => ReadMatchFile(result, matchesPath));
    }
    catch (Exception ex)
    {
        FileUtils.LogException(ex, false);
        return (null, $"Unexpected error while reading DNAGedcom match file: {ex.Message}");
    }

    try
    {
        await Task.Run(() => ReadIcwFile(result, icwPath));
    }
    catch (Exception ex)
    {
        FileUtils.LogException(ex, false);
        return (null, $"Unexpected error while reading DNAGedcom icw file: {ex.Message}");
    }

    try
    {
        var treesPath = Path.Combine(directory, $"{baseName}_Family_Finder_Trees.csv");
        await Task.Run(() => ReadTreeFile(result, treesPath));
    }
    catch (Exception)
    {
        // Not a problem if we can't read the tree file
    }

    return (result, null);
}
/// <summary>
/// Reads a DNAGedcom tree CSV file and sets <c>TreeSize</c> on every match in
/// <paramref name="serialized"/> to the number of tree rows whose <c>ResultId</c> equals the match's <c>TestGuid</c>.
/// </summary>
/// <param name="serialized">Already-loaded matches whose <c>TreeSize</c> is overwritten.</param>
/// <param name="treeFile">Path of the tree CSV file.</param>
private static void ReadTreeFile(Serialized serialized, string treeFile)
{
    using (var stream = new FileStream(treeFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    using (var reader = new StreamReader(stream))
    using (var csv = new CsvReader(reader))
    {
        var config = csv.Configuration;
        config.Delimiter = ",";
        config.HeaderValidated = null;
        config.MissingFieldFound = null;
        config.BadDataFound = null;
        config.LineBreakInQuotedFieldIsBadData = false;
        config.RegisterClassMap<DnaGedcomTreeNodeMap>();
        config.PrepareHeaderForMatch = (string header, int index) => header.Replace('_', ' ');

        // Group the tree rows by the result they belong to; rows without a ResultId are ignored.
        var nodesByResultId = csv.GetRecords<DnaGedcomTreeNode>()
            .Where(node => node != null && node.ResultId != null)
            .ToLookup(node => node.ResultId);

        // A lookup returns an empty sequence for an unknown key, so matches without tree rows get a size of 0.
        foreach (var match in serialized.Matches)
        {
            var nodesForMatch = nodesByResultId[match.TestGuid];
            match.TreeSize = nodesForMatch.Count();
        }
    }
}
/// <summary>
/// Reads a DNAGedcom export consisting of an <c>m_</c> match file and an <c>icw_</c> in-common-with file
/// that live in the same directory.
/// </summary>
/// <param name="fileName">Path of the selected CSV file; the sibling file names are derived from it.</param>
/// <param name="progressData">Not used by this reader.</param>
/// <returns>
/// <c>(serialized, null)</c> on success; <c>(null, message)</c> when the file type is unsupported,
/// the name cannot be trimmed, one of the two files is missing, or reading either of them throws.
/// </returns>
public async Task<(Serialized input, string errorMessage)> ReadFileAsync(string fileName, ProgressData progressData)
{
    if (!IsSupportedFileType(fileName))
    {
        return (null, $"{fileName} is not a *.csv file");
    }

    // DNAGedcom saves two files: a match file starting with m_,
    // and an in-common-with file starting with icw_
    var baseName = GetTrimmedFileName(fileName);
    if (baseName == null)
    {
        return (null, "File name does not start with m_ or icw_");
    }

    var directory = Path.GetDirectoryName(fileName);
    var matchesPath = Path.Combine(directory, $"m_{baseName}.csv");
    var icwPath = Path.Combine(directory, $"icw_{baseName}.csv");

    var bothFilesExist = File.Exists(matchesPath) && File.Exists(icwPath);
    if (!bothFilesExist)
    {
        return (null, $"Could not find both {matchesPath} and {icwPath}");
    }

    var result = new Serialized();

    try
    {
        await Task.Run(() => ReadMatchFile(result, matchesPath));
    }
    catch (Exception ex)
    {
        FileUtils.LogException(ex, false);
        return (null, $"Unexpected error while reading DNAGedcom match file: {ex.Message}");
    }

    try
    {
        await Task.Run(() => ReadIcwFile(result, icwPath));
    }
    catch (Exception ex)
    {
        FileUtils.LogException(ex, false);
        return (null, $"Unexpected error while reading DNAGedcom icw file: {ex.Message}");
    }

    return (result, null);
}
/// <summary>
/// Reads a DNAGedcom match CSV file and fills <c>serialized.Matches</c> (de-duplicated by match ID and
/// sorted by descending shared centimorgans) and <c>serialized.MatchIndexes</c> (match ID to position
/// in that sorted list).
/// </summary>
/// <param name="serialized">Object that receives the matches and their indexes.</param>
/// <param name="matchFile">Path of the match CSV file.</param>
private static void ReadMatchFile(Serialized serialized, string matchFile)
{
    using (var fileStream = new FileStream(matchFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    using (var matchReader = new StreamReader(fileStream))
    using (var csv = new CsvReader(matchReader))
    {
        csv.Configuration.Delimiter = ",";
        csv.Configuration.HeaderValidated = null;
        csv.Configuration.MissingFieldFound = null;
        csv.Configuration.BadDataFound = null;
        csv.Configuration.LineBreakInQuotedFieldIsBadData = false;
        csv.Configuration.RegisterClassMap<DnaGedcomMatchMap>();
        csv.Configuration.PrepareHeaderForMatch = (string header, int index) => header.Replace('_', ' ');

        // Translate match properties from DNAGedcom naming to Shared Clustering naming.
        // The query is deliberately sequential: an unordered parallel query would keep an arbitrary
        // duplicate and would not sort ties stably, so the same file could load differently on each run.
        serialized.Matches = csv.GetRecords<DnaGedcomMatch>()
            // A row without a match ID cannot be indexed; skip it instead of failing the whole file.
            .Where(match => match?.MatchId != null)
            .Select(match => new Match
            {
                MatchTestDisplayName = match.Name,
                TestGuid = match.MatchId,
                SharedCentimorgans = GetDouble(match.SharedCm),
                LongestBlock = GetDouble(match.LongestBlock),
            })
            // Do not assume that the DNAGedcom data is free of duplicates; the first occurrence wins.
            .GroupBy(match => match.TestGuid)
            .Select(g => g.First())
            // Do not assume that the DNAGedcom data is already ordered by descending Shared Centimorgans.
            .OrderByDescending(match => match.SharedCentimorgans)
            .ToList();
    }

    // Assign zero-based indexes to the matches sorted by shared centimorgans descending.
    // The IDs are already unique because of the grouping above.
    serialized.MatchIndexes = serialized.Matches
        .Select((match, index) => new { Id = match.TestGuid, Index = index })
        .ToDictionary(pair => pair.Id, pair => pair.Index);
}
private static void ReadMatchFile(Serialized serialized, string matchFile) { using (var fileStream = new FileStream(matchFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) using (var matchReader = new StreamReader(fileStream)) using (var csv = new CsvReader(matchReader)) { csv.Configuration.Delimiter = ","; csv.Configuration.HeaderValidated = null; csv.Configuration.MissingFieldFound = null; csv.Configuration.BadDataFound = null; csv.Configuration.LineBreakInQuotedFieldIsBadData = false; csv.Configuration.RegisterClassMap <DnaGedcomMatchMap>(); csv.Configuration.PrepareHeaderForMatch = (string header, int index) => header.Replace('_', ' '); var dnaGedcomMatches = csv.GetRecords <DnaGedcomMatch>(); // In case DNAGedcom file has data from more than one test, find the test ID with the largest number of matches. var matches = dnaGedcomMatches .Where(match => match != null) .GroupBy(match => match.TestId ?? "") .OrderByDescending(g => g.Count()) .FirstOrDefault(); if (matches == null) { return; } // This is the Test ID for the person taking the test serialized.TestTakerTestId = matches.Key; // Translate match properties from DNAGedcom naming to Shared Clustering naming. serialized.Matches = matches .AsParallel() .Select(match => new Match { MatchTestAdminDisplayName = match.Admin, MatchTestDisplayName = match.Name, TestGuid = match.MatchId, SharedCentimorgans = GetDouble(match.SharedCm), SharedSegments = int.TryParse(match.SharedSegments, out var sharedSegmentsInt) ? sharedSegmentsInt : 0, TreeSize = int.TryParse(match.People, out var peopleInt) ? peopleInt : 0, Starred = bool.TryParse(match.Starred, out var isStarred) && isStarred, HasHint = bool.TryParse(match.Hint, out var hasHint) && hasHint, Note = match.Note, })
/// <summary>
/// Reads a DNAGedcom in-common-with (ICW) CSV file and fills <c>serialized.Icw</c> with, for every match ID,
/// the sorted, duplicate-free indexes (taken from <c>serialized.MatchIndexes</c>) of the matches shared with it.
/// </summary>
/// <param name="serialized">Object whose <c>MatchIndexes</c> are read and whose <c>Icw</c> is replaced.</param>
/// <param name="icwFile">Path of the ICW CSV file.</param>
private static void ReadIcwFile(Serialized serialized, string icwFile)
{
    using (var fileStream = new FileStream(icwFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    using (var icwReader = new StreamReader(fileStream))
    using (var csv = new CsvReader(icwReader))
    {
        csv.Configuration.Delimiter = ",";
        csv.Configuration.HeaderValidated = null;
        csv.Configuration.MissingFieldFound = null;
        csv.Configuration.BadDataFound = null;
        csv.Configuration.LineBreakInQuotedFieldIsBadData = false;
        csv.Configuration.RegisterClassMap<DnaGedcomIcwMap>();
        csv.Configuration.PrepareHeaderForMatch = (string header, int index) => header.Replace('_', ' ');

        // Translate the ICW data.
        // Shared Clustering assumes that every match also matches themselves.
        // DNAGedcom does not include the self-matches in the saved ICW data,
        // so the self-matches need to be added during the translation.
        serialized.Icw = csv.GetRecords<DnaGedcomIcw>()
            // A row without a match ID cannot be used as a dictionary key; skip it instead of failing the whole file.
            .Where(icw => icw?.MatchId != null)
            .GroupBy(icw => icw.MatchId, icw => icw.IcwId)
            .ToDictionary
            (
                g => g.Key,
                g => g.Concat(new[] { g.Key })
                    // Dictionary lookups throw on a null key, so drop rows with a missing ICW ID.
                    .Where(id => id != null)
                    // IDs that are not in the match file map to -1 and are discarded.
                    .Select(id => serialized.MatchIndexes.TryGetValue(id, out var index) ? index : -1)
                    .Where(i => i >= 0)
                    // Repeated rows, or a self-match already present in the file, must not yield repeated indexes.
                    .Distinct()
                    .OrderBy(i => i)
                    .ToList()
            );
    }

    // Also add self-matches to every match that has no shared matches at all.
    foreach (var guidAndIndex in serialized.MatchIndexes)
    {
        if (!serialized.Icw.ContainsKey(guidAndIndex.Key))
        {
            serialized.Icw[guidAndIndex.Key] = new List<int> { guidAndIndex.Value };
        }
    }
}
/// <summary>
/// Reads an AutoCluster *.csv file on a background task.
/// </summary>
/// <param name="fileName">Path of the file to read.</param>
/// <param name="progressData">Not used by this reader.</param>
/// <returns>
/// <c>(serialized, null)</c> on success; <c>(null, message)</c> when the file type is not supported
/// or when reading throws.
/// </returns>
public async Task<(Serialized input, string errorMessage)> ReadFileAsync(string fileName, ProgressData progressData)
{
    var isSupported = IsSupportedFileType(fileName);
    if (!isSupported)
    {
        return (null, $"{fileName} is not a *.csv file");
    }

    var result = new Serialized();

    try
    {
        await Task.Run(() => ReadMatchFile(result, fileName));
        return (result, null);
    }
    catch (Exception ex)
    {
        FileUtils.LogException(ex, false);
        return (null, $"Unexpected error while reading AutoCluster match file: {ex.Message}");
    }
}
/// <summary>
/// Reads the first worksheet of an AutoCluster *.xlsx file and appends every usable row to
/// <c>serialized.Matches</c>, <c>serialized.MatchIndexes</c> and <c>serialized.Icw</c>.
/// The in-common-with data of a row is taken from the cells that have a background color.
/// </summary>
/// <param name="serialized">Target object; its three collections must already be initialized.</param>
/// <param name="matchFile">Path of the *.xlsx file.</param>
/// <param name="progressData">Reset to the number of data rows and incremented once per row.</param>
/// <exception cref="Exception">
/// The "total shared cM" column or the cluster columns cannot be located, the sheet has no data rows,
/// or no row could be read.
/// </exception>
private static void ReadMatchFile(Serialized serialized, string matchFile, ProgressData progressData)
{
    using (var fileStream = new FileStream(matchFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    using (var package = new ExcelPackage(fileStream))
    using (var ws = package.Workbook.Worksheets[1])
    {
        var hyperlinkColumn = 0;
        var totalSharedCmColumn = 0;
        var notesColumn = 0;
        var treeColumn = 0;
        var firstMatchFieldIndex = 0;

        // Find the columns that have interesting data (don't assume specific column numbers)
        for (var col = 1; col < 1000; ++col)
        {
            // The header text may be null for an empty cell, so compare with the null-safe static
            // string.Equals rather than calling Equals on the value.
            var cellValue = ws.Cells[1, col].GetValue<string>();
            if (string.Equals(cellValue, "name", StringComparison.OrdinalIgnoreCase))
            {
                hyperlinkColumn = col;
            }
            else if (string.Equals(cellValue, "total shared cM", StringComparison.OrdinalIgnoreCase))
            {
                totalSharedCmColumn = col;
            }
            else if (string.Equals(cellValue, "notes", StringComparison.OrdinalIgnoreCase))
            {
                notesColumn = col;
            }
            else if (string.Equals(cellValue, "tree", StringComparison.OrdinalIgnoreCase))
            {
                treeColumn = col;
            }

            // The first colored cell in the first data row marks the start of the cluster columns.
            var row2Cell = ws.Cells[2, col];
            if (row2Cell.Style.Fill.BackgroundColor.Rgb != null)
            {
                firstMatchFieldIndex = col;
                break;
            }
        }

        if (totalSharedCmColumn == 0)
        {
            throw new Exception("Total Shared cM column not found.");
        }

        // Without a starting column the ICW range below would begin at the invalid column 0.
        if (firstMatchFieldIndex == 0)
        {
            throw new Exception("Cluster columns not found.");
        }

        // The cluster columns run until the first empty header cell.
        var lastMatchFieldIndex = firstMatchFieldIndex;
        while (ws.Cells[1, lastMatchFieldIndex + 1].Value != null)
        {
            lastMatchFieldIndex++;
        }

        // The data rows run until the first empty "total shared cM" cell.
        var maxRow = 1;
        while (ws.Cells[maxRow + 1, totalSharedCmColumn].Value != null)
        {
            maxRow++;
        }

        if (maxRow == 1)
        {
            throw new Exception("No rows found.");
        }

        progressData.Reset("Loading data.", maxRow - 1);

        for (var row = 2; row <= maxRow; ++row)
        {
            progressData.Increment();

            var resultMatch = new Match();

            if (hyperlinkColumn != 0)
            {
                try
                {
                    // new format
                    var url = ws.Cells[row, hyperlinkColumn].Hyperlink.ToString();
                    var name = ws.Cells[row, hyperlinkColumn].GetValue<string>();
                    var path = url.Split('/');
                    resultMatch.MatchTestDisplayName = name;
                    serialized.TestTakerTestId = path[4];
                    resultMatch.TestGuid = path[6];
                }
                catch
                {
                    try
                    {
                        // old format
                        var hyperlink = ws.Cells[row, hyperlinkColumn].GetValue<string>();
                        var fields = hyperlink.Split('"');
                        var url = fields[1];
                        var name = fields[3];
                        var path = url.Split('/');
                        resultMatch.MatchTestDisplayName = name;
                        serialized.TestTakerTestId = path[4];
                        resultMatch.TestGuid = path[6];
                    }
                    catch
                    {
                        // Neither format could be parsed; the row is skipped below because TestGuid stays null.
                    }
                }
            }

            // totalSharedCmColumn is known to be valid here: its absence was rejected above.
            resultMatch.SharedCentimorgans = ws.Cells[row, totalSharedCmColumn].GetValue<double>();

            if (notesColumn != 0)
            {
                resultMatch.Note = ws.Cells[row, notesColumn].GetValue<string>();
            }

            if (treeColumn != 0)
            {
                try
                {
                    resultMatch.TreeUrl = ws.Cells[row, treeColumn].Hyperlink?.ToString();
                    if (!string.IsNullOrEmpty(resultMatch.TreeUrl))
                    {
                        var fields = ws.Cells[row, treeColumn].GetValue<string>().Split(' ');
                        if (fields.Last() == "persons")
                        {
                            resultMatch.TreeSize = Convert.ToInt32(fields.First());
                        }
                    }
                }
                catch
                {
                    // Tree information is optional; keep whatever was read so far.
                }
            }

            // Do not assume that the AutoCluster data is free of duplicates.
            if (resultMatch.TestGuid == null || serialized.MatchIndexes.ContainsKey(resultMatch.TestGuid))
            {
                continue;
            }

            // A colored cell in a cluster column means the match is shared with the match of that column.
            var icw = Enumerable.Range(firstMatchFieldIndex, lastMatchFieldIndex - firstMatchFieldIndex + 1)
                .Where(col => ws.Cells[row, col].Style.Fill.BackgroundColor.Rgb != null)
                .Select(col => col - firstMatchFieldIndex)
                .ToList();

            // Rows without any ICW data cannot be clustered.
            if (icw.Count == 0)
            {
                continue;
            }

            serialized.Matches.Add(resultMatch);
            serialized.MatchIndexes[resultMatch.TestGuid] = serialized.MatchIndexes.Count;
            serialized.Icw[resultMatch.TestGuid] = icw;
        }
    }

    if (serialized.Matches.Count == 0)
    {
        throw new Exception("No rows read.");
    }

    // Do not assume that the AutoCluster data is already ordered by descending Shared Centimorgans.
    serialized.SortMatchesDescending();
}
/// <summary>
/// Loads saved match data with the first registered reader that both supports the file type and
/// succeeds in reading it, then converts the strong matches into clusterable matches.
/// </summary>
/// <param name="savedData">Path of the saved data file.</param>
/// <param name="minCentimorgansToCluster">Matches below this shared cM value are not clustered.</param>
/// <param name="minCentimorgansInSharedMatches">Used to limit the highest in-common-with index that is kept.</param>
/// <param name="anonymizer">Optional anonymizer; when null, nothing is anonymized.</param>
/// <param name="progressData">Progress reporter; its description is set and it is passed to the readers.</param>
/// <returns>
/// The test taker's test ID, the clusterable matches and the tags; or <c>(null, null, null)</c>
/// after showing a message box when no reader supports or can read the file.
/// </returns>
public async Task<(string, List<IClusterableMatch>, List<Tag>)> LoadClusterableMatchesAsync(string savedData, double minCentimorgansToCluster, double minCentimorgansInSharedMatches, IAnonymizer anonymizer, ProgressData progressData)
{
    progressData.Description = "Loading data...";

    var candidateReaders = _serializedMatchesReaders
        .Where(reader => reader.IsSupportedFileType(savedData))
        .ToList();
    if (candidateReaders.Count == 0)
    {
        MessageBox.Show("Unsupported file type.");
        return (null, null, null);
    }

    // Try each candidate in turn; remember only the first error so it can be reported if all of them fail.
    Serialized input = null;
    string errorMessage = null;
    foreach (var candidateReader in candidateReaders)
    {
        var (readResult, readError) = await candidateReader.ReadFileAsync(savedData, progressData);
        if (readResult != null)
        {
            input = readResult;
            break;
        }

        errorMessage = errorMessage ?? readError;
    }

    if (input == null)
    {
        MessageBox.Show(errorMessage);
        return (null, null, null);
    }

    return await Task.Run(() =>
    {
        var strongMatches = input.Matches
            .Where(match => match.SharedCentimorgans >= minCentimorgansToCluster)
            .ToList();

        // Highest in-common-with index that is kept: bounded by the number of strong matches,
        // by the number of matches reaching the shared-match threshold, and by the total match count.
        var maxMatchIndex = strongMatches.Count + 1;
        var sharedMatchLimit = input.Matches.Count(match => match.SharedCentimorgans >= minCentimorgansInSharedMatches) + 1;
        var maxIcwIndex = Math.Min(maxMatchIndex, sharedMatchLimit);
        maxIcwIndex = Math.Min(maxIcwIndex, input.Matches.Count - 1);

        var strongMatchesGuids = new HashSet<string>(
            strongMatches.Select(match => match.TestGuid),
            StringComparer.OrdinalIgnoreCase);

        // Keep only the strong matches, ordered by their match index (unknown IDs go last),
        // and trim each in-common-with list to the index limit.
        var icw = input.Icw
            .Where(entry => strongMatchesGuids.Contains(entry.Key))
            .OrderBy(entry => input.MatchIndexes.TryGetValue(entry.Key, out var matchIndex) ? matchIndex : input.MatchIndexes.Count)
            .ToDictionary(
                entry => entry.Key,
                entry => entry.Value.Where(icwIndex => icwIndex <= maxIcwIndex).ToList());

        var matchesDictionary = strongMatches.ToDictionary(match => match.TestGuid);

        var clusterableMatches = icw
            .AsParallel().AsOrdered()
            .Select((entry, position) =>
            {
                var anonymizedMatch = GetAnonymizedMatch(matchesDictionary[entry.Key], anonymizer);
                return (IClusterableMatch) new ClusterableMatch(position, anonymizedMatch, entry.Value);
            })
            .ToList();

        clusterableMatches = MaybeFilterMassivelySharedMatches(clusterableMatches);

        var testTakerTestId = anonymizer?.GetAnonymizedGuid(input.TestTakerTestId) ?? input.TestTakerTestId;

        // When anonymizing, tag labels are replaced by generic group names.
        var tags = anonymizer == null
            ? input.Tags
            : input.Tags?.Select((tag, position) => new Tag { TagId = tag.TagId, Color = tag.Color, Label = $"Group{position}" }).ToList();

        return (testTakerTestId, clusterableMatches, tags);
    });
}
/// <summary>
/// Reads an AutoCluster *.csv file and fills <c>serialized.Matches</c>, <c>serialized.MatchIndexes</c>
/// and <c>serialized.Icw</c>. The in-common-with data of a row is taken from the fields that follow
/// the "Cluster" column.
/// </summary>
/// <param name="serialized">Target object; its three collections are replaced.</param>
/// <param name="matchFile">Path of the *.csv file.</param>
private static void ReadMatchFile(Serialized serialized, string matchFile)
{
    using (var fileStream = new FileStream(matchFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    using (var matchReader = new StreamReader(fileStream))
    using (var csv = new CsvReader(matchReader))
    {
        csv.Configuration.Delimiter = ",";
        csv.Configuration.HeaderValidated = null;
        csv.Configuration.MissingFieldFound = null;
        csv.Configuration.BadDataFound = null;
        csv.Configuration.LineBreakInQuotedFieldIsBadData = false;
        csv.Configuration.RegisterClassMap<AutoClusterMatchMap>();
        csv.Configuration.PrepareHeaderForMatch = (string header, int index) => header.Replace('_', ' ');

        serialized.Matches = new List<Match>();
        serialized.MatchIndexes = new Dictionary<string, int>();
        serialized.Icw = new Dictionary<string, List<int>>();

        csv.Read();
        csv.ReadHeader();

        // The ICW fields start right after the "Cluster" column; try both spellings of the header.
        var firstMatchFieldIndex = csv.GetFieldIndex("Cluster") + 1;
        if (firstMatchFieldIndex <= 0)
        {
            firstMatchFieldIndex = csv.GetFieldIndex("cluster") + 1;
        }

        while (csv.Read())
        {
            var match = csv.GetRecord<AutoClusterMatch>();

            // A row without an identifier cannot be indexed (a null dictionary key would throw).
            // Do not assume that the AutoCluster data is free of duplicates.
            if (match?.Identifier == null || serialized.MatchIndexes.ContainsKey(match.Identifier))
            {
                continue;
            }

            var resultMatch = new Match
            {
                MatchTestDisplayName = match.Name,
                TestGuid = match.Identifier,
                SharedCentimorgans = GetDouble(match.SharedCm),
                TreeUrl = match.Tree,
                TreeSize = GetInt(match.TreeCount),
                Note = match.Notes,
            };

            // AutoCluster sometimes writes invalid CSV files, not properly quoting a line break in the notes field.
            // When that happens the record is too short and the ICW data is on the following record,
            // which then consists of the rest of the note followed by the ICW fields.
            var numHeaderFields = firstMatchFieldIndex;
            var endOfFile = false;
            while (csv.Context.Record.Length <= numHeaderFields)
            {
                if (!csv.Read())
                {
                    // The file ends in the middle of a broken record: there is no ICW data left to read.
                    endOfFile = true;
                    break;
                }

                numHeaderFields = 0;
            }

            if (endOfFile)
            {
                break;
            }

            var icw = csv.Context.Record
                .Skip(numHeaderFields)
                .Where(value => !string.IsNullOrEmpty(value))
                .Select(value => int.TryParse(value, out var intValue) ? intValue : (int?)null)
                .Where(value => value != null)
                .Select(value => value.Value - 1) // AutoCluster indexes are 1-based
                .ToList();

            // Rows for which no ICW data could be recovered cannot be clustered.
            if (icw.Count == 0)
            {
                continue;
            }

            serialized.Matches.Add(resultMatch);
            serialized.MatchIndexes[match.Identifier] = serialized.MatchIndexes.Count;
            serialized.Icw[match.Identifier] = icw;
        }
    }

    // Do not assume that the AutoCluster data is already ordered by descending Shared Centimorgans.
    serialized.SortMatchesDescending();
}