Пример #1
0
        /// <summary>
        /// Reads a *.xlsx match file on a worker thread and returns the data it contains.
        /// </summary>
        /// <param name="fileName">Path of the file to read.</param>
        /// <param name="progressData">Progress reporter passed through to the worksheet reader.</param>
        /// <returns>
        /// The populated data with a null message on success; otherwise null data together
        /// with a message explaining why the file could not be read.
        /// </returns>
        public async Task <(Serialized input, string errorMessage)> ReadFileAsync(string fileName, ProgressData progressData)
        {
            var isSupported = IsSupportedFileType(fileName);
            if (!isSupported)
            {
                return (null, $"{fileName} is not a *.xlsx file");
            }

            // The collections are created up front and handed to the reader to fill in.
            var result = new Serialized();
            result.MatchIndexes = new Dictionary<string, int>();
            result.Matches      = new List<Match>();
            result.Icw          = new Dictionary<string, List<int>>();

            try
            {
                await Task.Run(() => ReadMatchFile(result, fileName, progressData));
                return (result, null);
            }
            catch (Exception ex)
            {
                // Any failure is logged and reported to the caller as a message rather than rethrown.
                FileUtils.LogException(ex, false);
                return (null, $"Unexpected error while reading AutoCluster match file: {ex.Message}");
            }
        }
Пример #2
0
        /// <summary>
        /// Reads a DNAGedcom match file and its companion in-common-with file, plus the
        /// trees file when that one can be read.
        /// </summary>
        /// <param name="fileName">Path of either the _Family_Finder_Matches or the _ICW *.csv file.</param>
        /// <param name="progressData">Progress reporter; not used by this reader.</param>
        /// <returns>
        /// The populated data with a null message on success; otherwise null data together
        /// with a message explaining why the files could not be read.
        /// </returns>
        public async Task <(Serialized input, string errorMessage)> ReadFileAsync(string fileName, ProgressData progressData)
        {
            if (!IsSupportedFileType(fileName))
            {
                return (null, $"{fileName} is not a *.csv file");
            }

            // DNAGedcom writes a match file (…_Family_Finder_Matches) and an
            // in-common-with file (…_ICW); the companion file names are rebuilt
            // from the trimmed name.
            var baseName = GetTrimmedFileName(fileName);
            if (baseName == null)
            {
                return (null, "File name does not end with _Family_Finder_Matches or _ICW");
            }

            var directory   = Path.GetDirectoryName(fileName);
            var matchesPath = Path.Combine(directory, $"{baseName}_Family_Finder_Matches.csv");
            var icwPath     = Path.Combine(directory, $"{baseName}_ICW.csv");

            var haveBothFiles = File.Exists(matchesPath) && File.Exists(icwPath);
            if (!haveBothFiles)
            {
                return (null, $"Could not find both {matchesPath} and {icwPath}");
            }

            var result = new Serialized();

            // Step 1: the matches themselves.
            try
            {
                await Task.Run(() => ReadMatchFile(result, matchesPath));
            }
            catch (Exception ex)
            {
                FileUtils.LogException(ex, false);
                return (null, $"Unexpected error while reading DNAGedcom match file: {ex.Message}");
            }

            // Step 2: the in-common-with data.
            try
            {
                await Task.Run(() => ReadIcwFile(result, icwPath));
            }
            catch (Exception ex)
            {
                FileUtils.LogException(ex, false);
                return (null, $"Unexpected error while reading DNAGedcom icw file: {ex.Message}");
            }

            // Step 3: tree data is optional, so a failure here is deliberately ignored.
            try
            {
                var treesPath = Path.Combine(directory, $"{baseName}_Family_Finder_Trees.csv");
                await Task.Run(() => ReadTreeFile(result, treesPath));
            }
            catch (Exception)
            {
                // Not a problem if we can't read the tree file
            }

            return (result, null);
        }
Пример #3
0
        /// <summary>
        /// Reads the tree *.csv file and sets <c>TreeSize</c> on every already-loaded match
        /// to the number of tree rows whose result ID equals that match's test GUID.
        /// </summary>
        /// <param name="serialized">Data whose <c>Matches</c> are updated in place.</param>
        /// <param name="treeFile">Path of the tree *.csv file.</param>
        private static void ReadTreeFile(Serialized serialized, string treeFile)
        {
            using (var stream = new FileStream(treeFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            using (var reader = new StreamReader(stream))
            using (var csv = new CsvReader(reader))
            {
                // Lenient parsing: header, missing-field and bad-data callbacks are all switched off.
                var config = csv.Configuration;
                config.Delimiter         = ",";
                config.HeaderValidated   = null;
                config.MissingFieldFound = null;
                config.BadDataFound      = null;
                config.LineBreakInQuotedFieldIsBadData = false;
                config.RegisterClassMap<DnaGedcomTreeNodeMap>();
                config.PrepareHeaderForMatch = (string header, int index) => header.Replace('_', ' ');

                // Group the tree rows by result ID; rows without an ID are ignored.
                var nodesWithResultId = csv.GetRecords<DnaGedcomTreeNode>()
                                        .Where(node => node?.ResultId != null);
                var nodesByResultId = nodesWithResultId.ToLookup(node => node.ResultId);

                // A match with no tree rows gets an empty group, i.e. a tree size of zero.
                foreach (var match in serialized.Matches)
                {
                    var nodesForMatch = nodesByResultId[match.TestGuid];
                    match.TreeSize = nodesForMatch.Count();
                }
            }
        }
        /// <summary>
        /// Reads a DNAGedcom match file (m_*.csv) together with its companion
        /// in-common-with file (icw_*.csv).
        /// </summary>
        /// <param name="fileName">Path of either the m_ or the icw_ *.csv file.</param>
        /// <param name="progressData">Progress reporter; not used by this reader.</param>
        /// <returns>
        /// The populated data with a null message on success; otherwise null data together
        /// with a message explaining why the files could not be read.
        /// </returns>
        public async Task <(Serialized input, string errorMessage)> ReadFileAsync(string fileName, ProgressData progressData)
        {
            if (!IsSupportedFileType(fileName))
            {
                return (null, $"{fileName} is not a *.csv file");
            }

            // DNAGedcom writes a match file (m_…) and an in-common-with file (icw_…);
            // the companion file names are rebuilt from the trimmed name.
            var baseName = GetTrimmedFileName(fileName);
            if (baseName == null)
            {
                return (null, "File name does not start with m_ or icw_");
            }

            var directory   = Path.GetDirectoryName(fileName);
            var matchesPath = Path.Combine(directory, $"m_{baseName}.csv");
            var icwPath     = Path.Combine(directory, $"icw_{baseName}.csv");

            var haveBothFiles = File.Exists(matchesPath) && File.Exists(icwPath);
            if (!haveBothFiles)
            {
                return (null, $"Could not find both {matchesPath} and {icwPath}");
            }

            var result = new Serialized();

            // Step 1: the matches themselves.
            try
            {
                await Task.Run(() => ReadMatchFile(result, matchesPath));
            }
            catch (Exception ex)
            {
                FileUtils.LogException(ex, false);
                return (null, $"Unexpected error while reading DNAGedcom match file: {ex.Message}");
            }

            // Step 2: the in-common-with data.
            try
            {
                await Task.Run(() => ReadIcwFile(result, icwPath));
            }
            catch (Exception ex)
            {
                FileUtils.LogException(ex, false);
                return (null, $"Unexpected error while reading DNAGedcom icw file: {ex.Message}");
            }

            return (result, null);
        }
Пример #5
0
        /// <summary>
        /// Reads the DNAGedcom match *.csv file into <c>serialized.Matches</c>, ordered by
        /// descending shared centimorgans, and assigns each match a zero-based index in
        /// <c>serialized.MatchIndexes</c>.
        /// </summary>
        /// <param name="serialized">Data object that receives the matches and their indexes.</param>
        /// <param name="matchFile">Path of the match *.csv file.</param>
        private static void ReadMatchFile(Serialized serialized, string matchFile)
        {
            using (var fileStream = new FileStream(matchFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            using (var matchReader = new StreamReader(fileStream))
            using (var csv = new CsvReader(matchReader))
            {
                // Lenient parsing: header, missing-field and bad-data callbacks are all switched off.
                csv.Configuration.Delimiter         = ",";
                csv.Configuration.HeaderValidated   = null;
                csv.Configuration.MissingFieldFound = null;
                csv.Configuration.BadDataFound      = null;
                csv.Configuration.LineBreakInQuotedFieldIsBadData = false;
                csv.Configuration.RegisterClassMap<DnaGedcomMatchMap>();
                csv.Configuration.PrepareHeaderForMatch = (string header, int index) => header.Replace('_', ' ');
                var dnaGedcomMatches = csv.GetRecords<DnaGedcomMatch>();
                if (dnaGedcomMatches == null)
                {
                    return;
                }

                // Translate match properties from DNAGedcom naming to Shared Clustering naming.
                // The query is intentionally sequential: an unordered parallel query makes it
                // arbitrary which duplicate is kept and how ties in shared centimorgans are
                // ordered, which in turn makes the match indexes differ from run to run.
                serialized.Matches = dnaGedcomMatches
                                     .Where(match => match != null)
                                     .Select(match => new Match
                                     {
                                         MatchTestDisplayName = match.Name,
                                         TestGuid             = match.MatchId,
                                         SharedCentimorgans   = GetDouble(match.SharedCm),
                                         LongestBlock         = GetDouble(match.LongestBlock),
                                     })
                                     // Rows without an ID cannot be indexed (a null dictionary key throws).
                                     .Where(match => !string.IsNullOrEmpty(match.TestGuid))
                                     // Do not assume that the DNAGedcom data is free of duplicates; keep the first occurrence.
                                     .GroupBy(match => match.TestGuid)
                                     .Select(g => g.First())
                                     // Do not assume that the DNAGedcom data is already ordered by descending Shared Centimorgans.
                                     // The sequential sort is stable, so ties keep their file order.
                                     .OrderByDescending(match => match.SharedCentimorgans)
                                     .ToList();
            }

            // Assign zero-based indexes to the matches sorted by shared centimorgans descending.
            // The IDs are already unique after the grouping above.
            serialized.MatchIndexes = serialized.Matches
                                      .Select((match, index) => new { Id = match.TestGuid, Index = index })
                                      .ToDictionary(pair => pair.Id, pair => pair.Index);
        }
        private static void ReadMatchFile(Serialized serialized, string matchFile)
        {
            using (var fileStream = new FileStream(matchFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                using (var matchReader = new StreamReader(fileStream))
                    using (var csv = new CsvReader(matchReader))
                    {
                        csv.Configuration.Delimiter         = ",";
                        csv.Configuration.HeaderValidated   = null;
                        csv.Configuration.MissingFieldFound = null;
                        csv.Configuration.BadDataFound      = null;
                        csv.Configuration.LineBreakInQuotedFieldIsBadData = false;
                        csv.Configuration.RegisterClassMap <DnaGedcomMatchMap>();
                        csv.Configuration.PrepareHeaderForMatch = (string header, int index) => header.Replace('_', ' ');
                        var dnaGedcomMatches = csv.GetRecords <DnaGedcomMatch>();

                        // In case DNAGedcom file has data from more than one test, find the test ID with the largest number of matches.
                        var matches = dnaGedcomMatches
                                      .Where(match => match != null)
                                      .GroupBy(match => match.TestId ?? "")
                                      .OrderByDescending(g => g.Count())
                                      .FirstOrDefault();
                        if (matches == null)
                        {
                            return;
                        }

                        // This is the Test ID for the person taking the test
                        serialized.TestTakerTestId = matches.Key;

                        // Translate match properties from DNAGedcom naming to Shared Clustering naming.
                        serialized.Matches = matches
                                             .AsParallel()
                                             .Select(match => new Match
                        {
                            MatchTestAdminDisplayName = match.Admin,
                            MatchTestDisplayName      = match.Name,
                            TestGuid           = match.MatchId,
                            SharedCentimorgans = GetDouble(match.SharedCm),
                            SharedSegments     = int.TryParse(match.SharedSegments, out var sharedSegmentsInt) ? sharedSegmentsInt : 0,
                            TreeSize           = int.TryParse(match.People, out var peopleInt) ? peopleInt : 0,
                            Starred            = bool.TryParse(match.Starred, out var isStarred) && isStarred,
                            HasHint            = bool.TryParse(match.Hint, out var hasHint) && hasHint,
                            Note = match.Note,
                        })
Пример #7
0
        /// <summary>
        /// Reads the DNAGedcom in-common-with *.csv file into <c>serialized.Icw</c>, mapping
        /// each match ID to the sorted indexes (taken from <c>serialized.MatchIndexes</c>) of
        /// the matches it shares, including itself.
        /// </summary>
        /// <param name="serialized">
        /// Data object that receives the ICW data; its <c>MatchIndexes</c> must already be populated.
        /// </param>
        /// <param name="icwFile">Path of the in-common-with *.csv file.</param>
        private static void ReadIcwFile(Serialized serialized, string icwFile)
        {
            using (var fileStream = new FileStream(icwFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            using (var icwReader = new StreamReader(fileStream))
            using (var csv = new CsvReader(icwReader))
            {
                // Lenient parsing: header, missing-field and bad-data callbacks are all switched off.
                csv.Configuration.Delimiter         = ",";
                csv.Configuration.HeaderValidated   = null;
                csv.Configuration.MissingFieldFound = null;
                csv.Configuration.BadDataFound      = null;
                csv.Configuration.LineBreakInQuotedFieldIsBadData = false;
                csv.Configuration.RegisterClassMap<DnaGedcomIcwMap>();
                csv.Configuration.PrepareHeaderForMatch = (string header, int index) => header.Replace('_', ' ');

                // Translate the ICW data.
                // Shared Clustering assumes that every match also matches themselves.
                // DNAGedcom does not include the self-matches in the saved ICW data,
                // so the self-matches need to be added during the translation.
                serialized.Icw = csv.GetRecords<DnaGedcomIcw>()
                                 // With missing-field reporting disabled, an incomplete row yields
                                 // null IDs; null cannot be used as a dictionary key, so skip such rows.
                                 .Where(icw => icw?.MatchId != null && icw.IcwId != null)
                                 .GroupBy(icw => icw.MatchId, icw => icw.IcwId)
                                 .ToDictionary
                                 (
                                     g => g.Key,
                                     g => g.Concat(new[] { g.Key })
                                          // IDs that are not among the loaded matches are dropped.
                                          .Select(id => serialized.MatchIndexes.TryGetValue(id, out var index) ? index : -1)
                                          .Where(i => i >= 0)
                                          // Repeated rows (or an explicit self-match row) must not
                                          // produce the same index twice.
                                          .Distinct()
                                          .OrderBy(i => i)
                                          .ToList()
                                 );
            }

            // Also add self-matches to every match that has no shared matches at all.
            foreach (var guidAndIndex in serialized.MatchIndexes)
            {
                if (!serialized.Icw.ContainsKey(guidAndIndex.Key))
                {
                    serialized.Icw[guidAndIndex.Key] = new List<int> { guidAndIndex.Value };
                }
            }
        }
Пример #8
0
        /// <summary>
        /// Reads a *.csv match file on a worker thread and returns the data it contains.
        /// </summary>
        /// <param name="fileName">Path of the file to read.</param>
        /// <param name="progressData">Progress reporter; not used by this reader.</param>
        /// <returns>
        /// The populated data with a null message on success; otherwise null data together
        /// with a message explaining why the file could not be read.
        /// </returns>
        public async Task <(Serialized input, string errorMessage)> ReadFileAsync(string fileName, ProgressData progressData)
        {
            var isSupported = IsSupportedFileType(fileName);
            if (!isSupported)
            {
                return (null, $"{fileName} is not a *.csv file");
            }

            var result = new Serialized();

            try
            {
                await Task.Run(() => ReadMatchFile(result, fileName));
                return (result, null);
            }
            catch (Exception ex)
            {
                // Any failure is logged and reported to the caller as a message rather than rethrown.
                FileUtils.LogException(ex, false);
                return (null, $"Unexpected error while reading AutoCluster match file: {ex.Message}");
            }
        }
Пример #9
0
        /// <summary>
        /// Reads matches and in-common-with data from a *.xlsx workbook (worksheet index 1).
        /// Row 1 holds the headers, each further row is one match, and the cells with a
        /// background colour in the trailing columns mark which matches are shared.
        /// </summary>
        /// <param name="serialized">
        /// Data object to fill; its <c>Matches</c>, <c>MatchIndexes</c> and <c>Icw</c>
        /// collections must already be created by the caller.
        /// </param>
        /// <param name="matchFile">Path of the *.xlsx file.</param>
        /// <param name="progressData">Progress reporter, advanced once per data row.</param>
        /// <exception cref="InvalidDataException">
        /// The required columns are missing, the sheet has no data rows, or no row could be read.
        /// </exception>
        private static void ReadMatchFile(Serialized serialized, string matchFile, ProgressData progressData)
        {
            using (var fileStream = new FileStream(matchFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            using (var package = new ExcelPackage(fileStream))
            using (var ws = package.Workbook.Worksheets[1])
            {
                var hyperlinkColumn      = 0;
                var totalSharedCmColumn  = 0;
                var notesColumn          = 0;
                var treeColumn           = 0;
                var firstMatchFieldIndex = 0;

                // Find the columns that have interesting data (don't assume specific column numbers)
                for (var col = 1; col < 1000; ++col)
                {
                    // An empty header cell yields null, so use the null-safe static comparison.
                    var cellValue = ws.Cells[1, col].GetValue<string>();
                    if (string.Equals(cellValue, "name", StringComparison.OrdinalIgnoreCase))
                    {
                        hyperlinkColumn = col;
                    }
                    else if (string.Equals(cellValue, "total shared cM", StringComparison.OrdinalIgnoreCase))
                    {
                        totalSharedCmColumn = col;
                    }
                    else if (string.Equals(cellValue, "notes", StringComparison.OrdinalIgnoreCase))
                    {
                        notesColumn = col;
                    }
                    else if (string.Equals(cellValue, "tree", StringComparison.OrdinalIgnoreCase))
                    {
                        treeColumn = col;
                    }

                    // The first column with a coloured cell in row 2 is where the ICW grid starts.
                    if (ws.Cells[2, col].Style.Fill.BackgroundColor.Rgb != null)
                    {
                        firstMatchFieldIndex = col;
                        break;
                    }
                }

                if (totalSharedCmColumn == 0)
                {
                    throw new InvalidDataException("Total Shared cM column not found.");
                }

                // Without a grid start column the ICW cells would be addressed from column 0,
                // which is not a valid worksheet column; fail with a clear message instead.
                if (firstMatchFieldIndex == 0)
                {
                    throw new InvalidDataException("Cluster columns not found.");
                }

                // The grid extends to the last column that still has a header.
                var lastMatchFieldIndex = firstMatchFieldIndex;
                while (ws.Cells[1, lastMatchFieldIndex + 1].Value != null)
                {
                    lastMatchFieldIndex++;
                }

                // Data rows continue for as long as the shared cM column has a value.
                var maxRow = 1;
                while (ws.Cells[maxRow + 1, totalSharedCmColumn].Value != null)
                {
                    maxRow++;
                }

                if (maxRow == 1)
                {
                    throw new InvalidDataException("No rows found.");
                }

                progressData.Reset("Loading data.", maxRow - 1);

                for (var row = 2; row <= maxRow; ++row)
                {
                    progressData.Increment();

                    var resultMatch = new Match();

                    if (hyperlinkColumn != 0)
                    {
                        try
                        {
                            // new format: the cell carries a real hyperlink and the name is the cell text
                            var url  = ws.Cells[row, hyperlinkColumn].Hyperlink.ToString();
                            var name = ws.Cells[row, hyperlinkColumn].GetValue<string>();
                            var path = url.Split('/');
                            resultMatch.MatchTestDisplayName = name;
                            serialized.TestTakerTestId       = path[4];
                            resultMatch.TestGuid             = path[6];
                        }
                        catch
                        {
                            try
                            {
                                // old format: the cell text is a quoted formula containing the URL and the name
                                var hyperlink = ws.Cells[row, hyperlinkColumn].GetValue<string>();
                                var fields    = hyperlink.Split('"');
                                var url       = fields[1];
                                var name      = fields[3];
                                var path      = url.Split('/');
                                resultMatch.MatchTestDisplayName = name;
                                serialized.TestTakerTestId       = path[4];
                                resultMatch.TestGuid             = path[6];
                            }
                            catch
                            {
                                // Neither format could be parsed; the row is skipped below
                                // because it ends up without a test GUID.
                            }
                        }
                    }

                    // The shared cM column is guaranteed to exist at this point (checked above).
                    resultMatch.SharedCentimorgans = ws.Cells[row, totalSharedCmColumn].GetValue<double>();

                    if (notesColumn != 0)
                    {
                        resultMatch.Note = ws.Cells[row, notesColumn].GetValue<string>();
                    }
                    if (treeColumn != 0)
                    {
                        try
                        {
                            resultMatch.TreeUrl = ws.Cells[row, treeColumn].Hyperlink?.ToString();
                            if (!string.IsNullOrEmpty(resultMatch.TreeUrl))
                            {
                                // Cell text of the form "<count> ... persons"
                                var fields = ws.Cells[row, treeColumn].GetValue<string>().Split(' ');
                                if (fields.Last() == "persons")
                                {
                                    resultMatch.TreeSize = Convert.ToInt32(fields.First());
                                }
                            }
                        }
                        catch
                        {
                            // Tree details are optional; leave them unset when they cannot be parsed.
                        }
                    }

                    // Do not assume that the AutoCluster data is free of duplicates.
                    if (resultMatch.TestGuid == null || serialized.MatchIndexes.ContainsKey(resultMatch.TestGuid))
                    {
                        continue;
                    }

                    // Coloured cells in the grid mark shared matches; store them as
                    // zero-based offsets from the first grid column.
                    var icw = Enumerable.Range(firstMatchFieldIndex, lastMatchFieldIndex - firstMatchFieldIndex + 1)
                              .Where(col => ws.Cells[row, col].Style.Fill.BackgroundColor.Rgb != null)
                              .Select(col => col - firstMatchFieldIndex)
                              .ToList();

                    // A row without any coloured grid cell has no usable ICW data, so skip it.
                    if (icw.Count == 0)
                    {
                        continue;
                    }

                    serialized.Matches.Add(resultMatch);
                    serialized.MatchIndexes[resultMatch.TestGuid] = serialized.MatchIndexes.Count;
                    serialized.Icw[resultMatch.TestGuid]          = icw;
                }
            }

            if (serialized.Matches.Count == 0)
            {
                throw new InvalidDataException("No rows read.");
            }

            // Do not assume that the AutoCluster data is already ordered by descending Shared Centimorgans.
            serialized.SortMatchesDescending();
        }
Пример #10
0
        /// <summary>
        /// Loads saved match data with the first reader that can handle the file and converts
        /// it into clusterable matches.
        /// </summary>
        /// <param name="savedData">Path of the saved data file.</param>
        /// <param name="minCentimorgansToCluster">Matches below this shared cM value are not clustered.</param>
        /// <param name="minCentimorgansInSharedMatches">Lower cM bound used to limit which shared-match indexes are kept.</param>
        /// <param name="anonymizer">Optional anonymizer; when null the data is returned as read.</param>
        /// <param name="progressData">Progress reporter.</param>
        /// <returns>
        /// The test taker's test ID, the clusterable matches and the tags; or three nulls
        /// (after showing a message box) when the file is unsupported or could not be read.
        /// </returns>
        public async Task <(string, List <IClusterableMatch>, List <Tag>)> LoadClusterableMatchesAsync(string savedData, double minCentimorgansToCluster, double minCentimorgansInSharedMatches, IAnonymizer anonymizer, ProgressData progressData)
        {
            progressData.Description = "Loading data...";

            var serializedMatchesReaders = _serializedMatchesReaders.Where(reader => reader.IsSupportedFileType(savedData)).ToList();

            if (serializedMatchesReaders.Count == 0)
            {
                MessageBox.Show("Unsupported file type.");
                return (null, null, null);
            }

            Serialized input        = null;
            string     errorMessage = null;

            // Try each candidate reader in turn; remember only the first error message.
            foreach (var serializedMatchesReader in serializedMatchesReaders)
            {
                string thisErrorMessage;
                (input, thisErrorMessage) = await serializedMatchesReader.ReadFileAsync(savedData, progressData);

                if (input != null)
                {
                    break;
                }
                if (errorMessage == null)
                {
                    errorMessage = thisErrorMessage;
                }
            }

            if (input == null)
            {
                MessageBox.Show(errorMessage);
                return (null, null, null);
            }

            return (await Task.Run(() =>
            {
                var strongMatches = input.Matches.Where(match => match.SharedCentimorgans >= minCentimorgansToCluster).ToList();
                var maxMatchIndex = strongMatches.Count + 1;
                var maxIcwIndex = Math.Min(maxMatchIndex, input.Matches.Count(match => match.SharedCentimorgans >= minCentimorgansInSharedMatches) + 1);
                maxIcwIndex = Math.Min(maxIcwIndex, input.Matches.Count - 1);

                // One case-insensitive dictionary serves both for filtering the ICW keys and for
                // looking the matches up afterwards. Filtering case-insensitively but looking up
                // case-sensitively would throw KeyNotFoundException for IDs that differ only in
                // casing. Matches without an ID are left out, and the first match wins for
                // repeated IDs, so building the dictionary itself cannot throw.
                var matchesDictionary = strongMatches
                                        .Where(match => match.TestGuid != null)
                                        .GroupBy(match => match.TestGuid, StringComparer.OrdinalIgnoreCase)
                                        .ToDictionary(g => g.Key, g => g.First(), StringComparer.OrdinalIgnoreCase);

                var icw = input.Icw
                          .Where(kvp => matchesDictionary.ContainsKey(kvp.Key))
                          // Keys missing from the index table sort after all indexed ones.
                          .OrderBy(kvp => input.MatchIndexes.TryGetValue(kvp.Key, out var index) ? index : input.MatchIndexes.Count)
                          .ToDictionary(
                              kvp => kvp.Key,
                              kvp => kvp.Value.Where(index => index <= maxIcwIndex).ToList()
                          );

                var clusterableMatches = icw
                                         .AsParallel().AsOrdered()
                                         .Select((kvp, index) =>
                                         {
                                             var match = matchesDictionary[kvp.Key];
                                             match = GetAnonymizedMatch(match, anonymizer);
                                             return (IClusterableMatch) new ClusterableMatch(index, match, kvp.Value);
                                         })
                                         .ToList();

                clusterableMatches = MaybeFilterMassivelySharedMatches(clusterableMatches);

                var testTakerTestId = anonymizer?.GetAnonymizedGuid(input.TestTakerTestId) ?? input.TestTakerTestId;

                // When anonymizing, the tag labels are replaced by generic group names.
                var tags = anonymizer == null ? input.Tags : input.Tags?.Select((tag, index) => new Tag {
                    TagId = tag.TagId, Color = tag.Color, Label = $"Group{index}"
                }).ToList();
                return (testTakerTestId, clusterableMatches, tags);
            }));
        }
Пример #11
0
        /// <summary>
        /// Reads matches and in-common-with data from an AutoCluster *.csv file. The fields
        /// after the "Cluster" column hold the 1-based indexes of the shared matches.
        /// </summary>
        /// <param name="serialized">
        /// Data object to fill; its <c>Matches</c>, <c>MatchIndexes</c> and <c>Icw</c>
        /// collections are replaced with fresh ones.
        /// </param>
        /// <param name="matchFile">Path of the *.csv file.</param>
        private static void ReadMatchFile(Serialized serialized, string matchFile)
        {
            using (var fileStream = new FileStream(matchFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            using (var matchReader = new StreamReader(fileStream))
            using (var csv = new CsvReader(matchReader))
            {
                // Lenient parsing: header, missing-field and bad-data callbacks are all switched off.
                csv.Configuration.Delimiter         = ",";
                csv.Configuration.HeaderValidated   = null;
                csv.Configuration.MissingFieldFound = null;
                csv.Configuration.BadDataFound      = null;
                csv.Configuration.LineBreakInQuotedFieldIsBadData = false;
                csv.Configuration.RegisterClassMap<AutoClusterMatchMap>();
                csv.Configuration.PrepareHeaderForMatch = (string header, int index) => header.Replace('_', ' ');

                serialized.Matches      = new List<Match>();
                serialized.MatchIndexes = new Dictionary<string, int>();
                serialized.Icw          = new Dictionary<string, List<int>>();

                csv.Read();
                csv.ReadHeader();

                // The ICW fields start right after the cluster column (either capitalization).
                var firstMatchFieldIndex = csv.GetFieldIndex("Cluster") + 1;
                if (firstMatchFieldIndex <= 0)
                {
                    firstMatchFieldIndex = csv.GetFieldIndex("cluster") + 1;
                }

                while (csv.Read())
                {
                    var match = csv.GetRecord<AutoClusterMatch>();

                    // A record without an identifier cannot be indexed (a null dictionary key throws).
                    // Do not assume that the AutoCluster data is free of duplicates.
                    if (match?.Identifier == null || serialized.MatchIndexes.ContainsKey(match.Identifier))
                    {
                        continue;
                    }

                    var resultMatch = new Match
                    {
                        MatchTestDisplayName = match.Name,
                        TestGuid             = match.Identifier,
                        SharedCentimorgans   = GetDouble(match.SharedCm),
                        TreeUrl  = match.Tree,
                        TreeSize = GetInt(match.TreeCount),
                        Note     = match.Notes,
                    };

                    // AutoCluster sometimes writes invalid CSV files, not properly quoting a line break in the notes field.
                    // The record then ends early and the ICW fields are found on a following line,
                    // so keep reading lines and treat all of their fields as candidate ICW data.
                    var numHeaderFields = firstMatchFieldIndex;
                    var reachedEndOfFile = false;
                    while (csv.Context.Record.Length <= numHeaderFields)
                    {
                        // Stop at the end of the file: no current record is left to
                        // inspect once Read() reports false.
                        if (!csv.Read())
                        {
                            reachedEndOfFile = true;
                            break;
                        }
                        numHeaderFields = 0;
                    }
                    if (reachedEndOfFile)
                    {
                        break;
                    }

                    var icw = csv.Context.Record
                              .Skip(numHeaderFields)
                              .Where(value => !string.IsNullOrEmpty(value))
                              .Select(value => int.TryParse(value, out var intValue) ? intValue : (int?)null)
                              .Where(value => value != null)
                              .Select(value => value.Value - 1) // AutoCluster indexes are 1-based
                              .ToList();

                    // If no ICW data could be recovered for this match, leave the match out.
                    if (icw.Count == 0)
                    {
                        continue;
                    }

                    serialized.Matches.Add(resultMatch);
                    serialized.MatchIndexes[match.Identifier] = serialized.MatchIndexes.Count;
                    serialized.Icw[match.Identifier]          = icw;
                }
            }

            // Do not assume that the AutoCluster data is already ordered by descending Shared Centimorgans.
            serialized.SortMatchesDescending();
        }