Exemple #1
0
        /// <summary>
        /// Writes a correlation heatmap for the given cluster tree to one or more Excel files.
        /// Each file has one row per leaf node and one column per non-distant match.
        /// </summary>
        /// <param name="nodes">Cluster nodes. Only the ordered leaf nodes of the first node are written.</param>
        /// <param name="matchesByIndex">Matches keyed by match index.</param>
        /// <param name="indexClusterNumbers">Passed through to the cluster-number and correlated-clusters column writers.</param>
        /// <returns>
        /// The names of the files that were saved. Empty when no correlation filename is configured,
        /// when <paramref name="nodes"/> is empty, or when no match has any shared-match coordinate.
        /// </returns>
        /// <exception cref="KeyNotFoundException">
        /// A leaf node's index is not present in <paramref name="matchesByIndex"/>.
        /// </exception>
        public async Task<List<string>> OutputCorrelationAsync(List<ClusterNode> nodes, Dictionary<int, IClusterableMatch> matchesByIndex, Dictionary<int, int> indexClusterNumbers)
        {
            if (string.IsNullOrEmpty(_correlationFilename))
            {
                return new List<string>();
            }

            if (nodes.Count == 0)
            {
                return new List<string>();
            }

            // All nodes, in order. These will become rows/columns in the Excel file.
            var leafNodes = nodes.First().GetOrderedLeafNodes().ToList();

            // Ancestry never shows matches lower than 20 cM as shared matches.
            // The distant matches will be included as rows in the Excel file, but not as columns.
            // That means that correlation diagrams that include distant matches will be rectangular (tall and narrow)
            // rather than square.
            var matches = leafNodes
                          .Where(leafNode => matchesByIndex.ContainsKey(leafNode.Index))
                          .Select(leafNode => matchesByIndex[leafNode.Index])
                          .ToList();

            // Indexes of every known match that appears as a shared match of some other match.
            var clusterableCoords = matches
                                    .SelectMany(match => match.Coords.Where(coord => coord != match.Index && matchesByIndex.ContainsKey(coord)))
                                    .Distinct()
                                    .ToList();

            // Min() throws on an empty sequence. With no shared-match coordinates there is
            // nothing to correlate, so report "no files written" like the guards above.
            if (clusterableCoords.Count == 0)
            {
                return new List<string>();
            }

            var lowestClusterableCentimorgans = clusterableCoords
                                                .Min(coord => matchesByIndex[coord].Match.SharedCentimorgans);
            var nonDistantMatches = matches
                                    .Where(match => match.Match.SharedCentimorgans >= lowestClusterableCentimorgans)
                                    .ToList();

            var orderedIndexes = nonDistantMatches
                                 .Select(match => match.Index)
                                 .ToList();

            // Excel has a limit of 16,384 columns.
            // If there are more matrix columns than MaxColumns, split into files containing at most
            // MaxColumnsPerSplit columns each. The count is based on the columns actually written
            // (the non-distant matches), and uses ceiling division so that an exact multiple of
            // MaxColumnsPerSplit does not produce a trailing file with no columns.
            var columnCount    = nonDistantMatches.Count;
            var numOutputFiles = 1;

            // A single file holds every column; only split files are capped at MaxColumnsPerSplit.
            var columnsPerFile = columnCount;

            if (columnCount > MaxColumns)
            {
                columnsPerFile = MaxColumnsPerSplit;
                numOutputFiles = (columnCount - 1) / MaxColumnsPerSplit + 1;
            }

            // Every leaf node is written as a row in every output file.
            _progressData.Reset("Saving clusters", leafNodes.Count * numOutputFiles);

            // Because very strong matches are included in so many clusters,
            // excluding the strong matches makes it easier to identify edges of the clusters.
            var immediateFamilyIndexes = new HashSet<int>(
                matchesByIndex.Values
                .Where(match => match.Match.SharedCentimorgans > 200)
                .Select(match => match.Index)
                );

            var files = new List<string>();

            for (var fileNum = 0; fileNum < numOutputFiles; ++fileNum)
            {
                using (var p = new ExcelPackage())
                {
                    // Build the worksheet off the calling thread; saving happens after the await.
                    await Task.Run(() =>
                    {
                        var ws = p.Workbook.Worksheets.Add("heatmap");

                        // Start at the top left of the sheet
                        var row = 1;
                        var col = 1;

                        // Rotate the entire top row by 90 degrees
                        ws.Row(row).Style.TextRotation = 90;

                        // Fixed columns. Optional writers are included only when at least one match has data for them.
                        var clusterNumberWriter = new ClusterNumberWriter(indexClusterNumbers);
                        var writers             = new IColumnWriter[]
                        {
                            clusterNumberWriter,
                            new NameWriter(false),
                            matches.Any(match => !string.IsNullOrEmpty(match.Match.TestGuid)) ? new TestIdWriter() : null,
                            !string.IsNullOrEmpty(_testTakerTestId) ? new LinkWriter(_testTakerTestId) : null,
                            new SharedCentimorgansWriter(),
                            matches.Any(match => match.Match.SharedSegments > 0) ? new SharedSegmentsWriter() : null,
                            matches.Any(match => match.Match.LongestBlock > 0) ? new LongestBlockWriter() : null,
                            matches.Any(match => !string.IsNullOrEmpty(match.Match.TreeUrl)) ? new TreeUrlWriter(_testTakerTestId) : null,
                            matches.Any(match => match.Match.TreeType != SavedData.TreeType.Undetermined) ? new TreeTypeWriter() : null,
                            matches.Any(match => match.Match.TreeSize > 0) ? new TreeSizeWriter() : null,
                            matches.Any(match => match.Match.Starred) ? new StarredWriter() : null,
                            matches.Any(match => match.Match.HasHint) ? new SharedAncestorHintWriter() : null,
                            new CorrelatedClustersWriter(leafNodes, immediateFamilyIndexes, indexClusterNumbers, clusterNumberWriter, _minClusterSize),
                            new NoteWriter(),
                        }.Where(writer => writer != null).ToArray();
                        var columnWriters = new ColumnWritersCollection(p, ws, writers, _testTakerTestId);

                        col = columnWriters.WriteHeaders(row, col);

                        var firstMatrixDataRow    = row + 1;
                        var firstMatrixDataColumn = col;

                        // Column headers for each match that belongs to this file's slice of columns
                        var matchColumns = nonDistantMatches.Skip(fileNum * columnsPerFile).Take(columnsPerFile).ToList();
                        foreach (var nonDistantMatch in matchColumns)
                        {
                            ws.Cells[row, col++].Value = nonDistantMatch.Match.Name;
                        }

                        // One row for each match
                        foreach (var leafNode in leafNodes)
                        {
                            // NOTE(review): unlike the filtered lookup used to build `matches`, this indexer
                            // throws KeyNotFoundException for a leaf node missing from matchesByIndex.
                            var match = matchesByIndex[leafNode.Index];
                            row++;

                            // Row headers
                            col = 1;
                            col = columnWriters.WriteColumns(row, col, match, leafNode);

                            // Correlation data, restricted to the same slice of columns as the headers above.
                            // Zero values are left as empty cells.
                            foreach (var coordAndIndex in leafNode.GetCoordsArray(orderedIndexes)
                                     .Zip(orderedIndexes, (c, i) => new { Coord = c, Index = i })
                                     .Skip(fileNum * columnsPerFile).Take(columnsPerFile))
                            {
                                if (coordAndIndex.Coord != 0)
                                {
                                    ws.Cells[row, col].Value = coordAndIndex.Coord;
                                }
                                col++;
                            }

                            _progressData.Increment();
                        }

                        // Heatmap color scale over the matrix area (all data rows x this file's match columns)
                        var correlationData               = new ExcelAddress(firstMatrixDataRow, firstMatrixDataColumn, firstMatrixDataRow - 1 + leafNodes.Count, firstMatrixDataColumn - 1 + matchColumns.Count);
                        var threeColorScale               = ws.ConditionalFormatting.AddThreeColorScale(correlationData);
                        threeColorScale.LowValue.Type     = eExcelConditionalFormattingValueObjectType.Num;
                        threeColorScale.LowValue.Value    = 0;
                        threeColorScale.LowValue.Color    = Color.Gainsboro;
                        threeColorScale.MiddleValue.Type  = eExcelConditionalFormattingValueObjectType.Num;
                        threeColorScale.MiddleValue.Value = 1;
                        threeColorScale.MiddleValue.Color = Color.Cornsilk;
                        threeColorScale.HighValue.Type    = eExcelConditionalFormattingValueObjectType.Num;
                        threeColorScale.HighValue.Value   = 2;
                        threeColorScale.HighValue.Color   = Color.DarkRed;

                        // Heatmap number format
                        // NOTE(review): this address selects whole rows 1..matchColumns.Count rather than
                        // the matrix area; kept as-is, confirm whether the matrix range was intended.
                        ws.Cells[$"1:{matchColumns.Count}"].Style.Numberformat.Format = "General";

                        col = 1;
                        col = columnWriters.FormatColumns(row, col);

                        // Freeze the column and row headers
                        ws.View.FreezePanes(firstMatrixDataRow, firstMatrixDataColumn);
                    });

                    // The first file keeps the configured name; later files get a numeric suffix (2, 3, ...).
                    var fileName = _correlationFilename;
                    if (fileNum > 0)
                    {
                        fileName = FileUtils.AddSuffixToFilename(fileName, (fileNum + 1).ToString());
                    }

                    FileUtils.Save(p, fileName);

                    files.Add(fileName);
                }
            }
            return files;
        }
        /// <summary>
        /// Prepares the rows, columns and fixed column writers for a correlation heatmap and
        /// hands them to <c>OutputFiles</c> or, when <c>FileIsOpen()</c> reports true, to <c>OutputWorksheet</c>.
        /// </summary>
        /// <param name="nodes">Cluster nodes. Only the ordered leaf nodes of the first node are used.</param>
        /// <param name="matchesByIndex">Matches keyed by match index.</param>
        /// <param name="indexClusterNumbers">Passed through to the cluster-number and correlated-clusters column writers.</param>
        /// <param name="tags">Optional tags; when not null, one column per tag is added, ordered by label.</param>
        /// <param name="worksheetName">Worksheet name forwarded to the output helpers.</param>
        /// <returns>
        /// The file names reported by <c>OutputFiles</c>, or the configured correlation filename when a single
        /// worksheet is written. Empty when no correlation filename is configured, when
        /// <paramref name="nodes"/> is empty, or when no shared-match coordinate reaches the
        /// configured minimum centimorgans.
        /// </returns>
        public async Task<List<string>> OutputCorrelationAsync(
            List<ClusterNode> nodes,
            Dictionary<int, IClusterableMatch> matchesByIndex,
            Dictionary<int, int> indexClusterNumbers,
            List<Tag> tags,
            string worksheetName)
        {
            if (string.IsNullOrEmpty(_correlationFilename))
            {
                return new List<string>();
            }

            if (nodes.Count == 0)
            {
                return new List<string>();
            }

            // All nodes, in order. These will become rows/columns in the Excel file.
            var leafNodes = nodes.First().GetOrderedLeafNodes().ToList();

            // Ancestry never shows matches lower than 20 cM as shared matches.
            // The distant matches will be included as rows in the Excel file, but not as columns.
            // That means that correlation diagrams that include distant matches will be rectangular (tall and narrow)
            // rather than square.
            var matches = leafNodes
                          .Where(leafNode => matchesByIndex.ContainsKey(leafNode.Index))
                          .Select(leafNode => matchesByIndex[leafNode.Index])
                          .ToList();

            // Indexes of known matches that appear as a shared match of some other match
            // and that meet the configured minimum centimorgans.
            var clusterableCoords = matches
                                    .SelectMany(match => match.Coords.Where(coord => coord != match.Index && matchesByIndex.ContainsKey(coord)))
                                    .Distinct()
                                    .Where(coord => matchesByIndex[coord].Match.SharedCentimorgans >= _lowestClusterableCentimorgans)
                                    .ToList();

            // Min() throws on an empty sequence. With no qualifying shared-match coordinates there is
            // nothing to correlate, so report "no files written" like the guards above.
            if (clusterableCoords.Count == 0)
            {
                return new List<string>();
            }

            var lowestClusterableCentimorgans = clusterableCoords
                                                .Min(coord => matchesByIndex[coord].Match.SharedCentimorgans);
            var nonDistantMatches = matches
                                    .Where(match => match.Match.SharedCentimorgans >= lowestClusterableCentimorgans)
                                    .ToList();

            // Excel has a limit of 16,384 columns.
            // If there are more than MaxMatchesPerClusterFile column matches, split the output so that
            // each file holds at most MaxMatchesPerClusterFile of them (ceiling division).
            var numOutputFiles = 1;

            if (nonDistantMatches.Count > MaxMatchesPerClusterFile)
            {
                numOutputFiles = (nonDistantMatches.Count - 1) / MaxMatchesPerClusterFile + 1;
            }

            // Progress total is the number of leaf-node rows multiplied by the number of output files.
            _progressData.Reset("Saving clusters", leafNodes.Count * numOutputFiles);

            var orderedIndexes = nonDistantMatches
                                 .Select(match => match.Index)
                                 .ToList();

            // Because very strong matches are included in so many clusters,
            // excluding the strong matches makes it easier to identify edges of the clusters.
            var immediateFamilyIndexes = new HashSet<int>(
                matchesByIndex.Values
                .Where(match => match.Match.SharedCentimorgans > 200)
                .Select(match => match.Index)
                );

            // Fixed columns. Optional writers are included only when at least one match has data for them.
            var clusterNumberWriter = new ClusterNumberWriter(indexClusterNumbers);
            var writers             = new List<IColumnWriter>
            {
                clusterNumberWriter,
                new NameWriter(false),
                matches.Any(match => !string.IsNullOrEmpty(match.Match.TestGuid)) ? new TestIdWriter() : null,
                !string.IsNullOrEmpty(_testTakerTestId) ? new LinkWriter(_testTakerTestId, _ancestryHostName) : null,
                new SharedCentimorgansWriter(),
                matches.Any(match => match.Match.SharedSegments > 0) ? new SharedSegmentsWriter() : null,
                matches.Any(match => match.Match.LongestBlock > 0) ? new LongestBlockWriter() : null,
                matches.Any(match => !string.IsNullOrEmpty(match.Match.TreeUrl)) ? new TreeUrlWriter(_testTakerTestId) : null,
                matches.Any(match => match.Match.TreeType != SavedData.TreeType.Undetermined) ? new TreeTypeWriter() : null,
                matches.Any(match => match.Match.TreeSize > 0) ? new TreeSizeWriter() : null,
                matches.Any(match => match.Match.CommonAncestors?.Count > 0) ? new CommonAncestorsWriter() : null,
                matches.Any(match => match.Match.Starred) ? new StarredWriter() : null,
                matches.Any(match => match.Match.HasHint) ? new SharedAncestorHintWriter() : null,
                new CorrelatedClustersWriter(leafNodes, immediateFamilyIndexes, indexClusterNumbers, clusterNumberWriter, _minClusterSize),
            }.Where(writer => writer != null).ToList();

            // Tag columns (alphabetical by label) go after the fixed columns; the note column is always last.
            if (tags != null)
            {
                writers.AddRange(tags.OrderBy(tag => tag.Label).Select(tag => new TagWriter(tag)));
            }
            writers.Add(new NoteWriter());

            var writersArray = writers.ToArray();

            if (!FileIsOpen())
            {
                return await OutputFiles(worksheetName, matchesByIndex, leafNodes, nonDistantMatches, orderedIndexes, writersArray, numOutputFiles);
            }

            // FileIsOpen() returned true: write a single worksheet (file number 0) and report
            // the configured filename as the only output.
            await OutputWorksheet(worksheetName, matchesByIndex, leafNodes, nonDistantMatches, orderedIndexes, writersArray, 0);

            return new List<string> {
                _correlationFilename
            };
        }