Exemplo n.º 1
0
        /// <summary>
        /// Produces one flattened gene for every gene in <c>_genes</c> that is not marked invalid
        /// at the moment it is visited, using the group of genes that share its gene ID.
        /// </summary>
        /// <param name="overlapStart">Forwarded unchanged to <c>GetFlattenedGene</c> (default -1).</param>
        /// <param name="overlapEnd">Forwarded unchanged to <c>GetFlattenedGene</c> (default -1).</param>
        /// <returns>The flattened genes, in the order their source genes were visited.</returns>
        /// <exception cref="UserErrorException">
        /// Thrown when the ID of a visited gene is missing from the ID lookup.
        /// </exception>
        public List<MutableGene> Flatten(int overlapStart = -1, int overlapEnd = -1)
        {
            var flattened = new List<MutableGene>();
            var idToGenes = GeneUtilities.GetGenesById(_genes, _isEnsembl);

            foreach (var current in _genes)
            {
                // The flag is read at visit time rather than pre-filtered.
                // NOTE(review): GetFlattenedGene may update Invalid on sibling genes - confirm.
                if (!current.Invalid)
                {
                    // the lookup key depends on which data source this flattener handles
                    string key;
                    if (_isEnsembl)
                    {
                        key = current.EnsemblId.ToString();
                    }
                    else
                    {
                        key = current.EntrezGeneId.ToString();
                    }

                    List<MutableGene> sameIdGenes;
                    var found = idToGenes.TryGetValue(key, out sameIdGenes);
                    if (!found)
                    {
                        throw new UserErrorException($"Unable to find similar genes for {key}");
                    }

                    var flatGene = GetFlattenedGene(current, sameIdGenes, overlapStart, overlapEnd);
                    flattened.Add(flatGene);
                }
            }

            if (_showOutput)
            {
                Console.WriteLine($"  - {_description}: {flattened.Count} genes.");
            }

            return flattened;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Clears <c>_mergedGenes</c>, then processes every gene in <c>_genes</c> that is not marked
        /// invalid when visited by handing it, together with the genes sharing its symbol,
        /// to <c>MergesGenesWithSameSymbol</c>. Finishes by printing the orphan and merge counters.
        /// </summary>
        /// <returns>The <c>_mergedGenes</c> list.</returns>
        /// <exception cref="UserErrorException">
        /// Thrown when the symbol of a visited gene is missing from the symbol lookup.
        /// </exception>
        public List<MutableGene> Merge()
        {
            _mergedGenes.Clear();
            var symbolToGenes = GeneUtilities.GetGenesBySymbol(_genes);

            foreach (var current in _genes)
            {
                // The flag is read at visit time rather than pre-filtered.
                if (!current.Invalid)
                {
                    List<MutableGene> sameSymbolGenes;
                    var found = symbolToGenes.TryGetValue(current.Symbol, out sameSymbolGenes);
                    if (!found)
                    {
                        throw new UserErrorException($"Unable to find similar genes for {current.Symbol}");
                    }

                    MergesGenesWithSameSymbol(current, sameSymbolGenes);
                }
            }

            // summary statistics
            Console.WriteLine($"  - {_numOrphanEnsemblGenes} orphan Ensembl genes.");
            Console.WriteLine($"  - {_numOrphanRefSeqGenes} orphan RefSeq genes.");
            Console.WriteLine($"  - {_numMergedGenes} merged genes.");

            return _mergedGenes;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Flattens the Ensembl and RefSeq genes obtained from <c>GetValidGenes</c>, merges each
        /// flattened Ensembl gene with its linked RefSeq gene when one is found, and registers
        /// every remaining gene as an orphan.
        /// </summary>
        /// <param name="seedGene">Gene passed to <c>GetValidGenes</c> as the seed.</param>
        /// <param name="genesWithSameSymbol">Genes passed to <c>GetValidGenes</c> alongside the seed.</param>
        private void MergesGenesWithSameSymbol(MutableGene seedGene, List<MutableGene> genesWithSameSymbol)
        {
            int start;
            int end;
            var candidates = GetValidGenes(seedGene, genesWithSameSymbol, out start, out end);

            var ensemblSubset = GeneUtilities.GetGenesByDataSource(candidates, TranscriptDataSource.Ensembl);
            var refSeqSubset  = GeneUtilities.GetGenesByDataSource(candidates, TranscriptDataSource.RefSeq);

            // flatten each data source separately, using the same overlap interval for both
            var flatEnsembl = new GeneFlattener(ensemblSubset, "Ensembl", false).Flatten(start, end);
            var flatRefSeq  = new GeneFlattener(refSeqSubset, "RefSeq", false).Flatten(start, end);

            foreach (var ensembl in flatEnsembl)
            {
                // Find the RefSeq partner; it is null when there is no linked Entrez ID
                // or when no flattened RefSeq gene carries that ID.
                string entrezId;
                var hasLink = _linkedEnsemblIds.TryGetValue(ensembl.EnsemblId.ToString(), out entrezId);
                var partner = hasLink
                    ? GeneUtilities.GetRefSeqGeneById(flatRefSeq, entrezId)
                    : null;

                if (partner == null)
                {
                    // add the unused Ensembl genes
                    AddEnsemblOrphan(ensembl);
                    continue;
                }

                // merge the Ensembl and RefSeq gene, starting from a copy of the Ensembl gene
                var merged = MutableGene.Clone(ensembl);
                merged.TranscriptDataSource = TranscriptDataSource.BothRefSeqAndEnsembl;
                UpdateCoordinates(partner, merged);

                // take the RefSeq HGNC ID only when the copy has -1 and the RefSeq gene does not
                var shouldCopyHgncId = merged.HgncId == -1 && partner.HgncId != -1;
                if (shouldCopyHgncId)
                {
                    merged.HgncId = partner.HgncId;
                }

                merged.EntrezGeneId = partner.EntrezGeneId;
                _mergedGenes.Add(merged);

                // flag both sources so that the RefSeq gene is skipped by the orphan pass below
                partner.Invalid = true;
                ensembl.Invalid = true;
                _numMergedGenes++;
            }

            // add the unused RefSeq genes
            foreach (var refSeq in flatRefSeq)
            {
                if (!refSeq.Invalid)
                {
                    AddRefSeqOrphan(refSeq);
                }
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Parses one tab-separated line, validates its annotation columns, and stores a
        /// <c>CustomGene</c> for the resolved gene symbol in <paramref name="geneAnnotations"/>.
        /// </summary>
        /// <param name="line">The tab-separated input line.</param>
        /// <param name="geneAnnotations">Dictionary, keyed by gene symbol, that receives the new item.</param>
        /// <param name="skipGeneIdValidation">
        /// When false, a gene ID that cannot be resolved to a symbol is added to <c>_unknownGenes</c>.
        /// In either case such a line is logged and skipped.
        /// </param>
        /// <param name="logWriter">Optional writer for the "skipping" message; may be null.</param>
        /// <exception cref="UserErrorException">
        /// Thrown when the column count differs from the header, when no annotation column has a
        /// value other than "" or ".", or when the gene symbol is already present in
        /// <paramref name="geneAnnotations"/>.
        /// </exception>
        private void AddItem(string line, IDictionary<string, List<ISuppGeneItem>> geneAnnotations, bool skipGeneIdValidation, StreamWriter logWriter)
        {
            var columns = line.OptimizedSplit('\t');

            if (columns.Length != _tags.Length)
            {
                throw new UserErrorException($"Column number mismatch!! Header has {_tags.Length} columns but {line} contains {columns.Length}");
            }

            // the gene ID is taken from the second column
            string geneId = columns[1];

            var values        = new string[_numAnnotationColumns];
            var sawAnnotation = false;

            for (var column = 0; column < _numAnnotationColumns; column++)
            {
                // annotation columns follow the required columns
                string value = columns[column + NumRequiredColumns];
                values[column] = value;

                // "" and "." do not count as a provided annotation
                sawAnnotation |= !(value == "" || value == ".");

                _annotationValidators[column](value, line);
            }

            if (!sawAnnotation)
            {
                throw new UserErrorException($"No annotation provided in line {line}");
            }

            string symbol = GeneUtilities.GetGeneSymbolFromId(geneId, _entrezGeneIdToSymbol, _ensemblIdToSymbol);

            if (symbol == null)
            {
                if (!skipGeneIdValidation)
                {
                    _unknownGenes.Add(geneId);
                }

                logWriter?.WriteLine($"Skipping unrecognized gene ID {geneId}");
                return;
            }

            // each gene symbol may appear on one line only
            if (geneAnnotations.ContainsKey(symbol))
            {
                throw new UserErrorException($"Found the same gene {symbol} in different lines. Current line is: {line}");
            }

            // every annotation value is wrapped in its own single-element array
            var wrappedValues = values.Select(x => new[] { x }).ToList();
            var customGene    = new CustomGene(symbol, wrappedValues, JsonSchema, line);

            geneAnnotations[symbol] = new List<ISuppGeneItem> { customGene };
        }
Exemplo n.º 5
0
        /// <summary>
        /// Parses the gene annotation input file and writes two outputs into <c>_outputDirectory</c>:
        /// the NGA file and, next to it, the JSON schema file.
        /// </summary>
        /// <returns><c>ExitCodes.Success</c> when both files have been written.</returns>
        private static ExitCodes ProgramExecution()
        {
            var (entrezToSymbol, ensemblToSymbol) = GeneUtilities.ParseUniversalGeneArchive(null, _universalGeneArchivePath);

            // the schema path is the NGA path with the schema suffix appended
            string ngaPath    = Path.Combine(_outputDirectory, GetOutputPrefix(_inputFile) + SaCommon.NgaFileSuffix);
            string schemaPath = ngaPath + SaCommon.JsonSchemaSuffix;

            // resources are acquired top to bottom and disposed in reverse order
            using (var parser = GeneAnnotationsParser.Create(GZipUtilities.GetAppropriateStreamReader(_inputFile), entrezToSymbol, ensemblToSymbol))
            using (var ngaStream = FileUtilities.GetCreateStream(ngaPath))
            using (var ngaWriter = CaUtilities.GetNgaWriter(ngaStream, parser, CaUtilities.GetInputFileName(_inputFile)))
            using (var schemaStream = FileUtilities.GetCreateStream(schemaPath))
            using (var schemaWriter = new StreamWriter(schemaStream))
            {
                ngaWriter.Write(parser.GetItems());
                schemaWriter.Write(parser.JsonSchema);
            }

            return ExitCodes.Success;
        }
Exemplo n.º 6
0
        /// <summary>
        /// Reads the OMIM API key from the environment, then uses <c>OmimQuery</c> to generate the
        /// MIM-to-gene-symbol file and the JSON response in <c>_outputDirectory</c>, writes the
        /// OMIM version file, and displays the gene symbol updater statistics.
        /// </summary>
        /// <returns><c>ExitCodes.Success</c> when all steps have completed.</returns>
        /// <exception cref="InvalidDataException">
        /// Thrown when the API key environment variable is missing, empty, or blank.
        /// </exception>
        private static ExitCodes ProgramExecution()
        {
            _apiKey = GetEnvironmentVariable(OmimApiKeyEnvironmentVariableName);

            // An environment variable can be defined but empty; reject that here instead of
            // handing an unusable key to OmimQuery.
            if (string.IsNullOrWhiteSpace(_apiKey))
            {
                throw new InvalidDataException("Please set the OMIM API key as the environment variable \"OmimApiKey\".");
            }

            var (entrezGeneIdToSymbol, ensemblGeneIdToSymbol) = GeneUtilities.ParseUniversalGeneArchive(_inputReferencePath, _universalGeneArchivePath);
            var geneSymbolUpdater = new GeneSymbolUpdater(entrezGeneIdToSymbol, ensemblGeneIdToSymbol);

            using (var omimQuery = new OmimQuery(_apiKey, _outputDirectory))
            {
                omimQuery.GenerateMimToGeneSymbolFile(geneSymbolUpdater);
                omimQuery.GenerateJsonResponse();
            }

            // the version file is written only after the query object has been disposed
            OmimVersion.WriteToFile(OmimQuery.JsonResponseFile, _outputDirectory);

            geneSymbolUpdater.DisplayStatistics();
            return ExitCodes.Success;
        }
Exemplo n.º 7
0
        /// <summary>
        /// Parses the gene annotation input file and writes two outputs into <c>_outputDirectory</c>:
        /// the gene annotation file and, next to it, the JSON schema file.
        /// </summary>
        /// <returns><c>ExitCodes.Success</c> when both files have been written.</returns>
        /// <exception cref="UserErrorException">
        /// Thrown after the items have been written if the parser reports any unknown gene IDs.
        /// The schema is not written in that case.
        /// </exception>
        private static ExitCodes ProgramExecution()
        {
            var (entrezToSymbol, ensemblToSymbol) = GeneUtilities.ParseUniversalGeneArchive(null, _universalGeneArchivePath);

            // the schema path is the annotation file path with the schema suffix appended
            string ngaPath    = Path.Combine(_outputDirectory, GetOutputPrefix(_inputFile) + SaCommon.GeneFileSuffix);
            string schemaPath = ngaPath + SaCommon.JsonSchemaSuffix;

            // resources are acquired top to bottom and disposed in reverse order
            using (var parser = GeneAnnotationsParser.Create(GZipUtilities.GetAppropriateStreamReader(_inputFile), entrezToSymbol, ensemblToSymbol))
            using (var ngaStream = FileUtilities.GetCreateStream(ngaPath))
            using (var ngaWriter = CaUtilities.GetNgaWriter(ngaStream, parser, CaUtilities.GetInputFileName(_inputFile)))
            using (var schemaStream = FileUtilities.GetCreateStream(schemaPath))
            using (var schemaWriter = new StreamWriter(schemaStream))
            {
                ngaWriter.Write(parser.GetItems());

                // Queried only after the items have been written.
                // NOTE(review): unknown IDs are presumably collected while the items are enumerated - confirm.
                var unknownGenes = parser.GetUnknownGenes();
                if (unknownGenes.Count > 0)
                {
                    throw new UserErrorException($"The following gene IDs were not recognized in Nirvana: {string.Join(',', unknownGenes)}.");
                }

                schemaWriter.Write(parser.JsonSchema);
            }

            return ExitCodes.Success;
        }
Exemplo n.º 8
0
 /// <summary>
 /// Creates a <c>GeneAnnotationsParser</c> that reads from the given custom TSV stream, using the
 /// gene ID to symbol mappings parsed from the universal gene archive at <c>LambdaUrlHelper.GetUgaUrl()</c>.
 /// </summary>
 /// <param name="customTsvStream">Stream holding the custom TSV content.</param>
 /// <returns>The parser created by <c>GeneAnnotationsParser.Create</c>.</returns>
 private static GeneAnnotationsParser GetGeneAnnotationsParserFromCustomTsvStream(PersistentStream customTsvStream)
 {
     var (entrezToSymbol, ensemblToSymbol) = GeneUtilities.ParseUniversalGeneArchive(null, LambdaUrlHelper.GetUgaUrl());

     // wrap the incoming stream step by step before handing it to the parser
     var tsvStream = GZipUtilities.GetAppropriateStream(customTsvStream);
     var tsvReader = new StreamReader(tsvStream);

     return GeneAnnotationsParser.Create(tsvReader, entrezToSymbol, ensemblToSymbol);
 }