/// <summary>
/// Builds one flattened gene for every gene in <c>_genes</c> that is not flagged
/// <c>Invalid</c> at the moment it is visited.
/// </summary>
/// <param name="overlapStart">passed through unchanged to <c>GetFlattenedGene</c> (default: -1)</param>
/// <param name="overlapEnd">passed through unchanged to <c>GetFlattenedGene</c> (default: -1)</param>
/// <returns>a new list holding the flattened genes in visiting order</returns>
/// <exception cref="UserErrorException">
/// thrown when the lookup returned by <c>GeneUtilities.GetGenesById</c> has no entry for a gene's ID
/// </exception>
public List<MutableGene> Flatten(int overlapStart = -1, int overlapEnd = -1)
{
    var idToGenes      = GeneUtilities.GetGenesById(_genes, _isEnsembl);
    var flattenedGenes = new List<MutableGene>();

    foreach (var currentGene in _genes)
    {
        // NOTE(review): Invalid is re-read on every iteration; presumably GetFlattenedGene
        // flags the genes it has absorbed - confirm against its implementation.
        if (currentGene.Invalid)
        {
            continue;
        }

        // the lookup is keyed by the Ensembl ID or the Entrez gene ID, depending on the data source
        string lookupKey;
        if (_isEnsembl)
        {
            lookupKey = currentGene.EnsemblId.ToString();
        }
        else
        {
            lookupKey = currentGene.EntrezGeneId.ToString();
        }

        List<MutableGene> sameIdGenes;
        bool foundGroup = idToGenes.TryGetValue(lookupKey, out sameIdGenes);
        if (!foundGroup)
        {
            throw new UserErrorException($"Unable to find similar genes for {lookupKey}");
        }

        var flattenedGene = GetFlattenedGene(currentGene, sameIdGenes, overlapStart, overlapEnd);
        flattenedGenes.Add(flattenedGene);
    }

    if (_showOutput)
    {
        Console.WriteLine($" - {_description}: {flattenedGenes.Count} genes.");
    }

    return flattenedGenes;
}
/// <summary>
/// Empties <c>_mergedGenes</c>, then hands every gene in <c>_genes</c> that is not flagged
/// <c>Invalid</c> to <c>MergesGenesWithSameSymbol</c> together with the genes sharing its symbol.
/// Afterwards the orphan and merged counters are written to the console.
/// </summary>
/// <returns>the <c>_mergedGenes</c> list itself (not a copy)</returns>
/// <exception cref="UserErrorException">
/// thrown when the lookup returned by <c>GeneUtilities.GetGenesBySymbol</c> has no entry for a gene's symbol
/// </exception>
public List<MutableGene> Merge()
{
    // NOTE(review): only the list is cleared here; the three counters printed below are not
    // reset in this method - confirm whether repeated calls are expected.
    _mergedGenes.Clear();

    var symbolToGenes = GeneUtilities.GetGenesBySymbol(_genes);

    foreach (var seedGene in _genes)
    {
        if (seedGene.Invalid)
        {
            continue;
        }

        List<MutableGene> sameSymbolGenes;
        bool foundGroup = symbolToGenes.TryGetValue(seedGene.Symbol, out sameSymbolGenes);
        if (!foundGroup)
        {
            throw new UserErrorException($"Unable to find similar genes for {seedGene.Symbol}");
        }

        MergesGenesWithSameSymbol(seedGene, sameSymbolGenes);
    }

    // summary statistics
    Console.WriteLine($" - {_numOrphanEnsemblGenes} orphan Ensembl genes.");
    Console.WriteLine($" - {_numOrphanRefSeqGenes} orphan RefSeq genes.");
    Console.WriteLine($" - {_numMergedGenes} merged genes.");

    return _mergedGenes;
}
/// <summary>
/// Pairs the Ensembl and RefSeq genes that share a symbol with the seed gene.
/// The valid genes are split by data source and flattened separately. Each flattened Ensembl
/// gene is then either merged with its linked RefSeq gene or recorded as an Ensembl orphan.
/// Every flattened RefSeq gene that was not consumed by a merge is recorded as a RefSeq orphan.
/// </summary>
/// <param name="seedGene">the gene that triggered this merge; forwarded to <c>GetValidGenes</c></param>
/// <param name="genesWithSameSymbol">the candidate genes; forwarded to <c>GetValidGenes</c></param>
private void MergesGenesWithSameSymbol(MutableGene seedGene, List<MutableGene> genesWithSameSymbol)
{
    int overlapStart, overlapEnd;
    var candidates = GetValidGenes(seedGene, genesWithSameSymbol, out overlapStart, out overlapEnd);

    // split by data source
    var ensemblSubset = GeneUtilities.GetGenesByDataSource(candidates, TranscriptDataSource.Ensembl);
    var refSeqSubset  = GeneUtilities.GetGenesByDataSource(candidates, TranscriptDataSource.RefSeq);

    // flatten each data source on its own, using the same overlap interval
    var flatEnsembl = new GeneFlattener(ensemblSubset, "Ensembl", false).Flatten(overlapStart, overlapEnd);
    var flatRefSeq  = new GeneFlattener(refSeqSubset, "RefSeq", false).Flatten(overlapStart, overlapEnd);

    foreach (var flatEnsemblGene in flatEnsembl)
    {
        // an Ensembl gene is an orphan when it has no linked Entrez ID, or when that
        // ID does not resolve to one of the flattened RefSeq genes
        string linkedEntrezId;
        bool isLinked = _linkedEnsemblIds.TryGetValue(flatEnsemblGene.EnsemblId.ToString(), out linkedEntrezId);
        var partner   = isLinked ? GeneUtilities.GetRefSeqGeneById(flatRefSeq, linkedEntrezId) : null;

        if (partner == null)
        {
            AddEnsemblOrphan(flatEnsemblGene);
            continue;
        }

        // merge: start from a clone of the Ensembl gene and fold in the RefSeq information
        var combined = MutableGene.Clone(flatEnsemblGene);
        combined.TranscriptDataSource = TranscriptDataSource.BothRefSeqAndEnsembl;
        UpdateCoordinates(partner, combined);

        // only borrow the RefSeq HGNC ID when the Ensembl gene does not have one (-1)
        bool needsHgncId = combined.HgncId == -1;
        if (needsHgncId && partner.HgncId != -1)
        {
            combined.HgncId = partner.HgncId;
        }

        combined.EntrezGeneId = partner.EntrezGeneId;
        _mergedGenes.Add(combined);

        // flag both sources so that they are not reported again
        partner.Invalid         = true;
        flatEnsemblGene.Invalid = true;
        _numMergedGenes++;
    }

    // whatever RefSeq genes were not consumed above are orphans
    foreach (var flatRefSeqGene in flatRefSeq)
    {
        if (!flatRefSeqGene.Invalid)
        {
            AddRefSeqOrphan(flatRefSeqGene);
        }
    }
}
/// <summary>
/// Parses one tab-delimited data line and registers the resulting custom gene annotation
/// under the gene symbol resolved from the gene ID found in the second column.
/// </summary>
/// <param name="line">the raw tab-delimited line</param>
/// <param name="geneAnnotations">the dictionary (keyed by gene symbol) that receives the new entry</param>
/// <param name="skipGeneIdValidation">
/// when false, an unrecognized gene ID is also added to <c>_unknownGenes</c>
/// </param>
/// <param name="logWriter">optional writer that receives a message for each skipped gene ID; may be null</param>
/// <exception cref="UserErrorException">
/// thrown when the column count differs from the header, when every annotation column is
/// empty or ".", or when the resolved gene symbol is already present in <paramref name="geneAnnotations"/>
/// </exception>
private void AddItem(string line, IDictionary<string, List<ISuppGeneItem>> geneAnnotations, bool skipGeneIdValidation, StreamWriter logWriter)
{
    var columns = line.OptimizedSplit('\t');
    if (columns.Length != _tags.Length)
    {
        throw new UserErrorException($"Column number mismatch!! Header has {_tags.Length} columns but {line} contains {columns.Length}");
    }

    string geneId       = columns[1];
    var values          = new string[_numAnnotationColumns];
    var foundAnnotation = false;

    // annotation columns follow the required columns; each one is run through its validator
    for (var columnIndex = 0; columnIndex < _numAnnotationColumns; columnIndex++)
    {
        string value = columns[columnIndex + NumRequiredColumns];
        values[columnIndex] = value;

        // "" and "." both count as "no value supplied"
        bool isPlaceholder = value == "" || value == ".";
        if (!isPlaceholder)
        {
            foundAnnotation = true;
        }

        _annotationValidators[columnIndex](value, line);
    }

    if (!foundAnnotation)
    {
        throw new UserErrorException($"No annotation provided in line {line}");
    }

    string symbol = GeneUtilities.GetGeneSymbolFromId(geneId, _entrezGeneIdToSymbol, _ensemblIdToSymbol);

    if (symbol == null)
    {
        // unrecognized gene IDs are skipped; they are only remembered when validation is enabled
        if (!skipGeneIdValidation)
        {
            _unknownGenes.Add(geneId);
        }

        logWriter?.WriteLine($"Skipping unrecognized gene ID {geneId}");
        return;
    }

    if (geneAnnotations.ContainsKey(symbol))
    {
        throw new UserErrorException($"Found the same gene {symbol} in different lines. Current line is: {line}");
    }

    // every annotation value is wrapped in its own single-element array
    var wrappedValues = values.Select(v => new[] { v }).ToList();
    var customGene    = new CustomGene(symbol, wrappedValues, JsonSchema, line);

    geneAnnotations[symbol] = new List<ISuppGeneItem> { customGene };
}
/// <summary>
/// Parses the custom gene annotation input file and writes two outputs into
/// <c>_outputDirectory</c>: the annotation file (named with <c>SaCommon.NgaFileSuffix</c>)
/// and its JSON schema (same path plus <c>SaCommon.JsonSchemaSuffix</c>).
/// </summary>
/// <returns><c>ExitCodes.Success</c> when both outputs have been written</returns>
private static ExitCodes ProgramExecution()
{
    var (entrezIdToSymbol, ensemblIdToSymbol) = GeneUtilities.ParseUniversalGeneArchive(null, _universalGeneArchivePath);

    string prefix     = GetOutputPrefix(_inputFile);
    string outputPath = Path.Combine(_outputDirectory, prefix + SaCommon.NgaFileSuffix);
    string schemaPath = outputPath + SaCommon.JsonSchemaSuffix;

    using (var annotationParser = GeneAnnotationsParser.Create(GZipUtilities.GetAppropriateStreamReader(_inputFile), entrezIdToSymbol, ensemblIdToSymbol))
    using (var outputStream     = FileUtilities.GetCreateStream(outputPath))
    using (var outputWriter     = CaUtilities.GetNgaWriter(outputStream, annotationParser, CaUtilities.GetInputFileName(_inputFile)))
    using (var schemaStream     = FileUtilities.GetCreateStream(schemaPath))
    using (var schemaWriter     = new StreamWriter(schemaStream))
    {
        // the items are written before the schema is read from the parser
        outputWriter.Write(annotationParser.GetItems());
        schemaWriter.Write(annotationParser.JsonSchema);
    }

    return ExitCodes.Success;
}
/// <summary>
/// Runs the OMIM download: reads the API key from the environment, generates the
/// MIM-to-gene-symbol file and the JSON response through <c>OmimQuery</c>, writes the
/// OMIM version file and finally displays the gene symbol updater statistics.
/// </summary>
/// <returns><c>ExitCodes.Success</c> when every step has completed</returns>
/// <exception cref="InvalidDataException">thrown when the API key environment variable yields null</exception>
private static ExitCodes ProgramExecution()
{
    _apiKey = GetEnvironmentVariable(OmimApiKeyEnvironmentVariableName);

    if (_apiKey is null)
    {
        throw new InvalidDataException("Please set the OMIM API key as the environment variable \"OmimApiKey\".");
    }

    var (entrezIdToSymbol, ensemblIdToSymbol) = GeneUtilities.ParseUniversalGeneArchive(_inputReferencePath, _universalGeneArchivePath);
    var symbolUpdater = new GeneSymbolUpdater(entrezIdToSymbol, ensemblIdToSymbol);

    using (var query = new OmimQuery(_apiKey, _outputDirectory))
    {
        query.GenerateMimToGeneSymbolFile(symbolUpdater);
        query.GenerateJsonResponse();
    }

    // the version file is written only after the query object has been disposed
    OmimVersion.WriteToFile(OmimQuery.JsonResponseFile, _outputDirectory);
    symbolUpdater.DisplayStatistics();

    return ExitCodes.Success;
}
/// <summary>
/// Parses the custom gene annotation input file and writes two outputs into
/// <c>_outputDirectory</c>: the annotation file (named with <c>SaCommon.GeneFileSuffix</c>)
/// and its JSON schema (same path plus <c>SaCommon.JsonSchemaSuffix</c>).
/// </summary>
/// <returns><c>ExitCodes.Success</c> when both outputs have been written</returns>
/// <exception cref="UserErrorException">
/// thrown after the items have been written when the parser reports at least one unknown gene ID;
/// the schema is not written in that case
/// </exception>
private static ExitCodes ProgramExecution()
{
    var (entrezIdToSymbol, ensemblIdToSymbol) = GeneUtilities.ParseUniversalGeneArchive(null, _universalGeneArchivePath);

    string prefix     = GetOutputPrefix(_inputFile);
    string outputPath = Path.Combine(_outputDirectory, prefix + SaCommon.GeneFileSuffix);
    string schemaPath = outputPath + SaCommon.JsonSchemaSuffix;

    using (var annotationParser = GeneAnnotationsParser.Create(GZipUtilities.GetAppropriateStreamReader(_inputFile), entrezIdToSymbol, ensemblIdToSymbol))
    using (var outputStream     = FileUtilities.GetCreateStream(outputPath))
    using (var outputWriter     = CaUtilities.GetNgaWriter(outputStream, annotationParser, CaUtilities.GetInputFileName(_inputFile)))
    using (var schemaStream     = FileUtilities.GetCreateStream(schemaPath))
    using (var schemaWriter     = new StreamWriter(schemaStream))
    {
        outputWriter.Write(annotationParser.GetItems());

        // the unknown genes are checked only after the items have been handed to the writer
        if (annotationParser.GetUnknownGenes().Count > 0)
        {
            string unknownIds = string.Join(',', annotationParser.GetUnknownGenes());
            throw new UserErrorException($"The following gene IDs were not recognized in Nirvana: {unknownIds}.");
        }

        schemaWriter.Write(annotationParser.JsonSchema);
    }

    return ExitCodes.Success;
}
/// <summary>
/// Creates a <c>GeneAnnotationsParser</c> that reads from the supplied custom TSV stream.
/// The gene ID to symbol dictionaries are parsed from the universal gene archive located at
/// <c>LambdaUrlHelper.GetUgaUrl()</c>.
/// </summary>
/// <param name="customTsvStream">the stream holding the custom TSV content</param>
/// <returns>the parser returned by <c>GeneAnnotationsParser.Create</c></returns>
private static GeneAnnotationsParser GetGeneAnnotationsParserFromCustomTsvStream(PersistentStream customTsvStream)
{
    var ugaUrl = LambdaUrlHelper.GetUgaUrl();
    var (entrezIdToSymbol, ensemblIdToSymbol) = GeneUtilities.ParseUniversalGeneArchive(null, ugaUrl);

    // GetAppropriateStream picks the stream to read from before it is wrapped in a reader
    var tsvStream = GZipUtilities.GetAppropriateStream(customTsvStream);
    var tsvReader = new StreamReader(tsvStream);

    return GeneAnnotationsParser.Create(tsvReader, entrezIdToSymbol, ensemblIdToSymbol);
}