// Asking for a reference index that is not in the index dictionary is expected
// to raise an InvalidDataException.
public void GetChromosome_RefIndex_DoesNotExist()
{
    Assert.Throws<InvalidDataException>(() =>
    {
        ReferenceNameUtilities.GetChromosome(_refIndexToChromosome, 1);
    });
}
CdsEndNotFound, string TranslateableSequence, int StartExonPhase, string BamEditStatus) ReadTranscriptInfo( string line) { var cols = GetColumns("Transcript", line); string id = cols[1]; byte version = byte.Parse(cols[2]); ushort referenceIndex = ushort.Parse(cols[4]); int start = int.Parse(cols[5]); int end = int.Parse(cols[6]); var biotype = (BioType)byte.Parse(cols[8]); bool isCanonical = cols[9] == "Y"; int totalExonLength = int.Parse(cols[10]); string ccdsId = cols[11]; string refSeqId = cols[12]; var source = (Source)byte.Parse(cols[13]); bool cdsStartNotFound = cols[14] == "Y"; bool cdsEndNotFound = cols[15] == "Y"; int startExonPhase = int.Parse(cols[16]); string bamEditStatus = cols[17]; string translateableSequence = _reader.ReadLine(); var chromosome = ReferenceNameUtilities.GetChromosome(_refIndexToChromosome, referenceIndex); return(id, version, chromosome, start, end, biotype, isCanonical, totalExonLength, ccdsId, refSeqId, source , cdsStartNotFound, cdsEndNotFound, translateableSequence, startExonPhase, bamEditStatus); }
/// <summary>
/// Reads the next tab-delimited line from the reader and converts it into a <see cref="UgaGene"/>.
/// </summary>
/// <returns>The parsed gene, or null when the reader has no more lines.</returns>
/// <exception cref="InvalidDataException">Thrown when the line does not have exactly 11 columns.</exception>
private UgaGene GetNextGene()
{
    string geneLine = _reader.ReadLine();
    if (geneLine == null)
    {
        return null;
    }

    var fields = geneLine.OptimizedSplit('\t');
    if (fields.Length != 11)
    {
        throw new InvalidDataException($"Expected 11 columns, but found {fields.Length} columns.");
    }

    // column 0 carries the reference name; column 1 is not used here
    var chromosome = ReferenceNameUtilities.GetChromosome(_refNameToChromosome, fields[0]);
    string geneSymbol = fields[2];

    int grch37Start = int.Parse(fields[3]);
    int grch37End = int.Parse(fields[4]);
    int grch38Start = int.Parse(fields[5]);
    int grch38End = int.Parse(fields[6]);

    bool isReverseStrand = fields[7] == "R";
    int hgncId = int.Parse(fields[8]);
    string ensemblGeneId = fields[9];
    string entrezGeneId = fields[10];

    var grch37Interval = new Interval(grch37Start, grch37End);
    var grch38Interval = new Interval(grch38Start, grch38End);

    return new UgaGene(chromosome, grch37Interval, grch38Interval, isReverseStrand, entrezGeneId, ensemblGeneId,
        geneSymbol, hgncId);
}
/// <summary>
/// Loads the data bundle for the configured reference, collects the transcripts, SIFT/PolyPhen predictions,
/// regulatory regions and reference bases for the configured interval, and writes them out under the output stub.
/// </summary>
/// <returns><see cref="ExitCodes.Success"/> once all outputs have been written.</returns>
private static ExitCodes ProgramExecution()
{
    var consoleLogger = new ConsoleLogger();
    var dataBundle = DataBundle.GetDataBundle(_inputReferencePath, _inputPrefix);
    int numRefSeqs = dataBundle.SequenceReader.NumRefSeqs;

    var chromosome = ReferenceNameUtilities.GetChromosome(dataBundle.SequenceReader.RefNameToChromosome, _referenceName);
    dataBundle.Load(chromosome);

    string outputStub = GetOutputStub(chromosome, dataBundle.Source);
    var targetInterval = new ChromosomeInterval(chromosome, _referencePosition, _referenceEndPosition);

    // gather everything that falls within the target interval
    var transcripts = GetTranscripts(consoleLogger, dataBundle, targetInterval);

    var siftResult = GetPredictionStaging(consoleLogger, "SIFT", transcripts, chromosome,
        dataBundle.SiftPredictions, dataBundle.SiftReader, x => x.SiftIndex, numRefSeqs);

    var polyPhenResult = GetPredictionStaging(consoleLogger, "PolyPhen", transcripts, chromosome,
        dataBundle.PolyPhenPredictions, dataBundle.PolyPhenReader, x => x.PolyPhenIndex, numRefSeqs);

    string referenceBases = GetReferenceBases(consoleLogger, dataBundle.SequenceReader, targetInterval);

    var regulatoryRegionIntervalArrays = GetRegulatoryRegionIntervalArrays(consoleLogger,
        dataBundle.TranscriptCache, targetInterval, numRefSeqs);

    var transcriptIntervalArrays = PredictionUtilities.UpdateTranscripts(transcripts,
        dataBundle.SiftPredictions, siftResult.Predictions,
        dataBundle.PolyPhenPredictions, polyPhenResult.Predictions, numRefSeqs);

    var transcriptStaging = GetTranscriptStaging(dataBundle.TranscriptCacheData.Header,
        transcriptIntervalArrays, regulatoryRegionIntervalArrays);

    // write the outputs: transcript cache, SIFT, PolyPhen, then the reference bases
    WriteCache(consoleLogger, FileUtilities.GetCreateStream(CacheConstants.TranscriptPath(outputStub)),
        transcriptStaging, "transcript");
    WriteCache(consoleLogger, FileUtilities.GetCreateStream(CacheConstants.SiftPath(outputStub)),
        siftResult.Staging, "SIFT");
    WriteCache(consoleLogger, FileUtilities.GetCreateStream(CacheConstants.PolyPhenPath(outputStub)),
        polyPhenResult.Staging, "PolyPhen");
    WriteReference(consoleLogger, CacheConstants.BasesPath(outputStub), dataBundle.SequenceReader, chromosome,
        referenceBases, targetInterval.Start);

    return ExitCodes.Success;
}
// A null reference name is expected to yield an empty chromosome whose
// Ensembl and UCSC names are both the empty string.
public void GetChromosome_RefName_NullName()
{
    var observed = ReferenceNameUtilities.GetChromosome(_refNameToChromosome, null);

    Assert.Equal(string.Empty, observed.EnsemblName);
    Assert.Equal(string.Empty, observed.UcscName);
    Assert.True(observed.IsEmpty());
}
/// <summary>
/// Splits a tab-delimited VCF line and builds a <see cref="SimplePosition"/> from it, using a
/// <see cref="NullVcfFilter"/>.
/// </summary>
/// <param name="vcfLine">The raw VCF line.</param>
/// <param name="refNameToChromosome">Lookup from reference name to chromosome.</param>
/// <returns>The simple position produced from the chromosome, position, and VCF fields.</returns>
internal static SimplePosition GetSimplePosition(string vcfLine, IDictionary<string, IChromosome> refNameToChromosome)
{
    string[] fields = vcfLine.OptimizedSplit('\t');

    var chromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, fields[VcfCommon.ChromIndex]);
    int start = int.Parse(fields[VcfCommon.PosIndex]);

    return SimplePosition.GetSimplePosition(chromosome, start, fields, new NullVcfFilter());
}
// An unknown reference name is expected to yield an empty chromosome that
// echoes the requested name as both its Ensembl and UCSC name.
public void GetChromosome_RefName_DoesNotExist()
{
    const string chromosomeName = "dummy";

    var observed = ReferenceNameUtilities.GetChromosome(_refNameToChromosome, chromosomeName);

    Assert.Equal(chromosomeName, observed.EnsemblName);
    Assert.Equal(chromosomeName, observed.UcscName);
    Assert.True(observed.IsEmpty());
}
/// <summary>
/// Looks up a chromosome by name and refuses to return an empty chromosome.
/// </summary>
/// <param name="refNameToChromosome">Lookup from reference name to chromosome.</param>
/// <param name="name">The reference name to resolve.</param>
/// <returns>The non-empty chromosome for <paramref name="name"/>.</returns>
/// <exception cref="InvalidDataException">Thrown when the lookup produces an empty chromosome.</exception>
private static IChromosome GetChromosome(IDictionary<string, IChromosome> refNameToChromosome, string name)
{
    var resolved = ReferenceNameUtilities.GetChromosome(refNameToChromosome, name);
    if (!resolved.IsEmpty())
    {
        return resolved;
    }

    throw new InvalidDataException($"Could not find the chromosome ({name}) in the reference name dictionary.");
}
/// <summary>
/// Resolves the chromosome named by the leading part of a cytogenetic band string.
/// </summary>
/// <param name="cytogeneticBand">The cytogenetic band text.</param>
/// <returns>
/// The chromosome looked up from the text preceding the arm position, or an
/// <see cref="EmptyChromosome"/> built from the whole band when no arm position is found.
/// </returns>
private IChromosome GetChromosome(string cytogeneticBand)
{
    int armIndex = GetArmPos(cytogeneticBand);

    if (armIndex != -1)
    {
        // everything before the arm position is treated as the chromosome name
        string chromosomeName = cytogeneticBand.Substring(0, armIndex);
        return ReferenceNameUtilities.GetChromosome(_refNameToChromosome, chromosomeName);
    }

    return new EmptyChromosome(cytogeneticBand);
}
/// <summary>
/// Converts this range into a <see cref="GenomicRange"/> by resolving the chromosome names of its endpoints.
/// </summary>
/// <param name="refNameToChromosome">Lookup from reference name to chromosome.</param>
/// <returns>A genomic range whose end position is null when <c>End</c> has no value.</returns>
public GenomicRange ToGenomicRange(IDictionary<string, IChromosome> refNameToChromosome)
{
    var startChromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, Start.Chromosome);
    var rangeStart = new GenomicPosition(startChromosome, Start.Position);

    GenomicPosition? rangeEnd = End.HasValue
        ? new GenomicPosition(
            ReferenceNameUtilities.GetChromosome(refNameToChromosome, End.Value.Chromosome),
            End.Value.Position)
        : (GenomicPosition?)null;

    return new GenomicRange(rangeStart, rangeEnd);
}
/// <summary>
/// Builds the pair of breakends describing a structural variant on a single chromosome.
/// </summary>
/// <param name="ensemblName">Reference name used to look up the chromosome.</param>
/// <param name="start">Start position of the variant.</param>
/// <param name="svType">Structural variant type; only deletions, (tandem) duplications and inversions are handled.</param>
/// <param name="svEnd">End position of the variant, if known.</param>
/// <param name="isInv3">Selects the first inversion layout.</param>
/// <param name="isInv5">Selects the second inversion layout (checked only when <paramref name="isInv3"/> is false).</param>
/// <returns>Two breakends, or null when <paramref name="svEnd"/> is null or the variant type is not handled.</returns>
internal IBreakEnd[] GetSvBreakEnds(string ensemblName, int start, VariantType svType, int? svEnd, bool isInv3, bool isInv5)
{
    if (!svEnd.HasValue)
    {
        return null;
    }

    int end = svEnd.Value;
    var chromosome = ReferenceNameUtilities.GetChromosome(_refNameToChromosome, ensemblName);

    IBreakEnd first;
    IBreakEnd second;

    // ReSharper disable once SwitchStatementMissingSomeCases
    switch (svType)
    {
        case VariantType.deletion:
            first = new BreakEnd(chromosome, chromosome, start, end + 1, false, true);
            second = new BreakEnd(chromosome, chromosome, end + 1, start, true, false);
            break;

        case VariantType.tandem_duplication:
        case VariantType.duplication:
            first = new BreakEnd(chromosome, chromosome, end, start, false, true);
            second = new BreakEnd(chromosome, chromosome, start, end, true, false);
            break;

        case VariantType.inversion:
            if (isInv3)
            {
                first = new BreakEnd(chromosome, chromosome, start, end, false, false);
                second = new BreakEnd(chromosome, chromosome, end, start, false, false);
            }
            else if (isInv5)
            {
                first = new BreakEnd(chromosome, chromosome, start + 1, end + 1, true, true);
                second = new BreakEnd(chromosome, chromosome, end + 1, start + 1, true, true);
            }
            else
            {
                // neither flag set: one breakend of each layout
                first = new BreakEnd(chromosome, chromosome, start, end, false, false);
                second = new BreakEnd(chromosome, chromosome, end + 1, start + 1, true, true);
            }
            break;

        default:
            return null;
    }

    return new[] { first, second };
}
/// <summary>
/// Builds a <see cref="SimplePosition"/> from already-split VCF fields.
/// </summary>
/// <param name="vcfFields">The columns of one VCF line.</param>
/// <param name="refNameToChromosome">Lookup from reference name to chromosome.</param>
/// <param name="isRecomposed">Value stored in <c>IsRecomposed</c>.</param>
/// <returns>
/// The populated position. <c>End</c> is -1 when the ALT column starts with "&lt;" or equals "*";
/// otherwise it is the start plus the reference allele length minus one.
/// </returns>
public static SimplePosition GetSimplePosition(string[] vcfFields, IDictionary<string, IChromosome> refNameToChromosome, bool isRecomposed = false)
{
    var result = new SimplePosition
    {
        Start = Convert.ToInt32(vcfFields[VcfCommon.PosIndex]),
        Chromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, vcfFields[VcfCommon.ChromIndex]),
        RefAllele = vcfFields[VcfCommon.RefIndex]
    };

    string altColumn = vcfFields[VcfCommon.AltIndex];

    if (altColumn.StartsWith("<") || altColumn == "*")
    {
        result.End = -1;
    }
    else
    {
        result.End = result.Start + result.RefAllele.Length - 1;
    }

    result.AltAlleles = altColumn.Split(",");
    result.VcfFields = vcfFields;
    result.IsRecomposed = isRecomposed;

    // one flag per alt allele, all false by default
    result.IsDecomposed = new bool[result.AltAlleles.Length];

    return result;
}
/// <summary>
/// Lazily produces one recomposed <see cref="SimplePosition"/> per entry in <c>RecomposedAlleles</c>,
/// visiting the variant sites in sorted order.
/// </summary>
/// <param name="refNameToChromosome">Lookup from reference name to chromosome.</param>
/// <returns>The recomposed positions, each with its linked VIDs filled in per alt allele.</returns>
public IEnumerable<SimplePosition> GetRecomposedPositions(IDictionary<string, IChromosome> refNameToChromosome)
{
    foreach (var site in RecomposedAlleles.Keys.OrderBy(x => x))
    {
        var siteInfo = RecomposedAlleles[site];

        var altAlleles = new List<string>();
        var linkedVidsPerAltAllele = new List<List<string>>();

        // genotype index handed to the next allele that differs from the reference
        var nextAltGenotypeIndex = 1;

        var genotypesBySample = new List<int>[_numSamples];
        for (var sampleIndex = 0; sampleIndex < _numSamples; sampleIndex++)
        {
            genotypesBySample[sampleIndex] = new List<int>();
        }

        foreach (string allele in siteInfo.AltAlleleToSample.Keys.OrderBy(x => x))
        {
            var (sampleAlleles, linkedVids) = siteInfo.AltAlleleToSample[allele];

            // an allele identical to the reference keeps genotype index 0
            int alleleGenotypeIndex = 0;
            if (allele != site.RefAllele)
            {
                alleleGenotypeIndex = nextAltGenotypeIndex;
                nextAltGenotypeIndex++;
                altAlleles.Add(allele);
                linkedVidsPerAltAllele.Add(linkedVids);
            }

            foreach (var sampleAllele in sampleAlleles)
            {
                SetGenotypeWithAlleleIndex(genotypesBySample[sampleAllele.SampleIndex],
                    sampleAllele.HaplotypeIndex, alleleGenotypeIndex);
            }
        }

        string altColumn = string.Join(",", altAlleles);
        var vcfFields = GetVcfFields(site, siteInfo, altColumn, genotypesBySample);

        var chromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, vcfFields[VcfCommon.ChromIndex]);
        var recomposed = SimplePosition.GetSimplePosition(chromosome, site.Start, vcfFields, new NullVcfFilter(), true);

        for (var altIndex = 0; altIndex < linkedVidsPerAltAllele.Count; altIndex++)
        {
            recomposed.LinkedVids[altIndex] = linkedVidsPerAltAllele[altIndex];
        }

        yield return recomposed;
    }
}
/// <summary>
/// Reads tab-delimited cytogenetic band records from a stream and groups them by reference index.
/// </summary>
/// <param name="stream">The stream holding the band records; reading stops at the first null or empty line.</param>
/// <param name="numRefSeqs">The number of per-reference band lists to allocate.</param>
/// <param name="refNameToChromosome">Lookup from reference name to chromosome.</param>
/// <returns>An array of band lists indexed by chromosome index. Records whose chromosome is empty are skipped.</returns>
/// <exception cref="InvalidDataException">Thrown when a line does not have exactly 5 columns.</exception>
public static List<Band>[] GetCytogeneticBands(Stream stream, int numRefSeqs, IDictionary<string, IChromosome> refNameToChromosome)
{
    const int expectedNumColumns = 5;

    var bandsByRefIndex = new List<Band>[numRefSeqs];
    for (var refIndex = 0; refIndex < numRefSeqs; refIndex++)
    {
        bandsByRefIndex[refIndex] = new List<Band>();
    }

    using (var reader = new StreamReader(stream))
    {
        string line;
        while (!string.IsNullOrEmpty(line = reader.ReadLine()))
        {
            string[] fields = line.Split('\t');
            if (fields.Length != expectedNumColumns)
            {
                throw new InvalidDataException($"Expected {expectedNumColumns} columns, but found {fields.Length} columns: [{line}]");
            }

            // the first coordinate is shifted by one (presumably 0-based to 1-based - confirm against the input format)
            int begin = int.Parse(fields[1]) + 1;
            int end = int.Parse(fields[2]);
            string bandName = fields[3];

            var chromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, fields[0]);
            if (!chromosome.IsEmpty())
            {
                bandsByRefIndex[chromosome.Index].Add(new Band(begin, end, bandName));
            }
        }
    }

    return bandsByRefIndex;
}
// Built once and shared: constructing a RegexOptions.Compiled regex on every call repeats the
// compilation cost for each allele. Regex instances are safe to share for matching.
// Shape: (\w+)([\[\]])([^:]+):(\d+)([\[\]])
private static readonly Regex ForwardBreakendAltRegex = new Regex(@"\w+([\[\]])([^:]+):(\d+)([\[\]])", RegexOptions.Compiled);

// Shape: ([\[\]])([^:]+):(\d+)([\[\]])(\w+)
private static readonly Regex ReverseBreakendAltRegex = new Regex(@"([\[\]])([^:]+):(\d+)([\[\]])\w+", RegexOptions.Compiled);

/// <summary>
/// Parses a breakend alternate allele into the partner chromosome, the partner position, and the two
/// suffix flags.
/// </summary>
/// <param name="refAllele">The reference allele; an alt allele that begins with it is parsed with the forward pattern.</param>
/// <param name="altAllele">The breakend alternate allele to parse.</param>
/// <returns>
/// The partner chromosome and position. <c>IsSuffix1</c> is false when the alt allele starts with the
/// reference allele and true otherwise. <c>IsSuffix2</c> reflects whether the relevant bracket equals
/// <c>ForwardBreakEnd</c>.
/// </returns>
/// <exception cref="InvalidDataException">Thrown when the alt allele does not match the expected pattern.</exception>
private (IChromosome Chromosome2, int Position2, bool IsSuffix1, bool IsSuffix2) ParseBreakendAltAllele(string refAllele, string altAllele)
{
    // ordinal comparison: allele strings are identifiers, not linguistic text
    bool startsWithRefAllele = altAllele.StartsWith(refAllele, StringComparison.Ordinal);

    var regex = startsWithRefAllele ? ForwardBreakendAltRegex : ReverseBreakendAltRegex;
    var match = regex.Match(altAllele);

    if (!match.Success)
    {
        throw new InvalidDataException(
            "Unable to successfully parse the complex rearrangements for the following allele: " + altAllele);
    }

    // the forward pattern inspects the trailing bracket (group 4), the reverse pattern the leading one (group 1)
    int bracketGroup = startsWithRefAllele ? 4 : 1;

    bool isSuffix2 = match.Groups[bracketGroup].Value == ForwardBreakEnd;
    int position2 = Convert.ToInt32(match.Groups[3].Value);
    string referenceName2 = match.Groups[2].Value;

    var chromosome2 = ReferenceNameUtilities.GetChromosome(_refNameToChromosome, referenceName2);
    return (chromosome2, position2, !startsWithRefAllele, isSuffix2);
}
/// <summary>
/// Maps chromosome indices to the subdirectories of <paramref name="dirPath"/> whose names resolve to a
/// known chromosome.
/// </summary>
/// <param name="dirPath">The directory whose immediate subdirectories are examined.</param>
/// <returns>A dictionary from chromosome index to subdirectory path; unrecognized directories are skipped.</returns>
public Dictionary<ushort, string> GetRefIndexToVepDir(string dirPath)
{
    var subdirectories = Directory.GetDirectories(dirPath);
    var refIndexToDir = new Dictionary<ushort, string>();

    foreach (string subdirectory in subdirectories)
    {
        // the directory's own name doubles as the reference name
        string referenceName = Path.GetFileName(subdirectory);
        var chromosome = ReferenceNameUtilities.GetChromosome(_refNameToChromosome, referenceName);

        if (!chromosome.IsEmpty())
        {
            refIndexToDir[chromosome.Index] = subdirectory;
        }
    }

    return refIndexToDir;
}
/// <summary>
/// Moves a position leftwards until it no longer overlaps any gene in the interval forest.
/// </summary>
/// <param name="genomicPosition">The position to start from.</param>
/// <param name="geneIntervalForest">The forest queried for genes overlapping the current position.</param>
/// <param name="refNameToChromosome">Lookup from reference name to chromosome.</param>
/// <returns>
/// A position on the same chromosome, located just before the left-most overlapping gene and
/// clamped to a minimum of 1.
/// </returns>
private static AnnotationPosition FindProperStartPosition(AnnotationPosition genomicPosition, IIntervalForest<IGene> geneIntervalForest, IDictionary<string, IChromosome> refNameToChromosome)
{
    var chromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, genomicPosition.Chromosome);
    int currentPosition = genomicPosition.Position;

    while (true)
    {
        IGene[] overlappingGenes = geneIntervalForest.GetAllOverlappingValues(chromosome.Index, currentPosition, currentPosition);

        // Both null and an empty array mean "nothing overlaps". Treating an empty array as a
        // continuation would re-query the same position forever.
        if (overlappingGenes == null || overlappingGenes.Length == 0)
        {
            break;
        }

        currentPosition = overlappingGenes.Min(x => x.Start) - 1;
    }

    // Always return the position right before the overlapping genes to KISS
    return new AnnotationPosition(genomicPosition.Chromosome, currentPosition < 1 ? 1 : currentPosition);
}
/// <summary>
/// Returns the next queued position, reading and recomposing further VCF lines whenever the queue is empty.
/// </summary>
/// <returns>The next position, or null when the input is exhausted and nothing remains queued.</returns>
/// <exception cref="InvalidDataException">Thrown when the VCF position column cannot be parsed as an integer.</exception>
/// <exception cref="UserErrorException">Thrown when the line has an inconsistent number of sample fields.</exception>
private ISimplePosition GetNextSimplePosition()
{
    while (_queuedPositions.Count == 0)
    {
        VcfLine = _vcfFilter.GetNextLine(_reader);

        // stays null at end of input; the recomposer is still invoked with it
        SimplePosition parsedPosition = null;

        if (VcfLine != null)
        {
            string[] fields = VcfLine.OptimizedSplit('\t');
            string chromosomeName = fields[VcfCommon.ChromIndex];

            var chromosome = ReferenceNameUtilities.GetChromosome(_refNameToChromosome, chromosomeName);
            CheckVcfOrder(chromosomeName);

            (int start, bool parseFailed) = fields[VcfCommon.PosIndex].OptimizedParseInt32();
            if (parseFailed)
            {
                throw new InvalidDataException($"Unable to convert the VCF position to an integer: {fields[VcfCommon.PosIndex]}");
            }

            if (InconsistentSampleFields(fields))
            {
                int sampleCount = _sampleNames?.Length ?? 0;
                throw new UserErrorException($"Inconsistent number of sample fields in line:\n{VcfLine}\nExpected number of sample fields: {sampleCount}");
            }

            parsedPosition = SimplePosition.GetSimplePosition(chromosome, start, fields, _vcfFilter);
        }

        IEnumerable<ISimplePosition> producedPositions = _recomposer.ProcessSimplePosition(parsedPosition);
        foreach (var produced in producedPositions)
        {
            _queuedPositions.Enqueue(produced);
        }

        if (VcfLine == null)
        {
            break;
        }
    }

    if (_queuedPositions.Count == 0)
    {
        return null;
    }

    return _queuedPositions.Dequeue();
}
/// <summary>
/// Reads the three-column chromosome header line and extracts the chromosome and the prediction count.
/// </summary>
/// <returns>The chromosome resolved from the reference index in column 1 and the count parsed from column 2.</returns>
/// <exception cref="InvalidDataException">
/// Thrown when no line is available or when the line does not have exactly 3 columns.
/// </exception>
private (IChromosome Chromosome, int NumPredictions) GetChromosomeHeader()
{
    string headerLine = _reader.ReadLine();
    var fields = headerLine?.OptimizedSplit('\t');

    if (fields == null)
    {
        throw new InvalidDataException("Found an unexpected null line when parsing the chromosome header in the prediction reader.");
    }

    if (fields.Length != 3)
    {
        throw new InvalidDataException($"Expected 3 columns in the chromosome header, but found {fields.Length}");
    }

    ushort refIndex = ushort.Parse(fields[1]);
    var chromosome = ReferenceNameUtilities.GetChromosome(_refIndexToChromosome, refIndex);
    int predictionCount = int.Parse(fields[2]);

    return (chromosome, predictionCount);
}
/// <summary>
/// Reads the next tab-delimited line from the reader and converts it into a regulatory region.
/// </summary>
/// <returns>The parsed regulatory region, or null when the reader has no more lines.</returns>
private IRegulatoryRegion GetNextRegulatoryRegion()
{
    string regionLine = _reader.ReadLine();
    if (regionLine == null)
    {
        return null;
    }

    var fields = regionLine.OptimizedSplit('\t');

    // columns used: 1 = reference index, 2 = start, 3 = end, 4 = ID, 6 = region type
    ushort refIndex = ushort.Parse(fields[1]);
    int start = int.Parse(fields[2]);
    int end = int.Parse(fields[3]);
    var regionId = CompactId.Convert(fields[4]);
    var regionType = (RegulatoryRegionType)byte.Parse(fields[6]);

    var chromosome = ReferenceNameUtilities.GetChromosome(_refIndexToChromosome, refIndex);

    return new RegulatoryRegion(chromosome, start, end, regionId, regionType);
}
/// <summary>
/// Parses a translocation alt allele with the supplied regex and returns the adjacency between the
/// variant's own breakpoint and its partner breakpoint.
/// </summary>
/// <param name="variant">The variant whose alt allele, chromosome, and start are used.</param>
/// <param name="regex">The pattern applied to the alt allele.</param>
/// <param name="onReverseStrand">Strand flag for the origin breakpoint.</param>
/// <param name="partnerBracketIndex">Index of the regex group holding the partner's bracket.</param>
/// <param name="refNameToChromosome">Lookup from reference name to chromosome.</param>
/// <returns>A single-element array containing the adjacency.</returns>
/// <exception cref="InvalidDataException">Thrown when the alt allele does not match the regex.</exception>
private static BreakEndAdjacency[] ConvertTranslocation(ISimpleVariant variant, Regex regex, bool onReverseStrand, int partnerBracketIndex, IDictionary<string, IChromosome> refNameToChromosome)
{
    var parsed = regex.Match(variant.AltAllele);
    if (!parsed.Success)
    {
        throw new InvalidDataException($"Unable to successfully parse the complex rearrangements for the following allele: {variant.AltAllele}");
    }

    var groups = parsed.Groups;

    bool partnerOnReverseStrand = groups[partnerBracketIndex].Value == ReverseBracket;
    var partnerPosition = Convert.ToInt32(groups[3].Value);
    string partnerReferenceName = groups[2].Value;
    var partnerChromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, partnerReferenceName);

    var originBreakPoint = new BreakPoint(variant.Chromosome, variant.Start, onReverseStrand);
    var partnerBreakPoint = new BreakPoint(partnerChromosome, partnerPosition, partnerOnReverseStrand);

    var adjacency = new BreakEndAdjacency(originBreakPoint, partnerBreakPoint);
    return new[] { adjacency };
}
/// <summary>
/// Builds a <see cref="SimplePosition"/> from already-split VCF fields, unless the filter reports that
/// the position is past the end.
/// </summary>
/// <param name="vcfFields">The columns of one VCF line.</param>
/// <param name="vcfFilter">Filter asked whether the chromosome/start pair has passed the end.</param>
/// <param name="refNameToChromosome">Lookup from reference name to chromosome.</param>
/// <param name="isRecomposed">Value stored in <c>IsRecomposed</c>.</param>
/// <returns>
/// The populated position, or null when <paramref name="vcfFilter"/> reports it has passed the end.
/// <c>End</c> is -1 when the ALT column starts with '&lt;' or equals "*".
/// </returns>
public static SimplePosition GetSimplePosition(string[] vcfFields, IVcfFilter vcfFilter, IDictionary<string, IChromosome> refNameToChromosome, bool isRecomposed = false)
{
    var chromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, vcfFields[VcfCommon.ChromIndex]);
    int start = int.Parse(vcfFields[VcfCommon.PosIndex]);
    string refAllele = vcfFields[VcfCommon.RefIndex];
    string altColumn = vcfFields[VcfCommon.AltIndex];

    var result = new SimplePosition(chromosome, start, refAllele, altColumn.OptimizedSplit(','));

    if (vcfFilter.PassedTheEnd(result.Chromosome, result.Start))
    {
        return null;
    }

    if (altColumn.OptimizedStartsWith('<') || altColumn == "*")
    {
        result.End = -1;
    }
    else
    {
        result.End = result.Start + result.RefAllele.Length - 1;
    }

    result.VcfFields = vcfFields;
    result.IsRecomposed = isRecomposed;

    // per-alt-allele arrays; the decomposed flags are false by default
    int altAlleleCount = result.AltAlleles.Length;
    result.IsDecomposed = new bool[altAlleleCount];
    result.Vids = new string[altAlleleCount];
    result.LinkedVids = new List<string>[altAlleleCount];

    return result;
}
private string GetNextChromDestinations(string line) { //extracting current chrom info from first line provided var currentChromName = line.Split('\t', 2)[VcfCommon.ChromIndex]; Console.Write($"Getting destinations for chromosome:{currentChromName}..."); var currentChrom = ReferenceNameUtilities.GetChromosome(_desSequenceProvider.RefNameToChromosome, currentChromName); _desSequenceProvider.LoadChromosome(currentChrom); do { var splits = line.Split('\t', VcfCommon.InfoIndex); var chrom = splits[VcfCommon.ChromIndex]; if (chrom != currentChromName) { break; } var refAllele = splits[VcfCommon.RefIndex]; var altAlleles = splits[VcfCommon.AltIndex].Split(','); var position = int.Parse(splits[VcfCommon.PosIndex]); var rsIds = Utilities.GetRsids(splits[VcfCommon.IdIndex]); if (rsIds == null) { continue; } var processedVariants = altAlleles.Select(x => VariantUtils.TrimAndLeftAlign(position, refAllele, x, _desSequenceProvider.Sequence)).ToArray(); foreach (var(start, variantRef, variantAlt) in processedVariants) { foreach (var rsId in rsIds) { if (!_destinationVariants.TryGetValue((rsId, variantRef.Length, variantAlt), out var variants)) { variants = new List <int>(); _destinationVariants[(rsId, variantRef.Length, variantAlt)] = variants;
/// <summary>
/// Reads a tabix index from a binary reader.
/// </summary>
/// <param name="reader">The reader positioned at the start of the index data.</param>
/// <param name="refNameToChromosome">Lookup from reference name to chromosome.</param>
/// <returns>The index, including a map from both the UCSC and Ensembl chromosome names to the tabix reference index.</returns>
/// <exception cref="InvalidDataException">Thrown when the leading magic number is not the tabix magic number.</exception>
// ReSharper disable once MemberCanBePrivate.Global
public static Index Read(BinaryReader reader, IDictionary<string, IChromosome> refNameToChromosome)
{
    if (reader.ReadInt32() != Constants.TabixMagic)
    {
        throw new InvalidDataException("This does not seem to be a tabix file. Did you use a GZipStream?");
    }

    // header fields, in the order they are stored
    int refSequenceCount = reader.ReadInt32();
    int format = reader.ReadInt32();

    // the three column indices are stored off by one relative to how they are kept here
    int nameColumnIndex = reader.ReadInt32() - 1;
    int beginColumnIndex = reader.ReadInt32() - 1;
    int endColumnIndex = reader.ReadInt32() - 1;

    var commentChar = (char)reader.ReadInt32();
    int numLinesToSkip = reader.ReadInt32();

    int namesLength = reader.ReadInt32();
    byte[] namesBuffer = reader.ReadBytes(namesLength);
    string[] refSequenceNames = GetReferenceSequenceNames(namesBuffer, refSequenceCount);

    var referenceSequences = new ReferenceIndex[refSequenceCount];
    var refNameToTabixIndex = new Dictionary<string, ushort>(refSequenceCount);

    for (ushort tabixIndex = 0; tabixIndex < refSequenceCount; tabixIndex++)
    {
        var chromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, refSequenceNames[tabixIndex]);
        referenceSequences[tabixIndex] = ReadReferenceSequence(reader, chromosome);

        // register both naming styles so either can be used for lookups
        refNameToTabixIndex[chromosome.UcscName] = tabixIndex;
        refNameToTabixIndex[chromosome.EnsemblName] = tabixIndex;
    }

    return new Index(format, nameColumnIndex, beginColumnIndex, endColumnIndex, commentChar, numLinesToSkip,
        referenceSequences, refNameToTabixIndex);
}
/// <summary>
/// Parses one tab-delimited line of the STR data file into a repeat expansion phenotype interval.
/// </summary>
/// <param name="line">The line to parse.</param>
/// <param name="refNameToChromosome">Lookup from reference name to chromosome.</param>
/// <returns>The chromosome index and an interval spanning start..end that carries the phenotype.</returns>
/// <exception cref="InvalidDataException">
/// Thrown when the line has too few columns, when the repeat numbers and allele counts differ in length,
/// or when the classifications and classification ranges differ in length.
/// </exception>
private static (ushort RefIndex, Interval<RepeatExpansionPhenotype> Interval) GetPhenotype(string line, IDictionary<string, IChromosome> refNameToChromosome)
{
    string[] fields = line.OptimizedSplit('\t');
    if (fields.Length < MinNumberOfColumns)
    {
        throw new InvalidDataException($"Expected at least {MinNumberOfColumns} columns in the STR data file, but found only {fields.Length}.");
    }

    // scalar columns
    string chromosomeName = fields[ChromIndex];
    int start = int.Parse(fields[StartIndex]);
    int end = int.Parse(fields[EndIndex]);
    string phenotype = fields[PhenotypeIndex];
    string omimId = fields[OmimIndex];

    // comma-separated list columns
    int[] repeatNumbers = fields[RepeatNumbersIndex].Split(',').Select(int.Parse).ToArray();
    int[] alleleCounts = fields[AlleleCountsIndex].Split(',').Select(int.Parse).ToArray();
    string[] classifications = fields[CategoriesIndex].Split(',').ToArray();
    Interval[] classificationRanges = fields[CategoryRangesIndex].Split(',').Select(GetInterval).ToArray();

    if (repeatNumbers.Length != alleleCounts.Length)
    {
        throw new InvalidDataException($"Inconsistent number of repeat numbers ({repeatNumbers.Length}) vs. allele counts ({alleleCounts.Length})");
    }

    if (classifications.Length != classificationRanges.Length)
    {
        throw new InvalidDataException($"Inconsistent number of values of classifications ({classifications.Length}) vs. classification ranges ({classificationRanges.Length})");
    }

    var chromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, chromosomeName);
    var chromosomeInterval = new ChromosomeInterval(chromosome, start, end);
    double[] percentiles = PercentileUtilities.ComputePercentiles(repeatNumbers.Length, alleleCounts);

    var repeatExpansionPhenotype = new RepeatExpansionPhenotype(chromosomeInterval, phenotype, omimId,
        repeatNumbers, percentiles, classifications, classificationRanges);

    var phenotypeInterval = new Interval<RepeatExpansionPhenotype>(start, end, repeatExpansionPhenotype);
    return (chromosome.Index, phenotypeInterval);
}
// The reference name "1" is expected to resolve to the chromosome at index 0.
public void GetChromosome_RefName_Exists()
{
    var observed = ReferenceNameUtilities.GetChromosome(_refNameToChromosome, "1");

    Assert.Equal(0, observed.Index);
}
/// <summary>
/// Processes all consecutive source lines belonging to the chromosome of the supplied first line:
/// variants found among the destination variants are written as remapped entries, the rest are written
/// to the leftover writer.
/// </summary>
/// <param name="line">The first line of the chromosome to process.</param>
/// <returns>The first line of a different chromosome, or null when the source reader is exhausted.</returns>
private string ProcessNextChromSource(string line)
{
    // the first line supplied determines which chromosome this call handles
    var chromosomeName = line.Split('\t', 2)[VcfCommon.ChromIndex];
    var chromosome = ReferenceNameUtilities.GetChromosome(_srcSequenceProvider.RefNameToChromosome, chromosomeName);
    _srcSequenceProvider.LoadChromosome(chromosome);

    var unmappedLineCount = 0;

    do
    {
        var fields = line.Split('\t', VcfCommon.InfoIndex);
        var lineChromosomeName = fields[VcfCommon.ChromIndex];
        if (lineChromosomeName != chromosomeName)
        {
            break;
        }

        var refAllele = fields[VcfCommon.RefIndex];
        var altAlleles = fields[VcfCommon.AltIndex].Split(',');
        var position = int.Parse(fields[VcfCommon.PosIndex]);
        var rsIds = Utilities.GetRsids(fields[VcfCommon.IdIndex]);

        if (rsIds == null)
        {
            // no rsIDs on this line; move on to the next one
            continue;
        }

        var normalizedVariants = altAlleles
            .Select(alt => VariantUtils.TrimAndLeftAlign(position, refAllele, alt, _srcSequenceProvider.Sequence))
            .ToArray();

        var wasRemapped = false;
        foreach (var (_, variantRef, variantAlt) in normalizedVariants)
        {
            foreach (var rsId in rsIds)
            {
                if (_destinationVariants.TryGetValue((rsId, variantRef.Length, variantAlt), out var targetPositions))
                {
                    targetPositions.ForEach(target => WriteRemappedEntry(lineChromosomeName, target, variantRef, variantAlt, line));
                    wasRemapped = true;
                }
            }
        }

        if (!wasRemapped)
        {
            // nothing matched in the destination: record every (rsId, alt) pair as a leftover
            foreach (var (_, _, variantAlt) in normalizedVariants)
            {
                foreach (var rsId in rsIds)
                {
                    _leftoverWriter.WriteLine(string.Join('#', rsId.ToString(), variantAlt, line));
                }
            }

            unmappedLineCount++;
        }
    } while ((line = _srcReader.ReadLine()) != null);

    Console.WriteLine($"Leftover count for {chromosomeName}: {unmappedLineCount}");
    _leftoverCount += unmappedLineCount;

    return line;
}
public int Map() { // write out the relocated locations of the leftover rsIds whenever possible //reading in the leftover ids var leftoverIds = new HashSet <(long, string)>(); Console.Write("Loading leftover ids..."); string line; while ((line = _leftoverReader.ReadLine()) != null) { var splits = line.Split('#', 3); var id = long.Parse(splits[0]); var alt = splits[1]; leftoverIds.Add((id, alt)); } Console.WriteLine($"{leftoverIds.Count} found."); // stream through the dest file to find locations var leftoversWithDest = new Dictionary <(long, string), List <GenomicLocation> >(); var currentChromName = ""; while ((line = _destReader.ReadLine()) != null) { if (line.OptimizedStartsWith('#')) { continue; } var splits = line.Split('\t', VcfCommon.InfoIndex); var chromName = splits[VcfCommon.ChromIndex]; if (chromName != currentChromName) { currentChromName = chromName; Console.WriteLine($"Getting destinations for chromosome:{currentChromName}..."); var currentChrom = ReferenceNameUtilities.GetChromosome(_desSequenceProvider.RefNameToChromosome, currentChromName); _desSequenceProvider.LoadChromosome(currentChrom); } var refAllele = splits[VcfCommon.RefIndex]; var altAlleles = splits[VcfCommon.AltIndex].Split(','); var position = int.Parse(splits[VcfCommon.PosIndex]); var rsIds = Utilities.GetRsids(splits[VcfCommon.IdIndex]); if (rsIds == null) { continue; } var processedVariants = altAlleles.Select(x => VariantUtils.TrimAndLeftAlign(position, refAllele, x, _desSequenceProvider.Sequence)).ToArray(); foreach (var(_, _, variantAlt) in processedVariants) { foreach (var rsId in rsIds) { if (!leftoverIds.Contains((rsId, variantAlt))) { continue; } var pos = int.Parse(splits[VcfCommon.PosIndex]); if (!leftoversWithDest.TryGetValue((rsId, variantAlt), out var locations)) { locations = new List <GenomicLocation>(); leftoversWithDest[(rsId, variantAlt)] = locations;
// Reference index 2 is expected to resolve to the chromosome whose Ensembl name is "3".
public void GetChromosome_RefIndex_Exists()
{
    var observed = ReferenceNameUtilities.GetChromosome(_refIndexToChromosome, 2);

    Assert.Equal("3", observed.EnsemblName);
}