/// <summary>
/// Orients the assembly path against the reference and, if the reference positions change
/// monotonically along it, records the aligned reference span, the gaps between consecutive
/// alignments (reported as deletions) and the k-mer support at a simple two-way bifurcation.
/// </summary>
/// <remarks>
/// Writes <c>ReferenceValuesChangeMonotonically</c>, <c>StartReference</c>, <c>EndReference</c>,
/// <c>HasDeletion</c>, <c>DeletedRegions</c>, <c>SizeOfDeletionsSeen</c>, <c>SimpleBifurcation</c>
/// and <c>AvgFractionBySplit</c>; may reverse and finalize <c>Assembly</c>.
/// </remarks>
private void LookForDeletion()
{
    var v = Assembly.Where(x => x.IsInReference).ToList();

    // Number of adjacent node pairs whose reference position increases along the path.
    var difs = Enumerable.Zip(
        v.Skip(1),
        v.Take(v.Count - 1),
        (x, y) => x.ReferenceGenomePosition > y.ReferenceGenomePosition ? 1 : 0).Sum();

    // If monotonically changing, should only not change once (when it goes around the circle).
    if (difs < 2 || difs > (v.Count - 3))
    {
        ReferenceValuesChangeMonotonically = true;
    }

    // Now which way is it increasing, up (big sum) or down (small sum)?
    bool movingUp = difs > (v.Count / 2);
    if (!movingUp)
    {
        Assembly.ReversePath();
    }

    // Only report for sensible assemblies.
    if (!ReferenceValuesChangeMonotonically)
    {
        return;
    }

    Assembly.FinalizeAndOrientToReference();

    // Get alignments of the assembled sequence against the reference.
    var alns = HaploGrepSharp.ReferenceGenome.GetDeltaAlignments(Assembly.Sequence).SelectMany(x => x).ToList();
    if (alns.Count > 0)
    {
        StartReference = (int)alns.First().FirstSequenceStart;
        EndReference = (int)alns.Last().FirstSequenceEnd;
    }

    SizeOfDeletionsSeen = String.Empty;
    if (alns.Count > 1)
    {
        // More than one alignment: each gap between consecutive alignments is reported as a deletion.
        HasDeletion = true;
        StringBuilder sb = new StringBuilder();
        List<int> deletionSizes = new List<int>();
        for (int i = 0; i < (alns.Count - 1); i++)
        {
            var s = ReferenceGenome.ConvertTorCRSPosition((int)alns[i].FirstSequenceEnd);
            var e = ReferenceGenome.ConvertTorCRSPosition((int)alns[i + 1].FirstSequenceStart);
            // Format is "start-end;" for every gap, including a trailing ';'.
            sb.Append(s.ToString());
            sb.Append("-");
            sb.Append(e.ToString());
            sb.Append(";");
            deletionSizes.Add(e - s + 1);
        }
        DeletedRegions = sb.ToString();
        SizeOfDeletionsSeen = String.Join(";", deletionSizes.Select(x => x.ToString()));
    }

    // Now see if we can get the fractional evidence for this.
    // Note this code can get very nasty if we have multiple branching nodes; only the case
    // of exactly two two-way branches along the path is reported.
    var totBifurcations = 0;
    double avg = 0.0;
    int nodeCount = Assembly.Count;
    for (int i = 0; i < (nodeCount - 1); i++)
    {
        var cnode = Assembly[i];
        var lefts = cnode.GetLeftExtensionNodes().ToList();
        var rights = cnode.GetRightExtensionNodes().ToList();
        List<List<DeBruijnNode>> neighbors = new List<List<DeBruijnNode>>() { lefts, rights };
        foreach (var neighbor in neighbors)
        {
            if (neighbor.Count == 2)
            {
                // The first node has no element at i - 1; indexing -1 used to throw here.
                // Wrap to the last node instead (the path is expected to go around the circle);
                // it is only picked if it really is one of the two branch nodes.
                var previous = Assembly[i == 0 ? nodeCount - 1 : i - 1];
                var next = Assembly[i + 1];

                var tot = neighbor.Sum(z => (double)z.KmerCount);
                // First() still throws if neither path neighbour is part of this branch.
                var cur = (double)neighbor.Where(z => z == previous || z == next).First().KmerCount;
                avg += cur / tot;
                totBifurcations++;
            }
            else if (neighbor.Count > 2)
            {
                totBifurcations = 100; // arbitrarily set to too high a value
                break; // leaves only the left/right loop for this node
            }
        }
    }

    if (totBifurcations == 2)
    {
        SimpleBifurcation = true;
        avg *= .5; // .5 * (a + b) = Average
    }
    AvgFractionBySplit = SimpleBifurcation ? avg : Double.NaN;
}
/// <summary>
/// Builds a greedy path through the meta-node graph, starting from the node with the largest
/// (average k-mer coverage * constituent node count) among nodes whose lowest reference
/// position is non-zero, and records whether the path closes into a loop and whether the
/// resulting sequence is long enough to count as a successful assembly.
/// </summary>
private void attemptToCreateAssembly()
{
    //TODO: This node should always be a good start node, but may be an erroneous one, check for this.
    var node = gg.MetaNodes
                 .Where(x => x.Lowest_Reference_Position != 0)
                 .MaxBy(x => (x.AvgKmerCoverage * x.ConstituentNodes.Count));//*(.2/x.Lowest_Reference_Position));//.MinBy(x => x.Lowest_Reference_Position);

    //Let's try just going with the forward primer
    //var match = forwardPrimer.Substring(0, gg.MegaNodes.First().LeadingKmer.Length);
    //var rc_match = ((new Bio.Sequence(Bio.Alphabets.NoGapDNA, match)).GetReverseComplementedSequence() as Bio.Sequence).ConvertToString();
    //var curNode = gg.MegaNodes.Where(x => x.Sequence.Contains(match) || x.Sequence.Contains(rc_match)).First();

    _greedyPathAssembly = new PossibleAssembly();

    if (node.CircularLoop)
    {
        // The start node is already a closed loop on its own: it is the whole path.
        FormsCompleteLoop = true;
        assemblyNodes.Add(node);
        _greedyPathAssembly.AddMetaNode(node);
        MinimumGreedySplit = 1.0;
    }
    else
    {
        MitoPaintedAssembler.RaiseStatusEvent("\tAttempting to find greedy path, frequencies of majority split below");

        // Walk forward, always following the best outgoing path, until we either
        // come back to a node already on the path or run out of outgoing nodes.
        for (;;)
        {
            assemblyNodes.Add(node);
            _greedyPathAssembly.AddMetaNode(node);

            var outgoing = node.GetOutgoingNodes().ToList();
            if (outgoing.Count == 0)
            {
                // Dead end: the path cannot close.
                FormsCompleteLoop = false;
                SuccessfulAssembly = false;
                break;
            }

            var split = new SplitData(outgoing);
            PathSplits.Add(split);

            if (outgoing.Count > 1)
            {
                // Track the weakest majority frequency seen at any real split.
                if (split.MaxFrequency < MinimumGreedySplit)
                {
                    MinimumGreedySplit = split.MaxFrequency;
                }

                string range = node.Lowest_Reference_Position.ToString() + "-" + node.Highest_Reference_Position.ToString();
                MitoPaintedAssembler.RaiseStatusEvent(
                    "\tPossible Paths: " + outgoing.Count +
                    " Frequency: " + split.MaxFrequency.ToString("P1") +
                    " Range: " + range);
            }

            node = split.BestPath.NeighborNode;
            if (assemblyNodes.Contains(node))
            {
                FormsCompleteLoop = true;
                break;
            }
        }
    }

    int length = assemblyNodes.Sum(x => x.LengthOfNode);

    // Now, did we form an assembly? Either the path closed, or its summed node
    // length is within 100 of the current AssemblyLength value.
    if (!FormsCompleteLoop && Math.Abs(length - AssemblyLength) >= 100)
    {
        return;
    }

    SuccessfulAssembly = true;
    _greedyPathAssembly.FinalizeAndOrientToReference();
    AssemblyLength = (int)_greedyPathAssembly.Sequence.Count;

    //TODO: More sophisticated criteria than larger than 8 kb to validate assembly
    bool longEnough = AssemblyLength > StaticResources.SIZE_DIF_BETWEEN_LARGE_AND_SMALL_DELETION;
    SuccessfulAssembly = longEnough;
    string report = longEnough
        ? "\tSuccessful assembly of length: " + AssemblyLength.ToString()
        : "\tAssembly failed. Only recovered sequence of length: " + AssemblyLength.ToString();
    MitoPaintedAssembler.RaiseStatusEvent(report);
}