/// <summary>
/// Advances the population by one generation: recomputes fitness, builds the next list
/// of genomes (champion clones, further clones of the first genomes, then crossovers),
/// mutates every non-champion child, and rebuilds the species and networks.
/// </summary>
public void NextGeneration()
{
    CalculateFitness();
    generation++;
    survivors.Clear();

    // Group sizes are fractions of the population size, truncated toward zero.
    int championCount = (int)(GenomeUtils.POP_SIZE * GenomeUtils.CHAMPION_RATE);
    int asexualCount = (int)(GenomeUtils.POP_SIZE * GenomeUtils.ASEXUAL_RATE);

    List<Genome> offspring = new List<Genome>();

    // Champions: clones of the first genomes, carried over and also recorded as survivors.
    // NOTE(review): presumably genomes is ordered best-first by CalculateFitness - confirm.
    for (int c = 0; c < championCount; c++)
    {
        Genome champion = GenomeUtils.Clone(genomes[c]);
        offspring.Add(champion);
        survivors.Add(champion);
    }

    // Asexual offspring: clones of the first genomes again; these are eligible for mutation below.
    for (int a = 0; a < asexualCount; a++)
    {
        offspring.Add(GenomeUtils.Clone(genomes[a]));
    }

    // Fill the rest of the population with crossovers of two random members of a random species.
    while (offspring.Count < GenomeUtils.POP_SIZE)
    {
        Species chosen = species[Random.Range(0, species.Count)];
        Genome mother = chosen.GetRandomGenome();
        Genome father = chosen.GetRandomGenome();
        offspring.Add(GenomeUtils.Crossover(mother, father));
    }

    // Mutate everything after the champion block; each mutation kind is rolled independently.
    for (int k = championCount; k < offspring.Count; k++)
    {
        Genome child = offspring[k];
        if (Random.value < GenomeUtils.MUTATION_RATE)
        {
            child.Mutate();
        }
        if (Random.value < GenomeUtils.ADD_CONNECTION_RATE)
        {
            child.AddConnectionMutation();
        }
        if (Random.value < GenomeUtils.ADD_NODE_RATE)
        {
            child.AddNodeMutation();
        }
    }

    genomes = offspring;
    SetSpecies();
    MakeNNets();
}
/// <summary>
/// Rebuilds the species list from scratch: each genome joins the first existing species
/// whose mascot is closer than GenomeUtils.SPECIES_DIST, otherwise it founds a new species.
/// Every genome is also recorded in speciesMap.
/// </summary>
void SetSpecies()
{
    species.Clear();
    speciesMap.Clear();

    foreach (Genome genome in genomes)
    {
        Species home = null;

        // First compatible species wins; later species are not examined.
        foreach (Species candidate in species)
        {
            float distance = GenomeUtils.CompatibilityDistance(genome, candidate.mascot, C1, C2, C3);
            if (distance < GenomeUtils.SPECIES_DIST)
            {
                candidate.AddMember(genome);
                home = candidate;
                break;
            }
        }

        if (home == null)
        {
            home = new Species(genome);
            species.Add(home);
        }

        speciesMap.Add(genome, home);
    }

    Debug.Log("Gen: " + generation + ", Population: " + population + ", Species: " + species.Count);
}
/// <summary>
/// Chooses the movement direction and strength for this frame. Outside AI mode it steers
/// toward the mouse cursor; in AI mode it uses the brain's outputs and may trigger a split.
/// </summary>
void GetTargetDestination()
{
    if (mode != Mode.AI)
    {
        // Manual control: head toward the cursor, with strength capped at 1.
        Vector3 cursor = Camera.main.ScreenToWorldPoint(Input.mousePosition);
        float dx = cursor.x - player.x;
        float dy = cursor.y - player.y;
        float heading = Mathf.Atan2(dy, dx);
        float strength = Mathf.Sqrt(dx * dx + dy * dy);
        strength = Mathf.Clamp(strength, 0f, 1f);
        SetAcceleration(heading, strength);
        return;
    }

    float[] output = brain.GetOutput();

    // output[0] is scaled by 2*PI to obtain an angle; output[1] is squashed by a sigmoid.
    float angle = (output[0]) * 2 * Mathf.PI;
    float speed = GenomeUtils.Sigmoid(output[1]);
    //float speed = 1f;
    SetAcceleration(angle, speed);

    // The third output is optional; when present it triggers a split at 0.8 or above.
    if (output.Length > 2 && GenomeUtils.Sigmoid(output[2]) >= 0.8f)
    {
        Split();
    }
}
/// <summary>
/// Sorts the groups in place via GenomeUtils.SortChromosome, using capture group 1 of the
/// TRNA pattern as the chromosome key and capture group 2 (parsed as an integer) as the
/// position key. Does nothing unless every group name matches the TRNA pattern.
/// </summary>
/// <param name="items">Groups to sort in place.</param>
public static void SortTRna(this List<FeatureItemGroup> items)
{
    // Bail out as soon as one name does not match; a partial sort is never attempted.
    if (items.Any(group => !TRNA.Match(group.Name).Success))
    {
        return;
    }

    GenomeUtils.SortChromosome(
        items,
        group => TRNA.Match(group.Name).Groups[1].Value,
        group => int.Parse(TRNA.Match(group.Name).Groups[2].Value));
}
/// <summary>
/// Reads cn.MOPS items, merges them into ranges per sequence name, and writes two
/// tab-separated files: one line per (range, item) to the output file, and one line per
/// range with one copy-number column per input file name to "&lt;output&gt;.cnvr".
/// </summary>
/// <returns>The two file names written.</returns>
public override IEnumerable<string> Process()
{
    var data = new CnMOPsItemReader().ReadFromFile(options.InputFile);
    Dictionary<string, List<ItemRange>> merged = MergeRange(data);

    var seqnames = merged.Keys.OrderBy(m => m).ToList();
    GenomeUtils.SortChromosome(seqnames, l => l, l => 1);

    bool ignoreCn1Cn3 = options.IgnoreCN1CN3;

    // Detail file: one row per item inside each merged range.
    using (var detailWriter = new StreamWriter(options.OutputFile))
    {
        detailWriter.WriteLine("seqname\tstart\tend\tlocus\tsample\tsample_start\tsample_end\tsample_type");
        foreach (var seqname in seqnames)
        {
            foreach (var range in merged[seqname])
            {
                foreach (var cn in range.Items)
                {
                    bool isCn1OrCn3 = cn.CN.Equals("CN1") || cn.CN.Equals("CN3");
                    if (ignoreCn1Cn3 && isCn1OrCn3)
                    {
                        continue;
                    }

                    detailWriter.WriteLine("{0}\t{1}\t{2}\t{0}:{1}-{2}\t{3}\t{4}\t{5}\t{6}", seqname, range.Start, range.End, cn.FileName, cn.Start, cn.End, cn.CN);
                }
            }
        }
    }

    var filenames = data.Select(d => d.FileName).Distinct().OrderBy(l => l).ToArray();
    var cnvrFile = options.OutputFile + ".cnvr";

    // Matrix file: one row per range; files without an item in the range report "CN2".
    using (var matrixWriter = new StreamWriter(cnvrFile))
    {
        matrixWriter.WriteLine("seqname\tstart\tend\tfile\t{0}", filenames.Merge("\t"));
        foreach (var seqname in seqnames)
        {
            foreach (var range in merged[seqname])
            {
                var cns = filenames.Select(filename =>
                {
                    var hit = range.Items.FirstOrDefault(l => l.FileName.Equals(filename));
                    return hit == null ? "CN2" : hit.CN;
                }).ToArray();

                // With the ignore flag set, rows made only of CN1/CN2/CN3 are dropped.
                if (ignoreCn1Cn3 && cns.All(l => l.Equals("CN1") || l.Equals("CN2") || l.Equals("CN3")))
                {
                    continue;
                }

                matrixWriter.WriteLine("{0}\t{1}\t{2}\t{0}_{1}_{2}\t{3}", seqname, range.Start, range.End, cns.Merge("\t"));
            }
        }
    }

    return new[] { options.OutputFile, cnvrFile };
}
/// <summary>
/// Adds a random connection between two nodes.
/// Two nodes are drawn at random; pairs of the same type are rejected unless both are
/// hidden. The connection is oriented so it never runs from a hidden or output node into
/// an input node, nor from an output node into a hidden node. If the two nodes are already
/// connected (in either direction), nothing is added.
/// </summary>
/// <remarks>
/// Rejected pairs are redrawn in a bounded loop. The previous implementation retried by
/// unbounded recursion, which overflows the stack when no valid pair exists (for example
/// a genome holding only input nodes).
/// </remarks>
public void AddConnectionMutation()
{
    // Upper bound on redraws; after this many rejected pairs the mutation is skipped.
    const int MaxAttempts = 100;

    List<NodeGene> values = Enumerable.ToList(nodes.Values);
    if (values.Count == 0)
    {
        // Nothing to connect; indexing an empty list would throw.
        return;
    }

    for (int attempt = 0; attempt < MaxAttempts; attempt++)
    {
        NodeGene node1 = values[Random.Range(0, values.Count)];
        NodeGene node2 = values[Random.Range(0, values.Count)];

        // Input-input and output-output pairs are invalid: draw again.
        if (node1.type == node2.type && node1.type != NodeGene.Type.Hidden)
        {
            continue;
        }

        // Flip the direction when node1 sits "after" node2 (input -> hidden -> output).
        bool reversed =
            (node1.type == NodeGene.Type.Hidden && node2.type == NodeGene.Type.Input) ||
            (node1.type == NodeGene.Type.Output && node2.type == NodeGene.Type.Input) ||
            (node1.type == NodeGene.Type.Output && node2.type == NodeGene.Type.Hidden);

        float weight = GenomeUtils.RandomWeight();

        // An existing link in either direction blocks the mutation (no redraw).
        foreach (ConnectionGene con in connections.Values)
        {
            bool sameDirection = con.inNode == node1.id && con.outNode == node2.id;
            bool oppositeDirection = con.inNode == node2.id && con.outNode == node1.id;
            if (sameDirection || oppositeDirection)
            {
                return;
            }
        }

        ConnectionGene newCon = new ConnectionGene(
            reversed ? node2.id : node1.id,
            reversed ? node1.id : node2.id,
            weight,
            true,
            Counter.NextConnection());
        connections.Add(newCon.innovation, newCon);
        return;
    }
}
/// <summary>
/// Mutates the weight of every connection: with probability GenomeUtils.PERTURB_RATE the
/// weight is shifted by a random offset drawn between WEIGHT_MIN / 2 and WEIGHT_MAX / 2,
/// otherwise it is replaced by a fresh random weight.
/// </summary>
public void Mutate()
{
    foreach (ConnectionGene con in connections.Values)
    {
        if (Random.value < GenomeUtils.PERTURB_RATE) //TODO CHANGE TO NORMAL DISTRIBUTION
        {
            // Perturb in place. The result is intentionally not clamped to
            // [WEIGHT_MIN, WEIGHT_MAX]; the clamp was disabled in the original code.
            con.weight += Random.Range(GenomeUtils.WEIGHT_MIN / 2, GenomeUtils.WEIGHT_MAX / 2);
        }
        else
        {
            con.weight = GenomeUtils.RandomWeight();
        }
    }
}
/// <summary>
/// Reads the sequences from the input file, orders them by chromosome, and writes them to
/// the output file in FASTA format. Sequences whose name has no "_suffix" part sort before
/// those that have one; ties on chromosome between suffixed names are broken by the suffix.
/// </summary>
/// <returns>The output file name.</returns>
public override IEnumerable<string> Process()
{
    Progress.SetMessage("Reading sequences from: " + _options.InputFile + "...");
    var seqs = SequenceUtils.Read(_options.InputFile);

    seqs.Sort((left, right) =>
    {
        // Name layout handled here: "chr<id>" optionally followed by "_<suffix>".
        var leftChr = left.Name.StringBefore("_").StringAfter("chr");
        var leftSuffix = left.Name.Contains("_") ? left.Name.StringAfter("_") : string.Empty;
        var rightChr = right.Name.StringBefore("_").StringAfter("chr");
        var rightSuffix = right.Name.Contains("_") ? right.Name.StringAfter("_") : string.Empty;

        bool leftPlain = string.IsNullOrWhiteSpace(leftSuffix);
        bool rightPlain = string.IsNullOrWhiteSpace(rightSuffix);

        // Exactly one side is plain: the plain one goes first.
        if (leftPlain != rightPlain)
        {
            return leftPlain ? -1 : 1;
        }

        var order = GenomeUtils.CompareChromosome(leftChr, rightChr);
        if (order == 0 && !leftPlain)
        {
            order = leftSuffix.CompareTo(rightSuffix);
        }
        return order;
    });

    Progress.SetMessage("Writing sequences to: " + _options.OutputFile + "...");
    SequenceUtils.Write(new FastaFormat(), _options.OutputFile, seqs);
    Progress.SetMessage("Finished.");

    return new[] { _options.OutputFile };
}
/// <summary>
/// Assigns every genome to a species. A genome joins the first species whose mascot is
/// closer than compatiblityThreshold; otherwise a new species is created for it.
/// Afterwards species without members are removed and the remaining ones pick a new
/// random mascot.
/// </summary>
private void AssignSpecies()
{
    speciesMap = new Dictionary<Genome, Species>();

    foreach (Genome gen in genomes)
    {
        Species home = null;

        foreach (Species candidate in speciesList)
        {
            float distance = GenomeUtils.CompatiblityDistance(gen, candidate.GetMascot(), C1, C2, C3);
            if (distance < compatiblityThreshold)
            {
                candidate.AddMember(gen);
                home = candidate;
                break;
            }
        }

        if (home == null)
        {
            home = new Species(gen);
            speciesList.Add(home);
        }

        speciesMap.Add(gen, home);
    }

    // Walk backwards so RemoveAt does not shift the entries still to be visited.
    System.Random r = new System.Random();
    for (int index = speciesList.Count - 1; index >= 0; index--)
    {
        Species current = speciesList[index];
        if (current.GetCount() == 0)
        {
            speciesList.RemoveAt(index);
        }
        else
        {
            current.RandomizeMascot(r);
        }
    }
    //Debug.Log("Gen: " + generation + ", Population: " + population + ", Species: " + speciesList.Count);
}
/// <summary>
/// Groups items into ranges per sequence name. Items are first sorted in place by
/// sequence name and start; each item is then added to the first existing range of its
/// sequence that it overlaps (widening that range), or starts a new range.
/// </summary>
/// <param name="data">Items to merge; this list is reordered by the call.</param>
/// <returns>Ranges keyed by sequence name.</returns>
public static Dictionary<string, List<ItemRange>> MergeRange(List<CnMOPsItem> data)
{
    GenomeUtils.SortChromosome(data, m => m.Seqname, m => m.Start);

    var merged = new Dictionary<string, List<ItemRange>>();
    foreach (var item in data)
    {
        List<ItemRange> ranges;
        if (!merged.TryGetValue(item.Seqname, out ranges))
        {
            ranges = new List<ItemRange>();
            merged[item.Seqname] = ranges;
        }

        var target = ranges.FirstOrDefault(r => r.Overlap(item, 0));
        if (target != null)
        {
            // Absorb the item and stretch the range to cover it.
            target.Items.Add(item);
            target.Start = Math.Min(target.Start, item.Start);
            target.End = Math.Max(target.End, item.End);
            continue;
        }

        var fresh = new ItemRange
        {
            Seqname = item.Seqname,
            Start = item.Start,
            End = item.End
        };
        fresh.Items.Add(item);
        ranges.Add(fresh);
    }

    return merged;
}
/// <summary>
/// Recomputes this node's value from its incoming connections and resets each of them.
/// For the Multiply activation the weighted inputs are multiplied together (starting
/// from 1); for every other activation they are summed (starting from 0). The combined
/// value is then passed through GenomeUtils.Activate.
/// </summary>
public void ReceiveValue()
{
    bool multiply = activation == GenomeUtils.Activation.Multiply;

    // Start from the neutral element of the combining operation.
    value = multiply ? 1 : 0;

    foreach (Connection incoming in inConnections)
    {
        if (multiply)
        {
            value *= incoming.value * incoming.weight;
        }
        else
        {
            value += incoming.value * incoming.weight;
        }
        incoming.Reset();
    }

    value = GenomeUtils.Activate(value, activation);
}
/// <summary>
/// Builds a minimal genome: <paramref name="inputNodes"/> input nodes and
/// <paramref name="outputNodes"/> output nodes, with every input connected to every
/// output by an enabled connection carrying a random weight.
/// </summary>
/// <remarks>
/// Counter.Reset() is called first, so node and connection numbering restarts on every
/// construction. Connections are wired using the ids actually handed out by
/// Counter.NextNode(). The previous code assumed those ids were exactly 1..N and would
/// reference non-existent nodes if the counter started anywhere else.
/// </remarks>
/// <param name="inputNodes">Number of input nodes to create.</param>
/// <param name="outputNodes">Number of output nodes to create.</param>
public Genome(int inputNodes, int outputNodes)
{
    Counter.Reset();
    nodes = new Dictionary<int, NodeGene>();
    connections = new Dictionary<int, ConnectionGene>();

    // Remember the ids given to the input nodes so the connections can refer to them.
    List<int> inputIds = new List<int>();
    for (int i = 0; i < inputNodes; i++)
    {
        int inputId = Counter.NextNode();
        inputIds.Add(inputId);
        AddNodeGene(new NodeGene(NodeGene.Type.Input, inputId, GenomeUtils.Activation.None));
    }

    for (int o = 0; o < outputNodes; o++)
    {
        int outputId = Counter.NextNode();
        AddNodeGene(new NodeGene(NodeGene.Type.Output, outputId, GenomeUtils.Activation.None));

        // Fully connect: one link from each input to this output.
        foreach (int inputId in inputIds)
        {
            float weight = GenomeUtils.RandomWeight();
            AddConnectionGene(new ConnectionGene(inputId, outputId, weight, true, Counter.NextConnection()));
        }
    }
}
/// <summary>
/// Converts a PLINK bed/bim data set into one .gen file per chromosome (each with a
/// matching .sample file). Along the way it writes "&lt;prefix&gt;.namemap" (new name to
/// old name) and "&lt;prefix&gt;.stat" (per-SNP allele information and suggested action).
/// </summary>
/// <returns>The .stat file followed by every .gen file written.</returns>
public override IEnumerable<string> Process()
{
    var outputFiles = new List<string>();

    // Load the loci from the .bim file and drop those flagged by IsIndel or IsMissing.
    var bimPath = Path.ChangeExtension(options.InputFile, ".bim");
    var loci = PlinkLocus.ReadFromBimFile(bimPath, false, false);
    loci.RemoveAll(m => IsIndel(m) || IsMissing(m));

    var snpItems = loci.Select(locus => new SNPItem()
    {
        Chrom = locus.Chromosome,
        Name = locus.MarkerId,
        Position = locus.PhysicalPosition,
        Allele1 = locus.Allele1[0],
        Allele2 = locus.Allele2
    }).ToList();

    var nameMap = snpItems.FillDbsnpIdByPosition(options.DbsnpFile, this.Progress);
    using (var mapWriter = new StreamWriter(options.OutputPrefix + ".namemap"))
    {
        mapWriter.WriteLine("NewName\tOldName");
        foreach (var entry in nameMap)
        {
            mapWriter.WriteLine("{0}\t{1}", entry.Key, entry.Value);
        }
    }

    //remove all snps without corresponding dbsnp entry
    snpItems.RemoveAll(m => m.DbsnpRefAllele == ' ');

    // Report names shared by more than one SNP on stderr; processing continues regardless.
    var byName = snpItems.ToGroupDictionary(m => m.Name);
    foreach (var group in byName)
    {
        if (group.Value.Count <= 1)
        {
            continue;
        }

        Console.Error.WriteLine("Duplicated SNP:" + group.Key);
        foreach (var dup in group.Value)
        {
            Console.Error.WriteLine("{0}:{1}-{2}:{3},{4}:{5},{6}", group.Key, dup.Chrom, dup.Position, dup.Allele1, dup.Allele2, dup.DbsnpRefAllele, dup.DbsnpAltAllele);
        }
    }

    // Optional annotation sources; each is used only when its file exists.
    if (File.Exists(options.G1000File))
    {
        snpItems.FindAllele2FrequencyFrom1000GomeByName(options.G1000File, this.Progress);
    }
    if (File.Exists(options.FastaFile))
    {
        snpItems.FillReferenceAlleleFromFasta(options.FastaFile, this.Progress);
    }

    // Statistics file; the suggested action per SNP name is kept for the export below.
    var actionMap = new Dictionary<string, StrandAction>();
    var statFile = options.OutputPrefix + ".stat";
    outputFiles.Add(statFile);
    using (var statWriter = new StreamWriter(statFile))
    {
        statWriter.WriteLine("Name\tChromosome\tPosition\tSource_Allele1\tSource_Allele2\tReference_Allele\tDbsnp_RefAllele\tDbsnp_AltAllele\tDbsnp_IsReversed\tG1000_RefAllele\tG1000_AltAllele\tG1000_MAF\tAction");
        foreach (var snp in snpItems)
        {
            StrandAction suggested = snp.SuggestAction();
            statWriter.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11:0.####}\t{12}", snp.Name, snp.Chrom, snp.Position, snp.Allele1, snp.Allele2, snp.RefChar, snp.DbsnpRefAllele, snp.DbsnpAltAllele, snp.DbsnpIsReversed, snp.G1000Allele1, snp.G1000Allele2, snp.G1000Allele2Frequency, suggested);
            actionMap[snp.Name] = suggested;
        }
    }

    using (var reader = new PlinkBedRandomFile(options.InputFile) { Progress = this.Progress })
    {
        var data = reader.Data;
        int individualCount = data.Individual.Count;

        var chromosomes = snpItems.Select(v => v.Chrom).Distinct().OrderBy(m => m).ToArray();
        foreach (var chr in chromosomes)
        {
            var genfile = string.Format("{0}.{1}.gen", options.OutputPrefix, chr.ToString().PadLeft(2, '0'));
            outputFiles.Add(genfile);

            var sampleFile = FileUtils.ChangeExtension(genfile, ".sample");
            new GwasSampleFormat().WriteToFile(sampleFile, data.Individual);

            //save gen file
            using (var genWriter = new StreamWriter(genfile))
            {
                genWriter.NewLine = Environment.NewLine;

                var chrItems = snpItems.Where(m => m.Chrom == chr).ToList();
                GenomeUtils.SortChromosome(chrItems, m => chr.ToString(), m => m.Position);

                foreach (var snp in chrItems)
                {
                    var genotypes = reader.Read(nameMap[snp.Name]);
                    var action = actionMap[snp.Name];

                    // For Switch / FlipSwitch a set bit means the alt allele; otherwise it means ref.
                    bool swapped = StrandAction.Switch == action || StrandAction.FlipSwitch == action;
                    char whenSet = swapped ? snp.DbsnpAltAllele : snp.DbsnpRefAllele;
                    char whenClear = swapped ? snp.DbsnpRefAllele : snp.DbsnpAltAllele;

                    genWriter.Write("{0} {1} {2} {3} {4}", snp.Chrom, snp.Name, snp.Position, snp.DbsnpRefAllele, snp.DbsnpAltAllele);

                    for (int individual = 0; individual < individualCount; individual++)
                    {
                        string triple;
                        if (PlinkData.IsMissing(genotypes[0, individual], genotypes[1, individual]))
                        {
                            triple = " 0 0 0";
                        }
                        else
                        {
                            char first = genotypes[0, individual] ? whenSet : whenClear;
                            char second = genotypes[1, individual] ? whenSet : whenClear;

                            if (first != second)
                            {
                                triple = " 0 1 0";
                            }
                            else if (first == snp.DbsnpRefAllele)
                            {
                                triple = " 1 0 0";
                            }
                            else
                            {
                                triple = " 0 0 1";
                            }
                        }
                        genWriter.Write(triple);
                    }
                    genWriter.WriteLine();
                }
            }
        }
    }

    return outputFiles;
}
/// <summary>
/// Loads a genome from a text file and rebuilds the population around it.
/// The file holds a "Nodes" section (id,type,activation per line) and a "Connections"
/// section (innovation,expressed,inNode,outNode,weight per line). The loaded genome
/// becomes the first member of the population; the remaining GenomeUtils.POP_SIZE - 1
/// slots are filled with clones of it when <paramref name="all"/> is true, otherwise with
/// fresh genomes built from inputNodes / outputNodes.
/// </summary>
/// <remarks>
/// Two fixes over the previous version. Blank lines are skipped instead of crashing in
/// int.Parse. The fresh filler genomes are created before the file is parsed, because the
/// Genome(int, int) constructor calls Counter.Reset(); creating them afterwards threw away
/// the node and connection counters taken from the file.
/// </remarks>
/// <param name="filename">Path of the genome file to read.</param>
/// <param name="all">True to fill the population with clones of the loaded genome.</param>
public void LoadGenome(string filename, bool all)
{
    // Build the fillers first so the counters set while parsing are the ones that stay.
    List<Genome> fillers = new List<Genome>();
    if (!all)
    {
        for (int i = 1; i < GenomeUtils.POP_SIZE; i++)
        {
            fillers.Add(new Genome(inputNodes, outputNodes));
        }
    }

    Genome genome = new Genome();
    using (StreamReader reader = new StreamReader(filename))
    {
        bool inNodes = false;
        bool inConnections = false;
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (line.Trim().Length == 0)
            {
                // Tolerate blank lines anywhere in the file.
                continue;
            }

            if (line.Equals("Nodes"))
            {
                inNodes = true;
                inConnections = false;
                continue;
            }
            if (line.Equals("Connections"))
            {
                inNodes = false;
                inConnections = true;
                continue;
            }

            string[] info = line.Split(',');
            if (inNodes)
            {
                int id = int.Parse(info[0]);
                NodeGene.Type type = (NodeGene.Type)System.Enum.Parse(typeof(NodeGene.Type), info[1]);
                GenomeUtils.Activation activation = (GenomeUtils.Activation)System.Enum.Parse(typeof(GenomeUtils.Activation), info[2]);
                genome.AddNodeGene(new NodeGene(type, id, activation));
                Counter.SetNodeCounter(id);
            }
            else if (inConnections)
            {
                int innovation = int.Parse(info[0]);
                bool expressed = bool.Parse(info[1]);
                int inNode = int.Parse(info[2]);
                int outNode = int.Parse(info[3]);
                float weight = float.Parse(info[4]);
                genome.AddConnectionGene(new ConnectionGene(inNode, outNode, weight, expressed, innovation));
                Counter.SetConnectionCounter(innovation);
            }
            else
            {
                // Data appeared before any section header.
                Debug.LogError("Invalid genome file");
            }
        }
    }

    genomes.Clear();
    genomes.Add(genome);
    if (all)
    {
        for (int i = 1; i < GenomeUtils.POP_SIZE; i++)
        {
            genomes.Add(GenomeUtils.Clone(genome));
        }
    }
    else
    {
        genomes.AddRange(fillers);
    }

    SetSpecies();
    MakeNNets();
}
/// <summary>
/// Reads GTF items, discards "region" features and items for which Contains against an
/// earlier item returns -1, names the remaining items per location group, and writes them
/// as six tab-separated columns (seqname, start - 1, end, name, 0, strand).
/// </summary>
/// <returns>The output file name.</returns>
public override IEnumerable<string> Process()
{
    var items = ReadGtfItems();
    items.RemoveAll(m => m.Feature.Equals("region"));

    // Scan from the back; each item is compared only against the items before it.
    for (int current = items.Count - 1; current > 0; current--)
    {
        bool drop = false;
        for (int earlier = current - 1; earlier >= 0; earlier--)
        {
            if (items[current].Contains(items[earlier]) == -1)
            {
                drop = true;
                break;
            }
        }
        if (drop)
        {
            items.RemoveAt(current);
        }
    }

    var groups = items.ToGroupDictionary(m => m.GetLocation());
    foreach (var group in groups.Values)
    {
        bool hasOtherFeature = group.Any(l => !(IsCDS(l) || IsExon(l) || IsGene(l)));
        if (hasOtherFeature)
        {
            // Keep only the non CDS/exon/gene features and name them by their ID attribute.
            group.RemoveAll(l => IsCDS(l) || IsExon(l) || IsGene(l));
            foreach (var entry in group)
            {
                entry.Name = entry.Feature + ":" + entry.Attributes.StringAfter("ID=").StringBefore(";");
                if (!entry.Attributes.Contains("product="))
                {
                    continue;
                }

                // The product is appended only when it contains no space.
                var product = entry.Attributes.StringAfter("product=").StringBefore(";");
                if (!product.Contains(" "))
                {
                    entry.Name = entry.Name + ":" + product;
                }
            }
        }
        else
        {
            // Only CDS/exon/gene here: a CDS makes the gene entries redundant.
            if (group.Any(l => IsCDS(l)))
            {
                group.RemoveAll(l => IsGene(l));
            }
            foreach (var entry in group)
            {
                entry.Name = entry.Feature + ":" + entry.Attributes.StringAfter("Name=").StringBefore(";");
            }
        }

        if (group.Count > 1)
        {
            Console.WriteLine(group[0].GetLocation() + " : " + group.Select(l => l.Feature).Merge("/"));
        }
    }

    var values = groups.Values.ToList();
    GenomeUtils.SortChromosome(values, m => m[0].Seqname, m => m[0].Start);

    using (StreamWriter writer = new StreamWriter(options.OutputFile))
    {
        foreach (var group in values)
        {
            foreach (var gtf in group)
            {
                writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}", gtf.Seqname, gtf.Start - 1, gtf.End, gtf.Name, 0, gtf.Strand);
            }
        }
    }

    return new string[] { options.OutputFile };
}
/// <summary>
/// Collects somatic mutations from the "*.somatic.maf" files found under
/// TCGADirectory/tumor/technology/platform/*Level_2* and writes two presence/absence
/// matrices (1 = present, 0 = absent, one column per participant): one row per locus to
/// the output file and one row per gene to "&lt;output&gt;.gene.tsv".
/// </summary>
/// <remarks>
/// Fixes over the previous version: the Strand column now contains the strand (it used to
/// repeat the start position), and a missing platform directory is skipped like the other
/// missing directories instead of throwing from Directory.GetDirectories.
/// </remarks>
/// <returns>The locus matrix file and the gene matrix file.</returns>
/// <exception cref="Exception">Thrown when the options fail to prepare.</exception>
public override IEnumerable<string> Process()
{
    if (!_options.PrepareOptions())
    {
        throw new Exception(_options.ParsingErrors.Merge("\n"));
    }

    // NOTE(review): acceptBarcode is built but never applied to the rows read below, so the
    // configured sample codes currently do not filter anything - confirm the intent.
    HashSet<int> sampleCodes = new HashSet<int>(_options.GetTCGASampleCodes().ToList().ConvertAll(m => m.Code));
    Func<string, bool> acceptBarcode = m => sampleCodes.Contains(new BarInfo(m, null).Sample);

    var tec = _options.GetTechnology();
    var items = new List<MutationItem>();

    foreach (var tumor in _options.TumorTypes)
    {
        var dir = Path.Combine(_options.TCGADirectory, tumor);
        if (!Directory.Exists(dir))
        {
            continue;
        }

        var tecdir = tec.GetTechnologyDirectory(dir);
        if (!Directory.Exists(tecdir))
        {
            continue;
        }

        foreach (var platform in _options.Platforms)
        {
            var platdir = Path.Combine(tecdir, platform);
            if (!Directory.Exists(platdir))
            {
                continue;
            }

            foreach (var datadir in Directory.GetDirectories(platdir, "*Level_2*"))
            {
                foreach (var maffile in Directory.GetFiles(datadir, "*.somatic.maf"))
                {
                    using (var sr = new StreamReader(maffile))
                    {
                        string line;

                        //skip comments
                        while ((line = sr.ReadLine()) != null && line.StartsWith("#"))
                        {
                        }
                        if (string.IsNullOrEmpty(line))
                        {
                            continue;
                        }

                        //read header
                        var headers = line.Split('\t');
                        var nameIndex = Array.IndexOf(headers, "Hugo_Symbol");
                        var ncbiIndex = Array.IndexOf(headers, "NCBI_Build");
                        var chromosomeIndex = Array.IndexOf(headers, "Chromosome");
                        var startIndex = Array.IndexOf(headers, "Start_position");
                        var endIndex = Array.IndexOf(headers, "End_position");
                        var strandIndex = Array.IndexOf(headers, "Strand");
                        var variantClassificationIndex = Array.IndexOf(headers, "Variant_Classification");
                        var variantTypeIndex = Array.IndexOf(headers, "Variant_Type");
                        var barcodeIndex = Array.IndexOf(headers, "Tumor_Sample_Barcode");

                        while ((line = sr.ReadLine()) != null)
                        {
                            var parts = line.Split('\t');
                            var item = new MutationItem()
                            {
                                Tumor = tumor,
                                Platform = platform,
                                Name = parts[nameIndex],
                                NcbiBuild = parts[ncbiIndex],
                                Chromosome = parts[chromosomeIndex],
                                Start = parts[startIndex],
                                End = parts[endIndex],
                                Strand = parts[strandIndex],
                                VariantClassification = parts[variantClassificationIndex],
                                VariantType = parts[variantTypeIndex],
                                TumorBarcode = parts[barcodeIndex]
                            };
                            item.InitLocus();
                            item.InitPaticipant();
                            items.Add(item);
                        }
                    }
                }
            }
        }
    }

    // Column order shared by both matrices.
    var paticipants = (from item in items select item.Paticipant).Distinct().OrderBy(m => m).ToList();

    using (var sw = new StreamWriter(_options.OutputFile))
    {
        var itemMap = items.ToDoubleDictionaryGroup(m => m.Locus, m => m.Paticipant);
        var locusList = itemMap.Keys.ToList();

        // Locus keys are split as "chromosome:start-..." for sorting.
        GenomeUtils.SortChromosome(locusList, m => m.StringBefore(":"), m => int.Parse(m.StringAfter(":").StringBefore("-")));

        sw.WriteLine("Hugo_Symbol\tNCBI_Build\tChromosome\tStart_position\tEnd_position\tStrand\tVariant_Classification\tVariant_Type\t{0}", paticipants.Merge("\t"));
        foreach (var locus in locusList)
        {
            var dic = itemMap[locus];
            var item = dic.Values.First().First();
            sw.Write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}", item.Name, item.NcbiBuild, item.Chromosome, item.Start, item.End, item.Strand, item.VariantClassification, item.VariantType);
            foreach (var paticipant in paticipants)
            {
                sw.Write(dic.ContainsKey(paticipant) ? "\t1" : "\t0");
            }
            sw.WriteLine();
        }
    }

    var genefile = FileUtils.ChangeExtension(_options.OutputFile, ".gene.tsv");
    using (var sw = new StreamWriter(genefile))
    {
        var itemMap = items.ToDoubleDictionaryGroup(m => m.Name, m => m.Paticipant);
        var nameList = itemMap.Keys.OrderBy(m => m).ToList();

        sw.WriteLine("Hugo_Symbol\t{0}", paticipants.Merge("\t"));
        foreach (var name in nameList)
        {
            var dic = itemMap[name];
            var item = dic.Values.First().First();
            sw.Write("{0}", item.Name);
            foreach (var paticipant in paticipants)
            {
                sw.Write(dic.ContainsKey(paticipant) ? "\t1" : "\t0");
            }
            sw.WriteLine();
        }
    }

    return new[] { _options.OutputFile, genefile };
}
/// <summary>
/// Runs the pileup step unless a non-empty base file already exists, runs the filter step
/// when the base file is non-empty, then builds one result entry per mutation of the
/// validation list and writes the entries as "&lt;suffix&gt;.vcf" and "&lt;suffix&gt;.tsv".
/// Mutations missing from the filter output get a placeholder entry, filled from the
/// candidates file when the position is listed there.
/// </summary>
/// <returns>The .tsv file followed by the .vcf file.</returns>
public override IEnumerable<string> Process()
{
    bool baseFileUsable = File.Exists(_options.BaseFilename) && new FileInfo(_options.BaseFilename).Length != 0;
    if (baseFileUsable)
    {
        Progress.SetMessage("Base file {0} exists, ignore pileup ...", _options.BaseFilename);
    }
    else
    {
        base.Process();
    }

    var filterOptions = options.GetFilterOptions();

    if (new FileInfo(_options.BaseFilename).Length > 0)
    {
        if (!filterOptions.PrepareOptions())
        {
            throw new Exception("Filter options failed: " + filterOptions.ParsingErrors.Merge("\n"));
        }
        new FilterProcessor(filterOptions).Process();

        // Count the filter output rows (header skipped) that mention GLM_PVALUE.
        var filterRows = File.ReadAllLines(filterOptions.ROutputFile).Skip(1).ToArray();
        var glmfailed = filterRows.Count(m => m.Contains("GLM_PVALUE"));

        // Replace a previous "glm pvalue" line at the end of the summary, if there is one.
        var summarylines = File.ReadAllLines(_options.SummaryFilename).ToList();
        if (summarylines.Last().StartsWith("glm pvalue"))
        {
            summarylines.RemoveAt(summarylines.Count - 1);
        }
        summarylines.Add(string.Format("glm pvalue > {0}\t{1}\t{2}", options.GlmPvalue, glmfailed, filterRows.Length - glmfailed));
        File.WriteAllLines(_options.SummaryFilename, summarylines);
    }

    var mutationList = GetValidationList();
    var candidates = new MpileupFisherResultFileFormat().ReadFromFile(options.CandidatesFilename).ToDictionary(m => GenomeUtils.GetKey(m.Item.SequenceIdentifier, m.Item.Position));
    var items = new FilterItemTextFormat().ReadFromFile(filterOptions.ROutputFile).ToDictionary(m => GenomeUtils.GetKey(m.Chr, m.Start));

    var result = new List<FilterItem>();
    foreach (var mutation in mutationList.Items)
    {
        var key = GenomeUtils.GetKey(mutation.Chr, mutation.Pos);

        // Present in the filter output: take that entry as is.
        if (items.ContainsKey(key))
        {
            result.Add(items[key]);
            continue;
        }

        // Otherwise start from a placeholder positioned at the mutation.
        var position = mutation.Pos.ToString();
        var item = new FilterItem();
        item.Chr = mutation.Chr;
        item.Start = position;
        item.End = position;
        item.FisherNormal = string.Empty;
        item.BrglmConverged = string.Empty;
        item.BrglmGroup = 1.0;
        item.BrglmGroupFdr = 1.0;
        item.BrglmScore = string.Empty;
        item.BrglmStrand = string.Empty;
        item.BrglmPosition = string.Empty;
        item.Identity = string.Empty;
        result.Add(item);

        if (!candidates.ContainsKey(key))
        {
            item.NormalMajorCount = 0;
            item.NormalMinorCount = 0;
            item.TumorMajorCount = 0;
            item.TumorMinorCount = 0;
            item.Filter = "No coverage";
            Console.WriteLine("No read : " + item.Filter);
            continue;
        }

        // Listed as a candidate: copy its counts and the reason it failed.
        var cand = candidates[key];
        item.ReferenceAllele = cand.Item.Nucleotide.ToString();
        item.MajorAllele = cand.Group.SucceedName;
        item.MinorAllele = cand.Group.FailedName;
        item.NormalMajorCount = cand.Group.Sample1.Succeed;
        item.NormalMinorCount = cand.Group.Sample1.Failed;
        item.TumorMajorCount = cand.Group.Sample2.Succeed;
        item.TumorMinorCount = cand.Group.Sample2.Failed;
        item.FisherGroup = cand.Group.PValue;
        item.Filter = cand.FailedReason;
        Console.WriteLine("In candidates : " + item.Filter);
    }

    var vcfFile = _options.OutputSuffix + ".vcf";
    var tsvFile = _options.OutputSuffix + ".tsv";
    new FilterItemVcfWriter(filterOptions).WriteToFile(vcfFile, result);
    new FilterItemTextFormat().WriteToFile(tsvFile, result);

    return new string[] { tsvFile, vcfFile };
}
/// <summary>
/// Builds a locus-by-sample table from several GLMVC result files. The input file lists
/// "key&lt;TAB&gt;file" pairs; each output row holds the locus columns, one cell per sample
/// ("ref/alt=normalMajor/normalMinor|tumorMajor/tumorMinor", empty when the sample lacks
/// the locus) and the number of samples in which the locus was detected.
/// </summary>
/// <remarks>
/// The representative item of a locus is now found with a dictionary lookup per sample.
/// The previous query scanned every entry of every sample for each locus.
/// </remarks>
/// <returns>The output file name.</returns>
public override IEnumerable<string> Process()
{
    var itemMap = new Dictionary<string, Dictionary<string, SomaticItem>>();

    var files = (from line in File.ReadAllLines(options.InputFile)
                 where !string.IsNullOrWhiteSpace(line)
                 let parts = line.Split('\t')
                 select new { Key = parts[0], File = parts[1] }).ToList();

    // A first row whose second column is not an existing file is treated as a header.
    if (files.Count > 0 && !File.Exists(files[0].File))
    {
        files.RemoveAt(0);
    }

    foreach (var file in files)
    {
        var items = SomaticMutationUtils.ParseGlmvcFile(file.File, options.AcceptChromosome);
        itemMap[file.Key] = items.ToDictionary(m => m.Key);
    }

    using (var sw = new StreamWriter(options.OutputFile))
    {
        var samples = itemMap.Keys.OrderBy(m => m).ToArray();

        // Leading columns: header text paired with the function producing the cell.
        var funcs = new List<Tuple<string, Func<SomaticItem, string>>>();
        Action<string, Func<SomaticItem, string>> addColumn =
            (header, getter) => funcs.Add(new Tuple<string, Func<SomaticItem, string>>(header, getter));

        addColumn("#chr", m => m.Chrom);
        addColumn("start", m => m.StartPosition.ToString());
        // NOTE(review): "end" is filled from StartPosition, same as "start" - confirm intended.
        addColumn("end", m => m.StartPosition.ToString());

        // Gene annotation columns appear only when at least one item carries a gene name.
        if (itemMap.Values.Any(m => m.Values.Any(l => !string.IsNullOrWhiteSpace(l.RefGeneName))))
        {
            addColumn("gene", m => m.RefGeneName);
            addColumn("func", m => m.RefGeneFunc);
            addColumn("exonic_func", m => m.RefGeneExonicFunc);
            addColumn("aa_change", m => m.RefGeneAAChange);
        }

        sw.Write(funcs.ConvertAll(l => l.Item1).Merge("\t"));
        foreach (var sample in samples)
        {
            sw.Write("\t{0}", sample);
        }
        sw.WriteLine("\tDetectedTimes");

        // Locus keys are split as "chromosome_position" for sorting.
        var locus = (from v in itemMap.Values from vv in v.Keys select vv).Distinct().ToList();
        GenomeUtils.SortChromosome(locus, m => m.StringBefore("_"), m => int.Parse(m.StringAfter("_")));

        foreach (var loc in locus)
        {
            // Representative item: the one from the first sample dictionary holding this locus.
            SomaticItem item = default(SomaticItem);
            foreach (var sampleItems in itemMap.Values)
            {
                if (sampleItems.TryGetValue(loc, out item))
                {
                    break;
                }
            }

            sw.Write("{0}", funcs.ConvertAll(l => l.Item2(item)).Merge("\t"));

            var count = 0;
            foreach (var sample in samples)
            {
                SomaticItem curitem;
                if (itemMap[sample].TryGetValue(loc, out curitem))
                {
                    sw.Write("\t{0}/{1}={2}/{3}|{4}/{5}", curitem.RefAllele, curitem.AltAllele, curitem.NormalMajorCount, curitem.NormalMinorCount, curitem.TumorMajorCount, curitem.TumorMinorCount);
                    count++;
                }
                else
                {
                    sw.Write("\t");
                }
            }
            sw.WriteLine("\t{0}", count);
        }
    }

    return new[] { options.OutputFile };
}
/// <summary>
/// Merges several annovar result files into one table. Comment lines are copied from the
/// first file (plus the "##MuTect=" line of every other file). Each output row holds the
/// shared annovar columns for a position, then one normal-allele column per distinct
/// normal sample and one tumor-allele column per distinct tumor sample. When a detail VCF
/// is configured for an annovar file, the sample alleles are refreshed from that VCF.
/// </summary>
/// <remarks>
/// Fixes over the previous version: the "not included in detail vcf file" messages now
/// print the annovar file name (the format string repeated {1} instead of using {2}), and
/// position keys are split at the last '_' so chromosome names containing '_' no longer
/// break the integer parse.
/// </remarks>
/// <returns>The output file name.</returns>
/// <exception cref="Exception">
/// Thrown when a detail VCF does not contain the normal or tumor sample column.
/// </exception>
public override IEnumerable<string> Process()
{
    var files = _options.GetAnnovarFiles();
    var filelist = files.Keys.ToArray();

    using (var sw = new StreamWriter(_options.OutputFile))
    {
        //deal with comments
        using (var sr = new StreamReader(filelist[0]))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                if (line.StartsWith("##MuTect="))
                {
                    sw.WriteLine(line);

                    // Append the first ##MuTect= line of each remaining file.
                    for (var i = 1; i < filelist.Length; i++)
                    {
                        using (var sr2 = new StreamReader(filelist[i]))
                        {
                            string otherLine;
                            while ((otherLine = sr2.ReadLine()) != null)
                            {
                                if (otherLine.StartsWith("##MuTect="))
                                {
                                    sw.WriteLine(otherLine);
                                    break;
                                }
                            }
                        }
                    }
                }
                else if (line.StartsWith("#"))
                {
                    sw.WriteLine(line);
                }
                else
                {
                    // First non-comment line: the comment block is over.
                    break;
                }
            }
        }

        //deal with data
        var data = new List<FileData>();
        foreach (var file in filelist)
        {
            var lines = File.ReadAllLines(file);
            var mutect = lines.FirstOrDefault(m => m.StartsWith("##MuTect="));

            // Column names to look up (normal/tumor) and names to report (normalName/tumorName).
            string normal, tumor, normalName, tumorName;
            if (mutect != null)
            {
                normal = mutect.StringAfter("normal_sample_name=").StringBefore(" ");
                tumor = mutect.StringAfter("tumor_sample_name=").StringBefore(" ");
                normalName = normal;
                tumorName = tumor;
            }
            else
            {
                normal = "NORMAL";
                tumor = "TUMOR";
                normalName = Path.GetFileName(file).StringBefore(".") + "_normal";
                tumorName = Path.GetFileName(file).StringBefore(".") + "_tumor";
            }

            var header = lines.First(m => !m.StartsWith("#"));
            var headers = header.Split('\t');
            var infoIndex = Array.IndexOf(headers, "INFO");
            var formatIndex = Array.IndexOf(headers, "FORMAT");
            var normalIndex = Array.IndexOf(headers, normal);
            var tumorIndex = Array.IndexOf(headers, tumor);

            var dictionary = new Dictionary<string, FileDataValue>();
            foreach (var line in lines)
            {
                if (string.IsNullOrWhiteSpace(line) || line.StartsWith("#") || line.StartsWith("Chr"))
                {
                    continue;
                }

                var parts = line.Split('\t');
                if (parts.Length != headers.Length)
                {
                    continue;
                }

                var value = new FileDataValue()
                {
                    Key = parts[0] + "_" + parts[1],
                    Parts = parts,
                    VNormal = GetAllele(parts, normalIndex),
                    VTumor = GetAllele(parts, tumorIndex)
                };
                dictionary.Add(value.Key, value);
            }

            data.Add(new FileData()
            {
                File = file,
                Normal = normalName,
                Tumor = tumorName,
                Headers = headers,
                InfoIndex = infoIndex,
                FormatIndex = formatIndex,
                NormalIndex = normalIndex,
                TumorIndex = tumorIndex,
                Data = dictionary
            });
        }

        //get all positions
        var keys = (from d in data from k in d.Data.Keys select k).Distinct().ToList().ConvertAll(m =>
        {
            // Keys are "<chromosome>_<position>"; split at the LAST '_' because the
            // chromosome name itself may contain underscores.
            var separator = m.LastIndexOf('_');
            return new { Key = m, Chr = m.Substring(0, separator), Position = int.Parse(m.Substring(separator + 1)) };
        });
        GenomeUtils.SortChromosome(keys, m => m.Chr, m => m.Position);
        var knownKeys = new HashSet<string>(keys.Select(m => m.Key));

        //check by original vcf file to fill the other columns
        foreach (var d in data)
        {
            var vcf = files[d.File];
            if (string.IsNullOrEmpty(vcf))
            {
                continue;
            }

            var vd = d.Data;
            using (var sr = new StreamReader(vcf))
            {
                string line;
                var normalIndex = -1;
                var tumorIndex = -1;

                // Locate the sample columns on the #CHROM header line.
                while ((line = sr.ReadLine()) != null)
                {
                    if (!line.StartsWith("#CHROM"))
                    {
                        continue;
                    }
                    var parts = line.Split('\t');
                    normalIndex = Array.IndexOf(parts, d.Normal);
                    tumorIndex = Array.IndexOf(parts, d.Tumor);
                    break;
                }

                if (normalIndex == -1)
                {
                    throw new Exception(string.Format("Normal {0} is not included in detail vcf file {1} but in annovar result {2}", d.Normal, vcf, d.File));
                }
                if (tumorIndex == -1)
                {
                    throw new Exception(string.Format("Tumor {0} is not included in detail vcf file {1} but in annovar result {2}", d.Tumor, vcf, d.File));
                }

                var minIndex = Math.Max(normalIndex, tumorIndex) + 1;
                while ((line = sr.ReadLine()) != null)
                {
                    var parts = line.Split('\t');
                    if (parts.Length < minIndex)
                    {
                        // A row too short to hold both sample columns ends the scan.
                        break;
                    }

                    var key = parts[0] + "_" + parts[1];
                    if (!knownKeys.Contains(key))
                    {
                        continue;
                    }

                    // Positions known only from other files get an entry without annovar columns.
                    FileDataValue fdv;
                    if (!vd.TryGetValue(key, out fdv))
                    {
                        fdv = new FileDataValue() { Key = key, Parts = null };
                        vd[key] = fdv;
                    }
                    fdv.VNormal = GetAllele(parts, normalIndex);
                    fdv.VTumor = GetAllele(parts, tumorIndex);
                }
            }
        }

        //write header
        var first = data[0];
        for (var i = 0; i < first.Headers.Length; i++)
        {
            if (i == first.NormalIndex || i == first.TumorIndex || i == first.InfoIndex || i == first.FormatIndex)
            {
                continue;
            }
            if (i != 0)
            {
                sw.Write("\t");
            }
            sw.Write(first.Headers[i]);
        }

        var normalnames = (from d in data select d.Normal).Distinct().ToArray();
        sw.Write("\t{0}", normalnames.Merge('\t'));
        var tumornames = (from d in data select d.Tumor).Distinct().ToArray();
        sw.WriteLine("\t{0}", tumornames.Merge('\t'));

        foreach (var key in keys)
        {
            // Shared columns come from the first file that has annovar columns for this position.
            var d1 = data.First(d => d.Data.ContainsKey(key.Key) && d.Data[key.Key].Parts != null);
            var v1 = d1.Data[key.Key];
            for (var i = 0; i < v1.Parts.Length; i++)
            {
                if (i == 0)
                {
                    sw.Write("{0}", v1.Parts[0]);
                }
                else if (i == d1.InfoIndex || i == d1.FormatIndex || i == d1.NormalIndex || i == d1.TumorIndex)
                {
                    continue;
                }
                else
                {
                    sw.Write("\t{0}", v1.Parts[i]);
                }
            }

            foreach (var name in normalnames)
            {
                var dn = (from d in data where d.Normal.Equals(name) && d.Data.ContainsKey(key.Key) select d).FirstOrDefault();
                if (dn == null)
                {
                    sw.Write("\t");
                }
                else
                {
                    sw.Write("\t{0}", dn.Data[key.Key].VNormal);
                }
            }

            foreach (var name in tumornames)
            {
                var dt = (from d in data where d.Tumor.Equals(name) && d.Data.ContainsKey(key.Key) select d).FirstOrDefault();
                if (dt == null)
                {
                    sw.Write("\t");
                }
                else
                {
                    sw.Write("\t{0}", dt.Data[key.Key].VTumor);
                }
            }

            sw.WriteLine();
        }
    }

    return new[] { _options.OutputFile };
}
/// <summary>
/// Builds the next generation. The genomes of the first nets are carried over (a share of
/// the population given by survivalChance), each species then contributes children in
/// proportion to its share of the total fitness, any remaining slots are filled with
/// children of the first species, and finally every genome rolls once for a single
/// mutation (weights, add node, or add connection).
/// </summary>
/// <remarks>
/// The crossover logic, previously duplicated in two loops, now lives in
/// <see cref="CrossoverFromSpecies"/>; the throw-away "new Genome()" that was created
/// before each crossover and immediately overwritten has been removed.
/// </remarks>
private void NextGen()
{
    Global.game.restart();
    generation++;

    float totalFitness = 0;
    foreach (Species species in speciesList)
    {
        totalFitness += species.GetFitness();
    }

    float leftPopulation = population * (1 - survivalChance);
    List<Genome> nextGenomes = new List<Genome>();

    // Survivors: genomes of the first nets, added as they are.
    int survivorCount = (int)(population * survivalChance);
    for (int i = 0; i < survivorCount; i++)
    {
        nextGenomes.Add(nets[i].GetGenome());
    }

    // Each species breeds a number of children proportional to its fitness share.
    foreach (Species species in speciesList)
    {
        int quota = (int)(species.GetFitness() / totalFitness * leftPopulation);
        for (int i = 0; i < quota; i++)
        {
            nextGenomes.Add(CrossoverFromSpecies(species));
        }
    }

    // Truncation above can leave free slots; fill them from the first species.
    while (nextGenomes.Count < population)
    {
        nextGenomes.Add(CrossoverFromSpecies(speciesList[0]));
    }

    // One roll per genome selects at most one kind of mutation.
    foreach (Genome genome in nextGenomes)
    {
        double roll = random.NextDouble();
        if (roll < weightMutationChance)
        {
            genome.Mutate(randomWeightChance, random);
        }
        else if (roll < weightMutationChance + addNodeChance)
        {
            genome.AddNodeMutation(random);
        }
        else if (roll < weightMutationChance + addNodeChance + addConnectionChance)
        {
            genome.AddConnectionMutation(random);
        }
    }

    foreach (Species species in speciesList)
    {
        species.Reset();
    }

    genomes = nextGenomes;
}

/// <summary>
/// Picks two random genomes from the species and crosses them, passing the one whose
/// network has the strictly greater fitness as the first parent (the second pick goes
/// first on a tie).
/// </summary>
private Genome CrossoverFromSpecies(Species species)
{
    Genome parent1 = species.GetRandomGenome(random);
    Genome parent2 = species.GetRandomGenome(random);

    if (networkMap[parent1].GetFitness() > networkMap[parent2].GetFitness())
    {
        return GenomeUtils.Crossover(parent1, parent2, random);
    }
    return GenomeUtils.Crossover(parent2, parent1, random);
}
/// <summary>
/// Reads mpileup lines from the configured source (mpileup file, samtools run on the
/// normal/tumor BAM files, or standard input), parses every line whose locus is in
/// the validation list, and collects the parse results.
/// </summary>
/// <returns>
/// The result object; <c>NotCovered</c> is the number of validation entries minus the
/// number of collected results.
/// </returns>
/// <exception cref="Exception">
/// Thrown when samtools cannot be started, or when a line fails to parse (the
/// original exception is attached as the inner exception).
/// </exception>
protected override MpileupResult GetMpileupResult()
{
    var result = new MpileupResult(string.Empty, _options.CandidatesDirectory);

    Progress.SetMessage("Single thread mode ...");
    var parser = _options.GetPileupItemParser(false);
    var pfile = new PileupFile(parser);

    var mutationList = GetValidationList();
    result.TotalCount = mutationList.Items.Length;

    // Lookup of the loci to validate, keyed by chromosome and position.
    var map = mutationList.Items.ToDictionary(m => GenomeUtils.GetKey(m.Chr, m.Pos));

    switch (_options.From)
    {
        case DataSourceType.Mpileup:
            pfile.Open(_options.MpileupFile);
            break;
        case DataSourceType.BAM:
            // Write the positions to a bed file and let samtools pile up both BAM files on it.
            var posFile = Path.Combine(_options.CandidatesDirectory, "pos.bed");
            mutationList.WriteToFile(posFile, 500);
            var samtools = new MpileupProcessor(_options).ExecuteSamtools(new[] { _options.NormalBam, _options.TumorBam }, "", posFile);
            if (samtools == null)
            {
                throw new Exception("Cannot execute mpileup.");
            }

            pfile.Open(samtools.StandardOutput);
            pfile.Samtools = samtools;
            break;
        case DataSourceType.Console:
            pfile.Open(Console.In);
            break;
    }

    Progress.SetMessage("Total {0} entries in validation list", mutationList.Items.Length);

    // NOTE(review): this prints every validation key to standard output; it looks like
    // leftover debugging output - confirm before removing.
    foreach (var m in map)
    {
        Console.WriteLine(m.Key);
    }

    using (pfile)
    {
        try
        {
            IMpileupParser lineParser = new ValidationParser(_options, result);

            string line;
            while ((line = pfile.ReadLine()) != null)
            {
                try
                {
                    var locus = parser.GetSequenceIdentifierAndPosition(line);
                    var locusKey = GenomeUtils.GetKey(locus.SequenceIdentifier, locus.Position);

                    // Only loci in the validation list are of interest.
                    if (!map.ContainsKey(locusKey))
                    {
                        continue;
                    }

                    var parres = lineParser.Parse(line, true);
                    if (!string.IsNullOrEmpty(parres.FailedReason))
                    {
                        Progress.SetMessage("{0}\t{1}\t{2} ~ {3}\t{4}", parres.Item.SequenceIdentifier, parres.Item.Position, parres.Group.Sample1, parres.Group.Sample2, parres.FailedReason);
                    }

                    result.Results.Add(parres);
                }
                catch (Exception ex)
                {
                    var error = string.Format("parsing error {0}\n{1}", ex.Message, line);
                    Progress.SetMessage(error);
                    Console.Error.WriteLine(ex.StackTrace);

                    // Keep the original exception as the inner exception so its type and
                    // stack trace are not lost to callers.
                    throw new Exception(error, ex);
                }
            }
        }
        finally
        {
            if (pfile.Samtools != null)
            {
                // Best effort: the samtools process may already have exited.
                try
                {
                    pfile.Samtools.Kill();
                }
                // ReSharper disable once EmptyGeneralCatchClause
                catch (Exception)
                {
                }
            }
        }
    }

    result.NotCovered = result.TotalCount - result.Results.Count;
    return result;
}
/// <summary>
/// Runs samtools mpileup over the configured BAM files at the positions listed in the
/// bed file and writes, for every listed position, the original line followed by one
/// tab-separated column of "event:count" pairs per sample.
/// </summary>
/// <returns>
/// A single-element array holding the output file name, or <c>null</c> when the
/// samtools process could not be started.
/// </returns>
/// <exception cref="Exception">
/// Thrown when a pileup line fails to parse (the original exception is attached as
/// the inner exception) or when no listed position was found in the pileup output.
/// </exception>
public override IEnumerable<string> Process()
{
    options.PrintParameter(Console.Out);

    Progress.SetMessage("Single thread mode ...");
    var parser = options.GetPileupItemParser();
    var pfile = new PileupFile(parser);

    var mutationList = new ValidationFile().ReadFromFile(options.BedFile);

    // Lookup of the positions to extract, keyed by chromosome and position.
    var map = mutationList.Items.ToDictionary(m => GenomeUtils.GetKey(m.Chr, m.Pos));

    var posFile = Path.Combine(options.OutputFile + ".pos.bed");
    mutationList.WriteToFile(posFile, 500);

    var proc = new MpileupProcessor(options).ExecuteSamtools(options.BamFiles, "", posFile);
    if (proc == null)
    {
        return null;
    }

    pfile.Open(proc.StandardOutput);
    pfile.Samtools = proc;

    Progress.SetMessage("Total {0} entries in extraction list", mutationList.Items.Length);

    var result = new Dictionary<ValidationItem, PileupItem>();
    using (pfile)
    {
        try
        {
            string line;
            string lastChrom = string.Empty;
            while ((line = pfile.ReadLine()) != null)
            {
                try
                {
                    var locus = parser.GetSequenceIdentifierAndPosition(line);
                    var locusKey = GenomeUtils.GetKey(locus.SequenceIdentifier, locus.Position);

                    // Report progress once per chromosome.
                    if (!locus.SequenceIdentifier.Equals(lastChrom))
                    {
                        Progress.SetMessage("Processing chromosome " + locus.SequenceIdentifier + " ...");
                        lastChrom = locus.SequenceIdentifier;
                    }

                    ValidationItem vitem;
                    if (!map.TryGetValue(locusKey, out vitem))
                    {
                        continue;
                    }

                    result[vitem] = parser.GetValue(line);
                }
                catch (Exception ex)
                {
                    var error = string.Format("Parsing error {0}\n{1}", ex.Message, line);
                    Progress.SetMessage(error);
                    Console.Error.WriteLine(ex.StackTrace);

                    // Keep the original exception as the inner exception so its type and
                    // stack trace are not lost to callers.
                    throw new Exception(error, ex);
                }
            }
        }
        finally
        {
            if (pfile.Samtools != null)
            {
                // Best effort: the samtools process may already have exited.
                try
                {
                    pfile.Samtools.Kill();
                }
                // ReSharper disable once EmptyGeneralCatchClause
                catch (Exception)
                {
                }
            }
        }
    }

    if (result.Count == 0)
    {
        throw new Exception("Nothing found. Look at the log file for error please.");
    }

    using (var sw = new StreamWriter(options.OutputFile))
    {
        sw.WriteLine("{0}\t{1}", mutationList.Header, options.GetBamNames().Merge("\t"));

        // One empty column per BAM file for positions without pileup data.
        var emptyevents = new string('\t', options.BamFiles.Count);

        foreach (var mu in mutationList.Items)
        {
            sw.Write("{0}", mu.Line);

            PileupItem item;
            if (result.TryGetValue(mu, out item))
            {
                foreach (var sample in item.Samples)
                {
                    sample.InitEventCountList(false);
                    if (sample.EventCountList.Count > 0)
                    {
                        sw.Write("\t{0}", (from ecl in sample.EventCountList
                                           let v = string.Format("{0}:{1}", ecl.Event, ecl.Count)
                                           select v).Merge(","));
                    }
                    else
                    {
                        sw.Write("\t");
                    }
                }
            }
            else
            {
                sw.Write(emptyevents);
            }

            sw.WriteLine();
        }
    }

    return new[] { options.OutputFile };
}
/// <summary>
/// Creates a node gene with the given type and id; its activation is taken from
/// <c>GenomeUtils.RandomActivation()</c>.
/// </summary>
/// <param name="type">Type of the node.</param>
/// <param name="id">Identifier of the node.</param>
public NodeGene(Type type, int id)
{
    this.id = id;
    this.type = type;
    activation = GenomeUtils.RandomActivation();
}
/// <summary>
/// For every candidate sequence and every possible seed offset, looks up the seed of
/// minimum length in the target seed map, extends the hits to the longest matching
/// target, and writes one tab-separated line per extended target to the output file.
/// </summary>
/// <returns>A single-element array holding the output file name.</returns>
public override IEnumerable<string> Process()
{
    var candidates = options.ReadSeeds();
    Progress.SetMessage("Total {0} seeds readed.", candidates.Length);

    var offsets = GetPossibleOffsets(string.Empty);

    Progress.SetMessage("Build target {0} mers...", options.MinimumSeedLength);

    // Skip offsets where the candidate is too short for a full seed: Substring would
    // throw there, and the search loop below skips those positions as well.
    var seeds = (from seq in candidates
                 from offset in offsets
                 where seq.Length >= offset + options.MinimumSeedLength
                 select seq.Substring(offset, options.MinimumSeedLength)).ToList();
    var targetSeedMap = ParclipUtils.BuildTargetSeedMap(options, seeds, this.Progress);

    Progress.SetMessage("Finding target...");
    using (var sw = new StreamWriter(options.OutputFile))
    {
        sw.WriteLine("Sequence\tSeed\tSeedOffset\tSeedLength\tTarget\tTargetCoverage\tTargetGeneSymbol\tTargetName");

        foreach (var seq in candidates)
        {
            foreach (var offset in offsets)
            {
                if (seq.Length < offset + options.MinimumSeedLength)
                {
                    break;
                }

                var seed = seq.Substring(offset, options.MinimumSeedLength);

                List<SeedItem> target;
                if (!targetSeedMap.TryGetValue(seed, out target))
                {
                    continue;
                }

                // Equal coverage everywhere: order by genomic location; otherwise put
                // the highest coverage first.
                if (target.ConvertAll(l => l.Coverage).Distinct().Count() == 1)
                {
                    GenomeUtils.SortChromosome(target, m => m.Seqname, m => m.Start);
                }
                else
                {
                    target.Sort((m1, m2) => m2.Coverage.CompareTo(m1.Coverage));
                }

                var longest = ParclipUtils.ExtendToLongestTarget(target, null, seq, offset, options.MinimumSeedLength, int.MaxValue, options.MinimumCoverage);

                foreach (var t in longest)
                {
                    var finalSeed = seq.Substring(offset, (int)t.Length);
                    sw.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}:{5}-{6}:{7}\t{8}\t{9}\t{10}",
                                 seq,
                                 finalSeed,
                                 offset,
                                 finalSeed.Length,
                                 t.Seqname,
                                 t.Start,
                                 t.End,
                                 t.Strand,
                                 t.Coverage,
                                 t.GeneSymbol,
                                 t.Name);
                }
            }
        }
    }

    return new[] { options.OutputFile };
}
/// <summary>
/// Builds a smallRNA coordinate database from the configured annotation sources
/// (miRBase bed/gff, UCSC tRNA bed and mature tRNA fasta, Ensembl gtf, rRNA fasta).
/// </summary>
/// <remarks>
/// Files written, all derived from <c>options.OutputFile</c>:
/// the parameter file (".param"), the bed file itself, a miRNA-only bed file,
/// ".miss1" and ".miss0" bed files, a biotype summary (".info"), optionally a combined
/// ".fasta" (genome plus mature tRNA / rRNA sequences), and a ".fa" file with the
/// extracted sequences.
/// </remarks>
/// <returns>
/// The output bed file name, followed by the combined fasta file name when that file
/// was written.
/// </returns>
public override IEnumerable<string> Process()
{
    var paramFile = options.OutputFile + ".param";
    options.SaveToFile(paramFile);

    var bedfile = new BedItemFile<BedItem>(6);

    Progress.SetMessage("building chromosome name map ...");

    var mitoName = "M";
    Dictionary<string, string> chrNameMap = new Dictionary<string, string>();
    var ff = new FastaFormat(int.MaxValue);

    // Prefer the fasta index (first column = sequence name) over scanning the fasta.
    var faiFile = options.FastaFile + ".fai";
    if (File.Exists(faiFile))
    {
        using (StreamReader sr = new StreamReader(faiFile))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                RegisterChromosomeName(chrNameMap, line.Split('\t')[0], ref mitoName);
            }
        }
    }
    else
    {
        using (StreamReader sr = new StreamReader(options.FastaFile))
        {
            Sequence seq;
            while ((seq = ff.ReadSequence(sr)) != null)
            {
                RegisterChromosomeName(chrNameMap, seq.Name, ref mitoName);
            }
        }
    }

    // A reference without a mitochondrial sequence has no entry for mitoName; fall
    // back to the short name instead of failing with a KeyNotFoundException.
    string longMitoName;
    if (!chrNameMap.TryGetValue(mitoName, out longMitoName))
    {
        longMitoName = mitoName;
    }

    Progress.SetMessage("mitochondral chromosome name = {0}", longMitoName);

    var mirnas = new List<BedItem>();
    if (File.Exists(options.MiRBaseFile))
    {
        Progress.SetMessage("Processing {0} ...", options.MiRBaseFile);

        if (options.MiRBaseFile.EndsWith(".bed"))
        {
            mirnas = bedfile.ReadFromFile(options.MiRBaseFile);
            mirnas.ForEach(m =>
            {
                m.Seqname = m.Seqname.StringAfter("chr");
                m.Name = options.MiRBaseKey + ":" + m.Name;
            });
        }
        else
        {
            using (var gf = new GtfItemFile(options.MiRBaseFile))
            {
                GtfItem item;
                while ((item = gf.Next(options.MiRBaseKey)) != null)
                {
                    BedItem loc = new BedItem();
                    loc.Seqname = item.Seqname.StringAfter("chr");
                    loc.Start = item.Start - 1;
                    loc.End = item.End;
                    loc.Name = options.MiRBaseKey + ":" + item.Attributes.StringAfter("Name=").StringBefore(";");
                    loc.Score = 1000;
                    loc.Strand = item.Strand;
                    mirnas.Add(loc);
                }
            }
        }

        Progress.SetMessage("{0} miRNA readed.", mirnas.Count);
    }

    List<BedItem> trnas = new List<BedItem>();
    if (File.Exists(options.UcscTrnaFile))
    {
        //reading tRNA from ucsc table without mitocondrom tRNA
        Progress.SetMessage("Processing {0} ...", options.UcscTrnaFile);
        trnas = bedfile.ReadFromFile(options.UcscTrnaFile);
        trnas.ForEach(m => m.Seqname = m.Seqname.StringAfter("chr"));

        // Only filter when that would not throw away every entry.
        var removed = trnas.Where(m => (m.Seqname.Length > 1) && !m.Seqname.All(n => char.IsDigit(n))).ToList();
        if (removed.Count != trnas.Count)
        {
            //remove the tRNA not from 1-22, X and Y
            trnas.RemoveAll(m => (m.Seqname.Length > 1) && !m.Seqname.All(n => char.IsDigit(n)));

            //mitocondrom tRNA will be extracted from ensembl gtf file
            trnas.RemoveAll(m => m.Seqname.Equals("M") || m.Seqname.Equals("MT"));
        }

        trnas.ForEach(m => m.Name = GetTRNAName(m.Name));

        Progress.SetMessage("{0} tRNA from ucsc readed.", trnas.Count);

        if (File.Exists(options.UcscMatureTrnaFastaFile))
        {
            // Mature tRNA sequences become their own "chromosomes" spanning the whole sequence.
            var seqs = SequenceUtils.Read(options.UcscMatureTrnaFastaFile);
            foreach (var seq in seqs)
            {
                var tRNAName = GetTRNAName(seq.Name);
                trnas.Add(new BedItem()
                {
                    Seqname = seq.Name,
                    Start = 0,
                    End = seq.SeqString.Length,
                    Strand = '+',
                    Name = tRNAName,
                    Sequence = seq.SeqString
                });
            }
        }
    }

    var others = new List<BedItem>();
    if (File.Exists(options.EnsemblGtfFile))
    {
        //reading smallRNA/tRNA from ensembl gtf file
        Progress.SetMessage("Processing {0} ...", options.EnsemblGtfFile);

        // Checked once here instead of once per gtf record.
        bool hasUcscTrna = File.Exists(options.UcscTrnaFile);

        using (var gf = new GtfItemFile(options.EnsemblGtfFile))
        {
            var biotypes = new HashSet<string>(SmallRNAConsts.Biotypes);
            biotypes.Remove(SmallRNAConsts.miRNA);

            GtfItem item;
            int count = 0;
            while ((item = gf.Next("gene")) != null)
            {
                string biotype;
                if (item.Attributes.Contains("gene_biotype"))
                {
                    biotype = item.Attributes.StringAfter("gene_biotype \"").StringBefore("\"");
                }
                else if (item.Attributes.Contains("gene_type"))
                {
                    biotype = item.Attributes.StringAfter("gene_type \"").StringBefore("\"");
                }
                else
                {
                    continue;
                }

                // tRNA comes from the UCSC file when that file is available.
                if (hasUcscTrna && biotype.Equals(SmallRNAConsts.tRNA))
                {
                    continue;
                }

                if (biotype.Equals("Mt_tRNA"))
                {
                    count++;
                    var gene_name = item.Attributes.Contains("gene_name") ? item.Attributes.StringAfter("gene_name \"").StringBefore("\"") : item.GeneId;
                    BedItem loc = new BedItem();
                    loc.Seqname = mitoName;
                    loc.Start = item.Start - 1;
                    loc.End = item.End;
                    loc.Name = string.Format(SmallRNAConsts.mt_tRNA + ":" + longMitoName + ".tRNA{0}-{1}", count, gene_name.StringAfter("-"));
                    loc.Score = 1000;
                    loc.Strand = item.Strand;
                    trnas.Add(loc);
                }
                else if (biotypes.Contains(biotype))
                {
                    string seqName;
                    if (item.Seqname.ToLower().StartsWith("chr"))
                    {
                        seqName = item.Seqname.Substring(3);
                    }
                    else
                    {
                        seqName = item.Seqname;
                    }

                    if (seqName.Equals("M") || seqName.Equals("MT"))
                    {
                        seqName = mitoName;
                    }

                    var gene_name = item.Attributes.StringAfter("gene_name \"").StringBefore("\"");
                    var lowGeneName = gene_name.ToLower();
                    if (lowGeneName.StartsWith("rny") || lowGeneName.Equals("y_rna"))
                    {
                        biotype = "yRNA";
                    }

                    BedItem loc = new BedItem();
                    loc.Seqname = seqName;
                    loc.Start = item.Start - 1;
                    loc.End = item.End;
                    loc.Name = biotype + ":" + gene_name + ":" + item.GeneId;
                    loc.Score = 1000;
                    loc.Strand = item.Strand;
                    others.Add(loc);
                }
            }
        }
    }

    var all = new List<BedItem>();
    all.AddRange(mirnas);
    all.AddRange(trnas);
    all.AddRange(others);

    // Translate every sequence name to the name used in the reference fasta, when known.
    foreach (var bi in all)
    {
        string mappedName;
        if (chrNameMap.TryGetValue(bi.Seqname, out mappedName))
        {
            bi.Seqname = mappedName;
        }
    }

    if (File.Exists(options.RRNAFile))
    {
        var seqs = SequenceUtils.Read(options.RRNAFile);
        foreach (var seq in seqs)
        {
            all.Add(new BedItem()
            {
                Seqname = seq.Name,
                Start = 0,
                End = seq.SeqString.Length,
                Strand = '+',
                Name = "rRNA:" + SmallRNAConsts.rRNADB_KEY + seq.Name
            });
        }
    }

    Progress.SetMessage("Saving smallRNA coordinates to " + options.OutputFile + "...");
    using (var sw = new StreamWriter(options.OutputFile))
    {
        foreach (var pir in SmallRNAConsts.Biotypes)
        {
            var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
            WriteSortedLocations(sw, bedfile, pir, locs);
        }
    }

    var miRNA_bed = FileUtils.ChangeExtension(options.OutputFile, ".miRNA.bed");
    Progress.SetMessage("Saving miRNA coordinates to " + miRNA_bed + "...");
    using (var sw = new StreamWriter(miRNA_bed))
    {
        var pir = SmallRNAConsts.miRNA;
        var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
        WriteSortedLocations(sw, bedfile, pir, locs);
    }

    // ".miss1": everything except lincRNA/lncRNA and the entries from the rRNA database.
    Progress.SetMessage("Saving smallRNA miss1 coordinates to " + options.OutputFile + ".miss1 ...");
    using (var sw = new StreamWriter(options.OutputFile + ".miss1"))
    {
        foreach (var pir in SmallRNAConsts.Biotypes)
        {
            if (pir == SmallRNABiotype.lincRNA.ToString() || pir == SmallRNABiotype.lncRNA.ToString())
            {
                continue;
            }

            var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
            locs.RemoveAll(l => l.Name.Contains(SmallRNAConsts.rRNADB_KEY));
            WriteSortedLocations(sw, bedfile, pir, locs);
        }
    }

    // ".miss0": only lincRNA/lncRNA plus the rRNA entries from the rRNA database.
    Progress.SetMessage("Saving smallRNA miss0 coordinates to " + options.OutputFile + ".miss0 ...");
    using (var sw = new StreamWriter(options.OutputFile + ".miss0"))
    {
        foreach (var pir in SmallRNAConsts.Biotypes)
        {
            if (pir != SmallRNABiotype.lincRNA.ToString() && pir != SmallRNABiotype.lncRNA.ToString() && pir != SmallRNABiotype.rRNA.ToString())
            {
                continue;
            }

            var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
            if (pir == SmallRNABiotype.rRNA.ToString())
            {
                locs.RemoveAll(l => !l.Name.Contains(SmallRNAConsts.rRNADB_KEY));
            }

            WriteSortedLocations(sw, bedfile, pir, locs);
        }
    }

    var summaryFile = options.OutputFile + ".info";
    Progress.SetMessage("Writing summary to " + summaryFile + "...");
    using (var sw = new StreamWriter(summaryFile))
    {
        sw.WriteLine("Biotype\tCount");

        // Count distinct names per biotype (the text before the first ':'), largest first.
        all.ConvertAll(m => m.Name)
           .Distinct()
           .GroupBy(m => m.StringBefore(":"))
           .OrderByDescending(m => m.Count())
           .ToList()
           .ForEach(m => sw.WriteLine("{0}\t{1}", m.Key, m.Count()));
    }

    var result = new List<string>(new[] { options.OutputFile });

    var fasta = Path.ChangeExtension(options.OutputFile, ".fasta");
    if ((File.Exists(options.UcscTrnaFile) && File.Exists(options.UcscMatureTrnaFastaFile)) || File.Exists(options.RRNAFile))
    {
        result.Add(fasta);

        // Combined fasta: genome first, then the extra tRNA / rRNA sequences.
        using (var sw = new StreamWriter(fasta))
        {
            AppendFileLines(options.FastaFile, sw);

            if (File.Exists(options.UcscTrnaFile) && File.Exists(options.UcscMatureTrnaFastaFile))
            {
                AppendFileLines(options.UcscMatureTrnaFastaFile, sw);
            }

            if (File.Exists(options.RRNAFile))
            {
                AppendFileLines(options.RRNAFile, sw);
            }
        }
    }

    var faFile = options.OutputFile + ".fa";
    Progress.SetMessage("Extracting sequence from " + options.FastaFile + "...");
    var b2foptions = new Bed2FastaProcessorOptions()
    {
        GenomeFastaFile = options.FastaFile,
        InputFile = options.OutputFile,
        OutputFile = faFile,
        KeepChrInName = false,
    };

    // With a mature tRNA fasta the tRNA sequences are appended from that file below,
    // so they are not extracted from the genome.
    if (!File.Exists(options.UcscMatureTrnaFastaFile))
    {
        b2foptions.AcceptName = m => m.StartsWith(SmallRNAConsts.miRNA) || m.StartsWith(SmallRNAConsts.mt_tRNA) || m.StartsWith(SmallRNAConsts.tRNA);
    }
    else
    {
        b2foptions.AcceptName = m => m.StartsWith(SmallRNAConsts.miRNA) || m.StartsWith(SmallRNAConsts.mt_tRNA);
    }

    new Bed2FastaProcessor(b2foptions) { Progress = this.Progress }.Process();

    if (File.Exists(options.UcscMatureTrnaFastaFile))
    {
        Progress.SetMessage("Extracting sequence from " + options.UcscMatureTrnaFastaFile + " ...");

        using (var sw = new StreamWriter(faFile, true))
        {
            foreach (var tRNA in trnas)
            {
                if (!string.IsNullOrEmpty(tRNA.Sequence))
                {
                    sw.WriteLine(">{0}", tRNA.Name);
                    sw.WriteLine("{0}", tRNA.Sequence);
                }
            }
        }
    }

    return result;
}

/// <summary>
/// Registers a reference sequence name under itself and under its alternative
/// spelling with/without the "chr" prefix, and records which mitochondrial short
/// name ("M" or "MT") the reference uses.
/// </summary>
private static void RegisterChromosomeName(Dictionary<string, string> chrNameMap, string name, ref string mitoName)
{
    chrNameMap[name] = name;

    if (name.StartsWith("chr"))
    {
        chrNameMap[name.StringAfter("chr")] = name;
    }
    else
    {
        chrNameMap["chr" + name] = name;
    }

    if (name.Equals("chrMT") || name.Equals("MT"))
    {
        mitoName = "MT";
    }

    if (name.Equals("chrM") || name.Equals("M"))
    {
        mitoName = "M";
    }
}

/// <summary>
/// Reports the number of locations of one biotype, sorts them with
/// <c>GenomeUtils.SortChromosome</c> by sequence name and start, and writes them as
/// bed lines.
/// </summary>
private void WriteSortedLocations(StreamWriter sw, BedItemFile<BedItem> bedfile, string biotype, List<BedItem> locs)
{
    Progress.SetMessage("{0} : {1}", biotype, locs.Count);

    GenomeUtils.SortChromosome(locs, m => m.Seqname, m => (int)m.Start);

    foreach (var loc in locs)
    {
        sw.WriteLine(bedfile.GetValue(loc));
    }
}

/// <summary>
/// Copies a text file line by line to the given writer.
/// </summary>
private static void AppendFileLines(string fileName, StreamWriter sw)
{
    using (var sr = new StreamReader(fileName))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            sw.WriteLine(line);
        }
    }
}