/// <summary>
/// Rebuilds the accession lookup held in <c>uniprotDB</c>: the lookup is cleared and
/// every entry of <paramref name="db"/> is registered once per accession it carries.
/// </summary>
/// <param name="db">UniProt dataset whose entries are indexed by accession.</param>
private void MakeUniprotDictionary(uniprot db)
{
    // Start from an empty lookup so entries from a previous dataset do not linger.
    uniprotDB.Clear();

    foreach (var entry in db.entry)
    {
        // An entry may list several accessions; each one maps to the same entry.
        // NOTE(review): if uniprotDB is a Dictionary, Add throws on a repeated key —
        // confirm accessions are unique across the whole dataset.
        foreach (var accessionId in entry.accession)
        {
            uniprotDB.Add(accessionId, entry);
        }
    }
}
/// <summary>
/// Sets up <c>ProteinDB</c> from data loaded from UniProt: one
/// <see cref="ChainDescriptor"/> is created for every chain of every entry.
/// </summary>
/// <remarks>
/// The first accession of an entry is used as the descriptor's accession, and the
/// chain id has the form <c>{accession}-{chainIndex}</c>, where the index is the
/// zero-based position of the chain within the entry's chains.
/// </remarks>
/// <param name="db">UniProt dataset whose entries are scanned for chains.</param>
public void SetupFromUniProt(uniprot db)
{
    Log("Building chain names from UniProt dataset");

    // The indexed Select overload supplies each chain's position directly. Looking the
    // position up with Array.IndexOf costs a linear search per chain and returns the
    // first *equal* element, which would give duplicate ids for chains that compare equal.
    var proteinDB = db.entry.SelectMany(entry =>
        entry.GetChains().Select((chain, chainIndex) => new ChainDescriptor
        {
            ChainId = string.Format("{0}-{1}", entry.accession[0], chainIndex),
            Accession = entry.accession[0],
            ChainIndex = chainIndex,
            Description = chain.description,
            LocationBegin = chain.GetFirstLocation(),
            LocationEnd = chain.GetLastLocation(),
            Length = chain.GetFeatureLength(),
            // Last element of the organism lineage, or the default value when it is empty.
            Lineage = entry.organism.lineage.LastOrDefault()
        }));

    ProteinDB = proteinDB.ToList();
}
/// <summary>
/// Collects sampling data for one cleavage-site class, using the window described by
/// <paramref name="windowString"/>, and reports progress on standard error.
/// </summary>
/// <param name="cleavageSiteName">Name of the cleavage site; used in the class label and in the progress output.</param>
/// <param name="db">UniProt dataset searched for the entry whose accessions contain the site's accession.</param>
/// <param name="cleavageSites">Tag pairs to sample; both tags of a pair are asserted (debug builds) to share one accession.</param>
/// <param name="windowString">Window in the form <c>left-right</c>, two integers separated by '-'.</param>
/// <param name="workspace">Workspace that creates the sampling data container.</param>
private static void Sample(string cleavageSiteName, uniprot db, IEnumerable<Tuple<TagInfo, TagInfo>> cleavageSites, string windowString, Workspace workspace)
{
    // Split "left-right" into its two integer halves.
    var windowParts = windowString.Split('-');
    Debug.Assert(windowParts.Length == 2, "Window string dont have 2 numbers");
    int windowLeft = int.Parse(windowParts[0]);
    int windowRight = int.Parse(windowParts[1]);

    // The container is created for the total window width and labelled "<name>|<window>".
    var samples = workspace.CreateSamplingData(windowLeft + windowRight);
    samples.Class = string.Concat(cleavageSiteName, "|", windowString);

    foreach (var site in cleavageSites)
    {
        Debug.Assert(site.Item1.Accession == site.Item2.Accession);

        // First throws if no entry lists this accession.
        var matchingEntry = db.entry.First(candidate => candidate.accession.Contains(site.Item1.Accession));
        var sequenceText = CleanText(matchingEntry.sequence.Value);

        Console.Error.WriteLine(
            "Sampling window {0}-{1} applied to {2}:{3}",
            windowLeft,
            windowRight,
            site.Item1.Accession,
            cleavageSiteName);

        SamplePositivesNegatives(samples, sequenceText, windowLeft, windowRight, site);
    }

    var positiveCount = samples.Positives.Count;
    var negativeCount = samples.Negatives.Count;
    Console.Error.WriteLine(
        "Collected {0} samples (p:{1} / n:{2})",
        positiveCount + negativeCount,
        positiveCount,
        negativeCount);
}