Example #1
0
    /// <summary>
    /// Fill dbSNP information by matching each SNPItem to a record of a dbSNP VCF file on chromosome and position.
    /// Only records whose line contains "VC=SNV" and whose REF and ALT alleles are all single characters are used.
    /// For a matched item the dbSNP ref/alt alleles and the reversed flag are filled, and after the file has been
    /// read the item name is replaced by the dbSNP name when the two differ.
    /// </summary>
    /// <param name="snpItems">
    /// Items to annotate. The sequence is enumerated several times and the items are modified in place.
    /// Item names are used as dictionary keys, so they are expected to be unique.
    /// </param>
    /// <param name="dbSnpVcfFile">Path of the dbSNP VCF file (tab-delimited text) to read.</param>
    /// <param name="progress">Optional progress callback; an EmptyProgressCallback is used when null.</param>
    /// <returns>
    /// A dictionary with one entry per input item whose key is the item name after the call
    /// (the dbSNP name when one was found) and whose value is the original item name.
    /// </returns>
    public static Dictionary<string, string> FillDbsnpIdByPosition(this IEnumerable<SNPItem> snpItems, string dbSnpVcfFile, IProgressCallback progress = null)
    {
      // original name -> name the item should carry afterwards (initially unchanged)
      var sourceDbsnpMap = snpItems.ToDictionary(m => m.Name, m => m.Name);

      if (progress == null)
      {
        progress = new EmptyProgressCallback();
      }

      // chromosome -> (position -> item)
      var dic = snpItems.ToDoubleDictionary(m => m.Chrom, m => m.Position);

      progress.SetMessage("Filling dbSNP id from {0} ...", dbSnpVcfFile);
      using (var sr = new StreamReader(dbSnpVcfFile))
      {
        progress.SetRange(0, sr.BaseStream.Length);

        // Skip the "##" meta lines. The first other line (normally the "#CHROM" column header) is left in
        // 'line' and rejected below because it does not contain "VC=SNV".
        string line;
        while ((line = sr.ReadLine()) != null)
        {
          if (!line.StartsWith("##"))
          {
            break;
          }
        }

        int linecount = 0;

        // Cache of the position map of the chromosome seen on the previous usable record.
        // chrMap is null when that chromosome has no items; lastChr is null until a first record was seen.
        // The two are always updated together so the cache can never pair one chromosome with the
        // (missing) map of another.
        Dictionary<int, SNPItem> chrMap = null;
        int? lastChr = null;

        for (; line != null; line = sr.ReadLine())
        {
          linecount++;

          if (linecount % 10000 == 0)
          {
            progress.SetPosition(sr.GetCharpos());
          }

          //make sure it is SNV
          if (!line.Contains("VC=SNV"))
          {
            continue;
          }

          //Even it marked as SNV, it still could be insertion/deletion
          //2       179658175       rs11537855      C       CC,CT   .       .       RS=11537855;RSPOS=179658175;dbSNPBuildID=120;SSR=0;SAO=0;VP=0x050100001205000002000110;GENEINFO=TTN:7273;WGT=1;VC=SNV;SLO;NSF;REF;ASP;OTHERKG;NOC
          var parts = line.Split('\t');
          if (parts[3].Split(',').Any(l => l.Length != 1))
          {
            continue;
          }

          if (parts[4].Split(',').Any(l => l.Length != 1))
          {
            continue;
          }

          var chr = HumanChromosomeToInt(parts[0]);
          // VCF positions are plain digits; parse them independently of the thread culture.
          var position = int.Parse(parts[1], System.Globalization.CultureInfo.InvariantCulture);

          if (lastChr != chr)
          {
            lastChr = chr;
            if (!dic.TryGetValue(chr, out chrMap))
            {
              chrMap = null;
            }
          }

          if (chrMap == null)
          {
            // no item on this chromosome
            continue;
          }

          SNPItem source;
          if (!chrMap.TryGetValue(position, out source))
          {
            continue;
          }

          if (!source.Name.Equals(parts[2]))
          {
            // Renaming is deferred until the file has been read; only remember the new name here.
            sourceDbsnpMap[source.Name] = parts[2];
          }

          // Both allele columns were verified above to hold single-character alleles only;
          // when several ALT alleles are listed the first one is stored.
          source.DbsnpRefAllele = parts[3][0];
          source.DbsnpAltAllele = parts[4][0];
          source.DbsnpIsReversed = parts[7].Contains(";RV;");
        }
      }

      // Apply the collected renames and build the (name after call -> original name) map.
      var snpMap = snpItems.ToDictionary(m => m.Name);
      var result = new Dictionary<string, string>();
      foreach (var r in sourceDbsnpMap)
      {
        result[r.Value] = r.Key;
        if (!r.Key.Equals(r.Value))
        {
          snpMap[r.Key].Name = r.Value;
        }
      }

      progress.SetMessage("Filling dbSNP id finished.");
      return result;
    }
Example #2
0
    /// <summary>
    /// Fill G1000Allele1, G1000Allele2 and G1000Allele2Frequency of each SNPItem from a 1000 Genomes VCF file.
    /// A VCF record is assigned to the item that has the same key as a temporary SNPItem built from the
    /// record's chromosome, position and id columns.
    /// </summary>
    /// <param name="snpItems">Items to annotate; they are modified in place. Keys produced by keyFunc are used as dictionary keys, so they are expected to be unique.</param>
    /// <param name="g1000VcfFile">Path of the 1000 Genomes VCF file (tab-delimited text) to read.</param>
    /// <param name="keyFunc">Computes the lookup key of an item; it is called for the input items and for the temporary item built from each VCF record (only Chrom, Position and Name are set on the latter).</param>
    /// <param name="progress">Optional progress callback; a ConsoleProgressCallback is used when null.</param>
    private static void DoFillAllele2FrequencyFrom1000Gome(this IEnumerable<SNPItem> snpItems, string g1000VcfFile, Func<SNPItem, string> keyFunc, IProgressCallback progress = null)
    {
      if (progress == null)
      {
        progress = new ConsoleProgressCallback();
      }

      var dic = snpItems.ToDictionary(m => keyFunc(m));

      progress.SetMessage("Filling MAF from {0} ...", g1000VcfFile);
      using (var sr = new StreamReader(g1000VcfFile))
      {
        progress.SetRange(0, sr.BaseStream.Length);

        // Skip the "##" meta lines. The first other line (normally the "#CHROM" column header) is consumed
        // here as well and never parsed, because the loop below starts with a fresh ReadLine.
        string line;
        while ((line = sr.ReadLine()) != null)
        {
          if (!line.StartsWith("##"))
          {
            break;
          }
        }

        // VCF is a machine-readable format: numbers always use '.' as decimal separator,
        // so they must not be parsed with the culture of the current thread.
        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        int linecount = 0;
        while ((line = sr.ReadLine()) != null)
        {
          linecount++;

          if (linecount % 10000 == 0)
          {
            progress.SetPosition(sr.GetCharpos());
          }

          var parts = line.Split('\t');

          // Temporary item that only exists to compute the lookup key of this record.
          var snp = new SNPItem()
          {
            Chrom = HumanChromosomeToInt(parts[0]),
            Position = int.Parse(parts[1], invariant),
            Name = parts[2]
          };

          SNPItem loc;
          if (!dic.TryGetValue(keyFunc(snp), out loc))
          {
            continue;
          }

          loc.G1000Allele1 = parts[3][0];
          var allele2 = parts[4].Split(',');
          // NOTE(review): StringAfter/StringBefore are presumably helpers returning the text between
          // "AF=" and the following ';' of the INFO column, i.e. one frequency per ALT allele - confirm.
          var frequencies = parts[7].StringAfter("AF=").StringBefore(";").Split(',');
          bool bFound = false;
          for (int i = 0; i < allele2.Length; i++)
          {
            // only single-base alternative alleles are candidates
            if (allele2[i].Length != 1)
            {
              continue;
            }

            // Try this alternative allele; the first one accepted by IsSourceAllelesMatchedWithG1000 wins.
            loc.G1000Allele2 = allele2[i][0];
            if (loc.IsSourceAllelesMatchedWithG1000())
            {
              loc.G1000Allele2Frequency = double.Parse(frequencies[i], invariant);
              bFound = true;
              break;
            }
          }

          if (!bFound)
          {
            // No alternative allele was accepted: reset the fields touched above.
            loc.G1000Allele1 = ' ';
            loc.G1000Allele2 = ' ';
            loc.G1000Allele2Frequency = 0.0;
          }
        }
      }
      progress.SetMessage("Filling MAF finished.");
    }