Ejemplo n.º 1
0
        /// <summary>
        /// Runs the DNA approximate-search experiment.
        /// Loads the index from <c>Globals.IndexRootDir</c>, populates the in-memory buffer from
        /// the Monkey_Binary folder, then for every <c>*.dat</c> file in the Human_Binary folder:
        /// downsamples the file by <c>DnaDataLoader.SAMPLERATE</c>, slides a window of
        /// <c>Globals.TimeSeriesLength</c> over it, and issues one approximate search for the
        /// mean-subtracted window and one for its reversal. The best match of every query is
        /// written to <c>queryOutput.txt</c> under <c>Globals.IndexRootDir</c>, and the query
        /// count and elapsed time are printed to the console.
        /// </summary>
        public static void DnaExperiment()
        {
            Util.NormalizationHandler = new Util.Normalize(Util.MeanZero_Normalization);

            // in-memory data, referenced by the index
            const string DATAFOLDER = @"K:\Datasets\DNA\Dna2Ts\Monkey_Binary";

            // load index
            Index<Meta2DataFormat> si = Index<Meta2DataFormat>.Load(Globals.IndexRootDir);

            // populate in-memory data
            DnaDataLoader.LoadDnaToMetaBuffer(DATAFOLDER);

            // generate queries; a Stopwatch is monotonic, unlike DateTime.Now differences
            System.Diagnostics.Stopwatch queryTimer = System.Diagnostics.Stopwatch.StartNew();
            int numQueries = 0;

            string[] humanChrs = Directory.GetFiles(@"K:\Datasets\DNA\Dna2Ts\Human_Binary", "*.dat");
            Array.Sort(humanChrs, new NaturalStringComparer());
            Dictionary<string, DnaChrResult> queryResult = new Dictionary<string, DnaChrResult>(humanChrs.Length);

            // Shift by quarters of the window. Clamp to 1 so the window loop always advances,
            // even when TimeSeriesLength is smaller than 4.
            int posShift = Math.Max(1, Globals.TimeSeriesLength / 4);

            for (int chrNo = 0; chrNo < humanChrs.Length; ++chrNo)
            {
                string chrFile = humanChrs[chrNo];

                // NOTE(review): explicit collection kept from the original; presumably there to
                // release the previous chromosome's buffers before allocating the next - confirm.
                GC.Collect();

                using (BinaryReader br = new BinaryReader(new FileStream(chrFile, FileMode.Open, FileAccess.Read)))
                {
                    List<DnaSearchResult> qResults = new List<DnaSearchResult>();

                    // Number of Int32 values in the file (the stream length is in bytes).
                    long fileLength = br.BaseStream.Length / sizeof(int);

                    // fileLength is already a value count, so it must not be divided by
                    // sizeof(int) a second time; doing so only covered a quarter of the file.
                    double[] dnaChr = new double[(int)Math.Floor(fileLength / (double)DnaDataLoader.SAMPLERATE)];
                    Console.WriteLine("F:{0} OrigLen:{1} newLen:{2} Shift:{3}", chrFile, fileLength, dnaChr.Length, posShift);

                    // downsample: each output value is the average of SAMPLERATE consecutive inputs
                    for (int i = 0; i < dnaChr.Length; ++i)
                    {
                        double sum = 0;
                        for (int count = 0; count < DnaDataLoader.SAMPLERATE; ++count)
                        {
                            sum += br.ReadInt32();
                        }
                        dnaChr[i] = sum / DnaDataLoader.SAMPLERATE;
                    }

                    double[] ts = new double[Globals.TimeSeriesLength];

                    // "<=" so the last full-length window is queried too.
                    for (int pos = 0; pos <= dnaChr.Length - Globals.TimeSeriesLength; pos += posShift)
                    {
                        numQueries += 2;
                        Array.Copy(dnaChr, pos, ts, 0, Globals.TimeSeriesLength);

                        // subtract the window mean
                        double mean = Util.Mean(ts, 0, ts.Length - 1);
                        for (int k = 0; k < ts.Length; ++k)
                        {
                            ts[k] = ts[k] - mean;
                        }

                        // forward query
                        qResults.Add(FindBestApproximateMatch(si, ts, chrNo, pos));

                        // reverse query; reversing in place is safe because ts is fully
                        // overwritten by Array.Copy at the start of the next iteration
                        Array.Reverse(ts);
                        qResults.Add(FindBestApproximateMatch(si, ts, chrNo, pos));
                    }

                    queryResult.Add(chrFile, new DnaChrResult()
                    {
                        results = qResults
                    });
                }
            }
            queryTimer.Stop();

            Console.WriteLine("{0} Queries, {1} TimeElapsed.", numQueries, queryTimer.Elapsed);

            // print results
            using (StreamWriter sw = new StreamWriter(Path.Combine(Globals.IndexRootDir, "queryOutput.txt")))
            {
                foreach (KeyValuePair<string, DnaChrResult> kvp in queryResult)
                {
                    foreach (DnaSearchResult sr in kvp.Value.results)
                    {
                        sw.WriteLine(sr.ToString());
                    }
                }
            }
        }

        /// <summary>
        /// Runs one approximate search for <paramref name="ts"/> and scans the entries returned
        /// for the matching terminal entry, keeping the one with the smallest Euclidean distance
        /// to the query after applying <c>Util.NormalizationHandler</c> to the candidate.
        /// </summary>
        /// <param name="si">Index to search.</param>
        /// <param name="ts">Query time series (already mean-subtracted by the caller).</param>
        /// <param name="queryChr">Query chromosome number recorded in the result.</param>
        /// <param name="queryPos">Query window start position recorded in the result.</param>
        /// <returns>
        /// The best match found. If the search returns no entries, the distance stays at
        /// <c>Double.MaxValue</c> and the match fields come from a default-constructed
        /// <c>Meta2DataFormat</c>.
        /// </returns>
        private static DnaSearchResult FindBestApproximateMatch(Index<Meta2DataFormat> si, double[] ts, int queryChr, int queryPos)
        {
            TermEntry tEntry = si.ApproximateSearch(ts);
            List<Meta2DataFormat> termNodeEntries = si.ReturnDataFormatFromTermEntry(tEntry);

            // best-so-far distance and entry
            double bsfDist = Double.MaxValue;
            Meta2DataFormat bsfMeta = new Meta2DataFormat();
            foreach (Meta2DataFormat m in termNodeEntries)
            {
                double dist = Util.EuclideanDistance(Util.NormalizationHandler(m.GetTimeSeries()), ts);
                if (dist < bsfDist)
                {
                    bsfDist = dist;
                    bsfMeta = m;
                }
            }

            return new DnaSearchResult()
            {
                dist        = bsfDist,
                matchingChr = bsfMeta._chrNo,
                matchingPos = bsfMeta._pos,
                queryChr    = queryChr,
                queryPos    = queryPos,
            };
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Populates the index from the in-memory DNA buffer.
        /// Loads the data folder into <c>Meta2DataFormat.dnaBuffer</c>, then walks every buffered
        /// chromosome in steps of <c>SHIFT</c>, taking windows of <c>Globals.TimeSeriesLength</c>.
        /// Each window is mean-subtracted; windows with more than <c>NUMSIGNCHANGE</c> direction
        /// changes are inserted into the index (counted in <c>processed</c>), all others are
        /// counted in <c>discarded</c>. Entries are flushed every <c>Globals.FlushTsVal</c>
        /// insertions and once more at the end.
        /// </summary>
        public override void LoadIndex()
        {
            // NOTE(review): the cast truncates the log difference toward zero; presumably both
            // cardinalities are powers of two so the difference is integral - confirm.
            ushort maskval = (ushort)(Math.Log(Globals.SaxMaxCard, 2) - Math.Log(Globals.SaxBaseCard, 2));
            SaxOptions saxOptions = new SaxOptions(Util.UnsignedShortArray(Globals.SaxWordLength, maskval));

            // load dna into memory
            LoadDnaToMetaBuffer(_dataFolder);

            // iterate through each chr and insert
            for (int chrNo = 0; chrNo < Meta2DataFormat.dnaBuffer.Count; ++chrNo)
            {
                for (int pos = 0; pos <= Meta2DataFormat.dnaBuffer[chrNo].Length - Globals.TimeSeriesLength; pos += SHIFT)
                {
                    IDataFormat entry = new Meta2DataFormat(chrNo, pos);
                    double[] series = entry.GetTimeSeries();

                    // the mean and the direction changes are both taken from the raw values
                    double seriesMean = Util.Mean(series, 0, series.Length - 1);
                    int directionChanges = CountSignChanges(series);

                    // normalize in place (done for every window, kept or not)
                    for (int k = 0; k < series.Length; ++k)
                    {
                        series[k] -= seriesMean;
                    }

                    // filter: windows with too few direction changes are not indexed
                    if (directionChanges <= NUMSIGNCHANGE)
                    {
                        discarded++;
                        continue;
                    }

                    _si.Insert(new SaxData(entry, Sax.ArrayToSaxVals(series, saxOptions)));
                    processed++;

                    if (processed % Globals.FlushTsVal == 0)
                    {
                        _si.FlushEntries();
                    }
                }
                GC.Collect();
            }

            _si.FlushEntries();
        }

        /// <summary>
        /// Counts how many times the sign of the difference between consecutive values changes
        /// along <paramref name="ts"/>. A zero difference has a sign distinct from both a
        /// positive and a negative one, so entering or leaving a flat step counts as a change.
        /// </summary>
        /// <param name="ts">Series to inspect; the first two elements are read unconditionally.</param>
        /// <returns>The number of sign changes between successive differences.</returns>
        private static int CountSignChanges(double[] ts)
        {
            int changes = 0;
            double previousDelta = ts[1] - ts[0];
            for (int k = 2; k < ts.Length; ++k)
            {
                double currentDelta = ts[k] - ts[k - 1];
                if (Math.Sign(previousDelta) != Math.Sign(currentDelta))
                {
                    ++changes;
                }
                previousDelta = currentDelta;
            }
            return changes;
        }