/// <summary>
/// Runs the DNA query experiment: loads the index from <c>Globals.IndexRootDir</c>,
/// loads the monkey DNA data into the in-memory meta buffer, and then queries the
/// index with sliding windows taken from every human chromosome file
/// (<c>*.dat</c>) found in the human data folder.
/// </summary>
/// <remarks>
/// Each chromosome file is read as a sequence of 32-bit integers and downsampled by
/// averaging <c>DnaDataLoader.SAMPLERATE</c> consecutive values. Windows of
/// <c>Globals.TimeSeriesLength</c> points are taken every quarter window, mean-centred,
/// and searched twice (as-is and reversed). The best match of every query is written
/// to <c>queryOutput.txt</c> under <c>Globals.IndexRootDir</c>.
/// </remarks>
public static void DnaExperiment()
{
    Util.NormalizationHandler = new Util.Normalize(Util.MeanZero_Normalization);

    // in-memory data, referenced by the index
    const string DATAFOLDER = @"K:\Datasets\DNA\Dna2Ts\Monkey_Binary";

    // load index
    Index<Meta2DataFormat> si = Index<Meta2DataFormat>.Load(Globals.IndexRootDir);

    // populate in-memory data
    DnaDataLoader.LoadDnaToMetaBuffer(DATAFOLDER);

    // generate queries (Stopwatch is monotonic, unlike DateTime.Now)
    System.Diagnostics.Stopwatch queryTimer = System.Diagnostics.Stopwatch.StartNew();
    int numQueries = 0;
    string[] humanChrs = Directory.GetFiles(@"K:\Datasets\DNA\Dna2Ts\Human_Binary", "*.dat");
    Array.Sort(humanChrs, new NaturalStringComparer());

    Dictionary<string, DnaChrResult> queryResult = new Dictionary<string, DnaChrResult>(humanChrs.Length);
    for (int chrNo = 0; chrNo < humanChrs.Length; ++chrNo)
    {
        string chrFile = humanChrs[chrNo];

        // NOTE(review): kept from the original; presumably meant to release the previous
        // chromosome's large buffers before allocating the next one.
        GC.Collect();

        using (BinaryReader br = new BinaryReader(new FileStream(chrFile, FileMode.Open, FileAccess.Read)))
        {
            List<DnaSearchResult> qResults = new List<DnaSearchResult>();

            // Number of 32-bit samples in the file. The byte length is divided by
            // sizeof(int) exactly once; dividing a second time when sizing the
            // downsampled array would cover only the first quarter of the file.
            long sampleCount = br.BaseStream.Length / sizeof(int);

            // shift by quarters; never less than 1 so the window loop always advances
            int posShift = Math.Max(1, Globals.TimeSeriesLength / 4);

            double[] dnaChr = new double[(int)Math.Floor(sampleCount / (double)DnaDataLoader.SAMPLERATE)];
            Console.WriteLine("F:{0} OrigLen:{1} newLen:{2} Shift:{3}", chrFile, sampleCount, dnaChr.Length, posShift);

            // downsample: each output point is the mean of SAMPLERATE consecutive samples
            for (int i = 0; i < dnaChr.Length; ++i)
            {
                double sum = 0;
                for (int count = 0; count < DnaDataLoader.SAMPLERATE; ++count)
                {
                    sum += br.ReadInt32();
                }
                dnaChr[i] = sum / DnaDataLoader.SAMPLERATE;
            }

            double[] ts = new double[Globals.TimeSeriesLength];

            // "<=" so that the last complete window is queried as well
            for (int pos = 0; pos <= dnaChr.Length - Globals.TimeSeriesLength; pos += posShift)
            {
                numQueries += 2;
                Array.Copy(dnaChr, pos, ts, 0, Globals.TimeSeriesLength);

                // mean-centre the query window
                double mean = Util.Mean(ts, 0, ts.Length - 1);
                for (int k = 0; k < ts.Length; ++k)
                {
                    ts[k] -= mean;
                }

                // forward query
                qResults.Add(FindBestDnaMatch(si, ts, chrNo, pos));

                // reverse query; reversing in place is safe because the buffer is
                // completely overwritten by Array.Copy on the next iteration
                Array.Reverse(ts);
                qResults.Add(FindBestDnaMatch(si, ts, chrNo, pos));
            }

            queryResult.Add(chrFile, new DnaChrResult() { results = qResults });
        }
    }

    queryTimer.Stop();
    Console.WriteLine("{0} Queries, {1} TimeElapsed.", numQueries, queryTimer.Elapsed);

    // print results
    using (StreamWriter sw = new StreamWriter(Path.Combine(Globals.IndexRootDir, "queryOutput.txt")))
    {
        foreach (KeyValuePair<string, DnaChrResult> kvp in queryResult)
        {
            foreach (DnaSearchResult sr in kvp.Value.results)
            {
                sw.WriteLine(sr.ToString());
            }
        }
    }
}

/// <summary>
/// Runs one approximate search for <paramref name="ts"/> and returns the closest
/// entry (by Euclidean distance to the normalized candidate) among the entries of
/// the term node that the search lands in.
/// </summary>
/// <param name="si">Index to query.</param>
/// <param name="ts">Mean-centred query window.</param>
/// <param name="chrNo">Index of the query chromosome file, recorded in the result.</param>
/// <param name="pos">Start position of the query window, recorded in the result.</param>
/// <returns>
/// The best match found. If the term node holds no entries, the distance stays at
/// <c>Double.MaxValue</c> and the match fields come from a default-constructed
/// <c>Meta2DataFormat</c>.
/// </returns>
private static DnaSearchResult FindBestDnaMatch(Index<Meta2DataFormat> si, double[] ts, int chrNo, int pos)
{
    TermEntry tEntry = si.ApproximateSearch(ts);
    List<Meta2DataFormat> termNodeEntries = si.ReturnDataFormatFromTermEntry(tEntry);

    double bsfDist = Double.MaxValue;
    Meta2DataFormat bsfMeta = new Meta2DataFormat();
    foreach (Meta2DataFormat m in termNodeEntries)
    {
        double dist = Util.EuclideanDistance(Util.NormalizationHandler(m.GetTimeSeries()), ts);
        if (dist < bsfDist)
        {
            bsfDist = dist;
            bsfMeta = m;
        }
    }

    return new DnaSearchResult()
    {
        dist = bsfDist,
        matchingChr = bsfMeta._chrNo,
        matchingPos = bsfMeta._pos,
        queryChr = chrNo,
        queryPos = pos,
    };
}
/// <summary>
/// Loads the DNA data from <c>_dataFolder</c> and inserts sliding windows of every
/// buffered chromosome into the index.
/// </summary>
/// <remarks>
/// Windows of <c>Globals.TimeSeriesLength</c> points are taken every <c>SHIFT</c>
/// positions from each entry of <c>Meta2DataFormat.dnaBuffer</c>. Each window is
/// mean-centred; only windows whose slope changes sign more than
/// <c>NUMSIGNCHANGE</c> times are inserted (counted in <c>processed</c>), the rest
/// are counted in <c>discarded</c>. Index entries are flushed every
/// <c>Globals.FlushTsVal</c> insertions and once more at the end.
/// </remarks>
public override void LoadIndex()
{
    // difference of the base-2 logs of the max and base SAX cardinalities, truncated to ushort
    ushort maskval = (ushort)(Math.Log(Globals.SaxMaxCard, 2) - Math.Log(Globals.SaxBaseCard, 2));
    SaxOptions opts = new SaxOptions(Util.UnsignedShortArray(Globals.SaxWordLength, maskval));

    // load dna into memory
    LoadDnaToMetaBuffer(_dataFolder);

    // iterate through each chr and insert
    for (int chrNo = 0; chrNo < Meta2DataFormat.dnaBuffer.Count; ++chrNo)
    {
        for (int pos = 0; pos <= Meta2DataFormat.dnaBuffer[chrNo].Length - Globals.TimeSeriesLength; pos += SHIFT)
        {
            IDataFormat window = new Meta2DataFormat(chrNo, pos);
            double[] series = window.GetTimeSeries();

            // the mean is taken first, then the sign changes are counted on the raw values
            double mean = Util.Mean(series, 0, series.Length - 1);
            int signChange = CountSlopeSignChanges(series);

            // normalize: shift the window to zero mean
            for (int k = 0; k < series.Length; ++k)
            {
                series[k] -= mean;
            }

            // filter: windows with too few slope sign changes are not indexed
            if (signChange <= NUMSIGNCHANGE)
            {
                discarded++;
                continue;
            }

            _si.Insert(new SaxData(window, Sax.ArrayToSaxVals(series, opts)));
            processed++;
            if (processed % Globals.FlushTsVal == 0)
            {
                _si.FlushEntries();
            }
        }

        GC.Collect();
    }

    _si.FlushEntries();
}

/// <summary>
/// Counts how many times the sign of the difference between consecutive points
/// changes along <paramref name="ts"/>.
/// </summary>
/// <param name="ts">Series to inspect; it is indexed at 0 and 1 unconditionally.</param>
/// <returns>The number of consecutive difference pairs whose signs differ.</returns>
private static int CountSlopeSignChanges(double[] ts)
{
    int flips = 0;
    double previousStep = ts[1] - ts[0];
    for (int k = 2; k < ts.Length; ++k)
    {
        double step = ts[k] - ts[k - 1];
        if (Math.Sign(previousStep) != Math.Sign(step))
        {
            ++flips;
        }
        previousStep = step;
    }
    return flips;
}