/// <summary>
/// Builds a loader from a randomly selected subset of the entries in
/// <paramref name="tsvFile"/>.
/// </summary>
/// <remarks>
/// One value is drawn from <paramref name="r"/> for every entry that is enumerated,
/// whether or not the entry ends up being kept. An entry is kept only when its draw
/// lies in the half-open interval [<paramref name="rangeLower"/>, <paramref name="rangeUpper"/>).
/// NOTE(review): the comment previously attached to this overload described selection
/// by node count / current node number for cluster distribution; the signature has no
/// such parameters, so that description has been replaced by what the code does.
/// </remarks>
/// <param name="tsvFile">Source file; also supplies the column names used for feature names.</param>
/// <param name="r">Random source that decides, per entry, whether it is loaded.</param>
/// <param name="rangeLower">Inclusive lower bound on the random draw for an entry to be kept.</param>
/// <param name="rangeUpper">Exclusive upper bound on the random draw for an entry to be kept.</param>
public TsvFileLoader(RankingTSVFile<MsnData> tsvFile, Random r, double rangeLower, double rangeUpper)
{
    DataNullProc<MsnData> nullProc = new DataNullProc<MsnData>();
    IDataEnum<MsnData, MsnData, DataNullProc<MsnData>> entries =
        new TsvDataStream<MsnData, MsnData, DataNullProc<MsnData>>(tsvFile, nullProc);

    int nextGroupId = 0;
    featureDataMatrix = new DataMatrixArray<float>();
    List<float> collectedLabels = new List<float>();
    List<int> collectedGroupIds = new List<int>();

    foreach (MsnData d in entries)
    {
        // Always consume exactly one random number per entry so the selection
        // sequence depends only on the entry order, not on what was kept.
        double draw = r.NextDouble();
        bool selected = draw >= rangeLower && draw < rangeUpper;
        if (!selected)
        {
            // Outside the requested range: skip this entry entirely.
            continue;
        }

        // Feature names are resolved once, from the first kept entry.
        if (featureNames == null)
        {
            featureNames = new string[d.Feature.NumColumns];
            for (int col = 0; col < d.Feature.NumColumns; col++)
            {
                featureNames[col] = tsvFile.ColumnNames[d.Feature.Parser.columnIndex(col)];
            }
        }

        // One label (column 0) and one group id per row of this entry.
        for (int row = 0; row < d.Labels.Data.NumRows; row++)
        {
            collectedLabels.Add(d.Labels.Data.GetValue(row, 0));
            collectedGroupIds.Add(nextGroupId);
        }

        featureDataMatrix.Add(d.Feature.Data);

        // Group ids are dense: only kept entries advance the counter.
        nextGroupId++;
    }

    // Both lists have the same length, so the resulting arrays are row-aligned.
    labels = collectedLabels.ToArray();
    groupId = collectedGroupIds.ToArray();
}
/// <summary>
/// Creates a <see cref="RankingTSVFile{T}"/> over <paramref name="tsvFileName"/>,
/// configured with the supplied parsers and group boundary.
/// </summary>
/// <param name="tsvFileName">File name passed to the <c>RankingTSVFile</c> constructor.</param>
/// <param name="metaParser">Meta parser; when null, <c>DefaultMetaParser</c> is used.</param>
/// <param name="labelParser">Label parser; when null, <c>DefaultLabelParser</c> is used.</param>
/// <param name="featureParser">
/// Feature parser; when null, the feature parser of the new <c>MsnData</c> is left untouched
/// (no default is substituted here).
/// </param>
/// <param name="groupBoundary">Group boundary; when null, <c>DefaultGroupBoundary</c> is used.</param>
/// <returns>The configured file object.</returns>
public static RankingTSVFile<MsnData> CreateTsvFile(string tsvFileName, IParser<string> metaParser, IParser<float> labelParser, IParser<float> featureParser, IGroupBoundary groupBoundary)
{
    MsnData template = new MsnData();

    // Fall back to the class defaults for any parser the caller did not supply.
    template.Meta.Parser = metaParser ?? DefaultMetaParser;
    template.Labels.Parser = labelParser ?? DefaultLabelParser;

    // Unlike meta/labels there is no fallback for features: only override when given.
    if (featureParser != null)
    {
        template.Feature.Parser = featureParser;
    }

    RankingTSVFile<MsnData> result = new RankingTSVFile<MsnData>(tsvFileName, template);
    result.GroupBoundary = groupBoundary ?? DefaultGroupBoundary;
    return result;
}
/// <summary>
/// Builds a loader from every entry enumerated from <paramref name="tsvFile"/>.
/// </summary>
/// <remarks>
/// Populates the feature names (from the first entry), the feature data matrix,
/// and the row-aligned label and group id arrays.
/// </remarks>
/// <param name="tsvFile">Source file; also supplies the column names used for feature names.</param>
public TsvFileLoader(RankingTSVFile<MsnData> tsvFile)
{
    DataNullProc<MsnData> nullProc = new DataNullProc<MsnData>();
    IDataEnum<MsnData, MsnData, DataNullProc<MsnData>> entries =
        new TsvDataStream<MsnData, MsnData, DataNullProc<MsnData>>(tsvFile, nullProc);

    int nextGroupId = 0;
    featureDataMatrix = new DataMatrixArray<float>();
    List<float> collectedLabels = new List<float>();
    List<int> collectedGroupIds = new List<int>();

    foreach (MsnData d in entries)
    {
        // Feature names are resolved once, from the first entry seen.
        if (featureNames == null)
        {
            featureNames = new string[d.Feature.NumColumns];
            for (int col = 0; col < d.Feature.NumColumns; col++)
            {
                featureNames[col] = tsvFile.ColumnNames[d.Feature.Parser.columnIndex(col)];
            }
        }

        // One label (column 0) and one group id per row of this entry.
        for (int row = 0; row < d.Labels.Data.NumRows; row++)
        {
            collectedLabels.Add(d.Labels.Data.GetValue(row, 0));
            collectedGroupIds.Add(nextGroupId);
        }

        featureDataMatrix.Add(d.Feature.Data);

        // Each enumerated entry gets the next consecutive group id.
        nextGroupId++;
    }

    // Both lists have the same length, so the resulting arrays are row-aligned.
    labels = collectedLabels.ToArray();
    groupId = collectedGroupIds.ToArray();
}