/// <summary>
/// Given the auxiliary data in a bunch of parts, set the concatenated dataset appropriately.
/// For every auxiliary data name that appears in at least one part, the per-part arrays are
/// concatenated via <c>ConcatArrays</c> and stored on <paramref name="concat"/> under that name.
/// </summary>
/// <param name="parts">The individual parts of the dataset</param>
/// <param name="concat">The concatenated version of this dataset</param>
/// <remarks>
/// Throws (via <c>Contracts.Except</c>) when the parts that carry data for a name disagree on
/// whether that data is query level or document level.
/// </remarks>
private static void SetConcatenatedAuxiliaryData(DatasetSkeleton[] parts, DatasetSkeleton concat)
{
    // Get the union of all the auxiliary data names.
    HashSet<string> auxNames = new HashSet<string>();
    foreach (DatasetSkeleton part in parts)
    {
        auxNames.UnionWith(part.AuxiliaryData.Keys);
    }

    // Scratch buffer, reused for every name: slot p holds part p's entry for the current name.
    DatasetSkeletonQueryDocData[] partsDatas = new DatasetSkeletonQueryDocData[parts.Length];
    int[] docLengths = parts.Select(x => x.NumDocs).ToArray();
    int[] queryLengths = parts.Select(x => x.NumQueries).ToArray();

    foreach (string name in auxNames)
    {
        for (int p = 0; p < parts.Length; ++p)
        {
            // Single lookup: when the part lacks this name, TryGetValue writes
            // default(DatasetSkeletonQueryDocData) into the slot, overwriting whatever
            // the previous name left there.
            parts[p].AuxiliaryData.TryGetValue(name, out partsDatas[p]);
        }

        // The level (query vs. doc) is taken from the first part that actually has data.
        // First(...) throws InvalidOperationException if no part has non-null data for this name.
        bool isQuery = partsDatas.First(pd => pd.Data != null).IsQueryLevel;
        if (partsDatas.Any(pd => pd.Data != null && pd.IsQueryLevel != isQuery))
        {
            throw Contracts.Except("On auxiliary data {0}, disagreement on whether this is query/doc", name);
        }

        // Parts without data contribute a null array; the expected per-part lengths are passed
        // alongside so ConcatArrays has the size of every part's segment.
        Array[] partArrays = partsDatas.Select(pd => pd.Data).ToArray();
        Array concatArray = ConcatArrays(partArrays, isQuery ? queryLengths : docLengths, name);
        concat.SetData(name, concatArray, isQuery);
    }
}
/// <summary>
/// Splits this dataset into several parts. The partitioning itself is delegated to
/// <c>GetAssignments</c>; this method then builds, for each part, the boundaries, ratings,
/// query ids, document ids and auxiliary data that belong to it.
/// </summary>
/// <param name="fraction">Forwarded unchanged to <c>GetAssignments</c> (presumably the relative size of each part; confirm against that method).</param>
/// <param name="randomSeed">Forwarded unchanged to <c>GetAssignments</c>.</param>
/// <param name="assignment">
/// Produced by <c>GetAssignments</c>. For each part, the indices used here to pick that part's
/// entries out of <c>Ratings</c>, <c>DocIds</c> and any document-level auxiliary data.
/// </param>
/// <returns>One <c>DatasetSkeleton</c> per part.</returns>
public DatasetSkeleton[] Split(double[] fraction, int randomSeed, out int[][] assignment)
{
    // Per part: the query indices assigned to it. The out parameter receives the
    // matching per-part indices used for document-level arrays.
    int[][] queryAssignment = GetAssignments(fraction, randomSeed, out assignment);
    int partCount = queryAssignment.Length;

    // Rebuild the boundary array of every part: a running sum of the document
    // counts (Boundaries[q + 1] - Boundaries[q]) of the queries placed in that part.
    int[][] partBoundaries = queryAssignment.Select(qs => new int[qs.Length + 1]).ToArray(partCount);
    for (int part = 0; part < partCount; ++part)
    {
        int[] partQueries = queryAssignment[part];
        int[] offsets = partBoundaries[part];
        offsets[0] = 0;
        for (int i = 0; i < partQueries.Length; ++i)
        {
            int query = partQueries[i];
            offsets[i + 1] = offsets[i] + Boundaries[query + 1] - Boundaries[query];
        }
    }

    // Gather ratings, query ids and doc ids for each part.
    short[][] partRatings = new short[partCount][];
    ulong[][] partQueryIds = new ulong[partCount][];
    ulong[][] partDocIds = new ulong[partCount][];
    for (int part = 0; part < partCount; ++part)
    {
        // Copy to a local: an out parameter cannot be referenced from inside a lambda.
        int[] partDocs = assignment[part];
        partRatings[part] = partDocs.Select(doc => Ratings[doc]).ToArray();
        partQueryIds[part] = queryAssignment[part].Select(query => QueryIds[query]).ToArray();
        partDocIds[part] = partDocs.Select(doc => DocIds[doc]).ToArray();
    }

    // Wrap each part's arrays in its own DatasetSkeleton.
    DatasetSkeleton[] result = Enumerable.Range(0, partCount)
        .Select(part => new DatasetSkeleton(partRatings[part], partBoundaries[part], partQueryIds[part], partDocIds[part]))
        .ToArray(partCount);

    // Carry the auxiliary data over, element by element, using the query-level or the
    // document-level index mapping depending on the level of each auxiliary array.
    foreach (KeyValuePair<string, DatasetSkeletonQueryDocData> entry in AuxiliaryData)
    {
        string auxName = entry.Key;
        DatasetSkeletonQueryDocData original = entry.Value;
        Type elementType = original.Data.GetType().GetElementType();
        int[][] mappings = original.IsQueryLevel ? queryAssignment : assignment;

        for (int part = 0; part < partCount; ++part)
        {
            int[] mapping = mappings[part];
            Array subset = Array.CreateInstance(elementType, mapping.Length);
            for (int i = 0; i < mapping.Length; ++i)
            {
                subset.SetValue(original.Data.GetValue(mapping[i]), i);
            }

            result[part].SetData(auxName, subset, original.IsQueryLevel);
        }
    }

    return result;
}
/// <summary>
/// Creates a new <c>DatasetSkeletonQueryDocData</c> that keeps this instance's
/// <c>IsQueryLevel</c> flag and whose <c>Data</c> holds the elements of this instance's
/// <c>Data</c> found at the given indices, in the order the indices are listed.
/// </summary>
/// <param name="docArray">Indices into <c>Data</c> of the elements to copy.</param>
/// <returns>The newly created subset; this instance is not modified.</returns>
public DatasetSkeletonQueryDocData GetSubset(int[] docArray)
{
    // The new array has the same element type as the source array.
    Type elementType = Data.GetType().GetElementType();
    Array picked = Array.CreateInstance(elementType, docArray.Length);

    int target = 0;
    foreach (int sourceIndex in docArray)
    {
        picked.SetValue(Data.GetValue(sourceIndex), target);
        ++target;
    }

    return new DatasetSkeletonQueryDocData
    {
        IsQueryLevel = IsQueryLevel,
        Data = picked
    };
}