Example no. 1
0
        /// <summary>
        /// Initializes a new instance of the <see cref="Dataset"/> class.
        /// </summary>
        /// <param name="datasetSkeleton">The dataset skeleton corresponding to the features</param>
        /// <param name="flocks">An array of feature flocks; every flock must have one entry per document</param>
        public Dataset(DatasetSkeleton datasetSkeleton, FeatureFlockBase[] flocks)
        {
            Contracts.AssertValue(datasetSkeleton);
            Contracts.AssertValue(flocks);
            Contracts.Assert(flocks.All(f => f.Examples == datasetSkeleton.NumDocs));

            _datasetSkeleton = datasetSkeleton;
            _maxDocsPerQuery = -1;
            _flocks          = flocks;

            // Prefix sum of flock sizes: _flockToFirstFeature[i] holds the global
            // index of the first feature belonging to flock i.
            _flockToFirstFeature = new int[_flocks.Length];
            if (_flocks.Length == 0)
            {
                _featureToFlock = new int[0];
                return;
            }

            for (int flock = 1; flock < _flocks.Length; ++flock)
            {
                Contracts.AssertValue(_flocks[flock - 1]);
                _flockToFirstFeature[flock] = _flockToFirstFeature[flock - 1] + _flocks[flock - 1].Count;
            }

            var finalFlock = _flocks[_flocks.Length - 1];
            Contracts.AssertValue(finalFlock);
            int totalFeatures = _flockToFirstFeature[_flockToFirstFeature.Length - 1] + finalFlock.Count;
            Contracts.Assert(totalFeatures == _flocks.Sum(f => f.Count));

            // Inverse map: for every global feature index, record the owning flock.
            _featureToFlock = new int[totalFeatures];
            for (int flock = 0; flock < _flockToFirstFeature.Length; ++flock)
            {
                int first = _flockToFirstFeature[flock];
                int pastLast = first + _flocks[flock].Count;
                for (int feature = first; feature < pastLast; ++feature)
                {
                    _featureToFlock[feature] = flock;
                }
            }
        }
Example no. 2
0
            /// <summary>
            /// Takes an array of DatasetSkeleton objects and concatenates them into one big DatasetSkeleton
            /// </summary>
            /// <param name="parts">An array of DatasetSkeletons; must be non-null with non-null elements</param>
            /// <returns>A concatenated DatasetSkeleton</returns>
            public static DatasetSkeleton Concat(DatasetSkeleton[] parts)
            {
                // Validate up front, consistent with the Contracts usage elsewhere in this file.
                Contracts.AssertValue(parts);
                Contracts.Assert(parts.All(p => p != null));

                int concatNumDocs    = parts.Sum(x => x.NumDocs);
                int concatNumQueries = parts.Sum(x => x.NumQueries);

                // Allocate the concatenated storage up front.
                short[] concatRatings    = new short[concatNumDocs];
                ulong[] concatDocIds     = new ulong[concatNumDocs];
                ulong[] concatQueryIds   = new ulong[concatNumQueries];
                int[]   concatBoundaries = new int[concatNumQueries + 1];

                // Copy each part into its slot, tracking the running doc/query offsets.
                int docBegin   = 0;
                int queryBegin = 0;

                for (int p = 0; p < parts.Length; ++p)
                {
                    int numDocs    = parts[p].NumDocs;
                    int numQueries = parts[p].NumQueries;
                    Array.Copy(parts[p].Ratings, 0, concatRatings, docBegin, numDocs);
                    Array.Copy(parts[p].DocIds, 0, concatDocIds, docBegin, numDocs);
                    Array.Copy(parts[p].QueryIds, 0, concatQueryIds, queryBegin, numQueries);

                    // Boundaries are doc offsets, so each part's boundaries must be
                    // shifted by the number of docs copied so far.
                    for (int q = 0; q < numQueries; ++q)
                    {
                        concatBoundaries[queryBegin + q] = parts[p].Boundaries[q] + docBegin;
                    }
                    docBegin   += numDocs;
                    queryBegin += numQueries;
                }
                // Closing boundary: one past the last document.
                concatBoundaries[queryBegin] = docBegin;

                DatasetSkeleton skel = new DatasetSkeleton(concatRatings, concatBoundaries, concatQueryIds, concatDocIds);

                // Merge the per-part auxiliary data into the concatenated skeleton.
                SetConcatenatedAuxiliaryData(parts, skel);
                return skel;
            }
Example no. 3
0
            /// <summary>
            /// Given the auxiliary data in a bunch of parts, set the concatenated dataset appropriately.
            /// </summary>
            /// <param name="parts">The individual parts of the dataset</param>
            /// <param name="concat">The concatenated version of this dataset</param>
            private static void SetConcatenatedAuxiliaryData(DatasetSkeleton[] parts, DatasetSkeleton concat)
            {
                // Get the union of all the auxiliary data names across the parts.
                // (A HashSet is the idiomatic set type; the original used a
                // Dictionary<string, bool> with dummy values.)
                HashSet<string> auxNames = new HashSet<string>();

                foreach (DatasetSkeleton part in parts)
                {
                    foreach (string name in part.AuxiliaryData.Keys)
                    {
                        auxNames.Add(name);
                    }
                }
                DatasetSkeletonQueryDocData[] partsDatas = new DatasetSkeletonQueryDocData[parts.Length];
                int[] docLengths   = parts.Select(x => x.NumDocs).ToArray();
                int[] queryLengths = parts.Select(x => x.NumQueries).ToArray();
                foreach (string name in auxNames)
                {
                    // Gather this auxiliary entry from every part. TryGetValue avoids the
                    // ContainsKey + indexer double lookup and leaves the default
                    // (null-Data) entry for parts that lack the name.
                    for (int p = 0; p < parts.Length; ++p)
                    {
                        parts[p].AuxiliaryData.TryGetValue(name, out partsDatas[p]);
                    }
                    // All parts that carry data for this name must agree on whether it is
                    // query-level or doc-level. NOTE(review): First(...) throws if every
                    // part's Data is null for this name; assumed unreachable because the
                    // name was taken from some part's AuxiliaryData -- confirm.
                    bool isQuery = partsDatas.First(pd => pd.Data != null).IsQueryLevel;
                    if (partsDatas.Any(pd => pd.Data != null && pd.IsQueryLevel != isQuery))
                    {
                        throw Contracts.Except("On auxiliary data {0}, disagreement on whether this is query/doc", name);
                    }
                    Array concatArray = ConcatArrays(partsDatas.Select(pd => pd.Data).ToArray(), isQuery ? queryLengths : docLengths, name);
                    concat.SetData(name, concatArray, isQuery);
                }
            }
Example no. 4
0
        // Builds a new Dataset restricted to the given documents: a new skeleton
        // (ratings, boundaries, query/doc ids), per-flock feature subsets built in
        // parallel, filtered dupe ids, and subsetted auxiliary data.
        // NOTE(review): the #if matching the #endif below is above this excerpt; an
        // alternate signature likely exists under the other branch -- confirm.
        public Dataset GetSubDataset(int[] docIndices, bool destroyThisDataset, FileObjectStore <IntArrayFormatter> newBinsCache)
        {
#endif
            // Map each selected doc to its query. The boundary computation below
            // assumes equal query indices are contiguous, i.e. docIndices is ordered
            // by query -- TODO confirm against callers.
            int[]   queryIndices   = docIndices.Select(d => DocToQuery[d]).ToArray();
            ulong[] uniqueQueryIds = queryIndices.Distinct().Select(q => QueryIds[q]).ToArray();

            // calculate boundaries
            // boundaries[k] is the index (into docIndices) of the first doc of the
            // k-th distinct query; the final entry closes the last query.
            int[] boundaries = new int[uniqueQueryIds.Length + 1];
            boundaries[0] = 0;
            int queryIndex = 1;
            for (int q = 1; q < queryIndices.Length; ++q)
            {
                if (queryIndices[q] != queryIndices[q - 1])
                {
                    boundaries[queryIndex++] = q;
                }
            }
            boundaries[uniqueQueryIds.Length] = queryIndices.Length;

            // construct skeleton
            DatasetSkeleton datasetSkeleton = new DatasetSkeleton(docIndices.Select(d => Ratings[d]).ToArray(),
                                                                  boundaries,
                                                                  uniqueQueryIds,
                                                                  docIndices.Select(d => DocIds[d]).ToArray());

            // create features
            // Each flock is subsetted independently on the shared thread pool; the
            // worker chosen depends on whether the NO_STORE build flag is set.
            FeatureFlockBase[] features   = new FeatureFlockBase[NumFlocks];
            int[][]            assignment = new int[][] { docIndices };
            Parallel.For(0, NumFlocks, new ParallelOptions {
                MaxDegreeOfParallelism = BlockingThreadPool.NumThreads
            },
                         (int flockIndex) =>
            {
#if !NO_STORE
                GetSubDataset_ThreadWorker(features, flockIndex, assignment, destroyThisDataset, newBinsCache);
#else
                GetSubDatasetThreadWorker(features, flockIndex, assignment, destroyThisDataset);
#endif
            });

            uint[] filteredDupeIds = null;

            // Filter the dupe ids, if any
            // (copy DupeIds to a local so the lambda does not capture `this`).
            if (DupeIds != null)
            {
                uint[] dupeIds = DupeIds;
                filteredDupeIds = docIndices.Select(i => dupeIds[i]).ToArray();
            }

            // auxiliary data
            // Query-level entries are subsetted by the distinct query indices,
            // doc-level entries by the selected doc indices.
            Dictionary <string, DatasetSkeletonQueryDocData> auxData    = _datasetSkeleton.AuxiliaryData;
            Dictionary <string, DatasetSkeletonQueryDocData> newAuxData = new Dictionary <string, DatasetSkeletonQueryDocData>();
            foreach (KeyValuePair <string, DatasetSkeletonQueryDocData> pair in auxData)
            {
                newAuxData[pair.Key] = pair.Value.GetSubset(pair.Value.IsQueryLevel ? queryIndices.Distinct().ToArray() : docIndices);
            }
            datasetSkeleton.AuxiliaryData = newAuxData;

            // create new Dataset
            Dataset dataset = new Dataset(datasetSkeleton, features);
            dataset.DupeIds = filteredDupeIds;
            return(dataset);
        }