예제 #1
0
        /// <summary>
        /// Generates a catalog file and a usage file, parses both, and then verifies that the SAR trainer
        /// returns a non-null model for every settings instance yielded by <c>GetAllModelTrainingParameters</c>.
        /// </summary>
        public void TrainModelWithRangeOfPossibleParametersTest()
        {
            const string baseFolder = nameof(TrainModelWithRangeOfPossibleParametersTest);
            Directory.CreateDirectory(baseFolder);

            var filesGenerator = new ModelTrainingFilesGenerator();

            // index maps that are filled by the parsers; the item map is shared by both parsers
            var itemIndexMap = new ConcurrentDictionary<string, uint>();
            var userIndexMap = new ConcurrentDictionary<string, uint>();

            // generate the catalog file and parse it
            string catalogPath = Path.Combine(baseFolder, "catalog.csv");
            filesGenerator.CreateCatalogFile(catalogPath);

            IList<SarCatalogItem> parsedCatalogItems;
            string[] parsedFeatureNames;
            var catalogReport = new CatalogFileParser(0, itemIndexMap)
                .ParseCatalogFile(catalogPath, CancellationToken.None, out parsedCatalogItems, out parsedFeatureNames);
            Assert.IsTrue(catalogReport.IsCompletedSuccessfuly);

            // generate a usage file with 10000 as the size argument and parse the usage folder
            string usageFolder = Path.Combine(baseFolder, "usage");
            Directory.CreateDirectory(usageFolder);
            filesGenerator.CreateUsageFile(Path.Combine(usageFolder, "usage.csv"), 10000);

            IList<SarUsageEvent> parsedUsageEvents;
            var usageReport = new UsageEventsFilesParser(itemIndexMap, userIndexMap)
                .ParseUsageEventFiles(usageFolder, CancellationToken.None, out parsedUsageEvents);
            Assert.IsTrue(usageReport.IsCompletedSuccessfuly);

            // train once per settings instance; the running index only serves the failure message
            var trainer = new SarTrainer();
            int settingsIndex = 0;
            foreach (IModelTrainerSettings settings in GetAllModelTrainingParameters())
            {
                IDictionary<string, double> featureWeights;
                IPredictorModel trainedModel = trainer.Train(
                    settings,
                    parsedUsageEvents,
                    parsedCatalogItems,
                    parsedFeatureNames,
                    userIndexMap.Count,
                    itemIndexMap.Count,
                    out featureWeights);

                Assert.IsNotNull(trainedModel, $"Expected training to complete successfully when using settings#{settingsIndex}: {settings}");
                settingsIndex++;
            }
        }
        /// <summary>
        /// Trains a model using the input files: parses the (optional) catalog file and the usage event files,
        /// optionally parses evaluation usage files, trains a SAR model, optionally evaluates it, and finally
        /// waits for the asynchronous storing of user history (if it was started) to complete.
        /// </summary>
        /// <param name="settings">The trainer settings</param>
        /// <param name="workFolderPath">A temp work folder for storing intermediate files</param>
        /// <param name="usageFolderPath">The path to the folder of usage files</param>
        /// <param name="catalogFilePath">The path to the catalog file (catalog parsing is skipped if empty or if the file does not exist)</param>
        /// <param name="evaluationFolderPath">The path to the evaluation files folder (optional)</param>
        /// <param name="cancellationToken">A cancellation token used to abort the training</param>
        /// <returns>
        /// The training result. When catalog or usage parsing fails (or yields nothing) the result is returned
        /// early with only the parsing reports and a completion message set, and without a model.
        /// </returns>
        /// <exception cref="OperationCanceledException">Thrown if cancellation was requested</exception>
        /// <exception cref="Exception">Thrown if storing the user history failed; wraps the original <see cref="AggregateException"/></exception>
        private ModelTrainResult TrainModelInternal(IModelTrainerSettings settings, string workFolderPath, string usageFolderPath,
                                                    string catalogFilePath, string evaluationFolderPath, CancellationToken cancellationToken)
        {
            var duration = ModelTraininigDuration.Start();
            var result   = new ModelTrainResult {
                Duration = duration
            };

            var userIdsIndexMap = new ConcurrentDictionary<string, uint>();
            var itemIdsIndexMap = new ConcurrentDictionary<string, uint>();

            // parse the catalog file
            IList<SarCatalogItem> catalogItems = null;

            string[] catalogFeatureNames = null;
            if (!string.IsNullOrWhiteSpace(catalogFilePath) && File.Exists(catalogFilePath))
            {
                // report progress
                _progressMessageReportDelegate("Parsing Catalog File");

                // create a catalog file parser
                var catalogParser = new CatalogFileParser(MaximumParsingErrorsCount, itemIdsIndexMap, _tracer);

                // parse the catalog file
                result.CatalogFilesParsingReport = catalogParser.ParseCatalogFile(catalogFilePath, cancellationToken,
                                                                                  out catalogItems, out catalogFeatureNames);

                // record the catalog parsing duration
                duration.SetCatalogParsingDuration();
                _tracer.TraceInformation($"Catalog parsing completed in {duration.CatalogParsingDuration.TotalMinutes} minutes");

                // get the catalog items count
                result.CatalogItemsCount = catalogItems.Count;

                // fail the training if parsing had failed or yielded no items
                if (!result.CatalogFilesParsingReport.IsCompletedSuccessfuly || !catalogItems.Any())
                {
                    result.CompletionMessage = "Failed to parse catalog file or parsing found no valid items";
                    _tracer.TraceInformation(result.CompletionMessage);
                    return result;
                }

                // clear the catalog items list if it's not used anymore
                // (the list is emptied but stays non-null, so the usage parser below still sees that a catalog was provided)
                if (!settings.EnableColdItemPlacement)
                {
                    catalogItems.Clear();
                }
            }

            // report progress
            _progressMessageReportDelegate("Parsing Usage Events Files");

            // create a usage events files parser that skips events of unknown item ids (if catalog was provided)
            var usageEventsParser = new UsageEventsFilesParser(itemIdsIndexMap, userIdsIndexMap,
                                                               MaximumParsingErrorsCount, catalogItems != null, _tracer);

            _tracer.TraceInformation("Parsing the usage event files");
            IList<SarUsageEvent> usageEvents;

            result.UsageFilesParsingReport =
                usageEventsParser.ParseUsageEventFiles(usageFolderPath, cancellationToken, out usageEvents);

            // record the usage files parsing duration
            duration.SetUsageFilesParsingDuration();
            _tracer.TraceInformation($"Usage file(s) parsing completed in {duration.UsageFilesParsingDuration.TotalMinutes} minutes");

            // fail the training if parsing had failed or yielded no events
            if (!result.UsageFilesParsingReport.IsCompletedSuccessfuly || !usageEvents.Any())
            {
                result.CompletionMessage = "Failed to parse usage file(s) or parsing found no valid items";
                _tracer.TraceInformation(result.CompletionMessage);
                return result;
            }

            _tracer.TraceInformation($"Found {userIdsIndexMap.Count} unique users");
            result.UniqueUsersCount = userIdsIndexMap.Count;

            _tracer.TraceInformation($"Found {itemIdsIndexMap.Count} unique items");
            result.UniqueItemsCount = usageEvents.Select(x => x.ItemId).Distinct().Count();

            // snapshot the item ids (ordered by their index) before the index maps are cleared further below
            _tracer.TraceInformation("Extracting the indexed item ids from the item index map");
            string[] itemIdsIndex = itemIdsIndexMap.OrderBy(kvp => kvp.Value).Select(kvp => kvp.Key).ToArray();

            _tracer.TraceInformation($"Sorting the usage events based on the cooccurrenceUnit unit ({settings.CooccurrenceUnit})");
            switch (settings.CooccurrenceUnit)
            {
            case CooccurrenceUnit.User:
                usageEvents = usageEvents.OrderBy(x => x.UserId).ToArray();
                break;

            case CooccurrenceUnit.Timestamp:
                usageEvents = usageEvents.OrderBy(x => x.Timestamp).ThenBy(x => x.UserId).ToArray();
                break;
            }

            _tracer.TraceInformation("Finished sorting usage events.");

            Stopwatch storeUserHistoryDuration = null;
            Task      storeUserHistoryTask     = null;

            if (settings.EnableUserToItemRecommendations && _userHistoryStore != null)
            {
                storeUserHistoryDuration = Stopwatch.StartNew();
                _tracer.TraceInformation($"Extracting the indexed user ids from the user index map ({userIdsIndexMap.Count:N} users)");
                string[] userIdsIndex = userIdsIndexMap.OrderBy(kvp => kvp.Value).Select(kvp => kvp.Key).ToArray();

                // the storing runs in the background while training proceeds; it is awaited at the end of this method
                _tracer.TraceInformation($"Asynchronously starting to store usage events per user (total of {usageEvents.Count:N} items)");
                storeUserHistoryTask = Task.Run(() =>
                                                _userHistoryStore.StoreUserHistoryEventsAsync(usageEvents, userIdsIndex, cancellationToken), cancellationToken);
            }

            // if provided, parse the evaluation usage event files
            int    evaluationUsageEventsCount          = 0;
            string parsedEvaluationUsageEventsFilePath = null;

            if (!string.IsNullOrWhiteSpace(evaluationFolderPath) && Directory.Exists(evaluationFolderPath))
            {
                // report progress
                _progressMessageReportDelegate("Parsing Evaluation Usage Events Files");

                _tracer.TraceInformation("Parsing the evaluation usage event files");
                IList<SarUsageEvent> evaluationUsageEvents;
                result.EvaluationFilesParsingReport = usageEventsParser.ParseUsageEventFiles(evaluationFolderPath,
                                                                                             cancellationToken, out evaluationUsageEvents);

                if (result.EvaluationFilesParsingReport.IsCompletedSuccessfuly)
                {
                    // set the evaluation usage events count
                    evaluationUsageEventsCount = evaluationUsageEvents.Count;

                    _tracer.TraceInformation("Storing the parsed usage events for evaluation to reduce memory print");

                    // Path.GetTempFileName() returns an absolute path (and creates a file in the system temp folder),
                    // so combining it with the work folder would silently ignore the work folder. Use a random file
                    // name inside the work folder instead; fall back to a system temp file only if no work folder was given.
                    if (string.IsNullOrWhiteSpace(workFolderPath))
                    {
                        parsedEvaluationUsageEventsFilePath = Path.GetTempFileName();
                    }
                    else
                    {
                        Directory.CreateDirectory(workFolderPath);
                        parsedEvaluationUsageEventsFilePath = Path.Combine(workFolderPath, Path.GetRandomFileName());
                    }

                    // one JSON serialized usage event per line
                    File.WriteAllLines(parsedEvaluationUsageEventsFilePath,
                                       evaluationUsageEvents.Select(JsonConvert.SerializeObject));
                }
                else
                {
                    _tracer.TraceWarning("Skipping model evaluation as it failed to parse evaluation usage files.");
                }

                // record the evaluation usage files parsing duration
                duration.SetEvaluationUsageFilesParsingDuration();
                _tracer.TraceInformation($"Evaluation usage file(s) parsing completed in {duration.EvaluationUsageFilesParsingDuration.TotalMinutes} minutes");
            }

            // clear the indices maps as they are no longer needed
            userIdsIndexMap.Clear();
            itemIdsIndexMap.Clear();

            cancellationToken.ThrowIfCancellationRequested();

            // report progress
            _progressMessageReportDelegate("Core Training");

            _tracer.TraceInformation("Training a new model using SAR trainer");
            IDictionary<string, double> catalogFeatureWeights;
            var             sarTrainer = new SarTrainer(_tracer);
            IPredictorModel sarModel   = sarTrainer.Train(settings, usageEvents, catalogItems, catalogFeatureNames, result.UniqueUsersCount,
                                                          result.CatalogItemsCount ?? result.UniqueItemsCount, out catalogFeatureWeights, cancellationToken);

            _tracer.TraceInformation("SAR training was completed.");

            // create the trained model properties
            // NOTE(review): the same parser instance was also used for the evaluation files above; if
            // MostRecentEventTimestamp is updated by every parse call, ReferenceDate may reflect evaluation
            // events as well - confirm against UsageEventsFilesParser.
            var modelProperties = new ModelProperties
            {
                IncludeHistory     = settings.AllowSeedItemsInRecommendations,
                EnableUserAffinity = settings.EnableUserAffinity,
                IsUserToItemRecommendationsSupported = settings.EnableUserToItemRecommendations,
                Decay            = TimeSpan.FromDays(settings.DecayPeriodInDays),
                ReferenceDate    = usageEventsParser.MostRecentEventTimestamp,
                UniqueUsersCount = result.UniqueUsersCount,
            };

            // create the trained model
            result.Model = new TrainedModel(sarModel, modelProperties, itemIdsIndex);

            // set the catalog features weights
            result.CatalogFeatureWeights = catalogFeatureWeights;

            // record the core training duration
            duration.SetTrainingDuration();

            // run model evaluation if evaluation usage event are available
            if (evaluationUsageEventsCount > 0 && parsedEvaluationUsageEventsFilePath != null)
            {
                // report progress
                _progressMessageReportDelegate("Evaluating Trained Model");

                var evaluationUsageEvents = new List<SarUsageEvent>(evaluationUsageEventsCount);

                // load the evaluation usage events (one JSON serialized event per line)
                using (var reader = new StreamReader(parsedEvaluationUsageEventsFilePath))
                {
                    while (!reader.EndOfStream)
                    {
                        evaluationUsageEvents.Add(JsonConvert.DeserializeObject<SarUsageEvent>(reader.ReadLine()));
                    }
                }

                _tracer.TraceInformation("Starting model evaluation");
                var evaluator = new ModelEvaluator(_tracer);
                result.ModelMetrics = evaluator.Evaluate(result.Model, usageEvents, evaluationUsageEvents, cancellationToken);

                // record the evaluation duration
                duration.SetEvaluationDuration();
            }

            if (storeUserHistoryTask != null)
            {
                _tracer.TraceInformation("Waiting for storing of usage events per user (user history) to complete");

                if (!storeUserHistoryTask.IsCompleted)
                {
                    _progressMessageReportDelegate("Storing User History");

                    // set the reporting flag to true so usage history upload progress will get reported to model status
                    _reportUserHistoryProgress = true;
                }

                try
                {
                    storeUserHistoryTask.Wait(cancellationToken);

                    // the stopwatch is always created together with the task, so the duration has a value here
                    storeUserHistoryDuration?.Stop();
                    duration.StoringUserHistoryDuration = storeUserHistoryDuration?.Elapsed;
                    _tracer.TraceInformation(
                        $"Storing usage events per user (user history) to complete after {duration.StoringUserHistoryDuration.Value.TotalMinutes} minutes");
                }
                catch (AggregateException ex)
                {
                    // wrap and rethrow, keeping the original failure as the inner exception
                    var exception = new Exception("Exception while trying to store user history", ex);
                    _tracer.TraceError(exception.ToString());
                    throw exception;
                }
            }

            // stop measuring the duration and record the total duration
            duration.Stop();

            // return the train result
            result.CompletionMessage = "Model Training Completed Successfully";
            return result;
        }