/// <summary>
/// Adds a parsing error for the given file and line to the report, then decides
/// whether parsing may go on. Once the number of recorded errors exceeds the
/// configured maximum, the report is flagged as not completed successfully.
/// </summary>
/// <param name="error">The reason of the error to record</param>
/// <param name="parsingReport">The report that accumulates the errors</param>
/// <param name="usageFile">Path of the file being parsed; only its file name is stored in the error</param>
/// <param name="lineNumber">The line number the error is attributed to</param>
/// <returns><c>true</c> if parsing should continue, <c>false</c> if the error limit was exceeded</returns>
private bool ShouldContinueAfterError(ParsingErrorReason error, FileParsingReport parsingReport, string usageFile, long lineNumber)
{
    string fileName = Path.GetFileName(usageFile);
    var recordedError = new ParsingError(fileName, lineNumber, error);
    parsingReport.Errors.Add(recordedError);

    // the limit is inclusive: exactly '_maximumParsingErrorsCount' errors are still tolerated
    bool withinErrorLimit = parsingReport.Errors.Count <= _maximumParsingErrorsCount;
    if (!withinErrorLimit)
    {
        parsingReport.IsCompletedSuccessfuly = false;
    }

    return withinErrorLimit;
}
/// <summary>
/// Parse a usage events file into usage events items.
/// Expected format is userId,itemId,timestamp[,event-type[,event-weight]]
/// </summary>
/// <remarks>
/// This is a lazy iterator: the file is opened and the report is updated only while the
/// result is being enumerated. Lines that produce an error or a warning are recorded in
/// the report and are not yielded. Enumeration stops early once
/// <c>ShouldContinueAfterError</c> returns <c>false</c>.
/// </remarks>
/// <param name="usageFile">Path of the usage file to parse</param>
/// <param name="parsingReport">The report updated with line counts, errors and warnings</param>
/// <param name="defaultEventTimestamp">Timestamp forwarded to the single-event parser</param>
/// <returns>The usage events that were parsed without errors or warnings</returns>
private IEnumerable<SarUsageEvent> ParseUsageEventsFile(string usageFile, FileParsingReport parsingReport, DateTime defaultEventTimestamp)
{
    using (var reader = new TextFieldParser(usageFile) { Delimiters = new[] { "," } })
    {
        while (!reader.EndOfData)
        {
            string[] fields;
            parsingReport.TotalLinesCount++;

            // Remember where the record starts. It is used as a fallback below, because
            // TextFieldParser.LineNumber turns into -1 once the end of the stream is reached.
            long lineNumberBeforeRead = reader.LineNumber;

            try
            {
                fields = reader.ReadFields();
            }
            catch (MalformedLineException ex)
            {
                if (ShouldContinueAfterError(ParsingErrorReason.MalformedLine, parsingReport, usageFile, ex.LineNumber))
                {
                    continue;
                }

                yield break;
            }

            // After a successful read the parser points at the next line, so the record that
            // was just read is one line back. When the read consumed the last line of the file,
            // LineNumber is -1 and "LineNumber - 1" would report the bogus value -2;
            // fall back to the line the record started at.
            long lineNumberAfterRead = reader.LineNumber;
            long recordLineNumber = lineNumberAfterRead == -1
                ? lineNumberBeforeRead
                : lineNumberAfterRead - 1;

            ParsingErrorReason? parsingError;
            ParsingErrorReason? parsingWarning;
            SarUsageEvent usageEvent = ParseUsageEvent(fields, defaultEventTimestamp, out parsingError, out parsingWarning);

            if (parsingError.HasValue)
            {
                if (ShouldContinueAfterError(parsingError.Value, parsingReport, usageFile, recordLineNumber))
                {
                    continue;
                }

                yield break;
            }

            if (parsingWarning.HasValue)
            {
                // a warning does not count against the error limit, but the line is still skipped
                parsingReport.Warnings.Add(new ParsingError(Path.GetFileName(usageFile), recordLineNumber, parsingWarning.Value));
                continue;
            }

            parsingReport.SuccessfulLinesCount++;
            yield return usageEvent;
        }
    }
}
/// <summary>
/// Parse a catalog file to <see cref="SarCatalogItem"/> items.
/// </summary>
/// <param name="catalogFilePath">The file to parse</param>
/// <param name="cancellationToken">A cancellation token used to abort the operation</param>
/// <param name="catalogItems">The parsed catalog items; an empty list if the parsing did not complete successfully</param>
/// <param name="featureNames">The parsed names of the catalog items features, ordered by their index value; an empty array if the parsing did not complete successfully</param>
/// <returns>The parsing report</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="catalogFilePath"/> is null, empty or whitespace</exception>
/// <exception cref="ArgumentException">Thrown when the catalog file does not exist</exception>
public FileParsingReport ParseCatalogFile(
    string catalogFilePath,
    CancellationToken cancellationToken,
    out IList<SarCatalogItem> catalogItems,
    out string[] featureNames)
{
    if (string.IsNullOrWhiteSpace(catalogFilePath))
    {
        throw new ArgumentNullException(nameof(catalogFilePath));
    }

    if (!File.Exists(catalogFilePath))
    {
        throw new ArgumentException($"Failed to find catalog file under '{catalogFilePath}'", nameof(catalogFilePath));
    }

    _tracer.TraceInformation("Starting catalog file parsing");

    // read the whole file eagerly; the feature index gets populated as a side product
    var featureIndexByName = new ConcurrentDictionary<string, uint>();
    var report = new FileParsingReport();
    List<SarCatalogItem> rawItems = ParseCatalogFile(catalogFilePath, featureIndexByName, report).ToList();

    if (report.IsCompletedSuccessfuly)
    {
        cancellationToken.ThrowIfCancellationRequested();

        // extract the feature names ordered by their index value...
        featureNames = (from entry in featureIndexByName
                        orderby entry.Value
                        select entry.Key).ToArray();

        // ...after which the index itself is no longer needed
        featureIndexByName.Clear();

        // inflate the feature vector of every parsed item to the total number of features
        int featuresCount = featureNames.Length;
        var inflatedItems = new List<SarCatalogItem>(rawItems.Count);
        foreach (SarCatalogItem rawItem in rawItems)
        {
            inflatedItems.Add(InflateFeaturesVector(rawItem, featuresCount));
        }

        catalogItems = inflatedItems;
        _tracer.TraceInformation("Finished catalog file parsing");
    }
    else
    {
        // parsing was aborted - hand back empty results along with the report
        catalogItems = new List<SarCatalogItem>(0);
        featureNames = new string[0];
        _tracer.TraceError("Failed parsing catalog file");
    }

    return report;
}
/// <summary>
/// Parses each file found in the input folder into usage events.
/// </summary>
/// <remarks>
/// The current UTC time is captured once, stored in <c>DefaultEventTimestamp</c> and
/// <c>MostRecentEventTimestamp</c>, and passed to the internal parser as the default
/// event timestamp.
/// </remarks>
/// <param name="usageFolder">A folder containing the usage files to parse</param>
/// <param name="cancellationToken">A cancellation token used to abort the operation</param>
/// <param name="usageEvents">The parsed usage events</param>
/// <returns>The parsing report</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="usageFolder"/> is null, empty or whitespace</exception>
/// <exception cref="ArgumentException">Thrown when the usage folder does not exist</exception>
public FileParsingReport ParseUsageEventFiles(string usageFolder, CancellationToken cancellationToken, out IList<SarUsageEvent> usageEvents)
{
    if (string.IsNullOrWhiteSpace(usageFolder))
    {
        throw new ArgumentNullException(nameof(usageFolder));
    }

    if (!Directory.Exists(usageFolder))
    {
        throw new ArgumentException($"Failed to find usage files folder: '{usageFolder}'", nameof(usageFolder));
    }

    _tracer.TraceInformation("Starting usage files parsing");

    // capture the timestamp once so the stored value and the one handed to the parser agree
    DateTime defaultEventTimestamp = DateTime.UtcNow;
    DefaultEventTimestamp = defaultEventTimestamp;
    MostRecentEventTimestamp = DefaultEventTimestamp;

    var parsingReport = new FileParsingReport();

    // ParseUsageEventFilesInternal expects (folder, report, default timestamp, cancellation token);
    // the timestamp argument must be passed explicitly, ahead of the cancellation token.
    // ToList() forces the lazy iterator to run to completion so the report is fully populated.
    usageEvents = ParseUsageEventFilesInternal(usageFolder, parsingReport, defaultEventTimestamp, cancellationToken).ToList();

    _tracer.TraceInformation("Finished usage files parsing");
    return parsingReport;
}
/// <summary>
/// Parse the input catalog file into catalog items.
/// </summary>
/// <remarks>
/// This is a lazy iterator: the file is opened and the report is updated only while the
/// result is being enumerated. Lines that produce an error or a warning are recorded in
/// the report and are not yielded. When the number of errors exceeds the configured
/// maximum, the report is marked as unsuccessful and the enumeration stops. The report is
/// marked as successful only after every line was processed.
/// </remarks>
/// <param name="catalogFilePath">Path of the catalog file to parse</param>
/// <param name="featureNamesIndex">The feature names index handed to the single-item parser</param>
/// <param name="parsingReport">The report updated with line counts, errors and warnings</param>
/// <returns>The catalog items that were parsed without errors or warnings</returns>
private IEnumerable<SarCatalogItem> ParseCatalogFile(
    string catalogFilePath,
    ConcurrentDictionary<string, uint> featureNamesIndex,
    FileParsingReport parsingReport)
{
    using (var reader = new TextFieldParser(catalogFilePath) { Delimiters = new[] { "," } })
    {
        while (!reader.EndOfData)
        {
            string[] fields;

            // Remember where the record starts. It is used as a fallback below, because
            // TextFieldParser.LineNumber turns into -1 once the end of the stream is reached.
            long lineNumberBeforeRead = reader.LineNumber;

            try
            {
                parsingReport.TotalLinesCount++;
                fields = reader.ReadFields();
            }
            catch (MalformedLineException)
            {
                parsingReport.Errors.Add(
                    new ParsingError(Path.GetFileName(catalogFilePath), reader.ErrorLineNumber, ParsingErrorReason.MalformedLine));
                if (parsingReport.Errors.Count > _maximumParsingErrorsCount)
                {
                    parsingReport.IsCompletedSuccessfuly = false;
                    yield break;
                }

                continue;
            }

            // After a successful read the parser points at the next line, so the record that
            // was just read is one line back. When the read consumed the last line of the file,
            // LineNumber is -1 and "LineNumber - 1" would report the bogus value -2;
            // fall back to the line the record started at.
            long lineNumberAfterRead = reader.LineNumber;
            long recordLineNumber = lineNumberAfterRead == -1
                ? lineNumberBeforeRead
                : lineNumberAfterRead - 1;

            ParsingErrorReason? parsingError;
            ParsingErrorReason? parsingWarning;
            SarCatalogItem catalogItem = ParseCatalogItem(fields, featureNamesIndex, out parsingError, out parsingWarning);

            if (parsingError.HasValue)
            {
                parsingReport.Errors.Add(
                    new ParsingError(Path.GetFileName(catalogFilePath), recordLineNumber, parsingError.Value));
                if (parsingReport.Errors.Count > _maximumParsingErrorsCount)
                {
                    parsingReport.IsCompletedSuccessfuly = false;
                    yield break;
                }

                continue;
            }

            if (parsingWarning.HasValue)
            {
                // a warning does not count against the error limit, but the line is still skipped
                parsingReport.Warnings.Add(
                    new ParsingError(Path.GetFileName(catalogFilePath), recordLineNumber, parsingWarning.Value));
                continue;
            }

            parsingReport.SuccessfulLinesCount++;
            yield return catalogItem;
        }
    }

    // no more lines to parse - mark the parsing as successful
    parsingReport.IsCompletedSuccessfuly = true;
}
/// <summary>
/// Lazily parses every file in the usage folder, one after the other, and streams the
/// resulting usage events. Cancellation is checked before each file is started.
/// </summary>
/// <remarks>
/// After a file is fully enumerated, the accumulated error count is compared against the
/// configured maximum; if it was exceeded, the report is marked as unsuccessful and the
/// remaining files are skipped. The report is marked as successful only once all the files
/// were processed.
/// </remarks>
/// <param name="usageFolder">A folder containing the usage files to parse</param>
/// <param name="parsingReport">The report shared by all the parsed files</param>
/// <param name="defaultEventTimestamp">Timestamp forwarded to the per-file parser</param>
/// <param name="cancellationToken">A cancellation token used to abort the operation</param>
/// <returns>The usage events parsed from all the files</returns>
private IEnumerable<SarUsageEvent> ParseUsageEventFilesInternal(string usageFolder, FileParsingReport parsingReport, DateTime defaultEventTimestamp, CancellationToken cancellationToken)
{
    string[] usageFiles = Directory.GetFiles(usageFolder);
    for (int fileIndex = 0; fileIndex < usageFiles.Length; fileIndex++)
    {
        string currentFile = usageFiles[fileIndex];
        cancellationToken.ThrowIfCancellationRequested();

        string fileName = Path.GetFileName(currentFile);
        double sizeInMegabytes = (double)new FileInfo(currentFile).Length / (1024 * 1024);
        _tracer.TraceInformation($"Parsing file {fileName} ({sizeInMegabytes:F2} MB)");

        IEnumerable<SarUsageEvent> fileEvents = ParseUsageEventsFile(currentFile, parsingReport, defaultEventTimestamp);
        foreach (SarUsageEvent parsedEvent in fileEvents)
        {
            yield return parsedEvent;
        }

        // the error budget is shared by all the files - stop as soon as it is exhausted
        bool errorLimitExceeded = parsingReport.Errors.Count > _maximumParsingErrorsCount;
        if (errorLimitExceeded)
        {
            parsingReport.IsCompletedSuccessfuly = false;
            yield break;
        }
    }

    // every file was processed within the error budget - mark the parsing as successful
    parsingReport.IsCompletedSuccessfuly = true;
}