/// <summary>
/// Prepares the pull-request dataset: reshapes the columns of the reference dataset at
/// <c>prFiles.InputPath</c> (file columns included), keeps only the rows selected by
/// <c>OnlyPrs</c>, and hands them to <c>BreakIntoTrainValidateTestDatasetsAsync</c>
/// together with the train, validate and test paths.
/// </summary>
/// <param name="prFiles">supplies the input path and the train/validate/test paths</param>
/// <param name="datasetModifier">forwarded unchanged to <c>AddOrRemoveColumnsPriorToTrainingAsync</c></param>
public static async Task PrepareAndSaveDatasetsForPrsAsync(DataFilePaths prFiles, DatasetModifier datasetModifier)
{
    var helper = DatasetHelperInner.Instance;

    // Step 1: rebuild the rows with the training columns, file-related columns included.
    string[] reshapedRows = await helper.AddOrRemoveColumnsPriorToTrainingAsync(
        prFiles.InputPath,
        datasetModifier,
        includeFileColumns: true);

    // Step 2: narrow the rows down with OnlyPrs.
    string[] prOnlyRows = helper.OnlyPrs(reshapedRows);

    // Step 3: split into the three datasets (presumably written to the given paths -- the helper is not visible here).
    await helper.BreakIntoTrainValidateTestDatasetsAsync(
        prOnlyRows,
        prFiles.TrainPath,
        prFiles.ValidatePath,
        prFiles.TestPath);
}
/// <summary>
/// Builds the lines of a training-ready, tab-separated dataset from a reference dataset file
/// (per the original note, one created using GithubIssueDownloader).
/// The CombinedID, ID, Title, Description, Author and IsPR columns are copied, Area is replaced by
/// its re-mapped label, and the NumMentions / UserMentions columns are appended for the Description.
/// The FilePaths column is not copied as-is; when <paramref name="includeFileColumns"/> is true it is
/// expanded into additional file-related columns instead.
/// Nothing is written to disk here: the resulting lines are returned to the caller.
/// </summary>
/// <param name="input">path to the reference dataset</param>
/// <param name="datasetModifier">
/// supplies <c>ReMapLabel</c>, applied to the Area of every row, and <c>ReMapFiles</c>, passed along
/// when file columns are included
/// </param>
/// <param name="includeFileColumns">when true, the output contains extra columns with file related information</param>
/// <returns>the new header line followed by one line for every row that was kept</returns>
/// <exception cref="ArgumentNullException"><paramref name="datasetModifier"/> is null</exception>
/// <exception cref="InvalidOperationException"><c>datasetModifier.ReMapLabel</c> is null</exception>
public async Task<string[]> AddOrRemoveColumnsPriorToTrainingAsync(
    string input,
    DatasetModifier datasetModifier,
    bool includeFileColumns = true)
{
    if (datasetModifier == null)
    {
        throw new ArgumentNullException(nameof(datasetModifier));
    }

    // ReMapLabel is invoked for every data row below, whether or not file columns are requested,
    // so it is validated unconditionally instead of only when includeFileColumns is true.
    if (datasetModifier.ReMapLabel == null)
    {
        throw new InvalidOperationException(nameof(datasetModifier));
    }

    var existingHeaders = new[] { "CombinedID", "ID", "Area", "Title", "Description", "Author", "IsPR", "FilePaths" };
    var headersToKeep = new[] { "CombinedID", "ID", "Area", "Title", "Description", "Author", "IsPR" };
    var newOnesToAdd = new[] { "NumMentions", "UserMentions" };

    // Column positions are fixed by existingHeaders; resolve them once instead of once per row.
    int combinedIdIndex = Array.IndexOf(existingHeaders, "CombinedID");
    int idIndex = Array.IndexOf(existingHeaders, "ID");
    int areaIndex = Array.IndexOf(existingHeaders, "Area");
    int titleIndex = Array.IndexOf(existingHeaders, "Title");
    int descriptionIndex = Array.IndexOf(existingHeaders, "Description");
    int authorIndex = Array.IndexOf(existingHeaders, "Author");
    int isPrIndex = Array.IndexOf(existingHeaders, "IsPR");
    int filePathsIndex = Array.IndexOf(existingHeaders, "FilePaths");

    var newHeader = string.Join("\t", headersToKeep.Concat(newOnesToAdd));
    if (includeFileColumns)
    {
        newHeader += "\tFileCount\tFiles\tFilenames\tFileExtensions\tFolderNames\tFolders";
    }

    var newLines = new List<string> { newHeader };
    var lines = await File.ReadAllLinesAsync(input);

    foreach (var line in lines)
    {
        // Skip blank lines and the header line of the input file.
        if (string.IsNullOrEmpty(line) || line.StartsWith("CombinedID", StringComparison.Ordinal))
        {
            continue;
        }

        _sb.Clear();
        var lineSplitByTab = line.Split("\t");
        var combinedId = lineSplitByTab[combinedIdIndex];

        // The second comma-separated part of CombinedID is used as the source repository.
        // NOTE(review): a CombinedID without a comma, or a row with too few columns, still
        // surfaces as IndexOutOfRangeException, exactly as before.
        var fromRepo = combinedId.Split(",")[1];

        var area = datasetModifier.ReMapLabel(lineSplitByTab[areaIndex], fromRepo);
        if (string.IsNullOrWhiteSpace(area))
        {
            // the label from archived file is not being used in targetRepo.. can skip this row
            continue;
        }

        var body = lineSplitByTab[descriptionIndex];

        // Machine-readable data: parse with the invariant culture. A value that does not parse
        // leaves isPrAsNumber at 0, so the row is treated as a non-PR (unchanged behavior).
        int.TryParse(
            lineSplitByTab[isPrIndex],
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out var isPrAsNumber);
        Debug.Assert(isPrAsNumber == 1 || isPrAsNumber == 0);

        _sb.Append(combinedId)
            .Append('\t').Append(lineSplitByTab[idIndex])
            .Append('\t').Append(area)
            .Append('\t').Append(lineSplitByTab[titleIndex])
            .Append('\t').Append(body)
            .Append('\t').Append(lineSplitByTab[authorIndex])
            .Append('\t').Append(isPrAsNumber);

        // Presumably appends the NumMentions / UserMentions columns to _sb (helper not visible here).
        AppendColumnsForUserMentions(body);

        if (includeFileColumns)
        {
            // Presumably appends the file-related columns to _sb (helper not visible here).
            AppendColumnsForFileDiffs(lineSplitByTab[filePathsIndex], isPr: isPrAsNumber == 1, datasetModifier.ReMapFiles, fromRepo);
        }

        newLines.Add(_sb.ToString());
    }

    return newLines.ToArray();
}