Ejemplo n.º 1
0
        /// <summary>
        /// Reads the dataset at <c>prFiles.InputPath</c> through
        /// <c>AddOrRemoveColumnsPriorToTrainingAsync</c> (file columns included), filters the
        /// result through <c>OnlyPrs</c>, and hands the remaining rows to
        /// <c>BreakIntoTrainValidateTestDatasetsAsync</c> together with the train, validate and
        /// test paths.
        /// </summary>
        /// <param name="prFiles">supplies the input path and the train/validate/test paths</param>
        /// <param name="datasetModifier">forwarded unchanged to the column-preparation step</param>
        public static async Task PrepareAndSaveDatasetsForPrsAsync(DataFilePaths prFiles, DatasetModifier datasetModifier)
        {
            var helper = DatasetHelperInner.Instance;

            var preparedRows = await helper.AddOrRemoveColumnsPriorToTrainingAsync(
                prFiles.InputPath,
                datasetModifier,
                includeFileColumns: true);

            string[] prRows = helper.OnlyPrs(preparedRows);

            await helper.BreakIntoTrainValidateTestDatasetsAsync(
                prRows,
                prFiles.TrainPath,
                prFiles.ValidatePath,
                prFiles.TestPath);
        }
Ejemplo n.º 2
0
            /// <summary>
            /// Builds the lines of a dataset ready for training, given one created using GithubIssueDownloader.
            /// The FilePaths column of the input is dropped; when <paramref name="includeFileColumns"/> is true
            /// it is replaced by several file-related columns. Two user @ mention columns
            /// (NumMentions, UserMentions) are added, derived from the Description column.
            /// Rows whose re-mapped area label comes back null or whitespace are skipped.
            /// </summary>
            /// <param name="input">path to the reference dataset (tab-separated, one row per line)</param>
            /// <param name="datasetModifier">
            /// supplies <c>ReMapLabel</c>, used to re-map each row's Area label, and <c>ReMapFiles</c>,
            /// which is passed on when the file-related columns are built
            /// </param>
            /// <param name="includeFileColumns">when true, it contains extra columns with file related information</param>
            /// <returns>the new header line followed by one tab-separated line per retained row</returns>
            /// <exception cref="ArgumentNullException"><paramref name="datasetModifier"/> is null</exception>
            /// <exception cref="InvalidOperationException"><c>datasetModifier.ReMapLabel</c> is null</exception>
            public async Task <string[]> AddOrRemoveColumnsPriorToTrainingAsync(
                string input,
                DatasetModifier datasetModifier,
                bool includeFileColumns = true)
            {
                if (datasetModifier == null)
                {
                    throw new ArgumentNullException(nameof(datasetModifier));
                }

                // ReMapLabel is invoked for every data row regardless of includeFileColumns,
                // so it is validated unconditionally instead of failing mid-file with a
                // NullReferenceException.
                if (datasetModifier.ReMapLabel == null)
                {
                    throw new InvalidOperationException(nameof(datasetModifier));
                }

                var existingHeaders =
                    new string[] { "CombinedID", "ID", "Area", "Title", "Description", "Author", "IsPR", "FilePaths" };
                var headersToKeep =
                    new string[] { "CombinedID", "ID", "Area", "Title", "Description", "Author", "IsPR" };
                var newOnesToAdd =
                    new string[] { "NumMentions", "UserMentions" };

                var headerIndices = new Dictionary <string, int>();

                for (var i = 0; i < existingHeaders.Length; i++)
                {
                    headerIndices.Add(existingHeaders[i], i);
                }

                // Column positions are fixed by existingHeaders; resolve them once, not per row.
                var combinedIdIndex  = headerIndices["CombinedID"];
                var idIndex          = headerIndices["ID"];
                var areaIndex        = headerIndices["Area"];
                var titleIndex       = headerIndices["Title"];
                var descriptionIndex = headerIndices["Description"];
                var authorIndex      = headerIndices["Author"];
                var isPrIndex        = headerIndices["IsPR"];
                var filePathsIndex   = headerIndices["FilePaths"];

                var newHeader = string.Join("\t", headersToKeep.Concat(newOnesToAdd));
                if (includeFileColumns)
                {
                    newHeader += "\tFileCount\tFiles\tFilenames\tFileExtensions\tFolderNames\tFolders";
                }

                var newLines = new List <string> { newHeader };

                var lines = await File.ReadAllLinesAsync(input);

                foreach (var line in lines)
                {
                    // Skip blank lines and the header row of the input file.
                    if (string.IsNullOrEmpty(line) || line.StartsWith("CombinedID", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    // _sb is the shared row buffer; the AppendColumnsFor* helpers are presumed
                    // to append to it as well (their bodies are not visible here).
                    _sb.Clear();
                    var lineSplitByTab = line.Split("\t");
                    var combinedId     = lineSplitByTab[combinedIdIndex];

                    // The second comma-separated part of CombinedID is passed on as the source repo.
                    var fromRepo = combinedId.Split(",")[1];
                    var area     = datasetModifier.ReMapLabel(lineSplitByTab[areaIndex], fromRepo);
                    if (string.IsNullOrWhiteSpace(area))
                    {
                        // the label from archived file is not being used in targetRepo.. can skip this row
                        continue;
                    }

                    var body = lineSplitByTab[descriptionIndex];

                    // A value that fails to parse leaves isPrAsNumber at 0, i.e. "not a PR".
                    int.TryParse(lineSplitByTab[isPrIndex], out var isPrAsNumber);
                    Debug.Assert((isPrAsNumber == 1 || isPrAsNumber == 0));

                    _sb
                    .Append(combinedId)
                    .Append('\t').Append(lineSplitByTab[idIndex])
                    .Append('\t').Append(area)
                    .Append('\t').Append(lineSplitByTab[titleIndex])
                    .Append('\t').Append(body)
                    .Append('\t').Append(lineSplitByTab[authorIndex])
                    .Append('\t').Append(isPrAsNumber);

                    AppendColumnsForUserMentions(body);
                    if (includeFileColumns)
                    {
                        AppendColumnsForFileDiffs(lineSplitByTab[filePathsIndex], isPr: isPrAsNumber == 1, datasetModifier.ReMapFiles, fromRepo);
                    }
                    newLines.Add(_sb.ToString());
                }

                return newLines.ToArray();
            }