// helper to generate a set of models to test public IEnumerable <FactoredSegmenterCoder> ModelsToTest(bool includeInlineFixes = true) { var models = new List <FactoredSegmenterCoder>(); if (includeInlineFixes) { models.Add(FactoredSegmenterCoder.CreateForTest(inlineFixes: true)); } models.AddRange(new[] // test multiple model sets of model options { // uncomment this one to debug an actual SPM model //FactoredSegmenterCoder.CreateForTest(@"\\mt-data-04\humanparity_tier_1\TeacherStage2Systems\enu\kor\2019_04_30_05h_47m_08s_FS_4repl\final\enu.kor.teacher.fsm", serializeIndicesAndUnrepresentables: serializeIndicesAndUnrepresentables), FactoredSegmenterCoder.CreateForTest(), FactoredSegmenterCoder.CreateForTest(sourceSentenceAnnotationTypes: new[] { "target_language", "politeness" }), FactoredSegmenterCoder.CreateForTest(singleLetterCaseFactors: true, distinguishInitialAndInternalPieces: true, serializeIndicesAndUnrepresentables: true, rightWordGlue: true) }); return(models); }
/// <summary> /// Command-line format: /// factored-segmenter train|encode|decode [--option]* [input file|-] /// </summary> static void Main(string[] args) { var(GetAndConsumeArg, GetArg) = IterateArgs(args); var action = GetAndConsumeArg(); if (action != "train" && action != "encode" && action != "decode" && action != "runtests") { BadArgument("The first argument must be 'train', 'encode', 'decode', or 'runtests'"); } // parse options string dataOutPath = "-"; string modelPath = null; string vocabOutputPath = null; string fieldSeparator = null; bool quiet = false; FactoredSegmenterModelTrainConfig newModelConfig = new FactoredSegmenterModelTrainConfig(); while (GetArg() != null && ((GetArg().StartsWith("-") && GetArg().Length > 1) || GetArg().StartsWith("--"))) // --option, -o, and -- { bool GetBoolArg() // helper to parse bool options have an optional "true" or "false" follow them => GetArg() == null || (GetArg() != "true" && GetArg() != "false") || GetAndConsumeArg() == "true"; var option = GetAndConsumeArg(); // common args if ((option == "-o" || option == "--output") && action != "train") // output stream for encode and decode { dataOutPath = GetAndConsumeArg(); } else if (option == "-m" || option == "--model") // model path: output for train, input for encode/decode { modelPath = GetAndConsumeArg(); } else if ((option == "-v" || option == "--marian-vocab") && action == "train") { vocabOutputPath = GetAndConsumeArg(); } else if (option == "--quiet") // avoid unnecessary logging { quiet = GetBoolArg(); } else if (option == "-F") // field separator, e.g. set to "\t" to process TSV format { fieldSeparator = Regex.Unescape(GetAndConsumeArg()); // unescape so that we can pass \t } // new-model args else if (option == "--right-word-glue") { newModelConfig.ModelOptions.RightWordGlue = GetBoolArg(); } else if (option == "--distinguish-initial-and-internal-pieces") { newModelConfig.ModelOptions.DistinguishInitialAndInternalPieces = GetBoolArg(); } else if (option == "--split-han") { newModelConfig.ModelOptions.SplitHan = GetBoolArg(); } else if (option == "--single-letter-case-factors") { newModelConfig.ModelOptions.SingleLetterCaseFactors = GetBoolArg(); } else if (option == "--serialize-indices-and-unrepresentables") { newModelConfig.ModelOptions.SerializeIndicesAndUnrepresentables = GetBoolArg(); } else if (option == "--inline-fixes") { newModelConfig.ModelOptions.InlineFixes = GetBoolArg(); } else if (option == "--inline-fix-use-tags") { newModelConfig.ModelOptions.InlineFixUseTags = GetBoolArg(); } else if (option == "--no-sentence-piece") { newModelConfig.SentencePieceTrainingConfig = null; } // training args else if (option == "--vocab-size" && action == "train") { newModelConfig.SentencePieceTrainingConfig.VocabSize = int.Parse(GetAndConsumeArg()); } else if (option == "--character_coverage" && action == "train") { newModelConfig.SentencePieceTrainingConfig.CharacterCoverage = double.Parse(GetAndConsumeArg()); } else if (option == "--training-sentence-size" && action == "train") { newModelConfig.TrainingSentenceSize = int.Parse(GetAndConsumeArg()); } else if (option == "--min-piece-count" && action == "train") { newModelConfig.MinPieceCount = int.Parse(GetAndConsumeArg()); } else if (option == "--min-char-count" && action == "train") { newModelConfig.MinCharCount = int.Parse(GetAndConsumeArg()); } // other else if (option == "--") // -- ends option processing { break; } else { BadArgument($"Unknown option {option}"); } } // parse remaining arguments (one or more input files) var inputPaths = new List <string>(); while (GetArg() != null) { inputPaths.Add(GetAndConsumeArg()); } if (!inputPaths.Any()) // none given: read from stdin { inputPaths.Add("-"); } // open all input files var streams = from inputPath in inputPaths select inputPath != "-" ? new StreamReader(inputPath, encoding: Encoding.UTF8, detectEncodingFromByteOrderMarks: true, bufferSize: 1000000) : Console.In; if (action == "train") { if (!quiet) { Log($"Creating model {modelPath} from input file(s) {" ".JoinItems(inputPaths)} ..."); } if (!modelPath.EndsWith(".fsm")) // @TODO: do this inside Train() where we create the temp pathnames { BadArgument($"Extension .fsm is required for model path {modelPath}"); } var lines = from stream in streams from line in stream.ReadLines() select line; CreateDirectoryFor(modelPath); // @TODO: do this inside Train() var model = FactoredSegmenterModel.Train(newModelConfig, lines, sourceSentenceAnnotations: null, fsmModelPath: modelPath, spmBinDir: SentencePieceManaged.SpmBinaryDirPath); // save the model // The SentencePiece model is embedded in 'model'; it is not a separate file. model.Save(modelPath); if (!quiet) { Log($"Model file written to {modelPath}"); } // save the vocab for Marian consumption if (model.FactorSpec != null && vocabOutputPath != null) { File.WriteAllLines(vocabOutputPath, model.FactorSpec, new UTF8Encoding(encoderShouldEmitUTF8Identifier: false)); if (!quiet) { Log($"Marian vocabulary file written to {vocabOutputPath}"); } } } else if (action == "encode" || action == "decode") { if (!quiet) { Log($"Processing input file(s) {" ".JoinItems(inputPaths)} with model {modelPath} ..."); } var lines = from stream in streams.ToList() // ToList() eagerly opens all streams, to test upfront if all files are found from line in stream.ReadLines() select line; newModelConfig.ModelOptions.UseSentencePiece = false; var coderConfig = modelPath != null ? new FactoredSegmenterCoderConfig { ModelPath = modelPath } : new FactoredSegmenterCoderConfig // no model specified: use untrained virgin model (without SentencePiece) { Model = new FactoredSegmenterModel(newModelConfig.ModelOptions) }; var coder = new FactoredSegmenterCoder(coderConfig); // write loop if (!quiet) { Log($"Writing processed lines to {dataOutPath} ..."); } CreateDirectoryFor(dataOutPath); var outStream = dataOutPath != "-" ? // open output stream (UTF-8 without BOM) new StreamWriter(dataOutPath, append: false, encoding: new UTF8Encoding(encoderShouldEmitUTF8Identifier: false), bufferSize: 1000000) : Console.Out; var linesProcessed = 0; string ProcessLine(string line) { try { return(action == "encode" ? " ".JoinItems(coder.Encode(line).TokenStrings) : // encode coder.Decode(line).ToString()); // decode } catch (Exception e) { Log($"Failed to {action} input: {line}"); Log($"Exception: {e.ToString()}"); return(""); // back off to empty string, so that we can continue } } foreach (var line in lines) { string processedLine = fieldSeparator == null ? processedLine = ProcessLine(line) : processedLine = fieldSeparator.JoinItems(from field in line.Split(fieldSeparator) select ProcessLine(field)); //Log($"{command} IN: {line} --> OUT: {processedLine}"); outStream.WriteLine(processedLine); // @BUGBUG: Write errors are not caught, at least when writing to a pipe via stdout. linesProcessed++; if (!quiet && linesProcessed % 1000000 == 0) { Log($"Completed processing of {linesProcessed:#,##0} lines so far."); } } if (!quiet) { Log($"Completed processing of {linesProcessed:#,##0} lines."); } outStream.Flush(); // hoping to elicit an exception in case flushing fails outStream.Close(); } // @TODO: disabled for now since the tests don't build under Linux //else if (action == "runtests") //{ // // This is for easier testing when debugging environment does not support tests. // // This must be manually maintained. // var tests = new TextSegmentation.Segmenter.FactoredSegmenter_GitSubmodule.src.Test.FactoredSegmenterTests(); // tests.ReversibilityAndBasicBreakingTests(); // tests.DecodeIntoConsecutiveSegmentsTest(); // //tests.ReversibilityAndBasicBreakingTestsOnNaughtyData(); // fails in standalone build because data file is our other repo // tests.RunTraining(); // var tests1 = new TextSegmentation.Segmenter.FactoredSegmenter_GitSubmodule.src.Test.FactoredSegmenterScriptHelperTests(); // tests1.ScriptEdgeCasesTest(); // tests1.ClassificationEdgeCaseTests(); //} }