/// <summary>
/// Constructor
/// </summary>
/// <param name="fst">FST with surface forms</param>
/// <param name="dictionary">token info dictionary</param>
/// <param name="unknownDictionary">unknown word dictionary; also supplies the character definitions</param>
/// <param name="userDictionary">user dictionary; when null, user dictionary use is switched off</param>
/// <param name="mode">tokenization <see cref="Mode"/>; SEARCH and EXTENDED turn search mode on</param>
public ViterbiBuilder(
    FST.FST fst,
    TokenInfoDictionary dictionary,
    UnknownDictionary unknownDictionary,
    UserDictionary userDictionary,
    Mode mode)
{
    // Lexicon sources.
    this.fst = fst;
    this.dictionary = dictionary;
    this.unknownDictionary = unknownDictionary;

    // The user dictionary is optional; remember whether one was supplied.
    this.userDictionary = userDictionary;
    this.useUserDictionary = userDictionary != null;

    // Only ever switch the flag on here; for any other mode it keeps its existing value.
    bool wantsSearchMode = mode == Mode.SEARCH || mode == Mode.EXTENDED;
    if (wantsSearchMode)
    {
        this.searchMode = true;
    }

    // Character definitions are obtained from the unknown word dictionary.
    this.characterDefinitions = unknownDictionary.GetCharacterDefinition();
}
/// <summary>
/// Compiles the token info dictionary read from <paramref name="inputDirAbsolutePath"/>, builds and saves
/// an FST over its surface forms, validates the saved FST, registers the FST-id-to-surface-index mappings
/// with the compiler, and writes the result to <paramref name="outputDirAbsolutePath"/>.
/// </summary>
/// <param name="inputDirAbsolutePath">Directory handed to the compiler's combined sequential input stream.</param>
/// <param name="outputDirAbsolutePath">Directory that receives the FST file and the compiler's output.</param>
/// <param name="encoding">Encoding name forwarded to <c>GetTokenInfoDictionaryCompiler</c>.</param>
/// <param name="provider">Encoding provider forwarded to <c>GetTokenInfoDictionaryCompiler</c>.</param>
/// <exception cref="Exception">
/// Thrown for any failure. The message is prefixed with this method's name and the original exception is
/// available through <see cref="Exception.InnerException"/>.
/// </exception>
private void BuildTokenInfoDictionary(string inputDirAbsolutePath, string outputDirAbsolutePath, string encoding, EncodingProvider provider)
{
    try
    {
        ProgressLog.Begin("compiling tokeninfo dict");
        var tokenInfoCompiler = GetTokenInfoDictionaryCompiler(encoding, provider);

        ProgressLog.Println("analyzing dictionary features");
        // NOTE(review): the same stream instance is passed to all three calls below. Whether each call
        // rewinds it (or expects an already-consumed stream) cannot be seen from here - confirm.
        using (var stream = tokenInfoCompiler.CombinedSequentialFileInputStream(inputDirAbsolutePath))
        {
            tokenInfoCompiler.AnalyzeTokenInfo(stream);
            ProgressLog.Println("reading tokeninfo");
            tokenInfoCompiler.ReadTokenInfo(stream);
            tokenInfoCompiler.Compile(stream);
        }

        List<string> surfaces = tokenInfoCompiler.GetSurfaces();

        ProgressLog.Begin("compiling fst");
        string fstPath = outputDirAbsolutePath + Path.DirectorySeparatorChar + FST.FST.FST_FILENAME;

        // FileMode.Create truncates an existing file. OpenOrCreate would keep the old contents and leave
        // stale trailing bytes behind whenever the newly compiled FST is shorter than the previous one.
        FSTCompiler fstCompiler = new FSTCompiler(surfaces);
        using (var stream = File.Open(fstPath, FileMode.Create))
        {
            fstCompiler.Compile(stream);
        }

        ProgressLog.Println("validating saved fst");
        FST.FST fst;
        using (var stream = File.OpenRead(fstPath))
        {
            fst = new FST.FST(stream);
        }

        // Validation only reports missing surfaces; it does not abort the build.
        foreach (string surface in surfaces)
        {
            if (fst.Lookup(surface) < 0)
            {
                ProgressLog.Println("failed to look up [" + surface + "]");
            }
        }

        ProgressLog.End();

        ProgressLog.Begin("processing target map");
        for (int i = 0; i < surfaces.Count; i++)
        {
            int id = fst.Lookup(surfaces[i]);
            tokenInfoCompiler.AddMapping(id, i);
        }

        tokenInfoCompiler.Write(outputDirAbsolutePath); // TODO: Should be refactored -Christian
        ProgressLog.End();
        // NOTE(review): three ProgressLog.Begin calls but only two End calls in this method; the
        // "compiling tokeninfo dict" section looks unclosed - confirm against ProgressLog's contract.
    }
    catch (Exception ex)
    {
        // Keep the original exception as InnerException so its type and stack trace are not lost.
        throw new Exception("DictionaryCompilerBase.BuildTokenInfoDictionary: " + ex.Message, ex);
    }
}