// Test driver entry point.
// Accepts at most one argument: the path of the informal-expression file.
// With no argument the built-in relative path is used; with more than one
// argument a usage line is printed and the process exits with code 0.
public static void MainTest(string[] args)
{
    // reject anything beyond a single optional argument
    if (args.Length > 1)
    {
        Console.WriteLine("Usage: java TokenObj <inFile>");
        Environment.Exit(0);
    }

    // a single argument overrides the default input file
    string inputPath = args.Length == 1
        ? args[0]
        : "../data/informalExpression.txt";

    // load the informal-expression map and hand it to the test routine
    Dictionary<string, string> expressionMap =
        InformalExpHandler.GetInformalExpMapFromFile(inputPath);
    Test(expressionMap);
}
// Reads the configuration and populates this instance's dictionaries,
// frequency map, word2vec models and tuning parameters from it.
//
// debugFlag is forwarded to the AddDictionaries calls.
//
// When configFile_ is null it is set to "data.Config.cSpell" and the
// Configuration is constructed with useClassPath = true (presumably the
// config is then resolved as a bundled resource - confirm in Configuration).
// When properties_ is not null, it is applied on top of the loaded
// configuration through OverwriteProperties.
//
// All numeric properties are parsed with the invariant culture: the
// configuration is machine-readable text, so a value such as "0.05" must not
// be interpreted according to the current thread culture.
// int.Parse / double.Parse / bool.Parse throw (e.g. FormatException,
// ArgumentNullException) when a property is missing or malformed.
private void Init(bool debugFlag)
{
    // culture used for every numeric property below (fully qualified so the
    // block does not depend on a using directive for System.Globalization)
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    // fall back to the default configuration name when none was supplied
    bool useClassPath = false;
    if (configFile_ == null)
    {
        useClassPath = true;
        configFile_ = "data.Config.cSpell";
    }

    // read in configuration file
    conf_ = new Configuration(configFile_, useClassPath);
    if (properties_ != null)
    {
        conf_.OverwriteProperties(properties_);
    }

    // file properties below are appended to this directory by plain string
    // concatenation, so CS_DIR is expected to end with a path separator
    string cSpellDir = conf_.GetProperty(Configuration.CS_DIR);

    // files: pre-correction
    string infExpFile = cSpellDir + conf_.GetProperty(Configuration.CS_INFORMAL_EXP_FILE);
    infExpMap_ = InformalExpHandler.GetInformalExpMapFromFile(infExpFile);

    // get dictionary for spell checker
    string checkDicFileStrs = conf_.GetProperty(Configuration.CS_CHECK_DIC_FILES);
    checkDic_.AddDictionaries(checkDicFileStrs, cSpellDir, debugFlag);

    // get dictionary for spell suggestion - candidate
    string suggestDicFileStrs = conf_.GetProperty(Configuration.CS_SUGGEST_DIC_FILES);
    suggestDic_.AddDictionaries(suggestDicFileStrs, cSpellDir, debugFlag);

    // no acr/abb dictionary: en + pn, used for split check
    string splitWordDicFileStrs = conf_.GetProperty(Configuration.CS_SPLIT_WORD_DIC_FILES);
    splitWordDic_.AddDictionaries(splitWordDicFileStrs, cSpellDir, debugFlag);

    // mw dictionary
    string mwDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_MW_DIC_FILE);
    mwDic_.AddDictionary(mwDicFile);

    // properNoun dictionary
    string pnDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_PN_DIC_FILE);
    pnDic_.AddDictionary(pnDicFile);

    // abb/acr dictionary
    string aaDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_AA_DIC_FILE);
    aaDic_.AddDictionary(aaDicFile);

    // spVar dictionary
    string svDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_SV_DIC_FILE);
    svDic_.AddDictionary(svDicFile);

    // unit file
    string unitDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_UNIT_DIC_FILE);
    unitDic_.AddDictionary(unitDicFile);

    // frequency file
    string frequencyFile = cSpellDir + conf_.GetProperty(Configuration.CS_FREQUENCY_FILE);
    wordWcMap_ = new WordWcMap(frequencyFile);

    // word2Vec files
    string word2VecImFile = cSpellDir + conf_.GetProperty(Configuration.CS_W2V_IM_FILE);
    word2VecIm_ = new Word2Vec(word2VecImFile);
    string word2VecOmFile = cSpellDir + conf_.GetProperty(Configuration.CS_W2V_OM_FILE);
    word2VecOm_ = new Word2Vec(word2VecOmFile);

    // mode
    funcMode_ = int.Parse(conf_.GetProperty(Configuration.CS_FUNC_MODE), invariant);
    rankMode_ = int.Parse(conf_.GetProperty(Configuration.CS_RANK_MODE), invariant);

    // detectors
    maxLegitTokenLength_ = int.Parse(conf_.GetProperty(Configuration.CS_MAX_LEGIT_TOKEN_LENGTH), invariant);
    dRwSplitWordMinLength_ = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_SPLIT_WORD_MIN_LENGTH), invariant);
    dRwSplitWordMinWc_ = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_SPLIT_WORD_MIN_WC), invariant);
    dRw1To1WordMinLength_ = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_1TO1_WORD_MIN_LENGTH), invariant);
    dRw1To1WordMinWc_ = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_1TO1_WORD_MIN_WC), invariant);

    // candidates
    cMaxCandNo_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_MAX_CANDIDATE_NO), invariant);
    cNdMaxSplitNo_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_ND_MAX_SPLIT_NO), invariant);
    cNwMaxSplitNo_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MAX_SPLIT_NO), invariant);
    cNwMaxMergeNo_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MAX_MERGE_NO), invariant);
    cNwMergeWithHyphen_ = bool.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MERGE_WITH_HYPHEN));
    cRwMaxSplitNo_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_SPLIT_NO), invariant);
    cRwMaxMergeNo_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_MERGE_NO), invariant);
    cRwMergeWithHyphen_ = bool.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MERGE_WITH_HYPHEN));
    cRwShortSplitWordLength_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_SHORT_SPLIT_WORD_LENGTH), invariant);
    cRwMaxShortSplitWordNo_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_SHORT_SPLIT_WORD_NO), invariant);
    cRwMergeCandMinWc_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MERGE_CAND_MIN_WC), invariant);
    cRwSplitCandMinWc_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_SPLIT_CAND_MIN_WC), invariant);
    cRw1To1CandMinLength_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MIN_LENGTH), invariant);
    cRw1To1CandMinWc_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MIN_WC), invariant);
    cRw1To1CandMaxKeySize_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MAX_KEY_SIZE), invariant);

    // rankers
    rNwS1RankRangeFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_NW_S1_RANK_RANGE_FAC), invariant);
    rNwS1MinOScore_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_NW_S1_MIN_OSCORE), invariant);
    rRw1To1CFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_C_FAC), invariant);
    rRwSplitCFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_SPLIT_C_FAC), invariant);
    rRwMergeCFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_MERGE_C_FAC), invariant);
    rRw1To1WordMinCs_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_WORD_MIN_CS), invariant);
    rRw1To1CandCsFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_CS_FAC), invariant);
    rRw1To1CandMinCs_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_MIN_CS), invariant);
    rRw1To1CandCsDist_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_CS_DIST), invariant);
    rRw1To1CandFsFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_FS_FAC), invariant);
    rRw1To1CandMinFs_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_MIN_FS), invariant);
    rRw1To1CandFsDist_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_FS_DIST), invariant);

    // Score
    orthoScoreEdDistFac_ = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_ED_DIST_FAC), invariant);
    orthoScorePhoneticFac_ = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_PHONETIC_FAC), invariant);
    orthoScoreOverlapFac_ = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_OVERLAP_FAC), invariant);

    // context
    word2VecSkipWord_ = bool.Parse(conf_.GetProperty(Configuration.CS_W2V_SKIP_WORD));
    nw1To1ContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_NW_1TO1_CONTEXT_RADIUS), invariant);
    nwSplitContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_NW_SPLIT_CONTEXT_RADIUS), invariant);
    nwMergeContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_NW_MERGE_CONTEXT_RADIUS), invariant);
    rw1To1ContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_RW_1TO1_CONTEXT_RADIUS), invariant);
    rwSplitContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_RW_SPLIT_CONTEXT_RADIUS), invariant);
    rwMergeContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_RW_MERGE_CONTEXT_RADIUS), invariant);
}
// Test driver body.
// Tokenizes a fixed sample sentence, runs every token through
// InformalExpHandler.Process with the supplied map, joins the processed
// tokens back into text and prints the input text, the output text and
// each token of the input token list.
private static void Test(Dictionary<string, string> informalExpMap)
{
    Console.WriteLine("===== Unit Test of TokenObj =====");

    // sample input and its token list
    string inText = "Contraction: We cant theredve hell. Plz u r good.";
    List<TokenObj> sourceTokens = TextObj.TextToTokenList(inText);

    // process each token, in order, with the informal-expression map
    List<TokenObj> processedTokens =
        (from token in sourceTokens
         select InformalExpHandler.Process(token, informalExpMap)).ToList();

    // rebuild the text from the processed tokens
    string outText = TextObj.TokenListToText(processedTokens);

    // report
    Console.WriteLine("--------- ProcInformalExpression( ) -----------");
    Console.WriteLine($"In: [{inText}]");
    Console.WriteLine($"Out: [{outText}]");
    Console.WriteLine("--------- detail -----------");
    // the detail section lists the tokens of the input list
    sourceTokens.ForEach(token => Console.WriteLine(token.ToString()));
    Console.WriteLine("===== End of Unit Test =====");
}
// Populates this instance's dictionaries, frequency map, word2vec models
// and tuning parameters from the injected configuration object
// (_config.Value), logging before and after.
//
// debugFlag is forwarded to the AddDictionaries2 calls.
private void Init2(bool debugFlag)
{
    _logger.LogInformation("cSpellApi initialization...");

    // read the configuration object once and reuse it for every setting
    var cfg = _config.Value;

    // pre-correction: informal expressions
    infExpMap_ = InformalExpHandler.GetInformalExpMapFromFile(cfg.CS_INFORMAL_EXP_FILE);

    // multi-file dictionaries: spell check, suggestion candidates, split check
    checkDic_.AddDictionaries2(cfg.CS_CHECK_DIC_FILES, debugFlag);
    suggestDic_.AddDictionaries2(cfg.CS_SUGGEST_DIC_FILES, debugFlag);
    splitWordDic_.AddDictionaries2(cfg.CS_SPLIT_WORD_DIC_FILES, debugFlag);

    // single-file dictionaries
    mwDic_.AddDictionary(cfg.CS_MW_DIC_FILE);
    pnDic_.AddDictionary(cfg.CS_PN_DIC_FILE);
    aaDic_.AddDictionary(cfg.CS_AA_DIC_FILE);
    svDic_.AddDictionary(cfg.CS_SV_DIC_FILE);
    unitDic_.AddDictionary(cfg.CS_UNIT_DIC_FILE);

    // frequency map and word2vec models
    wordWcMap_ = new WordWcMap(cfg.CS_FREQUENCY_FILE);
    word2VecIm_ = new Word2Vec(cfg.CS_W2V_IM_FILE);
    word2VecOm_ = new Word2Vec(cfg.CS_W2V_OM_FILE);

    // mode
    funcMode_ = cfg.CS_FUNC_MODE;
    rankMode_ = cfg.CS_RANK_MODE;

    // detectors
    maxLegitTokenLength_ = cfg.CS_MAX_LEGIT_TOKEN_LENGTH;
    dRwSplitWordMinLength_ = cfg.CS_DETECTOR_RW_SPLIT_WORD_MIN_LENGTH;
    dRwSplitWordMinWc_ = cfg.CS_DETECTOR_RW_SPLIT_WORD_MIN_WC;
    dRw1To1WordMinLength_ = cfg.CS_DETECTOR_RW_1TO1_WORD_MIN_LENGTH;
    dRw1To1WordMinWc_ = cfg.CS_DETECTOR_RW_1TO1_WORD_MIN_WC;

    // candidates
    cMaxCandNo_ = cfg.CS_CAN_MAX_CANDIDATE_NO;
    cNdMaxSplitNo_ = cfg.CS_CAN_ND_MAX_SPLIT_NO;
    cNwMaxSplitNo_ = cfg.CS_CAN_NW_MAX_SPLIT_NO;
    cNwMaxMergeNo_ = cfg.CS_CAN_NW_MAX_MERGE_NO;
    cNwMergeWithHyphen_ = cfg.CS_CAN_NW_MERGE_WITH_HYPHEN;
    cRwMaxSplitNo_ = cfg.CS_CAN_RW_MAX_SPLIT_NO;
    cRwMaxMergeNo_ = cfg.CS_CAN_RW_MAX_MERGE_NO;
    cRwMergeWithHyphen_ = cfg.CS_CAN_RW_MERGE_WITH_HYPHEN;
    cRwShortSplitWordLength_ = cfg.CS_CAN_RW_SHORT_SPLIT_WORD_LENGTH;
    cRwMaxShortSplitWordNo_ = cfg.CS_CAN_RW_MAX_SHORT_SPLIT_WORD_NO;
    cRwMergeCandMinWc_ = cfg.CS_CAN_RW_MERGE_CAND_MIN_WC;
    cRwSplitCandMinWc_ = cfg.CS_CAN_RW_SPLIT_CAND_MIN_WC;
    cRw1To1CandMinLength_ = cfg.CS_CAN_RW_1TO1_CAND_MIN_LENGTH;
    cRw1To1CandMinWc_ = cfg.CS_CAN_RW_1TO1_CAND_MIN_WC;
    cRw1To1CandMaxKeySize_ = cfg.CS_CAN_RW_1TO1_CAND_MAX_KEY_SIZE;

    // rankers
    rNwS1RankRangeFac_ = cfg.CS_RANKER_NW_S1_RANK_RANGE_FAC;
    rNwS1MinOScore_ = cfg.CS_RANKER_NW_S1_MIN_OSCORE;
    rRw1To1CFac_ = cfg.CS_RANKER_RW_1TO1_C_FAC;
    rRwSplitCFac_ = cfg.CS_RANKER_RW_SPLIT_C_FAC;
    rRwMergeCFac_ = cfg.CS_RANKER_RW_MERGE_C_FAC;
    rRw1To1WordMinCs_ = cfg.CS_RANKER_RW_1TO1_WORD_MIN_CS;
    rRw1To1CandCsFac_ = cfg.CS_RANKER_RW_1TO1_CAND_CS_FAC;
    rRw1To1CandMinCs_ = cfg.CS_RANKER_RW_1TO1_CAND_MIN_CS;
    rRw1To1CandCsDist_ = cfg.CS_RANKER_RW_1TO1_CAND_CS_DIST;
    rRw1To1CandFsFac_ = cfg.CS_RANKER_RW_1TO1_CAND_FS_FAC;
    rRw1To1CandMinFs_ = cfg.CS_RANKER_RW_1TO1_CAND_MIN_FS;
    rRw1To1CandFsDist_ = cfg.CS_RANKER_RW_1TO1_CAND_FS_DIST;

    // Score
    orthoScoreEdDistFac_ = cfg.CS_ORTHO_SCORE_ED_DIST_FAC;
    orthoScorePhoneticFac_ = cfg.CS_ORTHO_SCORE_PHONETIC_FAC;
    orthoScoreOverlapFac_ = cfg.CS_ORTHO_SCORE_OVERLAP_FAC;

    // context
    word2VecSkipWord_ = cfg.CS_W2V_SKIP_WORD;
    nw1To1ContextRadius_ = cfg.CS_NW_1TO1_CONTEXT_RADIUS;
    nwSplitContextRadius_ = cfg.CS_NW_SPLIT_CONTEXT_RADIUS;
    nwMergeContextRadius_ = cfg.CS_NW_MERGE_CONTEXT_RADIUS;
    rw1To1ContextRadius_ = cfg.CS_RW_1TO1_CONTEXT_RADIUS;
    rwSplitContextRadius_ = cfg.CS_RW_SPLIT_CONTEXT_RADIUS;
    rwMergeContextRadius_ = cfg.CS_RW_MERGE_CONTEXT_RADIUS;

    _logger.LogInformation("cSpellApi initialized successfully");
}