Ejemplo n.º 1
0
        // Command-line test driver.
        //
        // args: no arguments  -> use the default informal-expression file;
        //       one argument  -> use it as the informal-expression file path;
        //       anything else -> print the usage line and terminate the process.
        //
        // NOTE(review): the usage text still mentions "java TokenObj" and the
        // process exits with status 0 on bad usage - confirm both are intended.
        public static void MainTest(string[] args)
        {
            // Reject more than one argument up front.
            if (args.Length > 1)
            {
                Console.WriteLine("Usage: java TokenObj <inFile>");
                Environment.Exit(0);
            }

            string inFile = (args.Length == 1) ? args[0] : "../data/informalExpression.txt";

            // Load the informal-expression map, then hand it to the unit test.
            Dictionary<string, string> informalExpMap = InformalExpHandler.GetInformalExpMapFromFile(inFile);
            Test(informalExpMap);
        }
Ejemplo n.º 2
0
        // Reads the cSpell configuration and copies its settings into this
        // instance: the informal-expression map, the dictionaries, the word
        // frequency map, the two word2Vec models and all numeric/boolean options.
        //
        // When configFile_ is null it is set to "data.Config.cSpell" and the
        // Configuration is constructed with useClassPath = true. When properties_
        // is not null it is applied on top of the loaded configuration.
        //
        // debugFlag is forwarded unchanged to the AddDictionaries calls.
        //
        // Numeric options are parsed with the invariant culture, so a value such
        // as "0.05" is read the same way whatever the current culture of the
        // process is. int.Parse / double.Parse / bool.Parse throw if a property
        // is missing (null) or malformed; those exceptions are not caught here.
        private void Init(bool debugFlag)
        {
            // Configuration values are machine-readable text, not user-facing text.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            // No explicit config file: fall back to the built-in one.
            bool useClassPath = configFile_ == null;
            if (useClassPath)
            {
                configFile_ = "data.Config.cSpell";
            }

            // read in configuration file
            conf_ = new Configuration(configFile_, useClassPath);
            if (properties_ != null)
            {
                conf_.OverwriteProperties(properties_);
            }
            string cSpellDir = conf_.GetProperty(Configuration.CS_DIR);

            // files: pre-correction
            string infExpFile = cSpellDir + conf_.GetProperty(Configuration.CS_INFORMAL_EXP_FILE);
            infExpMap_ = InformalExpHandler.GetInformalExpMapFromFile(infExpFile);

            // dictionary for spell checker
            string checkDicFileStrs = conf_.GetProperty(Configuration.CS_CHECK_DIC_FILES);
            checkDic_.AddDictionaries(checkDicFileStrs, cSpellDir, debugFlag);

            // dictionary for spell suggestion - candidate
            string suggestDicFileStrs = conf_.GetProperty(Configuration.CS_SUGGEST_DIC_FILES);
            suggestDic_.AddDictionaries(suggestDicFileStrs, cSpellDir, debugFlag);

            // no acr/abb dictionary: en + pn, used for split check
            string splitWordDicFileStrs = conf_.GetProperty(Configuration.CS_SPLIT_WORD_DIC_FILES);
            splitWordDic_.AddDictionaries(splitWordDicFileStrs, cSpellDir, debugFlag);

            // mw dictionary
            string mwDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_MW_DIC_FILE);
            mwDic_.AddDictionary(mwDicFile);

            // properNoun dictionary
            string pnDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_PN_DIC_FILE);
            pnDic_.AddDictionary(pnDicFile);

            // abb/acr dictionary
            string aaDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_AA_DIC_FILE);
            aaDic_.AddDictionary(aaDicFile);

            // spVar dictionary
            string svDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_SV_DIC_FILE);
            svDic_.AddDictionary(svDicFile);

            // unit file
            string unitDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_UNIT_DIC_FILE);
            unitDic_.AddDictionary(unitDicFile);

            // frequency file
            string frequencyFile = cSpellDir + conf_.GetProperty(Configuration.CS_FREQUENCY_FILE);
            wordWcMap_ = new WordWcMap(frequencyFile);

            // word2Vec files
            string word2VecImFile = cSpellDir + conf_.GetProperty(Configuration.CS_W2V_IM_FILE);
            word2VecIm_ = new Word2Vec(word2VecImFile);
            string word2VecOmFile = cSpellDir + conf_.GetProperty(Configuration.CS_W2V_OM_FILE);
            word2VecOm_ = new Word2Vec(word2VecOmFile);

            // mode
            funcMode_ = int.Parse(conf_.GetProperty(Configuration.CS_FUNC_MODE), invariant);
            rankMode_ = int.Parse(conf_.GetProperty(Configuration.CS_RANK_MODE), invariant);

            // detectors
            maxLegitTokenLength_   = int.Parse(conf_.GetProperty(Configuration.CS_MAX_LEGIT_TOKEN_LENGTH), invariant);
            dRwSplitWordMinLength_ = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_SPLIT_WORD_MIN_LENGTH), invariant);
            dRwSplitWordMinWc_     = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_SPLIT_WORD_MIN_WC), invariant);
            dRw1To1WordMinLength_  = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_1TO1_WORD_MIN_LENGTH), invariant);
            dRw1To1WordMinWc_      = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_1TO1_WORD_MIN_WC), invariant);

            // candidates
            cMaxCandNo_         = int.Parse(conf_.GetProperty(Configuration.CS_CAN_MAX_CANDIDATE_NO), invariant);
            cNdMaxSplitNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_ND_MAX_SPLIT_NO), invariant);
            cNwMaxSplitNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MAX_SPLIT_NO), invariant);
            cNwMaxMergeNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MAX_MERGE_NO), invariant);
            cNwMergeWithHyphen_ = bool.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MERGE_WITH_HYPHEN));
            cRwMaxSplitNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_SPLIT_NO), invariant);
            cRwMaxMergeNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_MERGE_NO), invariant);
            cRwMergeWithHyphen_ = bool.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MERGE_WITH_HYPHEN));

            cRwShortSplitWordLength_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_SHORT_SPLIT_WORD_LENGTH), invariant);
            cRwMaxShortSplitWordNo_  = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_SHORT_SPLIT_WORD_NO), invariant);
            cRwMergeCandMinWc_       = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MERGE_CAND_MIN_WC), invariant);
            cRwSplitCandMinWc_       = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_SPLIT_CAND_MIN_WC), invariant);
            cRw1To1CandMinLength_    = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MIN_LENGTH), invariant);
            cRw1To1CandMinWc_        = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MIN_WC), invariant);
            cRw1To1CandMaxKeySize_   = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MAX_KEY_SIZE), invariant);

            // rankers
            rNwS1RankRangeFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_NW_S1_RANK_RANGE_FAC), invariant);
            rNwS1MinOScore_    = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_NW_S1_MIN_OSCORE), invariant);
            rRw1To1CFac_       = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_C_FAC), invariant);
            rRwSplitCFac_      = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_SPLIT_C_FAC), invariant);
            rRwMergeCFac_      = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_MERGE_C_FAC), invariant);
            rRw1To1WordMinCs_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_WORD_MIN_CS), invariant);
            rRw1To1CandCsFac_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_CS_FAC), invariant);
            rRw1To1CandMinCs_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_MIN_CS), invariant);
            rRw1To1CandCsDist_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_CS_DIST), invariant);
            rRw1To1CandFsFac_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_FS_FAC), invariant);
            rRw1To1CandMinFs_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_MIN_FS), invariant);
            rRw1To1CandFsDist_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_FS_DIST), invariant);

            // Score
            orthoScoreEdDistFac_   = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_ED_DIST_FAC), invariant);
            orthoScorePhoneticFac_ = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_PHONETIC_FAC), invariant);
            orthoScoreOverlapFac_  = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_OVERLAP_FAC), invariant);

            // context
            word2VecSkipWord_     = bool.Parse(conf_.GetProperty(Configuration.CS_W2V_SKIP_WORD));
            nw1To1ContextRadius_  = int.Parse(conf_.GetProperty(Configuration.CS_NW_1TO1_CONTEXT_RADIUS), invariant);
            nwSplitContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_NW_SPLIT_CONTEXT_RADIUS), invariant);
            nwMergeContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_NW_MERGE_CONTEXT_RADIUS), invariant);
            rw1To1ContextRadius_  = int.Parse(conf_.GetProperty(Configuration.CS_RW_1TO1_CONTEXT_RADIUS), invariant);
            rwSplitContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_RW_SPLIT_CONTEXT_RADIUS), invariant);
            rwMergeContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_RW_MERGE_CONTEXT_RADIUS), invariant);
        }
Ejemplo n.º 3
0
        // Console smoke test: tokenizes a fixed sample sentence, runs every token
        // through InformalExpHandler.Process with the supplied map, re-joins the
        // results and prints the input text, the output text and a token dump.
        private static void Test(Dictionary <string, string> informalExpMap)
        {
            Console.WriteLine("===== Unit Test of TokenObj =====");

            // Sample text and its tokens.
            string inText = "Contraction: We cant theredve hell. Plz u r  good.";
            List<TokenObj> inTokenList = TextObj.TextToTokenList(inText);

            // Process the tokens one by one, keeping their order.
            var outTokenList = new List<TokenObj>();
            foreach (TokenObj inToken in inTokenList)
            {
                outTokenList.Add(InformalExpHandler.Process(inToken, informalExpMap));
            }

            // Join the processed tokens back into a single string.
            string outText = TextObj.TokenListToText(outTokenList);

            // Report.
            Console.WriteLine("--------- ProcInformalExpression( ) -----------");
            Console.WriteLine($"In: [{inText}]");
            Console.WriteLine($"Out: [{outText}]");
            Console.WriteLine("--------- detail -----------");
            // NOTE(review): the dump walks the input token list, not the
            // processed one - confirm this is intended.
            foreach (TokenObj dumpToken in inTokenList)
            {
                Console.WriteLine(dumpToken.ToString());
            }
            Console.WriteLine("===== End of Unit Test =====");
        }
Ejemplo n.º 4
0
        // Populates this instance from the injected options object (_config.Value):
        // loads the informal-expression map, the dictionaries, the word frequency
        // map and both word2Vec models, then copies every tuning option into the
        // matching field. Logs one message before and one after the work.
        //
        // debugFlag is forwarded unchanged to the AddDictionaries2 calls.
        private void Init2(bool debugFlag)
        {
            _logger.LogInformation("cSpellApi initialization...");

            // Read the options object once; every setting below comes from it.
            var settings = _config.Value;

            // Resources: informal expressions, dictionaries, frequency map, word2Vec.
            infExpMap_ = InformalExpHandler.GetInformalExpMapFromFile(settings.CS_INFORMAL_EXP_FILE);
            checkDic_.AddDictionaries2(settings.CS_CHECK_DIC_FILES, debugFlag);
            suggestDic_.AddDictionaries2(settings.CS_SUGGEST_DIC_FILES, debugFlag);
            splitWordDic_.AddDictionaries2(settings.CS_SPLIT_WORD_DIC_FILES, debugFlag);
            mwDic_.AddDictionary(settings.CS_MW_DIC_FILE);
            pnDic_.AddDictionary(settings.CS_PN_DIC_FILE);
            aaDic_.AddDictionary(settings.CS_AA_DIC_FILE);
            svDic_.AddDictionary(settings.CS_SV_DIC_FILE);
            unitDic_.AddDictionary(settings.CS_UNIT_DIC_FILE);
            wordWcMap_ = new WordWcMap(settings.CS_FREQUENCY_FILE);
            word2VecIm_ = new Word2Vec(settings.CS_W2V_IM_FILE);
            word2VecOm_ = new Word2Vec(settings.CS_W2V_OM_FILE);

            // Function and rank modes.
            funcMode_ = settings.CS_FUNC_MODE;
            rankMode_ = settings.CS_RANK_MODE;

            // Detector options.
            maxLegitTokenLength_ = settings.CS_MAX_LEGIT_TOKEN_LENGTH;
            dRwSplitWordMinLength_ = settings.CS_DETECTOR_RW_SPLIT_WORD_MIN_LENGTH;
            dRwSplitWordMinWc_ = settings.CS_DETECTOR_RW_SPLIT_WORD_MIN_WC;
            dRw1To1WordMinLength_ = settings.CS_DETECTOR_RW_1TO1_WORD_MIN_LENGTH;
            dRw1To1WordMinWc_ = settings.CS_DETECTOR_RW_1TO1_WORD_MIN_WC;

            // Candidate options.
            cMaxCandNo_ = settings.CS_CAN_MAX_CANDIDATE_NO;
            cNdMaxSplitNo_ = settings.CS_CAN_ND_MAX_SPLIT_NO;
            cNwMaxSplitNo_ = settings.CS_CAN_NW_MAX_SPLIT_NO;
            cNwMaxMergeNo_ = settings.CS_CAN_NW_MAX_MERGE_NO;
            cNwMergeWithHyphen_ = settings.CS_CAN_NW_MERGE_WITH_HYPHEN;
            cRwMaxSplitNo_ = settings.CS_CAN_RW_MAX_SPLIT_NO;
            cRwMaxMergeNo_ = settings.CS_CAN_RW_MAX_MERGE_NO;
            cRwMergeWithHyphen_ = settings.CS_CAN_RW_MERGE_WITH_HYPHEN;
            cRwShortSplitWordLength_ = settings.CS_CAN_RW_SHORT_SPLIT_WORD_LENGTH;
            cRwMaxShortSplitWordNo_ = settings.CS_CAN_RW_MAX_SHORT_SPLIT_WORD_NO;
            cRwMergeCandMinWc_ = settings.CS_CAN_RW_MERGE_CAND_MIN_WC;
            cRwSplitCandMinWc_ = settings.CS_CAN_RW_SPLIT_CAND_MIN_WC;
            cRw1To1CandMinLength_ = settings.CS_CAN_RW_1TO1_CAND_MIN_LENGTH;
            cRw1To1CandMinWc_ = settings.CS_CAN_RW_1TO1_CAND_MIN_WC;
            cRw1To1CandMaxKeySize_ = settings.CS_CAN_RW_1TO1_CAND_MAX_KEY_SIZE;

            // Ranker options.
            rNwS1RankRangeFac_ = settings.CS_RANKER_NW_S1_RANK_RANGE_FAC;
            rNwS1MinOScore_ = settings.CS_RANKER_NW_S1_MIN_OSCORE;
            rRw1To1CFac_ = settings.CS_RANKER_RW_1TO1_C_FAC;
            rRwSplitCFac_ = settings.CS_RANKER_RW_SPLIT_C_FAC;
            rRwMergeCFac_ = settings.CS_RANKER_RW_MERGE_C_FAC;
            rRw1To1WordMinCs_ = settings.CS_RANKER_RW_1TO1_WORD_MIN_CS;
            rRw1To1CandCsFac_ = settings.CS_RANKER_RW_1TO1_CAND_CS_FAC;
            rRw1To1CandMinCs_ = settings.CS_RANKER_RW_1TO1_CAND_MIN_CS;
            rRw1To1CandCsDist_ = settings.CS_RANKER_RW_1TO1_CAND_CS_DIST;
            rRw1To1CandFsFac_ = settings.CS_RANKER_RW_1TO1_CAND_FS_FAC;
            rRw1To1CandMinFs_ = settings.CS_RANKER_RW_1TO1_CAND_MIN_FS;
            rRw1To1CandFsDist_ = settings.CS_RANKER_RW_1TO1_CAND_FS_DIST;

            // Orthographic score factors.
            orthoScoreEdDistFac_ = settings.CS_ORTHO_SCORE_ED_DIST_FAC;
            orthoScorePhoneticFac_ = settings.CS_ORTHO_SCORE_PHONETIC_FAC;
            orthoScoreOverlapFac_ = settings.CS_ORTHO_SCORE_OVERLAP_FAC;

            // Context options.
            word2VecSkipWord_ = settings.CS_W2V_SKIP_WORD;
            nw1To1ContextRadius_ = settings.CS_NW_1TO1_CONTEXT_RADIUS;
            nwSplitContextRadius_ = settings.CS_NW_SPLIT_CONTEXT_RADIUS;
            nwMergeContextRadius_ = settings.CS_NW_MERGE_CONTEXT_RADIUS;
            rw1To1ContextRadius_ = settings.CS_RW_1TO1_CONTEXT_RADIUS;
            rwSplitContextRadius_ = settings.CS_RW_SPLIT_CONTEXT_RADIUS;
            rwMergeContextRadius_ = settings.CS_RW_MERGE_CONTEXT_RADIUS;

            _logger.LogInformation("cSpellApi initialized successfully");
        }