public static bool IsRealWord(string inWord, CSpellApi cSpellApi, bool debugFlag)
        {
            // Decides whether inWord qualifies as a "real word" for real-word
            // split detection. All of the following must hold:
            //   1. the check dictionary accepts it as a valid word
            //   2. it is not a real-word exception (see IsRealWordExceptions)
            //   3. its length is at least the configured minimum
            //   4. the word2vec (Om) model has a vector for it
            //   5. its word count is at least the configured minimum
            // When debugFlag is set, every criterion is evaluated separately and
            // handed to DebugPrint.PrintRwSplitDetect together with the verdict.

            // resources and thresholds supplied by the API object
            RootDictionary checkDic   = cSpellApi.GetCheckDic();
            RootDictionary unitDic    = cSpellApi.GetUnitDic();
            WordWcMap      wordWcMap  = cSpellApi.GetWordWcMap();
            Word2Vec       word2VecOm = cSpellApi.GetWord2VecOm();
            int            wordLen    = inWord.Length;
            // TBD, change method name
            int minLength = cSpellApi.GetDetectorRwSplitWordMinLength();
            int minWc     = cSpellApi.GetDetectorRwSplitWordMinWc();

            // Checks run in the order listed above and stop at the first failure.
            bool isRealWord = false;
            if (checkDic.IsValidWord(inWord) && !IsRealWordExceptions(inWord, unitDic) && (wordLen >= minLength))
            {
                if (word2VecOm.HasWordVec(inWord))
                {
                    isRealWord = WordCountScore.GetWc(inWord, wordWcMap) >= minWc;
                }
            }

            if (debugFlag)
            {
                // Re-evaluate each criterion on its own so the debug output
                // shows all of them, not just those reached before a failure.
                bool inDic       = checkDic.IsValidWord(inWord);
                bool isException = IsRealWordExceptions(inWord, unitDic);
                bool longEnough  = wordLen >= minLength;
                bool hasVector   = word2VecOm.HasWordVec(inWord);
                bool wcHighEnough = WordCountScore.GetWc(inWord, wordWcMap) >= minWc;
                DebugPrint.PrintRwSplitDetect(inWord, isRealWord, inDic, isException, longEnough, hasVector, wcHighEnough, debugFlag);
            }
            return isRealWord;
        }
        // private methods
        // A merge candidate is valid when its core merged word
        //   1. has a vector in the word2vec (Om) model, and
        //   2. has a word count of at least the configured merge minimum.
        private static bool IsValidMergeCand(MergeObj mergeObj, CSpellApi cSpellApi)
        {
            WordWcMap wcMap      = cSpellApi.GetWordWcMap();
            Word2Vec  w2vOm      = cSpellApi.GetWord2VecOm();
            string    mergedCore = mergeObj.GetCoreMergeWord();
            int       minWc      = cSpellApi.GetCanRwMergeCandMinWc();

            // The word count is only looked up when a vector exists.
            if (!w2vOm.HasWordVec(mergedCore))
            {
                return false;
            }
            return WordCountScore.GetWc(mergedCore, wcMap) >= minWc;
        }
        // real-word candidate has more restriction than non-word
        // TBD, need to organize the code ...
        // the check should be done in the ranking
        // Core process for real-word candidates
        // Decides whether cand is an acceptable 1-to-1 real-word correction
        // candidate for inWord.
        //
        // The candidate itself must
        //   - be in the suggestion dictionary,
        //   - have a vector in the word2vec (Om) model,
        //   - be at least the configured minimum length,
        //   - have at least the configured minimum word count, and
        //   - not be an inflectional variant of the (lowercased) input word.
        // It is then accepted when either
        //   - both phonetic distances (Metaphone2, RefinedSoundex) are 0, or
        //   - at least one phonetic distance is 0 and the combined
        //     orthographic + phonetic distances stay under fixed thresholds.
        private static bool IsValid1To1Cand(string inWord, string cand, CSpellApi cSpellApi)
        {
            RootDictionary suggestDic = cSpellApi.GetSuggestDic();
            Word2Vec       word2VecOm = cSpellApi.GetWord2VecOm();
            WordWcMap      wordWcMap  = cSpellApi.GetWordWcMap();
            int    rw1To1CandMinWc     = cSpellApi.GetCanRw1To1CandMinWc();
            int    rw1To1CandMinLength = cSpellApi.GetCanRw1To1CandMinLength();
            // Lowercase with the invariant culture: the result feeds dictionary-
            // style comparisons and must not vary with the thread culture
            // (e.g. "I" lowercases to a dotless i under Turkish cultures).
            string inWordLc = inWord.ToLowerInvariant();

            // 1. candidate requirements: suggDic, word2Vec, length, WC, not inflVars
            bool candQualifies = suggestDic.IsDicWord(cand)
                && word2VecOm.HasWordVec(cand)
                && (cand.Length >= rw1To1CandMinLength)
                && (WordCountScore.GetWc(cand, wordWcMap) >= rw1To1CandMinWc)
                && !InflVarsUtil.IsInflectionVar(inWordLc, cand);
            if (!candQualifies)
            {
                return false;
            }

            // 2. more restriction for real-word candidates: phonetic similarity
            int pmDist = Metaphone2.GetDistance(inWordLc, cand);
            int prDist = RefinedSoundex.GetDistance(inWordLc, cand);

            // if they sound the same
            if ((pmDist == 0) && (prDist == 0))
            {
                return true;
            }
            // otherwise at least one of the two phonetic codes must match;
            // skip the orthographic distances when neither does
            if ((pmDist != 0) && (prDist != 0))
            {
                return false;
            }

            // if they sound similar and the orthography is also similar
            // thresholds fixed from empirical tests, not configurable
            int leadDist   = GetLeadCharDist(inWordLc, cand);
            int endDist    = GetEndCharDist(inWordLc, cand);
            int lengthDist = GetLengthDist(inWordLc, cand);
            int totalDist1 = leadDist + endDist + lengthDist + pmDist + prDist;
            int editDist   = EditDistance.GetDistanceForRealWord(inWordLc, cand);
            int totalDist2 = editDist + pmDist + prDist;

            return (totalDist1 < 3) && (totalDist2 < 4);
        }
        // For split candidates, acronyms/abbreviations (Aa) are not accepted as
        // valid words, because they cause too much noise (lower precision).
        // TBD: re-organize
        // A word is a valid real-word split component when it
        //   1. is in the split-word dictionary,
        //   2. has a vector in the word2vec (Om) model,
        //   3. has a word count of at least the configured split minimum,
        //   4. is not a unit (e.g. mg), and
        //   5. is not a proper noun (e.g. human -> Hu man, where Hu is a pn;
        //      children -> child ren, where ren is a pn).
        // The checks run in this order and stop at the first failure.
        private static bool IsValidSplitWord(string inWord, CSpellApi cSpellApi)
        {
            RootDictionary splitDic   = cSpellApi.GetSplitWordDic();
            WordWcMap      wcMap      = cSpellApi.GetWordWcMap();
            Word2Vec       w2vOm      = cSpellApi.GetWord2VecOm();
            RootDictionary units      = cSpellApi.GetUnitDic();
            RootDictionary properNouns = cSpellApi.GetPnDic();
            int            minWc      = cSpellApi.GetCanRwSplitCandMinWc();

            if (!splitDic.IsDicWord(inWord))
            {
                return false;
            }
            if (!w2vOm.HasWordVec(inWord))
            {
                return false;
            }
            if (WordCountScore.GetWc(inWord, wcMap) < minWc)
            {
                return false;
            }
            if (units.IsDicWord(inWord))
            {
                return false;
            }
            return !properNouns.IsDicWord(inWord);
        }
Exemplo n.º 5
0
        // update parameter from the config file to cSpellApi
        // Reads the configuration and populates this instance from it:
        // informal-expression map, dictionaries, word-frequency map, word2vec
        // models, and all detector/candidate/ranker/score/context parameters.
        //
        // debugFlag is passed through to the AddDictionaries calls.
        //
        // Numeric properties are parsed with the invariant culture, so a value
        // such as "0.05" is read the same way regardless of the current thread
        // culture. A property value that cannot be parsed makes the
        // corresponding Parse call throw.
        private void Init(bool debugFlag)
        {
            // Configuration values are machine-readable, not user-facing text.
            System.Globalization.CultureInfo inv = System.Globalization.CultureInfo.InvariantCulture;

            // no config file given: fall back to the default one, located via the class path
            bool useClassPath = false;

            if (configFile_ == null)
            {
                useClassPath = true;
                configFile_  = "data.Config.cSpell";
            }
            // read in configuration file
            conf_ = new Configuration(configFile_, useClassPath);
            if (properties_ != null)
            {
                conf_.OverwriteProperties(properties_);
            }
            string cSpellDir = conf_.GetProperty(Configuration.CS_DIR);
            // files: pre-correction
            string infExpFile = cSpellDir + conf_.GetProperty(Configuration.CS_INFORMAL_EXP_FILE);

            infExpMap_ = InformalExpHandler.GetInformalExpMapFromFile(infExpFile);
            // get dictionary for spell checker
            string checkDicFileStrs = conf_.GetProperty(Configuration.CS_CHECK_DIC_FILES);

            checkDic_.AddDictionaries(checkDicFileStrs, cSpellDir, debugFlag);
            // get dictionary for spell suggestion - candidate
            string suggestDicFileStrs = conf_.GetProperty(Configuration.CS_SUGGEST_DIC_FILES);

            suggestDic_.AddDictionaries(suggestDicFileStrs, cSpellDir, debugFlag);
            // no acr/abb dictionary: en + pn, used for split check
            string splitWordDicFileStrs = conf_.GetProperty(Configuration.CS_SPLIT_WORD_DIC_FILES);

            splitWordDic_.AddDictionaries(splitWordDicFileStrs, cSpellDir, debugFlag);
            // mw dictionary
            string mwDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_MW_DIC_FILE);

            mwDic_.AddDictionary(mwDicFile);
            // properNoun dictionary
            string pnDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_PN_DIC_FILE);

            pnDic_.AddDictionary(pnDicFile);
            // abb/acr dictionary
            string aaDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_AA_DIC_FILE);

            aaDic_.AddDictionary(aaDicFile);
            // spVar dictionary
            string svDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_SV_DIC_FILE);

            svDic_.AddDictionary(svDicFile);
            // unit file
            string unitDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_UNIT_DIC_FILE);

            unitDic_.AddDictionary(unitDicFile);
            // frequency file
            string frequencyFile = cSpellDir + conf_.GetProperty(Configuration.CS_FREQUENCY_FILE);

            wordWcMap_ = new WordWcMap(frequencyFile);
            // word2Vec files
            string word2VecImFile = cSpellDir + conf_.GetProperty(Configuration.CS_W2V_IM_FILE);

            word2VecIm_ = new Word2Vec(word2VecImFile);
            string word2VecOmFile = cSpellDir + conf_.GetProperty(Configuration.CS_W2V_OM_FILE);

            word2VecOm_ = new Word2Vec(word2VecOmFile);
            // mode
            funcMode_ = int.Parse(conf_.GetProperty(Configuration.CS_FUNC_MODE), inv);
            rankMode_ = int.Parse(conf_.GetProperty(Configuration.CS_RANK_MODE), inv);
            // detectors
            maxLegitTokenLength_   = int.Parse(conf_.GetProperty(Configuration.CS_MAX_LEGIT_TOKEN_LENGTH), inv);
            dRwSplitWordMinLength_ = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_SPLIT_WORD_MIN_LENGTH), inv);
            dRwSplitWordMinWc_     = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_SPLIT_WORD_MIN_WC), inv);
            dRw1To1WordMinLength_  = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_1TO1_WORD_MIN_LENGTH), inv);
            dRw1To1WordMinWc_      = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_1TO1_WORD_MIN_WC), inv);
            // candidates
            cMaxCandNo_         = int.Parse(conf_.GetProperty(Configuration.CS_CAN_MAX_CANDIDATE_NO), inv);
            cNdMaxSplitNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_ND_MAX_SPLIT_NO), inv);
            cNwMaxSplitNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MAX_SPLIT_NO), inv);
            cNwMaxMergeNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MAX_MERGE_NO), inv);
            cNwMergeWithHyphen_ = bool.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MERGE_WITH_HYPHEN));
            cRwMaxSplitNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_SPLIT_NO), inv);
            cRwMaxMergeNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_MERGE_NO), inv);
            cRwMergeWithHyphen_ = bool.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MERGE_WITH_HYPHEN));

            cRwShortSplitWordLength_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_SHORT_SPLIT_WORD_LENGTH), inv);
            cRwMaxShortSplitWordNo_  = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_SHORT_SPLIT_WORD_NO), inv);
            cRwMergeCandMinWc_       = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MERGE_CAND_MIN_WC), inv);
            cRwSplitCandMinWc_       = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_SPLIT_CAND_MIN_WC), inv);
            cRw1To1CandMinLength_    = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MIN_LENGTH), inv);
            cRw1To1CandMinWc_        = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MIN_WC), inv);
            cRw1To1CandMaxKeySize_   = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MAX_KEY_SIZE), inv);

            // rankers
            rNwS1RankRangeFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_NW_S1_RANK_RANGE_FAC), inv);
            rNwS1MinOScore_    = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_NW_S1_MIN_OSCORE), inv);
            rRw1To1CFac_       = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_C_FAC), inv);
            rRwSplitCFac_      = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_SPLIT_C_FAC), inv);
            rRwMergeCFac_      = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_MERGE_C_FAC), inv);
            rRw1To1WordMinCs_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_WORD_MIN_CS), inv);
            rRw1To1CandCsFac_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_CS_FAC), inv);
            rRw1To1CandMinCs_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_MIN_CS), inv);
            rRw1To1CandCsDist_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_CS_DIST), inv);
            rRw1To1CandFsFac_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_FS_FAC), inv);
            rRw1To1CandMinFs_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_MIN_FS), inv);
            rRw1To1CandFsDist_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_FS_DIST), inv);

            // Score
            orthoScoreEdDistFac_   = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_ED_DIST_FAC), inv);
            orthoScorePhoneticFac_ = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_PHONETIC_FAC), inv);
            orthoScoreOverlapFac_  = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_OVERLAP_FAC), inv);

            // context
            word2VecSkipWord_     = bool.Parse(conf_.GetProperty(Configuration.CS_W2V_SKIP_WORD));
            nw1To1ContextRadius_  = int.Parse(conf_.GetProperty(Configuration.CS_NW_1TO1_CONTEXT_RADIUS), inv);
            nwSplitContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_NW_SPLIT_CONTEXT_RADIUS), inv);
            nwMergeContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_NW_MERGE_CONTEXT_RADIUS), inv);
            rw1To1ContextRadius_  = int.Parse(conf_.GetProperty(Configuration.CS_RW_1TO1_CONTEXT_RADIUS), inv);
            rwSplitContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_RW_SPLIT_CONTEXT_RADIUS), inv);
            rwMergeContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_RW_MERGE_CONTEXT_RADIUS), inv);
        }
Exemplo n.º 6
0
        // Populates this instance from the typed options object (_config.Value):
        // informal-expression map, dictionaries, word-frequency map, word2vec
        // models, and all detector/candidate/ranker/score/context parameters.
        // Logs an informational message before and after the work.
        //
        // debugFlag is passed through to the AddDictionaries2 calls.
        private void Init2(bool debugFlag)
        {
            _logger.LogInformation("cSpellApi initialization...");

            // read the options object once and use it for every setting below
            var settings = _config.Value;

            // pre-correction: informal expressions
            infExpMap_ = InformalExpHandler.GetInformalExpMapFromFile(settings.CS_INFORMAL_EXP_FILE);

            // dictionaries
            checkDic_.AddDictionaries2(settings.CS_CHECK_DIC_FILES, debugFlag);
            suggestDic_.AddDictionaries2(settings.CS_SUGGEST_DIC_FILES, debugFlag);
            splitWordDic_.AddDictionaries2(settings.CS_SPLIT_WORD_DIC_FILES, debugFlag);
            mwDic_.AddDictionary(settings.CS_MW_DIC_FILE);
            pnDic_.AddDictionary(settings.CS_PN_DIC_FILE);
            aaDic_.AddDictionary(settings.CS_AA_DIC_FILE);
            svDic_.AddDictionary(settings.CS_SV_DIC_FILE);
            unitDic_.AddDictionary(settings.CS_UNIT_DIC_FILE);

            // frequency map and word2vec models
            wordWcMap_  = new WordWcMap(settings.CS_FREQUENCY_FILE);
            word2VecIm_ = new Word2Vec(settings.CS_W2V_IM_FILE);
            word2VecOm_ = new Word2Vec(settings.CS_W2V_OM_FILE);

            // mode
            funcMode_ = settings.CS_FUNC_MODE;
            rankMode_ = settings.CS_RANK_MODE;

            // detectors
            maxLegitTokenLength_   = settings.CS_MAX_LEGIT_TOKEN_LENGTH;
            dRwSplitWordMinLength_ = settings.CS_DETECTOR_RW_SPLIT_WORD_MIN_LENGTH;
            dRwSplitWordMinWc_     = settings.CS_DETECTOR_RW_SPLIT_WORD_MIN_WC;
            dRw1To1WordMinLength_  = settings.CS_DETECTOR_RW_1TO1_WORD_MIN_LENGTH;
            dRw1To1WordMinWc_      = settings.CS_DETECTOR_RW_1TO1_WORD_MIN_WC;

            // candidates
            cMaxCandNo_         = settings.CS_CAN_MAX_CANDIDATE_NO;
            cNdMaxSplitNo_      = settings.CS_CAN_ND_MAX_SPLIT_NO;
            cNwMaxSplitNo_      = settings.CS_CAN_NW_MAX_SPLIT_NO;
            cNwMaxMergeNo_      = settings.CS_CAN_NW_MAX_MERGE_NO;
            cNwMergeWithHyphen_ = settings.CS_CAN_NW_MERGE_WITH_HYPHEN;
            cRwMaxSplitNo_      = settings.CS_CAN_RW_MAX_SPLIT_NO;
            cRwMaxMergeNo_      = settings.CS_CAN_RW_MAX_MERGE_NO;
            cRwMergeWithHyphen_ = settings.CS_CAN_RW_MERGE_WITH_HYPHEN;

            cRwShortSplitWordLength_ = settings.CS_CAN_RW_SHORT_SPLIT_WORD_LENGTH;
            cRwMaxShortSplitWordNo_  = settings.CS_CAN_RW_MAX_SHORT_SPLIT_WORD_NO;
            cRwMergeCandMinWc_       = settings.CS_CAN_RW_MERGE_CAND_MIN_WC;
            cRwSplitCandMinWc_       = settings.CS_CAN_RW_SPLIT_CAND_MIN_WC;
            cRw1To1CandMinLength_    = settings.CS_CAN_RW_1TO1_CAND_MIN_LENGTH;
            cRw1To1CandMinWc_        = settings.CS_CAN_RW_1TO1_CAND_MIN_WC;
            cRw1To1CandMaxKeySize_   = settings.CS_CAN_RW_1TO1_CAND_MAX_KEY_SIZE;

            // rankers
            rNwS1RankRangeFac_ = settings.CS_RANKER_NW_S1_RANK_RANGE_FAC;
            rNwS1MinOScore_    = settings.CS_RANKER_NW_S1_MIN_OSCORE;
            rRw1To1CFac_       = settings.CS_RANKER_RW_1TO1_C_FAC;
            rRwSplitCFac_      = settings.CS_RANKER_RW_SPLIT_C_FAC;
            rRwMergeCFac_      = settings.CS_RANKER_RW_MERGE_C_FAC;
            rRw1To1WordMinCs_  = settings.CS_RANKER_RW_1TO1_WORD_MIN_CS;
            rRw1To1CandCsFac_  = settings.CS_RANKER_RW_1TO1_CAND_CS_FAC;
            rRw1To1CandMinCs_  = settings.CS_RANKER_RW_1TO1_CAND_MIN_CS;
            rRw1To1CandCsDist_ = settings.CS_RANKER_RW_1TO1_CAND_CS_DIST;
            rRw1To1CandFsFac_  = settings.CS_RANKER_RW_1TO1_CAND_FS_FAC;
            rRw1To1CandMinFs_  = settings.CS_RANKER_RW_1TO1_CAND_MIN_FS;
            rRw1To1CandFsDist_ = settings.CS_RANKER_RW_1TO1_CAND_FS_DIST;

            // score
            orthoScoreEdDistFac_   = settings.CS_ORTHO_SCORE_ED_DIST_FAC;
            orthoScorePhoneticFac_ = settings.CS_ORTHO_SCORE_PHONETIC_FAC;
            orthoScoreOverlapFac_  = settings.CS_ORTHO_SCORE_OVERLAP_FAC;

            // context
            word2VecSkipWord_     = settings.CS_W2V_SKIP_WORD;
            nw1To1ContextRadius_  = settings.CS_NW_1TO1_CONTEXT_RADIUS;
            nwSplitContextRadius_ = settings.CS_NW_SPLIT_CONTEXT_RADIUS;
            nwMergeContextRadius_ = settings.CS_NW_MERGE_CONTEXT_RADIUS;
            rw1To1ContextRadius_  = settings.CS_RW_1TO1_CONTEXT_RADIUS;
            rwSplitContextRadius_ = settings.CS_RW_SPLIT_CONTEXT_RADIUS;
            rwMergeContextRadius_ = settings.CS_RW_MERGE_CONTEXT_RADIUS;

            _logger.LogInformation("cSpellApi initialized successfully");
        }