Example #1
        public void TestGetSampleSentence()
        {
            var dictPath = PutTestFileOnDisk("HPBook3Dict.json");

            Assert.IsTrue(System.IO.File.Exists(dictPath));
            var dictText = System.IO.File.ReadAllText(dictPath);

            Assert.IsNotNull(dictText);
            var dict = Newtonsoft.Json.JsonConvert.DeserializeObject<Dictionary<string, int>>(dictText);

            Assert.IsNotNull(dict);
            Assert.IsFalse(dict.Count == 0);

            var testing = new Word2Vec();

            testing.Sample = 0;
            testing.BuildVocab(dict);
            var corpus =
                PutTestFileOnDisk("HPBook3.txt");

            testing.AssignCorpus(corpus);
            var testResult = testing.GetContextNodes(8);

            Assert.IsNotNull(testResult);
            Assert.AreNotEqual(0, testResult.Count);
            Console.WriteLine(string.Join(", ", testResult.Select(t => t.Word)));
        }
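For reference, BuildVocab in this test consumes a word-frequency dictionary deserialized from JSON. A minimal sketch of the same call with a hypothetical in-memory frequency map (the words and counts below are made up and are not the contents of HPBook3Dict.json):

            // Sketch only: build the vocabulary from a small hand-made frequency map
            // instead of the JSON fixture used in the test above.
            var toyCounts = new Dictionary<string, int>
            {
                ["harry"] = 120,
                ["wand"]  = 35,
                ["owl"]   = 18
            };
            var w2v = new Word2Vec { Sample = 0 };
            w2v.BuildVocab(toyCounts);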
Example #2
        public void Setup()
        {
            // Launching MKL for NumNet (the path might need to be changed)
            var path = "C:/data/dlls/mkl";

            StartProvider.LaunchMklRt(1, path);

            var m     = NN.Random.Normal(0f, 0.1f, Vocab, N);
            var vocab = (from i in Enumerable.Range(0, Vocab)
                         select i.ToString()).ToArray();

            _w2v = new Word2Vec(vocab, m);

            int batch = 50;

            vector_test = NN.Random.Normal(0, 0.1f, N, 1);
            matrix_test = NN.Random.Normal(0, 0.1f, N, batch);

            bestd_v    = new float[1][];
            bestw_v    = new int[1][];
            bestd_v[0] = new float[Neighbors];
            bestw_v[0] = new int[Neighbors];
            bestd_m    = new float[batch][];
            bestw_m    = new int[batch][];
            for (int i = 0; i < batch; i++)
            {
                bestd_m[i] = new float[Neighbors];
                bestw_m[i] = new int[Neighbors];
            }
        }
Example #3
        public static bool IsRealWord(string inWord, CSpellApi cSpellApi, bool debugFlag)
        {
            // init
            RootDictionary checkDic   = cSpellApi.GetCheckDic();
            RootDictionary unitDic    = cSpellApi.GetUnitDic();
            WordWcMap      wordWcMap  = cSpellApi.GetWordWcMap();
            Word2Vec       word2VecOm = cSpellApi.GetWord2VecOm();
            int            inWordLen  = inWord.Length;
            // TBD, change method name
            int rwSplitWordMinLength = cSpellApi.GetDetectorRwSplitWordMinLength();
            int rwSplitWordMinWc     = cSpellApi.GetDetectorRwSplitWordMinWc();
            // realword must be:
            // 1. known in the dictionary
            // 2. not an exception, such as url, email, digit, ...
            // => if it is an exception, even a non-word gets no correction
            // 3. must have a word2Vector value (inWord is auto converted to LC)
            // 4. frequency must be above a threshold (inWord is auto converted to LC)
            // TBD, needs to be configurable (200)
            bool realWordFlag = (checkDic.IsValidWord(inWord))
                                && (!IsRealWordExceptions(inWord, unitDic))
                                && (inWordLen >= rwSplitWordMinLength)
                                && (word2VecOm.HasWordVec(inWord) == true)
                                && (WordCountScore.GetWc(inWord, wordWcMap) >= rwSplitWordMinWc);

            if (debugFlag == true)
            {
                bool wordInDicFlag     = checkDic.IsValidWord(inWord);
                bool wordExceptionFlag = IsRealWordExceptions(inWord, unitDic);
                bool lengthFlag        = (inWordLen >= rwSplitWordMinLength);
                bool word2VecFlag      = word2VecOm.HasWordVec(inWord);
                bool wcFlag            = (WordCountScore.GetWc(inWord, wordWcMap) >= rwSplitWordMinWc);
                DebugPrint.PrintRwSplitDetect(inWord, realWordFlag, wordInDicFlag, wordExceptionFlag, lengthFlag, word2VecFlag, wcFlag, debugFlag);
            }
            return(realWordFlag);
        }
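A minimal call sketch (the token is hypothetical, and cSpellApi is assumed to be an already-initialized CSpellApi as built in Examples #24/#25; the detector class hosting IsRealWord is not shown here):

            // Sketch only: ask whether a token should be treated as a real word
            // for real-word split detection, with debug tracing enabled.
            bool isReal = IsRealWord("diagnose", cSpellApi, true);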
Example #4
        public void TestGetRandomIndicesAroundSentencePosition()
        {
            var testing    = new Word2Vec();
            var testResult = testing.GetRandomIndicesAroundPosition(28, new Tuple<int, int>(2, 9));

            Assert.IsNotNull(testResult);
            Assert.AreNotEqual(0, testResult.Length);

            Assert.AreEqual(25, testResult[0]);
            Assert.AreEqual(26, testResult[1]);
            Assert.AreEqual(27, testResult[2]);
            Assert.AreEqual(29, testResult[3]);
            Assert.AreEqual(30, testResult[4]);
            Assert.AreEqual(31, testResult[5]);

            //test if we just want one-to-the-left and one-to-the-right
            testing.Window = 1;
            //since this is the first word there is only one to the right
            testResult = testing.GetRandomIndicesAroundPosition(0, new Tuple<int, int>(0, 2));
            Assert.IsNotNull(testResult);
            Assert.AreEqual(1, testResult.Length);
            Assert.AreEqual(1, testResult[0]);

            testResult = testing.GetRandomIndicesAroundPosition(1, new Tuple<int, int>(0, 2));
            Assert.IsNotNull(testResult);
            Assert.AreEqual(2, testResult.Length);
            Assert.AreEqual(0, testResult[0]);
            Assert.AreEqual(2, testResult[1]);
        }
Example #5
        private Array<float> _test_3;          // [_test_1, _test_2] concatenation, shape (4, 2)

        public TestSortingNeighbors(Shared shared)
        {
            this._w2v    = shared.W2v;
            this._test_1 = shared.Test1;
            this._test_2 = shared.Test2;
            this._test_3 = shared.Test3;
        }
Example #6
        public void TestWord2VecModel()
        {
            DataFrame documentDataFrame =
                _spark.Sql("SELECT split('Hi I heard about Spark', ' ') as text");

            Word2Vec word2vec = new Word2Vec()
                                .SetInputCol("text")
                                .SetOutputCol("result")
                                .SetMinCount(1);

            Word2VecModel model = word2vec.Fit(documentDataFrame);

            const int expectedSynonyms = 2;
            DataFrame synonyms         = model.FindSynonyms("Hi", expectedSynonyms);

            Assert.Equal(expectedSynonyms, synonyms.Count());
            synonyms.Show();

            using (var tempDirectory = new TemporaryDirectory())
            {
                string savePath = Path.Join(tempDirectory.Path, "word2vecModel");
                model.Save(savePath);

                Word2VecModel loadedModel = Word2VecModel.Load(savePath);
                Assert.Equal(model.Uid(), loadedModel.Uid());
            }
        }
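A natural follow-up that this test does not exercise is embedding the input column with the fitted model. A minimal sketch, assuming the .NET binding exposes Word2VecModel.Transform the way the underlying Spark API does:

            // Sketch only: apply the fitted model to the same DataFrame, which adds the
            // averaged document vector in the "result" column, then display it.
            DataFrame embedded = model.Transform(documentDataFrame);
            embedded.Show();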
Example #7
        public static NDArray Vectorize(string doc, Word2Vec model)
        {
            doc = doc.ToLower();
            Regex rgx = new Regex("[^a-zåäöA-ZÅÄÖ0-9 -]");

            doc = rgx.Replace(doc, "");

            var words = Array.FindAll(doc.Split(' '),
                                      word => !IsStopWord(word));
            var word_vectors = np.zeros((words.Length, MODEL_VECTOR_SIZE));

            for (int i = 0; i < words.Length; i++)
            {
                try
                {
                    var vector = model[words[i]];
                    for (int j = 0; j < vector.Size; j++)
                    {
                        word_vectors[i, j] = (float)vector[j];
                    }
                }
                catch (System.Exception)
                {
                    //Console.WriteLine("Could not find word, " + words[i]);
                }
            }

            return(np.mean(word_vectors, 0));
        }
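The helper above lowercases the document, strips characters outside a-z, å/ä/ö, digits, spaces and hyphens, drops stop words, and returns the element-wise mean of the word vectors it can find. A minimal usage sketch (the input sentence is made up; the model path and LoadText arguments are copied from Example #13):

            // Sketch only: average the word vectors of a short Swedish sentence.
            var model = Word2Vec.LoadText("models/bloggmix2013w2v.txt", true, false, "");
            NDArray docVector = TextProcessing.Vectorize("hunden jagade katten", model);
            Console.WriteLine(docVector); // one vector of MODEL_VECTOR_SIZE averaged components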
Example #8
        public Shared()
        {
            NN.Random.Seed(123); // setting seed of NumNet

            // creating word2vec
            var matrix = NN.Random.Normal(0, 1, 10, 4);
            var words  = new string[10] {
                "a", "b", "c", "d", "e", "f", "g", "h", "i", "j"
            };

            this.W2v = new Word2Vec(words, matrix);

            // creating sample vectors for neighbor search
            Test1 = NN.Random.Normal(0, 1, 4); // vector
            Test2 = NN.Random.Normal(0, 1, 4); // second vector

            var values = new float[8];

            for (int i = 0; i < 4; i++)
            {
                values[2 * i]     = Test1.Values[i];
                values[2 * i + 1] = Test2.Values[i];
            }
            Test3 = NN.Array(values).Reshape(4, 2); // concatenation of the first two vectors
        }
Example #9
        static void Main(string[] args)
        {
            var word2Vec = new Word2Vec();

            word2Vec.SetSeed(123);

            Console.WriteLine(word2Vec.ExplainParams());

            Console.WriteLine("~~~~~~");

            var seedParam = new Param(word2Vec, "seed", "Setting the seed to 54321");

            word2Vec.Set(seedParam, 54321L);

            Console.WriteLine(word2Vec.ExplainParams());

            Console.WriteLine("~~~~~~");

            var seed = word2Vec.GetParam("seed");

            word2Vec.Set(seed, 12345L);
            Console.WriteLine(word2Vec.ExplainParams());

            word2Vec.Clear(seed);
            Console.WriteLine(word2Vec.ExplainParams());
        }
Example #10
        public void TestReadNextNode()
        {
            var corpus = @"The dog saw a cat. The dog chased a cat. The cat climbed a tree";

            var testing = new Word2Vec
            {
                Sample = 0,
                Size   = 3,
                Window = 1
            };

            testing.BuildVocab(corpus);

            var vocab = testing.Vocab;
            var leafs = vocab.GetLeafs();

            for (var i = 0; i < 4; i++)
            {
                var b = testing.ReadNextWord();
                Assert.IsNotNull(b);
                Assert.IsNotNull(b.TargetWord);
                Assert.IsNotNull(b.ContextWords);
                Assert.AreNotEqual(0, b.ContextWords.Count);
            }

            testing.CurrentCorpusPosition = 25;
            var testResultNull = testing.ReadNextWord();

            Assert.IsNull(testResultNull);
        }
Example #11
        public void CanDumpVectors()
        {
            int vocSize = 100, vecSize = 10;
            var vectors = NN.Random.Uniform(-1f, 1f, vocSize, vecSize);
            var words   = Enumerable.Range(0, vocSize).Select(i => i.ToString()).ToArray();
            var w2v     = new Word2Vec(words, vectors);

            w2v.SaveBinary(Path.GetTempFileName());
        }
Example #12
        // private methods
        private static bool IsValidMergeCand(MergeObj mergeObj, CSpellApi cSpellApi)
        {
            // WC is not used here
            WordWcMap wordWcMap        = cSpellApi.GetWordWcMap();
            Word2Vec  word2VecOm       = cSpellApi.GetWord2VecOm();
            string    coreMergeStr     = mergeObj.GetCoreMergeWord();
            int       rwMergeCandMinWc = cSpellApi.GetCanRwMergeCandMinWc();
            bool      flag             = ((word2VecOm.HasWordVec(coreMergeStr)) && (WordCountScore.GetWc(coreMergeStr, wordWcMap) >= rwMergeCandMinWc));

            return(flag);
        }
Example #13
        static void Main(string[] args)
        {
            //var word2VecModel = Word2Vec.LoadBinary("models/GoogleNews-vectors-negative300.bin", true, false, "", Encoding.GetEncoding("ISO-8859-1"));
            var word2VecModel = Word2Vec.LoadText("models/bloggmix2013w2v.txt", true, false, "");

            var train = getQuestionsAsVector(0, 3000, "data/train_sv.csv", word2VecModel);
            var test  = getQuestionsAsVector(0, 100, "data/test_sv.csv", word2VecModel);

            Learning.LargeNetwork(train, test);
            Learning.LightGbm(train);
        }
Example #14
        public void TestGetRandomWindowStartEnd()
        {
            var testing    = new Word2Vec();
            var testResult = testing.GetRandomWindowStartEnd();

            Assert.IsNotNull(testResult);
            Console.WriteLine(testResult);
            Assert.IsTrue(testResult.Item2 > testResult.Item1);

            testing.Window = 1;
            testResult     = testing.GetRandomWindowStartEnd();
            Console.WriteLine(testResult);
        }
Example #15
        public void CanReadDumpedVectors()
        {
            int vocSize = 100, vecSize = 10;
            var vectors = NN.Random.Uniform(-1f, 1f, vocSize, vecSize);
            var words   = Enumerable.Range(0, vocSize).Select(i => i.ToString()).ToArray();
            var w2v     = new Word2Vec(words, vectors);
            var path    = Path.GetTempFileName();

            w2v.SaveBinary(path);
            var w2vLoaded = Word2Vec.LoadBinary(path, normalize: false);

            AssertArray.AreEqual(vectors, w2vLoaded.Vectors);
            AssertArray.AreEqual(words, w2vLoaded.Text);
        }
Example #16
        public void TestWord2Vec()
        {
            DataFrame documentDataFrame = _spark.Sql("SELECT split('Spark dotnet is cool', ' ')");

            const string expectedInputCol          = "text";
            const string expectedOutputCol         = "result";
            const int    expectedMinCount          = 0;
            const int    expectedMaxIter           = 10;
            const int    expectedMaxSentenceLength = 100;
            const int    expectedNumPartitions     = 1000;
            const int    expectedSeed       = 10000;
            const double expectedStepSize   = 1.9;
            const int    expectedVectorSize = 20;
            const int    expectedWindowSize = 200;

            Word2Vec word2vec = new Word2Vec()
                                .SetInputCol(expectedInputCol)
                                .SetOutputCol(expectedOutputCol)
                                .SetMinCount(expectedMinCount)
                                .SetMaxIter(expectedMaxIter)
                                .SetMaxSentenceLength(expectedMaxSentenceLength)
                                .SetNumPartitions(expectedNumPartitions)
                                .SetSeed(expectedSeed)
                                .SetStepSize(expectedStepSize)
                                .SetVectorSize(expectedVectorSize)
                                .SetWindowSize(expectedWindowSize);

            Assert.Equal(expectedInputCol, word2vec.GetInputCol());
            Assert.Equal(expectedOutputCol, word2vec.GetOutputCol());
            Assert.Equal(expectedMinCount, word2vec.GetMinCount());
            Assert.Equal(expectedMaxIter, word2vec.GetMaxIter());
            Assert.Equal(expectedMaxSentenceLength, word2vec.GetMaxSentenceLength());
            Assert.Equal(expectedNumPartitions, word2vec.GetNumPartitions());
            Assert.Equal(expectedSeed, word2vec.GetSeed());
            Assert.Equal(expectedStepSize, word2vec.GetStepSize());
            Assert.Equal(expectedVectorSize, word2vec.GetVectorSize());
            Assert.Equal(expectedWindowSize, word2vec.GetWindowSize());

            using (var tempDirectory = new TemporaryDirectory())
            {
                string savePath = Path.Join(tempDirectory.Path, "word2vec");
                word2vec.Save(savePath);

                Word2Vec loadedWord2Vec = Word2Vec.Load(savePath);
                Assert.Equal(word2vec.Uid(), loadedWord2Vec.Uid());
            }
        }
Example #17
        // real-word candidate has more restriction than non-word
        // TBD, need to organize the code ...
        // the check should be done in the ranking
        // Core process for real-word candidates
        private static bool IsValid1To1Cand(string inWord, string cand, CSpellApi cSpellApi)
        {
            RootDictionary suggestDic = cSpellApi.GetSuggestDic();
            Word2Vec       word2VecOm = cSpellApi.GetWord2VecOm();
            WordWcMap      wordWcMap  = cSpellApi.GetWordWcMap();
            // real-word, check phonetic and suggDic
            // 1. check suggDic
            // 1.1 edDist <= 1
            // 1.2 edDist <= 2 && phonetic dist <= 1
            // 2. check if inflections, not a candidate real-word, not correct
            bool   flag                = false;
            int    rw1To1CandMinWc     = cSpellApi.GetCanRw1To1CandMinWc();
            int    rw1To1CandMinLength = cSpellApi.GetCanRw1To1CandMinLength();
            string inWordLc            = inWord.ToLower();
            int    inWordLen           = inWordLc.Length;
            int    candLen             = cand.Length;
            int    lenDiff             = inWordLen - candLen;

            // 1. check suggDic and inflVars
            if ((suggestDic.IsDicWord(cand) == true)
                && (word2VecOm.HasWordVec(cand) == true)
                && (candLen >= rw1To1CandMinLength)
                && (WordCountScore.GetWc(cand, wordWcMap) >= rw1To1CandMinWc)
                && (InflVarsUtil.IsInflectionVar(inWordLc, cand) == false))             // not inflVars
            {
                //&& ((lenDiff <= 1) && (lenDiff >= -1))) // length diff <= 1
                // more restriction for real-word candidates
                int pmDist     = Metaphone2.GetDistance(inWordLc, cand);
                int prDist     = RefinedSoundex.GetDistance(inWordLc, cand);
                int leadDist   = GetLeadCharDist(inWordLc, cand);
                int endDist    = GetEndCharDist(inWordLc, cand);
                int lengthDist = GetLengthDist(inWordLc, cand);
                int totalDist1 = leadDist + endDist + lengthDist + pmDist + prDist;
                int editDist   = EditDistance.GetDistanceForRealWord(inWordLc, cand);
                int totalDist2 = editDist + pmDist + prDist;
                // if they sound the same
                if ((pmDist == 0) && (prDist == 0))
                {
                    flag = true;
                }
                // if they sound similar and the orthography is also similar
                // fixed from empirical tests, not configurable
                else if ((totalDist1 < 3) && (totalDist2 < 4) && (pmDist * prDist == 0))
                {
                    flag = true;
                }
            }
            return(flag);
        }
Example #18
        public void TestToVector()
        {
            var corpus = @"The dog saw a cat. The dog chased a cat. The cat climbed a tree";

            var testing = new Word2Vec
            {
                Sample = 0,
                Size   = 3,
                Window = 1
            };

            testing.BuildVocab(corpus);
            var word00 = testing.Vocab.GetNodeByWord("dog");
            var word01 = testing.Vocab.GetNodeByWord("cat");

            var testResult = testing.ToVector(word00);

            for (var j = 0; j < testResult.CountOfColumns(); j++)
            {
                if (j == word00.Index)
                {
                    Assert.AreEqual(1D, testResult[0, j]);
                }
                else
                {
                    Assert.AreEqual(0D, testResult[0, j]);
                }
            }

            testResult = testing.ToVector(new List<HuffmanNode> {
                word00, word01
            });
            for (var j = 0; j < testResult.CountOfColumns(); j++)
            {
                if (j == word00.Index || j == word01.Index)
                {
                    Assert.IsTrue(System.Math.Abs(testResult[0, j] - 0.5) < 0.0001);
                }
                else
                {
                    Assert.AreEqual(0D, testResult[0, j]);
                }
            }
        }
Example #19
        public void TestPseudoInverse()
        {
            var path  = @"C:\Users\joc\AppData\Local\ProtoStudio\Banque\embeddings.bin";
            var words = Word2Vec.LoadBinary(path, normalize: true).Vectors /*[_, Until(100)]*/;
            //var pseudoInv = PseudoInv(words);
            var pseudoInv = PowerMethod(words);

            // when embeddings have linearly independent dimensions
            AssertArray.AreAlmostEqual(NN.Eye(words.Shape[1]), pseudoInv.Dot(words), 1e-6f, 1e-6f);

            // unlikely to hold: the words are NOT linearly independent
            //AssertArray.AreAlmostEqual(NN.Eye(words.Shape[0]), words.Dot(pseudoInv), 1e-3f, 1e-5f);

            if (words.Shape[0] <= 1000) // otherwise too long
            {
                AssertArray.AreAlmostEqual(words, words.Dot(pseudoInv).Dot(words), 1e-6f, 1e-6f);
            }
            AssertArray.AreAlmostEqual(pseudoInv, pseudoInv.Dot(words).Dot(pseudoInv), 1e-6f, 1e-6f);
        }
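For reference, the three active assertions check the defining Moore-Penrose identities for the embedding matrix W (words, one row per word) and its computed pseudoinverse W+, written out in LaTeX:

            W^{+} W = I \quad \text{(only when the columns of } W \text{ are linearly independent)}
            W \, W^{+} \, W = W
            W^{+} \, W \, W^{+} = W^{+}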
Example #20
        public void TestBuildVocab()
        {
            var textPath = PutTestFileOnDisk("HPBook3.txt");

            Console.WriteLine(textPath);

            Assert.IsTrue(System.IO.File.Exists(textPath));
            var text = System.IO.File.ReadAllText(textPath);

            Assert.IsNotNull(text);
            var testing = new Word2Vec();

            testing.BuildVocab(text);
            Assert.IsNotNull(testing.Vocab);
            var testResultLeafs = testing.Vocab.GetLeafs();

            Assert.IsNotNull(testResultLeafs);
            Assert.AreNotEqual(0, testResultLeafs.Count);
        }
Example #21
        public void TestGetSampleSentence_Simple()
        {
            var corpus = @"The dog saw a cat. The dog chased a cat. The cat climbed a tree";

            var testing = new Word2Vec
            {
                Sample = 0,
                Size   = 3,
                Window = 1
            };

            testing.BuildVocab(corpus);
            //TODO - once implementation is figured out
            var testResult = testing.GetContextNodes(2);

            Assert.IsNotNull(testResult);
            Assert.AreNotEqual(0, testResult.Count);
            Console.WriteLine(string.Join(", ", testResult.Select(t => t.Word)));
        }
Example #22
        // for the split, we don't want Aa as a valid word
        // because it will cause too much noise (less precision)
        // TBD ... re-organize
        private static bool IsValidSplitWord(string inWord, CSpellApi cSpellApi)
        {
            // splitWord uses LexiconNoAa for Dic
            RootDictionary splitWordDic = cSpellApi.GetSplitWordDic();
            WordWcMap      wordWcMap    = cSpellApi.GetWordWcMap();
            Word2Vec       word2VecOm   = cSpellApi.GetWord2VecOm();
            RootDictionary unitDic      = cSpellApi.GetUnitDic();
            RootDictionary pnDic        = cSpellApi.GetPnDic();
            //RootDictionary aaDic = cSpellApi.GetAaDic();
            int rwSplitCandMinWc = cSpellApi.GetCanRwSplitCandMinWc();
            // real-word cand split word must:
            // 1. check if in the splitWordDic, No Aa with a small length
            // such as cel is an overlap, it is aa or not-aa
            // 2. has word2Vec
            // 3. has WC
            // 4. not unit, mg -> ...
            // 5. not properNoun, human -> Hu man, where Hu is pn
            // children -> child ren, where ren is pn
            bool flag = (splitWordDic.IsDicWord(inWord))
                        && (word2VecOm.HasWordVec(inWord) == true)
                        && (WordCountScore.GetWc(inWord, wordWcMap) >= rwSplitCandMinWc)
                        && (!unitDic.IsDicWord(inWord))
                        && (!pnDic.IsDicWord(inWord));

            return(flag);
        }
Example #23
        static List<Tuple<bool, float[]>> getQuestionsAsVector(int from, int to, string filePath, Word2Vec word2VecModel)
        {
            //var word2VecModel = Word2Vec.LoadText("models/newsv/model.txt", true, false, "");

            var docRelations = new List<Tuple<bool, float[]>>();
            var qPs          = readQuestionPairs(from, to, filePath);

            foreach (QuoraQuestionPair questionPair in qPs)
            {
                try
                {
                    NDArray v1 = TextProcessing.Vectorize(questionPair.Question1, word2VecModel);
                    NDArray v2 = TextProcessing.Vectorize(questionPair.Question2, word2VecModel);
                    //NDArray diff = np.subtract(v1, v2);
                    double[] conc = np.concatenate((v1, v2)).ToArray<double>();
                    // NDArray -> double[] -> float[]
                    float[] merged      = Array.ConvertAll(conc, s => (float)s);
                    var     docRelation = Tuple.Create(questionPair.Related, merged);
                    docRelations.Add(docRelation);
                }
                catch (System.Exception)
                {
                    Console.WriteLine("could not vectorize.");
                }
            }
            return(docRelations);
        }
Example #24
        // update parameter from the config file to cSpellApi
        private void Init(bool debugFlag)
        {
            // get config file from environment variable
            bool useClassPath = false;

            if (string.ReferenceEquals(configFile_, null))
            {
                useClassPath = true;
                configFile_  = "data.Config.cSpell";
            }
            // read in configuration file
            conf_ = new Configuration(configFile_, useClassPath);
            if (properties_ != null)
            {
                conf_.OverwriteProperties(properties_);
            }
            string cSpellDir = conf_.GetProperty(Configuration.CS_DIR);
            // files: pre-correction
            string infExpFile = cSpellDir + conf_.GetProperty(Configuration.CS_INFORMAL_EXP_FILE);

            infExpMap_ = InformalExpHandler.GetInformalExpMapFromFile(infExpFile);
            // get dictionary for spell checker
            string checkDicFileStrs = conf_.GetProperty(Configuration.CS_CHECK_DIC_FILES);

            checkDic_.AddDictionaries(checkDicFileStrs, cSpellDir, debugFlag);
            // get dictionary for spell suggestion - candidate
            string suggestDicFileStrs = conf_.GetProperty(Configuration.CS_SUGGEST_DIC_FILES);

            suggestDic_.AddDictionaries(suggestDicFileStrs, cSpellDir, debugFlag);
            // no acr/abb dictionary: en + pn, used for split check
            string splitWordDicFileStrs = conf_.GetProperty(Configuration.CS_SPLIT_WORD_DIC_FILES);

            splitWordDic_.AddDictionaries(splitWordDicFileStrs, cSpellDir, debugFlag);
            // mw dictionary
            string mwDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_MW_DIC_FILE);

            mwDic_.AddDictionary(mwDicFile);
            // properNoun dictionary
            string pnDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_PN_DIC_FILE);

            pnDic_.AddDictionary(pnDicFile);
            // abb/acr dictionary
            string aaDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_AA_DIC_FILE);

            aaDic_.AddDictionary(aaDicFile);
            // spVar dictionary
            string svDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_SV_DIC_FILE);

            svDic_.AddDictionary(svDicFile);
            // unit file
            string unitDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_UNIT_DIC_FILE);

            unitDic_.AddDictionary(unitDicFile);
            // frequency file
            string frequencyFile = cSpellDir + conf_.GetProperty(Configuration.CS_FREQUENCY_FILE);

            wordWcMap_ = new WordWcMap(frequencyFile);
            // word2Vec file
            string word2VecImFile = cSpellDir + conf_.GetProperty(Configuration.CS_W2V_IM_FILE);

            word2VecIm_ = new Word2Vec(word2VecImFile);
            string word2VecOmFile = cSpellDir + conf_.GetProperty(Configuration.CS_W2V_OM_FILE);

            word2VecOm_ = new Word2Vec(word2VecOmFile);
            // mode
            funcMode_ = int.Parse(conf_.GetProperty(Configuration.CS_FUNC_MODE));
            rankMode_ = int.Parse(conf_.GetProperty(Configuration.CS_RANK_MODE));
            // detectors
            maxLegitTokenLength_   = int.Parse(conf_.GetProperty(Configuration.CS_MAX_LEGIT_TOKEN_LENGTH));
            dRwSplitWordMinLength_ = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_SPLIT_WORD_MIN_LENGTH));
            dRwSplitWordMinWc_     = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_SPLIT_WORD_MIN_WC));
            dRw1To1WordMinLength_  = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_1TO1_WORD_MIN_LENGTH));
            dRw1To1WordMinWc_      = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_1TO1_WORD_MIN_WC));
            // candidates
            cMaxCandNo_         = int.Parse(conf_.GetProperty(Configuration.CS_CAN_MAX_CANDIDATE_NO));
            cNdMaxSplitNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_ND_MAX_SPLIT_NO));
            cNwMaxSplitNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MAX_SPLIT_NO));
            cNwMaxMergeNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MAX_MERGE_NO));
            cNwMergeWithHyphen_ = bool.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MERGE_WITH_HYPHEN));
            cRwMaxSplitNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_SPLIT_NO));
            cRwMaxMergeNo_      = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_MERGE_NO));
            cRwMergeWithHyphen_ = bool.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MERGE_WITH_HYPHEN));

            cRwShortSplitWordLength_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_SHORT_SPLIT_WORD_LENGTH));
            cRwMaxShortSplitWordNo_  = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_SHORT_SPLIT_WORD_NO));
            cRwMergeCandMinWc_       = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MERGE_CAND_MIN_WC));
            cRwSplitCandMinWc_       = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_SPLIT_CAND_MIN_WC));
            cRw1To1CandMinLength_    = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MIN_LENGTH));
            cRw1To1CandMinWc_        = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MIN_WC));
            cRw1To1CandMaxKeySize_   = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MAX_KEY_SIZE));

            // rankers
            rNwS1RankRangeFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_NW_S1_RANK_RANGE_FAC));
            rNwS1MinOScore_    = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_NW_S1_MIN_OSCORE));
            rRw1To1CFac_       = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_C_FAC));
            rRwSplitCFac_      = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_SPLIT_C_FAC));
            rRwMergeCFac_      = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_MERGE_C_FAC));
            rRw1To1WordMinCs_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_WORD_MIN_CS));
            rRw1To1CandCsFac_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_CS_FAC));
            rRw1To1CandMinCs_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_MIN_CS));
            rRw1To1CandCsDist_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_CS_DIST));
            rRw1To1CandFsFac_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_FS_FAC));
            rRw1To1CandMinFs_  = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_MIN_FS));
            rRw1To1CandFsDist_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_FS_DIST));

            // Score
            orthoScoreEdDistFac_   = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_ED_DIST_FAC));
            orthoScorePhoneticFac_ = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_PHONETIC_FAC));
            orthoScoreOverlapFac_  = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_OVERLAP_FAC));

            // context
            word2VecSkipWord_     = bool.Parse(conf_.GetProperty(Configuration.CS_W2V_SKIP_WORD));
            nw1To1ContextRadius_  = int.Parse(conf_.GetProperty(Configuration.CS_NW_1TO1_CONTEXT_RADIUS));
            nwSplitContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_NW_SPLIT_CONTEXT_RADIUS));
            nwMergeContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_NW_MERGE_CONTEXT_RADIUS));
            rw1To1ContextRadius_  = int.Parse(conf_.GetProperty(Configuration.CS_RW_1TO1_CONTEXT_RADIUS));
            rwSplitContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_RW_SPLIT_CONTEXT_RADIUS));
            rwMergeContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_RW_MERGE_CONTEXT_RADIUS));
        }
Example #25
        private void Init2(bool debugFlag)
        {
            _logger.LogInformation("cSpellApi initialization...");
            infExpMap_ = InformalExpHandler.GetInformalExpMapFromFile(_config.Value.CS_INFORMAL_EXP_FILE);
            checkDic_.AddDictionaries2(_config.Value.CS_CHECK_DIC_FILES, debugFlag);
            suggestDic_.AddDictionaries2(_config.Value.CS_SUGGEST_DIC_FILES, debugFlag);
            splitWordDic_.AddDictionaries2(_config.Value.CS_SPLIT_WORD_DIC_FILES, debugFlag);
            mwDic_.AddDictionary(_config.Value.CS_MW_DIC_FILE);
            pnDic_.AddDictionary(_config.Value.CS_PN_DIC_FILE);
            aaDic_.AddDictionary(_config.Value.CS_AA_DIC_FILE);
            svDic_.AddDictionary(_config.Value.CS_SV_DIC_FILE);
            unitDic_.AddDictionary(_config.Value.CS_UNIT_DIC_FILE);
            wordWcMap_  = new WordWcMap(_config.Value.CS_FREQUENCY_FILE);
            word2VecIm_ = new Word2Vec(_config.Value.CS_W2V_IM_FILE);
            word2VecOm_ = new Word2Vec(_config.Value.CS_W2V_OM_FILE);

            // mode
            funcMode_ = _config.Value.CS_FUNC_MODE;
            rankMode_ = _config.Value.CS_RANK_MODE;

            // detectors
            maxLegitTokenLength_   = _config.Value.CS_MAX_LEGIT_TOKEN_LENGTH;
            dRwSplitWordMinLength_ = _config.Value.CS_DETECTOR_RW_SPLIT_WORD_MIN_LENGTH;
            dRwSplitWordMinWc_     = _config.Value.CS_DETECTOR_RW_SPLIT_WORD_MIN_WC;
            dRw1To1WordMinLength_  = _config.Value.CS_DETECTOR_RW_1TO1_WORD_MIN_LENGTH;
            dRw1To1WordMinWc_      = _config.Value.CS_DETECTOR_RW_1TO1_WORD_MIN_WC;

            // candidates
            cMaxCandNo_         = _config.Value.CS_CAN_MAX_CANDIDATE_NO;
            cNdMaxSplitNo_      = _config.Value.CS_CAN_ND_MAX_SPLIT_NO;
            cNwMaxSplitNo_      = _config.Value.CS_CAN_NW_MAX_SPLIT_NO;
            cNwMaxMergeNo_      = _config.Value.CS_CAN_NW_MAX_MERGE_NO;
            cNwMergeWithHyphen_ = _config.Value.CS_CAN_NW_MERGE_WITH_HYPHEN;
            cRwMaxSplitNo_      = _config.Value.CS_CAN_RW_MAX_SPLIT_NO;
            cRwMaxMergeNo_      = _config.Value.CS_CAN_RW_MAX_MERGE_NO;
            cRwMergeWithHyphen_ = _config.Value.CS_CAN_RW_MERGE_WITH_HYPHEN;

            cRwShortSplitWordLength_ = _config.Value.CS_CAN_RW_SHORT_SPLIT_WORD_LENGTH;
            cRwMaxShortSplitWordNo_  = _config.Value.CS_CAN_RW_MAX_SHORT_SPLIT_WORD_NO;
            cRwMergeCandMinWc_       = _config.Value.CS_CAN_RW_MERGE_CAND_MIN_WC;
            cRwSplitCandMinWc_       = _config.Value.CS_CAN_RW_SPLIT_CAND_MIN_WC;
            cRw1To1CandMinLength_    = _config.Value.CS_CAN_RW_1TO1_CAND_MIN_LENGTH;
            cRw1To1CandMinWc_        = _config.Value.CS_CAN_RW_1TO1_CAND_MIN_WC;
            cRw1To1CandMaxKeySize_   = _config.Value.CS_CAN_RW_1TO1_CAND_MAX_KEY_SIZE;

            // rankers
            rNwS1RankRangeFac_ = _config.Value.CS_RANKER_NW_S1_RANK_RANGE_FAC;
            rNwS1MinOScore_    = _config.Value.CS_RANKER_NW_S1_MIN_OSCORE;
            rRw1To1CFac_       = _config.Value.CS_RANKER_RW_1TO1_C_FAC;
            rRwSplitCFac_      = _config.Value.CS_RANKER_RW_SPLIT_C_FAC;
            rRwMergeCFac_      = _config.Value.CS_RANKER_RW_MERGE_C_FAC;
            rRw1To1WordMinCs_  = _config.Value.CS_RANKER_RW_1TO1_WORD_MIN_CS;
            rRw1To1CandCsFac_  = _config.Value.CS_RANKER_RW_1TO1_CAND_CS_FAC;
            rRw1To1CandMinCs_  = _config.Value.CS_RANKER_RW_1TO1_CAND_MIN_CS;
            rRw1To1CandCsDist_ = _config.Value.CS_RANKER_RW_1TO1_CAND_CS_DIST;
            rRw1To1CandFsFac_  = _config.Value.CS_RANKER_RW_1TO1_CAND_FS_FAC;
            rRw1To1CandMinFs_  = _config.Value.CS_RANKER_RW_1TO1_CAND_MIN_FS;
            rRw1To1CandFsDist_ = _config.Value.CS_RANKER_RW_1TO1_CAND_FS_DIST;

            // Score
            orthoScoreEdDistFac_   = _config.Value.CS_ORTHO_SCORE_ED_DIST_FAC;
            orthoScorePhoneticFac_ = _config.Value.CS_ORTHO_SCORE_PHONETIC_FAC;
            orthoScoreOverlapFac_  = _config.Value.CS_ORTHO_SCORE_OVERLAP_FAC;

            // context
            word2VecSkipWord_     = _config.Value.CS_W2V_SKIP_WORD;
            nw1To1ContextRadius_  = _config.Value.CS_NW_1TO1_CONTEXT_RADIUS;
            nwSplitContextRadius_ = _config.Value.CS_NW_SPLIT_CONTEXT_RADIUS;
            nwMergeContextRadius_ = _config.Value.CS_NW_MERGE_CONTEXT_RADIUS;
            rw1To1ContextRadius_  = _config.Value.CS_RW_1TO1_CONTEXT_RADIUS;
            rwSplitContextRadius_ = _config.Value.CS_RW_SPLIT_CONTEXT_RADIUS;
            rwMergeContextRadius_ = _config.Value.CS_RW_MERGE_CONTEXT_RADIUS;
            _logger.LogInformation("cSpellApi initialized successfully");
        }
Example #26
        static void Main(string[] args)
        {
            // -train <file> Use text data from <file> to train the model
            string train = "Corpus.txt";

            // -output <file> Use <file> to save the resulting word vectors / word clusters
            string output = "Vectors.bin";

            // -save-vocab <file> The vocabulary will be saved to <file>
            string savevocab = "";

            // -read-vocab <file> The vocabulary will be read from <file>, not constructed from the training data
            string readvocab = "";

            // -size <int> Set size of word vectors; default is 100
            int size = 100;

            // -debug <int> Set the debug mode (default = 2 = more info during training)
            int debug = 2;

            // -binary <int> Save the resulting vectors in binary mode; default is 0 (off)
            int binary = 1;

            // -cbow <int> Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
            int cbow = 1;

            // -alpha <float> Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
            float alpha = 0.05f;

            // -sample <float> Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)
            float sample = 1e-4f;

            // -hs <int> Use Hierarchical Softmax; default is 0 (not used)
            int hs = 0;

            // -negative <int> Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
            int negative = 25;

            // -threads <int> Use <int> threads (default 12)
            int threads = 12;

            // -iter <int> Run more training iterations (default 5)
            long iter = 15;

            // -min-count <int> This will discard words that appear less than <int> times; default is 5
            int mincount = 5;

            // -classes <int> Output word classes rather than word vectors; default number of classes is 0 (vectors are written)
            long classes = 0;

            // -window <int> Set max skip length between words; default is 5
            int window = 12;

            Word2Vec word2Vec = new Word2Vec(train, output, savevocab, readvocab, size, debug, binary, cbow, alpha, sample, hs, negative, threads, iter, mincount, classes, window);

            var totalTime = Stopwatch.StartNew();
            var highRes   = Stopwatch.IsHighResolution;

            word2Vec.TrainModel();

            totalTime.Stop();

            var trainingTime = totalTime.ElapsedMilliseconds;

            Console.WriteLine("Training took {0}ms", trainingTime);

            path        = @"Vectors.bin";
            distance    = new Distance(path);
            wordAnalogy = new WordAnalogy(path);

            string[] wordList = new string[] { "paris france madrid" };

            var searchTime = Stopwatch.StartNew();

            foreach (string word in wordList)
            {
                distance.Search(word);
                wordAnalogy.Search(word);
            }

            searchTime.Stop();
            var firstSearchTime = searchTime.ElapsedMilliseconds;

            Console.WriteLine("Search took {0}ms", firstSearchTime);

            int outerN = 5;

            for (int outer = 0; outer < outerN; outer++)
            {
                foreach (string word in wordList)
                {
                    int    N             = 11;
                    var    minSearchTime = long.MaxValue;
                    var    maxSearchTime = long.MinValue;
                    long[] searchTimes   = new long[N];

                    Console.WriteLine($"Batch {outer}, searching {word}: running {N} searches");

                    for (int inner = 0; inner < N; inner++)
                    {
                        searchTime.Restart();
                        distance.Search(word);
                        BestWord[] result = wordAnalogy.Search(word);
                        searchTime.Stop();

                        /*foreach (var bestWord in result)
                         * {
                         *  Console.WriteLine("{0}\t\t{1}", bestWord.Word, bestWord.Distance);
                         * }*/

                        long interval = highRes ? searchTime.ElapsedTicks : searchTime.ElapsedMilliseconds;
                        searchTimes[inner] = interval;

                        if (interval < minSearchTime)
                        {
                            minSearchTime = interval;
                        }
                        if (interval > maxSearchTime)
                        {
                            maxSearchTime = interval;
                        }
                    }

                    if (highRes)
                    {
                        double averageSearch = 1000 * ((double)searchTimes.Sum() / N / Stopwatch.Frequency);
                        double medianSearch  = 1000 * ((double)searchTimes.OrderBy(t => t).ElementAt(N / 2) / Stopwatch.Frequency);
                        Console.WriteLine("Steadystate min search time: {0:F2}ms", (1000 * minSearchTime) / Stopwatch.Frequency);
                        Console.WriteLine("Steadystate max search time: {0:F2}ms", (1000 * maxSearchTime) / Stopwatch.Frequency);
                        Console.WriteLine("Steadystate average search time: {0:F2}ms", averageSearch);
                        Console.WriteLine("Steadystate median search time: {0:F2}ms", medianSearch);
                    }
                    else
                    {
                        long averageSearch = searchTimes.Sum() / N;
                        long medianSearch  = searchTimes.OrderBy(t => t).ElementAt(N / 2);
                        Console.WriteLine("Steadystate min search time: {0}ms", minSearchTime);
                        Console.WriteLine("Steadystate max search time: {0}ms", maxSearchTime);
                        Console.WriteLine("Steadystate average search time: {0}ms", (int)averageSearch);
                        Console.WriteLine("Steadystate median search time: {0}ms", (int)medianSearch);
                    }

                    Console.WriteLine("");
                }
            }
        }