// Builds a vocabulary from the HPBook3 word-count dictionary, assigns the HPBook3 corpus,
// and verifies that GetContextNodes(8) returns a non-empty collection of nodes.
public void TestGetSampleSentence()
{
    // Load the word -> count dictionary from the test resource.
    var dictionaryFile = PutTestFileOnDisk("HPBook3Dict.json");
    Assert.IsTrue(System.IO.File.Exists(dictionaryFile));

    var dictionaryJson = System.IO.File.ReadAllText(dictionaryFile);
    Assert.IsNotNull(dictionaryJson);

    var wordCounts = Newtonsoft.Json.JsonConvert.DeserializeObject<Dictionary<string, int>>(dictionaryJson);
    Assert.IsNotNull(wordCounts);
    Assert.IsFalse(wordCounts.Count == 0);

    // Sample is set to 0 before the vocabulary is built, exactly as the corpus is assigned afterwards.
    var model = new Word2Vec { Sample = 0 };
    model.BuildVocab(wordCounts);
    model.AssignCorpus(PutTestFileOnDisk("HPBook3.txt"));

    var contextNodes = model.GetContextNodes(8);
    Assert.IsNotNull(contextNodes);
    Assert.AreNotEqual(0, contextNodes.Count);

    // Print the words of the returned nodes for manual inspection.
    var words = contextNodes.Select(node => node.Word);
    Console.WriteLine(string.Join(", ", words));
}
// Fixture setup: starts the MKL provider, builds a Word2Vec model over random vectors,
// and allocates the query inputs and the per-query result buffers.
public void Setup()
{
    // Launching MKL for NumNet (the path might need to be changed on another machine).
    var mklPath = "C:/data/dlls/mkl";
    StartProvider.LaunchMklRt(1, mklPath);

    // Random vectors are drawn first; the words are simply the indices 0..Vocab-1 as strings.
    var embeddings = NN.Random.Normal(0f, 0.1f, Vocab, N);
    var labels = Enumerable.Range(0, Vocab).Select(index => index.ToString()).ToArray();
    _w2v = new Word2Vec(labels, embeddings);

    int batch = 50;
    vector_test = NN.Random.Normal(0, 0.1f, N, 1);
    matrix_test = NN.Random.Normal(0, 0.1f, N, batch);

    // Single-query buffers: one row of Neighbors slots each.
    bestd_v = new float[1][];
    bestd_v[0] = new float[Neighbors];
    bestw_v = new int[1][];
    bestw_v[0] = new int[Neighbors];

    // Batched buffers: one row of Neighbors slots per query in the batch.
    bestd_m = new float[batch][];
    bestw_m = new int[batch][];
    for (int row = 0; row < batch; row++)
    {
        bestd_m[row] = new float[Neighbors];
        bestw_m[row] = new int[Neighbors];
    }
}
// Decides whether inWord counts as a real word for real-word split detection.
// A real word must:
//   1. be known in the check dictionary
//   2. not be an exception, such as url, email, digit, ...
//      => if it is an exception, even a non-word gets no correction
//   3. be at least the configured minimum length
//   4. have a word2Vec value
//   5. have a word count at or above the configured threshold
// When debugFlag is set, each criterion is evaluated again individually and printed.
public static bool IsRealWord(string inWord, CSpellApi cSpellApi, bool debugFlag)
{
    // init
    RootDictionary checkDic = cSpellApi.GetCheckDic();
    RootDictionary unitDic = cSpellApi.GetUnitDic();
    WordWcMap wordWcMap = cSpellApi.GetWordWcMap();
    Word2Vec word2VecOm = cSpellApi.GetWord2VecOm();
    int wordLength = inWord.Length;
    // TBD, change method name
    int minLength = cSpellApi.GetDetectorRwSplitWordMinLength();
    int minWc = cSpellApi.GetDetectorRwSplitWordMinWc();

    // The checks short-circuit in this order, so later lookups only run
    // when all earlier criteria have passed.
    bool isRealWord = checkDic.IsValidWord(inWord)
        && !IsRealWordExceptions(inWord, unitDic)
        && wordLength >= minLength
        && word2VecOm.HasWordVec(inWord)
        && WordCountScore.GetWc(inWord, wordWcMap) >= minWc;

    if (debugFlag)
    {
        // Evaluate every criterion on its own so the debug output shows which one failed.
        bool inDictionary = checkDic.IsValidWord(inWord);
        bool isException = IsRealWordExceptions(inWord, unitDic);
        bool longEnough = wordLength >= minLength;
        bool hasVector = word2VecOm.HasWordVec(inWord);
        bool frequentEnough = WordCountScore.GetWc(inWord, wordWcMap) >= minWc;
        DebugPrint.PrintRwSplitDetect(inWord, isRealWord, inDictionary, isException,
            longEnough, hasVector, frequentEnough, debugFlag);
    }

    return isRealWord;
}
// Checks the indices returned by GetRandomIndicesAroundPosition for a position in the
// middle of the text, and with Window = 1 for positions 0 and 1.
public void TestGetRandomIndicesAroundSentencePosition()
{
    var model = new Word2Vec();

    var indices = model.GetRandomIndicesAroundPosition(28, Tuple.Create(2, 9));
    Assert.IsNotNull(indices);
    Assert.AreNotEqual(0, indices.Length);

    // The first six entries are expected to surround position 28 without containing it.
    var expectedNeighbours = new[] { 25, 26, 27, 29, 30, 31 };
    for (var k = 0; k < expectedNeighbours.Length; k++)
    {
        Assert.AreEqual(expectedNeighbours[k], indices[k]);
    }

    // Test if we just want one-to-the-left and one-to-the-right.
    model.Window = 1;

    // Since this is the first word there is only one to the right.
    indices = model.GetRandomIndicesAroundPosition(0, Tuple.Create(0, 2));
    Assert.IsNotNull(indices);
    Assert.AreEqual(1, indices.Length);
    Assert.AreEqual(1, indices[0]);

    // The second word has one neighbour on each side.
    indices = model.GetRandomIndicesAroundPosition(1, Tuple.Create(0, 2));
    Assert.IsNotNull(indices);
    Assert.AreEqual(2, indices.Length);
    Assert.AreEqual(0, indices[0]);
    Assert.AreEqual(2, indices[1]);
}
private Array<float> _test_3; // [_test_1, _test_2] concatenation, shape (4, 2)

// Copies the model and the three sample arrays out of the shared fixture
// so the tests in this class can use them directly.
public TestSortingNeighbors(Shared shared)
{
    _w2v = shared.W2v;
    _test_1 = shared.Test1;
    _test_2 = shared.Test2;
    _test_3 = shared.Test3;
}
// Fits a Word2Vec model on a one-row DataFrame, checks the number of synonyms returned
// by FindSynonyms, and verifies that a saved model reloads with the same Uid.
public void TestWord2VecModel()
{
    DataFrame sentences =
        _spark.Sql("SELECT split('Hi I heard about Spark', ' ') as text");

    Word2Vec estimator = new Word2Vec()
        .SetInputCol("text")
        .SetOutputCol("result")
        .SetMinCount(1);
    Word2VecModel fitted = estimator.Fit(sentences);

    const int synonymCount = 2;
    DataFrame synonyms = fitted.FindSynonyms("Hi", synonymCount);
    Assert.Equal(synonymCount, synonyms.Count());
    synonyms.Show();

    // Round-trip the model through a temporary directory that is cleaned up on dispose.
    using (var scratch = new TemporaryDirectory())
    {
        string modelPath = Path.Join(scratch.Path, "word2vecModel");
        fitted.Save(modelPath);

        Word2VecModel reloaded = Word2VecModel.Load(modelPath);
        Assert.Equal(fitted.Uid(), reloaded.Uid());
    }
}
// Matches every character that is not kept by Vectorize. Built once and shared,
// instead of being re-parsed on every call.
private static readonly Regex DisallowedCharacters = new Regex("[^a-zåäöA-ZÅÄÖ0-9 -]");

// Turns a document into a single vector: the text is lower-cased, stripped of disallowed
// characters, split on spaces, filtered through IsStopWord, and the per-word vectors
// looked up in the model are averaged along axis 0.
//
// doc:     the text to vectorize; must not be null.
// model:   the Word2Vec model indexed by word.
// returns: the mean over a (word count x MODEL_VECTOR_SIZE) matrix of word vectors.
// throws:  System.ArgumentNullException when doc is null.
public static NDArray Vectorize(string doc, Word2Vec model)
{
    if (doc == null)
    {
        throw new System.ArgumentNullException(nameof(doc));
    }

    // Invariant lower-casing keeps the result independent of the machine's culture
    // (culture-sensitive casing can map 'I' to a character the filter would then strip).
    doc = DisallowedCharacters.Replace(doc.ToLowerInvariant(), "");

    var words = Array.FindAll(doc.Split(' '), word => !IsStopWord(word));
    var word_vectors = np.zeros((words.Length, MODEL_VECTOR_SIZE));

    for (int i = 0; i < words.Length; i++)
    {
        try
        {
            var vector = model[words[i]];
            for (int j = 0; j < vector.Size; j++)
            {
                word_vectors[i, j] = (float)vector[j];
            }
        }
        catch (System.Exception)
        {
            // Deliberate best-effort: a failed lookup (presumably a word missing from the
            // model -- TODO confirm the exception type and narrow this catch) leaves the
            // row as zeros, which still counts towards the mean below.
        }
    }

    return np.mean(word_vectors, 0);
}
// Builds the shared fixture: a seeded random Word2Vec model over ten single-letter words,
// two random sample vectors, and a third array that interleaves the first two.
public Shared()
{
    // Seed NumNet's generator; the draws below must stay in this order.
    NN.Random.Seed(123);

    // Creating word2vec.
    var embeddings = NN.Random.Normal(0, 1, 10, 4);
    var labels = new[] { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j" };
    W2v = new Word2Vec(labels, embeddings);

    // Creating sample vectors for neighbor search.
    Test1 = NN.Random.Normal(0, 1, 4); // vector
    Test2 = NN.Random.Normal(0, 1, 4); // second vector

    // Interleave the two vectors: even slots come from Test1, odd slots from Test2.
    var interleaved = new float[8];
    for (int slot = 0, row = 0; row < 4; row++, slot += 2)
    {
        interleaved[slot] = Test1.Values[row];
        interleaved[slot + 1] = Test2.Values[row];
    }

    // Concat of the 2 first vectors, reshaped to (4, 2).
    Test3 = NN.Array(interleaved).Reshape(4, 2);
}
// Demonstrates three ways of changing the "seed" param on a Word2Vec instance
// (SetSeed, Set with a new Param, Set with a Param from GetParam) and then clearing it,
// printing ExplainParams() after each step.
static void Main(string[] args)
{
    const string separator = "~~~~~~";

    var estimator = new Word2Vec();

    // 1. Dedicated setter.
    estimator.SetSeed(123);
    Console.WriteLine(estimator.ExplainParams());
    Console.WriteLine(separator);

    // 2. Generic Set with a freshly constructed Param.
    var constructedSeed = new Param(estimator, "seed", "Setting the seed to 54321");
    estimator.Set(constructedSeed, 54321L);
    Console.WriteLine(estimator.ExplainParams());
    Console.WriteLine(separator);

    // 3. Generic Set with the Param looked up by name.
    var lookedUpSeed = estimator.GetParam("seed");
    estimator.Set(lookedUpSeed, 12345L);
    Console.WriteLine(estimator.ExplainParams());

    // 4. Clear the param and print the result.
    estimator.Clear(lookedUpSeed);
    Console.WriteLine(estimator.ExplainParams());
}
// Reads four words from a small corpus and checks each has a target word and a non-empty
// context, then moves the corpus position to 25 and expects ReadNextWord to return null.
public void TestReadNextNode()
{
    var corpus = @"The dog saw a cat. The dog chased a cat. The cat climbed a tree";

    var model = new Word2Vec();
    model.Sample = 0;
    model.Size = 3;
    model.Window = 1;
    model.BuildVocab(corpus);

    // The leafs are fetched but not inspected; the call is kept as in the original test.
    model.Vocab.GetLeafs();

    var remaining = 4;
    while (remaining > 0)
    {
        var batch = model.ReadNextWord();
        Assert.IsNotNull(batch);
        Assert.IsNotNull(batch.TargetWord);
        Assert.IsNotNull(batch.ContextWords);
        Assert.AreNotEqual(0, batch.ContextWords.Count);
        remaining--;
    }

    model.CurrentCorpusPosition = 25;
    Assert.IsNull(model.ReadNextWord());
}
// Smoke test: a Word2Vec built from random vectors can be written with SaveBinary
// without throwing. The temporary file is removed afterwards.
public void CanDumpVectors()
{
    int vocSize = 100, vecSize = 10;
    var vectors = NN.Random.Uniform(-1f, 1f, vocSize, vecSize);
    var words = Enumerable.Range(0, vocSize).Select(i => i.ToString()).ToArray();
    var w2v = new Word2Vec(words, vectors);

    // Path.GetTempFileName() creates the file on disk, so it has to be deleted
    // explicitly or every test run leaves one behind.
    var path = Path.GetTempFileName();
    try
    {
        w2v.SaveBinary(path);
    }
    finally
    {
        File.Delete(path);
    }
}
// private methods

// A merge candidate is valid when its core merge word has a word vector and its word count
// reaches the configured minimum for real-word merge candidates.
private static bool IsValidMergeCand(MergeObj mergeObj, CSpellApi cSpellApi)
{
    WordWcMap wordWcMap = cSpellApi.GetWordWcMap();
    Word2Vec word2VecOm = cSpellApi.GetWord2VecOm();
    string coreWord = mergeObj.GetCoreMergeWord();
    int minWc = cSpellApi.GetCanRwMergeCandMinWc();

    // The word count is only looked up for words that have a vector.
    if (!word2VecOm.HasWordVec(coreWord))
    {
        return false;
    }

    return WordCountScore.GetWc(coreWord, wordWcMap) >= minWc;
}
// Loads the text word2vec model, vectorizes the training and test question pairs,
// and runs both learners on them.
static void Main(string[] args)
{
    // Alternative model, kept for reference:
    //var word2VecModel = Word2Vec.LoadBinary("models/GoogleNews-vectors-negative300.bin", true, false, "", Encoding.GetEncoding("ISO-8859-1"));
    var embeddings = Word2Vec.LoadText("models/bloggmix2013w2v.txt", true, false, "");

    // Training pairs use the range 0..3000, test pairs the range 0..100.
    var trainingSet = getQuestionsAsVector(0, 3000, "data/train_sv.csv", embeddings);
    var testSet = getQuestionsAsVector(0, 100, "data/test_sv.csv", embeddings);

    Learning.LargeNetwork(trainingSet, testSet);
    Learning.LightGbm(trainingSet);
}
// Checks that GetRandomWindowStartEnd returns a window whose end is after its start,
// then prints the window obtained with Window = 1 (that second result is not asserted).
public void TestGetRandomWindowStartEnd()
{
    var model = new Word2Vec();

    var window = model.GetRandomWindowStartEnd();
    Assert.IsNotNull(window);
    Console.WriteLine(window);
    Assert.IsTrue(window.Item2 > window.Item1);

    model.Window = 1;
    window = model.GetRandomWindowStartEnd();
    Console.WriteLine(window);
}
// Round-trip test: vectors and words written with SaveBinary must be read back unchanged
// by LoadBinary (with normalization disabled). The temporary file is removed afterwards.
public void CanReadDumpedVectors()
{
    int vocSize = 100, vecSize = 10;
    var vectors = NN.Random.Uniform(-1f, 1f, vocSize, vecSize);
    var words = Enumerable.Range(0, vocSize).Select(i => i.ToString()).ToArray();
    var w2v = new Word2Vec(words, vectors);

    // Path.GetTempFileName() creates the file on disk, so it has to be deleted
    // explicitly or every test run leaves one behind.
    var path = Path.GetTempFileName();
    try
    {
        w2v.SaveBinary(path);

        var w2vLoaded = Word2Vec.LoadBinary(path, normalize: false);
        AssertArray.AreEqual(vectors, w2vLoaded.Vectors);
        AssertArray.AreEqual(words, w2vLoaded.Text);
    }
    finally
    {
        // NOTE(review): assumes LoadBinary does not keep the file open -- confirm.
        File.Delete(path);
    }
}
// Sets every Word2Vec param through its setter, checks that each getter returns the value
// that was set, and verifies that a saved estimator reloads with the same Uid.
public void TestWord2Vec()
{
    // The DataFrame is created but not used further; the call is kept as in the original test.
    DataFrame documentDataFrame = _spark.Sql("SELECT split('Spark dotnet is cool', ' ')");

    const string inputCol = "text";
    const string outputCol = "result";
    const int minCount = 0;
    const int maxIter = 10;
    const int maxSentenceLength = 100;
    const int numPartitions = 1000;
    const int seed = 10000;
    const double stepSize = 1.9;
    const int vectorSize = 20;
    const int windowSize = 200;

    Word2Vec estimator = new Word2Vec()
        .SetInputCol(inputCol)
        .SetOutputCol(outputCol)
        .SetMinCount(minCount)
        .SetMaxIter(maxIter)
        .SetMaxSentenceLength(maxSentenceLength)
        .SetNumPartitions(numPartitions)
        .SetSeed(seed)
        .SetStepSize(stepSize)
        .SetVectorSize(vectorSize)
        .SetWindowSize(windowSize);

    // Every getter must echo the value passed to the matching setter.
    Assert.Equal(inputCol, estimator.GetInputCol());
    Assert.Equal(outputCol, estimator.GetOutputCol());
    Assert.Equal(minCount, estimator.GetMinCount());
    Assert.Equal(maxIter, estimator.GetMaxIter());
    Assert.Equal(maxSentenceLength, estimator.GetMaxSentenceLength());
    Assert.Equal(numPartitions, estimator.GetNumPartitions());
    Assert.Equal(seed, estimator.GetSeed());
    Assert.Equal(stepSize, estimator.GetStepSize());
    Assert.Equal(vectorSize, estimator.GetVectorSize());
    Assert.Equal(windowSize, estimator.GetWindowSize());

    // Round-trip the estimator through a temporary directory that is cleaned up on dispose.
    using (var scratch = new TemporaryDirectory())
    {
        string estimatorPath = Path.Join(scratch.Path, "word2vec");
        estimator.Save(estimatorPath);

        Word2Vec reloaded = Word2Vec.Load(estimatorPath);
        Assert.Equal(estimator.Uid(), reloaded.Uid());
    }
}
// Real-word candidates are more restricted than non-word candidates.
// TBD, need to organize the code ...
// the check should be done in the ranking
// Core process for real-word candidates.
//
// A candidate passes when:
//   1. it is in the suggestion dictionary, has a word vector, meets the minimum length and
//      minimum word count, and is not an inflectional variant of the input word; and
//   2. it either sounds the same as the input (both phonetic distances are 0), or it sounds
//      similar and is also orthographically similar (thresholds fixed from empirical tests,
//      not configurable).
private static bool IsValid1To1Cand(string inWord, string cand, CSpellApi cSpellApi)
{
    RootDictionary suggestDic = cSpellApi.GetSuggestDic();
    Word2Vec word2VecOm = cSpellApi.GetWord2VecOm();
    WordWcMap wordWcMap = cSpellApi.GetWordWcMap();
    int minWc = cSpellApi.GetCanRw1To1CandMinWc();
    int minLength = cSpellApi.GetCanRw1To1CandMinLength();
    string inWordLc = inWord.ToLower();
    int candLen = cand.Length;

    // 1. check suggDic and inflVars (checks short-circuit in this order)
    bool eligible = suggestDic.IsDicWord(cand)
        && word2VecOm.HasWordVec(cand)
        && candLen >= minLength
        && WordCountScore.GetWc(cand, wordWcMap) >= minWc
        && !InflVarsUtil.IsInflectionVar(inWordLc, cand);
    if (!eligible)
    {
        return false;
    }

    // 2. more restrictions for real-word candidates: phonetic and orthographic distances
    int pmDist = Metaphone2.GetDistance(inWordLc, cand);
    int prDist = RefinedSoundex.GetDistance(inWordLc, cand);
    int leadDist = GetLeadCharDist(inWordLc, cand);
    int endDist = GetEndCharDist(inWordLc, cand);
    int lengthDist = GetLengthDist(inWordLc, cand);
    int shapeAndSoundDist = leadDist + endDist + lengthDist + pmDist + prDist;
    int editDist = EditDistance.GetDistanceForRealWord(inWordLc, cand);
    int editAndSoundDist = editDist + pmDist + prDist;

    // They sound the same.
    bool soundsIdentical = pmDist == 0 && prDist == 0;

    // They sound similar (at least one phonetic distance is 0) and look similar.
    bool soundsAndLooksSimilar = shapeAndSoundDist < 3
        && editAndSoundDist < 4
        && pmDist * prDist == 0;

    return soundsIdentical || soundsAndLooksSimilar;
}
// Checks ToVector for a single node (1 at the node's index, 0 elsewhere) and for a list of
// two nodes (0.5 at each node's index, 0 elsewhere).
public void TestToVector()
{
    var corpus = @"The dog saw a cat. The dog chased a cat. The cat climbed a tree";
    var model = new Word2Vec();
    model.Sample = 0;
    model.Size = 3;
    model.Window = 1;
    model.BuildVocab(corpus);

    var dogNode = model.Vocab.GetNodeByWord("dog");
    var catNode = model.Vocab.GetNodeByWord("cat");

    // Single node: a one-hot row.
    var encoded = model.ToVector(dogNode);
    for (var column = 0; column < encoded.CountOfColumns(); column++)
    {
        var expected = column == dogNode.Index ? 1D : 0D;
        Assert.AreEqual(expected, encoded[0, column]);
    }

    // Two nodes: each of the two indices carries 0.5 (within a small tolerance).
    encoded = model.ToVector(new List<HuffmanNode> { dogNode, catNode });
    for (var column = 0; column < encoded.CountOfColumns(); column++)
    {
        var isNodeColumn = column == dogNode.Index || column == catNode.Index;
        if (!isNodeColumn)
        {
            Assert.AreEqual(0D, encoded[0, column]);
            continue;
        }

        Assert.IsTrue(System.Math.Abs(encoded[0, column] - 0.5) < 0.0001);
    }
}
// Loads normalized embeddings from a fixed local path, computes their pseudo-inverse with
// PowerMethod, and checks the pseudo-inverse identities within a 1e-6 tolerance.
public void TestPseudoInverse()
{
    // NOTE(review): machine-specific path; the test can only run where this file exists.
    var embeddingsPath = @"C:\Users\joc\AppData\Local\ProtoStudio\Banque\embeddings.bin";
    var words = Word2Vec.LoadBinary(embeddingsPath, normalize: true).Vectors /*[_, Until(100)]*/;

    //var pseudoInv = PseudoInv(words);
    var pseudoInv = PowerMethod(words);

    // When embeddings have linearly independent dimensions: pinv * W == I.
    var identity = NN.Eye(words.Shape[1]);
    AssertArray.AreAlmostEqual(identity, pseudoInv.Dot(words), 1e-6f, 1e-6f);

    // Least probable: words are NOT linearly independent.
    //AssertArray.AreAlmostEqual(NN.Eye(words.Shape[0]), words.Dot(pseudoInv), 1e-3f, 1e-5f);

    // W * pinv * W == W; skipped for more than 1000 rows, otherwise too long.
    var smallEnough = words.Shape[0] <= 1000;
    if (smallEnough)
    {
        AssertArray.AreAlmostEqual(words, words.Dot(pseudoInv).Dot(words), 1e-6f, 1e-6f);
    }

    // pinv * W * pinv == pinv.
    AssertArray.AreAlmostEqual(pseudoInv, pseudoInv.Dot(words).Dot(pseudoInv), 1e-6f, 1e-6f);
}
// Builds a vocabulary from the HPBook3 text and checks that it has at least one leaf.
public void TestBuildVocab()
{
    var corpusFile = PutTestFileOnDisk("HPBook3.txt");
    Console.WriteLine(corpusFile);
    Assert.IsTrue(System.IO.File.Exists(corpusFile));

    var corpusText = System.IO.File.ReadAllText(corpusFile);
    Assert.IsNotNull(corpusText);

    var model = new Word2Vec();
    model.BuildVocab(corpusText);
    Assert.IsNotNull(model.Vocab);

    var leafs = model.Vocab.GetLeafs();
    Assert.IsNotNull(leafs);
    Assert.AreNotEqual(0, leafs.Count);
}
// Builds a vocabulary from a three-sentence corpus and checks that GetContextNodes(2)
// returns a non-empty collection of nodes.
public void TestGetSampleSentence_Simple()
{
    var corpus = @"The dog saw a cat. The dog chased a cat. The cat climbed a tree";

    var model = new Word2Vec();
    model.Sample = 0;
    model.Size = 3;
    model.Window = 1;
    model.BuildVocab(corpus);

    //TODO - once implementation is figured out
    var contextNodes = model.GetContextNodes(2);
    Assert.IsNotNull(contextNodes);
    Assert.AreNotEqual(0, contextNodes.Count);

    // Print the words of the returned nodes for manual inspection.
    var words = contextNodes.Select(node => node.Word);
    Console.WriteLine(string.Join(", ", words));
}
// For the split, we don't want Aa as a valid word
// because it will cause too much noise (less precision).
// TBD ... re-organize
//
// A real-word split-candidate word must:
//   1. be in the splitWordDic (no Aa with a small length;
//      e.g. "cel" is an overlap, it is aa or not-aa)
//   2. have word2Vec
//   3. have WC at or above the configured minimum
//   4. not be a unit, mg -> ...
//   5. not be a properNoun, human -> Hu man, where Hu is pn
//      children -> child ren, where ren is pn
private static bool IsValidSplitWord(string inWord, CSpellApi cSpellApi)
{
    // splitWord uses LexiconNoAa for Dic
    RootDictionary splitWordDic = cSpellApi.GetSplitWordDic();
    WordWcMap wordWcMap = cSpellApi.GetWordWcMap();
    Word2Vec word2VecOm = cSpellApi.GetWord2VecOm();
    RootDictionary unitDic = cSpellApi.GetUnitDic();
    RootDictionary pnDic = cSpellApi.GetPnDic();
    //RootDictionary aaDic = cSpellApi.GetAaDic();
    int minWc = cSpellApi.GetCanRwSplitCandMinWc();

    // Each guard only runs when all previous ones passed.
    if (!splitWordDic.IsDicWord(inWord))
    {
        return false;
    }

    if (!word2VecOm.HasWordVec(inWord))
    {
        return false;
    }

    if (WordCountScore.GetWc(inWord, wordWcMap) < minWc)
    {
        return false;
    }

    if (unitDic.IsDicWord(inWord))
    {
        return false;
    }

    return !pnDic.IsDicWord(inWord);
}
// Reads question pairs from filePath (range from..to) and, for each pair, concatenates the
// vectors of both questions into one float[] tagged with the pair's Related flag.
// Pairs that fail to vectorize are reported on the console and skipped.
static List<Tuple<bool, float[]>> getQuestionsAsVector(int from, int to, string filePath, Word2Vec word2VecModel)
{
    //var word2VecModel = Word2Vec.LoadText("models/newsv/model.txt", true, false, "");
    var vectorized = new List<Tuple<bool, float[]>>();

    foreach (QuoraQuestionPair pair in readQuestionPairs(from, to, filePath))
    {
        try
        {
            NDArray first = TextProcessing.Vectorize(pair.Question1, word2VecModel);
            NDArray second = TextProcessing.Vectorize(pair.Question2, word2VecModel);
            //NDArray diff = np.subtract(v1, v2);

            // NDArray -> double[] -> float[]
            double[] joined = np.concatenate((first, second)).ToArray<double>();
            float[] features = Array.ConvertAll(joined, value => (float)value);

            vectorized.Add(new Tuple<bool, float[]>(pair.Related, features));
        }
        catch (System.Exception)
        {
            Console.WriteLine("could not vectorize.");
        }
    }

    return vectorized;
}
// Update parameters from the config file to cSpellApi.
//
// Loads the configuration (falling back to the class-path resource "data.Config.cSpell"
// when no config file was given), applies any property overrides, loads every dictionary
// and model file relative to the cSpell directory, and parses all numeric and boolean
// settings into the corresponding fields.
//
// Numeric settings are parsed with the invariant culture: they come from a configuration
// file, so a value such as "0.6" must not depend on the machine's regional settings.
//
// debugFlag: passed through to the dictionary loaders that accept it.
// throws:    whatever int.Parse / double.Parse / bool.Parse throw for a missing or
//            malformed setting, and whatever the file loaders throw.
private void Init(bool debugFlag)
{
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // get config file from environment variable
    bool useClassPath = false;
    if (configFile_ == null)
    {
        useClassPath = true;
        configFile_ = "data.Config.cSpell";
    }

    // read in configuration file
    conf_ = new Configuration(configFile_, useClassPath);
    if (properties_ != null)
    {
        conf_.OverwriteProperties(properties_);
    }
    string cSpellDir = conf_.GetProperty(Configuration.CS_DIR);

    // files: pre-correction
    string infExpFile = cSpellDir + conf_.GetProperty(Configuration.CS_INFORMAL_EXP_FILE);
    infExpMap_ = InformalExpHandler.GetInformalExpMapFromFile(infExpFile);

    // get dictionary for spell checker
    string checkDicFileStrs = conf_.GetProperty(Configuration.CS_CHECK_DIC_FILES);
    checkDic_.AddDictionaries(checkDicFileStrs, cSpellDir, debugFlag);

    // get dictionary for spell suggestion - candidate
    string suggestDicFileStrs = conf_.GetProperty(Configuration.CS_SUGGEST_DIC_FILES);
    suggestDic_.AddDictionaries(suggestDicFileStrs, cSpellDir, debugFlag);

    // no acr/abb dictionary: en + pn, used for split check
    string splitWordDicFileStrs = conf_.GetProperty(Configuration.CS_SPLIT_WORD_DIC_FILES);
    splitWordDic_.AddDictionaries(splitWordDicFileStrs, cSpellDir, debugFlag);

    // mw dictionary
    string mwDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_MW_DIC_FILE);
    mwDic_.AddDictionary(mwDicFile);

    // properNoun dictionary
    string pnDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_PN_DIC_FILE);
    pnDic_.AddDictionary(pnDicFile);

    // abb/acr dictionary
    string aaDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_AA_DIC_FILE);
    aaDic_.AddDictionary(aaDicFile);

    // spVar dictionary
    string svDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_SV_DIC_FILE);
    svDic_.AddDictionary(svDicFile);

    // unit file
    string unitDicFile = cSpellDir + conf_.GetProperty(Configuration.CS_UNIT_DIC_FILE);
    unitDic_.AddDictionary(unitDicFile);

    // frequency file
    string frequencyFile = cSpellDir + conf_.GetProperty(Configuration.CS_FREQUENCY_FILE);
    wordWcMap_ = new WordWcMap(frequencyFile);

    // word2Vec file
    string word2VecImFile = cSpellDir + conf_.GetProperty(Configuration.CS_W2V_IM_FILE);
    word2VecIm_ = new Word2Vec(word2VecImFile);
    string word2VecOmFile = cSpellDir + conf_.GetProperty(Configuration.CS_W2V_OM_FILE);
    word2VecOm_ = new Word2Vec(word2VecOmFile);

    // mode
    funcMode_ = int.Parse(conf_.GetProperty(Configuration.CS_FUNC_MODE), invariant);
    rankMode_ = int.Parse(conf_.GetProperty(Configuration.CS_RANK_MODE), invariant);

    // detectors
    maxLegitTokenLength_ = int.Parse(conf_.GetProperty(Configuration.CS_MAX_LEGIT_TOKEN_LENGTH), invariant);
    dRwSplitWordMinLength_ = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_SPLIT_WORD_MIN_LENGTH), invariant);
    dRwSplitWordMinWc_ = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_SPLIT_WORD_MIN_WC), invariant);
    dRw1To1WordMinLength_ = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_1TO1_WORD_MIN_LENGTH), invariant);
    dRw1To1WordMinWc_ = int.Parse(conf_.GetProperty(Configuration.CS_DETECTOR_RW_1TO1_WORD_MIN_WC), invariant);

    // candidates
    cMaxCandNo_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_MAX_CANDIDATE_NO), invariant);
    cNdMaxSplitNo_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_ND_MAX_SPLIT_NO), invariant);
    cNwMaxSplitNo_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MAX_SPLIT_NO), invariant);
    cNwMaxMergeNo_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MAX_MERGE_NO), invariant);
    cNwMergeWithHyphen_ = bool.Parse(conf_.GetProperty(Configuration.CS_CAN_NW_MERGE_WITH_HYPHEN));
    cRwMaxSplitNo_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_SPLIT_NO), invariant);
    cRwMaxMergeNo_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_MERGE_NO), invariant);
    cRwMergeWithHyphen_ = bool.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MERGE_WITH_HYPHEN));
    cRwShortSplitWordLength_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_SHORT_SPLIT_WORD_LENGTH), invariant);
    cRwMaxShortSplitWordNo_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MAX_SHORT_SPLIT_WORD_NO), invariant);
    cRwMergeCandMinWc_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_MERGE_CAND_MIN_WC), invariant);
    cRwSplitCandMinWc_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_SPLIT_CAND_MIN_WC), invariant);
    cRw1To1CandMinLength_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MIN_LENGTH), invariant);
    cRw1To1CandMinWc_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MIN_WC), invariant);
    cRw1To1CandMaxKeySize_ = int.Parse(conf_.GetProperty(Configuration.CS_CAN_RW_1TO1_CAND_MAX_KEY_SIZE), invariant);

    // rankers
    rNwS1RankRangeFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_NW_S1_RANK_RANGE_FAC), invariant);
    rNwS1MinOScore_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_NW_S1_MIN_OSCORE), invariant);
    rRw1To1CFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_C_FAC), invariant);
    rRwSplitCFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_SPLIT_C_FAC), invariant);
    rRwMergeCFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_MERGE_C_FAC), invariant);
    rRw1To1WordMinCs_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_WORD_MIN_CS), invariant);
    rRw1To1CandCsFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_CS_FAC), invariant);
    rRw1To1CandMinCs_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_MIN_CS), invariant);
    rRw1To1CandCsDist_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_CS_DIST), invariant);
    rRw1To1CandFsFac_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_FS_FAC), invariant);
    rRw1To1CandMinFs_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_MIN_FS), invariant);
    rRw1To1CandFsDist_ = double.Parse(conf_.GetProperty(Configuration.CS_RANKER_RW_1TO1_CAND_FS_DIST), invariant);

    // Score
    orthoScoreEdDistFac_ = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_ED_DIST_FAC), invariant);
    orthoScorePhoneticFac_ = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_PHONETIC_FAC), invariant);
    orthoScoreOverlapFac_ = double.Parse(conf_.GetProperty(Configuration.CS_ORTHO_SCORE_OVERLAP_FAC), invariant);

    // context
    word2VecSkipWord_ = bool.Parse(conf_.GetProperty(Configuration.CS_W2V_SKIP_WORD));
    nw1To1ContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_NW_1TO1_CONTEXT_RADIUS), invariant);
    nwSplitContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_NW_SPLIT_CONTEXT_RADIUS), invariant);
    nwMergeContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_NW_MERGE_CONTEXT_RADIUS), invariant);
    rw1To1ContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_RW_1TO1_CONTEXT_RADIUS), invariant);
    rwSplitContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_RW_SPLIT_CONTEXT_RADIUS), invariant);
    rwMergeContextRadius_ = int.Parse(conf_.GetProperty(Configuration.CS_RW_MERGE_CONTEXT_RADIUS), invariant);
}
// Initializes cSpellApi from the typed configuration object: loads every dictionary and
// model file named in the settings and copies all tuning values into the matching fields.
// Logs one message before and one after the initialization.
//
// debugFlag: passed through to the dictionary loaders that accept it.
private void Init2(bool debugFlag)
{
    _logger.LogInformation("cSpellApi initialization...");

    // Read the settings object once; every value below comes from it.
    var settings = _config.Value;

    // files: informal expressions, dictionaries, frequency and word2Vec models
    infExpMap_ = InformalExpHandler.GetInformalExpMapFromFile(settings.CS_INFORMAL_EXP_FILE);
    checkDic_.AddDictionaries2(settings.CS_CHECK_DIC_FILES, debugFlag);
    suggestDic_.AddDictionaries2(settings.CS_SUGGEST_DIC_FILES, debugFlag);
    splitWordDic_.AddDictionaries2(settings.CS_SPLIT_WORD_DIC_FILES, debugFlag);
    mwDic_.AddDictionary(settings.CS_MW_DIC_FILE);
    pnDic_.AddDictionary(settings.CS_PN_DIC_FILE);
    aaDic_.AddDictionary(settings.CS_AA_DIC_FILE);
    svDic_.AddDictionary(settings.CS_SV_DIC_FILE);
    unitDic_.AddDictionary(settings.CS_UNIT_DIC_FILE);
    wordWcMap_ = new WordWcMap(settings.CS_FREQUENCY_FILE);
    word2VecIm_ = new Word2Vec(settings.CS_W2V_IM_FILE);
    word2VecOm_ = new Word2Vec(settings.CS_W2V_OM_FILE);

    // mode
    funcMode_ = settings.CS_FUNC_MODE;
    rankMode_ = settings.CS_RANK_MODE;

    // detectors
    maxLegitTokenLength_ = settings.CS_MAX_LEGIT_TOKEN_LENGTH;
    dRwSplitWordMinLength_ = settings.CS_DETECTOR_RW_SPLIT_WORD_MIN_LENGTH;
    dRwSplitWordMinWc_ = settings.CS_DETECTOR_RW_SPLIT_WORD_MIN_WC;
    dRw1To1WordMinLength_ = settings.CS_DETECTOR_RW_1TO1_WORD_MIN_LENGTH;
    dRw1To1WordMinWc_ = settings.CS_DETECTOR_RW_1TO1_WORD_MIN_WC;

    // candidates
    cMaxCandNo_ = settings.CS_CAN_MAX_CANDIDATE_NO;
    cNdMaxSplitNo_ = settings.CS_CAN_ND_MAX_SPLIT_NO;
    cNwMaxSplitNo_ = settings.CS_CAN_NW_MAX_SPLIT_NO;
    cNwMaxMergeNo_ = settings.CS_CAN_NW_MAX_MERGE_NO;
    cNwMergeWithHyphen_ = settings.CS_CAN_NW_MERGE_WITH_HYPHEN;
    cRwMaxSplitNo_ = settings.CS_CAN_RW_MAX_SPLIT_NO;
    cRwMaxMergeNo_ = settings.CS_CAN_RW_MAX_MERGE_NO;
    cRwMergeWithHyphen_ = settings.CS_CAN_RW_MERGE_WITH_HYPHEN;
    cRwShortSplitWordLength_ = settings.CS_CAN_RW_SHORT_SPLIT_WORD_LENGTH;
    cRwMaxShortSplitWordNo_ = settings.CS_CAN_RW_MAX_SHORT_SPLIT_WORD_NO;
    cRwMergeCandMinWc_ = settings.CS_CAN_RW_MERGE_CAND_MIN_WC;
    cRwSplitCandMinWc_ = settings.CS_CAN_RW_SPLIT_CAND_MIN_WC;
    cRw1To1CandMinLength_ = settings.CS_CAN_RW_1TO1_CAND_MIN_LENGTH;
    cRw1To1CandMinWc_ = settings.CS_CAN_RW_1TO1_CAND_MIN_WC;
    cRw1To1CandMaxKeySize_ = settings.CS_CAN_RW_1TO1_CAND_MAX_KEY_SIZE;

    // rankers
    rNwS1RankRangeFac_ = settings.CS_RANKER_NW_S1_RANK_RANGE_FAC;
    rNwS1MinOScore_ = settings.CS_RANKER_NW_S1_MIN_OSCORE;
    rRw1To1CFac_ = settings.CS_RANKER_RW_1TO1_C_FAC;
    rRwSplitCFac_ = settings.CS_RANKER_RW_SPLIT_C_FAC;
    rRwMergeCFac_ = settings.CS_RANKER_RW_MERGE_C_FAC;
    rRw1To1WordMinCs_ = settings.CS_RANKER_RW_1TO1_WORD_MIN_CS;
    rRw1To1CandCsFac_ = settings.CS_RANKER_RW_1TO1_CAND_CS_FAC;
    rRw1To1CandMinCs_ = settings.CS_RANKER_RW_1TO1_CAND_MIN_CS;
    rRw1To1CandCsDist_ = settings.CS_RANKER_RW_1TO1_CAND_CS_DIST;
    rRw1To1CandFsFac_ = settings.CS_RANKER_RW_1TO1_CAND_FS_FAC;
    rRw1To1CandMinFs_ = settings.CS_RANKER_RW_1TO1_CAND_MIN_FS;
    rRw1To1CandFsDist_ = settings.CS_RANKER_RW_1TO1_CAND_FS_DIST;

    // Score
    orthoScoreEdDistFac_ = settings.CS_ORTHO_SCORE_ED_DIST_FAC;
    orthoScorePhoneticFac_ = settings.CS_ORTHO_SCORE_PHONETIC_FAC;
    orthoScoreOverlapFac_ = settings.CS_ORTHO_SCORE_OVERLAP_FAC;

    // context
    word2VecSkipWord_ = settings.CS_W2V_SKIP_WORD;
    nw1To1ContextRadius_ = settings.CS_NW_1TO1_CONTEXT_RADIUS;
    nwSplitContextRadius_ = settings.CS_NW_SPLIT_CONTEXT_RADIUS;
    nwMergeContextRadius_ = settings.CS_NW_MERGE_CONTEXT_RADIUS;
    rw1To1ContextRadius_ = settings.CS_RW_1TO1_CONTEXT_RADIUS;
    rwSplitContextRadius_ = settings.CS_RW_SPLIT_CONTEXT_RADIUS;
    rwMergeContextRadius_ = settings.CS_RW_MERGE_CONTEXT_RADIUS;

    _logger.LogInformation("cSpellApi initialized successfully");
}
// Trains a Word2Vec model with the settings below, then benchmarks Distance and WordAnalogy
// searches against the resulting vectors: one initial (cold) search, followed by five
// batches of eleven timed searches per query with min / max / average / median reported.
//
// The comments on the settings quote the defaults of the corresponding command-line options;
// the values assigned here deliberately differ from some of them.
static void Main(string[] args)
{
    // -train <file> Use text data from <file> to train the model
    string train = "Corpus.txt";
    // -output <file> Use <file> to save the resulting word vectors / word clusters
    string output = "Vectors.bin";
    // -save-vocab <file> The vocabulary will be saved to <file>
    string savevocab = "";
    // -read-vocab <file> The vocabulary will be read from <file>, not constructed from the training data
    string readvocab = "";
    // -size <int> Set size of word vectors; default is 100
    int size = 100;
    // -debug <int> Set the debug mode (default = 2 = more info during training)
    int debug = 2;
    // -binary <int> Save the resulting vectors in binary mode; default is 0 (off)
    int binary = 1;
    // -cbow <int> Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
    int cbow = 1;
    // -alpha <float> Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
    float alpha = 0.05f;
    // -sample <float> Set threshold for occurrence of words. Those that appear with higher frequency in the training data
    float sample = 1e-4f;
    // -hs <int> Use Hierarchical Softmax; default is 0 (not used)
    int hs = 0;
    // -negative <int> Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
    int negative = 25;
    // -threads <int> Use <int> threads (default 12)
    int threads = 12;
    // -iter <int> Run more training iterations (default 5)
    long iter = 15;
    // -min-count <int> This will discard words that appear less than <int> times; default is 5
    int mincount = 5;
    // -classes <int> Output word classes rather than word vectors; default number of classes is 0 (vectors are written)
    long classes = 0;
    // -window <int> Set max skip length between words; default is 5
    int window = 12;

    Word2Vec word2Vec = new Word2Vec(train, output, savevocab, readvocab, size, debug, binary, cbow, alpha, sample, hs, negative, threads, iter, mincount, classes, window);

    // Training.
    var totalTime = Stopwatch.StartNew();
    var highRes = Stopwatch.IsHighResolution;
    word2Vec.TrainModel();
    totalTime.Stop();

    var trainingTime = totalTime.ElapsedMilliseconds;
    Console.WriteLine("Training took {0}ms", trainingTime);

    // Load the trained vectors into both search helpers.
    path = @"Vectors.bin";
    distance = new Distance(path);
    wordAnalogy = new WordAnalogy(path);

    string[] wordList = new string[] { "paris france madrid" };

    // First (cold) search, timed as a whole.
    var searchTime = Stopwatch.StartNew();
    foreach (string word in wordList)
    {
        distance.Search(word);
        wordAnalogy.Search(word);
    }
    searchTime.Stop();

    var firstSearchTime = searchTime.ElapsedMilliseconds;
    Console.WriteLine("Search took {0}ms", firstSearchTime);

    // Steady-state measurements.
    int outerN = 5;
    for (int outer = 0; outer < outerN; outer++)
    {
        foreach (string word in wordList)
        {
            int N = 11;
            var minSearchTime = long.MaxValue;
            var maxSearchTime = long.MinValue;
            long[] searchTimes = new long[N];

            Console.WriteLine($"Batch {outer}, searching {word}: running {N} searches");
            for (int inner = 0; inner < N; inner++)
            {
                searchTime.Restart();
                distance.Search(word);
                wordAnalogy.Search(word);
                searchTime.Stop();

                // Ticks when the timer is high resolution, milliseconds otherwise.
                long interval = highRes ? searchTime.ElapsedTicks : searchTime.ElapsedMilliseconds;
                searchTimes[inner] = interval;
                if (interval < minSearchTime)
                {
                    minSearchTime = interval;
                }
                if (interval > maxSearchTime)
                {
                    maxSearchTime = interval;
                }
            }

            if (highRes)
            {
                // Convert ticks to milliseconds in floating point. Integer division here
                // would truncate to whole milliseconds before the F2 formatting is applied.
                double minSearch = 1000.0 * minSearchTime / Stopwatch.Frequency;
                double maxSearch = 1000.0 * maxSearchTime / Stopwatch.Frequency;
                double averageSearch = 1000 * ((double)searchTimes.Sum() / N / Stopwatch.Frequency);
                double medianSearch = 1000 * ((double)searchTimes.OrderBy(t => t).ElementAt(N / 2) / Stopwatch.Frequency);

                Console.WriteLine("Steadystate min search time: {0:F2}ms", minSearch);
                Console.WriteLine("Steadystate max search time: {0:F2}ms", maxSearch);
                Console.WriteLine("Steadystate average search time: {0:F2}ms", averageSearch);
                Console.WriteLine("Steadystate median search time: {0:F2}ms", medianSearch);
            }
            else
            {
                long averageSearch = searchTimes.Sum() / N;
                long medianSearch = searchTimes.OrderBy(t => t).ElementAt(N / 2);

                Console.WriteLine("Steadystate min search time: {0}ms", minSearchTime);
                Console.WriteLine("Steadystate max search time: {0}ms", maxSearchTime);
                Console.WriteLine("Steadystate average search time: {0}ms", (int)averageSearch);
                Console.WriteLine("Steadystate median search time: {0}ms", (int)medianSearch);
            }

            Console.WriteLine("");
        }
    }
}