/// <summary>
/// Fills the empty vectors of the add-in dictionary in <c>DBF_W2V_EN</c>:
/// every word reported by <c>GetWordsWithEmptyVect</c> is pushed to a
/// <c>FastTextLauncher</c> and each result it reports is stored via
/// <c>UpdateVectOfWord</c>. All updates run inside one transaction.
/// </summary>
public void SubProcFillEmptyVectDictEn()
{
    using (var db = new FastTextProcessDB(DBF_W2V_EN))
    {
        var pending = db.Dict(DictDbSet.DictKind.Addin).GetWordsWithEmptyVect();
        if (!pending.Any())
        {
            // nothing to fill
            return;
        }

        var modelPath = DataArcPath("cc.en.300.bin");
        AssertFileExists(modelPath, "FastText model file");
        var exePath = FastTextBin;
        AssertFileExists(exePath, "FastText executable");

        var tx = db.BeginTransaction();
        try
        {
            var addins = db.Dict(DictDbSet.DictKind.Addin);
            using (var launcher = new FastTextLauncher(exePath, modelPath))
            {
                // each result handed back by the launcher is written to the add-in dictionary
                launcher.RunAsync((vec) => addins.UpdateVectOfWord(vec));
                foreach (var word in pending)
                {
                    launcher.Push(word);
                }
            }
            // commit only after the launcher has been disposed
            tx.Commit();
        }
        catch
        {
            tx.Rollback();
            throw;
        }
    }
}
/// <summary>
/// Looks up "sum" (en) and "сумма" (ru) in the main dictionary of
/// <c>DBF_W2V_RUK</c> and passes every Russian and every English dictionary
/// entry, paired with those words, to <c>PrintPair</c>.
/// </summary>
public void TestCosineRUK2Sum()
{
    AssertFileExists(DBF_W2V_RUK, "Ru-Uk w2v DB");
    using (var db = new FastTextProcessDB(DBF_W2V_RUK))
    {
        var mainDict = db.Dict(DictDbSet.DictKind.Main);
        var sumEn = mainDict.FindByWord("sum", LangLabel.en);
        var sumRu = mainDict.FindByWord("сумма", LangLabel.ru);
        PrintPair(sumEn, sumRu);

        Log("RU dict");
        var ruEntries = mainDict.GetAll(LangLabel.ru);
        Parallel.ForEach(ruEntries, (entry) =>
        {
            PrintPair(entry, sumEn, distance_min: 0.3f);
            PrintPair(entry, sumRu, distance_min: 0.6f);
        });

        Log("EN dict");
        var enEntries = mainDict.GetAll(LangLabel.en);
        Parallel.ForEach(enEntries, (entry) => PrintPair(entry, sumEn, distance_min: 0.6f));
    }
    Log("done");
}
/// <summary>
/// Pairs the English word "consumption" (obtained from <c>VectorsService</c>)
/// with every English, Russian and Ukrainian entry of the main dictionary in
/// <c>DBF_W2V_RUK</c> and passes each pair to <c>PrintPair</c>.
/// </summary>
public void TestV2W1consumption()
{
    AssertFileExists(DBF_W2V_RUK, "En-Ru-Uk w2v DB");
    var vectors = VectorsService.Instance(DBF_W2V_RUK);
    var probe = vectors.FindByWord("consumption", LangLabel.en);

    using (var db = new FastTextProcessDB(DBF_W2V_RUK))
    {
        var mainDict = db.Dict(DictDbSet.DictKind.Main);

        Log("En min=0.7");
        Parallel.ForEach(mainDict.GetAll(LangLabel.en),
            (entry) => PrintPair(probe, entry, distance_min: 0.7f));

        Log("Ru min=0.35");
        Parallel.ForEach(mainDict.GetAll(LangLabel.ru),
            (entry) => PrintPair(probe, entry, distance_min: 0.35f));

        Log("Uk min=0.35");
        Parallel.ForEach(mainDict.GetAll(LangLabel.uk),
            (entry) => PrintPair(probe, entry, distance_min: 0.35f));
    }
    Log("done");
}
/// <summary>
/// Copies embed-join items from <c>DBF_W2V_EN</c> into the dictionary of the
/// <c>DBF_AclImdb</c> result DB, starting right after the highest index already
/// stored there (or at 0 when there is none). Runs in one transaction on the
/// result DB and asserts that the items arrive with consecutive indexes.
/// </summary>
public void SubProcAclImdbResultDictEn()
{
    using (var source = new FastTextProcessDB(DBF_W2V_EN))
    using (var target = new FastTextResultDB(DBF_AclImdb))
    {
        var tx = target.BeginTransaction();
        try
        {
            var lastInx = target.GetDictInxMax();
            long expectedInx = 0;
            if (lastInx.HasValue)
            {
                expectedInx = lastInx.Value + 1;
            }

            source.ProcessEmbedJoins((item) =>
            {
                // items must come in strictly consecutive index order
                Assert.Equal(expectedInx, item.Inx);
                target.StoreDictItem(item);
                expectedInx++;
            }, from_inx: expectedInx);

            tx.Commit();
        }
        catch
        {
            tx.Rollback();
            throw;
        }
    }
}
/// <summary>
/// Inserts a dictionary record into the dictionary table of the requested kind.
/// </summary>
/// <param name="dbx">open process DB</param>
/// <param name="w2v">record to insert</param>
/// <param name="kind">dictionary kind whose table receives the record</param>
/// <returns>value of <c>w2v.Id</c> read after the insert</returns>
long InsertDict(FastTextProcessDB dbx, Dict w2v, DictDbSet.DictKind kind)
{
    // honour the caller's kind instead of a hard-coded DictKind.Main
    var w2v_tbl = dbx.Dict(kind);
    w2v_tbl.Insert(w2v);
    return w2v.Id;
}
/// <summary>
/// Logs <c>GetCosine</c> results for a fixed set of word pairs
/// (socks / necktie in uk, ru and en) looked up in the main dictionary
/// of "w2v_ruk.db".
/// </summary>
public void TestCosineRUK()
{
    var dbFile = "w2v_ruk.db";
    AssertFileExists(dbFile, "Ru-Uk w2v DB");
    using (var db = new FastTextProcessDB(dbFile))
    {
        var mainDict = db.Dict(DictDbSet.DictKind.Main);

        var socksUk = mainDict.FindByWord("шкарпетки", Enums.FTLangLabel.__label__uk);
        var socksRu = mainDict.FindByWord("носки", Enums.FTLangLabel.__label__ru);
        var socksEn = mainDict.FindByWord("socks", Enums.FTLangLabel.__label__en);
        var tieUk = mainDict.FindByWord("краватка", Enums.FTLangLabel.__label__uk);
        var tieRu = mainDict.FindByWord("галстук", Enums.FTLangLabel.__label__ru);
        var tieEn = mainDict.FindByWord("necktie", Enums.FTLangLabel.__label__en);

        // pairs are logged in exactly this order
        var pairs = new[]
        {
            new { Left = socksUk, Right = socksRu },
            new { Left = tieUk, Right = tieRu },
            new { Left = socksUk, Right = socksEn },
            new { Left = socksRu, Right = socksEn },
            new { Left = socksUk, Right = tieUk },
            new { Left = socksRu, Right = tieRu },
            new { Left = socksEn, Right = tieEn },
        };
        foreach (var pair in pairs)
        {
            Log($"cos({pair.Left.Word}, {pair.Right.Word}) = {pair.Left.GetCosine(pair.Right)}");
        }
    }
    Log("done");
}
/// <summary>
/// Build the result DB dictionary: embed-join items of the word-to-vector DB
/// are stored into the result DB, continuing right after the highest index
/// already stored there (or from 0 when there is none). Runs in one
/// transaction on the result DB.
/// </summary>
/// <param name="proc_db_fn">Result Db filename</param>
/// <param name="dbf_w2v_fn">DB word to vector filename</param>
protected void SubProcBuildResultDict(string proc_db_fn, string dbf_w2v_fn)
{
    using (var source = new FastTextProcessDB(dbf_w2v_fn))
    using (var target = new FastTextResultDB(proc_db_fn))
    {
        var tx = target.BeginTransaction();
        try
        {
            var lastInx = target.GetDictInxMax();
            long expectedInx = 0;
            if (lastInx.HasValue)
            {
                expectedInx = lastInx.Value + 1;
            }

            source.ProcessEmbedJoins((item) =>
            {
                // items must come in strictly consecutive index order
                Assert.Equal(expectedInx, item.Inx);
                target.StoreDictItem(item);
                expectedInx++;
            }, from_inx: expectedInx);

            tx.Commit();
        }
        catch
        {
            tx.Rollback();
            throw;
        }
    }
}
/// <summary>
/// Makes sure the record produced by <c>Dict.CreateEmpty()</c> exists in the
/// add-in dictionary of <c>DBF_W2V_EN</c> and that the embed dictionary maps it
/// to index 0: a missing mapping is inserted, an existing one is asserted to be 0.
/// </summary>
public void SubProcAclImdbInsertPredefinedMacro()
{
    using (var db = new FastTextProcessDB(DBF_W2V_EN))
    {
        var emptyVect = Dict.CreateEmpty();
        var addins = db.Dict(DictDbSet.DictKind.Addin);

        // reuse the stored record's id when the word is already present
        long? existingId = addins.FindIdByWord(emptyVect.Word);
        if (!existingId.HasValue)
        {
            addins.Insert(emptyVect);
        }
        else
        {
            emptyVect.Id = existingId.Value;
        }

        var embedDict = db.EmbedDict();
        long? existingInx = embedDict.FindInxById(emptyVect.Id, DictDbSet.DictKind.Addin);
        if (!existingInx.HasValue)
        {
            var entry = new EmbedDict { Inx = 0, DictAddinsId = emptyVect.Id };
            embedDict.Insert(entry);
            return;
        }

        Assert.True(existingInx.Value == 0, $"'{emptyVect.Word}' dictionary index should be Zero");
    }
}
/// <summary>
/// Fill Empty Add-in Dictionary Vectors using Random Vectors generation.
/// Every word returned by <c>GetWordsWithEmptyVect</c> gets a record built by
/// <c>Dict.CreateRnd</c>; all updates run inside one transaction.
/// </summary>
/// <param name="dbf_w2v_fn">DB word to vector filename</param>
/// <param name="lang">language label passed to <c>Dict.CreateRnd</c></param>
protected void SubProcFillEmptyVectDictRND(string dbf_w2v_fn, FTLangLabel lang)
{
    using (var db = new FastTextProcessDB(dbf_w2v_fn))
    {
        var pending = db.Dict(DictDbSet.DictKind.Addin).GetWordsWithEmptyVect();
        if (!pending.Any())
        {
            // nothing to fill
            return;
        }

        var tx = db.BeginTransaction();
        try
        {
            var addins = db.Dict(DictDbSet.DictKind.Addin);
            // one generator shared by all words
            var generator = new Random();
            foreach (var word in pending)
            {
                var randomRecord = Dict.CreateRnd(generator, word, lang);
                addins.UpdateVectOfWord(randomRecord);
            }
            tx.Commit();
        }
        catch
        {
            tx.Rollback();
            throw;
        }
    }
}
/// <summary>
/// Create word to vector db and fill it from a FastText vectors file.
/// The vectors file must exist and the DB file must not exist yet.
/// </summary>
/// <param name="ft_vec_fn">FastText vectors filename</param>
/// <param name="dbf_w2v_fn">DB word to vector filename</param>
/// <param name="lang">language label forwarded to <c>ProcAppendDb</c></param>
/// <param name="with_insert_or_replace">use insert_or_replace when non unique vocabulary</param>
protected void ProcCreateDb(string ft_vec_fn, string dbf_w2v_fn, FTLangLabel lang, bool with_insert_or_replace = false)
{
    // preconditions: source vectors present, target DB absent
    var vectorsPath = DataArcPath(ft_vec_fn);
    AssertFileExists(vectorsPath, "FastText file of vectors");
    AssertFileNotExists(dbf_w2v_fn, "word2vect DB");

    FastTextProcessDB.CreateDB(dbf_w2v_fn);

    // note: the original file name (not the resolved path) is handed on
    ProcAppendDb(ft_vec_fn, dbf_w2v_fn, lang, with_insert_or_replace);
}
/// <summary>
/// Append records to word to vector db.
/// Reads a FastText vectors file (first line is a two-field header, each
/// following non-empty line is one record) and inserts the parsed records
/// into the main dictionary inside a single transaction. The transaction is
/// rolled back when any step fails.
/// </summary>
/// <param name="ft_vec_fn">FastText vectors filename</param>
/// <param name="dbf_w2v_fn">DB word to vector filename</param>
/// <param name="lang">language label passed to <c>Dict.CreateParseFT</c> for every record</param>
/// <param name="with_insert_or_replace">use insert_or_replace when non unique vocabulary</param>
/// <param name="fn_infilter_predicat">optional filter predicate; when given, only records it accepts are stored</param>
protected void ProcAppendDb(string ft_vec_fn, string dbf_w2v_fn, LangLabel lang
    , bool with_insert_or_replace = false
    , Func<Dict, bool> fn_infilter_predicat = null)
{
    var fvec = FastTextPath(ft_vec_fn);
    AssertFileExists(fvec, "FastText file of vectors");
    AssertFileExists(dbf_w2v_fn, "word2vect DB");
    using (var dbx = new FastTextProcessDB(dbf_w2v_fn, foreign_keys: false))
    {
        var w2v_tbl = dbx.Dict(DictDbSet.DictKind.Main);
        var trans = dbx.BeginTransaction();
        try
        {
            // the words index is switched off for the bulk insert and re-enabled afterwards
            w2v_tbl.ControlWordsIndex(is_enabled: false);
            using (var sr = new StreamReader(fvec))
            {
                // header
                var line = sr.ReadLine();
                // ReadLine returns null for an empty file; fail with a clear message
                Assert.True(line != null, $"'{fvec}': header line is missing");
                var harr = line.Split(' ');
                Assert.Equal(2, harr.Length);
                Log($"'{fvec}': {harr[0]} - samples count, {harr[1]} - sample dim.");

                // data
                while (!sr.EndOfStream)
                {
                    line = sr.ReadLine();
                    if (string.IsNullOrEmpty(line))
                    {
                        continue;
                    }
                    var w2v = Dict.CreateParseFT(line, lang);
                    if (fn_infilter_predicat != null && !fn_infilter_predicat(w2v))
                    {
                        // rejected by the caller's filter
                        continue;
                    }
                    if (with_insert_or_replace)
                    {
                        w2v_tbl.InsertOrReplace(w2v);
                    }
                    else
                    {
                        w2v_tbl.Insert(w2v);
                    }
                }
            }
            Log("ControlWordsIndex create...");
            w2v_tbl.ControlWordsIndex(is_enabled: true);
            Log("Done");
            trans.Commit();
        }
        catch
        {
            // explicit rollback on any failure
            trans.Rollback();
            throw;
        }
    }
}
/// <summary>
/// Looks a word up in the DB file <c>_db_file</c>: first in the main
/// dictionary, then — only when that yields null — in the add-in dictionary.
/// </summary>
/// <param name="word">word to search for</param>
/// <param name="lang">language label of the word</param>
/// <returns>result of the main lookup, or of the add-in lookup when the main one is null</returns>
Dict DbFindByWord(string word, LangLabel lang)
{
    using (var db = new FastTextProcessDB(_db_file))
    {
        var found = db.Dict(DictDbSet.DictKind.Main).FindByWord(word, lang);
        if (found == null)
        {
            // fall back to the add-in dictionary
            return db.Dict(DictDbSet.DictKind.Addin).FindByWord(word, lang);
        }
        return found;
    }
}
/// <summary>
/// Cleanup for the AclImdb run: deletes the <c>DBF_AclImdb</c> file when it
/// exists, then clears the embed dictionary and the add-in dictionary of
/// <c>DBF_W2V_EN</c>.
/// </summary>
public void ProcAclImdbResultClean()
{
    var resultDbPresent = File.Exists(DBF_AclImdb);
    if (resultDbPresent)
    {
        File.Delete(DBF_AclImdb);
        Log($"'{DBF_AclImdb}' deleted");
    }

    AssertFileExists(DBF_W2V_EN, "word2vect En-Common DB");
    using (var db = new FastTextProcessDB(DBF_W2V_EN))
    {
        // embed dictionary is cleared before the add-in dictionary
        var embedDict = db.EmbedDict();
        embedDict.DeleteAll();

        var addins = db.Dict(DictDbSet.DictKind.Addin);
        addins.DeleteAll();
    }
}
/// <summary>
/// Cleanup result DB: deletes the result DB file when it exists, then clears
/// the embed dictionary and the add-in dictionary of the word-to-vector DB.
/// </summary>
/// <param name="proc_db_fn">Result Db filename</param>
/// <param name="dbf_w2v_fn">DB word to vector filename</param>
protected void ProcResultClean(string proc_db_fn, string dbf_w2v_fn)
{
    var resultDbPresent = File.Exists(proc_db_fn);
    if (resultDbPresent)
    {
        File.Delete(proc_db_fn);
        Log($"'{proc_db_fn}' deleted");
    }

    AssertFileExists(dbf_w2v_fn, "word2vect DB");
    using (var db = new FastTextProcessDB(dbf_w2v_fn))
    {
        // embed dictionary is cleared before the add-in dictionary
        var embedDict = db.EmbedDict();
        embedDict.DeleteAll();

        var addins = db.Dict(DictDbSet.DictKind.Addin);
        addins.DeleteAll();
    }
}
/// <summary>
/// Creates "w2v_test.db", inserts a record with word "test", then inserts the
/// same object again with word "Test" and asserts that the second insert
/// produced a larger id than the first.
/// </summary>
public void TestCreateDictInsert()
{
    var dbf = "w2v_test.db";
    FastTextProcessDB.CreateDB(dbf);
    var w2v = new Dict { Word = "test", Vect = new byte[] { 1, 2, 3 } };
    using (var dbx = new FastTextProcessDB(dbf))
    {
        var id1 = InsertDict(dbx, w2v, DictDbSet.DictKind.Main);
        Assert.True(id1 > 0);

        // second insert differs from the first only in letter case of the word
        w2v.Word = "Test";
        InsertDict(dbx, w2v, DictDbSet.DictKind.Main);
        Assert.True(w2v.Id > id1);
    }
    Log("done");
}
/// <summary>
/// Exercises embed-dictionary inserts against "w2v_test.db" in three steps:
/// an entry referencing a freshly inserted dictionary record, an entry with
/// <c>DictId = 999</c> on a connection opened with <c>foreign_keys: false</c>,
/// and an entry with <c>DictId = 998</c> on a default connection, which is
/// expected to raise <c>SQLiteException</c>.
/// </summary>
public void TestCreateEmbedInsert()
{
    var dbFile = "w2v_test.db";
    FastTextProcessDB.CreateDB(dbFile);

    // step 1: entry pointing at an existing dictionary record
    using (var db = new FastTextProcessDB(dbFile))
    {
        var record = new Dict { Word = "test", Vect = new byte[] { 1, 2, 3 } };
        var recordId = InsertDict(db, record, DictDbSet.DictKind.Main);
        db.EmbedDict().Insert(new EmbedDict { Inx = 0, DictId = recordId, Freq = 1 });
    }

    // step 2: with foreign_keys switched off the insert must not throw
    using (var db = new FastTextProcessDB(dbFile, foreign_keys: false))
    {
        db.EmbedDict().Insert(new EmbedDict { Inx = 1, DictId = 999, Freq = 1 });
    }

    // step 3: default connection, insert is expected to fail
    // NOTE(review): this entry reuses Inx = 1 from step 2; if Inx is unique the
    // failure may come from that rather than from the foreign key — confirm.
    try
    {
        using (var db = new FastTextProcessDB(dbFile))
        {
            db.EmbedDict().Insert(new EmbedDict { Inx = 1, DictId = 998, Freq = 1 });
        }
        // reaching this line means no SQLiteException was raised
        throw new InvalidOperationException("FK check failed");
    }
    catch (SQLiteException ex)
    {
        Log("FK checked:" + ex.Message);
    }
    Log("done");
}
/// <summary>
/// Scans all main-dictionary records of one language and logs the record
/// count together with the smallest and largest component found across the
/// float vectors returned by <c>Dict.GetVectFloat</c>.
/// </summary>
/// <param name="w2v_db_fn">DB word to vector filename</param>
/// <param name="lang">language label of the records to scan</param>
void CalcMinMax(string w2v_db_fn, Enums.FTLangLabel lang)
{
    using (var db = new FastTextProcessDB(w2v_db_fn))
    {
        var records = db.Dict(DictDbSet.DictKind.Main).GetAll(lang);

        long count = 0;
        // start from the opposite extremes so the first component replaces both
        var lowest = float.MaxValue;
        var highest = float.MinValue;

        foreach (var record in records)
        {
            var components = Dict.GetVectFloat(record.Vect);
            foreach (var component in components)
            {
                lowest = Math.Min(lowest, component);
                highest = Math.Max(highest, component);
            }
            cnt_increment:
            count++;
        }

        Log($"Lang={lang}: Count={count}; Min={lowest}; Max={highest};");
    }
}
/// <summary>
/// Creates the <c>DBF_W2V_EN</c> word-to-vector DB and fills its main
/// dictionary from the FastText vectors file "cc.en.300.vec" (first line is a
/// two-field header, each following non-empty line is one record). All
/// inserts run in a single transaction, which is rolled back when any step fails.
/// </summary>
public void ProcCreateDbEn()
{
    var fvec = DataArcPath("cc.en.300.vec");
    AssertFileExists(fvec, "FastText file of vectors");
    AssertFileNotExists(DBF_W2V_EN, "word2vect En-Common DB");
    FastTextProcessDB.CreateDB(DBF_W2V_EN);
    using (var dbx = new FastTextProcessDB(DBF_W2V_EN, foreign_keys: false))
    {
        var w2v_tbl = dbx.Dict(DictDbSet.DictKind.Main);
        var trans = dbx.BeginTransaction();
        try
        {
            // the words index is switched off for the bulk insert and re-enabled afterwards
            w2v_tbl.ControlWordsIndex(is_enabled: false);
            using (var sr = new StreamReader(fvec))
            {
                // header
                var line = sr.ReadLine();
                // ReadLine returns null for an empty file; fail with a clear message
                Assert.True(line != null, $"'{fvec}': header line is missing");
                var harr = line.Split(' ');
                Assert.Equal(2, harr.Length);
                Log($"'{fvec}': {harr[0]} - samples count, {harr[1]} - sample dim.");

                // data
                while (!sr.EndOfStream)
                {
                    line = sr.ReadLine();
                    if (string.IsNullOrEmpty(line))
                    {
                        continue;
                    }
                    var w2v = Dict.Create(line);
                    w2v_tbl.Insert(w2v);
                }
            }
            Log("ControlWordsIndex create...");
            w2v_tbl.ControlWordsIndex(is_enabled: true);
            Log("Done");
            trans.Commit();
        }
        catch
        {
            // explicit rollback on any failure
            trans.Rollback();
            throw;
        }
    }
}
/// <summary>
/// Fill Empty Add-in Dictionary Vectors.
/// Every word returned by <c>GetWordsWithEmptyVect</c> is pushed to the command
/// created by <c>FTCmd.CreateW2V</c>; each result line it reports is parsed with
/// <c>Dict.CreateParseFT</c> and stored via <c>UpdateVectOfWord</c>. All updates
/// run inside one transaction.
/// </summary>
/// <param name="ft_bin_fn">FastText bin model filename</param>
/// <param name="dbf_w2v_fn">DB word to vector filename</param>
/// <param name="lang">language label passed to <c>Dict.CreateParseFT</c></param>
protected void SubProcFillEmptyVectDict(string ft_bin_fn, string dbf_w2v_fn, FTLangLabel lang)
{
    using (var db = new FastTextProcessDB(dbf_w2v_fn))
    {
        var pending = db.Dict(DictDbSet.DictKind.Addin).GetWordsWithEmptyVect();
        if (!pending.Any())
        {
            // nothing to fill
            return;
        }

        var modelPath = DataArcPath(ft_bin_fn);
        AssertFileExists(modelPath, "FastText model file");
        var exePath = FastTextBin;
        AssertFileExists(exePath, "FastText executable");

        var tx = db.BeginTransaction();
        try
        {
            var addins = db.Dict(DictDbSet.DictKind.Addin);
            using (var cmd = FTCmd.CreateW2V(exePath, modelPath))
            {
                // only the result text is used; the source text argument is ignored
                cmd.RunByLineAsync(
                    (srcText, resText) => addins.UpdateVectOfWord(Dict.CreateParseFT(resText, lang)));
                foreach (var word in pending)
                {
                    cmd.Push(word);
                }
            }
            // commit only after the command has been disposed
            tx.Commit();
        }
        catch
        {
            tx.Rollback();
            throw;
        }
    }
}
/// <summary>
/// Creates the processor and opens its process DB.
/// </summary>
/// <param name="dbf_w2v">DB word to vector filename</param>
public WordToDictProcessor(string dbf_w2v)
{
    // opened without passing foreign_keys, i.e. with the DB class's default
    ProcessDB = new FastTextProcessDB(dbf_w2v);
}
/// <summary>
/// Create word to vector db. The DB file must not exist yet; after creation
/// <c>SubProcInsertPredefinedMacro</c> is run on the new DB.
/// </summary>
/// <param name="dbf_w2v_fn">DB word to vector filename</param>
protected void ProcCreateDb(string dbf_w2v_fn)
{
    // refuse to overwrite an existing DB
    AssertFileNotExists(dbf_w2v_fn, "word2vect DB");

    FastTextProcessDB.CreateDB(dbf_w2v_fn);

    SubProcInsertPredefinedMacro(dbf_w2v_fn);
}