コード例 #1
0
 /// <summary>
 /// Takes the add-in dictionary words returned by GetWordsWithEmptyVect,
 /// pushes each of them to a FastTextLauncher and stores every result the
 /// launcher reports through UpdateVectOfWord, all inside one transaction.
 /// Does nothing when there are no such words.
 /// </summary>
 public void SubProcFillEmptyVectDictEn()
 {
     using (var db = new FastTextProcessDB(DBF_W2V_EN))
     {
         var pendingWords = db.Dict(DictDbSet.DictKind.Addin).GetWordsWithEmptyVect();
         if (!pendingWords.Any())
         {
             return;
         }

         var modelPath = DataArcPath("cc.en.300.bin");
         AssertFileExists(modelPath, "FastText model file");
         var exePath = FastTextBin;
         AssertFileExists(exePath, "FastText executable");

         var tx = db.BeginTransaction();
         try
         {
             var addinDict = db.Dict(DictDbSet.DictKind.Addin);
             using (var launcher = new FastTextLauncher(exePath, modelPath))
             {
                 // The callback is registered before any word is pushed.
                 launcher.RunAsync(vec => addinDict.UpdateVectOfWord(vec));
                 foreach (var word in pendingWords)
                 {
                     launcher.Push(word);
                 }
             }
             // Commit only after the launcher has been disposed.
             tx.Commit();
         }
         catch
         {
             tx.Rollback();
             throw;
         }
     }
 }
コード例 #2
0
        /// <summary>
        /// Looks up "sum" (en) and "сумма" (ru) in the main dictionary, prints
        /// the pair, then runs PrintPair for every Russian and every English
        /// entry against those two reference entries in parallel.
        /// </summary>
        public void TestCosineRUK2Sum()
        {
            AssertFileExists(DBF_W2V_RUK, "Ru-Uk w2v DB");

            using (var db = new FastTextProcessDB(DBF_W2V_RUK))
            {
                var mainDict = db.Dict(DictDbSet.DictKind.Main);
                var sumEn    = mainDict.FindByWord("sum", LangLabel.en);
                var sumRu    = mainDict.FindByWord("сумма", LangLabel.ru);
                PrintPair(sumEn, sumRu);

                Log("RU dict");
                var ruEntries = mainDict.GetAll(LangLabel.ru);
                Parallel.ForEach(ruEntries, entry =>
                {
                    PrintPair(entry, sumEn, distance_min: 0.3f);
                    PrintPair(entry, sumRu, distance_min: 0.6f);
                });

                Log("EN dict");
                var enEntries = mainDict.GetAll(LangLabel.en);
                Parallel.ForEach(enEntries, entry =>
                {
                    PrintPair(entry, sumEn, distance_min: 0.6f);
                });
            }
            Log("done");
        }
コード例 #3
0
        /// <summary>
        /// Fetches the "consumption" (en) entry through VectorsService and runs
        /// PrintPair for it against every English, Russian and Ukrainian entry
        /// of the main dictionary, each language in its own parallel loop.
        /// </summary>
        public void TestV2W1consumption()
        {
            AssertFileExists(DBF_W2V_RUK, "En-Ru-Uk w2v DB");

            var service   = VectorsService.Instance(DBF_W2V_RUK);
            var reference = service.FindByWord("consumption", LangLabel.en);

            using (var db = new FastTextProcessDB(DBF_W2V_RUK))
            {
                var mainDict = db.Dict(DictDbSet.DictKind.Main);

                Log("En min=0.7");
                var enEntries = mainDict.GetAll(LangLabel.en);
                Parallel.ForEach(enEntries, entry =>
                {
                    PrintPair(reference, entry, distance_min: 0.7f);
                });

                Log("Ru min=0.35");
                var ruEntries = mainDict.GetAll(LangLabel.ru);
                Parallel.ForEach(ruEntries, entry =>
                {
                    PrintPair(reference, entry, distance_min: 0.35f);
                });

                Log("Uk min=0.35");
                var ukEntries = mainDict.GetAll(LangLabel.uk);
                Parallel.ForEach(ukEntries, entry =>
                {
                    PrintPair(reference, entry, distance_min: 0.35f);
                });
            }
            Log("done");
        }
コード例 #4
0
 /// <summary>
 /// Copies the items delivered by ProcessEmbedJoins from the En w2v DB into
 /// the AclImdb result DB, starting right after the result DB's current
 /// maximum dictionary index, and asserts that the indexes arrive consecutively.
 /// The whole copy runs in one transaction on the result DB.
 /// </summary>
 public void SubProcAclImdbResultDictEn()
 {
     using (var source = new FastTextProcessDB(DBF_W2V_EN))
     using (var target = new FastTextResultDB(DBF_AclImdb))
     {
         var tx = target.BeginTransaction();
         try
         {
             var  lastInx     = target.GetDictInxMax();
             long expectedInx = 0;
             if (lastInx.HasValue)
             {
                 expectedInx = lastInx.Value + 1;
             }

             source.ProcessEmbedJoins(item =>
             {
                 // Each delivered item must carry exactly the next index.
                 Assert.Equal(expectedInx, item.Inx);
                 target.StoreDictItem(item);
                 expectedInx += 1;
             }, from_inx: expectedInx);

             tx.Commit();
         }
         catch
         {
             tx.Rollback();
             throw;
         }
     }
 }
コード例 #5
0
ファイル: DictDbTests.cs プロジェクト: sgf/FastTextProcess
        /// <summary>
        /// Inserts <paramref name="w2v"/> into the dictionary table selected by
        /// <paramref name="kind"/> and returns the value of its Id after the insert.
        /// </summary>
        /// <param name="dbx">DB context that provides the dictionary table</param>
        /// <param name="w2v">record to insert</param>
        /// <param name="kind">which dictionary table to insert into</param>
        /// <returns><c>w2v.Id</c> as it is once Insert has returned</returns>
        long InsertDict(FastTextProcessDB dbx, Dict w2v, DictDbSet.DictKind kind)
        {
            // Honor the requested kind instead of hard-coding DictKind.Main.
            var w2v_tbl = dbx.Dict(kind);

            w2v_tbl.Insert(w2v);
            return w2v.Id;
        }
コード例 #6
0
        /// <summary>
        /// Looks up "socks" and "necktie" in Ukrainian, Russian and English in
        /// the main dictionary of "w2v_ruk.db" and logs GetCosine for selected
        /// pairs: same word across languages and different words within a language.
        /// </summary>
        public void TestCosineRUK()
        {
            var dbFile = "w2v_ruk.db";

            AssertFileExists(dbFile, "Ru-Uk w2v DB");

            using (var db = new FastTextProcessDB(dbFile))
            {
                var mainDict  = db.Dict(DictDbSet.DictKind.Main);
                var socksUk   = mainDict.FindByWord("шкарпетки", Enums.FTLangLabel.__label__uk);
                var socksRu   = mainDict.FindByWord("носки", Enums.FTLangLabel.__label__ru);
                var socksEn   = mainDict.FindByWord("socks", Enums.FTLangLabel.__label__en);
                var necktieUk = mainDict.FindByWord("краватка", Enums.FTLangLabel.__label__uk);
                var necktieRu = mainDict.FindByWord("галстук", Enums.FTLangLabel.__label__ru);
                var necktieEn = mainDict.FindByWord("necktie", Enums.FTLangLabel.__label__en);

                // Same word, Ukrainian vs Russian.
                Log($"cos({socksUk.Word}, {socksRu.Word}) = {socksUk.GetCosine(socksRu)}");
                Log($"cos({necktieUk.Word}, {necktieRu.Word}) = {necktieUk.GetCosine(necktieRu)}");

                // Same word, Ukrainian/Russian vs English.
                Log($"cos({socksUk.Word}, {socksEn.Word}) = {socksUk.GetCosine(socksEn)}");
                Log($"cos({socksRu.Word}, {socksEn.Word}) = {socksRu.GetCosine(socksEn)}");

                // Different words within one language.
                Log($"cos({socksUk.Word}, {necktieUk.Word}) = {socksUk.GetCosine(necktieUk)}");
                Log($"cos({socksRu.Word}, {necktieRu.Word}) = {socksRu.GetCosine(necktieRu)}");
                Log($"cos({socksEn.Word}, {necktieEn.Word}) = {socksEn.GetCosine(necktieEn)}");
            }
            Log("done");
        }
コード例 #7
0
 /// <summary>
 /// Builds the result DB dictionary: stores every item delivered by
 /// ProcessEmbedJoins of the w2v DB into the result DB, continuing after the
 /// result DB's current maximum dictionary index and asserting that the
 /// indexes arrive consecutively. Runs in one transaction on the result DB.
 /// </summary>
 /// <param name="proc_db_fn">Result Db filename</param>
 /// <param name="dbf_w2v_fn">DB word to vector filename</param>
 protected void SubProcBuildResultDict(string proc_db_fn, string dbf_w2v_fn)
 {
     using (var source = new FastTextProcessDB(dbf_w2v_fn))
     using (var target = new FastTextResultDB(proc_db_fn))
     {
         var tx = target.BeginTransaction();
         try
         {
             var  lastInx     = target.GetDictInxMax();
             long expectedInx = 0;
             if (lastInx.HasValue)
             {
                 expectedInx = lastInx.Value + 1;
             }

             source.ProcessEmbedJoins(item =>
             {
                 // Each delivered item must carry exactly the next index.
                 Assert.Equal(expectedInx, item.Inx);
                 target.StoreDictItem(item);
                 expectedInx += 1;
             }, from_inx: expectedInx);

             tx.Commit();
         }
         catch
         {
             tx.Rollback();
             throw;
         }
     }
 }
コード例 #8
0
 /// <summary>
 /// Makes sure the entry produced by Dict.CreateEmpty exists in the add-in
 /// dictionary and is registered in the embed dictionary with index 0.
 /// An existing embed record with a different index fails the assertion.
 /// </summary>
 public void SubProcAclImdbInsertPredefinedMacro()
 {
     using (var db = new FastTextProcessDB(DBF_W2V_EN))
     {
         var emptyEntry = Dict.CreateEmpty();
         var addinDict  = db.Dict(DictDbSet.DictKind.Addin);

         // Reuse the stored Id when the word is already present, otherwise insert it.
         long? existingId = addinDict.FindIdByWord(emptyEntry.Word);
         if (!existingId.HasValue)
         {
             addinDict.Insert(emptyEntry);
         }
         else
         {
             emptyEntry.Id = existingId.Value;
         }

         var   embedTable  = db.EmbedDict();
         long? existingInx = embedTable.FindInxById(emptyEntry.Id, DictDbSet.DictKind.Addin);
         if (!existingInx.HasValue)
         {
             var record = new EmbedDict {
                 Inx = 0, DictAddinsId = emptyEntry.Id
             };
             embedTable.Insert(record);
         }
         else
         {
             Assert.True(existingInx.Value == 0,
                         $"'{emptyEntry.Word}' dictionary index should be Zero");
         }
     }
 }
コード例 #9
0
 /// <summary>
 /// Fill Empty Add-in Dictionary Vectors using Random Vectors generation.
 /// Every word returned by GetWordsWithEmptyVect gets a record built by
 /// Dict.CreateRnd, written through UpdateVectOfWord inside one transaction.
 /// </summary>
 /// <param name="dbf_w2v_fn">DB word to vector filename</param>
 /// <param name="lang">language label passed to Dict.CreateRnd</param>
 protected void SubProcFillEmptyVectDictRND(
     string dbf_w2v_fn, FTLangLabel lang)
 {
     using (var db = new FastTextProcessDB(dbf_w2v_fn))
     {
         var pendingWords = db.Dict(DictDbSet.DictKind.Addin).GetWordsWithEmptyVect();
         if (!pendingWords.Any())
         {
             return;
         }

         var tx = db.BeginTransaction();
         try
         {
             var addinDict = db.Dict(DictDbSet.DictKind.Addin);
             // One generator instance is shared by all words of this run.
             var generator = new Random();
             foreach (var word in pendingWords)
             {
                 var randomEntry = Dict.CreateRnd(generator, word, lang);
                 addinDict.UpdateVectOfWord(randomEntry);
             }
             tx.Commit();
         }
         catch
         {
             tx.Rollback();
             throw;
         }
     }
 }
コード例 #10
0
        /// <summary>
        /// Create word to vector db: verifies the vectors file is present and the
        /// DB file is not, creates the DB and delegates the load to ProcAppendDb.
        /// </summary>
        /// <param name="ft_vec_fn">FastText vectors filename</param>
        /// <param name="dbf_w2v_fn">DB word to vector filename</param>
        /// <param name="lang">language label forwarded to ProcAppendDb</param>
        /// <param name="with_insert_or_replace">use insert_or_replace when non unique vocabulary</param>
        protected void ProcCreateDb(string ft_vec_fn, string dbf_w2v_fn, FTLangLabel lang, bool with_insert_or_replace = false)
        {
            // Preconditions: source vectors available, target DB not created yet.
            var vectorsPath = DataArcPath(ft_vec_fn);
            AssertFileExists(vectorsPath, "FastText file of vectors");
            AssertFileNotExists(dbf_w2v_fn, "word2vect DB");

            FastTextProcessDB.CreateDB(dbf_w2v_fn);
            ProcAppendDb(ft_vec_fn, dbf_w2v_fn, lang, with_insert_or_replace);
        }
コード例 #11
0
        /// <summary>
        /// Append records to word to vector db.
        /// Reads the FastText vectors text file: the first line is a header that
        /// must split into two space-separated fields, every following non-empty
        /// line is parsed with Dict.CreateParseFT and written to the Main
        /// dictionary table. All writes run in one transaction, which is rolled
        /// back when anything throws.
        /// </summary>
        /// <param name="ft_vec_fn">FastText vectors filename</param>
        /// <param name="dbf_w2v_fn">DB word to vector filename</param>
        /// <param name="lang">language label passed to Dict.CreateParseFT for every record</param>
        /// <param name="with_insert_or_replace">use insert_or_replace when non unique vocabulary</param>
        /// <param name="fn_infilter_predicat">optional filter predicate; records for which it returns false are skipped</param>
        protected void ProcAppendDb(string ft_vec_fn, string dbf_w2v_fn, LangLabel lang
                                    , bool with_insert_or_replace            = false
                                    , Func <Dict, bool> fn_infilter_predicat = null)
        {
            var fvec = FastTextPath(ft_vec_fn);

            AssertFileExists(fvec, "FastText file of vectors");

            AssertFileExists(dbf_w2v_fn, "word2vect DB");

            using (var dbx = new FastTextProcessDB(dbf_w2v_fn, foreign_keys: false))
            {
                var w2v_tbl = dbx.Dict(DictDbSet.DictKind.Main);
                var trans   = dbx.BeginTransaction();
                try
                {
                    // NOTE(review): presumably turns off the words index for the bulk load — confirm.
                    w2v_tbl.ControlWordsIndex(is_enabled: false);
                    using (var sr = new StreamReader(fvec))
                    {
                        // header; an empty file yields null, which fails the assertion
                        // below instead of raising a NullReferenceException
                        var line = sr.ReadLine();
                        var harr = (line ?? string.Empty).Split(' ');
                        Assert.Equal(2, harr.Length);
                        Log($"'{fvec}': {harr[0]} - samples count, {harr[1]} - sample dim.");
                        // data
                        while ((line = sr.ReadLine()) != null)
                        {
                            if (string.IsNullOrEmpty(line))
                            {
                                continue;
                            }
                            var w2v = Dict.CreateParseFT(line, lang);
                            if (fn_infilter_predicat != null && !fn_infilter_predicat(w2v))
                            {
                                continue;
                            }
                            if (with_insert_or_replace)
                            {
                                w2v_tbl.InsertOrReplace(w2v);
                            }
                            else
                            {
                                w2v_tbl.Insert(w2v);
                            }
                        }
                    }
                    Log("ControlWordsIndex create...");
                    w2v_tbl.ControlWordsIndex(is_enabled: true);
                    Log("Done");
                    trans.Commit();
                }
                catch
                {
                    // Same rollback-and-rethrow handling as the other transactional procedures.
                    trans.Rollback();
                    throw;
                }
            }
        }
コード例 #12
0
 /// <summary>
 /// Looks the word up in the Main dictionary first and, when that yields
 /// null, in the Addin dictionary.
 /// </summary>
 /// <param name="word">word to search for</param>
 /// <param name="lang">language label passed to FindByWord</param>
 /// <returns>the Main result if not null, otherwise whatever the Addin lookup returns</returns>
 Dict DbFindByWord(string word, LangLabel lang)
 {
     using (var db = new FastTextProcessDB(_db_file))
     {
         var found = db.Dict(DictDbSet.DictKind.Main).FindByWord(word, lang);
         if (found != null)
         {
             return found;
         }
         return db.Dict(DictDbSet.DictKind.Addin).FindByWord(word, lang);
     }
 }
コード例 #13
0
 /// <summary>
 /// Deletes the AclImdb result DB file when it exists, then calls DeleteAll
 /// on the embed dictionary and on the add-in dictionary of the En w2v DB.
 /// </summary>
 public void ProcAclImdbResultClean()
 {
     var resultDbPresent = File.Exists(DBF_AclImdb);
     if (resultDbPresent)
     {
         File.Delete(DBF_AclImdb);
         Log($"'{DBF_AclImdb}' deleted");
     }

     AssertFileExists(DBF_W2V_EN, "word2vect En-Common DB");
     using (var db = new FastTextProcessDB(DBF_W2V_EN))
     {
         // The embed dictionary is cleared before the add-in dictionary.
         var embedTable = db.EmbedDict();
         embedTable.DeleteAll();
         var addinTable = db.Dict(DictDbSet.DictKind.Addin);
         addinTable.DeleteAll();
     }
 }
コード例 #14
0
 /// <summary>
 /// Cleanup result DB: deletes the result DB file when it exists, then calls
 /// DeleteAll on the embed dictionary and on the add-in dictionary of the w2v DB.
 /// </summary>
 /// <param name="proc_db_fn">Result Db filename</param>
 /// <param name="dbf_w2v_fn">DB word to vector filename</param>
 protected void ProcResultClean(string proc_db_fn, string dbf_w2v_fn)
 {
     var resultDbPresent = File.Exists(proc_db_fn);
     if (resultDbPresent)
     {
         File.Delete(proc_db_fn);
         Log($"'{proc_db_fn}' deleted");
     }

     AssertFileExists(dbf_w2v_fn, "word2vect DB");
     using (var db = new FastTextProcessDB(dbf_w2v_fn))
     {
         // The embed dictionary is cleared before the add-in dictionary.
         var embedTable = db.EmbedDict();
         embedTable.DeleteAll();
         var addinTable = db.Dict(DictDbSet.DictKind.Addin);
         addinTable.DeleteAll();
     }
 }
コード例 #15
0
ファイル: DictDbTests.cs プロジェクト: sgf/FastTextProcess
        /// <summary>
        /// Creates "w2v_test.db", inserts one record with the word "test", then
        /// inserts the same object again with the word "Test" and checks that the
        /// Id after the second insert is greater than the first one.
        /// </summary>
        public void TestCreateDictInsert()
        {
            var dbf = "w2v_test.db";

            FastTextProcessDB.CreateDB(dbf);
            var w2v = new Dict {
                Word = "test", Vect = new byte[] { 1, 2, 3 }
            };

            using (var dbx = new FastTextProcessDB(dbf))
            {
                var id1 = InsertDict(dbx, w2v, DictDbSet.DictKind.Main);
                Assert.True(id1 > 0);

                // Second insert reuses the object with a differently-cased word.
                w2v.Word = "Test";
                InsertDict(dbx, w2v, DictDbSet.DictKind.Main);
                Assert.True(w2v.Id > id1);
            }
            Log("done");
        }
コード例 #16
0
ファイル: DictDbTests.cs プロジェクト: sgf/FastTextProcess
        /// <summary>
        /// Exercises EmbedDict inserts against "w2v_test.db" in three steps:
        /// a record referencing a freshly inserted Dict row, a record with
        /// DictId 999 on a connection opened with foreign_keys: false, and a
        /// record with DictId 998 on a default connection, which is expected
        /// to raise a SQLiteException.
        /// </summary>
        public void TestCreateEmbedInsert()
        {
            var dbFile = "w2v_test.db";

            FastTextProcessDB.CreateDB(dbFile);

            // Step 1: embed record that points at an existing dictionary row.
            using (var db = new FastTextProcessDB(dbFile))
            {
                var entry = new Dict {
                    Word = "test", Vect = new byte[] { 1, 2, 3 }
                };
                var entryId = InsertDict(db, entry, DictDbSet.DictKind.Main);
                var linked  = new EmbedDict {
                    Inx = 0, DictId = entryId, Freq = 1
                };
                db.EmbedDict().Insert(linked);
            }

            // Step 2: with foreign_keys: false, DictId 999 is inserted without a check here.
            using (var db = new FastTextProcessDB(dbFile, foreign_keys: false))
            {
                var dangling = new EmbedDict {
                    Inx = 1, DictId = 999, Freq = 1
                };
                db.EmbedDict().Insert(dangling);
            }

            // Step 3: on a default connection the insert must throw; reaching
            // the line after the using block means it did not.
            try
            {
                using (var db = new FastTextProcessDB(dbFile))
                {
                    var rejected = new EmbedDict {
                        Inx = 1, DictId = 998, Freq = 1
                    };
                    db.EmbedDict().Insert(rejected);
                }
                throw new InvalidOperationException("FK check failed");
            }
            catch (SQLiteException ex)
            {
                Log("FK checked:" + ex.Message);
            }
            Log("done");
        }
コード例 #17
0
 /// <summary>
 /// Scans every Main dictionary entry of the given language and logs the
 /// number of entries together with the smallest and largest value returned
 /// by Dict.GetVectFloat across all of them.
 /// </summary>
 /// <param name="w2v_db_fn">DB word to vector filename</param>
 /// <param name="lang">language label passed to GetAll</param>
 void CalcMinMax(string w2v_db_fn, Enums.FTLangLabel lang)
 {
     using (var db = new FastTextProcessDB(w2v_db_fn))
     {
         var entries = db.Dict(DictDbSet.DictKind.Main).GetAll(lang);

         // Start from the opposite extremes so the first value replaces both.
         float lowest     = float.MaxValue;
         float highest    = float.MinValue;
         long  entryCount = 0;

         foreach (var entry in entries)
         {
             var components = Dict.GetVectFloat(entry.Vect);
             foreach (var component in components)
             {
                 lowest  = Math.Min(lowest, component);
                 highest = Math.Max(highest, component);
             }
             entryCount += 1;
         }

         Log($"Lang={lang}: Count={entryCount}; Min={lowest}; Max={highest};");
     }
 }
コード例 #18
0
        /// <summary>
        /// Creates the En word to vector DB and loads it from "cc.en.300.vec".
        /// The first line of the file is a header that must split into two
        /// space-separated fields; every following non-empty line is turned into
        /// a record by Dict.Create and inserted into the Main dictionary table.
        /// All inserts run in one transaction, which is rolled back when anything throws.
        /// </summary>
        public void ProcCreateDbEn()
        {
            var fvec = DataArcPath("cc.en.300.vec");

            AssertFileExists(fvec, "FastText file of vectors");

            AssertFileNotExists(DBF_W2V_EN, "word2vect En-Common DB");
            FastTextProcessDB.CreateDB(DBF_W2V_EN);

            using (var dbx = new FastTextProcessDB(DBF_W2V_EN, foreign_keys: false))
            {
                var w2v_tbl = dbx.Dict(DictDbSet.DictKind.Main);
                var trans   = dbx.BeginTransaction();
                try
                {
                    // NOTE(review): presumably turns off the words index for the bulk load — confirm.
                    w2v_tbl.ControlWordsIndex(is_enabled: false);
                    using (var sr = new StreamReader(fvec))
                    {
                        // header; an empty file yields null, which fails the assertion
                        // below instead of raising a NullReferenceException
                        var line = sr.ReadLine();
                        var harr = (line ?? string.Empty).Split(' ');
                        Assert.Equal(2, harr.Length);
                        Log($"'{fvec}': {harr[0]} - samples count, {harr[1]} - sample dim.");
                        // data
                        while ((line = sr.ReadLine()) != null)
                        {
                            if (string.IsNullOrEmpty(line))
                            {
                                continue;
                            }
                            var w2v = Dict.Create(line);
                            w2v_tbl.Insert(w2v);
                        }
                    }
                    Log("ControlWordsIndex create...");
                    w2v_tbl.ControlWordsIndex(is_enabled: true);
                    Log("Done");
                    trans.Commit();
                }
                catch
                {
                    // Same rollback-and-rethrow handling as the other transactional procedures.
                    trans.Rollback();
                    throw;
                }
            }
        }
コード例 #19
0
 /// <summary>
 /// Fill Empty Add-in Dictionary Vectors.
 /// Pushes every word returned by GetWordsWithEmptyVect to the command
 /// created by FTCmd.CreateW2V; each result line is parsed with
 /// Dict.CreateParseFT and stored through UpdateVectOfWord, all inside one
 /// transaction. Does nothing when there are no such words.
 /// </summary>
 /// <param name="ft_bin_fn">FastText bin model filename</param>
 /// <param name="dbf_w2v_fn">DB word to vector filename</param>
 /// <param name="lang">language label passed to Dict.CreateParseFT</param>
 protected void SubProcFillEmptyVectDict(
     string ft_bin_fn, string dbf_w2v_fn, FTLangLabel lang)
 {
     using (var db = new FastTextProcessDB(dbf_w2v_fn))
     {
         var pendingWords = db.Dict(DictDbSet.DictKind.Addin).GetWordsWithEmptyVect();
         if (!pendingWords.Any())
         {
             return;
         }

         var modelPath = DataArcPath(ft_bin_fn);
         AssertFileExists(modelPath, "FastText model file");
         var exePath = FastTextBin;
         AssertFileExists(exePath, "FastText executable");

         var tx = db.BeginTransaction();
         try
         {
             var addinDict = db.Dict(DictDbSet.DictKind.Addin);
             using (var command = FTCmd.CreateW2V(exePath, modelPath))
             {
                 // The callback is registered before any word is pushed.
                 command.RunByLineAsync(
                     (sourceText, resultText) =>
                     addinDict.UpdateVectOfWord(Dict.CreateParseFT(resultText, lang))
                     );
                 foreach (var word in pendingWords)
                 {
                     command.Push(word);
                 }
             }
             // Commit only after the command has been disposed.
             tx.Commit();
         }
         catch
         {
             tx.Rollback();
             throw;
         }
     }
 }
コード例 #20
0
 /// <summary>
 /// Creates the processor and assigns a new FastTextProcessDB, constructed
 /// from the given filename, to ProcessDB.
 /// </summary>
 /// <param name="dbf_w2v">filename passed to the FastTextProcessDB constructor</param>
 public WordToDictProcessor(string dbf_w2v)
 {
     ProcessDB = new FastTextProcessDB(dbf_w2v);//, foreign_keys:false);
 }
コード例 #21
0
 /// <summary>
 /// Create word to vector db.
 /// Calls AssertFileNotExists on the DB file (presumably failing when it is
 /// already there), creates it with FastTextProcessDB.CreateDB and then runs
 /// SubProcInsertPredefinedMacro on the new file.
 /// </summary>
 /// <param name="dbf_w2v_fn">DB word to vector filename</param>
 protected void ProcCreateDb(string dbf_w2v_fn)
 {
     AssertFileNotExists(dbf_w2v_fn, "word2vect DB");
     FastTextProcessDB.CreateDB(dbf_w2v_fn);
     SubProcInsertPredefinedMacro(dbf_w2v_fn);
 }