Beispiel #1
0
        /// <summary>
        /// Look up a single dictionary row by word.
        /// </summary>
        /// <remarks>
        /// A row matches when its word column equals <paramref name="word"/> and its language column equals
        /// either <paramref name="lang"/> or <see cref="FTLangLabel.NotSpecified"/>. The query has no
        /// ORDER BY, so when several rows match, the first row the reader yields is returned.
        /// </remarks>
        /// <param name="word">Word bound to the query's word parameter.</param>
        /// <param name="lang">Language label; its integer value is bound to the language parameter.</param>
        /// <returns>The first matching row, or <c>null</c> when no row matches.</returns>
        public Dict FindByWord(string word, FTLangLabel lang)
        {
            var sql = string.Format(
                "SELECT {1}, {2}, {3}, {4} FROM {0} WHERE {2} = ${2} AND ({4} = ${4} OR {4} = {5}) ",
                TableName
                , Dict.FldnId, Dict.FldnWord, Dict.FldnVect, Dict.FldnLangId, (int)FTLangLabel.NotSpecified);

            // The command is created per call, so release it when the lookup is finished.
            using (var cmd = Ctx.CreateCmd(sql))
            {
                cmd.Parameters.AddWithValue(Dict.FldnWord, word);
                cmd.Parameters.AddWithValue(Dict.FldnLangId, (int)lang);
                using (var rd = cmd.ExecuteReader())
                {
                    // Only the first row is ever consumed.
                    if (rd.Read())
                    {
                        return new Dict
                        {
                            Id   = rd.GetInt64(0),
                            Word = rd.GetString(1),
                            Vect = (byte[])rd[2],
                            Lang = (FTLangLabel)rd.GetInt32(3)
                        };
                    }
                }
            }
            return null;
        }
    }
 /// <summary>
 /// Give every Add-in dictionary word that has an empty vector a randomly generated one
 /// </summary>
 /// <param name="dbf_w2v_fn">DB word to vector filename</param>
 /// <param name="lang">Language label passed to <c>Dict.CreateRnd</c> for each generated entry</param>
 protected void SubProcFillEmptyVectDictRND(
     string dbf_w2v_fn, FTLangLabel lang)
 {
     using (var dbx = new FastTextProcessDB(dbf_w2v_fn))
     {
         var pending = dbx.Dict(DictDbSet.DictKind.Addin).GetWordsWithEmptyVect();
         if (!pending.Any())
         {
             // Nothing to fill: leave without opening a transaction.
             return;
         }

         var trans = dbx.BeginTransaction();
         try
         {
             var addins    = dbx.Dict(DictDbSet.DictKind.Addin);
             var generator = new Random();
             foreach (var word in pending)
             {
                 var entry = Dict.CreateRnd(generator, word, lang);
                 addins.UpdateVectOfWord(entry);
             }
             trans.Commit();
         }
         catch
         {
             // Undo the partial updates, then let the caller see the original failure.
             trans.Rollback();
             throw;
         }
     }
 }
Beispiel #3
0
        /// <summary>
        /// Run the <c>CmdFindIdByWord</c> command for a word and language and return the id it yields.
        /// </summary>
        /// <param name="word">Value assigned to the command's word parameter.</param>
        /// <param name="lang">Language label; its integer value is assigned to the language parameter.</param>
        /// <returns>
        /// The scalar result converted to <c>long</c>, or <c>null</c> when the command returns
        /// <c>null</c> or <see cref="DBNull"/>.
        /// </returns>
        public long? FindIdByWord(string word, FTLangLabel lang)
        {
            CmdFindIdByWord.Parameters[Dict.FldnWord].Value = word;
            CmdFindIdByWord.Parameters[Dict.FldnLangId].Value = (int)lang;

            object scalar = CmdFindIdByWord.ExecuteScalar();
            if (scalar == null || DBNull.Value.Equals(scalar))
            {
                return null;
            }
            return Convert.ToInt64(scalar);
        }
        /// <summary>
        /// Create a new word to vector db and fill it from a FastText vectors file
        /// </summary>
        /// <param name="ft_vec_fn">FastText vectors filename</param>
        /// <param name="dbf_w2v_fn">DB word to vector filename; must not exist yet</param>
        /// <param name="lang">Language label forwarded to <c>ProcAppendDb</c></param>
        /// <param name="with_insert_or_replace">use insert_or_replace when non unique vocabulary</param>
        protected void ProcCreateDb(string ft_vec_fn, string dbf_w2v_fn, FTLangLabel lang, bool with_insert_or_replace = false)
        {
            // Preconditions: the source vectors must be present, the target DB must be absent.
            var vectors_path = DataArcPath(ft_vec_fn);
            AssertFileExists(vectors_path, "FastText file of vectors");
            AssertFileNotExists(dbf_w2v_fn, "word2vect DB");

            // Create the empty DB, then reuse the append procedure to load the records.
            FastTextProcessDB.CreateDB(dbf_w2v_fn);
            ProcAppendDb(ft_vec_fn, dbf_w2v_fn, lang, with_insert_or_replace);
        }
Beispiel #5
0
        /// <summary>
        /// Resolve each word to an index by calling <c>WordToInx</c> on it, in input order.
        /// </summary>
        /// <param name="words">Words to resolve.</param>
        /// <param name="lang">Language label passed to every lookup.</param>
        /// <returns>An array of the same length as <paramref name="words"/> holding one index per word.</returns>
        public long[] WordsToInxsForParallel(string[] words, FTLangLabel lang)
        {
            // Fetch the three db sets once and reuse them for every word.
            var main_dict  = ProcessDB.Dict(DictDbSet.DictKind.Main);
            var addin_dict = ProcessDB.Dict(DictDbSet.DictKind.Addin);
            var embed_dict = ProcessDB.EmbedDict();

            var result = new long[words.Length];
            var pos    = 0;
            foreach (var word in words)
            {
                result[pos++] = WordToInx(word, lang, main_dict, addin_dict, embed_dict);
            }
            return result;
        }
Beispiel #6
0
        /// <summary>
        /// Create a placeholder entry with <c>Id = -1</c> and an all-zero vector buffer.
        /// </summary>
        /// <param name="word">Word stored in the entry.</param>
        /// <param name="lang">Language label stored in the entry.</param>
        /// <param name="vect_sz">Number of vector components; the buffer holds <c>sizeof(float)</c> bytes per component.</param>
        /// <returns>The new entry.</returns>
        public static Dict CreateEmpty(string word        = "<%NONE%>"
                                       , FTLangLabel lang = FTLangLabel.NotSpecified
                                       , int vect_sz      = DEF_VECT_SIZE)
        {
            // A newly allocated array is already zero-filled, so no explicit Array.Clear is needed.
            return new Dict
            {
                Id   = -1,
                Word = word,
                Vect = new byte[vect_sz * sizeof(float)],
                Lang = lang
            };
        }
Beispiel #7
0
        /// <summary>
        /// Parse one text line of the form "word v1 v2 ... vN" into a dictionary entry.
        /// </summary>
        /// <remarks>
        /// The line is trimmed and split on single spaces. The first token becomes the word; every
        /// remaining token is parsed as a float and the floats are packed into bytes with <c>Float2Byte</c>.
        /// </remarks>
        /// <param name="str">Line to parse.</param>
        /// <param name="lang">Language label stored in the entry.</param>
        /// <returns>A new entry with Word, Vect and Lang set.</returns>
        /// <exception cref="FormatException">A component token is not a valid number.</exception>
        public static Dict CreateParseFT(string str, FTLangLabel lang)
        {
            var sarr  = str.Trim().Split(' ');
            var sfarr = new string[sarr.Length - 1];

            Array.Copy(sarr, 1, sfarr, 0, sfarr.Length);
            // Vector files are machine-generated data, so parse with the invariant culture. With the
            // current culture, a '.' decimal separator is misread on comma-decimal systems (e.g. de-DE).
            var farr = Array.ConvertAll(
                sfarr, s => float.Parse(s, System.Globalization.CultureInfo.InvariantCulture));
            var barr = Float2Byte(farr);

            return new Dict
            {
                Word = sarr[0], Vect = barr, Lang = lang
            };
        }
Beispiel #8
0
        /// <summary>
        /// Create an entry with <c>Id = -1</c> whose vector components are drawn from <paramref name="rnd"/>.
        /// </summary>
        /// <remarks>
        /// Each component is computed as <c>vv_max - (vv_max - vv_min) * rnd.NextDouble()</c>, narrowed
        /// to float, and the components are packed into bytes with <c>Float2Byte</c>.
        /// </remarks>
        /// <param name="rnd">Random source; one <c>NextDouble</c> call is made per component.</param>
        /// <param name="word">Word stored in the entry.</param>
        /// <param name="lang">Language label stored in the entry.</param>
        /// <param name="vect_sz">Number of vector components.</param>
        /// <param name="vv_min">Lower bound used in the component formula.</param>
        /// <param name="vv_max">Upper bound used in the component formula.</param>
        /// <returns>The new entry.</returns>
        public static Dict CreateRnd(Random rnd, string word, FTLangLabel lang
                                     , int vect_sz = DEF_VECT_SIZE, float vv_min = DEF_VECT_MIN, float vv_max = DEF_VECT_MAX)
        {
            var components = new float[vect_sz];
            var span       = vv_max - vv_min;

            var pos = 0;
            while (pos < components.Length)
            {
                components[pos] = (float)(vv_max - span * rnd.NextDouble());
                pos++;
            }

            var packed = Float2Byte(components);
            return new Dict
            {
                Id = -1, Word = word, Vect = packed, Lang = lang
            };
        }
Beispiel #9
0
        /// <summary>
        /// Resolve a word to an index: look it up in the main dictionary first, then in the add-ins.
        /// </summary>
        /// <param name="word">Word to resolve.</param>
        /// <param name="lang">Language label used for both lookups.</param>
        /// <param name="dict">Main dictionary set.</param>
        /// <param name="dict_addins">Add-in dictionary set.</param>
        /// <param name="embed">Embed dictionary set handed to <c>GetOrAddEmbed</c>.</param>
        /// <returns>The index returned by <c>GetOrAddEmbed</c>.</returns>
        long WordToInx(string word, FTLangLabel lang, DictDbSet dict, DictDbSet dict_addins, EmbedDictDbSet embed)
        {
            long? main_id = dict.FindIdByWord(word, lang);
            if (main_id.HasValue)
            {
                return GetOrAddEmbed(embed, main_id, DictDbSet.DictKind.Main, word, lang);
            }

            // Not in the main dictionary: fall back to the add-ins. The add-in id may also be
            // missing; it is passed through unchanged either way.
            long? addin_id = dict_addins.FindIdByWord(word, lang);
            return GetOrAddEmbed(embed, addin_id, DictDbSet.DictKind.Addin, word, lang);
        }
        /// <summary>
        /// Append records to word to vector db
        /// </summary>
        /// <remarks>
        /// The first line of the vectors file is a header with two space-separated fields; every further
        /// non-empty line is parsed with <c>Dict.CreateParseFT</c> and written to the main dictionary.
        /// All writes run in one transaction with the words index disabled; the index is re-enabled just
        /// before the commit. On any failure the transaction is rolled back and the exception is rethrown.
        /// </remarks>
        /// <param name="ft_vec_fn">FastText vectors filename</param>
        /// <param name="dbf_w2v_fn">DB word to vector filename</param>
        /// <param name="lang">Language label stored with every appended record</param>
        /// <param name="with_insert_or_replace">use insert_or_replace when non unique vocabulary</param>
        protected void ProcAppendDb(string ft_vec_fn, string dbf_w2v_fn, FTLangLabel lang, bool with_insert_or_replace = false)
        {
            var fvec = DataArcPath(ft_vec_fn);

            AssertFileExists(fvec, "FastText file of vectors");

            AssertFileExists(dbf_w2v_fn, "word2vect DB");

            using (var dbx = new FastTextProcessDB(dbf_w2v_fn, foreign_keys: false))
            {
                var w2v_tbl = dbx.Dict(DictDbSet.DictKind.Main);
                var trans   = dbx.BeginTransaction();
                try
                {
                    w2v_tbl.ControlWordsIndex(is_enabled: false);
                    using (var sr = new StreamReader(fvec))
                    {
                        // header
                        var line = sr.ReadLine();
                        // An empty file has no header line; fail with a clear assertion instead of
                        // a NullReferenceException on the Split below.
                        Assert.NotNull(line);
                        var harr = line.Split(' ');
                        Assert.Equal(2, harr.Length);
                        Log($"'{fvec}': {harr[0]} - samples count, {harr[1]} - sample dim.");
                        // data
                        while ((line = sr.ReadLine()) != null)
                        {
                            if (string.IsNullOrEmpty(line))
                            {
                                continue;
                            }
                            var w2v = Dict.CreateParseFT(line, lang);
                            if (with_insert_or_replace)
                            {
                                w2v_tbl.InsertOrReplace(w2v);
                            }
                            else
                            {
                                w2v_tbl.Insert(w2v);
                            }
                        }
                    }
                    Log("ControlWordsIndex create...");
                    w2v_tbl.ControlWordsIndex(is_enabled: true);
                    Log("Done");
                    trans.Commit();
                }
                catch
                {
                    // Same failure handling as the other transactional procedures: undo, then rethrow.
                    trans.Rollback();
                    throw;
                }
            }
        }
Beispiel #11
0
        /// <summary>
        /// Get the per-language map of new items, creating the outer and inner maps on first use.
        /// </summary>
        /// <param name="lang">Language label whose map is requested.</param>
        /// <returns>The map stored for <paramref name="lang"/>; repeated calls return the same instance.</returns>
        Dictionary <string, NewItem> GetSetOfNew(FTLangLabel lang)
        {
            lock (SetOfNewLock)
            {
                if (_setOfNew == null)
                {
                    _setOfNew = new Dictionary <FTLangLabel, Dictionary <string, NewItem> >();
                }

                // Single lookup instead of ContainsKey followed by the indexer.
                Dictionary <string, NewItem> res;
                if (!_setOfNew.TryGetValue(lang, out res))
                {
                    res = new Dictionary <string, NewItem>();
                    _setOfNew[lang] = res;
                }
                return res;
            }
        }
 /// <summary>
 /// Fill Empty Add-in Dictionary Vectors: words of the Add-in dictionary that have an empty vector
 /// are pushed to a FastText process and each result line is written back as the word's vector
 /// </summary>
 /// <param name="ft_bin_fn">FastText bin model filename</param>
 /// <param name="dbf_w2v_fn">DB word to vector filename</param>
 /// <param name="lang">Language label attached to every entry parsed from the FastText output</param>
 protected void SubProcFillEmptyVectDict(
     string ft_bin_fn, string dbf_w2v_fn, FTLangLabel lang)
 {
     using (var dbx = new FastTextProcessDB(dbf_w2v_fn))
     {
         var words = dbx.Dict(DictDbSet.DictKind.Addin).GetWordsWithEmptyVect();
         // The model file, the executable and the transaction are only touched when there is work to do.
         if (words.Any())
         {
             var fmod = DataArcPath(ft_bin_fn);
             AssertFileExists(fmod, "FastText model file");
             var fexe = FastTextBin;
             AssertFileExists(fexe, "FastText executable");
             var trans = dbx.BeginTransaction();
             try
             {
                 var dict = dbx.Dict(DictDbSet.DictKind.Addin);
                 using (var ftl = FTCmd.CreateW2V(fexe, fmod))
                 {
                     // Register the per-line callback before any word is pushed: each result line
                     // is parsed into an entry and stored as that word's vector.
                     ftl.RunByLineAsync(
                         (txt_src, res_txt) =>
                         dict.UpdateVectOfWord(Dict.CreateParseFT(res_txt, lang))
                         );
                     foreach (var w in words)
                     {
                         ftl.Push(w);
                     }
                 }
                 // NOTE(review): presumably disposing ftl waits until all pushed words have been
                 // processed by the callback, so the commit sees every update - confirm.
                 trans.Commit();
             }
             catch
             {
                 // Undo the partial updates, then rethrow the original failure.
                 trans.Rollback();
                 throw;
             }
         }
     }
 }
Beispiel #13
0
        /// <summary>
        /// Stream all dictionary rows whose language column equals <paramref name="lang"/>.
        /// </summary>
        /// <remarks>
        /// This is an iterator: the query runs when enumeration starts, and the command and reader stay
        /// open until the enumeration finishes or the enumerator is disposed.
        /// </remarks>
        /// <param name="lang">Language label; its integer value is bound to the language parameter.</param>
        /// <returns>A lazily evaluated sequence with one entry per matching row.</returns>
        public IEnumerable <Dict> GetAll(FTLangLabel lang)
        {
            var sql = string.Format(
                "SELECT {1}, {2}, {3}, {4} FROM {0} WHERE {4} = ${4}",
                TableName
                , Dict.FldnId, Dict.FldnWord, Dict.FldnVect, Dict.FldnLangId);

            // The command is created per call, so release it together with the reader.
            using (var cmd = Ctx.CreateCmd(sql))
            {
                cmd.Parameters.AddWithValue(Dict.FldnLangId, (int)lang);
                using (var rd = cmd.ExecuteReader())
                {
                    while (rd.Read())
                    {
                        yield return new Dict
                        {
                            Id   = rd.GetInt64(0),
                            Word = rd.GetString(1),
                            Vect = (byte[])rd[2],
                            Lang = (FTLangLabel)rd.GetInt32(3)
                        };
                    }
                }
            }
        }
 /// <summary>
 /// Create an item from a text and its language label.
 /// </summary>
 /// <param name="text">Value stored in <c>Text</c>.</param>
 /// <param name="lang">Value stored in <c>Lang</c>.</param>
 public PreprocessItem(string text, FTLangLabel lang)
 {
     this.Text = text;
     this.Lang = lang;
 }
Beispiel #15
0
        /// <summary>
        /// Return the index for a word, counting the use either in <c>SetOfDirty</c> (index already
        /// found via <paramref name="embed"/>) or in the per-language set of new items.
        /// </summary>
        /// <param name="embed">Embed dictionary set used for the index lookup and for new index allocation.</param>
        /// <param name="dict_id">Dictionary id of the word, or <c>null</c> when the word has none.</param>
        /// <param name="dict_kind">Dictionary kind the id belongs to.</param>
        /// <param name="word">Word used as the key in the set of new items.</param>
        /// <param name="lang">Language label selecting the set of new items.</param>
        /// <returns>The found index, or the index of the new-item record for the word.</returns>
        long GetOrAddEmbed(EmbedDictDbSet embed
                           , long?dict_id, DictDbSet.DictKind dict_kind, string word, FTLangLabel lang)
        {
            long? inx = null;
            if (dict_id.HasValue)
            {
                inx = embed.FindInxById(dict_id.Value, dict_kind);
            }

            if (!inx.HasValue)
            {
                // No index found: record the word in the set of new items, taking the next
                // index only the first time the word is seen.
                var fresh = new NewItem {
                    DictKind = dict_kind, DictId = dict_id, Freq = 0
                };
                lock (SetOfNewLock)
                {
                    var pending = GetSetOfNew(lang);
                    NewItem known;
                    if (pending.TryGetValue(word, out known))
                    {
                        fresh = known;
                    }
                    else
                    {
                        fresh.Inx = GetNextEmbedInx(embed);
                    }
                    inx = fresh.Inx;
                    fresh.Freq++;
                    pending[word] = fresh;
                }
                return inx.Value;
            }

            // Index found: bump the frequency delta kept for it.
            long key     = inx.Value;
            var  counter = new ExistingItem {
                FreqAdd = 0
            };
            lock (SetOfDirtyLock)
            {
                if (SetOfDirty.ContainsKey(key))
                {
                    counter = SetOfDirty[key];
                }
                counter.FreqAdd++;
                SetOfDirty[key] = counter;
            }
            return key;
        }