/// <summary>
/// Looks up a single dictionary row by word, accepting rows whose language id equals
/// <paramref name="lang"/> or <see cref="FTLangLabel.NotSpecified"/>.
/// </summary>
/// <param name="word">Word to search for; bound as a SQL parameter.</param>
/// <param name="lang">Language label to match; bound as a SQL parameter.</param>
/// <returns>The first row returned by the query as a <see cref="Dict"/>, or <c>null</c> when there is none.</returns>
public Dict FindByWord(string word, FTLangLabel lang) {
    var sql = string.Format(
        "SELECT {1}, {2}, {3}, {4} FROM {0} WHERE {2} = ${2} AND ({4} = ${4} OR {4} = {5}) ",
        TableName,
        Dict.FldnId, Dict.FldnWord, Dict.FldnVect, Dict.FldnLangId,
        (int)FTLangLabel.NotSpecified);

    // The command is created for this call only, so it is released when the lookup ends.
    using (var cmd = Ctx.CreateCmd(sql)) {
        cmd.Parameters.AddWithValue(Dict.FldnWord, word);
        cmd.Parameters.AddWithValue(Dict.FldnLangId, (int)lang);
        using (var rd = cmd.ExecuteReader()) {
            // Only the first row is ever consumed, so a single Read() is sufficient.
            if (rd.Read()) {
                return new Dict {
                    Id = rd.GetInt64(0),
                    Word = rd.GetString(1),
                    Vect = (byte[])rd[2],
                    Lang = (FTLangLabel)rd.GetInt32(3)
                };
            }
        }
    }
    return null;
}
}
/// <summary>
/// Fill empty add-in dictionary vectors using random vector generation.
/// </summary>
/// <param name="dbf_w2v_fn">DB word to vector filename</param>
/// <param name="lang">Language label passed to <c>Dict.CreateRnd</c> for every generated entry</param>
protected void SubProcFillEmptyVectDictRND(string dbf_w2v_fn, FTLangLabel lang) {
    using (var dbx = new FastTextProcessDB(dbf_w2v_fn)) {
        // Materialize once: the result is tested for emptiness and then iterated,
        // which would otherwise enumerate the underlying sequence twice.
        var words = dbx.Dict(DictDbSet.DictKind.Addin).GetWordsWithEmptyVect().ToList();
        if (words.Count == 0) {
            return;
        }

        var trans = dbx.BeginTransaction();
        try {
            // The dictionary set is fetched again after the transaction has been started,
            // exactly as the original code did.
            var dict = dbx.Dict(DictDbSet.DictKind.Addin);
            var rnd = new Random();
            foreach (var w in words) {
                dict.UpdateVectOfWord(Dict.CreateRnd(rnd, w, lang));
            }
            trans.Commit();
        } catch {
            // Undo any partial updates, then let the caller see the original failure.
            trans.Rollback();
            throw;
        }
    }
}
/// <summary>
/// Runs the <c>CmdFindIdByWord</c> command for the given word and language and returns its scalar result.
/// </summary>
/// <param name="word">Value assigned to the command's word parameter.</param>
/// <param name="lang">Language label; its integer value is assigned to the language-id parameter.</param>
/// <returns>The scalar converted to <see cref="long"/>, or <c>null</c> when the command yields <c>null</c> or <see cref="DBNull"/>.</returns>
public long? FindIdByWord(string word, FTLangLabel lang) {
    var cmd = CmdFindIdByWord;
    cmd.Parameters[Dict.FldnWord].Value = word;
    cmd.Parameters[Dict.FldnLangId].Value = (int)lang;

    var scalar = cmd.ExecuteScalar();
    if (scalar == null || scalar is DBNull) {
        return null;
    }
    return Convert.ToInt64(scalar);
}
/// <summary>
/// Creates a new word-to-vector DB file and then fills it through <c>ProcAppendDb</c>.
/// </summary>
/// <param name="ft_vec_fn">FastText vectors filename (resolved via <c>DataArcPath</c> for the existence check)</param>
/// <param name="dbf_w2v_fn">DB word to vector filename; must not exist yet</param>
/// <param name="lang">Language label forwarded to <c>ProcAppendDb</c></param>
/// <param name="with_insert_or_replace">use insert_or_replace when non unique vocabulary</param>
protected void ProcCreateDb(string ft_vec_fn, string dbf_w2v_fn, FTLangLabel lang, bool with_insert_or_replace = false) {
    // Preconditions: the source vectors must be present and the target DB must be absent.
    AssertFileExists(DataArcPath(ft_vec_fn), "FastText file of vectors");
    AssertFileNotExists(dbf_w2v_fn, "word2vect DB");

    FastTextProcessDB.CreateDB(dbf_w2v_fn);
    ProcAppendDb(ft_vec_fn, dbf_w2v_fn, lang, with_insert_or_replace);
}
/// <summary>
/// Maps every word of <paramref name="words"/> to an index via <c>WordToInx</c>, preserving order.
/// </summary>
/// <param name="words">Words to map.</param>
/// <param name="lang">Language label passed to each lookup.</param>
/// <returns>An array of the same length as <paramref name="words"/> holding one index per word.</returns>
public long[] WordsToInxsForParallel(string[] words, FTLangLabel lang) {
    // The three DB sets are obtained once and shared by all lookups in this call.
    var mainDict = ProcessDB.Dict(DictDbSet.DictKind.Main);
    var addinDict = ProcessDB.Dict(DictDbSet.DictKind.Addin);
    var embedDict = ProcessDB.EmbedDict();

    var result = new long[words.Length];
    var pos = 0;
    foreach (var word in words) {
        result[pos++] = WordToInx(word, lang, mainDict, addinDict, embedDict);
    }
    return result;
}
/// <summary>
/// Creates a <see cref="Dict"/> with <c>Id = -1</c> and an all-zero vector buffer.
/// </summary>
/// <param name="word">Word stored in the entry.</param>
/// <param name="lang">Language label stored in the entry.</param>
/// <param name="vect_sz">Number of vector components; the buffer holds 4 bytes per component.</param>
/// <returns>The new entry.</returns>
public static Dict CreateEmpty(string word = "<%NONE%>",
                               FTLangLabel lang = FTLangLabel.NotSpecified,
                               int vect_sz = DEF_VECT_SIZE) {
    return new Dict {
        Id = -1,
        Word = word,
        // A newly allocated array is already zero-filled, so no explicit clearing is needed.
        Vect = new byte[vect_sz * sizeof(float)],
        Lang = lang
    };
}
/// <summary>
/// Parses one space-separated text line into a <see cref="Dict"/>: the first token becomes the
/// word, every following token is parsed as a <see cref="float"/> vector component.
/// </summary>
/// <param name="str">Line to parse; leading and trailing whitespace is ignored.</param>
/// <param name="lang">Language label stored in the entry.</param>
/// <returns>The parsed entry, with the vector converted by <c>Float2Byte</c>.</returns>
/// <exception cref="System.FormatException">A component token is not a valid number.</exception>
public static Dict CreateParseFT(string str, FTLangLabel lang) {
    var sarr = str.Trim().Split(' ');
    var farr = new float[sarr.Length - 1];
    for (int inx = 0; inx < farr.Length; inx++) {
        // Parse with the invariant culture so '.' is always the decimal separator,
        // independent of the machine's regional settings.
        farr[inx] = float.Parse(sarr[inx + 1], System.Globalization.CultureInfo.InvariantCulture);
    }
    var barr = Float2Byte(farr);
    return new Dict { Word = sarr[0], Vect = barr, Lang = lang };
}
/// <summary>
/// Creates a <see cref="Dict"/> with <c>Id = -1</c> whose vector components are drawn from
/// <paramref name="rnd"/> as <c>vv_max - (vv_max - vv_min) * rnd.NextDouble()</c>.
/// </summary>
/// <param name="rnd">Random source; one <c>NextDouble()</c> call is made per component.</param>
/// <param name="word">Word stored in the entry.</param>
/// <param name="lang">Language label stored in the entry.</param>
/// <param name="vect_sz">Number of vector components.</param>
/// <param name="vv_min">Lower bound used to compute the value range.</param>
/// <param name="vv_max">Upper bound from which the random offset is subtracted.</param>
/// <returns>The new entry, with the vector converted by <c>Float2Byte</c>.</returns>
public static Dict CreateRnd(Random rnd, string word, FTLangLabel lang,
                             int vect_sz = DEF_VECT_SIZE,
                             float vv_min = DEF_VECT_MIN,
                             float vv_max = DEF_VECT_MAX) {
    var values = new float[vect_sz];
    var span = vv_max - vv_min;
    var i = 0;
    while (i < vect_sz) {
        values[i] = (float)(vv_max - span * rnd.NextDouble());
        i++;
    }

    var bytes = Float2Byte(values);
    return new Dict { Id = -1, Word = word, Vect = bytes, Lang = lang };
}
/// <summary>
/// Resolves a word to an index: the main dictionary is queried first and, when it has no id
/// for the word, the add-in dictionary is queried instead. The id (possibly <c>null</c>) and
/// the dictionary kind are then handed to <c>GetOrAddEmbed</c>.
/// </summary>
/// <param name="word">Word to resolve.</param>
/// <param name="lang">Language label used for the lookups.</param>
/// <param name="dict">Main dictionary set.</param>
/// <param name="dict_addins">Add-in dictionary set.</param>
/// <param name="embed">Embedding dictionary set passed through to <c>GetOrAddEmbed</c>.</param>
/// <returns>The index returned by <c>GetOrAddEmbed</c>.</returns>
long WordToInx(string word, FTLangLabel lang, DictDbSet dict, DictDbSet dict_addins, EmbedDictDbSet embed) {
    var kind = DictDbSet.DictKind.Main;
    long? id = dict.FindIdByWord(word, lang);
    if (!id.HasValue) {
        // Not in the main dictionary: fall back to the add-in dictionary.
        kind = DictDbSet.DictKind.Addin;
        id = dict_addins.FindIdByWord(word, lang);
    }
    return GetOrAddEmbed(embed, id, kind, word, lang);
}
/// <summary>
/// Append records to word to vector db.
/// The whole import runs inside one transaction that is rolled back if any step fails.
/// </summary>
/// <param name="ft_vec_fn">FastText vectors filename (resolved via <c>DataArcPath</c>)</param>
/// <param name="dbf_w2v_fn">DB word to vector filename; must already exist</param>
/// <param name="lang">Language label assigned to every parsed record</param>
/// <param name="with_insert_or_replace">use insert_or_replace when non unique vocabulary</param>
protected void ProcAppendDb(string ft_vec_fn, string dbf_w2v_fn, FTLangLabel lang, bool with_insert_or_replace = false) {
    var fvec = DataArcPath(ft_vec_fn);
    AssertFileExists(fvec, "FastText file of vectors");
    AssertFileExists(dbf_w2v_fn, "word2vect DB");
    using (var dbx = new FastTextProcessDB(dbf_w2v_fn, foreign_keys: false)) {
        var w2v_tbl = dbx.Dict(DictDbSet.DictKind.Main);
        var trans = dbx.BeginTransaction();
        try {
            // The words index is switched off for the bulk load and switched back on afterwards.
            w2v_tbl.ControlWordsIndex(is_enabled: false);
            using (var sr = new StreamReader(fvec)) {
                // header
                var line = sr.ReadLine();
                // An empty file yields a null header; treat it as an empty line so the
                // assertion below reports the malformed header instead of a null dereference.
                var harr = (line ?? string.Empty).Split(' ');
                Assert.Equal(2, harr.Length);
                Log($"'{fvec}': {harr[0]} - samples count, {harr[1]} - sample dim.");
                // data
                while ((line = sr.ReadLine()) != null) {
                    if (string.IsNullOrEmpty(line)) {
                        continue;
                    }
                    var w2v = Dict.CreateParseFT(line, lang);
                    if (with_insert_or_replace) {
                        w2v_tbl.InsertOrReplace(w2v);
                    } else {
                        w2v_tbl.Insert(w2v);
                    }
                }
            }
            Log("ControlWordsIndex create...");
            w2v_tbl.ControlWordsIndex(is_enabled: true);
            Log("Done");
            trans.Commit();
        } catch {
            // Same failure handling as the SubProcFillEmptyVect* methods: undo, then rethrow.
            trans.Rollback();
            throw;
        }
    }
}
/// <summary>
/// Returns the word-to-<c>NewItem</c> map kept for <paramref name="lang"/>, creating the outer
/// per-language map and/or the inner map on first use. Runs under <c>SetOfNewLock</c>.
/// </summary>
/// <param name="lang">Language label selecting the map.</param>
/// <returns>The existing or newly created map for the language.</returns>
Dictionary<string, NewItem> GetSetOfNew(FTLangLabel lang) {
    lock (SetOfNewLock) {
        if (_setOfNew == null) {
            _setOfNew = new Dictionary<FTLangLabel, Dictionary<string, NewItem>>();
        }

        Dictionary<string, NewItem> perLang;
        if (!_setOfNew.TryGetValue(lang, out perLang)) {
            perLang = new Dictionary<string, NewItem>();
            _setOfNew[lang] = perLang;
        }
        return perLang;
    }
}
/// <summary>
/// Fill empty add-in dictionary vectors.
/// Every word reported by <c>GetWordsWithEmptyVect</c> is pushed to an <c>FTCmd</c> instance; the
/// callback registered through <c>RunByLineAsync</c> parses its second argument with
/// <c>Dict.CreateParseFT</c> and passes the result to <c>UpdateVectOfWord</c>. All updates happen
/// inside one transaction that is rolled back on failure. Nothing is done when there are no such words.
/// </summary>
/// <param name="ft_bin_fn">FastText bin model filename</param>
/// <param name="dbf_w2v_fn">DB word to vector filename</param>
/// <param name="lang">Language label assigned to every parsed entry</param>
protected void SubProcFillEmptyVectDict(string ft_bin_fn, string dbf_w2v_fn, FTLangLabel lang) {
    using (var dbx = new FastTextProcessDB(dbf_w2v_fn)) {
        var pending = dbx.Dict(DictDbSet.DictKind.Addin).GetWordsWithEmptyVect();
        if (!pending.Any()) {
            return;
        }

        // The model and executable are only checked when there is actual work to do.
        var modelPath = DataArcPath(ft_bin_fn);
        AssertFileExists(modelPath, "FastText model file");
        var exePath = FastTextBin;
        AssertFileExists(exePath, "FastText executable");

        var trans = dbx.BeginTransaction();
        try {
            var addins = dbx.Dict(DictDbSet.DictKind.Addin);
            using (var ftl = FTCmd.CreateW2V(exePath, modelPath)) {
                ftl.RunByLineAsync(
                    (src, res) => addins.UpdateVectOfWord(Dict.CreateParseFT(res, lang))
                );
                foreach (var word in pending) {
                    ftl.Push(word);
                }
            }
            // Commit only after the FTCmd instance has been disposed.
            trans.Commit();
        } catch {
            trans.Rollback();
            throw;
        }
    }
}
/// <summary>
/// Lazily enumerates all rows whose language id equals <paramref name="lang"/>.
/// This is an iterator: the query is executed when enumeration starts, and the reader and
/// command are released when enumeration finishes or the enumerator is disposed.
/// </summary>
/// <param name="lang">Language label to match; bound as a SQL parameter.</param>
/// <returns>One <see cref="Dict"/> per row returned by the query.</returns>
public IEnumerable<Dict> GetAll(FTLangLabel lang) {
    var sql = string.Format(
        "SELECT {1}, {2}, {3}, {4} FROM {0} WHERE {4} = ${4}",
        TableName,
        Dict.FldnId, Dict.FldnWord, Dict.FldnVect, Dict.FldnLangId);

    // The command is created for this enumeration only, so it is disposed together with the reader.
    using (var cmd = Ctx.CreateCmd(sql)) {
        cmd.Parameters.AddWithValue(Dict.FldnLangId, (int)lang);
        using (var rd = cmd.ExecuteReader()) {
            while (rd.Read()) {
                yield return new Dict {
                    Id = rd.GetInt64(0),
                    Word = rd.GetString(1),
                    Vect = (byte[])rd[2],
                    Lang = (FTLangLabel)rd.GetInt32(3)
                };
            }
        }
    }
}
/// <summary>
/// Creates an item from a text and its language label.
/// </summary>
/// <param name="text">Value stored in <c>Text</c>.</param>
/// <param name="lang">Value stored in <c>Lang</c>.</param>
public PreprocessItem(string text, FTLangLabel lang) {
    this.Text = text;
    this.Lang = lang;
}
/// <summary>
/// Returns the index for a word and records one more occurrence of it.
/// When <paramref name="dict_id"/> has a value and <c>embed.FindInxById</c> finds an index, the
/// <c>FreqAdd</c> counter of that index in <c>SetOfDirty</c> is incremented (under <c>SetOfDirtyLock</c>).
/// Otherwise the word is tracked in the per-language map from <c>GetSetOfNew</c> (under
/// <c>SetOfNewLock</c>): on first sight it receives an index from <c>GetNextEmbedInx</c>, and its
/// <c>Freq</c> counter is incremented on every call.
/// </summary>
/// <param name="embed">Embedding dictionary set used for the index lookup and for index allocation.</param>
/// <param name="dict_id">Dictionary id of the word, or <c>null</c> when the word has none.</param>
/// <param name="dict_kind">Dictionary kind the id belongs to.</param>
/// <param name="word">Word used as the key in the per-language map.</param>
/// <param name="lang">Language label selecting the per-language map.</param>
/// <returns>The existing or newly assigned index.</returns>
long GetOrAddEmbed(EmbedDictDbSet embed, long? dict_id, DictDbSet.DictKind dict_kind, string word, FTLangLabel lang) {
    long? inx = null;
    if (dict_id.HasValue) {
        inx = embed.FindInxById(dict_id.Value, dict_kind);
    }

    if (inx.HasValue) {
        // Known index: bump its pending frequency increment.
        var key = inx.Value;
        lock (SetOfDirtyLock) {
            ExistingItem dirty;
            if (!SetOfDirty.TryGetValue(key, out dirty)) {
                dirty = new ExistingItem { FreqAdd = 0 };
            }
            dirty.FreqAdd++;
            SetOfDirty[key] = dirty;
        }
        return key;
    }

    // No stored index: track the word in the per-language map of new items.
    lock (SetOfNewLock) {
        var pending = GetSetOfNew(lang);
        NewItem entry;
        if (!pending.TryGetValue(word, out entry)) {
            entry = new NewItem { DictKind = dict_kind, DictId = dict_id, Freq = 0 };
            entry.Inx = GetNextEmbedInx(embed);
        }
        inx = entry.Inx;
        entry.Freq++;
        // Written back explicitly, as in the original, so the updated counter is stored.
        pending[word] = entry;
    }
    return inx.Value;
}