/// <summary>
/// Fill Empty Add-in Dictionary Vectors using Random Vectors generation
/// </summary>
/// <param name="dbf_w2v_fn">DB word to vector filename</param>
/// <param name="lang">Language label passed to <c>Dict.CreateRnd</c> for every generated entry</param>
protected void SubProcFillEmptyVectDictRND(
    string dbf_w2v_fn, LangLabel lang)
{
    using (var dbx = new FastTextProcessDB(dbf_w2v_fn))
    {
        // Materialize once: the sequence is tested for emptiness and then iterated while
        // the same dictionary is being updated. If GetWordsWithEmptyVect is lazy (TODO confirm),
        // the original enumerated it twice and read it while writing.
        var words = dbx.Dict(DictDbSet.DictKind.Addin).GetWordsWithEmptyVect().ToList();
        if (words.Count == 0)
        {
            return;
        }

        var trans = dbx.BeginTransaction();
        try
        {
            // The dictionary is requested again after the transaction has started, as in the original.
            var dict = dbx.Dict(DictDbSet.DictKind.Addin);
            var rnd = new Random();
            foreach (var w in words)
            {
                dict.UpdateVectOfWord(Dict.CreateRnd(rnd, w, lang));
            }
            trans.Commit();
        }
        catch
        {
            // Undo partial updates and let the caller see the original failure.
            trans.Rollback();
            throw;
        }
    }
}
/// <summary>
/// Looks up a single dictionary row by word, accepting rows stored either
/// with the requested language or with <c>LangLabel.NA</c>.
/// </summary>
/// <param name="word">Word to search for</param>
/// <param name="lang">Requested language label</param>
/// <returns>The first matching row, or null when the query returns no rows</returns>
public Dict FindByWord(string word, LangLabel lang)
{
    var sql = string.Format(
        "SELECT {1}, {2}, {3}, {4} FROM {0} WHERE {2} = ${2} AND ({4} = ${4} OR {4} = {5}) ",
        TableName
        , Dict.FldnId, Dict.FldnWord, Dict.FldnVect, Dict.FldnLangId, (int)LangLabel.NA);
    // Dispose the command as well as the reader; the original left the command undisposed.
    using (var cmd = Ctx.CreateCmd(sql))
    {
        cmd.Parameters.AddWithValue(Dict.FldnWord, word);
        cmd.Parameters.AddWithValue(Dict.FldnLangId, (int)lang);
        using (var rd = cmd.ExecuteReader())
        {
            // Only the first row is ever used, so a single Read() is enough.
            if (rd.Read())
            {
                return new Dict
                {
                    Id = rd.GetInt64(0),
                    Word = rd.GetString(1),
                    Vect = (byte[])rd[2],
                    Lang = (LangLabel)rd.GetInt32(3)
                };
            }
        }
    }
    return null;
}
} // closes the enclosing type, whose header lies outside this chunk
/// <summary>
/// Removes from a word everything matched by the cleanup pattern chosen for the language.
/// </summary>
/// <param name="word">Word to clean</param>
/// <param name="lang">Language that selects the cleanup pattern (en, or ru/uk)</param>
/// <returns>
/// An empty string for null/blank input; the word unchanged when no pattern exists
/// for the language; otherwise the cleaned word with surrounding whitespace trimmed.
/// </returns>
public string GetLettersOnly(string word, LangLabel lang)
{
    if (string.IsNullOrWhiteSpace(word))
    {
        return string.Empty;
    }

    Regex pattern;
    if (lang == LangLabel.en)
    {
        pattern = rexClnWordEn;
    }
    else if (lang == LangLabel.ru || lang == LangLabel.uk)
    {
        pattern = rexClnWordCyr;
    }
    else
    {
        pattern = null;
    }

    // Without a pattern the word is returned exactly as given (not trimmed).
    return pattern == null
        ? word
        : pattern.Replace(word, string.Empty).Trim();
}
/// <summary>
/// Creates the word-to-vector DB and appends the FastText vectors twice,
/// once with the current language set to ru and once with uk, then inserts the predefined macros.
/// Entries whose word is empty after <c>GetLettersOnly</c> are logged and skipped.
/// </summary>
public void ProcCreateRuUkDb()
{
    ProcCreateDb(DBF_W2V);
    var cleaner = new ServiceRoutines();

    // Filter predicate: accept an entry only when its cleaned word is non-empty.
    bool HasLetters(Dict entry)
    {
        var cleaned = cleaner.GetLettersOnly(entry.Word, entry.Lang);
        if (string.IsNullOrEmpty(cleaned))
        {
            Log($"Skip {entry}");
            return false;
        }
        return true;
    }

    // LANG is read after _lang is assigned on each pass (presumably it is backed by _lang).
    foreach (var current in new[] { LangLabel.ru, LangLabel.uk })
    {
        _lang = current;
        ProcAppendDb(FTF_VECTOR, DBF_W2V, LANG,
            with_insert_or_replace: true,
            fn_infilter_predicat: HasLetters);
    }

    SubProcInsertPredefinedMacro(DBF_W2V);
}
/// <summary>
/// Returns the FastText label declared through <c>LangInfoAttribute</c> on the enum member.
/// </summary>
/// <param name="enumVal">Language label to inspect</param>
/// <returns>
/// The attribute's <c>FTLabel</c>, or an empty string when the member has no such attribute
/// or when the value does not correspond to a named member of <c>LangLabel</c>.
/// </returns>
public static string GetFastTextLabel(this LangLabel enumVal)
{
    var memInfo = typeof(LangLabel).GetMember(enumVal.ToString());
    // An undefined value (e.g. an int cast to LangLabel) stringifies to a number,
    // so no member is found; indexing memInfo[0] would throw in that case.
    if (memInfo.Length == 0)
    {
        return "";
    }

    var attributes = memInfo[0].GetCustomAttributes(typeof(LangInfoAttribute), false);
    return attributes.Length > 0
        ? ((LangInfoAttribute)attributes[0]).FTLabel
        : "";
}
/// <summary>
/// Runs the prepared id-lookup command for the given word and language.
/// </summary>
/// <param name="word">Word bound to the word parameter</param>
/// <param name="lang">Language bound (as int) to the language-id parameter</param>
/// <returns>The scalar result converted to Int64, or null when the result is null or DBNull</returns>
public long? FindIdByWord(string word, LangLabel lang)
{
    var lookup = CmdFindIdByWord;
    lookup.Parameters[Dict.FldnWord].Value = word;
    lookup.Parameters[Dict.FldnLangId].Value = (int)lang;

    var scalar = lookup.ExecuteScalar();
    if (scalar == null || DBNull.Value.Equals(scalar))
    {
        return null;
    }
    return Convert.ToInt64(scalar);
}
/// <summary>
/// Maps every word to its embedding index via <c>WordToInx</c>, preserving input order.
/// </summary>
/// <param name="words">Words to translate</param>
/// <param name="lang">Language of the words</param>
/// <returns>Array of embedding indexes, one per input word</returns>
public long[] WordsToInxsForParallel(string[] words, LangLabel lang)
{
    // The dictionary sets are resolved once and shared by all lookups of this call.
    var mainDict = ProcessDB.Dict(DictDbSet.DictKind.Main);
    var addinDict = ProcessDB.Dict(DictDbSet.DictKind.Addin);
    var embedDict = ProcessDB.EmbedDict();

    var result = new long[words.Length];
    var pos = 0;
    foreach (var word in words)
    {
        result[pos] = WordToInx(word, lang, mainDict, addinDict, embedDict);
        pos++;
    }
    return result;
}
/// <summary>
/// Append records to word to vector db
/// </summary>
/// <param name="ft_vec_fn">FastText vectors filename</param>
/// <param name="dbf_w2v_fn">DB word to vector filename</param>
/// <param name="lang">Language label assigned to every parsed record</param>
/// <param name="with_insert_or_replace">use insert_or_replace when non unique vocabulary</param>
/// <param name="fn_infilter_predicat">optional filter predicate; records for which it returns false are skipped</param>
protected void ProcAppendDb(string ft_vec_fn, string dbf_w2v_fn, LangLabel lang
    , bool with_insert_or_replace = false
    , Func<Dict, bool> fn_infilter_predicat = null)
{
    var fvec = FastTextPath(ft_vec_fn);
    AssertFileExists(fvec, "FastText file of vectors");
    AssertFileExists(dbf_w2v_fn, "word2vect DB");
    using (var dbx = new FastTextProcessDB(dbf_w2v_fn, foreign_keys: false))
    {
        var w2v_tbl = dbx.Dict(DictDbSet.DictKind.Main);
        var trans = dbx.BeginTransaction();
        try
        {
            w2v_tbl.ControlWordsIndex(is_enabled: false);
            using (var sr = new StreamReader(fvec))
            {
                // header: "<samples count> <sample dim>"
                // An empty file yields a null line; treat it as an empty header so the
                // assertion below reports the problem instead of a NullReferenceException.
                var line = sr.ReadLine() ?? string.Empty;
                var harr = line.Split(' ');
                Assert.Equal(2, harr.Length);
                Log($"'{fvec}': {harr[0]} - samples count, {harr[1]} - sample dim.");

                // data: one record per non-empty line
                while ((line = sr.ReadLine()) != null)
                {
                    if (string.IsNullOrEmpty(line))
                    {
                        continue;
                    }
                    var w2v = Dict.CreateParseFT(line, lang);
                    if (fn_infilter_predicat != null && !fn_infilter_predicat(w2v))
                    {
                        continue;
                    }
                    if (with_insert_or_replace)
                    {
                        w2v_tbl.InsertOrReplace(w2v);
                    }
                    else
                    {
                        w2v_tbl.Insert(w2v);
                    }
                }
            }
            Log("ControlWordsIndex create...");
            w2v_tbl.ControlWordsIndex(is_enabled: true);
            Log("Done");
            trans.Commit();
        }
        catch
        {
            // Same error handling as the sibling SubProcFillEmptyVectDict* routines:
            // roll back explicitly and rethrow.
            trans.Rollback();
            throw;
        }
    }
}
/// <summary>
/// Looks a word up directly in the DB file: first in the main dictionary and,
/// only if it is not found there, in the add-in dictionary.
/// </summary>
/// <param name="word">Word to search for</param>
/// <param name="lang">Language of the word</param>
/// <returns>The entry found, or null when neither dictionary has it</returns>
Dict DbFindByWord(string word, LangLabel lang)
{
    using (var dbx = new FastTextProcessDB(_db_file))
    {
        // The right-hand side of ?? runs only when the main dictionary has no match.
        return dbx.Dict(DictDbSet.DictKind.Main).FindByWord(word, lang)
            ?? dbx.Dict(DictDbSet.DictKind.Addin).FindByWord(word, lang);
    }
}
/// <summary>
/// Parses one FastText text line of the form "word v1 v2 ... vn" into a dictionary entry.
/// </summary>
/// <param name="str">Line to parse; the first space-separated token is the word, the rest are vector components</param>
/// <param name="lang">Language label assigned to the entry</param>
/// <returns>A new entry with the word, the vector packed by <c>Float2Byte</c>, and the language</returns>
/// <exception cref="FormatException">A vector component is not a valid number</exception>
public static Dict CreateParseFT(string str, LangLabel lang)
{
    var sarr = str.Trim().Split(' ');
    var farr = new float[sarr.Length - 1];
    for (int inx = 0; inx < farr.Length; inx++)
    {
        // Parse with the invariant culture: the file uses '.' as the decimal separator,
        // and the culture-sensitive overload fails or misreads values under cultures
        // that use ',' (such as ru or uk). The styles match float.Parse(string)'s defaults.
        farr[inx] = float.Parse(
            sarr[inx + 1],
            System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
            System.Globalization.CultureInfo.InvariantCulture);
    }
    var barr = Float2Byte(farr);
    return new Dict
    {
        Word = sarr[0],
        Vect = barr,
        Lang = lang
    };
}
/// <summary>
/// Visits every value of <typeparamref name="TReplEnum"/> and, for each of its
/// <c>ReplaceAttribute</c>s whose language equals the cursor's language, invokes the callback.
/// </summary>
/// <typeparam name="TReplEnum">Enum whose members carry the replace attributes</typeparam>
/// <param name="textCursor">Cursor handed to the callback; its context supplies the language</param>
/// <param name="act_replace">Callback receiving the cursor, the enum value, and the matching attribute</param>
public static void ProcessIterator<TReplEnum>(
    IVReplaceTextCursor textCursor,
    Action<IVReplaceTextCursor, TReplEnum, ReplaceAttribute> act_replace)
{
    LangLabel cursorLang = textCursor.VContext.LangLabel;
    var members = (TReplEnum[])Enum.GetValues(typeof(TReplEnum));

    foreach (TReplEnum member in members)
    {
        var matching =
            from candidate in GetTextAttrib<TReplEnum, ReplaceAttribute>(member)
            where candidate.Lng == cursorLang
            select candidate;

        foreach (var attribute in matching)
        {
            act_replace(textCursor, member, attribute);
        }
    }
}
/// <summary>
/// Creates a placeholder entry (Id = -1) whose vector is all zero bytes.
/// </summary>
/// <param name="word">Word stored in the entry</param>
/// <param name="lang">Language stored in the entry</param>
/// <param name="vect_sz">Vector length in components; the byte buffer holds 4 bytes per component</param>
/// <returns>A new entry with a zero-filled vector buffer</returns>
public static Dict CreateEmpty(string word = "<%NONE%>"
    , LangLabel lang = LangLabel.NA
    , int vect_sz = DEF_VECT_SIZE)
{
    return new Dict
    {
        Id = -1,
        Word = word,
        // A newly allocated array is already zero-initialized, so no explicit clear is needed.
        Vect = new byte[vect_sz * sizeof(float)],
        Lang = lang
    };
}
/// <summary>
/// Resolves a word to its embedding index: the main dictionary is tried first and
/// the add-in dictionary only when the main one has no id for the word.
/// </summary>
/// <param name="word">Word to resolve</param>
/// <param name="lang">Language of the word</param>
/// <param name="dict">Main dictionary set</param>
/// <param name="dict_addins">Add-in dictionary set</param>
/// <param name="embed">Embedding dictionary set</param>
/// <returns>The index returned by <c>GetOrAddEmbed</c></returns>
long WordToInx(string word, LangLabel lang, DictDbSet dict, DictDbSet dict_addins, EmbedDictDbSet embed)
{
    var sourceKind = DictDbSet.DictKind.Main;
    long? wordId = dict.FindIdByWord(word, lang);

    if (!wordId.HasValue)
    {
        // Fall back to the add-in dictionary; the id may still be missing afterwards.
        sourceKind = DictDbSet.DictKind.Addin;
        wordId = dict_addins.FindIdByWord(word, lang);
    }

    return GetOrAddEmbed(embed, wordId, sourceKind, word, lang);
}
/// <summary>
/// Full ru/uk run: processes the source items, fills empty add-in vectors from the
/// FastText model with the language set to ru and then uk, and finally fills what is
/// still empty with random vectors under <c>LangLabel.NA</c>.
/// </summary>
public void ProcRuUkFull()
{
    Log("Process Samples ...");
    ProcSrcItems(ConfRoot.GetSection("DataCyrConnStr").Value, "cs", "");

    // LANG is read after _lang is assigned on each pass (presumably it is backed by _lang).
    foreach (var current in new[] { LangLabel.ru, LangLabel.uk })
    {
        _lang = current;
        SubProcFillEmptyVectDict(FTF_MODEL, DBF_W2V, LANG);
    }

    _lang = LangLabel.NA;
    SubProcFillEmptyVectDictRND(DBF_W2V, LANG);

    // Disabled step, kept from the original:
    // SubProcBuildResultDict(DBF_RESULT, DBF_W2V);
    Log("Done (ProcRuUkFull)");
}
/// <summary>
/// Creates an entry (Id = -1) whose vector components are drawn from the supplied random source.
/// Each component is computed as <c>vv_max - (vv_max - vv_min) * rnd.NextDouble()</c>.
/// </summary>
/// <param name="rnd">Random source</param>
/// <param name="word">Word stored in the entry</param>
/// <param name="lang">Language stored in the entry</param>
/// <param name="vect_sz">Number of vector components</param>
/// <param name="vv_min">Lower bound used for the component range</param>
/// <param name="vv_max">Upper bound used for the component range</param>
/// <returns>A new entry with the generated vector packed by <c>Float2Byte</c></returns>
public static Dict CreateRnd(Random rnd, string word, LangLabel lang
    , int vect_sz = DEF_VECT_SIZE, float vv_min = DEF_VECT_MIN, float vv_max = DEF_VECT_MAX)
{
    var components = new float[vect_sz];
    var span = vv_max - vv_min;

    var pos = 0;
    while (pos < vect_sz)
    {
        components[pos] = (float)(vv_max - span * rnd.NextDouble());
        pos++;
    }

    return new Dict
    {
        Id = -1,
        Word = word,
        Vect = Float2Byte(components),
        Lang = lang
    };
}
/// <summary>
/// Finds the <c>LangLabel</c> whose FastText label equals the given text (after trimming).
/// </summary>
/// <param name="ft_label">FastText label text; surrounding whitespace is ignored</param>
/// <param name="parsedVal">The matching label, or <c>LangLabel.NA</c> when nothing matches</param>
/// <returns>True when a label matched; false for null, blank, or unknown input</returns>
public static bool TryParseFastTextLabel(string ft_label, out LangLabel parsedVal)
{
    parsedVal = LangLabel.NA;
    // Reject blank input up front: trimmed to "" it would otherwise equal the empty
    // string GetFastTextLabel returns for members without a FastText label.
    if (string.IsNullOrWhiteSpace(ft_label))
    {
        return false;
    }

    var val = ft_label.Trim();
    foreach (LangLabel lbl in (LangLabel[])Enum.GetValues(typeof(LangLabel)))
    {
        if (string.Equals(val, lbl.GetFastTextLabel(), StringComparison.Ordinal))
        {
            parsedVal = lbl;
            return true;
        }
    }
    return false;
}
/// <summary>
/// Returns the per-language map of new items, creating the outer container and
/// the language's map on first use. Runs under <c>SetOfNewLock</c>.
/// </summary>
/// <param name="lang">Language whose map is requested</param>
/// <returns>The existing or newly registered map for the language</returns>
Dictionary<string, NewItem> GetSetOfNew(LangLabel lang)
{
    lock (SetOfNewLock)
    {
        if (_setOfNew == null)
        {
            _setOfNew = new Dictionary<LangLabel, Dictionary<string, NewItem>>();
        }

        Dictionary<string, NewItem> perLang;
        if (!_setOfNew.TryGetValue(lang, out perLang))
        {
            perLang = new Dictionary<string, NewItem>();
            _setOfNew[lang] = perLang;
        }
        return perLang;
    }
}
/// <summary>
/// Returns the per-language vector cache. The cache is held through a weak reference;
/// when there is no entry for the language, or its target is no longer available,
/// a fresh dictionary is created and registered. Runs under <c>VectCacheLock</c>.
/// </summary>
/// <param name="lang">Language whose cache is requested</param>
/// <returns>The live cached dictionary, or a newly registered empty one</returns>
ConcurrentDictionary<string, Dict> GetVectDict(LangLabel lang)
{
    lock (VectCacheLock)
    {
        _vectCache = _vectCache ?? new TCache();

        ConcurrentDictionary<string, Dict> alive;
        if (_vectCache.ContainsKey(lang) && _vectCache[lang].TryGetTarget(out alive))
        {
            return alive;
        }

        var fresh = new ConcurrentDictionary<string, Dict>();
        _vectCache[lang] = new WeakReference<ConcurrentDictionary<string, Dict>>(fresh);
        return fresh;
    }
}
/// <summary>
/// Scans every main-dictionary entry of a language and logs the entry count together
/// with the smallest and largest vector component seen.
/// </summary>
/// <param name="w2v_db_fn">Word-to-vector DB filename</param>
/// <param name="lang">Language whose entries are scanned</param>
void CalcMinMax(string w2v_db_fn, LangLabel lang)
{
    using (var dbx = new FastTextProcessDB(w2v_db_fn))
    {
        // With no entries the log shows the initial sentinels (MaxValue / MinValue).
        float lowest = float.MaxValue;
        float highest = float.MinValue;
        long total = 0;

        foreach (var entry in dbx.Dict(DictDbSet.DictKind.Main).GetAll(lang))
        {
            foreach (var component in Dict.GetVectFloat(entry.Vect))
            {
                lowest = Math.Min(lowest, component);
                highest = Math.Max(highest, component);
            }
            total++;
        }

        Log($"Lang={lang}: Count={total}; Min={lowest}; Max={highest};");
    }
}
/// <summary>
/// Finds the entry for a word, consulting the per-language cache first and the DB on a miss.
/// The DB result is stored in the cache even when it is null, so misses are cached as well.
/// </summary>
/// <param name="word">Word to look up</param>
/// <param name="lang">Language of the word</param>
/// <returns>The cached or freshly loaded entry; null when the DB lookup returned null</returns>
public Dict FindByWord(string word, LangLabel lang)
{
    var vect_dict = GetVectDict(lang);

    // One TryGetValue replaces ContainsKey + TryGetValue: a single lookup, and no gap
    // between the check and the read that could end in a spurious exception.
    Dict w2v;
    if (vect_dict.TryGetValue(word, out w2v))
    {
        return w2v;
    }

    w2v = DbFindByWord(word, lang);
    // Best effort: if another thread cached the word meanwhile, its entry is kept.
    vect_dict.TryAdd(word, w2v);
    return w2v;
}
/// <summary>
/// Fill Empty Add-in Dictionary Vectors
/// </summary>
/// <param name="ft_bin_fn">FastText bin model filename</param>
/// <param name="dbf_w2v_fn">DB word to vector filename</param>
/// <param name="lang">Language label assigned to every vector parsed from the FastText output</param>
protected void SubProcFillEmptyVectDict(
    string ft_bin_fn, string dbf_w2v_fn, LangLabel lang)
{
    using (var dbx = new FastTextProcessDB(dbf_w2v_fn))
    {
        // Materialize once: the sequence is tested for emptiness and then iterated while
        // the same dictionary is being updated. If GetWordsWithEmptyVect is lazy (TODO confirm),
        // the original enumerated it twice and read it while writing.
        var words = dbx.Dict(DictDbSet.DictKind.Addin).GetWordsWithEmptyVect().ToList();
        if (words.Count == 0)
        {
            return;
        }

        var fmod = FastTextPath(ft_bin_fn);
        AssertFileExists(fmod, "FastText model file");
        var fexe = FastTextBin;
        AssertFileExists(fexe, "FastText executable");

        var trans = dbx.BeginTransaction();
        try
        {
            // The dictionary is requested again after the transaction has started, as in the original.
            var dict = dbx.Dict(DictDbSet.DictKind.Addin);
            using (var ftl = FTCmd.CreateW2V(fexe, fmod))
            {
                // Register the per-line handler before any word is pushed.
                // NOTE(review): presumably disposing ftl waits for pending output before
                // the commit below - confirm against FTCmd.
                ftl.RunByLineAsync(
                    (txt_src, res_txt) => dict.UpdateVectOfWord(Dict.CreateParseFT(res_txt, lang))
                );
                foreach (var w in words)
                {
                    ftl.Push(w);
                }
            }
            trans.Commit();
        }
        catch
        {
            // Undo partial updates and let the caller see the original failure.
            trans.Rollback();
            throw;
        }
    }
}
/// <summary>
/// Lazily streams every row of the table stored with the given language.
/// The query runs when enumeration starts; the command and reader are released when
/// enumeration completes or the enumerator is disposed.
/// </summary>
/// <param name="lang">Language whose rows are returned</param>
/// <returns>A deferred sequence of entries, one per row</returns>
public IEnumerable<Dict> GetAll(LangLabel lang)
{
    var sql = string.Format(
        "SELECT {1}, {2}, {3}, {4} FROM {0} WHERE {4} = ${4}",
        TableName
        , Dict.FldnId, Dict.FldnWord, Dict.FldnVect, Dict.FldnLangId);
    // Dispose the command as well as the reader; the original left the command undisposed.
    using (var cmd = Ctx.CreateCmd(sql))
    {
        cmd.Parameters.AddWithValue(Dict.FldnLangId, (int)lang);
        using (var rd = cmd.ExecuteReader())
        {
            while (rd.Read())
            {
                yield return new Dict
                {
                    Id = rd.GetInt64(0),
                    Word = rd.GetString(1),
                    Vect = (byte[])rd[2],
                    Lang = (LangLabel)rd.GetInt32(3)
                };
            }
        }
    }
}
/// <summary>
/// Purpose: Grabs language label information based on language label code
/// Accepts: String
/// Returns: Hashtable with keys "langlabelcode", "value" and "modified";
///          empty when no label has the code or when an error was logged
/// </summary>
public Hashtable GetLangLabelByCode(string code)
{
    Hashtable hsh = new Hashtable();
    try
    {
        // Dispose the context when done; the original never released it.
        using (QuickStart_DBEntities dbContext = new QuickStart_DBEntities())
        {
            LangLabel obj = dbContext.LangLabels.FirstOrDefault(l => l.LangLabelCode == code);
            if (obj != null)
            {
                hsh["langlabelcode"] = obj.LangLabelCode;
                hsh["value"] = obj.Value;
                hsh["modified"] = obj.Modified;
            }
        }
    }
    catch (Exception ex)
    {
        // Errors are logged and an empty (or partially filled) table is returned.
        ErrorLoggerData.ErrorRoutine(ex, "LangLabelData", "GetLangLabelByCode");
    }
    return hsh;
}
/// <summary>
/// Creates an item from a text and its language.
/// </summary>
/// <param name="text">Value stored in <c>Text</c></param>
/// <param name="lang">Value stored in <c>Lang</c></param>
public PreprocessItem(string text, LangLabel lang)
{
    this.Text = text;
    this.Lang = lang;
}
/// <summary>
/// Builds the render object from a language label, converting its code and value to
/// strings and its modification stamp to a DateTime.
/// </summary>
/// <param name="l">Source language label</param>
public RenderLangLabel(LangLabel l)
{
    this.LangLabelCode = Convert.ToString(l.LangLabelCode);
    this.Value = Convert.ToString(l.Value);
    this.Modified = Convert.ToDateTime(l.Modified);
}
/// <summary>
/// Purpose: Grabs all language labels from the data layer and copies them into
///          business-layer objects
/// Accepts: Nothing
/// Returns: List of LangLabel; the labels copied before any error when a failure was logged
/// </summary>
public List<LangLabel> GetAllLangLabels()
{
    List<LangLabel> result = new List<LangLabel>();
    try
    {
        List<QSRDataObjects.LangLabel> rows = new LangLabelData().GetAllLangLabels();
        foreach (QSRDataObjects.LangLabel row in rows)
        {
            result.Add(new LangLabel
            {
                LangLabelCode = row.LangLabelCode,
                Value = row.Value,
                Modified = row.Modified
            });
        }
    }
    catch (Exception ex)
    {
        ErrorRoutine(ex, "LangLabel", "GetAllLangLabels");
    }
    return result;
}
/// <summary>
/// Rebinds the grid to the table chosen in the drop-down list: loads all objects of
/// the selected kind, wraps each in its Render* class, and binds the resulting list.
/// An unrecognized selection leaves the data source untouched; DataBind is always called.
/// </summary>
private void reloadGridView()
{
    // One switch replaces the original if/else-if ladder, which was split into two
    // chains by a missing "else" before the "Categories" branch.
    switch (DropDownListDBTables.SelectedValue)
    {
        case "Administrators":
            GridViewDBTable.DataSource = new Administrator().GetAllAdministrators(false)
                .ConvertAll(x => new RenderAdministrator(x));
            break;
        case "Audits":
            GridViewDBTable.DataSource = new Audit().GetAllAudits()
                .ConvertAll(x => new RenderAudit(x));
            break;
        case "AuditTypes":
            GridViewDBTable.DataSource = new AuditType().GetAllAuditTypes()
                .ConvertAll(x => new RenderAuditType(x));
            break;
        case "Categories":
            GridViewDBTable.DataSource = new Category().GetAllCategories(false)
                .ConvertAll(x => new RenderCategory(x));
            break;
        case "Configurations":
            GridViewDBTable.DataSource = new Configuration().GetAllConfigurations()
                .ConvertAll(x => new RenderConfiguration(x));
            break;
        case "DeliveryType":
            GridViewDBTable.DataSource = new DeliveryType().GetAllDeliveryTypes()
                .ConvertAll(x => new RenderDeliveryType(x));
            break;
        case "LangLabels":
            GridViewDBTable.DataSource = new LangLabel().GetAllLangLabels()
                .ConvertAll(x => new RenderLangLabel(x));
            break;
        case "OrderItems":
            GridViewDBTable.DataSource = new OrderItem().GetAllOrderItems()
                .ConvertAll(x => new RenderOrderItem(x));
            break;
        case "Orders":
            GridViewDBTable.DataSource = new Order().GetAllOrders()
                .ConvertAll(x => new RenderOrder(x));
            break;
        case "ProductDeliveryTypes":
            GridViewDBTable.DataSource = new ProductDeliveryType().GetAllProductDeliveryTypes()
                .ConvertAll(x => new RenderProductDeliveryType(x));
            break;
        case "Products":
            GridViewDBTable.DataSource = new Product().GetAllProducts(false)
                .ConvertAll(x => new RenderProduct(x));
            break;
        case "StatesProvinces":
            GridViewDBTable.DataSource = new StateProvince().GetAllStatesProvinces()
                .ConvertAll(x => new RenderStateProvince(x));
            break;
        case "Users":
            GridViewDBTable.DataSource = new User().GetAllUsers()
                .ConvertAll(x => new RenderUser(x));
            break;
    }

    //Databind the new datasource obtained above
    GridViewDBTable.DataBind();
}
/// <summary>
/// Returns the declared name of the enum member.
/// </summary>
/// <param name="enumVal">Language label</param>
/// <returns>The member name, or null when the value has no named member</returns>
public static string GetStdLangLabel(this LangLabel enumVal)
    => Enum.GetName(typeof(LangLabel), enumVal);
/// <summary>
/// Returns the embedding index for a word and records its use.
/// When the dictionary id resolves to an existing embedding index, that index's
/// <c>FreqAdd</c> counter in <c>SetOfDirty</c> is incremented. Otherwise the word is
/// tracked in the language's set of new items: a word seen for the first time gets the
/// next embedding index, and its <c>Freq</c> counter is incremented on every call.
/// </summary>
/// <param name="embed">Embedding dictionary set</param>
/// <param name="dict_id">Dictionary id of the word, if one was found</param>
/// <param name="dict_kind">Dictionary the id belongs to</param>
/// <param name="word">The word itself, the key in the set of new items</param>
/// <param name="lang">Language that selects the set of new items</param>
/// <returns>The existing or newly assigned embedding index</returns>
long GetOrAddEmbed(EmbedDictDbSet embed
    , long? dict_id, DictDbSet.DictKind dict_kind, string word, LangLabel lang)
{
    long? inx = dict_id.HasValue
        ? embed.FindInxById(dict_id.Value, dict_kind)
        : null;

    if (inx.HasValue)
    {
        // Known embedding: bump its pending frequency increment.
        lock (SetOfDirtyLock)
        {
            ExistingItem dirty;
            if (!SetOfDirty.TryGetValue(inx.Value, out dirty))
            {
                dirty = new ExistingItem { FreqAdd = 0 };
            }
            dirty.FreqAdd++;
            SetOfDirty[inx.Value] = dirty;
        }
        return inx.Value;
    }

    // No embedding yet: find or register the word among the new items.
    lock (SetOfNewLock)
    {
        var pending = GetSetOfNew(lang);
        NewItem fresh;
        if (!pending.TryGetValue(word, out fresh))
        {
            fresh = new NewItem { DictKind = dict_kind, DictId = dict_id, Freq = 0 };
            fresh.Inx = GetNextEmbedInx(embed);
        }
        inx = fresh.Inx;
        fresh.Freq++;
        pending[word] = fresh;
    }
    return inx.Value;
}