public void BuildIndexedFeatureIntoFile(string strFileName, List<string> features)
{
    //Assign an id for each feature
    var feature2Id = new SortedDictionary<string, int>(StringComparer.Ordinal);
    var maxId = 0;
    foreach (var strFeature in features)
    {
        if (strFeature.StartsWith("U") == false)
        {
            Logger.WriteLine("Invalidated feature: {0}", strFeature);
            continue;
        }
        feature2Id.Add(strFeature, maxId);
        maxId++;
    }

    var da = new DoubleArrayTrieBuilder(4);
    da.build(feature2Id);
    da.save(strFileName + ".dart");

    var swTemplate = new StreamWriter(strFileName + ".template");
    swTemplate.WriteLine("MaxTemplateFeatureId:{0}", maxId);
    foreach (var strTemplate in m_Templates)
    {
        swTemplate.WriteLine(strTemplate);
    }
    swTemplate.Close();
}
//Build double array trie-tree from text file
//strTextFileName: raw text file name used to build the DA trie-tree
//  text file format: key \t value
//  key is a string
//  value is a non-negative integer
//strDAFileName: double array trie-tree binary file name built from strTextFileName
private static void Build(string strTextFileName, string strDAFileName)
{
    StreamReader sr = new StreamReader(strTextFileName);

    //Load raw data from the text file and sort it with an ordinal comparer
    SortedDictionary<string, int> sdict = new SortedDictionary<string, int>(StringComparer.Ordinal);
    Console.WriteLine("Loading key value pairs from raw text file and sort them...");
    while (sr.EndOfStream == false)
    {
        string strLine = sr.ReadLine();
        if (strLine.Length == 0)
        {
            continue;
        }
        string[] items = strLine.Split('\t');
        sdict.Add(items[0], int.Parse(items[1]));
    }
    sr.Close();

    //Test cases for SearchAsKeyPrefix and SearchByPrefix
    sdict.Add("TestSearchPrefix_case0", 1234567);
    sdict.Add("TestSearchPrefix_case01", 2345678);
    sdict.Add("TestSearchPrefix_case012", 3456789);

    DoubleArrayTrieBuilder dab = new DoubleArrayTrieBuilder(4);
    Console.WriteLine("Begin to build double array trie-tree...");
    dab.build(sdict);
    dab.save(strDAFileName);
    Console.WriteLine("Done!");
}
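As a usage illustration only (not part of the original tool): a minimal sketch of driving Build, assuming the helper below is placed in the same class so the private static Build method is accessible and System.IO is imported. The file names and key/value pairs are made up.

private static void RunBuildExample()
{
    //Hypothetical file names, for illustration only
    string rawTextFile = "sample_keys.txt";
    string daFile = "sample_keys.da";

    //Write a tiny raw dictionary in the "key \t value" format described above;
    //keys are arbitrary strings, values are non-negative integers
    File.WriteAllLines(rawTextFile, new[]
    {
        "apple\t0",
        "apply\t1",
        "banana\t2"
    });

    //Build the binary double array trie-tree file from the raw text file
    Build(rawTextFile, daFile);
}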
static void Main(string[] args)
{
    if (args.Length != 2)
    {
        Console.WriteLine("signDict.exe [raw dictionary] [binary dictionary]");
        return;
    }

    DoubleArrayTrieBuilder daBuilder = new DoubleArrayTrieBuilder(4);
    BTreeDictionary<string, int> dict = new BTreeDictionary<string, int>(StringComparer.Ordinal, 128);
    string strLine = null;
    StreamReader sr = new StreamReader(args[0], Encoding.UTF8);
    StreamWriter sw = new StreamWriter(args[1] + ".prob", false, Encoding.UTF8);
    BinaryWriter bw = new BinaryWriter(sw.BaseStream);
    int index = 0;
    while ((strLine = sr.ReadLine()) != null)
    {
        string[] items = strLine.Split('\t');
        string strNGram = items[0].Trim();
        if (dict.ContainsKey(strNGram) == true)
        {
            Console.WriteLine("duplicated line: {0}", strLine);
            continue;
        }
        if (strNGram.Length == 0)
        {
            continue;
        }
        string[] vals = items[1].Split();
        float prob = float.Parse(vals[0]);
        float backoff = float.Parse(vals[1]);

        //Write item into file
        bw.Write(prob);
        bw.Write(backoff);

        dict.Add(strNGram, index);
        index++;
    }
    sr.Close();

    daBuilder.build(dict);
    daBuilder.save(args[1] + ".da");
    bw.Close();
}
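For illustration only: the parsing code above expects one entry per line in the form n-gram, a tab, then probability and back-off weight separated by whitespace. A minimal sketch that generates such a raw dictionary (the file name and numeric values are made up) and the corresponding command line:

using System.IO;
using System.Text;

class MakeSampleRawDict
{
    static void Main()
    {
        //Each line: n-gram \t probability back-off (values here are made up)
        File.WriteAllLines("lm.raw.txt", new[]
        {
            "the\t-1.23 -0.45",
            "the cat\t-2.31 -0.12",
            "cat sat\t-2.87 -0.08"
        }, Encoding.UTF8);

        //Then run: signDict.exe lm.raw.txt lm.bin
        //to produce lm.bin.prob (binary prob/back-off pairs) and lm.bin.da (the trie index)
    }
}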
public void Process(string strModelFileName, string strShrinkedModelFileName, int thread_num_ = 1)
{
    var sr = new StreamReader(strModelFileName);
    string strLine;

    //Read the version number
    strLine = sr.ReadLine();
    var version = uint.Parse(strLine.Split(':')[1].Trim());
    if (version == CRFSharp.Utils.MODEL_TYPE_SHRINKED)
    {
        Console.WriteLine("The input model has been shrinked");
        return;
    }

    //Read cost_factor
    strLine = sr.ReadLine();
    var cost_factor_ = double.Parse(strLine.Split(':')[1].Trim());

    //Read maxid
    strLine = sr.ReadLine();
    var maxid_ = long.Parse(strLine.Split(':')[1].Trim());

    //Read xsize
    strLine = sr.ReadLine();
    var xsize_ = uint.Parse(strLine.Split(':')[1].Trim());

    //Skip the blank line
    strLine = sr.ReadLine();

    //Read the output tag set
    var y_ = new List<string>();
    while (true)
    {
        strLine = sr.ReadLine();
        if (strLine.Length == 0)
        {
            break;
        }
        y_.Add(strLine);
    }

    //Read the unigram and bigram templates
    var unigram_templs_ = new List<string>();
    var bigram_templs_ = new List<string>();
    while (sr.EndOfStream == false)
    {
        strLine = sr.ReadLine();
        if (strLine.Length == 0)
        {
            break;
        }
        if (strLine[0] == 'U')
        {
            unigram_templs_.Add(strLine);
        }
        if (strLine[0] == 'B')
        {
            bigram_templs_.Add(strLine);
        }
    }
    sr.Close();

    //Load all feature alpha (weight) data
    var filename_alpha = strModelFileName + ".alpha";
    var filename_shrink_alpha = strShrinkedModelFileName + ".alpha";
    var sr_alpha = new StreamReader(filename_alpha);
    var br_alpha = new BinaryReader(sr_alpha.BaseStream);
    var sw_alpha = new StreamWriter(filename_shrink_alpha);
    var bw_alpha = new BinaryWriter(sw_alpha.BaseStream);
    long shrinked_alpha_size = 0;

    //Only keep non-zero feature weights and save them into the file as (id, weight) pairs
    var alpha_ = new FixedBigArray<double>(maxid_ + 1, 0);
    for (long i = 0; i < maxid_; i++)
    {
        alpha_[i] = br_alpha.ReadSingle();
        if (alpha_[i] != 0)
        {
            bw_alpha.Write(i);
            bw_alpha.Write((float)alpha_[i]);
            shrinked_alpha_size++;
        }
    }
    br_alpha.Close();
    bw_alpha.Close();

    //Only keep lexical features whose weights are non-zero
    var varValue = new VarBigArray<int>(1024);
    var varFeature = new VarBigArray<string>(1024);
    var feaCnt = 0;
    var filename_feature = strModelFileName + ".feature.raw_text";
    var sr_fea = new StreamReader(filename_feature);
    while (sr_fea.EndOfStream == false)
    {
        strLine = sr_fea.ReadLine();
        var items = strLine.Split('\t');
        var strFeature = items[0];
        var key = int.Parse(items[1]);
        var size = (strFeature[0] == 'U' ? y_.Count : y_.Count * y_.Count);
        var hasAlpha = false;
        for (var i = key; i < key + size; i++)
        {
            if (alpha_[i] != 0)
            {
                hasAlpha = true;
                break;
            }
        }
        if (hasAlpha == true)
        {
            varFeature[feaCnt] = strFeature;
            varValue[feaCnt] = key;
            feaCnt++;
        }
    }
    sr_fea.Close();

    Console.WriteLine("Shrink feature size from {0} to {1}", maxid_, shrinked_alpha_size);
    maxid_ = shrinked_alpha_size;

    //Build the new lexical feature index
    var val = new FixedBigArray<int>(feaCnt, 0);
    var fea = new FixedBigArray<string>(feaCnt, 0);
    for (var i = 0; i < feaCnt; i++)
    {
        fea[i] = varFeature[i];
        val[i] = varValue[i];
    }
    varFeature = null;
    varValue = null;

    var da = new DoubleArrayTrieBuilder(thread_num_);
    if (da.build(fea, val, 0.95) == false)
    {
        Console.WriteLine("Build lexical dictionary failed.");
        return;
    }
    da.save(strShrinkedModelFileName + ".feature");

    var tofs = new StreamWriter(strShrinkedModelFileName);

    // header
    tofs.WriteLine("version: " + CRFSharp.Utils.MODEL_TYPE_SHRINKED);
    tofs.WriteLine("cost-factor: " + cost_factor_);
    tofs.WriteLine("maxid: " + maxid_);
    tofs.WriteLine("xsize: " + xsize_);
    tofs.WriteLine();

    // y
    for (var i = 0; i < y_.Count; ++i)
    {
        tofs.WriteLine(y_[i]);
    }
    tofs.WriteLine();

    // templates
    for (var i = 0; i < unigram_templs_.Count; ++i)
    {
        tofs.WriteLine(unigram_templs_[i]);
    }
    for (var i = 0; i < bigram_templs_.Count; ++i)
    {
        tofs.WriteLine(bigram_templs_[i]);
    }
    tofs.Close();
}
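For reference, the shrunken .alpha file written above is a flat sequence of records, each an 8-byte feature id (long) followed by a 4-byte weight (float). A minimal sketch of reading it back; the file name is illustrative:

using System;
using System.IO;

class DumpShrunkenAlpha
{
    static void Main()
    {
        //Illustrative path; use the .alpha file produced by Process above
        using (var br = new BinaryReader(File.OpenRead("model.shrinked.alpha")))
        {
            //Each record: feature id (Int64) followed by its weight (Single)
            while (br.BaseStream.Position < br.BaseStream.Length)
            {
                long featureId = br.ReadInt64();
                float weight = br.ReadSingle();
                Console.WriteLine("{0}\t{1}", featureId, weight);
            }
        }
    }
}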
//Build feature set into indexed data
public bool BuildFeatureSetIntoIndex(string filename, double max_slot_usage_rate_threshold, int debugLevel, string strRetrainModelFileName)
{
    Console.WriteLine("Building {0} features into index...", featureLexicalDict.Size);
    IList<string> keyList;
    IList<int> valList;
    featureLexicalDict.GenerateLexicalIdList(out keyList, out valList);

    if (debugLevel > 0)
    {
        Console.Write("Debug: Writing raw feature set into file...");
        var filename_featureset_raw_format = filename + ".feature.raw_text";
        var sw = new StreamWriter(filename_featureset_raw_format);

        //Save each feature and its id into the file in raw text format
        for (var i = 0; i < keyList.Count; i++)
        {
            sw.WriteLine("{0}\t{1}", keyList[i], valList[i]);
        }
        sw.Close();
        Console.WriteLine("Done.");
    }

    //Build feature index
    var filename_featureset = filename + ".feature";
    var da = new DoubleArrayTrieBuilder(thread_num_);
    if (da.build(keyList, valList, max_slot_usage_rate_threshold) == false)
    {
        Console.WriteLine("Build lexical dictionary failed.");
        return false;
    }

    //Save indexed feature set into file
    da.save(filename_featureset);

    if (strRetrainModelFileName == null || strRetrainModelFileName.Length == 0)
    {
        //Clean up all data
        featureLexicalDict.Clear();
        featureLexicalDict = null;
        keyList = null;
        valList = null;
        GC.Collect();

        //Create weight matrix
        alpha_ = new double[feature_size() + 1];
    }
    else
    {
        Console.WriteLine();
        Console.WriteLine("Loading the existed model for re-training...");

        //Create weight matrix
        alpha_ = new double[feature_size() + 1];

        var modelReader = new ModelReader();
        modelReader.LoadModel(strRetrainModelFileName);
        if (modelReader.y_.Count == y_.Count)
        {
            for (var i = 0; i < keyList.Count; i++)
            {
                var index = modelReader.get_id(keyList[i]);
                if (index < 0)
                {
                    continue;
                }
                var size = (keyList[i][0] == 'U' ? y_.Count : y_.Count * y_.Count);
                for (var j = 0; j < size; j++)
                {
                    alpha_[valList[i] + j + 1] = modelReader.GetAlpha(index + j);
                }
            }
        }
        else
        {
            Console.WriteLine("The number of tags isn't equal between two models, it cannot be re-trained.");
        }

        //Clean up all data
        featureLexicalDict.Clear();
        featureLexicalDict = null;
        keyList = null;
        valList = null;
        GC.Collect();
    }
    return true;
}