//Load all records from the training corpus and build their feature caches.
//Sentences that fail feature generation are logged and skipped; the returned
//array keeps the full corpus size, so trailing slots may remain null.
public EncoderTagger[] ReadAllRecords()
{
    var taggers = new EncoderTagger[trainCorpusList.Count];
    var filledCount = 0;
    //Generate each record's features in parallel
    Parallel.For(0, trainCorpusList.Count, parallelOption, i =>
    {
        var tagger = new EncoderTagger(this);
        if (!tagger.GenerateFeature(trainCorpusList[i]))
        {
            Logger.WriteLine("Load a training sentence failed, skip it.");
            return;
        }
        //Reserve a unique slot so concurrent writers never collide
        var slot = Interlocked.Increment(ref filledCount) - 1;
        taggers[slot] = tagger;
        if (slot % 10000 == 0)
        {
            //Show current progress on console
            Console.Write("{0}...", slot);
        }
    });
    //The corpus text is no longer needed once features are cached; release it
    trainCorpusList.Clear();
    trainCorpusList = null;
    Console.WriteLine();
    return taggers;
}
//Get feature id from feature set by feature string
//If feature string is not existed in the set, generate a new id and return it
public bool BuildFeatures(EncoderTagger tagger)
{
    var featureIds = new List<long>();
    using (var pooled = _buildersPool.GetOrCreate())
    {
        var builder = pooled.Item;
        //Unigram features: one id bundle cached per token position
        for (var pos = 0; pos < tagger.word_num; ++pos)
        {
            foreach (var templ in unigram_templs_)
            {
                var expanded = apply_rule(templ, pos, builder, tagger);
                if (expanded == null)
                {
                    Logger.WriteLine(Logger.Level.err, " format error: " + templ);
                }
                else
                {
                    featureIds.Add(featureLexicalDict.GetOrAddId(expanded.ToString()));
                }
            }
            tagger.feature_cache_.Add(featureIds.ToArray());
            featureIds.Clear();
        }
        //Bigram features: start at position 1 since each spans (pos-1, pos)
        for (var pos = 1; pos < tagger.word_num; ++pos)
        {
            foreach (var templ in bigram_templs_)
            {
                var expanded = apply_rule(templ, pos, builder, tagger);
                if (expanded == null)
                {
                    Logger.WriteLine(Logger.Level.err, " format error: " + templ);
                }
                else
                {
                    featureIds.Add(featureLexicalDict.GetOrAddId(expanded.ToString()));
                }
            }
            tagger.feature_cache_.Add(featureIds.ToArray());
            featureIds.Clear();
        }
    }
    return true;
}
//Get feature id from feature set by feature string
//If feature string is not existed in the set, generate a new id and return it
//Caches one feature-id array per token position (unigram), then one per
//position pair starting at index 1 (bigram), into tagger.feature_cache_.
public bool BuildFeatures(EncoderTagger tagger)
{
    var feature = new List<long>();
    //Unigram features
    for (var cur = 0; cur < tagger.word_num; ++cur)
    {
        for (int index = 0; index < unigram_templs_.Count; index++)
        {
            var it = unigram_templs_[index];
            var strFeature = apply_rule(it, cur, tagger);
            if (strFeature == "")
            {
                //BUGFIX: previously this logged the error but still registered
                //an empty-string feature id; skip malformed templates instead
                Console.WriteLine(" format error: " + it);
                continue;
            }
            var id = featureLexicalDict.GetOrAddId(strFeature);
            feature.Add(id);
        }
        tagger.feature_cache_.Add(feature.ToArray());
        feature.Clear();
    }
    //Bigram features: start at 1 since each spans positions (cur-1, cur)
    for (var cur = 1; cur < tagger.word_num; ++cur)
    {
        for (int index = 0; index < bigram_templs_.Count; index++)
        {
            var it = bigram_templs_[index];
            var strFeature = apply_rule(it, cur, tagger);
            if (strFeature == "")
            {
                //BUGFIX: skip malformed templates (see unigram loop above)
                Console.WriteLine(" format error: " + it);
                continue;
            }
            var id = featureLexicalDict.GetOrAddId(strFeature);
            feature.Add(id);
        }
        tagger.feature_cache_.Add(feature.ToArray());
        feature.Clear();
    }
    return true;
}
//Get feature id from feature set by feature string
//If feature string is not existed in the set, generate a new id and return it
//Caches one feature-id array per token position (unigram), then one per
//position pair starting at index 1 (bigram), into tagger.feature_cache_.
public bool BuildFeatures(EncoderTagger tagger)
{
    List<long> feature = new List<long>();
    //Unigram features
    for (int cur = 0; cur < tagger.word_num; ++cur)
    {
        foreach (string it in unigram_templs_)
        {
            string strFeature = apply_rule(it, cur, tagger);
            if (strFeature == "")
            {
                //BUGFIX: previously this logged the error but still registered
                //an empty-string feature id; skip malformed templates instead
                Console.WriteLine(" format error: " + it);
                continue;
            }
            long id = featureLexicalDict.GetOrAddId(strFeature);
            feature.Add(id);
        }
        tagger.feature_cache_.Add(feature.ToArray());
        feature.Clear();
    }
    //Bigram features: start at 1 since each spans positions (cur-1, cur)
    for (int cur = 1; cur < tagger.word_num; ++cur)
    {
        foreach (string it in bigram_templs_)
        {
            string strFeature = apply_rule(it, cur, tagger);
            if (strFeature == "")
            {
                //BUGFIX: skip malformed templates (see unigram loop above)
                Console.WriteLine(" format error: " + it);
                continue;
            }
            long id = featureLexicalDict.GetOrAddId(strFeature);
            feature.Add(id);
        }
        tagger.feature_cache_.Add(feature.ToArray());
        feature.Clear();
    }
    return true;
}
//Load all records and generate features
//Returns an array sized to the full corpus; sentences that fail feature
//generation are skipped, so trailing slots may remain null —
//NOTE(review): confirm downstream callers tolerate null entries.
public EncoderTagger[] ReadAllRecords()
{
    EncoderTagger[] arrayEncoderTagger = new EncoderTagger[trainCorpusList.Count];
    //Count of successfully built taggers; doubles as the next free slot index
    int arrayEncoderTaggerSize = 0;
    //Generate each record features
#if NO_SUPPORT_PARALLEL_LIB
    for (int i = 0;i < trainCorpusList.Count;i++)
#else
    Parallel.For(0, trainCorpusList.Count, parallelOption, i =>
#endif
    {
        EncoderTagger _x = new EncoderTagger(this);
        if (_x.GenerateFeature(trainCorpusList[i]) == false)
        {
            Console.WriteLine("Load a training sentence failed, skip it.");
        }
        else
        {
            //Interlocked.Increment hands each success a unique slot, so the
            //parallel writes below never collide
            int oldValue = Interlocked.Increment(ref arrayEncoderTaggerSize) - 1;
            arrayEncoderTagger[oldValue] = _x;
            if (oldValue % 10000 == 0)
            {
                //Show current progress on console
                Console.Write("{0}...", oldValue);
            }
        }
    }
#if NO_SUPPORT_PARALLEL_LIB
#else
    );
#endif
    //The corpus is no longer needed once features are cached; release it
    trainCorpusList.Clear();
    trainCorpusList = null;
    Console.WriteLine();
    return arrayEncoderTagger;
}
//Regenerate feature id and shrink features with lower frequency
//Drops features occurring fewer than 'freq' times, compacts the remaining
//ids, and rewrites every cached feature array in xList to the new ids.
public void Shrink(EncoderTagger[] xList, int freq)
{
    //Map from old feature id to compacted new id after low-frequency drop
    var old2new = new BTreeDictionary<long, long>();
    featureLexicalDict.Shrink(freq);
    maxid_ = featureLexicalDict.RegenerateFeatureId(old2new, y_.Count);
    var feature_count = xList.Length;
    //Update feature ids
#if NO_SUPPORT_PARALLEL_LIB
    //BUGFIX: iterate the tagger array bound (feature_count) like the parallel
    //branch below — the loop body indexes xList[i], not feature_cache_
    for (int i = 0; i < feature_count; i++)
#else
    Parallel.For(0, feature_count, parallelOption, i =>
#endif
    {
        for (var j = 0; j < xList[i].feature_cache_.Count; j++)
        {
            var newfs = new List<long>();
            long rstValue = 0;
            for (int index = 0; index < xList[i].feature_cache_[j].Length; index++)
            {
                var v = xList[i].feature_cache_[j][index];
                //Features removed by Shrink() have no mapping and are dropped
                if (old2new.TryGetValue(v, out rstValue) == true)
                {
                    newfs.Add(rstValue);
                }
            }
            xList[i].feature_cache_[j] = newfs.ToArray();
        }
    }
#if NO_SUPPORT_PARALLEL_LIB
#else
    );
#endif
    Console.WriteLine("Feature size in total : {0}", maxid_);
}
//Load all records and generate features
//Returns an array sized to the full corpus; sentences that fail feature
//generation are skipped, so trailing slots may remain null —
//NOTE(review): confirm downstream callers tolerate null entries.
public EncoderTagger[] ReadAllRecords()
{
    var arrayEncoderTagger = new EncoderTagger[trainCorpusList.Count];
    //Count of successfully built taggers; doubles as the next free slot index
    var arrayEncoderTaggerSize = 0;
    //Generate each record features
#if NO_SUPPORT_PARALLEL_LIB
    for (int i = 0;i < trainCorpusList.Count;i++)
#else
    Parallel.For(0, trainCorpusList.Count, parallelOption, i =>
#endif
    {
        var _x = new EncoderTagger(this);
        if (_x.GenerateFeature(trainCorpusList[i]) == false)
        {
            Console.WriteLine("Load a training sentence failed, skip it.");
        }
        else
        {
            //Interlocked.Increment hands each success a unique slot, so the
            //parallel writes below never collide
            var oldValue = Interlocked.Increment(ref arrayEncoderTaggerSize) - 1;
            arrayEncoderTagger[oldValue] = _x;
            if (oldValue % 10000 == 0)
            {
                //Show current progress on console
                Console.Write("{0}...", oldValue);
            }
        }
    }
#if NO_SUPPORT_PARALLEL_LIB
#else
    );
#endif
    //The corpus is no longer needed once features are cached; release it
    trainCorpusList.Clear();
    trainCorpusList = null;
    Console.WriteLine();
    return arrayEncoderTagger;
}
//Trains the CRF model iteratively with multi-threaded gradient computation
//plus L-BFGS weight optimization.
//x: feature-encoded training sentences; modelWritter: feature/weight store;
//orthant: true selects L1 regularization, false selects L2;
//args: thread count, iteration limit, convergence threshold, debug level.
//Returns false when the L-BFGS optimizer reports failure (iret <= 0).
bool runCRF(EncoderTagger[] x, ModelWritter modelWritter, bool orthant, EncoderArgs args)
{
    var old_obj = double.MaxValue;
    var converge = 0;
    var lbfgs = new LBFGS(args.threads_num);
    //Index 0 is unused: feature ids are 1-based (see the k = 1 loops below)
    lbfgs.expected = new double[modelWritter.feature_size() + 1];
    var processList = new List<CRFEncoderThread>();
#if NO_SUPPORT_PARALLEL_LIB
#else
    var parallelOption = new ParallelOptions();
    parallelOption.MaxDegreeOfParallelism = args.threads_num;
#endif
    //Initialize encoding threads
    for (var i = 0; i < args.threads_num; i++)
    {
        var thread = new CRFEncoderThread();
        thread.start_i = i;
        thread.thread_num = args.threads_num;
        thread.x = x;
        thread.lbfgs = lbfgs;
        thread.Init();
        processList.Add(thread);
    }
    //Statistic term and result tags frequency
    var termNum = 0;
    int[] yfreq;
    yfreq = new int[modelWritter.y_.Count];
    for (int index = 0; index < x.Length; index++)
    {
        var tagger = x[index];
        termNum += tagger.word_num;
        for (var j = 0; j < tagger.word_num; j++)
        {
            yfreq[tagger.answer_[j]]++;
        }
    }
    //Iterative training
    var startDT = DateTime.Now;
    var dMinErrRecord = 1.0;
    for (var itr = 0; itr < args.max_iter; ++itr)
    {
        //Clear result container
        lbfgs.obj = 0.0f;
        lbfgs.err = 0;
        lbfgs.zeroone = 0;
        Array.Clear(lbfgs.expected, 0, lbfgs.expected.Length);
        //Launch one worker per configured thread; each computes objective,
        //gradient and error counts over its shard of sentences
        var threadList = new List<Thread>();
        for (var i = 0; i < args.threads_num; i++)
        {
            var thread = new Thread(processList[i].Run);
            thread.Start();
            threadList.Add(thread);
        }
        //Tag-by-tag error counts merged from all workers
        int[,] merr;
        merr = new int[modelWritter.y_.Count, modelWritter.y_.Count];
        for (var i = 0; i < args.threads_num; ++i)
        {
            threadList[i].Join();
            lbfgs.obj += processList[i].obj;
            lbfgs.err += processList[i].err;
            lbfgs.zeroone += processList[i].zeroone;
            //Calculate error
            for (var j = 0; j < modelWritter.y_.Count; j++)
            {
                for (var k = 0; k < modelWritter.y_.Count; k++)
                {
                    merr[j, k] += processList[i].merr[j, k];
                }
            }
        }
        long num_nonzero = 0;
        var fsize = modelWritter.feature_size();
        var alpha = modelWritter.alpha_;
        if (orthant == true)
        {
            //L1 regularization: add |alpha|/cost_factor to the objective and
            //count surviving non-zero weights
#if NO_SUPPORT_PARALLEL_LIB
            for (long k = 1; k < fsize + 1; k++)
            {
                lbfgs.obj += Math.Abs(alpha[k] / modelWritter.cost_factor_);
                if (alpha[k] != 0.0)
                {
                    num_nonzero++;
                }
            }
#else
            Parallel.For<double>(1, fsize + 1, parallelOption, () => 0,
                (k, loop, subtotal) =>
                {
                    subtotal += Math.Abs(alpha[k] / modelWritter.cost_factor_);
                    if (alpha[k] != 0.0)
                    {
                        Interlocked.Increment(ref num_nonzero);
                    }
                    return subtotal;
                },
                (subtotal) => // lock free accumulator
                {
                    //CAS retry loop folds each thread's subtotal into the
                    //shared objective without taking a lock
                    double initialValue;
                    double newValue;
                    do
                    {
                        initialValue = lbfgs.obj; // read current value
                        newValue = initialValue + subtotal; //calculate new value
                    } while (initialValue != Interlocked.CompareExchange(ref lbfgs.obj, newValue, initialValue));
                }
            );
#endif
        }
        else
        {
            //L2 regularization: add alpha^2/(2*cost_factor) to the objective
            //and alpha/cost_factor to the gradient; all weights count as non-zero
            num_nonzero = fsize;
#if NO_SUPPORT_PARALLEL_LIB
            for (long k = 1; k < fsize + 1; k++)
            {
                lbfgs.obj += (alpha[k] * alpha[k] / (2.0 * modelWritter.cost_factor_));
                lbfgs.expected[k] += (alpha[k] / modelWritter.cost_factor_);
            }
#else
            Parallel.For<double>(1, fsize + 1, parallelOption, () => 0,
                (k, loop, subtotal) =>
                {
                    subtotal += (alpha[k] * alpha[k] / (2.0 * modelWritter.cost_factor_));
                    //No synchronization needed: each index k is visited once
                    lbfgs.expected[k] += (alpha[k] / modelWritter.cost_factor_);
                    return subtotal;
                },
                (subtotal) => // lock free accumulator
                {
                    double initialValue;
                    double newValue;
                    do
                    {
                        initialValue = lbfgs.obj; // read current value
                        newValue = initialValue + subtotal; //calculate new value
                    } while (initialValue != Interlocked.CompareExchange(ref lbfgs.obj, newValue, initialValue));
                }
            );
#endif
        }
        //Show each iteration result
        //Relative objective change; first iteration has no baseline
        var diff = (itr == 0 ? 1.0f : Math.Abs(old_obj - lbfgs.obj) / old_obj);
        old_obj = lbfgs.obj;
        ShowEvaluation(x.Length, modelWritter, lbfgs, termNum, itr, merr, yfreq, diff, startDT, num_nonzero, args);
        //Stop once the objective has stabilized for 3 consecutive iterations
        if (diff < args.min_diff)
        {
            converge++;
        }
        else
        {
            converge = 0;
        }
        //NOTE(review): itr > args.max_iter can never hold inside this loop
        //(the for-condition is itr < args.max_iter); only converge == 3 fires
        if (itr > args.max_iter || converge == 3)
        {
            break; // 3 is ad-hoc
        }
        //Debug mode: snapshot weights whenever sentence error hits a new low
        if (args.debugLevel > 0 && (double)lbfgs.zeroone / (double)x.Length < dMinErrRecord)
        {
            var cc = Console.ForegroundColor;
            Console.ForegroundColor = ConsoleColor.Red;
            Console.Write("[Debug Mode] ");
            Console.ForegroundColor = cc;
            Console.Write("Saving intermediate feature weights at current directory...");
            //Save current best feature weight into file
            dMinErrRecord = (double)lbfgs.zeroone / (double)x.Length;
            modelWritter.SaveFeatureWeight("feature_weight_tmp");
            Console.WriteLine("Done.");
        }
        int iret;
        iret = lbfgs.optimize(alpha, modelWritter.cost_factor_, orthant);
        if (iret <= 0)
        {
            return false;
        }
    }
    return true;
}