Example #1
        //Load all records and generate features
        public EncoderTagger[] ReadAllRecords()
        {
            var arrayEncoderTagger     = new EncoderTagger[trainCorpusList.Count];
            var arrayEncoderTaggerSize = 0;

            //Generate each record features
            Parallel.For(0, trainCorpusList.Count, parallelOption, i =>
            {
                var _x = new EncoderTagger(this);
                if (_x.GenerateFeature(trainCorpusList[i]) == false)
                {
                    Logger.WriteLine("Load a training sentence failed, skip it.");
                }
                else
                {
                    var oldValue = Interlocked.Increment(ref arrayEncoderTaggerSize) - 1;
                    arrayEncoderTagger[oldValue] = _x;

                    if (oldValue % 10000 == 0)
                    {
                        //Show current progress on console
                        Console.Write("{0}...", oldValue);
                    }
                }
            });

            trainCorpusList.Clear();
            trainCorpusList = null;

            Console.WriteLine();
            return arrayEncoderTagger;
        }
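
The Interlocked.Increment call above is what lets many threads fill one shared array without a lock: each thread atomically bumps the counter and takes the previous value as its private slot, so only successfully loaded sentences claim slots and no holes appear between them. A minimal, self-contained sketch of the same pattern (the demo class and the doubled-input workload are hypothetical, not taken from the code above):

        using System;
        using System.Threading;
        using System.Threading.Tasks;

        class SlotReservationDemo
        {
            static void Main()
            {
                var inputs  = new int[100];            //hypothetical input records
                var results = new int[inputs.Length];  //shared output array
                var size    = 0;                       //slots handed out so far

                Parallel.For(0, inputs.Length, i =>
                {
                    //Increment returns the new count, so subtracting 1 yields
                    //a slot index that no other thread can also receive
                    var slot = Interlocked.Increment(ref size) - 1;
                    results[slot] = inputs[i] * 2;
                });

                //size now equals the number of records actually written
                Console.WriteLine(size);
            }
        }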
Example #2
        //Get the feature id from the feature set by its feature string.
        //If the feature string does not exist in the set, generate a new id and return it.
        public bool BuildFeatures(EncoderTagger tagger)
        {
            var feature = new List<long>();

            using (var v = _buildersPool.GetOrCreate())
            {
                var localBuilder = v.Item;
                //tagger.feature_id_ = tagger.feature_cache_.Count;
                for (var cur = 0; cur < tagger.word_num; ++cur)
                {
                    for (int index = 0; index < unigram_templs_.Count; index++)
                    {
                        var it         = unigram_templs_[index];
                        var strFeature = apply_rule(it, cur, localBuilder, tagger);
                        if (strFeature == null)
                        {
                            Logger.WriteLine(Logger.Level.err, " format error: " + it);
                        }
                        else
                        {
                            var id = featureLexicalDict.GetOrAddId(strFeature.ToString());
                            feature.Add(id);
                        }
                    }
                    tagger.feature_cache_.Add(feature.ToArray());
                    feature.Clear();
                }

                for (var cur = 1; cur < tagger.word_num; ++cur)
                {
                    for (int index = 0; index < bigram_templs_.Count; index++)
                    {
                        var it         = bigram_templs_[index];
                        var strFeature = apply_rule(it, cur, localBuilder, tagger);
                        if (strFeature == null)
                        {
                            Logger.WriteLine(Logger.Level.err, " format error: " + it);
                        }
                        else
                        {
                            var id = featureLexicalDict.GetOrAddId(strFeature.ToString());
                            feature.Add(id);
                        }
                    }

                    tagger.feature_cache_.Add(feature.ToArray());
                    feature.Clear();
                }
            }

            return true;
        }
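
What sets this variant apart from the ones below is the using (_buildersPool.GetOrCreate()) wrapper, which leases a per-thread StringBuilder to apply_rule so feature formatting does not allocate a fresh builder on every call. The pool type itself is not shown; a minimal sketch with the same GetOrCreate()/Item/Dispose shape (the SimplePool name and all of its internals are assumptions inferred from the usage above) could look like this:

        using System;
        using System.Collections.Concurrent;

        //Hypothetical pool matching the GetOrCreate()/Item usage above
        class SimplePool<T> where T : new()
        {
            private readonly ConcurrentBag<T> _bag = new ConcurrentBag<T>();

            public Lease GetOrCreate()
            {
                T item;
                if (_bag.TryTake(out item) == false)
                {
                    item = new T();
                }
                return new Lease(this, item);
            }

            public struct Lease : IDisposable
            {
                private readonly SimplePool<T> _pool;
                public T Item { get; }

                internal Lease(SimplePool<T> pool, T item)
                {
                    _pool = pool;
                    Item = item;
                }

                //Disposing returns the item to the pool, so the using block
                //in BuildFeatures recycles the builder automatically
                public void Dispose()
                {
                    _pool._bag.Add(Item);
                }
            }
        }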
Example #3
        //Get the feature id from the feature set by its feature string.
        //If the feature string does not exist in the set, generate a new id and return it.
        public bool BuildFeatures(EncoderTagger tagger)
        {
            var feature = new List<long>();

            //tagger.feature_id_ = tagger.feature_cache_.Count;
            for (var cur = 0; cur < tagger.word_num; ++cur)
            {
                for (int index = 0; index < unigram_templs_.Count; index++)
                {
                    var it = unigram_templs_[index];
                    var strFeature = apply_rule(it, cur, tagger);
                    if (strFeature == "")
                    {
                        Console.WriteLine(" format error: " + it);
                    }

                    var id = featureLexicalDict.GetOrAddId(strFeature);
                    feature.Add(id);
                }
                tagger.feature_cache_.Add(feature.ToArray());
                feature.Clear();
            }

            for (var cur = 1; cur < tagger.word_num; ++cur)
            {
                for (int index = 0; index < bigram_templs_.Count; index++)
                {
                    var it = bigram_templs_[index];
                    var strFeature = apply_rule(it, cur, tagger);
                    if (strFeature == "")
                    {
                        Console.WriteLine(" format error: " + it);
                    }
                    var id = featureLexicalDict.GetOrAddId(strFeature);
                    feature.Add(id);
                }

                tagger.feature_cache_.Add(feature.ToArray());
                feature.Clear();
            }

            return true;
        }
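
featureLexicalDict.GetOrAddId does what the header comment promises: look the feature string up and mint a fresh id on a miss. Its implementation is not part of this example; a minimal thread-safe sketch of that contract (the FeatureIdDict name and internals are assumptions) might be:

        using System.Collections.Concurrent;
        using System.Threading;

        //Hypothetical stand-in for featureLexicalDict
        class FeatureIdDict
        {
            private readonly ConcurrentDictionary<string, long> _map =
                new ConcurrentDictionary<string, long>();
            private long _nextId = -1;

            //Returns the existing id, or atomically assigns the next free one.
            //Under contention GetOrAdd may run the factory more than once and
            //discard the losing result, so ids may have gaps but never collide.
            public long GetOrAddId(string feature)
            {
                return _map.GetOrAdd(feature, _ => Interlocked.Increment(ref _nextId));
            }
        }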
Example #4
        //Get the feature id from the feature set by its feature string.
        //If the feature string does not exist in the set, generate a new id and return it.
        public bool BuildFeatures(EncoderTagger tagger)
        {
            List<long> feature = new List<long>();

            //tagger.feature_id_ = tagger.feature_cache_.Count;
            for (int cur = 0; cur < tagger.word_num; ++cur)
            {
                foreach (string it in unigram_templs_)
                {
                    string strFeature = apply_rule(it, cur, tagger);
                    if (strFeature == "")
                    {
                        Console.WriteLine(" format error: " + it);
                    }

                    long id = featureLexicalDict.GetOrAddId(strFeature);
                    feature.Add(id);
                }
                tagger.feature_cache_.Add(feature.ToArray());
                feature.Clear();
            }

            for (int cur = 1; cur < tagger.word_num; ++cur)
            {
                foreach (string it in bigram_templs_)
                {
                    string strFeature = apply_rule(it, cur, tagger);
                    if (strFeature == "")
                    {
                        Console.WriteLine(" format error: " + it);
                    }
                    long id = featureLexicalDict.GetOrAddId(strFeature);
                    feature.Add(id);
                }

                tagger.feature_cache_.Add(feature.ToArray());
                feature.Clear();
            }

            return true;
        }
Example #5
        //Load all records and generate features
        public EncoderTagger[] ReadAllRecords()
        {
            EncoderTagger[] arrayEncoderTagger = new EncoderTagger[trainCorpusList.Count];
            int arrayEncoderTaggerSize = 0;

            //Generate each record features
#if NO_SUPPORT_PARALLEL_LIB
            for (int i = 0; i < trainCorpusList.Count; i++)
#else
            Parallel.For(0, trainCorpusList.Count, parallelOption, i =>
#endif
            {
                EncoderTagger _x = new EncoderTagger(this);
                if (_x.GenerateFeature(trainCorpusList[i]) == false)
                {
                    Console.WriteLine("Load a training sentence failed, skip it.");
                }
                else
                {
                    int oldValue = Interlocked.Increment(ref arrayEncoderTaggerSize) - 1;
                    arrayEncoderTagger[oldValue] = _x;

                    if (oldValue % 10000 == 0)
                    {
                        //Show current progress on console
                        Console.Write("{0}...", oldValue);
                    }
                }
            }
#if NO_SUPPORT_PARALLEL_LIB
#else
            );
#endif

            trainCorpusList.Clear();
            trainCorpusList = null;

            Console.WriteLine();
            return arrayEncoderTagger;
        }
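
The #if NO_SUPPORT_PARALLEL_LIB blocks are a compile-time switch: the brace-delimited body that follows is shared, compiling as an ordinary for-loop body on runtimes without the Task Parallel Library and as the lambda passed to Parallel.For everywhere else; only the loop header and the trailing ); differ. A stripped-down sketch of the trick on a trivial workload:

        using System;
#if !NO_SUPPORT_PARALLEL_LIB
        using System.Threading.Tasks;
#endif

        class ConditionalParallelDemo
        {
            static void Main()
            {
#if NO_SUPPORT_PARALLEL_LIB
                for (int i = 0; i < 10; i++)
#else
                Parallel.For(0, 10, i =>
#endif
                {
                    //Shared body: a loop body on old runtimes, a lambda body
                    //when the parallel library is available
                    Console.WriteLine(i);
                }
#if NO_SUPPORT_PARALLEL_LIB
#else
                );
#endif
            }
        }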
Example #6
        //Regenerate feature ids and prune features whose frequency is below freq
        public void Shrink(EncoderTagger[] xList, int freq)
        {
            var old2new = new BTreeDictionary<long, long>();
            featureLexicalDict.Shrink(freq);
            maxid_ = featureLexicalDict.RegenerateFeatureId(old2new, y_.Count);
            var feature_count = xList.Length;

            //Update feature ids
#if NO_SUPPORT_PARALLEL_LIB
            for (int i = 0; i < feature_count; i++)
#else
            Parallel.For(0, feature_count, parallelOption, i =>
#endif
            {
                for (var j = 0; j < xList[i].feature_cache_.Count; j++)
                {
                    var newfs = new List<long>();
                    long rstValue = 0;
                    for (int index = 0; index < xList[i].feature_cache_[j].Length; index++)
                    {
                        var v = xList[i].feature_cache_[j][index];
                        if (old2new.TryGetValue(v, out rstValue) == true)
                        {
                            newfs.Add(rstValue);
                        }
                    }
                    xList[i].feature_cache_[j] = newfs.ToArray();
                }
            }
#if NO_SUPPORT_PARALLEL_LIB
#else
);
#endif

            Console.WriteLine("Feature size in total : {0}", maxid_);
        }
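
Shrink prunes rare features, compacts the surviving ids through old2new so the weight vector stays dense, and then rewrites every cached feature array, silently dropping ids that were pruned. A minimal sketch of the remap-and-rewrite step, with hypothetical frequencies and a plain SortedDictionary standing in for BTreeDictionary:

        using System;
        using System.Collections.Generic;

        class ShrinkDemo
        {
            static void Main()
            {
                //Hypothetical feature frequencies keyed by old feature id
                var freqById = new SortedDictionary<long, int>
                {
                    { 0, 12 }, { 1, 1 }, { 2, 7 }, { 3, 2 }, { 4, 9 }
                };
                var minFreq = 3;

                //Assign compact new ids to surviving features only
                var old2new = new Dictionary<long, long>();
                long nextId = 0;
                foreach (var pair in freqById)
                {
                    if (pair.Value >= minFreq)
                    {
                        old2new[pair.Key] = nextId++;
                    }
                }

                //Rewrite a cached feature array, dropping pruned ids,
                //exactly as the TryGetValue loop in Shrink does
                var cached = new long[] { 0, 1, 3, 4 };
                var kept = new List<long>();
                foreach (var v in cached)
                {
                    long mapped;
                    if (old2new.TryGetValue(v, out mapped))
                    {
                        kept.Add(mapped);
                    }
                }
                Console.WriteLine(string.Join(",", kept)); //prints "0,2"
            }
        }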
Example #7
        //Load all records and generate features
        public EncoderTagger[] ReadAllRecords()
        {
            var arrayEncoderTagger = new EncoderTagger[trainCorpusList.Count];
            var arrayEncoderTaggerSize = 0;

            //Generate each record features
#if NO_SUPPORT_PARALLEL_LIB
            for (int i = 0; i < trainCorpusList.Count; i++)
#else
            Parallel.For(0, trainCorpusList.Count, parallelOption, i =>
#endif
            {
                var _x = new EncoderTagger(this);
                if (_x.GenerateFeature(trainCorpusList[i]) == false)
                {
                    Console.WriteLine("Load a training sentence failed, skip it.");
                }
                else
                {
                    var oldValue = Interlocked.Increment(ref arrayEncoderTaggerSize) - 1;
                    arrayEncoderTagger[oldValue] = _x;

                    if (oldValue % 10000 == 0)
                    {
                        //Show current progress on console
                        Console.Write("{0}...", oldValue);
                    }
                }
            }
#if NO_SUPPORT_PARALLEL_LIB
#else
);
#endif

            trainCorpusList.Clear();
            trainCorpusList = null;

            Console.WriteLine();
            return arrayEncoderTagger;
        }
Example #8
        bool runCRF(EncoderTagger[] x, ModelWritter modelWritter, bool orthant, EncoderArgs args)
        {
            var old_obj = double.MaxValue;
            var converge = 0;
            var lbfgs = new LBFGS(args.threads_num);
            lbfgs.expected = new double[modelWritter.feature_size() + 1];

            var processList = new List<CRFEncoderThread>();

#if NO_SUPPORT_PARALLEL_LIB
#else
            var parallelOption = new ParallelOptions();
            parallelOption.MaxDegreeOfParallelism = args.threads_num;
#endif

            //Initialize encoding threads
            for (var i = 0; i < args.threads_num; i++)
            {
                var thread = new CRFEncoderThread();
                thread.start_i = i;
                thread.thread_num = args.threads_num;
                thread.x = x;
                thread.lbfgs = lbfgs;
                thread.Init();
                processList.Add(thread);
            }

            //Count terms and tally the frequency of each answer tag
            var termNum = 0;
            var yfreq = new int[modelWritter.y_.Count];
            for (int index = 0; index < x.Length; index++)
            {
                var tagger = x[index];
                termNum += tagger.word_num;
                for (var j = 0; j < tagger.word_num; j++)
                {
                    yfreq[tagger.answer_[j]]++;
                }
            }

            //Iterative training
            var startDT = DateTime.Now;
            var dMinErrRecord = 1.0;
            for (var itr = 0; itr < args.max_iter; ++itr)
            {
                //Clear result container
                lbfgs.obj = 0.0f;
                lbfgs.err = 0;
                lbfgs.zeroone = 0;

                Array.Clear(lbfgs.expected, 0, lbfgs.expected.Length);

                var threadList = new List<Thread>();
                for (var i = 0; i < args.threads_num; i++)
                {
                    var thread = new Thread(processList[i].Run);
                    thread.Start();
                    threadList.Add(thread);
                }

                var merr = new int[modelWritter.y_.Count, modelWritter.y_.Count];
                for (var i = 0; i < args.threads_num; ++i)
                {
                    threadList[i].Join();
                    lbfgs.obj += processList[i].obj;
                    lbfgs.err += processList[i].err;
                    lbfgs.zeroone += processList[i].zeroone;

                    //Calculate error
                    for (var j = 0; j < modelWritter.y_.Count; j++)
                    {
                        for (var k = 0; k < modelWritter.y_.Count; k++)
                        {
                            merr[j, k] += processList[i].merr[j, k];
                        }
                    }
                }

                long num_nonzero = 0;
                var fsize = modelWritter.feature_size();
                var alpha = modelWritter.alpha_;
                if (orthant == true)
                {
                    //L1 regularization
#if NO_SUPPORT_PARALLEL_LIB
                    for (long k = 1; k < fsize + 1; k++)
                    {
                        lbfgs.obj += Math.Abs(alpha[k] / modelWritter.cost_factor_);
                        if (alpha[k] != 0.0)
                        {
                            num_nonzero++;
                        }
                    }
#else
                    Parallel.For<double>(1, fsize + 1, parallelOption, () => 0, (k, loop, subtotal) =>
                    {
                        subtotal += Math.Abs(alpha[k] / modelWritter.cost_factor_);
                        if (alpha[k] != 0.0)
                        {
                            Interlocked.Increment(ref num_nonzero);
                        }
                        return subtotal;
                    },
                    (subtotal) => //lock-free accumulator
                    {
                        double initialValue;
                        double newValue;
                        do
                        {
                            initialValue = lbfgs.obj;           //read current value
                            newValue = initialValue + subtotal; //calculate new value
                        }
                        while (initialValue != Interlocked.CompareExchange(ref lbfgs.obj, newValue, initialValue));
                    });
#endif
                }
                else
                {
                    //L2 regularization
                    num_nonzero = fsize;

#if NO_SUPPORT_PARALLEL_LIB
                    for (long k = 1; k < fsize + 1; k++)
                    {
                        lbfgs.obj += (alpha[k] * alpha[k] / (2.0 * modelWritter.cost_factor_));
                        lbfgs.expected[k] += (alpha[k] / modelWritter.cost_factor_);
                    }
#else
                    Parallel.For<double>(1, fsize + 1, parallelOption, () => 0, (k, loop, subtotal) =>
                    {
                        subtotal += (alpha[k] * alpha[k] / (2.0 * modelWritter.cost_factor_));
                        lbfgs.expected[k] += (alpha[k] / modelWritter.cost_factor_);
                        return subtotal;
                    },
                    (subtotal) => //lock-free accumulator
                    {
                        double initialValue;
                        double newValue;
                        do
                        {
                            initialValue = lbfgs.obj;           //read current value
                            newValue = initialValue + subtotal; //calculate new value
                        }
                        while (initialValue != Interlocked.CompareExchange(ref lbfgs.obj, newValue, initialValue));
                    });
#endif
                }

                //Show each iteration result
                var diff = (itr == 0 ? 1.0f : Math.Abs(old_obj - lbfgs.obj) / old_obj);
                old_obj = lbfgs.obj;

                ShowEvaluation(x.Length, modelWritter, lbfgs, termNum, itr, merr, yfreq, diff, startDT, num_nonzero, args);
                if (diff < args.min_diff)
                {
                    converge++;
                }
                else
                {
                    converge = 0;
                }
                if (itr > args.max_iter || converge == 3)
                {
                    break;  // 3 is ad-hoc
                }

                if (args.debugLevel > 0 && (double)lbfgs.zeroone / (double)x.Length < dMinErrRecord)
                {
                    var cc = Console.ForegroundColor;
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.Write("[Debug Mode] ");
                    Console.ForegroundColor = cc;
                    Console.Write("Saving intermediate feature weights at current directory...");

                    //Save current best feature weight into file
                    dMinErrRecord = (double)lbfgs.zeroone / (double)x.Length;
                    modelWritter.SaveFeatureWeight("feature_weight_tmp");

                    Console.WriteLine("Done.");
                }

                var iret = lbfgs.optimize(alpha, modelWritter.cost_factor_, orthant);
                if (iret <= 0)
                {
                    return false;
                }
            }

            return true;
        }
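
The do/while blocks passed as the Parallel.For finalizers above are a lock-free add for a double, which Interlocked does not provide directly: read the current total, compute the new one, and retry whenever another thread swapped the value in between. Extracted as a reusable helper (the AtomicDouble name is an assumption; this sketches the same pattern rather than reproducing original code):

        using System.Threading;

        static class AtomicDouble
        {
            //Adds value to target atomically by retrying with CompareExchange
            //until no other thread has changed target between read and swap
            public static void Add(ref double target, double value)
            {
                double seen;
                do
                {
                    seen = target;                      //read current value
                }
                while (Interlocked.CompareExchange(ref target, seen + value, seen) != seen);
            }
        }

With this helper each finalizer collapses to a single call, for example AtomicDouble.Add(ref lbfgs.obj, subtotal).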