예제 #1
0
        /// <summary>
        /// Assigns a dense id (0..N-1, in ordinal key order) to every unigram ("U"-prefixed)
        /// feature, builds a double-array trie over them ("&lt;strFileName&gt;.dart") and writes
        /// the template list with the max feature id ("&lt;strFileName&gt;.template").
        /// Non-"U" features are logged and skipped.
        /// </summary>
        /// <param name="strFileName">Base output file name; ".dart" and ".template" are appended.</param>
        /// <param name="features">Raw feature strings to index.</param>
        public void BuildIndexedFeatureIntoFile(string strFileName, List<string> features)
        {
            //Assign id for each feature
            var feature2Id = new SortedDictionary<string, int>(StringComparer.Ordinal);
            var maxId      = 0;

            foreach (var strFeature in features)
            {
                //Ordinal comparison: feature prefixes are machine identifiers, not linguistic text (CA1310).
                if (strFeature.StartsWith("U", StringComparison.Ordinal) == false)
                {
                    Logger.WriteLine("Invalidated feature: {0}", strFeature);
                    continue;
                }

                feature2Id.Add(strFeature, maxId);
                maxId++;
            }

            var da = new DoubleArrayTrieBuilder(4);

            da.build(feature2Id);
            da.save(strFileName + ".dart");

            //Fix: the original StreamWriter was only Close()d on the success path and leaked
            //the file handle on exception; 'using' guarantees disposal.
            using (var swTemplate = new StreamWriter(strFileName + ".template"))
            {
                swTemplate.WriteLine("MaxTemplateFeatureId:{0}", maxId);
                foreach (var strTemplate in m_Templates)
                {
                    swTemplate.WriteLine(strTemplate);
                }
            }
        }
예제 #2
0
        //Build double array trie-tree from text file
        //strTextFileName: raw text file name used to build DA trie-tree
        //  text file format: key \t value
        //  key as string type
        //  value as non-negative integer
        //strDAFileName: double array trie-tree binary file name built from strTextFileName
        private static void Build(string strTextFileName, string strDAFileName)
        {
            //Load raw data from text file and sort them as ordinal
            //(ordinal key order is what the trie builder expects).
            SortedDictionary<string, int> sdict = new SortedDictionary<string, int>(StringComparer.Ordinal);
            Console.WriteLine("Loading key value pairs from raw text file and sort them...");

            //Fix: the original StreamReader was never closed (handle leak even on success);
            //'using' guarantees disposal.
            using (StreamReader sr = new StreamReader(strTextFileName))
            {
                while (sr.EndOfStream == false)
                {
                    string strLine = sr.ReadLine();
                    if (strLine.Length == 0)
                    {
                        continue;
                    }

                    string[] items = strLine.Split('\t');
                    sdict.Add(items[0], int.Parse(items[1]));
                }
            }

            //test case for SearchAsKeyPrefix and SearchByPrefix
            sdict.Add("TestSearchPrefix_case0", 1234567);
            sdict.Add("TestSearchPrefix_case01", 2345678);
            sdict.Add("TestSearchPrefix_case012", 3456789);

            DoubleArrayTrieBuilder dab = new DoubleArrayTrieBuilder(4);
            Console.WriteLine("Begin to build double array trie-tree...");
            dab.build(sdict);
            dab.save(strDAFileName);
            Console.WriteLine("Done!");
        }
예제 #3
0
        //Build double array trie-tree from text file
        //strTextFileName: raw text file name used to build DA trie-tree
        //  text file format: key \t value
        //  key as string type
        //  value as non-negative integer
        //strDAFileName: double array trie-tree binary file name built from strTextFileName
        private static void Build(string strTextFileName, string strDAFileName)
        {
            //Load raw data from text file and sort them as ordinal
            //(ordinal key order is what the trie builder expects).
            SortedDictionary<string, int> sdict = new SortedDictionary<string, int>(StringComparer.Ordinal);

            Console.WriteLine("Loading key value pairs from raw text file and sort them...");

            //Fix: the original StreamReader was never closed (handle leak even on success);
            //'using' guarantees disposal.
            using (StreamReader sr = new StreamReader(strTextFileName))
            {
                while (sr.EndOfStream == false)
                {
                    string strLine = sr.ReadLine();
                    if (strLine.Length == 0)
                    {
                        continue;
                    }

                    string[] items = strLine.Split('\t');
                    sdict.Add(items[0], int.Parse(items[1]));
                }
            }

            //test case for SearchAsKeyPrefix and SearchByPrefix
            sdict.Add("TestSearchPrefix_case0", 1234567);
            sdict.Add("TestSearchPrefix_case01", 2345678);
            sdict.Add("TestSearchPrefix_case012", 3456789);

            DoubleArrayTrieBuilder dab = new DoubleArrayTrieBuilder(4);

            Console.WriteLine("Begin to build double array trie-tree...");
            dab.build(sdict);
            dab.save(strDAFileName);
            Console.WriteLine("Done!");
        }
예제 #4
0
        /// <summary>
        /// Assigns a dense id (0..N-1, in ordinal key order) to every unigram ("U"-prefixed)
        /// feature, builds a double-array trie over them ("&lt;strFileName&gt;.dart") and writes
        /// the template list with the max feature id ("&lt;strFileName&gt;.template").
        /// Non-"U" features are logged and skipped.
        /// </summary>
        /// <param name="strFileName">Base output file name; ".dart" and ".template" are appended.</param>
        /// <param name="features">Raw feature strings to index.</param>
        public void BuildIndexedFeatureIntoFile(string strFileName, List<string> features)
        {
            //Assign id for each feature
            SortedDictionary<string, int> feature2Id = new SortedDictionary<string, int>(StringComparer.Ordinal);
            int maxId = 0;
            foreach (string strFeature in features)
            {
                //Ordinal comparison: feature prefixes are machine identifiers, not linguistic text (CA1310).
                if (strFeature.StartsWith("U", StringComparison.Ordinal) == false)
                {
                    Logger.WriteLine("Invalidated feature: {0}", strFeature);
                    continue;
                }

                feature2Id.Add(strFeature, maxId);
                maxId++;
            }

            DoubleArrayTrieBuilder da = new DoubleArrayTrieBuilder(4);
            da.build(feature2Id);
            da.save(strFileName + ".dart");

            //Fix: the original StreamWriter was only Close()d on the success path and leaked
            //the file handle on exception; 'using' guarantees disposal.
            using (StreamWriter swTemplate = new StreamWriter(strFileName + ".template"))
            {
                swTemplate.WriteLine("MaxTemplateFeatureId:{0}", maxId);
                foreach (string strTemplate in m_Templates)
                {
                    swTemplate.WriteLine(strTemplate);
                }
            }
        }
예제 #5
0
        /// <summary>
        /// Converts a raw n-gram dictionary ("ngram \t prob backoff" per line) into a binary
        /// dictionary: probabilities/backoffs are appended to "&lt;out&gt;.prob" in input order,
        /// and a double-array trie mapping each n-gram to its record index is saved as "&lt;out&gt;.da".
        /// Duplicate and empty n-grams are skipped (duplicates are reported on the console).
        /// </summary>
        static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                Console.WriteLine("signDict.exe [raw dictionary] [binary dictionary]");
                return;
            }

            DoubleArrayTrieBuilder daBuilder = new DoubleArrayTrieBuilder(4);

            //Ordinal comparer: the trie builder expects keys sorted ordinally.
            BTreeDictionary<string, int> dict = new BTreeDictionary<string, int>(StringComparer.Ordinal, 128);
            string strLine = null;

            //Fix: the original left sw/bw (and sr on exception) undisposed if anything threw;
            //'using' guarantees all file handles are released.
            using (StreamReader sr = new StreamReader(args[0], Encoding.UTF8))
            using (StreamWriter sw = new StreamWriter(args[1] + ".prob", false, Encoding.UTF8))
            using (BinaryWriter bw = new BinaryWriter(sw.BaseStream))
            {
                int index = 0;

                while ((strLine = sr.ReadLine()) != null)
                {
                    string[] items    = strLine.Split('\t');
                    string   strNGram = items[0].Trim();

                    if (dict.ContainsKey(strNGram) == true)
                    {
                        Console.WriteLine("duplicated line: {0}", strLine);
                        continue;
                    }

                    if (strNGram.Length == 0)
                    {
                        continue;
                    }

                    string[] vals    = items[1].Split();
                    float    prob    = float.Parse(vals[0]);
                    float    backoff = float.Parse(vals[1]);

                    //Write item into file; 'index' is the record position the trie maps the key to.
                    bw.Write(prob);
                    bw.Write(backoff);
                    dict.Add(strNGram, index);
                    index++;
                }

                daBuilder.build(dict);
                daBuilder.save(args[1] + ".da");
            }
        }
예제 #6
0
파일: Shrink.cs 프로젝트: Corniel/CRFSharp
        /// <summary>
        /// Shrinks a CRF model: reads the model header/labels/templates, rewrites the
        /// ".alpha" weight file keeping only non-zero weights as (index, weight) pairs,
        /// rebuilds the lexical-feature double-array trie over the features that still
        /// have at least one non-zero weight, and writes a new shrinked model header.
        /// </summary>
        /// <param name="strModelFileName">Path of the source (unshrinked) model; ".alpha" and ".feature.raw_text" companions are read.</param>
        /// <param name="strShrinkedModelFileName">Output model path; ".alpha" and ".feature" companions are written.</param>
        /// <param name="thread_num_">Thread count for the double-array trie builder.</param>
        public void Process(string strModelFileName, string strShrinkedModelFileName, int thread_num_ = 1)
        {
            var sr = new StreamReader(strModelFileName);
            string strLine;

            //Read the version number; an already-shrinked model is rejected up front.
            strLine = sr.ReadLine();
            var version = uint.Parse(strLine.Split(':')[1].Trim());
            if (version == CRFSharp.Utils.MODEL_TYPE_SHRINKED)
            {
                Console.WriteLine("The input model has been shrinked");
                return;
            }

            //Read cost_factor
            strLine = sr.ReadLine();
            var cost_factor_ = double.Parse(strLine.Split(':')[1].Trim());

            //Read maxid (total number of feature weights)
            strLine = sr.ReadLine();
            var maxid_ = long.Parse(strLine.Split(':')[1].Trim());

            //Read xsize
            strLine = sr.ReadLine();
            var xsize_ = uint.Parse(strLine.Split(':')[1].Trim());

            //Skip the blank separator line
            strLine = sr.ReadLine();

            //Read the output tag set (one tag per line, terminated by a blank line)
            var y_ = new List<string>();
            while (true)
            {
                strLine = sr.ReadLine();
                if (strLine.Length == 0)
                {
                    break;
                }
                y_.Add(strLine);
            }

            //Read the unigram ('U'...) and bigram ('B'...) feature templates
            var unigram_templs_ = new List<string>();
            var bigram_templs_ = new List<string>();
            while (sr.EndOfStream == false)
            {
                strLine = sr.ReadLine();
                if (strLine.Length == 0)
                {
                    break;
                }
                if (strLine[0] == 'U')
                {
                    unigram_templs_.Add(strLine);
                }
                if (strLine[0] == 'B')
                {
                    bigram_templs_.Add(strLine);
                }
            }
            sr.Close();


            //Load all features alpha data
            var filename_alpha = strModelFileName + ".alpha";
            var filename_shrink_alpha = strShrinkedModelFileName + ".alpha";
            var sr_alpha = new StreamReader(filename_alpha);
            var br_alpha = new BinaryReader(sr_alpha.BaseStream);

            var sw_alpha = new StreamWriter(filename_shrink_alpha);
            var bw_alpha = new BinaryWriter(sw_alpha.BaseStream);
            long shrinked_alpha_size = 0;

            //Only reserve non-zero feature weights and save them into file as two-tuples format
            //(long index, float weight); weights are stored as 32-bit floats in the alpha file.
            var alpha_ = new FixedBigArray<double>(maxid_ + 1, 0);
            for (long i = 0; i < maxid_; i++)
            {
                alpha_[i] = br_alpha.ReadSingle();
                if (alpha_[i] != 0)
                {
                    bw_alpha.Write(i);
                    bw_alpha.Write((float)alpha_[i]);
                    shrinked_alpha_size++;
                }
            }

            br_alpha.Close();
            bw_alpha.Close();

            //Only reserved lexical feature whose weights is non-zero
            var varValue = new VarBigArray<int>(1024);
            var varFeature = new VarBigArray<string>(1024);
            var feaCnt = 0;
            var filename_feature = strModelFileName + ".feature.raw_text";
            var sr_fea = new StreamReader(filename_feature);
            while (sr_fea.EndOfStream == false)
            {
                strLine = sr_fea.ReadLine();
                var items = strLine.Split('\t');
                var strFeature = items[0];
                var key = int.Parse(items[1]);
                //A unigram feature owns |y| weight slots, a bigram feature |y|^2.
                var size = (strFeature[0] == 'U' ? y_.Count : y_.Count * y_.Count);
                var hasAlpha = false;
                for (var i = key; i < key + size; i++)
                {
                    if (alpha_[i] != 0)
                    {
                        hasAlpha = true;
                        break;
                    }
                }

                //Keep the feature only if at least one of its weight slots is non-zero.
                if (hasAlpha == true)
                {
                    varFeature[feaCnt] = strFeature;
                    varValue[feaCnt] = key;
                    feaCnt++;
                }

            }
            sr_fea.Close();

            Console.WriteLine("Shrink feature size from {0} to {1}", maxid_, shrinked_alpha_size);
            maxid_ = shrinked_alpha_size;

            //Build new lexical feature trie from the surviving (feature, original-key) pairs.
            var val = new FixedBigArray<int>(feaCnt, 0);
            var fea = new FixedBigArray<string>(feaCnt, 0);
            for (var i = 0; i < feaCnt; i++)
            {
                fea[i] = varFeature[i];
                val[i] = varValue[i];
            }
            varFeature = null;
            varValue = null;
            var da = new DoubleArrayTrieBuilder(thread_num_);
            if (da.build(fea, val, 0.95) == false)
            {
                Console.WriteLine("Build lexical dictionary failed.");
                return;
            }
            da.save(strShrinkedModelFileName + ".feature");

            var tofs = new StreamWriter(strShrinkedModelFileName);

            // header (maxid now reflects the shrinked weight count)
            tofs.WriteLine("version: " + CRFSharp.Utils.MODEL_TYPE_SHRINKED);
            tofs.WriteLine("cost-factor: " + cost_factor_);
            tofs.WriteLine("maxid: " + maxid_);
            tofs.WriteLine("xsize: " + xsize_);

            tofs.WriteLine();

            // y
            for (var i = 0; i < y_.Count; ++i)
            {
                tofs.WriteLine(y_[i]);
            }
            tofs.WriteLine();

            // template
            for (var i = 0; i < unigram_templs_.Count; ++i)
            {
                tofs.WriteLine(unigram_templs_[i]);
            }
            for (var i = 0; i < bigram_templs_.Count; ++i)
            {
                tofs.WriteLine(bigram_templs_[i]);
            }

            tofs.Close();
        }
예제 #7
0
        /// <summary>
        /// Shrinks a CRF model: reads the model header/labels/templates, rewrites the
        /// ".alpha" weight file keeping only non-zero weights as (index, weight) pairs,
        /// rebuilds the lexical-feature double-array trie over the features that still
        /// have at least one non-zero weight, and writes a new shrinked model header.
        /// </summary>
        /// <param name="strModelFileName">Path of the source (unshrinked) model; ".alpha" and ".feature.raw_text" companions are read.</param>
        /// <param name="strShrinkedModelFileName">Output model path; ".alpha" and ".feature" companions are written.</param>
        /// <param name="thread_num_">Thread count for the double-array trie builder.</param>
        public void Process(string strModelFileName, string strShrinkedModelFileName, int thread_num_ = 1)
        {
            var    sr = new StreamReader(strModelFileName);
            string strLine;

            //Read the version number; an already-shrinked model is rejected up front.
            strLine = sr.ReadLine();
            var version = uint.Parse(strLine.Split(':')[1].Trim());

            if (version == CRFSharp.Utils.MODEL_TYPE_SHRINKED)
            {
                Console.WriteLine("The input model has been shrinked");
                return;
            }

            //Read cost_factor
            strLine = sr.ReadLine();
            var cost_factor_ = double.Parse(strLine.Split(':')[1].Trim());

            //Read maxid (total number of feature weights)
            strLine = sr.ReadLine();
            var maxid_ = long.Parse(strLine.Split(':')[1].Trim());

            //Read xsize
            strLine = sr.ReadLine();
            var xsize_ = uint.Parse(strLine.Split(':')[1].Trim());

            //Skip the blank separator line
            strLine = sr.ReadLine();

            //Read the output tag set (one tag per line, terminated by a blank line)
            var y_ = new List <string>();

            while (true)
            {
                strLine = sr.ReadLine();
                if (strLine.Length == 0)
                {
                    break;
                }
                y_.Add(strLine);
            }

            //Read the unigram ('U'...) and bigram ('B'...) feature templates
            var unigram_templs_ = new List <string>();
            var bigram_templs_  = new List <string>();

            while (sr.EndOfStream == false)
            {
                strLine = sr.ReadLine();
                if (strLine.Length == 0)
                {
                    break;
                }
                if (strLine[0] == 'U')
                {
                    unigram_templs_.Add(strLine);
                }
                if (strLine[0] == 'B')
                {
                    bigram_templs_.Add(strLine);
                }
            }
            sr.Close();


            //Load all features alpha data
            var filename_alpha        = strModelFileName + ".alpha";
            var filename_shrink_alpha = strShrinkedModelFileName + ".alpha";
            var sr_alpha = new StreamReader(filename_alpha);
            var br_alpha = new BinaryReader(sr_alpha.BaseStream);

            var  sw_alpha            = new StreamWriter(filename_shrink_alpha);
            var  bw_alpha            = new BinaryWriter(sw_alpha.BaseStream);
            long shrinked_alpha_size = 0;

            //Only reserve non-zero feature weights and save them into file as two-tuples format
            //(long index, float weight); weights are stored as 32-bit floats in the alpha file.
            var alpha_ = new FixedBigArray <double>(maxid_ + 1, 0);

            for (long i = 0; i < maxid_; i++)
            {
                alpha_[i] = br_alpha.ReadSingle();
                if (alpha_[i] != 0)
                {
                    bw_alpha.Write(i);
                    bw_alpha.Write((float)alpha_[i]);
                    shrinked_alpha_size++;
                }
            }

            br_alpha.Close();
            bw_alpha.Close();

            //Only reserved lexical feature whose weights is non-zero
            var varValue         = new VarBigArray <int>(1024);
            var varFeature       = new VarBigArray <string>(1024);
            var feaCnt           = 0;
            var filename_feature = strModelFileName + ".feature.raw_text";
            var sr_fea           = new StreamReader(filename_feature);

            while (sr_fea.EndOfStream == false)
            {
                strLine = sr_fea.ReadLine();
                var items      = strLine.Split('\t');
                var strFeature = items[0];
                var key        = int.Parse(items[1]);
                //A unigram feature owns |y| weight slots, a bigram feature |y|^2.
                var size       = (strFeature[0] == 'U' ? y_.Count : y_.Count * y_.Count);
                var hasAlpha   = false;
                for (var i = key; i < key + size; i++)
                {
                    if (alpha_[i] != 0)
                    {
                        hasAlpha = true;
                        break;
                    }
                }

                //Keep the feature only if at least one of its weight slots is non-zero.
                if (hasAlpha == true)
                {
                    varFeature[feaCnt] = strFeature;
                    varValue[feaCnt]   = key;
                    feaCnt++;
                }
            }
            sr_fea.Close();

            Console.WriteLine("Shrink feature size from {0} to {1}", maxid_, shrinked_alpha_size);
            maxid_ = shrinked_alpha_size;

            //Build new lexical feature trie from the surviving (feature, original-key) pairs.
            var val = new FixedBigArray <int>(feaCnt, 0);
            var fea = new FixedBigArray <string>(feaCnt, 0);

            for (var i = 0; i < feaCnt; i++)
            {
                fea[i] = varFeature[i];
                val[i] = varValue[i];
            }
            varFeature = null;
            varValue   = null;
            var da = new DoubleArrayTrieBuilder(thread_num_);

            if (da.build(fea, val, 0.95) == false)
            {
                Console.WriteLine("Build lexical dictionary failed.");
                return;
            }
            da.save(strShrinkedModelFileName + ".feature");

            var tofs = new StreamWriter(strShrinkedModelFileName);

            // header (maxid now reflects the shrinked weight count)
            tofs.WriteLine("version: " + CRFSharp.Utils.MODEL_TYPE_SHRINKED);
            tofs.WriteLine("cost-factor: " + cost_factor_);
            tofs.WriteLine("maxid: " + maxid_);
            tofs.WriteLine("xsize: " + xsize_);

            tofs.WriteLine();

            // y
            for (var i = 0; i < y_.Count; ++i)
            {
                tofs.WriteLine(y_[i]);
            }
            tofs.WriteLine();

            // template
            for (var i = 0; i < unigram_templs_.Count; ++i)
            {
                tofs.WriteLine(unigram_templs_[i]);
            }
            for (var i = 0; i < bigram_templs_.Count; ++i)
            {
                tofs.WriteLine(bigram_templs_[i]);
            }

            tofs.Close();
        }
예제 #8
0
        //Build feature set into indexed data.
        //filename: base name; writes "<filename>.feature" (double-array trie) and, when
        //          debugLevel > 0, "<filename>.feature.raw_text" (feature \t id dump).
        //max_slot_usage_rate_threshold: slot usage threshold passed to the trie builder.
        //strRetrainModelFileName: when non-empty, initial weights are copied from this
        //          existing model (only if its tag count matches); otherwise weights start at zero.
        //Returns false when the double-array trie cannot be built.
        public bool BuildFeatureSetIntoIndex(string filename, double max_slot_usage_rate_threshold, int debugLevel, string strRetrainModelFileName)
        {
            Console.WriteLine("Building {0} features into index...", featureLexicalDict.Size);

            IList<string> keyList;
            IList<int> valList;
            featureLexicalDict.GenerateLexicalIdList(out keyList, out valList);

            if (debugLevel > 0)
            {
                Console.Write("Debug: Writing raw feature set into file...");
                var filename_featureset_raw_format = filename + ".feature.raw_text";
                //Fix: the original StreamWriter leaked its handle on exception; 'using' guarantees disposal.
                using (var sw = new StreamWriter(filename_featureset_raw_format))
                {
                    // save feature and its id into lists in raw format
                    for (var i = 0; i < keyList.Count; i++)
                    {
                        sw.WriteLine("{0}\t{1}", keyList[i], valList[i]);
                    }
                }
                Console.WriteLine("Done.");
            }

            //Build feature index
            var filename_featureset = filename + ".feature";
            var da = new DoubleArrayTrieBuilder(thread_num_);
            if (da.build(keyList, valList, max_slot_usage_rate_threshold) == false)
            {
                Console.WriteLine("Build lexical dictionary failed.");
                return false;
            }
            //Save indexed feature set into file
            da.save(filename_featureset);

            //Idiom: string.IsNullOrEmpty replaces the manual null/length check.
            if (string.IsNullOrEmpty(strRetrainModelFileName))
            {
                //Clean up all data
                featureLexicalDict.Clear();
                featureLexicalDict = null;
                keyList = null;
                valList = null;

                GC.Collect();

                //Create weight matrix (index 0 is unused; ids are 1-based in alpha_)
                alpha_ = new double[feature_size() + 1];
            }
            else
            {
                Console.WriteLine();
                Console.WriteLine("Loading the existed model for re-training...");
                //Create weight matrix
                alpha_ = new double[feature_size() + 1];

                var modelReader = new ModelReader();
                modelReader.LoadModel(strRetrainModelFileName);

                //Weights can only be transferred when both models use the same tag set size.
                if (modelReader.y_.Count == y_.Count)
                {
                    for (var i = 0; i < keyList.Count; i++)
                    {
                        var index = modelReader.get_id(keyList[i]);
                        if (index < 0)
                        {
                            //Feature absent from the old model: leave its weights at zero.
                            continue;
                        }
                        //A unigram feature owns |y| weight slots, a bigram feature |y|^2.
                        var size = (keyList[i][0] == 'U' ? y_.Count : y_.Count * y_.Count);
                        for (var j = 0; j < size; j++)
                        {
                            alpha_[valList[i] + j + 1] = modelReader.GetAlpha(index + j);
                        }
                    }
                }
                else
                {
                    Console.WriteLine("The number of tags isn't equal between two models, it cannot be re-trained.");
                }

                //Clean up all data
                featureLexicalDict.Clear();
                featureLexicalDict = null;
                keyList = null;
                valList = null;

                GC.Collect();
            }

            return true;
        }