コード例 #1
0
        /// <summary>
        /// Reads the template-feature settings from <c>config</c>. When a template feature
        /// file name is configured, builds the template featurizer, selects the feature
        /// weight type and registers the context offsets under <c>TFEATURE_CONTEXT</c>;
        /// otherwise only logs that no template feature is available.
        /// </summary>
        private void SetTFeatures()
        {
            var templateFileName = config.GetValueOptional(TFEATURE_FILENAME);
            if (String.IsNullOrEmpty(templateFileName))
            {
                // The template feature file is optional: nothing to set up without it.
                Logger.WriteLine($"No TFeature available.");
                return;
            }

            // Load template feature set
            var templateFilePath = GetFilePath(currentDirectory, templateFileName);
            Logger.WriteLine($"Loading template feature set from {templateFilePath}");
            tFeaturizer = new TemplateFeaturizer(templateFilePath);

            // Any value other than "binary" (compared ignoring case) selects frequency weighting.
            var weightTypeText = config.GetValueRequired(TFEATURE_WEIGHT_TYPE);
            if (weightTypeText.Equals("binary", StringComparison.InvariantCultureIgnoreCase))
            {
                tFeatureWeightType = TFEATURE_WEIGHT_TYPE_ENUM.BINARY;
            }
            else
            {
                tFeatureWeightType = TFEATURE_WEIGHT_TYPE_ENUM.FREQUENCY;
            }
            Logger.WriteLine($"TFeature weight type: {weightTypeText}");

            // The context is a comma-separated list of integer offsets.
            var contextText = config.GetValueRequired(TFEATURE_CONTEXT);
            var contextOffsets = new List<int>();
            featureContext.Add(TFEATURE_CONTEXT, contextOffsets);
            foreach (var offsetText in contextText.Split(','))
            {
                contextOffsets.Add(int.Parse(offsetText));
            }
            Logger.WriteLine($"TFeature context: {contextText}");
        }
コード例 #2
0
ファイル: Featurizer.cs プロジェクト: zero76114/RNNSharp
        /// <summary>
        /// Loads the feature configuration from a text file made of "key: value" lines.
        /// </summary>
        /// <remarks>
        /// Blank lines and lines starting with '#' are skipped. Recognized setting keys
        /// configure the word embedding model, the template featurizer, the embedding
        /// column and the template feature weight type. Every other key is stored in the
        /// feature configuration with its value parsed as a comma-separated list of integers.
        /// </remarks>
        /// <param name="strFileName">Path of the configuration file to read.</param>
        /// <exception cref="ArgumentException">
        /// A line has no ':' separator, or an embedding model is already loaded when an
        /// embedding setting is read.
        /// </exception>
        public void LoadFeatureConfigFromFile(string strFileName)
        {
            // The using block guarantees the reader is closed even when a line fails to parse.
            using (StreamReader sr = new StreamReader(strFileName))
            {
                string strLine = null;

                m_FeatureConfiguration = new Dictionary<string, List<int>>();
                while ((strLine = sr.ReadLine()) != null)
                {
                    strLine = strLine.Trim();
                    if (strLine.Length == 0)
                    {
                        //Empty line, ignore it
                        continue;
                    }

                    if (strLine.StartsWith("#") == true)
                    {
                        //Comment line, ignore it
                        continue;
                    }

                    // Split on the FIRST ':' only, so values that contain ':' themselves
                    // (for example "C:\models\embedding.bin") are kept whole.
                    int idxSeparator = strLine.IndexOf(':');
                    if (idxSeparator < 0)
                    {
                        throw new ArgumentException("Invalid configuration line (missing ':' separator): " + strLine);
                    }

                    string strKey = strLine.Substring(0, idxSeparator).Trim();
                    // The value keeps its original casing: lower-casing a file path breaks
                    // it on case-sensitive file systems.
                    string strValue = strLine.Substring(idxSeparator + 1).Trim();
                    if (strKey == WORDEMBEDDING_FILENAME)
                    {
                        if (m_WordEmbedding != null)
                        {
                            throw new ArgumentException("Embedding model has already been loaded. Please check if settings is duplicated in configuration file.");
                        }
                        Logger.WriteLine("Loading embedding feature set from model {0}", strValue);
                        m_WordEmbedding = new WordEMWrapFeaturizer(strValue);
                        continue;
                    }
                    else if (strKey == WORDEMBEDDING_RAW_FILENAME)
                    {
                        if (m_WordEmbedding != null)
                        {
                            throw new ArgumentException("Embedding model has already been loaded. Please check if settings is duplicated in configuration file.");
                        }
                        Logger.WriteLine("Loading embedding feature set from model {0} in text format", strValue);
                        m_WordEmbedding = new WordEMWrapFeaturizer(strValue, true);
                        continue;
                    }
                    else if (strKey == TFEATURE_FILENAME)
                    {
                        Logger.WriteLine("Loading template feature set...");
                        m_TFeaturizer = new TemplateFeaturizer(strValue);
                        continue;
                    }
                    else if (strKey == WORDEMBEDDING_COLUMN)
                    {
                        m_WordEmbeddingCloumn = int.Parse(strValue);
                        Logger.WriteLine("Word embedding feature column: {0}", m_WordEmbeddingCloumn);
                        continue;
                    }
                    else if (strKey == TFEATURE_WEIGHT_TYPE)
                    {
                        Logger.WriteLine("TFeature weighting type: {0}", strValue);
                        // Case-insensitive match replaces the former ToLower() on the value;
                        // anything other than "binary" selects frequency weighting.
                        if (string.Equals(strValue, "binary", StringComparison.OrdinalIgnoreCase))
                        {
                            m_TFeatureWeightType = TFEATURE_WEIGHT_TYPE_ENUM.BINARY;
                        }
                        else
                        {
                            m_TFeatureWeightType = TFEATURE_WEIGHT_TYPE_ENUM.FREQUENCY;
                        }

                        continue;
                    }

                    // Any other key: the value is a comma-separated list of integers that is
                    // appended to the list stored under that key.
                    string[] values = strValue.Split(',');

                    if (m_FeatureConfiguration.ContainsKey(strKey) == false)
                    {
                        m_FeatureConfiguration.Add(strKey, new List<int>());
                    }

                    foreach (string value in values)
                    {
                        m_FeatureConfiguration[strKey].Add(int.Parse(value));
                    }
                }
            }
        }
コード例 #3
0
ファイル: Featurizer.cs プロジェクト: shaoxuan92/RNNSharp
        /// <summary>
        /// Loads the feature configuration from a text file made of "key: value" lines.
        /// </summary>
        /// <remarks>
        /// Blank lines and lines starting with '#' are skipped. Recognized setting keys
        /// configure the pretrained model, the template featurizer, the pretrained model
        /// column, the template feature weight type, the pretrain type and the auto encoder
        /// files. Every other key is stored in <c>FeatureContext</c> with its value parsed
        /// as a comma-separated list of integers.
        /// </remarks>
        /// <param name="strFileName">Path of the configuration file to read.</param>
        /// <exception cref="ArgumentException">
        /// A line has no ':' separator, or a pretrained model is already loaded when a
        /// pretrained model setting is read.
        /// </exception>
        public void LoadFeatureConfigFromFile(string strFileName)
        {
            // The using block guarantees the reader is closed even when a line fails to parse.
            using (StreamReader sr = new StreamReader(strFileName))
            {
                string strLine = null;

                FeatureContext = new Dictionary<string, List<int>>();
                while ((strLine = sr.ReadLine()) != null)
                {
                    strLine = strLine.Trim();
                    if (strLine.Length == 0)
                    {
                        //Empty line, ignore it
                        continue;
                    }

                    if (strLine.StartsWith("#") == true)
                    {
                        //Comment line, ignore it
                        continue;
                    }

                    // Only the first ':' separates key and value, so values may contain ':'.
                    int idxSeparator = strLine.IndexOf(':');
                    if (idxSeparator < 0)
                    {
                        // Fail with a readable message instead of the out-of-range error
                        // Substring(0, -1) would raise.
                        throw new ArgumentException("Invalid configuration line (missing ':' separator): " + strLine);
                    }

                    string strKey = strLine.Substring(0, idxSeparator).Trim();
                    string strValue = strLine.Substring(idxSeparator + 1).Trim();
                    if (strKey == PRETRAINEDMODEL_FILENAME)
                    {
                        if (PretainedModel != null)
                        {
                            throw new ArgumentException("Static pretrained model has already been loaded. Please check if settings is duplicated in configuration file.");
                        }
                        Logger.WriteLine("Loading pretrained dense feature set from model {0}", strValue);
                        PretainedModel = new WordEMWrapFeaturizer(strValue);
                    }
                    else if (strKey == PRETRAINEDMODEL_RAW_FILENAME)
                    {
                        if (PretainedModel != null)
                        {
                            throw new ArgumentException("Static pretrained model has already been loaded. Please check if settings is duplicated in configuration file.");
                        }
                        Logger.WriteLine("Loading pretrained dense feature set from model {0} in text format", strValue);
                        PretainedModel = new WordEMWrapFeaturizer(strValue, true);
                    }
                    else if (strKey == TFEATURE_FILENAME)
                    {
                        Logger.WriteLine("Loading template feature set...");
                        TFeaturizer = new TemplateFeaturizer(strValue);
                    }
                    else if (strKey == PRETRAINEDMODEL_COLUMN)
                    {
                        PretrainedModelColumn = int.Parse(strValue);
                        Logger.WriteLine("Pretrained model feature column: {0}", PretrainedModelColumn);
                    }
                    else if (strKey == TFEATURE_WEIGHT_TYPE)
                    {
                        Logger.WriteLine("TFeature weighting type: {0}", strValue);
                        // Matched ignoring case, like the pretrain type below; anything
                        // other than "binary" selects frequency weighting.
                        if (strValue.Equals("binary", StringComparison.OrdinalIgnoreCase))
                        {
                            TFeatureWeightType = TFEATURE_WEIGHT_TYPE_ENUM.BINARY;
                        }
                        else
                        {
                            TFeatureWeightType = TFEATURE_WEIGHT_TYPE_ENUM.FREQUENCY;
                        }
                    }
                    else if (strKey == PRETRAIN_TYPE)
                    {
                        // Anything other than the auto encoder name falls back to embedding.
                        if (strValue.Equals(RNNSharp.PRETRAIN_TYPE.AUTOENCODER.ToString(), StringComparison.InvariantCultureIgnoreCase))
                        {
                            preTrainType = RNNSharp.PRETRAIN_TYPE.AUTOENCODER;
                        }
                        else
                        {
                            preTrainType = RNNSharp.PRETRAIN_TYPE.EMBEDDING;
                        }

                        Logger.WriteLine("Pretrain type: {0}", preTrainType);
                    }
                    else if (strKey == AUTOENCODER_FEATURECONFIG)
                    {
                        autoEncoderFeatureConfigFile = strValue;
                        Logger.WriteLine("Auto encoder configuration file: {0}", autoEncoderFeatureConfigFile);
                    }
                    else if (strKey == AUTOENCODER_MODEL)
                    {
                        autoEncoderModelFile = strValue;
                        Logger.WriteLine("Auto encoder model file: {0}", autoEncoderModelFile);
                    }
                    else
                    {
                        // Any other key: the value is a comma-separated list of integers
                        // that is appended to the list stored under that key.
                        string[] values = strValue.Split(',');

                        if (FeatureContext.ContainsKey(strKey) == false)
                        {
                            FeatureContext.Add(strKey, new List<int>());
                        }

                        foreach (string value in values)
                        {
                            FeatureContext[strKey].Add(int.Parse(value));
                        }
                    }
                }
            }
        }
コード例 #4
0
ファイル: Featurizer.cs プロジェクト: dmit25/RNNSharp
        /// <summary>
        /// Loads the feature configuration from a text file made of "key: value" lines.
        /// </summary>
        /// <remarks>
        /// Blank lines and lines starting with '#' are skipped. Recognized setting keys
        /// configure the word embedding model, the template featurizer, the embedding
        /// column and the template feature weight type. Every other key is stored in the
        /// feature configuration with its value parsed as a comma-separated list of integers.
        /// </remarks>
        /// <param name="strFileName">Path of the configuration file to read.</param>
        /// <exception cref="ArgumentException">
        /// A line has no ':' separator, or an embedding model is already loaded when an
        /// embedding setting is read.
        /// </exception>
        public void LoadFeatureConfigFromFile(string strFileName)
        {
            // The using block guarantees the reader is closed even when a line fails to parse.
            using (StreamReader sr = new StreamReader(strFileName))
            {
                string strLine = null;

                m_FeatureConfiguration = new Dictionary<string, List<int>>();
                while ((strLine = sr.ReadLine()) != null)
                {
                    strLine = strLine.Trim();
                    if (strLine.Length == 0)
                    {
                        //Empty line, ignore it
                        continue;
                    }

                    if (strLine.StartsWith("#") == true)
                    {
                        //Comment line, ignore it
                        continue;
                    }

                    // Split on the FIRST ':' only, so values that contain ':' themselves
                    // (for example "C:\models\embedding.bin") are kept whole.
                    int idxSeparator = strLine.IndexOf(':');
                    if (idxSeparator < 0)
                    {
                        throw new ArgumentException("Invalid configuration line (missing ':' separator): " + strLine);
                    }

                    string strKey = strLine.Substring(0, idxSeparator).Trim();
                    // The value keeps its original casing: lower-casing a file path breaks
                    // it on case-sensitive file systems.
                    string strValue = strLine.Substring(idxSeparator + 1).Trim();
                    if (strKey == WORDEMBEDDING_FILENAME)
                    {
                        if (m_WordEmbedding != null)
                        {
                            throw new ArgumentException("Embedding model has already been loaded. Please check if settings is duplicated in configuration file.");
                        }
                        Logger.WriteLine("Loading embedding feature set from model {0}", strValue);
                        m_WordEmbedding = new WordEMWrapFeaturizer(strValue);
                        continue;
                    }
                    else if (strKey == WORDEMBEDDING_RAW_FILENAME)
                    {
                        if (m_WordEmbedding != null)
                        {
                            throw new ArgumentException("Embedding model has already been loaded. Please check if settings is duplicated in configuration file.");
                        }
                        Logger.WriteLine("Loading embedding feature set from model {0} in text format", strValue);
                        m_WordEmbedding = new WordEMWrapFeaturizer(strValue, true);
                        continue;
                    }
                    else if (strKey == TFEATURE_FILENAME)
                    {
                        Logger.WriteLine("Loading template feature set...");
                        m_TFeaturizer = new TemplateFeaturizer(strValue);
                        continue;
                    }
                    else if (strKey == WORDEMBEDDING_COLUMN)
                    {
                        m_WordEmbeddingCloumn = int.Parse(strValue);
                        Logger.WriteLine("Word embedding feature column: {0}", m_WordEmbeddingCloumn);
                        continue;
                    }
                    else if (strKey == TFEATURE_WEIGHT_TYPE)
                    {
                        Logger.WriteLine("TFeature weighting type: {0}", strValue);
                        // Case-insensitive match replaces the former ToLower() on the value;
                        // anything other than "binary" selects frequency weighting.
                        if (string.Equals(strValue, "binary", StringComparison.OrdinalIgnoreCase))
                        {
                            m_TFeatureWeightType = TFEATURE_WEIGHT_TYPE_ENUM.BINARY;
                        }
                        else
                        {
                            m_TFeatureWeightType = TFEATURE_WEIGHT_TYPE_ENUM.FREQUENCY;
                        }

                        continue;
                    }

                    // Any other key: the value is a comma-separated list of integers that is
                    // appended to the list stored under that key.
                    string[] values = strValue.Split(',');

                    if (m_FeatureConfiguration.ContainsKey(strKey) == false)
                    {
                        m_FeatureConfiguration.Add(strKey, new List<int>());
                    }

                    foreach (string value in values)
                    {
                        m_FeatureConfiguration[strKey].Add(int.Parse(value));
                    }
                }
            }
        }
コード例 #5
0
ファイル: Program.cs プロジェクト: dmit25/RNNSharp
        /// <summary>
        /// Generates the template features of every record in the input corpus and returns
        /// those whose frequency is at least <c>minfreq</c>.
        /// </summary>
        /// <remarks>
        /// The corpus is read as UTF-8. Each non-empty line is one token whose columns are
        /// separated by tabs; an empty line ends the current record. The tokens left after
        /// the last line are processed as a final record.
        /// </remarks>
        /// <returns>
        /// The retained features mapped to their frequency, sorted by ordinal string order.
        /// </returns>
        static IDictionary<string, int> ExtractFeatureSetFromFile()
        {
            //Load templates from given file
            Logger.WriteLine("Loading feature template from {0}...", strTemplateFile);
            templateFeaturizer = new TemplateFeaturizer();
            templateFeaturizer.LoadTemplateFromFile(strTemplateFile);

            Logger.WriteLine("Generate feature set...");
            BigDictionary<string, int> feature2freq = new BigDictionary<string, int>();

            List<string[]> tokenList = new List<string[]>();
            string strLine = null;

            using (StreamReader srCorpus = new StreamReader(strInputFile, Encoding.UTF8))
            {
                while ((strLine = srCorpus.ReadLine()) != null)
                {
                    strLine = strLine.Trim();
                    if (strLine.Length == 0)
                    {
                        //The end of current record
                        CountRecordFeatures(tokenList, feature2freq);
                        tokenList.Clear();
                    }
                    else
                    {
                        tokenList.Add(strLine.Split('\t'));
                    }
                }

                //The corpus may not end with an empty line, so flush the remaining tokens
                CountRecordFeatures(tokenList, feature2freq);
            }

            //Only save the feature whose frequency is not less than minfreq
            Logger.WriteLine("Filter out features whose frequency is less than {0}", minfreq);
            SortedDictionary<string, int> features = new SortedDictionary<string, int>(StringComparer.Ordinal);
            foreach (KeyValuePair<string, int> pair in feature2freq)
            {
                if (pair.Value >= minfreq)
                {
                    features.Add(pair.Key, pair.Value);
                }
            }

            return features;
        }

        /// <summary>
        /// Builds a sentence from the tokens of one record and adds one occurrence to
        /// <paramref name="feature2freq"/> for every feature generated for each token.
        /// </summary>
        /// <param name="tokenList">Tokens of the record; not modified by this method.</param>
        /// <param name="feature2freq">Feature frequency table updated in place.</param>
        private static void CountRecordFeatures(List<string[]> tokenList, BigDictionary<string, int> feature2freq)
        {
            Sentence sentence = new Sentence(tokenList);
            for (int i = 0; i < sentence.TokensList.Count; i++)
            {
                //Get feature of i-th token
                List<string> featureList = templateFeaturizer.GenerateFeature(sentence.TokensList, i);
                foreach (string strFeature in featureList)
                {
                    if (feature2freq.ContainsKey(strFeature) == false)
                    {
                        feature2freq.Add(strFeature, 0);
                    }
                    feature2freq[strFeature]++;
                }
            }
        }
コード例 #6
0
ファイル: Program.cs プロジェクト: dmit25/RNNSharp
        /// <summary>
        /// Builds the indexed feature set: reads the feature list from the input file,
        /// loads the templates and writes the indexed features into the feature file.
        /// Prints the usage text and returns when the input file or the template file
        /// does not exist.
        /// </summary>
        static void IndexMode()
        {
            if (File.Exists(strInputFile) == false ||
                File.Exists(strTemplateFile) == false)
            {
                UsageIndex();
                return;
            }

            //Load feature set from given file
            List<string> features = new List<string>();

            // The using block closes the reader even if reading a line throws.
            using (StreamReader sr = new StreamReader(strInputFile))
            {
                string strLine = null;
                while ((strLine = sr.ReadLine()) != null)
                {
                    // Only the first tab-separated column (the feature text) is kept.
                    string[] items = strLine.Split('\t');
                    features.Add(items[0]);
                }
            }

            //Build indexed feature set
            templateFeaturizer = new TemplateFeaturizer();
            templateFeaturizer.LoadTemplateFromFile(strTemplateFile);
            templateFeaturizer.BuildIndexedFeatureIntoFile(strFeatureFile, features);
        }