Пример #1
0
        /// <summary>
        /// Builds an indexed feature file from a plain-text feature list.
        /// </summary>
        /// <remarks>
        /// When strInputFile or strTemplateFile does not exist, UsageIndex() is called and
        /// the method returns without doing any further work. Otherwise the first
        /// tab-separated column of every line in strInputFile is collected, the template is
        /// loaded from strTemplateFile, and the collected features are passed to
        /// BuildIndexedFeatureIntoFile together with strFeatureFile.
        /// </remarks>
        static void IndexMode()
        {
            if (File.Exists(strInputFile) == false ||
                File.Exists(strTemplateFile) == false)
            {
                UsageIndex();
                return;
            }

            //Load feature set from given file
            List <string> features = new List <string>();

            //The using block releases the file handle even if reading throws
            using (StreamReader sr = new StreamReader(strInputFile))
            {
                string strLine = null;

                while ((strLine = sr.ReadLine()) != null)
                {
                    //Only the first tab-separated column of each line is kept
                    string[] items = strLine.Split('\t');
                    features.Add(items[0]);
                }
            }

            //Build indexed feature set
            templateFeaturizer = new TemplateFeaturizer();
            templateFeaturizer.LoadTemplateFromFile(strTemplateFile);
            templateFeaturizer.BuildIndexedFeatureIntoFile(strFeatureFile, features);
        }
Пример #2
0
        /// <summary>
        /// Builds an indexed feature file from a plain-text feature list.
        /// </summary>
        /// <remarks>
        /// When inputFilePath or templateFilePath does not exist, UsageIndex() is called and
        /// the method returns without doing any further work. Otherwise the first
        /// tab-separated column of every line in inputFilePath is collected, the template is
        /// loaded from templateFilePath, and the collected features are passed to
        /// BuildIndexedFeatureIntoFile together with featureFilePath.
        /// </remarks>
        private static void IndexMode()
        {
            if (File.Exists(inputFilePath) == false ||
                File.Exists(templateFilePath) == false)
            {
                UsageIndex();
                return;
            }

            //Load feature set from given file
            var features = new List <string>();

            //The using block releases the file handle even if reading throws
            using (var sr = new StreamReader(inputFilePath))
            {
                string strLine;

                while ((strLine = sr.ReadLine()) != null)
                {
                    //Only the first tab-separated column of each line is kept
                    var items = strLine.Split('\t');
                    features.Add(items[0]);
                }
            }

            //Build indexed feature set
            templateFeaturizer = new TemplateFeaturizer();
            templateFeaturizer.LoadTemplateFromFile(templateFilePath);
            templateFeaturizer.BuildIndexedFeatureIntoFile(featureFilePath, features);
        }
Пример #3
0
        /// <summary>
        /// Reads the corpus in strInputFile, generates features for every token with the
        /// template loaded from strTemplateFile, and counts how often each feature occurs.
        /// </summary>
        /// <remarks>
        /// Lines are trimmed; an empty line ends the current record, any other line is split
        /// on tabs and appended to the current record. Whatever has been collected when the
        /// file ends is processed as a final record.
        /// </remarks>
        /// <returns>
        /// The features whose frequency is not less than minfreq, mapped to their frequency,
        /// in a dictionary ordered by StringComparer.Ordinal.
        /// </returns>
        static IDictionary <string, int> ExtractFeatureSetFromFile()
        {
            //Load templates from given file
            Logger.WriteLine("Loading feature template from {0}...", strTemplateFile);
            templateFeaturizer = new TemplateFeaturizer();
            templateFeaturizer.LoadTemplateFromFile(strTemplateFile);

            Logger.WriteLine("Generate feature set...");
            BigDictionary <string, int> feature2freq = new BigDictionary <string, int>();

            List <string[]> tokenList = new List <string[]>();
            string          strLine   = null;

            using (StreamReader srCorpus = new StreamReader(strInputFile, Encoding.UTF8))
            {
                while ((strLine = srCorpus.ReadLine()) != null)
                {
                    strLine = strLine.Trim();
                    if (strLine.Length == 0)
                    {
                        //The end of current record
                        AddRecordFeatureCounts(tokenList, feature2freq);
                        tokenList.Clear();
                    }
                    else
                    {
                        tokenList.Add(strLine.Split('\t'));
                    }
                }

                //The end of the last record: the file may not finish with an empty line
                AddRecordFeatureCounts(tokenList, feature2freq);
            }

            //Only save the feature whose frequency is not less than minfreq
            Logger.WriteLine("Filter out features whose frequency is less than {0}", minfreq);
            SortedDictionary <string, int> features = new SortedDictionary <string, int>(StringComparer.Ordinal);

            foreach (KeyValuePair <string, int> pair in feature2freq)
            {
                if (pair.Value >= minfreq)
                {
                    features.Add(pair.Key, pair.Value);
                }
            }

            return(features);
        }

        /// <summary>
        /// Builds a Sentence from one record and increments the counter of every feature
        /// generated for each of its tokens.
        /// </summary>
        /// <param name="tokenList">Tab-split lines that make up the record.</param>
        /// <param name="feature2freq">Feature-to-frequency counters, updated in place.</param>
        private static void AddRecordFeatureCounts(List <string[]> tokenList, BigDictionary <string, int> feature2freq)
        {
            Sentence sentence = new Sentence(tokenList);
            for (int i = 0; i < sentence.TokensList.Count; i++)
            {
                //Get feature of i-th token
                List <string> featureList = templateFeaturizer.GenerateFeature(sentence.TokensList, i);
                foreach (string strFeature in featureList)
                {
                    //A feature seen for the first time starts at zero before the increment
                    if (feature2freq.ContainsKey(strFeature) == false)
                    {
                        feature2freq.Add(strFeature, 0);
                    }
                    feature2freq[strFeature]++;
                }
            }
        }