Beispiel #1
0
            /// <summary>
            /// Rebuilds <c>featureSet</c>: clears it, counts how often each node-feature id occurs
            /// across all word sequences, and keeps only the ids whose count is greater than
            /// <c>Global.featureTrim</c>.
            /// </summary>
            /// <param name="fname">Forwarded to <c>normalize</c>, which is presumably responsible for
            /// filling the word/tag sequence lists (confirm against its definition).</param>
            public static void getFeatureSet(string fname)
            {
                featureSet.Clear();
                Console.Error.WriteLine("getting feature set...");

                List<string> sentences    = new List<string>();
                List<string> tagSequences = new List<string>();
                normalize(fname, sentences, tagSequences);

                // Occurrence count per feature id (needed for train-input only, not test-input).
                baseHashMap<string, int> idCounts = new baseHashMap<string, int>();

                foreach (string sentence in sentences)
                {
                    string[] tokens = sentence.Split(Global.blankAry);

                    for (int pos = 0; pos < tokens.Length; pos++)
                    {
                        // A fresh list per position: getNodeFeatures receives it by ref.
                        List<string> features = new List<string>();
                        getNodeFeatures(pos, tokens, ref features);

                        foreach (string feature in features)
                        {
                            // Bare "/" entries are skipped and never counted.
                            if (feature != "/")
                            {
                                // The id is the first piece of the feature when split on Global.slashAry.
                                string featureId = feature.Split(Global.slashAry)[0];
                                idCounts[featureId]++;
                            }
                        }
                    }
                }

                // Keep only the ids that occur more often than the trim threshold.
                foreach (baseHashMap<string, int>.KeyValuePair entry in idCounts)
                {
                    if (entry.Value > Global.featureTrim)
                    {
                        featureSet.Add(entry.Key);
                    }
                }
            }
Beispiel #2
0
            /// <summary>
            /// Writes a feature file to <paramref name="file"/>: for every word of every sequence one
            /// line containing the word, its node features and its tag, with an empty line after each
            /// sequence. Features whose id is not in <c>featureSet</c> (and bare "/" entries) are
            /// written as "/". Progress and the sequence-length distribution go to the console.
            /// </summary>
            /// <param name="wordSeqList">Word sequences; each entry is split on <c>Global.blankAry</c>.</param>
            /// <param name="tagSeqList">Tag sequences, indexed in parallel with <paramref name="wordSeqList"/>;
            /// each must provide a tag for every word of the matching word sequence.</param>
            /// <param name="file">Path of the output file; it is created or overwritten.</param>
            public static void writeFeaturesTag(List <string> wordSeqList, List <string> tagSeqList, string file)
            {
                //count length dist
                baseHashMap <int, int> lengthCountMap = new baseHashMap <int, int>();

                // Report progress roughly every 10% of the input. Clamp to 1 so that inputs with
                // fewer than 10 sequences do not evaluate "i % 0" (DivideByZeroException).
                int interval = Math.Max(1, wordSeqList.Count / 10);

                // "using" guarantees the writer is flushed and closed even if an exception is
                // thrown while processing (e.g. a tag sequence shorter than its word sequence).
                using (StreamWriter swFeatureFile = new StreamWriter(file))
                {
                    for (int i = 0; i < wordSeqList.Count; i++)
                    {
                        if (i % interval == 0)
                        {
                            double percent = (double)i / (double)wordSeqList.Count * 100.0;
                            Console.WriteLine("{0}: sentence #{1} --> {2}%", file, i, percent.ToString("f2"));
                        }

                        string[] wordAry = wordSeqList[i].Split(Global.blankAry);
                        string[] tagAry  = tagSeqList[i].Split(Global.blankAry);

                        lengthCountMap[wordAry.Length]++;

                        for (int k = 0; k < wordAry.Length; k++)
                        {
                            List <string> nodeFeatures = new List <string>();
                            getNodeFeatures(k, wordAry, ref nodeFeatures);

                            swFeatureFile.Write(wordAry[k] + " "); //word
                            foreach (string f in nodeFeatures)     //features
                            {
                                // The feature id is the first piece when split on Global.slashAry;
                                // only features whose id is in featureSet are written verbatim.
                                bool keep = f != "/" && featureSet.Contains(f.Split(Global.slashAry)[0]);
                                swFeatureFile.Write(keep ? f + " " : "/ ");
                            }
                            swFeatureFile.Write(tagAry[k]);//tag
                            swFeatureFile.WriteLine();
                        }
                        // Empty line separates sequences.
                        swFeatureFile.WriteLine();
                    }
                }

                //output length dist
                List <string> sortList2 = new List <string>();

                foreach (baseHashMap <int, int> .KeyValuePair kv in lengthCountMap)
                {
                    double v = (double)kv.Value / (double)wordSeqList.Count * 100.0;
                    sortList2.Add(string.Format("{0}  count:{1} --> {2}%", kv.Key, kv.Value, v.ToString("f2")));
                }
                sortList2.Sort(ListSortFunc.compareKV_key);
                Console.WriteLine("length distribution:");
                foreach (string im in sortList2)
                {
                    Console.WriteLine(im);
                }
            }