예제 #1
0
        // Converts a number-based tag list into a letter-based one, in place.
        // Each entry of tagList is split with Global.commaAry, every piece is parsed as an
        // int and replaced by the string stored under that key in tagMap, and the pieces
        // are re-joined with ",". Throws Exception("error") when a key is missing from tagMap.
        static void getNewTagList(baseHashMap <int, string> tagMap, ref List <string> tagList)
        {
            // Converted entries are collected on the side, so tagList is only modified
            // after every entry has been translated successfully.
            List<string> converted = new List<string>();

            foreach (string entry in tagList)
            {
                string[] ids   = entry.Split(Global.commaAry, StringSplitOptions.RemoveEmptyEntries);
                string[] names = new string[ids.Length];

                int pos = 0;
                while (pos < ids.Length)
                {
                    int id = int.Parse(ids[pos]);
                    if (tagMap.ContainsKey(id) == false)
                    {
                        throw new Exception("error");
                    }
                    names[pos] = tagMap[id];
                    pos++;
                }

                converted.Add(string.Join(",", names));
            }

            // Swap the translated entries into the caller's list.
            tagList.Clear();
            tagList.AddRange(converted);
        }
예제 #2
0
        /// <summary>
        /// Scans <paramref name="file"/> and builds the feature and tag index maps.
        /// Every token between the first and the last one on a line is counted as the feature
        /// "position.name"; the last token is collected as the tag. Features are then sorted
        /// (by key when Global.regMode is "GL", otherwise by descending frequency) and numbered
        /// from 0 into featureIndexMap / "featureIndex.txt"; tags are sorted and numbered from 0
        /// into tagIndexMap / "tagIndex.txt".
        /// If the file does not exist a message is printed and nothing else happens.
        /// </summary>
        /// <param name="file">Path of the feature file to scan.</param>
        /// <exception cref="Exception">
        /// In "GL" mode, when the number of feature groups differs from the number of
        /// feature columns seen on the last non-blank line.
        /// </exception>
        public void getMaps(string file)
        {
            if (!File.Exists(file))
            {
                Console.WriteLine("file {0} no exist!", file);
                return;
            }
            Console.WriteLine("file {0} converting...", file);

            baseHashMap <string, int> featureFreqMap = new baseHashMap <string, int>();
            baseHashSet <string>      tagSet         = new baseHashSet <string>();

            //get feature-freq info and tagset
            int nFeatTemp = 0;

            // The reader is scoped to the scan so it is released even when a later step throws.
            using (StreamReader sr = new StreamReader(file))
            {
                while (!sr.EndOfStream)
                {
                    string line = sr.ReadLine();
                    line = line.Replace("\t", " ");
                    line = line.Replace("\r", "");

                    string[] ary = line.Split(Global.blankAry, StringSplitOptions.RemoveEmptyEntries);
                    if (ary.Length == 0)
                    {
                        // Empty line, or a line made only of separators: there is no tag
                        // token to read, so skip it instead of indexing ary[-1].
                        continue;
                    }

                    // first token and last token (the tag) are not feature columns
                    nFeatTemp = ary.Length - 2;
                    for (int i = 1; i < ary.Length - 1; i++)
                    {
                        if (ary[i] == "/")//no feature here
                        {
                            continue;
                        }
                        string[] ary2    = ary[i].Split(Global.slashAry, StringSplitOptions.RemoveEmptyEntries);//for real-value features
                        string   feature = i.ToString() + "." + ary2[0];
                        if (featureFreqMap.ContainsKey(feature) == false)
                        {
                            featureFreqMap[feature] = 1;
                        }
                        else
                        {
                            featureFreqMap[feature]++;
                        }
                    }

                    string tag = ary[ary.Length - 1];
                    tagSet.Add(tag);
                }
            }

            //sort features; each entry is "feature frequency"
            List <string> sortList = new List <string>();

            foreach (baseHashMap <string, int> .KeyValuePair kv in featureFreqMap)
            {
                sortList.Add(kv.Key + " " + kv.Value);
            }
            if (Global.regMode == "GL")//sort based on feature templates
            {
                sortList.Sort(listSortFunc.compareKV_key);

                // groupStart/groupEnd hold the [start, end) ranges of consecutive entries that
                // share the same text before the first Global.dotAry separator.
                Global.groupStart = new List <int>();
                Global.groupEnd   = new List <int>();
                Global.groupStart.Add(0);
                for (int k = 1; k < sortList.Count; k++)
                {
                    string[] thisAry = sortList[k].Split(Global.dotAry, StringSplitOptions.RemoveEmptyEntries);
                    string[] preAry  = sortList[k - 1].Split(Global.dotAry, StringSplitOptions.RemoveEmptyEntries);
                    string   str = thisAry[0], preStr = preAry[0];
                    if (str != preStr)
                    {
                        Global.groupStart.Add(k);
                        Global.groupEnd.Add(k);
                    }
                }
                Global.groupEnd.Add(sortList.Count);

                if (nFeatTemp != Global.groupStart.Count)
                {
                    throw new Exception("inconsistent # of features per line, check the feature file for consistency!");
                }
            }
            else//sort based on feature frequency
            {
                sortList.Sort(listSortFunc.compareKV_value);//sort feature based on freq, for 1)compress .txt file 2)better edge features
                sortList.Reverse();
            }

            //feature index should begin from 0
            using (StreamWriter swFeat = new StreamWriter("featureIndex.txt"))
            {
                for (int i = 0; i < sortList.Count; i++)
                {
                    string[] ary = sortList[i].Split(Global.blankAry);
                    featureIndexMap[ary[0]] = i;
                    swFeat.WriteLine("{0} {1}", ary[0], i);
                }
            }

            //label index should begin from 0
            List <string> tagSortList = new List <string>();

            foreach (string tag in tagSet)
            {
                tagSortList.Add(tag);
            }
            tagSortList.Sort();//sort tags

            using (StreamWriter swTag = new StreamWriter("tagIndex.txt"))
            {
                for (int i = 0; i < tagSortList.Count; i++)
                {
                    tagIndexMap[tagSortList[i]] = i;
                    swTag.WriteLine("{0} {1}", tagSortList[i], i);
                }
            }
        }
예제 #3
0
        /// <summary>
        /// Converts <paramref name="file"/> into an indexed feature file and a gold-tag file.
        /// The input is read line by line to keep memory use small. Output paths are the
        /// train, test or dev pair from Global, chosen by comparing <paramref name="file"/>
        /// with Global.fTrain and Global.fTest (anything else goes to the dev pair).
        /// Each output starts with the size of the matching index map and an empty line.
        /// For every data line the known features are written as "index," (or "index/value,"
        /// when the token carries a value after Global.slashAry) and the tag index is written
        /// as "index," to the gold file. Features missing from featureIndexMap are dropped.
        /// A line with no tokens ends the current sample.
        /// If the file does not exist a message is printed and nothing else happens.
        /// </summary>
        /// <param name="file">Path of the feature file to convert.</param>
        public void convertFile(string file)
        {
            if (!File.Exists(file))
            {
                Console.WriteLine("file {0} no exist!", file);
                return;
            }
            Console.WriteLine("file {0} converting...", file);

            //convert to files of new format
            string featurePath, goldPath;

            if (file == Global.fTrain)
            {
                featurePath = Global.fFeatureTrain;
                goldPath    = Global.fGoldTrain;
            }
            else if (file == Global.fTest)
            {
                featurePath = Global.fFeatureTest;
                goldPath    = Global.fGoldTest;
            }
            else
            {
                featurePath = Global.fFeatureDev;
                goldPath    = Global.fGoldDev;
            }

            // using blocks make sure the reader and both writers are flushed and closed
            // even when a line fails to convert (e.g. a tag missing from tagIndexMap).
            using (StreamReader sr = new StreamReader(file))
            using (StreamWriter swFeature = new StreamWriter(featurePath))
            using (StreamWriter swGold = new StreamWriter(goldPath))
            {
                swFeature.WriteLine(featureIndexMap.Count);
                swFeature.WriteLine();
                swGold.WriteLine(tagIndexMap.Count);
                swGold.WriteLine();

                while (!sr.EndOfStream)
                {
                    string line = sr.ReadLine();
                    line = line.Replace("\t", " ");
                    line = line.Replace("\r", "");

                    string[] ary = line.Split(Global.blankAry, StringSplitOptions.RemoveEmptyEntries);
                    if (ary.Length == 0)//end of a sample
                    {
                        // Covers both an empty line and a line made only of separators;
                        // the latter has no tag token and would otherwise index ary[-1].
                        swFeature.WriteLine();
                        swGold.WriteLine();
                        swGold.WriteLine();
                        continue;
                    }

                    for (int i = 1; i < ary.Length - 1; i++)
                    {
                        if (ary[i] == "/")//no feature here
                        {
                            continue;
                        }
                        string[] ary2    = ary[i].Split(Global.slashAry, StringSplitOptions.RemoveEmptyEntries);//for real-value features
                        string   feature = i.ToString() + "." + ary2[0];

                        if (featureIndexMap.ContainsKey(feature) == false)
                        {
                            // feature was never indexed: drop it
                            continue;
                        }
                        int fIndex = featureIndexMap[feature];
                        if (ary2.Length > 1)
                        {
                            // real-valued feature: keep the value next to the index
                            swFeature.Write("{0}/{1},", fIndex, ary2[1]);
                        }
                        else
                        {
                            swFeature.Write("{0},", fIndex);
                        }
                    }
                    swFeature.WriteLine();

                    // the last token of the line is the tag
                    string tag    = ary[ary.Length - 1];
                    int    tIndex = tagIndexMap[tag];
                    swGold.Write("{0},", tIndex);
                }
            }
        }