Example #1
 // Command-line entry point: dispatches to the requested conversion based on the first switch.
 public static void Main(string[] args)
 {
     try
     {
         if (args.Length < 1)
         {
             DispHelp();
         }
         else if (args[0].ToLower() == "--pair2seqfea" && args.Length == 7)
         {
             //Pair2SeqFea(args[1], args[2], args[3], int.Parse(args[4]), args[5], args[6]);        //update by Shelson, Jan. 18
         }
         else if (args[0].ToLower() == "--seqfea2bin" && args.Length == 4)
         {
             SeqFea2Bin(args[1], int.Parse(args[2]), args[3]);
         }
         else if (args[0].ToLower() == "--shuffle" && (args.Length == 3 || args.Length == 4))
         {
             ExternalShuffle.Shuffle(args[1], args[2], args.Length == 4 ? args[3] : null);
         }
         else
         {
             DispHelp();
         }
     }
     catch (Exception exc)
     {
         Console.Error.WriteLine(exc.ToString());
         //Environment.Exit(0);
     }
 }
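For orientation, the switches above map one-to-one onto the helper methods shown in the later examples; the sketch below shows the equivalent direct calls, with placeholder file names and batch size that are not taken from the original source.

 // Hypothetical invocations; the argument order mirrors the parsing in Main above.
 SeqFea2Bin("train.seqfea", 1024, "train.bin");                      // --seqfea2bin <inFile> <batchSize> <outFile>
 ExternalShuffle.Shuffle("train.tsv", "train.shuffled.tsv", null);   // --shuffle <inFile> <outFile> [optional arg]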
Example #2
        /// <summary>
        /// Convert a tab-separated pair file into a binary sequence-feature file, using the field at position idx.
        /// </summary>
        public static void Pair2SeqFeaBin(string inFile, string vocFile, int nMaxLength, int idx, string outFile, int BatchSize, FeatureList featureList)
        {
            Dictionary <string, Dictionary <int, double> > dicRoot = new Dictionary <string, Dictionary <int, double> >();
            Vocab voc = new Vocab(false);

            if (featureList.l3g)
            {
                voc.Read(vocFile); voc.Lock();   // the letter-trigram vocabulary is only needed when l3g features are enabled
            }


            int N = 3;  // letter 3-gram

            string outputDir = @"../../../../../Data/tmp/";

            if (!Directory.Exists(outputDir))
            {
                Directory.CreateDirectory(outputDir);
            }

            int    nThreads = CmpInfo.ProcessorCount;
            string suffix   = ".bin";

            //for debug
            //nThreads = 1;

            List <int> nMaxFeatureNumPerBatch = new List <int>();
            List <int> nMaxFeatureDimension   = new List <int>();
            List <int> featureDimension       = new List <int>();
            List <int> nMaxSegmentSize        = new List <int>();
            List <int> nBatch = new List <int>();

            for (int i = 0; i < nThreads; i++)
            {
                nMaxFeatureNumPerBatch.Add(0);
                nMaxFeatureDimension.Add(0);
                featureDimension.Add(0);
                nMaxSegmentSize.Add(0);
                nBatch.Add(0);
            }

            int totalLine = ExternalShuffle.Split(inFile, outputDir, nThreads);

            // Process the per-thread splits in parallel; each worker writes its own binary part and .tsv part.
            Parallel.For(0, nThreads, id =>
            {
                BinaryWriter bw  = new BinaryWriter(File.Open(outputDir + id + suffix, FileMode.Create));
                StreamWriter sw  = new StreamWriter(File.Open(outputDir + id + ".tsv", FileMode.Create));
                StringBuilder sb = new StringBuilder();
                Batch batch      = new Batch();

                string sLine = ""; int nLine = 0;
                using (StreamReader sr = new StreamReader(outputDir + id + ".seg"))
                {
                    while ((sLine = sr.ReadLine()) != null)
                    {
                        nLine++; if (nLine % 1000 == 0)
                        {
                            Console.Write("{0}\r", nLine);
                        }
                        sb.Append(sLine + "\n");

                        string labelLine = string.Empty;
                        string[] rgs     = sLine.Split('\t');

                        if (rgs.Length <= idx)
                        {
                            throw new Exception("Invalid format in input file! At least " + (idx + 1) + " tab-separated fields are expected: " + sLine);
                        }

                        int pos = 0;

                        List <Dictionary <int, double> > rgWfs = new List <Dictionary <int, double> >();
                        string[] words = TextUtils.TokenizeToArray(rgs[idx]);
                        for (int i = 0; i < words.Length; i++)
                        {
                            rgWfs.Add(new Dictionary <int, double>());
                        }


                        if (featureList.l3g)
                        {
                            var featStrFeq = TextUtils.String2FeatStrSeq(rgs[idx], N, nMaxLength, FeatureType.l3g);  // letter N-gram
                            List <Dictionary <int, double> > tmp = TextUtils.StrFreq2IdFreq(featStrFeq, voc, pos);
                            Merge(ref rgWfs, tmp);
                            pos += voc.Count;
                        }
                        if (featureList.root)
                        {
                            int count      = 0;
                            var featStrFeq = TextUtils.String2FeatStrSeq(rgs[idx], N, nMaxLength, FeatureType.root);  // list of root
                            List <Dictionary <int, double> > tmp = TextUtils.StrFreq2IdFreq(featStrFeq, FeatureType.root, pos, ref count);
                            Merge(ref rgWfs, tmp);
                            pos += count;
                        }
                        if (featureList.infl)
                        {
                            int count      = 0;
                            var featStrFeq = TextUtils.String2FeatStrSeq(rgs[idx], N, nMaxLength, FeatureType.infl);  // list of inflections
                            List <Dictionary <int, double> > tmp = TextUtils.StrFreq2IdFreq(featStrFeq, FeatureType.infl, pos, ref count);
                            Merge(ref rgWfs, tmp);
                            pos += count;
                        }


                        // binary output
                        if (batch.BatchSize == BatchSize)
                        {
                            if (batch.ElementSize > nMaxFeatureNumPerBatch[id])
                            {
                                nMaxFeatureNumPerBatch[id] = batch.ElementSize;
                            }
                            // batch.FeatureDim = nMaxFeatureId;
                            batch.WriteSeqSample(bw);
                            batch.Clear();
                            sw.Write(sb);
                            sb = new StringBuilder();
                            nBatch[id]++;
                        }
                        featureDimension[id] = batch.LoadSeqSample(rgWfs);
                        if (featureDimension[id] > nMaxFeatureDimension[id])
                        {
                            nMaxFeatureDimension[id] = featureDimension[id];
                        }
                        if (batch.SegSize > nMaxSegmentSize[id])
                        {
                            nMaxSegmentSize[id] = batch.SegSize;
                        }
                    }
                }
                //Console.WriteLine("nLine");

                // binary output: any incomplete final batch is discarded, so every written batch holds exactly BatchSize samples
                if (batch.BatchSize > 0)
                {
                    batch.Clear();
                }

                bw.Close();
                sw.Close();
                File.Delete(outputDir + id + ".seg");
            });

            voc.Unlock();

            ExternalShuffle.Merge(outFile, outputDir, suffix, nThreads);
            BinaryWriter bwTail = new BinaryWriter(File.Open(outFile, FileMode.Append));

            totalLine = nBatch.Sum() * BatchSize;   // count only the samples that ended up in full, written batches
            bwTail.Write(nMaxFeatureDimension.Max()); bwTail.Write(totalLine); bwTail.Write(nMaxSegmentSize.Max()); bwTail.Write(nMaxFeatureNumPerBatch.Max());
            bwTail.Write(BatchSize); // part of change on 2/19/2014. Write the batch size at the end. Used to check consistency in training.
            bwTail.Close();

            ExternalShuffle.Merge(ParameterSetting.trainPairTokzNew, outputDir, ".tsv", nThreads);
            if (Directory.Exists(outputDir))
            {
                Directory.Delete(outputDir, true);   // recursive delete, in case any per-thread temp files remain
            }
        }
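The Merge helper called after each feature block is not part of this listing. Below is a minimal sketch of the position-wise dictionary union it appears to perform, inferred from the call sites (Merge(ref rgWfs, tmp), with feature ids already offset by pos); this is an assumption, not the original implementation.

        // Sketch only: merge per-position feature dictionaries from 'source' into 'target'.
        // Feature ids in 'source' are already offset by 'pos', so keys should not collide.
        static void Merge(ref List<Dictionary<int, double>> target, List<Dictionary<int, double>> source)
        {
            for (int i = 0; i < target.Count && i < source.Count; i++)
            {
                foreach (var kv in source[i])
                {
                    target[i][kv.Key] = kv.Value;
                }
            }
        }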
Example #3
        /// <summary>
        /// Convert a seq fea file to a bin file.
        /// </summary>
        /// <param name="inFile">input seq fea file</param>
        /// <param name="BatchSize">number of samples per batch</param>
        /// <param name="outFile">output bin file</param>
        static void SeqFea2Bin(string inFile, int BatchSize, string outFile)
        {
            string[] terms  = inFile.Split('/');
            string   suffix = terms[terms.Length - 1];   // input file name, reused as the suffix of the per-thread temp files

            string outputDir = @"../../../../../Data/tmp/";
            int    nThreads  = CmpInfo.ProcessorCount;

            int totalLine = ExternalShuffle.Split(inFile, outputDir, nThreads);

            List <int> nMaxFeatureNumPerBatch = new List <int>();
            List <int> nMaxFeatureDimension   = new List <int>();
            List <int> featureDimension       = new List <int>();
            List <int> nMaxSegmentSize        = new List <int>();
            List <int> nLine = new List <int>();

            for (int i = 0; i < nThreads; i++)
            {
                nMaxFeatureNumPerBatch.Add(0);
                nMaxFeatureDimension.Add(0);
                featureDimension.Add(0);
                nMaxSegmentSize.Add(0);
                nLine.Add(0);
            }

            Parallel.For(0, nThreads, id =>
            {
                BinaryWriter bw = new BinaryWriter(File.Open(outputDir + id + suffix, FileMode.Create));
                string sLine    = "";  //int nLine = 0;
                List <Dictionary <int, double> > rgWfs = new List <Dictionary <int, double> >();

                Batch batch = new Batch();

                using (StreamReader sr = new StreamReader(outputDir + id + ".seg"))
                {
                    while ((sLine = sr.ReadLine()) != null)
                    {
                        nLine[id]++; if (nLine[id] % 10000 == 0)
                        {
                            Console.Write("{0}\r", nLine[id]);
                        }

                        rgWfs = TextUtils.String2Matrix(sLine.Trim());

                        // binary output
                        if (batch.BatchSize == BatchSize)
                        {
                            if (batch.ElementSize > nMaxFeatureNumPerBatch[id])
                            {
                                nMaxFeatureNumPerBatch[id] = batch.ElementSize;
                            }
                            // batch.FeatureDim = nMaxFeatureId;
                            batch.WriteSeqSample(bw);
                            batch.Clear();
                        }
                        featureDimension[id] = batch.LoadSeqSample(rgWfs);
                        if (featureDimension[id] > nMaxFeatureDimension[id])
                        {
                            nMaxFeatureDimension[id] = featureDimension[id];
                        }
                        if (batch.SegSize > nMaxSegmentSize[id])
                        {
                            nMaxSegmentSize[id] = batch.SegSize;
                        }
                    }
                }

                // binary output
                if (batch.BatchSize > 0)
                {
                    if (batch.ElementSize > nMaxFeatureNumPerBatch[id])
                    {
                        nMaxFeatureNumPerBatch[id] = batch.ElementSize;
                    }
                    // batch.FeatureDim = nMaxFeatureId;
                    batch.WriteSeqSample(bw);
                    batch.Clear();
                }
                bw.Close();
                File.Delete(outputDir + id + ".seg");
            });

            ExternalShuffle.Merge(outFile, outputDir, suffix, nThreads);
            BinaryWriter bwTail = new BinaryWriter(File.Open(outFile, FileMode.Append));

            totalLine = nLine.Sum();

            bwTail.Write(nMaxFeatureDimension.Max()); bwTail.Write(totalLine); bwTail.Write(nMaxSegmentSize.Max()); bwTail.Write(nMaxFeatureNumPerBatch.Max());
            bwTail.Write(BatchSize); // part of change on 2/19/2014. Write the batch size at the end. Used to check consistency in training.
            bwTail.Close();
        }
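The five integers appended through bwTail form a fixed-size trailer at the end of the bin file. As a hedged illustration (not from the original source), a consumer could read them back as follows, assuming each value was written as a 32-bit int (BinaryWriter.Write(int) emits 4 bytes):

        // Sketch only: read the 20-byte trailer written above, in the same order as the writes.
        using (var fs = File.OpenRead(outFile))
        using (var br = new BinaryReader(fs))
        {
            fs.Seek(-5 * sizeof(int), SeekOrigin.End);
            int maxFeatureDimension   = br.ReadInt32();
            int totalLine             = br.ReadInt32();
            int maxSegmentSize        = br.ReadInt32();
            int maxFeatureNumPerBatch = br.ReadInt32();
            int batchSize             = br.ReadInt32();
        }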