예제 #1
0
파일: Program.cs 프로젝트: a170811/IWE
        /// <summary>
        /// Converts one tab-separated column of <paramref name="inFile"/> into batched binary
        /// sequence features, processing the input in parallel.
        /// The input is split into one ".seg" part per processor via ExternalShuffle.Split; each
        /// worker writes a ".bin" (features) and a ".tsv" (the source lines of the written batches)
        /// part into a temporary directory. The ".bin" parts are merged into
        /// <paramref name="outFile"/>, which then receives a five-integer tail, and the ".tsv"
        /// parts are merged into ParameterSetting.trainPairTokzNew.
        /// </summary>
        /// <param name="inFile">Tab-separated input file handed to ExternalShuffle.Split.</param>
        /// <param name="vocFile">Vocabulary file; only read when <c>featureList.l3g</c> is set.</param>
        /// <param name="nMaxLength">Passed through to TextUtils.String2FeatStrSeq.</param>
        /// <param name="idx">Zero-based index of the tab-separated field to featurize.</param>
        /// <param name="outFile">Destination of the merged binary output and its tail.</param>
        /// <param name="BatchSize">Number of samples per written batch; also stored in the tail.</param>
        /// <param name="featureList">Flags selecting which feature families (l3g, root, infl) are extracted.</param>
        /// <exception cref="Exception">
        /// Thrown by a worker when a line has <paramref name="idx"/> or fewer fields. Because it is
        /// raised inside Parallel.For it reaches the caller wrapped in an AggregateException.
        /// </exception>
        public static void Pair2SeqFeaBin(string inFile, string vocFile, int nMaxLength, int idx, string outFile, int BatchSize, FeatureList featureList)
        {
            Vocab voc = new Vocab(false);

            if (featureList.l3g == true)
            {
                voc.Read(vocFile); voc.Lock();
            }

            int N = 3;  // letter 3-gram

            string outputDir = @"../../../../../Data/tmp/";

            if (!Directory.Exists(outputDir))
            {
                Directory.CreateDirectory(outputDir);
            }

            int    nThreads = CmpInfo.ProcessorCount;
            string suffix   = ".bin";

            //for debug
            //nThreads = 1;

            // One statistics slot per worker; each worker only touches the slot at its own id.
            List <int> nMaxFeatureNumPerBatch = Enumerable.Repeat(0, nThreads).ToList();
            List <int> nMaxFeatureDimension   = Enumerable.Repeat(0, nThreads).ToList();
            List <int> featureDimension       = Enumerable.Repeat(0, nThreads).ToList();
            List <int> nMaxSegmentSize        = Enumerable.Repeat(0, nThreads).ToList();
            List <int> nBatch                 = Enumerable.Repeat(0, nThreads).ToList();

            // The value returned by Split is not needed: the tail stores the number of samples
            // actually written, which is computed from the batch counters below.
            ExternalShuffle.Split(inFile, outputDir, nThreads);

            Parallel.For(0, nThreads, id =>
            {
                // using-blocks guarantee the part files are closed even when a line is rejected
                // or feature extraction throws.
                using (BinaryWriter bw = new BinaryWriter(File.Open(outputDir + id + suffix, FileMode.Create)))
                using (StreamWriter sw = new StreamWriter(File.Open(outputDir + id + ".tsv", FileMode.Create)))
                {
                    StringBuilder sb = new StringBuilder();
                    Batch batch      = new Batch();

                    string sLine = ""; int nLine = 0;
                    using (StreamReader sr = new StreamReader(outputDir + id + ".seg"))
                    {
                        while ((sLine = sr.ReadLine()) != null)
                        {
                            nLine++; if (nLine % 1000 == 0)
                            {
                                Console.Write("{0}\r", nLine);
                            }

                            string[] rgs = sLine.Split('\t');

                            if (rgs.Length <= idx)
                            {
                                throw new Exception("Invalid format in input file! Exactly two fields separated by tabs are expected " + sLine.ToLower());
                            }

                            // Running offset so that each enabled feature family gets its own id range.
                            int pos = 0;

                            // One (feature id -> value) map per token of the selected field.
                            List <Dictionary <int, double> > rgWfs = new List <Dictionary <int, double> >();
                            string[] words = TextUtils.TokenizeToArray(rgs[idx]);
                            for (int i = 0; i < words.Length; i++)
                            {
                                rgWfs.Add(new Dictionary <int, double>());
                            }

                            if (featureList.l3g == true)
                            {
                                var featStrFeq = TextUtils.String2FeatStrSeq(rgs[idx], N, nMaxLength, FeatureType.l3g);  // letter N-gram
                                List <Dictionary <int, double> > tmp = TextUtils.StrFreq2IdFreq(featStrFeq, voc, pos);
                                Merge(ref rgWfs, tmp);
                                pos += voc.Count;
                            }
                            if (featureList.root == true)
                            {
                                int count      = 0;
                                var featStrFeq = TextUtils.String2FeatStrSeq(rgs[idx], N, nMaxLength, FeatureType.root);  // list of root
                                List <Dictionary <int, double> > tmp = TextUtils.StrFreq2IdFreq(featStrFeq, FeatureType.root, pos, ref count);
                                Merge(ref rgWfs, tmp);
                                pos += count;
                            }
                            if (featureList.infl == true)
                            {
                                int count      = 0;
                                var featStrFeq = TextUtils.String2FeatStrSeq(rgs[idx], N, nMaxLength, FeatureType.infl);  // list of inflections
                                List <Dictionary <int, double> > tmp = TextUtils.StrFreq2IdFreq(featStrFeq, FeatureType.infl, pos, ref count);
                                Merge(ref rgWfs, tmp);
                                pos += count;
                            }

                            // binary output: flush the previous batch once it is full, before the
                            // current sample is added to the (now empty) batch.
                            if (batch.BatchSize == BatchSize)
                            {
                                if (batch.ElementSize > nMaxFeatureNumPerBatch[id])
                                {
                                    nMaxFeatureNumPerBatch[id] = batch.ElementSize;
                                }
                                // batch.FeatureDim = nMaxFeatureId;
                                batch.WriteSeqSample(bw);
                                batch.Clear();
                                sw.Write(sb);
                                sb.Clear();
                                nBatch[id]++;
                            }

                            // The line text is buffered only after the flush check so that each
                            // chunk written to the .tsv holds exactly the lines of the batch written
                            // to the .bin. Buffering it earlier put the current line, which belongs
                            // to the next batch, into the previous batch's text.
                            sb.Append(sLine + "\n");

                            featureDimension[id] = batch.LoadSeqSample(rgWfs);
                            if (featureDimension[id] > nMaxFeatureDimension[id])
                            {
                                nMaxFeatureDimension[id] = featureDimension[id];
                            }
                            if (batch.SegSize > nMaxSegmentSize[id])
                            {
                                nMaxSegmentSize[id] = batch.SegSize;
                            }
                        }
                    }
                    //Console.WriteLine("nLine");

                    // Samples still pending at end of input are discarded, not written: the tail
                    // reports nBatch * BatchSize samples, so only flushed batches may be present.
                    if (batch.BatchSize > 0)
                    {
                        batch.Clear();
                    }
                }

                File.Delete(outputDir + id + ".seg");
            });

            voc.Unlock();

            ExternalShuffle.Merge(outFile, outputDir, suffix, nThreads);

            // Tail layout: max feature dimension, sample count, max segment size,
            // max feature count per batch, batch size.
            int totalLine = nBatch.Sum() * BatchSize;
            using (BinaryWriter bwTail = new BinaryWriter(File.Open(outFile, FileMode.Append)))
            {
                bwTail.Write(nMaxFeatureDimension.Max()); bwTail.Write(totalLine); bwTail.Write(nMaxSegmentSize.Max()); bwTail.Write(nMaxFeatureNumPerBatch.Max());
                bwTail.Write(BatchSize); // part of change on 2/19/2014. Write the batch size at the end. Used to check consistency in training.
            }

            ExternalShuffle.Merge(ParameterSetting.trainPairTokzNew, outputDir, ".tsv", nThreads);
            if (Directory.Exists(outputDir))
            {
                // Non-recursive delete: presumably ExternalShuffle.Merge removes the part files
                // it consumes, otherwise this throws on a non-empty directory - TODO confirm.
                Directory.Delete(outputDir);
            }
        }
 /// <summary>
 /// Type initializer: creates the shared vocabulary, reads it from the
 /// dssm-vocab.txt file addressed relative to the working directory, and locks it.
 /// </summary>
 static DSSMHelper()
 {
     // Build the vocabulary in a local and publish it once it is loaded and locked.
     Vocab loaded = new Vocab(false);
     loaded.Read(@"..\..\..\..\..\data\dssm-vocab.txt");
     loaded.Lock();
     vocabulary = loaded;
 }