/// <summary>
/// Command-line entry point. Dispatches on the first argument:
/// --pair2seqfea (currently disabled), --seqfea2bin, or --shuffle;
/// anything else (including no arguments) prints the usage help.
/// </summary>
/// <param name="args">Raw command-line arguments; args[0] is the sub-command.</param>
public static void Main(string[] args)
{
    try
    {
        if (args.Length < 1)
        {
            DispHelp();
            return;
        }

        // Compare the command case-insensitively with StringComparison instead of
        // allocating a lowered copy per branch (ToLower-for-comparison anti-pattern, CA1308).
        string command = args[0];

        if (command.Equals("--pair2seqfea", StringComparison.OrdinalIgnoreCase) && args.Length == 7)
        {
            // Intentionally disabled; preserved for reference.
            //Pair2SeqFea(args[1], args[2], args[3], int.Parse(args[4]), args[5], args[6]); //update by Shelson, Jan. 18
        }
        else if (command.Equals("--seqfea2bin", StringComparison.OrdinalIgnoreCase) && args.Length == 4)
        {
            SeqFea2Bin(args[1], int.Parse(args[2]), args[3]);
        }
        else if (command.Equals("--shuffle", StringComparison.OrdinalIgnoreCase) && (args.Length == 3 || args.Length == 4))
        {
            // Third shuffle argument is optional.
            ExternalShuffle.Shuffle(args[1], args[2], args.Length == 4 ? args[3] : null);
        }
        else
        {
            DispHelp();
        }
    }
    catch (Exception exc)
    {
        // Report the failure but keep the original exit behavior
        // (Environment.Exit was already commented out upstream).
        Console.Error.WriteLine(exc.ToString());
        //Environment.Exit(0);
    }
}
/// <summary>
/// Converts a tab-separated pair file into a binary sequence-feature file.
/// The input is split into one shard per processor, each shard is featurized and
/// batched in parallel, and the shard outputs are merged into <paramref name="outFile"/>
/// with a statistics trailer appended at the end.
/// </summary>
/// <param name="inFile">Input pair file; field <paramref name="idx"/> of each tab-separated line is featurized.</param>
/// <param name="vocFile">Letter-trigram vocabulary file; only read when featureList.l3g is set.</param>
/// <param name="nMaxLength">Maximum sequence length passed to the featurizers.</param>
/// <param name="idx">Zero-based index of the tab-separated field to featurize.</param>
/// <param name="outFile">Output binary file.</param>
/// <param name="BatchSize">Samples per batch; only full batches are written (see note below).</param>
/// <param name="featureList">Flags selecting the feature families to extract (l3g / root / infl).</param>
/// <exception cref="Exception">Thrown when an input line has fewer than idx+1 tab-separated fields.</exception>
public static void Pair2SeqFeaBin(string inFile, string vocFile, int nMaxLength, int idx, string outFile, int BatchSize, FeatureList featureList)
{
    Vocab voc = new Vocab(false);
    if (featureList.l3g == true)
    {
        voc.Read(vocFile);
        voc.Lock();
    }
    int N = 3; // letter 3-gram
    string outputDir = @"../../../../../Data/tmp/";
    if (!Directory.Exists(outputDir))
    {
        Directory.CreateDirectory(outputDir);
    }
    int nThreads = CmpInfo.ProcessorCount;
    string suffix = ".bin";
    //for debug
    //nThreads = 1;

    // Per-thread statistics; each Parallel.For body writes only its own slot (index id),
    // so no synchronization is needed.
    List<int> nMaxFeatureNumPerBatch = new List<int>();
    List<int> nMaxFeatureDimension = new List<int>();
    List<int> featureDimension = new List<int>();
    List<int> nMaxSegmentSize = new List<int>();
    List<int> nBatch = new List<int>();
    for (int i = 0; i < nThreads; i++)
    {
        nMaxFeatureNumPerBatch.Add(0);
        nMaxFeatureDimension.Add(0);
        featureDimension.Add(0);
        nMaxSegmentSize.Add(0);
        nBatch.Add(0);
    }

    int totalLine = ExternalShuffle.Split(inFile, outputDir, nThreads);
    Parallel.For(0, nThreads, id =>
    {
        // using blocks guarantee the writers are flushed/closed even if a line throws
        // (the original leaked both handles on the exception path).
        using (BinaryWriter bw = new BinaryWriter(File.Open(outputDir + id + suffix, FileMode.Create)))
        using (StreamWriter sw = new StreamWriter(File.Open(outputDir + id + ".tsv", FileMode.Create)))
        {
            StringBuilder sb = new StringBuilder();
            Batch batch = new Batch();
            string sLine = "";
            int nLine = 0;
            using (StreamReader sr = new StreamReader(outputDir + id + ".seg"))
            {
                while ((sLine = sr.ReadLine()) != null)
                {
                    nLine++;
                    if (nLine % 1000 == 0)
                    {
                        Console.Write("{0}\r", nLine);
                    }
                    sb.Append(sLine + "\n");
                    string[] rgs = sLine.Split('\t');
                    if (rgs.Length <= idx)
                    {
                        // Message rejoined onto one line: the original literal was broken
                        // across a raw newline, which is invalid in a non-verbatim string.
                        throw new Exception("Invalid format in input file! Exactly two fields separated by tabs are expected " + sLine.ToLower());
                    }
                    int pos = 0;
                    List<Dictionary<int, double>> rgWfs = new List<Dictionary<int, double>>();
                    string[] words = TextUtils.TokenizeToArray(rgs[idx]);
                    for (int i = 0; i < words.Length; i++)
                    {
                        rgWfs.Add(new Dictionary<int, double>());
                    }
                    // Each enabled feature family appends its ids after the previous one;
                    // pos tracks the running id offset.
                    if (featureList.l3g == true)
                    {
                        var featStrFeq = TextUtils.String2FeatStrSeq(rgs[idx], N, nMaxLength, FeatureType.l3g); // letter N-gram
                        List<Dictionary<int, double>> tmp = TextUtils.StrFreq2IdFreq(featStrFeq, voc, pos);
                        Merge(ref rgWfs, tmp);
                        pos += voc.Count;
                    }
                    if (featureList.root == true)
                    {
                        int count = 0;
                        var featStrFeq = TextUtils.String2FeatStrSeq(rgs[idx], N, nMaxLength, FeatureType.root); // list of root
                        List<Dictionary<int, double>> tmp = TextUtils.StrFreq2IdFreq(featStrFeq, FeatureType.root, pos, ref count);
                        Merge(ref rgWfs, tmp);
                        pos += count;
                    }
                    if (featureList.infl == true)
                    {
                        int count = 0;
                        var featStrFeq = TextUtils.String2FeatStrSeq(rgs[idx], N, nMaxLength, FeatureType.infl); // list of inflections
                        List<Dictionary<int, double>> tmp = TextUtils.StrFreq2IdFreq(featStrFeq, FeatureType.infl, pos, ref count);
                        Merge(ref rgWfs, tmp);
                        pos += count;
                    }
                    // binary output: flush a full batch (and its mirrored .tsv lines) before
                    // loading the current sample.
                    if (batch.BatchSize == BatchSize)
                    {
                        if (batch.ElementSize > nMaxFeatureNumPerBatch[id])
                        {
                            nMaxFeatureNumPerBatch[id] = batch.ElementSize;
                        }
                        // batch.FeatureDim = nMaxFeatureId;
                        batch.WriteSeqSample(bw);
                        batch.Clear();
                        sw.Write(sb);
                        sb = new StringBuilder();
                        nBatch[id]++;
                    }
                    featureDimension[id] = batch.LoadSeqSample(rgWfs);
                    if (featureDimension[id] > nMaxFeatureDimension[id])
                    {
                        nMaxFeatureDimension[id] = featureDimension[id];
                    }
                    if (batch.SegSize > nMaxSegmentSize[id])
                    {
                        nMaxSegmentSize[id] = batch.SegSize;
                    }
                }
            }
            // NOTE(review): a trailing partial batch is discarded, not written. This is
            // consistent with the trailer below (totalLine = full batches * BatchSize),
            // but differs from SeqFea2Bin, which flushes its tail — confirm intentional.
            if (batch.BatchSize > 0)
            {
                batch.Clear();
            }
        }
        File.Delete(outputDir + id + ".seg");
    });
    voc.Unlock();

    ExternalShuffle.Merge(outFile, outputDir, suffix, nThreads);
    // Append the trailer used by the trainer for consistency checks.
    using (BinaryWriter bwTail = new BinaryWriter(File.Open(outFile, FileMode.Append)))
    {
        totalLine = nBatch.Sum() * BatchSize;
        bwTail.Write(nMaxFeatureDimension.Max());
        bwTail.Write(totalLine);
        bwTail.Write(nMaxSegmentSize.Max());
        bwTail.Write(nMaxFeatureNumPerBatch.Max());
        bwTail.Write(BatchSize); // part of change on 2/19/2014. Write the batch size at the end. Used to check consistency in training.
    }
    ExternalShuffle.Merge(ParameterSetting.trainPairTokzNew, outputDir, ".tsv", nThreads);
    // NOTE(review): non-recursive delete throws if any file is left behind in outputDir —
    // relies on both Merge calls consuming every shard file. Confirm, or pass recursive: true.
    if (Directory.Exists(outputDir))
    {
        Directory.Delete(outputDir);
    }
}
/// <summary>
/// Converts a sequence-feature text file to a binary batch file. The input is split
/// into one shard per processor, each shard is batched in parallel, and the shard
/// outputs are merged into <paramref name="outFile"/> with a statistics trailer appended.
/// </summary>
/// <param name="inFile">Input seq fea file (one sample matrix per line).</param>
/// <param name="BatchSize">Number of samples per batch; a trailing partial batch is also written.</param>
/// <param name="outFile">Output bin file.</param>
static void SeqFea2Bin(string inFile, int BatchSize, string outFile)
{
    // Path.GetFileName handles both '/' and '\' separators; the original split on '/'
    // only, so a Windows-style path would have produced a bogus shard suffix.
    string suffix = Path.GetFileName(inFile);
    string outputDir = @"../../../../../Data/tmp/";
    int nThreads = CmpInfo.ProcessorCount;
    int totalLine = ExternalShuffle.Split(inFile, outputDir, nThreads);

    // Per-thread statistics; each Parallel.For body writes only its own slot (index id),
    // so no synchronization is needed.
    List<int> nMaxFeatureNumPerBatch = new List<int>();
    List<int> nMaxFeatureDimension = new List<int>();
    List<int> featureDimension = new List<int>();
    List<int> nMaxSegmentSize = new List<int>();
    List<int> nLine = new List<int>();
    for (int i = 0; i < nThreads; i++)
    {
        nMaxFeatureNumPerBatch.Add(0);
        nMaxFeatureDimension.Add(0);
        featureDimension.Add(0);
        nMaxSegmentSize.Add(0);
        nLine.Add(0);
    }

    Parallel.For(0, nThreads, id =>
    {
        // using guarantees the writer is flushed/closed even if a line throws
        // (the original leaked the handle on the exception path).
        using (BinaryWriter bw = new BinaryWriter(File.Open(outputDir + id + suffix, FileMode.Create)))
        {
            string sLine = "";
            List<Dictionary<int, double>> rgWfs = new List<Dictionary<int, double>>();
            Batch batch = new Batch();
            using (StreamReader sr = new StreamReader(outputDir + id + ".seg"))
            {
                while ((sLine = sr.ReadLine()) != null)
                {
                    nLine[id]++;
                    if (nLine[id] % 10000 == 0)
                    {
                        Console.Write("{0}\r", nLine[id]);
                    }
                    rgWfs = TextUtils.String2Matrix(sLine.Trim());
                    // binary output: flush a full batch before loading the current sample.
                    if (batch.BatchSize == BatchSize)
                    {
                        if (batch.ElementSize > nMaxFeatureNumPerBatch[id])
                        {
                            nMaxFeatureNumPerBatch[id] = batch.ElementSize;
                        }
                        // batch.FeatureDim = nMaxFeatureId;
                        batch.WriteSeqSample(bw);
                        batch.Clear();
                    }
                    featureDimension[id] = batch.LoadSeqSample(rgWfs);
                    if (featureDimension[id] > nMaxFeatureDimension[id])
                    {
                        nMaxFeatureDimension[id] = featureDimension[id];
                    }
                    if (batch.SegSize > nMaxSegmentSize[id])
                    {
                        nMaxSegmentSize[id] = batch.SegSize;
                    }
                }
            }
            // binary output: unlike Pair2SeqFeaBin, the trailing partial batch IS written,
            // because every input line counts toward totalLine below.
            if (batch.BatchSize > 0)
            {
                if (batch.ElementSize > nMaxFeatureNumPerBatch[id])
                {
                    nMaxFeatureNumPerBatch[id] = batch.ElementSize;
                }
                // batch.FeatureDim = nMaxFeatureId;
                batch.WriteSeqSample(bw);
                batch.Clear();
            }
        }
        File.Delete(outputDir + id + ".seg");
    });

    ExternalShuffle.Merge(outFile, outputDir, suffix, nThreads);
    // Append the trailer used by the trainer for consistency checks.
    using (BinaryWriter bwTail = new BinaryWriter(File.Open(outFile, FileMode.Append)))
    {
        totalLine = nLine.Sum();
        bwTail.Write(nMaxFeatureDimension.Max());
        bwTail.Write(totalLine);
        bwTail.Write(nMaxSegmentSize.Max());
        bwTail.Write(nMaxFeatureNumPerBatch.Max());
        bwTail.Write(BatchSize); // part of change on 2/19/2014. Write the batch size at the end. Used to check consistency in training.
    }
}