/// <summary>
/// Converts one tab-separated column of <paramref name="inFile"/> into batched binary
/// sequence features, processing the input as <c>CmpInfo.ProcessorCount</c> shards in parallel.
/// </summary>
/// <remarks>
/// Steps, as performed by this method:
/// 1. <c>ExternalShuffle.Split</c> splits the input into per-shard ".seg" files in a temp directory.
/// 2. Each shard is featurized line by line; every time a batch reaches <paramref name="BatchSize"/>
///    samples it is written to the shard's ".bin" file and the matching source lines are written
///    to the shard's ".tsv" file.
/// 3. <c>ExternalShuffle.Merge</c> combines the ".bin" shards into <paramref name="outFile"/>, and
///    five Int32 values are appended to it: max feature dimension, total sample count
///    (batches * BatchSize), max segment size, max feature count per batch, and BatchSize.
/// 4. The ".tsv" shards are merged into <c>ParameterSetting.trainPairTokzNew</c>.
/// Samples left in an unflushed batch at the end of a shard are discarded.
/// </remarks>
/// <param name="inFile">Input text file handed to <c>ExternalShuffle.Split</c>.</param>
/// <param name="vocFile">Vocabulary file; only read when <c>featureList.l3g</c> is true.</param>
/// <param name="nMaxLength">Forwarded unchanged to <c>TextUtils.String2FeatStrSeq</c>.</param>
/// <param name="idx">Zero-based index of the tab-separated column to featurize.</param>
/// <param name="outFile">Destination of the merged binary output and its trailer.</param>
/// <param name="BatchSize">Number of samples a batch must hold before it is written.</param>
/// <param name="featureList">Selects which feature families (l3g, root, infl) are extracted.</param>
/// <exception cref="InvalidDataException">
/// An input line has no column at <paramref name="idx"/>. Because it is raised inside
/// <c>Parallel.For</c>, callers observe it wrapped in an <c>AggregateException</c>.
/// </exception>
public static void Pair2SeqFeaBin(string inFile, string vocFile, int nMaxLength, int idx, string outFile, int BatchSize, FeatureList featureList)
{
    const int N = 3; // letter 3-gram
    const string outputDir = @"../../../../../Data/tmp/";
    const string suffix = ".bin";

    Vocab voc = new Vocab(false);
    if (featureList.l3g == true)
    {
        voc.Read(vocFile);
        voc.Lock();
    }

    if (!Directory.Exists(outputDir))
    {
        Directory.CreateDirectory(outputDir);
    }

    // For debugging, force this to 1 to process a single shard sequentially.
    int nThreads = CmpInfo.ProcessorCount;

    // One slot per shard. Each parallel worker writes only its own slot, so plain arrays
    // are sufficient and avoid concurrent mutation of a shared List<int>.
    int[] nMaxFeatureNumPerBatch = new int[nThreads];
    int[] nMaxFeatureDimension = new int[nThreads];
    int[] nMaxSegmentSize = new int[nThreads];
    int[] nBatch = new int[nThreads];

    ExternalShuffle.Split(inFile, outputDir, nThreads);

    Parallel.For(0, nThreads, id =>
    {
        string segPath = outputDir + id + ".seg";

        int maxFeatureNumPerBatch = 0;
        int maxFeatureDimension = 0;
        int maxSegmentSize = 0;
        int batchesWritten = 0;

        // `using` guarantees the shard files are closed even when a malformed line throws.
        using (BinaryWriter bw = new BinaryWriter(File.Open(outputDir + id + suffix, FileMode.Create)))
        using (StreamWriter sw = new StreamWriter(File.Open(outputDir + id + ".tsv", FileMode.Create)))
        using (StreamReader sr = new StreamReader(segPath))
        {
            StringBuilder sb = new StringBuilder();
            Batch batch = new Batch();
            string sLine;
            int nLine = 0;

            while ((sLine = sr.ReadLine()) != null)
            {
                nLine++;
                if (nLine % 1000 == 0)
                {
                    Console.Write("{0}\r", nLine);
                }

                string[] rgs = sLine.Split('\t');
                if (rgs.Length <= idx)
                {
                    throw new InvalidDataException("Invalid format in input file! Exactly two fields separated by tabs are expected " + sLine.ToLower());
                }

                // One feature dictionary per token; each enabled feature family is merged
                // into these, offset by `pos` so the families occupy disjoint id ranges.
                int pos = 0;
                List<Dictionary<int, double>> rgWfs = new List<Dictionary<int, double>>();
                string[] words = TextUtils.TokenizeToArray(rgs[idx]);
                for (int i = 0; i < words.Length; i++)
                {
                    rgWfs.Add(new Dictionary<int, double>());
                }

                if (featureList.l3g == true)
                {
                    // letter N-gram
                    var featStrFeq = TextUtils.String2FeatStrSeq(rgs[idx], N, nMaxLength, FeatureType.l3g);
                    List<Dictionary<int, double>> tmp = TextUtils.StrFreq2IdFreq(featStrFeq, voc, pos);
                    Merge(ref rgWfs, tmp);
                    pos += voc.Count;
                }

                if (featureList.root == true)
                {
                    // list of root
                    int count = 0;
                    var featStrFeq = TextUtils.String2FeatStrSeq(rgs[idx], N, nMaxLength, FeatureType.root);
                    List<Dictionary<int, double>> tmp = TextUtils.StrFreq2IdFreq(featStrFeq, FeatureType.root, pos, ref count);
                    Merge(ref rgWfs, tmp);
                    pos += count;
                }

                if (featureList.infl == true)
                {
                    // list of inflections
                    int count = 0;
                    var featStrFeq = TextUtils.String2FeatStrSeq(rgs[idx], N, nMaxLength, FeatureType.infl);
                    List<Dictionary<int, double>> tmp = TextUtils.StrFreq2IdFreq(featStrFeq, FeatureType.infl, pos, ref count);
                    Merge(ref rgWfs, tmp);
                    pos += count;
                }

                // Binary output: flush the previous batch once it is full, before the
                // current sample is loaded into a fresh one.
                if (batch.BatchSize == BatchSize)
                {
                    if (batch.ElementSize > maxFeatureNumPerBatch)
                    {
                        maxFeatureNumPerBatch = batch.ElementSize;
                    }

                    batch.WriteSeqSample(bw);
                    batch.Clear();
                    sw.Write(sb.ToString());
                    sb.Clear();
                    batchesWritten++;
                }

                // Buffer the source line only after the flush above, so that each flush
                // writes exactly the lines of the batch written to the binary file.
                sb.Append(sLine).Append('\n');

                int featureDimension = batch.LoadSeqSample(rgWfs);
                if (featureDimension > maxFeatureDimension)
                {
                    maxFeatureDimension = featureDimension;
                }

                if (batch.SegSize > maxSegmentSize)
                {
                    maxSegmentSize = batch.SegSize;
                }
            }

            // Whatever is still buffered is dropped rather than written, so every batch in
            // the output holds exactly BatchSize samples.
            // NOTE(review): this also drops a final batch that is exactly full at end of
            // input - confirm that is intended.
            if (batch.BatchSize > 0)
            {
                batch.Clear();
            }
        }

        nMaxFeatureNumPerBatch[id] = maxFeatureNumPerBatch;
        nMaxFeatureDimension[id] = maxFeatureDimension;
        nMaxSegmentSize[id] = maxSegmentSize;
        nBatch[id] = batchesWritten;

        File.Delete(segPath);
    });

    voc.Unlock();

    ExternalShuffle.Merge(outFile, outputDir, suffix, nThreads);

    using (BinaryWriter bwTail = new BinaryWriter(File.Open(outFile, FileMode.Append)))
    {
        int totalLine = nBatch.Sum() * BatchSize;
        bwTail.Write(nMaxFeatureDimension.Max());
        bwTail.Write(totalLine);
        bwTail.Write(nMaxSegmentSize.Max());
        bwTail.Write(nMaxFeatureNumPerBatch.Max());
        // Part of change on 2/19/2014: write the batch size at the end.
        // Used to check consistency in training.
        bwTail.Write(BatchSize);
    }

    ExternalShuffle.Merge(ParameterSetting.trainPairTokzNew, outputDir, ".tsv", nThreads);

    // Non-recursive delete: this throws if the directory still contains files.
    // NOTE(review): presumably ExternalShuffle.Merge removes the shard files - confirm.
    if (Directory.Exists(outputDir))
    {
        Directory.Delete(outputDir);
    }
}
/// <summary>
/// Type initializer: creates the shared vocabulary, reads it from a fixed path
/// relative to the working directory, and then locks it.
/// </summary>
static DSSMHelper()
{
    // Resolved against the process working directory, not the assembly location.
    const string vocabularyPath = @"..\..\..\..\..\data\dssm-vocab.txt";

    vocabulary = new Vocab(false);
    vocabulary.Read(vocabularyPath);

    // NOTE(review): Lock() presumably freezes the vocabulary against further additions - confirm against Vocab.
    vocabulary.Lock();
}