public void Train(ILabeledExampleCollection<LblT, SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    Dispose();
    int[] trainSet = new int[dataset.Count];
    int[] labels = new int[dataset.Count];
    Dictionary<LblT, int> lblToIdx = new Dictionary<LblT, int>(mLblCmp);
    MultiSet<int> lblCount = new MultiSet<int>();
    int j = 0;
    foreach (LabeledExample<LblT, SparseVector<double>> lblEx in dataset)
    {
        SparseVector<double> vec = lblEx.Example;
        int[] idx = new int[vec.Count];
        float[] val = new float[vec.Count];
        for (int i = 0; i < vec.Count; i++)
        {
            idx[i] = vec.InnerIdx[i] + 1; // SVM-Light feature indices are 1-based
            val[i] = (float)vec.InnerDat[i]; // *** cast to float
        }
        int lbl;
        if (!lblToIdx.TryGetValue(lblEx.Label, out lbl))
        {
            lblToIdx.Add(lblEx.Label, lbl = lblToIdx.Count);
            mIdxToLbl.Add(lblEx.Label);
        }
        // binary classifier: a third distinct label is an error
        Utils.ThrowException(lbl == 2 ? new ArgumentValueException("dataset") : null);
        trainSet[j++] = SvmLightLib.NewFeatureVector(idx.Length, idx, val, lbl == 0 ? 1 : -1);
        lblCount.Add(lbl == 0 ? 1 : -1);
    }
    string costFactor = "";
    if (mBiasedCostFunction)
    {
        costFactor = "-j " + ((double)lblCount.GetCount(-1) / (double)lblCount.GetCount(1));
    }
    mModelId = SvmLightLib.TrainModel(string.Format(CultureInfo.InvariantCulture,
        "-v {0} -c {1} -t {2} -g {3} -d {4} -s {5} -r {6} -b {7} -e {8} -# {9} {10} {11}",
        (int)mVerbosityLevel, mC, (int)mKernelType, mKernelParamGamma, mKernelParamD, mKernelParamS,
        mKernelParamC, mBiasedHyperplane ? 1 : 0, mEps, mMaxIter, mCustomParams, costFactor),
        trainSet.Length, trainSet);
    // delete training vectors
    foreach (int vecIdx in trainSet)
    {
        SvmLightLib.DeleteFeatureVector(vecIdx);
    }
}
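// --------------------------------------------------------------------------
// Illustrative sketch, not part of the library: Train above maps the first
// distinct label it encounters to SVM-Light target +1, the second to -1, and
// rejects a third; with mBiasedCostFunction enabled, the "-j" option is set
// to the negative-to-positive example ratio so errors on the minority class
// weigh more. The helper below reproduces that logic with plain BCL
// collections; its name and inputs are hypothetical and exist only for
// illustration.
private static string SketchLabelMappingAndCostFactor(IEnumerable<string> labels)
{
    Dictionary<string, int> lblToIdx = new Dictionary<string, int>();
    int posCount = 0, negCount = 0;
    foreach (string label in labels)
    {
        int lbl;
        if (!lblToIdx.TryGetValue(label, out lbl)) { lblToIdx.Add(label, lbl = lblToIdx.Count); }
        if (lbl == 2) { throw new ArgumentException("more than two distinct labels"); } // binary only
        if (lbl == 0) { posCount++; } else { negCount++; } // first label -> +1, second -> -1
    }
    // e.g. { "spam", "ham", "ham", "ham" } yields "-j 3" (3 negatives per positive)
    return "-j " + ((double)negCount / (double)posCount).ToString(CultureInfo.InvariantCulture);
}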
private void PrecomputeProbabilities(ILabeledExampleCollection<LblT, BinaryVector> dataset)
{
    mFeaturePriors = new Dictionary<int, double>();
    ArrayList<LblT> tmp = new ArrayList<LblT>();
    Dictionary<LblT, int> lblToIdx = new Dictionary<LblT, int>(mLblCmp);
    foreach (LabeledExample<LblT, BinaryVector> labeledExample in dataset)
    {
        if (!lblToIdx.ContainsKey(labeledExample.Label))
        {
            lblToIdx.Add(labeledExample.Label, lblToIdx.Count);
            tmp.Add(labeledExample.Label);
        }
    }
    // prepare counters
    mExampleCount = new int[tmp.Count];
    mFeatureProb = new Dictionary<int, double>[tmp.Count];
    for (int j = 0; j < mFeatureProb.Length; j++)
    {
        mFeatureProb[j] = new Dictionary<int, double>();
    }
    MultiSet<int> featureCounter = new MultiSet<int>();
    // count features
    int i = 0;
    foreach (LabeledExample<LblT, BinaryVector> labeledExample in dataset)
    {
        mLogger.ProgressFast(Logger.Level.Info, /*sender=*/this, "PrecomputeProbabilities",
            "Processing example {0} / {1}", ++i, dataset.Count);
        int lblIdx = lblToIdx[labeledExample.Label];
        mExampleCount[lblIdx]++;
        double val;
        foreach (int idx in labeledExample.Example)
        {
            featureCounter.Add(idx);
            if (mFeatureProb[lblIdx].TryGetValue(idx, out val))
            {
                mFeatureProb[lblIdx][idx] = val + 1;
            }
            else
            {
                mFeatureProb[lblIdx].Add(idx, 1);
            }
        }
    }
    // estimate probabilities
    i = 0;
    foreach (Dictionary<int, double> probVec in mFeatureProb)
    {
        foreach (int featIdx in new ArrayList<int>(probVec.Keys))
        {
            double p0 = ((double)featureCounter.GetCount(featIdx) + 1.0) / ((double)dataset.Count + 2.0); // rule of succession (feature prior)
            double p = (probVec[featIdx] + 2.0 * p0) / ((double)mExampleCount[i] + 2.0); // m-estimate (m = 2)
            probVec[featIdx] = p;
            if (!mFeaturePriors.ContainsKey(featIdx))
            {
                mFeaturePriors.Add(featIdx, p0);
            }
        }
        i++;
    }
    mIdxToLbl = tmp.ToArray();
}
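// --------------------------------------------------------------------------
// Illustrative sketch, not part of the library: the probability estimation
// above first derives a feature prior p0 via the rule of succession,
// p0 = (totalFeatureCount + 1) / (datasetSize + 2), and then smooths the
// per-class relative frequency with an m-estimate (m = 2, prior p0):
// p = (classFeatureCount + 2 * p0) / (classExampleCount + 2). The helper
// below just restates those two formulas; its name and parameters are
// hypothetical.
private static double SketchMEstimate(int classFeatureCount, int classExampleCount, int totalFeatureCount, int datasetSize)
{
    double p0 = ((double)totalFeatureCount + 1.0) / ((double)datasetSize + 2.0); // rule of succession (feature prior)
    return ((double)classFeatureCount + 2.0 * p0) / ((double)classExampleCount + 2.0); // m-estimate (m = 2)
}
// Example: a feature seen 30 times across 100 examples overall (p0 ~ 0.304)
// and 5 times within 20 examples of one class gives p ~ (5 + 0.608) / 22 ~ 0.255.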
static void Main(string[] args)
{
    MultiSet<string> urlCount = new MultiSet<string>();
    MultiSet<string> domainCount = new MultiSet<string>();
    Dictionary<string, Set<string>> domainToUrlMapping = new Dictionary<string, Set<string>>();
    Dictionary<string, Dictionary<string, Set<string>>> data = new Dictionary<string, Dictionary<string, Set<string>>>();
    Dictionary<string, Dictionary<string, Set<string>>> domainData = new Dictionary<string, Dictionary<string, Set<string>>>();
    // pass 1: collect query-parameter values per URL and per domain
    using (SqlConnection connection = new SqlConnection(Utils.GetConfigValue("DbConnectionString")))
    {
        connection.Open();
        using (SqlCommand cmd = new SqlCommand(@"SELECT name, responseUrl FROM Documents", connection))
        {
            cmd.CommandTimeout = 0;
            using (SqlDataReader reader = cmd.ExecuteReader())
            {
                //foreach (string fileName in Directory.GetFiles(Utils.GetConfigValue("DataFolder", ".").TrimEnd('\\'), "*.xml.gz", SearchOption.AllDirectories))
                while (reader.Read())
                {
                    //Console.WriteLine(fileName);
                    //Document doc = new Document("", "");
                    //doc.ReadXmlCompressed(fileName);
                    //Console.WriteLine(doc.Name);
                    Console.WriteLine(reader.GetValue<string>("name"));
                    //string url = doc.Features.GetFeatureValue("responseUrl");
                    string url = reader.GetValue<string>("responseUrl");
                    //Console.WriteLine(url);
                    string left;
                    ArrayList<string> path;
                    ArrayList<KeyDat<string, string>> qParsed;
                    ParseUrl(url, out left, out path, out qParsed);
                    string urlKey = UrlAsString(left, path, qParsed, new Set<string>());
                    urlCount.Add(urlKey);
                    domainCount.Add(left);
                    if (!domainToUrlMapping.ContainsKey(left)) { domainToUrlMapping.Add(left, new Set<string>()); }
                    domainToUrlMapping[left].Add(urlKey);
                    if (!data.ContainsKey(urlKey)) { data.Add(urlKey, new Dictionary<string, Set<string>>()); }
                    if (!domainData.ContainsKey(left)) { domainData.Add(left, new Dictionary<string, Set<string>>()); }
                    Dictionary<string, Set<string>> urlInfo = data[urlKey];
                    Dictionary<string, Set<string>> domainInfo = domainData[left];
                    foreach (KeyDat<string, string> item in qParsed)
                    {
                        //Console.WriteLine(item.Key + "=" + item.Dat);
                        if (!urlInfo.ContainsKey(item.Key)) { urlInfo.Add(item.Key, new Set<string>()); }
                        urlInfo[item.Key].Add(item.Dat);
                        if (!domainInfo.ContainsKey(item.Key)) { domainInfo.Add(item.Key, new Set<string>()); }
                        domainInfo[item.Key].Add(item.Dat);
                    }
                }
            }
        }
    }
    // pass 2: report domains whose (non-excluded) query parameters take more than one value
    Set<string> paramShitList = new Set<string>(Utils.GetConfigValue("ExcludeUrlArgs",
        "utm_campaign,feedName,mod,rss_id,comment,commentid,partner").Split(','));
    StreamWriter w = new StreamWriter(Utils.GetConfigValue("OutputFileName", "reportDomains.txt"));
    foreach (KeyValuePair<string, Dictionary<string, Set<string>>> item in domainData)
    {
        bool found = false;
        foreach (KeyValuePair<string, Set<string>> paramInfo in item.Value)
        {
            if (paramInfo.Value.Count > 1 && !paramShitList.Contains(paramInfo.Key.ToLower())) { found = true; break; }
        }
        if (found)
        {
            bool __found = false;
            StringBuilder s = new StringBuilder();
            s.AppendLine("********************** Domain Info **********************");
            s.AppendLine();
            s.AppendLine(item.Key + " (" + domainCount.GetCount(item.Key) + ")");
            foreach (KeyValuePair<string, Set<string>> paramInfo in item.Value)
            {
                if (!paramShitList.Contains(paramInfo.Key) && paramInfo.Value.Count > 1)
                {
                    s.AppendLine("\t" + paramInfo.Key + "\t" + paramInfo.Value.Count + "\t" + paramInfo.Value);
                }
            }
            s.AppendLine();
            s.AppendLine("*** Details ***");
            s.AppendLine();
            foreach (string url in domainToUrlMapping[item.Key])
            {
                bool _found = false;
                foreach (KeyValuePair<string, Set<string>> paramInfo in data[url])
                {
                    if (paramInfo.Value.Count > 1 && !paramShitList.Contains(paramInfo.Key)) { _found = true; break; }
                }
                if (_found)
                {
                    __found = true;
                    s.AppendLine(url + " (" + urlCount.GetCount(url) + ")");
                    foreach (KeyValuePair<string, Set<string>> paramInfo in data[url])
                    {
                        if (paramInfo.Value.Count > 1)
                        {
                            s.AppendLine("\t" + paramInfo.Key + "\t" + paramInfo.Value.Count + "\t" + paramInfo.Value);
                        }
                    }
                    s.AppendLine();
                }
            }
            s.AppendLine();
            if (__found) { w.Write(s.ToString()); }
        }
    }
    w.Close();
}
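// --------------------------------------------------------------------------
// Illustrative sketch, not part of this program: ParseUrl and UrlAsString are
// defined elsewhere and their exact behavior is not shown here. A rough
// approximation of the split the report relies on (scheme+host on the left,
// path segments, and key/value query parameters) can be obtained with
// System.Uri, as below. The method name, its output shape, and the manual
// query splitting are assumptions made only for illustration.
static void SketchParseUrl(string url, out string left, out List<string> path, out List<KeyValuePair<string, string>> qParsed)
{
    Uri uri = new Uri(url);
    left = uri.GetLeftPart(UriPartial.Authority); // e.g. "http://example.com"
    path = new List<string>(uri.AbsolutePath.Split(new[] { '/' }, StringSplitOptions.RemoveEmptyEntries));
    qParsed = new List<KeyValuePair<string, string>>();
    foreach (string pair in uri.Query.TrimStart('?').Split(new[] { '&' }, StringSplitOptions.RemoveEmptyEntries))
    {
        string[] kv = pair.Split(new[] { '=' }, 2); // split into at most key and value
        qParsed.Add(new KeyValuePair<string, string>(kv[0], kv.Length > 1 ? kv[1] : ""));
    }
}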