Esempio n. 1
0
        // Trains a two-class model through SvmLightLib from the given labeled sparse vectors.
        //
        // Labels are mapped to internal indices in order of first appearance: the first distinct
        // label becomes the positive class (+1), the second one the negative class (-1). A third
        // distinct label is rejected with an ArgumentValueException. Dispose() is invoked before
        // training starts (presumably to release a previously trained model -- confirm against
        // Dispose()).
        //
        // dataset: the training examples; must be non-null and non-empty.
        public void Train(ILabeledExampleCollection <LblT, SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
            Dispose();
            int[] trainSet = new int[dataset.Count];
            Dictionary <LblT, int> lblToIdx = new Dictionary <LblT, int>(mLblCmp);
            MultiSet <int>         lblCount = new MultiSet <int>();
            int j = 0; // number of feature vectors created so far (and thus owned by this call)

            try
            {
                foreach (LabeledExample <LblT, SparseVector <double> > lblEx in dataset)
                {
                    SparseVector <double> vec = lblEx.Example;
                    int[]   idx = new int[vec.Count];
                    float[] val = new float[vec.Count];
                    for (int i = 0; i < vec.Count; i++)
                    {
                        idx[i] = vec.InnerIdx[i] + 1;     // feature indices are passed on shifted by one
                        val[i] = (float)vec.InnerDat[i]; // *** cast to float
                    }
                    int lbl;
                    if (!lblToIdx.TryGetValue(lblEx.Label, out lbl))
                    {
                        lblToIdx.Add(lblEx.Label, lbl = lblToIdx.Count);
                        mIdxToLbl.Add(lblEx.Label);
                    }
                    // index 2 means a third distinct label was just encountered; only two classes are supported
                    Utils.ThrowException(lbl == 2 ? new ArgumentValueException("dataset") : null);
                    int target = lbl == 0 ? 1 : -1;
                    trainSet[j] = SvmLightLib.NewFeatureVector(idx.Length, idx, val, target);
                    j++;
                    lblCount.Add(target);
                }
                string costFactor = "";

                if (mBiasedCostFunction)
                {
                    // ratio of negative to positive examples; formatted with the invariant culture so that
                    // the decimal separator is always '.', like the rest of the parameter string
                    double ratio = (double)lblCount.GetCount(-1) / (double)lblCount.GetCount(1);
                    costFactor = "-j " + ratio.ToString(CultureInfo.InvariantCulture);
                }
                mModelId = SvmLightLib.TrainModel(string.Format(CultureInfo.InvariantCulture, "-v {0} -c {1} -t {2} -g {3} -d {4} -s {5} -r {6} -b {7} -e {8} -# {9} {10} {11}",
                                                                (int)mVerbosityLevel, mC, (int)mKernelType, mKernelParamGamma, mKernelParamD, mKernelParamS, mKernelParamC, mBiasedHyperplane ? 1 : 0,
                                                                mEps, mMaxIter, mCustomParams, costFactor), trainSet.Length, trainSet);
            }
            finally
            {
                // delete training vectors (also when validation or training failed, so that the
                // vectors created up to that point are not leaked)
                for (int i = 0; i < j; i++)
                {
                    SvmLightLib.DeleteFeatureVector(trainSet[i]);
                }
            }
        }
Esempio n. 2
0
        // Fills mFeaturePriors, mExampleCount, mFeatureProb and mIdxToLbl from the dataset.
        //
        // Labels get consecutive indices in order of first appearance. For every label, the number
        // of examples and the per-feature occurrence counts are collected; the counts are then
        // turned into smoothed estimates: the feature prior uses the rule of succession over the
        // whole dataset, and the per-label value is an m-estimate (m = 2) that uses this prior.
        private void PrecomputeProbabilities(ILabeledExampleCollection <LblT, BinaryVector> dataset)
        {
            mFeaturePriors = new Dictionary <int, double>();
            ArrayList <LblT>       labels      = new ArrayList <LblT>();
            Dictionary <LblT, int> labelToSlot = new Dictionary <LblT, int>(mLblCmp);

            // pass 1: register every distinct label
            foreach (LabeledExample <LblT, BinaryVector> example in dataset)
            {
                LblT label = example.Label;
                if (labelToSlot.ContainsKey(label))
                {
                    continue;
                }
                labelToSlot.Add(label, labelToSlot.Count);
                labels.Add(label);
            }

            // allocate one example counter and one feature table per label
            int numLabels = labels.Count;
            mExampleCount = new int[numLabels];
            mFeatureProb  = new Dictionary <int, double> [numLabels];
            for (int k = 0; k < mFeatureProb.Length; k++)
            {
                mFeatureProb[k] = new Dictionary <int, double>();
            }

            // pass 2: count examples per label and feature occurrences (per label and overall)
            MultiSet <int> overallFeatureCount = new MultiSet <int>();
            int processed = 0;
            foreach (LabeledExample <LblT, BinaryVector> example in dataset)
            {
                processed++;
                mLogger.ProgressFast(Logger.Level.Info, /*sender=*/ this, "PrecomputeProbabilities", "Processing example {0} / {1}", processed, dataset.Count);
                int slot = labelToSlot[example.Label];
                mExampleCount[slot] += 1;
                Dictionary <int, double> slotCounts = mFeatureProb[slot];
                foreach (int feature in example.Example)
                {
                    overallFeatureCount.Add(feature);
                    double seen;
                    slotCounts.TryGetValue(feature, out seen); // leaves 0 in 'seen' when the feature is new
                    slotCounts[feature] = seen + 1;
                }
            }

            // pass 3: replace the raw counts with smoothed estimates
            for (int slot = 0; slot < mFeatureProb.Length; slot++)
            {
                Dictionary <int, double> slotProbs = mFeatureProb[slot];
                // iterate over a snapshot of the keys because the values are overwritten in the loop
                ArrayList <int> features = new ArrayList <int>(slotProbs.Keys);
                foreach (int feature in features)
                {
                    // rule of succession (feature prior)
                    double prior = ((double)overallFeatureCount.GetCount(feature) + 1.0) / ((double)dataset.Count + 2.0);
                    // m-estimate (m = 2)
                    double estimate = (slotProbs[feature] + 2.0 * prior) / ((double)mExampleCount[slot] + 2.0);
                    slotProbs[feature] = estimate;
                    if (mFeaturePriors.ContainsKey(feature))
                    {
                        continue;
                    }
                    mFeaturePriors.Add(feature, prior);
                }
            }
            mIdxToLbl = labels.ToArray();
        }
Esempio n. 3
0
        // Reads (name, responseUrl) pairs from the Documents table, groups the parsed query
        // parameters by URL key and by domain, and writes a text report of domains / URLs that
        // have at least one non-excluded query parameter with more than one distinct value.
        //
        // Configuration values read: "DbConnectionString", "ExcludeUrlArgs" (comma-separated
        // parameter names to ignore) and "OutputFileName" (defaults to "reportDomains.txt").
        static void Main(string[] args)
        {
            MultiSet <string> urlCount
                = new MultiSet <string>();
            MultiSet <string> domainCount
                = new MultiSet <string>();
            Dictionary <string, Set <string> > domainToUrlMapping
                = new Dictionary <string, Set <string> >();
            // URL key -> parameter name -> distinct values
            Dictionary <string, Dictionary <string, Set <string> > > data
                = new Dictionary <string, Dictionary <string, Set <string> > >();
            // domain -> parameter name -> distinct values
            Dictionary <string, Dictionary <string, Set <string> > > domainData
                = new Dictionary <string, Dictionary <string, Set <string> > >();

            using (SqlConnection connection = new SqlConnection(Utils.GetConfigValue("DbConnectionString")))
            {
                connection.Open();
                using (SqlCommand cmd = new SqlCommand(@"SELECT name, responseUrl from Documents", connection))
                {
                    cmd.CommandTimeout = 0;
                    using (SqlDataReader reader = cmd.ExecuteReader())
                    {
                        while (reader.Read())
                        {
                            Console.WriteLine(reader.GetValue <string>("name"));
                            string url = reader.GetValue <string>("responseUrl");
                            string             left;
                            ArrayList <string> path;
                            ArrayList <KeyDat <string, string> > qParsed;
                            ParseUrl(url, out left, out path, out qParsed);
                            string urlKey = UrlAsString(left, path, qParsed, new Set <string>());
                            urlCount.Add(urlKey);
                            domainCount.Add(left);
                            if (!domainToUrlMapping.ContainsKey(left))
                            {
                                domainToUrlMapping.Add(left, new Set <string>());
                            }
                            domainToUrlMapping[left].Add(urlKey);
                            if (!data.ContainsKey(urlKey))
                            {
                                data.Add(urlKey, new Dictionary <string, Set <string> >());
                            }
                            if (!domainData.ContainsKey(left))
                            {
                                domainData.Add(left, new Dictionary <string, Set <string> >());
                            }
                            Dictionary <string, Set <string> > urlInfo    = data[urlKey];
                            Dictionary <string, Set <string> > domainInfo = domainData[left];
                            // record every query parameter value both per URL key and per domain
                            foreach (KeyDat <string, string> item in qParsed)
                            {
                                if (!urlInfo.ContainsKey(item.Key))
                                {
                                    urlInfo.Add(item.Key, new Set <string>());
                                }
                                urlInfo[item.Key].Add(item.Dat);
                                if (!domainInfo.ContainsKey(item.Key))
                                {
                                    domainInfo.Add(item.Key, new Set <string>());
                                }
                                domainInfo[item.Key].Add(item.Dat);
                            }
                        }
                    }
                }
            }

            Set <string> paramShitList
                = new Set <string>(Utils.GetConfigValue("ExcludeUrlArgs", "utm_campaign,feedName,mod,rss_id,comment,commentid,partner").Split(','));

            // 'using' guarantees that the report file is flushed and closed even if report
            // generation throws part-way through
            using (StreamWriter w = new StreamWriter(Utils.GetConfigValue("OutputFileName", "reportDomains.txt")))
            {
                foreach (KeyValuePair <string, Dictionary <string, Set <string> > > item in domainData)
                {
                    // NOTE(review): this pre-check lower-cases the parameter name before the exclusion
                    // lookup, the checks further down do not, and the per-URL value listing does not
                    // consult the exclusion list at all. Behavior is kept as-is; confirm the intent.
                    bool found = false;
                    foreach (KeyValuePair <string, Set <string> > paramInfo in item.Value)
                    {
                        if (paramInfo.Value.Count > 1 && !paramShitList.Contains(paramInfo.Key.ToLower()))
                        {
                            found = true;
                            break;
                        }
                    }
                    if (found)
                    {
                        bool          __found = false; // set once at least one URL of this domain qualifies
                        StringBuilder s       = new StringBuilder();
                        s.AppendLine("********************** Domain Info **********************");
                        s.AppendLine();
                        s.AppendLine(item.Key + " (" + domainCount.GetCount(item.Key) + ")");
                        foreach (KeyValuePair <string, Set <string> > paramInfo in item.Value)
                        {
                            if (!paramShitList.Contains(paramInfo.Key) && paramInfo.Value.Count > 1)
                            {
                                s.AppendLine("\t" + paramInfo.Key + "\t" + paramInfo.Value.Count + "\t" + paramInfo.Value);
                            }
                        }
                        s.AppendLine();
                        s.AppendLine("*** Details ***");
                        s.AppendLine();
                        foreach (string url in domainToUrlMapping[item.Key])
                        {
                            bool _found = false;
                            foreach (KeyValuePair <string, Set <string> > paramInfo in data[url])
                            {
                                if (paramInfo.Value.Count > 1 && !paramShitList.Contains(paramInfo.Key))
                                {
                                    _found = true;
                                    break;
                                }
                            }
                            if (_found)
                            {
                                __found = true;
                                s.AppendLine(url + " (" + urlCount.GetCount(url) + ")");
                                foreach (KeyValuePair <string, Set <string> > paramInfo in data[url])
                                {
                                    if (paramInfo.Value.Count > 1)
                                    {
                                        s.AppendLine("\t" + paramInfo.Key + "\t" + paramInfo.Value.Count + "\t" + paramInfo.Value);
                                    }
                                }
                                s.AppendLine();
                            }
                        }
                        s.AppendLine();
                        // the domain section is only written when at least one of its URLs qualified
                        if (__found)
                        {
                            w.Write(s.ToString());
                        }
                    }
                }
            }
        }