/// <summary>
/// Combines the matched-pair and random-pair URL statistics files into a per-URL "lift"
/// table and writes it to <c>url_lift.csv</c> through <c>Naive.OutputSortedDict</c>.
/// </summary>
/// <remarks>
/// For every URL in the matched table the lift is the matched value divided by the value
/// the same URL has in the random table. A URL that is missing from the random table
/// receives the fixed lift 19.9.
/// NOTE(review): <c>LoadUrl2Cnt</c> is declared elsewhere; presumably it normalises raw
/// counts by the supplied base and drops entries below the third argument - confirm.
/// NOTE(review): the random base is 330000.0 * 20000, while <c>UrlComAnalysis_random</c>
/// iterates 33000 * 20000 times - confirm which figure is intended.
/// </remarks>
public static void GenUrlLiftRatio()
{
    const double matchedPairBase = 506000;
    const double randomPairBase = 330000.0 * 20000;
    const double liftWhenNotInRandom = 19.9;

    string matchedPath = @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\my\common_user_stat\url02.csv";
    string randomPath = @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\my\common_user_stat\url_random_2w.csv";
    string liftPath = @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\my\common_user_stat\url_lift.csv";

    Dictionary<string, double> matchedRatios = LoadUrl2Cnt(matchedPath, matchedPairBase, 20);
    Dictionary<string, double> randomRatios = LoadUrl2Cnt(randomPath, randomPairBase, 0);

    Dictionary<string, double> lifts = new Dictionary<string, double>();
    foreach (KeyValuePair<string, double> entry in matchedRatios)
    {
        double randomRatio;
        double lift = randomRatios.TryGetValue(entry.Key, out randomRatio)
            ? entry.Value / randomRatio
            : liftWhenNotInRandom;
        lifts.Add(entry.Key, lift);
    }

    Naive.OutputSortedDict(lifts, liftPath);
}
/// <summary>
/// Counts URL prefixes from <c>urls.csv</c> at four path depths and writes one frequency
/// table per depth (<c>url_freq_dep00.csv</c> through <c>url_freq_dep03.csv</c>).
/// </summary>
/// <remarks>
/// The URL is read from the second comma-separated field of each line. When a '?' occurs
/// after the first character, the URL is cut just before it. Depth 0 counts the text before
/// the first '/', depth 1 the text before the second '/', and so on up to depth 3. Once the
/// URL has no further '/', the whole URL is counted at the current depth and deeper levels
/// are skipped for that line.
/// Lines with fewer than two fields (for example a blank trailing line) are skipped so that
/// one malformed row cannot abort the run before any output is written.
/// Counting is delegated to <c>AddDict</c> and output to <c>Naive.OutputSortedDict</c>,
/// both declared elsewhere.
/// </remarks>
public static void GenUriStat()
{
    const int depthCount = 4;

    string infile = @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\data-train-dca\urls.csv";
    string[] outfiles =
    {
        @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\my\url_freq_dep00.csv",
        @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\my\url_freq_dep01.csv",
        @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\my\url_freq_dep02.csv",
        @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\my\url_freq_dep03.csv"
    };

    Dictionary<string, int>[] url2freq = new Dictionary<string, int>[depthCount];
    for (int i = 0; i < depthCount; i++)
    {
        url2freq[i] = new Dictionary<string, int>();
    }

    using (StreamReader rd = new StreamReader(infile))
    {
        int cnt = 0;
        string content;
        while ((content = rd.ReadLine()) != null)
        {
            // Progress output: prints the 1-based number of every 10000th line read.
            if (cnt++ % 10000 == 0)
            {
                Console.WriteLine(cnt);
            }

            string[] words = content.Split(',');
            if (words.Length < 2)
            {
                // No URL column on this line; nothing to count.
                continue;
            }

            string url = words[1];

            // Char overloads search ordinally, which is what URL parsing needs.
            int queryIdx = url.IndexOf('?');
            if (queryIdx > 0)
            {
                url = url.Substring(0, queryIdx);
            }

            int slashIdx = url.IndexOf('/');
            for (int dep = 0; dep < depthCount; dep++)
            {
                AddDict(url2freq[dep], slashIdx < 0 ? url : url.Substring(0, slashIdx));
                if (slashIdx < 0)
                {
                    break;
                }

                slashIdx = url.IndexOf('/', slashIdx + 1);
            }
        }
    }

    for (int i = 0; i < depthCount; i++)
    {
        Naive.OutputSortedDict(url2freq[i], outfiles[i]);
    }
}
/// <summary>
/// For each fact id found in <c>facts.json</c>, counts the number of input lines whose
/// fact list contains that id at least once, and writes the table to
/// <c>fid2usercnt.csv</c> through <c>Naive.OutputSortedDict</c>.
/// </summary>
/// <remarks>
/// Every line is deserialised on its own into a <c>Facts</c> object. A fact id repeated
/// within one line contributes only once for that line.
/// NOTE(review): the output name suggests one line corresponds to one user - confirm.
/// </remarks>
public static void Stat02()
{
    string infile = @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\data-train-dca\facts.json";
    string outfile = @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\my\fid2usercnt.csv";
    Dictionary<string, int> fid2usercnt = new Dictionary<string, int>();

    using (StreamReader reader = new StreamReader(infile))
    {
        int linesSeen = 0;
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            // Progress output: prints the 1-based number of every 10000th line read.
            bool report = linesSeen % 10000 == 0;
            linesSeen++;
            if (report)
            {
                Console.WriteLine(linesSeen);
            }

            Facts record = JsonConvert.DeserializeObject<Facts>(line);

            // Collapse repeated fact ids so each one is counted once per line.
            HashSet<string> distinctFids = new HashSet<string>();
            foreach (var fact in record.facts)
            {
                distinctFids.Add(fact.fid);
            }

            foreach (string fid in distinctFids)
            {
                int previous;
                if (fid2usercnt.TryGetValue(fid, out previous))
                {
                    fid2usercnt[fid] = previous + 1;
                }
                else
                {
                    fid2usercnt.Add(fid, 1);
                }
            }
        }
    }

    Naive.OutputSortedDict(fid2usercnt, outfile);
}
/// <summary>
/// Sums, per <c>uid</c>, the number of facts listed in <c>facts.json</c>, prints the number
/// of distinct uids and the largest per-uid total, and writes the table to
/// <c>user2fidcnt.csv</c> through <c>Naive.OutputSortedDict</c>.
/// </summary>
/// <remarks>
/// Every line is deserialised on its own into a <c>Facts</c> object; totals of lines that
/// share a uid are added together.
/// The maximum is taken with LINQ <c>Max</c>, which throws
/// <see cref="InvalidOperationException"/> when no line was read.
/// </remarks>
public static void StatUserFidCnt()
{
    string infile = @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\data-train-dca\facts.json";
    Dictionary<string, int> user2factcnt = new Dictionary<string, int>();

    using (StreamReader reader = new StreamReader(infile))
    {
        int linesSeen = 0;
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            // Progress output: prints the 1-based number of every 10000th line read.
            bool report = linesSeen % 10000 == 0;
            linesSeen++;
            if (report)
            {
                Console.WriteLine(linesSeen);
            }

            Facts record = JsonConvert.DeserializeObject<Facts>(line);

            // TryGetValue leaves 'previous' at 0 for a uid that has not been seen yet.
            int previous;
            user2factcnt.TryGetValue(record.uid, out previous);
            user2factcnt[record.uid] = previous + record.facts.Count;
        }
    }

    Console.WriteLine("user cnt : {0}", user2factcnt.Count);
    Console.WriteLine("user max fact cnt : {0}", user2factcnt.Values.Max());
    Naive.OutputSortedDict(user2factcnt, @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\my\user2fidcnt.csv");
}
/// <summary>
/// Samples random pairs of users and counts, per URL, how many sampled pairs have that URL
/// in both users' URL sets; the table is written to <c>url_random_2w.csv</c> through
/// <c>Naive.OutputSortedDict</c>.
/// </summary>
/// <remarks>
/// A user's URL set holds the URL mapped to each of the user's facts by
/// <c>Loader.LoadFid2Url(1)</c>; facts whose id is not in that map are ignored.
/// The loop runs 33000 * 20000 times. A draw in which both indices are equal is skipped
/// but still counts as one of those iterations.
/// The URL sets are stored by user position so the sampling loop does no per-iteration
/// dictionary lookups or allocations.
/// NOTE(review): the argument 1 of <c>LoadFid2Url</c> is presumably a URL depth - confirm
/// against the loader.
/// </remarks>
public static void UrlComAnalysis_random()
{
    string outfile = @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\my\common_user_stat\url_random_2w.csv";

    var fact2url = Loader.LoadFid2Url(1);
    var user2facts = Loader.LoadUserFacts();

    List<string> uid_set = new List<string>(user2facts.Keys);
    List<HashSet<string>> urlSets = new List<HashSet<string>>(uid_set.Count);
    foreach (string uid in uid_set)
    {
        HashSet<string> cset = new HashSet<string>();
        foreach (var fact in user2facts[uid].facts)
        {
            if (fact2url.ContainsKey(fact.fid))
            {
                cset.Add(fact2url[fact.fid]);
            }
        }

        urlSets.Add(cset);
    }

    Dictionary<string, int> url02cnt = new Dictionary<string, int>();
    long N = 33000 * 20000;
    int len = urlSets.Count;

    // The parameterless constructor picks its own seed; no narrowing cast of a tick count
    // is needed, so this also works in builds with checked arithmetic.
    Random rng = new Random();

    for (long i = 0; i < N; i++)
    {
        if (i % 10000L == 0)
        {
            Console.WriteLine(i);
        }

        int a = rng.Next(len);
        int b = rng.Next(len);
        if (a == b)
        {
            continue;
        }

        HashSet<string> otherUrls = urlSets[b];
        foreach (string url in urlSets[a])
        {
            if (!otherUrls.Contains(url))
            {
                continue;
            }

            // TryGetValue leaves 'seen' at 0 for a URL that has not been counted yet.
            int seen;
            url02cnt.TryGetValue(url, out seen);
            url02cnt[url] = seen + 1;
        }
    }

    Naive.OutputSortedDict(url02cnt, outfile);
}
/// <summary>
/// Counts, per URL, how many user pairs listed in <c>train.csv</c> have that URL in both
/// users' URL sets, and writes the table to <c>url02.csv</c> through
/// <c>Naive.OutputSortedDict</c>.
/// </summary>
/// <remarks>
/// URL sets are built for every key returned by <c>Loader.LoadGroundTruth</c>. Each fact's
/// URL is looked up in the map from <c>Loader.LoadFid2Url()</c> and passed through
/// <c>ExtractUrlWithDep</c> with depth 1; facts whose id is not in the map, or for which
/// the helper returns null, are skipped.
/// The ground-truth file is then read again line by line; the first two comma-separated
/// fields are taken as the pair. Lines with fewer than two fields, and pairs where either
/// user has no URL set, are skipped.
/// NOTE(review): <c>ExtractUrlWithDep</c> is declared elsewhere; presumably it truncates a
/// URL to the given path depth - confirm.
/// </remarks>
public static void UrlComAnalysis()
{
    string gtfile = @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\data-train-dca\train.csv";
    string outfile = @"\\mlsdata\e$\Users\v-lianji\others\CIKM16\my\common_user_stat\url02.csv";

    var user2matches = Loader.LoadGroundTruth(gtfile);
    var user2fact = Loader.LoadUserFacts(user2matches);
    var fact2url = Loader.LoadFid2Url();
    int target_dep = 1;

    // Only the user ids are needed from here on; drop the match lists.
    HashSet<string> uid_set = new HashSet<string>(user2matches.Keys);
    user2matches.Clear();
    user2matches = null;

    Dictionary<string, HashSet<string>> user2urls = new Dictionary<string, HashSet<string>>();
    foreach (string uid in uid_set)
    {
        HashSet<string> urls = new HashSet<string>();
        user2urls.Add(uid, urls);

        foreach (var fact in user2fact[uid].facts)
        {
            if (!fact2url.ContainsKey(fact.fid))
            {
                // Same policy as UrlComAnalysis_random: ignore facts without a known URL.
                continue;
            }

            string target_url = ExtractUrlWithDep(target_dep, fact2url[fact.fid]);
            if (target_url == null)
            {
                continue;
            }

            urls.Add(target_url);
        }
    }

    Dictionary<string, int> url02cnt = new Dictionary<string, int>();
    using (StreamReader rd = new StreamReader(gtfile))
    {
        string content;
        while ((content = rd.ReadLine()) != null)
        {
            string[] words = content.Split(',');
            if (words.Length < 2)
            {
                // Blank or malformed line: there is no pair to evaluate.
                continue;
            }

            HashSet<string> firstUrls;
            HashSet<string> secondUrls;
            if (user2urls.TryGetValue(words[0], out firstUrls) && user2urls.TryGetValue(words[1], out secondUrls))
            {
                foreach (string url in firstUrls)
                {
                    if (!secondUrls.Contains(url))
                    {
                        continue;
                    }

                    // TryGetValue leaves 'seen' at 0 for a URL that has not been counted yet.
                    int seen;
                    url02cnt.TryGetValue(url, out seen);
                    url02cnt[url] = seen + 1;
                }
            }
        }
    }

    Naive.OutputSortedDict(url02cnt, outfile);
}