/// <summary>
/// Builds, for every repository, a sparse map from co-occurring repository ID to a
/// similarity weight.
/// </summary>
/// <param name="td">Training data; only <c>td.Repositories.Count</c> is read here.</param>
/// <param name="uod">Indexed by user ID: the repository IDs listed for that user.</param>
/// <param name="mod">Indexed by repository ID: the user IDs listed for that repository.</param>
/// <returns>
/// One dictionary per repository A. Key B (B != A) is present when at least one user in
/// <c>mod[A]</c> also lists B in <c>uod</c>; its value is
/// <c>matches / (50 + sqrt(mod[A].Count) * sqrt(mod[B].Count))</c>, where <c>matches</c>
/// is the number of such (user, B) occurrences.
/// </returns>
public Dictionary<int,float>[] GetReposCorrs(TrainingData td, int[][] uod, List<int>[] mod)
{
    int repoCount = td.Repositories.Count;

    Dictionary<int, float>[] retVal = new Dictionary<int, float>[repoCount];
    for (int i = 0; i < retVal.Length; i++)
    {
        retVal[i] = new Dictionary<int, float>();
    }

    // Scratch counters reused for every reposA. Only the indices recorded in `touched`
    // are non-zero, so there is no need to rescan all repositories after each pass.
    int[] matches = new int[repoCount];
    List<int> touched = new List<int>();

    for (int reposA = 0; reposA < repoCount; reposA++)
    {
        foreach (int userA in mod[reposA])
        {
            foreach (int reposB in uod[userA])
            {
                if (reposB == reposA)
                {
                    continue;
                }

                if (matches[reposB] == 0)
                {
                    touched.Add(reposB);
                }
                matches[reposB]++;
            }
        }

        // Ascending order keeps dictionary insertion order the same as a full 0..N-1 scan.
        touched.Sort();

        double sqrtCountA = Math.Sqrt(mod[reposA].Count);
        foreach (int reposB in touched)
        {
            float weight = matches[reposB] / (float)(50 + sqrtCountA * Math.Sqrt(mod[reposB].Count));
            retVal[reposA][reposB] = weight;
            matches[reposB] = 0; // reset for the next reposA
        }
        touched.Clear();
    }

    return retVal;
}
/// <summary>
/// Builds a jagged array, indexed by user ID, holding the IDs of each user's repositories.
/// </summary>
/// <param name="td">Training data whose <c>Users</c> collection is enumerated.</param>
/// <returns>
/// An array of length <c>td.Users.Count</c>. Slot <c>u.ID</c> holds the IDs of the entries
/// returned by <c>u.Repo.GetList()</c>, in that order. Slots whose index matches no user ID
/// stay null.
/// </returns>
public static int[][] GetUserOrderRepositories(TrainingData td)
{
    int[][] reposByUser = new int[td.Users.Count][];

    foreach (User user in td.Users)
    {
        Repository[] repoList = user.Repo.GetList();
        int[] ids = new int[repoList.Length];

        int slot = 0;
        foreach (Repository repo in repoList)
        {
            ids[slot] = repo.ID;
            slot++;
        }

        reposByUser[user.ID] = ids;
    }

    return reposByUser;
}
/// <summary>
/// Entry point: loads the training and test sets, runs two models and writes a blended
/// list of ten predictions per test user to the results file.
/// </summary>
/// <param name="args">Command-line arguments; not used, all paths are hard-coded below.</param>
static void Main(string[] args)
{
    string repos = @"C:\Users\Aron\Github\download\repos.txt";
    string data = @"C:\Users\Aron\Github\download\data.txt";
    string lang = @"C:\Users\Aron\Github\download\lang.txt";
    string test = @"C:\Users\Aron\Github\download\test.txt";
    string results = @"C:\Users\Aron\Github\GithubContest\results.txt";

    TrainingData trainData = new TrainingData();
    trainData.Load(repos, lang, data);
    TestData testData = new TestData();
    testData.Load(test, trainData);

    SimpleMovieKnn smknn = new SimpleMovieKnn();
    int[][] predictions1 = smknn.Run2(trainData, testData, results);
    SimpleCounterModel scm = new SimpleCounterModel();
    int[][] predictions2 = scm.Run(trainData, testData, results);

    // combine: the first slots come straight from predictions1, the rest are filled with
    // predictions2 entries that are not already present.
    const int blendSize = 10;
    const int fromFirstModel = 5;

    int[][] blend = new int[testData.Users.Count][];
    for (int i = 0; i < blend.Length; i++)
    {
        blend[i] = new int[blendSize];
        for (int j = 0; j < fromFirstModel; j++)
        {
            blend[i][j] = predictions1[i][j];
        }

        int cnt = fromFirstModel;
        foreach (int j in predictions2[i])
        {
            // Check only the slots filled so far. Scanning the whole array would also match
            // the zero-initialised empty slots, so a genuine prediction of repository ID 0
            // could never be added.
            if (!blend[i].Take(cnt).Contains(j))
            {
                blend[i][cnt] = j;
                cnt++;
                if (cnt == blendSize)
                {
                    break;
                }
            }
        }
    }

    DataFormatter.OutputPredictions(results, trainData, testData, blend);

    //QuickFix(trainData);
    /*
    LogisticSVD svd = new LogisticSVD();
    svd.Setup(trainData, testData, results);
    for(int epoch = 0; epoch < 500; epoch++)
        svd.Train(1);
    svd.Predict();*/
}
/// <summary>
/// Rewrites the hard-coded results file into a sibling ".bak" file, replacing the number
/// before the first ':' on every line with the <c>ExternalID</c> of the user stored at that
/// index in <c>trainData.Users</c>.
/// </summary>
/// <param name="trainData">Training data whose <c>Users</c> indexer resolves the parsed ID.</param>
private static void QuickFix(TrainingData trainData)
{
    string results = @"C:\Users\Aron\Github\GithubContest\results.txt";
    string[] allLines = File.ReadAllLines(results);

    // append: false creates or truncates the target. File.OpenWrite would leave stale
    // trailing bytes behind when an existing .bak file is longer than the new content.
    // The using block closes the writer even if a line fails to parse.
    using (StreamWriter sw = new StreamWriter(results + ".bak", false))
    {
        foreach (string s in allLines)
        {
            int colon = s.IndexOf(':');
            int usrID = int.Parse(s.Substring(0, colon));
            User u = trainData.Users[usrID];

            // Keep everything from the ':' onwards untouched.
            string outStr = u.ExternalID.ToString() + s.Substring(colon);
            sw.WriteLine(outStr);
        }
    }
}
/// <summary>
/// Writes one line per entry of <paramref name="predictions"/> in the form
/// <c>userExternalId:repoExternalId,repoExternalId,...</c>.
/// </summary>
/// <param name="outputPath">File to write; an existing file is deleted first.</param>
/// <param name="train">Training data used to map internal repository IDs to external IDs.</param>
/// <param name="test">Test data; <c>test.Users[i]</c> supplies the user for <c>predictions[i]</c>.</param>
/// <param name="predictions">Per test user, the internal repository IDs to output.</param>
public static void OutputPredictions(string outputPath, TrainingData train, TestData test, int[][] predictions)
{
    if (File.Exists(outputPath))
    {
        File.Delete(outputPath);
    }

    // using guarantees the file handle is released even if an ID lookup throws mid-write.
    using (StreamWriter sw = new StreamWriter(File.OpenWrite(outputPath)))
    {
        for (int usrIndx = 0; usrIndx < predictions.Length; usrIndx++)
        {
            int[] userPredictions = predictions[usrIndx];
            string outStr = test.Users[usrIndx].ExternalID + ":";

            for (int i = 0; i < userPredictions.Length; i++)
            {
                outStr += train.Repositories.GetByInternalID(userPredictions[i]).ExternalID;
                if (i < userPredictions.Length - 1)
                {
                    outStr += ","; // separator between entries, none after the last
                }
            }

            sw.WriteLine(outStr);
        }
    }
}
/// <summary>
/// For each test user, counts how often every repository appears in the lists of other
/// users who share a repository with that user, and returns the ten repository IDs with
/// the highest counts.
/// </summary>
/// <param name="td">Training data supplying the user/repository lists.</param>
/// <param name="test">Test data; one prediction row is produced per entry of <c>test.Users</c>.</param>
/// <param name="outPath">Not used by this method.</param>
/// <returns>
/// A jagged array with one row of ten repository IDs per test user, highest count first.
/// When fewer than ten repositories have a positive count, the remaining slots hold 0.
/// </returns>
public int[][] Run(TrainingData td, TestData test, string outPath)
{
    int[][] uod = DataFormatter.GetUserOrderRepositories(td);

    // mod[r] = IDs of the users whose list contains repository r (the inverse of uod)
    List<int>[] mod = new List<int>[td.Repositories.Count];
    for (int i = 0; i < td.Repositories.Count; i++)
    {
        mod[i] = new List<int>();
    }

    // populate mod
    for (int i = 0; i < uod.Length; i++)
    {
        for (int j = 0; j < uod[i].Length; j++)
        {
            mod[uod[i][j]].Add(i);
        }
    }

    int[][] predictions = new int[test.Users.Count][];
    for (int usrIndx = 0; usrIndx < test.Users.Count; usrIndx++)
    {
        User userA = test.Users[usrIndx];
        int userAId = userA.ID;
        int[] repos = uod[userAId];
        int[] repoMatches = new int[mod.Length];
        float[] movieWeights = new float[mod.Length];

        for (int rIndx = 0; rIndx < repos.Length; rIndx++)
        {
            int rID = repos[rIndx];
            foreach (int userB in mod[rID])
            {
                if (userB == userAId)
                {
                    continue;
                }

                // count every other repository listed for user B
                // NOTE(review): only the shared repository rID is skipped, so other
                // repositories already listed for userA can still be counted and
                // returned as predictions - confirm this is intended.
                foreach (int repo in uod[userB])
                {
                    if (repo == rID)
                    {
                        continue;
                    }
                    repoMatches[repo]++;
                }
            }
        }

        // The weight is currently the raw match count (no normalisation applied).
        for (int i = 0; i < movieWeights.Length; i++)
        {
            if (repoMatches[i] > 0)
            {
                movieWeights[i] = repoMatches[i];
            }
        }

        // find x highest: repeated linear scan, zeroing each winner so it is not picked again
        predictions[usrIndx] = new int[10];
        for (int i = 0; i < 10; i++)
        {
            int highRepo = 0;
            float highWeight = 0;
            for (int j = 0; j < repoMatches.Length; j++)
            {
                float weight = movieWeights[j];
                if (weight > highWeight)
                {
                    highRepo = j;
                    highWeight = weight;
                }
            }

            movieWeights[highRepo] = 0; // zero out to skip next round
            predictions[usrIndx][i] = highRepo;
        }
    }

    return predictions;
}
/// <summary>
/// For each test user, adds up the repository-to-repository weights produced by
/// <c>GetReposCorrs</c> over the repositories listed for that user, and returns the ten
/// repository IDs with the largest totals.
/// </summary>
/// <param name="td">Training data supplying the user/repository lists.</param>
/// <param name="test">Test data; one prediction row is produced per entry of <c>test.Users</c>.</param>
/// <param name="outPath">Not used by this method.</param>
/// <returns>
/// A jagged array with one row of ten repository IDs per test user, largest total first.
/// When fewer than ten repositories have a positive total, the remaining slots hold 0.
/// </returns>
public int[][] Run2(TrainingData td, TestData test, string outPath)
{
    int[][] uod = DataFormatter.GetUserOrderRepositories(td);

    // Invert uod: mod[r] collects the IDs of the users whose list contains repository r.
    List<int>[] mod = new List<int>[td.Repositories.Count];
    for (int r = 0; r < td.Repositories.Count; r++)
    {
        mod[r] = new List<int>();
    }
    for (int userId = 0; userId < uod.Length; userId++)
    {
        foreach (int repoId in uod[userId])
        {
            mod[repoId].Add(userId);
        }
    }

    Dictionary<int, float>[] corrs = GetReposCorrs(td, uod, mod);
    int[][] predictions = new int[test.Users.Count][];

    // NOTE(review): the returned array is never read; the call is kept as in the original.
    Repository[] repoList = td.Repositories.GetList();

    for (int usrIndx = 0; usrIndx < test.Users.Count; usrIndx++)
    {
        User userA = test.Users[usrIndx];
        float[] scores = new float[td.Repositories.Count];

        // Accumulate the weight of every repository correlated with one of the user's own.
        foreach (int ownRepo in uod[userA.ID])
        {
            foreach (KeyValuePair<int,float> pair in corrs[ownRepo])
            {
                if (pair.Key != ownRepo)
                {
                    scores[pair.Key] += pair.Value;
                }
            }
        }

        // Select the ten best by repeated linear scan; each winner is zeroed so the next
        // pass skips it.
        int[] top = new int[10];
        for (int rank = 0; rank < 10; rank++)
        {
            int bestRepo = 0;
            float bestScore = 0;
            for (int candidate = 0; candidate < scores.Length; candidate++)
            {
                if (scores[candidate] > bestScore)
                {
                    bestScore = scores[candidate];
                    bestRepo = candidate;
                }
            }

            scores[bestRepo] = 0;
            top[rank] = bestRepo;
        }
        predictions[usrIndx] = top;
    }

    return predictions;
}
/// <summary>
/// Stores the data sets and output path, then allocates and initialises the model
/// parameters: per-user and per-repository biases and feature vectors.
/// </summary>
/// <param name="td">Training data; supplies the user and repository counts.</param>
/// <param name="test">Test data, stored for later use.</param>
/// <param name="results">Results path, stored for later use.</param>
public void Setup(TrainingData td, TestData test, string results)
{
    this.td = td;
    this.test = test;
    this.results = results;

    // model training data
    uod = DataFormatter.GetUserOrderRepositories(td);
    userCount = td.Users.Count;
    repoCount = td.Repositories.Count;

    // model hypers
    featureCount = 10;

    // model params
    Random r = new Random();

    userBias = new float[userCount];
    for (int i = 0; i < userCount; i++)
    {
        userBias[i] = -10;
    }

    // Bounded by repoCount: iterating to userCount overruns the array when there are more
    // users than repositories and leaves trailing entries at 0 when there are fewer.
    repoBias = new float[repoCount];
    for (int i = 0; i < repoCount; i++)
    {
        repoBias[i] = -10;
    }

    // Features start at small values: -0.1 plus a multiple of 0.001 drawn from r.Next(200).
    userFeatures = new float[userCount][];
    for (int i = 0; i < userCount; i++)
    {
        userFeatures[i] = new float[featureCount];
        for (int f = 0; f < featureCount; f++)
        {
            userFeatures[i][f] = -.1f + .001f * r.Next(200);
        }
    }

    repoFeatures = new float[repoCount][];
    for (int i = 0; i < repoCount; i++)
    {
        repoFeatures[i] = new float[featureCount];
        for (int f = 0; f < featureCount; f++)
        {
            repoFeatures[i][f] = -.1f + .001f * r.Next(200);
        }
    }
}