예제 #1
0
        /// <summary>
        /// Builds, for every repository, a sparse map of co-occurrence weights towards the
        /// other repositories reachable through its users.
        /// </summary>
        /// <param name="td">Training data; only <c>td.Repositories.Count</c> is read here.</param>
        /// <param name="uod">Jagged array indexed by user; each entry holds repository indices.</param>
        /// <param name="mod">Array indexed by repository; each entry lists user indices.</param>
        /// <returns>
        /// One dictionary per repository. Entry <c>[a][b]</c> exists only when the co-occurrence
        /// count of <c>a</c> and <c>b</c> is positive, and equals
        /// <c>count / (50 + sqrt(mod[a].Count) * sqrt(mod[b].Count))</c>.
        /// </returns>
        public Dictionary<int,float>[] GetReposCorrs(TrainingData td, int[][] uod, List<int>[] mod)
        {
            int repoTotal = td.Repositories.Count;

            Dictionary<int, float>[] correlations = new Dictionary<int, float>[repoTotal];
            for (int slot = 0; slot < correlations.Length; slot++)
            {
                correlations[slot] = new Dictionary<int, float>();
            }

            // Scratch counters shared by all iterations; every touched entry is reset
            // to zero again before the next source repository is processed.
            int[] coCounts = new int[repoTotal];

            for (int source = 0; source < repoTotal; source++)
            {
                List<int> watchers = mod[source];

                // Count every repository listed by a user of `source`, except `source` itself.
                foreach (int watcher in watchers)
                {
                    int[] watched = uod[watcher];
                    for (int k = 0; k < watched.Length; k++)
                    {
                        int other = watched[k];
                        if (other != source)
                        {
                            coCounts[other]++;
                        }
                    }
                }

                // Turn the positive counts into damped weights and clear the counters.
                for (int target = 0; target < repoTotal; target++)
                {
                    int count = coCounts[target];
                    if (count <= 0)
                    {
                        continue;
                    }

                    double denominator = 50 + Math.Sqrt(mod[source].Count) * Math.Sqrt(mod[target].Count);
                    correlations[source][target] = count / (float)denominator;
                    coCounts[target] = 0;
                }
            }

            return correlations;
        }
예제 #2
0
 /// <summary>
 /// Produces a jagged array that maps each user's <c>ID</c> to the <c>ID</c>s of the
 /// repositories returned by that user's <c>Repo.GetList()</c>, in the same order.
 /// </summary>
 /// <param name="td">Training data whose <c>Users</c> collection is enumerated.</param>
 /// <returns>
 /// An array of length <c>td.Users.Count</c>, indexed by user <c>ID</c>. Slots whose ID is
 /// not produced by the enumeration stay <c>null</c>.
 /// </returns>
 public static int[][] GetUserOrderRepositories(TrainingData td)
 {
     int[][] reposByUser = new int[td.Users.Count][];

     foreach (User user in td.Users)
     {
         Repository[] owned = user.Repo.GetList();

         // Store the row first, then fill it in place.
         int[] row = reposByUser[user.ID] = new int[owned.Length];

         int position = 0;
         foreach (Repository repository in owned)
         {
             row[position] = repository.ID;
             position++;
         }
     }

     return reposByUser;
 }
예제 #3
0
        /// <summary>
        /// Program entry point. Loads the training and test data from hard-coded paths, runs
        /// <c>SimpleMovieKnn.Run2</c> and <c>SimpleCounterModel.Run</c>, blends their
        /// predictions into ten picks per test user and writes the blend to the results file.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            string repos = @"C:\Users\Aron\Github\download\repos.txt";
            string data = @"C:\Users\Aron\Github\download\data.txt";
            string lang = @"C:\Users\Aron\Github\download\lang.txt";
            string test = @"C:\Users\Aron\Github\download\test.txt";
            string results = @"C:\Users\Aron\Github\GithubContest\results.txt";

            TrainingData trainData = new TrainingData();
            trainData.Load(repos, lang, data);

            TestData testData = new TestData();
            testData.Load(test, trainData);

            SimpleMovieKnn smknn = new SimpleMovieKnn();
            int[][] predictions1 = smknn.Run2(trainData, testData, results);

            SimpleCounterModel scm = new SimpleCounterModel();
            int[][] predictions2 = scm.Run(trainData, testData, results);

            // combine: the first five picks come from model 1, the remaining five are the
            // first picks of model 2 that are not already in the blend.
            int[][] blend = new int[testData.Users.Count][];
            for (int i = 0; i < blend.Length; i++)
            {
                blend[i] = new int[10];
                for (int j = 0; j < 5; j++)
                {
                    blend[i][j] = predictions1[i][j];
                }

                int cnt = 5;
                foreach (int candidate in predictions2[i])
                {
                    // Compare only against the slots filled so far. Unfilled slots still
                    // hold the default 0, so scanning the whole array would wrongly treat
                    // repository 0 as a duplicate and never let it in.
                    bool alreadyPicked = false;
                    for (int k = 0; k < cnt; k++)
                    {
                        if (blend[i][k] == candidate)
                        {
                            alreadyPicked = true;
                            break;
                        }
                    }

                    if (alreadyPicked)
                    {
                        continue;
                    }

                    blend[i][cnt] = candidate;
                    cnt++;
                    if (cnt == 10) break;
                }
            }
            DataFormatter.OutputPredictions(results, trainData, testData, blend);

            //QuickFix(trainData);
            /*
            LogisticSVD svd = new LogisticSVD();
            svd.Setup(trainData, testData, results);
            for(int epoch = 0; epoch < 500; epoch++)
                svd.Train(1);
            svd.Predict();*/
        }
예제 #4
0
        /// <summary>
        /// Rewrites the hard-coded results file into a sibling <c>.bak</c> file, replacing the
        /// numeric prefix before the first ':' on each line with the <c>ExternalID</c> of the
        /// user found at that index in <c>trainData.Users</c>.
        /// </summary>
        /// <param name="trainData">Training data whose <c>Users</c> collection is indexed by the parsed prefix.</param>
        private static void QuickFix(TrainingData trainData)
        {
            string results = @"C:\Users\Aron\Github\GithubContest\results.txt";
            string[] allLines = File.ReadAllLines(results);

            // File.Create truncates an existing file; File.OpenWrite would leave stale bytes
            // behind whenever the new content is shorter than a previous .bak file.
            // The using block closes the writer even if a line fails to parse.
            using (StreamWriter sw = new StreamWriter(File.Create(results + ".bak")))
            {
                foreach (string s in allLines)
                {
                    int colon = s.IndexOf(':');
                    int usrID = int.Parse(s.Substring(0, colon));
                    User u = trainData.Users[usrID];
                    string outStr = u.ExternalID.ToString() + s.Substring(colon);
                    sw.WriteLine(outStr);
                }
            }
        }
예제 #5
0
        /// <summary>
        /// Writes one line per prediction row in the form
        /// <c>userExternalId:repoExternalId,repoExternalId,...</c>, replacing any existing file
        /// at <paramref name="outputPath"/>.
        /// </summary>
        /// <param name="outputPath">Destination file; overwritten if it already exists.</param>
        /// <param name="train">Training data used to look up repositories via <c>Repositories.GetByInternalID</c>.</param>
        /// <param name="test">Test data; <c>test.Users[i]</c> supplies the user for row <c>i</c>.</param>
        /// <param name="predictions">One array of internal repository IDs per test user.</param>
        public static void OutputPredictions(string outputPath, TrainingData train, TestData test, int[][] predictions)
        {
            // File.Create truncates or creates the file in one step, and the using block
            // guarantees the writer is flushed and closed even if a lookup throws mid-way.
            using (StreamWriter sw = new StreamWriter(File.Create(outputPath)))
            {
                for (int usrIndx = 0; usrIndx < predictions.Length; usrIndx++)
                {
                    int[] row = predictions[usrIndx];
                    string outStr = test.Users[usrIndx].ExternalID + ":";
                    for (int i = 0; i < row.Length; i++)
                    {
                        outStr += train.Repositories.GetByInternalID(row[i]).ExternalID;
                        if (i < row.Length - 1) outStr += ",";
                    }

                    sw.WriteLine(outStr);
                }
            }
        }
예제 #6
0
        /// <summary>
        /// Produces ten repository picks per test user by counting how often each repository
        /// appears in the lists of users who share a repository with the test user.
        /// </summary>
        /// <param name="td">Training data supplying the user/repository lists.</param>
        /// <param name="test">Test data; one prediction row is produced per entry of <c>test.Users</c>.</param>
        /// <param name="outPath">Not used by this method.</param>
        /// <returns>
        /// One array of ten repository indices per test user, highest count first; ties go to
        /// the lower index. When fewer than ten repositories have a positive count, the
        /// remaining slots hold index 0.
        /// </returns>
        public int[][] Run(TrainingData td, TestData test, string outPath)
        {
            int[][] uod = DataFormatter.GetUserOrderRepositories(td);

            // mod[r] lists every user index whose uod row contains repository r.
            List<int>[] mod = new List<int>[td.Repositories.Count];

            for (int i = 0; i < td.Repositories.Count; i++)
            {
                mod[i] = new List<int>();
            }

            // populate mod
            for (int i = 0; i < uod.Length; i++)
            {
                for (int j = 0; j < uod[i].Length; j++)
                {
                    mod[uod[i][j]].Add(i);
                }
            }

            int[][] predictions = new int[test.Users.Count][];

            for (int usrIndx = 0; usrIndx < test.Users.Count; usrIndx++)
            {
                User userA = test.Users[usrIndx];

                int[] repos = uod[userA.ID];
                int[] repoMatches = new int[mod.Length];

                for (int rIndx = 0; rIndx < repos.Length; rIndx++)
                {
                    int rID = repos[rIndx];
                    List<int> users = mod[rID];

                    foreach (int userB in users)
                    {
                        if (userB == userA.ID) continue;

                        // Count every repository of user B except the one shared with user A.
                        foreach (int repo in uod[userB])
                        {
                            if (repo == rID) continue;
                            repoMatches[repo]++;
                        }
                    }
                }

                // The raw match count is used directly as the weight.
                float[] movieWeights = new float[mod.Length];
                for (int i = 0; i < movieWeights.Length; i++)
                {
                    movieWeights[i] = repoMatches[i];
                }

                // find x highest
                predictions[usrIndx] = new int[10];
                for (int i = 0; i < 10; i++)
                {
                    int highRepo = 0;
                    float highWeight = 0;

                    for (int j = 0; j < movieWeights.Length; j++)
                    {
                        float weight = movieWeights[j];
                        if (weight > highWeight)
                        {
                            highRepo = j;
                            highWeight = weight;
                        }
                    }
                    movieWeights[highRepo] = 0; // zero out to skip next round
                    predictions[usrIndx][i] = highRepo;
                }
            }
            return predictions;
        }
예제 #7
0
        /// <summary>
        /// Produces ten repository picks per test user by summing, over the user's own
        /// repositories, the weights returned by <c>GetReposCorrs</c>.
        /// </summary>
        /// <param name="td">Training data supplying the user/repository lists.</param>
        /// <param name="test">Test data; one prediction row is produced per entry of <c>test.Users</c>.</param>
        /// <param name="outPath">Not used by this method.</param>
        /// <returns>
        /// One array of ten repository indices per test user, highest summed weight first;
        /// ties go to the lower index. Slots left over once no positive weight remains hold index 0.
        /// </returns>
        public int[][] Run2(TrainingData td, TestData test, string outPath)
        {
            int[][] uod = DataFormatter.GetUserOrderRepositories(td);

            int repoTotal = td.Repositories.Count;
            List<int>[] mod = new List<int>[repoTotal];
            for (int r = 0; r < repoTotal; r++)
            {
                mod[r] = new List<int>();
            }

            // Invert uod: mod[r] collects every user index whose row contains repository r.
            for (int user = 0; user < uod.Length; user++)
            {
                foreach (int repoId in uod[user])
                {
                    mod[repoId].Add(user);
                }
            }

            Dictionary<int, float>[] corrs = GetReposCorrs(td, uod, mod);
            int[][] predictions = new int[test.Users.Count][];

            // NOTE(review): the result is never read below; the call is kept as-is in case
            // GetList() has effects not visible from here.
            Repository[] repoList = td.Repositories.GetList();

            for (int row = 0; row < test.Users.Count; row++)
            {
                User current = test.Users[row];
                float[] scores = new float[td.Repositories.Count];

                // Accumulate the weight of every neighbour of each repository the user has.
                foreach (int owned in uod[current.ID])
                {
                    foreach (KeyValuePair<int, float> neighbour in corrs[owned])
                    {
                        if (neighbour.Key != owned)
                        {
                            scores[neighbour.Key] += neighbour.Value;
                        }
                    }
                }

                // Repeatedly take the best remaining score and clear it so it is not picked again.
                int[] picks = new int[10];
                predictions[row] = picks;
                for (int rank = 0; rank < 10; rank++)
                {
                    int bestRepo = 0;
                    float bestScore = 0;

                    for (int candidate = 0; candidate < scores.Length; candidate++)
                    {
                        if (scores[candidate] > bestScore)
                        {
                            bestScore = scores[candidate];
                            bestRepo = candidate;
                        }
                    }

                    scores[bestRepo] = 0;
                    picks[rank] = bestRepo;
                }
            }

            return predictions;
        }
예제 #8
0
        /// <summary>
        /// Stores the data references and initialises the model parameters: every user and
        /// repository bias is set to -10, and every feature is set to a small random value.
        /// </summary>
        /// <param name="td">Training data; provides the user and repository counts.</param>
        /// <param name="test">Test data, stored for later use.</param>
        /// <param name="results">Results path, stored for later use.</param>
        public void Setup(TrainingData td, TestData test, string results)
        {
            this.td = td;
            this.test = test;
            this.results = results;
            // model training data
            uod = DataFormatter.GetUserOrderRepositories(td);
            userCount = td.Users.Count;
            repoCount = td.Repositories.Count;

            // model hypers
            featureCount = 10;

            // model params
            Random r = new Random();
            userBias = new float[userCount];
            for (int i = 0; i < userCount; i++)
            {
                userBias[i] = -10;
            }

            // Bounded by repoCount, not userCount: repoBias has one entry per repository.
            // Using userCount overran the array when users outnumbered repositories and
            // left the tail uninitialised otherwise.
            repoBias = new float[repoCount];
            for (int i = 0; i < repoCount; i++)
            {
                repoBias[i] = -10;
            }

            // r.Next(200) yields 0..199, so each feature starts in roughly [-0.1, 0.1).
            userFeatures = new float[userCount][];
            for (int i = 0; i < userCount; i++)
            {
                userFeatures[i] = new float[featureCount];
                for (int f = 0; f < featureCount; f++)
                {
                    userFeatures[i][f] = -.1f + .001f * r.Next(200);
                }
            }

            repoFeatures = new float[repoCount][];
            for (int i = 0; i < repoCount; i++)
            {
                repoFeatures[i] = new float[featureCount];
                for (int f = 0; f < featureCount; f++)
                {
                    repoFeatures[i][f] = -.1f + .001f * r.Next(200);
                }
            }
        }