Пример #1
0
        /// <summary>
        /// Scans a final-data-set CSV and collects, for every input record, the rows that the
        /// decision forest classifies as a match for it.
        /// </summary>
        /// <param name="records">
        /// Records to look up. Each one becomes a key of the result; Dictionary.Add throws
        /// ArgumentException if two of them compare equal as keys.
        /// </param>
        /// <param name="finalDataSetPath">
        /// CSV file to scan. Defaults to the location that was previously hard-coded.
        /// </param>
        /// <param name="forestDirectory">
        /// Directory handed to DataLoader.LoadForestFromDirectory. Defaults to the location that
        /// was previously hard-coded.
        /// </param>
        /// <returns>
        /// A dictionary with one entry per input record, mapping it to the (possibly empty) list
        /// of cleaned data-set records the forest matched it with.
        /// </returns>
        public static Dictionary <Record, List <Record> > GetMatches(
            List <Record> records,
            string finalDataSetPath = "c:/users/brush/desktop/finaldataset.csv",
            string forestDirectory  = "C:/users/brush/desktop/forest")
        {
            // Only rows whose enterprise id is strictly greater than this value are compared.
            const int MinEnterpriseIdExclusive = 15374761;

            Dictionary <Record, List <Record> > ret = new Dictionary <Record, List <Record> >();

            string[]       finalDataSetList = File.ReadAllLines(finalDataSetPath);
            DecisionTree[] forest           = DataLoader.LoadForestFromDirectory(forestDirectory);

            // Pre-populate every key so the parallel section only ever appends to existing lists.
            foreach (Record record in records)
            {
                ret.Add(record, new List <Record>());
            }

            Console.WriteLine("Searching for matches...");

            // Index 0 is skipped (presumably the CSV header row - confirm against the file format).
            Parallel.For(1, finalDataSetList.Length, c =>
            {
                // Coarse progress report; lines may appear out of order because rows run in parallel.
                if (c % 10000 == 0)
                {
                    Console.WriteLine($"{(c / (finalDataSetList.Length * 1.0) * 100)}%");
                }

                string[] bits = finalDataSetList[c].Split(',');

                // Rows without an id, or with an id at or below the threshold, are ignored.
                if (bits[0] == "" || int.Parse(bits[0]) <= MinEnterpriseIdExclusive)
                {
                    return;
                }

                Record comparisonRecord = DataCleaner.CleanRecord(Record.FromFinalDatasetString(bits));

                foreach (Record toMatch in records)
                {
                    // Never pair a record with itself.
                    if (toMatch.Equals(comparisonRecord))
                    {
                        continue;
                    }

                    RecordPair pair = new RecordPair
                    {
                        Record1 = toMatch,
                        Record2 = comparisonRecord,
                    };

                    if (DecisionTreeBuilder.IsMatch(pair, forest, null))
                    {
                        // The result lists are shared between worker threads; serialize appends.
                        lock (ret)
                        {
                            ret[toMatch].Add(comparisonRecord);
                        }
                    }
                }
            });

            return(ret);
        }
Пример #2
0
        /// <summary>
        /// Interactive console helper. Reads two raw record lines from standard input, runs the
        /// cleaned pair through the forest loaded from the current directory with a TreeLogger
        /// attached, and then - if the user answers "y" - appends to gothere.txt every training
        /// pair for which DecisionTreeBuilder.ReplayDecision returns true against the logger's
        /// SplittingQuestionsToTheBottom.
        /// </summary>
        public static void Test()
        {
            Console.WriteLine("Line 1:");
            string line1 = Console.ReadLine();

            Console.WriteLine("Line 2:");
            string line2 = Console.ReadLine();

            RecordPair pair = new RecordPair();

            pair.Record1 = DataCleaner.CleanRecord(Record.FromString(line1));
            pair.Record2 = DataCleaner.CleanRecord(Record.FromString(line2));

            DecisionTree[] forest = DataLoader.LoadForestFromDirectory(".");

            // The match result itself is not used here; the call is kept because the logger it
            // receives is read below when replaying decisions.
            TreeLogger logger = new TreeLogger();
            DecisionTreeBuilder.IsMatch(pair, forest, logger);

            Console.WriteLine("Search for those in training data who make it there?");
            string response = Console.ReadLine();

            // Console.ReadLine returns null at end of input, so compare null-safely and without
            // depending on the current culture's casing rules.
            if (string.Equals(response, "y", StringComparison.OrdinalIgnoreCase))
            {
                using (StreamWriter sw = File.AppendText("c:/users/brush/desktop/gothere.txt"))
                {
                    List <RecordPair> pairs = new List <RecordPair>();
                    Console.Write("Loading training data for this iteration...");
                    pairs.AddRange(DataLoader.GetPositivesFromMRNData("mrns.csv"));
                    pairs.AddRange(DataLoader.GetHandPassedSets("more.csv"));
                    pairs.AddRange(DataLoader.GetRejectedRecordPairs("rejected.txt"));
                    Console.WriteLine("...done");

                    Parallel.ForEach(pairs, p =>
                    {
                        if (DecisionTreeBuilder.ReplayDecision(p, logger.SplittingQuestionsToTheBottom))
                        {
                            // The writer is shared by all worker threads; serialize writes.
                            lock (sw)
                            {
                                sw.WriteLine(p);
                            }
                        }
                    });
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Loads labelled record pairs from a text file laid out in groups of four lines:
        /// a boolean match flag, the first record, the second record, and a fourth line that is
        /// never read.
        /// </summary>
        /// <param name="noHomoFilePath">Path of the file to read.</param>
        /// <returns>One cleaned RecordPair per complete group found in the file.</returns>
        public static List <RecordPair> LoadTrainingDataFromNoHomoFile(string noHomoFilePath)
        {
            List <RecordPair> ret = new List <RecordPair>();

            string[] lines = File.ReadAllLines(noHomoFilePath);

            // Require all three lines that are actually read to exist, so a trailing partial
            // group (e.g. a stray blank line at the end of the file) ends the loop instead of
            // throwing.
            for (int c = 0; c + 2 < lines.Length; c += 4)
            {
                RecordPair pair = new RecordPair();
                pair.IsMatch = bool.Parse(lines[c]);
                pair.Record1 = DataCleaner.CleanRecord(Record.FromString(lines[c + 1]));
                pair.Record2 = DataCleaner.CleanRecord(Record.FromString(lines[c + 2]));

                ret.Add(pair);
            }

            return(ret);
        }
Пример #4
0
        /// <summary>
        /// Builds non-matching training pairs from a reject file laid out in groups of three
        /// lines: the first record, the second record, and a third line that is never read.
        /// </summary>
        /// <param name="rejectFile">Path of the reject file to read.</param>
        /// <returns>
        /// The cleaned pairs, each flagged IsMatch = false, that pass PassesBigBucketFilter.
        /// </returns>
        public static List <RecordPair> GetRejectedRecordPairs(string rejectFile)
        {
            List <RecordPair> trainingData = new List <RecordPair>();

            string[] rejectedLines = File.ReadAllLines(rejectFile);

            // Require both record lines to exist, so a dangling last line ends the loop instead
            // of indexing past the end of the array.
            for (int c = 0; c + 1 < rejectedLines.Length; c += 3)
            {
                string line1 = rejectedLines[c];
                string line2 = rejectedLines[c + 1];

                RecordPair failurePair = new RecordPair();
                failurePair.Record1 = DataCleaner.CleanRecord(Record.FromString(line1));
                failurePair.Record2 = DataCleaner.CleanRecord(Record.FromString(line2));
                failurePair.IsMatch = false;

                if (PassesBigBucketFilter(failurePair))
                {
                    trainingData.Add(failurePair);
                }
            }

            return(trainingData);
        }
Пример #5
0
        /// <summary>
        /// Streams the final data set file and indexes its cleaned records by enterprise id.
        /// The first line is discarded; rows with an empty first field or an id below 15374761
        /// are skipped.
        /// </summary>
        /// <param name="finalDataSetPath">Path of the comma-separated file to read.</param>
        /// <returns>Cleaned records keyed by their EnterpriseId.</returns>
        public static Dictionary <int, Record> LoadFinalDataSet(string finalDataSetPath)
        {
            Dictionary <int, Record> recordsById = new Dictionary <int, Record>();

            using (IEnumerator <string> rows = File.ReadLines(finalDataSetPath).GetEnumerator())
            {
                // Consume and drop the first line; an empty file yields an empty result.
                if (!rows.MoveNext())
                {
                    return(recordsById);
                }

                while (rows.MoveNext())
                {
                    string   row    = rows.Current;
                    string[] fields = row.Split(',');

                    if (fields[0] == "")
                    {
                        continue;
                    }

                    if (int.Parse(fields[0]) < 15374761)
                    {
                        continue;
                    }

                    Record cleaned = DataCleaner.CleanRecord(Record.FromFinalDatasetString(row));
                    recordsById.Add(cleaned.EnterpriseId, cleaned);
                }
            }

            return(recordsById);
        }
Пример #6
0
        /// <summary>
        /// Builds training pairs from a file of record groups separated by empty lines.
        /// Records within one group are paired as matches (each record is also paired with
        /// itself); records from different groups are paired as non-matches, once per ordering
        /// of the two groups. Only pairs accepted by PassesBigBucketFilter are returned.
        /// </summary>
        /// <param name="inputMoreFilePath">Path of the grouped records file.</param>
        /// <returns>The accepted positive and negative pairs, in generation order.</returns>
        public static List <RecordPair> GetHandPassedSets(string inputMoreFilePath)
        {
            List <RecordPair> accepted = new List <RecordPair>();

            string[]        rawLines = File.ReadAllLines(inputMoreFilePath);
            List <Record[]> groups   = new List <Record[]>();

            // Split the file into groups: consecutive non-empty lines form one group.
            int cursor = 0;
            while (cursor < rawLines.Length)
            {
                List <Record> members = new List <Record>();
                while (cursor < rawLines.Length && rawLines[cursor] != "")
                {
                    Record parsed = Record.FromString(rawLines[cursor]);
                    members.Add(DataCleaner.CleanRecord(parsed));
                    cursor++;
                }
                groups.Add(members.ToArray());

                // Step over the empty separator line (or past the end of the file).
                cursor++;
            }

            for (int groupIndex = 0; groupIndex < groups.Count; groupIndex++)
            {
                Record[] current = groups[groupIndex];

                // Positives: every record with itself and with each later record of its group.
                for (int first = 0; first < current.Length; first++)
                {
                    for (int second = first; second < current.Length; second++)
                    {
                        RecordPair positive = new RecordPair();
                        positive.IsMatch = true;
                        positive.Record1 = current[first];
                        positive.Record2 = current[second];

                        if (PassesBigBucketFilter(positive))
                        {
                            accepted.Add(positive);
                        }
                    }
                }

                // Negatives: every record of this group against every record of each other group.
                for (int otherIndex = 0; otherIndex < groups.Count; otherIndex++)
                {
                    if (otherIndex == groupIndex)
                    {
                        continue;
                    }

                    Record[] others = groups[otherIndex];
                    foreach (Record mine in current)
                    {
                        foreach (Record theirs in others)
                        {
                            RecordPair negative = new RecordPair();
                            negative.IsMatch = false;
                            negative.Record1 = mine;
                            negative.Record2 = theirs;

                            if (PassesBigBucketFilter(negative))
                            {
                                accepted.Add(negative);
                            }
                        }
                    }
                }
            }

            return(accepted);
        }