/// <summary>
/// Runs the level-1 classification pass: initialises the selected features, builds the
/// model, scores every test document and publishes the aggregated results.
/// </summary>
/// <param name="TrainData">Documents passed to <c>CreateModel</c> and <c>level1RunTestForLevel2</c>.</param>
/// <param name="TestData">Documents that are scored and reported on.</param>
private void Level1Classification(List<DocumentData> TrainData, List<DocumentData> TestData)
{
    // Positions of the features whose IsSelected flag is set.
    int[] selectedFeatures = AppDataCenter.Singleton.FeaturesSelected
        .Select((feature, position) => new { item = feature, index = position })
        .Where(a => a.item.IsSelected)
        .Select(a => a.index)
        .ToArray();
    ClassifierService.Service.InitSelectFeatures(selectedFeatures);

    AppDataCenter.Singleton.ChosenFeatures = ClassifierService.Service.LocalFeatures;
    AppDataCenter.Singleton.UpdateFeatureVertHorzSelection();
    AppDataCenter.Singleton.EntireModelStats = ClassifierService.Service.CreateModel(
        AppDataCenter.Singleton.ChosenFields, TrainData, TestData, Configuration, m_featuresCalculate,
        AppDataCenter.Singleton.AddConsoleMessage, TuneScale, AppDataCenter.Singleton.PathToScale);

    AppDataCenter.Singleton.RejectedFields.Clear();
    ClassifierService.Service.level1RunTestForLevel2(
        TrainData, AppDataCenter.Singleton.ChosenFields, AppDataCenter.Singleton.AddConsoleMessage);

    var chosenFields = AppDataCenter.Singleton.ChosenFields;
    ConcurrentBag<FieldReportItem> fieldBag = new ConcurrentBag<FieldReportItem>();
    AppDataCenter.Singleton.AddConsoleMessage("Running Test on : " + TestData.Count + " Pages");

    int index = 0;
    foreach (DocumentData doc1 in TestData)
    {
        // Left empty here; handed to getDocFeatures as its feature-index filter.
        List<int> featursIndexFilter = new List<int>();

        // Time the per-document feature pass so the console line reports a real duration
        // (previously the reported span was never advanced and always printed 0).
        System.Diagnostics.Stopwatch featureTimer = System.Diagnostics.Stopwatch.StartNew();
        ClassifierService.Service.getDocFeatures(doc1, null, null, chosenFields, featursIndexFilter, false, null, true, true);
        featureTimer.Stop();

        int goldenCount = 0;
        int matchedCount = 0;
        for (int fieldIndex = 0; fieldIndex < chosenFields.Length; fieldIndex++)
        {
            var fieldName = chosenFields[fieldIndex];
            goldenCount++;

            // Highest-confidence candidate for this field; null when the document has no candidates.
            CandidateData fieldCandidate = doc1.Candidates
                .OrderByDescending(a => a.AccordConfidance[fieldIndex])
                .FirstOrDefault();
            FieldReportItem fieldReportItem = new FieldReportItem(fieldName, fieldIndex, fieldCandidate, doc1);

            if (fieldReportItem.IsMatch)
            {
                matchedCount++;
            }
            else if (fieldCandidate == null)
            {
                Trace.WriteLine("Doc : " + doc1.DocumentName + ",Field : " + fieldName + " ,Recognized : <no candidate>");
            }
            else
            {
                Trace.WriteLine("Doc : " + doc1.DocumentName + " ,Confidance : " + fieldCandidate.AccordConfidance[fieldIndex]
                    + ",Field : " + fieldName + " ,Recognized : " + fieldCandidate.NameFromTypist
                    + " content : " + fieldCandidate.Content + " False");
            }

            fieldBag.Add(fieldReportItem);
        }

        AppDataCenter.Singleton.AddConsoleMessage(index++ + " Page : " + doc1.DocumentName
            + " Getting Features (sec) : " + featureTimer.Elapsed.TotalSeconds
            + " success : " + Math.Round((double)matchedCount / goldenCount * 100) + "%");
    }

    AppDataCenter.Singleton.AddConsoleMessage("Update results");

    // Snapshot the bag once; every aggregate below reads from this list.
    List<FieldReportItem> results = fieldBag.ToList();
    AppDataCenter.Singleton.EntireModelStats.testresults = results;
    ReportData.noOfFields = results.Count;
    ReportData.NoOfPages = TestData.Count;
    ReportData.fpCount = results.Count(a => a.IsFP);
    ReportData.matchCount = results.Count(a => a.IsMatch);
    ReportData.rejectCount = results.Count(a => a.IsRejected);
    foreach (FieldReportItem fieldresult in results.Where(a => a.IsMatch))
    {
        ReportData.QualityMatch.Add(fieldresult.MatchQuality);
    }

    foreach (var fieldName in chosenFields)
    {
        // NOTE(review): Field is presumably the candidate handed to the FieldReportItem
        // constructor and may therefore be null - confirm against FieldReportItem.
        int countNum = results.Count(a => a.Field != null && a.Field.NameFromTypist == fieldName);
        int matchedNum = results.Count(a => a.Field != null && a.Field.NameFromTypist == fieldName && a.IsMatch);
        double success = (double)matchedNum / countNum * 100; // NaN when the field never occurs
        Trace.WriteLine("*********** The field : " + fieldName + " success : " + Math.Round(success, 1)
            + "% Count : " + countNum + " *************");
    }

    AppDataCenter.Singleton.NotifyChange(NotifyGroup.ScatterData);
}
/// <summary>
/// Runs the level-3 classification pass: builds the level-3 model, records each test
/// document's best level-1 candidate per field, scores the stage-3 candidates and
/// reports per-page, per-field and overall match rates.
/// </summary>
/// <param name="TrainData">Documents passed to <c>buildModelLevel3</c>.</param>
/// <param name="TestData">Documents that are scored and reported on.</param>
private void Level3Classification(List<DocumentData> TrainData, List<DocumentData> TestData)
{
    // Number of top level-1 candidates that are re-ranked by their level-3 confidence.
    const int TopLevel1Candidates = 7;

    AppDataCenter.Singleton.AddConsoleMessage("\nStarting Level3 ***************\n");
    ClassifierService.Service.InitFeatureslevel3(AppDataCenter.Singleton.ChosenFields);
    ClassifierService.Service.buildModelLevel3(
        AppDataCenter.Singleton.ChosenFields, TrainData, Configuration, AppDataCenter.Singleton.AddConsoleMessage);

    var chosenFields = AppDataCenter.Singleton.ChosenFields;
    ConcurrentBag<FieldReportItem> fieldBag = new ConcurrentBag<FieldReportItem>();

    AppDataCenter.Singleton.AddConsoleMessage("Getting results from level 1 to be used in level 3");
    TestData.AsParallel().ForAll(doc =>
    {
        doc.lastResultsCandidates = new Dictionary<string, CandidateData>();
        for (int i = 0; i < chosenFields.Length; i++)
        {
            // First candidate whose level-1 confidence equals the maximum for field i.
            // The maximum is computed once per field instead of once per candidate.
            CandidateData releventCandidate = null;
            if (doc.Candidates.Any())
            {
                var topConfidence = doc.Candidates.Max(b => b.AccordConfidance[i]);
                releventCandidate = doc.Candidates.FirstOrDefault(a => a.AccordConfidance[i] == topConfidence);
            }

            doc.lastResultsCandidates.Add(chosenFields[i], releventCandidate);
        }
    });

    AppDataCenter.Singleton.AddConsoleMessage("Running Test on : " + TestData.Count + " Pages");

    int index = 0;
    foreach (DocumentData doc1 in TestData)
    {
        // Accumulates only the time spent inside GetFieldFeatures3. The previous code added
        // "now - start of document" for every candidate, which over-counted cumulatively.
        System.Diagnostics.Stopwatch featureTimer = new System.Diagnostics.Stopwatch();
        foreach (CandidateData field in doc1.CandidatesForStage3)
        {
            featureTimer.Start();
            field.Features3 = ClassifierService.Service.GetFieldFeatures3(doc1, field);
            featureTimer.Stop();

            double[] confidenceOut;
            ClassifierService.Service.GetDescition3(field.Features3, out confidenceOut);
            field.AccordConfidance3 = confidenceOut;
        }

        int goldenCount = 0;
        int matchedCount = 0;
        for (int fieldIndex = 0; fieldIndex < chosenFields.Length; fieldIndex++)
        {
            var fieldName = chosenFields[fieldIndex];
            goldenCount++;

            // Shortlist by level-1 confidence, then pick the best level-3 confidence;
            // null when the document has no stage-3 candidates.
            CandidateData fieldCandidate = doc1.CandidatesForStage3
                .OrderByDescending(a => a.AccordConfidance[fieldIndex])
                .Take(TopLevel1Candidates)
                .OrderByDescending(a => a.AccordConfidance3[fieldIndex])
                .FirstOrDefault();
            FieldReportItem fieldReportItem = new FieldReportItem(fieldName, fieldIndex, fieldCandidate, doc1);

            if (fieldReportItem.IsMatch)
            {
                matchedCount++;
            }
            else if (fieldCandidate == null)
            {
                Trace.WriteLine("Doc : " + doc1.DocumentName + ",Field : " + fieldName + " ,Recognized : <no candidate>");
            }
            else
            {
                Trace.WriteLine("Doc : " + doc1.DocumentName + " ,Confidance : " + fieldCandidate.AccordConfidance3[fieldIndex]
                    + ",Field : " + fieldName + " ,Recognized : " + fieldCandidate.NameFromTypist
                    + " content : " + fieldCandidate.Content + " False");
            }

            fieldBag.Add(fieldReportItem);
        }

        AppDataCenter.Singleton.AddConsoleMessage(index++ + " Page : " + doc1.DocumentName
            + " Getting Features (sec) : " + featureTimer.Elapsed.TotalSeconds
            + " success : " + Math.Round((double)matchedCount / goldenCount * 100, 1) + "%");
    }

    AppDataCenter.Singleton.AddConsoleMessage("Update results");

    // Snapshot the bag once; every aggregate below reads from this list.
    List<FieldReportItem> results = fieldBag.ToList();
    double noOfFields = results.Count;
    double matchCount = results.Count(a => a.IsMatch);

    foreach (var fieldName in chosenFields)
    {
        // NOTE(review): Field is presumably the candidate handed to the FieldReportItem
        // constructor and may therefore be null - confirm against FieldReportItem.
        int countNum = results.Count(a => a.Field != null && a.Field.NameFromTypist == fieldName);
        int matchedNum = results.Count(a => a.Field != null && a.Field.NameFromTypist == fieldName && a.IsMatch);
        double success = (double)matchedNum / countNum * 100; // NaN when the field never occurs
        Trace.WriteLine("*********** The field : " + fieldName + " success : " + Math.Round(success, 1)
            + "% Count : " + countNum + " *************");
    }

    AppDataCenter.Singleton.AddConsoleMessage("Total result level 3 , sucsss : "
        + Math.Round(matchCount / noOfFields * 100, 2) + "%");
}
/// <summary>
/// Runs the level-2 classification pass: builds feature groups and the level-2 model,
/// picks the highest-confidence solution for every test document and reports
/// per-page and overall match rates.
/// </summary>
/// <param name="TrainData">Documents passed to the level-2 training preparation and model build.</param>
/// <param name="TestData">Documents whose solution candidates are scored and reported on.</param>
public void Level2Classification(List<DocumentData> TrainData, List<DocumentData> TestData)
{
    var chosenFields = AppDataCenter.Singleton.ChosenFields;
    List<int[]> groups = ClassifierService.Service.BuildGroupsOfFeatures(chosenFields);

    // Fields that appear in any group get 2 options per solution, all others get 1.
    // The group members are flattened once instead of once per field.
    HashSet<int> groupedIndices = new HashSet<int>(groups.SelectMany(a => a));
    int[] optionsPerSol = new int[chosenFields.Length];
    for (int i = 0; i < optionsPerSol.Length; i++)
    {
        optionsPerSol[i] = groupedIndices.Contains(i) ? 2 : 1;
    }

    ClassifierService.Service.InitFeatureslevel2(groups);
    ClassifierService.Service.Prepareraindatafalsegrouping(TrainData, optionsPerSol, 0, AppDataCenter.Singleton.AddConsoleMessage);
    ClassifierService.Service.buildModelLevel2(chosenFields, TrainData, groups, Configuration, AppDataCenter.Singleton.AddConsoleMessage);
    ClassifierService.Service.PrepareCandidateslevel2Test(TestData, optionsPerSol, 0, AppDataCenter.Singleton.AddConsoleMessage);

    foreach (DocumentData doc in TestData)
    {
        solutionData chosenSolution = null;

        // Start below every finite confidence. The previous sentinel of -100 left the
        // document without a solution whenever all confidences were <= -100.
        double lastProb = double.NegativeInfinity;
        foreach (var sol in doc.solutionCandidates.ToList())
        {
            double conf;
            ClassifierService.Service.GetDescitionLevel2(sol.features, out conf);
            sol.Confidance = conf;
            if (conf > lastProb)
            {
                chosenSolution = sol;
                lastProb = conf;
            }
        }

        doc.chosenSolution = chosenSolution;
    }

    int index = 0;
    double TotalGolden = 0;
    double TotalMatched = 0;
    foreach (DocumentData doc1 in TestData)
    {
        if (doc1.chosenSolution == null)
        {
            // No usable solution candidate: count every field as unmatched instead of
            // failing with a NullReferenceException.
            TotalGolden += chosenFields.Length;
            AppDataCenter.Singleton.AddConsoleMessage(index++ + " Page : " + doc1.DocumentName
                + " no solution candidate - all fields counted as unmatched");
            continue;
        }

        int goldenCount = 0;
        int matchedCount = 0;
        for (int fieldIndex = 0; fieldIndex < chosenFields.Length; fieldIndex++)
        {
            var fieldName = chosenFields[fieldIndex];
            goldenCount++;

            FieldReportItem fieldReportItem = new FieldReportItem(
                fieldName, fieldIndex, doc1.chosenSolution.offeredSolution[fieldIndex], doc1);
            if (fieldReportItem.IsMatch)
            {
                matchedCount++;
            }

            Trace.WriteLine("Doc : " + doc1.DocumentName + ",Field : " + fieldName
                + " ,Recognized : " + fieldReportItem.MatchedName
                + " content : " + doc1.chosenSolution.offeredSolution[fieldIndex].Content
                + "Match : " + (fieldReportItem.IsMatch ? " True" : " False"));
        }

        TotalGolden += goldenCount;
        TotalMatched += matchedCount;

        // No feature extraction happens in this loop, so the reported duration is zero;
        // the segment is kept so the console line has the same shape as the other levels.
        AppDataCenter.Singleton.AddConsoleMessage(index++ + " Page : " + doc1.DocumentName
            + " Getting Features (sec) : " + TimeSpan.Zero.TotalSeconds
            + " success : " + Math.Round((double)matchedCount / goldenCount * 100) + "%");
    }

    AppDataCenter.Singleton.AddConsoleMessage(" Total Matched : " + Math.Round(TotalMatched / TotalGolden * 100, 1) + "%");
}