public void KappaVarianceTest1()
{
    // Example from Ientilucci, Emmett (2006). "On Using and Computing the Kappa Statistic".
    // Available on: http://www.cis.rit.edu/~ejipci/Reports/On_Using_and_Computing_the_Kappa_Statistic.pdf
    // Note: Congalton's method uses the Delta Method for approximating the Kappa variance.

    {
        // Matrix A (page 1)
        int[,] values =
        {
            { 317,  23,  0, 0 },
            {  61, 120,  0, 0 },
            {   2,   4, 60, 0 },
            {  35,  29,  0, 8 },
        };

        var cm = new GeneralConfusionMatrix(values);

        // Method A row totals (page 2)
        Assert.AreEqual(340, cm.RowTotals[0]);
        Assert.AreEqual(181, cm.RowTotals[1]);
        Assert.AreEqual(66, cm.RowTotals[2]);
        Assert.AreEqual(72, cm.RowTotals[3]);

        // Method A col totals (page 2)
        Assert.AreEqual(415, cm.ColumnTotals[0]);
        Assert.AreEqual(176, cm.ColumnTotals[1]);
        Assert.AreEqual(60, cm.ColumnTotals[2]);
        Assert.AreEqual(8, cm.ColumnTotals[3]);

        // Number of samples for A (page 2)
        Assert.AreEqual(659, cm.Samples);
        Assert.AreEqual(4, cm.Classes);

        // Po for A (page 2)
        Assert.AreEqual(0.7663, cm.OverallAgreement, 1e-4);
        Assert.IsFalse(double.IsNaN(cm.OverallAgreement));

        // Pc for A (page 3)
        Assert.AreEqual(0.4087, cm.ChanceAgreement, 1e-5);
        Assert.IsFalse(double.IsNaN(cm.ChanceAgreement));

        // Kappa value k_hat for A (page 3)
        Assert.AreEqual(0.605, cm.Kappa, 1e-3);
        Assert.IsFalse(double.IsNaN(cm.Kappa));

        double estimatedVariance = cm.Variance;
        double varianceUnderNull = cm.VarianceUnderNull;
        double deltaVariance = Accord.Statistics.Testing.KappaTest.DeltaMethodKappaVariance(cm);

        // Variance value var_k for A (page 4): only the Delta Method estimator
        // should reproduce the paper's value; the other two estimators must differ.
        Assert.AreEqual(0.00073735, deltaVariance, 1e-8);
        Assert.IsFalse(Math.Abs(0.00073735 - estimatedVariance) < 1e-8);
        Assert.IsFalse(Math.Abs(0.00073735 - varianceUnderNull) < 1e-8);

        Assert.IsFalse(double.IsNaN(estimatedVariance));
        Assert.IsFalse(double.IsNaN(varianceUnderNull));
        Assert.IsFalse(double.IsNaN(deltaVariance));
    }

    {
        // Matrix B
        int[,] values =
        {
            { 377, 79,  0, 0 },
            {   2, 72,  0, 0 },
            {  33,  5, 60, 0 },
            {   3, 20,  0, 8 },
        };

        var cm = new GeneralConfusionMatrix(values);

        // Method B row totals (page 2)
        Assert.AreEqual(456, cm.RowTotals[0]);
        Assert.AreEqual(74, cm.RowTotals[1]);
        Assert.AreEqual(98, cm.RowTotals[2]);
        Assert.AreEqual(31, cm.RowTotals[3]);

        // Method B col totals (page 2)
        Assert.AreEqual(415, cm.ColumnTotals[0]);
        Assert.AreEqual(176, cm.ColumnTotals[1]);
        Assert.AreEqual(60, cm.ColumnTotals[2]);
        Assert.AreEqual(8, cm.ColumnTotals[3]);

        // Number of samples for B (page 2)
        Assert.AreEqual(659, cm.Samples);
        Assert.AreEqual(4, cm.Classes);

        // Po for B (page 2)
        Assert.AreEqual(0.7845, cm.OverallAgreement, 1e-4);
        Assert.IsFalse(double.IsNaN(cm.OverallAgreement));

        // Pc for B (page 3)
        Assert.AreEqual(0.47986, cm.ChanceAgreement, 1e-5);
        Assert.IsFalse(double.IsNaN(cm.ChanceAgreement));

        // Kappa value k_hat for B (page 3)
        Assert.AreEqual(0.586, cm.Kappa, 1e-3);
        Assert.IsFalse(double.IsNaN(cm.Kappa));

        double estimatedVariance = cm.Variance;
        double varianceUnderNull = cm.VarianceUnderNull;
        double deltaVariance = Accord.Statistics.Testing.KappaTest.DeltaMethodKappaVariance(cm);

        // Variance value var_k for B (page 4): only the Delta Method estimator
        // should reproduce the paper's value; the other two estimators must differ.
        Assert.AreEqual(0.00087457, deltaVariance, 1e-8);
        Assert.IsFalse(Math.Abs(0.00087457 - estimatedVariance) < 1e-8);
        Assert.IsFalse(Math.Abs(0.00087457 - varianceUnderNull) < 1e-8);

        Assert.IsFalse(double.IsNaN(estimatedVariance));
        Assert.IsFalse(double.IsNaN(varianceUnderNull));
        Assert.IsFalse(double.IsNaN(deltaVariance));
    }
}
/// <summary>
///   Runs a sliding-window language-identification experiment over the sample texts of
///   <paramref name="mostCommonLanguagesArray"/> and, for each window length, writes a
///   confusion matrix to "{windowLength}.{method}.csv" in <c>_outputFolder</c>.
/// </summary>
/// <param name="identify">
///   Classifier: maps a text fragment to candidate languages. Only the first result is used,
///   so it is assumed to be ordered best-first — TODO confirm against the caller.
/// </param>
/// <param name="method">Name of the identification method; used only in the output file name.</param>
/// <param name="mostCommonLanguagesArray">
///   Languages to test; each must have a "{lang}.txt" sample file in <c>_languageSamplesDir</c>.
/// </param>
private void GetConfusions(Func<string, IEnumerable<Tuple<LanguageInfo, double>>> identify, string method, HashSet<string> mostCommonLanguagesArray)
{
    // Map each language to a stable integer index (enumeration order of the set) so that
    // languages become class labels for the confusion matrix.
    var mostCommonLanguages = mostCommonLanguagesArray.Select((item, i) => new { item, i }).ToDictionary(_ => _.item, _ => _.i);

    // Window sizes (in tokens): dense coverage for short windows, then progressively sparser.
    var windowLengthList = Enumerable.Range(1, 10).Concat(new[] { 13, 16, 20, 23, 26, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200 }).ToArray();

    mostCommonLanguagesArray
        .Select(
            lang =>
            {
                var text = File.ReadAllText(Path.Combine(_languageSamplesDir, lang + ".txt"));
                var middle = text.Length / 2;
                var window = 1 * 1000 * 1000; // take the middle of 1M characters length
                // NOTE(review): the length argument is capped by text.Length, not by the
                // characters remaining after the start offset; this happens to stay in range
                // for both the short-text and long-text cases, but it takes window-1 (not
                // window) characters when the text is long — confirm the off-by-one is intended.
                return (Tuple.Create(lang, text.Substring(Math.Max(middle - window / 2, 0), Math.Min(window - 1, text.Length))));
            })
        .AsParallel()
        .AsOrdered() // keep results in language order despite parallel execution
        .SelectMany(
            _ =>
            {
                var lang = _.Item1;
                var sample = _.Item2;
                var tokenizer = new Tokenizer();
                //printfn "tokenizing"
                var tokenNumber = 1000;
                // Skip the first 5 tokens (likely a mid-word/boundary artifact of the
                // substring cut — TODO confirm) and keep at most 1000 tokens per language.
                var tokens = tokenizer.GetTokens(sample).Skip(5).Take(tokenNumber).ToArray();
                //printfn "tokenized"
                return (windowLengthList
                    .Select(
                        windowLength =>
                        {
                            var windowCount = tokenNumber - windowLength + 1;
                            var samplePeriod = (int)Math.Ceiling(windowCount / 100.0); //100 samples on average
                            // Buffer(size, skip) yields overlapping token windows every
                            // samplePeriod tokens; each window is joined back into text and
                            // classified, keeping only the top language's ISO 639-2/T code.
                            var actuals = tokens.Buffer(windowLength, samplePeriod)
                                .Select(tokenWindow => System.String.Join(" ", tokenWindow))
                                .Select(windowText => identify(windowText).First().Item1.Iso639_2T)
                                .ToArray();
                            return Tuple.Create(lang, windowLength, actuals);
                        }));
            })
        .GroupBy(_ => _.Item2) // group the (lang, windowLength, actuals) triples by window length
        .ForEach(g =>
        {
            var windowLength = g.Key;
            // Flatten to (expected-class-index, predicted-class-index) pairs for this window length.
            var experiment = g.SelectMany(
                _ =>
                {
                    var lang = _.Item1;
                    var actuals = _.Item3;
                    return (actuals
                        .Select(a => Tuple.Create(mostCommonLanguages[lang], mostCommonLanguages[a])));
                })
                .ToArray();
            // Constructor is assumed to be (classes, expected[], predicted[]) per the
            // Accord.NET GeneralConfusionMatrix API — TODO confirm argument order.
            var matrix = new GeneralConfusionMatrix(
                mostCommonLanguagesArray.Count,
                experiment.Select(_ => _.Item1).ToArray(),
                experiment.Select(_ => _.Item2).ToArray());
            using (var writer = new StreamWriter(Path.Combine(_outputFolder, windowLength + "." + method + ".csv")))
            {
                PrintMatrix(writer, matrix, mostCommonLanguagesArray.ToArray());
            }
        });
}