public void KappaVarianceTest1()
        {
            // Reference values come from Ientilucci, Emmett (2006),
            // "On Using and Computing the Kappa Statistic", available at
            // http://www.cis.rit.edu/~ejipci/Reports/On_Using_and_Computing_the_Kappa_Statistic.pdf
            //
            // Congalton's method uses the Delta Method for approximating the Kappa
            // variance, so only DeltaMethodKappaVariance is expected to reproduce the
            // published variance; the matrix's own Variance and VarianceUnderNull
            // properties are asserted to differ from it.

            // ---- Matrix A (page 1) ----
            {
                int[,] countsA =
                {
                    { 317,  23,  0, 0 },
                    {  61, 120,  0, 0 },
                    {   2,   4, 60, 0 },
                    {  35,  29,  0, 8 },
                };

                const double publishedVariance = 0.00073735; // var_k for A (page 4)
                const double varianceTolerance = 1e-8;

                var confusionA = new GeneralConfusionMatrix(countsA);

                // Row totals for A (page 2)
                int[] expectedRowTotals = { 340, 181, 66, 72 };
                for (int i = 0; i < expectedRowTotals.Length; i++)
                {
                    Assert.AreEqual(expectedRowTotals[i], confusionA.RowTotals[i]);
                }

                // Column totals for A (page 2)
                int[] expectedColumnTotals = { 415, 176, 60, 8 };
                for (int i = 0; i < expectedColumnTotals.Length; i++)
                {
                    Assert.AreEqual(expectedColumnTotals[i], confusionA.ColumnTotals[i]);
                }

                // Sample and class counts for A (page 2)
                Assert.AreEqual(659, confusionA.Samples);
                Assert.AreEqual(4, confusionA.Classes);

                // Observed agreement Po for A (page 2)
                Assert.AreEqual(0.7663, confusionA.OverallAgreement, 1e-4);
                Assert.IsFalse(double.IsNaN(confusionA.OverallAgreement));

                // Chance agreement Pc for A (page 3)
                Assert.AreEqual(0.4087, confusionA.ChanceAgreement, 1e-5);
                Assert.IsFalse(double.IsNaN(confusionA.ChanceAgreement));

                // Kappa estimate k_hat for A (page 3)
                Assert.AreEqual(0.605, confusionA.Kappa, 1e-3);
                Assert.IsFalse(double.IsNaN(confusionA.Kappa));

                double variance = confusionA.Variance;
                double nullVariance = confusionA.VarianceUnderNull;
                double deltaVariance = Accord.Statistics.Testing.KappaTest.DeltaMethodKappaVariance(confusionA);

                // The Delta Method estimate matches the published variance ...
                Assert.AreEqual(publishedVariance, deltaVariance, varianceTolerance);

                // ... while the other two estimates must not fall within tolerance of it.
                Assert.IsFalse(Math.Abs(publishedVariance - variance) < varianceTolerance);
                Assert.IsFalse(Math.Abs(publishedVariance - nullVariance) < varianceTolerance);

                Assert.IsFalse(double.IsNaN(variance));
                Assert.IsFalse(double.IsNaN(nullVariance));
                Assert.IsFalse(double.IsNaN(deltaVariance));
            }

            // ---- Matrix B ----
            {
                int[,] countsB =
                {
                    { 377, 79,  0, 0 },
                    {   2, 72,  0, 0 },
                    {  33,  5, 60, 0 },
                    {   3, 20,  0, 8 },
                };

                const double publishedVariance = 0.00087457; // var_k for B (page 4)
                const double varianceTolerance = 1e-8;

                var confusionB = new GeneralConfusionMatrix(countsB);

                // Row totals for B (page 2)
                int[] expectedRowTotals = { 456, 74, 98, 31 };
                for (int i = 0; i < expectedRowTotals.Length; i++)
                {
                    Assert.AreEqual(expectedRowTotals[i], confusionB.RowTotals[i]);
                }

                // Column totals for B (page 2)
                int[] expectedColumnTotals = { 415, 176, 60, 8 };
                for (int i = 0; i < expectedColumnTotals.Length; i++)
                {
                    Assert.AreEqual(expectedColumnTotals[i], confusionB.ColumnTotals[i]);
                }

                // Sample and class counts for B (page 2)
                Assert.AreEqual(659, confusionB.Samples);
                Assert.AreEqual(4, confusionB.Classes);

                // Observed agreement Po for B (page 2)
                Assert.AreEqual(0.7845, confusionB.OverallAgreement, 1e-4);
                Assert.IsFalse(double.IsNaN(confusionB.OverallAgreement));

                // Chance agreement Pc for B (page 3)
                Assert.AreEqual(0.47986, confusionB.ChanceAgreement, 1e-5);
                Assert.IsFalse(double.IsNaN(confusionB.ChanceAgreement));

                // Kappa estimate k_hat for B (page 3)
                Assert.AreEqual(0.586, confusionB.Kappa, 1e-3);
                Assert.IsFalse(double.IsNaN(confusionB.Kappa));

                double variance = confusionB.Variance;
                double nullVariance = confusionB.VarianceUnderNull;
                double deltaVariance = Accord.Statistics.Testing.KappaTest.DeltaMethodKappaVariance(confusionB);

                // The Delta Method estimate matches the published variance ...
                Assert.AreEqual(publishedVariance, deltaVariance, varianceTolerance);

                // ... while the other two estimates must not fall within tolerance of it.
                Assert.IsFalse(Math.Abs(publishedVariance - variance) < varianceTolerance);
                Assert.IsFalse(Math.Abs(publishedVariance - nullVariance) < varianceTolerance);

                Assert.IsFalse(double.IsNaN(variance));
                Assert.IsFalse(double.IsNaN(nullVariance));
                Assert.IsFalse(double.IsNaN(deltaVariance));
            }
        }
// Example no. 2
// 0
        /// <summary>
        /// For each window length in a fixed list, identifies the language of token
        /// windows taken from every language sample file and writes one confusion
        /// matrix per window length to "&lt;windowLength&gt;.&lt;method&gt;.csv" in the
        /// output folder.
        /// </summary>
        /// <param name="identify">Identifier under evaluation; only the first result it returns for a window is used.</param>
        /// <param name="method">Label embedded in the output file names.</param>
        /// <param name="mostCommonLanguagesArray">Language codes to evaluate; each needs a "&lt;code&gt;.txt" file in the samples directory.</param>
        private void GetConfusions(Func <string, IEnumerable <Tuple <LanguageInfo, double> > > identify, string method, HashSet <string> mostCommonLanguagesArray)
        {
            // Each language code is mapped to its position in the set's enumeration order;
            // those positions are the class indices fed to the confusion matrix.
            var languageIndex = mostCommonLanguagesArray
                                .Select((code, position) => new { code, position })
                                .ToDictionary(entry => entry.code, entry => entry.position);

            // Window lengths (in tokens) to evaluate: 1..10, then a coarser progression up to 200.
            var windowLengths = Enumerable.Range(1, 10)
                                .Concat(new[] { 13, 16, 20, 23, 26, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200 })
                                .ToArray();

            // Stage 1 (lazy): read each sample file and keep a slice of at most
            // (1M - 1) characters starting half a span before the middle of the text.
            var languageSamples = mostCommonLanguagesArray.Select(
                language =>
            {
                var content = File.ReadAllText(Path.Combine(_languageSamplesDir, language + ".txt"));
                var span    = 1 * 1000 * 1000;
                var start   = Math.Max(content.Length / 2 - span / 2, 0);
                var length  = Math.Min(span - 1, content.Length);
                return(Tuple.Create(language, content.Substring(start, length)));
            });

            // Stage 2 (parallel, order preserved): tokenize every sample and, for every
            // window length, identify the language of the sampled token windows.
            var identifications = languageSamples
                                  .AsParallel()
                                  .AsOrdered()
                                  .SelectMany(
                sample =>
            {
                var sourceLanguage = sample.Item1;
                var tokenNumber    = 1000;
                var tokens         = new Tokenizer().GetTokens(sample.Item2).Skip(5).Take(tokenNumber).ToArray();

                return(windowLengths.Select(
                           windowLength =>
                {
                    // The period is derived from the nominal token count so that roughly
                    // 100 windows are sampled on average.
                    var windowCount  = tokenNumber - windowLength + 1;
                    var samplePeriod = (int)Math.Ceiling(windowCount / 100.0);

                    // NOTE(review): Buffer(count, skip) is an external extension; presumably it
                    // yields windows of windowLength tokens advancing by samplePeriod - confirm.
                    var identified = tokens
                                     .Buffer(windowLength, samplePeriod)
                                     .Select(tokenWindow => identify(System.String.Join(" ", tokenWindow)).First().Item1.Iso639_2T)
                                     .ToArray();

                    return Tuple.Create(sourceLanguage, windowLength, identified);
                }));
            });

            // Stage 3: one group per window length.
            var byWindowLength = identifications.GroupBy(result => result.Item2);

            // Stage 4: build and write one confusion matrix per window length.
            byWindowLength.ForEach(windowGroup =>
            {
                var windowLength = windowGroup.Key;

                // Pairs of (index of the sample's language, index of the identified language).
                var pairs = windowGroup
                            .SelectMany(result => result.Item3.Select(
                                            identifiedCode => Tuple.Create(languageIndex[result.Item1], languageIndex[identifiedCode])))
                            .ToArray();

                var sourceIndices     = pairs.Select(pair => pair.Item1).ToArray();
                var identifiedIndices = pairs.Select(pair => pair.Item2).ToArray();
                var matrix            = new GeneralConfusionMatrix(mostCommonLanguagesArray.Count, sourceIndices, identifiedIndices);

                using (var writer = new StreamWriter(Path.Combine(_outputFolder, windowLength + "." + method + ".csv")))
                {
                    PrintMatrix(writer, matrix, mostCommonLanguagesArray.ToArray());
                }
            });
        }