Пример #1
0
        /// <summary>
        /// Reads a <see cref="TagSet"/> section from a text model stream.
        /// </summary>
        /// <remarks>
        /// The section is expected to open and close with the full type name of
        /// <see cref="TagSet"/>. Between the two markers the method reads, in order, the
        /// "VER" option, the "ROOT" option and two <see cref="CodeBook32"/> sections
        /// (stored in <c>PTs</c> and <c>NTs</c>).
        /// </remarks>
        /// <param name="sr">Reader positioned at the start of the section.</param>
        /// <returns>The populated tag set.</returns>
        /// <exception cref="Exception">
        /// The opening or closing marker is not the expected type name, the stored version
        /// differs from <c>VER</c>, or the nest level after the closing marker differs from
        /// the level recorded after the opening marker.
        /// </exception>
        public static TagSet LoadFromStream(TextModelReader sr)
        {
            string expectedName = typeof(TagSet).FullName;

            string header = sr.Read();
            if (header != expectedName)
            {
                throw new Exception("model name does not match");
            }

            // Remember the nest level right after the opening marker so the closing
            // marker can be checked against it.
            int openLevel = sr.NestLevel;

            var version = sr.ReadOptionUInt64("VER");
            if (version != VER)
            {
                throw new Exception("version number does not match");
            }

            // Initializer members are assigned top to bottom, which keeps the reads in
            // stream order: ROOT, then the two code books.
            var tagSet = new TagSet
            {
                ROOT = sr.ReadOptionString("ROOT"),
                PTs = CodeBook32.LoadFromStream(sr),
                NTs = CodeBook32.LoadFromStream(sr)
            };

            string footer = sr.Read();
            bool closedProperly = footer == expectedName && sr.NestLevel == openLevel;
            if (!closedProperly)
            {
                throw new Exception("model name does not match");
            }

            return tagSet;
        }
Пример #2
0
 /// <summary>
 /// Checks that <c>Reset</c> rewinds a <c>TextModelReader</c>: the vector read after the
 /// reset must equal the vector read before it, and the reader must report the
 /// word count and vector size expected for "model.txt".
 /// </summary>
 public void TestTextReset()
 {
     using (var input = File.OpenRead("model.txt"))
     {
         var modelReader = new TextModelReader(input);

         // Read one vector, rewind, then read again from the same reader.
         var beforeReset = modelReader.ReadVector();
         modelReader.Reset();
         var afterReset = modelReader.ReadVector();

         // Header values expected for the fixture file.
         Assert.AreEqual(4501, modelReader.Words);
         Assert.AreEqual(100, modelReader.Size);

         // Both reads must have produced the same entry.
         Assert.AreEqual(beforeReset.Word, afterReset.Word);
         CollectionAssert.AreEqual(beforeReset.Vector, afterReset.Vector);
     }
 }
Пример #3
0
 /// <summary>
 /// Round-trips a model through the text format in memory: loads "model.txt", writes it
 /// with <c>TextModelWriter</c>, reads it back with <c>TextModelReader</c>, and checks
 /// that the word count and vector size survive.
 /// </summary>
 public void TestReLoadingText()
 {
     var original = Model.Load("model.txt");
     Model reloaded;

     using (var buffer = new MemoryStream())
     {
         // The buffer is read again after the writer is disposed; the second constructor
         // argument presumably tells the writer to leave the stream open (confirm).
         using (var modelWriter = new TextModelWriter(buffer, true))
         {
             modelWriter.Write(original);
         }

         // Rewind to the start before reading the serialized model back.
         buffer.Position = 0;

         using (var modelReader = new TextModelReader(buffer))
         {
             reloaded = Model.Load(modelReader);
         }
     }

     Assert.AreEqual(original.Words, reloaded.Words);
     Assert.AreEqual(original.Size, reloaded.Size);
 }
Пример #4
0
        /// <summary>
        /// Round-trips a model through the text format in memory: loads "model.txt", writes
        /// it with <c>TextModelWriter</c>, reads it back with <c>TextModelReader</c>, and
        /// checks that the word count and vector size survive.
        /// </summary>
        public void TestReLoadingText()
        {
            var original = Model.Load("model.txt");
            Model reloaded;

            using (var buffer = new MemoryStream())
            {
                // The buffer is read again after the writer is disposed; the second
                // constructor argument presumably tells the writer to leave the stream
                // open (confirm).
                using (var modelWriter = new TextModelWriter(buffer, true))
                {
                    modelWriter.Write(original);
                }

                // Rewind to the start before reading the serialized model back.
                buffer.Position = 0;

                // The reader is not wrapped in a using statement here, matching the
                // original test; only the underlying stream is disposed.
                var modelReader = new TextModelReader(buffer);
                reloaded = Model.Load(modelReader);
            }

            Assert.AreEqual(original.Words, reloaded.Words);
            Assert.AreEqual(original.Size, reloaded.Size);
        }
Пример #5
0
        /// <summary>
        /// Reads a <see cref="Vocabulary"/> section from a text model stream.
        /// </summary>
        /// <remarks>
        /// The section opens and closes with the full type name of <see cref="Vocabulary"/>.
        /// In between the method reads the "SIG" and "VER" options, the "knownWordCount"
        /// and "sigCount" options, and two <see cref="CodeBook32"/> sections whose sizes
        /// must equal those two counts.
        /// </remarks>
        /// <param name="sr">Reader positioned at the start of the section.</param>
        /// <returns>The populated vocabulary.</returns>
        /// <exception cref="Exception">
        /// The opening or closing marker is wrong, the signature or version differs from
        /// <c>SIG</c>/<c>VER</c>, a code book size differs from its declared count, or the
        /// nest level after the closing marker differs from the level after the opening one.
        /// </exception>
        public static Vocabulary LoadFromStream(TextModelReader sr)
        {
            var result = new Vocabulary();
            string expectedName = typeof(Vocabulary).FullName;

            // The nest level is captured right after the opening marker is consumed,
            // before the marker itself is validated.
            string header = sr.Read();
            int openLevel = sr.NestLevel;
            if (header != expectedName)
            {
                throw new Exception("error in model file!");
            }

            var signature = sr.ReadOptionUInt64("SIG");
            var version = sr.ReadOptionUInt64("VER");
            bool headerMatches = signature == SIG && version == VER;
            if (!headerMatches)
            {
                throw new Exception("Signiture or version does not match!");
            }

            // Declared sizes come first in the stream, then the code books themselves.
            int declaredWordCount = sr.ReadOptionInt("knownWordCount");
            int declaredSigCount = sr.ReadOptionInt("sigCount");
            result.vocab = CodeBook32.LoadFromStream(sr);
            result.signitureVocab = CodeBook32.LoadFromStream(sr);

            bool sizesMatch = result.vocab.Count == declaredWordCount
                && result.signitureVocab.Count == declaredSigCount;
            if (!sizesMatch)
            {
                throw new Exception("vocab size does not match");
            }

            string footer = sr.Read();
            bool closedProperly = sr.NestLevel == openLevel && footer == expectedName;
            if (!closedProperly)
            {
                throw new Exception("model is not closed!");
            }

            return result;
        }
Пример #6
0
        /// <summary>
        /// Parses a test treebank with the loaded grammar in a single pass (no pruning or
        /// projected grammars) and reports bracket precision, recall and F1 against the
        /// gold trees.
        /// </summary>
        /// <remarks>
        /// Every input and output path is hard-coded in the body. Predicted parse lines are
        /// written to <c>outputfile</c> and gold parse lines to <c>reffile</c>, one per tree;
        /// a tree whose parse threw is written as "()" and counted as failed.
        /// </remarks>
        /// <returns>
        /// 2 * P * R / (P + R) computed from the accumulated bracket counts. The counters are
        /// doubles, so the result is NaN (not an exception) when no bracket was matched.
        /// </returns>
        static double EvaluateRawParser()
        {
            string modelfile = //@"/home/nan/Data/PTB/ptb.s1.smoothed.grammar";
                                @"D:\user\nyang\data\treebank\English\pcfg\ptb.s2.smoothed.grammar";

            string testfile = //@"/home/nan/Data/PTB/xbar/wsj.23.flat";
                            @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.23.flat";

            string outputfile = @"C:\Users\v-nayang\Downloads\EVALB\EVALB\wsj.23.flat.out";

            string reffile = @"C:\Users\v-nayang\Downloads\EVALB\EVALB\wsj.23.flat.ref";

            // bklex and bkgrammar are referenced only by the commented-out constructor call below.
            string bklex = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.lexicon";
            string bkgrammar = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.grammar";

            // Number of Parallel.For iterations; also the stride used to split trees between them.
            int nthread = 16;

            Vocabulary vocab;
            TagSet tagSet;
            LAPCFGrammar grammar;

            // The model file holds the vocabulary, the tag set and the grammar, read in that order.
            using (var s = new TextModelReader(modelfile))
            {
                vocab = Vocabulary.LoadFromStream(s);
                tagSet = TagSet.LoadFromStream(s);
                grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
                //grammar = new LAPCFGrammar(tagSet, vocab, bklex, bkgrammar);
            }

            //grammar.Smoothing(0.01, 0.1);

            //grammar.Normalize();

            //grammar.PropMaxUnaryPath();

            grammar.MakeCompaction();

            grammar.MakeSubruleCompaction();

            var treebank = new List<PhrasalTree>();

            LoadTrees(treebank, testfile);

            // Normalize the gold trees: re-tokenize the lexeme of every childless node,
            // then compute the start/end positions.
            foreach (var tree in treebank)
            {
                foreach (var node in tree.TreeNodes)
                {
                    if (node.Children.Count == 0)
                    {
                        node.Lex = SimpleTokenizor.ETokenize(node.Lex);
                    }
                }

                //PennTreeProcessor.RecoverFromXBarBinarize(tree.Root);

                tree.ComputeStartEnd();
            }

            // Keep only trees whose root ends at position 20 or earlier
            // (presumably a sentence-length cutoff - confirm against ComputeStartEnd).
            treebank = treebank.Where(x => x.Root.End <= 20).ToList();

            // Bracket counters: ccount = predicted brackets also found in gold,
            // pcount = predicted brackets, gcount = gold brackets.
            double ccount = 0;
            double pcount = 0;
            double gcount = 0;
            int failed = 0;
            int sentcount = treebank.Count;

            Console.Error.WriteLine("Start to parse...");
            // Only tm.Finish() is called on this timer in this method.
            ConsoleTimer tm = new ConsoleTimer(1);

            // One slot per tree; a slot stays null when parsing that tree threw.
            PhrasalTree[] ptrees = new PhrasalTree[treebank.Count];

            // thrID runs over 0..nthread-1. Each iteration owns its own pools and handles
            // trees thrID, thrID + nthread, thrID + 2 * nthread, ..., so every ptrees slot
            // is written by exactly one iteration.
            Parallel.For(0, nthread, thrID =>
            {
                HyperEdgePool epool = new HyperEdgePool();

                HyperVertexPool vpool = new HyperVertexPool(grammar.subTagCounts.Max());
                for (int treeId = thrID; treeId < treebank.Count; treeId += nthread)
                {
                    var tree = treebank[treeId];
                    var words = tree.GetSentence().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

                    int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();

                    // The first word is looked up again with the flag set to true
                    // (presumably sentence-initial handling - confirm in Vocabulary.GetId).
                    wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);

                    // tags is not used below in this method.
                    string[] tags = tree.GetPOS().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

                    // The rows of tprobs stay null because the loop that filled them is commented out.
                    double[][] tprobs = new double[wids.Length][];

                    //for (int i = 0; i < wids.Length; ++i)
                    //{
                    //    tprobs[i] = rwHanlder.GetLogProbs(SimpleTokenizor.ETokenize(words[i]));
                    //}

                    bool[][] allowedTags = null;
                    //AssignTagConstraints(vocab, tagSet, words, wids);

                    try
                    {
                        //var parser = new ChartParser(wids);
                        var parser = new ChartHyperGraphParser(wids, allowedTags, tprobs);
                        parser.BuildHyperGraph(grammar, epool, vpool, grammar.subTagCounts);
                        parser.SumForward();
                        parser.SumBackward(true);
                        parser.PosteriorViterbi();

                        var ptree = parser.ExtractPosteriorViterbi(words, tagSet);

                        //parser.MaxForward();

                        //var ptree = parser.ExtractViterbi(words, tagSet);

                        ptree.ComputeStartEnd();

                        ptrees[treeId] = ptree;

                    }
                    catch
                    {
                        // Any exception is swallowed here; the null slot left in ptrees is
                        // what marks this tree as failed in the scoring loop below.
                    }
                }
            });

            // Write predictions and references side by side and accumulate bracket counts.
            using (StreamWriter sw = new StreamWriter(outputfile))
            {
                using (StreamWriter swref = new StreamWriter(reffile))
                {
                    for (int treeid = 0; treeid < treebank.Count; ++treeid)
                    {
                        var tree = treebank[treeid];
                        var ptree = ptrees[treeid];

                        // The gold line is written for every tree, including failed ones.
                        swref.WriteLine(tree.GetParseLine());

                        if (ptree == null)
                        {
                            // Failed parse: emit a placeholder and leave the bracket counters untouched.
                            failed += 1;
                            sw.WriteLine("()");
                            continue;
                        }

                        var pbrackets = ptree.GetBracketsIgnorePunc();
                        var gbrackets = tree.GetBracketsIgnorePunc();

                        gcount += gbrackets.Count;
                        pcount += pbrackets.Count;

                        // Matched brackets for this sentence only.
                        double xxc = 0;

                        foreach (var b in pbrackets)
                        {
                            if (gbrackets.Contains(b))
                            {
                                ccount += 1;
                                xxc += 1;
                            }
                        }

                        // Warn when nothing was predicted or fewer than half as many
                        // brackets as gold (integer division on gbrackets.Count).
                        if (pbrackets.Count == 0
                            || (pbrackets.Count < gbrackets.Count / 2))
                        {
                            Console.Error.WriteLine("\nStrange prediction: p={0}, g={1}", pbrackets.Count,
                                gbrackets.Count);
                        }

                        string parseline = ptree.GetParseLine();

                        // Per-sentence scores; they are only consumed by the commented-out
                        // WriteLine below, so they currently have no effect on the output.
                        double snt_p = xxc / pbrackets.Count;
                        double snt_r = xxc / gbrackets.Count;

                        double snt_f1 = 2.0 * snt_p * snt_r / (snt_p + snt_r);

                        sw.WriteLine(parseline);

                        //sw.WriteLine(" [Current]\tP: {0:F2} R: {1:F2} F1: {2:F3}", snt_p * 100.0, snt_r * 100.0, snt_f1 * 100.0);

                    }
                }
            }

            tm.Finish();

            Console.Error.WriteLine("SENT: {0}\tFAIL: {1}", sentcount, failed);

            // Corpus-level scores over all successfully parsed trees.
            double prec = ccount / pcount;
            double recall = ccount / gcount;

            double f1 = 2.0 * prec * recall / (prec + recall);

            Console.Error.WriteLine("P: {0:F3}\tR: {1:F3}\tF1: {2:F3}", prec, recall, f1);

            return f1;
        }
Пример #7
0
        /// <summary>
        /// Evaluates the parser on a test treebank using a multi-pass pipeline: collapsed
        /// grammars built from a tag-map file first, then the projected grammars derived
        /// from the loaded grammar's subtag traces, with pruning between passes.
        /// </summary>
        /// <remarks>
        /// All file paths are hard-coded in the body. Sentences longer than 20 words are
        /// skipped. Bracket precision, recall and F1, the sentence and failure counts, and
        /// the accumulated time of each stage are written to standard error.
        /// </remarks>
        static void EvaluateParser()
        {
            string modelfile = //@"/home/nan/Data/PTB/ptb.s1.smoothed.grammar";
                                @"D:\user\nyang\data\treebank\English\pcfg\ptb.s1.smoothed.grammar";

            string tagmapfile = @"E:\nyang\TFS\Common\Users\v-nayang\LAPCFGParser\en-tag-map.txt";

            string testfile = //@"/home/nan/Data/PTB/xbar/wsj.23.flat";
                            @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.23.flat";

            string trainfile = @"D:\user\nyang\data\treebank\English\pcfg\xbar\wsj.02-21.flat";

            Vocabulary vocab;
            TagSet tagSet;
            LAPCFGrammar grammar;

            var traintrees = new List<PhrasalTree>();

            LoadTrees(traintrees, trainfile);

            // Apart from the Build call below, rwHanlder is referenced only by commented-out code.
            var rwHanlder = new RareWordHandler(traintrees, 10);

            // The model file holds the vocabulary, the tag set and the grammar, read in that order.
            using (var s = new TextModelReader(modelfile))
            {
                vocab = Vocabulary.LoadFromStream(s);
                tagSet = TagSet.LoadFromStream(s);
                grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
            }

            rwHanlder.Build(tagSet, 0.001);

            //grammar.Smoothing(0.1f);

            grammar.MakeCompaction();

            grammar.MakeSubruleCompaction();

            // grammars[last] is the loaded grammar; each earlier entry is obtained by
            // projecting the next one with the matching subtag trace.
            var grammars = new LAPCFGrammar[grammar.subtagTraces.Count + 1];

            // traces[i] is the trace used to project grammars[i] down to grammars[i - 1];
            // traces[0] is never assigned.
            var traces = new int[grammars.Length][][];

            grammars [grammars.Length - 1] = grammar;

            for (int i = grammars.Length - 1; i >= 1; --i)
            {
                traces [i] = grammar.subtagTraces [i - 1];
                grammars [i - 1] = grammars [i].ProjectGrammar(traces [i]);
                grammars [i - 1].MakeCompaction();
                grammars [i - 1].MakeSubruleCompaction();
            }

            string[][] tagTiers;

            // Read the tag-map file: every non-blank line is split on spaces/tabs, then the
            // table is transposed so that tagTiers[column][row] holds one column per tier.
            using (StreamReader sr = new StreamReader(tagmapfile))
            {
                var tt = new List<string[]>();
                while (!sr.EndOfStream)
                {
                    string line = sr.ReadLine();

                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    tt.Add(line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries));
                }

                // The tier count is taken from the first row; this indexes tt[0], so the
                // file must contain at least one non-blank line.
                tagTiers = new string[tt [0].Length][];//tt.ToArray();

                for (int i = 0; i < tagTiers.Length; ++i)
                {
                    tagTiers [i] = new string[tt.Count];
                    for (int j = 0; j < tt.Count; ++j)
                    {
                        tagTiers [i] [j] = tt [j] [i];
                    }
                }
            }

            // One code book per tier, filled with that tier's tag strings in file order.
            var cbs = new CodeBook32[tagTiers.Length];

            for (int i = 0; i < cbs.Length; ++i)
            {
                cbs [i] = new CodeBook32();

                foreach (var t in tagTiers[i])
                {
                    cbs [i].Add(t);
                }
            }

            // One tag map per pair of adjacent tiers.
            int pgcount = cbs.Length - 1;

            int[][] tagMaps = new int[pgcount][];

            // Each map covers PTCount + 1 leading ids, which map to themselves, followed by
            // one entry per tag of tier i + 1.
            for (int i = 0; i < tagMaps.Length; ++i)
            {
                tagMaps [i] = new int[grammars [0].PTCount + 1 + cbs [i + 1].Count];

                for (int j = 0; j < grammars[0].PTCount + 1; ++j)
                {
                    tagMaps [i] [j] = j;
                }
            }

            var lastMap = tagMaps [tagMaps.Length - 1];

            // Last map: look the tag string up through tagSet, find it in the last tier, and
            // map it to the id of the second-to-last tier's tag on the same row, shifted by PTCount + 1.
            for (int j = grammars[0].PTCount + 1; j < lastMap.Length; ++j)
            {
                string tstr = tagSet.GetTagString(j);
                int id = cbs [cbs.Length - 1] [tstr];
                // Uses the code-book id as a row index into the tier; presumably valid
                // because ids follow insertion order with no duplicates - confirm in CodeBook32.
                int pid = cbs [cbs.Length - 2] [tagTiers [tagTiers.Length - 2] [id]];

                lastMap [j] = pid + grammars [0].PTCount + 1;
            }

            // Remaining maps: map each tag of tier i + 1 to the tier i tag on the same row.
            for (int i = 0; i < tagMaps.Length - 1; ++i)
            {
                for (int j = grammars[0].PTCount + 1; j < tagMaps[i].Length; ++j)
                {
                    string tstr = cbs [i + 1] [j - grammars [0].PTCount - 1];

                    int xid = Array.IndexOf(tagTiers [i + 1], tstr);

                    string pstr = tagTiers [i] [xid];

                    int pid = cbs [i] [pstr];

                    // NOTE(review): unlike lastMap above, no PTCount + 1 offset is added
                    // here - confirm this asymmetry is intended.
                    tagMaps [i] [j] = pid;
                }
            }

            // Collapsed grammars, built from the last tag map down to the first, starting
            // from grammars[0].
            var cgrammars = new LAPCFGrammar[tagMaps.Length];

            cgrammars [cgrammars.Length - 1] = grammars [0].CollapseNonTerminals(tagMaps [cgrammars.Length - 1], 1 + cbs [cgrammars.Length - 1].Count);

            for (int i = cgrammars.Length - 1; i >= 1; --i)
            {
                cgrammars [i - 1] = cgrammars [i].CollapseNonTerminals(tagMaps [i - 1], 1 + cbs [i - 1].Count);
            }

            for (int i = 0; i < cgrammars.Length; ++i)
            {
                cgrammars [i].MakeCompaction();
                cgrammars [i].MakeSubruleCompaction();
            }

            var treebank = new List<PhrasalTree>();

            LoadTrees(treebank, testfile);

            // Normalize the gold trees: re-tokenize the lexeme of every childless node,
            // then compute the start/end positions.
            foreach (var tree in treebank)
            {
                foreach (var node in tree.TreeNodes)
                {
                    if (node.Children.Count == 0)
                    {
                        node.Lex = SimpleTokenizor.ETokenize(node.Lex);
                    }
                }

                //PennTreeProcessor.RecoverFromXBarBinarize(tree.Root);

                tree.ComputeStartEnd();
            }

            // Bracket counters: ccount = predicted brackets also found in gold,
            // pcount = predicted brackets, gcount = gold brackets.
            double ccount = 0;
            double pcount = 0;
            double gcount = 0;
            int failed = 0;
            int sentcount = 0;
            // The pools are created once and passed to every build/expand call in the loop below.
            HyperEdgePool epool = new HyperEdgePool(1024 * 1024);

            HyperVertexPool vpool = new HyperVertexPool(grammars [grammars.Length - 1].subTagCounts.Max());

            //EMorph.EnglishMorph.WarmUp();

            Console.Error.WriteLine("Start to parse...");
            ConsoleTimer tm = new ConsoleTimer(1);

            // Stage timers, reported at the end as:
            // g0bwatch "G-1 Build", g0watch "G-1 Pass", bwatch "G0 Build",
            // gwatch[i] "G{i} Pass", vwatch "Viterbi".
            Stopwatch g0bwatch = new Stopwatch();
            Stopwatch g0watch = new Stopwatch();
            Stopwatch bwatch = new Stopwatch();

            Stopwatch[] gwatch = new Stopwatch[grammars.Length];

            for (int i = 0; i < gwatch.Length; ++i)
            {
                gwatch [i] = new Stopwatch();
            }

            Stopwatch vwatch = new Stopwatch();

            foreach (var tree in treebank)
            {
                var words = tree.GetSentence().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

                // Sentences longer than 20 words are skipped and not counted in sentcount.
                if (words.Length > 20)
                {
                    continue;
                }

                sentcount += 1;

                int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();

                // The first word is looked up again with the flag set to true
                // (presumably sentence-initial handling - confirm in Vocabulary.GetId).
                wids [0] = vocab.GetId(SimpleTokenizor.ETokenize(words [0]), true);

                // tags is not used below in this method.
                string[] tags = tree.GetPOS().Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

                // The rows of tprobs stay null because the loop that filled them is commented out.
                double[][] tprobs = new double[wids.Length][];

                //for (int i = 0; i < wids.Length; ++i)
                //{
                //    tprobs[i] = rwHanlder.GetLogProbs(SimpleTokenizor.ETokenize(words[i]));
                //}

                bool[][] allowedTags = null;
                    //AssignTagConstraints(vocab, tagSet, words, wids);

                try
                {
                    //var parser = new ChartParser(wids);
                    var parser = new ChartHyperGraphParser(wids, allowedTags, tprobs);
                    // Stage 1: build the hypergraph with the first collapsed grammar.
                    g0bwatch.Start();
                    parser.BuildHyperGraph(cgrammars [0], epool, vpool, grammars [grammars.Length - 1].subTagCounts);
                    g0bwatch.Stop();
                    // Stage 2: score, prune at -15.0 and purge, then expand through the
                    // remaining collapsed grammars, repeating the same steps after each one.
                    g0watch.Start();
                    parser.SumForward();
                    parser.SumBackward(false);
                    parser.Prune(-15.0);
                    parser.Purge();
                    for (int i = 1; i < cgrammars.Length; ++i)
                    {
                        parser.ExpandHyperGraph(cgrammars [i], tagMaps [i - 1], epool, vpool,
                                                 grammars [grammars.Length - 1].subTagCounts);
                        parser.SumForward();
                        parser.SumBackward(false);
                        parser.Prune(-15.0);
                        parser.Purge();
                    }
                    g0watch.Stop();
            //
                    // Stage 3: expand to grammars[0]. The tag-map index is hard-coded to 2,
                    // which requires tagMaps.Length of at least 3, i.e. at least four
                    // columns in the tag-map file.
                    bwatch.Start();
                    parser.ExpandHyperGraph(grammars [0], tagMaps [2], epool, vpool, grammars [grammars.Length - 1].subTagCounts);
            //					parser.BuildHyperGraph (grammars [0], epool, vpool, grammars [grammars.Length - 1].subTagCounts);
                    bwatch.Stop();

                    // Stage 4: for each grammar except the last, score, prune at -10.0,
                    // purge, then project onto the next grammar.
                    for (int i = 0; i < grammars.Length - 1; ++i)
                    {
                        gwatch [i].Start();
                        parser.SumForward();
                        parser.SumBackward(false);

                        parser.Prune(-10.0);

                        parser.Purge();

                        parser.ProjectGrammar(traces [i + 1], grammars [i + 1]);
                        gwatch [i].Stop();
                    }

                    // Final pass on the last grammar; SumBackward is called with true here
                    // and with false in every earlier pass.
                    gwatch [grammars.Length - 1].Start();
                    parser.SumForward();

                    parser.SumBackward(true);
                    gwatch [grammars.Length - 1].Stop();

                    // Stage 5: decode and extract the predicted tree.
                    vwatch.Start();
                    parser.PosteriorViterbi();

                    var ptree = parser.ExtractPosteriorViterbi(words, tagSet);

                    vwatch.Stop();

                    //PennTreeProcessor.RecoverFromXBarBinarize(ptree.Root);

                    ptree.ComputeStartEnd();

                    var pbrackets = ptree.GetBracketsIgnorePunc();
                    var gbrackets = tree.GetBracketsIgnorePunc();

                    gcount += gbrackets.Count;
                    pcount += pbrackets.Count;

                    foreach (var b in pbrackets)
                    {
                        if (gbrackets.Contains(b))
                        {
                            ccount += 1;
                        }
                    }

                    // Warn when nothing was predicted or fewer than half as many brackets
                    // as gold (integer division on gbrackets.Count).
                    if (pbrackets.Count == 0
                        || (pbrackets.Count < gbrackets.Count / 2))
                    {
                        Console.Error.WriteLine("\nStrange prediction: p={0}, g={1}", pbrackets.Count,
                            gbrackets.Count);
                    }

                    //Console.Error.WriteLine(tree.TextTree);
                } catch
                {
                    // Any exception counts as a failed sentence. Every stopwatch is stopped
                    // because the exception may have been thrown while one was running.
                    g0bwatch.Stop();
                    g0watch.Stop();
                    bwatch.Stop();
                    foreach (var w in gwatch)
                    {
                        w.Stop();
                    }
                    vwatch.Stop();
                    failed += 1;
                    Console.Error.WriteLine("\nFailure!");
                }

                tm.Up();
            }

            tm.Finish();

            Console.Error.WriteLine("SENT: {0}\tFAIL: {1}", sentcount, failed);

            // Corpus-level scores. The counters are doubles, so a zero denominator yields
            // NaN or infinity rather than an exception.
            double prec = ccount / pcount;
            double recall = ccount / gcount;

            double f1 = 2.0 * prec * recall / (prec + recall);

            Console.Error.WriteLine("P: {0:F3}\tR: {1:F3}\tF1: {2:F3}", prec, recall, f1);

            Console.Error.WriteLine("G-1 Build:\t{0:F6} s", g0bwatch.Elapsed.TotalSeconds);

            Console.Error.WriteLine("G-1 Pass:\t{0:F6} s", g0watch.Elapsed.TotalSeconds);

            Console.Error.WriteLine("G0 Build:\t{0:F6} s", bwatch.Elapsed.TotalSeconds);

            for (int i = 0; i < gwatch.Length; ++i)
            {
                Console.Error.WriteLine("G{0} Pass:\t{1:F6} s", i, gwatch [i].Elapsed.TotalSeconds);
            }

            Console.Error.WriteLine("Viterbi:\t{0:F6} s", vwatch.Elapsed.TotalSeconds);
        }
Пример #8
0
        /// <summary>
        /// Interactive parsing loop: loads the model, prepares the collapsed and projected
        /// grammars, then reads one sentence per line from standard input and writes the
        /// resulting tree (or a failure message) to standard error.
        /// </summary>
        /// <remarks>
        /// The model and tag-map paths are hard-coded in the body. Blank lines are ignored.
        /// The loop ends when standard input reaches end of stream; previously a null line
        /// was treated as blank, so the loop spun forever once input was closed.
        /// </remarks>
        static void TestParse()
        {
            string modelfile = //@"/home/nan/Data/PTB/ptb.s2.smoothed.grammar";
            @"D:\user\nyang\data\treebank\English\pcfg\ptb.s6.smoothed.grammar";
            string tagmapfile = @"E:\nyang\TFS\Common\Users\v-nayang\LAPCFGParser\en-tag-map.txt";

            Vocabulary vocab;
            TagSet tagSet;
            LAPCFGrammar grammar;

            // The model file holds the vocabulary, the tag set and the grammar, read in that order.
            using (var s = new TextModelReader(modelfile))
            {
                vocab = Vocabulary.LoadFromStream(s);
                tagSet = TagSet.LoadFromStream(s);
                grammar = LAPCFGrammar.LoadFromStream(s, vocab, tagSet);
            }

            grammar.Smoothing(0.1f);

            grammar.MakeCompaction();

            grammar.MakeSubruleCompaction();

            // grammars[last] is the loaded grammar; each earlier entry is obtained by
            // projecting the next one with the matching subtag trace.
            var grammars = new LAPCFGrammar[grammar.subtagTraces.Count + 1];

            // traces[i] is the trace used to project grammars[i] down to grammars[i - 1];
            // traces[0] is never assigned.
            var traces = new int[grammars.Length][][];

            grammars[grammars.Length - 1] = grammar;

            for (int i = grammars.Length - 1; i >= 1; --i)
            {
                traces[i] = grammar.subtagTraces[i - 1];
                grammars[i - 1] = grammars[i].ProjectGrammar(traces[i]);
                grammars[i - 1].MakeCompaction();
                grammars[i - 1].MakeSubruleCompaction();
            }

            string[][] tagTiers;

            // Read the tag-map file: every non-blank line is split on spaces/tabs, then the
            // table is transposed so that tagTiers[column][row] holds one column per tier.
            using (StreamReader sr = new StreamReader(tagmapfile))
            {
                var tt = new List<string[]>();
                while (!sr.EndOfStream)
                {
                    string line = sr.ReadLine();

                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    tt.Add(line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries));
                }

                // The tier count is taken from the first row; this indexes tt[0], so the
                // file must contain at least one non-blank line.
                tagTiers = new string[tt[0].Length][];//tt.ToArray();

                for (int i = 0; i < tagTiers.Length; ++i)
                {
                    tagTiers[i] = new string[tt.Count];
                    for (int j = 0; j < tt.Count; ++j)
                    {
                        tagTiers[i][j] = tt[j][i];
                    }
                }
            }

            // One code book per tier, filled with that tier's tag strings in file order.
            var cbs = new CodeBook32[tagTiers.Length];

            for (int i = 0; i < cbs.Length; ++i)
            {
                cbs[i] = new CodeBook32();

                foreach (var t in tagTiers[i])
                {
                    cbs[i].Add(t);
                }
            }

            // One tag map per pair of adjacent tiers.
            int pgcount = cbs.Length - 1;

            int[][] tagMaps = new int[pgcount][];

            // Each map covers PTCount + 1 leading ids, which map to themselves, followed by
            // one entry per tag of tier i + 1.
            for (int i = 0; i < tagMaps.Length; ++i)
            {
                tagMaps[i] = new int[grammars[0].PTCount + 1 + cbs[i + 1].Count];

                for (int j = 0; j < grammars[0].PTCount + 1; ++j)
                {
                    tagMaps[i][j] = j;
                }
            }

            var lastMap = tagMaps[tagMaps.Length - 1];

            // Last map: look the tag string up through tagSet, find it in the last tier, and
            // map it to the id of the second-to-last tier's tag on the same row, shifted by PTCount + 1.
            for (int j = grammars[0].PTCount + 1; j < lastMap.Length; ++j)
            {
                string tstr = tagSet.GetTagString(j);
                int id = cbs[cbs.Length - 1][tstr];
                int pid = cbs[cbs.Length - 2][tagTiers[tagTiers.Length - 2][id]];

                lastMap[j] = pid + grammars[0].PTCount + 1;
            }

            // Remaining maps: map each tag of tier i + 1 to the tier i tag on the same row.
            for (int i = 0; i < tagMaps.Length - 1; ++i)
            {
                for (int j = grammars[0].PTCount + 1; j < tagMaps[i].Length; ++j)
                {
                    string tstr = cbs[i + 1][j - grammars[0].PTCount - 1];

                    int xid = Array.IndexOf(tagTiers[i + 1], tstr);

                    string pstr = tagTiers[i][xid];

                    int pid = cbs[i][pstr];

                    // NOTE(review): unlike lastMap above, no PTCount + 1 offset is added
                    // here - confirm this asymmetry is intended.
                    tagMaps[i][j] = pid;
                }
            }

            // Collapsed grammars, built from the last tag map down to the first, starting
            // from grammars[0].
            var cgrammars = new LAPCFGrammar[tagMaps.Length];

            cgrammars[cgrammars.Length - 1] = grammars[0].CollapseNonTerminals(tagMaps[cgrammars.Length - 1], 1 + cbs[cgrammars.Length - 1].Count);

            for (int i = cgrammars.Length - 1; i >= 1; --i)
            {
                cgrammars[i - 1] = cgrammars[i].CollapseNonTerminals(tagMaps[i - 1], 1 + cbs[i - 1].Count);
            }

            for (int i = 0; i < cgrammars.Length; ++i)
            {
                cgrammars[i].MakeCompaction();
                cgrammars[i].MakeSubruleCompaction();
            }

            // The pools are created once and passed to every build/expand call in the loop below.
            HyperEdgePool epool = new HyperEdgePool(1024 * 1024);
            HyperVertexPool vpool = new HyperVertexPool(grammars[grammars.Length - 1].subTagCounts.Max());
            EMorph.EnglishMorph.WarmUp();
            Console.Error.WriteLine("READY");

            while(true)
            {
                string line = Console.ReadLine();

                if (line == null)
                {
                    // Console.ReadLine returns null when no more input is available.
                    // Without this check the blank-line test below would 'continue'
                    // forever on a closed stdin.
                    break;
                }

                if (string.IsNullOrWhiteSpace(line))
                {
                    continue;
                }

                var words = line.Split(new string[] { " ", "\t" }, StringSplitOptions.RemoveEmptyEntries);

                int[] wids = words.Select(w => vocab.GetId(SimpleTokenizor.ETokenize(w), false)).ToArray();

                // The first word is looked up again with the flag set to true
                // (presumably sentence-initial handling - confirm in Vocabulary.GetId).
                wids[0] = vocab.GetId(SimpleTokenizor.ETokenize(words[0]), true);

                // Per-word tag constraints. A row stays null for words that are not
                // rare/unknown or for which the morphological analyser returns nothing.
                bool[][] allowedTags = new bool[wids.Length][];

                for (int i = 0; i < wids.Length; ++i)
                {
                    if (vocab.IsRareOrUNK(wids[i]))
                    {
                        var lemmas = EMorph.EnglishMorph.GetBaseForm(words[i]);

                        if (lemmas == null || lemmas.Count == 0)
                        {
                            continue;
                        }

                        allowedTags[i] = new bool[tagSet.PTCount];

                        // Capitalized rare words may also be proper nouns.
                        if (char.IsUpper(words[i][0]))
                        {
                            allowedTags[i][tagSet.GetID("NNP")] = true;
                            allowedTags[i][tagSet.GetID("NNPS")] = true;
                        }

                        // Enable the tag(s) associated with each morphological reading.
                        // An unrecognized reading throws outside the try block below, so
                        // it propagates out of this method.
                        foreach (var lemma in lemmas)
                        {
                            switch (lemma.PoS)
                            {
                                case EMorph.MorphPoS.NN:
                                    allowedTags[i][tagSet.GetID("NN")] = true;
                                    allowedTags[i][tagSet.GetID("NNS")] = true;
                                    break;
                                case EMorph.MorphPoS.NNS:
                                    allowedTags[i][tagSet.GetID("NNS")] = true;
                                    allowedTags[i][tagSet.GetID("NN")] = true;
                                    break;
                                case EMorph.MorphPoS.JJ:
                                    allowedTags[i][tagSet.GetID("JJ")] = true;
                                    break;
                                case EMorph.MorphPoS.JJR:
                                    allowedTags[i][tagSet.GetID("JJR")] = true;
                                    break;
                                case EMorph.MorphPoS.JJS:
                                    allowedTags[i][tagSet.GetID("JJS")] = true;
                                    break;
                                case EMorph.MorphPoS.RB:
                                    allowedTags[i][tagSet.GetID("RB")] = true;
                                    break;
                                case EMorph.MorphPoS.RBR:
                                    allowedTags[i][tagSet.GetID("RBR")] = true;
                                    break;
                                case EMorph.MorphPoS.RBS:
                                    allowedTags[i][tagSet.GetID("RBS")] = true;
                                    break;
                                case EMorph.MorphPoS.VB:
                                    allowedTags[i][tagSet.GetID("VB")] = true;
                                    allowedTags[i][tagSet.GetID("VBP")] = true;
                                    break;
                                case EMorph.MorphPoS.VBD:
                                    allowedTags[i][tagSet.GetID("VBD")] = true;
                                    allowedTags[i][tagSet.GetID("VBN")] = true;
                                    break;
                                case EMorph.MorphPoS.VBG:
                                    allowedTags[i][tagSet.GetID("VBG")] = true;
                                    break;
                                case EMorph.MorphPoS.VBZ:
                                    allowedTags[i][tagSet.GetID("VBZ")] = true;
                                    break;
                                default:
                                    throw new Exception("not recognized morph lemma!");
                            }
                        }
                    }
                }

                try
                {
                    // Build with the first collapsed grammar, then score, prune at -10.0
                    // and purge; repeat after expanding through each further collapsed grammar.
                    var parser = new ChartHyperGraphParser(wids, allowedTags);
                    parser.BuildHyperGraph(cgrammars[0], epool, vpool, grammars[grammars.Length - 1].subTagCounts);
                    parser.SumForward();
                    parser.SumBackward(false);
                    parser.Prune(-10.0);
                    parser.Purge();
                    for (int i = 1; i < cgrammars.Length; ++i)
                    {
                        parser.ExpandHyperGraph(cgrammars[i], tagMaps[i - 1], epool, vpool,
                                                 grammars[grammars.Length - 1].subTagCounts);
                        parser.SumForward();
                        parser.SumBackward(false);
                        parser.Prune(-10.0);
                        parser.Purge();
                    }

                    // Expand to grammars[0]. The tag-map index is hard-coded to 2, which
                    // requires at least four columns in the tag-map file.
                    parser.ExpandHyperGraph(grammars[0], tagMaps[2], epool, vpool, grammars[grammars.Length - 1].subTagCounts);

                    // For each grammar except the last: score, prune at -8.0, purge, then
                    // project onto the next grammar.
                    for (int i = 0; i < grammars.Length - 1; ++i)
                    {
                        parser.SumForward();
                        parser.SumBackward(false);
                        parser.Prune(-8.0);
                        parser.Purge();
                        parser.ProjectGrammar(traces[i + 1], grammars[i + 1]);
                    }

                    // Final pass on the last grammar, then decode.
                    parser.SumForward();
                    parser.SumBackward(true);
                    parser.PosteriorViterbi();

                    var ptree = parser.ExtractPosteriorViterbi(words, tagSet);

                    PennTreeProcessor.RecoverFromXBarBinarize(ptree.Root);

                    ptree.ComputeStartEnd();

                    string treeline = ptree.TextTree;

                    // Print the tree line by line, dropping empty lines.
                    string[] xlines = treeline.Split(new string[] { "\n", "\r", "\r\n" }, StringSplitOptions.RemoveEmptyEntries);

                    foreach (var xline in xlines)
                    {
                        Console.Error.WriteLine(xline);
                    }
                }
                catch
                {
                    // Any parse error is reported and the loop moves on to the next line.
                    Console.Error.WriteLine("Failure to parse!");
                }
            }
        }
Пример #9
0
        /// <summary>
        /// Builds an initial grammar from the training treebank, splits its symbols once,
        /// re-estimates the split grammar for up to <c>afterSplitRound</c> rounds while
        /// tracking the held-out likelihood on the dev trees, saves the best grammar and
        /// finally evaluates it.
        /// </summary>
        /// <param name="randSeed">Seed of the random generator handed to grammar building and symbol splitting.</param>
        /// <returns>Whatever <c>EvaluateRawParser()</c> returns.</returns>
        static double RunTrainTest(int randSeed)
        {
            // Machine-specific data locations (the original Linux alternative was /home/nan/Data/PTB/...).
            string inputdir = @"D:\user\nyang\data\treebank\English\pcfg\xbar";
            string vocabfn = @"D:\user\nyang\data\treebank\English\pcfg\nw.wsj.vocab";
            string tagfn = @"D:\user\nyang\data\treebank\English\pcfg\nw.wsj.tagset";
            string modeldir = @"D:\user\nyang\data\treebank\English\pcfg\";

            string trainfile = "wsj.02-21.flat";
            string devfile = "wsj.22.flat";

            double pruningThr = -30.0 / Math.Log10(Math.E);
            // Kept as a float literal on purpose: changing it to 0.01 would alter the value slightly.
            double smoothingAlpha = 0.01f;

            int afterSplitRound = 50;
            int nthread = 16;

            Vocabulary vocab;
            TagSet tagSet;

            using (var s = new TextModelReader(vocabfn))
            {
                vocab = Vocabulary.LoadFromStream(s);
            }

            using (var s = new TextModelReader(tagfn))
            {
                tagSet = TagSet.LoadFromStream(s);
            }

            var treebank = new List<PhrasalTree>();
            LoadTrees(treebank, Path.Combine(inputdir, trainfile));

            var devtrees = new List<PhrasalTree>();
            LoadTrees(devtrees, Path.Combine(inputdir, devfile));

            // Run the tokenizer over the lexical item of every leaf node, training set first.
            foreach (var tree in treebank)
            {
                foreach (var node in tree.TreeNodes)
                {
                    if (node.Children.Count == 0)
                    {
                        node.Lex = SimpleTokenizor.ETokenize(node.Lex);
                    }
                }
            }

            foreach (var tree in devtrees)
            {
                foreach (var node in tree.TreeNodes)
                {
                    if (node.Children.Count == 0)
                    {
                        node.Lex = SimpleTokenizor.ETokenize(node.Lex);
                    }
                }
            }

            Random RNG = new Random(randSeed);

            // Initial (unsplit) grammar.
            LAPCFGrammar rules;

            GrammarBuilder.Build(vocab, tagSet, treebank, out rules, RNG);
            rules.Normalize();
            rules.Prune(pruningThr);
            rules.Trim();
            rules.InitializeExpectedCounts();

            using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s0.grammar")), true))
            {
                vocab.DumpToStream(sw);
                tagSet.DumpToStream(sw);
                rules.DumpToStream(sw, tagSet, vocab);
            }

            // Single split of the grammar symbols.
            rules = rules.SplitSymbols(RNG, 1.0f);
            rules.Normalize();
            rules.Prune(pruningThr);
            rules.Trim();
            rules.InitializeExpectedCounts();

            int failed;

            // The returned likelihoods of this first pass are not inspected below; the calls
            // are kept because the first one collects counts on `rules` for the next step.
            double llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);

            var smoothRules = rules.Clone();
            smoothRules.Smoothing(smoothingAlpha);
            smoothRules.Normalize();

            double devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);

            double devbest = 0;
            int bestround = 0;
            var bestRules = rules;

            for (int round = 0; round < afterSplitRound; ++round)
            {
                GrammarBuilder.CalculateNewScores(rules);
                rules.Normalize();
                rules.Prune(pruningThr);
                rules.Trim();
                rules.InitializeExpectedCounts();

                llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);

                // Held-out likelihood is measured on a smoothed copy; `rules` itself stays unsmoothed.
                smoothRules = rules.Clone();
                smoothRules.Smoothing(smoothingAlpha);
                smoothRules.Normalize();

                devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);

                //Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
                Console.Error.Write(".");

                if (round == 0 || devllhd > devbest)
                {
                    devbest = devllhd;
                    bestround = round;
                    bestRules = rules.Clone();
                }
                else if (round - bestround > 3 && round >= 50)
                {
                    // NOTE(review): with afterSplitRound == 50 the loop ends before round reaches 50,
                    // so this early stop never fires. Left unchanged to keep training results identical.
                    break;
                }
            }
            Console.Error.WriteLine();

            rules = bestRules.Clone();

            using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s1.grammar")), true))
            {
                vocab.DumpToStream(sw);
                tagSet.DumpToStream(sw);
                rules.DumpToStream(sw, tagSet, vocab);
            }

            return EvaluateRawParser();
        }
Пример #10
0
        /// <summary>
        /// Full training driver: builds an initial grammar from the training treebank and then
        /// runs six cycles (smCycle = 1..6), each consisting of a symbol split, a symbol merge
        /// and a smoothing phase. Every phase re-estimates the grammar for a bounded number of
        /// rounds, keeps the grammar with the best held-out likelihood on the dev trees, and
        /// writes it to a "ptb.s{cycle}[.merged|.smoothed].grammar" file under <c>modeldir</c>.
        /// </summary>
        /// <remarks>
        /// All paths are hard-coded and machine specific. Several locals (rawdir, bklex,
        /// bkgrammar, testfile, urulefile) are only referenced by the commented-out
        /// preprocessing / alternative-initialisation lines below.
        /// </remarks>
        static void Run()
        {
            string rawdir = //@"/home/nan/Data/PTB/flat/";
               @"D:\user\nyang\data\treebank\English\pcfg\flat";

            string inputdir = //@"/home/nan/Data/PTB/xbar/";
            @"D:\user\nyang\data\treebank\English\pcfg\xbar";

            string bklex = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.lexicon";
            string bkgrammar = @"D:\user\nyang\tools\bkparser\wsj.la.grammar.txt.grammar";

            string trainfile = "wsj.02-21.flat";
            string devfile = "wsj.22.flat";
            string testfile = "wsj.23.flat";
            string vocabfn = //@"/home/nan/Data/PTB/nw.wsj.vocab";
            @"D:\user\nyang\data\treebank\English\pcfg\nw.wsj.vocab";
            string tagfn = //@"/home/nan/Data/PTB/nw.wsj.tagset";
            @"D:\user\nyang\data\treebank\English\pcfg\nw.wsj.tagset";
            string modeldir = //@"/home/nan/Data/PTB/";
            @"D:\user\nyang\data\treebank\English\pcfg\";
            string urulefile = "wsj.urule.count";

            double pruningThr = -30.0 / Math.Log10(Math.E);
            double smoothingAlpha = 0.01f;

            // Upper bounds on the number of re-estimation rounds in each phase of a cycle.
            int afterSplitRound = 50;
            int afterMergeRound = 20;
            int afterSmoothRound = 20;

            int nthread = 16;

            // One-off data preparation steps, kept disabled; enable them to regenerate the inputs.
            //Preprocess(Path.Combine(rawdir, trainfile), Path.Combine(inputdir, trainfile), Path.Combine(modeldir, urulefile));

            //Preprocess(Path.Combine(rawdir, devfile), Path.Combine(inputdir, devfile), null);

            //Preprocess(Path.Combine(rawdir, testfile), Path.Combine(inputdir, testfile), null);

            //BuildVocab(Path.Combine(inputdir, trainfile), vocabfn, 20);

            //BuildTagset(Path.Combine(inputdir, trainfile), tagfn);

            //return;

            Vocabulary vocab;
            TagSet tagSet;

            using (var s = new TextModelReader(vocabfn))
            {
                vocab = Vocabulary.LoadFromStream(s);
            }

            using (var s = new TextModelReader(tagfn))
            {
                tagSet = TagSet.LoadFromStream(s);
            }

            var treebank = new List<PhrasalTree>();

            LoadTrees(treebank, Path.Combine(inputdir, trainfile));

            var devtrees = new List<PhrasalTree>();

            LoadTrees(devtrees, Path.Combine(inputdir, devfile));

            // Run the tokenizer over the lexical item of every leaf node (training trees, then dev trees).
            foreach (var tree in treebank)
            {
                foreach (var node in tree.TreeNodes)
                {
                    if (node.Children.Count == 0)
                    {
                        node.Lex = SimpleTokenizor.ETokenize(node.Lex);
                    }
                }
            }

            foreach (var tree in devtrees)
            {
                foreach (var node in tree.TreeNodes)
                {
                    if (node.Children.Count == 0)
                    {
                        node.Lex = SimpleTokenizor.ETokenize(node.Lex);
                    }
                }
            }

            // Fixed seed: the same random sequence is used on every run.
            Random RNG = new Random(0);

            LAPCFGrammar rules;

            // Initial (unsplit) grammar built from the training treebank.
            GrammarBuilder.Build(vocab, tagSet, treebank, out rules, RNG);

            rules.Normalize();
            rules.Prune(pruningThr);
            rules.Trim();
            rules.InitializeExpectedCounts();

            GC.Collect();
            GC.WaitForPendingFinalizers();

            Console.Error.WriteLine("save model");
            using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s0.grammar")), true))
            {
                vocab.DumpToStream(sw);
                tagSet.DumpToStream(sw);
                rules.DumpToStream(sw, tagSet, vocab);
            }
            //EvaluateRawParser();

            //return;

            // Six split / merge / smooth cycles.
            for (int smCycle = 1; smCycle <= 6; ++smCycle)
            {
                Console.Error.WriteLine("start cycle {0}", smCycle);
                Console.Error.WriteLine("split grammar...");

                // ---- Phase 1: split the symbols of the current grammar ----
                //rules = new LAPCFGrammar(tagSet, vocab, bklex, bkgrammar);
                //rules.InitializeExpectedCounts();
                rules = rules.SplitSymbols(RNG, 1.0f);
                rules.Normalize();
                rules.Prune(pruningThr);
                rules.Trim();
                rules.InitializeExpectedCounts();

                GC.Collect();
                GC.WaitForPendingFinalizers();

                Console.Error.WriteLine("done");

                GC.Collect();
                GC.WaitForPendingFinalizers();

                int failed = 0;

                var smoothRules = rules;

                double llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);

                // Held-out likelihood is always measured on a smoothed, normalized copy;
                // `rules` itself is left unsmoothed during the split and merge phases.
                smoothRules = rules.Clone();

                smoothRules.Smoothing(smoothingAlpha);
                smoothRules.Normalize();

                double devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);

                // `failed` here is the value reported by the dev-set call just above.
                Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);

                // Best-so-far bookkeeping; it is reset at round 0 of every phase below.
                double devbest = 0;
                int bestround = 0;
                var bestRules = rules;

                for (int round = 0; round < afterSplitRound; ++round)
                {
                    GrammarBuilder.CalculateNewScores(rules);
                    rules.Normalize();
                    rules.Prune(pruningThr);
                    rules.Trim();
                    rules.InitializeExpectedCounts();

                    GC.Collect();
                    GC.WaitForPendingFinalizers();

                    //rules.Normalize();
                    //rules.Prune(pruningThr);
                    //rules.Normalize();
                    //GrammarBuilder.CheckProbs(rules);

                    llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);

                    smoothRules = rules.Clone();

                    smoothRules.Smoothing(smoothingAlpha);
                    smoothRules.Normalize();

                    devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);

                    Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);

                    if (round == 0 || devllhd > devbest)
                    {
                        devbest = devllhd;
                        bestround = round;
                        bestRules = rules.Clone();
                    }
                    else
                    {
                        // NOTE(review): with afterSplitRound == 50 the loop ends before round
                        // reaches 50, so this early stop can never trigger in the split phase.
                        if (round - bestround > 3 && round >= 50)
                        {
                            break;
                        }
                    }
                }

                // Save a pruned copy of the best split grammar.
                rules = bestRules.Clone();
                //rules.Normalize();
                rules.Prune(pruningThr);
                //rules.Normalize();

                Console.Error.WriteLine("save model");
                using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s" + smCycle.ToString() + ".grammar")), true))
                {
                    vocab.DumpToStream(sw);
                    tagSet.DumpToStream(sw);
                    rules.DumpToStream(sw, tagSet, vocab);
                }

                //EvaluateRawParser();
                //return;

                // Continue from the best grammar itself, not from the pruned copy that was just saved.
                rules = bestRules;

                // ---- Phase 2: merge symbols ----
                Console.Error.WriteLine("merging symbols...");
                rules = GrammarBuilder.MergeSymbols(0.5f, vocab, tagSet, rules, treebank, nthread);

                llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);

                GrammarBuilder.CalculateNewScores(rules, true);
                rules.Normalize();
                rules.Prune(pruningThr);
                rules.Trim();
                rules.InitializeExpectedCounts();

                GC.Collect();
                GC.WaitForPendingFinalizers();

                llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);

                smoothRules = rules.Clone();

                smoothRules.Smoothing(smoothingAlpha);
                smoothRules.Normalize();

                devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);

                Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);

                for (int round = 0; round < afterMergeRound; ++round)
                {
                    GrammarBuilder.CalculateNewScores(rules, false);
                    rules.Normalize();
                    rules.Prune(pruningThr);
                    rules.Trim();
                    rules.InitializeExpectedCounts();

                    llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);

                    smoothRules = rules.Clone();

                    smoothRules.Smoothing(smoothingAlpha);
                    smoothRules.Normalize();

                    devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);

                    Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
                    // Round 0 always overwrites the best-so-far values left over from the split phase.
                    if (round == 0 || devllhd > devbest)
                    {
                        devbest = devllhd;
                        bestround = round;
                        bestRules = rules.Clone();
                    }
                    else
                    {
                        // Early stop: at least 10 rounds done and no improvement for more than 3 rounds.
                        if (round - bestround > 3 && round >= 10)
                        {
                            break;
                        }
                    }
                }

                // Save a pruned copy of the best merged grammar.
                rules = bestRules.Clone();
                //rules.Normalize();
                rules.Prune(pruningThr);
                //rules.Normalize();

                Console.Error.WriteLine("save model");
                using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s" + smCycle.ToString() + ".merged.grammar")), true))
                {
                    vocab.DumpToStream(sw);
                    tagSet.DumpToStream(sw);
                    rules.DumpToStream(sw, tagSet, vocab);
                }
                // ---- Phase 3: smoothing ----
                Console.Error.WriteLine("smoothing model...");

                // Restart from a fresh clone of the best merged grammar (the pruned copy is discarded).
                rules = bestRules.Clone();

                rules.InitializeExpectedCounts();

                rules.Smoothing(smoothingAlpha);

                llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);

                smoothRules = rules.Clone();

                smoothRules.Smoothing(smoothingAlpha);
                smoothRules.Normalize();

                devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);

                Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);

                for (int round = 0; round < afterSmoothRound; ++round)
                {
                    GrammarBuilder.CalculateNewScores(rules, false);
                    rules.Normalize();
                    // In this phase the grammar itself is smoothed every round (two-argument overload).
                    rules.Smoothing(0.01, 0.1);
                    rules.Prune(pruningThr);
                    //rules.Normalize();
                    rules.Trim();
                    rules.InitializeExpectedCounts();

                    llhd = ParseGraphAndCollect(nthread, treebank, rules, vocab, tagSet, out failed);

                    // Unlike the earlier phases, the dev copy gets no extra smoothing or normalization here.
                    smoothRules = rules.Clone();

                    devllhd = ParseGraphs(nthread, devtrees, smoothRules, vocab, tagSet, out failed);

                    if (round == 0 || devllhd > devbest)
                    {
                        devbest = devllhd;
                        bestround = round;
                        bestRules = rules.Clone();
                    }
                    else
                    {
                        // Same early-stop rule as in the merge phase.
                        if (round - bestround > 3 && round >= 10)
                        {
                            break;
                        }
                    }

                    // Note: when the early stop fires, the log line for that round is skipped.
                    Console.Error.WriteLine("llhd: {0:F3}\theldout llhd: {1:F3} ({2} failed)", llhd / treebank.Count, devllhd / (devtrees.Count - failed), failed);
                }

                // Save a pruned copy of the best smoothed grammar.
                rules = bestRules.Clone();
                rules.Prune(pruningThr);

                Console.Error.WriteLine("save model");
                using (var sw = new TextModelWriter(new StreamWriter(Path.Combine(modeldir, "ptb.s" + smCycle.ToString() + ".smoothed.grammar")), true))
                {
                    vocab.DumpToStream(sw);
                    tagSet.DumpToStream(sw);
                    rules.DumpToStream(sw, tagSet, vocab);
                }

                // The next cycle starts from the best smoothed grammar (not the pruned copy).
                rules = bestRules;
            }
            //EvaluateRawParser();
            return;
        }
Пример #11
0
        /// <summary>
        /// Console entry point: parses the configured input files into a text model and a
        /// concordance, writes both as JSON, then prints a few demo queries and edits.
        /// </summary>
        /// <param name="args">
        /// Input file names. When empty, the file list is read from "../../../config.json".
        /// </param>
        static void Main(string[] args)
        {
            try
            {
                IText text = new Text();
                IConcordance concordance = new Concordance();
                var concordanceParser = new ConcordanceParser();
                var configuration = new GlobalConfiguration();

                if (args.Length != 0)
                {
                    configuration.FileNames = args;
                }
                else
                {
                    using var reader = new StreamReader("../../../config.json");
                    var json = reader.ReadToEnd();
                    configuration = JsonConvert.DeserializeObject<GlobalConfiguration>(json);
                }

                // Deserialization can yield null (e.g. an empty or "null" document) or leave
                // FileNames unset; fail with a clear message instead of a NullReferenceException.
                if (configuration?.FileNames == null)
                {
                    throw new InvalidOperationException("No input files are configured.");
                }

                // NOTE(review): each iteration overwrites `text`, so only the result for the
                // last file is kept; the same applies to `concordance` below.
                foreach (var fileName in configuration.FileNames)
                {
                    // Open read-only: FileStream(path, FileMode.Open) would request read/write
                    // access and fail on read-only files.
                    using var stream = new StreamReader(new FileStream(fileName, FileMode.Open, FileAccess.Read));
                    var textParser = new TextParser();
                    var textReader = new TextModelReader(stream);
                    text = textParser.ParseText(textReader.ReadAllText());
                }

                foreach (var fileName in configuration.FileNames)
                {
                    using var stream = new StreamReader(new FileStream(fileName, FileMode.Open, FileAccess.Read));
                    var textReader = new TextModelReader(stream);
                    concordance = concordanceParser.ParseText(textReader.ReadAllText());
                }

                var jsonText = JsonTextSerializer.Serialize(text);
                var jsonConc = JsonTextSerializer.Serialize(concordance);

                using (var writer = new StreamWriter("../../../text.json"))
                {
                    var textModelWriter = new TextModelWriter(writer);
                    textModelWriter.Write(jsonText);
                }

                using (var writer = new StreamWriter("../../../concordance.json"))
                {
                    var textModelWriter = new TextModelWriter(writer);
                    textModelWriter.Write(jsonConc);
                }

                Console.WriteLine();
                Console.WriteLine("----Select words from question sentences with length 10------------------------");
                Console.WriteLine();
                foreach (var word in text.GetWordsFromQuestionSentences(10))
                {
                    Console.WriteLine(word);
                }

                Console.WriteLine();
                Console.WriteLine("----Order sentences by words count-------------------------");
                Console.WriteLine();
                foreach (var sentence in text.OrderSentencesByWordsCount())
                {
                    Console.Write(sentence);
                    Console.Write(" --- ");
                    Console.Write($"{sentence.WordsCount} words");
                    Console.WriteLine();
                }

                Console.WriteLine();
                Console.WriteLine("-----Deleting words with length 10--------------");
                Console.WriteLine();
                text.DeleteWords(10);
                foreach (var sentence in text.Sentences)
                {
                    Console.WriteLine(sentence);
                }

                Console.WriteLine();
                Console.WriteLine("-----Replacing words: \"In\" replace by \"In word replaced\"----------------");
                Console.WriteLine();
                text.ReplaceWord("In", "In word replaced");
                foreach (var sentence in text.Sentences)
                {
                    Console.WriteLine(sentence);
                }

                Console.WriteLine("------------------------------------");
            }
            catch (Exception e)
            {
                // Best-effort console tool: report the failure and exit normally.
                Console.WriteLine(e.Message);
            }
        }
Пример #12
0
 // Reads one vector from "model.txt", calls Reset() on the reader, reads again and
 // checks that both reads yield the same word and vector, and that the reader
 // reports 4501 words of size 100.
 public void TestTextReset()
 {
     using (var modelStream = File.OpenRead("model.txt"))
     {
         var modelReader = new TextModelReader(modelStream);

         var beforeReset = modelReader.ReadVector();
         modelReader.Reset();
         var afterReset = modelReader.ReadVector();

         // Header values reported by the reader.
         Assert.AreEqual(4501, modelReader.Words);
         Assert.AreEqual(100, modelReader.Size);

         // The read after Reset() must match the first read.
         Assert.AreEqual(beforeReset.Word, afterReset.Word);
         CollectionAssert.AreEqual(beforeReset.Vector, afterReset.Vector);
     }
 }
Пример #13
0
        /// <summary>
        /// Reads a grammar from a text model stream. The expected layout is: the type name,
        /// a "VER" entry, the NTCount / PTCount / ROOTID options, then the "TerminalRule",
        /// "UnaryRule" and "BinaryRule" sections (one rule string per nested line), a
        /// "TraceCount&lt;TAB&gt;n" line followed by n "TRACE" blocks, and the closing type name.
        /// </summary>
        /// <param name="sr">Reader positioned at the start of the grammar block.</param>
        /// <param name="vocab">Vocabulary used to build terminal rules.</param>
        /// <param name="tagSet">Tag set used to build all rules.</param>
        /// <returns>The loaded grammar.</returns>
        /// <exception cref="Exception">
        /// A section header is not the expected one ("wrong model!") or the TraceCount line
        /// is missing or malformed ("error in model").
        /// </exception>
        public static LAPCFGrammar LoadFromStream(TextModelReader sr, Vocabulary vocab, TagSet tagSet)
        {
            var grammar = new LAPCFGrammar();
            var name = typeof(LAPCFGrammar).FullName;

            sr.Require(name);
            sr.Require("VER", VER);

            grammar.NTCount = sr.ReadOptionInt("NTCount");
            grammar.PTCount = sr.ReadOptionInt("PTCount");
            grammar.ROOTID = sr.ReadOptionInt("ROOTID");

            sr.Require("TerminalRule");

            int lvl = sr.NestLevel;
            var truleStrings = new HashSet<string>();
            var uruleStrings = new HashSet<string>();
            var bruleStrings = new HashSet<string>();

            // Each section ends with the header line of the following section.
            string line = ReadNestedSection(sr, lvl, truleStrings);

            if (line != "UnaryRule")
            {
                throw new Exception("wrong model!");
            }

            line = ReadNestedSection(sr, lvl, uruleStrings);

            if (line != "BinaryRule")
            {
                throw new Exception("wrong model!");
            }

            line = ReadNestedSection(sr, lvl, bruleStrings);

            // The line that closes the binary-rule section must be "TraceCount<TAB>n".
            // Check for null and for the second field explicitly so a truncated model is
            // reported as a model error rather than a NullReference/IndexOutOfRange failure.
            string[] parts = line == null ? new string[0] : line.Split('\t');

            if (parts.Length < 2 || parts[0] != "TraceCount")
            {
                throw new Exception("error in model");
            }

            // Model files are machine-written: parse independently of the current culture.
            int subtraceCount = int.Parse(parts[1], System.Globalization.CultureInfo.InvariantCulture);

            grammar.subtagTraces = new List<int[][]>();

            for (int i = 0; i < subtraceCount; ++i)
            {
                int tlen = sr.ReadOptionInt("TRACE");
                int[][] trace = new int[tlen][];

                for (int j = 0; j < tlen; ++j)
                {
                    trace[j] = sr.ReadIntArray();
                }

                grammar.subtagTraces.Add(trace);
            }

            if (grammar.subtagTraces.Count == 0)
            {
                // No traces stored: every tag gets a sub-tag count of 1.
                grammar.subTagCounts = new int[grammar.TotalTagCount];
                ArrayHelper.Fill(grammar.subTagCounts, 1);
            }
            else
            {
                // Sub-tag counts are the entry lengths of the last trace.
                var trace = grammar.subtagTraces[grammar.subtagTraces.Count - 1];
                grammar.subTagCounts = trace.Select(x => x.Length).ToArray();
            }

            sr.Require(name);

            // Rules are built only after subTagCounts is set; unary, terminal, binary order
            // is preserved from the original implementation.
            foreach (var str in uruleStrings)
            {
                grammar.BuildUnaryRule(str, tagSet);
            }

            foreach (var str in truleStrings)
            {
                grammar.BuildTerminalRule(str, vocab, tagSet);
            }

            foreach (var str in bruleStrings)
            {
                grammar.BuildBinaryRule(str, tagSet);
            }

            return grammar;
        }

        // Collects lines into `sink` while the reader stays nested deeper than `level`;
        // returns the first line read once the nesting is back at `level` or above.
        private static string ReadNestedSection(TextModelReader sr, int level, HashSet<string> sink)
        {
            string line = sr.Read();

            while (sr.NestLevel > level)
            {
                sink.Add(line);
                line = sr.Read();
            }

            return line;
        }