Esempio n. 1
0
        /// <summary>
        /// Reads blank-line-separated entries from <paramref name="fnEntries"/>, hands every entry that
        /// parses to the per-entry <c>getMarkupRu</c> overload together with four string-to-int
        /// dictionaries, and finally writes each dictionary to its own file via <c>writeMarkupCounts</c>.
        /// </summary>
        /// <param name="fnEntries">Path of the input file holding the entries.</param>
        /// <param name="fnBaseOut">
        /// Prefix of the four output paths; the suffixes <c>-curly-poss.txt</c>, <c>-curly-meanings.txt</c>,
        /// <c>-curly-en.txt</c> and <c>-curly-de.txt</c> are appended to it.
        /// </param>
        static void getMarkupRu(string fnEntries, string fnBaseOut)
        {
            // What exactly gets counted is decided by the per-entry overload, which is not shown here.
            Dictionary<string, int> posCounts     = new Dictionary<string, int>();
            Dictionary<string, int> meaningCounts = new Dictionary<string, int>();
            Dictionary<string, int> enCounts      = new Dictionary<string, int>();
            Dictionary<string, int> deCounts      = new Dictionary<string, int>();
            List<string> pending = new List<string>();

            using (StreamReader reader = new StreamReader(fnEntries))
            {
                for (string current = reader.ReadLine(); current != null; current = reader.ReadLine())
                {
                    // Only a blank line that follows buffered content terminates an entry;
                    // a blank line seen while the buffer is empty is buffered like any other line.
                    bool terminatesEntry = current.Length == 0 && pending.Count > 0;
                    if (!terminatesEntry)
                    {
                        pending.Add(current);
                        continue;
                    }

                    var entry = WiktEntry.FromLinesRu(pending);
                    if (entry != null)
                    {
                        getMarkupRu(entry, posCounts, meaningCounts, enCounts, deCounts);
                    }
                    pending.Clear();
                }
                // Lines after the last blank line are left in the buffer and never parsed.
            }

            writeMarkupCounts(posCounts, fnBaseOut + "-curly-poss.txt");
            writeMarkupCounts(meaningCounts, fnBaseOut + "-curly-meanings.txt");
            writeMarkupCounts(enCounts, fnBaseOut + "-curly-en.txt");
            writeMarkupCounts(deCounts, fnBaseOut + "-curly-de.txt");
        }
Esempio n. 2
0
        /// <summary>
        /// Copies the blank-line-separated entries of <paramref name="fnIn"/> to <paramref name="fnOut"/>,
        /// setting <c>Lemmatized</c> on every entry whose <c>Lemma</c> occurs as a line of
        /// <paramref name="fnPlain"/>.
        /// </summary>
        /// <remarks>
        /// <paramref name="fnPlain"/> and <paramref name="fnLems"/> are read in lockstep: line N of
        /// <paramref name="fnLems"/> supplies the lemmatized form for line N of <paramref name="fnPlain"/>.
        /// If a plain line occurs more than once, the last occurrence wins.
        /// </remarks>
        /// <param name="fnIn">Path of the input entries file.</param>
        /// <param name="fnPlain">Path of the file with one plain lemma per line.</param>
        /// <param name="fnLems">Path of the file with the tagged counterpart of each plain line.</param>
        /// <param name="fnOut">Path of the output entries file; written with "\n" line endings.</param>
        /// <exception cref="InvalidDataException">
        /// <paramref name="fnLems"/> has fewer lines than <paramref name="fnPlain"/>.
        /// </exception>
        static void mergeLemsRu(string fnIn, string fnPlain, string fnLems, string fnOut)
        {
            Dictionary<string, string> plainToLem = new Dictionary<string, string>();

            using (StreamReader srPlain = new StreamReader(fnPlain))
            using (StreamReader srLems = new StreamReader(fnLems))
            {
                string plain;
                while ((plain = srPlain.ReadLine()) != null)
                {
                    string tagged = srLems.ReadLine();
                    if (tagged == null)
                    {
                        // The two files are paired line by line; a short tagged file means they are
                        // misaligned, so fail with a clear message instead of a NullReferenceException.
                        throw new InvalidDataException(
                            "'" + fnLems + "' has fewer lines than '" + fnPlain + "'.");
                    }
                    plainToLem[plain] = lemmaFromTaggedLine(tagged);
                }
            }

            string line;
            List<string> entryLines = new List<string>();

            using (var sr = new StreamReader(fnIn))
            using (var swOut = new StreamWriter(fnOut))
            {
                swOut.NewLine = "\n";
                while ((line = sr.ReadLine()) != null)
                {
                    // An entry ends at a blank line that follows buffered content. Lines after the
                    // last blank line are never flushed, same as in the sibling readers of this format.
                    if (line == "" && entryLines.Count > 0)
                    {
                        var we = WiktEntry.FromLinesRu(entryLines);
                        // The sibling readers treat a null result as "skip this entry"; do the same here.
                        if (we != null)
                        {
                            string lemmatized;
                            if (plainToLem.TryGetValue(we.Lemma, out lemmatized))
                            {
                                we.Lemmatized = lemmatized;
                            }
                            we.WriteRu(swOut);
                        }
                        entryLines.Clear();
                        continue;
                    }
                    entryLines.Add(line);
                }
            }
        }

        /// <summary>
        /// Builds the space-joined lemmatized form of one tagged line. Tokens are separated by single
        /// spaces; a token without '{' is kept verbatim, otherwise only the text between the first '{'
        /// and the next '|' (or, if there is none, the next '}') is kept.
        /// </summary>
        /// <param name="tagged">One line of the tagged file.</param>
        /// <returns>The pieces extracted from each token, joined by single spaces.</returns>
        static string lemmaFromTaggedLine(string tagged)
        {
            string lem = "";
            foreach (string token in tagged.Split(' '))
            {
                if (lem != "")
                {
                    lem += " ";
                }
                int open = token.IndexOf('{');
                if (open == -1)
                {
                    lem += token;
                    continue;
                }
                // Look for the terminator only after the opening brace, so a stray '|' or '}' earlier
                // in the token cannot produce a negative length.
                int end = token.IndexOf('|', open + 1);
                if (end == -1)
                {
                    end = token.IndexOf('}', open + 1);
                }
                if (end == -1)
                {
                    // Unterminated brace: keep everything after it rather than throwing.
                    end = token.Length;
                }
                lem += token.Substring(open + 1, end - open - 1);
            }
            return lem;
        }
Esempio n. 3
0
        /// <summary>
        /// Runs every blank-line-separated entry of <paramref name="fnIn"/> through a
        /// <c>RuEntryCleaner</c>, writes the entries the cleaner returns to <paramref name="fnOut"/>, and
        /// writes the lemma of each kept entry that contains a space to <paramref name="fnToLem"/>.
        /// A one-line summary of the counts is printed to the console at the end.
        /// </summary>
        /// <param name="fnIn">Path of the input entries file.</param>
        /// <param name="fnOut">Path of the cleaned entries file; written with "\n" line endings.</param>
        /// <param name="fnToLem">Path of the file receiving space-containing lemmas, one per line, "\n" line endings.</param>
        static void cleanupRu(string fnIn, string fnOut, string fnToLem)
        {
            int parsedCount = 0;
            int keptCount = 0;
            int translatedCount = 0;
            int multiwordCount = 0;
            List<string> pending = new List<string>();
            RuEntryCleaner entryCleaner = new RuEntryCleaner();

            using (StreamReader reader = new StreamReader(fnIn))
            using (StreamWriter entryWriter = new StreamWriter(fnOut))
            using (StreamWriter lemmaWriter = new StreamWriter(fnToLem))
            {
                entryWriter.NewLine = "\n";
                lemmaWriter.NewLine = "\n";

                for (string current = reader.ReadLine(); current != null; current = reader.ReadLine())
                {
                    // Only a blank line that follows buffered content terminates an entry;
                    // a blank line seen while the buffer is empty is buffered like any other line.
                    if (current.Length != 0 || pending.Count == 0)
                    {
                        pending.Add(current);
                        continue;
                    }

                    var parsed = WiktEntry.FromLinesRu(pending);
                    pending.Clear();
                    if (parsed == null)
                    {
                        continue;
                    }
                    parsedCount++;

                    var cleaned = entryCleaner.Clean(parsed);
                    if (cleaned == null)
                    {
                        continue;
                    }
                    keptCount++;

                    if (cleaned.Translations.Count > 0)
                    {
                        translatedCount++;
                    }

                    cleaned.WriteRu(entryWriter);

                    // A lemma containing a space is both counted and written to the lemma file.
                    if (cleaned.Lemma.IndexOf(' ') != -1)
                    {
                        multiwordCount++;
                        lemmaWriter.WriteLine(cleaned.Lemma);
                    }
                }
                // Lines after the last blank line are left in the buffer and never parsed.
            }

            string summary = parsedCount + " entries / " + keptCount + " kept / " + translatedCount + " with translations / " + multiwordCount + " multiword";
            Console.WriteLine(summary);
        }