Exemple #1
0
        /// <summary>
        /// Rebuilds a <see cref="WiktEntry"/> from the lines of one serialized German entry.
        /// </summary>
        /// <param name="lines">
        /// Lines of a single entry: a tab-separated header (lemma, then part of speech) followed by
        /// the meaning lines and then the translation lines.
        /// </param>
        /// <returns>
        /// The parsed entry, or <c>null</c> if <paramref name="lines"/> is null or empty, or if the
        /// header does not contain at least two tab-separated fields.
        /// </returns>
        public static WiktEntry FromLinesDe(List <string> lines)
        {
            // Malformed input is reported as null (which callers test for) rather than by an
            // index-out-of-range exception.
            if (lines == null || lines.Count == 0)
            {
                return null;
            }
            string[] parts = lines[0].Split('\t');
            if (parts.Length < 2)
            {
                return null;
            }

            WiktEntry res = new WiktEntry
            {
                Lemma = parts[0],
                PoS   = parts[1],
            };
            bool inTranslations = false;

            for (int i = 1; i < lines.Count; ++i)
            {
                string ln = lines[i];
                // Prefix tests are ordinal: this is markup, not linguistic text.
                if (!ln.StartsWith(":", System.StringComparison.Ordinal))
                {
                    // The first line without a leading colon switches to translations for good.
                    inTranslations = true;
                }
                if (ln.StartsWith("::", System.StringComparison.Ordinal))
                {
                    // Double-colon lines are dropped in either section.
                    continue;
                }
                if (inTranslations)
                {
                    res.Translations.Add(ln);
                }
                else
                {
                    res.Meanings.Add(ln);
                }
            }
            return res;
        }
Exemple #2
0
        /// <summary>
        /// Reads blank-line-separated entries from <paramref name="fnEntries"/>, tallies their markup
        /// via the per-entry <c>getMarkupRu</c> overload, and hands the four tallies to
        /// <c>writeMarkupCounts</c> with file names derived from <paramref name="fnBaseOut"/>.
        /// </summary>
        /// <param name="fnEntries">Path of the entries file to read.</param>
        /// <param name="fnBaseOut">Prefix of the four output file names.</param>
        static void getMarkupRu(string fnEntries, string fnBaseOut)
        {
            var poss         = new Dictionary<string, int>();
            var meaningCurly = new Dictionary<string, int>();
            var enCurly      = new Dictionary<string, int>();
            var deCurly      = new Dictionary<string, int>();
            var entryLines   = new List<string>();

            using (var sr = new StreamReader(fnEntries))
            {
                while (true)
                {
                    string line = sr.ReadLine();
                    // A blank line and the end of the file both close the entry gathered so far.
                    // Handling the end of file here keeps the last entry from being lost when the
                    // file has no trailing blank line; skipping blank lines while nothing is pending
                    // keeps an empty string from becoming the header of the next entry.
                    if (string.IsNullOrEmpty(line))
                    {
                        if (entryLines.Count > 0)
                        {
                            var we = WiktEntry.FromLinesRu(entryLines);
                            if (we != null)
                            {
                                getMarkupRu(we, poss, meaningCurly, enCurly, deCurly);
                            }
                            entryLines.Clear();
                        }
                        if (line == null)
                        {
                            break;
                        }
                        continue;
                    }
                    entryLines.Add(line);
                }
            }
            writeMarkupCounts(poss, fnBaseOut + "-curly-poss.txt");
            writeMarkupCounts(meaningCurly, fnBaseOut + "-curly-meanings.txt");
            writeMarkupCounts(enCurly, fnBaseOut + "-curly-en.txt");
            writeMarkupCounts(deCurly, fnBaseOut + "-curly-de.txt");
        }
Exemple #3
0
        /// <summary>
        /// Reads blank-line-separated German entries from <paramref name="fnEntries"/>, transforms each
        /// with <c>WiktEntryTransformerDe</c>, and writes the kept results to <paramref name="fnJson"/>
        /// as one indented JSON array.
        /// </summary>
        /// <param name="fnEntries">Path of the entries file to read.</param>
        /// <param name="fnJson">Path of the JSON file to write.</param>
        static void entriesToDictDe(string fnEntries, string fnJson)
        {
            var entries    = new List<Entry>();
            var xformer    = new WiktEntryTransformerDe();
            var entryLines = new List<string>();

            using (var sr = new StreamReader(fnEntries))
            {
                while (true)
                {
                    string line = sr.ReadLine();
                    // A blank line and the end of the file both close the entry gathered so far.
                    // Handling the end of file here keeps the last entry from being lost when the
                    // file has no trailing blank line; skipping blank lines while nothing is pending
                    // keeps an empty string from becoming the header of the next entry.
                    if (string.IsNullOrEmpty(line))
                    {
                        if (entryLines.Count > 0)
                        {
                            var we = WiktEntry.FromLinesDe(entryLines);
                            if (we != null)
                            {
                                var xfe = xformer.Transform(we);
                                // Keep only transformed entries that have a non-empty headword.
                                if (xfe != null && xfe.Head != "")
                                {
                                    entries.Add(xfe);
                                }
                            }
                            entryLines.Clear();
                        }
                        if (line == null)
                        {
                            break;
                        }
                        continue;
                    }
                    entryLines.Add(line);
                }
            }
            using (var sw = new StreamWriter(fnJson))
            {
                sw.NewLine = "\n";
                var allEntriesStr = JsonConvert.SerializeObject(entries, Formatting.Indented);
                sw.WriteLine(allEntriesStr);
            }
        }
Exemple #4
0
        /// <summary>
        /// Tallies the markup of one entry: its part of speech, plus every <c>reCurly</c> match found
        /// in its meanings and in its translations.
        /// </summary>
        /// <param name="we">Entry to inspect.</param>
        /// <param name="poss">Tally keyed by part of speech.</param>
        /// <param name="meaningCurly">Tally of matches found in meanings.</param>
        /// <param name="enCurly">Tally of matches found in translations that start with "en".</param>
        /// <param name="deCurly">Tally of matches found in all other translations.</param>
        static void getMarkupRu(WiktEntry we,
                                Dictionary <string, int> poss,
                                Dictionary <string, int> meaningCurly,
                                Dictionary <string, int> enCurly,
                                Dictionary <string, int> deCurly)
        {
            inc(poss, we.PoS);

            foreach (var meaningLine in we.Meanings)
            {
                foreach (Match hit in reCurly.Matches(meaningLine))
                {
                    inc(meaningCurly, hit.Value);
                }
            }

            foreach (var translationLine in we.Translations)
            {
                MatchCollection hits = reCurly.Matches(translationLine);

                // Translations starting with "en" go to the English tally, everything else to the German one.
                Dictionary<string, int> tally;
                if (translationLine.StartsWith("en"))
                {
                    tally = enCurly;
                }
                else
                {
                    tally = deCurly;
                }

                foreach (Match hit in hits)
                {
                    inc(tally, hit.Value);
                }
            }
        }
Exemple #5
0
        /// <summary>
        /// Copies the entries of <paramref name="fnIn"/> to <paramref name="fnOut"/>, setting
        /// <c>Lemmatized</c> on every entry whose lemma has a lemmatized form.
        /// </summary>
        /// <remarks>
        /// <paramref name="fnPlain"/> and <paramref name="fnLems"/> are read in lockstep: line N of the
        /// first is the plain lemma, line N of the second its analyzed form. In the analyzed form each
        /// space-separated token either carries no brace (copied as is) or contributes the text between
        /// '{' and the following '|' (or '}' when there is no '|').
        /// </remarks>
        /// <param name="fnIn">Path of the entries file to read.</param>
        /// <param name="fnPlain">Path of the file with one plain lemma per line.</param>
        /// <param name="fnLems">Path of the file with the analyzed form of each plain lemma.</param>
        /// <param name="fnOut">Path of the entries file to write.</param>
        /// <exception cref="InvalidDataException">
        /// <paramref name="fnLems"/> has fewer lines than <paramref name="fnPlain"/>.
        /// </exception>
        static void mergeLemsRu(string fnIn, string fnPlain, string fnLems, string fnOut)
        {
            var plainToLem = new Dictionary<string, string>();

            using (var srPlain = new StreamReader(fnPlain))
            using (var srLems = new StreamReader(fnLems))
            {
                string plain;
                while ((plain = srPlain.ReadLine()) != null)
                {
                    string analyzed = srLems.ReadLine();
                    if (analyzed == null)
                    {
                        // Fail with a clear message instead of a null dereference.
                        throw new InvalidDataException(
                            "Lemmatized file has fewer lines than plain file: " + fnLems);
                    }

                    string lem = "";
                    foreach (string token in analyzed.Split(' '))
                    {
                        if (lem != "")
                        {
                            lem += " ";
                        }
                        int open = token.IndexOf('{');
                        if (open == -1)
                        {
                            lem += token;
                            continue;
                        }
                        // Look for the terminator only after the opening brace, so a stray '|' or '}'
                        // earlier in the token cannot produce a negative substring length.
                        int end = token.IndexOf('|', open + 1);
                        if (end == -1)
                        {
                            end = token.IndexOf('}', open + 1);
                        }
                        if (end == -1)
                        {
                            // Unterminated brace: keep the token unchanged rather than throw.
                            lem += token;
                            continue;
                        }
                        lem += token.Substring(open + 1, end - open - 1);
                    }
                    plainToLem[plain] = lem;
                }
            }

            var entryLines = new List<string>();

            using (var sr = new StreamReader(fnIn))
            using (var swOut = new StreamWriter(fnOut))
            {
                swOut.NewLine = "\n";
                while (true)
                {
                    string line = sr.ReadLine();
                    // A blank line and the end of the file both close the entry gathered so far.
                    // Handling the end of file here keeps the last entry from being lost when the
                    // file has no trailing blank line; skipping blank lines while nothing is pending
                    // keeps an empty string from becoming the header of the next entry.
                    if (string.IsNullOrEmpty(line))
                    {
                        if (entryLines.Count > 0)
                        {
                            var we = WiktEntry.FromLinesRu(entryLines);
                            if (we != null)
                            {
                                string lemmatized;
                                if (plainToLem.TryGetValue(we.Lemma, out lemmatized))
                                {
                                    we.Lemmatized = lemmatized;
                                }
                                we.WriteRu(swOut);
                            }
                            entryLines.Clear();
                        }
                        if (line == null)
                        {
                            break;
                        }
                        continue;
                    }
                    entryLines.Add(line);
                }
            }
        }
Exemple #6
0
        /// <summary>
        /// Runs every entry of <paramref name="fnIn"/> through <c>RuEntryCleaner</c>, writes the kept
        /// entries to <paramref name="fnOut"/>, writes the lemmas that contain a space to
        /// <paramref name="fnToLem"/>, and prints the counts to the console.
        /// </summary>
        /// <param name="fnIn">Path of the entries file to read.</param>
        /// <param name="fnOut">Path of the cleaned entries file to write.</param>
        /// <param name="fnToLem">Path of the file that receives the multiword lemmas, one per line.</param>
        static void cleanupRu(string fnIn, string fnOut, string fnToLem)
        {
            int cntIn = 0, cntKept = 0, cntTrans = 0, cntSpacee = 0;
            var cleaner    = new RuEntryCleaner();
            var entryLines = new List<string>();

            using (var sr = new StreamReader(fnIn))
            using (var swOut = new StreamWriter(fnOut))
            using (var swToLem = new StreamWriter(fnToLem))
            {
                swOut.NewLine   = "\n";
                swToLem.NewLine = "\n";
                while (true)
                {
                    string line = sr.ReadLine();
                    // A blank line and the end of the file both close the entry gathered so far.
                    // Handling the end of file here keeps the last entry from being lost when the
                    // file has no trailing blank line; skipping blank lines while nothing is pending
                    // keeps an empty string from becoming the header of the next entry.
                    if (string.IsNullOrEmpty(line))
                    {
                        if (entryLines.Count > 0)
                        {
                            var we = WiktEntry.FromLinesRu(entryLines);
                            if (we != null)
                            {
                                ++cntIn;
                                var cleanEntry = cleaner.Clean(we);
                                if (cleanEntry != null)
                                {
                                    ++cntKept;
                                    if (cleanEntry.Translations.Count > 0)
                                    {
                                        ++cntTrans;
                                    }
                                    cleanEntry.WriteRu(swOut);
                                    // Multiword lemmas are counted and also listed for lemmatization.
                                    if (cleanEntry.Lemma.IndexOf(' ') != -1)
                                    {
                                        ++cntSpacee;
                                        swToLem.WriteLine(cleanEntry.Lemma);
                                    }
                                }
                            }
                            entryLines.Clear();
                        }
                        if (line == null)
                        {
                            break;
                        }
                        continue;
                    }
                    entryLines.Add(line);
                }
            }
            Console.WriteLine(cntIn + " entries / " + cntKept + " kept / " + cntTrans + " with translations / " + cntSpacee + " multiword");
        }
        /// <summary>
        /// Builds a cleaned copy of <paramref name="entry"/>.
        /// </summary>
        /// <param name="entry">Raw entry; its <c>PoS</c> text is kept verbatim as <c>Details</c> of the copy.</param>
        /// <returns>The cleaned entry, or <c>null</c> when none of its meanings is kept.</returns>
        public WiktEntry Clean(WiktEntry entry)
        {
            WiktEntry cleaned = new WiktEntry();
            cleaned.Lemma = entry.Lemma;

            // Part of speech: first capture group of rePoS, or empty when the pattern does not match.
            Match posMatch = rePoS.Match(entry.PoS);
            cleaned.PoS = posMatch.Success ? posMatch.Groups[1].Value : "";

            // Pronunciation falls back to the lemma itself when none is found.
            string pron = getPron(entry.PoS);
            cleaned.Pron = pron == "" ? entry.Lemma : pron;

            cleaned.Details = entry.PoS;

            foreach (var rawMeaning in entry.Meanings)
            {
                // Keep a meaning only if it has something that looks like content.
                if (anyMeaning(rawMeaning))
                {
                    cleaned.Meanings.Add(cleanMeaning(rawMeaning));
                }
            }

            foreach (var rawTrans in entry.Translations)
            {
                string cleanedTrans = cleanTrans(rawTrans);
                if (cleanedTrans.Length <= 2)
                {
                    continue;
                }
                cleaned.Translations.Add(cleanedTrans);
            }

            return cleaned.Meanings.Count == 0 ? null : cleaned;
        }
Exemple #8
0
        /// <summary>
        /// Rebuilds a <see cref="WiktEntry"/> from the lines of one serialized Russian entry.
        /// </summary>
        /// <param name="lines">
        /// Lines of a single entry: a tab-separated header (lemma, part of speech, then optionally
        /// pronunciation, lemmatized form and details) followed by the meaning lines, which start
        /// with '#', and then the translation lines.
        /// </param>
        /// <returns>
        /// The parsed entry, or <c>null</c> if <paramref name="lines"/> is null or empty, or if the
        /// header does not contain at least two tab-separated fields.
        /// </returns>
        public static WiktEntry FromLinesRu(List <string> lines)
        {
            // Malformed input is reported as null (which most callers test for) rather than by an
            // index-out-of-range exception.
            if (lines == null || lines.Count == 0)
            {
                return null;
            }
            string[] parts = lines[0].Split('\t');
            if (parts.Length < 2)
            {
                return null;
            }

            WiktEntry res = new WiktEntry
            {
                Lemma = parts[0],
                PoS   = parts[1],
            };

            // The remaining header fields are optional.
            if (parts.Length > 2)
            {
                res.Pron = parts[2];
            }
            if (parts.Length > 3)
            {
                res.Lemmatized = parts[3];
            }
            if (parts.Length > 4)
            {
                res.Details = parts[4];
            }
            bool inTranslations = false;

            for (int i = 1; i < lines.Count; ++i)
            {
                string ln = lines[i];
                // Prefix test is ordinal: this is markup, not linguistic text.
                // The first line without a leading '#' switches to translations for good.
                if (!ln.StartsWith("#", System.StringComparison.Ordinal))
                {
                    inTranslations = true;
                }
                if (inTranslations)
                {
                    res.Translations.Add(ln);
                }
                else
                {
                    res.Meanings.Add(ln);
                }
            }
            return res;
        }
        /// <summary>
        /// Converts a parsed German <see cref="WiktEntry"/> into a dictionary <see cref="Entry"/>.
        /// </summary>
        /// <param name="we">Entry to convert.</param>
        /// <returns>
        /// The converted entry, or <c>null</c> when the lemma contains a colon or the part of speech
        /// is "Konjugierte Form" or "Deklinierte Form".
        /// </returns>
        public Entry Transform(WiktEntry we)
        {
            bool skipped = we.Lemma.Contains(":")
                           || we.PoS == "Konjugierte Form"
                           || we.PoS == "Deklinierte Form";
            if (skipped)
            {
                return null;
            }

            // Shared replacements: the first capture group, bare or wrapped in curly braces.
            MatchEvaluator firstGroup         = match => match.Groups[1].Value;
            MatchEvaluator firstGroupInBraces = match => "{" + match.Groups[1].Value + "}";

            Entry result = new Entry
            {
                Head = we.Lemma,
                PoS  = we.PoS,
            };
            result.Head = reLink2.Replace(reLink1.Replace(result.Head, firstGroup), firstGroup);

            // Meanings, addressable by the number they carry in the source
            var meaningsByNumber = new Dictionary<int, Meaning>();

            foreach (var rawMeaning in we.Meanings)
            {
                if (rawMeaning.StartsWith("::"))
                {
                    continue;
                }
                var numbered = reMeanIx.Match(rawMeaning);
                if (!numbered.Success)
                {
                    continue;
                }
                int number = int.Parse(numbered.Groups[1].Value);

                // The rewrites below are applied in this exact order.
                string definition = numbered.Groups[2].Value;
                definition = reLink1.Replace(definition, firstGroup);
                definition = reLink2.Replace(definition, firstGroup);
                definition = reTrans.Replace(definition, "{trans.}");
                definition = reK.Replace(definition, match => "{" + match.Groups[1].Value.Replace("|", ", ") + "}");
                definition = reQS.Replace(definition, "");
                definition = reMeta1.Replace(definition, firstGroupInBraces);
                definition = reMeta2.Replace(definition, firstGroupInBraces);
                definition = reItal.Replace(definition, match => "<" + match.Groups[1].Value + ">");
                definition = reRef.Replace(definition, "");
                if (definition.Trim().Length == 0)
                {
                    continue;
                }

                var meaning = new Meaning {
                    SrcDef = definition
                };
                result.Meanings.Add(meaning);
                meaningsByNumber[number] = meaning;
            }

            foreach (var translationLine in we.Translations)
            {
                // Split on "; [" and give every piece after the first its opening bracket back.
                string[] pieces = translationLine.Split("; [");
                for (int p = 1; p < pieces.Length; ++p)
                {
                    pieces[p] = "[" + pieces[p];
                }

                foreach (var piece in pieces)
                {
                    // Which meanings does this translation refer to?
                    var indexMatch = reTxIx1.Match(piece);
                    if (!indexMatch.Success)
                    {
                        continue;
                    }

                    var targets = new List<int>();
                    foreach (var indexPart in indexMatch.Groups[1].Value.Split(','))
                    {
                        int single;
                        if (int.TryParse(indexPart, out single))
                        {
                            targets.Add(single);
                            continue;
                        }
                        // Not a single number: try a "from-to" range, accepting an en dash as separator.
                        string[] bounds = indexPart.Replace('–', '-').Split('-');
                        int from, to;
                        if (int.TryParse(bounds[0], out from) && int.TryParse(bounds[1], out to))
                        {
                            for (int n = from; n <= to; ++n)
                            {
                                targets.Add(n);
                            }
                        }
                    }

                    // No indexes parsed: forget it
                    if (targets.Count == 0)
                    {
                        continue;
                    }

                    // Attach every equivalent to every referenced meaning that exists.
                    foreach (Match equivalent in reTxEn.Matches(piece))
                    {
                        foreach (int target in targets)
                        {
                            Meaning referenced;
                            if (meaningsByNumber.TryGetValue(target, out referenced))
                            {
                                referenced.OtherLangs.Add(equivalent.Groups[1].Value);
                            }
                        }
                    }
                }
            }
            return result;
        }
Exemple #10
0
        /// <summary>
        /// Extracts the Russian entries of one wiki page by walking its lines with a small state machine.
        /// </summary>
        /// <param name="pageTitle">Title of the page; used as the lemma of every entry produced.</param>
        /// <param name="pageText">Page text; it is split on '\n' and processed line by line.</param>
        /// <returns>
        /// The gathered entries. Only entries with at least one meaning are included, so the list may be empty.
        /// </returns>
        public List <WiktEntry> GetEntries(string pageTitle, string pageText)
        {
            List <WiktEntry> res = new List <WiktEntry>();

            string[] lines = pageText.Split('\n');
            var      entry = new WiktEntry();

            entry.Lemma = pageTitle;
            // Name of the translation block seen most recently; it is prepended to each translation item
            string transBlock = null;
            int    state      = 0;

            // 0: start; not in russian section
            // 1: russian section start observed
            // 2: morphology heading observed (within russian section)
            // 3: empty line observed after morphology heading
            // 4: meaning heading observed (after russian morphology heading)
            // 5: empty line observed after meaning heading
            // 6: translation heading observed (after russian meaning heading)
            foreach (var x in lines)
            {
                // Resolve some remaining entities
                string line = cleanLine(x);
                // First: state changes
                // These all continue
                if (line.StartsWith("= {{-"))
                {
                    if (line.StartsWith("= {{-ru-}} ="))
                    {
                        state = 1;
                    }
                    else
                    {
                        // New language section can close gathered entry
                        if (entry.Meanings.Count > 0)
                        {
                            res.Add(entry);
                            entry       = new WiktEntry();
                            entry.Lemma = pageTitle;
                        }
                        state = 0;
                    }
                    continue;
                }
                // kMorf / kExpr are declared elsewhere; presumably the morphology and expression headings
                if (line == kMorf || line == kExpr)
                {
                    // New morphology heading can close gathered entry
                    if (entry.Meanings.Count > 0)
                    {
                        res.Add(entry);
                        entry       = new WiktEntry();
                        entry.Lemma = pageTitle;
                    }
                    // The state only advances when coming straight from the section start
                    if (state == 1)
                    {
                        state = 2;
                    }
                    continue;
                }
                if (line == kMeaning)
                {
                    if (state == 3)
                    {
                        state = 4;
                    }
                    continue;
                }
                if (line == kTrans)
                {
                    if (state == 5)
                    {
                        state = 6; transBlock = "";
                    }
                    continue;
                }
                // Eating content within state
                // Just after morphology header
                if (state == 2)
                {
                    if (line != "")
                    {
                        // Every non-empty line up to the first empty one is appended to the part of speech
                        entry.PoS += line;
                    }
                    else
                    {
                        state = 3; continue;
                    }
                }
                // After meanings header
                if (state == 4)
                {
                    // An empty line or a lone "#" ends the list of meanings
                    if (line.Trim() == "" || line.Trim() == "#")
                    {
                        state = 5; continue;
                    }
                    if (line.StartsWith("#"))
                    {
                        entry.Meanings.Add(line);
                    }
                    else if (entry.Meanings.Count > 0)
                    {
                        // Any other line is a continuation: append it to the last meaning
                        entry.Meanings[entry.Meanings.Count - 1] += line;
                    }
                }
                // After translation header
                if (state == 6)
                {
                    // Some other heading: let's get out of translation state, back into 1
                    if (line.StartsWith("="))
                    {
                        state = 1; continue;
                    }
                    // Translation-block
                    Match m = reTransBlock.Match(line);
                    if (m.Success)
                    {
                        transBlock = m.Groups[1].Value; continue;
                    }
                    // Specific translation
                    m = reTransItem.Match(line);
                    if (m.Success)
                    {
                        // Stored as: first capture group, tab, current block, tab, second capture group
                        entry.Translations.Add(m.Groups[1].Value + "\t" + transBlock + "\t" + m.Groups[2].Value);
                    }
                }
            }
            // Close the entry still being gathered when the page ends
            if (entry.Meanings.Count > 0)
            {
                res.Add(entry);
            }
            return(res);
        }
Exemple #11
0
        /// <summary>
        /// Extracts the entries of one German wiki page by scanning its lines for lemma headings,
        /// part-of-speech headings, the meanings section and the translations section.
        /// </summary>
        /// <param name="pageTitle">Title of the page; not used by this method.</param>
        /// <param name="pageText">Page text; it is split on '\n' and processed line by line.</param>
        /// <returns>
        /// The gathered entries. Only entries with at least one meaning are included, so the list may be empty.
        /// </returns>
        public List <WiktEntry> GetEntries(string pageTitle, string pageText)
        {
            var res = new List<WiktEntry>();

            string[] lines             = pageText.Split('\n');
            var      entry             = new WiktEntry();
            bool     afterMeanings     = false;
            bool     afterTranslations = false;

            foreach (var line in lines)
            {
                // Lemma heading: closes the entry gathered so far, if it has any meaning
                var m = reLemma.Match(line);
                if (m.Success)
                {
                    if (entry.Meanings.Count > 0)
                    {
                        res.Add(entry);
                        entry = new WiktEntry();
                    }
                    entry.Lemma = m.Groups[1].Value;
                    continue;
                }
                // Part-of-speech heading: also closes the gathered entry; the new one keeps the lemma
                m = rePoS.Match(line);
                if (m.Success)
                {
                    if (entry.Meanings.Count > 0)
                    {
                        res.Add(entry);
                        entry = new WiktEntry {
                            Lemma = entry.Lemma
                        };
                    }
                    entry.PoS     = m.Groups[1].Value;
                    entry.Details = m.Groups[2].Value;
                    continue;
                }
                if (line == "{{Bedeutungen}}")
                {
                    afterMeanings = true;
                    continue;
                }
                if (afterMeanings)
                {
                    // An empty line ends the meanings section
                    if (line == "")
                    {
                        afterMeanings = false;
                        continue;
                    }
                    // Meanings seen before any part-of-speech heading are ignored
                    if (entry.PoS == null)
                    {
                        continue;
                    }
                    entry.Meanings.Add(line);
                }
                if (line == "==== {{Übersetzungen}} ====")
                {
                    afterTranslations = true;
                    continue;
                }
                if (afterTranslations)
                {
                    // An empty line ends the translations section. The flag cleared here must be
                    // afterTranslations: clearing afterMeanings instead would leave translation
                    // mode switched on for the rest of the page.
                    if (line == "")
                    {
                        afterTranslations = false;
                        continue;
                    }
                    // Translations are only kept for an entry that already has meanings
                    if (entry.Meanings.Count == 0)
                    {
                        continue;
                    }
                    m = reTranslation.Match(line);
                    if (m.Success)
                    {
                        entry.Translations.Add(m.Groups[1].Value);
                    }
                }
            }
            // Close the entry still being gathered when the page ends
            if (entry.Meanings.Count > 0)
            {
                res.Add(entry);
            }
            return res;
        }