public static WiktEntry FromLinesDe(List<string> lines)
{
    // First line: lemma and part of speech, tab-separated.
    string[] parts = lines[0].Split('\t');
    WiktEntry res = new WiktEntry
    {
        Lemma = parts[0],
        PoS = parts[1],
    };
    bool inTranslations = false;
    for (int i = 1; i < lines.Count; ++i)
    {
        string ln = lines[i];
        // Meaning lines start with ":"; the first line that doesn't
        // marks the start of the translations.
        if (!ln.StartsWith(":")) inTranslations = true;
        // "::"-prefixed sub-items are skipped.
        if (ln.StartsWith("::")) continue;
        if (!inTranslations) res.Meanings.Add(ln);
        else res.Translations.Add(ln);
    }
    return res;
}
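// The WiktEntry class itself is not shown in this section; its shape, as inferred
// from its uses throughout the code here, is roughly the following. WriteRu
// presumably serializes in the tab-separated, "#"-prefixed format that FromLinesRu
// reads back; this is a sketch, not the repo's actual definition.
public class WiktEntry
{
    public string Lemma = "";
    public string PoS = "";
    public string Pron = "";
    public string Lemmatized = "";
    public string Details = "";
    public List<string> Meanings = new List<string>();
    public List<string> Translations = new List<string>();
    // Writes the entry in the format consumed by FromLinesRu (not shown here).
    public void WriteRu(StreamWriter sw) { /* not shown */ }
}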
static void getMarkupRu(string fnEntries, string fnBaseOut)
{
    var poss = new Dictionary<string, int>();
    var meaningCurly = new Dictionary<string, int>();
    var enCurly = new Dictionary<string, int>();
    var deCurly = new Dictionary<string, int>();
    string line;
    List<string> entryLines = new List<string>();
    using (var sr = new StreamReader(fnEntries))
    {
        while ((line = sr.ReadLine()) != null)
        {
            // Entries are separated by empty lines.
            if (line == "" && entryLines.Count > 0)
            {
                var we = WiktEntry.FromLinesRu(entryLines);
                if (we != null) getMarkupRu(we, poss, meaningCurly, enCurly, deCurly);
                entryLines.Clear();
                continue;
            }
            entryLines.Add(line);
        }
    }
    writeMarkupCounts(poss, fnBaseOut + "-curly-poss.txt");
    writeMarkupCounts(meaningCurly, fnBaseOut + "-curly-meanings.txt");
    writeMarkupCounts(enCurly, fnBaseOut + "-curly-en.txt");
    writeMarkupCounts(deCurly, fnBaseOut + "-curly-de.txt");
}
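// inc and writeMarkupCounts are referenced above but not shown in this section.
// A minimal sketch of what they presumably do (OrderByDescending needs
// System.Linq); the actual output format in the repo may differ:
static void inc(Dictionary<string, int> counter, string key)
{
    // Start a new count at 1, or bump the existing one.
    if (counter.ContainsKey(key)) ++counter[key];
    else counter[key] = 1;
}

static void writeMarkupCounts(Dictionary<string, int> counts, string fn)
{
    using (var sw = new StreamWriter(fn))
    {
        sw.NewLine = "\n";
        // Most frequent markup first; tab-separated count and key.
        foreach (var kvp in counts.OrderByDescending(x => x.Value))
            sw.WriteLine(kvp.Value + "\t" + kvp.Key);
    }
}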
static void entriesToDictDe(string fnEntries, string fnJson)
{
    List<Entry> entries = new List<Entry>();
    var xformer = new WiktEntryTransformerDe();
    string line;
    List<string> entryLines = new List<string>();
    using (var sr = new StreamReader(fnEntries))
    {
        while ((line = sr.ReadLine()) != null)
        {
            if (line == "" && entryLines.Count > 0)
            {
                var we = WiktEntry.FromLinesDe(entryLines);
                if (we != null)
                {
                    var xfe = xformer.Transform(we);
                    if (xfe != null && xfe.Head != "") entries.Add(xfe);
                }
                entryLines.Clear();
                continue;
            }
            entryLines.Add(line);
        }
    }
    using (var sw = new StreamWriter(fnJson))
    {
        sw.NewLine = "\n";
        // JsonConvert and Formatting come from Newtonsoft.Json.
        var allEntriesStr = JsonConvert.SerializeObject(entries, Formatting.Indented);
        sw.WriteLine(allEntriesStr);
    }
}
static void getMarkupRu(WiktEntry we, Dictionary<string, int> poss,
    Dictionary<string, int> meaningCurly, Dictionary<string, int> enCurly,
    Dictionary<string, int> deCurly)
{
    inc(poss, we.PoS);
    MatchCollection mm;
    foreach (var meaning in we.Meanings)
    {
        mm = reCurly.Matches(meaning);
        foreach (Match m in mm) inc(meaningCurly, m.Value);
    }
    foreach (var trans in we.Translations)
    {
        mm = reCurly.Matches(trans);
        // Translation lines start with the target language code (see the
        // Russian page parser below), so "en..." goes to the English counter.
        var counter = trans.StartsWith("en") ? enCurly : deCurly;
        foreach (Match m in mm) inc(counter, m.Value);
    }
}
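// reCurly is not shown in this section; it presumably matches {{...}} template
// markup so its frequency can be counted. A minimal sketch, assuming non-nested
// templates (needs System.Text.RegularExpressions):
static readonly Regex reCurly = new Regex(@"\{\{[^{}]+\}\}");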
static void mergeLemsRu(string fnIn, string fnPlain, string fnLems, string fnOut)
{
    // Read the plain multiword lemmas and their lemmatized counterparts in parallel.
    Dictionary<string, string> plainToLem = new Dictionary<string, string>();
    string l1, l2;
    using (StreamReader srPlain = new StreamReader(fnPlain))
    using (StreamReader srLems = new StreamReader(fnLems))
    {
        while ((l1 = srPlain.ReadLine()) != null)
        {
            l2 = srLems.ReadLine();
            // Defensive: stop if the lemmatized file is shorter than the plain file.
            if (l2 == null) break;
            string lem = "";
            string[] parts = l2.Split(' ');
            foreach (string p in parts)
            {
                if (lem != "") lem += " ";
                // Each token is expected to look like surface{lemma|...} or
                // surface{lemma}; keep only the lemma.
                int ix1 = p.IndexOf('{');
                if (ix1 == -1) { lem += p; continue; }
                int ix2 = p.IndexOf('|');
                if (ix2 == -1) ix2 = p.IndexOf('}');
                lem += p.Substring(ix1 + 1, ix2 - ix1 - 1);
            }
            plainToLem[l1] = lem;
        }
    }
    string line;
    List<string> entryLines = new List<string>();
    using (var sr = new StreamReader(fnIn))
    using (var swOut = new StreamWriter(fnOut))
    {
        swOut.NewLine = "\n";
        while ((line = sr.ReadLine()) != null)
        {
            if (line == "" && entryLines.Count > 0)
            {
                var we = WiktEntry.FromLinesRu(entryLines);
                if (plainToLem.ContainsKey(we.Lemma)) we.Lemmatized = plainToLem[we.Lemma];
                we.WriteRu(swOut);
                entryLines.Clear();
                continue;
            }
            entryLines.Add(line);
        }
    }
}
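// Illustration of the lemmatizer output parsed above (hypothetical example):
// each token carries its lemma in braces, optionally followed by |-separated
// material, of which only the lemma is kept.
//   fnPlain: красная площадь
//   fnLems:  красная{красный|красная} площадь{площадь}
//   merged lemmatized form: красный площадь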
static void cleanupRu(string fnIn, string fnOut, string fnToLem)
{
    int cntIn = 0, cntKept = 0, cntTrans = 0, cntMultiword = 0;
    RuEntryCleaner cleaner = new RuEntryCleaner();
    string line;
    List<string> entryLines = new List<string>();
    using (var sr = new StreamReader(fnIn))
    using (var swOut = new StreamWriter(fnOut))
    using (var swToLem = new StreamWriter(fnToLem))
    {
        swOut.NewLine = "\n";
        swToLem.NewLine = "\n";
        while ((line = sr.ReadLine()) != null)
        {
            if (line == "" && entryLines.Count > 0)
            {
                var we = WiktEntry.FromLinesRu(entryLines);
                if (we != null)
                {
                    ++cntIn;
                    var cleanEntry = cleaner.Clean(we);
                    if (cleanEntry != null)
                    {
                        ++cntKept;
                        if (cleanEntry.Translations.Count > 0) ++cntTrans;
                        cleanEntry.WriteRu(swOut);
                        // Multiword lemmas go to a separate file for external lemmatization.
                        if (cleanEntry.Lemma.IndexOf(' ') != -1)
                        {
                            ++cntMultiword;
                            swToLem.WriteLine(cleanEntry.Lemma);
                        }
                    }
                }
                entryLines.Clear();
                continue;
            }
            entryLines.Add(line);
        }
    }
    Console.WriteLine(cntIn + " entries / " + cntKept + " kept / " + cntTrans +
        " with translations / " + cntMultiword + " multiword");
}
public WiktEntry Clean(WiktEntry entry)
{
    WiktEntry res = new WiktEntry();
    res.Lemma = entry.Lemma;
    res.PoS = "";
    Match m = rePoS.Match(entry.PoS);
    if (m.Success) res.PoS = m.Groups[1].Value;
    // The pronunciation is pulled out of the raw morphology block;
    // fall back to the lemma itself.
    res.Pron = getPron(entry.PoS);
    if (res.Pron == "") res.Pron = entry.Lemma;
    res.Details = entry.PoS;
    foreach (var mean in entry.Meanings)
    {
        // Drop if nothing there that looks like content
        if (!anyMeaning(mean)) continue;
        // Clean meaning
        string mclean = cleanMeaning(mean);
        res.Meanings.Add(mclean);
    }
    foreach (var trans in entry.Translations)
    {
        string tclean = cleanTrans(trans);
        if (tclean.Length > 2) res.Translations.Add(tclean);
    }
    // Entries with no usable meanings are discarded.
    if (res.Meanings.Count == 0) return null;
    return res;
}
public static WiktEntry FromLinesRu(List<string> lines)
{
    // First line: lemma, PoS, and optionally pronunciation, lemmatized form,
    // and details, all tab-separated.
    string[] parts = lines[0].Split('\t');
    WiktEntry res = new WiktEntry
    {
        Lemma = parts[0],
        PoS = parts[1],
    };
    if (parts.Length > 2) res.Pron = parts[2];
    if (parts.Length > 3) res.Lemmatized = parts[3];
    if (parts.Length > 4) res.Details = parts[4];
    bool inTranslations = false;
    for (int i = 1; i < lines.Count; ++i)
    {
        string ln = lines[i];
        // Meaning lines start with "#"; the first line that doesn't
        // marks the start of the translations.
        if (!ln.StartsWith("#")) inTranslations = true;
        if (!inTranslations) res.Meanings.Add(ln);
        else res.Translations.Add(ln);
    }
    return res;
}
public Entry Transform(WiktEntry we)
{
    // Skip non-lemma pages and inflected forms.
    if (we.Lemma.Contains(":")) return null;
    if (we.PoS == "Konjugierte Form") return null;
    if (we.PoS == "Deklinierte Form") return null;
    Entry res = new Entry
    {
        Head = we.Lemma,
        PoS = we.PoS,
    };
    res.Head = reLink1.Replace(res.Head, m => m.Groups[1].Value);
    res.Head = reLink2.Replace(res.Head, m => m.Groups[1].Value);
    Dictionary<int, Meaning> numToMeaning = new Dictionary<int, Meaning>();
    for (int i = 0; i < we.Meanings.Count; ++i)
    {
        var mng = we.Meanings[i];
        if (mng.StartsWith("::")) continue;
        // Meanings are numbered; translations refer back to these numbers.
        var mIx = reMeanIx.Match(mng);
        if (!mIx.Success) continue;
        int meanIx = int.Parse(mIx.Groups[1].Value);
        mng = mIx.Groups[2].Value;
        // Strip wiki links and normalize template markup to {..} and <..>.
        mng = reLink1.Replace(mng, m => m.Groups[1].Value);
        mng = reLink2.Replace(mng, m => m.Groups[1].Value);
        mng = reTrans.Replace(mng, "{trans.}");
        mng = reK.Replace(mng, m => "{" + m.Groups[1].Value.Replace("|", ", ") + "}");
        mng = reQS.Replace(mng, "");
        mng = reMeta1.Replace(mng, m => "{" + m.Groups[1].Value + "}");
        mng = reMeta2.Replace(mng, m => "{" + m.Groups[1].Value + "}");
        mng = reItal.Replace(mng, m => "<" + m.Groups[1].Value + ">");
        mng = reRef.Replace(mng, "");
        if (mng.Trim() == "") continue;
        Meaning meaning = new Meaning { SrcDef = mng };
        res.Meanings.Add(meaning);
        numToMeaning[meanIx] = meaning;
    }
    List<int> meaningIndexes = new List<int>();
    foreach (var tx in we.Translations)
    {
        // A single line can hold several index groups; split them apart.
        string[] txs = tx.Split("; [");
        for (int i = 1; i < txs.Length; ++i) txs[i] = "[" + txs[i];
        foreach (var ln in txs)
        {
            meaningIndexes.Clear();
            // Which meanings does this translation refer to?
            var mtx = reTxIx1.Match(ln);
            if (!mtx.Success) continue;
            string[] ixParts = mtx.Groups[1].Value.Split(',');
            foreach (var ixPart in ixParts)
            {
                int val;
                if (int.TryParse(ixPart, out val)) meaningIndexes.Add(val);
                else
                {
                    // Ranges like 2-4 (hyphen or en dash) expand to each index.
                    string ixPartNorm = ixPart.Replace('–', '-');
                    string[] fromToStr = ixPartNorm.Split('-');
                    int val2;
                    if (int.TryParse(fromToStr[0], out val) && int.TryParse(fromToStr[1], out val2))
                    {
                        for (int j = val; j <= val2; ++j) meaningIndexes.Add(j);
                    }
                }
            }
            // No indexes parsed: forget it
            if (meaningIndexes.Count == 0) continue;
            // Gather all translations
            var mEquivs = reTxEn.Matches(ln);
            foreach (Match mEquiv in mEquivs)
            {
                foreach (int meaningIndex in meaningIndexes)
                {
                    if (numToMeaning.ContainsKey(meaningIndex))
                        numToMeaning[meaningIndex].OtherLangs.Add(mEquiv.Groups[1].Value);
                }
            }
        }
    }
    return res;
}
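// The regexes used by Transform are defined elsewhere in WiktEntryTransformerDe.
// Plausible sketches, assuming standard de.wiktionary markup; the exact patterns
// in the repo may differ:
// Numbered meaning line, e.g. ":[1] definition..."
static readonly Regex reMeanIx = new Regex(@"^:+\[(\d+)\] *(.*)$");
// Leading meaning-index group on a translation chunk, e.g. "[1, 3-5]"
static readonly Regex reTxIx1 = new Regex(@"^\[([0-9,\- –]+)\]");
// English equivalent template, e.g. "{{Ü|en|dictionary}}"
static readonly Regex reTxEn = new Regex(@"\{\{Ü\|en\|([^}|]+)\}\}");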
public List<WiktEntry> GetEntries(string pageTitle, string pageText)
{
    List<WiktEntry> res = new List<WiktEntry>();
    string[] lines = pageText.Split('\n');
    var entry = new WiktEntry();
    entry.Lemma = pageTitle;
    string transBlock = null;
    int state = 0;
    // 0: start; not in Russian section
    // 1: Russian section start observed
    // 2: morphology heading observed (within Russian section)
    // 3: empty line observed after morphology heading
    // 4: meaning heading observed (after Russian morphology heading)
    // 5: empty line observed after meaning heading
    // 6: translation heading observed (after Russian meaning heading)
    foreach (var x in lines)
    {
        // Resolve some remaining entities
        string line = cleanLine(x);
        // First: state changes. These all continue.
        if (line.StartsWith("= {{-"))
        {
            if (line.StartsWith("= {{-ru-}} =")) state = 1;
            else
            {
                // A new language section can close the gathered entry
                if (entry.Meanings.Count > 0)
                {
                    res.Add(entry);
                    entry = new WiktEntry();
                    entry.Lemma = pageTitle;
                }
                state = 0;
            }
            continue;
        }
        if (line == kMorf || line == kExpr)
        {
            // A new morphology heading can close the gathered entry
            if (entry.Meanings.Count > 0)
            {
                res.Add(entry);
                entry = new WiktEntry();
                entry.Lemma = pageTitle;
            }
            if (state == 1) state = 2;
            continue;
        }
        if (line == kMeaning)
        {
            if (state == 3) state = 4;
            continue;
        }
        if (line == kTrans)
        {
            if (state == 5) { state = 6; transBlock = ""; }
            continue;
        }
        // Eating content within state
        // Just after morphology header
        if (state == 2)
        {
            if (line != "") entry.PoS += line;
            else { state = 3; continue; }
        }
        // After meanings header
        if (state == 4)
        {
            if (line.Trim() == "" || line.Trim() == "#") { state = 5; continue; }
            if (line.StartsWith("#")) entry.Meanings.Add(line);
            // Continuation lines get appended to the previous meaning.
            else if (entry.Meanings.Count > 0)
                entry.Meanings[entry.Meanings.Count - 1] += line;
        }
        // After translation header
        if (state == 6)
        {
            // Some other heading: let's get out of translation state, back into 1
            if (line.StartsWith("=")) { state = 1; continue; }
            // Translation block (target language group)
            Match m = reTransBlock.Match(line);
            if (m.Success) { transBlock = m.Groups[1].Value; continue; }
            // Specific translation
            m = reTransItem.Match(line);
            if (m.Success)
                entry.Translations.Add(m.Groups[1].Value + "\t" + transBlock + "\t" + m.Groups[2].Value);
        }
    }
    if (entry.Meanings.Count > 0) res.Add(entry);
    return res;
}
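// The section-heading constants and translation regexes live elsewhere in the
// Russian page parser. Plausible sketches, assuming standard ru.wiktionary
// section headings and {{перев-блок}} translation tables; the exact strings and
// patterns in the repo may differ:
const string kMorf = "=== Морфологические и синтаксические свойства ===";
const string kExpr = "=== Тип и синтаксические свойства сочетания ===";
const string kMeaning = "==== Значение ====";
const string kTrans = "==== Перевод ====";
// Opening of a per-meaning translation block, capturing the gloss
static readonly Regex reTransBlock = new Regex(@"^\{\{перев-блок\|([^}|]*)");
// One language's translations inside the block, e.g. "|en=[[dictionary]]"
static readonly Regex reTransItem = new Regex(@"^\|(\w+)=(.+)$");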
public List<WiktEntry> GetEntries(string pageTitle, string pageText)
{
    List<WiktEntry> res = new List<WiktEntry>();
    string[] lines = pageText.Split('\n');
    var entry = new WiktEntry();
    bool afterMeanings = false;
    bool afterTranslations = false;
    foreach (var line in lines)
    {
        // New lemma heading: close the gathered entry.
        var m = reLemma.Match(line);
        if (m.Success)
        {
            if (entry.Meanings.Count > 0)
            {
                res.Add(entry);
                entry = new WiktEntry();
            }
            entry.Lemma = m.Groups[1].Value;
            continue;
        }
        // New PoS heading: close the gathered entry, keeping the lemma.
        m = rePoS.Match(line);
        if (m.Success)
        {
            if (entry.Meanings.Count > 0)
            {
                res.Add(entry);
                entry = new WiktEntry { Lemma = entry.Lemma };
            }
            entry.PoS = m.Groups[1].Value;
            entry.Details = m.Groups[2].Value;
            continue;
        }
        if (line == "{{Bedeutungen}}") { afterMeanings = true; continue; }
        if (afterMeanings)
        {
            // A blank line ends the meanings section.
            if (line == "") { afterMeanings = false; continue; }
            if (entry.PoS == null) continue;
            entry.Meanings.Add(line);
        }
        if (line == "==== {{Übersetzungen}} ====") { afterTranslations = true; continue; }
        if (afterTranslations)
        {
            // A blank line ends the translations section.
            if (line == "") { afterTranslations = false; continue; }
            if (entry.Meanings.Count == 0) continue;
            m = reTranslation.Match(line);
            if (m.Success) entry.Translations.Add(m.Groups[1].Value);
        }
    }
    if (entry.Meanings.Count > 0) res.Add(entry);
    return res;
}
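// The regexes used by the German page parser are defined elsewhere. Plausible
// sketches, assuming standard de.wiktionary page markup; the exact patterns in
// the repo may differ:
// Lemma heading, e.g. "== Wörterbuch ({{Sprache|Deutsch}}) =="
static readonly Regex reLemma = new Regex(@"^== *(.+?) *\(\{\{Sprache\|Deutsch\}\}\) *==");
// PoS heading, e.g. "=== {{Wortart|Substantiv|Deutsch}}, {{n}} ==="
static readonly Regex rePoS = new Regex(@"^=== *\{\{Wortart\|([^}|]+)\|Deutsch\}\}(.*?) *===");
// English translation line, e.g. "*{{en}}: [1] {{Ü|en|dictionary}}"
static readonly Regex reTranslation = new Regex(@"^\*\{\{en\}\}: *(.+)$");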