/// <summary>
/// Reads blank-line-separated entries from <paramref name="fnEntries"/>, feeds every
/// successfully parsed entry to the per-entry <c>getMarkupRu</c> overload, and then writes
/// the four accumulated string-to-int tallies to files derived from <paramref name="fnBaseOut"/>.
/// </summary>
/// <remarks>
/// An entry is only parsed when an empty line follows it; lines still pending when the
/// input ends are not processed. An empty line encountered while nothing is pending is
/// stored as the first line of the next entry.
/// </remarks>
/// <param name="fnEntries">Path of the entries file to read.</param>
/// <param name="fnBaseOut">
/// Prefix for the output files; the suffixes "-curly-poss.txt", "-curly-meanings.txt",
/// "-curly-en.txt" and "-curly-de.txt" are appended to it.
/// </param>
static void getMarkupRu(string fnEntries, string fnBaseOut)
{
    // Tallies filled in by the per-entry overload (presumably occurrence counts per markup tag).
    var posCounts = new Dictionary<string, int>();
    var meaningCounts = new Dictionary<string, int>();
    var enCounts = new Dictionary<string, int>();
    var deCounts = new Dictionary<string, int>();

    var pending = new List<string>();
    using (var reader = new StreamReader(fnEntries))
    {
        for (string current = reader.ReadLine(); current != null; current = reader.ReadLine())
        {
            // Only an empty line that follows at least one collected line closes an entry.
            bool closesEntry = current.Length == 0 && pending.Count > 0;
            if (!closesEntry)
            {
                pending.Add(current);
                continue;
            }

            var entry = WiktEntry.FromLinesRu(pending);
            if (entry != null)
            {
                getMarkupRu(entry, posCounts, meaningCounts, enCounts, deCounts);
            }
            pending.Clear();
        }
    }

    writeMarkupCounts(posCounts, fnBaseOut + "-curly-poss.txt");
    writeMarkupCounts(meaningCounts, fnBaseOut + "-curly-meanings.txt");
    writeMarkupCounts(enCounts, fnBaseOut + "-curly-en.txt");
    writeMarkupCounts(deCounts, fnBaseOut + "-curly-de.txt");
}
/// <summary>
/// Copies the entries of <paramref name="fnIn"/> to <paramref name="fnOut"/>, setting
/// <c>Lemmatized</c> on every entry whose <c>Lemma</c> appears as a line of
/// <paramref name="fnPlain"/>. The lemmatized text comes from the same-numbered line of
/// <paramref name="fnLems"/>.
/// </summary>
/// <remarks>
/// Each line of <paramref name="fnLems"/> is split on single spaces. A token that contains
/// '{' contributes the text between that '{' and the next '|' (or, if there is none, the
/// next '}'); any other token is copied unchanged. A token with '{' but no terminator after
/// it is also copied unchanged rather than failing.
/// Entries are delimited by empty lines; lines still pending when the input ends are not
/// written. Entries that <c>WiktEntry.FromLinesRu</c> cannot parse (null result) are skipped.
/// </remarks>
/// <param name="fnIn">Path of the entries file to read.</param>
/// <param name="fnPlain">Path of the file with one plain lemma per line.</param>
/// <param name="fnLems">Path of the file with the analyzed form of each plain line.</param>
/// <param name="fnOut">Path of the entries file to write (LF line endings).</param>
/// <exception cref="InvalidDataException">
/// <paramref name="fnLems"/> has fewer lines than <paramref name="fnPlain"/>.
/// </exception>
static void mergeLemsRu(string fnIn, string fnPlain, string fnLems, string fnOut)
{
    // Plain text line -> lemmatized text built from the matching line of fnLems.
    Dictionary<string, string> plainToLem = new Dictionary<string, string>();
    using (StreamReader srPlain = new StreamReader(fnPlain))
    using (StreamReader srLems = new StreamReader(fnLems))
    {
        string plain;
        while ((plain = srPlain.ReadLine()) != null)
        {
            string analyzed = srLems.ReadLine();
            if (analyzed == null)
            {
                // The two files are paired line by line; a short file means the data is misaligned.
                throw new InvalidDataException(
                    "Lemma file '" + fnLems + "' has fewer lines than plain file '" + fnPlain + "'.");
            }

            System.Text.StringBuilder lem = new System.Text.StringBuilder();
            foreach (string token in analyzed.Split(' '))
            {
                // Separator goes in only once something has been emitted, so leading
                // empty tokens do not produce leading spaces.
                if (lem.Length > 0)
                {
                    lem.Append(' ');
                }

                int open = token.IndexOf('{');
                if (open == -1)
                {
                    lem.Append(token);
                    continue;
                }

                // Look for the terminator only after the opening brace, so that a stray
                // '|' or '}' earlier in the token cannot yield a negative length.
                int close = token.IndexOf('|', open + 1);
                if (close == -1)
                {
                    close = token.IndexOf('}', open + 1);
                }
                if (close == -1)
                {
                    // Unterminated annotation: keep the token as it is.
                    lem.Append(token);
                    continue;
                }

                lem.Append(token, open + 1, close - open - 1);
            }
            plainToLem[plain] = lem.ToString();
        }
    }

    string line;
    List<string> entryLines = new List<string>();
    using (var sr = new StreamReader(fnIn))
    using (var swOut = new StreamWriter(fnOut))
    {
        swOut.NewLine = "\n";
        while ((line = sr.ReadLine()) != null)
        {
            // Only an empty line that follows at least one collected line closes an entry.
            if (line == "" && entryLines.Count > 0)
            {
                var we = WiktEntry.FromLinesRu(entryLines);
                if (we != null)
                {
                    string lemmatized;
                    if (plainToLem.TryGetValue(we.Lemma, out lemmatized))
                    {
                        we.Lemmatized = lemmatized;
                    }
                    we.WriteRu(swOut);
                }
                entryLines.Clear();
                continue;
            }
            entryLines.Add(line);
        }
    }
}
/// <summary>
/// Runs every entry of <paramref name="fnIn"/> through a <c>RuEntryCleaner</c>, writes the
/// entries it keeps to <paramref name="fnOut"/>, writes the lemmas that contain a space to
/// <paramref name="fnToLem"/> (one per line), and prints a summary of the counts to the console.
/// </summary>
/// <remarks>
/// Entries are delimited by empty lines; lines still pending when the input ends are not
/// processed. Entries that fail to parse, or for which the cleaner returns null, are dropped.
/// Both output files use LF line endings.
/// </remarks>
/// <param name="fnIn">Path of the entries file to read.</param>
/// <param name="fnOut">Path of the cleaned entries file to write.</param>
/// <param name="fnToLem">Path of the file receiving the multi-word lemmas.</param>
static void cleanupRu(string fnIn, string fnOut, string fnToLem)
{
    int parsedCount = 0;
    int keptCount = 0;
    int translatedCount = 0;
    int multiwordCount = 0;

    var cleaner = new RuEntryCleaner();
    var pending = new List<string>();

    using (var reader = new StreamReader(fnIn))
    using (var swOut = new StreamWriter(fnOut))
    using (var swToLem = new StreamWriter(fnToLem))
    {
        swOut.NewLine = "\n";
        swToLem.NewLine = "\n";

        for (string current = reader.ReadLine(); current != null; current = reader.ReadLine())
        {
            // Only an empty line that follows at least one collected line closes an entry.
            bool closesEntry = current.Length == 0 && pending.Count > 0;
            if (!closesEntry)
            {
                pending.Add(current);
                continue;
            }

            var parsed = WiktEntry.FromLinesRu(pending);
            if (parsed != null)
            {
                parsedCount++;
                var cleaned = cleaner.Clean(parsed);
                if (cleaned != null)
                {
                    keptCount++;
                    if (cleaned.Translations.Count > 0)
                    {
                        translatedCount++;
                    }

                    // A lemma with a space in it is counted and also queued for lemmatization.
                    bool isMultiword = cleaned.Lemma.IndexOf(' ') >= 0;
                    if (isMultiword)
                    {
                        multiwordCount++;
                    }

                    cleaned.WriteRu(swOut);

                    if (isMultiword)
                    {
                        swToLem.WriteLine(cleaned.Lemma);
                    }
                }
            }
            pending.Clear();
        }
    }

    string summary = parsedCount + " entries / " + keptCount + " kept / " + translatedCount + " with translations / " + multiwordCount + " multiword";
    Console.WriteLine(summary);
}