/// <summary>
/// Produces a <c>WordsStems_{lang}.txt</c> file for every language in <c>Dict.allLangs</c>.
/// The input words for a language are the lines of the LingeaOld <c>wordList_{lang}.txt</c>
/// file followed by the first space-separated token of the first 50000 lines of the
/// Wiktionary.back <c>{lang}.txt</c> file; empty words are dropped, the rest is lower-cased
/// and de-duplicated. The words are handed to <c>DictLib.RunStemming</c>, the normalized
/// non-empty rows it reports are accumulated per language, and the distinct, sorted result
/// is written to the Ultralingua.back design folder.
/// </summary>
public static void MakeWordList()
{
    // NOTE(review): account name, domain and password are hard-coded in source - consider moving them to protected configuration.
    using (var imp = new Impersonator("pavel", "LANGMaster", "zvahov88_"))
    {
        DictLib.RunStemming<List<string>>(
            Dict.allLangs,
            // Source words for one language.
            lng =>
            {
                var lingeaWords = File.ReadAllLines(string.Format(Machines.rootPath + @"RwDicts\Sources\LingeaOld\design\wordList_{0}.txt", lng));
                var wiktionaryWords = File.ReadAllLines(string.Format(Machines.rootPath + @"RwDicts\Sources\Wiktionary.back\{0}.txt", lng))
                    .Select(line => line.Split(' ')[0])
                    .Take(50000);
                return lingeaWords
                    .Concat(wiktionaryWords)
                    .Where(candidate => !string.IsNullOrEmpty(candidate))
                    .Select(candidate => candidate.ToLower())
                    .Distinct()
                    .ToArray();
            },
            // Fresh accumulator for one language.
            lng => new List<string>(),
            // Collect the normalized, non-empty items of every reported row.
            (lng, word, row, res) =>
            {
                var normalized = row
                    .Select(r => doNormalize(r))
                    .Where(stem => !string.IsNullOrEmpty(stem));
                res.AddRange(normalized);
            },
            // Persist the distinct, sorted stems of one language.
            (lng, res) =>
            {
                var targetFile = string.Format(Machines.rootPath + @"RwDicts\Sources\Ultralingua.back\design\WordsStems_{0}.txt", lng);
                File.WriteAllLines(targetFile, res.Distinct().OrderBy(w => w));
            },
            imp);
    }
}
/// <summary>
/// Variant of the word-list builder for the languages in <c>otherLangsStr.Values</c>.
/// For each language the Wiktionary.back source file (named after the <c>otherLangsStr</c>
/// key whose value equals the language) is read, the first space-separated token of the
/// first 200000 lines is taken, empty words are dropped, the rest is lower-cased and
/// de-duplicated. Results reported by <c>DictLib.RunStemming</c> are normalized, accumulated
/// and written, distinct and sorted, to <c>WordsStems_{lang}.txt</c>.
/// </summary>
public static void MakeWordList_Other()
{
    // NOTE(review): account name, domain and password are hard-coded in source - consider moving them to protected configuration.
    using (var imp = new Impersonator("pavel", "LANGMaster", "zvahov88_"))
    {
        DictLib.RunStemming<List<string>>(
            otherLangsStr.Values,
            // Source words for one language.
            lng =>
            {
                // The current thread is switched to the "ar-sa" culture before the words are
                // read and lower-cased; the UI culture is assigned first, then the culture.
                var culture = CultureInfo.CreateSpecificCulture("ar-sa");
                Thread.CurrentThread.CurrentUICulture = culture;
                Thread.CurrentThread.CurrentCulture = culture;

                var sourceKey = otherLangsStr.First(kv => kv.Value == lng).Key;
                var lines = File.ReadAllLines(string.Format(@"d:\LMCom\rew\Web4\RwDicts\Sources\Wiktionary.back\src\{0}.txt", sourceKey));

                return lines
                    .Select(line => line.Split(' ')[0])
                    .Take(200000)
                    .Where(candidate => !string.IsNullOrEmpty(candidate))
                    .Select(candidate => candidate.ToLower())
                    .Distinct()
                    .ToArray();
            },
            // Fresh accumulator for one language.
            lng => new List<string>(),
            // Collect the normalized, non-empty items of every reported row.
            (lng, word, row, res) =>
            {
                var normalized = row
                    .Select(r => doNormalize(r))
                    .Where(stem => !string.IsNullOrEmpty(stem));
                res.AddRange(normalized);
            },
            // Persist the distinct, sorted stems of one language.
            (lng, res) =>
            {
                var targetFile = string.Format(Machines.rootPath + @"RwDicts\Sources\Ultralingua.back\design\WordsStems_{0}.txt", lng);
                File.WriteAllLines(targetFile, res.Distinct().OrderBy(w => w));
            },
            imp);
    }
}
/// <summary>
/// LingeaOld\temp1 + LingeaOld\design\entriesInfo.xml => LingeaOld.
/// Runs <c>repairLingea</c> on the <c>LingeaOld\temp1</c> folder, deletes every <c>*.xml</c>
/// file directly inside the <c>LingeaOld</c> folder and then serializes each returned
/// dictionary to <c>LingeaOld\{natLang}_{crsLang}.xml</c>.
/// </summary>
public static void OldToNew3()
{
    // NOTE(review): account name, domain and password are hard-coded in source - consider moving them to protected configuration.
    using (var imp = new Impersonator("pavel", "LANGMaster", "zvahov88_"))
    {
        var repaired = repairLingea(Machines.rootPath + @"RwDicts\Sources\LingeaOld\temp1\", imp);

        // Remove the previous output before writing the new files.
        var staleFiles = Directory.EnumerateFiles(Machines.rootPath + @"RwDicts\Sources\LingeaOld", "*.xml");
        foreach (var stale in staleFiles)
        {
            File.Delete(stale);
        }

        foreach (var dict in repaired)
        {
            // {1} = native language, {0} = course language.
            var target = string.Format(Machines.rootPath + @"RwDicts\Sources\LingeaOld\{1}_{0}.xml", dict.crsLang, dict.natLang);
            XmlUtils.ObjectToFile(target, dict);
        }
    }
}
/// <summary>
/// Converts this entry to a <c>DictEntryObj</c> of type <c>DictEntryType.lingeaOld</c>
/// while impersonating the identity behind <paramref name="imp"/>.
/// </summary>
/// <param name="imp">Impersonator whose <c>token</c> is used for the duration of the conversion.</param>
/// <returns>
/// A new object carrying the entry id, the first child element of <c>entry</c>, the sound
/// master, the head words and the course words. Course words are the '|'-separated values
/// of the <c>courseUsed</c> attribute, lower-cased with apostrophes removed, minus the
/// lower-cased head words and minus empty strings.
/// </returns>
public DictEntryObj toNew(Impersonator imp)
{
    using (WindowsIdentity.Impersonate(imp.token))
    {
        var firstChild = entry.Elements().First();

        var usedInCourse = entry.AttributeValue("courseUsed", "")
            .Split('|')
            .Select(w => w.ToLower().Replace("'", null));
        var loweredHeadWords = headWords.Select(h => h.ToLower());
        var courseOnlyWords = usedInCourse
            .Except(loweredHeadWords)
            .Where(w => !string.IsNullOrEmpty(w))
            .ToArray();

        return new DictEntryObj
        {
            entryId = entryId,
            entry = firstChild,
            soundMaster = soundMaster,
            type = DictEntryType.lingeaOld,
            headWords = headWords,
            courseWords = courseOnlyWords,
            // Disabled code from the original, kept for reference:
            //headWords.Select(hw => new CourseDictionary2.DictStem { type = CourseDictionary2.DictStemType.wordId, word = hw }).
            //  Concat(headWords.ToArray().SelectMany(w => CourseDictionary.RunStemming(okCrs, w)).Distinct().Select(w => new CourseDictionary2.DictStem { type = CourseDictionary2.DictStemType.stem, word = w })).
            //  Concat(entry.AttributeValue("courseUsed", "").Split('|').Where(w => !string.IsNullOrEmpty(w)).Select(w => new CourseDictionary2.DictStem { type = CourseDictionary2.DictStemType.courseUses, word = w })).
            //  ToArray()
        };
    }
}
/// <summary>
/// Merges the entry descriptions from <c>entriesInfo.xml</c> with the old Lingea entries
/// found under <paramref name="srcDir"/>, writes an <c>.info</c> file next to every
/// referenced sound file and makes sure each merged entry contains a filled
/// <c>sound</c> element.
/// </summary>
/// <param name="srcDir">Directory passed to <c>LingeaDictionary.getOldLingeaEntries</c>.</param>
/// <param name="imp">Impersonator forwarded to <c>toNew</c> for every converted entry.</param>
/// <returns>
/// One <c>DictObj</c> per (course language, native language) group. The groups are
/// processed in parallel, so the order of the returned items is not guaranteed.
/// </returns>
/// <exception cref="FileNotFoundException">
/// A sound file referenced by a <c>soundMaster</c> value does not exist.
/// </exception>
/// <exception cref="InvalidOperationException">
/// Entries sharing one <c>soundMaster</c> within a language have more than one distinct
/// lower-cased first head word (raised by <c>Single()</c>).
/// </exception>
/// <exception cref="KeyNotFoundException">
/// An entry id from <c>entriesInfo.xml</c> has no counterpart among the old entries.
/// </exception>
public static IEnumerable<DictObj> repairLingea(string srcDir, Impersonator imp)
{
    var allnew = XmlUtils.FileToObject<LingeaDictionary.DictEntry[]>(@"d:\LMCom\rew\Web4\RwDicts\Sources\LingeaOld\design\entriesInfo.xml");
    var allOld = LingeaDictionary.getOldLingeaEntries(srcDir)
        .SelectMany(kv => kv.Value)
        .SelectMany(kv => kv.Value.entryIdToEntry)
        .ToDictionary(kv => kv.Key, kv => kv.Value);

    // Exactly one head word per sound (per language).
    Dictionary<Langs, Dictionary<string, string>> soundMasterToHeadword = allnew
        .Where(de => de.soundMaster != null)
        .GroupBy(de => de.okCrs)
        .ToDictionary(
            g => g.Key,
            g => g.GroupBy(de => de.soundMaster).ToDictionary(
                sg => sg.Key,
                sg => sg.Select(d => d.headWords[0].ToLower()).Distinct().Single()));

    // Attach an .info file holding "<language>=<head word>" to every .mp3 file.
    foreach (var lKv in soundMasterToHeadword)
    {
        foreach (var hKv in lKv.Value)
        {
            // Build both file names from the same extension-less path, so that only the
            // extension differs (a string Replace on the full path would also rewrite any
            // other ".mp3" occurrence inside it).
            string basePath = @"d:\LMCom\rew\Web4\RwDicts\LingeaSound\" + hKv.Key.Replace('/', '\\');
            string right = basePath + ".mp3";
            if (!File.Exists(right))
            {
                throw new FileNotFoundException(
                    "Lingea sound file not found for head word '" + hKv.Value + "' (" + lKv.Key.ToString() + ").",
                    right);
            }
            File.WriteAllText(basePath + ".info", lKv.Key.ToString() + "=" + hKv.Value);
        }
    }

    // Create the new entries: merge entriesInfo.xml with LingeaOld.back (LingeaOld\temp1).
    // Add and fill the sound tag in each entry.
    List<DictObj> res = new List<DictObj>();
    Parallel.ForEach(
        allnew.Where(de => de.okCrs != Langs.no).GroupBy(dn => dn.okCrs),
        //new ParallelOptions() { MaxDegreeOfParallelism = 1 },
        crs =>
        {
            Parallel.ForEach(
                // Group by the language that is not the course language of the group.
                crs.GroupBy(dc => dc.crsLang == dc.okCrs ? dc.natLang : dc.crsLang),
                //new ParallelOptions() { MaxDegreeOfParallelism = 1 },
                nat =>
                {
                    foreach (var newDe in nat)
                    {
                        // Merge: take the XML of the old entry with the same id.
                        newDe.entry = allOld[newDe.entryId];

                        // Sound tag: reuse the first existing one, otherwise insert a new
                        // element after "pron", else after "morf", else at the end of "head".
                        var snd = newDe.entry.Descendants("sound").FirstOrDefault();
                        if (snd == null)
                        {
                            snd = new XElement("sound");
                            var head = newDe.entry.DescendantsAttr("class", "head").First();
                            var pron = head.Elements().FirstOrDefault(e => e.AttributeValue("class") == "pron");
                            if (pron != null)
                            {
                                pron.AddAfterSelf(snd);
                            }
                            else
                            {
                                var morf = head.Elements().FirstOrDefault(e => e.AttributeValue("class") == "morf");
                                if (morf != null)
                                {
                                    morf.AddAfterSelf(snd);
                                }
                                else
                                {
                                    head.Add(snd);
                                }
                            }
                        }

                        // Lower-cased value: "@<first head word>" when the entry has no sound
                        // master, otherwise the relative path of the sound master's .mp3 file.
                        snd.Value = (string.IsNullOrEmpty(newDe.soundMaster)
                            ? "@" + newDe.headWords[0].ToLower()
                            : @"RwDicts/LingeaSound/" + newDe.soundMaster + ".mp3").ToLower();
                    }

                    // The result list is shared by all parallel workers; the conversion of
                    // the group is performed inside the lock, as in the original code.
                    lock (res)
                    {
                        res.Add(new DictObj
                        {
                            crsLang = crs.Key,
                            natLang = nat.Key,
                            entries = nat.Select(n => n.toNew(imp)).DistinctBy(n => n.entry.ToString()).ToArray()
                        });
                    }
                });
        });
    return res;
}