/// <summary>
/// Converts WordNet database (wndb) files into a single serialized dictionary file.
/// </summary>
/// <param name="dictPack">Location of the wndb data; handed unchanged to the <c>WNDB</c> reader.</param>
/// <param name="jsonFile">
/// Path of the output file. It is created (or overwritten) and, despite the parameter name,
/// the content is emitted through a <c>BsonWriter</c>.
/// </param>
public static void Convert(string dictPack, string jsonFile)
{
    WNDB wndb = new WNDB(dictPack);

    // Materialized once: the sequence is walked twice (data pass, exceptions pass)
    // and a deferred query would re-run PartOfSpeech.of for every pass.
    var poses = new[] { "n", "v", "a", "r" }.Select(s => PartOfSpeech.of(s)).ToArray();

    // Resulting shape:
    // lemma -> { SynSetGroup: PosSymbol, Synsets = { synset: synonyms, definitions, examples } }
    var dict = new Dictionary<string, List<ExpSynSetGroup>>();
    foreach (var pos in poses)
    {
        Console.WriteLine("Process Data of {0}", pos.name);

        // Adjective data carries both the 'a' and 's' symbols; groups are keyed by the
        // symbol of the part of speech being processed, not by the record's own symbol.
        char posSymbol = pos.symbol.First();

        foreach (var data in wndb.GetData(pos))
        {
            // Evaluate the word list once instead of re-enumerating origWords for every use.
            var words = data.origWords.Select(ow => ow.word).ToArray();

            // Invariant casing keeps the generated keys independent of the machine's culture.
            bool singleWord = words.Length == 1 && words[0] == words[0].ToLowerInvariant();

            int definitionCount = data.definitions.Count();
            int? exampleCount = data.examples?.Count();

            var synSet = new SynSet
            {
                // Synonyms are omitted when the synset is a single lowercase word:
                // that word is already the dictionary key.
                Synonims = singleWord ? null : words,
                // A lone definition/example goes to the scalar property, several go to the
                // collection property; unused properties stay null and are dropped on save.
                Definition = definitionCount == 1 ? data.definitions.First() : null,
                Definitions = definitionCount > 1 ? data.definitions : null,
                Example = exampleCount == 1 ? data.examples.First() : null,
                Examples = exampleCount > 1 ? data.examples : null
            };

            // Distinct: words of one synset that differ only by case collapse to the same
            // lemma, and the synset must be registered under that lemma just once.
            foreach (var lemma in words.Select(w => w.ToLowerInvariant()).Distinct())
            {
                // NOTE(review): GetValue is a project helper; it is used here as
                // "null when the key is absent" - confirm against its definition.
                var synGrps = dict.GetValue(lemma);
                if (synGrps == null)
                {
                    dict.Add(lemma, new List<ExpSynSetGroup> { new ExpSynSetGroup(posSymbol, synSet) });
                    continue;
                }

                var grp = synGrps.FirstOrDefault(g => g.PosSymbol == posSymbol);
                if (grp == null)
                {
                    synGrps.Add(new ExpSynSetGroup(posSymbol, synSet));
                }
                else
                {
                    grp.Synsets.Add(synSet);
                }
            }
        }
    }

    // Exceptions: irregular form -> list of base forms that exist in the dictionary.
    // TODO: remove morphes, ...
    var excepts = new Dictionary<string, List<DictException>>();
    foreach (var pos in poses)
    {
        Console.WriteLine("Process Exceptions of {0}", pos.name);
        foreach (var exwords in wndb.GetExceptions(pos))
        {
            // exwords[0] is the irregular form, the remaining entries are its base forms.
            var irregular = exwords[0];
            var morph = Morph.GetBasicForm(irregular, pos);

            for (int i = 1; i < exwords.Length; i++)
            {
                var baseForm = exwords[i];

                // Skip entries that add nothing: the base form equals the irregular form
                // or equals what Morph.GetBasicForm already returns for it.
                if (baseForm == irregular || baseForm == morph)
                {
                    continue;
                }

                List<ExpSynSetGroup> synGrps = dict.GetValue(baseForm);
                if (synGrps == null && baseForm.Contains('-'))
                {
                    // Retry with the hyphens replaced by spaces; the replaced spelling
                    // is the one that gets stored as the basic form.
                    baseForm = baseForm.Replace('-', ' ');
                    dict.TryGetValue(baseForm, out synGrps);
                }

                if (synGrps == null)
                {
                    // Base form is not a known lemma - nothing to point the exception at.
                    continue;
                }

                List<DictException> baseForms;
                if (!excepts.TryGetValue(irregular, out baseForms))
                {
                    baseForms = new List<DictException>();
                    excepts.Add(irregular, baseForms);
                }

                // The same base form may appear for several parts of speech; keep the first.
                if (!baseForms.Any(e => e.BasicForm == baseForm))
                {
                    baseForms.Add(new DictException
                    {
                        BasicForm = baseForm,
                        PosSymbols = string.Join("", synGrps.Select(sg => sg.PosSymbol))
                    });
                }
            }
        }
    }

    Console.WriteLine("Save changes");
    var storage = new ExpDictStorage { SynSets = dict, Exceptions = excepts };

    // Null-valued properties are skipped so the unused half of each
    // Definition/Definitions and Example/Examples pair is not written.
    var serializer = new JsonSerializer { NullValueHandling = NullValueHandling.Ignore };
    using (var stream = File.Open(jsonFile, FileMode.Create))
    using (var writer = new BsonWriter(stream))
    {
        serializer.Serialize(writer, storage);
    }
}
/// <summary>
/// Imports WordNet database (wndb) files into the given database context.
/// </summary>
/// <param name="dictPack">Location of the wndb data; handed unchanged to the <c>WNDB</c> reader.</param>
/// <param name="context">
/// Destination context. Entities are added to its sets and <c>SaveChanges</c> is called after the
/// data pass and after the exceptions pass of every part of speech, and once more at the end.
/// </param>
/// <exception cref="InvalidOperationException">
/// A data record reports a part of speech different from the one being processed
/// (records marked 's' are accepted while processing 'a').
/// </exception>
public static void Convert(string dictPack, WordNetContext context)
{
    WNDB wndb = new WNDB(dictPack);

    // Lemmas already linked to the synset currently being built (reset per synset).
    var synWords = new HashSet<string>();
    // Lowercase word -> Lemma entity, shared across all parts of speech.
    var wordToLemma = new Dictionary<string, Lemma>();
    // Original (non-lowercase) spelling -> Writing entity.
    var words = new Dictionary<string, Writing>();

    var poses = new[] { "n", "v", "a", "r" }.Select(s => PartOfSpeech.of(s));
    foreach (var pos in poses)
    {
        Console.WriteLine("Process Data of {0}", pos.name);
        foreach (var data in wndb.GetData(pos))
        {
            // Adjective data includes both the 'a' and 's' pos symbols.
            bool posMatches = data.pos == pos.symbol || (data.pos == "s" && pos.symbol == "a");
            if (!posMatches)
            {
                throw new InvalidOperationException("pos!=data.pos");
            }

            var synset = new SynSet { Pos = data.pos };
            context.SynSets.Add(synset);
            synWords.Clear();

            foreach (var oword in data.origWords)
            {
                // Invariant casing keeps lemma values independent of the machine's culture.
                string lcWord = oword.word.ToLowerInvariant();

                // Create the lemma on first sight, otherwise extend its pos list.
                Lemma lemma;
                if (!wordToLemma.TryGetValue(lcWord, out lemma))
                {
                    lemma = new Lemma { Value = lcWord, Poses = data.pos };
                    wordToLemma.Add(lcWord, lemma);
                    context.Lemmas.Add(lemma);
                }
                else if (!lemma.Poses.Contains(data.pos))
                {
                    lemma.Poses += data.pos;
                }

                // Link synset and lemma once, even when several spellings share the lemma.
                if (synWords.Add(lcWord))
                {
                    context.SynsetLemmas.Add(new SynsetLemma { SynSet = synset, Lemma = lemma });
                }

                // The original spelling is stored only when it differs from the lemma.
                if (lcWord == oword.word)
                {
                    continue;
                }

                Writing word;
                if (!words.TryGetValue(oword.word, out word))
                {
                    word = new Writing { Value = oword.word, Lemma = lemma };
                    words.Add(oword.word, word);
                    context.Writings.Add(word);
                }
                else if (word.Lemma != lemma)
                {
                    // Sanity check: one spelling is expected to map to a single lemma.
                    Console.WriteLine("Word mix: {0} {1} {2}", oword.word, lemma.Value, word.Lemma.Value);
                }
            }

            synset.Definition = string.Join(";", data.definitions);
            // string.Join throws on a null sequence; a synset without an examples
            // collection is stored like one with an empty collection.
            // NOTE(review): assumes examples may be null for some records - confirm against WNDB.
            synset.Example = data.examples != null ? string.Join(";", data.examples) : string.Empty;
        }

        Console.WriteLine("Save changes");
        context.SaveChanges();

        // Exceptions
        // TODO: remove morphes, ...
        Console.WriteLine("Process Exceptions of {0}", pos.name);
        foreach (var exwords in GetExceptions(wndb, pos))
        {
            // exwords[0] is the irregular form, the remaining entries are its main forms.
            for (int i = 1; i < exwords.Length; i++)
            {
                var mainForm = exwords[i];
                if (mainForm == exwords[0])
                {
                    continue;
                }

                // Look the main form up as written, then with hyphens replaced by spaces.
                // Main forms without a known lemma are skipped on purpose.
                Lemma lemma;
                bool found = wordToLemma.TryGetValue(mainForm, out lemma)
                    || (mainForm.Contains('-') && wordToLemma.TryGetValue(mainForm.Replace('-', ' '), out lemma));
                if (found)
                {
                    // MainForm keeps the spelling from the exceptions data, hyphens included.
                    context.Excepts.Add(new Except { Value = exwords[0], MainForm = mainForm, Lemma = lemma });
                }
            }
        }

        Console.WriteLine("Save changes");
        context.SaveChanges();
    }

    context.SaveChanges();
}