예제 #1
0
        /// <summary>
        /// Convert method creates db file from wndb files
        /// dictpath - path to wndb data files
        /// context - dest db context
        /// </summary>
        public static void Convert(string dictPack, string jsonFile)
        {
            WNDB wndb  = new WNDB(dictPack);
            var  poses = (new[] { "n", "v", "a", "r" }).Select(s => PartOfSpeech.of(s));

            // Convert to  Dictionary
            // lemma -> { SynSetGroup: PosSymbol, Synsets = { synset: synonims, definitions, examples } }

            var dict = new Dictionary <string, List <ExpSynSetGroup> >();

            foreach (var pos in poses)
            {
                Console.WriteLine("Process Data of {0}", pos.name);

                foreach (var data in wndb.GetData(pos))
                {
                    //data.adj includes 'a' & 's' pos symbols
                    char posSymbol = pos.symbol.First();

                    bool singleWord = false;
                    if (data.origWords.Count() == 1)
                    {
                        var w = data.origWords.First().word;
                        singleWord = w == w.ToLower();
                    }

                    var synSet = new SynSet
                    {
                        // Skip synonims if where is a single lowercase word
                        Synonims    = (singleWord) ? null : data.origWords.Select(ow => ow.word).ToArray(),
                        Definition  = (data.definitions.Count() == 1) ? data.definitions.First() : null,
                        Definitions = (data.definitions.Count() > 1) ? data.definitions : null,
                        Example     = (data.examples?.Count() == 1) ? data.examples.First() : null,
                        Examples    = (data.examples?.Count() > 1) ? data.examples : null
                    };

                    foreach (var lemma in data.origWords.Select(ow => ow.word.ToLower()))
                    {
                        var synGrps = dict.GetValue(lemma);
                        if (synGrps != null)
                        {
                            var grp = synGrps.FirstOrDefault(g => g.PosSymbol == posSymbol);

                            if (grp == null)
                            {
                                synGrps.Add(new ExpSynSetGroup(posSymbol, synSet));
                            }
                            else
                            {
                                grp.Synsets.Add(synSet);
                            }
                        }
                        else
                        {
                            dict.Add(lemma, new List <ExpSynSetGroup> {
                                new ExpSynSetGroup(posSymbol, synSet)
                            });
                        }
                    }
                }
            }

            // exceptions
            //TODO: remove morphes, ...

            var excepts = new Dictionary <string, List <DictException> >();

            foreach (var pos in poses)
            {
                Console.WriteLine("Process Exceptions of {0}", pos.name);

                foreach (var exwords in wndb.GetExceptions(pos))
                {
                    var morph = Morph.GetBasicForm(exwords[0], pos);
                    for (int i = 1; i < exwords.Length; i++)
                    {
                        var baseForm = exwords[i];
                        if (baseForm == exwords[0] || baseForm == morph)
                        {
                            //Console.WriteLine($"Skip: {(exwords[0])} -> {baseForm}/{morph}");
                            continue;
                        }

                        List <ExpSynSetGroup> synGrps = dict.GetValue(baseForm);
                        if (synGrps == null && baseForm.Contains('-'))
                        {
                            baseForm = baseForm.Replace('-', ' ');
                            dict.TryGetValue(baseForm, out synGrps);
                        }

                        if (synGrps != null)
                        {
                            var posSymbols = string.Join("", synGrps.Select(sg => sg.PosSymbol));
                            var except     = new DictException {
                                BasicForm = baseForm, PosSymbols = posSymbols
                            };

                            List <DictException> baseForms;
                            if (excepts.TryGetValue(exwords[0], out baseForms))
                            {
                                if (!baseForms.Any(e => e.BasicForm == baseForm))
                                {
                                    baseForms.Add(except);
                                }
                            }
                            else
                            {
                                excepts.Add(exwords[0], new List <DictException> {
                                    except
                                });
                            }
                        }
                    }
                }
            }

            Console.WriteLine("Save changes");

            var storage = new ExpDictStorage
            {
                SynSets    = dict,
                Exceptions = excepts
            };

            var serializer = new JsonSerializer();

            serializer.NullValueHandling = NullValueHandling.Ignore;

            using (var stream = File.Open(jsonFile, FileMode.Create))
                using (var writer = new BsonWriter(stream))
                {
                    serializer.Serialize(writer, storage);
                }
        }
예제 #2
0
        /// <summary>
        /// Convert method creates db file from wndb files
        /// dictpath - path to wndb data files
        /// context - dest db context
        /// </summary>
        public static void Convert(string dictPack, WordNetContext context)
        {
            WNDB wndb     = new WNDB(dictPack);
            var  synWords = new List <string>();
            // int ind;

            var wordToLemma = new Dictionary <string, Lemma>();
            var words       = new Dictionary <string, Writing>();

            var poses = (new [] { "n", "v", "a", "r" }).Select(s => PartOfSpeech.of(s));

            foreach (var pos in poses)
            {
                Console.WriteLine("Process Data of {0}", pos.name);
                // ind = 0;

                foreach (var data in wndb.GetData(pos))
                {
                    if (data.pos != pos.symbol && !(data.pos == "s" && pos.symbol == "a")) //data.adj includes 'a' & 's' pos symbols
                    {
                        throw new Exception("pos!=data.pos");
                    }

                    var synset = new SynSet {
                        Pos = data.pos
                    };
                    context.SynSets.Add(synset);
                    synWords.Clear();

                    foreach (var oword in data.origWords)
                    {
                        Lemma  lemma;
                        string lcWord = oword.word.ToLower();

                        // add lemma
                        if (!wordToLemma.TryGetValue(lcWord, out lemma))
                        {
                            lemma = new Lemma {
                                Value = lcWord, Poses = data.pos
                            };
                            wordToLemma.Add(lcWord, lemma);
                            context.Lemmas.Add(lemma);
                        }
                        else if (!lemma.Poses.Contains(data.pos))
                        {
                            lemma.Poses += data.pos;
                        }

                        if (synWords.IndexOf(lcWord) < 0)
                        {
                            synWords.Add(lcWord);

                            // add SynSet <-> Lemma relation
                            context.SynsetLemmas.Add(new SynsetLemma
                            {
                                SynSet = synset,
                                Lemma  = lemma
                            });
                        }

                        // add original word if it differs from lemma
                        Writing word;
                        if (lcWord != oword.word)
                        {
                            if (!words.TryGetValue(oword.word, out word))
                            {
                                word = new Writing {
                                    Value = oword.word, Lemma = lemma
                                };
                                words.Add(oword.word, word);
                                context.Writings.Add(word);
                            }
                            else if (word.Lemma != lemma)
                            {
                                Console.WriteLine("Word mix: {0} {1} {2}", oword.word, lemma.Value, word.Lemma.Value);
                                continue;
                            }
                        }
                    }

                    synset.Definition = string.Join(";", data.definitions);
                    synset.Example    = string.Join(";", data.examples);

                    // ind++;
                    // if (ind % 1000 == 0)
                    //     ShowProgress(ind.ToString());
                }
                Console.WriteLine("Save changes");
                context.SaveChanges();

                // exceptions
                //TODO: remove morphes, ...

                Console.WriteLine("Process Exceptions of {0}", pos.name);
                // ind = 0;

                foreach (var exwords in GetExceptions(wndb, pos))
                {
                    for (int i = 1; i < exwords.Length; i++)
                    {
                        if (exwords[i] == exwords[0])
                        {
                            continue;
                        }

                        Lemma lemma;
                        if (wordToLemma.TryGetValue(exwords[i], out lemma) ||
                            (exwords[i].Contains('-') && wordToLemma.TryGetValue(exwords[i].Replace('-', ' '), out lemma)))
                        {
                            context.Excepts.Add(new Except {
                                Value = exwords[0], MainForm = exwords[i], Lemma = lemma
                            });
                        }
                        // else
                        // {
                        //     Console.WriteLine("Lemma not found {0}", exwords[i]);
                        //     context.Excepts.Add(new Except { Value = exwords[0], MainForm = exwords[i] });
                        // }
                    }

                    // ind++;
                    // if (ind % 1000 == 0)
                    //     ShowProgress(ind.ToString());
                }
                Console.WriteLine("Save changes");
                context.SaveChanges();
            }

            //Console.WriteLine("Save changes");
            context.SaveChanges();
        }