Esempio n. 1
0
        /// <summary>
        /// Looks up every distinct, non-blank line of <paramref name="wordsFile"/> through
        /// <c>Search.GetSearch</c> and prints how many words had synsets, how many only had
        /// morph strings, how many had neither, and the average lookup time per 1000 words.
        /// </summary>
        /// <param name="wordNetDb">Value passed to <c>WordNetContext.GetContext</c> to open the database.</param>
        /// <param name="wordsFile">Path of a text file read line by line; each line is treated as one word.</param>
        /// <param name="notFoundFile">Currently unused by this method.</param>
        public static void TestSqlite(string wordNetDb, string wordsFile, string notFoundFile)
        {
            Console.WriteLine("Start");

            // Duplicate lines collapse into a single entry.
            var wordSet = new HashSet<string>(File.ReadAllLines(wordsFile));

            int found     = 0;
            int morphs    = 0;
            int notFound  = 0;
            int processed = 0;

            // Stopwatch is monotonic; DateTime.Now can jump when the system clock is adjusted.
            var timer = System.Diagnostics.Stopwatch.StartNew();

            using (var context = WordNetContext.GetContext(wordNetDb))
            {
                foreach (var word in wordSet)
                {
                    if (string.IsNullOrWhiteSpace(word))
                    {
                        continue;
                    }

                    var search = Search.GetSearch(word, context);
                    if (search.SynSets.Any())
                    {
                        found++;
                    }
                    else if (search.MorphStrings?.Any() ?? false)
                    {
                        morphs++;
                    }
                    else
                    {
                        notFound++;
                    }

                    if (processed == 0)
                    {
                        // Restart after the first lookup so that it is left out of the measurement
                        // (presumably to exclude one-time initialization cost).
                        timer.Restart();
                    }

                    processed++;
                }
            }

            timer.Stop();

            // Average only over the lookups that were actually timed: blank entries are skipped
            // and the first lookup is excluded above. Guard against dividing by zero.
            int timedLookups = Math.Max(processed - 1, 0);
            double msPerThousand = timedLookups > 0
                ? timer.Elapsed.TotalMilliseconds / timedLookups * 1000
                : 0;

            Console.WriteLine($"Total words {wordSet.Count} Found {found} Morphs {morphs} Not found {notFound}; Speed: {msPerThousand:F2} ms/(1000 words)");
        }
Esempio n. 2
0
        /// <summary>
        /// Fills the database behind <paramref name="context"/> from WNDB data files.
        /// For each part-of-speech symbol (n, v, a, r) the data records are converted into
        /// synsets, lemmas, synset/lemma relations and original writings, and then the
        /// exception entries are added. Changes are saved after each of these two phases.
        /// </summary>
        /// <param name="dictPack">Path to the WNDB data files.</param>
        /// <param name="context">Destination database context.</param>
        /// <exception cref="InvalidOperationException">
        /// A data record reports a part of speech that does not match the one being processed.
        /// </exception>
        public static void Convert(string dictPack, WordNetContext context)
        {
            WNDB wndb = new WNDB(dictPack);

            // Lower-cased words already linked to the synset currently being built.
            var synWords = new HashSet<string>();

            // Lower-cased word -> lemma and original spelling -> writing; shared across all parts of speech.
            var wordToLemma = new Dictionary<string, Lemma>();
            var words       = new Dictionary<string, Writing>();

            foreach (var symbol in new[] { "n", "v", "a", "r" })
            {
                var pos = PartOfSpeech.of(symbol);
                Console.WriteLine("Process Data of {0}", pos.name);

                foreach (var data in wndb.GetData(pos))
                {
                    // The adjective data includes both 'a' and 's' pos symbols.
                    if (data.pos != pos.symbol && !(data.pos == "s" && pos.symbol == "a"))
                    {
                        throw new InvalidOperationException("pos!=data.pos");
                    }

                    var synset = new SynSet {
                        Pos = data.pos
                    };
                    context.SynSets.Add(synset);
                    synWords.Clear();

                    foreach (var oword in data.origWords)
                    {
                        // Invariant lower-casing keeps the lemma keys independent of the current
                        // culture (e.g. Turkish 'I' -> 'ı' under culture-sensitive ToLower()).
                        string lcWord = oword.word.ToLowerInvariant();

                        // Create the lemma on first sight, otherwise record the additional pos symbol.
                        Lemma lemma;
                        if (!wordToLemma.TryGetValue(lcWord, out lemma))
                        {
                            lemma = new Lemma {
                                Value = lcWord, Poses = data.pos
                            };
                            wordToLemma.Add(lcWord, lemma);
                            context.Lemmas.Add(lemma);
                        }
                        else if (!lemma.Poses.Contains(data.pos))
                        {
                            lemma.Poses += data.pos;
                        }

                        // Add the SynSet <-> Lemma relation once per synset; Add returns false
                        // when the word was already linked.
                        if (synWords.Add(lcWord))
                        {
                            context.SynsetLemmas.Add(new SynsetLemma
                            {
                                SynSet = synset,
                                Lemma  = lemma
                            });
                        }

                        // Store the original spelling only if it differs from the lemma.
                        if (lcWord != oword.word)
                        {
                            Writing word;
                            if (!words.TryGetValue(oword.word, out word))
                            {
                                word = new Writing {
                                    Value = oword.word, Lemma = lemma
                                };
                                words.Add(oword.word, word);
                                context.Writings.Add(word);
                            }
                            else if (word.Lemma != lemma)
                            {
                                // The same spelling is already attached to a different lemma; report it only.
                                Console.WriteLine("Word mix: {0} {1} {2}", oword.word, lemma.Value, word.Lemma.Value);
                            }
                        }
                    }

                    synset.Definition = string.Join(";", data.definitions);
                    synset.Example    = string.Join(";", data.examples);
                }
                Console.WriteLine("Save changes");
                context.SaveChanges();

                // exceptions
                //TODO: remove morphes, ...

                Console.WriteLine("Process Exceptions of {0}", pos.name);

                foreach (var exwords in GetExceptions(wndb, pos))
                {
                    // exwords[0] is stored as the exception value; the remaining entries are its main forms.
                    for (int i = 1; i < exwords.Length; i++)
                    {
                        if (exwords[i] == exwords[0])
                        {
                            continue;
                        }

                        // Look the main form up as-is, then with hyphens replaced by spaces.
                        // Entries without a known lemma are skipped.
                        Lemma lemma;
                        if (wordToLemma.TryGetValue(exwords[i], out lemma) ||
                            (exwords[i].Contains('-') && wordToLemma.TryGetValue(exwords[i].Replace('-', ' '), out lemma)))
                        {
                            context.Excepts.Add(new Except {
                                Value = exwords[0], MainForm = exwords[i], Lemma = lemma
                            });
                        }
                    }
                }
                Console.WriteLine("Save changes");
                context.SaveChanges();
            }

            // Every iteration above already saves its changes; kept as a final flush.
            context.SaveChanges();
        }