/// <summary>
/// Renders a token's morphology as "ClassName(coord=state coord=state ...)".
/// </summary>
/// <param name="token">Token whose part of speech and tags are rendered.</param>
/// <param name="gren">Grammar engine used to resolve class, coordinate and state names.</param>
/// <returns>The part-of-speech name followed by the space-separated tags in parentheses.</returns>
private string GetTokenMorph(WordData token, SolarixGrammarEngineNET.GrammarEngine2 gren)
        {
            string class_name = gren.GetClassName(token.part_of_speech);

            // Each tag is a (coordinate id, state id) pair rendered as "coord=state".
            string[] rendered_tags = token.GetTags()
                                          .Select(tag => gren.GetCoordName(tag.Item1) + "=" + gren.GetCoordStateName(tag.Item1, tag.Item2))
                                          .ToArray();

            return string.Format("{0}({1})", class_name, string.Join(" ", rendered_tags));
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Checks whether version <paramref name="iver"/> of a projected word satisfies this matcher.
        /// </summary>
        /// <param name="proj">Syntax tree node holding the word and its versions.</param>
        /// <param name="iver">Index of the version to test.</param>
        /// <param name="gren">Grammar engine used to resolve the entry's class.</param>
        /// <returns>true when every constraint configured on this matcher is satisfied.</returns>
        public bool Match(SolarixGrammarEngineNET.SyntaxTreeNode proj, int iver, SolarixGrammarEngineNET.GrammarEngine2 gren)
        {
            // A lexeme constraint decides the outcome by itself (case-insensitive comparison).
            if (lexeme != null)
            {
                return proj.GetWord().Equals(lexeme, StringComparison.InvariantCultureIgnoreCase);
            }

            // A lemma constraint also decides the outcome by itself.
            if (id_lemma != null)
            {
                return id_lemma.Contains(proj.GetVersionEntryID(iver));
            }

            // Part-of-speech constraint: an unknown entry (-1) never matches.
            if (pos != null)
            {
                int entry_key = proj.GetVersionEntryID(iver);
                if (entry_key == -1 || !pos.Contains(gren.GetEntryClass(entry_key)))
                {
                    return false;
                }
            }

            // Every required coordinate pair must be present in this version.
            if (pairs != null)
            {
                foreach (SolarixGrammarEngineNET.CoordPair required in pairs)
                {
                    if (!proj.VersionContains(iver, required))
                    {
                        return false;
                    }
                }
            }

            return true;
        }
 /// <summary>
 /// Loads the grammar dictionary into <c>gren</c> the first time it is needed.
 /// The dictionary path is read from the "dictionary_path" application setting.
 /// </summary>
 private void LoadDict()
 {
     if (gren == null)
     {
         lock (gren_lock)
         {
             // Re-check under the lock: another thread may have finished loading
             // while this one was waiting, and the dictionary must be loaded only once.
             if (gren == null)
             {
                 string dict = System.Configuration.ConfigurationSettings.AppSettings["dictionary_path"];
                 Console.WriteLine("Loading dictionary {0}", dict);

                 // Load into a local first so that the field never refers to an
                 // engine whose dictionary is still being loaded.
                 SolarixGrammarEngineNET.GrammarEngine2 engine = new SolarixGrammarEngineNET.GrammarEngine2();
                 engine.Load(dict, true);
                 gren = engine;
             }
         }
     }
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Checks whether an already disambiguated token satisfies this matcher.
        /// </summary>
        /// <param name="token">Token to test.</param>
        /// <param name="gren">Grammar engine (not used by this overload).</param>
        /// <returns>true when every constraint configured on this matcher is satisfied.</returns>
        public bool Match(WordData token, SolarixGrammarEngineNET.GrammarEngine2 gren)
        {
            // A lexeme constraint decides the outcome by itself (case-insensitive comparison).
            if (lexeme != null)
            {
                return token.GetWord().Equals(lexeme, StringComparison.InvariantCultureIgnoreCase);
            }

            // A lemma constraint also decides the outcome by itself.
            if (id_lemma != null)
            {
                return id_lemma.Contains(token.GetEntryID());
            }

            if (pos != null && !pos.Contains(token.GetClassID()))
            {
                return false;
            }

            // Every required (coordinate, state) pair must be among the token's tags;
            // an empty pair list imposes no restriction.
            if (pairs != null)
            {
                foreach (SolarixGrammarEngineNET.CoordPair required in pairs)
                {
                    if (!token.ContainsTag(required.CoordID, required.StateID))
                    {
                        return false;
                    }
                }
            }

            return true;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Returns the id of the first matcher that accepts the token.
        /// </summary>
        /// <param name="token">Token to classify.</param>
        /// <param name="gren">Grammar engine passed to the matchers and used to build the error text.</param>
        /// <returns>Id of the first matching <c>TagMatcher</c>.</returns>
        /// <exception cref="ApplicationException">No matcher accepts the token.</exception>
        public int MatchTags(WordData token, SolarixGrammarEngineNET.GrammarEngine2 gren)
        {
            foreach (TagMatcher candidate in matchers)
            {
                if (candidate.Match(token, gren))
                {
                    return candidate.GetId();
                }
            }

            // Nothing matched: describe the token's part of speech and tags in the exception.
            int      class_id   = gren.GetEntryClass(token.GetEntryID());
            string   class_name = gren.GetClassName(class_id);
            string[] tag_texts  = token.GetTags()
                                       .Select(tag => gren.GetCoordName(tag.Item1) + "=" + gren.GetCoordStateName(tag.Item1, tag.Item2))
                                       .ToArray();
            string   tag_list   = string.Join(" ", tag_texts);

            throw new ApplicationException(string.Format("Can not find tag for {0} {{ {1} {2} }}", token.GetWord(), class_name, tag_list));
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Returns the id of the first matcher that accepts version <paramref name="iproj"/> of the node.
        /// </summary>
        /// <param name="proj">Syntax tree node holding the word and its versions.</param>
        /// <param name="iproj">Index of the version to test.</param>
        /// <param name="gren">Grammar engine passed to the matchers.</param>
        /// <returns>Id of the first matching <c>TagMatcher</c>, or -1 when none matches.</returns>
        public int MatchTags(SolarixGrammarEngineNET.SyntaxTreeNode proj, int iproj, SolarixGrammarEngineNET.GrammarEngine2 gren)
        {
            foreach (TagMatcher candidate in matchers)
            {
                if (!candidate.Match(proj, iproj, gren))
                {
                    continue;
                }

#if DEBUG
                // Debug-only: build readable descriptions of the matcher and of the
                // matched version's coordinate pairs. The values are not used further in this method.
                string matcher = candidate.ToString();
                string form    = "";

                int pair_count = SolarixGrammarEngineNET.GrammarEngine.sol_GetNodeVerPairsCount(proj.hNode, iproj);
                for (int k = 0; k < pair_count; k++)
                {
                    int coord = SolarixGrammarEngineNET.GrammarEngine.sol_GetNodeVerPairCoord(proj.hNode, iproj, k);
                    int state = SolarixGrammarEngineNET.GrammarEngine.sol_GetNodeVerPairState(proj.hNode, iproj, k);

                    string coord_text = gren.GetCoordName(coord);

                    // A coordinate with no named states is shown with its numeric state.
                    string state_text = gren.CountCoordStates(coord) == 0
                                        ? state.ToString()
                                        : gren.GetCoordStateName(coord, state);

                    form += string.Format(" {0}:{1}", coord_text, state_text);
                }
#endif

                return candidate.GetId();
            }

            return -1;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Rebuilds the matcher list and its lookup tables from a newline-separated definition text.
        /// </summary>
        /// <param name="lines">Matcher definitions, one per line; blank lines are skipped.</param>
        /// <param name="gren">Grammar engine handed to every <c>TagMatcher</c> constructor.</param>
        public void Load(string lines, SolarixGrammarEngineNET.GrammarEngine2 gren)
        {
            matchers   = new List <TagMatcher>();
            id2matcher = new Dictionary <int, TagMatcher>();
            id2index   = new Dictionary <int, int>();
            index2id   = new Dictionary <int, int>();

            foreach (string raw_line in lines.Split('\n'))
            {
                if (string.IsNullOrEmpty(raw_line.Trim()))
                {
                    continue;
                }

                TagMatcher matcher = new TagMatcher(raw_line, gren);

                // The matcher's position in the list is recorded in both directions.
                int matcher_index = matchers.Count;
                matchers.Add(matcher);

                int matcher_id = matcher.GetId();
                id2matcher.Add(matcher_id, matcher);
                id2index.Add(matcher_id, matcher_index);
                index2id.Add(matcher_index, matcher_id);
            }
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Parses one matcher definition line into lexeme, part-of-speech, word-entry and
        /// coordinate-pair constraints. Tokens are separated by spaces and take one of these forms:
        /// <list type="bullet">
        /// <item>'word' or "word" - literal lexeme;</item>
        /// <item>NAME - a part of speech if the engine knows it, otherwise a bistable coordinate
        /// (state 1, or state 0 when prefixed with '~');</item>
        /// <item>CLASS:entry{...} - a word entry of the given part of speech;</item>
        /// <item>COORD:STATE - a coordinate pair (STATE is 0 or 1 for a bistable coordinate).</item>
        /// </list>
        /// </summary>
        /// <param name="line">Definition text; surrounding whitespace is ignored.</param>
        /// <param name="gren">Grammar engine used to resolve names to ids.</param>
        /// <exception cref="ApplicationException">
        /// A coordinate, state or word entry cannot be found, a bare coordinate is not bistable,
        /// or a bistable state is not 0 or 1.
        /// </exception>
        public TagMatcher(string line, SolarixGrammarEngineNET.GrammarEngine2 gren)
        {
            source = line.Trim();
            id     = ++seq_id;

            string[] toks = source.Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
            foreach (string tok in toks)
            {
                // A quoted token needs both an opening and a closing quote. Without the length
                // check a lone quote character would pass both tests and Substring(1, -1) would throw.
                bool single_quoted = tok.Length >= 2 && tok[0] == '\'' && tok[tok.Length - 1] == '\'';
                bool double_quoted = tok.Length >= 2 && tok[0] == '"' && tok[tok.Length - 1] == '"';

                if (single_quoted || double_quoted)
                {
                    lexeme = tok.Substring(1, tok.Length - 2);
                }
                else
                {
                    if (tok.IndexOf(':') == -1)
                    {
                        string pos_name = tok.Trim();
                        int    id_class = gren.FindPartOfSpeech(pos_name);
                        if (id_class != -1)
                        {
                            // Bare part-of-speech name.
                            if (pos == null)
                            {
                                pos       = new List <int>();
                                pos_names = new List <string>();
                            }

                            pos.Add(id_class);
                            pos_names.Add(pos_name);
                        }
                        else
                        {
                            // Bare coordinate name: state 1 by default, state 0 when prefixed with '~'.
                            int state = 1;

                            string t = tok.Trim();

                            if (t.StartsWith("~"))
                            {
                                t     = t.Substring(1);
                                state = 0;
                            }

                            int id_coord = gren.FindCoord(t);
                            if (id_coord == -1)
                            {
                                throw new ApplicationException(string.Format("Can not find coord [{0}]", tok));
                            }

                            // Only coordinates without named states may be written without ":state".
                            if (gren.CountCoordStates(id_coord) != 0)
                            {
                                throw new ApplicationException(string.Format("[{0}] is not bistable", tok));
                            }

                            if (pairs == null)
                            {
                                pairs = new List <SolarixGrammarEngineNET.CoordPair>();
                            }

                            SolarixGrammarEngineNET.CoordPair p;
                            p.CoordID = id_coord;
                            p.StateID = state;
                            pairs.Add(p);
                        }
                    }
                    else
                    {
                        string[] t3         = tok.Split(':');
                        string   coord_name = t3[0].Trim();
                        string   state_name = t3[1].Trim();

                        int id_class = gren.FindPartOfSpeech(coord_name);
                        if (id_class != -1)
                        {
                            // CLASS:entry{...} - a word entry constraint.
                            // NOTE(review): these lists are re-created for every entry token, so only
                            // the last entry on a line is kept - confirm that this is intended.
                            id_lemma    = new List <int>();
                            entry_names = new List <string>();
                            pos_names   = new List <string>();
                            pos_names.Add(coord_name);

                            // The entry name ends at '{'. When the brace is absent the whole text is
                            // taken as the name instead of letting Substring(0, -1) throw.
                            int    ip    = state_name.IndexOf('{');
                            string ename = ip == -1 ? state_name : state_name.Substring(0, ip);
                            int    ekey  = gren.FindEntry(ename, id_class);

                            if (ekey == -1)
                            {
                                throw new ApplicationException(string.Format("Can not find word entry {0}:{1}", coord_name, ename));
                            }

                            id_lemma.Add(ekey);
                            entry_names.Add(ename);

                            if (entry_pos == null)
                            {
                                entry_pos = new List <int>();
                            }

                            if (!entry_pos.Contains(id_class))
                            {
                                entry_pos.Add(id_class);
                            }
                        }
                        else
                        {
                            // COORD:STATE - a coordinate pair constraint.
                            int id_coord = gren.FindCoord(coord_name);
                            if (id_coord == -1)
                            {
                                throw new ApplicationException(string.Format("Can not find coord [{0}]", coord_name));
                            }

                            int id_state = -1;
                            if (gren.CountCoordStates(id_coord) == 0)
                            {
                                // Bistable coordinate: the state must be the number 0 or 1. Text that is
                                // not a number is reported with the same error as an out-of-range value.
                                if (!int.TryParse(state_name, out id_state) || id_state < 0 || id_state > 1)
                                {
                                    throw new ApplicationException(string.Format("Invalid state name {0} for bistable coord {1}", state_name, coord_name));
                                }
                            }
                            else
                            {
                                id_state = gren.FindState(id_coord, state_name);
                                if (id_state == -1)
                                {
                                    throw new ApplicationException(string.Format("Can not find state [{0}:{1}]", coord_name, state_name));
                                }
                            }

                            if (pairs == null)
                            {
                                pairs = new List <SolarixGrammarEngineNET.CoordPair>();
                            }

                            SolarixGrammarEngineNET.CoordPair p;
                            p.CoordID = id_coord;
                            p.StateID = id_state;
                            pairs.Add(p);
                        }
                    }
                }
            }
        }
        /// <summary>
        /// Lazily reads sentences from every binary corpus segment matching <c>corpus_path</c>
        /// (its directory part is searched using its file-name part as the pattern).
        /// Each sample is stored as three consecutive syntax trees; a sentence is produced only
        /// when the first two trees contain the same number of tokens.
        /// </summary>
        /// <param name="gren">Grammar engine that owns the corpus storage and resolves entry data.</param>
        /// <returns>A lazily evaluated sequence of sentences.</returns>
        /// <exception cref="ApplicationException">A corpus segment cannot be opened.</exception>
        public IEnumerable <SentenceData> Read(SolarixGrammarEngineNET.GrammarEngine2 gren)
        {
            string[] bin_corpora = System.IO.Directory.GetFiles(System.IO.Path.GetDirectoryName(corpus_path), System.IO.Path.GetFileName(corpus_path));

            Console.WriteLine("There are {0} binary corpus segments", bin_corpora.Length);

            foreach (string path1 in bin_corpora)
            {
                Console.WriteLine("Reading corpus {0}...", path1);

                IntPtr hCorpus = SolarixGrammarEngineNET.GrammarEngine.sol_OpenCorpusStorage8(gren.GetEngineHandle(), path1, false);
                if (hCorpus == IntPtr.Zero)
                {
                    // Report the segment that failed, not the search pattern.
                    throw new ApplicationException(string.Format("Can not open corpus {0}", path1));
                }

                // try/finally makes sure the storage is closed even when the consumer stops
                // enumerating early or an exception is thrown while a sample is processed.
                try
                {
                    while (true)
                    {
                        IntPtr hSample1 = IntPtr.Zero;
                        IntPtr hSample2 = IntPtr.Zero;
                        IntPtr hSample3 = IntPtr.Zero;

                        try
                        {
                            hSample1 = SolarixGrammarEngineNET.GrammarEngine.sol_LoadSyntaxTree(gren.GetEngineHandle(), hCorpus);
                            if (hSample1 == IntPtr.Zero)
                            {
                                break;
                            }

                            hSample2 = SolarixGrammarEngineNET.GrammarEngine.sol_LoadSyntaxTree(gren.GetEngineHandle(), hCorpus);
                            hSample3 = SolarixGrammarEngineNET.GrammarEngine.sol_LoadSyntaxTree(gren.GetEngineHandle(), hCorpus);
                            if (hSample2 == IntPtr.Zero || hSample3 == IntPtr.Zero)
                            {
                                // Incomplete trailing sample: stop instead of passing a null handle on.
                                break;
                            }

                            string sample       = SolarixGrammarEngineNET.GrammarEngine.sol_GetSentenceW(hSample1);
                            var    morphology   = new SolarixGrammarEngineNET.AnalysisResults(gren, SolarixGrammarEngineNET.GrammarEngine.sol_GetTreeHandle(hSample1), false);
                            var    tokenization = new SolarixGrammarEngineNET.AnalysisResults(gren, SolarixGrammarEngineNET.GrammarEngine.sol_GetTreeHandle(hSample2), false);
                            // NOTE(review): the third tree is wrapped but not used below; kept as in the original.
                            var    syntax_tree  = new SolarixGrammarEngineNET.AnalysisResults(gren, SolarixGrammarEngineNET.GrammarEngine.sol_GetTreeHandle(hSample3), false);

                            if (morphology.Count == tokenization.Count)
                            {
                                SentenceData sent = new SentenceData(sample);

                                for (int i = 0; i < morphology.Count; ++i)
                                {
                                    WordData word_data = new WordData();

                                    word_data.word           = morphology[i].GetWord();
                                    word_data.word_index     = morphology[i].GetWordPosition();
                                    word_data.entry_id       = morphology[i].GetEntryID();
                                    word_data.part_of_speech = gren.GetEntryClass(word_data.entry_id);
                                    word_data.all_projs      = tokenization[i];

                                    foreach (var tag in morphology[i].GetPairs())
                                    {
                                        word_data.tags.Add(Tuple.Create(tag.CoordID, tag.StateID));
                                    }

                                    // Placeholder entries ("???" and "NUMBER_") take the word form as lemma.
                                    // The check is made on the raw entry name: comparing the lower-cased
                                    // lemma with "NUMBER_" could never succeed.
                                    string entry_name = gren.GetEntryName(word_data.entry_id);
                                    if (entry_name == "???" || string.Equals(entry_name, "NUMBER_", StringComparison.OrdinalIgnoreCase))
                                    {
                                        word_data.lemma = word_data.word;
                                    }
                                    else
                                    {
                                        word_data.lemma = entry_name.ToLower();
                                    }

                                    sent.AddWord(word_data);
                                }

                                yield return(sent);
                            }
                        }
                        finally
                        {
                            // The trees are released only after the consumer has moved past the
                            // yielded sentence, as in the original ordering.
                            if (hSample1 != IntPtr.Zero)
                            {
                                SolarixGrammarEngineNET.GrammarEngine.sol_FreeSyntaxTree(hSample1);
                            }

                            if (hSample2 != IntPtr.Zero)
                            {
                                SolarixGrammarEngineNET.GrammarEngine.sol_FreeSyntaxTree(hSample2);
                            }

                            if (hSample3 != IntPtr.Zero)
                            {
                                SolarixGrammarEngineNET.GrammarEngine.sol_FreeSyntaxTree(hSample3);
                            }
                        }
                    }
                }
                finally
                {
                    SolarixGrammarEngineNET.GrammarEngine.sol_CloseCorpusStorage(gren.GetEngineHandle(), hCorpus);
                }
            }
        }
        public IEnumerable <SentenceData> Read(SolarixGrammarEngineNET.GrammarEngine2 gren)
        {
            int beth_class     = gren.FindPartOfSpeech("BETH");
            int entry_begin_id = gren.FindEntry("BEGIN", beth_class);
            int entry_end_id   = gren.FindEntry("END", beth_class);

            string[] files = System.IO.Directory.GetFiles(System.IO.Path.GetDirectoryName(corpus_path), System.IO.Path.GetFileName(corpus_path));

            foreach (string path1 in files)
            {
                using (System.IO.StreamReader rdr = new System.IO.StreamReader(path1))
                {
                    List <ConlluItem> items = new List <ConlluItem>();
                    while (!rdr.EndOfStream)
                    {
                        string line = rdr.ReadLine();
                        if (line == null)
                        {
                            break;
                        }

                        if (string.IsNullOrEmpty(line))
                        {
                            if (items.Count > 0)
                            {
                                string sample = string.Join(" ", items.Select(z => z.word));

                                SentenceData sent = new SentenceData(sample);

                                bool all_ok = true; // если не сможем выполнить конверию морфологической разметки, то
                                                    // выставим этот флаг в false и не будем возвращать объект SentenceData.
                                int word_index = 0;

                                SolarixGrammarEngineNET.AnalysisResults tokenization = gren.AnalyzeMorphology(sample, -1, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_TOKENIZE_ONLY, 0);

                                if (tokenization.Count - 2 == items.Count)
                                {
                                    // Добавить <BEGIN>
                                    WordData wd_begin = new WordData();
                                    wd_begin.word_index     = 0;
                                    wd_begin.word           = "BEGIN";
                                    wd_begin.entry_id       = entry_begin_id;
                                    wd_begin.part_of_speech = beth_class;
                                    wd_begin.all_projs      = tokenization[word_index];
                                    sent.AddWord(wd_begin);

                                    word_index++;

                                    foreach (var item in items)
                                    {
                                        var projs = SolarixGrammarEngineNET.GrammarEngine.sol_ProjectWord(gren.GetEngineHandle(), item.word, 1);

                                        int selected_proj = 0;

                                        int nproj = SolarixGrammarEngineNET.GrammarEngine.sol_CountProjections(projs);
                                        if (nproj > 1)
                                        {
                                            List <Tuple <int, int> > proj_scores = new List <Tuple <int, int> >();

                                            int required_class = -1;
                                            List <Tuple <int, int> > required_tags = new List <Tuple <int, int> >();

                                            if (item.ud_class == "NOUN" || item.ud_class == "PROPN")
                                            {
                                                required_class = SolarixGrammarEngineNET.GrammarEngineAPI.NOUN_ru;
                                            }
                                            else if (item.ud_class == "PUNCT")
                                            {
                                                required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PUNCTUATION_class;
                                            }
                                            else if (item.ud_class == "ADJ" || item.ud_class == "DET")
                                            {
                                                required_class = SolarixGrammarEngineNET.GrammarEngineAPI.ADJ_ru;
                                            }
                                            else if (item.ud_class == "AUX" || item.ud_class == "VERB")
                                            {
                                                if (item.Contains("VerbForm ", "Inf"))
                                                {
                                                    required_class = SolarixGrammarEngineNET.GrammarEngineAPI.INFINITIVE_ru;
                                                }
                                                else if (item.Contains("VerbForm", "Trans"))
                                                {
                                                    required_class = SolarixGrammarEngineNET.GrammarEngineAPI.GERUND_2_ru;
                                                }
                                                else
                                                {
                                                    required_class = SolarixGrammarEngineNET.GrammarEngineAPI.VERB_ru;
                                                }
                                            }
                                            else if (item.ud_class == "ADP")
                                            {
                                                required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PREPOS_ru;
                                            }
                                            else if (item.ud_class == "ADV")
                                            {
                                                required_class = SolarixGrammarEngineNET.GrammarEngineAPI.ADVERB_ru;
                                            }
                                            else if (item.ud_class == "PRON")
                                            {
                                                required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PRONOUN_ru;
                                            }
                                            else if (item.ud_class == "PART")
                                            {
                                                required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PARTICLE_ru;
                                            }
                                            else if (item.ud_class == "NUM")
                                            {
                                                required_class = SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_CLASS_ru;
                                            }
                                            else if (item.ud_class == "CONJ" || item.ud_class == "SCONJ")
                                            {
                                                required_class = SolarixGrammarEngineNET.GrammarEngineAPI.CONJ_ru;
                                            }
                                            else if (item.ud_class == "INTJ")
                                            {
                                                required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PARTICIPLE_ru;
                                            }
                                            else
                                            {
                                                all_ok = false;
                                                break;
                                            }


                                            foreach (string tv in item.ud_tags)
                                            {
                                                int coord_id = -1;
                                                int state_id = -1;

                                                string[] tv2 = tv.Split('=');
                                                string   tag = tv2[0].Trim();
                                                string   val = tv2[1].Trim();

                                                if (tag == "Animacy")
                                                {
                                                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.FORM_ru;
                                                    if (val == "Inan")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.INANIMATIVE_FORM_ru;
                                                    }
                                                    else
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.ANIMATIVE_FORM_ru;
                                                    }
                                                }


                                                if (tag == "Case")
                                                {
                                                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.CASE_ru;
                                                    if (val == "Nom")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.NOMINATIVE_CASE_ru;
                                                    }
                                                    else if (val == "Acc")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.NOMINATIVE_CASE_ru;
                                                    }
                                                    else if (val == "Gen")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.GENITIVE_CASE_ru;
                                                    }
                                                    else if (val == "Ins")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.INSTRUMENTAL_CASE_ru;
                                                    }
                                                    else if (val == "Loc")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PREPOSITIVE_CASE_ru;
                                                    }
                                                    else if (val == "Dat")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.DATIVE_CASE_ru;
                                                    }
                                                    else if (val == "Par")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PARTITIVE_CASE_ru;
                                                    }
                                                    else if (val == "Voc")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.VOCATIVE_CASE_ru;
                                                    }
                                                    else
                                                    {
                                                        all_ok = false;
                                                        break;
                                                    }
                                                }

                                                if (tag == "Gender")
                                                {
                                                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.GENDER_ru;

                                                    if (val == "Fem")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.FEMININE_GENDER_ru;
                                                    }
                                                    else if (val == "Masc")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.MASCULINE_GENDER_ru;
                                                    }
                                                    else if (val == "Neut")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.NEUTRAL_GENDER_ru;
                                                    }
                                                    else
                                                    {
                                                        all_ok = false;
                                                        break;
                                                    }
                                                }

                                                // Number = Sing
                                                if (tag == "Number")
                                                {
                                                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_ru;
                                                    if (val == "Sing")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.SINGULAR_NUMBER_ru;
                                                    }
                                                    else
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PLURAL_NUMBER_ru;
                                                    }
                                                }

                                                // Degree=Pos
                                                if (tag == "Degree")
                                                {
                                                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.COMPAR_FORM_ru;

                                                    if (val == "Pos")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.ATTRIBUTIVE_FORM_ru;
                                                    }
                                                    else if (val == "Sup")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.SUPERLATIVE_FORM_ru;
                                                    }
                                                    else if (val == "Cmp")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.COMPARATIVE_FORM_ru;
                                                    }
                                                }

                                                if (tag == "Variant" && val == "Brev")
                                                {
                                                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.SHORTNESS_ru;
                                                    state_id = 1;
                                                }


                                                if (tag == "Person")
                                                {
                                                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_ru;
                                                    if (val == "1")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_1_ru;
                                                    }
                                                    else if (val == "2")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_2_ru;
                                                    }
                                                    else if (val == "3")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_3_ru;
                                                    }
                                                }

                                                if (tag == "Tense")
                                                {
                                                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.TENSE_ru;
                                                    if (val == "Pres")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PRESENT_ru;
                                                    }
                                                    else if (val == "Past")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PAST_ru;
                                                    }
                                                    else if (val == "Fut")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.FUTURE_ru;
                                                    }
                                                }

                                                if (tag == "Mood")
                                                {
                                                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.VERB_FORM_ru;
                                                    if (val == "Ind")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.VB_INF_ru;
                                                    }
                                                    else if (val == "Imp")
                                                    {
                                                        state_id = SolarixGrammarEngineNET.GrammarEngineAPI.VB_ORDER_ru;
                                                    }
                                                }

                                                if (coord_id != -1 && state_id != -1)
                                                {
                                                    required_tags.Add(Tuple.Create(coord_id, state_id));
                                                }
                                            }

                                            for (int iproj = 0; iproj < nproj; ++iproj)
                                            {
                                                int proj_score = 0;
                                                int entry_id   = SolarixGrammarEngineNET.GrammarEngine.sol_GetIEntry(projs, iproj);
                                                int class_id   = gren.GetEntryClass(entry_id);
                                                if (class_id == required_class)
                                                {
                                                    proj_score++;
                                                }

                                                foreach (var required_tag in required_tags)
                                                {
                                                    int state_id = SolarixGrammarEngineNET.GrammarEngine.sol_GetProjCoordState(gren.GetEngineHandle(), projs, iproj, required_tag.Item1);

                                                    if (state_id == required_tag.Item2)
                                                    {
                                                        proj_score++;
                                                    }
                                                }
                                                proj_scores.Add(Tuple.Create(iproj, proj_score));
                                            }

                                            selected_proj = proj_scores.OrderByDescending(z => z.Item2).Select(z => z.Item1).First();
                                        }



                                        // теперь выбранную проекцию слова selected_proj векторизуем
                                        WordData word_data = new WordData();
                                        word_data.word           = item.word;
                                        word_data.word_index     = word_index;
                                        word_data.all_projs      = tokenization[word_index];
                                        word_data.entry_id       = SolarixGrammarEngineNET.GrammarEngine.sol_GetIEntry(projs, selected_proj);
                                        word_data.part_of_speech = gren.GetEntryClass(word_data.entry_id);

                                        // Лемму определяем в несколько шагов.
                                        // Прежде всего пытаемся взять наименование словарной статьи в лексиконе Solarix.
                                        word_data.lemma = gren.GetEntryName(word_data.entry_id).ToLower();

                                        // Если получили наменование статьи "???", то значит это несловарный элемент и
                                        // надо брать лемму из корпуса.
                                        if (word_data.lemma == "???")
                                        {
                                            word_data.lemma = item.ud_lemma;
                                        }
                                        else if (word_data.lemma == "NUMBER_")
                                        {
                                            // Для чисел берем исходный токен в качестве леммы.
                                            word_data.lemma = item.word;
                                        }


                                        for (int j = 0; j < SolarixGrammarEngineNET.GrammarEngine.sol_GetProjCoordCount(gren.GetEngineHandle(), projs, selected_proj); ++j)
                                        {
                                            int coord_id = SolarixGrammarEngineNET.GrammarEngine.sol_GetProjCoordId(gren.GetEngineHandle(), projs, selected_proj, j);
                                            int state_id = SolarixGrammarEngineNET.GrammarEngine.sol_GetProjStateId(gren.GetEngineHandle(), projs, selected_proj, j);
                                            word_data.tags.Add(Tuple.Create(coord_id, state_id));
                                        }

                                        sent.AddWord(word_data);

                                        SolarixGrammarEngineNET.GrammarEngine.sol_DeleteProjections(projs);

                                        // DEBUG
                                        //Console.WriteLine("\nDEBUG word={0}\ncorpus:{1}({2})\nselected:{3}", item.word, item.ud_class, string.Join(" ", item.ud_tags), GetTokenMorph(word_data, gren));

                                        word_index++;
                                    }

                                    // Добавить <END>
                                    WordData wd_end = new WordData();
                                    wd_end.word_index     = word_index;
                                    wd_end.word           = "END";
                                    wd_end.entry_id       = entry_end_id;
                                    wd_end.part_of_speech = beth_class;
                                    wd_end.all_projs      = tokenization[word_index];
                                    sent.AddWord(wd_end);

                                    // Для отладки - выведем текстовое представление тегов в выбранной проекции.



                                    if (all_ok)
                                    {
                                        yield return(sent);
                                    }
                                }
                            }

                            items.Clear();
                        }
                        else
                        {
                            string[] tx = line.Trim().Split('\t');

                            ConlluItem item = new ConlluItem();
                            item.word     = tx[word_column];
                            item.ud_lemma = tx[lemma_column];
                            item.ud_class = tx[class_column];
                            item.ud_tags  = tx[tags_column].Split('|')
                                            .Select(z => z.Split('='))
                                            .Where(z => z.Length == 2)
                                            .Select(z => z[0].Trim() + '=' + z[1].Trim())
                                            .ToArray();

                            items.Add(item);
                        }
                    }
                }
            }
        }
Ejemplo n.º 11
0
 /// <summary>
 /// Lazily creates and loads the grammar engine stored in <c>gren</c>.
 /// The dictionary path is taken from the "dictionary_path" application setting.
 /// Does nothing if the engine has already been created.
 /// </summary>
 private void LoadDict()
 {
     // Fast path: the engine has already been published, no locking needed.
     if (gren != null)
     {
         return;
     }

     lock (gren_lock)
     {
         // Re-check under the lock: another thread may have finished loading
         // while this one was waiting, and the dictionary must be loaded only once.
         if (gren != null)
         {
             return;
         }

         string dict = System.Configuration.ConfigurationSettings.AppSettings["dictionary_path"];
         Console.WriteLine("Loading dictionary {0}", dict);

         // Load into a local and assign the field only afterwards, so that a caller
         // passing the fast-path check never observes an engine without a loaded dictionary.
         // NOTE(review): the field declaration is not visible here; consider marking it
         // volatile to make the unlocked read above formally safe.
         var engine = new SolarixGrammarEngineNET.GrammarEngine2();
         engine.Load(dict, true);
         gren = engine;
     }
 }
Ejemplo n.º 12
0
        /// <summary>
        /// Generates single-word sentences for every dictionary word form that maps to exactly one lemma.
        /// </summary>
        /// <remarks>
        /// The method first enumerates all dictionary entries of the selected parts of speech and collects
        /// their word forms, then keeps only the forms with a single lemma, tokenizes each of them and
        /// yields the result as a <c>SentenceData</c>. A form is skipped when the entry found by the
        /// tokenizer at token index 1 is not one of the entries the form was collected from.
        /// </remarks>
        /// <param name="gren">Grammar engine used for dictionary lookup and tokenization.</param>
        /// <returns>A lazily evaluated sequence of sentences, one per accepted word form.</returns>
        public IEnumerable<SentenceData> Read(SolarixGrammarEngineNET.GrammarEngine2 gren)
        {
            // First, obtain from the dictionary the list of words that give unambiguous lemmatization.

            HashSet<int> classes = new HashSet<int>();

            foreach (var class_name in "СУЩЕСТВИТЕЛЬНОЕ ПРИЛАГАТЕЛЬНОЕ ГЛАГОЛ ИНФИНИТИВ ДЕЕПРИЧАСТИЕ НАРЕЧИЕ".Split(' '))
            {
                classes.Add(gren.FindPartOfSpeech(class_name));
            }

            HashSet<string> forms = new HashSet<string>();
            MultiValueDictionary<string, string> form2lemma = new MultiValueDictionary<string, string>();
            MultiValueDictionary<string, int>    form2entry = new MultiValueDictionary<string, int>();

            Console.WriteLine("Generating the list of words and lemmas...");
            IntPtr hList = SolarixGrammarEngineNET.GrammarEngine.sol_ListEntries(hEngine: gren.GetEngineHandle(),
                                                                                 Flags: 0, EntryType: 0, Mask: ".+", Language: -1, PartOfSpeech: -1);
            try
            {
                int nb_entries = SolarixGrammarEngineNET.GrammarEngine.sol_CountInts(hList);

                for (int i = 0; i < nb_entries; ++i)
                {
                    int id_entry = SolarixGrammarEngineNET.GrammarEngine.sol_GetInt(hList, i);

                    if (classes.Contains(gren.GetEntryClass(id_entry)))
                    {
                        string lemma = gren.GetEntryName(id_entry);

                        // Guard against an empty entry name before inspecting its first character.
                        if (!string.IsNullOrEmpty(lemma) && char.IsLetter(lemma[0]))
                        {
                            IntPtr hForms = SolarixGrammarEngineNET.GrammarEngine.sol_ListEntryForms(gren.GetEngineHandle(), id_entry);
                            try
                            {
                                int nb_forms = SolarixGrammarEngineNET.GrammarEngine.sol_CountStrings(hForms);

                                for (int iform = 0; iform < nb_forms; ++iform)
                                {
                                    string form = SolarixGrammarEngineNET.GrammarEngine.sol_GetStringFX(hForms, iform);
                                    form = gren.RestoreCasing(id_entry, form);
                                    if (!form2lemma.Contains(form, lemma))
                                    {
                                        form2lemma.Add(form, lemma);
                                        form2entry.Add(form, id_entry);
                                        forms.Add(form);
                                    }
                                }
                            }
                            finally
                            {
                                // Release the native list of forms even if processing a form throws.
                                SolarixGrammarEngineNET.GrammarEngine.sol_DeleteStrings(hForms);
                            }
                        }

                        // Progress indicator; only printed for entries of the selected classes.
                        if ((i % 10000) == 0)
                        {
                            Console.Write("{0}/{1}\r", i, nb_entries);
                        }
                    }
                }
            }
            finally
            {
                // Release the native list of entries even if the scan fails half-way.
                SolarixGrammarEngineNET.GrammarEngine.sol_DeleteInts(hList);
            }

            // A form is usable only if it was collected with exactly one lemma.
            List<string> unambiguous_forms = forms.Where(z => form2lemma[z].Count == 1).ToList();

            Console.WriteLine("{0} forms are good for lemmatization", unambiguous_forms.Count);

            foreach (string word in unambiguous_forms)
            {
                SolarixGrammarEngineNET.AnalysisResults tokenization = gren.AnalyzeMorphology(word, -1, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_TOKENIZE_ONLY, 0);

                bool         ok   = true;
                SentenceData sent = new SentenceData(word);

                for (int i = 0; i < tokenization.Count; ++i)
                {
                    // Token 1 is presumably the word itself (token 0 being a sentence-start marker — TODO confirm);
                    // reject the form if the tokenizer resolved it to an entry it was not collected from.
                    if (i == 1 && !form2entry[word].Contains(tokenization[i].GetEntryID()))
                    {
                        ok = false;
                    }

                    WordData word_data = new WordData();

                    word_data.word           = tokenization[i].GetWord();
                    word_data.word_index     = tokenization[i].GetWordPosition();
                    word_data.entry_id       = tokenization[i].GetEntryID();
                    word_data.part_of_speech = gren.GetEntryClass(word_data.entry_id);
                    word_data.all_projs      = tokenization[i];

                    foreach (var tag in tokenization[i].GetPairs())
                    {
                        word_data.tags.Add(Tuple.Create(tag.CoordID, tag.StateID));
                    }

                    // Inspect the raw entry name BEFORE lower-casing it: comparing the lower-cased
                    // value with "NUMBER_" can never succeed, which left numeric tokens with the
                    // placeholder entry name as their lemma.
                    string entry_name = gren.GetEntryName(word_data.entry_id);

                    if (entry_name == "???" || string.Equals(entry_name, "NUMBER_", StringComparison.OrdinalIgnoreCase))
                    {
                        // Out-of-dictionary tokens and numbers use the surface token as the lemma.
                        word_data.lemma = word_data.word;
                    }
                    else
                    {
                        word_data.lemma = entry_name.ToLower();
                    }

                    sent.AddWord(word_data);
                }

                if (ok)
                {
                    yield return(sent);
                }
            }
        }